summaryrefslogtreecommitdiffstats
path: root/src/cmd/compile/internal/ssa/_gen
diff options
context:
space:
mode:
Diffstat (limited to 'src/cmd/compile/internal/ssa/_gen')
-rw-r--r--src/cmd/compile/internal/ssa/_gen/386.rules1091
-rw-r--r--src/cmd/compile/internal/ssa/_gen/386Ops.go588
-rw-r--r--src/cmd/compile/internal/ssa/_gen/386splitload.rules11
-rw-r--r--src/cmd/compile/internal/ssa/_gen/AMD64.rules2216
-rw-r--r--src/cmd/compile/internal/ssa/_gen/AMD64Ops.go1133
-rw-r--r--src/cmd/compile/internal/ssa/_gen/AMD64latelower.rules8
-rw-r--r--src/cmd/compile/internal/ssa/_gen/AMD64splitload.rules45
-rw-r--r--src/cmd/compile/internal/ssa/_gen/ARM.rules1474
-rw-r--r--src/cmd/compile/internal/ssa/_gen/ARM64.rules3030
-rw-r--r--src/cmd/compile/internal/ssa/_gen/ARM64Ops.go794
-rw-r--r--src/cmd/compile/internal/ssa/_gen/ARM64latelower.rules21
-rw-r--r--src/cmd/compile/internal/ssa/_gen/ARMOps.go600
-rw-r--r--src/cmd/compile/internal/ssa/_gen/LOONG64.rules694
-rw-r--r--src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go484
-rw-r--r--src/cmd/compile/internal/ssa/_gen/MIPS.rules703
-rw-r--r--src/cmd/compile/internal/ssa/_gen/MIPS64.rules691
-rw-r--r--src/cmd/compile/internal/ssa/_gen/MIPS64Ops.go482
-rw-r--r--src/cmd/compile/internal/ssa/_gen/MIPSOps.go439
-rw-r--r--src/cmd/compile/internal/ssa/_gen/PPC64.rules1274
-rw-r--r--src/cmd/compile/internal/ssa/_gen/PPC64Ops.go740
-rw-r--r--src/cmd/compile/internal/ssa/_gen/PPC64latelower.rules10
-rw-r--r--src/cmd/compile/internal/ssa/_gen/README7
-rw-r--r--src/cmd/compile/internal/ssa/_gen/RISCV64.rules845
-rw-r--r--src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go482
-rw-r--r--src/cmd/compile/internal/ssa/_gen/RISCV64latelower.rules19
-rw-r--r--src/cmd/compile/internal/ssa/_gen/S390X.rules1704
-rw-r--r--src/cmd/compile/internal/ssa/_gen/S390XOps.go817
-rw-r--r--src/cmd/compile/internal/ssa/_gen/Wasm.rules396
-rw-r--r--src/cmd/compile/internal/ssa/_gen/WasmOps.go277
-rw-r--r--src/cmd/compile/internal/ssa/_gen/allocators.go198
-rwxr-xr-xsrc/cmd/compile/internal/ssa/_gen/cover.bash26
-rw-r--r--src/cmd/compile/internal/ssa/_gen/dec.rules93
-rw-r--r--src/cmd/compile/internal/ssa/_gen/dec64.rules401
-rw-r--r--src/cmd/compile/internal/ssa/_gen/dec64Ops.go18
-rw-r--r--src/cmd/compile/internal/ssa/_gen/decOps.go18
-rw-r--r--src/cmd/compile/internal/ssa/_gen/generic.rules2672
-rw-r--r--src/cmd/compile/internal/ssa/_gen/genericOps.go664
-rw-r--r--src/cmd/compile/internal/ssa/_gen/main.go571
-rw-r--r--src/cmd/compile/internal/ssa/_gen/rulegen.go1885
39 files changed, 27621 insertions, 0 deletions
diff --git a/src/cmd/compile/internal/ssa/_gen/386.rules b/src/cmd/compile/internal/ssa/_gen/386.rules
new file mode 100644
index 0000000..5e30ca9
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/386.rules
@@ -0,0 +1,1091 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Lowering arithmetic
+(Add(Ptr|32|16|8) ...) => (ADDL ...)
+(Add(32|64)F ...) => (ADDS(S|D) ...)
+(Add32carry ...) => (ADDLcarry ...)
+(Add32withcarry ...) => (ADCL ...)
+
+(Sub(Ptr|32|16|8) ...) => (SUBL ...)
+(Sub(32|64)F ...) => (SUBS(S|D) ...)
+(Sub32carry ...) => (SUBLcarry ...)
+(Sub32withcarry ...) => (SBBL ...)
+
+(Mul(32|16|8) ...) => (MULL ...)
+(Mul(32|64)F ...) => (MULS(S|D) ...)
+(Mul32uhilo ...) => (MULLQU ...)
+
+(Select0 (Mul32uover x y)) => (Select0 <typ.UInt32> (MULLU x y))
+(Select1 (Mul32uover x y)) => (SETO (Select1 <types.TypeFlags> (MULLU x y)))
+
+(Avg32u ...) => (AVGLU ...)
+
+(Div(32|64)F ...) => (DIVS(S|D) ...)
+(Div(32|32u|16|16u) ...) => (DIV(L|LU|W|WU) ...)
+(Div8 x y) => (DIVW (SignExt8to16 x) (SignExt8to16 y))
+(Div8u x y) => (DIVWU (ZeroExt8to16 x) (ZeroExt8to16 y))
+
+(Hmul(32|32u) ...) => (HMUL(L|LU) ...)
+
+(Mod(32|32u|16|16u) ...) => (MOD(L|LU|W|WU) ...)
+(Mod8 x y) => (MODW (SignExt8to16 x) (SignExt8to16 y))
+(Mod8u x y) => (MODWU (ZeroExt8to16 x) (ZeroExt8to16 y))
+
+(And(32|16|8) ...) => (ANDL ...)
+(Or(32|16|8) ...) => (ORL ...)
+(Xor(32|16|8) ...) => (XORL ...)
+
+(Neg(32|16|8) ...) => (NEGL ...)
+(Neg32F x) => (PXOR x (MOVSSconst <typ.Float32> [float32(math.Copysign(0, -1))]))
+(Neg64F x) => (PXOR x (MOVSDconst <typ.Float64> [math.Copysign(0, -1)]))
+
+(Com(32|16|8) ...) => (NOTL ...)
+
+// Lowering boolean ops
+(AndB ...) => (ANDL ...)
+(OrB ...) => (ORL ...)
+(Not x) => (XORLconst [1] x)
+
+// Lowering pointer arithmetic
+(OffPtr [off] ptr) => (ADDLconst [int32(off)] ptr)
+
+(Bswap32 ...) => (BSWAPL ...)
+
+(Sqrt ...) => (SQRTSD ...)
+(Sqrt32 ...) => (SQRTSS ...)
+
+(Ctz16 x) => (BSFL (ORLconst <typ.UInt32> [0x10000] x))
+(Ctz16NonZero ...) => (BSFL ...)
+
+// Lowering extension
+(SignExt8to16 ...) => (MOVBLSX ...)
+(SignExt8to32 ...) => (MOVBLSX ...)
+(SignExt16to32 ...) => (MOVWLSX ...)
+
+(ZeroExt8to16 ...) => (MOVBLZX ...)
+(ZeroExt8to32 ...) => (MOVBLZX ...)
+(ZeroExt16to32 ...) => (MOVWLZX ...)
+
+(Signmask x) => (SARLconst x [31])
+(Zeromask <t> x) => (XORLconst [-1] (SBBLcarrymask <t> (CMPLconst x [1])))
+(Slicemask <t> x) => (SARLconst (NEGL <t> x) [31])
+
+// Lowering truncation
+// Because we ignore high parts of registers, truncates are just copies.
+(Trunc16to8 ...) => (Copy ...)
+(Trunc32to8 ...) => (Copy ...)
+(Trunc32to16 ...) => (Copy ...)
+
+// Lowering float-int conversions
+(Cvt32to32F ...) => (CVTSL2SS ...)
+(Cvt32to64F ...) => (CVTSL2SD ...)
+
+(Cvt32Fto32 ...) => (CVTTSS2SL ...)
+(Cvt64Fto32 ...) => (CVTTSD2SL ...)
+
+(Cvt32Fto64F ...) => (CVTSS2SD ...)
+(Cvt64Fto32F ...) => (CVTSD2SS ...)
+
+(Round32F ...) => (Copy ...)
+(Round64F ...) => (Copy ...)
+
+(CvtBoolToUint8 ...) => (Copy ...)
+
+// Lowering shifts
+// Unsigned shifts need to return 0 if shift amount is >= width of shifted value.
+// result = (arg << shift) & (shift >= argbits ? 0 : 0xffffffff)
+(Lsh32x(32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(L|W|B)const y [32])))
+(Lsh16x(32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(L|W|B)const y [32])))
+(Lsh8x(32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(L|W|B)const y [32])))
+
+(Lsh32x(32|16|8) <t> x y) && shiftIsBounded(v) => (SHLL <t> x y)
+(Lsh16x(32|16|8) <t> x y) && shiftIsBounded(v) => (SHLL <t> x y)
+(Lsh8x(32|16|8) <t> x y) && shiftIsBounded(v) => (SHLL <t> x y)
+
+(Rsh32Ux(32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRL <t> x y) (SBBLcarrymask <t> (CMP(L|W|B)const y [32])))
+(Rsh16Ux(32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRW <t> x y) (SBBLcarrymask <t> (CMP(L|W|B)const y [16])))
+(Rsh8Ux(32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRB <t> x y) (SBBLcarrymask <t> (CMP(L|W|B)const y [8])))
+
+(Rsh32Ux(32|16|8) <t> x y) && shiftIsBounded(v) => (SHRL <t> x y)
+(Rsh16Ux(32|16|8) <t> x y) && shiftIsBounded(v) => (SHRW <t> x y)
+(Rsh8Ux(32|16|8) <t> x y) && shiftIsBounded(v) => (SHRB <t> x y)
+
+// Signed right shift needs to return 0/-1 if shift amount is >= width of shifted value.
+// We implement this by setting the shift value to -1 (all ones) if the shift value is >= width.
+
+(Rsh32x(32|16|8) <t> x y) && !shiftIsBounded(v) => (SARL <t> x (ORL <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMP(L|W|B)const y [32])))))
+(Rsh16x(32|16|8) <t> x y) && !shiftIsBounded(v) => (SARW <t> x (ORL <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMP(L|W|B)const y [16])))))
+(Rsh8x(32|16|8) <t> x y) && !shiftIsBounded(v) => (SARB <t> x (ORL <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMP(L|W|B)const y [8])))))
+
+(Rsh32x(32|16|8) <t> x y) && shiftIsBounded(v) => (SARL x y)
+(Rsh16x(32|16|8) <t> x y) && shiftIsBounded(v) => (SARW x y)
+(Rsh8x(32|16|8) <t> x y) && shiftIsBounded(v) => (SARB x y)
+
+// constant shifts
+// generic opt rewrites all constant shifts to shift by Const64
+(Lsh32x64 x (Const64 [c])) && uint64(c) < 32 => (SHLLconst x [int32(c)])
+(Rsh32x64 x (Const64 [c])) && uint64(c) < 32 => (SARLconst x [int32(c)])
+(Rsh32Ux64 x (Const64 [c])) && uint64(c) < 32 => (SHRLconst x [int32(c)])
+(Lsh16x64 x (Const64 [c])) && uint64(c) < 16 => (SHLLconst x [int32(c)])
+(Rsh16x64 x (Const64 [c])) && uint64(c) < 16 => (SARWconst x [int16(c)])
+(Rsh16Ux64 x (Const64 [c])) && uint64(c) < 16 => (SHRWconst x [int16(c)])
+(Lsh8x64 x (Const64 [c])) && uint64(c) < 8 => (SHLLconst x [int32(c)])
+(Rsh8x64 x (Const64 [c])) && uint64(c) < 8 => (SARBconst x [int8(c)])
+(Rsh8Ux64 x (Const64 [c])) && uint64(c) < 8 => (SHRBconst x [int8(c)])
+
+// large constant shifts
+(Lsh32x64 _ (Const64 [c])) && uint64(c) >= 32 => (Const32 [0])
+(Rsh32Ux64 _ (Const64 [c])) && uint64(c) >= 32 => (Const32 [0])
+(Lsh16x64 _ (Const64 [c])) && uint64(c) >= 16 => (Const16 [0])
+(Rsh16Ux64 _ (Const64 [c])) && uint64(c) >= 16 => (Const16 [0])
+(Lsh8x64 _ (Const64 [c])) && uint64(c) >= 8 => (Const8 [0])
+(Rsh8Ux64 _ (Const64 [c])) && uint64(c) >= 8 => (Const8 [0])
+
+// large constant signed right shift, we leave the sign bit
+(Rsh32x64 x (Const64 [c])) && uint64(c) >= 32 => (SARLconst x [31])
+(Rsh16x64 x (Const64 [c])) && uint64(c) >= 16 => (SARWconst x [15])
+(Rsh8x64 x (Const64 [c])) && uint64(c) >= 8 => (SARBconst x [7])
+
+// rotates
+(RotateLeft32 ...) => (ROLL ...)
+(RotateLeft16 ...) => (ROLW ...)
+(RotateLeft8 ...) => (ROLB ...)
+// constant rotates
+(ROLL x (MOVLconst [c])) => (ROLLconst [c&31] x)
+(ROLW x (MOVLconst [c])) => (ROLWconst [int16(c&15)] x)
+(ROLB x (MOVLconst [c])) => (ROLBconst [int8(c&7)] x)
+
+// Lowering comparisons
+(Less32 x y) => (SETL (CMPL x y))
+(Less16 x y) => (SETL (CMPW x y))
+(Less8 x y) => (SETL (CMPB x y))
+(Less32U x y) => (SETB (CMPL x y))
+(Less16U x y) => (SETB (CMPW x y))
+(Less8U x y) => (SETB (CMPB x y))
+// Use SETGF with reversed operands to dodge NaN case
+(Less64F x y) => (SETGF (UCOMISD y x))
+(Less32F x y) => (SETGF (UCOMISS y x))
+
+(Leq32 x y) => (SETLE (CMPL x y))
+(Leq16 x y) => (SETLE (CMPW x y))
+(Leq8 x y) => (SETLE (CMPB x y))
+(Leq32U x y) => (SETBE (CMPL x y))
+(Leq16U x y) => (SETBE (CMPW x y))
+(Leq8U x y) => (SETBE (CMPB x y))
+// Use SETGEF with reversed operands to dodge NaN case
+(Leq64F x y) => (SETGEF (UCOMISD y x))
+(Leq32F x y) => (SETGEF (UCOMISS y x))
+
+(Eq32 x y) => (SETEQ (CMPL x y))
+(Eq16 x y) => (SETEQ (CMPW x y))
+(Eq8 x y) => (SETEQ (CMPB x y))
+(EqB x y) => (SETEQ (CMPB x y))
+(EqPtr x y) => (SETEQ (CMPL x y))
+(Eq64F x y) => (SETEQF (UCOMISD x y))
+(Eq32F x y) => (SETEQF (UCOMISS x y))
+
+(Neq32 x y) => (SETNE (CMPL x y))
+(Neq16 x y) => (SETNE (CMPW x y))
+(Neq8 x y) => (SETNE (CMPB x y))
+(NeqB x y) => (SETNE (CMPB x y))
+(NeqPtr x y) => (SETNE (CMPL x y))
+(Neq64F x y) => (SETNEF (UCOMISD x y))
+(Neq32F x y) => (SETNEF (UCOMISS x y))
+
+// Lowering loads
+(Load <t> ptr mem) && (is32BitInt(t) || isPtr(t)) => (MOVLload ptr mem)
+(Load <t> ptr mem) && is16BitInt(t) => (MOVWload ptr mem)
+(Load <t> ptr mem) && (t.IsBoolean() || is8BitInt(t)) => (MOVBload ptr mem)
+(Load <t> ptr mem) && is32BitFloat(t) => (MOVSSload ptr mem)
+(Load <t> ptr mem) && is64BitFloat(t) => (MOVSDload ptr mem)
+
+// Lowering stores
+// These more-specific FP versions of Store pattern should come first.
+(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (MOVSDstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (MOVSSstore ptr val mem)
+
+(Store {t} ptr val mem) && t.Size() == 4 => (MOVLstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 2 => (MOVWstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem)
+
+// Lowering moves
+(Move [0] _ _ mem) => mem
+(Move [1] dst src mem) => (MOVBstore dst (MOVBload src mem) mem)
+(Move [2] dst src mem) => (MOVWstore dst (MOVWload src mem) mem)
+(Move [4] dst src mem) => (MOVLstore dst (MOVLload src mem) mem)
+(Move [3] dst src mem) =>
+ (MOVBstore [2] dst (MOVBload [2] src mem)
+ (MOVWstore dst (MOVWload src mem) mem))
+(Move [5] dst src mem) =>
+ (MOVBstore [4] dst (MOVBload [4] src mem)
+ (MOVLstore dst (MOVLload src mem) mem))
+(Move [6] dst src mem) =>
+ (MOVWstore [4] dst (MOVWload [4] src mem)
+ (MOVLstore dst (MOVLload src mem) mem))
+(Move [7] dst src mem) =>
+ (MOVLstore [3] dst (MOVLload [3] src mem)
+ (MOVLstore dst (MOVLload src mem) mem))
+(Move [8] dst src mem) =>
+ (MOVLstore [4] dst (MOVLload [4] src mem)
+ (MOVLstore dst (MOVLload src mem) mem))
+
+// Adjust moves to be a multiple of 4 bytes.
+(Move [s] dst src mem)
+ && s > 8 && s%4 != 0 =>
+ (Move [s-s%4]
+ (ADDLconst <dst.Type> dst [int32(s%4)])
+ (ADDLconst <src.Type> src [int32(s%4)])
+ (MOVLstore dst (MOVLload src mem) mem))
+
+// Medium copying uses a duff device.
+(Move [s] dst src mem)
+ && s > 8 && s <= 4*128 && s%4 == 0
+ && !config.noDuffDevice && logLargeCopy(v, s) =>
+ (DUFFCOPY [10*(128-s/4)] dst src mem)
+// 10 and 128 are magic constants. 10 is the number of bytes to encode:
+// MOVL (SI), CX
+// ADDL $4, SI
+// MOVL CX, (DI)
+// ADDL $4, DI
+// and 128 is the number of such blocks. See src/runtime/duff_386.s:duffcopy.
+
+// Large copying uses REP MOVSL.
+(Move [s] dst src mem) && (s > 4*128 || config.noDuffDevice) && s%4 == 0 && logLargeCopy(v, s) =>
+ (REPMOVSL dst src (MOVLconst [int32(s/4)]) mem)
+
+// Lowering Zero instructions
+(Zero [0] _ mem) => mem
+(Zero [1] destptr mem) => (MOVBstoreconst [0] destptr mem)
+(Zero [2] destptr mem) => (MOVWstoreconst [0] destptr mem)
+(Zero [4] destptr mem) => (MOVLstoreconst [0] destptr mem)
+
+(Zero [3] destptr mem) =>
+ (MOVBstoreconst [makeValAndOff(0,2)] destptr
+ (MOVWstoreconst [makeValAndOff(0,0)] destptr mem))
+(Zero [5] destptr mem) =>
+ (MOVBstoreconst [makeValAndOff(0,4)] destptr
+ (MOVLstoreconst [makeValAndOff(0,0)] destptr mem))
+(Zero [6] destptr mem) =>
+ (MOVWstoreconst [makeValAndOff(0,4)] destptr
+ (MOVLstoreconst [makeValAndOff(0,0)] destptr mem))
+(Zero [7] destptr mem) =>
+ (MOVLstoreconst [makeValAndOff(0,3)] destptr
+ (MOVLstoreconst [makeValAndOff(0,0)] destptr mem))
+
+// Strip off any fractional word zeroing.
+(Zero [s] destptr mem) && s%4 != 0 && s > 4 =>
+ (Zero [s-s%4] (ADDLconst destptr [int32(s%4)])
+ (MOVLstoreconst [0] destptr mem))
+
+// Zero small numbers of words directly.
+(Zero [8] destptr mem) =>
+ (MOVLstoreconst [makeValAndOff(0,4)] destptr
+ (MOVLstoreconst [makeValAndOff(0,0)] destptr mem))
+(Zero [12] destptr mem) =>
+ (MOVLstoreconst [makeValAndOff(0,8)] destptr
+ (MOVLstoreconst [makeValAndOff(0,4)] destptr
+ (MOVLstoreconst [makeValAndOff(0,0)] destptr mem)))
+(Zero [16] destptr mem) =>
+ (MOVLstoreconst [makeValAndOff(0,12)] destptr
+ (MOVLstoreconst [makeValAndOff(0,8)] destptr
+ (MOVLstoreconst [makeValAndOff(0,4)] destptr
+ (MOVLstoreconst [makeValAndOff(0,0)] destptr mem))))
+
+// Medium zeroing uses a duff device.
+(Zero [s] destptr mem)
+ && s > 16 && s <= 4*128 && s%4 == 0
+ && !config.noDuffDevice =>
+ (DUFFZERO [1*(128-s/4)] destptr (MOVLconst [0]) mem)
+// 1 and 128 are magic constants. 1 is the number of bytes to encode STOSL.
+// 128 is the number of STOSL instructions in duffzero.
+// See src/runtime/duff_386.s:duffzero.
+
+// Large zeroing uses REP STOSL.
+(Zero [s] destptr mem)
+ && (s > 4*128 || (config.noDuffDevice && s > 16))
+ && s%4 == 0 =>
+ (REPSTOSL destptr (MOVLconst [int32(s/4)]) (MOVLconst [0]) mem)
+
+
+// Lowering constants
+(Const8 [c]) => (MOVLconst [int32(c)])
+(Const16 [c]) => (MOVLconst [int32(c)])
+(Const32 ...) => (MOVLconst ...)
+(Const(32|64)F ...) => (MOVS(S|D)const ...)
+(ConstNil) => (MOVLconst [0])
+(ConstBool [c]) => (MOVLconst [b2i32(c)])
+
+// Lowering calls
+(StaticCall ...) => (CALLstatic ...)
+(ClosureCall ...) => (CALLclosure ...)
+(InterCall ...) => (CALLinter ...)
+(TailCall ...) => (CALLtail ...)
+
+// Miscellaneous
+(IsNonNil p) => (SETNE (TESTL p p))
+(IsInBounds idx len) => (SETB (CMPL idx len))
+(IsSliceInBounds idx len) => (SETBE (CMPL idx len))
+(NilCheck ...) => (LoweredNilCheck ...)
+(GetG ...) => (LoweredGetG ...)
+(GetClosurePtr ...) => (LoweredGetClosurePtr ...)
+(GetCallerPC ...) => (LoweredGetCallerPC ...)
+(GetCallerSP ...) => (LoweredGetCallerSP ...)
+(Addr {sym} base) => (LEAL {sym} base)
+(LocalAddr {sym} base _) => (LEAL {sym} base)
+
+// block rewrites
+(If (SETL cmp) yes no) => (LT cmp yes no)
+(If (SETLE cmp) yes no) => (LE cmp yes no)
+(If (SETG cmp) yes no) => (GT cmp yes no)
+(If (SETGE cmp) yes no) => (GE cmp yes no)
+(If (SETEQ cmp) yes no) => (EQ cmp yes no)
+(If (SETNE cmp) yes no) => (NE cmp yes no)
+(If (SETB cmp) yes no) => (ULT cmp yes no)
+(If (SETBE cmp) yes no) => (ULE cmp yes no)
+(If (SETA cmp) yes no) => (UGT cmp yes no)
+(If (SETAE cmp) yes no) => (UGE cmp yes no)
+(If (SETO cmp) yes no) => (OS cmp yes no)
+
+// Special case for floating point - LF/LEF not generated
+(If (SETGF cmp) yes no) => (UGT cmp yes no)
+(If (SETGEF cmp) yes no) => (UGE cmp yes no)
+(If (SETEQF cmp) yes no) => (EQF cmp yes no)
+(If (SETNEF cmp) yes no) => (NEF cmp yes no)
+
+(If cond yes no) => (NE (TESTB cond cond) yes no)
+
+// Write barrier.
+(WB ...) => (LoweredWB ...)
+
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem)
+
+(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 0 => (LoweredPanicExtendA [kind] hi lo y mem)
+(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 1 => (LoweredPanicExtendB [kind] hi lo y mem)
+(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 2 => (LoweredPanicExtendC [kind] hi lo y mem)
+
+// ***************************
+// Above: lowering rules
+// Below: optimizations
+// ***************************
+// TODO: Should the optimizations be a separate pass?
+
+// Fold boolean tests into blocks
+(NE (TESTB (SETL cmp) (SETL cmp)) yes no) => (LT cmp yes no)
+(NE (TESTB (SETLE cmp) (SETLE cmp)) yes no) => (LE cmp yes no)
+(NE (TESTB (SETG cmp) (SETG cmp)) yes no) => (GT cmp yes no)
+(NE (TESTB (SETGE cmp) (SETGE cmp)) yes no) => (GE cmp yes no)
+(NE (TESTB (SETEQ cmp) (SETEQ cmp)) yes no) => (EQ cmp yes no)
+(NE (TESTB (SETNE cmp) (SETNE cmp)) yes no) => (NE cmp yes no)
+(NE (TESTB (SETB cmp) (SETB cmp)) yes no) => (ULT cmp yes no)
+(NE (TESTB (SETBE cmp) (SETBE cmp)) yes no) => (ULE cmp yes no)
+(NE (TESTB (SETA cmp) (SETA cmp)) yes no) => (UGT cmp yes no)
+(NE (TESTB (SETAE cmp) (SETAE cmp)) yes no) => (UGE cmp yes no)
+(NE (TESTB (SETO cmp) (SETO cmp)) yes no) => (OS cmp yes no)
+
+// Special case for floating point - LF/LEF not generated
+(NE (TESTB (SETGF cmp) (SETGF cmp)) yes no) => (UGT cmp yes no)
+(NE (TESTB (SETGEF cmp) (SETGEF cmp)) yes no) => (UGE cmp yes no)
+(NE (TESTB (SETEQF cmp) (SETEQF cmp)) yes no) => (EQF cmp yes no)
+(NE (TESTB (SETNEF cmp) (SETNEF cmp)) yes no) => (NEF cmp yes no)
+
+// fold constants into instructions
+(ADDL x (MOVLconst [c])) => (ADDLconst [c] x)
+(ADDLcarry x (MOVLconst [c])) => (ADDLconstcarry [c] x)
+(ADCL x (MOVLconst [c]) f) => (ADCLconst [c] x f)
+
+(SUBL x (MOVLconst [c])) => (SUBLconst x [c])
+(SUBL (MOVLconst [c]) x) => (NEGL (SUBLconst <v.Type> x [c]))
+(SUBLcarry x (MOVLconst [c])) => (SUBLconstcarry [c] x)
+(SBBL x (MOVLconst [c]) f) => (SBBLconst [c] x f)
+
+(MULL x (MOVLconst [c])) => (MULLconst [c] x)
+(ANDL x (MOVLconst [c])) => (ANDLconst [c] x)
+
+(ANDLconst [c] (ANDLconst [d] x)) => (ANDLconst [c & d] x)
+(XORLconst [c] (XORLconst [d] x)) => (XORLconst [c ^ d] x)
+(MULLconst [c] (MULLconst [d] x)) => (MULLconst [c * d] x)
+
+(ORL x (MOVLconst [c])) => (ORLconst [c] x)
+(XORL x (MOVLconst [c])) => (XORLconst [c] x)
+
+(SHLL x (MOVLconst [c])) => (SHLLconst [c&31] x)
+(SHRL x (MOVLconst [c])) => (SHRLconst [c&31] x)
+(SHRW x (MOVLconst [c])) && c&31 < 16 => (SHRWconst [int16(c&31)] x)
+(SHRW _ (MOVLconst [c])) && c&31 >= 16 => (MOVLconst [0])
+(SHRB x (MOVLconst [c])) && c&31 < 8 => (SHRBconst [int8(c&31)] x)
+(SHRB _ (MOVLconst [c])) && c&31 >= 8 => (MOVLconst [0])
+
+(SARL x (MOVLconst [c])) => (SARLconst [c&31] x)
+(SARW x (MOVLconst [c])) => (SARWconst [int16(min(int64(c&31),15))] x)
+(SARB x (MOVLconst [c])) => (SARBconst [int8(min(int64(c&31),7))] x)
+
+(SARL x (ANDLconst [31] y)) => (SARL x y)
+(SHLL x (ANDLconst [31] y)) => (SHLL x y)
+(SHRL x (ANDLconst [31] y)) => (SHRL x y)
+
+// Constant shift simplifications
+
+(SHLLconst x [0]) => x
+(SHRLconst x [0]) => x
+(SARLconst x [0]) => x
+
+(SHRWconst x [0]) => x
+(SARWconst x [0]) => x
+
+(SHRBconst x [0]) => x
+(SARBconst x [0]) => x
+
+(ROLLconst [0] x) => x
+(ROLWconst [0] x) => x
+(ROLBconst [0] x) => x
+
+// Note: the word and byte shifts keep the low 5 bits (not the low 4 or 3 bits)
+// because the x86 instructions are defined to use all 5 bits of the shift even
+// for the small shifts. I don't think we'll ever generate a weird shift (e.g.
+// (SHRW x (MOVLconst [24])), but just in case.
+
+(CMPL x (MOVLconst [c])) => (CMPLconst x [c])
+(CMPL (MOVLconst [c]) x) => (InvertFlags (CMPLconst x [c]))
+(CMPW x (MOVLconst [c])) => (CMPWconst x [int16(c)])
+(CMPW (MOVLconst [c]) x) => (InvertFlags (CMPWconst x [int16(c)]))
+(CMPB x (MOVLconst [c])) => (CMPBconst x [int8(c)])
+(CMPB (MOVLconst [c]) x) => (InvertFlags (CMPBconst x [int8(c)]))
+
+// Canonicalize the order of arguments to comparisons - helps with CSE.
+(CMP(L|W|B) x y) && canonLessThan(x,y) => (InvertFlags (CMP(L|W|B) y x))
+
+// strength reduction
+// Assumes the following costs from https://gmplib.org/~tege/x86-timing.pdf:
+// 1 - addl, shll, leal, negl, subl
+// 3 - imull
+// This limits the rewrites to two instructions.
+// Note that negl always operates in-place,
+// which can require a register-register move
+// to preserve the original value,
+// so it must be used with care.
+(MULLconst [-9] x) => (NEGL (LEAL8 <v.Type> x x))
+(MULLconst [-5] x) => (NEGL (LEAL4 <v.Type> x x))
+(MULLconst [-3] x) => (NEGL (LEAL2 <v.Type> x x))
+(MULLconst [-1] x) => (NEGL x)
+(MULLconst [0] _) => (MOVLconst [0])
+(MULLconst [1] x) => x
+(MULLconst [3] x) => (LEAL2 x x)
+(MULLconst [5] x) => (LEAL4 x x)
+(MULLconst [7] x) => (LEAL2 x (LEAL2 <v.Type> x x))
+(MULLconst [9] x) => (LEAL8 x x)
+(MULLconst [11] x) => (LEAL2 x (LEAL4 <v.Type> x x))
+(MULLconst [13] x) => (LEAL4 x (LEAL2 <v.Type> x x))
+(MULLconst [19] x) => (LEAL2 x (LEAL8 <v.Type> x x))
+(MULLconst [21] x) => (LEAL4 x (LEAL4 <v.Type> x x))
+(MULLconst [25] x) => (LEAL8 x (LEAL2 <v.Type> x x))
+(MULLconst [27] x) => (LEAL8 (LEAL2 <v.Type> x x) (LEAL2 <v.Type> x x))
+(MULLconst [37] x) => (LEAL4 x (LEAL8 <v.Type> x x))
+(MULLconst [41] x) => (LEAL8 x (LEAL4 <v.Type> x x))
+(MULLconst [45] x) => (LEAL8 (LEAL4 <v.Type> x x) (LEAL4 <v.Type> x x))
+(MULLconst [73] x) => (LEAL8 x (LEAL8 <v.Type> x x))
+(MULLconst [81] x) => (LEAL8 (LEAL8 <v.Type> x x) (LEAL8 <v.Type> x x))
+
+(MULLconst [c] x) && isPowerOfTwo32(c+1) && c >= 15 => (SUBL (SHLLconst <v.Type> [int32(log32(c+1))] x) x)
+(MULLconst [c] x) && isPowerOfTwo32(c-1) && c >= 17 => (LEAL1 (SHLLconst <v.Type> [int32(log32(c-1))] x) x)
+(MULLconst [c] x) && isPowerOfTwo32(c-2) && c >= 34 => (LEAL2 (SHLLconst <v.Type> [int32(log32(c-2))] x) x)
+(MULLconst [c] x) && isPowerOfTwo32(c-4) && c >= 68 => (LEAL4 (SHLLconst <v.Type> [int32(log32(c-4))] x) x)
+(MULLconst [c] x) && isPowerOfTwo32(c-8) && c >= 136 => (LEAL8 (SHLLconst <v.Type> [int32(log32(c-8))] x) x)
+(MULLconst [c] x) && c%3 == 0 && isPowerOfTwo32(c/3) => (SHLLconst [int32(log32(c/3))] (LEAL2 <v.Type> x x))
+(MULLconst [c] x) && c%5 == 0 && isPowerOfTwo32(c/5) => (SHLLconst [int32(log32(c/5))] (LEAL4 <v.Type> x x))
+(MULLconst [c] x) && c%9 == 0 && isPowerOfTwo32(c/9) => (SHLLconst [int32(log32(c/9))] (LEAL8 <v.Type> x x))
+
+// combine add/shift into LEAL
+(ADDL x (SHLLconst [3] y)) => (LEAL8 x y)
+(ADDL x (SHLLconst [2] y)) => (LEAL4 x y)
+(ADDL x (SHLLconst [1] y)) => (LEAL2 x y)
+(ADDL x (ADDL y y)) => (LEAL2 x y)
+(ADDL x (ADDL x y)) => (LEAL2 y x)
+
+// combine ADDL/ADDLconst into LEAL1
+(ADDLconst [c] (ADDL x y)) => (LEAL1 [c] x y)
+(ADDL (ADDLconst [c] x) y) => (LEAL1 [c] x y)
+
+// fold ADDL into LEAL
+(ADDLconst [c] (LEAL [d] {s} x)) && is32Bit(int64(c)+int64(d)) => (LEAL [c+d] {s} x)
+(LEAL [c] {s} (ADDLconst [d] x)) && is32Bit(int64(c)+int64(d)) => (LEAL [c+d] {s} x)
+(ADDLconst [c] x:(SP)) => (LEAL [c] x) // so it is rematerializeable
+(LEAL [c] {s} (ADDL x y)) && x.Op != OpSB && y.Op != OpSB => (LEAL1 [c] {s} x y)
+(ADDL x (LEAL [c] {s} y)) && x.Op != OpSB && y.Op != OpSB => (LEAL1 [c] {s} x y)
+
+// fold ADDLconst into LEALx
+(ADDLconst [c] (LEAL1 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEAL1 [c+d] {s} x y)
+(ADDLconst [c] (LEAL2 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEAL2 [c+d] {s} x y)
+(ADDLconst [c] (LEAL4 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEAL4 [c+d] {s} x y)
+(ADDLconst [c] (LEAL8 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEAL8 [c+d] {s} x y)
+(LEAL1 [c] {s} (ADDLconst [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEAL1 [c+d] {s} x y)
+(LEAL2 [c] {s} (ADDLconst [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEAL2 [c+d] {s} x y)
+(LEAL2 [c] {s} x (ADDLconst [d] y)) && is32Bit(int64(c)+2*int64(d)) && y.Op != OpSB => (LEAL2 [c+2*d] {s} x y)
+(LEAL4 [c] {s} (ADDLconst [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEAL4 [c+d] {s} x y)
+(LEAL4 [c] {s} x (ADDLconst [d] y)) && is32Bit(int64(c)+4*int64(d)) && y.Op != OpSB => (LEAL4 [c+4*d] {s} x y)
+(LEAL8 [c] {s} (ADDLconst [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEAL8 [c+d] {s} x y)
+(LEAL8 [c] {s} x (ADDLconst [d] y)) && is32Bit(int64(c)+8*int64(d)) && y.Op != OpSB => (LEAL8 [c+8*d] {s} x y)
+
+// fold shifts into LEALx
+(LEAL1 [c] {s} x (SHLLconst [1] y)) => (LEAL2 [c] {s} x y)
+(LEAL1 [c] {s} x (SHLLconst [2] y)) => (LEAL4 [c] {s} x y)
+(LEAL1 [c] {s} x (SHLLconst [3] y)) => (LEAL8 [c] {s} x y)
+(LEAL2 [c] {s} x (SHLLconst [1] y)) => (LEAL4 [c] {s} x y)
+(LEAL2 [c] {s} x (SHLLconst [2] y)) => (LEAL8 [c] {s} x y)
+(LEAL4 [c] {s} x (SHLLconst [1] y)) => (LEAL8 [c] {s} x y)
+
+// reverse ordering of compare instruction
+(SETL (InvertFlags x)) => (SETG x)
+(SETG (InvertFlags x)) => (SETL x)
+(SETB (InvertFlags x)) => (SETA x)
+(SETA (InvertFlags x)) => (SETB x)
+(SETLE (InvertFlags x)) => (SETGE x)
+(SETGE (InvertFlags x)) => (SETLE x)
+(SETBE (InvertFlags x)) => (SETAE x)
+(SETAE (InvertFlags x)) => (SETBE x)
+(SETEQ (InvertFlags x)) => (SETEQ x)
+(SETNE (InvertFlags x)) => (SETNE x)
+
+// sign extended loads
+// Note: The combined instruction must end up in the same block
+// as the original load. If not, we end up making a value with
+// memory type live in two different blocks, which can lead to
+// multiple memory values alive simultaneously.
+// Make sure we don't combine these ops if the load has another use.
+// This prevents a single load from being split into multiple loads
+// which then might return different values. See test/atomicload.go.
+(MOVBLSX x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBLSXload <v.Type> [off] {sym} ptr mem)
+(MOVBLZX x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBload <v.Type> [off] {sym} ptr mem)
+(MOVWLSX x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWLSXload <v.Type> [off] {sym} ptr mem)
+(MOVWLZX x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWload <v.Type> [off] {sym} ptr mem)
+
+// replace load from same location as preceding store with zero/sign extension (or copy in case of full width)
+(MOVBload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBLZX x)
+(MOVWload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVWLZX x)
+(MOVLload [off] {sym} ptr (MOVLstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x
+(MOVBLSXload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBLSX x)
+(MOVWLSXload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVWLSX x)
+
+// Fold extensions and ANDs together.
+(MOVBLZX (ANDLconst [c] x)) => (ANDLconst [c & 0xff] x)
+(MOVWLZX (ANDLconst [c] x)) => (ANDLconst [c & 0xffff] x)
+(MOVBLSX (ANDLconst [c] x)) && c & 0x80 == 0 => (ANDLconst [c & 0x7f] x)
+(MOVWLSX (ANDLconst [c] x)) && c & 0x8000 == 0 => (ANDLconst [c & 0x7fff] x)
+
+// Don't extend before storing
+(MOVWstore [off] {sym} ptr (MOVWL(S|Z)X x) mem) => (MOVWstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBL(S|Z)X x) mem) => (MOVBstore [off] {sym} ptr x mem)
+
+// fold constants into memory operations
+// Note that this is not always a good idea because if not all the uses of
+// the ADDLconst get eliminated, we still have to compute the ADDLconst and we now
+// have potentially two live values (ptr and (ADDLconst [off] ptr)) instead of one.
+// Nevertheless, let's do it!
+(MOV(L|W|B|SS|SD)load [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOV(L|W|B|SS|SD)load [off1+off2] {sym} ptr mem)
+(MOV(L|W|B|SS|SD)store [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOV(L|W|B|SS|SD)store [off1+off2] {sym} ptr val mem)
+
+((ADD|SUB|MUL|AND|OR|XOR)Lload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ ((ADD|SUB|MUL|AND|OR|XOR)Lload [off1+off2] {sym} val base mem)
+((ADD|SUB|MUL|DIV)SSload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ ((ADD|SUB|MUL|DIV)SSload [off1+off2] {sym} val base mem)
+((ADD|SUB|MUL|DIV)SDload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ ((ADD|SUB|MUL|DIV)SDload [off1+off2] {sym} val base mem)
+((ADD|SUB|AND|OR|XOR)Lmodify [off1] {sym} (ADDLconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) =>
+ ((ADD|SUB|AND|OR|XOR)Lmodify [off1+off2] {sym} base val mem)
+((ADD|AND|OR|XOR)Lconstmodify [valoff1] {sym} (ADDLconst [off2] base) mem) && valoff1.canAdd32(off2) =>
+ ((ADD|AND|OR|XOR)Lconstmodify [valoff1.addOffset32(off2)] {sym} base mem)
+
+// Fold constants into stores.
+(MOVLstore [off] {sym} ptr (MOVLconst [c]) mem) =>
+ (MOVLstoreconst [makeValAndOff(c,off)] {sym} ptr mem)
+(MOVWstore [off] {sym} ptr (MOVLconst [c]) mem) =>
+ (MOVWstoreconst [makeValAndOff(c,off)] {sym} ptr mem)
+(MOVBstore [off] {sym} ptr (MOVLconst [c]) mem) =>
+ (MOVBstoreconst [makeValAndOff(c,off)] {sym} ptr mem)
+
+// Fold address offsets into constant stores.
+(MOV(L|W|B)storeconst [sc] {s} (ADDLconst [off] ptr) mem) && sc.canAdd32(off) =>
+ (MOV(L|W|B)storeconst [sc.addOffset32(off)] {s} ptr mem)
+
+// We need to fold LEAL into the MOVx ops so that the live variable analysis knows
+// what variables are being read/written by the ops.
+// Note: we turn off this merging for operations on globals when building
+// position-independent code (when Flag_shared is set).
+// PIC needs a spare register to load the PC into. Having the LEAL be
+// a separate instruction gives us that register. Having the LEAL be
+// a separate instruction also allows it to be CSEd (which is good because
+// it compiles to a thunk call).
+(MOV(L|W|B|SS|SD|BLSX|WLSX)load [off1] {sym1} (LEAL [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)
+ && (base.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOV(L|W|B|SS|SD|BLSX|WLSX)load [off1+off2] {mergeSym(sym1,sym2)} base mem)
+
+(MOV(L|W|B|SS|SD)store [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)
+ && (base.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOV(L|W|B|SS|SD)store [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+
+(MOV(L|W|B)storeconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && sc.canAdd32(off)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOV(L|W|B)storeconst [sc.addOffset32(off)] {mergeSym(sym1, sym2)} ptr mem)
+
+((ADD|SUB|MUL|AND|OR|XOR)Lload [off1] {sym1} val (LEAL [off2] {sym2} base) mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) =>
+ ((ADD|SUB|MUL|AND|OR|XOR)Lload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+((ADD|SUB|MUL|DIV)SSload [off1] {sym1} val (LEAL [off2] {sym2} base) mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) =>
+ ((ADD|SUB|MUL|DIV)SSload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+((ADD|SUB|MUL|DIV)SDload [off1] {sym1} val (LEAL [off2] {sym2} base) mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) =>
+ ((ADD|SUB|MUL|DIV)SDload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+((ADD|SUB|AND|OR|XOR)Lmodify [off1] {sym1} (LEAL [off2] {sym2} base) val mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) =>
+ ((ADD|SUB|AND|OR|XOR)Lmodify [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+((ADD|AND|OR|XOR)Lconstmodify [valoff1] {sym1} (LEAL [off2] {sym2} base) mem)
+ && valoff1.canAdd32(off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) =>
+ ((ADD|AND|OR|XOR)Lconstmodify [valoff1.addOffset32(off2)] {mergeSym(sym1,sym2)} base mem)
+
+// Merge a load into the op that uses it, and a load-op-store to the same address into a single modify op.
+((ADD|AND|OR|XOR|SUB|MUL)L x l:(MOVLload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|AND|OR|XOR|SUB|MUL)Lload x [off] {sym} ptr mem)
+((ADD|SUB|MUL|DIV)SD x l:(MOVSDload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|MUL|DIV)SDload x [off] {sym} ptr mem)
+((ADD|SUB|MUL|DIV)SS x l:(MOVSSload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|MUL|DIV)SSload x [off] {sym} ptr mem)
+(MOVLstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Lload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) => ((ADD|AND|OR|XOR)Lmodify [off] {sym} ptr x mem)
+(MOVLstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)L l:(MOVLload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) =>
+ ((ADD|SUB|AND|OR|XOR)Lmodify [off] {sym} ptr x mem)
+(MOVLstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Lconst [c] l:(MOVLload [off] {sym} ptr mem)) mem)
+ && y.Uses==1 && l.Uses==1 && clobber(y, l) =>
+ ((ADD|AND|OR|XOR)Lconstmodify [makeValAndOff(c,off)] {sym} ptr mem)
+
+// fold LEALs together
+(LEAL [off1] {sym1} (LEAL [off2] {sym2} x)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAL [off1+off2] {mergeSym(sym1,sym2)} x)
+
+// LEAL into LEAL1
+(LEAL1 [off1] {sym1} (LEAL [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB =>
+ (LEAL1 [off1+off2] {mergeSym(sym1,sym2)} x y)
+
+// LEAL1 into LEAL
+(LEAL [off1] {sym1} (LEAL1 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAL1 [off1+off2] {mergeSym(sym1,sym2)} x y)
+
+// LEAL into LEAL[248]
+(LEAL2 [off1] {sym1} (LEAL [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB =>
+ (LEAL2 [off1+off2] {mergeSym(sym1,sym2)} x y)
+(LEAL4 [off1] {sym1} (LEAL [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB =>
+ (LEAL4 [off1+off2] {mergeSym(sym1,sym2)} x y)
+(LEAL8 [off1] {sym1} (LEAL [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB =>
+ (LEAL8 [off1+off2] {mergeSym(sym1,sym2)} x y)
+
+// LEAL[248] into LEAL
+(LEAL [off1] {sym1} (LEAL2 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAL2 [off1+off2] {mergeSym(sym1,sym2)} x y)
+(LEAL [off1] {sym1} (LEAL4 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAL4 [off1+off2] {mergeSym(sym1,sym2)} x y)
+(LEAL [off1] {sym1} (LEAL8 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAL8 [off1+off2] {mergeSym(sym1,sym2)} x y)
+
+// LEAL[1248] into LEAL[1248]. Only some such merges are possible.
+(LEAL1 [off1] {sym1} x (LEAL1 [off2] {sym2} y y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAL2 [off1+off2] {mergeSym(sym1, sym2)} x y)
+(LEAL1 [off1] {sym1} x (LEAL1 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAL2 [off1+off2] {mergeSym(sym1, sym2)} y x)
+(LEAL2 [off1] {sym} x (LEAL1 [off2] {nil} y y)) && is32Bit(int64(off1)+2*int64(off2)) =>
+ (LEAL4 [off1+2*off2] {sym} x y)
+(LEAL4 [off1] {sym} x (LEAL1 [off2] {nil} y y)) && is32Bit(int64(off1)+4*int64(off2)) =>
+ (LEAL8 [off1+4*off2] {sym} x y)
+
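+// Absorb InvertFlags into branches by swapping each ordered condition with its mirror (LT<->GT, ULE<->UGE, ...); EQ and NE are unchanged.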
+(LT (InvertFlags cmp) yes no) => (GT cmp yes no)
+(GT (InvertFlags cmp) yes no) => (LT cmp yes no)
+(LE (InvertFlags cmp) yes no) => (GE cmp yes no)
+(GE (InvertFlags cmp) yes no) => (LE cmp yes no)
+(ULT (InvertFlags cmp) yes no) => (UGT cmp yes no)
+(UGT (InvertFlags cmp) yes no) => (ULT cmp yes no)
+(ULE (InvertFlags cmp) yes no) => (UGE cmp yes no)
+(UGE (InvertFlags cmp) yes no) => (ULE cmp yes no)
+(EQ (InvertFlags cmp) yes no) => (EQ cmp yes no)
+(NE (InvertFlags cmp) yes no) => (NE cmp yes no)
+
+// Constant comparisons.
+(CMPLconst (MOVLconst [x]) [y]) && x==y => (FlagEQ)
+(CMPLconst (MOVLconst [x]) [y]) && x<y && uint32(x)<uint32(y) => (FlagLT_ULT)
+(CMPLconst (MOVLconst [x]) [y]) && x<y && uint32(x)>uint32(y) => (FlagLT_UGT)
+(CMPLconst (MOVLconst [x]) [y]) && x>y && uint32(x)<uint32(y) => (FlagGT_ULT)
+(CMPLconst (MOVLconst [x]) [y]) && x>y && uint32(x)>uint32(y) => (FlagGT_UGT)
+
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)==y => (FlagEQ)
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)<y && uint16(x)<uint16(y) => (FlagLT_ULT)
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)<y && uint16(x)>uint16(y) => (FlagLT_UGT)
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)>y && uint16(x)<uint16(y) => (FlagGT_ULT)
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)>y && uint16(x)>uint16(y) => (FlagGT_UGT)
+
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)==y => (FlagEQ)
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)<y && uint8(x)<uint8(y) => (FlagLT_ULT)
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)<y && uint8(x)>uint8(y) => (FlagLT_UGT)
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)>y && uint8(x)<uint8(y) => (FlagGT_ULT)
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)>y && uint8(x)>uint8(y) => (FlagGT_UGT)
+
+// Other known comparisons.
+(CMPLconst (SHRLconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 32 && (1<<uint64(32-c)) <= uint64(n) => (FlagLT_ULT)
+(CMPLconst (ANDLconst _ [m]) [n]) && 0 <= m && m < n => (FlagLT_ULT)
+(CMPWconst (ANDLconst _ [m]) [n]) && 0 <= int16(m) && int16(m) < n => (FlagLT_ULT)
+(CMPBconst (ANDLconst _ [m]) [n]) && 0 <= int8(m) && int8(m) < n => (FlagLT_ULT)
+// TODO: DIVxU also.
+
+// Absorb flag constants into SBB ops.
+(SBBLcarrymask (FlagEQ)) => (MOVLconst [0])
+(SBBLcarrymask (FlagLT_ULT)) => (MOVLconst [-1])
+(SBBLcarrymask (FlagLT_UGT)) => (MOVLconst [0])
+(SBBLcarrymask (FlagGT_ULT)) => (MOVLconst [-1])
+(SBBLcarrymask (FlagGT_UGT)) => (MOVLconst [0])
+
+// Absorb flag constants into branches. The outcome is known, so the block becomes a First block whose first successor is the one taken.
+(EQ (FlagEQ) yes no) => (First yes no)
+(EQ (FlagLT_ULT) yes no) => (First no yes)
+(EQ (FlagLT_UGT) yes no) => (First no yes)
+(EQ (FlagGT_ULT) yes no) => (First no yes)
+(EQ (FlagGT_UGT) yes no) => (First no yes)
+
+(NE (FlagEQ) yes no) => (First no yes)
+(NE (FlagLT_ULT) yes no) => (First yes no)
+(NE (FlagLT_UGT) yes no) => (First yes no)
+(NE (FlagGT_ULT) yes no) => (First yes no)
+(NE (FlagGT_UGT) yes no) => (First yes no)
+
+(LT (FlagEQ) yes no) => (First no yes)
+(LT (FlagLT_ULT) yes no) => (First yes no)
+(LT (FlagLT_UGT) yes no) => (First yes no)
+(LT (FlagGT_ULT) yes no) => (First no yes)
+(LT (FlagGT_UGT) yes no) => (First no yes)
+
+(LE (FlagEQ) yes no) => (First yes no)
+(LE (FlagLT_ULT) yes no) => (First yes no)
+(LE (FlagLT_UGT) yes no) => (First yes no)
+(LE (FlagGT_ULT) yes no) => (First no yes)
+(LE (FlagGT_UGT) yes no) => (First no yes)
+
+(GT (FlagEQ) yes no) => (First no yes)
+(GT (FlagLT_ULT) yes no) => (First no yes)
+(GT (FlagLT_UGT) yes no) => (First no yes)
+(GT (FlagGT_ULT) yes no) => (First yes no)
+(GT (FlagGT_UGT) yes no) => (First yes no)
+
+(GE (FlagEQ) yes no) => (First yes no)
+(GE (FlagLT_ULT) yes no) => (First no yes)
+(GE (FlagLT_UGT) yes no) => (First no yes)
+(GE (FlagGT_ULT) yes no) => (First yes no)
+(GE (FlagGT_UGT) yes no) => (First yes no)
+
+(ULT (FlagEQ) yes no) => (First no yes)
+(ULT (FlagLT_ULT) yes no) => (First yes no)
+(ULT (FlagLT_UGT) yes no) => (First no yes)
+(ULT (FlagGT_ULT) yes no) => (First yes no)
+(ULT (FlagGT_UGT) yes no) => (First no yes)
+
+(ULE (FlagEQ) yes no) => (First yes no)
+(ULE (FlagLT_ULT) yes no) => (First yes no)
+(ULE (FlagLT_UGT) yes no) => (First no yes)
+(ULE (FlagGT_ULT) yes no) => (First yes no)
+(ULE (FlagGT_UGT) yes no) => (First no yes)
+
+(UGT (FlagEQ) yes no) => (First no yes)
+(UGT (FlagLT_ULT) yes no) => (First no yes)
+(UGT (FlagLT_UGT) yes no) => (First yes no)
+(UGT (FlagGT_ULT) yes no) => (First no yes)
+(UGT (FlagGT_UGT) yes no) => (First yes no)
+
+(UGE (FlagEQ) yes no) => (First yes no)
+(UGE (FlagLT_ULT) yes no) => (First no yes)
+(UGE (FlagLT_UGT) yes no) => (First yes no)
+(UGE (FlagGT_ULT) yes no) => (First no yes)
+(UGE (FlagGT_UGT) yes no) => (First yes no)
+
+// Absorb flag constants into SETxx ops.
+(SETEQ (FlagEQ)) => (MOVLconst [1])
+(SETEQ (FlagLT_ULT)) => (MOVLconst [0])
+(SETEQ (FlagLT_UGT)) => (MOVLconst [0])
+(SETEQ (FlagGT_ULT)) => (MOVLconst [0])
+(SETEQ (FlagGT_UGT)) => (MOVLconst [0])
+
+(SETNE (FlagEQ)) => (MOVLconst [0])
+(SETNE (FlagLT_ULT)) => (MOVLconst [1])
+(SETNE (FlagLT_UGT)) => (MOVLconst [1])
+(SETNE (FlagGT_ULT)) => (MOVLconst [1])
+(SETNE (FlagGT_UGT)) => (MOVLconst [1])
+
+(SETL (FlagEQ)) => (MOVLconst [0])
+(SETL (FlagLT_ULT)) => (MOVLconst [1])
+(SETL (FlagLT_UGT)) => (MOVLconst [1])
+(SETL (FlagGT_ULT)) => (MOVLconst [0])
+(SETL (FlagGT_UGT)) => (MOVLconst [0])
+
+(SETLE (FlagEQ)) => (MOVLconst [1])
+(SETLE (FlagLT_ULT)) => (MOVLconst [1])
+(SETLE (FlagLT_UGT)) => (MOVLconst [1])
+(SETLE (FlagGT_ULT)) => (MOVLconst [0])
+(SETLE (FlagGT_UGT)) => (MOVLconst [0])
+
+(SETG (FlagEQ)) => (MOVLconst [0])
+(SETG (FlagLT_ULT)) => (MOVLconst [0])
+(SETG (FlagLT_UGT)) => (MOVLconst [0])
+(SETG (FlagGT_ULT)) => (MOVLconst [1])
+(SETG (FlagGT_UGT)) => (MOVLconst [1])
+
+(SETGE (FlagEQ)) => (MOVLconst [1])
+(SETGE (FlagLT_ULT)) => (MOVLconst [0])
+(SETGE (FlagLT_UGT)) => (MOVLconst [0])
+(SETGE (FlagGT_ULT)) => (MOVLconst [1])
+(SETGE (FlagGT_UGT)) => (MOVLconst [1])
+
+(SETB (FlagEQ)) => (MOVLconst [0])
+(SETB (FlagLT_ULT)) => (MOVLconst [1])
+(SETB (FlagLT_UGT)) => (MOVLconst [0])
+(SETB (FlagGT_ULT)) => (MOVLconst [1])
+(SETB (FlagGT_UGT)) => (MOVLconst [0])
+
+(SETBE (FlagEQ)) => (MOVLconst [1])
+(SETBE (FlagLT_ULT)) => (MOVLconst [1])
+(SETBE (FlagLT_UGT)) => (MOVLconst [0])
+(SETBE (FlagGT_ULT)) => (MOVLconst [1])
+(SETBE (FlagGT_UGT)) => (MOVLconst [0])
+
+(SETA (FlagEQ)) => (MOVLconst [0])
+(SETA (FlagLT_ULT)) => (MOVLconst [0])
+(SETA (FlagLT_UGT)) => (MOVLconst [1])
+(SETA (FlagGT_ULT)) => (MOVLconst [0])
+(SETA (FlagGT_UGT)) => (MOVLconst [1])
+
+(SETAE (FlagEQ)) => (MOVLconst [1])
+(SETAE (FlagLT_ULT)) => (MOVLconst [0])
+(SETAE (FlagLT_UGT)) => (MOVLconst [1])
+(SETAE (FlagGT_ULT)) => (MOVLconst [0])
+(SETAE (FlagGT_UGT)) => (MOVLconst [1])
+
+// Remove redundant *const ops
+(ADDLconst [c] x) && c==0 => x
+(SUBLconst [c] x) && c==0 => x
+(ANDLconst [c] _) && c==0 => (MOVLconst [0])
+(ANDLconst [c] x) && c==-1 => x
+(ORLconst [c] x) && c==0 => x
+(ORLconst [c] _) && c==-1 => (MOVLconst [-1])
+(XORLconst [c] x) && c==0 => x
+// TODO: since we got rid of the W/B versions, we might miss
+// things like (ANDLconst [0x100] x) which were formerly
+// (ANDBconst [0] x). Probably doesn't happen very often.
+// If we cared, we might do:
+// (ANDLconst <t> [c] x) && t.Size()==1 && int8(c)==0 => (MOVLconst [0])
+
+// Convert constant subtracts to constant adds
+(SUBLconst [c] x) => (ADDLconst [-c] x)
+
+// generic constant folding
+// TODO: more of this
+(ADDLconst [c] (MOVLconst [d])) => (MOVLconst [c+d])
+(ADDLconst [c] (ADDLconst [d] x)) => (ADDLconst [c+d] x)
+(SARLconst [c] (MOVLconst [d])) => (MOVLconst [d>>uint64(c)])
+(SARWconst [c] (MOVLconst [d])) => (MOVLconst [d>>uint64(c)])
+(SARBconst [c] (MOVLconst [d])) => (MOVLconst [d>>uint64(c)])
+(NEGL (MOVLconst [c])) => (MOVLconst [-c])
+(MULLconst [c] (MOVLconst [d])) => (MOVLconst [c*d])
+(ANDLconst [c] (MOVLconst [d])) => (MOVLconst [c&d])
+(ORLconst [c] (MOVLconst [d])) => (MOVLconst [c|d])
+(XORLconst [c] (MOVLconst [d])) => (MOVLconst [c^d])
+(NOTL (MOVLconst [c])) => (MOVLconst [^c])
+
+// generic simplifications
+// TODO: more of this
+(ADDL x (NEGL y)) => (SUBL x y)
+(SUBL x x) => (MOVLconst [0])
+(ANDL x x) => x
+(ORL x x) => x
+(XORL x x) => (MOVLconst [0])
+
+// checking AND against 0.
+(CMP(L|W|B)const l:(ANDL x y) [0]) && l.Uses==1 => (TEST(L|W|B) x y)
+(CMPLconst l:(ANDLconst [c] x) [0]) && l.Uses==1 => (TESTLconst [c] x)
+(CMPWconst l:(ANDLconst [c] x) [0]) && l.Uses==1 => (TESTWconst [int16(c)] x)
+(CMPBconst l:(ANDLconst [c] x) [0]) && l.Uses==1 => (TESTBconst [int8(c)] x)
+
+// TEST %reg,%reg is shorter than CMP
+(CMP(L|W|B)const x [0]) => (TEST(L|W|B) x x)
+
+// Convert LEAL1 back to ADDL if we can
+(LEAL1 [0] {nil} x y) => (ADDL x y)
+
+// Combining byte loads into larger (unaligned) loads.
+// There are many ways these combinations could occur. This is
+// designed to match the way encoding/binary.LittleEndian does it.
+(ORL x0:(MOVBload [i0] {s} p mem)
+ s0:(SHLLconst [8] x1:(MOVBload [i1] {s} p mem)))
+ && i1 == i0+1
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, s0)
+ => @mergePoint(b,x0,x1) (MOVWload [i0] {s} p mem)
+
+(ORL x0:(MOVBload [i] {s} p0 mem)
+ s0:(SHLLconst [8] x1:(MOVBload [i] {s} p1 mem)))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, s0)
+ => @mergePoint(b,x0,x1) (MOVWload [i] {s} p0 mem)
+
+(ORL o0:(ORL
+ x0:(MOVWload [i0] {s} p mem)
+ s0:(SHLLconst [16] x1:(MOVBload [i2] {s} p mem)))
+ s1:(SHLLconst [24] x2:(MOVBload [i3] {s} p mem)))
+ && i2 == i0+2
+ && i3 == i0+3
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && o0.Uses == 1
+ && mergePoint(b,x0,x1,x2) != nil
+ && clobber(x0, x1, x2, s0, s1, o0)
+ => @mergePoint(b,x0,x1,x2) (MOVLload [i0] {s} p mem)
+
+(ORL o0:(ORL
+ x0:(MOVWload [i] {s} p0 mem)
+ s0:(SHLLconst [16] x1:(MOVBload [i] {s} p1 mem)))
+ s1:(SHLLconst [24] x2:(MOVBload [i] {s} p2 mem)))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && o0.Uses == 1
+ && sequentialAddresses(p0, p1, 2)
+ && sequentialAddresses(p1, p2, 1)
+ && mergePoint(b,x0,x1,x2) != nil
+ && clobber(x0, x1, x2, s0, s1, o0)
+ => @mergePoint(b,x0,x1,x2) (MOVLload [i] {s} p0 mem)
+
+// Combine constant stores into larger (unaligned) stores.
+(MOVBstoreconst [c] {s} p x:(MOVBstoreconst [a] {s} p mem))
+ && x.Uses == 1
+ && a.Off() + 1 == c.Off()
+ && clobber(x)
+ => (MOVWstoreconst [makeValAndOff(a.Val()&0xff | c.Val()<<8, a.Off())] {s} p mem)
+(MOVBstoreconst [a] {s} p x:(MOVBstoreconst [c] {s} p mem))
+ && x.Uses == 1
+ && a.Off() + 1 == c.Off()
+ && clobber(x)
+ => (MOVWstoreconst [makeValAndOff(a.Val()&0xff | c.Val()<<8, a.Off())] {s} p mem)
+
+(MOVBstoreconst [c] {s} p1 x:(MOVBstoreconst [a] {s} p0 mem))
+ && x.Uses == 1
+ && a.Off() == c.Off()
+ && sequentialAddresses(p0, p1, 1)
+ && clobber(x)
+ => (MOVWstoreconst [makeValAndOff(a.Val()&0xff | c.Val()<<8, a.Off())] {s} p0 mem)
+(MOVBstoreconst [a] {s} p0 x:(MOVBstoreconst [c] {s} p1 mem))
+ && x.Uses == 1
+ && a.Off() == c.Off()
+ && sequentialAddresses(p0, p1, 1)
+ && clobber(x)
+ => (MOVWstoreconst [makeValAndOff(a.Val()&0xff | c.Val()<<8, a.Off())] {s} p0 mem)
+
+(MOVWstoreconst [c] {s} p x:(MOVWstoreconst [a] {s} p mem))
+ && x.Uses == 1
+ && a.Off() + 2 == c.Off()
+ && clobber(x)
+ => (MOVLstoreconst [makeValAndOff(a.Val()&0xffff | c.Val()<<16, a.Off())] {s} p mem)
+(MOVWstoreconst [a] {s} p x:(MOVWstoreconst [c] {s} p mem))
+ && x.Uses == 1
+ && a.Off() + 2 == c.Off() // x's halfword sits directly above this one
+ && clobber(x)
+ => (MOVLstoreconst [makeValAndOff(a.Val()&0xffff | c.Val()<<16, a.Off())] {s} p mem)
+
+(MOVWstoreconst [c] {s} p1 x:(MOVWstoreconst [a] {s} p0 mem))
+ && x.Uses == 1
+ && a.Off() == c.Off()
+ && sequentialAddresses(p0, p1, 2)
+ && clobber(x)
+ => (MOVLstoreconst [makeValAndOff(a.Val()&0xffff | c.Val()<<16, a.Off())] {s} p0 mem)
+(MOVWstoreconst [a] {s} p0 x:(MOVWstoreconst [c] {s} p1 mem))
+ && x.Uses == 1
+ && a.Off() == c.Off()
+ && sequentialAddresses(p0, p1, 2)
+ && clobber(x)
+ => (MOVLstoreconst [makeValAndOff(a.Val()&0xffff | c.Val()<<16, a.Off())] {s} p0 mem)
+
+// Combine stores into larger (unaligned) stores.
+(MOVBstore [i] {s} p (SHR(W|L)const [8] w) x:(MOVBstore [i-1] {s} p w mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstore [i-1] {s} p w mem)
+(MOVBstore [i] {s} p w x:(MOVBstore {s} [i+1] p (SHR(W|L)const [8] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstore [i] {s} p w mem)
+(MOVBstore [i] {s} p (SHRLconst [j] w) x:(MOVBstore [i-1] {s} p w0:(SHRLconst [j-8] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstore [i-1] {s} p w0 mem)
+
+(MOVBstore [i] {s} p1 (SHR(W|L)const [8] w) x:(MOVBstore [i] {s} p0 w mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && clobber(x)
+ => (MOVWstore [i] {s} p0 w mem)
+(MOVBstore [i] {s} p0 w x:(MOVBstore {s} [i] p1 (SHR(W|L)const [8] w) mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && clobber(x)
+ => (MOVWstore [i] {s} p0 w mem)
+(MOVBstore [i] {s} p1 (SHRLconst [j] w) x:(MOVBstore [i] {s} p0 w0:(SHRLconst [j-8] w) mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && clobber(x)
+ => (MOVWstore [i] {s} p0 w0 mem)
+
+(MOVWstore [i] {s} p (SHRLconst [16] w) x:(MOVWstore [i-2] {s} p w mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVLstore [i-2] {s} p w mem)
+(MOVWstore [i] {s} p (SHRLconst [j] w) x:(MOVWstore [i-2] {s} p w0:(SHRLconst [j-16] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVLstore [i-2] {s} p w0 mem)
+
+(MOVWstore [i] {s} p1 (SHRLconst [16] w) x:(MOVWstore [i] {s} p0 w mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 2)
+ && clobber(x)
+ => (MOVLstore [i] {s} p0 w mem)
+(MOVWstore [i] {s} p1 (SHRLconst [j] w) x:(MOVWstore [i] {s} p0 w0:(SHRLconst [j-16] w) mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 2)
+ && clobber(x)
+ => (MOVLstore [i] {s} p0 w0 mem)
+
+// For PIC, break floating-point constant loading into two instructions so we have
+// a register to use for holding the address of the constant pool entry.
+(MOVSSconst [c]) && config.ctxt.Flag_shared => (MOVSSconst2 (MOVSSconst1 [c]))
+(MOVSDconst [c]) && config.ctxt.Flag_shared => (MOVSDconst2 (MOVSDconst1 [c]))
+
+(CMP(L|W|B) l:(MOV(L|W|B)load {sym} [off] ptr mem) x) && canMergeLoad(v, l) && clobber(l) => (CMP(L|W|B)load {sym} [off] ptr x mem)
+(CMP(L|W|B) x l:(MOV(L|W|B)load {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (InvertFlags (CMP(L|W|B)load {sym} [off] ptr x mem))
+
+(CMP(L|W|B)const l:(MOV(L|W|B)load {sym} [off] ptr mem) [c])
+ && l.Uses == 1
+ && clobber(l) =>
+ @l.Block (CMP(L|W|B)constload {sym} [makeValAndOff(int32(c),off)] ptr mem)
+
+(CMPLload {sym} [off] ptr (MOVLconst [c]) mem) => (CMPLconstload {sym} [makeValAndOff(c,off)] ptr mem)
+(CMPWload {sym} [off] ptr (MOVLconst [c]) mem) => (CMPWconstload {sym} [makeValAndOff(int32(int16(c)),off)] ptr mem)
+(CMPBload {sym} [off] ptr (MOVLconst [c]) mem) => (CMPBconstload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
+
+(MOVBload [off] {sym} (SB) _) && symIsRO(sym) => (MOVLconst [int32(read8(sym, int64(off)))])
+(MOVWload [off] {sym} (SB) _) && symIsRO(sym) => (MOVLconst [int32(read16(sym, int64(off), config.ctxt.Arch.ByteOrder))])
+(MOVLload [off] {sym} (SB) _) && symIsRO(sym) => (MOVLconst [int32(read32(sym, int64(off), config.ctxt.Arch.ByteOrder))])
diff --git a/src/cmd/compile/internal/ssa/_gen/386Ops.go b/src/cmd/compile/internal/ssa/_gen/386Ops.go
new file mode 100644
index 0000000..c66650c
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/386Ops.go
@@ -0,0 +1,588 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import "strings"
+
+// Notes:
+// - Integer types live in the low portion of registers. Upper portions are junk.
+// - Boolean types use the low-order byte of a register. 0=false, 1=true.
+// Upper bytes are junk.
+// - Floating-point types live in the low natural slot of an sse2 register.
+// Unused portions are junk.
+// - We do not use AH,BH,CH,DH registers.
+// - When doing sub-register operations, we try to write the whole
+// destination register to avoid a partial-register write.
+// - Unused portions of AuxInt (or the Val portion of ValAndOff) are
+// filled by sign-extending the used portion. Users of AuxInt which interpret
+// AuxInt as unsigned (e.g. shifts) must be careful.
+
+// Suffixes encode the bit width of various instructions.
+// L (long word) = 32 bit
+// W (word) = 16 bit
+// B (byte) = 8 bit
+
+// regNames386 lists the 386 register names; init numbers each register by its index here. Copied from ../../x86/reg.go
+var regNames386 = []string{
+ "AX",
+ "CX",
+ "DX",
+ "BX",
+ "SP",
+ "BP",
+ "SI",
+ "DI",
+ "X0",
+ "X1",
+ "X2",
+ "X3",
+ "X4",
+ "X5",
+ "X6",
+ "X7",
+
+ // If you add registers, update asyncPreempt in runtime
+
+ // pseudo-registers
+ "SB",
+}
+
+func init() {
+ // Make map from reg names to reg integers.
+ if len(regNames386) > 64 {
+ panic("too many registers")
+ }
+ num := map[string]int{}
+ for i, name := range regNames386 {
+ num[name] = i
+ }
+ buildReg := func(s string) regMask {
+ m := regMask(0)
+ for _, r := range strings.Split(s, " ") {
+ if n, ok := num[r]; ok {
+ m |= regMask(1) << uint(n)
+ continue
+ }
+ panic("register " + r + " not found")
+ }
+ return m
+ }
+
+ // Common individual register masks
+ var (
+ ax = buildReg("AX")
+ cx = buildReg("CX")
+ dx = buildReg("DX")
+ bx = buildReg("BX")
+ si = buildReg("SI")
+ gp = buildReg("AX CX DX BX BP SI DI")
+ fp = buildReg("X0 X1 X2 X3 X4 X5 X6 X7")
+ gpsp = gp | buildReg("SP")
+ gpspsb = gpsp | buildReg("SB")
+ callerSave = gp | fp
+ )
+ // Common slices of register masks
+ var (
+ gponly = []regMask{gp}
+ fponly = []regMask{fp}
+ )
+
+ // Common regInfo
+ var (
+ gp01 = regInfo{inputs: nil, outputs: gponly}
+ gp11 = regInfo{inputs: []regMask{gp}, outputs: gponly}
+ gp11sp = regInfo{inputs: []regMask{gpsp}, outputs: gponly}
+ gp11sb = regInfo{inputs: []regMask{gpspsb}, outputs: gponly}
+ gp21 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
+ gp11carry = regInfo{inputs: []regMask{gp}, outputs: []regMask{gp, 0}}
+ gp21carry = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp, 0}}
+ gp1carry1 = regInfo{inputs: []regMask{gp}, outputs: gponly}
+ gp2carry1 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
+ gp21sp = regInfo{inputs: []regMask{gpsp, gp}, outputs: gponly}
+ gp21sb = regInfo{inputs: []regMask{gpspsb, gpsp}, outputs: gponly}
+ gp21shift = regInfo{inputs: []regMask{gp, cx}, outputs: []regMask{gp}}
+ gp11div = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{ax}, clobbers: dx}
+ gp21hmul = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx}, clobbers: ax}
+ gp11mod = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{dx}, clobbers: ax}
+ gp21mul = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx, ax}}
+
+ gp2flags = regInfo{inputs: []regMask{gpsp, gpsp}}
+ gp1flags = regInfo{inputs: []regMask{gpsp}}
+ gp0flagsLoad = regInfo{inputs: []regMask{gpspsb, 0}}
+ gp1flagsLoad = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}
+ flagsgp = regInfo{inputs: nil, outputs: gponly}
+
+ readflags = regInfo{inputs: nil, outputs: gponly}
+ flagsgpax = regInfo{inputs: nil, clobbers: ax, outputs: []regMask{gp &^ ax}}
+
+ gpload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: gponly}
+ gp21load = regInfo{inputs: []regMask{gp, gpspsb, 0}, outputs: gponly}
+ gploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: gponly}
+ gp21loadidx = regInfo{inputs: []regMask{gp, gpspsb, gpsp, 0}, outputs: gponly}
+
+ gpstore = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}
+ gpstoreconst = regInfo{inputs: []regMask{gpspsb, 0}}
+ gpstoreidx = regInfo{inputs: []regMask{gpspsb, gpsp, gpsp, 0}}
+ gpstoreconstidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}
+
+ fp01 = regInfo{inputs: nil, outputs: fponly}
+ fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: fponly}
+ fp21load = regInfo{inputs: []regMask{fp, gpspsb, 0}, outputs: fponly}
+ fpgp = regInfo{inputs: fponly, outputs: gponly}
+ gpfp = regInfo{inputs: gponly, outputs: fponly}
+ fp11 = regInfo{inputs: fponly, outputs: fponly}
+ fp2flags = regInfo{inputs: []regMask{fp, fp}}
+
+ fpload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: fponly}
+ fploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: fponly}
+
+ fpstore = regInfo{inputs: []regMask{gpspsb, fp, 0}}
+ fpstoreidx = regInfo{inputs: []regMask{gpspsb, gpsp, fp, 0}}
+ )
+
+ var _386ops = []opData{
+ // fp ops
+ {name: "ADDSS", argLength: 2, reg: fp21, asm: "ADDSS", commutative: true, resultInArg0: true}, // fp32 add
+ {name: "ADDSD", argLength: 2, reg: fp21, asm: "ADDSD", commutative: true, resultInArg0: true}, // fp64 add
+ {name: "SUBSS", argLength: 2, reg: fp21, asm: "SUBSS", resultInArg0: true}, // fp32 sub
+ {name: "SUBSD", argLength: 2, reg: fp21, asm: "SUBSD", resultInArg0: true}, // fp64 sub
+ {name: "MULSS", argLength: 2, reg: fp21, asm: "MULSS", commutative: true, resultInArg0: true}, // fp32 mul
+ {name: "MULSD", argLength: 2, reg: fp21, asm: "MULSD", commutative: true, resultInArg0: true}, // fp64 mul
+ {name: "DIVSS", argLength: 2, reg: fp21, asm: "DIVSS", resultInArg0: true}, // fp32 div
+ {name: "DIVSD", argLength: 2, reg: fp21, asm: "DIVSD", resultInArg0: true}, // fp64 div
+
+ {name: "MOVSSload", argLength: 2, reg: fpload, asm: "MOVSS", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // fp32 load
+ {name: "MOVSDload", argLength: 2, reg: fpload, asm: "MOVSD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // fp64 load
+ {name: "MOVSSconst", reg: fp01, asm: "MOVSS", aux: "Float32", rematerializeable: true}, // fp32 constant
+ {name: "MOVSDconst", reg: fp01, asm: "MOVSD", aux: "Float64", rematerializeable: true}, // fp64 constant
+ {name: "MOVSSloadidx1", argLength: 3, reg: fploadidx, asm: "MOVSS", aux: "SymOff", symEffect: "Read"}, // fp32 load indexed by i
+ {name: "MOVSSloadidx4", argLength: 3, reg: fploadidx, asm: "MOVSS", aux: "SymOff", symEffect: "Read"}, // fp32 load indexed by 4*i
+ {name: "MOVSDloadidx1", argLength: 3, reg: fploadidx, asm: "MOVSD", aux: "SymOff", symEffect: "Read"}, // fp64 load indexed by i
+ {name: "MOVSDloadidx8", argLength: 3, reg: fploadidx, asm: "MOVSD", aux: "SymOff", symEffect: "Read"}, // fp64 load indexed by 8*i
+
+ {name: "MOVSSstore", argLength: 3, reg: fpstore, asm: "MOVSS", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // fp32 store
+ {name: "MOVSDstore", argLength: 3, reg: fpstore, asm: "MOVSD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // fp64 store
+ {name: "MOVSSstoreidx1", argLength: 4, reg: fpstoreidx, asm: "MOVSS", aux: "SymOff", symEffect: "Write"}, // fp32 indexed by i store
+ {name: "MOVSSstoreidx4", argLength: 4, reg: fpstoreidx, asm: "MOVSS", aux: "SymOff", symEffect: "Write"}, // fp32 indexed by 4i store
+ {name: "MOVSDstoreidx1", argLength: 4, reg: fpstoreidx, asm: "MOVSD", aux: "SymOff", symEffect: "Write"}, // fp64 indexed by i store
+ {name: "MOVSDstoreidx8", argLength: 4, reg: fpstoreidx, asm: "MOVSD", aux: "SymOff", symEffect: "Write"}, // fp64 indexed by 8i store
+
+ {name: "ADDSSload", argLength: 3, reg: fp21load, asm: "ADDSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 + tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "ADDSDload", argLength: 3, reg: fp21load, asm: "ADDSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 + tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "SUBSSload", argLength: 3, reg: fp21load, asm: "SUBSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 - tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "SUBSDload", argLength: 3, reg: fp21load, asm: "SUBSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 - tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "MULSSload", argLength: 3, reg: fp21load, asm: "MULSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 * tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "MULSDload", argLength: 3, reg: fp21load, asm: "MULSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 * tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "DIVSSload", argLength: 3, reg: fp21load, asm: "DIVSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 / tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "DIVSDload", argLength: 3, reg: fp21load, asm: "DIVSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 / tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+
+ // binary ops
+ {name: "ADDL", argLength: 2, reg: gp21sp, asm: "ADDL", commutative: true, clobberFlags: true}, // arg0 + arg1
+ {name: "ADDLconst", argLength: 1, reg: gp11sp, asm: "ADDL", aux: "Int32", typ: "UInt32", clobberFlags: true}, // arg0 + auxint
+
+ {name: "ADDLcarry", argLength: 2, reg: gp21carry, asm: "ADDL", commutative: true, resultInArg0: true}, // arg0 + arg1, generates <carry,result> pair
+ {name: "ADDLconstcarry", argLength: 1, reg: gp11carry, asm: "ADDL", aux: "Int32", resultInArg0: true}, // arg0 + auxint, generates <carry,result> pair
+ {name: "ADCL", argLength: 3, reg: gp2carry1, asm: "ADCL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0+arg1+carry(arg2), where arg2 is flags
+ {name: "ADCLconst", argLength: 2, reg: gp1carry1, asm: "ADCL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0+auxint+carry(arg1), where arg1 is flags
+
+ {name: "SUBL", argLength: 2, reg: gp21, asm: "SUBL", resultInArg0: true, clobberFlags: true}, // arg0 - arg1
+ {name: "SUBLconst", argLength: 1, reg: gp11, asm: "SUBL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 - auxint
+
+ {name: "SUBLcarry", argLength: 2, reg: gp21carry, asm: "SUBL", resultInArg0: true}, // arg0-arg1, generates <borrow,result> pair
+ {name: "SUBLconstcarry", argLength: 1, reg: gp11carry, asm: "SUBL", aux: "Int32", resultInArg0: true}, // arg0-auxint, generates <borrow,result> pair
+ {name: "SBBL", argLength: 3, reg: gp2carry1, asm: "SBBL", resultInArg0: true, clobberFlags: true}, // arg0-arg1-borrow(arg2), where arg2 is flags
+ {name: "SBBLconst", argLength: 2, reg: gp1carry1, asm: "SBBL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0-auxint-borrow(arg1), where arg1 is flags
+
+ {name: "MULL", argLength: 2, reg: gp21, asm: "IMULL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 * arg1
+ {name: "MULLconst", argLength: 1, reg: gp11, asm: "IMUL3L", aux: "Int32", clobberFlags: true}, // arg0 * auxint
+
+ {name: "MULLU", argLength: 2, reg: regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{ax, 0}, clobbers: dx}, typ: "(UInt32,Flags)", asm: "MULL", commutative: true, clobberFlags: true}, // Let x = arg0*arg1 (full 32x32->64 unsigned multiply). Returns uint32(x), and flags set to overflow if uint32(x) != x.
+
+ {name: "HMULL", argLength: 2, reg: gp21hmul, commutative: true, asm: "IMULL", clobberFlags: true}, // (arg0 * arg1) >> width
+ {name: "HMULLU", argLength: 2, reg: gp21hmul, commutative: true, asm: "MULL", clobberFlags: true}, // (arg0 * arg1) >> width
+
+ {name: "MULLQU", argLength: 2, reg: gp21mul, commutative: true, asm: "MULL", clobberFlags: true}, // arg0 * arg1, high 32 in result[0], low 32 in result[1]
+
+ {name: "AVGLU", argLength: 2, reg: gp21, commutative: true, resultInArg0: true, clobberFlags: true}, // (arg0 + arg1) / 2 as unsigned, all 32 result bits
+
+ // For DIVL, DIVW, MODL and MODW, AuxInt non-zero means that the divisor has been proved to be not -1.
+ {name: "DIVL", argLength: 2, reg: gp11div, asm: "IDIVL", aux: "Bool", clobberFlags: true}, // arg0 / arg1
+ {name: "DIVW", argLength: 2, reg: gp11div, asm: "IDIVW", aux: "Bool", clobberFlags: true}, // arg0 / arg1
+ {name: "DIVLU", argLength: 2, reg: gp11div, asm: "DIVL", clobberFlags: true}, // arg0 / arg1
+ {name: "DIVWU", argLength: 2, reg: gp11div, asm: "DIVW", clobberFlags: true}, // arg0 / arg1
+
+ {name: "MODL", argLength: 2, reg: gp11mod, asm: "IDIVL", aux: "Bool", clobberFlags: true}, // arg0 % arg1
+ {name: "MODW", argLength: 2, reg: gp11mod, asm: "IDIVW", aux: "Bool", clobberFlags: true}, // arg0 % arg1
+ {name: "MODLU", argLength: 2, reg: gp11mod, asm: "DIVL", clobberFlags: true}, // arg0 % arg1
+ {name: "MODWU", argLength: 2, reg: gp11mod, asm: "DIVW", clobberFlags: true}, // arg0 % arg1
+
+ {name: "ANDL", argLength: 2, reg: gp21, asm: "ANDL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 & arg1
+ {name: "ANDLconst", argLength: 1, reg: gp11, asm: "ANDL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 & auxint
+
+ {name: "ORL", argLength: 2, reg: gp21, asm: "ORL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 | arg1
+ {name: "ORLconst", argLength: 1, reg: gp11, asm: "ORL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 | auxint
+
+ {name: "XORL", argLength: 2, reg: gp21, asm: "XORL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 ^ arg1
+ {name: "XORLconst", argLength: 1, reg: gp11, asm: "XORL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 ^ auxint
+
+ {name: "CMPL", argLength: 2, reg: gp2flags, asm: "CMPL", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPW", argLength: 2, reg: gp2flags, asm: "CMPW", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPB", argLength: 2, reg: gp2flags, asm: "CMPB", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPLconst", argLength: 1, reg: gp1flags, asm: "CMPL", typ: "Flags", aux: "Int32"}, // arg0 compare to auxint
+ {name: "CMPWconst", argLength: 1, reg: gp1flags, asm: "CMPW", typ: "Flags", aux: "Int16"}, // arg0 compare to auxint
+ {name: "CMPBconst", argLength: 1, reg: gp1flags, asm: "CMPB", typ: "Flags", aux: "Int8"}, // arg0 compare to auxint
+
+ // compare *(arg0+auxint+aux) to arg1 (in that order). arg2=mem.
+ {name: "CMPLload", argLength: 3, reg: gp1flagsLoad, asm: "CMPL", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+ {name: "CMPWload", argLength: 3, reg: gp1flagsLoad, asm: "CMPW", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+ {name: "CMPBload", argLength: 3, reg: gp1flagsLoad, asm: "CMPB", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+
+ // compare *(arg0+ValAndOff(AuxInt).Off()+aux) to ValAndOff(AuxInt).Val() (in that order). arg1=mem.
+ {name: "CMPLconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPL", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+ {name: "CMPWconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPW", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+ {name: "CMPBconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPB", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+
+ {name: "UCOMISS", argLength: 2, reg: fp2flags, asm: "UCOMISS", typ: "Flags"}, // arg0 compare to arg1, f32
+ {name: "UCOMISD", argLength: 2, reg: fp2flags, asm: "UCOMISD", typ: "Flags"}, // arg0 compare to arg1, f64
+
+ {name: "TESTL", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTL", typ: "Flags"}, // (arg0 & arg1) compare to 0
+ {name: "TESTW", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTW", typ: "Flags"}, // (arg0 & arg1) compare to 0
+ {name: "TESTB", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTB", typ: "Flags"}, // (arg0 & arg1) compare to 0
+ {name: "TESTLconst", argLength: 1, reg: gp1flags, asm: "TESTL", typ: "Flags", aux: "Int32"}, // (arg0 & auxint) compare to 0
+ {name: "TESTWconst", argLength: 1, reg: gp1flags, asm: "TESTW", typ: "Flags", aux: "Int16"}, // (arg0 & auxint) compare to 0
+ {name: "TESTBconst", argLength: 1, reg: gp1flags, asm: "TESTB", typ: "Flags", aux: "Int8"}, // (arg0 & auxint) compare to 0
+
+ {name: "SHLL", argLength: 2, reg: gp21shift, asm: "SHLL", resultInArg0: true, clobberFlags: true}, // arg0 << arg1, shift amount is mod 32
+ {name: "SHLLconst", argLength: 1, reg: gp11, asm: "SHLL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 << auxint, shift amount 0-31
+ // Note: x86 is weird, the 16-bit and 8-bit shifts still use all 5 bits of shift amount!
+
+ {name: "SHRL", argLength: 2, reg: gp21shift, asm: "SHRL", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> arg1, shift amount is mod 32
+ {name: "SHRW", argLength: 2, reg: gp21shift, asm: "SHRW", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> arg1, shift amount is mod 32
+ {name: "SHRB", argLength: 2, reg: gp21shift, asm: "SHRB", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> arg1, shift amount is mod 32
+ {name: "SHRLconst", argLength: 1, reg: gp11, asm: "SHRL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> auxint, shift amount 0-31
+ {name: "SHRWconst", argLength: 1, reg: gp11, asm: "SHRW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> auxint, shift amount 0-15
+ {name: "SHRBconst", argLength: 1, reg: gp11, asm: "SHRB", aux: "Int8", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> auxint, shift amount 0-7
+
+ {name: "SARL", argLength: 2, reg: gp21shift, asm: "SARL", resultInArg0: true, clobberFlags: true}, // signed arg0 >> arg1, shift amount is mod 32
+ {name: "SARW", argLength: 2, reg: gp21shift, asm: "SARW", resultInArg0: true, clobberFlags: true}, // signed arg0 >> arg1, shift amount is mod 32
+ {name: "SARB", argLength: 2, reg: gp21shift, asm: "SARB", resultInArg0: true, clobberFlags: true}, // signed arg0 >> arg1, shift amount is mod 32
+ {name: "SARLconst", argLength: 1, reg: gp11, asm: "SARL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-31
+ {name: "SARWconst", argLength: 1, reg: gp11, asm: "SARW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-15
+ {name: "SARBconst", argLength: 1, reg: gp11, asm: "SARB", aux: "Int8", resultInArg0: true, clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-7
+
+ {name: "ROLL", argLength: 2, reg: gp21shift, asm: "ROLL", resultInArg0: true, clobberFlags: true}, // 32 bits of arg0 rotate left by arg1
+ {name: "ROLW", argLength: 2, reg: gp21shift, asm: "ROLW", resultInArg0: true, clobberFlags: true}, // low 16 bits of arg0 rotate left by arg1
+ {name: "ROLB", argLength: 2, reg: gp21shift, asm: "ROLB", resultInArg0: true, clobberFlags: true}, // low 8 bits of arg0 rotate left by arg1
+ {name: "ROLLconst", argLength: 1, reg: gp11, asm: "ROLL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-31
+ {name: "ROLWconst", argLength: 1, reg: gp11, asm: "ROLW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-15
+ {name: "ROLBconst", argLength: 1, reg: gp11, asm: "ROLB", aux: "Int8", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-7
+
+ // binary-op with a memory source operand
+ {name: "ADDLload", argLength: 3, reg: gp21load, asm: "ADDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 + tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "SUBLload", argLength: 3, reg: gp21load, asm: "SUBL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 - tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "MULLload", argLength: 3, reg: gp21load, asm: "IMULL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 * tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "ANDLload", argLength: 3, reg: gp21load, asm: "ANDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 & tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "ORLload", argLength: 3, reg: gp21load, asm: "ORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 | tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "XORLload", argLength: 3, reg: gp21load, asm: "XORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+
+ // binary-op with an indexed memory source operand
+ {name: "ADDLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ADDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 + tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem
+ {name: "SUBLloadidx4", argLength: 4, reg: gp21loadidx, asm: "SUBL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 - tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem
+ {name: "MULLloadidx4", argLength: 4, reg: gp21loadidx, asm: "IMULL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 * tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem
+ {name: "ANDLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ANDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 & tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem
+ {name: "ORLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 | tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem
+ {name: "XORLloadidx4", argLength: 4, reg: gp21loadidx, asm: "XORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem
+
+ // unary ops
+ {name: "NEGL", argLength: 1, reg: gp11, asm: "NEGL", resultInArg0: true, clobberFlags: true}, // -arg0
+
+ {name: "NOTL", argLength: 1, reg: gp11, asm: "NOTL", resultInArg0: true}, // ^arg0
+
+ {name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
+ {name: "BSFW", argLength: 1, reg: gp11, asm: "BSFW", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
+
+ {name: "BSRL", argLength: 1, reg: gp11, asm: "BSRL", clobberFlags: true}, // index of the highest set bit in arg0 ; undef if zero
+ {name: "BSRW", argLength: 1, reg: gp11, asm: "BSRW", clobberFlags: true}, // index of the highest set bit in the low 16 bits of arg0 ; undef if zero
+
+ {name: "BSWAPL", argLength: 1, reg: gp11, asm: "BSWAPL", resultInArg0: true}, // arg0 swap bytes
+
+ {name: "SQRTSD", argLength: 1, reg: fp11, asm: "SQRTSD"}, // sqrt(arg0)
+ {name: "SQRTSS", argLength: 1, reg: fp11, asm: "SQRTSS"}, // sqrt(arg0), float32
+
+ {name: "SBBLcarrymask", argLength: 1, reg: flagsgp, asm: "SBBL"}, // (int32)(-1) if carry is set, 0 if carry is clear.
+ // Note: SBBW and SBBB are subsumed by SBBL
+
+ {name: "SETEQ", argLength: 1, reg: readflags, asm: "SETEQ"}, // extract == condition from arg0
+ {name: "SETNE", argLength: 1, reg: readflags, asm: "SETNE"}, // extract != condition from arg0
+ {name: "SETL", argLength: 1, reg: readflags, asm: "SETLT"}, // extract signed < condition from arg0
+ {name: "SETLE", argLength: 1, reg: readflags, asm: "SETLE"}, // extract signed <= condition from arg0
+ {name: "SETG", argLength: 1, reg: readflags, asm: "SETGT"}, // extract signed > condition from arg0
+ {name: "SETGE", argLength: 1, reg: readflags, asm: "SETGE"}, // extract signed >= condition from arg0
+ {name: "SETB", argLength: 1, reg: readflags, asm: "SETCS"}, // extract unsigned < condition from arg0
+ {name: "SETBE", argLength: 1, reg: readflags, asm: "SETLS"}, // extract unsigned <= condition from arg0
+ {name: "SETA", argLength: 1, reg: readflags, asm: "SETHI"}, // extract unsigned > condition from arg0
+ {name: "SETAE", argLength: 1, reg: readflags, asm: "SETCC"}, // extract unsigned >= condition from arg0
+ {name: "SETO", argLength: 1, reg: readflags, asm: "SETOS"}, // extract if overflow flag is set from arg0
+ // Need different opcodes for floating point conditions because
+ // any comparison involving a NaN is always FALSE and thus
+ // the patterns for inverting conditions cannot be used.
+ {name: "SETEQF", argLength: 1, reg: flagsgpax, asm: "SETEQ", clobberFlags: true}, // extract == condition from arg0
+ {name: "SETNEF", argLength: 1, reg: flagsgpax, asm: "SETNE", clobberFlags: true}, // extract != condition from arg0
+ {name: "SETORD", argLength: 1, reg: flagsgp, asm: "SETPC"}, // extract "ordered" (no NaN present) condition from arg0
+ {name: "SETNAN", argLength: 1, reg: flagsgp, asm: "SETPS"}, // extract "unordered" (NaN present) condition from arg0
+
+ {name: "SETGF", argLength: 1, reg: flagsgp, asm: "SETHI"}, // extract floating > condition from arg0
+ {name: "SETGEF", argLength: 1, reg: flagsgp, asm: "SETCC"}, // extract floating >= condition from arg0
+
+ {name: "MOVBLSX", argLength: 1, reg: gp11, asm: "MOVBLSX"}, // sign extend arg0 from int8 to int32
+ {name: "MOVBLZX", argLength: 1, reg: gp11, asm: "MOVBLZX"}, // zero extend arg0 from int8 to int32
+ {name: "MOVWLSX", argLength: 1, reg: gp11, asm: "MOVWLSX"}, // sign extend arg0 from int16 to int32
+ {name: "MOVWLZX", argLength: 1, reg: gp11, asm: "MOVWLZX"}, // zero extend arg0 from int16 to int32
+
+ {name: "MOVLconst", reg: gp01, asm: "MOVL", typ: "UInt32", aux: "Int32", rematerializeable: true}, // 32 low bits of auxint
+
+ {name: "CVTTSD2SL", argLength: 1, reg: fpgp, asm: "CVTTSD2SL"}, // convert float64 to int32
+ {name: "CVTTSS2SL", argLength: 1, reg: fpgp, asm: "CVTTSS2SL"}, // convert float32 to int32
+ {name: "CVTSL2SS", argLength: 1, reg: gpfp, asm: "CVTSL2SS"}, // convert int32 to float32
+ {name: "CVTSL2SD", argLength: 1, reg: gpfp, asm: "CVTSL2SD"}, // convert int32 to float64
+ {name: "CVTSD2SS", argLength: 1, reg: fp11, asm: "CVTSD2SS"}, // convert float64 to float32
+ {name: "CVTSS2SD", argLength: 1, reg: fp11, asm: "CVTSS2SD"}, // convert float32 to float64
+
+ {name: "PXOR", argLength: 2, reg: fp21, asm: "PXOR", commutative: true, resultInArg0: true}, // exclusive or, applied to X regs for float negation.
+
+ {name: "LEAL", argLength: 1, reg: gp11sb, aux: "SymOff", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxint + offset encoded in aux
+ {name: "LEAL1", argLength: 2, reg: gp21sb, commutative: true, aux: "SymOff", symEffect: "Addr"}, // arg0 + arg1 + auxint + aux
+ {name: "LEAL2", argLength: 2, reg: gp21sb, aux: "SymOff", symEffect: "Addr"}, // arg0 + 2*arg1 + auxint + aux
+ {name: "LEAL4", argLength: 2, reg: gp21sb, aux: "SymOff", symEffect: "Addr"}, // arg0 + 4*arg1 + auxint + aux
+ {name: "LEAL8", argLength: 2, reg: gp21sb, aux: "SymOff", symEffect: "Addr"}, // arg0 + 8*arg1 + auxint + aux
+ // Note: LEAL{1,2,4,8} must not have OpSB as either argument.
+
+ // auxint+aux == add auxint and the offset of the symbol in aux (if any) to the effective address
+ {name: "MOVBload", argLength: 2, reg: gpload, asm: "MOVBLZX", aux: "SymOff", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load byte from arg0+auxint+aux. arg1=mem. Zero extend.
+ {name: "MOVBLSXload", argLength: 2, reg: gpload, asm: "MOVBLSX", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // ditto, sign extend to int32
+ {name: "MOVWload", argLength: 2, reg: gpload, asm: "MOVWLZX", aux: "SymOff", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load 2 bytes from arg0+auxint+aux. arg1=mem. Zero extend.
+ {name: "MOVWLSXload", argLength: 2, reg: gpload, asm: "MOVWLSX", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // ditto, sign extend to int32
+ {name: "MOVLload", argLength: 2, reg: gpload, asm: "MOVL", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load 4 bytes from arg0+auxint+aux. arg1=mem. Zero extend.
+ {name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store byte in arg1 to arg0+auxint+aux. arg2=mem
+ {name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem
+ {name: "MOVLstore", argLength: 3, reg: gpstore, asm: "MOVL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem
+
+ // direct binary-op on memory (read-modify-write)
+ {name: "ADDLmodify", argLength: 3, reg: gpstore, asm: "ADDL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) += arg1, arg2=mem
+ {name: "SUBLmodify", argLength: 3, reg: gpstore, asm: "SUBL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) -= arg1, arg2=mem
+ {name: "ANDLmodify", argLength: 3, reg: gpstore, asm: "ANDL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) &= arg1, arg2=mem
+ {name: "ORLmodify", argLength: 3, reg: gpstore, asm: "ORL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) |= arg1, arg2=mem
+ {name: "XORLmodify", argLength: 3, reg: gpstore, asm: "XORL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) ^= arg1, arg2=mem
+
+ // direct binary-op on indexed memory (read-modify-write)
+ {name: "ADDLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ADDL", aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+arg1*4+auxint+aux) += arg2, arg3=mem
+ {name: "SUBLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "SUBL", aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+arg1*4+auxint+aux) -= arg2, arg3=mem
+ {name: "ANDLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ANDL", aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+arg1*4+auxint+aux) &= arg2, arg3=mem
+ {name: "ORLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ORL", aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+arg1*4+auxint+aux) |= arg2, arg3=mem
+ {name: "XORLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "XORL", aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+arg1*4+auxint+aux) ^= arg2, arg3=mem
+
+ // direct binary-op on memory with a constant (read-modify-write)
+ {name: "ADDLconstmodify", argLength: 2, reg: gpstoreconst, asm: "ADDL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // add ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+ {name: "ANDLconstmodify", argLength: 2, reg: gpstoreconst, asm: "ANDL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // and ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+ {name: "ORLconstmodify", argLength: 2, reg: gpstoreconst, asm: "ORL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // or ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+ {name: "XORLconstmodify", argLength: 2, reg: gpstoreconst, asm: "XORL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // xor ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+
+ // direct binary-op on indexed memory with a constant (read-modify-write)
+ {name: "ADDLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ADDL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // add ValAndOff(AuxInt).Val() to arg0+arg1*4+ValAndOff(AuxInt).Off()+aux, arg2=mem
+ {name: "ANDLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ANDL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // and ValAndOff(AuxInt).Val() to arg0+arg1*4+ValAndOff(AuxInt).Off()+aux, arg2=mem
+ {name: "ORLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ORL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // or ValAndOff(AuxInt).Val() to arg0+arg1*4+ValAndOff(AuxInt).Off()+aux, arg2=mem
+ {name: "XORLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "XORL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // xor ValAndOff(AuxInt).Val() to arg0+arg1*4+ValAndOff(AuxInt).Off()+aux, arg2=mem
+
+ // indexed loads/stores
+ {name: "MOVBloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVBLZX", aux: "SymOff", symEffect: "Read"}, // load a byte from arg0+arg1+auxint+aux. arg2=mem
+ {name: "MOVWloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVWLZX", aux: "SymOff", symEffect: "Read"}, // load 2 bytes from arg0+arg1+auxint+aux. arg2=mem
+ {name: "MOVWloadidx2", argLength: 3, reg: gploadidx, asm: "MOVWLZX", aux: "SymOff", symEffect: "Read"}, // load 2 bytes from arg0+2*arg1+auxint+aux. arg2=mem
+ {name: "MOVLloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVL", aux: "SymOff", symEffect: "Read"}, // load 4 bytes from arg0+arg1+auxint+aux. arg2=mem
+ {name: "MOVLloadidx4", argLength: 3, reg: gploadidx, asm: "MOVL", aux: "SymOff", symEffect: "Read"}, // load 4 bytes from arg0+4*arg1+auxint+aux. arg2=mem
+ // TODO: sign-extending indexed loads
+ {name: "MOVBstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVB", aux: "SymOff", symEffect: "Write"}, // store byte in arg2 to arg0+arg1+auxint+aux. arg3=mem
+ {name: "MOVWstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVW", aux: "SymOff", symEffect: "Write"}, // store 2 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
+ {name: "MOVWstoreidx2", argLength: 4, reg: gpstoreidx, asm: "MOVW", aux: "SymOff", symEffect: "Write"}, // store 2 bytes in arg2 to arg0+2*arg1+auxint+aux. arg3=mem
+ {name: "MOVLstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVL", aux: "SymOff", symEffect: "Write"}, // store 4 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
+ {name: "MOVLstoreidx4", argLength: 4, reg: gpstoreidx, asm: "MOVL", aux: "SymOff", symEffect: "Write"}, // store 4 bytes in arg2 to arg0+4*arg1+auxint+aux. arg3=mem
+ // TODO: add size-mismatched indexed loads, like MOVBstoreidx4.
+
+ // For storeconst ops, the AuxInt field encodes both
+ // the value to store and an address offset of the store.
+ // Cast AuxInt to a ValAndOff to extract Val and Off fields.
+ {name: "MOVBstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVB", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low byte of ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux. arg1=mem
+ {name: "MOVWstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVW", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low 2 bytes of ...
+ {name: "MOVLstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVL", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low 4 bytes of ...
+
+ {name: "MOVBstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVB", aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low byte of ValAndOff(AuxInt).Val() to arg0+1*arg1+ValAndOff(AuxInt).Off()+aux. arg2=mem
+ {name: "MOVWstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVW", aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low 2 bytes of ... arg1 ...
+ {name: "MOVWstoreconstidx2", argLength: 3, reg: gpstoreconstidx, asm: "MOVW", aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low 2 bytes of ... 2*arg1 ...
+ {name: "MOVLstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low 4 bytes of ... arg1 ...
+ {name: "MOVLstoreconstidx4", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low 4 bytes of ... 4*arg1 ...
+
+ // arg0 = pointer to start of memory to zero
+ // arg1 = value to store (will always be zero)
+ // arg2 = mem
+ // auxint = offset into duffzero code to start executing
+ // returns mem
+ {
+ name: "DUFFZERO",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("DI"), buildReg("AX")},
+ clobbers: buildReg("DI CX"),
+ // Note: CX is only clobbered when dynamic linking.
+ },
+ faultOnNilArg0: true,
+ },
+
+ // arg0 = address of memory to zero
+ // arg1 = # of 4-byte words to zero
+ // arg2 = value to store (will always be zero)
+ // arg3 = mem
+ // returns mem
+ {
+ name: "REPSTOSL",
+ argLength: 4,
+ reg: regInfo{
+ inputs: []regMask{buildReg("DI"), buildReg("CX"), buildReg("AX")},
+ clobbers: buildReg("DI CX"),
+ },
+ faultOnNilArg0: true,
+ },
+
+ {name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "CALLtail", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gpsp, buildReg("DX"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+ {name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+
+ // arg0 = destination pointer
+ // arg1 = source pointer
+ // arg2 = mem
+ // auxint = offset from duffcopy symbol to call
+ // returns memory
+ {
+ name: "DUFFCOPY",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("DI"), buildReg("SI")},
+ clobbers: buildReg("DI SI CX"), // uses CX as a temporary
+ },
+ clobberFlags: true,
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // arg0 = destination pointer
+ // arg1 = source pointer
+ // arg2 = # of 4-byte words to copy
+ // arg3 = mem
+ // returns memory
+ {
+ name: "REPMOVSL",
+ argLength: 4,
+ reg: regInfo{
+ inputs: []regMask{buildReg("DI"), buildReg("SI"), buildReg("CX")},
+ clobbers: buildReg("DI SI CX"),
+ },
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // (InvertFlags (CMPL a b)) == (CMPL b a)
+ // So if we want (SETL (CMPL a b)) but we can't do that because a is a constant,
+ // then we do (SETL (InvertFlags (CMPL b a))) instead.
+ // Rewrites will convert this to (SETG (CMPL b a)).
+ // InvertFlags is a pseudo-op which can't appear in assembly output.
+ {name: "InvertFlags", argLength: 1}, // reverse direction of arg0
+
+ // Pseudo-ops
+ {name: "LoweredGetG", argLength: 1, reg: gp01}, // arg0=mem
+ // Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
+ // and sorts it to the very beginning of the block to prevent other
+ // use of DX (the closure pointer)
+ {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("DX")}}, zeroWidth: true},
+ // LoweredGetCallerPC evaluates to the PC to which its "caller" will return.
+ // I.e., if f calls g and g "calls" getcallerpc,
+ // the result should be the PC within f that g will return to.
+ // See runtime/stubs.go for a more detailed discussion.
+ {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true},
+ // LoweredGetCallerSP returns the SP of the caller of the current function.
+ {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true},
+ // arg0=ptr, arg1=mem, returns void. Faults if ptr is nil.
+ {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpsp}}, clobberFlags: true, nilCheck: true, faultOnNilArg0: true},
+
+ // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+ // It saves all GP registers if necessary, but may clobber others.
+ {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("DI"), ax}, clobbers: callerSave &^ gp}, clobberFlags: true, aux: "Sym", symEffect: "None"},
+
+ // There are three of these functions so that they can have three different register inputs.
+ // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
+ // default registers to match so we don't need to copy registers around unnecessarily.
+ {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{dx, bx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{cx, dx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{ax, cx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ // Extend ops are the same as Bounds ops except the indexes are 64-bit.
+ {name: "LoweredPanicExtendA", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{si, dx, bx}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
+ {name: "LoweredPanicExtendB", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{si, cx, dx}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
+ {name: "LoweredPanicExtendC", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{si, ax, cx}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
+
+ // Constant flag values. For any comparison, there are 5 possible
+ // outcomes: the three from the signed total order (<,==,>) and the
+ // three from the unsigned total order. The == cases overlap.
+ // Note: there's a sixth "unordered" outcome for floating-point
+ // comparisons, but we don't use such a beast yet.
+ // These ops are for temporary use by rewrite rules. They
+ // cannot appear in the generated assembly.
+ {name: "FlagEQ"}, // equal
+ {name: "FlagLT_ULT"}, // signed < and unsigned <
+ {name: "FlagLT_UGT"}, // signed < and unsigned >
+ {name: "FlagGT_UGT"}, // signed > and unsigned >
+ {name: "FlagGT_ULT"}, // signed > and unsigned <
+
+ // Special ops for PIC floating-point constants.
+ // MOVS(S|D)const1 loads the address of the constant-pool entry into a register.
+ // MOVS(S|D)const2 loads the constant from that address.
+ // MOVS(S|D)const1 returns a pointer, but we type it as uint32 because it can never point to the Go heap.
+ {name: "MOVSSconst1", reg: gp01, typ: "UInt32", aux: "Float32"},
+ {name: "MOVSDconst1", reg: gp01, typ: "UInt32", aux: "Float64"},
+ {name: "MOVSSconst2", argLength: 1, reg: gpfp, asm: "MOVSS"},
+ {name: "MOVSDconst2", argLength: 1, reg: gpfp, asm: "MOVSD"},
+ }
+
+ var _386blocks = []blockData{
+ {name: "EQ", controls: 1},
+ {name: "NE", controls: 1},
+ {name: "LT", controls: 1},
+ {name: "LE", controls: 1},
+ {name: "GT", controls: 1},
+ {name: "GE", controls: 1},
+ {name: "OS", controls: 1},
+ {name: "OC", controls: 1},
+ {name: "ULT", controls: 1},
+ {name: "ULE", controls: 1},
+ {name: "UGT", controls: 1},
+ {name: "UGE", controls: 1},
+ {name: "EQF", controls: 1},
+ {name: "NEF", controls: 1},
+ {name: "ORD", controls: 1}, // FP, ordered comparison (parity zero)
+ {name: "NAN", controls: 1}, // FP, unordered comparison (parity one)
+ }
+
+ archs = append(archs, arch{
+ name: "386",
+ pkg: "cmd/internal/obj/x86",
+ genfile: "../../x86/ssa.go",
+ ops: _386ops,
+ blocks: _386blocks,
+ regnames: regNames386,
+ gpregmask: gp,
+ fpregmask: fp,
+ framepointerreg: int8(num["BP"]),
+ linkreg: -1, // not used
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/_gen/386splitload.rules b/src/cmd/compile/internal/ssa/_gen/386splitload.rules
new file mode 100644
index 0000000..29d4f8c
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/386splitload.rules
@@ -0,0 +1,11 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// See the top of AMD64splitload.rules for discussion of these rules.
+
+(CMP(L|W|B)load {sym} [off] ptr x mem) => (CMP(L|W|B) (MOV(L|W|B)load {sym} [off] ptr mem) x)
+
+(CMPLconstload {sym} [vo] ptr mem) => (CMPLconst (MOVLload {sym} [vo.Off()] ptr mem) [vo.Val()])
+(CMPWconstload {sym} [vo] ptr mem) => (CMPWconst (MOVWload {sym} [vo.Off()] ptr mem) [vo.Val16()])
+(CMPBconstload {sym} [vo] ptr mem) => (CMPBconst (MOVBload {sym} [vo.Off()] ptr mem) [vo.Val8()])
diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
new file mode 100644
index 0000000..da5ef7e
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
@@ -0,0 +1,2216 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Lowering arithmetic
+(Add(64|32|16|8) ...) => (ADD(Q|L|L|L) ...)
+(AddPtr ...) => (ADDQ ...)
+(Add(32|64)F ...) => (ADDS(S|D) ...)
+
+(Sub(64|32|16|8) ...) => (SUB(Q|L|L|L) ...)
+(SubPtr ...) => (SUBQ ...)
+(Sub(32|64)F ...) => (SUBS(S|D) ...)
+
+(Mul(64|32|16|8) ...) => (MUL(Q|L|L|L) ...)
+(Mul(32|64)F ...) => (MULS(S|D) ...)
+
+(Select0 (Mul64uover x y)) => (Select0 <typ.UInt64> (MULQU x y))
+(Select0 (Mul32uover x y)) => (Select0 <typ.UInt32> (MULLU x y))
+(Select1 (Mul(64|32)uover x y)) => (SETO (Select1 <types.TypeFlags> (MUL(Q|L)U x y)))
+
+(Hmul(64|32) ...) => (HMUL(Q|L) ...)
+(Hmul(64|32)u ...) => (HMUL(Q|L)U ...)
+
+(Div(64|32|16) [a] x y) => (Select0 (DIV(Q|L|W) [a] x y))
+(Div8 x y) => (Select0 (DIVW (SignExt8to16 x) (SignExt8to16 y)))
+(Div(64|32|16)u x y) => (Select0 (DIV(Q|L|W)U x y))
+(Div8u x y) => (Select0 (DIVWU (ZeroExt8to16 x) (ZeroExt8to16 y)))
+(Div(32|64)F ...) => (DIVS(S|D) ...)
+
+(Select0 (Add64carry x y c)) =>
+ (Select0 <typ.UInt64> (ADCQ x y (Select1 <types.TypeFlags> (NEGLflags c))))
+(Select1 (Add64carry x y c)) =>
+ (NEGQ <typ.UInt64> (SBBQcarrymask <typ.UInt64> (Select1 <types.TypeFlags> (ADCQ x y (Select1 <types.TypeFlags> (NEGLflags c))))))
+(Select0 (Sub64borrow x y c)) =>
+ (Select0 <typ.UInt64> (SBBQ x y (Select1 <types.TypeFlags> (NEGLflags c))))
+(Select1 (Sub64borrow x y c)) =>
+ (NEGQ <typ.UInt64> (SBBQcarrymask <typ.UInt64> (Select1 <types.TypeFlags> (SBBQ x y (Select1 <types.TypeFlags> (NEGLflags c))))))
+
+// Optimize ADCQ and friends
+(ADCQ x (MOVQconst [c]) carry) && is32Bit(c) => (ADCQconst x [int32(c)] carry)
+(ADCQ x y (FlagEQ)) => (ADDQcarry x y)
+(ADCQconst x [c] (FlagEQ)) => (ADDQconstcarry x [c])
+(ADDQcarry x (MOVQconst [c])) && is32Bit(c) => (ADDQconstcarry x [int32(c)])
+(SBBQ x (MOVQconst [c]) borrow) && is32Bit(c) => (SBBQconst x [int32(c)] borrow)
+(SBBQ x y (FlagEQ)) => (SUBQborrow x y)
+(SBBQconst x [c] (FlagEQ)) => (SUBQconstborrow x [c])
+(SUBQborrow x (MOVQconst [c])) && is32Bit(c) => (SUBQconstborrow x [int32(c)])
+(Select1 (NEGLflags (MOVQconst [0]))) => (FlagEQ)
+(Select1 (NEGLflags (NEGQ (SBBQcarrymask x)))) => x
+
+
+(Mul64uhilo ...) => (MULQU2 ...)
+(Div128u ...) => (DIVQU2 ...)
+
+(Avg64u ...) => (AVGQU ...)
+
+(Mod(64|32|16) [a] x y) => (Select1 (DIV(Q|L|W) [a] x y))
+(Mod8 x y) => (Select1 (DIVW (SignExt8to16 x) (SignExt8to16 y)))
+(Mod(64|32|16)u x y) => (Select1 (DIV(Q|L|W)U x y))
+(Mod8u x y) => (Select1 (DIVWU (ZeroExt8to16 x) (ZeroExt8to16 y)))
+
+(And(64|32|16|8) ...) => (AND(Q|L|L|L) ...)
+(Or(64|32|16|8) ...) => (OR(Q|L|L|L) ...)
+(Xor(64|32|16|8) ...) => (XOR(Q|L|L|L) ...)
+(Com(64|32|16|8) ...) => (NOT(Q|L|L|L) ...)
+
+(Neg(64|32|16|8) ...) => (NEG(Q|L|L|L) ...)
+(Neg32F x) => (PXOR x (MOVSSconst <typ.Float32> [float32(math.Copysign(0, -1))]))
+(Neg64F x) => (PXOR x (MOVSDconst <typ.Float64> [math.Copysign(0, -1)]))
+
+// Lowering boolean ops
+(AndB ...) => (ANDL ...)
+(OrB ...) => (ORL ...)
+(Not x) => (XORLconst [1] x)
+
+// Lowering pointer arithmetic
+(OffPtr [off] ptr) && is32Bit(off) => (ADDQconst [int32(off)] ptr)
+(OffPtr [off] ptr) => (ADDQ (MOVQconst [off]) ptr)
+
+// Lowering other arithmetic
+(Ctz64 x) && buildcfg.GOAMD64 >= 3 => (TZCNTQ x)
+(Ctz32 x) && buildcfg.GOAMD64 >= 3 => (TZCNTL x)
+(Ctz64 <t> x) && buildcfg.GOAMD64 < 3 => (CMOVQEQ (Select0 <t> (BSFQ x)) (MOVQconst <t> [64]) (Select1 <types.TypeFlags> (BSFQ x)))
+(Ctz32 x) && buildcfg.GOAMD64 < 3 => (Select0 (BSFQ (BTSQconst <typ.UInt64> [32] x)))
+(Ctz16 x) => (BSFL (BTSLconst <typ.UInt32> [16] x))
+(Ctz8 x) => (BSFL (BTSLconst <typ.UInt32> [ 8] x))
+
+(Ctz64NonZero x) && buildcfg.GOAMD64 >= 3 => (TZCNTQ x)
+(Ctz32NonZero x) && buildcfg.GOAMD64 >= 3 => (TZCNTL x)
+(Ctz16NonZero x) && buildcfg.GOAMD64 >= 3 => (TZCNTL x)
+(Ctz8NonZero x) && buildcfg.GOAMD64 >= 3 => (TZCNTL x)
+(Ctz64NonZero x) && buildcfg.GOAMD64 < 3 => (Select0 (BSFQ x))
+(Ctz32NonZero x) && buildcfg.GOAMD64 < 3 => (BSFL x)
+(Ctz16NonZero x) && buildcfg.GOAMD64 < 3 => (BSFL x)
+(Ctz8NonZero x) && buildcfg.GOAMD64 < 3 => (BSFL x)
+
+// BitLen64 of a 64 bit value x requires checking whether x == 0, since BSRQ is undefined when x == 0.
+// However, for zero-extended values, we can cheat a bit, and calculate
+// BSR(x<<1 + 1), which is guaranteed to be non-zero, and which conveniently
+// places the index of the highest set bit where we want it.
+// For GOAMD64>=3, BitLen can be calculated by OperandSize - LZCNT(x).
+(BitLen64 <t> x) && buildcfg.GOAMD64 < 3 => (ADDQconst [1] (CMOVQEQ <t> (Select0 <t> (BSRQ x)) (MOVQconst <t> [-1]) (Select1 <types.TypeFlags> (BSRQ x))))
+(BitLen32 x) && buildcfg.GOAMD64 < 3 => (Select0 (BSRQ (LEAQ1 <typ.UInt64> [1] (MOVLQZX <typ.UInt64> x) (MOVLQZX <typ.UInt64> x))))
+(BitLen16 x) && buildcfg.GOAMD64 < 3 => (BSRL (LEAL1 <typ.UInt32> [1] (MOVWQZX <typ.UInt32> x) (MOVWQZX <typ.UInt32> x)))
+(BitLen8 x) && buildcfg.GOAMD64 < 3 => (BSRL (LEAL1 <typ.UInt32> [1] (MOVBQZX <typ.UInt32> x) (MOVBQZX <typ.UInt32> x)))
+(BitLen64 <t> x) && buildcfg.GOAMD64 >= 3 => (NEGQ (ADDQconst <t> [-64] (LZCNTQ x)))
+// Use the 64-bit version to allow constant folding to remove unnecessary arithmetic.
+(BitLen32 <t> x) && buildcfg.GOAMD64 >= 3 => (NEGQ (ADDQconst <t> [-32] (LZCNTL x)))
+(BitLen16 <t> x) && buildcfg.GOAMD64 >= 3 => (NEGQ (ADDQconst <t> [-32] (LZCNTL (MOVWQZX <x.Type> x))))
+(BitLen8 <t> x) && buildcfg.GOAMD64 >= 3 => (NEGQ (ADDQconst <t> [-32] (LZCNTL (MOVBQZX <x.Type> x))))
+
+(Bswap(64|32) ...) => (BSWAP(Q|L) ...)
+
+(PopCount(64|32) ...) => (POPCNT(Q|L) ...)
+(PopCount16 x) => (POPCNTL (MOVWQZX <typ.UInt32> x))
+(PopCount8 x) => (POPCNTL (MOVBQZX <typ.UInt32> x))
+
+(Sqrt ...) => (SQRTSD ...)
+(Sqrt32 ...) => (SQRTSS ...)
+
+(RoundToEven x) => (ROUNDSD [0] x)
+(Floor x) => (ROUNDSD [1] x)
+(Ceil x) => (ROUNDSD [2] x)
+(Trunc x) => (ROUNDSD [3] x)
+
+(FMA x y z) => (VFMADD231SD z x y)
+
+// Lowering extension
+// Note: we always extend to 64 bits even though some ops don't need that many result bits.
+(SignExt8to16 ...) => (MOVBQSX ...)
+(SignExt8to32 ...) => (MOVBQSX ...)
+(SignExt8to64 ...) => (MOVBQSX ...)
+(SignExt16to32 ...) => (MOVWQSX ...)
+(SignExt16to64 ...) => (MOVWQSX ...)
+(SignExt32to64 ...) => (MOVLQSX ...)
+
+(ZeroExt8to16 ...) => (MOVBQZX ...)
+(ZeroExt8to32 ...) => (MOVBQZX ...)
+(ZeroExt8to64 ...) => (MOVBQZX ...)
+(ZeroExt16to32 ...) => (MOVWQZX ...)
+(ZeroExt16to64 ...) => (MOVWQZX ...)
+(ZeroExt32to64 ...) => (MOVLQZX ...)
+
+(Slicemask <t> x) => (SARQconst (NEGQ <t> x) [63])
+
+(SpectreIndex <t> x y) => (CMOVQCC x (MOVQconst [0]) (CMPQ x y))
+(SpectreSliceIndex <t> x y) => (CMOVQHI x (MOVQconst [0]) (CMPQ x y))
+
+// Lowering truncation
+// Because we ignore high parts of registers, truncates are just copies.
+(Trunc16to8 ...) => (Copy ...)
+(Trunc32to8 ...) => (Copy ...)
+(Trunc32to16 ...) => (Copy ...)
+(Trunc64to8 ...) => (Copy ...)
+(Trunc64to16 ...) => (Copy ...)
+(Trunc64to32 ...) => (Copy ...)
+
+// Lowering float <-> int
+(Cvt32to32F ...) => (CVTSL2SS ...)
+(Cvt32to64F ...) => (CVTSL2SD ...)
+(Cvt64to32F ...) => (CVTSQ2SS ...)
+(Cvt64to64F ...) => (CVTSQ2SD ...)
+
+(Cvt32Fto32 ...) => (CVTTSS2SL ...)
+(Cvt32Fto64 ...) => (CVTTSS2SQ ...)
+(Cvt64Fto32 ...) => (CVTTSD2SL ...)
+(Cvt64Fto64 ...) => (CVTTSD2SQ ...)
+
+(Cvt32Fto64F ...) => (CVTSS2SD ...)
+(Cvt64Fto32F ...) => (CVTSD2SS ...)
+
+(Round(32|64)F ...) => (Copy ...)
+
+(CvtBoolToUint8 ...) => (Copy ...)
+
+// Lowering shifts
+// Unsigned shifts need to return 0 if shift amount is >= width of shifted value.
+// result = (arg << shift) & (shift >= argbits ? 0 : 0xffffffffffffffff)
+(Lsh64x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDQ (SHLQ <t> x y) (SBBQcarrymask <t> (CMP(Q|L|W|B)const y [64])))
+(Lsh32x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
+(Lsh16x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
+(Lsh8x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
+
+(Lsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SHLQ x y)
+(Lsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SHLL x y)
+(Lsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SHLL x y)
+(Lsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SHLL x y)
+
+(Rsh64Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDQ (SHRQ <t> x y) (SBBQcarrymask <t> (CMP(Q|L|W|B)const y [64])))
+(Rsh32Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
+(Rsh16Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRW <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [16])))
+(Rsh8Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRB <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [8])))
+
+(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SHRQ x y)
+(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SHRL x y)
+(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SHRW x y)
+(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SHRB x y)
+
+// Signed right shift needs to return 0/-1 if shift amount is >= width of shifted value.
+// We implement this by setting the shift value to -1 (all ones) if the shift value is >= width.
+(Rsh64x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (SARQ <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [64])))))
+(Rsh32x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (SARL <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [32])))))
+(Rsh16x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (SARW <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [16])))))
+(Rsh8x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (SARB <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [8])))))
+
+(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SARQ x y)
+(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SARL x y)
+(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SARW x y)
+(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SARB x y)
+
+// Lowering integer comparisons
+(Less(64|32|16|8) x y) => (SETL (CMP(Q|L|W|B) x y))
+(Less(64|32|16|8)U x y) => (SETB (CMP(Q|L|W|B) x y))
+(Leq(64|32|16|8) x y) => (SETLE (CMP(Q|L|W|B) x y))
+(Leq(64|32|16|8)U x y) => (SETBE (CMP(Q|L|W|B) x y))
+(Eq(Ptr|64|32|16|8|B) x y) => (SETEQ (CMP(Q|Q|L|W|B|B) x y))
+(Neq(Ptr|64|32|16|8|B) x y) => (SETNE (CMP(Q|Q|L|W|B|B) x y))
+
+// Lowering floating point comparisons
+// Note Go assembler gets UCOMISx operand order wrong, but it is right here
+// and the operands are reversed when generating assembly language.
+(Eq(32|64)F x y) => (SETEQF (UCOMIS(S|D) x y))
+(Neq(32|64)F x y) => (SETNEF (UCOMIS(S|D) x y))
+// Use SETGF/SETGEF with reversed operands to dodge NaN case.
+(Less(32|64)F x y) => (SETGF (UCOMIS(S|D) y x))
+(Leq(32|64)F x y) => (SETGEF (UCOMIS(S|D) y x))
+
+// Lowering loads
+(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVQload ptr mem)
+(Load <t> ptr mem) && is32BitInt(t) => (MOVLload ptr mem)
+(Load <t> ptr mem) && is16BitInt(t) => (MOVWload ptr mem)
+(Load <t> ptr mem) && (t.IsBoolean() || is8BitInt(t)) => (MOVBload ptr mem)
+(Load <t> ptr mem) && is32BitFloat(t) => (MOVSSload ptr mem)
+(Load <t> ptr mem) && is64BitFloat(t) => (MOVSDload ptr mem)
+
+// Lowering stores
+// These more-specific FP versions of Store pattern should come first.
+(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (MOVSDstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (MOVSSstore ptr val mem)
+
+(Store {t} ptr val mem) && t.Size() == 8 => (MOVQstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 => (MOVLstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 2 => (MOVWstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem)
+
+// Lowering moves
+(Move [0] _ _ mem) => mem
+(Move [1] dst src mem) => (MOVBstore dst (MOVBload src mem) mem)
+(Move [2] dst src mem) => (MOVWstore dst (MOVWload src mem) mem)
+(Move [4] dst src mem) => (MOVLstore dst (MOVLload src mem) mem)
+(Move [8] dst src mem) => (MOVQstore dst (MOVQload src mem) mem)
+(Move [16] dst src mem) && config.useSSE => (MOVOstore dst (MOVOload src mem) mem)
+(Move [16] dst src mem) && !config.useSSE =>
+ (MOVQstore [8] dst (MOVQload [8] src mem)
+ (MOVQstore dst (MOVQload src mem) mem))
+
+(Move [32] dst src mem) =>
+ (Move [16]
+ (OffPtr <dst.Type> dst [16])
+ (OffPtr <src.Type> src [16])
+ (Move [16] dst src mem))
+
+(Move [48] dst src mem) && config.useSSE =>
+ (Move [32]
+ (OffPtr <dst.Type> dst [16])
+ (OffPtr <src.Type> src [16])
+ (Move [16] dst src mem))
+
+(Move [64] dst src mem) && config.useSSE =>
+ (Move [32]
+ (OffPtr <dst.Type> dst [32])
+ (OffPtr <src.Type> src [32])
+ (Move [32] dst src mem))
+
+(Move [3] dst src mem) =>
+ (MOVBstore [2] dst (MOVBload [2] src mem)
+ (MOVWstore dst (MOVWload src mem) mem))
+(Move [5] dst src mem) =>
+ (MOVBstore [4] dst (MOVBload [4] src mem)
+ (MOVLstore dst (MOVLload src mem) mem))
+(Move [6] dst src mem) =>
+ (MOVWstore [4] dst (MOVWload [4] src mem)
+ (MOVLstore dst (MOVLload src mem) mem))
+(Move [7] dst src mem) =>
+ (MOVLstore [3] dst (MOVLload [3] src mem)
+ (MOVLstore dst (MOVLload src mem) mem))
+(Move [9] dst src mem) =>
+ (MOVBstore [8] dst (MOVBload [8] src mem)
+ (MOVQstore dst (MOVQload src mem) mem))
+(Move [10] dst src mem) =>
+ (MOVWstore [8] dst (MOVWload [8] src mem)
+ (MOVQstore dst (MOVQload src mem) mem))
+(Move [12] dst src mem) =>
+ (MOVLstore [8] dst (MOVLload [8] src mem)
+ (MOVQstore dst (MOVQload src mem) mem))
+(Move [s] dst src mem) && s == 11 || s >= 13 && s <= 15 =>
+ (MOVQstore [int32(s-8)] dst (MOVQload [int32(s-8)] src mem)
+ (MOVQstore dst (MOVQload src mem) mem))
+
+// Adjust moves to be a multiple of 16 bytes.
+(Move [s] dst src mem)
+ && s > 16 && s%16 != 0 && s%16 <= 8 =>
+ (Move [s-s%16]
+ (OffPtr <dst.Type> dst [s%16])
+ (OffPtr <src.Type> src [s%16])
+ (MOVQstore dst (MOVQload src mem) mem))
+(Move [s] dst src mem)
+ && s > 16 && s%16 != 0 && s%16 > 8 && config.useSSE =>
+ (Move [s-s%16]
+ (OffPtr <dst.Type> dst [s%16])
+ (OffPtr <src.Type> src [s%16])
+ (MOVOstore dst (MOVOload src mem) mem))
+(Move [s] dst src mem)
+ && s > 16 && s%16 != 0 && s%16 > 8 && !config.useSSE =>
+ (Move [s-s%16]
+ (OffPtr <dst.Type> dst [s%16])
+ (OffPtr <src.Type> src [s%16])
+ (MOVQstore [8] dst (MOVQload [8] src mem)
+ (MOVQstore dst (MOVQload src mem) mem)))
+
+// Medium copying uses a duff device.
+(Move [s] dst src mem)
+ && s > 64 && s <= 16*64 && s%16 == 0
+ && !config.noDuffDevice && logLargeCopy(v, s) =>
+ (DUFFCOPY [s] dst src mem)
+
+// Large copying uses REP MOVSQ.
+(Move [s] dst src mem) && (s > 16*64 || config.noDuffDevice) && s%8 == 0 && logLargeCopy(v, s) =>
+ (REPMOVSQ dst src (MOVQconst [s/8]) mem)
+
+// Lowering Zero instructions
+(Zero [0] _ mem) => mem
+(Zero [1] destptr mem) => (MOVBstoreconst [makeValAndOff(0,0)] destptr mem)
+(Zero [2] destptr mem) => (MOVWstoreconst [makeValAndOff(0,0)] destptr mem)
+(Zero [4] destptr mem) => (MOVLstoreconst [makeValAndOff(0,0)] destptr mem)
+(Zero [8] destptr mem) => (MOVQstoreconst [makeValAndOff(0,0)] destptr mem)
+
+(Zero [3] destptr mem) =>
+ (MOVBstoreconst [makeValAndOff(0,2)] destptr
+ (MOVWstoreconst [makeValAndOff(0,0)] destptr mem))
+(Zero [5] destptr mem) =>
+ (MOVBstoreconst [makeValAndOff(0,4)] destptr
+ (MOVLstoreconst [makeValAndOff(0,0)] destptr mem))
+(Zero [6] destptr mem) =>
+ (MOVWstoreconst [makeValAndOff(0,4)] destptr
+ (MOVLstoreconst [makeValAndOff(0,0)] destptr mem))
+(Zero [7] destptr mem) =>
+ (MOVLstoreconst [makeValAndOff(0,3)] destptr
+ (MOVLstoreconst [makeValAndOff(0,0)] destptr mem))
+
+// Strip off any fractional word zeroing.
+(Zero [s] destptr mem) && s%8 != 0 && s > 8 && !config.useSSE =>
+ (Zero [s-s%8] (OffPtr <destptr.Type> destptr [s%8])
+ (MOVQstoreconst [makeValAndOff(0,0)] destptr mem))
+
+// Zero small numbers of words directly.
+(Zero [16] destptr mem) && !config.useSSE =>
+ (MOVQstoreconst [makeValAndOff(0,8)] destptr
+ (MOVQstoreconst [makeValAndOff(0,0)] destptr mem))
+(Zero [24] destptr mem) && !config.useSSE =>
+ (MOVQstoreconst [makeValAndOff(0,16)] destptr
+ (MOVQstoreconst [makeValAndOff(0,8)] destptr
+ (MOVQstoreconst [makeValAndOff(0,0)] destptr mem)))
+(Zero [32] destptr mem) && !config.useSSE =>
+ (MOVQstoreconst [makeValAndOff(0,24)] destptr
+ (MOVQstoreconst [makeValAndOff(0,16)] destptr
+ (MOVQstoreconst [makeValAndOff(0,8)] destptr
+ (MOVQstoreconst [makeValAndOff(0,0)] destptr mem))))
+
+(Zero [s] destptr mem) && s > 8 && s < 16 && config.useSSE =>
+ (MOVQstoreconst [makeValAndOff(0,int32(s-8))] destptr
+ (MOVQstoreconst [makeValAndOff(0,0)] destptr mem))
+
+// Adjust zeros to be a multiple of 16 bytes.
+(Zero [s] destptr mem) && s%16 != 0 && s > 16 && s%16 > 8 && config.useSSE =>
+ (Zero [s-s%16] (OffPtr <destptr.Type> destptr [s%16])
+ (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
+
+(Zero [s] destptr mem) && s%16 != 0 && s > 16 && s%16 <= 8 && config.useSSE =>
+ (Zero [s-s%16] (OffPtr <destptr.Type> destptr [s%16])
+ (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
+
+(Zero [16] destptr mem) && config.useSSE =>
+ (MOVOstoreconst [makeValAndOff(0,0)] destptr mem)
+(Zero [32] destptr mem) && config.useSSE =>
+ (MOVOstoreconst [makeValAndOff(0,16)] destptr
+ (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
+(Zero [48] destptr mem) && config.useSSE =>
+ (MOVOstoreconst [makeValAndOff(0,32)] destptr
+ (MOVOstoreconst [makeValAndOff(0,16)] destptr
+ (MOVOstoreconst [makeValAndOff(0,0)] destptr mem)))
+(Zero [64] destptr mem) && config.useSSE =>
+ (MOVOstoreconst [makeValAndOff(0,48)] destptr
+ (MOVOstoreconst [makeValAndOff(0,32)] destptr
+ (MOVOstoreconst [makeValAndOff(0,16)] destptr
+ (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))))
+
+// Medium zeroing uses a duff device.
+(Zero [s] destptr mem)
+ && s > 64 && s <= 1024 && s%16 == 0 && !config.noDuffDevice =>
+ (DUFFZERO [s] destptr mem)
+
+// Large zeroing uses REP STOSQ.
+(Zero [s] destptr mem)
+ && (s > 1024 || (config.noDuffDevice && s > 64 || !config.useSSE && s > 32))
+ && s%8 == 0 =>
+ (REPSTOSQ destptr (MOVQconst [s/8]) (MOVQconst [0]) mem)
+
+// Lowering constants
+(Const8 [c]) => (MOVLconst [int32(c)])
+(Const16 [c]) => (MOVLconst [int32(c)])
+(Const32 ...) => (MOVLconst ...)
+(Const64 ...) => (MOVQconst ...)
+(Const32F ...) => (MOVSSconst ...)
+(Const64F ...) => (MOVSDconst ...)
+(ConstNil ) => (MOVQconst [0])
+(ConstBool [c]) => (MOVLconst [b2i32(c)])
+
+// Lowering calls
+(StaticCall ...) => (CALLstatic ...)
+(ClosureCall ...) => (CALLclosure ...)
+(InterCall ...) => (CALLinter ...)
+(TailCall ...) => (CALLtail ...)
+
+// Lowering conditional moves
+// If the condition is a SETxx, we can just run a CMOV from the comparison that was
+// setting the flags.
+// Legend: HI=unsigned ABOVE, CS=unsigned BELOW, CC=unsigned ABOVE EQUAL, LS=unsigned BELOW EQUAL
+(CondSelect <t> x y (SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) cond)) && (is64BitInt(t) || isPtr(t))
+ => (CMOVQ(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) y x cond)
+(CondSelect <t> x y (SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) cond)) && is32BitInt(t)
+ => (CMOVL(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) y x cond)
+(CondSelect <t> x y (SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) cond)) && is16BitInt(t)
+ => (CMOVW(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) y x cond)
+
+// If the condition does not set the flags, we need to generate a comparison.
+(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 1
+ => (CondSelect <t> x y (MOVBQZX <typ.UInt64> check))
+(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 2
+ => (CondSelect <t> x y (MOVWQZX <typ.UInt64> check))
+(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 4
+ => (CondSelect <t> x y (MOVLQZX <typ.UInt64> check))
+
+(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 8 && (is64BitInt(t) || isPtr(t))
+ => (CMOVQNE y x (CMPQconst [0] check))
+(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 8 && is32BitInt(t)
+ => (CMOVLNE y x (CMPQconst [0] check))
+(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 8 && is16BitInt(t)
+ => (CMOVWNE y x (CMPQconst [0] check))
+
+// Absorb InvertFlags
+(CMOVQ(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS) x y (InvertFlags cond))
+ => (CMOVQ(EQ|NE|GT|LT|GE|LE|CS|HI|LS|CC) x y cond)
+(CMOVL(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS) x y (InvertFlags cond))
+ => (CMOVL(EQ|NE|GT|LT|GE|LE|CS|HI|LS|CC) x y cond)
+(CMOVW(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS) x y (InvertFlags cond))
+ => (CMOVW(EQ|NE|GT|LT|GE|LE|CS|HI|LS|CC) x y cond)
+
+// Absorb constants generated during lower
+(CMOV(QEQ|QLE|QGE|QCC|QLS|LEQ|LLE|LGE|LCC|LLS|WEQ|WLE|WGE|WCC|WLS) _ x (FlagEQ)) => x
+(CMOV(QNE|QLT|QGT|QCS|QHI|LNE|LLT|LGT|LCS|LHI|WNE|WLT|WGT|WCS|WHI) y _ (FlagEQ)) => y
+(CMOV(QNE|QGT|QGE|QHI|QCC|LNE|LGT|LGE|LHI|LCC|WNE|WGT|WGE|WHI|WCC) _ x (FlagGT_UGT)) => x
+(CMOV(QEQ|QLE|QLT|QLS|QCS|LEQ|LLE|LLT|LLS|LCS|WEQ|WLE|WLT|WLS|WCS) y _ (FlagGT_UGT)) => y
+(CMOV(QNE|QGT|QGE|QLS|QCS|LNE|LGT|LGE|LLS|LCS|WNE|WGT|WGE|WLS|WCS) _ x (FlagGT_ULT)) => x
+(CMOV(QEQ|QLE|QLT|QHI|QCC|LEQ|LLE|LLT|LHI|LCC|WEQ|WLE|WLT|WHI|WCC) y _ (FlagGT_ULT)) => y
+(CMOV(QNE|QLT|QLE|QCS|QLS|LNE|LLT|LLE|LCS|LLS|WNE|WLT|WLE|WCS|WLS) _ x (FlagLT_ULT)) => x
+(CMOV(QEQ|QGT|QGE|QHI|QCC|LEQ|LGT|LGE|LHI|LCC|WEQ|WGT|WGE|WHI|WCC) y _ (FlagLT_ULT)) => y
+(CMOV(QNE|QLT|QLE|QHI|QCC|LNE|LLT|LLE|LHI|LCC|WNE|WLT|WLE|WHI|WCC) _ x (FlagLT_UGT)) => x
+(CMOV(QEQ|QGT|QGE|QCS|QLS|LEQ|LGT|LGE|LCS|LLS|WEQ|WGT|WGE|WCS|WLS) y _ (FlagLT_UGT)) => y
+
+// Miscellaneous
+(IsNonNil p) => (SETNE (TESTQ p p))
+(IsInBounds idx len) => (SETB (CMPQ idx len))
+(IsSliceInBounds idx len) => (SETBE (CMPQ idx len))
+(NilCheck ...) => (LoweredNilCheck ...)
+(GetG mem) && v.Block.Func.OwnAux.Fn.ABI() != obj.ABIInternal => (LoweredGetG mem) // only lower in old ABI. in new ABI we have a G register.
+(GetClosurePtr ...) => (LoweredGetClosurePtr ...)
+(GetCallerPC ...) => (LoweredGetCallerPC ...)
+(GetCallerSP ...) => (LoweredGetCallerSP ...)
+
+(HasCPUFeature {s}) => (SETNE (CMPLconst [0] (LoweredHasCPUFeature {s})))
+(Addr {sym} base) => (LEAQ {sym} base)
+(LocalAddr {sym} base _) => (LEAQ {sym} base)
+
+(MOVBstore [off] {sym} ptr y:(SETL x) mem) && y.Uses == 1 => (SETLstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr y:(SETLE x) mem) && y.Uses == 1 => (SETLEstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr y:(SETG x) mem) && y.Uses == 1 => (SETGstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr y:(SETGE x) mem) && y.Uses == 1 => (SETGEstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr y:(SETEQ x) mem) && y.Uses == 1 => (SETEQstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr y:(SETNE x) mem) && y.Uses == 1 => (SETNEstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr y:(SETB x) mem) && y.Uses == 1 => (SETBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr y:(SETBE x) mem) && y.Uses == 1 => (SETBEstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr y:(SETA x) mem) && y.Uses == 1 => (SETAstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr y:(SETAE x) mem) && y.Uses == 1 => (SETAEstore [off] {sym} ptr x mem)
+
+// block rewrites
+(If (SETL cmp) yes no) => (LT cmp yes no)
+(If (SETLE cmp) yes no) => (LE cmp yes no)
+(If (SETG cmp) yes no) => (GT cmp yes no)
+(If (SETGE cmp) yes no) => (GE cmp yes no)
+(If (SETEQ cmp) yes no) => (EQ cmp yes no)
+(If (SETNE cmp) yes no) => (NE cmp yes no)
+(If (SETB cmp) yes no) => (ULT cmp yes no)
+(If (SETBE cmp) yes no) => (ULE cmp yes no)
+(If (SETA cmp) yes no) => (UGT cmp yes no)
+(If (SETAE cmp) yes no) => (UGE cmp yes no)
+(If (SETO cmp) yes no) => (OS cmp yes no)
+
+// Special case for floating point - LF/LEF not generated
+(If (SETGF cmp) yes no) => (UGT cmp yes no)
+(If (SETGEF cmp) yes no) => (UGE cmp yes no)
+(If (SETEQF cmp) yes no) => (EQF cmp yes no)
+(If (SETNEF cmp) yes no) => (NEF cmp yes no)
+
+(If cond yes no) => (NE (TESTB cond cond) yes no)
+
+(JumpTable idx) => (JUMPTABLE {makeJumpTableSym(b)} idx (LEAQ <typ.Uintptr> {makeJumpTableSym(b)} (SB)))
+
+// Atomic loads. Other than preserving their ordering with respect to other loads, nothing special here.
+(AtomicLoad8 ptr mem) => (MOVBatomicload ptr mem)
+(AtomicLoad32 ptr mem) => (MOVLatomicload ptr mem)
+(AtomicLoad64 ptr mem) => (MOVQatomicload ptr mem)
+(AtomicLoadPtr ptr mem) => (MOVQatomicload ptr mem)
+
+// Atomic stores. We use XCHG to prevent the hardware reordering a subsequent load.
+// TODO: most runtime uses of atomic stores don't need that property. Use normal stores for those?
+(AtomicStore8 ptr val mem) => (Select1 (XCHGB <types.NewTuple(typ.UInt8,types.TypeMem)> val ptr mem))
+(AtomicStore32 ptr val mem) => (Select1 (XCHGL <types.NewTuple(typ.UInt32,types.TypeMem)> val ptr mem))
+(AtomicStore64 ptr val mem) => (Select1 (XCHGQ <types.NewTuple(typ.UInt64,types.TypeMem)> val ptr mem))
+(AtomicStorePtrNoWB ptr val mem) => (Select1 (XCHGQ <types.NewTuple(typ.BytePtr,types.TypeMem)> val ptr mem))
+
+// Atomic exchanges.
+(AtomicExchange32 ptr val mem) => (XCHGL val ptr mem)
+(AtomicExchange64 ptr val mem) => (XCHGQ val ptr mem)
+
+// Atomic adds.
+(AtomicAdd32 ptr val mem) => (AddTupleFirst32 val (XADDLlock val ptr mem))
+(AtomicAdd64 ptr val mem) => (AddTupleFirst64 val (XADDQlock val ptr mem))
+(Select0 <t> (AddTupleFirst32 val tuple)) => (ADDL val (Select0 <t> tuple))
+(Select1 (AddTupleFirst32 _ tuple)) => (Select1 tuple)
+(Select0 <t> (AddTupleFirst64 val tuple)) => (ADDQ val (Select0 <t> tuple))
+(Select1 (AddTupleFirst64 _ tuple)) => (Select1 tuple)
+
+// Atomic compare and swap.
+(AtomicCompareAndSwap32 ptr old new_ mem) => (CMPXCHGLlock ptr old new_ mem)
+(AtomicCompareAndSwap64 ptr old new_ mem) => (CMPXCHGQlock ptr old new_ mem)
+
+// Atomic memory updates.
+(AtomicAnd8 ptr val mem) => (ANDBlock ptr val mem)
+(AtomicAnd32 ptr val mem) => (ANDLlock ptr val mem)
+(AtomicOr8 ptr val mem) => (ORBlock ptr val mem)
+(AtomicOr32 ptr val mem) => (ORLlock ptr val mem)
+
+// Write barrier.
+(WB ...) => (LoweredWB ...)
+
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem)
+
+// lowering rotates
+(RotateLeft8 ...) => (ROLB ...)
+(RotateLeft16 ...) => (ROLW ...)
+(RotateLeft32 ...) => (ROLL ...)
+(RotateLeft64 ...) => (ROLQ ...)
+
+// ***************************
+// Above: lowering rules
+// Below: optimizations
+// ***************************
+// TODO: Should the optimizations be a separate pass?
+
+// Fold boolean tests into blocks
+(NE (TESTB (SETL cmp) (SETL cmp)) yes no) => (LT cmp yes no)
+(NE (TESTB (SETLE cmp) (SETLE cmp)) yes no) => (LE cmp yes no)
+(NE (TESTB (SETG cmp) (SETG cmp)) yes no) => (GT cmp yes no)
+(NE (TESTB (SETGE cmp) (SETGE cmp)) yes no) => (GE cmp yes no)
+(NE (TESTB (SETEQ cmp) (SETEQ cmp)) yes no) => (EQ cmp yes no)
+(NE (TESTB (SETNE cmp) (SETNE cmp)) yes no) => (NE cmp yes no)
+(NE (TESTB (SETB cmp) (SETB cmp)) yes no) => (ULT cmp yes no)
+(NE (TESTB (SETBE cmp) (SETBE cmp)) yes no) => (ULE cmp yes no)
+(NE (TESTB (SETA cmp) (SETA cmp)) yes no) => (UGT cmp yes no)
+(NE (TESTB (SETAE cmp) (SETAE cmp)) yes no) => (UGE cmp yes no)
+(NE (TESTB (SETO cmp) (SETO cmp)) yes no) => (OS cmp yes no)
+
+// Unsigned comparisons to 0/1
+(ULT (TEST(Q|L|W|B) x x) yes no) => (First no yes)
+(UGE (TEST(Q|L|W|B) x x) yes no) => (First yes no)
+(SETB (TEST(Q|L|W|B) x x)) => (ConstBool [false])
+(SETAE (TEST(Q|L|W|B) x x)) => (ConstBool [true])
+
+// x & 1 != 0 -> x & 1
+(SETNE (TEST(B|W)const [1] x)) => (AND(L|L)const [1] x)
+(SETB (BT(L|Q)const [0] x)) => (AND(L|Q)const [1] x)
+
+// Recognize bit tests: a&(1<<b) != 0 for b suitably bounded
+// Note that BTx instructions use the carry bit, so we need to convert tests for zero flag
+// into tests for carry flags.
+// ULT and SETB check the carry flag; they are identical to CS and SETCS. Same, mutatis
+// mutandis, for UGE and SETAE, and CC and SETCC.
+((NE|EQ) (TESTL (SHLL (MOVLconst [1]) x) y)) => ((ULT|UGE) (BTL x y))
+((NE|EQ) (TESTQ (SHLQ (MOVQconst [1]) x) y)) => ((ULT|UGE) (BTQ x y))
+((NE|EQ) (TESTLconst [c] x)) && isUint32PowerOfTwo(int64(c))
+ => ((ULT|UGE) (BTLconst [int8(log32(c))] x))
+((NE|EQ) (TESTQconst [c] x)) && isUint64PowerOfTwo(int64(c))
+ => ((ULT|UGE) (BTQconst [int8(log32(c))] x))
+((NE|EQ) (TESTQ (MOVQconst [c]) x)) && isUint64PowerOfTwo(c)
+ => ((ULT|UGE) (BTQconst [int8(log64(c))] x))
+(SET(NE|EQ) (TESTL (SHLL (MOVLconst [1]) x) y)) => (SET(B|AE) (BTL x y))
+(SET(NE|EQ) (TESTQ (SHLQ (MOVQconst [1]) x) y)) => (SET(B|AE) (BTQ x y))
+(SET(NE|EQ) (TESTLconst [c] x)) && isUint32PowerOfTwo(int64(c))
+ => (SET(B|AE) (BTLconst [int8(log32(c))] x))
+(SET(NE|EQ) (TESTQconst [c] x)) && isUint64PowerOfTwo(int64(c))
+ => (SET(B|AE) (BTQconst [int8(log32(c))] x))
+(SET(NE|EQ) (TESTQ (MOVQconst [c]) x)) && isUint64PowerOfTwo(c)
+ => (SET(B|AE) (BTQconst [int8(log64(c))] x))
+// SET..store variant
+(SET(NE|EQ)store [off] {sym} ptr (TESTL (SHLL (MOVLconst [1]) x) y) mem)
+ => (SET(B|AE)store [off] {sym} ptr (BTL x y) mem)
+(SET(NE|EQ)store [off] {sym} ptr (TESTQ (SHLQ (MOVQconst [1]) x) y) mem)
+ => (SET(B|AE)store [off] {sym} ptr (BTQ x y) mem)
+(SET(NE|EQ)store [off] {sym} ptr (TESTLconst [c] x) mem) && isUint32PowerOfTwo(int64(c))
+ => (SET(B|AE)store [off] {sym} ptr (BTLconst [int8(log32(c))] x) mem)
+(SET(NE|EQ)store [off] {sym} ptr (TESTQconst [c] x) mem) && isUint64PowerOfTwo(int64(c))
+ => (SET(B|AE)store [off] {sym} ptr (BTQconst [int8(log32(c))] x) mem)
+(SET(NE|EQ)store [off] {sym} ptr (TESTQ (MOVQconst [c]) x) mem) && isUint64PowerOfTwo(c)
+ => (SET(B|AE)store [off] {sym} ptr (BTQconst [int8(log64(c))] x) mem)
+
+// Handle bit-testing in the form (a>>b)&1 != 0 by building the above rules
+// and further combining shifts.
+(BT(Q|L)const [c] (SHRQconst [d] x)) && (c+d)<64 => (BTQconst [c+d] x)
+(BT(Q|L)const [c] (SHLQconst [d] x)) && c>d => (BT(Q|L)const [c-d] x)
+(BT(Q|L)const [0] s:(SHRQ x y)) => (BTQ y x)
+(BTLconst [c] (SHRLconst [d] x)) && (c+d)<32 => (BTLconst [c+d] x)
+(BTLconst [c] (SHLLconst [d] x)) && c>d => (BTLconst [c-d] x)
+(BTLconst [0] s:(SHR(L|XL) x y)) => (BTL y x)
+
+// Rewrite a & 1 != 1 into a & 1 == 0.
+// Among other things, this lets us turn (a>>b)&1 != 1 into a bit test.
+(SET(NE|EQ) (CMPLconst [1] s:(ANDLconst [1] _))) => (SET(EQ|NE) (CMPLconst [0] s))
+(SET(NE|EQ)store [off] {sym} ptr (CMPLconst [1] s:(ANDLconst [1] _)) mem) => (SET(EQ|NE)store [off] {sym} ptr (CMPLconst [0] s) mem)
+(SET(NE|EQ) (CMPQconst [1] s:(ANDQconst [1] _))) => (SET(EQ|NE) (CMPQconst [0] s))
+(SET(NE|EQ)store [off] {sym} ptr (CMPQconst [1] s:(ANDQconst [1] _)) mem) => (SET(EQ|NE)store [off] {sym} ptr (CMPQconst [0] s) mem)
+
+// Recognize bit setting (a |= 1<<b) and toggling (a ^= 1<<b):
+// OR with (1 << y) becomes BTS, XOR with (1 << y) becomes BTC.
+(OR(Q|L) (SHL(Q|L) (MOV(Q|L)const [1]) y) x) => (BTS(Q|L) x y)
+(XOR(Q|L) (SHL(Q|L) (MOV(Q|L)const [1]) y) x) => (BTC(Q|L) x y)
+
+// Convert ORconst into BTS (and XORconst into BTC), if the code gets smaller,
+// with boundary being (ORL $40,AX is 3 bytes, ORL $80,AX is 6 bytes).
+// Only constants with a single bit set and a value of at least 128 are
+// rewritten; the same applies when the constant comes from a MOVconst operand.
+((ORQ|XORQ)const [c] x) && isUint64PowerOfTwo(int64(c)) && uint64(c) >= 128
+ => (BT(S|C)Qconst [int8(log32(c))] x)
+((ORL|XORL)const [c] x) && isUint32PowerOfTwo(int64(c)) && uint64(c) >= 128
+ => (BT(S|C)Lconst [int8(log32(c))] x)
+((ORQ|XORQ) (MOVQconst [c]) x) && isUint64PowerOfTwo(c) && uint64(c) >= 128
+ => (BT(S|C)Qconst [int8(log64(c))] x)
+((ORL|XORL) (MOVLconst [c]) x) && isUint32PowerOfTwo(int64(c)) && uint64(c) >= 128
+ => (BT(S|C)Lconst [int8(log32(c))] x)
+
+// Recognize bit clearing: a &^= 1<<b
+// The variable forms match AND with NOT(1 << y) and ANDN with (1 << y).
+// The constant forms require the complement of the mask (^c) to have a single
+// bit set and to be at least 128; the cleared bit is the index of that bit.
+(AND(Q|L) (NOT(Q|L) (SHL(Q|L) (MOV(Q|L)const [1]) y)) x) => (BTR(Q|L) x y)
+(ANDN(Q|L) x (SHL(Q|L) (MOV(Q|L)const [1]) y)) => (BTR(Q|L) x y)
+(ANDQconst [c] x) && isUint64PowerOfTwo(int64(^c)) && uint64(^c) >= 128
+ => (BTRQconst [int8(log32(^c))] x)
+(ANDLconst [c] x) && isUint32PowerOfTwo(int64(^c)) && uint64(^c) >= 128
+ => (BTRLconst [int8(log32(^c))] x)
+(ANDQ (MOVQconst [c]) x) && isUint64PowerOfTwo(^c) && uint64(^c) >= 128
+ => (BTRQconst [int8(log64(^c))] x)
+(ANDL (MOVLconst [c]) x) && isUint32PowerOfTwo(int64(^c)) && uint64(^c) >= 128
+ => (BTRLconst [int8(log32(^c))] x)
+
+// Special-case bit patterns on first/last bit.
+// generic.rules changes ANDs of high-part/low-part masks into a couple of shifts,
+// for instance:
+// x & 0xFFFF0000 -> (x >> 16) << 16
+// x & 0x80000000 -> (x >> 31) << 31
+//
+// In case the mask is just one bit (like second example above), it conflicts
+// with the above rules to detect bit-testing / bit-clearing of first/last bit.
+// We thus special-case them, by detecting the shift patterns.
+
+// Special case resetting first/last bit:
+// (x >> 1) << 1 clears bit 0, (x << 1) >> 1 clears the top bit (31 or 63).
+(SHL(L|Q)const [1] (SHR(L|Q)const [1] x))
+ => (BTR(L|Q)const [0] x)
+(SHRLconst [1] (SHLLconst [1] x))
+ => (BTRLconst [31] x)
+(SHRQconst [1] (SHLQconst [1] x))
+ => (BTRQconst [63] x)
+
+// Special case testing first/last bit (with double-shift generated by generic.rules)
+// The double-shifted value must be TESTed against itself (z1==z2).
+// (x >> N) << N with N = 63 or 31 becomes a test of bit N of x.
+// NOTE(review): the TESTL set/branch rule below matches an inner SHRQconst and
+// emits BTQconst [31], whereas the corresponding SET..store rule matches
+// SHRLconst and emits BTLconst [31] - confirm this asymmetry is intended.
+((SETNE|SETEQ|NE|EQ) (TESTQ z1:(SHLQconst [63] (SHRQconst [63] x)) z2)) && z1==z2
+ => ((SETB|SETAE|ULT|UGE) (BTQconst [63] x))
+((SETNE|SETEQ|NE|EQ) (TESTL z1:(SHLLconst [31] (SHRQconst [31] x)) z2)) && z1==z2
+ => ((SETB|SETAE|ULT|UGE) (BTQconst [31] x))
+(SET(NE|EQ)store [off] {sym} ptr (TESTQ z1:(SHLQconst [63] (SHRQconst [63] x)) z2) mem) && z1==z2
+ => (SET(B|AE)store [off] {sym} ptr (BTQconst [63] x) mem)
+(SET(NE|EQ)store [off] {sym} ptr (TESTL z1:(SHLLconst [31] (SHRLconst [31] x)) z2) mem) && z1==z2
+ => (SET(B|AE)store [off] {sym} ptr (BTLconst [31] x) mem)
+
+// (x << N) >> N with N = 63 or 31 becomes a test of bit 0 of x.
+((SETNE|SETEQ|NE|EQ) (TESTQ z1:(SHRQconst [63] (SHLQconst [63] x)) z2)) && z1==z2
+ => ((SETB|SETAE|ULT|UGE) (BTQconst [0] x))
+((SETNE|SETEQ|NE|EQ) (TESTL z1:(SHRLconst [31] (SHLLconst [31] x)) z2)) && z1==z2
+ => ((SETB|SETAE|ULT|UGE) (BTLconst [0] x))
+(SET(NE|EQ)store [off] {sym} ptr (TESTQ z1:(SHRQconst [63] (SHLQconst [63] x)) z2) mem) && z1==z2
+ => (SET(B|AE)store [off] {sym} ptr (BTQconst [0] x) mem)
+(SET(NE|EQ)store [off] {sym} ptr (TESTL z1:(SHRLconst [31] (SHLLconst [31] x)) z2) mem) && z1==z2
+ => (SET(B|AE)store [off] {sym} ptr (BTLconst [0] x) mem)
+
+// Special-case manually testing last bit with "a>>63 != 0" (without "&1")
+// The shifted value must be TESTed against itself (z1==z2); the result is a
+// direct test of the top bit (63 or 31) of x.
+((SETNE|SETEQ|NE|EQ) (TESTQ z1:(SHRQconst [63] x) z2)) && z1==z2
+ => ((SETB|SETAE|ULT|UGE) (BTQconst [63] x))
+((SETNE|SETEQ|NE|EQ) (TESTL z1:(SHRLconst [31] x) z2)) && z1==z2
+ => ((SETB|SETAE|ULT|UGE) (BTLconst [31] x))
+(SET(NE|EQ)store [off] {sym} ptr (TESTQ z1:(SHRQconst [63] x) z2) mem) && z1==z2
+ => (SET(B|AE)store [off] {sym} ptr (BTQconst [63] x) mem)
+(SET(NE|EQ)store [off] {sym} ptr (TESTL z1:(SHRLconst [31] x) z2) mem) && z1==z2
+ => (SET(B|AE)store [off] {sym} ptr (BTLconst [31] x) mem)
+
+// Fold combinations of bit ops on same bit. An example is math.Copysign(c,-1)
+// A set or clear of bit c makes a preceding clear/set/toggle of that same bit
+// redundant, so only the outer operation is kept.
+(BTS(Q|L)const [c] (BTR(Q|L)const [c] x)) => (BTS(Q|L)const [c] x)
+(BTS(Q|L)const [c] (BTC(Q|L)const [c] x)) => (BTS(Q|L)const [c] x)
+(BTR(Q|L)const [c] (BTS(Q|L)const [c] x)) => (BTR(Q|L)const [c] x)
+(BTR(Q|L)const [c] (BTC(Q|L)const [c] x)) => (BTR(Q|L)const [c] x)
+
+// Fold boolean negation into SETcc.
+// XOR-ing a SETcc result with 1 is replaced by the SETcc of the opposite
+// condition on the same flags value x.
+(XORLconst [1] (SETNE x)) => (SETEQ x)
+(XORLconst [1] (SETEQ x)) => (SETNE x)
+(XORLconst [1] (SETL x)) => (SETGE x)
+(XORLconst [1] (SETGE x)) => (SETL x)
+(XORLconst [1] (SETLE x)) => (SETG x)
+(XORLconst [1] (SETG x)) => (SETLE x)
+(XORLconst [1] (SETB x)) => (SETAE x)
+(XORLconst [1] (SETAE x)) => (SETB x)
+(XORLconst [1] (SETBE x)) => (SETA x)
+(XORLconst [1] (SETA x)) => (SETBE x)
+
+// Special case for floating point - LF/LEF not generated
+// A NE branch on a floating-point SETcc result TESTed against itself becomes
+// a branch of the matching kind directly on cmp.
+(NE (TESTB (SETGF cmp) (SETGF cmp)) yes no) => (UGT cmp yes no)
+(NE (TESTB (SETGEF cmp) (SETGEF cmp)) yes no) => (UGE cmp yes no)
+(NE (TESTB (SETEQF cmp) (SETEQF cmp)) yes no) => (EQF cmp yes no)
+(NE (TESTB (SETNEF cmp) (SETNEF cmp)) yes no) => (NEF cmp yes no)
+
+// Disabled because it interferes with the pattern match above and makes worse code.
+// (SETNEF x) => (ORQ (SETNE <typ.Int8> x) (SETNAN <typ.Int8> x))
+// (SETEQF x) => (ANDQ (SETEQ <typ.Int8> x) (SETORD <typ.Int8> x))
+
+// fold constants into instructions
+// A MOVQconst operand of a 64-bit op is only folded when it fits in 32 bits
+// (is32Bit); MOVLconst operands are folded unconditionally.
+(ADDQ x (MOVQconst [c])) && is32Bit(c) => (ADDQconst [int32(c)] x)
+(ADDQ x (MOVLconst [c])) => (ADDQconst [c] x)
+(ADDL x (MOVLconst [c])) => (ADDLconst [c] x)
+
+// c - x is expressed as -(x - c).
+(SUBQ x (MOVQconst [c])) && is32Bit(c) => (SUBQconst x [int32(c)])
+(SUBQ (MOVQconst [c]) x) && is32Bit(c) => (NEGQ (SUBQconst <v.Type> x [int32(c)]))
+(SUBL x (MOVLconst [c])) => (SUBLconst x [c])
+(SUBL (MOVLconst [c]) x) => (NEGL (SUBLconst <v.Type> x [c]))
+
+(MULQ x (MOVQconst [c])) && is32Bit(c) => (MULQconst [int32(c)] x)
+(MULL x (MOVLconst [c])) => (MULLconst [c] x)
+
+(ANDQ x (MOVQconst [c])) && is32Bit(c) => (ANDQconst [int32(c)] x)
+(ANDL x (MOVLconst [c])) => (ANDLconst [c] x)
+
+// Two nested const ops of the same kind collapse into one with the combined constant.
+(AND(L|Q)const [c] (AND(L|Q)const [d] x)) => (AND(L|Q)const [c & d] x)
+(XOR(L|Q)const [c] (XOR(L|Q)const [d] x)) => (XOR(L|Q)const [c ^ d] x)
+(OR(L|Q)const [c] (OR(L|Q)const [d] x)) => (OR(L|Q)const [c | d] x)
+
+// A 32-bit bit clear combined with an AND constant (or with another bit clear)
+// becomes a single ANDLconst with the cleared bit(s) removed from the mask.
+(BTRLconst [c] (ANDLconst [d] x)) => (ANDLconst [d &^ (1<<uint32(c))] x)
+(ANDLconst [c] (BTRLconst [d] x)) => (ANDLconst [c &^ (1<<uint32(d))] x)
+(BTRLconst [c] (BTRLconst [d] x)) => (ANDLconst [^(1<<uint32(c) | 1<<uint32(d))] x)
+
+// A 32-bit bit toggle combined with an XOR constant (or with another toggle)
+// becomes a single XORLconst. Two toggles are merged with ^ rather than |, so
+// that toggling the same bit twice (c == d) cancels out instead of being
+// rewritten into a single toggle; this matches the BTCQconst rule.
+(BTCLconst [c] (XORLconst [d] x)) => (XORLconst [d ^ 1<<uint32(c)] x)
+(XORLconst [c] (BTCLconst [d] x)) => (XORLconst [c ^ 1<<uint32(d)] x)
+(BTCLconst [c] (BTCLconst [d] x)) => (XORLconst [1<<uint32(c) ^ 1<<uint32(d)] x)
+
+// A 32-bit bit set combined with an OR constant (or with another bit set)
+// becomes a single ORLconst with the set bit(s) added to the mask.
+(BTSLconst [c] (ORLconst [d] x)) => (ORLconst [d | 1<<uint32(c)] x)
+(ORLconst [c] (BTSLconst [d] x)) => (ORLconst [c | 1<<uint32(d)] x)
+(BTSLconst [c] (BTSLconst [d] x)) => (ORLconst [1<<uint32(c) | 1<<uint32(d)] x)
+
+// 64-bit variants of the bit-op/constant merges. Each rule only fires when the
+// combined 64-bit mask still fits in 32 bits (is32Bit).
+(BTRQconst [c] (ANDQconst [d] x)) && is32Bit(int64(d) &^ (1<<uint32(c))) => (ANDQconst [d &^ (1<<uint32(c))] x)
+(ANDQconst [c] (BTRQconst [d] x)) && is32Bit(int64(c) &^ (1<<uint32(d))) => (ANDQconst [c &^ (1<<uint32(d))] x)
+(BTRQconst [c] (BTRQconst [d] x)) && is32Bit(^(1<<uint32(c) | 1<<uint32(d))) => (ANDQconst [^(1<<uint32(c) | 1<<uint32(d))] x)
+
+(BTCQconst [c] (XORQconst [d] x)) && is32Bit(int64(d) ^ 1<<uint32(c)) => (XORQconst [d ^ 1<<uint32(c)] x)
+(XORQconst [c] (BTCQconst [d] x)) && is32Bit(int64(c) ^ 1<<uint32(d)) => (XORQconst [c ^ 1<<uint32(d)] x)
+(BTCQconst [c] (BTCQconst [d] x)) && is32Bit(1<<uint32(c) ^ 1<<uint32(d)) => (XORQconst [1<<uint32(c) ^ 1<<uint32(d)] x)
+
+(BTSQconst [c] (ORQconst [d] x)) && is32Bit(int64(d) | 1<<uint32(c)) => (ORQconst [d | 1<<uint32(c)] x)
+(ORQconst [c] (BTSQconst [d] x)) && is32Bit(int64(c) | 1<<uint32(d)) => (ORQconst [c | 1<<uint32(d)] x)
+(BTSQconst [c] (BTSQconst [d] x)) && is32Bit(1<<uint32(c) | 1<<uint32(d)) => (ORQconst [1<<uint32(c) | 1<<uint32(d)] x)
+
+
+// Merge nested constant multiplications; the 64-bit form requires the product
+// to fit in 32 bits.
+(MULLconst [c] (MULLconst [d] x)) => (MULLconst [c * d] x)
+(MULQconst [c] (MULQconst [d] x)) && is32Bit(int64(c)*int64(d)) => (MULQconst [c * d] x)
+
+// Fold constant operands into OR and XOR (64-bit constants must fit in 32 bits).
+(ORQ x (MOVQconst [c])) && is32Bit(c) => (ORQconst [int32(c)] x)
+(ORQ x (MOVLconst [c])) => (ORQconst [c] x)
+(ORL x (MOVLconst [c])) => (ORLconst [c] x)
+
+(XORQ x (MOVQconst [c])) && is32Bit(c) => (XORQconst [int32(c)] x)
+(XORL x (MOVLconst [c])) => (XORLconst [c] x)
+
+// Shifts by a constant amount. The count is masked to 6 bits for the Q shifts
+// and to 5 bits for all others. SHRW/SHRB with a masked count of at least
+// 16/8 produce constant 0; SARW/SARB clamp the masked count to 15/7.
+(SHLQ x (MOV(Q|L)const [c])) => (SHLQconst [int8(c&63)] x)
+(SHLL x (MOV(Q|L)const [c])) => (SHLLconst [int8(c&31)] x)
+
+(SHRQ x (MOV(Q|L)const [c])) => (SHRQconst [int8(c&63)] x)
+(SHRL x (MOV(Q|L)const [c])) => (SHRLconst [int8(c&31)] x)
+(SHRW x (MOV(Q|L)const [c])) && c&31 < 16 => (SHRWconst [int8(c&31)] x)
+(SHRW _ (MOV(Q|L)const [c])) && c&31 >= 16 => (MOVLconst [0])
+(SHRB x (MOV(Q|L)const [c])) && c&31 < 8 => (SHRBconst [int8(c&31)] x)
+(SHRB _ (MOV(Q|L)const [c])) && c&31 >= 8 => (MOVLconst [0])
+
+(SARQ x (MOV(Q|L)const [c])) => (SARQconst [int8(c&63)] x)
+(SARL x (MOV(Q|L)const [c])) => (SARLconst [int8(c&31)] x)
+(SARW x (MOV(Q|L)const [c])) => (SARWconst [int8(min(int64(c)&31,15))] x)
+(SARB x (MOV(Q|L)const [c])) => (SARBconst [int8(min(int64(c)&31,7))] x)
+
+// Operations which don't affect the low 6/5 bits of the shift amount are NOPs.
+// Adding a constant whose low 6 (or 5) bits are zero, or AND-ing with a
+// constant whose low 6 (or 5) bits are all set, is dropped from the shift
+// amount, including when that operation sits underneath a NEG.
+((SHLQ|SHRQ|SARQ) x (ADDQconst [c] y)) && c & 63 == 0 => ((SHLQ|SHRQ|SARQ) x y)
+((SHLQ|SHRQ|SARQ) x (NEGQ <t> (ADDQconst [c] y))) && c & 63 == 0 => ((SHLQ|SHRQ|SARQ) x (NEGQ <t> y))
+((SHLQ|SHRQ|SARQ) x (ANDQconst [c] y)) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ) x y)
+((SHLQ|SHRQ|SARQ) x (NEGQ <t> (ANDQconst [c] y))) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ) x (NEGQ <t> y))
+
+((SHLL|SHRL|SARL) x (ADDQconst [c] y)) && c & 31 == 0 => ((SHLL|SHRL|SARL) x y)
+((SHLL|SHRL|SARL) x (NEGQ <t> (ADDQconst [c] y))) && c & 31 == 0 => ((SHLL|SHRL|SARL) x (NEGQ <t> y))
+((SHLL|SHRL|SARL) x (ANDQconst [c] y)) && c & 31 == 31 => ((SHLL|SHRL|SARL) x y)
+((SHLL|SHRL|SARL) x (NEGQ <t> (ANDQconst [c] y))) && c & 31 == 31 => ((SHLL|SHRL|SARL) x (NEGQ <t> y))
+
+// Same as above for shift amounts computed with 32-bit ops.
+((SHLQ|SHRQ|SARQ) x (ADDLconst [c] y)) && c & 63 == 0 => ((SHLQ|SHRQ|SARQ) x y)
+((SHLQ|SHRQ|SARQ) x (NEGL <t> (ADDLconst [c] y))) && c & 63 == 0 => ((SHLQ|SHRQ|SARQ) x (NEGL <t> y))
+((SHLQ|SHRQ|SARQ) x (ANDLconst [c] y)) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ) x y)
+((SHLQ|SHRQ|SARQ) x (NEGL <t> (ANDLconst [c] y))) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ) x (NEGL <t> y))
+
+((SHLL|SHRL|SARL) x (ADDLconst [c] y)) && c & 31 == 0 => ((SHLL|SHRL|SARL) x y)
+((SHLL|SHRL|SARL) x (NEGL <t> (ADDLconst [c] y))) && c & 31 == 0 => ((SHLL|SHRL|SARL) x (NEGL <t> y))
+((SHLL|SHRL|SARL) x (ANDLconst [c] y)) && c & 31 == 31 => ((SHLL|SHRL|SARL) x y)
+((SHLL|SHRL|SARL) x (NEGL <t> (ANDLconst [c] y))) && c & 31 == 31 => ((SHLL|SHRL|SARL) x (NEGL <t> y))
+
+// rotate left negative = rotate right
+// (the negation is dropped and the rotate direction is flipped)
+(ROLQ x (NEG(Q|L) y)) => (RORQ x y)
+(ROLL x (NEG(Q|L) y)) => (RORL x y)
+(ROLW x (NEG(Q|L) y)) => (RORW x y)
+(ROLB x (NEG(Q|L) y)) => (RORB x y)
+
+// rotate right negative = rotate left
+(RORQ x (NEG(Q|L) y)) => (ROLQ x y)
+(RORL x (NEG(Q|L) y)) => (ROLL x y)
+(RORW x (NEG(Q|L) y)) => (ROLW x y)
+(RORB x (NEG(Q|L) y)) => (ROLB x y)
+
+// rotate by constants
+// The count is masked to the operand width (63/31/15/7). Right rotates are
+// expressed as left rotates by the negated count, so only ROLxconst is emitted.
+(ROLQ x (MOV(Q|L)const [c])) => (ROLQconst [int8(c&63)] x)
+(ROLL x (MOV(Q|L)const [c])) => (ROLLconst [int8(c&31)] x)
+(ROLW x (MOV(Q|L)const [c])) => (ROLWconst [int8(c&15)] x)
+(ROLB x (MOV(Q|L)const [c])) => (ROLBconst [int8(c&7) ] x)
+
+(RORQ x (MOV(Q|L)const [c])) => (ROLQconst [int8((-c)&63)] x)
+(RORL x (MOV(Q|L)const [c])) => (ROLLconst [int8((-c)&31)] x)
+(RORW x (MOV(Q|L)const [c])) => (ROLWconst [int8((-c)&15)] x)
+(RORB x (MOV(Q|L)const [c])) => (ROLBconst [int8((-c)&7) ] x)
+
+// Constant shift simplifications
+// A shift or rotate by constant 0 is replaced by its operand.
+((SHLQ|SHRQ|SARQ)const x [0]) => x
+((SHLL|SHRL|SARL)const x [0]) => x
+((SHRW|SARW)const x [0]) => x
+((SHRB|SARB)const x [0]) => x
+((ROLQ|ROLL|ROLW|ROLB)const x [0]) => x
+
+// Multi-register shifts
+// lo shifted by bits, OR-ed with hi shifted in the opposite direction by
+// -bits, becomes a single double-register shift (SHRDQ/SHLDQ) of lo and hi.
+(ORQ (SH(R|L)Q lo bits) (SH(L|R)Q hi (NEGQ bits))) => (SH(R|L)DQ lo hi bits)
+(ORQ (SH(R|L)XQ lo bits) (SH(L|R)XQ hi (NEGQ bits))) => (SH(R|L)DQ lo hi bits)
+
+// Note: the word and byte shifts keep the low 5 bits (not the low 4 or 3 bits)
+// because the x86 instructions are defined to use all 5 bits of the shift even
+// for the small shifts. I don't think we'll ever generate a weird shift (e.g.
+// (SHRW x (MOVLconst [24]))), but just in case.
+
+// Fold constants into comparisons. When the constant is the first operand,
+// the operands are swapped and the result is wrapped in InvertFlags. The
+// constant is truncated to the comparison width for CMPW/CMPB.
+(CMPQ x (MOVQconst [c])) && is32Bit(c) => (CMPQconst x [int32(c)])
+(CMPQ (MOVQconst [c]) x) && is32Bit(c) => (InvertFlags (CMPQconst x [int32(c)]))
+(CMPL x (MOVLconst [c])) => (CMPLconst x [c])
+(CMPL (MOVLconst [c]) x) => (InvertFlags (CMPLconst x [c]))
+(CMPW x (MOVLconst [c])) => (CMPWconst x [int16(c)])
+(CMPW (MOVLconst [c]) x) => (InvertFlags (CMPWconst x [int16(c)]))
+(CMPB x (MOVLconst [c])) => (CMPBconst x [int8(c)])
+(CMPB (MOVLconst [c]) x) => (InvertFlags (CMPBconst x [int8(c)]))
+
+// Canonicalize the order of arguments to comparisons - helps with CSE.
+(CMP(Q|L|W|B) x y) && canonLessThan(x,y) => (InvertFlags (CMP(Q|L|W|B) y x))
+
+// Using MOVZX instead of AND is cheaper.
+(AND(Q|L)const [ 0xFF] x) => (MOVBQZX x)
+(AND(Q|L)const [0xFFFF] x) => (MOVWQZX x)
+// The rule below is currently invalid because 0xFFFFFFFF is not representable
+// by a signed int32. It is commented out for now; it also could not trigger,
+// because the is32Bit guard on the ANDQconst lowering rule above prevents
+// 0xFFFFFFFF from matching (for the same reason).
+// Using an alternate form of this rule segfaults some binaries because of
+// adverse interactions with other passes.
+// (ANDQconst [0xFFFFFFFF] x) => (MOVLQZX x)
+
+// strength reduction
+// Assumes the following costs from https://gmplib.org/~tege/x86-timing.pdf:
+// 1 - addq, shlq, leaq, negq, subq
+// 3 - imulq
+// This limits the rewrites to two instructions.
+// Note that negq always operates in-place,
+// which can require a register-register move
+// to preserve the original value,
+// so it must be used with care.
+(MUL(Q|L)const [-9] x) => (NEG(Q|L) (LEA(Q|L)8 <v.Type> x x))
+(MUL(Q|L)const [-5] x) => (NEG(Q|L) (LEA(Q|L)4 <v.Type> x x))
+(MUL(Q|L)const [-3] x) => (NEG(Q|L) (LEA(Q|L)2 <v.Type> x x))
+(MUL(Q|L)const [-1] x) => (NEG(Q|L) x)
+(MUL(Q|L)const [ 0] _) => (MOV(Q|L)const [0])
+(MUL(Q|L)const [ 1] x) => x
+(MUL(Q|L)const [ 3] x) => (LEA(Q|L)2 x x)
+(MUL(Q|L)const [ 5] x) => (LEA(Q|L)4 x x)
+(MUL(Q|L)const [ 7] x) => (LEA(Q|L)2 x (LEA(Q|L)2 <v.Type> x x))
+(MUL(Q|L)const [ 9] x) => (LEA(Q|L)8 x x)
+(MUL(Q|L)const [11] x) => (LEA(Q|L)2 x (LEA(Q|L)4 <v.Type> x x))
+(MUL(Q|L)const [13] x) => (LEA(Q|L)4 x (LEA(Q|L)2 <v.Type> x x))
+(MUL(Q|L)const [19] x) => (LEA(Q|L)2 x (LEA(Q|L)8 <v.Type> x x))
+(MUL(Q|L)const [21] x) => (LEA(Q|L)4 x (LEA(Q|L)4 <v.Type> x x))
+(MUL(Q|L)const [25] x) => (LEA(Q|L)8 x (LEA(Q|L)2 <v.Type> x x))
+(MUL(Q|L)const [27] x) => (LEA(Q|L)8 (LEA(Q|L)2 <v.Type> x x) (LEA(Q|L)2 <v.Type> x x))
+(MUL(Q|L)const [37] x) => (LEA(Q|L)4 x (LEA(Q|L)8 <v.Type> x x))
+(MUL(Q|L)const [41] x) => (LEA(Q|L)8 x (LEA(Q|L)4 <v.Type> x x))
+(MUL(Q|L)const [45] x) => (LEA(Q|L)8 (LEA(Q|L)4 <v.Type> x x) (LEA(Q|L)4 <v.Type> x x))
+(MUL(Q|L)const [73] x) => (LEA(Q|L)8 x (LEA(Q|L)8 <v.Type> x x))
+(MUL(Q|L)const [81] x) => (LEA(Q|L)8 (LEA(Q|L)8 <v.Type> x x) (LEA(Q|L)8 <v.Type> x x))
+
+// c = 2^k - 1: (x << k) - x
+(MUL(Q|L)const [c] x) && isPowerOfTwo64(int64(c)+1) && c >= 15 => (SUB(Q|L) (SHL(Q|L)const <v.Type> [int8(log64(int64(c)+1))] x) x)
+// c = 2^k + {1,2,4,8}: (x << k) + {1,2,4,8}*x via LEA{1,2,4,8}
+(MUL(Q|L)const [c] x) && isPowerOfTwo32(c-1) && c >= 17 => (LEA(Q|L)1 (SHL(Q|L)const <v.Type> [int8(log32(c-1))] x) x)
+(MUL(Q|L)const [c] x) && isPowerOfTwo32(c-2) && c >= 34 => (LEA(Q|L)2 (SHL(Q|L)const <v.Type> [int8(log32(c-2))] x) x)
+(MUL(Q|L)const [c] x) && isPowerOfTwo32(c-4) && c >= 68 => (LEA(Q|L)4 (SHL(Q|L)const <v.Type> [int8(log32(c-4))] x) x)
+(MUL(Q|L)const [c] x) && isPowerOfTwo32(c-8) && c >= 136 => (LEA(Q|L)8 (SHL(Q|L)const <v.Type> [int8(log32(c-8))] x) x)
+// c = {3,5,9} * 2^k: ({3,5,9}*x) << k, with the small multiple built by LEA{2,4,8}
+(MUL(Q|L)const [c] x) && c%3 == 0 && isPowerOfTwo32(c/3) => (SHL(Q|L)const [int8(log32(c/3))] (LEA(Q|L)2 <v.Type> x x))
+(MUL(Q|L)const [c] x) && c%5 == 0 && isPowerOfTwo32(c/5) => (SHL(Q|L)const [int8(log32(c/5))] (LEA(Q|L)4 <v.Type> x x))
+(MUL(Q|L)const [c] x) && c%9 == 0 && isPowerOfTwo32(c/9) => (SHL(Q|L)const [int8(log32(c/9))] (LEA(Q|L)8 <v.Type> x x))
+
+// combine add/shift into LEAQ/LEAL
+// x + (y << {1,2,3}) becomes LEA{2,4,8} x y; x + (y + y) and x + (x + y)
+// are likewise expressed as a scale-by-2 LEA.
+(ADD(L|Q) x (SHL(L|Q)const [3] y)) => (LEA(L|Q)8 x y)
+(ADD(L|Q) x (SHL(L|Q)const [2] y)) => (LEA(L|Q)4 x y)
+(ADD(L|Q) x (SHL(L|Q)const [1] y)) => (LEA(L|Q)2 x y)
+(ADD(L|Q) x (ADD(L|Q) y y)) => (LEA(L|Q)2 x y)
+(ADD(L|Q) x (ADD(L|Q) x y)) => (LEA(L|Q)2 y x)
+
+// combine ADDQ/ADDQconst into LEAQ1/LEAL1
+// A constant add on top of (or underneath) a two-operand add becomes a single
+// LEA1 carrying the constant as its offset; c + (x << 1) becomes LEA1 [c] x x.
+(ADD(Q|L)const [c] (ADD(Q|L) x y)) => (LEA(Q|L)1 [c] x y)
+(ADD(Q|L) (ADD(Q|L)const [c] x) y) => (LEA(Q|L)1 [c] x y)
+(ADD(Q|L)const [c] (SHL(Q|L)const [1] x)) => (LEA(Q|L)1 [c] x x)
+
+// fold ADDQ/ADDL into LEAQ/LEAL
+// Constant adds are merged into the LEA offset when the sum fits in 32 bits.
+// A two-operand add is merged into an LEA1 only when neither operand is SB.
+(ADD(Q|L)const [c] (LEA(Q|L) [d] {s} x)) && is32Bit(int64(c)+int64(d)) => (LEA(Q|L) [c+d] {s} x)
+(LEA(Q|L) [c] {s} (ADD(Q|L)const [d] x)) && is32Bit(int64(c)+int64(d)) => (LEA(Q|L) [c+d] {s} x)
+(LEA(Q|L) [c] {s} (ADD(Q|L) x y)) && x.Op != OpSB && y.Op != OpSB => (LEA(Q|L)1 [c] {s} x y)
+(ADD(Q|L) x (LEA(Q|L) [c] {s} y)) && x.Op != OpSB && y.Op != OpSB => (LEA(Q|L)1 [c] {s} x y)
+
+// fold ADDQconst/ADDLconst into LEAQx/LEALx
+// The combined offset must fit in 32 bits. A constant added to the scaled
+// (second) operand is multiplied by the scale before being folded into the
+// offset. Folding through an operand is skipped when that operand is SB.
+(ADD(Q|L)const [c] (LEA(Q|L)1 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEA(Q|L)1 [c+d] {s} x y)
+(ADD(Q|L)const [c] (LEA(Q|L)2 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEA(Q|L)2 [c+d] {s} x y)
+(ADD(Q|L)const [c] (LEA(Q|L)4 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEA(Q|L)4 [c+d] {s} x y)
+(ADD(Q|L)const [c] (LEA(Q|L)8 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEA(Q|L)8 [c+d] {s} x y)
+(LEA(Q|L)1 [c] {s} (ADD(Q|L)const [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEA(Q|L)1 [c+d] {s} x y)
+(LEA(Q|L)2 [c] {s} (ADD(Q|L)const [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEA(Q|L)2 [c+d] {s} x y)
+(LEA(Q|L)2 [c] {s} x (ADD(Q|L)const [d] y)) && is32Bit(int64(c)+2*int64(d)) && y.Op != OpSB => (LEA(Q|L)2 [c+2*d] {s} x y)
+(LEA(Q|L)4 [c] {s} (ADD(Q|L)const [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEA(Q|L)4 [c+d] {s} x y)
+(LEA(Q|L)4 [c] {s} x (ADD(Q|L)const [d] y)) && is32Bit(int64(c)+4*int64(d)) && y.Op != OpSB => (LEA(Q|L)4 [c+4*d] {s} x y)
+(LEA(Q|L)8 [c] {s} (ADD(Q|L)const [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEA(Q|L)8 [c+d] {s} x y)
+(LEA(Q|L)8 [c] {s} x (ADD(Q|L)const [d] y)) && is32Bit(int64(c)+8*int64(d)) && y.Op != OpSB => (LEA(Q|L)8 [c+8*d] {s} x y)
+
+// fold shifts into LEAQx/LEALx
+// A left shift by 1, 2 or 3 of the second operand is absorbed by multiplying
+// the LEA scale by 2, 4 or 8, as long as the resulting scale is at most 8.
+(LEA(Q|L)1 [c] {s} x (SHL(Q|L)const [1] y)) => (LEA(Q|L)2 [c] {s} x y)
+(LEA(Q|L)1 [c] {s} x (SHL(Q|L)const [2] y)) => (LEA(Q|L)4 [c] {s} x y)
+(LEA(Q|L)1 [c] {s} x (SHL(Q|L)const [3] y)) => (LEA(Q|L)8 [c] {s} x y)
+(LEA(Q|L)2 [c] {s} x (SHL(Q|L)const [1] y)) => (LEA(Q|L)4 [c] {s} x y)
+(LEA(Q|L)2 [c] {s} x (SHL(Q|L)const [2] y)) => (LEA(Q|L)8 [c] {s} x y)
+(LEA(Q|L)4 [c] {s} x (SHL(Q|L)const [1] y)) => (LEA(Q|L)8 [c] {s} x y)
+
+// reverse ordering of compare instruction
+// A SETcc of InvertFlags becomes the mirrored condition on the original flags
+// (L<->G, B<->A, LE<->GE, BE<->AE); EQ and NE are left unchanged.
+(SETL (InvertFlags x)) => (SETG x)
+(SETG (InvertFlags x)) => (SETL x)
+(SETB (InvertFlags x)) => (SETA x)
+(SETA (InvertFlags x)) => (SETB x)
+(SETLE (InvertFlags x)) => (SETGE x)
+(SETGE (InvertFlags x)) => (SETLE x)
+(SETBE (InvertFlags x)) => (SETAE x)
+(SETAE (InvertFlags x)) => (SETBE x)
+(SETEQ (InvertFlags x)) => (SETEQ x)
+(SETNE (InvertFlags x)) => (SETNE x)
+
+// Same mirroring for the SETccstore forms.
+(SETLstore [off] {sym} ptr (InvertFlags x) mem) => (SETGstore [off] {sym} ptr x mem)
+(SETGstore [off] {sym} ptr (InvertFlags x) mem) => (SETLstore [off] {sym} ptr x mem)
+(SETBstore [off] {sym} ptr (InvertFlags x) mem) => (SETAstore [off] {sym} ptr x mem)
+(SETAstore [off] {sym} ptr (InvertFlags x) mem) => (SETBstore [off] {sym} ptr x mem)
+(SETLEstore [off] {sym} ptr (InvertFlags x) mem) => (SETGEstore [off] {sym} ptr x mem)
+(SETGEstore [off] {sym} ptr (InvertFlags x) mem) => (SETLEstore [off] {sym} ptr x mem)
+(SETBEstore [off] {sym} ptr (InvertFlags x) mem) => (SETAEstore [off] {sym} ptr x mem)
+(SETAEstore [off] {sym} ptr (InvertFlags x) mem) => (SETBEstore [off] {sym} ptr x mem)
+(SETEQstore [off] {sym} ptr (InvertFlags x) mem) => (SETEQstore [off] {sym} ptr x mem)
+(SETNEstore [off] {sym} ptr (InvertFlags x) mem) => (SETNEstore [off] {sym} ptr x mem)
+
+// sign- and zero-extended loads
+// An extension of a loaded value is merged into the load: sign extensions
+// become the matching MOVxQSXload, zero extensions become a plain load of the
+// extended width.
+// Note: The combined instruction must end up in the same block
+// as the original load. If not, we end up making a value with
+// memory type live in two different blocks, which can lead to
+// multiple memory values alive simultaneously.
+// Make sure we don't combine these ops if the load has another use.
+// This prevents a single load from being split into multiple loads
+// which then might return different values. See test/atomicload.go.
+(MOVBQSX x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBQSXload <v.Type> [off] {sym} ptr mem)
+(MOVBQSX x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBQSXload <v.Type> [off] {sym} ptr mem)
+(MOVBQSX x:(MOVLload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBQSXload <v.Type> [off] {sym} ptr mem)
+(MOVBQSX x:(MOVQload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBQSXload <v.Type> [off] {sym} ptr mem)
+(MOVBQZX x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBload <v.Type> [off] {sym} ptr mem)
+(MOVBQZX x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBload <v.Type> [off] {sym} ptr mem)
+(MOVBQZX x:(MOVLload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBload <v.Type> [off] {sym} ptr mem)
+(MOVBQZX x:(MOVQload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBload <v.Type> [off] {sym} ptr mem)
+(MOVWQSX x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWQSXload <v.Type> [off] {sym} ptr mem)
+(MOVWQSX x:(MOVLload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWQSXload <v.Type> [off] {sym} ptr mem)
+(MOVWQSX x:(MOVQload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWQSXload <v.Type> [off] {sym} ptr mem)
+(MOVWQZX x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWload <v.Type> [off] {sym} ptr mem)
+(MOVWQZX x:(MOVLload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWload <v.Type> [off] {sym} ptr mem)
+(MOVWQZX x:(MOVQload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWload <v.Type> [off] {sym} ptr mem)
+(MOVLQSX x:(MOVLload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVLQSXload <v.Type> [off] {sym} ptr mem)
+(MOVLQSX x:(MOVQload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVLQSXload <v.Type> [off] {sym} ptr mem)
+(MOVLQZX x:(MOVLload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVLload <v.Type> [off] {sym} ptr mem)
+(MOVLQZX x:(MOVQload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVLload <v.Type> [off] {sym} ptr mem)
+
+// Drop a zero extension when the zeroUpperNNBits helper accepts x.
+// NOTE(review): presumably the helper reports that the upper bits of x are
+// already zero and the second argument (3) bounds how deep it looks - confirm
+// against the helper's definition.
+(MOVLQZX x) && zeroUpper32Bits(x,3) => x
+(MOVWQZX x) && zeroUpper48Bits(x,3) => x
+(MOVBQZX x) && zeroUpper56Bits(x,3) => x
+
+// replace load from same location as preceding store with zero/sign extension (or copy in case of full width)
+// The load's memory argument must be a store of the same width with an equal
+// symbol and offset and the same pointer (isSamePtr).
+(MOVBload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBQZX x)
+(MOVWload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVWQZX x)
+(MOVLload [off] {sym} ptr (MOVLstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVLQZX x)
+(MOVQload [off] {sym} ptr (MOVQstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x
+(MOVBQSXload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBQSX x)
+(MOVWQSXload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVWQSX x)
+(MOVLQSXload [off] {sym} ptr (MOVLstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVLQSX x)
+
+// Fold extensions and ANDs together.
+// A zero extension of an ANDLconst narrows the mask to the extended width.
+// A sign extension is only folded when the mask clears the sign bit of the
+// narrow value (bit 7, 15 or 31).
+(MOVBQZX (ANDLconst [c] x)) => (ANDLconst [c & 0xff] x)
+(MOVWQZX (ANDLconst [c] x)) => (ANDLconst [c & 0xffff] x)
+(MOVLQZX (ANDLconst [c] x)) => (ANDLconst [c] x)
+(MOVBQSX (ANDLconst [c] x)) && c & 0x80 == 0 => (ANDLconst [c & 0x7f] x)
+(MOVWQSX (ANDLconst [c] x)) && c & 0x8000 == 0 => (ANDLconst [c & 0x7fff] x)
+(MOVLQSX (ANDLconst [c] x)) && uint32(c) & 0x80000000 == 0 => (ANDLconst [c & 0x7fffffff] x)
+
+// Don't extend before storing
+// When the stored value is a sign or zero extension from the same width as
+// the store, the extension is dropped and x is stored directly.
+(MOVLstore [off] {sym} ptr (MOVLQSX x) mem) => (MOVLstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWQSX x) mem) => (MOVWstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBQSX x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVLstore [off] {sym} ptr (MOVLQZX x) mem) => (MOVLstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWQZX x) mem) => (MOVWstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBQZX x) mem) => (MOVBstore [off] {sym} ptr x mem)
+
+// fold constants into memory operations
+// Note that this is not always a good idea because if not all the uses of
+// the ADDQconst get eliminated, we still have to compute the ADDQconst and we now
+// have potentially two live values (ptr and (ADDQconst [off] ptr)) instead of one.
+// Nevertheless, let's do it!
+// Each rule requires the combined offset to still fit in 32 bits
+// (is32Bit, or canAdd32 for ops carrying a ValAndOff).
+(MOV(Q|L|W|B|SS|SD|O)load [off1] {sym} (ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOV(Q|L|W|B|SS|SD|O)load [off1+off2] {sym} ptr mem)
+(MOV(Q|L|W|B|SS|SD|O)store [off1] {sym} (ADDQconst [off2] ptr) val mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOV(Q|L|W|B|SS|SD|O)store [off1+off2] {sym} ptr val mem)
+(SET(L|G|B|A|LE|GE|BE|AE|EQ|NE)store [off1] {sym} (ADDQconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (SET(L|G|B|A|LE|GE|BE|AE|EQ|NE)store [off1+off2] {sym} base val mem)
+((ADD|SUB|AND|OR|XOR)Qload [off1] {sym} val (ADDQconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ ((ADD|SUB|AND|OR|XOR)Qload [off1+off2] {sym} val base mem)
+((ADD|SUB|AND|OR|XOR)Lload [off1] {sym} val (ADDQconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ ((ADD|SUB|AND|OR|XOR)Lload [off1+off2] {sym} val base mem)
+(CMP(Q|L|W|B)load [off1] {sym} (ADDQconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (CMP(Q|L|W|B)load [off1+off2] {sym} base val mem)
+(CMP(Q|L|W|B)constload [valoff1] {sym} (ADDQconst [off2] base) mem) && ValAndOff(valoff1).canAdd32(off2) =>
+ (CMP(Q|L|W|B)constload [ValAndOff(valoff1).addOffset32(off2)] {sym} base mem)
+
+((ADD|SUB|MUL|DIV)SSload [off1] {sym} val (ADDQconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ ((ADD|SUB|MUL|DIV)SSload [off1+off2] {sym} val base mem)
+((ADD|SUB|MUL|DIV)SDload [off1] {sym} val (ADDQconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ ((ADD|SUB|MUL|DIV)SDload [off1+off2] {sym} val base mem)
+((ADD|AND|OR|XOR)Qconstmodify [valoff1] {sym} (ADDQconst [off2] base) mem) && ValAndOff(valoff1).canAdd32(off2) =>
+ ((ADD|AND|OR|XOR)Qconstmodify [ValAndOff(valoff1).addOffset32(off2)] {sym} base mem)
+((ADD|AND|OR|XOR)Lconstmodify [valoff1] {sym} (ADDQconst [off2] base) mem) && ValAndOff(valoff1).canAdd32(off2) =>
+ ((ADD|AND|OR|XOR)Lconstmodify [ValAndOff(valoff1).addOffset32(off2)] {sym} base mem)
+((ADD|SUB|AND|OR|XOR)Qmodify [off1] {sym} (ADDQconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) =>
+ ((ADD|SUB|AND|OR|XOR)Qmodify [off1+off2] {sym} base val mem)
+((ADD|SUB|AND|OR|XOR)Lmodify [off1] {sym} (ADDQconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) =>
+ ((ADD|SUB|AND|OR|XOR)Lmodify [off1+off2] {sym} base val mem)
+
+// Fold constants into stores.
+// The constant and the offset are packed together with makeValAndOff. The
+// 64-bit store requires validVal(c); narrower stores truncate the constant to
+// the store width first.
+(MOVQstore [off] {sym} ptr (MOVQconst [c]) mem) && validVal(c) =>
+ (MOVQstoreconst [makeValAndOff(int32(c),off)] {sym} ptr mem)
+(MOVLstore [off] {sym} ptr (MOV(L|Q)const [c]) mem) =>
+ (MOVLstoreconst [makeValAndOff(int32(c),off)] {sym} ptr mem)
+(MOVWstore [off] {sym} ptr (MOV(L|Q)const [c]) mem) =>
+ (MOVWstoreconst [makeValAndOff(int32(int16(c)),off)] {sym} ptr mem)
+(MOVBstore [off] {sym} ptr (MOV(L|Q)const [c]) mem) =>
+ (MOVBstoreconst [makeValAndOff(int32(int8(c)),off)] {sym} ptr mem)
+
+// Fold address offsets into constant stores.
+// Only done when the offset can be added to the packed ValAndOff (canAdd32).
+(MOV(Q|L|W|B|O)storeconst [sc] {s} (ADDQconst [off] ptr) mem) && ValAndOff(sc).canAdd32(off) =>
+ (MOV(Q|L|W|B|O)storeconst [ValAndOff(sc).addOffset32(off)] {s} ptr mem)
+
+// We need to fold LEAQ into the MOVx ops so that the live variable analysis knows
+// what variables are being read/written by the ops.
+// Each rule requires the two symbols to be mergeable (canMergeSym) and the
+// combined offset to fit in 32 bits (is32Bit, or canAdd32 for ValAndOff ops);
+// the result carries mergeSym(sym1,sym2) and the summed offset.
+(MOV(Q|L|W|B|SS|SD|O|BQSX|WQSX|LQSX)load [off1] {sym1} (LEAQ [off2] {sym2} base) mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOV(Q|L|W|B|SS|SD|O|BQSX|WQSX|LQSX)load [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOV(Q|L|W|B|SS|SD|O)store [off1] {sym1} (LEAQ [off2] {sym2} base) val mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOV(Q|L|W|B|SS|SD|O)store [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOV(Q|L|W|B|O)storeconst [sc] {sym1} (LEAQ [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd32(off) =>
+ (MOV(Q|L|W|B|O)storeconst [ValAndOff(sc).addOffset32(off)] {mergeSym(sym1, sym2)} ptr mem)
+(SET(L|G|B|A|LE|GE|BE|AE|EQ|NE)store [off1] {sym1} (LEAQ [off2] {sym2} base) val mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (SET(L|G|B|A|LE|GE|BE|AE|EQ|NE)store [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+((ADD|SUB|AND|OR|XOR)Qload [off1] {sym1} val (LEAQ [off2] {sym2} base) mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ ((ADD|SUB|AND|OR|XOR)Qload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+((ADD|SUB|AND|OR|XOR)Lload [off1] {sym1} val (LEAQ [off2] {sym2} base) mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ ((ADD|SUB|AND|OR|XOR)Lload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+(CMP(Q|L|W|B)load [off1] {sym1} (LEAQ [off2] {sym2} base) val mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (CMP(Q|L|W|B)load [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(CMP(Q|L|W|B)constload [valoff1] {sym1} (LEAQ [off2] {sym2} base) mem)
+ && ValAndOff(valoff1).canAdd32(off2) && canMergeSym(sym1, sym2) =>
+ (CMP(Q|L|W|B)constload [ValAndOff(valoff1).addOffset32(off2)] {mergeSym(sym1,sym2)} base mem)
+
+((ADD|SUB|MUL|DIV)SSload [off1] {sym1} val (LEAQ [off2] {sym2} base) mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ ((ADD|SUB|MUL|DIV)SSload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+((ADD|SUB|MUL|DIV)SDload [off1] {sym1} val (LEAQ [off2] {sym2} base) mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ ((ADD|SUB|MUL|DIV)SDload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+((ADD|AND|OR|XOR)Qconstmodify [valoff1] {sym1} (LEAQ [off2] {sym2} base) mem)
+ && ValAndOff(valoff1).canAdd32(off2) && canMergeSym(sym1, sym2) =>
+ ((ADD|AND|OR|XOR)Qconstmodify [ValAndOff(valoff1).addOffset32(off2)] {mergeSym(sym1,sym2)} base mem)
+((ADD|AND|OR|XOR)Lconstmodify [valoff1] {sym1} (LEAQ [off2] {sym2} base) mem)
+ && ValAndOff(valoff1).canAdd32(off2) && canMergeSym(sym1, sym2) =>
+ ((ADD|AND|OR|XOR)Lconstmodify [ValAndOff(valoff1).addOffset32(off2)] {mergeSym(sym1,sym2)} base mem)
+((ADD|SUB|AND|OR|XOR)Qmodify [off1] {sym1} (LEAQ [off2] {sym2} base) val mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ ((ADD|SUB|AND|OR|XOR)Qmodify [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+((ADD|SUB|AND|OR|XOR)Lmodify [off1] {sym1} (LEAQ [off2] {sym2} base) val mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ ((ADD|SUB|AND|OR|XOR)Lmodify [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+
+// fold LEAQs together
+// The offsets are summed (must fit in 32 bits) and the symbols merged
+// (must satisfy canMergeSym).
+(LEAQ [off1] {sym1} (LEAQ [off2] {sym2} x)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAQ [off1+off2] {mergeSym(sym1,sym2)} x)
+
+// LEAQ into LEAQ1
+// Only the first (unscaled) operand is folded, and not when the inner base is SB.
+(LEAQ1 [off1] {sym1} (LEAQ [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB =>
+ (LEAQ1 [off1+off2] {mergeSym(sym1,sym2)} x y)
+
+// LEAQ1 into LEAQ
+(LEAQ [off1] {sym1} (LEAQ1 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAQ1 [off1+off2] {mergeSym(sym1,sym2)} x y)
+
+// LEAQ into LEAQ[248]
+// As for LEAQ1, only the first (unscaled) operand is folded, and not when the
+// inner base is SB.
+(LEAQ2 [off1] {sym1} (LEAQ [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB =>
+ (LEAQ2 [off1+off2] {mergeSym(sym1,sym2)} x y)
+(LEAQ4 [off1] {sym1} (LEAQ [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB =>
+ (LEAQ4 [off1+off2] {mergeSym(sym1,sym2)} x y)
+(LEAQ8 [off1] {sym1} (LEAQ [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB =>
+ (LEAQ8 [off1+off2] {mergeSym(sym1,sym2)} x y)
+
+// LEAQ[248] into LEAQ
+(LEAQ [off1] {sym1} (LEAQ2 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAQ2 [off1+off2] {mergeSym(sym1,sym2)} x y)
+(LEAQ [off1] {sym1} (LEAQ4 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAQ4 [off1+off2] {mergeSym(sym1,sym2)} x y)
+(LEAQ [off1] {sym1} (LEAQ8 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAQ8 [off1+off2] {mergeSym(sym1,sym2)} x y)
+
+// LEAQ[1248] into LEAQ[1248]. Only some such merges are possible.
+// x + (y + y) and x + (x + y) become a scale-by-2 LEAQ2. When the inner LEAQ1
+// is the scaled operand of a LEAQ2, its offset is doubled and it must carry no
+// symbol (sym2 == nil).
+(LEAQ1 [off1] {sym1} x (LEAQ1 [off2] {sym2} y y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAQ2 [off1+off2] {mergeSym(sym1, sym2)} x y)
+(LEAQ1 [off1] {sym1} x (LEAQ1 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAQ2 [off1+off2] {mergeSym(sym1, sym2)} y x)
+(LEAQ2 [off1] {sym1} x (LEAQ1 [off2] {sym2} y y)) && is32Bit(int64(off1)+2*int64(off2)) && sym2 == nil =>
+ (LEAQ4 [off1+2*off2] {sym1} x y)
+(LEAQ4 [off1] {sym1} x (LEAQ1 [off2] {sym2} y y)) && is32Bit(int64(off1)+4*int64(off2)) && sym2 == nil =>
+ (LEAQ8 [off1+4*off2] {sym1} x y)
+// TODO: more?
+
+// Lower LEAQ2/4/8 to LEAQ when the index operand is a constant (fold index*scale into the offset)
+(LEAQ2 [off] {sym} x (MOV(Q|L)const [scale])) && is32Bit(int64(off)+int64(scale)*2) =>
+ (LEAQ [off+int32(scale)*2] {sym} x)
+(LEAQ4 [off] {sym} x (MOV(Q|L)const [scale])) && is32Bit(int64(off)+int64(scale)*4) =>
+ (LEAQ [off+int32(scale)*4] {sym} x)
+(LEAQ8 [off] {sym} x (MOV(Q|L)const [scale])) && is32Bit(int64(off)+int64(scale)*8) =>
+ (LEAQ [off+int32(scale)*8] {sym} x)
+
+// Absorb InvertFlags into branches.
+(LT (InvertFlags cmp) yes no) => (GT cmp yes no)
+(GT (InvertFlags cmp) yes no) => (LT cmp yes no)
+(LE (InvertFlags cmp) yes no) => (GE cmp yes no)
+(GE (InvertFlags cmp) yes no) => (LE cmp yes no)
+(ULT (InvertFlags cmp) yes no) => (UGT cmp yes no)
+(UGT (InvertFlags cmp) yes no) => (ULT cmp yes no)
+(ULE (InvertFlags cmp) yes no) => (UGE cmp yes no)
+(UGE (InvertFlags cmp) yes no) => (ULE cmp yes no)
+(EQ (InvertFlags cmp) yes no) => (EQ cmp yes no)
+(NE (InvertFlags cmp) yes no) => (NE cmp yes no)
+
+// Constant comparisons.
+(CMPQconst (MOVQconst [x]) [y]) && x==int64(y) => (FlagEQ)
+(CMPQconst (MOVQconst [x]) [y]) && x<int64(y) && uint64(x)<uint64(int64(y)) => (FlagLT_ULT)
+(CMPQconst (MOVQconst [x]) [y]) && x<int64(y) && uint64(x)>uint64(int64(y)) => (FlagLT_UGT)
+(CMPQconst (MOVQconst [x]) [y]) && x>int64(y) && uint64(x)<uint64(int64(y)) => (FlagGT_ULT)
+(CMPQconst (MOVQconst [x]) [y]) && x>int64(y) && uint64(x)>uint64(int64(y)) => (FlagGT_UGT)
+(CMPLconst (MOVLconst [x]) [y]) && x==y => (FlagEQ)
+(CMPLconst (MOVLconst [x]) [y]) && x<y && uint32(x)<uint32(y) => (FlagLT_ULT)
+(CMPLconst (MOVLconst [x]) [y]) && x<y && uint32(x)>uint32(y) => (FlagLT_UGT)
+(CMPLconst (MOVLconst [x]) [y]) && x>y && uint32(x)<uint32(y) => (FlagGT_ULT)
+(CMPLconst (MOVLconst [x]) [y]) && x>y && uint32(x)>uint32(y) => (FlagGT_UGT)
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)==y => (FlagEQ)
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)<y && uint16(x)<uint16(y) => (FlagLT_ULT)
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)<y && uint16(x)>uint16(y) => (FlagLT_UGT)
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)>y && uint16(x)<uint16(y) => (FlagGT_ULT)
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)>y && uint16(x)>uint16(y) => (FlagGT_UGT)
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)==y => (FlagEQ)
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)<y && uint8(x)<uint8(y) => (FlagLT_ULT)
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)<y && uint8(x)>uint8(y) => (FlagLT_UGT)
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)>y && uint8(x)<uint8(y) => (FlagGT_ULT)
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)>y && uint8(x)>uint8(y) => (FlagGT_UGT)
+
+// CMPQconst requires a 32 bit const, but we can still constant-fold 64 bit consts.
+// In theory this applies to any of the simplifications above,
+// but CMPQ is the only one I've actually seen occur.
+(CMPQ (MOVQconst [x]) (MOVQconst [y])) && x==y => (FlagEQ)
+(CMPQ (MOVQconst [x]) (MOVQconst [y])) && x<y && uint64(x)<uint64(y) => (FlagLT_ULT)
+(CMPQ (MOVQconst [x]) (MOVQconst [y])) && x<y && uint64(x)>uint64(y) => (FlagLT_UGT)
+(CMPQ (MOVQconst [x]) (MOVQconst [y])) && x>y && uint64(x)<uint64(y) => (FlagGT_ULT)
+(CMPQ (MOVQconst [x]) (MOVQconst [y])) && x>y && uint64(x)>uint64(y) => (FlagGT_UGT)
+
+// Other known comparisons.
+(CMPQconst (MOVBQZX _) [c]) && 0xFF < c => (FlagLT_ULT)
+(CMPQconst (MOVWQZX _) [c]) && 0xFFFF < c => (FlagLT_ULT)
+(CMPLconst (SHRLconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 32 && (1<<uint64(32-c)) <= uint64(n) => (FlagLT_ULT)
+(CMPQconst (SHRQconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 64 && (1<<uint64(64-c)) <= uint64(n) => (FlagLT_ULT)
+(CMPQconst (ANDQconst _ [m]) [n]) && 0 <= m && m < n => (FlagLT_ULT)
+(CMPQconst (ANDLconst _ [m]) [n]) && 0 <= m && m < n => (FlagLT_ULT)
+(CMPLconst (ANDLconst _ [m]) [n]) && 0 <= m && m < n => (FlagLT_ULT)
+(CMPWconst (ANDLconst _ [m]) [n]) && 0 <= int16(m) && int16(m) < n => (FlagLT_ULT)
+(CMPBconst (ANDLconst _ [m]) [n]) && 0 <= int8(m) && int8(m) < n => (FlagLT_ULT)
+
+// TESTQ c c sets flags like CMPQ c 0.
+(TESTQconst [c] (MOVQconst [d])) && int64(c) == d && c == 0 => (FlagEQ)
+(TESTLconst [c] (MOVLconst [c])) && c == 0 => (FlagEQ)
+(TESTQconst [c] (MOVQconst [d])) && int64(c) == d && c < 0 => (FlagLT_UGT)
+(TESTLconst [c] (MOVLconst [c])) && c < 0 => (FlagLT_UGT)
+(TESTQconst [c] (MOVQconst [d])) && int64(c) == d && c > 0 => (FlagGT_UGT)
+(TESTLconst [c] (MOVLconst [c])) && c > 0 => (FlagGT_UGT)
+
+// TODO: DIVxU also.
+
+// Absorb flag constants into SBB ops.
+(SBBQcarrymask (FlagEQ)) => (MOVQconst [0])
+(SBBQcarrymask (FlagLT_ULT)) => (MOVQconst [-1])
+(SBBQcarrymask (FlagLT_UGT)) => (MOVQconst [0])
+(SBBQcarrymask (FlagGT_ULT)) => (MOVQconst [-1])
+(SBBQcarrymask (FlagGT_UGT)) => (MOVQconst [0])
+(SBBLcarrymask (FlagEQ)) => (MOVLconst [0])
+(SBBLcarrymask (FlagLT_ULT)) => (MOVLconst [-1])
+(SBBLcarrymask (FlagLT_UGT)) => (MOVLconst [0])
+(SBBLcarrymask (FlagGT_ULT)) => (MOVLconst [-1])
+(SBBLcarrymask (FlagGT_UGT)) => (MOVLconst [0])
+
+// Absorb flag constants into branches.
+((EQ|LE|GE|ULE|UGE) (FlagEQ) yes no) => (First yes no)
+((NE|LT|GT|ULT|UGT) (FlagEQ) yes no) => (First no yes)
+((NE|LT|LE|ULT|ULE) (FlagLT_ULT) yes no) => (First yes no)
+((EQ|GT|GE|UGT|UGE) (FlagLT_ULT) yes no) => (First no yes)
+((NE|LT|LE|UGT|UGE) (FlagLT_UGT) yes no) => (First yes no)
+((EQ|GT|GE|ULT|ULE) (FlagLT_UGT) yes no) => (First no yes)
+((NE|GT|GE|ULT|ULE) (FlagGT_ULT) yes no) => (First yes no)
+((EQ|LT|LE|UGT|UGE) (FlagGT_ULT) yes no) => (First no yes)
+((NE|GT|GE|UGT|UGE) (FlagGT_UGT) yes no) => (First yes no)
+((EQ|LT|LE|ULT|ULE) (FlagGT_UGT) yes no) => (First no yes)
+
+// Absorb flag constants into SETxx ops.
+((SETEQ|SETLE|SETGE|SETBE|SETAE) (FlagEQ)) => (MOVLconst [1])
+((SETNE|SETL|SETG|SETB|SETA) (FlagEQ)) => (MOVLconst [0])
+((SETNE|SETL|SETLE|SETB|SETBE) (FlagLT_ULT)) => (MOVLconst [1])
+((SETEQ|SETG|SETGE|SETA|SETAE) (FlagLT_ULT)) => (MOVLconst [0])
+((SETNE|SETL|SETLE|SETA|SETAE) (FlagLT_UGT)) => (MOVLconst [1])
+((SETEQ|SETG|SETGE|SETB|SETBE) (FlagLT_UGT)) => (MOVLconst [0])
+((SETNE|SETG|SETGE|SETB|SETBE) (FlagGT_ULT)) => (MOVLconst [1])
+((SETEQ|SETL|SETLE|SETA|SETAE) (FlagGT_ULT)) => (MOVLconst [0])
+((SETNE|SETG|SETGE|SETA|SETAE) (FlagGT_UGT)) => (MOVLconst [1])
+((SETEQ|SETL|SETLE|SETB|SETBE) (FlagGT_UGT)) => (MOVLconst [0])
+
+(SETEQstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETEQstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETEQstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETEQstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETEQstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+
+(SETNEstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETNEstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETNEstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETNEstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETNEstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+
+(SETLstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETLstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETLstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETLstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETLstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+
+(SETLEstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETLEstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETLEstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETLEstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETLEstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+
+(SETGstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETGstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETGstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETGstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETGstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+
+(SETGEstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETGEstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETGEstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETGEstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETGEstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+
+(SETBstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETBstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETBstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETBstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETBstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+
+(SETBEstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETBEstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETBEstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETBEstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETBEstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+
+(SETAstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETAstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETAstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETAstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETAstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+
+(SETAEstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETAEstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETAEstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETAEstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETAEstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+
+// Remove redundant *const ops
+(ADDQconst [0] x) => x
+(ADDLconst [c] x) && c==0 => x
+(SUBQconst [0] x) => x
+(SUBLconst [c] x) && c==0 => x
+(ANDQconst [0] _) => (MOVQconst [0])
+(ANDLconst [c] _) && c==0 => (MOVLconst [0])
+(ANDQconst [-1] x) => x
+(ANDLconst [c] x) && c==-1 => x
+(ORQconst [0] x) => x
+(ORLconst [c] x) && c==0 => x
+(ORQconst [-1] _) => (MOVQconst [-1])
+(ORLconst [c] _) && c==-1 => (MOVLconst [-1])
+(XORQconst [0] x) => x
+(XORLconst [c] x) && c==0 => x
+// TODO: since we got rid of the W/B versions, we might miss
+// things like (ANDLconst [0x100] x) which were formerly
+// (ANDBconst [0] x). Probably doesn't happen very often.
+// If we cared, we might do:
+// (ANDLconst <t> [c] x) && t.Size()==1 && int8(c)==0 => (MOVLconst [0])
+
+// Remove redundant ops
+// Not in generic rules, because they may appear after lowering, e.g. Slicemask
+(NEG(Q|L) (NEG(Q|L) x)) => x
+(NEG(Q|L) s:(SUB(Q|L) x y)) && s.Uses == 1 => (SUB(Q|L) y x)
+
+// Convert constant subtracts to constant adds
+(SUBQconst [c] x) && c != -(1<<31) => (ADDQconst [-c] x)
+(SUBLconst [c] x) => (ADDLconst [-c] x)
+
+// generic constant folding
+// TODO: more of this
+(ADDQconst [c] (MOVQconst [d])) => (MOVQconst [int64(c)+d])
+(ADDLconst [c] (MOVLconst [d])) => (MOVLconst [c+d])
+(ADDQconst [c] (ADDQconst [d] x)) && is32Bit(int64(c)+int64(d)) => (ADDQconst [c+d] x)
+(ADDLconst [c] (ADDLconst [d] x)) => (ADDLconst [c+d] x)
+(SUBQconst (MOVQconst [d]) [c]) => (MOVQconst [d-int64(c)])
+(SUBQconst (SUBQconst x [d]) [c]) && is32Bit(int64(-c)-int64(d)) => (ADDQconst [-c-d] x)
+(SARQconst [c] (MOVQconst [d])) => (MOVQconst [d>>uint64(c)])
+(SARLconst [c] (MOVQconst [d])) => (MOVQconst [int64(int32(d))>>uint64(c)])
+(SARWconst [c] (MOVQconst [d])) => (MOVQconst [int64(int16(d))>>uint64(c)])
+(SARBconst [c] (MOVQconst [d])) => (MOVQconst [int64(int8(d))>>uint64(c)])
+(NEGQ (MOVQconst [c])) => (MOVQconst [-c])
+(NEGL (MOVLconst [c])) => (MOVLconst [-c])
+(MULQconst [c] (MOVQconst [d])) => (MOVQconst [int64(c)*d])
+(MULLconst [c] (MOVLconst [d])) => (MOVLconst [c*d])
+(ANDQconst [c] (MOVQconst [d])) => (MOVQconst [int64(c)&d])
+(ANDLconst [c] (MOVLconst [d])) => (MOVLconst [c&d])
+(ORQconst [c] (MOVQconst [d])) => (MOVQconst [int64(c)|d])
+(ORLconst [c] (MOVLconst [d])) => (MOVLconst [c|d])
+(XORQconst [c] (MOVQconst [d])) => (MOVQconst [int64(c)^d])
+(XORLconst [c] (MOVLconst [d])) => (MOVLconst [c^d])
+(NOTQ (MOVQconst [c])) => (MOVQconst [^c])
+(NOTL (MOVLconst [c])) => (MOVLconst [^c])
+(BTSQconst [c] (MOVQconst [d])) => (MOVQconst [d|(1<<uint32(c))])
+(BTSLconst [c] (MOVLconst [d])) => (MOVLconst [d|(1<<uint32(c))])
+(BTRQconst [c] (MOVQconst [d])) => (MOVQconst [d&^(1<<uint32(c))])
+(BTRLconst [c] (MOVLconst [d])) => (MOVLconst [d&^(1<<uint32(c))])
+(BTCQconst [c] (MOVQconst [d])) => (MOVQconst [d^(1<<uint32(c))])
+(BTCLconst [c] (MOVLconst [d])) => (MOVLconst [d^(1<<uint32(c))])
+
+// If c or d doesn't fit into 32 bits, then we can't construct ORQconst,
+// but we can still constant-fold.
+// In theory this applies to any of the simplifications above,
+// but ORQ is the only one I've actually seen occur.
+(ORQ (MOVQconst [c]) (MOVQconst [d])) => (MOVQconst [c|d])
+
+// generic simplifications
+// TODO: more of this
+(ADDQ x (NEGQ y)) => (SUBQ x y)
+(ADDL x (NEGL y)) => (SUBL x y)
+(SUBQ x x) => (MOVQconst [0])
+(SUBL x x) => (MOVLconst [0])
+(ANDQ x x) => x
+(ANDL x x) => x
+(ORQ x x) => x
+(ORL x x) => x
+(XORQ x x) => (MOVQconst [0])
+(XORL x x) => (MOVLconst [0])
+
+(SHLLconst [d] (MOVLconst [c])) => (MOVLconst [c << uint64(d)])
+(SHLQconst [d] (MOVQconst [c])) => (MOVQconst [c << uint64(d)])
+(SHLQconst [d] (MOVLconst [c])) => (MOVQconst [int64(c) << uint64(d)])
+
+// Fold NEG into ADDconst/MULconst. Take care to keep c in 32 bit range.
+(NEGQ (ADDQconst [c] (NEGQ x))) && c != -(1<<31) => (ADDQconst [-c] x)
+(MULQconst [c] (NEGQ x)) && c != -(1<<31) => (MULQconst [-c] x)
+
+// checking AND against 0.
+(CMPQconst a:(ANDQ x y) [0]) && a.Uses == 1 => (TESTQ x y)
+(CMPLconst a:(ANDL x y) [0]) && a.Uses == 1 => (TESTL x y)
+(CMPWconst a:(ANDL x y) [0]) && a.Uses == 1 => (TESTW x y)
+(CMPBconst a:(ANDL x y) [0]) && a.Uses == 1 => (TESTB x y)
+(CMPQconst a:(ANDQconst [c] x) [0]) && a.Uses == 1 => (TESTQconst [c] x)
+(CMPLconst a:(ANDLconst [c] x) [0]) && a.Uses == 1 => (TESTLconst [c] x)
+(CMPWconst a:(ANDLconst [c] x) [0]) && a.Uses == 1 => (TESTWconst [int16(c)] x)
+(CMPBconst a:(ANDLconst [c] x) [0]) && a.Uses == 1 => (TESTBconst [int8(c)] x)
+
+// Convert TESTx to TESTxconst if possible.
+(TESTQ (MOVQconst [c]) x) && is32Bit(c) => (TESTQconst [int32(c)] x)
+(TESTL (MOVLconst [c]) x) => (TESTLconst [c] x)
+(TESTW (MOVLconst [c]) x) => (TESTWconst [int16(c)] x)
+(TESTB (MOVLconst [c]) x) => (TESTBconst [int8(c)] x)
+
+// TEST %reg,%reg is shorter than CMP
+(CMPQconst x [0]) => (TESTQ x x)
+(CMPLconst x [0]) => (TESTL x x)
+(CMPWconst x [0]) => (TESTW x x)
+(CMPBconst x [0]) => (TESTB x x)
+(TESTQconst [-1] x) && x.Op != OpAMD64MOVQconst => (TESTQ x x)
+(TESTLconst [-1] x) && x.Op != OpAMD64MOVLconst => (TESTL x x)
+(TESTWconst [-1] x) && x.Op != OpAMD64MOVLconst => (TESTW x x)
+(TESTBconst [-1] x) && x.Op != OpAMD64MOVLconst => (TESTB x x)
+
+// Convert LEAQ1 back to ADDQ if we can
+(LEAQ1 [0] x y) && v.Aux == nil => (ADDQ x y)
+
+// Combining byte loads into larger (unaligned) loads.
+// There are many ways these combinations could occur. This is
+// designed to match the way encoding/binary.LittleEndian does it.
+
+// Little-endian loads
+
+(OR(L|Q) x0:(MOVBload [i0] {s} p mem)
+ sh:(SHL(L|Q)const [8] x1:(MOVBload [i1] {s} p mem)))
+ && i1 == i0+1
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVWload [i0] {s} p mem)
+
+(OR(L|Q) x0:(MOVBload [i] {s} p0 mem)
+ sh:(SHL(L|Q)const [8] x1:(MOVBload [i] {s} p1 mem)))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVWload [i] {s} p0 mem)
+
+(OR(L|Q) x0:(MOVWload [i0] {s} p mem)
+ sh:(SHL(L|Q)const [16] x1:(MOVWload [i1] {s} p mem)))
+ && i1 == i0+2
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVLload [i0] {s} p mem)
+
+(OR(L|Q) x0:(MOVWload [i] {s} p0 mem)
+ sh:(SHL(L|Q)const [16] x1:(MOVWload [i] {s} p1 mem)))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && sequentialAddresses(p0, p1, 2)
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVLload [i] {s} p0 mem)
+
+(ORQ x0:(MOVLload [i0] {s} p mem)
+ sh:(SHLQconst [32] x1:(MOVLload [i1] {s} p mem)))
+ && i1 == i0+4
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVQload [i0] {s} p mem)
+
+(ORQ x0:(MOVLload [i] {s} p0 mem)
+ sh:(SHLQconst [32] x1:(MOVLload [i] {s} p1 mem)))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && sequentialAddresses(p0, p1, 4)
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVQload [i] {s} p0 mem)
+
+(OR(L|Q)
+ s1:(SHL(L|Q)const [j1] x1:(MOVBload [i1] {s} p mem))
+ or:(OR(L|Q)
+ s0:(SHL(L|Q)const [j0] x0:(MOVBload [i0] {s} p mem))
+ y))
+ && i1 == i0+1
+ && j1 == j0+8
+ && j0 % 16 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (OR(L|Q) <v.Type> (SHL(L|Q)const <v.Type> [j0] (MOVWload [i0] {s} p mem)) y)
+
+(OR(L|Q)
+ s1:(SHL(L|Q)const [j1] x1:(MOVBload [i] {s} p1 mem))
+ or:(OR(L|Q)
+ s0:(SHL(L|Q)const [j0] x0:(MOVBload [i] {s} p0 mem))
+ y))
+ && j1 == j0+8
+ && j0 % 16 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (OR(L|Q) <v.Type> (SHL(L|Q)const <v.Type> [j0] (MOVWload [i] {s} p0 mem)) y)
+
+(ORQ
+ s1:(SHLQconst [j1] x1:(MOVWload [i1] {s} p mem))
+ or:(ORQ
+ s0:(SHLQconst [j0] x0:(MOVWload [i0] {s} p mem))
+ y))
+ && i1 == i0+2
+ && j1 == j0+16
+ && j0 % 32 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVLload [i0] {s} p mem)) y)
+
+(ORQ
+ s1:(SHLQconst [j1] x1:(MOVWload [i] {s} p1 mem))
+ or:(ORQ
+ s0:(SHLQconst [j0] x0:(MOVWload [i] {s} p0 mem))
+ y))
+ && j1 == j0+16
+ && j0 % 32 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && sequentialAddresses(p0, p1, 2)
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVLload [i] {s} p0 mem)) y)
+
+// Big-endian loads
+
+(OR(L|Q)
+ x1:(MOVBload [i1] {s} p mem)
+ sh:(SHL(L|Q)const [8] x0:(MOVBload [i0] {s} p mem)))
+ && i1 == i0+1
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i0] {s} p mem))
+
+(OR(L|Q)
+ x1:(MOVBload [i] {s} p1 mem)
+ sh:(SHL(L|Q)const [8] x0:(MOVBload [i] {s} p0 mem)))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i] {s} p0 mem))
+
+(OR(L|Q)
+ r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p mem))
+ sh:(SHL(L|Q)const [16] r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p mem))))
+ && i1 == i0+2
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && r0.Uses == 1
+ && r1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, r0, r1, sh)
+ => @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i0] {s} p mem))
+
+(OR(L|Q)
+ r1:(ROLWconst [8] x1:(MOVWload [i] {s} p1 mem))
+ sh:(SHL(L|Q)const [16] r0:(ROLWconst [8] x0:(MOVWload [i] {s} p0 mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && r0.Uses == 1
+ && r1.Uses == 1
+ && sh.Uses == 1
+ && sequentialAddresses(p0, p1, 2)
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, r0, r1, sh)
+ => @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i] {s} p0 mem))
+
+(ORQ
+ r1:(BSWAPL x1:(MOVLload [i1] {s} p mem))
+ sh:(SHLQconst [32] r0:(BSWAPL x0:(MOVLload [i0] {s} p mem))))
+ && i1 == i0+4
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && r0.Uses == 1
+ && r1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, r0, r1, sh)
+ => @mergePoint(b,x0,x1) (BSWAPQ <v.Type> (MOVQload [i0] {s} p mem))
+
+(ORQ
+ r1:(BSWAPL x1:(MOVLload [i] {s} p1 mem))
+ sh:(SHLQconst [32] r0:(BSWAPL x0:(MOVLload [i] {s} p0 mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && r0.Uses == 1
+ && r1.Uses == 1
+ && sh.Uses == 1
+ && sequentialAddresses(p0, p1, 4)
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, r0, r1, sh)
+ => @mergePoint(b,x0,x1) (BSWAPQ <v.Type> (MOVQload [i] {s} p0 mem))
+
+(OR(L|Q)
+ s0:(SHL(L|Q)const [j0] x0:(MOVBload [i0] {s} p mem))
+ or:(OR(L|Q)
+ s1:(SHL(L|Q)const [j1] x1:(MOVBload [i1] {s} p mem))
+ y))
+ && i1 == i0+1
+ && j1 == j0-8
+ && j1 % 16 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (OR(L|Q) <v.Type> (SHL(L|Q)const <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i0] {s} p mem))) y)
+
+(OR(L|Q)
+ s0:(SHL(L|Q)const [j0] x0:(MOVBload [i] {s} p0 mem))
+ or:(OR(L|Q)
+ s1:(SHL(L|Q)const [j1] x1:(MOVBload [i] {s} p1 mem))
+ y))
+ && j1 == j0-8
+ && j1 % 16 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (OR(L|Q) <v.Type> (SHL(L|Q)const <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i] {s} p0 mem))) y)
+
+(ORQ
+ s0:(SHLQconst [j0] r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p mem)))
+ or:(ORQ
+ s1:(SHLQconst [j1] r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p mem)))
+ y))
+ && i1 == i0+2
+ && j1 == j0-16
+ && j1 % 32 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && r0.Uses == 1
+ && r1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, r0, r1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (BSWAPL <typ.UInt32> (MOVLload [i0] {s} p mem))) y)
+
+(ORQ
+ s0:(SHLQconst [j0] r0:(ROLWconst [8] x0:(MOVWload [i] {s} p0 mem)))
+ or:(ORQ
+ s1:(SHLQconst [j1] r1:(ROLWconst [8] x1:(MOVWload [i] {s} p1 mem)))
+ y))
+ && j1 == j0-16
+ && j1 % 32 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && r0.Uses == 1
+ && r1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && sequentialAddresses(p0, p1, 2)
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, r0, r1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (BSWAPL <typ.UInt32> (MOVLload [i] {s} p0 mem))) y)
+
+// Combine 2 byte stores + shift into rolw 8 + word store
+(MOVBstore [i] {s} p w
+ x0:(MOVBstore [i-1] {s} p (SHRWconst [8] w) mem))
+ && x0.Uses == 1
+ && clobber(x0)
+ => (MOVWstore [i-1] {s} p (ROLWconst <typ.UInt16> [8] w) mem)
+(MOVBstore [i] {s} p1 w
+ x0:(MOVBstore [i] {s} p0 (SHRWconst [8] w) mem))
+ && x0.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && clobber(x0)
+ => (MOVWstore [i] {s} p0 (ROLWconst <typ.UInt16> [8] w) mem)
+
+// Combine stores + shifts into bswap and larger (unaligned) stores
+(MOVBstore [i] {s} p w
+ x2:(MOVBstore [i-1] {s} p (SHRLconst [8] w)
+ x1:(MOVBstore [i-2] {s} p (SHRLconst [16] w)
+ x0:(MOVBstore [i-3] {s} p (SHRLconst [24] w) mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && clobber(x0, x1, x2)
+ => (MOVLstore [i-3] {s} p (BSWAPL <typ.UInt32> w) mem)
+(MOVBstore [i] {s} p3 w
+ x2:(MOVBstore [i] {s} p2 (SHRLconst [8] w)
+ x1:(MOVBstore [i] {s} p1 (SHRLconst [16] w)
+ x0:(MOVBstore [i] {s} p0 (SHRLconst [24] w) mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && sequentialAddresses(p1, p2, 1)
+ && sequentialAddresses(p2, p3, 1)
+ && clobber(x0, x1, x2)
+ => (MOVLstore [i] {s} p0 (BSWAPL <typ.UInt32> w) mem)
+
+(MOVBstore [i] {s} p w
+ x6:(MOVBstore [i-1] {s} p (SHRQconst [8] w)
+ x5:(MOVBstore [i-2] {s} p (SHRQconst [16] w)
+ x4:(MOVBstore [i-3] {s} p (SHRQconst [24] w)
+ x3:(MOVBstore [i-4] {s} p (SHRQconst [32] w)
+ x2:(MOVBstore [i-5] {s} p (SHRQconst [40] w)
+ x1:(MOVBstore [i-6] {s} p (SHRQconst [48] w)
+ x0:(MOVBstore [i-7] {s} p (SHRQconst [56] w) mem))))))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && x3.Uses == 1
+ && x4.Uses == 1
+ && x5.Uses == 1
+ && x6.Uses == 1
+ && clobber(x0, x1, x2, x3, x4, x5, x6)
+ => (MOVQstore [i-7] {s} p (BSWAPQ <typ.UInt64> w) mem)
+(MOVBstore [i] {s} p7 w
+ x6:(MOVBstore [i] {s} p6 (SHRQconst [8] w)
+ x5:(MOVBstore [i] {s} p5 (SHRQconst [16] w)
+ x4:(MOVBstore [i] {s} p4 (SHRQconst [24] w)
+ x3:(MOVBstore [i] {s} p3 (SHRQconst [32] w)
+ x2:(MOVBstore [i] {s} p2 (SHRQconst [40] w)
+ x1:(MOVBstore [i] {s} p1 (SHRQconst [48] w)
+ x0:(MOVBstore [i] {s} p0 (SHRQconst [56] w) mem))))))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && x3.Uses == 1
+ && x4.Uses == 1
+ && x5.Uses == 1
+ && x6.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && sequentialAddresses(p1, p2, 1)
+ && sequentialAddresses(p2, p3, 1)
+ && sequentialAddresses(p3, p4, 1)
+ && sequentialAddresses(p4, p5, 1)
+ && sequentialAddresses(p5, p6, 1)
+ && sequentialAddresses(p6, p7, 1)
+ && clobber(x0, x1, x2, x3, x4, x5, x6)
+ => (MOVQstore [i] {s} p0 (BSWAPQ <typ.UInt64> w) mem)
+
+// Combine constant stores into larger (unaligned) stores.
+(MOVBstoreconst [c] {s} p1 x:(MOVBstoreconst [a] {s} p0 mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, int64(a.Off()+1-c.Off()))
+ && clobber(x)
+ => (MOVWstoreconst [makeValAndOff(a.Val()&0xff | c.Val()<<8, a.Off())] {s} p0 mem)
+(MOVBstoreconst [a] {s} p0 x:(MOVBstoreconst [c] {s} p1 mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, int64(a.Off()+1-c.Off()))
+ && clobber(x)
+ => (MOVWstoreconst [makeValAndOff(a.Val()&0xff | c.Val()<<8, a.Off())] {s} p0 mem)
+(MOVWstoreconst [c] {s} p1 x:(MOVWstoreconst [a] {s} p0 mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, int64(a.Off()+2-c.Off()))
+ && clobber(x)
+ => (MOVLstoreconst [makeValAndOff(a.Val()&0xffff | c.Val()<<16, a.Off())] {s} p0 mem)
+(MOVWstoreconst [a] {s} p0 x:(MOVWstoreconst [c] {s} p1 mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, int64(a.Off()+2-c.Off()))
+ && clobber(x)
+ => (MOVLstoreconst [makeValAndOff(a.Val()&0xffff | c.Val()<<16, a.Off())] {s} p0 mem)
+(MOVLstoreconst [c] {s} p1 x:(MOVLstoreconst [a] {s} p0 mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, int64(a.Off()+4-c.Off()))
+ && clobber(x)
+ => (MOVQstore [a.Off()] {s} p0 (MOVQconst [a.Val64()&0xffffffff | c.Val64()<<32]) mem)
+(MOVLstoreconst [a] {s} p0 x:(MOVLstoreconst [c] {s} p1 mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, int64(a.Off()+4-c.Off()))
+ && clobber(x)
+ => (MOVQstore [a.Off()] {s} p0 (MOVQconst [a.Val64()&0xffffffff | c.Val64()<<32]) mem)
+(MOVQstoreconst [c] {s} p1 x:(MOVQstoreconst [a] {s} p0 mem))
+ && config.useSSE
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, int64(a.Off()+8-c.Off()))
+ && a.Val() == 0
+ && c.Val() == 0
+ && clobber(x)
+ => (MOVOstoreconst [makeValAndOff(0,a.Off())] {s} p0 mem)
+(MOVQstoreconst [a] {s} p0 x:(MOVQstoreconst [c] {s} p1 mem))
+ && config.useSSE
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, int64(a.Off()+8-c.Off()))
+ && a.Val() == 0
+ && c.Val() == 0
+ && clobber(x)
+ => (MOVOstoreconst [makeValAndOff(0,a.Off())] {s} p0 mem)
+
+// Combine stores into larger (unaligned) stores. Little endian.
+(MOVBstore [i] {s} p (SHR(W|L|Q)const [8] w) x:(MOVBstore [i-1] {s} p w mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstore [i-1] {s} p w mem)
+(MOVBstore [i] {s} p w x:(MOVBstore [i+1] {s} p (SHR(W|L|Q)const [8] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstore [i] {s} p w mem)
+(MOVBstore [i] {s} p (SHR(L|Q)const [j] w) x:(MOVBstore [i-1] {s} p w0:(SHR(L|Q)const [j-8] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstore [i-1] {s} p w0 mem)
+(MOVBstore [i] {s} p1 (SHR(W|L|Q)const [8] w) x:(MOVBstore [i] {s} p0 w mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && clobber(x)
+ => (MOVWstore [i] {s} p0 w mem)
+(MOVBstore [i] {s} p0 w x:(MOVBstore [i] {s} p1 (SHR(W|L|Q)const [8] w) mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && clobber(x)
+ => (MOVWstore [i] {s} p0 w mem)
+(MOVBstore [i] {s} p1 (SHR(L|Q)const [j] w) x:(MOVBstore [i] {s} p0 w0:(SHR(L|Q)const [j-8] w) mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && clobber(x)
+ => (MOVWstore [i] {s} p0 w0 mem)
+
+(MOVWstore [i] {s} p (SHR(L|Q)const [16] w) x:(MOVWstore [i-2] {s} p w mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVLstore [i-2] {s} p w mem)
+(MOVWstore [i] {s} p (SHR(L|Q)const [j] w) x:(MOVWstore [i-2] {s} p w0:(SHR(L|Q)const [j-16] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVLstore [i-2] {s} p w0 mem)
+(MOVWstore [i] {s} p1 (SHR(L|Q)const [16] w) x:(MOVWstore [i] {s} p0 w mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 2)
+ && clobber(x)
+ => (MOVLstore [i] {s} p0 w mem)
+(MOVWstore [i] {s} p1 (SHR(L|Q)const [j] w) x:(MOVWstore [i] {s} p0 w0:(SHR(L|Q)const [j-16] w) mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 2)
+ && clobber(x)
+ => (MOVLstore [i] {s} p0 w0 mem)
+
+(MOVLstore [i] {s} p (SHRQconst [32] w) x:(MOVLstore [i-4] {s} p w mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVQstore [i-4] {s} p w mem)
+(MOVLstore [i] {s} p (SHRQconst [j] w) x:(MOVLstore [i-4] {s} p w0:(SHRQconst [j-32] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVQstore [i-4] {s} p w0 mem)
+(MOVLstore [i] {s} p1 (SHRQconst [32] w) x:(MOVLstore [i] {s} p0 w mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 4)
+ && clobber(x)
+ => (MOVQstore [i] {s} p0 w mem)
+(MOVLstore [i] {s} p1 (SHRQconst [j] w) x:(MOVLstore [i] {s} p0 w0:(SHRQconst [j-32] w) mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 4)
+ && clobber(x)
+ => (MOVQstore [i] {s} p0 w0 mem)
+
+(MOVBstore [c3] {s} p3 (SHRQconst [56] w)
+ x1:(MOVWstore [c2] {s} p2 (SHRQconst [40] w)
+ x2:(MOVLstore [c1] {s} p1 (SHRQconst [8] w)
+ x3:(MOVBstore [c0] {s} p0 w mem))))
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && x3.Uses == 1
+ && sequentialAddresses(p0, p1, int64(1 + c0 - c1))
+ && sequentialAddresses(p0, p2, int64(5 + c0 - c2))
+ && sequentialAddresses(p0, p3, int64(7 + c0 - c3))
+ && clobber(x1, x2, x3)
+ => (MOVQstore [c0] {s} p0 w mem)
+
+(MOVBstore [i] {s} p
+ x1:(MOVBload [j] {s2} p2 mem)
+ mem2:(MOVBstore [i-1] {s} p
+ x2:(MOVBload [j-1] {s2} p2 mem) mem))
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && mem2.Uses == 1
+ && clobber(x1, x2, mem2)
+ => (MOVWstore [i-1] {s} p (MOVWload [j-1] {s2} p2 mem) mem)
+
+(MOVWstore [i] {s} p
+ x1:(MOVWload [j] {s2} p2 mem)
+ mem2:(MOVWstore [i-2] {s} p
+ x2:(MOVWload [j-2] {s2} p2 mem) mem))
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && mem2.Uses == 1
+ && clobber(x1, x2, mem2)
+ => (MOVLstore [i-2] {s} p (MOVLload [j-2] {s2} p2 mem) mem)
+
+(MOVLstore [i] {s} p
+ x1:(MOVLload [j] {s2} p2 mem)
+ mem2:(MOVLstore [i-4] {s} p
+ x2:(MOVLload [j-4] {s2} p2 mem) mem))
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && mem2.Uses == 1
+ && clobber(x1, x2, mem2)
+ => (MOVQstore [i-4] {s} p (MOVQload [j-4] {s2} p2 mem) mem)
+
+// Merge load and op
+// TODO: add indexed variants?
+((ADD|SUB|AND|OR|XOR)Q x l:(MOVQload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|AND|OR|XOR)Qload x [off] {sym} ptr mem)
+((ADD|SUB|AND|OR|XOR)L x l:(MOVLload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|AND|OR|XOR)Lload x [off] {sym} ptr mem)
+((ADD|SUB|MUL|DIV)SD x l:(MOVSDload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|MUL|DIV)SDload x [off] {sym} ptr mem)
+((ADD|SUB|MUL|DIV)SS x l:(MOVSSload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|MUL|DIV)SSload x [off] {sym} ptr mem)
+(MOVLstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Lload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) => ((ADD|AND|OR|XOR)Lmodify [off] {sym} ptr x mem)
+(MOVLstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)L l:(MOVLload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) =>
+ ((ADD|SUB|AND|OR|XOR)Lmodify [off] {sym} ptr x mem)
+(MOVQstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Qload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) => ((ADD|AND|OR|XOR)Qmodify [off] {sym} ptr x mem)
+(MOVQstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)Q l:(MOVQload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) =>
+ ((ADD|SUB|AND|OR|XOR)Qmodify [off] {sym} ptr x mem)
+
+// Merge ADDQconst and LEAQ into atomic loads.
+(MOV(Q|L|B)atomicload [off1] {sym} (ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOV(Q|L|B)atomicload [off1+off2] {sym} ptr mem)
+(MOV(Q|L|B)atomicload [off1] {sym1} (LEAQ [off2] {sym2} ptr) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOV(Q|L|B)atomicload [off1+off2] {mergeSym(sym1, sym2)} ptr mem)
+
+// Merge ADDQconst and LEAQ into atomic stores.
+(XCHGQ [off1] {sym} val (ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (XCHGQ [off1+off2] {sym} val ptr mem)
+(XCHGQ [off1] {sym1} val (LEAQ [off2] {sym2} ptr) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && ptr.Op != OpSB =>
+ (XCHGQ [off1+off2] {mergeSym(sym1,sym2)} val ptr mem)
+(XCHGL [off1] {sym} val (ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (XCHGL [off1+off2] {sym} val ptr mem)
+(XCHGL [off1] {sym1} val (LEAQ [off2] {sym2} ptr) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && ptr.Op != OpSB =>
+ (XCHGL [off1+off2] {mergeSym(sym1,sym2)} val ptr mem)
+
+// Merge ADDQconst into atomic adds.
+// TODO: merging LEAQ doesn't work, assembler doesn't like the resulting instructions.
+(XADDQlock [off1] {sym} val (ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (XADDQlock [off1+off2] {sym} val ptr mem)
+(XADDLlock [off1] {sym} val (ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (XADDLlock [off1+off2] {sym} val ptr mem)
+
+// Merge ADDQconst into atomic compare and swaps.
+// TODO: merging LEAQ doesn't work, assembler doesn't like the resulting instructions.
+(CMPXCHGQlock [off1] {sym} (ADDQconst [off2] ptr) old new_ mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (CMPXCHGQlock [off1+off2] {sym} ptr old new_ mem)
+(CMPXCHGLlock [off1] {sym} (ADDQconst [off2] ptr) old new_ mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (CMPXCHGLlock [off1+off2] {sym} ptr old new_ mem)
+
+// We don't need the conditional move if we know the arg of BSF/BSR is not zero.
+(CMOVQEQ x _ (Select1 (BS(F|R)Q (ORQconst [c] _)))) && c != 0 => x
+// Extension is unnecessary for trailing zeros.
+(BSFQ (ORQconst <t> [1<<8] (MOVBQZX x))) => (BSFQ (ORQconst <t> [1<<8] x))
+(BSFQ (ORQconst <t> [1<<16] (MOVWQZX x))) => (BSFQ (ORQconst <t> [1<<16] x))
+
+// Redundant sign/zero extensions
+// Note: see issue 21963. We have to make sure we use the right type on
+// the resulting extension (the outer type, not the inner type).
+(MOVLQSX (MOVLQSX x)) => (MOVLQSX x)
+(MOVLQSX (MOVWQSX x)) => (MOVWQSX x)
+(MOVLQSX (MOVBQSX x)) => (MOVBQSX x)
+(MOVWQSX (MOVWQSX x)) => (MOVWQSX x)
+(MOVWQSX (MOVBQSX x)) => (MOVBQSX x)
+(MOVBQSX (MOVBQSX x)) => (MOVBQSX x)
+(MOVLQZX (MOVLQZX x)) => (MOVLQZX x)
+(MOVLQZX (MOVWQZX x)) => (MOVWQZX x)
+(MOVLQZX (MOVBQZX x)) => (MOVBQZX x)
+(MOVWQZX (MOVWQZX x)) => (MOVWQZX x)
+(MOVWQZX (MOVBQZX x)) => (MOVBQZX x)
+(MOVBQZX (MOVBQZX x)) => (MOVBQZX x)
+
+(MOVQstore [off] {sym} ptr a:((ADD|AND|OR|XOR)Qconst [c] l:(MOVQload [off] {sym} ptr2 mem)) mem)
+ && isSamePtr(ptr, ptr2) && a.Uses == 1 && l.Uses == 1 && clobber(l, a) =>
+ ((ADD|AND|OR|XOR)Qconstmodify {sym} [makeValAndOff(int32(c),off)] ptr mem)
+(MOVLstore [off] {sym} ptr a:((ADD|AND|OR|XOR)Lconst [c] l:(MOVLload [off] {sym} ptr2 mem)) mem)
+ && isSamePtr(ptr, ptr2) && a.Uses == 1 && l.Uses == 1 && clobber(l, a) =>
+ ((ADD|AND|OR|XOR)Lconstmodify {sym} [makeValAndOff(int32(c),off)] ptr mem)
+
+// float <-> int register moves, with no conversion.
+// These come up when compiling math.{Float{32,64}bits,Float{32,64}frombits}.
+(MOVQload [off] {sym} ptr (MOVSDstore [off] {sym} ptr val _)) => (MOVQf2i val)
+(MOVLload [off] {sym} ptr (MOVSSstore [off] {sym} ptr val _)) => (MOVLf2i val)
+(MOVSDload [off] {sym} ptr (MOVQstore [off] {sym} ptr val _)) => (MOVQi2f val)
+(MOVSSload [off] {sym} ptr (MOVLstore [off] {sym} ptr val _)) => (MOVLi2f val)
+
+// Other load-like ops.
+(ADDQload x [off] {sym} ptr (MOVSDstore [off] {sym} ptr y _)) => (ADDQ x (MOVQf2i y))
+(ADDLload x [off] {sym} ptr (MOVSSstore [off] {sym} ptr y _)) => (ADDL x (MOVLf2i y))
+(SUBQload x [off] {sym} ptr (MOVSDstore [off] {sym} ptr y _)) => (SUBQ x (MOVQf2i y))
+(SUBLload x [off] {sym} ptr (MOVSSstore [off] {sym} ptr y _)) => (SUBL x (MOVLf2i y))
+(ANDQload x [off] {sym} ptr (MOVSDstore [off] {sym} ptr y _)) => (ANDQ x (MOVQf2i y))
+(ANDLload x [off] {sym} ptr (MOVSSstore [off] {sym} ptr y _)) => (ANDL x (MOVLf2i y))
+( ORQload x [off] {sym} ptr (MOVSDstore [off] {sym} ptr y _)) => ( ORQ x (MOVQf2i y))
+( ORLload x [off] {sym} ptr (MOVSSstore [off] {sym} ptr y _)) => ( ORL x (MOVLf2i y))
+(XORQload x [off] {sym} ptr (MOVSDstore [off] {sym} ptr y _)) => (XORQ x (MOVQf2i y))
+(XORLload x [off] {sym} ptr (MOVSSstore [off] {sym} ptr y _)) => (XORL x (MOVLf2i y))
+
+(ADDSDload x [off] {sym} ptr (MOVQstore [off] {sym} ptr y _)) => (ADDSD x (MOVQi2f y))
+(ADDSSload x [off] {sym} ptr (MOVLstore [off] {sym} ptr y _)) => (ADDSS x (MOVLi2f y))
+(SUBSDload x [off] {sym} ptr (MOVQstore [off] {sym} ptr y _)) => (SUBSD x (MOVQi2f y))
+(SUBSSload x [off] {sym} ptr (MOVLstore [off] {sym} ptr y _)) => (SUBSS x (MOVLi2f y))
+(MULSDload x [off] {sym} ptr (MOVQstore [off] {sym} ptr y _)) => (MULSD x (MOVQi2f y))
+(MULSSload x [off] {sym} ptr (MOVLstore [off] {sym} ptr y _)) => (MULSS x (MOVLi2f y))
+
+// Redirect stores to use the other register set.
+(MOVQstore [off] {sym} ptr (MOVQf2i val) mem) => (MOVSDstore [off] {sym} ptr val mem)
+(MOVLstore [off] {sym} ptr (MOVLf2i val) mem) => (MOVSSstore [off] {sym} ptr val mem)
+(MOVSDstore [off] {sym} ptr (MOVQi2f val) mem) => (MOVQstore [off] {sym} ptr val mem)
+(MOVSSstore [off] {sym} ptr (MOVLi2f val) mem) => (MOVLstore [off] {sym} ptr val mem)
+
+// Load args directly into the register class where it will be used.
+// We do this by just modifying the type of the Arg.
+(MOVQf2i <t> (Arg <u> [off] {sym})) && t.Size() == u.Size() => @b.Func.Entry (Arg <t> [off] {sym})
+(MOVLf2i <t> (Arg <u> [off] {sym})) && t.Size() == u.Size() => @b.Func.Entry (Arg <t> [off] {sym})
+(MOVQi2f <t> (Arg <u> [off] {sym})) && t.Size() == u.Size() => @b.Func.Entry (Arg <t> [off] {sym})
+(MOVLi2f <t> (Arg <u> [off] {sym})) && t.Size() == u.Size() => @b.Func.Entry (Arg <t> [off] {sym})
+
+// LEAQ is rematerializeable, so this helps to avoid register spill.
+// See issue 22947 for details
+(ADD(Q|L)const [off] x:(SP)) => (LEA(Q|L) [off] x)
+
+// HMULx is commutative, but its first argument must go in AX.
+// If possible, put a rematerializeable value in the first argument slot,
+// to reduce the odds that another value will have to be spilled
+// specifically to free up AX.
+(HMUL(Q|L) x y) && !x.rematerializeable() && y.rematerializeable() => (HMUL(Q|L) y x)
+(HMUL(Q|L)U x y) && !x.rematerializeable() && y.rematerializeable() => (HMUL(Q|L)U y x)
+
+// Fold loads into compares
+// Note: these may be undone by the flagalloc pass.
+(CMP(Q|L|W|B) l:(MOV(Q|L|W|B)load {sym} [off] ptr mem) x) && canMergeLoad(v, l) && clobber(l) => (CMP(Q|L|W|B)load {sym} [off] ptr x mem)
+(CMP(Q|L|W|B) x l:(MOV(Q|L|W|B)load {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (InvertFlags (CMP(Q|L|W|B)load {sym} [off] ptr x mem))
+
+(CMP(Q|L)const l:(MOV(Q|L)load {sym} [off] ptr mem) [c])
+ && l.Uses == 1
+ && clobber(l) =>
+@l.Block (CMP(Q|L)constload {sym} [makeValAndOff(c,off)] ptr mem)
+(CMP(W|B)const l:(MOV(W|B)load {sym} [off] ptr mem) [c])
+ && l.Uses == 1
+ && clobber(l) =>
+@l.Block (CMP(W|B)constload {sym} [makeValAndOff(int32(c),off)] ptr mem)
+
+(CMPQload {sym} [off] ptr (MOVQconst [c]) mem) && validVal(c) => (CMPQconstload {sym} [makeValAndOff(int32(c),off)] ptr mem)
+(CMPLload {sym} [off] ptr (MOVLconst [c]) mem) => (CMPLconstload {sym} [makeValAndOff(c,off)] ptr mem)
+(CMPWload {sym} [off] ptr (MOVLconst [c]) mem) => (CMPWconstload {sym} [makeValAndOff(int32(int16(c)),off)] ptr mem)
+(CMPBload {sym} [off] ptr (MOVLconst [c]) mem) => (CMPBconstload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
+
+(TEST(Q|L|W|B) l:(MOV(Q|L|W|B)load {sym} [off] ptr mem) l2)
+ && l == l2
+ && l.Uses == 2
+ && clobber(l) =>
+ @l.Block (CMP(Q|L|W|B)constload {sym} [makeValAndOff(0, off)] ptr mem)
+
+// Convert ANDload to MOVload when we can do the AND in a containing TEST op.
+// Only do when it's within the same block, so we don't have flags live across basic block boundaries.
+// See issue 44228.
+(TEST(Q|L) a:(AND(Q|L)load [off] {sym} x ptr mem) a) && a.Uses == 2 && a.Block == v.Block && clobber(a) => (TEST(Q|L) (MOV(Q|L)load <a.Type> [off] {sym} ptr mem) x)
+
+(MOVBload [off] {sym} (SB) _) && symIsRO(sym) => (MOVLconst [int32(read8(sym, int64(off)))])
+(MOVWload [off] {sym} (SB) _) && symIsRO(sym) => (MOVLconst [int32(read16(sym, int64(off), config.ctxt.Arch.ByteOrder))])
+(MOVLload [off] {sym} (SB) _) && symIsRO(sym) => (MOVQconst [int64(read32(sym, int64(off), config.ctxt.Arch.ByteOrder))])
+(MOVQload [off] {sym} (SB) _) && symIsRO(sym) => (MOVQconst [int64(read64(sym, int64(off), config.ctxt.Arch.ByteOrder))])
+(MOVOstore [dstOff] {dstSym} ptr (MOVOload [srcOff] {srcSym} (SB) _) mem) && symIsRO(srcSym) =>
+ (MOVQstore [dstOff+8] {dstSym} ptr (MOVQconst [int64(read64(srcSym, int64(srcOff)+8, config.ctxt.Arch.ByteOrder))])
+ (MOVQstore [dstOff] {dstSym} ptr (MOVQconst [int64(read64(srcSym, int64(srcOff), config.ctxt.Arch.ByteOrder))]) mem))
+
+// Arch-specific inlining for small or disjoint runtime.memmove
+// Match post-lowering calls, memory version.
+(SelectN [0] call:(CALLstatic {sym} s1:(MOVQstoreconst _ [sc] s2:(MOVQstore _ src s3:(MOVQstore _ dst mem)))))
+ && sc.Val64() >= 0
+ && isSameCall(sym, "runtime.memmove")
+ && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1
+ && isInlinableMemmove(dst, src, sc.Val64(), config)
+ && clobber(s1, s2, s3, call)
+ => (Move [sc.Val64()] dst src mem)
+
+// Match post-lowering calls, register version.
+(SelectN [0] call:(CALLstatic {sym} dst src (MOVQconst [sz]) mem))
+ && sz >= 0
+ && isSameCall(sym, "runtime.memmove")
+ && call.Uses == 1
+ && isInlinableMemmove(dst, src, sz, config)
+ && clobber(call)
+ => (Move [sz] dst src mem)
+
+// Prefetch instructions
+(PrefetchCache ...) => (PrefetchT0 ...)
+(PrefetchCacheStreamed ...) => (PrefetchNTA ...)
+
+// CPUID feature: BMI1.
+(AND(Q|L) x (NOT(Q|L) y)) && buildcfg.GOAMD64 >= 3 => (ANDN(Q|L) x y)
+(AND(Q|L) x (NEG(Q|L) x)) && buildcfg.GOAMD64 >= 3 => (BLSI(Q|L) x)
+(XOR(Q|L) x (ADD(Q|L)const [-1] x)) && buildcfg.GOAMD64 >= 3 => (BLSMSK(Q|L) x)
+(AND(Q|L) x (ADD(Q|L)const [-1] x)) && buildcfg.GOAMD64 >= 3 => (BLSR(Q|L) x)
+
+(BSWAP(Q|L) (BSWAP(Q|L) p)) => p
+
+// CPUID feature: MOVBE.
+(MOV(Q|L)store [i] {s} p x:(BSWAP(Q|L) w) mem) && x.Uses == 1 && buildcfg.GOAMD64 >= 3 => (MOVBE(Q|L)store [i] {s} p w mem)
+(MOVBE(Q|L)store [i] {s} p x:(BSWAP(Q|L) w) mem) && x.Uses == 1 => (MOV(Q|L)store [i] {s} p w mem)
+(BSWAP(Q|L) x:(MOV(Q|L)load [i] {s} p mem)) && x.Uses == 1 && buildcfg.GOAMD64 >= 3 => @x.Block (MOVBE(Q|L)load [i] {s} p mem)
+(BSWAP(Q|L) x:(MOVBE(Q|L)load [i] {s} p mem)) && x.Uses == 1 => @x.Block (MOV(Q|L)load [i] {s} p mem)
+(MOVWstore [i] {s} p x:(ROLWconst [8] w) mem) && x.Uses == 1 && buildcfg.GOAMD64 >= 3 => (MOVBEWstore [i] {s} p w mem)
+(MOVBEWstore [i] {s} p x:(ROLWconst [8] w) mem) && x.Uses == 1 => (MOVWstore [i] {s} p w mem)
+
+(ORQ x0:(MOVBELload [i0] {s} p mem)
+ sh:(SHLQconst [32] x1:(MOVBELload [i1] {s} p mem)))
+ && i0 == i1+4
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVBEQload [i1] {s} p mem)
+
+(ORQ x0:(MOVBELload [i] {s} p0 mem)
+ sh:(SHLQconst [32] x1:(MOVBELload [i] {s} p1 mem)))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && sequentialAddresses(p1, p0, 4)
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVBEQload [i] {s} p1 mem)
+
+(SAR(Q|L) l:(MOV(Q|L)load [off] {sym} ptr mem) x) && buildcfg.GOAMD64 >= 3 && canMergeLoad(v, l) && clobber(l) => (SARX(Q|L)load [off] {sym} ptr x mem)
+(SHL(Q|L) l:(MOV(Q|L)load [off] {sym} ptr mem) x) && buildcfg.GOAMD64 >= 3 && canMergeLoad(v, l) && clobber(l) => (SHLX(Q|L)load [off] {sym} ptr x mem)
+(SHR(Q|L) l:(MOV(Q|L)load [off] {sym} ptr mem) x) && buildcfg.GOAMD64 >= 3 && canMergeLoad(v, l) && clobber(l) => (SHRX(Q|L)load [off] {sym} ptr x mem)
+
+((SHL|SHR|SAR)XQload [off] {sym} ptr (MOVQconst [c]) mem) => ((SHL|SHR|SAR)Qconst [int8(c&63)] (MOVQload [off] {sym} ptr mem))
+((SHL|SHR|SAR)XQload [off] {sym} ptr (MOVLconst [c]) mem) => ((SHL|SHR|SAR)Qconst [int8(c&63)] (MOVQload [off] {sym} ptr mem))
+((SHL|SHR|SAR)XLload [off] {sym} ptr (MOVLconst [c]) mem) => ((SHL|SHR|SAR)Lconst [int8(c&31)] (MOVLload [off] {sym} ptr mem))
diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
new file mode 100644
index 0000000..cbe1f5b
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
@@ -0,0 +1,1133 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import "strings"
+
+// Notes:
+// - Integer types live in the low portion of registers. Upper portions are junk.
+// - Boolean types use the low-order byte of a register. 0=false, 1=true.
+// Upper bytes are junk.
+// - Floating-point types live in the low natural slot of an sse2 register.
+// Unused portions are junk.
+// - We do not use AH,BH,CH,DH registers.
+// - When doing sub-register operations, we try to write the whole
+// destination register to avoid a partial-register write.
+// - Unused portions of AuxInt (or the Val portion of ValAndOff) are
+// filled by sign-extending the used portion. Users of AuxInt which interpret
+// AuxInt as unsigned (e.g. shifts) must be careful.
+// - All SymOff opcodes require their offset to fit in an int32.
+
+// Suffixes encode the bit width of various instructions.
+// Q (quad word) = 64 bit
+// L (long word) = 32 bit
+// W (word) = 16 bit
+// B (byte) = 8 bit
+// D (double) = 64 bit float
+// S (single) = 32 bit float
+
+// regNamesAMD64 lists register names; a name's index is its bit position in a regMask. Copied from ../../amd64/reg.go
+var regNamesAMD64 = []string{
+ "AX",
+ "CX",
+ "DX",
+ "BX",
+ "SP",
+ "BP",
+ "SI",
+ "DI",
+ "R8",
+ "R9",
+ "R10",
+ "R11",
+ "R12",
+ "R13",
+ "g", // a.k.a. R14
+ "R15",
+ "X0",
+ "X1",
+ "X2",
+ "X3",
+ "X4",
+ "X5",
+ "X6",
+ "X7",
+ "X8",
+ "X9",
+ "X10",
+ "X11",
+ "X12",
+ "X13",
+ "X14",
+ "X15", // constant 0 in ABIInternal
+
+ // If you add registers, update asyncPreempt in runtime
+
+ // pseudo-registers
+ "SB",
+}
+
+func init() {
+ // Make map from reg names to reg integers.
+ if len(regNamesAMD64) > 64 {
+ panic("too many registers")
+ }
+ num := map[string]int{}
+ for i, name := range regNamesAMD64 {
+ num[name] = i
+ }
+ buildReg := func(s string) regMask {
+ m := regMask(0)
+ for _, r := range strings.Split(s, " ") {
+ if n, ok := num[r]; ok {
+ m |= regMask(1) << uint(n)
+ continue
+ }
+ panic("register " + r + " not found")
+ }
+ return m
+ }
+
+ // Common individual register masks
+ var (
+ ax = buildReg("AX")
+ cx = buildReg("CX")
+ dx = buildReg("DX")
+ bx = buildReg("BX")
+ gp = buildReg("AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15")
+ g = buildReg("g")
+ fp = buildReg("X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14")
+ x15 = buildReg("X15")
+ gpsp = gp | buildReg("SP")
+ gpspsb = gpsp | buildReg("SB")
+ gpspsbg = gpspsb | g
+ callerSave = gp | fp | g // runtime.setg (and anything calling it) may clobber g
+ )
+ // Common slices of register masks
+ var (
+ gponly = []regMask{gp}
+ fponly = []regMask{fp}
+ )
+
+ // Common regInfo
+ var (
+ gp01 = regInfo{inputs: nil, outputs: gponly}
+ gp11 = regInfo{inputs: []regMask{gp}, outputs: gponly}
+ gp11sp = regInfo{inputs: []regMask{gpsp}, outputs: gponly}
+ gp11sb = regInfo{inputs: []regMask{gpspsbg}, outputs: gponly}
+ gp21 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
+ gp21sp = regInfo{inputs: []regMask{gpsp, gp}, outputs: gponly}
+ gp21sb = regInfo{inputs: []regMask{gpspsbg, gpsp}, outputs: gponly}
+ gp21shift = regInfo{inputs: []regMask{gp, cx}, outputs: []regMask{gp}}
+ gp31shift = regInfo{inputs: []regMask{gp, gp, cx}, outputs: []regMask{gp}}
+ gp11div = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{ax, dx}}
+ gp21hmul = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx}, clobbers: ax}
+ gp21flags = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp, 0}}
+ gp2flags1flags = regInfo{inputs: []regMask{gp, gp, 0}, outputs: []regMask{gp, 0}}
+
+ gp2flags = regInfo{inputs: []regMask{gpsp, gpsp}}
+ gp1flags = regInfo{inputs: []regMask{gpsp}}
+ gp0flagsLoad = regInfo{inputs: []regMask{gpspsbg, 0}}
+ gp1flagsLoad = regInfo{inputs: []regMask{gpspsbg, gpsp, 0}}
+ gp2flagsLoad = regInfo{inputs: []regMask{gpspsbg, gpsp, gpsp, 0}}
+ flagsgp = regInfo{inputs: nil, outputs: gponly}
+
+ gp11flags = regInfo{inputs: []regMask{gp}, outputs: []regMask{gp, 0}}
+ gp1flags1flags = regInfo{inputs: []regMask{gp, 0}, outputs: []regMask{gp, 0}}
+
+ readflags = regInfo{inputs: nil, outputs: gponly}
+
+ gpload = regInfo{inputs: []regMask{gpspsbg, 0}, outputs: gponly}
+ gp21load = regInfo{inputs: []regMask{gp, gpspsbg, 0}, outputs: gponly}
+ gploadidx = regInfo{inputs: []regMask{gpspsbg, gpsp, 0}, outputs: gponly}
+ gp21loadidx = regInfo{inputs: []regMask{gp, gpspsbg, gpsp, 0}, outputs: gponly}
+ gp21shxload = regInfo{inputs: []regMask{gpspsbg, gp, 0}, outputs: gponly}
+ gp21shxloadidx = regInfo{inputs: []regMask{gpspsbg, gpsp, gp, 0}, outputs: gponly}
+
+ gpstore = regInfo{inputs: []regMask{gpspsbg, gpsp, 0}}
+ gpstoreconst = regInfo{inputs: []regMask{gpspsbg, 0}}
+ gpstoreidx = regInfo{inputs: []regMask{gpspsbg, gpsp, gpsp, 0}}
+ gpstoreconstidx = regInfo{inputs: []regMask{gpspsbg, gpsp, 0}}
+ gpstorexchg = regInfo{inputs: []regMask{gp, gpspsbg, 0}, outputs: []regMask{gp}}
+ cmpxchg = regInfo{inputs: []regMask{gp, ax, gp, 0}, outputs: []regMask{gp, 0}, clobbers: ax}
+
+ fp01 = regInfo{inputs: nil, outputs: fponly}
+ fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: fponly}
+ fp31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: fponly}
+ fp21load = regInfo{inputs: []regMask{fp, gpspsbg, 0}, outputs: fponly}
+ fp21loadidx = regInfo{inputs: []regMask{fp, gpspsbg, gpspsb, 0}, outputs: fponly}
+ fpgp = regInfo{inputs: fponly, outputs: gponly}
+ gpfp = regInfo{inputs: gponly, outputs: fponly}
+ fp11 = regInfo{inputs: fponly, outputs: fponly}
+ fp2flags = regInfo{inputs: []regMask{fp, fp}}
+
+ fpload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: fponly}
+ fploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: fponly}
+
+ fpstore = regInfo{inputs: []regMask{gpspsb, fp, 0}}
+ fpstoreidx = regInfo{inputs: []regMask{gpspsb, gpsp, fp, 0}}
+
+ prefreg = regInfo{inputs: []regMask{gpspsbg}}
+ )
+
+ var AMD64ops = []opData{
+ // {ADD,SUB,MUL,DIV}Sx: floating-point arithmetic
+ // x==S for float32, x==D for float64
+ // computes arg0 OP arg1
+ {name: "ADDSS", argLength: 2, reg: fp21, asm: "ADDSS", commutative: true, resultInArg0: true},
+ {name: "ADDSD", argLength: 2, reg: fp21, asm: "ADDSD", commutative: true, resultInArg0: true},
+ {name: "SUBSS", argLength: 2, reg: fp21, asm: "SUBSS", resultInArg0: true},
+ {name: "SUBSD", argLength: 2, reg: fp21, asm: "SUBSD", resultInArg0: true},
+ {name: "MULSS", argLength: 2, reg: fp21, asm: "MULSS", commutative: true, resultInArg0: true},
+ {name: "MULSD", argLength: 2, reg: fp21, asm: "MULSD", commutative: true, resultInArg0: true},
+ {name: "DIVSS", argLength: 2, reg: fp21, asm: "DIVSS", resultInArg0: true},
+ {name: "DIVSD", argLength: 2, reg: fp21, asm: "DIVSD", resultInArg0: true},
+
+ // MOVSxload: floating-point loads
+ // x==S for float32, x==D for float64
+ // load from arg0+auxint+aux, arg1 = mem
+ {name: "MOVSSload", argLength: 2, reg: fpload, asm: "MOVSS", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+ {name: "MOVSDload", argLength: 2, reg: fpload, asm: "MOVSD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+
+ // MOVSxconst: floating-point constants
+ // x==S for float32, x==D for float64
+ {name: "MOVSSconst", reg: fp01, asm: "MOVSS", aux: "Float32", rematerializeable: true},
+ {name: "MOVSDconst", reg: fp01, asm: "MOVSD", aux: "Float64", rematerializeable: true},
+
+ // MOVSxloadidx: floating-point indexed loads
+ // x==S for float32, x==D for float64
+ // load from arg0 + scale*arg1+auxint+aux, arg2 = mem
+ {name: "MOVSSloadidx1", argLength: 3, reg: fploadidx, asm: "MOVSS", scale: 1, aux: "SymOff", symEffect: "Read"},
+ {name: "MOVSSloadidx4", argLength: 3, reg: fploadidx, asm: "MOVSS", scale: 4, aux: "SymOff", symEffect: "Read"},
+ {name: "MOVSDloadidx1", argLength: 3, reg: fploadidx, asm: "MOVSD", scale: 1, aux: "SymOff", symEffect: "Read"},
+ {name: "MOVSDloadidx8", argLength: 3, reg: fploadidx, asm: "MOVSD", scale: 8, aux: "SymOff", symEffect: "Read"},
+
+ // MOVSxstore: floating-point stores
+ // x==S for float32, x==D for float64
+ // does *(arg0+auxint+aux) = arg1, arg2 = mem
+ {name: "MOVSSstore", argLength: 3, reg: fpstore, asm: "MOVSS", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"},
+ {name: "MOVSDstore", argLength: 3, reg: fpstore, asm: "MOVSD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"},
+
+ // MOVSxstoreidx: floating-point indexed stores
+ // x==S for float32, x==D for float64
+ // does *(arg0+scale*arg1+auxint+aux) = arg2, arg3 = mem
+ {name: "MOVSSstoreidx1", argLength: 4, reg: fpstoreidx, asm: "MOVSS", scale: 1, aux: "SymOff", symEffect: "Write"},
+ {name: "MOVSSstoreidx4", argLength: 4, reg: fpstoreidx, asm: "MOVSS", scale: 4, aux: "SymOff", symEffect: "Write"},
+ {name: "MOVSDstoreidx1", argLength: 4, reg: fpstoreidx, asm: "MOVSD", scale: 1, aux: "SymOff", symEffect: "Write"},
+ {name: "MOVSDstoreidx8", argLength: 4, reg: fpstoreidx, asm: "MOVSD", scale: 8, aux: "SymOff", symEffect: "Write"},
+
+ // {ADD,SUB,MUL,DIV}Sxload: floating-point load / op combo
+ // x==S for float32, x==D for float64
+ // computes arg0 OP *(arg1+auxint+aux), arg2=mem
+ {name: "ADDSSload", argLength: 3, reg: fp21load, asm: "ADDSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"},
+ {name: "ADDSDload", argLength: 3, reg: fp21load, asm: "ADDSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"},
+ {name: "SUBSSload", argLength: 3, reg: fp21load, asm: "SUBSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"},
+ {name: "SUBSDload", argLength: 3, reg: fp21load, asm: "SUBSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"},
+ {name: "MULSSload", argLength: 3, reg: fp21load, asm: "MULSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"},
+ {name: "MULSDload", argLength: 3, reg: fp21load, asm: "MULSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"},
+ {name: "DIVSSload", argLength: 3, reg: fp21load, asm: "DIVSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"},
+ {name: "DIVSDload", argLength: 3, reg: fp21load, asm: "DIVSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"},
+
+ // {ADD,SUB,MUL,DIV}Sxloadidx: floating-point indexed load / op combo
+ // x==S for float32, x==D for float64
+ // computes arg0 OP *(arg1+scale*arg2+auxint+aux), arg3=mem
+ {name: "ADDSSloadidx1", argLength: 4, reg: fp21loadidx, asm: "ADDSS", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"},
+ {name: "ADDSSloadidx4", argLength: 4, reg: fp21loadidx, asm: "ADDSS", scale: 4, aux: "SymOff", resultInArg0: true, symEffect: "Read"},
+ {name: "ADDSDloadidx1", argLength: 4, reg: fp21loadidx, asm: "ADDSD", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"},
+ {name: "ADDSDloadidx8", argLength: 4, reg: fp21loadidx, asm: "ADDSD", scale: 8, aux: "SymOff", resultInArg0: true, symEffect: "Read"},
+ {name: "SUBSSloadidx1", argLength: 4, reg: fp21loadidx, asm: "SUBSS", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"},
+ {name: "SUBSSloadidx4", argLength: 4, reg: fp21loadidx, asm: "SUBSS", scale: 4, aux: "SymOff", resultInArg0: true, symEffect: "Read"},
+ {name: "SUBSDloadidx1", argLength: 4, reg: fp21loadidx, asm: "SUBSD", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"},
+ {name: "SUBSDloadidx8", argLength: 4, reg: fp21loadidx, asm: "SUBSD", scale: 8, aux: "SymOff", resultInArg0: true, symEffect: "Read"},
+ {name: "MULSSloadidx1", argLength: 4, reg: fp21loadidx, asm: "MULSS", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"},
+ {name: "MULSSloadidx4", argLength: 4, reg: fp21loadidx, asm: "MULSS", scale: 4, aux: "SymOff", resultInArg0: true, symEffect: "Read"},
+ {name: "MULSDloadidx1", argLength: 4, reg: fp21loadidx, asm: "MULSD", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"},
+ {name: "MULSDloadidx8", argLength: 4, reg: fp21loadidx, asm: "MULSD", scale: 8, aux: "SymOff", resultInArg0: true, symEffect: "Read"},
+ {name: "DIVSSloadidx1", argLength: 4, reg: fp21loadidx, asm: "DIVSS", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"},
+ {name: "DIVSSloadidx4", argLength: 4, reg: fp21loadidx, asm: "DIVSS", scale: 4, aux: "SymOff", resultInArg0: true, symEffect: "Read"},
+ {name: "DIVSDloadidx1", argLength: 4, reg: fp21loadidx, asm: "DIVSD", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"},
+ {name: "DIVSDloadidx8", argLength: 4, reg: fp21loadidx, asm: "DIVSD", scale: 8, aux: "SymOff", resultInArg0: true, symEffect: "Read"},
+
+ // {ADD,SUB,MUL,DIV,AND,OR,XOR}x: binary integer ops
+ // unadorned versions compute arg0 OP arg1
+ // const versions compute arg0 OP auxint (auxint is a sign-extended 32-bit value)
+ // constmodify versions compute *(arg0+ValAndOff(AuxInt).Off().aux) OP= ValAndOff(AuxInt).Val(), arg1 = mem
+ // x==L operations zero the upper 4 bytes of the destination register (not meaningful for constmodify versions).
+ {name: "ADDQ", argLength: 2, reg: gp21sp, asm: "ADDQ", commutative: true, clobberFlags: true},
+ {name: "ADDL", argLength: 2, reg: gp21sp, asm: "ADDL", commutative: true, clobberFlags: true},
+ {name: "ADDQconst", argLength: 1, reg: gp11sp, asm: "ADDQ", aux: "Int32", typ: "UInt64", clobberFlags: true},
+ {name: "ADDLconst", argLength: 1, reg: gp11sp, asm: "ADDL", aux: "Int32", clobberFlags: true},
+ {name: "ADDQconstmodify", argLength: 2, reg: gpstoreconst, asm: "ADDQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"},
+ {name: "ADDLconstmodify", argLength: 2, reg: gpstoreconst, asm: "ADDL", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"},
+
+ {name: "SUBQ", argLength: 2, reg: gp21, asm: "SUBQ", resultInArg0: true, clobberFlags: true},
+ {name: "SUBL", argLength: 2, reg: gp21, asm: "SUBL", resultInArg0: true, clobberFlags: true},
+ {name: "SUBQconst", argLength: 1, reg: gp11, asm: "SUBQ", aux: "Int32", resultInArg0: true, clobberFlags: true},
+ {name: "SUBLconst", argLength: 1, reg: gp11, asm: "SUBL", aux: "Int32", resultInArg0: true, clobberFlags: true},
+
+ {name: "MULQ", argLength: 2, reg: gp21, asm: "IMULQ", commutative: true, resultInArg0: true, clobberFlags: true},
+ {name: "MULL", argLength: 2, reg: gp21, asm: "IMULL", commutative: true, resultInArg0: true, clobberFlags: true},
+ {name: "MULQconst", argLength: 1, reg: gp11, asm: "IMUL3Q", aux: "Int32", clobberFlags: true},
+ {name: "MULLconst", argLength: 1, reg: gp11, asm: "IMUL3L", aux: "Int32", clobberFlags: true},
+
+ // Let x = arg0*arg1 (full 32x32->64 unsigned multiply). Returns uint32(x), and flags set to overflow if uint32(x) != x.
+ {name: "MULLU", argLength: 2, reg: regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{ax, 0}, clobbers: dx}, typ: "(UInt32,Flags)", asm: "MULL", commutative: true, clobberFlags: true},
+ // Let x = arg0*arg1 (full 64x64->128 unsigned multiply). Returns uint64(x), and flags set to overflow if uint64(x) != x.
+ {name: "MULQU", argLength: 2, reg: regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{ax, 0}, clobbers: dx}, typ: "(UInt64,Flags)", asm: "MULQ", commutative: true, clobberFlags: true},
+
+ // HMULx[U]: computes the high bits of an integer multiply.
+ // computes arg0 * arg1 >> (x==L?32:64)
+ // The multiply is unsigned for the U versions, signed for the non-U versions.
+ // HMULx[U] are intentionally not marked as commutative, even though they are.
+ // This is because they have asymmetric register requirements.
+ // There are rewrite rules to try to place arguments in preferable slots.
+ {name: "HMULQ", argLength: 2, reg: gp21hmul, asm: "IMULQ", clobberFlags: true},
+ {name: "HMULL", argLength: 2, reg: gp21hmul, asm: "IMULL", clobberFlags: true},
+ {name: "HMULQU", argLength: 2, reg: gp21hmul, asm: "MULQ", clobberFlags: true},
+ {name: "HMULLU", argLength: 2, reg: gp21hmul, asm: "MULL", clobberFlags: true},
+
+ // (arg0 + arg1) / 2 as unsigned, all 64 result bits
+ {name: "AVGQU", argLength: 2, reg: gp21, commutative: true, resultInArg0: true, clobberFlags: true},
+
+ // DIVx[U] computes [arg0 / arg1, arg0 % arg1]
+ // For signed versions, AuxInt non-zero means that the divisor has been proved to be not -1.
+ {name: "DIVQ", argLength: 2, reg: gp11div, typ: "(Int64,Int64)", asm: "IDIVQ", aux: "Bool", clobberFlags: true},
+ {name: "DIVL", argLength: 2, reg: gp11div, typ: "(Int32,Int32)", asm: "IDIVL", aux: "Bool", clobberFlags: true},
+ {name: "DIVW", argLength: 2, reg: gp11div, typ: "(Int16,Int16)", asm: "IDIVW", aux: "Bool", clobberFlags: true},
+ {name: "DIVQU", argLength: 2, reg: gp11div, typ: "(UInt64,UInt64)", asm: "DIVQ", clobberFlags: true},
+ {name: "DIVLU", argLength: 2, reg: gp11div, typ: "(UInt32,UInt32)", asm: "DIVL", clobberFlags: true},
+ {name: "DIVWU", argLength: 2, reg: gp11div, typ: "(UInt16,UInt16)", asm: "DIVW", clobberFlags: true},
+
+ // computes -arg0, flags set for 0-arg0.
+ {name: "NEGLflags", argLength: 1, reg: gp11flags, typ: "(UInt32,Flags)", asm: "NEGL", resultInArg0: true},
+
+ // The following 4 add opcodes return the low 64 bits of the sum in the first result and
+ // the carry (the 65th bit) in the carry flag.
+ {name: "ADDQcarry", argLength: 2, reg: gp21flags, typ: "(UInt64,Flags)", asm: "ADDQ", commutative: true, resultInArg0: true}, // r = arg0+arg1
+ {name: "ADCQ", argLength: 3, reg: gp2flags1flags, typ: "(UInt64,Flags)", asm: "ADCQ", commutative: true, resultInArg0: true}, // r = arg0+arg1+carry(arg2)
+ {name: "ADDQconstcarry", argLength: 1, reg: gp11flags, typ: "(UInt64,Flags)", asm: "ADDQ", aux: "Int32", resultInArg0: true}, // r = arg0+auxint
+ {name: "ADCQconst", argLength: 2, reg: gp1flags1flags, typ: "(UInt64,Flags)", asm: "ADCQ", aux: "Int32", resultInArg0: true}, // r = arg0+auxint+carry(arg1)
+
+ // The following 4 subtract opcodes return the low 64 bits of the difference in the first result and
+ // the borrow (if the result is negative) in the carry flag.
+ {name: "SUBQborrow", argLength: 2, reg: gp21flags, typ: "(UInt64,Flags)", asm: "SUBQ", resultInArg0: true}, // r = arg0-arg1
+ {name: "SBBQ", argLength: 3, reg: gp2flags1flags, typ: "(UInt64,Flags)", asm: "SBBQ", resultInArg0: true}, // r = arg0-(arg1+carry(arg2))
+ {name: "SUBQconstborrow", argLength: 1, reg: gp11flags, typ: "(UInt64,Flags)", asm: "SUBQ", aux: "Int32", resultInArg0: true}, // r = arg0-auxint
+ {name: "SBBQconst", argLength: 2, reg: gp1flags1flags, typ: "(UInt64,Flags)", asm: "SBBQ", aux: "Int32", resultInArg0: true}, // r = arg0-(auxint+carry(arg1))
+
+ {name: "MULQU2", argLength: 2, reg: regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx, ax}}, commutative: true, asm: "MULQ", clobberFlags: true}, // arg0 * arg1, returns (hi, lo)
+ {name: "DIVQU2", argLength: 3, reg: regInfo{inputs: []regMask{dx, ax, gpsp}, outputs: []regMask{ax, dx}}, asm: "DIVQ", clobberFlags: true}, // arg0:arg1 / arg2 (128-bit divided by 64-bit), returns (q, r)
+
+ {name: "ANDQ", argLength: 2, reg: gp21, asm: "ANDQ", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 & arg1
+ {name: "ANDL", argLength: 2, reg: gp21, asm: "ANDL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 & arg1
+ {name: "ANDQconst", argLength: 1, reg: gp11, asm: "ANDQ", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 & auxint
+ {name: "ANDLconst", argLength: 1, reg: gp11, asm: "ANDL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 & auxint
+ {name: "ANDQconstmodify", argLength: 2, reg: gpstoreconst, asm: "ANDQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // and ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+ {name: "ANDLconstmodify", argLength: 2, reg: gpstoreconst, asm: "ANDL", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // and ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+
+ {name: "ORQ", argLength: 2, reg: gp21, asm: "ORQ", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 | arg1
+ {name: "ORL", argLength: 2, reg: gp21, asm: "ORL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 | arg1
+ {name: "ORQconst", argLength: 1, reg: gp11, asm: "ORQ", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 | auxint
+ {name: "ORLconst", argLength: 1, reg: gp11, asm: "ORL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 | auxint
+ {name: "ORQconstmodify", argLength: 2, reg: gpstoreconst, asm: "ORQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // or ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+ {name: "ORLconstmodify", argLength: 2, reg: gpstoreconst, asm: "ORL", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // or ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+
+ {name: "XORQ", argLength: 2, reg: gp21, asm: "XORQ", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 ^ arg1
+ {name: "XORL", argLength: 2, reg: gp21, asm: "XORL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 ^ arg1
+ {name: "XORQconst", argLength: 1, reg: gp11, asm: "XORQ", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 ^ auxint
+ {name: "XORLconst", argLength: 1, reg: gp11, asm: "XORL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 ^ auxint
+ {name: "XORQconstmodify", argLength: 2, reg: gpstoreconst, asm: "XORQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // xor ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+ {name: "XORLconstmodify", argLength: 2, reg: gpstoreconst, asm: "XORL", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // xor ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+
+ // CMPx: compare arg0 to arg1.
+ {name: "CMPQ", argLength: 2, reg: gp2flags, asm: "CMPQ", typ: "Flags"},
+ {name: "CMPL", argLength: 2, reg: gp2flags, asm: "CMPL", typ: "Flags"},
+ {name: "CMPW", argLength: 2, reg: gp2flags, asm: "CMPW", typ: "Flags"},
+ {name: "CMPB", argLength: 2, reg: gp2flags, asm: "CMPB", typ: "Flags"},
+
+ // CMPxconst: compare arg0 to auxint.
+ {name: "CMPQconst", argLength: 1, reg: gp1flags, asm: "CMPQ", typ: "Flags", aux: "Int32"},
+ {name: "CMPLconst", argLength: 1, reg: gp1flags, asm: "CMPL", typ: "Flags", aux: "Int32"},
+ {name: "CMPWconst", argLength: 1, reg: gp1flags, asm: "CMPW", typ: "Flags", aux: "Int16"},
+ {name: "CMPBconst", argLength: 1, reg: gp1flags, asm: "CMPB", typ: "Flags", aux: "Int8"},
+
+ // CMPxload: compare *(arg0+auxint+aux) to arg1 (in that order). arg2=mem.
+ {name: "CMPQload", argLength: 3, reg: gp1flagsLoad, asm: "CMPQ", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+ {name: "CMPLload", argLength: 3, reg: gp1flagsLoad, asm: "CMPL", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+ {name: "CMPWload", argLength: 3, reg: gp1flagsLoad, asm: "CMPW", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+ {name: "CMPBload", argLength: 3, reg: gp1flagsLoad, asm: "CMPB", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+
+ // CMPxconstload: compare *(arg0+ValAndOff(AuxInt).Off()+aux) to ValAndOff(AuxInt).Val() (in that order). arg1=mem.
+ {name: "CMPQconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPQ", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+ {name: "CMPLconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPL", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+ {name: "CMPWconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPW", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+ {name: "CMPBconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPB", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+
+ // CMPxloadidx: compare *(arg0+N*arg1+auxint+aux) to arg2 (in that order). arg3=mem.
+ {name: "CMPQloadidx8", argLength: 4, reg: gp2flagsLoad, asm: "CMPQ", scale: 8, aux: "SymOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPQloadidx1", argLength: 4, reg: gp2flagsLoad, asm: "CMPQ", scale: 1, commutative: true, aux: "SymOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPLloadidx4", argLength: 4, reg: gp2flagsLoad, asm: "CMPL", scale: 4, aux: "SymOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPLloadidx1", argLength: 4, reg: gp2flagsLoad, asm: "CMPL", scale: 1, commutative: true, aux: "SymOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPWloadidx2", argLength: 4, reg: gp2flagsLoad, asm: "CMPW", scale: 2, aux: "SymOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPWloadidx1", argLength: 4, reg: gp2flagsLoad, asm: "CMPW", scale: 1, commutative: true, aux: "SymOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPBloadidx1", argLength: 4, reg: gp2flagsLoad, asm: "CMPB", scale: 1, commutative: true, aux: "SymOff", typ: "Flags", symEffect: "Read"},
+
+ // CMPxconstloadidx: compare *(arg0+N*arg1+ValAndOff(AuxInt).Off()+aux) to ValAndOff(AuxInt).Val() (in that order). arg2=mem.
+ {name: "CMPQconstloadidx8", argLength: 3, reg: gp1flagsLoad, asm: "CMPQ", scale: 8, aux: "SymValAndOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPQconstloadidx1", argLength: 3, reg: gp1flagsLoad, asm: "CMPQ", scale: 1, commutative: true, aux: "SymValAndOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPLconstloadidx4", argLength: 3, reg: gp1flagsLoad, asm: "CMPL", scale: 4, aux: "SymValAndOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPLconstloadidx1", argLength: 3, reg: gp1flagsLoad, asm: "CMPL", scale: 1, commutative: true, aux: "SymValAndOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPWconstloadidx2", argLength: 3, reg: gp1flagsLoad, asm: "CMPW", scale: 2, aux: "SymValAndOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPWconstloadidx1", argLength: 3, reg: gp1flagsLoad, asm: "CMPW", scale: 1, commutative: true, aux: "SymValAndOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPBconstloadidx1", argLength: 3, reg: gp1flagsLoad, asm: "CMPB", scale: 1, commutative: true, aux: "SymValAndOff", typ: "Flags", symEffect: "Read"},
+
+ // UCOMISx: floating-point compare arg0 to arg1
+ // x==S for float32, x==D for float64
+ {name: "UCOMISS", argLength: 2, reg: fp2flags, asm: "UCOMISS", typ: "Flags"},
+ {name: "UCOMISD", argLength: 2, reg: fp2flags, asm: "UCOMISD", typ: "Flags"},
+
+ // bit test/set/clear operations
+ {name: "BTL", argLength: 2, reg: gp2flags, asm: "BTL", typ: "Flags"}, // test whether bit arg0%32 in arg1 is set
+ {name: "BTQ", argLength: 2, reg: gp2flags, asm: "BTQ", typ: "Flags"}, // test whether bit arg0%64 in arg1 is set
+ {name: "BTCL", argLength: 2, reg: gp21, asm: "BTCL", resultInArg0: true, clobberFlags: true}, // complement bit arg1%32 in arg0
+ {name: "BTCQ", argLength: 2, reg: gp21, asm: "BTCQ", resultInArg0: true, clobberFlags: true}, // complement bit arg1%64 in arg0
+ {name: "BTRL", argLength: 2, reg: gp21, asm: "BTRL", resultInArg0: true, clobberFlags: true}, // reset bit arg1%32 in arg0
+ {name: "BTRQ", argLength: 2, reg: gp21, asm: "BTRQ", resultInArg0: true, clobberFlags: true}, // reset bit arg1%64 in arg0
+ {name: "BTSL", argLength: 2, reg: gp21, asm: "BTSL", resultInArg0: true, clobberFlags: true}, // set bit arg1%32 in arg0
+ {name: "BTSQ", argLength: 2, reg: gp21, asm: "BTSQ", resultInArg0: true, clobberFlags: true}, // set bit arg1%64 in arg0
+ {name: "BTLconst", argLength: 1, reg: gp1flags, asm: "BTL", typ: "Flags", aux: "Int8"}, // test whether bit auxint in arg0 is set, 0 <= auxint < 32
+ {name: "BTQconst", argLength: 1, reg: gp1flags, asm: "BTQ", typ: "Flags", aux: "Int8"}, // test whether bit auxint in arg0 is set, 0 <= auxint < 64
+ {name: "BTCLconst", argLength: 1, reg: gp11, asm: "BTCL", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // complement bit auxint in arg0, 0 <= auxint < 32
+ {name: "BTCQconst", argLength: 1, reg: gp11, asm: "BTCQ", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // complement bit auxint in arg0, 0 <= auxint < 64
+ {name: "BTRLconst", argLength: 1, reg: gp11, asm: "BTRL", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // reset bit auxint in arg0, 0 <= auxint < 32
+ {name: "BTRQconst", argLength: 1, reg: gp11, asm: "BTRQ", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // reset bit auxint in arg0, 0 <= auxint < 64
+ {name: "BTSLconst", argLength: 1, reg: gp11, asm: "BTSL", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // set bit auxint in arg0, 0 <= auxint < 32
+ {name: "BTSQconst", argLength: 1, reg: gp11, asm: "BTSQ", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // set bit auxint in arg0, 0 <= auxint < 64
+
+ // TESTx: compare (arg0 & arg1) to 0
+ {name: "TESTQ", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTQ", typ: "Flags"},
+ {name: "TESTL", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTL", typ: "Flags"},
+ {name: "TESTW", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTW", typ: "Flags"},
+ {name: "TESTB", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTB", typ: "Flags"},
+
+ // TESTxconst: compare (arg0 & auxint) to 0
+ {name: "TESTQconst", argLength: 1, reg: gp1flags, asm: "TESTQ", typ: "Flags", aux: "Int32"},
+ {name: "TESTLconst", argLength: 1, reg: gp1flags, asm: "TESTL", typ: "Flags", aux: "Int32"},
+ {name: "TESTWconst", argLength: 1, reg: gp1flags, asm: "TESTW", typ: "Flags", aux: "Int16"},
+ {name: "TESTBconst", argLength: 1, reg: gp1flags, asm: "TESTB", typ: "Flags", aux: "Int8"},
+
+ // S{HL, HR, AR}x: shift operations
+ // SHL: shift left
+ // SHR: shift right logical (0s are shifted in from beyond the word size)
+ // SAR: shift right arithmetic (sign bit is shifted in from beyond the word size)
+ // arg0 is the value being shifted
+ // arg1 is the amount to shift, interpreted mod (Q=64,L=32,W=32,B=32)
+ // (Note: x86 is weird, the 16 and 8 bit shifts still use all 5 bits of shift amount!)
+ // For *const versions, use auxint instead of arg1 as the shift amount. auxint must be in the range 0 to (Q=63,L=31,W=15,B=7) inclusive.
+ {name: "SHLQ", argLength: 2, reg: gp21shift, asm: "SHLQ", resultInArg0: true, clobberFlags: true},
+ {name: "SHLL", argLength: 2, reg: gp21shift, asm: "SHLL", resultInArg0: true, clobberFlags: true},
+ {name: "SHLQconst", argLength: 1, reg: gp11, asm: "SHLQ", aux: "Int8", resultInArg0: true, clobberFlags: true},
+ {name: "SHLLconst", argLength: 1, reg: gp11, asm: "SHLL", aux: "Int8", resultInArg0: true, clobberFlags: true},
+
+ {name: "SHRQ", argLength: 2, reg: gp21shift, asm: "SHRQ", resultInArg0: true, clobberFlags: true},
+ {name: "SHRL", argLength: 2, reg: gp21shift, asm: "SHRL", resultInArg0: true, clobberFlags: true},
+ {name: "SHRW", argLength: 2, reg: gp21shift, asm: "SHRW", resultInArg0: true, clobberFlags: true},
+ {name: "SHRB", argLength: 2, reg: gp21shift, asm: "SHRB", resultInArg0: true, clobberFlags: true},
+ {name: "SHRQconst", argLength: 1, reg: gp11, asm: "SHRQ", aux: "Int8", resultInArg0: true, clobberFlags: true},
+ {name: "SHRLconst", argLength: 1, reg: gp11, asm: "SHRL", aux: "Int8", resultInArg0: true, clobberFlags: true},
+ {name: "SHRWconst", argLength: 1, reg: gp11, asm: "SHRW", aux: "Int8", resultInArg0: true, clobberFlags: true},
+ {name: "SHRBconst", argLength: 1, reg: gp11, asm: "SHRB", aux: "Int8", resultInArg0: true, clobberFlags: true},
+
+ {name: "SARQ", argLength: 2, reg: gp21shift, asm: "SARQ", resultInArg0: true, clobberFlags: true},
+ {name: "SARL", argLength: 2, reg: gp21shift, asm: "SARL", resultInArg0: true, clobberFlags: true},
+ {name: "SARW", argLength: 2, reg: gp21shift, asm: "SARW", resultInArg0: true, clobberFlags: true},
+ {name: "SARB", argLength: 2, reg: gp21shift, asm: "SARB", resultInArg0: true, clobberFlags: true},
+ {name: "SARQconst", argLength: 1, reg: gp11, asm: "SARQ", aux: "Int8", resultInArg0: true, clobberFlags: true},
+ {name: "SARLconst", argLength: 1, reg: gp11, asm: "SARL", aux: "Int8", resultInArg0: true, clobberFlags: true},
+ {name: "SARWconst", argLength: 1, reg: gp11, asm: "SARW", aux: "Int8", resultInArg0: true, clobberFlags: true},
+ {name: "SARBconst", argLength: 1, reg: gp11, asm: "SARB", aux: "Int8", resultInArg0: true, clobberFlags: true},
+
+ // unsigned arg0 >> arg2, shifting in bits from arg1 (==(arg1<<64+arg0)>>arg2, keeping low 64 bits), shift amount is mod 64
+ {name: "SHRDQ", argLength: 3, reg: gp31shift, asm: "SHRQ", resultInArg0: true, clobberFlags: true},
+ // unsigned arg0 << arg2, shifting in bits from arg1 (==(arg0<<64+arg1)<<arg2, keeping high 64 bits), shift amount is mod 64
+ {name: "SHLDQ", argLength: 3, reg: gp31shift, asm: "SHLQ", resultInArg0: true, clobberFlags: true},
+
+ // RO{L,R}x: rotate instructions
+ // computes arg0 rotate (L=left,R=right) arg1 bits.
+ // Bits are rotated within the low (Q=64,L=32,W=16,B=8) bits of the register.
+ // For *const versions use auxint instead of arg1 as the rotate amount. auxint must be in the range 0 to (Q=63,L=31,W=15,B=7) inclusive.
+ // x==L versions zero the upper 32 bits of the destination register.
+ // x==W and x==B versions leave the upper bits unspecified.
+ {name: "ROLQ", argLength: 2, reg: gp21shift, asm: "ROLQ", resultInArg0: true, clobberFlags: true},
+ {name: "ROLL", argLength: 2, reg: gp21shift, asm: "ROLL", resultInArg0: true, clobberFlags: true},
+ {name: "ROLW", argLength: 2, reg: gp21shift, asm: "ROLW", resultInArg0: true, clobberFlags: true},
+ {name: "ROLB", argLength: 2, reg: gp21shift, asm: "ROLB", resultInArg0: true, clobberFlags: true},
+ {name: "RORQ", argLength: 2, reg: gp21shift, asm: "RORQ", resultInArg0: true, clobberFlags: true},
+ {name: "RORL", argLength: 2, reg: gp21shift, asm: "RORL", resultInArg0: true, clobberFlags: true},
+ {name: "RORW", argLength: 2, reg: gp21shift, asm: "RORW", resultInArg0: true, clobberFlags: true},
+ {name: "RORB", argLength: 2, reg: gp21shift, asm: "RORB", resultInArg0: true, clobberFlags: true},
+ {name: "ROLQconst", argLength: 1, reg: gp11, asm: "ROLQ", aux: "Int8", resultInArg0: true, clobberFlags: true},
+ {name: "ROLLconst", argLength: 1, reg: gp11, asm: "ROLL", aux: "Int8", resultInArg0: true, clobberFlags: true},
+ {name: "ROLWconst", argLength: 1, reg: gp11, asm: "ROLW", aux: "Int8", resultInArg0: true, clobberFlags: true},
+ {name: "ROLBconst", argLength: 1, reg: gp11, asm: "ROLB", aux: "Int8", resultInArg0: true, clobberFlags: true},
+
+ // [ADD,SUB,AND,OR,XOR]xload: integer load/op combo
+ // L = int32, Q = int64
+ // x==L operations zero the upper 4 bytes of the destination register.
+ // computes arg0 op *(arg1+auxint+aux), arg2=mem
+ {name: "ADDLload", argLength: 3, reg: gp21load, asm: "ADDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"},
+ {name: "ADDQload", argLength: 3, reg: gp21load, asm: "ADDQ", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"},
+ {name: "SUBQload", argLength: 3, reg: gp21load, asm: "SUBQ", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"},
+ {name: "SUBLload", argLength: 3, reg: gp21load, asm: "SUBL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"},
+ {name: "ANDLload", argLength: 3, reg: gp21load, asm: "ANDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"},
+ {name: "ANDQload", argLength: 3, reg: gp21load, asm: "ANDQ", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"},
+ {name: "ORQload", argLength: 3, reg: gp21load, asm: "ORQ", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"},
+ {name: "ORLload", argLength: 3, reg: gp21load, asm: "ORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"},
+ {name: "XORQload", argLength: 3, reg: gp21load, asm: "XORQ", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"},
+ {name: "XORLload", argLength: 3, reg: gp21load, asm: "XORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"},
+
+ // integer indexed load/op combo
+ // L = int32, Q = int64
+ // L operations zero the upper 4 bytes of the destination register.
+ // computes arg0 op *(arg1+scale*arg2+auxint+aux), arg3=mem
+ {name: "ADDLloadidx1", argLength: 4, reg: gp21loadidx, asm: "ADDL", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "ADDLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ADDL", scale: 4, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "ADDLloadidx8", argLength: 4, reg: gp21loadidx, asm: "ADDL", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "ADDQloadidx1", argLength: 4, reg: gp21loadidx, asm: "ADDQ", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "ADDQloadidx8", argLength: 4, reg: gp21loadidx, asm: "ADDQ", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "SUBLloadidx1", argLength: 4, reg: gp21loadidx, asm: "SUBL", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "SUBLloadidx4", argLength: 4, reg: gp21loadidx, asm: "SUBL", scale: 4, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "SUBLloadidx8", argLength: 4, reg: gp21loadidx, asm: "SUBL", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "SUBQloadidx1", argLength: 4, reg: gp21loadidx, asm: "SUBQ", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "SUBQloadidx8", argLength: 4, reg: gp21loadidx, asm: "SUBQ", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "ANDLloadidx1", argLength: 4, reg: gp21loadidx, asm: "ANDL", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "ANDLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ANDL", scale: 4, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "ANDLloadidx8", argLength: 4, reg: gp21loadidx, asm: "ANDL", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "ANDQloadidx1", argLength: 4, reg: gp21loadidx, asm: "ANDQ", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "ANDQloadidx8", argLength: 4, reg: gp21loadidx, asm: "ANDQ", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "ORLloadidx1", argLength: 4, reg: gp21loadidx, asm: "ORL", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "ORLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ORL", scale: 4, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "ORLloadidx8", argLength: 4, reg: gp21loadidx, asm: "ORL", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "ORQloadidx1", argLength: 4, reg: gp21loadidx, asm: "ORQ", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "ORQloadidx8", argLength: 4, reg: gp21loadidx, asm: "ORQ", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "XORLloadidx1", argLength: 4, reg: gp21loadidx, asm: "XORL", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "XORLloadidx4", argLength: 4, reg: gp21loadidx, asm: "XORL", scale: 4, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "XORLloadidx8", argLength: 4, reg: gp21loadidx, asm: "XORL", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "XORQloadidx1", argLength: 4, reg: gp21loadidx, asm: "XORQ", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+ {name: "XORQloadidx8", argLength: 4, reg: gp21loadidx, asm: "XORQ", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"},
+
+ // direct binary op on memory (read-modify-write)
+ // L = int32, Q = int64
+ // does *(arg0+auxint+aux) op= arg1, arg2=mem
+ {name: "ADDQmodify", argLength: 3, reg: gpstore, asm: "ADDQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"},
+ {name: "SUBQmodify", argLength: 3, reg: gpstore, asm: "SUBQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"},
+ {name: "ANDQmodify", argLength: 3, reg: gpstore, asm: "ANDQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"},
+ {name: "ORQmodify", argLength: 3, reg: gpstore, asm: "ORQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"},
+ {name: "XORQmodify", argLength: 3, reg: gpstore, asm: "XORQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"},
+ {name: "ADDLmodify", argLength: 3, reg: gpstore, asm: "ADDL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"},
+ {name: "SUBLmodify", argLength: 3, reg: gpstore, asm: "SUBL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"},
+ {name: "ANDLmodify", argLength: 3, reg: gpstore, asm: "ANDL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"},
+ {name: "ORLmodify", argLength: 3, reg: gpstore, asm: "ORL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"},
+ {name: "XORLmodify", argLength: 3, reg: gpstore, asm: "XORL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"},
+
+ // indexed direct binary op on memory.
+ // does *(arg0+scale*arg1+auxint+aux) op= arg2, arg3=mem
+ {name: "ADDQmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ADDQ", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ADDQmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ADDQ", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "SUBQmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "SUBQ", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "SUBQmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "SUBQ", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ANDQmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ANDQ", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ANDQmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ANDQ", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ORQmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ORQ", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ORQmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ORQ", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "XORQmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "XORQ", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "XORQmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "XORQ", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ADDLmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ADDL", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ADDLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ADDL", scale: 4, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ADDLmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ADDL", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "SUBLmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "SUBL", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "SUBLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "SUBL", scale: 4, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "SUBLmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "SUBL", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ANDLmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ANDL", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ANDLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ANDL", scale: 4, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ANDLmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ANDL", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ORLmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ORL", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ORLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ORL", scale: 4, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ORLmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ORL", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "XORLmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "XORL", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "XORLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "XORL", scale: 4, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "XORLmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "XORL", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+
+ // indexed direct binary op on memory with constant argument.
+ // does *(arg0+scale*arg1+ValAndOff(AuxInt).Off()+aux) op= ValAndOff(AuxInt).Val(), arg2=mem
+ {name: "ADDQconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ADDQ", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ADDQconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ADDQ", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ANDQconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ANDQ", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ANDQconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ANDQ", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ORQconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ORQ", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ORQconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ORQ", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "XORQconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "XORQ", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "XORQconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "XORQ", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ADDLconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ADDL", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ADDLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ADDL", scale: 4, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ADDLconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ADDL", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ANDLconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ANDL", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ANDLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ANDL", scale: 4, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ANDLconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ANDL", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ORLconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ORL", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ORLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ORL", scale: 4, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "ORLconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ORL", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "XORLconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "XORL", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "XORLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "XORL", scale: 4, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+ {name: "XORLconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "XORL", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"},
+
+ // {NEG,NOT}x: unary ops
+ // computes [NEG:-,NOT:^]arg0
+ // L = int32, Q = int64
+ // L operations zero the upper 4 bytes of the destination register.
+ {name: "NEGQ", argLength: 1, reg: gp11, asm: "NEGQ", resultInArg0: true, clobberFlags: true},
+ {name: "NEGL", argLength: 1, reg: gp11, asm: "NEGL", resultInArg0: true, clobberFlags: true},
+ {name: "NOTQ", argLength: 1, reg: gp11, asm: "NOTQ", resultInArg0: true},
+ {name: "NOTL", argLength: 1, reg: gp11, asm: "NOTL", resultInArg0: true},
+
+ // BS{F,R}Q returns a tuple [result, flags]
+ // result is undefined if the input is zero.
+ // flags are set to "equal" if the input is zero, "not equal" otherwise.
+ // BS{F,R}L returns only the result.
+ {name: "BSFQ", argLength: 1, reg: gp11flags, asm: "BSFQ", typ: "(UInt64,Flags)"}, // # of low-order zeroes in 64-bit arg
+ {name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL", typ: "UInt32", clobberFlags: true}, // # of low-order zeroes in 32-bit arg
+ {name: "BSRQ", argLength: 1, reg: gp11flags, asm: "BSRQ", typ: "(UInt64,Flags)"}, // index of the highest-order set bit in 64-bit arg
+ {name: "BSRL", argLength: 1, reg: gp11, asm: "BSRL", typ: "UInt32", clobberFlags: true}, // index of the highest-order set bit in 32-bit arg
+
+ // CMOV instructions: 64, 32 and 16-bit sizes.
+ // if arg2 encodes a true result, return arg1, else arg0
+ {name: "CMOVQEQ", argLength: 3, reg: gp21, asm: "CMOVQEQ", resultInArg0: true},
+ {name: "CMOVQNE", argLength: 3, reg: gp21, asm: "CMOVQNE", resultInArg0: true},
+ {name: "CMOVQLT", argLength: 3, reg: gp21, asm: "CMOVQLT", resultInArg0: true},
+ {name: "CMOVQGT", argLength: 3, reg: gp21, asm: "CMOVQGT", resultInArg0: true},
+ {name: "CMOVQLE", argLength: 3, reg: gp21, asm: "CMOVQLE", resultInArg0: true},
+ {name: "CMOVQGE", argLength: 3, reg: gp21, asm: "CMOVQGE", resultInArg0: true},
+ {name: "CMOVQLS", argLength: 3, reg: gp21, asm: "CMOVQLS", resultInArg0: true},
+ {name: "CMOVQHI", argLength: 3, reg: gp21, asm: "CMOVQHI", resultInArg0: true},
+ {name: "CMOVQCC", argLength: 3, reg: gp21, asm: "CMOVQCC", resultInArg0: true},
+ {name: "CMOVQCS", argLength: 3, reg: gp21, asm: "CMOVQCS", resultInArg0: true},
+
+ {name: "CMOVLEQ", argLength: 3, reg: gp21, asm: "CMOVLEQ", resultInArg0: true},
+ {name: "CMOVLNE", argLength: 3, reg: gp21, asm: "CMOVLNE", resultInArg0: true},
+ {name: "CMOVLLT", argLength: 3, reg: gp21, asm: "CMOVLLT", resultInArg0: true},
+ {name: "CMOVLGT", argLength: 3, reg: gp21, asm: "CMOVLGT", resultInArg0: true},
+ {name: "CMOVLLE", argLength: 3, reg: gp21, asm: "CMOVLLE", resultInArg0: true},
+ {name: "CMOVLGE", argLength: 3, reg: gp21, asm: "CMOVLGE", resultInArg0: true},
+ {name: "CMOVLLS", argLength: 3, reg: gp21, asm: "CMOVLLS", resultInArg0: true},
+ {name: "CMOVLHI", argLength: 3, reg: gp21, asm: "CMOVLHI", resultInArg0: true},
+ {name: "CMOVLCC", argLength: 3, reg: gp21, asm: "CMOVLCC", resultInArg0: true},
+ {name: "CMOVLCS", argLength: 3, reg: gp21, asm: "CMOVLCS", resultInArg0: true},
+
+ {name: "CMOVWEQ", argLength: 3, reg: gp21, asm: "CMOVWEQ", resultInArg0: true},
+ {name: "CMOVWNE", argLength: 3, reg: gp21, asm: "CMOVWNE", resultInArg0: true},
+ {name: "CMOVWLT", argLength: 3, reg: gp21, asm: "CMOVWLT", resultInArg0: true},
+ {name: "CMOVWGT", argLength: 3, reg: gp21, asm: "CMOVWGT", resultInArg0: true},
+ {name: "CMOVWLE", argLength: 3, reg: gp21, asm: "CMOVWLE", resultInArg0: true},
+ {name: "CMOVWGE", argLength: 3, reg: gp21, asm: "CMOVWGE", resultInArg0: true},
+ {name: "CMOVWLS", argLength: 3, reg: gp21, asm: "CMOVWLS", resultInArg0: true},
+ {name: "CMOVWHI", argLength: 3, reg: gp21, asm: "CMOVWHI", resultInArg0: true},
+ {name: "CMOVWCC", argLength: 3, reg: gp21, asm: "CMOVWCC", resultInArg0: true},
+ {name: "CMOVWCS", argLength: 3, reg: gp21, asm: "CMOVWCS", resultInArg0: true},
+
+ // CMOV with floating point instructions. We need separate pseudo-op to handle
+ // InvertFlags correctly, and to generate special code that handles NaN (unordered flag).
+ // NOTE: the fact that CMOV*EQF here is marked to generate CMOV*NE is not a bug. See
+ // code generation in amd64/ssa.go.
+ {name: "CMOVQEQF", argLength: 3, reg: gp21, asm: "CMOVQNE", resultInArg0: true, needIntTemp: true},
+ {name: "CMOVQNEF", argLength: 3, reg: gp21, asm: "CMOVQNE", resultInArg0: true},
+ {name: "CMOVQGTF", argLength: 3, reg: gp21, asm: "CMOVQHI", resultInArg0: true},
+ {name: "CMOVQGEF", argLength: 3, reg: gp21, asm: "CMOVQCC", resultInArg0: true},
+ {name: "CMOVLEQF", argLength: 3, reg: gp21, asm: "CMOVLNE", resultInArg0: true, needIntTemp: true},
+ {name: "CMOVLNEF", argLength: 3, reg: gp21, asm: "CMOVLNE", resultInArg0: true},
+ {name: "CMOVLGTF", argLength: 3, reg: gp21, asm: "CMOVLHI", resultInArg0: true},
+ {name: "CMOVLGEF", argLength: 3, reg: gp21, asm: "CMOVLCC", resultInArg0: true},
+ {name: "CMOVWEQF", argLength: 3, reg: gp21, asm: "CMOVWNE", resultInArg0: true, needIntTemp: true},
+ {name: "CMOVWNEF", argLength: 3, reg: gp21, asm: "CMOVWNE", resultInArg0: true},
+ {name: "CMOVWGTF", argLength: 3, reg: gp21, asm: "CMOVWHI", resultInArg0: true},
+ {name: "CMOVWGEF", argLength: 3, reg: gp21, asm: "CMOVWCC", resultInArg0: true},
+
+ // BSWAPx swaps the low-order (L=4,Q=8) bytes of arg0.
+ // Q: abcdefgh -> hgfedcba
+ // L: abcdefgh -> 0000hgfe (L zeros the upper 4 bytes)
+ {name: "BSWAPQ", argLength: 1, reg: gp11, asm: "BSWAPQ", resultInArg0: true},
+ {name: "BSWAPL", argLength: 1, reg: gp11, asm: "BSWAPL", resultInArg0: true},
+
+ // POPCNTx counts the number of set bits in the low-order (L=32,Q=64) bits of arg0.
+ // POPCNTx instructions are only guaranteed to be available if GOAMD64>=v2.
+ // For GOAMD64<v2, any use must be preceded by a successful runtime check of runtime.x86HasPOPCNT.
+ {name: "POPCNTQ", argLength: 1, reg: gp11, asm: "POPCNTQ", clobberFlags: true},
+ {name: "POPCNTL", argLength: 1, reg: gp11, asm: "POPCNTL", clobberFlags: true},
+
+ // SQRTSx computes sqrt(arg0)
+ // S = float32, D = float64
+ {name: "SQRTSD", argLength: 1, reg: fp11, asm: "SQRTSD"},
+ {name: "SQRTSS", argLength: 1, reg: fp11, asm: "SQRTSS"},
+
+ // ROUNDSD rounds arg0 to an integer depending on auxint
+ // 0 means math.RoundToEven, 1 means math.Floor, 2 math.Ceil, 3 math.Trunc
+ // (The result is still a float64.)
+ // ROUNDSD instruction is only guaranteed to be available if GOAMD64>=v2.
+ // For GOAMD64<v2, any use must be preceded by a successful check of runtime.x86HasSSE41.
+ {name: "ROUNDSD", argLength: 1, reg: fp11, aux: "Int8", asm: "ROUNDSD"},
+
+ // VFMADD231SD only exists on platforms with the FMA3 instruction set.
+ // Any use must be preceded by a successful check of runtime.support_fma.
+ {name: "VFMADD231SD", argLength: 3, reg: fp31, resultInArg0: true, asm: "VFMADD231SD"},
+
+ {name: "SBBQcarrymask", argLength: 1, reg: flagsgp, asm: "SBBQ"}, // (int64)(-1) if carry is set, 0 if carry is clear.
+ {name: "SBBLcarrymask", argLength: 1, reg: flagsgp, asm: "SBBL"}, // (int32)(-1) if carry is set, 0 if carry is clear.
+ // Note: SBBW and SBBB are subsumed by SBBL
+
+ {name: "SETEQ", argLength: 1, reg: readflags, asm: "SETEQ"}, // extract == condition from arg0
+ {name: "SETNE", argLength: 1, reg: readflags, asm: "SETNE"}, // extract != condition from arg0
+ {name: "SETL", argLength: 1, reg: readflags, asm: "SETLT"}, // extract signed < condition from arg0
+ {name: "SETLE", argLength: 1, reg: readflags, asm: "SETLE"}, // extract signed <= condition from arg0
+ {name: "SETG", argLength: 1, reg: readflags, asm: "SETGT"}, // extract signed > condition from arg0
+ {name: "SETGE", argLength: 1, reg: readflags, asm: "SETGE"}, // extract signed >= condition from arg0
+ {name: "SETB", argLength: 1, reg: readflags, asm: "SETCS"}, // extract unsigned < condition from arg0
+ {name: "SETBE", argLength: 1, reg: readflags, asm: "SETLS"}, // extract unsigned <= condition from arg0
+ {name: "SETA", argLength: 1, reg: readflags, asm: "SETHI"}, // extract unsigned > condition from arg0
+ {name: "SETAE", argLength: 1, reg: readflags, asm: "SETCC"}, // extract unsigned >= condition from arg0
+ {name: "SETO", argLength: 1, reg: readflags, asm: "SETOS"}, // extract if overflow flag is set from arg0
+ // Variants that store result to memory
+ {name: "SETEQstore", argLength: 3, reg: gpstoreconst, asm: "SETEQ", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract == condition from arg1 to arg0+auxint+aux, arg2=mem
+ {name: "SETNEstore", argLength: 3, reg: gpstoreconst, asm: "SETNE", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract != condition from arg1 to arg0+auxint+aux, arg2=mem
+ {name: "SETLstore", argLength: 3, reg: gpstoreconst, asm: "SETLT", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract signed < condition from arg1 to arg0+auxint+aux, arg2=mem
+ {name: "SETLEstore", argLength: 3, reg: gpstoreconst, asm: "SETLE", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract signed <= condition from arg1 to arg0+auxint+aux, arg2=mem
+ {name: "SETGstore", argLength: 3, reg: gpstoreconst, asm: "SETGT", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract signed > condition from arg1 to arg0+auxint+aux, arg2=mem
+ {name: "SETGEstore", argLength: 3, reg: gpstoreconst, asm: "SETGE", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract signed >= condition from arg1 to arg0+auxint+aux, arg2=mem
+ {name: "SETBstore", argLength: 3, reg: gpstoreconst, asm: "SETCS", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract unsigned < condition from arg1 to arg0+auxint+aux, arg2=mem
+ {name: "SETBEstore", argLength: 3, reg: gpstoreconst, asm: "SETLS", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract unsigned <= condition from arg1 to arg0+auxint+aux, arg2=mem
+ {name: "SETAstore", argLength: 3, reg: gpstoreconst, asm: "SETHI", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract unsigned > condition from arg1 to arg0+auxint+aux, arg2=mem
+ {name: "SETAEstore", argLength: 3, reg: gpstoreconst, asm: "SETCC", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract unsigned >= condition from arg1 to arg0+auxint+aux, arg2=mem
+ // Need different opcodes for floating point conditions because
+ // any comparison involving a NaN is always FALSE and thus
+ // the patterns for inverting conditions cannot be used.
+ {name: "SETEQF", argLength: 1, reg: flagsgp, asm: "SETEQ", clobberFlags: true, needIntTemp: true}, // extract == condition from arg0
+ {name: "SETNEF", argLength: 1, reg: flagsgp, asm: "SETNE", clobberFlags: true, needIntTemp: true}, // extract != condition from arg0
+ {name: "SETORD", argLength: 1, reg: flagsgp, asm: "SETPC"}, // extract "ordered" (No Nan present) condition from arg0
+ {name: "SETNAN", argLength: 1, reg: flagsgp, asm: "SETPS"}, // extract "unordered" (Nan present) condition from arg0
+
+ {name: "SETGF", argLength: 1, reg: flagsgp, asm: "SETHI"}, // extract floating > condition from arg0
+ {name: "SETGEF", argLength: 1, reg: flagsgp, asm: "SETCC"}, // extract floating >= condition from arg0
+
+ {name: "MOVBQSX", argLength: 1, reg: gp11, asm: "MOVBQSX"}, // sign extend arg0 from int8 to int64
+ {name: "MOVBQZX", argLength: 1, reg: gp11, asm: "MOVBLZX"}, // zero extend arg0 from int8 to int64
+ {name: "MOVWQSX", argLength: 1, reg: gp11, asm: "MOVWQSX"}, // sign extend arg0 from int16 to int64
+ {name: "MOVWQZX", argLength: 1, reg: gp11, asm: "MOVWLZX"}, // zero extend arg0 from int16 to int64
+ {name: "MOVLQSX", argLength: 1, reg: gp11, asm: "MOVLQSX"}, // sign extend arg0 from int32 to int64
+ {name: "MOVLQZX", argLength: 1, reg: gp11, asm: "MOVL"}, // zero extend arg0 from int32 to int64
+
+ {name: "MOVLconst", reg: gp01, asm: "MOVL", typ: "UInt32", aux: "Int32", rematerializeable: true}, // 32 low bits of auxint
+ {name: "MOVQconst", reg: gp01, asm: "MOVQ", typ: "UInt64", aux: "Int64", rematerializeable: true}, // auxint
+
+ {name: "CVTTSD2SL", argLength: 1, reg: fpgp, asm: "CVTTSD2SL"}, // convert float64 to int32
+ {name: "CVTTSD2SQ", argLength: 1, reg: fpgp, asm: "CVTTSD2SQ"}, // convert float64 to int64
+ {name: "CVTTSS2SL", argLength: 1, reg: fpgp, asm: "CVTTSS2SL"}, // convert float32 to int32
+ {name: "CVTTSS2SQ", argLength: 1, reg: fpgp, asm: "CVTTSS2SQ"}, // convert float32 to int64
+ {name: "CVTSL2SS", argLength: 1, reg: gpfp, asm: "CVTSL2SS"}, // convert int32 to float32
+ {name: "CVTSL2SD", argLength: 1, reg: gpfp, asm: "CVTSL2SD"}, // convert int32 to float64
+ {name: "CVTSQ2SS", argLength: 1, reg: gpfp, asm: "CVTSQ2SS"}, // convert int64 to float32
+ {name: "CVTSQ2SD", argLength: 1, reg: gpfp, asm: "CVTSQ2SD"}, // convert int64 to float64
+ {name: "CVTSD2SS", argLength: 1, reg: fp11, asm: "CVTSD2SS"}, // convert float64 to float32
+ {name: "CVTSS2SD", argLength: 1, reg: fp11, asm: "CVTSS2SD"}, // convert float32 to float64
+
+ // Move values between int and float registers, with no conversion.
+ // TODO: should we have generic versions of these?
+ {name: "MOVQi2f", argLength: 1, reg: gpfp, typ: "Float64"}, // move 64 bits from int to float reg
+ {name: "MOVQf2i", argLength: 1, reg: fpgp, typ: "UInt64"}, // move 64 bits from float to int reg
+ {name: "MOVLi2f", argLength: 1, reg: gpfp, typ: "Float32"}, // move 32 bits from int to float reg
+ {name: "MOVLf2i", argLength: 1, reg: fpgp, typ: "UInt32"}, // move 32 bits from float to int reg, zero extend
+
+ {name: "PXOR", argLength: 2, reg: fp21, asm: "PXOR", commutative: true, resultInArg0: true}, // exclusive or, applied to X regs for float negation.
+
+ {name: "LEAQ", argLength: 1, reg: gp11sb, asm: "LEAQ", aux: "SymOff", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxint + offset encoded in aux
+ {name: "LEAL", argLength: 1, reg: gp11sb, asm: "LEAL", aux: "SymOff", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxint + offset encoded in aux
+ {name: "LEAW", argLength: 1, reg: gp11sb, asm: "LEAW", aux: "SymOff", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxint + offset encoded in aux
+
+ // LEAxn computes arg0 + n*arg1 + auxint + aux
+ // x==L zeroes the upper 4 bytes.
+ {name: "LEAQ1", argLength: 2, reg: gp21sb, asm: "LEAQ", scale: 1, commutative: true, aux: "SymOff", symEffect: "Addr"}, // arg0 + arg1 + auxint + aux
+ {name: "LEAL1", argLength: 2, reg: gp21sb, asm: "LEAL", scale: 1, commutative: true, aux: "SymOff", symEffect: "Addr"}, // arg0 + arg1 + auxint + aux
+ {name: "LEAW1", argLength: 2, reg: gp21sb, asm: "LEAW", scale: 1, commutative: true, aux: "SymOff", symEffect: "Addr"}, // arg0 + arg1 + auxint + aux
+ {name: "LEAQ2", argLength: 2, reg: gp21sb, asm: "LEAQ", scale: 2, aux: "SymOff", symEffect: "Addr"}, // arg0 + 2*arg1 + auxint + aux
+ {name: "LEAL2", argLength: 2, reg: gp21sb, asm: "LEAL", scale: 2, aux: "SymOff", symEffect: "Addr"}, // arg0 + 2*arg1 + auxint + aux
+ {name: "LEAW2", argLength: 2, reg: gp21sb, asm: "LEAW", scale: 2, aux: "SymOff", symEffect: "Addr"}, // arg0 + 2*arg1 + auxint + aux
+ {name: "LEAQ4", argLength: 2, reg: gp21sb, asm: "LEAQ", scale: 4, aux: "SymOff", symEffect: "Addr"}, // arg0 + 4*arg1 + auxint + aux
+ {name: "LEAL4", argLength: 2, reg: gp21sb, asm: "LEAL", scale: 4, aux: "SymOff", symEffect: "Addr"}, // arg0 + 4*arg1 + auxint + aux
+ {name: "LEAW4", argLength: 2, reg: gp21sb, asm: "LEAW", scale: 4, aux: "SymOff", symEffect: "Addr"}, // arg0 + 4*arg1 + auxint + aux
+ {name: "LEAQ8", argLength: 2, reg: gp21sb, asm: "LEAQ", scale: 8, aux: "SymOff", symEffect: "Addr"}, // arg0 + 8*arg1 + auxint + aux
+ {name: "LEAL8", argLength: 2, reg: gp21sb, asm: "LEAL", scale: 8, aux: "SymOff", symEffect: "Addr"}, // arg0 + 8*arg1 + auxint + aux
+ {name: "LEAW8", argLength: 2, reg: gp21sb, asm: "LEAW", scale: 8, aux: "SymOff", symEffect: "Addr"}, // arg0 + 8*arg1 + auxint + aux
+ // Note: LEAx{1,2,4,8} must not have OpSB as either argument.
+
+ // MOVxload: loads
+ // Load (Q=8,L=4,W=2,B=1) bytes from (arg0+auxint+aux), arg1=mem.
+ // "+auxint+aux" == add auxint and the offset of the symbol in aux (if any) to the effective address
+ // Standard versions zero extend the result. SX versions sign extend the result.
+ {name: "MOVBload", argLength: 2, reg: gpload, asm: "MOVBLZX", aux: "SymOff", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"},
+ {name: "MOVBQSXload", argLength: 2, reg: gpload, asm: "MOVBQSX", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+ {name: "MOVWload", argLength: 2, reg: gpload, asm: "MOVWLZX", aux: "SymOff", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"},
+ {name: "MOVWQSXload", argLength: 2, reg: gpload, asm: "MOVWQSX", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+ {name: "MOVLload", argLength: 2, reg: gpload, asm: "MOVL", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"},
+ {name: "MOVLQSXload", argLength: 2, reg: gpload, asm: "MOVLQSX", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+ {name: "MOVQload", argLength: 2, reg: gpload, asm: "MOVQ", aux: "SymOff", typ: "UInt64", faultOnNilArg0: true, symEffect: "Read"},
+
+ // MOVxstore: stores
+ // Store (Q=8,L=4,W=2,B=1) low bytes of arg1.
+ // Does *(arg0+auxint+aux) = arg1, arg2=mem.
+ {name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"},
+ {name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"},
+ {name: "MOVLstore", argLength: 3, reg: gpstore, asm: "MOVL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"},
+ {name: "MOVQstore", argLength: 3, reg: gpstore, asm: "MOVQ", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"},
+
+ // MOVOload/store: 16 byte load/store
+ // These operations are only used to move data around: there is no *O arithmetic, for example.
+ {name: "MOVOload", argLength: 2, reg: fpload, asm: "MOVUPS", aux: "SymOff", typ: "Int128", faultOnNilArg0: true, symEffect: "Read"}, // load 16 bytes from arg0+auxint+aux. arg1=mem
+ {name: "MOVOstore", argLength: 3, reg: fpstore, asm: "MOVUPS", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 16 bytes in arg1 to arg0+auxint+aux. arg2=mem
+
+ // MOVxloadidx: indexed loads
+ // load (Q=8,L=4,W=2,B=1) bytes from (arg0+scale*arg1+auxint+aux), arg2=mem.
+ // Results are zero-extended. (TODO: sign-extending indexed loads)
+ {name: "MOVBloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVBLZX", scale: 1, aux: "SymOff", typ: "UInt8", symEffect: "Read"},
+ {name: "MOVWloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVWLZX", scale: 1, aux: "SymOff", typ: "UInt16", symEffect: "Read"},
+ {name: "MOVWloadidx2", argLength: 3, reg: gploadidx, asm: "MOVWLZX", scale: 2, aux: "SymOff", typ: "UInt16", symEffect: "Read"},
+ {name: "MOVLloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVL", scale: 1, aux: "SymOff", typ: "UInt32", symEffect: "Read"},
+ {name: "MOVLloadidx4", argLength: 3, reg: gploadidx, asm: "MOVL", scale: 4, aux: "SymOff", typ: "UInt32", symEffect: "Read"},
+ {name: "MOVLloadidx8", argLength: 3, reg: gploadidx, asm: "MOVL", scale: 8, aux: "SymOff", typ: "UInt32", symEffect: "Read"},
+ {name: "MOVQloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVQ", scale: 1, aux: "SymOff", typ: "UInt64", symEffect: "Read"},
+ {name: "MOVQloadidx8", argLength: 3, reg: gploadidx, asm: "MOVQ", scale: 8, aux: "SymOff", typ: "UInt64", symEffect: "Read"},
+
+ // MOVxstoreidx: indexed stores
+ // Store (Q=8,L=4,W=2,B=1) low bytes of arg2.
+ // Does *(arg0+scale*arg1+auxint+aux) = arg2, arg3=mem.
+ {name: "MOVBstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVB", scale: 1, aux: "SymOff", symEffect: "Write"},
+ {name: "MOVWstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVW", scale: 1, aux: "SymOff", symEffect: "Write"},
+ {name: "MOVWstoreidx2", argLength: 4, reg: gpstoreidx, asm: "MOVW", scale: 2, aux: "SymOff", symEffect: "Write"},
+ {name: "MOVLstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVL", scale: 1, aux: "SymOff", symEffect: "Write"},
+ {name: "MOVLstoreidx4", argLength: 4, reg: gpstoreidx, asm: "MOVL", scale: 4, aux: "SymOff", symEffect: "Write"},
+ {name: "MOVLstoreidx8", argLength: 4, reg: gpstoreidx, asm: "MOVL", scale: 8, aux: "SymOff", symEffect: "Write"},
+ {name: "MOVQstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVQ", scale: 1, aux: "SymOff", symEffect: "Write"},
+ {name: "MOVQstoreidx8", argLength: 4, reg: gpstoreidx, asm: "MOVQ", scale: 8, aux: "SymOff", symEffect: "Write"},
+
+ // TODO: add size-mismatched indexed loads/stores, like MOVBstoreidx4?
+
+ // MOVxstoreconst: constant stores
+ // Store (O=16,Q=8,L=4,W=2,B=1) constant bytes.
+ // Does *(arg0+ValAndOff(AuxInt).Off()+aux) = ValAndOff(AuxInt).Val(), arg1=mem.
+ // O version can only store the constant 0.
+ {name: "MOVBstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVB", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"},
+ {name: "MOVWstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVW", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"},
+ {name: "MOVLstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVL", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"},
+ {name: "MOVQstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVQ", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"},
+ {name: "MOVOstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVUPS", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"},
+
+ // MOVxstoreconstidx: constant indexed stores
+ // Store (Q=8,L=4,W=2,B=1) constant bytes.
+ // Does *(arg0+scale*arg1+ValAndOff(AuxInt).Off()+aux) = ValAndOff(AuxInt).Val(), arg2=mem.
+ {name: "MOVBstoreconstidx1", argLength: 3, reg: gpstoreconstidx, commutative: true, asm: "MOVB", scale: 1, aux: "SymValAndOff", typ: "Mem", symEffect: "Write"},
+ {name: "MOVWstoreconstidx1", argLength: 3, reg: gpstoreconstidx, commutative: true, asm: "MOVW", scale: 1, aux: "SymValAndOff", typ: "Mem", symEffect: "Write"},
+ {name: "MOVWstoreconstidx2", argLength: 3, reg: gpstoreconstidx, asm: "MOVW", scale: 2, aux: "SymValAndOff", typ: "Mem", symEffect: "Write"},
+ {name: "MOVLstoreconstidx1", argLength: 3, reg: gpstoreconstidx, commutative: true, asm: "MOVL", scale: 1, aux: "SymValAndOff", typ: "Mem", symEffect: "Write"},
+ {name: "MOVLstoreconstidx4", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", scale: 4, aux: "SymValAndOff", typ: "Mem", symEffect: "Write"},
+ {name: "MOVQstoreconstidx1", argLength: 3, reg: gpstoreconstidx, commutative: true, asm: "MOVQ", scale: 1, aux: "SymValAndOff", typ: "Mem", symEffect: "Write"},
+ {name: "MOVQstoreconstidx8", argLength: 3, reg: gpstoreconstidx, asm: "MOVQ", scale: 8, aux: "SymValAndOff", typ: "Mem", symEffect: "Write"},
+
+ // arg0 = pointer to start of memory to zero
+ // arg1 = mem
+ // auxint = # of bytes to zero
+ // returns mem
+ {
+ name: "DUFFZERO",
+ aux: "Int64",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{buildReg("DI")},
+ clobbers: buildReg("DI"),
+ },
+ faultOnNilArg0: true,
+ unsafePoint: true, // FP maintenance around DUFFZERO can be clobbered by interrupts
+ },
+
+ // arg0 = address of memory to zero
+ // arg1 = # of 8-byte words to zero
+ // arg2 = value to store (will always be zero)
+ // arg3 = mem
+ // returns mem
+ {
+ name: "REPSTOSQ",
+ argLength: 4,
+ reg: regInfo{
+ inputs: []regMask{buildReg("DI"), buildReg("CX"), buildReg("AX")},
+ clobbers: buildReg("DI CX"),
+ },
+ faultOnNilArg0: true,
+ },
+
+ // With a register ABI, the actual register info for these instructions (i.e., what is used in regalloc) is augmented with per-call-site bindings of additional arguments to specific in and out registers.
+ {name: "CALLstatic", argLength: -1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). last arg=mem, auxint=argsize, returns mem
+ {name: "CALLtail", argLength: -1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). last arg=mem, auxint=argsize, returns mem
+ {name: "CALLclosure", argLength: -1, reg: regInfo{inputs: []regMask{gpsp, buildReg("DX"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, last arg=mem, auxint=argsize, returns mem
+ {name: "CALLinter", argLength: -1, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, last arg=mem, auxint=argsize, returns mem
+
+ // arg0 = destination pointer
+ // arg1 = source pointer
+ // arg2 = mem
+ // auxint = # of bytes to copy, must be multiple of 16
+ // returns memory
+ {
+ name: "DUFFCOPY",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("DI"), buildReg("SI")},
+ clobbers: buildReg("DI SI X0"), // uses X0 as a temporary
+ },
+ clobberFlags: true,
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts
+ },
+
+ // arg0 = destination pointer
+ // arg1 = source pointer
+ // arg2 = # of 8-byte words to copy
+ // arg3 = mem
+ // returns memory
+ {
+ name: "REPMOVSQ",
+ argLength: 4,
+ reg: regInfo{
+ inputs: []regMask{buildReg("DI"), buildReg("SI"), buildReg("CX")},
+ clobbers: buildReg("DI SI CX"),
+ },
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // (InvertFlags (CMPQ a b)) == (CMPQ b a)
+ // So if we want (SETL (CMPQ a b)) but we can't do that because a is a constant,
+ // then we do (SETL (InvertFlags (CMPQ b a))) instead.
+ // Rewrites will convert this to (SETG (CMPQ b a)).
+ // InvertFlags is a pseudo-op which can't appear in assembly output.
+ {name: "InvertFlags", argLength: 1}, // reverse direction of arg0
+
+ // Pseudo-ops
+ {name: "LoweredGetG", argLength: 1, reg: gp01}, // arg0=mem
+ // Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
+ // and sorts it to the very beginning of the block to prevent other
+ // use of DX (the closure pointer)
+ {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("DX")}}, zeroWidth: true},
+ // LoweredGetCallerPC evaluates to the PC to which its "caller" will return.
+ // I.e., if f calls g "calls" getcallerpc,
+ // the result should be the PC within f that g will return to.
+ // See runtime/stubs.go for a more detailed discussion.
+ {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true},
+ // LoweredGetCallerSP returns the SP of the caller of the current function.
+ {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true},
+ // arg0=ptr, arg1=mem, returns void. Faults if ptr is nil.
+ {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpsp}}, clobberFlags: true, nilCheck: true, faultOnNilArg0: true},
+ // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+ // It saves all GP registers if necessary, but may clobber others.
+ {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("DI"), buildReg("AX CX DX BX BP SI R8 R9")}, clobbers: callerSave &^ (gp | g)}, clobberFlags: true, aux: "Sym", symEffect: "None"},
+
+ {name: "LoweredHasCPUFeature", argLength: 0, reg: gp01, rematerializeable: true, typ: "UInt64", aux: "Sym", symEffect: "None"},
+
+ // There are three of these functions so that they can have three different register inputs.
+ // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
+ // default registers to match so we don't need to copy registers around unnecessarily.
+ {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{dx, bx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
+ {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{cx, dx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
+ {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{ax, cx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
+
+ // Constant flag values. For any comparison, there are 5 possible
+ // outcomes: the three from the signed total order (<,==,>) and the
+ // three from the unsigned total order. The == cases overlap.
+ // Note: there's a sixth "unordered" outcome for floating-point
+ // comparisons, but we don't use such a beast yet.
+ // These ops are for temporary use by rewrite rules. They
+ // cannot appear in the generated assembly.
+ {name: "FlagEQ"}, // equal
+ {name: "FlagLT_ULT"}, // signed < and unsigned <
+ {name: "FlagLT_UGT"}, // signed < and unsigned >
+ {name: "FlagGT_UGT"}, // signed > and unsigned >
+ {name: "FlagGT_ULT"}, // signed > and unsigned <
+
+ // Atomic loads. These are just normal loads but return <value,memory> tuples
+ // so they can be properly ordered with other loads.
+ // load from arg0+auxint+aux. arg1=mem.
+ {name: "MOVBatomicload", argLength: 2, reg: gpload, asm: "MOVB", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+ {name: "MOVLatomicload", argLength: 2, reg: gpload, asm: "MOVL", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+ {name: "MOVQatomicload", argLength: 2, reg: gpload, asm: "MOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+
+ // Atomic stores and exchanges. Stores use XCHG to get the right memory ordering semantics.
+ // store arg0 to arg1+auxint+aux, arg2=mem.
+ // These ops return a tuple of <old contents of *(arg1+auxint+aux), memory>.
+ // Note: arg0 and arg1 are backwards compared to MOVLstore (to facilitate resultInArg0)!
+ {name: "XCHGB", argLength: 3, reg: gpstorexchg, asm: "XCHGB", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, hasSideEffects: true, symEffect: "RdWr"},
+ {name: "XCHGL", argLength: 3, reg: gpstorexchg, asm: "XCHGL", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, hasSideEffects: true, symEffect: "RdWr"},
+ {name: "XCHGQ", argLength: 3, reg: gpstorexchg, asm: "XCHGQ", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, hasSideEffects: true, symEffect: "RdWr"},
+
+ // Atomic adds.
+ // *(arg1+auxint+aux) += arg0. arg2=mem.
+ // Returns a tuple of <old contents of *(arg1+auxint+aux), memory>.
+ // Note: arg0 and arg1 are backwards compared to MOVLstore (to facilitate resultInArg0)!
+ {name: "XADDLlock", argLength: 3, reg: gpstorexchg, asm: "XADDL", typ: "(UInt32,Mem)", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, hasSideEffects: true, symEffect: "RdWr"},
+ {name: "XADDQlock", argLength: 3, reg: gpstorexchg, asm: "XADDQ", typ: "(UInt64,Mem)", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, hasSideEffects: true, symEffect: "RdWr"},
+ {name: "AddTupleFirst32", argLength: 2}, // arg1=tuple <x,y>. Returns <x+arg0,y>.
+ {name: "AddTupleFirst64", argLength: 2}, // arg1=tuple <x,y>. Returns <x+arg0,y>.
+
+ // Compare and swap.
+ // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory.
+ // if *(arg0+auxint+aux) == arg1 {
+ // *(arg0+auxint+aux) = arg2
+ // return (true, memory)
+ // } else {
+ // return (false, memory)
+ // }
+ // Note that these instructions also return the old value in AX, but we ignore it.
+ // TODO: have these return flags instead of bool. The current system generates:
+ // CMPXCHGQ ...
+ // SETEQ AX
+ // CMPB AX, $0
+ // JNE ...
+ // instead of just
+ // CMPXCHGQ ...
+ // JEQ ...
+ // but we can't do that because memory-using ops can't generate flags yet
+ // (flagalloc wants to move flag-generating instructions around).
+ {name: "CMPXCHGLlock", argLength: 4, reg: cmpxchg, asm: "CMPXCHGL", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"},
+ {name: "CMPXCHGQlock", argLength: 4, reg: cmpxchg, asm: "CMPXCHGQ", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"},
+
+ // Atomic memory updates.
+ {name: "ANDBlock", argLength: 3, reg: gpstore, asm: "ANDB", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) &= arg1
+ {name: "ANDLlock", argLength: 3, reg: gpstore, asm: "ANDL", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) &= arg1
+ {name: "ORBlock", argLength: 3, reg: gpstore, asm: "ORB", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) |= arg1
+ {name: "ORLlock", argLength: 3, reg: gpstore, asm: "ORL", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) |= arg1
+
+ // Prefetch instructions
+ // Prefetch the address in arg0. arg0=addr, arg1=memory. The instruction variant selects the locality hint.
+ {name: "PrefetchT0", argLength: 2, reg: prefreg, asm: "PREFETCHT0", hasSideEffects: true},
+ {name: "PrefetchNTA", argLength: 2, reg: prefreg, asm: "PREFETCHNTA", hasSideEffects: true},
+
+ // CPUID feature: BMI1.
+ {name: "ANDNQ", argLength: 2, reg: gp21, asm: "ANDNQ", clobberFlags: true}, // arg0 &^ arg1
+ {name: "ANDNL", argLength: 2, reg: gp21, asm: "ANDNL", clobberFlags: true}, // arg0 &^ arg1
+ {name: "BLSIQ", argLength: 1, reg: gp11, asm: "BLSIQ", clobberFlags: true}, // arg0 & -arg0
+ {name: "BLSIL", argLength: 1, reg: gp11, asm: "BLSIL", clobberFlags: true}, // arg0 & -arg0
+ {name: "BLSMSKQ", argLength: 1, reg: gp11, asm: "BLSMSKQ", clobberFlags: true}, // arg0 ^ (arg0 - 1)
+ {name: "BLSMSKL", argLength: 1, reg: gp11, asm: "BLSMSKL", clobberFlags: true}, // arg0 ^ (arg0 - 1)
+ {name: "BLSRQ", argLength: 1, reg: gp11, asm: "BLSRQ", clobberFlags: true}, // arg0 & (arg0 - 1)
+ {name: "BLSRL", argLength: 1, reg: gp11, asm: "BLSRL", clobberFlags: true}, // arg0 & (arg0 - 1)
+ // Count the number of trailing zero bits. TZCNTQ is preferred over BSFQ because TZCNTQ(0)==64
+ // is well defined, whereas BSFQ(0) is undefined. Likewise, TZCNTL(0)==32.
+ {name: "TZCNTQ", argLength: 1, reg: gp11, asm: "TZCNTQ", clobberFlags: true},
+ {name: "TZCNTL", argLength: 1, reg: gp11, asm: "TZCNTL", clobberFlags: true},
+
+ // CPUID feature: LZCNT.
+ // count the number of leading zero bits.
+ {name: "LZCNTQ", argLength: 1, reg: gp11, asm: "LZCNTQ", typ: "UInt64", clobberFlags: true},
+ {name: "LZCNTL", argLength: 1, reg: gp11, asm: "LZCNTL", typ: "UInt32", clobberFlags: true},
+
+ // CPUID feature: MOVBE
+ // There is no MOVBEWload op: a 16-bit MOVBE load does not zero-extend its result, so only MOVBEWstore is provided.
+ {name: "MOVBEWstore", argLength: 3, reg: gpstore, asm: "MOVBEW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // swap and store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem
+ {name: "MOVBELload", argLength: 2, reg: gpload, asm: "MOVBEL", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load and swap 4 bytes from arg0+auxint+aux. arg1=mem. Zero extend.
+ {name: "MOVBELstore", argLength: 3, reg: gpstore, asm: "MOVBEL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // swap and store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem
+ {name: "MOVBEQload", argLength: 2, reg: gpload, asm: "MOVBEQ", aux: "SymOff", typ: "UInt64", faultOnNilArg0: true, symEffect: "Read"}, // load and swap 8 bytes from arg0+auxint+aux. arg1=mem
+ {name: "MOVBEQstore", argLength: 3, reg: gpstore, asm: "MOVBEQ", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // swap and store 8 bytes in arg1 to arg0+auxint+aux. arg2=mem
+ // indexed MOVBE loads
+ {name: "MOVBELloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVBEL", scale: 1, aux: "SymOff", typ: "UInt32", symEffect: "Read"}, // load and swap 4 bytes from arg0+arg1+auxint+aux. arg2=mem. Zero extend.
+ {name: "MOVBELloadidx4", argLength: 3, reg: gploadidx, asm: "MOVBEL", scale: 4, aux: "SymOff", typ: "UInt32", symEffect: "Read"}, // load and swap 4 bytes from arg0+4*arg1+auxint+aux. arg2=mem. Zero extend.
+ {name: "MOVBELloadidx8", argLength: 3, reg: gploadidx, asm: "MOVBEL", scale: 8, aux: "SymOff", typ: "UInt32", symEffect: "Read"}, // load and swap 4 bytes from arg0+8*arg1+auxint+aux. arg2=mem. Zero extend.
+ {name: "MOVBEQloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVBEQ", scale: 1, aux: "SymOff", typ: "UInt64", symEffect: "Read"}, // load and swap 8 bytes from arg0+arg1+auxint+aux. arg2=mem
+ {name: "MOVBEQloadidx8", argLength: 3, reg: gploadidx, asm: "MOVBEQ", scale: 8, aux: "SymOff", typ: "UInt64", symEffect: "Read"}, // load and swap 8 bytes from arg0+8*arg1+auxint+aux. arg2=mem
+ // indexed MOVBE stores
+ {name: "MOVBEWstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVBEW", scale: 1, aux: "SymOff", symEffect: "Write"}, // swap and store 2 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
+ {name: "MOVBEWstoreidx2", argLength: 4, reg: gpstoreidx, asm: "MOVBEW", scale: 2, aux: "SymOff", symEffect: "Write"}, // swap and store 2 bytes in arg2 to arg0+2*arg1+auxint+aux. arg3=mem
+ {name: "MOVBELstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVBEL", scale: 1, aux: "SymOff", symEffect: "Write"}, // swap and store 4 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
+ {name: "MOVBELstoreidx4", argLength: 4, reg: gpstoreidx, asm: "MOVBEL", scale: 4, aux: "SymOff", symEffect: "Write"}, // swap and store 4 bytes in arg2 to arg0+4*arg1+auxint+aux. arg3=mem
+ {name: "MOVBELstoreidx8", argLength: 4, reg: gpstoreidx, asm: "MOVBEL", scale: 8, aux: "SymOff", symEffect: "Write"}, // swap and store 4 bytes in arg2 to arg0+8*arg1+auxint+aux. arg3=mem
+ {name: "MOVBEQstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVBEQ", scale: 1, aux: "SymOff", symEffect: "Write"}, // swap and store 8 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
+ {name: "MOVBEQstoreidx8", argLength: 4, reg: gpstoreidx, asm: "MOVBEQ", scale: 8, aux: "SymOff", symEffect: "Write"}, // swap and store 8 bytes in arg2 to arg0+8*arg1+auxint+aux. arg3=mem
+
+ // CPUID feature: BMI2.
+ {name: "SARXQ", argLength: 2, reg: gp21, asm: "SARXQ"}, // signed arg0 >> arg1, shift amount is mod 64
+ {name: "SARXL", argLength: 2, reg: gp21, asm: "SARXL"}, // signed int32(arg0) >> arg1, shift amount is mod 32
+ {name: "SHLXQ", argLength: 2, reg: gp21, asm: "SHLXQ"}, // arg0 << arg1, shift amount is mod 64
+ {name: "SHLXL", argLength: 2, reg: gp21, asm: "SHLXL"}, // arg0 << arg1, shift amount is mod 32
+ {name: "SHRXQ", argLength: 2, reg: gp21, asm: "SHRXQ"}, // unsigned arg0 >> arg1, shift amount is mod 64
+ {name: "SHRXL", argLength: 2, reg: gp21, asm: "SHRXL"}, // unsigned uint32(arg0) >> arg1, shift amount is mod 32
+
+ {name: "SARXLload", argLength: 3, reg: gp21shxload, asm: "SARXL", aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // signed *(arg0+auxint+aux) >> arg1, arg2=mem, shift amount is mod 32
+ {name: "SARXQload", argLength: 3, reg: gp21shxload, asm: "SARXQ", aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // signed *(arg0+auxint+aux) >> arg1, arg2=mem, shift amount is mod 64
+ {name: "SHLXLload", argLength: 3, reg: gp21shxload, asm: "SHLXL", aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // *(arg0+auxint+aux) << arg1, arg2=mem, shift amount is mod 32
+ {name: "SHLXQload", argLength: 3, reg: gp21shxload, asm: "SHLXQ", aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // *(arg0+auxint+aux) << arg1, arg2=mem, shift amount is mod 64
+ {name: "SHRXLload", argLength: 3, reg: gp21shxload, asm: "SHRXL", aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+auxint+aux) >> arg1, arg2=mem, shift amount is mod 32
+ {name: "SHRXQload", argLength: 3, reg: gp21shxload, asm: "SHRXQ", aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+auxint+aux) >> arg1, arg2=mem, shift amount is mod 64
+
+ {name: "SARXLloadidx1", argLength: 4, reg: gp21shxloadidx, asm: "SARXL", scale: 1, aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // signed *(arg0+1*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 32
+ {name: "SARXLloadidx4", argLength: 4, reg: gp21shxloadidx, asm: "SARXL", scale: 4, aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // signed *(arg0+4*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 32
+ {name: "SARXLloadidx8", argLength: 4, reg: gp21shxloadidx, asm: "SARXL", scale: 8, aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // signed *(arg0+8*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 32
+ {name: "SARXQloadidx1", argLength: 4, reg: gp21shxloadidx, asm: "SARXQ", scale: 1, aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // signed *(arg0+1*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 64
+ {name: "SARXQloadidx8", argLength: 4, reg: gp21shxloadidx, asm: "SARXQ", scale: 8, aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // signed *(arg0+8*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 64
+ {name: "SHLXLloadidx1", argLength: 4, reg: gp21shxloadidx, asm: "SHLXL", scale: 1, aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // *(arg0+1*arg1+auxint+aux) << arg2, arg3=mem, shift amount is mod 32
+ {name: "SHLXLloadidx4", argLength: 4, reg: gp21shxloadidx, asm: "SHLXL", scale: 4, aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // *(arg0+4*arg1+auxint+aux) << arg2, arg3=mem, shift amount is mod 32
+ {name: "SHLXLloadidx8", argLength: 4, reg: gp21shxloadidx, asm: "SHLXL", scale: 8, aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // *(arg0+8*arg1+auxint+aux) << arg2, arg3=mem, shift amount is mod 32
+ {name: "SHLXQloadidx1", argLength: 4, reg: gp21shxloadidx, asm: "SHLXQ", scale: 1, aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // *(arg0+1*arg1+auxint+aux) << arg2, arg3=mem, shift amount is mod 64
+ {name: "SHLXQloadidx8", argLength: 4, reg: gp21shxloadidx, asm: "SHLXQ", scale: 8, aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // *(arg0+8*arg1+auxint+aux) << arg2, arg3=mem, shift amount is mod 64
+ {name: "SHRXLloadidx1", argLength: 4, reg: gp21shxloadidx, asm: "SHRXL", scale: 1, aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+1*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 32
+ {name: "SHRXLloadidx4", argLength: 4, reg: gp21shxloadidx, asm: "SHRXL", scale: 4, aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+4*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 32
+ {name: "SHRXLloadidx8", argLength: 4, reg: gp21shxloadidx, asm: "SHRXL", scale: 8, aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+8*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 32
+ {name: "SHRXQloadidx1", argLength: 4, reg: gp21shxloadidx, asm: "SHRXQ", scale: 1, aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+1*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 64
+ {name: "SHRXQloadidx8", argLength: 4, reg: gp21shxloadidx, asm: "SHRXQ", scale: 8, aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+8*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 64
+ }
+
+ var AMD64blocks = []blockData{ // AMD64-specific block kinds: sixteen single-control kinds plus the two-control JUMPTABLE
+ {name: "EQ", controls: 1},
+ {name: "NE", controls: 1},
+ {name: "LT", controls: 1},
+ {name: "LE", controls: 1},
+ {name: "GT", controls: 1},
+ {name: "GE", controls: 1},
+ {name: "OS", controls: 1}, // NOTE(review): presumably "overflow set" — confirm
+ {name: "OC", controls: 1}, // NOTE(review): presumably "overflow clear" — confirm
+ {name: "ULT", controls: 1}, // NOTE(review): the U* kinds are presumably the unsigned forms, as in the Flag*_ULT/UGT op names — confirm
+ {name: "ULE", controls: 1},
+ {name: "UGT", controls: 1},
+ {name: "UGE", controls: 1},
+ {name: "EQF", controls: 1}, // NOTE(review): presumably the floating-point form of EQ — confirm
+ {name: "NEF", controls: 1}, // NOTE(review): presumably the floating-point form of NE — confirm
+ {name: "ORD", controls: 1}, // FP, ordered comparison (parity zero)
+ {name: "NAN", controls: 1}, // FP, unordered comparison (parity one)
+
+ // JUMPTABLE implements jump tables.
+ // Aux is the symbol (an *obj.LSym) for the jump table.
+ // control[0] is the index into the jump table.
+ // control[1] is the address of the jump table (the address of the symbol stored in Aux).
+ {name: "JUMPTABLE", controls: 2, aux: "Sym"},
+ }
+
+ archs = append(archs, arch{ // add the AMD64 description (ops, blocks, register names and masks) to archs
+ name: "AMD64",
+ pkg: "cmd/internal/obj/x86", // NOTE(review): presumably the assembler package the ops' asm names refer to — confirm
+ genfile: "../../amd64/ssa.go",
+ ops: AMD64ops,
+ blocks: AMD64blocks,
+ regnames: regNamesAMD64,
+ ParamIntRegNames: "AX BX CX DI SI R8 R9 R10 R11",
+ ParamFloatRegNames: "X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14", // X15 is not listed here; it is the specialregmask below
+ gpregmask: gp,
+ fpregmask: fp,
+ specialregmask: x15,
+ framepointerreg: int8(num["BP"]), // BP's entry in num, narrowed to int8
+ linkreg: -1, // not used
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64latelower.rules b/src/cmd/compile/internal/ssa/_gen/AMD64latelower.rules
new file mode 100644
index 0000000..a1e63d6
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64latelower.rules
@@ -0,0 +1,8 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Prefer the SARX/SHLX/SHRX instructions because they place fewer register restrictions on the shift input.
+(SAR(Q|L) x y) && buildcfg.GOAMD64 >= 3 => (SARX(Q|L) x y)
+(SHL(Q|L) x y) && buildcfg.GOAMD64 >= 3 => (SHLX(Q|L) x y)
+(SHR(Q|L) x y) && buildcfg.GOAMD64 >= 3 => (SHRX(Q|L) x y)
diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64splitload.rules b/src/cmd/compile/internal/ssa/_gen/AMD64splitload.rules
new file mode 100644
index 0000000..dd8f8ac
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64splitload.rules
@@ -0,0 +1,45 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file contains rules used by flagalloc and addressingmodes to
+// split a flag-generating merged load op into separate load and op.
+// Unlike with the other rules files, not all of these
+// rules will be applied to all values.
+// Rather, flagalloc will request that rules be applied
+// to a particular problematic value.
+// These are often the exact inverse of rules in AMD64.rules,
+// only with the conditions removed.
+//
+// For addressingmodes, certain single instructions are slower than the two instruction
+// split generated here (which is different from the inputs to addressingmodes).
+// For example:
+// (CMPBconstload c (ADDQ x y)) -> (CMPBconstloadidx1 c x y) -> (CMPB c (MOVBloadidx1 x y))
+
+(CMP(Q|L|W|B)load {sym} [off] ptr x mem) => (CMP(Q|L|W|B) (MOV(Q|L|W|B)load {sym} [off] ptr mem) x)
+
+(CMP(Q|L|W|B)constload {sym} [vo] ptr mem) && vo.Val() == 0 => (TEST(Q|L|W|B) x:(MOV(Q|L|W|B)load {sym} [vo.Off()] ptr mem) x)
+
+(CMPQconstload {sym} [vo] ptr mem) && vo.Val() != 0 => (CMPQconst (MOVQload {sym} [vo.Off()] ptr mem) [vo.Val()])
+(CMPLconstload {sym} [vo] ptr mem) && vo.Val() != 0 => (CMPLconst (MOVLload {sym} [vo.Off()] ptr mem) [vo.Val()])
+(CMPWconstload {sym} [vo] ptr mem) && vo.Val() != 0 => (CMPWconst (MOVWload {sym} [vo.Off()] ptr mem) [vo.Val16()])
+(CMPBconstload {sym} [vo] ptr mem) && vo.Val() != 0 => (CMPBconst (MOVBload {sym} [vo.Off()] ptr mem) [vo.Val8()])
+
+(CMP(Q|L|W|B)loadidx1 {sym} [off] ptr idx x mem) => (CMP(Q|L|W|B) (MOV(Q|L|W|B)loadidx1 {sym} [off] ptr idx mem) x)
+(CMPQloadidx8 {sym} [off] ptr idx x mem) => (CMPQ (MOVQloadidx8 {sym} [off] ptr idx mem) x)
+(CMPLloadidx4 {sym} [off] ptr idx x mem) => (CMPL (MOVLloadidx4 {sym} [off] ptr idx mem) x)
+(CMPWloadidx2 {sym} [off] ptr idx x mem) => (CMPW (MOVWloadidx2 {sym} [off] ptr idx mem) x)
+
+(CMP(Q|L|W|B)constloadidx1 {sym} [vo] ptr idx mem) && vo.Val() == 0 => (TEST(Q|L|W|B) x:(MOV(Q|L|W|B)loadidx1 {sym} [vo.Off()] ptr idx mem) x)
+(CMPQconstloadidx8 {sym} [vo] ptr idx mem) && vo.Val() == 0 => (TESTQ x:(MOVQloadidx8 {sym} [vo.Off()] ptr idx mem) x)
+(CMPLconstloadidx4 {sym} [vo] ptr idx mem) && vo.Val() == 0 => (TESTL x:(MOVLloadidx4 {sym} [vo.Off()] ptr idx mem) x)
+(CMPWconstloadidx2 {sym} [vo] ptr idx mem) && vo.Val() == 0 => (TESTW x:(MOVWloadidx2 {sym} [vo.Off()] ptr idx mem) x)
+
+(CMPQconstloadidx1 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPQconst (MOVQloadidx1 {sym} [vo.Off()] ptr idx mem) [vo.Val()])
+(CMPLconstloadidx1 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPLconst (MOVLloadidx1 {sym} [vo.Off()] ptr idx mem) [vo.Val()])
+(CMPWconstloadidx1 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPWconst (MOVWloadidx1 {sym} [vo.Off()] ptr idx mem) [vo.Val16()])
+(CMPBconstloadidx1 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPBconst (MOVBloadidx1 {sym} [vo.Off()] ptr idx mem) [vo.Val8()])
+
+(CMPQconstloadidx8 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPQconst (MOVQloadidx8 {sym} [vo.Off()] ptr idx mem) [vo.Val()])
+(CMPLconstloadidx4 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPLconst (MOVLloadidx4 {sym} [vo.Off()] ptr idx mem) [vo.Val()])
+(CMPWconstloadidx2 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPWconst (MOVWloadidx2 {sym} [vo.Off()] ptr idx mem) [vo.Val16()])
diff --git a/src/cmd/compile/internal/ssa/_gen/ARM.rules b/src/cmd/compile/internal/ssa/_gen/ARM.rules
new file mode 100644
index 0000000..e5898b0
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/ARM.rules
@@ -0,0 +1,1474 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+(Add(Ptr|32|16|8) ...) => (ADD ...)
+(Add(32|64)F ...) => (ADD(F|D) ...)
+(Add32carry ...) => (ADDS ...)
+(Add32withcarry ...) => (ADC ...)
+
+(Sub(Ptr|32|16|8) ...) => (SUB ...)
+(Sub(32|64)F ...) => (SUB(F|D) ...)
+(Sub32carry ...) => (SUBS ...)
+(Sub32withcarry ...) => (SBC ...)
+
+(Mul(32|16|8) ...) => (MUL ...)
+(Mul(32|64)F ...) => (MUL(F|D) ...)
+(Hmul(32|32u) ...) => (HMU(L|LU) ...)
+(Mul32uhilo ...) => (MULLU ...)
+
+(Div32 x y) =>
+ (SUB (XOR <typ.UInt32> // negate the result if one operand is negative
+ (Select0 <typ.UInt32> (CALLudiv
+ (SUB <typ.UInt32> (XOR x <typ.UInt32> (Signmask x)) (Signmask x)) // negate x if negative
+ (SUB <typ.UInt32> (XOR y <typ.UInt32> (Signmask y)) (Signmask y)))) // negate y if negative
+ (Signmask (XOR <typ.UInt32> x y))) (Signmask (XOR <typ.UInt32> x y)))
+(Div32u x y) => (Select0 <typ.UInt32> (CALLudiv x y))
+(Div16 x y) => (Div32 (SignExt16to32 x) (SignExt16to32 y))
+(Div16u x y) => (Div32u (ZeroExt16to32 x) (ZeroExt16to32 y))
+(Div8 x y) => (Div32 (SignExt8to32 x) (SignExt8to32 y))
+(Div8u x y) => (Div32u (ZeroExt8to32 x) (ZeroExt8to32 y))
+(Div(32|64)F ...) => (DIV(F|D) ...)
+
+(Mod32 x y) =>
+ (SUB (XOR <typ.UInt32> // negate the result if x is negative
+ (Select1 <typ.UInt32> (CALLudiv
+ (SUB <typ.UInt32> (XOR <typ.UInt32> x (Signmask x)) (Signmask x)) // negate x if negative
+ (SUB <typ.UInt32> (XOR <typ.UInt32> y (Signmask y)) (Signmask y)))) // negate y if negative
+ (Signmask x)) (Signmask x))
+(Mod32u x y) => (Select1 <typ.UInt32> (CALLudiv x y))
+(Mod16 x y) => (Mod32 (SignExt16to32 x) (SignExt16to32 y))
+(Mod16u x y) => (Mod32u (ZeroExt16to32 x) (ZeroExt16to32 y))
+(Mod8 x y) => (Mod32 (SignExt8to32 x) (SignExt8to32 y))
+(Mod8u x y) => (Mod32u (ZeroExt8to32 x) (ZeroExt8to32 y))
+
+// (x + y) / 2 with x>=y -> (x - y) / 2 + y
+(Avg32u <t> x y) => (ADD (SRLconst <t> (SUB <t> x y) [1]) y)
+
+(And(32|16|8) ...) => (AND ...)
+(Or(32|16|8) ...) => (OR ...)
+(Xor(32|16|8) ...) => (XOR ...)
+
+// unary ops
+(Neg(32|16|8) x) => (RSBconst [0] x)
+(Neg(32|64)F ...) => (NEG(F|D) ...)
+
+(Com(32|16|8) ...) => (MVN ...)
+
+(Sqrt ...) => (SQRTD ...)
+(Sqrt32 ...) => (SQRTF ...)
+(Abs ...) => (ABSD ...)
+
+// TODO: optimize this for ARMv5 and ARMv6
+(Ctz32NonZero ...) => (Ctz32 ...)
+(Ctz16NonZero ...) => (Ctz32 ...)
+(Ctz8NonZero ...) => (Ctz32 ...)
+
+// count trailing zeros for ARMv5 and ARMv6
+// 32 - CLZ(x&-x - 1)
+(Ctz32 <t> x) && buildcfg.GOARM<=6 =>
+ (RSBconst [32] (CLZ <t> (SUBconst <t> (AND <t> x (RSBconst <t> [0] x)) [1])))
+(Ctz16 <t> x) && buildcfg.GOARM<=6 =>
+ (RSBconst [32] (CLZ <t> (SUBconst <typ.UInt32> (AND <typ.UInt32> (ORconst <typ.UInt32> [0x10000] x) (RSBconst <typ.UInt32> [0] (ORconst <typ.UInt32> [0x10000] x))) [1])))
+(Ctz8 <t> x) && buildcfg.GOARM<=6 =>
+ (RSBconst [32] (CLZ <t> (SUBconst <typ.UInt32> (AND <typ.UInt32> (ORconst <typ.UInt32> [0x100] x) (RSBconst <typ.UInt32> [0] (ORconst <typ.UInt32> [0x100] x))) [1])))
+
+// count trailing zeros for ARMv7
+(Ctz32 <t> x) && buildcfg.GOARM==7 => (CLZ <t> (RBIT <t> x))
+(Ctz16 <t> x) && buildcfg.GOARM==7 => (CLZ <t> (RBIT <typ.UInt32> (ORconst <typ.UInt32> [0x10000] x)))
+(Ctz8 <t> x) && buildcfg.GOARM==7 => (CLZ <t> (RBIT <typ.UInt32> (ORconst <typ.UInt32> [0x100] x)))
+
+// bit length
+(BitLen32 <t> x) => (RSBconst [32] (CLZ <t> x))
+
+// byte swap for ARMv5
+// let (a, b, c, d) be the bytes of x from high to low
+// t1 = x right rotate 16 bits -- (c, d, a, b )
+// t2 = x ^ t1 -- (a^c, b^d, a^c, b^d)
+// t3 = t2 &^ 0xff0000 -- (a^c, 0, a^c, b^d)
+// t4 = t3 >> 8 -- (0, a^c, 0, a^c)
+// t5 = x right rotate 8 bits -- (d, a, b, c )
+// result = t4 ^ t5 -- (d, c, b, a )
+// using shifted ops this can be done in 4 instructions.
+(Bswap32 <t> x) && buildcfg.GOARM==5 =>
+ (XOR <t>
+ (SRLconst <t> (BICconst <t> (XOR <t> x (SRRconst <t> [16] x)) [0xff0000]) [8])
+ (SRRconst <t> x [8]))
+
+// byte swap for ARMv6 and above
+(Bswap32 x) && buildcfg.GOARM>=6 => (REV x)
+
+// boolean ops -- booleans are represented with 0=false, 1=true
+(AndB ...) => (AND ...)
+(OrB ...) => (OR ...)
+(EqB x y) => (XORconst [1] (XOR <typ.Bool> x y))
+(NeqB ...) => (XOR ...)
+(Not x) => (XORconst [1] x)
+
+// shifts
+// hardware instruction uses only the low byte of the shift
+// we compare to 256 to ensure Go semantics for large shifts
+(Lsh32x32 x y) => (CMOVWHSconst (SLL <x.Type> x y) (CMPconst [256] y) [0])
+(Lsh32x16 x y) => (CMOVWHSconst (SLL <x.Type> x (ZeroExt16to32 y)) (CMPconst [256] (ZeroExt16to32 y)) [0])
+(Lsh32x8 x y) => (SLL x (ZeroExt8to32 y))
+
+(Lsh16x32 x y) => (CMOVWHSconst (SLL <x.Type> x y) (CMPconst [256] y) [0])
+(Lsh16x16 x y) => (CMOVWHSconst (SLL <x.Type> x (ZeroExt16to32 y)) (CMPconst [256] (ZeroExt16to32 y)) [0])
+(Lsh16x8 x y) => (SLL x (ZeroExt8to32 y))
+
+(Lsh8x32 x y) => (CMOVWHSconst (SLL <x.Type> x y) (CMPconst [256] y) [0])
+(Lsh8x16 x y) => (CMOVWHSconst (SLL <x.Type> x (ZeroExt16to32 y)) (CMPconst [256] (ZeroExt16to32 y)) [0])
+(Lsh8x8 x y) => (SLL x (ZeroExt8to32 y))
+
+(Rsh32Ux32 x y) => (CMOVWHSconst (SRL <x.Type> x y) (CMPconst [256] y) [0])
+(Rsh32Ux16 x y) => (CMOVWHSconst (SRL <x.Type> x (ZeroExt16to32 y)) (CMPconst [256] (ZeroExt16to32 y)) [0])
+(Rsh32Ux8 x y) => (SRL x (ZeroExt8to32 y))
+
+(Rsh16Ux32 x y) => (CMOVWHSconst (SRL <x.Type> (ZeroExt16to32 x) y) (CMPconst [256] y) [0])
+(Rsh16Ux16 x y) => (CMOVWHSconst (SRL <x.Type> (ZeroExt16to32 x) (ZeroExt16to32 y)) (CMPconst [256] (ZeroExt16to32 y)) [0])
+(Rsh16Ux8 x y) => (SRL (ZeroExt16to32 x) (ZeroExt8to32 y))
+
+(Rsh8Ux32 x y) => (CMOVWHSconst (SRL <x.Type> (ZeroExt8to32 x) y) (CMPconst [256] y) [0])
+(Rsh8Ux16 x y) => (CMOVWHSconst (SRL <x.Type> (ZeroExt8to32 x) (ZeroExt16to32 y)) (CMPconst [256] (ZeroExt16to32 y)) [0])
+(Rsh8Ux8 x y) => (SRL (ZeroExt8to32 x) (ZeroExt8to32 y))
+
+(Rsh32x32 x y) => (SRAcond x y (CMPconst [256] y))
+(Rsh32x16 x y) => (SRAcond x (ZeroExt16to32 y) (CMPconst [256] (ZeroExt16to32 y)))
+(Rsh32x8 x y) => (SRA x (ZeroExt8to32 y))
+
+(Rsh16x32 x y) => (SRAcond (SignExt16to32 x) y (CMPconst [256] y))
+(Rsh16x16 x y) => (SRAcond (SignExt16to32 x) (ZeroExt16to32 y) (CMPconst [256] (ZeroExt16to32 y)))
+(Rsh16x8 x y) => (SRA (SignExt16to32 x) (ZeroExt8to32 y))
+
+(Rsh8x32 x y) => (SRAcond (SignExt8to32 x) y (CMPconst [256] y))
+(Rsh8x16 x y) => (SRAcond (SignExt8to32 x) (ZeroExt16to32 y) (CMPconst [256] (ZeroExt16to32 y)))
+(Rsh8x8 x y) => (SRA (SignExt8to32 x) (ZeroExt8to32 y))
+
+// constant shifts
+// generic opt rewrites all constant shifts to shift by Const64
+(Lsh32x64 x (Const64 [c])) && uint64(c) < 32 => (SLLconst x [int32(c)])
+(Rsh32x64 x (Const64 [c])) && uint64(c) < 32 => (SRAconst x [int32(c)])
+(Rsh32Ux64 x (Const64 [c])) && uint64(c) < 32 => (SRLconst x [int32(c)])
+(Lsh16x64 x (Const64 [c])) && uint64(c) < 16 => (SLLconst x [int32(c)])
+(Rsh16x64 x (Const64 [c])) && uint64(c) < 16 => (SRAconst (SLLconst <typ.UInt32> x [16]) [int32(c+16)])
+(Rsh16Ux64 x (Const64 [c])) && uint64(c) < 16 => (SRLconst (SLLconst <typ.UInt32> x [16]) [int32(c+16)])
+(Lsh8x64 x (Const64 [c])) && uint64(c) < 8 => (SLLconst x [int32(c)])
+(Rsh8x64 x (Const64 [c])) && uint64(c) < 8 => (SRAconst (SLLconst <typ.UInt32> x [24]) [int32(c+24)])
+(Rsh8Ux64 x (Const64 [c])) && uint64(c) < 8 => (SRLconst (SLLconst <typ.UInt32> x [24]) [int32(c+24)])
+
+// large constant shifts
+(Lsh32x64 _ (Const64 [c])) && uint64(c) >= 32 => (Const32 [0])
+(Rsh32Ux64 _ (Const64 [c])) && uint64(c) >= 32 => (Const32 [0])
+(Lsh16x64 _ (Const64 [c])) && uint64(c) >= 16 => (Const16 [0])
+(Rsh16Ux64 _ (Const64 [c])) && uint64(c) >= 16 => (Const16 [0])
+(Lsh8x64 _ (Const64 [c])) && uint64(c) >= 8 => (Const8 [0])
+(Rsh8Ux64 _ (Const64 [c])) && uint64(c) >= 8 => (Const8 [0])
+
+// large constant signed right shift, we leave the sign bit
+(Rsh32x64 x (Const64 [c])) && uint64(c) >= 32 => (SRAconst x [31])
+(Rsh16x64 x (Const64 [c])) && uint64(c) >= 16 => (SRAconst (SLLconst <typ.UInt32> x [16]) [31])
+(Rsh8x64 x (Const64 [c])) && uint64(c) >= 8 => (SRAconst (SLLconst <typ.UInt32> x [24]) [31])
+
+// constants
+(Const(8|16|32) [val]) => (MOVWconst [int32(val)])
+(Const(32|64)F [val]) => (MOV(F|D)const [float64(val)])
+(ConstNil) => (MOVWconst [0])
+(ConstBool [t]) => (MOVWconst [b2i32(t)])
+
+// truncations
+// Because we ignore high parts of registers, truncates are just copies.
+(Trunc16to8 ...) => (Copy ...)
+(Trunc32to8 ...) => (Copy ...)
+(Trunc32to16 ...) => (Copy ...)
+
+// Zero-/Sign-extensions
+(ZeroExt8to16 ...) => (MOVBUreg ...)
+(ZeroExt8to32 ...) => (MOVBUreg ...)
+(ZeroExt16to32 ...) => (MOVHUreg ...)
+
+(SignExt8to16 ...) => (MOVBreg ...)
+(SignExt8to32 ...) => (MOVBreg ...)
+(SignExt16to32 ...) => (MOVHreg ...)
+
+(Signmask x) => (SRAconst x [31])
+(Zeromask x) => (SRAconst (RSBshiftRL <typ.Int32> x x [1]) [31]) // sign bit of uint32(x)>>1 - x
+(Slicemask <t> x) => (SRAconst (RSBconst <t> [0] x) [31])
+
+// float <-> int conversion
+(Cvt32to32F ...) => (MOVWF ...)
+(Cvt32to64F ...) => (MOVWD ...)
+(Cvt32Uto32F ...) => (MOVWUF ...)
+(Cvt32Uto64F ...) => (MOVWUD ...)
+(Cvt32Fto32 ...) => (MOVFW ...)
+(Cvt64Fto32 ...) => (MOVDW ...)
+(Cvt32Fto32U ...) => (MOVFWU ...)
+(Cvt64Fto32U ...) => (MOVDWU ...)
+(Cvt32Fto64F ...) => (MOVFD ...)
+(Cvt64Fto32F ...) => (MOVDF ...)
+
+(Round(32|64)F ...) => (Copy ...)
+
+(CvtBoolToUint8 ...) => (Copy ...)
+
+// fused-multiply-add
+(FMA x y z) => (FMULAD z x y)
+
+// comparisons: 8- and 16-bit operands are widened to 32 bits before CMP (zero-extended for Eq/Neq and the unsigned forms, sign-extended for signed Less/Leq)
+(Eq8 x y) => (Equal (CMP (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Eq16 x y) => (Equal (CMP (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Eq32 x y) => (Equal (CMP x y))
+(EqPtr x y) => (Equal (CMP x y))
+(Eq(32|64)F x y) => (Equal (CMP(F|D) x y))
+
+(Neq8 x y) => (NotEqual (CMP (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Neq16 x y) => (NotEqual (CMP (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Neq32 x y) => (NotEqual (CMP x y))
+(NeqPtr x y) => (NotEqual (CMP x y))
+(Neq(32|64)F x y) => (NotEqual (CMP(F|D) x y))
+
+(Less8 x y) => (LessThan (CMP (SignExt8to32 x) (SignExt8to32 y)))
+(Less16 x y) => (LessThan (CMP (SignExt16to32 x) (SignExt16to32 y)))
+(Less32 x y) => (LessThan (CMP x y))
+(Less(32|64)F x y) => (GreaterThan (CMP(F|D) y x)) // reverse operands to work around NaN: x < y is tested as y > x
+
+(Less8U x y) => (LessThanU (CMP (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Less16U x y) => (LessThanU (CMP (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Less32U x y) => (LessThanU (CMP x y))
+
+(Leq8 x y) => (LessEqual (CMP (SignExt8to32 x) (SignExt8to32 y)))
+(Leq16 x y) => (LessEqual (CMP (SignExt16to32 x) (SignExt16to32 y)))
+(Leq32 x y) => (LessEqual (CMP x y))
+(Leq(32|64)F x y) => (GreaterEqual (CMP(F|D) y x)) // reverse operands to work around NaN: x <= y is tested as y >= x
+
+(Leq8U x y) => (LessEqualU (CMP (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Leq16U x y) => (LessEqualU (CMP (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Leq32U x y) => (LessEqualU (CMP x y))
+
+(OffPtr [off] ptr:(SP)) => (MOVWaddr [int32(off)] ptr) // base is SP: the offset goes into a MOVWaddr
+(OffPtr [off] ptr) => (ADDconst [int32(off)] ptr) // general form: the offset is added to the pointer as a constant
+
+(Addr {sym} base) => (MOVWaddr {sym} base)
+(LocalAddr {sym} base _) => (MOVWaddr {sym} base) // the third argument of LocalAddr is not carried over
+
+// loads: the load op is selected from the loaded type t (boolean, integer width and signedness, pointer, or float width)
+(Load <t> ptr mem) && t.IsBoolean() => (MOVBUload ptr mem)
+(Load <t> ptr mem) && (is8BitInt(t) && isSigned(t)) => (MOVBload ptr mem)
+(Load <t> ptr mem) && (is8BitInt(t) && !isSigned(t)) => (MOVBUload ptr mem)
+(Load <t> ptr mem) && (is16BitInt(t) && isSigned(t)) => (MOVHload ptr mem)
+(Load <t> ptr mem) && (is16BitInt(t) && !isSigned(t)) => (MOVHUload ptr mem)
+(Load <t> ptr mem) && (is32BitInt(t) || isPtr(t)) => (MOVWload ptr mem)
+(Load <t> ptr mem) && is32BitFloat(t) => (MOVFload ptr mem)
+(Load <t> ptr mem) && is64BitFloat(t) => (MOVDload ptr mem)
+
+// stores: the op is chosen by t.Size(); 4-byte stores also check val.Type for float, and the only 8-byte form in this group is the float64 store
+(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && !is32BitFloat(val.Type) => (MOVWstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (MOVFstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (MOVDstore ptr val mem)
+
+// zero instructions: sizes 0-4 are expanded into explicit stores of constant 0, using halfword/word stores when t.Alignment() permits and byte stores otherwise
+(Zero [0] _ mem) => mem // nothing to zero: the result is the incoming memory
+(Zero [1] ptr mem) => (MOVBstore ptr (MOVWconst [0]) mem)
+(Zero [2] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore ptr (MOVWconst [0]) mem)
+(Zero [2] ptr mem) =>
+ (MOVBstore [1] ptr (MOVWconst [0])
+ (MOVBstore [0] ptr (MOVWconst [0]) mem))
+(Zero [4] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore ptr (MOVWconst [0]) mem)
+(Zero [4] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [2] ptr (MOVWconst [0])
+ (MOVHstore [0] ptr (MOVWconst [0]) mem))
+(Zero [4] ptr mem) =>
+ (MOVBstore [3] ptr (MOVWconst [0])
+ (MOVBstore [2] ptr (MOVWconst [0])
+ (MOVBstore [1] ptr (MOVWconst [0])
+ (MOVBstore [0] ptr (MOVWconst [0]) mem))))
+
+(Zero [3] ptr mem) =>
+ (MOVBstore [2] ptr (MOVWconst [0])
+ (MOVBstore [1] ptr (MOVWconst [0])
+ (MOVBstore [0] ptr (MOVWconst [0]) mem)))
+
+// Medium zeroing uses a duff device: sizes that are a multiple of 4 in (4, 512], with 4-byte alignment, unless config.noDuffDevice is set
+// 4 and 128 are magic constants, see runtime/mkduff.go
+(Zero [s] {t} ptr mem)
+ && s%4 == 0 && s > 4 && s <= 512
+ && t.Alignment()%4 == 0 && !config.noDuffDevice =>
+ (DUFFZERO [4 * (128 - s/4)] ptr (MOVWconst [0]) mem)
+
+// Large zeroing uses a loop (LoweredZero): used when s > 512, when config.noDuffDevice is set, or when the alignment is not a multiple of 4
+(Zero [s] {t} ptr mem)
+ && (s > 512 || config.noDuffDevice) || t.Alignment()%4 != 0 =>
+ (LoweredZero [t.Alignment()]
+ ptr
+ (ADDconst <ptr.Type> ptr [int32(s-moveSize(t.Alignment(), config))]) // ptr + s - moveSize(...); presumably the address of the last unit to zero - confirm against LoweredZero
+ (MOVWconst [0])
+ mem)
+
+// moves: sizes 0-4 are expanded into explicit load/store pairs, using halfword/word accesses when t.Alignment() permits and byte accesses otherwise
+(Move [0] _ _ mem) => mem // nothing to copy: the result is the incoming memory
+(Move [1] dst src mem) => (MOVBstore dst (MOVBUload src mem) mem)
+(Move [2] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore dst (MOVHUload src mem) mem)
+(Move [2] dst src mem) =>
+ (MOVBstore [1] dst (MOVBUload [1] src mem)
+ (MOVBstore dst (MOVBUload src mem) mem))
+(Move [4] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore dst (MOVWload src mem) mem)
+(Move [4] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [2] dst (MOVHUload [2] src mem)
+ (MOVHstore dst (MOVHUload src mem) mem))
+(Move [4] dst src mem) =>
+ (MOVBstore [3] dst (MOVBUload [3] src mem)
+ (MOVBstore [2] dst (MOVBUload [2] src mem)
+ (MOVBstore [1] dst (MOVBUload [1] src mem)
+ (MOVBstore dst (MOVBUload src mem) mem))))
+
+(Move [3] dst src mem) =>
+ (MOVBstore [2] dst (MOVBUload [2] src mem)
+ (MOVBstore [1] dst (MOVBUload [1] src mem)
+ (MOVBstore dst (MOVBUload src mem) mem)))
+
+// Medium move uses a duff device: sizes that are a multiple of 4 in (4, 512], with 4-byte alignment, unless config.noDuffDevice is set; logLargeCopy(v, s) is also part of the condition
+// 8 and 128 are magic constants, see runtime/mkduff.go
+(Move [s] {t} dst src mem)
+ && s%4 == 0 && s > 4 && s <= 512
+ && t.Alignment()%4 == 0 && !config.noDuffDevice && logLargeCopy(v, s) =>
+ (DUFFCOPY [8 * (128 - s/4)] dst src mem)
+
+// Large move uses a loop (LoweredMove): used when s > 512, when config.noDuffDevice is set, or when the alignment is not a multiple of 4
+(Move [s] {t} dst src mem)
+ && ((s > 512 || config.noDuffDevice) || t.Alignment()%4 != 0) && logLargeCopy(v, s) =>
+ (LoweredMove [t.Alignment()]
+ dst
+ src
+ (ADDconst <src.Type> src [int32(s-moveSize(t.Alignment(), config))]) // src + s - moveSize(...); presumably the address of the last unit to copy - confirm against LoweredMove
+ mem)
+
+// calls: each generic call op maps directly onto the corresponding ARM CALL* op
+(StaticCall ...) => (CALLstatic ...)
+(ClosureCall ...) => (CALLclosure ...)
+(InterCall ...) => (CALLinter ...)
+(TailCall ...) => (CALLtail ...)
+
+// checks
+(NilCheck ...) => (LoweredNilCheck ...)
+(IsNonNil ptr) => (NotEqual (CMPconst [0] ptr)) // ptr != 0
+(IsInBounds idx len) => (LessThanU (CMP idx len)) // unsigned idx < len
+(IsSliceInBounds idx len) => (LessEqualU (CMP idx len)) // unsigned idx <= len
+
+// pseudo-ops
+(GetClosurePtr ...) => (LoweredGetClosurePtr ...)
+(GetCallerSP ...) => (LoweredGetCallerSP ...)
+(GetCallerPC ...) => (LoweredGetCallerPC ...)
+
+// Absorb pseudo-ops into blocks: an If controlled by a flags pseudo-op becomes the matching conditional block kind on the flags value cc.
+(If (Equal cc) yes no) => (EQ cc yes no)
+(If (NotEqual cc) yes no) => (NE cc yes no)
+(If (LessThan cc) yes no) => (LT cc yes no)
+(If (LessThanU cc) yes no) => (ULT cc yes no)
+(If (LessEqual cc) yes no) => (LE cc yes no)
+(If (LessEqualU cc) yes no) => (ULE cc yes no)
+(If (GreaterThan cc) yes no) => (GT cc yes no)
+(If (GreaterThanU cc) yes no) => (UGT cc yes no)
+(If (GreaterEqual cc) yes no) => (GE cc yes no)
+(If (GreaterEqualU cc) yes no) => (UGE cc yes no)
+
+(If cond yes no) => (NE (CMPconst [0] cond) yes no) // generic form: compare cond against 0 and branch with NE
+
+// Absorb boolean tests into block: when the value compared against 0 is itself a flags pseudo-op, branch on its flags cc directly
+(NE (CMPconst [0] (Equal cc)) yes no) => (EQ cc yes no)
+(NE (CMPconst [0] (NotEqual cc)) yes no) => (NE cc yes no)
+(NE (CMPconst [0] (LessThan cc)) yes no) => (LT cc yes no)
+(NE (CMPconst [0] (LessThanU cc)) yes no) => (ULT cc yes no)
+(NE (CMPconst [0] (LessEqual cc)) yes no) => (LE cc yes no)
+(NE (CMPconst [0] (LessEqualU cc)) yes no) => (ULE cc yes no)
+(NE (CMPconst [0] (GreaterThan cc)) yes no) => (GT cc yes no)
+(NE (CMPconst [0] (GreaterThanU cc)) yes no) => (UGT cc yes no)
+(NE (CMPconst [0] (GreaterEqual cc)) yes no) => (GE cc yes no)
+(NE (CMPconst [0] (GreaterEqualU cc)) yes no) => (UGE cc yes no)
+
+// Write barrier.
+(WB ...) => (LoweredWB ...)
+
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem) // boundsABI(kind) 0/1/2 selects the A/B/C variant
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem)
+
+(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 0 => (LoweredPanicExtendA [kind] hi lo y mem) // same A/B/C selection, with a hi/lo argument pair
+(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 1 => (LoweredPanicExtendB [kind] hi lo y mem)
+(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 2 => (LoweredPanicExtendC [kind] hi lo y mem)
+
+// Optimizations
+
+// fold offset into address: a constant added to (or subtracted from) a MOVWaddr is merged into the MOVWaddr offset
+(ADDconst [off1] (MOVWaddr [off2] {sym} ptr)) => (MOVWaddr [off1+off2] {sym} ptr)
+(SUBconst [off1] (MOVWaddr [off2] {sym} ptr)) => (MOVWaddr [off2-off1] {sym} ptr)
+
+// fold address into load/store: an ADDconst/SUBconst on the pointer is merged into the memory op's offset (added for ADDconst, subtracted for SUBconst)
+(MOVBload [off1] {sym} (ADDconst [off2] ptr) mem) => (MOVBload [off1+off2] {sym} ptr mem)
+(MOVBload [off1] {sym} (SUBconst [off2] ptr) mem) => (MOVBload [off1-off2] {sym} ptr mem)
+(MOVBUload [off1] {sym} (ADDconst [off2] ptr) mem) => (MOVBUload [off1+off2] {sym} ptr mem)
+(MOVBUload [off1] {sym} (SUBconst [off2] ptr) mem) => (MOVBUload [off1-off2] {sym} ptr mem)
+(MOVHload [off1] {sym} (ADDconst [off2] ptr) mem) => (MOVHload [off1+off2] {sym} ptr mem)
+(MOVHload [off1] {sym} (SUBconst [off2] ptr) mem) => (MOVHload [off1-off2] {sym} ptr mem)
+(MOVHUload [off1] {sym} (ADDconst [off2] ptr) mem) => (MOVHUload [off1+off2] {sym} ptr mem)
+(MOVHUload [off1] {sym} (SUBconst [off2] ptr) mem) => (MOVHUload [off1-off2] {sym} ptr mem)
+(MOVWload [off1] {sym} (ADDconst [off2] ptr) mem) => (MOVWload [off1+off2] {sym} ptr mem)
+(MOVWload [off1] {sym} (SUBconst [off2] ptr) mem) => (MOVWload [off1-off2] {sym} ptr mem)
+(MOVFload [off1] {sym} (ADDconst [off2] ptr) mem) => (MOVFload [off1+off2] {sym} ptr mem)
+(MOVFload [off1] {sym} (SUBconst [off2] ptr) mem) => (MOVFload [off1-off2] {sym} ptr mem)
+(MOVDload [off1] {sym} (ADDconst [off2] ptr) mem) => (MOVDload [off1+off2] {sym} ptr mem)
+(MOVDload [off1] {sym} (SUBconst [off2] ptr) mem) => (MOVDload [off1-off2] {sym} ptr mem)
+
+(MOVBstore [off1] {sym} (ADDconst [off2] ptr) val mem) => (MOVBstore [off1+off2] {sym} ptr val mem)
+(MOVBstore [off1] {sym} (SUBconst [off2] ptr) val mem) => (MOVBstore [off1-off2] {sym} ptr val mem)
+(MOVHstore [off1] {sym} (ADDconst [off2] ptr) val mem) => (MOVHstore [off1+off2] {sym} ptr val mem)
+(MOVHstore [off1] {sym} (SUBconst [off2] ptr) val mem) => (MOVHstore [off1-off2] {sym} ptr val mem)
+(MOVWstore [off1] {sym} (ADDconst [off2] ptr) val mem) => (MOVWstore [off1+off2] {sym} ptr val mem)
+(MOVWstore [off1] {sym} (SUBconst [off2] ptr) val mem) => (MOVWstore [off1-off2] {sym} ptr val mem)
+(MOVFstore [off1] {sym} (ADDconst [off2] ptr) val mem) => (MOVFstore [off1+off2] {sym} ptr val mem)
+(MOVFstore [off1] {sym} (SUBconst [off2] ptr) val mem) => (MOVFstore [off1-off2] {sym} ptr val mem)
+(MOVDstore [off1] {sym} (ADDconst [off2] ptr) val mem) => (MOVDstore [off1+off2] {sym} ptr val mem)
+(MOVDstore [off1] {sym} (SUBconst [off2] ptr) val mem) => (MOVDstore [off1-off2] {sym} ptr val mem)
+
+(MOVBload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) => // MOVWaddr base: offsets are added and the symbols combined with mergeSym, only when canMergeSym allows
+ (MOVBload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVBUload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVBUload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVHload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHUload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVHUload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVWload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVFload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVFload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVDload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVDload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+
+(MOVBstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) =>
+ (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVHstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) =>
+ (MOVHstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVWstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) =>
+ (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVFstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) =>
+ (MOVFstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVDstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) =>
+ (MOVDstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+
+// replace load from same location as preceding store with zero/sign extension (or copy in case of full width); the load's memory argument must be that store, with equal sym and offset and isSamePtr pointers
+(MOVBload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBreg x)
+(MOVBUload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBUreg x)
+(MOVHload [off] {sym} ptr (MOVHstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVHreg x)
+(MOVHUload [off] {sym} ptr (MOVHstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVHUreg x)
+(MOVWload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x
+
+(MOVFload [off] {sym} ptr (MOVFstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x
+(MOVDload [off] {sym} ptr (MOVDstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x
+
+(MOVWloadidx ptr idx (MOVWstoreidx ptr2 idx x _)) && isSamePtr(ptr, ptr2) => x
+(MOVWloadshiftLL ptr idx [c] (MOVWstoreshiftLL ptr2 idx [d] x _)) && c==d && isSamePtr(ptr, ptr2) => x // shifted-index forms additionally require equal shift amounts (c==d)
+(MOVWloadshiftRL ptr idx [c] (MOVWstoreshiftRL ptr2 idx [d] x _)) && c==d && isSamePtr(ptr, ptr2) => x
+(MOVWloadshiftRA ptr idx [c] (MOVWstoreshiftRA ptr2 idx [d] x _)) && c==d && isSamePtr(ptr, ptr2) => x
+(MOVBUloadidx ptr idx (MOVBstoreidx ptr2 idx x _)) && isSamePtr(ptr, ptr2) => (MOVBUreg x)
+(MOVBloadidx ptr idx (MOVBstoreidx ptr2 idx x _)) && isSamePtr(ptr, ptr2) => (MOVBreg x)
+(MOVHUloadidx ptr idx (MOVHstoreidx ptr2 idx x _)) && isSamePtr(ptr, ptr2) => (MOVHUreg x)
+(MOVHloadidx ptr idx (MOVHstoreidx ptr2 idx x _)) && isSamePtr(ptr, ptr2) => (MOVHreg x)
+
+// fold constant into arithmetic ops: a MOVWconst operand is moved into the aux field of the *const form of the op
+(ADD x (MOVWconst [c])) => (ADDconst [c] x)
+(SUB (MOVWconst [c]) x) => (RSBconst [c] x) // constant on the left: c - x becomes a reverse subtract
+(SUB x (MOVWconst [c])) => (SUBconst [c] x)
+(RSB (MOVWconst [c]) x) => (SUBconst [c] x) // RSB with the constant first is a plain subtract of c from x
+(RSB x (MOVWconst [c])) => (RSBconst [c] x)
+
+(ADDS x (MOVWconst [c])) => (ADDSconst [c] x)
+(SUBS x (MOVWconst [c])) => (SUBSconst [c] x)
+
+(ADC (MOVWconst [c]) x flags) => (ADCconst [c] x flags)
+(SBC (MOVWconst [c]) x flags) => (RSCconst [c] x flags) // constant on the left: use the reversed form RSC
+(SBC x (MOVWconst [c]) flags) => (SBCconst [c] x flags)
+
+(AND x (MOVWconst [c])) => (ANDconst [c] x)
+(OR x (MOVWconst [c])) => (ORconst [c] x)
+(XOR x (MOVWconst [c])) => (XORconst [c] x)
+(BIC x (MOVWconst [c])) => (BICconst [c] x)
+
+(SLL x (MOVWconst [c])) && 0 <= c && c < 32 => (SLLconst x [c]) // shifts fold only for amounts in [0, 32)
+(SRL x (MOVWconst [c])) && 0 <= c && c < 32 => (SRLconst x [c])
+(SRA x (MOVWconst [c])) && 0 <= c && c < 32 => (SRAconst x [c])
+
+(CMP x (MOVWconst [c])) => (CMPconst [c] x)
+(CMP (MOVWconst [c]) x) => (InvertFlags (CMPconst [c] x)) // constant on the left: compare x with c and invert the flags
+(CMN x (MOVWconst [c])) => (CMNconst [c] x)
+(TST x (MOVWconst [c])) => (TSTconst [c] x)
+(TEQ x (MOVWconst [c])) => (TEQconst [c] x)
+
+(SRR x (MOVWconst [c])) => (SRRconst x [c&31]) // the amount is reduced with c&31
+
+// Canonicalize the order of arguments to comparisons - helps with CSE.
+(CMP x y) && canonLessThan(x,y) => (InvertFlags (CMP y x)) // operands swapped when canonLessThan(x,y) holds; InvertFlags compensates for the swap
+
+// don't extend after proper load: the listed loads are of equal or narrower width and compatible signedness, so the extension is replaced by MOVWreg
+// MOVWreg instruction is not emitted if src and dst registers are same, but it ensures the type.
+(MOVBreg x:(MOVBload _ _)) => (MOVWreg x)
+(MOVBUreg x:(MOVBUload _ _)) => (MOVWreg x)
+(MOVHreg x:(MOVBload _ _)) => (MOVWreg x)
+(MOVHreg x:(MOVBUload _ _)) => (MOVWreg x)
+(MOVHreg x:(MOVHload _ _)) => (MOVWreg x)
+(MOVHUreg x:(MOVBUload _ _)) => (MOVWreg x)
+(MOVHUreg x:(MOVHUload _ _)) => (MOVWreg x)
+
+// fold extensions and ANDs together: the extension is merged into the AND mask; the signed forms apply only when the mask clears the sign bit of the narrow value
+(MOVBUreg (ANDconst [c] x)) => (ANDconst [c&0xff] x)
+(MOVHUreg (ANDconst [c] x)) => (ANDconst [c&0xffff] x)
+(MOVBreg (ANDconst [c] x)) && c & 0x80 == 0 => (ANDconst [c&0x7f] x)
+(MOVHreg (ANDconst [c] x)) && c & 0x8000 == 0 => (ANDconst [c&0x7fff] x)
+
+// fold double extensions: an extension applied to one of the listed extensions is replaced by MOVWreg
+(MOVBreg x:(MOVBreg _)) => (MOVWreg x)
+(MOVBUreg x:(MOVBUreg _)) => (MOVWreg x)
+(MOVHreg x:(MOVBreg _)) => (MOVWreg x)
+(MOVHreg x:(MOVBUreg _)) => (MOVWreg x)
+(MOVHreg x:(MOVHreg _)) => (MOVWreg x)
+(MOVHUreg x:(MOVBUreg _)) => (MOVWreg x)
+(MOVHUreg x:(MOVHUreg _)) => (MOVWreg x)
+
+// don't extend before store: the store is given the unextended value x directly
+(MOVBstore [off] {sym} ptr (MOVBreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+
+// if a register move has only 1 use, just use the same register without emitting instruction
+// MOVWnop doesn't emit instruction, only for ensuring the type.
+(MOVWreg x) && x.Uses == 1 => (MOVWnop x)
+
+// TODO: we should be able to get rid of MOVWnop altogether.
+// But for now, this is enough to get rid of lots of them.
+(MOVWnop (MOVWconst [c])) => (MOVWconst [c])
+
+// mul by constant: MUL/MULA/MULS by c in {-1, 0, 1, 2^n, 2^n+1, 2^n-1, 3*2^n, 5*2^n, 7*2^n, 9*2^n} is replaced by shifts and shifted add / reverse-subtract
+(MUL x (MOVWconst [c])) && int32(c) == -1 => (RSBconst [0] x)
+(MUL _ (MOVWconst [0])) => (MOVWconst [0])
+(MUL x (MOVWconst [1])) => x
+(MUL x (MOVWconst [c])) && isPowerOfTwo32(c) => (SLLconst [int32(log32(c))] x)
+(MUL x (MOVWconst [c])) && isPowerOfTwo32(c-1) && c >= 3 => (ADDshiftLL x x [int32(log32(c-1))])
+(MUL x (MOVWconst [c])) && isPowerOfTwo32(c+1) && c >= 7 => (RSBshiftLL x x [int32(log32(c+1))])
+(MUL x (MOVWconst [c])) && c%3 == 0 && isPowerOfTwo32(c/3) => (SLLconst [int32(log32(c/3))] (ADDshiftLL <x.Type> x x [1]))
+(MUL x (MOVWconst [c])) && c%5 == 0 && isPowerOfTwo32(c/5) => (SLLconst [int32(log32(c/5))] (ADDshiftLL <x.Type> x x [2]))
+(MUL x (MOVWconst [c])) && c%7 == 0 && isPowerOfTwo32(c/7) => (SLLconst [int32(log32(c/7))] (RSBshiftLL <x.Type> x x [3]))
+(MUL x (MOVWconst [c])) && c%9 == 0 && isPowerOfTwo32(c/9) => (SLLconst [int32(log32(c/9))] (ADDshiftLL <x.Type> x x [3]))
+
+(MULA x (MOVWconst [c]) a) && c == -1 => (SUB a x) // MULA x y a is x*y + a (see the c == 1 rule), so each product form below is added to a
+(MULA _ (MOVWconst [0]) a) => a
+(MULA x (MOVWconst [1]) a) => (ADD x a)
+(MULA x (MOVWconst [c]) a) && isPowerOfTwo32(c) => (ADD (SLLconst <x.Type> [int32(log32(c))] x) a)
+(MULA x (MOVWconst [c]) a) && isPowerOfTwo32(c-1) && c >= 3 => (ADD (ADDshiftLL <x.Type> x x [int32(log32(c-1))]) a)
+(MULA x (MOVWconst [c]) a) && isPowerOfTwo32(c+1) && c >= 7 => (ADD (RSBshiftLL <x.Type> x x [int32(log32(c+1))]) a)
+(MULA x (MOVWconst [c]) a) && c%3 == 0 && isPowerOfTwo32(c/3) => (ADD (SLLconst <x.Type> [int32(log32(c/3))] (ADDshiftLL <x.Type> x x [1])) a)
+(MULA x (MOVWconst [c]) a) && c%5 == 0 && isPowerOfTwo32(c/5) => (ADD (SLLconst <x.Type> [int32(log32(c/5))] (ADDshiftLL <x.Type> x x [2])) a)
+(MULA x (MOVWconst [c]) a) && c%7 == 0 && isPowerOfTwo32(c/7) => (ADD (SLLconst <x.Type> [int32(log32(c/7))] (RSBshiftLL <x.Type> x x [3])) a)
+(MULA x (MOVWconst [c]) a) && c%9 == 0 && isPowerOfTwo32(c/9) => (ADD (SLLconst <x.Type> [int32(log32(c/9))] (ADDshiftLL <x.Type> x x [3])) a)
+
+(MULA (MOVWconst [c]) x a) && c == -1 => (SUB a x) // same MULA rules with the constant as the first operand
+(MULA (MOVWconst [0]) _ a) => a
+(MULA (MOVWconst [1]) x a) => (ADD x a)
+(MULA (MOVWconst [c]) x a) && isPowerOfTwo32(c) => (ADD (SLLconst <x.Type> [int32(log32(c))] x) a)
+(MULA (MOVWconst [c]) x a) && isPowerOfTwo32(c-1) && c >= 3 => (ADD (ADDshiftLL <x.Type> x x [int32(log32(c-1))]) a)
+(MULA (MOVWconst [c]) x a) && isPowerOfTwo32(c+1) && c >= 7 => (ADD (RSBshiftLL <x.Type> x x [int32(log32(c+1))]) a)
+(MULA (MOVWconst [c]) x a) && c%3 == 0 && isPowerOfTwo32(c/3) => (ADD (SLLconst <x.Type> [int32(log32(c/3))] (ADDshiftLL <x.Type> x x [1])) a)
+(MULA (MOVWconst [c]) x a) && c%5 == 0 && isPowerOfTwo32(c/5) => (ADD (SLLconst <x.Type> [int32(log32(c/5))] (ADDshiftLL <x.Type> x x [2])) a)
+(MULA (MOVWconst [c]) x a) && c%7 == 0 && isPowerOfTwo32(c/7) => (ADD (SLLconst <x.Type> [int32(log32(c/7))] (RSBshiftLL <x.Type> x x [3])) a)
+(MULA (MOVWconst [c]) x a) && c%9 == 0 && isPowerOfTwo32(c/9) => (ADD (SLLconst <x.Type> [int32(log32(c/9))] (ADDshiftLL <x.Type> x x [3])) a)
+
+(MULS x (MOVWconst [c]) a) && c == -1 => (ADD a x) // MULS x y a is a - x*y (see the c == 1 rule), so each product form below is reverse-subtracted from a
+(MULS _ (MOVWconst [0]) a) => a
+(MULS x (MOVWconst [1]) a) => (RSB x a)
+(MULS x (MOVWconst [c]) a) && isPowerOfTwo32(c) => (RSB (SLLconst <x.Type> [int32(log32(c))] x) a)
+(MULS x (MOVWconst [c]) a) && isPowerOfTwo32(c-1) && c >= 3 => (RSB (ADDshiftLL <x.Type> x x [int32(log32(c-1))]) a)
+(MULS x (MOVWconst [c]) a) && isPowerOfTwo32(c+1) && c >= 7 => (RSB (RSBshiftLL <x.Type> x x [int32(log32(c+1))]) a)
+(MULS x (MOVWconst [c]) a) && c%3 == 0 && isPowerOfTwo32(c/3) => (RSB (SLLconst <x.Type> [int32(log32(c/3))] (ADDshiftLL <x.Type> x x [1])) a)
+(MULS x (MOVWconst [c]) a) && c%5 == 0 && isPowerOfTwo32(c/5) => (RSB (SLLconst <x.Type> [int32(log32(c/5))] (ADDshiftLL <x.Type> x x [2])) a)
+(MULS x (MOVWconst [c]) a) && c%7 == 0 && isPowerOfTwo32(c/7) => (RSB (SLLconst <x.Type> [int32(log32(c/7))] (RSBshiftLL <x.Type> x x [3])) a)
+(MULS x (MOVWconst [c]) a) && c%9 == 0 && isPowerOfTwo32(c/9) => (RSB (SLLconst <x.Type> [int32(log32(c/9))] (ADDshiftLL <x.Type> x x [3])) a)
+
+(MULS (MOVWconst [c]) x a) && c == -1 => (ADD a x) // same MULS rules with the constant as the first operand
+(MULS (MOVWconst [0]) _ a) => a
+(MULS (MOVWconst [1]) x a) => (RSB x a)
+(MULS (MOVWconst [c]) x a) && isPowerOfTwo32(c) => (RSB (SLLconst <x.Type> [int32(log32(c))] x) a)
+(MULS (MOVWconst [c]) x a) && isPowerOfTwo32(c-1) && c >= 3 => (RSB (ADDshiftLL <x.Type> x x [int32(log32(c-1))]) a)
+(MULS (MOVWconst [c]) x a) && isPowerOfTwo32(c+1) && c >= 7 => (RSB (RSBshiftLL <x.Type> x x [int32(log32(c+1))]) a)
+(MULS (MOVWconst [c]) x a) && c%3 == 0 && isPowerOfTwo32(c/3) => (RSB (SLLconst <x.Type> [int32(log32(c/3))] (ADDshiftLL <x.Type> x x [1])) a)
+(MULS (MOVWconst [c]) x a) && c%5 == 0 && isPowerOfTwo32(c/5) => (RSB (SLLconst <x.Type> [int32(log32(c/5))] (ADDshiftLL <x.Type> x x [2])) a)
+(MULS (MOVWconst [c]) x a) && c%7 == 0 && isPowerOfTwo32(c/7) => (RSB (SLLconst <x.Type> [int32(log32(c/7))] (RSBshiftLL <x.Type> x x [3])) a)
+(MULS (MOVWconst [c]) x a) && c%9 == 0 && isPowerOfTwo32(c/9) => (RSB (SLLconst <x.Type> [int32(log32(c/9))] (ADDshiftLL <x.Type> x x [3])) a)
+
+// div by constant: for CALLudiv by 1 or a power of two, Select0 is rewritten to x or a right shift and Select1 to 0 or a mask with c-1
+(Select0 (CALLudiv x (MOVWconst [1]))) => x
+(Select1 (CALLudiv _ (MOVWconst [1]))) => (MOVWconst [0])
+(Select0 (CALLudiv x (MOVWconst [c]))) && isPowerOfTwo32(c) => (SRLconst [int32(log32(c))] x)
+(Select1 (CALLudiv x (MOVWconst [c]))) && isPowerOfTwo32(c) => (ANDconst [c-1] x)
+
+// constant comparisons: both operands are constants, so the flags are computed at compile time
+(CMPconst (MOVWconst [x]) [y]) => (FlagConstant [subFlags32(x,y)])
+(CMNconst (MOVWconst [x]) [y]) => (FlagConstant [addFlags32(x,y)])
+(TSTconst (MOVWconst [x]) [y]) => (FlagConstant [logicFlags32(x&y)])
+(TEQconst (MOVWconst [x]) [y]) => (FlagConstant [logicFlags32(x^y)])
+
+// other known comparisons: the conditions bound the left operand below the constant, and the result is the flags of subFlags32(0, 1)
+(CMPconst (MOVBUreg _) [c]) && 0xff < c => (FlagConstant [subFlags32(0, 1)])
+(CMPconst (MOVHUreg _) [c]) && 0xffff < c => (FlagConstant [subFlags32(0, 1)])
+(CMPconst (ANDconst _ [m]) [n]) && 0 <= m && m < n => (FlagConstant [subFlags32(0, 1)])
+(CMPconst (SRLconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 32 && (1<<uint32(32-c)) <= uint32(n) => (FlagConstant [subFlags32(0, 1)])
+
+// absorb flag constants into branches: with a FlagConstant control the predicate is evaluated on fc, and the block becomes First with yes or no placed first accordingly
+(EQ (FlagConstant [fc]) yes no) && fc.eq() => (First yes no)
+(EQ (FlagConstant [fc]) yes no) && !fc.eq() => (First no yes)
+
+(NE (FlagConstant [fc]) yes no) && fc.ne() => (First yes no)
+(NE (FlagConstant [fc]) yes no) && !fc.ne() => (First no yes)
+
+(LT (FlagConstant [fc]) yes no) && fc.lt() => (First yes no)
+(LT (FlagConstant [fc]) yes no) && !fc.lt() => (First no yes)
+
+(LE (FlagConstant [fc]) yes no) && fc.le() => (First yes no)
+(LE (FlagConstant [fc]) yes no) && !fc.le() => (First no yes)
+
+(GT (FlagConstant [fc]) yes no) && fc.gt() => (First yes no)
+(GT (FlagConstant [fc]) yes no) && !fc.gt() => (First no yes)
+
+(GE (FlagConstant [fc]) yes no) && fc.ge() => (First yes no)
+(GE (FlagConstant [fc]) yes no) && !fc.ge() => (First no yes)
+
+(ULT (FlagConstant [fc]) yes no) && fc.ult() => (First yes no)
+(ULT (FlagConstant [fc]) yes no) && !fc.ult() => (First no yes)
+
+(ULE (FlagConstant [fc]) yes no) && fc.ule() => (First yes no)
+(ULE (FlagConstant [fc]) yes no) && !fc.ule() => (First no yes)
+
+(UGT (FlagConstant [fc]) yes no) && fc.ugt() => (First yes no)
+(UGT (FlagConstant [fc]) yes no) && !fc.ugt() => (First no yes)
+
+(UGE (FlagConstant [fc]) yes no) && fc.uge() => (First yes no)
+(UGE (FlagConstant [fc]) yes no) && !fc.uge() => (First no yes)
+
+(LTnoov (FlagConstant [fc]) yes no) && fc.ltNoov() => (First yes no)
+(LTnoov (FlagConstant [fc]) yes no) && !fc.ltNoov() => (First no yes)
+
+(LEnoov (FlagConstant [fc]) yes no) && fc.leNoov() => (First yes no)
+(LEnoov (FlagConstant [fc]) yes no) && !fc.leNoov() => (First no yes)
+
+(GTnoov (FlagConstant [fc]) yes no) && fc.gtNoov() => (First yes no)
+(GTnoov (FlagConstant [fc]) yes no) && !fc.gtNoov() => (First no yes)
+
+(GEnoov (FlagConstant [fc]) yes no) && fc.geNoov() => (First yes no)
+(GEnoov (FlagConstant [fc]) yes no) && !fc.geNoov() => (First no yes)
+
+// absorb InvertFlags into branches: ordered conditions swap with their mirror (LT<->GT, LE<->GE, likewise for the unsigned and noov forms); EQ and NE are unchanged
+(LT (InvertFlags cmp) yes no) => (GT cmp yes no)
+(GT (InvertFlags cmp) yes no) => (LT cmp yes no)
+(LE (InvertFlags cmp) yes no) => (GE cmp yes no)
+(GE (InvertFlags cmp) yes no) => (LE cmp yes no)
+(ULT (InvertFlags cmp) yes no) => (UGT cmp yes no)
+(UGT (InvertFlags cmp) yes no) => (ULT cmp yes no)
+(ULE (InvertFlags cmp) yes no) => (UGE cmp yes no)
+(UGE (InvertFlags cmp) yes no) => (ULE cmp yes no)
+(EQ (InvertFlags cmp) yes no) => (EQ cmp yes no)
+(NE (InvertFlags cmp) yes no) => (NE cmp yes no)
+(LTnoov (InvertFlags cmp) yes no) => (GTnoov cmp yes no)
+(GEnoov (InvertFlags cmp) yes no) => (LEnoov cmp yes no)
+(LEnoov (InvertFlags cmp) yes no) => (GEnoov cmp yes no)
+(GTnoov (InvertFlags cmp) yes no) => (LTnoov cmp yes no)
+
+// absorb flag constants into boolean values: the predicate is evaluated on fc and materialized as a constant via b2i32
+(Equal (FlagConstant [fc])) => (MOVWconst [b2i32(fc.eq())])
+(NotEqual (FlagConstant [fc])) => (MOVWconst [b2i32(fc.ne())])
+(LessThan (FlagConstant [fc])) => (MOVWconst [b2i32(fc.lt())])
+(LessThanU (FlagConstant [fc])) => (MOVWconst [b2i32(fc.ult())])
+(LessEqual (FlagConstant [fc])) => (MOVWconst [b2i32(fc.le())])
+(LessEqualU (FlagConstant [fc])) => (MOVWconst [b2i32(fc.ule())])
+(GreaterThan (FlagConstant [fc])) => (MOVWconst [b2i32(fc.gt())])
+(GreaterThanU (FlagConstant [fc])) => (MOVWconst [b2i32(fc.ugt())])
+(GreaterEqual (FlagConstant [fc])) => (MOVWconst [b2i32(fc.ge())])
+(GreaterEqualU (FlagConstant [fc])) => (MOVWconst [b2i32(fc.uge())])
+
+// absorb InvertFlags into boolean values: ordered predicates swap with their mirror; Equal and NotEqual are unchanged
+(Equal (InvertFlags x)) => (Equal x)
+(NotEqual (InvertFlags x)) => (NotEqual x)
+(LessThan (InvertFlags x)) => (GreaterThan x)
+(LessThanU (InvertFlags x)) => (GreaterThanU x)
+(GreaterThan (InvertFlags x)) => (LessThan x)
+(GreaterThanU (InvertFlags x)) => (LessThanU x)
+(LessEqual (InvertFlags x)) => (GreaterEqual x)
+(LessEqualU (InvertFlags x)) => (GreaterEqualU x)
+(GreaterEqual (InvertFlags x)) => (LessEqual x)
+(GreaterEqualU (InvertFlags x)) => (LessEqualU x)
+
+// absorb flag constants into conditional instructions
+(CMOVWLSconst _ (FlagConstant [fc]) [c]) && fc.ule() => (MOVWconst [c]) // fc.ule() holds: the result is the constant c
+(CMOVWLSconst x (FlagConstant [fc]) [c]) && fc.ugt() => x // fc.ugt() holds: the result is x unchanged
+
+(CMOVWHSconst _ (FlagConstant [fc]) [c]) && fc.uge() => (MOVWconst [c]) // fc.uge() holds: the result is the constant c
+(CMOVWHSconst x (FlagConstant [fc]) [c]) && fc.ult() => x // fc.ult() holds: the result is x unchanged
+
+(CMOVWLSconst x (InvertFlags flags) [c]) => (CMOVWHSconst x flags [c])
+(CMOVWHSconst x (InvertFlags flags) [c]) => (CMOVWLSconst x flags [c])
+
+(SRAcond x _ (FlagConstant [fc])) && fc.uge() => (SRAconst x [31]) // fc.uge() holds: shift by the constant 31
+(SRAcond x y (FlagConstant [fc])) && fc.ult() => (SRA x y) // fc.ult() holds: ordinary SRA by y
+
+// remove redundant *const ops: identity constants return x, absorbing constants return the fixed result
+(ADDconst [0] x) => x
+(SUBconst [0] x) => x
+(ANDconst [0] _) => (MOVWconst [0])
+(ANDconst [c] x) && int32(c)==-1 => x
+(ORconst [0] x) => x
+(ORconst [c] _) && int32(c)==-1 => (MOVWconst [-1])
+(XORconst [0] x) => x
+(BICconst [0] x) => x
+(BICconst [c] _) && int32(c)==-1 => (MOVWconst [0])
+
+// generic constant folding. The first eight rules switch ADD<->SUB (negated constant) and AND<->BIC (complemented constant) when c fails isARMImmRot but the transformed constant passes it or, under GOARM==7, is <= 0xffff
+(ADDconst [c] x) && !isARMImmRot(uint32(c)) && isARMImmRot(uint32(-c)) => (SUBconst [-c] x)
+(SUBconst [c] x) && !isARMImmRot(uint32(c)) && isARMImmRot(uint32(-c)) => (ADDconst [-c] x)
+(ANDconst [c] x) && !isARMImmRot(uint32(c)) && isARMImmRot(^uint32(c)) => (BICconst [int32(^uint32(c))] x)
+(BICconst [c] x) && !isARMImmRot(uint32(c)) && isARMImmRot(^uint32(c)) => (ANDconst [int32(^uint32(c))] x)
+(ADDconst [c] x) && buildcfg.GOARM==7 && !isARMImmRot(uint32(c)) && uint32(c)>0xffff && uint32(-c)<=0xffff => (SUBconst [-c] x)
+(SUBconst [c] x) && buildcfg.GOARM==7 && !isARMImmRot(uint32(c)) && uint32(c)>0xffff && uint32(-c)<=0xffff => (ADDconst [-c] x)
+(ANDconst [c] x) && buildcfg.GOARM==7 && !isARMImmRot(uint32(c)) && uint32(c)>0xffff && ^uint32(c)<=0xffff => (BICconst [int32(^uint32(c))] x)
+(BICconst [c] x) && buildcfg.GOARM==7 && !isARMImmRot(uint32(c)) && uint32(c)>0xffff && ^uint32(c)<=0xffff => (ANDconst [int32(^uint32(c))] x)
+(ADDconst [c] (MOVWconst [d])) => (MOVWconst [c+d])
+(ADDconst [c] (ADDconst [d] x)) => (ADDconst [c+d] x)
+(ADDconst [c] (SUBconst [d] x)) => (ADDconst [c-d] x)
+(ADDconst [c] (RSBconst [d] x)) => (RSBconst [c+d] x)
+(ADCconst [c] (ADDconst [d] x) flags) => (ADCconst [c+d] x flags)
+(ADCconst [c] (SUBconst [d] x) flags) => (ADCconst [c-d] x flags)
+(SUBconst [c] (MOVWconst [d])) => (MOVWconst [d-c])
+(SUBconst [c] (SUBconst [d] x)) => (ADDconst [-c-d] x)
+(SUBconst [c] (ADDconst [d] x)) => (ADDconst [-c+d] x)
+(SUBconst [c] (RSBconst [d] x)) => (RSBconst [-c+d] x)
+(SBCconst [c] (ADDconst [d] x) flags) => (SBCconst [c-d] x flags)
+(SBCconst [c] (SUBconst [d] x) flags) => (SBCconst [c+d] x flags)
+(RSBconst [c] (MOVWconst [d])) => (MOVWconst [c-d])
+(RSBconst [c] (RSBconst [d] x)) => (ADDconst [c-d] x)
+(RSBconst [c] (ADDconst [d] x)) => (RSBconst [c-d] x)
+(RSBconst [c] (SUBconst [d] x)) => (RSBconst [c+d] x)
+(RSCconst [c] (ADDconst [d] x) flags) => (RSCconst [c-d] x flags)
+(RSCconst [c] (SUBconst [d] x) flags) => (RSCconst [c+d] x flags)
+(SLLconst [c] (MOVWconst [d])) => (MOVWconst [d<<uint64(c)])
+(SRLconst [c] (MOVWconst [d])) => (MOVWconst [int32(uint32(d)>>uint64(c))]) // logical shift: d is shifted as uint32
+(SRAconst [c] (MOVWconst [d])) => (MOVWconst [d>>uint64(c)]) // arithmetic shift: d is shifted as a signed value
+(MUL (MOVWconst [c]) (MOVWconst [d])) => (MOVWconst [c*d])
+(MULA (MOVWconst [c]) (MOVWconst [d]) a) => (ADDconst [c*d] a)
+(MULS (MOVWconst [c]) (MOVWconst [d]) a) => (SUBconst [c*d] a)
+(Select0 (CALLudiv (MOVWconst [c]) (MOVWconst [d]))) && d != 0 => (MOVWconst [int32(uint32(c)/uint32(d))]) // folded only for a nonzero divisor
+(Select1 (CALLudiv (MOVWconst [c]) (MOVWconst [d]))) && d != 0 => (MOVWconst [int32(uint32(c)%uint32(d))])
+(ANDconst [c] (MOVWconst [d])) => (MOVWconst [c&d])
+(ANDconst [c] (ANDconst [d] x)) => (ANDconst [c&d] x)
+(ORconst [c] (MOVWconst [d])) => (MOVWconst [c|d])
+(ORconst [c] (ORconst [d] x)) => (ORconst [c|d] x)
+(XORconst [c] (MOVWconst [d])) => (MOVWconst [c^d])
+(XORconst [c] (XORconst [d] x)) => (XORconst [c^d] x)
+(BICconst [c] (MOVWconst [d])) => (MOVWconst [d&^c])
+(BICconst [c] (BICconst [d] x)) => (BICconst [c|d] x)
+(MVN (MOVWconst [c])) => (MOVWconst [^c])
+(MOVBreg (MOVWconst [c])) => (MOVWconst [int32(int8(c))])
+(MOVBUreg (MOVWconst [c])) => (MOVWconst [int32(uint8(c))])
+(MOVHreg (MOVWconst [c])) => (MOVWconst [int32(int16(c))])
+(MOVHUreg (MOVWconst [c])) => (MOVWconst [int32(uint16(c))])
+(MOVWreg (MOVWconst [c])) => (MOVWconst [c])
+// BFX: Width = c >> 8, LSB = c & 0xff, result = d << (32 - Width - LSB) >> (32 - Width); BFX shifts d as signed, BFXU as uint32
+(BFX [c] (MOVWconst [d])) => (MOVWconst [d<<(32-uint32(c&0xff)-uint32(c>>8))>>(32-uint32(c>>8))])
+(BFXU [c] (MOVWconst [d])) => (MOVWconst [int32(uint32(d)<<(32-uint32(c&0xff)-uint32(c>>8))>>(32-uint32(c>>8)))])
+
+// absorb shifts into ops
+(ADD x (SLLconst [c] y)) => (ADDshiftLL x y [c])
+(ADD x (SRLconst [c] y)) => (ADDshiftRL x y [c])
+(ADD x (SRAconst [c] y)) => (ADDshiftRA x y [c])
+(ADD x (SLL y z)) => (ADDshiftLLreg x y z)
+(ADD x (SRL y z)) => (ADDshiftRLreg x y z)
+(ADD x (SRA y z)) => (ADDshiftRAreg x y z)
+(ADC x (SLLconst [c] y) flags) => (ADCshiftLL x y [c] flags)
+(ADC x (SRLconst [c] y) flags) => (ADCshiftRL x y [c] flags)
+(ADC x (SRAconst [c] y) flags) => (ADCshiftRA x y [c] flags)
+(ADC x (SLL y z) flags) => (ADCshiftLLreg x y z flags)
+(ADC x (SRL y z) flags) => (ADCshiftRLreg x y z flags)
+(ADC x (SRA y z) flags) => (ADCshiftRAreg x y z flags)
+(ADDS x (SLLconst [c] y)) => (ADDSshiftLL x y [c])
+(ADDS x (SRLconst [c] y)) => (ADDSshiftRL x y [c])
+(ADDS x (SRAconst [c] y)) => (ADDSshiftRA x y [c])
+(ADDS x (SLL y z)) => (ADDSshiftLLreg x y z)
+(ADDS x (SRL y z)) => (ADDSshiftRLreg x y z)
+(ADDS x (SRA y z)) => (ADDSshiftRAreg x y z)
+(SUB x (SLLconst [c] y)) => (SUBshiftLL x y [c])
+(SUB (SLLconst [c] y) x) => (RSBshiftLL x y [c])
+(SUB x (SRLconst [c] y)) => (SUBshiftRL x y [c])
+(SUB (SRLconst [c] y) x) => (RSBshiftRL x y [c])
+(SUB x (SRAconst [c] y)) => (SUBshiftRA x y [c])
+(SUB (SRAconst [c] y) x) => (RSBshiftRA x y [c])
+(SUB x (SLL y z)) => (SUBshiftLLreg x y z)
+(SUB (SLL y z) x) => (RSBshiftLLreg x y z)
+(SUB x (SRL y z)) => (SUBshiftRLreg x y z)
+(SUB (SRL y z) x) => (RSBshiftRLreg x y z)
+(SUB x (SRA y z)) => (SUBshiftRAreg x y z)
+(SUB (SRA y z) x) => (RSBshiftRAreg x y z)
+(SBC x (SLLconst [c] y) flags) => (SBCshiftLL x y [c] flags)
+(SBC (SLLconst [c] y) x flags) => (RSCshiftLL x y [c] flags)
+(SBC x (SRLconst [c] y) flags) => (SBCshiftRL x y [c] flags)
+(SBC (SRLconst [c] y) x flags) => (RSCshiftRL x y [c] flags)
+(SBC x (SRAconst [c] y) flags) => (SBCshiftRA x y [c] flags)
+(SBC (SRAconst [c] y) x flags) => (RSCshiftRA x y [c] flags)
+(SBC x (SLL y z) flags) => (SBCshiftLLreg x y z flags)
+(SBC (SLL y z) x flags) => (RSCshiftLLreg x y z flags)
+(SBC x (SRL y z) flags) => (SBCshiftRLreg x y z flags)
+(SBC (SRL y z) x flags) => (RSCshiftRLreg x y z flags)
+(SBC x (SRA y z) flags) => (SBCshiftRAreg x y z flags)
+(SBC (SRA y z) x flags) => (RSCshiftRAreg x y z flags)
+(SUBS x (SLLconst [c] y)) => (SUBSshiftLL x y [c])
+(SUBS (SLLconst [c] y) x) => (RSBSshiftLL x y [c])
+(SUBS x (SRLconst [c] y)) => (SUBSshiftRL x y [c])
+(SUBS (SRLconst [c] y) x) => (RSBSshiftRL x y [c])
+(SUBS x (SRAconst [c] y)) => (SUBSshiftRA x y [c])
+(SUBS (SRAconst [c] y) x) => (RSBSshiftRA x y [c])
+(SUBS x (SLL y z)) => (SUBSshiftLLreg x y z)
+(SUBS (SLL y z) x) => (RSBSshiftLLreg x y z)
+(SUBS x (SRL y z)) => (SUBSshiftRLreg x y z)
+(SUBS (SRL y z) x) => (RSBSshiftRLreg x y z)
+(SUBS x (SRA y z)) => (SUBSshiftRAreg x y z)
+(SUBS (SRA y z) x) => (RSBSshiftRAreg x y z)
+(RSB x (SLLconst [c] y)) => (RSBshiftLL x y [c])
+(RSB (SLLconst [c] y) x) => (SUBshiftLL x y [c])
+(RSB x (SRLconst [c] y)) => (RSBshiftRL x y [c])
+(RSB (SRLconst [c] y) x) => (SUBshiftRL x y [c])
+(RSB x (SRAconst [c] y)) => (RSBshiftRA x y [c])
+(RSB (SRAconst [c] y) x) => (SUBshiftRA x y [c])
+(RSB x (SLL y z)) => (RSBshiftLLreg x y z)
+(RSB (SLL y z) x) => (SUBshiftLLreg x y z)
+(RSB x (SRL y z)) => (RSBshiftRLreg x y z)
+(RSB (SRL y z) x) => (SUBshiftRLreg x y z)
+(RSB x (SRA y z)) => (RSBshiftRAreg x y z)
+(RSB (SRA y z) x) => (SUBshiftRAreg x y z)
+(AND x (SLLconst [c] y)) => (ANDshiftLL x y [c])
+(AND x (SRLconst [c] y)) => (ANDshiftRL x y [c])
+(AND x (SRAconst [c] y)) => (ANDshiftRA x y [c])
+(AND x (SLL y z)) => (ANDshiftLLreg x y z)
+(AND x (SRL y z)) => (ANDshiftRLreg x y z)
+(AND x (SRA y z)) => (ANDshiftRAreg x y z)
+(OR x (SLLconst [c] y)) => (ORshiftLL x y [c])
+(OR x (SRLconst [c] y)) => (ORshiftRL x y [c])
+(OR x (SRAconst [c] y)) => (ORshiftRA x y [c])
+(OR x (SLL y z)) => (ORshiftLLreg x y z)
+(OR x (SRL y z)) => (ORshiftRLreg x y z)
+(OR x (SRA y z)) => (ORshiftRAreg x y z)
+(XOR x (SLLconst [c] y)) => (XORshiftLL x y [c])
+(XOR x (SRLconst [c] y)) => (XORshiftRL x y [c])
+(XOR x (SRAconst [c] y)) => (XORshiftRA x y [c])
+(XOR x (SRRconst [c] y)) => (XORshiftRR x y [c])
+(XOR x (SLL y z)) => (XORshiftLLreg x y z)
+(XOR x (SRL y z)) => (XORshiftRLreg x y z)
+(XOR x (SRA y z)) => (XORshiftRAreg x y z)
+(BIC x (SLLconst [c] y)) => (BICshiftLL x y [c])
+(BIC x (SRLconst [c] y)) => (BICshiftRL x y [c])
+(BIC x (SRAconst [c] y)) => (BICshiftRA x y [c])
+(BIC x (SLL y z)) => (BICshiftLLreg x y z)
+(BIC x (SRL y z)) => (BICshiftRLreg x y z)
+(BIC x (SRA y z)) => (BICshiftRAreg x y z)
+(MVN (SLLconst [c] x)) => (MVNshiftLL x [c])
+(MVN (SRLconst [c] x)) => (MVNshiftRL x [c])
+(MVN (SRAconst [c] x)) => (MVNshiftRA x [c])
+(MVN (SLL x y)) => (MVNshiftLLreg x y)
+(MVN (SRL x y)) => (MVNshiftRLreg x y)
+(MVN (SRA x y)) => (MVNshiftRAreg x y)
+
+(CMP x (SLLconst [c] y)) => (CMPshiftLL x y [c])
+(CMP (SLLconst [c] y) x) => (InvertFlags (CMPshiftLL x y [c]))
+(CMP x (SRLconst [c] y)) => (CMPshiftRL x y [c])
+(CMP (SRLconst [c] y) x) => (InvertFlags (CMPshiftRL x y [c]))
+(CMP x (SRAconst [c] y)) => (CMPshiftRA x y [c])
+(CMP (SRAconst [c] y) x) => (InvertFlags (CMPshiftRA x y [c]))
+(CMP x (SLL y z)) => (CMPshiftLLreg x y z)
+(CMP (SLL y z) x) => (InvertFlags (CMPshiftLLreg x y z))
+(CMP x (SRL y z)) => (CMPshiftRLreg x y z)
+(CMP (SRL y z) x) => (InvertFlags (CMPshiftRLreg x y z))
+(CMP x (SRA y z)) => (CMPshiftRAreg x y z)
+(CMP (SRA y z) x) => (InvertFlags (CMPshiftRAreg x y z))
+(TST x (SLLconst [c] y)) => (TSTshiftLL x y [c])
+(TST x (SRLconst [c] y)) => (TSTshiftRL x y [c])
+(TST x (SRAconst [c] y)) => (TSTshiftRA x y [c])
+(TST x (SLL y z)) => (TSTshiftLLreg x y z)
+(TST x (SRL y z)) => (TSTshiftRLreg x y z)
+(TST x (SRA y z)) => (TSTshiftRAreg x y z)
+(TEQ x (SLLconst [c] y)) => (TEQshiftLL x y [c])
+(TEQ x (SRLconst [c] y)) => (TEQshiftRL x y [c])
+(TEQ x (SRAconst [c] y)) => (TEQshiftRA x y [c])
+(TEQ x (SLL y z)) => (TEQshiftLLreg x y z)
+(TEQ x (SRL y z)) => (TEQshiftRLreg x y z)
+(TEQ x (SRA y z)) => (TEQshiftRAreg x y z)
+(CMN x (SLLconst [c] y)) => (CMNshiftLL x y [c])
+(CMN x (SRLconst [c] y)) => (CMNshiftRL x y [c])
+(CMN x (SRAconst [c] y)) => (CMNshiftRA x y [c])
+(CMN x (SLL y z)) => (CMNshiftLLreg x y z)
+(CMN x (SRL y z)) => (CMNshiftRLreg x y z)
+(CMN x (SRA y z)) => (CMNshiftRAreg x y z)
+
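+// prefer *const ops to *shift ops
+// When the unshifted (first) operand is a constant, move the constant into a *const op
+// and compute the shift as a separate value. Non-commutative ops switch to the reversed
+// const op (e.g. SUBshiftLL -> RSBconst), and CMP wraps the result in InvertFlags.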
+(ADDshiftLL (MOVWconst [c]) x [d]) => (ADDconst [c] (SLLconst <x.Type> x [d]))
+(ADDshiftRL (MOVWconst [c]) x [d]) => (ADDconst [c] (SRLconst <x.Type> x [d]))
+(ADDshiftRA (MOVWconst [c]) x [d]) => (ADDconst [c] (SRAconst <x.Type> x [d]))
+(ADCshiftLL (MOVWconst [c]) x [d] flags) => (ADCconst [c] (SLLconst <x.Type> x [d]) flags)
+(ADCshiftRL (MOVWconst [c]) x [d] flags) => (ADCconst [c] (SRLconst <x.Type> x [d]) flags)
+(ADCshiftRA (MOVWconst [c]) x [d] flags) => (ADCconst [c] (SRAconst <x.Type> x [d]) flags)
+(ADDSshiftLL (MOVWconst [c]) x [d]) => (ADDSconst [c] (SLLconst <x.Type> x [d]))
+(ADDSshiftRL (MOVWconst [c]) x [d]) => (ADDSconst [c] (SRLconst <x.Type> x [d]))
+(ADDSshiftRA (MOVWconst [c]) x [d]) => (ADDSconst [c] (SRAconst <x.Type> x [d]))
+(SUBshiftLL (MOVWconst [c]) x [d]) => (RSBconst [c] (SLLconst <x.Type> x [d]))
+(SUBshiftRL (MOVWconst [c]) x [d]) => (RSBconst [c] (SRLconst <x.Type> x [d]))
+(SUBshiftRA (MOVWconst [c]) x [d]) => (RSBconst [c] (SRAconst <x.Type> x [d]))
+(SBCshiftLL (MOVWconst [c]) x [d] flags) => (RSCconst [c] (SLLconst <x.Type> x [d]) flags)
+(SBCshiftRL (MOVWconst [c]) x [d] flags) => (RSCconst [c] (SRLconst <x.Type> x [d]) flags)
+(SBCshiftRA (MOVWconst [c]) x [d] flags) => (RSCconst [c] (SRAconst <x.Type> x [d]) flags)
+(SUBSshiftLL (MOVWconst [c]) x [d]) => (RSBSconst [c] (SLLconst <x.Type> x [d]))
+(SUBSshiftRL (MOVWconst [c]) x [d]) => (RSBSconst [c] (SRLconst <x.Type> x [d]))
+(SUBSshiftRA (MOVWconst [c]) x [d]) => (RSBSconst [c] (SRAconst <x.Type> x [d]))
+(RSBshiftLL (MOVWconst [c]) x [d]) => (SUBconst [c] (SLLconst <x.Type> x [d]))
+(RSBshiftRL (MOVWconst [c]) x [d]) => (SUBconst [c] (SRLconst <x.Type> x [d]))
+(RSBshiftRA (MOVWconst [c]) x [d]) => (SUBconst [c] (SRAconst <x.Type> x [d]))
+(RSCshiftLL (MOVWconst [c]) x [d] flags) => (SBCconst [c] (SLLconst <x.Type> x [d]) flags)
+(RSCshiftRL (MOVWconst [c]) x [d] flags) => (SBCconst [c] (SRLconst <x.Type> x [d]) flags)
+(RSCshiftRA (MOVWconst [c]) x [d] flags) => (SBCconst [c] (SRAconst <x.Type> x [d]) flags)
+(RSBSshiftLL (MOVWconst [c]) x [d]) => (SUBSconst [c] (SLLconst <x.Type> x [d]))
+(RSBSshiftRL (MOVWconst [c]) x [d]) => (SUBSconst [c] (SRLconst <x.Type> x [d]))
+(RSBSshiftRA (MOVWconst [c]) x [d]) => (SUBSconst [c] (SRAconst <x.Type> x [d]))
+(ANDshiftLL (MOVWconst [c]) x [d]) => (ANDconst [c] (SLLconst <x.Type> x [d]))
+(ANDshiftRL (MOVWconst [c]) x [d]) => (ANDconst [c] (SRLconst <x.Type> x [d]))
+(ANDshiftRA (MOVWconst [c]) x [d]) => (ANDconst [c] (SRAconst <x.Type> x [d]))
+(ORshiftLL (MOVWconst [c]) x [d]) => (ORconst [c] (SLLconst <x.Type> x [d]))
+(ORshiftRL (MOVWconst [c]) x [d]) => (ORconst [c] (SRLconst <x.Type> x [d]))
+(ORshiftRA (MOVWconst [c]) x [d]) => (ORconst [c] (SRAconst <x.Type> x [d]))
+(XORshiftLL (MOVWconst [c]) x [d]) => (XORconst [c] (SLLconst <x.Type> x [d]))
+(XORshiftRL (MOVWconst [c]) x [d]) => (XORconst [c] (SRLconst <x.Type> x [d]))
+(XORshiftRA (MOVWconst [c]) x [d]) => (XORconst [c] (SRAconst <x.Type> x [d]))
+(XORshiftRR (MOVWconst [c]) x [d]) => (XORconst [c] (SRRconst <x.Type> x [d]))
+(CMPshiftLL (MOVWconst [c]) x [d]) => (InvertFlags (CMPconst [c] (SLLconst <x.Type> x [d])))
+(CMPshiftRL (MOVWconst [c]) x [d]) => (InvertFlags (CMPconst [c] (SRLconst <x.Type> x [d])))
+(CMPshiftRA (MOVWconst [c]) x [d]) => (InvertFlags (CMPconst [c] (SRAconst <x.Type> x [d])))
+(TSTshiftLL (MOVWconst [c]) x [d]) => (TSTconst [c] (SLLconst <x.Type> x [d]))
+(TSTshiftRL (MOVWconst [c]) x [d]) => (TSTconst [c] (SRLconst <x.Type> x [d]))
+(TSTshiftRA (MOVWconst [c]) x [d]) => (TSTconst [c] (SRAconst <x.Type> x [d]))
+(TEQshiftLL (MOVWconst [c]) x [d]) => (TEQconst [c] (SLLconst <x.Type> x [d]))
+(TEQshiftRL (MOVWconst [c]) x [d]) => (TEQconst [c] (SRLconst <x.Type> x [d]))
+(TEQshiftRA (MOVWconst [c]) x [d]) => (TEQconst [c] (SRAconst <x.Type> x [d]))
+(CMNshiftLL (MOVWconst [c]) x [d]) => (CMNconst [c] (SLLconst <x.Type> x [d]))
+(CMNshiftRL (MOVWconst [c]) x [d]) => (CMNconst [c] (SRLconst <x.Type> x [d]))
+(CMNshiftRA (MOVWconst [c]) x [d]) => (CMNconst [c] (SRAconst <x.Type> x [d]))
+
+(ADDshiftLLreg (MOVWconst [c]) x y) => (ADDconst [c] (SLL <x.Type> x y))
+(ADDshiftRLreg (MOVWconst [c]) x y) => (ADDconst [c] (SRL <x.Type> x y))
+(ADDshiftRAreg (MOVWconst [c]) x y) => (ADDconst [c] (SRA <x.Type> x y))
+(ADCshiftLLreg (MOVWconst [c]) x y flags) => (ADCconst [c] (SLL <x.Type> x y) flags)
+(ADCshiftRLreg (MOVWconst [c]) x y flags) => (ADCconst [c] (SRL <x.Type> x y) flags)
+(ADCshiftRAreg (MOVWconst [c]) x y flags) => (ADCconst [c] (SRA <x.Type> x y) flags)
+(ADDSshiftLLreg (MOVWconst [c]) x y) => (ADDSconst [c] (SLL <x.Type> x y))
+(ADDSshiftRLreg (MOVWconst [c]) x y) => (ADDSconst [c] (SRL <x.Type> x y))
+(ADDSshiftRAreg (MOVWconst [c]) x y) => (ADDSconst [c] (SRA <x.Type> x y))
+(SUBshiftLLreg (MOVWconst [c]) x y) => (RSBconst [c] (SLL <x.Type> x y))
+(SUBshiftRLreg (MOVWconst [c]) x y) => (RSBconst [c] (SRL <x.Type> x y))
+(SUBshiftRAreg (MOVWconst [c]) x y) => (RSBconst [c] (SRA <x.Type> x y))
+(SBCshiftLLreg (MOVWconst [c]) x y flags) => (RSCconst [c] (SLL <x.Type> x y) flags)
+(SBCshiftRLreg (MOVWconst [c]) x y flags) => (RSCconst [c] (SRL <x.Type> x y) flags)
+(SBCshiftRAreg (MOVWconst [c]) x y flags) => (RSCconst [c] (SRA <x.Type> x y) flags)
+(SUBSshiftLLreg (MOVWconst [c]) x y) => (RSBSconst [c] (SLL <x.Type> x y))
+(SUBSshiftRLreg (MOVWconst [c]) x y) => (RSBSconst [c] (SRL <x.Type> x y))
+(SUBSshiftRAreg (MOVWconst [c]) x y) => (RSBSconst [c] (SRA <x.Type> x y))
+(RSBshiftLLreg (MOVWconst [c]) x y) => (SUBconst [c] (SLL <x.Type> x y))
+(RSBshiftRLreg (MOVWconst [c]) x y) => (SUBconst [c] (SRL <x.Type> x y))
+(RSBshiftRAreg (MOVWconst [c]) x y) => (SUBconst [c] (SRA <x.Type> x y))
+(RSCshiftLLreg (MOVWconst [c]) x y flags) => (SBCconst [c] (SLL <x.Type> x y) flags)
+(RSCshiftRLreg (MOVWconst [c]) x y flags) => (SBCconst [c] (SRL <x.Type> x y) flags)
+(RSCshiftRAreg (MOVWconst [c]) x y flags) => (SBCconst [c] (SRA <x.Type> x y) flags)
+(RSBSshiftLLreg (MOVWconst [c]) x y) => (SUBSconst [c] (SLL <x.Type> x y))
+(RSBSshiftRLreg (MOVWconst [c]) x y) => (SUBSconst [c] (SRL <x.Type> x y))
+(RSBSshiftRAreg (MOVWconst [c]) x y) => (SUBSconst [c] (SRA <x.Type> x y))
+(ANDshiftLLreg (MOVWconst [c]) x y) => (ANDconst [c] (SLL <x.Type> x y))
+(ANDshiftRLreg (MOVWconst [c]) x y) => (ANDconst [c] (SRL <x.Type> x y))
+(ANDshiftRAreg (MOVWconst [c]) x y) => (ANDconst [c] (SRA <x.Type> x y))
+(ORshiftLLreg (MOVWconst [c]) x y) => (ORconst [c] (SLL <x.Type> x y))
+(ORshiftRLreg (MOVWconst [c]) x y) => (ORconst [c] (SRL <x.Type> x y))
+(ORshiftRAreg (MOVWconst [c]) x y) => (ORconst [c] (SRA <x.Type> x y))
+(XORshiftLLreg (MOVWconst [c]) x y) => (XORconst [c] (SLL <x.Type> x y))
+(XORshiftRLreg (MOVWconst [c]) x y) => (XORconst [c] (SRL <x.Type> x y))
+(XORshiftRAreg (MOVWconst [c]) x y) => (XORconst [c] (SRA <x.Type> x y))
+(CMPshiftLLreg (MOVWconst [c]) x y) => (InvertFlags (CMPconst [c] (SLL <x.Type> x y)))
+(CMPshiftRLreg (MOVWconst [c]) x y) => (InvertFlags (CMPconst [c] (SRL <x.Type> x y)))
+(CMPshiftRAreg (MOVWconst [c]) x y) => (InvertFlags (CMPconst [c] (SRA <x.Type> x y)))
+(TSTshiftLLreg (MOVWconst [c]) x y) => (TSTconst [c] (SLL <x.Type> x y))
+(TSTshiftRLreg (MOVWconst [c]) x y) => (TSTconst [c] (SRL <x.Type> x y))
+(TSTshiftRAreg (MOVWconst [c]) x y) => (TSTconst [c] (SRA <x.Type> x y))
+(TEQshiftLLreg (MOVWconst [c]) x y) => (TEQconst [c] (SLL <x.Type> x y))
+(TEQshiftRLreg (MOVWconst [c]) x y) => (TEQconst [c] (SRL <x.Type> x y))
+(TEQshiftRAreg (MOVWconst [c]) x y) => (TEQconst [c] (SRA <x.Type> x y))
+(CMNshiftLLreg (MOVWconst [c]) x y) => (CMNconst [c] (SLL <x.Type> x y))
+(CMNshiftRLreg (MOVWconst [c]) x y) => (CMNconst [c] (SRL <x.Type> x y))
+(CMNshiftRAreg (MOVWconst [c]) x y) => (CMNconst [c] (SRA <x.Type> x y))
+
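+// constant folding in *shift ops
+// When the shifted operand is a constant, the shift is evaluated in the rule itself:
+// LL folds to c<<d, RL to a logical shift performed in uint32, RA to an arithmetic
+// shift of the signed int32 constant.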
+(ADDshiftLL x (MOVWconst [c]) [d]) => (ADDconst x [c<<uint64(d)])
+(ADDshiftRL x (MOVWconst [c]) [d]) => (ADDconst x [int32(uint32(c)>>uint64(d))])
+(ADDshiftRA x (MOVWconst [c]) [d]) => (ADDconst x [c>>uint64(d)])
+(ADCshiftLL x (MOVWconst [c]) [d] flags) => (ADCconst x [c<<uint64(d)] flags)
+(ADCshiftRL x (MOVWconst [c]) [d] flags) => (ADCconst x [int32(uint32(c)>>uint64(d))] flags)
+(ADCshiftRA x (MOVWconst [c]) [d] flags) => (ADCconst x [c>>uint64(d)] flags)
+(ADDSshiftLL x (MOVWconst [c]) [d]) => (ADDSconst x [c<<uint64(d)])
+(ADDSshiftRL x (MOVWconst [c]) [d]) => (ADDSconst x [int32(uint32(c)>>uint64(d))])
+(ADDSshiftRA x (MOVWconst [c]) [d]) => (ADDSconst x [c>>uint64(d)])
+(SUBshiftLL x (MOVWconst [c]) [d]) => (SUBconst x [c<<uint64(d)])
+(SUBshiftRL x (MOVWconst [c]) [d]) => (SUBconst x [int32(uint32(c)>>uint64(d))])
+(SUBshiftRA x (MOVWconst [c]) [d]) => (SUBconst x [c>>uint64(d)])
+(SBCshiftLL x (MOVWconst [c]) [d] flags) => (SBCconst x [c<<uint64(d)] flags)
+(SBCshiftRL x (MOVWconst [c]) [d] flags) => (SBCconst x [int32(uint32(c)>>uint64(d))] flags)
+(SBCshiftRA x (MOVWconst [c]) [d] flags) => (SBCconst x [c>>uint64(d)] flags)
+(SUBSshiftLL x (MOVWconst [c]) [d]) => (SUBSconst x [c<<uint64(d)])
+(SUBSshiftRL x (MOVWconst [c]) [d]) => (SUBSconst x [int32(uint32(c)>>uint64(d))])
+(SUBSshiftRA x (MOVWconst [c]) [d]) => (SUBSconst x [c>>uint64(d)])
+(RSBshiftLL x (MOVWconst [c]) [d]) => (RSBconst x [c<<uint64(d)])
+(RSBshiftRL x (MOVWconst [c]) [d]) => (RSBconst x [int32(uint32(c)>>uint64(d))])
+(RSBshiftRA x (MOVWconst [c]) [d]) => (RSBconst x [c>>uint64(d)])
+(RSCshiftLL x (MOVWconst [c]) [d] flags) => (RSCconst x [c<<uint64(d)] flags)
+(RSCshiftRL x (MOVWconst [c]) [d] flags) => (RSCconst x [int32(uint32(c)>>uint64(d))] flags)
+(RSCshiftRA x (MOVWconst [c]) [d] flags) => (RSCconst x [c>>uint64(d)] flags)
+(RSBSshiftLL x (MOVWconst [c]) [d]) => (RSBSconst x [c<<uint64(d)])
+(RSBSshiftRL x (MOVWconst [c]) [d]) => (RSBSconst x [int32(uint32(c)>>uint64(d))])
+(RSBSshiftRA x (MOVWconst [c]) [d]) => (RSBSconst x [c>>uint64(d)])
+(ANDshiftLL x (MOVWconst [c]) [d]) => (ANDconst x [c<<uint64(d)])
+(ANDshiftRL x (MOVWconst [c]) [d]) => (ANDconst x [int32(uint32(c)>>uint64(d))])
+(ANDshiftRA x (MOVWconst [c]) [d]) => (ANDconst x [c>>uint64(d)])
+(ORshiftLL x (MOVWconst [c]) [d]) => (ORconst x [c<<uint64(d)])
+(ORshiftRL x (MOVWconst [c]) [d]) => (ORconst x [int32(uint32(c)>>uint64(d))])
+(ORshiftRA x (MOVWconst [c]) [d]) => (ORconst x [c>>uint64(d)])
+(XORshiftLL x (MOVWconst [c]) [d]) => (XORconst x [c<<uint64(d)])
+(XORshiftRL x (MOVWconst [c]) [d]) => (XORconst x [int32(uint32(c)>>uint64(d))])
+(XORshiftRA x (MOVWconst [c]) [d]) => (XORconst x [c>>uint64(d)])
+(XORshiftRR x (MOVWconst [c]) [d]) => (XORconst x [int32(uint32(c)>>uint64(d)|uint32(c)<<uint64(32-d))])
+(BICshiftLL x (MOVWconst [c]) [d]) => (BICconst x [c<<uint64(d)])
+(BICshiftRL x (MOVWconst [c]) [d]) => (BICconst x [int32(uint32(c)>>uint64(d))])
+(BICshiftRA x (MOVWconst [c]) [d]) => (BICconst x [c>>uint64(d)])
+// MVNshiftXX computes the bitwise NOT of the shifted operand, so every folded
+// constant must be complemented after the shift is applied.
+(MVNshiftLL (MOVWconst [c]) [d]) => (MOVWconst [^(c<<uint64(d))])
+(MVNshiftRL (MOVWconst [c]) [d]) => (MOVWconst [^int32(uint32(c)>>uint64(d))])
+(MVNshiftRA (MOVWconst [c]) [d]) => (MOVWconst [^(c>>uint64(d))])
+(CMPshiftLL x (MOVWconst [c]) [d]) => (CMPconst x [c<<uint64(d)])
+(CMPshiftRL x (MOVWconst [c]) [d]) => (CMPconst x [int32(uint32(c)>>uint64(d))])
+(CMPshiftRA x (MOVWconst [c]) [d]) => (CMPconst x [c>>uint64(d)])
+(TSTshiftLL x (MOVWconst [c]) [d]) => (TSTconst x [c<<uint64(d)])
+(TSTshiftRL x (MOVWconst [c]) [d]) => (TSTconst x [int32(uint32(c)>>uint64(d))])
+(TSTshiftRA x (MOVWconst [c]) [d]) => (TSTconst x [c>>uint64(d)])
+(TEQshiftLL x (MOVWconst [c]) [d]) => (TEQconst x [c<<uint64(d)])
+(TEQshiftRL x (MOVWconst [c]) [d]) => (TEQconst x [int32(uint32(c)>>uint64(d))])
+(TEQshiftRA x (MOVWconst [c]) [d]) => (TEQconst x [c>>uint64(d)])
+(CMNshiftLL x (MOVWconst [c]) [d]) => (CMNconst x [c<<uint64(d)])
+(CMNshiftRL x (MOVWconst [c]) [d]) => (CMNconst x [int32(uint32(c)>>uint64(d))])
+(CMNshiftRA x (MOVWconst [c]) [d]) => (CMNconst x [c>>uint64(d)])
+
+(ADDshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ADDshiftLL x y [c])
+(ADDshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ADDshiftRL x y [c])
+(ADDshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ADDshiftRA x y [c])
+(ADCshiftLLreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (ADCshiftLL x y [c] flags)
+(ADCshiftRLreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (ADCshiftRL x y [c] flags)
+(ADCshiftRAreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (ADCshiftRA x y [c] flags)
+(ADDSshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ADDSshiftLL x y [c])
+(ADDSshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ADDSshiftRL x y [c])
+(ADDSshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ADDSshiftRA x y [c])
+(SUBshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (SUBshiftLL x y [c])
+(SUBshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (SUBshiftRL x y [c])
+(SUBshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (SUBshiftRA x y [c])
+(SBCshiftLLreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (SBCshiftLL x y [c] flags)
+(SBCshiftRLreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (SBCshiftRL x y [c] flags)
+(SBCshiftRAreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (SBCshiftRA x y [c] flags)
+(SUBSshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (SUBSshiftLL x y [c])
+(SUBSshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (SUBSshiftRL x y [c])
+(SUBSshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (SUBSshiftRA x y [c])
+(RSBshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (RSBshiftLL x y [c])
+(RSBshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (RSBshiftRL x y [c])
+(RSBshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (RSBshiftRA x y [c])
+(RSCshiftLLreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (RSCshiftLL x y [c] flags)
+(RSCshiftRLreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (RSCshiftRL x y [c] flags)
+(RSCshiftRAreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (RSCshiftRA x y [c] flags)
+(RSBSshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (RSBSshiftLL x y [c])
+(RSBSshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (RSBSshiftRL x y [c])
+(RSBSshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (RSBSshiftRA x y [c])
+(ANDshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ANDshiftLL x y [c])
+(ANDshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ANDshiftRL x y [c])
+(ANDshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ANDshiftRA x y [c])
+(ORshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ORshiftLL x y [c])
+(ORshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ORshiftRL x y [c])
+(ORshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ORshiftRA x y [c])
+(XORshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (XORshiftLL x y [c])
+(XORshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (XORshiftRL x y [c])
+(XORshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (XORshiftRA x y [c])
+(BICshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (BICshiftLL x y [c])
+(BICshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (BICshiftRL x y [c])
+(BICshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (BICshiftRA x y [c])
+(MVNshiftLLreg x (MOVWconst [c])) && 0 <= c && c < 32 => (MVNshiftLL x [c])
+(MVNshiftRLreg x (MOVWconst [c])) && 0 <= c && c < 32 => (MVNshiftRL x [c])
+(MVNshiftRAreg x (MOVWconst [c])) && 0 <= c && c < 32 => (MVNshiftRA x [c])
+(CMPshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (CMPshiftLL x y [c])
+(CMPshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (CMPshiftRL x y [c])
+(CMPshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (CMPshiftRA x y [c])
+(TSTshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (TSTshiftLL x y [c])
+(TSTshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (TSTshiftRL x y [c])
+(TSTshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (TSTshiftRA x y [c])
+(TEQshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (TEQshiftLL x y [c])
+(TEQshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (TEQshiftRL x y [c])
+(TEQshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (TEQshiftRA x y [c])
+(CMNshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (CMNshiftLL x y [c])
+(CMNshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (CMNshiftRL x y [c])
+(CMNshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (CMNshiftRA x y [c])
+
+// Rotates. 8- and 16-bit rotates by a constant are expanded into a left shift by
+// c&(width-1) OR'ed with an unsigned right shift by -c&(width-1). A 32-bit rotate
+// left by y is rewritten as SRR by 0-y (RSBconst [0] y); SRR is presumably
+// rotate-right by register -- confirm against ARMOps.go.
+(RotateLeft16 <t> x (MOVWconst [c])) => (Or16 (Lsh16x32 <t> x (MOVWconst [c&15])) (Rsh16Ux32 <t> x (MOVWconst [-c&15])))
+(RotateLeft8 <t> x (MOVWconst [c])) => (Or8 (Lsh8x32 <t> x (MOVWconst [c&7])) (Rsh8Ux32 <t> x (MOVWconst [-c&7])))
+(RotateLeft32 x y) => (SRR x (RSBconst [0] <y.Type> y))
+
+// ((x>>8) | (x<<8)) -> (REV16 x), the type of x is uint16, "|" can also be "^" or "+".
+// UBFX instruction is supported by ARMv6T2, ARMv7 and above versions, REV16 is supported by
+// ARMv6 and above versions. So for ARMv6, we need to match SLLconst, SRLconst and ORshiftLL.
+((ADDshiftLL|ORshiftLL|XORshiftLL) <typ.UInt16> [8] (BFXU <typ.UInt16> [int32(armBFAuxInt(8, 8))] x) x) => (REV16 x)
+((ADDshiftLL|ORshiftLL|XORshiftLL) <typ.UInt16> [8] (SRLconst <typ.UInt16> [24] (SLLconst [16] x)) x) && buildcfg.GOARM>=6 => (REV16 x)
+
+// use indexed loads and stores
+(MOVWload [0] {sym} (ADD ptr idx) mem) && sym == nil => (MOVWloadidx ptr idx mem)
+(MOVWstore [0] {sym} (ADD ptr idx) val mem) && sym == nil => (MOVWstoreidx ptr idx val mem)
+(MOVWload [0] {sym} (ADDshiftLL ptr idx [c]) mem) && sym == nil => (MOVWloadshiftLL ptr idx [c] mem)
+(MOVWload [0] {sym} (ADDshiftRL ptr idx [c]) mem) && sym == nil => (MOVWloadshiftRL ptr idx [c] mem)
+(MOVWload [0] {sym} (ADDshiftRA ptr idx [c]) mem) && sym == nil => (MOVWloadshiftRA ptr idx [c] mem)
+(MOVWstore [0] {sym} (ADDshiftLL ptr idx [c]) val mem) && sym == nil => (MOVWstoreshiftLL ptr idx [c] val mem)
+(MOVWstore [0] {sym} (ADDshiftRL ptr idx [c]) val mem) && sym == nil => (MOVWstoreshiftRL ptr idx [c] val mem)
+(MOVWstore [0] {sym} (ADDshiftRA ptr idx [c]) val mem) && sym == nil => (MOVWstoreshiftRA ptr idx [c] val mem)
+(MOVBUload [0] {sym} (ADD ptr idx) mem) && sym == nil => (MOVBUloadidx ptr idx mem)
+(MOVBload [0] {sym} (ADD ptr idx) mem) && sym == nil => (MOVBloadidx ptr idx mem)
+(MOVBstore [0] {sym} (ADD ptr idx) val mem) && sym == nil => (MOVBstoreidx ptr idx val mem)
+(MOVHUload [0] {sym} (ADD ptr idx) mem) && sym == nil => (MOVHUloadidx ptr idx mem)
+(MOVHload [0] {sym} (ADD ptr idx) mem) && sym == nil => (MOVHloadidx ptr idx mem)
+(MOVHstore [0] {sym} (ADD ptr idx) val mem) && sym == nil => (MOVHstoreidx ptr idx val mem)
+
+// constant folding in indexed loads and stores
+(MOVWloadidx ptr (MOVWconst [c]) mem) => (MOVWload [c] ptr mem)
+(MOVWloadidx (MOVWconst [c]) ptr mem) => (MOVWload [c] ptr mem)
+(MOVBloadidx ptr (MOVWconst [c]) mem) => (MOVBload [c] ptr mem)
+(MOVBloadidx (MOVWconst [c]) ptr mem) => (MOVBload [c] ptr mem)
+(MOVBUloadidx ptr (MOVWconst [c]) mem) => (MOVBUload [c] ptr mem)
+(MOVBUloadidx (MOVWconst [c]) ptr mem) => (MOVBUload [c] ptr mem)
+(MOVHUloadidx ptr (MOVWconst [c]) mem) => (MOVHUload [c] ptr mem)
+(MOVHUloadidx (MOVWconst [c]) ptr mem) => (MOVHUload [c] ptr mem)
+(MOVHloadidx ptr (MOVWconst [c]) mem) => (MOVHload [c] ptr mem)
+(MOVHloadidx (MOVWconst [c]) ptr mem) => (MOVHload [c] ptr mem)
+
+(MOVWstoreidx ptr (MOVWconst [c]) val mem) => (MOVWstore [c] ptr val mem)
+(MOVWstoreidx (MOVWconst [c]) ptr val mem) => (MOVWstore [c] ptr val mem)
+(MOVBstoreidx ptr (MOVWconst [c]) val mem) => (MOVBstore [c] ptr val mem)
+(MOVBstoreidx (MOVWconst [c]) ptr val mem) => (MOVBstore [c] ptr val mem)
+(MOVHstoreidx ptr (MOVWconst [c]) val mem) => (MOVHstore [c] ptr val mem)
+(MOVHstoreidx (MOVWconst [c]) ptr val mem) => (MOVHstore [c] ptr val mem)
+
+(MOVWloadidx ptr (SLLconst idx [c]) mem) => (MOVWloadshiftLL ptr idx [c] mem)
+(MOVWloadidx (SLLconst idx [c]) ptr mem) => (MOVWloadshiftLL ptr idx [c] mem)
+(MOVWloadidx ptr (SRLconst idx [c]) mem) => (MOVWloadshiftRL ptr idx [c] mem)
+(MOVWloadidx (SRLconst idx [c]) ptr mem) => (MOVWloadshiftRL ptr idx [c] mem)
+(MOVWloadidx ptr (SRAconst idx [c]) mem) => (MOVWloadshiftRA ptr idx [c] mem)
+(MOVWloadidx (SRAconst idx [c]) ptr mem) => (MOVWloadshiftRA ptr idx [c] mem)
+
+(MOVWstoreidx ptr (SLLconst idx [c]) val mem) => (MOVWstoreshiftLL ptr idx [c] val mem)
+(MOVWstoreidx (SLLconst idx [c]) ptr val mem) => (MOVWstoreshiftLL ptr idx [c] val mem)
+(MOVWstoreidx ptr (SRLconst idx [c]) val mem) => (MOVWstoreshiftRL ptr idx [c] val mem)
+(MOVWstoreidx (SRLconst idx [c]) ptr val mem) => (MOVWstoreshiftRL ptr idx [c] val mem)
+(MOVWstoreidx ptr (SRAconst idx [c]) val mem) => (MOVWstoreshiftRA ptr idx [c] val mem)
+(MOVWstoreidx (SRAconst idx [c]) ptr val mem) => (MOVWstoreshiftRA ptr idx [c] val mem)
+
+(MOVWloadshiftLL ptr (MOVWconst [c]) [d] mem) => (MOVWload [int32(uint32(c)<<uint64(d))] ptr mem)
+(MOVWloadshiftRL ptr (MOVWconst [c]) [d] mem) => (MOVWload [int32(uint32(c)>>uint64(d))] ptr mem)
+(MOVWloadshiftRA ptr (MOVWconst [c]) [d] mem) => (MOVWload [c>>uint64(d)] ptr mem)
+
+(MOVWstoreshiftLL ptr (MOVWconst [c]) [d] val mem) => (MOVWstore [int32(uint32(c)<<uint64(d))] ptr val mem)
+(MOVWstoreshiftRL ptr (MOVWconst [c]) [d] val mem) => (MOVWstore [int32(uint32(c)>>uint64(d))] ptr val mem)
+(MOVWstoreshiftRA ptr (MOVWconst [c]) [d] val mem) => (MOVWstore [c>>uint64(d)] ptr val mem)
+
+// generic simplifications
+(ADD x (RSBconst [0] y)) => (SUB x y)
+(ADD <t> (RSBconst [c] x) (RSBconst [d] y)) => (RSBconst [c+d] (ADD <t> x y))
+(SUB x x) => (MOVWconst [0])
+(RSB x x) => (MOVWconst [0])
+(AND x x) => x
+(OR x x) => x
+(XOR x x) => (MOVWconst [0])
+(BIC x x) => (MOVWconst [0])
+
+// fuse a multiply into the add or subtract that consumes it:
+// a + x*y becomes MULA, a - x*y becomes MULS. The MULS forms are only used when GOARM == 7.
+(ADD (MUL x y) a) => (MULA x y a)
+(SUB a (MUL x y)) && buildcfg.GOARM == 7 => (MULS x y a)
+(RSB (MUL x y) a) && buildcfg.GOARM == 7 => (MULS x y a)
+
+// negated floating-point multiply: -(x*y) and (-x)*y become NMULF/NMULD when GOARM >= 6;
+// an NMUL whose operand is itself negated is turned back into a plain multiply.
+(NEGF (MULF x y)) && buildcfg.GOARM >= 6 => (NMULF x y)
+(NEGD (MULD x y)) && buildcfg.GOARM >= 6 => (NMULD x y)
+(MULF (NEGF x) y) && buildcfg.GOARM >= 6 => (NMULF x y)
+(MULD (NEGD x) y) && buildcfg.GOARM >= 6 => (NMULD x y)
+(NMULF (NEGF x) y) => (MULF x y)
+(NMULD (NEGD x) y) => (MULD x y)
+
+// the result will overwrite the addend, since they are in the same register
+(ADDF a (MULF x y)) && a.Uses == 1 && buildcfg.GOARM >= 6 => (MULAF a x y)
+(ADDF a (NMULF x y)) && a.Uses == 1 && buildcfg.GOARM >= 6 => (MULSF a x y)
+(ADDD a (MULD x y)) && a.Uses == 1 && buildcfg.GOARM >= 6 => (MULAD a x y)
+(ADDD a (NMULD x y)) && a.Uses == 1 && buildcfg.GOARM >= 6 => (MULSD a x y)
+(SUBF a (MULF x y)) && a.Uses == 1 && buildcfg.GOARM >= 6 => (MULSF a x y)
+(SUBF a (NMULF x y)) && a.Uses == 1 && buildcfg.GOARM >= 6 => (MULAF a x y)
+(SUBD a (MULD x y)) && a.Uses == 1 && buildcfg.GOARM >= 6 => (MULSD a x y)
+(SUBD a (NMULD x y)) && a.Uses == 1 && buildcfg.GOARM >= 6 => (MULAD a x y)
+
+(AND x (MVN y)) => (BIC x y)
+
+// simplification with *shift ops
+(SUBshiftLL (SLLconst x [c]) x [c]) => (MOVWconst [0])
+(SUBshiftRL (SRLconst x [c]) x [c]) => (MOVWconst [0])
+(SUBshiftRA (SRAconst x [c]) x [c]) => (MOVWconst [0])
+(RSBshiftLL (SLLconst x [c]) x [c]) => (MOVWconst [0])
+(RSBshiftRL (SRLconst x [c]) x [c]) => (MOVWconst [0])
+(RSBshiftRA (SRAconst x [c]) x [c]) => (MOVWconst [0])
+(ANDshiftLL y:(SLLconst x [c]) x [c]) => y
+(ANDshiftRL y:(SRLconst x [c]) x [c]) => y
+(ANDshiftRA y:(SRAconst x [c]) x [c]) => y
+(ORshiftLL y:(SLLconst x [c]) x [c]) => y
+(ORshiftRL y:(SRLconst x [c]) x [c]) => y
+(ORshiftRA y:(SRAconst x [c]) x [c]) => y
+(XORshiftLL (SLLconst x [c]) x [c]) => (MOVWconst [0])
+(XORshiftRL (SRLconst x [c]) x [c]) => (MOVWconst [0])
+(XORshiftRA (SRAconst x [c]) x [c]) => (MOVWconst [0])
+(BICshiftLL (SLLconst x [c]) x [c]) => (MOVWconst [0])
+(BICshiftRL (SRLconst x [c]) x [c]) => (MOVWconst [0])
+(BICshiftRA (SRAconst x [c]) x [c]) => (MOVWconst [0])
+(AND x (MVNshiftLL y [c])) => (BICshiftLL x y [c])
+(AND x (MVNshiftRL y [c])) => (BICshiftRL x y [c])
+(AND x (MVNshiftRA y [c])) => (BICshiftRA x y [c])
+
+// floating point optimizations
+(CMPF x (MOVFconst [0])) => (CMPF0 x)
+(CMPD x (MOVDconst [0])) => (CMPD0 x)
+
+// bit extraction
+// A left shift by c followed by a right shift by d (c <= d <= 31) selects the 32-d bits
+// of x starting at bit d-c. The aux value packs the LSB (d-c) in the low byte and the
+// width (32-d) shifted left by 8. An arithmetic right shift (SRAconst) gives BFX, a
+// logical one (SRLconst) gives BFXU. Only applied when GOARM == 7.
+(SRAconst (SLLconst x [c]) [d]) && buildcfg.GOARM==7 && uint64(d)>=uint64(c) && uint64(d)<=31 => (BFX [(d-c)|(32-d)<<8] x)
+(SRLconst (SLLconst x [c]) [d]) && buildcfg.GOARM==7 && uint64(d)>=uint64(c) && uint64(d)<=31 => (BFXU [(d-c)|(32-d)<<8] x)
+
+// comparison simplification
+((EQ|NE) (CMP x (RSBconst [0] y))) => ((EQ|NE) (CMN x y)) // sense of carry bit not preserved; see also #50854
+((EQ|NE) (CMN x (RSBconst [0] y))) => ((EQ|NE) (CMP x y)) // sense of carry bit not preserved; see also #50864
+(EQ (CMPconst [0] l:(SUB x y)) yes no) && l.Uses==1 => (EQ (CMP x y) yes no)
+(EQ (CMPconst [0] l:(MULS x y a)) yes no) && l.Uses==1 => (EQ (CMP a (MUL <x.Type> x y)) yes no)
+(EQ (CMPconst [0] l:(SUBconst [c] x)) yes no) && l.Uses==1 => (EQ (CMPconst [c] x) yes no)
+(EQ (CMPconst [0] l:(SUBshiftLL x y [c])) yes no) && l.Uses==1 => (EQ (CMPshiftLL x y [c]) yes no)
+(EQ (CMPconst [0] l:(SUBshiftRL x y [c])) yes no) && l.Uses==1 => (EQ (CMPshiftRL x y [c]) yes no)
+(EQ (CMPconst [0] l:(SUBshiftRA x y [c])) yes no) && l.Uses==1 => (EQ (CMPshiftRA x y [c]) yes no)
+(EQ (CMPconst [0] l:(SUBshiftLLreg x y z)) yes no) && l.Uses==1 => (EQ (CMPshiftLLreg x y z) yes no)
+(EQ (CMPconst [0] l:(SUBshiftRLreg x y z)) yes no) && l.Uses==1 => (EQ (CMPshiftRLreg x y z) yes no)
+(EQ (CMPconst [0] l:(SUBshiftRAreg x y z)) yes no) && l.Uses==1 => (EQ (CMPshiftRAreg x y z) yes no)
+(NE (CMPconst [0] l:(SUB x y)) yes no) && l.Uses==1 => (NE (CMP x y) yes no)
+(NE (CMPconst [0] l:(MULS x y a)) yes no) && l.Uses==1 => (NE (CMP a (MUL <x.Type> x y)) yes no)
+(NE (CMPconst [0] l:(SUBconst [c] x)) yes no) && l.Uses==1 => (NE (CMPconst [c] x) yes no)
+(NE (CMPconst [0] l:(SUBshiftLL x y [c])) yes no) && l.Uses==1 => (NE (CMPshiftLL x y [c]) yes no)
+(NE (CMPconst [0] l:(SUBshiftRL x y [c])) yes no) && l.Uses==1 => (NE (CMPshiftRL x y [c]) yes no)
+(NE (CMPconst [0] l:(SUBshiftRA x y [c])) yes no) && l.Uses==1 => (NE (CMPshiftRA x y [c]) yes no)
+(NE (CMPconst [0] l:(SUBshiftLLreg x y z)) yes no) && l.Uses==1 => (NE (CMPshiftLLreg x y z) yes no)
+(NE (CMPconst [0] l:(SUBshiftRLreg x y z)) yes no) && l.Uses==1 => (NE (CMPshiftRLreg x y z) yes no)
+(NE (CMPconst [0] l:(SUBshiftRAreg x y z)) yes no) && l.Uses==1 => (NE (CMPshiftRAreg x y z) yes no)
+(EQ (CMPconst [0] l:(ADD x y)) yes no) && l.Uses==1 => (EQ (CMN x y) yes no)
+(EQ (CMPconst [0] l:(MULA x y a)) yes no) && l.Uses==1 => (EQ (CMN a (MUL <x.Type> x y)) yes no)
+(EQ (CMPconst [0] l:(ADDconst [c] x)) yes no) && l.Uses==1 => (EQ (CMNconst [c] x) yes no)
+(EQ (CMPconst [0] l:(ADDshiftLL x y [c])) yes no) && l.Uses==1 => (EQ (CMNshiftLL x y [c]) yes no)
+(EQ (CMPconst [0] l:(ADDshiftRL x y [c])) yes no) && l.Uses==1 => (EQ (CMNshiftRL x y [c]) yes no)
+(EQ (CMPconst [0] l:(ADDshiftRA x y [c])) yes no) && l.Uses==1 => (EQ (CMNshiftRA x y [c]) yes no)
+(EQ (CMPconst [0] l:(ADDshiftLLreg x y z)) yes no) && l.Uses==1 => (EQ (CMNshiftLLreg x y z) yes no)
+(EQ (CMPconst [0] l:(ADDshiftRLreg x y z)) yes no) && l.Uses==1 => (EQ (CMNshiftRLreg x y z) yes no)
+(EQ (CMPconst [0] l:(ADDshiftRAreg x y z)) yes no) && l.Uses==1 => (EQ (CMNshiftRAreg x y z) yes no)
+(NE (CMPconst [0] l:(ADD x y)) yes no) && l.Uses==1 => (NE (CMN x y) yes no)
+(NE (CMPconst [0] l:(MULA x y a)) yes no) && l.Uses==1 => (NE (CMN a (MUL <x.Type> x y)) yes no)
+(NE (CMPconst [0] l:(ADDconst [c] x)) yes no) && l.Uses==1 => (NE (CMNconst [c] x) yes no)
+(NE (CMPconst [0] l:(ADDshiftLL x y [c])) yes no) && l.Uses==1 => (NE (CMNshiftLL x y [c]) yes no)
+(NE (CMPconst [0] l:(ADDshiftRL x y [c])) yes no) && l.Uses==1 => (NE (CMNshiftRL x y [c]) yes no)
+(NE (CMPconst [0] l:(ADDshiftRA x y [c])) yes no) && l.Uses==1 => (NE (CMNshiftRA x y [c]) yes no)
+(NE (CMPconst [0] l:(ADDshiftLLreg x y z)) yes no) && l.Uses==1 => (NE (CMNshiftLLreg x y z) yes no)
+(NE (CMPconst [0] l:(ADDshiftRLreg x y z)) yes no) && l.Uses==1 => (NE (CMNshiftRLreg x y z) yes no)
+(NE (CMPconst [0] l:(ADDshiftRAreg x y z)) yes no) && l.Uses==1 => (NE (CMNshiftRAreg x y z) yes no)
+(EQ (CMPconst [0] l:(AND x y)) yes no) && l.Uses==1 => (EQ (TST x y) yes no)
+(EQ (CMPconst [0] l:(ANDconst [c] x)) yes no) && l.Uses==1 => (EQ (TSTconst [c] x) yes no)
+(EQ (CMPconst [0] l:(ANDshiftLL x y [c])) yes no) && l.Uses==1 => (EQ (TSTshiftLL x y [c]) yes no)
+(EQ (CMPconst [0] l:(ANDshiftRL x y [c])) yes no) && l.Uses==1 => (EQ (TSTshiftRL x y [c]) yes no)
+(EQ (CMPconst [0] l:(ANDshiftRA x y [c])) yes no) && l.Uses==1 => (EQ (TSTshiftRA x y [c]) yes no)
+(EQ (CMPconst [0] l:(ANDshiftLLreg x y z)) yes no) && l.Uses==1 => (EQ (TSTshiftLLreg x y z) yes no)
+(EQ (CMPconst [0] l:(ANDshiftRLreg x y z)) yes no) && l.Uses==1 => (EQ (TSTshiftRLreg x y z) yes no)
+(EQ (CMPconst [0] l:(ANDshiftRAreg x y z)) yes no) && l.Uses==1 => (EQ (TSTshiftRAreg x y z) yes no)
+(NE (CMPconst [0] l:(AND x y)) yes no) && l.Uses==1 => (NE (TST x y) yes no)
+(NE (CMPconst [0] l:(ANDconst [c] x)) yes no) && l.Uses==1 => (NE (TSTconst [c] x) yes no)
+(NE (CMPconst [0] l:(ANDshiftLL x y [c])) yes no) && l.Uses==1 => (NE (TSTshiftLL x y [c]) yes no)
+(NE (CMPconst [0] l:(ANDshiftRL x y [c])) yes no) && l.Uses==1 => (NE (TSTshiftRL x y [c]) yes no)
+(NE (CMPconst [0] l:(ANDshiftRA x y [c])) yes no) && l.Uses==1 => (NE (TSTshiftRA x y [c]) yes no)
+(NE (CMPconst [0] l:(ANDshiftLLreg x y z)) yes no) && l.Uses==1 => (NE (TSTshiftLLreg x y z) yes no)
+(NE (CMPconst [0] l:(ANDshiftRLreg x y z)) yes no) && l.Uses==1 => (NE (TSTshiftRLreg x y z) yes no)
+(NE (CMPconst [0] l:(ANDshiftRAreg x y z)) yes no) && l.Uses==1 => (NE (TSTshiftRAreg x y z) yes no)
+(EQ (CMPconst [0] l:(XOR x y)) yes no) && l.Uses==1 => (EQ (TEQ x y) yes no)
+(EQ (CMPconst [0] l:(XORconst [c] x)) yes no) && l.Uses==1 => (EQ (TEQconst [c] x) yes no)
+(EQ (CMPconst [0] l:(XORshiftLL x y [c])) yes no) && l.Uses==1 => (EQ (TEQshiftLL x y [c]) yes no)
+(EQ (CMPconst [0] l:(XORshiftRL x y [c])) yes no) && l.Uses==1 => (EQ (TEQshiftRL x y [c]) yes no)
+(EQ (CMPconst [0] l:(XORshiftRA x y [c])) yes no) && l.Uses==1 => (EQ (TEQshiftRA x y [c]) yes no)
+(EQ (CMPconst [0] l:(XORshiftLLreg x y z)) yes no) && l.Uses==1 => (EQ (TEQshiftLLreg x y z) yes no)
+(EQ (CMPconst [0] l:(XORshiftRLreg x y z)) yes no) && l.Uses==1 => (EQ (TEQshiftRLreg x y z) yes no)
+(EQ (CMPconst [0] l:(XORshiftRAreg x y z)) yes no) && l.Uses==1 => (EQ (TEQshiftRAreg x y z) yes no)
+(NE (CMPconst [0] l:(XOR x y)) yes no) && l.Uses==1 => (NE (TEQ x y) yes no)
+(NE (CMPconst [0] l:(XORconst [c] x)) yes no) && l.Uses==1 => (NE (TEQconst [c] x) yes no)
+(NE (CMPconst [0] l:(XORshiftLL x y [c])) yes no) && l.Uses==1 => (NE (TEQshiftLL x y [c]) yes no)
+(NE (CMPconst [0] l:(XORshiftRL x y [c])) yes no) && l.Uses==1 => (NE (TEQshiftRL x y [c]) yes no)
+(NE (CMPconst [0] l:(XORshiftRA x y [c])) yes no) && l.Uses==1 => (NE (TEQshiftRA x y [c]) yes no)
+(NE (CMPconst [0] l:(XORshiftLLreg x y z)) yes no) && l.Uses==1 => (NE (TEQshiftLLreg x y z) yes no)
+(NE (CMPconst [0] l:(XORshiftRLreg x y z)) yes no) && l.Uses==1 => (NE (TEQshiftRLreg x y z) yes no)
+(NE (CMPconst [0] l:(XORshiftRAreg x y z)) yes no) && l.Uses==1 => (NE (TEQshiftRAreg x y z) yes no)
+(LT (CMPconst [0] l:(SUB x y)) yes no) && l.Uses==1 => (LTnoov (CMP x y) yes no)
+(LT (CMPconst [0] l:(MULS x y a)) yes no) && l.Uses==1 => (LTnoov (CMP a (MUL <x.Type> x y)) yes no)
+(LT (CMPconst [0] l:(SUBconst [c] x)) yes no) && l.Uses==1 => (LTnoov (CMPconst [c] x) yes no)
+(LT (CMPconst [0] l:(SUBshiftLL x y [c])) yes no) && l.Uses==1 => (LTnoov (CMPshiftLL x y [c]) yes no)
+(LT (CMPconst [0] l:(SUBshiftRL x y [c])) yes no) && l.Uses==1 => (LTnoov (CMPshiftRL x y [c]) yes no)
+(LT (CMPconst [0] l:(SUBshiftRA x y [c])) yes no) && l.Uses==1 => (LTnoov (CMPshiftRA x y [c]) yes no)
+(LT (CMPconst [0] l:(SUBshiftLLreg x y z)) yes no) && l.Uses==1 => (LTnoov (CMPshiftLLreg x y z) yes no)
+(LT (CMPconst [0] l:(SUBshiftRLreg x y z)) yes no) && l.Uses==1 => (LTnoov (CMPshiftRLreg x y z) yes no)
+(LT (CMPconst [0] l:(SUBshiftRAreg x y z)) yes no) && l.Uses==1 => (LTnoov (CMPshiftRAreg x y z) yes no)
+(LE (CMPconst [0] l:(SUB x y)) yes no) && l.Uses==1 => (LEnoov (CMP x y) yes no)
+(LE (CMPconst [0] l:(MULS x y a)) yes no) && l.Uses==1 => (LEnoov (CMP a (MUL <x.Type> x y)) yes no)
+(LE (CMPconst [0] l:(SUBconst [c] x)) yes no) && l.Uses==1 => (LEnoov (CMPconst [c] x) yes no)
+(LE (CMPconst [0] l:(SUBshiftLL x y [c])) yes no) && l.Uses==1 => (LEnoov (CMPshiftLL x y [c]) yes no)
+(LE (CMPconst [0] l:(SUBshiftRL x y [c])) yes no) && l.Uses==1 => (LEnoov (CMPshiftRL x y [c]) yes no)
+(LE (CMPconst [0] l:(SUBshiftRA x y [c])) yes no) && l.Uses==1 => (LEnoov (CMPshiftRA x y [c]) yes no)
+(LE (CMPconst [0] l:(SUBshiftLLreg x y z)) yes no) && l.Uses==1 => (LEnoov (CMPshiftLLreg x y z) yes no)
+(LE (CMPconst [0] l:(SUBshiftRLreg x y z)) yes no) && l.Uses==1 => (LEnoov (CMPshiftRLreg x y z) yes no)
+(LE (CMPconst [0] l:(SUBshiftRAreg x y z)) yes no) && l.Uses==1 => (LEnoov (CMPshiftRAreg x y z) yes no)
+(LT (CMPconst [0] l:(ADD x y)) yes no) && l.Uses==1 => (LTnoov (CMN x y) yes no)
+(LT (CMPconst [0] l:(MULA x y a)) yes no) && l.Uses==1 => (LTnoov (CMN a (MUL <x.Type> x y)) yes no)
+(LT (CMPconst [0] l:(ADDconst [c] x)) yes no) && l.Uses==1 => (LTnoov (CMNconst [c] x) yes no)
+(LT (CMPconst [0] l:(ADDshiftLL x y [c])) yes no) && l.Uses==1 => (LTnoov (CMNshiftLL x y [c]) yes no)
+(LT (CMPconst [0] l:(ADDshiftRL x y [c])) yes no) && l.Uses==1 => (LTnoov (CMNshiftRL x y [c]) yes no)
+(LT (CMPconst [0] l:(ADDshiftRA x y [c])) yes no) && l.Uses==1 => (LTnoov (CMNshiftRA x y [c]) yes no)
+(LT (CMPconst [0] l:(ADDshiftLLreg x y z)) yes no) && l.Uses==1 => (LTnoov (CMNshiftLLreg x y z) yes no)
+(LT (CMPconst [0] l:(ADDshiftRLreg x y z)) yes no) && l.Uses==1 => (LTnoov (CMNshiftRLreg x y z) yes no)
+(LT (CMPconst [0] l:(ADDshiftRAreg x y z)) yes no) && l.Uses==1 => (LTnoov (CMNshiftRAreg x y z) yes no)
+(LE (CMPconst [0] l:(ADD x y)) yes no) && l.Uses==1 => (LEnoov (CMN x y) yes no)
+(LE (CMPconst [0] l:(MULA x y a)) yes no) && l.Uses==1 => (LEnoov (CMN a (MUL <x.Type> x y)) yes no)
+(LE (CMPconst [0] l:(ADDconst [c] x)) yes no) && l.Uses==1 => (LEnoov (CMNconst [c] x) yes no)
+(LE (CMPconst [0] l:(ADDshiftLL x y [c])) yes no) && l.Uses==1 => (LEnoov (CMNshiftLL x y [c]) yes no)
+(LE (CMPconst [0] l:(ADDshiftRL x y [c])) yes no) && l.Uses==1 => (LEnoov (CMNshiftRL x y [c]) yes no)
+(LE (CMPconst [0] l:(ADDshiftRA x y [c])) yes no) && l.Uses==1 => (LEnoov (CMNshiftRA x y [c]) yes no)
+(LE (CMPconst [0] l:(ADDshiftLLreg x y z)) yes no) && l.Uses==1 => (LEnoov (CMNshiftLLreg x y z) yes no)
+(LE (CMPconst [0] l:(ADDshiftRLreg x y z)) yes no) && l.Uses==1 => (LEnoov (CMNshiftRLreg x y z) yes no)
+(LE (CMPconst [0] l:(ADDshiftRAreg x y z)) yes no) && l.Uses==1 => (LEnoov (CMNshiftRAreg x y z) yes no)
+(LT (CMPconst [0] l:(AND x y)) yes no) && l.Uses==1 => (LTnoov (TST x y) yes no)
+(LT (CMPconst [0] l:(ANDconst [c] x)) yes no) && l.Uses==1 => (LTnoov (TSTconst [c] x) yes no)
+(LT (CMPconst [0] l:(ANDshiftLL x y [c])) yes no) && l.Uses==1 => (LTnoov (TSTshiftLL x y [c]) yes no)
+(LT (CMPconst [0] l:(ANDshiftRL x y [c])) yes no) && l.Uses==1 => (LTnoov (TSTshiftRL x y [c]) yes no)
+(LT (CMPconst [0] l:(ANDshiftRA x y [c])) yes no) && l.Uses==1 => (LTnoov (TSTshiftRA x y [c]) yes no)
+(LT (CMPconst [0] l:(ANDshiftLLreg x y z)) yes no) && l.Uses==1 => (LTnoov (TSTshiftLLreg x y z) yes no)
+(LT (CMPconst [0] l:(ANDshiftRLreg x y z)) yes no) && l.Uses==1 => (LTnoov (TSTshiftRLreg x y z) yes no)
+(LT (CMPconst [0] l:(ANDshiftRAreg x y z)) yes no) && l.Uses==1 => (LTnoov (TSTshiftRAreg x y z) yes no)
+(LE (CMPconst [0] l:(AND x y)) yes no) && l.Uses==1 => (LEnoov (TST x y) yes no)
+(LE (CMPconst [0] l:(ANDconst [c] x)) yes no) && l.Uses==1 => (LEnoov (TSTconst [c] x) yes no)
+(LE (CMPconst [0] l:(ANDshiftLL x y [c])) yes no) && l.Uses==1 => (LEnoov (TSTshiftLL x y [c]) yes no)
+(LE (CMPconst [0] l:(ANDshiftRL x y [c])) yes no) && l.Uses==1 => (LEnoov (TSTshiftRL x y [c]) yes no)
+(LE (CMPconst [0] l:(ANDshiftRA x y [c])) yes no) && l.Uses==1 => (LEnoov (TSTshiftRA x y [c]) yes no)
+(LE (CMPconst [0] l:(ANDshiftLLreg x y z)) yes no) && l.Uses==1 => (LEnoov (TSTshiftLLreg x y z) yes no)
+(LE (CMPconst [0] l:(ANDshiftRLreg x y z)) yes no) && l.Uses==1 => (LEnoov (TSTshiftRLreg x y z) yes no)
+(LE (CMPconst [0] l:(ANDshiftRAreg x y z)) yes no) && l.Uses==1 => (LEnoov (TSTshiftRAreg x y z) yes no)
+(LT (CMPconst [0] l:(XOR x y)) yes no) && l.Uses==1 => (LTnoov (TEQ x y) yes no)
+(LT (CMPconst [0] l:(XORconst [c] x)) yes no) && l.Uses==1 => (LTnoov (TEQconst [c] x) yes no)
+(LT (CMPconst [0] l:(XORshiftLL x y [c])) yes no) && l.Uses==1 => (LTnoov (TEQshiftLL x y [c]) yes no)
+(LT (CMPconst [0] l:(XORshiftRL x y [c])) yes no) && l.Uses==1 => (LTnoov (TEQshiftRL x y [c]) yes no)
+(LT (CMPconst [0] l:(XORshiftRA x y [c])) yes no) && l.Uses==1 => (LTnoov (TEQshiftRA x y [c]) yes no)
+(LT (CMPconst [0] l:(XORshiftLLreg x y z)) yes no) && l.Uses==1 => (LTnoov (TEQshiftLLreg x y z) yes no)
+(LT (CMPconst [0] l:(XORshiftRLreg x y z)) yes no) && l.Uses==1 => (LTnoov (TEQshiftRLreg x y z) yes no)
+(LT (CMPconst [0] l:(XORshiftRAreg x y z)) yes no) && l.Uses==1 => (LTnoov (TEQshiftRAreg x y z) yes no)
+(LE (CMPconst [0] l:(XOR x y)) yes no) && l.Uses==1 => (LEnoov (TEQ x y) yes no)
+(LE (CMPconst [0] l:(XORconst [c] x)) yes no) && l.Uses==1 => (LEnoov (TEQconst [c] x) yes no)
+(LE (CMPconst [0] l:(XORshiftLL x y [c])) yes no) && l.Uses==1 => (LEnoov (TEQshiftLL x y [c]) yes no)
+(LE (CMPconst [0] l:(XORshiftRL x y [c])) yes no) && l.Uses==1 => (LEnoov (TEQshiftRL x y [c]) yes no)
+(LE (CMPconst [0] l:(XORshiftRA x y [c])) yes no) && l.Uses==1 => (LEnoov (TEQshiftRA x y [c]) yes no)
+(LE (CMPconst [0] l:(XORshiftLLreg x y z)) yes no) && l.Uses==1 => (LEnoov (TEQshiftLLreg x y z) yes no)
+(LE (CMPconst [0] l:(XORshiftRLreg x y z)) yes no) && l.Uses==1 => (LEnoov (TEQshiftRLreg x y z) yes no)
+(LE (CMPconst [0] l:(XORshiftRAreg x y z)) yes no) && l.Uses==1 => (LEnoov (TEQshiftRAreg x y z) yes no)
+(GT (CMPconst [0] l:(SUB x y)) yes no) && l.Uses==1 => (GTnoov (CMP x y) yes no)
+(GT (CMPconst [0] l:(MULS x y a)) yes no) && l.Uses==1 => (GTnoov (CMP a (MUL <x.Type> x y)) yes no)
+(GT (CMPconst [0] l:(SUBconst [c] x)) yes no) && l.Uses==1 => (GTnoov (CMPconst [c] x) yes no)
+(GT (CMPconst [0] l:(SUBshiftLL x y [c])) yes no) && l.Uses==1 => (GTnoov (CMPshiftLL x y [c]) yes no)
+(GT (CMPconst [0] l:(SUBshiftRL x y [c])) yes no) && l.Uses==1 => (GTnoov (CMPshiftRL x y [c]) yes no)
+(GT (CMPconst [0] l:(SUBshiftRA x y [c])) yes no) && l.Uses==1 => (GTnoov (CMPshiftRA x y [c]) yes no)
+(GT (CMPconst [0] l:(SUBshiftLLreg x y z)) yes no) && l.Uses==1 => (GTnoov (CMPshiftLLreg x y z) yes no)
+(GT (CMPconst [0] l:(SUBshiftRLreg x y z)) yes no) && l.Uses==1 => (GTnoov (CMPshiftRLreg x y z) yes no)
+(GT (CMPconst [0] l:(SUBshiftRAreg x y z)) yes no) && l.Uses==1 => (GTnoov (CMPshiftRAreg x y z) yes no)
+(GE (CMPconst [0] l:(SUB x y)) yes no) && l.Uses==1 => (GEnoov (CMP x y) yes no)
+(GE (CMPconst [0] l:(MULS x y a)) yes no) && l.Uses==1 => (GEnoov (CMP a (MUL <x.Type> x y)) yes no)
+(GE (CMPconst [0] l:(SUBconst [c] x)) yes no) && l.Uses==1 => (GEnoov (CMPconst [c] x) yes no)
+(GE (CMPconst [0] l:(SUBshiftLL x y [c])) yes no) && l.Uses==1 => (GEnoov (CMPshiftLL x y [c]) yes no)
+(GE (CMPconst [0] l:(SUBshiftRL x y [c])) yes no) && l.Uses==1 => (GEnoov (CMPshiftRL x y [c]) yes no)
+(GE (CMPconst [0] l:(SUBshiftRA x y [c])) yes no) && l.Uses==1 => (GEnoov (CMPshiftRA x y [c]) yes no)
+(GE (CMPconst [0] l:(SUBshiftLLreg x y z)) yes no) && l.Uses==1 => (GEnoov (CMPshiftLLreg x y z) yes no)
+(GE (CMPconst [0] l:(SUBshiftRLreg x y z)) yes no) && l.Uses==1 => (GEnoov (CMPshiftRLreg x y z) yes no)
+(GE (CMPconst [0] l:(SUBshiftRAreg x y z)) yes no) && l.Uses==1 => (GEnoov (CMPshiftRAreg x y z) yes no)
+(GT (CMPconst [0] l:(ADD x y)) yes no) && l.Uses==1 => (GTnoov (CMN x y) yes no)
+(GT (CMPconst [0] l:(ADDconst [c] x)) yes no) && l.Uses==1 => (GTnoov (CMNconst [c] x) yes no)
+(GT (CMPconst [0] l:(ADDshiftLL x y [c])) yes no) && l.Uses==1 => (GTnoov (CMNshiftLL x y [c]) yes no)
+(GT (CMPconst [0] l:(ADDshiftRL x y [c])) yes no) && l.Uses==1 => (GTnoov (CMNshiftRL x y [c]) yes no)
+(GT (CMPconst [0] l:(ADDshiftRA x y [c])) yes no) && l.Uses==1 => (GTnoov (CMNshiftRA x y [c]) yes no)
+(GT (CMPconst [0] l:(ADDshiftLLreg x y z)) yes no) && l.Uses==1 => (GTnoov (CMNshiftLLreg x y z) yes no)
+(GT (CMPconst [0] l:(ADDshiftRLreg x y z)) yes no) && l.Uses==1 => (GTnoov (CMNshiftRLreg x y z) yes no)
+(GT (CMPconst [0] l:(ADDshiftRAreg x y z)) yes no) && l.Uses==1 => (GTnoov (CMNshiftRAreg x y z) yes no)
+(GE (CMPconst [0] l:(ADD x y)) yes no) && l.Uses==1 => (GEnoov (CMN x y) yes no)
+(GE (CMPconst [0] l:(MULA x y a)) yes no) && l.Uses==1 => (GEnoov (CMN a (MUL <x.Type> x y)) yes no)
+(GE (CMPconst [0] l:(ADDconst [c] x)) yes no) && l.Uses==1 => (GEnoov (CMNconst [c] x) yes no)
+(GE (CMPconst [0] l:(ADDshiftLL x y [c])) yes no) && l.Uses==1 => (GEnoov (CMNshiftLL x y [c]) yes no)
+(GE (CMPconst [0] l:(ADDshiftRL x y [c])) yes no) && l.Uses==1 => (GEnoov (CMNshiftRL x y [c]) yes no)
+(GE (CMPconst [0] l:(ADDshiftRA x y [c])) yes no) && l.Uses==1 => (GEnoov (CMNshiftRA x y [c]) yes no)
+(GE (CMPconst [0] l:(ADDshiftLLreg x y z)) yes no) && l.Uses==1 => (GEnoov (CMNshiftLLreg x y z) yes no)
+(GE (CMPconst [0] l:(ADDshiftRLreg x y z)) yes no) && l.Uses==1 => (GEnoov (CMNshiftRLreg x y z) yes no)
+(GE (CMPconst [0] l:(ADDshiftRAreg x y z)) yes no) && l.Uses==1 => (GEnoov (CMNshiftRAreg x y z) yes no)
+(GT (CMPconst [0] l:(MULA x y a)) yes no) && l.Uses==1 => (GTnoov (CMN a (MUL <x.Type> x y)) yes no)
+(GT (CMPconst [0] l:(AND x y)) yes no) && l.Uses==1 => (GTnoov (TST x y) yes no)
+(GT (CMPconst [0] l:(ANDconst [c] x)) yes no) && l.Uses==1 => (GTnoov (TSTconst [c] x) yes no)
+(GT (CMPconst [0] l:(ANDshiftLL x y [c])) yes no) && l.Uses==1 => (GTnoov (TSTshiftLL x y [c]) yes no)
+(GT (CMPconst [0] l:(ANDshiftRL x y [c])) yes no) && l.Uses==1 => (GTnoov (TSTshiftRL x y [c]) yes no)
+(GT (CMPconst [0] l:(ANDshiftRA x y [c])) yes no) && l.Uses==1 => (GTnoov (TSTshiftRA x y [c]) yes no)
+(GT (CMPconst [0] l:(ANDshiftLLreg x y z)) yes no) && l.Uses==1 => (GTnoov (TSTshiftLLreg x y z) yes no)
+(GT (CMPconst [0] l:(ANDshiftRLreg x y z)) yes no) && l.Uses==1 => (GTnoov (TSTshiftRLreg x y z) yes no)
+(GT (CMPconst [0] l:(ANDshiftRAreg x y z)) yes no) && l.Uses==1 => (GTnoov (TSTshiftRAreg x y z) yes no)
+(GE (CMPconst [0] l:(AND x y)) yes no) && l.Uses==1 => (GEnoov (TST x y) yes no)
+(GE (CMPconst [0] l:(ANDconst [c] x)) yes no) && l.Uses==1 => (GEnoov (TSTconst [c] x) yes no)
+(GE (CMPconst [0] l:(ANDshiftLL x y [c])) yes no) && l.Uses==1 => (GEnoov (TSTshiftLL x y [c]) yes no)
+(GE (CMPconst [0] l:(ANDshiftRL x y [c])) yes no) && l.Uses==1 => (GEnoov (TSTshiftRL x y [c]) yes no)
+(GE (CMPconst [0] l:(ANDshiftRA x y [c])) yes no) && l.Uses==1 => (GEnoov (TSTshiftRA x y [c]) yes no)
+(GE (CMPconst [0] l:(ANDshiftLLreg x y z)) yes no) && l.Uses==1 => (GEnoov (TSTshiftLLreg x y z) yes no)
+(GE (CMPconst [0] l:(ANDshiftRLreg x y z)) yes no) && l.Uses==1 => (GEnoov (TSTshiftRLreg x y z) yes no)
+(GE (CMPconst [0] l:(ANDshiftRAreg x y z)) yes no) && l.Uses==1 => (GEnoov (TSTshiftRAreg x y z) yes no)
+(GT (CMPconst [0] l:(XOR x y)) yes no) && l.Uses==1 => (GTnoov (TEQ x y) yes no)
+(GT (CMPconst [0] l:(XORconst [c] x)) yes no) && l.Uses==1 => (GTnoov (TEQconst [c] x) yes no)
+(GT (CMPconst [0] l:(XORshiftLL x y [c])) yes no) && l.Uses==1 => (GTnoov (TEQshiftLL x y [c]) yes no)
+(GT (CMPconst [0] l:(XORshiftRL x y [c])) yes no) && l.Uses==1 => (GTnoov (TEQshiftRL x y [c]) yes no)
+(GT (CMPconst [0] l:(XORshiftRA x y [c])) yes no) && l.Uses==1 => (GTnoov (TEQshiftRA x y [c]) yes no)
+(GT (CMPconst [0] l:(XORshiftLLreg x y z)) yes no) && l.Uses==1 => (GTnoov (TEQshiftLLreg x y z) yes no)
+(GT (CMPconst [0] l:(XORshiftRLreg x y z)) yes no) && l.Uses==1 => (GTnoov (TEQshiftRLreg x y z) yes no)
+(GT (CMPconst [0] l:(XORshiftRAreg x y z)) yes no) && l.Uses==1 => (GTnoov (TEQshiftRAreg x y z) yes no)
+(GE (CMPconst [0] l:(XOR x y)) yes no) && l.Uses==1 => (GEnoov (TEQ x y) yes no)
+(GE (CMPconst [0] l:(XORconst [c] x)) yes no) && l.Uses==1 => (GEnoov (TEQconst [c] x) yes no)
+(GE (CMPconst [0] l:(XORshiftLL x y [c])) yes no) && l.Uses==1 => (GEnoov (TEQshiftLL x y [c]) yes no)
+(GE (CMPconst [0] l:(XORshiftRL x y [c])) yes no) && l.Uses==1 => (GEnoov (TEQshiftRL x y [c]) yes no)
+(GE (CMPconst [0] l:(XORshiftRA x y [c])) yes no) && l.Uses==1 => (GEnoov (TEQshiftRA x y [c]) yes no)
+(GE (CMPconst [0] l:(XORshiftLLreg x y z)) yes no) && l.Uses==1 => (GEnoov (TEQshiftLLreg x y z) yes no)
+(GE (CMPconst [0] l:(XORshiftRLreg x y z)) yes no) && l.Uses==1 => (GEnoov (TEQshiftRLreg x y z) yes no)
+(GE (CMPconst [0] l:(XORshiftRAreg x y z)) yes no) && l.Uses==1 => (GEnoov (TEQshiftRAreg x y z) yes no)
+
+(MOVBUload [off] {sym} (SB) _) && symIsRO(sym) => (MOVWconst [int32(read8(sym, int64(off)))])
+(MOVHUload [off] {sym} (SB) _) && symIsRO(sym) => (MOVWconst [int32(read16(sym, int64(off), config.ctxt.Arch.ByteOrder))])
+(MOVWload [off] {sym} (SB) _) && symIsRO(sym) => (MOVWconst [int32(read32(sym, int64(off), config.ctxt.Arch.ByteOrder))])
diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64.rules b/src/cmd/compile/internal/ssa/_gen/ARM64.rules
new file mode 100644
index 0000000..78a8492
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/ARM64.rules
@@ -0,0 +1,3030 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+(Add(Ptr|64|32|16|8) ...) => (ADD ...)
+(Add(32F|64F) ...) => (FADD(S|D) ...)
+
+(Sub(Ptr|64|32|16|8) ...) => (SUB ...)
+(Sub(32F|64F) ...) => (FSUB(S|D) ...)
+
+(Mul64 ...) => (MUL ...)
+(Mul(32|16|8) ...) => (MULW ...)
+(Mul(32F|64F) ...) => (FMUL(S|D) ...)
+
+(Hmul64 ...) => (MULH ...)
+(Hmul64u ...) => (UMULH ...)
+(Hmul32 x y) => (SRAconst (MULL <typ.Int64> x y) [32])
+(Hmul32u x y) => (SRAconst (UMULL <typ.UInt64> x y) [32])
+(Select0 (Mul64uhilo x y)) => (UMULH x y)
+(Select1 (Mul64uhilo x y)) => (MUL x y)
+
+(Div64 [false] x y) => (DIV x y)
+(Div64u ...) => (UDIV ...)
+(Div32 [false] x y) => (DIVW x y)
+(Div32u ...) => (UDIVW ...)
+(Div16 [false] x y) => (DIVW (SignExt16to32 x) (SignExt16to32 y))
+(Div16u x y) => (UDIVW (ZeroExt16to32 x) (ZeroExt16to32 y))
+(Div8 x y) => (DIVW (SignExt8to32 x) (SignExt8to32 y))
+(Div8u x y) => (UDIVW (ZeroExt8to32 x) (ZeroExt8to32 y))
+(Div32F ...) => (FDIVS ...)
+(Div64F ...) => (FDIVD ...)
+
+(Mod64 x y) => (MOD x y)
+(Mod64u ...) => (UMOD ...)
+(Mod32 x y) => (MODW x y)
+(Mod32u ...) => (UMODW ...)
+(Mod16 x y) => (MODW (SignExt16to32 x) (SignExt16to32 y))
+(Mod16u x y) => (UMODW (ZeroExt16to32 x) (ZeroExt16to32 y))
+(Mod8 x y) => (MODW (SignExt8to32 x) (SignExt8to32 y))
+(Mod8u x y) => (UMODW (ZeroExt8to32 x) (ZeroExt8to32 y))
+
+// (x + y) / 2 with x>=y => (x - y) / 2 + y
+(Avg64u <t> x y) => (ADD (SRLconst <t> (SUB <t> x y) [1]) y)
+
+(And(64|32|16|8) ...) => (AND ...)
+(Or(64|32|16|8) ...) => (OR ...)
+(Xor(64|32|16|8) ...) => (XOR ...)
+
+// unary ops
+(Neg(64|32|16|8) ...) => (NEG ...)
+(Neg(32F|64F) ...) => (FNEG(S|D) ...)
+(Com(64|32|16|8) ...) => (MVN ...)
+
+// math package intrinsics
+(Abs ...) => (FABSD ...)
+(Sqrt ...) => (FSQRTD ...)
+(Ceil ...) => (FRINTPD ...)
+(Floor ...) => (FRINTMD ...)
+(Round ...) => (FRINTAD ...)
+(RoundToEven ...) => (FRINTND ...)
+(Trunc ...) => (FRINTZD ...)
+(FMA x y z) => (FMADDD z x y)
+
+(Sqrt32 ...) => (FSQRTS ...)
+
+// lowering rotates
+// We do rotate detection in generic rules; if the following rules need to be changed, check generic rules first.
+(RotateLeft8 <t> x (MOVDconst [c])) => (Or8 (Lsh8x64 <t> x (MOVDconst [c&7])) (Rsh8Ux64 <t> x (MOVDconst [-c&7])))
+(RotateLeft8 <t> x y) => (OR <t> (SLL <t> x (ANDconst <typ.Int64> [7] y)) (SRL <t> (ZeroExt8to64 x) (ANDconst <typ.Int64> [7] (NEG <typ.Int64> y))))
+(RotateLeft16 <t> x (MOVDconst [c])) => (Or16 (Lsh16x64 <t> x (MOVDconst [c&15])) (Rsh16Ux64 <t> x (MOVDconst [-c&15])))
+(RotateLeft16 <t> x y) => (RORW <t> (ORshiftLL <typ.UInt32> (ZeroExt16to32 x) (ZeroExt16to32 x) [16]) (NEG <typ.Int64> y))
+(RotateLeft32 x y) => (RORW x (NEG <y.Type> y))
+(RotateLeft64 x y) => (ROR x (NEG <y.Type> y))
+
+(Ctz(64|32|16|8)NonZero ...) => (Ctz(64|32|32|32) ...)
+
+(Ctz64 <t> x) => (CLZ (RBIT <t> x))
+(Ctz32 <t> x) => (CLZW (RBITW <t> x))
+(Ctz16 <t> x) => (CLZW <t> (RBITW <typ.UInt32> (ORconst <typ.UInt32> [0x10000] x)))
+(Ctz8 <t> x) => (CLZW <t> (RBITW <typ.UInt32> (ORconst <typ.UInt32> [0x100] x)))
+
+(PopCount64 <t> x) => (FMOVDfpgp <t> (VUADDLV <typ.Float64> (VCNT <typ.Float64> (FMOVDgpfp <typ.Float64> x))))
+(PopCount32 <t> x) => (FMOVDfpgp <t> (VUADDLV <typ.Float64> (VCNT <typ.Float64> (FMOVDgpfp <typ.Float64> (ZeroExt32to64 x)))))
+(PopCount16 <t> x) => (FMOVDfpgp <t> (VUADDLV <typ.Float64> (VCNT <typ.Float64> (FMOVDgpfp <typ.Float64> (ZeroExt16to64 x)))))
+
+// Load args directly into the register class where they will be used.
+(FMOVDgpfp <t> (Arg [off] {sym})) => @b.Func.Entry (Arg <t> [off] {sym})
+(FMOVDfpgp <t> (Arg [off] {sym})) => @b.Func.Entry (Arg <t> [off] {sym})
+
+// Similarly for stores, if we see a store after FPR <=> GPR move, then redirect store to use the other register set.
+(MOVDstore [off] {sym} ptr (FMOVDfpgp val) mem) => (FMOVDstore [off] {sym} ptr val mem)
+(FMOVDstore [off] {sym} ptr (FMOVDgpfp val) mem) => (MOVDstore [off] {sym} ptr val mem)
+(MOVWstore [off] {sym} ptr (FMOVSfpgp val) mem) => (FMOVSstore [off] {sym} ptr val mem)
+(FMOVSstore [off] {sym} ptr (FMOVSgpfp val) mem) => (MOVWstore [off] {sym} ptr val mem)
+
+// float <=> int register moves, with no conversion.
+// These come up when compiling math.{Float64bits, Float64frombits, Float32bits, Float32frombits}.
+(MOVDload [off] {sym} ptr (FMOVDstore [off] {sym} ptr val _)) => (FMOVDfpgp val)
+(FMOVDload [off] {sym} ptr (MOVDstore [off] {sym} ptr val _)) => (FMOVDgpfp val)
+(MOVWUload [off] {sym} ptr (FMOVSstore [off] {sym} ptr val _)) => (FMOVSfpgp val)
+(FMOVSload [off] {sym} ptr (MOVWstore [off] {sym} ptr val _)) => (FMOVSgpfp val)
+
+(BitLen64 x) => (SUB (MOVDconst [64]) (CLZ <typ.Int> x))
+(BitLen32 x) => (SUB (MOVDconst [32]) (CLZW <typ.Int> x))
+
+(Bswap64 ...) => (REV ...)
+(Bswap32 ...) => (REVW ...)
+
+(BitRev64 ...) => (RBIT ...)
+(BitRev32 ...) => (RBITW ...)
+(BitRev16 x) => (SRLconst [48] (RBIT <typ.UInt64> x))
+(BitRev8 x) => (SRLconst [56] (RBIT <typ.UInt64> x))
+
+// In fact, UMOD will be translated into UREM instruction, and UREM is originally translated into
+// UDIV and MSUB instructions. But if there is already an identical UDIV instruction just before or
+// after UREM (case like quo, rem := z/y, z%y), then the second UDIV instruction becomes redundant.
+// The purpose of this rule is to have this extra UDIV instruction removed in CSE pass.
+(UMOD <typ.UInt64> x y) => (MSUB <typ.UInt64> x y (UDIV <typ.UInt64> x y))
+(UMODW <typ.UInt32> x y) => (MSUBW <typ.UInt32> x y (UDIVW <typ.UInt32> x y))
+
+// 64-bit addition with carry.
+(Select0 (Add64carry x y c)) => (Select0 <typ.UInt64> (ADCSflags x y (Select1 <types.TypeFlags> (ADDSconstflags [-1] c))))
+(Select1 (Add64carry x y c)) => (ADCzerocarry <typ.UInt64> (Select1 <types.TypeFlags> (ADCSflags x y (Select1 <types.TypeFlags> (ADDSconstflags [-1] c)))))
+
+// 64-bit subtraction with borrowing.
+(Select0 (Sub64borrow x y bo)) => (Select0 <typ.UInt64> (SBCSflags x y (Select1 <types.TypeFlags> (NEGSflags bo))))
+(Select1 (Sub64borrow x y bo)) => (NEG <typ.UInt64> (NGCzerocarry <typ.UInt64> (Select1 <types.TypeFlags> (SBCSflags x y (Select1 <types.TypeFlags> (NEGSflags bo))))))
+
+// boolean ops -- booleans are represented with 0=false, 1=true
+(AndB ...) => (AND ...)
+(OrB ...) => (OR ...)
+(EqB x y) => (XOR (MOVDconst [1]) (XOR <typ.Bool> x y))
+(NeqB ...) => (XOR ...)
+(Not x) => (XOR (MOVDconst [1]) x)
+
+// shifts
+// hardware instruction uses only the low 6 bits of the shift
+// we compare to 64 to ensure Go semantics for large shifts
+// Rules about rotates with non-const shift are based on the following rules,
+// if the following rules change, please also modify the rules based on them.
+
+// check shiftIsBounded first, if shift value is proved to be valid then we
+// can do the shift directly.
+// left shift
+(Lsh(64|32|16|8)x64 <t> x y) && shiftIsBounded(v) => (SLL <t> x y)
+(Lsh(64|32|16|8)x32 <t> x y) && shiftIsBounded(v) => (SLL <t> x y)
+(Lsh(64|32|16|8)x16 <t> x y) && shiftIsBounded(v) => (SLL <t> x y)
+(Lsh(64|32|16|8)x8 <t> x y) && shiftIsBounded(v) => (SLL <t> x y)
+
+// signed right shift
+(Rsh64x(64|32|16|8) <t> x y) && shiftIsBounded(v) => (SRA <t> x y)
+(Rsh32x(64|32|16|8) <t> x y) && shiftIsBounded(v) => (SRA <t> (SignExt32to64 x) y)
+(Rsh16x(64|32|16|8) <t> x y) && shiftIsBounded(v) => (SRA <t> (SignExt16to64 x) y)
+(Rsh8x(64|32|16|8) <t> x y) && shiftIsBounded(v) => (SRA <t> (SignExt8to64 x) y)
+
+// unsigned right shift
+(Rsh64Ux(64|32|16|8) <t> x y) && shiftIsBounded(v) => (SRL <t> x y)
+(Rsh32Ux(64|32|16|8) <t> x y) && shiftIsBounded(v) => (SRL <t> (ZeroExt32to64 x) y)
+(Rsh16Ux(64|32|16|8) <t> x y) && shiftIsBounded(v) => (SRL <t> (ZeroExt16to64 x) y)
+(Rsh8Ux(64|32|16|8) <t> x y) && shiftIsBounded(v) => (SRL <t> (ZeroExt8to64 x) y)
+
+// shift value may be out of range, use CMP + CSEL instead
+(Lsh64x64 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] y))
+(Lsh64x32 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Lsh64x16 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Lsh64x8 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+(Lsh32x64 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] y))
+(Lsh32x32 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Lsh32x16 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Lsh32x8 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+(Lsh16x64 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] y))
+(Lsh16x32 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Lsh16x16 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Lsh16x8 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+(Lsh8x64 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] y))
+(Lsh8x32 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Lsh8x16 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Lsh8x8 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+(Rsh64Ux64 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> x y) (Const64 <t> [0]) (CMPconst [64] y))
+(Rsh64Ux32 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Rsh64Ux16 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Rsh64Ux8 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+(Rsh32Ux64 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt32to64 x) y) (Const64 <t> [0]) (CMPconst [64] y))
+(Rsh32Ux32 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt32to64 x) y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Rsh32Ux16 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt32to64 x) y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Rsh32Ux8 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt32to64 x) y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+(Rsh16Ux64 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt16to64 x) y) (Const64 <t> [0]) (CMPconst [64] y))
+(Rsh16Ux32 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt16to64 x) y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Rsh16Ux16 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt16to64 x) y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Rsh16Ux8 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt16to64 x) y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+(Rsh8Ux64 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt8to64 x) y) (Const64 <t> [0]) (CMPconst [64] y))
+(Rsh8Ux32 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt8to64 x) y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Rsh8Ux16 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt8to64 x) y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Rsh8Ux8 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt8to64 x) y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+// Signed right shifts by a shift amount that is not known to be bounded.
+// Instead of selecting between two results, the shift amount itself is
+// clamped: the CSEL on LessThanU passes y through when the (zero-extended)
+// amount compares unsigned-less-than 64 and substitutes the constant 63
+// otherwise. SRA is then applied to x, which is first sign-extended to
+// 64 bits for the 32-, 16- and 8-bit variants.
+(Rsh64x64 x y) && !shiftIsBounded(v) => (SRA x (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] y)))
+(Rsh64x32 x y) && !shiftIsBounded(v) => (SRA x (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt32to64 y))))
+(Rsh64x16 x y) && !shiftIsBounded(v) => (SRA x (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt16to64 y))))
+(Rsh64x8 x y) && !shiftIsBounded(v) => (SRA x (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt8to64 y))))
+
+(Rsh32x64 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] y)))
+(Rsh32x32 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt32to64 y))))
+(Rsh32x16 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt16to64 y))))
+(Rsh32x8 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt8to64 y))))
+
+(Rsh16x64 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] y)))
+(Rsh16x32 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt32to64 y))))
+(Rsh16x16 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt16to64 y))))
+(Rsh16x8 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt8to64 y))))
+
+(Rsh8x64 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] y)))
+(Rsh8x32 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt32to64 y))))
+(Rsh8x16 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt16to64 y))))
+(Rsh8x8 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt8to64 y))))
+
+// constants
+(Const(64|32|16|8) [val]) => (MOVDconst [int64(val)])
+(Const(32F|64F) [val]) => (FMOV(S|D)const [float64(val)])
+(ConstNil) => (MOVDconst [0])
+(ConstBool [t]) => (MOVDconst [b2i(t)])
+
+(Slicemask <t> x) => (SRAconst (NEG <t> x) [63])
+
+// truncations
+// Because we ignore high parts of registers, truncates are just copies.
+(Trunc16to8 ...) => (Copy ...)
+(Trunc32to8 ...) => (Copy ...)
+(Trunc32to16 ...) => (Copy ...)
+(Trunc64to8 ...) => (Copy ...)
+(Trunc64to16 ...) => (Copy ...)
+(Trunc64to32 ...) => (Copy ...)
+
+// Zero-/Sign-extensions
+(ZeroExt8to16 ...) => (MOVBUreg ...)
+(ZeroExt8to32 ...) => (MOVBUreg ...)
+(ZeroExt16to32 ...) => (MOVHUreg ...)
+(ZeroExt8to64 ...) => (MOVBUreg ...)
+(ZeroExt16to64 ...) => (MOVHUreg ...)
+(ZeroExt32to64 ...) => (MOVWUreg ...)
+
+(SignExt8to16 ...) => (MOVBreg ...)
+(SignExt8to32 ...) => (MOVBreg ...)
+(SignExt16to32 ...) => (MOVHreg ...)
+(SignExt8to64 ...) => (MOVBreg ...)
+(SignExt16to64 ...) => (MOVHreg ...)
+(SignExt32to64 ...) => (MOVWreg ...)
+
+// float <=> int conversion
+(Cvt32to32F ...) => (SCVTFWS ...)
+(Cvt32to64F ...) => (SCVTFWD ...)
+(Cvt64to32F ...) => (SCVTFS ...)
+(Cvt64to64F ...) => (SCVTFD ...)
+(Cvt32Uto32F ...) => (UCVTFWS ...)
+(Cvt32Uto64F ...) => (UCVTFWD ...)
+(Cvt64Uto32F ...) => (UCVTFS ...)
+(Cvt64Uto64F ...) => (UCVTFD ...)
+(Cvt32Fto32 ...) => (FCVTZSSW ...)
+(Cvt64Fto32 ...) => (FCVTZSDW ...)
+(Cvt32Fto64 ...) => (FCVTZSS ...)
+(Cvt64Fto64 ...) => (FCVTZSD ...)
+(Cvt32Fto32U ...) => (FCVTZUSW ...)
+(Cvt64Fto32U ...) => (FCVTZUDW ...)
+(Cvt32Fto64U ...) => (FCVTZUS ...)
+(Cvt64Fto64U ...) => (FCVTZUD ...)
+(Cvt32Fto64F ...) => (FCVTSD ...)
+(Cvt64Fto32F ...) => (FCVTDS ...)
+
+(CvtBoolToUint8 ...) => (Copy ...)
+
+(Round32F ...) => (LoweredRound32F ...)
+(Round64F ...) => (LoweredRound64F ...)
+
+// comparisons
+(Eq8 x y) => (Equal (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Eq16 x y) => (Equal (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Eq32 x y) => (Equal (CMPW x y))
+(Eq64 x y) => (Equal (CMP x y))
+(EqPtr x y) => (Equal (CMP x y))
+(Eq32F x y) => (Equal (FCMPS x y))
+(Eq64F x y) => (Equal (FCMPD x y))
+
+(Neq8 x y) => (NotEqual (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Neq16 x y) => (NotEqual (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Neq32 x y) => (NotEqual (CMPW x y))
+(Neq64 x y) => (NotEqual (CMP x y))
+(NeqPtr x y) => (NotEqual (CMP x y))
+(Neq32F x y) => (NotEqual (FCMPS x y))
+(Neq64F x y) => (NotEqual (FCMPD x y))
+
+(Less8 x y) => (LessThan (CMPW (SignExt8to32 x) (SignExt8to32 y)))
+(Less16 x y) => (LessThan (CMPW (SignExt16to32 x) (SignExt16to32 y)))
+(Less32 x y) => (LessThan (CMPW x y))
+(Less64 x y) => (LessThan (CMP x y))
+
+// Floating-point comparisons "x < y" and "x <= y" use the dedicated
+// flag readers LessThanF and LessEqualF. If either or both operands
+// are NaN, all three of (x < y), (x == y) and (x > y) are false, and
+// the ARM manual says that in this case the FCMP instruction sets
+// PSTATE.<N,Z,C,V> to (0, 0, 1, 1).
+(Less32F x y) => (LessThanF (FCMPS x y))
+(Less64F x y) => (LessThanF (FCMPD x y))
+
+// For an unsigned integer x, the following identities are useful when the comparison is later combined with a branch:
+// 0 < x => x != 0
+// x <= 0 => x == 0
+// x < 1 => x == 0
+// 1 <= x => x != 0
+(Less(8U|16U|32U|64U) zero:(MOVDconst [0]) x) => (Neq(8|16|32|64) zero x)
+(Leq(8U|16U|32U|64U) x zero:(MOVDconst [0])) => (Eq(8|16|32|64) x zero)
+(Less(8U|16U|32U|64U) x (MOVDconst [1])) => (Eq(8|16|32|64) x (MOVDconst [0]))
+(Leq(8U|16U|32U|64U) (MOVDconst [1]) x) => (Neq(8|16|32|64) (MOVDconst [0]) x)
+
+(Less8U x y) => (LessThanU (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Less16U x y) => (LessThanU (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Less32U x y) => (LessThanU (CMPW x y))
+(Less64U x y) => (LessThanU (CMP x y))
+
+(Leq8 x y) => (LessEqual (CMPW (SignExt8to32 x) (SignExt8to32 y)))
+(Leq16 x y) => (LessEqual (CMPW (SignExt16to32 x) (SignExt16to32 y)))
+(Leq32 x y) => (LessEqual (CMPW x y))
+(Leq64 x y) => (LessEqual (CMP x y))
+
+// Refer to the comments for op Less64F above.
+(Leq32F x y) => (LessEqualF (FCMPS x y))
+(Leq64F x y) => (LessEqualF (FCMPD x y))
+
+(Leq8U x y) => (LessEqualU (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Leq16U x y) => (LessEqualU (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Leq32U x y) => (LessEqualU (CMPW x y))
+(Leq64U x y) => (LessEqualU (CMP x y))
+
+// Optimize comparison between a floating-point value and 0.0 with "FCMP $(0.0), Fn"
+(FCMPS x (FMOVSconst [0])) => (FCMPS0 x)
+(FCMPS (FMOVSconst [0]) x) => (InvertFlags (FCMPS0 x))
+(FCMPD x (FMOVDconst [0])) => (FCMPD0 x)
+(FCMPD (FMOVDconst [0]) x) => (InvertFlags (FCMPD0 x))
+
+// CSEL needs a flag-generating argument. Synthesize a TSTW if necessary.
+(CondSelect x y boolval) && flagArg(boolval) != nil => (CSEL [boolval.Op] x y flagArg(boolval))
+(CondSelect x y boolval) && flagArg(boolval) == nil => (CSEL [OpARM64NotEqual] x y (TSTWconst [1] boolval))
+
+(OffPtr [off] ptr:(SP)) && is32Bit(off) => (MOVDaddr [int32(off)] ptr)
+(OffPtr [off] ptr) => (ADDconst [off] ptr)
+
+(Addr {sym} base) => (MOVDaddr {sym} base)
+(LocalAddr {sym} base _) => (MOVDaddr {sym} base)
+
+// loads
+(Load <t> ptr mem) && t.IsBoolean() => (MOVBUload ptr mem)
+(Load <t> ptr mem) && (is8BitInt(t) && isSigned(t)) => (MOVBload ptr mem)
+(Load <t> ptr mem) && (is8BitInt(t) && !isSigned(t)) => (MOVBUload ptr mem)
+(Load <t> ptr mem) && (is16BitInt(t) && isSigned(t)) => (MOVHload ptr mem)
+(Load <t> ptr mem) && (is16BitInt(t) && !isSigned(t)) => (MOVHUload ptr mem)
+(Load <t> ptr mem) && (is32BitInt(t) && isSigned(t)) => (MOVWload ptr mem)
+(Load <t> ptr mem) && (is32BitInt(t) && !isSigned(t)) => (MOVWUload ptr mem)
+(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVDload ptr mem)
+(Load <t> ptr mem) && is32BitFloat(t) => (FMOVSload ptr mem)
+(Load <t> ptr mem) && is64BitFloat(t) => (FMOVDload ptr mem)
+
+// stores
+(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && !is32BitFloat(val.Type) => (MOVWstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 && !is64BitFloat(val.Type) => (MOVDstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (FMOVSstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (FMOVDstore ptr val mem)
+
+// zeroing
+// Sizes 0 through 16 are expanded inline into at most two stores of the
+// constant 0 (a single STP for 16 bytes). Sizes that are not the sum of two
+// store widths (7, 11, 13, 14, 15) use a second store that overlaps the
+// first: it is placed at offset size-width so that it ends at the last byte.
+(Zero [0] _ mem) => mem
+(Zero [1] ptr mem) => (MOVBstore ptr (MOVDconst [0]) mem)
+(Zero [2] ptr mem) => (MOVHstore ptr (MOVDconst [0]) mem)
+(Zero [4] ptr mem) => (MOVWstore ptr (MOVDconst [0]) mem)
+(Zero [3] ptr mem) =>
+ (MOVBstore [2] ptr (MOVDconst [0])
+ (MOVHstore ptr (MOVDconst [0]) mem))
+(Zero [5] ptr mem) =>
+ (MOVBstore [4] ptr (MOVDconst [0])
+ (MOVWstore ptr (MOVDconst [0]) mem))
+(Zero [6] ptr mem) =>
+ (MOVHstore [4] ptr (MOVDconst [0])
+ (MOVWstore ptr (MOVDconst [0]) mem))
+// 7 bytes: two 4-byte stores at offsets 0 and 3, overlapping at byte 3.
+(Zero [7] ptr mem) =>
+ (MOVWstore [3] ptr (MOVDconst [0])
+ (MOVWstore ptr (MOVDconst [0]) mem))
+(Zero [8] ptr mem) => (MOVDstore ptr (MOVDconst [0]) mem)
+(Zero [9] ptr mem) =>
+ (MOVBstore [8] ptr (MOVDconst [0])
+ (MOVDstore ptr (MOVDconst [0]) mem))
+(Zero [10] ptr mem) =>
+ (MOVHstore [8] ptr (MOVDconst [0])
+ (MOVDstore ptr (MOVDconst [0]) mem))
+// 11 bytes: two 8-byte stores at offsets 0 and 3 (overlapping).
+(Zero [11] ptr mem) =>
+ (MOVDstore [3] ptr (MOVDconst [0])
+ (MOVDstore ptr (MOVDconst [0]) mem))
+(Zero [12] ptr mem) =>
+ (MOVWstore [8] ptr (MOVDconst [0])
+ (MOVDstore ptr (MOVDconst [0]) mem))
+// 13..15 bytes: two overlapping 8-byte stores, the second at offset size-8.
+(Zero [13] ptr mem) =>
+ (MOVDstore [5] ptr (MOVDconst [0])
+ (MOVDstore ptr (MOVDconst [0]) mem))
+(Zero [14] ptr mem) =>
+ (MOVDstore [6] ptr (MOVDconst [0])
+ (MOVDstore ptr (MOVDconst [0]) mem))
+(Zero [15] ptr mem) =>
+ (MOVDstore [7] ptr (MOVDconst [0])
+ (MOVDstore ptr (MOVDconst [0]) mem))
+(Zero [16] ptr mem) =>
+ (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem)
+
+(Zero [32] ptr mem) =>
+ (STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
+ (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))
+
+(Zero [48] ptr mem) =>
+ (STP [32] ptr (MOVDconst [0]) (MOVDconst [0])
+ (STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
+ (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem)))
+
+(Zero [64] ptr mem) =>
+ (STP [48] ptr (MOVDconst [0]) (MOVDconst [0])
+ (STP [32] ptr (MOVDconst [0]) (MOVDconst [0])
+ (STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
+ (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))))
+
+// strip off fractional word zeroing
+// For s > 16 that is not a multiple of 16, the multiple-of-16 part
+// (s-s%16 bytes) is zeroed starting at ptr, and the remainder is covered by
+// one extra Zero that ends at byte s: 8 bytes wide when the remainder is
+// at most 8, 16 bytes wide otherwise. Unless the remainder is exactly 8,
+// this trailing Zero overlaps the multiple-of-16 part.
+(Zero [s] ptr mem) && s%16 != 0 && s%16 <= 8 && s > 16 =>
+ (Zero [8]
+ (OffPtr <ptr.Type> ptr [s-8])
+ (Zero [s-s%16] ptr mem))
+(Zero [s] ptr mem) && s%16 != 0 && s%16 > 8 && s > 16 =>
+ (Zero [16]
+ (OffPtr <ptr.Type> ptr [s-16])
+ (Zero [s-s%16] ptr mem))
+
+// medium zeroing uses a duff device
+// 4, 16, and 64 are magic constants, see runtime/mkduff.go
+(Zero [s] ptr mem)
+ && s%16 == 0 && s > 64 && s <= 16*64
+ && !config.noDuffDevice =>
+ (DUFFZERO [4 * (64 - s/16)] ptr mem)
+
+// large zeroing uses a loop
+(Zero [s] ptr mem)
+ && s%16 == 0 && (s > 16*64 || config.noDuffDevice) =>
+ (LoweredZero
+ ptr
+ (ADDconst <ptr.Type> [s-16] ptr)
+ mem)
+
+// moves
+// Sizes 0 through 15 are expanded inline into at most two load/store pairs.
+// Every load takes the incoming memory state mem, while the stores are
+// chained one after the other. Sizes that are not the sum of two access
+// widths (7, 11, 13, 14, 15) use a second pair that overlaps the first: it
+// is placed at offset size-width so that it ends at the last byte.
+(Move [0] _ _ mem) => mem
+(Move [1] dst src mem) => (MOVBstore dst (MOVBUload src mem) mem)
+(Move [2] dst src mem) => (MOVHstore dst (MOVHUload src mem) mem)
+(Move [3] dst src mem) =>
+ (MOVBstore [2] dst (MOVBUload [2] src mem)
+ (MOVHstore dst (MOVHUload src mem) mem))
+(Move [4] dst src mem) => (MOVWstore dst (MOVWUload src mem) mem)
+(Move [5] dst src mem) =>
+ (MOVBstore [4] dst (MOVBUload [4] src mem)
+ (MOVWstore dst (MOVWUload src mem) mem))
+(Move [6] dst src mem) =>
+ (MOVHstore [4] dst (MOVHUload [4] src mem)
+ (MOVWstore dst (MOVWUload src mem) mem))
+// 7 bytes: two 4-byte copies at offsets 0 and 3, overlapping at byte 3.
+(Move [7] dst src mem) =>
+ (MOVWstore [3] dst (MOVWUload [3] src mem)
+ (MOVWstore dst (MOVWUload src mem) mem))
+(Move [8] dst src mem) => (MOVDstore dst (MOVDload src mem) mem)
+(Move [9] dst src mem) =>
+ (MOVBstore [8] dst (MOVBUload [8] src mem)
+ (MOVDstore dst (MOVDload src mem) mem))
+(Move [10] dst src mem) =>
+ (MOVHstore [8] dst (MOVHUload [8] src mem)
+ (MOVDstore dst (MOVDload src mem) mem))
+// 11 bytes: two 8-byte copies at offsets 0 and 3 (overlapping).
+(Move [11] dst src mem) =>
+ (MOVDstore [3] dst (MOVDload [3] src mem)
+ (MOVDstore dst (MOVDload src mem) mem))
+(Move [12] dst src mem) =>
+ (MOVWstore [8] dst (MOVWUload [8] src mem)
+ (MOVDstore dst (MOVDload src mem) mem))
+// 13..15 bytes: two overlapping 8-byte copies, the second at offset size-8.
+(Move [13] dst src mem) =>
+ (MOVDstore [5] dst (MOVDload [5] src mem)
+ (MOVDstore dst (MOVDload src mem) mem))
+(Move [14] dst src mem) =>
+ (MOVDstore [6] dst (MOVDload [6] src mem)
+ (MOVDstore dst (MOVDload src mem) mem))
+(Move [15] dst src mem) =>
+ (MOVDstore [7] dst (MOVDload [7] src mem)
+ (MOVDstore dst (MOVDload src mem) mem))
+(Move [16] dst src mem) =>
+ (STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem)
+(Move [32] dst src mem) =>
+ (STP [16] dst (Select0 <typ.UInt64> (LDP [16] src mem)) (Select1 <typ.UInt64> (LDP [16] src mem))
+ (STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem))
+(Move [48] dst src mem) =>
+ (STP [32] dst (Select0 <typ.UInt64> (LDP [32] src mem)) (Select1 <typ.UInt64> (LDP [32] src mem))
+ (STP [16] dst (Select0 <typ.UInt64> (LDP [16] src mem)) (Select1 <typ.UInt64> (LDP [16] src mem))
+ (STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem)))
+(Move [64] dst src mem) =>
+ (STP [48] dst (Select0 <typ.UInt64> (LDP [48] src mem)) (Select1 <typ.UInt64> (LDP [48] src mem))
+ (STP [32] dst (Select0 <typ.UInt64> (LDP [32] src mem)) (Select1 <typ.UInt64> (LDP [32] src mem))
+ (STP [16] dst (Select0 <typ.UInt64> (LDP [16] src mem)) (Select1 <typ.UInt64> (LDP [16] src mem))
+ (STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem))))
+
+// strip off fractional word move
+(Move [s] dst src mem) && s%16 != 0 && s%16 <= 8 && s > 16 =>
+ (Move [8]
+ (OffPtr <dst.Type> dst [s-8])
+ (OffPtr <src.Type> src [s-8])
+ (Move [s-s%16] dst src mem))
+(Move [s] dst src mem) && s%16 != 0 && s%16 > 8 && s > 16 =>
+ (Move [16]
+ (OffPtr <dst.Type> dst [s-16])
+ (OffPtr <src.Type> src [s-16])
+ (Move [s-s%16] dst src mem))
+
+// medium move uses a duff device
+(Move [s] dst src mem)
+ && s > 64 && s <= 16*64 && s%16 == 0
+ && !config.noDuffDevice && logLargeCopy(v, s) =>
+ (DUFFCOPY [8 * (64 - s/16)] dst src mem)
+// 8 is the number of bytes to encode:
+//
+// LDP.P 16(R16), (R26, R27)
+// STP.P (R26, R27), 16(R17)
+//
+// 64 is number of these blocks. See runtime/duff_arm64.s:duffcopy
+
+// large move uses a loop
+(Move [s] dst src mem)
+ && s%16 == 0 && (s > 16*64 || config.noDuffDevice)
+ && logLargeCopy(v, s) =>
+ (LoweredMove
+ dst
+ src
+ (ADDconst <src.Type> src [s-16])
+ mem)
+
+// calls
+(StaticCall ...) => (CALLstatic ...)
+(ClosureCall ...) => (CALLclosure ...)
+(InterCall ...) => (CALLinter ...)
+(TailCall ...) => (CALLtail ...)
+
+// checks
+(NilCheck ...) => (LoweredNilCheck ...)
+(IsNonNil ptr) => (NotEqual (CMPconst [0] ptr))
+(IsInBounds idx len) => (LessThanU (CMP idx len))
+(IsSliceInBounds idx len) => (LessEqualU (CMP idx len))
+
+// pseudo-ops
+(GetClosurePtr ...) => (LoweredGetClosurePtr ...)
+(GetCallerSP ...) => (LoweredGetCallerSP ...)
+(GetCallerPC ...) => (LoweredGetCallerPC ...)
+
+// Absorb pseudo-ops into blocks.
+(If (Equal cc) yes no) => (EQ cc yes no)
+(If (NotEqual cc) yes no) => (NE cc yes no)
+(If (LessThan cc) yes no) => (LT cc yes no)
+(If (LessThanU cc) yes no) => (ULT cc yes no)
+(If (LessEqual cc) yes no) => (LE cc yes no)
+(If (LessEqualU cc) yes no) => (ULE cc yes no)
+(If (GreaterThan cc) yes no) => (GT cc yes no)
+(If (GreaterThanU cc) yes no) => (UGT cc yes no)
+(If (GreaterEqual cc) yes no) => (GE cc yes no)
+(If (GreaterEqualU cc) yes no) => (UGE cc yes no)
+(If (LessThanF cc) yes no) => (FLT cc yes no)
+(If (LessEqualF cc) yes no) => (FLE cc yes no)
+(If (GreaterThanF cc) yes no) => (FGT cc yes no)
+(If (GreaterEqualF cc) yes no) => (FGE cc yes no)
+
+(If cond yes no) => (TBNZ [0] cond yes no)
+
+(JumpTable idx) => (JUMPTABLE {makeJumpTableSym(b)} idx (MOVDaddr <typ.Uintptr> {makeJumpTableSym(b)} (SB)))
+
+// atomic intrinsics
+// Note: these ops do not accept offset.
+(AtomicLoad8 ...) => (LDARB ...)
+(AtomicLoad32 ...) => (LDARW ...)
+(AtomicLoad64 ...) => (LDAR ...)
+(AtomicLoadPtr ...) => (LDAR ...)
+
+(AtomicStore8 ...) => (STLRB ...)
+(AtomicStore32 ...) => (STLRW ...)
+(AtomicStore64 ...) => (STLR ...)
+(AtomicStorePtrNoWB ...) => (STLR ...)
+
+(AtomicExchange(32|64) ...) => (LoweredAtomicExchange(32|64) ...)
+(AtomicAdd(32|64) ...) => (LoweredAtomicAdd(32|64) ...)
+(AtomicCompareAndSwap(32|64) ...) => (LoweredAtomicCas(32|64) ...)
+
+(AtomicAdd(32|64)Variant ...) => (LoweredAtomicAdd(32|64)Variant ...)
+(AtomicExchange(32|64)Variant ...) => (LoweredAtomicExchange(32|64)Variant ...)
+(AtomicCompareAndSwap(32|64)Variant ...) => (LoweredAtomicCas(32|64)Variant ...)
+
+// Currently the updated value is not used, but we need a register to temporarily hold it.
+(AtomicAnd8 ptr val mem) => (Select1 (LoweredAtomicAnd8 ptr val mem))
+(AtomicAnd32 ptr val mem) => (Select1 (LoweredAtomicAnd32 ptr val mem))
+(AtomicOr8 ptr val mem) => (Select1 (LoweredAtomicOr8 ptr val mem))
+(AtomicOr32 ptr val mem) => (Select1 (LoweredAtomicOr32 ptr val mem))
+
+(AtomicAnd8Variant ptr val mem) => (Select1 (LoweredAtomicAnd8Variant ptr val mem))
+(AtomicAnd32Variant ptr val mem) => (Select1 (LoweredAtomicAnd32Variant ptr val mem))
+(AtomicOr8Variant ptr val mem) => (Select1 (LoweredAtomicOr8Variant ptr val mem))
+(AtomicOr32Variant ptr val mem) => (Select1 (LoweredAtomicOr32Variant ptr val mem))
+
+// Write barrier.
+(WB ...) => (LoweredWB ...)
+
+// Publication barrier (0xe is ST option)
+(PubBarrier mem) => (DMB [0xe] mem)
+
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem)
+
+// Optimizations
+
+// Absorb boolean tests into block
+(NZ (Equal cc) yes no) => (EQ cc yes no)
+(NZ (NotEqual cc) yes no) => (NE cc yes no)
+(NZ (LessThan cc) yes no) => (LT cc yes no)
+(NZ (LessThanU cc) yes no) => (ULT cc yes no)
+(NZ (LessEqual cc) yes no) => (LE cc yes no)
+(NZ (LessEqualU cc) yes no) => (ULE cc yes no)
+(NZ (GreaterThan cc) yes no) => (GT cc yes no)
+(NZ (GreaterThanU cc) yes no) => (UGT cc yes no)
+(NZ (GreaterEqual cc) yes no) => (GE cc yes no)
+(NZ (GreaterEqualU cc) yes no) => (UGE cc yes no)
+(NZ (LessThanF cc) yes no) => (FLT cc yes no)
+(NZ (LessEqualF cc) yes no) => (FLE cc yes no)
+(NZ (GreaterThanF cc) yes no) => (FGT cc yes no)
+(NZ (GreaterEqualF cc) yes no) => (FGE cc yes no)
+
+(TBNZ [0] (Equal cc) yes no) => (EQ cc yes no)
+(TBNZ [0] (NotEqual cc) yes no) => (NE cc yes no)
+(TBNZ [0] (LessThan cc) yes no) => (LT cc yes no)
+(TBNZ [0] (LessThanU cc) yes no) => (ULT cc yes no)
+(TBNZ [0] (LessEqual cc) yes no) => (LE cc yes no)
+(TBNZ [0] (LessEqualU cc) yes no) => (ULE cc yes no)
+(TBNZ [0] (GreaterThan cc) yes no) => (GT cc yes no)
+(TBNZ [0] (GreaterThanU cc) yes no) => (UGT cc yes no)
+(TBNZ [0] (GreaterEqual cc) yes no) => (GE cc yes no)
+(TBNZ [0] (GreaterEqualU cc) yes no) => (UGE cc yes no)
+(TBNZ [0] (LessThanF cc) yes no) => (FLT cc yes no)
+(TBNZ [0] (LessEqualF cc) yes no) => (FLE cc yes no)
+(TBNZ [0] (GreaterThanF cc) yes no) => (FGT cc yes no)
+(TBNZ [0] (GreaterEqualF cc) yes no) => (FGE cc yes no)
+
+(EQ (CMPconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (EQ (TST x y) yes no)
+(NE (CMPconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (NE (TST x y) yes no)
+(LT (CMPconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (LT (TST x y) yes no)
+(LE (CMPconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (LE (TST x y) yes no)
+(GT (CMPconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (GT (TST x y) yes no)
+(GE (CMPconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (GE (TST x y) yes no)
+
+(EQ (CMPconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (EQ (TSTconst [c] y) yes no)
+(NE (CMPconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (NE (TSTconst [c] y) yes no)
+(LT (CMPconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (LT (TSTconst [c] y) yes no)
+(LE (CMPconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (LE (TSTconst [c] y) yes no)
+(GT (CMPconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (GT (TSTconst [c] y) yes no)
+(GE (CMPconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (GE (TSTconst [c] y) yes no)
+
+(EQ (CMPWconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (EQ (TSTW x y) yes no)
+(NE (CMPWconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (NE (TSTW x y) yes no)
+(LT (CMPWconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (LT (TSTW x y) yes no)
+(LE (CMPWconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (LE (TSTW x y) yes no)
+(GT (CMPWconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (GT (TSTW x y) yes no)
+(GE (CMPWconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (GE (TSTW x y) yes no)
+
+(EQ (CMPWconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (EQ (TSTWconst [int32(c)] y) yes no)
+(NE (CMPWconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (NE (TSTWconst [int32(c)] y) yes no)
+(LT (CMPWconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (LT (TSTWconst [int32(c)] y) yes no)
+(LE (CMPWconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (LE (TSTWconst [int32(c)] y) yes no)
+(GT (CMPWconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (GT (TSTWconst [int32(c)] y) yes no)
+(GE (CMPWconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (GE (TSTWconst [int32(c)] y) yes no)
+
+// For conditional instructions such as CSET, CSEL.
+(Equal (CMPconst [0] z:(AND x y))) && z.Uses == 1 => (Equal (TST x y))
+(NotEqual (CMPconst [0] z:(AND x y))) && z.Uses == 1 => (NotEqual (TST x y))
+(LessThan (CMPconst [0] z:(AND x y))) && z.Uses == 1 => (LessThan (TST x y))
+(LessEqual (CMPconst [0] z:(AND x y))) && z.Uses == 1 => (LessEqual (TST x y))
+(GreaterThan (CMPconst [0] z:(AND x y))) && z.Uses == 1 => (GreaterThan (TST x y))
+(GreaterEqual (CMPconst [0] z:(AND x y))) && z.Uses == 1 => (GreaterEqual (TST x y))
+
+(Equal (CMPWconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (Equal (TSTWconst [int32(c)] y))
+(NotEqual (CMPWconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (NotEqual (TSTWconst [int32(c)] y))
+(LessThan (CMPWconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (LessThan (TSTWconst [int32(c)] y))
+(LessEqual (CMPWconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (LessEqual (TSTWconst [int32(c)] y))
+(GreaterThan (CMPWconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (GreaterThan (TSTWconst [int32(c)] y))
+(GreaterEqual (CMPWconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (GreaterEqual (TSTWconst [int32(c)] y))
+
+(Equal (CMPWconst [0] z:(AND x y))) && z.Uses == 1 => (Equal (TSTW x y))
+(NotEqual (CMPWconst [0] z:(AND x y))) && z.Uses == 1 => (NotEqual (TSTW x y))
+(LessThan (CMPWconst [0] z:(AND x y))) && z.Uses == 1 => (LessThan (TSTW x y))
+(LessEqual (CMPWconst [0] z:(AND x y))) && z.Uses == 1 => (LessEqual (TSTW x y))
+(GreaterThan (CMPWconst [0] z:(AND x y))) && z.Uses == 1 => (GreaterThan (TSTW x y))
+(GreaterEqual (CMPWconst [0] z:(AND x y))) && z.Uses == 1 => (GreaterEqual (TSTW x y))
+
+// 64-bit compare of a single-use ANDconst result against zero: fold the AND
+// into a TSTconst. All six conditions must match CMPconst (not CMPWconst),
+// because TSTconst tests the full 64-bit value and its sign bit is bit 63.
+// The CMPWconst forms of the same pattern are the rules that produce
+// TSTWconst; matching CMPWconst here as well would merely duplicate those
+// matches and leave the 64-bit ordered comparisons unoptimized.
+(Equal (CMPconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (Equal (TSTconst [c] y))
+(NotEqual (CMPconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (NotEqual (TSTconst [c] y))
+(LessThan (CMPconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (LessThan (TSTconst [c] y))
+(LessEqual (CMPconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (LessEqual (TSTconst [c] y))
+(GreaterThan (CMPconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (GreaterThan (TSTconst [c] y))
+(GreaterEqual (CMPconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (GreaterEqual (TSTconst [c] y))
+
+(EQ (CMPconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (EQ (CMNconst [c] y) yes no)
+(NE (CMPconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (NE (CMNconst [c] y) yes no)
+(LT (CMPconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (LTnoov (CMNconst [c] y) yes no)
+(LE (CMPconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (LEnoov (CMNconst [c] y) yes no)
+(GT (CMPconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (GTnoov (CMNconst [c] y) yes no)
+(GE (CMPconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (GEnoov (CMNconst [c] y) yes no)
+
+(EQ (CMPWconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (EQ (CMNWconst [int32(c)] y) yes no)
+(NE (CMPWconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (NE (CMNWconst [int32(c)] y) yes no)
+(LT (CMPWconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (LTnoov (CMNWconst [int32(c)] y) yes no)
+(LE (CMPWconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (LEnoov (CMNWconst [int32(c)] y) yes no)
+(GT (CMPWconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (GTnoov (CMNWconst [int32(c)] y) yes no)
+(GE (CMPWconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (GEnoov (CMNWconst [int32(c)] y) yes no)
+
+(EQ (CMPconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (EQ (CMN x y) yes no)
+(NE (CMPconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (NE (CMN x y) yes no)
+(LT (CMPconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (LTnoov (CMN x y) yes no)
+(LE (CMPconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (LEnoov (CMN x y) yes no)
+(GT (CMPconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (GTnoov (CMN x y) yes no)
+(GE (CMPconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (GEnoov (CMN x y) yes no)
+
+(EQ (CMPWconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (EQ (CMNW x y) yes no)
+(NE (CMPWconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (NE (CMNW x y) yes no)
+(LT (CMPWconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (LTnoov (CMNW x y) yes no)
+(LE (CMPWconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (LEnoov (CMNW x y) yes no)
+(GT (CMPWconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (GTnoov (CMNW x y) yes no)
+(GE (CMPWconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (GEnoov (CMNW x y) yes no)
+
+// CMP(x,-y) -> CMN(x,y) is only valid for equality comparisons (EQ/NE), since y can be -1<<63
+(EQ (CMP x z:(NEG y)) yes no) && z.Uses == 1 => (EQ (CMN x y) yes no)
+(NE (CMP x z:(NEG y)) yes no) && z.Uses == 1 => (NE (CMN x y) yes no)
+
+(Equal (CMP x z:(NEG y))) && z.Uses == 1 => (Equal (CMN x y))
+(NotEqual (CMP x z:(NEG y))) && z.Uses == 1 => (NotEqual (CMN x y))
+
+// CMPW(x,-y) -> CMNW(x,y) is only valid for equality comparisons (EQ/NE), since y can be -1<<31
+(EQ (CMPW x z:(NEG y)) yes no) && z.Uses == 1 => (EQ (CMNW x y) yes no)
+(NE (CMPW x z:(NEG y)) yes no) && z.Uses == 1 => (NE (CMNW x y) yes no)
+
+(Equal (CMPW x z:(NEG y))) && z.Uses == 1 => (Equal (CMNW x y))
+(NotEqual (CMPW x z:(NEG y))) && z.Uses == 1 => (NotEqual (CMNW x y))
+
+// For conditional instructions such as CSET, CSEL.
+// TODO: add support for LT, LE, GT, GE, overflow needs to be considered.
+(Equal (CMPconst [0] x:(ADDconst [c] y))) && x.Uses == 1 => (Equal (CMNconst [c] y))
+(NotEqual (CMPconst [0] x:(ADDconst [c] y))) && x.Uses == 1 => (NotEqual (CMNconst [c] y))
+
+(Equal (CMPWconst [0] x:(ADDconst [c] y))) && x.Uses == 1 => (Equal (CMNWconst [int32(c)] y))
+(NotEqual (CMPWconst [0] x:(ADDconst [c] y))) && x.Uses == 1 => (NotEqual (CMNWconst [int32(c)] y))
+
+(Equal (CMPconst [0] z:(ADD x y))) && z.Uses == 1 => (Equal (CMN x y))
+(NotEqual (CMPconst [0] z:(ADD x y))) && z.Uses == 1 => (NotEqual (CMN x y))
+
+(Equal (CMPWconst [0] z:(ADD x y))) && z.Uses == 1 => (Equal (CMNW x y))
+(NotEqual (CMPWconst [0] z:(ADD x y))) && z.Uses == 1 => (NotEqual (CMNW x y))
+
+(Equal (CMPconst [0] z:(MADD a x y))) && z.Uses==1 => (Equal (CMN a (MUL <x.Type> x y)))
+(NotEqual (CMPconst [0] z:(MADD a x y))) && z.Uses==1 => (NotEqual (CMN a (MUL <x.Type> x y)))
+
+(Equal (CMPconst [0] z:(MSUB a x y))) && z.Uses==1 => (Equal (CMP a (MUL <x.Type> x y)))
+(NotEqual (CMPconst [0] z:(MSUB a x y))) && z.Uses==1 => (NotEqual (CMP a (MUL <x.Type> x y)))
+
+(Equal (CMPWconst [0] z:(MADDW a x y))) && z.Uses==1 => (Equal (CMNW a (MULW <x.Type> x y)))
+(NotEqual (CMPWconst [0] z:(MADDW a x y))) && z.Uses==1 => (NotEqual (CMNW a (MULW <x.Type> x y)))
+
+(Equal (CMPWconst [0] z:(MSUBW a x y))) && z.Uses==1 => (Equal (CMPW a (MULW <x.Type> x y)))
+(NotEqual (CMPWconst [0] z:(MSUBW a x y))) && z.Uses==1 => (NotEqual (CMPW a (MULW <x.Type> x y)))
+
+// A compare with a negative immediate is flipped between the CMP and CMN
+// forms so that the constant becomes -c. The minimum value of the operand
+// width (-1<<63, or -1<<31 for the W forms) is excluded from the rewrite.
+(CMPconst [c] y) && c < 0 && c != -1<<63 => (CMNconst [-c] y)
+(CMPWconst [c] y) && c < 0 && c != -1<<31 => (CMNWconst [-c] y)
+(CMNconst [c] y) && c < 0 && c != -1<<63 => (CMPconst [-c] y)
+(CMNWconst [c] y) && c < 0 && c != -1<<31 => (CMPWconst [-c] y)
+
+(EQ (CMPconst [0] x) yes no) => (Z x yes no)
+(NE (CMPconst [0] x) yes no) => (NZ x yes no)
+(EQ (CMPWconst [0] x) yes no) => (ZW x yes no)
+(NE (CMPWconst [0] x) yes no) => (NZW x yes no)
+
+(EQ (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 => (EQ (CMN a (MUL <x.Type> x y)) yes no)
+(NE (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 => (NE (CMN a (MUL <x.Type> x y)) yes no)
+(LT (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 => (LTnoov (CMN a (MUL <x.Type> x y)) yes no)
+(LE (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 => (LEnoov (CMN a (MUL <x.Type> x y)) yes no)
+(GT (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 => (GTnoov (CMN a (MUL <x.Type> x y)) yes no)
+(GE (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 => (GEnoov (CMN a (MUL <x.Type> x y)) yes no)
+
+(EQ (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 => (EQ (CMP a (MUL <x.Type> x y)) yes no)
+(NE (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 => (NE (CMP a (MUL <x.Type> x y)) yes no)
+(LE (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 => (LEnoov (CMP a (MUL <x.Type> x y)) yes no)
+(LT (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 => (LTnoov (CMP a (MUL <x.Type> x y)) yes no)
+(GE (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 => (GEnoov (CMP a (MUL <x.Type> x y)) yes no)
+(GT (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 => (GTnoov (CMP a (MUL <x.Type> x y)) yes no)
+
+(EQ (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 => (EQ (CMNW a (MULW <x.Type> x y)) yes no)
+(NE (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 => (NE (CMNW a (MULW <x.Type> x y)) yes no)
+(LE (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 => (LEnoov (CMNW a (MULW <x.Type> x y)) yes no)
+(LT (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 => (LTnoov (CMNW a (MULW <x.Type> x y)) yes no)
+(GE (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 => (GEnoov (CMNW a (MULW <x.Type> x y)) yes no)
+(GT (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 => (GTnoov (CMNW a (MULW <x.Type> x y)) yes no)
+
+(EQ (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 => (EQ (CMPW a (MULW <x.Type> x y)) yes no)
+(NE (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 => (NE (CMPW a (MULW <x.Type> x y)) yes no)
+(LE (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 => (LEnoov (CMPW a (MULW <x.Type> x y)) yes no)
+(LT (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 => (LTnoov (CMPW a (MULW <x.Type> x y)) yes no)
+(GE (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 => (GEnoov (CMPW a (MULW <x.Type> x y)) yes no)
+(GT (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 => (GTnoov (CMPW a (MULW <x.Type> x y)) yes no)
+
+// Absorb bit-tests into block
+(Z (ANDconst [c] x) yes no) && oneBit(c) => (TBZ [int64(ntz64(c))] x yes no)
+(NZ (ANDconst [c] x) yes no) && oneBit(c) => (TBNZ [int64(ntz64(c))] x yes no)
+(ZW (ANDconst [c] x) yes no) && oneBit(int64(uint32(c))) => (TBZ [int64(ntz64(int64(uint32(c))))] x yes no)
+(NZW (ANDconst [c] x) yes no) && oneBit(int64(uint32(c))) => (TBNZ [int64(ntz64(int64(uint32(c))))] x yes no)
+(EQ (TSTconst [c] x) yes no) && oneBit(c) => (TBZ [int64(ntz64(c))] x yes no)
+(NE (TSTconst [c] x) yes no) && oneBit(c) => (TBNZ [int64(ntz64(c))] x yes no)
+(EQ (TSTWconst [c] x) yes no) && oneBit(int64(uint32(c))) => (TBZ [int64(ntz64(int64(uint32(c))))] x yes no)
+(NE (TSTWconst [c] x) yes no) && oneBit(int64(uint32(c))) => (TBNZ [int64(ntz64(int64(uint32(c))))] x yes no)
+
+// Test sign-bit for signed comparisons against zero
+(GE (CMPWconst [0] x) yes no) => (TBZ [31] x yes no)
+(GE (CMPconst [0] x) yes no) => (TBZ [63] x yes no)
+(LT (CMPWconst [0] x) yes no) => (TBNZ [31] x yes no)
+(LT (CMPconst [0] x) yes no) => (TBNZ [63] x yes no)
+
+// fold offset into address
+(ADDconst [off1] (MOVDaddr [off2] {sym} ptr)) && is32Bit(off1+int64(off2)) =>
+ (MOVDaddr [int32(off1)+off2] {sym} ptr)
+
+// fold address into load/store
+// Loads whose pointer is (ADDconst [off2] ptr): off2 is added to the load's own
+// offset, provided the sum passes is32Bit and it is not the case that ptr is SB
+// while config.ctxt.Flag_shared is set.
+(MOVBload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVBload [off1+int32(off2)] {sym} ptr mem)
+(MOVBUload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVBUload [off1+int32(off2)] {sym} ptr mem)
+(MOVHload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVHload [off1+int32(off2)] {sym} ptr mem)
+(MOVHUload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVHUload [off1+int32(off2)] {sym} ptr mem)
+(MOVWload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVWload [off1+int32(off2)] {sym} ptr mem)
+(MOVWUload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVWUload [off1+int32(off2)] {sym} ptr mem)
+(MOVDload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVDload [off1+int32(off2)] {sym} ptr mem)
+(LDP [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (LDP [off1+int32(off2)] {sym} ptr mem)
+(FMOVSload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (FMOVSload [off1+int32(off2)] {sym} ptr mem)
+(FMOVDload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (FMOVDload [off1+int32(off2)] {sym} ptr mem)
+
+// register indexed load
+// A load with zero offset and no symbol whose address is (ADD ptr idx) becomes
+// the corresponding *loadidx op. Conversely, a *loadidx where either address
+// operand is a constant passing is32Bit goes back to the plain offset form.
+(MOVDload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVDloadidx ptr idx mem)
+(MOVWUload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVWUloadidx ptr idx mem)
+(MOVWload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVWloadidx ptr idx mem)
+(MOVHUload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVHUloadidx ptr idx mem)
+(MOVHload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVHloadidx ptr idx mem)
+(MOVBUload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVBUloadidx ptr idx mem)
+(MOVBload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVBloadidx ptr idx mem)
+(FMOVSload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (FMOVSloadidx ptr idx mem)
+(FMOVDload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (FMOVDloadidx ptr idx mem)
+(MOVDloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVDload [int32(c)] ptr mem)
+(MOVDloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (MOVDload [int32(c)] ptr mem)
+(MOVWUloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVWUload [int32(c)] ptr mem)
+(MOVWUloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (MOVWUload [int32(c)] ptr mem)
+(MOVWloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVWload [int32(c)] ptr mem)
+(MOVWloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (MOVWload [int32(c)] ptr mem)
+(MOVHUloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVHUload [int32(c)] ptr mem)
+(MOVHUloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (MOVHUload [int32(c)] ptr mem)
+(MOVHloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVHload [int32(c)] ptr mem)
+(MOVHloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (MOVHload [int32(c)] ptr mem)
+(MOVBUloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVBUload [int32(c)] ptr mem)
+(MOVBUloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (MOVBUload [int32(c)] ptr mem)
+(MOVBloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVBload [int32(c)] ptr mem)
+(MOVBloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (MOVBload [int32(c)] ptr mem)
+(FMOVSloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (FMOVSload [int32(c)] ptr mem)
+(FMOVSloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (FMOVSload [int32(c)] ptr mem)
+(FMOVDloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (FMOVDload [int32(c)] ptr mem)
+(FMOVDloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (FMOVDload [int32(c)] ptr mem)
+
+// shifted register indexed load
+// An index shifted left by 1, 2 or 3 (ADDshiftLL / SLLconst, or (ADD idx idx)
+// for the halfword case) selects the idx2 / idx4 / idx8 form of the load.
+// A constant index on a scaled form is shifted by the same amount and folded
+// back into a plain offset when the shifted value passes is32Bit.
+(MOVDload [off] {sym} (ADDshiftLL [3] ptr idx) mem) && off == 0 && sym == nil => (MOVDloadidx8 ptr idx mem)
+(MOVWUload [off] {sym} (ADDshiftLL [2] ptr idx) mem) && off == 0 && sym == nil => (MOVWUloadidx4 ptr idx mem)
+(MOVWload [off] {sym} (ADDshiftLL [2] ptr idx) mem) && off == 0 && sym == nil => (MOVWloadidx4 ptr idx mem)
+(MOVHUload [off] {sym} (ADDshiftLL [1] ptr idx) mem) && off == 0 && sym == nil => (MOVHUloadidx2 ptr idx mem)
+(MOVHload [off] {sym} (ADDshiftLL [1] ptr idx) mem) && off == 0 && sym == nil => (MOVHloadidx2 ptr idx mem)
+(MOVDloadidx ptr (SLLconst [3] idx) mem) => (MOVDloadidx8 ptr idx mem)
+(MOVWloadidx ptr (SLLconst [2] idx) mem) => (MOVWloadidx4 ptr idx mem)
+(MOVWUloadidx ptr (SLLconst [2] idx) mem) => (MOVWUloadidx4 ptr idx mem)
+(MOVHloadidx ptr (SLLconst [1] idx) mem) => (MOVHloadidx2 ptr idx mem)
+(MOVHUloadidx ptr (SLLconst [1] idx) mem) => (MOVHUloadidx2 ptr idx mem)
+(MOVHloadidx ptr (ADD idx idx) mem) => (MOVHloadidx2 ptr idx mem)
+(MOVHUloadidx ptr (ADD idx idx) mem) => (MOVHUloadidx2 ptr idx mem)
+(MOVDloadidx (SLLconst [3] idx) ptr mem) => (MOVDloadidx8 ptr idx mem)
+(MOVWloadidx (SLLconst [2] idx) ptr mem) => (MOVWloadidx4 ptr idx mem)
+(MOVWUloadidx (SLLconst [2] idx) ptr mem) => (MOVWUloadidx4 ptr idx mem)
+(MOVHloadidx (ADD idx idx) ptr mem) => (MOVHloadidx2 ptr idx mem)
+(MOVHUloadidx (ADD idx idx) ptr mem) => (MOVHUloadidx2 ptr idx mem)
+(MOVDloadidx8 ptr (MOVDconst [c]) mem) && is32Bit(c<<3) => (MOVDload [int32(c)<<3] ptr mem)
+(MOVWUloadidx4 ptr (MOVDconst [c]) mem) && is32Bit(c<<2) => (MOVWUload [int32(c)<<2] ptr mem)
+(MOVWloadidx4 ptr (MOVDconst [c]) mem) && is32Bit(c<<2) => (MOVWload [int32(c)<<2] ptr mem)
+(MOVHUloadidx2 ptr (MOVDconst [c]) mem) && is32Bit(c<<1) => (MOVHUload [int32(c)<<1] ptr mem)
+(MOVHloadidx2 ptr (MOVDconst [c]) mem) && is32Bit(c<<1) => (MOVHload [int32(c)<<1] ptr mem)
+
+// same for floating-point loads
+(FMOVDload [off] {sym} (ADDshiftLL [3] ptr idx) mem) && off == 0 && sym == nil => (FMOVDloadidx8 ptr idx mem)
+(FMOVSload [off] {sym} (ADDshiftLL [2] ptr idx) mem) && off == 0 && sym == nil => (FMOVSloadidx4 ptr idx mem)
+(FMOVDloadidx ptr (SLLconst [3] idx) mem) => (FMOVDloadidx8 ptr idx mem)
+(FMOVSloadidx ptr (SLLconst [2] idx) mem) => (FMOVSloadidx4 ptr idx mem)
+(FMOVDloadidx (SLLconst [3] idx) ptr mem) => (FMOVDloadidx8 ptr idx mem)
+(FMOVSloadidx (SLLconst [2] idx) ptr mem) => (FMOVSloadidx4 ptr idx mem)
+(FMOVDloadidx8 ptr (MOVDconst [c]) mem) && is32Bit(c<<3) => (FMOVDload ptr [int32(c)<<3] mem)
+(FMOVSloadidx4 ptr (MOVDconst [c]) mem) && is32Bit(c<<2) => (FMOVSload ptr [int32(c)<<2] mem)
+
+// fold address into store / store-zero
+// Stores whose pointer is (ADDconst [off2] ptr): off2 is added to the store's
+// own offset, provided the sum passes is32Bit and it is not the case that ptr
+// is SB while config.ctxt.Flag_shared is set.
+(MOVBstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVBstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVHstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVHstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVWstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVWstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVDstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVDstore [off1+int32(off2)] {sym} ptr val mem)
+(STP [off1] {sym} (ADDconst [off2] ptr) val1 val2 mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (STP [off1+int32(off2)] {sym} ptr val1 val2 mem)
+(FMOVSstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (FMOVSstore [off1+int32(off2)] {sym} ptr val mem)
+(FMOVDstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (FMOVDstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVBstorezero [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVBstorezero [off1+int32(off2)] {sym} ptr mem)
+(MOVHstorezero [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVHstorezero [off1+int32(off2)] {sym} ptr mem)
+(MOVWstorezero [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVWstorezero [off1+int32(off2)] {sym} ptr mem)
+(MOVDstorezero [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVDstorezero [off1+int32(off2)] {sym} ptr mem)
+(MOVQstorezero [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVQstorezero [off1+int32(off2)] {sym} ptr mem)
+
+// register indexed store
+// A store with zero offset and no symbol whose address is (ADD ptr idx) becomes
+// the corresponding *storeidx op. Conversely, a *storeidx where either address
+// operand is a constant passing is32Bit goes back to the plain offset form.
+(MOVDstore [off] {sym} (ADD ptr idx) val mem) && off == 0 && sym == nil => (MOVDstoreidx ptr idx val mem)
+(MOVWstore [off] {sym} (ADD ptr idx) val mem) && off == 0 && sym == nil => (MOVWstoreidx ptr idx val mem)
+(MOVHstore [off] {sym} (ADD ptr idx) val mem) && off == 0 && sym == nil => (MOVHstoreidx ptr idx val mem)
+(MOVBstore [off] {sym} (ADD ptr idx) val mem) && off == 0 && sym == nil => (MOVBstoreidx ptr idx val mem)
+(FMOVDstore [off] {sym} (ADD ptr idx) val mem) && off == 0 && sym == nil => (FMOVDstoreidx ptr idx val mem)
+(FMOVSstore [off] {sym} (ADD ptr idx) val mem) && off == 0 && sym == nil => (FMOVSstoreidx ptr idx val mem)
+(MOVDstoreidx ptr (MOVDconst [c]) val mem) && is32Bit(c) => (MOVDstore [int32(c)] ptr val mem)
+(MOVDstoreidx (MOVDconst [c]) idx val mem) && is32Bit(c) => (MOVDstore [int32(c)] idx val mem)
+(MOVWstoreidx ptr (MOVDconst [c]) val mem) && is32Bit(c) => (MOVWstore [int32(c)] ptr val mem)
+(MOVWstoreidx (MOVDconst [c]) idx val mem) && is32Bit(c) => (MOVWstore [int32(c)] idx val mem)
+(MOVHstoreidx ptr (MOVDconst [c]) val mem) && is32Bit(c) => (MOVHstore [int32(c)] ptr val mem)
+(MOVHstoreidx (MOVDconst [c]) idx val mem) && is32Bit(c) => (MOVHstore [int32(c)] idx val mem)
+(MOVBstoreidx ptr (MOVDconst [c]) val mem) && is32Bit(c) => (MOVBstore [int32(c)] ptr val mem)
+(MOVBstoreidx (MOVDconst [c]) idx val mem) && is32Bit(c) => (MOVBstore [int32(c)] idx val mem)
+(FMOVDstoreidx ptr (MOVDconst [c]) val mem) && is32Bit(c) => (FMOVDstore [int32(c)] ptr val mem)
+(FMOVDstoreidx (MOVDconst [c]) idx val mem) && is32Bit(c) => (FMOVDstore [int32(c)] idx val mem)
+(FMOVSstoreidx ptr (MOVDconst [c]) val mem) && is32Bit(c) => (FMOVSstore [int32(c)] ptr val mem)
+(FMOVSstoreidx (MOVDconst [c]) idx val mem) && is32Bit(c) => (FMOVSstore [int32(c)] idx val mem)
+
+// shifted register indexed store
+// An index shifted left by 1, 2 or 3 (ADDshiftLL / SLLconst, or (ADD idx idx)
+// for the halfword case) selects the idx2 / idx4 / idx8 form of the store.
+// A constant index on a scaled form is shifted by the same amount and folded
+// back into a plain offset when the shifted value passes is32Bit.
+(MOVDstore [off] {sym} (ADDshiftLL [3] ptr idx) val mem) && off == 0 && sym == nil => (MOVDstoreidx8 ptr idx val mem)
+(MOVWstore [off] {sym} (ADDshiftLL [2] ptr idx) val mem) && off == 0 && sym == nil => (MOVWstoreidx4 ptr idx val mem)
+(MOVHstore [off] {sym} (ADDshiftLL [1] ptr idx) val mem) && off == 0 && sym == nil => (MOVHstoreidx2 ptr idx val mem)
+(MOVDstoreidx ptr (SLLconst [3] idx) val mem) => (MOVDstoreidx8 ptr idx val mem)
+(MOVWstoreidx ptr (SLLconst [2] idx) val mem) => (MOVWstoreidx4 ptr idx val mem)
+(MOVHstoreidx ptr (SLLconst [1] idx) val mem) => (MOVHstoreidx2 ptr idx val mem)
+(MOVHstoreidx ptr (ADD idx idx) val mem) => (MOVHstoreidx2 ptr idx val mem)
+(MOVDstoreidx (SLLconst [3] idx) ptr val mem) => (MOVDstoreidx8 ptr idx val mem)
+(MOVWstoreidx (SLLconst [2] idx) ptr val mem) => (MOVWstoreidx4 ptr idx val mem)
+(MOVHstoreidx (SLLconst [1] idx) ptr val mem) => (MOVHstoreidx2 ptr idx val mem)
+(MOVHstoreidx (ADD idx idx) ptr val mem) => (MOVHstoreidx2 ptr idx val mem)
+(MOVDstoreidx8 ptr (MOVDconst [c]) val mem) && is32Bit(c<<3) => (MOVDstore [int32(c)<<3] ptr val mem)
+(MOVWstoreidx4 ptr (MOVDconst [c]) val mem) && is32Bit(c<<2) => (MOVWstore [int32(c)<<2] ptr val mem)
+(MOVHstoreidx2 ptr (MOVDconst [c]) val mem) && is32Bit(c<<1) => (MOVHstore [int32(c)<<1] ptr val mem)
+
+// same for floating-point stores
+(FMOVDstore [off] {sym} (ADDshiftLL [3] ptr idx) val mem) && off == 0 && sym == nil => (FMOVDstoreidx8 ptr idx val mem)
+(FMOVSstore [off] {sym} (ADDshiftLL [2] ptr idx) val mem) && off == 0 && sym == nil => (FMOVSstoreidx4 ptr idx val mem)
+(FMOVDstoreidx ptr (SLLconst [3] idx) val mem) => (FMOVDstoreidx8 ptr idx val mem)
+(FMOVSstoreidx ptr (SLLconst [2] idx) val mem) => (FMOVSstoreidx4 ptr idx val mem)
+(FMOVDstoreidx (SLLconst [3] idx) ptr val mem) => (FMOVDstoreidx8 ptr idx val mem)
+(FMOVSstoreidx (SLLconst [2] idx) ptr val mem) => (FMOVSstoreidx4 ptr idx val mem)
+(FMOVDstoreidx8 ptr (MOVDconst [c]) val mem) && is32Bit(c<<3) => (FMOVDstore [int32(c)<<3] ptr val mem)
+(FMOVSstoreidx4 ptr (MOVDconst [c]) val mem) && is32Bit(c<<2) => (FMOVSstore [int32(c)<<2] ptr val mem)
+
+// fold MOVDaddr into load
+// A load whose pointer is (MOVDaddr [off2] {sym2} ptr) takes over that address
+// computation: the offsets are added and the symbols combined with mergeSym.
+// Requires canMergeSym, a combined offset passing is32Bit, and that it is not
+// the case that ptr is SB while config.ctxt.Flag_shared is set.
+(MOVBload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVBload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVBUload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVBUload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVHload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHUload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVHUload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVWload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWUload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVWUload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVDload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVDload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(LDP [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (LDP [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(FMOVSload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (FMOVSload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(FMOVDload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (FMOVDload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+
+// fold MOVDaddr into store / store-zero
+// A store whose pointer is (MOVDaddr [off2] {sym2} ptr) takes over that address
+// computation: the offsets are added and the symbols combined with mergeSym.
+// Requires canMergeSym, a combined offset passing is32Bit, and that it is not
+// the case that ptr is SB while config.ctxt.Flag_shared is set.
+(MOVBstore [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVHstore [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVHstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVWstore [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVDstore [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVDstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(STP [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val1 val2 mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (STP [off1+off2] {mergeSym(sym1,sym2)} ptr val1 val2 mem)
+(FMOVSstore [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (FMOVSstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(FMOVDstore [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (FMOVDstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVBstorezero [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVBstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHstorezero [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVHstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWstorezero [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVWstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVDstorezero [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVDstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVQstorezero [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVQstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+
+// store zero
+// A store of (MOVDconst [0]) becomes the matching *storezero op, which takes no
+// value operand; an STP of two zero constants becomes MOVQstorezero.
+(MOVBstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVBstorezero [off] {sym} ptr mem)
+(MOVHstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVHstorezero [off] {sym} ptr mem)
+(MOVWstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVWstorezero [off] {sym} ptr mem)
+(MOVDstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVDstorezero [off] {sym} ptr mem)
+(STP [off] {sym} ptr (MOVDconst [0]) (MOVDconst [0]) mem) => (MOVQstorezero [off] {sym} ptr mem)
+
+// register indexed store zero
+// Three families: a zero-store addressed by (ADD ptr idx) with zero offset and
+// no symbol becomes *storezeroidx; an indexed store of constant 0 becomes
+// *storezeroidx; and a *storezeroidx where either address operand is a constant
+// passing is32Bit goes back to the plain offset form.
+(MOVDstorezero [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVDstorezeroidx ptr idx mem)
+(MOVWstorezero [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVWstorezeroidx ptr idx mem)
+(MOVHstorezero [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVHstorezeroidx ptr idx mem)
+(MOVBstorezero [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVBstorezeroidx ptr idx mem)
+(MOVDstoreidx ptr idx (MOVDconst [0]) mem) => (MOVDstorezeroidx ptr idx mem)
+(MOVWstoreidx ptr idx (MOVDconst [0]) mem) => (MOVWstorezeroidx ptr idx mem)
+(MOVHstoreidx ptr idx (MOVDconst [0]) mem) => (MOVHstorezeroidx ptr idx mem)
+(MOVBstoreidx ptr idx (MOVDconst [0]) mem) => (MOVBstorezeroidx ptr idx mem)
+(MOVDstorezeroidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVDstorezero [int32(c)] ptr mem)
+(MOVDstorezeroidx (MOVDconst [c]) idx mem) && is32Bit(c) => (MOVDstorezero [int32(c)] idx mem)
+(MOVWstorezeroidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVWstorezero [int32(c)] ptr mem)
+(MOVWstorezeroidx (MOVDconst [c]) idx mem) && is32Bit(c) => (MOVWstorezero [int32(c)] idx mem)
+(MOVHstorezeroidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVHstorezero [int32(c)] ptr mem)
+(MOVHstorezeroidx (MOVDconst [c]) idx mem) && is32Bit(c) => (MOVHstorezero [int32(c)] idx mem)
+(MOVBstorezeroidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVBstorezero [int32(c)] ptr mem)
+(MOVBstorezeroidx (MOVDconst [c]) idx mem) && is32Bit(c) => (MOVBstorezero [int32(c)] idx mem)
+
+// shifted register indexed store zero
+// An index shifted left by 1, 2 or 3 (ADDshiftLL / SLLconst, or (ADD idx idx)
+// for the halfword case) selects the idx2 / idx4 / idx8 zero-store; a scaled
+// indexed store of constant 0 becomes the scaled zero-store; and a constant
+// index is shifted by the same amount and folded back into a plain offset when
+// the shifted value passes is32Bit.
+(MOVDstorezero [off] {sym} (ADDshiftLL [3] ptr idx) mem) && off == 0 && sym == nil => (MOVDstorezeroidx8 ptr idx mem)
+(MOVWstorezero [off] {sym} (ADDshiftLL [2] ptr idx) mem) && off == 0 && sym == nil => (MOVWstorezeroidx4 ptr idx mem)
+(MOVHstorezero [off] {sym} (ADDshiftLL [1] ptr idx) mem) && off == 0 && sym == nil => (MOVHstorezeroidx2 ptr idx mem)
+(MOVDstorezeroidx ptr (SLLconst [3] idx) mem) => (MOVDstorezeroidx8 ptr idx mem)
+(MOVWstorezeroidx ptr (SLLconst [2] idx) mem) => (MOVWstorezeroidx4 ptr idx mem)
+(MOVHstorezeroidx ptr (SLLconst [1] idx) mem) => (MOVHstorezeroidx2 ptr idx mem)
+(MOVHstorezeroidx ptr (ADD idx idx) mem) => (MOVHstorezeroidx2 ptr idx mem)
+(MOVDstorezeroidx (SLLconst [3] idx) ptr mem) => (MOVDstorezeroidx8 ptr idx mem)
+(MOVWstorezeroidx (SLLconst [2] idx) ptr mem) => (MOVWstorezeroidx4 ptr idx mem)
+(MOVHstorezeroidx (SLLconst [1] idx) ptr mem) => (MOVHstorezeroidx2 ptr idx mem)
+(MOVHstorezeroidx (ADD idx idx) ptr mem) => (MOVHstorezeroidx2 ptr idx mem)
+(MOVDstoreidx8 ptr idx (MOVDconst [0]) mem) => (MOVDstorezeroidx8 ptr idx mem)
+(MOVWstoreidx4 ptr idx (MOVDconst [0]) mem) => (MOVWstorezeroidx4 ptr idx mem)
+(MOVHstoreidx2 ptr idx (MOVDconst [0]) mem) => (MOVHstorezeroidx2 ptr idx mem)
+(MOVDstorezeroidx8 ptr (MOVDconst [c]) mem) && is32Bit(c<<3) => (MOVDstorezero [int32(c<<3)] ptr mem)
+(MOVWstorezeroidx4 ptr (MOVDconst [c]) mem) && is32Bit(c<<2) => (MOVWstorezero [int32(c<<2)] ptr mem)
+(MOVHstorezeroidx2 ptr (MOVDconst [c]) mem) && is32Bit(c<<1) => (MOVHstorezero [int32(c<<1)] ptr mem)
+
+// Replace a load from the same location as the preceding store with a zero/sign extension
+// of the stored value (or a plain copy in the case of a full-width access).
+// The rules below are disabled: they seem to interact badly with other rules, resulting in slower code.
+//(MOVBload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVBreg x)
+//(MOVBUload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVBUreg x)
+//(MOVHload [off] {sym} ptr (MOVHstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVHreg x)
+//(MOVHUload [off] {sym} ptr (MOVHstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVHUreg x)
+//(MOVWload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVWreg x)
+//(MOVWUload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVWUreg x)
+//(MOVDload [off] {sym} ptr (MOVDstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> x
+//(LDP [off] {sym} ptr (STP [off2] {sym2} ptr2 x y _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> x y
+//(FMOVSload [off] {sym} ptr (FMOVSstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> x
+//(FMOVDload [off] {sym} ptr (FMOVDstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> x
+
+// A load whose memory argument is a zero-store of the same width to the same
+// symbol, offset and pointer (isSamePtr) is replaced by the constant 0.
+(MOVBload [off] {sym} ptr (MOVBstorezero [off2] {sym2} ptr2 _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVDconst [0])
+(MOVBUload [off] {sym} ptr (MOVBstorezero [off2] {sym2} ptr2 _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVDconst [0])
+(MOVHload [off] {sym} ptr (MOVHstorezero [off2] {sym2} ptr2 _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVDconst [0])
+(MOVHUload [off] {sym} ptr (MOVHstorezero [off2] {sym2} ptr2 _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVDconst [0])
+(MOVWload [off] {sym} ptr (MOVWstorezero [off2] {sym2} ptr2 _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVDconst [0])
+(MOVWUload [off] {sym} ptr (MOVWstorezero [off2] {sym2} ptr2 _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVDconst [0])
+(MOVDload [off] {sym} ptr (MOVDstorezero [off2] {sym2} ptr2 _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVDconst [0])
+
+// Indexed load whose memory argument is an indexed zero-store of the same width:
+// replaced by the constant 0 when the two address operands match, in either
+// order (ptr/idx may be swapped between the load and the store).
+(MOVBloadidx ptr idx (MOVBstorezeroidx ptr2 idx2 _))
+ && (isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) || isSamePtr(ptr, idx2) && isSamePtr(idx, ptr2)) => (MOVDconst [0])
+(MOVBUloadidx ptr idx (MOVBstorezeroidx ptr2 idx2 _))
+ && (isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) || isSamePtr(ptr, idx2) && isSamePtr(idx, ptr2)) => (MOVDconst [0])
+(MOVHloadidx ptr idx (MOVHstorezeroidx ptr2 idx2 _))
+ && (isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) || isSamePtr(ptr, idx2) && isSamePtr(idx, ptr2)) => (MOVDconst [0])
+(MOVHUloadidx ptr idx (MOVHstorezeroidx ptr2 idx2 _))
+ && (isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) || isSamePtr(ptr, idx2) && isSamePtr(idx, ptr2)) => (MOVDconst [0])
+(MOVWloadidx ptr idx (MOVWstorezeroidx ptr2 idx2 _))
+ && (isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) || isSamePtr(ptr, idx2) && isSamePtr(idx, ptr2)) => (MOVDconst [0])
+(MOVWUloadidx ptr idx (MOVWstorezeroidx ptr2 idx2 _))
+ && (isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) || isSamePtr(ptr, idx2) && isSamePtr(idx, ptr2)) => (MOVDconst [0])
+(MOVDloadidx ptr idx (MOVDstorezeroidx ptr2 idx2 _))
+ && (isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) || isSamePtr(ptr, idx2) && isSamePtr(idx, ptr2)) => (MOVDconst [0])
+
+// Scaled indexed load whose memory argument is the matching scaled zero-store:
+// replaced by the constant 0 when ptr and idx both match. Unlike the unscaled
+// forms, the operands are only compared in the same order.
+(MOVHloadidx2 ptr idx (MOVHstorezeroidx2 ptr2 idx2 _)) && isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) => (MOVDconst [0])
+(MOVHUloadidx2 ptr idx (MOVHstorezeroidx2 ptr2 idx2 _)) && isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) => (MOVDconst [0])
+(MOVWloadidx4 ptr idx (MOVWstorezeroidx4 ptr2 idx2 _)) && isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) => (MOVDconst [0])
+(MOVWUloadidx4 ptr idx (MOVWstorezeroidx4 ptr2 idx2 _)) && isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) => (MOVDconst [0])
+(MOVDloadidx8 ptr idx (MOVDstorezeroidx8 ptr2 idx2 _)) && isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) => (MOVDconst [0])
+
+// don't extend after proper load
+// An extension applied directly to one of the listed loads (plain, indexed or
+// scaled-indexed) of equal or narrower width is replaced by MOVDreg.
+(MOVBreg x:(MOVBload _ _)) => (MOVDreg x)
+(MOVBUreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVHreg x:(MOVBload _ _)) => (MOVDreg x)
+(MOVHreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVHreg x:(MOVHload _ _)) => (MOVDreg x)
+(MOVHUreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVHUreg x:(MOVHUload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVBload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVHload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVHUload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVWload _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVHUload _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVWUload _ _)) => (MOVDreg x)
+(MOVBreg x:(MOVBloadidx _ _ _)) => (MOVDreg x)
+(MOVBUreg x:(MOVBUloadidx _ _ _)) => (MOVDreg x)
+(MOVHreg x:(MOVBloadidx _ _ _)) => (MOVDreg x)
+(MOVHreg x:(MOVBUloadidx _ _ _)) => (MOVDreg x)
+(MOVHreg x:(MOVHloadidx _ _ _)) => (MOVDreg x)
+(MOVHUreg x:(MOVBUloadidx _ _ _)) => (MOVDreg x)
+(MOVHUreg x:(MOVHUloadidx _ _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVBloadidx _ _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVBUloadidx _ _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVHloadidx _ _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVHUloadidx _ _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVWloadidx _ _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVBUloadidx _ _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVHUloadidx _ _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVWUloadidx _ _ _)) => (MOVDreg x)
+(MOVHreg x:(MOVHloadidx2 _ _ _)) => (MOVDreg x)
+(MOVHUreg x:(MOVHUloadidx2 _ _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVHloadidx2 _ _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVHUloadidx2 _ _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVWloadidx4 _ _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVHUloadidx2 _ _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVWUloadidx4 _ _ _)) => (MOVDreg x)
+
+// fold double extensions
+// An extension applied to the result of one of the listed extensions of equal
+// or narrower width is replaced by MOVDreg of the inner extension.
+(MOVBreg x:(MOVBreg _)) => (MOVDreg x)
+(MOVBUreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVHreg x:(MOVBreg _)) => (MOVDreg x)
+(MOVHreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVHreg x:(MOVHreg _)) => (MOVDreg x)
+(MOVHUreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVHUreg x:(MOVHUreg _)) => (MOVDreg x)
+(MOVWreg x:(MOVBreg _)) => (MOVDreg x)
+(MOVWreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVWreg x:(MOVHreg _)) => (MOVDreg x)
+(MOVWreg x:(MOVWreg _)) => (MOVDreg x)
+(MOVWUreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVWUreg x:(MOVHUreg _)) => (MOVDreg x)
+(MOVWUreg x:(MOVWUreg _)) => (MOVDreg x)
+
+// don't extend before store
+// A sign or zero extension of the stored value from a width at least as large
+// as the store is dropped: byte stores ignore B/H/W extensions, halfword stores
+// ignore H/W extensions, word stores ignore W extensions. The same applies to
+// the indexed and scaled-indexed store forms.
+(MOVBstore [off] {sym} ptr (MOVBreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVWreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVWreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+(MOVBstoreidx ptr idx (MOVBreg x) mem) => (MOVBstoreidx ptr idx x mem)
+(MOVBstoreidx ptr idx (MOVBUreg x) mem) => (MOVBstoreidx ptr idx x mem)
+(MOVBstoreidx ptr idx (MOVHreg x) mem) => (MOVBstoreidx ptr idx x mem)
+(MOVBstoreidx ptr idx (MOVHUreg x) mem) => (MOVBstoreidx ptr idx x mem)
+(MOVBstoreidx ptr idx (MOVWreg x) mem) => (MOVBstoreidx ptr idx x mem)
+(MOVBstoreidx ptr idx (MOVWUreg x) mem) => (MOVBstoreidx ptr idx x mem)
+(MOVHstoreidx ptr idx (MOVHreg x) mem) => (MOVHstoreidx ptr idx x mem)
+(MOVHstoreidx ptr idx (MOVHUreg x) mem) => (MOVHstoreidx ptr idx x mem)
+(MOVHstoreidx ptr idx (MOVWreg x) mem) => (MOVHstoreidx ptr idx x mem)
+(MOVHstoreidx ptr idx (MOVWUreg x) mem) => (MOVHstoreidx ptr idx x mem)
+(MOVWstoreidx ptr idx (MOVWreg x) mem) => (MOVWstoreidx ptr idx x mem)
+(MOVWstoreidx ptr idx (MOVWUreg x) mem) => (MOVWstoreidx ptr idx x mem)
+(MOVHstoreidx2 ptr idx (MOVHreg x) mem) => (MOVHstoreidx2 ptr idx x mem)
+(MOVHstoreidx2 ptr idx (MOVHUreg x) mem) => (MOVHstoreidx2 ptr idx x mem)
+(MOVHstoreidx2 ptr idx (MOVWreg x) mem) => (MOVHstoreidx2 ptr idx x mem)
+(MOVHstoreidx2 ptr idx (MOVWUreg x) mem) => (MOVHstoreidx2 ptr idx x mem)
+(MOVWstoreidx4 ptr idx (MOVWreg x) mem) => (MOVWstoreidx4 ptr idx x mem)
+(MOVWstoreidx4 ptr idx (MOVWUreg x) mem) => (MOVWstoreidx4 ptr idx x mem)
+
+// If the source of a register move has only one use (x.Uses == 1), reuse the
+// same register instead of emitting a move instruction.
+// MOVDnop emits no instruction; it exists only to preserve the type.
+(MOVDreg x) && x.Uses == 1 => (MOVDnop x)
+
+// TODO: we should be able to get rid of MOVDnop altogether.
+// But for now, this is enough to get rid of lots of them.
+(MOVDnop (MOVDconst [c])) => (MOVDconst [c])
+
+// fold constant into arithmetic ops
+// BIC/EON/ORN fold the complemented constant (^c) into ANDconst/XORconst/ORconst;
+// the 32-bit forms (TSTW, CMNW) truncate the constant to int32.
+(ADD x (MOVDconst [c])) => (ADDconst [c] x)
+(SUB x (MOVDconst [c])) => (SUBconst [c] x)
+(AND x (MOVDconst [c])) => (ANDconst [c] x)
+(OR x (MOVDconst [c])) => (ORconst [c] x)
+(XOR x (MOVDconst [c])) => (XORconst [c] x)
+(TST x (MOVDconst [c])) => (TSTconst [c] x)
+(TSTW x (MOVDconst [c])) => (TSTWconst [int32(c)] x)
+(CMN x (MOVDconst [c])) => (CMNconst [c] x)
+(CMNW x (MOVDconst [c])) => (CMNWconst [int32(c)] x)
+(BIC x (MOVDconst [c])) => (ANDconst [^c] x)
+(EON x (MOVDconst [c])) => (XORconst [^c] x)
+(ORN x (MOVDconst [c])) => (ORconst [^c] x)
+
+// shift by constant: the count is masked with 63; an explicit (ANDconst [63])
+// on a variable shift count is dropped
+(SLL x (MOVDconst [c])) => (SLLconst x [c&63])
+(SRL x (MOVDconst [c])) => (SRLconst x [c&63])
+(SRA x (MOVDconst [c])) => (SRAconst x [c&63])
+(SLL x (ANDconst [63] y)) => (SLL x y)
+(SRL x (ANDconst [63] y)) => (SRL x y)
+(SRA x (ANDconst [63] y)) => (SRA x y)
+
+// compare with constant; a constant first operand is moved to the constant
+// position and the result is wrapped in InvertFlags
+(CMP x (MOVDconst [c])) => (CMPconst [c] x)
+(CMP (MOVDconst [c]) x) => (InvertFlags (CMPconst [c] x))
+(CMPW x (MOVDconst [c])) => (CMPWconst [int32(c)] x)
+(CMPW (MOVDconst [c]) x) => (InvertFlags (CMPWconst [int32(c)] x))
+
+// rotate by constant: the count is masked with 63 (ROR) or 31 (RORW)
+(ROR x (MOVDconst [c])) => (RORconst x [c&63])
+(RORW x (MOVDconst [c])) => (RORWconst x [c&31])
+
+// flag-setting add of a constant
+(ADDSflags x (MOVDconst [c])) => (ADDSconstflags [c] x)
+
+// an ADDconst of a negative constant is rewritten as SUBconst of its negation
+(ADDconst [c] y) && c < 0 => (SUBconst [-c] y)
+
+// Canonicalize the order of arguments to comparisons - helps with CSE.
+((CMP|CMPW) x y) && canonLessThan(x,y) => (InvertFlags ((CMP|CMPW) y x)) // InvertFlags compensates for the swapped operands
+
+// mul-neg => mneg: both -(x*y) and (-x)*y become a single MNEG/MNEGW.
+(NEG (MUL x y)) => (MNEG x y)
+(NEG (MULW x y)) => (MNEGW x y)
+(MUL (NEG x) y) => (MNEG x y)
+(MULW (NEG x) y) => (MNEGW x y)
+
+// madd/msub: fuse a single-use multiply (or negated multiply) into the add/sub that consumes it.
+(ADD a l:(MUL x y)) && l.Uses==1 && clobber(l) => (MADD a x y)
+(SUB a l:(MUL x y)) && l.Uses==1 && clobber(l) => (MSUB a x y)
+(ADD a l:(MNEG x y)) && l.Uses==1 && clobber(l) => (MSUB a x y)
+(SUB a l:(MNEG x y)) && l.Uses==1 && clobber(l) => (MADD a x y)
+
+(ADD a l:(MULW x y)) && a.Type.Size() != 8 && l.Uses==1 && clobber(l) => (MADDW a x y) // W forms are used only when a is not 8 bytes wide
+(SUB a l:(MULW x y)) && a.Type.Size() != 8 && l.Uses==1 && clobber(l) => (MSUBW a x y)
+(ADD a l:(MNEGW x y)) && a.Type.Size() != 8 && l.Uses==1 && clobber(l) => (MSUBW a x y)
+(SUB a l:(MNEGW x y)) && a.Type.Size() != 8 && l.Uses==1 && clobber(l) => (MADDW a x y)
+
+// optimize ADCSflags, SBCSflags and friends: feed the original carry (c) or borrow (bo) straight in instead of re-materializing it, and drop the carry/borrow input when it is the constant 0
+(ADCSflags x y (Select1 <types.TypeFlags> (ADDSconstflags [-1] (ADCzerocarry <typ.UInt64> c)))) => (ADCSflags x y c)
+(ADCSflags x y (Select1 <types.TypeFlags> (ADDSconstflags [-1] (MOVDconst [0])))) => (ADDSflags x y)
+(SBCSflags x y (Select1 <types.TypeFlags> (NEGSflags (NEG <typ.UInt64> (NGCzerocarry <typ.UInt64> bo))))) => (SBCSflags x y bo)
+(SBCSflags x y (Select1 <types.TypeFlags> (NEGSflags (MOVDconst [0])))) => (SUBSflags x y)
+
+// mul by constant: strength-reduce multiplication by 0, 1, -1, 2^n, 2^n+1, 2^n-1 and 3/5/7/9 times 2^n into shifts and shifted adds
+(MUL x (MOVDconst [-1])) => (NEG x)
+(MUL _ (MOVDconst [0])) => (MOVDconst [0])
+(MUL x (MOVDconst [1])) => x
+(MUL x (MOVDconst [c])) && isPowerOfTwo64(c) => (SLLconst [log64(c)] x)
+(MUL x (MOVDconst [c])) && isPowerOfTwo64(c-1) && c >= 3 => (ADDshiftLL x x [log64(c-1)])
+(MUL x (MOVDconst [c])) && isPowerOfTwo64(c+1) && c >= 7 => (ADDshiftLL (NEG <x.Type> x) x [log64(c+1)])
+(MUL x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) => (SLLconst [log64(c/3)] (ADDshiftLL <x.Type> x x [1]))
+(MUL x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) => (SLLconst [log64(c/5)] (ADDshiftLL <x.Type> x x [2]))
+(MUL x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) => (SLLconst [log64(c/7)] (ADDshiftLL <x.Type> (NEG <x.Type> x) x [3]))
+(MUL x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) => (SLLconst [log64(c/9)] (ADDshiftLL <x.Type> x x [3]))
+
+(MULW x (MOVDconst [c])) && int32(c)==-1 => (NEG x) // the W rules test the constant through int32(c) / is32Bit(c)
+(MULW _ (MOVDconst [c])) && int32(c)==0 => (MOVDconst [0])
+(MULW x (MOVDconst [c])) && int32(c)==1 => x
+(MULW x (MOVDconst [c])) && isPowerOfTwo64(c) => (SLLconst [log64(c)] x)
+(MULW x (MOVDconst [c])) && isPowerOfTwo64(c-1) && int32(c) >= 3 => (ADDshiftLL x x [log64(c-1)])
+(MULW x (MOVDconst [c])) && isPowerOfTwo64(c+1) && int32(c) >= 7 => (ADDshiftLL (NEG <x.Type> x) x [log64(c+1)])
+(MULW x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (SLLconst [log64(c/3)] (ADDshiftLL <x.Type> x x [1]))
+(MULW x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (SLLconst [log64(c/5)] (ADDshiftLL <x.Type> x x [2]))
+(MULW x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (SLLconst [log64(c/7)] (ADDshiftLL <x.Type> (NEG <x.Type> x) x [3]))
+(MULW x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (SLLconst [log64(c/9)] (ADDshiftLL <x.Type> x x [3]))
+
+// mneg by constant: strength-reduce the negated multiply -(x*c) for the same special constants as MUL
+(MNEG x (MOVDconst [-1])) => x
+(MNEG _ (MOVDconst [0])) => (MOVDconst [0])
+(MNEG x (MOVDconst [1])) => (NEG x)
+(MNEG x (MOVDconst [c])) && isPowerOfTwo64(c) => (NEG (SLLconst <x.Type> [log64(c)] x))
+(MNEG x (MOVDconst [c])) && isPowerOfTwo64(c-1) && c >= 3 => (NEG (ADDshiftLL <x.Type> x x [log64(c-1)]))
+(MNEG x (MOVDconst [c])) && isPowerOfTwo64(c+1) && c >= 7 => (NEG (ADDshiftLL <x.Type> (NEG <x.Type> x) x [log64(c+1)]))
+(MNEG x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) => (SLLconst <x.Type> [log64(c/3)] (SUBshiftLL <x.Type> x x [2])) // x - x<<2 == -3*x, so no separate NEG is needed
+(MNEG x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) => (NEG (SLLconst <x.Type> [log64(c/5)] (ADDshiftLL <x.Type> x x [2])))
+(MNEG x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) => (SLLconst <x.Type> [log64(c/7)] (SUBshiftLL <x.Type> x x [3])) // x - x<<3 == -7*x
+(MNEG x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) => (NEG (SLLconst <x.Type> [log64(c/9)] (ADDshiftLL <x.Type> x x [3])))
+
+
+(MNEGW x (MOVDconst [c])) && int32(c)==-1 => x
+(MNEGW _ (MOVDconst [c])) && int32(c)==0 => (MOVDconst [0])
+(MNEGW x (MOVDconst [c])) && int32(c)==1 => (NEG x)
+(MNEGW x (MOVDconst [c])) && isPowerOfTwo64(c) => (NEG (SLLconst <x.Type> [log64(c)] x))
+(MNEGW x (MOVDconst [c])) && isPowerOfTwo64(c-1) && int32(c) >= 3 => (NEG (ADDshiftLL <x.Type> x x [log64(c-1)]))
+(MNEGW x (MOVDconst [c])) && isPowerOfTwo64(c+1) && int32(c) >= 7 => (NEG (ADDshiftLL <x.Type> (NEG <x.Type> x) x [log64(c+1)]))
+(MNEGW x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (SLLconst <x.Type> [log64(c/3)] (SUBshiftLL <x.Type> x x [2]))
+(MNEGW x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (NEG (SLLconst <x.Type> [log64(c/5)] (ADDshiftLL <x.Type> x x [2])))
+(MNEGW x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (SLLconst <x.Type> [log64(c/7)] (SUBshiftLL <x.Type> x x [3]))
+(MNEGW x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (NEG (SLLconst <x.Type> [log64(c/9)] (ADDshiftLL <x.Type> x x [3])))
+
+
+(MADD a x (MOVDconst [-1])) => (SUB a x) // MADD a x y is a + x*y; strength-reduce it when either factor is a special constant
+(MADD a _ (MOVDconst [0])) => a
+(MADD a x (MOVDconst [1])) => (ADD a x)
+(MADD a x (MOVDconst [c])) && isPowerOfTwo64(c) => (ADDshiftLL a x [log64(c)])
+(MADD a x (MOVDconst [c])) && isPowerOfTwo64(c-1) && c>=3 => (ADD a (ADDshiftLL <x.Type> x x [log64(c-1)]))
+(MADD a x (MOVDconst [c])) && isPowerOfTwo64(c+1) && c>=7 => (SUB a (SUBshiftLL <x.Type> x x [log64(c+1)])) // x - x<<n == -c*x, hence the outer SUB
+(MADD a x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)])
+(MADD a x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)])
+(MADD a x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)])
+(MADD a x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)])
+
+(MADD a (MOVDconst [-1]) x) => (SUB a x) // same reductions with the constant as the first factor
+(MADD a (MOVDconst [0]) _) => a
+(MADD a (MOVDconst [1]) x) => (ADD a x)
+(MADD a (MOVDconst [c]) x) && isPowerOfTwo64(c) => (ADDshiftLL a x [log64(c)])
+(MADD a (MOVDconst [c]) x) && isPowerOfTwo64(c-1) && c>=3 => (ADD a (ADDshiftLL <x.Type> x x [log64(c-1)]))
+(MADD a (MOVDconst [c]) x) && isPowerOfTwo64(c+1) && c>=7 => (SUB a (SUBshiftLL <x.Type> x x [log64(c+1)]))
+(MADD a (MOVDconst [c]) x) && c%3 == 0 && isPowerOfTwo64(c/3) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)])
+(MADD a (MOVDconst [c]) x) && c%5 == 0 && isPowerOfTwo64(c/5) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)])
+(MADD a (MOVDconst [c]) x) && c%7 == 0 && isPowerOfTwo64(c/7) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)])
+(MADD a (MOVDconst [c]) x) && c%9 == 0 && isPowerOfTwo64(c/9) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)])
+
+(MADDW a x (MOVDconst [c])) && int32(c)==-1 => (SUB a x) // W form of the MADD reductions; the constant is tested through int32(c) / is32Bit(c)
+(MADDW a _ (MOVDconst [c])) && int32(c)==0 => a
+(MADDW a x (MOVDconst [c])) && int32(c)==1 => (ADD a x)
+(MADDW a x (MOVDconst [c])) && isPowerOfTwo64(c) => (ADDshiftLL a x [log64(c)])
+(MADDW a x (MOVDconst [c])) && isPowerOfTwo64(c-1) && int32(c)>=3 => (ADD a (ADDshiftLL <x.Type> x x [log64(c-1)]))
+(MADDW a x (MOVDconst [c])) && isPowerOfTwo64(c+1) && int32(c)>=7 => (SUB a (SUBshiftLL <x.Type> x x [log64(c+1)]))
+(MADDW a x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)])
+(MADDW a x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)])
+(MADDW a x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)])
+(MADDW a x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)])
+
+(MADDW a (MOVDconst [c]) x) && int32(c)==-1 => (SUB a x) // same reductions with the constant as the first factor
+(MADDW a (MOVDconst [c]) _) && int32(c)==0 => a
+(MADDW a (MOVDconst [c]) x) && int32(c)==1 => (ADD a x)
+(MADDW a (MOVDconst [c]) x) && isPowerOfTwo64(c) => (ADDshiftLL a x [log64(c)])
+(MADDW a (MOVDconst [c]) x) && isPowerOfTwo64(c-1) && int32(c)>=3 => (ADD a (ADDshiftLL <x.Type> x x [log64(c-1)]))
+(MADDW a (MOVDconst [c]) x) && isPowerOfTwo64(c+1) && int32(c)>=7 => (SUB a (SUBshiftLL <x.Type> x x [log64(c+1)]))
+(MADDW a (MOVDconst [c]) x) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)])
+(MADDW a (MOVDconst [c]) x) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)])
+(MADDW a (MOVDconst [c]) x) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)])
+(MADDW a (MOVDconst [c]) x) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)])
+
+(MSUB a x (MOVDconst [-1])) => (ADD a x) // MSUB a x y is a - x*y; strength-reduce it when either factor is a special constant
+(MSUB a _ (MOVDconst [0])) => a
+(MSUB a x (MOVDconst [1])) => (SUB a x)
+(MSUB a x (MOVDconst [c])) && isPowerOfTwo64(c) => (SUBshiftLL a x [log64(c)])
+(MSUB a x (MOVDconst [c])) && isPowerOfTwo64(c-1) && c>=3 => (SUB a (ADDshiftLL <x.Type> x x [log64(c-1)]))
+(MSUB a x (MOVDconst [c])) && isPowerOfTwo64(c+1) && c>=7 => (ADD a (SUBshiftLL <x.Type> x x [log64(c+1)])) // x - x<<n == -c*x, hence the outer ADD
+(MSUB a x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)])
+(MSUB a x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)])
+(MSUB a x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)])
+(MSUB a x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)])
+
+(MSUB a (MOVDconst [-1]) x) => (ADD a x) // same reductions with the constant as the first factor
+(MSUB a (MOVDconst [0]) _) => a
+(MSUB a (MOVDconst [1]) x) => (SUB a x)
+(MSUB a (MOVDconst [c]) x) && isPowerOfTwo64(c) => (SUBshiftLL a x [log64(c)])
+(MSUB a (MOVDconst [c]) x) && isPowerOfTwo64(c-1) && c>=3 => (SUB a (ADDshiftLL <x.Type> x x [log64(c-1)]))
+(MSUB a (MOVDconst [c]) x) && isPowerOfTwo64(c+1) && c>=7 => (ADD a (SUBshiftLL <x.Type> x x [log64(c+1)]))
+(MSUB a (MOVDconst [c]) x) && c%3 == 0 && isPowerOfTwo64(c/3) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)])
+(MSUB a (MOVDconst [c]) x) && c%5 == 0 && isPowerOfTwo64(c/5) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)])
+(MSUB a (MOVDconst [c]) x) && c%7 == 0 && isPowerOfTwo64(c/7) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)])
+(MSUB a (MOVDconst [c]) x) && c%9 == 0 && isPowerOfTwo64(c/9) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)])
+
+(MSUBW a x (MOVDconst [c])) && int32(c)==-1 => (ADD a x) // W form of the MSUB reductions; the constant is tested through int32(c) / is32Bit(c)
+(MSUBW a _ (MOVDconst [c])) && int32(c)==0 => a
+(MSUBW a x (MOVDconst [c])) && int32(c)==1 => (SUB a x)
+(MSUBW a x (MOVDconst [c])) && isPowerOfTwo64(c) => (SUBshiftLL a x [log64(c)])
+(MSUBW a x (MOVDconst [c])) && isPowerOfTwo64(c-1) && int32(c)>=3 => (SUB a (ADDshiftLL <x.Type> x x [log64(c-1)]))
+(MSUBW a x (MOVDconst [c])) && isPowerOfTwo64(c+1) && int32(c)>=7 => (ADD a (SUBshiftLL <x.Type> x x [log64(c+1)]))
+(MSUBW a x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)])
+(MSUBW a x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)])
+(MSUBW a x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)])
+(MSUBW a x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)])
+
+(MSUBW a (MOVDconst [c]) x) && int32(c)==-1 => (ADD a x) // same reductions with the constant as the first factor
+(MSUBW a (MOVDconst [c]) _) && int32(c)==0 => a
+(MSUBW a (MOVDconst [c]) x) && int32(c)==1 => (SUB a x)
+(MSUBW a (MOVDconst [c]) x) && isPowerOfTwo64(c) => (SUBshiftLL a x [log64(c)])
+(MSUBW a (MOVDconst [c]) x) && isPowerOfTwo64(c-1) && int32(c)>=3 => (SUB a (ADDshiftLL <x.Type> x x [log64(c-1)]))
+(MSUBW a (MOVDconst [c]) x) && isPowerOfTwo64(c+1) && int32(c)>=7 => (ADD a (SUBshiftLL <x.Type> x x [log64(c+1)]))
+(MSUBW a (MOVDconst [c]) x) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)])
+(MSUBW a (MOVDconst [c]) x) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)])
+(MSUBW a (MOVDconst [c]) x) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)])
+(MSUBW a (MOVDconst [c]) x) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)])
+
+// div by constant: unsigned divide/modulo by 1 or by a power of two
+// reduces to the operand itself, a logical right shift, zero, or a mask.
+(UDIV x (MOVDconst [1])) => x
+(UDIV x (MOVDconst [c])) && isPowerOfTwo64(c) => (SRLconst [log64(c)] x)
+(UDIVW x (MOVDconst [c])) && uint32(c)==1 => x
+// SRLconst is a 64-bit shift, but only the low 32 bits of a UDIVW operand
+// are meaningful. Zero-extend first so that bits 32 and up of x cannot be
+// shifted down into the low 32 bits of the quotient.
+(UDIVW x (MOVDconst [c])) && isPowerOfTwo64(c) && is32Bit(c) => (SRLconst [log64(c)] (MOVWUreg <v.Type> x))
+(UMOD _ (MOVDconst [1])) => (MOVDconst [0])
+(UMOD x (MOVDconst [c])) && isPowerOfTwo64(c) => (ANDconst [c-1] x)
+(UMODW _ (MOVDconst [c])) && uint32(c)==1 => (MOVDconst [0])
+// Masking with c-1 only keeps low bits, so no zero-extension is needed here.
+(UMODW x (MOVDconst [c])) && isPowerOfTwo64(c) && is32Bit(c) => (ANDconst [c-1] x)
+
+// generic simplifications: algebraic identities, and fusion of NEG/MVN/ADDconst/constant operands into combined ops
+(ADD x (NEG y)) => (SUB x y)
+(SUB x x) => (MOVDconst [0])
+(AND x x) => x
+(OR x x) => x
+(XOR x x) => (MOVDconst [0])
+(BIC x x) => (MOVDconst [0])
+(EON x x) => (MOVDconst [-1])
+(ORN x x) => (MOVDconst [-1])
+(AND x (MVN y)) => (BIC x y)
+(XOR x (MVN y)) => (EON x y)
+(OR x (MVN y)) => (ORN x y)
+(MVN (XOR x y)) => (EON x y)
+(NEG (NEG x)) => x
+
+(CSEL [cc] (MOVDconst [-1]) (MOVDconst [0]) flag) => (CSETM [cc] flag)
+(CSEL [cc] (MOVDconst [0]) (MOVDconst [-1]) flag) => (CSETM [arm64Negate(cc)] flag) // operands in the opposite order: negate the condition
+(CSEL [cc] x (MOVDconst [0]) flag) => (CSEL0 [cc] x flag)
+(CSEL [cc] (MOVDconst [0]) y flag) => (CSEL0 [arm64Negate(cc)] y flag)
+(CSEL [cc] x (ADDconst [1] a) flag) => (CSINC [cc] x a flag)
+(CSEL [cc] (ADDconst [1] a) x flag) => (CSINC [arm64Negate(cc)] x a flag)
+(CSEL [cc] x (MVN a) flag) => (CSINV [cc] x a flag)
+(CSEL [cc] (MVN a) x flag) => (CSINV [arm64Negate(cc)] x a flag)
+(CSEL [cc] x (NEG a) flag) => (CSNEG [cc] x a flag)
+(CSEL [cc] (NEG a) x flag) => (CSNEG [arm64Negate(cc)] x a flag)
+
+(SUB x (SUB y z)) => (SUB (ADD <v.Type> x z) y) // x-(y-z) == (x+z)-y
+(SUB (SUB x y) z) => (SUB x (ADD <y.Type> y z)) // (x-y)-z == x-(y+z)
+
+// remove redundant *const ops: immediates that leave the operand unchanged, or that fix the result regardless of the operand
+(ADDconst [0] x) => x
+(SUBconst [0] x) => x
+(ANDconst [0] _) => (MOVDconst [0])
+(ANDconst [-1] x) => x
+(ORconst [0] x) => x
+(ORconst [-1] _) => (MOVDconst [-1])
+(XORconst [0] x) => x
+(XORconst [-1] x) => (MVN x)
+
+// generic constant folding: evaluate ops on constant operands at compile time and merge nested immediates
+(ADDconst [c] (MOVDconst [d])) => (MOVDconst [c+d])
+(ADDconst [c] (ADDconst [d] x)) => (ADDconst [c+d] x)
+(ADDconst [c] (SUBconst [d] x)) => (ADDconst [c-d] x)
+(SUBconst [c] (MOVDconst [d])) => (MOVDconst [d-c])
+(SUBconst [c] (SUBconst [d] x)) => (ADDconst [-c-d] x)
+(SUBconst [c] (ADDconst [d] x)) => (ADDconst [-c+d] x)
+(SLLconst [c] (MOVDconst [d])) => (MOVDconst [d<<uint64(c)])
+(SRLconst [c] (MOVDconst [d])) => (MOVDconst [int64(uint64(d)>>uint64(c))])
+(SRAconst [c] (MOVDconst [d])) => (MOVDconst [d>>uint64(c)])
+(MUL (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c*d])
+(MULW (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [int64(int32(c)*int32(d))]) // W forms fold in 32-bit arithmetic and widen the result
+(MNEG (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [-c*d])
+(MNEGW (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [-int64(int32(c)*int32(d))])
+(MADD (MOVDconst [c]) x y) => (ADDconst [c] (MUL <x.Type> x y))
+(MADDW (MOVDconst [c]) x y) => (ADDconst [c] (MULW <x.Type> x y))
+(MSUB (MOVDconst [c]) x y) => (ADDconst [c] (MNEG <x.Type> x y))
+(MSUBW (MOVDconst [c]) x y) => (ADDconst [c] (MNEGW <x.Type> x y))
+(MADD a (MOVDconst [c]) (MOVDconst [d])) => (ADDconst [c*d] a)
+(MADDW a (MOVDconst [c]) (MOVDconst [d])) => (ADDconst [int64(int32(c)*int32(d))] a)
+(MSUB a (MOVDconst [c]) (MOVDconst [d])) => (SUBconst [c*d] a)
+(MSUBW a (MOVDconst [c]) (MOVDconst [d])) => (SUBconst [int64(int32(c)*int32(d))] a)
+(DIV (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [c/d]) // division and modulo fold only for a nonzero divisor
+(UDIV (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [int64(uint64(c)/uint64(d))])
+(DIVW (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [int64(int32(c)/int32(d))])
+(UDIVW (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [int64(uint32(c)/uint32(d))])
+(MOD (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [c%d])
+(UMOD (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [int64(uint64(c)%uint64(d))])
+(MODW (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [int64(int32(c)%int32(d))])
+(UMODW (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [int64(uint32(c)%uint32(d))])
+(ANDconst [c] (MOVDconst [d])) => (MOVDconst [c&d])
+(ANDconst [c] (ANDconst [d] x)) => (ANDconst [c&d] x)
+(ANDconst [c] (MOVWUreg x)) => (ANDconst [c&(1<<32-1)] x) // a zero-extension combined with an AND is a single AND with a narrowed mask
+(ANDconst [c] (MOVHUreg x)) => (ANDconst [c&(1<<16-1)] x)
+(ANDconst [c] (MOVBUreg x)) => (ANDconst [c&(1<<8-1)] x)
+(MOVWUreg (ANDconst [c] x)) => (ANDconst [c&(1<<32-1)] x)
+(MOVHUreg (ANDconst [c] x)) => (ANDconst [c&(1<<16-1)] x)
+(MOVBUreg (ANDconst [c] x)) => (ANDconst [c&(1<<8-1)] x)
+(ORconst [c] (MOVDconst [d])) => (MOVDconst [c|d])
+(ORconst [c] (ORconst [d] x)) => (ORconst [c|d] x)
+(XORconst [c] (MOVDconst [d])) => (MOVDconst [c^d])
+(XORconst [c] (XORconst [d] x)) => (XORconst [c^d] x)
+(MVN (MOVDconst [c])) => (MOVDconst [^c])
+(NEG (MOVDconst [c])) => (MOVDconst [-c])
+(MOVBreg (MOVDconst [c])) => (MOVDconst [int64(int8(c))])
+(MOVBUreg (MOVDconst [c])) => (MOVDconst [int64(uint8(c))])
+(MOVHreg (MOVDconst [c])) => (MOVDconst [int64(int16(c))])
+(MOVHUreg (MOVDconst [c])) => (MOVDconst [int64(uint16(c))])
+(MOVWreg (MOVDconst [c])) => (MOVDconst [int64(int32(c))])
+(MOVWUreg (MOVDconst [c])) => (MOVDconst [int64(uint32(c))])
+(MOVDreg (MOVDconst [c])) => (MOVDconst [c])
+
+// constant comparisons: a flag-setting op on two constants folds to a FlagConstant
+(CMPconst (MOVDconst [x]) [y]) => (FlagConstant [subFlags64(x,y)])
+(CMPWconst (MOVDconst [x]) [y]) => (FlagConstant [subFlags32(int32(x),y)])
+(TSTconst (MOVDconst [x]) [y]) => (FlagConstant [logicFlags64(x&y)])
+(TSTWconst (MOVDconst [x]) [y]) => (FlagConstant [logicFlags32(int32(x)&y)])
+(CMNconst (MOVDconst [x]) [y]) => (FlagConstant [addFlags64(x,y)])
+(CMNWconst (MOVDconst [x]) [y]) => (FlagConstant [addFlags32(int32(x),y)])
+
+// other known comparisons: the left operand has a known upper bound below the constant, so the flags are those of comparing 0 with 1
+(CMPconst (MOVBUreg _) [c]) && 0xff < c => (FlagConstant [subFlags64(0,1)])
+(CMPconst (MOVHUreg _) [c]) && 0xffff < c => (FlagConstant [subFlags64(0,1)])
+(CMPconst (MOVWUreg _) [c]) && 0xffffffff < c => (FlagConstant [subFlags64(0,1)])
+(CMPconst (ANDconst _ [m]) [n]) && 0 <= m && m < n => (FlagConstant [subFlags64(0,1)])
+(CMPconst (SRLconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 63 && (1<<uint64(64-c)) <= uint64(n) => (FlagConstant [subFlags64(0,1)])
+(CMPWconst (MOVBUreg _) [c]) && 0xff < c => (FlagConstant [subFlags64(0,1)])
+(CMPWconst (MOVHUreg _) [c]) && 0xffff < c => (FlagConstant [subFlags64(0,1)])
+
+// absorb flag constants into branches: a branch on a known condition becomes First, with the successors swapped when the condition is false
+(EQ (FlagConstant [fc]) yes no) && fc.eq() => (First yes no)
+(EQ (FlagConstant [fc]) yes no) && !fc.eq() => (First no yes)
+
+(NE (FlagConstant [fc]) yes no) && fc.ne() => (First yes no)
+(NE (FlagConstant [fc]) yes no) && !fc.ne() => (First no yes)
+
+(LT (FlagConstant [fc]) yes no) && fc.lt() => (First yes no)
+(LT (FlagConstant [fc]) yes no) && !fc.lt() => (First no yes)
+
+(LE (FlagConstant [fc]) yes no) && fc.le() => (First yes no)
+(LE (FlagConstant [fc]) yes no) && !fc.le() => (First no yes)
+
+(GT (FlagConstant [fc]) yes no) && fc.gt() => (First yes no)
+(GT (FlagConstant [fc]) yes no) && !fc.gt() => (First no yes)
+
+(GE (FlagConstant [fc]) yes no) && fc.ge() => (First yes no)
+(GE (FlagConstant [fc]) yes no) && !fc.ge() => (First no yes)
+
+(ULT (FlagConstant [fc]) yes no) && fc.ult() => (First yes no)
+(ULT (FlagConstant [fc]) yes no) && !fc.ult() => (First no yes)
+
+(ULE (FlagConstant [fc]) yes no) && fc.ule() => (First yes no)
+(ULE (FlagConstant [fc]) yes no) && !fc.ule() => (First no yes)
+
+(UGT (FlagConstant [fc]) yes no) && fc.ugt() => (First yes no)
+(UGT (FlagConstant [fc]) yes no) && !fc.ugt() => (First no yes)
+
+(UGE (FlagConstant [fc]) yes no) && fc.uge() => (First yes no)
+(UGE (FlagConstant [fc]) yes no) && !fc.uge() => (First no yes)
+
+(LTnoov (FlagConstant [fc]) yes no) && fc.ltNoov() => (First yes no)
+(LTnoov (FlagConstant [fc]) yes no) && !fc.ltNoov() => (First no yes)
+
+(LEnoov (FlagConstant [fc]) yes no) && fc.leNoov() => (First yes no)
+(LEnoov (FlagConstant [fc]) yes no) && !fc.leNoov() => (First no yes)
+
+(GTnoov (FlagConstant [fc]) yes no) && fc.gtNoov() => (First yes no)
+(GTnoov (FlagConstant [fc]) yes no) && !fc.gtNoov() => (First no yes)
+
+(GEnoov (FlagConstant [fc]) yes no) && fc.geNoov() => (First yes no)
+(GEnoov (FlagConstant [fc]) yes no) && !fc.geNoov() => (First no yes)
+
+(Z (MOVDconst [0]) yes no) => (First yes no)
+(Z (MOVDconst [c]) yes no) && c != 0 => (First no yes)
+(NZ (MOVDconst [0]) yes no) => (First no yes)
+(NZ (MOVDconst [c]) yes no) && c != 0 => (First yes no)
+(ZW (MOVDconst [c]) yes no) && int32(c) == 0 => (First yes no) // W forms test only the low 32 bits of the constant
+(ZW (MOVDconst [c]) yes no) && int32(c) != 0 => (First no yes)
+(NZW (MOVDconst [c]) yes no) && int32(c) == 0 => (First no yes)
+(NZW (MOVDconst [c]) yes no) && int32(c) != 0 => (First yes no)
+
+// absorb InvertFlags into branches: drop the InvertFlags and branch on the mirrored condition (LT<->GT, LE<->GE; EQ and NE are unchanged)
+(LT (InvertFlags cmp) yes no) => (GT cmp yes no)
+(GT (InvertFlags cmp) yes no) => (LT cmp yes no)
+(LE (InvertFlags cmp) yes no) => (GE cmp yes no)
+(GE (InvertFlags cmp) yes no) => (LE cmp yes no)
+(ULT (InvertFlags cmp) yes no) => (UGT cmp yes no)
+(UGT (InvertFlags cmp) yes no) => (ULT cmp yes no)
+(ULE (InvertFlags cmp) yes no) => (UGE cmp yes no)
+(UGE (InvertFlags cmp) yes no) => (ULE cmp yes no)
+(EQ (InvertFlags cmp) yes no) => (EQ cmp yes no)
+(NE (InvertFlags cmp) yes no) => (NE cmp yes no)
+(FLT (InvertFlags cmp) yes no) => (FGT cmp yes no)
+(FGT (InvertFlags cmp) yes no) => (FLT cmp yes no)
+(FLE (InvertFlags cmp) yes no) => (FGE cmp yes no)
+(FGE (InvertFlags cmp) yes no) => (FLE cmp yes no)
+(LTnoov (InvertFlags cmp) yes no) => (GTnoov cmp yes no)
+(GEnoov (InvertFlags cmp) yes no) => (LEnoov cmp yes no)
+(LEnoov (InvertFlags cmp) yes no) => (GEnoov cmp yes no)
+(GTnoov (InvertFlags cmp) yes no) => (LTnoov cmp yes no)
+
+// absorb InvertFlags into conditional instructions: drop the InvertFlags and rewrite the condition code with arm64Invert
+(CSEL [cc] x y (InvertFlags cmp)) => (CSEL [arm64Invert(cc)] x y cmp)
+(CSEL0 [cc] x (InvertFlags cmp)) => (CSEL0 [arm64Invert(cc)] x cmp)
+(CSETM [cc] (InvertFlags cmp)) => (CSETM [arm64Invert(cc)] cmp)
+(CSINC [cc] x y (InvertFlags cmp)) => (CSINC [arm64Invert(cc)] x y cmp)
+(CSINV [cc] x y (InvertFlags cmp)) => (CSINV [arm64Invert(cc)] x y cmp)
+(CSNEG [cc] x y (InvertFlags cmp)) => (CSNEG [arm64Invert(cc)] x y cmp)
+
+// absorb flag constants into boolean values: a condition on known flags folds to a constant via b2i
+(Equal (FlagConstant [fc])) => (MOVDconst [b2i(fc.eq())])
+(NotEqual (FlagConstant [fc])) => (MOVDconst [b2i(fc.ne())])
+(LessThan (FlagConstant [fc])) => (MOVDconst [b2i(fc.lt())])
+(LessThanU (FlagConstant [fc])) => (MOVDconst [b2i(fc.ult())])
+(LessEqual (FlagConstant [fc])) => (MOVDconst [b2i(fc.le())])
+(LessEqualU (FlagConstant [fc])) => (MOVDconst [b2i(fc.ule())])
+(GreaterThan (FlagConstant [fc])) => (MOVDconst [b2i(fc.gt())])
+(GreaterThanU (FlagConstant [fc])) => (MOVDconst [b2i(fc.ugt())])
+(GreaterEqual (FlagConstant [fc])) => (MOVDconst [b2i(fc.ge())])
+(GreaterEqualU (FlagConstant [fc])) => (MOVDconst [b2i(fc.uge())])
+
+// absorb InvertFlags into boolean values: drop the InvertFlags and use the mirrored condition (Equal and NotEqual are unchanged)
+(Equal (InvertFlags x)) => (Equal x)
+(NotEqual (InvertFlags x)) => (NotEqual x)
+(LessThan (InvertFlags x)) => (GreaterThan x)
+(LessThanU (InvertFlags x)) => (GreaterThanU x)
+(GreaterThan (InvertFlags x)) => (LessThan x)
+(GreaterThanU (InvertFlags x)) => (LessThanU x)
+(LessEqual (InvertFlags x)) => (GreaterEqual x)
+(LessEqualU (InvertFlags x)) => (GreaterEqualU x)
+(GreaterEqual (InvertFlags x)) => (LessEqual x)
+(GreaterEqualU (InvertFlags x)) => (LessEqualU x)
+(LessThanF (InvertFlags x)) => (GreaterThanF x)
+(LessEqualF (InvertFlags x)) => (GreaterEqualF x)
+(GreaterThanF (InvertFlags x)) => (LessThanF x)
+(GreaterEqualF (InvertFlags x)) => (LessEqualF x)
+
+// Boolean-generating instructions (NOTE: NOT all boolean Values) always
+// zero the upper bits of the register, so there is no need to zero-extend.
+(MOVBUreg x:((Equal|NotEqual|LessThan|LessThanU|LessThanF|LessEqual|LessEqualU|LessEqualF|GreaterThan|GreaterThanU|GreaterThanF|GreaterEqual|GreaterEqualU|GreaterEqualF) _)) => (MOVDreg x)
+
+// absorb flag constants into conditional instructions
+// When ccARM64Eval can decide the condition (positive result: the first
+// operand is selected; negative result: the alternative is selected), the
+// conditional op is replaced by the value it would have produced.
+// NOTE(review): the exact contract of ccARM64Eval is defined outside this
+// file; the sign convention above is read off the rules below.
+(CSEL [cc] x _ flag) && ccARM64Eval(cc, flag) > 0 => x
+(CSEL [cc] _ y flag) && ccARM64Eval(cc, flag) < 0 => y
+(CSEL0 [cc] x flag) && ccARM64Eval(cc, flag) > 0 => x
+(CSEL0 [cc] _ flag) && ccARM64Eval(cc, flag) < 0 => (MOVDconst [0])
+(CSNEG [cc] x _ flag) && ccARM64Eval(cc, flag) > 0 => x
+(CSNEG [cc] _ y flag) && ccARM64Eval(cc, flag) < 0 => (NEG y)
+(CSINV [cc] x _ flag) && ccARM64Eval(cc, flag) > 0 => x
+// CSINV's alternative is the bitwise complement of y (it is built from
+// (CSEL [cc] x (MVN a) flag)), so the result is MVN, not the boolean Not.
+(CSINV [cc] _ y flag) && ccARM64Eval(cc, flag) < 0 => (MVN y)
+(CSINC [cc] x _ flag) && ccARM64Eval(cc, flag) > 0 => x
+(CSINC [cc] _ y flag) && ccARM64Eval(cc, flag) < 0 => (ADDconst [1] y)
+(CSETM [cc] flag) && ccARM64Eval(cc, flag) > 0 => (MOVDconst [-1])
+(CSETM [cc] flag) && ccARM64Eval(cc, flag) < 0 => (MOVDconst [0])
+
+// absorb flags back into boolean CSEL: when the select tests a boolean value against 0, select on flagArg(boolval) with boolval's own op as the condition instead (presumably flagArg returns the flags operand that produced boolval, or nil - confirm)
+(CSEL [cc] x y (CMPWconst [0] boolval)) && cc == OpARM64NotEqual && flagArg(boolval) != nil =>
+ (CSEL [boolval.Op] x y flagArg(boolval))
+(CSEL [cc] x y (CMPWconst [0] boolval)) && cc == OpARM64Equal && flagArg(boolval) != nil =>
+ (CSEL [arm64Negate(boolval.Op)] x y flagArg(boolval))
+(CSEL0 [cc] x (CMPWconst [0] boolval)) && cc == OpARM64NotEqual && flagArg(boolval) != nil =>
+ (CSEL0 [boolval.Op] x flagArg(boolval))
+(CSEL0 [cc] x (CMPWconst [0] boolval)) && cc == OpARM64Equal && flagArg(boolval) != nil =>
+ (CSEL0 [arm64Negate(boolval.Op)] x flagArg(boolval))
+
+// absorb shifts into ops: fold a constant shift or rotate of one operand into the shifted-operand form of the consuming op
+(NEG x:(SLLconst [c] y)) && clobberIfDead(x) => (NEGshiftLL [c] y)
+(NEG x:(SRLconst [c] y)) && clobberIfDead(x) => (NEGshiftRL [c] y)
+(NEG x:(SRAconst [c] y)) && clobberIfDead(x) => (NEGshiftRA [c] y)
+(MVN x:(SLLconst [c] y)) && clobberIfDead(x) => (MVNshiftLL [c] y)
+(MVN x:(SRLconst [c] y)) && clobberIfDead(x) => (MVNshiftRL [c] y)
+(MVN x:(SRAconst [c] y)) && clobberIfDead(x) => (MVNshiftRA [c] y)
+(MVN x:(RORconst [c] y)) && clobberIfDead(x) => (MVNshiftRO [c] y)
+(ADD x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (ADDshiftLL x0 y [c])
+(ADD x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (ADDshiftRL x0 y [c])
+(ADD x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (ADDshiftRA x0 y [c])
+(SUB x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (SUBshiftLL x0 y [c])
+(SUB x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (SUBshiftRL x0 y [c])
+(SUB x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (SUBshiftRA x0 y [c])
+(AND x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (ANDshiftLL x0 y [c])
+(AND x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (ANDshiftRL x0 y [c])
+(AND x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (ANDshiftRA x0 y [c])
+(AND x0 x1:(RORconst [c] y)) && clobberIfDead(x1) => (ANDshiftRO x0 y [c])
+(OR x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (ORshiftLL x0 y [c]) // useful for combined load
+(OR x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (ORshiftRL x0 y [c])
+(OR x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (ORshiftRA x0 y [c])
+(OR x0 x1:(RORconst [c] y)) && clobberIfDead(x1) => (ORshiftRO x0 y [c])
+(XOR x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (XORshiftLL x0 y [c])
+(XOR x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (XORshiftRL x0 y [c])
+(XOR x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (XORshiftRA x0 y [c])
+(XOR x0 x1:(RORconst [c] y)) && clobberIfDead(x1) => (XORshiftRO x0 y [c])
+(BIC x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (BICshiftLL x0 y [c])
+(BIC x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (BICshiftRL x0 y [c])
+(BIC x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (BICshiftRA x0 y [c])
+(BIC x0 x1:(RORconst [c] y)) && clobberIfDead(x1) => (BICshiftRO x0 y [c])
+(ORN x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (ORNshiftLL x0 y [c])
+(ORN x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (ORNshiftRL x0 y [c])
+(ORN x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (ORNshiftRA x0 y [c])
+(ORN x0 x1:(RORconst [c] y)) && clobberIfDead(x1) => (ORNshiftRO x0 y [c])
+(EON x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (EONshiftLL x0 y [c])
+(EON x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (EONshiftRL x0 y [c])
+(EON x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (EONshiftRA x0 y [c])
+(EON x0 x1:(RORconst [c] y)) && clobberIfDead(x1) => (EONshiftRO x0 y [c])
+(CMP x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (CMPshiftLL x0 y [c])
+(CMP x0:(SLLconst [c] y) x1) && clobberIfDead(x0) => (InvertFlags (CMPshiftLL x1 y [c])) // shift on the left operand: swap the operands and invert the flags
+(CMP x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (CMPshiftRL x0 y [c])
+(CMP x0:(SRLconst [c] y) x1) && clobberIfDead(x0) => (InvertFlags (CMPshiftRL x1 y [c]))
+(CMP x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (CMPshiftRA x0 y [c])
+(CMP x0:(SRAconst [c] y) x1) && clobberIfDead(x0) => (InvertFlags (CMPshiftRA x1 y [c]))
+(CMN x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (CMNshiftLL x0 y [c])
+(CMN x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (CMNshiftRL x0 y [c])
+(CMN x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (CMNshiftRA x0 y [c])
+(TST x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (TSTshiftLL x0 y [c])
+(TST x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (TSTshiftRL x0 y [c])
+(TST x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (TSTshiftRA x0 y [c])
+(TST x0 x1:(RORconst [c] y)) && clobberIfDead(x1) => (TSTshiftRO x0 y [c])
+
+// prefer *const ops to *shift ops: when the unshifted operand is a constant, use the immediate form on an explicitly shifted value
+(ADDshiftLL (MOVDconst [c]) x [d]) => (ADDconst [c] (SLLconst <x.Type> x [d]))
+(ADDshiftRL (MOVDconst [c]) x [d]) => (ADDconst [c] (SRLconst <x.Type> x [d]))
+(ADDshiftRA (MOVDconst [c]) x [d]) => (ADDconst [c] (SRAconst <x.Type> x [d]))
+(ANDshiftLL (MOVDconst [c]) x [d]) => (ANDconst [c] (SLLconst <x.Type> x [d]))
+(ANDshiftRL (MOVDconst [c]) x [d]) => (ANDconst [c] (SRLconst <x.Type> x [d]))
+(ANDshiftRA (MOVDconst [c]) x [d]) => (ANDconst [c] (SRAconst <x.Type> x [d]))
+(ANDshiftRO (MOVDconst [c]) x [d]) => (ANDconst [c] (RORconst <x.Type> x [d]))
+(ORshiftLL (MOVDconst [c]) x [d]) => (ORconst [c] (SLLconst <x.Type> x [d]))
+(ORshiftRL (MOVDconst [c]) x [d]) => (ORconst [c] (SRLconst <x.Type> x [d]))
+(ORshiftRA (MOVDconst [c]) x [d]) => (ORconst [c] (SRAconst <x.Type> x [d]))
+(ORshiftRO (MOVDconst [c]) x [d]) => (ORconst [c] (RORconst <x.Type> x [d]))
+(XORshiftLL (MOVDconst [c]) x [d]) => (XORconst [c] (SLLconst <x.Type> x [d]))
+(XORshiftRL (MOVDconst [c]) x [d]) => (XORconst [c] (SRLconst <x.Type> x [d]))
+(XORshiftRA (MOVDconst [c]) x [d]) => (XORconst [c] (SRAconst <x.Type> x [d]))
+(XORshiftRO (MOVDconst [c]) x [d]) => (XORconst [c] (RORconst <x.Type> x [d]))
+(CMPshiftLL (MOVDconst [c]) x [d]) => (InvertFlags (CMPconst [c] (SLLconst <x.Type> x [d]))) // CMPconst takes the constant as its right operand, so the flags are inverted
+(CMPshiftRL (MOVDconst [c]) x [d]) => (InvertFlags (CMPconst [c] (SRLconst <x.Type> x [d])))
+(CMPshiftRA (MOVDconst [c]) x [d]) => (InvertFlags (CMPconst [c] (SRAconst <x.Type> x [d])))
+(CMNshiftLL (MOVDconst [c]) x [d]) => (CMNconst [c] (SLLconst <x.Type> x [d]))
+(CMNshiftRL (MOVDconst [c]) x [d]) => (CMNconst [c] (SRLconst <x.Type> x [d]))
+(CMNshiftRA (MOVDconst [c]) x [d]) => (CMNconst [c] (SRAconst <x.Type> x [d]))
+(TSTshiftLL (MOVDconst [c]) x [d]) => (TSTconst [c] (SLLconst <x.Type> x [d]))
+(TSTshiftRL (MOVDconst [c]) x [d]) => (TSTconst [c] (SRLconst <x.Type> x [d]))
+(TSTshiftRA (MOVDconst [c]) x [d]) => (TSTconst [c] (SRAconst <x.Type> x [d]))
+(TSTshiftRO (MOVDconst [c]) x [d]) => (TSTconst [c] (RORconst <x.Type> x [d]))
+
+// constant folding in *shift ops: a constant shifted operand is shifted (LL/RL as unsigned, RA as signed) or rotated (RO) at compile time; MVN/NEG fold to a plain constant, BIC/ORN/EON fold to AND/OR/XOR with the complemented constant
+(MVNshiftLL (MOVDconst [c]) [d]) => (MOVDconst [^int64(uint64(c)<<uint64(d))])
+(MVNshiftRL (MOVDconst [c]) [d]) => (MOVDconst [^int64(uint64(c)>>uint64(d))])
+(MVNshiftRA (MOVDconst [c]) [d]) => (MOVDconst [^(c>>uint64(d))])
+(MVNshiftRO (MOVDconst [c]) [d]) => (MOVDconst [^rotateRight64(c, d)])
+(NEGshiftLL (MOVDconst [c]) [d]) => (MOVDconst [-int64(uint64(c)<<uint64(d))])
+(NEGshiftRL (MOVDconst [c]) [d]) => (MOVDconst [-int64(uint64(c)>>uint64(d))])
+(NEGshiftRA (MOVDconst [c]) [d]) => (MOVDconst [-(c>>uint64(d))])
+(ADDshiftLL x (MOVDconst [c]) [d]) => (ADDconst x [int64(uint64(c)<<uint64(d))])
+(ADDshiftRL x (MOVDconst [c]) [d]) => (ADDconst x [int64(uint64(c)>>uint64(d))])
+(ADDshiftRA x (MOVDconst [c]) [d]) => (ADDconst x [c>>uint64(d)])
+(SUBshiftLL x (MOVDconst [c]) [d]) => (SUBconst x [int64(uint64(c)<<uint64(d))])
+(SUBshiftRL x (MOVDconst [c]) [d]) => (SUBconst x [int64(uint64(c)>>uint64(d))])
+(SUBshiftRA x (MOVDconst [c]) [d]) => (SUBconst x [c>>uint64(d)])
+(ANDshiftLL x (MOVDconst [c]) [d]) => (ANDconst x [int64(uint64(c)<<uint64(d))])
+(ANDshiftRL x (MOVDconst [c]) [d]) => (ANDconst x [int64(uint64(c)>>uint64(d))])
+(ANDshiftRA x (MOVDconst [c]) [d]) => (ANDconst x [c>>uint64(d)])
+(ANDshiftRO x (MOVDconst [c]) [d]) => (ANDconst x [rotateRight64(c, d)])
+(ORshiftLL x (MOVDconst [c]) [d]) => (ORconst x [int64(uint64(c)<<uint64(d))])
+(ORshiftRL x (MOVDconst [c]) [d]) => (ORconst x [int64(uint64(c)>>uint64(d))])
+(ORshiftRA x (MOVDconst [c]) [d]) => (ORconst x [c>>uint64(d)])
+(ORshiftRO x (MOVDconst [c]) [d]) => (ORconst x [rotateRight64(c, d)])
+(XORshiftLL x (MOVDconst [c]) [d]) => (XORconst x [int64(uint64(c)<<uint64(d))])
+(XORshiftRL x (MOVDconst [c]) [d]) => (XORconst x [int64(uint64(c)>>uint64(d))])
+(XORshiftRA x (MOVDconst [c]) [d]) => (XORconst x [c>>uint64(d)])
+(XORshiftRO x (MOVDconst [c]) [d]) => (XORconst x [rotateRight64(c, d)])
+(BICshiftLL x (MOVDconst [c]) [d]) => (ANDconst x [^int64(uint64(c)<<uint64(d))])
+(BICshiftRL x (MOVDconst [c]) [d]) => (ANDconst x [^int64(uint64(c)>>uint64(d))])
+(BICshiftRA x (MOVDconst [c]) [d]) => (ANDconst x [^(c>>uint64(d))])
+(BICshiftRO x (MOVDconst [c]) [d]) => (ANDconst x [^rotateRight64(c, d)])
+(ORNshiftLL x (MOVDconst [c]) [d]) => (ORconst x [^int64(uint64(c)<<uint64(d))])
+(ORNshiftRL x (MOVDconst [c]) [d]) => (ORconst x [^int64(uint64(c)>>uint64(d))])
+(ORNshiftRA x (MOVDconst [c]) [d]) => (ORconst x [^(c>>uint64(d))])
+(ORNshiftRO x (MOVDconst [c]) [d]) => (ORconst x [^rotateRight64(c, d)])
+(EONshiftLL x (MOVDconst [c]) [d]) => (XORconst x [^int64(uint64(c)<<uint64(d))])
+(EONshiftRL x (MOVDconst [c]) [d]) => (XORconst x [^int64(uint64(c)>>uint64(d))])
+(EONshiftRA x (MOVDconst [c]) [d]) => (XORconst x [^(c>>uint64(d))])
+(EONshiftRO x (MOVDconst [c]) [d]) => (XORconst x [^rotateRight64(c, d)])
+(CMPshiftLL x (MOVDconst [c]) [d]) => (CMPconst x [int64(uint64(c)<<uint64(d))])
+(CMPshiftRL x (MOVDconst [c]) [d]) => (CMPconst x [int64(uint64(c)>>uint64(d))])
+(CMPshiftRA x (MOVDconst [c]) [d]) => (CMPconst x [c>>uint64(d)])
+(CMNshiftLL x (MOVDconst [c]) [d]) => (CMNconst x [int64(uint64(c)<<uint64(d))])
+(CMNshiftRL x (MOVDconst [c]) [d]) => (CMNconst x [int64(uint64(c)>>uint64(d))])
+(CMNshiftRA x (MOVDconst [c]) [d]) => (CMNconst x [c>>uint64(d)])
+(TSTshiftLL x (MOVDconst [c]) [d]) => (TSTconst x [int64(uint64(c)<<uint64(d))])
+(TSTshiftRL x (MOVDconst [c]) [d]) => (TSTconst x [int64(uint64(c)>>uint64(d))])
+(TSTshiftRA x (MOVDconst [c]) [d]) => (TSTconst x [c>>uint64(d)])
+(TSTshiftRO x (MOVDconst [c]) [d]) => (TSTconst x [rotateRight64(c, d)])
+
+// simplification with *shift ops: both operands are the same shift of the same x, so SUB/XOR/BIC yield 0, AND/OR yield the already-shifted value y, and EON/ORN yield -1
+(SUBshiftLL (SLLconst x [c]) x [c]) => (MOVDconst [0])
+(SUBshiftRL (SRLconst x [c]) x [c]) => (MOVDconst [0])
+(SUBshiftRA (SRAconst x [c]) x [c]) => (MOVDconst [0])
+(ANDshiftLL y:(SLLconst x [c]) x [c]) => y
+(ANDshiftRL y:(SRLconst x [c]) x [c]) => y
+(ANDshiftRA y:(SRAconst x [c]) x [c]) => y
+(ANDshiftRO y:(RORconst x [c]) x [c]) => y
+(ORshiftLL y:(SLLconst x [c]) x [c]) => y
+(ORshiftRL y:(SRLconst x [c]) x [c]) => y
+(ORshiftRA y:(SRAconst x [c]) x [c]) => y
+(ORshiftRO y:(RORconst x [c]) x [c]) => y
+(XORshiftLL (SLLconst x [c]) x [c]) => (MOVDconst [0])
+(XORshiftRL (SRLconst x [c]) x [c]) => (MOVDconst [0])
+(XORshiftRA (SRAconst x [c]) x [c]) => (MOVDconst [0])
+(XORshiftRO (RORconst x [c]) x [c]) => (MOVDconst [0])
+(BICshiftLL (SLLconst x [c]) x [c]) => (MOVDconst [0])
+(BICshiftRL (SRLconst x [c]) x [c]) => (MOVDconst [0])
+(BICshiftRA (SRAconst x [c]) x [c]) => (MOVDconst [0])
+(BICshiftRO (RORconst x [c]) x [c]) => (MOVDconst [0])
+(EONshiftLL (SLLconst x [c]) x [c]) => (MOVDconst [-1])
+(EONshiftRL (SRLconst x [c]) x [c]) => (MOVDconst [-1])
+(EONshiftRA (SRAconst x [c]) x [c]) => (MOVDconst [-1])
+(EONshiftRO (RORconst x [c]) x [c]) => (MOVDconst [-1])
+(ORNshiftLL (SLLconst x [c]) x [c]) => (MOVDconst [-1])
+(ORNshiftRL (SRLconst x [c]) x [c]) => (MOVDconst [-1])
+(ORNshiftRA (SRAconst x [c]) x [c]) => (MOVDconst [-1])
+(ORNshiftRO (RORconst x [c]) x [c]) => (MOVDconst [-1])
+
+// rev16w | rev16: recognize expressions that swap the two bytes inside each 16-bit halfword
+// ((x>>8) | (x<<8)) => (REV16W x), the type of x is uint16, "|" can also be "^" or "+".
+((ADDshiftLL|ORshiftLL|XORshiftLL) <typ.UInt16> [8] (UBFX <typ.UInt16> [armBFAuxInt(8, 8)] x) x) => (REV16W x)
+
+// ((x & 0xff00ff00)>>8) | ((x & 0x00ff00ff)<<8), "|" can also be "^" or "+"; the masks are compared as uint32 and the shift is a 24-bit UBFX at bit 8.
+((ADDshiftLL|ORshiftLL|XORshiftLL) [8] (UBFX [armBFAuxInt(8, 24)] (ANDconst [c1] x)) (ANDconst [c2] x))
+ && uint32(c1) == 0xff00ff00 && uint32(c2) == 0x00ff00ff
+ => (REV16W x)
+
+// ((x & 0xff00ff00ff00ff00)>>8) | ((x & 0x00ff00ff00ff00ff)<<8), "|" can also be "^" or "+".
+((ADDshiftLL|ORshiftLL|XORshiftLL) [8] (SRLconst [8] (ANDconst [c1] x)) (ANDconst [c2] x))
+ && (uint64(c1) == 0xff00ff00ff00ff00 && uint64(c2) == 0x00ff00ff00ff00ff)
+ => (REV16 x)
+
+// ((x & 0xff00ff00)>>8) | ((x & 0x00ff00ff)<<8), "|" can also be "^" or "+"; here the masks are full 64-bit constants covering only the low 32 bits, so x is masked with 0xffffffff before the 64-bit REV16.
+((ADDshiftLL|ORshiftLL|XORshiftLL) [8] (SRLconst [8] (ANDconst [c1] x)) (ANDconst [c2] x))
+ && (uint64(c1) == 0xff00ff00 && uint64(c2) == 0x00ff00ff)
+ => (REV16 (ANDconst <x.Type> [0xffffffff] x))
+
+// Extract from reg pair: (x2 << c) combined (by ADD, OR or XOR) with (x >> (64-c)) becomes EXTRconst; for 4-byte types the right shift is matched as a c-bit UBFX starting at bit 32-c and becomes EXTRWconst
+(ADDshiftLL [c] (SRLconst x [64-c]) x2) => (EXTRconst [64-c] x2 x)
+( ORshiftLL [c] (SRLconst x [64-c]) x2) => (EXTRconst [64-c] x2 x)
+(XORshiftLL [c] (SRLconst x [64-c]) x2) => (EXTRconst [64-c] x2 x)
+
+(ADDshiftLL <t> [c] (UBFX [bfc] x) x2) && c < 32 && t.Size() == 4 && bfc == armBFAuxInt(32-c, c)
+ => (EXTRWconst [32-c] x2 x)
+( ORshiftLL <t> [c] (UBFX [bfc] x) x2) && c < 32 && t.Size() == 4 && bfc == armBFAuxInt(32-c, c)
+ => (EXTRWconst [32-c] x2 x)
+(XORshiftLL <t> [c] (UBFX [bfc] x) x2) && c < 32 && t.Size() == 4 && bfc == armBFAuxInt(32-c, c)
+ => (EXTRWconst [32-c] x2 x)
+
+// Rewrite special pairs of shifts to AND: shifting by the same amount c in both directions only clears c bits at one end.
+// On ARM64 the bitmask can fit into an instruction.
+(SRLconst [c] (SLLconst [c] x)) && 0 < c && c < 64 => (ANDconst [1<<uint(64-c)-1] x) // mask out high bits
+(SLLconst [c] (SRLconst [c] x)) && 0 < c && c < 64 => (ANDconst [^(1<<uint(c)-1)] x) // mask out low bits
+
+// Special case setting bits to 1: when every bit cleared by the AND mask c2 is set again by the OR constant c1, the AND is redundant. An example is math.Copysign(c,-1)
+(ORconst [c1] (ANDconst [c2] x)) && c2|c1 == ^0 => (ORconst [c1] x)
+
+// If the left-shift amount is at least the datasize (32, 16, 8), the zero-extended result is constant 0.
+(MOVWUreg (SLLconst [lc] x)) && lc >= 32 => (MOVDconst [0])
+(MOVHUreg (SLLconst [lc] x)) && lc >= 16 => (MOVDconst [0])
+(MOVBUreg (SLLconst [lc] x)) && lc >= 8 => (MOVDconst [0])
+
+// After zero extension, the upper (64-datasize(32|16|8)) bits are zero, so a right shift by at least datasize can be optimized to constant 0.
+(SRLconst [rc] (MOVWUreg x)) && rc >= 32 => (MOVDconst [0])
+(SRLconst [rc] (MOVHUreg x)) && rc >= 16 => (MOVDconst [0])
+(SRLconst [rc] (MOVBUreg x)) && rc >= 8 => (MOVDconst [0])
+
+// bitfield ops
+
+// sbfiz (the aux value is built as armBFAuxInt(lsb, width))
+// (x << lc) >> rc with an arithmetic right shift and lc > rc
+(SRAconst [rc] (SLLconst [lc] x)) && lc > rc => (SBFIZ [armBFAuxInt(lc-rc, 64-lc)] x)
+// int64(x << lc), only while the shift stays below the sign-extended width
+(MOVWreg (SLLconst [lc] x)) && lc < 32 => (SBFIZ [armBFAuxInt(lc, 32-lc)] x)
+(MOVHreg (SLLconst [lc] x)) && lc < 16 => (SBFIZ [armBFAuxInt(lc, 16-lc)] x)
+(MOVBreg (SLLconst [lc] x)) && lc < 8 => (SBFIZ [armBFAuxInt(lc, 8-lc)] x)
+// int64(x) << lc, the width is capped so that lsb+width does not exceed 64
+(SLLconst [lc] (MOVWreg x)) => (SBFIZ [armBFAuxInt(lc, min(32, 64-lc))] x)
+(SLLconst [lc] (MOVHreg x)) => (SBFIZ [armBFAuxInt(lc, min(16, 64-lc))] x)
+(SLLconst [lc] (MOVBreg x)) => (SBFIZ [armBFAuxInt(lc, min(8, 64-lc))] x)
+
+// sbfx
+// (x << lc) >> rc with an arithmetic right shift and lc <= rc
+(SRAconst [rc] (SLLconst [lc] x)) && lc <= rc => (SBFX [armBFAuxInt(rc-lc, 64-rc)] x)
+// int64(x) >> rc, only while the shift stays below the sign-extended width
+(SRAconst [rc] (MOVWreg x)) && rc < 32 => (SBFX [armBFAuxInt(rc, 32-rc)] x)
+(SRAconst [rc] (MOVHreg x)) && rc < 16 => (SBFX [armBFAuxInt(rc, 16-rc)] x)
+(SRAconst [rc] (MOVBreg x)) && rc < 8 => (SBFX [armBFAuxInt(rc, 8-rc)] x)
+// merge sbfx and sign-extension into sbfx (the extension is dropped when the field is no wider than the extended size)
+(MOVWreg (SBFX [bfc] x)) && bfc.getARM64BFwidth() <= 32 => (SBFX [bfc] x)
+(MOVHreg (SBFX [bfc] x)) && bfc.getARM64BFwidth() <= 16 => (SBFX [bfc] x)
+(MOVBreg (SBFX [bfc] x)) && bfc.getARM64BFwidth() <= 8 => (SBFX [bfc] x)
+
+// sbfiz/sbfx combinations: merge shifts into bitfield ops (a shift smaller than the field's lsb stays an SBFIZ with a lower lsb; a shift that reaches into the field becomes an SBFX of the remaining bits)
+(SRAconst [sc] (SBFIZ [bfc] x)) && sc < bfc.getARM64BFlsb()
+ => (SBFIZ [armBFAuxInt(bfc.getARM64BFlsb()-sc, bfc.getARM64BFwidth())] x)
+(SRAconst [sc] (SBFIZ [bfc] x)) && sc >= bfc.getARM64BFlsb()
+ && sc < bfc.getARM64BFlsb()+bfc.getARM64BFwidth()
+ => (SBFX [armBFAuxInt(sc-bfc.getARM64BFlsb(), bfc.getARM64BFlsb()+bfc.getARM64BFwidth()-sc)] x)
+
+// ubfiz
+// (x << lc) >> rc with a logical right shift and lc > rc
+(SRLconst [rc] (SLLconst [lc] x)) && lc > rc => (UBFIZ [armBFAuxInt(lc-rc, 64-lc)] x)
+// uint64(x) << lc, the width is capped so that lsb+width does not exceed 64
+(SLLconst [lc] (MOVWUreg x)) => (UBFIZ [armBFAuxInt(lc, min(32, 64-lc))] x)
+(SLLconst [lc] (MOVHUreg x)) => (UBFIZ [armBFAuxInt(lc, min(16, 64-lc))] x)
+(SLLconst [lc] (MOVBUreg x)) => (UBFIZ [armBFAuxInt(lc, min(8, 64-lc))] x)
+// uint64(x << lc), only while the shift stays below the zero-extended width
+(MOVWUreg (SLLconst [lc] x)) && lc < 32 => (UBFIZ [armBFAuxInt(lc, 32-lc)] x)
+(MOVHUreg (SLLconst [lc] x)) && lc < 16 => (UBFIZ [armBFAuxInt(lc, 16-lc)] x)
+(MOVBUreg (SLLconst [lc] x)) && lc < 8 => (UBFIZ [armBFAuxInt(lc, 8-lc)] x)
+
+// merge ANDconst into ubfiz
+// (x & ac) << sc
+(SLLconst [sc] (ANDconst [ac] x)) && isARM64BFMask(sc, ac, 0)
+ => (UBFIZ [armBFAuxInt(sc, arm64BFWidth(ac, 0))] x)
+// (x << sc) & ac
+(ANDconst [ac] (SLLconst [sc] x)) && isARM64BFMask(sc, ac, sc)
+ => (UBFIZ [armBFAuxInt(sc, arm64BFWidth(ac, sc))] x)
+
+// ubfx
+// (x << lc) >> rc with a logical right shift and lc < rc (lc == rc is covered by the SRLconst/SLLconst => ANDconst rule)
+(SRLconst [rc] (SLLconst [lc] x)) && lc < rc => (UBFX [armBFAuxInt(rc-lc, 64-rc)] x)
+// uint64(x) >> rc
+(SRLconst [rc] (MOVWUreg x)) && rc < 32 => (UBFX [armBFAuxInt(rc, 32-rc)] x)
+(SRLconst [rc] (MOVHUreg x)) && rc < 16 => (UBFX [armBFAuxInt(rc, 16-rc)] x)
+(SRLconst [rc] (MOVBUreg x)) && rc < 8 => (UBFX [armBFAuxInt(rc, 8-rc)] x)
+// uint64(x >> rc)
+(MOVWUreg (SRLconst [rc] x)) && rc < 32 => (UBFX [armBFAuxInt(rc, 32)] x)
+(MOVHUreg (SRLconst [rc] x)) && rc < 16 => (UBFX [armBFAuxInt(rc, 16)] x)
+(MOVBUreg (SRLconst [rc] x)) && rc < 8 => (UBFX [armBFAuxInt(rc, 8)] x)
+// merge ANDconst into ubfx
+// (x >> sc) & ac
+(ANDconst [ac] (SRLconst [sc] x)) && isARM64BFMask(sc, ac, 0)
+ => (UBFX [armBFAuxInt(sc, arm64BFWidth(ac, 0))] x)
+// (x & ac) >> sc
+(SRLconst [sc] (ANDconst [ac] x)) && isARM64BFMask(sc, ac, sc)
+ => (UBFX [armBFAuxInt(sc, arm64BFWidth(ac, sc))] x)
+// merge ANDconst and ubfx into ubfx
+(ANDconst [c] (UBFX [bfc] x)) && isARM64BFMask(0, c, 0) =>
+ (UBFX [armBFAuxInt(bfc.getARM64BFlsb(), min(bfc.getARM64BFwidth(), arm64BFWidth(c, 0)))] x)
+(UBFX [bfc] (ANDconst [c] x)) && isARM64BFMask(0, c, 0) && bfc.getARM64BFlsb() + bfc.getARM64BFwidth() <= arm64BFWidth(c, 0) =>
+ (UBFX [bfc] x)
+// merge ubfx and zero-extension into ubfx
+(MOVWUreg (UBFX [bfc] x)) && bfc.getARM64BFwidth() <= 32 => (UBFX [bfc] x)
+(MOVHUreg (UBFX [bfc] x)) && bfc.getARM64BFwidth() <= 16 => (UBFX [bfc] x)
+(MOVBUreg (UBFX [bfc] x)) && bfc.getARM64BFwidth() <= 8 => (UBFX [bfc] x)
+
+// ubfiz/ubfx combinations: merge shifts into bitfield ops (the conditions keep the merged field non-empty and inside 64 bits)
+(SRLconst [sc] (UBFX [bfc] x)) && sc < bfc.getARM64BFwidth()
+ => (UBFX [armBFAuxInt(bfc.getARM64BFlsb()+sc, bfc.getARM64BFwidth()-sc)] x)
+(UBFX [bfc] (SRLconst [sc] x)) && sc+bfc.getARM64BFwidth()+bfc.getARM64BFlsb() < 64
+ => (UBFX [armBFAuxInt(bfc.getARM64BFlsb()+sc, bfc.getARM64BFwidth())] x)
+(SLLconst [sc] (UBFIZ [bfc] x)) && sc+bfc.getARM64BFwidth()+bfc.getARM64BFlsb() < 64
+ => (UBFIZ [armBFAuxInt(bfc.getARM64BFlsb()+sc, bfc.getARM64BFwidth())] x)
+(UBFIZ [bfc] (SLLconst [sc] x)) && sc < bfc.getARM64BFwidth()
+ => (UBFIZ [armBFAuxInt(bfc.getARM64BFlsb()+sc, bfc.getARM64BFwidth()-sc)] x)
+// ((x << c1) >> c2) >> c3: a right shift equal to the UBFIZ lsb leaves a plain mask, a smaller one stays UBFIZ, a larger one inside the field becomes UBFX
+(SRLconst [sc] (UBFIZ [bfc] x)) && sc == bfc.getARM64BFlsb()
+ => (ANDconst [1<<uint(bfc.getARM64BFwidth())-1] x)
+(SRLconst [sc] (UBFIZ [bfc] x)) && sc < bfc.getARM64BFlsb()
+ => (UBFIZ [armBFAuxInt(bfc.getARM64BFlsb()-sc, bfc.getARM64BFwidth())] x)
+(SRLconst [sc] (UBFIZ [bfc] x)) && sc > bfc.getARM64BFlsb()
+ && sc < bfc.getARM64BFlsb()+bfc.getARM64BFwidth()
+ => (UBFX [armBFAuxInt(sc-bfc.getARM64BFlsb(), bfc.getARM64BFlsb()+bfc.getARM64BFwidth()-sc)] x)
+// ((x << c1) << c2) >> c3: a left shift equal to the UBFX lsb leaves a plain mask, a smaller one stays UBFX, a larger one inside the field becomes UBFIZ
+(UBFX [bfc] (SLLconst [sc] x)) && sc == bfc.getARM64BFlsb()
+ => (ANDconst [1<<uint(bfc.getARM64BFwidth())-1] x)
+(UBFX [bfc] (SLLconst [sc] x)) && sc < bfc.getARM64BFlsb()
+ => (UBFX [armBFAuxInt(bfc.getARM64BFlsb()-sc, bfc.getARM64BFwidth())] x)
+(UBFX [bfc] (SLLconst [sc] x)) && sc > bfc.getARM64BFlsb()
+ && sc < bfc.getARM64BFlsb()+bfc.getARM64BFwidth()
+ => (UBFIZ [armBFAuxInt(sc-bfc.getARM64BFlsb(), bfc.getARM64BFlsb()+bfc.getARM64BFwidth()-sc)] x)
+
+// bfi: OR of a positioned field (UBFIZ, or a shifted SLLconst) with a value whose AND mask clears exactly that field's bits
+(OR (UBFIZ [bfc] x) (ANDconst [ac] y))
+ && ac == ^((1<<uint(bfc.getARM64BFwidth())-1) << uint(bfc.getARM64BFlsb()))
+ => (BFI [bfc] y x)
+(ORshiftRL [rc] (ANDconst [ac] x) (SLLconst [lc] y))
+ && lc > rc && ac == ^((1<<uint(64-lc)-1) << uint64(lc-rc))
+ => (BFI [armBFAuxInt(lc-rc, 64-lc)] x y)
+// bfxil: OR of an extracted field (UBFX, or a shifted SLLconst) with a value whose low bits of the same width are already clear
+(OR (UBFX [bfc] x) (ANDconst [ac] y)) && ac == ^(1<<uint(bfc.getARM64BFwidth())-1)
+ => (BFXIL [bfc] y x)
+(ORshiftLL [sc] (UBFX [bfc] x) (SRLconst [sc] y)) && sc == bfc.getARM64BFwidth()
+ => (BFXIL [bfc] y x)
+(ORshiftRL [rc] (ANDconst [ac] y) (SLLconst [lc] x)) && lc < rc && ac == ^((1<<uint(64-rc)-1))
+ => (BFXIL [armBFAuxInt(rc-lc, 64-rc)] y x)
+
+// do combined loads
+// little endian loads
+// b[0] | b[1]<<8 => load 16-bit (both byte loads must be single-use, read adjacent addresses from the same mem, and have a valid mergePoint)
+(ORshiftLL <t> [8]
+ y0:(MOVDnop x0:(MOVBUload [i0] {s} p mem))
+ y1:(MOVDnop x1:(MOVBUload [i1] {s} p mem)))
+ && i1 == i0+1
+ && x0.Uses == 1 && x1.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, y0, y1)
+ => @mergePoint(b,x0,x1) (MOVHUload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem)
+(ORshiftLL <t> [8]
+ y0:(MOVDnop x0:(MOVBUloadidx ptr0 idx0 mem))
+ y1:(MOVDnop x1:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x0, x1, y0, y1)
+ => @mergePoint(b,x0,x1) (MOVHUloadidx <t> ptr0 idx0 mem)
+(ORshiftLL <t> [8]
+ y0:(MOVDnop x0:(MOVBUloadidx ptr idx mem))
+ y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [1] idx) mem)))
+ && x0.Uses == 1 && x1.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, y0, y1)
+ => @mergePoint(b,x0,x1) (MOVHUloadidx <t> ptr idx mem)
+
+// b[0] | b[1]<<8 | b[2]<<16 | b[3]<<24 => load 32-bit (the low 16 bits are matched as an already-combined halfword load)
+(ORshiftLL <t> [24] o0:(ORshiftLL [16]
+ x0:(MOVHUload [i0] {s} p mem)
+ y1:(MOVDnop x1:(MOVBUload [i2] {s} p mem)))
+ y2:(MOVDnop x2:(MOVBUload [i3] {s} p mem)))
+ && i2 == i0+2
+ && i3 == i0+3
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && y1.Uses == 1 && y2.Uses == 1
+ && o0.Uses == 1
+ && mergePoint(b,x0,x1,x2) != nil
+ && clobber(x0, x1, x2, y1, y2, o0)
+ => @mergePoint(b,x0,x1,x2) (MOVWUload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem)
+(ORshiftLL <t> [24] o0:(ORshiftLL [16]
+ x0:(MOVHUloadidx ptr0 idx0 mem)
+ y1:(MOVDnop x1:(MOVBUload [2] {s} p1:(ADD ptr1 idx1) mem)))
+ y2:(MOVDnop x2:(MOVBUload [3] {s} p mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && y1.Uses == 1 && y2.Uses == 1
+ && o0.Uses == 1
+ && mergePoint(b,x0,x1,x2) != nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2, y1, y2, o0)
+ => @mergePoint(b,x0,x1,x2) (MOVWUloadidx <t> ptr0 idx0 mem)
+(ORshiftLL <t> [24] o0:(ORshiftLL [16]
+ x0:(MOVHUloadidx ptr idx mem)
+ y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [2] idx) mem)))
+ y2:(MOVDnop x2:(MOVBUloadidx ptr (ADDconst [3] idx) mem)))
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && y1.Uses == 1 && y2.Uses == 1
+ && o0.Uses == 1
+ && mergePoint(b,x0,x1,x2) != nil
+ && clobber(x0, x1, x2, y1, y2, o0)
+ => @mergePoint(b,x0,x1,x2) (MOVWUloadidx <t> ptr idx mem)
+(ORshiftLL <t> [24] o0:(ORshiftLL [16]
+ x0:(MOVHUloadidx2 ptr0 idx0 mem)
+ y1:(MOVDnop x1:(MOVBUload [2] {s} p1:(ADDshiftLL [1] ptr1 idx1) mem)))
+ y2:(MOVDnop x2:(MOVBUload [3] {s} p mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && y1.Uses == 1 && y2.Uses == 1
+ && o0.Uses == 1
+ && mergePoint(b,x0,x1,x2) != nil
+ && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1)
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2, y1, y2, o0)
+ => @mergePoint(b,x0,x1,x2) (MOVWUloadidx <t> ptr0 (SLLconst <idx0.Type> [1] idx0) mem)
+
+// b[0] | b[1]<<8 | b[2]<<16 | b[3]<<24 | b[4]<<32 | b[5]<<40 | b[6]<<48 | b[7]<<56 => load 64-bit (the low 32 bits are matched as an already-combined word load)
+(ORshiftLL <t> [56] o0:(ORshiftLL [48] o1:(ORshiftLL [40] o2:(ORshiftLL [32]
+ x0:(MOVWUload [i0] {s} p mem)
+ y1:(MOVDnop x1:(MOVBUload [i4] {s} p mem)))
+ y2:(MOVDnop x2:(MOVBUload [i5] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [i6] {s} p mem)))
+ y4:(MOVDnop x4:(MOVBUload [i7] {s} p mem)))
+ && i4 == i0+4
+ && i5 == i0+5
+ && i6 == i0+6
+ && i7 == i0+7
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1
+ && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && y4.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4) != nil
+ && clobber(x0, x1, x2, x3, x4, y1, y2, y3, y4, o0, o1, o2)
+ => @mergePoint(b,x0,x1,x2,x3,x4) (MOVDload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem)
+(ORshiftLL <t> [56] o0:(ORshiftLL [48] o1:(ORshiftLL [40] o2:(ORshiftLL [32]
+ x0:(MOVWUloadidx ptr0 idx0 mem)
+ y1:(MOVDnop x1:(MOVBUload [4] {s} p1:(ADD ptr1 idx1) mem)))
+ y2:(MOVDnop x2:(MOVBUload [5] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [6] {s} p mem)))
+ y4:(MOVDnop x4:(MOVBUload [7] {s} p mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1
+ && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && y4.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4) != nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2, x3, x4, y1, y2, y3, y4, o0, o1, o2)
+ => @mergePoint(b,x0,x1,x2,x3,x4) (MOVDloadidx <t> ptr0 idx0 mem)
+(ORshiftLL <t> [56] o0:(ORshiftLL [48] o1:(ORshiftLL [40] o2:(ORshiftLL [32]
+ x0:(MOVWUloadidx4 ptr0 idx0 mem)
+ y1:(MOVDnop x1:(MOVBUload [4] {s} p1:(ADDshiftLL [2] ptr1 idx1) mem)))
+ y2:(MOVDnop x2:(MOVBUload [5] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [6] {s} p mem)))
+ y4:(MOVDnop x4:(MOVBUload [7] {s} p mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1
+ && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && y4.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4) != nil
+ && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1)
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2, x3, x4, y1, y2, y3, y4, o0, o1, o2)
+ => @mergePoint(b,x0,x1,x2,x3,x4) (MOVDloadidx <t> ptr0 (SLLconst <idx0.Type> [2] idx0) mem)
+(ORshiftLL <t> [56] o0:(ORshiftLL [48] o1:(ORshiftLL [40] o2:(ORshiftLL [32]
+ x0:(MOVWUloadidx ptr idx mem)
+ y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [4] idx) mem)))
+ y2:(MOVDnop x2:(MOVBUloadidx ptr (ADDconst [5] idx) mem)))
+ y3:(MOVDnop x3:(MOVBUloadidx ptr (ADDconst [6] idx) mem)))
+ y4:(MOVDnop x4:(MOVBUloadidx ptr (ADDconst [7] idx) mem)))
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1
+ && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && y4.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4) != nil
+ && clobber(x0, x1, x2, x3, x4, y1, y2, y3, y4, o0, o1, o2)
+ => @mergePoint(b,x0,x1,x2,x3,x4) (MOVDloadidx <t> ptr idx mem)
+
+// b[3]<<24 | b[2]<<16 | b[1]<<8 | b[0] => load 32-bit (highest byte first in the expression; all four byte loads are matched at once)
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] s0:(SLLconst [24]
+ y0:(MOVDnop x0:(MOVBUload [i3] {s} p mem)))
+ y1:(MOVDnop x1:(MOVBUload [i2] {s} p mem)))
+ y2:(MOVDnop x2:(MOVBUload [i1] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [i0] {s} p mem)))
+ && i1 == i0+1
+ && i2 == i0+2
+ && i3 == i0+3
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3) != nil
+ && clobber(x0, x1, x2, x3, y0, y1, y2, y3, o0, o1, s0)
+ => @mergePoint(b,x0,x1,x2,x3) (MOVWUload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem)
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] s0:(SLLconst [24]
+ y0:(MOVDnop x0:(MOVBUload [3] {s} p mem)))
+ y1:(MOVDnop x1:(MOVBUload [2] {s} p mem)))
+ y2:(MOVDnop x2:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem)))
+ y3:(MOVDnop x3:(MOVBUloadidx ptr0 idx0 mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3) != nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2, x3, y0, y1, y2, y3, o0, o1, s0)
+ => @mergePoint(b,x0,x1,x2,x3) (MOVWUloadidx <t> ptr0 idx0 mem)
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] s0:(SLLconst [24]
+ y0:(MOVDnop x0:(MOVBUloadidx ptr (ADDconst [3] idx) mem)))
+ y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [2] idx) mem)))
+ y2:(MOVDnop x2:(MOVBUloadidx ptr (ADDconst [1] idx) mem)))
+ y3:(MOVDnop x3:(MOVBUloadidx ptr idx mem)))
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3) != nil
+ && clobber(x0, x1, x2, x3, y0, y1, y2, y3, o0, o1, s0)
+ => @mergePoint(b,x0,x1,x2,x3) (MOVWUloadidx <t> ptr idx mem)
+
+// b[7]<<56 | b[6]<<48 | b[5]<<40 | b[4]<<32 | b[3]<<24 | b[2]<<16 | b[1]<<8 | b[0] => load 64-bit (highest byte first in the expression; all eight byte loads are matched at once)
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] o2:(ORshiftLL [24] o3:(ORshiftLL [32] o4:(ORshiftLL [40] o5:(ORshiftLL [48] s0:(SLLconst [56]
+ y0:(MOVDnop x0:(MOVBUload [i7] {s} p mem)))
+ y1:(MOVDnop x1:(MOVBUload [i6] {s} p mem)))
+ y2:(MOVDnop x2:(MOVBUload [i5] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [i4] {s} p mem)))
+ y4:(MOVDnop x4:(MOVBUload [i3] {s} p mem)))
+ y5:(MOVDnop x5:(MOVBUload [i2] {s} p mem)))
+ y6:(MOVDnop x6:(MOVBUload [i1] {s} p mem)))
+ y7:(MOVDnop x7:(MOVBUload [i0] {s} p mem)))
+ && i1 == i0+1
+ && i2 == i0+2
+ && i3 == i0+3
+ && i4 == i0+4
+ && i5 == i0+5
+ && i6 == i0+6
+ && i7 == i0+7
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && y4.Uses == 1 && y5.Uses == 1 && y6.Uses == 1 && y7.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1
+ && o4.Uses == 1 && o5.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil
+ && clobber(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7, o0, o1, o2, o3, o4, o5, s0)
+ => @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (MOVDload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem)
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] o2:(ORshiftLL [24] o3:(ORshiftLL [32] o4:(ORshiftLL [40] o5:(ORshiftLL [48] s0:(SLLconst [56]
+ y0:(MOVDnop x0:(MOVBUload [7] {s} p mem)))
+ y1:(MOVDnop x1:(MOVBUload [6] {s} p mem)))
+ y2:(MOVDnop x2:(MOVBUload [5] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [4] {s} p mem)))
+ y4:(MOVDnop x4:(MOVBUload [3] {s} p mem)))
+ y5:(MOVDnop x5:(MOVBUload [2] {s} p mem)))
+ y6:(MOVDnop x6:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem)))
+ y7:(MOVDnop x7:(MOVBUloadidx ptr0 idx0 mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && y4.Uses == 1 && y5.Uses == 1 && y6.Uses == 1 && y7.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1
+ && o4.Uses == 1 && o5.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7, o0, o1, o2, o3, o4, o5, s0)
+ => @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (MOVDloadidx <t> ptr0 idx0 mem)
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] o2:(ORshiftLL [24] o3:(ORshiftLL [32] o4:(ORshiftLL [40] o5:(ORshiftLL [48] s0:(SLLconst [56]
+ y0:(MOVDnop x0:(MOVBUloadidx ptr (ADDconst [7] idx) mem)))
+ y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [6] idx) mem)))
+ y2:(MOVDnop x2:(MOVBUloadidx ptr (ADDconst [5] idx) mem)))
+ y3:(MOVDnop x3:(MOVBUloadidx ptr (ADDconst [4] idx) mem)))
+ y4:(MOVDnop x4:(MOVBUloadidx ptr (ADDconst [3] idx) mem)))
+ y5:(MOVDnop x5:(MOVBUloadidx ptr (ADDconst [2] idx) mem)))
+ y6:(MOVDnop x6:(MOVBUloadidx ptr (ADDconst [1] idx) mem)))
+ y7:(MOVDnop x7:(MOVBUloadidx ptr idx mem)))
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && y4.Uses == 1 && y5.Uses == 1 && y6.Uses == 1 && y7.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1
+ && o4.Uses == 1 && o5.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil
+ && clobber(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7, o0, o1, o2, o3, o4, o5, s0)
+ => @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (MOVDloadidx <t> ptr idx mem)
+
+// big endian loads
+// b[1] | b[0]<<8 => load 16-bit, reverse (the two adjacent byte loads become one halfword load wrapped in REV16W)
+(ORshiftLL <t> [8]
+ y0:(MOVDnop x0:(MOVBUload [i1] {s} p mem))
+ y1:(MOVDnop x1:(MOVBUload [i0] {s} p mem)))
+ && i1 == i0+1
+ && x0.Uses == 1 && x1.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, y0, y1)
+ => @mergePoint(b,x0,x1) (REV16W <t> (MOVHUload <t> [i0] {s} p mem))
+(ORshiftLL <t> [8]
+ y0:(MOVDnop x0:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem))
+ y1:(MOVDnop x1:(MOVBUloadidx ptr0 idx0 mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x0, x1, y0, y1)
+ => @mergePoint(b,x0,x1) (REV16W <t> (MOVHUloadidx <t> ptr0 idx0 mem))
+(ORshiftLL <t> [8]
+ y0:(MOVDnop x0:(MOVBUloadidx ptr (ADDconst [1] idx) mem))
+ y1:(MOVDnop x1:(MOVBUloadidx ptr idx mem)))
+ && x0.Uses == 1 && x1.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, y0, y1)
+ => @mergePoint(b,x0,x1) (REV16W <t> (MOVHUloadidx <t> ptr idx mem))
+
+// b[3] | b[2]<<8 | b[1]<<16 | b[0]<<24 => load 32-bit, reverse (the low 16 bits are matched as an already-combined REV16W halfword load; the result is REVW of a word load)
+(ORshiftLL <t> [24] o0:(ORshiftLL [16]
+ y0:(REV16W x0:(MOVHUload [i2] {s} p mem))
+ y1:(MOVDnop x1:(MOVBUload [i1] {s} p mem)))
+ y2:(MOVDnop x2:(MOVBUload [i0] {s} p mem)))
+ && i1 == i0+1
+ && i2 == i0+2
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1
+ && o0.Uses == 1
+ && mergePoint(b,x0,x1,x2) != nil
+ && clobber(x0, x1, x2, y0, y1, y2, o0)
+ => @mergePoint(b,x0,x1,x2) (REVW <t> (MOVWUload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem))
+(ORshiftLL <t> [24] o0:(ORshiftLL [16]
+ y0:(REV16W x0:(MOVHUload [2] {s} p mem))
+ y1:(MOVDnop x1:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem)))
+ y2:(MOVDnop x2:(MOVBUloadidx ptr0 idx0 mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1
+ && o0.Uses == 1
+ && mergePoint(b,x0,x1,x2) != nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2, y0, y1, y2, o0)
+ => @mergePoint(b,x0,x1,x2) (REVW <t> (MOVWUloadidx <t> ptr0 idx0 mem))
+(ORshiftLL <t> [24] o0:(ORshiftLL [16]
+ y0:(REV16W x0:(MOVHUloadidx ptr (ADDconst [2] idx) mem))
+ y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [1] idx) mem)))
+ y2:(MOVDnop x2:(MOVBUloadidx ptr idx mem)))
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1
+ && o0.Uses == 1
+ && mergePoint(b,x0,x1,x2) != nil
+ && clobber(x0, x1, x2, y0, y1, y2, o0)
+ => @mergePoint(b,x0,x1,x2) (REVW <t> (MOVWUloadidx <t> ptr idx mem))
+
+// b[7] | b[6]<<8 | b[5]<<16 | b[4]<<24 | b[3]<<32 | b[2]<<40 | b[1]<<48 | b[0]<<56 => load 64-bit, reverse (the low 32 bits are matched as an already-combined REVW word load; the result is REV of a doubleword load)
+(ORshiftLL <t> [56] o0:(ORshiftLL [48] o1:(ORshiftLL [40] o2:(ORshiftLL [32]
+ y0:(REVW x0:(MOVWUload [i4] {s} p mem))
+ y1:(MOVDnop x1:(MOVBUload [i3] {s} p mem)))
+ y2:(MOVDnop x2:(MOVBUload [i2] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [i1] {s} p mem)))
+ y4:(MOVDnop x4:(MOVBUload [i0] {s} p mem)))
+ && i1 == i0+1
+ && i2 == i0+2
+ && i3 == i0+3
+ && i4 == i0+4
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && y4.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4) != nil
+ && clobber(x0, x1, x2, x3, x4, y0, y1, y2, y3, y4, o0, o1, o2)
+ => @mergePoint(b,x0,x1,x2,x3,x4) (REV <t> (MOVDload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem))
+(ORshiftLL <t> [56] o0:(ORshiftLL [48] o1:(ORshiftLL [40] o2:(ORshiftLL [32]
+ y0:(REVW x0:(MOVWUload [4] {s} p mem))
+ y1:(MOVDnop x1:(MOVBUload [3] {s} p mem)))
+ y2:(MOVDnop x2:(MOVBUload [2] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem)))
+ y4:(MOVDnop x4:(MOVBUloadidx ptr0 idx0 mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && y4.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4) != nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2, x3, x4, y0, y1, y2, y3, y4, o0, o1, o2)
+ => @mergePoint(b,x0,x1,x2,x3,x4) (REV <t> (MOVDloadidx <t> ptr0 idx0 mem))
+(ORshiftLL <t> [56] o0:(ORshiftLL [48] o1:(ORshiftLL [40] o2:(ORshiftLL [32]
+ y0:(REVW x0:(MOVWUloadidx ptr (ADDconst [4] idx) mem))
+ y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [3] idx) mem)))
+ y2:(MOVDnop x2:(MOVBUloadidx ptr (ADDconst [2] idx) mem)))
+ y3:(MOVDnop x3:(MOVBUloadidx ptr (ADDconst [1] idx) mem)))
+ y4:(MOVDnop x4:(MOVBUloadidx ptr idx mem)))
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && y4.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4) != nil
+ && clobber(x0, x1, x2, x3, x4, y0, y1, y2, y3, y4, o0, o1, o2)
+ => @mergePoint(b,x0,x1,x2,x3,x4) (REV <t> (MOVDloadidx <t> ptr idx mem))
+
+// b[0]<<24 | b[1]<<16 | b[2]<<8 | b[3] => load 32-bit, reverse
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] s0:(SLLconst [24]
+ y0:(MOVDnop x0:(MOVBUload [i0] {s} p mem)))
+ y1:(MOVDnop x1:(MOVBUload [i1] {s} p mem)))
+ y2:(MOVDnop x2:(MOVBUload [i2] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [i3] {s} p mem)))
+ && i1 == i0+1
+ && i2 == i0+2
+ && i3 == i0+3
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3) != nil
+ && clobber(x0, x1, x2, x3, y0, y1, y2, y3, o0, o1, s0)
+ => @mergePoint(b,x0,x1,x2,x3) (REVW <t> (MOVWUload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem))
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] s0:(SLLconst [24]
+ y0:(MOVDnop x0:(MOVBUloadidx ptr0 idx0 mem)))
+ y1:(MOVDnop x1:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem)))
+ y2:(MOVDnop x2:(MOVBUload [2] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [3] {s} p mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3) != nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2, x3, y0, y1, y2, y3, o0, o1, s0)
+ => @mergePoint(b,x0,x1,x2,x3) (REVW <t> (MOVWUloadidx <t> ptr0 idx0 mem))
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] s0:(SLLconst [24]
+ y0:(MOVDnop x0:(MOVBUloadidx ptr idx mem)))
+ y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [1] idx) mem)))
+ y2:(MOVDnop x2:(MOVBUloadidx ptr (ADDconst [2] idx) mem)))
+ y3:(MOVDnop x3:(MOVBUloadidx ptr (ADDconst [3] idx) mem)))
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3) != nil
+ && clobber(x0, x1, x2, x3, y0, y1, y2, y3, o0, o1, s0)
+ => @mergePoint(b,x0,x1,x2,x3) (REVW <t> (MOVWUloadidx <t> ptr idx mem))
+
+// b[0]<<56 | b[1]<<48 | b[2]<<40 | b[3]<<32 | b[4]<<24 | b[5]<<16 | b[6]<<8 | b[7] => load 64-bit, reverse
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] o2:(ORshiftLL [24] o3:(ORshiftLL [32] o4:(ORshiftLL [40] o5:(ORshiftLL [48] s0:(SLLconst [56]
+ y0:(MOVDnop x0:(MOVBUload [i0] {s} p mem)))
+ y1:(MOVDnop x1:(MOVBUload [i1] {s} p mem)))
+ y2:(MOVDnop x2:(MOVBUload [i2] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [i3] {s} p mem)))
+ y4:(MOVDnop x4:(MOVBUload [i4] {s} p mem)))
+ y5:(MOVDnop x5:(MOVBUload [i5] {s} p mem)))
+ y6:(MOVDnop x6:(MOVBUload [i6] {s} p mem)))
+ y7:(MOVDnop x7:(MOVBUload [i7] {s} p mem)))
+ && i1 == i0+1
+ && i2 == i0+2
+ && i3 == i0+3
+ && i4 == i0+4
+ && i5 == i0+5
+ && i6 == i0+6
+ && i7 == i0+7
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && y4.Uses == 1 && y5.Uses == 1 && y6.Uses == 1 && y7.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1
+ && o4.Uses == 1 && o5.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil
+ && clobber(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7, o0, o1, o2, o3, o4, o5, s0)
+ => @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (REV <t> (MOVDload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem))
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] o2:(ORshiftLL [24] o3:(ORshiftLL [32] o4:(ORshiftLL [40] o5:(ORshiftLL [48] s0:(SLLconst [56]
+ y0:(MOVDnop x0:(MOVBUloadidx ptr0 idx0 mem)))
+ y1:(MOVDnop x1:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem)))
+ y2:(MOVDnop x2:(MOVBUload [2] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [3] {s} p mem)))
+ y4:(MOVDnop x4:(MOVBUload [4] {s} p mem)))
+ y5:(MOVDnop x5:(MOVBUload [5] {s} p mem)))
+ y6:(MOVDnop x6:(MOVBUload [6] {s} p mem)))
+ y7:(MOVDnop x7:(MOVBUload [7] {s} p mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && y4.Uses == 1 && y5.Uses == 1 && y6.Uses == 1 && y7.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1
+ && o4.Uses == 1 && o5.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7, o0, o1, o2, o3, o4, o5, s0)
+ => @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (REV <t> (MOVDloadidx <t> ptr0 idx0 mem))
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] o2:(ORshiftLL [24] o3:(ORshiftLL [32] o4:(ORshiftLL [40] o5:(ORshiftLL [48] s0:(SLLconst [56]
+ y0:(MOVDnop x0:(MOVBUloadidx ptr idx mem)))
+ y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [1] idx) mem)))
+ y2:(MOVDnop x2:(MOVBUloadidx ptr (ADDconst [2] idx) mem)))
+ y3:(MOVDnop x3:(MOVBUloadidx ptr (ADDconst [3] idx) mem)))
+ y4:(MOVDnop x4:(MOVBUloadidx ptr (ADDconst [4] idx) mem)))
+ y5:(MOVDnop x5:(MOVBUloadidx ptr (ADDconst [5] idx) mem)))
+ y6:(MOVDnop x6:(MOVBUloadidx ptr (ADDconst [6] idx) mem)))
+ y7:(MOVDnop x7:(MOVBUloadidx ptr (ADDconst [7] idx) mem)))
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && y4.Uses == 1 && y5.Uses == 1 && y6.Uses == 1 && y7.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1
+ && o4.Uses == 1 && o5.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil
+ && clobber(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7, o0, o1, o2, o3, o4, o5, s0)
+ => @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (REV <t> (MOVDloadidx <t> ptr idx mem))
+
+// Combine zero stores into larger (unaligned) stores.
+(MOVBstorezero [i] {s} ptr0 x:(MOVBstorezero [j] {s} ptr1 mem))
+ && x.Uses == 1
+ && areAdjacentOffsets(int64(i),int64(j),1)
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVHstorezero [int32(min(int64(i),int64(j)))] {s} ptr0 mem)
+(MOVBstorezero [1] {s} (ADD ptr0 idx0) x:(MOVBstorezeroidx ptr1 idx1 mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVHstorezeroidx ptr1 idx1 mem)
+(MOVBstorezeroidx ptr (ADDconst [1] idx) x:(MOVBstorezeroidx ptr idx mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstorezeroidx ptr idx mem)
+(MOVHstorezero [i] {s} ptr0 x:(MOVHstorezero [j] {s} ptr1 mem))
+ && x.Uses == 1
+ && areAdjacentOffsets(int64(i),int64(j),2)
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVWstorezero [int32(min(int64(i),int64(j)))] {s} ptr0 mem)
+(MOVHstorezero [2] {s} (ADD ptr0 idx0) x:(MOVHstorezeroidx ptr1 idx1 mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVWstorezeroidx ptr1 idx1 mem)
+(MOVHstorezeroidx ptr (ADDconst [2] idx) x:(MOVHstorezeroidx ptr idx mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstorezeroidx ptr idx mem)
+(MOVHstorezero [2] {s} (ADDshiftLL [1] ptr0 idx0) x:(MOVHstorezeroidx2 ptr1 idx1 mem))
+ && x.Uses == 1
+ && s == nil
+ && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1)
+ && clobber(x)
+ => (MOVWstorezeroidx ptr1 (SLLconst <idx1.Type> [1] idx1) mem)
+(MOVWstorezero [i] {s} ptr0 x:(MOVWstorezero [j] {s} ptr1 mem))
+ && x.Uses == 1
+ && areAdjacentOffsets(int64(i),int64(j),4)
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVDstorezero [int32(min(int64(i),int64(j)))] {s} ptr0 mem)
+(MOVWstorezero [4] {s} (ADD ptr0 idx0) x:(MOVWstorezeroidx ptr1 idx1 mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVDstorezeroidx ptr1 idx1 mem)
+(MOVWstorezeroidx ptr (ADDconst [4] idx) x:(MOVWstorezeroidx ptr idx mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVDstorezeroidx ptr idx mem)
+(MOVWstorezero [4] {s} (ADDshiftLL [2] ptr0 idx0) x:(MOVWstorezeroidx4 ptr1 idx1 mem))
+ && x.Uses == 1
+ && s == nil
+ && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1)
+ && clobber(x)
+ => (MOVDstorezeroidx ptr1 (SLLconst <idx1.Type> [2] idx1) mem)
+(MOVDstorezero [i] {s} ptr0 x:(MOVDstorezero [j] {s} ptr1 mem))
+ && x.Uses == 1
+ && areAdjacentOffsets(int64(i),int64(j),8)
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVQstorezero [int32(min(int64(i),int64(j)))] {s} ptr0 mem)
+(MOVDstorezero [8] {s} p0:(ADD ptr0 idx0) x:(MOVDstorezeroidx ptr1 idx1 mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVQstorezero [0] {s} p0 mem)
+(MOVDstorezero [8] {s} p0:(ADDshiftLL [3] ptr0 idx0) x:(MOVDstorezeroidx8 ptr1 idx1 mem))
+ && x.Uses == 1
+ && s == nil
+ && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1)
+ && clobber(x)
+ => (MOVQstorezero [0] {s} p0 mem)
+
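+// Combine stores into larger (unaligned) stores. When the bytes are stored most-significant-first, the merged store takes the value through REV, REVW or REV16W.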
+(MOVBstore [i] {s} ptr0 (SRLconst [8] w) x:(MOVBstore [i-1] {s} ptr1 w mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVHstore [i-1] {s} ptr0 w mem)
+(MOVBstore [1] {s} (ADD ptr0 idx0) (SRLconst [8] w) x:(MOVBstoreidx ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVHstoreidx ptr1 idx1 w mem)
+(MOVBstoreidx ptr (ADDconst [1] idx) (SRLconst [8] w) x:(MOVBstoreidx ptr idx w mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstoreidx ptr idx w mem)
+(MOVBstore [i] {s} ptr0 (UBFX [armBFAuxInt(8, 8)] w) x:(MOVBstore [i-1] {s} ptr1 w mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVHstore [i-1] {s} ptr0 w mem)
+(MOVBstore [1] {s} (ADD ptr0 idx0) (UBFX [armBFAuxInt(8, 8)] w) x:(MOVBstoreidx ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVHstoreidx ptr1 idx1 w mem)
+(MOVBstore [i] {s} ptr0 (UBFX [armBFAuxInt(8, 24)] w) x:(MOVBstore [i-1] {s} ptr1 w mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVHstore [i-1] {s} ptr0 w mem)
+(MOVBstore [1] {s} (ADD ptr0 idx0) (UBFX [armBFAuxInt(8, 24)] w) x:(MOVBstoreidx ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVHstoreidx ptr1 idx1 w mem)
+(MOVBstore [i] {s} ptr0 (SRLconst [8] (MOVDreg w)) x:(MOVBstore [i-1] {s} ptr1 w mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVHstore [i-1] {s} ptr0 w mem)
+(MOVBstore [1] {s} (ADD ptr0 idx0) (SRLconst [8] (MOVDreg w)) x:(MOVBstoreidx ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVHstoreidx ptr1 idx1 w mem)
+(MOVBstore [i] {s} ptr0 (SRLconst [j] w) x:(MOVBstore [i-1] {s} ptr1 w0:(SRLconst [j-8] w) mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVHstore [i-1] {s} ptr0 w0 mem)
+(MOVBstore [1] {s} (ADD ptr0 idx0) (SRLconst [j] w) x:(MOVBstoreidx ptr1 idx1 w0:(SRLconst [j-8] w) mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVHstoreidx ptr1 idx1 w0 mem)
+(MOVBstore [i] {s} ptr0 (UBFX [bfc] w) x:(MOVBstore [i-1] {s} ptr1 w0:(UBFX [bfc2] w) mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && bfc.getARM64BFwidth() == 32 - bfc.getARM64BFlsb()
+ && bfc2.getARM64BFwidth() == 32 - bfc2.getARM64BFlsb()
+ && bfc2.getARM64BFlsb() == bfc.getARM64BFlsb() - 8
+ && clobber(x)
+ => (MOVHstore [i-1] {s} ptr0 w0 mem)
+(MOVBstore [1] {s} (ADD ptr0 idx0) (UBFX [bfc] w) x:(MOVBstoreidx ptr1 idx1 w0:(UBFX [bfc2] w) mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && bfc.getARM64BFwidth() == 32 - bfc.getARM64BFlsb()
+ && bfc2.getARM64BFwidth() == 32 - bfc2.getARM64BFlsb()
+ && bfc2.getARM64BFlsb() == bfc.getARM64BFlsb() - 8
+ && clobber(x)
+ => (MOVHstoreidx ptr1 idx1 w0 mem)
+(MOVBstore [i] {s} ptr0 (SRLconst [j] (MOVDreg w)) x:(MOVBstore [i-1] {s} ptr1 w0:(SRLconst [j-8] (MOVDreg w)) mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVHstore [i-1] {s} ptr0 w0 mem)
+(MOVBstore [1] {s} (ADD ptr0 idx0) (SRLconst [j] (MOVDreg w)) x:(MOVBstoreidx ptr1 idx1 w0:(SRLconst [j-8] (MOVDreg w)) mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVHstoreidx ptr1 idx1 w0 mem)
+(MOVHstore [i] {s} ptr0 (SRLconst [16] w) x:(MOVHstore [i-2] {s} ptr1 w mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVWstore [i-2] {s} ptr0 w mem)
+(MOVHstore [2] {s} (ADD ptr0 idx0) (SRLconst [16] w) x:(MOVHstoreidx ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVWstoreidx ptr1 idx1 w mem)
+(MOVHstoreidx ptr (ADDconst [2] idx) (SRLconst [16] w) x:(MOVHstoreidx ptr idx w mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstoreidx ptr idx w mem)
+(MOVHstore [2] {s} (ADDshiftLL [1] ptr0 idx0) (SRLconst [16] w) x:(MOVHstoreidx2 ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1)
+ && clobber(x)
+ => (MOVWstoreidx ptr1 (SLLconst <idx1.Type> [1] idx1) w mem)
+(MOVHstore [i] {s} ptr0 (UBFX [armBFAuxInt(16, 16)] w) x:(MOVHstore [i-2] {s} ptr1 w mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVWstore [i-2] {s} ptr0 w mem)
+(MOVHstore [2] {s} (ADD ptr0 idx0) (UBFX [armBFAuxInt(16, 16)] w) x:(MOVHstoreidx ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVWstoreidx ptr1 idx1 w mem)
+(MOVHstore [2] {s} (ADDshiftLL [1] ptr0 idx0) (UBFX [armBFAuxInt(16, 16)] w) x:(MOVHstoreidx2 ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1)
+ && clobber(x)
+ => (MOVWstoreidx ptr1 (SLLconst <idx1.Type> [1] idx1) w mem)
+(MOVHstore [i] {s} ptr0 (SRLconst [16] (MOVDreg w)) x:(MOVHstore [i-2] {s} ptr1 w mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVWstore [i-2] {s} ptr0 w mem)
+(MOVHstore [2] {s} (ADD ptr0 idx0) (SRLconst [16] (MOVDreg w)) x:(MOVHstoreidx ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVWstoreidx ptr1 idx1 w mem)
+(MOVHstore [2] {s} (ADDshiftLL [1] ptr0 idx0) (SRLconst [16] (MOVDreg w)) x:(MOVHstoreidx2 ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1)
+ && clobber(x)
+ => (MOVWstoreidx ptr1 (SLLconst <idx1.Type> [1] idx1) w mem)
+(MOVHstore [i] {s} ptr0 (SRLconst [j] w) x:(MOVHstore [i-2] {s} ptr1 w0:(SRLconst [j-16] w) mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVWstore [i-2] {s} ptr0 w0 mem)
+(MOVHstore [2] {s} (ADD ptr0 idx0) (SRLconst [j] w) x:(MOVHstoreidx ptr1 idx1 w0:(SRLconst [j-16] w) mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVWstoreidx ptr1 idx1 w0 mem)
+(MOVHstore [2] {s} (ADDshiftLL [1] ptr0 idx0) (SRLconst [j] w) x:(MOVHstoreidx2 ptr1 idx1 w0:(SRLconst [j-16] w) mem))
+ && x.Uses == 1
+ && s == nil
+ && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1)
+ && clobber(x)
+ => (MOVWstoreidx ptr1 (SLLconst <idx1.Type> [1] idx1) w0 mem)
+(MOVWstore [i] {s} ptr0 (SRLconst [32] w) x:(MOVWstore [i-4] {s} ptr1 w mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVDstore [i-4] {s} ptr0 w mem)
+(MOVWstore [4] {s} (ADD ptr0 idx0) (SRLconst [32] w) x:(MOVWstoreidx ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVDstoreidx ptr1 idx1 w mem)
+(MOVWstoreidx ptr (ADDconst [4] idx) (SRLconst [32] w) x:(MOVWstoreidx ptr idx w mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVDstoreidx ptr idx w mem)
+(MOVWstore [4] {s} (ADDshiftLL [2] ptr0 idx0) (SRLconst [32] w) x:(MOVWstoreidx4 ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1)
+ && clobber(x)
+ => (MOVDstoreidx ptr1 (SLLconst <idx1.Type> [2] idx1) w mem)
+(MOVWstore [i] {s} ptr0 (SRLconst [j] w) x:(MOVWstore [i-4] {s} ptr1 w0:(SRLconst [j-32] w) mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVDstore [i-4] {s} ptr0 w0 mem)
+(MOVWstore [4] {s} (ADD ptr0 idx0) (SRLconst [j] w) x:(MOVWstoreidx ptr1 idx1 w0:(SRLconst [j-32] w) mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVDstoreidx ptr1 idx1 w0 mem)
+(MOVWstore [4] {s} (ADDshiftLL [2] ptr0 idx0) (SRLconst [j] w) x:(MOVWstoreidx4 ptr1 idx1 w0:(SRLconst [j-32] w) mem))
+ && x.Uses == 1
+ && s == nil
+ && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1)
+ && clobber(x)
+ => (MOVDstoreidx ptr1 (SLLconst <idx1.Type> [2] idx1) w0 mem)
+(MOVBstore [i] {s} ptr w
+ x0:(MOVBstore [i-1] {s} ptr (SRLconst [8] w)
+ x1:(MOVBstore [i-2] {s} ptr (SRLconst [16] w)
+ x2:(MOVBstore [i-3] {s} ptr (SRLconst [24] w)
+ x3:(MOVBstore [i-4] {s} ptr (SRLconst [32] w)
+ x4:(MOVBstore [i-5] {s} ptr (SRLconst [40] w)
+ x5:(MOVBstore [i-6] {s} ptr (SRLconst [48] w)
+ x6:(MOVBstore [i-7] {s} ptr (SRLconst [56] w) mem))))))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && x3.Uses == 1
+ && x4.Uses == 1
+ && x5.Uses == 1
+ && x6.Uses == 1
+ && clobber(x0, x1, x2, x3, x4, x5, x6)
+ => (MOVDstore [i-7] {s} ptr (REV <typ.UInt64> w) mem)
+(MOVBstore [7] {s} p w
+ x0:(MOVBstore [6] {s} p (SRLconst [8] w)
+ x1:(MOVBstore [5] {s} p (SRLconst [16] w)
+ x2:(MOVBstore [4] {s} p (SRLconst [24] w)
+ x3:(MOVBstore [3] {s} p (SRLconst [32] w)
+ x4:(MOVBstore [2] {s} p (SRLconst [40] w)
+ x5:(MOVBstore [1] {s} p1:(ADD ptr1 idx1) (SRLconst [48] w)
+ x6:(MOVBstoreidx ptr0 idx0 (SRLconst [56] w) mem))))))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && x3.Uses == 1
+ && x4.Uses == 1
+ && x5.Uses == 1
+ && x6.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2, x3, x4, x5, x6)
+ => (MOVDstoreidx ptr0 idx0 (REV <typ.UInt64> w) mem)
+(MOVBstore [i] {s} ptr w
+ x0:(MOVBstore [i-1] {s} ptr (UBFX [armBFAuxInt(8, 24)] w)
+ x1:(MOVBstore [i-2] {s} ptr (UBFX [armBFAuxInt(16, 16)] w)
+ x2:(MOVBstore [i-3] {s} ptr (UBFX [armBFAuxInt(24, 8)] w) mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && clobber(x0, x1, x2)
+ => (MOVWstore [i-3] {s} ptr (REVW <typ.UInt32> w) mem)
+(MOVBstore [3] {s} p w
+ x0:(MOVBstore [2] {s} p (UBFX [armBFAuxInt(8, 24)] w)
+ x1:(MOVBstore [1] {s} p1:(ADD ptr1 idx1) (UBFX [armBFAuxInt(16, 16)] w)
+ x2:(MOVBstoreidx ptr0 idx0 (UBFX [armBFAuxInt(24, 8)] w) mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2)
+ => (MOVWstoreidx ptr0 idx0 (REVW <typ.UInt32> w) mem)
+(MOVBstoreidx ptr (ADDconst [3] idx) w
+ x0:(MOVBstoreidx ptr (ADDconst [2] idx) (UBFX [armBFAuxInt(8, 24)] w)
+ x1:(MOVBstoreidx ptr (ADDconst [1] idx) (UBFX [armBFAuxInt(16, 16)] w)
+ x2:(MOVBstoreidx ptr idx (UBFX [armBFAuxInt(24, 8)] w) mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && clobber(x0, x1, x2)
+ => (MOVWstoreidx ptr idx (REVW <typ.UInt32> w) mem)
+(MOVBstoreidx ptr idx w
+ x0:(MOVBstoreidx ptr (ADDconst [1] idx) (UBFX [armBFAuxInt(8, 24)] w)
+ x1:(MOVBstoreidx ptr (ADDconst [2] idx) (UBFX [armBFAuxInt(16, 16)] w)
+ x2:(MOVBstoreidx ptr (ADDconst [3] idx) (UBFX [armBFAuxInt(24, 8)] w) mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && clobber(x0, x1, x2)
+ => (MOVWstoreidx ptr idx w mem)
+(MOVBstore [i] {s} ptr w
+ x0:(MOVBstore [i-1] {s} ptr (SRLconst [8] (MOVDreg w))
+ x1:(MOVBstore [i-2] {s} ptr (SRLconst [16] (MOVDreg w))
+ x2:(MOVBstore [i-3] {s} ptr (SRLconst [24] (MOVDreg w)) mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && clobber(x0, x1, x2)
+ => (MOVWstore [i-3] {s} ptr (REVW <typ.UInt32> w) mem)
+(MOVBstore [3] {s} p w
+ x0:(MOVBstore [2] {s} p (SRLconst [8] (MOVDreg w))
+ x1:(MOVBstore [1] {s} p1:(ADD ptr1 idx1) (SRLconst [16] (MOVDreg w))
+ x2:(MOVBstoreidx ptr0 idx0 (SRLconst [24] (MOVDreg w)) mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2)
+ => (MOVWstoreidx ptr0 idx0 (REVW <typ.UInt32> w) mem)
+(MOVBstore [i] {s} ptr w
+ x0:(MOVBstore [i-1] {s} ptr (SRLconst [8] w)
+ x1:(MOVBstore [i-2] {s} ptr (SRLconst [16] w)
+ x2:(MOVBstore [i-3] {s} ptr (SRLconst [24] w) mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && clobber(x0, x1, x2)
+ => (MOVWstore [i-3] {s} ptr (REVW <typ.UInt32> w) mem)
+(MOVBstore [3] {s} p w
+ x0:(MOVBstore [2] {s} p (SRLconst [8] w)
+ x1:(MOVBstore [1] {s} p1:(ADD ptr1 idx1) (SRLconst [16] w)
+ x2:(MOVBstoreidx ptr0 idx0 (SRLconst [24] w) mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2)
+ => (MOVWstoreidx ptr0 idx0 (REVW <typ.UInt32> w) mem)
+(MOVBstore [i] {s} ptr w x:(MOVBstore [i-1] {s} ptr (SRLconst [8] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstore [i-1] {s} ptr (REV16W <typ.UInt16> w) mem)
+(MOVBstore [1] {s} (ADD ptr1 idx1) w x:(MOVBstoreidx ptr0 idx0 (SRLconst [8] w) mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVHstoreidx ptr0 idx0 (REV16W <typ.UInt16> w) mem)
+(MOVBstore [i] {s} ptr w x:(MOVBstore [i-1] {s} ptr (UBFX [armBFAuxInt(8, 8)] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstore [i-1] {s} ptr (REV16W <typ.UInt16> w) mem)
+(MOVBstore [1] {s} (ADD ptr1 idx1) w x:(MOVBstoreidx ptr0 idx0 (UBFX [armBFAuxInt(8, 8)] w) mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVHstoreidx ptr0 idx0 (REV16W <typ.UInt16> w) mem)
+(MOVBstoreidx ptr (ADDconst [1] idx) w x:(MOVBstoreidx ptr idx (UBFX [armBFAuxInt(8, 8)] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstoreidx ptr idx (REV16W <typ.UInt16> w) mem)
+(MOVBstoreidx ptr idx w x:(MOVBstoreidx ptr (ADDconst [1] idx) (UBFX [armBFAuxInt(8, 8)] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstoreidx ptr idx w mem)
+(MOVBstore [i] {s} ptr w x:(MOVBstore [i-1] {s} ptr (SRLconst [8] (MOVDreg w)) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstore [i-1] {s} ptr (REV16W <typ.UInt16> w) mem)
+(MOVBstore [1] {s} (ADD ptr1 idx1) w x:(MOVBstoreidx ptr0 idx0 (SRLconst [8] (MOVDreg w)) mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVHstoreidx ptr0 idx0 (REV16W <typ.UInt16> w) mem)
+(MOVBstore [i] {s} ptr w x:(MOVBstore [i-1] {s} ptr (UBFX [armBFAuxInt(8, 24)] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstore [i-1] {s} ptr (REV16W <typ.UInt16> w) mem)
+(MOVBstore [1] {s} (ADD ptr1 idx1) w x:(MOVBstoreidx ptr0 idx0 (UBFX [armBFAuxInt(8, 24)] w) mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVHstoreidx ptr0 idx0 (REV16W <typ.UInt16> w) mem)
+
+// FP simplification: fold a negation into the multiply (FMUL* becomes FNMUL*, FNMUL* becomes FMUL*).
+(FNEGS (FMULS x y)) => (FNMULS x y)
+(FNEGD (FMULD x y)) => (FNMULD x y)
+(FMULS (FNEGS x) y) => (FNMULS x y)
+(FMULD (FNEGD x) y) => (FNMULD x y)
+(FNEGS (FNMULS x y)) => (FMULS x y)
+(FNEGD (FNMULD x y)) => (FMULD x y)
+(FNMULS (FNEGS x) y) => (FMULS x y)
+(FNMULD (FNEGD x) y) => (FMULD x y)
+
+(FADDS a (FMULS x y)) && a.Block.Func.useFMA(v) => (FMADDS a x y)
+(FADDD a (FMULD x y)) && a.Block.Func.useFMA(v) => (FMADDD a x y)
+(FSUBS a (FMULS x y)) && a.Block.Func.useFMA(v) => (FMSUBS a x y)
+(FSUBD a (FMULD x y)) && a.Block.Func.useFMA(v) => (FMSUBD a x y)
+(FSUBS (FMULS x y) a) && a.Block.Func.useFMA(v) => (FNMSUBS a x y)
+(FSUBD (FMULD x y) a) && a.Block.Func.useFMA(v) => (FNMSUBD a x y)
+(FADDS a (FNMULS x y)) && a.Block.Func.useFMA(v) => (FMSUBS a x y)
+(FADDD a (FNMULD x y)) && a.Block.Func.useFMA(v) => (FMSUBD a x y)
+(FSUBS a (FNMULS x y)) && a.Block.Func.useFMA(v) => (FMADDS a x y)
+(FSUBD a (FNMULD x y)) && a.Block.Func.useFMA(v) => (FMADDD a x y)
+(FSUBS (FNMULS x y) a) && a.Block.Func.useFMA(v) => (FNMADDS a x y)
+(FSUBD (FNMULD x y) a) && a.Block.Func.useFMA(v) => (FNMADDD a x y)
+
+(MOVBUload [off] {sym} (SB) _) && symIsRO(sym) => (MOVDconst [int64(read8(sym, int64(off)))])
+(MOVHUload [off] {sym} (SB) _) && symIsRO(sym) => (MOVDconst [int64(read16(sym, int64(off), config.ctxt.Arch.ByteOrder))])
+(MOVWUload [off] {sym} (SB) _) && symIsRO(sym) => (MOVDconst [int64(read32(sym, int64(off), config.ctxt.Arch.ByteOrder))])
+(MOVDload [off] {sym} (SB) _) && symIsRO(sym) => (MOVDconst [int64(read64(sym, int64(off), config.ctxt.Arch.ByteOrder))])
+
+// Prefetch instructions (the auxInt selects the prefetch option: 0 = PLDL1KEEP, 1 = PLDL1STRM)
+(PrefetchCache addr mem) => (PRFM [0] addr mem)
+(PrefetchCacheStreamed addr mem) => (PRFM [1] addr mem)
+
+// Arch-specific inlining for small or disjoint runtime.memmove
+(SelectN [0] call:(CALLstatic {sym} s1:(MOVDstore _ (MOVDconst [sz]) s2:(MOVDstore _ src s3:(MOVDstore {t} _ dst mem)))))
+ && sz >= 0
+ && isSameCall(sym, "runtime.memmove")
+ && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1
+ && isInlinableMemmove(dst, src, sz, config)
+ && clobber(s1, s2, s3, call)
+ => (Move [sz] dst src mem)
+
+// Match post-lowering calls, register version.
+(SelectN [0] call:(CALLstatic {sym} dst src (MOVDconst [sz]) mem))
+ && sz >= 0
+ && isSameCall(sym, "runtime.memmove")
+ && call.Uses == 1
+ && isInlinableMemmove(dst, src, sz, config)
+ && clobber(call)
+ => (Move [sz] dst src mem)
+
+((REV|REVW) ((REV|REVW) p)) => p
+
+// runtime/internal/math.MulUintptr intrinsics
+
+(Select0 (Mul64uover x y)) => (MUL x y)
+(Select1 (Mul64uover x y)) => (NotEqual (CMPconst (UMULH <typ.UInt64> x y) [0]))
diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
new file mode 100644
index 0000000..f7cc47b
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
@@ -0,0 +1,794 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import "strings"
+
+// Notes:
+//  - Integer types live in the low portion of registers. Upper portions are junk.
+//  - Boolean types use the low-order byte of a register. 0=false, 1=true.
+//    Upper bytes are junk.
+//  - *const instructions may use a constant larger than the instruction can encode.
+//    In this case the assembler expands to multiple instructions and uses the tmp
+//    register (R27).
+
+// Suffixes encode the bit width of various instructions.
+// D (double word) = 64 bit
+// W (word) = 32 bit
+// H (half word) = 16 bit
+// HU = 16 bit unsigned
+// B (byte) = 8 bit
+// BU = 8 bit unsigned
+// S (single) = 32 bit float
+// D (double) = 64 bit float
+
+// regNamesARM64 names the registers; a name's index here is its bit in a regMask (see init).
+// Registers not used in regalloc are left out so the list stays within the 64 entries init allows.
+// Be careful when hand coding regmasks.
+var regNamesARM64 = []string{
+ "R0",
+ "R1",
+ "R2",
+ "R3",
+ "R4",
+ "R5",
+ "R6",
+ "R7",
+ "R8",
+ "R9",
+ "R10",
+ "R11",
+ "R12",
+ "R13",
+ "R14",
+ "R15",
+ "R16",
+ "R17",
+ "R18", // platform register, not used (absent from the gp mask in init)
+ "R19",
+ "R20",
+ "R21",
+ "R22",
+ "R23",
+ "R24",
+ "R25",
+ "R26",
+ // R27 = REGTMP, not used in regalloc, so it has no entry here
+ "g", // aka R28
+ "R29", // frame pointer, not used (absent from the gp mask in init)
+ "R30", // aka REGLINK
+ "SP", // aka R31
+
+ "F0",
+ "F1",
+ "F2",
+ "F3",
+ "F4",
+ "F5",
+ "F6",
+ "F7",
+ "F8",
+ "F9",
+ "F10",
+ "F11",
+ "F12",
+ "F13",
+ "F14",
+ "F15",
+ "F16",
+ "F17",
+ "F18",
+ "F19",
+ "F20",
+ "F21",
+ "F22",
+ "F23",
+ "F24",
+ "F25",
+ "F26",
+ "F27",
+ "F28",
+ "F29",
+ "F30",
+ "F31",
+
+ // If you add registers, update asyncPreempt in runtime.
+
+ // pseudo-registers
+ "SB",
+}
+
+func init() {
+ // Make map from reg names to reg integers.
+ if len(regNamesARM64) > 64 {
+ panic("too many registers")
+ }
+ num := map[string]int{}
+ for i, name := range regNamesARM64 {
+ num[name] = i
+ }
+ buildReg := func(s string) regMask {
+ m := regMask(0)
+ for _, r := range strings.Split(s, " ") {
+ if n, ok := num[r]; ok {
+ m |= regMask(1) << uint(n)
+ continue
+ }
+ panic("register " + r + " not found")
+ }
+ return m
+ }
+
+ // Common individual register masks
+ var (
+ gp = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30")
+ gpg = gp | buildReg("g")
+ gpsp = gp | buildReg("SP")
+ gpspg = gpg | buildReg("SP")
+ gpspsbg = gpspg | buildReg("SB")
+ fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31")
+ callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g
+ r0 = buildReg("R0")
+ r1 = buildReg("R1")
+ r2 = buildReg("R2")
+ r3 = buildReg("R3")
+ )
+ // Common regInfo values shared by the op table below.
+ // Each regInfo gives one register mask per register input and per output.
+ // NOTE(review): the names appear to encode <class><#inputs><#outputs>
+ // (gp21 = two gp inputs, one gp output), with "flags" marking a flags
+ // operand or result — confirm against the generator's conventions.
+ // Most gp inputs use gpg and so also admit the g register; entries written
+ // with plain gp (for example gp21nog) do not.
+ var (
+ gp01 = regInfo{inputs: nil, outputs: []regMask{gp}}
+ gp0flags1 = regInfo{inputs: []regMask{0}, outputs: []regMask{gp}} // the single input slot has an empty (0) mask
+ gp11 = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}}
+ gp11sp = regInfo{inputs: []regMask{gpspg}, outputs: []regMask{gp}} // like gp11, but the input may also be SP
+ gp1flags = regInfo{inputs: []regMask{gpg}}
+ gp1flags1 = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}}
+ gp11flags = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp, 0}}
+ gp21 = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp}}
+ gp21nog = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}} // like gp21, but the inputs may not be g
+ gp21flags = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp, 0}}
+ gp2flags = regInfo{inputs: []regMask{gpg, gpg}}
+ gp2flags1 = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}}
+ gp2flags1flags = regInfo{inputs: []regMask{gp, gp, 0}, outputs: []regMask{gp, 0}}
+ gp2load = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}}
+ gp31 = regInfo{inputs: []regMask{gpg, gpg, gpg}, outputs: []regMask{gp}}
+ gpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}
+ gpload2 = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gpg, gpg}} // NOTE(review): outputs use gpg here, unlike every other gp output in this block (gp) — confirm intended
+ gpstore = regInfo{inputs: []regMask{gpspsbg, gpg}}
+ gpstore0 = regInfo{inputs: []regMask{gpspsbg}}
+ gpstore2 = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}}
+ gpxchg = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}}
+ gpcas = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}, outputs: []regMask{gp}}
+ fp01 = regInfo{inputs: nil, outputs: []regMask{fp}}
+ fp11 = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}}
+ fpgp = regInfo{inputs: []regMask{fp}, outputs: []regMask{gp}} // fp input, gp output
+ gpfp = regInfo{inputs: []regMask{gp}, outputs: []regMask{fp}} // gp input, fp output
+ fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
+ fp31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: []regMask{fp}}
+ fp2flags = regInfo{inputs: []regMask{fp, fp}}
+ fp1flags = regInfo{inputs: []regMask{fp}}
+ fpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{fp}}
+ fp2load = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{fp}}
+ fpstore = regInfo{inputs: []regMask{gpspsbg, fp}}
+ fpstore2 = regInfo{inputs: []regMask{gpspsbg, gpg, fp}}
+ readflags = regInfo{inputs: nil, outputs: []regMask{gp}}
+ prefreg = regInfo{inputs: []regMask{gpspsbg}}
+ )
+ ops := []opData{
+ // binary ops
+ {name: "ADCSflags", argLength: 3, reg: gp2flags1flags, typ: "(UInt64,Flags)", asm: "ADCS", commutative: true}, // arg0+arg1+carry, set flags.
+ {name: "ADCzerocarry", argLength: 1, reg: gp0flags1, typ: "UInt64", asm: "ADC"}, // ZR+ZR+carry
+ {name: "ADD", argLength: 2, reg: gp21, asm: "ADD", commutative: true}, // arg0 + arg1
+ {name: "ADDconst", argLength: 1, reg: gp11sp, asm: "ADD", aux: "Int64"}, // arg0 + auxInt
+ {name: "ADDSconstflags", argLength: 1, reg: gp11flags, typ: "(UInt64,Flags)", asm: "ADDS", aux: "Int64"}, // arg0+auxint, set flags.
+ {name: "ADDSflags", argLength: 2, reg: gp21flags, typ: "(UInt64,Flags)", asm: "ADDS", commutative: true}, // arg0+arg1, set flags.
+ {name: "SUB", argLength: 2, reg: gp21, asm: "SUB"}, // arg0 - arg1
+ {name: "SUBconst", argLength: 1, reg: gp11, asm: "SUB", aux: "Int64"}, // arg0 - auxInt
+ {name: "SBCSflags", argLength: 3, reg: gp2flags1flags, typ: "(UInt64,Flags)", asm: "SBCS"}, // arg0-(arg1+borrowing), set flags.
+ {name: "SUBSflags", argLength: 2, reg: gp21flags, typ: "(UInt64,Flags)", asm: "SUBS"}, // arg0 - arg1, set flags.
+ {name: "MUL", argLength: 2, reg: gp21, asm: "MUL", commutative: true}, // arg0 * arg1
+ {name: "MULW", argLength: 2, reg: gp21, asm: "MULW", commutative: true}, // arg0 * arg1, 32-bit
+ {name: "MNEG", argLength: 2, reg: gp21, asm: "MNEG", commutative: true}, // -arg0 * arg1
+ {name: "MNEGW", argLength: 2, reg: gp21, asm: "MNEGW", commutative: true}, // -arg0 * arg1, 32-bit
+ {name: "MULH", argLength: 2, reg: gp21, asm: "SMULH", commutative: true}, // (arg0 * arg1) >> 64, signed
+ {name: "UMULH", argLength: 2, reg: gp21, asm: "UMULH", commutative: true}, // (arg0 * arg1) >> 64, unsigned
+ {name: "MULL", argLength: 2, reg: gp21, asm: "SMULL", commutative: true}, // arg0 * arg1, signed, 32-bit mult results in 64-bit
+ {name: "UMULL", argLength: 2, reg: gp21, asm: "UMULL", commutative: true}, // arg0 * arg1, unsigned, 32-bit mult results in 64-bit
+ {name: "DIV", argLength: 2, reg: gp21, asm: "SDIV"}, // arg0 / arg1, signed
+ {name: "UDIV", argLength: 2, reg: gp21, asm: "UDIV"}, // arg0 / arg1, unsigned
+ {name: "DIVW", argLength: 2, reg: gp21, asm: "SDIVW"}, // arg0 / arg1, signed, 32 bit
+ {name: "UDIVW", argLength: 2, reg: gp21, asm: "UDIVW"}, // arg0 / arg1, unsigned, 32 bit
+ {name: "MOD", argLength: 2, reg: gp21, asm: "REM"}, // arg0 % arg1, signed
+ {name: "UMOD", argLength: 2, reg: gp21, asm: "UREM"}, // arg0 % arg1, unsigned
+ {name: "MODW", argLength: 2, reg: gp21, asm: "REMW"}, // arg0 % arg1, signed, 32 bit
+ {name: "UMODW", argLength: 2, reg: gp21, asm: "UREMW"}, // arg0 % arg1, unsigned, 32 bit
+
+ {name: "FADDS", argLength: 2, reg: fp21, asm: "FADDS", commutative: true}, // arg0 + arg1
+ {name: "FADDD", argLength: 2, reg: fp21, asm: "FADDD", commutative: true}, // arg0 + arg1
+ {name: "FSUBS", argLength: 2, reg: fp21, asm: "FSUBS"}, // arg0 - arg1
+ {name: "FSUBD", argLength: 2, reg: fp21, asm: "FSUBD"}, // arg0 - arg1
+ {name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true}, // arg0 * arg1
+ {name: "FMULD", argLength: 2, reg: fp21, asm: "FMULD", commutative: true}, // arg0 * arg1
+ {name: "FNMULS", argLength: 2, reg: fp21, asm: "FNMULS", commutative: true}, // -(arg0 * arg1)
+ {name: "FNMULD", argLength: 2, reg: fp21, asm: "FNMULD", commutative: true}, // -(arg0 * arg1)
+ {name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS"}, // arg0 / arg1
+ {name: "FDIVD", argLength: 2, reg: fp21, asm: "FDIVD"}, // arg0 / arg1
+
+ {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1
+ {name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int64"}, // arg0 & auxInt
+ {name: "OR", argLength: 2, reg: gp21, asm: "ORR", commutative: true}, // arg0 | arg1
+ {name: "ORconst", argLength: 1, reg: gp11, asm: "ORR", aux: "Int64"}, // arg0 | auxInt
+ {name: "XOR", argLength: 2, reg: gp21, asm: "EOR", commutative: true}, // arg0 ^ arg1
+ {name: "XORconst", argLength: 1, reg: gp11, asm: "EOR", aux: "Int64"}, // arg0 ^ auxInt
+ {name: "BIC", argLength: 2, reg: gp21, asm: "BIC"}, // arg0 &^ arg1
+ {name: "EON", argLength: 2, reg: gp21, asm: "EON"}, // arg0 ^ ^arg1
+ {name: "ORN", argLength: 2, reg: gp21, asm: "ORN"}, // arg0 | ^arg1
+
+ // unary ops
+ {name: "MVN", argLength: 1, reg: gp11, asm: "MVN"}, // ^arg0
+ {name: "NEG", argLength: 1, reg: gp11, asm: "NEG"}, // -arg0
+ {name: "NEGSflags", argLength: 1, reg: gp11flags, typ: "(UInt64,Flags)", asm: "NEGS"}, // -arg0, set flags.
+ {name: "NGCzerocarry", argLength: 1, reg: gp0flags1, typ: "UInt64", asm: "NGC"}, // -1 if borrowing, 0 otherwise.
+ {name: "FABSD", argLength: 1, reg: fp11, asm: "FABSD"}, // abs(arg0), float64
+ {name: "FNEGS", argLength: 1, reg: fp11, asm: "FNEGS"}, // -arg0, float32
+ {name: "FNEGD", argLength: 1, reg: fp11, asm: "FNEGD"}, // -arg0, float64
+ {name: "FSQRTD", argLength: 1, reg: fp11, asm: "FSQRTD"}, // sqrt(arg0), float64
+ {name: "FSQRTS", argLength: 1, reg: fp11, asm: "FSQRTS"}, // sqrt(arg0), float32
+ {name: "REV", argLength: 1, reg: gp11, asm: "REV"}, // byte reverse, 64-bit
+ {name: "REVW", argLength: 1, reg: gp11, asm: "REVW"}, // byte reverse, 32-bit
+ {name: "REV16", argLength: 1, reg: gp11, asm: "REV16"}, // byte reverse in each 16-bit halfword, 64-bit
+ {name: "REV16W", argLength: 1, reg: gp11, asm: "REV16W"}, // byte reverse in each 16-bit halfword, 32-bit
+ {name: "RBIT", argLength: 1, reg: gp11, asm: "RBIT"}, // bit reverse, 64-bit
+ {name: "RBITW", argLength: 1, reg: gp11, asm: "RBITW"}, // bit reverse, 32-bit
+ {name: "CLZ", argLength: 1, reg: gp11, asm: "CLZ"}, // count leading zero, 64-bit
+ {name: "CLZW", argLength: 1, reg: gp11, asm: "CLZW"}, // count leading zero, 32-bit
+ {name: "VCNT", argLength: 1, reg: fp11, asm: "VCNT"}, // count set bits for each 8-bit unit and store the result in each 8-bit unit
+ {name: "VUADDLV", argLength: 1, reg: fp11, asm: "VUADDLV"}, // unsigned sum of eight bytes in a 64-bit value, zero extended to 64-bit.
+ {name: "LoweredRound32F", argLength: 1, reg: fp11, resultInArg0: true, zeroWidth: true},
+ {name: "LoweredRound64F", argLength: 1, reg: fp11, resultInArg0: true, zeroWidth: true},
+
+ // 3-operand, the addend comes first
+ {name: "FMADDS", argLength: 3, reg: fp31, asm: "FMADDS"}, // +arg0 + (arg1 * arg2)
+ {name: "FMADDD", argLength: 3, reg: fp31, asm: "FMADDD"}, // +arg0 + (arg1 * arg2)
+ {name: "FNMADDS", argLength: 3, reg: fp31, asm: "FNMADDS"}, // -arg0 - (arg1 * arg2)
+ {name: "FNMADDD", argLength: 3, reg: fp31, asm: "FNMADDD"}, // -arg0 - (arg1 * arg2)
+ {name: "FMSUBS", argLength: 3, reg: fp31, asm: "FMSUBS"}, // +arg0 - (arg1 * arg2)
+ {name: "FMSUBD", argLength: 3, reg: fp31, asm: "FMSUBD"}, // +arg0 - (arg1 * arg2)
+ {name: "FNMSUBS", argLength: 3, reg: fp31, asm: "FNMSUBS"}, // -arg0 + (arg1 * arg2)
+ {name: "FNMSUBD", argLength: 3, reg: fp31, asm: "FNMSUBD"}, // -arg0 + (arg1 * arg2)
+ {name: "MADD", argLength: 3, reg: gp31, asm: "MADD"}, // +arg0 + (arg1 * arg2)
+ {name: "MADDW", argLength: 3, reg: gp31, asm: "MADDW"}, // +arg0 + (arg1 * arg2), 32-bit
+ {name: "MSUB", argLength: 3, reg: gp31, asm: "MSUB"}, // +arg0 - (arg1 * arg2)
+ {name: "MSUBW", argLength: 3, reg: gp31, asm: "MSUBW"}, // +arg0 - (arg1 * arg2), 32-bit
+
+ // shifts
+ {name: "SLL", argLength: 2, reg: gp21, asm: "LSL"}, // arg0 << arg1, shift amount is mod 64
+ {name: "SLLconst", argLength: 1, reg: gp11, asm: "LSL", aux: "Int64"}, // arg0 << auxInt, auxInt should be in the range 0 to 63.
+ {name: "SRL", argLength: 2, reg: gp21, asm: "LSR"}, // arg0 >> arg1, unsigned, shift amount is mod 64
+ {name: "SRLconst", argLength: 1, reg: gp11, asm: "LSR", aux: "Int64"}, // arg0 >> auxInt, unsigned, auxInt should be in the range 0 to 63.
+ {name: "SRA", argLength: 2, reg: gp21, asm: "ASR"}, // arg0 >> arg1, signed, shift amount is mod 64
+ {name: "SRAconst", argLength: 1, reg: gp11, asm: "ASR", aux: "Int64"}, // arg0 >> auxInt, signed, auxInt should be in the range 0 to 63.
+ {name: "ROR", argLength: 2, reg: gp21, asm: "ROR"}, // arg0 right rotate by (arg1 mod 64) bits
+ {name: "RORW", argLength: 2, reg: gp21, asm: "RORW"}, // arg0 right rotate by (arg1 mod 32) bits
+ {name: "RORconst", argLength: 1, reg: gp11, asm: "ROR", aux: "Int64"}, // arg0 right rotate by auxInt bits, auxInt should be in the range 0 to 63.
+ {name: "RORWconst", argLength: 1, reg: gp11, asm: "RORW", aux: "Int64"}, // uint32(arg0) right rotate by auxInt bits, auxInt should be in the range 0 to 31.
+ {name: "EXTRconst", argLength: 2, reg: gp21, asm: "EXTR", aux: "Int64"}, // extract 64 bits from arg0:arg1 starting at lsb auxInt, auxInt should be in the range 0 to 63.
+ {name: "EXTRWconst", argLength: 2, reg: gp21, asm: "EXTRW", aux: "Int64"}, // extract 32 bits from arg0[31:0]:arg1[31:0] starting at lsb auxInt and zero top 32 bits, auxInt should be in the range 0 to 31.
+
+ // comparisons
+ {name: "CMP", argLength: 2, reg: gp2flags, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPconst", argLength: 1, reg: gp1flags, asm: "CMP", aux: "Int64", typ: "Flags"}, // arg0 compare to auxInt
+ {name: "CMPW", argLength: 2, reg: gp2flags, asm: "CMPW", typ: "Flags"}, // arg0 compare to arg1, 32 bit
+ {name: "CMPWconst", argLength: 1, reg: gp1flags, asm: "CMPW", aux: "Int32", typ: "Flags"}, // arg0 compare to auxInt, 32 bit
+ {name: "CMN", argLength: 2, reg: gp2flags, asm: "CMN", typ: "Flags", commutative: true}, // arg0 compare to -arg1, provided arg1 is not 1<<63
+ {name: "CMNconst", argLength: 1, reg: gp1flags, asm: "CMN", aux: "Int64", typ: "Flags"}, // arg0 compare to -auxInt
+ {name: "CMNW", argLength: 2, reg: gp2flags, asm: "CMNW", typ: "Flags", commutative: true}, // arg0 compare to -arg1, 32 bit, provided arg1 is not 1<<31
+ {name: "CMNWconst", argLength: 1, reg: gp1flags, asm: "CMNW", aux: "Int32", typ: "Flags"}, // arg0 compare to -auxInt, 32 bit
+ {name: "TST", argLength: 2, reg: gp2flags, asm: "TST", typ: "Flags", commutative: true}, // arg0 & arg1 compare to 0
+ {name: "TSTconst", argLength: 1, reg: gp1flags, asm: "TST", aux: "Int64", typ: "Flags"}, // arg0 & auxInt compare to 0
+ {name: "TSTW", argLength: 2, reg: gp2flags, asm: "TSTW", typ: "Flags", commutative: true}, // arg0 & arg1 compare to 0, 32 bit
+ {name: "TSTWconst", argLength: 1, reg: gp1flags, asm: "TSTW", aux: "Int32", typ: "Flags"}, // arg0 & auxInt compare to 0, 32 bit
+ {name: "FCMPS", argLength: 2, reg: fp2flags, asm: "FCMPS", typ: "Flags"}, // arg0 compare to arg1, float32
+ {name: "FCMPD", argLength: 2, reg: fp2flags, asm: "FCMPD", typ: "Flags"}, // arg0 compare to arg1, float64
+ {name: "FCMPS0", argLength: 1, reg: fp1flags, asm: "FCMPS", typ: "Flags"}, // arg0 compare to 0, float32
+ {name: "FCMPD0", argLength: 1, reg: fp1flags, asm: "FCMPD", typ: "Flags"}, // arg0 compare to 0, float64
+
+ // shifted ops
+ {name: "MVNshiftLL", argLength: 1, reg: gp11, asm: "MVN", aux: "Int64"}, // ^(arg0<<auxInt), auxInt should be in the range 0 to 63.
+ {name: "MVNshiftRL", argLength: 1, reg: gp11, asm: "MVN", aux: "Int64"}, // ^(arg0>>auxInt), unsigned shift, auxInt should be in the range 0 to 63.
+ {name: "MVNshiftRA", argLength: 1, reg: gp11, asm: "MVN", aux: "Int64"}, // ^(arg0>>auxInt), signed shift, auxInt should be in the range 0 to 63.
+ {name: "MVNshiftRO", argLength: 1, reg: gp11, asm: "MVN", aux: "Int64"}, // ^(arg0 ROR auxInt), right rotate, auxInt should be in the range 0 to 63.
+ {name: "NEGshiftLL", argLength: 1, reg: gp11, asm: "NEG", aux: "Int64"}, // -(arg0<<auxInt), auxInt should be in the range 0 to 63.
+ {name: "NEGshiftRL", argLength: 1, reg: gp11, asm: "NEG", aux: "Int64"}, // -(arg0>>auxInt), unsigned shift, auxInt should be in the range 0 to 63.
+ {name: "NEGshiftRA", argLength: 1, reg: gp11, asm: "NEG", aux: "Int64"}, // -(arg0>>auxInt), signed shift, auxInt should be in the range 0 to 63.
+ {name: "ADDshiftLL", argLength: 2, reg: gp21, asm: "ADD", aux: "Int64"}, // arg0 + arg1<<auxInt, auxInt should be in the range 0 to 63.
+ {name: "ADDshiftRL", argLength: 2, reg: gp21, asm: "ADD", aux: "Int64"}, // arg0 + arg1>>auxInt, unsigned shift, auxInt should be in the range 0 to 63.
+ {name: "ADDshiftRA", argLength: 2, reg: gp21, asm: "ADD", aux: "Int64"}, // arg0 + arg1>>auxInt, signed shift, auxInt should be in the range 0 to 63.
+ {name: "SUBshiftLL", argLength: 2, reg: gp21, asm: "SUB", aux: "Int64"}, // arg0 - arg1<<auxInt, auxInt should be in the range 0 to 63.
+ {name: "SUBshiftRL", argLength: 2, reg: gp21, asm: "SUB", aux: "Int64"}, // arg0 - arg1>>auxInt, unsigned shift, auxInt should be in the range 0 to 63.
+ {name: "SUBshiftRA", argLength: 2, reg: gp21, asm: "SUB", aux: "Int64"}, // arg0 - arg1>>auxInt, signed shift, auxInt should be in the range 0 to 63.
+ {name: "ANDshiftLL", argLength: 2, reg: gp21, asm: "AND", aux: "Int64"}, // arg0 & (arg1<<auxInt), auxInt should be in the range 0 to 63.
+ {name: "ANDshiftRL", argLength: 2, reg: gp21, asm: "AND", aux: "Int64"}, // arg0 & (arg1>>auxInt), unsigned shift, auxInt should be in the range 0 to 63.
+ {name: "ANDshiftRA", argLength: 2, reg: gp21, asm: "AND", aux: "Int64"}, // arg0 & (arg1>>auxInt), signed shift, auxInt should be in the range 0 to 63.
+ {name: "ANDshiftRO", argLength: 2, reg: gp21, asm: "AND", aux: "Int64"}, // arg0 & (arg1 ROR auxInt), right rotate, auxInt should be in the range 0 to 63.
+ {name: "ORshiftLL", argLength: 2, reg: gp21, asm: "ORR", aux: "Int64"}, // arg0 | arg1<<auxInt, auxInt should be in the range 0 to 63.
+ {name: "ORshiftRL", argLength: 2, reg: gp21, asm: "ORR", aux: "Int64"}, // arg0 | arg1>>auxInt, unsigned shift, auxInt should be in the range 0 to 63.
+ {name: "ORshiftRA", argLength: 2, reg: gp21, asm: "ORR", aux: "Int64"}, // arg0 | arg1>>auxInt, signed shift, auxInt should be in the range 0 to 63.
+ {name: "ORshiftRO", argLength: 2, reg: gp21, asm: "ORR", aux: "Int64"}, // arg0 | arg1 ROR auxInt, right rotate, auxInt should be in the range 0 to 63.
+ {name: "XORshiftLL", argLength: 2, reg: gp21, asm: "EOR", aux: "Int64"}, // arg0 ^ arg1<<auxInt, auxInt should be in the range 0 to 63.
+ {name: "XORshiftRL", argLength: 2, reg: gp21, asm: "EOR", aux: "Int64"}, // arg0 ^ arg1>>auxInt, unsigned shift, auxInt should be in the range 0 to 63.
+ {name: "XORshiftRA", argLength: 2, reg: gp21, asm: "EOR", aux: "Int64"}, // arg0 ^ arg1>>auxInt, signed shift, auxInt should be in the range 0 to 63.
+ {name: "XORshiftRO", argLength: 2, reg: gp21, asm: "EOR", aux: "Int64"}, // arg0 ^ arg1 ROR auxInt, right rotate, auxInt should be in the range 0 to 63.
+ {name: "BICshiftLL", argLength: 2, reg: gp21, asm: "BIC", aux: "Int64"}, // arg0 &^ (arg1<<auxInt), auxInt should be in the range 0 to 63.
+ {name: "BICshiftRL", argLength: 2, reg: gp21, asm: "BIC", aux: "Int64"}, // arg0 &^ (arg1>>auxInt), unsigned shift, auxInt should be in the range 0 to 63.
+ {name: "BICshiftRA", argLength: 2, reg: gp21, asm: "BIC", aux: "Int64"}, // arg0 &^ (arg1>>auxInt), signed shift, auxInt should be in the range 0 to 63.
+ {name: "BICshiftRO", argLength: 2, reg: gp21, asm: "BIC", aux: "Int64"}, // arg0 &^ (arg1 ROR auxInt), right rotate, auxInt should be in the range 0 to 63.
+ {name: "EONshiftLL", argLength: 2, reg: gp21, asm: "EON", aux: "Int64"}, // arg0 ^ ^(arg1<<auxInt), auxInt should be in the range 0 to 63.
+ {name: "EONshiftRL", argLength: 2, reg: gp21, asm: "EON", aux: "Int64"}, // arg0 ^ ^(arg1>>auxInt), unsigned shift, auxInt should be in the range 0 to 63.
+ {name: "EONshiftRA", argLength: 2, reg: gp21, asm: "EON", aux: "Int64"}, // arg0 ^ ^(arg1>>auxInt), signed shift, auxInt should be in the range 0 to 63.
+ {name: "EONshiftRO", argLength: 2, reg: gp21, asm: "EON", aux: "Int64"}, // arg0 ^ ^(arg1 ROR auxInt), right rotate, auxInt should be in the range 0 to 63.
+ {name: "ORNshiftLL", argLength: 2, reg: gp21, asm: "ORN", aux: "Int64"}, // arg0 | ^(arg1<<auxInt), auxInt should be in the range 0 to 63.
+ {name: "ORNshiftRL", argLength: 2, reg: gp21, asm: "ORN", aux: "Int64"}, // arg0 | ^(arg1>>auxInt), unsigned shift, auxInt should be in the range 0 to 63.
+ {name: "ORNshiftRA", argLength: 2, reg: gp21, asm: "ORN", aux: "Int64"}, // arg0 | ^(arg1>>auxInt), signed shift, auxInt should be in the range 0 to 63.
+ {name: "ORNshiftRO", argLength: 2, reg: gp21, asm: "ORN", aux: "Int64"}, // arg0 | ^(arg1 ROR auxInt), right rotate, auxInt should be in the range 0 to 63.
+ {name: "CMPshiftLL", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int64", typ: "Flags"}, // arg0 compare to arg1<<auxInt, auxInt should be in the range 0 to 63.
+ {name: "CMPshiftRL", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int64", typ: "Flags"}, // arg0 compare to arg1>>auxInt, unsigned shift, auxInt should be in the range 0 to 63.
+ {name: "CMPshiftRA", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int64", typ: "Flags"}, // arg0 compare to arg1>>auxInt, signed shift, auxInt should be in the range 0 to 63.
+ {name: "CMNshiftLL", argLength: 2, reg: gp2flags, asm: "CMN", aux: "Int64", typ: "Flags"}, // (arg0 + arg1<<auxInt) compare to 0, auxInt should be in the range 0 to 63.
+ {name: "CMNshiftRL", argLength: 2, reg: gp2flags, asm: "CMN", aux: "Int64", typ: "Flags"}, // (arg0 + arg1>>auxInt) compare to 0, unsigned shift, auxInt should be in the range 0 to 63.
+ {name: "CMNshiftRA", argLength: 2, reg: gp2flags, asm: "CMN", aux: "Int64", typ: "Flags"}, // (arg0 + arg1>>auxInt) compare to 0, signed shift, auxInt should be in the range 0 to 63.
+ {name: "TSTshiftLL", argLength: 2, reg: gp2flags, asm: "TST", aux: "Int64", typ: "Flags"}, // (arg0 & arg1<<auxInt) compare to 0, auxInt should be in the range 0 to 63.
+ {name: "TSTshiftRL", argLength: 2, reg: gp2flags, asm: "TST", aux: "Int64", typ: "Flags"}, // (arg0 & arg1>>auxInt) compare to 0, unsigned shift, auxInt should be in the range 0 to 63.
+ {name: "TSTshiftRA", argLength: 2, reg: gp2flags, asm: "TST", aux: "Int64", typ: "Flags"}, // (arg0 & arg1>>auxInt) compare to 0, signed shift, auxInt should be in the range 0 to 63.
+ {name: "TSTshiftRO", argLength: 2, reg: gp2flags, asm: "TST", aux: "Int64", typ: "Flags"}, // (arg0 & arg1 ROR auxInt) compare to 0, right rotate, auxInt should be in the range 0 to 63.
+
+ // bitfield ops
+ // for all bitfield ops lsb is auxInt>>8, width is auxInt&0xff
+ // insert low width bits of arg1 into the result starting at bit lsb, copy other bits from arg0
+ {name: "BFI", argLength: 2, reg: gp21nog, asm: "BFI", aux: "ARM64BitField", resultInArg0: true},
+ // extract width bits of arg1 starting at bit lsb and insert at low end of result, copy other bits from arg0
+ {name: "BFXIL", argLength: 2, reg: gp21nog, asm: "BFXIL", aux: "ARM64BitField", resultInArg0: true},
+ // insert low width bits of arg0 into the result starting at bit lsb, bits to the left of the inserted bit field are set to the high/sign bit of the inserted bit field, bits to the right are zeroed
+ {name: "SBFIZ", argLength: 1, reg: gp11, asm: "SBFIZ", aux: "ARM64BitField"},
+ // extract width bits of arg0 starting at bit lsb and insert at low end of result, remaining high bits are set to the high/sign bit of the extracted bitfield
+ {name: "SBFX", argLength: 1, reg: gp11, asm: "SBFX", aux: "ARM64BitField"},
+ // insert low width bits of arg0 into the result starting at bit lsb, bits to the left and right of the inserted bit field are zeroed
+ {name: "UBFIZ", argLength: 1, reg: gp11, asm: "UBFIZ", aux: "ARM64BitField"},
+ // extract width bits of arg0 starting at bit lsb and insert at low end of result, remaining high bits are zeroed
+ {name: "UBFX", argLength: 1, reg: gp11, asm: "UBFX", aux: "ARM64BitField"},
+
+ // moves
+ {name: "MOVDconst", argLength: 0, reg: gp01, aux: "Int64", asm: "MOVD", typ: "UInt64", rematerializeable: true}, // 64 bits from auxint
+ {name: "FMOVSconst", argLength: 0, reg: fp01, aux: "Float64", asm: "FMOVS", typ: "Float32", rematerializeable: true}, // auxint as 64-bit float, convert to 32-bit float
+ {name: "FMOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "FMOVD", typ: "Float64", rematerializeable: true}, // auxint as 64-bit float
+
+ {name: "MOVDaddr", argLength: 1, reg: regInfo{inputs: []regMask{buildReg("SP") | buildReg("SB")}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVD", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB
+
+ {name: "MOVBload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVB", typ: "Int8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVBUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVBU", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVH", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVHU", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVWload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVW", typ: "Int32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVWUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVWU", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVDload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVD", typ: "UInt64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "LDP", argLength: 2, reg: gpload2, aux: "SymOff", asm: "LDP", typ: "(UInt64,UInt64)", faultOnNilArg0: true, symEffect: "Read"}, // load from ptr = arg0 + auxInt + aux, returns the tuple <*(*uint64)ptr, *(*uint64)(ptr+8)>. arg1=mem.
+ {name: "FMOVSload", argLength: 2, reg: fpload, aux: "SymOff", asm: "FMOVS", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "FMOVDload", argLength: 2, reg: fpload, aux: "SymOff", asm: "FMOVD", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+
+ // register indexed load
+ {name: "MOVDloadidx", argLength: 3, reg: gp2load, asm: "MOVD", typ: "UInt64"}, // load 64-bit dword from arg0 + arg1, arg2 = mem.
+ {name: "MOVWloadidx", argLength: 3, reg: gp2load, asm: "MOVW", typ: "Int32"}, // load 32-bit word from arg0 + arg1, sign-extended to 64-bit, arg2=mem.
+ {name: "MOVWUloadidx", argLength: 3, reg: gp2load, asm: "MOVWU", typ: "UInt32"}, // load 32-bit word from arg0 + arg1, zero-extended to 64-bit, arg2=mem.
+ {name: "MOVHloadidx", argLength: 3, reg: gp2load, asm: "MOVH", typ: "Int16"}, // load 16-bit word from arg0 + arg1, sign-extended to 64-bit, arg2=mem.
+ {name: "MOVHUloadidx", argLength: 3, reg: gp2load, asm: "MOVHU", typ: "UInt16"}, // load 16-bit word from arg0 + arg1, zero-extended to 64-bit, arg2=mem.
+ {name: "MOVBloadidx", argLength: 3, reg: gp2load, asm: "MOVB", typ: "Int8"}, // load 8-bit word from arg0 + arg1, sign-extended to 64-bit, arg2=mem.
+ {name: "MOVBUloadidx", argLength: 3, reg: gp2load, asm: "MOVBU", typ: "UInt8"}, // load 8-bit word from arg0 + arg1, zero-extended to 64-bit, arg2=mem.
+ {name: "FMOVSloadidx", argLength: 3, reg: fp2load, asm: "FMOVS", typ: "Float32"}, // load 32-bit float from arg0 + arg1, arg2=mem.
+ {name: "FMOVDloadidx", argLength: 3, reg: fp2load, asm: "FMOVD", typ: "Float64"}, // load 64-bit float from arg0 + arg1, arg2=mem.
+
+ // shifted register indexed load
+ {name: "MOVHloadidx2", argLength: 3, reg: gp2load, asm: "MOVH", typ: "Int16"}, // load 16-bit half-word from arg0 + arg1*2, sign-extended to 64-bit, arg2=mem.
+ {name: "MOVHUloadidx2", argLength: 3, reg: gp2load, asm: "MOVHU", typ: "UInt16"}, // load 16-bit half-word from arg0 + arg1*2, zero-extended to 64-bit, arg2=mem.
+ {name: "MOVWloadidx4", argLength: 3, reg: gp2load, asm: "MOVW", typ: "Int32"}, // load 32-bit word from arg0 + arg1*4, sign-extended to 64-bit, arg2=mem.
+ {name: "MOVWUloadidx4", argLength: 3, reg: gp2load, asm: "MOVWU", typ: "UInt32"}, // load 32-bit word from arg0 + arg1*4, zero-extended to 64-bit, arg2=mem.
+ {name: "MOVDloadidx8", argLength: 3, reg: gp2load, asm: "MOVD", typ: "UInt64"}, // load 64-bit double-word from arg0 + arg1*8, arg2 = mem.
+ {name: "FMOVSloadidx4", argLength: 3, reg: fp2load, asm: "FMOVS", typ: "Float32"}, // load 32-bit float from arg0 + arg1*4, arg2 = mem.
+ {name: "FMOVDloadidx8", argLength: 3, reg: fp2load, asm: "FMOVD", typ: "Float64"}, // load 64-bit float from arg0 + arg1*8, arg2 = mem.
+
+ {name: "MOVBstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVHstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVWstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVDstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "STP", argLength: 4, reg: gpstore2, aux: "SymOff", asm: "STP", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 16 bytes of arg1 and arg2 to arg0 + auxInt + aux. arg3=mem.
+ {name: "FMOVSstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "FMOVS", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "FMOVDstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "FMOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+
+ // register indexed store
+ {name: "MOVBstoreidx", argLength: 4, reg: gpstore2, asm: "MOVB", typ: "Mem"}, // store 1 byte of arg2 to arg0 + arg1, arg3 = mem.
+ {name: "MOVHstoreidx", argLength: 4, reg: gpstore2, asm: "MOVH", typ: "Mem"}, // store 2 bytes of arg2 to arg0 + arg1, arg3 = mem.
+ {name: "MOVWstoreidx", argLength: 4, reg: gpstore2, asm: "MOVW", typ: "Mem"}, // store 4 bytes of arg2 to arg0 + arg1, arg3 = mem.
+ {name: "MOVDstoreidx", argLength: 4, reg: gpstore2, asm: "MOVD", typ: "Mem"}, // store 8 bytes of arg2 to arg0 + arg1, arg3 = mem.
+ {name: "FMOVSstoreidx", argLength: 4, reg: fpstore2, asm: "FMOVS", typ: "Mem"}, // store 32-bit float of arg2 to arg0 + arg1, arg3=mem.
+ {name: "FMOVDstoreidx", argLength: 4, reg: fpstore2, asm: "FMOVD", typ: "Mem"}, // store 64-bit float of arg2 to arg0 + arg1, arg3=mem.
+
+ // shifted register indexed store
+ {name: "MOVHstoreidx2", argLength: 4, reg: gpstore2, asm: "MOVH", typ: "Mem"}, // store 2 bytes of arg2 to arg0 + arg1*2, arg3 = mem.
+ {name: "MOVWstoreidx4", argLength: 4, reg: gpstore2, asm: "MOVW", typ: "Mem"}, // store 4 bytes of arg2 to arg0 + arg1*4, arg3 = mem.
+ {name: "MOVDstoreidx8", argLength: 4, reg: gpstore2, asm: "MOVD", typ: "Mem"}, // store 8 bytes of arg2 to arg0 + arg1*8, arg3 = mem.
+ {name: "FMOVSstoreidx4", argLength: 4, reg: fpstore2, asm: "FMOVS", typ: "Mem"}, // store 32-bit float of arg2 to arg0 + arg1*4, arg3=mem.
+ {name: "FMOVDstoreidx8", argLength: 4, reg: fpstore2, asm: "FMOVD", typ: "Mem"}, // store 64-bit float of arg2 to arg0 + arg1*8, arg3=mem.
+
+ {name: "MOVBstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of zero to arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of zero to arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVWstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of zero to arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVDstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of zero to arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVQstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "STP", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 16 bytes of zero to arg0 + auxInt + aux. arg1=mem.
+
+ // register indexed store zero
+ {name: "MOVBstorezeroidx", argLength: 3, reg: gpstore, asm: "MOVB", typ: "Mem"}, // store 1 byte of zero to arg0 + arg1, arg2 = mem.
+ {name: "MOVHstorezeroidx", argLength: 3, reg: gpstore, asm: "MOVH", typ: "Mem"}, // store 2 bytes of zero to arg0 + arg1, arg2 = mem.
+ {name: "MOVWstorezeroidx", argLength: 3, reg: gpstore, asm: "MOVW", typ: "Mem"}, // store 4 bytes of zero to arg0 + arg1, arg2 = mem.
+ {name: "MOVDstorezeroidx", argLength: 3, reg: gpstore, asm: "MOVD", typ: "Mem"}, // store 8 bytes of zero to arg0 + arg1, arg2 = mem.
+
+ // shifted register indexed store zero
+ {name: "MOVHstorezeroidx2", argLength: 3, reg: gpstore, asm: "MOVH", typ: "Mem"}, // store 2 bytes of zero to arg0 + arg1*2, arg2 = mem.
+ {name: "MOVWstorezeroidx4", argLength: 3, reg: gpstore, asm: "MOVW", typ: "Mem"}, // store 4 bytes of zero to arg0 + arg1*4, arg2 = mem.
+ {name: "MOVDstorezeroidx8", argLength: 3, reg: gpstore, asm: "MOVD", typ: "Mem"}, // store 8 bytes of zero to arg0 + arg1*8, arg2 = mem.
+
+ {name: "FMOVDgpfp", argLength: 1, reg: gpfp, asm: "FMOVD"}, // move int64 to float64 (no conversion)
+ {name: "FMOVDfpgp", argLength: 1, reg: fpgp, asm: "FMOVD"}, // move float64 to int64 (no conversion)
+ {name: "FMOVSgpfp", argLength: 1, reg: gpfp, asm: "FMOVS"}, // move 32bits from int to float reg (no conversion)
+ {name: "FMOVSfpgp", argLength: 1, reg: fpgp, asm: "FMOVS"}, // move 32bits from float to int reg, zero extend (no conversion)
+
+ // conversions
+ {name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB"}, // move from arg0, sign-extended from byte
+ {name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, zero-extended from byte
+ {name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVH"}, // move from arg0, sign-extended from half
+ {name: "MOVHUreg", argLength: 1, reg: gp11, asm: "MOVHU"}, // move from arg0, zero-extended from half
+ {name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW"}, // move from arg0, sign-extended from word
+ {name: "MOVWUreg", argLength: 1, reg: gp11, asm: "MOVWU"}, // move from arg0, zero-extended from word
+ {name: "MOVDreg", argLength: 1, reg: gp11, asm: "MOVD"}, // move from arg0
+
+ {name: "MOVDnop", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}, resultInArg0: true}, // nop, return arg0 in same register
+
+ {name: "SCVTFWS", argLength: 1, reg: gpfp, asm: "SCVTFWS"}, // int32 -> float32
+ {name: "SCVTFWD", argLength: 1, reg: gpfp, asm: "SCVTFWD"}, // int32 -> float64
+ {name: "UCVTFWS", argLength: 1, reg: gpfp, asm: "UCVTFWS"}, // uint32 -> float32
+ {name: "UCVTFWD", argLength: 1, reg: gpfp, asm: "UCVTFWD"}, // uint32 -> float64
+ {name: "SCVTFS", argLength: 1, reg: gpfp, asm: "SCVTFS"}, // int64 -> float32
+ {name: "SCVTFD", argLength: 1, reg: gpfp, asm: "SCVTFD"}, // int64 -> float64
+ {name: "UCVTFS", argLength: 1, reg: gpfp, asm: "UCVTFS"}, // uint64 -> float32
+ {name: "UCVTFD", argLength: 1, reg: gpfp, asm: "UCVTFD"}, // uint64 -> float64
+ {name: "FCVTZSSW", argLength: 1, reg: fpgp, asm: "FCVTZSSW"}, // float32 -> int32
+ {name: "FCVTZSDW", argLength: 1, reg: fpgp, asm: "FCVTZSDW"}, // float64 -> int32
+ {name: "FCVTZUSW", argLength: 1, reg: fpgp, asm: "FCVTZUSW"}, // float32 -> uint32
+ {name: "FCVTZUDW", argLength: 1, reg: fpgp, asm: "FCVTZUDW"}, // float64 -> uint32
+ {name: "FCVTZSS", argLength: 1, reg: fpgp, asm: "FCVTZSS"}, // float32 -> int64
+ {name: "FCVTZSD", argLength: 1, reg: fpgp, asm: "FCVTZSD"}, // float64 -> int64
+ {name: "FCVTZUS", argLength: 1, reg: fpgp, asm: "FCVTZUS"}, // float32 -> uint64
+ {name: "FCVTZUD", argLength: 1, reg: fpgp, asm: "FCVTZUD"}, // float64 -> uint64
+ {name: "FCVTSD", argLength: 1, reg: fp11, asm: "FCVTSD"}, // float32 -> float64
+ {name: "FCVTDS", argLength: 1, reg: fp11, asm: "FCVTDS"}, // float64 -> float32
+
+ // floating-point round to integral
+ {name: "FRINTAD", argLength: 1, reg: fp11, asm: "FRINTAD"},
+ {name: "FRINTMD", argLength: 1, reg: fp11, asm: "FRINTMD"},
+ {name: "FRINTND", argLength: 1, reg: fp11, asm: "FRINTND"},
+ {name: "FRINTPD", argLength: 1, reg: fp11, asm: "FRINTPD"},
+ {name: "FRINTZD", argLength: 1, reg: fp11, asm: "FRINTZD"},
+
+ // conditional instructions; auxint is
+ // one of the arm64 comparison pseudo-ops (LessThan, LessThanU, etc.)
+ {name: "CSEL", argLength: 3, reg: gp2flags1, asm: "CSEL", aux: "CCop"}, // auxint(flags) ? arg0 : arg1
+ {name: "CSEL0", argLength: 2, reg: gp1flags1, asm: "CSEL", aux: "CCop"}, // auxint(flags) ? arg0 : 0
+ {name: "CSINC", argLength: 3, reg: gp2flags1, asm: "CSINC", aux: "CCop"}, // auxint(flags) ? arg0 : arg1 + 1
+ {name: "CSINV", argLength: 3, reg: gp2flags1, asm: "CSINV", aux: "CCop"}, // auxint(flags) ? arg0 : ^arg1
+ {name: "CSNEG", argLength: 3, reg: gp2flags1, asm: "CSNEG", aux: "CCop"}, // auxint(flags) ? arg0 : -arg1
+ {name: "CSETM", argLength: 1, reg: readflags, asm: "CSETM", aux: "CCop"}, // auxint(flags) ? -1 : 0
+
+ // function calls
+ {name: "CALLstatic", argLength: -1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). last arg=mem, auxint=argsize, returns mem
+ {name: "CALLtail", argLength: -1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). last arg=mem, auxint=argsize, returns mem
+ {name: "CALLclosure", argLength: -1, reg: regInfo{inputs: []regMask{gpsp, buildReg("R26"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, last arg=mem, auxint=argsize, returns mem
+ {name: "CALLinter", argLength: -1, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, last arg=mem, auxint=argsize, returns mem
+
+ // pseudo-ops
+ {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpg}}, nilCheck: true, faultOnNilArg0: true}, // panic if arg0 is nil. arg1=mem.
+
+ {name: "Equal", argLength: 1, reg: readflags}, // bool, true flags encode x==y false otherwise.
+ {name: "NotEqual", argLength: 1, reg: readflags}, // bool, true flags encode x!=y false otherwise.
+ {name: "LessThan", argLength: 1, reg: readflags}, // bool, true flags encode signed x<y false otherwise.
+ {name: "LessEqual", argLength: 1, reg: readflags}, // bool, true flags encode signed x<=y false otherwise.
+ {name: "GreaterThan", argLength: 1, reg: readflags}, // bool, true flags encode signed x>y false otherwise.
+ {name: "GreaterEqual", argLength: 1, reg: readflags}, // bool, true flags encode signed x>=y false otherwise.
+ {name: "LessThanU", argLength: 1, reg: readflags}, // bool, true flags encode unsigned x<y false otherwise.
+ {name: "LessEqualU", argLength: 1, reg: readflags}, // bool, true flags encode unsigned x<=y false otherwise.
+ {name: "GreaterThanU", argLength: 1, reg: readflags}, // bool, true flags encode unsigned x>y false otherwise.
+ {name: "GreaterEqualU", argLength: 1, reg: readflags}, // bool, true flags encode unsigned x>=y false otherwise.
+ {name: "LessThanF", argLength: 1, reg: readflags}, // bool, true flags encode floating-point x<y false otherwise.
+ {name: "LessEqualF", argLength: 1, reg: readflags}, // bool, true flags encode floating-point x<=y false otherwise.
+ {name: "GreaterThanF", argLength: 1, reg: readflags}, // bool, true flags encode floating-point x>y false otherwise.
+ {name: "GreaterEqualF", argLength: 1, reg: readflags}, // bool, true flags encode floating-point x>=y false otherwise.
+ {name: "NotLessThanF", argLength: 1, reg: readflags}, // bool, true flags encode floating-point x>=y || x is unordered with y, false otherwise.
+ {name: "NotLessEqualF", argLength: 1, reg: readflags}, // bool, true flags encode floating-point x>y || x is unordered with y, false otherwise.
+ {name: "NotGreaterThanF", argLength: 1, reg: readflags}, // bool, true flags encode floating-point x<=y || x is unordered with y, false otherwise.
+ {name: "NotGreaterEqualF", argLength: 1, reg: readflags}, // bool, true flags encode floating-point x<y || x is unordered with y, false otherwise.
+ // duffzero
+ // arg0 = address of memory to zero
+ // arg1 = mem
+ // auxint = offset into duffzero code to start executing
+ // returns mem
+ // R20 changed as side effect
+ // R16 and R17 may be clobbered by linker trampoline.
+ {
+ name: "DUFFZERO",
+ aux: "Int64",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R20")},
+ clobbers: buildReg("R16 R17 R20 R30"),
+ },
+ faultOnNilArg0: true,
+ unsafePoint: true, // FP maintenance around DUFFZERO can be clobbered by interrupts
+ },
+
+ // large zeroing
+ // arg0 = address of memory to zero (in R16 aka arm64.REGRT1, changed as side effect)
+ // arg1 = address of the last 16-byte unit to zero
+ // arg2 = mem
+ // returns mem
+ // STP.P (ZR,ZR), 16(R16)
+ // CMP Rarg1, R16
+ // BLE -2(PC)
+ // Note: the-end-of-the-memory may not be a valid pointer, which is a problem if it is spilled.
+ // the-end-of-the-memory - 16 is within the area to zero, ok to spill.
+ {
+ name: "LoweredZero",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R16"), gp},
+ clobbers: buildReg("R16"),
+ },
+ clobberFlags: true,
+ faultOnNilArg0: true,
+ },
+
+ // duffcopy
+ // arg0 = address of dst memory (in R21, changed as side effect)
+ // arg1 = address of src memory (in R20, changed as side effect)
+ // arg2 = mem
+ // auxint = offset into duffcopy code to start executing
+ // returns mem
+ // R20, R21 changed as side effect
+ // R16 and R17 may be clobbered by linker trampoline.
+ {
+ name: "DUFFCOPY",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R21"), buildReg("R20")},
+ clobbers: buildReg("R16 R17 R20 R21 R26 R30"),
+ },
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts
+ },
+
+ // large move
+ // arg0 = address of dst memory (in R17 aka arm64.REGRT2, changed as side effect)
+ // arg1 = address of src memory (in R16 aka arm64.REGRT1, changed as side effect)
+ // arg2 = address of the last element of src
+ // arg3 = mem
+ // returns mem
+ // LDP.P 16(R16), (R25, Rtmp)
+ // STP.P (R25, Rtmp), 16(R17)
+ // CMP Rarg2, R16
+ // BLE -3(PC)
+ // Note: the-end-of-src may not be a valid pointer, which is a problem if it is spilled.
+ // the-end-of-src - 16 is within the area to copy, ok to spill.
+ {
+ name: "LoweredMove",
+ argLength: 4,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R17"), buildReg("R16"), gp &^ buildReg("R25")},
+ clobbers: buildReg("R16 R17 R25"),
+ },
+ clobberFlags: true,
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
+ // and sorts it to the very beginning of the block to prevent other
+ // use of R26 (arm64.REGCTXT, the closure pointer)
+ {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("R26")}}, zeroWidth: true},
+
+ // LoweredGetCallerSP returns the SP of the caller of the current function.
+ {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true},
+
+ // LoweredGetCallerPC evaluates to the PC to which its "caller" will return.
+ // I.e., if f calls g and g "calls" getcallerpc,
+ // the result should be the PC within f that g will return to.
+ // See runtime/stubs.go for a more detailed discussion.
+ {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true},
+
+ // Constant flag value.
+ // Note: there's an "unordered" outcome for floating-point
+ // comparisons, but we don't use such a beast yet.
+ // This op is for temporary use by rewrite rules. It
+ // cannot appear in the generated assembly.
+ {name: "FlagConstant", aux: "FlagConstant"},
+
+ // (InvertFlags (CMP a b)) == (CMP b a)
+ // InvertFlags is a pseudo-op which can't appear in assembly output.
+ {name: "InvertFlags", argLength: 1}, // reverse direction of arg0
+
+ // atomic loads.
+ // load from arg0. arg1=mem. auxint must be zero.
+ // returns <value,memory> so they can be properly ordered with other loads.
+ {name: "LDAR", argLength: 2, reg: gpload, asm: "LDAR", faultOnNilArg0: true},
+ {name: "LDARB", argLength: 2, reg: gpload, asm: "LDARB", faultOnNilArg0: true},
+ {name: "LDARW", argLength: 2, reg: gpload, asm: "LDARW", faultOnNilArg0: true},
+
+ // atomic stores.
+ // store arg1 to arg0. arg2=mem. returns memory. auxint must be zero.
+ {name: "STLRB", argLength: 3, reg: gpstore, asm: "STLRB", faultOnNilArg0: true, hasSideEffects: true},
+ {name: "STLR", argLength: 3, reg: gpstore, asm: "STLR", faultOnNilArg0: true, hasSideEffects: true},
+ {name: "STLRW", argLength: 3, reg: gpstore, asm: "STLRW", faultOnNilArg0: true, hasSideEffects: true},
+
+ // atomic exchange.
+ // store arg1 to arg0. arg2=mem. returns <old content of *arg0, memory>. auxint must be zero.
+ // LDAXR (Rarg0), Rout
+ // STLXR Rarg1, (Rarg0), Rtmp
+ // CBNZ Rtmp, -2(PC)
+ {name: "LoweredAtomicExchange64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicExchange32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // atomic exchange variant.
+ // store arg1 to arg0. arg2=mem. returns <old content of *arg0, memory>. auxint must be zero.
+ // SWPALD Rarg1, (Rarg0), Rout
+ {name: "LoweredAtomicExchange64Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicExchange32Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
+
+ // atomic add.
+ // *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero.
+ // LDAXR (Rarg0), Rout
+ // ADD Rarg1, Rout
+ // STLXR Rout, (Rarg0), Rtmp
+ // CBNZ Rtmp, -3(PC)
+ {name: "LoweredAtomicAdd64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicAdd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // atomic add variant.
+ // *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero.
+ // LDADDAL Rarg1, (Rarg0), Rout
+ // ADD Rarg1, Rout
+ {name: "LoweredAtomicAdd64Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicAdd32Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
+
+ // atomic compare and swap.
+ // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory. auxint must be zero.
+ // if *arg0 == arg1 {
+ // *arg0 = arg2
+ // return (true, memory)
+ // } else {
+ // return (false, memory)
+ // }
+ // LDAXR (Rarg0), Rtmp
+ // CMP Rarg1, Rtmp
+ // BNE 3(PC)
+ // STLXR Rarg2, (Rarg0), Rtmp
+ // CBNZ Rtmp, -4(PC)
+ // CSET EQ, Rout
+ {name: "LoweredAtomicCas64", argLength: 4, reg: gpcas, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicCas32", argLength: 4, reg: gpcas, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // atomic compare and swap variant.
+ // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory. auxint must be zero.
+ // if *arg0 == arg1 {
+ // *arg0 = arg2
+ // return (true, memory)
+ // } else {
+ // return (false, memory)
+ // }
+ // MOV Rarg1, Rtmp
+ // CASAL Rtmp, (Rarg0), Rarg2
+ // CMP Rarg1, Rtmp
+ // CSET EQ, Rout
+ {name: "LoweredAtomicCas64Variant", argLength: 4, reg: gpcas, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicCas32Variant", argLength: 4, reg: gpcas, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // atomic and/or.
+ // *arg0 &= (|=) arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero.
+ // LDAXR (Rarg0), Rout
+ // AND/OR Rarg1, Rout
+ // STLXR Rout, (Rarg0), Rtmp
+ // CBNZ Rtmp, -3(PC)
+ {name: "LoweredAtomicAnd8", argLength: 3, reg: gpxchg, resultNotInArgs: true, asm: "AND", typ: "(UInt8,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicAnd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, asm: "AND", typ: "(UInt32,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicOr8", argLength: 3, reg: gpxchg, resultNotInArgs: true, asm: "ORR", typ: "(UInt8,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicOr32", argLength: 3, reg: gpxchg, resultNotInArgs: true, asm: "ORR", typ: "(UInt32,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // atomic and/or variant.
+ // *arg0 &= (|=) arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero.
+ // AND:
+ // MVN Rarg1, Rtmp
+ // LDANDALB Rtmp, (Rarg0), Rout
+ // AND Rarg1, Rout
+ // OR:
+ // LDORALB Rarg1, (Rarg0), Rout
+ // ORR Rarg1, Rout
+ {name: "LoweredAtomicAnd8Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, typ: "(UInt8,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicAnd32Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, typ: "(UInt32,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicOr8Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, typ: "(UInt8,Mem)", faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicOr32Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, typ: "(UInt32,Mem)", faultOnNilArg0: true, hasSideEffects: true},
+
+ // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+ // It saves all GP registers if necessary,
+ // but clobbers R30 (LR) because it's a call.
+ // R16 and R17 may be clobbered by linker trampoline.
+ {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("R2"), buildReg("R3")}, clobbers: (callerSave &^ gpg) | buildReg("R16 R17 R30")}, clobberFlags: true, aux: "Sym", symEffect: "None"},
+
+ // There are three of these functions so that they can have three different register inputs.
+ // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
+ // default registers to match so we don't need to copy registers around unnecessarily.
+ {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
+ {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
+ {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r0, r1}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
+
+ // Prefetch instruction
+ // Do prefetch arg0 address with option aux. arg0=addr, arg1=memory, aux=option.
+ {name: "PRFM", argLength: 2, aux: "Int64", reg: prefreg, asm: "PRFM", hasSideEffects: true},
+
+ // Publication barrier
+ {name: "DMB", argLength: 1, aux: "Int64", asm: "DMB", hasSideEffects: true}, // Do data barrier. arg0=memory, aux=option.
+ }
+
+ blocks := []blockData{
+ {name: "EQ", controls: 1},
+ {name: "NE", controls: 1},
+ {name: "LT", controls: 1},
+ {name: "LE", controls: 1},
+ {name: "GT", controls: 1},
+ {name: "GE", controls: 1},
+ {name: "ULT", controls: 1},
+ {name: "ULE", controls: 1},
+ {name: "UGT", controls: 1},
+ {name: "UGE", controls: 1},
+ {name: "Z", controls: 1}, // Control == 0 (take a register instead of flags)
+ {name: "NZ", controls: 1}, // Control != 0
+ {name: "ZW", controls: 1}, // Control == 0, 32-bit
+ {name: "NZW", controls: 1}, // Control != 0, 32-bit
+ {name: "TBZ", controls: 1, aux: "Int64"}, // Control & (1 << AuxInt) == 0
+ {name: "TBNZ", controls: 1, aux: "Int64"}, // Control & (1 << AuxInt) != 0
+ {name: "FLT", controls: 1},
+ {name: "FLE", controls: 1},
+ {name: "FGT", controls: 1},
+ {name: "FGE", controls: 1},
+ {name: "LTnoov", controls: 1}, // 'LT' but without honoring overflow
+ {name: "LEnoov", controls: 1}, // 'LE' but without honoring overflow
+ {name: "GTnoov", controls: 1}, // 'GT' but without honoring overflow
+ {name: "GEnoov", controls: 1}, // 'GE' but without honoring overflow
+
+ // JUMPTABLE implements jump tables.
+ // Aux is the symbol (an *obj.LSym) for the jump table.
+ // control[0] is the index into the jump table.
+ // control[1] is the address of the jump table (the address of the symbol stored in Aux).
+ {name: "JUMPTABLE", controls: 2, aux: "Sym"},
+ }
+
+ archs = append(archs, arch{
+ name: "ARM64",
+ pkg: "cmd/internal/obj/arm64",
+ genfile: "../../arm64/ssa.go",
+ ops: ops,
+ blocks: blocks,
+ regnames: regNamesARM64,
+ ParamIntRegNames: "R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15",
+ ParamFloatRegNames: "F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15",
+ gpregmask: gp,
+ fpregmask: fp,
+ framepointerreg: -1, // not used
+ linkreg: int8(num["R30"]),
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64latelower.rules b/src/cmd/compile/internal/ssa/_gen/ARM64latelower.rules
new file mode 100644
index 0000000..d0c2099
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/ARM64latelower.rules
@@ -0,0 +1,21 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file contains rules used by the late lower pass.
+// These are often the exact inverse of rules in ARM64.rules: a *const op whose constant fails isARM64addcon/isARM64bitcon is rewritten to the register form fed by a MOVDconst.
+
+(ADDconst [c] x) && !isARM64addcon(c) => (ADD x (MOVDconst [c]))
+(SUBconst [c] x) && !isARM64addcon(c) => (SUB x (MOVDconst [c]))
+(ANDconst [c] x) && !isARM64bitcon(uint64(c)) => (AND x (MOVDconst [c]))
+(ORconst [c] x) && !isARM64bitcon(uint64(c)) => (OR x (MOVDconst [c]))
+(XORconst [c] x) && !isARM64bitcon(uint64(c)) => (XOR x (MOVDconst [c]))
+(TSTconst [c] x) && !isARM64bitcon(uint64(c)) => (TST x (MOVDconst [c]))
+(TSTWconst [c] x) && !isARM64bitcon(uint64(c)|uint64(c)<<32) => (TSTW x (MOVDconst [int64(c)]))
+
+(CMPconst [c] x) && !isARM64addcon(c) => (CMP x (MOVDconst [c]))
+(CMPWconst [c] x) && !isARM64addcon(int64(c)) => (CMPW x (MOVDconst [int64(c)]))
+(CMNconst [c] x) && !isARM64addcon(c) => (CMN x (MOVDconst [c]))
+(CMNWconst [c] x) && !isARM64addcon(int64(c)) => (CMNW x (MOVDconst [int64(c)]))
+
+(ADDSconstflags [c] x) && !isARM64addcon(c) => (ADDSflags x (MOVDconst [c]))
diff --git a/src/cmd/compile/internal/ssa/_gen/ARMOps.go b/src/cmd/compile/internal/ssa/_gen/ARMOps.go
new file mode 100644
index 0000000..de477a2
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/ARMOps.go
@@ -0,0 +1,600 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import "strings"
+
+// Notes:
+// - Integer types live in the low portion of registers. Upper portions are junk.
+// - Boolean types use the low-order byte of a register. 0=false, 1=true.
+// Upper bytes are junk.
+// - *const instructions may use a constant larger than the instruction can encode.
+// In this case the assembler expands to multiple instructions and uses tmp
+// register (R11).
+
+// Suffixes encode the bit width of various instructions.
+// W (word) = 32 bit
+// H (half word) = 16 bit
+// HU = 16 bit unsigned
+// B (byte) = 8 bit
+// BU = 8 bit unsigned
+// F (float) = 32 bit float
+// D (double) = 64 bit float
+
+var regNamesARM = []string{ // position in this list is the register's number (its bit index in a regMask); init panics if there are more than 64 entries
+ "R0",
+ "R1",
+ "R2",
+ "R3",
+ "R4",
+ "R5",
+ "R6",
+ "R7",
+ "R8",
+ "R9",
+ "g", // aka R10
+ "R11", // tmp: used by the assembler when it expands a constant too large to encode
+ "R12", // linker trampoline scratch register
+ "SP", // aka R13
+ "R14", // link register (LR)
+ "R15", // pc
+
+ "F0",
+ "F1",
+ "F2",
+ "F3",
+ "F4",
+ "F5",
+ "F6",
+ "F7",
+ "F8",
+ "F9",
+ "F10",
+ "F11",
+ "F12",
+ "F13",
+ "F14",
+ "F15", // tmp: int-float conversions use it as a temporary
+
+ // If you add registers, update asyncPreempt in runtime.
+
+ // pseudo-registers
+ "SB",
+}
+
+func init() {
+ // Make map from reg names to reg integers.
+ if len(regNamesARM) > 64 {
+ panic("too many registers")
+ }
+ num := map[string]int{}
+ for i, name := range regNamesARM {
+ num[name] = i
+ }
+ buildReg := func(s string) regMask {
+ m := regMask(0)
+ for _, r := range strings.Split(s, " ") {
+ if n, ok := num[r]; ok {
+ m |= regMask(1) << uint(n)
+ continue
+ }
+ panic("register " + r + " not found")
+ }
+ return m
+ }
+
+ // Common individual register masks
+ var (
+ gp = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12 R14")
+ gpg = gp | buildReg("g")
+ gpsp = gp | buildReg("SP")
+ gpspg = gpg | buildReg("SP")
+ gpspsbg = gpspg | buildReg("SB")
+ fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15")
+ callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g
+ r0 = buildReg("R0")
+ r1 = buildReg("R1")
+ r2 = buildReg("R2")
+ r3 = buildReg("R3")
+ r4 = buildReg("R4")
+ )
+ // Common regInfo
+ var (
+ gp01 = regInfo{inputs: nil, outputs: []regMask{gp}}
+ gp11 = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}}
+ gp11carry = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp, 0}}
+ gp11sp = regInfo{inputs: []regMask{gpspg}, outputs: []regMask{gp}}
+ gp1flags = regInfo{inputs: []regMask{gpg}}
+ gp1flags1 = regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}
+ gp21 = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp}}
+ gp21carry = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp, 0}}
+ gp2flags = regInfo{inputs: []regMask{gpg, gpg}}
+ gp2flags1 = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}}
+ gp22 = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp, gp}}
+ gp31 = regInfo{inputs: []regMask{gp, gp, gp}, outputs: []regMask{gp}}
+ gp31carry = regInfo{inputs: []regMask{gp, gp, gp}, outputs: []regMask{gp, 0}}
+ gp3flags = regInfo{inputs: []regMask{gp, gp, gp}}
+ gp3flags1 = regInfo{inputs: []regMask{gp, gp, gp}, outputs: []regMask{gp}}
+ gpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}
+ gpstore = regInfo{inputs: []regMask{gpspsbg, gpg}}
+ gp2load = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}}
+ gp2store = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}}
+ fp01 = regInfo{inputs: nil, outputs: []regMask{fp}}
+ fp11 = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}}
+ fp1flags = regInfo{inputs: []regMask{fp}}
+ fpgp = regInfo{inputs: []regMask{fp}, outputs: []regMask{gp}, clobbers: buildReg("F15")} // int-float conversion uses F15 as tmp
+ gpfp = regInfo{inputs: []regMask{gp}, outputs: []regMask{fp}, clobbers: buildReg("F15")}
+ fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
+ fp31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: []regMask{fp}}
+ fp2flags = regInfo{inputs: []regMask{fp, fp}}
+ fpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{fp}}
+ fpstore = regInfo{inputs: []regMask{gpspsbg, fp}}
+ readflags = regInfo{inputs: nil, outputs: []regMask{gp}}
+ )
+ ops := []opData{
+ // binary ops
+ {name: "ADD", argLength: 2, reg: gp21, asm: "ADD", commutative: true}, // arg0 + arg1
+ {name: "ADDconst", argLength: 1, reg: gp11sp, asm: "ADD", aux: "Int32"}, // arg0 + auxInt
+ {name: "SUB", argLength: 2, reg: gp21, asm: "SUB"}, // arg0 - arg1
+ {name: "SUBconst", argLength: 1, reg: gp11, asm: "SUB", aux: "Int32"}, // arg0 - auxInt
+ {name: "RSB", argLength: 2, reg: gp21, asm: "RSB"}, // arg1 - arg0
+ {name: "RSBconst", argLength: 1, reg: gp11, asm: "RSB", aux: "Int32"}, // auxInt - arg0
+ {name: "MUL", argLength: 2, reg: gp21, asm: "MUL", commutative: true}, // arg0 * arg1
+ {name: "HMUL", argLength: 2, reg: gp21, asm: "MULL", commutative: true}, // (arg0 * arg1) >> 32, signed
+ {name: "HMULU", argLength: 2, reg: gp21, asm: "MULLU", commutative: true}, // (arg0 * arg1) >> 32, unsigned
+
+ // udiv runtime call for soft division
+ // output0 = arg0/arg1, output1 = arg0%arg1
+ // see ../../../../../runtime/vlop_arm.s
+ {
+ name: "CALLudiv",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R1"), buildReg("R0")},
+ outputs: []regMask{buildReg("R0"), buildReg("R1")},
+ clobbers: buildReg("R2 R3 R12 R14"), // R14 is LR, R12 is linker trampoline scratch register
+ },
+ clobberFlags: true,
+ typ: "(UInt32,UInt32)",
+ call: false, // TODO(mdempsky): Should this be true?
+ },
+
+ {name: "ADDS", argLength: 2, reg: gp21carry, asm: "ADD", commutative: true}, // arg0 + arg1, set carry flag
+ {name: "ADDSconst", argLength: 1, reg: gp11carry, asm: "ADD", aux: "Int32"}, // arg0 + auxInt, set carry flag
+ {name: "ADC", argLength: 3, reg: gp2flags1, asm: "ADC", commutative: true}, // arg0 + arg1 + carry, arg2=flags
+ {name: "ADCconst", argLength: 2, reg: gp1flags1, asm: "ADC", aux: "Int32"}, // arg0 + auxInt + carry, arg1=flags
+ {name: "SUBS", argLength: 2, reg: gp21carry, asm: "SUB"}, // arg0 - arg1, set carry flag
+ {name: "SUBSconst", argLength: 1, reg: gp11carry, asm: "SUB", aux: "Int32"}, // arg0 - auxInt, set carry flag
+ {name: "RSBSconst", argLength: 1, reg: gp11carry, asm: "RSB", aux: "Int32"}, // auxInt - arg0, set carry flag
+ {name: "SBC", argLength: 3, reg: gp2flags1, asm: "SBC"}, // arg0 - arg1 - carry, arg2=flags
+ {name: "SBCconst", argLength: 2, reg: gp1flags1, asm: "SBC", aux: "Int32"}, // arg0 - auxInt - carry, arg1=flags
+ {name: "RSCconst", argLength: 2, reg: gp1flags1, asm: "RSC", aux: "Int32"}, // auxInt - arg0 - carry, arg1=flags
+
+ {name: "MULLU", argLength: 2, reg: gp22, asm: "MULLU", commutative: true}, // arg0 * arg1, high 32 bits in out0, low 32 bits in out1
+ {name: "MULA", argLength: 3, reg: gp31, asm: "MULA"}, // arg0 * arg1 + arg2
+ {name: "MULS", argLength: 3, reg: gp31, asm: "MULS"}, // arg2 - arg0 * arg1
+
+ {name: "ADDF", argLength: 2, reg: fp21, asm: "ADDF", commutative: true}, // arg0 + arg1
+ {name: "ADDD", argLength: 2, reg: fp21, asm: "ADDD", commutative: true}, // arg0 + arg1
+ {name: "SUBF", argLength: 2, reg: fp21, asm: "SUBF"}, // arg0 - arg1
+ {name: "SUBD", argLength: 2, reg: fp21, asm: "SUBD"}, // arg0 - arg1
+ {name: "MULF", argLength: 2, reg: fp21, asm: "MULF", commutative: true}, // arg0 * arg1
+ {name: "MULD", argLength: 2, reg: fp21, asm: "MULD", commutative: true}, // arg0 * arg1
+ {name: "NMULF", argLength: 2, reg: fp21, asm: "NMULF", commutative: true}, // -(arg0 * arg1)
+ {name: "NMULD", argLength: 2, reg: fp21, asm: "NMULD", commutative: true}, // -(arg0 * arg1)
+ {name: "DIVF", argLength: 2, reg: fp21, asm: "DIVF"}, // arg0 / arg1
+ {name: "DIVD", argLength: 2, reg: fp21, asm: "DIVD"}, // arg0 / arg1
+
+ {name: "MULAF", argLength: 3, reg: fp31, asm: "MULAF", resultInArg0: true}, // arg0 + (arg1 * arg2)
+ {name: "MULAD", argLength: 3, reg: fp31, asm: "MULAD", resultInArg0: true}, // arg0 + (arg1 * arg2)
+ {name: "MULSF", argLength: 3, reg: fp31, asm: "MULSF", resultInArg0: true}, // arg0 - (arg1 * arg2)
+ {name: "MULSD", argLength: 3, reg: fp31, asm: "MULSD", resultInArg0: true}, // arg0 - (arg1 * arg2)
+
+ // FMULAD only exists on platforms with the VFPv4 instruction set.
+ // Any use must be preceded by a successful check of runtime.arm_support_vfpv4.
+ {name: "FMULAD", argLength: 3, reg: fp31, asm: "FMULAD", resultInArg0: true}, // arg0 + (arg1 * arg2)
+
+ {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1
+ {name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int32"}, // arg0 & auxInt
+ {name: "OR", argLength: 2, reg: gp21, asm: "ORR", commutative: true}, // arg0 | arg1
+ {name: "ORconst", argLength: 1, reg: gp11, asm: "ORR", aux: "Int32"}, // arg0 | auxInt
+ {name: "XOR", argLength: 2, reg: gp21, asm: "EOR", commutative: true}, // arg0 ^ arg1
+ {name: "XORconst", argLength: 1, reg: gp11, asm: "EOR", aux: "Int32"}, // arg0 ^ auxInt
+ {name: "BIC", argLength: 2, reg: gp21, asm: "BIC"}, // arg0 &^ arg1
+ {name: "BICconst", argLength: 1, reg: gp11, asm: "BIC", aux: "Int32"}, // arg0 &^ auxInt
+
+ // bit extraction, AuxInt = Width<<8 | LSB
+ {name: "BFX", argLength: 1, reg: gp11, asm: "BFX", aux: "Int32"}, // extract W bits from bit L in arg0, then signed extend
+ {name: "BFXU", argLength: 1, reg: gp11, asm: "BFXU", aux: "Int32"}, // extract W bits from bit L in arg0, then unsigned extend
+
+ // unary ops
+ {name: "MVN", argLength: 1, reg: gp11, asm: "MVN"}, // ^arg0
+
+ {name: "NEGF", argLength: 1, reg: fp11, asm: "NEGF"}, // -arg0, float32
+ {name: "NEGD", argLength: 1, reg: fp11, asm: "NEGD"}, // -arg0, float64
+ {name: "SQRTD", argLength: 1, reg: fp11, asm: "SQRTD"}, // sqrt(arg0), float64
+ {name: "SQRTF", argLength: 1, reg: fp11, asm: "SQRTF"}, // sqrt(arg0), float32
+ {name: "ABSD", argLength: 1, reg: fp11, asm: "ABSD"}, // abs(arg0), float64
+
+ {name: "CLZ", argLength: 1, reg: gp11, asm: "CLZ"}, // count leading zero
+ {name: "REV", argLength: 1, reg: gp11, asm: "REV"}, // reverse byte order
+ {name: "REV16", argLength: 1, reg: gp11, asm: "REV16"}, // reverse byte order in 16-bit halfwords
+ {name: "RBIT", argLength: 1, reg: gp11, asm: "RBIT"}, // reverse bit order
+
+ // shifts
+ {name: "SLL", argLength: 2, reg: gp21, asm: "SLL"}, // arg0 << arg1, shift amount is mod 256
+ {name: "SLLconst", argLength: 1, reg: gp11, asm: "SLL", aux: "Int32"}, // arg0 << auxInt, 0 <= auxInt < 32
+ {name: "SRL", argLength: 2, reg: gp21, asm: "SRL"}, // arg0 >> arg1, unsigned, shift amount is mod 256
+ {name: "SRLconst", argLength: 1, reg: gp11, asm: "SRL", aux: "Int32"}, // arg0 >> auxInt, unsigned, 0 <= auxInt < 32
+ {name: "SRA", argLength: 2, reg: gp21, asm: "SRA"}, // arg0 >> arg1, signed, shift amount is mod 256
+ {name: "SRAconst", argLength: 1, reg: gp11, asm: "SRA", aux: "Int32"}, // arg0 >> auxInt, signed, 0 <= auxInt < 32
+ {name: "SRR", argLength: 2, reg: gp21}, // arg0 right rotate by arg1 bits
+ {name: "SRRconst", argLength: 1, reg: gp11, aux: "Int32"}, // arg0 right rotate by auxInt bits, 0 <= auxInt < 32
+
+ // auxInt for each of these satisfies 0 <= auxInt < 32
+ {name: "ADDshiftLL", argLength: 2, reg: gp21, asm: "ADD", aux: "Int32"}, // arg0 + arg1<<auxInt
+ {name: "ADDshiftRL", argLength: 2, reg: gp21, asm: "ADD", aux: "Int32"}, // arg0 + arg1>>auxInt, unsigned shift
+ {name: "ADDshiftRA", argLength: 2, reg: gp21, asm: "ADD", aux: "Int32"}, // arg0 + arg1>>auxInt, signed shift
+ {name: "SUBshiftLL", argLength: 2, reg: gp21, asm: "SUB", aux: "Int32"}, // arg0 - arg1<<auxInt
+ {name: "SUBshiftRL", argLength: 2, reg: gp21, asm: "SUB", aux: "Int32"}, // arg0 - arg1>>auxInt, unsigned shift
+ {name: "SUBshiftRA", argLength: 2, reg: gp21, asm: "SUB", aux: "Int32"}, // arg0 - arg1>>auxInt, signed shift
+ {name: "RSBshiftLL", argLength: 2, reg: gp21, asm: "RSB", aux: "Int32"}, // arg1<<auxInt - arg0
+ {name: "RSBshiftRL", argLength: 2, reg: gp21, asm: "RSB", aux: "Int32"}, // arg1>>auxInt - arg0, unsigned shift
+ {name: "RSBshiftRA", argLength: 2, reg: gp21, asm: "RSB", aux: "Int32"}, // arg1>>auxInt - arg0, signed shift
+ {name: "ANDshiftLL", argLength: 2, reg: gp21, asm: "AND", aux: "Int32"}, // arg0 & (arg1<<auxInt)
+ {name: "ANDshiftRL", argLength: 2, reg: gp21, asm: "AND", aux: "Int32"}, // arg0 & (arg1>>auxInt), unsigned shift
+ {name: "ANDshiftRA", argLength: 2, reg: gp21, asm: "AND", aux: "Int32"}, // arg0 & (arg1>>auxInt), signed shift
+ {name: "ORshiftLL", argLength: 2, reg: gp21, asm: "ORR", aux: "Int32"}, // arg0 | arg1<<auxInt
+ {name: "ORshiftRL", argLength: 2, reg: gp21, asm: "ORR", aux: "Int32"}, // arg0 | arg1>>auxInt, unsigned shift
+ {name: "ORshiftRA", argLength: 2, reg: gp21, asm: "ORR", aux: "Int32"}, // arg0 | arg1>>auxInt, signed shift
+ {name: "XORshiftLL", argLength: 2, reg: gp21, asm: "EOR", aux: "Int32"}, // arg0 ^ arg1<<auxInt
+ {name: "XORshiftRL", argLength: 2, reg: gp21, asm: "EOR", aux: "Int32"}, // arg0 ^ arg1>>auxInt, unsigned shift
+ {name: "XORshiftRA", argLength: 2, reg: gp21, asm: "EOR", aux: "Int32"}, // arg0 ^ arg1>>auxInt, signed shift
+ {name: "XORshiftRR", argLength: 2, reg: gp21, asm: "EOR", aux: "Int32"}, // arg0 ^ (arg1 right rotate by auxInt)
+ {name: "BICshiftLL", argLength: 2, reg: gp21, asm: "BIC", aux: "Int32"}, // arg0 &^ (arg1<<auxInt)
+ {name: "BICshiftRL", argLength: 2, reg: gp21, asm: "BIC", aux: "Int32"}, // arg0 &^ (arg1>>auxInt), unsigned shift
+ {name: "BICshiftRA", argLength: 2, reg: gp21, asm: "BIC", aux: "Int32"}, // arg0 &^ (arg1>>auxInt), signed shift
+ {name: "MVNshiftLL", argLength: 1, reg: gp11, asm: "MVN", aux: "Int32"}, // ^(arg0<<auxInt)
+ {name: "MVNshiftRL", argLength: 1, reg: gp11, asm: "MVN", aux: "Int32"}, // ^(arg0>>auxInt), unsigned shift
+ {name: "MVNshiftRA", argLength: 1, reg: gp11, asm: "MVN", aux: "Int32"}, // ^(arg0>>auxInt), signed shift
+
+ {name: "ADCshiftLL", argLength: 3, reg: gp2flags1, asm: "ADC", aux: "Int32"}, // arg0 + arg1<<auxInt + carry, arg2=flags
+ {name: "ADCshiftRL", argLength: 3, reg: gp2flags1, asm: "ADC", aux: "Int32"}, // arg0 + arg1>>auxInt + carry, unsigned shift, arg2=flags
+ {name: "ADCshiftRA", argLength: 3, reg: gp2flags1, asm: "ADC", aux: "Int32"}, // arg0 + arg1>>auxInt + carry, signed shift, arg2=flags
+ {name: "SBCshiftLL", argLength: 3, reg: gp2flags1, asm: "SBC", aux: "Int32"}, // arg0 - arg1<<auxInt - carry, arg2=flags
+ {name: "SBCshiftRL", argLength: 3, reg: gp2flags1, asm: "SBC", aux: "Int32"}, // arg0 - arg1>>auxInt - carry, unsigned shift, arg2=flags
+ {name: "SBCshiftRA", argLength: 3, reg: gp2flags1, asm: "SBC", aux: "Int32"}, // arg0 - arg1>>auxInt - carry, signed shift, arg2=flags
+ {name: "RSCshiftLL", argLength: 3, reg: gp2flags1, asm: "RSC", aux: "Int32"}, // arg1<<auxInt - arg0 - carry, arg2=flags
+ {name: "RSCshiftRL", argLength: 3, reg: gp2flags1, asm: "RSC", aux: "Int32"}, // arg1>>auxInt - arg0 - carry, unsigned shift, arg2=flags
+ {name: "RSCshiftRA", argLength: 3, reg: gp2flags1, asm: "RSC", aux: "Int32"}, // arg1>>auxInt - arg0 - carry, signed shift, arg2=flags
+
+ {name: "ADDSshiftLL", argLength: 2, reg: gp21carry, asm: "ADD", aux: "Int32"}, // arg0 + arg1<<auxInt, set carry flag
+ {name: "ADDSshiftRL", argLength: 2, reg: gp21carry, asm: "ADD", aux: "Int32"}, // arg0 + arg1>>auxInt, unsigned shift, set carry flag
+ {name: "ADDSshiftRA", argLength: 2, reg: gp21carry, asm: "ADD", aux: "Int32"}, // arg0 + arg1>>auxInt, signed shift, set carry flag
+ {name: "SUBSshiftLL", argLength: 2, reg: gp21carry, asm: "SUB", aux: "Int32"}, // arg0 - arg1<<auxInt, set carry flag
+ {name: "SUBSshiftRL", argLength: 2, reg: gp21carry, asm: "SUB", aux: "Int32"}, // arg0 - arg1>>auxInt, unsigned shift, set carry flag
+ {name: "SUBSshiftRA", argLength: 2, reg: gp21carry, asm: "SUB", aux: "Int32"}, // arg0 - arg1>>auxInt, signed shift, set carry flag
+ {name: "RSBSshiftLL", argLength: 2, reg: gp21carry, asm: "RSB", aux: "Int32"}, // arg1<<auxInt - arg0, set carry flag
+ {name: "RSBSshiftRL", argLength: 2, reg: gp21carry, asm: "RSB", aux: "Int32"}, // arg1>>auxInt - arg0, unsigned shift, set carry flag
+ {name: "RSBSshiftRA", argLength: 2, reg: gp21carry, asm: "RSB", aux: "Int32"}, // arg1>>auxInt - arg0, signed shift, set carry flag
+
+ {name: "ADDshiftLLreg", argLength: 3, reg: gp31, asm: "ADD"}, // arg0 + arg1<<arg2
+ {name: "ADDshiftRLreg", argLength: 3, reg: gp31, asm: "ADD"}, // arg0 + arg1>>arg2, unsigned shift
+ {name: "ADDshiftRAreg", argLength: 3, reg: gp31, asm: "ADD"}, // arg0 + arg1>>arg2, signed shift
+ {name: "SUBshiftLLreg", argLength: 3, reg: gp31, asm: "SUB"}, // arg0 - arg1<<arg2
+ {name: "SUBshiftRLreg", argLength: 3, reg: gp31, asm: "SUB"}, // arg0 - arg1>>arg2, unsigned shift
+ {name: "SUBshiftRAreg", argLength: 3, reg: gp31, asm: "SUB"}, // arg0 - arg1>>arg2, signed shift
+ {name: "RSBshiftLLreg", argLength: 3, reg: gp31, asm: "RSB"}, // arg1<<arg2 - arg0
+ {name: "RSBshiftRLreg", argLength: 3, reg: gp31, asm: "RSB"}, // arg1>>arg2 - arg0, unsigned shift
+ {name: "RSBshiftRAreg", argLength: 3, reg: gp31, asm: "RSB"}, // arg1>>arg2 - arg0, signed shift
+ {name: "ANDshiftLLreg", argLength: 3, reg: gp31, asm: "AND"}, // arg0 & (arg1<<arg2)
+ {name: "ANDshiftRLreg", argLength: 3, reg: gp31, asm: "AND"}, // arg0 & (arg1>>arg2), unsigned shift
+ {name: "ANDshiftRAreg", argLength: 3, reg: gp31, asm: "AND"}, // arg0 & (arg1>>arg2), signed shift
+ {name: "ORshiftLLreg", argLength: 3, reg: gp31, asm: "ORR"}, // arg0 | arg1<<arg2
+ {name: "ORshiftRLreg", argLength: 3, reg: gp31, asm: "ORR"}, // arg0 | arg1>>arg2, unsigned shift
+ {name: "ORshiftRAreg", argLength: 3, reg: gp31, asm: "ORR"}, // arg0 | arg1>>arg2, signed shift
+ {name: "XORshiftLLreg", argLength: 3, reg: gp31, asm: "EOR"}, // arg0 ^ arg1<<arg2
+ {name: "XORshiftRLreg", argLength: 3, reg: gp31, asm: "EOR"}, // arg0 ^ arg1>>arg2, unsigned shift
+ {name: "XORshiftRAreg", argLength: 3, reg: gp31, asm: "EOR"}, // arg0 ^ arg1>>arg2, signed shift
+ {name: "BICshiftLLreg", argLength: 3, reg: gp31, asm: "BIC"}, // arg0 &^ (arg1<<arg2)
+ {name: "BICshiftRLreg", argLength: 3, reg: gp31, asm: "BIC"}, // arg0 &^ (arg1>>arg2), unsigned shift
+ {name: "BICshiftRAreg", argLength: 3, reg: gp31, asm: "BIC"}, // arg0 &^ (arg1>>arg2), signed shift
+ {name: "MVNshiftLLreg", argLength: 2, reg: gp21, asm: "MVN"}, // ^(arg0<<arg1)
+ {name: "MVNshiftRLreg", argLength: 2, reg: gp21, asm: "MVN"}, // ^(arg0>>arg1), unsigned shift
+ {name: "MVNshiftRAreg", argLength: 2, reg: gp21, asm: "MVN"}, // ^(arg0>>arg1), signed shift
+
+ {name: "ADCshiftLLreg", argLength: 4, reg: gp3flags1, asm: "ADC"}, // arg0 + arg1<<arg2 + carry, arg3=flags
+ {name: "ADCshiftRLreg", argLength: 4, reg: gp3flags1, asm: "ADC"}, // arg0 + arg1>>arg2 + carry, unsigned shift, arg3=flags
+ {name: "ADCshiftRAreg", argLength: 4, reg: gp3flags1, asm: "ADC"}, // arg0 + arg1>>arg2 + carry, signed shift, arg3=flags
+ {name: "SBCshiftLLreg", argLength: 4, reg: gp3flags1, asm: "SBC"}, // arg0 - arg1<<arg2 - carry, arg3=flags
+ {name: "SBCshiftRLreg", argLength: 4, reg: gp3flags1, asm: "SBC"}, // arg0 - arg1>>arg2 - carry, unsigned shift, arg3=flags
+ {name: "SBCshiftRAreg", argLength: 4, reg: gp3flags1, asm: "SBC"}, // arg0 - arg1>>arg2 - carry, signed shift, arg3=flags
+ {name: "RSCshiftLLreg", argLength: 4, reg: gp3flags1, asm: "RSC"}, // arg1<<arg2 - arg0 - carry, arg3=flags
+ {name: "RSCshiftRLreg", argLength: 4, reg: gp3flags1, asm: "RSC"}, // arg1>>arg2 - arg0 - carry, unsigned shift, arg3=flags
+ {name: "RSCshiftRAreg", argLength: 4, reg: gp3flags1, asm: "RSC"}, // arg1>>arg2 - arg0 - carry, signed shift, arg3=flags
+
+ {name: "ADDSshiftLLreg", argLength: 3, reg: gp31carry, asm: "ADD"}, // arg0 + arg1<<arg2, set carry flag
+ {name: "ADDSshiftRLreg", argLength: 3, reg: gp31carry, asm: "ADD"}, // arg0 + arg1>>arg2, unsigned shift, set carry flag
+ {name: "ADDSshiftRAreg", argLength: 3, reg: gp31carry, asm: "ADD"}, // arg0 + arg1>>arg2, signed shift, set carry flag
+ {name: "SUBSshiftLLreg", argLength: 3, reg: gp31carry, asm: "SUB"}, // arg0 - arg1<<arg2, set carry flag
+ {name: "SUBSshiftRLreg", argLength: 3, reg: gp31carry, asm: "SUB"}, // arg0 - arg1>>arg2, unsigned shift, set carry flag
+ {name: "SUBSshiftRAreg", argLength: 3, reg: gp31carry, asm: "SUB"}, // arg0 - arg1>>arg2, signed shift, set carry flag
+ {name: "RSBSshiftLLreg", argLength: 3, reg: gp31carry, asm: "RSB"}, // arg1<<arg2 - arg0, set carry flag
+ {name: "RSBSshiftRLreg", argLength: 3, reg: gp31carry, asm: "RSB"}, // arg1>>arg2 - arg0, unsigned shift, set carry flag
+ {name: "RSBSshiftRAreg", argLength: 3, reg: gp31carry, asm: "RSB"}, // arg1>>arg2 - arg0, signed shift, set carry flag
+
+ // comparisons
+ {name: "CMP", argLength: 2, reg: gp2flags, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPconst", argLength: 1, reg: gp1flags, asm: "CMP", aux: "Int32", typ: "Flags"}, // arg0 compare to auxInt
+ {name: "CMN", argLength: 2, reg: gp2flags, asm: "CMN", typ: "Flags", commutative: true}, // arg0 compare to -arg1, provided arg1 is not 1<<31 (the 32-bit minimum, whose negation overflows)
+ {name: "CMNconst", argLength: 1, reg: gp1flags, asm: "CMN", aux: "Int32", typ: "Flags"}, // arg0 compare to -auxInt
+ {name: "TST", argLength: 2, reg: gp2flags, asm: "TST", typ: "Flags", commutative: true}, // arg0 & arg1 compare to 0
+ {name: "TSTconst", argLength: 1, reg: gp1flags, asm: "TST", aux: "Int32", typ: "Flags"}, // arg0 & auxInt compare to 0
+ {name: "TEQ", argLength: 2, reg: gp2flags, asm: "TEQ", typ: "Flags", commutative: true}, // arg0 ^ arg1 compare to 0
+ {name: "TEQconst", argLength: 1, reg: gp1flags, asm: "TEQ", aux: "Int32", typ: "Flags"}, // arg0 ^ auxInt compare to 0
+ {name: "CMPF", argLength: 2, reg: fp2flags, asm: "CMPF", typ: "Flags"}, // arg0 compare to arg1, float32
+ {name: "CMPD", argLength: 2, reg: fp2flags, asm: "CMPD", typ: "Flags"}, // arg0 compare to arg1, float64
+
+ {name: "CMPshiftLL", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int32", typ: "Flags"}, // arg0 compare to arg1<<auxInt
+ {name: "CMPshiftRL", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int32", typ: "Flags"}, // arg0 compare to arg1>>auxInt, unsigned shift
+ {name: "CMPshiftRA", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int32", typ: "Flags"}, // arg0 compare to arg1>>auxInt, signed shift
+ {name: "CMNshiftLL", argLength: 2, reg: gp2flags, asm: "CMN", aux: "Int32", typ: "Flags"}, // arg0 compare to -(arg1<<auxInt)
+ {name: "CMNshiftRL", argLength: 2, reg: gp2flags, asm: "CMN", aux: "Int32", typ: "Flags"}, // arg0 compare to -(arg1>>auxInt), unsigned shift
+ {name: "CMNshiftRA", argLength: 2, reg: gp2flags, asm: "CMN", aux: "Int32", typ: "Flags"}, // arg0 compare to -(arg1>>auxInt), signed shift
+ {name: "TSTshiftLL", argLength: 2, reg: gp2flags, asm: "TST", aux: "Int32", typ: "Flags"}, // arg0 & (arg1<<auxInt) compare to 0
+ {name: "TSTshiftRL", argLength: 2, reg: gp2flags, asm: "TST", aux: "Int32", typ: "Flags"}, // arg0 & (arg1>>auxInt) compare to 0, unsigned shift
+ {name: "TSTshiftRA", argLength: 2, reg: gp2flags, asm: "TST", aux: "Int32", typ: "Flags"}, // arg0 & (arg1>>auxInt) compare to 0, signed shift
+ {name: "TEQshiftLL", argLength: 2, reg: gp2flags, asm: "TEQ", aux: "Int32", typ: "Flags"}, // arg0 ^ (arg1<<auxInt) compare to 0
+ {name: "TEQshiftRL", argLength: 2, reg: gp2flags, asm: "TEQ", aux: "Int32", typ: "Flags"}, // arg0 ^ (arg1>>auxInt) compare to 0, unsigned shift
+ {name: "TEQshiftRA", argLength: 2, reg: gp2flags, asm: "TEQ", aux: "Int32", typ: "Flags"}, // arg0 ^ (arg1>>auxInt) compare to 0, signed shift
+
+ {name: "CMPshiftLLreg", argLength: 3, reg: gp3flags, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1<<arg2
+ {name: "CMPshiftRLreg", argLength: 3, reg: gp3flags, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1>>arg2, unsigned shift
+ {name: "CMPshiftRAreg", argLength: 3, reg: gp3flags, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1>>arg2, signed shift
+ {name: "CMNshiftLLreg", argLength: 3, reg: gp3flags, asm: "CMN", typ: "Flags"}, // arg0 + (arg1<<arg2) compare to 0
+ {name: "CMNshiftRLreg", argLength: 3, reg: gp3flags, asm: "CMN", typ: "Flags"}, // arg0 + (arg1>>arg2) compare to 0, unsigned shift
+ {name: "CMNshiftRAreg", argLength: 3, reg: gp3flags, asm: "CMN", typ: "Flags"}, // arg0 + (arg1>>arg2) compare to 0, signed shift
+ {name: "TSTshiftLLreg", argLength: 3, reg: gp3flags, asm: "TST", typ: "Flags"}, // arg0 & (arg1<<arg2) compare to 0
+ {name: "TSTshiftRLreg", argLength: 3, reg: gp3flags, asm: "TST", typ: "Flags"}, // arg0 & (arg1>>arg2) compare to 0, unsigned shift
+ {name: "TSTshiftRAreg", argLength: 3, reg: gp3flags, asm: "TST", typ: "Flags"}, // arg0 & (arg1>>arg2) compare to 0, signed shift
+ {name: "TEQshiftLLreg", argLength: 3, reg: gp3flags, asm: "TEQ", typ: "Flags"}, // arg0 ^ (arg1<<arg2) compare to 0
+ {name: "TEQshiftRLreg", argLength: 3, reg: gp3flags, asm: "TEQ", typ: "Flags"}, // arg0 ^ (arg1>>arg2) compare to 0, unsigned shift
+ {name: "TEQshiftRAreg", argLength: 3, reg: gp3flags, asm: "TEQ", typ: "Flags"}, // arg0 ^ (arg1>>arg2) compare to 0, signed shift
+
+ {name: "CMPF0", argLength: 1, reg: fp1flags, asm: "CMPF", typ: "Flags"}, // arg0 compare to 0, float32
+ {name: "CMPD0", argLength: 1, reg: fp1flags, asm: "CMPD", typ: "Flags"}, // arg0 compare to 0, float64
+
+ // moves
+ {name: "MOVWconst", argLength: 0, reg: gp01, aux: "Int32", asm: "MOVW", typ: "UInt32", rematerializeable: true}, // 32 low bits of auxint
+ {name: "MOVFconst", argLength: 0, reg: fp01, aux: "Float64", asm: "MOVF", typ: "Float32", rematerializeable: true}, // auxint as 64-bit float, convert to 32-bit float
+ {name: "MOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "MOVD", typ: "Float64", rematerializeable: true}, // auxint as 64-bit float
+
+ {name: "MOVWaddr", argLength: 1, reg: regInfo{inputs: []regMask{buildReg("SP") | buildReg("SB")}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVW", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB
+
+ {name: "MOVBload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVB", typ: "Int8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVBUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVBU", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVH", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVHU", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVWload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVW", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVFload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVF", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVDload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVD", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+
+ {name: "MOVBstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVHstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVWstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVFstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVF", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVDstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+
+ {name: "MOVWloadidx", argLength: 3, reg: gp2load, asm: "MOVW", typ: "UInt32"}, // load from arg0 + arg1. arg2=mem
+ {name: "MOVWloadshiftLL", argLength: 3, reg: gp2load, asm: "MOVW", aux: "Int32", typ: "UInt32"}, // load from arg0 + arg1<<auxInt. arg2=mem
+ {name: "MOVWloadshiftRL", argLength: 3, reg: gp2load, asm: "MOVW", aux: "Int32", typ: "UInt32"}, // load from arg0 + arg1>>auxInt, unsigned shift. arg2=mem
+ {name: "MOVWloadshiftRA", argLength: 3, reg: gp2load, asm: "MOVW", aux: "Int32", typ: "UInt32"}, // load from arg0 + arg1>>auxInt, signed shift. arg2=mem
+ {name: "MOVBUloadidx", argLength: 3, reg: gp2load, asm: "MOVBU", typ: "UInt8"}, // load from arg0 + arg1. arg2=mem
+ {name: "MOVBloadidx", argLength: 3, reg: gp2load, asm: "MOVB", typ: "Int8"}, // load from arg0 + arg1. arg2=mem
+ {name: "MOVHUloadidx", argLength: 3, reg: gp2load, asm: "MOVHU", typ: "UInt16"}, // load from arg0 + arg1. arg2=mem
+ {name: "MOVHloadidx", argLength: 3, reg: gp2load, asm: "MOVH", typ: "Int16"}, // load from arg0 + arg1. arg2=mem
+
+ {name: "MOVWstoreidx", argLength: 4, reg: gp2store, asm: "MOVW", typ: "Mem"}, // store arg2 to arg0 + arg1. arg3=mem
+ {name: "MOVWstoreshiftLL", argLength: 4, reg: gp2store, asm: "MOVW", aux: "Int32", typ: "Mem"}, // store arg2 to arg0 + arg1<<auxInt. arg3=mem
+ {name: "MOVWstoreshiftRL", argLength: 4, reg: gp2store, asm: "MOVW", aux: "Int32", typ: "Mem"}, // store arg2 to arg0 + arg1>>auxInt, unsigned shift. arg3=mem
+ {name: "MOVWstoreshiftRA", argLength: 4, reg: gp2store, asm: "MOVW", aux: "Int32", typ: "Mem"}, // store arg2 to arg0 + arg1>>auxInt, signed shift. arg3=mem
+ {name: "MOVBstoreidx", argLength: 4, reg: gp2store, asm: "MOVB", typ: "Mem"}, // store arg2 to arg0 + arg1. arg3=mem
+ {name: "MOVHstoreidx", argLength: 4, reg: gp2store, asm: "MOVH", typ: "Mem"}, // store arg2 to arg0 + arg1. arg3=mem
+
+ {name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVBS"}, // move from arg0, sign-extended from byte
+ {name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, zero-extended from byte
+ {name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVHS"}, // move from arg0, sign-extended from half
+ {name: "MOVHUreg", argLength: 1, reg: gp11, asm: "MOVHU"}, // move from arg0, zero-extended from half
+ {name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW"}, // move from arg0
+
+ {name: "MOVWnop", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}, resultInArg0: true}, // nop, return arg0 in same register
+
+ {name: "MOVWF", argLength: 1, reg: gpfp, asm: "MOVWF"}, // int32 -> float32
+ {name: "MOVWD", argLength: 1, reg: gpfp, asm: "MOVWD"}, // int32 -> float64
+ {name: "MOVWUF", argLength: 1, reg: gpfp, asm: "MOVWF"}, // uint32 -> float32, set U bit in the instruction
+ {name: "MOVWUD", argLength: 1, reg: gpfp, asm: "MOVWD"}, // uint32 -> float64, set U bit in the instruction
+ {name: "MOVFW", argLength: 1, reg: fpgp, asm: "MOVFW"}, // float32 -> int32
+ {name: "MOVDW", argLength: 1, reg: fpgp, asm: "MOVDW"}, // float64 -> int32
+ {name: "MOVFWU", argLength: 1, reg: fpgp, asm: "MOVFW"}, // float32 -> uint32, set U bit in the instruction
+ {name: "MOVDWU", argLength: 1, reg: fpgp, asm: "MOVDW"}, // float64 -> uint32, set U bit in the instruction
+ {name: "MOVFD", argLength: 1, reg: fp11, asm: "MOVFD"}, // float32 -> float64
+ {name: "MOVDF", argLength: 1, reg: fp11, asm: "MOVDF"}, // float64 -> float32
+
+ // conditional instructions, for lowering shifts
+ {name: "CMOVWHSconst", argLength: 2, reg: gp1flags1, asm: "MOVW", aux: "Int32", resultInArg0: true}, // replace arg0 w/ const if flags indicates HS, arg1=flags
+ {name: "CMOVWLSconst", argLength: 2, reg: gp1flags1, asm: "MOVW", aux: "Int32", resultInArg0: true}, // replace arg0 w/ const if flags indicates LS, arg1=flags
+ {name: "SRAcond", argLength: 3, reg: gp2flags1, asm: "SRA"}, // arg0 >> 31 if flags indicates HS, arg0 >> arg1 otherwise, signed shift, arg2=flags
+
+ // function calls
+ {name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "CALLtail", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gpsp, buildReg("R7"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+ {name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+
+ // pseudo-ops
+ {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpg}}, nilCheck: true, faultOnNilArg0: true}, // panic if arg0 is nil. arg1=mem.
+
+ {name: "Equal", argLength: 1, reg: readflags}, // bool, true if flags encode x==y, false otherwise.
+ {name: "NotEqual", argLength: 1, reg: readflags}, // bool, true if flags encode x!=y, false otherwise.
+ {name: "LessThan", argLength: 1, reg: readflags}, // bool, true if flags encode signed x<y, false otherwise.
+ {name: "LessEqual", argLength: 1, reg: readflags}, // bool, true if flags encode signed x<=y, false otherwise.
+ {name: "GreaterThan", argLength: 1, reg: readflags}, // bool, true if flags encode signed x>y, false otherwise.
+ {name: "GreaterEqual", argLength: 1, reg: readflags}, // bool, true if flags encode signed x>=y, false otherwise.
+ {name: "LessThanU", argLength: 1, reg: readflags}, // bool, true if flags encode unsigned x<y, false otherwise.
+ {name: "LessEqualU", argLength: 1, reg: readflags}, // bool, true if flags encode unsigned x<=y, false otherwise.
+ {name: "GreaterThanU", argLength: 1, reg: readflags}, // bool, true if flags encode unsigned x>y, false otherwise.
+ {name: "GreaterEqualU", argLength: 1, reg: readflags}, // bool, true if flags encode unsigned x>=y, false otherwise.
+
+ // duffzero (must be 4-byte aligned)
+ // arg0 = address of memory to zero (in R1, changed as side effect)
+ // arg1 = value to store (always zero)
+ // arg2 = mem
+ // auxint = offset into duffzero code to start executing
+ // returns mem
+ {
+ name: "DUFFZERO",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R1"), buildReg("R0")},
+ clobbers: buildReg("R1 R12 R14"), // R14 is LR, R12 is linker trampoline scratch register
+ },
+ faultOnNilArg0: true,
+ },
+
+ // duffcopy (must be 4-byte aligned)
+ // arg0 = address of dst memory (in R2, changed as side effect)
+ // arg1 = address of src memory (in R1, changed as side effect)
+ // arg2 = mem
+ // auxint = offset into duffcopy code to start executing
+ // returns mem
+ {
+ name: "DUFFCOPY",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R2"), buildReg("R1")},
+ clobbers: buildReg("R0 R1 R2 R12 R14"), // R14 is LR, R12 is linker trampoline scratch register
+ },
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // large or unaligned zeroing
+ // arg0 = address of memory to zero (in R1, changed as side effect)
+ // arg1 = address of the last element to zero
+ // arg2 = value to store (always zero)
+ // arg3 = mem
+ // returns mem
+ // MOVW.P Rarg2, 4(R1)
+ // CMP R1, Rarg1
+ // BLE -2(PC)
+ {
+ name: "LoweredZero",
+ aux: "Int64",
+ argLength: 4,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R1"), gp, gp},
+ clobbers: buildReg("R1"),
+ },
+ clobberFlags: true,
+ faultOnNilArg0: true,
+ },
+
+ // large or unaligned move
+ // arg0 = address of dst memory (in R2, changed as side effect)
+ // arg1 = address of src memory (in R1, changed as side effect)
+ // arg2 = address of the last element of src
+ // arg3 = mem
+ // returns mem
+ // MOVW.P 4(R1), Rtmp
+ // MOVW.P Rtmp, 4(R2)
+ // CMP R1, Rarg2
+ // BLE -3(PC)
+ {
+ name: "LoweredMove",
+ aux: "Int64",
+ argLength: 4,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R2"), buildReg("R1"), gp},
+ clobbers: buildReg("R1 R2"),
+ },
+ clobberFlags: true,
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
+ // and sorts it to the very beginning of the block to prevent other
+ // use of R7 (arm.REGCTXT, the closure pointer)
+ {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("R7")}}, zeroWidth: true},
+
+ // LoweredGetCallerSP returns the SP of the caller of the current function.
+ {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true},
+
+ // LoweredGetCallerPC evaluates to the PC to which its "caller" will return.
+ // I.e., if f calls g and g "calls" getcallerpc,
+ // the result should be the PC within f that g will return to.
+ // See runtime/stubs.go for a more detailed discussion.
+ {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true},
+
+ // There are three of these functions so that they can have three different register inputs.
+ // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
+ // default registers to match so we don't need to copy registers around unnecessarily.
+ {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r0, r1}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ // Extend ops are the same as Bounds ops except the indexes are 64-bit.
+ {name: "LoweredPanicExtendA", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r4, r2, r3}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
+ {name: "LoweredPanicExtendB", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r4, r1, r2}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
+ {name: "LoweredPanicExtendC", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r4, r0, r1}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
+
+ // Constant flag value.
+ // Note: there's an "unordered" outcome for floating-point
+ // comparisons, but we don't use such a beast yet.
+ // This op is for temporary use by rewrite rules. It
+ // cannot appear in the generated assembly.
+ {name: "FlagConstant", aux: "FlagConstant"},
+
+ // (InvertFlags (CMP a b)) == (CMP b a)
+ // InvertFlags is a pseudo-op which can't appear in assembly output.
+ {name: "InvertFlags", argLength: 1}, // reverse direction of arg0
+
+ // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+ // It saves all GP registers if necessary,
+ // but clobbers R14 (LR) because it's a call, and R12 which is linker trampoline scratch register.
+ {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("R2"), buildReg("R3")}, clobbers: (callerSave &^ gpg) | buildReg("R12 R14")}, clobberFlags: true, aux: "Sym", symEffect: "None"},
+ }
+
+ blocks := []blockData{
+ {name: "EQ", controls: 1},
+ {name: "NE", controls: 1},
+ {name: "LT", controls: 1},
+ {name: "LE", controls: 1},
+ {name: "GT", controls: 1},
+ {name: "GE", controls: 1},
+ {name: "ULT", controls: 1},
+ {name: "ULE", controls: 1},
+ {name: "UGT", controls: 1},
+ {name: "UGE", controls: 1},
+ {name: "LTnoov", controls: 1}, // 'LT' but without honoring overflow
+ {name: "LEnoov", controls: 1}, // 'LE' but without honoring overflow
+ {name: "GTnoov", controls: 1}, // 'GT' but without honoring overflow
+ {name: "GEnoov", controls: 1}, // 'GE' but without honoring overflow
+ }
+
+ archs = append(archs, arch{
+ name: "ARM",
+ pkg: "cmd/internal/obj/arm",
+ genfile: "../../arm/ssa.go",
+ ops: ops,
+ blocks: blocks,
+ regnames: regNamesARM,
+ gpregmask: gp,
+ fpregmask: fp,
+ framepointerreg: -1, // not used
+ linkreg: int8(num["R14"]),
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules
new file mode 100644
index 0000000..1caaf13
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules
@@ -0,0 +1,694 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+(Add(Ptr|64|32|16|8) ...) => (ADDV ...)
+(Add(32|64)F ...) => (ADD(F|D) ...)
+
+(Sub(Ptr|64|32|16|8) ...) => (SUBV ...)
+(Sub(32|64)F ...) => (SUB(F|D) ...)
+
+(Mul(64|32|16|8) x y) => (Select1 (MULVU x y)) // Select1 of MULVU is used as the product (Hmul64u below takes Select0)
+(Mul(32|64)F ...) => (MUL(F|D) ...)
+(Mul64uhilo ...) => (MULVU ...)
+(Select0 (Mul64uover x y)) => (Select1 <typ.UInt64> (MULVU x y)) // product: same half of MULVU as plain Mul64
+(Select1 (Mul64uover x y)) => (SGTU <typ.Bool> (Select0 <typ.UInt64> (MULVU x y)) (MOVVconst <typ.UInt64> [0])) // overflow flag: the other half is >u 0, i.e. nonzero
+
+(Hmul64 x y) => (Select0 (MULV x y))
+(Hmul64u x y) => (Select0 (MULVU x y))
+(Hmul32 x y) => (SRAVconst (Select1 <typ.Int64> (MULV (SignExt32to64 x) (SignExt32to64 y))) [32]) // sign-extend, multiply, then shift the product right by 32
+(Hmul32u x y) => (SRLVconst (Select1 <typ.UInt64> (MULVU (ZeroExt32to64 x) (ZeroExt32to64 y))) [32]) // zero-extend, multiply, then shift the product right by 32
+
+(Div64 x y) => (Select1 (DIVV x y)) // Select1 of DIVV/DIVVU is used as the quotient
+(Div64u x y) => (Select1 (DIVVU x y))
+(Div32 x y) => (Select1 (DIVV (SignExt32to64 x) (SignExt32to64 y))) // narrower operands are widened to 64 bits first: sign-extended for signed ops, zero-extended for unsigned ops
+(Div32u x y) => (Select1 (DIVVU (ZeroExt32to64 x) (ZeroExt32to64 y)))
+(Div16 x y) => (Select1 (DIVV (SignExt16to64 x) (SignExt16to64 y)))
+(Div16u x y) => (Select1 (DIVVU (ZeroExt16to64 x) (ZeroExt16to64 y)))
+(Div8 x y) => (Select1 (DIVV (SignExt8to64 x) (SignExt8to64 y)))
+(Div8u x y) => (Select1 (DIVVU (ZeroExt8to64 x) (ZeroExt8to64 y)))
+(Div(32|64)F ...) => (DIV(F|D) ...)
+
+(Mod64 x y) => (Select0 (DIVV x y)) // Select0 of DIVV/DIVVU is used as the remainder
+(Mod64u x y) => (Select0 (DIVVU x y))
+(Mod32 x y) => (Select0 (DIVV (SignExt32to64 x) (SignExt32to64 y)))
+(Mod32u x y) => (Select0 (DIVVU (ZeroExt32to64 x) (ZeroExt32to64 y)))
+(Mod16 x y) => (Select0 (DIVV (SignExt16to64 x) (SignExt16to64 y)))
+(Mod16u x y) => (Select0 (DIVVU (ZeroExt16to64 x) (ZeroExt16to64 y)))
+(Mod8 x y) => (Select0 (DIVV (SignExt8to64 x) (SignExt8to64 y)))
+(Mod8u x y) => (Select0 (DIVVU (ZeroExt8to64 x) (ZeroExt8to64 y)))
+
+(Select0 <t> (Add64carry x y c)) => (ADDV (ADDV <t> x y) c) // sum = x + y + c
+(Select1 <t> (Add64carry x y c)) => // carry-out: set if either of the two additions wrapped around
+ (OR (SGTU <t> x s:(ADDV <t> x y)) (SGTU <t> s (ADDV <t> s c))) // (x >u x+y) | (s >u s+c), with s = x+y
+
+(Select0 <t> (Sub64borrow x y c)) => (SUBV (SUBV <t> x y) c) // difference = x - y - c
+(Select1 <t> (Sub64borrow x y c)) => // borrow-out: set if either of the two subtractions wrapped around
+ (OR (SGTU <t> s:(SUBV <t> x y) x) (SGTU <t> (SUBV <t> s c) s)) // (x-y >u x) | (s-c >u s), with s = x-y
+
+// Avg64u: (x + y) / 2 with x >= y is computed as (x - y) / 2 + y, so the intermediate x + y is never formed and cannot overflow.
+(Avg64u <t> x y) => (ADDV (SRLVconst <t> (SUBV <t> x y) [1]) y)
+
+(And(64|32|16|8) ...) => (AND ...)
+(Or(64|32|16|8) ...) => (OR ...)
+(Xor(64|32|16|8) ...) => (XOR ...)
+
+// shifts
+// The hardware instruction uses only the low 6 bits of the shift amount,
+// so we compare the amount against 64 to get Go semantics for large shifts.
+(Lsh64x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y))
+(Lsh64x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y)))
+(Lsh64x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y)))
+(Lsh64x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y)))
+
+(Lsh32x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y))
+(Lsh32x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y)))
+(Lsh32x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y)))
+(Lsh32x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y)))
+
+(Lsh16x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y))
+(Lsh16x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y)))
+(Lsh16x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y)))
+(Lsh16x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y)))
+
+(Lsh8x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y))
+(Lsh8x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y)))
+(Lsh8x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y)))
+(Lsh8x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y)))
+
+(Rsh64Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> x y))
+(Rsh64Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> x (ZeroExt32to64 y)))
+(Rsh64Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> x (ZeroExt16to64 y)))
+(Rsh64Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> x (ZeroExt8to64 y)))
+
+(Rsh32Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> (ZeroExt32to64 x) y))
+(Rsh32Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> (ZeroExt32to64 x) (ZeroExt32to64 y)))
+(Rsh32Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> (ZeroExt32to64 x) (ZeroExt16to64 y)))
+(Rsh32Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> (ZeroExt32to64 x) (ZeroExt8to64 y)))
+
+(Rsh16Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> (ZeroExt16to64 x) y))
+(Rsh16Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> (ZeroExt16to64 x) (ZeroExt32to64 y)))
+(Rsh16Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> (ZeroExt16to64 x) (ZeroExt16to64 y)))
+(Rsh16Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> (ZeroExt16to64 x) (ZeroExt8to64 y)))
+
+(Rsh8Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> (ZeroExt8to64 x) y))
+(Rsh8Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> (ZeroExt8to64 x) (ZeroExt32to64 y)))
+(Rsh8Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> (ZeroExt8to64 x) (ZeroExt16to64 y)))
+(Rsh8Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> (ZeroExt8to64 x) (ZeroExt8to64 y)))
+
+(Rsh64x64 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y))
+(Rsh64x32 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y)))
+(Rsh64x16 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y)))
+(Rsh64x8 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y)))
+
+(Rsh32x64 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y))
+(Rsh32x32 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y)))
+(Rsh32x16 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y)))
+(Rsh32x8 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y)))
+
+(Rsh16x64 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y))
+(Rsh16x32 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y)))
+(Rsh16x16 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y)))
+(Rsh16x8 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y)))
+
+(Rsh8x64 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y))
+(Rsh8x32 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y)))
+(Rsh8x16 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y)))
+(Rsh8x8 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y)))
+
+// rotates: 8/16-bit rotates by a constant become a left/right shift pair; 32/64-bit rotate-left becomes rotate-right by the negated amount
+(RotateLeft8 <t> x (MOVVconst [c])) => (Or8 (Lsh8x64 <t> x (MOVVconst [c&7])) (Rsh8Ux64 <t> x (MOVVconst [-c&7])))
+(RotateLeft16 <t> x (MOVVconst [c])) => (Or16 (Lsh16x64 <t> x (MOVVconst [c&15])) (Rsh16Ux64 <t> x (MOVVconst [-c&15])))
+(RotateLeft32 x y) => (ROTR x (NEGV <y.Type> y))
+(RotateLeft64 x y) => (ROTRV x (NEGV <y.Type> y))
+
+// unary ops
+(Neg(64|32|16|8) ...) => (NEGV ...)
+(Neg(32|64)F ...) => (NEG(F|D) ...)
+
+(Com(64|32|16|8) x) => (NOR (MOVVconst [0]) x)
+
+(Sqrt ...) => (SQRTD ...)
+(Sqrt32 ...) => (SQRTF ...)
+
+// boolean ops -- booleans are represented with 0=false, 1=true
+(AndB ...) => (AND ...)
+(OrB ...) => (OR ...)
+(EqB x y) => (XOR (MOVVconst [1]) (XOR <typ.Bool> x y)) // 1 ^ (x ^ y): 1 exactly when the inner XOR is 0
+(NeqB ...) => (XOR ...) // x ^ y is 1 exactly when the two booleans differ
+(Not x) => (XORconst [1] x) // flip the low bit
+
+// constants
+(Const(64|32|16|8) [val]) => (MOVVconst [int64(val)])
+(Const(32|64)F [val]) => (MOV(F|D)const [float64(val)])
+(ConstNil) => (MOVVconst [0])
+(ConstBool [t]) => (MOVVconst [int64(b2i(t))])
+
+(Slicemask <t> x) => (SRAVconst (NEGV <t> x) [63])
+
+// truncations
+// Because we ignore high parts of registers, truncates are just copies.
+(Trunc16to8 ...) => (Copy ...)
+(Trunc32to8 ...) => (Copy ...)
+(Trunc32to16 ...) => (Copy ...)
+(Trunc64to8 ...) => (Copy ...)
+(Trunc64to16 ...) => (Copy ...)
+(Trunc64to32 ...) => (Copy ...)
+
+// Zero-/Sign-extensions
+(ZeroExt8to16 ...) => (MOVBUreg ...)
+(ZeroExt8to32 ...) => (MOVBUreg ...)
+(ZeroExt16to32 ...) => (MOVHUreg ...)
+(ZeroExt8to64 ...) => (MOVBUreg ...)
+(ZeroExt16to64 ...) => (MOVHUreg ...)
+(ZeroExt32to64 ...) => (MOVWUreg ...)
+
+(SignExt8to16 ...) => (MOVBreg ...)
+(SignExt8to32 ...) => (MOVBreg ...)
+(SignExt16to32 ...) => (MOVHreg ...)
+(SignExt8to64 ...) => (MOVBreg ...)
+(SignExt16to64 ...) => (MOVHreg ...)
+(SignExt32to64 ...) => (MOVWreg ...)
+
+// float <=> int conversion
+(Cvt32to32F ...) => (MOVWF ...)
+(Cvt32to64F ...) => (MOVWD ...)
+(Cvt64to32F ...) => (MOVVF ...)
+(Cvt64to64F ...) => (MOVVD ...)
+(Cvt32Fto32 ...) => (TRUNCFW ...)
+(Cvt64Fto32 ...) => (TRUNCDW ...)
+(Cvt32Fto64 ...) => (TRUNCFV ...)
+(Cvt64Fto64 ...) => (TRUNCDV ...)
+(Cvt32Fto64F ...) => (MOVFD ...)
+(Cvt64Fto32F ...) => (MOVDF ...)
+
+(CvtBoolToUint8 ...) => (Copy ...)
+
+(Round(32|64)F ...) => (Copy ...)
+
+// comparisons
+(Eq8 x y) => (SGTU (MOVVconst [1]) (XOR (ZeroExt8to64 x) (ZeroExt8to64 y))) // x == y  <=>  1 >u (x ^ y), i.e. the XOR of the widened operands is 0
+(Eq16 x y) => (SGTU (MOVVconst [1]) (XOR (ZeroExt16to64 x) (ZeroExt16to64 y)))
+(Eq32 x y) => (SGTU (MOVVconst [1]) (XOR (ZeroExt32to64 x) (ZeroExt32to64 y)))
+(Eq64 x y) => (SGTU (MOVVconst [1]) (XOR x y))
+(EqPtr x y) => (SGTU (MOVVconst [1]) (XOR x y))
+(Eq(32|64)F x y) => (FPFlagTrue (CMPEQ(F|D) x y))
+
+(Neq8 x y) => (SGTU (XOR (ZeroExt8to64 x) (ZeroExt8to64 y)) (MOVVconst [0])) // x != y  <=>  (x ^ y) >u 0, with both operands zero-extended to 64 bits
+(Neq16 x y) => (SGTU (XOR (ZeroExt16to64 x) (ZeroExt16to64 y)) (MOVVconst [0])) // both operands widened to 64 bits so the XOR inputs have the same type
+(Neq32 x y) => (SGTU (XOR (ZeroExt32to64 x) (ZeroExt32to64 y)) (MOVVconst [0]))
+(Neq64 x y) => (SGTU (XOR x y) (MOVVconst [0]))
+(NeqPtr x y) => (SGTU (XOR x y) (MOVVconst [0]))
+(Neq(32|64)F x y) => (FPFlagFalse (CMPEQ(F|D) x y)) // negation of the floating-point equality test
+
+(Less8 x y) => (SGT (SignExt8to64 y) (SignExt8to64 x)) // x < y  <=>  y > x, on sign-extended operands
+(Less16 x y) => (SGT (SignExt16to64 y) (SignExt16to64 x))
+(Less32 x y) => (SGT (SignExt32to64 y) (SignExt32to64 x))
+(Less64 x y) => (SGT y x)
+(Less(32|64)F x y) => (FPFlagTrue (CMPGT(F|D) y x)) // reverse operands to work around NaN
+
+(Less8U x y) => (SGTU (ZeroExt8to64 y) (ZeroExt8to64 x)) // unsigned: x <u y  <=>  y >u x, on zero-extended operands
+(Less16U x y) => (SGTU (ZeroExt16to64 y) (ZeroExt16to64 x))
+(Less32U x y) => (SGTU (ZeroExt32to64 y) (ZeroExt32to64 x))
+(Less64U x y) => (SGTU y x)
+
+(Leq8 x y) => (XOR (MOVVconst [1]) (SGT (SignExt8to64 x) (SignExt8to64 y))) // x <= y  <=>  !(x > y)
+(Leq16 x y) => (XOR (MOVVconst [1]) (SGT (SignExt16to64 x) (SignExt16to64 y)))
+(Leq32 x y) => (XOR (MOVVconst [1]) (SGT (SignExt32to64 x) (SignExt32to64 y)))
+(Leq64 x y) => (XOR (MOVVconst [1]) (SGT x y))
+(Leq(32|64)F x y) => (FPFlagTrue (CMPGE(F|D) y x)) // reverse operands to work around NaN
+
+(Leq8U x y) => (XOR (MOVVconst [1]) (SGTU (ZeroExt8to64 x) (ZeroExt8to64 y))) // unsigned: x <=u y  <=>  !(x >u y)
+(Leq16U x y) => (XOR (MOVVconst [1]) (SGTU (ZeroExt16to64 x) (ZeroExt16to64 y)))
+(Leq32U x y) => (XOR (MOVVconst [1]) (SGTU (ZeroExt32to64 x) (ZeroExt32to64 y)))
+(Leq64U x y) => (XOR (MOVVconst [1]) (SGTU x y))
+
+(OffPtr [off] ptr:(SP)) => (MOVVaddr [int32(off)] ptr)
+(OffPtr [off] ptr) => (ADDVconst [off] ptr)
+
+(Addr {sym} base) => (MOVVaddr {sym} base)
+(LocalAddr {sym} base _) => (MOVVaddr {sym} base)
+
+// loads
+(Load <t> ptr mem) && t.IsBoolean() => (MOVBUload ptr mem)
+(Load <t> ptr mem) && (is8BitInt(t) && isSigned(t)) => (MOVBload ptr mem)
+(Load <t> ptr mem) && (is8BitInt(t) && !isSigned(t)) => (MOVBUload ptr mem)
+(Load <t> ptr mem) && (is16BitInt(t) && isSigned(t)) => (MOVHload ptr mem)
+(Load <t> ptr mem) && (is16BitInt(t) && !isSigned(t)) => (MOVHUload ptr mem)
+(Load <t> ptr mem) && (is32BitInt(t) && isSigned(t)) => (MOVWload ptr mem)
+(Load <t> ptr mem) && (is32BitInt(t) && !isSigned(t)) => (MOVWUload ptr mem)
+(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVVload ptr mem)
+(Load <t> ptr mem) && is32BitFloat(t) => (MOVFload ptr mem)
+(Load <t> ptr mem) && is64BitFloat(t) => (MOVDload ptr mem)
+
+// stores
+(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && !is32BitFloat(val.Type) => (MOVWstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 && !is64BitFloat(val.Type) => (MOVVstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (MOVFstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (MOVDstore ptr val mem)
+
+// zeroing
+(Zero [0] _ mem) => mem
+(Zero [1] ptr mem) => (MOVBstore ptr (MOVVconst [0]) mem)
+(Zero [2] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore ptr (MOVVconst [0]) mem)
+(Zero [2] ptr mem) =>
+ (MOVBstore [1] ptr (MOVVconst [0])
+ (MOVBstore [0] ptr (MOVVconst [0]) mem))
+(Zero [4] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore ptr (MOVVconst [0]) mem)
+(Zero [4] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [2] ptr (MOVVconst [0])
+ (MOVHstore [0] ptr (MOVVconst [0]) mem))
+(Zero [4] ptr mem) =>
+ (MOVBstore [3] ptr (MOVVconst [0])
+ (MOVBstore [2] ptr (MOVVconst [0])
+ (MOVBstore [1] ptr (MOVVconst [0])
+ (MOVBstore [0] ptr (MOVVconst [0]) mem))))
+(Zero [8] {t} ptr mem) && t.Alignment()%8 == 0 =>
+ (MOVVstore ptr (MOVVconst [0]) mem)
+(Zero [8] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [4] ptr (MOVVconst [0])
+ (MOVWstore [0] ptr (MOVVconst [0]) mem))
+(Zero [8] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [6] ptr (MOVVconst [0])
+ (MOVHstore [4] ptr (MOVVconst [0])
+ (MOVHstore [2] ptr (MOVVconst [0])
+ (MOVHstore [0] ptr (MOVVconst [0]) mem))))
+
+(Zero [3] ptr mem) =>
+ (MOVBstore [2] ptr (MOVVconst [0])
+ (MOVBstore [1] ptr (MOVVconst [0])
+ (MOVBstore [0] ptr (MOVVconst [0]) mem)))
+(Zero [6] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [4] ptr (MOVVconst [0])
+ (MOVHstore [2] ptr (MOVVconst [0])
+ (MOVHstore [0] ptr (MOVVconst [0]) mem)))
+(Zero [12] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [8] ptr (MOVVconst [0])
+ (MOVWstore [4] ptr (MOVVconst [0])
+ (MOVWstore [0] ptr (MOVVconst [0]) mem)))
+(Zero [16] {t} ptr mem) && t.Alignment()%8 == 0 =>
+ (MOVVstore [8] ptr (MOVVconst [0])
+ (MOVVstore [0] ptr (MOVVconst [0]) mem))
+(Zero [24] {t} ptr mem) && t.Alignment()%8 == 0 =>
+ (MOVVstore [16] ptr (MOVVconst [0])
+ (MOVVstore [8] ptr (MOVVconst [0])
+ (MOVVstore [0] ptr (MOVVconst [0]) mem)))
+
+// Medium-sized zeroing uses a Duff's device.
+// 8 and 128 are magic constants; see runtime/mkduff.go.
+(Zero [s] {t} ptr mem)
+ && s%8 == 0 && s > 24 && s <= 8*128
+ && t.Alignment()%8 == 0 && !config.noDuffDevice =>
+ (DUFFZERO [8 * (128 - s/8)] ptr mem) // auxint shrinks as s grows; presumably an entry offset into duffzero (TODO confirm against runtime/mkduff.go)
+
+// Large or unaligned zeroing uses a loop.
+(Zero [s] {t} ptr mem)
+ && (s > 8*128 || config.noDuffDevice) || t.Alignment()%8 != 0 =>
+ (LoweredZero [t.Alignment()]
+ ptr
+ (ADDVconst <ptr.Type> ptr [s-moveSize(t.Alignment(), config)]) // second operand: ptr advanced by s minus moveSize(alignment)
+ mem)
+
+// moves
+(Move [0] _ _ mem) => mem
+(Move [1] dst src mem) => (MOVBstore dst (MOVBload src mem) mem)
+(Move [2] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore dst (MOVHload src mem) mem)
+(Move [2] dst src mem) =>
+ (MOVBstore [1] dst (MOVBload [1] src mem)
+ (MOVBstore dst (MOVBload src mem) mem))
+(Move [4] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore dst (MOVWload src mem) mem)
+(Move [4] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [2] dst (MOVHload [2] src mem)
+ (MOVHstore dst (MOVHload src mem) mem))
+(Move [4] dst src mem) =>
+ (MOVBstore [3] dst (MOVBload [3] src mem)
+ (MOVBstore [2] dst (MOVBload [2] src mem)
+ (MOVBstore [1] dst (MOVBload [1] src mem)
+ (MOVBstore dst (MOVBload src mem) mem))))
+(Move [8] {t} dst src mem) && t.Alignment()%8 == 0 =>
+ (MOVVstore dst (MOVVload src mem) mem)
+(Move [8] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [4] dst (MOVWload [4] src mem)
+ (MOVWstore dst (MOVWload src mem) mem))
+(Move [8] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [6] dst (MOVHload [6] src mem)
+ (MOVHstore [4] dst (MOVHload [4] src mem)
+ (MOVHstore [2] dst (MOVHload [2] src mem)
+ (MOVHstore dst (MOVHload src mem) mem))))
+
+(Move [3] dst src mem) =>
+ (MOVBstore [2] dst (MOVBload [2] src mem)
+ (MOVBstore [1] dst (MOVBload [1] src mem)
+ (MOVBstore dst (MOVBload src mem) mem)))
+(Move [6] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [4] dst (MOVHload [4] src mem)
+ (MOVHstore [2] dst (MOVHload [2] src mem)
+ (MOVHstore dst (MOVHload src mem) mem)))
+(Move [12] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [8] dst (MOVWload [8] src mem)
+ (MOVWstore [4] dst (MOVWload [4] src mem)
+ (MOVWstore dst (MOVWload src mem) mem)))
+(Move [16] {t} dst src mem) && t.Alignment()%8 == 0 =>
+ (MOVVstore [8] dst (MOVVload [8] src mem)
+ (MOVVstore dst (MOVVload src mem) mem))
+(Move [24] {t} dst src mem) && t.Alignment()%8 == 0 =>
+ (MOVVstore [16] dst (MOVVload [16] src mem)
+ (MOVVstore [8] dst (MOVVload [8] src mem)
+ (MOVVstore dst (MOVVload src mem) mem)))
+
+// Medium-sized moves use a Duff's device.
+(Move [s] {t} dst src mem)
+ && s%8 == 0 && s >= 24 && s <= 8*128 && t.Alignment()%8 == 0 // s == 24 also matches the explicit (Move [24] ...) rule listed earlier in this file
+ && !config.noDuffDevice && logLargeCopy(v, s) =>
+ (DUFFCOPY [16 * (128 - s/8)] dst src mem)
+// 16 and 128 are magic constants. 16 is the number of bytes to encode:
+// MOVV (R1), R23
+// ADDV $8, R1
+// MOVV R23, (R2)
+// ADDV $8, R2
+// and 128 is the number of such blocks. See runtime/duff_loong64.s:duffcopy. NOTE(review): the register names above look copied from the MIPS64 port; confirm.
+
+// Large or unaligned moves use a loop.
+(Move [s] {t} dst src mem)
+ && s > 24 && logLargeCopy(v, s) || t.Alignment()%8 != 0 =>
+ (LoweredMove [t.Alignment()]
+ dst
+ src
+ (ADDVconst <src.Type> src [s-moveSize(t.Alignment(), config)]) // third operand: src advanced by s minus moveSize(alignment)
+ mem)
+
+// calls
+(StaticCall ...) => (CALLstatic ...)
+(ClosureCall ...) => (CALLclosure ...)
+(InterCall ...) => (CALLinter ...)
+(TailCall ...) => (CALLtail ...)
+
+// atomic intrinsics
+(AtomicLoad(8|32|64) ...) => (LoweredAtomicLoad(8|32|64) ...)
+(AtomicLoadPtr ...) => (LoweredAtomicLoad64 ...)
+
+(AtomicStore(8|32|64) ...) => (LoweredAtomicStore(8|32|64) ...)
+(AtomicStorePtrNoWB ...) => (LoweredAtomicStore64 ...)
+
+(AtomicExchange(32|64) ...) => (LoweredAtomicExchange(32|64) ...)
+
+(AtomicAdd(32|64) ...) => (LoweredAtomicAdd(32|64) ...)
+
+(AtomicCompareAndSwap32 ptr old new mem) => (LoweredAtomicCas32 ptr (SignExt32to64 old) new mem)
+(AtomicCompareAndSwap64 ...) => (LoweredAtomicCas64 ...)
+
+// checks
+(NilCheck ...) => (LoweredNilCheck ...)
+(IsNonNil ptr) => (SGTU ptr (MOVVconst [0])) // ptr >u 0, i.e. ptr != 0
+(IsInBounds idx len) => (SGTU len idx) // len >u idx
+(IsSliceInBounds idx len) => (XOR (MOVVconst [1]) (SGTU idx len)) // !(idx >u len), i.e. idx <=u len
+
+// pseudo-ops
+(GetClosurePtr ...) => (LoweredGetClosurePtr ...)
+(GetCallerSP ...) => (LoweredGetCallerSP ...)
+(GetCallerPC ...) => (LoweredGetCallerPC ...)
+
+(If cond yes no) => (NE cond yes no)
+
+// Write barrier.
+(WB ...) => (LoweredWB ...)
+
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem)
+
+(CondSelect <t> x y cond) => (OR (MASKEQZ <t> x cond) (MASKNEZ <t> y cond))
+
+// Optimizations
+
+// Absorb boolean tests into block
+(NE (FPFlagTrue cmp) yes no) => (FPT cmp yes no)
+(NE (FPFlagFalse cmp) yes no) => (FPF cmp yes no)
+(EQ (FPFlagTrue cmp) yes no) => (FPF cmp yes no)
+(EQ (FPFlagFalse cmp) yes no) => (FPT cmp yes no)
+(NE (XORconst [1] cmp:(SGT _ _)) yes no) => (EQ cmp yes no) // control is cmp^1: drop the XOR and flip the block kind instead
+(NE (XORconst [1] cmp:(SGTU _ _)) yes no) => (EQ cmp yes no)
+(NE (XORconst [1] cmp:(SGTconst _)) yes no) => (EQ cmp yes no)
+(NE (XORconst [1] cmp:(SGTUconst _)) yes no) => (EQ cmp yes no)
+(EQ (XORconst [1] cmp:(SGT _ _)) yes no) => (NE cmp yes no)
+(EQ (XORconst [1] cmp:(SGTU _ _)) yes no) => (NE cmp yes no)
+(EQ (XORconst [1] cmp:(SGTconst _)) yes no) => (NE cmp yes no)
+(EQ (XORconst [1] cmp:(SGTUconst _)) yes no) => (NE cmp yes no)
+(NE (SGTUconst [1] x) yes no) => (EQ x yes no) // 1 >u x  <=>  x == 0
+(EQ (SGTUconst [1] x) yes no) => (NE x yes no)
+(NE (SGTU x (MOVVconst [0])) yes no) => (NE x yes no) // x >u 0  <=>  x != 0
+(EQ (SGTU x (MOVVconst [0])) yes no) => (EQ x yes no)
+(NE (SGTconst [0] x) yes no) => (LTZ x yes no) // 0 > x
+(EQ (SGTconst [0] x) yes no) => (GEZ x yes no) // !(0 > x)
+(NE (SGT x (MOVVconst [0])) yes no) => (GTZ x yes no) // x > 0
+(EQ (SGT x (MOVVconst [0])) yes no) => (LEZ x yes no) // !(x > 0)
+
+// fold offset into address
+(ADDVconst [off1] (MOVVaddr [off2] {sym} ptr)) && is32Bit(off1+int64(off2)) => (MOVVaddr [int32(off1)+int32(off2)] {sym} ptr)
+
+// fold address into load/store
+(MOVBload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVBload [off1+int32(off2)] {sym} ptr mem)
+(MOVBUload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVBUload [off1+int32(off2)] {sym} ptr mem)
+(MOVHload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVHload [off1+int32(off2)] {sym} ptr mem)
+(MOVHUload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVHUload [off1+int32(off2)] {sym} ptr mem)
+(MOVWload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVWload [off1+int32(off2)] {sym} ptr mem)
+(MOVWUload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVWUload [off1+int32(off2)] {sym} ptr mem)
+(MOVVload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVVload [off1+int32(off2)] {sym} ptr mem)
+(MOVFload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVFload [off1+int32(off2)] {sym} ptr mem)
+(MOVDload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVDload [off1+int32(off2)] {sym} ptr mem)
+
+(MOVBstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVBstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVHstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVHstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVWstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVWstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVVstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVVstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVFstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVFstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVDstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVDstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVBstorezero [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVBstorezero [off1+int32(off2)] {sym} ptr mem)
+(MOVHstorezero [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVHstorezero [off1+int32(off2)] {sym} ptr mem)
+(MOVWstorezero [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVWstorezero [off1+int32(off2)] {sym} ptr mem)
+(MOVVstorezero [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVVstorezero [off1+int32(off2)] {sym} ptr mem)
+
+(MOVBload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVBload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVBUload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVBUload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVHload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHUload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVHUload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVWload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWUload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVWUload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVVload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVVload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVFload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVFload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVDload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVDload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+
+(MOVBstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVBstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVHstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVHstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVWstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVWstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVVstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVVstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVFstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVFstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVDstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVDstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVBstorezero [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVBstorezero [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHstorezero [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVHstorezero [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWstorezero [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVWstorezero [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVVstorezero [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVVstorezero [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+
+(LoweredAtomicStore(32|64) ptr (MOVVconst [0]) mem) => (LoweredAtomicStorezero(32|64) ptr mem)
+(LoweredAtomicAdd32 ptr (MOVVconst [c]) mem) && is32Bit(c) => (LoweredAtomicAddconst32 [int32(c)] ptr mem)
+(LoweredAtomicAdd64 ptr (MOVVconst [c]) mem) && is32Bit(c) => (LoweredAtomicAddconst64 [c] ptr mem)
+
+// don't extend after proper load
+(MOVBreg x:(MOVBload _ _)) => (MOVVreg x)
+(MOVBUreg x:(MOVBUload _ _)) => (MOVVreg x)
+(MOVHreg x:(MOVBload _ _)) => (MOVVreg x)
+(MOVHreg x:(MOVBUload _ _)) => (MOVVreg x)
+(MOVHreg x:(MOVHload _ _)) => (MOVVreg x)
+(MOVHUreg x:(MOVBUload _ _)) => (MOVVreg x)
+(MOVHUreg x:(MOVHUload _ _)) => (MOVVreg x)
+(MOVWreg x:(MOVBload _ _)) => (MOVVreg x)
+(MOVWreg x:(MOVBUload _ _)) => (MOVVreg x)
+(MOVWreg x:(MOVHload _ _)) => (MOVVreg x)
+(MOVWreg x:(MOVHUload _ _)) => (MOVVreg x)
+(MOVWreg x:(MOVWload _ _)) => (MOVVreg x)
+(MOVWUreg x:(MOVBUload _ _)) => (MOVVreg x)
+(MOVWUreg x:(MOVHUload _ _)) => (MOVVreg x)
+(MOVWUreg x:(MOVWUload _ _)) => (MOVVreg x)
+
+// fold double extensions
+(MOVBreg x:(MOVBreg _)) => (MOVVreg x)
+(MOVBUreg x:(MOVBUreg _)) => (MOVVreg x)
+(MOVHreg x:(MOVBreg _)) => (MOVVreg x)
+(MOVHreg x:(MOVBUreg _)) => (MOVVreg x)
+(MOVHreg x:(MOVHreg _)) => (MOVVreg x)
+(MOVHUreg x:(MOVBUreg _)) => (MOVVreg x)
+(MOVHUreg x:(MOVHUreg _)) => (MOVVreg x)
+(MOVWreg x:(MOVBreg _)) => (MOVVreg x)
+(MOVWreg x:(MOVBUreg _)) => (MOVVreg x)
+(MOVWreg x:(MOVHreg _)) => (MOVVreg x)
+(MOVWreg x:(MOVWreg _)) => (MOVVreg x)
+(MOVWUreg x:(MOVBUreg _)) => (MOVVreg x)
+(MOVWUreg x:(MOVHUreg _)) => (MOVVreg x)
+(MOVWUreg x:(MOVWUreg _)) => (MOVVreg x)
+
+// don't extend before store
+(MOVBstore [off] {sym} ptr (MOVBreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVWreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVWreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+
+// if a register move has only 1 use, just use the same register without emitting instruction
+// MOVVnop doesn't emit instruction, only for ensuring the type.
+(MOVVreg x) && x.Uses == 1 => (MOVVnop x)
+
+// fold constant into arithmetic ops
+(ADDV x (MOVVconst [c])) && is32Bit(c) => (ADDVconst [c] x)
+(SUBV x (MOVVconst [c])) && is32Bit(c) => (SUBVconst [c] x)
+(AND x (MOVVconst [c])) && is32Bit(c) => (ANDconst [c] x)
+(OR x (MOVVconst [c])) && is32Bit(c) => (ORconst [c] x)
+(XOR x (MOVVconst [c])) && is32Bit(c) => (XORconst [c] x)
+(NOR x (MOVVconst [c])) && is32Bit(c) => (NORconst [c] x)
+
+(SLLV _ (MOVVconst [c])) && uint64(c)>=64 => (MOVVconst [0]) // constant count >= 64 (as unsigned): logical shifts fold to 0
+(SRLV _ (MOVVconst [c])) && uint64(c)>=64 => (MOVVconst [0])
+(SRAV x (MOVVconst [c])) && uint64(c)>=64 => (SRAVconst x [63]) // arithmetic right shift is capped at 63
+(SLLV x (MOVVconst [c])) => (SLLVconst x [c]) // any other constant count folds into the immediate form
+(SRLV x (MOVVconst [c])) => (SRLVconst x [c])
+(SRAV x (MOVVconst [c])) => (SRAVconst x [c])
+(ROTR x (MOVVconst [c])) => (ROTRconst x [c&31]) // rotate count reduced to its low 5 bits
+(ROTRV x (MOVVconst [c])) => (ROTRVconst x [c&63]) // rotate count reduced to its low 6 bits
+
+(SGT (MOVVconst [c]) x) && is32Bit(c) => (SGTconst [c] x)
+(SGTU (MOVVconst [c]) x) && is32Bit(c) => (SGTUconst [c] x)
+
+// mul by constant
+(Select1 (MULVU x (MOVVconst [-1]))) => (NEGV x)
+(Select1 (MULVU _ (MOVVconst [0]))) => (MOVVconst [0])
+(Select1 (MULVU x (MOVVconst [1]))) => x
+(Select1 (MULVU x (MOVVconst [c]))) && isPowerOfTwo64(c) => (SLLVconst [log64(c)] x)
+
+// div by constant
+(Select1 (DIVVU x (MOVVconst [1]))) => x
+(Select1 (DIVVU x (MOVVconst [c]))) && isPowerOfTwo64(c) => (SRLVconst [log64(c)] x)
+(Select0 (DIVVU _ (MOVVconst [1]))) => (MOVVconst [0]) // mod
+(Select0 (DIVVU x (MOVVconst [c]))) && isPowerOfTwo64(c) => (ANDconst [c-1] x) // mod
+
+// generic simplifications
+(ADDV x (NEGV y)) => (SUBV x y)
+(SUBV x x) => (MOVVconst [0])
+(SUBV (MOVVconst [0]) x) => (NEGV x)
+(AND x x) => x
+(OR x x) => x
+(XOR x x) => (MOVVconst [0])
+
+// remove redundant *const ops
+(ADDVconst [0] x) => x
+(SUBVconst [0] x) => x
+(ANDconst [0] _) => (MOVVconst [0])
+(ANDconst [-1] x) => x
+(ORconst [0] x) => x
+(ORconst [-1] _) => (MOVVconst [-1])
+(XORconst [0] x) => x
+(XORconst [-1] x) => (NORconst [0] x)
+(MASKEQZ (MOVVconst [0]) cond) => (MOVVconst [0])
+(MASKNEZ (MOVVconst [0]) cond) => (MOVVconst [0])
+
+// generic constant folding
+(ADDVconst [c] (MOVVconst [d])) => (MOVVconst [c+d])
+(ADDVconst [c] (ADDVconst [d] x)) && is32Bit(c+d) => (ADDVconst [c+d] x)
+(ADDVconst [c] (SUBVconst [d] x)) && is32Bit(c-d) => (ADDVconst [c-d] x)
+(SUBVconst [c] (MOVVconst [d])) => (MOVVconst [d-c])
+(SUBVconst [c] (SUBVconst [d] x)) && is32Bit(-c-d) => (ADDVconst [-c-d] x)
+(SUBVconst [c] (ADDVconst [d] x)) && is32Bit(-c+d) => (ADDVconst [-c+d] x)
+(SLLVconst [c] (MOVVconst [d])) => (MOVVconst [d<<uint64(c)])
+(SRLVconst [c] (MOVVconst [d])) => (MOVVconst [int64(uint64(d)>>uint64(c))])
+(SRAVconst [c] (MOVVconst [d])) => (MOVVconst [d>>uint64(c)])
+(Select1 (MULVU (MOVVconst [c]) (MOVVconst [d]))) => (MOVVconst [c*d])
+(Select1 (DIVV (MOVVconst [c]) (MOVVconst [d]))) && d != 0 => (MOVVconst [c/d])
+(Select1 (DIVVU (MOVVconst [c]) (MOVVconst [d]))) && d != 0 => (MOVVconst [int64(uint64(c)/uint64(d))])
+(Select0 (DIVV (MOVVconst [c]) (MOVVconst [d]))) && d != 0 => (MOVVconst [c%d]) // mod
+(Select0 (DIVVU (MOVVconst [c]) (MOVVconst [d]))) && d != 0 => (MOVVconst [int64(uint64(c)%uint64(d))]) // mod
+(ANDconst [c] (MOVVconst [d])) => (MOVVconst [c&d])
+(ANDconst [c] (ANDconst [d] x)) => (ANDconst [c&d] x)
+(ORconst [c] (MOVVconst [d])) => (MOVVconst [c|d])
+(ORconst [c] (ORconst [d] x)) && is32Bit(c|d) => (ORconst [c|d] x)
+(XORconst [c] (MOVVconst [d])) => (MOVVconst [c^d])
+(XORconst [c] (XORconst [d] x)) && is32Bit(c^d) => (XORconst [c^d] x)
+(NORconst [c] (MOVVconst [d])) => (MOVVconst [^(c|d)])
+(NEGV (MOVVconst [c])) => (MOVVconst [-c])
+(MOVBreg (MOVVconst [c])) => (MOVVconst [int64(int8(c))])
+(MOVBUreg (MOVVconst [c])) => (MOVVconst [int64(uint8(c))])
+(MOVHreg (MOVVconst [c])) => (MOVVconst [int64(int16(c))])
+(MOVHUreg (MOVVconst [c])) => (MOVVconst [int64(uint16(c))])
+(MOVWreg (MOVVconst [c])) => (MOVVconst [int64(int32(c))])
+(MOVWUreg (MOVVconst [c])) => (MOVVconst [int64(uint32(c))])
+(MOVVreg (MOVVconst [c])) => (MOVVconst [c])
+
+// constant comparisons
+(SGTconst [c] (MOVVconst [d])) && c>d => (MOVVconst [1])
+(SGTconst [c] (MOVVconst [d])) && c<=d => (MOVVconst [0])
+(SGTUconst [c] (MOVVconst [d])) && uint64(c)>uint64(d) => (MOVVconst [1])
+(SGTUconst [c] (MOVVconst [d])) && uint64(c)<=uint64(d) => (MOVVconst [0])
+
+// other known comparisons
+(SGTconst [c] (MOVBreg _)) && 0x7f < c => (MOVVconst [1])
+(SGTconst [c] (MOVBreg _)) && c <= -0x80 => (MOVVconst [0])
+(SGTconst [c] (MOVBUreg _)) && 0xff < c => (MOVVconst [1])
+(SGTconst [c] (MOVBUreg _)) && c < 0 => (MOVVconst [0])
+(SGTUconst [c] (MOVBUreg _)) && 0xff < uint64(c) => (MOVVconst [1])
+(SGTconst [c] (MOVHreg _)) && 0x7fff < c => (MOVVconst [1])
+(SGTconst [c] (MOVHreg _)) && c <= -0x8000 => (MOVVconst [0])
+(SGTconst [c] (MOVHUreg _)) && 0xffff < c => (MOVVconst [1])
+(SGTconst [c] (MOVHUreg _)) && c < 0 => (MOVVconst [0])
+(SGTUconst [c] (MOVHUreg _)) && 0xffff < uint64(c) => (MOVVconst [1])
+(SGTconst [c] (MOVWUreg _)) && c < 0 => (MOVVconst [0])
+(SGTconst [c] (ANDconst [m] _)) && 0 <= m && m < c => (MOVVconst [1])
+(SGTUconst [c] (ANDconst [m] _)) && uint64(m) < uint64(c) => (MOVVconst [1])
+(SGTconst [c] (SRLVconst _ [d])) && 0 <= c && 0 < d && d <= 63 && 0xffffffffffffffff>>uint64(d) < uint64(c) => (MOVVconst [1])
+(SGTUconst [c] (SRLVconst _ [d])) && 0 < d && d <= 63 && 0xffffffffffffffff>>uint64(d) < uint64(c) => (MOVVconst [1])
+
+// absorb constants into branches
+(EQ (MOVVconst [0]) yes no) => (First yes no)
+(EQ (MOVVconst [c]) yes no) && c != 0 => (First no yes)
+(NE (MOVVconst [0]) yes no) => (First no yes)
+(NE (MOVVconst [c]) yes no) && c != 0 => (First yes no)
+(LTZ (MOVVconst [c]) yes no) && c < 0 => (First yes no)
+(LTZ (MOVVconst [c]) yes no) && c >= 0 => (First no yes)
+(LEZ (MOVVconst [c]) yes no) && c <= 0 => (First yes no)
+(LEZ (MOVVconst [c]) yes no) && c > 0 => (First no yes)
+(GTZ (MOVVconst [c]) yes no) && c > 0 => (First yes no)
+(GTZ (MOVVconst [c]) yes no) && c <= 0 => (First no yes)
+(GEZ (MOVVconst [c]) yes no) && c >= 0 => (First yes no)
+(GEZ (MOVVconst [c]) yes no) && c < 0 => (First no yes)
+
+// SGT/SGTU with known outcomes.
+(SGT x x) => (MOVVconst [0])
+(SGTU x x) => (MOVVconst [0])
diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go b/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go
new file mode 100644
index 0000000..22a83fb
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go
@@ -0,0 +1,484 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import "strings"
+
+// Notes:
+// - Integer types live in the low portion of registers. Upper portions are junk.
+// - Boolean types use the low-order byte of a register. 0=false, 1=true.
+// Upper bytes are junk.
+// - *const instructions may use a constant larger than the instruction can encode.
+// In this case the assembler expands to multiple instructions and uses tmp
+// register (R30).
+
+// Suffixes encode the bit width of various instructions.
+// V (vlong) = 64 bit
+// WU (word) = 32 bit unsigned
+// W (word) = 32 bit
+// H (half word) = 16 bit
+// HU = 16 bit unsigned
+// B (byte) = 8 bit
+// BU = 8 bit unsigned
+// F (float) = 32 bit float
+// D (double) = 64 bit float
+
+// regNamesLOONG64 lists register names; a name's index here is its bit in a regMask (see buildReg in init).
+// Note: registers not used in regalloc are not included in this list,
+// so that regmask stays within int64. Be careful when hand coding regmasks.
+var regNamesLOONG64 = []string{
+ "R0", // constant 0
+ "R1", // link register (LR); R2 (thread pointer) is not listed
+ "SP", // aka R3
+ "R4",
+ "R5",
+ "R6",
+ "R7",
+ "R8",
+ "R9",
+ "R10",
+ "R11",
+ "R12",
+ "R13",
+ "R14",
+ "R15",
+ "R16",
+ "R17",
+ "R18",
+ "R19",
+ "R20",
+ "R21", // unused (not in the gp mask)
+ "g", // aka R22
+ "R23",
+ "R24",
+ "R25",
+ "R26",
+ "R27",
+ "R28",
+ "R29",
+ // R30 is REGTMP not used in regalloc
+ "R31",
+
+ "F0",
+ "F1",
+ "F2",
+ "F3",
+ "F4",
+ "F5",
+ "F6",
+ "F7",
+ "F8",
+ "F9",
+ "F10",
+ "F11",
+ "F12",
+ "F13",
+ "F14",
+ "F15",
+ "F16",
+ "F17",
+ "F18",
+ "F19",
+ "F20",
+ "F21",
+ "F22",
+ "F23",
+ "F24",
+ "F25",
+ "F26",
+ "F27",
+ "F28",
+ "F29",
+ "F30",
+ "F31",
+
+ // If you add registers, update asyncPreempt in runtime.
+
+ // pseudo-registers
+ "SB",
+}
+
+func init() {
+ // Make map from reg names to reg integers.
+ if len(regNamesLOONG64) > 64 {
+ panic("too many registers")
+ }
+ num := map[string]int{}
+ for i, name := range regNamesLOONG64 {
+ num[name] = i
+ }
+ buildReg := func(s string) regMask {
+ m := regMask(0)
+ for _, r := range strings.Split(s, " ") {
+ if n, ok := num[r]; ok {
+ m |= regMask(1) << uint(n)
+ continue
+ }
+ panic("register " + r + " not found")
+ }
+ return m
+ }
+
+ // Common individual register masks (each built from names in regNamesLOONG64)
+ var (
+ gp = buildReg("R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R23 R24 R25 R26 R27 R28 R29 R31") // R1 is LR, R2 is thread pointer, R3 is stack pointer, R21-unused, R22 is g, R30 is REGTMP
+ gps = buildReg("R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R19 R20 R23 R24 R25 R26 R27 R28 R29 R31") | buildReg("g") // inputs of gp22 (MULV/MULVU/DIVV/DIVVU). NOTE(review): unlike gp this omits R17 and R18 — confirm that is intentional
+ gpg = gp | buildReg("g")
+ gpsp = gp | buildReg("SP")
+ gpspg = gpg | buildReg("SP")
+ gpspsbg = gpspg | buildReg("SB")
+ fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31")
+ callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g
+ r1 = buildReg("R19") // r1..r4 are the fixed input registers of LoweredPanicBounds{A,B,C}
+ r2 = buildReg("R18")
+ r3 = buildReg("R17")
+ r4 = buildReg("R4")
+ )
+ // Common regInfo
+ var (
+ gp01 = regInfo{inputs: nil, outputs: []regMask{gp}}
+ gp11 = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}}
+ gp11sp = regInfo{inputs: []regMask{gpspg}, outputs: []regMask{gp}}
+ gp21 = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp}}
+ gp22 = regInfo{inputs: []regMask{gps, gps}, outputs: []regMask{gp, gp}}
+ gpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}
+ gpstore = regInfo{inputs: []regMask{gpspsbg, gpg}}
+ gpstore0 = regInfo{inputs: []regMask{gpspsbg}}
+ gpxchg = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}}
+ gpcas = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}, outputs: []regMask{gp}}
+ fp01 = regInfo{inputs: nil, outputs: []regMask{fp}}
+ fp11 = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}}
+ fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
+ fp2flags = regInfo{inputs: []regMask{fp, fp}}
+ fpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{fp}}
+ fpstore = regInfo{inputs: []regMask{gpspsbg, fp}}
+ readflags = regInfo{inputs: nil, outputs: []regMask{gp}}
+ )
+ ops := []opData{
+ // binary ops
+ {name: "ADDV", argLength: 2, reg: gp21, asm: "ADDVU", commutative: true}, // arg0 + arg1
+ {name: "ADDVconst", argLength: 1, reg: gp11sp, asm: "ADDVU", aux: "Int64"}, // arg0 + auxInt. auxInt is 32-bit, also in other *const ops.
+ {name: "SUBV", argLength: 2, reg: gp21, asm: "SUBVU"}, // arg0 - arg1
+ {name: "SUBVconst", argLength: 1, reg: gp11, asm: "SUBVU", aux: "Int64"}, // arg0 - auxInt
+
+ {name: "MULV", argLength: 2, reg: gp22, resultNotInArgs: true, commutative: true, typ: "(Int64,Int64)"}, // arg0 * arg1, signed; tuple result (presumably hi, lo as for MULVU — confirm)
+ {name: "MULVU", argLength: 2, reg: gp22, resultNotInArgs: true, commutative: true, typ: "(UInt64,UInt64)"}, // arg0 * arg1, unsigned; tuple result, Select1 = low 64 bits of the product (Select0 presumably the high 64 bits — confirm)
+ {name: "DIVV", argLength: 2, reg: gp22, resultNotInArgs: true, typ: "(Int64,Int64)"}, // arg0 / arg1, signed; tuple result, Select0 = remainder, Select1 = quotient
+ {name: "DIVVU", argLength: 2, reg: gp22, resultNotInArgs: true, typ: "(UInt64,UInt64)"}, // arg0 / arg1, unsigned; tuple result, Select0 = remainder, Select1 = quotient
+
+ {name: "ADDF", argLength: 2, reg: fp21, asm: "ADDF", commutative: true}, // arg0 + arg1
+ {name: "ADDD", argLength: 2, reg: fp21, asm: "ADDD", commutative: true}, // arg0 + arg1
+ {name: "SUBF", argLength: 2, reg: fp21, asm: "SUBF"}, // arg0 - arg1
+ {name: "SUBD", argLength: 2, reg: fp21, asm: "SUBD"}, // arg0 - arg1
+ {name: "MULF", argLength: 2, reg: fp21, asm: "MULF", commutative: true}, // arg0 * arg1
+ {name: "MULD", argLength: 2, reg: fp21, asm: "MULD", commutative: true}, // arg0 * arg1
+ {name: "DIVF", argLength: 2, reg: fp21, asm: "DIVF"}, // arg0 / arg1
+ {name: "DIVD", argLength: 2, reg: fp21, asm: "DIVD"}, // arg0 / arg1
+
+ {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1
+ {name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int64"}, // arg0 & auxInt
+ {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1
+ {name: "ORconst", argLength: 1, reg: gp11, asm: "OR", aux: "Int64"}, // arg0 | auxInt
+ {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true, typ: "UInt64"}, // arg0 ^ arg1
+ {name: "XORconst", argLength: 1, reg: gp11, asm: "XOR", aux: "Int64", typ: "UInt64"}, // arg0 ^ auxInt
+ {name: "NOR", argLength: 2, reg: gp21, asm: "NOR", commutative: true}, // ^(arg0 | arg1)
+ {name: "NORconst", argLength: 1, reg: gp11, asm: "NOR", aux: "Int64"}, // ^(arg0 | auxInt)
+
+ {name: "NEGV", argLength: 1, reg: gp11}, // -arg0
+ {name: "NEGF", argLength: 1, reg: fp11, asm: "NEGF"}, // -arg0, float32
+ {name: "NEGD", argLength: 1, reg: fp11, asm: "NEGD"}, // -arg0, float64
+ {name: "SQRTD", argLength: 1, reg: fp11, asm: "SQRTD"}, // sqrt(arg0), float64
+ {name: "SQRTF", argLength: 1, reg: fp11, asm: "SQRTF"}, // sqrt(arg0), float32
+
+ {name: "MASKEQZ", argLength: 2, reg: gp21, asm: "MASKEQZ"}, // returns 0 if arg1 == 0, otherwise returns arg0
+ {name: "MASKNEZ", argLength: 2, reg: gp21, asm: "MASKNEZ"}, // returns 0 if arg1 != 0, otherwise returns arg0
+
+ // shifts
+ {name: "SLLV", argLength: 2, reg: gp21, asm: "SLLV"}, // arg0 << arg1, shift amount is mod 64
+ {name: "SLLVconst", argLength: 1, reg: gp11, asm: "SLLV", aux: "Int64"}, // arg0 << auxInt
+ {name: "SRLV", argLength: 2, reg: gp21, asm: "SRLV"}, // arg0 >> arg1, unsigned, shift amount is mod 64
+ {name: "SRLVconst", argLength: 1, reg: gp11, asm: "SRLV", aux: "Int64"}, // arg0 >> auxInt, unsigned
+ {name: "SRAV", argLength: 2, reg: gp21, asm: "SRAV"}, // arg0 >> arg1, signed, shift amount is mod 64
+ {name: "SRAVconst", argLength: 1, reg: gp11, asm: "SRAV", aux: "Int64"}, // arg0 >> auxInt, signed
+ {name: "ROTR", argLength: 2, reg: gp21, asm: "ROTR"}, // arg0 right rotate by (arg1 mod 32) bits
+ {name: "ROTRV", argLength: 2, reg: gp21, asm: "ROTRV"}, // arg0 right rotate by (arg1 mod 64) bits
+ {name: "ROTRconst", argLength: 1, reg: gp11, asm: "ROTR", aux: "Int64"}, // uint32(arg0) right rotate by auxInt bits, auxInt should be in the range 0 to 31.
+ {name: "ROTRVconst", argLength: 1, reg: gp11, asm: "ROTRV", aux: "Int64"}, // arg0 right rotate by auxInt bits, auxInt should be in the range 0 to 63.
+
+ // comparisons
+ {name: "SGT", argLength: 2, reg: gp21, asm: "SGT", typ: "Bool"}, // 1 if arg0 > arg1 (signed), 0 otherwise
+ {name: "SGTconst", argLength: 1, reg: gp11, asm: "SGT", aux: "Int64", typ: "Bool"}, // 1 if auxInt > arg0 (signed), 0 otherwise
+ {name: "SGTU", argLength: 2, reg: gp21, asm: "SGTU", typ: "Bool"}, // 1 if arg0 > arg1 (unsigned), 0 otherwise
+ {name: "SGTUconst", argLength: 1, reg: gp11, asm: "SGTU", aux: "Int64", typ: "Bool"}, // 1 if auxInt > arg0 (unsigned), 0 otherwise
+
+ {name: "CMPEQF", argLength: 2, reg: fp2flags, asm: "CMPEQF", typ: "Flags"}, // flags=true if arg0 = arg1, float32
+ {name: "CMPEQD", argLength: 2, reg: fp2flags, asm: "CMPEQD", typ: "Flags"}, // flags=true if arg0 = arg1, float64
+ {name: "CMPGEF", argLength: 2, reg: fp2flags, asm: "CMPGEF", typ: "Flags"}, // flags=true if arg0 >= arg1, float32
+ {name: "CMPGED", argLength: 2, reg: fp2flags, asm: "CMPGED", typ: "Flags"}, // flags=true if arg0 >= arg1, float64
+ {name: "CMPGTF", argLength: 2, reg: fp2flags, asm: "CMPGTF", typ: "Flags"}, // flags=true if arg0 > arg1, float32
+ {name: "CMPGTD", argLength: 2, reg: fp2flags, asm: "CMPGTD", typ: "Flags"}, // flags=true if arg0 > arg1, float64
+
+ // moves
+ {name: "MOVVconst", argLength: 0, reg: gp01, aux: "Int64", asm: "MOVV", typ: "UInt64", rematerializeable: true}, // auxint
+ {name: "MOVFconst", argLength: 0, reg: fp01, aux: "Float64", asm: "MOVF", typ: "Float32", rematerializeable: true}, // auxint as 64-bit float, convert to 32-bit float
+ {name: "MOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "MOVD", typ: "Float64", rematerializeable: true}, // auxint as 64-bit float
+
+ {name: "MOVVaddr", argLength: 1, reg: regInfo{inputs: []regMask{buildReg("SP") | buildReg("SB")}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVV", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB
+
+ {name: "MOVBload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVB", typ: "Int8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVBUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVBU", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVH", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVHU", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVWload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVW", typ: "Int32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVWUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVWU", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVVload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVV", typ: "UInt64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVFload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVF", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVDload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVD", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+
+ {name: "MOVBstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVHstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVWstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVVstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVV", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVFstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVF", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVDstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+
+ {name: "MOVBstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of zero to arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of zero to arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVWstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of zero to arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVVstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVV", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of zero to arg0 + auxInt + aux. arg1=mem.
+
+ // conversions
+ {name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB"}, // move from arg0, sign-extended from byte
+ {name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, zero-extended from byte
+ {name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVH"}, // move from arg0, sign-extended from half
+ {name: "MOVHUreg", argLength: 1, reg: gp11, asm: "MOVHU"}, // move from arg0, zero-extended from half
+ {name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW"}, // move from arg0, sign-extended from word
+ {name: "MOVWUreg", argLength: 1, reg: gp11, asm: "MOVWU"}, // move from arg0, zero-extended from word
+ {name: "MOVVreg", argLength: 1, reg: gp11, asm: "MOVV"}, // move from arg0
+
+ {name: "MOVVnop", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}, resultInArg0: true}, // nop, return arg0 in same register
+
+ {name: "MOVWF", argLength: 1, reg: fp11, asm: "MOVWF"}, // int32 -> float32
+ {name: "MOVWD", argLength: 1, reg: fp11, asm: "MOVWD"}, // int32 -> float64
+ {name: "MOVVF", argLength: 1, reg: fp11, asm: "MOVVF"}, // int64 -> float32
+ {name: "MOVVD", argLength: 1, reg: fp11, asm: "MOVVD"}, // int64 -> float64
+ {name: "TRUNCFW", argLength: 1, reg: fp11, asm: "TRUNCFW"}, // float32 -> int32
+ {name: "TRUNCDW", argLength: 1, reg: fp11, asm: "TRUNCDW"}, // float64 -> int32
+ {name: "TRUNCFV", argLength: 1, reg: fp11, asm: "TRUNCFV"}, // float32 -> int64
+ {name: "TRUNCDV", argLength: 1, reg: fp11, asm: "TRUNCDV"}, // float64 -> int64
+ {name: "MOVFD", argLength: 1, reg: fp11, asm: "MOVFD"}, // float32 -> float64
+ {name: "MOVDF", argLength: 1, reg: fp11, asm: "MOVDF"}, // float64 -> float32
+
+ // function calls
+ {name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "CALLtail", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gpsp, buildReg("R29"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+ {name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+
+ // duffzero
+ // arg0 = address of memory to zero
+ // arg1 = mem
+ // auxint = offset into duffzero code to start executing
+ // returns mem
+ // R19 aka loong64.REGRT1 changed as side effect
+ {
+ name: "DUFFZERO",
+ aux: "Int64",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{gp},
+ clobbers: buildReg("R19 R1"),
+ },
+ faultOnNilArg0: true,
+ },
+
+ // duffcopy
+ // arg0 = address of dst memory (in R20, changed as side effect) REGRT2
+ // arg1 = address of src memory (in R19, changed as side effect) REGRT1
+ // arg2 = mem
+ // auxint = offset into duffcopy code to start executing
+ // returns mem
+ {
+ name: "DUFFCOPY",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R20"), buildReg("R19")},
+ clobbers: buildReg("R19 R20 R1"),
+ },
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // large or unaligned zeroing
+ // arg0 = address of memory to zero (in R19, changed as side effect)
+ // arg1 = address of the last element to zero
+ // arg2 = mem
+ // auxint = alignment
+ // returns mem
+ // SUBV $8, R19
+ // MOVV R0, 8(R19)
+ // ADDV $8, R19
+ // BNE Rarg1, R19, -2(PC)
+ {
+ name: "LoweredZero",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R19"), gp},
+ clobbers: buildReg("R19"),
+ },
+ clobberFlags: true,
+ faultOnNilArg0: true,
+ },
+
+ // large or unaligned move
+ // arg0 = address of dst memory (in R4, changed as side effect)
+ // arg1 = address of src memory (in R19, changed as side effect)
+ // arg2 = address of the last element of src
+ // arg3 = mem
+ // auxint = alignment
+ // returns mem
+ // SUBV $8, R19
+ // MOVV 8(R19), Rtmp
+ // MOVV Rtmp, (R4)
+ // ADDV $8, R19
+ // ADDV $8, R4
+ // BNE Rarg2, R19, -4(PC)
+ {
+ name: "LoweredMove",
+ aux: "Int64",
+ argLength: 4,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R4"), buildReg("R19"), gp},
+ clobbers: buildReg("R19 R4"),
+ },
+ clobberFlags: true,
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // atomic loads.
+ // load from arg0. arg1=mem.
+ // returns <value,memory> so they can be properly ordered with other loads.
+ {name: "LoweredAtomicLoad8", argLength: 2, reg: gpload, faultOnNilArg0: true},
+ {name: "LoweredAtomicLoad32", argLength: 2, reg: gpload, faultOnNilArg0: true},
+ {name: "LoweredAtomicLoad64", argLength: 2, reg: gpload, faultOnNilArg0: true},
+
+ // atomic stores.
+ // store arg1 to arg0. arg2=mem. returns memory.
+ {name: "LoweredAtomicStore8", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicStore32", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicStore64", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true},
+ // store zero to arg0. arg1=mem. returns memory.
+ {name: "LoweredAtomicStorezero32", argLength: 2, reg: gpstore0, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicStorezero64", argLength: 2, reg: gpstore0, faultOnNilArg0: true, hasSideEffects: true},
+
+ // atomic exchange.
+ // store arg1 to arg0. arg2=mem. returns <old content of *arg0, memory>.
+ // DBAR
+ // LL (Rarg0), Rout
+ // MOVV Rarg1, Rtmp
+ // SC Rtmp, (Rarg0)
+ // BEQ Rtmp, -3(PC)
+ // DBAR
+ {name: "LoweredAtomicExchange32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicExchange64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // atomic add.
+ // *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>.
+ // DBAR
+ // LL (Rarg0), Rout
+ // ADDV Rarg1, Rout, Rtmp
+ // SC Rtmp, (Rarg0)
+ // BEQ Rtmp, -3(PC)
+ // DBAR
+ // ADDV Rarg1, Rout
+ {name: "LoweredAtomicAdd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicAdd64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ // *arg0 += auxint. arg1=mem. returns <new content of *arg0, memory>. auxint is 32-bit.
+ {name: "LoweredAtomicAddconst32", argLength: 2, reg: regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}, aux: "Int32", resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicAddconst64", argLength: 2, reg: regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}, aux: "Int64", resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // atomic compare and swap.
+ // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory.
+ // if *arg0 == arg1 {
+ // *arg0 = arg2
+ // return (true, memory)
+ // } else {
+ // return (false, memory)
+ // }
+ // DBAR
+ // MOVV $0, Rout
+ // LL (Rarg0), Rtmp
+ // BNE Rtmp, Rarg1, 4(PC)
+ // MOVV Rarg2, Rout
+ // SC Rout, (Rarg0)
+ // BEQ Rout, -4(PC)
+ // DBAR
+ {name: "LoweredAtomicCas32", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicCas64", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // pseudo-ops
+ {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpg}}, nilCheck: true, faultOnNilArg0: true}, // panic if arg0 is nil. arg1=mem.
+
+ {name: "FPFlagTrue", argLength: 1, reg: readflags}, // bool, true if FP flag is true
+ {name: "FPFlagFalse", argLength: 1, reg: readflags}, // bool, true if FP flag is false
+
+ // Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
+ // and sorts it to the very beginning of the block to prevent other
+ // use of R29 (loong64.REGCTXT, the closure pointer)
+ {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("R29")}}, zeroWidth: true},
+
+ // LoweredGetCallerSP returns the SP of the caller of the current function.
+ {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true},
+
+ // LoweredGetCallerPC evaluates to the PC to which its "caller" will return.
+ // I.e., if f calls g "calls" getcallerpc,
+ // the result should be the PC within f that g will return to.
+ // See runtime/stubs.go for a more detailed discussion.
+ {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true},
+
+ // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+ // It saves all GP registers if necessary,
+ // but clobbers R1 (LR) because it's a call
+ // and R30 (REGTMP).
+ {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("R27"), buildReg("R28")}, clobbers: (callerSave &^ gpg) | buildReg("R1")}, clobberFlags: true, aux: "Sym", symEffect: "None"},
+
+ // There are three of these functions so that they can have three different register inputs.
+ // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
+ // default registers to match so we don't need to copy registers around unnecessarily.
+ {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r3, r4}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ }
+
+ blocks := []blockData{
+ {name: "EQ", controls: 1},
+ {name: "NE", controls: 1},
+ {name: "LTZ", controls: 1}, // < 0
+ {name: "LEZ", controls: 1}, // <= 0
+ {name: "GTZ", controls: 1}, // > 0
+ {name: "GEZ", controls: 1}, // >= 0
+ {name: "FPT", controls: 1}, // FP flag is true
+ {name: "FPF", controls: 1}, // FP flag is false
+ }
+
+ archs = append(archs, arch{
+ name: "LOONG64",
+ pkg: "cmd/internal/obj/loong64",
+ genfile: "../../loong64/ssa.go",
+ ops: ops,
+ blocks: blocks,
+ regnames: regNamesLOONG64,
+ // TODO: support register ABI on loong64
+ ParamIntRegNames: "R4 R5 R6 R7 R8 R9 R10 R11",
+ ParamFloatRegNames: "F0 F1 F2 F3 F4 F5 F6 F7",
+ gpregmask: gp,
+ fpregmask: fp,
+ framepointerreg: -1, // not used
+ linkreg: int8(num["R1"]),
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/_gen/MIPS.rules b/src/cmd/compile/internal/ssa/_gen/MIPS.rules
new file mode 100644
index 0000000..6f696da
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/MIPS.rules
@@ -0,0 +1,703 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+(Add(Ptr|32|16|8) ...) => (ADD ...)
+(Add(32|64)F ...) => (ADD(F|D) ...)
+
+(Select0 (Add32carry <t> x y)) => (ADD <t.FieldType(0)> x y)
+(Select1 (Add32carry <t> x y)) => (SGTU <typ.Bool> x (ADD <t.FieldType(0)> x y))
+(Add32withcarry <t> x y c) => (ADD c (ADD <t> x y))
+
+(Sub(Ptr|32|16|8) ...) => (SUB ...)
+(Sub(32|64)F ...) => (SUB(F|D) ...)
+
+(Select0 (Sub32carry <t> x y)) => (SUB <t.FieldType(0)> x y)
+(Select1 (Sub32carry <t> x y)) => (SGTU <typ.Bool> (SUB <t.FieldType(0)> x y) x)
+(Sub32withcarry <t> x y c) => (SUB (SUB <t> x y) c)
+
+(Mul(32|16|8) ...) => (MUL ...)
+(Mul(32|64)F ...) => (MUL(F|D) ...)
+
+(Hmul(32|32u) x y) => (Select0 (MUL(T|TU) x y))
+(Mul32uhilo ...) => (MULTU ...)
+
+(Div32 x y) => (Select1 (DIV x y))
+(Div32u x y) => (Select1 (DIVU x y))
+(Div16 x y) => (Select1 (DIV (SignExt16to32 x) (SignExt16to32 y)))
+(Div16u x y) => (Select1 (DIVU (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Div8 x y) => (Select1 (DIV (SignExt8to32 x) (SignExt8to32 y)))
+(Div8u x y) => (Select1 (DIVU (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Div(32|64)F ...) => (DIV(F|D) ...)
+
+(Mod32 x y) => (Select0 (DIV x y))
+(Mod32u x y) => (Select0 (DIVU x y))
+(Mod16 x y) => (Select0 (DIV (SignExt16to32 x) (SignExt16to32 y)))
+(Mod16u x y) => (Select0 (DIVU (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Mod8 x y) => (Select0 (DIV (SignExt8to32 x) (SignExt8to32 y)))
+(Mod8u x y) => (Select0 (DIVU (ZeroExt8to32 x) (ZeroExt8to32 y)))
+
+// (x + y) / 2 with x>=y becomes (x - y) / 2 + y
+(Avg32u <t> x y) => (ADD (SRLconst <t> (SUB <t> x y) [1]) y)
+
+(And(32|16|8) ...) => (AND ...)
+(Or(32|16|8) ...) => (OR ...)
+(Xor(32|16|8) ...) => (XOR ...)
+
+// constant shifts
+// generic opt rewrites all constant shifts to shift by Const64
+// For 16- and 8-bit right shifts the operand is first shifted left by 16 or 24
+// so that its top bit lands in bit 31, which discards whatever was in the upper
+// register bits; the right shift amount is then increased by the same 16 or 24.
+(Lsh32x64 x (Const64 [c])) && uint32(c) < 32 => (SLLconst x [int32(c)])
+(Rsh32x64 x (Const64 [c])) && uint32(c) < 32 => (SRAconst x [int32(c)])
+(Rsh32Ux64 x (Const64 [c])) && uint32(c) < 32 => (SRLconst x [int32(c)])
+(Lsh16x64 x (Const64 [c])) && uint32(c) < 16 => (SLLconst x [int32(c)])
+(Rsh16x64 x (Const64 [c])) && uint32(c) < 16 => (SRAconst (SLLconst <typ.UInt32> x [16]) [int32(c+16)])
+(Rsh16Ux64 x (Const64 [c])) && uint32(c) < 16 => (SRLconst (SLLconst <typ.UInt32> x [16]) [int32(c+16)])
+(Lsh8x64 x (Const64 [c])) && uint32(c) < 8 => (SLLconst x [int32(c)])
+(Rsh8x64 x (Const64 [c])) && uint32(c) < 8 => (SRAconst (SLLconst <typ.UInt32> x [24]) [int32(c+24)])
+(Rsh8Ux64 x (Const64 [c])) && uint32(c) < 8 => (SRLconst (SLLconst <typ.UInt32> x [24]) [int32(c+24)])
+
+// large constant shifts
+// A left shift or unsigned right shift by at least the operand width yields 0.
+(Lsh32x64 _ (Const64 [c])) && uint32(c) >= 32 => (MOVWconst [0])
+(Rsh32Ux64 _ (Const64 [c])) && uint32(c) >= 32 => (MOVWconst [0])
+(Lsh16x64 _ (Const64 [c])) && uint32(c) >= 16 => (MOVWconst [0])
+(Rsh16Ux64 _ (Const64 [c])) && uint32(c) >= 16 => (MOVWconst [0])
+(Lsh8x64 _ (Const64 [c])) && uint32(c) >= 8 => (MOVWconst [0])
+(Rsh8Ux64 _ (Const64 [c])) && uint32(c) >= 8 => (MOVWconst [0])
+
+// large constant signed right shift: an arithmetic shift by 31 leaves every
+// result bit equal to the sign bit. Sub-word operands are first moved to the
+// top of the register so that their own sign bit is the one replicated.
+(Rsh32x64 x (Const64 [c])) && uint32(c) >= 32 => (SRAconst x [31])
+(Rsh16x64 x (Const64 [c])) && uint32(c) >= 16 => (SRAconst (SLLconst <typ.UInt32> x [16]) [31])
+(Rsh8x64 x (Const64 [c])) && uint32(c) >= 8 => (SRAconst (SLLconst <typ.UInt32> x [24]) [31])
+
+// shifts
+// hardware instruction uses only the low 5 bits of the shift
+// we compare to 32 to ensure Go semantics for large shifts
+(Lsh32x32 <t> x y) => (CMOVZ (SLL <t> x y) (MOVWconst [0]) (SGTUconst [32] y))
+(Lsh32x16 <t> x y) => (CMOVZ (SLL <t> x (ZeroExt16to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt16to32 y)))
+(Lsh32x8 <t> x y) => (CMOVZ (SLL <t> x (ZeroExt8to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt8to32 y)))
+
+(Lsh16x32 <t> x y) => (CMOVZ (SLL <t> x y) (MOVWconst [0]) (SGTUconst [32] y))
+(Lsh16x16 <t> x y) => (CMOVZ (SLL <t> x (ZeroExt16to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt16to32 y)))
+(Lsh16x8 <t> x y) => (CMOVZ (SLL <t> x (ZeroExt8to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt8to32 y)))
+
+(Lsh8x32 <t> x y) => (CMOVZ (SLL <t> x y) (MOVWconst [0]) (SGTUconst [32] y))
+(Lsh8x16 <t> x y) => (CMOVZ (SLL <t> x (ZeroExt16to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt16to32 y)))
+(Lsh8x8 <t> x y) => (CMOVZ (SLL <t> x (ZeroExt8to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt8to32 y)))
+
+(Rsh32Ux32 <t> x y) => (CMOVZ (SRL <t> x y) (MOVWconst [0]) (SGTUconst [32] y))
+(Rsh32Ux16 <t> x y) => (CMOVZ (SRL <t> x (ZeroExt16to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt16to32 y)))
+(Rsh32Ux8 <t> x y) => (CMOVZ (SRL <t> x (ZeroExt8to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt8to32 y)))
+
+(Rsh16Ux32 <t> x y) => (CMOVZ (SRL <t> (ZeroExt16to32 x) y) (MOVWconst [0]) (SGTUconst [32] y))
+(Rsh16Ux16 <t> x y) => (CMOVZ (SRL <t> (ZeroExt16to32 x) (ZeroExt16to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt16to32 y)))
+(Rsh16Ux8 <t> x y) => (CMOVZ (SRL <t> (ZeroExt16to32 x) (ZeroExt8to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt8to32 y)))
+
+(Rsh8Ux32 <t> x y) => (CMOVZ (SRL <t> (ZeroExt8to32 x) y) (MOVWconst [0]) (SGTUconst [32] y))
+(Rsh8Ux16 <t> x y) => (CMOVZ (SRL <t> (ZeroExt8to32 x) (ZeroExt16to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt16to32 y)))
+(Rsh8Ux8 <t> x y) => (CMOVZ (SRL <t> (ZeroExt8to32 x) (ZeroExt8to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt8to32 y)))
+
+// Signed right shifts: the CMOVZ replaces a shift count of 32 or more with 31,
+// so an oversized shift fills the result with copies of the sign bit.
+(Rsh32x32 x y) => (SRA x ( CMOVZ <typ.UInt32> y (MOVWconst [31]) (SGTUconst [32] y)))
+(Rsh32x16 x y) => (SRA x ( CMOVZ <typ.UInt32> (ZeroExt16to32 y) (MOVWconst [31]) (SGTUconst [32] (ZeroExt16to32 y))))
+(Rsh32x8 x y) => (SRA x ( CMOVZ <typ.UInt32> (ZeroExt8to32 y) (MOVWconst [31]) (SGTUconst [32] (ZeroExt8to32 y))))
+
+(Rsh16x32 x y) => (SRA (SignExt16to32 x) ( CMOVZ <typ.UInt32> y (MOVWconst [31]) (SGTUconst [32] y)))
+(Rsh16x16 x y) => (SRA (SignExt16to32 x) ( CMOVZ <typ.UInt32> (ZeroExt16to32 y) (MOVWconst [31]) (SGTUconst [32] (ZeroExt16to32 y))))
+(Rsh16x8 x y) => (SRA (SignExt16to32 x) ( CMOVZ <typ.UInt32> (ZeroExt8to32 y) (MOVWconst [31]) (SGTUconst [32] (ZeroExt8to32 y))))
+
+// An 8-bit operand must be sign-extended from bit 7, not bit 15: truncations
+// in this file are plain copies, so bits 8-15 of an 8-bit value may hold
+// leftovers of a wider computation and would otherwise be shifted into the result.
+(Rsh8x32 x y) => (SRA (SignExt8to32 x) ( CMOVZ <typ.UInt32> y (MOVWconst [31]) (SGTUconst [32] y)))
+(Rsh8x16 x y) => (SRA (SignExt8to32 x) ( CMOVZ <typ.UInt32> (ZeroExt16to32 y) (MOVWconst [31]) (SGTUconst [32] (ZeroExt16to32 y))))
+(Rsh8x8 x y) => (SRA (SignExt8to32 x) ( CMOVZ <typ.UInt32> (ZeroExt8to32 y) (MOVWconst [31]) (SGTUconst [32] (ZeroExt8to32 y))))
+
+// rotates
+// A rotate left of a w-bit value by a constant c is expanded into
+// (x << (c mod w)) | (x >>u (-c mod w)), built from generic shift ops.
+// Masking with w-1 keeps both shift counts below the operand width.
+(RotateLeft8 <t> x (MOVWconst [c])) => (Or8 (Lsh8x32 <t> x (MOVWconst [c&7])) (Rsh8Ux32 <t> x (MOVWconst [-c&7])))
+(RotateLeft16 <t> x (MOVWconst [c])) => (Or16 (Lsh16x32 <t> x (MOVWconst [c&15])) (Rsh16Ux32 <t> x (MOVWconst [-c&15])))
+(RotateLeft32 <t> x (MOVWconst [c])) => (Or32 (Lsh32x32 <t> x (MOVWconst [c&31])) (Rsh32Ux32 <t> x (MOVWconst [-c&31])))
+(RotateLeft64 <t> x (MOVWconst [c])) => (Or64 (Lsh64x32 <t> x (MOVWconst [c&63])) (Rsh64Ux32 <t> x (MOVWconst [-c&63])))
+
+// unary ops
+(Neg(32|16|8) ...) => (NEG ...)
+(Neg(32|64)F ...) => (NEG(F|D) ...)
+
+(Com(32|16|8) x) => (NORconst [0] x)
+
+(Sqrt ...) => (SQRTD ...)
+(Sqrt32 ...) => (SQRTF ...)
+
+// TODO: optimize this case?
+(Ctz32NonZero ...) => (Ctz32 ...)
+
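+// count trailing zeros
+// Ctz32(x) = 32 - CLZ((x & -x) - 1)
+// x & -x isolates the lowest set bit; subtracting 1 turns it into a mask of the trailing zero bits.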
+(Ctz32 <t> x) => (SUB (MOVWconst [32]) (CLZ <t> (SUBconst <t> [1] (AND <t> x (NEG <t> x)))))
+
+// bit length
+(BitLen32 <t> x) => (SUB (MOVWconst [32]) (CLZ <t> x))
+
+// boolean ops -- booleans are represented with 0=false, 1=true
+(AndB ...) => (AND ...)
+(OrB ...) => (OR ...)
+(EqB x y) => (XORconst [1] (XOR <typ.Bool> x y))
+(NeqB ...) => (XOR ...)
+(Not x) => (XORconst [1] x)
+
+// constants
+(Const(32|16|8) [val]) => (MOVWconst [int32(val)])
+(Const(32|64)F ...) => (MOV(F|D)const ...)
+(ConstNil) => (MOVWconst [0])
+(ConstBool [t]) => (MOVWconst [b2i32(t)])
+
+// truncations
+// Because we ignore high parts of registers, truncates are just copies.
+(Trunc16to8 ...) => (Copy ...)
+(Trunc32to8 ...) => (Copy ...)
+(Trunc32to16 ...) => (Copy ...)
+
+// Zero-/Sign-extensions
+(ZeroExt8to16 ...) => (MOVBUreg ...)
+(ZeroExt8to32 ...) => (MOVBUreg ...)
+(ZeroExt16to32 ...) => (MOVHUreg ...)
+
+(SignExt8to16 ...) => (MOVBreg ...)
+(SignExt8to32 ...) => (MOVBreg ...)
+(SignExt16to32 ...) => (MOVHreg ...)
+
+(Signmask x) => (SRAconst x [31])
+(Zeromask x) => (NEG (SGTU x (MOVWconst [0])))
+(Slicemask <t> x) => (SRAconst (NEG <t> x) [31])
+
+// float-int conversion
+(Cvt32to(32|64)F ...) => (MOVW(F|D) ...)
+(Cvt(32|64)Fto32 ...) => (TRUNC(F|D)W ...)
+(Cvt32Fto64F ...) => (MOVFD ...)
+(Cvt64Fto32F ...) => (MOVDF ...)
+
+(CvtBoolToUint8 ...) => (Copy ...)
+
+(Round(32|64)F ...) => (Copy ...)
+
+// comparisons
+// Integer comparisons produce a 0/1 value. (SGT a b) and (SGTU a b) test
+// a > b (signed / unsigned), and (SGTUconst [c] v) tests c > v unsigned.
+// Sub-word operands are extended to 32 bits first, with the signedness of
+// the comparison, because the upper register bits are not meaningful.
+
+// Equality: x^y is zero exactly when x == y, and 1 > v (unsigned) holds only for v == 0.
+(Eq8 x y) => (SGTUconst [1] (XOR (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Eq16 x y) => (SGTUconst [1] (XOR (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Eq32 x y) => (SGTUconst [1] (XOR x y))
+(EqPtr x y) => (SGTUconst [1] (XOR x y))
+(Eq(32|64)F x y) => (FPFlagTrue (CMPEQ(F|D) x y))
+
+// Inequality: x^y > 0 (unsigned) holds exactly when x != y.
+(Neq8 x y) => (SGTU (XOR (ZeroExt8to32 x) (ZeroExt8to32 y)) (MOVWconst [0]))
+(Neq16 x y) => (SGTU (XOR (ZeroExt16to32 x) (ZeroExt16to32 y)) (MOVWconst [0]))
+(Neq32 x y) => (SGTU (XOR x y) (MOVWconst [0]))
+(NeqPtr x y) => (SGTU (XOR x y) (MOVWconst [0]))
+(Neq(32|64)F x y) => (FPFlagFalse (CMPEQ(F|D) x y))
+
+// x < y is computed as y > x.
+(Less8 x y) => (SGT (SignExt8to32 y) (SignExt8to32 x))
+(Less16 x y) => (SGT (SignExt16to32 y) (SignExt16to32 x))
+(Less32 x y) => (SGT y x)
+(Less(32|64)F x y) => (FPFlagTrue (CMPGT(F|D) y x)) // reverse operands to work around NaN
+
+(Less8U x y) => (SGTU (ZeroExt8to32 y) (ZeroExt8to32 x))
+(Less16U x y) => (SGTU (ZeroExt16to32 y) (ZeroExt16to32 x))
+(Less32U x y) => (SGTU y x)
+
+// x <= y is computed as !(x > y); XORconst [1] flips the 0/1 result.
+(Leq8 x y) => (XORconst [1] (SGT (SignExt8to32 x) (SignExt8to32 y)))
+(Leq16 x y) => (XORconst [1] (SGT (SignExt16to32 x) (SignExt16to32 y)))
+(Leq32 x y) => (XORconst [1] (SGT x y))
+(Leq(32|64)F x y) => (FPFlagTrue (CMPGE(F|D) y x)) // reverse operands to work around NaN
+
+(Leq8U x y) => (XORconst [1] (SGTU (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Leq16U x y) => (XORconst [1] (SGTU (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Leq32U x y) => (XORconst [1] (SGTU x y))
+
+(OffPtr [off] ptr:(SP)) => (MOVWaddr [int32(off)] ptr)
+(OffPtr [off] ptr) => (ADDconst [int32(off)] ptr)
+
+(Addr {sym} base) => (MOVWaddr {sym} base)
+(LocalAddr {sym} base _) => (MOVWaddr {sym} base)
+
+// loads
+(Load <t> ptr mem) && t.IsBoolean() => (MOVBUload ptr mem)
+(Load <t> ptr mem) && (is8BitInt(t) && isSigned(t)) => (MOVBload ptr mem)
+(Load <t> ptr mem) && (is8BitInt(t) && !isSigned(t)) => (MOVBUload ptr mem)
+(Load <t> ptr mem) && (is16BitInt(t) && isSigned(t)) => (MOVHload ptr mem)
+(Load <t> ptr mem) && (is16BitInt(t) && !isSigned(t)) => (MOVHUload ptr mem)
+(Load <t> ptr mem) && (is32BitInt(t) || isPtr(t)) => (MOVWload ptr mem)
+(Load <t> ptr mem) && is32BitFloat(t) => (MOVFload ptr mem)
+(Load <t> ptr mem) && is64BitFloat(t) => (MOVDload ptr mem)
+
+// stores
+(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && !is32BitFloat(val.Type) => (MOVWstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (MOVFstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (MOVDstore ptr val mem)
+
+// zero instructions
+(Zero [0] _ mem) => mem
+(Zero [1] ptr mem) => (MOVBstore ptr (MOVWconst [0]) mem)
+(Zero [2] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore ptr (MOVWconst [0]) mem)
+(Zero [2] ptr mem) =>
+ (MOVBstore [1] ptr (MOVWconst [0])
+ (MOVBstore [0] ptr (MOVWconst [0]) mem))
+(Zero [4] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore ptr (MOVWconst [0]) mem)
+(Zero [4] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [2] ptr (MOVWconst [0])
+ (MOVHstore [0] ptr (MOVWconst [0]) mem))
+(Zero [4] ptr mem) =>
+ (MOVBstore [3] ptr (MOVWconst [0])
+ (MOVBstore [2] ptr (MOVWconst [0])
+ (MOVBstore [1] ptr (MOVWconst [0])
+ (MOVBstore [0] ptr (MOVWconst [0]) mem))))
+(Zero [3] ptr mem) =>
+ (MOVBstore [2] ptr (MOVWconst [0])
+ (MOVBstore [1] ptr (MOVWconst [0])
+ (MOVBstore [0] ptr (MOVWconst [0]) mem)))
+(Zero [6] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [4] ptr (MOVWconst [0])
+ (MOVHstore [2] ptr (MOVWconst [0])
+ (MOVHstore [0] ptr (MOVWconst [0]) mem)))
+(Zero [8] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [4] ptr (MOVWconst [0])
+ (MOVWstore [0] ptr (MOVWconst [0]) mem))
+(Zero [12] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [8] ptr (MOVWconst [0])
+ (MOVWstore [4] ptr (MOVWconst [0])
+ (MOVWstore [0] ptr (MOVWconst [0]) mem)))
+(Zero [16] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [12] ptr (MOVWconst [0])
+ (MOVWstore [8] ptr (MOVWconst [0])
+ (MOVWstore [4] ptr (MOVWconst [0])
+ (MOVWstore [0] ptr (MOVWconst [0]) mem))))
+
+// large or unaligned zeroing uses a loop
+(Zero [s] {t} ptr mem)
+ && (s > 16 || t.Alignment()%4 != 0) =>
+ (LoweredZero [int32(t.Alignment())]
+ ptr
+ (ADDconst <ptr.Type> ptr [int32(s-moveSize(t.Alignment(), config))])
+ mem)
+
+// moves
+(Move [0] _ _ mem) => mem
+(Move [1] dst src mem) => (MOVBstore dst (MOVBUload src mem) mem)
+(Move [2] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore dst (MOVHUload src mem) mem)
+(Move [2] dst src mem) =>
+ (MOVBstore [1] dst (MOVBUload [1] src mem)
+ (MOVBstore dst (MOVBUload src mem) mem))
+(Move [4] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore dst (MOVWload src mem) mem)
+(Move [4] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [2] dst (MOVHUload [2] src mem)
+ (MOVHstore dst (MOVHUload src mem) mem))
+(Move [4] dst src mem) =>
+ (MOVBstore [3] dst (MOVBUload [3] src mem)
+ (MOVBstore [2] dst (MOVBUload [2] src mem)
+ (MOVBstore [1] dst (MOVBUload [1] src mem)
+ (MOVBstore dst (MOVBUload src mem) mem))))
+(Move [3] dst src mem) =>
+ (MOVBstore [2] dst (MOVBUload [2] src mem)
+ (MOVBstore [1] dst (MOVBUload [1] src mem)
+ (MOVBstore dst (MOVBUload src mem) mem)))
+(Move [8] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [4] dst (MOVWload [4] src mem)
+ (MOVWstore dst (MOVWload src mem) mem))
+(Move [8] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [6] dst (MOVHload [6] src mem)
+ (MOVHstore [4] dst (MOVHload [4] src mem)
+ (MOVHstore [2] dst (MOVHload [2] src mem)
+ (MOVHstore dst (MOVHload src mem) mem))))
+(Move [6] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [4] dst (MOVHload [4] src mem)
+ (MOVHstore [2] dst (MOVHload [2] src mem)
+ (MOVHstore dst (MOVHload src mem) mem)))
+(Move [12] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [8] dst (MOVWload [8] src mem)
+ (MOVWstore [4] dst (MOVWload [4] src mem)
+ (MOVWstore dst (MOVWload src mem) mem)))
+(Move [16] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [12] dst (MOVWload [12] src mem)
+ (MOVWstore [8] dst (MOVWload [8] src mem)
+ (MOVWstore [4] dst (MOVWload [4] src mem)
+ (MOVWstore dst (MOVWload src mem) mem))))
+
+
+// large or unaligned move uses a loop
+(Move [s] {t} dst src mem)
+ && (s > 16 && logLargeCopy(v, s) || t.Alignment()%4 != 0) =>
+ (LoweredMove [int32(t.Alignment())]
+ dst
+ src
+ (ADDconst <src.Type> src [int32(s-moveSize(t.Alignment(), config))])
+ mem)
+
+// calls
+(StaticCall ...) => (CALLstatic ...)
+(ClosureCall ...) => (CALLclosure ...)
+(InterCall ...) => (CALLinter ...)
+(TailCall ...) => (CALLtail ...)
+
+// atomic intrinsics
+(AtomicLoad(8|32) ...) => (LoweredAtomicLoad(8|32) ...)
+(AtomicLoadPtr ...) => (LoweredAtomicLoad32 ...)
+
+(AtomicStore(8|32) ...) => (LoweredAtomicStore(8|32) ...)
+(AtomicStorePtrNoWB ...) => (LoweredAtomicStore32 ...)
+
+(AtomicExchange32 ...) => (LoweredAtomicExchange ...)
+(AtomicAdd32 ...) => (LoweredAtomicAdd ...)
+
+(AtomicCompareAndSwap32 ...) => (LoweredAtomicCas ...)
+
+// AtomicOr8(ptr,val) => LoweredAtomicOr(ptr&^3,uint32(val) << ((ptr & 3) * 8))
+(AtomicOr8 ptr val mem) && !config.BigEndian =>
+ (LoweredAtomicOr (AND <typ.UInt32Ptr> (MOVWconst [^3]) ptr)
+ (SLL <typ.UInt32> (ZeroExt8to32 val)
+ (SLLconst <typ.UInt32> [3]
+ (ANDconst <typ.UInt32> [3] ptr))) mem)
+
+// AtomicAnd8(ptr,val) => LoweredAtomicAnd(ptr&^3,(uint32(val) << ((ptr & 3) * 8)) | ^(uint32(0xFF) << ((ptr & 3) * 8)))
+(AtomicAnd8 ptr val mem) && !config.BigEndian =>
+ (LoweredAtomicAnd (AND <typ.UInt32Ptr> (MOVWconst [^3]) ptr)
+ (OR <typ.UInt32> (SLL <typ.UInt32> (ZeroExt8to32 val)
+ (SLLconst <typ.UInt32> [3]
+ (ANDconst <typ.UInt32> [3] ptr)))
+ (NORconst [0] <typ.UInt32> (SLL <typ.UInt32>
+ (MOVWconst [0xff]) (SLLconst <typ.UInt32> [3]
+ (ANDconst <typ.UInt32> [3] ptr))))) mem)
+
+// AtomicOr8(ptr,val) => LoweredAtomicOr(ptr&^3,uint32(val) << (((ptr^3) & 3) * 8))
+(AtomicOr8 ptr val mem) && config.BigEndian =>
+ (LoweredAtomicOr (AND <typ.UInt32Ptr> (MOVWconst [^3]) ptr)
+ (SLL <typ.UInt32> (ZeroExt8to32 val)
+ (SLLconst <typ.UInt32> [3]
+ (ANDconst <typ.UInt32> [3]
+ (XORconst <typ.UInt32> [3] ptr)))) mem)
+
+// AtomicAnd8(ptr,val) => LoweredAtomicAnd(ptr&^3,(uint32(val) << (((ptr^3) & 3) * 8)) | ^(uint32(0xFF) << (((ptr^3) & 3) * 8)))
+(AtomicAnd8 ptr val mem) && config.BigEndian =>
+ (LoweredAtomicAnd (AND <typ.UInt32Ptr> (MOVWconst [^3]) ptr)
+ (OR <typ.UInt32> (SLL <typ.UInt32> (ZeroExt8to32 val)
+ (SLLconst <typ.UInt32> [3]
+ (ANDconst <typ.UInt32> [3]
+ (XORconst <typ.UInt32> [3] ptr))))
+ (NORconst [0] <typ.UInt32> (SLL <typ.UInt32>
+ (MOVWconst [0xff]) (SLLconst <typ.UInt32> [3]
+ (ANDconst <typ.UInt32> [3]
+ (XORconst <typ.UInt32> [3] ptr)))))) mem)
+
+(AtomicAnd32 ...) => (LoweredAtomicAnd ...)
+(AtomicOr32 ...) => (LoweredAtomicOr ...)
+
+
+// checks
+(NilCheck ...) => (LoweredNilCheck ...)
+(IsNonNil ptr) => (SGTU ptr (MOVWconst [0]))
+(IsInBounds idx len) => (SGTU len idx)
+(IsSliceInBounds idx len) => (XORconst [1] (SGTU idx len))
+
+// pseudo-ops
+(GetClosurePtr ...) => (LoweredGetClosurePtr ...)
+(GetCallerSP ...) => (LoweredGetCallerSP ...)
+(GetCallerPC ...) => (LoweredGetCallerPC ...)
+
+(If cond yes no) => (NE cond yes no)
+
+// Write barrier.
+(WB ...) => (LoweredWB ...)
+
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem)
+
+(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 0 => (LoweredPanicExtendA [kind] hi lo y mem)
+(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 1 => (LoweredPanicExtendB [kind] hi lo y mem)
+(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 2 => (LoweredPanicExtendC [kind] hi lo y mem)
+
+// Optimizations
+
+// Absorb boolean tests into block
+(NE (FPFlagTrue cmp) yes no) => (FPT cmp yes no)
+(NE (FPFlagFalse cmp) yes no) => (FPF cmp yes no)
+(EQ (FPFlagTrue cmp) yes no) => (FPF cmp yes no)
+(EQ (FPFlagFalse cmp) yes no) => (FPT cmp yes no)
+(NE (XORconst [1] cmp:(SGT _ _)) yes no) => (EQ cmp yes no)
+(NE (XORconst [1] cmp:(SGTU _ _)) yes no) => (EQ cmp yes no)
+(NE (XORconst [1] cmp:(SGTconst _)) yes no) => (EQ cmp yes no)
+(NE (XORconst [1] cmp:(SGTUconst _)) yes no) => (EQ cmp yes no)
+(NE (XORconst [1] cmp:(SGTzero _)) yes no) => (EQ cmp yes no)
+(NE (XORconst [1] cmp:(SGTUzero _)) yes no) => (EQ cmp yes no)
+(EQ (XORconst [1] cmp:(SGT _ _)) yes no) => (NE cmp yes no)
+(EQ (XORconst [1] cmp:(SGTU _ _)) yes no) => (NE cmp yes no)
+(EQ (XORconst [1] cmp:(SGTconst _)) yes no) => (NE cmp yes no)
+(EQ (XORconst [1] cmp:(SGTUconst _)) yes no) => (NE cmp yes no)
+(EQ (XORconst [1] cmp:(SGTzero _)) yes no) => (NE cmp yes no)
+(EQ (XORconst [1] cmp:(SGTUzero _)) yes no) => (NE cmp yes no)
+(NE (SGTUconst [1] x) yes no) => (EQ x yes no)
+(EQ (SGTUconst [1] x) yes no) => (NE x yes no)
+(NE (SGTUzero x) yes no) => (NE x yes no)
+(EQ (SGTUzero x) yes no) => (EQ x yes no)
+(NE (SGTconst [0] x) yes no) => (LTZ x yes no)
+(EQ (SGTconst [0] x) yes no) => (GEZ x yes no)
+(NE (SGTzero x) yes no) => (GTZ x yes no)
+(EQ (SGTzero x) yes no) => (LEZ x yes no)
+
+// fold offset into address
+(ADDconst [off1] (MOVWaddr [off2] {sym} ptr)) => (MOVWaddr [off1+off2] {sym} ptr)
+
+// fold address into load/store
+(MOVBload [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVBload [off1+off2] {sym} ptr mem)
+(MOVBUload [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVBUload [off1+off2] {sym} ptr mem)
+(MOVHload [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVHload [off1+off2] {sym} ptr mem)
+(MOVHUload [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVHUload [off1+off2] {sym} ptr mem)
+(MOVWload [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVWload [off1+off2] {sym} ptr mem)
+(MOVFload [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVFload [off1+off2] {sym} ptr mem)
+(MOVDload [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVDload [off1+off2] {sym} ptr mem)
+
+(MOVBstore [off1] {sym} x:(ADDconst [off2] ptr) val mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVBstore [off1+off2] {sym} ptr val mem)
+(MOVHstore [off1] {sym} x:(ADDconst [off2] ptr) val mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVHstore [off1+off2] {sym} ptr val mem)
+(MOVWstore [off1] {sym} x:(ADDconst [off2] ptr) val mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVWstore [off1+off2] {sym} ptr val mem)
+(MOVFstore [off1] {sym} x:(ADDconst [off2] ptr) val mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVFstore [off1+off2] {sym} ptr val mem)
+(MOVDstore [off1] {sym} x:(ADDconst [off2] ptr) val mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVDstore [off1+off2] {sym} ptr val mem)
+
+(MOVBstorezero [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVBstorezero [off1+off2] {sym} ptr mem)
+(MOVHstorezero [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVHstorezero [off1+off2] {sym} ptr mem)
+(MOVWstorezero [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVWstorezero [off1+off2] {sym} ptr mem)
+
+(MOVBload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVBload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVBUload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVBUload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVHload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHUload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVHUload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVWload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVFload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVFload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVDload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVDload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+
+(MOVBstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) =>
+ (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVHstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) =>
+ (MOVHstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVWstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) =>
+ (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVFstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) =>
+ (MOVFstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVDstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) =>
+ (MOVDstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVBstorezero [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVBstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHstorezero [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVHstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWstorezero [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVWstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+
+// replace load from same location as preceding store with zero/sign extension (or copy in case of full width)
+(MOVBload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBreg x)
+(MOVBUload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBUreg x)
+(MOVHload [off] {sym} ptr (MOVHstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVHreg x)
+(MOVHUload [off] {sym} ptr (MOVHstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVHUreg x)
+(MOVWload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x
+(MOVFload [off] {sym} ptr (MOVFstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x
+(MOVDload [off] {sym} ptr (MOVDstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x
+
+// store zero
+(MOVBstore [off] {sym} ptr (MOVWconst [0]) mem) => (MOVBstorezero [off] {sym} ptr mem)
+(MOVHstore [off] {sym} ptr (MOVWconst [0]) mem) => (MOVHstorezero [off] {sym} ptr mem)
+(MOVWstore [off] {sym} ptr (MOVWconst [0]) mem) => (MOVWstorezero [off] {sym} ptr mem)
+
+// don't extend after proper load
+(MOVBreg x:(MOVBload _ _)) => (MOVWreg x)
+(MOVBUreg x:(MOVBUload _ _)) => (MOVWreg x)
+(MOVHreg x:(MOVBload _ _)) => (MOVWreg x)
+(MOVHreg x:(MOVBUload _ _)) => (MOVWreg x)
+(MOVHreg x:(MOVHload _ _)) => (MOVWreg x)
+(MOVHUreg x:(MOVBUload _ _)) => (MOVWreg x)
+(MOVHUreg x:(MOVHUload _ _)) => (MOVWreg x)
+
+// fold double extensions
+(MOVBreg x:(MOVBreg _)) => (MOVWreg x)
+(MOVBUreg x:(MOVBUreg _)) => (MOVWreg x)
+(MOVHreg x:(MOVBreg _)) => (MOVWreg x)
+(MOVHreg x:(MOVBUreg _)) => (MOVWreg x)
+(MOVHreg x:(MOVHreg _)) => (MOVWreg x)
+(MOVHUreg x:(MOVBUreg _)) => (MOVWreg x)
+(MOVHUreg x:(MOVHUreg _)) => (MOVWreg x)
+
+// sign extended loads
+// Note: The combined instruction must end up in the same block
+// as the original load. If not, we end up making a value with
+// memory type live in two different blocks, which can lead to
+// multiple memory values alive simultaneously.
+// Make sure we don't combine these ops if the load has another use.
+// This prevents a single load from being split into multiple loads
+// which then might return different values. See test/atomicload.go.
+(MOVBreg <t> x:(MOVBUload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBload <t> [off] {sym} ptr mem)
+(MOVBUreg <t> x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBUload <t> [off] {sym} ptr mem)
+(MOVHreg <t> x:(MOVHUload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVHload <t> [off] {sym} ptr mem)
+(MOVHUreg <t> x:(MOVHload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVHUload <t> [off] {sym} ptr mem)
+
+// fold extensions and ANDs together
+(MOVBUreg (ANDconst [c] x)) => (ANDconst [c&0xff] x)
+(MOVHUreg (ANDconst [c] x)) => (ANDconst [c&0xffff] x)
+(MOVBreg (ANDconst [c] x)) && c & 0x80 == 0 => (ANDconst [c&0x7f] x)
+(MOVHreg (ANDconst [c] x)) && c & 0x8000 == 0 => (ANDconst [c&0x7fff] x)
+
+// don't extend before store
+(MOVBstore [off] {sym} ptr (MOVBreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVWreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVWreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+
+// if a register move has only 1 use, just use the same register without emitting instruction
+// MOVWnop doesn't emit instruction, only for ensuring the type.
+(MOVWreg x) && x.Uses == 1 => (MOVWnop x)
+
+// TODO: we should be able to get rid of MOVWnop altogether.
+// But for now, this is enough to get rid of lots of them.
+(MOVWnop (MOVWconst [c])) => (MOVWconst [c])
+
+// fold constant into arithmetic ops
+(ADD x (MOVWconst [c])) => (ADDconst [c] x)
+(SUB x (MOVWconst [c])) => (SUBconst [c] x)
+(AND x (MOVWconst [c])) => (ANDconst [c] x)
+(OR x (MOVWconst [c])) => (ORconst [c] x)
+(XOR x (MOVWconst [c])) => (XORconst [c] x)
+(NOR x (MOVWconst [c])) => (NORconst [c] x)
+
+(SLL x (MOVWconst [c])) => (SLLconst x [c&31])
+(SRL x (MOVWconst [c])) => (SRLconst x [c&31])
+(SRA x (MOVWconst [c])) => (SRAconst x [c&31])
+
+(SGT (MOVWconst [c]) x) => (SGTconst [c] x)
+(SGTU (MOVWconst [c]) x) => (SGTUconst [c] x)
+(SGT x (MOVWconst [0])) => (SGTzero x)
+(SGTU x (MOVWconst [0])) => (SGTUzero x)
+
+// mul with constant
+// Select1 of MULTU is the low 32 bits of the unsigned 64-bit product and
+// Select0 is the high 32 bits (compare the MULTU constant-folding rules).
+// Multiplying by 0 gives 0 in both halves; multiplying by 1 gives x in the
+// low half and 0 in the high half.
+(Select1 (MULTU (MOVWconst [0]) _ )) => (MOVWconst [0])
+(Select0 (MULTU (MOVWconst [0]) _ )) => (MOVWconst [0])
+(Select1 (MULTU (MOVWconst [1]) x )) => x
+(Select0 (MULTU (MOVWconst [1]) _ )) => (MOVWconst [0])
+// -1 is 0xffffffff as an unsigned factor: the low half of the product is -x,
+// and the high half is x-1 for x != 0 and 0 for x == 0.
+// NOTE(review): this reads CMOVZ a b c as "b if c == 0, else a", which matches
+// its use in the shift rules -- confirm against the op definition.
+(Select1 (MULTU (MOVWconst [-1]) x )) => (NEG <x.Type> x)
+(Select0 (MULTU (MOVWconst [-1]) x )) => (CMOVZ (ADDconst <x.Type> [-1] x) (MOVWconst [0]) x)
+// For a power of two 2^k the low half is x << k and the high half is x >> (32-k).
+(Select1 (MULTU (MOVWconst [c]) x )) && isPowerOfTwo64(int64(uint32(c))) => (SLLconst [int32(log2uint32(int64(c)))] x)
+(Select0 (MULTU (MOVWconst [c]) x )) && isPowerOfTwo64(int64(uint32(c))) => (SRLconst [int32(32-log2uint32(int64(c)))] x)
+
+(MUL (MOVWconst [0]) _ ) => (MOVWconst [0])
+(MUL (MOVWconst [1]) x ) => x
+(MUL (MOVWconst [-1]) x ) => (NEG x)
+(MUL (MOVWconst [c]) x ) && isPowerOfTwo64(int64(uint32(c))) => (SLLconst [int32(log2uint32(int64(c)))] x)
+
+// generic simplifications
+(ADD x (NEG y)) => (SUB x y)
+(SUB x x) => (MOVWconst [0])
+(SUB (MOVWconst [0]) x) => (NEG x)
+(AND x x) => x
+(OR x x) => x
+(XOR x x) => (MOVWconst [0])
+
+// miscellaneous patterns generated by dec64
+(AND (SGTUconst [1] x) (SGTUconst [1] y)) => (SGTUconst [1] (OR <x.Type> x y))
+(OR (SGTUzero x) (SGTUzero y)) => (SGTUzero (OR <x.Type> x y))
+
+// remove redundant *const ops
+(ADDconst [0] x) => x
+(SUBconst [0] x) => x
+(ANDconst [0] _) => (MOVWconst [0])
+(ANDconst [-1] x) => x
+(ORconst [0] x) => x
+(ORconst [-1] _) => (MOVWconst [-1])
+(XORconst [0] x) => x
+(XORconst [-1] x) => (NORconst [0] x)
+
+// generic constant folding
+(ADDconst [c] (MOVWconst [d])) => (MOVWconst [int32(c+d)])
+(ADDconst [c] (ADDconst [d] x)) => (ADDconst [c+d] x)
+(ADDconst [c] (SUBconst [d] x)) => (ADDconst [c-d] x)
+(SUBconst [c] (MOVWconst [d])) => (MOVWconst [d-c])
+(SUBconst [c] (SUBconst [d] x)) => (ADDconst [-c-d] x)
+(SUBconst [c] (ADDconst [d] x)) => (ADDconst [-c+d] x)
+(SLLconst [c] (MOVWconst [d])) => (MOVWconst [d<<uint32(c)])
+(SRLconst [c] (MOVWconst [d])) => (MOVWconst [int32(uint32(d)>>uint32(c))])
+(SRAconst [c] (MOVWconst [d])) => (MOVWconst [d>>uint32(c)])
+(MUL (MOVWconst [c]) (MOVWconst [d])) => (MOVWconst [c*d])
+(Select1 (MULTU (MOVWconst [c]) (MOVWconst [d]))) => (MOVWconst [int32(uint32(c)*uint32(d))])
+(Select0 (MULTU (MOVWconst [c]) (MOVWconst [d]))) => (MOVWconst [int32((int64(uint32(c))*int64(uint32(d)))>>32)])
+(Select1 (DIV (MOVWconst [c]) (MOVWconst [d]))) && d != 0 => (MOVWconst [c/d])
+(Select1 (DIVU (MOVWconst [c]) (MOVWconst [d]))) && d != 0 => (MOVWconst [int32(uint32(c)/uint32(d))])
+(Select0 (DIV (MOVWconst [c]) (MOVWconst [d]))) && d != 0 => (MOVWconst [c%d])
+(Select0 (DIVU (MOVWconst [c]) (MOVWconst [d]))) && d != 0 => (MOVWconst [int32(uint32(c)%uint32(d))])
+(ANDconst [c] (MOVWconst [d])) => (MOVWconst [c&d])
+(ANDconst [c] (ANDconst [d] x)) => (ANDconst [c&d] x)
+(ORconst [c] (MOVWconst [d])) => (MOVWconst [c|d])
+(ORconst [c] (ORconst [d] x)) => (ORconst [c|d] x)
+(XORconst [c] (MOVWconst [d])) => (MOVWconst [c^d])
+(XORconst [c] (XORconst [d] x)) => (XORconst [c^d] x)
+(NORconst [c] (MOVWconst [d])) => (MOVWconst [^(c|d)])
+(NEG (MOVWconst [c])) => (MOVWconst [-c])
+(MOVBreg (MOVWconst [c])) => (MOVWconst [int32(int8(c))])
+(MOVBUreg (MOVWconst [c])) => (MOVWconst [int32(uint8(c))])
+(MOVHreg (MOVWconst [c])) => (MOVWconst [int32(int16(c))])
+(MOVHUreg (MOVWconst [c])) => (MOVWconst [int32(uint16(c))])
+(MOVWreg (MOVWconst [c])) => (MOVWconst [c])
+
+// constant comparisons
+(SGTconst [c] (MOVWconst [d])) && c > d => (MOVWconst [1])
+(SGTconst [c] (MOVWconst [d])) && c <= d => (MOVWconst [0])
+(SGTUconst [c] (MOVWconst [d])) && uint32(c) > uint32(d) => (MOVWconst [1])
+(SGTUconst [c] (MOVWconst [d])) && uint32(c) <= uint32(d) => (MOVWconst [0])
+(SGTzero (MOVWconst [d])) && d > 0 => (MOVWconst [1])
+(SGTzero (MOVWconst [d])) && d <= 0 => (MOVWconst [0])
+(SGTUzero (MOVWconst [d])) && d != 0 => (MOVWconst [1])
+(SGTUzero (MOVWconst [d])) && d == 0 => (MOVWconst [0])
+
+// other known comparisons
+(SGTconst [c] (MOVBreg _)) && 0x7f < c => (MOVWconst [1])
+(SGTconst [c] (MOVBreg _)) && c <= -0x80 => (MOVWconst [0])
+(SGTconst [c] (MOVBUreg _)) && 0xff < c => (MOVWconst [1])
+(SGTconst [c] (MOVBUreg _)) && c < 0 => (MOVWconst [0])
+(SGTUconst [c] (MOVBUreg _)) && 0xff < uint32(c) => (MOVWconst [1])
+(SGTconst [c] (MOVHreg _)) && 0x7fff < c => (MOVWconst [1])
+(SGTconst [c] (MOVHreg _)) && c <= -0x8000 => (MOVWconst [0])
+(SGTconst [c] (MOVHUreg _)) && 0xffff < c => (MOVWconst [1])
+(SGTconst [c] (MOVHUreg _)) && c < 0 => (MOVWconst [0])
+(SGTUconst [c] (MOVHUreg _)) && 0xffff < uint32(c) => (MOVWconst [1])
+(SGTconst [c] (ANDconst [m] _)) && 0 <= m && m < c => (MOVWconst [1])
+(SGTUconst [c] (ANDconst [m] _)) && uint32(m) < uint32(c) => (MOVWconst [1])
+(SGTconst [c] (SRLconst _ [d])) && 0 <= c && uint32(d) <= 31 && 0xffffffff>>uint32(d) < uint32(c) => (MOVWconst [1])
+(SGTUconst [c] (SRLconst _ [d])) && uint32(d) <= 31 && 0xffffffff>>uint32(d) < uint32(c) => (MOVWconst [1])
+
+// absorb constants into branches
+(EQ (MOVWconst [0]) yes no) => (First yes no)
+(EQ (MOVWconst [c]) yes no) && c != 0 => (First no yes)
+(NE (MOVWconst [0]) yes no) => (First no yes)
+(NE (MOVWconst [c]) yes no) && c != 0 => (First yes no)
+(LTZ (MOVWconst [c]) yes no) && c < 0 => (First yes no)
+(LTZ (MOVWconst [c]) yes no) && c >= 0 => (First no yes)
+(LEZ (MOVWconst [c]) yes no) && c <= 0 => (First yes no)
+(LEZ (MOVWconst [c]) yes no) && c > 0 => (First no yes)
+(GTZ (MOVWconst [c]) yes no) && c > 0 => (First yes no)
+(GTZ (MOVWconst [c]) yes no) && c <= 0 => (First no yes)
+(GEZ (MOVWconst [c]) yes no) && c >= 0 => (First yes no)
+(GEZ (MOVWconst [c]) yes no) && c < 0 => (First no yes)
+
+// conditional move: (CMOVZ a f c) yields f when c == 0 and a otherwise; (CMOVZzero a c) is the special case f == 0
+(CMOVZ _ f (MOVWconst [0])) => f
+(CMOVZ a _ (MOVWconst [c])) && c!=0 => a
+(CMOVZzero _ (MOVWconst [0])) => (MOVWconst [0])
+(CMOVZzero a (MOVWconst [c])) && c!=0 => a
+(CMOVZ a (MOVWconst [0]) c) => (CMOVZzero a c)
+
+// atomic
+(LoweredAtomicStore32 ptr (MOVWconst [0]) mem) => (LoweredAtomicStorezero ptr mem)
+(LoweredAtomicAdd ptr (MOVWconst [c]) mem) && is16Bit(int64(c)) => (LoweredAtomicAddconst [c] ptr mem)
+
diff --git a/src/cmd/compile/internal/ssa/_gen/MIPS64.rules b/src/cmd/compile/internal/ssa/_gen/MIPS64.rules
new file mode 100644
index 0000000..a594df2
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/MIPS64.rules
@@ -0,0 +1,691 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+(Add(Ptr|64|32|16|8) ...) => (ADDV ...)
+(Add(32|64)F ...) => (ADD(F|D) ...)
+
+(Sub(Ptr|64|32|16|8) ...) => (SUBV ...)
+(Sub(32|64)F ...) => (SUB(F|D) ...)
+
+(Mul(64|32|16|8) x y) => (Select1 (MULVU x y))
+(Mul(32|64)F ...) => (MUL(F|D) ...)
+(Mul64uhilo ...) => (MULVU ...)
+(Select0 (Mul64uover x y)) => (Select1 <typ.UInt64> (MULVU x y))
+(Select1 (Mul64uover x y)) => (SGTU <typ.Bool> (Select0 <typ.UInt64> (MULVU x y)) (MOVVconst <typ.UInt64> [0]))
+
+(Hmul64 x y) => (Select0 (MULV x y))
+(Hmul64u x y) => (Select0 (MULVU x y))
+(Hmul32 x y) => (SRAVconst (Select1 <typ.Int64> (MULV (SignExt32to64 x) (SignExt32to64 y))) [32])
+(Hmul32u x y) => (SRLVconst (Select1 <typ.UInt64> (MULVU (ZeroExt32to64 x) (ZeroExt32to64 y))) [32])
+
+(Div64 x y) => (Select1 (DIVV x y))
+(Div64u x y) => (Select1 (DIVVU x y))
+(Div32 x y) => (Select1 (DIVV (SignExt32to64 x) (SignExt32to64 y)))
+(Div32u x y) => (Select1 (DIVVU (ZeroExt32to64 x) (ZeroExt32to64 y)))
+(Div16 x y) => (Select1 (DIVV (SignExt16to64 x) (SignExt16to64 y)))
+(Div16u x y) => (Select1 (DIVVU (ZeroExt16to64 x) (ZeroExt16to64 y)))
+(Div8 x y) => (Select1 (DIVV (SignExt8to64 x) (SignExt8to64 y)))
+(Div8u x y) => (Select1 (DIVVU (ZeroExt8to64 x) (ZeroExt8to64 y)))
+(Div(32|64)F ...) => (DIV(F|D) ...)
+
+(Mod64 x y) => (Select0 (DIVV x y))
+(Mod64u x y) => (Select0 (DIVVU x y))
+(Mod32 x y) => (Select0 (DIVV (SignExt32to64 x) (SignExt32to64 y)))
+(Mod32u x y) => (Select0 (DIVVU (ZeroExt32to64 x) (ZeroExt32to64 y)))
+(Mod16 x y) => (Select0 (DIVV (SignExt16to64 x) (SignExt16to64 y)))
+(Mod16u x y) => (Select0 (DIVVU (ZeroExt16to64 x) (ZeroExt16to64 y)))
+(Mod8 x y) => (Select0 (DIVV (SignExt8to64 x) (SignExt8to64 y)))
+(Mod8u x y) => (Select0 (DIVVU (ZeroExt8to64 x) (ZeroExt8to64 y)))
+
+// Avg64u: (x + y) / 2 with x >= y is computed as (x - y) / 2 + y, so the sum x + y (which could overflow 64 bits) is never formed.
+(Avg64u <t> x y) => (ADDV (SRLVconst <t> (SUBV <t> x y) [1]) y)
+
+(And(64|32|16|8) ...) => (AND ...)
+(Or(64|32|16|8) ...) => (OR ...)
+(Xor(64|32|16|8) ...) => (XOR ...)
+
+// shifts
+// The hardware shift instructions use only the low 6 bits of the shift amount.
+// To get Go semantics for large shifts, compare the amount to 64: NEGV of (64 >u y) is all ones when y < 64 and zero otherwise, and is ANDed with the shift result.
+(Lsh64x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y))
+(Lsh64x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y)))
+(Lsh64x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y)))
+(Lsh64x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y)))
+
+(Lsh32x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y))
+(Lsh32x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y)))
+(Lsh32x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y)))
+(Lsh32x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y)))
+
+(Lsh16x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y))
+(Lsh16x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y)))
+(Lsh16x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y)))
+(Lsh16x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y)))
+
+(Lsh8x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y))
+(Lsh8x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y)))
+(Lsh8x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y)))
+(Lsh8x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y)))
+
+(Rsh64Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> x y))
+(Rsh64Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> x (ZeroExt32to64 y)))
+(Rsh64Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> x (ZeroExt16to64 y)))
+(Rsh64Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> x (ZeroExt8to64 y)))
+
+(Rsh32Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> (ZeroExt32to64 x) y))
+(Rsh32Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> (ZeroExt32to64 x) (ZeroExt32to64 y)))
+(Rsh32Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> (ZeroExt32to64 x) (ZeroExt16to64 y)))
+(Rsh32Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> (ZeroExt32to64 x) (ZeroExt8to64 y)))
+
+(Rsh16Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> (ZeroExt16to64 x) y))
+(Rsh16Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> (ZeroExt16to64 x) (ZeroExt32to64 y)))
+(Rsh16Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> (ZeroExt16to64 x) (ZeroExt16to64 y)))
+(Rsh16Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> (ZeroExt16to64 x) (ZeroExt8to64 y)))
+
+(Rsh8Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> (ZeroExt8to64 x) y))
+(Rsh8Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> (ZeroExt8to64 x) (ZeroExt32to64 y)))
+(Rsh8Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> (ZeroExt8to64 x) (ZeroExt16to64 y)))
+(Rsh8Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> (ZeroExt8to64 x) (ZeroExt8to64 y)))
+
+(Rsh64x64 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y)) // when y >u 63 the amount is ORed with all ones, so its low 6 bits are 63
+(Rsh64x32 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y)))
+(Rsh64x16 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y)))
+(Rsh64x8 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y)))
+
+(Rsh32x64 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y))
+(Rsh32x32 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y)))
+(Rsh32x16 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y)))
+(Rsh32x8 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y)))
+
+(Rsh16x64 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y))
+(Rsh16x32 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y)))
+(Rsh16x16 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y)))
+(Rsh16x8 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y)))
+
+(Rsh8x64 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y))
+(Rsh8x32 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y)))
+(Rsh8x16 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y)))
+(Rsh8x8 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y)))
+
+// rotates by a constant c: OR a left shift by c with an unsigned right shift by -c, both amounts reduced with c&(width-1)
+(RotateLeft8 <t> x (MOVVconst [c])) => (Or8 (Lsh8x64 <t> x (MOVVconst [c&7])) (Rsh8Ux64 <t> x (MOVVconst [-c&7])))
+(RotateLeft16 <t> x (MOVVconst [c])) => (Or16 (Lsh16x64 <t> x (MOVVconst [c&15])) (Rsh16Ux64 <t> x (MOVVconst [-c&15])))
+(RotateLeft32 <t> x (MOVVconst [c])) => (Or32 (Lsh32x64 <t> x (MOVVconst [c&31])) (Rsh32Ux64 <t> x (MOVVconst [-c&31])))
+(RotateLeft64 <t> x (MOVVconst [c])) => (Or64 (Lsh64x64 <t> x (MOVVconst [c&63])) (Rsh64Ux64 <t> x (MOVVconst [-c&63])))
+
+// unary ops
+(Neg(64|32|16|8) ...) => (NEGV ...)
+(Neg(32|64)F ...) => (NEG(F|D) ...)
+
+(Com(64|32|16|8) x) => (NOR (MOVVconst [0]) x)
+
+(Sqrt ...) => (SQRTD ...)
+(Sqrt32 ...) => (SQRTF ...)
+
+// boolean ops -- booleans are represented with 0=false, 1=true
+(AndB ...) => (AND ...)
+(OrB ...) => (OR ...)
+(EqB x y) => (XOR (MOVVconst [1]) (XOR <typ.Bool> x y)) // x == y is the complement of x ^ y; XOR with 1 flips the 0/1 value
+(NeqB ...) => (XOR ...)
+(Not x) => (XORconst [1] x) // !x flips the 0/1 value
+
+// constants
+(Const(64|32|16|8) [val]) => (MOVVconst [int64(val)])
+(Const(32|64)F [val]) => (MOV(F|D)const [float64(val)])
+(ConstNil) => (MOVVconst [0])
+(ConstBool [t]) => (MOVVconst [int64(b2i(t))])
+
+(Slicemask <t> x) => (SRAVconst (NEGV <t> x) [63])
+
+// truncations
+// Because we ignore high parts of registers, truncates are just copies.
+(Trunc16to8 ...) => (Copy ...)
+(Trunc32to8 ...) => (Copy ...)
+(Trunc32to16 ...) => (Copy ...)
+(Trunc64to8 ...) => (Copy ...)
+(Trunc64to16 ...) => (Copy ...)
+(Trunc64to32 ...) => (Copy ...)
+
+// Zero-/Sign-extensions
+(ZeroExt8to16 ...) => (MOVBUreg ...)
+(ZeroExt8to32 ...) => (MOVBUreg ...)
+(ZeroExt16to32 ...) => (MOVHUreg ...)
+(ZeroExt8to64 ...) => (MOVBUreg ...)
+(ZeroExt16to64 ...) => (MOVHUreg ...)
+(ZeroExt32to64 ...) => (MOVWUreg ...)
+
+(SignExt8to16 ...) => (MOVBreg ...)
+(SignExt8to32 ...) => (MOVBreg ...)
+(SignExt16to32 ...) => (MOVHreg ...)
+(SignExt8to64 ...) => (MOVBreg ...)
+(SignExt16to64 ...) => (MOVHreg ...)
+(SignExt32to64 ...) => (MOVWreg ...)
+
+// float <=> int conversion
+(Cvt32to32F ...) => (MOVWF ...)
+(Cvt32to64F ...) => (MOVWD ...)
+(Cvt64to32F ...) => (MOVVF ...)
+(Cvt64to64F ...) => (MOVVD ...)
+(Cvt32Fto32 ...) => (TRUNCFW ...)
+(Cvt64Fto32 ...) => (TRUNCDW ...)
+(Cvt32Fto64 ...) => (TRUNCFV ...)
+(Cvt64Fto64 ...) => (TRUNCDV ...)
+(Cvt32Fto64F ...) => (MOVFD ...)
+(Cvt64Fto32F ...) => (MOVDF ...)
+
+(CvtBoolToUint8 ...) => (Copy ...)
+
+(Round(32|64)F ...) => (Copy ...)
+
+// comparisons (integer equality is lowered to 1 >u (x ^ y), which holds exactly when the XOR of the operands is 0)
+(Eq8 x y) => (SGTU (MOVVconst [1]) (XOR (ZeroExt8to64 x) (ZeroExt8to64 y)))
+(Eq16 x y) => (SGTU (MOVVconst [1]) (XOR (ZeroExt16to64 x) (ZeroExt16to64 y)))
+(Eq32 x y) => (SGTU (MOVVconst [1]) (XOR (ZeroExt32to64 x) (ZeroExt32to64 y)))
+(Eq64 x y) => (SGTU (MOVVconst [1]) (XOR x y))
+(EqPtr x y) => (SGTU (MOVVconst [1]) (XOR x y))
+(Eq(32|64)F x y) => (FPFlagTrue (CMPEQ(F|D) x y))
+
+(Neq8 x y) => (SGTU (XOR (ZeroExt8to64 x) (ZeroExt8to64 y)) (MOVVconst [0])) // x != y is lowered to (x ^ y) >u 0
+(Neq16 x y) => (SGTU (XOR (ZeroExt16to64 x) (ZeroExt16to64 y)) (MOVVconst [0])) // both operands extended to 64 bits, matching Eq16
+(Neq32 x y) => (SGTU (XOR (ZeroExt32to64 x) (ZeroExt32to64 y)) (MOVVconst [0]))
+(Neq64 x y) => (SGTU (XOR x y) (MOVVconst [0]))
+(NeqPtr x y) => (SGTU (XOR x y) (MOVVconst [0]))
+(Neq(32|64)F x y) => (FPFlagFalse (CMPEQ(F|D) x y))
+
+(Less8 x y) => (SGT (SignExt8to64 y) (SignExt8to64 x))
+(Less16 x y) => (SGT (SignExt16to64 y) (SignExt16to64 x))
+(Less32 x y) => (SGT (SignExt32to64 y) (SignExt32to64 x))
+(Less64 x y) => (SGT y x)
+(Less(32|64)F x y) => (FPFlagTrue (CMPGT(F|D) y x)) // reverse operands to work around NaN
+
+(Less8U x y) => (SGTU (ZeroExt8to64 y) (ZeroExt8to64 x))
+(Less16U x y) => (SGTU (ZeroExt16to64 y) (ZeroExt16to64 x))
+(Less32U x y) => (SGTU (ZeroExt32to64 y) (ZeroExt32to64 x))
+(Less64U x y) => (SGTU y x)
+
+(Leq8 x y) => (XOR (MOVVconst [1]) (SGT (SignExt8to64 x) (SignExt8to64 y)))
+(Leq16 x y) => (XOR (MOVVconst [1]) (SGT (SignExt16to64 x) (SignExt16to64 y)))
+(Leq32 x y) => (XOR (MOVVconst [1]) (SGT (SignExt32to64 x) (SignExt32to64 y)))
+(Leq64 x y) => (XOR (MOVVconst [1]) (SGT x y))
+(Leq(32|64)F x y) => (FPFlagTrue (CMPGE(F|D) y x)) // reverse operands to work around NaN
+
+(Leq8U x y) => (XOR (MOVVconst [1]) (SGTU (ZeroExt8to64 x) (ZeroExt8to64 y)))
+(Leq16U x y) => (XOR (MOVVconst [1]) (SGTU (ZeroExt16to64 x) (ZeroExt16to64 y)))
+(Leq32U x y) => (XOR (MOVVconst [1]) (SGTU (ZeroExt32to64 x) (ZeroExt32to64 y)))
+(Leq64U x y) => (XOR (MOVVconst [1]) (SGTU x y))
+
+(OffPtr [off] ptr:(SP)) && is32Bit(off) => (MOVVaddr [int32(off)] ptr)
+(OffPtr [off] ptr) => (ADDVconst [off] ptr)
+
+(Addr {sym} base) => (MOVVaddr {sym} base)
+(LocalAddr {sym} base _) => (MOVVaddr {sym} base)
+
+// loads
+(Load <t> ptr mem) && t.IsBoolean() => (MOVBUload ptr mem)
+(Load <t> ptr mem) && (is8BitInt(t) && isSigned(t)) => (MOVBload ptr mem)
+(Load <t> ptr mem) && (is8BitInt(t) && !isSigned(t)) => (MOVBUload ptr mem)
+(Load <t> ptr mem) && (is16BitInt(t) && isSigned(t)) => (MOVHload ptr mem)
+(Load <t> ptr mem) && (is16BitInt(t) && !isSigned(t)) => (MOVHUload ptr mem)
+(Load <t> ptr mem) && (is32BitInt(t) && isSigned(t)) => (MOVWload ptr mem)
+(Load <t> ptr mem) && (is32BitInt(t) && !isSigned(t)) => (MOVWUload ptr mem)
+(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVVload ptr mem)
+(Load <t> ptr mem) && is32BitFloat(t) => (MOVFload ptr mem)
+(Load <t> ptr mem) && is64BitFloat(t) => (MOVDload ptr mem)
+
+// stores
+(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && !is32BitFloat(val.Type) => (MOVWstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 && !is64BitFloat(val.Type) => (MOVVstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (MOVFstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (MOVDstore ptr val mem)
+
+// zeroing
+(Zero [0] _ mem) => mem
+(Zero [1] ptr mem) => (MOVBstore ptr (MOVVconst [0]) mem)
+(Zero [2] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore ptr (MOVVconst [0]) mem)
+(Zero [2] ptr mem) =>
+ (MOVBstore [1] ptr (MOVVconst [0])
+ (MOVBstore [0] ptr (MOVVconst [0]) mem))
+(Zero [4] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore ptr (MOVVconst [0]) mem)
+(Zero [4] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [2] ptr (MOVVconst [0])
+ (MOVHstore [0] ptr (MOVVconst [0]) mem))
+(Zero [4] ptr mem) =>
+ (MOVBstore [3] ptr (MOVVconst [0])
+ (MOVBstore [2] ptr (MOVVconst [0])
+ (MOVBstore [1] ptr (MOVVconst [0])
+ (MOVBstore [0] ptr (MOVVconst [0]) mem))))
+(Zero [8] {t} ptr mem) && t.Alignment()%8 == 0 =>
+ (MOVVstore ptr (MOVVconst [0]) mem)
+(Zero [8] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [4] ptr (MOVVconst [0])
+ (MOVWstore [0] ptr (MOVVconst [0]) mem))
+(Zero [8] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [6] ptr (MOVVconst [0])
+ (MOVHstore [4] ptr (MOVVconst [0])
+ (MOVHstore [2] ptr (MOVVconst [0])
+ (MOVHstore [0] ptr (MOVVconst [0]) mem))))
+
+(Zero [3] ptr mem) =>
+ (MOVBstore [2] ptr (MOVVconst [0])
+ (MOVBstore [1] ptr (MOVVconst [0])
+ (MOVBstore [0] ptr (MOVVconst [0]) mem)))
+(Zero [6] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [4] ptr (MOVVconst [0])
+ (MOVHstore [2] ptr (MOVVconst [0])
+ (MOVHstore [0] ptr (MOVVconst [0]) mem)))
+(Zero [12] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [8] ptr (MOVVconst [0])
+ (MOVWstore [4] ptr (MOVVconst [0])
+ (MOVWstore [0] ptr (MOVVconst [0]) mem)))
+(Zero [16] {t} ptr mem) && t.Alignment()%8 == 0 =>
+ (MOVVstore [8] ptr (MOVVconst [0])
+ (MOVVstore [0] ptr (MOVVconst [0]) mem))
+(Zero [24] {t} ptr mem) && t.Alignment()%8 == 0 =>
+ (MOVVstore [16] ptr (MOVVconst [0])
+ (MOVVstore [8] ptr (MOVVconst [0])
+ (MOVVstore [0] ptr (MOVVconst [0]) mem)))
+
+// medium zeroing uses a duff device
+// 8 and 128 are magic constants; see runtime/mkduff.go
+(Zero [s] {t} ptr mem)
+ && s%8 == 0 && s > 24 && s <= 8*128
+ && t.Alignment()%8 == 0 && !config.noDuffDevice =>
+ (DUFFZERO [8 * (128 - s/8)] ptr mem)
+
+// large or unaligned zeroing uses a loop
+(Zero [s] {t} ptr mem)
+ && (s > 8*128 || config.noDuffDevice) || t.Alignment()%8 != 0 =>
+ (LoweredZero [t.Alignment()]
+ ptr
+ (ADDVconst <ptr.Type> ptr [s-moveSize(t.Alignment(), config)])
+ mem)
+
+// moves
+(Move [0] _ _ mem) => mem
+(Move [1] dst src mem) => (MOVBstore dst (MOVBload src mem) mem)
+(Move [2] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore dst (MOVHload src mem) mem)
+(Move [2] dst src mem) =>
+ (MOVBstore [1] dst (MOVBload [1] src mem)
+ (MOVBstore dst (MOVBload src mem) mem))
+(Move [4] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore dst (MOVWload src mem) mem)
+(Move [4] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [2] dst (MOVHload [2] src mem)
+ (MOVHstore dst (MOVHload src mem) mem))
+(Move [4] dst src mem) =>
+ (MOVBstore [3] dst (MOVBload [3] src mem)
+ (MOVBstore [2] dst (MOVBload [2] src mem)
+ (MOVBstore [1] dst (MOVBload [1] src mem)
+ (MOVBstore dst (MOVBload src mem) mem))))
+(Move [8] {t} dst src mem) && t.Alignment()%8 == 0 =>
+ (MOVVstore dst (MOVVload src mem) mem)
+(Move [8] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [4] dst (MOVWload [4] src mem)
+ (MOVWstore dst (MOVWload src mem) mem))
+(Move [8] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [6] dst (MOVHload [6] src mem)
+ (MOVHstore [4] dst (MOVHload [4] src mem)
+ (MOVHstore [2] dst (MOVHload [2] src mem)
+ (MOVHstore dst (MOVHload src mem) mem))))
+
+(Move [3] dst src mem) =>
+ (MOVBstore [2] dst (MOVBload [2] src mem)
+ (MOVBstore [1] dst (MOVBload [1] src mem)
+ (MOVBstore dst (MOVBload src mem) mem)))
+(Move [6] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [4] dst (MOVHload [4] src mem)
+ (MOVHstore [2] dst (MOVHload [2] src mem)
+ (MOVHstore dst (MOVHload src mem) mem)))
+(Move [12] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [8] dst (MOVWload [8] src mem)
+ (MOVWstore [4] dst (MOVWload [4] src mem)
+ (MOVWstore dst (MOVWload src mem) mem)))
+(Move [16] {t} dst src mem) && t.Alignment()%8 == 0 =>
+ (MOVVstore [8] dst (MOVVload [8] src mem)
+ (MOVVstore dst (MOVVload src mem) mem))
+(Move [24] {t} dst src mem) && t.Alignment()%8 == 0 =>
+ (MOVVstore [16] dst (MOVVload [16] src mem)
+ (MOVVstore [8] dst (MOVVload [8] src mem)
+ (MOVVstore dst (MOVVload src mem) mem)))
+
+// medium move uses a duff device
+(Move [s] {t} dst src mem)
+ && s%8 == 0 && s >= 24 && s <= 8*128 && t.Alignment()%8 == 0
+ && !config.noDuffDevice && logLargeCopy(v, s) =>
+ (DUFFCOPY [16 * (128 - s/8)] dst src mem)
+// 16 and 128 are magic constants. 16 is the number of bytes to encode:
+// MOVV (R1), R23
+// ADDV $8, R1
+// MOVV R23, (R2)
+// ADDV $8, R2
+// and 128 is the number of such blocks. See runtime/duff_mips64.s:duffcopy.
+
+// large or unaligned move uses a loop
+(Move [s] {t} dst src mem)
+ && s > 24 && logLargeCopy(v, s) || t.Alignment()%8 != 0 =>
+ (LoweredMove [t.Alignment()]
+ dst
+ src
+ (ADDVconst <src.Type> src [s-moveSize(t.Alignment(), config)])
+ mem)
+
+// calls
+(StaticCall ...) => (CALLstatic ...)
+(ClosureCall ...) => (CALLclosure ...)
+(InterCall ...) => (CALLinter ...)
+(TailCall ...) => (CALLtail ...)
+
+// atomic intrinsics
+(AtomicLoad(8|32|64) ...) => (LoweredAtomicLoad(8|32|64) ...)
+(AtomicLoadPtr ...) => (LoweredAtomicLoad64 ...)
+
+(AtomicStore(8|32|64) ...) => (LoweredAtomicStore(8|32|64) ...)
+(AtomicStorePtrNoWB ...) => (LoweredAtomicStore64 ...)
+
+(AtomicExchange(32|64) ...) => (LoweredAtomicExchange(32|64) ...)
+
+(AtomicAdd(32|64) ...) => (LoweredAtomicAdd(32|64) ...)
+
+(AtomicCompareAndSwap32 ptr old new mem) => (LoweredAtomicCas32 ptr (SignExt32to64 old) new mem)
+(AtomicCompareAndSwap64 ...) => (LoweredAtomicCas64 ...)
+
+// checks
+(NilCheck ...) => (LoweredNilCheck ...)
+(IsNonNil ptr) => (SGTU ptr (MOVVconst [0])) // ptr >u 0
+(IsInBounds idx len) => (SGTU len idx) // len >u idx
+(IsSliceInBounds idx len) => (XOR (MOVVconst [1]) (SGTU idx len)) // complement of idx >u len, i.e. idx <=u len
+
+// pseudo-ops
+(GetClosurePtr ...) => (LoweredGetClosurePtr ...)
+(GetCallerSP ...) => (LoweredGetCallerSP ...)
+(GetCallerPC ...) => (LoweredGetCallerPC ...)
+
+(If cond yes no) => (NE cond yes no)
+
+// Write barrier.
+(WB ...) => (LoweredWB ...)
+
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem)
+
+// Optimizations
+
+// Absorb boolean tests into block
+(NE (FPFlagTrue cmp) yes no) => (FPT cmp yes no)
+(NE (FPFlagFalse cmp) yes no) => (FPF cmp yes no)
+(EQ (FPFlagTrue cmp) yes no) => (FPF cmp yes no)
+(EQ (FPFlagFalse cmp) yes no) => (FPT cmp yes no)
+(NE (XORconst [1] cmp:(SGT _ _)) yes no) => (EQ cmp yes no)
+(NE (XORconst [1] cmp:(SGTU _ _)) yes no) => (EQ cmp yes no)
+(NE (XORconst [1] cmp:(SGTconst _)) yes no) => (EQ cmp yes no)
+(NE (XORconst [1] cmp:(SGTUconst _)) yes no) => (EQ cmp yes no)
+(EQ (XORconst [1] cmp:(SGT _ _)) yes no) => (NE cmp yes no)
+(EQ (XORconst [1] cmp:(SGTU _ _)) yes no) => (NE cmp yes no)
+(EQ (XORconst [1] cmp:(SGTconst _)) yes no) => (NE cmp yes no)
+(EQ (XORconst [1] cmp:(SGTUconst _)) yes no) => (NE cmp yes no)
+(NE (SGTUconst [1] x) yes no) => (EQ x yes no)
+(EQ (SGTUconst [1] x) yes no) => (NE x yes no)
+(NE (SGTU x (MOVVconst [0])) yes no) => (NE x yes no)
+(EQ (SGTU x (MOVVconst [0])) yes no) => (EQ x yes no)
+(NE (SGTconst [0] x) yes no) => (LTZ x yes no)
+(EQ (SGTconst [0] x) yes no) => (GEZ x yes no)
+(NE (SGT x (MOVVconst [0])) yes no) => (GTZ x yes no)
+(EQ (SGT x (MOVVconst [0])) yes no) => (LEZ x yes no)
+
+// fold offset into address
+(ADDVconst [off1] (MOVVaddr [off2] {sym} ptr)) && is32Bit(off1+int64(off2)) => (MOVVaddr [int32(off1)+int32(off2)] {sym} ptr)
+
+// fold address into load/store
+(MOVBload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVBload [off1+int32(off2)] {sym} ptr mem)
+(MOVBUload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVBUload [off1+int32(off2)] {sym} ptr mem)
+(MOVHload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVHload [off1+int32(off2)] {sym} ptr mem)
+(MOVHUload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVHUload [off1+int32(off2)] {sym} ptr mem)
+(MOVWload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVWload [off1+int32(off2)] {sym} ptr mem)
+(MOVWUload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVWUload [off1+int32(off2)] {sym} ptr mem)
+(MOVVload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVVload [off1+int32(off2)] {sym} ptr mem)
+(MOVFload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVFload [off1+int32(off2)] {sym} ptr mem)
+(MOVDload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVDload [off1+int32(off2)] {sym} ptr mem)
+
+(MOVBstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVBstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVHstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVHstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVWstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVWstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVVstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVVstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVFstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVFstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVDstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVDstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVBstorezero [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVBstorezero [off1+int32(off2)] {sym} ptr mem)
+(MOVHstorezero [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVHstorezero [off1+int32(off2)] {sym} ptr mem)
+(MOVWstorezero [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVWstorezero [off1+int32(off2)] {sym} ptr mem)
+(MOVVstorezero [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVVstorezero [off1+int32(off2)] {sym} ptr mem)
+
+(MOVBload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVBload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVBUload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVBUload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVHload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHUload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVHUload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVWload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWUload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVWUload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVVload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVVload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVFload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVFload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVDload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVDload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+
+(MOVBstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVBstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVHstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVHstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVWstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVWstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVVstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVVstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVFstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVFstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVDstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVDstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVBstorezero [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVBstorezero [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHstorezero [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVHstorezero [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWstorezero [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVWstorezero [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVVstorezero [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVVstorezero [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+
+// store zero
+(MOVBstore [off] {sym} ptr (MOVVconst [0]) mem) => (MOVBstorezero [off] {sym} ptr mem)
+(MOVHstore [off] {sym} ptr (MOVVconst [0]) mem) => (MOVHstorezero [off] {sym} ptr mem)
+(MOVWstore [off] {sym} ptr (MOVVconst [0]) mem) => (MOVWstorezero [off] {sym} ptr mem)
+(MOVVstore [off] {sym} ptr (MOVVconst [0]) mem) => (MOVVstorezero [off] {sym} ptr mem)
+
+// don't extend after proper load
+(MOVBreg x:(MOVBload _ _)) => (MOVVreg x)
+(MOVBUreg x:(MOVBUload _ _)) => (MOVVreg x)
+(MOVHreg x:(MOVBload _ _)) => (MOVVreg x)
+(MOVHreg x:(MOVBUload _ _)) => (MOVVreg x)
+(MOVHreg x:(MOVHload _ _)) => (MOVVreg x)
+(MOVHUreg x:(MOVBUload _ _)) => (MOVVreg x)
+(MOVHUreg x:(MOVHUload _ _)) => (MOVVreg x)
+(MOVWreg x:(MOVBload _ _)) => (MOVVreg x)
+(MOVWreg x:(MOVBUload _ _)) => (MOVVreg x)
+(MOVWreg x:(MOVHload _ _)) => (MOVVreg x)
+(MOVWreg x:(MOVHUload _ _)) => (MOVVreg x)
+(MOVWreg x:(MOVWload _ _)) => (MOVVreg x)
+(MOVWUreg x:(MOVBUload _ _)) => (MOVVreg x)
+(MOVWUreg x:(MOVHUload _ _)) => (MOVVreg x)
+(MOVWUreg x:(MOVWUload _ _)) => (MOVVreg x)
+
+// fold double extensions
+(MOVBreg x:(MOVBreg _)) => (MOVVreg x)
+(MOVBUreg x:(MOVBUreg _)) => (MOVVreg x)
+(MOVHreg x:(MOVBreg _)) => (MOVVreg x)
+(MOVHreg x:(MOVBUreg _)) => (MOVVreg x)
+(MOVHreg x:(MOVHreg _)) => (MOVVreg x)
+(MOVHUreg x:(MOVBUreg _)) => (MOVVreg x)
+(MOVHUreg x:(MOVHUreg _)) => (MOVVreg x)
+(MOVWreg x:(MOVBreg _)) => (MOVVreg x)
+(MOVWreg x:(MOVBUreg _)) => (MOVVreg x)
+(MOVWreg x:(MOVHreg _)) => (MOVVreg x)
+(MOVWreg x:(MOVWreg _)) => (MOVVreg x)
+(MOVWUreg x:(MOVBUreg _)) => (MOVVreg x)
+(MOVWUreg x:(MOVHUreg _)) => (MOVVreg x)
+(MOVWUreg x:(MOVWUreg _)) => (MOVVreg x)
+
+// don't extend before store
+(MOVBstore [off] {sym} ptr (MOVBreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVWreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVWreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+
+// If a register move has only 1 use, just use the same register without emitting an instruction.
+// MOVVnop doesn't emit an instruction; it exists only to ensure the type.
+(MOVVreg x) && x.Uses == 1 => (MOVVnop x)
+
+// TODO: we should be able to get rid of MOVVnop altogether.
+// But for now, this is enough to get rid of lots of them.
+(MOVVnop (MOVVconst [c])) => (MOVVconst [c])
+
+// fold constant into arithmetic ops
+(ADDV x (MOVVconst [c])) && is32Bit(c) => (ADDVconst [c] x)
+(SUBV x (MOVVconst [c])) && is32Bit(c) => (SUBVconst [c] x)
+(AND x (MOVVconst [c])) && is32Bit(c) => (ANDconst [c] x)
+(OR x (MOVVconst [c])) && is32Bit(c) => (ORconst [c] x)
+(XOR x (MOVVconst [c])) && is32Bit(c) => (XORconst [c] x)
+(NOR x (MOVVconst [c])) && is32Bit(c) => (NORconst [c] x)
+
+(SLLV _ (MOVVconst [c])) && uint64(c)>=64 => (MOVVconst [0])
+(SRLV _ (MOVVconst [c])) && uint64(c)>=64 => (MOVVconst [0])
+(SRAV x (MOVVconst [c])) && uint64(c)>=64 => (SRAVconst x [63])
+(SLLV x (MOVVconst [c])) => (SLLVconst x [c])
+(SRLV x (MOVVconst [c])) => (SRLVconst x [c])
+(SRAV x (MOVVconst [c])) => (SRAVconst x [c])
+
+(SGT (MOVVconst [c]) x) && is32Bit(c) => (SGTconst [c] x)
+(SGTU (MOVVconst [c]) x) && is32Bit(c) => (SGTUconst [c] x)
+
+// mul by constant
+(Select1 (MULVU x (MOVVconst [-1]))) => (NEGV x)
+(Select1 (MULVU _ (MOVVconst [0]))) => (MOVVconst [0])
+(Select1 (MULVU x (MOVVconst [1]))) => x
+(Select1 (MULVU x (MOVVconst [c]))) && isPowerOfTwo64(c) => (SLLVconst [log64(c)] x)
+
+// div by constant
+(Select1 (DIVVU x (MOVVconst [1]))) => x
+(Select1 (DIVVU x (MOVVconst [c]))) && isPowerOfTwo64(c) => (SRLVconst [log64(c)] x)
+(Select0 (DIVVU _ (MOVVconst [1]))) => (MOVVconst [0]) // mod
+(Select0 (DIVVU x (MOVVconst [c]))) && isPowerOfTwo64(c) => (ANDconst [c-1] x) // mod
+
+// generic simplifications
+(ADDV x (NEGV y)) => (SUBV x y)
+(SUBV x x) => (MOVVconst [0])
+(SUBV (MOVVconst [0]) x) => (NEGV x)
+(AND x x) => x
+(OR x x) => x
+(XOR x x) => (MOVVconst [0])
+
+// remove redundant *const ops
+(ADDVconst [0] x) => x
+(SUBVconst [0] x) => x
+(ANDconst [0] _) => (MOVVconst [0])
+(ANDconst [-1] x) => x
+(ORconst [0] x) => x
+(ORconst [-1] _) => (MOVVconst [-1])
+(XORconst [0] x) => x
+(XORconst [-1] x) => (NORconst [0] x)
+
+// generic constant folding
+(ADDVconst [c] (MOVVconst [d])) => (MOVVconst [c+d])
+(ADDVconst [c] (ADDVconst [d] x)) && is32Bit(c+d) => (ADDVconst [c+d] x)
+(ADDVconst [c] (SUBVconst [d] x)) && is32Bit(c-d) => (ADDVconst [c-d] x)
+(SUBVconst [c] (MOVVconst [d])) => (MOVVconst [d-c])
+(SUBVconst [c] (SUBVconst [d] x)) && is32Bit(-c-d) => (ADDVconst [-c-d] x)
+(SUBVconst [c] (ADDVconst [d] x)) && is32Bit(-c+d) => (ADDVconst [-c+d] x)
+(SLLVconst [c] (MOVVconst [d])) => (MOVVconst [d<<uint64(c)])
+(SRLVconst [c] (MOVVconst [d])) => (MOVVconst [int64(uint64(d)>>uint64(c))])
+(SRAVconst [c] (MOVVconst [d])) => (MOVVconst [d>>uint64(c)])
+(Select1 (MULVU (MOVVconst [c]) (MOVVconst [d]))) => (MOVVconst [c*d])
+(Select1 (DIVV (MOVVconst [c]) (MOVVconst [d]))) && d != 0 => (MOVVconst [c/d])
+(Select1 (DIVVU (MOVVconst [c]) (MOVVconst [d]))) && d != 0 => (MOVVconst [int64(uint64(c)/uint64(d))])
+(Select0 (DIVV (MOVVconst [c]) (MOVVconst [d]))) && d != 0 => (MOVVconst [c%d]) // mod
+(Select0 (DIVVU (MOVVconst [c]) (MOVVconst [d]))) && d != 0 => (MOVVconst [int64(uint64(c)%uint64(d))]) // mod
+(ANDconst [c] (MOVVconst [d])) => (MOVVconst [c&d])
+(ANDconst [c] (ANDconst [d] x)) => (ANDconst [c&d] x)
+(ORconst [c] (MOVVconst [d])) => (MOVVconst [c|d])
+(ORconst [c] (ORconst [d] x)) && is32Bit(c|d) => (ORconst [c|d] x)
+(XORconst [c] (MOVVconst [d])) => (MOVVconst [c^d])
+(XORconst [c] (XORconst [d] x)) && is32Bit(c^d) => (XORconst [c^d] x)
+(NORconst [c] (MOVVconst [d])) => (MOVVconst [^(c|d)])
+(NEGV (MOVVconst [c])) => (MOVVconst [-c])
+(MOVBreg (MOVVconst [c])) => (MOVVconst [int64(int8(c))])
+(MOVBUreg (MOVVconst [c])) => (MOVVconst [int64(uint8(c))])
+(MOVHreg (MOVVconst [c])) => (MOVVconst [int64(int16(c))])
+(MOVHUreg (MOVVconst [c])) => (MOVVconst [int64(uint16(c))])
+(MOVWreg (MOVVconst [c])) => (MOVVconst [int64(int32(c))])
+(MOVWUreg (MOVVconst [c])) => (MOVVconst [int64(uint32(c))])
+(MOVVreg (MOVVconst [c])) => (MOVVconst [c])
+(LoweredAtomicStore(32|64) ptr (MOVVconst [0]) mem) => (LoweredAtomicStorezero(32|64) ptr mem)
+(LoweredAtomicAdd32 ptr (MOVVconst [c]) mem) && is32Bit(c) => (LoweredAtomicAddconst32 [int32(c)] ptr mem)
+(LoweredAtomicAdd64 ptr (MOVVconst [c]) mem) && is32Bit(c) => (LoweredAtomicAddconst64 [c] ptr mem)
+
+// constant comparisons
+(SGTconst [c] (MOVVconst [d])) && c>d => (MOVVconst [1])
+(SGTconst [c] (MOVVconst [d])) && c<=d => (MOVVconst [0])
+(SGTUconst [c] (MOVVconst [d])) && uint64(c)>uint64(d) => (MOVVconst [1])
+(SGTUconst [c] (MOVVconst [d])) && uint64(c)<=uint64(d) => (MOVVconst [0])
+
+// other known comparisons
+(SGTconst [c] (MOVBreg _)) && 0x7f < c => (MOVVconst [1])
+(SGTconst [c] (MOVBreg _)) && c <= -0x80 => (MOVVconst [0])
+(SGTconst [c] (MOVBUreg _)) && 0xff < c => (MOVVconst [1])
+(SGTconst [c] (MOVBUreg _)) && c < 0 => (MOVVconst [0])
+(SGTUconst [c] (MOVBUreg _)) && 0xff < uint64(c) => (MOVVconst [1])
+(SGTconst [c] (MOVHreg _)) && 0x7fff < c => (MOVVconst [1])
+(SGTconst [c] (MOVHreg _)) && c <= -0x8000 => (MOVVconst [0])
+(SGTconst [c] (MOVHUreg _)) && 0xffff < c => (MOVVconst [1])
+(SGTconst [c] (MOVHUreg _)) && c < 0 => (MOVVconst [0])
+(SGTUconst [c] (MOVHUreg _)) && 0xffff < uint64(c) => (MOVVconst [1])
+(SGTconst [c] (MOVWUreg _)) && c < 0 => (MOVVconst [0])
+(SGTconst [c] (ANDconst [m] _)) && 0 <= m && m < c => (MOVVconst [1])
+(SGTUconst [c] (ANDconst [m] _)) && uint64(m) < uint64(c) => (MOVVconst [1])
+(SGTconst [c] (SRLVconst _ [d])) && 0 <= c && 0 < d && d <= 63 && 0xffffffffffffffff>>uint64(d) < uint64(c) => (MOVVconst [1])
+(SGTUconst [c] (SRLVconst _ [d])) && 0 < d && d <= 63 && 0xffffffffffffffff>>uint64(d) < uint64(c) => (MOVVconst [1])
+
+// absorb constants into branches
+(EQ (MOVVconst [0]) yes no) => (First yes no)
+(EQ (MOVVconst [c]) yes no) && c != 0 => (First no yes)
+(NE (MOVVconst [0]) yes no) => (First no yes)
+(NE (MOVVconst [c]) yes no) && c != 0 => (First yes no)
+(LTZ (MOVVconst [c]) yes no) && c < 0 => (First yes no)
+(LTZ (MOVVconst [c]) yes no) && c >= 0 => (First no yes)
+(LEZ (MOVVconst [c]) yes no) && c <= 0 => (First yes no)
+(LEZ (MOVVconst [c]) yes no) && c > 0 => (First no yes)
+(GTZ (MOVVconst [c]) yes no) && c > 0 => (First yes no)
+(GTZ (MOVVconst [c]) yes no) && c <= 0 => (First no yes)
+(GEZ (MOVVconst [c]) yes no) && c >= 0 => (First yes no)
+(GEZ (MOVVconst [c]) yes no) && c < 0 => (First no yes)
+
+// fold readonly sym load
+(MOVBload [off] {sym} (SB) _) && symIsRO(sym) => (MOVVconst [int64(read8(sym, int64(off)))])
+(MOVHload [off] {sym} (SB) _) && symIsRO(sym) => (MOVVconst [int64(read16(sym, int64(off), config.ctxt.Arch.ByteOrder))])
+(MOVWload [off] {sym} (SB) _) && symIsRO(sym) => (MOVVconst [int64(read32(sym, int64(off), config.ctxt.Arch.ByteOrder))])
+(MOVVload [off] {sym} (SB) _) && symIsRO(sym) => (MOVVconst [int64(read64(sym, int64(off), config.ctxt.Arch.ByteOrder))])
diff --git a/src/cmd/compile/internal/ssa/_gen/MIPS64Ops.go b/src/cmd/compile/internal/ssa/_gen/MIPS64Ops.go
new file mode 100644
index 0000000..89c8772
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/MIPS64Ops.go
@@ -0,0 +1,482 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import "strings"
+
+// Notes:
+// - Integer types live in the low portion of registers. Upper portions are junk.
+// - Boolean types use the low-order byte of a register. 0=false, 1=true.
+// Upper bytes are junk.
+// - *const instructions may use a constant larger than the instruction can encode.
+// In this case the assembler expands to multiple instructions and uses tmp
+// register (R23).
+
+// Suffixes encode the bit width of various instructions.
+// V (vlong) = 64 bit
+// WU (word) = 32 bit unsigned
+// W (word) = 32 bit
+// H (half word) = 16 bit
+// HU = 16 bit unsigned
+// B (byte) = 8 bit
+// BU = 8 bit unsigned
+// F (float) = 32 bit float
+// D (double) = 64 bit float
+
+// Note: registers not used in regalloc are not included in this list,
+// so that regmask stays within int64
+// Be careful when hand coding regmasks.
+var regNamesMIPS64 = []string{
+ "R0", // constant 0
+ "R1", // aka REGRT1
+ "R2",
+ "R3",
+ "R4",
+ "R5",
+ "R6",
+ "R7",
+ "R8",
+ "R9",
+ "R10",
+ "R11",
+ "R12",
+ "R13",
+ "R14",
+ "R15",
+ "R16",
+ "R17",
+ "R18",
+ "R19",
+ "R20",
+ "R21",
+ "R22", // aka REGCTXT, the closure pointer
+ // R23 = REGTMP not used in regalloc
+ "R24",
+ "R25",
+ // R26 reserved by kernel
+ // R27 reserved by kernel
+ // R28 = REGSB not used in regalloc
+ "SP", // aka R29
+ "g", // aka R30
+ "R31", // aka REGLINK
+
+ "F0",
+ "F1",
+ "F2",
+ "F3",
+ "F4",
+ "F5",
+ "F6",
+ "F7",
+ "F8",
+ "F9",
+ "F10",
+ "F11",
+ "F12",
+ "F13",
+ "F14",
+ "F15",
+ "F16",
+ "F17",
+ "F18",
+ "F19",
+ "F20",
+ "F21",
+ "F22",
+ "F23",
+ "F24",
+ "F25",
+ "F26",
+ "F27",
+ "F28",
+ "F29",
+ "F30",
+ "F31",
+
+ "HI", // high bits of multiplication
+ "LO", // low bits of multiplication
+
+ // If you add registers, update asyncPreempt in runtime.
+
+ // pseudo-registers
+ "SB",
+}
+
+func init() {
+ // Make map from reg names to reg integers.
+ if len(regNamesMIPS64) > 64 {
+ panic("too many registers")
+ }
+ num := map[string]int{}
+ for i, name := range regNamesMIPS64 {
+ num[name] = i
+ }
+ buildReg := func(s string) regMask {
+ m := regMask(0)
+ for _, r := range strings.Split(s, " ") {
+ if n, ok := num[r]; ok {
+ m |= regMask(1) << uint(n)
+ continue
+ }
+ panic("register " + r + " not found")
+ }
+ return m
+ }
+
+ // Common individual register masks
+ var (
+ gp = buildReg("R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R22 R24 R25 R31")
+ gpg = gp | buildReg("g")
+ gpsp = gp | buildReg("SP")
+ gpspg = gpg | buildReg("SP")
+ gpspsbg = gpspg | buildReg("SB")
+ fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31")
+ lo = buildReg("LO")
+ hi = buildReg("HI")
+ callerSave = gp | fp | lo | hi | buildReg("g") // runtime.setg (and anything calling it) may clobber g
+ r1 = buildReg("R1")
+ r2 = buildReg("R2")
+ r3 = buildReg("R3")
+ r4 = buildReg("R4")
+ )
+ // Common regInfo
+ var (
+ gp01 = regInfo{inputs: nil, outputs: []regMask{gp}}
+ gp11 = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}}
+ gp11sp = regInfo{inputs: []regMask{gpspg}, outputs: []regMask{gp}}
+ gp21 = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp}}
+ gp2hilo = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{hi, lo}}
+ gpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}
+ gpstore = regInfo{inputs: []regMask{gpspsbg, gpg}}
+ gpstore0 = regInfo{inputs: []regMask{gpspsbg}}
+ gpxchg = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}}
+ gpcas = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}, outputs: []regMask{gp}}
+ fp01 = regInfo{inputs: nil, outputs: []regMask{fp}}
+ fp11 = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}}
+ //fp1flags = regInfo{inputs: []regMask{fp}}
+ //fpgp = regInfo{inputs: []regMask{fp}, outputs: []regMask{gp}}
+ //gpfp = regInfo{inputs: []regMask{gp}, outputs: []regMask{fp}}
+ fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
+ fp2flags = regInfo{inputs: []regMask{fp, fp}}
+ fpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{fp}}
+ fpstore = regInfo{inputs: []regMask{gpspsbg, fp}}
+ readflags = regInfo{inputs: nil, outputs: []regMask{gp}}
+ )
+ ops := []opData{
+ // binary ops
+ {name: "ADDV", argLength: 2, reg: gp21, asm: "ADDVU", commutative: true}, // arg0 + arg1
+ {name: "ADDVconst", argLength: 1, reg: gp11sp, asm: "ADDVU", aux: "Int64"}, // arg0 + auxInt. auxInt is 32-bit, also in other *const ops.
+ {name: "SUBV", argLength: 2, reg: gp21, asm: "SUBVU"}, // arg0 - arg1
+ {name: "SUBVconst", argLength: 1, reg: gp11, asm: "SUBVU", aux: "Int64"}, // arg0 - auxInt
+ {name: "MULV", argLength: 2, reg: gp2hilo, asm: "MULV", commutative: true, typ: "(Int64,Int64)"}, // arg0 * arg1, signed, results hi,lo
+ {name: "MULVU", argLength: 2, reg: gp2hilo, asm: "MULVU", commutative: true, typ: "(UInt64,UInt64)"}, // arg0 * arg1, unsigned, results hi,lo
+ {name: "DIVV", argLength: 2, reg: gp2hilo, asm: "DIVV", typ: "(Int64,Int64)"}, // arg0 / arg1, signed, results hi=arg0%arg1,lo=arg0/arg1
+ {name: "DIVVU", argLength: 2, reg: gp2hilo, asm: "DIVVU", typ: "(UInt64,UInt64)"}, // arg0 / arg1, unsigned, results hi=arg0%arg1,lo=arg0/arg1
+
+ {name: "ADDF", argLength: 2, reg: fp21, asm: "ADDF", commutative: true}, // arg0 + arg1
+ {name: "ADDD", argLength: 2, reg: fp21, asm: "ADDD", commutative: true}, // arg0 + arg1
+ {name: "SUBF", argLength: 2, reg: fp21, asm: "SUBF"}, // arg0 - arg1
+ {name: "SUBD", argLength: 2, reg: fp21, asm: "SUBD"}, // arg0 - arg1
+ {name: "MULF", argLength: 2, reg: fp21, asm: "MULF", commutative: true}, // arg0 * arg1
+ {name: "MULD", argLength: 2, reg: fp21, asm: "MULD", commutative: true}, // arg0 * arg1
+ {name: "DIVF", argLength: 2, reg: fp21, asm: "DIVF"}, // arg0 / arg1
+ {name: "DIVD", argLength: 2, reg: fp21, asm: "DIVD"}, // arg0 / arg1
+
+ {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1
+ {name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int64"}, // arg0 & auxInt
+ {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1
+ {name: "ORconst", argLength: 1, reg: gp11, asm: "OR", aux: "Int64"}, // arg0 | auxInt
+ {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true, typ: "UInt64"}, // arg0 ^ arg1
+ {name: "XORconst", argLength: 1, reg: gp11, asm: "XOR", aux: "Int64", typ: "UInt64"}, // arg0 ^ auxInt
+ {name: "NOR", argLength: 2, reg: gp21, asm: "NOR", commutative: true}, // ^(arg0 | arg1)
+ {name: "NORconst", argLength: 1, reg: gp11, asm: "NOR", aux: "Int64"}, // ^(arg0 | auxInt)
+
+ {name: "NEGV", argLength: 1, reg: gp11}, // -arg0
+ {name: "NEGF", argLength: 1, reg: fp11, asm: "NEGF"}, // -arg0, float32
+ {name: "NEGD", argLength: 1, reg: fp11, asm: "NEGD"}, // -arg0, float64
+ {name: "SQRTD", argLength: 1, reg: fp11, asm: "SQRTD"}, // sqrt(arg0), float64
+ {name: "SQRTF", argLength: 1, reg: fp11, asm: "SQRTF"}, // sqrt(arg0), float32
+
+ // shifts
+ {name: "SLLV", argLength: 2, reg: gp21, asm: "SLLV"}, // arg0 << arg1, shift amount is mod 64
+ {name: "SLLVconst", argLength: 1, reg: gp11, asm: "SLLV", aux: "Int64"}, // arg0 << auxInt
+ {name: "SRLV", argLength: 2, reg: gp21, asm: "SRLV"}, // arg0 >> arg1, unsigned, shift amount is mod 64
+ {name: "SRLVconst", argLength: 1, reg: gp11, asm: "SRLV", aux: "Int64"}, // arg0 >> auxInt, unsigned
+ {name: "SRAV", argLength: 2, reg: gp21, asm: "SRAV"}, // arg0 >> arg1, signed, shift amount is mod 64
+ {name: "SRAVconst", argLength: 1, reg: gp11, asm: "SRAV", aux: "Int64"}, // arg0 >> auxInt, signed
+
+ // comparisons
+ {name: "SGT", argLength: 2, reg: gp21, asm: "SGT", typ: "Bool"}, // 1 if arg0 > arg1 (signed), 0 otherwise
+ {name: "SGTconst", argLength: 1, reg: gp11, asm: "SGT", aux: "Int64", typ: "Bool"}, // 1 if auxInt > arg0 (signed), 0 otherwise
+ {name: "SGTU", argLength: 2, reg: gp21, asm: "SGTU", typ: "Bool"}, // 1 if arg0 > arg1 (unsigned), 0 otherwise
+ {name: "SGTUconst", argLength: 1, reg: gp11, asm: "SGTU", aux: "Int64", typ: "Bool"}, // 1 if auxInt > arg0 (unsigned), 0 otherwise
+
+ {name: "CMPEQF", argLength: 2, reg: fp2flags, asm: "CMPEQF", typ: "Flags"}, // flags=true if arg0 = arg1, float32
+ {name: "CMPEQD", argLength: 2, reg: fp2flags, asm: "CMPEQD", typ: "Flags"}, // flags=true if arg0 = arg1, float64
+ {name: "CMPGEF", argLength: 2, reg: fp2flags, asm: "CMPGEF", typ: "Flags"}, // flags=true if arg0 >= arg1, float32
+ {name: "CMPGED", argLength: 2, reg: fp2flags, asm: "CMPGED", typ: "Flags"}, // flags=true if arg0 >= arg1, float64
+ {name: "CMPGTF", argLength: 2, reg: fp2flags, asm: "CMPGTF", typ: "Flags"}, // flags=true if arg0 > arg1, float32
+ {name: "CMPGTD", argLength: 2, reg: fp2flags, asm: "CMPGTD", typ: "Flags"}, // flags=true if arg0 > arg1, float64
+
+ // moves
+ {name: "MOVVconst", argLength: 0, reg: gp01, aux: "Int64", asm: "MOVV", typ: "UInt64", rematerializeable: true}, // auxint
+ {name: "MOVFconst", argLength: 0, reg: fp01, aux: "Float64", asm: "MOVF", typ: "Float32", rematerializeable: true}, // auxint as 64-bit float, convert to 32-bit float
+ {name: "MOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "MOVD", typ: "Float64", rematerializeable: true}, // auxint as 64-bit float
+
+ {name: "MOVVaddr", argLength: 1, reg: regInfo{inputs: []regMask{buildReg("SP") | buildReg("SB")}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVV", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB
+
+ {name: "MOVBload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVB", typ: "Int8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVBUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVBU", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVH", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVHU", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVWload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVW", typ: "Int32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVWUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVWU", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVVload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVV", typ: "UInt64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVFload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVF", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVDload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVD", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+
+ {name: "MOVBstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVHstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVWstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVVstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVV", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVFstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVF", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVDstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+
+ {name: "MOVBstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of zero to arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of zero to arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVWstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of zero to arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVVstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVV", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of zero to arg0 + auxInt + aux. arg1=mem.
+
+ // conversions
+ {name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB"}, // move from arg0, sign-extended from byte
+ {name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, zero-extended from byte
+ {name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVH"}, // move from arg0, sign-extended from half
+ {name: "MOVHUreg", argLength: 1, reg: gp11, asm: "MOVHU"}, // move from arg0, zero-extended from half
+ {name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW"}, // move from arg0, sign-extended from word
+ {name: "MOVWUreg", argLength: 1, reg: gp11, asm: "MOVWU"}, // move from arg0, zero-extended from word
+ {name: "MOVVreg", argLength: 1, reg: gp11, asm: "MOVV"}, // move from arg0
+
+ {name: "MOVVnop", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}, resultInArg0: true}, // nop, return arg0 in same register
+
+ {name: "MOVWF", argLength: 1, reg: fp11, asm: "MOVWF"}, // int32 -> float32
+ {name: "MOVWD", argLength: 1, reg: fp11, asm: "MOVWD"}, // int32 -> float64
+ {name: "MOVVF", argLength: 1, reg: fp11, asm: "MOVVF"}, // int64 -> float32
+ {name: "MOVVD", argLength: 1, reg: fp11, asm: "MOVVD"}, // int64 -> float64
+ {name: "TRUNCFW", argLength: 1, reg: fp11, asm: "TRUNCFW"}, // float32 -> int32
+ {name: "TRUNCDW", argLength: 1, reg: fp11, asm: "TRUNCDW"}, // float64 -> int32
+ {name: "TRUNCFV", argLength: 1, reg: fp11, asm: "TRUNCFV"}, // float32 -> int64
+ {name: "TRUNCDV", argLength: 1, reg: fp11, asm: "TRUNCDV"}, // float64 -> int64
+ {name: "MOVFD", argLength: 1, reg: fp11, asm: "MOVFD"}, // float32 -> float64
+ {name: "MOVDF", argLength: 1, reg: fp11, asm: "MOVDF"}, // float64 -> float32
+
+ // function calls
+ {name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "CALLtail", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gpsp, buildReg("R22"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+ {name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+
+ // duffzero
+ // arg0 = address of memory to zero
+ // arg1 = mem
+ // auxint = offset into duffzero code to start executing
+ // returns mem
+ // R1 aka mips.REGRT1 changed as side effect
+ {
+ name: "DUFFZERO",
+ aux: "Int64",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{gp},
+ clobbers: buildReg("R1 R31"),
+ },
+ faultOnNilArg0: true,
+ },
+
+ // duffcopy
+ // arg0 = address of dst memory (in R2, changed as side effect)
+ // arg1 = address of src memory (in R1, changed as side effect)
+ // arg2 = mem
+ // auxint = offset into duffcopy code to start executing
+ // returns mem
+ {
+ name: "DUFFCOPY",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R2"), buildReg("R1")},
+ clobbers: buildReg("R1 R2 R31"),
+ },
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // large or unaligned zeroing
+ // arg0 = address of memory to zero (in R1, changed as side effect)
+ // arg1 = address of the last element to zero
+ // arg2 = mem
+ // auxint = alignment
+ // returns mem
+ // SUBV $8, R1
+ // MOVV R0, 8(R1)
+ // ADDV $8, R1
+ // BNE Rarg1, R1, -2(PC)
+ {
+ name: "LoweredZero",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R1"), gp},
+ clobbers: buildReg("R1"),
+ },
+ clobberFlags: true,
+ faultOnNilArg0: true,
+ },
+
+ // large or unaligned move
+ // arg0 = address of dst memory (in R2, changed as side effect)
+ // arg1 = address of src memory (in R1, changed as side effect)
+ // arg2 = address of the last element of src
+ // arg3 = mem
+ // auxint = alignment
+ // returns mem
+ // SUBV $8, R1
+ // MOVV 8(R1), Rtmp
+ // MOVV Rtmp, (R2)
+ // ADDV $8, R1
+ // ADDV $8, R2
+ // BNE Rarg2, R1, -4(PC)
+ {
+ name: "LoweredMove",
+ aux: "Int64",
+ argLength: 4,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R2"), buildReg("R1"), gp},
+ clobbers: buildReg("R1 R2"),
+ },
+ clobberFlags: true,
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // atomic loads.
+ // load from arg0. arg1=mem.
+ // returns <value,memory> so they can be properly ordered with other loads.
+ {name: "LoweredAtomicLoad8", argLength: 2, reg: gpload, faultOnNilArg0: true},
+ {name: "LoweredAtomicLoad32", argLength: 2, reg: gpload, faultOnNilArg0: true},
+ {name: "LoweredAtomicLoad64", argLength: 2, reg: gpload, faultOnNilArg0: true},
+
+ // atomic stores.
+ // store arg1 to arg0. arg2=mem. returns memory.
+ {name: "LoweredAtomicStore8", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicStore32", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicStore64", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true},
+ // store zero to arg0. arg1=mem. returns memory.
+ {name: "LoweredAtomicStorezero32", argLength: 2, reg: gpstore0, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicStorezero64", argLength: 2, reg: gpstore0, faultOnNilArg0: true, hasSideEffects: true},
+
+ // atomic exchange.
+ // store arg1 to arg0. arg2=mem. returns <old content of *arg0, memory>.
+ // SYNC
+ // LL (Rarg0), Rout
+ // MOVV Rarg1, Rtmp
+ // SC Rtmp, (Rarg0)
+ // BEQ Rtmp, -3(PC)
+ // SYNC
+ {name: "LoweredAtomicExchange32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicExchange64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // atomic add.
+ // *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>.
+ // SYNC
+ // LL (Rarg0), Rout
+ // ADDV Rarg1, Rout, Rtmp
+ // SC Rtmp, (Rarg0)
+ // BEQ Rtmp, -3(PC)
+ // SYNC
+ // ADDV Rarg1, Rout
+ {name: "LoweredAtomicAdd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicAdd64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ // *arg0 += auxint. arg1=mem. returns <new content of *arg0, memory>. auxint is 32-bit.
+ {name: "LoweredAtomicAddconst32", argLength: 2, reg: regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}, aux: "Int32", resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicAddconst64", argLength: 2, reg: regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}, aux: "Int64", resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // atomic compare and swap.
+ // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory.
+ // if *arg0 == arg1 {
+ // *arg0 = arg2
+ // return (true, memory)
+ // } else {
+ // return (false, memory)
+ // }
+ // SYNC
+ // MOVV $0, Rout
+ // LL (Rarg0), Rtmp
+ // BNE Rtmp, Rarg1, 4(PC)
+ // MOVV Rarg2, Rout
+ // SC Rout, (Rarg0)
+ // BEQ Rout, -4(PC)
+ // SYNC
+ {name: "LoweredAtomicCas32", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicCas64", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // pseudo-ops
+ {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpg}}, nilCheck: true, faultOnNilArg0: true}, // panic if arg0 is nil. arg1=mem.
+
+ {name: "FPFlagTrue", argLength: 1, reg: readflags}, // bool, true if FP flag is true
+ {name: "FPFlagFalse", argLength: 1, reg: readflags}, // bool, true if FP flag is false
+
+ // Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
+ // and sorts it to the very beginning of the block to prevent other
+ // use of R22 (mips.REGCTXT, the closure pointer)
+ {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("R22")}}, zeroWidth: true},
+
+ // LoweredGetCallerSP returns the SP of the caller of the current function.
+ {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true},
+
+ // LoweredGetCallerPC evaluates to the PC to which its "caller" will return.
+ // I.e., if f calls g "calls" getcallerpc,
+ // the result should be the PC within f that g will return to.
+ // See runtime/stubs.go for a more detailed discussion.
+ {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true},
+
+ // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+ // It saves all GP registers if necessary,
+ // but clobbers R31 (LR) because it's a call
+ // and R23 (REGTMP).
+ {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("R20"), buildReg("R21")}, clobbers: (callerSave &^ gpg) | buildReg("R31")}, clobberFlags: true, aux: "Sym", symEffect: "None"},
+
+ // There are three of these functions so that they can have three different register inputs.
+ // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
+ // default registers to match so we don't need to copy registers around unnecessarily.
+ {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r3, r4}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ }
+
+ blocks := []blockData{
+ {name: "EQ", controls: 1},
+ {name: "NE", controls: 1},
+ {name: "LTZ", controls: 1}, // < 0
+ {name: "LEZ", controls: 1}, // <= 0
+ {name: "GTZ", controls: 1}, // > 0
+ {name: "GEZ", controls: 1}, // >= 0
+ {name: "FPT", controls: 1}, // FP flag is true
+ {name: "FPF", controls: 1}, // FP flag is false
+ }
+
+ archs = append(archs, arch{
+ name: "MIPS64",
+ pkg: "cmd/internal/obj/mips",
+ genfile: "../../mips64/ssa.go",
+ ops: ops,
+ blocks: blocks,
+ regnames: regNamesMIPS64,
+ gpregmask: gp,
+ fpregmask: fp,
+ specialregmask: hi | lo,
+ framepointerreg: -1, // not used
+ linkreg: int8(num["R31"]),
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/_gen/MIPSOps.go b/src/cmd/compile/internal/ssa/_gen/MIPSOps.go
new file mode 100644
index 0000000..22a7a5c
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/MIPSOps.go
@@ -0,0 +1,439 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import "strings"
+
+// Notes:
+// - Integer types live in the low portion of registers. Upper portions are junk.
+// - Boolean types use the low-order byte of a register. 0=false, 1=true.
+// Upper bytes are junk.
+// - Unused portions of AuxInt are filled by sign-extending the used portion.
+// - *const instructions may use a constant larger than the instruction can encode.
+// In this case the assembler expands to multiple instructions and uses tmp
+// register (R23).
+
+// Suffixes encode the bit width of various instructions.
+// W (word) = 32 bit
+// H (half word) = 16 bit
+// HU = 16 bit unsigned
+// B (byte) = 8 bit
+// BU = 8 bit unsigned
+// F (float) = 32 bit float
+// D (double) = 64 bit float
+
+// Note: registers not used in regalloc (e.g. R23, R26, R27) are not included in this list,
+// so that a regmask fits in 64 bits; a register's index here is its bit position in a regmask.
+// Be careful when hand coding regmasks: bit N does not necessarily correspond to register RN.
+var regNamesMIPS = []string{
+ "R0", // constant 0
+ "R1",
+ "R2",
+ "R3",
+ "R4",
+ "R5",
+ "R6",
+ "R7",
+ "R8",
+ "R9",
+ "R10",
+ "R11",
+ "R12",
+ "R13",
+ "R14",
+ "R15",
+ "R16",
+ "R17",
+ "R18",
+ "R19",
+ "R20",
+ "R21",
+ "R22",
+ // R23 is REGTMP (the assembler's temporary register), so it is not listed
+ "R24",
+ "R25",
+ // R26 reserved by kernel
+ // R27 reserved by kernel
+ "R28",
+ "SP", // aka R29
+ "g", // aka R30
+ "R31", // REGLINK
+
+ // odd FP registers contain high parts of 64-bit FP values, so only even-numbered ones are listed
+ "F0",
+ "F2",
+ "F4",
+ "F6",
+ "F8",
+ "F10",
+ "F12",
+ "F14",
+ "F16",
+ "F18",
+ "F20",
+ "F22",
+ "F24",
+ "F26",
+ "F28",
+ "F30",
+
+ "HI", // high bits of multiplication
+ "LO", // low bits of multiplication
+
+ // If you add registers, update asyncPreempt in runtime.
+
+ // pseudo-registers
+ "SB",
+}
+
+func init() {
+ // Make map from reg names to reg integers (a name's index in regNamesMIPS).
+ if len(regNamesMIPS) > 64 {
+ panic("too many registers")
+ }
+ num := map[string]int{}
+ for i, name := range regNamesMIPS {
+ num[name] = i
+ }
+ buildReg := func(s string) regMask { // s is a space-separated list of register names; panics on an unknown name
+ m := regMask(0)
+ for _, r := range strings.Split(s, " ") {
+ if n, ok := num[r]; ok {
+ m |= regMask(1) << uint(n) // set the bit for register index n
+ continue
+ }
+ panic("register " + r + " not found")
+ }
+ return m
+ }
+
+ // Common individual register masks
+ var (
+ gp = buildReg("R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R22 R24 R25 R28 R31")
+ gpg = gp | buildReg("g")
+ gpsp = gp | buildReg("SP")
+ gpspg = gpg | buildReg("SP")
+ gpspsbg = gpspg | buildReg("SB")
+ fp = buildReg("F0 F2 F4 F6 F8 F10 F12 F14 F16 F18 F20 F22 F24 F26 F28 F30")
+ lo = buildReg("LO")
+ hi = buildReg("HI")
+ callerSave = gp | fp | lo | hi | buildReg("g") // runtime.setg (and anything calling it) may clobber g
+ r1 = buildReg("R1")
+ r2 = buildReg("R2")
+ r3 = buildReg("R3")
+ r4 = buildReg("R4")
+ r5 = buildReg("R5")
+ )
+ // Common regInfo
+ var (
+ gp01 = regInfo{inputs: nil, outputs: []regMask{gp}}
+ gp11 = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}}
+ gp11sp = regInfo{inputs: []regMask{gpspg}, outputs: []regMask{gp}}
+ gp21 = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp}}
+ gp31 = regInfo{inputs: []regMask{gp, gp, gp}, outputs: []regMask{gp}}
+ gp2hilo = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{hi, lo}}
+ gpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}
+ gpstore = regInfo{inputs: []regMask{gpspsbg, gpg}}
+ gpxchg = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}}
+ gpcas = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}, outputs: []regMask{gp}}
+ gpstore0 = regInfo{inputs: []regMask{gpspsbg}}
+ fp01 = regInfo{inputs: nil, outputs: []regMask{fp}}
+ fp11 = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}}
+ fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
+ fp2flags = regInfo{inputs: []regMask{fp, fp}}
+ fpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{fp}}
+ fpstore = regInfo{inputs: []regMask{gpspsbg, fp}}
+ readflags = regInfo{inputs: nil, outputs: []regMask{gp}}
+ )
+ ops := []opData{
+ {name: "ADD", argLength: 2, reg: gp21, asm: "ADDU", commutative: true}, // arg0 + arg1
+ {name: "ADDconst", argLength: 1, reg: gp11sp, asm: "ADDU", aux: "Int32"}, // arg0 + auxInt
+ {name: "SUB", argLength: 2, reg: gp21, asm: "SUBU"}, // arg0 - arg1
+ {name: "SUBconst", argLength: 1, reg: gp11, asm: "SUBU", aux: "Int32"}, // arg0 - auxInt
+ {name: "MUL", argLength: 2, reg: regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp}, clobbers: hi | lo}, asm: "MUL", commutative: true}, // arg0 * arg1
+ {name: "MULT", argLength: 2, reg: gp2hilo, asm: "MUL", commutative: true, typ: "(Int32,Int32)"}, // arg0 * arg1, signed, results hi,lo
+ {name: "MULTU", argLength: 2, reg: gp2hilo, asm: "MULU", commutative: true, typ: "(UInt32,UInt32)"}, // arg0 * arg1, unsigned, results hi,lo
+ {name: "DIV", argLength: 2, reg: gp2hilo, asm: "DIV", typ: "(Int32,Int32)"}, // arg0 / arg1, signed, results hi=arg0%arg1,lo=arg0/arg1
+ {name: "DIVU", argLength: 2, reg: gp2hilo, asm: "DIVU", typ: "(UInt32,UInt32)"}, // arg0 / arg1, unsigned, results hi=arg0%arg1,lo=arg0/arg1
+
+ {name: "ADDF", argLength: 2, reg: fp21, asm: "ADDF", commutative: true}, // arg0 + arg1
+ {name: "ADDD", argLength: 2, reg: fp21, asm: "ADDD", commutative: true}, // arg0 + arg1
+ {name: "SUBF", argLength: 2, reg: fp21, asm: "SUBF"}, // arg0 - arg1
+ {name: "SUBD", argLength: 2, reg: fp21, asm: "SUBD"}, // arg0 - arg1
+ {name: "MULF", argLength: 2, reg: fp21, asm: "MULF", commutative: true}, // arg0 * arg1
+ {name: "MULD", argLength: 2, reg: fp21, asm: "MULD", commutative: true}, // arg0 * arg1
+ {name: "DIVF", argLength: 2, reg: fp21, asm: "DIVF"}, // arg0 / arg1
+ {name: "DIVD", argLength: 2, reg: fp21, asm: "DIVD"}, // arg0 / arg1
+
+ {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1
+ {name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int32"}, // arg0 & auxInt
+ {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1
+ {name: "ORconst", argLength: 1, reg: gp11, asm: "OR", aux: "Int32"}, // arg0 | auxInt
+ {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true, typ: "UInt32"}, // arg0 ^ arg1
+ {name: "XORconst", argLength: 1, reg: gp11, asm: "XOR", aux: "Int32", typ: "UInt32"}, // arg0 ^ auxInt
+ {name: "NOR", argLength: 2, reg: gp21, asm: "NOR", commutative: true}, // ^(arg0 | arg1)
+ {name: "NORconst", argLength: 1, reg: gp11, asm: "NOR", aux: "Int32"}, // ^(arg0 | auxInt)
+
+ {name: "NEG", argLength: 1, reg: gp11}, // -arg0
+ {name: "NEGF", argLength: 1, reg: fp11, asm: "NEGF"}, // -arg0, float32
+ {name: "NEGD", argLength: 1, reg: fp11, asm: "NEGD"}, // -arg0, float64
+ {name: "SQRTD", argLength: 1, reg: fp11, asm: "SQRTD"}, // sqrt(arg0), float64
+ {name: "SQRTF", argLength: 1, reg: fp11, asm: "SQRTF"}, // sqrt(arg0), float32
+
+ // shifts
+ {name: "SLL", argLength: 2, reg: gp21, asm: "SLL"}, // arg0 << arg1, shift amount is mod 32
+ {name: "SLLconst", argLength: 1, reg: gp11, asm: "SLL", aux: "Int32"}, // arg0 << auxInt, shift amount must be 0 through 31 inclusive
+ {name: "SRL", argLength: 2, reg: gp21, asm: "SRL"}, // arg0 >> arg1, unsigned, shift amount is mod 32
+ {name: "SRLconst", argLength: 1, reg: gp11, asm: "SRL", aux: "Int32"}, // arg0 >> auxInt, shift amount must be 0 through 31 inclusive
+ {name: "SRA", argLength: 2, reg: gp21, asm: "SRA"}, // arg0 >> arg1, signed, shift amount is mod 32
+ {name: "SRAconst", argLength: 1, reg: gp11, asm: "SRA", aux: "Int32"}, // arg0 >> auxInt, signed, shift amount must be 0 through 31 inclusive
+
+ {name: "CLZ", argLength: 1, reg: gp11, asm: "CLZ"},
+
+ // comparisons
+ {name: "SGT", argLength: 2, reg: gp21, asm: "SGT", typ: "Bool"}, // 1 if arg0 > arg1 (signed), 0 otherwise
+ {name: "SGTconst", argLength: 1, reg: gp11, asm: "SGT", aux: "Int32", typ: "Bool"}, // 1 if auxInt > arg0 (signed), 0 otherwise
+ {name: "SGTzero", argLength: 1, reg: gp11, asm: "SGT", typ: "Bool"}, // 1 if arg0 > 0 (signed), 0 otherwise
+ {name: "SGTU", argLength: 2, reg: gp21, asm: "SGTU", typ: "Bool"}, // 1 if arg0 > arg1 (unsigned), 0 otherwise
+ {name: "SGTUconst", argLength: 1, reg: gp11, asm: "SGTU", aux: "Int32", typ: "Bool"}, // 1 if auxInt > arg0 (unsigned), 0 otherwise
+ {name: "SGTUzero", argLength: 1, reg: gp11, asm: "SGTU", typ: "Bool"}, // 1 if arg0 > 0 (unsigned), 0 otherwise
+
+ {name: "CMPEQF", argLength: 2, reg: fp2flags, asm: "CMPEQF", typ: "Flags"}, // flags=true if arg0 = arg1, float32
+ {name: "CMPEQD", argLength: 2, reg: fp2flags, asm: "CMPEQD", typ: "Flags"}, // flags=true if arg0 = arg1, float64
+ {name: "CMPGEF", argLength: 2, reg: fp2flags, asm: "CMPGEF", typ: "Flags"}, // flags=true if arg0 >= arg1, float32
+ {name: "CMPGED", argLength: 2, reg: fp2flags, asm: "CMPGED", typ: "Flags"}, // flags=true if arg0 >= arg1, float64
+ {name: "CMPGTF", argLength: 2, reg: fp2flags, asm: "CMPGTF", typ: "Flags"}, // flags=true if arg0 > arg1, float32
+ {name: "CMPGTD", argLength: 2, reg: fp2flags, asm: "CMPGTD", typ: "Flags"}, // flags=true if arg0 > arg1, float64
+
+ // moves
+ {name: "MOVWconst", argLength: 0, reg: gp01, aux: "Int32", asm: "MOVW", typ: "UInt32", rematerializeable: true}, // auxint
+ {name: "MOVFconst", argLength: 0, reg: fp01, aux: "Float32", asm: "MOVF", typ: "Float32", rematerializeable: true}, // auxint as 64-bit float, convert to 32-bit float
+ {name: "MOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "MOVD", typ: "Float64", rematerializeable: true}, // auxint as 64-bit float
+
+ {name: "MOVWaddr", argLength: 1, reg: regInfo{inputs: []regMask{buildReg("SP") | buildReg("SB")}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVW", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB
+
+ {name: "MOVBload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVB", typ: "Int8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVBUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVBU", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVH", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVHU", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVWload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVW", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVFload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVF", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVDload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVD", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+
+ {name: "MOVBstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVHstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVWstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVFstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVF", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVDstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+
+ {name: "MOVBstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of zero to arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of zero to arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVWstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of zero to arg0 + auxInt + aux. arg1=mem.
+
+ // conversions
+ {name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB"}, // move from arg0, sign-extended from byte
+ {name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, zero-extended from byte
+ {name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVH"}, // move from arg0, sign-extended from half
+ {name: "MOVHUreg", argLength: 1, reg: gp11, asm: "MOVHU"}, // move from arg0, zero-extended from half
+ {name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW"}, // move from arg0
+
+ {name: "MOVWnop", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}, resultInArg0: true}, // nop, return arg0 in same register
+
+ // conditional move on zero (returns arg1 if arg2 is 0, otherwise arg0)
+ // order of parameters is reversed so we can use resultInArg0 (OpCMOVZ result arg1 arg2-> CMOVZ arg2reg, arg1reg, resultReg)
+ {name: "CMOVZ", argLength: 3, reg: gp31, asm: "CMOVZ", resultInArg0: true},
+ {name: "CMOVZzero", argLength: 2, reg: regInfo{inputs: []regMask{gp, gpg}, outputs: []regMask{gp}}, asm: "CMOVZ", resultInArg0: true},
+
+ {name: "MOVWF", argLength: 1, reg: fp11, asm: "MOVWF"}, // int32 -> float32
+ {name: "MOVWD", argLength: 1, reg: fp11, asm: "MOVWD"}, // int32 -> float64
+ {name: "TRUNCFW", argLength: 1, reg: fp11, asm: "TRUNCFW"}, // float32 -> int32
+ {name: "TRUNCDW", argLength: 1, reg: fp11, asm: "TRUNCDW"}, // float64 -> int32
+ {name: "MOVFD", argLength: 1, reg: fp11, asm: "MOVFD"}, // float32 -> float64
+ {name: "MOVDF", argLength: 1, reg: fp11, asm: "MOVDF"}, // float64 -> float32
+
+ // function calls
+ {name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "CALLtail", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gpsp, buildReg("R22"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+ {name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+
+ // atomic ops
+
+ // load from arg0. arg1=mem.
+ // returns <value,memory> so they can be properly ordered with other loads.
+ // SYNC
+ // MOV(B|W) (Rarg0), Rout
+ // SYNC
+ {name: "LoweredAtomicLoad8", argLength: 2, reg: gpload, faultOnNilArg0: true},
+ {name: "LoweredAtomicLoad32", argLength: 2, reg: gpload, faultOnNilArg0: true},
+
+ // store arg1 to arg0. arg2=mem. returns memory.
+ // SYNC
+ // MOV(B|W) Rarg1, (Rarg0)
+ // SYNC
+ {name: "LoweredAtomicStore8", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicStore32", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicStorezero", argLength: 2, reg: gpstore0, faultOnNilArg0: true, hasSideEffects: true},
+
+ // atomic exchange.
+ // store arg1 to arg0. arg2=mem. returns <old content of *arg0, memory>.
+ // SYNC
+ // LL (Rarg0), Rout
+ // MOVW Rarg1, Rtmp
+ // SC Rtmp, (Rarg0)
+ // BEQ Rtmp, -3(PC)
+ // SYNC
+ {name: "LoweredAtomicExchange", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // atomic add.
+ // *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>.
+ // SYNC
+ // LL (Rarg0), Rout
+ // ADDU Rarg1, Rout, Rtmp
+ // SC Rtmp, (Rarg0)
+ // BEQ Rtmp, -3(PC)
+ // SYNC
+ // ADDU Rarg1, Rout
+ {name: "LoweredAtomicAdd", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicAddconst", argLength: 2, reg: regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}, aux: "Int32", resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // atomic compare and swap.
+ // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory.
+ // if *arg0 == arg1 {
+ // *arg0 = arg2
+ // return (true, memory)
+ // } else {
+ // return (false, memory)
+ // }
+ // SYNC
+ // MOVW $0, Rout
+ // LL (Rarg0), Rtmp
+ // BNE Rtmp, Rarg1, 4(PC)
+ // MOVW Rarg2, Rout
+ // SC Rout, (Rarg0)
+ // BEQ Rout, -4(PC)
+ // SYNC
+ {name: "LoweredAtomicCas", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // atomic and/or.
+ // *arg0 &= (|=) arg1. arg2=mem. returns memory.
+ // SYNC
+ // LL (Rarg0), Rtmp
+ // AND Rarg1, Rtmp
+ // SC Rtmp, (Rarg0)
+ // BEQ Rtmp, -3(PC)
+ // SYNC
+ {name: "LoweredAtomicAnd", argLength: 3, reg: gpstore, asm: "AND", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicOr", argLength: 3, reg: gpstore, asm: "OR", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // large or unaligned zeroing
+ // arg0 = address of memory to zero (in R1, changed as side effect)
+ // arg1 = address of the last element to zero
+ // arg2 = mem
+ // auxint = alignment
+ // returns mem
+ // SUBU $4, R1
+ // MOVW R0, 4(R1)
+ // ADDU $4, R1
+ // BNE Rarg1, R1, -2(PC)
+ {
+ name: "LoweredZero",
+ aux: "Int32",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R1"), gp},
+ clobbers: buildReg("R1"),
+ },
+ faultOnNilArg0: true,
+ },
+
+ // large or unaligned move
+ // arg0 = address of dst memory (in R2, changed as side effect)
+ // arg1 = address of src memory (in R1, changed as side effect)
+ // arg2 = address of the last element of src
+ // arg3 = mem
+ // auxint = alignment
+ // returns mem
+ // SUBU $4, R1
+ // MOVW 4(R1), Rtmp
+ // MOVW Rtmp, (R2)
+ // ADDU $4, R1
+ // ADDU $4, R2
+ // BNE Rarg2, R1, -4(PC)
+ {
+ name: "LoweredMove",
+ aux: "Int32",
+ argLength: 4,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R2"), buildReg("R1"), gp},
+ clobbers: buildReg("R1 R2"),
+ },
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // pseudo-ops
+ {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpg}}, nilCheck: true, faultOnNilArg0: true}, // panic if arg0 is nil. arg1=mem.
+
+ {name: "FPFlagTrue", argLength: 1, reg: readflags}, // bool, true if FP flag is true
+ {name: "FPFlagFalse", argLength: 1, reg: readflags}, // bool, true if FP flag is false
+
+ // Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
+ // and sorts it to the very beginning of the block to prevent other
+ // use of R22 (mips.REGCTXT, the closure pointer)
+ {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("R22")}}, zeroWidth: true},
+
+ // LoweredGetCallerSP returns the SP of the caller of the current function.
+ {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true},
+
+ // LoweredGetCallerPC evaluates to the PC to which its "caller" will return.
+ // I.e., if f calls g, and g "calls" getcallerpc,
+ // the result should be the PC within f that g will return to.
+ // See runtime/stubs.go for a more detailed discussion.
+ {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true},
+
+ // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+ // It saves all GP registers if necessary,
+ // but clobbers R31 (LR) because it's a call
+ // and R23 (REGTMP).
+ {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("R20"), buildReg("R21")}, clobbers: (callerSave &^ gpg) | buildReg("R31")}, clobberFlags: true, aux: "Sym", symEffect: "None"},
+
+ // There are three of these functions so that they can have three different register inputs.
+ // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
+ // default registers to match so we don't need to copy registers around unnecessarily.
+ {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r3, r4}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ // Extend ops are the same as Bounds ops except the indexes are 64-bit.
+ {name: "LoweredPanicExtendA", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r5, r3, r4}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
+ {name: "LoweredPanicExtendB", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r5, r2, r3}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
+ {name: "LoweredPanicExtendC", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r5, r1, r2}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
+ }
+
+ blocks := []blockData{
+ {name: "EQ", controls: 1},
+ {name: "NE", controls: 1},
+ {name: "LTZ", controls: 1}, // < 0
+ {name: "LEZ", controls: 1}, // <= 0
+ {name: "GTZ", controls: 1}, // > 0
+ {name: "GEZ", controls: 1}, // >= 0
+ {name: "FPT", controls: 1}, // FP flag is true
+ {name: "FPF", controls: 1}, // FP flag is false
+ }
+
+ archs = append(archs, arch{
+ name: "MIPS",
+ pkg: "cmd/internal/obj/mips",
+ genfile: "../../mips/ssa.go",
+ ops: ops,
+ blocks: blocks,
+ regnames: regNamesMIPS,
+ gpregmask: gp,
+ fpregmask: fp,
+ specialregmask: hi | lo,
+ framepointerreg: -1, // not used
+ linkreg: int8(num["R31"]),
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/_gen/PPC64.rules b/src/cmd/compile/internal/ssa/_gen/PPC64.rules
new file mode 100644
index 0000000..3aea9ab
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/PPC64.rules
@@ -0,0 +1,1274 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Lowering arithmetic
+(Add(Ptr|64|32|16|8) ...) => (ADD ...)
+(Add64F ...) => (FADD ...)
+(Add32F ...) => (FADDS ...)
+
+(Sub(Ptr|64|32|16|8) ...) => (SUB ...)
+(Sub32F ...) => (FSUBS ...)
+(Sub64F ...) => (FSUB ...)
+
+// Combine 64 bit integer multiply and adds
+(ADD l:(MULLD x y) z) && buildcfg.GOPPC64 >= 9 && l.Uses == 1 && clobber(l) => (MADDLD x y z)
+
+(Mod16 x y) => (Mod32 (SignExt16to32 x) (SignExt16to32 y))
+(Mod16u x y) => (Mod32u (ZeroExt16to32 x) (ZeroExt16to32 y))
+(Mod8 x y) => (Mod32 (SignExt8to32 x) (SignExt8to32 y))
+(Mod8u x y) => (Mod32u (ZeroExt8to32 x) (ZeroExt8to32 y))
+(Mod64 x y) && buildcfg.GOPPC64 >=9 => (MODSD x y)
+(Mod64 x y) && buildcfg.GOPPC64 <=8 => (SUB x (MULLD y (DIVD x y)))
+(Mod64u x y) && buildcfg.GOPPC64 >= 9 => (MODUD x y)
+(Mod64u x y) && buildcfg.GOPPC64 <= 8 => (SUB x (MULLD y (DIVDU x y)))
+(Mod32 x y) && buildcfg.GOPPC64 >= 9 => (MODSW x y)
+(Mod32 x y) && buildcfg.GOPPC64 <= 8 => (SUB x (MULLW y (DIVW x y)))
+(Mod32u x y) && buildcfg.GOPPC64 >= 9 => (MODUW x y)
+(Mod32u x y) && buildcfg.GOPPC64 <= 8 => (SUB x (MULLW y (DIVWU x y)))
+
+// (x + y) / 2 with x>=y => (x - y) / 2 + y
+(Avg64u <t> x y) => (ADD (SRDconst <t> (SUB <t> x y) [1]) y)
+
+(Mul64 ...) => (MULLD ...)
+(Mul(32|16|8) ...) => (MULLW ...)
+(Select0 (Mul64uhilo x y)) => (MULHDU x y)
+(Select1 (Mul64uhilo x y)) => (MULLD x y)
+
+(Div64 [false] x y) => (DIVD x y)
+(Div64u ...) => (DIVDU ...)
+(Div32 [false] x y) => (DIVW x y)
+(Div32u ...) => (DIVWU ...)
+(Div16 [false] x y) => (DIVW (SignExt16to32 x) (SignExt16to32 y))
+(Div16u x y) => (DIVWU (ZeroExt16to32 x) (ZeroExt16to32 y))
+(Div8 x y) => (DIVW (SignExt8to32 x) (SignExt8to32 y))
+(Div8u x y) => (DIVWU (ZeroExt8to32 x) (ZeroExt8to32 y))
+
+(Hmul(64|64u|32|32u) ...) => (MULH(D|DU|W|WU) ...)
+
+(Mul(32|64)F ...) => ((FMULS|FMUL) ...)
+
+(Div(32|64)F ...) => ((FDIVS|FDIV) ...)
+
+// Lowering float <=> int
+(Cvt32to(32|64)F x) => ((FCFIDS|FCFID) (MTVSRD (SignExt32to64 x)))
+(Cvt64to(32|64)F x) => ((FCFIDS|FCFID) (MTVSRD x))
+
+(Cvt32Fto(32|64) x) => (MFVSRD (FCTI(W|D)Z x))
+(Cvt64Fto(32|64) x) => (MFVSRD (FCTI(W|D)Z x))
+
+(Cvt32Fto64F ...) => (Copy ...) // Note v will have the wrong type for patterns dependent on Float32/Float64
+(Cvt64Fto32F ...) => (FRSP ...)
+
+(CvtBoolToUint8 ...) => (Copy ...)
+
+(Round(32|64)F ...) => (LoweredRound(32|64)F ...)
+
+(Sqrt ...) => (FSQRT ...)
+(Sqrt32 ...) => (FSQRTS ...)
+(Floor ...) => (FFLOOR ...)
+(Ceil ...) => (FCEIL ...)
+(Trunc ...) => (FTRUNC ...)
+(Round ...) => (FROUND ...)
+(Copysign x y) => (FCPSGN y x)
+(Abs ...) => (FABS ...)
+(FMA ...) => (FMADD ...)
+
+// Lowering extension
+// Note: we always extend to 64 bits even though some ops don't need that many result bits.
+(SignExt8to(16|32|64) ...) => (MOVBreg ...)
+(SignExt16to(32|64) ...) => (MOVHreg ...)
+(SignExt32to64 ...) => (MOVWreg ...)
+
+(ZeroExt8to(16|32|64) ...) => (MOVBZreg ...)
+(ZeroExt16to(32|64) ...) => (MOVHZreg ...)
+(ZeroExt32to64 ...) => (MOVWZreg ...)
+
+(Trunc(16|32|64)to8 <t> x) && isSigned(t) => (MOVBreg x)
+(Trunc(16|32|64)to8 x) => (MOVBZreg x)
+(Trunc(32|64)to16 <t> x) && isSigned(t) => (MOVHreg x)
+(Trunc(32|64)to16 x) => (MOVHZreg x)
+(Trunc64to32 <t> x) && isSigned(t) => (MOVWreg x)
+(Trunc64to32 x) => (MOVWZreg x)
+
+// Lowering constants
+(Const(64|32|16|8) [val]) => (MOVDconst [int64(val)])
+(Const(32|64)F ...) => (FMOV(S|D)const ...)
+(ConstNil) => (MOVDconst [0])
+(ConstBool [t]) => (MOVDconst [b2i(t)])
+
+// Carrying addition.
+(Select0 (Add64carry x y c)) => (Select0 <typ.UInt64> (ADDE x y (Select1 <typ.UInt64> (ADDCconst c [-1]))))
+(Select1 (Add64carry x y c)) => (ADDZEzero (Select1 <typ.UInt64> (ADDE x y (Select1 <typ.UInt64> (ADDCconst c [-1])))))
+// Fold initial carry bit if 0.
+(ADDE x y (Select1 <typ.UInt64> (ADDCconst (MOVDconst [0]) [-1]))) => (ADDC x y)
+// Fold transfer of CA -> GPR -> CA. Note 2 uses when feeding into a chained Add64carry.
+(Select1 (ADDCconst n:(ADDZEzero x) [-1])) && n.Uses <= 2 => x
+
+// Borrowing subtraction.
+(Select0 (Sub64borrow x y c)) => (Select0 <typ.UInt64> (SUBE x y (Select1 <typ.UInt64> (SUBCconst c [0]))))
+(Select1 (Sub64borrow x y c)) => (NEG (SUBZEzero (Select1 <typ.UInt64> (SUBE x y (Select1 <typ.UInt64> (SUBCconst c [0]))))))
+// Fold initial borrow bit if 0.
+(SUBE x y (Select1 <typ.UInt64> (SUBCconst (MOVDconst [0]) [0]))) => (SUBC x y)
+// Fold transfer of CA -> GPR -> CA. Note 2 uses when feeding into a chained Sub64borrow.
+(Select1 (SUBCconst n:(NEG (SUBZEzero x)) [0])) && n.Uses <= 2 => x
+
+// Constant folding
+(FABS (FMOVDconst [x])) => (FMOVDconst [math.Abs(x)])
+(FSQRT (FMOVDconst [x])) && x >= 0 => (FMOVDconst [math.Sqrt(x)])
+(FFLOOR (FMOVDconst [x])) => (FMOVDconst [math.Floor(x)])
+(FCEIL (FMOVDconst [x])) => (FMOVDconst [math.Ceil(x)])
+(FTRUNC (FMOVDconst [x])) => (FMOVDconst [math.Trunc(x)])
+
+// Rotates
+(RotateLeft8 <t> x (MOVDconst [c])) => (Or8 (Lsh8x64 <t> x (MOVDconst [c&7])) (Rsh8Ux64 <t> x (MOVDconst [-c&7])))
+(RotateLeft16 <t> x (MOVDconst [c])) => (Or16 (Lsh16x64 <t> x (MOVDconst [c&15])) (Rsh16Ux64 <t> x (MOVDconst [-c&15])))
+(RotateLeft(32|64) ...) => ((ROTLW|ROTL) ...)
+
+// Constant rotate generation
+(ROTLW x (MOVDconst [c])) => (ROTLWconst x [c&31])
+(ROTL x (MOVDconst [c])) => (ROTLconst x [c&63])
+
+// Combine rotate and mask operations
+(Select0 (ANDCCconst [m] (ROTLWconst [r] x))) && isPPC64WordRotateMask(m) => (RLWINM [encodePPC64RotateMask(r,m,32)] x)
+(AND (MOVDconst [m]) (ROTLWconst [r] x)) && isPPC64WordRotateMask(m) => (RLWINM [encodePPC64RotateMask(r,m,32)] x)
+(Select0 (ANDCCconst [m] (ROTLW x r))) && isPPC64WordRotateMask(m) => (RLWNM [encodePPC64RotateMask(0,m,32)] x r)
+(AND (MOVDconst [m]) (ROTLW x r)) && isPPC64WordRotateMask(m) => (RLWNM [encodePPC64RotateMask(0,m,32)] x r)
+
+// Note, any rotated word bitmask is still a valid word bitmask.
+(ROTLWconst [r] (AND (MOVDconst [m]) x)) && isPPC64WordRotateMask(m) => (RLWINM [encodePPC64RotateMask(r,rotateLeft32(m,r),32)] x)
+(ROTLWconst [r] (Select0 (ANDCCconst [m] x))) && isPPC64WordRotateMask(m) => (RLWINM [encodePPC64RotateMask(r,rotateLeft32(m,r),32)] x)
+
+(Select0 (ANDCCconst [m] (SRWconst x [s]))) && mergePPC64RShiftMask(m,s,32) == 0 => (MOVDconst [0])
+(Select0 (ANDCCconst [m] (SRWconst x [s]))) && mergePPC64AndSrwi(m,s) != 0 => (RLWINM [mergePPC64AndSrwi(m,s)] x)
+(AND (MOVDconst [m]) (SRWconst x [s])) && mergePPC64RShiftMask(m,s,32) == 0 => (MOVDconst [0])
+(AND (MOVDconst [m]) (SRWconst x [s])) && mergePPC64AndSrwi(m,s) != 0 => (RLWINM [mergePPC64AndSrwi(m,s)] x)
+
+(SRWconst (Select0 (ANDCCconst [m] x)) [s]) && mergePPC64RShiftMask(m>>uint(s),s,32) == 0 => (MOVDconst [0])
+(SRWconst (Select0 (ANDCCconst [m] x)) [s]) && mergePPC64AndSrwi(m>>uint(s),s) != 0 => (RLWINM [mergePPC64AndSrwi(m>>uint(s),s)] x)
+(SRWconst (AND (MOVDconst [m]) x) [s]) && mergePPC64RShiftMask(m>>uint(s),s,32) == 0 => (MOVDconst [0])
+(SRWconst (AND (MOVDconst [m]) x) [s]) && mergePPC64AndSrwi(m>>uint(s),s) != 0 => (RLWINM [mergePPC64AndSrwi(m>>uint(s),s)] x)
+
+// Merge shift right + shift left and clear left (e.g. for a table lookup)
+(CLRLSLDI [c] (SRWconst [s] x)) && mergePPC64ClrlsldiSrw(int64(c),s) != 0 => (RLWINM [mergePPC64ClrlsldiSrw(int64(c),s)] x)
+(SLDconst [l] (SRWconst [r] x)) && mergePPC64SldiSrw(l,r) != 0 => (RLWINM [mergePPC64SldiSrw(l,r)] x)
+// The following reduction shows up frequently too, e.g. b[(x>>14)&0xFF]
+(CLRLSLDI [c] i:(RLWINM [s] x)) && mergePPC64ClrlsldiRlwinm(c,s) != 0 => (RLWINM [mergePPC64ClrlsldiRlwinm(c,s)] x)
+
+// large constant signed right shift, we leave the sign bit
+(Rsh64x64 x (MOVDconst [c])) && uint64(c) >= 64 => (SRADconst x [63])
+(Rsh32x64 x (MOVDconst [c])) && uint64(c) >= 32 => (SRAWconst x [63])
+(Rsh16x64 x (MOVDconst [c])) && uint64(c) >= 16 => (SRAWconst (SignExt16to32 x) [63])
+(Rsh8x64 x (MOVDconst [c])) && uint64(c) >= 8 => (SRAWconst (SignExt8to32 x) [63])
+
+// constant shifts
+((Lsh64|Rsh64|Rsh64U)x64 x (MOVDconst [c])) && uint64(c) < 64 => (S(L|RA|R)Dconst x [c])
+((Lsh32|Rsh32|Rsh32U)x64 x (MOVDconst [c])) && uint64(c) < 32 => (S(L|RA|R)Wconst x [c])
+((Rsh16|Rsh16U)x64 x (MOVDconst [c])) && uint64(c) < 16 => (SR(AW|W)const ((Sign|Zero)Ext16to32 x) [c])
+(Lsh16x64 x (MOVDconst [c])) && uint64(c) < 16 => (SLWconst x [c])
+((Rsh8|Rsh8U)x64 x (MOVDconst [c])) && uint64(c) < 8 => (SR(AW|W)const ((Sign|Zero)Ext8to32 x) [c])
+(Lsh8x64 x (MOVDconst [c])) && uint64(c) < 8 => (SLWconst x [c])
+
+// Lower bounded shifts first. No need to check shift value.
+(Lsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SLD x y)
+(Lsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y)
+(Lsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y)
+(Lsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y)
+(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRD x y)
+(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW x y)
+(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW (MOVHZreg x) y)
+(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW (MOVBZreg x) y)
+(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAD x y)
+(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW x y)
+(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW (MOVHreg x) y)
+(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW (MOVBreg x) y)
+
+// non-constant shifts
+// If the shift count is not less than the operand width (64, 32, 16 or 8),
+// use -1 as the shift count to shift out all bits.
+((Lsh64|Rsh64|Rsh64U)x64 x y) => (S(L|RA|R)D x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [64]))))
+((Rsh32|Rsh32U|Lsh32)x64 x y) => (S(RA|R|L)W x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [32]))))
+
+(Rsh(16|16U)x64 x y) => (SR(AW|W) ((Sign|Zero)Ext16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16]))))
+(Lsh16x64 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16]))))
+
+(Rsh(8|8U)x64 x y) => (SR(AW|W) ((Sign|Zero)Ext8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8]))))
+(Lsh8x64 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8]))))
+
+((Rsh64|Rsh64U|Lsh64)x32 x y) => (S(RA|R|L)D x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [64]))))
+((Rsh32|Rsh32U|Lsh32)x32 x y) => (S(RA|R|L)W x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [32]))))
+
+(Rsh(16|16U)x32 x y) => (SR(AW|W) ((Sign|Zero)Ext16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16]))))
+(Lsh16x32 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16]))))
+
+(Rsh(8|8U)x32 x y) => (SR(AW|W) ((Sign|Zero)Ext8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8]))))
+(Lsh8x32 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8]))))
+
+((Rsh64|Rsh64U|Lsh64)x16 x y) => (S(RA|R|L)D x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [64]))))
+
+((Rsh32|Rsh32U|Lsh32)x16 x y) => (S(RA|R|L)W x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [32]))))
+
+(Rsh(16|16U)x16 x y) => (S(RA|R)W ((Sign|Zero)Ext16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [16]))))
+(Lsh16x16 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [16]))))
+
+(Rsh(8|8U)x16 x y) => (SR(AW|W) ((Sign|Zero)Ext8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [8]))))
+(Lsh8x16 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [8]))))
+
+
+((Rsh64|Rsh64U|Lsh64)x8 x y) => (S(RA|R|L)D x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [64]))))
+
+((Rsh32|Rsh32U|Lsh32)x8 x y) => (S(RA|R|L)W x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [32]))))
+
+(Rsh(16|16U)x8 x y) => (S(RA|R)W ((Sign|Zero)Ext16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [16]))))
+(Lsh16x8 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [16]))))
+
+(Rsh(8|8U)x8 x y) => (S(RA|R)W ((Sign|Zero)Ext8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [8]))))
+(Lsh8x8 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [8]))))
+
+// Cleaning up shift ops: an ISEL that clamps an already-masked shift count (mask d, bound c, c >= d) is replaced by the masked count itself.
+(ISEL [0] (Select0 (ANDCCconst [d] y)) (MOVDconst [-1]) (CMPU (Select0 (ANDCCconst [d] y)) (MOVDconst [c]))) && c >= d => (Select0 (ANDCCconst [d] y))
+(ISEL [0] (Select0 (ANDCCconst [d] y)) (MOVDconst [-1]) (CMPUconst [c] (Select0 (ANDCCconst [d] y)))) && c >= d => (Select0 (ANDCCconst [d] y))
+(ORN x (MOVDconst [-1])) => x // x | ^(-1) == x
+
+(S(RAD|RD|LD) x (MOVDconst [c])) => (S(RAD|RD|LD)const [c&63 | (c>>6&1*63)] x) // const count is c&63, forced to 63 when bit 6 of c is set
+(S(RAW|RW|LW) x (MOVDconst [c])) => (S(RAW|RW|LW)const [c&31 | (c>>5&1*31)] x) // const count is c&31, forced to 31 when bit 5 of c is set
+
+(Addr {sym} base) => (MOVDaddr {sym} [0] base)
+(LocalAddr {sym} base _) => (MOVDaddr {sym} base)
+(OffPtr [off] ptr) => (ADD (MOVDconst <typ.Int64> [off]) ptr)
+
+// TODO: optimize these cases?
+(Ctz32NonZero ...) => (Ctz32 ...)
+(Ctz64NonZero ...) => (Ctz64 ...)
+
+(Ctz64 x) && buildcfg.GOPPC64<=8 => (POPCNTD (ANDN <typ.Int64> (ADDconst <typ.Int64> [-1] x) x)) // popcount((x-1) &^ x)
+(Ctz64 x) => (CNTTZD x) // unconditional form; presumably only reached when GOPPC64 > 8 (assumes rules are tried in file order - confirm)
+(Ctz32 x) && buildcfg.GOPPC64<=8 => (POPCNTW (MOVWZreg (ANDN <typ.Int> (ADDconst <typ.Int> [-1] x) x))) // popcount of the zero-extended low 32 bits of (x-1) &^ x
+(Ctz32 x) => (CNTTZW (MOVWZreg x)) // unconditional form, operand zero-extended from 32 bits first
+(Ctz16 x) => (POPCNTW (MOVHZreg (ANDN <typ.Int16> (ADDconst <typ.Int16> [-1] x) x))) // no GOPPC64 condition: always the popcount form
+(Ctz8 x) => (POPCNTB (MOVBZreg (ANDN <typ.UInt8> (ADDconst <typ.UInt8> [-1] x) x))) // no GOPPC64 condition: always the popcount form
+
+(BitLen64 x) => (SUBFCconst [64] (CNTLZD <typ.Int> x))
+(BitLen32 x) => (SUBFCconst [32] (CNTLZW <typ.Int> x))
+
+(PopCount64 ...) => (POPCNTD ...)
+(PopCount(32|16|8) x) => (POPCNT(W|W|B) (MOV(W|H|B)Zreg x))
+
+(And(64|32|16|8) ...) => (AND ...)
+(Or(64|32|16|8) ...) => (OR ...)
+(Xor(64|32|16|8) ...) => (XOR ...)
+
+(Neg(64|32|16|8) ...) => (NEG ...)
+(Neg(64|32)F ...) => (FNEG ...)
+
+(Com(64|32|16|8) x) => (NOR x x)
+
+// Lowering boolean ops
+(AndB ...) => (AND ...)
+(OrB ...) => (OR ...)
+(Not x) => (XORconst [1] x)
+
+// Merge logical operations
+(AND x (NOR y y)) => (ANDN x y)
+(OR x (NOR y y)) => (ORN x y)
+
+// Lowering comparisons
+(EqB x y) => (Select0 <typ.Int> (ANDCCconst [1] (EQV x y)))
+// Sign extension dependence on operand sign sets up for sign/zero-extension elision later
+(Eq(8|16) x y) && isSigned(x.Type) && isSigned(y.Type) => (Equal (CMPW (SignExt(8|16)to32 x) (SignExt(8|16)to32 y)))
+(Eq(8|16) x y) => (Equal (CMPW (ZeroExt(8|16)to32 x) (ZeroExt(8|16)to32 y)))
+(Eq(32|64|Ptr) x y) => (Equal ((CMPW|CMP|CMP) x y))
+(Eq(32|64)F x y) => (Equal (FCMPU x y))
+
+(NeqB ...) => (XOR ...)
+// Like Eq8 and Eq16, prefer sign extension likely to enable later elision.
+(Neq(8|16) x y) && isSigned(x.Type) && isSigned(y.Type) => (NotEqual (CMPW (SignExt(8|16)to32 x) (SignExt(8|16)to32 y)))
+(Neq(8|16) x y) => (NotEqual (CMPW (ZeroExt(8|16)to32 x) (ZeroExt(8|16)to32 y)))
+(Neq(32|64|Ptr) x y) => (NotEqual ((CMPW|CMP|CMP) x y))
+(Neq(32|64)F x y) => (NotEqual (FCMPU x y))
+
+(Less(8|16) x y) => (LessThan (CMPW (SignExt(8|16)to32 x) (SignExt(8|16)to32 y)))
+(Less(32|64) x y) => (LessThan ((CMPW|CMP) x y))
+(Less(32|64)F x y) => (FLessThan (FCMPU x y))
+
+(Less(8|16)U x y) => (LessThan (CMPWU (ZeroExt(8|16)to32 x) (ZeroExt(8|16)to32 y)))
+(Less(32|64)U x y) => (LessThan ((CMPWU|CMPU) x y))
+
+(Leq(8|16) x y) => (LessEqual (CMPW (SignExt(8|16)to32 x) (SignExt(8|16)to32 y)))
+(Leq(32|64) x y) => (LessEqual ((CMPW|CMP) x y))
+(Leq(32|64)F x y) => (FLessEqual (FCMPU x y))
+
+(Leq(8|16)U x y) => (LessEqual (CMPWU (ZeroExt(8|16)to32 x) (ZeroExt(8|16)to32 y)))
+(Leq(32|64)U x y) => (LessEqual (CMP(WU|U) x y))
+
+// Absorb pseudo-ops into blocks.
+(If (Equal cc) yes no) => (EQ cc yes no)
+(If (NotEqual cc) yes no) => (NE cc yes no)
+(If (LessThan cc) yes no) => (LT cc yes no)
+(If (LessEqual cc) yes no) => (LE cc yes no)
+(If (GreaterThan cc) yes no) => (GT cc yes no)
+(If (GreaterEqual cc) yes no) => (GE cc yes no)
+(If (FLessThan cc) yes no) => (FLT cc yes no)
+(If (FLessEqual cc) yes no) => (FLE cc yes no)
+(If (FGreaterThan cc) yes no) => (FGT cc yes no)
+(If (FGreaterEqual cc) yes no) => (FGE cc yes no)
+
+(If cond yes no) => (NE (CMPWconst [0] (Select0 <typ.UInt32> (ANDCCconst [1] cond))) yes no)
+
+// Absorb boolean tests into block
+(NE (CMPWconst [0] (Select0 (ANDCCconst [1] ((Equal|NotEqual|LessThan|LessEqual|GreaterThan|GreaterEqual) cc)))) yes no) => ((EQ|NE|LT|LE|GT|GE) cc yes no)
+(NE (CMPWconst [0] (Select0 (ANDCCconst [1] ((FLessThan|FLessEqual|FGreaterThan|FGreaterEqual) cc)))) yes no) => ((FLT|FLE|FGT|FGE) cc yes no)
+
+// Elide compares of bit tests (EQ/NE only here; the same rewrite for all six conditions appears again further below)
+((EQ|NE) (CMPconst [0] (Select0 (ANDCCconst [c] x))) yes no) => ((EQ|NE) (Select1 <types.TypeFlags> (ANDCCconst [c] x)) yes no)
+((EQ|NE) (CMPWconst [0] (Select0 (ANDCCconst [c] x))) yes no) => ((EQ|NE) (Select1 <types.TypeFlags> (ANDCCconst [c] x)) yes no)
+
+// absorb flag constants into branches
+(EQ (FlagEQ) yes no) => (First yes no)
+(EQ (FlagLT) yes no) => (First no yes)
+(EQ (FlagGT) yes no) => (First no yes)
+
+(NE (FlagEQ) yes no) => (First no yes)
+(NE (FlagLT) yes no) => (First yes no)
+(NE (FlagGT) yes no) => (First yes no)
+
+(LT (FlagEQ) yes no) => (First no yes)
+(LT (FlagLT) yes no) => (First yes no)
+(LT (FlagGT) yes no) => (First no yes)
+
+(LE (FlagEQ) yes no) => (First yes no)
+(LE (FlagLT) yes no) => (First yes no)
+(LE (FlagGT) yes no) => (First no yes)
+
+(GT (FlagEQ) yes no) => (First no yes)
+(GT (FlagLT) yes no) => (First no yes)
+(GT (FlagGT) yes no) => (First yes no)
+
+(GE (FlagEQ) yes no) => (First yes no)
+(GE (FlagLT) yes no) => (First no yes)
+(GE (FlagGT) yes no) => (First yes no)
+
+// absorb InvertFlags into branches: LT<->GT and LE<->GE are exchanged, EQ and NE are unchanged
+(LT (InvertFlags cmp) yes no) => (GT cmp yes no)
+(GT (InvertFlags cmp) yes no) => (LT cmp yes no)
+(LE (InvertFlags cmp) yes no) => (GE cmp yes no)
+(GE (InvertFlags cmp) yes no) => (LE cmp yes no)
+(EQ (InvertFlags cmp) yes no) => (EQ cmp yes no)
+(NE (InvertFlags cmp) yes no) => (NE cmp yes no)
+
+// constant comparisons: a compare of two constants folds to a flag constant (W forms compare only the low 32 bits; U forms compare as unsigned)
+(CMPWconst (MOVDconst [x]) [y]) && int32(x)==int32(y) => (FlagEQ)
+(CMPWconst (MOVDconst [x]) [y]) && int32(x)<int32(y) => (FlagLT)
+(CMPWconst (MOVDconst [x]) [y]) && int32(x)>int32(y) => (FlagGT)
+
+(CMPconst (MOVDconst [x]) [y]) && x==y => (FlagEQ)
+(CMPconst (MOVDconst [x]) [y]) && x<y => (FlagLT)
+(CMPconst (MOVDconst [x]) [y]) && x>y => (FlagGT)
+
+(CMPWUconst (MOVDconst [x]) [y]) && int32(x)==int32(y) => (FlagEQ)
+(CMPWUconst (MOVDconst [x]) [y]) && uint32(x)<uint32(y) => (FlagLT)
+(CMPWUconst (MOVDconst [x]) [y]) && uint32(x)>uint32(y) => (FlagGT)
+
+(CMPUconst (MOVDconst [x]) [y]) && x==y => (FlagEQ)
+(CMPUconst (MOVDconst [x]) [y]) && uint64(x)<uint64(y) => (FlagLT)
+(CMPUconst (MOVDconst [x]) [y]) && uint64(x)>uint64(y) => (FlagGT)
+
+// absorb flag constants into boolean values
+(Equal (FlagEQ)) => (MOVDconst [1])
+(Equal (FlagLT)) => (MOVDconst [0])
+(Equal (FlagGT)) => (MOVDconst [0])
+
+(NotEqual (FlagEQ)) => (MOVDconst [0])
+(NotEqual (FlagLT)) => (MOVDconst [1])
+(NotEqual (FlagGT)) => (MOVDconst [1])
+
+(LessThan (FlagEQ)) => (MOVDconst [0])
+(LessThan (FlagLT)) => (MOVDconst [1])
+(LessThan (FlagGT)) => (MOVDconst [0])
+
+(LessEqual (FlagEQ)) => (MOVDconst [1])
+(LessEqual (FlagLT)) => (MOVDconst [1])
+(LessEqual (FlagGT)) => (MOVDconst [0])
+
+(GreaterThan (FlagEQ)) => (MOVDconst [0])
+(GreaterThan (FlagLT)) => (MOVDconst [0])
+(GreaterThan (FlagGT)) => (MOVDconst [1])
+
+(GreaterEqual (FlagEQ)) => (MOVDconst [1])
+(GreaterEqual (FlagLT)) => (MOVDconst [0])
+(GreaterEqual (FlagGT)) => (MOVDconst [1])
+
+// absorb InvertFlags into boolean values
+((Equal|NotEqual|LessThan|GreaterThan|LessEqual|GreaterEqual) (InvertFlags x)) => ((Equal|NotEqual|GreaterThan|LessThan|GreaterEqual|LessEqual) x)
+
+
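+// Elide compares against zero of logical ops (all six conditions): branch on the
+// flags result (Select1) of the CC-setting form of the op instead. Plain AND/OR/XOR
+// are converted to ANDCC/ORCC/XORCC only when the compare is their sole use.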
+((EQ|NE|LT|LE|GT|GE) (CMPconst [0] (Select0 (ANDCCconst [c] x))) yes no) => ((EQ|NE|LT|LE|GT|GE) (Select1 <types.TypeFlags> (ANDCCconst [c] x)) yes no)
+((EQ|NE|LT|LE|GT|GE) (CMPWconst [0] (Select0 (ANDCCconst [c] x))) yes no) => ((EQ|NE|LT|LE|GT|GE) (Select1 <types.TypeFlags> (ANDCCconst [c] x)) yes no)
+((EQ|NE|LT|LE|GT|GE) (CMPconst [0] z:(AND x y)) yes no) && z.Uses == 1 => ((EQ|NE|LT|LE|GT|GE) (Select1 <types.TypeFlags> (ANDCC x y)) yes no)
+((EQ|NE|LT|LE|GT|GE) (CMPconst [0] z:(OR x y)) yes no) && z.Uses == 1 => ((EQ|NE|LT|LE|GT|GE) (Select1 <types.TypeFlags> (ORCC x y)) yes no)
+((EQ|NE|LT|LE|GT|GE) (CMPconst [0] z:(XOR x y)) yes no) && z.Uses == 1 => ((EQ|NE|LT|LE|GT|GE) (Select1 <types.TypeFlags> (XORCC x y)) yes no)
+
+// Only lower after bool is lowered. It should always lower. This helps ensure the folding below happens reliably.
+(CondSelect x y bool) && flagArg(bool) == nil => (ISEL [6] x y (Select1 <types.TypeFlags> (ANDCCconst [1] bool)))
+// Fold any CR -> GPR -> CR transfers when applying the above rule.
+(ISEL [6] x y (Select1 (ANDCCconst [1] (ISELB [c] one cmp)))) => (ISEL [c] x y cmp)
+
+// Lowering loads
+(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVDload ptr mem)
+(Load <t> ptr mem) && is32BitInt(t) && isSigned(t) => (MOVWload ptr mem)
+(Load <t> ptr mem) && is32BitInt(t) && !isSigned(t) => (MOVWZload ptr mem)
+(Load <t> ptr mem) && is16BitInt(t) && isSigned(t) => (MOVHload ptr mem)
+(Load <t> ptr mem) && is16BitInt(t) && !isSigned(t) => (MOVHZload ptr mem)
+(Load <t> ptr mem) && t.IsBoolean() => (MOVBZload ptr mem)
+(Load <t> ptr mem) && is8BitInt(t) && isSigned(t) => (MOVBreg (MOVBZload ptr mem)) // PPC has no signed-byte load.
+(Load <t> ptr mem) && is8BitInt(t) && !isSigned(t) => (MOVBZload ptr mem)
+
+(Load <t> ptr mem) && is32BitFloat(t) => (FMOVSload ptr mem)
+(Load <t> ptr mem) && is64BitFloat(t) => (FMOVDload ptr mem)
+
+(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (FMOVDstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 && is32BitFloat(val.Type) => (FMOVDstore ptr val mem) // glitch from (Cvt32Fto64F x) => x -- type is wrong
+(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (FMOVSstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 && !is64BitFloat(val.Type) => (MOVDstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && is32BitInt(val.Type) => (MOVWstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem)
+
+// Using Zero instead of LoweredZero allows the
+// target address to be folded where possible.
+(Zero [0] _ mem) => mem
+(Zero [1] destptr mem) => (MOVBstorezero destptr mem)
+(Zero [2] destptr mem) =>
+ (MOVHstorezero destptr mem)
+(Zero [3] destptr mem) =>
+ (MOVBstorezero [2] destptr
+ (MOVHstorezero destptr mem))
+(Zero [4] destptr mem) =>
+ (MOVWstorezero destptr mem)
+(Zero [5] destptr mem) =>
+ (MOVBstorezero [4] destptr
+ (MOVWstorezero destptr mem))
+(Zero [6] destptr mem) =>
+ (MOVHstorezero [4] destptr
+ (MOVWstorezero destptr mem))
+(Zero [7] destptr mem) =>
+ (MOVBstorezero [6] destptr
+ (MOVHstorezero [4] destptr
+ (MOVWstorezero destptr mem)))
+
+(Zero [8] {t} destptr mem) => (MOVDstorezero destptr mem)
+(Zero [12] {t} destptr mem) =>
+ (MOVWstorezero [8] destptr
+ (MOVDstorezero [0] destptr mem))
+(Zero [16] {t} destptr mem) =>
+ (MOVDstorezero [8] destptr
+ (MOVDstorezero [0] destptr mem))
+(Zero [24] {t} destptr mem) =>
+ (MOVDstorezero [16] destptr
+ (MOVDstorezero [8] destptr
+ (MOVDstorezero [0] destptr mem)))
+(Zero [32] {t} destptr mem) =>
+ (MOVDstorezero [24] destptr
+ (MOVDstorezero [16] destptr
+ (MOVDstorezero [8] destptr
+ (MOVDstorezero [0] destptr mem))))
+
+// Handle cases not handled above
+// Lowered Short cases do not generate loops, and as a result don't clobber
+// the address registers or flags.
+(Zero [s] ptr mem) && buildcfg.GOPPC64 <= 8 && s < 64 => (LoweredZeroShort [s] ptr mem)
+(Zero [s] ptr mem) && buildcfg.GOPPC64 <= 8 => (LoweredZero [s] ptr mem)
+(Zero [s] ptr mem) && s < 128 && buildcfg.GOPPC64 >= 9 => (LoweredQuadZeroShort [s] ptr mem)
+(Zero [s] ptr mem) && buildcfg.GOPPC64 >= 9 => (LoweredQuadZero [s] ptr mem)
+
+// moves
+(Move [0] _ _ mem) => mem
+(Move [1] dst src mem) => (MOVBstore dst (MOVBZload src mem) mem)
+(Move [2] dst src mem) =>
+ (MOVHstore dst (MOVHZload src mem) mem)
+(Move [4] dst src mem) =>
+ (MOVWstore dst (MOVWZload src mem) mem)
+// MOVD for load and store must have offsets that are multiple of 4
+(Move [8] {t} dst src mem) =>
+ (MOVDstore dst (MOVDload src mem) mem)
+(Move [3] dst src mem) =>
+ (MOVBstore [2] dst (MOVBZload [2] src mem)
+ (MOVHstore dst (MOVHload src mem) mem))
+(Move [5] dst src mem) =>
+ (MOVBstore [4] dst (MOVBZload [4] src mem)
+ (MOVWstore dst (MOVWZload src mem) mem))
+(Move [6] dst src mem) =>
+ (MOVHstore [4] dst (MOVHZload [4] src mem)
+ (MOVWstore dst (MOVWZload src mem) mem))
+(Move [7] dst src mem) =>
+ (MOVBstore [6] dst (MOVBZload [6] src mem)
+ (MOVHstore [4] dst (MOVHZload [4] src mem)
+ (MOVWstore dst (MOVWZload src mem) mem)))
+
+// Large move uses a loop. Since the address is computed and the
+// offset is zero, any alignment can be used.
+(Move [s] dst src mem) && s > 8 && buildcfg.GOPPC64 <= 8 && logLargeCopy(v, s) =>
+ (LoweredMove [s] dst src mem)
+(Move [s] dst src mem) && s > 8 && s <= 64 && buildcfg.GOPPC64 >= 9 =>
+ (LoweredQuadMoveShort [s] dst src mem)
+(Move [s] dst src mem) && s > 8 && buildcfg.GOPPC64 >= 9 && logLargeCopy(v, s) =>
+ (LoweredQuadMove [s] dst src mem)
+
+// Lowering calls
+(StaticCall ...) => (CALLstatic ...)
+(ClosureCall ...) => (CALLclosure ...)
+(InterCall ...) => (CALLinter ...)
+(TailCall ...) => (CALLtail ...)
+
+// Miscellaneous
+(GetClosurePtr ...) => (LoweredGetClosurePtr ...)
+(GetCallerSP ...) => (LoweredGetCallerSP ...)
+(GetCallerPC ...) => (LoweredGetCallerPC ...)
+(IsNonNil ptr) => (NotEqual (CMPconst [0] ptr))
+(IsInBounds idx len) => (LessThan (CMPU idx len))
+(IsSliceInBounds idx len) => (LessEqual (CMPU idx len))
+(NilCheck ...) => (LoweredNilCheck ...)
+
+// Write barrier.
+(WB ...) => (LoweredWB ...)
+
+// Publication barrier as intrinsic
+(PubBarrier ...) => (LoweredPubBarrier ...)
+
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem)
+
+// Optimizations
+// Note that PPC "logical" immediates come in 0:15 and 16:31 unsigned immediate forms,
+// so ORconst, XORconst easily expand into a pair.
+
+// Include very-large constants in the const-const case.
+(AND (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c&d])
+(OR (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c|d])
+(XOR (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c^d])
+(ORN (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c|^d])
+(ANDN (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c&^d])
+(NOR (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [^(c|d)])
+
+// Discover consts
+(AND x (MOVDconst [c])) && isU16Bit(c) => (Select0 (ANDCCconst [c] x))
+(XOR x (MOVDconst [c])) && isU32Bit(c) => (XORconst [c] x)
+(OR x (MOVDconst [c])) && isU32Bit(c) => (ORconst [c] x)
+
+// Simplify consts
+(Select0 (ANDCCconst [c] (Select0 (ANDCCconst [d] x)))) => (Select0 (ANDCCconst [c&d] x))
+(ORconst [c] (ORconst [d] x)) => (ORconst [c|d] x)
+(XORconst [c] (XORconst [d] x)) => (XORconst [c^d] x)
+(Select0 (ANDCCconst [-1] x)) => x
+(Select0 (ANDCCconst [0] _)) => (MOVDconst [0])
+(XORconst [0] x) => x
+(ORconst [-1] _) => (MOVDconst [-1])
+(ORconst [0] x) => x
+
+// zero-extend of small and => small and
+(MOVBZreg y:(Select0 (ANDCCconst [c] _))) && uint64(c) <= 0xFF => y
+(MOVHZreg y:(Select0 (ANDCCconst [c] _))) && uint64(c) <= 0xFFFF => y
+(MOVWZreg y:(Select0 (ANDCCconst [c] _))) && uint64(c) <= 0xFFFFFFFF => y
+(MOVWZreg y:(AND (MOVDconst [c]) _)) && uint64(c) <= 0xFFFFFFFF => y
+
+// sign extend of small-positive and => small-positive-and
+(MOVBreg y:(Select0 (ANDCCconst [c] _))) && uint64(c) <= 0x7F => y
+(MOVHreg y:(Select0 (ANDCCconst [c] _))) && uint64(c) <= 0x7FFF => y
+(MOVWreg y:(Select0 (ANDCCconst [c] _))) && uint64(c) <= 0xFFFF => y // 0xFFFF is largest immediate constant, when regarded as 32-bit is > 0
+(MOVWreg y:(AND (MOVDconst [c]) _)) && uint64(c) <= 0x7FFFFFFF => y
+
+// small and of zero-extend => either zero-extend or small and
+(Select0 (ANDCCconst [c] y:(MOVBZreg _))) && c&0xFF == 0xFF => y
+(Select0 (ANDCCconst [0xFF] (MOVBreg x))) => (MOVBZreg x)
+(Select0 (ANDCCconst [c] y:(MOVHZreg _))) && c&0xFFFF == 0xFFFF => y
+(Select0 (ANDCCconst [0xFFFF] (MOVHreg x))) => (MOVHZreg x)
+
+(AND (MOVDconst [c]) y:(MOVWZreg _)) && c&0xFFFFFFFF == 0xFFFFFFFF => y
+(AND (MOVDconst [0xFFFFFFFF]) y:(MOVWreg x)) => (MOVWZreg x)
+// normal case
+(Select0 (ANDCCconst [c] (MOVBZreg x))) => (Select0 (ANDCCconst [c&0xFF] x))
+(Select0 (ANDCCconst [c] (MOVHZreg x))) => (Select0 (ANDCCconst [c&0xFFFF] x))
+(Select0 (ANDCCconst [c] (MOVWZreg x))) => (Select0 (ANDCCconst [c&0xFFFFFFFF] x))
+
+// Eliminate unnecessary sign/zero extend following right shift
+(MOV(B|H|W)Zreg (SRWconst [c] (MOVBZreg x))) => (SRWconst [c] (MOVBZreg x))
+(MOV(H|W)Zreg (SRWconst [c] (MOVHZreg x))) => (SRWconst [c] (MOVHZreg x))
+(MOVWZreg (SRWconst [c] (MOVWZreg x))) => (SRWconst [c] (MOVWZreg x))
+(MOV(B|H|W)reg (SRAWconst [c] (MOVBreg x))) => (SRAWconst [c] (MOVBreg x))
+(MOV(H|W)reg (SRAWconst [c] (MOVHreg x))) => (SRAWconst [c] (MOVHreg x))
+(MOVWreg (SRAWconst [c] (MOVWreg x))) => (SRAWconst [c] (MOVWreg x))
+
+(MOV(WZ|W)reg (S(R|RA)Wconst [c] x)) && sizeof(x.Type) <= 32 => (S(R|RA)Wconst [c] x)
+(MOV(HZ|H)reg (S(R|RA)Wconst [c] x)) && sizeof(x.Type) <= 16 => (S(R|RA)Wconst [c] x)
+(MOV(BZ|B)reg (S(R|RA)Wconst [c] x)) && sizeof(x.Type) == 8 => (S(R|RA)Wconst [c] x)
+
+// initial right shift will handle sign/zero extend
+(MOVBZreg (SRDconst [c] x)) && c>=56 => (SRDconst [c] x)
+(MOVBreg (SRDconst [c] x)) && c>56 => (SRDconst [c] x)
+(MOVBreg (SRDconst [c] x)) && c==56 => (SRADconst [c] x)
+(MOVBreg (SRADconst [c] x)) && c>=56 => (SRADconst [c] x)
+(MOVBZreg (SRWconst [c] x)) && c>=24 => (SRWconst [c] x)
+(MOVBreg (SRWconst [c] x)) && c>24 => (SRWconst [c] x)
+(MOVBreg (SRWconst [c] x)) && c==24 => (SRAWconst [c] x)
+(MOVBreg (SRAWconst [c] x)) && c>=24 => (SRAWconst [c] x)
+
+(MOVHZreg (SRDconst [c] x)) && c>=48 => (SRDconst [c] x)
+(MOVHreg (SRDconst [c] x)) && c>48 => (SRDconst [c] x)
+(MOVHreg (SRDconst [c] x)) && c==48 => (SRADconst [c] x)
+(MOVHreg (SRADconst [c] x)) && c>=48 => (SRADconst [c] x)
+(MOVHZreg (SRWconst [c] x)) && c>=16 => (SRWconst [c] x)
+(MOVHreg (SRWconst [c] x)) && c>16 => (SRWconst [c] x)
+(MOVHreg (SRAWconst [c] x)) && c>=16 => (SRAWconst [c] x)
+(MOVHreg (SRWconst [c] x)) && c==16 => (SRAWconst [c] x)
+
+(MOVWZreg (SRDconst [c] x)) && c>=32 => (SRDconst [c] x)
+(MOVWreg (SRDconst [c] x)) && c>32 => (SRDconst [c] x)
+(MOVWreg (SRADconst [c] x)) && c>=32 => (SRADconst [c] x)
+(MOVWreg (SRDconst [c] x)) && c==32 => (SRADconst [c] x)
+
+// Various redundant zero/sign extension combinations.
+(MOVBZreg y:(MOVBZreg _)) => y // repeat
+(MOVBreg y:(MOVBreg _)) => y // repeat
+(MOVBreg (MOVBZreg x)) => (MOVBreg x)
+(MOVBZreg (MOVBreg x)) => (MOVBZreg x)
+
+// H - there are more combinations than these
+
+(MOVHZreg y:(MOV(H|B)Zreg _)) => y // repeat
+(MOVHZreg y:(MOVHBRload _ _)) => y
+
+(MOVHreg y:(MOV(H|B)reg _)) => y // repeat
+
+(MOV(H|HZ)reg y:(MOV(HZ|H)reg x)) => (MOV(H|HZ)reg x)
+
+// W - there are more combinations than these
+
+(MOV(WZ|WZ|WZ|W|W|W)reg y:(MOV(WZ|HZ|BZ|W|H|B)reg _)) => y // repeat
+(MOVWZreg y:(MOV(H|W)BRload _ _)) => y
+
+(MOV(W|WZ)reg y:(MOV(WZ|W)reg x)) => (MOV(W|WZ)reg x)
+
+// Truncate then logical then truncate: omit first, lesser or equal truncate
+(MOVWZreg ((OR|XOR|AND) <t> x (MOVWZreg y))) => (MOVWZreg ((OR|XOR|AND) <t> x y))
+(MOVHZreg ((OR|XOR|AND) <t> x (MOVWZreg y))) => (MOVHZreg ((OR|XOR|AND) <t> x y))
+(MOVHZreg ((OR|XOR|AND) <t> x (MOVHZreg y))) => (MOVHZreg ((OR|XOR|AND) <t> x y))
+(MOVBZreg ((OR|XOR|AND) <t> x (MOVWZreg y))) => (MOVBZreg ((OR|XOR|AND) <t> x y))
+(MOVBZreg ((OR|XOR|AND) <t> x (MOVHZreg y))) => (MOVBZreg ((OR|XOR|AND) <t> x y))
+(MOVBZreg ((OR|XOR|AND) <t> x (MOVBZreg y))) => (MOVBZreg ((OR|XOR|AND) <t> x y))
+
+(MOV(B|H|W)Zreg z:(Select0 (ANDCCconst [c] (MOVBZload ptr x)))) => z
+(MOV(B|H|W)Zreg z:(AND y (MOV(B|H|W)Zload ptr x))) => z
+(MOV(H|W)Zreg z:(Select0 (ANDCCconst [c] (MOVHZload ptr x)))) => z
+(MOVWZreg z:(Select0 (ANDCCconst [c] (MOVWZload ptr x)))) => z
+
+// Arithmetic constant ops
+
+(ADD x (MOVDconst [c])) && is32Bit(c) => (ADDconst [c] x)
+(ADDconst [c] (ADDconst [d] x)) && is32Bit(c+d) => (ADDconst [c+d] x)
+(ADDconst [0] x) => x
+(SUB x (MOVDconst [c])) && is32Bit(-c) => (ADDconst [-c] x)
+
+(ADDconst [c] (MOVDaddr [d] {sym} x)) && is32Bit(c+int64(d)) => (MOVDaddr [int32(c+int64(d))] {sym} x)
+(ADDconst [c] x:(SP)) && is32Bit(c) => (MOVDaddr [int32(c)] x) // so it is rematerializeable
+
+(MULL(W|D) x (MOVDconst [c])) && is16Bit(c) => (MULL(W|D)const [int32(c)] x)
+
+// Subtract from (with carry, but ignored) constant.
+// Note, these clobber the carry bit.
+(SUB (MOVDconst [c]) x) && is32Bit(c) => (SUBFCconst [c] x)
+(SUBFCconst [c] (NEG x)) => (ADDconst [c] x)
+(SUBFCconst [c] (SUBFCconst [d] x)) && is32Bit(c-d) => (ADDconst [c-d] x)
+(SUBFCconst [0] x) => (NEG x)
+(ADDconst [c] (SUBFCconst [d] x)) && is32Bit(c+d) => (SUBFCconst [c+d] x)
+(NEG (ADDconst [c] x)) && is32Bit(-c) => (SUBFCconst [-c] x)
+(NEG (SUBFCconst [c] x)) && is32Bit(-c) => (ADDconst [-c] x)
+(NEG (SUB x y)) => (SUB y x)
+(NEG (NEG x)) => x
+
+// Use register moves instead of stores and loads to move int<=>float values
+// Common with math Float64bits, Float64frombits
+(MOVDload [off] {sym} ptr (FMOVDstore [off] {sym} ptr x _)) => (MFVSRD x)
+(FMOVDload [off] {sym} ptr (MOVDstore [off] {sym} ptr x _)) => (MTVSRD x)
+
+(FMOVDstore [off] {sym} ptr (MTVSRD x) mem) => (MOVDstore [off] {sym} ptr x mem)
+(MOVDstore [off] {sym} ptr (MFVSRD x) mem) => (FMOVDstore [off] {sym} ptr x mem)
+
+(MTVSRD (MOVDconst [c])) && !math.IsNaN(math.Float64frombits(uint64(c))) => (FMOVDconst [math.Float64frombits(uint64(c))])
+(MFVSRD (FMOVDconst [c])) => (MOVDconst [int64(math.Float64bits(c))])
+
+(MTVSRD x:(MOVDload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (FMOVDload [off] {sym} ptr mem)
+(MFVSRD x:(FMOVDload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVDload [off] {sym} ptr mem)
+
+// Fold offsets for stores.
+(MOV(D|W|H|B)store [off1] {sym} (ADDconst [off2] x) val mem) && is16Bit(int64(off1)+off2) => (MOV(D|W|H|B)store [off1+int32(off2)] {sym} x val mem)
+
+(FMOV(S|D)store [off1] {sym} (ADDconst [off2] ptr) val mem) && is16Bit(int64(off1)+off2) => (FMOV(S|D)store [off1+int32(off2)] {sym} ptr val mem)
+
+// Fold address into load/store.
+// The assembler needs to generate several instructions and use
+// temp register for accessing global, and each time it will reload
+// the temp register. So don't fold address of global, unless there
+// is only one use.
+(MOV(B|H|W|D)store [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2)
+ && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) =>
+ (MOV(B|H|W|D)store [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+
+(FMOV(S|D)store [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2)
+ && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) =>
+ (FMOV(S|D)store [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+
+(MOV(B|H|W)Zload [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2)
+ && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) =>
+ (MOV(B|H|W)Zload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOV(H|W|D)load [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2)
+ && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) =>
+ (MOV(H|W|D)load [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(FMOV(S|D)load [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2)
+ && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) =>
+ (FMOV(S|D)load [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+
+// Fold offsets for loads.
+(FMOV(S|D)load [off1] {sym} (ADDconst [off2] ptr) mem) && is16Bit(int64(off1)+off2) => (FMOV(S|D)load [off1+int32(off2)] {sym} ptr mem)
+
+(MOV(D|W|WZ|H|HZ|BZ)load [off1] {sym} (ADDconst [off2] x) mem) && is16Bit(int64(off1)+off2) => (MOV(D|W|WZ|H|HZ|BZ)load [off1+int32(off2)] {sym} x mem)
+
+// Determine load + addressing that can be done as a register indexed load
+(MOV(D|W|WZ|H|HZ|BZ)load [0] {sym} p:(ADD ptr idx) mem) && sym == nil && p.Uses == 1 => (MOV(D|W|WZ|H|HZ|BZ)loadidx ptr idx mem)
+
+// Determine if there is benefit to using a non-indexed load, since that saves the load
+// of the index register. With MOVDload and MOVWload, there is no benefit if the offset
+// value is not a multiple of 4, since that results in an extra instruction in the base
+// register address computation.
+(MOV(D|W)loadidx ptr (MOVDconst [c]) mem) && is16Bit(c) && c%4 == 0 => (MOV(D|W)load [int32(c)] ptr mem)
+(MOV(WZ|H|HZ|BZ)loadidx ptr (MOVDconst [c]) mem) && is16Bit(c) => (MOV(WZ|H|HZ|BZ)load [int32(c)] ptr mem)
+(MOV(D|W)loadidx (MOVDconst [c]) ptr mem) && is16Bit(c) && c%4 == 0 => (MOV(D|W)load [int32(c)] ptr mem)
+(MOV(WZ|H|HZ|BZ)loadidx (MOVDconst [c]) ptr mem) && is16Bit(c) => (MOV(WZ|H|HZ|BZ)load [int32(c)] ptr mem)
+
+// Store of zero => storezero
+(MOV(D|W|H|B)store [off] {sym} ptr (MOVDconst [0]) mem) => (MOV(D|W|H|B)storezero [off] {sym} ptr mem)
+
+// Fold offsets for storezero
+(MOV(D|W|H|B)storezero [off1] {sym} (ADDconst [off2] x) mem) && is16Bit(int64(off1)+off2) =>
+ (MOV(D|W|H|B)storezero [off1+int32(off2)] {sym} x mem)
+
+// Stores with addressing that can be done as indexed stores
+(MOV(D|W|H|B)store [0] {sym} p:(ADD ptr idx) val mem) && sym == nil && p.Uses == 1 => (MOV(D|W|H|B)storeidx ptr idx val mem)
+
+// Stores with constant index values can be done without indexed instructions
+// No need to lower the idx cases if c%4 is not 0
+(MOVDstoreidx ptr (MOVDconst [c]) val mem) && is16Bit(c) && c%4 == 0 => (MOVDstore [int32(c)] ptr val mem)
+(MOV(W|H|B)storeidx ptr (MOVDconst [c]) val mem) && is16Bit(c) => (MOV(W|H|B)store [int32(c)] ptr val mem)
+(MOVDstoreidx (MOVDconst [c]) ptr val mem) && is16Bit(c) && c%4 == 0 => (MOVDstore [int32(c)] ptr val mem)
+(MOV(W|H|B)storeidx (MOVDconst [c]) ptr val mem) && is16Bit(c) => (MOV(W|H|B)store [int32(c)] ptr val mem)
+
+// Fold symbols into storezero
+(MOV(D|W|H|B)storezero [off1] {sym1} p:(MOVDaddr [off2] {sym2} x) mem) && canMergeSym(sym1,sym2)
+ && (x.Op != OpSB || p.Uses == 1) =>
+ (MOV(D|W|H|B)storezero [off1+off2] {mergeSym(sym1,sym2)} x mem)
+
+// atomic intrinsics: loads, stores and CAS carry an AuxInt of 1 for the plain ops and 0 for the Acq/Rel variants (presumably selecting the stronger vs. weaker barrier sequence - confirm in the PPC64 code generator)
+(AtomicLoad(8|32|64|Ptr) ptr mem) => (LoweredAtomicLoad(8|32|64|Ptr) [1] ptr mem)
+(AtomicLoadAcq(32|64) ptr mem) => (LoweredAtomicLoad(32|64) [0] ptr mem)
+
+(AtomicStore(8|32|64) ptr val mem) => (LoweredAtomicStore(8|32|64) [1] ptr val mem)
+(AtomicStoreRel(32|64) ptr val mem) => (LoweredAtomicStore(32|64) [0] ptr val mem)
+
+(AtomicExchange(32|64) ...) => (LoweredAtomicExchange(32|64) ...)
+
+(AtomicAdd(32|64) ...) => (LoweredAtomicAdd(32|64) ...)
+
+(AtomicCompareAndSwap(32|64) ptr old new_ mem) => (LoweredAtomicCas(32|64) [1] ptr old new_ mem)
+(AtomicCompareAndSwapRel32 ptr old new_ mem) => (LoweredAtomicCas32 [0] ptr old new_ mem)
+
+(AtomicAnd(8|32) ...) => (LoweredAtomicAnd(8|32) ...)
+(AtomicOr(8|32) ...) => (LoweredAtomicOr(8|32) ...)
+
+(Slicemask <t> x) => (SRADconst (NEG <t> x) [63])
+
+// Note that MOV??reg returns a 64-bit int, x is not necessarily that wide
+// This may interact with other patterns in the future. (Compare with arm64)
+(MOV(B|H|W)Zreg x:(MOVBZload _ _)) => x
+(MOV(B|H|W)Zreg x:(MOVBZloadidx _ _ _)) => x
+(MOV(H|W)Zreg x:(MOVHZload _ _)) => x
+(MOV(H|W)Zreg x:(MOVHZloadidx _ _ _)) => x
+(MOV(H|W)reg x:(MOVHload _ _)) => x
+(MOV(H|W)reg x:(MOVHloadidx _ _ _)) => x
+(MOV(WZ|W)reg x:(MOV(WZ|W)load _ _)) => x
+(MOV(WZ|W)reg x:(MOV(WZ|W)loadidx _ _ _)) => x
+(MOV(B|W)Zreg x:(Select0 (LoweredAtomicLoad(8|32) _ _))) => x
+
+// don't extend if argument is already extended
+(MOVBreg x:(Arg <t>)) && is8BitInt(t) && isSigned(t) => x
+(MOVBZreg x:(Arg <t>)) && is8BitInt(t) && !isSigned(t) => x
+(MOVHreg x:(Arg <t>)) && (is8BitInt(t) || is16BitInt(t)) && isSigned(t) => x
+(MOVHZreg x:(Arg <t>)) && (is8BitInt(t) || is16BitInt(t)) && !isSigned(t) => x
+(MOVWreg x:(Arg <t>)) && (is8BitInt(t) || is16BitInt(t) || is32BitInt(t)) && isSigned(t) => x
+(MOVWZreg x:(Arg <t>)) && (is8BitInt(t) || is16BitInt(t) || is32BitInt(t)) && !isSigned(t) => x
+
+(MOVBZreg (MOVDconst [c])) => (MOVDconst [int64(uint8(c))])
+(MOVBreg (MOVDconst [c])) => (MOVDconst [int64(int8(c))])
+(MOVHZreg (MOVDconst [c])) => (MOVDconst [int64(uint16(c))])
+(MOVHreg (MOVDconst [c])) => (MOVDconst [int64(int16(c))])
+(MOVWreg (MOVDconst [c])) => (MOVDconst [int64(int32(c))])
+(MOVWZreg (MOVDconst [c])) => (MOVDconst [int64(uint32(c))])
+
+// Implement clrlsldi and clrlslwi extended mnemonics as described in
+// ISA 3.0 section C.8. AuxInt field contains values needed for
+// the instructions, packed together since there is only one available.
+(SLDconst [c] z:(MOVBZreg x)) && c < 8 && z.Uses == 1 => (CLRLSLDI [newPPC64ShiftAuxInt(c,56,63,64)] x)
+(SLDconst [c] z:(MOVHZreg x)) && c < 16 && z.Uses == 1 => (CLRLSLDI [newPPC64ShiftAuxInt(c,48,63,64)] x)
+(SLDconst [c] z:(MOVWZreg x)) && c < 32 && z.Uses == 1 => (CLRLSLDI [newPPC64ShiftAuxInt(c,32,63,64)] x)
+
+(SLDconst [c] z:(Select0 (ANDCCconst [d] x))) && z.Uses == 1 && isPPC64ValidShiftMask(d) && c <= (64-getPPC64ShiftMaskLength(d)) => (CLRLSLDI [newPPC64ShiftAuxInt(c,64-getPPC64ShiftMaskLength(d),63,64)] x)
+(SLDconst [c] z:(AND (MOVDconst [d]) x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) && c<=(64-getPPC64ShiftMaskLength(d)) => (CLRLSLDI [newPPC64ShiftAuxInt(c,64-getPPC64ShiftMaskLength(d),63,64)] x)
+(SLWconst [c] z:(MOVBZreg x)) && z.Uses == 1 && c < 8 => (CLRLSLWI [newPPC64ShiftAuxInt(c,24,31,32)] x)
+(SLWconst [c] z:(MOVHZreg x)) && z.Uses == 1 && c < 16 => (CLRLSLWI [newPPC64ShiftAuxInt(c,16,31,32)] x)
+(SLWconst [c] z:(Select0 (ANDCCconst [d] x))) && z.Uses == 1 && isPPC64ValidShiftMask(d) && c<=(32-getPPC64ShiftMaskLength(d)) => (CLRLSLWI [newPPC64ShiftAuxInt(c,32-getPPC64ShiftMaskLength(d),31,32)] x)
+(SLWconst [c] z:(AND (MOVDconst [d]) x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) && c<=(32-getPPC64ShiftMaskLength(d)) => (CLRLSLWI [newPPC64ShiftAuxInt(c,32-getPPC64ShiftMaskLength(d),31,32)] x)
+// special case for power9
+(SL(W|D)const [c] z:(MOVWreg x)) && c < 32 && buildcfg.GOPPC64 >= 9 => (EXTSWSLconst [c] x)
+
+// Lose widening ops fed to stores
+(MOVBstore [off] {sym} ptr (MOV(B|BZ|H|HZ|W|WZ)reg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOV(H|HZ|W|WZ)reg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOV(W|WZ)reg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (SRWconst (MOV(H|HZ)reg x) [c]) mem) && c <= 8 => (MOVBstore [off] {sym} ptr (SRWconst <typ.UInt32> x [c]) mem)
+(MOVBstore [off] {sym} ptr (SRWconst (MOV(W|WZ)reg x) [c]) mem) && c <= 24 => (MOVBstore [off] {sym} ptr (SRWconst <typ.UInt32> x [c]) mem)
+(MOVBstoreidx ptr idx (MOV(B|BZ|H|HZ|W|WZ)reg x) mem) => (MOVBstoreidx ptr idx x mem)
+(MOVHstoreidx ptr idx (MOV(H|HZ|W|WZ)reg x) mem) => (MOVHstoreidx ptr idx x mem)
+(MOVWstoreidx ptr idx (MOV(W|WZ)reg x) mem) => (MOVWstoreidx ptr idx x mem)
+(MOVBstoreidx ptr idx (SRWconst (MOV(H|HZ)reg x) [c]) mem) && c <= 8 => (MOVBstoreidx ptr idx (SRWconst <typ.UInt32> x [c]) mem)
+(MOVBstoreidx ptr idx (SRWconst (MOV(W|WZ)reg x) [c]) mem) && c <= 24 => (MOVBstoreidx ptr idx (SRWconst <typ.UInt32> x [c]) mem)
+(MOVHBRstore {sym} ptr (MOV(H|HZ|W|WZ)reg x) mem) => (MOVHBRstore {sym} ptr x mem)
+(MOVWBRstore {sym} ptr (MOV(W|WZ)reg x) mem) => (MOVWBRstore {sym} ptr x mem)
+
+// Lose W-widening ops fed to compare-W
+(CMP(W|WU) x (MOV(W|WZ)reg y)) => (CMP(W|WU) x y)
+(CMP(W|WU) (MOV(W|WZ)reg x) y) => (CMP(W|WU) x y)
+
+(CMP x (MOVDconst [c])) && is16Bit(c) => (CMPconst x [c])
+(CMP (MOVDconst [c]) y) && is16Bit(c) => (InvertFlags (CMPconst y [c]))
+(CMPW x (MOVDconst [c])) && is16Bit(c) => (CMPWconst x [int32(c)])
+(CMPW (MOVDconst [c]) y) && is16Bit(c) => (InvertFlags (CMPWconst y [int32(c)]))
+
+(CMPU x (MOVDconst [c])) && isU16Bit(c) => (CMPUconst x [c])
+(CMPU (MOVDconst [c]) y) && isU16Bit(c) => (InvertFlags (CMPUconst y [c]))
+(CMPWU x (MOVDconst [c])) && isU16Bit(c) => (CMPWUconst x [int32(c)])
+(CMPWU (MOVDconst [c]) y) && isU16Bit(c) => (InvertFlags (CMPWUconst y [int32(c)]))
+
+// Canonicalize the order of arguments to comparisons - helps with CSE.
+((CMP|CMPW|CMPU|CMPWU) x y) && canonLessThan(x,y) => (InvertFlags ((CMP|CMPW|CMPU|CMPWU) y x))
+
+// ISEL auxInt values 0=LT 1=GT 2=EQ arg2 ? arg0 : arg1
+// ISEL auxInt values 4=GE 5=LE 6=NE !arg2 ? arg1 : arg0
+// ISELB special case where arg0, arg1 values are 0, 1
+
+(Equal cmp) => (ISELB [2] (MOVDconst [1]) cmp)
+(NotEqual cmp) => (ISELB [6] (MOVDconst [1]) cmp)
+(LessThan cmp) => (ISELB [0] (MOVDconst [1]) cmp)
+(FLessThan cmp) => (ISELB [0] (MOVDconst [1]) cmp)
+(FLessEqual cmp) => (ISEL [2] (MOVDconst [1]) (ISELB [0] (MOVDconst [1]) cmp) cmp)
+(GreaterEqual cmp) => (ISELB [4] (MOVDconst [1]) cmp)
+(GreaterThan cmp) => (ISELB [1] (MOVDconst [1]) cmp)
+(FGreaterThan cmp) => (ISELB [1] (MOVDconst [1]) cmp)
+(FGreaterEqual cmp) => (ISEL [2] (MOVDconst [1]) (ISELB [1] (MOVDconst [1]) cmp) cmp)
+(LessEqual cmp) => (ISELB [5] (MOVDconst [1]) cmp)
+
+(ISELB [0] _ (FlagLT)) => (MOVDconst [1])
+(ISELB [0] _ (Flag(GT|EQ))) => (MOVDconst [0])
+(ISELB [1] _ (FlagGT)) => (MOVDconst [1])
+(ISELB [1] _ (Flag(LT|EQ))) => (MOVDconst [0])
+(ISELB [2] _ (FlagEQ)) => (MOVDconst [1])
+(ISELB [2] _ (Flag(LT|GT))) => (MOVDconst [0])
+(ISELB [4] _ (FlagLT)) => (MOVDconst [0])
+(ISELB [4] _ (Flag(GT|EQ))) => (MOVDconst [1])
+(ISELB [5] _ (FlagGT)) => (MOVDconst [0])
+(ISELB [5] _ (Flag(LT|EQ))) => (MOVDconst [1])
+(ISELB [6] _ (FlagEQ)) => (MOVDconst [0])
+(ISELB [6] _ (Flag(LT|GT))) => (MOVDconst [1])
+
+(ISEL [2] x _ (FlagEQ)) => x
+(ISEL [2] _ y (Flag(LT|GT))) => y
+
+(ISEL [6] _ y (FlagEQ)) => y
+(ISEL [6] x _ (Flag(LT|GT))) => x
+
+(ISEL [0] _ y (Flag(EQ|GT))) => y
+(ISEL [0] x _ (FlagLT)) => x
+
+(ISEL [5] x _ (Flag(EQ|LT))) => x // 5=LE: condition holds, select arg0 (same operand convention as [4] and [6])
+(ISEL [5] _ y (FlagGT)) => y
+
+(ISEL [1] _ y (Flag(EQ|LT))) => y
+(ISEL [1] x _ (FlagGT)) => x
+
+(ISEL [4] x _ (Flag(EQ|GT))) => x
+(ISEL [4] _ y (FlagLT)) => y
+
+(ISEL [2] x y ((CMP|CMPW)const [0] (Select0 (ANDCCconst [n] z)))) => (ISEL [2] x y (Select1 <types.TypeFlags> (ANDCCconst [n] z )))
+(ISEL [6] x y ((CMP|CMPW)const [0] (Select0 (ANDCCconst [n] z)))) => (ISEL [6] x y (Select1 <types.TypeFlags> (ANDCCconst [n] z )))
+(ISELB [2] x ((CMP|CMPW)const [0] (Select0 (ANDCCconst [1] z)))) => (XORconst [1] (Select0 <typ.UInt64> (ANDCCconst [1] z )))
+(ISELB [6] x ((CMP|CMPW)const [0] (Select0 (ANDCCconst [1] z)))) => (Select0 <typ.UInt64> (ANDCCconst [1] z ))
+
+(ISELB [2] x (CMPWconst [0] (Select0 (ANDCCconst [n] z)))) => (ISELB [2] x (Select1 <types.TypeFlags> (ANDCCconst [n] z )))
+(ISELB [6] x (CMPWconst [0] (Select0 (ANDCCconst [n] z)))) => (ISELB [6] x (Select1 <types.TypeFlags> (ANDCCconst [n] z )))
+
+// Only CMPconst for these in case AND|OR|XOR result is > 32 bits
+(ISELB [2] x (CMPconst [0] a:(AND y z))) && a.Uses == 1 => (ISELB [2] x (Select1 <types.TypeFlags> (ANDCC y z )))
+(ISELB [6] x (CMPconst [0] a:(AND y z))) && a.Uses == 1 => (ISELB [6] x (Select1 <types.TypeFlags> (ANDCC y z )))
+
+(ISELB [2] x (CMPconst [0] o:(OR y z))) && o.Uses == 1 => (ISELB [2] x (Select1 <types.TypeFlags> (ORCC y z )))
+(ISELB [6] x (CMPconst [0] o:(OR y z))) && o.Uses == 1 => (ISELB [6] x (Select1 <types.TypeFlags> (ORCC y z )))
+
+(ISELB [2] x (CMPconst [0] a:(XOR y z))) && a.Uses == 1 => (ISELB [2] x (Select1 <types.TypeFlags> (XORCC y z )))
+(ISELB [6] x (CMPconst [0] a:(XOR y z))) && a.Uses == 1 => (ISELB [6] x (Select1 <types.TypeFlags> (XORCC y z )))
+
+(ISELB [n] (MOVDconst [1]) (InvertFlags bool)) && n%4 == 0 => (ISELB [n+1] (MOVDconst [1]) bool)
+(ISELB [n] (MOVDconst [1]) (InvertFlags bool)) && n%4 == 1 => (ISELB [n-1] (MOVDconst [1]) bool)
+(ISELB [n] (MOVDconst [1]) (InvertFlags bool)) && n%4 == 2 => (ISELB [n] (MOVDconst [1]) bool)
+(ISEL [n] x y (InvertFlags bool)) && n%4 == 0 => (ISEL [n+1] x y bool)
+(ISEL [n] x y (InvertFlags bool)) && n%4 == 1 => (ISEL [n-1] x y bool)
+(ISEL [n] x y (InvertFlags bool)) && n%4 == 2 => (ISEL [n] x y bool)
+(XORconst [1] (ISELB [6] (MOVDconst [1]) cmp)) => (ISELB [2] (MOVDconst [1]) cmp)
+(XORconst [1] (ISELB [5] (MOVDconst [1]) cmp)) => (ISELB [1] (MOVDconst [1]) cmp)
+(XORconst [1] (ISELB [4] (MOVDconst [1]) cmp)) => (ISELB [0] (MOVDconst [1]) cmp)
+
+// A particular pattern seen in cgo code:
+(AND (MOVDconst [c]) x:(MOVBZload _ _)) => (Select0 (ANDCCconst [c&0xFF] x))
+
+// floating point negative abs
+(FNEG (F(ABS|NABS) x)) => (F(NABS|ABS) x)
+
+// floating-point fused multiply-add/sub
+(F(ADD|SUB) (FMUL x y) z) && x.Block.Func.useFMA(v) => (FM(ADD|SUB) x y z)
+(F(ADDS|SUBS) (FMULS x y) z) && x.Block.Func.useFMA(v) => (FM(ADDS|SUBS) x y z)
+
+// The following statements are found in encoding/binary functions UintXX (load) and PutUintXX (store)
+// and convert the statements in these functions from multiple single byte loads or stores to
+// the single largest possible load or store.
+// Some are marked big or little endian based on the order in which the bytes are loaded or stored,
+// not on the ordering of the machine. These are intended for little endian machines.
+// To implement for big endian machines, most rules would have to be duplicated but the
+// resulting rule would be reversed, i. e., MOVHZload on little endian would be MOVHBRload on big endian
+// and vice versa.
+// b[0] | b[1]<<8 => load 16-bit Little endian
+(OR <t> x0:(MOVBZload [i0] {s} p mem)
+ o1:(SL(W|D)const x1:(MOVBZload [i1] {s} p mem) [8]))
+ && !config.BigEndian
+ && i1 == i0+1
+ && x0.Uses ==1 && x1.Uses == 1
+ && o1.Uses == 1
+ && mergePoint(b, x0, x1) != nil
+ && clobber(x0, x1, o1)
+ => @mergePoint(b,x0,x1) (MOVHZload <t> {s} [i0] p mem)
+
+// b[0]<<8 | b[1] => load 16-bit Big endian on Little endian arch.
+// Use byte-reverse indexed load for 2 bytes.
+(OR <t> x0:(MOVBZload [i1] {s} p mem)
+ o1:(SL(W|D)const x1:(MOVBZload [i0] {s} p mem) [8]))
+ && !config.BigEndian
+ && i1 == i0+1
+ && x0.Uses ==1 && x1.Uses == 1
+ && o1.Uses == 1
+ && mergePoint(b, x0, x1) != nil
+ && clobber(x0, x1, o1)
+ => @mergePoint(b,x0,x1) (MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem)
+
+// b[0]<<n+8 | b[1]<<n => load 16-bit Big endian (where n%8== 0)
+// Use byte-reverse indexed load for 2 bytes,
+// then shift left to the correct position. Used to match subrules
+// from longer rules.
+(OR <t> s0:(SL(W|D)const x0:(MOVBZload [i1] {s} p mem) [n1])
+ s1:(SL(W|D)const x1:(MOVBZload [i0] {s} p mem) [n2]))
+ && !config.BigEndian
+ && i1 == i0+1
+ && n1%8 == 0
+ && n2 == n1+8
+ && x0.Uses == 1 && x1.Uses == 1
+ && s0.Uses == 1 && s1.Uses == 1
+ && mergePoint(b, x0, x1) != nil
+ && clobber(x0, x1, s0, s1)
+ => @mergePoint(b,x0,x1) (SLDconst <t> (MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [n1])
+
+// b[0] | b[1]<<8 | b[2]<<16 | b[3]<<24 => load 32-bit Little endian
+// Use a plain (not byte-reversed) 4 byte load, built on the 16-bit little endian load matched above.
+(OR <t> s1:(SL(W|D)const x2:(MOVBZload [i3] {s} p mem) [24])
+ o0:(OR <t> s0:(SL(W|D)const x1:(MOVBZload [i2] {s} p mem) [16])
+ x0:(MOVHZload [i0] {s} p mem)))
+ && !config.BigEndian
+ && i2 == i0+2
+ && i3 == i0+3
+ && x0.Uses ==1 && x1.Uses == 1 && x2.Uses == 1
+ && o0.Uses == 1
+ && s0.Uses == 1 && s1.Uses == 1
+ && mergePoint(b, x0, x1, x2) != nil
+ && clobber(x0, x1, x2, s0, s1, o0)
+ => @mergePoint(b,x0,x1,x2) (MOVWZload <t> {s} [i0] p mem)
+
+// b[0]<<24 | b[1]<<16 | b[2]<<8 | b[3] => load 32-bit Big endian order on Little endian arch
+// Use byte-reverse indexed load for 4 bytes with computed address.
+// Could be used to match subrules of a longer rule.
+(OR <t> s1:(SL(W|D)const x2:(MOVBZload [i0] {s} p mem) [24])
+ o0:(OR <t> s0:(SL(W|D)const x1:(MOVBZload [i1] {s} p mem) [16])
+ x0:(MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i2] {s} p) mem)))
+ && !config.BigEndian
+ && i1 == i0+1
+ && i2 == i0+2
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && o0.Uses == 1
+ && s0.Uses == 1 && s1.Uses == 1
+ && mergePoint(b, x0, x1, x2) != nil
+ && clobber(x0, x1, x2, s0, s1, o0)
+ => @mergePoint(b,x0,x1,x2) (MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem)
+
+// b[3] | b[2]<<8 | b[1]<<16 | b[0]<<24 => load 32-bit Big endian order on Little endian arch
+// Use byte-reverse indexed load for 4 bytes with computed address.
+// Could be used to match subrules of a longer rule.
+(OR <t> x0:(MOVBZload [i3] {s} p mem)
+ o0:(OR <t> s0:(SL(W|D)const x1:(MOVBZload [i2] {s} p mem) [8])
+ s1:(SL(W|D)const x2:(MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [16])))
+ && !config.BigEndian
+ && i2 == i0+2
+ && i3 == i0+3
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && o0.Uses == 1
+ && s0.Uses == 1 && s1.Uses == 1
+ && mergePoint(b, x0, x1, x2) != nil
+ && clobber(x0, x1, x2, s0, s1, o0)
+ => @mergePoint(b,x0,x1,x2) (MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem)
+
+// b[0]<<56 | b[1]<<48 | b[2]<<40 | b[3]<<32 => load 32-bit Big endian order on Little endian arch
+// Use byte-reverse indexed load for 4 bytes with computed address.
+// Used to match longer rules.
+(OR <t> s2:(SLDconst x2:(MOVBZload [i3] {s} p mem) [32])
+ o0:(OR <t> s1:(SLDconst x1:(MOVBZload [i2] {s} p mem) [40])
+ s0:(SLDconst x0:(MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [48])))
+ && !config.BigEndian
+ && i2 == i0+2
+ && i3 == i0+3
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && o0.Uses == 1
+ && s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1
+ && mergePoint(b, x0, x1, x2) != nil
+ && clobber(x0, x1, x2, s0, s1, s2, o0)
+ => @mergePoint(b,x0,x1,x2) (SLDconst <t> (MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [32])
+
+// b[3]<<32 | b[2]<<40 | b[1]<<48 | b[0]<<56 => load 32-bit Big endian order on Little endian arch
+// Use byte-reverse indexed load for 4 bytes with constant address.
+// Used to match longer rules.
+(OR <t> s2:(SLDconst x2:(MOVBZload [i0] {s} p mem) [56])
+ o0:(OR <t> s1:(SLDconst x1:(MOVBZload [i1] {s} p mem) [48])
+ s0:(SLDconst x0:(MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i2] {s} p) mem) [32])))
+ && !config.BigEndian
+ && i1 == i0+1
+ && i2 == i0+2
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && o0.Uses == 1
+ && s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1
+ && mergePoint(b, x0, x1, x2) != nil
+ && clobber(x0, x1, x2, s0, s1, s2, o0)
+ => @mergePoint(b,x0,x1,x2) (SLDconst <t> (MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [32])
+
+// b[0] | b[1]<<8 | b[2]<<16 | b[3]<<24 | b[4] <<32 | b[5]<<40 | b[6]<<48 | b[7]<<56 => load 64-bit Little endian
+// Rules with commutative ops and many operands will result in extremely large functions in rewritePPC64,
+// so matching shorter previously defined subrules is important.
+// Offset must be multiple of 4 for MOVD
+(OR <t> s6:(SLDconst x7:(MOVBZload [i7] {s} p mem) [56])
+ o5:(OR <t> s5:(SLDconst x6:(MOVBZload [i6] {s} p mem) [48])
+ o4:(OR <t> s4:(SLDconst x5:(MOVBZload [i5] {s} p mem) [40])
+ o3:(OR <t> s3:(SLDconst x4:(MOVBZload [i4] {s} p mem) [32])
+ x0:(MOVWZload {s} [i0] p mem)))))
+ && !config.BigEndian
+ && i4 == i0+4
+ && i5 == i0+5
+ && i6 == i0+6
+ && i7 == i0+7
+ && x0.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses ==1 && x7.Uses == 1
+ && o3.Uses == 1 && o4.Uses == 1 && o5.Uses == 1
+ && s3.Uses == 1 && s4.Uses == 1 && s5.Uses == 1 && s6.Uses == 1
+ && mergePoint(b, x0, x4, x5, x6, x7) != nil
+ && clobber(x0, x4, x5, x6, x7, s3, s4, s5, s6, o3, o4, o5)
+ => @mergePoint(b,x0,x4,x5,x6,x7) (MOVDload <t> {s} [i0] p mem)
+
+// b[7] | b[6]<<8 | b[5]<<16 | b[4]<<24 | b[3]<<32 | b[2]<<40 | b[1]<<48 | b[0]<<56 => load 64-bit Big endian ordered bytes on Little endian arch
+// Use byte-reverse indexed load of 8 bytes; the already merged low word (x4) must address the same symbol {s} as the byte loads.
+// Rules with commutative ops and many operands can result in extremely large functions in rewritePPC64,
+// so matching shorter previously defined subrules is important.
+(OR <t> s0:(SLDconst x0:(MOVBZload [i0] {s} p mem) [56])
+ o0:(OR <t> s1:(SLDconst x1:(MOVBZload [i1] {s} p mem) [48])
+ o1:(OR <t> s2:(SLDconst x2:(MOVBZload [i2] {s} p mem) [40])
+ o2:(OR <t> s3:(SLDconst x3:(MOVBZload [i3] {s} p mem) [32])
+ x4:(MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i4] {s} p) mem)))))
+ && !config.BigEndian
+ && i1 == i0+1
+ && i2 == i0+2
+ && i3 == i0+3
+ && i4 == i0+4
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1
+ && s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1
+ && mergePoint(b, x0, x1, x2, x3, x4) != nil
+ && clobber(x0, x1, x2, x3, x4, o0, o1, o2, s0, s1, s2, s3)
+ => @mergePoint(b,x0,x1,x2,x3,x4) (MOVDBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem)
+
+// b[0]<<56 | b[1]<<48 | b[2]<<40 | b[3]<<32 | b[4]<<24 | b[5]<<16 | b[6]<<8 | b[7] => load 64-bit Big endian ordered bytes on Little endian arch
+// Use byte-reverse indexed load of 8 bytes.
+// Rules with commutative ops and many operands can result in extremely large functions in rewritePPC64,
+// so matching shorter previously defined subrules is important.
+(OR <t> x7:(MOVBZload [i7] {s} p mem)
+ o5:(OR <t> s6:(SLDconst x6:(MOVBZload [i6] {s} p mem) [8])
+ o4:(OR <t> s5:(SLDconst x5:(MOVBZload [i5] {s} p mem) [16])
+ o3:(OR <t> s4:(SLDconst x4:(MOVBZload [i4] {s} p mem) [24])
+ s0:(SL(W|D)const x3:(MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [32])))))
+ && !config.BigEndian
+ && i4 == i0+4
+ && i5 == i0+5
+ && i6 == i0+6
+ && i7 == i0+7
+ && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1
+ && o3.Uses == 1 && o4.Uses == 1 && o5.Uses == 1
+ && s0.Uses == 1 && s4.Uses == 1 && s5.Uses == 1 && s6.Uses == 1
+ && mergePoint(b, x3, x4, x5, x6, x7) != nil
+ && clobber(x3, x4, x5, x6, x7, o3, o4, o5, s0, s4, s5, s6)
+ => @mergePoint(b,x3,x4,x5,x6,x7) (MOVDBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem)
+
+// 2 byte store Little endian as in:
+// b[0] = byte(v >> 16)
+// b[1] = byte(v >> 24)
+// Added for use in matching longer rules.
+(MOVBstore [i1] {s} p (SR(W|D)const w [24])
+ x0:(MOVBstore [i0] {s} p (SR(W|D)const w [16]) mem))
+ && !config.BigEndian
+ && x0.Uses == 1
+ && i1 == i0+1
+ && clobber(x0)
+ => (MOVHstore [i0] {s} p (SRWconst <typ.UInt16> w [16]) mem)
+
+// 2 byte store Little endian as in:
+// b[0] = byte(v)
+// b[1] = byte(v >> 8)
+(MOVBstore [i1] {s} p (SR(W|D)const w [8])
+ x0:(MOVBstore [i0] {s} p w mem))
+ && !config.BigEndian
+ && x0.Uses == 1
+ && i1 == i0+1
+ && clobber(x0)
+ => (MOVHstore [i0] {s} p w mem)
+
+// 4 byte store Little endian as in:
+// b[0:1] = uint16(v)
+// b[2:3] = uint16(v >> 16)
+(MOVHstore [i1] {s} p (SR(W|D)const w [16])
+ x0:(MOVHstore [i0] {s} p w mem))
+ && !config.BigEndian
+ && x0.Uses == 1
+ && i1 == i0+2
+ && clobber(x0)
+ => (MOVWstore [i0] {s} p w mem)
+
+// 4 byte store Big endian as in:
+// b[0] = byte(v >> 24)
+// b[1] = byte(v >> 16)
+// b[2] = byte(v >> 8)
+// b[3] = byte(v)
+// Use byte-reverse indexed 4 byte store.
+(MOVBstore [i3] {s} p w
+ x0:(MOVBstore [i2] {s} p (SRWconst w [8])
+ x1:(MOVBstore [i1] {s} p (SRWconst w [16])
+ x2:(MOVBstore [i0] {s} p (SRWconst w [24]) mem))))
+ && !config.BigEndian
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && i1 == i0+1 && i2 == i0+2 && i3 == i0+3
+ && clobber(x0, x1, x2)
+ => (MOVWBRstore (MOVDaddr <typ.Uintptr> [i0] {s} p) w mem)
+
+// The 2 byte store appears after the 4 byte store so that the
+// match for the 2 byte store is not done first.
+// If the 4 byte store is based on the 2 byte store then there are
+// variations on the MOVDaddr subrule that would require additional
+// rules to be written.
+
+// 2 byte store Big endian as in:
+// b[0] = byte(v >> 8)
+// b[1] = byte(v)
+(MOVBstore [i1] {s} p w x0:(MOVBstore [i0] {s} p (SRWconst w [8]) mem))
+ && !config.BigEndian
+ && x0.Uses == 1
+ && i1 == i0+1
+ && clobber(x0)
+ => (MOVHBRstore (MOVDaddr <typ.Uintptr> [i0] {s} p) w mem)
+
+// 8 byte store Little endian as in:
+// b[0] = byte(v)
+// b[1] = byte(v >> 8)
+// b[2] = byte(v >> 16)
+// b[3] = byte(v >> 24)
+// b[4] = byte(v >> 32)
+// b[5] = byte(v >> 40)
+// b[6] = byte(v >> 48)
+// b[7] = byte(v >> 56)
+// Built on previously defined rules
+// Offset must be multiple of 4 for MOVDstore
+(MOVBstore [i7] {s} p (SRDconst w [56])
+ x0:(MOVBstore [i6] {s} p (SRDconst w [48])
+ x1:(MOVBstore [i5] {s} p (SRDconst w [40])
+ x2:(MOVBstore [i4] {s} p (SRDconst w [32])
+ x3:(MOVWstore [i0] {s} p w mem)))))
+ && !config.BigEndian
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && i4 == i0+4 && i5 == i0+5 && i6 == i0+6 && i7 == i0+7
+ && clobber(x0, x1, x2, x3)
+ => (MOVDstore [i0] {s} p w mem)
+
+// 8 byte store Big endian as in:
+// b[0] = byte(v >> 56)
+// b[1] = byte(v >> 48)
+// b[2] = byte(v >> 40)
+// b[3] = byte(v >> 32)
+// b[4] = byte(v >> 24)
+// b[5] = byte(v >> 16)
+// b[6] = byte(v >> 8)
+// b[7] = byte(v)
+// Use byte-reverse indexed 8 byte store.
+(MOVBstore [i7] {s} p w
+ x0:(MOVBstore [i6] {s} p (SRDconst w [8])
+ x1:(MOVBstore [i5] {s} p (SRDconst w [16])
+ x2:(MOVBstore [i4] {s} p (SRDconst w [24])
+ x3:(MOVBstore [i3] {s} p (SRDconst w [32])
+ x4:(MOVBstore [i2] {s} p (SRDconst w [40])
+ x5:(MOVBstore [i1] {s} p (SRDconst w [48])
+ x6:(MOVBstore [i0] {s} p (SRDconst w [56]) mem))))))))
+ && !config.BigEndian
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1
+ && i1 == i0+1 && i2 == i0+2 && i3 == i0+3 && i4 == i0+4 && i5 == i0+5 && i6 == i0+6 && i7 == i0+7
+ && clobber(x0, x1, x2, x3, x4, x5, x6)
+ => (MOVDBRstore (MOVDaddr <typ.Uintptr> [i0] {s} p) w mem)
+
+// Arch-specific inlining for small or disjoint runtime.memmove
+(SelectN [0] call:(CALLstatic {sym} s1:(MOVDstore _ (MOVDconst [sz]) s2:(MOVDstore _ src s3:(MOVDstore {t} _ dst mem)))))
+ && sz >= 0
+ && isSameCall(sym, "runtime.memmove")
+ && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1
+ && isInlinableMemmove(dst, src, sz, config)
+ && clobber(s1, s2, s3, call)
+ => (Move [sz] dst src mem)
+
+// Match post-lowering calls, register version.
+(SelectN [0] call:(CALLstatic {sym} dst src (MOVDconst [sz]) mem))
+ && sz >= 0
+ && isSameCall(sym, "runtime.memmove")
+ && call.Uses == 1
+ && isInlinableMemmove(dst, src, sz, config)
+ && clobber(call)
+ => (Move [sz] dst src mem)
+
+// Prefetch instructions (TH specified using aux field)
+// For DCBT Ra,Rb,TH, A value of TH indicates:
+// 0, hint this cache line will be used soon. (PrefetchCache)
+// 16, hint this cache line will not be used for long. (PrefetchCacheStreamed)
+// See ISA 3.0 Book II 4.3.2 for more detail. https://openpower.foundation/specifications/isa/
+(PrefetchCache ptr mem) => (DCBT ptr mem [0])
+(PrefetchCacheStreamed ptr mem) => (DCBT ptr mem [16])
+
diff --git a/src/cmd/compile/internal/ssa/_gen/PPC64Ops.go b/src/cmd/compile/internal/ssa/_gen/PPC64Ops.go
new file mode 100644
index 0000000..2d651dd
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/PPC64Ops.go
@@ -0,0 +1,740 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import "strings"
+
+// Notes:
+// - Less-than-64-bit integer types live in the low portion of registers.
+// The upper portion is junk.
+// - Boolean types are zero or 1; stored in a byte, with upper bytes of the register containing junk.
+// - *const instructions may use a constant larger than the instruction can encode.
+// In this case the assembler expands to multiple instructions and uses tmp
+// register (R31).
+
+// regNamesPPC64 lists the PPC64 register names. A register's position in this
+// slice is its number: init maps each name to its index and buildReg turns that
+// index into one bit of a regMask, so the table may hold at most 64 entries
+// (init panics otherwise).
+var regNamesPPC64 = []string{
+ "R0", // REGZERO, not used, but simplifies counting in regalloc
+ "SP", // REGSP
+ "SB", // REGSB
+ "R3",
+ "R4",
+ "R5",
+ "R6",
+ "R7",
+ "R8",
+ "R9",
+ "R10",
+ "R11", // REGCTXT for closures
+ "R12",
+ "R13", // REGTLS
+ "R14",
+ "R15",
+ "R16",
+ "R17",
+ "R18",
+ "R19",
+ "R20",
+ "R21",
+ "R22",
+ "R23",
+ "R24",
+ "R25",
+ "R26",
+ "R27",
+ "R28",
+ "R29",
+ "g", // REGG. Using name "g" and setting Config.hasGReg makes it "just happen".
+ "R31", // REGTMP
+
+ "F0",
+ "F1",
+ "F2",
+ "F3",
+ "F4",
+ "F5",
+ "F6",
+ "F7",
+ "F8",
+ "F9",
+ "F10",
+ "F11",
+ "F12",
+ "F13",
+ "F14",
+ "F15",
+ "F16",
+ "F17",
+ "F18",
+ "F19",
+ "F20",
+ "F21",
+ "F22",
+ "F23",
+ "F24",
+ "F25",
+ "F26",
+ "F27",
+ "F28",
+ "F29",
+ "F30",
+ // "F31", the allocator is limited to 64 entries. We sacrifice this FPR to support XER.
+
+ "XER",
+
+ // If you add registers, update asyncPreempt in runtime.
+
+ // "CR0",
+ // "CR1",
+ // "CR2",
+ // "CR3",
+ // "CR4",
+ // "CR5",
+ // "CR6",
+ // "CR7",
+
+ // "CR",
+ // "LR",
+ // "CTR",
+}
+
+func init() {
+ // Make map from reg names to reg integers.
+ if len(regNamesPPC64) > 64 {
+ panic("too many registers")
+ }
+ num := map[string]int{}
+ for i, name := range regNamesPPC64 {
+ num[name] = i
+ }
+ buildReg := func(s string) regMask {
+ m := regMask(0)
+ for _, r := range strings.Split(s, " ") {
+ if n, ok := num[r]; ok {
+ m |= regMask(1) << uint(n)
+ continue
+ }
+ panic("register " + r + " not found")
+ }
+ return m
+ }
+
+ var (
+ gp = buildReg("R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29")
+ fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30")
+ sp = buildReg("SP")
+ sb = buildReg("SB")
+ gr = buildReg("g")
+ xer = buildReg("XER")
+ // cr = buildReg("CR")
+ // ctr = buildReg("CTR")
+ // lr = buildReg("LR")
+ tmp = buildReg("R31")
+ ctxt = buildReg("R11")
+ callptr = buildReg("R12")
+ // tls = buildReg("R13")
+ gp01 = regInfo{inputs: nil, outputs: []regMask{gp}}
+ gp11 = regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{gp}}
+ xergp = regInfo{inputs: []regMask{xer}, outputs: []regMask{gp}, clobbers: xer}
+ gp11cxer = regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{gp}, clobbers: xer}
+ gp11xer = regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{gp, xer}}
+ gp21 = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp}}
+ gp21a0 = regInfo{inputs: []regMask{gp, gp | sp | sb}, outputs: []regMask{gp}}
+ gp21cxer = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp}, clobbers: xer}
+ gp21xer = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp, xer}, clobbers: xer}
+ gp2xer1xer = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb, xer}, outputs: []regMask{gp, xer}, clobbers: xer}
+ gp31 = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp}}
+ gp1cr = regInfo{inputs: []regMask{gp | sp | sb}}
+ gp2cr = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}}
+ crgp = regInfo{inputs: nil, outputs: []regMask{gp}}
+ crgp11 = regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}
+ crgp21 = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}}
+ gpload = regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{gp}}
+ gploadidx = regInfo{inputs: []regMask{gp | sp | sb, gp}, outputs: []regMask{gp}}
+ prefreg = regInfo{inputs: []regMask{gp | sp | sb}}
+ gpstore = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}}
+ gpstoreidx = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb, gp | sp | sb}}
+ gpstorezero = regInfo{inputs: []regMask{gp | sp | sb}} // ppc64.REGZERO is reserved zero value
+ gpxchg = regInfo{inputs: []regMask{gp | sp | sb, gp}, outputs: []regMask{gp}}
+ gpcas = regInfo{inputs: []regMask{gp | sp | sb, gp, gp}, outputs: []regMask{gp}}
+ fp01 = regInfo{inputs: nil, outputs: []regMask{fp}}
+ fp11 = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}}
+ fpgp = regInfo{inputs: []regMask{fp}, outputs: []regMask{gp}}
+ gpfp = regInfo{inputs: []regMask{gp}, outputs: []regMask{fp}}
+ fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
+ fp31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: []regMask{fp}}
+ fp2cr = regInfo{inputs: []regMask{fp, fp}}
+ fpload = regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{fp}}
+ fploadidx = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}, outputs: []regMask{fp}}
+ fpstore = regInfo{inputs: []regMask{gp | sp | sb, fp}}
+ fpstoreidx = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb, fp}}
+ callerSave = regMask(gp | fp | gr | xer)
+ r3 = buildReg("R3")
+ r4 = buildReg("R4")
+ r5 = buildReg("R5")
+ r6 = buildReg("R6")
+ )
+ ops := []opData{
+ {name: "ADD", argLength: 2, reg: gp21, asm: "ADD", commutative: true}, // arg0 + arg1
+ {name: "ADDconst", argLength: 1, reg: gp11, asm: "ADD", aux: "Int64"}, // arg0 + auxInt
+ {name: "FADD", argLength: 2, reg: fp21, asm: "FADD", commutative: true}, // arg0+arg1
+ {name: "FADDS", argLength: 2, reg: fp21, asm: "FADDS", commutative: true}, // arg0+arg1
+ {name: "SUB", argLength: 2, reg: gp21, asm: "SUB"}, // arg0-arg1
+ {name: "SUBFCconst", argLength: 1, reg: gp11cxer, asm: "SUBC", aux: "Int64"}, // auxInt - arg0 (carry is ignored)
+ {name: "FSUB", argLength: 2, reg: fp21, asm: "FSUB"}, // arg0-arg1
+ {name: "FSUBS", argLength: 2, reg: fp21, asm: "FSUBS"}, // arg0-arg1
+
+ {name: "MULLD", argLength: 2, reg: gp21, asm: "MULLD", typ: "Int64", commutative: true}, // arg0*arg1 (signed 64-bit)
+ {name: "MULLW", argLength: 2, reg: gp21, asm: "MULLW", typ: "Int32", commutative: true}, // arg0*arg1 (signed 32-bit)
+ {name: "MULLDconst", argLength: 1, reg: gp11, asm: "MULLD", aux: "Int32", typ: "Int64"}, // arg0*auxInt (signed 64-bit)
+ {name: "MULLWconst", argLength: 1, reg: gp11, asm: "MULLW", aux: "Int32", typ: "Int64"}, // arg0*auxInt (signed 64-bit)
+ {name: "MADDLD", argLength: 3, reg: gp31, asm: "MADDLD", typ: "Int64"}, // (arg0*arg1)+arg2 (signed 64-bit)
+
+ {name: "MULHD", argLength: 2, reg: gp21, asm: "MULHD", commutative: true}, // (arg0 * arg1) >> 64, signed
+ {name: "MULHW", argLength: 2, reg: gp21, asm: "MULHW", commutative: true}, // (arg0 * arg1) >> 32, signed
+ {name: "MULHDU", argLength: 2, reg: gp21, asm: "MULHDU", commutative: true}, // (arg0 * arg1) >> 64, unsigned
+ {name: "MULHWU", argLength: 2, reg: gp21, asm: "MULHWU", commutative: true}, // (arg0 * arg1) >> 32, unsigned
+
+ {name: "FMUL", argLength: 2, reg: fp21, asm: "FMUL", commutative: true}, // arg0*arg1
+ {name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true}, // arg0*arg1
+
+ {name: "FMADD", argLength: 3, reg: fp31, asm: "FMADD"}, // arg0*arg1 + arg2
+ {name: "FMADDS", argLength: 3, reg: fp31, asm: "FMADDS"}, // arg0*arg1 + arg2
+ {name: "FMSUB", argLength: 3, reg: fp31, asm: "FMSUB"}, // arg0*arg1 - arg2
+ {name: "FMSUBS", argLength: 3, reg: fp31, asm: "FMSUBS"}, // arg0*arg1 - arg2
+
+ {name: "SRAD", argLength: 2, reg: gp21cxer, asm: "SRAD"}, // signed arg0 >> (arg1&127), 64 bit width (note: 127, not 63!)
+ {name: "SRAW", argLength: 2, reg: gp21cxer, asm: "SRAW"}, // signed arg0 >> (arg1&63), 32 bit width
+ {name: "SRD", argLength: 2, reg: gp21, asm: "SRD"}, // unsigned arg0 >> (arg1&127), 64 bit width
+ {name: "SRW", argLength: 2, reg: gp21, asm: "SRW"}, // unsigned arg0 >> (arg1&63), 32 bit width
+ {name: "SLD", argLength: 2, reg: gp21, asm: "SLD"}, // arg0 << (arg1&127), 64 bit width
+ {name: "SLW", argLength: 2, reg: gp21, asm: "SLW"}, // arg0 << (arg1&63), 32 bit width
+
+ {name: "ROTL", argLength: 2, reg: gp21, asm: "ROTL"}, // arg0 rotate left by arg1 mod 64
+ {name: "ROTLW", argLength: 2, reg: gp21, asm: "ROTLW"}, // uint32(arg0) rotate left by arg1 mod 32
+ // The following are ops to implement the extended mnemonics for shifts as described in section C.8 of the ISA.
+ // The constant shift values are packed into the aux int32.
+ {name: "RLDICL", argLength: 1, reg: gp11, asm: "RLDICL", aux: "Int32"}, // arg0 extract bits identified by shift params"
+ {name: "CLRLSLWI", argLength: 1, reg: gp11, asm: "CLRLSLWI", aux: "Int32"}, //
+ {name: "CLRLSLDI", argLength: 1, reg: gp11, asm: "CLRLSLDI", aux: "Int32"}, //
+
+ // Operations which consume or generate the CA (xer)
+ {name: "ADDC", argLength: 2, reg: gp21xer, asm: "ADDC", commutative: true, typ: "(UInt64, UInt64)"}, // arg0 + arg1 -> out, CA
+ {name: "SUBC", argLength: 2, reg: gp21xer, asm: "SUBC", typ: "(UInt64, UInt64)"}, // arg0 - arg1 -> out, CA
+ {name: "ADDCconst", argLength: 1, reg: gp11xer, asm: "ADDC", typ: "(UInt64, UInt64)", aux: "Int64"}, // arg0 + imm16 -> out, CA
+ {name: "SUBCconst", argLength: 1, reg: gp11xer, asm: "SUBC", typ: "(UInt64, UInt64)", aux: "Int64"}, // imm16 - arg0 -> out, CA
+ {name: "ADDE", argLength: 3, reg: gp2xer1xer, asm: "ADDE", typ: "(UInt64, UInt64)", commutative: true}, // arg0 + arg1 + CA (arg2) -> out, CA
+ {name: "SUBE", argLength: 3, reg: gp2xer1xer, asm: "SUBE", typ: "(UInt64, UInt64)"}, // arg0 - arg1 - CA (arg2) -> out, CA
+ {name: "ADDZEzero", argLength: 1, reg: xergp, asm: "ADDZE", typ: "UInt64"}, // CA (arg0) + $0 -> out
+ {name: "SUBZEzero", argLength: 1, reg: xergp, asm: "SUBZE", typ: "UInt64"}, // $0 - CA (arg0) -> out
+
+ {name: "SRADconst", argLength: 1, reg: gp11cxer, asm: "SRAD", aux: "Int64"}, // signed arg0 >> auxInt, 0 <= auxInt < 64, 64 bit width
+ {name: "SRAWconst", argLength: 1, reg: gp11cxer, asm: "SRAW", aux: "Int64"}, // signed arg0 >> auxInt, 0 <= auxInt < 32, 32 bit width
+ {name: "SRDconst", argLength: 1, reg: gp11, asm: "SRD", aux: "Int64"}, // unsigned arg0 >> auxInt, 0 <= auxInt < 64, 64 bit width
+ {name: "SRWconst", argLength: 1, reg: gp11, asm: "SRW", aux: "Int64"}, // unsigned arg0 >> auxInt, 0 <= auxInt < 32, 32 bit width
+ {name: "SLDconst", argLength: 1, reg: gp11, asm: "SLD", aux: "Int64"}, // arg0 << auxInt, 0 <= auxInt < 64, 64 bit width
+ {name: "SLWconst", argLength: 1, reg: gp11, asm: "SLW", aux: "Int64"}, // arg0 << auxInt, 0 <= auxInt < 32, 32 bit width
+
+ {name: "ROTLconst", argLength: 1, reg: gp11, asm: "ROTL", aux: "Int64"}, // arg0 rotate left by auxInt bits
+ {name: "ROTLWconst", argLength: 1, reg: gp11, asm: "ROTLW", aux: "Int64"}, // uint32(arg0) rotate left by auxInt bits
+ {name: "EXTSWSLconst", argLength: 1, reg: gp11, asm: "EXTSWSLI", aux: "Int64"},
+
+ {name: "RLWINM", argLength: 1, reg: gp11, asm: "RLWNM", aux: "Int64"}, // Rotate and mask by immediate "rlwinm". encodePPC64RotateMask describes aux
+ {name: "RLWNM", argLength: 2, reg: gp21, asm: "RLWNM", aux: "Int64"}, // Rotate and mask by "rlwnm". encodePPC64RotateMask describes aux
+ {name: "RLWMI", argLength: 2, reg: gp21a0, asm: "RLWMI", aux: "Int64", resultInArg0: true}, // "rlwimi" similar aux encoding as above
+
+ {name: "CNTLZD", argLength: 1, reg: gp11, asm: "CNTLZD", clobberFlags: true}, // count leading zeros
+ {name: "CNTLZW", argLength: 1, reg: gp11, asm: "CNTLZW", clobberFlags: true}, // count leading zeros (32 bit)
+
+ {name: "CNTTZD", argLength: 1, reg: gp11, asm: "CNTTZD"}, // count trailing zeros
+ {name: "CNTTZW", argLength: 1, reg: gp11, asm: "CNTTZW"}, // count trailing zeros (32 bit)
+
+ {name: "POPCNTD", argLength: 1, reg: gp11, asm: "POPCNTD"}, // number of set bits in arg0
+ {name: "POPCNTW", argLength: 1, reg: gp11, asm: "POPCNTW"}, // number of set bits in each word of arg0 placed in corresponding word
+ {name: "POPCNTB", argLength: 1, reg: gp11, asm: "POPCNTB"}, // number of set bits in each byte of arg0 placed in corresponding byte
+
+ {name: "FDIV", argLength: 2, reg: fp21, asm: "FDIV"}, // arg0/arg1
+ {name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS"}, // arg0/arg1
+
+ {name: "DIVD", argLength: 2, reg: gp21, asm: "DIVD", typ: "Int64"}, // arg0/arg1 (signed 64-bit)
+ {name: "DIVW", argLength: 2, reg: gp21, asm: "DIVW", typ: "Int32"}, // arg0/arg1 (signed 32-bit)
+ {name: "DIVDU", argLength: 2, reg: gp21, asm: "DIVDU", typ: "Int64"}, // arg0/arg1 (unsigned 64-bit)
+ {name: "DIVWU", argLength: 2, reg: gp21, asm: "DIVWU", typ: "Int32"}, // arg0/arg1 (unsigned 32-bit)
+
+ {name: "MODUD", argLength: 2, reg: gp21, asm: "MODUD", typ: "UInt64"}, // arg0 % arg1 (unsigned 64-bit)
+ {name: "MODSD", argLength: 2, reg: gp21, asm: "MODSD", typ: "Int64"}, // arg0 % arg1 (signed 64-bit)
+ {name: "MODUW", argLength: 2, reg: gp21, asm: "MODUW", typ: "UInt32"}, // arg0 % arg1 (unsigned 32-bit)
+ {name: "MODSW", argLength: 2, reg: gp21, asm: "MODSW", typ: "Int32"}, // arg0 % arg1 (signed 32-bit)
+ // MOD is implemented as rem := arg0 - (arg0/arg1) * arg1
+
+ // Conversions are all float-to-float register operations. "Integer" refers to encoding in the FP register.
+ {name: "FCTIDZ", argLength: 1, reg: fp11, asm: "FCTIDZ", typ: "Float64"}, // convert float to 64-bit int round towards zero
+ {name: "FCTIWZ", argLength: 1, reg: fp11, asm: "FCTIWZ", typ: "Float64"}, // convert float to 32-bit int round towards zero
+ {name: "FCFID", argLength: 1, reg: fp11, asm: "FCFID", typ: "Float64"}, // convert 64-bit integer to float
+ {name: "FCFIDS", argLength: 1, reg: fp11, asm: "FCFIDS", typ: "Float32"}, // convert 32-bit integer to float
+ {name: "FRSP", argLength: 1, reg: fp11, asm: "FRSP", typ: "Float64"}, // round float to 32-bit value
+
+ // Movement between float and integer registers with no change in bits; accomplished with stores+loads on PPC.
+ // Because the 32-bit load-literal-bits instructions have impoverished addressability, always widen the
+ // data instead and use FMOVDload and FMOVDstore instead (this will also dodge endianness issues).
+ // There are optimizations that should apply -- (Xi2f64 (MOVWload (not-ADD-ptr+offset) ) ) could use
+ // the word-load instructions. (Xi2f64 (MOVDload ptr )) can be (FMOVDload ptr)
+
+ {name: "MFVSRD", argLength: 1, reg: fpgp, asm: "MFVSRD", typ: "Int64"}, // move 64 bits of F register into G register
+ {name: "MTVSRD", argLength: 1, reg: gpfp, asm: "MTVSRD", typ: "Float64"}, // move 64 bits of G register into F register
+
+ {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0&arg1
+ {name: "ANDN", argLength: 2, reg: gp21, asm: "ANDN"}, // arg0&^arg1
+ {name: "ANDCC", argLength: 2, reg: gp21, asm: "ANDCC", commutative: true, clobberFlags: true, typ: "(Int64,Flags)"}, // arg0&arg1 sets CC
+ {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0|arg1
+ {name: "ORN", argLength: 2, reg: gp21, asm: "ORN"}, // arg0|^arg1
+ {name: "ORCC", argLength: 2, reg: gp21, asm: "ORCC", commutative: true, clobberFlags: true, typ: "(Int,Flags)"}, // arg0|arg1 sets CC
+ {name: "NOR", argLength: 2, reg: gp21, asm: "NOR", commutative: true}, // ^(arg0|arg1)
+ {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", typ: "Int64", commutative: true}, // arg0^arg1
+ {name: "XORCC", argLength: 2, reg: gp21, asm: "XORCC", commutative: true, clobberFlags: true, typ: "(Int,Flags)"}, // arg0^arg1 sets CC
+ {name: "EQV", argLength: 2, reg: gp21, asm: "EQV", typ: "Int64", commutative: true}, // arg0^^arg1
+ {name: "NEG", argLength: 1, reg: gp11, asm: "NEG"}, // -arg0 (integer)
+ {name: "FNEG", argLength: 1, reg: fp11, asm: "FNEG"}, // -arg0 (floating point)
+ {name: "FSQRT", argLength: 1, reg: fp11, asm: "FSQRT"}, // sqrt(arg0) (floating point)
+ {name: "FSQRTS", argLength: 1, reg: fp11, asm: "FSQRTS"}, // sqrt(arg0) (floating point, single precision)
+ {name: "FFLOOR", argLength: 1, reg: fp11, asm: "FRIM"}, // floor(arg0), float64
+ {name: "FCEIL", argLength: 1, reg: fp11, asm: "FRIP"}, // ceil(arg0), float64
+ {name: "FTRUNC", argLength: 1, reg: fp11, asm: "FRIZ"}, // trunc(arg0), float64
+ {name: "FROUND", argLength: 1, reg: fp11, asm: "FRIN"}, // round(arg0), float64
+ {name: "FABS", argLength: 1, reg: fp11, asm: "FABS"}, // abs(arg0), float64
+ {name: "FNABS", argLength: 1, reg: fp11, asm: "FNABS"}, // -abs(arg0), float64
+ {name: "FCPSGN", argLength: 2, reg: fp21, asm: "FCPSGN"}, // copysign arg0 -> arg1, float64
+
+ {name: "ORconst", argLength: 1, reg: gp11, asm: "OR", aux: "Int64"}, // arg0|aux
+ {name: "XORconst", argLength: 1, reg: gp11, asm: "XOR", aux: "Int64"}, // arg0^aux
+ {name: "ANDCCconst", argLength: 1, reg: regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{gp}}, asm: "ANDCC", aux: "Int64", clobberFlags: true, typ: "(Int,Flags)"}, // arg0&aux == 0 // and-immediate sets CC on PPC, always.
+
+ {name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB", typ: "Int64"}, // sign extend int8 to int64
+ {name: "MOVBZreg", argLength: 1, reg: gp11, asm: "MOVBZ", typ: "Int64"}, // zero extend uint8 to uint64
+ {name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVH", typ: "Int64"}, // sign extend int16 to int64
+ {name: "MOVHZreg", argLength: 1, reg: gp11, asm: "MOVHZ", typ: "Int64"}, // zero extend uint16 to uint64
+ {name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW", typ: "Int64"}, // sign extend int32 to int64
+ {name: "MOVWZreg", argLength: 1, reg: gp11, asm: "MOVWZ", typ: "Int64"}, // zero extend uint32 to uint64
+
+ // Load bytes in the endian order of the arch from arg0+aux+auxint into a 64 bit register.
+ {name: "MOVBZload", argLength: 2, reg: gpload, asm: "MOVBZ", aux: "SymOff", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load byte zero extend
+ {name: "MOVHload", argLength: 2, reg: gpload, asm: "MOVH", aux: "SymOff", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // load 2 bytes sign extend
+ {name: "MOVHZload", argLength: 2, reg: gpload, asm: "MOVHZ", aux: "SymOff", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load 2 bytes zero extend
+ {name: "MOVWload", argLength: 2, reg: gpload, asm: "MOVW", aux: "SymOff", typ: "Int32", faultOnNilArg0: true, symEffect: "Read"}, // load 4 bytes sign extend
+ {name: "MOVWZload", argLength: 2, reg: gpload, asm: "MOVWZ", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load 4 bytes zero extend
+ {name: "MOVDload", argLength: 2, reg: gpload, asm: "MOVD", aux: "SymOff", typ: "Int64", faultOnNilArg0: true, symEffect: "Read"}, // load 8 bytes
+
+ // Load bytes in reverse endian order of the arch from arg0 into a 64 bit register, all zero extend.
+ // The generated instructions are indexed loads with no offset field in the instruction so the aux fields are not used.
+ // In these cases the index register field is set to 0 and the full address is in the base register.
+ {name: "MOVDBRload", argLength: 2, reg: gpload, asm: "MOVDBR", aux: "SymOff", typ: "Int64", faultOnNilArg0: true, symEffect: "Read"}, // load 8 bytes reverse order
+ {name: "MOVWBRload", argLength: 2, reg: gpload, asm: "MOVWBR", aux: "SymOff", typ: "Int32", faultOnNilArg0: true, symEffect: "Read"}, // load 4 bytes zero extend reverse order
+ {name: "MOVHBRload", argLength: 2, reg: gpload, asm: "MOVHBR", aux: "SymOff", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // load 2 bytes zero extend reverse order
+
+ // In these cases an index register is used in addition to a base register
+ // Loads from memory location arg[0] + arg[1].
+ {name: "MOVBZloadidx", argLength: 3, reg: gploadidx, asm: "MOVBZ", typ: "UInt8"}, // zero extend uint8 to uint64
+ {name: "MOVHloadidx", argLength: 3, reg: gploadidx, asm: "MOVH", typ: "Int16"}, // sign extend int16 to int64
+ {name: "MOVHZloadidx", argLength: 3, reg: gploadidx, asm: "MOVHZ", typ: "UInt16"}, // zero extend uint16 to uint64
+ {name: "MOVWloadidx", argLength: 3, reg: gploadidx, asm: "MOVW", typ: "Int32"}, // sign extend int32 to int64
+ {name: "MOVWZloadidx", argLength: 3, reg: gploadidx, asm: "MOVWZ", typ: "UInt32"}, // zero extend uint32 to uint64
+ {name: "MOVDloadidx", argLength: 3, reg: gploadidx, asm: "MOVD", typ: "Int64"},
+ {name: "MOVHBRloadidx", argLength: 3, reg: gploadidx, asm: "MOVHBR", typ: "Int16"}, // load 2 bytes zero extend reverse order
+ {name: "MOVWBRloadidx", argLength: 3, reg: gploadidx, asm: "MOVWBR", typ: "Int32"}, // load 4 bytes zero extend reverse order
+ {name: "MOVDBRloadidx", argLength: 3, reg: gploadidx, asm: "MOVDBR", typ: "Int64"},
+ {name: "FMOVDloadidx", argLength: 3, reg: fploadidx, asm: "FMOVD", typ: "Float64"},
+ {name: "FMOVSloadidx", argLength: 3, reg: fploadidx, asm: "FMOVS", typ: "Float32"},
+
+ // Prefetch instruction
+ // Do prefetch of address generated with arg0 and arg1 with option aux. arg0=addr,arg1=memory, aux=option.
+ {name: "DCBT", argLength: 2, aux: "Int64", reg: prefreg, asm: "DCBT", hasSideEffects: true},
+
+ // Store bytes in the reverse endian order of the arch into arg0.
+ // These are indexed stores with no offset field in the instruction so the auxint fields are not used.
+ {name: "MOVDBRstore", argLength: 3, reg: gpstore, asm: "MOVDBR", aux: "Sym", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes reverse order
+ {name: "MOVWBRstore", argLength: 3, reg: gpstore, asm: "MOVWBR", aux: "Sym", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes reverse order
+ {name: "MOVHBRstore", argLength: 3, reg: gpstore, asm: "MOVHBR", aux: "Sym", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes reverse order
+
+ // Floating point loads from arg0+aux+auxint
+ {name: "FMOVDload", argLength: 2, reg: fpload, asm: "FMOVD", aux: "SymOff", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"}, // load double float
+ {name: "FMOVSload", argLength: 2, reg: fpload, asm: "FMOVS", aux: "SymOff", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"}, // load single float
+
+ // Store bytes in the endian order of the arch into arg0+aux+auxint
+ {name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store byte
+ {name: "MOVHstore", argLength: 3, reg: gpstore, asm: "MOVH", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes
+ {name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes
+ {name: "MOVDstore", argLength: 3, reg: gpstore, asm: "MOVD", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes
+
+ // Store floating point value into arg0+aux+auxint
+ {name: "FMOVDstore", argLength: 3, reg: fpstore, asm: "FMOVD", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store double float
+ {name: "FMOVSstore", argLength: 3, reg: fpstore, asm: "FMOVS", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store single float
+
+ // Stores using index and base registers
+ // Stores to arg[0] + arg[1]
+ {name: "MOVBstoreidx", argLength: 4, reg: gpstoreidx, asm: "MOVB", typ: "Mem"}, // store byte
+ {name: "MOVHstoreidx", argLength: 4, reg: gpstoreidx, asm: "MOVH", typ: "Mem"}, // store half word
+ {name: "MOVWstoreidx", argLength: 4, reg: gpstoreidx, asm: "MOVW", typ: "Mem"}, // store word
+ {name: "MOVDstoreidx", argLength: 4, reg: gpstoreidx, asm: "MOVD", typ: "Mem"}, // store double word
+ {name: "FMOVDstoreidx", argLength: 4, reg: fpstoreidx, asm: "FMOVD", typ: "Mem"}, // store double float
+ {name: "FMOVSstoreidx", argLength: 4, reg: fpstoreidx, asm: "FMOVS", typ: "Mem"}, // store single float
+ {name: "MOVHBRstoreidx", argLength: 4, reg: gpstoreidx, asm: "MOVHBR", typ: "Mem"}, // store half word reversed byte using index reg
+ {name: "MOVWBRstoreidx", argLength: 4, reg: gpstoreidx, asm: "MOVWBR", typ: "Mem"}, // store word reversed byte using index reg
+ {name: "MOVDBRstoreidx", argLength: 4, reg: gpstoreidx, asm: "MOVDBR", typ: "Mem"}, // store double word reversed byte using index reg
+
+ // The following ops store 0 into arg0+aux+auxint arg1=mem
+ {name: "MOVBstorezero", argLength: 2, reg: gpstorezero, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store zero 1 byte
+ {name: "MOVHstorezero", argLength: 2, reg: gpstorezero, asm: "MOVH", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store zero 2 bytes
+ {name: "MOVWstorezero", argLength: 2, reg: gpstorezero, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store zero 4 bytes
+ {name: "MOVDstorezero", argLength: 2, reg: gpstorezero, asm: "MOVD", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store zero 8 bytes
+
+ {name: "MOVDaddr", argLength: 1, reg: regInfo{inputs: []regMask{sp | sb | gp}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVD", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB/GP
+
+ {name: "MOVDconst", argLength: 0, reg: gp01, aux: "Int64", asm: "MOVD", typ: "Int64", rematerializeable: true}, //
+ {name: "FMOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "FMOVD", rematerializeable: true}, //
+ {name: "FMOVSconst", argLength: 0, reg: fp01, aux: "Float32", asm: "FMOVS", rematerializeable: true}, //
+ {name: "FCMPU", argLength: 2, reg: fp2cr, asm: "FCMPU", typ: "Flags"},
+
+ {name: "CMP", argLength: 2, reg: gp2cr, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPU", argLength: 2, reg: gp2cr, asm: "CMPU", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPW", argLength: 2, reg: gp2cr, asm: "CMPW", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPWU", argLength: 2, reg: gp2cr, asm: "CMPWU", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPconst", argLength: 1, reg: gp1cr, asm: "CMP", aux: "Int64", typ: "Flags"},
+ {name: "CMPUconst", argLength: 1, reg: gp1cr, asm: "CMPU", aux: "Int64", typ: "Flags"},
+ {name: "CMPWconst", argLength: 1, reg: gp1cr, asm: "CMPW", aux: "Int32", typ: "Flags"},
+ {name: "CMPWUconst", argLength: 1, reg: gp1cr, asm: "CMPWU", aux: "Int32", typ: "Flags"},
+
+ // ISEL arg2 ? arg0 : arg1
+ // ISELB arg1 ? arg0 : $0. arg0 is some register holding $1.
+ // ISELZ arg1 ? arg0 : $0
+ // auxInt values 0=LT 1=GT 2=EQ 3=SO (summary overflow/unordered) 4=GE 5=LE 6=NE 7=NSO (not summary overflow/not unordered)
+ // Note, auxInt^4 inverts the comparison condition. For example, LT^4 becomes GE, and "ISEL [a] x y z" is equivalent to "ISEL [a^4] y x z".
+ {name: "ISEL", argLength: 3, reg: crgp21, asm: "ISEL", aux: "Int32", typ: "Int32"},
+ {name: "ISELB", argLength: 2, reg: crgp11, asm: "ISEL", aux: "Int32", typ: "Int32"},
+ {name: "ISELZ", argLength: 2, reg: crgp11, asm: "ISEL", aux: "Int32"},
+
+ // pseudo-ops
+ {name: "Equal", argLength: 1, reg: crgp}, // bool, true flags encode x==y false otherwise.
+ {name: "NotEqual", argLength: 1, reg: crgp}, // bool, true flags encode x!=y false otherwise.
+ {name: "LessThan", argLength: 1, reg: crgp}, // bool, true flags encode x<y false otherwise.
+ {name: "FLessThan", argLength: 1, reg: crgp}, // bool, true flags encode x<y false otherwise.
+ {name: "LessEqual", argLength: 1, reg: crgp}, // bool, true flags encode x<=y false otherwise.
+ {name: "FLessEqual", argLength: 1, reg: crgp}, // bool, true flags encode x<=y false otherwise; PPC <= === !> which is wrong for NaN
+ {name: "GreaterThan", argLength: 1, reg: crgp}, // bool, true flags encode x>y false otherwise.
+ {name: "FGreaterThan", argLength: 1, reg: crgp}, // bool, true flags encode x>y false otherwise.
+ {name: "GreaterEqual", argLength: 1, reg: crgp}, // bool, true flags encode x>=y false otherwise.
+ {name: "FGreaterEqual", argLength: 1, reg: crgp}, // bool, true flags encode x>=y false otherwise.; PPC >= === !< which is wrong for NaN
+
+ // Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
+ // and sorts it to the very beginning of the block to prevent other
+ // use of the closure pointer.
+ {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{ctxt}}, zeroWidth: true},
+
+ // LoweredGetCallerSP returns the SP of the caller of the current function.
+ {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true},
+
+ // LoweredGetCallerPC evaluates to the PC to which its "caller" will return.
+ // I.e., if f calls g "calls" getcallerpc,
+ // the result should be the PC within f that g will return to.
+ // See runtime/stubs.go for a more detailed discussion.
+ {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true},
+
+ //arg0=ptr,arg1=mem, returns void. Faults if ptr is nil.
+ {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gp | sp | sb}, clobbers: tmp}, clobberFlags: true, nilCheck: true, faultOnNilArg0: true},
+ // Round ops to block fused-multiply-add extraction.
+ {name: "LoweredRound32F", argLength: 1, reg: fp11, resultInArg0: true, zeroWidth: true},
+ {name: "LoweredRound64F", argLength: 1, reg: fp11, resultInArg0: true, zeroWidth: true},
+
+ {name: "CALLstatic", argLength: -1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "CALLtail", argLength: -1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "CALLclosure", argLength: -1, reg: regInfo{inputs: []regMask{callptr, ctxt, 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+ {name: "CALLinter", argLength: -1, reg: regInfo{inputs: []regMask{callptr}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+
+ // large or unaligned zeroing
+ // arg0 = address of memory to zero (in R3, changed as side effect)
+ // returns mem
+ //
+ // a loop is generated when there is more than one iteration
+ // needed to clear 4 doublewords
+ //
+ // XXLXOR VS32,VS32,VS32
+ // MOVD $len/32,R31
+ // MOVD R31,CTR
+ // MOVD $16,R31
+ // loop:
+ // STXVD2X VS32,(R0)(R3)
+ // STXVD2X VS32,(R31)(R3)
+ // ADD R3,32
+ // BC loop
+
+ // remaining doubleword clears generated as needed
+ // MOVD R0,(R3)
+ // MOVD R0,8(R3)
+ // MOVD R0,16(R3)
+ // MOVD R0,24(R3)
+
+ // one or more of these to clear remainder < 8 bytes
+ // MOVW R0,n1(R3)
+ // MOVH R0,n2(R3)
+ // MOVB R0,n3(R3)
+ {
+ name: "LoweredZero",
+ aux: "Int64",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R20")},
+ clobbers: buildReg("R20"),
+ },
+ clobberFlags: true,
+ typ: "Mem",
+ faultOnNilArg0: true,
+ unsafePoint: true,
+ },
+ {
+ name: "LoweredZeroShort",
+ aux: "Int64",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{gp}},
+ typ: "Mem",
+ faultOnNilArg0: true,
+ unsafePoint: true,
+ },
+ {
+ name: "LoweredQuadZeroShort",
+ aux: "Int64",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{gp},
+ },
+ typ: "Mem",
+ faultOnNilArg0: true,
+ unsafePoint: true,
+ },
+ {
+ name: "LoweredQuadZero",
+ aux: "Int64",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R20")},
+ clobbers: buildReg("R20"),
+ },
+ clobberFlags: true,
+ typ: "Mem",
+ faultOnNilArg0: true,
+ unsafePoint: true,
+ },
+
+ // R31 is temp register
+ // Loop code:
+ // MOVD len/32,R31 set up loop ctr
+ // MOVD R31,CTR
+ // MOVD $16,R31 index register
+ // loop:
+ // LXVD2X (R0)(R4),VS32
+ // LXVD2X (R31)(R4),VS33
+ // ADD R4,$32 increment src
+ // STXVD2X VS32,(R0)(R3)
+ // STXVD2X VS33,(R31)(R3)
+ // ADD R3,$32 increment dst
+ // BC 16,0,loop branch ctr
+ // For this purpose, VS32 and VS33 are treated as
+ // scratch registers. Since regalloc does not
+ // track vector registers, even if it could be marked
+ // as clobbered it would have no effect.
+ // TODO: If vector registers are managed by regalloc
+ // mark these as clobbered.
+ //
+ // Bytes not moved by this loop are moved
+ // with a combination of the following instructions,
+ // starting with the largest sizes and generating as
+ // many as needed, using the appropriate offset value.
+ // MOVD n(R4),R14
+ // MOVD R14,n(R3)
+ // MOVW n1(R4),R14
+ // MOVW R14,n1(R3)
+ // MOVH n2(R4),R14
+ // MOVH R14,n2(R3)
+ // MOVB n3(R4),R14
+ // MOVB R14,n3(R3)
+
+ {
+ name: "LoweredMove",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R20"), buildReg("R21")},
+ clobbers: buildReg("R20 R21"),
+ },
+ clobberFlags: true,
+ typ: "Mem",
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ unsafePoint: true,
+ },
+ {
+ name: "LoweredMoveShort",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{gp, gp},
+ },
+ typ: "Mem",
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ unsafePoint: true,
+ },
+
+ // The following is similar to the LoweredMove, but uses
+ // LXV instead of LXVD2X, which does not require an index
+ // register and will do 4 in a loop instead of only 2.
+ {
+ name: "LoweredQuadMove",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R20"), buildReg("R21")},
+ clobbers: buildReg("R20 R21"),
+ },
+ clobberFlags: true,
+ typ: "Mem",
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ unsafePoint: true,
+ },
+
+ {
+ name: "LoweredQuadMoveShort",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{gp, gp},
+ },
+ typ: "Mem",
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ unsafePoint: true,
+ },
+
+ {name: "LoweredAtomicStore8", argLength: 3, reg: gpstore, typ: "Mem", aux: "Int64", faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicStore32", argLength: 3, reg: gpstore, typ: "Mem", aux: "Int64", faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicStore64", argLength: 3, reg: gpstore, typ: "Mem", aux: "Int64", faultOnNilArg0: true, hasSideEffects: true},
+
+ {name: "LoweredAtomicLoad8", argLength: 2, reg: gpload, typ: "UInt8", aux: "Int64", clobberFlags: true, faultOnNilArg0: true},
+ {name: "LoweredAtomicLoad32", argLength: 2, reg: gpload, typ: "UInt32", aux: "Int64", clobberFlags: true, faultOnNilArg0: true},
+ {name: "LoweredAtomicLoad64", argLength: 2, reg: gpload, typ: "Int64", aux: "Int64", clobberFlags: true, faultOnNilArg0: true},
+ {name: "LoweredAtomicLoadPtr", argLength: 2, reg: gpload, typ: "Int64", aux: "Int64", clobberFlags: true, faultOnNilArg0: true},
+
+ // atomic add32, 64
+ // LWSYNC
+ // LDAR (Rarg0), Rout
+ // ADD Rarg1, Rout
+ // STDCCC Rout, (Rarg0)
+ // BNE -3(PC)
+ // return new sum
+ {name: "LoweredAtomicAdd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicAdd64", argLength: 3, reg: gpxchg, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true},
+
+ // atomic exchange32, 64
+ // LWSYNC
+ // LDAR (Rarg0), Rout
+ // STDCCC Rarg1, (Rarg0)
+ // BNE -2(PC)
+ // ISYNC
+ // return old val
+ {name: "LoweredAtomicExchange32", argLength: 3, reg: gpxchg, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicExchange64", argLength: 3, reg: gpxchg, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true},
+
+ // atomic compare and swap.
+ // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory. auxint must be zero.
+ // if *arg0 == arg1 {
+ // *arg0 = arg2
+ // return (true, memory)
+ // } else {
+ // return (false, memory)
+ // }
+ // SYNC
+ // LDAR (Rarg0), Rtmp
+ // CMP Rarg1, Rtmp
+ // BNE 3(PC)
+ // STDCCC Rarg2, (Rarg0)
+ // BNE -4(PC)
+ // CBNZ Rtmp, -4(PC)
+ // CSET EQ, Rout
+ {name: "LoweredAtomicCas64", argLength: 4, reg: gpcas, resultNotInArgs: true, aux: "Int64", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicCas32", argLength: 4, reg: gpcas, resultNotInArgs: true, aux: "Int64", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true},
+
+ // atomic 8/32 and/or.
+ // *arg0 &= (|=) arg1. arg2=mem. returns memory. auxint must be zero.
+ // LBAR/LWAT (Rarg0), Rtmp
+ // AND/OR Rarg1, Rtmp
+ // STBCCC/STWCCC Rtmp, (Rarg0), Rtmp
+ // BNE Rtmp, -3(PC)
+ {name: "LoweredAtomicAnd8", argLength: 3, reg: gpstore, asm: "AND", faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicAnd32", argLength: 3, reg: gpstore, asm: "AND", faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicOr8", argLength: 3, reg: gpstore, asm: "OR", faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicOr32", argLength: 3, reg: gpstore, asm: "OR", faultOnNilArg0: true, hasSideEffects: true},
+
+ // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+ // It preserves R0 through R17 (except special registers R1, R2, R11, R12, R13), g, and its arguments R20 and R21,
+ // but may clobber anything else, including R31 (REGTMP).
+ {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("R20"), buildReg("R21")}, clobbers: (callerSave &^ buildReg("R0 R3 R4 R5 R6 R7 R8 R9 R10 R14 R15 R16 R17 R20 R21 g")) | buildReg("R31")}, clobberFlags: true, aux: "Sym", symEffect: "None"},
+
+ {name: "LoweredPubBarrier", argLength: 1, asm: "LWSYNC", hasSideEffects: true}, // Do data barrier. arg0=memory
+ // There are three of these functions so that they can have three different register inputs.
+ // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
+ // default registers to match so we don't need to copy registers around unnecessarily.
+ {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r5, r6}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r4, r5}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r3, r4}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+
+ // (InvertFlags (CMP a b)) == (CMP b a)
+ // So if we want (LessThan (CMP a b)) but we can't do that because a is a constant,
+ // then we do (LessThan (InvertFlags (CMP b a))) instead.
+ // Rewrites will convert this to (GreaterThan (CMP b a)).
+ // InvertFlags is a pseudo-op which can't appear in assembly output.
+ {name: "InvertFlags", argLength: 1}, // reverse direction of arg0
+
+ // Constant flag values. For any comparison, there are 3 possible
+ // outcomes: either the three from the signed total order (<,==,>)
+ // or the three from the unsigned total order, depending on which
+ // comparison operation was used (CMP or CMPU -- PPC is different from
+ // the other architectures, which have a single comparison producing
+ // both signed and unsigned comparison results.)
+
+ // These ops are for temporary use by rewrite rules. They
+ // cannot appear in the generated assembly.
+ {name: "FlagEQ"}, // equal
+ {name: "FlagLT"}, // signed < or unsigned <
+ {name: "FlagGT"}, // signed > or unsigned >
+ }
+
+ blocks := []blockData{
+ {name: "EQ", controls: 1},
+ {name: "NE", controls: 1},
+ {name: "LT", controls: 1},
+ {name: "LE", controls: 1},
+ {name: "GT", controls: 1},
+ {name: "GE", controls: 1},
+ {name: "FLT", controls: 1},
+ {name: "FLE", controls: 1},
+ {name: "FGT", controls: 1},
+ {name: "FGE", controls: 1},
+ }
+
+ archs = append(archs, arch{
+ name: "PPC64",
+ pkg: "cmd/internal/obj/ppc64",
+ genfile: "../../ppc64/ssa.go",
+ ops: ops,
+ blocks: blocks,
+ regnames: regNamesPPC64,
+ ParamIntRegNames: "R3 R4 R5 R6 R7 R8 R9 R10 R14 R15 R16 R17",
+ ParamFloatRegNames: "F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12",
+ gpregmask: gp,
+ fpregmask: fp,
+ specialregmask: xer,
+ framepointerreg: -1,
+ linkreg: -1, // not used
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/_gen/PPC64latelower.rules b/src/cmd/compile/internal/ssa/_gen/PPC64latelower.rules
new file mode 100644
index 0000000..ada97b2
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/PPC64latelower.rules
@@ -0,0 +1,10 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file contains rules used by the late lower pass.
+
+// Simplify ISEL x $0 z into ISELZ
+(ISEL [a] x (MOVDconst [0]) z) => (ISELZ [a] x z)
+// Simplify ISEL $0 y z into ISELZ by inverting comparison and reversing arguments.
+(ISEL [a] (MOVDconst [0]) y z) => (ISELZ [a^0x4] y z)
diff --git a/src/cmd/compile/internal/ssa/_gen/README b/src/cmd/compile/internal/ssa/_gen/README
new file mode 100644
index 0000000..0c7ceba
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/README
@@ -0,0 +1,7 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+This package generates opcode tables, rewrite rules, etc. for the ssa compiler.
+Run it with go-1.13 (or above):
+ go run .
diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules
new file mode 100644
index 0000000..59f71be
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules
@@ -0,0 +1,845 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Lowering arithmetic
+(Add64 ...) => (ADD ...)
+(AddPtr ...) => (ADD ...)
+(Add32 ...) => (ADD ...)
+(Add16 ...) => (ADD ...)
+(Add8 ...) => (ADD ...)
+(Add32F ...) => (FADDS ...)
+(Add64F ...) => (FADDD ...)
+
+(Sub64 ...) => (SUB ...)
+(SubPtr ...) => (SUB ...)
+(Sub32 ...) => (SUB ...)
+(Sub16 ...) => (SUB ...)
+(Sub8 ...) => (SUB ...)
+(Sub32F ...) => (FSUBS ...)
+(Sub64F ...) => (FSUBD ...)
+
+(Mul64 ...) => (MUL ...)
+(Mul64uhilo ...) => (LoweredMuluhilo ...)
+(Mul64uover ...) => (LoweredMuluover ...)
+(Mul32 ...) => (MULW ...)
+(Mul16 x y) => (MULW (SignExt16to32 x) (SignExt16to32 y))
+(Mul8 x y) => (MULW (SignExt8to32 x) (SignExt8to32 y))
+(Mul32F ...) => (FMULS ...)
+(Mul64F ...) => (FMULD ...)
+
+(Div32F ...) => (FDIVS ...)
+(Div64F ...) => (FDIVD ...)
+
+(Div64 x y [false]) => (DIV x y)
+(Div64u ...) => (DIVU ...)
+(Div32 x y [false]) => (DIVW x y)
+(Div32u ...) => (DIVUW ...)
+(Div16 x y [false]) => (DIVW (SignExt16to32 x) (SignExt16to32 y))
+(Div16u x y) => (DIVUW (ZeroExt16to32 x) (ZeroExt16to32 y))
+(Div8 x y) => (DIVW (SignExt8to32 x) (SignExt8to32 y))
+(Div8u x y) => (DIVUW (ZeroExt8to32 x) (ZeroExt8to32 y))
+
+(Hmul64 ...) => (MULH ...)
+(Hmul64u ...) => (MULHU ...)
+(Hmul32 x y) => (SRAI [32] (MUL (SignExt32to64 x) (SignExt32to64 y)))
+(Hmul32u x y) => (SRLI [32] (MUL (ZeroExt32to64 x) (ZeroExt32to64 y)))
+
+(Select0 (Add64carry x y c)) => (ADD (ADD <typ.UInt64> x y) c)
+(Select1 (Add64carry x y c)) =>
+ (OR (SLTU <typ.UInt64> s:(ADD <typ.UInt64> x y) x) (SLTU <typ.UInt64> (ADD <typ.UInt64> s c) s))
+
+(Select0 (Sub64borrow x y c)) => (SUB (SUB <typ.UInt64> x y) c)
+(Select1 (Sub64borrow x y c)) =>
+ (OR (SLTU <typ.UInt64> x s:(SUB <typ.UInt64> x y)) (SLTU <typ.UInt64> s (SUB <typ.UInt64> s c)))
+
+// (x + y) / 2 => (x / 2) + (y / 2) + (x & y & 1)
+(Avg64u <t> x y) => (ADD (ADD <t> (SRLI <t> [1] x) (SRLI <t> [1] y)) (ANDI <t> [1] (AND <t> x y)))
+
+(Mod64 x y [false]) => (REM x y)
+(Mod64u ...) => (REMU ...)
+(Mod32 x y [false]) => (REMW x y)
+(Mod32u ...) => (REMUW ...)
+(Mod16 x y [false]) => (REMW (SignExt16to32 x) (SignExt16to32 y))
+(Mod16u x y) => (REMUW (ZeroExt16to32 x) (ZeroExt16to32 y))
+(Mod8 x y) => (REMW (SignExt8to32 x) (SignExt8to32 y))
+(Mod8u x y) => (REMUW (ZeroExt8to32 x) (ZeroExt8to32 y))
+
+(And64 ...) => (AND ...)
+(And32 ...) => (AND ...)
+(And16 ...) => (AND ...)
+(And8 ...) => (AND ...)
+
+(Or64 ...) => (OR ...)
+(Or32 ...) => (OR ...)
+(Or16 ...) => (OR ...)
+(Or8 ...) => (OR ...)
+
+(Xor64 ...) => (XOR ...)
+(Xor32 ...) => (XOR ...)
+(Xor16 ...) => (XOR ...)
+(Xor8 ...) => (XOR ...)
+
+(Neg64 ...) => (NEG ...)
+(Neg32 ...) => (NEG ...)
+(Neg16 ...) => (NEG ...)
+(Neg8 ...) => (NEG ...)
+(Neg32F ...) => (FNEGS ...)
+(Neg64F ...) => (FNEGD ...)
+
+(Com64 ...) => (NOT ...)
+(Com32 ...) => (NOT ...)
+(Com16 ...) => (NOT ...)
+(Com8 ...) => (NOT ...)
+
+(Sqrt ...) => (FSQRTD ...)
+(Sqrt32 ...) => (FSQRTS ...)
+
+(Copysign ...) => (FSGNJD ...)
+
+(Abs ...) => (FABSD ...)
+
+(FMA ...) => (FMADDD ...)
+
+// Sign and zero extension.
+
+(SignExt8to16 ...) => (MOVBreg ...)
+(SignExt8to32 ...) => (MOVBreg ...)
+(SignExt8to64 ...) => (MOVBreg ...)
+(SignExt16to32 ...) => (MOVHreg ...)
+(SignExt16to64 ...) => (MOVHreg ...)
+(SignExt32to64 ...) => (MOVWreg ...)
+
+(ZeroExt8to16 ...) => (MOVBUreg ...)
+(ZeroExt8to32 ...) => (MOVBUreg ...)
+(ZeroExt8to64 ...) => (MOVBUreg ...)
+(ZeroExt16to32 ...) => (MOVHUreg ...)
+(ZeroExt16to64 ...) => (MOVHUreg ...)
+(ZeroExt32to64 ...) => (MOVWUreg ...)
+
+(Cvt32to32F ...) => (FCVTSW ...)
+(Cvt32to64F ...) => (FCVTDW ...)
+(Cvt64to32F ...) => (FCVTSL ...)
+(Cvt64to64F ...) => (FCVTDL ...)
+
+(Cvt32Fto32 ...) => (FCVTWS ...)
+(Cvt32Fto64 ...) => (FCVTLS ...)
+(Cvt64Fto32 ...) => (FCVTWD ...)
+(Cvt64Fto64 ...) => (FCVTLD ...)
+
+(Cvt32Fto64F ...) => (FCVTDS ...)
+(Cvt64Fto32F ...) => (FCVTSD ...)
+
+(CvtBoolToUint8 ...) => (Copy ...)
+
+(Round32F ...) => (Copy ...)
+(Round64F ...) => (Copy ...)
+
+(Slicemask <t> x) => (SRAI [63] (NEG <t> x))
+
+// Truncations
+// We ignore the unused high parts of registers, so truncates are just copies.
+(Trunc16to8 ...) => (Copy ...)
+(Trunc32to8 ...) => (Copy ...)
+(Trunc32to16 ...) => (Copy ...)
+(Trunc64to8 ...) => (Copy ...)
+(Trunc64to16 ...) => (Copy ...)
+(Trunc64to32 ...) => (Copy ...)
+
+// Shifts
+
+// SLL only considers the bottom 6 bits of y. If y >= 64, the result should
+// always be 0.
+//
+// Breaking down the operation:
+//
+// (SLL x y) generates x << (y & 63).
+//
+// If y < 64, this is the value we want. Otherwise, we want zero.
+//
+// So, we AND with -1 * uint64(y < 64), which is 0xfffff... if y < 64 and 0 otherwise.
+(Lsh8x8 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
+(Lsh8x16 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
+(Lsh8x32 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
+(Lsh8x64 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg8 <t> (SLTIU <t> [64] y)))
+(Lsh16x8 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
+(Lsh16x16 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
+(Lsh16x32 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
+(Lsh16x64 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] y)))
+(Lsh32x8 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
+(Lsh32x16 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
+(Lsh32x32 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
+(Lsh32x64 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] y)))
+(Lsh64x8 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
+(Lsh64x16 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
+(Lsh64x32 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
+(Lsh64x64 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] y)))
+
+(Lsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SLL x y)
+(Lsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SLL x y)
+(Lsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SLL x y)
+(Lsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SLL x y)
+
+// SRL only considers the bottom 6 bits of y. If y >= 64, the result should
+// always be 0. See Lsh above for a detailed description.
+(Rsh8Ux8 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt8to64 x) y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
+(Rsh8Ux16 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt8to64 x) y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
+(Rsh8Ux32 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt8to64 x) y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
+(Rsh8Ux64 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt8to64 x) y) (Neg8 <t> (SLTIU <t> [64] y)))
+(Rsh16Ux8 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
+(Rsh16Ux16 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
+(Rsh16Ux32 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
+(Rsh16Ux64 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] y)))
+(Rsh32Ux8 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
+(Rsh32Ux16 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
+(Rsh32Ux32 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
+(Rsh32Ux64 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] y)))
+(Rsh64Ux8 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
+(Rsh64Ux16 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
+(Rsh64Ux32 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
+(Rsh64Ux64 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> x y) (Neg64 <t> (SLTIU <t> [64] y)))
+
+(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL (ZeroExt8to64 x) y)
+(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL (ZeroExt16to64 x) y)
+(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL (ZeroExt32to64 x) y)
+(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL x y)
+
+// SRA only considers the bottom 6 bits of y. If y >= 64, the result should
+// be either 0 or -1 based on the sign bit.
+//
+// We implement this by performing the max shift (-1) if y >= 64.
+//
+// We OR (uint64(y < 64) - 1) into y before passing it to SRA. This leaves
+// us with -1 (0xffff...) if y >= 64.
+//
+// We don't need to sign-extend the OR result, as it will be at minimum 8 bits,
+// more than the 6 bits SRA cares about.
+(Rsh8x8 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt8to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64 y)))))
+(Rsh8x16 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt8to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y)))))
+(Rsh8x32 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt8to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y)))))
+(Rsh8x64 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt8to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y))))
+(Rsh16x8 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64 y)))))
+(Rsh16x16 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y)))))
+(Rsh16x32 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y)))))
+(Rsh16x64 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y))))
+(Rsh32x8 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64 y)))))
+(Rsh32x16 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y)))))
+(Rsh32x32 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y)))))
+(Rsh32x64 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y))))
+(Rsh64x8 <t> x y) && !shiftIsBounded(v) => (SRA <t> x (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64 y)))))
+(Rsh64x16 <t> x y) && !shiftIsBounded(v) => (SRA <t> x (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y)))))
+(Rsh64x32 <t> x y) && !shiftIsBounded(v) => (SRA <t> x (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y)))))
+(Rsh64x64 <t> x y) && !shiftIsBounded(v) => (SRA <t> x (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y))))
+
+(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA (SignExt8to64 x) y)
+(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA (SignExt16to64 x) y)
+(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA (SignExt32to64 x) y)
+(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA x y)
+
+// Rotates.
+(RotateLeft8 <t> x (MOVDconst [c])) => (Or8 (Lsh8x64 <t> x (MOVDconst [c&7])) (Rsh8Ux64 <t> x (MOVDconst [-c&7])))
+(RotateLeft16 <t> x (MOVDconst [c])) => (Or16 (Lsh16x64 <t> x (MOVDconst [c&15])) (Rsh16Ux64 <t> x (MOVDconst [-c&15])))
+(RotateLeft32 <t> x (MOVDconst [c])) => (Or32 (Lsh32x64 <t> x (MOVDconst [c&31])) (Rsh32Ux64 <t> x (MOVDconst [-c&31])))
+(RotateLeft64 <t> x (MOVDconst [c])) => (Or64 (Lsh64x64 <t> x (MOVDconst [c&63])) (Rsh64Ux64 <t> x (MOVDconst [-c&63])))
+
+(Less64 ...) => (SLT ...)
+(Less32 x y) => (SLT (SignExt32to64 x) (SignExt32to64 y))
+(Less16 x y) => (SLT (SignExt16to64 x) (SignExt16to64 y))
+(Less8 x y) => (SLT (SignExt8to64 x) (SignExt8to64 y))
+(Less64U ...) => (SLTU ...)
+(Less32U x y) => (SLTU (ZeroExt32to64 x) (ZeroExt32to64 y))
+(Less16U x y) => (SLTU (ZeroExt16to64 x) (ZeroExt16to64 y))
+(Less8U x y) => (SLTU (ZeroExt8to64 x) (ZeroExt8to64 y))
+(Less64F ...) => (FLTD ...)
+(Less32F ...) => (FLTS ...)
+
+// Convert x <= y to !(y < x).
+(Leq64 x y) => (Not (Less64 y x))
+(Leq32 x y) => (Not (Less32 y x))
+(Leq16 x y) => (Not (Less16 y x))
+(Leq8 x y) => (Not (Less8 y x))
+(Leq64U x y) => (Not (Less64U y x))
+(Leq32U x y) => (Not (Less32U y x))
+(Leq16U x y) => (Not (Less16U y x))
+(Leq8U x y) => (Not (Less8U y x))
+(Leq64F ...) => (FLED ...)
+(Leq32F ...) => (FLES ...)
+
+(EqPtr x y) => (SEQZ (SUB <typ.Uintptr> x y))
+(Eq64 x y) => (SEQZ (SUB <x.Type> x y))
+(Eq32 x y) => (SEQZ (SUB <x.Type> (ZeroExt32to64 x) (ZeroExt32to64 y)))
+(Eq16 x y) => (SEQZ (SUB <x.Type> (ZeroExt16to64 x) (ZeroExt16to64 y)))
+(Eq8 x y) => (SEQZ (SUB <x.Type> (ZeroExt8to64 x) (ZeroExt8to64 y)))
+(Eq64F ...) => (FEQD ...)
+(Eq32F ...) => (FEQS ...)
+
+(NeqPtr x y) => (SNEZ (SUB <typ.Uintptr> x y))
+(Neq64 x y) => (SNEZ (SUB <x.Type> x y))
+(Neq32 x y) => (SNEZ (SUB <x.Type> (ZeroExt32to64 x) (ZeroExt32to64 y)))
+(Neq16 x y) => (SNEZ (SUB <x.Type> (ZeroExt16to64 x) (ZeroExt16to64 y)))
+(Neq8 x y) => (SNEZ (SUB <x.Type> (ZeroExt8to64 x) (ZeroExt8to64 y)))
+(Neq64F ...) => (FNED ...)
+(Neq32F ...) => (FNES ...)
+
+// Loads
+(Load <t> ptr mem) && t.IsBoolean() => (MOVBUload ptr mem)
+(Load <t> ptr mem) && ( is8BitInt(t) && isSigned(t)) => (MOVBload ptr mem)
+(Load <t> ptr mem) && ( is8BitInt(t) && !isSigned(t)) => (MOVBUload ptr mem)
+(Load <t> ptr mem) && (is16BitInt(t) && isSigned(t)) => (MOVHload ptr mem)
+(Load <t> ptr mem) && (is16BitInt(t) && !isSigned(t)) => (MOVHUload ptr mem)
+(Load <t> ptr mem) && (is32BitInt(t) && isSigned(t)) => (MOVWload ptr mem)
+(Load <t> ptr mem) && (is32BitInt(t) && !isSigned(t)) => (MOVWUload ptr mem)
+(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVDload ptr mem)
+(Load <t> ptr mem) && is32BitFloat(t) => (FMOVWload ptr mem)
+(Load <t> ptr mem) && is64BitFloat(t) => (FMOVDload ptr mem)
+
+// Stores
+(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && !is32BitFloat(val.Type) => (MOVWstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 && !is64BitFloat(val.Type) => (MOVDstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (FMOVWstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (FMOVDstore ptr val mem)
+
+// We need to fold MOVaddr into the LD/MOVDstore ops so that the live variable analysis
+// knows what variables are being read/written by the ops.
+(MOVBUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVBUload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVBload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVBload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVHUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVHUload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVHload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVHload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVWUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVWUload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVWload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVWload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVDload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVDload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+
+(MOVBstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOVHstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVHstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOVWstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOVDstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOVBstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVBstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVHstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVWstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVDstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVDstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+
+(MOVBUload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) =>
+ (MOVBUload [off1+int32(off2)] {sym} base mem)
+(MOVBload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) =>
+ (MOVBload [off1+int32(off2)] {sym} base mem)
+(MOVHUload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) =>
+ (MOVHUload [off1+int32(off2)] {sym} base mem)
+(MOVHload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) =>
+ (MOVHload [off1+int32(off2)] {sym} base mem)
+(MOVWUload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) =>
+ (MOVWUload [off1+int32(off2)] {sym} base mem)
+(MOVWload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) =>
+ (MOVWload [off1+int32(off2)] {sym} base mem)
+(MOVDload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) =>
+ (MOVDload [off1+int32(off2)] {sym} base mem)
+
+(MOVBstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) =>
+ (MOVBstore [off1+int32(off2)] {sym} base val mem)
+(MOVHstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) =>
+ (MOVHstore [off1+int32(off2)] {sym} base val mem)
+(MOVWstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) =>
+ (MOVWstore [off1+int32(off2)] {sym} base val mem)
+(MOVDstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) =>
+ (MOVDstore [off1+int32(off2)] {sym} base val mem)
+(MOVBstorezero [off1] {sym} (ADDI [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVBstorezero [off1+int32(off2)] {sym} ptr mem)
+(MOVHstorezero [off1] {sym} (ADDI [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVHstorezero [off1+int32(off2)] {sym} ptr mem)
+(MOVWstorezero [off1] {sym} (ADDI [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVWstorezero [off1+int32(off2)] {sym} ptr mem)
+(MOVDstorezero [off1] {sym} (ADDI [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVDstorezero [off1+int32(off2)] {sym} ptr mem)
+
+// Similarly, fold ADDI into MOVaddr to avoid confusing live variable analysis
+// with OffPtr -> ADDI.
+(ADDI [c] (MOVaddr [d] {s} x)) && is32Bit(c+int64(d)) => (MOVaddr [int32(c)+d] {s} x)
+
+// Small zeroing
+(Zero [0] _ mem) => mem
+(Zero [1] ptr mem) => (MOVBstore ptr (MOVDconst [0]) mem)
+(Zero [2] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore ptr (MOVDconst [0]) mem)
+(Zero [2] ptr mem) =>
+ (MOVBstore [1] ptr (MOVDconst [0])
+ (MOVBstore ptr (MOVDconst [0]) mem))
+(Zero [4] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore ptr (MOVDconst [0]) mem)
+(Zero [4] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [2] ptr (MOVDconst [0])
+ (MOVHstore ptr (MOVDconst [0]) mem))
+(Zero [4] ptr mem) =>
+ (MOVBstore [3] ptr (MOVDconst [0])
+ (MOVBstore [2] ptr (MOVDconst [0])
+ (MOVBstore [1] ptr (MOVDconst [0])
+ (MOVBstore ptr (MOVDconst [0]) mem))))
+(Zero [8] {t} ptr mem) && t.Alignment()%8 == 0 =>
+ (MOVDstore ptr (MOVDconst [0]) mem)
+(Zero [8] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [4] ptr (MOVDconst [0])
+ (MOVWstore ptr (MOVDconst [0]) mem))
+(Zero [8] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [6] ptr (MOVDconst [0])
+ (MOVHstore [4] ptr (MOVDconst [0])
+ (MOVHstore [2] ptr (MOVDconst [0])
+ (MOVHstore ptr (MOVDconst [0]) mem))))
+
+(Zero [3] ptr mem) =>
+ (MOVBstore [2] ptr (MOVDconst [0])
+ (MOVBstore [1] ptr (MOVDconst [0])
+ (MOVBstore ptr (MOVDconst [0]) mem)))
+(Zero [6] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [4] ptr (MOVDconst [0])
+ (MOVHstore [2] ptr (MOVDconst [0])
+ (MOVHstore ptr (MOVDconst [0]) mem)))
+(Zero [12] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [8] ptr (MOVDconst [0])
+ (MOVWstore [4] ptr (MOVDconst [0])
+ (MOVWstore ptr (MOVDconst [0]) mem)))
+(Zero [16] {t} ptr mem) && t.Alignment()%8 == 0 =>
+ (MOVDstore [8] ptr (MOVDconst [0])
+ (MOVDstore ptr (MOVDconst [0]) mem))
+(Zero [24] {t} ptr mem) && t.Alignment()%8 == 0 =>
+ (MOVDstore [16] ptr (MOVDconst [0])
+ (MOVDstore [8] ptr (MOVDconst [0])
+ (MOVDstore ptr (MOVDconst [0]) mem)))
+(Zero [32] {t} ptr mem) && t.Alignment()%8 == 0 =>
+ (MOVDstore [24] ptr (MOVDconst [0])
+ (MOVDstore [16] ptr (MOVDconst [0])
+ (MOVDstore [8] ptr (MOVDconst [0])
+ (MOVDstore ptr (MOVDconst [0]) mem))))
+
+// Medium 8-aligned zeroing uses a Duff's device
+// 8 and 128 are magic constants, see runtime/mkduff.go
+(Zero [s] {t} ptr mem)
+ && s%8 == 0 && s <= 8*128
+ && t.Alignment()%8 == 0 && !config.noDuffDevice =>
+ (DUFFZERO [8 * (128 - s/8)] ptr mem)
+
+// Generic zeroing uses a loop
+(Zero [s] {t} ptr mem) =>
+ (LoweredZero [t.Alignment()]
+ ptr
+ (ADD <ptr.Type> ptr (MOVDconst [s-moveSize(t.Alignment(), config)]))
+ mem)
+
+(Convert ...) => (MOVconvert ...)
+
+// Checks
+(IsNonNil ...) => (SNEZ ...)
+(IsInBounds ...) => (Less64U ...)
+(IsSliceInBounds ...) => (Leq64U ...)
+
+// Trivial lowering
+(NilCheck ...) => (LoweredNilCheck ...)
+(GetClosurePtr ...) => (LoweredGetClosurePtr ...)
+(GetCallerSP ...) => (LoweredGetCallerSP ...)
+(GetCallerPC ...) => (LoweredGetCallerPC ...)
+
+// Write barrier.
+(WB ...) => (LoweredWB ...)
+
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem)
+
+// Small moves
+(Move [0] _ _ mem) => mem
+(Move [1] dst src mem) => (MOVBstore dst (MOVBload src mem) mem)
+(Move [2] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore dst (MOVHload src mem) mem)
+(Move [2] dst src mem) =>
+ (MOVBstore [1] dst (MOVBload [1] src mem)
+ (MOVBstore dst (MOVBload src mem) mem))
+(Move [4] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore dst (MOVWload src mem) mem)
+(Move [4] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [2] dst (MOVHload [2] src mem)
+ (MOVHstore dst (MOVHload src mem) mem))
+(Move [4] dst src mem) =>
+ (MOVBstore [3] dst (MOVBload [3] src mem)
+ (MOVBstore [2] dst (MOVBload [2] src mem)
+ (MOVBstore [1] dst (MOVBload [1] src mem)
+ (MOVBstore dst (MOVBload src mem) mem))))
+(Move [8] {t} dst src mem) && t.Alignment()%8 == 0 =>
+ (MOVDstore dst (MOVDload src mem) mem)
+(Move [8] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [4] dst (MOVWload [4] src mem)
+ (MOVWstore dst (MOVWload src mem) mem))
+(Move [8] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [6] dst (MOVHload [6] src mem)
+ (MOVHstore [4] dst (MOVHload [4] src mem)
+ (MOVHstore [2] dst (MOVHload [2] src mem)
+ (MOVHstore dst (MOVHload src mem) mem))))
+
+(Move [3] dst src mem) =>
+ (MOVBstore [2] dst (MOVBload [2] src mem)
+ (MOVBstore [1] dst (MOVBload [1] src mem)
+ (MOVBstore dst (MOVBload src mem) mem)))
+(Move [6] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [4] dst (MOVHload [4] src mem)
+ (MOVHstore [2] dst (MOVHload [2] src mem)
+ (MOVHstore dst (MOVHload src mem) mem)))
+(Move [12] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [8] dst (MOVWload [8] src mem)
+ (MOVWstore [4] dst (MOVWload [4] src mem)
+ (MOVWstore dst (MOVWload src mem) mem)))
+(Move [16] {t} dst src mem) && t.Alignment()%8 == 0 =>
+ (MOVDstore [8] dst (MOVDload [8] src mem)
+ (MOVDstore dst (MOVDload src mem) mem))
+(Move [24] {t} dst src mem) && t.Alignment()%8 == 0 =>
+ (MOVDstore [16] dst (MOVDload [16] src mem)
+ (MOVDstore [8] dst (MOVDload [8] src mem)
+ (MOVDstore dst (MOVDload src mem) mem)))
+(Move [32] {t} dst src mem) && t.Alignment()%8 == 0 =>
+ (MOVDstore [24] dst (MOVDload [24] src mem)
+ (MOVDstore [16] dst (MOVDload [16] src mem)
+ (MOVDstore [8] dst (MOVDload [8] src mem)
+ (MOVDstore dst (MOVDload src mem) mem))))
+
+// Medium 8-aligned move uses a Duff's device
+// 16 and 128 are magic constants, see runtime/mkduff.go
+(Move [s] {t} dst src mem)
+ && s%8 == 0 && s <= 8*128 && t.Alignment()%8 == 0
+ && !config.noDuffDevice && logLargeCopy(v, s) =>
+ (DUFFCOPY [16 * (128 - s/8)] dst src mem)
+
+// Generic move uses a loop
+(Move [s] {t} dst src mem) && (s <= 16 || logLargeCopy(v, s)) =>
+ (LoweredMove [t.Alignment()]
+ dst
+ src
+ (ADDI <src.Type> [s-moveSize(t.Alignment(), config)] src)
+ mem)
+
+// Boolean ops; 0=false, 1=true
+(AndB ...) => (AND ...)
+(OrB ...) => (OR ...)
+(EqB x y) => (SEQZ (SUB <typ.Bool> x y))
+(NeqB x y) => (SNEZ (SUB <typ.Bool> x y))
+(Not ...) => (SEQZ ...)
+
+// Lowering pointer arithmetic
+// TODO: Special handling for SP offsets, like ARM
+(OffPtr [off] ptr:(SP)) && is32Bit(off) => (MOVaddr [int32(off)] ptr)
+(OffPtr [off] ptr) && is32Bit(off) => (ADDI [off] ptr)
+(OffPtr [off] ptr) => (ADD (MOVDconst [off]) ptr)
+
+(Const8 [val]) => (MOVDconst [int64(val)])
+(Const16 [val]) => (MOVDconst [int64(val)])
+(Const32 [val]) => (MOVDconst [int64(val)])
+(Const64 [val]) => (MOVDconst [int64(val)])
+(Const32F [val]) => (FMVSX (MOVDconst [int64(math.Float32bits(val))]))
+(Const64F [val]) => (FMVDX (MOVDconst [int64(math.Float64bits(val))]))
+(ConstNil) => (MOVDconst [0])
+(ConstBool [val]) => (MOVDconst [int64(b2i(val))])
+
+(Addr {sym} base) => (MOVaddr {sym} [0] base)
+(LocalAddr {sym} base _) => (MOVaddr {sym} base)
+
+// Calls
+(StaticCall ...) => (CALLstatic ...)
+(ClosureCall ...) => (CALLclosure ...)
+(InterCall ...) => (CALLinter ...)
+(TailCall ...) => (CALLtail ...)
+
+// Atomic Intrinsics
+(AtomicLoad8 ...) => (LoweredAtomicLoad8 ...)
+(AtomicLoad32 ...) => (LoweredAtomicLoad32 ...)
+(AtomicLoad64 ...) => (LoweredAtomicLoad64 ...)
+(AtomicLoadPtr ...) => (LoweredAtomicLoad64 ...)
+
+(AtomicStore8 ...) => (LoweredAtomicStore8 ...)
+(AtomicStore32 ...) => (LoweredAtomicStore32 ...)
+(AtomicStore64 ...) => (LoweredAtomicStore64 ...)
+(AtomicStorePtrNoWB ...) => (LoweredAtomicStore64 ...)
+
+(AtomicAdd32 ...) => (LoweredAtomicAdd32 ...)
+(AtomicAdd64 ...) => (LoweredAtomicAdd64 ...)
+
+// AtomicAnd8(ptr,val) => LoweredAtomicAnd32(ptr&^3, ^((uint8(val) ^ 0xff) << ((ptr & 3) * 8)))
+(AtomicAnd8 ptr val mem) =>
+ (LoweredAtomicAnd32 (ANDI <typ.Uintptr> [^3] ptr)
+ (NOT <typ.UInt32> (SLL <typ.UInt32> (XORI <typ.UInt32> [0xff] (ZeroExt8to32 val))
+ (SLLI <typ.UInt64> [3] (ANDI <typ.UInt64> [3] ptr)))) mem)
+
+(AtomicAnd32 ...) => (LoweredAtomicAnd32 ...)
+
+(AtomicCompareAndSwap32 ptr old new mem) => (LoweredAtomicCas32 ptr (SignExt32to64 old) new mem)
+(AtomicCompareAndSwap64 ...) => (LoweredAtomicCas64 ...)
+
+(AtomicExchange32 ...) => (LoweredAtomicExchange32 ...)
+(AtomicExchange64 ...) => (LoweredAtomicExchange64 ...)
+
+// AtomicOr8(ptr,val) => LoweredAtomicOr32(ptr&^3, uint32(val)<<((ptr&3)*8))
+(AtomicOr8 ptr val mem) =>
+ (LoweredAtomicOr32 (ANDI <typ.Uintptr> [^3] ptr)
+ (SLL <typ.UInt32> (ZeroExt8to32 val)
+ (SLLI <typ.UInt64> [3] (ANDI <typ.UInt64> [3] ptr))) mem)
+
+(AtomicOr32 ...) => (LoweredAtomicOr32 ...)
+
+// Conditional branches
+(If cond yes no) => (BNEZ (MOVBUreg <typ.UInt64> cond) yes no)
+
+// Optimizations
+
+// Absorb SEQZ/SNEZ into branch.
+(BEQZ (SEQZ x) yes no) => (BNEZ x yes no)
+(BEQZ (SNEZ x) yes no) => (BEQZ x yes no)
+(BNEZ (SEQZ x) yes no) => (BEQZ x yes no)
+(BNEZ (SNEZ x) yes no) => (BNEZ x yes no)
+
+// Remove redundant NEG from BEQZ/BNEZ.
+(BEQZ (NEG x) yes no) => (BEQZ x yes no)
+(BNEZ (NEG x) yes no) => (BNEZ x yes no)
+
+// Negate comparison with FNES/FNED.
+(BEQZ (FNES <t> x y) yes no) => (BNEZ (FEQS <t> x y) yes no)
+(BNEZ (FNES <t> x y) yes no) => (BEQZ (FEQS <t> x y) yes no)
+(BEQZ (FNED <t> x y) yes no) => (BNEZ (FEQD <t> x y) yes no)
+(BNEZ (FNED <t> x y) yes no) => (BEQZ (FEQD <t> x y) yes no)
+
+// Convert BEQZ/BNEZ into more optimal branch conditions.
+(BEQZ (SUB x y) yes no) => (BEQ x y yes no)
+(BNEZ (SUB x y) yes no) => (BNE x y yes no)
+(BEQZ (SLT x y) yes no) => (BGE x y yes no)
+(BNEZ (SLT x y) yes no) => (BLT x y yes no)
+(BEQZ (SLTU x y) yes no) => (BGEU x y yes no)
+(BNEZ (SLTU x y) yes no) => (BLTU x y yes no)
+(BEQZ (SLTI [x] y) yes no) => (BGE y (MOVDconst [x]) yes no)
+(BNEZ (SLTI [x] y) yes no) => (BLT y (MOVDconst [x]) yes no)
+(BEQZ (SLTIU [x] y) yes no) => (BGEU y (MOVDconst [x]) yes no)
+(BNEZ (SLTIU [x] y) yes no) => (BLTU y (MOVDconst [x]) yes no)
+
+// Convert branch with zero to more optimal branch zero.
+(BEQ (MOVDconst [0]) cond yes no) => (BEQZ cond yes no)
+(BEQ cond (MOVDconst [0]) yes no) => (BEQZ cond yes no)
+(BNE (MOVDconst [0]) cond yes no) => (BNEZ cond yes no)
+(BNE cond (MOVDconst [0]) yes no) => (BNEZ cond yes no)
+(BLT (MOVDconst [0]) cond yes no) => (BGTZ cond yes no)
+(BLT cond (MOVDconst [0]) yes no) => (BLTZ cond yes no)
+(BGE (MOVDconst [0]) cond yes no) => (BLEZ cond yes no)
+(BGE cond (MOVDconst [0]) yes no) => (BGEZ cond yes no)
+
+// Remove redundant NEG from SEQZ/SNEZ.
+(SEQZ (NEG x)) => (SEQZ x)
+(SNEZ (NEG x)) => (SNEZ x)
+
+// Remove redundant SEQZ/SNEZ.
+(SEQZ (SEQZ x)) => (SNEZ x)
+(SEQZ (SNEZ x)) => (SEQZ x)
+(SNEZ (SEQZ x)) => (SEQZ x)
+(SNEZ (SNEZ x)) => (SNEZ x)
+
+// Store zero.
+(MOVBstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVBstorezero [off] {sym} ptr mem)
+(MOVHstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVHstorezero [off] {sym} ptr mem)
+(MOVWstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVWstorezero [off] {sym} ptr mem)
+(MOVDstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVDstorezero [off] {sym} ptr mem)
+
+// Boolean ops are already extended.
+(MOVBUreg x:((FLES|FLTS|FEQS|FNES) _ _)) => x
+(MOVBUreg x:((FLED|FLTD|FEQD|FNED) _ _)) => x
+(MOVBUreg x:((SEQZ|SNEZ) _)) => x
+(MOVBUreg x:((SLT|SLTU) _ _)) => x
+
+// Avoid extending when already sufficiently masked.
+(MOVBreg x:(ANDI [c] y)) && c >= 0 && int64(int8(c)) == c => x
+(MOVHreg x:(ANDI [c] y)) && c >= 0 && int64(int16(c)) == c => x
+(MOVWreg x:(ANDI [c] y)) && c >= 0 && int64(int32(c)) == c => x
+(MOVBUreg x:(ANDI [c] y)) && c >= 0 && int64(uint8(c)) == c => x
+(MOVHUreg x:(ANDI [c] y)) && c >= 0 && int64(uint16(c)) == c => x
+(MOVWUreg x:(ANDI [c] y)) && c >= 0 && int64(uint32(c)) == c => x
+
+// Combine masking and zero extension.
+(MOVBUreg (ANDI [c] x)) && c < 0 => (ANDI [int64(uint8(c))] x)
+(MOVHUreg (ANDI [c] x)) && c < 0 => (ANDI [int64(uint16(c))] x)
+(MOVWUreg (ANDI [c] x)) && c < 0 => (AND (MOVDconst [int64(uint32(c))]) x)
+
+// Avoid sign/zero extension for consts.
+(MOVBreg (MOVDconst [c])) => (MOVDconst [int64(int8(c))])
+(MOVHreg (MOVDconst [c])) => (MOVDconst [int64(int16(c))])
+(MOVWreg (MOVDconst [c])) => (MOVDconst [int64(int32(c))])
+(MOVBUreg (MOVDconst [c])) => (MOVDconst [int64(uint8(c))])
+(MOVHUreg (MOVDconst [c])) => (MOVDconst [int64(uint16(c))])
+(MOVWUreg (MOVDconst [c])) => (MOVDconst [int64(uint32(c))])
+
+// Avoid sign/zero extension after properly typed load.
+(MOVBreg x:(MOVBload _ _)) => (MOVDreg x)
+(MOVHreg x:(MOVBload _ _)) => (MOVDreg x)
+(MOVHreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVHreg x:(MOVHload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVBload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVHload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVHUload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVWload _ _)) => (MOVDreg x)
+(MOVBUreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVHUreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVHUreg x:(MOVHUload _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVHUload _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVWUload _ _)) => (MOVDreg x)
+
+// Avoid zero extension after properly typed atomic operation.
+(MOVBUreg x:(Select0 (LoweredAtomicLoad8 _ _))) => (MOVDreg x)
+(MOVBUreg x:(Select0 (LoweredAtomicCas32 _ _ _ _))) => (MOVDreg x)
+(MOVBUreg x:(Select0 (LoweredAtomicCas64 _ _ _ _))) => (MOVDreg x)
+
+// Avoid sign extension after word arithmetic.
+(MOVWreg x:(ADDIW _)) => (MOVDreg x)
+(MOVWreg x:(SUBW _ _)) => (MOVDreg x)
+(MOVWreg x:(NEGW _)) => (MOVDreg x)
+(MOVWreg x:(MULW _ _)) => (MOVDreg x)
+(MOVWreg x:(DIVW _ _)) => (MOVDreg x)
+(MOVWreg x:(DIVUW _ _)) => (MOVDreg x)
+(MOVWreg x:(REMW _ _)) => (MOVDreg x)
+(MOVWreg x:(REMUW _ _)) => (MOVDreg x)
+
+// Fold double extensions.
+(MOVBreg x:(MOVBreg _)) => (MOVDreg x)
+(MOVHreg x:(MOVBreg _)) => (MOVDreg x)
+(MOVHreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVHreg x:(MOVHreg _)) => (MOVDreg x)
+(MOVWreg x:(MOVBreg _)) => (MOVDreg x)
+(MOVWreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVWreg x:(MOVHreg _)) => (MOVDreg x)
+(MOVWreg x:(MOVWreg _)) => (MOVDreg x)
+(MOVBUreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVHUreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVHUreg x:(MOVHUreg _)) => (MOVDreg x)
+(MOVWUreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVWUreg x:(MOVHUreg _)) => (MOVDreg x)
+(MOVWUreg x:(MOVWUreg _)) => (MOVDreg x)
+
+// Do not extend before store.
+(MOVBstore [off] {sym} ptr (MOVBreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVWreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVWreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+
+// Replace extend after load with alternate load where possible.
+(MOVBreg <t> x:(MOVBUload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBload <t> [off] {sym} ptr mem)
+(MOVHreg <t> x:(MOVHUload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVHload <t> [off] {sym} ptr mem)
+(MOVWreg <t> x:(MOVWUload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWload <t> [off] {sym} ptr mem)
+(MOVBUreg <t> x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBUload <t> [off] {sym} ptr mem)
+(MOVHUreg <t> x:(MOVHload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVHUload <t> [off] {sym} ptr mem)
+(MOVWUreg <t> x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWUload <t> [off] {sym} ptr mem)
+
+// If a register move has only one use, reuse the same register without emitting an instruction.
+// MOVDnop does not emit an instruction; it exists only to preserve the type.
+(MOVDreg x) && x.Uses == 1 => (MOVDnop x)
+
+// TODO: we should be able to get rid of MOVDnop altogether.
+// But for now, this is enough to get rid of lots of them.
+(MOVDnop (MOVDconst [c])) => (MOVDconst [c])
+
+// Fold constant into immediate instructions where possible.
+(ADD (MOVDconst [val]) x) && is32Bit(val) => (ADDI [val] x)
+(AND (MOVDconst [val]) x) && is32Bit(val) => (ANDI [val] x)
+(OR (MOVDconst [val]) x) && is32Bit(val) => (ORI [val] x)
+(XOR (MOVDconst [val]) x) && is32Bit(val) => (XORI [val] x)
+(SLL x (MOVDconst [val])) => (SLLI [int64(val&63)] x)
+(SRL x (MOVDconst [val])) => (SRLI [int64(val&63)] x)
+(SRA x (MOVDconst [val])) => (SRAI [int64(val&63)] x)
+(SLT x (MOVDconst [val])) && val >= -2048 && val <= 2047 => (SLTI [val] x)
+(SLTU x (MOVDconst [val])) && val >= -2048 && val <= 2047 => (SLTIU [val] x)
+
+// Convert const subtraction into ADDI with negative immediate, where possible.
+(SUB x (MOVDconst [val])) && is32Bit(-val) => (ADDI [-val] x)
+(SUB <t> (MOVDconst [val]) y) && is32Bit(-val) => (NEG (ADDI <t> [-val] y))
+
+// Subtraction of zero.
+(SUB x (MOVDconst [0])) => x
+(SUBW x (MOVDconst [0])) => (ADDIW [0] x)
+
+// Subtraction from zero.
+(SUB (MOVDconst [0]) x) => (NEG x)
+(SUBW (MOVDconst [0]) x) => (NEGW x)
+
+// Fold negation into subtraction.
+(NEG (SUB x y)) => (SUB y x)
+(NEG <t> s:(ADDI [val] (SUB x y))) && s.Uses == 1 && is32Bit(-val) => (ADDI [-val] (SUB <t> y x))
+
+// Double negation.
+(NEG (NEG x)) => x
+
+// Addition of zero or two constants.
+(ADDI [0] x) => x
+(ADDI [x] (MOVDconst [y])) && is32Bit(x + y) => (MOVDconst [x + y])
+
+// ANDI with all zeros, all ones or two constants.
+(ANDI [0] x) => (MOVDconst [0])
+(ANDI [-1] x) => x
+(ANDI [x] (MOVDconst [y])) => (MOVDconst [x & y])
+
+// ORI with all zeroes, all ones or two constants.
+(ORI [0] x) => x
+(ORI [-1] x) => (MOVDconst [-1])
+(ORI [x] (MOVDconst [y])) => (MOVDconst [x | y])
+
+// Combine operations with immediate.
+(ADDI [x] (ADDI [y] z)) && is32Bit(x + y) => (ADDI [x + y] z)
+(ANDI [x] (ANDI [y] z)) => (ANDI [x & y] z)
+(ORI [x] (ORI [y] z)) => (ORI [x | y] z)
+
+// Negation of a constant.
+(NEG (MOVDconst [x])) => (MOVDconst [-x])
+(NEGW (MOVDconst [x])) => (MOVDconst [int64(int32(-x))])
+
+// Shift of a constant.
+(SLLI [x] (MOVDconst [y])) && is32Bit(y << uint32(x)) => (MOVDconst [y << uint32(x)])
+(SRLI [x] (MOVDconst [y])) => (MOVDconst [int64(uint64(y) >> uint32(x))])
+(SRAI [x] (MOVDconst [y])) => (MOVDconst [int64(y) >> uint32(x)])
+
+// SLTI/SLTIU with constants.
+(SLTI [x] (MOVDconst [y])) => (MOVDconst [b2i(int64(y) < int64(x))])
+(SLTIU [x] (MOVDconst [y])) => (MOVDconst [b2i(uint64(y) < uint64(x))])
+
+// SLTI/SLTIU with known outcomes. ANDI with y >= 0 bounds the operand to [0, y]; ORI makes the operand >= y only when compared as unsigned.
+(SLTI [x] (ANDI [y] _)) && y >= 0 && int64(y) < int64(x) => (MOVDconst [1])
+(SLTIU [x] (ANDI [y] _)) && y >= 0 && uint64(y) < uint64(x) => (MOVDconst [1])
+// No signed (SLTI (ORI ...)) rule: ORI with y >= 0 leaves the sign bit alone, so a negative operand stays negative and may still be < x.
+(SLTIU [x] (ORI [y] _)) && y >= 0 && uint64(y) >= uint64(x) => (MOVDconst [0])
+
+// SLT/SLTU with known outcomes.
+(SLT x x) => (MOVDconst [0])
+(SLTU x x) => (MOVDconst [0])
+
+// Deadcode for LoweredMuluhilo
+(Select0 m:(LoweredMuluhilo x y)) && m.Uses == 1 => (MULHU x y)
+(Select1 m:(LoweredMuluhilo x y)) && m.Uses == 1 => (MUL x y)
+
+// Merge negation into fused multiply-add and multiply-subtract.
+//
+// Key:
+//
+//   [+ -](x * y) [+ -] z.
+//    _ N          A S
+//                 D U
+//                 D B
+//
+// Note: multiplication commutativity handled by rule generator.
+(F(MADD|NMADD|MSUB|NMSUB)D neg:(FNEGD x) y z) && neg.Uses == 1 => (F(NMADD|MADD|NMSUB|MSUB)D x y z)
+(F(MADD|NMADD|MSUB|NMSUB)D x y neg:(FNEGD z)) && neg.Uses == 1 => (F(MSUB|NMSUB|MADD|NMADD)D x y z)
diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go
new file mode 100644
index 0000000..09b1620
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go
@@ -0,0 +1,482 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "fmt"
+)
+
+// Notes:
+// - Boolean types occupy the entire register. 0=false, 1=true.
+
+// Suffixes encode the bit width of various instructions:
+//
+// D (double word) = 64 bit int
+// W (word) = 32 bit int
+// H (half word) = 16 bit int
+// B (byte) = 8 bit int
+// S (single) = 32 bit float
+// D (double) = 64 bit float
+// L = 64 bit int, used when the opcode starts with F
+
+const (
+ riscv64REG_G = 27 // named "g" by riscv64RegName
+ riscv64REG_CTXT = 26 // NOTE(review): presumably the closure context register (init uses X26 as regCtxt) — confirm
+ riscv64REG_LR = 1 // link register; skipped when init builds the register list
+ riscv64REG_SP = 2 // named "SP" by riscv64RegName
+ riscv64REG_GP = 3 // kept out of every gp mask in init
+ riscv64REG_TP = 4 // kept out of every gp mask in init
+ riscv64REG_TMP = 31 // kept out of every gp mask in init
+ riscv64REG_ZERO = 0 // kept out of every gp mask in init
+)
+
+// riscv64RegName returns the name used for register number r.
+// The g and SP registers get their special names; other numbers in
+// 0-31 become "X<n>" and numbers in 32-63 become "F<n-32>".
+// Any other number panics.
+func riscv64RegName(r int) string {
+ // Special names take priority over the generic X<n> form.
+ if r == riscv64REG_G {
+  return "g"
+ }
+ if r == riscv64REG_SP {
+  return "SP"
+ }
+ if r >= 0 && r < 32 {
+  return fmt.Sprintf("X%d", r)
+ }
+ if r >= 32 && r < 64 {
+  // Floating point registers are numbered from 32 upwards.
+  return fmt.Sprintf("F%d", r-32)
+ }
+ panic(fmt.Sprintf("unknown register %d", r))
+}
+
+func init() { // builds the RISCV64 register names/masks, op table and block kinds, then appends the arch to archs
+ var regNamesRISCV64 []string
+ var gpMask, fpMask, gpgMask, gpspMask, gpspsbMask, gpspsbgMask regMask
+ regNamed := make(map[string]regMask)
+
+ // Build the list of register names, creating an appropriately indexed
+ // regMask for the gp and fp registers as we go.
+ //
+ // If name is specified, use it rather than the riscv reg number.
+ addreg := func(r int, name string) regMask {
+ mask := regMask(1) << uint(len(regNamesRISCV64)) // bit index is the position in regNamesRISCV64, not the hardware register number
+ if name == "" {
+ name = riscv64RegName(r)
+ }
+ regNamesRISCV64 = append(regNamesRISCV64, name)
+ regNamed[name] = mask
+ return mask
+ }
+
+ // General purpose registers.
+ for r := 0; r <= 31; r++ {
+ if r == riscv64REG_LR {
+ // LR is not used by regalloc, so we skip it to leave
+ // room for pseudo-register SB.
+ continue
+ }
+
+ mask := addreg(r, "")
+
+ // Add general purpose registers to gpMask.
+ switch r {
+ // ZERO, GP, TP and TMP are not in any gp mask.
+ case riscv64REG_ZERO, riscv64REG_GP, riscv64REG_TP, riscv64REG_TMP:
+ case riscv64REG_G:
+ gpgMask |= mask
+ gpspsbgMask |= mask
+ case riscv64REG_SP:
+ gpspMask |= mask
+ gpspsbMask |= mask
+ gpspsbgMask |= mask
+ default:
+ gpMask |= mask
+ gpgMask |= mask
+ gpspMask |= mask
+ gpspsbMask |= mask
+ gpspsbgMask |= mask
+ }
+ }
+
+ // Floating point registers.
+ for r := 32; r <= 63; r++ {
+ mask := addreg(r, "")
+ fpMask |= mask
+ }
+
+ // Pseudo-register: SB
+ mask := addreg(-1, "SB")
+ gpspsbMask |= mask
+ gpspsbgMask |= mask
+
+ if len(regNamesRISCV64) > 64 {
+ // regMask is only 64 bits.
+ panic("Too many RISCV64 registers")
+ }
+
+ regCtxt := regNamed["X26"]
+ callerSave := gpMask | fpMask | regNamed["g"]
+
+ var (
+ gpstore = regInfo{inputs: []regMask{gpspsbMask, gpspMask, 0}} // SB in first input so we can load from a global, but not in second to avoid using SB as a temporary register
+ gpstore0 = regInfo{inputs: []regMask{gpspsbMask}}
+ gp01 = regInfo{outputs: []regMask{gpMask}}
+ gp11 = regInfo{inputs: []regMask{gpMask}, outputs: []regMask{gpMask}}
+ gp21 = regInfo{inputs: []regMask{gpMask, gpMask}, outputs: []regMask{gpMask}}
+ gp22 = regInfo{inputs: []regMask{gpMask, gpMask}, outputs: []regMask{gpMask, gpMask}}
+ gpload = regInfo{inputs: []regMask{gpspsbMask, 0}, outputs: []regMask{gpMask}}
+ gp11sb = regInfo{inputs: []regMask{gpspsbMask}, outputs: []regMask{gpMask}}
+ gpxchg = regInfo{inputs: []regMask{gpspsbgMask, gpgMask}, outputs: []regMask{gpMask}}
+ gpcas = regInfo{inputs: []regMask{gpspsbgMask, gpgMask, gpgMask}, outputs: []regMask{gpMask}}
+ gpatomic = regInfo{inputs: []regMask{gpspsbgMask, gpgMask}}
+
+ fp11 = regInfo{inputs: []regMask{fpMask}, outputs: []regMask{fpMask}}
+ fp21 = regInfo{inputs: []regMask{fpMask, fpMask}, outputs: []regMask{fpMask}}
+ fp31 = regInfo{inputs: []regMask{fpMask, fpMask, fpMask}, outputs: []regMask{fpMask}}
+ gpfp = regInfo{inputs: []regMask{gpMask}, outputs: []regMask{fpMask}}
+ fpgp = regInfo{inputs: []regMask{fpMask}, outputs: []regMask{gpMask}}
+ fpstore = regInfo{inputs: []regMask{gpspsbMask, fpMask, 0}}
+ fpload = regInfo{inputs: []regMask{gpspsbMask, 0}, outputs: []regMask{fpMask}}
+ fp2gp = regInfo{inputs: []regMask{fpMask, fpMask}, outputs: []regMask{gpMask}}
+
+ call = regInfo{clobbers: callerSave}
+ callClosure = regInfo{inputs: []regMask{gpspMask, regCtxt, 0}, clobbers: callerSave}
+ callInter = regInfo{inputs: []regMask{gpMask}, clobbers: callerSave}
+ )
+
+ RISCV64ops := []opData{
+ {name: "ADD", argLength: 2, reg: gp21, asm: "ADD", commutative: true}, // arg0 + arg1
+ {name: "ADDI", argLength: 1, reg: gp11sb, asm: "ADDI", aux: "Int64"}, // arg0 + auxint
+ {name: "ADDIW", argLength: 1, reg: gp11, asm: "ADDIW", aux: "Int64"}, // 32 low bits of arg0 + auxint, sign extended to 64 bits
+ {name: "NEG", argLength: 1, reg: gp11, asm: "NEG"}, // -arg0
+ {name: "NEGW", argLength: 1, reg: gp11, asm: "NEGW"}, // -arg0 of 32 bits, sign extended to 64 bits
+ {name: "SUB", argLength: 2, reg: gp21, asm: "SUB"}, // arg0 - arg1
+ {name: "SUBW", argLength: 2, reg: gp21, asm: "SUBW"}, // 32 low bits of arg 0 - 32 low bits of arg 1, sign extended to 64 bits
+
+ // M extension. H means high (i.e., it returns the top bits of
+ // the result). U means unsigned. W means word (i.e., 32-bit).
+ {name: "MUL", argLength: 2, reg: gp21, asm: "MUL", commutative: true, typ: "Int64"}, // arg0 * arg1
+ {name: "MULW", argLength: 2, reg: gp21, asm: "MULW", commutative: true, typ: "Int32"}, // arg0 * arg1, word (32-bit)
+ {name: "MULH", argLength: 2, reg: gp21, asm: "MULH", commutative: true, typ: "Int64"}, // top bits of arg0 * arg1
+ {name: "MULHU", argLength: 2, reg: gp21, asm: "MULHU", commutative: true, typ: "UInt64"}, // top bits of arg0 * arg1, unsigned
+ {name: "LoweredMuluhilo", argLength: 2, reg: gp22, resultNotInArgs: true}, // arg0 * arg1, return (hi, lo)
+ {name: "LoweredMuluover", argLength: 2, reg: gp22, resultNotInArgs: true}, // arg0 * arg1, return (64 bits of arg0*arg1, overflow)
+
+ {name: "DIV", argLength: 2, reg: gp21, asm: "DIV", typ: "Int64"}, // arg0 / arg1
+ {name: "DIVU", argLength: 2, reg: gp21, asm: "DIVU", typ: "UInt64"}, // arg0 / arg1, unsigned
+ {name: "DIVW", argLength: 2, reg: gp21, asm: "DIVW", typ: "Int32"}, // arg0 / arg1, word (32-bit)
+ {name: "DIVUW", argLength: 2, reg: gp21, asm: "DIVUW", typ: "UInt32"}, // arg0 / arg1, unsigned word (32-bit)
+ {name: "REM", argLength: 2, reg: gp21, asm: "REM", typ: "Int64"}, // arg0 % arg1
+ {name: "REMU", argLength: 2, reg: gp21, asm: "REMU", typ: "UInt64"}, // arg0 % arg1, unsigned
+ {name: "REMW", argLength: 2, reg: gp21, asm: "REMW", typ: "Int32"}, // arg0 % arg1, word (32-bit)
+ {name: "REMUW", argLength: 2, reg: gp21, asm: "REMUW", typ: "UInt32"}, // arg0 % arg1, unsigned word (32-bit)
+
+ {name: "MOVaddr", argLength: 1, reg: gp11sb, asm: "MOV", aux: "SymOff", rematerializeable: true, symEffect: "RdWr"}, // arg0 + auxint + offset encoded in aux
+ // auxint+aux == add auxint and the offset of the symbol in aux (if any) to the effective address
+
+ {name: "MOVDconst", reg: gp01, asm: "MOV", typ: "UInt64", aux: "Int64", rematerializeable: true}, // auxint
+
+ // Loads: load <size> bits from arg0+auxint+aux and extend to 64 bits; arg1=mem
+ {name: "MOVBload", argLength: 2, reg: gpload, asm: "MOVB", aux: "SymOff", typ: "Int8", faultOnNilArg0: true, symEffect: "Read"}, // 8 bits, sign extend
+ {name: "MOVHload", argLength: 2, reg: gpload, asm: "MOVH", aux: "SymOff", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // 16 bits, sign extend
+ {name: "MOVWload", argLength: 2, reg: gpload, asm: "MOVW", aux: "SymOff", typ: "Int32", faultOnNilArg0: true, symEffect: "Read"}, // 32 bits, sign extend
+ {name: "MOVDload", argLength: 2, reg: gpload, asm: "MOV", aux: "SymOff", typ: "Int64", faultOnNilArg0: true, symEffect: "Read"}, // 64 bits
+ {name: "MOVBUload", argLength: 2, reg: gpload, asm: "MOVBU", aux: "SymOff", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // 8 bits, zero extend
+ {name: "MOVHUload", argLength: 2, reg: gpload, asm: "MOVHU", aux: "SymOff", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // 16 bits, zero extend
+ {name: "MOVWUload", argLength: 2, reg: gpload, asm: "MOVWU", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // 32 bits, zero extend
+
+ // Stores: store <size> lowest bits in arg1 to arg0+auxint+aux; arg2=mem
+ {name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 8 bits
+ {name: "MOVHstore", argLength: 3, reg: gpstore, asm: "MOVH", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 16 bits
+ {name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 32 bits
+ {name: "MOVDstore", argLength: 3, reg: gpstore, asm: "MOV", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 64 bits
+
+ // Stores: store <size> of zero in arg0+auxint+aux; arg1=mem
+ {name: "MOVBstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 8 bits
+ {name: "MOVHstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 16 bits
+ {name: "MOVWstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 32 bits
+ {name: "MOVDstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOV", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 64 bits
+
+ // Conversions
+ {name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB"}, // move from arg0, sign-extended from byte
+ {name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVH"}, // move from arg0, sign-extended from half
+ {name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW"}, // move from arg0, sign-extended from word
+ {name: "MOVDreg", argLength: 1, reg: gp11, asm: "MOV"}, // move from arg0
+ {name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, zero-extended from byte
+ {name: "MOVHUreg", argLength: 1, reg: gp11, asm: "MOVHU"}, // move from arg0, zero-extended from half
+ {name: "MOVWUreg", argLength: 1, reg: gp11, asm: "MOVWU"}, // move from arg0, zero-extended from word
+
+ {name: "MOVDnop", argLength: 1, reg: regInfo{inputs: []regMask{gpMask}, outputs: []regMask{gpMask}}, resultInArg0: true}, // nop, return arg0 in same register
+
+ // Shift ops
+ {name: "SLL", argLength: 2, reg: gp21, asm: "SLL"}, // arg0 << (arg1 & 63)
+ {name: "SRA", argLength: 2, reg: gp21, asm: "SRA"}, // arg0 >> (arg1 & 63), signed
+ {name: "SRL", argLength: 2, reg: gp21, asm: "SRL"}, // arg0 >> (arg1 & 63), unsigned
+ {name: "SLLI", argLength: 1, reg: gp11, asm: "SLLI", aux: "Int64"}, // arg0 << auxint, shift amount 0-63
+ {name: "SRAI", argLength: 1, reg: gp11, asm: "SRAI", aux: "Int64"}, // arg0 >> auxint, signed, shift amount 0-63
+ {name: "SRLI", argLength: 1, reg: gp11, asm: "SRLI", aux: "Int64"}, // arg0 >> auxint, unsigned, shift amount 0-63
+
+ // Bitwise ops
+ {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true}, // arg0 ^ arg1
+ {name: "XORI", argLength: 1, reg: gp11, asm: "XORI", aux: "Int64"}, // arg0 ^ auxint
+ {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1
+ {name: "ORI", argLength: 1, reg: gp11, asm: "ORI", aux: "Int64"}, // arg0 | auxint
+ {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1
+ {name: "ANDI", argLength: 1, reg: gp11, asm: "ANDI", aux: "Int64"}, // arg0 & auxint
+ {name: "NOT", argLength: 1, reg: gp11, asm: "NOT"}, // ^arg0
+
+ // Generate boolean values
+ {name: "SEQZ", argLength: 1, reg: gp11, asm: "SEQZ"}, // arg0 == 0, result is 0 or 1
+ {name: "SNEZ", argLength: 1, reg: gp11, asm: "SNEZ"}, // arg0 != 0, result is 0 or 1
+ {name: "SLT", argLength: 2, reg: gp21, asm: "SLT"}, // arg0 < arg1, result is 0 or 1
+ {name: "SLTI", argLength: 1, reg: gp11, asm: "SLTI", aux: "Int64"}, // arg0 < auxint, result is 0 or 1
+ {name: "SLTU", argLength: 2, reg: gp21, asm: "SLTU"}, // arg0 < arg1, unsigned, result is 0 or 1
+ {name: "SLTIU", argLength: 1, reg: gp11, asm: "SLTIU", aux: "Int64"}, // arg0 < auxint, unsigned, result is 0 or 1
+
+ // MOVconvert converts between pointers and integers.
+ // We have a special op for this so as to not confuse GC
+ // (particularly stack maps). It takes a memory arg so it
+ // gets correctly ordered with respect to GC safepoints.
+ {name: "MOVconvert", argLength: 2, reg: gp11, asm: "MOV"}, // arg0, but converted to int/ptr as appropriate; arg1=mem
+
+ // Calls
+ {name: "CALLstatic", argLength: -1, reg: call, aux: "CallOff", call: true}, // call static function aux.(*gc.Sym). last arg=mem, auxint=argsize, returns mem
+ {name: "CALLtail", argLength: -1, reg: call, aux: "CallOff", call: true, tailCall: true}, // tail call static function aux.(*gc.Sym). last arg=mem, auxint=argsize, returns mem
+ {name: "CALLclosure", argLength: -1, reg: callClosure, aux: "CallOff", call: true}, // call function via closure. arg0=codeptr, arg1=closure, last arg=mem, auxint=argsize, returns mem
+ {name: "CALLinter", argLength: -1, reg: callInter, aux: "CallOff", call: true}, // call fn by pointer. arg0=codeptr, last arg=mem, auxint=argsize, returns mem
+
+ // duffzero
+ // arg0 = address of memory to zero (in X25, changed as side effect)
+ // arg1 = mem
+ // auxint = offset into duffzero code to start executing
+ // X1 (link register) changed because of function call
+ // returns mem
+ {
+ name: "DUFFZERO",
+ aux: "Int64",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{regNamed["X25"]},
+ clobbers: regNamed["X1"] | regNamed["X25"], // NOTE(review): X1 (LR) is never passed to addreg above, so regNamed["X1"] appears to be the zero mask here and in the other ops below — confirm
+ },
+ typ: "Mem",
+ faultOnNilArg0: true,
+ },
+
+ // duffcopy
+ // arg0 = address of dst memory (in X25, changed as side effect)
+ // arg1 = address of src memory (in X24, changed as side effect)
+ // arg2 = mem
+ // auxint = offset into duffcopy code to start executing
+ // X1 (link register) changed because of function call
+ // returns mem
+ {
+ name: "DUFFCOPY",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{regNamed["X25"], regNamed["X24"]},
+ clobbers: regNamed["X1"] | regNamed["X24"] | regNamed["X25"],
+ },
+ typ: "Mem",
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // Generic moves and zeros
+
+ // general unaligned zeroing
+ // arg0 = address of memory to zero (in X5, changed as side effect)
+ // arg1 = address of the last element to zero (inclusive)
+ // arg2 = mem
+ // auxint = element size
+ // returns mem
+ // mov ZERO, (X5)
+ // ADD $sz, X5
+ // BGEU Rarg1, X5, -2(PC)
+ {
+ name: "LoweredZero",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{regNamed["X5"], gpMask},
+ clobbers: regNamed["X5"],
+ },
+ typ: "Mem",
+ faultOnNilArg0: true,
+ },
+
+ // general unaligned move
+ // arg0 = address of dst memory (in X5, changed as side effect)
+ // arg1 = address of src memory (in X6, changed as side effect)
+ // arg2 = address of the last element of src (can't be X7 as we clobber it before using arg2)
+ // arg3 = mem
+ // auxint = alignment
+ // clobbers X7 as a tmp register.
+ // returns mem
+ // mov (X6), X7
+ // mov X7, (X5)
+ // ADD $sz, X5
+ // ADD $sz, X6
+ // BGEU Rarg2, X5, -4(PC)
+ {
+ name: "LoweredMove",
+ aux: "Int64",
+ argLength: 4,
+ reg: regInfo{
+ inputs: []regMask{regNamed["X5"], regNamed["X6"], gpMask &^ regNamed["X7"]},
+ clobbers: regNamed["X5"] | regNamed["X6"] | regNamed["X7"],
+ },
+ typ: "Mem",
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // Atomic loads.
+ // load from arg0. arg1=mem.
+ // returns <value,memory> so they can be properly ordered with other loads.
+ {name: "LoweredAtomicLoad8", argLength: 2, reg: gpload, faultOnNilArg0: true},
+ {name: "LoweredAtomicLoad32", argLength: 2, reg: gpload, faultOnNilArg0: true},
+ {name: "LoweredAtomicLoad64", argLength: 2, reg: gpload, faultOnNilArg0: true},
+
+ // Atomic stores.
+ // store arg1 to *arg0. arg2=mem. returns memory.
+ {name: "LoweredAtomicStore8", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicStore32", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicStore64", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true},
+
+ // Atomic exchange.
+ // store arg1 to *arg0. arg2=mem. returns <old content of *arg0, memory>.
+ {name: "LoweredAtomicExchange32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicExchange64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
+
+ // Atomic add.
+ // *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>.
+ {name: "LoweredAtomicAdd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicAdd64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // Atomic compare and swap.
+ // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory.
+ // if *arg0 == arg1 {
+ // *arg0 = arg2
+ // return (true, memory)
+ // } else {
+ // return (false, memory)
+ // }
+ // MOV $0, Rout
+ // LR (Rarg0), Rtmp
+ // BNE Rtmp, Rarg1, 3(PC)
+ // SC Rarg2, (Rarg0), Rtmp
+ // BNE Rtmp, ZERO, -3(PC)
+ // MOV $1, Rout
+ {name: "LoweredAtomicCas32", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicCas64", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // Atomic 32 bit AND/OR.
+ // *arg0 &= (|=) arg1. arg2=mem. returns nil.
+ {name: "LoweredAtomicAnd32", argLength: 3, reg: gpatomic, asm: "AMOANDW", faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicOr32", argLength: 3, reg: gpatomic, asm: "AMOORW", faultOnNilArg0: true, hasSideEffects: true},
+
+ // Lowering pass-throughs
+ {name: "LoweredNilCheck", argLength: 2, faultOnNilArg0: true, nilCheck: true, reg: regInfo{inputs: []regMask{gpspMask}}}, // arg0=ptr,arg1=mem, returns void. Faults if ptr is nil.
+ {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{regCtxt}}}, // scheduler ensures only at beginning of entry block
+
+ // LoweredGetCallerSP returns the SP of the caller of the current function.
+ {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true},
+
+ // LoweredGetCallerPC evaluates to the PC to which its "caller" will return.
+ // I.e., if f calls g "calls" getcallerpc,
+ // the result should be the PC within f that g will return to.
+ // See runtime/stubs.go for a more detailed discussion.
+ {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true},
+
+ // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+ // It saves all GP registers if necessary,
+ // but clobbers RA (LR) because it's a call
+ // and T6 (REG_TMP).
+ {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{regNamed["X5"], regNamed["X6"]}, clobbers: (callerSave &^ (gpMask | regNamed["g"])) | regNamed["X1"]}, clobberFlags: true, aux: "Sym", symEffect: "None"},
+
+ // There are three of these functions so that they can have three different register inputs.
+ // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
+ // default registers to match so we don't need to copy registers around unnecessarily.
+ {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{regNamed["X7"], regNamed["X28"]}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{regNamed["X6"], regNamed["X7"]}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{regNamed["X5"], regNamed["X6"]}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+
+ // F extension.
+ {name: "FADDS", argLength: 2, reg: fp21, asm: "FADDS", commutative: true, typ: "Float32"}, // arg0 + arg1
+ {name: "FSUBS", argLength: 2, reg: fp21, asm: "FSUBS", commutative: false, typ: "Float32"}, // arg0 - arg1
+ {name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true, typ: "Float32"}, // arg0 * arg1
+ {name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS", commutative: false, typ: "Float32"}, // arg0 / arg1
+ {name: "FSQRTS", argLength: 1, reg: fp11, asm: "FSQRTS", typ: "Float32"}, // sqrt(arg0)
+ {name: "FNEGS", argLength: 1, reg: fp11, asm: "FNEGS", typ: "Float32"}, // -arg0
+ {name: "FMVSX", argLength: 1, reg: gpfp, asm: "FMVSX", typ: "Float32"}, // reinterpret arg0 as float
+ {name: "FCVTSW", argLength: 1, reg: gpfp, asm: "FCVTSW", typ: "Float32"}, // float32(low 32 bits of arg0)
+ {name: "FCVTSL", argLength: 1, reg: gpfp, asm: "FCVTSL", typ: "Float32"}, // float32(arg0)
+ {name: "FCVTWS", argLength: 1, reg: fpgp, asm: "FCVTWS", typ: "Int32"}, // int32(arg0)
+ {name: "FCVTLS", argLength: 1, reg: fpgp, asm: "FCVTLS", typ: "Int64"}, // int64(arg0)
+ {name: "FMOVWload", argLength: 2, reg: fpload, asm: "MOVF", aux: "SymOff", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"}, // load float32 from arg0+auxint+aux
+ {name: "FMOVWstore", argLength: 3, reg: fpstore, asm: "MOVF", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store float32 to arg0+auxint+aux
+ {name: "FEQS", argLength: 2, reg: fp2gp, asm: "FEQS", commutative: true}, // arg0 == arg1
+ {name: "FNES", argLength: 2, reg: fp2gp, asm: "FNES", commutative: true}, // arg0 != arg1
+ {name: "FLTS", argLength: 2, reg: fp2gp, asm: "FLTS"}, // arg0 < arg1
+ {name: "FLES", argLength: 2, reg: fp2gp, asm: "FLES"}, // arg0 <= arg1
+
+ // D extension.
+ {name: "FADDD", argLength: 2, reg: fp21, asm: "FADDD", commutative: true, typ: "Float64"}, // arg0 + arg1
+ {name: "FSUBD", argLength: 2, reg: fp21, asm: "FSUBD", commutative: false, typ: "Float64"}, // arg0 - arg1
+ {name: "FMULD", argLength: 2, reg: fp21, asm: "FMULD", commutative: true, typ: "Float64"}, // arg0 * arg1
+ {name: "FDIVD", argLength: 2, reg: fp21, asm: "FDIVD", commutative: false, typ: "Float64"}, // arg0 / arg1
+ {name: "FMADDD", argLength: 3, reg: fp31, asm: "FMADDD", commutative: true, typ: "Float64"}, // (arg0 * arg1) + arg2
+ {name: "FMSUBD", argLength: 3, reg: fp31, asm: "FMSUBD", commutative: true, typ: "Float64"}, // (arg0 * arg1) - arg2
+ {name: "FNMADDD", argLength: 3, reg: fp31, asm: "FNMADDD", commutative: true, typ: "Float64"}, // -(arg0 * arg1) + arg2
+ {name: "FNMSUBD", argLength: 3, reg: fp31, asm: "FNMSUBD", commutative: true, typ: "Float64"}, // -(arg0 * arg1) - arg2
+ {name: "FSQRTD", argLength: 1, reg: fp11, asm: "FSQRTD", typ: "Float64"}, // sqrt(arg0)
+ {name: "FNEGD", argLength: 1, reg: fp11, asm: "FNEGD", typ: "Float64"}, // -arg0
+ {name: "FABSD", argLength: 1, reg: fp11, asm: "FABSD", typ: "Float64"}, // abs(arg0)
+ {name: "FSGNJD", argLength: 2, reg: fp21, asm: "FSGNJD", typ: "Float64"}, // copy sign of arg1 to arg0
+ {name: "FMVDX", argLength: 1, reg: gpfp, asm: "FMVDX", typ: "Float64"}, // reinterpret arg0 as float
+ {name: "FCVTDW", argLength: 1, reg: gpfp, asm: "FCVTDW", typ: "Float64"}, // float64(low 32 bits of arg0)
+ {name: "FCVTDL", argLength: 1, reg: gpfp, asm: "FCVTDL", typ: "Float64"}, // float64(arg0)
+ {name: "FCVTWD", argLength: 1, reg: fpgp, asm: "FCVTWD", typ: "Int32"}, // int32(arg0)
+ {name: "FCVTLD", argLength: 1, reg: fpgp, asm: "FCVTLD", typ: "Int64"}, // int64(arg0)
+ {name: "FCVTDS", argLength: 1, reg: fp11, asm: "FCVTDS", typ: "Float64"}, // float64(arg0)
+ {name: "FCVTSD", argLength: 1, reg: fp11, asm: "FCVTSD", typ: "Float32"}, // float32(arg0)
+ {name: "FMOVDload", argLength: 2, reg: fpload, asm: "MOVD", aux: "SymOff", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"}, // load float64 from arg0+auxint+aux
+ {name: "FMOVDstore", argLength: 3, reg: fpstore, asm: "MOVD", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store float64 to arg0+auxint+aux
+ {name: "FEQD", argLength: 2, reg: fp2gp, asm: "FEQD", commutative: true}, // arg0 == arg1
+ {name: "FNED", argLength: 2, reg: fp2gp, asm: "FNED", commutative: true}, // arg0 != arg1
+ {name: "FLTD", argLength: 2, reg: fp2gp, asm: "FLTD"}, // arg0 < arg1
+ {name: "FLED", argLength: 2, reg: fp2gp, asm: "FLED"}, // arg0 <= arg1
+ }
+
+ RISCV64blocks := []blockData{
+ {name: "BEQ", controls: 2},
+ {name: "BNE", controls: 2},
+ {name: "BLT", controls: 2},
+ {name: "BGE", controls: 2},
+ {name: "BLTU", controls: 2},
+ {name: "BGEU", controls: 2},
+
+ {name: "BEQZ", controls: 1},
+ {name: "BNEZ", controls: 1},
+ {name: "BLEZ", controls: 1},
+ {name: "BGEZ", controls: 1},
+ {name: "BLTZ", controls: 1},
+ {name: "BGTZ", controls: 1},
+ }
+
+ archs = append(archs, arch{
+ name: "RISCV64",
+ pkg: "cmd/internal/obj/riscv",
+ genfile: "../../riscv64/ssa.go",
+ ops: RISCV64ops,
+ blocks: RISCV64blocks,
+ regnames: regNamesRISCV64,
+ gpregmask: gpMask,
+ fpregmask: fpMask,
+ framepointerreg: -1, // not used
+ // Integer parameters passed in register X10-X17, X8-X9, X18-X23
+ ParamIntRegNames: "X10 X11 X12 X13 X14 X15 X16 X17 X8 X9 X18 X19 X20 X21 X22 X23",
+ // Float parameters passed in register F10-F17, F8-F9, F18-F23
+ ParamFloatRegNames: "F10 F11 F12 F13 F14 F15 F16 F17 F8 F9 F18 F19 F20 F21 F22 F23",
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64latelower.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64latelower.rules
new file mode 100644
index 0000000..cd55331
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/RISCV64latelower.rules
@@ -0,0 +1,19 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Fold constant shift with extension.
+(SRAI [c] (MOVBreg x)) && c < 8 => (SRAI [56+c] (SLLI <typ.Int64> [56] x))
+(SRAI [c] (MOVHreg x)) && c < 16 => (SRAI [48+c] (SLLI <typ.Int64> [48] x))
+(SRAI [c] (MOVWreg x)) && c < 32 => (SRAI [32+c] (SLLI <typ.Int64> [32] x))
+(SRLI [c] (MOVBUreg x)) && c < 8 => (SRLI [56+c] (SLLI <typ.UInt64> [56] x))
+(SRLI [c] (MOVHUreg x)) && c < 16 => (SRLI [48+c] (SLLI <typ.UInt64> [48] x))
+(SRLI [c] (MOVWUreg x)) && c < 32 => (SRLI [32+c] (SLLI <typ.UInt64> [32] x))
+(SLLI [c] (MOVBUreg x)) && c <= 56 => (SRLI [56-c] (SLLI <typ.UInt64> [56] x))
+(SLLI [c] (MOVHUreg x)) && c <= 48 => (SRLI [48-c] (SLLI <typ.UInt64> [48] x))
+(SLLI [c] (MOVWUreg x)) && c <= 32 => (SRLI [32-c] (SLLI <typ.UInt64> [32] x))
+
+// Shift by zero.
+(SRAI [0] x) => x
+(SRLI [0] x) => x
+(SLLI [0] x) => x
diff --git a/src/cmd/compile/internal/ssa/_gen/S390X.rules b/src/cmd/compile/internal/ssa/_gen/S390X.rules
new file mode 100644
index 0000000..e9becb2
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/S390X.rules
@@ -0,0 +1,1704 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Lowering arithmetic
+(Add(64|Ptr) ...) => (ADD ...)
+(Add(32|16|8) ...) => (ADDW ...)
+(Add32F x y) => (Select0 (FADDS x y))
+(Add64F x y) => (Select0 (FADD x y))
+
+(Sub(64|Ptr) ...) => (SUB ...)
+(Sub(32|16|8) ...) => (SUBW ...)
+(Sub32F x y) => (Select0 (FSUBS x y))
+(Sub64F x y) => (Select0 (FSUB x y))
+
+(Mul64 ...) => (MULLD ...)
+(Mul(32|16|8) ...) => (MULLW ...)
+(Mul32F ...) => (FMULS ...)
+(Mul64F ...) => (FMUL ...)
+(Mul64uhilo ...) => (MLGR ...)
+
+(Div32F ...) => (FDIVS ...)
+(Div64F ...) => (FDIV ...)
+
+(Div64 x y) => (DIVD x y)
+(Div64u ...) => (DIVDU ...)
+// DIVW/DIVWU has a 64-bit dividend and a 32-bit divisor,
+// so a sign/zero extension of the dividend is required.
+(Div32 x y) => (DIVW (MOVWreg x) y)
+(Div32u x y) => (DIVWU (MOVWZreg x) y)
+(Div16 x y) => (DIVW (MOVHreg x) (MOVHreg y))
+(Div16u x y) => (DIVWU (MOVHZreg x) (MOVHZreg y))
+(Div8 x y) => (DIVW (MOVBreg x) (MOVBreg y))
+(Div8u x y) => (DIVWU (MOVBZreg x) (MOVBZreg y))
+
+(Hmul(64|64u) ...) => (MULH(D|DU) ...)
+(Hmul32 x y) => (SRDconst [32] (MULLD (MOVWreg x) (MOVWreg y)))
+(Hmul32u x y) => (SRDconst [32] (MULLD (MOVWZreg x) (MOVWZreg y)))
+
+(Mod64 x y) => (MODD x y)
+(Mod64u ...) => (MODDU ...)
+// MODW/MODWU has a 64-bit dividend and a 32-bit divisor,
+// so a sign/zero extension of the dividend is required.
+(Mod32 x y) => (MODW (MOVWreg x) y)
+(Mod32u x y) => (MODWU (MOVWZreg x) y)
+(Mod16 x y) => (MODW (MOVHreg x) (MOVHreg y))
+(Mod16u x y) => (MODWU (MOVHZreg x) (MOVHZreg y))
+(Mod8 x y) => (MODW (MOVBreg x) (MOVBreg y))
+(Mod8u x y) => (MODWU (MOVBZreg x) (MOVBZreg y))
+
+// (x + y) / 2 with x>=y -> (x - y) / 2 + y
+(Avg64u <t> x y) => (ADD (SRDconst <t> (SUB <t> x y) [1]) y)
+
+(And64 ...) => (AND ...)
+(And(32|16|8) ...) => (ANDW ...)
+
+(Or64 ...) => (OR ...)
+(Or(32|16|8) ...) => (ORW ...)
+
+(Xor64 ...) => (XOR ...)
+(Xor(32|16|8) ...) => (XORW ...)
+
+(Neg64 ...) => (NEG ...)
+(Neg(32|16|8) ...) => (NEGW ...)
+(Neg32F ...) => (FNEGS ...)
+(Neg64F ...) => (FNEG ...)
+
+(Com64 ...) => (NOT ...)
+(Com(32|16|8) ...) => (NOTW ...)
+(NOT x) => (XOR (MOVDconst [-1]) x)
+(NOTW x) => (XORWconst [-1] x)
+
+// Lowering boolean ops
+(AndB ...) => (ANDW ...)
+(OrB ...) => (ORW ...)
+(Not x) => (XORWconst [1] x)
+
+// Lowering pointer arithmetic
+(OffPtr [off] ptr:(SP)) => (MOVDaddr [int32(off)] ptr)
+(OffPtr [off] ptr) && is32Bit(off) => (ADDconst [int32(off)] ptr)
+(OffPtr [off] ptr) => (ADD (MOVDconst [off]) ptr)
+
+// TODO: optimize these cases?
+(Ctz64NonZero ...) => (Ctz64 ...)
+(Ctz32NonZero ...) => (Ctz32 ...)
+
+// Ctz(x) = 64 - findLeftmostOne((x-1)&^x)
+(Ctz64 <t> x) => (SUB (MOVDconst [64]) (FLOGR (AND <t> (SUBconst <t> [1] x) (NOT <t> x))))
+(Ctz32 <t> x) => (SUB (MOVDconst [64]) (FLOGR (MOVWZreg (ANDW <t> (SUBWconst <t> [1] x) (NOTW <t> x)))))
+
+(BitLen64 x) => (SUB (MOVDconst [64]) (FLOGR x))
+
+// POPCNT treats the input register as a vector of 8 bytes, producing
+// a population count for each individual byte. For inputs larger than
+// a single byte we therefore need to sum the individual bytes produced
+// by the POPCNT instruction. For example, the following instruction
+// sequence could be used to calculate the population count of a 4-byte
+// value:
+//
+// MOVD $0x12345678, R1 // R1=0x12345678 <-- input
+// POPCNT R1, R2 // R2=0x02030404
+// SRW $16, R2, R3 // R3=0x00000203
+// ADDW R2, R3, R4 // R4=0x02030607
+// SRW $8, R4, R5 // R5=0x00020306
+// ADDW R4, R5, R6 // R6=0x0205090d
+// MOVBZ R6, R7 // R7=0x0000000d <-- result is 13
+//
+(PopCount8 x) => (POPCNT (MOVBZreg x))
+(PopCount16 x) => (MOVBZreg (SumBytes2 (POPCNT <typ.UInt16> x)))
+(PopCount32 x) => (MOVBZreg (SumBytes4 (POPCNT <typ.UInt32> x)))
+(PopCount64 x) => (MOVBZreg (SumBytes8 (POPCNT <typ.UInt64> x)))
+
+// SumBytes{2,4,8} pseudo operations sum the values of the rightmost
+// 2, 4 or 8 bytes respectively. The result is a single byte; however,
+// other bytes might contain junk, so a zero extension is required if
+// the desired output type is larger than 1 byte.
+(SumBytes2 x) => (ADDW (SRWconst <typ.UInt8> x [8]) x)
+(SumBytes4 x) => (SumBytes2 (ADDW <typ.UInt16> (SRWconst <typ.UInt16> x [16]) x))
+(SumBytes8 x) => (SumBytes4 (ADDW <typ.UInt32> (SRDconst <typ.UInt32> x [32]) x))
+
+(Bswap64 ...) => (MOVDBR ...)
+(Bswap32 ...) => (MOVWBR ...)
+
+// add with carry
+(Select0 (Add64carry x y c))
+ => (Select0 <typ.UInt64> (ADDE x y (Select1 <types.TypeFlags> (ADDCconst c [-1]))))
+(Select1 (Add64carry x y c))
+ => (Select0 <typ.UInt64> (ADDE (MOVDconst [0]) (MOVDconst [0]) (Select1 <types.TypeFlags> (ADDE x y (Select1 <types.TypeFlags> (ADDCconst c [-1]))))))
+
+// subtract with borrow
+(Select0 (Sub64borrow x y c))
+ => (Select0 <typ.UInt64> (SUBE x y (Select1 <types.TypeFlags> (SUBC (MOVDconst [0]) c))))
+(Select1 (Sub64borrow x y c))
+ => (NEG (Select0 <typ.UInt64> (SUBE (MOVDconst [0]) (MOVDconst [0]) (Select1 <types.TypeFlags> (SUBE x y (Select1 <types.TypeFlags> (SUBC (MOVDconst [0]) c)))))))
+
+// math package intrinsics
+(Sqrt ...) => (FSQRT ...)
+(Floor x) => (FIDBR [7] x)
+(Ceil x) => (FIDBR [6] x)
+(Trunc x) => (FIDBR [5] x)
+(RoundToEven x) => (FIDBR [4] x)
+(Round x) => (FIDBR [1] x)
+(FMA x y z) => (FMADD z x y)
+
+(Sqrt32 ...) => (FSQRTS ...)
+
+// Atomic loads and stores.
+// The SYNC instruction (fast-BCR-serialization) prevents store-load
+// reordering. Other sequences of memory operations (load-load,
+// store-store and load-store) are already guaranteed not to be reordered.
+(AtomicLoad(8|32|Acq32|64|Ptr) ptr mem) => (MOV(BZ|WZ|WZ|D|D)atomicload ptr mem)
+(AtomicStore(8|32|64|PtrNoWB) ptr val mem) => (SYNC (MOV(B|W|D|D)atomicstore ptr val mem))
+
+// Store-release doesn't require store-load ordering.
+(AtomicStoreRel32 ptr val mem) => (MOVWatomicstore ptr val mem)
+
+// Atomic adds.
+(AtomicAdd32 ptr val mem) => (AddTupleFirst32 val (LAA ptr val mem))
+(AtomicAdd64 ptr val mem) => (AddTupleFirst64 val (LAAG ptr val mem))
+(Select0 <t> (AddTupleFirst32 val tuple)) => (ADDW val (Select0 <t> tuple))
+(Select1 (AddTupleFirst32 _ tuple)) => (Select1 tuple)
+(Select0 <t> (AddTupleFirst64 val tuple)) => (ADD val (Select0 <t> tuple))
+(Select1 (AddTupleFirst64 _ tuple)) => (Select1 tuple)
+
+// Atomic exchanges.
+(AtomicExchange32 ptr val mem) => (LoweredAtomicExchange32 ptr val mem)
+(AtomicExchange64 ptr val mem) => (LoweredAtomicExchange64 ptr val mem)
+
+// Atomic compare and swap.
+(AtomicCompareAndSwap32 ptr old new_ mem) => (LoweredAtomicCas32 ptr old new_ mem)
+(AtomicCompareAndSwap64 ptr old new_ mem) => (LoweredAtomicCas64 ptr old new_ mem)
+
+// Atomic and: *(*uint8)(ptr) &= val
+//
+// Round pointer down to nearest word boundary and pad value with ones before
+// applying atomic AND operation to target word.
+//
+// *(*uint32)(ptr &^ 3) &= rotateleft(uint32(val) | 0xffffff00, ((3 << 3) ^ ((ptr & 3) << 3)))
+//
+(AtomicAnd8 ptr val mem)
+ => (LANfloor
+ ptr
+ (RLL <typ.UInt32>
+ (ORWconst <typ.UInt32> val [-1<<8])
+ (RXSBG <typ.UInt32> {s390x.NewRotateParams(59, 60, 3)} (MOVDconst [3<<3]) ptr))
+ mem)
+
+// Atomic or: *(*uint8)(ptr) |= val
+//
+// Round pointer down to nearest word boundary and pad value with zeros before
+// applying atomic OR operation to target word.
+//
+// *(*uint32)(ptr &^ 3) |= uint32(val) << ((3 << 3) ^ ((ptr & 3) << 3))
+//
+(AtomicOr8 ptr val mem)
+ => (LAOfloor
+ ptr
+ (SLW <typ.UInt32>
+ (MOVBZreg <typ.UInt32> val)
+ (RXSBG <typ.UInt32> {s390x.NewRotateParams(59, 60, 3)} (MOVDconst [3<<3]) ptr))
+ mem)
+
+(AtomicAnd32 ...) => (LAN ...)
+(AtomicOr32 ...) => (LAO ...)
+
+// Lowering extension
+// Note: we always extend to 64 bits even though some ops don't need that many result bits.
+(SignExt8to(16|32|64) ...) => (MOVBreg ...)
+(SignExt16to(32|64) ...) => (MOVHreg ...)
+(SignExt32to64 ...) => (MOVWreg ...)
+
+(ZeroExt8to(16|32|64) ...) => (MOVBZreg ...)
+(ZeroExt16to(32|64) ...) => (MOVHZreg ...)
+(ZeroExt32to64 ...) => (MOVWZreg ...)
+
+(Slicemask <t> x) => (SRADconst (NEG <t> x) [63])
+
+// Lowering truncation
+// Because we ignore high parts of registers, truncates are just copies.
+(Trunc(16|32|64)to8 ...) => (Copy ...)
+(Trunc(32|64)to16 ...) => (Copy ...)
+(Trunc64to32 ...) => (Copy ...)
+
+// Lowering float <-> int
+(Cvt32to32F ...) => (CEFBRA ...)
+(Cvt32to64F ...) => (CDFBRA ...)
+(Cvt64to32F ...) => (CEGBRA ...)
+(Cvt64to64F ...) => (CDGBRA ...)
+
+(Cvt32Fto32 ...) => (CFEBRA ...)
+(Cvt32Fto64 ...) => (CGEBRA ...)
+(Cvt64Fto32 ...) => (CFDBRA ...)
+(Cvt64Fto64 ...) => (CGDBRA ...)
+
+// Lowering float <-> uint
+(Cvt32Uto32F ...) => (CELFBR ...)
+(Cvt32Uto64F ...) => (CDLFBR ...)
+(Cvt64Uto32F ...) => (CELGBR ...)
+(Cvt64Uto64F ...) => (CDLGBR ...)
+
+(Cvt32Fto32U ...) => (CLFEBR ...)
+(Cvt32Fto64U ...) => (CLGEBR ...)
+(Cvt64Fto32U ...) => (CLFDBR ...)
+(Cvt64Fto64U ...) => (CLGDBR ...)
+
+// Lowering float32 <-> float64
+(Cvt32Fto64F ...) => (LDEBR ...)
+(Cvt64Fto32F ...) => (LEDBR ...)
+
+(CvtBoolToUint8 ...) => (Copy ...)
+
+(Round(32|64)F ...) => (LoweredRound(32|64)F ...)
+
+// Lowering shifts
+
+// Lower bounded shifts first. No need to check shift value.
+(Lsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SLD x y)
+(Lsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y)
+(Lsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y)
+(Lsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y)
+(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRD x y)
+(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW x y)
+(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW (MOVHZreg x) y)
+(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW (MOVBZreg x) y)
+(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAD x y)
+(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW x y)
+(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW (MOVHreg x) y)
+(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW (MOVBreg x) y)
+
+// Left shifts and unsigned right shifts need to return 0 if shift amount is >= width of shifted value.
+// result = shift >= 64 ? 0 : arg << shift
+(Lsh(64|32|16|8)x64 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SL(D|W|W|W) <t> x y) (MOVDconst [0]) (CMPUconst y [64]))
+(Lsh(64|32|16|8)x32 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SL(D|W|W|W) <t> x y) (MOVDconst [0]) (CMPWUconst y [64]))
+(Lsh(64|32|16|8)x16 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SL(D|W|W|W) <t> x y) (MOVDconst [0]) (CMPWUconst (MOVHZreg y) [64]))
+(Lsh(64|32|16|8)x8 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SL(D|W|W|W) <t> x y) (MOVDconst [0]) (CMPWUconst (MOVBZreg y) [64]))
+
+(Rsh(64|32)Ux64 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SR(D|W) <t> x y) (MOVDconst [0]) (CMPUconst y [64]))
+(Rsh(64|32)Ux32 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SR(D|W) <t> x y) (MOVDconst [0]) (CMPWUconst y [64]))
+(Rsh(64|32)Ux16 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SR(D|W) <t> x y) (MOVDconst [0]) (CMPWUconst (MOVHZreg y) [64]))
+(Rsh(64|32)Ux8 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SR(D|W) <t> x y) (MOVDconst [0]) (CMPWUconst (MOVBZreg y) [64]))
+
+(Rsh(16|8)Ux64 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SRW <t> (MOV(H|B)Zreg x) y) (MOVDconst [0]) (CMPUconst y [64]))
+(Rsh(16|8)Ux32 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SRW <t> (MOV(H|B)Zreg x) y) (MOVDconst [0]) (CMPWUconst y [64]))
+(Rsh(16|8)Ux16 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SRW <t> (MOV(H|B)Zreg x) y) (MOVDconst [0]) (CMPWUconst (MOVHZreg y) [64]))
+(Rsh(16|8)Ux8 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SRW <t> (MOV(H|B)Zreg x) y) (MOVDconst [0]) (CMPWUconst (MOVBZreg y) [64]))
+
+// Signed right shift needs to return 0/-1 if shift amount is >= width of shifted value.
+// We implement this by setting the shift value to 63 (all ones) if the shift value is more than 63.
+// result = arg >> (shift >= 64 ? 63 : shift)
+(Rsh(64|32)x64 x y) => (SRA(D|W) x (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPUconst y [64])))
+(Rsh(64|32)x32 x y) => (SRA(D|W) x (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst y [64])))
+(Rsh(64|32)x16 x y) => (SRA(D|W) x (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst (MOVHZreg y) [64])))
+(Rsh(64|32)x8 x y) => (SRA(D|W) x (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst (MOVBZreg y) [64])))
+
+(Rsh(16|8)x64 x y) => (SRAW (MOV(H|B)reg x) (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPUconst y [64])))
+(Rsh(16|8)x32 x y) => (SRAW (MOV(H|B)reg x) (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst y [64])))
+(Rsh(16|8)x16 x y) => (SRAW (MOV(H|B)reg x) (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst (MOVHZreg y) [64])))
+(Rsh(16|8)x8 x y) => (SRAW (MOV(H|B)reg x) (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst (MOVBZreg y) [64])))
+
+// Lowering rotates
+(RotateLeft8 <t> x (MOVDconst [c])) => (Or8 (Lsh8x64 <t> x (MOVDconst [c&7])) (Rsh8Ux64 <t> x (MOVDconst [-c&7])))
+(RotateLeft16 <t> x (MOVDconst [c])) => (Or16 (Lsh16x64 <t> x (MOVDconst [c&15])) (Rsh16Ux64 <t> x (MOVDconst [-c&15])))
+(RotateLeft32 ...) => (RLL ...)
+(RotateLeft64 ...) => (RLLG ...)
+
+// Lowering comparisons
+(Less64 x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMP x y))
+(Less32 x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPW x y))
+(Less(16|8) x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPW (MOV(H|B)reg x) (MOV(H|B)reg y)))
+(Less64U x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPU x y))
+(Less32U x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPWU x y))
+(Less(16|8)U x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPWU (MOV(H|B)Zreg x) (MOV(H|B)Zreg y)))
+(Less64F x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (FCMP x y))
+(Less32F x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (FCMPS x y))
+
+(Leq64 x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMP x y))
+(Leq32 x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPW x y))
+(Leq(16|8) x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPW (MOV(H|B)reg x) (MOV(H|B)reg y)))
+(Leq64U x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPU x y))
+(Leq32U x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPWU x y))
+(Leq(16|8)U x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPWU (MOV(H|B)Zreg x) (MOV(H|B)Zreg y)))
+(Leq64F x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (FCMP x y))
+(Leq32F x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (FCMPS x y))
+
+(Eq(64|Ptr) x y) => (LOCGR {s390x.Equal} (MOVDconst [0]) (MOVDconst [1]) (CMP x y))
+(Eq32 x y) => (LOCGR {s390x.Equal} (MOVDconst [0]) (MOVDconst [1]) (CMPW x y))
+(Eq(16|8|B) x y) => (LOCGR {s390x.Equal} (MOVDconst [0]) (MOVDconst [1]) (CMPW (MOV(H|B|B)reg x) (MOV(H|B|B)reg y)))
+(Eq64F x y) => (LOCGR {s390x.Equal} (MOVDconst [0]) (MOVDconst [1]) (FCMP x y))
+(Eq32F x y) => (LOCGR {s390x.Equal} (MOVDconst [0]) (MOVDconst [1]) (FCMPS x y))
+
+(Neq(64|Ptr) x y) => (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (CMP x y))
+(Neq32 x y) => (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPW x y))
+(Neq(16|8|B) x y) => (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPW (MOV(H|B|B)reg x) (MOV(H|B|B)reg y)))
+(Neq64F x y) => (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (FCMP x y))
+(Neq32F x y) => (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (FCMPS x y))
+
+// Lowering loads
+(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVDload ptr mem)
+(Load <t> ptr mem) && is32BitInt(t) && isSigned(t) => (MOVWload ptr mem)
+(Load <t> ptr mem) && is32BitInt(t) && !isSigned(t) => (MOVWZload ptr mem)
+(Load <t> ptr mem) && is16BitInt(t) && isSigned(t) => (MOVHload ptr mem)
+(Load <t> ptr mem) && is16BitInt(t) && !isSigned(t) => (MOVHZload ptr mem)
+(Load <t> ptr mem) && is8BitInt(t) && isSigned(t) => (MOVBload ptr mem)
+(Load <t> ptr mem) && (t.IsBoolean() || (is8BitInt(t) && !isSigned(t))) => (MOVBZload ptr mem)
+(Load <t> ptr mem) && is32BitFloat(t) => (FMOVSload ptr mem)
+(Load <t> ptr mem) && is64BitFloat(t) => (FMOVDload ptr mem)
+
+// Lowering stores
+// These more-specific FP versions of Store pattern should come first.
+(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (FMOVDstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (FMOVSstore ptr val mem)
+
+(Store {t} ptr val mem) && t.Size() == 8 => (MOVDstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 => (MOVWstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem)
+
+// Lowering moves
+
+// Load and store for small copies.
+(Move [0] _ _ mem) => mem
+(Move [1] dst src mem) => (MOVBstore dst (MOVBZload src mem) mem)
+(Move [2] dst src mem) => (MOVHstore dst (MOVHZload src mem) mem)
+(Move [4] dst src mem) => (MOVWstore dst (MOVWZload src mem) mem)
+(Move [8] dst src mem) => (MOVDstore dst (MOVDload src mem) mem)
+(Move [16] dst src mem) =>
+ (MOVDstore [8] dst (MOVDload [8] src mem)
+ (MOVDstore dst (MOVDload src mem) mem))
+(Move [24] dst src mem) =>
+ (MOVDstore [16] dst (MOVDload [16] src mem)
+ (MOVDstore [8] dst (MOVDload [8] src mem)
+ (MOVDstore dst (MOVDload src mem) mem)))
+(Move [3] dst src mem) =>
+ (MOVBstore [2] dst (MOVBZload [2] src mem)
+ (MOVHstore dst (MOVHZload src mem) mem))
+(Move [5] dst src mem) =>
+ (MOVBstore [4] dst (MOVBZload [4] src mem)
+ (MOVWstore dst (MOVWZload src mem) mem))
+(Move [6] dst src mem) =>
+ (MOVHstore [4] dst (MOVHZload [4] src mem)
+ (MOVWstore dst (MOVWZload src mem) mem))
+(Move [7] dst src mem) =>
+ (MOVBstore [6] dst (MOVBZload [6] src mem)
+ (MOVHstore [4] dst (MOVHZload [4] src mem)
+ (MOVWstore dst (MOVWZload src mem) mem)))
+
+// MVC for other moves. Use up to 4 instructions (sizes up to 1024 bytes).
+(Move [s] dst src mem) && s > 0 && s <= 256 && logLargeCopy(v, s) =>
+ (MVC [makeValAndOff(int32(s), 0)] dst src mem)
+(Move [s] dst src mem) && s > 256 && s <= 512 && logLargeCopy(v, s) =>
+ (MVC [makeValAndOff(int32(s)-256, 256)] dst src (MVC [makeValAndOff(256, 0)] dst src mem))
+(Move [s] dst src mem) && s > 512 && s <= 768 && logLargeCopy(v, s) =>
+ (MVC [makeValAndOff(int32(s)-512, 512)] dst src (MVC [makeValAndOff(256, 256)] dst src (MVC [makeValAndOff(256, 0)] dst src mem)))
+(Move [s] dst src mem) && s > 768 && s <= 1024 && logLargeCopy(v, s) =>
+ (MVC [makeValAndOff(int32(s)-768, 768)] dst src (MVC [makeValAndOff(256, 512)] dst src (MVC [makeValAndOff(256, 256)] dst src (MVC [makeValAndOff(256, 0)] dst src mem))))
+
+// Move more than 1024 bytes using a loop.
+(Move [s] dst src mem) && s > 1024 && logLargeCopy(v, s) =>
+ (LoweredMove [s%256] dst src (ADD <src.Type> src (MOVDconst [(s/256)*256])) mem)
+
+// Lowering Zero instructions
+(Zero [0] _ mem) => mem
+(Zero [1] destptr mem) => (MOVBstoreconst [0] destptr mem)
+(Zero [2] destptr mem) => (MOVHstoreconst [0] destptr mem)
+(Zero [4] destptr mem) => (MOVWstoreconst [0] destptr mem)
+(Zero [8] destptr mem) => (MOVDstoreconst [0] destptr mem)
+(Zero [3] destptr mem) =>
+ (MOVBstoreconst [makeValAndOff(0,2)] destptr
+ (MOVHstoreconst [0] destptr mem))
+(Zero [5] destptr mem) =>
+ (MOVBstoreconst [makeValAndOff(0,4)] destptr
+ (MOVWstoreconst [0] destptr mem))
+(Zero [6] destptr mem) =>
+ (MOVHstoreconst [makeValAndOff(0,4)] destptr
+ (MOVWstoreconst [0] destptr mem))
+(Zero [7] destptr mem) =>
+ (MOVWstoreconst [makeValAndOff(0,3)] destptr
+ (MOVWstoreconst [0] destptr mem))
+
+(Zero [s] destptr mem) && s > 0 && s <= 1024 =>
+ (CLEAR [makeValAndOff(int32(s), 0)] destptr mem)
+
+// Zero more than 1024 bytes using a loop.
+(Zero [s] destptr mem) && s > 1024 =>
+ (LoweredZero [s%256] destptr (ADDconst <destptr.Type> destptr [(int32(s)/256)*256]) mem)
+
+// Lowering constants
+(Const(64|32|16|8) [val]) => (MOVDconst [int64(val)])
+(Const(32|64)F ...) => (FMOV(S|D)const ...)
+(ConstNil) => (MOVDconst [0])
+(ConstBool [t]) => (MOVDconst [b2i(t)])
+
+// Lowering calls
+(StaticCall ...) => (CALLstatic ...)
+(ClosureCall ...) => (CALLclosure ...)
+(InterCall ...) => (CALLinter ...)
+(TailCall ...) => (CALLtail ...)
+
+// Miscellaneous
+(IsNonNil p) => (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPconst p [0]))
+(IsInBounds idx len) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPU idx len))
+(IsSliceInBounds idx len) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPU idx len))
+(NilCheck ...) => (LoweredNilCheck ...)
+(GetG ...) => (LoweredGetG ...)
+(GetClosurePtr ...) => (LoweredGetClosurePtr ...)
+(GetCallerSP ...) => (LoweredGetCallerSP ...)
+(GetCallerPC ...) => (LoweredGetCallerPC ...)
+(Addr {sym} base) => (MOVDaddr {sym} base)
+(LocalAddr {sym} base _) => (MOVDaddr {sym} base)
+(ITab (Load ptr mem)) => (MOVDload ptr mem)
+
+// block rewrites
+(If cond yes no) => (CLIJ {s390x.LessOrGreater} (MOVBZreg <typ.Bool> cond) [0] yes no)
+
+// Write barrier.
+(WB ...) => (LoweredWB ...)
+
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem)
+
+// ***************************
+// Above: lowering rules
+// Below: optimizations
+// ***************************
+// TODO: Should the optimizations be a separate pass?
+
+// Note on removing unnecessary sign/zero extensions:
+//
+// After a value is spilled it is restored using a sign- or zero-extension
+// to register-width as appropriate for its type. For example, a uint8 will
+// be restored using a MOVBZ (llgc) instruction which will zero extend the
+// 8-bit value to 64-bits.
+//
+// This is a hazard when folding sign- and zero-extensions since we need to
+// ensure not only that the value in the argument register is correctly
+// extended but also that it will still be correctly extended if it is
+// spilled and restored.
+//
+// In general this means we need type checks when the RHS of a rule is an
+// OpCopy (i.e. "(... x:(...) ...) => x").
+
+// Merge double extensions.
+(MOV(H|HZ)reg e:(MOV(B|BZ)reg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x)
+(MOV(W|WZ)reg e:(MOV(B|BZ)reg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x)
+(MOV(W|WZ)reg e:(MOV(H|HZ)reg x)) && clobberIfDead(e) => (MOV(H|HZ)reg x)
+
+// Bypass redundant sign extensions.
+(MOV(B|BZ)reg e:(MOVBreg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x)
+(MOV(B|BZ)reg e:(MOVHreg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x)
+(MOV(B|BZ)reg e:(MOVWreg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x)
+(MOV(H|HZ)reg e:(MOVHreg x)) && clobberIfDead(e) => (MOV(H|HZ)reg x)
+(MOV(H|HZ)reg e:(MOVWreg x)) && clobberIfDead(e) => (MOV(H|HZ)reg x)
+(MOV(W|WZ)reg e:(MOVWreg x)) && clobberIfDead(e) => (MOV(W|WZ)reg x)
+
+// Bypass redundant zero extensions.
+(MOV(B|BZ)reg e:(MOVBZreg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x)
+(MOV(B|BZ)reg e:(MOVHZreg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x)
+(MOV(B|BZ)reg e:(MOVWZreg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x)
+(MOV(H|HZ)reg e:(MOVHZreg x)) && clobberIfDead(e) => (MOV(H|HZ)reg x)
+(MOV(H|HZ)reg e:(MOVWZreg x)) && clobberIfDead(e) => (MOV(H|HZ)reg x)
+(MOV(W|WZ)reg e:(MOVWZreg x)) && clobberIfDead(e) => (MOV(W|WZ)reg x)
+
+// Remove zero extensions after zero extending load.
+// Note: take care that if x is spilled it is restored correctly.
+(MOV(B|H|W)Zreg x:(MOVBZload _ _)) && (!x.Type.IsSigned() || x.Type.Size() > 1) => x
+(MOV(H|W)Zreg x:(MOVHZload _ _)) && (!x.Type.IsSigned() || x.Type.Size() > 2) => x
+(MOVWZreg x:(MOVWZload _ _)) && (!x.Type.IsSigned() || x.Type.Size() > 4) => x
+
+// Remove sign extensions after sign extending load.
+// Note: take care that if x is spilled it is restored correctly.
+(MOV(B|H|W)reg x:(MOVBload _ _)) && (x.Type.IsSigned() || x.Type.Size() == 8) => x
+(MOV(H|W)reg x:(MOVHload _ _)) && (x.Type.IsSigned() || x.Type.Size() == 8) => x
+(MOVWreg x:(MOVWload _ _)) && (x.Type.IsSigned() || x.Type.Size() == 8) => x
+
+// Remove sign extensions after zero extending load.
+// These type checks are probably unnecessary but do them anyway just in case.
+(MOV(H|W)reg x:(MOVBZload _ _)) && (!x.Type.IsSigned() || x.Type.Size() > 1) => x
+(MOVWreg x:(MOVHZload _ _)) && (!x.Type.IsSigned() || x.Type.Size() > 2) => x
+
+// Fold sign and zero extensions into loads.
+//
+// Note: The combined instruction must end up in the same block
+// as the original load. If not, we end up making a value with
+// memory type live in two different blocks, which can lead to
+// multiple memory values alive simultaneously.
+//
+// Make sure we don't combine these ops if the load has another use.
+// This prevents a single load from being split into multiple loads
+// which then might return different values. See test/atomicload.go.
+(MOV(B|H|W)Zreg <t> x:(MOV(B|H|W)load [o] {s} p mem))
+ && x.Uses == 1
+ && clobber(x)
+ => @x.Block (MOV(B|H|W)Zload <t> [o] {s} p mem)
+(MOV(B|H|W)reg <t> x:(MOV(B|H|W)Zload [o] {s} p mem))
+ && x.Uses == 1
+ && clobber(x)
+ => @x.Block (MOV(B|H|W)load <t> [o] {s} p mem)
+
+// Remove zero extensions after argument load.
+(MOVBZreg x:(Arg <t>)) && !t.IsSigned() && t.Size() == 1 => x
+(MOVHZreg x:(Arg <t>)) && !t.IsSigned() && t.Size() <= 2 => x
+(MOVWZreg x:(Arg <t>)) && !t.IsSigned() && t.Size() <= 4 => x
+
+// Remove sign extensions after argument load.
+(MOVBreg x:(Arg <t>)) && t.IsSigned() && t.Size() == 1 => x
+(MOVHreg x:(Arg <t>)) && t.IsSigned() && t.Size() <= 2 => x
+(MOVWreg x:(Arg <t>)) && t.IsSigned() && t.Size() <= 4 => x
+
+// Fold zero extensions into constants.
+(MOVBZreg (MOVDconst [c])) => (MOVDconst [int64( uint8(c))])
+(MOVHZreg (MOVDconst [c])) => (MOVDconst [int64(uint16(c))])
+(MOVWZreg (MOVDconst [c])) => (MOVDconst [int64(uint32(c))])
+
+// Fold sign extensions into constants.
+(MOVBreg (MOVDconst [c])) => (MOVDconst [int64( int8(c))])
+(MOVHreg (MOVDconst [c])) => (MOVDconst [int64(int16(c))])
+(MOVWreg (MOVDconst [c])) => (MOVDconst [int64(int32(c))])
+
+// Remove zero extension of conditional move.
+// Note: only for MOVBZreg for now since it is added as part of 'if' statement lowering.
+(MOVBZreg x:(LOCGR (MOVDconst [c]) (MOVDconst [d]) _))
+ && int64(uint8(c)) == c
+ && int64(uint8(d)) == d
+ && (!x.Type.IsSigned() || x.Type.Size() > 1)
+ => x
+
+// Fold boolean tests into blocks.
+// Note: this must match If statement lowering.
+(CLIJ {s390x.LessOrGreater} (LOCGR {d} (MOVDconst [0]) (MOVDconst [x]) cmp) [0] yes no)
+ && int32(x) != 0
+ => (BRC {d} cmp yes no)
+
+// Canonicalize BRC condition code mask by removing impossible conditions.
+// Integer comparisons cannot generate the unordered condition.
+(BRC {c} x:((CMP|CMPW|CMPU|CMPWU) _ _) yes no) && c&s390x.Unordered != 0 => (BRC {c&^s390x.Unordered} x yes no)
+(BRC {c} x:((CMP|CMPW|CMPU|CMPWU)const _) yes no) && c&s390x.Unordered != 0 => (BRC {c&^s390x.Unordered} x yes no)
+
+// Compare-and-branch.
+// Note: bit 3 (unordered) must not be set so we mask out s390x.Unordered.
+(BRC {c} (CMP x y) yes no) => (CGRJ {c&^s390x.Unordered} x y yes no)
+(BRC {c} (CMPW x y) yes no) => (CRJ {c&^s390x.Unordered} x y yes no)
+(BRC {c} (CMPU x y) yes no) => (CLGRJ {c&^s390x.Unordered} x y yes no)
+(BRC {c} (CMPWU x y) yes no) => (CLRJ {c&^s390x.Unordered} x y yes no)
+
+// Compare-and-branch (immediate).
+// Note: bit 3 (unordered) must not be set so we mask out s390x.Unordered.
+(BRC {c} (CMPconst x [y]) yes no) && y == int32( int8(y)) => (CGIJ {c&^s390x.Unordered} x [ int8(y)] yes no)
+(BRC {c} (CMPWconst x [y]) yes no) && y == int32( int8(y)) => (CIJ {c&^s390x.Unordered} x [ int8(y)] yes no)
+(BRC {c} (CMPUconst x [y]) yes no) && y == int32(uint8(y)) => (CLGIJ {c&^s390x.Unordered} x [uint8(y)] yes no)
+(BRC {c} (CMPWUconst x [y]) yes no) && y == int32(uint8(y)) => (CLIJ {c&^s390x.Unordered} x [uint8(y)] yes no)
+
+// Absorb immediate into compare-and-branch.
+(C(R|GR)J {c} x (MOVDconst [y]) yes no) && is8Bit(y) => (C(I|GI)J {c} x [ int8(y)] yes no)
+(CL(R|GR)J {c} x (MOVDconst [y]) yes no) && isU8Bit(y) => (CL(I|GI)J {c} x [uint8(y)] yes no)
+(C(R|GR)J {c} (MOVDconst [x]) y yes no) && is8Bit(x) => (C(I|GI)J {c.ReverseComparison()} y [ int8(x)] yes no)
+(CL(R|GR)J {c} (MOVDconst [x]) y yes no) && isU8Bit(x) => (CL(I|GI)J {c.ReverseComparison()} y [uint8(x)] yes no)
+
+// Prefer comparison with immediate to compare-and-branch.
+(CGRJ {c} x (MOVDconst [y]) yes no) && !is8Bit(y) && is32Bit(y) => (BRC {c} (CMPconst x [int32(y)]) yes no)
+(CRJ {c} x (MOVDconst [y]) yes no) && !is8Bit(y) && is32Bit(y) => (BRC {c} (CMPWconst x [int32(y)]) yes no)
+(CLGRJ {c} x (MOVDconst [y]) yes no) && !isU8Bit(y) && isU32Bit(y) => (BRC {c} (CMPUconst x [int32(y)]) yes no)
+(CLRJ {c} x (MOVDconst [y]) yes no) && !isU8Bit(y) && isU32Bit(y) => (BRC {c} (CMPWUconst x [int32(y)]) yes no)
+(CGRJ {c} (MOVDconst [x]) y yes no) && !is8Bit(x) && is32Bit(x) => (BRC {c.ReverseComparison()} (CMPconst y [int32(x)]) yes no)
+(CRJ {c} (MOVDconst [x]) y yes no) && !is8Bit(x) && is32Bit(x) => (BRC {c.ReverseComparison()} (CMPWconst y [int32(x)]) yes no)
+(CLGRJ {c} (MOVDconst [x]) y yes no) && !isU8Bit(x) && isU32Bit(x) => (BRC {c.ReverseComparison()} (CMPUconst y [int32(x)]) yes no)
+(CLRJ {c} (MOVDconst [x]) y yes no) && !isU8Bit(x) && isU32Bit(x) => (BRC {c.ReverseComparison()} (CMPWUconst y [int32(x)]) yes no)
+
+// Absorb sign/zero extensions into 32-bit compare-and-branch.
+(CIJ {c} (MOV(W|WZ)reg x) [y] yes no) => (CIJ {c} x [y] yes no)
+(CLIJ {c} (MOV(W|WZ)reg x) [y] yes no) => (CLIJ {c} x [y] yes no)
+
+// Bring out-of-range signed immediates into range by varying branch condition.
+(BRC {s390x.Less} (CMPconst x [ 128]) yes no) => (CGIJ {s390x.LessOrEqual} x [ 127] yes no)
+(BRC {s390x.Less} (CMPWconst x [ 128]) yes no) => (CIJ {s390x.LessOrEqual} x [ 127] yes no)
+(BRC {s390x.LessOrEqual} (CMPconst x [-129]) yes no) => (CGIJ {s390x.Less} x [-128] yes no)
+(BRC {s390x.LessOrEqual} (CMPWconst x [-129]) yes no) => (CIJ {s390x.Less} x [-128] yes no)
+(BRC {s390x.Greater} (CMPconst x [-129]) yes no) => (CGIJ {s390x.GreaterOrEqual} x [-128] yes no)
+(BRC {s390x.Greater} (CMPWconst x [-129]) yes no) => (CIJ {s390x.GreaterOrEqual} x [-128] yes no)
+(BRC {s390x.GreaterOrEqual} (CMPconst x [ 128]) yes no) => (CGIJ {s390x.Greater} x [ 127] yes no)
+(BRC {s390x.GreaterOrEqual} (CMPWconst x [ 128]) yes no) => (CIJ {s390x.Greater} x [ 127] yes no)
+
+// Bring out-of-range unsigned immediates into range by varying branch condition.
+(BRC {s390x.Less} (CMP(WU|U)const x [256]) yes no) => (C(L|LG)IJ {s390x.LessOrEqual} x [255] yes no)
+(BRC {s390x.GreaterOrEqual} (CMP(WU|U)const x [256]) yes no) => (C(L|LG)IJ {s390x.Greater} x [255] yes no)
+
+// Bring out-of-range immediates into range by switching signedness (only == and !=).
+(BRC {c} (CMPconst x [y]) yes no) && y == int32(uint8(y)) && (c == s390x.Equal || c == s390x.LessOrGreater) => (CLGIJ {c} x [uint8(y)] yes no)
+(BRC {c} (CMPWconst x [y]) yes no) && y == int32(uint8(y)) && (c == s390x.Equal || c == s390x.LessOrGreater) => (CLIJ {c} x [uint8(y)] yes no)
+(BRC {c} (CMPUconst x [y]) yes no) && y == int32( int8(y)) && (c == s390x.Equal || c == s390x.LessOrGreater) => (CGIJ {c} x [ int8(y)] yes no)
+(BRC {c} (CMPWUconst x [y]) yes no) && y == int32( int8(y)) && (c == s390x.Equal || c == s390x.LessOrGreater) => (CIJ {c} x [ int8(y)] yes no)
+
+// Fold constants into instructions.
+(ADD x (MOVDconst [c])) && is32Bit(c) => (ADDconst [int32(c)] x)
+(ADDW x (MOVDconst [c])) => (ADDWconst [int32(c)] x)
+
+(SUB x (MOVDconst [c])) && is32Bit(c) => (SUBconst x [int32(c)])
+(SUB (MOVDconst [c]) x) && is32Bit(c) => (NEG (SUBconst <v.Type> x [int32(c)]))
+(SUBW x (MOVDconst [c])) => (SUBWconst x [int32(c)])
+(SUBW (MOVDconst [c]) x) => (NEGW (SUBWconst <v.Type> x [int32(c)]))
+
+(MULLD x (MOVDconst [c])) && is32Bit(c) => (MULLDconst [int32(c)] x)
+(MULLW x (MOVDconst [c])) => (MULLWconst [int32(c)] x)
+
+// NILF instructions leave the high 32 bits unchanged which is
+// equivalent to the leftmost 32 bits being set.
+// TODO(mundaym): modify the assembler to accept 64-bit values
+// and use isU32Bit(^c).
+(AND x (MOVDconst [c]))
+ && s390x.NewRotateParams(0, 63, 0).OutMerge(uint64(c)) != nil
+ => (RISBGZ x {*s390x.NewRotateParams(0, 63, 0).OutMerge(uint64(c))})
+(AND x (MOVDconst [c]))
+ && is32Bit(c)
+ && c < 0
+ => (ANDconst [c] x)
+(AND x (MOVDconst [c]))
+ && is32Bit(c)
+ && c >= 0
+ => (MOVWZreg (ANDWconst <typ.UInt32> [int32(c)] x))
+
+(ANDW x (MOVDconst [c])) => (ANDWconst [int32(c)] x)
+
+((AND|ANDW)const [c] ((AND|ANDW)const [d] x)) => ((AND|ANDW)const [c&d] x)
+
+((OR|XOR) x (MOVDconst [c])) && isU32Bit(c) => ((OR|XOR)const [c] x)
+((OR|XOR)W x (MOVDconst [c])) => ((OR|XOR)Wconst [int32(c)] x)
+
+// Constant shifts.
+(S(LD|RD|RAD) x (MOVDconst [c])) => (S(LD|RD|RAD)const x [uint8(c&63)])
+(S(LW|RW|RAW) x (MOVDconst [c])) && c&32 == 0 => (S(LW|RW|RAW)const x [uint8(c&31)])
+(S(LW|RW) _ (MOVDconst [c])) && c&32 != 0 => (MOVDconst [0])
+(SRAW x (MOVDconst [c])) && c&32 != 0 => (SRAWconst x [31])
+
+// Shifts only use the rightmost 6 bits of the shift value.
+(S(LD|RD|RAD|LW|RW|RAW) x (RISBGZ y {r}))
+ && r.Amount == 0
+ && r.OutMask()&63 == 63
+ => (S(LD|RD|RAD|LW|RW|RAW) x y)
+(S(LD|RD|RAD|LW|RW|RAW) x (AND (MOVDconst [c]) y))
+ => (S(LD|RD|RAD|LW|RW|RAW) x (ANDWconst <typ.UInt32> [int32(c&63)] y))
+(S(LD|RD|RAD|LW|RW|RAW) x (ANDWconst [c] y)) && c&63 == 63
+ => (S(LD|RD|RAD|LW|RW|RAW) x y)
+(SLD x (MOV(W|H|B|WZ|HZ|BZ)reg y)) => (SLD x y)
+(SRD x (MOV(W|H|B|WZ|HZ|BZ)reg y)) => (SRD x y)
+(SRAD x (MOV(W|H|B|WZ|HZ|BZ)reg y)) => (SRAD x y)
+(SLW x (MOV(W|H|B|WZ|HZ|BZ)reg y)) => (SLW x y)
+(SRW x (MOV(W|H|B|WZ|HZ|BZ)reg y)) => (SRW x y)
+(SRAW x (MOV(W|H|B|WZ|HZ|BZ)reg y)) => (SRAW x y)
+
+// Match rotate by constant.
+(RLLG x (MOVDconst [c])) => (RISBGZ x {s390x.NewRotateParams(0, 63, uint8(c&63))})
+(RLL x (MOVDconst [c])) => (RLLconst x [uint8(c&31)])
+
+// Signed 64-bit comparison with immediate.
+(CMP x (MOVDconst [c])) && is32Bit(c) => (CMPconst x [int32(c)])
+(CMP (MOVDconst [c]) x) && is32Bit(c) => (InvertFlags (CMPconst x [int32(c)]))
+
+// Unsigned 64-bit comparison with immediate.
+(CMPU x (MOVDconst [c])) && isU32Bit(c) => (CMPUconst x [int32(c)])
+(CMPU (MOVDconst [c]) x) && isU32Bit(c) => (InvertFlags (CMPUconst x [int32(c)]))
+
+// Signed and unsigned 32-bit comparison with immediate.
+(CMP(W|WU) x (MOVDconst [c])) => (CMP(W|WU)const x [int32(c)])
+(CMP(W|WU) (MOVDconst [c]) x) => (InvertFlags (CMP(W|WU)const x [int32(c)]))
+
+// Match (x >> c) << d to 'rotate then insert selected bits [into zero]'.
+(SLDconst (SRDconst x [c]) [d]) => (RISBGZ x {s390x.NewRotateParams(uint8(max8(0, int8(c-d))), 63-d, uint8(int8(d-c)&63))})
+
+// Match (x << c) >> d to 'rotate then insert selected bits [into zero]'.
+(SRDconst (SLDconst x [c]) [d]) => (RISBGZ x {s390x.NewRotateParams(d, uint8(min8(63, int8(63-c+d))), uint8(int8(c-d)&63))})
+
+// Absorb input zero extension into 'rotate then insert selected bits [into zero]'.
+(RISBGZ (MOVWZreg x) {r}) && r.InMerge(0xffffffff) != nil => (RISBGZ x {*r.InMerge(0xffffffff)})
+(RISBGZ (MOVHZreg x) {r}) && r.InMerge(0x0000ffff) != nil => (RISBGZ x {*r.InMerge(0x0000ffff)})
+(RISBGZ (MOVBZreg x) {r}) && r.InMerge(0x000000ff) != nil => (RISBGZ x {*r.InMerge(0x000000ff)})
+
+// Absorb 'rotate then insert selected bits [into zero]' into zero extension.
+(MOVWZreg (RISBGZ x {r})) && r.OutMerge(0xffffffff) != nil => (RISBGZ x {*r.OutMerge(0xffffffff)})
+(MOVHZreg (RISBGZ x {r})) && r.OutMerge(0x0000ffff) != nil => (RISBGZ x {*r.OutMerge(0x0000ffff)})
+(MOVBZreg (RISBGZ x {r})) && r.OutMerge(0x000000ff) != nil => (RISBGZ x {*r.OutMerge(0x000000ff)})
+
+// Absorb shift into 'rotate then insert selected bits [into zero]'.
+//
+// Any unsigned shift can be represented as a rotate and mask operation:
+//
+// x << c => RotateLeft64(x, c) & (^uint64(0) << c)
+// x >> c => RotateLeft64(x, -c) & (^uint64(0) >> c)
+//
+// Therefore when a shift is used as the input to a rotate then insert
+// selected bits instruction we can merge the two together. We just have
+// to be careful that the resultant mask is representable (non-zero and
+// contiguous). For example, assuming that x is variable and c, y and m
+// are constants, a shift followed by a rotate then insert selected bits
+// could be represented as:
+//
+// RotateLeft64(RotateLeft64(x, c) & (^uint64(0) << c), y) & m
+//
+// We can split the rotation by y into two, one rotate for x and one for
+// the mask:
+//
+// RotateLeft64(RotateLeft64(x, c), y) & (RotateLeft64(^uint64(0) << c, y)) & m
+//
+// The rotations of x by c followed by y can then be combined:
+//
+// RotateLeft64(x, c+y) & (RotateLeft64(^uint64(0) << c, y)) & m
+// ^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+// rotate mask
+//
+// To perform this optimization we therefore just need to check that it
+// is valid to merge the shift mask (^uint64(0) << c) into the selected
+// bits mask (i.e. that the resultant mask is non-zero and contiguous).
+//
+(RISBGZ (SLDconst x [c]) {r}) && r.InMerge(^uint64(0)<<c) != nil => (RISBGZ x {(*r.InMerge(^uint64(0)<<c)).RotateLeft(c)})
+(RISBGZ (SRDconst x [c]) {r}) && r.InMerge(^uint64(0)>>c) != nil => (RISBGZ x {(*r.InMerge(^uint64(0)>>c)).RotateLeft(-c)})
+
+// Absorb 'rotate then insert selected bits [into zero]' into left shift.
+(SLDconst (RISBGZ x {r}) [c])
+ && s390x.NewRotateParams(0, 63-c, c).InMerge(r.OutMask()) != nil
+ => (RISBGZ x {(*s390x.NewRotateParams(0, 63-c, c).InMerge(r.OutMask())).RotateLeft(r.Amount)})
+
+// Absorb 'rotate then insert selected bits [into zero]' into right shift.
+(SRDconst (RISBGZ x {r}) [c])
+ && s390x.NewRotateParams(c, 63, -c&63).InMerge(r.OutMask()) != nil
+ => (RISBGZ x {(*s390x.NewRotateParams(c, 63, -c&63).InMerge(r.OutMask())).RotateLeft(r.Amount)})
+
+// Merge 'rotate then insert selected bits [into zero]' instructions together.
+(RISBGZ (RISBGZ x {y}) {z})
+ && z.InMerge(y.OutMask()) != nil
+ => (RISBGZ x {(*z.InMerge(y.OutMask())).RotateLeft(y.Amount)})
+
+// Convert RISBGZ into 64-bit shift (helps CSE).
+(RISBGZ x {r}) && r.End == 63 && r.Start == -r.Amount&63 => (SRDconst x [-r.Amount&63])
+(RISBGZ x {r}) && r.Start == 0 && r.End == 63-r.Amount => (SLDconst x [r.Amount])
+
+// Optimize single bit isolation when it is known to be equivalent to
+// the most significant bit due to mask produced by arithmetic shift.
+// Simply isolate the most significant bit itself and place it in the
+// correct position.
+//
+// Example: (int64(x) >> 63) & 0x8 -> RISBGZ $60, $60, $4, Rsrc, Rdst
+(RISBGZ (SRADconst x [c]) {r})
+ && r.Start == r.End // single bit selected
+ && (r.Start+r.Amount)&63 <= c // equivalent to most significant bit of x
+ => (RISBGZ x {s390x.NewRotateParams(r.Start, r.Start, -r.Start&63)})
+
+// Canonicalize the order of arguments to comparisons - helps with CSE.
+((CMP|CMPW|CMPU|CMPWU) x y) && canonLessThan(x,y) => (InvertFlags ((CMP|CMPW|CMPU|CMPWU) y x))
+
+// Use zero extension instead of RISBGZ.
+(RISBGZ x {r}) && r == s390x.NewRotateParams(56, 63, 0) => (MOVBZreg x)
+(RISBGZ x {r}) && r == s390x.NewRotateParams(48, 63, 0) => (MOVHZreg x)
+(RISBGZ x {r}) && r == s390x.NewRotateParams(32, 63, 0) => (MOVWZreg x)
+
+// Use zero extension instead of ANDWconst.
+(ANDWconst [0x00ff] x) => (MOVBZreg x)
+(ANDWconst [0xffff] x) => (MOVHZreg x)
+
+// Strength reduce multiplication to the sum (or difference) of two powers of two.
+//
+// Examples:
+// 5x -> 4x + 1x
+// 10x -> 8x + 2x
+// 120x -> 128x - 8x
+// -120x -> 8x - 128x
+//
+// We know that the rightmost set bit of any non-negative value, once isolated, must
+// either be a power of 2 (because it is a single bit) or 0 (if the original value is 0).
+// In all of these rules we use a rightmost bit calculation to determine one operand
+// for the addition or subtraction. We then just need to calculate if the other
+// operand is a valid power of 2 before we can match the rule.
+//
+// Notes:
+// - the generic rules have already matched single powers of two so we ignore them here
+// - isPowerOfTwo32 asserts that its argument is greater than 0
+// - c&(c-1) = clear rightmost set bit
+// - c&^(c-1) = isolate rightmost set bit
+
+// c = 2ˣ + 2ʸ => c - 2ˣ = 2ʸ
+(MULL(D|W)const <t> x [c]) && isPowerOfTwo32(c&(c-1))
+ => ((ADD|ADDW) (SL(D|W)const <t> x [uint8(log32(c&(c-1)))])
+ (SL(D|W)const <t> x [uint8(log32(c&^(c-1)))]))
+
+// c = 2ʸ - 2ˣ => c + 2ˣ = 2ʸ
+(MULL(D|W)const <t> x [c]) && isPowerOfTwo32(c+(c&^(c-1)))
+ => ((SUB|SUBW) (SL(D|W)const <t> x [uint8(log32(c+(c&^(c-1))))])
+ (SL(D|W)const <t> x [uint8(log32(c&^(c-1)))]))
+
+// c = 2ˣ - 2ʸ => -c + 2ˣ = 2ʸ
+(MULL(D|W)const <t> x [c]) && isPowerOfTwo32(-c+(-c&^(-c-1)))
+ => ((SUB|SUBW) (SL(D|W)const <t> x [uint8(log32(-c&^(-c-1)))])
+ (SL(D|W)const <t> x [uint8(log32(-c+(-c&^(-c-1))))]))
+
+// Fold ADD into MOVDaddr. Odd offsets from SB shouldn't be folded (LARL can't handle them).
+(ADDconst [c] (MOVDaddr [d] {s} x:(SB))) && ((c+d)&1 == 0) && is32Bit(int64(c)+int64(d)) => (MOVDaddr [c+d] {s} x)
+(ADDconst [c] (MOVDaddr [d] {s} x)) && x.Op != OpSB && is20Bit(int64(c)+int64(d)) => (MOVDaddr [c+d] {s} x)
+(ADD idx (MOVDaddr [c] {s} ptr)) && ptr.Op != OpSB => (MOVDaddridx [c] {s} ptr idx)
+
+// fold ADDconst into MOVDaddridx
+(ADDconst [c] (MOVDaddridx [d] {s} x y)) && is20Bit(int64(c)+int64(d)) => (MOVDaddridx [c+d] {s} x y)
+(MOVDaddridx [c] {s} (ADDconst [d] x) y) && is20Bit(int64(c)+int64(d)) => (MOVDaddridx [c+d] {s} x y)
+(MOVDaddridx [c] {s} x (ADDconst [d] y)) && is20Bit(int64(c)+int64(d)) => (MOVDaddridx [c+d] {s} x y)
+
+// reverse ordering of compare instruction
+(LOCGR {c} x y (InvertFlags cmp)) => (LOCGR {c.ReverseComparison()} x y cmp)
+
+// replace load from same location as preceding store with copy
+(MOVDload [off] {sym} ptr1 (MOVDstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => x
+(MOVWload [off] {sym} ptr1 (MOVWstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (MOVWreg x)
+(MOVHload [off] {sym} ptr1 (MOVHstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (MOVHreg x)
+(MOVBload [off] {sym} ptr1 (MOVBstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (MOVBreg x)
+(MOVWZload [off] {sym} ptr1 (MOVWstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (MOVWZreg x)
+(MOVHZload [off] {sym} ptr1 (MOVHstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (MOVHZreg x)
+(MOVBZload [off] {sym} ptr1 (MOVBstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (MOVBZreg x)
+(MOVDload [off] {sym} ptr1 (FMOVDstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (LGDR x)
+(FMOVDload [off] {sym} ptr1 (MOVDstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (LDGR x)
+(FMOVDload [off] {sym} ptr1 (FMOVDstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => x
+(FMOVSload [off] {sym} ptr1 (FMOVSstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => x
+
+// prefer FPR <-> GPR moves over combined load ops
+(MULLDload <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) => (MULLD x (LGDR <t> y))
+(ADDload <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) => (ADD x (LGDR <t> y))
+(SUBload <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) => (SUB x (LGDR <t> y))
+(ORload <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) => (OR x (LGDR <t> y))
+(ANDload <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) => (AND x (LGDR <t> y))
+(XORload <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) => (XOR x (LGDR <t> y))
+
+// detect attempts to set/clear the sign bit
+// may need to be reworked when NIHH/OIHH are added
+(RISBGZ (LGDR <t> x) {r}) && r == s390x.NewRotateParams(1, 63, 0) => (LGDR <t> (LPDFR <x.Type> x))
+(LDGR <t> (RISBGZ x {r})) && r == s390x.NewRotateParams(1, 63, 0) => (LPDFR (LDGR <t> x))
+(OR (MOVDconst [-1<<63]) (LGDR <t> x)) => (LGDR <t> (LNDFR <x.Type> x))
+(LDGR <t> (OR (MOVDconst [-1<<63]) x)) => (LNDFR (LDGR <t> x))
+
+// detect attempts to set the sign bit with load
+(LDGR <t> x:(ORload <t1> [off] {sym} (MOVDconst [-1<<63]) ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (LNDFR <t> (LDGR <t> (MOVDload <t1> [off] {sym} ptr mem)))
+
+// detect copysign
+(OR (RISBGZ (LGDR x) {r}) (LGDR (LPDFR <t> y)))
+ && r == s390x.NewRotateParams(0, 0, 0)
+ => (LGDR (CPSDR <t> y x))
+(OR (RISBGZ (LGDR x) {r}) (MOVDconst [c]))
+ && c >= 0
+ && r == s390x.NewRotateParams(0, 0, 0)
+ => (LGDR (CPSDR <x.Type> (FMOVDconst <x.Type> [math.Float64frombits(uint64(c))]) x))
+(CPSDR y (FMOVDconst [c])) && !math.Signbit(c) => (LPDFR y)
+(CPSDR y (FMOVDconst [c])) && math.Signbit(c) => (LNDFR y)
+
+// absorb negations into set/clear sign bit
+(FNEG (LPDFR x)) => (LNDFR x)
+(FNEG (LNDFR x)) => (LPDFR x)
+(FNEGS (LPDFR x)) => (LNDFR x)
+(FNEGS (LNDFR x)) => (LPDFR x)
+
+// no need to convert float32 to float64 to set/clear sign bit
+(LEDBR (LPDFR (LDEBR x))) => (LPDFR x)
+(LEDBR (LNDFR (LDEBR x))) => (LNDFR x)
+
+// remove unnecessary FPR <-> GPR moves
+(LDGR (LGDR x)) => x
+(LGDR (LDGR x)) => x
+
+// Don't extend before storing
+(MOVWstore [off] {sym} ptr (MOVWreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWZreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHZreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBZreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+
+// Fold constants into memory operations.
+// Note that this is not always a good idea because if not all the uses of
+// the ADDconst get eliminated, we still have to compute the ADDconst and we now
+// have potentially two live values (ptr and (ADDconst [off] ptr)) instead of one.
+// Nevertheless, let's do it!
+(MOVDload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (MOVDload [off1+off2] {sym} ptr mem)
+(MOVWload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (MOVWload [off1+off2] {sym} ptr mem)
+(MOVHload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (MOVHload [off1+off2] {sym} ptr mem)
+(MOVBload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (MOVBload [off1+off2] {sym} ptr mem)
+(MOVWZload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (MOVWZload [off1+off2] {sym} ptr mem)
+(MOVHZload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (MOVHZload [off1+off2] {sym} ptr mem)
+(MOVBZload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (MOVBZload [off1+off2] {sym} ptr mem)
+(FMOVSload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (FMOVSload [off1+off2] {sym} ptr mem)
+(FMOVDload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (FMOVDload [off1+off2] {sym} ptr mem)
+
+(MOVDstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(int64(off1)+int64(off2)) => (MOVDstore [off1+off2] {sym} ptr val mem)
+(MOVWstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(int64(off1)+int64(off2)) => (MOVWstore [off1+off2] {sym} ptr val mem)
+(MOVHstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(int64(off1)+int64(off2)) => (MOVHstore [off1+off2] {sym} ptr val mem)
+(MOVBstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(int64(off1)+int64(off2)) => (MOVBstore [off1+off2] {sym} ptr val mem)
+(FMOVSstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(int64(off1)+int64(off2)) => (FMOVSstore [off1+off2] {sym} ptr val mem)
+(FMOVDstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(int64(off1)+int64(off2)) => (FMOVDstore [off1+off2] {sym} ptr val mem)
+
+(ADDload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (ADDload [off1+off2] {sym} x ptr mem)
+(ADDWload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (ADDWload [off1+off2] {sym} x ptr mem)
+(MULLDload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (MULLDload [off1+off2] {sym} x ptr mem)
+(MULLWload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (MULLWload [off1+off2] {sym} x ptr mem)
+(SUBload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (SUBload [off1+off2] {sym} x ptr mem)
+(SUBWload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (SUBWload [off1+off2] {sym} x ptr mem)
+
+(ANDload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (ANDload [off1+off2] {sym} x ptr mem)
+(ANDWload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (ANDWload [off1+off2] {sym} x ptr mem)
+(ORload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (ORload [off1+off2] {sym} x ptr mem)
+(ORWload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (ORWload [off1+off2] {sym} x ptr mem)
+(XORload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (XORload [off1+off2] {sym} x ptr mem)
+(XORWload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (XORWload [off1+off2] {sym} x ptr mem)
+
+// Fold constants into stores.
+(MOVDstore [off] {sym} ptr (MOVDconst [c]) mem) && is16Bit(c) && isU12Bit(int64(off)) && ptr.Op != OpSB =>
+ (MOVDstoreconst [makeValAndOff(int32(c),off)] {sym} ptr mem)
+(MOVWstore [off] {sym} ptr (MOVDconst [c]) mem) && is16Bit(c) && isU12Bit(int64(off)) && ptr.Op != OpSB =>
+ (MOVWstoreconst [makeValAndOff(int32(c),off)] {sym} ptr mem)
+(MOVHstore [off] {sym} ptr (MOVDconst [c]) mem) && isU12Bit(int64(off)) && ptr.Op != OpSB =>
+ (MOVHstoreconst [makeValAndOff(int32(int16(c)),off)] {sym} ptr mem)
+(MOVBstore [off] {sym} ptr (MOVDconst [c]) mem) && is20Bit(int64(off)) && ptr.Op != OpSB =>
+ (MOVBstoreconst [makeValAndOff(int32(int8(c)),off)] {sym} ptr mem)
+
+// Fold address offsets into constant stores.
+(MOVDstoreconst [sc] {s} (ADDconst [off] ptr) mem) && isU12Bit(sc.Off64()+int64(off)) =>
+ (MOVDstoreconst [sc.addOffset32(off)] {s} ptr mem)
+(MOVWstoreconst [sc] {s} (ADDconst [off] ptr) mem) && isU12Bit(sc.Off64()+int64(off)) =>
+ (MOVWstoreconst [sc.addOffset32(off)] {s} ptr mem)
+(MOVHstoreconst [sc] {s} (ADDconst [off] ptr) mem) && isU12Bit(sc.Off64()+int64(off)) =>
+ (MOVHstoreconst [sc.addOffset32(off)] {s} ptr mem)
+(MOVBstoreconst [sc] {s} (ADDconst [off] ptr) mem) && is20Bit(sc.Off64()+int64(off)) =>
+ (MOVBstoreconst [sc.addOffset32(off)] {s} ptr mem)
+
+// Merge address calculations into loads and stores.
+// Offsets from SB must not be merged into unaligned memory accesses because
+// loads/stores using PC-relative addressing directly must be aligned to the
+// size of the target.
+(MOVDload [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%8 == 0 && (off1+off2)%8 == 0)) =>
+ (MOVDload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVWZload [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%4 == 0 && (off1+off2)%4 == 0)) =>
+ (MOVWZload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVHZload [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%2 == 0 && (off1+off2)%2 == 0)) =>
+ (MOVHZload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVBZload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVBZload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(FMOVSload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (FMOVSload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(FMOVDload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (FMOVDload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+
+(MOVWload [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%4 == 0 && (off1+off2)%4 == 0)) =>
+ (MOVWload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVHload [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%2 == 0 && (off1+off2)%2 == 0)) =>
+ (MOVHload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVBload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVBload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+
+(MOVDstore [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%8 == 0 && (off1+off2)%8 == 0)) =>
+ (MOVDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOVWstore [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%4 == 0 && (off1+off2)%4 == 0)) =>
+ (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOVHstore [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%2 == 0 && (off1+off2)%2 == 0)) =>
+ (MOVHstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOVBstore [off1] {sym1} (MOVDaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(FMOVSstore [off1] {sym1} (MOVDaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (FMOVSstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(FMOVDstore [off1] {sym1} (MOVDaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (FMOVDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+
+(ADDload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (ADDload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+(ADDWload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (ADDWload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+(MULLDload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (MULLDload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+(MULLWload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (MULLWload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+(SUBload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (SUBload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+(SUBWload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (SUBWload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+
+(ANDload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (ANDload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+(ANDWload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (ANDWload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+(ORload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (ORload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+(ORWload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (ORWload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+(XORload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (XORload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+(XORWload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (XORWload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+
+// Cannot store constant to SB directly (no 'move relative long immediate' instructions).
+(MOVDstoreconst [sc] {sym1} (MOVDaddr [off] {sym2} ptr) mem) && ptr.Op != OpSB && canMergeSym(sym1, sym2) && sc.canAdd32(off) =>
+ (MOVDstoreconst [sc.addOffset32(off)] {mergeSym(sym1, sym2)} ptr mem)
+(MOVWstoreconst [sc] {sym1} (MOVDaddr [off] {sym2} ptr) mem) && ptr.Op != OpSB && canMergeSym(sym1, sym2) && sc.canAdd32(off) =>
+ (MOVWstoreconst [sc.addOffset32(off)] {mergeSym(sym1, sym2)} ptr mem)
+(MOVHstoreconst [sc] {sym1} (MOVDaddr [off] {sym2} ptr) mem) && ptr.Op != OpSB && canMergeSym(sym1, sym2) && sc.canAdd32(off) =>
+ (MOVHstoreconst [sc.addOffset32(off)] {mergeSym(sym1, sym2)} ptr mem)
+(MOVBstoreconst [sc] {sym1} (MOVDaddr [off] {sym2} ptr) mem) && ptr.Op != OpSB && canMergeSym(sym1, sym2) && sc.canAdd32(off) =>
+ (MOVBstoreconst [sc.addOffset32(off)] {mergeSym(sym1, sym2)} ptr mem)
+
+// Fold MOVDaddr into MOVDaddridx.
+(MOVDaddridx [off1] {sym1} (MOVDaddr [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB =>
+ (MOVDaddridx [off1+off2] {mergeSym(sym1,sym2)} x y)
+(MOVDaddridx [off1] {sym1} x (MOVDaddr [off2] {sym2} y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && y.Op != OpSB =>
+ (MOVDaddridx [off1+off2] {mergeSym(sym1,sym2)} x y)
+
+// Absorb InvertFlags into branches.
+(BRC {c} (InvertFlags cmp) yes no) => (BRC {c.ReverseComparison()} cmp yes no)
+
+// Constant comparisons.
+(CMPconst (MOVDconst [x]) [y]) && x==int64(y) => (FlagEQ)
+(CMPconst (MOVDconst [x]) [y]) && x<int64(y) => (FlagLT)
+(CMPconst (MOVDconst [x]) [y]) && x>int64(y) => (FlagGT)
+(CMPUconst (MOVDconst [x]) [y]) && uint64(x)==uint64(y) => (FlagEQ)
+(CMPUconst (MOVDconst [x]) [y]) && uint64(x)<uint64(y) => (FlagLT)
+(CMPUconst (MOVDconst [x]) [y]) && uint64(x)>uint64(y) => (FlagGT)
+
+(CMPWconst (MOVDconst [x]) [y]) && int32(x)==int32(y) => (FlagEQ)
+(CMPWconst (MOVDconst [x]) [y]) && int32(x)<int32(y) => (FlagLT)
+(CMPWconst (MOVDconst [x]) [y]) && int32(x)>int32(y) => (FlagGT)
+(CMPWUconst (MOVDconst [x]) [y]) && uint32(x)==uint32(y) => (FlagEQ)
+(CMPWUconst (MOVDconst [x]) [y]) && uint32(x)<uint32(y) => (FlagLT)
+(CMPWUconst (MOVDconst [x]) [y]) && uint32(x)>uint32(y) => (FlagGT)
+
+(CMP(W|WU)const (MOVBZreg _) [c]) && 0xff < c => (FlagLT)
+(CMP(W|WU)const (MOVHZreg _) [c]) && 0xffff < c => (FlagLT)
+
+(CMPconst (SRDconst _ [c]) [n]) && c > 0 && n < 0 => (FlagGT)
+(CMPWconst (SRWconst _ [c]) [n]) && c > 0 && n < 0 => (FlagGT)
+
+(CMPUconst (SRDconst _ [c]) [n]) && c > 0 && c < 64 && (1<<uint(64-c)) <= uint64(n) => (FlagLT)
+(CMPWUconst (SRWconst _ [c]) [n]) && c > 0 && c < 32 && (1<<uint(32-c)) <= uint32(n) => (FlagLT)
+
+(CMPWconst (ANDWconst _ [m]) [n]) && int32(m) >= 0 && int32(m) < int32(n) => (FlagLT)
+(CMPWUconst (ANDWconst _ [m]) [n]) && uint32(m) < uint32(n) => (FlagLT)
+
+(CMPconst (RISBGZ x {r}) [c]) && c > 0 && r.OutMask() < uint64(c) => (FlagLT)
+(CMPUconst (RISBGZ x {r}) [c]) && r.OutMask() < uint64(uint32(c)) => (FlagLT)
+
+// Constant compare-and-branch with immediate.
+(CGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal != 0 && int64(x) == int64(y) => (First yes no)
+(CGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less != 0 && int64(x) < int64(y) => (First yes no)
+(CGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater != 0 && int64(x) > int64(y) => (First yes no)
+(CIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal != 0 && int32(x) == int32(y) => (First yes no)
+(CIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less != 0 && int32(x) < int32(y) => (First yes no)
+(CIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater != 0 && int32(x) > int32(y) => (First yes no)
+(CLGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal != 0 && uint64(x) == uint64(y) => (First yes no)
+(CLGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less != 0 && uint64(x) < uint64(y) => (First yes no)
+(CLGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater != 0 && uint64(x) > uint64(y) => (First yes no)
+(CLIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal != 0 && uint32(x) == uint32(y) => (First yes no)
+(CLIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less != 0 && uint32(x) < uint32(y) => (First yes no)
+(CLIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater != 0 && uint32(x) > uint32(y) => (First yes no)
+(CGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal == 0 && int64(x) == int64(y) => (First no yes)
+(CGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less == 0 && int64(x) < int64(y) => (First no yes)
+(CGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater == 0 && int64(x) > int64(y) => (First no yes)
+(CIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal == 0 && int32(x) == int32(y) => (First no yes)
+(CIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less == 0 && int32(x) < int32(y) => (First no yes)
+(CIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater == 0 && int32(x) > int32(y) => (First no yes)
+(CLGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal == 0 && uint64(x) == uint64(y) => (First no yes)
+(CLGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less == 0 && uint64(x) < uint64(y) => (First no yes)
+(CLGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater == 0 && uint64(x) > uint64(y) => (First no yes)
+(CLIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal == 0 && uint32(x) == uint32(y) => (First no yes)
+(CLIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less == 0 && uint32(x) < uint32(y) => (First no yes)
+(CLIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater == 0 && uint32(x) > uint32(y) => (First no yes)
+
+// Fold unsigned compare-and-branch against an immediate zero: an unsigned
+// value is always >= 0 and is never < 0, so the branch direction is constant.
+(C(L|LG)IJ {s390x.GreaterOrEqual} _ [0] yes no) => (First yes no)
+(C(L|LG)IJ {s390x.Less} _ [0] yes no) => (First no yes)
+
+// Constant compare-and-branch when operands match.
+(C(GR|R|LGR|LR)J {c} x y yes no) && x == y && c&s390x.Equal != 0 => (First yes no)
+(C(GR|R|LGR|LR)J {c} x y yes no) && x == y && c&s390x.Equal == 0 => (First no yes)
+
+// Convert 64-bit comparisons to 32-bit comparisons and signed comparisons
+// to unsigned comparisons.
+// Helps simplify constant comparison detection.
+(CM(P|PU)const (MOV(W|WZ)reg x) [c]) => (CMP(W|WU)const x [c])
+(CM(P|P|PU|PU)const x:(MOV(H|HZ|H|HZ)reg _) [c]) => (CMP(W|W|WU|WU)const x [c])
+(CM(P|P|PU|PU)const x:(MOV(B|BZ|B|BZ)reg _) [c]) => (CMP(W|W|WU|WU)const x [c])
+(CMPconst (MOV(WZ|W)reg x:(ANDWconst [m] _)) [c]) && int32(m) >= 0 && c >= 0 => (CMPWUconst x [c])
+(CMPUconst (MOV(WZ|W)reg x:(ANDWconst [m] _)) [c]) && int32(m) >= 0 => (CMPWUconst x [c])
+(CMPconst x:(SRDconst _ [c]) [n]) && c > 0 && n >= 0 => (CMPUconst x [n])
+(CMPWconst x:(SRWconst _ [c]) [n]) && c > 0 && n >= 0 => (CMPWUconst x [n])
+
+// Absorb sign and zero extensions into 32-bit comparisons.
+(CMP(W|W|WU|WU) x (MOV(W|WZ|W|WZ)reg y)) => (CMP(W|W|WU|WU) x y)
+(CMP(W|W|WU|WU) (MOV(W|WZ|W|WZ)reg x) y) => (CMP(W|W|WU|WU) x y)
+(CMP(W|W|WU|WU)const (MOV(W|WZ|W|WZ)reg x) [c]) => (CMP(W|W|WU|WU)const x [c])
+
+// Absorb flag constants into branches.
+(BRC {c} (FlagEQ) yes no) && c&s390x.Equal != 0 => (First yes no)
+(BRC {c} (FlagLT) yes no) && c&s390x.Less != 0 => (First yes no)
+(BRC {c} (FlagGT) yes no) && c&s390x.Greater != 0 => (First yes no)
+(BRC {c} (FlagOV) yes no) && c&s390x.Unordered != 0 => (First yes no)
+
+(BRC {c} (FlagEQ) yes no) && c&s390x.Equal == 0 => (First no yes)
+(BRC {c} (FlagLT) yes no) && c&s390x.Less == 0 => (First no yes)
+(BRC {c} (FlagGT) yes no) && c&s390x.Greater == 0 => (First no yes)
+(BRC {c} (FlagOV) yes no) && c&s390x.Unordered == 0 => (First no yes)
+
+// Absorb flag constants into conditional moves (LOCGR).
+(LOCGR {c} _ x (FlagEQ)) && c&s390x.Equal != 0 => x
+(LOCGR {c} _ x (FlagLT)) && c&s390x.Less != 0 => x
+(LOCGR {c} _ x (FlagGT)) && c&s390x.Greater != 0 => x
+(LOCGR {c} _ x (FlagOV)) && c&s390x.Unordered != 0 => x
+
+(LOCGR {c} x _ (FlagEQ)) && c&s390x.Equal == 0 => x
+(LOCGR {c} x _ (FlagLT)) && c&s390x.Less == 0 => x
+(LOCGR {c} x _ (FlagGT)) && c&s390x.Greater == 0 => x
+(LOCGR {c} x _ (FlagOV)) && c&s390x.Unordered == 0 => x
+
+// Remove redundant *const ops
+(ADDconst [0] x) => x
+(ADDWconst [c] x) && int32(c)==0 => x
+(SUBconst [0] x) => x
+(SUBWconst [c] x) && int32(c) == 0 => x
+(ANDconst [0] _) => (MOVDconst [0])
+(ANDWconst [c] _) && int32(c)==0 => (MOVDconst [0])
+(ANDconst [-1] x) => x
+(ANDWconst [c] x) && int32(c)==-1 => x
+(ORconst [0] x) => x
+(ORWconst [c] x) && int32(c)==0 => x
+(ORconst [-1] _) => (MOVDconst [-1])
+(ORWconst [c] _) && int32(c)==-1 => (MOVDconst [-1])
+(XORconst [0] x) => x
+(XORWconst [c] x) && int32(c)==0 => x
+
+// Shifts by zero (may be inserted during multiplication strength reduction).
+((SLD|SLW|SRD|SRW|SRAD|SRAW)const x [0]) => x
+
+// Convert constant subtracts to constant adds.
+(SUBconst [c] x) && c != -(1<<31) => (ADDconst [-c] x)
+(SUBWconst [c] x) => (ADDWconst [-int32(c)] x)
+
+// generic constant folding
+// TODO: more of this
+(ADDconst [c] (MOVDconst [d])) => (MOVDconst [int64(c)+d])
+(ADDWconst [c] (MOVDconst [d])) => (MOVDconst [int64(c)+d])
+(ADDconst [c] (ADDconst [d] x)) && is32Bit(int64(c)+int64(d)) => (ADDconst [c+d] x)
+(ADDWconst [c] (ADDWconst [d] x)) => (ADDWconst [int32(c+d)] x)
+(SUBconst (MOVDconst [d]) [c]) => (MOVDconst [d-int64(c)])
+(SUBconst (SUBconst x [d]) [c]) && is32Bit(-int64(c)-int64(d)) => (ADDconst [-c-d] x)
+(SRADconst [c] (MOVDconst [d])) => (MOVDconst [d>>uint64(c)])
+(SRAWconst [c] (MOVDconst [d])) => (MOVDconst [int64(int32(d))>>uint64(c)])
+(NEG (MOVDconst [c])) => (MOVDconst [-c])
+(NEGW (MOVDconst [c])) => (MOVDconst [int64(int32(-c))])
+(MULLDconst [c] (MOVDconst [d])) => (MOVDconst [int64(c)*d])
+(MULLWconst [c] (MOVDconst [d])) => (MOVDconst [int64(c*int32(d))])
+(AND (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c&d])
+(ANDconst [c] (MOVDconst [d])) => (MOVDconst [c&d])
+(ANDWconst [c] (MOVDconst [d])) => (MOVDconst [int64(c)&d])
+(OR (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c|d])
+(ORconst [c] (MOVDconst [d])) => (MOVDconst [c|d])
+(ORWconst [c] (MOVDconst [d])) => (MOVDconst [int64(c)|d])
+(XOR (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c^d])
+(XORconst [c] (MOVDconst [d])) => (MOVDconst [c^d])
+(XORWconst [c] (MOVDconst [d])) => (MOVDconst [int64(c)^d])
+(LoweredRound32F x:(FMOVSconst)) => x
+(LoweredRound64F x:(FMOVDconst)) => x
+
+// generic simplifications
+// TODO: more of this
+(ADD x (NEG y)) => (SUB x y)
+(ADDW x (NEGW y)) => (SUBW x y)
+(SUB x x) => (MOVDconst [0])
+(SUBW x x) => (MOVDconst [0])
+(AND x x) => x
+(ANDW x x) => x
+(OR x x) => x
+(ORW x x) => x
+(XOR x x) => (MOVDconst [0])
+(XORW x x) => (MOVDconst [0])
+(NEG (ADDconst [c] (NEG x))) && c != -(1<<31) => (ADDconst [-c] x)
+(MOVBZreg (ANDWconst [m] x)) => (MOVWZreg (ANDWconst <typ.UInt32> [int32( uint8(m))] x))
+(MOVHZreg (ANDWconst [m] x)) => (MOVWZreg (ANDWconst <typ.UInt32> [int32(uint16(m))] x))
+(MOVBreg (ANDWconst [m] x)) && int8(m) >= 0 => (MOVWZreg (ANDWconst <typ.UInt32> [int32( uint8(m))] x))
+(MOVHreg (ANDWconst [m] x)) && int16(m) >= 0 => (MOVWZreg (ANDWconst <typ.UInt32> [int32(uint16(m))] x))
+
+// carry flag generation
+// (only constant fold carry of zero)
+(Select1 (ADDCconst (MOVDconst [c]) [d]))
+ && uint64(c+int64(d)) >= uint64(c) && c+int64(d) == 0
+ => (FlagEQ)
+(Select1 (ADDCconst (MOVDconst [c]) [d]))
+ && uint64(c+int64(d)) >= uint64(c) && c+int64(d) != 0
+ => (FlagLT)
+
+// borrow flag generation
+// (only constant fold borrow of zero)
+(Select1 (SUBC (MOVDconst [c]) (MOVDconst [d])))
+ && uint64(d) <= uint64(c) && c-d == 0
+ => (FlagGT)
+(Select1 (SUBC (MOVDconst [c]) (MOVDconst [d])))
+ && uint64(d) <= uint64(c) && c-d != 0
+ => (FlagOV)
+
+// add with carry
+(ADDE x y (FlagEQ)) => (ADDC x y)
+(ADDE x y (FlagLT)) => (ADDC x y)
+(ADDC x (MOVDconst [c])) && is16Bit(c) => (ADDCconst x [int16(c)])
+(Select0 (ADDCconst (MOVDconst [c]) [d])) => (MOVDconst [c+int64(d)])
+
+// subtract with borrow
+(SUBE x y (FlagGT)) => (SUBC x y)
+(SUBE x y (FlagOV)) => (SUBC x y)
+(Select0 (SUBC (MOVDconst [c]) (MOVDconst [d]))) => (MOVDconst [c-d])
+
+// collapse carry chain
+(ADDE x y (Select1 (ADDCconst [-1] (Select0 (ADDE (MOVDconst [0]) (MOVDconst [0]) c)))))
+ => (ADDE x y c)
+
+// collapse borrow chain
+(SUBE x y (Select1 (SUBC (MOVDconst [0]) (NEG (Select0 (SUBE (MOVDconst [0]) (MOVDconst [0]) c))))))
+ => (SUBE x y c)
+
+// branch on carry
+(C(G|LG)IJ {s390x.Equal} (Select0 (ADDE (MOVDconst [0]) (MOVDconst [0]) carry)) [0]) => (BRC {s390x.NoCarry} carry)
+(C(G|LG)IJ {s390x.Equal} (Select0 (ADDE (MOVDconst [0]) (MOVDconst [0]) carry)) [1]) => (BRC {s390x.Carry} carry)
+(C(G|LG)IJ {s390x.LessOrGreater} (Select0 (ADDE (MOVDconst [0]) (MOVDconst [0]) carry)) [0]) => (BRC {s390x.Carry} carry)
+(C(G|LG)IJ {s390x.LessOrGreater} (Select0 (ADDE (MOVDconst [0]) (MOVDconst [0]) carry)) [1]) => (BRC {s390x.NoCarry} carry)
+(C(G|LG)IJ {s390x.Greater} (Select0 (ADDE (MOVDconst [0]) (MOVDconst [0]) carry)) [0]) => (BRC {s390x.Carry} carry)
+
+// branch on borrow
+(C(G|LG)IJ {s390x.Equal} (NEG (Select0 (SUBE (MOVDconst [0]) (MOVDconst [0]) borrow))) [0]) => (BRC {s390x.NoBorrow} borrow)
+(C(G|LG)IJ {s390x.Equal} (NEG (Select0 (SUBE (MOVDconst [0]) (MOVDconst [0]) borrow))) [1]) => (BRC {s390x.Borrow} borrow)
+(C(G|LG)IJ {s390x.LessOrGreater} (NEG (Select0 (SUBE (MOVDconst [0]) (MOVDconst [0]) borrow))) [0]) => (BRC {s390x.Borrow} borrow)
+(C(G|LG)IJ {s390x.LessOrGreater} (NEG (Select0 (SUBE (MOVDconst [0]) (MOVDconst [0]) borrow))) [1]) => (BRC {s390x.NoBorrow} borrow)
+(C(G|LG)IJ {s390x.Greater} (NEG (Select0 (SUBE (MOVDconst [0]) (MOVDconst [0]) borrow))) [0]) => (BRC {s390x.Borrow} borrow)
+
+// fused multiply-add
+(Select0 (F(ADD|SUB) (FMUL y z) x)) && x.Block.Func.useFMA(v) => (FM(ADD|SUB) x y z)
+(Select0 (F(ADDS|SUBS) (FMULS y z) x)) && x.Block.Func.useFMA(v) => (FM(ADDS|SUBS) x y z)
+
+// Convert floating point comparisons against zero into 'load and test' instructions.
+(F(CMP|CMPS) x (FMOV(D|S)const [0.0])) => (LT(D|E)BR x)
+(F(CMP|CMPS) (FMOV(D|S)const [0.0]) x) => (InvertFlags (LT(D|E)BR <v.Type> x))
+
+// FSUB, FSUBS, FADD, FADDS now produce a condition code representing the
+// comparison of the result with 0.0. If a compare with zero instruction
+// (e.g. LTDBR) is following one of those instructions, we can use the
+// generated flag and remove the comparison instruction.
+// Note: when inserting Select1 ops we need to ensure they are in the
+// same block as their argument. We could also use @x.Block for this
+// but moving the flag generating value to a different block seems to
+// increase the likelihood that the flags value will have to be regenerated
+// by flagalloc which is not what we want.
+(LTDBR (Select0 x:(F(ADD|SUB) _ _))) && b == x.Block => (Select1 x)
+(LTEBR (Select0 x:(F(ADDS|SUBS) _ _))) && b == x.Block => (Select1 x)
+
+// Fold memory operations into operations.
+// Exclude global data (SB) because these instructions cannot handle relative addresses.
+// TODO(mundaym): indexed versions of these?
+((ADD|SUB|MULLD|AND|OR|XOR) <t> x g:(MOVDload [off] {sym} ptr mem))
+ && ptr.Op != OpSB
+ && is20Bit(int64(off))
+ && canMergeLoadClobber(v, g, x)
+ && clobber(g)
+ => ((ADD|SUB|MULLD|AND|OR|XOR)load <t> [off] {sym} x ptr mem)
+((ADD|SUB|MULL|AND|OR|XOR)W <t> x g:(MOVWload [off] {sym} ptr mem))
+ && ptr.Op != OpSB
+ && is20Bit(int64(off))
+ && canMergeLoadClobber(v, g, x)
+ && clobber(g)
+ => ((ADD|SUB|MULL|AND|OR|XOR)Wload <t> [off] {sym} x ptr mem)
+((ADD|SUB|MULL|AND|OR|XOR)W <t> x g:(MOVWZload [off] {sym} ptr mem))
+ && ptr.Op != OpSB
+ && is20Bit(int64(off))
+ && canMergeLoadClobber(v, g, x)
+ && clobber(g)
+ => ((ADD|SUB|MULL|AND|OR|XOR)Wload <t> [off] {sym} x ptr mem)
+
+// Combine constant stores into larger (unaligned) stores.
+// Avoid SB because constant stores to relative offsets are
+// emulated by the assembler and also can't handle unaligned offsets.
+(MOVBstoreconst [c] {s} p x:(MOVBstoreconst [a] {s} p mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && a.Off() + 1 == c.Off()
+ && clobber(x)
+ => (MOVHstoreconst [makeValAndOff(c.Val()&0xff | a.Val()<<8, a.Off())] {s} p mem)
+(MOVHstoreconst [c] {s} p x:(MOVHstoreconst [a] {s} p mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && a.Off() + 2 == c.Off()
+ && clobber(x)
+ => (MOVWstore [a.Off()] {s} p (MOVDconst [int64(c.Val()&0xffff | a.Val()<<16)]) mem)
+(MOVWstoreconst [c] {s} p x:(MOVWstoreconst [a] {s} p mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && a.Off() + 4 == c.Off()
+ && clobber(x)
+ => (MOVDstore [a.Off()] {s} p (MOVDconst [c.Val64()&0xffffffff | a.Val64()<<32]) mem)
+
+// Combine stores into larger (unaligned) stores.
+// It doesn't work on global data (based on SB) because stores with relative addressing
+// require that the memory operand be aligned.
+(MOVBstore [i] {s} p w x:(MOVBstore [i-1] {s} p (SRDconst [8] w) mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstore [i-1] {s} p w mem)
+(MOVBstore [i] {s} p w0:(SRDconst [j] w) x:(MOVBstore [i-1] {s} p (SRDconst [j+8] w) mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstore [i-1] {s} p w0 mem)
+(MOVBstore [i] {s} p w x:(MOVBstore [i-1] {s} p (SRWconst [8] w) mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstore [i-1] {s} p w mem)
+(MOVBstore [i] {s} p w0:(SRWconst [j] w) x:(MOVBstore [i-1] {s} p (SRWconst [j+8] w) mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstore [i-1] {s} p w0 mem)
+(MOVHstore [i] {s} p w x:(MOVHstore [i-2] {s} p (SRDconst [16] w) mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstore [i-2] {s} p w mem)
+(MOVHstore [i] {s} p w0:(SRDconst [j] w) x:(MOVHstore [i-2] {s} p (SRDconst [j+16] w) mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstore [i-2] {s} p w0 mem)
+(MOVHstore [i] {s} p w x:(MOVHstore [i-2] {s} p (SRWconst [16] w) mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstore [i-2] {s} p w mem)
+(MOVHstore [i] {s} p w0:(SRWconst [j] w) x:(MOVHstore [i-2] {s} p (SRWconst [j+16] w) mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstore [i-2] {s} p w0 mem)
+// Merge two adjacent 32-bit stores of the two halves of w into one 64-bit
+// store. MOVDstore writes w big-endian, so the high half (w>>32) must be the
+// word stored at the lower address i-4 and the low half the word stored at i,
+// exactly as in the byte and halfword merge rules above. Matching the halves
+// the other way round would change the contents of memory.
+(MOVWstore [i] {s} p w x:(MOVWstore [i-4] {s} p (SRDconst [32] w) mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVDstore [i-4] {s} p w mem)
+(MOVWstore [i] {s} p w0:(SRDconst [j] w) x:(MOVWstore [i-4] {s} p (SRDconst [j+32] w) mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVDstore [i-4] {s} p w0 mem)
+
+// Combine stores into larger (unaligned) stores with the bytes reversed (little endian).
+// Store-with-bytes-reversed instructions do not support relative memory addresses,
+// so these stores can't operate on global data (SB).
+(MOVBstore [i] {s} p (SRDconst [8] w) x:(MOVBstore [i-1] {s} p w mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHBRstore [i-1] {s} p w mem)
+(MOVBstore [i] {s} p (SRDconst [j] w) x:(MOVBstore [i-1] {s} p w0:(SRDconst [j-8] w) mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHBRstore [i-1] {s} p w0 mem)
+(MOVBstore [i] {s} p (SRWconst [8] w) x:(MOVBstore [i-1] {s} p w mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHBRstore [i-1] {s} p w mem)
+(MOVBstore [i] {s} p (SRWconst [j] w) x:(MOVBstore [i-1] {s} p w0:(SRWconst [j-8] w) mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHBRstore [i-1] {s} p w0 mem)
+(MOVHBRstore [i] {s} p (SRDconst [16] w) x:(MOVHBRstore [i-2] {s} p w mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWBRstore [i-2] {s} p w mem)
+(MOVHBRstore [i] {s} p (SRDconst [j] w) x:(MOVHBRstore [i-2] {s} p w0:(SRDconst [j-16] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWBRstore [i-2] {s} p w0 mem)
+(MOVHBRstore [i] {s} p (SRWconst [16] w) x:(MOVHBRstore [i-2] {s} p w mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWBRstore [i-2] {s} p w mem)
+(MOVHBRstore [i] {s} p (SRWconst [j] w) x:(MOVHBRstore [i-2] {s} p w0:(SRWconst [j-16] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWBRstore [i-2] {s} p w0 mem)
+(MOVWBRstore [i] {s} p (SRDconst [32] w) x:(MOVWBRstore [i-4] {s} p w mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVDBRstore [i-4] {s} p w mem)
+(MOVWBRstore [i] {s} p (SRDconst [j] w) x:(MOVWBRstore [i-4] {s} p w0:(SRDconst [j-32] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVDBRstore [i-4] {s} p w0 mem)
+
+// Merge a byte store, a byte-reversed word store, a byte-reversed halfword
+// store and a final byte store that together write all eight bytes of w in
+// little-endian order into a single byte-reversed 64-bit store.
+// The shift amounts are part of the pattern: offset 7 holds w>>56, offsets
+// 5-6 hold w>>40 and offsets 1-4 hold w>>8. Without these constraints any
+// shift of w would match and the merged store would write different bytes.
+(MOVBstore [7] {s} p1 (SRDconst [56] w)
+ x1:(MOVHBRstore [5] {s} p1 (SRDconst [40] w)
+ x2:(MOVWBRstore [1] {s} p1 (SRDconst [8] w)
+ x3:(MOVBstore [0] {s} p1 w mem))))
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && x3.Uses == 1
+ && clobber(x1, x2, x3)
+ => (MOVDBRstore {s} p1 w mem)
+
+// Combining byte loads into larger (unaligned) loads.
+
+// Big-endian loads
+
+(ORW x1:(MOVBZload [i1] {s} p mem)
+ sh:(SLWconst [8] x0:(MOVBZload [i0] {s} p mem)))
+ && i1 == i0+1
+ && p.Op != OpSB
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVHZload [i0] {s} p mem)
+
+(OR x1:(MOVBZload [i1] {s} p mem)
+ sh:(SLDconst [8] x0:(MOVBZload [i0] {s} p mem)))
+ && i1 == i0+1
+ && p.Op != OpSB
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVHZload [i0] {s} p mem)
+
+(ORW x1:(MOVHZload [i1] {s} p mem)
+ sh:(SLWconst [16] x0:(MOVHZload [i0] {s} p mem)))
+ && i1 == i0+2
+ && p.Op != OpSB
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVWZload [i0] {s} p mem)
+
+(OR x1:(MOVHZload [i1] {s} p mem)
+ sh:(SLDconst [16] x0:(MOVHZload [i0] {s} p mem)))
+ && i1 == i0+2
+ && p.Op != OpSB
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVWZload [i0] {s} p mem)
+
+(OR x1:(MOVWZload [i1] {s} p mem)
+ sh:(SLDconst [32] x0:(MOVWZload [i0] {s} p mem)))
+ && i1 == i0+4
+ && p.Op != OpSB
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVDload [i0] {s} p mem)
+
+(ORW
+ s0:(SLWconst [j0] x0:(MOVBZload [i0] {s} p mem))
+ or:(ORW
+ s1:(SLWconst [j1] x1:(MOVBZload [i1] {s} p mem))
+ y))
+ && i1 == i0+1
+ && j1 == j0-8
+ && j1 % 16 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (ORW <v.Type> (SLWconst <v.Type> [j1] (MOVHZload [i0] {s} p mem)) y)
+
+(OR
+ s0:(SLDconst [j0] x0:(MOVBZload [i0] {s} p mem))
+ or:(OR
+ s1:(SLDconst [j1] x1:(MOVBZload [i1] {s} p mem))
+ y))
+ && i1 == i0+1
+ && j1 == j0-8
+ && j1 % 16 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (OR <v.Type> (SLDconst <v.Type> [j1] (MOVHZload [i0] {s} p mem)) y)
+
+(OR
+ s0:(SLDconst [j0] x0:(MOVHZload [i0] {s} p mem))
+ or:(OR
+ s1:(SLDconst [j1] x1:(MOVHZload [i1] {s} p mem))
+ y))
+ && i1 == i0+2
+ && j1 == j0-16
+ && j1 % 32 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (OR <v.Type> (SLDconst <v.Type> [j1] (MOVWZload [i0] {s} p mem)) y)
+
+// Little-endian loads
+
+(ORW x0:(MOVBZload [i0] {s} p mem)
+ sh:(SLWconst [8] x1:(MOVBZload [i1] {s} p mem)))
+ && p.Op != OpSB
+ && i1 == i0+1
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVHZreg (MOVHBRload [i0] {s} p mem))
+
+(OR x0:(MOVBZload [i0] {s} p mem)
+ sh:(SLDconst [8] x1:(MOVBZload [i1] {s} p mem)))
+ && p.Op != OpSB
+ && i1 == i0+1
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVHZreg (MOVHBRload [i0] {s} p mem))
+
+(ORW r0:(MOVHZreg x0:(MOVHBRload [i0] {s} p mem))
+ sh:(SLWconst [16] r1:(MOVHZreg x1:(MOVHBRload [i1] {s} p mem))))
+ && i1 == i0+2
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && r0.Uses == 1
+ && r1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, r0, r1, sh)
+ => @mergePoint(b,x0,x1) (MOVWBRload [i0] {s} p mem)
+
+(OR r0:(MOVHZreg x0:(MOVHBRload [i0] {s} p mem))
+ sh:(SLDconst [16] r1:(MOVHZreg x1:(MOVHBRload [i1] {s} p mem))))
+ && i1 == i0+2
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && r0.Uses == 1
+ && r1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, r0, r1, sh)
+ => @mergePoint(b,x0,x1) (MOVWZreg (MOVWBRload [i0] {s} p mem))
+
+(OR r0:(MOVWZreg x0:(MOVWBRload [i0] {s} p mem))
+ sh:(SLDconst [32] r1:(MOVWZreg x1:(MOVWBRload [i1] {s} p mem))))
+ && i1 == i0+4
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && r0.Uses == 1
+ && r1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, r0, r1, sh)
+ => @mergePoint(b,x0,x1) (MOVDBRload [i0] {s} p mem)
+
+(ORW
+ s1:(SLWconst [j1] x1:(MOVBZload [i1] {s} p mem))
+ or:(ORW
+ s0:(SLWconst [j0] x0:(MOVBZload [i0] {s} p mem))
+ y))
+ && p.Op != OpSB
+ && i1 == i0+1
+ && j1 == j0+8
+ && j0 % 16 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (ORW <v.Type> (SLWconst <v.Type> [j0] (MOVHZreg (MOVHBRload [i0] {s} p mem))) y)
+
+(OR
+ s1:(SLDconst [j1] x1:(MOVBZload [i1] {s} p mem))
+ or:(OR
+ s0:(SLDconst [j0] x0:(MOVBZload [i0] {s} p mem))
+ y))
+ && p.Op != OpSB
+ && i1 == i0+1
+ && j1 == j0+8
+ && j0 % 16 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (OR <v.Type> (SLDconst <v.Type> [j0] (MOVHZreg (MOVHBRload [i0] {s} p mem))) y)
+
+(OR
+ s1:(SLDconst [j1] r1:(MOVHZreg x1:(MOVHBRload [i1] {s} p mem)))
+ or:(OR
+ s0:(SLDconst [j0] r0:(MOVHZreg x0:(MOVHBRload [i0] {s} p mem)))
+ y))
+ && i1 == i0+2
+ && j1 == j0+16
+ && j0 % 32 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && r0.Uses == 1
+ && r1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, r0, r1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (OR <v.Type> (SLDconst <v.Type> [j0] (MOVWZreg (MOVWBRload [i0] {s} p mem))) y)
+
+// Combine stores into store multiples.
+// 32-bit
+(MOVWstore [i] {s} p w1 x:(MOVWstore [i-4] {s} p w0 mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && is20Bit(int64(i)-4)
+ && clobber(x)
+ => (STM2 [i-4] {s} p w0 w1 mem)
+(MOVWstore [i] {s} p w2 x:(STM2 [i-8] {s} p w0 w1 mem))
+ && x.Uses == 1
+ && is20Bit(int64(i)-8)
+ && clobber(x)
+ => (STM3 [i-8] {s} p w0 w1 w2 mem)
+(MOVWstore [i] {s} p w3 x:(STM3 [i-12] {s} p w0 w1 w2 mem))
+ && x.Uses == 1
+ && is20Bit(int64(i)-12)
+ && clobber(x)
+ => (STM4 [i-12] {s} p w0 w1 w2 w3 mem)
+(STM2 [i] {s} p w2 w3 x:(STM2 [i-8] {s} p w0 w1 mem))
+ && x.Uses == 1
+ && is20Bit(int64(i)-8)
+ && clobber(x)
+ => (STM4 [i-8] {s} p w0 w1 w2 w3 mem)
+// 64-bit
+(MOVDstore [i] {s} p w1 x:(MOVDstore [i-8] {s} p w0 mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && is20Bit(int64(i)-8)
+ && clobber(x)
+ => (STMG2 [i-8] {s} p w0 w1 mem)
+(MOVDstore [i] {s} p w2 x:(STMG2 [i-16] {s} p w0 w1 mem))
+ && x.Uses == 1
+ && is20Bit(int64(i)-16)
+ && clobber(x)
+ => (STMG3 [i-16] {s} p w0 w1 w2 mem)
+(MOVDstore [i] {s} p w3 x:(STMG3 [i-24] {s} p w0 w1 w2 mem))
+ && x.Uses == 1
+ && is20Bit(int64(i)-24)
+ && clobber(x)
+ => (STMG4 [i-24] {s} p w0 w1 w2 w3 mem)
+(STMG2 [i] {s} p w2 w3 x:(STMG2 [i-16] {s} p w0 w1 mem))
+ && x.Uses == 1
+ && is20Bit(int64(i)-16)
+ && clobber(x)
+ => (STMG4 [i-16] {s} p w0 w1 w2 w3 mem)
+
+// Convert 32-bit store multiples into 64-bit stores.
+(STM2 [i] {s} p (SRDconst [32] x) x mem) => (MOVDstore [i] {s} p x mem)
diff --git a/src/cmd/compile/internal/ssa/_gen/S390XOps.go b/src/cmd/compile/internal/ssa/_gen/S390XOps.go
new file mode 100644
index 0000000..896fdaa
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/S390XOps.go
@@ -0,0 +1,817 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import "strings"
+
+// Notes:
+// - Integer types live in the low portion of registers. Upper portions are junk.
+// - Boolean types use the low-order byte of a register. 0=false, 1=true.
+// Upper bytes are junk.
+// - When doing sub-register operations, we try to write the whole
+// destination register to avoid a partial-register write.
+// - Unused portions of AuxInt (or the Val portion of ValAndOff) are
+// filled by sign-extending the used portion. Users of AuxInt which interpret
+// AuxInt as unsigned (e.g. shifts) must be careful.
+// - The SB 'register' is implemented using instruction-relative addressing. This
+// places some limitations on when and how memory operands that are addressed
+// relative to SB can be used:
+//
+// 1. Pseudo-instructions do not always map to a single machine instruction when
+// using the SB 'register' to address data. This is because many machine
+// instructions do not have relative long (RL suffix) equivalents. For example,
+// ADDload, which is assembled as AG.
+//
+// 2. Loads and stores using relative addressing require the data be aligned
+// according to its size (8-bytes for double words, 4-bytes for words
+// and so on).
+//
+// We can always work around these by inserting LARL instructions (load address
+// relative long) in the assembler, but typically this results in worse code
+// generation because the address can't be re-used. Inserting instructions in the
+// assembler also means clobbering the temp register and it is a long-term goal
+// to prevent the compiler doing this so that it can be allocated as a normal
+// register.
+//
+// For more information about the z/Architecture, the instruction set and the
+// addressing modes it supports take a look at the z/Architecture Principles of
+// Operation: http://publibfp.boulder.ibm.com/epubs/pdf/dz9zr010.pdf
+//
+// Suffixes encode the bit width of pseudo-instructions.
+// D (double word) = 64 bit (frequently omitted)
+// W (word) = 32 bit
+// H (half word) = 16 bit
+// B (byte) = 8 bit
+// S (single prec.) = 32 bit (double precision is omitted)
+
+// regNamesS390X lists the register names known to the s390x SSA backend.
+// The position of a name in this slice is its register number: init builds
+// a name-to-index map from it and sets bit i of a regMask for the register
+// at index i. The order is therefore significant, and init panics if the
+// slice has more than 64 entries.
+//
+// copied from ../../s390x/reg.go
+var regNamesS390X = []string{
+ "R0",
+ "R1",
+ "R2",
+ "R3",
+ "R4",
+ "R5",
+ "R6",
+ "R7",
+ "R8",
+ "R9",
+ "R10",
+ "R11",
+ "R12",
+ "g", // R13
+ "R14",
+ "SP", // R15
+ "F0",
+ "F1",
+ "F2",
+ "F3",
+ "F4",
+ "F5",
+ "F6",
+ "F7",
+ "F8",
+ "F9",
+ "F10",
+ "F11",
+ "F12",
+ "F13",
+ "F14",
+ "F15",
+
+ // If you add registers, update asyncPreempt in runtime.
+
+ // pseudo-registers
+ "SB",
+}
+
+func init() {
+ // Make map from reg names to reg integers.
+ if len(regNamesS390X) > 64 {
+ panic("too many registers")
+ }
+ num := map[string]int{}
+ for i, name := range regNamesS390X {
+ num[name] = i
+ }
+ buildReg := func(s string) regMask {
+ m := regMask(0)
+ for _, r := range strings.Split(s, " ") {
+ if n, ok := num[r]; ok {
+ m |= regMask(1) << uint(n)
+ continue
+ }
+ panic("register " + r + " not found")
+ }
+ return m
+ }
+
+ // Common individual register masks
+ var (
+ sp = buildReg("SP")
+ sb = buildReg("SB")
+ r0 = buildReg("R0")
+ tmp = buildReg("R11") // R11 is used as a temporary in a small number of instructions.
+
+ // R10 is reserved by the assembler.
+ gp = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14")
+ gpg = gp | buildReg("g")
+ gpsp = gp | sp
+
+ // R0 is considered to contain the value 0 in address calculations.
+ ptr = gp &^ r0
+ ptrsp = ptr | sp
+ ptrspsb = ptrsp | sb
+
+ fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15")
+ callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g
+ r1 = buildReg("R1")
+ r2 = buildReg("R2")
+ r3 = buildReg("R3")
+ )
+ // Common slices of register masks
+ var (
+ gponly = []regMask{gp}
+ fponly = []regMask{fp}
+ )
+
+ // Common regInfo
+ var (
+ gp01 = regInfo{inputs: []regMask{}, outputs: gponly}
+ gp11 = regInfo{inputs: []regMask{gp}, outputs: gponly}
+ gp11sp = regInfo{inputs: []regMask{gpsp}, outputs: gponly}
+ gp21 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
+ gp21sp = regInfo{inputs: []regMask{gpsp, gp}, outputs: gponly}
+ gp21tmp = regInfo{inputs: []regMask{gp &^ tmp, gp &^ tmp}, outputs: []regMask{gp &^ tmp}, clobbers: tmp}
+
+ // R0 evaluates to 0 when used as the number of bits to shift
+ // so we need to exclude it from that operand.
+ sh21 = regInfo{inputs: []regMask{gp, ptr}, outputs: gponly}
+
+ addr = regInfo{inputs: []regMask{sp | sb}, outputs: gponly}
+ addridx = regInfo{inputs: []regMask{sp | sb, ptrsp}, outputs: gponly}
+
+ gp2flags = regInfo{inputs: []regMask{gpsp, gpsp}}
+ gp1flags = regInfo{inputs: []regMask{gpsp}}
+ gp2flags1 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
+ gp11flags = regInfo{inputs: []regMask{gp}, outputs: gponly}
+ gp21flags = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
+ gp2flags1flags = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
+
+ gpload = regInfo{inputs: []regMask{ptrspsb, 0}, outputs: gponly}
+ gploadidx = regInfo{inputs: []regMask{ptrspsb, ptrsp, 0}, outputs: gponly}
+ gpopload = regInfo{inputs: []regMask{gp, ptrsp, 0}, outputs: gponly}
+ gpstore = regInfo{inputs: []regMask{ptrspsb, gpsp, 0}}
+ gpstoreconst = regInfo{inputs: []regMask{ptrspsb, 0}}
+ gpstoreidx = regInfo{inputs: []regMask{ptrsp, ptrsp, gpsp, 0}}
+ gpstorebr = regInfo{inputs: []regMask{ptrsp, gpsp, 0}}
+ gpstorelaa = regInfo{inputs: []regMask{ptrspsb, gpsp, 0}, outputs: gponly}
+ gpstorelab = regInfo{inputs: []regMask{r1, gpsp, 0}, clobbers: r1}
+
+ gpmvc = regInfo{inputs: []regMask{ptrsp, ptrsp, 0}}
+
+ fp01 = regInfo{inputs: []regMask{}, outputs: fponly}
+ fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: fponly}
+ fp31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: fponly}
+ fp21clobber = regInfo{inputs: []regMask{fp, fp}, outputs: fponly}
+ fpgp = regInfo{inputs: fponly, outputs: gponly}
+ gpfp = regInfo{inputs: gponly, outputs: fponly}
+ fp11 = regInfo{inputs: fponly, outputs: fponly}
+ fp1flags = regInfo{inputs: []regMask{fp}}
+ fp11clobber = regInfo{inputs: fponly, outputs: fponly}
+ fp2flags = regInfo{inputs: []regMask{fp, fp}}
+
+ fpload = regInfo{inputs: []regMask{ptrspsb, 0}, outputs: fponly}
+ fploadidx = regInfo{inputs: []regMask{ptrsp, ptrsp, 0}, outputs: fponly}
+
+ fpstore = regInfo{inputs: []regMask{ptrspsb, fp, 0}}
+ fpstoreidx = regInfo{inputs: []regMask{ptrsp, ptrsp, fp, 0}}
+
+ sync = regInfo{inputs: []regMask{0}}
+
+ // LoweredAtomicCas may overwrite arg1, so force it to R0 for now.
+ cas = regInfo{inputs: []regMask{ptrsp, r0, gpsp, 0}, outputs: []regMask{gp, 0}, clobbers: r0}
+
+ // LoweredAtomicExchange overwrites the output before executing
+ // CS{,G}, so the output register must not be the same as the
+ // input register. For now we just force the output register to
+ // R0.
+ exchange = regInfo{inputs: []regMask{ptrsp, gpsp &^ r0, 0}, outputs: []regMask{r0, 0}}
+ )
+
+ var S390Xops = []opData{
+ // fp ops
+ {name: "FADDS", argLength: 2, reg: fp21clobber, typ: "(Float32,Flags)", asm: "FADDS", commutative: true, resultInArg0: true}, // fp32 arg0 + arg1
+ {name: "FADD", argLength: 2, reg: fp21clobber, typ: "(Float64,Flags)", asm: "FADD", commutative: true, resultInArg0: true}, // fp64 arg0 + arg1
+ {name: "FSUBS", argLength: 2, reg: fp21clobber, typ: "(Float32,Flags)", asm: "FSUBS", resultInArg0: true}, // fp32 arg0 - arg1
+ {name: "FSUB", argLength: 2, reg: fp21clobber, typ: "(Float64,Flags)", asm: "FSUB", resultInArg0: true}, // fp64 arg0 - arg1
+ {name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true, resultInArg0: true}, // fp32 arg0 * arg1
+ {name: "FMUL", argLength: 2, reg: fp21, asm: "FMUL", commutative: true, resultInArg0: true}, // fp64 arg0 * arg1
+ {name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS", resultInArg0: true}, // fp32 arg0 / arg1
+ {name: "FDIV", argLength: 2, reg: fp21, asm: "FDIV", resultInArg0: true}, // fp64 arg0 / arg1
+ {name: "FNEGS", argLength: 1, reg: fp11clobber, asm: "FNEGS", clobberFlags: true}, // fp32 -arg0
+ {name: "FNEG", argLength: 1, reg: fp11clobber, asm: "FNEG", clobberFlags: true}, // fp64 -arg0
+ {name: "FMADDS", argLength: 3, reg: fp31, asm: "FMADDS", resultInArg0: true}, // fp32 arg1 * arg2 + arg0
+ {name: "FMADD", argLength: 3, reg: fp31, asm: "FMADD", resultInArg0: true}, // fp64 arg1 * arg2 + arg0
+ {name: "FMSUBS", argLength: 3, reg: fp31, asm: "FMSUBS", resultInArg0: true}, // fp32 arg1 * arg2 - arg0
+ {name: "FMSUB", argLength: 3, reg: fp31, asm: "FMSUB", resultInArg0: true}, // fp64 arg1 * arg2 - arg0
+ {name: "LPDFR", argLength: 1, reg: fp11, asm: "LPDFR"}, // fp64/fp32 clear sign bit (load positive)
+ {name: "LNDFR", argLength: 1, reg: fp11, asm: "LNDFR"}, // fp64/fp32 set sign bit (load negative)
+ {name: "CPSDR", argLength: 2, reg: fp21, asm: "CPSDR"}, // fp64/fp32 copy arg1 sign bit to arg0
+
+ // Round to integer, float64 only.
+ //
+ // aux | rounding mode
+ // ----+-----------------------------------
+ // 1 | round to nearest, ties away from 0
+ // 4 | round to nearest, ties to even
+ // 5 | round toward 0
+ // 6 | round toward +∞
+ // 7 | round toward -∞
+ {name: "FIDBR", argLength: 1, reg: fp11, asm: "FIDBR", aux: "Int8"},
+
+ {name: "FMOVSload", argLength: 2, reg: fpload, asm: "FMOVS", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // fp32 load
+ {name: "FMOVDload", argLength: 2, reg: fpload, asm: "FMOVD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // fp64 load
+ {name: "FMOVSconst", reg: fp01, asm: "FMOVS", aux: "Float32", rematerializeable: true}, // fp32 constant
+ {name: "FMOVDconst", reg: fp01, asm: "FMOVD", aux: "Float64", rematerializeable: true}, // fp64 constant
+ {name: "FMOVSloadidx", argLength: 3, reg: fploadidx, asm: "FMOVS", aux: "SymOff", symEffect: "Read"}, // fp32 load indexed by i
+ {name: "FMOVDloadidx", argLength: 3, reg: fploadidx, asm: "FMOVD", aux: "SymOff", symEffect: "Read"}, // fp64 load indexed by i
+
+ {name: "FMOVSstore", argLength: 3, reg: fpstore, asm: "FMOVS", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // fp32 store
+ {name: "FMOVDstore", argLength: 3, reg: fpstore, asm: "FMOVD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // fp64 store
+ {name: "FMOVSstoreidx", argLength: 4, reg: fpstoreidx, asm: "FMOVS", aux: "SymOff", symEffect: "Write"}, // fp32 indexed by i store
+ {name: "FMOVDstoreidx", argLength: 4, reg: fpstoreidx, asm: "FMOVD", aux: "SymOff", symEffect: "Write"}, // fp64 indexed by i store
+
+ // binary ops
+ {name: "ADD", argLength: 2, reg: gp21sp, asm: "ADD", commutative: true, clobberFlags: true}, // arg0 + arg1
+ {name: "ADDW", argLength: 2, reg: gp21sp, asm: "ADDW", commutative: true, clobberFlags: true}, // arg0 + arg1
+ {name: "ADDconst", argLength: 1, reg: gp11sp, asm: "ADD", aux: "Int32", typ: "UInt64", clobberFlags: true}, // arg0 + auxint
+ {name: "ADDWconst", argLength: 1, reg: gp11sp, asm: "ADDW", aux: "Int32", clobberFlags: true}, // arg0 + auxint
+ {name: "ADDload", argLength: 3, reg: gpopload, asm: "ADD", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 + *arg1. arg2=mem
+ {name: "ADDWload", argLength: 3, reg: gpopload, asm: "ADDW", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 + *arg1. arg2=mem
+
+ {name: "SUB", argLength: 2, reg: gp21, asm: "SUB", clobberFlags: true}, // arg0 - arg1
+ {name: "SUBW", argLength: 2, reg: gp21, asm: "SUBW", clobberFlags: true}, // arg0 - arg1
+ {name: "SUBconst", argLength: 1, reg: gp11, asm: "SUB", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 - auxint
+ {name: "SUBWconst", argLength: 1, reg: gp11, asm: "SUBW", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 - auxint
+ {name: "SUBload", argLength: 3, reg: gpopload, asm: "SUB", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 - *arg1. arg2=mem
+ {name: "SUBWload", argLength: 3, reg: gpopload, asm: "SUBW", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 - *arg1. arg2=mem
+
+ {name: "MULLD", argLength: 2, reg: gp21, asm: "MULLD", typ: "Int64", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 * arg1
+ {name: "MULLW", argLength: 2, reg: gp21, asm: "MULLW", typ: "Int32", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 * arg1
+ {name: "MULLDconst", argLength: 1, reg: gp11, asm: "MULLD", aux: "Int32", typ: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 * auxint
+ {name: "MULLWconst", argLength: 1, reg: gp11, asm: "MULLW", aux: "Int32", typ: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 * auxint
+ {name: "MULLDload", argLength: 3, reg: gpopload, asm: "MULLD", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 * *arg1. arg2=mem
+ {name: "MULLWload", argLength: 3, reg: gpopload, asm: "MULLW", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 * *arg1. arg2=mem
+
+ {name: "MULHD", argLength: 2, reg: gp21tmp, asm: "MULHD", typ: "Int64", commutative: true, resultInArg0: true, clobberFlags: true}, // (arg0 * arg1) >> width
+ {name: "MULHDU", argLength: 2, reg: gp21tmp, asm: "MULHDU", typ: "Int64", commutative: true, resultInArg0: true, clobberFlags: true}, // (arg0 * arg1) >> width
+
+ {name: "DIVD", argLength: 2, reg: gp21tmp, asm: "DIVD", resultInArg0: true, clobberFlags: true}, // arg0 / arg1
+ {name: "DIVW", argLength: 2, reg: gp21tmp, asm: "DIVW", resultInArg0: true, clobberFlags: true}, // arg0 / arg1
+ {name: "DIVDU", argLength: 2, reg: gp21tmp, asm: "DIVDU", resultInArg0: true, clobberFlags: true}, // arg0 / arg1
+ {name: "DIVWU", argLength: 2, reg: gp21tmp, asm: "DIVWU", resultInArg0: true, clobberFlags: true}, // arg0 / arg1
+
+ {name: "MODD", argLength: 2, reg: gp21tmp, asm: "MODD", resultInArg0: true, clobberFlags: true}, // arg0 % arg1
+ {name: "MODW", argLength: 2, reg: gp21tmp, asm: "MODW", resultInArg0: true, clobberFlags: true}, // arg0 % arg1
+
+ {name: "MODDU", argLength: 2, reg: gp21tmp, asm: "MODDU", resultInArg0: true, clobberFlags: true}, // arg0 % arg1
+ {name: "MODWU", argLength: 2, reg: gp21tmp, asm: "MODWU", resultInArg0: true, clobberFlags: true}, // arg0 % arg1
+
+ {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true, clobberFlags: true}, // arg0 & arg1
+ {name: "ANDW", argLength: 2, reg: gp21, asm: "ANDW", commutative: true, clobberFlags: true}, // arg0 & arg1
+ {name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 & auxint
+ {name: "ANDWconst", argLength: 1, reg: gp11, asm: "ANDW", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 & auxint
+ {name: "ANDload", argLength: 3, reg: gpopload, asm: "AND", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 & *arg1. arg2=mem
+ {name: "ANDWload", argLength: 3, reg: gpopload, asm: "ANDW", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 & *arg1. arg2=mem
+
+ {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true, clobberFlags: true}, // arg0 | arg1
+ {name: "ORW", argLength: 2, reg: gp21, asm: "ORW", commutative: true, clobberFlags: true}, // arg0 | arg1
+ {name: "ORconst", argLength: 1, reg: gp11, asm: "OR", aux: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 | auxint
+ {name: "ORWconst", argLength: 1, reg: gp11, asm: "ORW", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 | auxint
+ {name: "ORload", argLength: 3, reg: gpopload, asm: "OR", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 | *arg1. arg2=mem
+ {name: "ORWload", argLength: 3, reg: gpopload, asm: "ORW", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 | *arg1. arg2=mem
+
+ {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true, clobberFlags: true}, // arg0 ^ arg1
+ {name: "XORW", argLength: 2, reg: gp21, asm: "XORW", commutative: true, clobberFlags: true}, // arg0 ^ arg1
+ {name: "XORconst", argLength: 1, reg: gp11, asm: "XOR", aux: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 ^ auxint
+ {name: "XORWconst", argLength: 1, reg: gp11, asm: "XORW", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 ^ auxint
+ {name: "XORload", argLength: 3, reg: gpopload, asm: "XOR", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 ^ *arg1. arg2=mem
+ {name: "XORWload", argLength: 3, reg: gpopload, asm: "XORW", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 ^ *arg1. arg2=mem
+
+ // Arithmetic ops with carry/borrow chain.
+ //
+ // A carry is represented by a condition code of 2 or 3 (GT or OV).
+ // A borrow is represented by a condition code of 0 or 1 (EQ or LT).
+ {name: "ADDC", argLength: 2, reg: gp21flags, asm: "ADDC", typ: "(UInt64,Flags)", commutative: true}, // (arg0 + arg1, carry out)
+ {name: "ADDCconst", argLength: 1, reg: gp11flags, asm: "ADDC", typ: "(UInt64,Flags)", aux: "Int16"}, // (arg0 + auxint, carry out)
+ {name: "ADDE", argLength: 3, reg: gp2flags1flags, asm: "ADDE", typ: "(UInt64,Flags)", commutative: true, resultInArg0: true}, // (arg0 + arg1 + arg2 (carry in), carry out)
+ {name: "SUBC", argLength: 2, reg: gp21flags, asm: "SUBC", typ: "(UInt64,Flags)"}, // (arg0 - arg1, borrow out)
+ {name: "SUBE", argLength: 3, reg: gp2flags1flags, asm: "SUBE", typ: "(UInt64,Flags)", resultInArg0: true}, // (arg0 - arg1 - arg2 (borrow in), borrow out)
+
+ // Comparisons.
+ {name: "CMP", argLength: 2, reg: gp2flags, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPW", argLength: 2, reg: gp2flags, asm: "CMPW", typ: "Flags"}, // arg0 compare to arg1
+
+ {name: "CMPU", argLength: 2, reg: gp2flags, asm: "CMPU", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPWU", argLength: 2, reg: gp2flags, asm: "CMPWU", typ: "Flags"}, // arg0 compare to arg1
+
+ {name: "CMPconst", argLength: 1, reg: gp1flags, asm: "CMP", typ: "Flags", aux: "Int32"}, // arg0 compare to auxint
+ {name: "CMPWconst", argLength: 1, reg: gp1flags, asm: "CMPW", typ: "Flags", aux: "Int32"}, // arg0 compare to auxint
+ {name: "CMPUconst", argLength: 1, reg: gp1flags, asm: "CMPU", typ: "Flags", aux: "Int32"}, // arg0 compare to auxint
+ {name: "CMPWUconst", argLength: 1, reg: gp1flags, asm: "CMPWU", typ: "Flags", aux: "Int32"}, // arg0 compare to auxint
+
+ {name: "FCMPS", argLength: 2, reg: fp2flags, asm: "CEBR", typ: "Flags"}, // arg0 compare to arg1, f32
+ {name: "FCMP", argLength: 2, reg: fp2flags, asm: "FCMPU", typ: "Flags"}, // arg0 compare to arg1, f64
+ {name: "LTDBR", argLength: 1, reg: fp1flags, asm: "LTDBR", typ: "Flags"}, // arg0 compare to 0, f64
+ {name: "LTEBR", argLength: 1, reg: fp1flags, asm: "LTEBR", typ: "Flags"}, // arg0 compare to 0, f32
+
+ {name: "SLD", argLength: 2, reg: sh21, asm: "SLD"}, // arg0 << arg1, shift amount is mod 64
+ {name: "SLW", argLength: 2, reg: sh21, asm: "SLW"}, // arg0 << arg1, shift amount is mod 64
+ {name: "SLDconst", argLength: 1, reg: gp11, asm: "SLD", aux: "UInt8"}, // arg0 << auxint, shift amount 0-63
+ {name: "SLWconst", argLength: 1, reg: gp11, asm: "SLW", aux: "UInt8"}, // arg0 << auxint, shift amount 0-31
+
+ {name: "SRD", argLength: 2, reg: sh21, asm: "SRD"}, // unsigned arg0 >> arg1, shift amount is mod 64
+ {name: "SRW", argLength: 2, reg: sh21, asm: "SRW"}, // unsigned uint32(arg0) >> arg1, shift amount is mod 64
+ {name: "SRDconst", argLength: 1, reg: gp11, asm: "SRD", aux: "UInt8"}, // unsigned arg0 >> auxint, shift amount 0-63
+ {name: "SRWconst", argLength: 1, reg: gp11, asm: "SRW", aux: "UInt8"}, // unsigned uint32(arg0) >> auxint, shift amount 0-31
+
+ // Arithmetic shifts clobber flags.
+ {name: "SRAD", argLength: 2, reg: sh21, asm: "SRAD", clobberFlags: true}, // signed arg0 >> arg1, shift amount is mod 64
+ {name: "SRAW", argLength: 2, reg: sh21, asm: "SRAW", clobberFlags: true}, // signed int32(arg0) >> arg1, shift amount is mod 64
+ {name: "SRADconst", argLength: 1, reg: gp11, asm: "SRAD", aux: "UInt8", clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-63
+ {name: "SRAWconst", argLength: 1, reg: gp11, asm: "SRAW", aux: "UInt8", clobberFlags: true}, // signed int32(arg0) >> auxint, shift amount 0-31
+
+ // Rotate instructions.
+ // Note: no RLLGconst - use RISBGZ instead.
+ {name: "RLLG", argLength: 2, reg: sh21, asm: "RLLG"}, // arg0 rotate left arg1, rotate amount 0-63
+ {name: "RLL", argLength: 2, reg: sh21, asm: "RLL"}, // arg0 rotate left arg1, rotate amount 0-31
+ {name: "RLLconst", argLength: 1, reg: gp11, asm: "RLL", aux: "UInt8"}, // arg0 rotate left auxint, rotate amount 0-31
+
+ // Rotate then (and|or|xor|insert) selected bits instructions.
+ //
+ // Aux is an s390x.RotateParams struct containing Start, End and rotation
+ // Amount fields.
+ //
+ // arg1 is rotated left by the rotation amount then the bits from the start
+ // bit to the end bit (inclusive) are combined with arg0 using the logical
+ // operation specified. Bit indices are specified from left to right - the
+ // MSB is 0 and the LSB is 63.
+ //
+ // Examples:
+ //               |          aux         |
+ // | instruction | start | end | amount |          arg0         |          arg1         |         result        |
+ // +-------------+-------+-----+--------+-----------------------+-----------------------+-----------------------+
+ // | RXSBG (XOR) |     0 |   1 |      0 | 0xffff_ffff_ffff_ffff | 0xffff_ffff_ffff_ffff | 0x3fff_ffff_ffff_ffff |
+ // | RXSBG (XOR) |    62 |  63 |      0 | 0xffff_ffff_ffff_ffff | 0xffff_ffff_ffff_ffff | 0xffff_ffff_ffff_fffc |
+ // | RXSBG (XOR) |     0 |  47 |     16 | 0xffff_ffff_ffff_ffff | 0x0000_0000_0000_ffff | 0xffff_ffff_0000_ffff |
+ // +-------------+-------+-----+--------+-----------------------+-----------------------+-----------------------+
+ //
+ {name: "RXSBG", argLength: 2, reg: gp21, asm: "RXSBG", resultInArg0: true, aux: "S390XRotateParams", clobberFlags: true}, // rotate then xor selected bits
+ {name: "RISBGZ", argLength: 1, reg: gp11, asm: "RISBGZ", aux: "S390XRotateParams", clobberFlags: true}, // rotate then insert selected bits [into zero]
+
+ // unary ops
+ {name: "NEG", argLength: 1, reg: gp11, asm: "NEG", clobberFlags: true}, // -arg0
+ {name: "NEGW", argLength: 1, reg: gp11, asm: "NEGW", clobberFlags: true}, // -arg0
+
+ {name: "NOT", argLength: 1, reg: gp11, resultInArg0: true, clobberFlags: true}, // ^arg0
+ {name: "NOTW", argLength: 1, reg: gp11, resultInArg0: true, clobberFlags: true}, // ^arg0
+
+ {name: "FSQRT", argLength: 1, reg: fp11, asm: "FSQRT"}, // sqrt(arg0)
+ {name: "FSQRTS", argLength: 1, reg: fp11, asm: "FSQRTS"}, // sqrt(arg0), float32
+
+ // Conditional register-register moves.
+ // The aux for these values is an s390x.CCMask value representing the condition code mask.
+ {name: "LOCGR", argLength: 3, reg: gp2flags1, resultInArg0: true, asm: "LOCGR", aux: "S390XCCMask"}, // load arg1 into arg0 if the condition code in arg2 matches a masked bit in aux.
+
+ {name: "MOVBreg", argLength: 1, reg: gp11sp, asm: "MOVB", typ: "Int64"}, // sign extend arg0 from int8 to int64
+ {name: "MOVBZreg", argLength: 1, reg: gp11sp, asm: "MOVBZ", typ: "UInt64"}, // zero extend arg0 from int8 to int64
+ {name: "MOVHreg", argLength: 1, reg: gp11sp, asm: "MOVH", typ: "Int64"}, // sign extend arg0 from int16 to int64
+ {name: "MOVHZreg", argLength: 1, reg: gp11sp, asm: "MOVHZ", typ: "UInt64"}, // zero extend arg0 from int16 to int64
+ {name: "MOVWreg", argLength: 1, reg: gp11sp, asm: "MOVW", typ: "Int64"}, // sign extend arg0 from int32 to int64
+ {name: "MOVWZreg", argLength: 1, reg: gp11sp, asm: "MOVWZ", typ: "UInt64"}, // zero extend arg0 from int32 to int64
+
+ {name: "MOVDconst", reg: gp01, asm: "MOVD", typ: "UInt64", aux: "Int64", rematerializeable: true}, // auxint
+
+ {name: "LDGR", argLength: 1, reg: gpfp, asm: "LDGR"}, // move int64 to float64 (no conversion)
+ {name: "LGDR", argLength: 1, reg: fpgp, asm: "LGDR"}, // move float64 to int64 (no conversion)
+
+ {name: "CFDBRA", argLength: 1, reg: fpgp, asm: "CFDBRA", clobberFlags: true}, // convert float64 to int32
+ {name: "CGDBRA", argLength: 1, reg: fpgp, asm: "CGDBRA", clobberFlags: true}, // convert float64 to int64
+ {name: "CFEBRA", argLength: 1, reg: fpgp, asm: "CFEBRA", clobberFlags: true}, // convert float32 to int32
+ {name: "CGEBRA", argLength: 1, reg: fpgp, asm: "CGEBRA", clobberFlags: true}, // convert float32 to int64
+ {name: "CEFBRA", argLength: 1, reg: gpfp, asm: "CEFBRA", clobberFlags: true}, // convert int32 to float32
+ {name: "CDFBRA", argLength: 1, reg: gpfp, asm: "CDFBRA", clobberFlags: true}, // convert int32 to float64
+ {name: "CEGBRA", argLength: 1, reg: gpfp, asm: "CEGBRA", clobberFlags: true}, // convert int64 to float32
+ {name: "CDGBRA", argLength: 1, reg: gpfp, asm: "CDGBRA", clobberFlags: true}, // convert int64 to float64
+ {name: "CLFEBR", argLength: 1, reg: fpgp, asm: "CLFEBR", clobberFlags: true}, // convert float32 to uint32
+ {name: "CLFDBR", argLength: 1, reg: fpgp, asm: "CLFDBR", clobberFlags: true}, // convert float64 to uint32
+ {name: "CLGEBR", argLength: 1, reg: fpgp, asm: "CLGEBR", clobberFlags: true}, // convert float32 to uint64
+ {name: "CLGDBR", argLength: 1, reg: fpgp, asm: "CLGDBR", clobberFlags: true}, // convert float64 to uint64
+ {name: "CELFBR", argLength: 1, reg: gpfp, asm: "CELFBR", clobberFlags: true}, // convert uint32 to float32
+ {name: "CDLFBR", argLength: 1, reg: gpfp, asm: "CDLFBR", clobberFlags: true}, // convert uint32 to float64
+ {name: "CELGBR", argLength: 1, reg: gpfp, asm: "CELGBR", clobberFlags: true}, // convert uint64 to float32
+ {name: "CDLGBR", argLength: 1, reg: gpfp, asm: "CDLGBR", clobberFlags: true}, // convert uint64 to float64
+
+ {name: "LEDBR", argLength: 1, reg: fp11, asm: "LEDBR"}, // convert float64 to float32
+ {name: "LDEBR", argLength: 1, reg: fp11, asm: "LDEBR"}, // convert float32 to float64
+
+ {name: "MOVDaddr", argLength: 1, reg: addr, aux: "SymOff", rematerializeable: true, symEffect: "Read"}, // arg0 + auxint + offset encoded in aux
+ {name: "MOVDaddridx", argLength: 2, reg: addridx, aux: "SymOff", symEffect: "Read"}, // arg0 + arg1 + auxint + aux
+
+ // auxint+aux == add auxint and the offset of the symbol in aux (if any) to the effective address
+ {name: "MOVBZload", argLength: 2, reg: gpload, asm: "MOVBZ", aux: "SymOff", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load byte from arg0+auxint+aux. arg1=mem. Zero extend.
+ {name: "MOVBload", argLength: 2, reg: gpload, asm: "MOVB", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // ditto, sign extend to int64
+ {name: "MOVHZload", argLength: 2, reg: gpload, asm: "MOVHZ", aux: "SymOff", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load 2 bytes from arg0+auxint+aux. arg1=mem. Zero extend.
+ {name: "MOVHload", argLength: 2, reg: gpload, asm: "MOVH", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // ditto, sign extend to int64
+ {name: "MOVWZload", argLength: 2, reg: gpload, asm: "MOVWZ", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load 4 bytes from arg0+auxint+aux. arg1=mem. Zero extend.
+ {name: "MOVWload", argLength: 2, reg: gpload, asm: "MOVW", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // ditto, sign extend to int64
+ {name: "MOVDload", argLength: 2, reg: gpload, asm: "MOVD", aux: "SymOff", typ: "UInt64", faultOnNilArg0: true, symEffect: "Read"}, // load 8 bytes from arg0+auxint+aux. arg1=mem
+
+ {name: "MOVWBR", argLength: 1, reg: gp11, asm: "MOVWBR"}, // arg0 swap bytes
+ {name: "MOVDBR", argLength: 1, reg: gp11, asm: "MOVDBR"}, // arg0 swap bytes
+
+ {name: "MOVHBRload", argLength: 2, reg: gpload, asm: "MOVHBR", aux: "SymOff", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load 2 bytes from arg0+auxint+aux. arg1=mem. Reverse bytes.
+ {name: "MOVWBRload", argLength: 2, reg: gpload, asm: "MOVWBR", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load 4 bytes from arg0+auxint+aux. arg1=mem. Reverse bytes.
+ {name: "MOVDBRload", argLength: 2, reg: gpload, asm: "MOVDBR", aux: "SymOff", typ: "UInt64", faultOnNilArg0: true, symEffect: "Read"}, // load 8 bytes from arg0+auxint+aux. arg1=mem. Reverse bytes.
+
+ {name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store byte in arg1 to arg0+auxint+aux. arg2=mem
+ {name: "MOVHstore", argLength: 3, reg: gpstore, asm: "MOVH", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem
+ {name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem
+ {name: "MOVDstore", argLength: 3, reg: gpstore, asm: "MOVD", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes in arg1 to arg0+auxint+aux. arg2=mem
+ {name: "MOVHBRstore", argLength: 3, reg: gpstorebr, asm: "MOVHBR", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem. Reverse bytes.
+ {name: "MOVWBRstore", argLength: 3, reg: gpstorebr, asm: "MOVWBR", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem. Reverse bytes.
+ {name: "MOVDBRstore", argLength: 3, reg: gpstorebr, asm: "MOVDBR", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes in arg1 to arg0+auxint+aux. arg2=mem. Reverse bytes.
+
+ {name: "MVC", argLength: 3, reg: gpmvc, asm: "MVC", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, faultOnNilArg1: true, symEffect: "None"}, // arg0=destptr, arg1=srcptr, arg2=mem, auxint=size,off
+
+ // indexed loads/stores
+ {name: "MOVBZloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVBZ", aux: "SymOff", typ: "UInt8", symEffect: "Read"}, // load a byte from arg0+arg1+auxint+aux. arg2=mem. Zero extend.
+ {name: "MOVBloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVB", aux: "SymOff", typ: "Int8", symEffect: "Read"}, // load a byte from arg0+arg1+auxint+aux. arg2=mem. Sign extend.
+ {name: "MOVHZloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVHZ", aux: "SymOff", typ: "UInt16", symEffect: "Read"}, // load 2 bytes from arg0+arg1+auxint+aux. arg2=mem. Zero extend.
+ {name: "MOVHloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVH", aux: "SymOff", typ: "Int16", symEffect: "Read"}, // load 2 bytes from arg0+arg1+auxint+aux. arg2=mem. Sign extend.
+ {name: "MOVWZloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVWZ", aux: "SymOff", typ: "UInt32", symEffect: "Read"}, // load 4 bytes from arg0+arg1+auxint+aux. arg2=mem. Zero extend.
+ {name: "MOVWloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVW", aux: "SymOff", typ: "Int32", symEffect: "Read"}, // load 4 bytes from arg0+arg1+auxint+aux. arg2=mem. Sign extend.
+ {name: "MOVDloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVD", aux: "SymOff", typ: "UInt64", symEffect: "Read"}, // load 8 bytes from arg0+arg1+auxint+aux. arg2=mem
+ {name: "MOVHBRloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVHBR", aux: "SymOff", typ: "Int16", symEffect: "Read"}, // load 2 bytes from arg0+arg1+auxint+aux. arg2=mem. Reverse bytes.
+ {name: "MOVWBRloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVWBR", aux: "SymOff", typ: "Int32", symEffect: "Read"}, // load 4 bytes from arg0+arg1+auxint+aux. arg2=mem. Reverse bytes.
+ {name: "MOVDBRloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVDBR", aux: "SymOff", typ: "Int64", symEffect: "Read"}, // load 8 bytes from arg0+arg1+auxint+aux. arg2=mem. Reverse bytes.
+ {name: "MOVBstoreidx", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVB", aux: "SymOff", symEffect: "Write"}, // store byte in arg2 to arg0+arg1+auxint+aux. arg3=mem
+ {name: "MOVHstoreidx", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVH", aux: "SymOff", symEffect: "Write"}, // store 2 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
+ {name: "MOVWstoreidx", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVW", aux: "SymOff", symEffect: "Write"}, // store 4 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
+ {name: "MOVDstoreidx", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVD", aux: "SymOff", symEffect: "Write"}, // store 8 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
+ {name: "MOVHBRstoreidx", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVHBR", aux: "SymOff", symEffect: "Write"}, // store 2 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem. Reverse bytes.
+ {name: "MOVWBRstoreidx", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVWBR", aux: "SymOff", symEffect: "Write"}, // store 4 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem. Reverse bytes.
+ {name: "MOVDBRstoreidx", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVDBR", aux: "SymOff", symEffect: "Write"}, // store 8 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem. Reverse bytes.
+
+ // For storeconst ops, the AuxInt field encodes both
+ // the value to store and an address offset of the store.
+ // Cast AuxInt to a ValAndOff to extract Val and Off fields.
+ {name: "MOVBstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVB", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low byte of ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux. arg1=mem
+ {name: "MOVHstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVH", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low 2 bytes of ...
+ {name: "MOVWstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVW", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low 4 bytes of ...
+ {name: "MOVDstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVD", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of ...
+
+ {name: "CLEAR", argLength: 2, reg: regInfo{inputs: []regMask{ptr, 0}}, asm: "CLEAR", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Write"},
+
+ {name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "CALLtail", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{ptrsp, buildReg("R12"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+ {name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{ptr}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+
+ // (InvertFlags (CMP a b)) == (CMP b a)
+ // InvertFlags is a pseudo-op which can't appear in assembly output.
+ {name: "InvertFlags", argLength: 1}, // reverse direction of arg0
+
+ // Pseudo-ops
+ {name: "LoweredGetG", argLength: 1, reg: gp01}, // arg0=mem
+ // Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
+ // and sorts it to the very beginning of the block to prevent other
+ // use of R12 (the closure pointer)
+ {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("R12")}}, zeroWidth: true},
+ // (Describes LoweredNilCheck, defined below.) arg0=ptr, arg1=mem, returns void. Faults if ptr is nil.
+ // LoweredGetCallerSP returns the SP of the caller of the current function.
+ {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true},
+ // LoweredGetCallerPC evaluates to the PC to which its "caller" will return.
+ // I.e., if f calls g, and g "calls" getcallerpc,
+ // the result should be the PC within f that g will return to.
+ // See runtime/stubs.go for a more detailed discussion.
+ {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true},
+ {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{ptrsp}}, clobberFlags: true, nilCheck: true, faultOnNilArg0: true},
+ // Round ops to block fused-multiply-add extraction.
+ {name: "LoweredRound32F", argLength: 1, reg: fp11, resultInArg0: true, zeroWidth: true},
+ {name: "LoweredRound64F", argLength: 1, reg: fp11, resultInArg0: true, zeroWidth: true},
+
+ // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+ // It saves all GP registers if necessary,
+ // but clobbers R14 (LR) because it's a call,
+ // and also clobbers R1 as the PLT stub does.
+ {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("R2"), buildReg("R3")}, clobbers: (callerSave &^ gpg) | buildReg("R14") | r1}, clobberFlags: true, aux: "Sym", symEffect: "None"},
+
+ // There are three of these functions so that they can have three different register inputs.
+ // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
+ // default registers to match so we don't need to copy registers around unnecessarily.
+ {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
+ {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
+ {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r0, r1}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
+
+ // Constant condition code values. The condition code can be 0, 1, 2 or 3.
+ {name: "FlagEQ"}, // CC=0 (equal)
+ {name: "FlagLT"}, // CC=1 (less than)
+ {name: "FlagGT"}, // CC=2 (greater than)
+ {name: "FlagOV"}, // CC=3 (overflow)
+
+ // Fast-BCR-serialization to ensure store-load ordering.
+ {name: "SYNC", argLength: 1, reg: sync, asm: "SYNC", typ: "Mem"},
+
+ // Atomic loads. These are just normal loads but return <value,memory> tuples
+ // so they can be properly ordered with other loads.
+ // load from arg0+auxint+aux. arg1=mem.
+ {name: "MOVBZatomicload", argLength: 2, reg: gpload, asm: "MOVBZ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+ {name: "MOVWZatomicload", argLength: 2, reg: gpload, asm: "MOVWZ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+ {name: "MOVDatomicload", argLength: 2, reg: gpload, asm: "MOVD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+
+ // Atomic stores. These are just normal stores.
+ // store arg1 to arg0+auxint+aux. arg2=mem.
+ {name: "MOVBatomicstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "Write"},
+ {name: "MOVWatomicstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "Write"},
+ {name: "MOVDatomicstore", argLength: 3, reg: gpstore, asm: "MOVD", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "Write"},
+
+ // Atomic adds.
+ // *(arg0+auxint+aux) += arg1. arg2=mem.
+ // Returns a tuple of <old contents of *(arg0+auxint+aux), memory>.
+ {name: "LAA", argLength: 3, reg: gpstorelaa, asm: "LAA", typ: "(UInt32,Mem)", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"},
+ {name: "LAAG", argLength: 3, reg: gpstorelaa, asm: "LAAG", typ: "(UInt64,Mem)", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"},
+ {name: "AddTupleFirst32", argLength: 2}, // arg1=tuple <x,y>. Returns <x+arg0,y>.
+ {name: "AddTupleFirst64", argLength: 2}, // arg1=tuple <x,y>. Returns <x+arg0,y>.
+
+ // Atomic bitwise operations.
+ // Note: 'floor' operations round the pointer down to the nearest word boundary
+ // which reflects how they are used in the runtime.
+ {name: "LAN", argLength: 3, reg: gpstore, asm: "LAN", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *arg0 &= arg1. arg2 = mem.
+ {name: "LANfloor", argLength: 3, reg: gpstorelab, asm: "LAN", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *(floor(arg0, 4)) &= arg1. arg2 = mem.
+ {name: "LAO", argLength: 3, reg: gpstore, asm: "LAO", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *arg0 |= arg1. arg2 = mem.
+ {name: "LAOfloor", argLength: 3, reg: gpstorelab, asm: "LAO", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *(floor(arg0, 4)) |= arg1. arg2 = mem.
+
+ // Compare and swap.
+ // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory.
+ // if *(arg0+auxint+aux) == arg1 {
+ // *(arg0+auxint+aux) = arg2
+ // return (true, memory)
+ // } else {
+ // return (false, memory)
+ // }
+ // Note that these instructions also return the old value in arg1, but we ignore it.
+ // TODO: have these return flags instead of bool. The current system generates:
+ // CS ...
+ // MOVD $0, ret
+ // BNE 2(PC)
+ // MOVD $1, ret
+ // CMPW ret, $0
+ // BNE ...
+ // instead of just
+ // CS ...
+ // BEQ ...
+ // but we can't do that because memory-using ops can't generate flags yet
+ // (flagalloc wants to move flag-generating instructions around).
+ {name: "LoweredAtomicCas32", argLength: 4, reg: cas, asm: "CS", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"},
+ {name: "LoweredAtomicCas64", argLength: 4, reg: cas, asm: "CSG", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"},
+
+ // Lowered atomic swaps, emulated using compare-and-swap.
+ // store arg1 to arg0+auxint+aux, arg2=mem.
+ {name: "LoweredAtomicExchange32", argLength: 3, reg: exchange, asm: "CS", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"},
+ {name: "LoweredAtomicExchange64", argLength: 3, reg: exchange, asm: "CSG", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"},
+
+ // find leftmost one
+ {
+ name: "FLOGR",
+ argLength: 1,
+ reg: regInfo{inputs: gponly, outputs: []regMask{buildReg("R0")}, clobbers: buildReg("R1")},
+ asm: "FLOGR",
+ typ: "UInt64",
+ clobberFlags: true,
+ },
+
+ // population count
+ //
+ // Counts the number of ones in each byte of arg0
+ // and places the result into the corresponding byte
+ // of the result.
+ {
+ name: "POPCNT",
+ argLength: 1,
+ reg: gp11,
+ asm: "POPCNT",
+ typ: "UInt64",
+ clobberFlags: true,
+ },
+
+ // unsigned multiplication (64x64 → 128)
+ //
+ // Multiply the two 64-bit input operands together and place the 128-bit result into
+ // an even-odd register pair. The second register in the target pair also contains
+ // one of the input operands. Since we don't currently have a way to specify an
+ // even-odd register pair we hardcode this register pair as R2:R3.
+ {
+ name: "MLGR",
+ argLength: 2,
+ reg: regInfo{inputs: []regMask{gp, r3}, outputs: []regMask{r2, r3}},
+ asm: "MLGR",
+ },
+
+ // pseudo operations to sum the output of the POPCNT instruction
+ {name: "SumBytes2", argLength: 1, typ: "UInt8"}, // sum the rightmost 2 bytes in arg0 ignoring overflow
+ {name: "SumBytes4", argLength: 1, typ: "UInt8"}, // sum the rightmost 4 bytes in arg0 ignoring overflow
+ {name: "SumBytes8", argLength: 1, typ: "UInt8"}, // sum all the bytes in arg0 ignoring overflow
+
+ // store multiple
+ {
+ name: "STMG2",
+ argLength: 4,
+ reg: regInfo{inputs: []regMask{ptrsp, buildReg("R1"), buildReg("R2"), 0}},
+ aux: "SymOff",
+ typ: "Mem",
+ asm: "STMG",
+ faultOnNilArg0: true,
+ symEffect: "Write",
+ clobberFlags: true, // TODO(mundaym): currently uses AGFI to handle large offsets
+ },
+ {
+ name: "STMG3",
+ argLength: 5,
+ reg: regInfo{inputs: []regMask{ptrsp, buildReg("R1"), buildReg("R2"), buildReg("R3"), 0}},
+ aux: "SymOff",
+ typ: "Mem",
+ asm: "STMG",
+ faultOnNilArg0: true,
+ symEffect: "Write",
+ clobberFlags: true, // TODO(mundaym): currently uses AGFI to handle large offsets
+ },
+ {
+ name: "STMG4",
+ argLength: 6,
+ reg: regInfo{inputs: []regMask{
+ ptrsp,
+ buildReg("R1"),
+ buildReg("R2"),
+ buildReg("R3"),
+ buildReg("R4"),
+ 0,
+ }},
+ aux: "SymOff",
+ typ: "Mem",
+ asm: "STMG",
+ faultOnNilArg0: true,
+ symEffect: "Write",
+ clobberFlags: true, // TODO(mundaym): currently uses AGFI to handle large offsets
+ },
+ {
+ name: "STM2",
+ argLength: 4,
+ reg: regInfo{inputs: []regMask{ptrsp, buildReg("R1"), buildReg("R2"), 0}},
+ aux: "SymOff",
+ typ: "Mem",
+ asm: "STMY",
+ faultOnNilArg0: true,
+ symEffect: "Write",
+ clobberFlags: true, // TODO(mundaym): currently uses AGFI to handle large offsets
+ },
+ {
+ name: "STM3",
+ argLength: 5,
+ reg: regInfo{inputs: []regMask{ptrsp, buildReg("R1"), buildReg("R2"), buildReg("R3"), 0}},
+ aux: "SymOff",
+ typ: "Mem",
+ asm: "STMY",
+ faultOnNilArg0: true,
+ symEffect: "Write",
+ clobberFlags: true, // TODO(mundaym): currently uses AGFI to handle large offsets
+ },
+ {
+ name: "STM4",
+ argLength: 6,
+ reg: regInfo{inputs: []regMask{
+ ptrsp,
+ buildReg("R1"),
+ buildReg("R2"),
+ buildReg("R3"),
+ buildReg("R4"),
+ 0,
+ }},
+ aux: "SymOff",
+ typ: "Mem",
+ asm: "STMY",
+ faultOnNilArg0: true,
+ symEffect: "Write",
+ clobberFlags: true, // TODO(mundaym): currently uses AGFI to handle large offsets
+ },
+
+ // large move
+ // auxint = remaining bytes after loop (rem)
+ // arg0 = address of dst memory (in R1, changed as a side effect)
+ // arg1 = address of src memory (in R2, changed as a side effect)
+ // arg2 = pointer to last address to move in loop + 256
+ // arg3 = mem
+ // returns mem
+ //
+ // mvc: MVC $256, 0(R2), 0(R1)
+ // MOVD $256(R1), R1
+ // MOVD $256(R2), R2
+ // CMP R2, Rarg2
+ // BNE mvc
+ // MVC $rem, 0(R2), 0(R1) // if rem > 0
+ {
+ name: "LoweredMove",
+ aux: "Int64",
+ argLength: 4,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R1"), buildReg("R2"), gpsp},
+ clobbers: buildReg("R1 R2"),
+ },
+ clobberFlags: true,
+ typ: "Mem",
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // large clear
+ // auxint = remaining bytes after loop (rem)
+ // arg0 = address of dst memory (in R1, changed as a side effect)
+ // arg1 = pointer to last address to zero in loop + 256
+ // arg2 = mem
+ // returns mem
+ //
+ // clear: CLEAR $256, 0(R1)
+ // MOVD $256(R1), R1
+ // CMP R1, Rarg1
+ // BNE clear
+ // CLEAR $rem, 0(R1) // if rem > 0
+ {
+ name: "LoweredZero",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R1"), gpsp},
+ clobbers: buildReg("R1"),
+ },
+ clobberFlags: true,
+ typ: "Mem",
+ faultOnNilArg0: true,
+ },
+ }
+
+ // All blocks on s390x have their condition code mask (s390x.CCMask) as the Aux value.
+ // The condition code mask is a 4-bit mask where each bit corresponds to a condition
+ // code value. If the value of the condition code matches a bit set in the condition
+ // code mask then the first successor is executed. Otherwise the second successor is
+ // executed.
+ //
+ // | condition code value | mask bit |
+ // +----------------------+------------+
+ // | 0 (equal) | 0b1000 (8) |
+ // | 1 (less than) | 0b0100 (4) |
+ // | 2 (greater than) | 0b0010 (2) |
+ // | 3 (unordered) | 0b0001 (1) |
+ //
+ // Note that compare-and-branch instructions must not have bit 3 (0b0001) set.
+ var S390Xblocks = []blockData{
+ // branch on condition
+ {name: "BRC", controls: 1, aux: "S390XCCMask"}, // condition code value (flags) is Controls[0]
+
+ // compare-and-branch (register-register)
+ // - integrates comparison of Controls[0] with Controls[1]
+ // - both control values must be in general purpose registers
+ {name: "CRJ", controls: 2, aux: "S390XCCMask"}, // signed 32-bit integer comparison
+ {name: "CGRJ", controls: 2, aux: "S390XCCMask"}, // signed 64-bit integer comparison
+ {name: "CLRJ", controls: 2, aux: "S390XCCMask"}, // unsigned 32-bit integer comparison
+ {name: "CLGRJ", controls: 2, aux: "S390XCCMask"}, // unsigned 64-bit integer comparison
+
+ // compare-and-branch (register-immediate)
+ // - integrates comparison of Controls[0] with AuxInt
+ // - control value must be in a general purpose register
+ // - the AuxInt value is sign-extended for signed comparisons
+ // and zero-extended for unsigned comparisons
+ {name: "CIJ", controls: 1, aux: "S390XCCMaskInt8"}, // signed 32-bit integer comparison
+ {name: "CGIJ", controls: 1, aux: "S390XCCMaskInt8"}, // signed 64-bit integer comparison
+ {name: "CLIJ", controls: 1, aux: "S390XCCMaskUint8"}, // unsigned 32-bit integer comparison
+ {name: "CLGIJ", controls: 1, aux: "S390XCCMaskUint8"}, // unsigned 64-bit integer comparison
+ }
+
+ archs = append(archs, arch{
+ name: "S390X",
+ pkg: "cmd/internal/obj/s390x",
+ genfile: "../../s390x/ssa.go",
+ ops: S390Xops,
+ blocks: S390Xblocks,
+ regnames: regNamesS390X,
+ gpregmask: gp,
+ fpregmask: fp,
+ framepointerreg: -1, // not used
+ linkreg: int8(num["R14"]),
+ imports: []string{
+ "cmd/internal/obj/s390x",
+ },
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/_gen/Wasm.rules b/src/cmd/compile/internal/ssa/_gen/Wasm.rules
new file mode 100644
index 0000000..a9ed82e
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/Wasm.rules
@@ -0,0 +1,396 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Lowering arithmetic
+(Add(64|32|16|8|Ptr) ...) => (I64Add ...)
+(Add(64|32)F ...) => (F(64|32)Add ...)
+
+(Sub(64|32|16|8|Ptr) ...) => (I64Sub ...)
+(Sub(64|32)F ...) => (F(64|32)Sub ...)
+
+(Mul(64|32|16|8) ...) => (I64Mul ...)
+(Mul(64|32)F ...) => (F(64|32)Mul ...)
+
+(Div64 [false] x y) => (I64DivS x y)
+(Div32 [false] x y) => (I64DivS (SignExt32to64 x) (SignExt32to64 y))
+(Div16 [false] x y) => (I64DivS (SignExt16to64 x) (SignExt16to64 y))
+(Div8 x y) => (I64DivS (SignExt8to64 x) (SignExt8to64 y))
+(Div64u ...) => (I64DivU ...)
+(Div32u x y) => (I64DivU (ZeroExt32to64 x) (ZeroExt32to64 y))
+(Div16u x y) => (I64DivU (ZeroExt16to64 x) (ZeroExt16to64 y))
+(Div8u x y) => (I64DivU (ZeroExt8to64 x) (ZeroExt8to64 y))
+(Div(64|32)F ...) => (F(64|32)Div ...)
+
+(Mod64 [false] x y) => (I64RemS x y)
+(Mod32 [false] x y) => (I64RemS (SignExt32to64 x) (SignExt32to64 y))
+(Mod16 [false] x y) => (I64RemS (SignExt16to64 x) (SignExt16to64 y))
+(Mod8 x y) => (I64RemS (SignExt8to64 x) (SignExt8to64 y))
+(Mod64u ...) => (I64RemU ...)
+(Mod32u x y) => (I64RemU (ZeroExt32to64 x) (ZeroExt32to64 y))
+(Mod16u x y) => (I64RemU (ZeroExt16to64 x) (ZeroExt16to64 y))
+(Mod8u x y) => (I64RemU (ZeroExt8to64 x) (ZeroExt8to64 y))
+
+(And(64|32|16|8|B) ...) => (I64And ...)
+
+(Or(64|32|16|8|B) ...) => (I64Or ...)
+
+(Xor(64|32|16|8) ...) => (I64Xor ...)
+
+(Neg(64|32|16|8) x) => (I64Sub (I64Const [0]) x)
+(Neg(64|32)F ...) => (F(64|32)Neg ...)
+
+(Com(64|32|16|8) x) => (I64Xor x (I64Const [-1]))
+
+(Not ...) => (I64Eqz ...)
+
+// Lowering pointer arithmetic
+(OffPtr ...) => (I64AddConst ...)
+
+// Lowering extension
+// It is unnecessary to extend loads
+(SignExt32to64 x:(I64Load32S _ _)) => x
+(SignExt16to(64|32) x:(I64Load16S _ _)) => x
+(SignExt8to(64|32|16) x:(I64Load8S _ _)) => x
+(ZeroExt32to64 x:(I64Load32U _ _)) => x
+(ZeroExt16to(64|32) x:(I64Load16U _ _)) => x
+(ZeroExt8to(64|32|16) x:(I64Load8U _ _)) => x
+(SignExt32to64 x) && buildcfg.GOWASM.SignExt => (I64Extend32S x)
+(SignExt8to(64|32|16) x) && buildcfg.GOWASM.SignExt => (I64Extend8S x)
+(SignExt16to(64|32) x) && buildcfg.GOWASM.SignExt => (I64Extend16S x)
+(SignExt32to64 x) => (I64ShrS (I64Shl x (I64Const [32])) (I64Const [32]))
+(SignExt16to(64|32) x) => (I64ShrS (I64Shl x (I64Const [48])) (I64Const [48]))
+(SignExt8to(64|32|16) x) => (I64ShrS (I64Shl x (I64Const [56])) (I64Const [56]))
+(ZeroExt32to64 x) => (I64And x (I64Const [0xffffffff]))
+(ZeroExt16to(64|32) x) => (I64And x (I64Const [0xffff]))
+(ZeroExt8to(64|32|16) x) => (I64And x (I64Const [0xff]))
+
+(Slicemask x) => (I64ShrS (I64Sub (I64Const [0]) x) (I64Const [63]))
+
+// Lowering truncation
+// Because we ignore the high parts, truncates are just copies.
+(Trunc64to(32|16|8) ...) => (Copy ...)
+(Trunc32to(16|8) ...) => (Copy ...)
+(Trunc16to8 ...) => (Copy ...)
+
+// Lowering float <=> int
+(Cvt32to(64|32)F x) => (F(64|32)ConvertI64S (SignExt32to64 x))
+(Cvt64to(64|32)F ...) => (F(64|32)ConvertI64S ...)
+(Cvt32Uto(64|32)F x) => (F(64|32)ConvertI64U (ZeroExt32to64 x))
+(Cvt64Uto(64|32)F ...) => (F(64|32)ConvertI64U ...)
+
+(Cvt32Fto32 ...) => (I64TruncSatF32S ...)
+(Cvt32Fto64 ...) => (I64TruncSatF32S ...)
+(Cvt64Fto32 ...) => (I64TruncSatF64S ...)
+(Cvt64Fto64 ...) => (I64TruncSatF64S ...)
+(Cvt32Fto32U ...) => (I64TruncSatF32U ...)
+(Cvt32Fto64U ...) => (I64TruncSatF32U ...)
+(Cvt64Fto32U ...) => (I64TruncSatF64U ...)
+(Cvt64Fto64U ...) => (I64TruncSatF64U ...)
+
+(Cvt32Fto64F ...) => (F64PromoteF32 ...)
+(Cvt64Fto32F ...) => (F32DemoteF64 ...)
+
+(CvtBoolToUint8 ...) => (Copy ...)
+
+(Round32F ...) => (Copy ...)
+(Round64F ...) => (Copy ...)
+
+// Lowering shifts
+// Unsigned shifts need to return 0 if shift amount is >= width of shifted value.
+
+(Lsh64x64 x y) && shiftIsBounded(v) => (I64Shl x y)
+(Lsh64x64 x (I64Const [c])) && uint64(c) < 64 => (I64Shl x (I64Const [c]))
+(Lsh64x64 x (I64Const [c])) && uint64(c) >= 64 => (I64Const [0])
+(Lsh64x64 x y) => (Select (I64Shl x y) (I64Const [0]) (I64LtU y (I64Const [64])))
+(Lsh64x(32|16|8) [c] x y) => (Lsh64x64 [c] x (ZeroExt(32|16|8)to64 y))
+
+(Lsh32x64 ...) => (Lsh64x64 ...)
+(Lsh32x(32|16|8) [c] x y) => (Lsh64x64 [c] x (ZeroExt(32|16|8)to64 y))
+
+(Lsh16x64 ...) => (Lsh64x64 ...)
+(Lsh16x(32|16|8) [c] x y) => (Lsh64x64 [c] x (ZeroExt(32|16|8)to64 y))
+
+(Lsh8x64 ...) => (Lsh64x64 ...)
+(Lsh8x(32|16|8) [c] x y) => (Lsh64x64 [c] x (ZeroExt(32|16|8)to64 y))
+
+(Rsh64Ux64 x y) && shiftIsBounded(v) => (I64ShrU x y)
+(Rsh64Ux64 x (I64Const [c])) && uint64(c) < 64 => (I64ShrU x (I64Const [c]))
+(Rsh64Ux64 x (I64Const [c])) && uint64(c) >= 64 => (I64Const [0])
+(Rsh64Ux64 x y) => (Select (I64ShrU x y) (I64Const [0]) (I64LtU y (I64Const [64])))
+(Rsh64Ux(32|16|8) [c] x y) => (Rsh64Ux64 [c] x (ZeroExt(32|16|8)to64 y))
+
+(Rsh32Ux64 [c] x y) => (Rsh64Ux64 [c] (ZeroExt32to64 x) y)
+(Rsh32Ux(32|16|8) [c] x y) => (Rsh64Ux64 [c] (ZeroExt32to64 x) (ZeroExt(32|16|8)to64 y))
+
+(Rsh16Ux64 [c] x y) => (Rsh64Ux64 [c] (ZeroExt16to64 x) y)
+(Rsh16Ux(32|16|8) [c] x y) => (Rsh64Ux64 [c] (ZeroExt16to64 x) (ZeroExt(32|16|8)to64 y))
+
+(Rsh8Ux64 [c] x y) => (Rsh64Ux64 [c] (ZeroExt8to64 x) y)
+(Rsh8Ux(32|16|8) [c] x y) => (Rsh64Ux64 [c] (ZeroExt8to64 x) (ZeroExt(32|16|8)to64 y))
+
+// Signed right shift needs to return 0/-1 if shift amount is >= width of shifted value.
+// We implement this by setting the shift value to (width - 1) if the shift value is >= width.
+
+(Rsh64x64 x y) && shiftIsBounded(v) => (I64ShrS x y)
+(Rsh64x64 x (I64Const [c])) && uint64(c) < 64 => (I64ShrS x (I64Const [c]))
+(Rsh64x64 x (I64Const [c])) && uint64(c) >= 64 => (I64ShrS x (I64Const [63]))
+(Rsh64x64 x y) => (I64ShrS x (Select <typ.Int64> y (I64Const [63]) (I64LtU y (I64Const [64]))))
+(Rsh64x(32|16|8) [c] x y) => (Rsh64x64 [c] x (ZeroExt(32|16|8)to64 y))
+
+(Rsh32x64 [c] x y) => (Rsh64x64 [c] (SignExt32to64 x) y)
+(Rsh32x(32|16|8) [c] x y) => (Rsh64x64 [c] (SignExt32to64 x) (ZeroExt(32|16|8)to64 y))
+
+(Rsh16x64 [c] x y) => (Rsh64x64 [c] (SignExt16to64 x) y)
+(Rsh16x(32|16|8) [c] x y) => (Rsh64x64 [c] (SignExt16to64 x) (ZeroExt(32|16|8)to64 y))
+
+(Rsh8x64 [c] x y) => (Rsh64x64 [c] (SignExt8to64 x) y)
+(Rsh8x(32|16|8) [c] x y) => (Rsh64x64 [c] (SignExt8to64 x) (ZeroExt(32|16|8)to64 y))
+
+// Lowering rotates
+(RotateLeft8 <t> x (I64Const [c])) => (Or8 (Lsh8x64 <t> x (I64Const [c&7])) (Rsh8Ux64 <t> x (I64Const [-c&7])))
+(RotateLeft16 <t> x (I64Const [c])) => (Or16 (Lsh16x64 <t> x (I64Const [c&15])) (Rsh16Ux64 <t> x (I64Const [-c&15])))
+(RotateLeft32 ...) => (I32Rotl ...)
+(RotateLeft64 ...) => (I64Rotl ...)
+
+// Lowering comparisons
+(Less64 ...) => (I64LtS ...)
+(Less32 x y) => (I64LtS (SignExt32to64 x) (SignExt32to64 y))
+(Less16 x y) => (I64LtS (SignExt16to64 x) (SignExt16to64 y))
+(Less8 x y) => (I64LtS (SignExt8to64 x) (SignExt8to64 y))
+(Less64U ...) => (I64LtU ...)
+(Less32U x y) => (I64LtU (ZeroExt32to64 x) (ZeroExt32to64 y))
+(Less16U x y) => (I64LtU (ZeroExt16to64 x) (ZeroExt16to64 y))
+(Less8U x y) => (I64LtU (ZeroExt8to64 x) (ZeroExt8to64 y))
+(Less(64|32)F ...) => (F(64|32)Lt ...)
+
+(Leq64 ...) => (I64LeS ...)
+(Leq32 x y) => (I64LeS (SignExt32to64 x) (SignExt32to64 y))
+(Leq16 x y) => (I64LeS (SignExt16to64 x) (SignExt16to64 y))
+(Leq8 x y) => (I64LeS (SignExt8to64 x) (SignExt8to64 y))
+(Leq64U ...) => (I64LeU ...)
+(Leq32U x y) => (I64LeU (ZeroExt32to64 x) (ZeroExt32to64 y))
+(Leq16U x y) => (I64LeU (ZeroExt16to64 x) (ZeroExt16to64 y))
+(Leq8U x y) => (I64LeU (ZeroExt8to64 x) (ZeroExt8to64 y))
+(Leq(64|32)F ...) => (F(64|32)Le ...)
+
+(Eq64 ...) => (I64Eq ...)
+(Eq32 x y) => (I64Eq (ZeroExt32to64 x) (ZeroExt32to64 y))
+(Eq16 x y) => (I64Eq (ZeroExt16to64 x) (ZeroExt16to64 y))
+(Eq8 x y) => (I64Eq (ZeroExt8to64 x) (ZeroExt8to64 y))
+(EqB ...) => (I64Eq ...)
+(EqPtr ...) => (I64Eq ...)
+(Eq(64|32)F ...) => (F(64|32)Eq ...)
+
+(Neq64 ...) => (I64Ne ...)
+(Neq32 x y) => (I64Ne (ZeroExt32to64 x) (ZeroExt32to64 y))
+(Neq16 x y) => (I64Ne (ZeroExt16to64 x) (ZeroExt16to64 y))
+(Neq8 x y) => (I64Ne (ZeroExt8to64 x) (ZeroExt8to64 y))
+(NeqB ...) => (I64Ne ...)
+(NeqPtr ...) => (I64Ne ...)
+(Neq(64|32)F ...) => (F(64|32)Ne ...)
+
+// Lowering loads
+(Load <t> ptr mem) && is32BitFloat(t) => (F32Load ptr mem)
+(Load <t> ptr mem) && is64BitFloat(t) => (F64Load ptr mem)
+(Load <t> ptr mem) && t.Size() == 8 => (I64Load ptr mem)
+(Load <t> ptr mem) && t.Size() == 4 && !t.IsSigned() => (I64Load32U ptr mem)
+(Load <t> ptr mem) && t.Size() == 4 && t.IsSigned() => (I64Load32S ptr mem)
+(Load <t> ptr mem) && t.Size() == 2 && !t.IsSigned() => (I64Load16U ptr mem)
+(Load <t> ptr mem) && t.Size() == 2 && t.IsSigned() => (I64Load16S ptr mem)
+(Load <t> ptr mem) && t.Size() == 1 && !t.IsSigned() => (I64Load8U ptr mem)
+(Load <t> ptr mem) && t.Size() == 1 && t.IsSigned() => (I64Load8S ptr mem)
+
+// Lowering stores
+(Store {t} ptr val mem) && is64BitFloat(t) => (F64Store ptr val mem)
+(Store {t} ptr val mem) && is32BitFloat(t) => (F32Store ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 => (I64Store ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 => (I64Store32 ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 2 => (I64Store16 ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 1 => (I64Store8 ptr val mem)
+
+// Lowering moves
+(Move [0] _ _ mem) => mem
+(Move [1] dst src mem) => (I64Store8 dst (I64Load8U src mem) mem)
+(Move [2] dst src mem) => (I64Store16 dst (I64Load16U src mem) mem)
+(Move [4] dst src mem) => (I64Store32 dst (I64Load32U src mem) mem)
+(Move [8] dst src mem) => (I64Store dst (I64Load src mem) mem)
+(Move [16] dst src mem) =>
+ (I64Store [8] dst (I64Load [8] src mem)
+ (I64Store dst (I64Load src mem) mem))
+(Move [3] dst src mem) =>
+ (I64Store8 [2] dst (I64Load8U [2] src mem)
+ (I64Store16 dst (I64Load16U src mem) mem))
+(Move [5] dst src mem) =>
+ (I64Store8 [4] dst (I64Load8U [4] src mem)
+ (I64Store32 dst (I64Load32U src mem) mem))
+(Move [6] dst src mem) =>
+ (I64Store16 [4] dst (I64Load16U [4] src mem)
+ (I64Store32 dst (I64Load32U src mem) mem))
+(Move [7] dst src mem) =>
+ (I64Store32 [3] dst (I64Load32U [3] src mem)
+ (I64Store32 dst (I64Load32U src mem) mem))
+(Move [s] dst src mem) && s > 8 && s < 16 =>
+ (I64Store [s-8] dst (I64Load [s-8] src mem)
+ (I64Store dst (I64Load src mem) mem))
+
+// Large copying uses helper.
+(Move [s] dst src mem) && logLargeCopy(v, s) =>
+ (LoweredMove [s] dst src mem)
+
+// Lowering Zero instructions
+(Zero [0] _ mem) => mem
+(Zero [1] destptr mem) => (I64Store8 destptr (I64Const [0]) mem)
+(Zero [2] destptr mem) => (I64Store16 destptr (I64Const [0]) mem)
+(Zero [4] destptr mem) => (I64Store32 destptr (I64Const [0]) mem)
+(Zero [8] destptr mem) => (I64Store destptr (I64Const [0]) mem)
+
+(Zero [3] destptr mem) =>
+ (I64Store8 [2] destptr (I64Const [0])
+ (I64Store16 destptr (I64Const [0]) mem))
+(Zero [5] destptr mem) =>
+ (I64Store8 [4] destptr (I64Const [0])
+ (I64Store32 destptr (I64Const [0]) mem))
+(Zero [6] destptr mem) =>
+ (I64Store16 [4] destptr (I64Const [0])
+ (I64Store32 destptr (I64Const [0]) mem))
+(Zero [7] destptr mem) =>
+ (I64Store32 [3] destptr (I64Const [0])
+ (I64Store32 destptr (I64Const [0]) mem))
+
+// Strip off any fractional word zeroing.
+(Zero [s] destptr mem) && s%8 != 0 && s > 8 && s < 32 =>
+ (Zero [s-s%8] (OffPtr <destptr.Type> destptr [s%8])
+ (I64Store destptr (I64Const [0]) mem))
+
+// Zero small numbers of words directly.
+(Zero [16] destptr mem) =>
+ (I64Store [8] destptr (I64Const [0])
+ (I64Store destptr (I64Const [0]) mem))
+(Zero [24] destptr mem) =>
+ (I64Store [16] destptr (I64Const [0])
+ (I64Store [8] destptr (I64Const [0])
+ (I64Store destptr (I64Const [0]) mem)))
+(Zero [32] destptr mem) =>
+ (I64Store [24] destptr (I64Const [0])
+ (I64Store [16] destptr (I64Const [0])
+ (I64Store [8] destptr (I64Const [0])
+ (I64Store destptr (I64Const [0]) mem))))
+
+// Large zeroing uses helper.
+(Zero [s] destptr mem) =>
+ (LoweredZero [s] destptr mem)
+
+// Lowering constants
+(Const64 ...) => (I64Const ...)
+(Const(32|16|8) [c]) => (I64Const [int64(c)])
+(Const(64|32)F ...) => (F(64|32)Const ...)
+(ConstNil) => (I64Const [0])
+(ConstBool [c]) => (I64Const [b2i(c)])
+
+// Lowering calls
+(StaticCall ...) => (LoweredStaticCall ...)
+(ClosureCall ...) => (LoweredClosureCall ...)
+(InterCall ...) => (LoweredInterCall ...)
+(TailCall ...) => (LoweredTailCall ...)
+
+// Miscellaneous
+(Convert ...) => (LoweredConvert ...)
+(IsNonNil p) => (I64Eqz (I64Eqz p))
+(IsInBounds ...) => (I64LtU ...)
+(IsSliceInBounds ...) => (I64LeU ...)
+(NilCheck ...) => (LoweredNilCheck ...)
+(GetClosurePtr ...) => (LoweredGetClosurePtr ...)
+(GetCallerPC ...) => (LoweredGetCallerPC ...)
+(GetCallerSP ...) => (LoweredGetCallerSP ...)
+(Addr {sym} base) => (LoweredAddr {sym} [0] base)
+(LocalAddr {sym} base _) => (LoweredAddr {sym} base)
+
+// Write barrier.
+(WB ...) => (LoweredWB ...)
+
+// --- Intrinsics ---
+(Sqrt ...) => (F64Sqrt ...)
+(Trunc ...) => (F64Trunc ...)
+(Ceil ...) => (F64Ceil ...)
+(Floor ...) => (F64Floor ...)
+(RoundToEven ...) => (F64Nearest ...)
+(Abs ...) => (F64Abs ...)
+(Copysign ...) => (F64Copysign ...)
+
+(Sqrt32 ...) => (F32Sqrt ...)
+
+(Ctz64 ...) => (I64Ctz ...)
+(Ctz32 x) => (I64Ctz (I64Or x (I64Const [0x100000000])))
+(Ctz16 x) => (I64Ctz (I64Or x (I64Const [0x10000])))
+(Ctz8 x) => (I64Ctz (I64Or x (I64Const [0x100])))
+
+(Ctz(64|32|16|8)NonZero ...) => (I64Ctz ...)
+
+(BitLen64 x) => (I64Sub (I64Const [64]) (I64Clz x))
+
+(PopCount64 ...) => (I64Popcnt ...)
+(PopCount32 x) => (I64Popcnt (ZeroExt32to64 x))
+(PopCount16 x) => (I64Popcnt (ZeroExt16to64 x))
+(PopCount8 x) => (I64Popcnt (ZeroExt8to64 x))
+
+(CondSelect ...) => (Select ...)
+
+// --- Optimizations ---
+(I64Add (I64Const [x]) (I64Const [y])) => (I64Const [x + y])
+(I64Mul (I64Const [x]) (I64Const [y])) => (I64Const [x * y])
+(I64And (I64Const [x]) (I64Const [y])) => (I64Const [x & y])
+(I64Or (I64Const [x]) (I64Const [y])) => (I64Const [x | y])
+(I64Xor (I64Const [x]) (I64Const [y])) => (I64Const [x ^ y])
+(F64Add (F64Const [x]) (F64Const [y])) => (F64Const [x + y])
+(F64Mul (F64Const [x]) (F64Const [y])) && !math.IsNaN(x * y) => (F64Const [x * y])
+(I64Eq (I64Const [x]) (I64Const [y])) && x == y => (I64Const [1])
+(I64Eq (I64Const [x]) (I64Const [y])) && x != y => (I64Const [0])
+(I64Ne (I64Const [x]) (I64Const [y])) && x == y => (I64Const [0])
+(I64Ne (I64Const [x]) (I64Const [y])) && x != y => (I64Const [1])
+
+(I64Shl (I64Const [x]) (I64Const [y])) => (I64Const [x << uint64(y)])
+(I64ShrU (I64Const [x]) (I64Const [y])) => (I64Const [int64(uint64(x) >> uint64(y))])
+(I64ShrS (I64Const [x]) (I64Const [y])) => (I64Const [x >> uint64(y)])
+
+// TODO: declare these operations as commutative and get rid of these rules?
+(I64Add (I64Const [x]) y) && y.Op != OpWasmI64Const => (I64Add y (I64Const [x]))
+(I64Mul (I64Const [x]) y) && y.Op != OpWasmI64Const => (I64Mul y (I64Const [x]))
+(I64And (I64Const [x]) y) && y.Op != OpWasmI64Const => (I64And y (I64Const [x]))
+(I64Or (I64Const [x]) y) && y.Op != OpWasmI64Const => (I64Or y (I64Const [x]))
+(I64Xor (I64Const [x]) y) && y.Op != OpWasmI64Const => (I64Xor y (I64Const [x]))
+(F64Add (F64Const [x]) y) && y.Op != OpWasmF64Const => (F64Add y (F64Const [x]))
+(F64Mul (F64Const [x]) y) && y.Op != OpWasmF64Const => (F64Mul y (F64Const [x]))
+(I64Eq (I64Const [x]) y) && y.Op != OpWasmI64Const => (I64Eq y (I64Const [x]))
+(I64Ne (I64Const [x]) y) && y.Op != OpWasmI64Const => (I64Ne y (I64Const [x]))
+
+(I64Eq x (I64Const [0])) => (I64Eqz x)
+(I64LtU (I64Const [0]) x) => (I64Eqz (I64Eqz x))
+(I64LeU x (I64Const [0])) => (I64Eqz x)
+(I64LtU x (I64Const [1])) => (I64Eqz x)
+(I64LeU (I64Const [1]) x) => (I64Eqz (I64Eqz x))
+(I64Ne x (I64Const [0])) => (I64Eqz (I64Eqz x))
+
+(I64Add x (I64Const [y])) => (I64AddConst [y] x)
+(I64AddConst [0] x) => x
+(I64Eqz (I64Eqz (I64Eqz x))) => (I64Eqz x)
+
+// folding offset into load/store
+((I64Load|I64Load32U|I64Load32S|I64Load16U|I64Load16S|I64Load8U|I64Load8S) [off] (I64AddConst [off2] ptr) mem)
+ && isU32Bit(off+off2) =>
+ ((I64Load|I64Load32U|I64Load32S|I64Load16U|I64Load16S|I64Load8U|I64Load8S) [off+off2] ptr mem)
+
+((I64Store|I64Store32|I64Store16|I64Store8) [off] (I64AddConst [off2] ptr) val mem)
+ && isU32Bit(off+off2) =>
+ ((I64Store|I64Store32|I64Store16|I64Store8) [off+off2] ptr val mem)
+
+// folding offset into address
+(I64AddConst [off] (LoweredAddr {sym} [off2] base)) && isU32Bit(off+int64(off2)) =>
+ (LoweredAddr {sym} [int32(off)+off2] base)
+(I64AddConst [off] x:(SP)) && isU32Bit(off) => (LoweredAddr [int32(off)] x) // so it is rematerializeable
+
+// transforming readonly globals into constants
+(I64Load [off] (LoweredAddr {sym} [off2] (SB)) _) && symIsRO(sym) && isU32Bit(off+int64(off2)) => (I64Const [int64(read64(sym, off+int64(off2), config.ctxt.Arch.ByteOrder))])
+(I64Load32U [off] (LoweredAddr {sym} [off2] (SB)) _) && symIsRO(sym) && isU32Bit(off+int64(off2)) => (I64Const [int64(read32(sym, off+int64(off2), config.ctxt.Arch.ByteOrder))])
+(I64Load16U [off] (LoweredAddr {sym} [off2] (SB)) _) && symIsRO(sym) && isU32Bit(off+int64(off2)) => (I64Const [int64(read16(sym, off+int64(off2), config.ctxt.Arch.ByteOrder))])
+(I64Load8U [off] (LoweredAddr {sym} [off2] (SB)) _) && symIsRO(sym) && isU32Bit(off+int64(off2)) => (I64Const [int64(read8(sym, off+int64(off2)))])
diff --git a/src/cmd/compile/internal/ssa/_gen/WasmOps.go b/src/cmd/compile/internal/ssa/_gen/WasmOps.go
new file mode 100644
index 0000000..cd127b5
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/WasmOps.go
@@ -0,0 +1,277 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import "strings"
+
+ // regNamesWasm lists the register names known to the Wasm backend.
+ // A name's index in this slice is its register number: init builds the
+ // name-to-number map and the register bit masks from these positions,
+ // so the order of the entries is significant.
+ var regNamesWasm = []string{
+ // R0-R15 make up the gp mask built in init.
+ "R0",
+ "R1",
+ "R2",
+ "R3",
+ "R4",
+ "R5",
+ "R6",
+ "R7",
+ "R8",
+ "R9",
+ "R10",
+ "R11",
+ "R12",
+ "R13",
+ "R14",
+ "R15",
+
+ // F0-F15 make up the fp32 mask built in init.
+ "F0",
+ "F1",
+ "F2",
+ "F3",
+ "F4",
+ "F5",
+ "F6",
+ "F7",
+ "F8",
+ "F9",
+ "F10",
+ "F11",
+ "F12",
+ "F13",
+ "F14",
+ "F15",
+
+ // F16-F31 make up the fp64 mask built in init.
+ "F16",
+ "F17",
+ "F18",
+ "F19",
+ "F20",
+ "F21",
+ "F22",
+ "F23",
+ "F24",
+ "F25",
+ "F26",
+ "F27",
+ "F28",
+ "F29",
+ "F30",
+ "F31",
+
+ "SP",
+ "g",
+
+ // pseudo-registers
+ "SB",
+}
+
+func init() {
+ // Make map from reg names to reg integers.
+ if len(regNamesWasm) > 64 {
+ panic("too many registers")
+ }
+ num := map[string]int{}
+ for i, name := range regNamesWasm {
+ num[name] = i
+ }
+ buildReg := func(s string) regMask {
+ m := regMask(0)
+ for _, r := range strings.Split(s, " ") {
+ if n, ok := num[r]; ok {
+ m |= regMask(1) << uint(n)
+ continue
+ }
+ panic("register " + r + " not found")
+ }
+ return m
+ }
+
+ var (
+ gp = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15")
+ fp32 = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15")
+ fp64 = buildReg("F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31")
+ gpsp = gp | buildReg("SP")
+ gpspsb = gpsp | buildReg("SB")
+ // The "registers", which are actually local variables, can get clobbered
+ // if we're switching goroutines, because it unwinds the WebAssembly stack.
+ callerSave = gp | fp32 | fp64 | buildReg("g")
+ )
+
+ // Common regInfo
+ var (
+ gp01 = regInfo{inputs: nil, outputs: []regMask{gp}}
+ gp11 = regInfo{inputs: []regMask{gpsp}, outputs: []regMask{gp}}
+ gp21 = regInfo{inputs: []regMask{gpsp, gpsp}, outputs: []regMask{gp}}
+ gp31 = regInfo{inputs: []regMask{gpsp, gpsp, gpsp}, outputs: []regMask{gp}}
+ fp32_01 = regInfo{inputs: nil, outputs: []regMask{fp32}}
+ fp32_11 = regInfo{inputs: []regMask{fp32}, outputs: []regMask{fp32}}
+ fp32_21 = regInfo{inputs: []regMask{fp32, fp32}, outputs: []regMask{fp32}}
+ fp32_21gp = regInfo{inputs: []regMask{fp32, fp32}, outputs: []regMask{gp}}
+ fp64_01 = regInfo{inputs: nil, outputs: []regMask{fp64}}
+ fp64_11 = regInfo{inputs: []regMask{fp64}, outputs: []regMask{fp64}}
+ fp64_21 = regInfo{inputs: []regMask{fp64, fp64}, outputs: []regMask{fp64}}
+ fp64_21gp = regInfo{inputs: []regMask{fp64, fp64}, outputs: []regMask{gp}}
+ gpload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: []regMask{gp}}
+ gpstore = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}
+ fp32load = regInfo{inputs: []regMask{gpspsb, 0}, outputs: []regMask{fp32}}
+ fp32store = regInfo{inputs: []regMask{gpspsb, fp32, 0}}
+ fp64load = regInfo{inputs: []regMask{gpspsb, 0}, outputs: []regMask{fp64}}
+ fp64store = regInfo{inputs: []regMask{gpspsb, fp64, 0}}
+ )
+
+ var WasmOps = []opData{
+ {name: "LoweredStaticCall", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "LoweredTailCall", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "LoweredClosureCall", argLength: 3, reg: regInfo{inputs: []regMask{gp, gp, 0}, clobbers: callerSave}, aux: "CallOff", call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+ {name: "LoweredInterCall", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+
+ {name: "LoweredAddr", argLength: 1, reg: gp11, aux: "SymOff", rematerializeable: true, symEffect: "Addr"}, // returns base+aux+auxint, arg0=base
+ {name: "LoweredMove", argLength: 3, reg: regInfo{inputs: []regMask{gp, gp}}, aux: "Int64"}, // large move. arg0=dst, arg1=src, arg2=mem, auxint=len, returns mem
+ {name: "LoweredZero", argLength: 2, reg: regInfo{inputs: []regMask{gp}}, aux: "Int64"}, // large zeroing. arg0=start, arg1=mem, auxint=len, returns mem
+
+ {name: "LoweredGetClosurePtr", reg: gp01}, // returns wasm.REG_CTXT, the closure pointer
+ {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true}, // returns the PC of the caller of the current function
+ {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true}, // returns the SP of the caller of the current function
+ {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gp}}, nilCheck: true, faultOnNilArg0: true}, // panic if arg0 is nil. arg1=mem
+ {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{gp, gp}}, aux: "Sym", symEffect: "None"}, // invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+
+ // LoweredConvert converts between pointers and integers.
+ // We have a special op for this so as to not confuse GC
+ // (particularly stack maps). It takes a memory arg so it
+ // gets correctly ordered with respect to GC safepoints.
+ // arg0=ptr/int arg1=mem, output=int/ptr
+ //
+ // TODO(neelance): LoweredConvert should not be necessary any more, since OpConvert does not need to be lowered any more (CL 108496).
+ {name: "LoweredConvert", argLength: 2, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}},
+
+ // The following are native WebAssembly instructions, see https://webassembly.github.io/spec/core/syntax/instructions.html
+
+ {name: "Select", asm: "Select", argLength: 3, reg: gp31}, // returns arg0 if arg2 != 0, otherwise returns arg1
+
+ {name: "I64Load8U", asm: "I64Load8U", argLength: 2, reg: gpload, aux: "Int64", typ: "UInt8"}, // read unsigned 8-bit integer from address arg0+aux, arg1=mem
+ {name: "I64Load8S", asm: "I64Load8S", argLength: 2, reg: gpload, aux: "Int64", typ: "Int8"}, // read signed 8-bit integer from address arg0+aux, arg1=mem
+ {name: "I64Load16U", asm: "I64Load16U", argLength: 2, reg: gpload, aux: "Int64", typ: "UInt16"}, // read unsigned 16-bit integer from address arg0+aux, arg1=mem
+ {name: "I64Load16S", asm: "I64Load16S", argLength: 2, reg: gpload, aux: "Int64", typ: "Int16"}, // read signed 16-bit integer from address arg0+aux, arg1=mem
+ {name: "I64Load32U", asm: "I64Load32U", argLength: 2, reg: gpload, aux: "Int64", typ: "UInt32"}, // read unsigned 32-bit integer from address arg0+aux, arg1=mem
+ {name: "I64Load32S", asm: "I64Load32S", argLength: 2, reg: gpload, aux: "Int64", typ: "Int32"}, // read signed 32-bit integer from address arg0+aux, arg1=mem
+ {name: "I64Load", asm: "I64Load", argLength: 2, reg: gpload, aux: "Int64", typ: "UInt64"}, // read 64-bit integer from address arg0+aux, arg1=mem
+ {name: "I64Store8", asm: "I64Store8", argLength: 3, reg: gpstore, aux: "Int64", typ: "Mem"}, // store 8-bit integer arg1 at address arg0+aux, arg2=mem, returns mem
+ {name: "I64Store16", asm: "I64Store16", argLength: 3, reg: gpstore, aux: "Int64", typ: "Mem"}, // store 16-bit integer arg1 at address arg0+aux, arg2=mem, returns mem
+ {name: "I64Store32", asm: "I64Store32", argLength: 3, reg: gpstore, aux: "Int64", typ: "Mem"}, // store 32-bit integer arg1 at address arg0+aux, arg2=mem, returns mem
+ {name: "I64Store", asm: "I64Store", argLength: 3, reg: gpstore, aux: "Int64", typ: "Mem"}, // store 64-bit integer arg1 at address arg0+aux, arg2=mem, returns mem
+
+ {name: "F32Load", asm: "F32Load", argLength: 2, reg: fp32load, aux: "Int64", typ: "Float32"}, // read 32-bit float from address arg0+aux, arg1=mem
+ {name: "F64Load", asm: "F64Load", argLength: 2, reg: fp64load, aux: "Int64", typ: "Float64"}, // read 64-bit float from address arg0+aux, arg1=mem
+ {name: "F32Store", asm: "F32Store", argLength: 3, reg: fp32store, aux: "Int64", typ: "Mem"}, // store 32-bit float arg1 at address arg0+aux, arg2=mem, returns mem
+ {name: "F64Store", asm: "F64Store", argLength: 3, reg: fp64store, aux: "Int64", typ: "Mem"}, // store 64-bit float arg1 at address arg0+aux, arg2=mem, returns mem
+
+ {name: "I64Const", reg: gp01, aux: "Int64", rematerializeable: true, typ: "Int64"}, // returns the constant integer aux
+ {name: "F32Const", reg: fp32_01, aux: "Float32", rematerializeable: true, typ: "Float32"}, // returns the constant float aux
+ {name: "F64Const", reg: fp64_01, aux: "Float64", rematerializeable: true, typ: "Float64"}, // returns the constant float aux
+
+ {name: "I64Eqz", asm: "I64Eqz", argLength: 1, reg: gp11, typ: "Bool"}, // arg0 == 0
+ {name: "I64Eq", asm: "I64Eq", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 == arg1
+ {name: "I64Ne", asm: "I64Ne", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 != arg1
+ {name: "I64LtS", asm: "I64LtS", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 < arg1 (signed)
+ {name: "I64LtU", asm: "I64LtU", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 < arg1 (unsigned)
+ {name: "I64GtS", asm: "I64GtS", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 > arg1 (signed)
+ {name: "I64GtU", asm: "I64GtU", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 > arg1 (unsigned)
+ {name: "I64LeS", asm: "I64LeS", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 <= arg1 (signed)
+ {name: "I64LeU", asm: "I64LeU", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 <= arg1 (unsigned)
+ {name: "I64GeS", asm: "I64GeS", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 >= arg1 (signed)
+ {name: "I64GeU", asm: "I64GeU", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 >= arg1 (unsigned)
+
+ {name: "F32Eq", asm: "F32Eq", argLength: 2, reg: fp32_21gp, typ: "Bool"}, // arg0 == arg1
+ {name: "F32Ne", asm: "F32Ne", argLength: 2, reg: fp32_21gp, typ: "Bool"}, // arg0 != arg1
+ {name: "F32Lt", asm: "F32Lt", argLength: 2, reg: fp32_21gp, typ: "Bool"}, // arg0 < arg1
+ {name: "F32Gt", asm: "F32Gt", argLength: 2, reg: fp32_21gp, typ: "Bool"}, // arg0 > arg1
+ {name: "F32Le", asm: "F32Le", argLength: 2, reg: fp32_21gp, typ: "Bool"}, // arg0 <= arg1
+ {name: "F32Ge", asm: "F32Ge", argLength: 2, reg: fp32_21gp, typ: "Bool"}, // arg0 >= arg1
+
+ {name: "F64Eq", asm: "F64Eq", argLength: 2, reg: fp64_21gp, typ: "Bool"}, // arg0 == arg1
+ {name: "F64Ne", asm: "F64Ne", argLength: 2, reg: fp64_21gp, typ: "Bool"}, // arg0 != arg1
+ {name: "F64Lt", asm: "F64Lt", argLength: 2, reg: fp64_21gp, typ: "Bool"}, // arg0 < arg1
+ {name: "F64Gt", asm: "F64Gt", argLength: 2, reg: fp64_21gp, typ: "Bool"}, // arg0 > arg1
+ {name: "F64Le", asm: "F64Le", argLength: 2, reg: fp64_21gp, typ: "Bool"}, // arg0 <= arg1
+ {name: "F64Ge", asm: "F64Ge", argLength: 2, reg: fp64_21gp, typ: "Bool"}, // arg0 >= arg1
+
+ {name: "I64Add", asm: "I64Add", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 + arg1
+ {name: "I64AddConst", asm: "I64Add", argLength: 1, reg: gp11, aux: "Int64", typ: "Int64"}, // arg0 + aux
+ {name: "I64Sub", asm: "I64Sub", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 - arg1
+ {name: "I64Mul", asm: "I64Mul", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 * arg1
+ {name: "I64DivS", asm: "I64DivS", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 / arg1 (signed)
+ {name: "I64DivU", asm: "I64DivU", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 / arg1 (unsigned)
+ {name: "I64RemS", asm: "I64RemS", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 % arg1 (signed)
+ {name: "I64RemU", asm: "I64RemU", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 % arg1 (unsigned)
+ {name: "I64And", asm: "I64And", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 & arg1
+ {name: "I64Or", asm: "I64Or", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 | arg1
+ {name: "I64Xor", asm: "I64Xor", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 ^ arg1
+ {name: "I64Shl", asm: "I64Shl", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 << (arg1 % 64)
+ {name: "I64ShrS", asm: "I64ShrS", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 >> (arg1 % 64) (signed)
+ {name: "I64ShrU", asm: "I64ShrU", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 >> (arg1 % 64) (unsigned)
+
+ {name: "F32Neg", asm: "F32Neg", argLength: 1, reg: fp32_11, typ: "Float32"}, // -arg0
+ {name: "F32Add", asm: "F32Add", argLength: 2, reg: fp32_21, typ: "Float32"}, // arg0 + arg1
+ {name: "F32Sub", asm: "F32Sub", argLength: 2, reg: fp32_21, typ: "Float32"}, // arg0 - arg1
+ {name: "F32Mul", asm: "F32Mul", argLength: 2, reg: fp32_21, typ: "Float32"}, // arg0 * arg1
+ {name: "F32Div", asm: "F32Div", argLength: 2, reg: fp32_21, typ: "Float32"}, // arg0 / arg1
+
+ {name: "F64Neg", asm: "F64Neg", argLength: 1, reg: fp64_11, typ: "Float64"}, // -arg0
+ {name: "F64Add", asm: "F64Add", argLength: 2, reg: fp64_21, typ: "Float64"}, // arg0 + arg1
+ {name: "F64Sub", asm: "F64Sub", argLength: 2, reg: fp64_21, typ: "Float64"}, // arg0 - arg1
+ {name: "F64Mul", asm: "F64Mul", argLength: 2, reg: fp64_21, typ: "Float64"}, // arg0 * arg1
+ {name: "F64Div", asm: "F64Div", argLength: 2, reg: fp64_21, typ: "Float64"}, // arg0 / arg1
+
+ {name: "I64TruncSatF64S", asm: "I64TruncSatF64S", argLength: 1, reg: regInfo{inputs: []regMask{fp64}, outputs: []regMask{gp}}, typ: "Int64"}, // truncates the float arg0 to a signed integer (saturating)
+ {name: "I64TruncSatF64U", asm: "I64TruncSatF64U", argLength: 1, reg: regInfo{inputs: []regMask{fp64}, outputs: []regMask{gp}}, typ: "Int64"}, // truncates the float arg0 to an unsigned integer (saturating)
+ {name: "I64TruncSatF32S", asm: "I64TruncSatF32S", argLength: 1, reg: regInfo{inputs: []regMask{fp32}, outputs: []regMask{gp}}, typ: "Int64"}, // truncates the float arg0 to a signed integer (saturating)
+ {name: "I64TruncSatF32U", asm: "I64TruncSatF32U", argLength: 1, reg: regInfo{inputs: []regMask{fp32}, outputs: []regMask{gp}}, typ: "Int64"}, // truncates the float arg0 to an unsigned integer (saturating)
+ {name: "F32ConvertI64S", asm: "F32ConvertI64S", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{fp32}}, typ: "Float32"}, // converts the signed integer arg0 to a float
+ {name: "F32ConvertI64U", asm: "F32ConvertI64U", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{fp32}}, typ: "Float32"}, // converts the unsigned integer arg0 to a float
+ {name: "F64ConvertI64S", asm: "F64ConvertI64S", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{fp64}}, typ: "Float64"}, // converts the signed integer arg0 to a float
+ {name: "F64ConvertI64U", asm: "F64ConvertI64U", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{fp64}}, typ: "Float64"}, // converts the unsigned integer arg0 to a float
+ {name: "F32DemoteF64", asm: "F32DemoteF64", argLength: 1, reg: regInfo{inputs: []regMask{fp64}, outputs: []regMask{fp32}}, typ: "Float32"},
+ {name: "F64PromoteF32", asm: "F64PromoteF32", argLength: 1, reg: regInfo{inputs: []regMask{fp32}, outputs: []regMask{fp64}}, typ: "Float64"},
+
+ {name: "I64Extend8S", asm: "I64Extend8S", argLength: 1, reg: gp11, typ: "Int64"}, // sign-extend arg0 from 8 to 64 bit
+ {name: "I64Extend16S", asm: "I64Extend16S", argLength: 1, reg: gp11, typ: "Int64"}, // sign-extend arg0 from 16 to 64 bit
+ {name: "I64Extend32S", asm: "I64Extend32S", argLength: 1, reg: gp11, typ: "Int64"}, // sign-extend arg0 from 32 to 64 bit
+
+ {name: "F32Sqrt", asm: "F32Sqrt", argLength: 1, reg: fp32_11, typ: "Float32"}, // sqrt(arg0)
+ {name: "F32Trunc", asm: "F32Trunc", argLength: 1, reg: fp32_11, typ: "Float32"}, // trunc(arg0)
+ {name: "F32Ceil", asm: "F32Ceil", argLength: 1, reg: fp32_11, typ: "Float32"}, // ceil(arg0)
+ {name: "F32Floor", asm: "F32Floor", argLength: 1, reg: fp32_11, typ: "Float32"}, // floor(arg0)
+ {name: "F32Nearest", asm: "F32Nearest", argLength: 1, reg: fp32_11, typ: "Float32"}, // round(arg0)
+ {name: "F32Abs", asm: "F32Abs", argLength: 1, reg: fp32_11, typ: "Float32"}, // abs(arg0)
+ {name: "F32Copysign", asm: "F32Copysign", argLength: 2, reg: fp32_21, typ: "Float32"}, // copysign(arg0, arg1)
+
+ {name: "F64Sqrt", asm: "F64Sqrt", argLength: 1, reg: fp64_11, typ: "Float64"}, // sqrt(arg0)
+ {name: "F64Trunc", asm: "F64Trunc", argLength: 1, reg: fp64_11, typ: "Float64"}, // trunc(arg0)
+ {name: "F64Ceil", asm: "F64Ceil", argLength: 1, reg: fp64_11, typ: "Float64"}, // ceil(arg0)
+ {name: "F64Floor", asm: "F64Floor", argLength: 1, reg: fp64_11, typ: "Float64"}, // floor(arg0)
+ {name: "F64Nearest", asm: "F64Nearest", argLength: 1, reg: fp64_11, typ: "Float64"}, // round(arg0)
+ {name: "F64Abs", asm: "F64Abs", argLength: 1, reg: fp64_11, typ: "Float64"}, // abs(arg0)
+ {name: "F64Copysign", asm: "F64Copysign", argLength: 2, reg: fp64_21, typ: "Float64"}, // copysign(arg0, arg1)
+
+ {name: "I64Ctz", asm: "I64Ctz", argLength: 1, reg: gp11, typ: "Int64"}, // ctz(arg0)
+ {name: "I64Clz", asm: "I64Clz", argLength: 1, reg: gp11, typ: "Int64"}, // clz(arg0)
+ {name: "I32Rotl", asm: "I32Rotl", argLength: 2, reg: gp21, typ: "Int32"}, // rotl(arg0, arg1)
+ {name: "I64Rotl", asm: "I64Rotl", argLength: 2, reg: gp21, typ: "Int64"}, // rotl(arg0, arg1)
+ {name: "I64Popcnt", asm: "I64Popcnt", argLength: 1, reg: gp11, typ: "Int64"}, // popcnt(arg0)
+ }
+
+ archs = append(archs, arch{
+ name: "Wasm",
+ pkg: "cmd/internal/obj/wasm",
+ genfile: "../../wasm/ssa.go",
+ ops: WasmOps,
+ blocks: nil,
+ regnames: regNamesWasm,
+ gpregmask: gp,
+ fpregmask: fp32 | fp64,
+ fp32regmask: fp32,
+ fp64regmask: fp64,
+ framepointerreg: -1, // not used
+ linkreg: -1, // not used
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/_gen/allocators.go b/src/cmd/compile/internal/ssa/_gen/allocators.go
new file mode 100644
index 0000000..0f3968c
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/allocators.go
@@ -0,0 +1,198 @@
+package main
+
+// TODO: should we share backing storage for similarly-shaped types?
+// e.g. []*Value and []*Block, or even []int32 and []bool.
+
+import (
+ "bytes"
+ "fmt"
+ "go/format"
+ "io"
+ "log"
+ "os"
+)
+
+type allocator struct { // allocator describes one generated alloc<name>/free<name> method pair and its poolFree<name> pools.
+ name string // name suffix for the generated alloc/free functions (also used for poolFree<name> and c.hdr<name>)
+ typ string // the type they return/accept; a leading '*' selects the pointer code path in genAllocator
+ mak string // fmt template: code to make a new object (takes power-of-2 size as fmt arg)
+ capacity string // fmt template: code to calculate the capacity of an object (takes the object as fmt arg). Should always report a power of 2.
+ resize string // fmt template: code to shrink to sub-power-of-two size (takes object and size as fmt args); "" means no resize code is emitted
+ clear string // fmt template: code for clearing object (takes the object as fmt arg) before putting it on the free list
+ minLog int // log_2 of minimum allocation size; smaller requests are rounded up to 1<<minLog
+ maxLog int // log_2 of maximum allocation size; the generated pool array has maxLog-minLog entries
+}
+
+func genAllocators() { // genAllocators generates ../allocators.go by calling genAllocator once per entry of the table below.
+ allocators := []allocator{
+ {
+ name: "ValueSlice",
+ typ: "[]*Value",
+ capacity: "cap(%s)",
+ mak: "make([]*Value, %s)",
+ resize: "%s[:%s]",
+ clear: "for i := range %[1]s {\n%[1]s[i] = nil\n}",
+ minLog: 5,
+ maxLog: 32,
+ },
+ {
+ name: "BlockSlice",
+ typ: "[]*Block",
+ capacity: "cap(%s)",
+ mak: "make([]*Block, %s)",
+ resize: "%s[:%s]",
+ clear: "for i := range %[1]s {\n%[1]s[i] = nil\n}",
+ minLog: 5,
+ maxLog: 32,
+ },
+ {
+ name: "BoolSlice",
+ typ: "[]bool",
+ capacity: "cap(%s)",
+ mak: "make([]bool, %s)",
+ resize: "%s[:%s]",
+ clear: "for i := range %[1]s {\n%[1]s[i] = false\n}",
+ minLog: 8,
+ maxLog: 32,
+ },
+ {
+ name: "IntSlice",
+ typ: "[]int",
+ capacity: "cap(%s)",
+ mak: "make([]int, %s)",
+ resize: "%s[:%s]",
+ clear: "for i := range %[1]s {\n%[1]s[i] = 0\n}",
+ minLog: 5,
+ maxLog: 32,
+ },
+ {
+ name: "Int32Slice",
+ typ: "[]int32",
+ capacity: "cap(%s)",
+ mak: "make([]int32, %s)",
+ resize: "%s[:%s]",
+ clear: "for i := range %[1]s {\n%[1]s[i] = 0\n}",
+ minLog: 6,
+ maxLog: 32,
+ },
+ {
+ name: "Int8Slice",
+ typ: "[]int8",
+ capacity: "cap(%s)",
+ mak: "make([]int8, %s)",
+ resize: "%s[:%s]",
+ clear: "for i := range %[1]s {\n%[1]s[i] = 0\n}",
+ minLog: 8,
+ maxLog: 32,
+ },
+ {
+ name: "IDSlice",
+ typ: "[]ID",
+ capacity: "cap(%s)",
+ mak: "make([]ID, %s)",
+ resize: "%s[:%s]",
+ clear: "for i := range %[1]s {\n%[1]s[i] = 0\n}",
+ minLog: 6,
+ maxLog: 32,
+ },
+ {
+ name: "SparseSet",
+ typ: "*sparseSet",
+ capacity: "%s.cap()",
+ mak: "newSparseSet(%s)",
+ resize: "", // larger-sized sparse sets are ok
+ clear: "%s.clear()",
+ minLog: 5,
+ maxLog: 32,
+ },
+ {
+ name: "SparseMap",
+ typ: "*sparseMap",
+ capacity: "%s.cap()",
+ mak: "newSparseMap(%s)",
+ resize: "", // larger-sized sparse maps are ok
+ clear: "%s.clear()",
+ minLog: 5,
+ maxLog: 32,
+ },
+ {
+ name: "SparseMapPos",
+ typ: "*sparseMapPos",
+ capacity: "%s.cap()",
+ mak: "newSparseMapPos(%s)",
+ resize: "", // larger-sized sparse maps are ok
+ clear: "%s.clear()",
+ minLog: 5,
+ maxLog: 32,
+ },
+ }
+
+ w := new(bytes.Buffer) // the whole file is built in memory so it can be gofmt'ed before being written
+ fmt.Fprintf(w, "// Code generated from _gen/allocators.go; DO NOT EDIT.\n")
+ fmt.Fprintln(w)
+ fmt.Fprintln(w, "package ssa")
+
+ fmt.Fprintln(w, "import (")
+ fmt.Fprintln(w, "\"math/bits\"")
+ fmt.Fprintln(w, "\"sync\"")
+ fmt.Fprintln(w, ")")
+ for _, a := range allocators {
+ genAllocator(w, a)
+ }
+ // gofmt the result; if formatting fails, print the unformatted source to stdout (to aid debugging) and panic.
+ b := w.Bytes()
+ var err error
+ b, err = format.Source(b)
+ if err != nil {
+ fmt.Printf("%s\n", w.Bytes())
+ panic(err)
+ }
+
+ if err := os.WriteFile("../allocators.go", b, 0666); err != nil { // relative path: resolved against the generator's working directory
+ log.Fatalf("can't write output: %v\n", err)
+ }
+}
+func genAllocator(w io.Writer, a allocator) { // genAllocator writes to w the pool array plus the alloc and free methods described by a.
+ fmt.Fprintf(w, "var poolFree%s [%d]sync.Pool\n", a.name, a.maxLog-a.minLog) // one pool per power-of-2 size class, indexed by b-minLog below
+ fmt.Fprintf(w, "func (c *Cache) alloc%s(n int) %s {\n", a.name, a.typ)
+ fmt.Fprintf(w, "var s %s\n", a.typ)
+ fmt.Fprintf(w, "n2 := n\n")
+ fmt.Fprintf(w, "if n2 < %d { n2 = %d }\n", 1<<a.minLog, 1<<a.minLog) // round small requests up to the minimum size
+ fmt.Fprintf(w, "b := bits.Len(uint(n2-1))\n") // b is the log_2 of n2 rounded up to a power of 2
+ fmt.Fprintf(w, "v := poolFree%s[b-%d].Get()\n", a.name, a.minLog)
+ fmt.Fprintf(w, "if v == nil {\n")
+ fmt.Fprintf(w, " s = %s\n", fmt.Sprintf(a.mak, "1<<b")) // pool was empty: make a fresh object of size 1<<b
+ fmt.Fprintf(w, "} else {\n")
+ if a.typ[0] == '*' { // pointer types are stored in the pool directly
+ fmt.Fprintf(w, "s = v.(%s)\n", a.typ)
+ } else { // other types are pooled behind a pointer; the emptied header is kept on c.hdr<name> for the free function to reuse
+ fmt.Fprintf(w, "sp := v.(*%s)\n", a.typ)
+ fmt.Fprintf(w, "s = *sp\n")
+ fmt.Fprintf(w, "*sp = nil\n")
+ fmt.Fprintf(w, "c.hdr%s = append(c.hdr%s, sp)\n", a.name, a.name)
+ }
+ fmt.Fprintf(w, "}\n")
+ if a.resize != "" { // an empty resize template means the object is returned at its full size
+ fmt.Fprintf(w, "s = %s\n", fmt.Sprintf(a.resize, "s", "n"))
+ }
+ fmt.Fprintf(w, "return s\n")
+ fmt.Fprintf(w, "}\n")
+ fmt.Fprintf(w, "func (c *Cache) free%s(s %s) {\n", a.name, a.typ)
+ fmt.Fprintf(w, "%s\n", fmt.Sprintf(a.clear, "s")) // clear the object before it goes back on the free list
+ fmt.Fprintf(w, "b := bits.Len(uint(%s) - 1)\n", fmt.Sprintf(a.capacity, "s")) // size class is derived from the object's capacity
+ if a.typ[0] == '*' {
+ fmt.Fprintf(w, "poolFree%s[b-%d].Put(s)\n", a.name, a.minLog)
+ } else { // reuse a header saved by the alloc function if one is available, otherwise allocate a new one
+ fmt.Fprintf(w, "var sp *%s\n", a.typ)
+ fmt.Fprintf(w, "if len(c.hdr%s) == 0 {\n", a.name)
+ fmt.Fprintf(w, " sp = new(%s)\n", a.typ)
+ fmt.Fprintf(w, "} else {\n")
+ fmt.Fprintf(w, " sp = c.hdr%s[len(c.hdr%s)-1]\n", a.name, a.name)
+ fmt.Fprintf(w, " c.hdr%s[len(c.hdr%s)-1] = nil\n", a.name, a.name)
+ fmt.Fprintf(w, " c.hdr%s = c.hdr%s[:len(c.hdr%s)-1]\n", a.name, a.name, a.name)
+ fmt.Fprintf(w, "}\n")
+ fmt.Fprintf(w, "*sp = s\n")
+ fmt.Fprintf(w, "poolFree%s[b-%d].Put(sp)\n", a.name, a.minLog)
+ }
+ fmt.Fprintf(w, "}\n")
+}
diff --git a/src/cmd/compile/internal/ssa/_gen/cover.bash b/src/cmd/compile/internal/ssa/_gen/cover.bash
new file mode 100755
index 0000000..7311cfb
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/cover.bash
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+# Copyright 2020 The Go Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# A quick and dirty way to obtain code coverage from rulegen's main func. For
+# example:
+#
+# ./cover.bash && go tool cover -html=cover.out
+#
+# This script is needed to set up a temporary test file, so that we don't break
+# regular 'go run .' usage to run the generator.
+
+cat >main_test.go <<-EOF
+ // +build ignore
+
+ package main
+
+ import "testing"
+
+ func TestCoverage(t *testing.T) { main() }
+EOF
+
+go test -run='^TestCoverage$' -coverprofile=cover.out "$@" *.go
+
+rm -f main_test.go
diff --git a/src/cmd/compile/internal/ssa/_gen/dec.rules b/src/cmd/compile/internal/ssa/_gen/dec.rules
new file mode 100644
index 0000000..b194898
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/dec.rules
@@ -0,0 +1,93 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file contains rules to decompose builtin compound types
+// (complex,string,slice,interface) into their constituent
+// types. These rules work together with the decomposeBuiltIn
+// pass which handles phis of these types.
+
+// complex ops: a complex value is a (real, imag) pair; in memory imag directly follows real (offset 4 for 8-byte, offset 8 for 16-byte complex values).
+(ComplexReal (ComplexMake real _ )) => real
+(ComplexImag (ComplexMake _ imag )) => imag
+
+(Load <t> ptr mem) && t.IsComplex() && t.Size() == 8 =>
+ (ComplexMake
+ (Load <typ.Float32> ptr mem)
+ (Load <typ.Float32>
+ (OffPtr <typ.Float32Ptr> [4] ptr)
+ mem)
+ )
+(Store {t} dst (ComplexMake real imag) mem) && t.Size() == 8 => // real is stored first (inner Store), then imag at dst+4
+ (Store {typ.Float32}
+ (OffPtr <typ.Float32Ptr> [4] dst)
+ imag
+ (Store {typ.Float32} dst real mem))
+(Load <t> ptr mem) && t.IsComplex() && t.Size() == 16 =>
+ (ComplexMake
+ (Load <typ.Float64> ptr mem)
+ (Load <typ.Float64>
+ (OffPtr <typ.Float64Ptr> [8] ptr)
+ mem)
+ )
+(Store {t} dst (ComplexMake real imag) mem) && t.Size() == 16 => // real is stored first (inner Store), then imag at dst+8
+ (Store {typ.Float64}
+ (OffPtr <typ.Float64Ptr> [8] dst)
+ imag
+ (Store {typ.Float64} dst real mem))
+
+// string ops: a string is a (ptr, len) pair; len is stored config.PtrSize bytes after ptr.
+(StringPtr (StringMake ptr _)) => ptr
+(StringLen (StringMake _ len)) => len
+
+(Load <t> ptr mem) && t.IsString() =>
+ (StringMake
+ (Load <typ.BytePtr> ptr mem)
+ (Load <typ.Int>
+ (OffPtr <typ.IntPtr> [config.PtrSize] ptr)
+ mem))
+(Store dst (StringMake ptr len) mem) => // ptr is stored first (inner Store), then len
+ (Store {typ.Int}
+ (OffPtr <typ.IntPtr> [config.PtrSize] dst)
+ len
+ (Store {typ.BytePtr} dst ptr mem))
+
+// slice ops: a slice is a (ptr, len, cap) triple stored at offsets 0, config.PtrSize and 2*config.PtrSize.
+(SlicePtr (SliceMake ptr _ _ )) => ptr
+(SliceLen (SliceMake _ len _)) => len
+(SliceCap (SliceMake _ _ cap)) => cap
+(SlicePtrUnchecked (SliceMake ptr _ _ )) => ptr
+
+(Load <t> ptr mem) && t.IsSlice() => // the pointer word is typed as pointer-to-element; len and cap are typ.Int
+ (SliceMake
+ (Load <t.Elem().PtrTo()> ptr mem)
+ (Load <typ.Int>
+ (OffPtr <typ.IntPtr> [config.PtrSize] ptr)
+ mem)
+ (Load <typ.Int>
+ (OffPtr <typ.IntPtr> [2*config.PtrSize] ptr)
+ mem))
+(Store {t} dst (SliceMake ptr len cap) mem) => // stores are chained innermost-first: ptr, then len, then cap
+ (Store {typ.Int}
+ (OffPtr <typ.IntPtr> [2*config.PtrSize] dst)
+ cap
+ (Store {typ.Int}
+ (OffPtr <typ.IntPtr> [config.PtrSize] dst)
+ len
+ (Store {t.Elem().PtrTo()} dst ptr mem)))
+
+// interface ops: an interface is an (itab, data) pair; data is stored config.PtrSize bytes after itab.
+(ITab (IMake itab _)) => itab
+(IData (IMake _ data)) => data
+
+(Load <t> ptr mem) && t.IsInterface() => // itab is loaded as typ.Uintptr, data as typ.BytePtr
+ (IMake
+ (Load <typ.Uintptr> ptr mem)
+ (Load <typ.BytePtr>
+ (OffPtr <typ.BytePtrPtr> [config.PtrSize] ptr)
+ mem))
+(Store dst (IMake itab data) mem) => // itab is stored first (inner Store), then data
+ (Store {typ.BytePtr}
+ (OffPtr <typ.BytePtrPtr> [config.PtrSize] dst)
+ data
+ (Store {typ.Uintptr} dst itab mem))
diff --git a/src/cmd/compile/internal/ssa/_gen/dec64.rules b/src/cmd/compile/internal/ssa/_gen/dec64.rules
new file mode 100644
index 0000000..ba776af
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/dec64.rules
@@ -0,0 +1,401 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file contains rules to decompose [u]int64 types on 32-bit
+// architectures. These rules work together with the decomposeBuiltIn
+// pass which handles phis of these types.
+
+(Int64Hi (Int64Make hi _)) => hi // Int64Make takes (hi, lo); selecting a half of a fresh pair yields that operand
+(Int64Lo (Int64Make _ lo)) => lo
+
+(Load <t> ptr mem) && is64BitInt(t) && !config.BigEndian && t.IsSigned() => // little-endian: lo word at ptr, hi word at ptr+4; the hi word is Int32 for signed t
+ (Int64Make
+ (Load <typ.Int32> (OffPtr <typ.Int32Ptr> [4] ptr) mem)
+ (Load <typ.UInt32> ptr mem))
+
+(Load <t> ptr mem) && is64BitInt(t) && !config.BigEndian && !t.IsSigned() => // little-endian, unsigned: both words are UInt32
+ (Int64Make
+ (Load <typ.UInt32> (OffPtr <typ.UInt32Ptr> [4] ptr) mem)
+ (Load <typ.UInt32> ptr mem))
+
+(Load <t> ptr mem) && is64BitInt(t) && config.BigEndian && t.IsSigned() => // big-endian: hi word at ptr, lo word at ptr+4; the hi word is Int32 for signed t
+ (Int64Make
+ (Load <typ.Int32> ptr mem)
+ (Load <typ.UInt32> (OffPtr <typ.UInt32Ptr> [4] ptr) mem))
+
+(Load <t> ptr mem) && is64BitInt(t) && config.BigEndian && !t.IsSigned() => // big-endian, unsigned: both words are UInt32
+ (Int64Make
+ (Load <typ.UInt32> ptr mem)
+ (Load <typ.UInt32> (OffPtr <typ.UInt32Ptr> [4] ptr) mem))
+
+(Store {t} dst (Int64Make hi lo) mem) && t.Size() == 8 && !config.BigEndian => // little-endian: lo is stored at dst first (inner Store), then hi at dst+4
+ (Store {hi.Type}
+ (OffPtr <hi.Type.PtrTo()> [4] dst)
+ hi
+ (Store {lo.Type} dst lo mem))
+
+(Store {t} dst (Int64Make hi lo) mem) && t.Size() == 8 && config.BigEndian => // big-endian: hi is stored at dst first (inner Store), then lo at dst+4
+ (Store {lo.Type}
+ (OffPtr <lo.Type.PtrTo()> [4] dst)
+ lo
+ (Store {hi.Type} dst hi mem))
+
+// These are not enabled during the "decompose builtin" pass (because of late call expansion), but they are always enabled for softFloat.
+(Arg {n} [off]) && is64BitInt(v.Type) && !config.BigEndian && v.Type.IsSigned() && !(b.Func.pass.name == "decompose builtin") => // little-endian: hi half at off+4, lo half at off
+ (Int64Make
+ (Arg <typ.Int32> {n} [off+4])
+ (Arg <typ.UInt32> {n} [off]))
+(Arg {n} [off]) && is64BitInt(v.Type) && !config.BigEndian && !v.Type.IsSigned() && !(b.Func.pass.name == "decompose builtin") =>
+ (Int64Make
+ (Arg <typ.UInt32> {n} [off+4])
+ (Arg <typ.UInt32> {n} [off]))
+
+(Arg {n} [off]) && is64BitInt(v.Type) && config.BigEndian && v.Type.IsSigned() && !(b.Func.pass.name == "decompose builtin") => // big-endian: hi half at off, lo half at off+4
+ (Int64Make
+ (Arg <typ.Int32> {n} [off])
+ (Arg <typ.UInt32> {n} [off+4]))
+(Arg {n} [off]) && is64BitInt(v.Type) && config.BigEndian && !v.Type.IsSigned() && !(b.Func.pass.name == "decompose builtin") =>
+ (Int64Make
+ (Arg <typ.UInt32> {n} [off])
+ (Arg <typ.UInt32> {n} [off+4]))
+
+(Add64 x y) => // add the lo words (Select0 = sum, Select1 = carry flags), then add the hi words together with that carry
+ (Int64Make
+ (Add32withcarry <typ.Int32>
+ (Int64Hi x)
+ (Int64Hi y)
+ (Select1 <types.TypeFlags> (Add32carry (Int64Lo x) (Int64Lo y))))
+ (Select0 <typ.UInt32> (Add32carry (Int64Lo x) (Int64Lo y))))
+
+(Sub64 x y) => // same shape as Add64, using the subtract-with-carry ops
+ (Int64Make
+ (Sub32withcarry <typ.Int32>
+ (Int64Hi x)
+ (Int64Hi y)
+ (Select1 <types.TypeFlags> (Sub32carry (Int64Lo x) (Int64Lo y))))
+ (Select0 <typ.UInt32> (Sub32carry (Int64Lo x) (Int64Lo y))))
+
+(Mul64 x y) => // hi = lo(x)*hi(y) + hi(x)*lo(y) + Select0 of the full lo(x)*lo(y) product; lo = Select1 of that product
+ (Int64Make
+ (Add32 <typ.UInt32>
+ (Mul32 <typ.UInt32> (Int64Lo x) (Int64Hi y))
+ (Add32 <typ.UInt32>
+ (Mul32 <typ.UInt32> (Int64Hi x) (Int64Lo y))
+ (Select0 <typ.UInt32> (Mul32uhilo (Int64Lo x) (Int64Lo y)))))
+ (Select1 <typ.UInt32> (Mul32uhilo (Int64Lo x) (Int64Lo y))))
+
+(And64 x y) => // bitwise ops apply to each word independently
+ (Int64Make
+ (And32 <typ.UInt32> (Int64Hi x) (Int64Hi y))
+ (And32 <typ.UInt32> (Int64Lo x) (Int64Lo y)))
+
+(Or64 x y) =>
+ (Int64Make
+ (Or32 <typ.UInt32> (Int64Hi x) (Int64Hi y))
+ (Or32 <typ.UInt32> (Int64Lo x) (Int64Lo y)))
+
+(Xor64 x y) =>
+ (Int64Make
+ (Xor32 <typ.UInt32> (Int64Hi x) (Int64Hi y))
+ (Xor32 <typ.UInt32> (Int64Lo x) (Int64Lo y)))
+
+(Neg64 <t> x) => (Sub64 (Const64 <t> [0]) x) // -x is rewritten as 0 - x
+
+(Com64 x) =>
+ (Int64Make
+ (Com32 <typ.UInt32> (Int64Hi x))
+ (Com32 <typ.UInt32> (Int64Lo x)))
+
+// Sadly, just because we know that x is non-zero,
+// we don't know whether either component is,
+// so just treat Ctz64NonZero the same as Ctz64.
+(Ctz64NonZero ...) => (Ctz64 ...)
+
+(Ctz64 x) => // Ctz32(lo) plus Ctz32(hi) masked by ^Zeromask(lo), i.e. the hi count only contributes when lo == 0
+ (Add32 <typ.UInt32>
+ (Ctz32 <typ.UInt32> (Int64Lo x))
+ (And32 <typ.UInt32>
+ (Com32 <typ.UInt32> (Zeromask (Int64Lo x)))
+ (Ctz32 <typ.UInt32> (Int64Hi x))))
+
+(BitLen64 x) => // BitLen32(hi) + BitLen32(lo | Zeromask(hi)); when hi != 0 the second operand is all ones
+ (Add32 <typ.Int>
+ (BitLen32 <typ.Int> (Int64Hi x))
+ (BitLen32 <typ.Int>
+ (Or32 <typ.UInt32>
+ (Int64Lo x)
+ (Zeromask (Int64Hi x)))))
+
+(Bswap64 x) => // the two words trade places and each is byte-swapped
+ (Int64Make
+ (Bswap32 <typ.UInt32> (Int64Lo x))
+ (Bswap32 <typ.UInt32> (Int64Hi x)))
+
+(SignExt32to64 x) => (Int64Make (Signmask x) x) // hi word is Signmask of x, lo word is x itself
+(SignExt16to64 x) => (SignExt32to64 (SignExt16to32 x))
+(SignExt8to64 x) => (SignExt32to64 (SignExt8to32 x))
+
+(ZeroExt32to64 x) => (Int64Make (Const32 <typ.UInt32> [0]) x) // hi word is the constant 0
+(ZeroExt16to64 x) => (ZeroExt32to64 (ZeroExt16to32 x))
+(ZeroExt8to64 x) => (ZeroExt32to64 (ZeroExt8to32 x))
+
+(Trunc64to32 (Int64Make _ lo)) => lo
+(Trunc64to16 (Int64Make _ lo)) => (Trunc32to16 lo)
+(Trunc64to8 (Int64Make _ lo)) => (Trunc32to8 lo)
+// Most general: the operand is not an Int64Make, so take its low word explicitly.
+(Trunc64to32 x) => (Int64Lo x)
+(Trunc64to16 x) => (Trunc32to16 (Int64Lo x))
+(Trunc64to8 x) => (Trunc32to8 (Int64Lo x))
+
+(Lsh32x64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const32 [0]) // hi word of the shift count is a nonzero constant: the count is >= 2^32, so every bit is shifted out
+(Rsh32x64 x (Int64Make (Const32 [c]) _)) && c != 0 => (Signmask x) // signed right shifts produce Signmask of the operand instead of 0
+(Rsh32Ux64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const32 [0])
+(Lsh16x64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const32 [0])
+(Rsh16x64 x (Int64Make (Const32 [c]) _)) && c != 0 => (Signmask (SignExt16to32 x))
+(Rsh16Ux64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const32 [0])
+(Lsh8x64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const32 [0])
+(Rsh8x64 x (Int64Make (Const32 [c]) _)) && c != 0 => (Signmask (SignExt8to32 x))
+(Rsh8Ux64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const32 [0])
+
+(Lsh32x64 [c] x (Int64Make (Const32 [0]) lo)) => (Lsh32x32 [c] x lo) // hi word of the shift count is constant 0: only lo matters, so use the x32 form (auxint c is carried over)
+(Rsh32x64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh32x32 [c] x lo)
+(Rsh32Ux64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh32Ux32 [c] x lo)
+(Lsh16x64 [c] x (Int64Make (Const32 [0]) lo)) => (Lsh16x32 [c] x lo)
+(Rsh16x64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh16x32 [c] x lo)
+(Rsh16Ux64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh16Ux32 [c] x lo)
+(Lsh8x64 [c] x (Int64Make (Const32 [0]) lo)) => (Lsh8x32 [c] x lo)
+(Rsh8x64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh8x32 [c] x lo)
+(Rsh8Ux64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh8Ux32 [c] x lo)
+
+(Lsh64x64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const64 [0]) // same two cases for 64-bit operands
+(Rsh64x64 x (Int64Make (Const32 [c]) _)) && c != 0 => (Int64Make (Signmask (Int64Hi x)) (Signmask (Int64Hi x)))
+(Rsh64Ux64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const64 [0])
+
+(Lsh64x64 [c] x (Int64Make (Const32 [0]) lo)) => (Lsh64x32 [c] x lo)
+(Rsh64x64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh64x32 [c] x lo)
+(Rsh64Ux64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh64Ux32 [c] x lo)
+
+// Turn x64 shifts whose count has a non-constant high word into x32 shifts.
+// If the high 32 bits of the shift count are nonzero, Zeromask makes the count all ones (a huge shift).
+(Lsh64x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Lsh64x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+(Rsh64x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Rsh64x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+(Rsh64Ux64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Rsh64Ux32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+(Lsh32x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Lsh32x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+(Rsh32x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Rsh32x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+(Rsh32Ux64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Rsh32Ux32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+(Lsh16x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Lsh16x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+(Rsh16x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Rsh16x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+(Rsh16Ux64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Rsh16Ux32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+(Lsh8x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Lsh8x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+(Rsh8x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Rsh8x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+(Rsh8Ux64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Rsh8Ux32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+
+// Most general: the shift count y is not matched as an Int64Make, so its halves are extracted with Int64Hi/Int64Lo.
+(Lsh64x64 x y) => (Lsh64x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh64x64 x y) => (Rsh64x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh64Ux64 x y) => (Rsh64Ux32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Lsh32x64 x y) => (Lsh32x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh32x64 x y) => (Rsh32x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh32Ux64 x y) => (Rsh32Ux32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Lsh16x64 x y) => (Lsh16x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh16x64 x y) => (Rsh16x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh16Ux64 x y) => (Rsh16Ux32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Lsh8x64 x y) => (Lsh8x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh8x64 x y) => (Rsh8x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh8Ux64 x y) => (Rsh8Ux32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+
+(RotateLeft64 x (Int64Make hi lo)) => (RotateLeft64 x lo) // only the lo word of the rotate count is kept; hi is dropped
+(RotateLeft32 x (Int64Make hi lo)) => (RotateLeft32 x lo)
+(RotateLeft16 x (Int64Make hi lo)) => (RotateLeft16 x lo)
+(RotateLeft8 x (Int64Make hi lo)) => (RotateLeft8 x lo)
+
+// Clean up constants a little: Zeromask of a constant folds to 0 (c == 0) or to all ones (c != 0), so the Or32 disappears.
+(Or32 <typ.UInt32> (Zeromask (Const32 [c])) y) && c == 0 => y
+(Or32 <typ.UInt32> (Zeromask (Const32 [c])) y) && c != 0 => (Const32 <typ.UInt32> [-1])
+
+// 64x left shift (the x16 and x8 variants below differ only in the width of the shift count s)
+// result.hi = hi<<s | lo>>(32-s) | lo<<(s-32) // >> is unsigned, large shifts result in 0
+// result.lo = lo<<s
+(Lsh64x32 x s) =>
+ (Int64Make
+ (Or32 <typ.UInt32>
+ (Or32 <typ.UInt32>
+ (Lsh32x32 <typ.UInt32> (Int64Hi x) s)
+ (Rsh32Ux32 <typ.UInt32>
+ (Int64Lo x)
+ (Sub32 <typ.UInt32> (Const32 <typ.UInt32> [32]) s)))
+ (Lsh32x32 <typ.UInt32>
+ (Int64Lo x)
+ (Sub32 <typ.UInt32> s (Const32 <typ.UInt32> [32]))))
+ (Lsh32x32 <typ.UInt32> (Int64Lo x) s))
+(Lsh64x16 x s) =>
+ (Int64Make
+ (Or32 <typ.UInt32>
+ (Or32 <typ.UInt32>
+ (Lsh32x16 <typ.UInt32> (Int64Hi x) s)
+ (Rsh32Ux16 <typ.UInt32>
+ (Int64Lo x)
+ (Sub16 <typ.UInt16> (Const16 <typ.UInt16> [32]) s)))
+ (Lsh32x16 <typ.UInt32>
+ (Int64Lo x)
+ (Sub16 <typ.UInt16> s (Const16 <typ.UInt16> [32]))))
+ (Lsh32x16 <typ.UInt32> (Int64Lo x) s))
+(Lsh64x8 x s) =>
+ (Int64Make
+ (Or32 <typ.UInt32>
+ (Or32 <typ.UInt32>
+ (Lsh32x8 <typ.UInt32> (Int64Hi x) s)
+ (Rsh32Ux8 <typ.UInt32>
+ (Int64Lo x)
+ (Sub8 <typ.UInt8> (Const8 <typ.UInt8> [32]) s)))
+ (Lsh32x8 <typ.UInt32>
+ (Int64Lo x)
+ (Sub8 <typ.UInt8> s (Const8 <typ.UInt8> [32]))))
+ (Lsh32x8 <typ.UInt32> (Int64Lo x) s))
+
+// 64x unsigned right shift
+// result.hi = hi>>s
+// result.lo = lo>>s | hi<<(32-s) | hi>>(s-32) // >> is unsigned; large shifts result in 0
+(Rsh64Ux32 x s) =>
+ (Int64Make
+ (Rsh32Ux32 <typ.UInt32> (Int64Hi x) s)
+ (Or32 <typ.UInt32>
+ (Or32 <typ.UInt32>
+ (Rsh32Ux32 <typ.UInt32> (Int64Lo x) s)
+ (Lsh32x32 <typ.UInt32>
+ (Int64Hi x)
+ (Sub32 <typ.UInt32> (Const32 <typ.UInt32> [32]) s)))
+ (Rsh32Ux32 <typ.UInt32>
+ (Int64Hi x)
+ (Sub32 <typ.UInt32> s (Const32 <typ.UInt32> [32])))))
+(Rsh64Ux16 x s) =>
+ (Int64Make
+ (Rsh32Ux16 <typ.UInt32> (Int64Hi x) s)
+ (Or32 <typ.UInt32>
+ (Or32 <typ.UInt32>
+ (Rsh32Ux16 <typ.UInt32> (Int64Lo x) s)
+ (Lsh32x16 <typ.UInt32>
+ (Int64Hi x)
+ (Sub16 <typ.UInt16> (Const16 <typ.UInt16> [32]) s)))
+ (Rsh32Ux16 <typ.UInt32>
+ (Int64Hi x)
+ (Sub16 <typ.UInt16> s (Const16 <typ.UInt16> [32])))))
+(Rsh64Ux8 x s) =>
+ (Int64Make
+ (Rsh32Ux8 <typ.UInt32> (Int64Hi x) s)
+ (Or32 <typ.UInt32>
+ (Or32 <typ.UInt32>
+ (Rsh32Ux8 <typ.UInt32> (Int64Lo x) s)
+ (Lsh32x8 <typ.UInt32>
+ (Int64Hi x)
+ (Sub8 <typ.UInt8> (Const8 <typ.UInt8> [32]) s)))
+ (Rsh32Ux8 <typ.UInt32>
+ (Int64Hi x)
+ (Sub8 <typ.UInt8> s (Const8 <typ.UInt8> [32])))))
+
+// 64x signed right shift
+// result.hi = hi>>s
+// result.lo = lo>>s | hi<<(32-s) | (hi>>(s-32))&zeromask(s>>5) // hi>>(s-32) is signed; large shifts result in 0/-1
+(Rsh64x32 x s) =>
+ (Int64Make
+ (Rsh32x32 <typ.UInt32> (Int64Hi x) s)
+ (Or32 <typ.UInt32>
+ (Or32 <typ.UInt32>
+ (Rsh32Ux32 <typ.UInt32> (Int64Lo x) s)
+ (Lsh32x32 <typ.UInt32>
+ (Int64Hi x)
+ (Sub32 <typ.UInt32> (Const32 <typ.UInt32> [32]) s)))
+ (And32 <typ.UInt32>
+ (Rsh32x32 <typ.UInt32>
+ (Int64Hi x)
+ (Sub32 <typ.UInt32> s (Const32 <typ.UInt32> [32])))
+ (Zeromask
+ (Rsh32Ux32 <typ.UInt32> s (Const32 <typ.UInt32> [5]))))))
+(Rsh64x16 x s) =>
+ (Int64Make
+ (Rsh32x16 <typ.UInt32> (Int64Hi x) s)
+ (Or32 <typ.UInt32>
+ (Or32 <typ.UInt32>
+ (Rsh32Ux16 <typ.UInt32> (Int64Lo x) s)
+ (Lsh32x16 <typ.UInt32>
+ (Int64Hi x)
+ (Sub16 <typ.UInt16> (Const16 <typ.UInt16> [32]) s)))
+ (And32 <typ.UInt32>
+ (Rsh32x16 <typ.UInt32>
+ (Int64Hi x)
+ (Sub16 <typ.UInt16> s (Const16 <typ.UInt16> [32])))
+ (Zeromask
+ (ZeroExt16to32
+ (Rsh16Ux32 <typ.UInt16> s (Const32 <typ.UInt32> [5])))))))
+(Rsh64x8 x s) =>
+ (Int64Make
+ (Rsh32x8 <typ.UInt32> (Int64Hi x) s)
+ (Or32 <typ.UInt32>
+ (Or32 <typ.UInt32>
+ (Rsh32Ux8 <typ.UInt32> (Int64Lo x) s)
+ (Lsh32x8 <typ.UInt32>
+ (Int64Hi x)
+ (Sub8 <typ.UInt8> (Const8 <typ.UInt8> [32]) s)))
+ (And32 <typ.UInt32>
+ (Rsh32x8 <typ.UInt32>
+ (Int64Hi x)
+ (Sub8 <typ.UInt8> s (Const8 <typ.UInt8> [32])))
+ (Zeromask
+ (ZeroExt8to32
+ (Rsh8Ux32 <typ.UInt8> s (Const32 <typ.UInt32> [5])))))))
+
+(Const64 <t> [c]) && t.IsSigned() =>
+ (Int64Make (Const32 <typ.Int32> [int32(c>>32)]) (Const32 <typ.UInt32> [int32(c)]))
+(Const64 <t> [c]) && !t.IsSigned() =>
+ (Int64Make (Const32 <typ.UInt32> [int32(c>>32)]) (Const32 <typ.UInt32> [int32(c)]))
+
+(Eq64 x y) =>
+ (AndB
+ (Eq32 (Int64Hi x) (Int64Hi y))
+ (Eq32 (Int64Lo x) (Int64Lo y)))
+
+(Neq64 x y) =>
+ (OrB
+ (Neq32 (Int64Hi x) (Int64Hi y))
+ (Neq32 (Int64Lo x) (Int64Lo y)))
+
+(Less64U x y) =>
+ (OrB
+ (Less32U (Int64Hi x) (Int64Hi y))
+ (AndB
+ (Eq32 (Int64Hi x) (Int64Hi y))
+ (Less32U (Int64Lo x) (Int64Lo y))))
+
+(Leq64U x y) =>
+ (OrB
+ (Less32U (Int64Hi x) (Int64Hi y))
+ (AndB
+ (Eq32 (Int64Hi x) (Int64Hi y))
+ (Leq32U (Int64Lo x) (Int64Lo y))))
+
+(Less64 x y) =>
+ (OrB
+ (Less32 (Int64Hi x) (Int64Hi y))
+ (AndB
+ (Eq32 (Int64Hi x) (Int64Hi y))
+ (Less32U (Int64Lo x) (Int64Lo y))))
+
+(Leq64 x y) =>
+ (OrB
+ (Less32 (Int64Hi x) (Int64Hi y))
+ (AndB
+ (Eq32 (Int64Hi x) (Int64Hi y))
+ (Leq32U (Int64Lo x) (Int64Lo y))))
diff --git a/src/cmd/compile/internal/ssa/_gen/dec64Ops.go b/src/cmd/compile/internal/ssa/_gen/dec64Ops.go
new file mode 100644
index 0000000..bba218e
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/dec64Ops.go
@@ -0,0 +1,18 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+// dec64 has no ops or block kinds of its own; both tables are empty.
+var (
+	dec64Ops    = []opData{}
+	dec64Blocks = []blockData{}
+)
+
+// init appends the "dec64" entry, marked generic, to archs.
+func init() {
+	entry := arch{name: "dec64", generic: true}
+	entry.ops, entry.blocks = dec64Ops, dec64Blocks
+	archs = append(archs, entry)
+}
diff --git a/src/cmd/compile/internal/ssa/_gen/decOps.go b/src/cmd/compile/internal/ssa/_gen/decOps.go
new file mode 100644
index 0000000..0cc11cb
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/decOps.go
@@ -0,0 +1,18 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+// dec has no ops or block kinds of its own; both tables are empty.
+var (
+	decOps    = []opData{}
+	decBlocks = []blockData{}
+)
+
+// init appends the "dec" entry, marked generic, to archs.
+func init() {
+	entry := arch{name: "dec", generic: true}
+	entry.ops, entry.blocks = decOps, decBlocks
+	archs = append(archs, entry)
+}
diff --git a/src/cmd/compile/internal/ssa/_gen/generic.rules b/src/cmd/compile/internal/ssa/_gen/generic.rules
new file mode 100644
index 0000000..0406fbb
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/generic.rules
@@ -0,0 +1,2672 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Simplifications that apply to all backend architectures. As an example, this
+// Go source code
+//
+// y := 0 * x
+//
+// can be translated into y := 0 without losing any information, which saves a
+// pointless multiplication instruction. Other .rules files in this directory
+// (for example AMD64.rules) contain rules specific to the architecture in the
+// filename. The rules here apply to every architecture.
+//
+// The code for parsing this file lives in rulegen.go; this file generates
+// ssa/rewritegeneric.go.
+
+// values are specified using the following format:
+// (op <type> [auxint] {aux} arg0 arg1 ...)
+// the type, aux, and auxint fields are optional
+// on the matching side
+// - the type, aux, and auxint fields must match if they are specified.
+// - the first occurrence of a variable defines that variable. Subsequent
+// uses must match (be == to) the first use.
+// - v is defined to be the value matched.
+// - an additional conditional can be provided after the match pattern with "&&".
+// on the generated side
+// - the type of the top-level expression is the same as the one on the left-hand side.
+// - the type of any subexpressions must be specified explicitly (or
+// be specified in the op's type field).
+// - auxint will be 0 if not specified.
+// - aux will be nil if not specified.
+
+// blocks are specified using the following format:
+// (kind controlvalue succ0 succ1 ...)
+// controlvalue must be "nil" or a value expression
+// succ* fields must be variables
+// For now, the generated successors must be a permutation of the matched successors.
+
+// constant folding
+(Trunc16to8 (Const16 [c])) => (Const8 [int8(c)])
+(Trunc32to8 (Const32 [c])) => (Const8 [int8(c)])
+(Trunc32to16 (Const32 [c])) => (Const16 [int16(c)])
+(Trunc64to8 (Const64 [c])) => (Const8 [int8(c)])
+(Trunc64to16 (Const64 [c])) => (Const16 [int16(c)])
+(Trunc64to32 (Const64 [c])) => (Const32 [int32(c)])
+(Cvt64Fto32F (Const64F [c])) => (Const32F [float32(c)])
+(Cvt32Fto64F (Const32F [c])) => (Const64F [float64(c)])
+(Cvt32to32F (Const32 [c])) => (Const32F [float32(c)])
+(Cvt32to64F (Const32 [c])) => (Const64F [float64(c)])
+(Cvt64to32F (Const64 [c])) => (Const32F [float32(c)])
+(Cvt64to64F (Const64 [c])) => (Const64F [float64(c)])
+(Cvt32Fto32 (Const32F [c])) => (Const32 [int32(c)])
+(Cvt32Fto64 (Const32F [c])) => (Const64 [int64(c)])
+(Cvt64Fto32 (Const64F [c])) => (Const32 [int32(c)])
+(Cvt64Fto64 (Const64F [c])) => (Const64 [int64(c)])
+(Round32F x:(Const32F)) => x
+(Round64F x:(Const64F)) => x
+(CvtBoolToUint8 (ConstBool [false])) => (Const8 [0])
+(CvtBoolToUint8 (ConstBool [true])) => (Const8 [1])
+
+(Trunc16to8 (ZeroExt8to16 x)) => x
+(Trunc32to8 (ZeroExt8to32 x)) => x
+(Trunc32to16 (ZeroExt8to32 x)) => (ZeroExt8to16 x)
+(Trunc32to16 (ZeroExt16to32 x)) => x
+(Trunc64to8 (ZeroExt8to64 x)) => x
+(Trunc64to16 (ZeroExt8to64 x)) => (ZeroExt8to16 x)
+(Trunc64to16 (ZeroExt16to64 x)) => x
+(Trunc64to32 (ZeroExt8to64 x)) => (ZeroExt8to32 x)
+(Trunc64to32 (ZeroExt16to64 x)) => (ZeroExt16to32 x)
+(Trunc64to32 (ZeroExt32to64 x)) => x
+(Trunc16to8 (SignExt8to16 x)) => x
+(Trunc32to8 (SignExt8to32 x)) => x
+(Trunc32to16 (SignExt8to32 x)) => (SignExt8to16 x)
+(Trunc32to16 (SignExt16to32 x)) => x
+(Trunc64to8 (SignExt8to64 x)) => x
+(Trunc64to16 (SignExt8to64 x)) => (SignExt8to16 x)
+(Trunc64to16 (SignExt16to64 x)) => x
+(Trunc64to32 (SignExt8to64 x)) => (SignExt8to32 x)
+(Trunc64to32 (SignExt16to64 x)) => (SignExt16to32 x)
+(Trunc64to32 (SignExt32to64 x)) => x
+
+(ZeroExt8to16 (Const8 [c])) => (Const16 [int16( uint8(c))])
+(ZeroExt8to32 (Const8 [c])) => (Const32 [int32( uint8(c))])
+(ZeroExt8to64 (Const8 [c])) => (Const64 [int64( uint8(c))])
+(ZeroExt16to32 (Const16 [c])) => (Const32 [int32(uint16(c))])
+(ZeroExt16to64 (Const16 [c])) => (Const64 [int64(uint16(c))])
+(ZeroExt32to64 (Const32 [c])) => (Const64 [int64(uint32(c))])
+(SignExt8to16 (Const8 [c])) => (Const16 [int16(c)])
+(SignExt8to32 (Const8 [c])) => (Const32 [int32(c)])
+(SignExt8to64 (Const8 [c])) => (Const64 [int64(c)])
+(SignExt16to32 (Const16 [c])) => (Const32 [int32(c)])
+(SignExt16to64 (Const16 [c])) => (Const64 [int64(c)])
+(SignExt32to64 (Const32 [c])) => (Const64 [int64(c)])
+
+(Neg8 (Const8 [c])) => (Const8 [-c])
+(Neg16 (Const16 [c])) => (Const16 [-c])
+(Neg32 (Const32 [c])) => (Const32 [-c])
+(Neg64 (Const64 [c])) => (Const64 [-c])
+(Neg32F (Const32F [c])) && c != 0 => (Const32F [-c])
+(Neg64F (Const64F [c])) && c != 0 => (Const64F [-c])
+
+(Add8 (Const8 [c]) (Const8 [d])) => (Const8 [c+d])
+(Add16 (Const16 [c]) (Const16 [d])) => (Const16 [c+d])
+(Add32 (Const32 [c]) (Const32 [d])) => (Const32 [c+d])
+(Add64 (Const64 [c]) (Const64 [d])) => (Const64 [c+d])
+(Add32F (Const32F [c]) (Const32F [d])) && c+d == c+d => (Const32F [c+d])
+(Add64F (Const64F [c]) (Const64F [d])) && c+d == c+d => (Const64F [c+d])
+(AddPtr <t> x (Const64 [c])) => (OffPtr <t> x [c])
+(AddPtr <t> x (Const32 [c])) => (OffPtr <t> x [int64(c)])
+
+(Sub8 (Const8 [c]) (Const8 [d])) => (Const8 [c-d])
+(Sub16 (Const16 [c]) (Const16 [d])) => (Const16 [c-d])
+(Sub32 (Const32 [c]) (Const32 [d])) => (Const32 [c-d])
+(Sub64 (Const64 [c]) (Const64 [d])) => (Const64 [c-d])
+(Sub32F (Const32F [c]) (Const32F [d])) && c-d == c-d => (Const32F [c-d])
+(Sub64F (Const64F [c]) (Const64F [d])) && c-d == c-d => (Const64F [c-d])
+
+(Mul8 (Const8 [c]) (Const8 [d])) => (Const8 [c*d])
+(Mul16 (Const16 [c]) (Const16 [d])) => (Const16 [c*d])
+(Mul32 (Const32 [c]) (Const32 [d])) => (Const32 [c*d])
+(Mul64 (Const64 [c]) (Const64 [d])) => (Const64 [c*d])
+(Mul32F (Const32F [c]) (Const32F [d])) && c*d == c*d => (Const32F [c*d])
+(Mul64F (Const64F [c]) (Const64F [d])) && c*d == c*d => (Const64F [c*d])
+
+(And8 (Const8 [c]) (Const8 [d])) => (Const8 [c&d])
+(And16 (Const16 [c]) (Const16 [d])) => (Const16 [c&d])
+(And32 (Const32 [c]) (Const32 [d])) => (Const32 [c&d])
+(And64 (Const64 [c]) (Const64 [d])) => (Const64 [c&d])
+
+(Or8 (Const8 [c]) (Const8 [d])) => (Const8 [c|d])
+(Or16 (Const16 [c]) (Const16 [d])) => (Const16 [c|d])
+(Or32 (Const32 [c]) (Const32 [d])) => (Const32 [c|d])
+(Or64 (Const64 [c]) (Const64 [d])) => (Const64 [c|d])
+
+(Xor8 (Const8 [c]) (Const8 [d])) => (Const8 [c^d])
+(Xor16 (Const16 [c]) (Const16 [d])) => (Const16 [c^d])
+(Xor32 (Const32 [c]) (Const32 [d])) => (Const32 [c^d])
+(Xor64 (Const64 [c]) (Const64 [d])) => (Const64 [c^d])
+
+(Ctz64 (Const64 [c])) && config.PtrSize == 4 => (Const32 [int32(ntz64(c))])
+(Ctz32 (Const32 [c])) && config.PtrSize == 4 => (Const32 [int32(ntz32(c))])
+(Ctz16 (Const16 [c])) && config.PtrSize == 4 => (Const32 [int32(ntz16(c))])
+(Ctz8 (Const8 [c])) && config.PtrSize == 4 => (Const32 [int32(ntz8(c))])
+
+(Ctz64 (Const64 [c])) && config.PtrSize == 8 => (Const64 [int64(ntz64(c))])
+(Ctz32 (Const32 [c])) && config.PtrSize == 8 => (Const64 [int64(ntz32(c))])
+(Ctz16 (Const16 [c])) && config.PtrSize == 8 => (Const64 [int64(ntz16(c))])
+(Ctz8 (Const8 [c])) && config.PtrSize == 8 => (Const64 [int64(ntz8(c))])
+
+(Div8 (Const8 [c]) (Const8 [d])) && d != 0 => (Const8 [c/d])
+(Div16 (Const16 [c]) (Const16 [d])) && d != 0 => (Const16 [c/d])
+(Div32 (Const32 [c]) (Const32 [d])) && d != 0 => (Const32 [c/d])
+(Div64 (Const64 [c]) (Const64 [d])) && d != 0 => (Const64 [c/d])
+(Div8u (Const8 [c]) (Const8 [d])) && d != 0 => (Const8 [int8(uint8(c)/uint8(d))])
+(Div16u (Const16 [c]) (Const16 [d])) && d != 0 => (Const16 [int16(uint16(c)/uint16(d))])
+(Div32u (Const32 [c]) (Const32 [d])) && d != 0 => (Const32 [int32(uint32(c)/uint32(d))])
+(Div64u (Const64 [c]) (Const64 [d])) && d != 0 => (Const64 [int64(uint64(c)/uint64(d))])
+(Div32F (Const32F [c]) (Const32F [d])) && c/d == c/d => (Const32F [c/d])
+(Div64F (Const64F [c]) (Const64F [d])) && c/d == c/d => (Const64F [c/d])
+(Select0 (Div128u (Const64 [0]) lo y)) => (Div64u lo y)
+(Select1 (Div128u (Const64 [0]) lo y)) => (Mod64u lo y)
+
+(Not (ConstBool [c])) => (ConstBool [!c])
+
+(Floor (Const64F [c])) => (Const64F [math.Floor(c)])
+(Ceil (Const64F [c])) => (Const64F [math.Ceil(c)])
+(Trunc (Const64F [c])) => (Const64F [math.Trunc(c)])
+(RoundToEven (Const64F [c])) => (Const64F [math.RoundToEven(c)])
+
+// Convert x * 1 to x.
+(Mul(8|16|32|64) (Const(8|16|32|64) [1]) x) => x
+(Select0 (Mul(32|64)uover (Const(32|64) [1]) x)) => x
+(Select1 (Mul(32|64)uover (Const(32|64) [1]) x)) => (ConstBool [false])
+
+// Convert x * -1 to -x.
+(Mul(8|16|32|64) (Const(8|16|32|64) [-1]) x) => (Neg(8|16|32|64) x)
+
+// Convert multiplication by a power of two to a shift.
+(Mul8 <t> n (Const8 [c])) && isPowerOfTwo8(c) => (Lsh8x64 <t> n (Const64 <typ.UInt64> [log8(c)]))
+(Mul16 <t> n (Const16 [c])) && isPowerOfTwo16(c) => (Lsh16x64 <t> n (Const64 <typ.UInt64> [log16(c)]))
+(Mul32 <t> n (Const32 [c])) && isPowerOfTwo32(c) => (Lsh32x64 <t> n (Const64 <typ.UInt64> [log32(c)]))
+(Mul64 <t> n (Const64 [c])) && isPowerOfTwo64(c) => (Lsh64x64 <t> n (Const64 <typ.UInt64> [log64(c)]))
+(Mul8 <t> n (Const8 [c])) && t.IsSigned() && isPowerOfTwo8(-c) => (Neg8 (Lsh8x64 <t> n (Const64 <typ.UInt64> [log8(-c)])))
+(Mul16 <t> n (Const16 [c])) && t.IsSigned() && isPowerOfTwo16(-c) => (Neg16 (Lsh16x64 <t> n (Const64 <typ.UInt64> [log16(-c)])))
+(Mul32 <t> n (Const32 [c])) && t.IsSigned() && isPowerOfTwo32(-c) => (Neg32 (Lsh32x64 <t> n (Const64 <typ.UInt64> [log32(-c)])))
+(Mul64 <t> n (Const64 [c])) && t.IsSigned() && isPowerOfTwo64(-c) => (Neg64 (Lsh64x64 <t> n (Const64 <typ.UInt64> [log64(-c)])))
+
+(Mod8 (Const8 [c]) (Const8 [d])) && d != 0 => (Const8 [c % d])
+(Mod16 (Const16 [c]) (Const16 [d])) && d != 0 => (Const16 [c % d])
+(Mod32 (Const32 [c]) (Const32 [d])) && d != 0 => (Const32 [c % d])
+(Mod64 (Const64 [c]) (Const64 [d])) && d != 0 => (Const64 [c % d])
+
+(Mod8u (Const8 [c]) (Const8 [d])) && d != 0 => (Const8 [int8(uint8(c) % uint8(d))])
+(Mod16u (Const16 [c]) (Const16 [d])) && d != 0 => (Const16 [int16(uint16(c) % uint16(d))])
+(Mod32u (Const32 [c]) (Const32 [d])) && d != 0 => (Const32 [int32(uint32(c) % uint32(d))])
+(Mod64u (Const64 [c]) (Const64 [d])) && d != 0 => (Const64 [int64(uint64(c) % uint64(d))])
+
+(Lsh64x64 (Const64 [c]) (Const64 [d])) => (Const64 [c << uint64(d)])
+(Rsh64x64 (Const64 [c]) (Const64 [d])) => (Const64 [c >> uint64(d)])
+(Rsh64Ux64 (Const64 [c]) (Const64 [d])) => (Const64 [int64(uint64(c) >> uint64(d))])
+(Lsh32x64 (Const32 [c]) (Const64 [d])) => (Const32 [c << uint64(d)])
+(Rsh32x64 (Const32 [c]) (Const64 [d])) => (Const32 [c >> uint64(d)])
+(Rsh32Ux64 (Const32 [c]) (Const64 [d])) => (Const32 [int32(uint32(c) >> uint64(d))])
+(Lsh16x64 (Const16 [c]) (Const64 [d])) => (Const16 [c << uint64(d)])
+(Rsh16x64 (Const16 [c]) (Const64 [d])) => (Const16 [c >> uint64(d)])
+(Rsh16Ux64 (Const16 [c]) (Const64 [d])) => (Const16 [int16(uint16(c) >> uint64(d))])
+(Lsh8x64 (Const8 [c]) (Const64 [d])) => (Const8 [c << uint64(d)])
+(Rsh8x64 (Const8 [c]) (Const64 [d])) => (Const8 [c >> uint64(d)])
+(Rsh8Ux64 (Const8 [c]) (Const64 [d])) => (Const8 [int8(uint8(c) >> uint64(d))])
+
+// Fold IsInBounds when the range of the index cannot exceed the limit.
+(IsInBounds (ZeroExt8to32 _) (Const32 [c])) && (1 << 8) <= c => (ConstBool [true])
+(IsInBounds (ZeroExt8to64 _) (Const64 [c])) && (1 << 8) <= c => (ConstBool [true])
+(IsInBounds (ZeroExt16to32 _) (Const32 [c])) && (1 << 16) <= c => (ConstBool [true])
+(IsInBounds (ZeroExt16to64 _) (Const64 [c])) && (1 << 16) <= c => (ConstBool [true])
+(IsInBounds x x) => (ConstBool [false])
+(IsInBounds (And8 (Const8 [c]) _) (Const8 [d])) && 0 <= c && c < d => (ConstBool [true])
+(IsInBounds (ZeroExt8to16 (And8 (Const8 [c]) _)) (Const16 [d])) && 0 <= c && int16(c) < d => (ConstBool [true])
+(IsInBounds (ZeroExt8to32 (And8 (Const8 [c]) _)) (Const32 [d])) && 0 <= c && int32(c) < d => (ConstBool [true])
+(IsInBounds (ZeroExt8to64 (And8 (Const8 [c]) _)) (Const64 [d])) && 0 <= c && int64(c) < d => (ConstBool [true])
+(IsInBounds (And16 (Const16 [c]) _) (Const16 [d])) && 0 <= c && c < d => (ConstBool [true])
+(IsInBounds (ZeroExt16to32 (And16 (Const16 [c]) _)) (Const32 [d])) && 0 <= c && int32(c) < d => (ConstBool [true])
+(IsInBounds (ZeroExt16to64 (And16 (Const16 [c]) _)) (Const64 [d])) && 0 <= c && int64(c) < d => (ConstBool [true])
+(IsInBounds (And32 (Const32 [c]) _) (Const32 [d])) && 0 <= c && c < d => (ConstBool [true])
+(IsInBounds (ZeroExt32to64 (And32 (Const32 [c]) _)) (Const64 [d])) && 0 <= c && int64(c) < d => (ConstBool [true])
+(IsInBounds (And64 (Const64 [c]) _) (Const64 [d])) && 0 <= c && c < d => (ConstBool [true])
+(IsInBounds (Const32 [c]) (Const32 [d])) => (ConstBool [0 <= c && c < d])
+(IsInBounds (Const64 [c]) (Const64 [d])) => (ConstBool [0 <= c && c < d])
+// (Mod(32|64)u x y) is always between 0 (inclusive) and y (exclusive).
+(IsInBounds (Mod32u _ y) y) => (ConstBool [true])
+(IsInBounds (Mod64u _ y) y) => (ConstBool [true])
+// Right shifting an unsigned w-bit number by c leaves a value below 1<<(w-c), so it is in bounds for any limit d > 1<<(w-c)-1.
+(IsInBounds (ZeroExt8to64 (Rsh8Ux64 _ (Const64 [c]))) (Const64 [d])) && 0 < c && c < 8 && 1<<uint( 8-c)-1 < d => (ConstBool [true])
+(IsInBounds (ZeroExt8to32 (Rsh8Ux64 _ (Const64 [c]))) (Const32 [d])) && 0 < c && c < 8 && 1<<uint( 8-c)-1 < d => (ConstBool [true])
+(IsInBounds (ZeroExt8to16 (Rsh8Ux64 _ (Const64 [c]))) (Const16 [d])) && 0 < c && c < 8 && 1<<uint( 8-c)-1 < d => (ConstBool [true])
+(IsInBounds (Rsh8Ux64 _ (Const64 [c])) (Const64 [d])) && 0 < c && c < 8 && 1<<uint( 8-c)-1 < d => (ConstBool [true])
+(IsInBounds (ZeroExt16to64 (Rsh16Ux64 _ (Const64 [c]))) (Const64 [d])) && 0 < c && c < 16 && 1<<uint(16-c)-1 < d => (ConstBool [true])
+(IsInBounds (ZeroExt16to32 (Rsh16Ux64 _ (Const64 [c]))) (Const32 [d])) && 0 < c && c < 16 && 1<<uint(16-c)-1 < d => (ConstBool [true])
+(IsInBounds (Rsh16Ux64 _ (Const64 [c])) (Const64 [d])) && 0 < c && c < 16 && 1<<uint(16-c)-1 < d => (ConstBool [true])
+(IsInBounds (ZeroExt32to64 (Rsh32Ux64 _ (Const64 [c]))) (Const64 [d])) && 0 < c && c < 32 && 1<<uint(32-c)-1 < d => (ConstBool [true])
+(IsInBounds (Rsh32Ux64 _ (Const64 [c])) (Const64 [d])) && 0 < c && c < 32 && 1<<uint(32-c)-1 < d => (ConstBool [true])
+(IsInBounds (Rsh64Ux64 _ (Const64 [c])) (Const64 [d])) && 0 < c && c < 64 && 1<<uint(64-c)-1 < d => (ConstBool [true])
+
+(IsSliceInBounds x x) => (ConstBool [true])
+(IsSliceInBounds (And32 (Const32 [c]) _) (Const32 [d])) && 0 <= c && c <= d => (ConstBool [true])
+(IsSliceInBounds (And64 (Const64 [c]) _) (Const64 [d])) && 0 <= c && c <= d => (ConstBool [true])
+(IsSliceInBounds (Const32 [0]) _) => (ConstBool [true])
+(IsSliceInBounds (Const64 [0]) _) => (ConstBool [true])
+(IsSliceInBounds (Const32 [c]) (Const32 [d])) => (ConstBool [0 <= c && c <= d])
+(IsSliceInBounds (Const64 [c]) (Const64 [d])) => (ConstBool [0 <= c && c <= d])
+(IsSliceInBounds (SliceLen x) (SliceCap x)) => (ConstBool [true])
+
+(Eq(64|32|16|8) x x) => (ConstBool [true])
+(EqB (ConstBool [c]) (ConstBool [d])) => (ConstBool [c == d])
+(EqB (ConstBool [false]) x) => (Not x)
+(EqB (ConstBool [true]) x) => x
+
+(Neq(64|32|16|8) x x) => (ConstBool [false])
+(NeqB (ConstBool [c]) (ConstBool [d])) => (ConstBool [c != d])
+(NeqB (ConstBool [false]) x) => x
+(NeqB (ConstBool [true]) x) => (Not x)
+(NeqB (Not x) (Not y)) => (NeqB x y)
+
+(Eq64 (Const64 <t> [c]) (Add64 (Const64 <t> [d]) x)) => (Eq64 (Const64 <t> [c-d]) x)
+(Eq32 (Const32 <t> [c]) (Add32 (Const32 <t> [d]) x)) => (Eq32 (Const32 <t> [c-d]) x)
+(Eq16 (Const16 <t> [c]) (Add16 (Const16 <t> [d]) x)) => (Eq16 (Const16 <t> [c-d]) x)
+(Eq8 (Const8 <t> [c]) (Add8 (Const8 <t> [d]) x)) => (Eq8 (Const8 <t> [c-d]) x)
+
+(Neq64 (Const64 <t> [c]) (Add64 (Const64 <t> [d]) x)) => (Neq64 (Const64 <t> [c-d]) x)
+(Neq32 (Const32 <t> [c]) (Add32 (Const32 <t> [d]) x)) => (Neq32 (Const32 <t> [c-d]) x)
+(Neq16 (Const16 <t> [c]) (Add16 (Const16 <t> [d]) x)) => (Neq16 (Const16 <t> [c-d]) x)
+(Neq8 (Const8 <t> [c]) (Add8 (Const8 <t> [d]) x)) => (Neq8 (Const8 <t> [c-d]) x)
+
+// signed integer range: ( c <= x && x (<|<=) d ) -> ( unsigned(x-c) (<|<=) unsigned(d-c) )
+(AndB (Leq64 (Const64 [c]) x) ((Less|Leq)64 x (Const64 [d]))) && d >= c => ((Less|Leq)64U (Sub64 <x.Type> x (Const64 <x.Type> [c])) (Const64 <x.Type> [d-c]))
+(AndB (Leq32 (Const32 [c]) x) ((Less|Leq)32 x (Const32 [d]))) && d >= c => ((Less|Leq)32U (Sub32 <x.Type> x (Const32 <x.Type> [c])) (Const32 <x.Type> [d-c]))
+(AndB (Leq16 (Const16 [c]) x) ((Less|Leq)16 x (Const16 [d]))) && d >= c => ((Less|Leq)16U (Sub16 <x.Type> x (Const16 <x.Type> [c])) (Const16 <x.Type> [d-c]))
+(AndB (Leq8 (Const8 [c]) x) ((Less|Leq)8 x (Const8 [d]))) && d >= c => ((Less|Leq)8U (Sub8 <x.Type> x (Const8 <x.Type> [c])) (Const8 <x.Type> [d-c]))
+
+// signed integer range: ( c < x && x (<|<=) d ) -> ( unsigned(x-(c+1)) (<|<=) unsigned(d-(c+1)) )
+(AndB (Less64 (Const64 [c]) x) ((Less|Leq)64 x (Const64 [d]))) && d >= c+1 && c+1 > c => ((Less|Leq)64U (Sub64 <x.Type> x (Const64 <x.Type> [c+1])) (Const64 <x.Type> [d-c-1]))
+(AndB (Less32 (Const32 [c]) x) ((Less|Leq)32 x (Const32 [d]))) && d >= c+1 && c+1 > c => ((Less|Leq)32U (Sub32 <x.Type> x (Const32 <x.Type> [c+1])) (Const32 <x.Type> [d-c-1]))
+(AndB (Less16 (Const16 [c]) x) ((Less|Leq)16 x (Const16 [d]))) && d >= c+1 && c+1 > c => ((Less|Leq)16U (Sub16 <x.Type> x (Const16 <x.Type> [c+1])) (Const16 <x.Type> [d-c-1]))
+(AndB (Less8 (Const8 [c]) x) ((Less|Leq)8 x (Const8 [d]))) && d >= c+1 && c+1 > c => ((Less|Leq)8U (Sub8 <x.Type> x (Const8 <x.Type> [c+1])) (Const8 <x.Type> [d-c-1]))
+
+// unsigned integer range: ( c <= x && x (<|<=) d ) -> ( x-c (<|<=) d-c )
+(AndB (Leq64U (Const64 [c]) x) ((Less|Leq)64U x (Const64 [d]))) && uint64(d) >= uint64(c) => ((Less|Leq)64U (Sub64 <x.Type> x (Const64 <x.Type> [c])) (Const64 <x.Type> [d-c]))
+(AndB (Leq32U (Const32 [c]) x) ((Less|Leq)32U x (Const32 [d]))) && uint32(d) >= uint32(c) => ((Less|Leq)32U (Sub32 <x.Type> x (Const32 <x.Type> [c])) (Const32 <x.Type> [d-c]))
+(AndB (Leq16U (Const16 [c]) x) ((Less|Leq)16U x (Const16 [d]))) && uint16(d) >= uint16(c) => ((Less|Leq)16U (Sub16 <x.Type> x (Const16 <x.Type> [c])) (Const16 <x.Type> [d-c]))
+(AndB (Leq8U (Const8 [c]) x) ((Less|Leq)8U x (Const8 [d]))) && uint8(d) >= uint8(c) => ((Less|Leq)8U (Sub8 <x.Type> x (Const8 <x.Type> [c])) (Const8 <x.Type> [d-c]))
+
+// unsigned integer range: ( c < x && x (<|<=) d ) -> ( x-(c+1) (<|<=) d-(c+1) )
+(AndB (Less64U (Const64 [c]) x) ((Less|Leq)64U x (Const64 [d]))) && uint64(d) >= uint64(c+1) && uint64(c+1) > uint64(c) => ((Less|Leq)64U (Sub64 <x.Type> x (Const64 <x.Type> [c+1])) (Const64 <x.Type> [d-c-1]))
+(AndB (Less32U (Const32 [c]) x) ((Less|Leq)32U x (Const32 [d]))) && uint32(d) >= uint32(c+1) && uint32(c+1) > uint32(c) => ((Less|Leq)32U (Sub32 <x.Type> x (Const32 <x.Type> [c+1])) (Const32 <x.Type> [d-c-1]))
+(AndB (Less16U (Const16 [c]) x) ((Less|Leq)16U x (Const16 [d]))) && uint16(d) >= uint16(c+1) && uint16(c+1) > uint16(c) => ((Less|Leq)16U (Sub16 <x.Type> x (Const16 <x.Type> [c+1])) (Const16 <x.Type> [d-c-1]))
+(AndB (Less8U (Const8 [c]) x) ((Less|Leq)8U x (Const8 [d]))) && uint8(d) >= uint8(c+1) && uint8(c+1) > uint8(c) => ((Less|Leq)8U (Sub8 <x.Type> x (Const8 <x.Type> [c+1])) (Const8 <x.Type> [d-c-1]))
+
+// signed integer range: ( c (<|<=) x || x < d ) -> ( unsigned(c-d) (<|<=) unsigned(x-d) )
+(OrB ((Less|Leq)64 (Const64 [c]) x) (Less64 x (Const64 [d]))) && c >= d => ((Less|Leq)64U (Const64 <x.Type> [c-d]) (Sub64 <x.Type> x (Const64 <x.Type> [d])))
+(OrB ((Less|Leq)32 (Const32 [c]) x) (Less32 x (Const32 [d]))) && c >= d => ((Less|Leq)32U (Const32 <x.Type> [c-d]) (Sub32 <x.Type> x (Const32 <x.Type> [d])))
+(OrB ((Less|Leq)16 (Const16 [c]) x) (Less16 x (Const16 [d]))) && c >= d => ((Less|Leq)16U (Const16 <x.Type> [c-d]) (Sub16 <x.Type> x (Const16 <x.Type> [d])))
+(OrB ((Less|Leq)8 (Const8 [c]) x) (Less8 x (Const8 [d]))) && c >= d => ((Less|Leq)8U (Const8 <x.Type> [c-d]) (Sub8 <x.Type> x (Const8 <x.Type> [d])))
+
+// signed integer range: ( c (<|<=) x || x <= d ) -> ( unsigned(c-(d+1)) (<|<=) unsigned(x-(d+1)) )
+(OrB ((Less|Leq)64 (Const64 [c]) x) (Leq64 x (Const64 [d]))) && c >= d+1 && d+1 > d => ((Less|Leq)64U (Const64 <x.Type> [c-d-1]) (Sub64 <x.Type> x (Const64 <x.Type> [d+1])))
+(OrB ((Less|Leq)32 (Const32 [c]) x) (Leq32 x (Const32 [d]))) && c >= d+1 && d+1 > d => ((Less|Leq)32U (Const32 <x.Type> [c-d-1]) (Sub32 <x.Type> x (Const32 <x.Type> [d+1])))
+(OrB ((Less|Leq)16 (Const16 [c]) x) (Leq16 x (Const16 [d]))) && c >= d+1 && d+1 > d => ((Less|Leq)16U (Const16 <x.Type> [c-d-1]) (Sub16 <x.Type> x (Const16 <x.Type> [d+1])))
+(OrB ((Less|Leq)8 (Const8 [c]) x) (Leq8 x (Const8 [d]))) && c >= d+1 && d+1 > d => ((Less|Leq)8U (Const8 <x.Type> [c-d-1]) (Sub8 <x.Type> x (Const8 <x.Type> [d+1])))
+
+// unsigned integer range: ( c (<|<=) x || x < d ) -> ( c-d (<|<=) x-d )
+(OrB ((Less|Leq)64U (Const64 [c]) x) (Less64U x (Const64 [d]))) && uint64(c) >= uint64(d) => ((Less|Leq)64U (Const64 <x.Type> [c-d]) (Sub64 <x.Type> x (Const64 <x.Type> [d])))
+(OrB ((Less|Leq)32U (Const32 [c]) x) (Less32U x (Const32 [d]))) && uint32(c) >= uint32(d) => ((Less|Leq)32U (Const32 <x.Type> [c-d]) (Sub32 <x.Type> x (Const32 <x.Type> [d])))
+(OrB ((Less|Leq)16U (Const16 [c]) x) (Less16U x (Const16 [d]))) && uint16(c) >= uint16(d) => ((Less|Leq)16U (Const16 <x.Type> [c-d]) (Sub16 <x.Type> x (Const16 <x.Type> [d])))
+(OrB ((Less|Leq)8U (Const8 [c]) x) (Less8U x (Const8 [d]))) && uint8(c) >= uint8(d) => ((Less|Leq)8U (Const8 <x.Type> [c-d]) (Sub8 <x.Type> x (Const8 <x.Type> [d])))
+
+// unsigned integer range: ( c (<|<=) x || x <= d ) -> ( c-(d+1) (<|<=) x-(d+1) )
+(OrB ((Less|Leq)64U (Const64 [c]) x) (Leq64U x (Const64 [d]))) && uint64(c) >= uint64(d+1) && uint64(d+1) > uint64(d) => ((Less|Leq)64U (Const64 <x.Type> [c-d-1]) (Sub64 <x.Type> x (Const64 <x.Type> [d+1])))
+(OrB ((Less|Leq)32U (Const32 [c]) x) (Leq32U x (Const32 [d]))) && uint32(c) >= uint32(d+1) && uint32(d+1) > uint32(d) => ((Less|Leq)32U (Const32 <x.Type> [c-d-1]) (Sub32 <x.Type> x (Const32 <x.Type> [d+1])))
+(OrB ((Less|Leq)16U (Const16 [c]) x) (Leq16U x (Const16 [d]))) && uint16(c) >= uint16(d+1) && uint16(d+1) > uint16(d) => ((Less|Leq)16U (Const16 <x.Type> [c-d-1]) (Sub16 <x.Type> x (Const16 <x.Type> [d+1])))
+(OrB ((Less|Leq)8U (Const8 [c]) x) (Leq8U x (Const8 [d]))) && uint8(c) >= uint8(d+1) && uint8(d+1) > uint8(d) => ((Less|Leq)8U (Const8 <x.Type> [c-d-1]) (Sub8 <x.Type> x (Const8 <x.Type> [d+1])))
+
+// Canonicalize x-const to x+(-const)
+(Sub64 x (Const64 <t> [c])) && x.Op != OpConst64 => (Add64 (Const64 <t> [-c]) x)
+(Sub32 x (Const32 <t> [c])) && x.Op != OpConst32 => (Add32 (Const32 <t> [-c]) x)
+(Sub16 x (Const16 <t> [c])) && x.Op != OpConst16 => (Add16 (Const16 <t> [-c]) x)
+(Sub8 x (Const8 <t> [c])) && x.Op != OpConst8 => (Add8 (Const8 <t> [-c]) x)
+
+// fold negation into comparison operators
+(Not (Eq(64|32|16|8|B|Ptr|64F|32F) x y)) => (Neq(64|32|16|8|B|Ptr|64F|32F) x y)
+(Not (Neq(64|32|16|8|B|Ptr|64F|32F) x y)) => (Eq(64|32|16|8|B|Ptr|64F|32F) x y)
+
+(Not (Less(64|32|16|8) x y)) => (Leq(64|32|16|8) y x)
+(Not (Less(64|32|16|8)U x y)) => (Leq(64|32|16|8)U y x)
+(Not (Leq(64|32|16|8) x y)) => (Less(64|32|16|8) y x)
+(Not (Leq(64|32|16|8)U x y)) => (Less(64|32|16|8)U y x)
+
+// Distribute multiplication c * (d+x) -> c*d + c*x. Useful for:
+// a[i].b = ...; a[i+1].b = ...
+(Mul64 (Const64 <t> [c]) (Add64 <t> (Const64 <t> [d]) x)) =>
+ (Add64 (Const64 <t> [c*d]) (Mul64 <t> (Const64 <t> [c]) x))
+(Mul32 (Const32 <t> [c]) (Add32 <t> (Const32 <t> [d]) x)) =>
+ (Add32 (Const32 <t> [c*d]) (Mul32 <t> (Const32 <t> [c]) x))
+
+// Rewrite x*y ± x*z to x*(y±z)
+(Add(64|32|16|8) <t> (Mul(64|32|16|8) x y) (Mul(64|32|16|8) x z))
+ => (Mul(64|32|16|8) x (Add(64|32|16|8) <t> y z))
+(Sub(64|32|16|8) <t> (Mul(64|32|16|8) x y) (Mul(64|32|16|8) x z))
+ => (Mul(64|32|16|8) x (Sub(64|32|16|8) <t> y z))
+
+// Rewrite shifts by 8/16/32-bit constant amounts into shifts by 64-bit constants,
+// so that the other const-shift rules only need to handle Const64 shift amounts.
+(Lsh64x32 <t> x (Const32 [c])) => (Lsh64x64 x (Const64 <t> [int64(uint32(c))]))
+(Lsh64x16 <t> x (Const16 [c])) => (Lsh64x64 x (Const64 <t> [int64(uint16(c))]))
+(Lsh64x8 <t> x (Const8 [c])) => (Lsh64x64 x (Const64 <t> [int64(uint8(c))]))
+(Rsh64x32 <t> x (Const32 [c])) => (Rsh64x64 x (Const64 <t> [int64(uint32(c))]))
+(Rsh64x16 <t> x (Const16 [c])) => (Rsh64x64 x (Const64 <t> [int64(uint16(c))]))
+(Rsh64x8 <t> x (Const8 [c])) => (Rsh64x64 x (Const64 <t> [int64(uint8(c))]))
+(Rsh64Ux32 <t> x (Const32 [c])) => (Rsh64Ux64 x (Const64 <t> [int64(uint32(c))]))
+(Rsh64Ux16 <t> x (Const16 [c])) => (Rsh64Ux64 x (Const64 <t> [int64(uint16(c))]))
+(Rsh64Ux8 <t> x (Const8 [c])) => (Rsh64Ux64 x (Const64 <t> [int64(uint8(c))]))
+
+(Lsh32x32 <t> x (Const32 [c])) => (Lsh32x64 x (Const64 <t> [int64(uint32(c))]))
+(Lsh32x16 <t> x (Const16 [c])) => (Lsh32x64 x (Const64 <t> [int64(uint16(c))]))
+(Lsh32x8 <t> x (Const8 [c])) => (Lsh32x64 x (Const64 <t> [int64(uint8(c))]))
+(Rsh32x32 <t> x (Const32 [c])) => (Rsh32x64 x (Const64 <t> [int64(uint32(c))]))
+(Rsh32x16 <t> x (Const16 [c])) => (Rsh32x64 x (Const64 <t> [int64(uint16(c))]))
+(Rsh32x8 <t> x (Const8 [c])) => (Rsh32x64 x (Const64 <t> [int64(uint8(c))]))
+(Rsh32Ux32 <t> x (Const32 [c])) => (Rsh32Ux64 x (Const64 <t> [int64(uint32(c))]))
+(Rsh32Ux16 <t> x (Const16 [c])) => (Rsh32Ux64 x (Const64 <t> [int64(uint16(c))]))
+(Rsh32Ux8 <t> x (Const8 [c])) => (Rsh32Ux64 x (Const64 <t> [int64(uint8(c))]))
+
+(Lsh16x32 <t> x (Const32 [c])) => (Lsh16x64 x (Const64 <t> [int64(uint32(c))]))
+(Lsh16x16 <t> x (Const16 [c])) => (Lsh16x64 x (Const64 <t> [int64(uint16(c))]))
+(Lsh16x8 <t> x (Const8 [c])) => (Lsh16x64 x (Const64 <t> [int64(uint8(c))]))
+(Rsh16x32 <t> x (Const32 [c])) => (Rsh16x64 x (Const64 <t> [int64(uint32(c))]))
+(Rsh16x16 <t> x (Const16 [c])) => (Rsh16x64 x (Const64 <t> [int64(uint16(c))]))
+(Rsh16x8 <t> x (Const8 [c])) => (Rsh16x64 x (Const64 <t> [int64(uint8(c))]))
+(Rsh16Ux32 <t> x (Const32 [c])) => (Rsh16Ux64 x (Const64 <t> [int64(uint32(c))]))
+(Rsh16Ux16 <t> x (Const16 [c])) => (Rsh16Ux64 x (Const64 <t> [int64(uint16(c))]))
+(Rsh16Ux8 <t> x (Const8 [c])) => (Rsh16Ux64 x (Const64 <t> [int64(uint8(c))]))
+
+(Lsh8x32 <t> x (Const32 [c])) => (Lsh8x64 x (Const64 <t> [int64(uint32(c))]))
+(Lsh8x16 <t> x (Const16 [c])) => (Lsh8x64 x (Const64 <t> [int64(uint16(c))]))
+(Lsh8x8 <t> x (Const8 [c])) => (Lsh8x64 x (Const64 <t> [int64(uint8(c))]))
+(Rsh8x32 <t> x (Const32 [c])) => (Rsh8x64 x (Const64 <t> [int64(uint32(c))]))
+(Rsh8x16 <t> x (Const16 [c])) => (Rsh8x64 x (Const64 <t> [int64(uint16(c))]))
+(Rsh8x8 <t> x (Const8 [c])) => (Rsh8x64 x (Const64 <t> [int64(uint8(c))]))
+(Rsh8Ux32 <t> x (Const32 [c])) => (Rsh8Ux64 x (Const64 <t> [int64(uint32(c))]))
+(Rsh8Ux16 <t> x (Const16 [c])) => (Rsh8Ux64 x (Const64 <t> [int64(uint16(c))]))
+(Rsh8Ux8 <t> x (Const8 [c])) => (Rsh8Ux64 x (Const64 <t> [int64(uint8(c))]))
+
+// shifts by zero: a shift by the constant 0 returns its operand x unchanged.
+(Lsh(64|32|16|8)x64 x (Const64 [0])) => x
+(Rsh(64|32|16|8)x64 x (Const64 [0])) => x
+(Rsh(64|32|16|8)Ux64 x (Const64 [0])) => x
+
+// rotates by multiples of register width: a constant rotate count that is a
+// multiple of the operand width (c%width == 0) returns x unchanged.
+(RotateLeft64 x (Const64 [c])) && c%64 == 0 => x
+(RotateLeft32 x (Const32 [c])) && c%32 == 0 => x
+(RotateLeft16 x (Const16 [c])) && c%16 == 0 => x
+(RotateLeft8 x (Const8 [c])) && c%8 == 0 => x
+
+// zero shifted: a constant 0 shifted in either direction is still 0.
+// The shift amount is ignored (matched with _), whatever its width.
+(Lsh64x(64|32|16|8) (Const64 [0]) _) => (Const64 [0])
+(Rsh64x(64|32|16|8) (Const64 [0]) _) => (Const64 [0])
+(Rsh64Ux(64|32|16|8) (Const64 [0]) _) => (Const64 [0])
+(Lsh32x(64|32|16|8) (Const32 [0]) _) => (Const32 [0])
+(Rsh32x(64|32|16|8) (Const32 [0]) _) => (Const32 [0])
+(Rsh32Ux(64|32|16|8) (Const32 [0]) _) => (Const32 [0])
+(Lsh16x(64|32|16|8) (Const16 [0]) _) => (Const16 [0])
+(Rsh16x(64|32|16|8) (Const16 [0]) _) => (Const16 [0])
+(Rsh16Ux(64|32|16|8) (Const16 [0]) _) => (Const16 [0])
+(Lsh8x(64|32|16|8) (Const8 [0]) _) => (Const8 [0])
+(Rsh8x(64|32|16|8) (Const8 [0]) _) => (Const8 [0])
+(Rsh8Ux(64|32|16|8) (Const8 [0]) _) => (Const8 [0])
+
+// large left shifts of all values, and right shifts of unsigned values:
+// when the constant amount, compared as a uint64, is at least the operand
+// width, every bit is shifted out and the result is the constant 0.
+((Lsh64|Rsh64U)x64 _ (Const64 [c])) && uint64(c) >= 64 => (Const64 [0])
+((Lsh32|Rsh32U)x64 _ (Const64 [c])) && uint64(c) >= 32 => (Const32 [0])
+((Lsh16|Rsh16U)x64 _ (Const64 [c])) && uint64(c) >= 16 => (Const16 [0])
+((Lsh8|Rsh8U)x64 _ (Const64 [c])) && uint64(c) >= 8 => (Const8 [0])
+
+// combine const shifts: two nested shifts of the same kind by constants c and d
+// become a single shift by c+d. Applied only when !uaddOvf(c,d) holds.
+(Lsh64x64 <t> (Lsh64x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Lsh64x64 x (Const64 <t> [c+d]))
+(Lsh32x64 <t> (Lsh32x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Lsh32x64 x (Const64 <t> [c+d]))
+(Lsh16x64 <t> (Lsh16x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Lsh16x64 x (Const64 <t> [c+d]))
+(Lsh8x64 <t> (Lsh8x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Lsh8x64 x (Const64 <t> [c+d]))
+
+(Rsh64x64 <t> (Rsh64x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh64x64 x (Const64 <t> [c+d]))
+(Rsh32x64 <t> (Rsh32x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh32x64 x (Const64 <t> [c+d]))
+(Rsh16x64 <t> (Rsh16x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh16x64 x (Const64 <t> [c+d]))
+(Rsh8x64 <t> (Rsh8x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh8x64 x (Const64 <t> [c+d]))
+
+(Rsh64Ux64 <t> (Rsh64Ux64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh64Ux64 x (Const64 <t> [c+d]))
+(Rsh32Ux64 <t> (Rsh32Ux64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh32Ux64 x (Const64 <t> [c+d]))
+(Rsh16Ux64 <t> (Rsh16Ux64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh16Ux64 x (Const64 <t> [c+d]))
+(Rsh8Ux64 <t> (Rsh8Ux64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh8Ux64 x (Const64 <t> [c+d]))
+
+// Remove signed right shift before an unsigned right shift that extracts the sign bit.
+// A signed right shift keeps the sign bit, so the inner shift (whatever its
+// amount, matched with _) does not change the result of shifting by width-1.
+(Rsh8Ux64 (Rsh8x64 x _) (Const64 <t> [7] )) => (Rsh8Ux64 x (Const64 <t> [7] ))
+(Rsh16Ux64 (Rsh16x64 x _) (Const64 <t> [15])) => (Rsh16Ux64 x (Const64 <t> [15]))
+(Rsh32Ux64 (Rsh32x64 x _) (Const64 <t> [31])) => (Rsh32Ux64 x (Const64 <t> [31]))
+(Rsh64Ux64 (Rsh64x64 x _) (Const64 <t> [63])) => (Rsh64Ux64 x (Const64 <t> [63]))
+
+// Convert x>>c<<c to x&^(1<<c-1)
+// Both shifts must use the same constant c with 0 <= c < width, and the inner
+// shift must have no other uses (i.Uses == 1). The mask clears the low c bits.
+(Lsh64x64 i:(Rsh(64|64U)x64 x (Const64 [c])) (Const64 [c])) && c >= 0 && c < 64 && i.Uses == 1 => (And64 x (Const64 <v.Type> [int64(-1) << c]))
+(Lsh32x64 i:(Rsh(32|32U)x64 x (Const64 [c])) (Const64 [c])) && c >= 0 && c < 32 && i.Uses == 1 => (And32 x (Const32 <v.Type> [int32(-1) << c]))
+(Lsh16x64 i:(Rsh(16|16U)x64 x (Const64 [c])) (Const64 [c])) && c >= 0 && c < 16 && i.Uses == 1 => (And16 x (Const16 <v.Type> [int16(-1) << c]))
+(Lsh8x64 i:(Rsh(8|8U)x64 x (Const64 [c])) (Const64 [c])) && c >= 0 && c < 8 && i.Uses == 1 => (And8 x (Const8 <v.Type> [int8(-1) << c]))
+// similarly for x<<c>>c (unsigned right shift): the mask keeps the low width-c bits.
+(Rsh64Ux64 i:(Lsh64x64 x (Const64 [c])) (Const64 [c])) && c >= 0 && c < 64 && i.Uses == 1 => (And64 x (Const64 <v.Type> [int64(^uint64(0)>>c)]))
+(Rsh32Ux64 i:(Lsh32x64 x (Const64 [c])) (Const64 [c])) && c >= 0 && c < 32 && i.Uses == 1 => (And32 x (Const32 <v.Type> [int32(^uint32(0)>>c)]))
+(Rsh16Ux64 i:(Lsh16x64 x (Const64 [c])) (Const64 [c])) && c >= 0 && c < 16 && i.Uses == 1 => (And16 x (Const16 <v.Type> [int16(^uint16(0)>>c)]))
+(Rsh8Ux64 i:(Lsh8x64 x (Const64 [c])) (Const64 [c])) && c >= 0 && c < 8 && i.Uses == 1 => (And8 x (Const8 <v.Type> [int8 (^uint8 (0)>>c)]))
+
+// ((x >> c1) << c2) >> c3
+// With c1 >= c2 and c3 >= c2 (compared as unsigned), the three shifts collapse
+// to one unsigned right shift by c1-c2+c3, provided that sum passes !uaddOvf.
+(Rsh(64|32|16|8)Ux64 (Lsh(64|32|16|8)x64 (Rsh(64|32|16|8)Ux64 x (Const64 [c1])) (Const64 [c2])) (Const64 [c3]))
+ && uint64(c1) >= uint64(c2) && uint64(c3) >= uint64(c2) && !uaddOvf(c1-c2, c3)
+ => (Rsh(64|32|16|8)Ux64 x (Const64 <typ.UInt64> [c1-c2+c3]))
+
+// ((x << c1) >> c2) << c3
+// Mirror image of the rule above: collapses to one left shift by c1-c2+c3.
+(Lsh(64|32|16|8)x64 (Rsh(64|32|16|8)Ux64 (Lsh(64|32|16|8)x64 x (Const64 [c1])) (Const64 [c2])) (Const64 [c3]))
+ && uint64(c1) >= uint64(c2) && uint64(c3) >= uint64(c2) && !uaddOvf(c1-c2, c3)
+ => (Lsh(64|32|16|8)x64 x (Const64 <typ.UInt64> [c1-c2+c3]))
+
+// (x >> c) & uppermask = 0
+// An unsigned right shift by c leaves only the low width-c bits possibly set.
+// The And is 0 when the mask m has no bits in that range, which the condition
+// tests via ntzN(m) (presumably the count of trailing zero bits of m).
+(And64 (Const64 [m]) (Rsh64Ux64 _ (Const64 [c]))) && c >= int64(64-ntz64(m)) => (Const64 [0])
+(And32 (Const32 [m]) (Rsh32Ux64 _ (Const64 [c]))) && c >= int64(32-ntz32(m)) => (Const32 [0])
+(And16 (Const16 [m]) (Rsh16Ux64 _ (Const64 [c]))) && c >= int64(16-ntz16(m)) => (Const16 [0])
+(And8 (Const8 [m]) (Rsh8Ux64 _ (Const64 [c]))) && c >= int64(8-ntz8(m)) => (Const8 [0])
+
+// (x << c) & lowermask = 0
+// Symmetric case: a left shift by c clears the low c bits, so the And is 0 when
+// m fits entirely in those bits (tested via nlzN(m), presumably leading zeros).
+(And64 (Const64 [m]) (Lsh64x64 _ (Const64 [c]))) && c >= int64(64-nlz64(m)) => (Const64 [0])
+(And32 (Const32 [m]) (Lsh32x64 _ (Const64 [c]))) && c >= int64(32-nlz32(m)) => (Const32 [0])
+(And16 (Const16 [m]) (Lsh16x64 _ (Const64 [c]))) && c >= int64(16-nlz16(m)) => (Const16 [0])
+(And8 (Const8 [m]) (Lsh8x64 _ (Const64 [c]))) && c >= int64(8-nlz8(m)) => (Const8 [0])
+
+// replace shifts with zero extensions
+// x<<k followed by an unsigned >>k, where k is the operand width minus 8, 16 or
+// 32, becomes a truncation to that narrower width followed by a zero extension.
+(Rsh16Ux64 (Lsh16x64 x (Const64 [8])) (Const64 [8])) => (ZeroExt8to16 (Trunc16to8 <typ.UInt8> x))
+(Rsh32Ux64 (Lsh32x64 x (Const64 [24])) (Const64 [24])) => (ZeroExt8to32 (Trunc32to8 <typ.UInt8> x))
+(Rsh64Ux64 (Lsh64x64 x (Const64 [56])) (Const64 [56])) => (ZeroExt8to64 (Trunc64to8 <typ.UInt8> x))
+(Rsh32Ux64 (Lsh32x64 x (Const64 [16])) (Const64 [16])) => (ZeroExt16to32 (Trunc32to16 <typ.UInt16> x))
+(Rsh64Ux64 (Lsh64x64 x (Const64 [48])) (Const64 [48])) => (ZeroExt16to64 (Trunc64to16 <typ.UInt16> x))
+(Rsh64Ux64 (Lsh64x64 x (Const64 [32])) (Const64 [32])) => (ZeroExt32to64 (Trunc64to32 <typ.UInt32> x))
+
+// replace shifts with sign extensions
+// Same shape with a signed right shift: truncate, then sign-extend.
+(Rsh16x64 (Lsh16x64 x (Const64 [8])) (Const64 [8])) => (SignExt8to16 (Trunc16to8 <typ.Int8> x))
+(Rsh32x64 (Lsh32x64 x (Const64 [24])) (Const64 [24])) => (SignExt8to32 (Trunc32to8 <typ.Int8> x))
+(Rsh64x64 (Lsh64x64 x (Const64 [56])) (Const64 [56])) => (SignExt8to64 (Trunc64to8 <typ.Int8> x))
+(Rsh32x64 (Lsh32x64 x (Const64 [16])) (Const64 [16])) => (SignExt16to32 (Trunc32to16 <typ.Int16> x))
+(Rsh64x64 (Lsh64x64 x (Const64 [48])) (Const64 [48])) => (SignExt16to64 (Trunc64to16 <typ.Int16> x))
+(Rsh64x64 (Lsh64x64 x (Const64 [32])) (Const64 [32])) => (SignExt32to64 (Trunc64to32 <typ.Int32> x))
+
+// constant comparisons: comparing two integer constants folds to a ConstBool.
+(Eq(64|32|16|8) (Const(64|32|16|8) [c]) (Const(64|32|16|8) [d])) => (ConstBool [c == d])
+(Neq(64|32|16|8) (Const(64|32|16|8) [c]) (Const(64|32|16|8) [d])) => (ConstBool [c != d])
+(Less(64|32|16|8) (Const(64|32|16|8) [c]) (Const(64|32|16|8) [d])) => (ConstBool [c < d])
+(Leq(64|32|16|8) (Const(64|32|16|8) [c]) (Const(64|32|16|8) [d])) => (ConstBool [c <= d])
+
+// unsigned variants convert both constants to the unsigned type before comparing.
+(Less64U (Const64 [c]) (Const64 [d])) => (ConstBool [uint64(c) < uint64(d)])
+(Less32U (Const32 [c]) (Const32 [d])) => (ConstBool [uint32(c) < uint32(d)])
+(Less16U (Const16 [c]) (Const16 [d])) => (ConstBool [uint16(c) < uint16(d)])
+(Less8U (Const8 [c]) (Const8 [d])) => (ConstBool [ uint8(c) < uint8(d)])
+
+(Leq64U (Const64 [c]) (Const64 [d])) => (ConstBool [uint64(c) <= uint64(d)])
+(Leq32U (Const32 [c]) (Const32 [d])) => (ConstBool [uint32(c) <= uint32(d)])
+(Leq16U (Const16 [c]) (Const16 [d])) => (ConstBool [uint16(c) <= uint16(d)])
+(Leq8U (Const8 [c]) (Const8 [d])) => (ConstBool [ uint8(c) <= uint8(d)])
+
+// 0 <= (_ & c) always holds when the constant mask c is non-negative,
+// because the And then has its sign bit clear.
+(Leq8 (Const8 [0]) (And8 _ (Const8 [c]))) && c >= 0 => (ConstBool [true])
+(Leq16 (Const16 [0]) (And16 _ (Const16 [c]))) && c >= 0 => (ConstBool [true])
+(Leq32 (Const32 [0]) (And32 _ (Const32 [c]))) && c >= 0 => (ConstBool [true])
+(Leq64 (Const64 [0]) (And64 _ (Const64 [c]))) && c >= 0 => (ConstBool [true])
+
+// 0 <= (_ >> c) always holds for an unsigned right shift by a positive constant.
+(Leq8 (Const8 [0]) (Rsh8Ux64 _ (Const64 [c]))) && c > 0 => (ConstBool [true])
+(Leq16 (Const16 [0]) (Rsh16Ux64 _ (Const64 [c]))) && c > 0 => (ConstBool [true])
+(Leq32 (Const32 [0]) (Rsh32Ux64 _ (Const64 [c]))) && c > 0 => (ConstBool [true])
+(Leq64 (Const64 [0]) (Rsh64Ux64 _ (Const64 [c]))) && c > 0 => (ConstBool [true])
+
+// for x satisfying isNonNegative: 0 < x becomes 0 != x, and x < 1 becomes 0 == x.
+(Less(64|32|16|8) (Const(64|32|16|8) <t> [0]) x) && isNonNegative(x) => (Neq(64|32|16|8) (Const(64|32|16|8) <t> [0]) x)
+(Less(64|32|16|8) x (Const(64|32|16|8) <t> [1])) && isNonNegative(x) => (Eq(64|32|16|8) (Const(64|32|16|8) <t> [0]) x)
+
+// constant floating point comparisons: fold Eq/Neq/Less/Leq of two float
+// constants by applying the corresponding Go operator to their values.
+(Eq32F (Const32F [c]) (Const32F [d])) => (ConstBool [c == d])
+(Eq64F (Const64F [c]) (Const64F [d])) => (ConstBool [c == d])
+(Neq32F (Const32F [c]) (Const32F [d])) => (ConstBool [c != d])
+(Neq64F (Const64F [c]) (Const64F [d])) => (ConstBool [c != d])
+(Less32F (Const32F [c]) (Const32F [d])) => (ConstBool [c < d])
+(Less64F (Const64F [c]) (Const64F [d])) => (ConstBool [c < d])
+(Leq32F (Const32F [c]) (Const32F [d])) => (ConstBool [c <= d])
+(Leq64F (Const64F [c]) (Const64F [d])) => (ConstBool [c <= d])
+
+// simplifications: algebraic identities on Or, And, Xor, Add, Sub, Mul, Com and Neg.
+// x|x == x, 0|x == x, -1|_ == -1, ^x|x == -1
+(Or(64|32|16|8) x x) => x
+(Or(64|32|16|8) (Const(64|32|16|8) [0]) x) => x
+(Or(64|32|16|8) (Const(64|32|16|8) [-1]) _) => (Const(64|32|16|8) [-1])
+(Or(64|32|16|8) (Com(64|32|16|8) x) x) => (Const(64|32|16|8) [-1])
+
+// x&x == x, -1&x == x, 0&_ == 0, ^x&x == 0
+(And(64|32|16|8) x x) => x
+(And(64|32|16|8) (Const(64|32|16|8) [-1]) x) => x
+(And(64|32|16|8) (Const(64|32|16|8) [0]) _) => (Const(64|32|16|8) [0])
+(And(64|32|16|8) (Com(64|32|16|8) x) x) => (Const(64|32|16|8) [0])
+
+// x^x == 0, 0^x == x, ^x^x == -1
+(Xor(64|32|16|8) x x) => (Const(64|32|16|8) [0])
+(Xor(64|32|16|8) (Const(64|32|16|8) [0]) x) => x
+(Xor(64|32|16|8) (Com(64|32|16|8) x) x) => (Const(64|32|16|8) [-1])
+
+// 0+x == x, x-x == 0, 0*_ == 0; an overflow-checked multiply by 0 yields
+// product 0 (Select0) and overflow flag false (Select1).
+(Add(64|32|16|8) (Const(64|32|16|8) [0]) x) => x
+(Sub(64|32|16|8) x x) => (Const(64|32|16|8) [0])
+(Mul(64|32|16|8) (Const(64|32|16|8) [0]) _) => (Const(64|32|16|8) [0])
+(Select0 (Mul(64|32)uover (Const(64|32) [0]) x)) => (Const(64|32) [0])
+(Select1 (Mul(64|32)uover (Const(64|32) [0]) x)) => (ConstBool [false])
+
+// ^^x == x; complement of a constant folds.
+(Com(64|32|16|8) (Com(64|32|16|8) x)) => x
+(Com(64|32|16|8) (Const(64|32|16|8) [c])) => (Const(64|32|16|8) [^c])
+
+// -(x-y) == y-x, x+(-y) == x-y
+(Neg(64|32|16|8) (Sub(64|32|16|8) x y)) => (Sub(64|32|16|8) y x)
+(Add(64|32|16|8) x (Neg(64|32|16|8) y)) => (Sub(64|32|16|8) x y)
+
+// -1^x == ^x
+(Xor(64|32|16|8) (Const(64|32|16|8) [-1]) x) => (Com(64|32|16|8) x)
+
+// -x - ^x == 1, ^x - (-x) == -1, ^x + x == -1
+(Sub(64|32|16|8) (Neg(64|32|16|8) x) (Com(64|32|16|8) x)) => (Const(64|32|16|8) [1])
+(Sub(64|32|16|8) (Com(64|32|16|8) x) (Neg(64|32|16|8) x)) => (Const(64|32|16|8) [-1])
+(Add(64|32|16|8) (Com(64|32|16|8) x) x) => (Const(64|32|16|8) [-1])
+
+// ^(x-1) == ^x+1 == -x
+(Add(64|32|16|8) (Const(64|32|16|8) [1]) (Com(64|32|16|8) x)) => (Neg(64|32|16|8) x)
+(Com(64|32|16|8) (Add(64|32|16|8) (Const(64|32|16|8) [-1]) x)) => (Neg(64|32|16|8) x)
+
+// -(-x) == x
+(Neg(64|32|16|8) (Neg(64|32|16|8) x)) => x
+
+// -^x == x+1
+(Neg(64|32|16|8) <t> (Com(64|32|16|8) x)) => (Add(64|32|16|8) (Const(64|32|16|8) <t> [1]) x)
+
+// a repeated operand is absorbed: x&(x&y) == x&y, x|(x|y) == x|y, x^(x^y) == y
+(And(64|32|16|8) x (And(64|32|16|8) x y)) => (And(64|32|16|8) x y)
+(Or(64|32|16|8) x (Or(64|32|16|8) x y)) => (Or(64|32|16|8) x y)
+(Xor(64|32|16|8) x (Xor(64|32|16|8) x y)) => y
+
+// Unsigned comparisons to zero.
+// No unsigned value is less than 0, and 0 is less than or equal to every unsigned value.
+(Less(64U|32U|16U|8U) _ (Const(64|32|16|8) [0])) => (ConstBool [false])
+(Leq(64U|32U|16U|8U) (Const(64|32|16|8) [0]) _) => (ConstBool [true])
+
+// Ands clear bits. Ors set bits.
+// If a subsequent Or will set all the bits
+// that an And cleared, we can skip the And.
+// This happens in bitmasking code like:
+// x &^= 3 << shift // clear two old bits
+// x |= v << shift // set two new bits
+// when shift is a small constant and v ends up a constant 3.
+// The condition ^(c1 | c2) == 0 says every bit is set in c1 or c2, i.e. each
+// bit the And mask c2 clears is set again by the Or constant c1.
+(Or8 (And8 x (Const8 [c2])) (Const8 <t> [c1])) && ^(c1 | c2) == 0 => (Or8 (Const8 <t> [c1]) x)
+(Or16 (And16 x (Const16 [c2])) (Const16 <t> [c1])) && ^(c1 | c2) == 0 => (Or16 (Const16 <t> [c1]) x)
+(Or32 (And32 x (Const32 [c2])) (Const32 <t> [c1])) && ^(c1 | c2) == 0 => (Or32 (Const32 <t> [c1]) x)
+(Or64 (And64 x (Const64 [c2])) (Const64 <t> [c1])) && ^(c1 | c2) == 0 => (Or64 (Const64 <t> [c1]) x)
+
+// A truncation ignores an And whose constant mask keeps all of the low bits
+// that survive the truncation.
+(Trunc64to8 (And64 (Const64 [y]) x)) && y&0xFF == 0xFF => (Trunc64to8 x)
+(Trunc64to16 (And64 (Const64 [y]) x)) && y&0xFFFF == 0xFFFF => (Trunc64to16 x)
+(Trunc64to32 (And64 (Const64 [y]) x)) && y&0xFFFFFFFF == 0xFFFFFFFF => (Trunc64to32 x)
+(Trunc32to8 (And32 (Const32 [y]) x)) && y&0xFF == 0xFF => (Trunc32to8 x)
+(Trunc32to16 (And32 (Const32 [y]) x)) && y&0xFFFF == 0xFFFF => (Trunc32to16 x)
+(Trunc16to8 (And16 (Const16 [y]) x)) && y&0xFF == 0xFF => (Trunc16to8 x)
+
+// Truncating and then zero-extending x is a no-op when x is an unsigned right
+// shift by at least the number of bits the truncation drops.
+(ZeroExt8to64 (Trunc64to8 x:(Rsh64Ux64 _ (Const64 [s])))) && s >= 56 => x
+(ZeroExt16to64 (Trunc64to16 x:(Rsh64Ux64 _ (Const64 [s])))) && s >= 48 => x
+(ZeroExt32to64 (Trunc64to32 x:(Rsh64Ux64 _ (Const64 [s])))) && s >= 32 => x
+(ZeroExt8to32 (Trunc32to8 x:(Rsh32Ux64 _ (Const64 [s])))) && s >= 24 => x
+(ZeroExt16to32 (Trunc32to16 x:(Rsh32Ux64 _ (Const64 [s])))) && s >= 16 => x
+(ZeroExt8to16 (Trunc16to8 x:(Rsh16Ux64 _ (Const64 [s])))) && s >= 8 => x
+
+// Likewise for sign extension when x is a signed right shift by that amount.
+(SignExt8to64 (Trunc64to8 x:(Rsh64x64 _ (Const64 [s])))) && s >= 56 => x
+(SignExt16to64 (Trunc64to16 x:(Rsh64x64 _ (Const64 [s])))) && s >= 48 => x
+(SignExt32to64 (Trunc64to32 x:(Rsh64x64 _ (Const64 [s])))) && s >= 32 => x
+(SignExt8to32 (Trunc32to8 x:(Rsh32x64 _ (Const64 [s])))) && s >= 24 => x
+(SignExt16to32 (Trunc32to16 x:(Rsh32x64 _ (Const64 [s])))) && s >= 16 => x
+(SignExt8to16 (Trunc16to8 x:(Rsh16x64 _ (Const64 [s])))) && s >= 8 => x
+
+// Slicemask of a constant: a positive constant gives -1 (all ones), zero gives 0.
+(Slicemask (Const32 [x])) && x > 0 => (Const32 [-1])
+(Slicemask (Const32 [0])) => (Const32 [0])
+(Slicemask (Const64 [x])) && x > 0 => (Const64 [-1])
+(Slicemask (Const64 [0])) => (Const64 [0])
+
+// simplifications often used for lengths. e.g. len(s[i:i+5])==5
+// (x+y)-x == y, (x+y)-y == x, (x-y)-x == -y, x-(x+y) == -y,
+// x+(y-x) == y, x+(y+(z-x)) == y+z
+(Sub(64|32|16|8) (Add(64|32|16|8) x y) x) => y
+(Sub(64|32|16|8) (Add(64|32|16|8) x y) y) => x
+(Sub(64|32|16|8) (Sub(64|32|16|8) x y) x) => (Neg(64|32|16|8) y)
+(Sub(64|32|16|8) x (Add(64|32|16|8) x y)) => (Neg(64|32|16|8) y)
+(Add(64|32|16|8) x (Sub(64|32|16|8) y x)) => y
+(Add(64|32|16|8) x (Add(64|32|16|8) y (Sub(64|32|16|8) z x))) => (Add(64|32|16|8) y z)
+
+// basic phi simplifications: a two-argument Phi whose arguments are the same
+// integer constant is replaced by that constant.
+(Phi (Const8 [c]) (Const8 [c])) => (Const8 [c])
+(Phi (Const16 [c]) (Const16 [c])) => (Const16 [c])
+(Phi (Const32 [c]) (Const32 [c])) => (Const32 [c])
+(Phi (Const64 [c]) (Const64 [c])) => (Const64 [c])
+
+// slice and interface comparisons
+// The frontend ensures that we can only compare against nil,
+// so we need only compare the first word (interface type or slice ptr).
+// Interfaces are compared through ITab, slices through SlicePtr.
+(EqInter x y) => (EqPtr (ITab x) (ITab y))
+(NeqInter x y) => (NeqPtr (ITab x) (ITab y))
+(EqSlice x y) => (EqPtr (SlicePtr x) (SlicePtr y))
+(NeqSlice x y) => (NeqPtr (SlicePtr x) (SlicePtr y))
+
+// Load of store of same address, with compatibly typed value and same size
+// The Load is replaced by the stored value x. The later forms look through one,
+// two or three intervening Stores, each of which must be disjoint from the
+// Store that supplies x.
+// NOTE(review): in the multi-store forms the size check compares t1 against t2
+// (the outermost intervening Store), not the type of the Store that supplies
+// x - confirm this is intended.
+(Load <t1> p1 (Store {t2} p2 x _))
+ && isSamePtr(p1, p2)
+ && t1.Compare(x.Type) == types.CMPeq
+ && t1.Size() == t2.Size()
+ => x
+(Load <t1> p1 (Store {t2} p2 _ (Store {t3} p3 x _)))
+ && isSamePtr(p1, p3)
+ && t1.Compare(x.Type) == types.CMPeq
+ && t1.Size() == t2.Size()
+ && disjoint(p3, t3.Size(), p2, t2.Size())
+ => x
+(Load <t1> p1 (Store {t2} p2 _ (Store {t3} p3 _ (Store {t4} p4 x _))))
+ && isSamePtr(p1, p4)
+ && t1.Compare(x.Type) == types.CMPeq
+ && t1.Size() == t2.Size()
+ && disjoint(p4, t4.Size(), p2, t2.Size())
+ && disjoint(p4, t4.Size(), p3, t3.Size())
+ => x
+(Load <t1> p1 (Store {t2} p2 _ (Store {t3} p3 _ (Store {t4} p4 _ (Store {t5} p5 x _)))))
+ && isSamePtr(p1, p5)
+ && t1.Compare(x.Type) == types.CMPeq
+ && t1.Size() == t2.Size()
+ && disjoint(p5, t5.Size(), p2, t2.Size())
+ && disjoint(p5, t5.Size(), p3, t3.Size())
+ && disjoint(p5, t5.Size(), p4, t4.Size())
+ => x
+
+// Pass constants through math.Float{32,64}bits and math.Float{32,64}frombits
+// A float Load from an address where an integer constant of the same size was
+// just stored becomes the float constant with those bits, and vice versa.
+// The integer-to-float direction is skipped when the bits encode a NaN.
+ (Load <t1> p1 (Store {t2} p2 (Const64 [x]) _)) && isSamePtr(p1,p2) && sizeof(t2) == 8 && is64BitFloat(t1) && !math.IsNaN(math.Float64frombits(uint64(x))) => (Const64F [math.Float64frombits(uint64(x))])
+ (Load <t1> p1 (Store {t2} p2 (Const32 [x]) _)) && isSamePtr(p1,p2) && sizeof(t2) == 4 && is32BitFloat(t1) && !math.IsNaN(float64(math.Float32frombits(uint32(x)))) => (Const32F [math.Float32frombits(uint32(x))])
+(Load <t1> p1 (Store {t2} p2 (Const64F [x]) _)) && isSamePtr(p1,p2) && sizeof(t2) == 8 && is64BitInt(t1) => (Const64 [int64(math.Float64bits(x))])
+(Load <t1> p1 (Store {t2} p2 (Const32F [x]) _)) && isSamePtr(p1,p2) && sizeof(t2) == 4 && is32BitInt(t1) => (Const32 [int32(math.Float32bits(x))])
+
+// Float Loads up to Zeros so they can be constant folded.
+// That is: move a Load past one to four intervening Stores so that it reads
+// directly from the Zero's memory state, in the Zero's block. This requires the
+// loaded range [o1, o1+size) to lie inside the n zeroed bytes, the type to be
+// SSA-able, and every intervening Store to be disjoint from the loaded range.
+(Load <t1> op:(OffPtr [o1] p1)
+ (Store {t2} p2 _
+ mem:(Zero [n] p3 _)))
+ && o1 >= 0 && o1+t1.Size() <= n && isSamePtr(p1, p3)
+ && fe.CanSSA(t1)
+ && disjoint(op, t1.Size(), p2, t2.Size())
+ => @mem.Block (Load <t1> (OffPtr <op.Type> [o1] p3) mem)
+(Load <t1> op:(OffPtr [o1] p1)
+ (Store {t2} p2 _
+ (Store {t3} p3 _
+ mem:(Zero [n] p4 _))))
+ && o1 >= 0 && o1+t1.Size() <= n && isSamePtr(p1, p4)
+ && fe.CanSSA(t1)
+ && disjoint(op, t1.Size(), p2, t2.Size())
+ && disjoint(op, t1.Size(), p3, t3.Size())
+ => @mem.Block (Load <t1> (OffPtr <op.Type> [o1] p4) mem)
+(Load <t1> op:(OffPtr [o1] p1)
+ (Store {t2} p2 _
+ (Store {t3} p3 _
+ (Store {t4} p4 _
+ mem:(Zero [n] p5 _)))))
+ && o1 >= 0 && o1+t1.Size() <= n && isSamePtr(p1, p5)
+ && fe.CanSSA(t1)
+ && disjoint(op, t1.Size(), p2, t2.Size())
+ && disjoint(op, t1.Size(), p3, t3.Size())
+ && disjoint(op, t1.Size(), p4, t4.Size())
+ => @mem.Block (Load <t1> (OffPtr <op.Type> [o1] p5) mem)
+(Load <t1> op:(OffPtr [o1] p1)
+ (Store {t2} p2 _
+ (Store {t3} p3 _
+ (Store {t4} p4 _
+ (Store {t5} p5 _
+ mem:(Zero [n] p6 _))))))
+ && o1 >= 0 && o1+t1.Size() <= n && isSamePtr(p1, p6)
+ && fe.CanSSA(t1)
+ && disjoint(op, t1.Size(), p2, t2.Size())
+ && disjoint(op, t1.Size(), p3, t3.Size())
+ && disjoint(op, t1.Size(), p4, t4.Size())
+ && disjoint(op, t1.Size(), p5, t5.Size())
+ => @mem.Block (Load <t1> (OffPtr <op.Type> [o1] p6) mem)
+
+// Zero to Load forwarding.
+// A Load at offset o from the base pointer of an n-byte Zero yields the zero
+// constant of the loaded type, provided the loaded bytes lie inside the zeroed
+// region: o must be non-negative and o plus the loaded size must not exceed n.
+// The o >= 0 bound matches the other Zero-based rules in this file; without it
+// a negative offset could satisfy n >= o + size while reading before the region.
+(Load <t1> (OffPtr [o] p1) (Zero [n] p2 _))
+ && t1.IsBoolean()
+ && isSamePtr(p1, p2)
+ && o >= 0 && n >= o + 1
+ => (ConstBool [false])
+(Load <t1> (OffPtr [o] p1) (Zero [n] p2 _))
+ && is8BitInt(t1)
+ && isSamePtr(p1, p2)
+ && o >= 0 && n >= o + 1
+ => (Const8 [0])
+(Load <t1> (OffPtr [o] p1) (Zero [n] p2 _))
+ && is16BitInt(t1)
+ && isSamePtr(p1, p2)
+ && o >= 0 && n >= o + 2
+ => (Const16 [0])
+(Load <t1> (OffPtr [o] p1) (Zero [n] p2 _))
+ && is32BitInt(t1)
+ && isSamePtr(p1, p2)
+ && o >= 0 && n >= o + 4
+ => (Const32 [0])
+(Load <t1> (OffPtr [o] p1) (Zero [n] p2 _))
+ && is64BitInt(t1)
+ && isSamePtr(p1, p2)
+ && o >= 0 && n >= o + 8
+ => (Const64 [0])
+(Load <t1> (OffPtr [o] p1) (Zero [n] p2 _))
+ && is32BitFloat(t1)
+ && isSamePtr(p1, p2)
+ && o >= 0 && n >= o + 4
+ => (Const32F [0])
+(Load <t1> (OffPtr [o] p1) (Zero [n] p2 _))
+ && is64BitFloat(t1)
+ && isSamePtr(p1, p2)
+ && o >= 0 && n >= o + 8
+ => (Const64F [0])
+
+// Eliminate stores of values that have just been loaded from the same location.
+// We also handle the common case where there are some intermediate stores.
+// The Load and Store must use the same pointer and size. Up to three
+// intermediate Stores are allowed if each is disjoint from the re-stored
+// range; the Store is then replaced by its incoming memory state.
+(Store {t1} p1 (Load <t2> p2 mem) mem)
+ && isSamePtr(p1, p2)
+ && t2.Size() == t1.Size()
+ => mem
+(Store {t1} p1 (Load <t2> p2 oldmem) mem:(Store {t3} p3 _ oldmem))
+ && isSamePtr(p1, p2)
+ && t2.Size() == t1.Size()
+ && disjoint(p1, t1.Size(), p3, t3.Size())
+ => mem
+(Store {t1} p1 (Load <t2> p2 oldmem) mem:(Store {t3} p3 _ (Store {t4} p4 _ oldmem)))
+ && isSamePtr(p1, p2)
+ && t2.Size() == t1.Size()
+ && disjoint(p1, t1.Size(), p3, t3.Size())
+ && disjoint(p1, t1.Size(), p4, t4.Size())
+ => mem
+(Store {t1} p1 (Load <t2> p2 oldmem) mem:(Store {t3} p3 _ (Store {t4} p4 _ (Store {t5} p5 _ oldmem))))
+ && isSamePtr(p1, p2)
+ && t2.Size() == t1.Size()
+ && disjoint(p1, t1.Size(), p3, t3.Size())
+ && disjoint(p1, t1.Size(), p4, t4.Size())
+ && disjoint(p1, t1.Size(), p5, t5.Size())
+ => mem
+
+// Don't Store zeros to cleared variables.
+// A Store of a zero value (isConstZero) into a range that lies inside an
+// n-byte Zero of the same base pointer is dropped. Up to three intermediate
+// Stores are allowed if each is disjoint from the stored range.
+(Store {t} (OffPtr [o] p1) x mem:(Zero [n] p2 _))
+ && isConstZero(x)
+ && o >= 0 && t.Size() + o <= n && isSamePtr(p1, p2)
+ => mem
+(Store {t1} op:(OffPtr [o1] p1) x mem:(Store {t2} p2 _ (Zero [n] p3 _)))
+ && isConstZero(x)
+ && o1 >= 0 && t1.Size() + o1 <= n && isSamePtr(p1, p3)
+ && disjoint(op, t1.Size(), p2, t2.Size())
+ => mem
+(Store {t1} op:(OffPtr [o1] p1) x mem:(Store {t2} p2 _ (Store {t3} p3 _ (Zero [n] p4 _))))
+ && isConstZero(x)
+ && o1 >= 0 && t1.Size() + o1 <= n && isSamePtr(p1, p4)
+ && disjoint(op, t1.Size(), p2, t2.Size())
+ && disjoint(op, t1.Size(), p3, t3.Size())
+ => mem
+(Store {t1} op:(OffPtr [o1] p1) x mem:(Store {t2} p2 _ (Store {t3} p3 _ (Store {t4} p4 _ (Zero [n] p5 _)))))
+ && isConstZero(x)
+ && o1 >= 0 && t1.Size() + o1 <= n && isSamePtr(p1, p5)
+ && disjoint(op, t1.Size(), p2, t2.Size())
+ && disjoint(op, t1.Size(), p3, t3.Size())
+ && disjoint(op, t1.Size(), p4, t4.Size())
+ => mem
+
+// Collapse OffPtr: nested offsets are added into one OffPtr, and a zero offset
+// is dropped when the result type compares equal to the pointer's type.
+(OffPtr (OffPtr p [y]) [x]) => (OffPtr p [x+y])
+(OffPtr p [0]) && v.Type.Compare(p.Type) == types.CMPeq => p
+
+// indexing operations
+// Note: bounds check has already been done
+// PtrIndex becomes ptr + idx*elemsize, using 32- or 64-bit arithmetic according
+// to config.PtrSize. The 32-bit form also requires the element size to pass is32Bit.
+(PtrIndex <t> ptr idx) && config.PtrSize == 4 && is32Bit(t.Elem().Size()) => (AddPtr ptr (Mul32 <typ.Int> idx (Const32 <typ.Int> [int32(t.Elem().Size())])))
+(PtrIndex <t> ptr idx) && config.PtrSize == 8 => (AddPtr ptr (Mul64 <typ.Int> idx (Const64 <typ.Int> [t.Elem().Size()])))
+
+// struct operations
+// Selecting field i of a StructMakeN yields the i'th argument directly.
+(StructSelect (StructMake1 x)) => x
+(StructSelect [0] (StructMake2 x _)) => x
+(StructSelect [1] (StructMake2 _ x)) => x
+(StructSelect [0] (StructMake3 x _ _)) => x
+(StructSelect [1] (StructMake3 _ x _)) => x
+(StructSelect [2] (StructMake3 _ _ x)) => x
+(StructSelect [0] (StructMake4 x _ _ _)) => x
+(StructSelect [1] (StructMake4 _ x _ _)) => x
+(StructSelect [2] (StructMake4 _ _ x _)) => x
+(StructSelect [3] (StructMake4 _ _ _ x)) => x
+
+// A Load of an SSA-able struct with zero to four fields is split into a
+// StructMakeN of per-field Loads, each at its field offset from ptr.
+(Load <t> _ _) && t.IsStruct() && t.NumFields() == 0 && fe.CanSSA(t) =>
+ (StructMake0)
+(Load <t> ptr mem) && t.IsStruct() && t.NumFields() == 1 && fe.CanSSA(t) =>
+ (StructMake1
+ (Load <t.FieldType(0)> (OffPtr <t.FieldType(0).PtrTo()> [0] ptr) mem))
+(Load <t> ptr mem) && t.IsStruct() && t.NumFields() == 2 && fe.CanSSA(t) =>
+ (StructMake2
+ (Load <t.FieldType(0)> (OffPtr <t.FieldType(0).PtrTo()> [0] ptr) mem)
+ (Load <t.FieldType(1)> (OffPtr <t.FieldType(1).PtrTo()> [t.FieldOff(1)] ptr) mem))
+(Load <t> ptr mem) && t.IsStruct() && t.NumFields() == 3 && fe.CanSSA(t) =>
+ (StructMake3
+ (Load <t.FieldType(0)> (OffPtr <t.FieldType(0).PtrTo()> [0] ptr) mem)
+ (Load <t.FieldType(1)> (OffPtr <t.FieldType(1).PtrTo()> [t.FieldOff(1)] ptr) mem)
+ (Load <t.FieldType(2)> (OffPtr <t.FieldType(2).PtrTo()> [t.FieldOff(2)] ptr) mem))
+(Load <t> ptr mem) && t.IsStruct() && t.NumFields() == 4 && fe.CanSSA(t) =>
+ (StructMake4
+ (Load <t.FieldType(0)> (OffPtr <t.FieldType(0).PtrTo()> [0] ptr) mem)
+ (Load <t.FieldType(1)> (OffPtr <t.FieldType(1).PtrTo()> [t.FieldOff(1)] ptr) mem)
+ (Load <t.FieldType(2)> (OffPtr <t.FieldType(2).PtrTo()> [t.FieldOff(2)] ptr) mem)
+ (Load <t.FieldType(3)> (OffPtr <t.FieldType(3).PtrTo()> [t.FieldOff(3)] ptr) mem))
+
+// Selecting field i from a Load of a struct that cannot be SSA'd loads just
+// that field from its offset, in the block of the original Load.
+(StructSelect [i] x:(Load <t> ptr mem)) && !fe.CanSSA(t) =>
+ @x.Block (Load <v.Type> (OffPtr <v.Type.PtrTo()> [t.FieldOff(int(i))] ptr) mem)
+
+// A Store of a StructMakeN is split into one Store per field at the field's
+// offset from dst. Field 0 is innermost in the memory chain (it consumes the
+// incoming mem) and the last field is outermost. Storing StructMake0 is a no-op.
+(Store _ (StructMake0) mem) => mem
+(Store dst (StructMake1 <t> f0) mem) =>
+ (Store {t.FieldType(0)} (OffPtr <t.FieldType(0).PtrTo()> [0] dst) f0 mem)
+(Store dst (StructMake2 <t> f0 f1) mem) =>
+ (Store {t.FieldType(1)}
+ (OffPtr <t.FieldType(1).PtrTo()> [t.FieldOff(1)] dst)
+ f1
+ (Store {t.FieldType(0)}
+ (OffPtr <t.FieldType(0).PtrTo()> [0] dst)
+ f0 mem))
+(Store dst (StructMake3 <t> f0 f1 f2) mem) =>
+ (Store {t.FieldType(2)}
+ (OffPtr <t.FieldType(2).PtrTo()> [t.FieldOff(2)] dst)
+ f2
+ (Store {t.FieldType(1)}
+ (OffPtr <t.FieldType(1).PtrTo()> [t.FieldOff(1)] dst)
+ f1
+ (Store {t.FieldType(0)}
+ (OffPtr <t.FieldType(0).PtrTo()> [0] dst)
+ f0 mem)))
+(Store dst (StructMake4 <t> f0 f1 f2 f3) mem) =>
+ (Store {t.FieldType(3)}
+ (OffPtr <t.FieldType(3).PtrTo()> [t.FieldOff(3)] dst)
+ f3
+ (Store {t.FieldType(2)}
+ (OffPtr <t.FieldType(2).PtrTo()> [t.FieldOff(2)] dst)
+ f2
+ (Store {t.FieldType(1)}
+ (OffPtr <t.FieldType(1).PtrTo()> [t.FieldOff(1)] dst)
+ f1
+ (Store {t.FieldType(0)}
+ (OffPtr <t.FieldType(0).PtrTo()> [0] dst)
+ f0 mem))))
+
+// Putting struct{*byte} and similar into direct interfaces.
+// The one-field struct wrapper is dropped when building an interface (IMake),
+// and selecting field 0 of an interface's data word is the data word itself.
+(IMake _typ (StructMake1 val)) => (IMake _typ val)
+(StructSelect [0] (IData x)) => (IData x)
+
+// un-SSAable values use mem->mem copies
+// A Store of a Load of a non-SSA-able type becomes a Move of t.Size() bytes
+// from src to dst. The second form allows a VarDef between the Load's memory
+// and the Store, and keeps that VarDef as the Move's memory argument.
+(Store {t} dst (Load src mem) mem) && !fe.CanSSA(t) =>
+ (Move {t} [t.Size()] dst src mem)
+(Store {t} dst (Load src mem) (VarDef {x} mem)) && !fe.CanSSA(t) =>
+ (Move {t} [t.Size()] dst src (VarDef {x} mem))
+
+// array ops
+// Selecting from an ArrayMake1 yields its single element.
+(ArraySelect (ArrayMake1 x)) => x
+
+// A Load of a zero-length array is ArrayMake0.
+(Load <t> _ _) && t.IsArray() && t.NumElem() == 0 =>
+ (ArrayMake0)
+
+// A Load of an SSA-able one-element array loads the element and wraps it.
+(Load <t> ptr mem) && t.IsArray() && t.NumElem() == 1 && fe.CanSSA(t) =>
+ (ArrayMake1 (Load <t.Elem()> ptr mem))
+
+// Storing ArrayMake0 is a no-op; storing ArrayMake1 stores its element.
+(Store _ (ArrayMake0) mem) => mem
+(Store dst (ArrayMake1 e) mem) => (Store {e.Type} dst e mem)
+
+// Putting [1]*byte and similar into direct interfaces.
+(IMake _typ (ArrayMake1 val)) => (IMake _typ val)
+(ArraySelect [0] (IData x)) => (IData x)
+
+// string ops
+// Decomposing StringMake and lowering of StringPtr and StringLen
+// happens in a later pass, dec, so that these operations are available
+// to other passes for optimizations.
+// Here, StringPtr/StringLen of a StringMake with an Addr pointer or a constant
+// length are resolved directly. ConstString expands to a StringMake: a nil
+// pointer and zero length for the empty string, otherwise the address of the
+// string data (relative to SB) and len(str). The length constant is 32 or 64
+// bits wide according to config.PtrSize.
+(StringPtr (StringMake (Addr <t> {s} base) _)) => (Addr <t> {s} base)
+(StringLen (StringMake _ (Const64 <t> [c]))) => (Const64 <t> [c])
+(ConstString {str}) && config.PtrSize == 4 && str == "" =>
+ (StringMake (ConstNil) (Const32 <typ.Int> [0]))
+(ConstString {str}) && config.PtrSize == 8 && str == "" =>
+ (StringMake (ConstNil) (Const64 <typ.Int> [0]))
+(ConstString {str}) && config.PtrSize == 4 && str != "" =>
+ (StringMake
+ (Addr <typ.BytePtr> {fe.StringData(str)}
+ (SB))
+ (Const32 <typ.Int> [int32(len(str))]))
+(ConstString {str}) && config.PtrSize == 8 && str != "" =>
+ (StringMake
+ (Addr <typ.BytePtr> {fe.StringData(str)}
+ (SB))
+ (Const64 <typ.Int> [int64(len(str))]))
+
+// slice ops
+// Only a few slice rules are provided here. See dec.rules for
+// a more comprehensive set.
+// SliceLen/SliceCap/SlicePtr of a SliceMake are resolved when the matching
+// component is a constant or is itself a SlicePtr/SliceLen/SliceCap.
+// ConstSlice expands to a SliceMake of a nil pointer with zero length and
+// capacity, using 32- or 64-bit constants according to config.PtrSize.
+(SliceLen (SliceMake _ (Const64 <t> [c]) _)) => (Const64 <t> [c])
+(SliceCap (SliceMake _ _ (Const64 <t> [c]))) => (Const64 <t> [c])
+(SliceLen (SliceMake _ (Const32 <t> [c]) _)) => (Const32 <t> [c])
+(SliceCap (SliceMake _ _ (Const32 <t> [c]))) => (Const32 <t> [c])
+(SlicePtr (SliceMake (SlicePtr x) _ _)) => (SlicePtr x)
+(SliceLen (SliceMake _ (SliceLen x) _)) => (SliceLen x)
+(SliceCap (SliceMake _ _ (SliceCap x))) => (SliceCap x)
+(SliceCap (SliceMake _ _ (SliceLen x))) => (SliceLen x)
+(ConstSlice) && config.PtrSize == 4 =>
+ (SliceMake
+ (ConstNil <v.Type.Elem().PtrTo()>)
+ (Const32 <typ.Int> [0])
+ (Const32 <typ.Int> [0]))
+(ConstSlice) && config.PtrSize == 8 =>
+ (SliceMake
+ (ConstNil <v.Type.Elem().PtrTo()>)
+ (Const64 <typ.Int> [0])
+ (Const64 <typ.Int> [0]))
+
+// interface ops: ConstInterface expands to an IMake of two nil words,
+// typed Uintptr and BytePtr respectively.
+(ConstInterface) =>
+ (IMake
+ (ConstNil <typ.Uintptr>)
+ (ConstNil <typ.BytePtr>))
+
+// A nil check of the value produced by GetG is dropped.
+(NilCheck (GetG mem) mem) => mem
+
+// Branching on a negated condition swaps the successors instead; branching on
+// a constant becomes a First block whose first successor is the taken one.
+(If (Not cond) yes no) => (If cond no yes)
+(If (ConstBool [c]) yes no) && c => (First yes no)
+(If (ConstBool [c]) yes no) && !c => (First no yes)
+
+// A Phi of two Nots that have no other uses becomes a Not of the Phi.
+(Phi <t> nx:(Not x) ny:(Not y)) && nx.Uses == 1 && ny.Uses == 1 => (Not (Phi <t> x y))
+
+// Get rid of Convert ops for pointer arithmetic on unsafe.Pointer.
+// Convert(Convert(ptr) + off) under the same memory state becomes AddPtr ptr off,
+// and a Convert of a Convert under the same memory state is the original ptr.
+(Convert (Add(64|32) (Convert ptr mem) off) mem) => (AddPtr ptr off)
+(Convert (Convert ptr mem) mem) => ptr
+
+// strength reduction of divide by a constant.
+// See ../magic.go for a detailed description of these algorithms.
+
+// Unsigned divide by power of 2. Strength reduce to a shift.
+// The last rule handles the divisor 1<<63, which appears as the int64 constant
+// -1<<63 (presumably not accepted by isPowerOfTwo64 because it is negative).
+(Div8u n (Const8 [c])) && isPowerOfTwo8(c) => (Rsh8Ux64 n (Const64 <typ.UInt64> [log8(c)]))
+(Div16u n (Const16 [c])) && isPowerOfTwo16(c) => (Rsh16Ux64 n (Const64 <typ.UInt64> [log16(c)]))
+(Div32u n (Const32 [c])) && isPowerOfTwo32(c) => (Rsh32Ux64 n (Const64 <typ.UInt64> [log32(c)]))
+(Div64u n (Const64 [c])) && isPowerOfTwo64(c) => (Rsh64Ux64 n (Const64 <typ.UInt64> [log64(c)]))
+(Div64u n (Const64 [-1<<63])) => (Rsh64Ux64 n (Const64 <typ.UInt64> [63]))
+
+// Signed non-negative divide by power of 2.
+// When the dividend is known non-negative, the signed divide is an unsigned
+// right shift. Dividing a non-negative n by -1<<63 gives 0.
+(Div8 n (Const8 [c])) && isNonNegative(n) && isPowerOfTwo8(c) => (Rsh8Ux64 n (Const64 <typ.UInt64> [log8(c)]))
+(Div16 n (Const16 [c])) && isNonNegative(n) && isPowerOfTwo16(c) => (Rsh16Ux64 n (Const64 <typ.UInt64> [log16(c)]))
+(Div32 n (Const32 [c])) && isNonNegative(n) && isPowerOfTwo32(c) => (Rsh32Ux64 n (Const64 <typ.UInt64> [log32(c)]))
+(Div64 n (Const64 [c])) && isNonNegative(n) && isPowerOfTwo64(c) => (Rsh64Ux64 n (Const64 <typ.UInt64> [log64(c)]))
+(Div64 n (Const64 [-1<<63])) && isNonNegative(n) => (Const64 [0])
+
+// Unsigned divide, not a power of 2. Strength reduce to a multiply.
+// For 8-bit divides, we just do a direct 9-bit by 8-bit multiply.
+// x is zero-extended to 32 bits, multiplied by 1<<8 + umagic8(c).m, shifted
+// right by 8 + umagic8(c).s, and truncated back to 8 bits.
+(Div8u x (Const8 [c])) && umagicOK8(c) =>
+ (Trunc32to8
+ (Rsh32Ux64 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(1<<8+umagic8(c).m)])
+ (ZeroExt8to32 x))
+ (Const64 <typ.UInt64> [8+umagic8(c).s])))
+
+// For 16-bit divides on 64-bit machines, we do a direct 17-bit by 16-bit multiply.
+// x is zero-extended to 64 bits, multiplied by 1<<16 + umagic16(c).m, shifted
+// right by 16 + umagic16(c).s, and truncated back to 16 bits.
+(Div16u x (Const16 [c])) && umagicOK16(c) && config.RegSize == 8 =>
+ (Trunc64to16
+ (Rsh64Ux64 <typ.UInt64>
+ (Mul64 <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(1<<16+umagic16(c).m)])
+ (ZeroExt16to64 x))
+ (Const64 <typ.UInt64> [16+umagic16(c).s])))
+
+// For 16-bit divides on 32-bit machines
+// Three forms, all computed in 32 bits and truncated back to 16:
+//  - the magic multiplier is even: halve it and shift by one less;
+//  - the divisor c is even: pre-shift x right by 1 and use the rounded-up
+//    half multiplier, shifting by two less;
+//  - config.useAvg: combine x<<16 with x*m through Avg32u.
+(Div16u x (Const16 [c])) && umagicOK16(c) && config.RegSize == 4 && umagic16(c).m&1 == 0 =>
+ (Trunc32to16
+ (Rsh32Ux64 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(1<<15+umagic16(c).m/2)])
+ (ZeroExt16to32 x))
+ (Const64 <typ.UInt64> [16+umagic16(c).s-1])))
+(Div16u x (Const16 [c])) && umagicOK16(c) && config.RegSize == 4 && c&1 == 0 =>
+ (Trunc32to16
+ (Rsh32Ux64 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(1<<15+(umagic16(c).m+1)/2)])
+ (Rsh32Ux64 <typ.UInt32> (ZeroExt16to32 x) (Const64 <typ.UInt64> [1])))
+ (Const64 <typ.UInt64> [16+umagic16(c).s-2])))
+(Div16u x (Const16 [c])) && umagicOK16(c) && config.RegSize == 4 && config.useAvg =>
+ (Trunc32to16
+ (Rsh32Ux64 <typ.UInt32>
+ (Avg32u
+ (Lsh32x64 <typ.UInt32> (ZeroExt16to32 x) (Const64 <typ.UInt64> [16]))
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(umagic16(c).m)])
+ (ZeroExt16to32 x)))
+ (Const64 <typ.UInt64> [16+umagic16(c).s-1])))
+
+// For 32-bit divides on 32-bit machines
+// Same three forms as the 16-bit case, but using the high half of the 32x32
+// product (Hmul32u); all require config.useHmul, and the Avg32u form also
+// requires config.useAvg.
+(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 4 && umagic32(c).m&1 == 0 && config.useHmul =>
+ (Rsh32Ux64 <typ.UInt32>
+ (Hmul32u <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(1<<31+umagic32(c).m/2)])
+ x)
+ (Const64 <typ.UInt64> [umagic32(c).s-1]))
+(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 4 && c&1 == 0 && config.useHmul =>
+ (Rsh32Ux64 <typ.UInt32>
+ (Hmul32u <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(1<<31+(umagic32(c).m+1)/2)])
+ (Rsh32Ux64 <typ.UInt32> x (Const64 <typ.UInt64> [1])))
+ (Const64 <typ.UInt64> [umagic32(c).s-2]))
+(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 4 && config.useAvg && config.useHmul =>
+ (Rsh32Ux64 <typ.UInt32>
+ (Avg32u
+ x
+ (Hmul32u <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(umagic32(c).m)])
+ x))
+ (Const64 <typ.UInt64> [umagic32(c).s-1]))
+
+// For 32-bit divides on 64-bit machines
+// We'll use a regular (non-hi) multiply for this case.
+// x is zero-extended to 64 bits, multiplied by the adjusted magic constant,
+// shifted right by 32 plus the adjusted shift, and truncated back to 32 bits.
+// First form: the magic multiplier is even. Second form: the divisor c is even,
+// so x is pre-shifted right by 1.
+(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 8 && umagic32(c).m&1 == 0 =>
+ (Trunc64to32
+ (Rsh64Ux64 <typ.UInt64>
+ (Mul64 <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(1<<31+umagic32(c).m/2)])
+ (ZeroExt32to64 x))
+ (Const64 <typ.UInt64> [32+umagic32(c).s-1])))
+(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 8 && c&1 == 0 =>
+ (Trunc64to32
+ (Rsh64Ux64 <typ.UInt64>
+ (Mul64 <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(1<<31+(umagic32(c).m+1)/2)])
+ (Rsh64Ux64 <typ.UInt64> (ZeroExt32to64 x) (Const64 <typ.UInt64> [1])))
+ (Const64 <typ.UInt64> [32+umagic32(c).s-2])))
+// General form for 32-bit divides on 64-bit machines (requires config.useAvg):
+// average x<<32 with the 64-bit product m*x, then shift right by 32+s-1.
+// The magic constant is a Const64 feeding a Mul64, so it is typed UInt64
+// like the constants in the other 64-bit-machine Div32u rules.
+(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 8 && config.useAvg =>
+ (Trunc64to32
+ (Rsh64Ux64 <typ.UInt64>
+ (Avg64u
+ (Lsh64x64 <typ.UInt64> (ZeroExt32to64 x) (Const64 <typ.UInt64> [32]))
+ (Mul64 <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(umagic32(c).m)])
+ (ZeroExt32to64 x)))
+ (Const64 <typ.UInt64> [32+umagic32(c).s-1])))
+
+// For unsigned 64-bit divides on 32-bit machines,
+// if the constant fits in 16 bits (so that the last term
+// fits in 32 bits), convert to three 32-bit divides by a constant.
+//
+// If 1<<32 = Q * c + R
+// and x = hi << 32 + lo
+//
+// Then x = (hi/c*c + hi%c) << 32 + lo
+// = hi/c*c<<32 + hi%c<<32 + lo
+// = hi/c*c<<32 + (hi%c)*(Q*c+R) + lo/c*c + lo%c
+// = hi/c*c<<32 + (hi%c)*Q*c + lo/c*c + (hi%c*R+lo%c)
+// and x / c = (hi/c)<<32 + (hi%c)*Q + lo/c + (hi%c*R+lo%c)/c
+(Div64u x (Const64 [c])) && c > 0 && c <= 0xFFFF && umagicOK32(int32(c)) && config.RegSize == 4 && config.useHmul =>
+ (Add64
+ (Add64 <typ.UInt64>
+ (Add64 <typ.UInt64>
+ (Lsh64x64 <typ.UInt64> // term (hi/c)<<32
+ (ZeroExt32to64
+ (Div32u <typ.UInt32>
+ (Trunc64to32 <typ.UInt32> (Rsh64Ux64 <typ.UInt64> x (Const64 <typ.UInt64> [32])))
+ (Const32 <typ.UInt32> [int32(c)])))
+ (Const64 <typ.UInt64> [32]))
+ (ZeroExt32to64 (Div32u <typ.UInt32> (Trunc64to32 <typ.UInt32> x) (Const32 <typ.UInt32> [int32(c)])))) // term lo/c
+ (Mul64 <typ.UInt64> // term (hi%c)*Q, with Q = (1<<32)/c
+ (ZeroExt32to64 <typ.UInt64>
+ (Mod32u <typ.UInt32>
+ (Trunc64to32 <typ.UInt32> (Rsh64Ux64 <typ.UInt64> x (Const64 <typ.UInt64> [32])))
+ (Const32 <typ.UInt32> [int32(c)])))
+ (Const64 <typ.UInt64> [int64((1<<32)/c)])))
+ (ZeroExt32to64 // term (hi%c*R+lo%c)/c, with R = (1<<32)%c, computed entirely in 32 bits
+ (Div32u <typ.UInt32>
+ (Add32 <typ.UInt32>
+ (Mod32u <typ.UInt32> (Trunc64to32 <typ.UInt32> x) (Const32 <typ.UInt32> [int32(c)]))
+ (Mul32 <typ.UInt32>
+ (Mod32u <typ.UInt32>
+ (Trunc64to32 <typ.UInt32> (Rsh64Ux64 <typ.UInt64> x (Const64 <typ.UInt64> [32])))
+ (Const32 <typ.UInt32> [int32(c)]))
+ (Const32 <typ.UInt32> [int32((1<<32)%c)])))
+ (Const32 <typ.UInt32> [int32(c)]))))
+
+// For 64-bit divides on 64-bit machines. Every variant below requires config.useHmul.
+// (Apart from the small-constant case handled above, 64-bit divides on 32-bit machines are lowered to a runtime call by the walk pass.)
+(Div64u x (Const64 [c])) && umagicOK64(c) && config.RegSize == 8 && umagic64(c).m&1 == 0 && config.useHmul => // Even magic constant m: Hmul64u by 1<<63+m/2, then shift right by s-1.
+ (Rsh64Ux64 <typ.UInt64>
+ (Hmul64u <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(1<<63+umagic64(c).m/2)])
+ x)
+ (Const64 <typ.UInt64> [umagic64(c).s-1]))
+(Div64u x (Const64 [c])) && umagicOK64(c) && config.RegSize == 8 && c&1 == 0 && config.useHmul => // Even divisor c: halve x first, Hmul64u by 1<<63+(m+1)/2, then shift right by s-2.
+ (Rsh64Ux64 <typ.UInt64>
+ (Hmul64u <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(1<<63+(umagic64(c).m+1)/2)])
+ (Rsh64Ux64 <typ.UInt64> x (Const64 <typ.UInt64> [1])))
+ (Const64 <typ.UInt64> [umagic64(c).s-2]))
+(Div64u x (Const64 [c])) && umagicOK64(c) && config.RegSize == 8 && config.useAvg && config.useHmul => // General form (also needs config.useAvg): Avg64u of x and Hmul64u(m, x), then shift right by s-1.
+ (Rsh64Ux64 <typ.UInt64>
+ (Avg64u
+ x
+ (Hmul64u <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(umagic64(c).m)])
+ x))
+ (Const64 <typ.UInt64> [umagic64(c).s-1]))
+
+// Signed divide by a negative constant. Rewrite to divide by the positive constant -c and negate the quotient. The most-negative constant is excluded because -c would not be representable.
+(Div8 <t> n (Const8 [c])) && c < 0 && c != -1<<7 => (Neg8 (Div8 <t> n (Const8 <t> [-c])))
+(Div16 <t> n (Const16 [c])) && c < 0 && c != -1<<15 => (Neg16 (Div16 <t> n (Const16 <t> [-c])))
+(Div32 <t> n (Const32 [c])) && c < 0 && c != -1<<31 => (Neg32 (Div32 <t> n (Const32 <t> [-c])))
+(Div64 <t> n (Const64 [c])) && c < 0 && c != -1<<63 => (Neg64 (Div64 <t> n (Const64 <t> [-c])))
+
+// Dividing by the most-negative number. Result is always 0 except
+// if the input is also the most-negative number, in which case it is 1.
+// We can detect that using the sign bit of x & -x, which an unsigned shift then moves down to bit 0.
+(Div8 <t> x (Const8 [-1<<7 ])) => (Rsh8Ux64 (And8 <t> x (Neg8 <t> x)) (Const64 <typ.UInt64> [7 ]))
+(Div16 <t> x (Const16 [-1<<15])) => (Rsh16Ux64 (And16 <t> x (Neg16 <t> x)) (Const64 <typ.UInt64> [15]))
+(Div32 <t> x (Const32 [-1<<31])) => (Rsh32Ux64 (And32 <t> x (Neg32 <t> x)) (Const64 <typ.UInt64> [31]))
+(Div64 <t> x (Const64 [-1<<63])) => (Rsh64Ux64 (And64 <t> x (Neg64 <t> x)) (Const64 <typ.UInt64> [63]))
+
+// Signed divide by power of 2.
+// n / c = n >> log(c) if n >= 0
+// = (n+c-1) >> log(c) if n < 0
+// We conditionally add c-1 by adding n>>63>>(64-log(c)) (first shift signed, second shift unsigned); the narrower rules use bits-1 and bits in place of 63 and 64.
+(Div8 <t> n (Const8 [c])) && isPowerOfTwo8(c) =>
+ (Rsh8x64
+ (Add8 <t> n (Rsh8Ux64 <t> (Rsh8x64 <t> n (Const64 <typ.UInt64> [ 7])) (Const64 <typ.UInt64> [int64( 8-log8(c))])))
+ (Const64 <typ.UInt64> [int64(log8(c))]))
+(Div16 <t> n (Const16 [c])) && isPowerOfTwo16(c) =>
+ (Rsh16x64
+ (Add16 <t> n (Rsh16Ux64 <t> (Rsh16x64 <t> n (Const64 <typ.UInt64> [15])) (Const64 <typ.UInt64> [int64(16-log16(c))])))
+ (Const64 <typ.UInt64> [int64(log16(c))]))
+(Div32 <t> n (Const32 [c])) && isPowerOfTwo32(c) =>
+ (Rsh32x64
+ (Add32 <t> n (Rsh32Ux64 <t> (Rsh32x64 <t> n (Const64 <typ.UInt64> [31])) (Const64 <typ.UInt64> [int64(32-log32(c))])))
+ (Const64 <typ.UInt64> [int64(log32(c))]))
+(Div64 <t> n (Const64 [c])) && isPowerOfTwo64(c) =>
+ (Rsh64x64
+ (Add64 <t> n (Rsh64Ux64 <t> (Rsh64x64 <t> n (Const64 <typ.UInt64> [63])) (Const64 <typ.UInt64> [int64(64-log64(c))])))
+ (Const64 <typ.UInt64> [int64(log64(c))]))
+
+// Signed divide, not a power of 2. Strength reduce to a multiply. Every rule subtracts x's sign word (x shifted right arithmetically by bits-1) from the shifted product.
+(Div8 <t> x (Const8 [c])) && smagicOK8(c) => // 8-bit: multiply in 32 bits, shift right by 8+s.
+ (Sub8 <t>
+ (Rsh32x64 <t>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(smagic8(c).m)])
+ (SignExt8to32 x))
+ (Const64 <typ.UInt64> [8+smagic8(c).s]))
+ (Rsh32x64 <t>
+ (SignExt8to32 x)
+ (Const64 <typ.UInt64> [31])))
+(Div16 <t> x (Const16 [c])) && smagicOK16(c) => // 16-bit: multiply in 32 bits, shift right by 16+s.
+ (Sub16 <t>
+ (Rsh32x64 <t>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(smagic16(c).m)])
+ (SignExt16to32 x))
+ (Const64 <typ.UInt64> [16+smagic16(c).s]))
+ (Rsh32x64 <t>
+ (SignExt16to32 x)
+ (Const64 <typ.UInt64> [31])))
+(Div32 <t> x (Const32 [c])) && smagicOK32(c) && config.RegSize == 8 => // 32-bit on 64-bit machines: multiply in 64 bits, shift right by 32+s.
+ (Sub32 <t>
+ (Rsh64x64 <t>
+ (Mul64 <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(smagic32(c).m)])
+ (SignExt32to64 x))
+ (Const64 <typ.UInt64> [32+smagic32(c).s]))
+ (Rsh64x64 <t>
+ (SignExt32to64 x)
+ (Const64 <typ.UInt64> [63])))
+(Div32 <t> x (Const32 [c])) && smagicOK32(c) && config.RegSize == 4 && smagic32(c).m&1 == 0 && config.useHmul => // 32-bit on 32-bit machines, even m: Hmul32 by m/2, shift right by s-1.
+ (Sub32 <t>
+ (Rsh32x64 <t>
+ (Hmul32 <t>
+ (Const32 <typ.UInt32> [int32(smagic32(c).m/2)])
+ x)
+ (Const64 <typ.UInt64> [smagic32(c).s-1]))
+ (Rsh32x64 <t>
+ x
+ (Const64 <typ.UInt64> [31])))
+(Div32 <t> x (Const32 [c])) && smagicOK32(c) && config.RegSize == 4 && smagic32(c).m&1 != 0 && config.useHmul => // 32-bit on 32-bit machines, odd m: Hmul32 by m, add x, shift right by s.
+ (Sub32 <t>
+ (Rsh32x64 <t>
+ (Add32 <t>
+ (Hmul32 <t>
+ (Const32 <typ.UInt32> [int32(smagic32(c).m)])
+ x)
+ x)
+ (Const64 <typ.UInt64> [smagic32(c).s]))
+ (Rsh32x64 <t>
+ x
+ (Const64 <typ.UInt64> [31])))
+(Div64 <t> x (Const64 [c])) && smagicOK64(c) && smagic64(c).m&1 == 0 && config.useHmul => // 64-bit, even m: Hmul64 by m/2, shift right by s-1.
+ (Sub64 <t>
+ (Rsh64x64 <t>
+ (Hmul64 <t>
+ (Const64 <typ.UInt64> [int64(smagic64(c).m/2)])
+ x)
+ (Const64 <typ.UInt64> [smagic64(c).s-1]))
+ (Rsh64x64 <t>
+ x
+ (Const64 <typ.UInt64> [63])))
+(Div64 <t> x (Const64 [c])) && smagicOK64(c) && smagic64(c).m&1 != 0 && config.useHmul => // 64-bit, odd m: Hmul64 by m, add x, shift right by s.
+ (Sub64 <t>
+ (Rsh64x64 <t>
+ (Add64 <t>
+ (Hmul64 <t>
+ (Const64 <typ.UInt64> [int64(smagic64(c).m)])
+ x)
+ x)
+ (Const64 <typ.UInt64> [smagic64(c).s]))
+ (Rsh64x64 <t>
+ x
+ (Const64 <typ.UInt64> [63])))
+
+// Unsigned mod by power of 2 constant: keep only the low log2(c) bits by masking with c-1.
+(Mod8u <t> n (Const8 [c])) && isPowerOfTwo8(c) => (And8 n (Const8 <t> [c-1]))
+(Mod16u <t> n (Const16 [c])) && isPowerOfTwo16(c) => (And16 n (Const16 <t> [c-1]))
+(Mod32u <t> n (Const32 [c])) && isPowerOfTwo32(c) => (And32 n (Const32 <t> [c-1]))
+(Mod64u <t> n (Const64 [c])) && isPowerOfTwo64(c) => (And64 n (Const64 <t> [c-1]))
+(Mod64u <t> n (Const64 [-1<<63])) => (And64 n (Const64 <t> [1<<63-1])) // Unsigned 1<<63 is the int64 value -1<<63; NOTE(review): presumably isPowerOfTwo64 rejects it, hence the separate rule.
+
+// Signed non-negative mod by power of 2 constant: with n known non-negative the unsigned mask trick applies.
+(Mod8 <t> n (Const8 [c])) && isNonNegative(n) && isPowerOfTwo8(c) => (And8 n (Const8 <t> [c-1]))
+(Mod16 <t> n (Const16 [c])) && isNonNegative(n) && isPowerOfTwo16(c) => (And16 n (Const16 <t> [c-1]))
+(Mod32 <t> n (Const32 [c])) && isNonNegative(n) && isPowerOfTwo32(c) => (And32 n (Const32 <t> [c-1]))
+(Mod64 <t> n (Const64 [c])) && isNonNegative(n) && isPowerOfTwo64(c) => (And64 n (Const64 <t> [c-1]))
+(Mod64 n (Const64 [-1<<63])) && isNonNegative(n) => n // A non-negative int64 is smaller in magnitude than 1<<63, so the remainder is n itself.
+
+// Signed mod by negative constant. The remainder takes the sign of the dividend, so n % c == n % -c; the most-negative constant cannot be negated and is excluded.
+(Mod8 <t> n (Const8 [c])) && c < 0 && c != -1<<7 => (Mod8 <t> n (Const8 <t> [-c]))
+(Mod16 <t> n (Const16 [c])) && c < 0 && c != -1<<15 => (Mod16 <t> n (Const16 <t> [-c]))
+(Mod32 <t> n (Const32 [c])) && c < 0 && c != -1<<31 => (Mod32 <t> n (Const32 <t> [-c]))
+(Mod64 <t> n (Const64 [c])) && c < 0 && c != -1<<63 => (Mod64 <t> n (Const64 <t> [-c]))
+
+// All other mods by constants, do A%B = A-(A/B*B).
+// This implements % with two * (one for *B, one once the Div rules turn A/B into a multiply) and a bunch of ancillary ops.
+// One of the * is free if the user's code also computes A/B. Constant dividends (x.Op == OpConst*) are not rewritten here.
+(Mod8 <t> x (Const8 [c])) && x.Op != OpConst8 && (c > 0 || c == -1<<7)
+ => (Sub8 x (Mul8 <t> (Div8 <t> x (Const8 <t> [c])) (Const8 <t> [c])))
+(Mod16 <t> x (Const16 [c])) && x.Op != OpConst16 && (c > 0 || c == -1<<15)
+ => (Sub16 x (Mul16 <t> (Div16 <t> x (Const16 <t> [c])) (Const16 <t> [c])))
+(Mod32 <t> x (Const32 [c])) && x.Op != OpConst32 && (c > 0 || c == -1<<31)
+ => (Sub32 x (Mul32 <t> (Div32 <t> x (Const32 <t> [c])) (Const32 <t> [c])))
+(Mod64 <t> x (Const64 [c])) && x.Op != OpConst64 && (c > 0 || c == -1<<63)
+ => (Sub64 x (Mul64 <t> (Div64 <t> x (Const64 <t> [c])) (Const64 <t> [c])))
+(Mod8u <t> x (Const8 [c])) && x.Op != OpConst8 && c > 0 && umagicOK8( c) // Unsigned forms additionally require a usable magic constant (umagicOK).
+ => (Sub8 x (Mul8 <t> (Div8u <t> x (Const8 <t> [c])) (Const8 <t> [c])))
+(Mod16u <t> x (Const16 [c])) && x.Op != OpConst16 && c > 0 && umagicOK16(c)
+ => (Sub16 x (Mul16 <t> (Div16u <t> x (Const16 <t> [c])) (Const16 <t> [c])))
+(Mod32u <t> x (Const32 [c])) && x.Op != OpConst32 && c > 0 && umagicOK32(c)
+ => (Sub32 x (Mul32 <t> (Div32u <t> x (Const32 <t> [c])) (Const32 <t> [c])))
+(Mod64u <t> x (Const64 [c])) && x.Op != OpConst64 && c > 0 && umagicOK64(c)
+ => (Sub64 x (Mul64 <t> (Div64u <t> x (Const64 <t> [c])) (Const64 <t> [c])))
+
+// For architectures without rotates narrower than 32 bits (!hasSmallRotate(config)), promote these 8- and 16-bit divisibility checks to 32 bits.
+(Eq8 (Mod8u x (Const8 [c])) (Const8 [0])) && x.Op != OpConst8 && udivisibleOK8(c) && !hasSmallRotate(config) => // Unsigned: zero-extend x and reinterpret c as unsigned before widening.
+ (Eq32 (Mod32u <typ.UInt32> (ZeroExt8to32 <typ.UInt32> x) (Const32 <typ.UInt32> [int32(uint8(c))])) (Const32 <typ.UInt32> [0]))
+(Eq16 (Mod16u x (Const16 [c])) (Const16 [0])) && x.Op != OpConst16 && udivisibleOK16(c) && !hasSmallRotate(config) =>
+ (Eq32 (Mod32u <typ.UInt32> (ZeroExt16to32 <typ.UInt32> x) (Const32 <typ.UInt32> [int32(uint16(c))])) (Const32 <typ.UInt32> [0]))
+(Eq8 (Mod8 x (Const8 [c])) (Const8 [0])) && x.Op != OpConst8 && sdivisibleOK8(c) && !hasSmallRotate(config) => // Signed: sign-extend both x and c.
+ (Eq32 (Mod32 <typ.Int32> (SignExt8to32 <typ.Int32> x) (Const32 <typ.Int32> [int32(c)])) (Const32 <typ.Int32> [0]))
+(Eq16 (Mod16 x (Const16 [c])) (Const16 [0])) && x.Op != OpConst16 && sdivisibleOK16(c) && !hasSmallRotate(config) =>
+ (Eq32 (Mod32 <typ.Int32> (SignExt16to32 <typ.Int32> x) (Const32 <typ.Int32> [int32(c)])) (Const32 <typ.Int32> [0]))
+
+// Divisibility checks x%c == 0 convert to multiply and rotate.
+// Note, x%c == 0 is rewritten as x == c*(x/c) during the opt pass
+// where (x/c) is performed using multiplication with magic constants.
+// To rewrite x%c == 0 requires pattern matching the rewritten expression
+// and checking that the division by the same constant wasn't already calculated.
+// This check is made by counting uses of the magic constant multiplication.
+// Note that if there were an intermediate opt pass, this rule could be applied
+// directly on the Div op and magic division rewrites could be delayed to late opt.
+
+// Unsigned divisibility checks convert to multiply and rotate: x == c*(x/c) becomes RotateLeft(x*udivisible(c).m, bits-k) <= udivisible(c).max (unsigned compare).
+(Eq8 x (Mul8 (Const8 [c]) // Matches an 8-bit quotient computed as Trunc32to8((m * zext(x)) >> s); NOTE(review): presumably the Div8u magic lowering, which is not in this part of the file.
+ (Trunc32to8
+ (Rsh32Ux64
+ mul:(Mul32
+ (Const32 [m])
+ (ZeroExt8to32 x))
+ (Const64 [s])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int32(1<<8+umagic8(c).m) && s == 8+umagic8(c).s
+ && x.Op != OpConst8 && udivisibleOK8(c)
+ => (Leq8U
+ (RotateLeft8 <typ.UInt8>
+ (Mul8 <typ.UInt8>
+ (Const8 <typ.UInt8> [int8(udivisible8(c).m)])
+ x)
+ (Const8 <typ.UInt8> [int8(8-udivisible8(c).k)])
+ )
+ (Const8 <typ.UInt8> [int8(udivisible8(c).max)])
+ )
+
+(Eq16 x (Mul16 (Const16 [c]) // Quotient computed with a 64-bit multiply: m = 1<<16+umagic16(c).m, s = 16+umagic16(c).s.
+ (Trunc64to16
+ (Rsh64Ux64
+ mul:(Mul64
+ (Const64 [m])
+ (ZeroExt16to64 x))
+ (Const64 [s])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int64(1<<16+umagic16(c).m) && s == 16+umagic16(c).s
+ && x.Op != OpConst16 && udivisibleOK16(c)
+ => (Leq16U
+ (RotateLeft16 <typ.UInt16>
+ (Mul16 <typ.UInt16>
+ (Const16 <typ.UInt16> [int16(udivisible16(c).m)])
+ x)
+ (Const16 <typ.UInt16> [int16(16-udivisible16(c).k)])
+ )
+ (Const16 <typ.UInt16> [int16(udivisible16(c).max)])
+ )
+// Quotient computed with a 32-bit multiply by the halved magic constant: m = 1<<15+umagic16(c).m/2, s = 16+umagic16(c).s-1.
+(Eq16 x (Mul16 (Const16 [c])
+ (Trunc32to16
+ (Rsh32Ux64
+ mul:(Mul32
+ (Const32 [m])
+ (ZeroExt16to32 x))
+ (Const64 [s])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int32(1<<15+umagic16(c).m/2) && s == 16+umagic16(c).s-1
+ && x.Op != OpConst16 && udivisibleOK16(c)
+ => (Leq16U
+ (RotateLeft16 <typ.UInt16>
+ (Mul16 <typ.UInt16>
+ (Const16 <typ.UInt16> [int16(udivisible16(c).m)])
+ x)
+ (Const16 <typ.UInt16> [int16(16-udivisible16(c).k)])
+ )
+ (Const16 <typ.UInt16> [int16(udivisible16(c).max)])
+ )
+// Quotient computed with x pre-shifted right by 1 (the even-divisor Div16u form): m = 1<<15+(umagic16(c).m+1)/2, s = 16+umagic16(c).s-2.
+(Eq16 x (Mul16 (Const16 [c])
+ (Trunc32to16
+ (Rsh32Ux64
+ mul:(Mul32
+ (Const32 [m])
+ (Rsh32Ux64 (ZeroExt16to32 x) (Const64 [1])))
+ (Const64 [s])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int32(1<<15+(umagic16(c).m+1)/2) && s == 16+umagic16(c).s-2
+ && x.Op != OpConst16 && udivisibleOK16(c)
+ => (Leq16U
+ (RotateLeft16 <typ.UInt16>
+ (Mul16 <typ.UInt16>
+ (Const16 <typ.UInt16> [int16(udivisible16(c).m)])
+ x)
+ (Const16 <typ.UInt16> [int16(16-udivisible16(c).k)])
+ )
+ (Const16 <typ.UInt16> [int16(udivisible16(c).max)])
+ )
+// Quotient computed with Avg32u of x<<16 and m*x (the Avg Div16u form): m = umagic16(c).m, s = 16+umagic16(c).s-1.
+(Eq16 x (Mul16 (Const16 [c])
+ (Trunc32to16
+ (Rsh32Ux64
+ (Avg32u
+ (Lsh32x64 (ZeroExt16to32 x) (Const64 [16]))
+ mul:(Mul32
+ (Const32 [m])
+ (ZeroExt16to32 x)))
+ (Const64 [s])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int32(umagic16(c).m) && s == 16+umagic16(c).s-1
+ && x.Op != OpConst16 && udivisibleOK16(c)
+ => (Leq16U
+ (RotateLeft16 <typ.UInt16>
+ (Mul16 <typ.UInt16>
+ (Const16 <typ.UInt16> [int16(udivisible16(c).m)])
+ x)
+ (Const16 <typ.UInt16> [int16(16-udivisible16(c).k)])
+ )
+ (Const16 <typ.UInt16> [int16(udivisible16(c).max)])
+ )
+
+(Eq32 x (Mul32 (Const32 [c]) // Quotient computed via Hmul32u (even-magic Div32u form on 32-bit machines): m = 1<<31+umagic32(c).m/2, s = umagic32(c).s-1.
+ (Rsh32Ux64
+ mul:(Hmul32u
+ (Const32 [m])
+ x)
+ (Const64 [s]))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int32(1<<31+umagic32(c).m/2) && s == umagic32(c).s-1
+ && x.Op != OpConst32 && udivisibleOK32(c)
+ => (Leq32U
+ (RotateLeft32 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(udivisible32(c).m)])
+ x)
+ (Const32 <typ.UInt32> [int32(32-udivisible32(c).k)])
+ )
+ (Const32 <typ.UInt32> [int32(udivisible32(c).max)])
+ )
+// Quotient computed via Hmul32u with x pre-shifted right by 1 (even-divisor form): m = 1<<31+(umagic32(c).m+1)/2, s = umagic32(c).s-2.
+(Eq32 x (Mul32 (Const32 [c])
+ (Rsh32Ux64
+ mul:(Hmul32u
+ (Const32 <typ.UInt32> [m])
+ (Rsh32Ux64 x (Const64 [1])))
+ (Const64 [s]))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int32(1<<31+(umagic32(c).m+1)/2) && s == umagic32(c).s-2
+ && x.Op != OpConst32 && udivisibleOK32(c)
+ => (Leq32U
+ (RotateLeft32 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(udivisible32(c).m)])
+ x)
+ (Const32 <typ.UInt32> [int32(32-udivisible32(c).k)])
+ )
+ (Const32 <typ.UInt32> [int32(udivisible32(c).max)])
+ )
+// Quotient computed via Avg32u of x and Hmul32u(m, x): m = umagic32(c).m, s = umagic32(c).s-1.
+(Eq32 x (Mul32 (Const32 [c])
+ (Rsh32Ux64
+ (Avg32u
+ x
+ mul:(Hmul32u
+ (Const32 [m])
+ x))
+ (Const64 [s]))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int32(umagic32(c).m) && s == umagic32(c).s-1
+ && x.Op != OpConst32 && udivisibleOK32(c)
+ => (Leq32U
+ (RotateLeft32 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(udivisible32(c).m)])
+ x)
+ (Const32 <typ.UInt32> [int32(32-udivisible32(c).k)])
+ )
+ (Const32 <typ.UInt32> [int32(udivisible32(c).max)])
+ )
+// Quotient computed with a 64-bit Mul64 (even-magic Div32u form on 64-bit machines): m = 1<<31+umagic32(c).m/2, s = 32+umagic32(c).s-1.
+(Eq32 x (Mul32 (Const32 [c])
+ (Trunc64to32
+ (Rsh64Ux64
+ mul:(Mul64
+ (Const64 [m])
+ (ZeroExt32to64 x))
+ (Const64 [s])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int64(1<<31+umagic32(c).m/2) && s == 32+umagic32(c).s-1
+ && x.Op != OpConst32 && udivisibleOK32(c)
+ => (Leq32U
+ (RotateLeft32 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(udivisible32(c).m)])
+ x)
+ (Const32 <typ.UInt32> [int32(32-udivisible32(c).k)])
+ )
+ (Const32 <typ.UInt32> [int32(udivisible32(c).max)])
+ )
+// Quotient computed with a 64-bit Mul64 and x pre-shifted right by 1 (even-divisor form): m = 1<<31+(umagic32(c).m+1)/2, s = 32+umagic32(c).s-2.
+(Eq32 x (Mul32 (Const32 [c])
+ (Trunc64to32
+ (Rsh64Ux64
+ mul:(Mul64
+ (Const64 [m])
+ (Rsh64Ux64 (ZeroExt32to64 x) (Const64 [1])))
+ (Const64 [s])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int64(1<<31+(umagic32(c).m+1)/2) && s == 32+umagic32(c).s-2
+ && x.Op != OpConst32 && udivisibleOK32(c)
+ => (Leq32U
+ (RotateLeft32 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(udivisible32(c).m)])
+ x)
+ (Const32 <typ.UInt32> [int32(32-udivisible32(c).k)])
+ )
+ (Const32 <typ.UInt32> [int32(udivisible32(c).max)])
+ )
+// Quotient computed via Avg64u of x<<32 and Mul64(m, x): m = umagic32(c).m, s = 32+umagic32(c).s-1.
+(Eq32 x (Mul32 (Const32 [c])
+ (Trunc64to32
+ (Rsh64Ux64
+ (Avg64u
+ (Lsh64x64 (ZeroExt32to64 x) (Const64 [32]))
+ mul:(Mul64
+ (Const64 [m])
+ (ZeroExt32to64 x)))
+ (Const64 [s])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int64(umagic32(c).m) && s == 32+umagic32(c).s-1
+ && x.Op != OpConst32 && udivisibleOK32(c)
+ => (Leq32U
+ (RotateLeft32 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(udivisible32(c).m)])
+ x)
+ (Const32 <typ.UInt32> [int32(32-udivisible32(c).k)])
+ )
+ (Const32 <typ.UInt32> [int32(udivisible32(c).max)])
+ )
+
+(Eq64 x (Mul64 (Const64 [c]) // Quotient computed via Hmul64u (even-magic Div64u form): m = 1<<63+umagic64(c).m/2, s = umagic64(c).s-1.
+ (Rsh64Ux64
+ mul:(Hmul64u
+ (Const64 [m])
+ x)
+ (Const64 [s]))
+ )
+) && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int64(1<<63+umagic64(c).m/2) && s == umagic64(c).s-1
+ && x.Op != OpConst64 && udivisibleOK64(c)
+ => (Leq64U
+ (RotateLeft64 <typ.UInt64>
+ (Mul64 <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(udivisible64(c).m)])
+ x)
+ (Const64 <typ.UInt64> [64-udivisible64(c).k])
+ )
+ (Const64 <typ.UInt64> [int64(udivisible64(c).max)])
+ )
+(Eq64 x (Mul64 (Const64 [c]) // Quotient computed via Hmul64u with x pre-shifted right by 1 (even-divisor form): m = 1<<63+(umagic64(c).m+1)/2, s = umagic64(c).s-2.
+ (Rsh64Ux64
+ mul:(Hmul64u
+ (Const64 [m])
+ (Rsh64Ux64 x (Const64 [1])))
+ (Const64 [s]))
+ )
+) && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int64(1<<63+(umagic64(c).m+1)/2) && s == umagic64(c).s-2
+ && x.Op != OpConst64 && udivisibleOK64(c)
+ => (Leq64U
+ (RotateLeft64 <typ.UInt64>
+ (Mul64 <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(udivisible64(c).m)])
+ x)
+ (Const64 <typ.UInt64> [64-udivisible64(c).k])
+ )
+ (Const64 <typ.UInt64> [int64(udivisible64(c).max)])
+ )
+(Eq64 x (Mul64 (Const64 [c]) // Quotient computed via Avg64u of x and Hmul64u(m, x): m = umagic64(c).m, s = umagic64(c).s-1.
+ (Rsh64Ux64
+ (Avg64u
+ x
+ mul:(Hmul64u
+ (Const64 [m])
+ x))
+ (Const64 [s]))
+ )
+) && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int64(umagic64(c).m) && s == umagic64(c).s-1
+ && x.Op != OpConst64 && udivisibleOK64(c)
+ => (Leq64U
+ (RotateLeft64 <typ.UInt64>
+ (Mul64 <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(udivisible64(c).m)])
+ x)
+ (Const64 <typ.UInt64> [64-udivisible64(c).k])
+ )
+ (Const64 <typ.UInt64> [int64(udivisible64(c).max)])
+ )
+
+// Signed divisibility checks convert to multiply, add and rotate: x == c*(x/c) becomes RotateLeft(x*sdivisible(c).m + sdivisible(c).a, bits-k) <= sdivisible(c).max (unsigned compare).
+(Eq8 x (Mul8 (Const8 [c]) // Matches the 8-bit signed magic divide: 32-bit multiply shifted by 8+smagic8(c).s, minus the sign word of x.
+ (Sub8
+ (Rsh32x64
+ mul:(Mul32
+ (Const32 [m])
+ (SignExt8to32 x))
+ (Const64 [s]))
+ (Rsh32x64
+ (SignExt8to32 x)
+ (Const64 [31])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int32(smagic8(c).m) && s == 8+smagic8(c).s
+ && x.Op != OpConst8 && sdivisibleOK8(c)
+ => (Leq8U
+ (RotateLeft8 <typ.UInt8>
+ (Add8 <typ.UInt8>
+ (Mul8 <typ.UInt8>
+ (Const8 <typ.UInt8> [int8(sdivisible8(c).m)])
+ x)
+ (Const8 <typ.UInt8> [int8(sdivisible8(c).a)])
+ )
+ (Const8 <typ.UInt8> [int8(8-sdivisible8(c).k)])
+ )
+ (Const8 <typ.UInt8> [int8(sdivisible8(c).max)])
+ )
+// Matches the 16-bit signed magic divide: 32-bit multiply shifted by 16+smagic16(c).s, minus the sign word of x.
+(Eq16 x (Mul16 (Const16 [c])
+ (Sub16
+ (Rsh32x64
+ mul:(Mul32
+ (Const32 [m])
+ (SignExt16to32 x))
+ (Const64 [s]))
+ (Rsh32x64
+ (SignExt16to32 x)
+ (Const64 [31])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int32(smagic16(c).m) && s == 16+smagic16(c).s
+ && x.Op != OpConst16 && sdivisibleOK16(c)
+ => (Leq16U
+ (RotateLeft16 <typ.UInt16>
+ (Add16 <typ.UInt16>
+ (Mul16 <typ.UInt16>
+ (Const16 <typ.UInt16> [int16(sdivisible16(c).m)])
+ x)
+ (Const16 <typ.UInt16> [int16(sdivisible16(c).a)])
+ )
+ (Const16 <typ.UInt16> [int16(16-sdivisible16(c).k)])
+ )
+ (Const16 <typ.UInt16> [int16(sdivisible16(c).max)])
+ )
+
+(Eq32 x (Mul32 (Const32 [c]) // Matches the 32-bit signed magic divide done with a 64-bit multiply: m = smagic32(c).m, s = 32+smagic32(c).s.
+ (Sub32
+ (Rsh64x64
+ mul:(Mul64
+ (Const64 [m])
+ (SignExt32to64 x))
+ (Const64 [s]))
+ (Rsh64x64
+ (SignExt32to64 x)
+ (Const64 [63])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int64(smagic32(c).m) && s == 32+smagic32(c).s
+ && x.Op != OpConst32 && sdivisibleOK32(c)
+ => (Leq32U
+ (RotateLeft32 <typ.UInt32>
+ (Add32 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(sdivisible32(c).m)])
+ x)
+ (Const32 <typ.UInt32> [int32(sdivisible32(c).a)])
+ )
+ (Const32 <typ.UInt32> [int32(32-sdivisible32(c).k)])
+ )
+ (Const32 <typ.UInt32> [int32(sdivisible32(c).max)])
+ )
+// Matches the 32-bit signed magic divide done with Hmul32 by the halved constant (even-m form): m = smagic32(c).m/2, s = smagic32(c).s-1.
+(Eq32 x (Mul32 (Const32 [c])
+ (Sub32
+ (Rsh32x64
+ mul:(Hmul32
+ (Const32 [m])
+ x)
+ (Const64 [s]))
+ (Rsh32x64
+ x
+ (Const64 [31])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int32(smagic32(c).m/2) && s == smagic32(c).s-1
+ && x.Op != OpConst32 && sdivisibleOK32(c)
+ => (Leq32U
+ (RotateLeft32 <typ.UInt32>
+ (Add32 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(sdivisible32(c).m)])
+ x)
+ (Const32 <typ.UInt32> [int32(sdivisible32(c).a)])
+ )
+ (Const32 <typ.UInt32> [int32(32-sdivisible32(c).k)])
+ )
+ (Const32 <typ.UInt32> [int32(sdivisible32(c).max)])
+ )
+// Matches the 32-bit signed magic divide done with Hmul32 plus x (odd-m form): m = smagic32(c).m, s = smagic32(c).s.
+(Eq32 x (Mul32 (Const32 [c])
+ (Sub32
+ (Rsh32x64
+ (Add32
+ mul:(Hmul32
+ (Const32 [m])
+ x)
+ x)
+ (Const64 [s]))
+ (Rsh32x64
+ x
+ (Const64 [31])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int32(smagic32(c).m) && s == smagic32(c).s
+ && x.Op != OpConst32 && sdivisibleOK32(c)
+ => (Leq32U
+ (RotateLeft32 <typ.UInt32>
+ (Add32 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(sdivisible32(c).m)])
+ x)
+ (Const32 <typ.UInt32> [int32(sdivisible32(c).a)])
+ )
+ (Const32 <typ.UInt32> [int32(32-sdivisible32(c).k)])
+ )
+ (Const32 <typ.UInt32> [int32(sdivisible32(c).max)])
+ )
+
+(Eq64 x (Mul64 (Const64 [c]) // Matches the 64-bit signed magic divide done with Hmul64 by the halved constant (even-m form): m = smagic64(c).m/2, s = smagic64(c).s-1.
+ (Sub64
+ (Rsh64x64
+ mul:(Hmul64
+ (Const64 [m])
+ x)
+ (Const64 [s]))
+ (Rsh64x64
+ x
+ (Const64 [63])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int64(smagic64(c).m/2) && s == smagic64(c).s-1
+ && x.Op != OpConst64 && sdivisibleOK64(c)
+ => (Leq64U
+ (RotateLeft64 <typ.UInt64>
+ (Add64 <typ.UInt64>
+ (Mul64 <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(sdivisible64(c).m)])
+ x)
+ (Const64 <typ.UInt64> [int64(sdivisible64(c).a)])
+ )
+ (Const64 <typ.UInt64> [64-sdivisible64(c).k])
+ )
+ (Const64 <typ.UInt64> [int64(sdivisible64(c).max)])
+ )
+// Matches the 64-bit signed magic divide done with Hmul64 plus x (odd-m form): m = smagic64(c).m, s = smagic64(c).s.
+(Eq64 x (Mul64 (Const64 [c])
+ (Sub64
+ (Rsh64x64
+ (Add64
+ mul:(Hmul64
+ (Const64 [m])
+ x)
+ x)
+ (Const64 [s]))
+ (Rsh64x64
+ x
+ (Const64 [63])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int64(smagic64(c).m) && s == smagic64(c).s
+ && x.Op != OpConst64 && sdivisibleOK64(c)
+ => (Leq64U
+ (RotateLeft64 <typ.UInt64>
+ (Add64 <typ.UInt64>
+ (Mul64 <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(sdivisible64(c).m)])
+ x)
+ (Const64 <typ.UInt64> [int64(sdivisible64(c).a)])
+ )
+ (Const64 <typ.UInt64> [64-sdivisible64(c).k])
+ )
+ (Const64 <typ.UInt64> [int64(sdivisible64(c).max)])
+ )
+
+// A divisibility check of a signed integer by a power-of-two constant is a simple mask test.
+// However, we must match against the rewritten n%c == 0 -> n - c*(n/c) == 0 -> n == c*(n/c)
+// where n/c contains fixup code to handle signed n, and c*(n/c) appears as a left shift by k = log2(c).
+((Eq8|Neq8) n (Lsh8x64
+ (Rsh8x64
+ (Add8 <t> n (Rsh8Ux64 <t> (Rsh8x64 <t> n (Const64 <typ.UInt64> [ 7])) (Const64 <typ.UInt64> [kbar])))
+ (Const64 <typ.UInt64> [k]))
+ (Const64 <typ.UInt64> [k]))
+) && k > 0 && k < 7 && kbar == 8 - k
+ => ((Eq8|Neq8) (And8 <t> n (Const8 <t> [1<<uint(k)-1])) (Const8 <t> [0]))
+
+((Eq16|Neq16) n (Lsh16x64
+ (Rsh16x64
+ (Add16 <t> n (Rsh16Ux64 <t> (Rsh16x64 <t> n (Const64 <typ.UInt64> [15])) (Const64 <typ.UInt64> [kbar])))
+ (Const64 <typ.UInt64> [k]))
+ (Const64 <typ.UInt64> [k]))
+) && k > 0 && k < 15 && kbar == 16 - k
+ => ((Eq16|Neq16) (And16 <t> n (Const16 <t> [1<<uint(k)-1])) (Const16 <t> [0]))
+
+((Eq32|Neq32) n (Lsh32x64
+ (Rsh32x64
+ (Add32 <t> n (Rsh32Ux64 <t> (Rsh32x64 <t> n (Const64 <typ.UInt64> [31])) (Const64 <typ.UInt64> [kbar])))
+ (Const64 <typ.UInt64> [k]))
+ (Const64 <typ.UInt64> [k]))
+) && k > 0 && k < 31 && kbar == 32 - k
+ => ((Eq32|Neq32) (And32 <t> n (Const32 <t> [1<<uint(k)-1])) (Const32 <t> [0]))
+
+((Eq64|Neq64) n (Lsh64x64
+ (Rsh64x64
+ (Add64 <t> n (Rsh64Ux64 <t> (Rsh64x64 <t> n (Const64 <typ.UInt64> [63])) (Const64 <typ.UInt64> [kbar])))
+ (Const64 <typ.UInt64> [k]))
+ (Const64 <typ.UInt64> [k]))
+) && k > 0 && k < 63 && kbar == 64 - k
+ => ((Eq64|Neq64) (And64 <t> n (Const64 <t> [1<<uint(k)-1])) (Const64 <t> [0]))
+
+(Eq(8|16|32|64) s:(Sub(8|16|32|64) x y) (Const(8|16|32|64) [0])) && s.Uses == 1 => (Eq(8|16|32|64) x y) // x-y == 0 is x == y; only when the Sub has no other use.
+(Neq(8|16|32|64) s:(Sub(8|16|32|64) x y) (Const(8|16|32|64) [0])) && s.Uses == 1 => (Neq(8|16|32|64) x y) // x-y != 0 is x != y; only when the Sub has no other use.
+
+// Optimize bitsets: for a single-bit mask y, (x&y) == y is rewritten to (x&y) != 0, and (x&y) != y to (x&y) == 0.
+(Eq8 (And8 <t> x (Const8 <t> [y])) (Const8 <t> [y])) && oneBit8(y)
+ => (Neq8 (And8 <t> x (Const8 <t> [y])) (Const8 <t> [0]))
+(Eq16 (And16 <t> x (Const16 <t> [y])) (Const16 <t> [y])) && oneBit16(y)
+ => (Neq16 (And16 <t> x (Const16 <t> [y])) (Const16 <t> [0]))
+(Eq32 (And32 <t> x (Const32 <t> [y])) (Const32 <t> [y])) && oneBit32(y)
+ => (Neq32 (And32 <t> x (Const32 <t> [y])) (Const32 <t> [0]))
+(Eq64 (And64 <t> x (Const64 <t> [y])) (Const64 <t> [y])) && oneBit64(y)
+ => (Neq64 (And64 <t> x (Const64 <t> [y])) (Const64 <t> [0]))
+(Neq8 (And8 <t> x (Const8 <t> [y])) (Const8 <t> [y])) && oneBit8(y)
+ => (Eq8 (And8 <t> x (Const8 <t> [y])) (Const8 <t> [0]))
+(Neq16 (And16 <t> x (Const16 <t> [y])) (Const16 <t> [y])) && oneBit16(y)
+ => (Eq16 (And16 <t> x (Const16 <t> [y])) (Const16 <t> [0]))
+(Neq32 (And32 <t> x (Const32 <t> [y])) (Const32 <t> [y])) && oneBit32(y)
+ => (Eq32 (And32 <t> x (Const32 <t> [y])) (Const32 <t> [0]))
+(Neq64 (And64 <t> x (Const64 <t> [y])) (Const64 <t> [y])) && oneBit64(y)
+ => (Eq64 (And64 <t> x (Const64 <t> [y])) (Const64 <t> [0]))
+
+// Reassociate expressions involving
+// constants such that constants come first,
+// exposing obvious constant-folding opportunities.
+// Reassociate (op (op y C) x) to (op C (op x y)) or similar, where C
+// is constant, which pushes constants to the outside
+// of the expression. At that point, any constant-folding
+// opportunities should be obvious.
+// Note: don't include AddPtr here! In order to maintain the
+// invariant that pointers must stay within the pointed-to object,
+// we can't pull part of a pointer computation above the AddPtr.
+// See issue 37881.
+// Note: we don't need to handle any (x-C) cases because we already rewrite
+// (x-C) to (x+(-C)). Every rule in this group requires that neither z nor x is itself a constant.
+
+// x + (C + z) -> C + (x + z)
+(Add64 (Add64 i:(Const64 <t>) z) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (Add64 i (Add64 <t> z x))
+(Add32 (Add32 i:(Const32 <t>) z) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (Add32 i (Add32 <t> z x))
+(Add16 (Add16 i:(Const16 <t>) z) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (Add16 i (Add16 <t> z x))
+(Add8 (Add8 i:(Const8 <t>) z) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (Add8 i (Add8 <t> z x))
+
+// x + (C - z) -> C + (x - z)
+(Add64 (Sub64 i:(Const64 <t>) z) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (Add64 i (Sub64 <t> x z))
+(Add32 (Sub32 i:(Const32 <t>) z) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (Add32 i (Sub32 <t> x z))
+(Add16 (Sub16 i:(Const16 <t>) z) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (Add16 i (Sub16 <t> x z))
+(Add8 (Sub8 i:(Const8 <t>) z) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (Add8 i (Sub8 <t> x z))
+
+// x - (C - z) -> x + (z - C) -> (x + z) - C
+(Sub64 x (Sub64 i:(Const64 <t>) z)) && (z.Op != OpConst64 && x.Op != OpConst64) => (Sub64 (Add64 <t> x z) i)
+(Sub32 x (Sub32 i:(Const32 <t>) z)) && (z.Op != OpConst32 && x.Op != OpConst32) => (Sub32 (Add32 <t> x z) i)
+(Sub16 x (Sub16 i:(Const16 <t>) z)) && (z.Op != OpConst16 && x.Op != OpConst16) => (Sub16 (Add16 <t> x z) i)
+(Sub8 x (Sub8 i:(Const8 <t>) z)) && (z.Op != OpConst8 && x.Op != OpConst8) => (Sub8 (Add8 <t> x z) i)
+
+// x - (z + C) -> x + (-z - C) -> (x - z) - C
+(Sub64 x (Add64 z i:(Const64 <t>))) && (z.Op != OpConst64 && x.Op != OpConst64) => (Sub64 (Sub64 <t> x z) i)
+(Sub32 x (Add32 z i:(Const32 <t>))) && (z.Op != OpConst32 && x.Op != OpConst32) => (Sub32 (Sub32 <t> x z) i)
+(Sub16 x (Add16 z i:(Const16 <t>))) && (z.Op != OpConst16 && x.Op != OpConst16) => (Sub16 (Sub16 <t> x z) i)
+(Sub8 x (Add8 z i:(Const8 <t>))) && (z.Op != OpConst8 && x.Op != OpConst8) => (Sub8 (Sub8 <t> x z) i)
+
+// (C - z) - x -> C - (z + x)
+(Sub64 (Sub64 i:(Const64 <t>) z) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (Sub64 i (Add64 <t> z x))
+(Sub32 (Sub32 i:(Const32 <t>) z) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (Sub32 i (Add32 <t> z x))
+(Sub16 (Sub16 i:(Const16 <t>) z) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (Sub16 i (Add16 <t> z x))
+(Sub8 (Sub8 i:(Const8 <t>) z) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (Sub8 i (Add8 <t> z x))
+
+// (z + C) - x -> C + (z - x)
+(Sub64 (Add64 z i:(Const64 <t>)) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (Add64 i (Sub64 <t> z x))
+(Sub32 (Add32 z i:(Const32 <t>)) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (Add32 i (Sub32 <t> z x))
+(Sub16 (Add16 z i:(Const16 <t>)) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (Add16 i (Sub16 <t> z x))
+(Sub8 (Add8 z i:(Const8 <t>)) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (Add8 i (Sub8 <t> z x))
+
+// x & (C & z) -> C & (x & z), when neither z nor x is a constant
+(And64 (And64 i:(Const64 <t>) z) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (And64 i (And64 <t> z x))
+(And32 (And32 i:(Const32 <t>) z) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (And32 i (And32 <t> z x))
+(And16 (And16 i:(Const16 <t>) z) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (And16 i (And16 <t> z x))
+(And8 (And8 i:(Const8 <t>) z) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (And8 i (And8 <t> z x))
+
+// x | (C | z) -> C | (x | z), when neither z nor x is a constant
+(Or64 (Or64 i:(Const64 <t>) z) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (Or64 i (Or64 <t> z x))
+(Or32 (Or32 i:(Const32 <t>) z) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (Or32 i (Or32 <t> z x))
+(Or16 (Or16 i:(Const16 <t>) z) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (Or16 i (Or16 <t> z x))
+(Or8 (Or8 i:(Const8 <t>) z) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (Or8 i (Or8 <t> z x))
+
+// x ^ (C ^ z) -> C ^ (x ^ z), when neither z nor x is a constant
+(Xor64 (Xor64 i:(Const64 <t>) z) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (Xor64 i (Xor64 <t> z x))
+(Xor32 (Xor32 i:(Const32 <t>) z) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (Xor32 i (Xor32 <t> z x))
+(Xor16 (Xor16 i:(Const16 <t>) z) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (Xor16 i (Xor16 <t> z x))
+(Xor8 (Xor8 i:(Const8 <t>) z) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (Xor8 i (Xor8 <t> z x))
+
+// x * (D * z) -> D * (x * z), when neither z nor x is a constant
+(Mul64 (Mul64 i:(Const64 <t>) z) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (Mul64 i (Mul64 <t> x z))
+(Mul32 (Mul32 i:(Const32 <t>) z) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (Mul32 i (Mul32 <t> x z))
+(Mul16 (Mul16 i:(Const16 <t>) z) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (Mul16 i (Mul16 <t> x z))
+(Mul8 (Mul8 i:(Const8 <t>) z) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (Mul8 i (Mul8 <t> x z))
+
+// C + (D + x) -> (C + D) + x
+(Add64 (Const64 <t> [c]) (Add64 (Const64 <t> [d]) x)) => (Add64 (Const64 <t> [c+d]) x)
+(Add32 (Const32 <t> [c]) (Add32 (Const32 <t> [d]) x)) => (Add32 (Const32 <t> [c+d]) x)
+(Add16 (Const16 <t> [c]) (Add16 (Const16 <t> [d]) x)) => (Add16 (Const16 <t> [c+d]) x)
+(Add8 (Const8 <t> [c]) (Add8 (Const8 <t> [d]) x)) => (Add8 (Const8 <t> [c+d]) x)
+
+// C + (D - x) -> (C + D) - x
+(Add64 (Const64 <t> [c]) (Sub64 (Const64 <t> [d]) x)) => (Sub64 (Const64 <t> [c+d]) x)
+(Add32 (Const32 <t> [c]) (Sub32 (Const32 <t> [d]) x)) => (Sub32 (Const32 <t> [c+d]) x)
+(Add16 (Const16 <t> [c]) (Sub16 (Const16 <t> [d]) x)) => (Sub16 (Const16 <t> [c+d]) x)
+(Add8 (Const8 <t> [c]) (Sub8 (Const8 <t> [d]) x)) => (Sub8 (Const8 <t> [c+d]) x)
+
+// C - (D - x) -> (C - D) + x
+(Sub64 (Const64 <t> [c]) (Sub64 (Const64 <t> [d]) x)) => (Add64 (Const64 <t> [c-d]) x)
+(Sub32 (Const32 <t> [c]) (Sub32 (Const32 <t> [d]) x)) => (Add32 (Const32 <t> [c-d]) x)
+(Sub16 (Const16 <t> [c]) (Sub16 (Const16 <t> [d]) x)) => (Add16 (Const16 <t> [c-d]) x)
+(Sub8 (Const8 <t> [c]) (Sub8 (Const8 <t> [d]) x)) => (Add8 (Const8 <t> [c-d]) x)
+
+// C - (D + x) -> (C - D) - x
+(Sub64 (Const64 <t> [c]) (Add64 (Const64 <t> [d]) x)) => (Sub64 (Const64 <t> [c-d]) x)
+(Sub32 (Const32 <t> [c]) (Add32 (Const32 <t> [d]) x)) => (Sub32 (Const32 <t> [c-d]) x)
+(Sub16 (Const16 <t> [c]) (Add16 (Const16 <t> [d]) x)) => (Sub16 (Const16 <t> [c-d]) x)
+(Sub8 (Const8 <t> [c]) (Add8 (Const8 <t> [d]) x)) => (Sub8 (Const8 <t> [c-d]) x)
+
+// C & (D & x) -> (C & D) & x
+(And64 (Const64 <t> [c]) (And64 (Const64 <t> [d]) x)) => (And64 (Const64 <t> [c&d]) x)
+(And32 (Const32 <t> [c]) (And32 (Const32 <t> [d]) x)) => (And32 (Const32 <t> [c&d]) x)
+(And16 (Const16 <t> [c]) (And16 (Const16 <t> [d]) x)) => (And16 (Const16 <t> [c&d]) x)
+(And8 (Const8 <t> [c]) (And8 (Const8 <t> [d]) x)) => (And8 (Const8 <t> [c&d]) x)
+
+// C | (D | x) -> (C | D) | x
+(Or64 (Const64 <t> [c]) (Or64 (Const64 <t> [d]) x)) => (Or64 (Const64 <t> [c|d]) x)
+(Or32 (Const32 <t> [c]) (Or32 (Const32 <t> [d]) x)) => (Or32 (Const32 <t> [c|d]) x)
+(Or16 (Const16 <t> [c]) (Or16 (Const16 <t> [d]) x)) => (Or16 (Const16 <t> [c|d]) x)
+(Or8 (Const8 <t> [c]) (Or8 (Const8 <t> [d]) x)) => (Or8 (Const8 <t> [c|d]) x)
+
+// C ^ (D ^ x) -> (C ^ D) ^ x
+(Xor64 (Const64 <t> [c]) (Xor64 (Const64 <t> [d]) x)) => (Xor64 (Const64 <t> [c^d]) x)
+(Xor32 (Const32 <t> [c]) (Xor32 (Const32 <t> [d]) x)) => (Xor32 (Const32 <t> [c^d]) x)
+(Xor16 (Const16 <t> [c]) (Xor16 (Const16 <t> [d]) x)) => (Xor16 (Const16 <t> [c^d]) x)
+(Xor8 (Const8 <t> [c]) (Xor8 (Const8 <t> [d]) x)) => (Xor8 (Const8 <t> [c^d]) x)
+
+// C * (D * x) -> (C * D) * x
+(Mul64 (Const64 <t> [c]) (Mul64 (Const64 <t> [d]) x)) => (Mul64 (Const64 <t> [c*d]) x)
+(Mul32 (Const32 <t> [c]) (Mul32 (Const32 <t> [d]) x)) => (Mul32 (Const32 <t> [c*d]) x)
+(Mul16 (Const16 <t> [c]) (Mul16 (Const16 <t> [d]) x)) => (Mul16 (Const16 <t> [c*d]) x)
+(Mul8 (Const8 <t> [c]) (Mul8 (Const8 <t> [d]) x)) => (Mul8 (Const8 <t> [c*d]) x)
+
+// floating point optimizations: x*1 => x, x*-1 => -x, x*2 => x+x, and x/c => x*(1/c) when reciprocalExact(32|64)(c) holds
+(Mul(32|64)F x (Const(32|64)F [1])) => x
+(Mul32F x (Const32F [-1])) => (Neg32F x)
+(Mul64F x (Const64F [-1])) => (Neg64F x)
+(Mul32F x (Const32F [2])) => (Add32F x x)
+(Mul64F x (Const64F [2])) => (Add64F x x)
+
+(Div32F x (Const32F <t> [c])) && reciprocalExact32(c) => (Mul32F x (Const32F <t> [1/c]))
+(Div64F x (Const64F <t> [c])) && reciprocalExact64(c) => (Mul64F x (Const64F <t> [1/c]))
+
+// rewrite single-precision sqrt expression "float32(math.Sqrt(float64(x)))"
+(Cvt64Fto32F sqrt0:(Sqrt (Cvt32Fto64F x))) && sqrt0.Uses==1 => (Sqrt32 x)
+
+(Sqrt (Const64F [c])) && !math.IsNaN(math.Sqrt(c)) => (Const64F [math.Sqrt(c)])
+
+// for rewriting results of some late-expanded rewrites (below)
+(SelectN [0] (MakeResult x ___)) => x
+(SelectN [1] (MakeResult x y ___)) => y
+(SelectN [2] (MakeResult x y z ___)) => z
+
+// for late-expanded calls, recognize runtime.newobject and remove a Zero of its result, stores of a constant zero (isConstZero) to its result or to an OffPtr of it, and nil checks of its result or of an OffPtr of it
+(Zero (SelectN [0] call:(StaticLECall _ _)) mem:(SelectN [1] call))
+ && isSameCall(call.Aux, "runtime.newobject")
+ => mem
+
+(Store (SelectN [0] call:(StaticLECall _ _)) x mem:(SelectN [1] call))
+ && isConstZero(x)
+ && isSameCall(call.Aux, "runtime.newobject")
+ => mem
+
+(Store (OffPtr (SelectN [0] call:(StaticLECall _ _))) x mem:(SelectN [1] call))
+ && isConstZero(x)
+ && isSameCall(call.Aux, "runtime.newobject")
+ => mem
+
+(NilCheck (SelectN [0] call:(StaticLECall _ _)) _)
+ && isSameCall(call.Aux, "runtime.newobject")
+ && warnRule(fe.Debug_checknil(), v, "removed nil check")
+ => (Invalid)
+
+(NilCheck (OffPtr (SelectN [0] call:(StaticLECall _ _))) _)
+ && isSameCall(call.Aux, "runtime.newobject")
+ && warnRule(fe.Debug_checknil(), v, "removed nil check")
+ => (Invalid)
+
+// for late-expanded calls, recognize runtime.memequal comparing against a read-only symbol (symIsRO) with a small constant length, and replace the call with a load and an equality test against the symbol's constant contents
+// Support is limited to 1, 2, 4, and 8 byte sizes; sizes above 1 also require canLoadUnaligned(config), and size 8 additionally requires config.PtrSize == 8
+(StaticLECall {callAux} sptr (Addr {scon} (SB)) (Const64 [1]) mem)
+ && isSameCall(callAux, "runtime.memequal")
+ && symIsRO(scon)
+ => (MakeResult (Eq8 (Load <typ.Int8> sptr mem) (Const8 <typ.Int8> [int8(read8(scon,0))])) mem)
+
+(StaticLECall {callAux} sptr (Addr {scon} (SB)) (Const64 [2]) mem)
+ && isSameCall(callAux, "runtime.memequal")
+ && symIsRO(scon)
+ && canLoadUnaligned(config)
+ => (MakeResult (Eq16 (Load <typ.Int16> sptr mem) (Const16 <typ.Int16> [int16(read16(scon,0,config.ctxt.Arch.ByteOrder))])) mem)
+
+(StaticLECall {callAux} sptr (Addr {scon} (SB)) (Const64 [4]) mem)
+ && isSameCall(callAux, "runtime.memequal")
+ && symIsRO(scon)
+ && canLoadUnaligned(config)
+ => (MakeResult (Eq32 (Load <typ.Int32> sptr mem) (Const32 <typ.Int32> [int32(read32(scon,0,config.ctxt.Arch.ByteOrder))])) mem)
+
+(StaticLECall {callAux} sptr (Addr {scon} (SB)) (Const64 [8]) mem)
+ && isSameCall(callAux, "runtime.memequal")
+ && symIsRO(scon)
+ && canLoadUnaligned(config) && config.PtrSize == 8
+ => (MakeResult (Eq64 (Load <typ.Int64> sptr mem) (Const64 <typ.Int64> [int64(read64(scon,0,config.ctxt.Arch.ByteOrder))])) mem)
+
+// Evaluate constant address comparisons.
+(EqPtr x x) => (ConstBool [true])
+(NeqPtr x x) => (ConstBool [false])
+(EqPtr (Addr {x} _) (Addr {y} _)) => (ConstBool [x == y])
+(EqPtr (Addr {x} _) (OffPtr [o] (Addr {y} _))) => (ConstBool [x == y && o == 0])
+(EqPtr (OffPtr [o1] (Addr {x} _)) (OffPtr [o2] (Addr {y} _))) => (ConstBool [x == y && o1 == o2])
+(NeqPtr (Addr {x} _) (Addr {y} _)) => (ConstBool [x != y])
+(NeqPtr (Addr {x} _) (OffPtr [o] (Addr {y} _))) => (ConstBool [x != y || o != 0])
+(NeqPtr (OffPtr [o1] (Addr {x} _)) (OffPtr [o2] (Addr {y} _))) => (ConstBool [x != y || o1 != o2])
+(EqPtr (LocalAddr {x} _ _) (LocalAddr {y} _ _)) => (ConstBool [x == y])
+(EqPtr (LocalAddr {x} _ _) (OffPtr [o] (LocalAddr {y} _ _))) => (ConstBool [x == y && o == 0])
+(EqPtr (OffPtr [o1] (LocalAddr {x} _ _)) (OffPtr [o2] (LocalAddr {y} _ _))) => (ConstBool [x == y && o1 == o2])
+(NeqPtr (LocalAddr {x} _ _) (LocalAddr {y} _ _)) => (ConstBool [x != y])
+(NeqPtr (LocalAddr {x} _ _) (OffPtr [o] (LocalAddr {y} _ _))) => (ConstBool [x != y || o != 0])
+(NeqPtr (OffPtr [o1] (LocalAddr {x} _ _)) (OffPtr [o2] (LocalAddr {y} _ _))) => (ConstBool [x != y || o1 != o2])
+(EqPtr (OffPtr [o1] p1) p2) && isSamePtr(p1, p2) => (ConstBool [o1 == 0])
+(NeqPtr (OffPtr [o1] p1) p2) && isSamePtr(p1, p2) => (ConstBool [o1 != 0])
+(EqPtr (OffPtr [o1] p1) (OffPtr [o2] p2)) && isSamePtr(p1, p2) => (ConstBool [o1 == o2])
+(NeqPtr (OffPtr [o1] p1) (OffPtr [o2] p2)) && isSamePtr(p1, p2) => (ConstBool [o1 != o2])
+(EqPtr (Const(32|64) [c]) (Const(32|64) [d])) => (ConstBool [c == d])
+(NeqPtr (Const(32|64) [c]) (Const(32|64) [d])) => (ConstBool [c != d])
+
+(EqPtr (LocalAddr _ _) (Addr _)) => (ConstBool [false])
+(EqPtr (OffPtr (LocalAddr _ _)) (Addr _)) => (ConstBool [false])
+(EqPtr (LocalAddr _ _) (OffPtr (Addr _))) => (ConstBool [false])
+(EqPtr (OffPtr (LocalAddr _ _)) (OffPtr (Addr _))) => (ConstBool [false])
+(NeqPtr (LocalAddr _ _) (Addr _)) => (ConstBool [true])
+(NeqPtr (OffPtr (LocalAddr _ _)) (Addr _)) => (ConstBool [true])
+(NeqPtr (LocalAddr _ _) (OffPtr (Addr _))) => (ConstBool [true])
+(NeqPtr (OffPtr (LocalAddr _ _)) (OffPtr (Addr _))) => (ConstBool [true])
+
+// Simplify address comparisons.
+(EqPtr (AddPtr p1 o1) p2) && isSamePtr(p1, p2) => (Not (IsNonNil o1))
+(NeqPtr (AddPtr p1 o1) p2) && isSamePtr(p1, p2) => (IsNonNil o1)
+(EqPtr (Const(32|64) [0]) p) => (Not (IsNonNil p))
+(NeqPtr (Const(32|64) [0]) p) => (IsNonNil p)
+(EqPtr (ConstNil) p) => (Not (IsNonNil p))
+(NeqPtr (ConstNil) p) => (IsNonNil p)
+
+// Evaluate constant user nil checks.
+(IsNonNil (ConstNil)) => (ConstBool [false])
+(IsNonNil (Const(32|64) [c])) => (ConstBool [c != 0])
+(IsNonNil (Addr _)) => (ConstBool [true])
+(IsNonNil (LocalAddr _ _)) => (ConstBool [true])
+
+// Inline small or disjoint runtime.memmove calls with constant length.
+// See the comment in op Move in genericOps.go for discussion of the type.
+//
+// Note that we've lost any knowledge of the type and alignment requirements
+// of the source and destination. We only know the size, and that the type
+// contains no pointers.
+// The type of the move is not necessarily v.Args[0].Type().Elem()!
+// See issue 55122 for details.
+//
+// Because expand calls runs after prove, constants useful to this pattern may not appear.
+// Both versions need to exist; the memory and register variants.
+//
+// Match post-expansion calls, memory version.
+(SelectN [0] call:(StaticCall {sym} s1:(Store _ (Const(64|32) [sz]) s2:(Store _ src s3:(Store {t} _ dst mem)))))
+ && sz >= 0
+ && isSameCall(sym, "runtime.memmove")
+ && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1
+ && isInlinableMemmove(dst, src, int64(sz), config)
+ && clobber(s1, s2, s3, call)
+ => (Move {types.Types[types.TUINT8]} [int64(sz)] dst src mem)
+
+// Match post-expansion calls, register version.
+(SelectN [0] call:(StaticCall {sym} dst src (Const(64|32) [sz]) mem))
+ && sz >= 0
+ && call.Uses == 1 // this will exclude all calls with results
+ && isSameCall(sym, "runtime.memmove")
+ && isInlinableMemmove(dst, src, int64(sz), config)
+ && clobber(call)
+ => (Move {types.Types[types.TUINT8]} [int64(sz)] dst src mem)
+
+// Match pre-expansion calls.
+(SelectN [0] call:(StaticLECall {sym} dst src (Const(64|32) [sz]) mem))
+ && sz >= 0
+ && call.Uses == 1 // this will exclude all calls with results
+ && isSameCall(sym, "runtime.memmove")
+ && isInlinableMemmove(dst, src, int64(sz), config)
+ && clobber(call)
+ => (Move {types.Types[types.TUINT8]} [int64(sz)] dst src mem)
+
+// De-virtualize late-expanded interface calls into late-expanded static calls.
+// Note that (ITab (IMake)) doesn't get rewritten until after the first opt pass,
+// so this rule should trigger reliably.
+// devirtLECall removes the first argument, adds the devirtualized symbol to the AuxCall, and changes the opcode
+(InterLECall [argsize] {auxCall} (Load (OffPtr [off] (ITab (IMake (Addr {itab} (SB)) _))) _) ___) && devirtLESym(v, auxCall, itab, off) !=
+ nil => devirtLECall(v, devirtLESym(v, auxCall, itab, off))
+
+// Move and Zero optimizations.
+// Move source and destination may overlap.
+
+// Convert Moves into Zeros when the source is known to be zeros.
+(Move {t} [n] dst1 src mem:(Zero {t} [n] dst2 _)) && isSamePtr(src, dst2)
+ => (Zero {t} [n] dst1 mem)
+(Move {t} [n] dst1 src mem:(VarDef (Zero {t} [n] dst0 _))) && isSamePtr(src, dst0)
+ => (Zero {t} [n] dst1 mem)
+(Move {t} [n] dst (Addr {sym} (SB)) mem) && symIsROZero(sym) => (Zero {t} [n] dst mem)
+
+// Don't Store to variables that are about to be overwritten by Move/Zero.
+(Zero {t1} [n] p1 store:(Store {t2} (OffPtr [o2] p2) _ mem))
+ && isSamePtr(p1, p2) && store.Uses == 1
+ && n >= o2 + t2.Size()
+ && clobber(store)
+ => (Zero {t1} [n] p1 mem)
+(Move {t1} [n] dst1 src1 store:(Store {t2} op:(OffPtr [o2] dst2) _ mem))
+ && isSamePtr(dst1, dst2) && store.Uses == 1
+ && n >= o2 + t2.Size()
+ && disjoint(src1, n, op, t2.Size())
+ && clobber(store)
+ => (Move {t1} [n] dst1 src1 mem)
+
+// Don't Move to variables that are immediately completely overwritten.
+(Zero {t} [n] dst1 move:(Move {t} [n] dst2 _ mem))
+ && move.Uses == 1
+ && isSamePtr(dst1, dst2)
+ && clobber(move)
+ => (Zero {t} [n] dst1 mem)
+(Move {t} [n] dst1 src1 move:(Move {t} [n] dst2 _ mem))
+ && move.Uses == 1
+ && isSamePtr(dst1, dst2) && disjoint(src1, n, dst2, n)
+ && clobber(move)
+ => (Move {t} [n] dst1 src1 mem)
+(Zero {t} [n] dst1 vardef:(VarDef {x} move:(Move {t} [n] dst2 _ mem)))
+ && move.Uses == 1 && vardef.Uses == 1
+ && isSamePtr(dst1, dst2)
+ && clobber(move, vardef)
+ => (Zero {t} [n] dst1 (VarDef {x} mem))
+(Move {t} [n] dst1 src1 vardef:(VarDef {x} move:(Move {t} [n] dst2 _ mem)))
+ && move.Uses == 1 && vardef.Uses == 1
+ && isSamePtr(dst1, dst2) && disjoint(src1, n, dst2, n)
+ && clobber(move, vardef)
+ => (Move {t} [n] dst1 src1 (VarDef {x} mem))
+(Store {t1} op1:(OffPtr [o1] p1) d1
+ m2:(Store {t2} op2:(OffPtr [0] p2) d2
+ m3:(Move [n] p3 _ mem)))
+ && m2.Uses == 1 && m3.Uses == 1
+ && o1 == t2.Size()
+ && n == t2.Size() + t1.Size()
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3)
+ && clobber(m2, m3)
+ => (Store {t1} op1 d1 (Store {t2} op2 d2 mem))
+(Store {t1} op1:(OffPtr [o1] p1) d1
+ m2:(Store {t2} op2:(OffPtr [o2] p2) d2
+ m3:(Store {t3} op3:(OffPtr [0] p3) d3
+ m4:(Move [n] p4 _ mem))))
+ && m2.Uses == 1 && m3.Uses == 1 && m4.Uses == 1
+ && o2 == t3.Size()
+ && o1-o2 == t2.Size()
+ && n == t3.Size() + t2.Size() + t1.Size()
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4)
+ && clobber(m2, m3, m4)
+ => (Store {t1} op1 d1 (Store {t2} op2 d2 (Store {t3} op3 d3 mem)))
+(Store {t1} op1:(OffPtr [o1] p1) d1
+ m2:(Store {t2} op2:(OffPtr [o2] p2) d2
+ m3:(Store {t3} op3:(OffPtr [o3] p3) d3
+ m4:(Store {t4} op4:(OffPtr [0] p4) d4
+ m5:(Move [n] p5 _ mem)))))
+ && m2.Uses == 1 && m3.Uses == 1 && m4.Uses == 1 && m5.Uses == 1
+ && o3 == t4.Size()
+ && o2-o3 == t3.Size()
+ && o1-o2 == t2.Size()
+ && n == t4.Size() + t3.Size() + t2.Size() + t1.Size()
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5)
+ && clobber(m2, m3, m4, m5)
+ => (Store {t1} op1 d1 (Store {t2} op2 d2 (Store {t3} op3 d3 (Store {t4} op4 d4 mem))))
+
+// Don't Zero variables that are immediately completely overwritten
+// before being accessed.
+(Move {t} [n] dst1 src1 zero:(Zero {t} [n] dst2 mem))
+ && zero.Uses == 1
+ && isSamePtr(dst1, dst2) && disjoint(src1, n, dst2, n)
+ && clobber(zero)
+ => (Move {t} [n] dst1 src1 mem)
+(Move {t} [n] dst1 src1 vardef:(VarDef {x} zero:(Zero {t} [n] dst2 mem)))
+ && zero.Uses == 1 && vardef.Uses == 1
+ && isSamePtr(dst1, dst2) && disjoint(src1, n, dst2, n)
+ && clobber(zero, vardef)
+ => (Move {t} [n] dst1 src1 (VarDef {x} mem))
+(Store {t1} op1:(OffPtr [o1] p1) d1
+ m2:(Store {t2} op2:(OffPtr [0] p2) d2
+ m3:(Zero [n] p3 mem)))
+ && m2.Uses == 1 && m3.Uses == 1
+ && o1 == t2.Size()
+ && n == t2.Size() + t1.Size()
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3)
+ && clobber(m2, m3)
+ => (Store {t1} op1 d1 (Store {t2} op2 d2 mem))
+(Store {t1} op1:(OffPtr [o1] p1) d1
+ m2:(Store {t2} op2:(OffPtr [o2] p2) d2
+ m3:(Store {t3} op3:(OffPtr [0] p3) d3
+ m4:(Zero [n] p4 mem))))
+ && m2.Uses == 1 && m3.Uses == 1 && m4.Uses == 1
+ && o2 == t3.Size()
+ && o1-o2 == t2.Size()
+ && n == t3.Size() + t2.Size() + t1.Size()
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4)
+ && clobber(m2, m3, m4)
+ => (Store {t1} op1 d1 (Store {t2} op2 d2 (Store {t3} op3 d3 mem)))
+(Store {t1} op1:(OffPtr [o1] p1) d1
+ m2:(Store {t2} op2:(OffPtr [o2] p2) d2
+ m3:(Store {t3} op3:(OffPtr [o3] p3) d3
+ m4:(Store {t4} op4:(OffPtr [0] p4) d4
+ m5:(Zero [n] p5 mem)))))
+ && m2.Uses == 1 && m3.Uses == 1 && m4.Uses == 1 && m5.Uses == 1
+ && o3 == t4.Size()
+ && o2-o3 == t3.Size()
+ && o1-o2 == t2.Size()
+ && n == t4.Size() + t3.Size() + t2.Size() + t1.Size()
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5)
+ && clobber(m2, m3, m4, m5)
+ => (Store {t1} op1 d1 (Store {t2} op2 d2 (Store {t3} op3 d3 (Store {t4} op4 d4 mem))))
+
+// Don't Move from memory if the values are likely to already be
+// in registers.
+(Move {t1} [n] dst p1
+ mem:(Store {t2} op2:(OffPtr <tt2> [o2] p2) d1
+ (Store {t3} op3:(OffPtr <tt3> [0] p3) d2 _)))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && o2 == t3.Size()
+ && n == t2.Size() + t3.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [0] dst) d2 mem))
+(Move {t1} [n] dst p1
+ mem:(Store {t2} op2:(OffPtr <tt2> [o2] p2) d1
+ (Store {t3} op3:(OffPtr <tt3> [o3] p3) d2
+ (Store {t4} op4:(OffPtr <tt4> [0] p4) d3 _))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && t4.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && registerizable(b, t4)
+ && o3 == t4.Size()
+ && o2-o3 == t3.Size()
+ && n == t2.Size() + t3.Size() + t4.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [o3] dst) d2
+ (Store {t4} (OffPtr <tt4> [0] dst) d3 mem)))
+(Move {t1} [n] dst p1
+ mem:(Store {t2} op2:(OffPtr <tt2> [o2] p2) d1
+ (Store {t3} op3:(OffPtr <tt3> [o3] p3) d2
+ (Store {t4} op4:(OffPtr <tt4> [o4] p4) d3
+ (Store {t5} op5:(OffPtr <tt5> [0] p5) d4 _)))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && t4.Alignment() <= t1.Alignment()
+ && t5.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && registerizable(b, t4)
+ && registerizable(b, t5)
+ && o4 == t5.Size()
+ && o3-o4 == t4.Size()
+ && o2-o3 == t3.Size()
+ && n == t2.Size() + t3.Size() + t4.Size() + t5.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [o3] dst) d2
+ (Store {t4} (OffPtr <tt4> [o4] dst) d3
+ (Store {t5} (OffPtr <tt5> [0] dst) d4 mem))))
+
+// Same thing but with VarDef in the middle.
+(Move {t1} [n] dst p1
+ mem:(VarDef
+ (Store {t2} op2:(OffPtr <tt2> [o2] p2) d1
+ (Store {t3} op3:(OffPtr <tt3> [0] p3) d2 _))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && o2 == t3.Size()
+ && n == t2.Size() + t3.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [0] dst) d2 mem))
+(Move {t1} [n] dst p1
+ mem:(VarDef
+ (Store {t2} op2:(OffPtr <tt2> [o2] p2) d1
+ (Store {t3} op3:(OffPtr <tt3> [o3] p3) d2
+ (Store {t4} op4:(OffPtr <tt4> [0] p4) d3 _)))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && t4.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && registerizable(b, t4)
+ && o3 == t4.Size()
+ && o2-o3 == t3.Size()
+ && n == t2.Size() + t3.Size() + t4.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [o3] dst) d2
+ (Store {t4} (OffPtr <tt4> [0] dst) d3 mem)))
+(Move {t1} [n] dst p1
+ mem:(VarDef
+ (Store {t2} op2:(OffPtr <tt2> [o2] p2) d1
+ (Store {t3} op3:(OffPtr <tt3> [o3] p3) d2
+ (Store {t4} op4:(OffPtr <tt4> [o4] p4) d3
+ (Store {t5} op5:(OffPtr <tt5> [0] p5) d4 _))))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && t4.Alignment() <= t1.Alignment()
+ && t5.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && registerizable(b, t4)
+ && registerizable(b, t5)
+ && o4 == t5.Size()
+ && o3-o4 == t4.Size()
+ && o2-o3 == t3.Size()
+ && n == t2.Size() + t3.Size() + t4.Size() + t5.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [o3] dst) d2
+ (Store {t4} (OffPtr <tt4> [o4] dst) d3
+ (Store {t5} (OffPtr <tt5> [0] dst) d4 mem))))
+
+// Prefer Zero and Store over Move.
+(Move {t1} [n] dst p1
+ mem:(Store {t2} op2:(OffPtr <tt2> [o2] p2) d1
+ (Zero {t3} [n] p3 _)))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && n >= o2 + t2.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Zero {t1} [n] dst mem))
+(Move {t1} [n] dst p1
+ mem:(Store {t2} (OffPtr <tt2> [o2] p2) d1
+ (Store {t3} (OffPtr <tt3> [o3] p3) d2
+ (Zero {t4} [n] p4 _))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && t4.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && n >= o2 + t2.Size()
+ && n >= o3 + t3.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [o3] dst) d2
+ (Zero {t1} [n] dst mem)))
+(Move {t1} [n] dst p1
+ mem:(Store {t2} (OffPtr <tt2> [o2] p2) d1
+ (Store {t3} (OffPtr <tt3> [o3] p3) d2
+ (Store {t4} (OffPtr <tt4> [o4] p4) d3
+ (Zero {t5} [n] p5 _)))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && t4.Alignment() <= t1.Alignment()
+ && t5.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && registerizable(b, t4)
+ && n >= o2 + t2.Size()
+ && n >= o3 + t3.Size()
+ && n >= o4 + t4.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [o3] dst) d2
+ (Store {t4} (OffPtr <tt4> [o4] dst) d3
+ (Zero {t1} [n] dst mem))))
+(Move {t1} [n] dst p1
+ mem:(Store {t2} (OffPtr <tt2> [o2] p2) d1
+ (Store {t3} (OffPtr <tt3> [o3] p3) d2
+ (Store {t4} (OffPtr <tt4> [o4] p4) d3
+ (Store {t5} (OffPtr <tt5> [o5] p5) d4
+ (Zero {t6} [n] p6 _))))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5) && isSamePtr(p5, p6)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && t4.Alignment() <= t1.Alignment()
+ && t5.Alignment() <= t1.Alignment()
+ && t6.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && registerizable(b, t4)
+ && registerizable(b, t5)
+ && n >= o2 + t2.Size()
+ && n >= o3 + t3.Size()
+ && n >= o4 + t4.Size()
+ && n >= o5 + t5.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [o3] dst) d2
+ (Store {t4} (OffPtr <tt4> [o4] dst) d3
+ (Store {t5} (OffPtr <tt5> [o5] dst) d4
+ (Zero {t1} [n] dst mem)))))
+(Move {t1} [n] dst p1
+ mem:(VarDef
+ (Store {t2} op2:(OffPtr <tt2> [o2] p2) d1
+ (Zero {t3} [n] p3 _))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && n >= o2 + t2.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Zero {t1} [n] dst mem))
+(Move {t1} [n] dst p1
+ mem:(VarDef
+ (Store {t2} (OffPtr <tt2> [o2] p2) d1
+ (Store {t3} (OffPtr <tt3> [o3] p3) d2
+ (Zero {t4} [n] p4 _)))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && t4.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && n >= o2 + t2.Size()
+ && n >= o3 + t3.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [o3] dst) d2
+ (Zero {t1} [n] dst mem)))
+(Move {t1} [n] dst p1
+ mem:(VarDef
+ (Store {t2} (OffPtr <tt2> [o2] p2) d1
+ (Store {t3} (OffPtr <tt3> [o3] p3) d2
+ (Store {t4} (OffPtr <tt4> [o4] p4) d3
+ (Zero {t5} [n] p5 _))))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && t4.Alignment() <= t1.Alignment()
+ && t5.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && registerizable(b, t4)
+ && n >= o2 + t2.Size()
+ && n >= o3 + t3.Size()
+ && n >= o4 + t4.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [o3] dst) d2
+ (Store {t4} (OffPtr <tt4> [o4] dst) d3
+ (Zero {t1} [n] dst mem))))
+(Move {t1} [n] dst p1
+ mem:(VarDef
+ (Store {t2} (OffPtr <tt2> [o2] p2) d1
+ (Store {t3} (OffPtr <tt3> [o3] p3) d2
+ (Store {t4} (OffPtr <tt4> [o4] p4) d3
+ (Store {t5} (OffPtr <tt5> [o5] p5) d4
+ (Zero {t6} [n] p6 _)))))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5) && isSamePtr(p5, p6)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && t4.Alignment() <= t1.Alignment()
+ && t5.Alignment() <= t1.Alignment()
+ && t6.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && registerizable(b, t4)
+ && registerizable(b, t5)
+ && n >= o2 + t2.Size()
+ && n >= o3 + t3.Size()
+ && n >= o4 + t4.Size()
+ && n >= o5 + t5.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [o3] dst) d2
+ (Store {t4} (OffPtr <tt4> [o4] dst) d3
+ (Store {t5} (OffPtr <tt5> [o5] dst) d4
+ (Zero {t1} [n] dst mem)))))
+
+(SelectN [0] call:(StaticLECall {sym} a x)) && needRaceCleanup(sym, call) && clobber(call) => x
+(SelectN [0] call:(StaticLECall {sym} x)) && needRaceCleanup(sym, call) && clobber(call) => x
+
+// When rewriting append to growslice, we use as the new length the result of
+// growslice so that we don't have to spill/restore the new length around the growslice call.
+// The exception here is that if the new length is a constant, avoiding spilling it
+// is pointless and its constantness is sometimes useful for subsequent optimizations.
+// See issue 56440.
+// Note there are 2 rules here, one for the pre-decomposed []T result and one for
+// the post-decomposed (*T,int,int) result. (The latter is generated after call expansion.)
+(SliceLen (SelectN [0] (StaticLECall {sym} _ newLen:(Const(64|32)) _ _ _ _))) && isSameCall(sym, "runtime.growslice") => newLen
+(SelectN [1] (StaticCall {sym} _ newLen:(Const(64|32)) _ _ _ _)) && v.Type.IsInteger() && isSameCall(sym, "runtime.growslice") => newLen
+
+// Collapse moving A -> B -> C into just A -> C.
+// Later passes (deadstore, elim unread auto) will remove the A -> B move, if possible.
+// This happens most commonly when B is an autotmp inserted earlier
+// during compilation to ensure correctness.
+// Take care that overlapping moves are preserved.
+// Restrict this optimization to the stack, to avoid duplicating loads from the heap;
+// see CL 145208 for discussion.
+(Move {t1} [s] dst tmp1 midmem:(Move {t2} [s] tmp2 src _))
+ && t1.Compare(t2) == types.CMPeq
+ && isSamePtr(tmp1, tmp2)
+ && isStackPtr(src) && !isVolatile(src)
+ && disjoint(src, s, tmp2, s)
+ && (disjoint(src, s, dst, s) || isInlinableMemmove(dst, src, s, config))
+ => (Move {t1} [s] dst src midmem)
+
+// Same, but for large types that require VarDefs.
+(Move {t1} [s] dst tmp1 midmem:(VarDef (Move {t2} [s] tmp2 src _)))
+ && t1.Compare(t2) == types.CMPeq
+ && isSamePtr(tmp1, tmp2)
+ && isStackPtr(src) && !isVolatile(src)
+ && disjoint(src, s, tmp2, s)
+ && (disjoint(src, s, dst, s) || isInlinableMemmove(dst, src, s, config))
+ => (Move {t1} [s] dst src midmem)
+
+// Don't zero the same bits twice.
+(Zero {t} [s] dst1 zero:(Zero {t} [s] dst2 _)) && isSamePtr(dst1, dst2) => zero
+(Zero {t} [s] dst1 vardef:(VarDef (Zero {t} [s] dst2 _))) && isSamePtr(dst1, dst2) => vardef
+
+// Elide self-moves. This only happens rarely (e.g. test/fixedbugs/bug277.go).
+// However, this rule is needed to prevent the previous rule from looping forever in such cases.
+(Move dst src mem) && isSamePtr(dst, src) => mem
+
+// Constant rotate detection.
+((Add64|Or64|Xor64) (Lsh64x64 x z:(Const64 <t> [c])) (Rsh64Ux64 x (Const64 [d]))) && c < 64 && d == 64-c && canRotate(config, 64) => (RotateLeft64 x z)
+((Add32|Or32|Xor32) (Lsh32x64 x z:(Const64 <t> [c])) (Rsh32Ux64 x (Const64 [d]))) && c < 32 && d == 32-c && canRotate(config, 32) => (RotateLeft32 x z)
+((Add16|Or16|Xor16) (Lsh16x64 x z:(Const64 <t> [c])) (Rsh16Ux64 x (Const64 [d]))) && c < 16 && d == 16-c && canRotate(config, 16) => (RotateLeft16 x z)
+((Add8|Or8|Xor8) (Lsh8x64 x z:(Const64 <t> [c])) (Rsh8Ux64 x (Const64 [d]))) && c < 8 && d == 8-c && canRotate(config, 8) => (RotateLeft8 x z)
+
+// Non-constant rotate detection.
+// We use shiftIsBounded to make sure that neither of the shifts is >64.
+// Note: these rules are subtle when the shift amounts are 0/64, as Go shifts
+// are different from most native shifts. But it works out.
+((Add64|Or64|Xor64) left:(Lsh64x64 x y) right:(Rsh64Ux64 x (Sub64 (Const64 [64]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 64) => (RotateLeft64 x y)
+((Add64|Or64|Xor64) left:(Lsh64x32 x y) right:(Rsh64Ux32 x (Sub32 (Const32 [64]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 64) => (RotateLeft64 x y)
+((Add64|Or64|Xor64) left:(Lsh64x16 x y) right:(Rsh64Ux16 x (Sub16 (Const16 [64]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 64) => (RotateLeft64 x y)
+((Add64|Or64|Xor64) left:(Lsh64x8 x y) right:(Rsh64Ux8 x (Sub8 (Const8 [64]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 64) => (RotateLeft64 x y)
+
+((Add64|Or64|Xor64) right:(Rsh64Ux64 x y) left:(Lsh64x64 x z:(Sub64 (Const64 [64]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 64) => (RotateLeft64 x z)
+((Add64|Or64|Xor64) right:(Rsh64Ux32 x y) left:(Lsh64x32 x z:(Sub32 (Const32 [64]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 64) => (RotateLeft64 x z)
+((Add64|Or64|Xor64) right:(Rsh64Ux16 x y) left:(Lsh64x16 x z:(Sub16 (Const16 [64]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 64) => (RotateLeft64 x z)
+((Add64|Or64|Xor64) right:(Rsh64Ux8 x y) left:(Lsh64x8 x z:(Sub8 (Const8 [64]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 64) => (RotateLeft64 x z)
+
+((Add32|Or32|Xor32) left:(Lsh32x64 x y) right:(Rsh32Ux64 x (Sub64 (Const64 [32]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 32) => (RotateLeft32 x y)
+((Add32|Or32|Xor32) left:(Lsh32x32 x y) right:(Rsh32Ux32 x (Sub32 (Const32 [32]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 32) => (RotateLeft32 x y)
+((Add32|Or32|Xor32) left:(Lsh32x16 x y) right:(Rsh32Ux16 x (Sub16 (Const16 [32]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 32) => (RotateLeft32 x y)
+((Add32|Or32|Xor32) left:(Lsh32x8 x y) right:(Rsh32Ux8 x (Sub8 (Const8 [32]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 32) => (RotateLeft32 x y)
+
+((Add32|Or32|Xor32) right:(Rsh32Ux64 x y) left:(Lsh32x64 x z:(Sub64 (Const64 [32]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 32) => (RotateLeft32 x z)
+((Add32|Or32|Xor32) right:(Rsh32Ux32 x y) left:(Lsh32x32 x z:(Sub32 (Const32 [32]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 32) => (RotateLeft32 x z)
+((Add32|Or32|Xor32) right:(Rsh32Ux16 x y) left:(Lsh32x16 x z:(Sub16 (Const16 [32]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 32) => (RotateLeft32 x z)
+((Add32|Or32|Xor32) right:(Rsh32Ux8 x y) left:(Lsh32x8 x z:(Sub8 (Const8 [32]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 32) => (RotateLeft32 x z)
+
+((Add16|Or16|Xor16) left:(Lsh16x64 x y) right:(Rsh16Ux64 x (Sub64 (Const64 [16]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 16) => (RotateLeft16 x y)
+((Add16|Or16|Xor16) left:(Lsh16x32 x y) right:(Rsh16Ux32 x (Sub32 (Const32 [16]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 16) => (RotateLeft16 x y)
+((Add16|Or16|Xor16) left:(Lsh16x16 x y) right:(Rsh16Ux16 x (Sub16 (Const16 [16]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 16) => (RotateLeft16 x y)
+((Add16|Or16|Xor16) left:(Lsh16x8 x y) right:(Rsh16Ux8 x (Sub8 (Const8 [16]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 16) => (RotateLeft16 x y)
+
+((Add16|Or16|Xor16) right:(Rsh16Ux64 x y) left:(Lsh16x64 x z:(Sub64 (Const64 [16]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 16) => (RotateLeft16 x z)
+((Add16|Or16|Xor16) right:(Rsh16Ux32 x y) left:(Lsh16x32 x z:(Sub32 (Const32 [16]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 16) => (RotateLeft16 x z)
+((Add16|Or16|Xor16) right:(Rsh16Ux16 x y) left:(Lsh16x16 x z:(Sub16 (Const16 [16]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 16) => (RotateLeft16 x z)
+((Add16|Or16|Xor16) right:(Rsh16Ux8 x y) left:(Lsh16x8 x z:(Sub8 (Const8 [16]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 16) => (RotateLeft16 x z)
+
+((Add8|Or8|Xor8) left:(Lsh8x64 x y) right:(Rsh8Ux64 x (Sub64 (Const64 [8]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 8) => (RotateLeft8 x y)
+((Add8|Or8|Xor8) left:(Lsh8x32 x y) right:(Rsh8Ux32 x (Sub32 (Const32 [8]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 8) => (RotateLeft8 x y)
+((Add8|Or8|Xor8) left:(Lsh8x16 x y) right:(Rsh8Ux16 x (Sub16 (Const16 [8]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 8) => (RotateLeft8 x y)
+((Add8|Or8|Xor8) left:(Lsh8x8 x y) right:(Rsh8Ux8 x (Sub8 (Const8 [8]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 8) => (RotateLeft8 x y)
+
+((Add8|Or8|Xor8) right:(Rsh8Ux64 x y) left:(Lsh8x64 x z:(Sub64 (Const64 [8]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 8) => (RotateLeft8 x z)
+((Add8|Or8|Xor8) right:(Rsh8Ux32 x y) left:(Lsh8x32 x z:(Sub32 (Const32 [8]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 8) => (RotateLeft8 x z)
+((Add8|Or8|Xor8) right:(Rsh8Ux16 x y) left:(Lsh8x16 x z:(Sub16 (Const16 [8]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 8) => (RotateLeft8 x z)
+((Add8|Or8|Xor8) right:(Rsh8Ux8 x y) left:(Lsh8x8 x z:(Sub8 (Const8 [8]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 8) => (RotateLeft8 x z)
+
+// Rotating by y&c, with c a mask that doesn't change the bottom bits, is the same as rotating by y.
+(RotateLeft64 x (And(64|32|16|8) y (Const(64|32|16|8) [c]))) && c&63 == 63 => (RotateLeft64 x y)
+(RotateLeft32 x (And(64|32|16|8) y (Const(64|32|16|8) [c]))) && c&31 == 31 => (RotateLeft32 x y)
+(RotateLeft16 x (And(64|32|16|8) y (Const(64|32|16|8) [c]))) && c&15 == 15 => (RotateLeft16 x y)
+(RotateLeft8 x (And(64|32|16|8) y (Const(64|32|16|8) [c]))) && c&7 == 7 => (RotateLeft8 x y)
+
+// Rotating by -(y&c), with c a mask that doesn't change the bottom bits, is the same as rotating by -y.
+(RotateLeft64 x (Neg(64|32|16|8) (And(64|32|16|8) y (Const(64|32|16|8) [c])))) && c&63 == 63 => (RotateLeft64 x (Neg(64|32|16|8) <y.Type> y))
+(RotateLeft32 x (Neg(64|32|16|8) (And(64|32|16|8) y (Const(64|32|16|8) [c])))) && c&31 == 31 => (RotateLeft32 x (Neg(64|32|16|8) <y.Type> y))
+(RotateLeft16 x (Neg(64|32|16|8) (And(64|32|16|8) y (Const(64|32|16|8) [c])))) && c&15 == 15 => (RotateLeft16 x (Neg(64|32|16|8) <y.Type> y))
+(RotateLeft8 x (Neg(64|32|16|8) (And(64|32|16|8) y (Const(64|32|16|8) [c])))) && c&7 == 7 => (RotateLeft8 x (Neg(64|32|16|8) <y.Type> y))
+
+// Rotating by y+c, with c a multiple of the value width, is the same as rotating by y.
+(RotateLeft64 x (Add(64|32|16|8) y (Const(64|32|16|8) [c]))) && c&63 == 0 => (RotateLeft64 x y)
+(RotateLeft32 x (Add(64|32|16|8) y (Const(64|32|16|8) [c]))) && c&31 == 0 => (RotateLeft32 x y)
+(RotateLeft16 x (Add(64|32|16|8) y (Const(64|32|16|8) [c]))) && c&15 == 0 => (RotateLeft16 x y)
+(RotateLeft8 x (Add(64|32|16|8) y (Const(64|32|16|8) [c]))) && c&7 == 0 => (RotateLeft8 x y)
+
+// Rotating by c-y, with c a multiple of the value width, is the same as rotating by -y.
+(RotateLeft64 x (Sub(64|32|16|8) (Const(64|32|16|8) [c]) y)) && c&63 == 0 => (RotateLeft64 x (Neg(64|32|16|8) <y.Type> y))
+(RotateLeft32 x (Sub(64|32|16|8) (Const(64|32|16|8) [c]) y)) && c&31 == 0 => (RotateLeft32 x (Neg(64|32|16|8) <y.Type> y))
+(RotateLeft16 x (Sub(64|32|16|8) (Const(64|32|16|8) [c]) y)) && c&15 == 0 => (RotateLeft16 x (Neg(64|32|16|8) <y.Type> y))
+(RotateLeft8 x (Sub(64|32|16|8) (Const(64|32|16|8) [c]) y)) && c&7 == 0 => (RotateLeft8 x (Neg(64|32|16|8) <y.Type> y))
+
+// Ensure we don't do Const64 rotates on a 32-bit system.
+(RotateLeft64 x (Const64 <t> [c])) && config.PtrSize == 4 => (RotateLeft64 x (Const32 <t> [int32(c)]))
+(RotateLeft32 x (Const64 <t> [c])) && config.PtrSize == 4 => (RotateLeft32 x (Const32 <t> [int32(c)]))
+(RotateLeft16 x (Const64 <t> [c])) && config.PtrSize == 4 => (RotateLeft16 x (Const32 <t> [int32(c)]))
+(RotateLeft8 x (Const64 <t> [c])) && config.PtrSize == 4 => (RotateLeft8 x (Const32 <t> [int32(c)]))
+
+// Rotating by c, then by d, is the same as rotating by c+d.
+// We're trading a rotate for an add, which seems generally a good choice. It is especially good when c and d are constants.
+// This rule is a bit tricky as c and d might be different widths. We handle only cases where they are the same width.
+(RotateLeft(64|32|16|8) (RotateLeft(64|32|16|8) x c) d) && c.Type.Size() == 8 && d.Type.Size() == 8 => (RotateLeft(64|32|16|8) x (Add64 <c.Type> c d))
+(RotateLeft(64|32|16|8) (RotateLeft(64|32|16|8) x c) d) && c.Type.Size() == 4 && d.Type.Size() == 4 => (RotateLeft(64|32|16|8) x (Add32 <c.Type> c d))
+(RotateLeft(64|32|16|8) (RotateLeft(64|32|16|8) x c) d) && c.Type.Size() == 2 && d.Type.Size() == 2 => (RotateLeft(64|32|16|8) x (Add16 <c.Type> c d))
+(RotateLeft(64|32|16|8) (RotateLeft(64|32|16|8) x c) d) && c.Type.Size() == 1 && d.Type.Size() == 1 => (RotateLeft(64|32|16|8) x (Add8 <c.Type> c d))
diff --git a/src/cmd/compile/internal/ssa/_gen/genericOps.go b/src/cmd/compile/internal/ssa/_gen/genericOps.go
new file mode 100644
index 0000000..a4c8fc9
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/genericOps.go
@@ -0,0 +1,664 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+// Generic opcodes typically specify a width. The inputs and outputs
+// of that op are the given number of bits wide. There is no notion of
+// "sign", so Add32 can be used both for signed and unsigned 32-bit
+// addition.
+
+// Signed/unsigned is explicit with the extension ops
+// (SignExt*/ZeroExt*) and implicit as the arg to some opcodes
+// (e.g. the second argument to shifts is unsigned). If not mentioned,
+// all args take signed inputs, or don't care whether their inputs
+// are signed or unsigned.
+
+var genericOps = []opData{
+ // 2-input arithmetic
+ // Types must be consistent with Go typing. Add, for example, must take two values
+ // of the same type and produces that same type.
+ {name: "Add8", argLength: 2, commutative: true}, // arg0 + arg1
+ {name: "Add16", argLength: 2, commutative: true},
+ {name: "Add32", argLength: 2, commutative: true},
+ {name: "Add64", argLength: 2, commutative: true},
+ {name: "AddPtr", argLength: 2}, // For address calculations. arg0 is a pointer and arg1 is an int.
+ {name: "Add32F", argLength: 2, commutative: true},
+ {name: "Add64F", argLength: 2, commutative: true},
+
+ {name: "Sub8", argLength: 2}, // arg0 - arg1
+ {name: "Sub16", argLength: 2},
+ {name: "Sub32", argLength: 2},
+ {name: "Sub64", argLength: 2},
+ {name: "SubPtr", argLength: 2},
+ {name: "Sub32F", argLength: 2},
+ {name: "Sub64F", argLength: 2},
+
+ {name: "Mul8", argLength: 2, commutative: true}, // arg0 * arg1
+ {name: "Mul16", argLength: 2, commutative: true},
+ {name: "Mul32", argLength: 2, commutative: true},
+ {name: "Mul64", argLength: 2, commutative: true},
+ {name: "Mul32F", argLength: 2, commutative: true},
+ {name: "Mul64F", argLength: 2, commutative: true},
+
+ {name: "Div32F", argLength: 2}, // arg0 / arg1
+ {name: "Div64F", argLength: 2},
+
+ {name: "Hmul32", argLength: 2, commutative: true},
+ {name: "Hmul32u", argLength: 2, commutative: true},
+ {name: "Hmul64", argLength: 2, commutative: true},
+ {name: "Hmul64u", argLength: 2, commutative: true},
+
+ {name: "Mul32uhilo", argLength: 2, typ: "(UInt32,UInt32)", commutative: true}, // arg0 * arg1, returns (hi, lo)
+ {name: "Mul64uhilo", argLength: 2, typ: "(UInt64,UInt64)", commutative: true}, // arg0 * arg1, returns (hi, lo)
+
+ {name: "Mul32uover", argLength: 2, typ: "(UInt32,Bool)", commutative: true}, // Let x = arg0*arg1 (full 32x32-> 64 unsigned multiply), returns (uint32(x), (uint32(x) != x))
+ {name: "Mul64uover", argLength: 2, typ: "(UInt64,Bool)", commutative: true}, // Let x = arg0*arg1 (full 64x64->128 unsigned multiply), returns (uint64(x), (uint64(x) != x))
+
+ // Weird special instructions for use in the strength reduction of divides.
+ // These ops compute unsigned (arg0 + arg1) / 2, correct to all
+ // 32/64 bits, even when the intermediate result of the add has 33/65 bits.
+ // These ops can assume arg0 >= arg1.
+ // Note: these ops aren't commutative!
+ {name: "Avg32u", argLength: 2, typ: "UInt32"}, // 32-bit platforms only
+ {name: "Avg64u", argLength: 2, typ: "UInt64"}, // 64-bit platforms only
+
+ // For Div16, Div32 and Div64, AuxInt non-zero means that the divisor has been proved to be not -1
+ // or that the dividend is not the most negative value.
+ {name: "Div8", argLength: 2}, // arg0 / arg1, signed
+ {name: "Div8u", argLength: 2}, // arg0 / arg1, unsigned
+ {name: "Div16", argLength: 2, aux: "Bool"},
+ {name: "Div16u", argLength: 2},
+ {name: "Div32", argLength: 2, aux: "Bool"},
+ {name: "Div32u", argLength: 2},
+ {name: "Div64", argLength: 2, aux: "Bool"},
+ {name: "Div64u", argLength: 2},
+ {name: "Div128u", argLength: 3}, // arg0:arg1 / arg2 (128-bit divided by 64-bit), returns (q, r)
+
+ // For Mod16, Mod32 and Mod64, AuxInt non-zero means that the divisor has been proved to be not -1.
+ {name: "Mod8", argLength: 2}, // arg0 % arg1, signed
+ {name: "Mod8u", argLength: 2}, // arg0 % arg1, unsigned
+ {name: "Mod16", argLength: 2, aux: "Bool"},
+ {name: "Mod16u", argLength: 2},
+ {name: "Mod32", argLength: 2, aux: "Bool"},
+ {name: "Mod32u", argLength: 2},
+ {name: "Mod64", argLength: 2, aux: "Bool"},
+ {name: "Mod64u", argLength: 2},
+
+ {name: "And8", argLength: 2, commutative: true}, // arg0 & arg1
+ {name: "And16", argLength: 2, commutative: true},
+ {name: "And32", argLength: 2, commutative: true},
+ {name: "And64", argLength: 2, commutative: true},
+
+ {name: "Or8", argLength: 2, commutative: true}, // arg0 | arg1
+ {name: "Or16", argLength: 2, commutative: true},
+ {name: "Or32", argLength: 2, commutative: true},
+ {name: "Or64", argLength: 2, commutative: true},
+
+ {name: "Xor8", argLength: 2, commutative: true}, // arg0 ^ arg1
+ {name: "Xor16", argLength: 2, commutative: true},
+ {name: "Xor32", argLength: 2, commutative: true},
+ {name: "Xor64", argLength: 2, commutative: true},
+
+ // For shifts, AxB means the shifted value has A bits and the shift amount has B bits.
+ // Shift amounts are considered unsigned.
+ // If arg1 is known to be nonnegative and less than the number of bits in arg0,
+ // then auxInt may be set to 1.
+ // This enables better code generation on some platforms.
+ {name: "Lsh8x8", argLength: 2, aux: "Bool"}, // arg0 << arg1
+ {name: "Lsh8x16", argLength: 2, aux: "Bool"},
+ {name: "Lsh8x32", argLength: 2, aux: "Bool"},
+ {name: "Lsh8x64", argLength: 2, aux: "Bool"},
+ {name: "Lsh16x8", argLength: 2, aux: "Bool"},
+ {name: "Lsh16x16", argLength: 2, aux: "Bool"},
+ {name: "Lsh16x32", argLength: 2, aux: "Bool"},
+ {name: "Lsh16x64", argLength: 2, aux: "Bool"},
+ {name: "Lsh32x8", argLength: 2, aux: "Bool"},
+ {name: "Lsh32x16", argLength: 2, aux: "Bool"},
+ {name: "Lsh32x32", argLength: 2, aux: "Bool"},
+ {name: "Lsh32x64", argLength: 2, aux: "Bool"},
+ {name: "Lsh64x8", argLength: 2, aux: "Bool"},
+ {name: "Lsh64x16", argLength: 2, aux: "Bool"},
+ {name: "Lsh64x32", argLength: 2, aux: "Bool"},
+ {name: "Lsh64x64", argLength: 2, aux: "Bool"},
+
+ {name: "Rsh8x8", argLength: 2, aux: "Bool"}, // arg0 >> arg1, signed
+ {name: "Rsh8x16", argLength: 2, aux: "Bool"},
+ {name: "Rsh8x32", argLength: 2, aux: "Bool"},
+ {name: "Rsh8x64", argLength: 2, aux: "Bool"},
+ {name: "Rsh16x8", argLength: 2, aux: "Bool"},
+ {name: "Rsh16x16", argLength: 2, aux: "Bool"},
+ {name: "Rsh16x32", argLength: 2, aux: "Bool"},
+ {name: "Rsh16x64", argLength: 2, aux: "Bool"},
+ {name: "Rsh32x8", argLength: 2, aux: "Bool"},
+ {name: "Rsh32x16", argLength: 2, aux: "Bool"},
+ {name: "Rsh32x32", argLength: 2, aux: "Bool"},
+ {name: "Rsh32x64", argLength: 2, aux: "Bool"},
+ {name: "Rsh64x8", argLength: 2, aux: "Bool"},
+ {name: "Rsh64x16", argLength: 2, aux: "Bool"},
+ {name: "Rsh64x32", argLength: 2, aux: "Bool"},
+ {name: "Rsh64x64", argLength: 2, aux: "Bool"},
+
+ {name: "Rsh8Ux8", argLength: 2, aux: "Bool"}, // arg0 >> arg1, unsigned
+ {name: "Rsh8Ux16", argLength: 2, aux: "Bool"},
+ {name: "Rsh8Ux32", argLength: 2, aux: "Bool"},
+ {name: "Rsh8Ux64", argLength: 2, aux: "Bool"},
+ {name: "Rsh16Ux8", argLength: 2, aux: "Bool"},
+ {name: "Rsh16Ux16", argLength: 2, aux: "Bool"},
+ {name: "Rsh16Ux32", argLength: 2, aux: "Bool"},
+ {name: "Rsh16Ux64", argLength: 2, aux: "Bool"},
+ {name: "Rsh32Ux8", argLength: 2, aux: "Bool"},
+ {name: "Rsh32Ux16", argLength: 2, aux: "Bool"},
+ {name: "Rsh32Ux32", argLength: 2, aux: "Bool"},
+ {name: "Rsh32Ux64", argLength: 2, aux: "Bool"},
+ {name: "Rsh64Ux8", argLength: 2, aux: "Bool"},
+ {name: "Rsh64Ux16", argLength: 2, aux: "Bool"},
+ {name: "Rsh64Ux32", argLength: 2, aux: "Bool"},
+ {name: "Rsh64Ux64", argLength: 2, aux: "Bool"},
+
+ // 2-input comparisons
+ {name: "Eq8", argLength: 2, commutative: true, typ: "Bool"}, // arg0 == arg1
+ {name: "Eq16", argLength: 2, commutative: true, typ: "Bool"},
+ {name: "Eq32", argLength: 2, commutative: true, typ: "Bool"},
+ {name: "Eq64", argLength: 2, commutative: true, typ: "Bool"},
+ {name: "EqPtr", argLength: 2, commutative: true, typ: "Bool"},
+ {name: "EqInter", argLength: 2, typ: "Bool"}, // arg0 or arg1 is nil; other cases handled by frontend
+ {name: "EqSlice", argLength: 2, typ: "Bool"}, // arg0 or arg1 is nil; other cases handled by frontend
+ {name: "Eq32F", argLength: 2, commutative: true, typ: "Bool"},
+ {name: "Eq64F", argLength: 2, commutative: true, typ: "Bool"},
+
+ {name: "Neq8", argLength: 2, commutative: true, typ: "Bool"}, // arg0 != arg1
+ {name: "Neq16", argLength: 2, commutative: true, typ: "Bool"},
+ {name: "Neq32", argLength: 2, commutative: true, typ: "Bool"},
+ {name: "Neq64", argLength: 2, commutative: true, typ: "Bool"},
+ {name: "NeqPtr", argLength: 2, commutative: true, typ: "Bool"},
+ {name: "NeqInter", argLength: 2, typ: "Bool"}, // arg0 or arg1 is nil; other cases handled by frontend
+ {name: "NeqSlice", argLength: 2, typ: "Bool"}, // arg0 or arg1 is nil; other cases handled by frontend
+ {name: "Neq32F", argLength: 2, commutative: true, typ: "Bool"},
+ {name: "Neq64F", argLength: 2, commutative: true, typ: "Bool"},
+
+ {name: "Less8", argLength: 2, typ: "Bool"}, // arg0 < arg1, signed
+ {name: "Less8U", argLength: 2, typ: "Bool"}, // arg0 < arg1, unsigned
+ {name: "Less16", argLength: 2, typ: "Bool"},
+ {name: "Less16U", argLength: 2, typ: "Bool"},
+ {name: "Less32", argLength: 2, typ: "Bool"},
+ {name: "Less32U", argLength: 2, typ: "Bool"},
+ {name: "Less64", argLength: 2, typ: "Bool"},
+ {name: "Less64U", argLength: 2, typ: "Bool"},
+ {name: "Less32F", argLength: 2, typ: "Bool"},
+ {name: "Less64F", argLength: 2, typ: "Bool"},
+
+ {name: "Leq8", argLength: 2, typ: "Bool"}, // arg0 <= arg1, signed
+ {name: "Leq8U", argLength: 2, typ: "Bool"}, // arg0 <= arg1, unsigned
+ {name: "Leq16", argLength: 2, typ: "Bool"},
+ {name: "Leq16U", argLength: 2, typ: "Bool"},
+ {name: "Leq32", argLength: 2, typ: "Bool"},
+ {name: "Leq32U", argLength: 2, typ: "Bool"},
+ {name: "Leq64", argLength: 2, typ: "Bool"},
+ {name: "Leq64U", argLength: 2, typ: "Bool"},
+ {name: "Leq32F", argLength: 2, typ: "Bool"},
+ {name: "Leq64F", argLength: 2, typ: "Bool"},
+
+ // the type of a CondSelect is the same as the type of its first
+ // two arguments, which should be register-width scalars; the third
+ // argument should be a boolean
+ {name: "CondSelect", argLength: 3}, // arg2 ? arg0 : arg1
+
+ // boolean ops
+ {name: "AndB", argLength: 2, commutative: true, typ: "Bool"}, // arg0 && arg1 (not shortcircuited)
+ {name: "OrB", argLength: 2, commutative: true, typ: "Bool"}, // arg0 || arg1 (not shortcircuited)
+ {name: "EqB", argLength: 2, commutative: true, typ: "Bool"}, // arg0 == arg1
+ {name: "NeqB", argLength: 2, commutative: true, typ: "Bool"}, // arg0 != arg1
+ {name: "Not", argLength: 1, typ: "Bool"}, // !arg0, boolean
+
+ // 1-input ops
+ {name: "Neg8", argLength: 1}, // -arg0
+ {name: "Neg16", argLength: 1},
+ {name: "Neg32", argLength: 1},
+ {name: "Neg64", argLength: 1},
+ {name: "Neg32F", argLength: 1},
+ {name: "Neg64F", argLength: 1},
+
+ {name: "Com8", argLength: 1}, // ^arg0
+ {name: "Com16", argLength: 1},
+ {name: "Com32", argLength: 1},
+ {name: "Com64", argLength: 1},
+
+ {name: "Ctz8", argLength: 1}, // Count trailing (low order) zeroes (returns 0-8)
+ {name: "Ctz16", argLength: 1}, // Count trailing (low order) zeroes (returns 0-16)
+ {name: "Ctz32", argLength: 1}, // Count trailing (low order) zeroes (returns 0-32)
+ {name: "Ctz64", argLength: 1}, // Count trailing (low order) zeroes (returns 0-64)
+ {name: "Ctz8NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-7
+ {name: "Ctz16NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-15
+ {name: "Ctz32NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-31
+ {name: "Ctz64NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-63
+ {name: "BitLen8", argLength: 1}, // Number of bits in arg[0] (returns 0-8)
+ {name: "BitLen16", argLength: 1}, // Number of bits in arg[0] (returns 0-16)
+ {name: "BitLen32", argLength: 1}, // Number of bits in arg[0] (returns 0-32)
+ {name: "BitLen64", argLength: 1}, // Number of bits in arg[0] (returns 0-64)
+
+ {name: "Bswap32", argLength: 1}, // Swap bytes
+ {name: "Bswap64", argLength: 1}, // Swap bytes
+
+ {name: "BitRev8", argLength: 1}, // Reverse the bits in arg[0]
+ {name: "BitRev16", argLength: 1}, // Reverse the bits in arg[0]
+ {name: "BitRev32", argLength: 1}, // Reverse the bits in arg[0]
+ {name: "BitRev64", argLength: 1}, // Reverse the bits in arg[0]
+
+ {name: "PopCount8", argLength: 1}, // Count bits in arg[0]
+ {name: "PopCount16", argLength: 1}, // Count bits in arg[0]
+ {name: "PopCount32", argLength: 1}, // Count bits in arg[0]
+ {name: "PopCount64", argLength: 1}, // Count bits in arg[0]
+
+ // RotateLeftX instructions rotate the X bits of arg[0] to the left
+ // by the low lg_2(X) bits of arg[1], interpreted as an unsigned value.
+ // Note that this works out regardless of the bit width or signedness of
+ // arg[1]. In particular, RotateLeft by x is the same as RotateRight by -x.
+ {name: "RotateLeft64", argLength: 2},
+ {name: "RotateLeft32", argLength: 2},
+ {name: "RotateLeft16", argLength: 2},
+ {name: "RotateLeft8", argLength: 2},
+
+ // Square root.
+ // Special cases:
+ // +∞ → +∞
+ // ±0 → ±0 (sign preserved)
+ // x<0 → NaN
+ // NaN → NaN
+ {name: "Sqrt", argLength: 1}, // √arg0 (floating point, double precision)
+ {name: "Sqrt32", argLength: 1}, // √arg0 (floating point, single precision)
+
+ // Round to integer, float64 only.
+ // Special cases:
+ // ±∞ → ±∞ (sign preserved)
+ // ±0 → ±0 (sign preserved)
+ // NaN → NaN
+ {name: "Floor", argLength: 1}, // round arg0 toward -∞
+ {name: "Ceil", argLength: 1}, // round arg0 toward +∞
+ {name: "Trunc", argLength: 1}, // round arg0 toward 0
+ {name: "Round", argLength: 1}, // round arg0 to nearest, ties away from 0
+ {name: "RoundToEven", argLength: 1}, // round arg0 to nearest, ties to even
+
+ // Modify the sign bit
+ {name: "Abs", argLength: 1}, // absolute value arg0
+ {name: "Copysign", argLength: 2}, // copy sign from arg0 to arg1
+
+ // 3-input opcode.
+ // Fused-multiply-add, float64 only.
+ // When a*b+c is exactly zero (before rounding), then the result is +0 or -0.
+ // The 0's sign is determined according to the standard rules for the
+ // addition (-0 if both a*b and c are -0, +0 otherwise).
+ //
+ // Otherwise, when a*b+c rounds to zero, then the resulting 0's sign is
+ // determined by the sign of the exact result a*b+c.
+ // See section 6.3 in ieee754.
+ //
+ // When the multiply is an infinity times a zero, the result is NaN.
+ // See section 7.2 in ieee754.
+ {name: "FMA", argLength: 3}, // compute (a*b)+c without intermediate rounding
+
+ // Data movement. Max argument length for Phi is indefinite.
+ {name: "Phi", argLength: -1, zeroWidth: true}, // select an argument based on which predecessor block we came from
+ {name: "Copy", argLength: 1}, // output = arg0
+ // Convert converts between pointers and integers.
+ // We have a special op for this so as to not confuse GC
+ // (particularly stack maps). It takes a memory arg so it
+ // gets correctly ordered with respect to GC safepoints.
+ // It gets compiled to nothing, so its result must be in the same
+ // register as its argument. regalloc knows it can use any
+ // allocatable integer register for OpConvert.
+ // arg0=ptr/int arg1=mem, output=int/ptr
+ {name: "Convert", argLength: 2, zeroWidth: true, resultInArg0: true},
+
+ // constants. Constant values are stored in the aux or
+ // auxint fields.
+ {name: "ConstBool", aux: "Bool"}, // auxint is 0 for false and 1 for true
+ {name: "ConstString", aux: "String"}, // value is aux.(string)
+ {name: "ConstNil", typ: "BytePtr"}, // nil pointer
+ {name: "Const8", aux: "Int8"}, // auxint is sign-extended 8 bits
+ {name: "Const16", aux: "Int16"}, // auxint is sign-extended 16 bits
+ {name: "Const32", aux: "Int32"}, // auxint is sign-extended 32 bits
+ // Note: ConstX are sign-extended even when the type of the value is unsigned.
+ // For instance, uint8(0xaa) is stored as auxint=0xffffffffffffffaa.
+ {name: "Const64", aux: "Int64"}, // value is auxint
+ // Note: for both Const32F and Const64F, we disallow encoding NaNs.
+ // Signaling NaNs are tricky because if you do anything with them, they become quiet.
+ // Particularly, converting a 32 bit sNaN to 64 bit and back converts it to a qNaN.
+ // See issue 36399 and 36400.
+ // Encodings of +inf, -inf, and -0 are fine.
+ {name: "Const32F", aux: "Float32"}, // value is math.Float64frombits(uint64(auxint)) and is exactly representable as float 32
+ {name: "Const64F", aux: "Float64"}, // value is math.Float64frombits(uint64(auxint))
+ {name: "ConstInterface"}, // nil interface
+ {name: "ConstSlice"}, // nil slice
+
+ // Constant-like things
+ {name: "InitMem", zeroWidth: true}, // memory input to the function.
+ {name: "Arg", aux: "SymOff", symEffect: "Read", zeroWidth: true}, // argument to the function. aux=GCNode of arg, off = offset in that arg.
+
+ // Like Arg, these are generic ops that survive lowering. AuxInt is a register index, and the actual output register for each index is defined by the architecture.
+ // AuxInt = integer argument index (not a register number). The ABI-specified spill location is obtained from the function.
+ {name: "ArgIntReg", aux: "NameOffsetInt8", zeroWidth: true}, // argument to the function in an int reg.
+ {name: "ArgFloatReg", aux: "NameOffsetInt8", zeroWidth: true}, // argument to the function in a float reg.
+
+ // The address of a variable. arg0 is the base pointer.
+ // If the variable is a global, the base pointer will be SB and
+ // the Aux field will be a *obj.LSym.
+ // If the variable is a local, the base pointer will be SP and
+ // the Aux field will be a *gc.Node.
+ {name: "Addr", argLength: 1, aux: "Sym", symEffect: "Addr"}, // Address of a variable. Arg0=SB. Aux identifies the variable.
+ {name: "LocalAddr", argLength: 2, aux: "Sym", symEffect: "Addr"}, // Address of a variable. Arg0=SP. Arg1=mem. Aux identifies the variable.
+
+ {name: "SP", zeroWidth: true}, // stack pointer
+ {name: "SB", typ: "Uintptr", zeroWidth: true}, // static base pointer (a.k.a. globals pointer)
+ {name: "Invalid"}, // unused value
+
+ // Memory operations
+ {name: "Load", argLength: 2}, // Load from arg0. arg1=memory
+ {name: "Dereference", argLength: 2}, // Load from arg0. arg1=memory. Helper op for arg/result passing, result is an otherwise not-SSA-able "value".
+ {name: "Store", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
+ // Normally we require that the source and destination of Move do not overlap.
+ // There is an exception when we know all the loads will happen before all
+ // the stores. In that case, overlap is ok. See
+ // memmove inlining in generic.rules. When inlineablememmovesize (in ../rewrite.go)
+ // returns true, we must do all loads before all stores, when lowering Move.
+ // The type of Move is used for the write barrier pass to insert write barriers
+ // and for alignment on some architectures.
+ // For pointerless types, it is possible for the type to be inaccurate.
+ // For type alignment and pointer information, use the type in Aux;
+ // for type size, use the size in AuxInt.
+ // The "inline runtime.memmove" rewrite rule generates Moves with inaccurate types,
+ // such as type byte instead of the more accurate type [8]byte.
+ {name: "Move", argLength: 3, typ: "Mem", aux: "TypSize"}, // arg0=destptr, arg1=srcptr, arg2=mem, auxint=size, aux=type. Returns memory.
+ {name: "Zero", argLength: 2, typ: "Mem", aux: "TypSize"}, // arg0=destptr, arg1=mem, auxint=size, aux=type. Returns memory.
+
+ // Memory operations with write barriers.
+ // Expand to runtime calls. Write barrier will be removed if write on stack.
+ {name: "StoreWB", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
+ {name: "MoveWB", argLength: 3, typ: "Mem", aux: "TypSize"}, // arg0=destptr, arg1=srcptr, arg2=mem, auxint=size, aux=type. Returns memory.
+ {name: "ZeroWB", argLength: 2, typ: "Mem", aux: "TypSize"}, // arg0=destptr, arg1=mem, auxint=size, aux=type. Returns memory.
+
+ // WB invokes runtime.gcWriteBarrier. This is not a normal
+ // call: it takes arguments in registers, doesn't clobber
+ // general-purpose registers (the exact clobber set is
+ // arch-dependent), and is not a safe-point.
+ {name: "WB", argLength: 3, typ: "Mem", aux: "Sym", symEffect: "None"}, // arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+
+ {name: "HasCPUFeature", argLength: 0, typ: "bool", aux: "Sym", symEffect: "None"}, // aux=place that this feature flag can be loaded from
+
+ // PanicBounds and PanicExtend generate a runtime panic.
+ // Their arguments provide index values to use in panic messages.
+ // Both PanicBounds and PanicExtend have an AuxInt value from the BoundsKind type (in ../op.go).
+ // PanicBounds' index is int sized.
+ // PanicExtend's index is int64 sized. (PanicExtend is only used on 32-bit archs.)
+ {name: "PanicBounds", argLength: 3, aux: "Int64", typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory.
+ {name: "PanicExtend", argLength: 4, aux: "Int64", typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory.
+
+ // Function calls. Arguments to the call have already been written to the stack.
+ // Return values appear on the stack. The method receiver, if any, is treated
+ // as a phantom first argument.
+ // TODO(josharian): ClosureCall and InterCall should have Int32 aux
+ // to match StaticCall's 32 bit arg size limit.
+ // TODO(drchase,josharian): could the arg size limit be bundled into the rules for CallOff?
+
+ // Before lowering, LECalls receive their fixed inputs (first), memory (last),
+ // and a variable number of input values in the middle.
+ // They produce a variable number of result values.
+ // These values are not necessarily "SSA-able"; they can be too large,
+ // but in that case inputs are loaded immediately before with OpDereference,
+ // and outputs are stored immediately with OpStore.
+ //
+ // After call expansion, Calls have the same fixed-middle-memory arrangement of inputs,
+ // with the difference that the "middle" is only the register-resident inputs,
+ // and the non-register inputs are instead stored at ABI-defined offsets from SP
+ // (and the stores thread through the memory that is ultimately an input to the call).
+ // Outputs follow a similar pattern; register-resident outputs are the leading elements
+ // of a Result-typed output, with memory last, and any memory-resident outputs have been
+ // stored to ABI-defined locations. Each non-memory input or output fits in a register.
+ //
+ // Subsequent architecture-specific lowering only changes the opcode.
+
+ {name: "ClosureCall", argLength: -1, aux: "CallOff", call: true}, // arg0=code pointer, arg1=context ptr, arg2..argN-1 are register inputs, argN=memory. auxint=arg size. Returns Result of register results, plus memory.
+ {name: "StaticCall", argLength: -1, aux: "CallOff", call: true}, // call function aux.(*obj.LSym), arg0..argN-1 are register inputs, argN=memory. auxint=arg size. Returns Result of register results, plus memory.
+ {name: "InterCall", argLength: -1, aux: "CallOff", call: true}, // interface call. arg0=code pointer, arg1..argN-1 are register inputs, argN=memory, auxint=arg size. Returns Result of register results, plus memory.
+ {name: "TailCall", argLength: -1, aux: "CallOff", call: true}, // tail call function aux.(*obj.LSym), arg0..argN-1 are register inputs, argN=memory. auxint=arg size. Returns Result of register results, plus memory.
+
+ {name: "ClosureLECall", argLength: -1, aux: "CallOff", call: true}, // late-expanded closure call. arg0=code pointer, arg1=context ptr, arg2..argN-1 are inputs, argN is mem. auxint = arg size. Result is tuple of result(s), plus mem.
+ {name: "StaticLECall", argLength: -1, aux: "CallOff", call: true}, // late-expanded static call function aux.(*ssa.AuxCall.Fn). arg0..argN-1 are inputs, argN is mem. auxint = arg size. Result is tuple of result(s), plus mem.
+ {name: "InterLECall", argLength: -1, aux: "CallOff", call: true}, // late-expanded interface call. arg0=code pointer, arg1..argN-1 are inputs, argN is mem. auxint = arg size. Result is tuple of result(s), plus mem.
+ {name: "TailLECall", argLength: -1, aux: "CallOff", call: true}, // late-expanded static tail call function aux.(*ssa.AuxCall.Fn). arg0..argN-1 are inputs, argN is mem. auxint = arg size. Result is tuple of result(s), plus mem.
+
+ // Conversions: signed extensions, zero (unsigned) extensions, truncations
+ {name: "SignExt8to16", argLength: 1, typ: "Int16"},
+ {name: "SignExt8to32", argLength: 1, typ: "Int32"},
+ {name: "SignExt8to64", argLength: 1, typ: "Int64"},
+ {name: "SignExt16to32", argLength: 1, typ: "Int32"},
+ {name: "SignExt16to64", argLength: 1, typ: "Int64"},
+ {name: "SignExt32to64", argLength: 1, typ: "Int64"},
+ {name: "ZeroExt8to16", argLength: 1, typ: "UInt16"},
+ {name: "ZeroExt8to32", argLength: 1, typ: "UInt32"},
+ {name: "ZeroExt8to64", argLength: 1, typ: "UInt64"},
+ {name: "ZeroExt16to32", argLength: 1, typ: "UInt32"},
+ {name: "ZeroExt16to64", argLength: 1, typ: "UInt64"},
+ {name: "ZeroExt32to64", argLength: 1, typ: "UInt64"},
+ {name: "Trunc16to8", argLength: 1},
+ {name: "Trunc32to8", argLength: 1},
+ {name: "Trunc32to16", argLength: 1},
+ {name: "Trunc64to8", argLength: 1},
+ {name: "Trunc64to16", argLength: 1},
+ {name: "Trunc64to32", argLength: 1},
+
+ {name: "Cvt32to32F", argLength: 1},
+ {name: "Cvt32to64F", argLength: 1},
+ {name: "Cvt64to32F", argLength: 1},
+ {name: "Cvt64to64F", argLength: 1},
+ {name: "Cvt32Fto32", argLength: 1},
+ {name: "Cvt32Fto64", argLength: 1},
+ {name: "Cvt64Fto32", argLength: 1},
+ {name: "Cvt64Fto64", argLength: 1},
+ {name: "Cvt32Fto64F", argLength: 1},
+ {name: "Cvt64Fto32F", argLength: 1},
+ {name: "CvtBoolToUint8", argLength: 1},
+
+ // Force rounding to precision of type.
+ {name: "Round32F", argLength: 1},
+ {name: "Round64F", argLength: 1},
+
+ // Automatically inserted safety checks
+ {name: "IsNonNil", argLength: 1, typ: "Bool"}, // arg0 != nil
+ {name: "IsInBounds", argLength: 2, typ: "Bool"}, // 0 <= arg0 < arg1. arg1 is guaranteed >= 0.
+ {name: "IsSliceInBounds", argLength: 2, typ: "Bool"}, // 0 <= arg0 <= arg1. arg1 is guaranteed >= 0.
+ {name: "NilCheck", argLength: 2, typ: "Void"}, // arg0=ptr, arg1=mem. Panics if arg0 is nil. Returns void.
+
+ // Pseudo-ops
+ {name: "GetG", argLength: 1, zeroWidth: true}, // runtime.getg() (read g pointer). arg0=mem
+ {name: "GetClosurePtr"}, // get closure pointer from dedicated register
+ {name: "GetCallerPC"}, // for getcallerpc intrinsic
+ {name: "GetCallerSP"}, // for getcallersp intrinsic
+
+ // Indexing operations
+ {name: "PtrIndex", argLength: 2}, // arg0=ptr, arg1=index. Computes ptr+sizeof(*v.type)*index, where index is extended to ptrwidth type
+ {name: "OffPtr", argLength: 1, aux: "Int64"}, // arg0 + auxint (arg0 and result are pointers)
+
+ // Slices
+ {name: "SliceMake", argLength: 3}, // arg0=ptr, arg1=len, arg2=cap
+ {name: "SlicePtr", argLength: 1, typ: "BytePtr"}, // ptr(arg0)
+ {name: "SliceLen", argLength: 1}, // len(arg0)
+ {name: "SliceCap", argLength: 1}, // cap(arg0)
+ // SlicePtrUnchecked, like SlicePtr, extracts the pointer from a slice.
+ // SlicePtr values are assumed non-nil, because they are guarded by bounds checks.
+ // SlicePtrUnchecked values can be nil.
+ {name: "SlicePtrUnchecked", argLength: 1},
+
+ // Complex (part/whole)
+ {name: "ComplexMake", argLength: 2}, // arg0=real, arg1=imag
+ {name: "ComplexReal", argLength: 1}, // real(arg0)
+ {name: "ComplexImag", argLength: 1}, // imag(arg0)
+
+ // Strings
+ {name: "StringMake", argLength: 2}, // arg0=ptr, arg1=len
+ {name: "StringPtr", argLength: 1, typ: "BytePtr"}, // ptr(arg0)
+ {name: "StringLen", argLength: 1, typ: "Int"}, // len(arg0)
+
+ // Interfaces
+ {name: "IMake", argLength: 2}, // arg0=itab, arg1=data
+ {name: "ITab", argLength: 1, typ: "Uintptr"}, // arg0=interface, returns itable field
+ {name: "IData", argLength: 1}, // arg0=interface, returns data field
+
+ // Structs
+ {name: "StructMake0"}, // Returns struct with 0 fields.
+ {name: "StructMake1", argLength: 1}, // arg0=field0. Returns struct.
+ {name: "StructMake2", argLength: 2}, // arg0,arg1=field0,field1. Returns struct.
+ {name: "StructMake3", argLength: 3}, // arg0..2=field0..2. Returns struct.
+ {name: "StructMake4", argLength: 4}, // arg0..3=field0..3. Returns struct.
+ {name: "StructSelect", argLength: 1, aux: "Int64"}, // arg0=struct, auxint=field index. Returns the auxint'th field.
+
+ // Arrays
+ {name: "ArrayMake0"}, // Returns array with 0 elements
+ {name: "ArrayMake1", argLength: 1}, // Returns array with 1 element
+ {name: "ArraySelect", argLength: 1, aux: "Int64"}, // arg0=array, auxint=index. Returns a[i].
+
+ // Spill&restore ops for the register allocator. These are
+ // semantically identical to OpCopy; they do not take/return
+ // stores like regular memory ops do. We can get away without memory
+ // args because we know there is no aliasing of spill slots on the stack.
+ {name: "StoreReg", argLength: 1},
+ {name: "LoadReg", argLength: 1},
+
+ // Used during ssa construction. Like Copy, but the arg has not been specified yet.
+ {name: "FwdRef", aux: "Sym", symEffect: "None"},
+
+ // Unknown value. Used for Values whose values don't matter because they are dead code.
+ {name: "Unknown"},
+
+ {name: "VarDef", argLength: 1, aux: "Sym", typ: "Mem", symEffect: "None", zeroWidth: true}, // aux is a *gc.Node of a variable that is about to be initialized. arg0=mem, returns mem
+ // TODO: what's the difference between VarLive and KeepAlive?
+ {name: "VarLive", argLength: 1, aux: "Sym", symEffect: "Read", zeroWidth: true}, // aux is a *gc.Node of a variable that must be kept live. arg0=mem, returns mem
+ {name: "KeepAlive", argLength: 2, typ: "Mem", zeroWidth: true}, // arg[0] is a value that must be kept alive until this mark. arg[1]=mem, returns mem
+
+ // InlMark marks the start of an inlined function body. Its AuxInt field
+ // distinguishes which entry in the local inline tree it is marking.
+ {name: "InlMark", argLength: 1, aux: "Int32", typ: "Void"}, // arg[0]=mem, returns void.
+
+ // Ops for breaking 64-bit operations on 32-bit architectures
+ {name: "Int64Make", argLength: 2, typ: "UInt64"}, // arg0=hi, arg1=lo
+ {name: "Int64Hi", argLength: 1, typ: "UInt32"}, // high 32-bit of arg0
+ {name: "Int64Lo", argLength: 1, typ: "UInt32"}, // low 32-bit of arg0
+
+ {name: "Add32carry", argLength: 2, commutative: true, typ: "(UInt32,Flags)"}, // arg0 + arg1, returns (value, carry)
+ {name: "Add32withcarry", argLength: 3, commutative: true}, // arg0 + arg1 + arg2, arg2=carry (0 or 1)
+
+ {name: "Sub32carry", argLength: 2, typ: "(UInt32,Flags)"}, // arg0 - arg1, returns (value, carry)
+ {name: "Sub32withcarry", argLength: 3}, // arg0 - arg1 - arg2, arg2=carry (0 or 1)
+
+ {name: "Add64carry", argLength: 3, commutative: true, typ: "(UInt64,UInt64)"}, // arg0 + arg1 + arg2, arg2 must be 0 or 1. returns (value, value>>64)
+ {name: "Sub64borrow", argLength: 3, typ: "(UInt64,UInt64)"}, // arg0 - (arg1 + arg2), arg2 must be 0 or 1. returns (value, value>>64&1)
+
+ {name: "Signmask", argLength: 1, typ: "Int32"}, // 0 if arg0 >= 0, -1 if arg0 < 0
+ {name: "Zeromask", argLength: 1, typ: "UInt32"}, // 0 if arg0 == 0, 0xffffffff if arg0 != 0
+ {name: "Slicemask", argLength: 1}, // 0 if arg0 == 0, -1 if arg0 > 0, undef if arg0<0. Type is native int size.
+
+ {name: "SpectreIndex", argLength: 2}, // arg0 if 0 <= arg0 < arg1, 0 otherwise. Type is native int size.
+ {name: "SpectreSliceIndex", argLength: 2}, // arg0 if 0 <= arg0 <= arg1, 0 otherwise. Type is native int size.
+
+ {name: "Cvt32Uto32F", argLength: 1}, // uint32 -> float32, only used on 32-bit arch
+ {name: "Cvt32Uto64F", argLength: 1}, // uint32 -> float64, only used on 32-bit arch
+ {name: "Cvt32Fto32U", argLength: 1}, // float32 -> uint32, only used on 32-bit arch
+ {name: "Cvt64Fto32U", argLength: 1}, // float64 -> uint32, only used on 32-bit arch
+ {name: "Cvt64Uto32F", argLength: 1}, // uint64 -> float32, only used on archs that has the instruction
+ {name: "Cvt64Uto64F", argLength: 1}, // uint64 -> float64, only used on archs that has the instruction
+ {name: "Cvt32Fto64U", argLength: 1}, // float32 -> uint64, only used on archs that has the instruction
+ {name: "Cvt64Fto64U", argLength: 1}, // float64 -> uint64, only used on archs that has the instruction
+
+ // pseudo-ops for breaking Tuple
+ {name: "Select0", argLength: 1, zeroWidth: true}, // the first component of a tuple
+ {name: "Select1", argLength: 1, zeroWidth: true}, // the second component of a tuple
+ {name: "SelectN", argLength: 1, aux: "Int64"}, // arg0=result, auxint=field index. Returns the auxint'th member.
+ {name: "SelectNAddr", argLength: 1, aux: "Int64"}, // arg0=result, auxint=field index. Returns the address of auxint'th member. Used for un-SSA-able result types.
+ {name: "MakeResult", argLength: -1}, // arg0 .. are components of a "Result" (like the result from a Call). The last arg should be memory (like the result from a call).
+
+ // Atomic operations used for semantically inlining sync/atomic and
+ // runtime/internal/atomic. Atomic loads return a new memory so that
+ // the loads are properly ordered with respect to other loads and
+ // stores.
+ {name: "AtomicLoad8", argLength: 2, typ: "(UInt8,Mem)"}, // Load from arg0. arg1=memory. Returns loaded value and new memory.
+ {name: "AtomicLoad32", argLength: 2, typ: "(UInt32,Mem)"}, // Load from arg0. arg1=memory. Returns loaded value and new memory.
+ {name: "AtomicLoad64", argLength: 2, typ: "(UInt64,Mem)"}, // Load from arg0. arg1=memory. Returns loaded value and new memory.
+ {name: "AtomicLoadPtr", argLength: 2, typ: "(BytePtr,Mem)"}, // Load from arg0. arg1=memory. Returns loaded value and new memory.
+ {name: "AtomicLoadAcq32", argLength: 2, typ: "(UInt32,Mem)"}, // Load from arg0. arg1=memory. Lock acquisition, returns loaded value and new memory.
+ {name: "AtomicLoadAcq64", argLength: 2, typ: "(UInt64,Mem)"}, // Load from arg0. arg1=memory. Lock acquisition, returns loaded value and new memory.
+ {name: "AtomicStore8", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns memory.
+ {name: "AtomicStore32", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns memory.
+ {name: "AtomicStore64", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns memory.
+ {name: "AtomicStorePtrNoWB", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns memory.
+ {name: "AtomicStoreRel32", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Lock release, returns memory.
+ {name: "AtomicStoreRel64", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Lock release, returns memory.
+ {name: "AtomicExchange32", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns old contents of *arg0 and new memory.
+ {name: "AtomicExchange64", argLength: 3, typ: "(UInt64,Mem)", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns old contents of *arg0 and new memory.
+ {name: "AtomicAdd32", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true}, // Do *arg0 += arg1. arg2=memory. Returns sum and new memory.
+ {name: "AtomicAdd64", argLength: 3, typ: "(UInt64,Mem)", hasSideEffects: true}, // Do *arg0 += arg1. arg2=memory. Returns sum and new memory.
+ {name: "AtomicCompareAndSwap32", argLength: 4, typ: "(Bool,Mem)", hasSideEffects: true}, // if *arg0==arg1, then set *arg0=arg2. Returns true if store happens and new memory.
+ {name: "AtomicCompareAndSwap64", argLength: 4, typ: "(Bool,Mem)", hasSideEffects: true}, // if *arg0==arg1, then set *arg0=arg2. Returns true if store happens and new memory.
+ {name: "AtomicCompareAndSwapRel32", argLength: 4, typ: "(Bool,Mem)", hasSideEffects: true}, // if *arg0==arg1, then set *arg0=arg2. Lock release, reports whether store happens and new memory.
+ {name: "AtomicAnd8", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 &= arg1. arg2=memory. Returns memory.
+ {name: "AtomicAnd32", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 &= arg1. arg2=memory. Returns memory.
+ {name: "AtomicOr8", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 |= arg1. arg2=memory. Returns memory.
+ {name: "AtomicOr32", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 |= arg1. arg2=memory. Returns memory.
+
+ // Atomic operation variants
+ // These variants have the same semantics as above atomic operations.
+ // But they are used for generating more efficient code on certain modern machines, with run-time CPU feature detection.
+ // Currently, they are used on ARM64 only.
+ {name: "AtomicAdd32Variant", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true}, // Do *arg0 += arg1. arg2=memory. Returns sum and new memory.
+ {name: "AtomicAdd64Variant", argLength: 3, typ: "(UInt64,Mem)", hasSideEffects: true}, // Do *arg0 += arg1. arg2=memory. Returns sum and new memory.
+ {name: "AtomicExchange32Variant", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns old contents of *arg0 and new memory.
+ {name: "AtomicExchange64Variant", argLength: 3, typ: "(UInt64,Mem)", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns old contents of *arg0 and new memory.
+ {name: "AtomicCompareAndSwap32Variant", argLength: 4, typ: "(Bool,Mem)", hasSideEffects: true}, // if *arg0==arg1, then set *arg0=arg2. Returns true if store happens and new memory.
+ {name: "AtomicCompareAndSwap64Variant", argLength: 4, typ: "(Bool,Mem)", hasSideEffects: true}, // if *arg0==arg1, then set *arg0=arg2. Returns true if store happens and new memory.
+ {name: "AtomicAnd8Variant", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 &= arg1. arg2=memory. Returns memory.
+ {name: "AtomicAnd32Variant", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 &= arg1. arg2=memory. Returns memory.
+ {name: "AtomicOr8Variant", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 |= arg1. arg2=memory. Returns memory.
+ {name: "AtomicOr32Variant", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 |= arg1. arg2=memory. Returns memory.
+
+ // Publication barrier
+ {name: "PubBarrier", argLength: 1, hasSideEffects: true}, // Do data barrier. arg0=memory.
+
+ // Clobber experiment op
+ {name: "Clobber", argLength: 0, typ: "Void", aux: "SymOff", symEffect: "None"}, // write an invalid pointer value to the given pointer slot of a stack variable
+ {name: "ClobberReg", argLength: 0, typ: "Void"}, // clobber a register
+
+ // Prefetch instruction
+ {name: "PrefetchCache", argLength: 2, hasSideEffects: true}, // Do prefetch arg0 to cache. arg0=addr, arg1=memory.
+ {name: "PrefetchCacheStreamed", argLength: 2, hasSideEffects: true}, // Do non-temporal or streamed prefetch arg0 to cache. arg0=addr, arg1=memory.
+}
+
+// kind controls successors implicit exit
+// ----------------------------------------------------------
+// Exit [return mem] [] yes
+// Ret [return mem] [] yes
+// RetJmp [return mem] [] yes
+// Plain [] [next]
+// If [boolean Value] [then, else]
+// First [] [always, never]
+
+var genericBlocks = []blockData{ // block kinds of the "generic" arch (registered in init below); controls is the number of control values each kind takes
+ {name: "Plain"}, // a single successor
+ {name: "If", controls: 1}, // if Controls[0] goto Succs[0] else goto Succs[1]
+ {name: "Defer", controls: 1}, // Succs[0]=defer queued, Succs[1]=defer recovered. Controls[0] is call op (of memory type)
+ {name: "Ret", controls: 1}, // no successors, Controls[0] value is memory result
+ {name: "RetJmp", controls: 1}, // no successors, Controls[0] value is a tail call
+ {name: "Exit", controls: 1}, // no successors, Controls[0] value generates a panic
+ {name: "JumpTable", controls: 1}, // multiple successors, the integer Controls[0] selects which one
+
+ // transient block state used for dead code removal
+ {name: "First"}, // 2 successors, always takes the first one (second is dead)
+}
+
+func init() { // registers the genericOps and genericBlocks tables as the arch named "generic"
+ archs = append(archs, arch{
+ name: "generic", // arch.Name maps this name to the empty string
+ ops: genericOps,
+ blocks: genericBlocks,
+ generic: true, // genOp skips register-table generation for generic archs
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/_gen/main.go b/src/cmd/compile/internal/ssa/_gen/main.go
new file mode 100644
index 0000000..9251ba5
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/main.go
@@ -0,0 +1,571 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// The gen command generates Go code (in the parent directory) for all
+// the architecture-specific opcodes, blocks, and rewrites.
+package main
+
+import (
+ "bytes"
+ "flag"
+ "fmt"
+ "go/format"
+ "log"
+ "math/bits"
+ "os"
+ "path"
+ "regexp"
+ "runtime"
+ "runtime/pprof"
+ "runtime/trace"
+ "sort"
+ "strings"
+ "sync"
+)
+
+// TODO: capitalize these types, so that we can more easily tell variable names
+// apart from type names, and avoid awkward func parameters like "arch arch".
+
+type arch struct { // arch is the generator's description of one architecture: its ops, blocks and register layout
+ name string // architecture name; used as the infix of generated identifiers ("generic" yields no infix, see Name)
+ pkg string // obj package to import for this arch.
+ genfile string // source file containing opcode code generation.
+ ops []opData // opcodes of this arch, emitted by genOp in this order
+ blocks []blockData // block kinds of this arch, emitted by genOp in this order
+ regnames []string // register names indexed by register number (the bit position in a regMask)
+ ParamIntRegNames string // space-separated register names; each must appear in regnames, at most once
+ ParamFloatRegNames string // space-separated register names; same rules as ParamIntRegNames
+ gpregmask regMask // registers that get a GC register map index in genOp (those that may contain pointers)
+ fpregmask regMask // emitted as fpRegMask<name>
+ fp32regmask regMask // emitted as fp32RegMask<name> only when non-zero
+ fp64regmask regMask // emitted as fp64RegMask<name> only when non-zero
+ specialregmask regMask // emitted as specialRegMask<name>
+ framepointerreg int8 // emitted as framepointerReg<name>; presumably a register number — TODO confirm
+ linkreg int8 // emitted as linkReg<name>; presumably a register number — TODO confirm
+ generic bool // true for the machine-independent arch; genOp emits no register tables for it
+ imports []string // not used in this file; NOTE(review): presumably extra imports for generated rewrite files — confirm in rulegen.go
+}
+
+type opData struct { // opData describes one opcode; genOp turns each entry into an opcodeTable element
+ name string // op name; the generated constant is Op<arch><name>
+ reg regInfo // register constraints (not emitted for the generic arch)
+ asm string // assembler mnemonic suffix; emitted as <obj pkg>.A<asm> when non-empty
+ typ string // default result type
+ aux string // aux kind; emitted as auxType: aux<aux> when non-empty
+ rematerializeable bool // genOp rejects ops that set this and also clobber registers or flags
+ argLength int32 // number of arguments, if -1, then this operation has a variable number of arguments
+ commutative bool // this operation is commutative on its first 2 arguments (e.g. addition)
+ resultInArg0 bool // (first, if a tuple) output of v and v.Args[0] must be allocated to the same register
+ resultNotInArgs bool // outputs must not be allocated to the same registers as inputs
+ clobberFlags bool // this op clobbers flags register
+ needIntTemp bool // need a temporary free integer register
+ call bool // is a function call
+ tailCall bool // is a tail call
+ nilCheck bool // this op is a nil check on arg0
+ faultOnNilArg0 bool // this op will fault if arg0 is nil (and aux encodes a small offset)
+ faultOnNilArg1 bool // this op will fault if arg1 is nil (and aux encodes a small offset)
+ hasSideEffects bool // for "reasons", not to be eliminated. E.g., atomic store, #19182.
+ zeroWidth bool // op never translates into any machine code. example: copy, which may sometimes translate to machine code, is not zero-width.
+ unsafePoint bool // this op is an unsafe point, i.e. not safe for async preemption
+ symEffect string // effect this op has on symbol in aux; required when aux starts with "Sym", forbidden otherwise; comma-separated list
+ scale uint8 // amd64/386 indexed load scale
+}
+
+type blockData struct { // blockData describes one block kind; the generated constant is Block<arch><name>
+ name string // the suffix for this block ("EQ", "LT", etc.)
+ controls int // the number of control values this type of block requires
+ aux string // the type of the Aux/AuxInt value, if any
+}
+
+type regInfo struct { // regInfo holds the register constraints of one op as regMask bit sets
+ // inputs[i] encodes the set of registers allowed for the i'th input.
+ // Inputs that don't use registers (flags, memory, etc.) should be 0.
+ inputs []regMask
+ // clobbers encodes the set of registers that are overwritten by
+ // the instruction (other than the output registers).
+ clobbers regMask
+ // outputs[i] encodes the set of registers allowed for the i'th output.
+ outputs []regMask
+}
+
+type regMask uint64 // bit set of registers: bit i stands for register number i, i.e. arch.regnames[i]
+
+func (a arch) regMaskComment(r regMask) string { // renders r as " // NAME NAME ..." (register names in ascending bit order); returns "" for an empty mask
+ var buf strings.Builder
+ for i := uint64(0); r != 0; i++ { // i is the register number of the lowest remaining bit
+ if r&1 != 0 {
+ if buf.Len() == 0 { // first set bit: open the comment
+ buf.WriteString(" //")
+ }
+ buf.WriteString(" ")
+ buf.WriteString(a.regnames[i]) // NOTE(review): no bounds check; assumes masks only set bits for declared registers
+ }
+ r >>= 1 // move on to the next register bit
+ }
+ return buf.String()
+}
+
+var archs []arch // all registered architectures; appended to in init (e.g. the "generic" arch) and sorted by name in main
+
+var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to `file`") // -cpuprofile: empty means no CPU profile
+var memprofile = flag.String("memprofile", "", "write memory profile to `file`") // -memprofile: empty means no heap profile
+var tracefile = flag.String("trace", "", "write trace to `file`") // -trace: empty means no execution trace
+
+func main() { // parses flags, optionally starts CPU profiling and tracing, runs every generator concurrently, then optionally writes a heap profile
+ flag.Parse()
+ if *cpuprofile != "" {
+ f, err := os.Create(*cpuprofile)
+ if err != nil {
+ log.Fatal("could not create CPU profile: ", err)
+ }
+ defer f.Close()
+ if err := pprof.StartCPUProfile(f); err != nil {
+ log.Fatal("could not start CPU profile: ", err)
+ }
+ defer pprof.StopCPUProfile()
+ }
+ if *tracefile != "" {
+ f, err := os.Create(*tracefile)
+ if err != nil {
+ log.Fatalf("failed to create trace output file: %v", err)
+ }
+ defer func() {
+ if err := f.Close(); err != nil {
+ log.Fatalf("failed to close trace file: %v", err)
+ }
+ }()
+
+ if err := trace.Start(f); err != nil {
+ log.Fatalf("failed to start trace: %v", err)
+ }
+ defer trace.Stop()
+ }
+
+ sort.Sort(ArchsByName(archs)) // genOp emits its iota-based Block*/Op* constants in this order
+
+ // The generate tasks are run concurrently, since they are CPU-intensive
+ // tasks that can easily make use of many cores on a machine.
+ //
+ // Note that there is no limit on the concurrency at the moment. On a
+ // four-core laptop at the time of writing, peak RSS usually reaches
+ // ~200MiB, which seems doable by practically any machine nowadays. If
+ // that stops being the case, we can cap this func to a fixed number of
+ // architectures being generated at once.
+
+ tasks := []func(){
+ genOp,
+ genAllocators,
+ }
+ for _, a := range archs {
+ a := a // the funcs are run concurrently at a later time
+ tasks = append(tasks, func() { // one task per arch, running its three rule generators in sequence
+ genRules(a)
+ genSplitLoadRules(a)
+ genLateLowerRules(a)
+ })
+ }
+ var wg sync.WaitGroup
+ for _, task := range tasks {
+ task := task // per-iteration copy captured by the goroutine below
+ wg.Add(1)
+ go func() {
+ task()
+ wg.Done()
+ }()
+ }
+ wg.Wait() // block until every generator has finished
+
+ if *memprofile != "" {
+ f, err := os.Create(*memprofile)
+ if err != nil {
+ log.Fatal("could not create memory profile: ", err)
+ }
+ defer f.Close()
+ runtime.GC() // get up-to-date statistics
+ if err := pprof.WriteHeapProfile(f); err != nil {
+ log.Fatal("could not write memory profile: ", err)
+ }
+ }
+}
+
+func genOp() { // writes ../opGen.go (block/op enums, opcodeTable, per-arch register tables), then checks that each arch's genfile mentions every op
+ w := new(bytes.Buffer)
+ fmt.Fprintf(w, "// Code generated from _gen/*Ops.go; DO NOT EDIT.\n")
+ fmt.Fprintln(w)
+ fmt.Fprintln(w, "package ssa")
+
+ fmt.Fprintln(w, "import (")
+ fmt.Fprintln(w, "\"cmd/internal/obj\"")
+ for _, a := range archs {
+ if a.pkg != "" {
+ fmt.Fprintf(w, "%q\n", a.pkg)
+ }
+ }
+ fmt.Fprintln(w, ")")
+
+ // generate Block* declarations
+ fmt.Fprintln(w, "const (")
+ fmt.Fprintln(w, "BlockInvalid BlockKind = iota")
+ for _, a := range archs {
+ fmt.Fprintln(w)
+ for _, d := range a.blocks {
+ fmt.Fprintf(w, "Block%s%s\n", a.Name(), d.name)
+ }
+ }
+ fmt.Fprintln(w, ")")
+
+ // generate block kind string method
+ fmt.Fprintln(w, "var blockString = [...]string{")
+ fmt.Fprintln(w, "BlockInvalid:\"BlockInvalid\",")
+ for _, a := range archs {
+ fmt.Fprintln(w)
+ for _, b := range a.blocks {
+ fmt.Fprintf(w, "Block%s%s:\"%s\",\n", a.Name(), b.name, b.name)
+ }
+ }
+ fmt.Fprintln(w, "}")
+ fmt.Fprintln(w, "func (k BlockKind) String() string {return blockString[k]}")
+
+ // generate block kind auxint method
+ fmt.Fprintln(w, "func (k BlockKind) AuxIntType() string {")
+ fmt.Fprintln(w, "switch k {")
+ for _, a := range archs {
+ for _, b := range a.blocks {
+ if b.auxIntType() == "invalid" { // no case is emitted for these; the generated method falls through to return ""
+ continue
+ }
+ fmt.Fprintf(w, "case Block%s%s: return \"%s\"\n", a.Name(), b.name, b.auxIntType())
+ }
+ }
+ fmt.Fprintln(w, "}")
+ fmt.Fprintln(w, "return \"\"")
+ fmt.Fprintln(w, "}")
+
+ // generate Op* declarations
+ fmt.Fprintln(w, "const (")
+ fmt.Fprintln(w, "OpInvalid Op = iota") // make sure OpInvalid is 0.
+ for _, a := range archs {
+ fmt.Fprintln(w)
+ for _, v := range a.ops {
+ if v.name == "Invalid" { // OpInvalid was already emitted above
+ continue
+ }
+ fmt.Fprintf(w, "Op%s%s\n", a.Name(), v.name)
+ }
+ }
+ fmt.Fprintln(w, ")")
+
+ // generate OpInfo table
+ fmt.Fprintln(w, "var opcodeTable = [...]opInfo{")
+ fmt.Fprintln(w, " { name: \"OpInvalid\" },")
+ for _, a := range archs {
+ fmt.Fprintln(w)
+
+ pkg := path.Base(a.pkg) // last path element of the obj package; qualifies the asm constants below
+ for _, v := range a.ops {
+ if v.name == "Invalid" {
+ continue
+ }
+ fmt.Fprintln(w, "{")
+ fmt.Fprintf(w, "name:\"%s\",\n", v.name)
+
+ // flags
+ if v.aux != "" {
+ fmt.Fprintf(w, "auxType: aux%s,\n", v.aux)
+ }
+ fmt.Fprintf(w, "argLen: %d,\n", v.argLength)
+
+ if v.rematerializeable {
+ if v.reg.clobbers != 0 {
+ log.Fatalf("%s is rematerializeable and clobbers registers", v.name)
+ }
+ if v.clobberFlags {
+ log.Fatalf("%s is rematerializeable and clobbers flags", v.name)
+ }
+ fmt.Fprintln(w, "rematerializeable: true,")
+ }
+ if v.commutative {
+ fmt.Fprintln(w, "commutative: true,")
+ }
+ if v.resultInArg0 {
+ fmt.Fprintln(w, "resultInArg0: true,")
+ // OpConvert's register mask is selected dynamically,
+ // so don't try to check it in the static table.
+ if v.name != "Convert" && v.reg.inputs[0] != v.reg.outputs[0] {
+ log.Fatalf("%s: input[0] and output[0] must use the same registers for %s", a.name, v.name)
+ }
+ if v.name != "Convert" && v.commutative && v.reg.inputs[1] != v.reg.outputs[0] {
+ log.Fatalf("%s: input[1] and output[0] must use the same registers for %s", a.name, v.name)
+ }
+ }
+ if v.resultNotInArgs {
+ fmt.Fprintln(w, "resultNotInArgs: true,")
+ }
+ if v.clobberFlags {
+ fmt.Fprintln(w, "clobberFlags: true,")
+ }
+ if v.needIntTemp {
+ fmt.Fprintln(w, "needIntTemp: true,")
+ }
+ if v.call {
+ fmt.Fprintln(w, "call: true,")
+ }
+ if v.tailCall {
+ fmt.Fprintln(w, "tailCall: true,")
+ }
+ if v.nilCheck {
+ fmt.Fprintln(w, "nilCheck: true,")
+ }
+ if v.faultOnNilArg0 {
+ fmt.Fprintln(w, "faultOnNilArg0: true,")
+ if v.aux != "Sym" && v.aux != "SymOff" && v.aux != "SymValAndOff" && v.aux != "Int64" && v.aux != "Int32" && v.aux != "" {
+ log.Fatalf("faultOnNilArg0 with aux %s not allowed", v.aux)
+ }
+ }
+ if v.faultOnNilArg1 {
+ fmt.Fprintln(w, "faultOnNilArg1: true,")
+ if v.aux != "Sym" && v.aux != "SymOff" && v.aux != "SymValAndOff" && v.aux != "Int64" && v.aux != "Int32" && v.aux != "" {
+ log.Fatalf("faultOnNilArg1 with aux %s not allowed", v.aux)
+ }
+ }
+ if v.hasSideEffects {
+ fmt.Fprintln(w, "hasSideEffects: true,")
+ }
+ if v.zeroWidth {
+ fmt.Fprintln(w, "zeroWidth: true,")
+ }
+ if v.unsafePoint {
+ fmt.Fprintln(w, "unsafePoint: true,")
+ }
+ needEffect := strings.HasPrefix(v.aux, "Sym") // symEffect is mandatory for Sym* aux kinds and forbidden for all others
+ if v.symEffect != "" {
+ if !needEffect {
+ log.Fatalf("symEffect with aux %s not allowed", v.aux)
+ }
+ fmt.Fprintf(w, "symEffect: Sym%s,\n", strings.Replace(v.symEffect, ",", "|Sym", -1)) // "A,B" is emitted as SymA|SymB
+ } else if needEffect {
+ log.Fatalf("symEffect needed for aux %s", v.aux)
+ }
+ if a.name == "generic" {
+ fmt.Fprintln(w, "generic:true,")
+ fmt.Fprintln(w, "},") // close op
+ // generic ops have no reg info or asm
+ continue
+ }
+ if v.asm != "" {
+ fmt.Fprintf(w, "asm: %s.A%s,\n", pkg, v.asm)
+ }
+ if v.scale != 0 {
+ fmt.Fprintf(w, "scale: %d,\n", v.scale)
+ }
+ fmt.Fprintln(w, "reg:regInfo{")
+
+ // Compute input allocation order. We allocate from the
+ // most to the least constrained input. This order guarantees
+ // that we will always be able to find a register.
+ var s []intPair
+ for i, r := range v.reg.inputs {
+ if r != 0 { // inputs that use no register are left out of the table
+ s = append(s, intPair{countRegs(r), i})
+ }
+ }
+ if len(s) > 0 {
+ sort.Sort(byKey(s)) // fewest allowed registers first
+ fmt.Fprintln(w, "inputs: []inputInfo{")
+ for _, p := range s {
+ r := v.reg.inputs[p.val]
+ fmt.Fprintf(w, "{%d,%d},%s\n", p.val, r, a.regMaskComment(r))
+ }
+ fmt.Fprintln(w, "},")
+ }
+
+ if v.reg.clobbers > 0 {
+ fmt.Fprintf(w, "clobbers: %d,%s\n", v.reg.clobbers, a.regMaskComment(v.reg.clobbers))
+ }
+
+ // reg outputs
+ s = s[:0] // reuse the slice; unlike inputs, outputs with an empty mask are kept
+ for i, r := range v.reg.outputs {
+ s = append(s, intPair{countRegs(r), i})
+ }
+ if len(s) > 0 {
+ sort.Sort(byKey(s))
+ fmt.Fprintln(w, "outputs: []outputInfo{")
+ for _, p := range s {
+ r := v.reg.outputs[p.val]
+ fmt.Fprintf(w, "{%d,%d},%s\n", p.val, r, a.regMaskComment(r))
+ }
+ fmt.Fprintln(w, "},")
+ }
+ fmt.Fprintln(w, "},") // close reg info
+ fmt.Fprintln(w, "},") // close op
+ }
+ }
+ fmt.Fprintln(w, "}")
+
+ fmt.Fprintln(w, "func (o Op) Asm() obj.As {return opcodeTable[o].asm}")
+ fmt.Fprintln(w, "func (o Op) Scale() int16 {return int16(opcodeTable[o].scale)}")
+
+ // generate op string method
+ fmt.Fprintln(w, "func (o Op) String() string {return opcodeTable[o].name }")
+
+ fmt.Fprintln(w, "func (o Op) SymEffect() SymEffect { return opcodeTable[o].symEffect }")
+ fmt.Fprintln(w, "func (o Op) IsCall() bool { return opcodeTable[o].call }")
+ fmt.Fprintln(w, "func (o Op) IsTailCall() bool { return opcodeTable[o].tailCall }")
+ fmt.Fprintln(w, "func (o Op) HasSideEffects() bool { return opcodeTable[o].hasSideEffects }")
+ fmt.Fprintln(w, "func (o Op) UnsafePoint() bool { return opcodeTable[o].unsafePoint }")
+ fmt.Fprintln(w, "func (o Op) ResultInArg0() bool { return opcodeTable[o].resultInArg0 }")
+
+ // generate registers
+ for _, a := range archs {
+ if a.generic {
+ continue
+ }
+ fmt.Fprintf(w, "var registers%s = [...]Register {\n", a.name)
+ var gcRegN int // number of GC register map indexes handed out so far
+ num := map[string]int8{} // register name -> register number
+ for i, r := range a.regnames {
+ num[r] = int8(i)
+ pkg := a.pkg[len("cmd/internal/obj/"):] // obj package path relative to cmd/internal/obj/; does not depend on the loop variable
+ var objname string // name in cmd/internal/obj/$ARCH
+ switch r {
+ case "SB":
+ // SB isn't a real register. cmd/internal/obj expects 0 in this case.
+ objname = "0"
+ case "SP":
+ objname = pkg + ".REGSP"
+ case "g":
+ objname = pkg + ".REGG"
+ default:
+ objname = pkg + ".REG_" + r
+ }
+ // Assign a GC register map index to registers
+ // that may contain pointers.
+ gcRegIdx := -1
+ if a.gpregmask&(1<<uint(i)) != 0 {
+ gcRegIdx = gcRegN
+ gcRegN++
+ }
+ fmt.Fprintf(w, " {%d, %s, %d, \"%s\"},\n", i, objname, gcRegIdx, r)
+ }
+ parameterRegisterList := func(paramNamesString string) []int8 { // maps a space-separated list of register names to register numbers; nil for an empty list
+ paramNamesString = strings.TrimSpace(paramNamesString)
+ if paramNamesString == "" {
+ return nil
+ }
+ paramNames := strings.Split(paramNamesString, " ")
+ var paramRegs []int8
+ for _, regName := range paramNames {
+ if regName == "" {
+ // forgive extra spaces
+ continue
+ }
+ if regNum, ok := num[regName]; ok {
+ paramRegs = append(paramRegs, regNum)
+ delete(num, regName) // a repeated name then fails the lookup above and is reported as an error
+ } else {
+ log.Fatalf("parameter register %s for architecture %s not a register name (or repeated in parameter list)", regName, a.name)
+ }
+ }
+ return paramRegs
+ }
+
+ paramIntRegs := parameterRegisterList(a.ParamIntRegNames)
+ paramFloatRegs := parameterRegisterList(a.ParamFloatRegNames) // num is shared with the int list, so a name cannot appear in both lists
+
+ if gcRegN > 32 {
+ // Won't fit in a uint32 mask.
+ log.Fatalf("too many GC registers (%d > 32) on %s", gcRegN, a.name)
+ }
+ fmt.Fprintln(w, "}")
+ fmt.Fprintf(w, "var paramIntReg%s = %#v\n", a.name, paramIntRegs)
+ fmt.Fprintf(w, "var paramFloatReg%s = %#v\n", a.name, paramFloatRegs)
+ fmt.Fprintf(w, "var gpRegMask%s = regMask(%d)\n", a.name, a.gpregmask)
+ fmt.Fprintf(w, "var fpRegMask%s = regMask(%d)\n", a.name, a.fpregmask)
+ if a.fp32regmask != 0 {
+ fmt.Fprintf(w, "var fp32RegMask%s = regMask(%d)\n", a.name, a.fp32regmask)
+ }
+ if a.fp64regmask != 0 {
+ fmt.Fprintf(w, "var fp64RegMask%s = regMask(%d)\n", a.name, a.fp64regmask)
+ }
+ fmt.Fprintf(w, "var specialRegMask%s = regMask(%d)\n", a.name, a.specialregmask)
+ fmt.Fprintf(w, "var framepointerReg%s = int8(%d)\n", a.name, a.framepointerreg)
+ fmt.Fprintf(w, "var linkReg%s = int8(%d)\n", a.name, a.linkreg)
+ }
+
+ // gofmt result
+ b := w.Bytes()
+ var err error
+ b, err = format.Source(b)
+ if err != nil {
+ fmt.Printf("%s\n", w.Bytes()) // dump the unformatted source so the offending line can be located
+ panic(err)
+ }
+
+ if err := os.WriteFile("../opGen.go", b, 0666); err != nil {
+ log.Fatalf("can't write output: %v\n", err)
+ }
+
+ // Check that the arch genfile handles all the arch-specific opcodes.
+ // This is very much a hack, but it is better than nothing.
+ //
+ // Do a single regexp pass to record all ops being handled in a map, and
+ // then compare that with the ops list. This is much faster than one
+ // regexp pass per opcode.
+ for _, a := range archs {
+ if a.genfile == "" {
+ continue
+ }
+
+ pattern := fmt.Sprintf(`\Wssa\.Op%s([a-zA-Z0-9_]+)\W`, a.name)
+ rxOp, err := regexp.Compile(pattern)
+ if err != nil {
+ log.Fatalf("bad opcode regexp %s: %v", pattern, err)
+ }
+
+ src, err := os.ReadFile(a.genfile)
+ if err != nil {
+ log.Fatalf("can't read %s: %v", a.genfile, err)
+ }
+ seen := make(map[string]bool, len(a.ops))
+ for _, m := range rxOp.FindAllSubmatch(src, -1) {
+ seen[string(m[1])] = true // m[1] is the op name captured after the ssa.Op<arch> prefix
+ }
+ for _, op := range a.ops {
+ if !seen[op.name] {
+ log.Fatalf("Op%s%s has no code generation in %s", a.name, op.name, a.genfile)
+ }
+ }
+ }
+}
+
+// Name returns the name of the architecture for use in Op* and Block* enumerations. The "generic" arch yields "", so its identifiers carry no arch infix.
+func (a arch) Name() string {
+ s := a.name
+ if s == "generic" { // generic ops and blocks are named Op<name> / Block<name>
+ s = ""
+ }
+ return s
+}
+
+// countRegs returns the number of set bits in the register mask. genOp uses the count as the sort key when ordering an op's inputs and outputs.
+func countRegs(r regMask) int {
+ return bits.OnesCount64(uint64(r))
+}
+
+// intPair couples a sort key with an associated value; for sorting a pair of integers by key
+type intPair struct {
+ key, val int
+}
+type byKey []intPair // byKey implements sort.Interface, ordering pairs by ascending key
+
+func (a byKey) Len() int { return len(a) }
+func (a byKey) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
+func (a byKey) Less(i, j int) bool { return a[i].key < a[j].key }
+
+type ArchsByName []arch // ArchsByName implements sort.Interface, ordering archs by ascending name; main sorts archs with it before generating
+
+func (x ArchsByName) Len() int { return len(x) }
+func (x ArchsByName) Swap(i, j int) { x[i], x[j] = x[j], x[i] }
+func (x ArchsByName) Less(i, j int) bool { return x[i].name < x[j].name }
diff --git a/src/cmd/compile/internal/ssa/_gen/rulegen.go b/src/cmd/compile/internal/ssa/_gen/rulegen.go
new file mode 100644
index 0000000..80fa37a
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/_gen/rulegen.go
@@ -0,0 +1,1885 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This program generates Go code that applies rewrite rules to a Value.
+// The generated code implements a function of type func (v *Value) bool
+// which reports whether it did something.
+// Ideas stolen from Swift: http://www.hpl.hp.com/techreports/Compaq-DEC/WRL-2000-2.html
+
+package main
+
+import (
+ "bufio"
+ "bytes"
+ "flag"
+ "fmt"
+ "go/ast"
+ "go/format"
+ "go/parser"
+ "go/printer"
+ "go/token"
+ "io"
+ "log"
+ "os"
+ "path"
+ "regexp"
+ "sort"
+ "strconv"
+ "strings"
+
+ "golang.org/x/tools/go/ast/astutil"
+)
+
+// rule syntax:
+// sexpr [&& extra conditions] => [@block] sexpr
+//
+// sexpr are s-expressions (lisp-like parenthesized groupings)
+// sexpr ::= [variable:](opcode sexpr*)
+// | variable
+// | <type>
+// | [auxint]
+// | {aux}
+//
+// aux ::= variable | {code}
+// type ::= variable | {code}
+// variable ::= some token
+// opcode ::= one of the opcodes from the *Ops.go files
+
+// special rules: trailing ellipsis "..." (in the outermost sexpr?) must match on both sides of a rule.
+// trailing three underscore "___" in the outermost match sexpr indicate the presence of
+// extra ignored args that need not appear in the replacement
+
+// extra conditions is just a chunk of Go that evaluates to a boolean. It may use
+// variables declared in the matching sexpr. The variable "v" is predefined to be
+// the value matched by the entire rule.
+
+// If multiple rules match, the first one in file order is selected.
+
+var (
+ genLog = flag.Bool("log", false, "generate code that logs; for debugging only")
+ addLine = flag.Bool("line", false, "add line number comment to generated rules; for debugging only")
+)
+
+// Rule is a single rewrite rule together with the place it was read from.
+type Rule struct {
+ Rule string // text of the rule
+ Loc string // file name & line number
+}
+
+// String renders r as its quoted rule text followed by its location.
+func (r Rule) String() string {
+ return "rule " + strconv.Quote(r.Rule) + " at " + r.Loc
+}
+
+// normalizeSpaces drops leading and trailing whitespace from s and collapses
+// every internal run of whitespace into a single space.
+func normalizeSpaces(s string) string {
+ words := strings.Fields(s)
+ return strings.Join(words, " ")
+}
+
+// parse splits the rule at "=>" into its match side and its result. If the
+// match side contains "&&", everything after the first "&&" is returned as the
+// extra condition; otherwise cond is "". All three strings are
+// whitespace-normalized.
+func (r Rule) parse() (match, cond, result string) {
+ parts := strings.Split(r.Rule, "=>")
+ match, result = normalizeSpaces(parts[0]), normalizeSpaces(parts[1])
+ i := strings.Index(match, "&&")
+ if i < 0 {
+ return match, "", result
+ }
+ return normalizeSpaces(match[:i]), normalizeSpaces(match[i+2:]), result
+}
+
+// genRules, genSplitLoadRules and genLateLowerRules generate the rewrite code
+// for arch from its plain, "splitload" and "latelower" rule files respectively.
+func genRules(arch arch) { genRulesSuffix(arch, "") }
+func genSplitLoadRules(arch arch) { genRulesSuffix(arch, "splitload") }
+func genLateLowerRules(arch arch) { genRulesSuffix(arch, "latelower") }
+
+// genRulesSuffix reads the rule file <arch.name><suff>.rules, generates the
+// rewrite functions for every rule in it, and writes the formatted result to
+// ../rewrite<arch.name><suff>.go.
+//
+// A missing rule file is fatal when suff is empty and silently ignored
+// otherwise. Every other problem (an unbalanced rule, generated code that does
+// not parse, an output error) terminates the program via log.Fatalf.
+func genRulesSuffix(arch arch, suff string) {
+ // Open input file.
+ text, err := os.Open(arch.name + suff + ".rules")
+ if err != nil {
+ if suff == "" {
+ // All architectures must have a plain rules file.
+ log.Fatalf("can't read rule file: %v", err)
+ }
+ // Some architectures have bonus rules files that others don't share. That's fine.
+ return
+ }
+ // The file is only read from; release it when we are done.
+ defer text.Close()
+
+ // oprules contains a list of rules for each block and opcode
+ blockrules := map[string][]Rule{}
+ oprules := map[string][]Rule{}
+
+ // read rule file
+ scanner := bufio.NewScanner(text)
+ rule := ""
+ var lineno int
+ var ruleLineno int // line number of "=>"
+ for scanner.Scan() {
+ lineno++
+ line := scanner.Text()
+ if i := strings.Index(line, "//"); i >= 0 {
+ // Remove comments. Note that this isn't string safe, so
+ // it will truncate lines with // inside strings. Oh well.
+ line = line[:i]
+ }
+ rule += " " + line
+ rule = strings.TrimSpace(rule)
+ if rule == "" {
+ continue
+ }
+ if !strings.Contains(rule, "=>") {
+ continue
+ }
+ if ruleLineno == 0 {
+ ruleLineno = lineno
+ }
+ if strings.HasSuffix(rule, "=>") {
+ continue // continue on the next line
+ }
+ if n := balance(rule); n > 0 {
+ continue // open parentheses remain, continue on the next line
+ } else if n < 0 {
+ break // continuing the line can't help, and it will only make errors worse
+ }
+
+ loc := fmt.Sprintf("%s%s.rules:%d", arch.name, suff, ruleLineno)
+ for _, rule2 := range expandOr(rule) {
+ r := Rule{Rule: rule2, Loc: loc}
+ if rawop := strings.Split(rule2, " ")[0][1:]; isBlock(rawop, arch) {
+ blockrules[rawop] = append(blockrules[rawop], r)
+ continue
+ }
+ // Do fancier value op matching.
+ match, _, _ := r.parse()
+ op, oparch, _, _, _, _ := parseValue(match, arch, loc)
+ opname := fmt.Sprintf("Op%s%s", oparch, op.name)
+ oprules[opname] = append(oprules[opname], r)
+ }
+ rule = ""
+ ruleLineno = 0
+ }
+ if err := scanner.Err(); err != nil {
+ log.Fatalf("scanner failed: %v\n", err)
+ }
+ if balance(rule) != 0 {
+ // Name the file exactly as loc does above, including the suffix.
+ log.Fatalf("%s%s.rules:%d: unbalanced rule: %v\n", arch.name, suff, lineno, rule)
+ }
+
+ // Order all the ops.
+ var ops []string
+ for op := range oprules {
+ ops = append(ops, op)
+ }
+ sort.Strings(ops)
+
+ genFile := &File{Arch: arch, Suffix: suff}
+ // Main rewrite routine is a switch on v.Op.
+ fn := &Func{Kind: "Value", ArgLen: -1}
+
+ sw := &Switch{Expr: exprf("v.Op")}
+ for _, op := range ops {
+ eop, ok := parseEllipsisRules(oprules[op], arch)
+ if ok {
+ if strings.Contains(oprules[op][0].Rule, "=>") && opByName(arch, op).aux != opByName(arch, eop).aux {
+ panic(fmt.Sprintf("can't use ... for ops that have different aux types: %s and %s", op, eop))
+ }
+ swc := &Case{Expr: exprf("%s", op)}
+ swc.add(stmtf("v.Op = %s", eop))
+ swc.add(stmtf("return true"))
+ sw.add(swc)
+ continue
+ }
+
+ swc := &Case{Expr: exprf("%s", op)}
+ swc.add(stmtf("return rewriteValue%s%s_%s(v)", arch.name, suff, op))
+ sw.add(swc)
+ }
+ if len(sw.List) > 0 { // skip if empty
+ fn.add(sw)
+ }
+ fn.add(stmtf("return false"))
+ genFile.add(fn)
+
+ // Generate a routine per op. Note that we don't make one giant routine
+ // because it is too big for some compilers.
+ for _, op := range ops {
+ rules := oprules[op]
+ _, ok := parseEllipsisRules(oprules[op], arch)
+ if ok {
+ continue
+ }
+
+ // rr is kept between iterations, so that each rule can check
+ // that the previous rule wasn't unconditional.
+ var rr *RuleRewrite
+ fn := &Func{
+ Kind: "Value",
+ Suffix: fmt.Sprintf("_%s", op),
+ ArgLen: opByName(arch, op).argLength,
+ }
+ fn.add(declReserved("b", "v.Block"))
+ fn.add(declReserved("config", "b.Func.Config"))
+ fn.add(declReserved("fe", "b.Func.fe"))
+ fn.add(declReserved("typ", "&b.Func.Config.Types"))
+ for _, rule := range rules {
+ if rr != nil && !rr.CanFail {
+ log.Fatalf("unconditional rule %s is followed by other rules", rr.Match)
+ }
+ rr = &RuleRewrite{Loc: rule.Loc}
+ rr.Match, rr.Cond, rr.Result = rule.parse()
+ pos, _ := genMatch(rr, arch, rr.Match, fn.ArgLen >= 0)
+ if pos == "" {
+ pos = "v.Pos"
+ }
+ if rr.Cond != "" {
+ rr.add(breakf("!(%s)", rr.Cond))
+ }
+ genResult(rr, arch, rr.Result, pos)
+ if *genLog {
+ rr.add(stmtf("logRule(%q)", rule.Loc))
+ }
+ fn.add(rr)
+ }
+ if rr.CanFail {
+ fn.add(stmtf("return false"))
+ }
+ genFile.add(fn)
+ }
+
+ // Generate block rewrite function. There are only a few block types
+ // so we can make this one function with a switch.
+ fn = &Func{Kind: "Block"}
+ fn.add(declReserved("config", "b.Func.Config"))
+ fn.add(declReserved("typ", "&b.Func.Config.Types"))
+
+ sw = &Switch{Expr: exprf("b.Kind")}
+ ops = ops[:0]
+ for op := range blockrules {
+ ops = append(ops, op)
+ }
+ sort.Strings(ops)
+ for _, op := range ops {
+ name, data := getBlockInfo(op, arch)
+ swc := &Case{Expr: exprf("%s", name)}
+ for _, rule := range blockrules[op] {
+ swc.add(genBlockRewrite(rule, arch, data))
+ }
+ sw.add(swc)
+ }
+ if len(sw.List) > 0 { // skip if empty
+ fn.add(sw)
+ }
+ fn.add(stmtf("return false"))
+ genFile.add(fn)
+
+ // Remove unused imports and variables.
+ buf := new(bytes.Buffer)
+ fprint(buf, genFile)
+ fset := token.NewFileSet()
+ file, err := parser.ParseFile(fset, "", buf, parser.ParseComments)
+ if err != nil {
+ // Keep the unparsable output around so that it can be inspected.
+ filename := fmt.Sprintf("%s_broken.go", arch.name)
+ if err := os.WriteFile(filename, buf.Bytes(), 0644); err != nil {
+ log.Printf("failed to dump broken code to %s: %v", filename, err)
+ } else {
+ log.Printf("dumped broken code to %s", filename)
+ }
+ log.Fatalf("failed to parse generated code for arch %s: %v", arch.name, err)
+ }
+ tfile := fset.File(file.Pos())
+
+ // First, use unusedInspector to find the unused declarations by their
+ // start position.
+ u := unusedInspector{unused: make(map[token.Pos]bool)}
+ u.node(file)
+
+ // Then, delete said nodes via astutil.Apply.
+ pre := func(c *astutil.Cursor) bool {
+ node := c.Node()
+ if node == nil {
+ return true
+ }
+ if u.unused[node.Pos()] {
+ c.Delete()
+ // Unused imports and declarations use exactly
+ // one line. Prevent leaving an empty line.
+ tfile.MergeLine(tfile.Position(node.Pos()).Line)
+ return false
+ }
+ return true
+ }
+ post := func(c *astutil.Cursor) bool {
+ switch node := c.Node().(type) {
+ case *ast.GenDecl:
+ if len(node.Specs) == 0 {
+ // Don't leave a broken or empty GenDecl behind,
+ // such as "import ()".
+ c.Delete()
+ }
+ }
+ return true
+ }
+ file = astutil.Apply(file, pre, post).(*ast.File)
+
+ // Write the well-formatted source to file
+ f, err := os.Create("../rewrite" + arch.name + suff + ".go")
+ if err != nil {
+ log.Fatalf("can't write output: %v", err)
+ }
+ defer f.Close()
+ // gofmt result; use a buffered writer, as otherwise go/format spends
+ // far too much time in syscalls.
+ bw := bufio.NewWriter(f)
+ if err := format.Node(bw, fset, file); err != nil {
+ log.Fatalf("can't format output: %v", err)
+ }
+ if err := bw.Flush(); err != nil {
+ log.Fatalf("can't write output: %v", err)
+ }
+ if err := f.Close(); err != nil {
+ log.Fatalf("can't write output: %v", err)
+ }
+}
+
+// unusedInspector can be used to detect unused variables and imports in an
+// ast.Node via its node method. The result is available in the "unused" map.
+//
+// Note that unusedInspector is lazy and best-effort; it only supports the node
+// types and patterns used by the rulegen program.
+type unusedInspector struct {
+ // scope is the current scope, which can never be nil when a declaration
+ // is encountered. That is, the unusedInspector.node entrypoint should
+ // generally be an entire file or block.
+ scope *scope
+
+ // unused is the resulting set of unused declared names, indexed by the
+ // starting position of the node that declared the name.
+ unused map[token.Pos]bool
+
+ // defining is the object currently being defined; this is useful so
+ // that if "foo := bar" is unused and removed, we can then detect if
+ // "bar" becomes unused as well.
+ defining *object
+}
+
+// scoped opens a new scope when called, and returns a function which closes
+// that same scope. When a scope is closed, unused variables are recorded.
+//
+// It is meant to be used as "defer u.scoped()()".
+func (u *unusedInspector) scoped() func() {
+ outer := u.scope
+ u.scope = &scope{outer: outer, objects: map[string]*object{}}
+ return func() {
+ // Marking an object as unused releases the objects its declaration
+ // used, which may in turn leave those without uses. Repeat until a
+ // pass over the scope drops no use count to zero.
+ for anyUnused := true; anyUnused; {
+ anyUnused = false
+ for _, obj := range u.scope.objects {
+ if obj.numUses > 0 {
+ continue
+ }
+ u.unused[obj.pos] = true
+ for _, used := range obj.used {
+ if used.numUses--; used.numUses == 0 {
+ anyUnused = true
+ }
+ }
+ // We've decremented numUses for each of the
+ // objects in used. Zero this slice too, to keep
+ // everything consistent.
+ obj.used = nil
+ }
+ }
+ u.scope = outer
+ }
+}
+
+// exprs visits every expression in list, in order.
+func (u *unusedInspector) exprs(list []ast.Expr) {
+ for i := range list {
+ u.node(list[i])
+ }
+}
+
+// node walks the given syntax node. Imports and ":=" definitions are recorded
+// as objects in the current scope, and every identifier that resolves to a
+// recorded object counts as a use of it. Files, blocks and case clauses open
+// their own scope.
+//
+// Only the node types listed below are supported; any other type panics.
+func (u *unusedInspector) node(node ast.Node) {
+ switch node := node.(type) {
+ case *ast.File:
+ defer u.scoped()()
+ for _, decl := range node.Decls {
+ u.node(decl)
+ }
+ case *ast.GenDecl:
+ for _, spec := range node.Specs {
+ u.node(spec)
+ }
+ case *ast.ImportSpec:
+ // The import is recorded under the last element of its path.
+ impPath, _ := strconv.Unquote(node.Path.Value)
+ name := path.Base(impPath)
+ u.scope.objects[name] = &object{
+ name: name,
+ pos: node.Pos(),
+ }
+ case *ast.FuncDecl:
+ u.node(node.Type)
+ if node.Body != nil {
+ u.node(node.Body)
+ }
+ case *ast.FuncType:
+ if node.Params != nil {
+ u.node(node.Params)
+ }
+ if node.Results != nil {
+ u.node(node.Results)
+ }
+ case *ast.FieldList:
+ for _, field := range node.List {
+ u.node(field)
+ }
+ case *ast.Field:
+ u.node(node.Type)
+
+ // statements
+
+ case *ast.BlockStmt:
+ defer u.scoped()()
+ for _, stmt := range node.List {
+ u.node(stmt)
+ }
+ case *ast.DeclStmt:
+ u.node(node.Decl)
+ case *ast.IfStmt:
+ if node.Init != nil {
+ u.node(node.Init)
+ }
+ u.node(node.Cond)
+ u.node(node.Body)
+ if node.Else != nil {
+ u.node(node.Else)
+ }
+ case *ast.ForStmt:
+ if node.Init != nil {
+ u.node(node.Init)
+ }
+ if node.Cond != nil {
+ u.node(node.Cond)
+ }
+ if node.Post != nil {
+ u.node(node.Post)
+ }
+ u.node(node.Body)
+ case *ast.SwitchStmt:
+ if node.Init != nil {
+ u.node(node.Init)
+ }
+ if node.Tag != nil {
+ u.node(node.Tag)
+ }
+ u.node(node.Body)
+ case *ast.CaseClause:
+ u.exprs(node.List)
+ defer u.scoped()()
+ for _, stmt := range node.Body {
+ u.node(stmt)
+ }
+ case *ast.BranchStmt:
+ case *ast.ExprStmt:
+ u.node(node.X)
+ case *ast.AssignStmt:
+ if node.Tok != token.DEFINE {
+ // Plain assignment: both sides only use names.
+ u.exprs(node.Rhs)
+ u.exprs(node.Lhs)
+ break
+ }
+ // "x, _ := ..." is treated like "x := ...".
+ lhs := node.Lhs
+ if len(lhs) == 2 && lhs[1].(*ast.Ident).Name == "_" {
+ lhs = lhs[:1]
+ }
+ if len(lhs) != 1 {
+ panic("no support for := with multiple names")
+ }
+
+ name := lhs[0].(*ast.Ident)
+ obj := &object{
+ name: name.Name,
+ pos: name.NamePos,
+ }
+
+ // Visit the right-hand side with obj as the object being defined,
+ // so that the names it uses are attributed to obj.
+ old := u.defining
+ u.defining = obj
+ u.exprs(node.Rhs)
+ u.defining = old
+
+ u.scope.objects[name.Name] = obj
+ case *ast.ReturnStmt:
+ u.exprs(node.Results)
+ case *ast.IncDecStmt:
+ u.node(node.X)
+
+ // expressions
+
+ case *ast.CallExpr:
+ u.node(node.Fun)
+ u.exprs(node.Args)
+ case *ast.SelectorExpr:
+ u.node(node.X)
+ case *ast.UnaryExpr:
+ u.node(node.X)
+ case *ast.BinaryExpr:
+ u.node(node.X)
+ u.node(node.Y)
+ case *ast.StarExpr:
+ u.node(node.X)
+ case *ast.ParenExpr:
+ u.node(node.X)
+ case *ast.IndexExpr:
+ u.node(node.X)
+ u.node(node.Index)
+ case *ast.TypeAssertExpr:
+ u.node(node.X)
+ u.node(node.Type)
+ case *ast.Ident:
+ if obj := u.scope.Lookup(node.Name); obj != nil {
+ obj.numUses++
+ if u.defining != nil {
+ u.defining.used = append(u.defining.used, obj)
+ }
+ }
+ case *ast.BasicLit:
+ case *ast.ValueSpec:
+ u.exprs(node.Values)
+ default:
+ panic(fmt.Sprintf("unhandled node: %T", node))
+ }
+}
+
+// scope holds the names declared at one nesting level, plus a link to the
+// enclosing (parent) scope.
+type scope struct {
+ outer *scope // can be nil, if this is the top-level scope
+ objects map[string]*object // indexed by each declared name
+}
+
+// Lookup returns the object declared as name in s or, failing that, in the
+// nearest enclosing scope that declares it. It returns nil if no scope does.
+func (s *scope) Lookup(name string) *object {
+ for cur := s; ; cur = cur.outer {
+ if obj := cur.objects[name]; obj != nil {
+ return obj
+ }
+ if cur.outer == nil {
+ return nil
+ }
+ }
+}
+
+// object describes one declared name, such as a variable or an import.
+type object struct {
+ name string
+ pos token.Pos // start position of the node declaring the object
+
+ numUses int // number of times this object is used
+ used []*object // objects that its declaration makes use of
+}
+
+// fprint writes the Go source text for n to w.
+//
+// n must be one of the statement types declared in this file (*File, *Switch,
+// *Case, *RuleRewrite, *Declare, *CondBreak, StartCommuteLoop) or an ast.Node;
+// any other type is fatal. While printing a *File, a rule whose normalized
+// match, condition and result were already seen in that file is fatal too.
+func fprint(w io.Writer, n Node) {
+ switch n := n.(type) {
+ case *File:
+ file := n
+ // seenRewrite maps a normalized (match, cond, result) triple to the
+ // location of the rule that first produced it.
+ seenRewrite := make(map[[3]string]string)
+ fmt.Fprintf(w, "// Code generated from _gen/%s%s.rules; DO NOT EDIT.\n", n.Arch.name, n.Suffix)
+ fmt.Fprintf(w, "// generated with: cd _gen; go run .\n")
+ fmt.Fprintf(w, "\npackage ssa\n")
+ for _, path := range append([]string{
+ "fmt",
+ "internal/buildcfg",
+ "math",
+ "cmd/internal/obj",
+ "cmd/compile/internal/base",
+ "cmd/compile/internal/types",
+ }, n.Arch.imports...) {
+ fmt.Fprintf(w, "import %q\n", path)
+ }
+ for _, f := range n.List {
+ f := f.(*Func)
+ fmt.Fprintf(w, "func rewrite%s%s%s%s(", f.Kind, n.Arch.name, n.Suffix, f.Suffix)
+ fmt.Fprintf(w, "%c *%s) bool {\n", strings.ToLower(f.Kind)[0], f.Kind)
+ if f.Kind == "Value" && f.ArgLen > 0 {
+ // Declare v_0 ... v_<ArgLen-1>, highest index first.
+ for i := f.ArgLen - 1; i >= 0; i-- {
+ fmt.Fprintf(w, "v_%d := v.Args[%d]\n", i, i)
+ }
+ }
+ for _, n := range f.List {
+ fprint(w, n)
+
+ if rr, ok := n.(*RuleRewrite); ok {
+ k := [3]string{
+ normalizeMatch(rr.Match, file.Arch),
+ normalizeWhitespace(rr.Cond),
+ normalizeWhitespace(rr.Result),
+ }
+ if prev, ok := seenRewrite[k]; ok {
+ log.Fatalf("duplicate rule %s, previously seen at %s\n", rr.Loc, prev)
+ }
+ seenRewrite[k] = rr.Loc
+ }
+ }
+ fmt.Fprintf(w, "}\n")
+ }
+ case *Switch:
+ fmt.Fprintf(w, "switch ")
+ fprint(w, n.Expr)
+ fmt.Fprintf(w, " {\n")
+ for _, n := range n.List {
+ fprint(w, n)
+ }
+ fmt.Fprintf(w, "}\n")
+ case *Case:
+ fmt.Fprintf(w, "case ")
+ fprint(w, n.Expr)
+ fmt.Fprintf(w, ":\n")
+ for _, n := range n.List {
+ fprint(w, n)
+ }
+ case *RuleRewrite:
+ if *addLine {
+ fmt.Fprintf(w, "// %s\n", n.Loc)
+ }
+ fmt.Fprintf(w, "// match: %s\n", n.Match)
+ if n.Cond != "" {
+ fmt.Fprintf(w, "// cond: %s\n", n.Cond)
+ }
+ fmt.Fprintf(w, "// result: %s\n", n.Result)
+ // The rule body is wrapped in a "for" statement so that a failed
+ // check can leave it with "break".
+ fmt.Fprintf(w, "for %s {\n", n.Check)
+ nCommutative := 0
+ for _, n := range n.List {
+ if b, ok := n.(*CondBreak); ok {
+ // Inside a commute loop a failed check moves on to the
+ // next iteration instead of leaving the rule.
+ b.InsideCommuteLoop = nCommutative > 0
+ }
+ fprint(w, n)
+ if loop, ok := n.(StartCommuteLoop); ok {
+ if nCommutative != loop.Depth {
+ panic("mismatch commute loop depth")
+ }
+ nCommutative++
+ }
+ }
+ fmt.Fprintf(w, "return true\n")
+ // Close every commute loop that was opened above.
+ for i := 0; i < nCommutative; i++ {
+ fmt.Fprintln(w, "}")
+ }
+ if n.CommuteDepth > 0 && n.CanFail {
+ fmt.Fprint(w, "break\n")
+ }
+ fmt.Fprintf(w, "}\n")
+ case *Declare:
+ fmt.Fprintf(w, "%s := ", n.Name)
+ fprint(w, n.Value)
+ fmt.Fprintln(w)
+ case *CondBreak:
+ fmt.Fprintf(w, "if ")
+ fprint(w, n.Cond)
+ fmt.Fprintf(w, " {\n")
+ if n.InsideCommuteLoop {
+ fmt.Fprintf(w, "continue")
+ } else {
+ fmt.Fprintf(w, "break")
+ }
+ fmt.Fprintf(w, "\n}\n")
+ case ast.Node:
+ printConfig.Fprint(w, emptyFset, n)
+ if _, ok := n.(ast.Stmt); ok {
+ fmt.Fprintln(w)
+ }
+ case StartCommuteLoop:
+ // Two iterations; the post statement swaps <V>_0 and <V>_1.
+ fmt.Fprintf(w, "for _i%[1]d := 0; _i%[1]d <= 1; _i%[1]d, %[2]s_0, %[2]s_1 = _i%[1]d + 1, %[2]s_1, %[2]s_0 {\n", n.Depth, n.V)
+ default:
+ log.Fatalf("cannot print %T", n)
+ }
+}
+
+// printConfig is the go/printer configuration fprint uses for ast.Node values.
+var printConfig = printer.Config{
+ Mode: printer.RawFormat, // we use go/format later, so skip work here
+}
+
+// emptyFset is the file set handed to printConfig.Fprint by fprint.
+var emptyFset = token.NewFileSet()
+
+// Node can be a Statement or an ast.Expr.
+type Node interface{}
+
+// Statement can be one of our high-level statement struct types, or an
+// ast.Stmt under some limited circumstances.
+type Statement interface{}
+
+// BodyBase is embedded in every statement pseudo-node type that can hold other
+// statements.
+type BodyBase struct {
+ List []Statement
+ CanFail bool
+}
+
+// add appends node to the body. Adding a *CondBreak marks the body as able to
+// fail; if the body's last statement is itself a *CondBreak, the new condition
+// is OR-ed into it instead of being appended as a separate statement.
+func (w *BodyBase) add(node Statement) {
+ if cb, isBreak := node.(*CondBreak); isBreak {
+ w.CanFail = true
+ if n := len(w.List); n > 0 {
+ if prev, ok := w.List[n-1].(*CondBreak); ok {
+ // Fold into the previous "if <cond> { break }" via a
+ // logical OR, which keeps the output less verbose.
+ prev.Cond = &ast.BinaryExpr{Op: token.LOR, X: prev.Cond, Y: cb.Cond}
+ return
+ }
+ }
+ }
+ w.List = append(w.List, node)
+}
+
+// predeclared contains globally known tokens that should not be redefined.
+var predeclared = map[string]bool{
+ "nil": true,
+ "false": true,
+ "true": true,
+}
+
+// declared reports whether name counts as already declared in the body: either
+// it is one of the predeclared names, or the body holds a Declare for it.
+func (w *BodyBase) declared(name string) bool {
+ // Predeclared names are treated as having been declared already.
+ // This lets us use nil to match an aux field or
+ // true and false to match an auxint field.
+ if predeclared[name] {
+ return true
+ }
+ for i := range w.List {
+ decl, ok := w.List[i].(*Declare)
+ if ok && decl.Name == name {
+ return true
+ }
+ }
+ return false
+}
+
+// These types define some high-level statement struct types, which can be used
+// as a Statement. This allows us to keep some node structs simpler, and have
+// higher-level nodes such as an entire rule rewrite.
+//
+// Note that ast.Expr is always used as-is; we don't declare our own expression
+// nodes.
+type (
+ // File is one generated rewrite file for an architecture and suffix.
+ File struct {
+ BodyBase // []*Func
+ Arch arch
+ Suffix string
+ }
+ // Func is one generated rewrite function.
+ Func struct {
+ BodyBase
+ Kind string // "Value" or "Block"
+ Suffix string
+ ArgLen int32 // if kind == "Value", number of args for this op
+ }
+ // Switch is printed as a switch statement on Expr.
+ Switch struct {
+ BodyBase // []*Case
+ Expr ast.Expr
+ }
+ // Case is printed as one case clause of a Switch.
+ Case struct {
+ BodyBase
+ Expr ast.Expr
+ }
+ // RuleRewrite is the code generated for a single rule.
+ RuleRewrite struct {
+ BodyBase
+ Match, Cond, Result string // top comments
+ Check string // top-level boolean expression
+
+ Alloc int // for unique var names
+ Loc string // file name & line number of the original rule
+ CommuteDepth int // used to track depth of commute loops
+ }
+ // Declare is printed as "Name := Value".
+ Declare struct {
+ Name string
+ Value ast.Expr
+ }
+ // CondBreak is printed as "if Cond { break }", or with "continue" in
+ // place of "break" when InsideCommuteLoop is set.
+ CondBreak struct {
+ Cond ast.Expr
+ InsideCommuteLoop bool
+ }
+ // StartCommuteLoop is printed as the header of a two-iteration loop
+ // that swaps V_0 and V_1 between iterations.
+ StartCommuteLoop struct {
+ Depth int
+ V string
+ }
+)
+
+// exprf parses a Go expression generated from fmt.Sprintf. If the expression
+// does not parse, the program is terminated via log.Fatalf.
+func exprf(format string, a ...interface{}) ast.Expr {
+ src := fmt.Sprintf(format, a...)
+ expr, err := parser.ParseExpr(src)
+ if err != nil {
+ log.Fatalf("expr parse error on %q: %v", src, err)
+ }
+ return expr
+}
+
+// stmtf parses a Go statement generated from fmt.Sprintf. This function is only
+// meant for simple statements that don't have a custom Statement node declared
+// in this package, such as ast.ReturnStmt or ast.ExprStmt.
+//
+// Only the first statement of the formatted source is returned. If the source
+// does not parse, the program is terminated via log.Fatalf.
+func stmtf(format string, a ...interface{}) Statement {
+ src := fmt.Sprintf(format, a...)
+ // Statements cannot be parsed on their own, so wrap the source in a
+ // function body inside a throwaway file.
+ fsrc := "package p\nfunc _() {\n" + src + "\n}\n"
+ file, err := parser.ParseFile(token.NewFileSet(), "", fsrc, 0)
+ if err != nil {
+ log.Fatalf("stmt parse error on %q: %v", src, err)
+ }
+ return file.Decls[0].(*ast.FuncDecl).Body.List[0]
+}
+
+// reservedNames lists the variable names that declf rejects and that
+// declReserved requires.
+var reservedNames = map[string]bool{
+ "v": true, // Values[i], etc
+ "b": true, // v.Block
+ "config": true, // b.Func.Config
+ "fe": true, // b.Func.fe
+ "typ": true, // &b.Func.Config.Types
+}
+
+// declf builds a "name := value" declaration whose value is parsed by exprf
+// from format and a.
+//
+// name must not be one of reservedNames; if it is, the program is terminated
+// with a message that mentions loc. This helps prevent unintended shadowing
+// and name clashes. To declare a reserved name, use declReserved.
+func declf(loc, name, format string, a ...interface{}) *Declare {
+ if reservedNames[name] {
+ log.Fatalf("rule %s uses the reserved name %s", loc, name)
+ }
+ value := exprf(format, a...)
+ return &Declare{Name: name, Value: value}
+}
+
+// declReserved is the counterpart of declf for reserved names: name must be
+// one of reservedNames, otherwise it panics. value is parsed by exprf as-is.
+// Calls to declReserved should generally be static and top-level.
+func declReserved(name, value string) *Declare {
+ if reservedNames[name] {
+ return &Declare{Name: name, Value: exprf(value)}
+ }
+ panic(fmt.Sprintf("declReserved call does not use a reserved name: %q", name))
+}
+
+// breakf constructs a simple "if cond { break }" statement, using exprf for its
+// condition. fprint prints it with "continue" instead of "break" when the
+// statement sits inside a commute loop.
+func breakf(format string, a ...interface{}) *CondBreak {
+ return &CondBreak{Cond: exprf(format, a...)}
+}
+
+// genBlockRewrite generates the rewrite code for a single block rule.
+//
+// data describes the block kind matched by the rule. The returned RuleRewrite
+// matches the block's control values, AuxInt and Aux, evaluates the rule's
+// extra condition, and then resets the block to the kind, controls, AuxInt and
+// Aux named by the rule's result. The result must list the same successors as
+// the match, either in the same order or, for two successors, swapped.
+// A malformed rule terminates the program via log.Fatalf.
+func genBlockRewrite(rule Rule, arch arch, data blockData) *RuleRewrite {
+ rr := &RuleRewrite{Loc: rule.Loc}
+ rr.Match, rr.Cond, rr.Result = rule.parse()
+ _, _, auxint, aux, s := extract(rr.Match) // remove parens, then split
+
+ // check match of control values
+ if len(s) < data.controls {
+ log.Fatalf("incorrect number of arguments in %s, got %v wanted at least %v", rule, len(s), data.controls)
+ }
+ controls := s[:data.controls]
+ pos := make([]string, data.controls)
+ for i, arg := range controls {
+ cname := fmt.Sprintf("b.Controls[%v]", i)
+ if strings.Contains(arg, "(") {
+ vname, expr := splitNameExpr(arg)
+ if vname == "" {
+ vname = fmt.Sprintf("v_%v", i)
+ }
+ rr.add(declf(rr.Loc, vname, cname))
+ p, op := genMatch0(rr, arch, expr, vname, nil, false) // TODO: pass non-nil cnt?
+ if op != "" {
+ check := fmt.Sprintf("%s.Op == %s", cname, op)
+ if rr.Check == "" {
+ rr.Check = check
+ } else {
+ rr.Check += " && " + check
+ }
+ }
+ if p == "" {
+ p = vname + ".Pos"
+ }
+ pos[i] = p
+ } else {
+ rr.add(declf(rr.Loc, arg, cname))
+ pos[i] = arg + ".Pos"
+ }
+ }
+ for _, e := range []struct {
+ name, field, dclType string
+ }{
+ {auxint, "AuxInt", data.auxIntType()},
+ {aux, "Aux", data.auxType()},
+ } {
+ if e.name == "" {
+ continue
+ }
+
+ if e.dclType == "" {
+ log.Fatalf("op %s has no declared type for %s", data.name, e.field)
+ }
+ // Something that is not a fresh identifier is compared against the
+ // field; a fresh identifier is bound to it.
+ if !token.IsIdentifier(e.name) || rr.declared(e.name) {
+ rr.add(breakf("%sTo%s(b.%s) != %s", unTitle(e.field), title(e.dclType), e.field, e.name))
+ } else {
+ rr.add(declf(rr.Loc, e.name, "%sTo%s(b.%s)", unTitle(e.field), title(e.dclType), e.field))
+ }
+ }
+ if rr.Cond != "" {
+ rr.add(breakf("!(%s)", rr.Cond))
+ }
+
+ // Rule matches. Generate result.
+ outop, _, auxint, aux, t := extract(rr.Result) // remove parens, then split
+ blockName, outdata := getBlockInfo(outop, arch)
+ if len(t) < outdata.controls {
+ // Report the number of arguments on the result side (t), not on the match side (s).
+ log.Fatalf("incorrect number of output arguments in %s, got %v wanted at least %v", rule, len(t), outdata.controls)
+ }
+
+ // Check if newsuccs is the same set as succs.
+ succs := s[data.controls:]
+ newsuccs := t[outdata.controls:]
+ m := map[string]bool{}
+ for _, succ := range succs {
+ if m[succ] {
+ log.Fatalf("can't have a repeat successor name %s in %s", succ, rule)
+ }
+ m[succ] = true
+ }
+ for _, succ := range newsuccs {
+ if !m[succ] {
+ log.Fatalf("unknown successor %s in %s", succ, rule)
+ }
+ delete(m, succ)
+ }
+ if len(m) != 0 {
+ log.Fatalf("unmatched successors %v in %s", m, rule)
+ }
+
+ var genControls [2]string
+ if outdata.controls > len(genControls) {
+ // Fail with a clear message instead of an index-out-of-range panic below.
+ log.Fatalf("too many controls: %d", outdata.controls)
+ }
+ for i, control := range t[:outdata.controls] {
+ // Select a source position for any new control values.
+ // TODO: does it always make sense to use the source position
+ // of the original control values or should we be using the
+ // block's source position in some cases?
+ newpos := "b.Pos" // default to block's source position
+ if i < len(pos) && pos[i] != "" {
+ // Use the previous control value's source position.
+ newpos = pos[i]
+ }
+
+ // Generate a new control value (or copy an existing value).
+ genControls[i] = genResult0(rr, arch, control, false, false, newpos, nil)
+ }
+ switch outdata.controls {
+ case 0:
+ rr.add(stmtf("b.Reset(%s)", blockName))
+ case 1:
+ rr.add(stmtf("b.resetWithControl(%s, %s)", blockName, genControls[0]))
+ case 2:
+ rr.add(stmtf("b.resetWithControl2(%s, %s, %s)", blockName, genControls[0], genControls[1]))
+ default:
+ log.Fatalf("too many controls: %d", outdata.controls)
+ }
+
+ if auxint != "" {
+ // Make sure auxint value has the right type.
+ rr.add(stmtf("b.AuxInt = %sToAuxInt(%s)", unTitle(outdata.auxIntType()), auxint))
+ }
+ if aux != "" {
+ // Make sure aux value has the right type.
+ rr.add(stmtf("b.Aux = %sToAux(%s)", unTitle(outdata.auxType()), aux))
+ }
+
+ // The successor sets are equal (checked above), so they differ at most
+ // in their order.
+ succChanged := false
+ for i := 0; i < len(succs); i++ {
+ if succs[i] != newsuccs[i] {
+ succChanged = true
+ }
+ }
+ if succChanged {
+ if len(succs) != 2 {
+ log.Fatalf("changed successors, len!=2 in %s", rule)
+ }
+ if succs[0] != newsuccs[1] || succs[1] != newsuccs[0] {
+ log.Fatalf("can only handle swapped successors in %s", rule)
+ }
+ rr.add(stmtf("b.swapSuccessors()"))
+ }
+
+ if *genLog {
+ rr.add(stmtf("logRule(%q)", rule.Loc))
+ }
+ return rr
+}
+
+// genMatch generates the code matching the s-expression match against v.
+// It returns the source position expression that should be used for the
+// result (or "" if no opinion), and the name of the Op constant of match's
+// top-level opcode.
+func genMatch(rr *RuleRewrite, arch arch, match string, pregenTop bool) (pos, checkOp string) {
+ cnt := varCount(rr)
+ return genMatch0(rr, arch, match, "v", cnt, pregenTop)
+}
+
+// genMatch0 adds to rr the code that matches the compound s-expression match
+// against the value held in the variable named v, recursing into compound
+// arguments.
+//
+// cnt is indexed by argument name and is only consulted to decide whether the
+// commutative match can be skipped; it may be nil. pregenTop selects the
+// variables v_0, v_1, ... instead of v.Args[i] as the source of v's arguments.
+//
+// It returns pos, the source position expression to use for the result (or ""
+// if no opinion), and checkOp, the name of the Op constant for match's opcode.
+func genMatch0(rr *RuleRewrite, arch arch, match, v string, cnt map[string]int, pregenTop bool) (pos, checkOp string) {
+ if match[0] != '(' || match[len(match)-1] != ')' {
+ log.Fatalf("%s: non-compound expr in genMatch0: %q", rr.Loc, match)
+ }
+ op, oparch, typ, auxint, aux, args := parseValue(match, arch, rr.Loc)
+
+ checkOp = fmt.Sprintf("Op%s%s", oparch, op.name)
+
+ if op.faultOnNilArg0 || op.faultOnNilArg1 {
+ // Prefer the position of an instruction which could fault.
+ pos = v + ".Pos"
+ }
+
+ // If the last argument is ___, it means "don't care about trailing arguments, really"
+ // The likely/intended use is for rewrites that are too tricky to express in the existing pattern language
+ // Do a length check early because long patterns fed short (ultimately not-matching) inputs will
+ // do an indexing error in pattern-matching.
+ if op.argLength == -1 {
+ l := len(args)
+ if l == 0 || args[l-1] != "___" {
+ rr.add(breakf("len(%s.Args) != %d", v, l))
+ } else if l > 1 && args[l-1] == "___" {
+ rr.add(breakf("len(%s.Args) < %d", v, l-1))
+ }
+ }
+
+ // Match or bind the type, auxint and aux of the value.
+ for _, e := range []struct {
+ name, field, dclType string
+ }{
+ {typ, "Type", "*types.Type"},
+ {auxint, "AuxInt", op.auxIntType()},
+ {aux, "Aux", op.auxType()},
+ } {
+ if e.name == "" {
+ continue
+ }
+
+ if e.dclType == "" {
+ log.Fatalf("op %s has no declared type for %s", op.name, e.field)
+ }
+ // Something that is not a fresh identifier is compared against the
+ // field; a fresh identifier is bound to it.
+ if !token.IsIdentifier(e.name) || rr.declared(e.name) {
+ switch e.field {
+ case "Aux":
+ rr.add(breakf("auxTo%s(%s.%s) != %s", title(e.dclType), v, e.field, e.name))
+ case "AuxInt":
+ rr.add(breakf("auxIntTo%s(%s.%s) != %s", title(e.dclType), v, e.field, e.name))
+ case "Type":
+ rr.add(breakf("%s.%s != %s", v, e.field, e.name))
+ }
+ } else {
+ switch e.field {
+ case "Aux":
+ rr.add(declf(rr.Loc, e.name, "auxTo%s(%s.%s)", title(e.dclType), v, e.field))
+ case "AuxInt":
+ rr.add(declf(rr.Loc, e.name, "auxIntTo%s(%s.%s)", title(e.dclType), v, e.field))
+ case "Type":
+ rr.add(declf(rr.Loc, e.name, "%s.%s", v, e.field))
+ }
+ }
+ }
+
+ commutative := op.commutative
+ if commutative {
+ if args[0] == args[1] {
+ // When we have (Add x x), for any x,
+ // even if there are other uses of x besides these two,
+ // and even if x is not a variable,
+ // we can skip the commutative match.
+ commutative = false
+ }
+ if cnt[args[0]] == 1 && cnt[args[1]] == 1 {
+ // When we have (Add x y) with no other uses
+ // of x and y in the matching rule and condition,
+ // then we can skip the commutative match (Add y x).
+ commutative = false
+ }
+ }
+
+ if !pregenTop {
+ // Access last argument first to minimize bounds checks.
+ for n := len(args) - 1; n > 0; n-- {
+ a := args[n]
+ if a == "_" {
+ continue
+ }
+ if !rr.declared(a) && token.IsIdentifier(a) && !(commutative && len(args) == 2) {
+ rr.add(declf(rr.Loc, a, "%s.Args[%d]", v, n))
+ // delete the last argument so it is not reprocessed
+ args = args[:n]
+ } else {
+ rr.add(stmtf("_ = %s.Args[%d]", v, n))
+ }
+ break
+ }
+ }
+ if commutative && !pregenTop {
+ // Declare v_0 and v_1, the variables swapped by the commute loop.
+ for i := 0; i <= 1; i++ {
+ vname := fmt.Sprintf("%s_%d", v, i)
+ rr.add(declf(rr.Loc, vname, "%s.Args[%d]", v, i))
+ }
+ }
+ if commutative {
+ rr.add(StartCommuteLoop{rr.CommuteDepth, v})
+ rr.CommuteDepth++
+ }
+ for i, arg := range args {
+ if arg == "_" {
+ continue
+ }
+ var rhs string
+ if (commutative && i < 2) || pregenTop {
+ rhs = fmt.Sprintf("%s_%d", v, i)
+ } else {
+ rhs = fmt.Sprintf("%s.Args[%d]", v, i)
+ }
+ if !strings.Contains(arg, "(") {
+ // leaf variable
+ if rr.declared(arg) {
+ // variable already has a definition. Check whether
+ // the old definition and the new definition match.
+ // For example, (add x x). Equality is just pointer equality
+ // on Values (so cse is important to do before lowering).
+ rr.add(breakf("%s != %s", arg, rhs))
+ } else {
+ if arg != rhs {
+ rr.add(declf(rr.Loc, arg, "%s", rhs))
+ }
+ }
+ continue
+ }
+ // compound sexpr
+ argname, expr := splitNameExpr(arg)
+ if argname == "" {
+ argname = fmt.Sprintf("%s_%d", v, i)
+ }
+ if argname == "b" {
+ log.Fatalf("don't name args 'b', it is ambiguous with blocks")
+ }
+
+ if argname != rhs {
+ rr.add(declf(rr.Loc, argname, "%s", rhs))
+ }
+ // The op check must be added before the argument's own checks, but
+ // the op name is only known once the recursive call returns. Add it
+ // with a placeholder identifier and patch the name in afterwards.
+ bexpr := exprf("%s.Op != addLater", argname)
+ rr.add(&CondBreak{Cond: bexpr})
+ argPos, argCheckOp := genMatch0(rr, arch, expr, argname, cnt, false)
+ bexpr.(*ast.BinaryExpr).Y.(*ast.Ident).Name = argCheckOp
+
+ if argPos != "" {
+ // Keep the argument in preference to the parent, as the
+ // argument is normally earlier in program flow.
+ // Keep the argument in preference to an earlier argument,
+ // as that prefers the memory argument which is also earlier
+ // in the program flow.
+ pos = argPos
+ }
+ }
+
+ return pos, checkOp
+}
+
+func genResult(rr *RuleRewrite, arch arch, result, pos string) {
+ move := result[0] == '@'
+ if move {
+ // parse @block directive
+ s := strings.SplitN(result[1:], " ", 2)
+ rr.add(stmtf("b = %s", s[0]))
+ result = s[1]
+ }
+ cse := make(map[string]string)
+ genResult0(rr, arch, result, true, move, pos, cse)
+}
+
+func genResult0(rr *RuleRewrite, arch arch, result string, top, move bool, pos string, cse map[string]string) string {
+ resname, expr := splitNameExpr(result)
+ result = expr
+ // TODO: when generating a constant result, use f.constVal to avoid
+ // introducing copies just to clean them up again.
+ if result[0] != '(' {
+ // variable
+ if top {
+ // It in not safe in general to move a variable between blocks
+ // (and particularly not a phi node).
+ // Introduce a copy.
+ rr.add(stmtf("v.copyOf(%s)", result))
+ }
+ return result
+ }
+
+ w := normalizeWhitespace(result)
+ if prev := cse[w]; prev != "" {
+ return prev
+ }
+
+ op, oparch, typ, auxint, aux, args := parseValue(result, arch, rr.Loc)
+
+ // Find the type of the variable.
+ typeOverride := typ != ""
+ if typ == "" && op.typ != "" {
+ typ = typeName(op.typ)
+ }
+
+ v := "v"
+ if top && !move {
+ rr.add(stmtf("v.reset(Op%s%s)", oparch, op.name))
+ if typeOverride {
+ rr.add(stmtf("v.Type = %s", typ))
+ }
+ } else {
+ if typ == "" {
+ log.Fatalf("sub-expression %s (op=Op%s%s) at %s must have a type", result, oparch, op.name, rr.Loc)
+ }
+ if resname == "" {
+ v = fmt.Sprintf("v%d", rr.Alloc)
+ } else {
+ v = resname
+ }
+ rr.Alloc++
+ rr.add(declf(rr.Loc, v, "b.NewValue0(%s, Op%s%s, %s)", pos, oparch, op.name, typ))
+ if move && top {
+ // Rewrite original into a copy
+ rr.add(stmtf("v.copyOf(%s)", v))
+ }
+ }
+
+ if auxint != "" {
+ // Make sure auxint value has the right type.
+ rr.add(stmtf("%s.AuxInt = %sToAuxInt(%s)", v, unTitle(op.auxIntType()), auxint))
+ }
+ if aux != "" {
+ // Make sure aux value has the right type.
+ rr.add(stmtf("%s.Aux = %sToAux(%s)", v, unTitle(op.auxType()), aux))
+ }
+ all := new(strings.Builder)
+ for i, arg := range args {
+ x := genResult0(rr, arch, arg, false, move, pos, cse)
+ if i > 0 {
+ all.WriteString(", ")
+ }
+ all.WriteString(x)
+ }
+ switch len(args) {
+ case 0:
+ case 1:
+ rr.add(stmtf("%s.AddArg(%s)", v, all.String()))
+ default:
+ rr.add(stmtf("%s.AddArg%d(%s)", v, len(args), all.String()))
+ }
+
+ if cse != nil {
+ cse[w] = v
+ }
+ return v
+}
+
// split breaks s into fields separated by spaces or tabs. Whitespace inside
// a group delimited by (), <>, [] or {} does not separate fields. Only the
// delimiter pair that opened the outermost group is tracked for nesting;
// other delimiters inside that group are treated as ordinary characters.
// Each returned field has surrounding whitespace removed. An unterminated
// group is a fatal error.
func split(s string) []string {
	var r []string

outer:
	for s != "" {
		// d is the nesting depth of the group currently being scanned.
		d := 0
		// open and closer are the delimiters of the outermost group.
		var open, closer byte
		// nonsp records whether the current field has started.
		nonsp := false
		for i := 0; i < len(s); i++ {
			c := s[i]
			switch {
			case d == 0 && (c == '(' || c == '<' || c == '[' || c == '{'):
				open = c
				switch c {
				case '(':
					closer = ')'
				case '<':
					closer = '>'
				case '[':
					closer = ']'
				case '{':
					closer = '}'
				}
				d++
				// An opening delimiter starts a field even if the group
				// turns out to contain no ordinary characters, e.g. "()".
				nonsp = true
			case d == 0 && (c == ' ' || c == '\t'):
				if nonsp {
					r = append(r, strings.TrimSpace(s[:i]))
					s = s[i:]
					continue outer
				}
			case d > 0 && c == open:
				d++
			case d > 0 && c == closer:
				d--
			default:
				nonsp = true
			}
		}
		if d != 0 {
			log.Fatalf("imbalanced expression: %q", s)
		}
		if nonsp {
			r = append(r, strings.TrimSpace(s))
		}
		break
	}
	return r
}
+
+// isBlock reports whether this op is a block opcode.
+func isBlock(name string, arch arch) bool {
+ for _, b := range genericBlocks {
+ if b.name == name {
+ return true
+ }
+ }
+ for _, b := range arch.blocks {
+ if b.name == name {
+ return true
+ }
+ }
+ return false
+}
+
+func extract(val string) (op, typ, auxint, aux string, args []string) {
+ val = val[1 : len(val)-1] // remove ()
+
+ // Split val up into regions.
+ // Split by spaces/tabs, except those contained in (), {}, [], or <>.
+ s := split(val)
+
+ // Extract restrictions and args.
+ op = s[0]
+ for _, a := range s[1:] {
+ switch a[0] {
+ case '<':
+ typ = a[1 : len(a)-1] // remove <>
+ case '[':
+ auxint = a[1 : len(a)-1] // remove []
+ case '{':
+ aux = a[1 : len(a)-1] // remove {}
+ default:
+ args = append(args, a)
+ }
+ }
+ return
+}
+
+// parseValue parses a parenthesized value from a rule.
+// The value can be from the match or the result side.
+// It returns the op and unparsed strings for typ, auxint, and aux restrictions and for all args.
+// oparch is the architecture that op is located in, or "" for generic.
+func parseValue(val string, arch arch, loc string) (op opData, oparch, typ, auxint, aux string, args []string) {
+ // Resolve the op.
+ var s string
+ s, typ, auxint, aux, args = extract(val)
+
+ // match reports whether x is a good op to select.
+ // If strict is true, rule generation might succeed.
+ // If strict is false, rule generation has failed,
+ // but we're trying to generate a useful error.
+ // Doing strict=true then strict=false allows
+ // precise op matching while retaining good error messages.
+ match := func(x opData, strict bool, archname string) bool {
+ if x.name != s {
+ return false
+ }
+ if x.argLength != -1 && int(x.argLength) != len(args) && (len(args) != 1 || args[0] != "...") {
+ if strict {
+ return false
+ }
+ log.Printf("%s: op %s (%s) should have %d args, has %d", loc, s, archname, x.argLength, len(args))
+ }
+ return true
+ }
+
+ for _, x := range genericOps {
+ if match(x, true, "generic") {
+ op = x
+ break
+ }
+ }
+ for _, x := range arch.ops {
+ if arch.name != "generic" && match(x, true, arch.name) {
+ if op.name != "" {
+ log.Fatalf("%s: matches for op %s found in both generic and %s", loc, op.name, arch.name)
+ }
+ op = x
+ oparch = arch.name
+ break
+ }
+ }
+
+ if op.name == "" {
+ // Failed to find the op.
+ // Run through everything again with strict=false
+ // to generate useful diagnosic messages before failing.
+ for _, x := range genericOps {
+ match(x, false, "generic")
+ }
+ for _, x := range arch.ops {
+ match(x, false, arch.name)
+ }
+ log.Fatalf("%s: unknown op %s", loc, s)
+ }
+
+ // Sanity check aux, auxint.
+ if auxint != "" && !opHasAuxInt(op) {
+ log.Fatalf("%s: op %s %s can't have auxint", loc, op.name, op.aux)
+ }
+ if aux != "" && !opHasAux(op) {
+ log.Fatalf("%s: op %s %s can't have aux", loc, op.name, op.aux)
+ }
+ return
+}
+
+func opHasAuxInt(op opData) bool {
+ switch op.aux {
+ case "Bool", "Int8", "Int16", "Int32", "Int64", "Int128", "UInt8", "Float32", "Float64",
+ "SymOff", "CallOff", "SymValAndOff", "TypSize", "ARM64BitField", "FlagConstant", "CCop":
+ return true
+ }
+ return false
+}
+
+func opHasAux(op opData) bool {
+ switch op.aux {
+ case "String", "Sym", "SymOff", "Call", "CallOff", "SymValAndOff", "Typ", "TypSize",
+ "S390XCCMask", "S390XRotateParams":
+ return true
+ }
+ return false
+}
+
// splitNameExpr separates an optional "name:" prefix from an s-expression.
// "x:(Foo)" yields "x", "(Foo)"; "(Foo)" yields "", "(Foo)".
// A colon that only occurs after the first open paren, as in
// "(Foo x:(Bar))", belongs to a nested expression and is left alone.
// A colon in an argument without any open paren is a fatal error.
func splitNameExpr(arg string) (name, expr string) {
	colon := strings.IndexByte(arg, ':')
	if colon == -1 {
		return "", arg
	}
	paren := strings.IndexByte(arg, '(')
	switch {
	case paren == -1:
		log.Fatalf("splitNameExpr(%q): colon but no open parens", arg)
	case paren < colon:
		// colon is inside the parens
		return "", arg
	}
	return arg[:colon], arg[colon+1:]
}
+
+func getBlockInfo(op string, arch arch) (name string, data blockData) {
+ for _, b := range genericBlocks {
+ if b.name == op {
+ return "Block" + op, b
+ }
+ }
+ for _, b := range arch.blocks {
+ if b.name == op {
+ return "Block" + arch.name + op, b
+ }
+ }
+ log.Fatalf("could not find block data for %s", op)
+ panic("unreachable")
+}
+
// typeName returns the Go expression to use in generated code for the type
// named typ. A parenthesized pair such as "(UInt32,Flags)" becomes a
// types.NewTuple call on its two components; Flags, Mem, Void and Int128
// map to the corresponding types.TypeXxx value; any other name is taken to
// be a field of typ. Whitespace around typ and around tuple components is
// ignored, so "(UInt32, Flags)" is treated like "(UInt32,Flags)".
func typeName(typ string) string {
	typ = strings.TrimSpace(typ)
	if typ[0] == '(' {
		ts := strings.Split(typ[1:len(typ)-1], ",")
		if len(ts) != 2 {
			log.Fatalf("Tuple expect 2 arguments")
		}
		// Each component is trimmed by the recursive call.
		return "types.NewTuple(" + typeName(ts[0]) + ", " + typeName(ts[1]) + ")"
	}
	switch typ {
	case "Flags", "Mem", "Void", "Int128":
		return "types.Type" + typ
	default:
		return "typ." + typ
	}
}
+
// balance returns how many '(' characters in s are still waiting for their
// ')'. As soon as a ')' is seen with no '(' open, balance gives up and
// returns -1, so that a string like ")(" is not mistaken for a balanced one.
func balance(s string) int {
	depth := 0
	for i := 0; i < len(s); i++ {
		if s[i] == '(' {
			depth++
			continue
		}
		if s[i] == ')' {
			if depth == 0 {
				return -1
			}
			depth--
		}
	}
	return depth
}
+
// findAllOpcode locates every parenthesized |-alternation of word
// characters in a string, such as the "(B|W|L)" in "MOV(B|W|L)load".
// Called as findAllOpcode(s, -1) it returns one [start, end) index pair
// per occurrence; a parenthesized group without any '|' does not match.
var findAllOpcode = regexp.MustCompile(`[(](\w+[|])+\w+[)]`).FindAllStringIndex
+
// excludeFromExpansion reports whether the candidate s[idx[0]:idx[1]] must
// be left alone by | expansion. Two purely syntactic tests are applied:
// the candidate sits inside an AuxInt expression when the last '[' before it
// comes after the last ']' before it, and it sits inside the && conditions
// when "&&" occurs before it and "=>" occurs after it.
func excludeFromExpansion(s string, idx []int) bool {
	before := s[:idx[0]]
	if lastOpen, lastClose := strings.LastIndexByte(before, '['), strings.LastIndexByte(before, ']'); lastOpen > lastClose {
		// Inside an AuxInt expression.
		return true
	}
	after := s[idx[1]:]
	// Inside && conditions.
	return strings.Contains(before, "&&") && strings.Contains(after, "=>")
}
+
+// expandOr converts a rule into multiple rules by expanding | ops.
+func expandOr(r string) []string {
+ // Find every occurrence of |-separated things.
+ // They look like MOV(B|W|L|Q|SS|SD)load or MOV(Q|L)loadidx(1|8).
+ // Generate rules selecting one case from each |-form.
+
+ // Count width of |-forms. They must match.
+ n := 1
+ for _, idx := range findAllOpcode(r, -1) {
+ if excludeFromExpansion(r, idx) {
+ continue
+ }
+ s := r[idx[0]:idx[1]]
+ c := strings.Count(s, "|") + 1
+ if c == 1 {
+ continue
+ }
+ if n > 1 && n != c {
+ log.Fatalf("'|' count doesn't match in %s: both %d and %d\n", r, n, c)
+ }
+ n = c
+ }
+ if n == 1 {
+ // No |-form in this rule.
+ return []string{r}
+ }
+ // Build each new rule.
+ res := make([]string, n)
+ for i := 0; i < n; i++ {
+ buf := new(strings.Builder)
+ x := 0
+ for _, idx := range findAllOpcode(r, -1) {
+ if excludeFromExpansion(r, idx) {
+ continue
+ }
+ buf.WriteString(r[x:idx[0]]) // write bytes we've skipped over so far
+ s := r[idx[0]+1 : idx[1]-1] // remove leading "(" and trailing ")"
+ buf.WriteString(strings.Split(s, "|")[i]) // write the op component for this rule
+ x = idx[1] // note that we've written more bytes
+ }
+ buf.WriteString(r[x:])
+ res[i] = buf.String()
+ }
+ return res
+}
+
+// varCount returns a map which counts the number of occurrences of
+// Value variables in the s-expression rr.Match and the Go expression rr.Cond.
+func varCount(rr *RuleRewrite) map[string]int {
+ cnt := map[string]int{}
+ varCount1(rr.Loc, rr.Match, cnt)
+ if rr.Cond != "" {
+ expr, err := parser.ParseExpr(rr.Cond)
+ if err != nil {
+ log.Fatalf("%s: failed to parse cond %q: %v", rr.Loc, rr.Cond, err)
+ }
+ ast.Inspect(expr, func(n ast.Node) bool {
+ if id, ok := n.(*ast.Ident); ok {
+ cnt[id.Name]++
+ }
+ return true
+ })
+ }
+ return cnt
+}
+
+func varCount1(loc, m string, cnt map[string]int) {
+ if m[0] == '<' || m[0] == '[' || m[0] == '{' {
+ return
+ }
+ if token.IsIdentifier(m) {
+ cnt[m]++
+ return
+ }
+ // Split up input.
+ name, expr := splitNameExpr(m)
+ if name != "" {
+ cnt[name]++
+ }
+ if expr[0] != '(' || expr[len(expr)-1] != ')' {
+ log.Fatalf("%s: non-compound expr in varCount1: %q", loc, expr)
+ }
+ s := split(expr[1 : len(expr)-1])
+ for _, arg := range s[1:] {
+ varCount1(loc, arg, cnt)
+ }
+}
+
// normalizeWhitespace puts x into a canonical spacing: leading and trailing
// whitespace is dropped and every inner run of whitespace becomes a single
// space, the space directly inside "(", ")", "[" and "]" is removed, and
// ")=>" is spelled ") =>".
func normalizeWhitespace(x string) string {
	x = strings.Join(strings.Fields(x), " ")
	// The replacements are applied one after another, in this order.
	fixups := [...][2]string{
		{"( ", "("},
		{" )", ")"},
		{"[ ", "["},
		{" ]", "]"},
		{")=>", ") =>"},
	}
	for _, fix := range fixups {
		x = strings.ReplaceAll(x, fix[0], fix[1])
	}
	return x
}
+
+// opIsCommutative reports whether op s is commutative.
+func opIsCommutative(op string, arch arch) bool {
+ for _, x := range genericOps {
+ if op == x.name {
+ if x.commutative {
+ return true
+ }
+ break
+ }
+ }
+ if arch.name != "generic" {
+ for _, x := range arch.ops {
+ if op == x.name {
+ if x.commutative {
+ return true
+ }
+ break
+ }
+ }
+ }
+ return false
+}
+
+func normalizeMatch(m string, arch arch) string {
+ if token.IsIdentifier(m) {
+ return m
+ }
+ op, typ, auxint, aux, args := extract(m)
+ if opIsCommutative(op, arch) {
+ if args[1] < args[0] {
+ args[0], args[1] = args[1], args[0]
+ }
+ }
+ s := new(strings.Builder)
+ fmt.Fprintf(s, "%s <%s> [%s] {%s}", op, typ, auxint, aux)
+ for _, arg := range args {
+ prefix, expr := splitNameExpr(arg)
+ fmt.Fprint(s, " ", prefix, normalizeMatch(expr, arch))
+ }
+ return s.String()
+}
+
+func parseEllipsisRules(rules []Rule, arch arch) (newop string, ok bool) {
+ if len(rules) != 1 {
+ for _, r := range rules {
+ if strings.Contains(r.Rule, "...") {
+ log.Fatalf("%s: found ellipsis in rule, but there are other rules with the same op", r.Loc)
+ }
+ }
+ return "", false
+ }
+ rule := rules[0]
+ match, cond, result := rule.parse()
+ if cond != "" || !isEllipsisValue(match) || !isEllipsisValue(result) {
+ if strings.Contains(rule.Rule, "...") {
+ log.Fatalf("%s: found ellipsis in non-ellipsis rule", rule.Loc)
+ }
+ checkEllipsisRuleCandidate(rule, arch)
+ return "", false
+ }
+ op, oparch, _, _, _, _ := parseValue(result, arch, rule.Loc)
+ return fmt.Sprintf("Op%s%s", oparch, op.name), true
+}
+
+// isEllipsisValue reports whether s is of the form (OpX ...).
+func isEllipsisValue(s string) bool {
+ if len(s) < 2 || s[0] != '(' || s[len(s)-1] != ')' {
+ return false
+ }
+ c := split(s[1 : len(s)-1])
+ if len(c) != 2 || c[1] != "..." {
+ return false
+ }
+ return true
+}
+
+func checkEllipsisRuleCandidate(rule Rule, arch arch) {
+ match, cond, result := rule.parse()
+ if cond != "" {
+ return
+ }
+ op, _, _, auxint, aux, args := parseValue(match, arch, rule.Loc)
+ var auxint2, aux2 string
+ var args2 []string
+ var usingCopy string
+ var eop opData
+ if result[0] != '(' {
+ // Check for (Foo x) => x, which can be converted to (Foo ...) => (Copy ...).
+ args2 = []string{result}
+ usingCopy = " using Copy"
+ } else {
+ eop, _, _, auxint2, aux2, args2 = parseValue(result, arch, rule.Loc)
+ }
+ // Check that all restrictions in match are reproduced exactly in result.
+ if aux != aux2 || auxint != auxint2 || len(args) != len(args2) {
+ return
+ }
+ if strings.Contains(rule.Rule, "=>") && op.aux != eop.aux {
+ return
+ }
+ for i := range args {
+ if args[i] != args2[i] {
+ return
+ }
+ }
+ switch {
+ case opHasAux(op) && aux == "" && aux2 == "":
+ fmt.Printf("%s: rule silently zeros aux, either copy aux or explicitly zero\n", rule.Loc)
+ case opHasAuxInt(op) && auxint == "" && auxint2 == "":
+ fmt.Printf("%s: rule silently zeros auxint, either copy auxint or explicitly zero\n", rule.Loc)
+ default:
+ fmt.Printf("%s: possible ellipsis rule candidate%s: %q\n", rule.Loc, usingCopy, rule.Rule)
+ }
+}
+
+func opByName(arch arch, name string) opData {
+ name = name[2:]
+ for _, x := range genericOps {
+ if name == x.name {
+ return x
+ }
+ }
+ if arch.name != "generic" {
+ name = name[len(arch.name):]
+ for _, x := range arch.ops {
+ if name == x.name {
+ return x
+ }
+ }
+ }
+ log.Fatalf("failed to find op named %s in arch %s", name, arch.name)
+ panic("unreachable")
+}
+
+// auxType returns the Go type that this operation should store in its aux field.
+func (op opData) auxType() string {
+ switch op.aux {
+ case "String":
+ return "string"
+ case "Sym":
+ // Note: a Sym can be an *obj.LSym, a *gc.Node, or nil.
+ return "Sym"
+ case "SymOff":
+ return "Sym"
+ case "Call":
+ return "Call"
+ case "CallOff":
+ return "Call"
+ case "SymValAndOff":
+ return "Sym"
+ case "Typ":
+ return "*types.Type"
+ case "TypSize":
+ return "*types.Type"
+ case "S390XCCMask":
+ return "s390x.CCMask"
+ case "S390XRotateParams":
+ return "s390x.RotateParams"
+ default:
+ return "invalid"
+ }
+}
+
+// auxIntType returns the Go type that this operation should store in its auxInt field.
+func (op opData) auxIntType() string {
+ switch op.aux {
+ case "Bool":
+ return "bool"
+ case "Int8":
+ return "int8"
+ case "Int16":
+ return "int16"
+ case "Int32":
+ return "int32"
+ case "Int64":
+ return "int64"
+ case "Int128":
+ return "int128"
+ case "UInt8":
+ return "uint8"
+ case "Float32":
+ return "float32"
+ case "Float64":
+ return "float64"
+ case "CallOff":
+ return "int32"
+ case "SymOff":
+ return "int32"
+ case "SymValAndOff":
+ return "ValAndOff"
+ case "TypSize":
+ return "int64"
+ case "CCop":
+ return "Op"
+ case "FlagConstant":
+ return "flagConstant"
+ case "ARM64BitField":
+ return "arm64BitField"
+ default:
+ return "invalid"
+ }
+}
+
+// auxType returns the Go type that this block should store in its aux field.
+func (b blockData) auxType() string {
+ switch b.aux {
+ case "Sym":
+ return "Sym"
+ case "S390XCCMask", "S390XCCMaskInt8", "S390XCCMaskUint8":
+ return "s390x.CCMask"
+ case "S390XRotateParams":
+ return "s390x.RotateParams"
+ default:
+ return "invalid"
+ }
+}
+
+// auxIntType returns the Go type that this block should store in its auxInt field.
+func (b blockData) auxIntType() string {
+ switch b.aux {
+ case "S390XCCMaskInt8":
+ return "int8"
+ case "S390XCCMaskUint8":
+ return "uint8"
+ case "Int64":
+ return "int64"
+ default:
+ return "invalid"
+ }
+}
+
// title turns a possibly package-qualified type name into a capitalized
// identifier fragment. Everything up to and including the first "." is
// dropped, except that an s390x qualifier (in any letter case) is kept,
// minus the dot, for clarity. The remainder is passed through strings.Title.
func title(s string) string {
	if dot := strings.IndexByte(s, '.'); dot >= 0 {
		qualifier, rest := s[:dot], s[dot+1:]
		if strings.ToLower(qualifier) == "s390x" {
			// keep arch prefix for clarity
			s = qualifier + rest
		} else {
			s = rest
		}
	}
	return strings.Title(s)
}
+
// unTitle turns a possibly package-qualified type name into an identifier
// fragment that starts with a lower-case letter. Everything up to and
// including the first "." is dropped, except that an s390x qualifier (in any
// letter case) is kept, minus the dot, for clarity. The first byte of the
// remainder is then lower-cased.
func unTitle(s string) string {
	if dot := strings.IndexByte(s, '.'); dot >= 0 {
		qualifier, rest := s[:dot], s[dot+1:]
		if strings.ToLower(qualifier) == "s390x" {
			// keep arch prefix for clarity
			s = qualifier + rest
		} else {
			s = rest
		}
	}
	head, tail := s[:1], s[1:]
	return strings.ToLower(head) + tail
}