path: root/src/cmd/compile/internal/ssa/gen
author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-28 13:14:23 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-28 13:14:23 +0000
commit    73df946d56c74384511a194dd01dbe099584fd1a (patch)
tree      fd0bcea490dd81327ddfbb31e215439672c9a068  /src/cmd/compile/internal/ssa/gen
parent    Initial commit. (diff)
Adding upstream version 1.16.10.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/cmd/compile/internal/ssa/gen')
-rw-r--r--  src/cmd/compile/internal/ssa/gen/386.rules            1111
-rw-r--r--  src/cmd/compile/internal/ssa/gen/386Ops.go             585
-rw-r--r--  src/cmd/compile/internal/ssa/gen/386splitload.rules     11
-rw-r--r--  src/cmd/compile/internal/ssa/gen/AMD64.rules          2216
-rw-r--r--  src/cmd/compile/internal/ssa/gen/AMD64Ops.go           946
-rw-r--r--  src/cmd/compile/internal/ssa/gen/AMD64splitload.rules   45
-rw-r--r--  src/cmd/compile/internal/ssa/gen/ARM.rules            1475
-rw-r--r--  src/cmd/compile/internal/ssa/gen/ARM64.rules          2789
-rw-r--r--  src/cmd/compile/internal/ssa/gen/ARM64Ops.go           762
-rw-r--r--  src/cmd/compile/internal/ssa/gen/ARMOps.go             600
-rw-r--r--  src/cmd/compile/internal/ssa/gen/MIPS.rules            697
-rw-r--r--  src/cmd/compile/internal/ssa/gen/MIPS64.rules          678
-rw-r--r--  src/cmd/compile/internal/ssa/gen/MIPS64Ops.go          482
-rw-r--r--  src/cmd/compile/internal/ssa/gen/MIPSOps.go            439
-rw-r--r--  src/cmd/compile/internal/ssa/gen/PPC64.rules          1461
-rw-r--r--  src/cmd/compile/internal/ssa/gen/PPC64Ops.go           717
-rw-r--r--  src/cmd/compile/internal/ssa/gen/README                  7
-rw-r--r--  src/cmd/compile/internal/ssa/gen/RISCV64.rules         737
-rw-r--r--  src/cmd/compile/internal/ssa/gen/RISCV64Ops.go         464
-rw-r--r--  src/cmd/compile/internal/ssa/gen/S390X.rules          1695
-rw-r--r--  src/cmd/compile/internal/ssa/gen/S390XOps.go           816
-rw-r--r--  src/cmd/compile/internal/ssa/gen/Wasm.rules            408
-rw-r--r--  src/cmd/compile/internal/ssa/gen/WasmOps.go            278
-rwxr-xr-x  src/cmd/compile/internal/ssa/gen/cover.bash             26
-rw-r--r--  src/cmd/compile/internal/ssa/gen/dec.rules              92
-rw-r--r--  src/cmd/compile/internal/ssa/gen/dec64.rules           396
-rw-r--r--  src/cmd/compile/internal/ssa/gen/dec64Ops.go            20
-rw-r--r--  src/cmd/compile/internal/ssa/gen/decArgs.rules          58
-rw-r--r--  src/cmd/compile/internal/ssa/gen/decArgsOps.go          20
-rw-r--r--  src/cmd/compile/internal/ssa/gen/decOps.go              20
-rw-r--r--  src/cmd/compile/internal/ssa/gen/generic.rules        2535
-rw-r--r--  src/cmd/compile/internal/ssa/gen/genericOps.go         620
-rw-r--r--  src/cmd/compile/internal/ssa/gen/main.go               541
-rw-r--r--  src/cmd/compile/internal/ssa/gen/rulegen.go           1856
34 files changed, 25603 insertions, 0 deletions
diff --git a/src/cmd/compile/internal/ssa/gen/386.rules b/src/cmd/compile/internal/ssa/gen/386.rules
new file mode 100644
index 0000000..fbc12fd
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/386.rules
@@ -0,0 +1,1111 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Lowering arithmetic
+(Add(Ptr|32|16|8) ...) => (ADDL ...)
+(Add(32|64)F ...) => (ADDS(S|D) ...)
+(Add32carry ...) => (ADDLcarry ...)
+(Add32withcarry ...) => (ADCL ...)
+
+(Sub(Ptr|32|16|8) ...) => (SUBL ...)
+(Sub(32|64)F ...) => (SUBS(S|D) ...)
+(Sub32carry ...) => (SUBLcarry ...)
+(Sub32withcarry ...) => (SBBL ...)
+
+(Mul(32|16|8) ...) => (MULL ...)
+(Mul(32|64)F ...) => (MULS(S|D) ...)
+(Mul32uhilo ...) => (MULLQU ...)
+
+(Select0 (Mul32uover x y)) => (Select0 <typ.UInt32> (MULLU x y))
+(Select1 (Mul32uover x y)) => (SETO (Select1 <types.TypeFlags> (MULLU x y)))
+
+(Avg32u ...) => (AVGLU ...)
+
+(Div(32|64)F ...) => (DIVS(S|D) ...)
+(Div(32|32u|16|16u) ...) => (DIV(L|LU|W|WU) ...)
+(Div8 x y) => (DIVW (SignExt8to16 x) (SignExt8to16 y))
+(Div8u x y) => (DIVWU (ZeroExt8to16 x) (ZeroExt8to16 y))
+
+(Hmul(32|32u) ...) => (HMUL(L|LU) ...)
+
+(Mod(32|32u|16|16u) ...) => (MOD(L|LU|W|WU) ...)
+(Mod8 x y) => (MODW (SignExt8to16 x) (SignExt8to16 y))
+(Mod8u x y) => (MODWU (ZeroExt8to16 x) (ZeroExt8to16 y))
+
+(And(32|16|8) ...) => (ANDL ...)
+(Or(32|16|8) ...) => (ORL ...)
+(Xor(32|16|8) ...) => (XORL ...)
+
+(Neg(32|16|8) ...) => (NEGL ...)
+(Neg32F x) => (PXOR x (MOVSSconst <typ.Float32> [float32(math.Copysign(0, -1))]))
+(Neg64F x) => (PXOR x (MOVSDconst <typ.Float64> [math.Copysign(0, -1)]))
+
+(Com(32|16|8) ...) => (NOTL ...)
+
+// Lowering boolean ops
+(AndB ...) => (ANDL ...)
+(OrB ...) => (ORL ...)
+(Not x) => (XORLconst [1] x)
+
+// Lowering pointer arithmetic
+(OffPtr [off] ptr) => (ADDLconst [int32(off)] ptr)
+
+(Bswap32 ...) => (BSWAPL ...)
+
+(Sqrt ...) => (SQRTSD ...)
+
+(Ctz16 x) => (BSFL (ORLconst <typ.UInt32> [0x10000] x))
+(Ctz16NonZero ...) => (BSFL ...)
+
+// Lowering extension
+(SignExt8to16 ...) => (MOVBLSX ...)
+(SignExt8to32 ...) => (MOVBLSX ...)
+(SignExt16to32 ...) => (MOVWLSX ...)
+
+(ZeroExt8to16 ...) => (MOVBLZX ...)
+(ZeroExt8to32 ...) => (MOVBLZX ...)
+(ZeroExt16to32 ...) => (MOVWLZX ...)
+
+(Signmask x) => (SARLconst x [31])
+(Zeromask <t> x) => (XORLconst [-1] (SBBLcarrymask <t> (CMPLconst x [1])))
+(Slicemask <t> x) => (SARLconst (NEGL <t> x) [31])
+
+// Lowering truncation
+// Because we ignore high parts of registers, truncates are just copies.
+(Trunc16to8 ...) => (Copy ...)
+(Trunc32to8 ...) => (Copy ...)
+(Trunc32to16 ...) => (Copy ...)
+
+// Lowering float-int conversions
+(Cvt32to32F ...) => (CVTSL2SS ...)
+(Cvt32to64F ...) => (CVTSL2SD ...)
+
+(Cvt32Fto32 ...) => (CVTTSS2SL ...)
+(Cvt64Fto32 ...) => (CVTTSD2SL ...)
+
+(Cvt32Fto64F ...) => (CVTSS2SD ...)
+(Cvt64Fto32F ...) => (CVTSD2SS ...)
+
+(Round32F ...) => (Copy ...)
+(Round64F ...) => (Copy ...)
+
+(CvtBoolToUint8 ...) => (Copy ...)
+
+// Lowering shifts
+// Unsigned shifts need to return 0 if shift amount is >= width of shifted value.
+// result = (arg << shift) & (shift >= argbits ? 0 : 0xffffffffffffffff)
+(Lsh32x(32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(L|W|B)const y [32])))
+(Lsh16x(32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(L|W|B)const y [32])))
+(Lsh8x(32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(L|W|B)const y [32])))
+
+(Lsh32x(32|16|8) <t> x y) && shiftIsBounded(v) => (SHLL <t> x y)
+(Lsh16x(32|16|8) <t> x y) && shiftIsBounded(v) => (SHLL <t> x y)
+(Lsh8x(32|16|8) <t> x y) && shiftIsBounded(v) => (SHLL <t> x y)
+
+(Rsh32Ux(32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRL <t> x y) (SBBLcarrymask <t> (CMP(L|W|B)const y [32])))
+(Rsh16Ux(32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRW <t> x y) (SBBLcarrymask <t> (CMP(L|W|B)const y [16])))
+(Rsh8Ux(32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRB <t> x y) (SBBLcarrymask <t> (CMP(L|W|B)const y [8])))
+
+(Rsh32Ux(32|16|8) <t> x y) && shiftIsBounded(v) => (SHRL <t> x y)
+(Rsh16Ux(32|16|8) <t> x y) && shiftIsBounded(v) => (SHRW <t> x y)
+(Rsh8Ux(32|16|8) <t> x y) && shiftIsBounded(v) => (SHRB <t> x y)
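A minimal Go sketch of what the unbounded rules above compute (illustrative only; the real lowering operates on SSA values, not Go integers):

func shiftLeft32(x, y uint32) uint32 {
	var mask uint32
	if y < 32 {
		mask = ^uint32(0) // SBBLcarrymask: all ones while the shift amount is in range
	}
	return (x << (y & 31)) & mask // SHLL itself only uses the low 5 bits of y
}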
+
+// Signed right shift needs to return 0/-1 if shift amount is >= width of shifted value.
+// We implement this by setting the shift value to -1 (all ones) if the shift value is >= width.
+
+(Rsh32x(32|16|8) <t> x y) && !shiftIsBounded(v) => (SARL <t> x (ORL <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMP(L|W|B)const y [32])))))
+(Rsh16x(32|16|8) <t> x y) && !shiftIsBounded(v) => (SARW <t> x (ORL <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMP(L|W|B)const y [16])))))
+(Rsh8x(32|16|8) <t> x y) && !shiftIsBounded(v) => (SARB <t> x (ORL <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMP(L|W|B)const y [8])))))
+
+(Rsh32x(32|16|8) <t> x y) && shiftIsBounded(v) => (SARL x y)
+(Rsh16x(32|16|8) <t> x y) && shiftIsBounded(v) => (SARW x y)
+(Rsh8x(32|16|8) <t> x y) && shiftIsBounded(v) => (SARB x y)
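The signed case, sketched the same way; ORing with the inverted carry mask forces an out-of-range amount to all ones, which the hardware shift then treats as 31:

func shiftRightSigned32(x int32, y uint32) int32 {
	if y >= 32 {
		y = 31 // ORL y (NOTL (SBBLcarrymask ...)) sets every bit; SARL then sees 31
	}
	return x >> (y & 31)
}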
+
+// constant shifts
+// generic opt rewrites all constant shifts to shift by Const64
+(Lsh32x64 x (Const64 [c])) && uint64(c) < 32 => (SHLLconst x [int32(c)])
+(Rsh32x64 x (Const64 [c])) && uint64(c) < 32 => (SARLconst x [int32(c)])
+(Rsh32Ux64 x (Const64 [c])) && uint64(c) < 32 => (SHRLconst x [int32(c)])
+(Lsh16x64 x (Const64 [c])) && uint64(c) < 16 => (SHLLconst x [int32(c)])
+(Rsh16x64 x (Const64 [c])) && uint64(c) < 16 => (SARWconst x [int16(c)])
+(Rsh16Ux64 x (Const64 [c])) && uint64(c) < 16 => (SHRWconst x [int16(c)])
+(Lsh8x64 x (Const64 [c])) && uint64(c) < 8 => (SHLLconst x [int32(c)])
+(Rsh8x64 x (Const64 [c])) && uint64(c) < 8 => (SARBconst x [int8(c)])
+(Rsh8Ux64 x (Const64 [c])) && uint64(c) < 8 => (SHRBconst x [int8(c)])
+
+// large constant shifts
+(Lsh32x64 _ (Const64 [c])) && uint64(c) >= 32 => (Const32 [0])
+(Rsh32Ux64 _ (Const64 [c])) && uint64(c) >= 32 => (Const32 [0])
+(Lsh16x64 _ (Const64 [c])) && uint64(c) >= 16 => (Const16 [0])
+(Rsh16Ux64 _ (Const64 [c])) && uint64(c) >= 16 => (Const16 [0])
+(Lsh8x64 _ (Const64 [c])) && uint64(c) >= 8 => (Const8 [0])
+(Rsh8Ux64 _ (Const64 [c])) && uint64(c) >= 8 => (Const8 [0])
+
+// large constant signed right shift, we leave the sign bit
+(Rsh32x64 x (Const64 [c])) && uint64(c) >= 32 => (SARLconst x [31])
+(Rsh16x64 x (Const64 [c])) && uint64(c) >= 16 => (SARWconst x [15])
+(Rsh8x64 x (Const64 [c])) && uint64(c) >= 8 => (SARBconst x [7])
+
+// constant rotates
+(RotateLeft32 x (MOVLconst [c])) => (ROLLconst [c&31] x)
+(RotateLeft16 x (MOVLconst [c])) => (ROLWconst [int16(c&15)] x)
+(RotateLeft8 x (MOVLconst [c])) => (ROLBconst [int8(c&7)] x)
+
+// Lowering comparisons
+(Less32 x y) => (SETL (CMPL x y))
+(Less16 x y) => (SETL (CMPW x y))
+(Less8 x y) => (SETL (CMPB x y))
+(Less32U x y) => (SETB (CMPL x y))
+(Less16U x y) => (SETB (CMPW x y))
+(Less8U x y) => (SETB (CMPB x y))
+// Use SETGF with reversed operands to dodge NaN case
+(Less64F x y) => (SETGF (UCOMISD y x))
+(Less32F x y) => (SETGF (UCOMISS y x))
+
+(Leq32 x y) => (SETLE (CMPL x y))
+(Leq16 x y) => (SETLE (CMPW x y))
+(Leq8 x y) => (SETLE (CMPB x y))
+(Leq32U x y) => (SETBE (CMPL x y))
+(Leq16U x y) => (SETBE (CMPW x y))
+(Leq8U x y) => (SETBE (CMPB x y))
+// Use SETGEF with reversed operands to dodge NaN case
+(Leq64F x y) => (SETGEF (UCOMISD y x))
+(Leq32F x y) => (SETGEF (UCOMISS y x))
+
+(Eq32 x y) => (SETEQ (CMPL x y))
+(Eq16 x y) => (SETEQ (CMPW x y))
+(Eq8 x y) => (SETEQ (CMPB x y))
+(EqB x y) => (SETEQ (CMPB x y))
+(EqPtr x y) => (SETEQ (CMPL x y))
+(Eq64F x y) => (SETEQF (UCOMISD x y))
+(Eq32F x y) => (SETEQF (UCOMISS x y))
+
+(Neq32 x y) => (SETNE (CMPL x y))
+(Neq16 x y) => (SETNE (CMPW x y))
+(Neq8 x y) => (SETNE (CMPB x y))
+(NeqB x y) => (SETNE (CMPB x y))
+(NeqPtr x y) => (SETNE (CMPL x y))
+(Neq64F x y) => (SETNEF (UCOMISD x y))
+(Neq32F x y) => (SETNEF (UCOMISS x y))
+
+// Lowering loads
+(Load <t> ptr mem) && (is32BitInt(t) || isPtr(t)) => (MOVLload ptr mem)
+(Load <t> ptr mem) && is16BitInt(t) => (MOVWload ptr mem)
+(Load <t> ptr mem) && (t.IsBoolean() || is8BitInt(t)) => (MOVBload ptr mem)
+(Load <t> ptr mem) && is32BitFloat(t) => (MOVSSload ptr mem)
+(Load <t> ptr mem) && is64BitFloat(t) => (MOVSDload ptr mem)
+
+// Lowering stores
+// These more-specific FP versions of Store pattern should come first.
+(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (MOVSDstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (MOVSSstore ptr val mem)
+
+(Store {t} ptr val mem) && t.Size() == 4 => (MOVLstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 2 => (MOVWstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem)
+
+// Lowering moves
+(Move [0] _ _ mem) => mem
+(Move [1] dst src mem) => (MOVBstore dst (MOVBload src mem) mem)
+(Move [2] dst src mem) => (MOVWstore dst (MOVWload src mem) mem)
+(Move [4] dst src mem) => (MOVLstore dst (MOVLload src mem) mem)
+(Move [3] dst src mem) =>
+ (MOVBstore [2] dst (MOVBload [2] src mem)
+ (MOVWstore dst (MOVWload src mem) mem))
+(Move [5] dst src mem) =>
+ (MOVBstore [4] dst (MOVBload [4] src mem)
+ (MOVLstore dst (MOVLload src mem) mem))
+(Move [6] dst src mem) =>
+ (MOVWstore [4] dst (MOVWload [4] src mem)
+ (MOVLstore dst (MOVLload src mem) mem))
+(Move [7] dst src mem) =>
+ (MOVLstore [3] dst (MOVLload [3] src mem)
+ (MOVLstore dst (MOVLload src mem) mem))
+(Move [8] dst src mem) =>
+ (MOVLstore [4] dst (MOVLload [4] src mem)
+ (MOVLstore dst (MOVLload src mem) mem))
+
+// Adjust moves to be a multiple of 4 bytes.
+(Move [s] dst src mem)
+ && s > 8 && s%4 != 0 =>
+ (Move [s-s%4]
+ (ADDLconst <dst.Type> dst [int32(s%4)])
+ (ADDLconst <src.Type> src [int32(s%4)])
+ (MOVLstore dst (MOVLload src mem) mem))
+
+// Medium copying uses a duff device.
+(Move [s] dst src mem)
+ && s > 8 && s <= 4*128 && s%4 == 0
+ && !config.noDuffDevice && logLargeCopy(v, s) =>
+ (DUFFCOPY [10*(128-s/4)] dst src mem)
+// 10 and 128 are magic constants. 10 is the number of bytes to encode:
+// MOVL (SI), CX
+// ADDL $4, SI
+// MOVL CX, (DI)
+// ADDL $4, DI
+// and 128 is the number of such blocks. See src/runtime/duff_386.s:duffcopy.
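A small helper mirroring the formula in the rule above makes the offset arithmetic concrete (illustrative only; the size is hypothetical):

// duffCopyOffset computes the DUFFCOPY auxint 10*(128-s/4) for a Move of s bytes.
// For s = 64 it returns 1120, which enters duffcopy past 112 of its 128 copy
// blocks so that exactly the last 16 blocks (64 bytes) are executed.
func duffCopyOffset(s int64) int64 { return 10 * (128 - s/4) }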
+
+// Large copying uses REP MOVSL.
+(Move [s] dst src mem) && (s > 4*128 || config.noDuffDevice) && s%4 == 0 && logLargeCopy(v, s) =>
+ (REPMOVSL dst src (MOVLconst [int32(s/4)]) mem)
+
+// Lowering Zero instructions
+(Zero [0] _ mem) => mem
+(Zero [1] destptr mem) => (MOVBstoreconst [0] destptr mem)
+(Zero [2] destptr mem) => (MOVWstoreconst [0] destptr mem)
+(Zero [4] destptr mem) => (MOVLstoreconst [0] destptr mem)
+
+(Zero [3] destptr mem) =>
+ (MOVBstoreconst [makeValAndOff32(0,2)] destptr
+ (MOVWstoreconst [makeValAndOff32(0,0)] destptr mem))
+(Zero [5] destptr mem) =>
+ (MOVBstoreconst [makeValAndOff32(0,4)] destptr
+ (MOVLstoreconst [makeValAndOff32(0,0)] destptr mem))
+(Zero [6] destptr mem) =>
+ (MOVWstoreconst [makeValAndOff32(0,4)] destptr
+ (MOVLstoreconst [makeValAndOff32(0,0)] destptr mem))
+(Zero [7] destptr mem) =>
+ (MOVLstoreconst [makeValAndOff32(0,3)] destptr
+ (MOVLstoreconst [makeValAndOff32(0,0)] destptr mem))
+
+// Strip off any fractional word zeroing.
+(Zero [s] destptr mem) && s%4 != 0 && s > 4 =>
+ (Zero [s-s%4] (ADDLconst destptr [int32(s%4)])
+ (MOVLstoreconst [0] destptr mem))
+
+// Zero small numbers of words directly.
+(Zero [8] destptr mem) =>
+ (MOVLstoreconst [makeValAndOff32(0,4)] destptr
+ (MOVLstoreconst [makeValAndOff32(0,0)] destptr mem))
+(Zero [12] destptr mem) =>
+ (MOVLstoreconst [makeValAndOff32(0,8)] destptr
+ (MOVLstoreconst [makeValAndOff32(0,4)] destptr
+ (MOVLstoreconst [makeValAndOff32(0,0)] destptr mem)))
+(Zero [16] destptr mem) =>
+ (MOVLstoreconst [makeValAndOff32(0,12)] destptr
+ (MOVLstoreconst [makeValAndOff32(0,8)] destptr
+ (MOVLstoreconst [makeValAndOff32(0,4)] destptr
+ (MOVLstoreconst [makeValAndOff32(0,0)] destptr mem))))
+
+// Medium zeroing uses a duff device.
+(Zero [s] destptr mem)
+ && s > 16 && s <= 4*128 && s%4 == 0
+ && !config.noDuffDevice =>
+ (DUFFZERO [1*(128-s/4)] destptr (MOVLconst [0]) mem)
+// 1 and 128 are magic constants. 1 is the number of bytes to encode STOSL.
+// 128 is the number of STOSL instructions in duffzero.
+// See src/runtime/duff_386.s:duffzero.
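The same arithmetic for zeroing, as a sketch (one byte of encoding per STOSL, per the comment above):

// duffZeroOffset computes the DUFFZERO auxint 1*(128-s/4) for zeroing s bytes.
// For s = 100 it returns 103: entering 103 bytes into duffzero skips 103 STOSLs
// and runs the final 25, clearing 100 bytes.
func duffZeroOffset(s int64) int64 { return 128 - s/4 }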
+
+// Large zeroing uses REP STOSL.
+(Zero [s] destptr mem)
+ && (s > 4*128 || (config.noDuffDevice && s > 16))
+ && s%4 == 0 =>
+ (REPSTOSL destptr (MOVLconst [int32(s/4)]) (MOVLconst [0]) mem)
+
+
+// Lowering constants
+(Const8 [c]) => (MOVLconst [int32(c)])
+(Const16 [c]) => (MOVLconst [int32(c)])
+(Const32 ...) => (MOVLconst ...)
+(Const(32|64)F ...) => (MOVS(S|D)const ...)
+(ConstNil) => (MOVLconst [0])
+(ConstBool [c]) => (MOVLconst [b2i32(c)])
+
+// Lowering calls
+(StaticCall ...) => (CALLstatic ...)
+(ClosureCall ...) => (CALLclosure ...)
+(InterCall ...) => (CALLinter ...)
+
+// Miscellaneous
+(IsNonNil p) => (SETNE (TESTL p p))
+(IsInBounds idx len) => (SETB (CMPL idx len))
+(IsSliceInBounds idx len) => (SETBE (CMPL idx len))
+(NilCheck ...) => (LoweredNilCheck ...)
+(GetG ...) => (LoweredGetG ...)
+(GetClosurePtr ...) => (LoweredGetClosurePtr ...)
+(GetCallerPC ...) => (LoweredGetCallerPC ...)
+(GetCallerSP ...) => (LoweredGetCallerSP ...)
+(Addr {sym} base) => (LEAL {sym} base)
+(LocalAddr {sym} base _) => (LEAL {sym} base)
+
+// block rewrites
+(If (SETL cmp) yes no) => (LT cmp yes no)
+(If (SETLE cmp) yes no) => (LE cmp yes no)
+(If (SETG cmp) yes no) => (GT cmp yes no)
+(If (SETGE cmp) yes no) => (GE cmp yes no)
+(If (SETEQ cmp) yes no) => (EQ cmp yes no)
+(If (SETNE cmp) yes no) => (NE cmp yes no)
+(If (SETB cmp) yes no) => (ULT cmp yes no)
+(If (SETBE cmp) yes no) => (ULE cmp yes no)
+(If (SETA cmp) yes no) => (UGT cmp yes no)
+(If (SETAE cmp) yes no) => (UGE cmp yes no)
+(If (SETO cmp) yes no) => (OS cmp yes no)
+
+// Special case for floating point - LF/LEF not generated
+(If (SETGF cmp) yes no) => (UGT cmp yes no)
+(If (SETGEF cmp) yes no) => (UGE cmp yes no)
+(If (SETEQF cmp) yes no) => (EQF cmp yes no)
+(If (SETNEF cmp) yes no) => (NEF cmp yes no)
+
+(If cond yes no) => (NE (TESTB cond cond) yes no)
+
+// Write barrier.
+(WB ...) => (LoweredWB ...)
+
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem)
+
+(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 0 => (LoweredPanicExtendA [kind] hi lo y mem)
+(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 1 => (LoweredPanicExtendB [kind] hi lo y mem)
+(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 2 => (LoweredPanicExtendC [kind] hi lo y mem)
+
+// ***************************
+// Above: lowering rules
+// Below: optimizations
+// ***************************
+// TODO: Should the optimizations be a separate pass?
+
+// Fold boolean tests into blocks
+(NE (TESTB (SETL cmp) (SETL cmp)) yes no) => (LT cmp yes no)
+(NE (TESTB (SETLE cmp) (SETLE cmp)) yes no) => (LE cmp yes no)
+(NE (TESTB (SETG cmp) (SETG cmp)) yes no) => (GT cmp yes no)
+(NE (TESTB (SETGE cmp) (SETGE cmp)) yes no) => (GE cmp yes no)
+(NE (TESTB (SETEQ cmp) (SETEQ cmp)) yes no) => (EQ cmp yes no)
+(NE (TESTB (SETNE cmp) (SETNE cmp)) yes no) => (NE cmp yes no)
+(NE (TESTB (SETB cmp) (SETB cmp)) yes no) => (ULT cmp yes no)
+(NE (TESTB (SETBE cmp) (SETBE cmp)) yes no) => (ULE cmp yes no)
+(NE (TESTB (SETA cmp) (SETA cmp)) yes no) => (UGT cmp yes no)
+(NE (TESTB (SETAE cmp) (SETAE cmp)) yes no) => (UGE cmp yes no)
+(NE (TESTB (SETO cmp) (SETO cmp)) yes no) => (OS cmp yes no)
+
+// Special case for floating point - LF/LEF not generated
+(NE (TESTB (SETGF cmp) (SETGF cmp)) yes no) => (UGT cmp yes no)
+(NE (TESTB (SETGEF cmp) (SETGEF cmp)) yes no) => (UGE cmp yes no)
+(NE (TESTB (SETEQF cmp) (SETEQF cmp)) yes no) => (EQF cmp yes no)
+(NE (TESTB (SETNEF cmp) (SETNEF cmp)) yes no) => (NEF cmp yes no)
+
+// fold constants into instructions
+(ADDL x (MOVLconst [c])) => (ADDLconst [c] x)
+(ADDLcarry x (MOVLconst [c])) => (ADDLconstcarry [c] x)
+(ADCL x (MOVLconst [c]) f) => (ADCLconst [c] x f)
+
+(SUBL x (MOVLconst [c])) => (SUBLconst x [c])
+(SUBL (MOVLconst [c]) x) => (NEGL (SUBLconst <v.Type> x [c]))
+(SUBLcarry x (MOVLconst [c])) => (SUBLconstcarry [c] x)
+(SBBL x (MOVLconst [c]) f) => (SBBLconst [c] x f)
+
+(MULL x (MOVLconst [c])) => (MULLconst [c] x)
+(ANDL x (MOVLconst [c])) => (ANDLconst [c] x)
+
+(ANDLconst [c] (ANDLconst [d] x)) => (ANDLconst [c & d] x)
+(XORLconst [c] (XORLconst [d] x)) => (XORLconst [c ^ d] x)
+(MULLconst [c] (MULLconst [d] x)) => (MULLconst [c * d] x)
+
+(ORL x (MOVLconst [c])) => (ORLconst [c] x)
+(XORL x (MOVLconst [c])) => (XORLconst [c] x)
+
+(SHLL x (MOVLconst [c])) => (SHLLconst [c&31] x)
+(SHRL x (MOVLconst [c])) => (SHRLconst [c&31] x)
+(SHRW x (MOVLconst [c])) && c&31 < 16 => (SHRWconst [int16(c&31)] x)
+(SHRW _ (MOVLconst [c])) && c&31 >= 16 => (MOVLconst [0])
+(SHRB x (MOVLconst [c])) && c&31 < 8 => (SHRBconst [int8(c&31)] x)
+(SHRB _ (MOVLconst [c])) && c&31 >= 8 => (MOVLconst [0])
+
+(SARL x (MOVLconst [c])) => (SARLconst [c&31] x)
+(SARW x (MOVLconst [c])) => (SARWconst [int16(min(int64(c&31),15))] x)
+(SARB x (MOVLconst [c])) => (SARBconst [int8(min(int64(c&31),7))] x)
+
+(SARL x (ANDLconst [31] y)) => (SARL x y)
+(SHLL x (ANDLconst [31] y)) => (SHLL x y)
+(SHRL x (ANDLconst [31] y)) => (SHRL x y)
+
+// Rotate instructions
+
+(ADDL (SHLLconst [c] x) (SHRLconst [d] x)) && d == 32-c => (ROLLconst [c] x)
+( ORL (SHLLconst [c] x) (SHRLconst [d] x)) && d == 32-c => (ROLLconst [c] x)
+(XORL (SHLLconst [c] x) (SHRLconst [d] x)) && d == 32-c => (ROLLconst [c] x)
+
+(ADDL <t> (SHLLconst x [c]) (SHRWconst x [d])) && c < 16 && d == int16(16-c) && t.Size() == 2
+ => (ROLWconst x [int16(c)])
+( ORL <t> (SHLLconst x [c]) (SHRWconst x [d])) && c < 16 && d == int16(16-c) && t.Size() == 2
+ => (ROLWconst x [int16(c)])
+(XORL <t> (SHLLconst x [c]) (SHRWconst x [d])) && c < 16 && d == int16(16-c) && t.Size() == 2
+ => (ROLWconst x [int16(c)])
+
+(ADDL <t> (SHLLconst x [c]) (SHRBconst x [d])) && c < 8 && d == int8(8-c) && t.Size() == 1
+ => (ROLBconst x [int8(c)])
+( ORL <t> (SHLLconst x [c]) (SHRBconst x [d])) && c < 8 && d == int8(8-c) && t.Size() == 1
+ => (ROLBconst x [int8(c)])
+(XORL <t> (SHLLconst x [c]) (SHRBconst x [d])) && c < 8 && d == int8(8-c) && t.Size() == 1
+ => (ROLBconst x [int8(c)])
+
+(ROLLconst [c] (ROLLconst [d] x)) => (ROLLconst [(c+d)&31] x)
+(ROLWconst [c] (ROLWconst [d] x)) => (ROLWconst [(c+d)&15] x)
+(ROLBconst [c] (ROLBconst [d] x)) => (ROLBconst [(c+d)& 7] x)
+
+
+// Constant shift simplifications
+
+(SHLLconst x [0]) => x
+(SHRLconst x [0]) => x
+(SARLconst x [0]) => x
+
+(SHRWconst x [0]) => x
+(SARWconst x [0]) => x
+
+(SHRBconst x [0]) => x
+(SARBconst x [0]) => x
+
+(ROLLconst [0] x) => x
+(ROLWconst [0] x) => x
+(ROLBconst [0] x) => x
+
+// Note: the word and byte shifts keep the low 5 bits (not the low 4 or 3 bits)
+// because the x86 instructions are defined to use all 5 bits of the shift even
+// for the small shifts. I don't think we'll ever generate a weird shift (e.g.
+// (SHRW x (MOVLconst [24]))), but just in case.
+
+(CMPL x (MOVLconst [c])) => (CMPLconst x [c])
+(CMPL (MOVLconst [c]) x) => (InvertFlags (CMPLconst x [c]))
+(CMPW x (MOVLconst [c])) => (CMPWconst x [int16(c)])
+(CMPW (MOVLconst [c]) x) => (InvertFlags (CMPWconst x [int16(c)]))
+(CMPB x (MOVLconst [c])) => (CMPBconst x [int8(c)])
+(CMPB (MOVLconst [c]) x) => (InvertFlags (CMPBconst x [int8(c)]))
+
+// Canonicalize the order of arguments to comparisons - helps with CSE.
+(CMP(L|W|B) x y) && x.ID > y.ID => (InvertFlags (CMP(L|W|B) y x))
+
+// strength reduction
+// Assumes the following costs, from https://gmplib.org/~tege/x86-timing.pdf:
+// 1 - addl, shll, leal, negl, subl
+// 3 - imull
+// This limits the rewrites to two instructions.
+// Note that negl always operates in-place,
+// which can require a register-register move
+// to preserve the original value,
+// so it must be used with care.
+(MULLconst [-9] x) => (NEGL (LEAL8 <v.Type> x x))
+(MULLconst [-5] x) => (NEGL (LEAL4 <v.Type> x x))
+(MULLconst [-3] x) => (NEGL (LEAL2 <v.Type> x x))
+(MULLconst [-1] x) => (NEGL x)
+(MULLconst [0] _) => (MOVLconst [0])
+(MULLconst [1] x) => x
+(MULLconst [3] x) => (LEAL2 x x)
+(MULLconst [5] x) => (LEAL4 x x)
+(MULLconst [7] x) => (LEAL2 x (LEAL2 <v.Type> x x))
+(MULLconst [9] x) => (LEAL8 x x)
+(MULLconst [11] x) => (LEAL2 x (LEAL4 <v.Type> x x))
+(MULLconst [13] x) => (LEAL4 x (LEAL2 <v.Type> x x))
+(MULLconst [19] x) => (LEAL2 x (LEAL8 <v.Type> x x))
+(MULLconst [21] x) => (LEAL4 x (LEAL4 <v.Type> x x))
+(MULLconst [25] x) => (LEAL8 x (LEAL2 <v.Type> x x))
+(MULLconst [27] x) => (LEAL8 (LEAL2 <v.Type> x x) (LEAL2 <v.Type> x x))
+(MULLconst [37] x) => (LEAL4 x (LEAL8 <v.Type> x x))
+(MULLconst [41] x) => (LEAL8 x (LEAL4 <v.Type> x x))
+(MULLconst [45] x) => (LEAL8 (LEAL4 <v.Type> x x) (LEAL4 <v.Type> x x))
+(MULLconst [73] x) => (LEAL8 x (LEAL8 <v.Type> x x))
+(MULLconst [81] x) => (LEAL8 (LEAL8 <v.Type> x x) (LEAL8 <v.Type> x x))
+
+(MULLconst [c] x) && isPowerOfTwo32(c+1) && c >= 15 => (SUBL (SHLLconst <v.Type> [int32(log32(c+1))] x) x)
+(MULLconst [c] x) && isPowerOfTwo32(c-1) && c >= 17 => (LEAL1 (SHLLconst <v.Type> [int32(log32(c-1))] x) x)
+(MULLconst [c] x) && isPowerOfTwo32(c-2) && c >= 34 => (LEAL2 (SHLLconst <v.Type> [int32(log32(c-2))] x) x)
+(MULLconst [c] x) && isPowerOfTwo32(c-4) && c >= 68 => (LEAL4 (SHLLconst <v.Type> [int32(log32(c-4))] x) x)
+(MULLconst [c] x) && isPowerOfTwo32(c-8) && c >= 136 => (LEAL8 (SHLLconst <v.Type> [int32(log32(c-8))] x) x)
+(MULLconst [c] x) && c%3 == 0 && isPowerOfTwo32(c/3) => (SHLLconst [int32(log32(c/3))] (LEAL2 <v.Type> x x))
+(MULLconst [c] x) && c%5 == 0 && isPowerOfTwo32(c/5) => (SHLLconst [int32(log32(c/5))] (LEAL4 <v.Type> x x))
+(MULLconst [c] x) && c%9 == 0 && isPowerOfTwo32(c/9) => (SHLLconst [int32(log32(c/9))] (LEAL8 <v.Type> x x))
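As a quick check of one decomposition above: LEAL2 x x computes x + 2*x = 3*x, and LEAL8 a b computes a + 8*b, so the [27] rule yields 3*x + 8*(3*x) = 27*x, i.e. two 1-cycle LEAs in place of a 3-cycle IMUL (costs per the timing table cited above). A Go sketch of the identity:

func mul27(x int32) int32 {
	t := x + 2*x   // (LEAL2 x x)  = 3*x
	return t + 8*t // (LEAL8 t t)  = 3*x + 8*(3*x) = 27*x
}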
+
+// combine add/shift into LEAL
+(ADDL x (SHLLconst [3] y)) => (LEAL8 x y)
+(ADDL x (SHLLconst [2] y)) => (LEAL4 x y)
+(ADDL x (SHLLconst [1] y)) => (LEAL2 x y)
+(ADDL x (ADDL y y)) => (LEAL2 x y)
+(ADDL x (ADDL x y)) => (LEAL2 y x)
+
+// combine ADDL/ADDLconst into LEAL1
+(ADDLconst [c] (ADDL x y)) => (LEAL1 [c] x y)
+(ADDL (ADDLconst [c] x) y) => (LEAL1 [c] x y)
+
+// fold ADDL into LEAL
+(ADDLconst [c] (LEAL [d] {s} x)) && is32Bit(int64(c)+int64(d)) => (LEAL [c+d] {s} x)
+(LEAL [c] {s} (ADDLconst [d] x)) && is32Bit(int64(c)+int64(d)) => (LEAL [c+d] {s} x)
+(ADDLconst [c] x:(SP)) => (LEAL [c] x) // so it is rematerializeable
+(LEAL [c] {s} (ADDL x y)) && x.Op != OpSB && y.Op != OpSB => (LEAL1 [c] {s} x y)
+(ADDL x (LEAL [c] {s} y)) && x.Op != OpSB && y.Op != OpSB => (LEAL1 [c] {s} x y)
+
+// fold ADDLconst into LEALx
+(ADDLconst [c] (LEAL1 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEAL1 [c+d] {s} x y)
+(ADDLconst [c] (LEAL2 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEAL2 [c+d] {s} x y)
+(ADDLconst [c] (LEAL4 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEAL4 [c+d] {s} x y)
+(ADDLconst [c] (LEAL8 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEAL8 [c+d] {s} x y)
+(LEAL1 [c] {s} (ADDLconst [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEAL1 [c+d] {s} x y)
+(LEAL2 [c] {s} (ADDLconst [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEAL2 [c+d] {s} x y)
+(LEAL2 [c] {s} x (ADDLconst [d] y)) && is32Bit(int64(c)+2*int64(d)) && y.Op != OpSB => (LEAL2 [c+2*d] {s} x y)
+(LEAL4 [c] {s} (ADDLconst [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEAL4 [c+d] {s} x y)
+(LEAL4 [c] {s} x (ADDLconst [d] y)) && is32Bit(int64(c)+4*int64(d)) && y.Op != OpSB => (LEAL4 [c+4*d] {s} x y)
+(LEAL8 [c] {s} (ADDLconst [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEAL8 [c+d] {s} x y)
+(LEAL8 [c] {s} x (ADDLconst [d] y)) && is32Bit(int64(c)+8*int64(d)) && y.Op != OpSB => (LEAL8 [c+8*d] {s} x y)
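The scaled-offset folds above follow from the LEALk addressing arithmetic: LEAL2 [c] x (ADDLconst [d] y) computes c + x + 2*(y+d) = (c+2*d) + x + 2*y, which is exactly LEAL2 [c+2*d] x y, provided c+2*d still fits in 32 bits. A minimal sketch of the identity:

func leal2(off, x, y int32) int32 { return off + x + 2*y }

// leal2(c, x, y+d) == leal2(c+2*d, x, y) whenever the sums do not overflow.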
+
+// fold shifts into LEALx
+(LEAL1 [c] {s} x (SHLLconst [1] y)) => (LEAL2 [c] {s} x y)
+(LEAL1 [c] {s} x (SHLLconst [2] y)) => (LEAL4 [c] {s} x y)
+(LEAL1 [c] {s} x (SHLLconst [3] y)) => (LEAL8 [c] {s} x y)
+(LEAL2 [c] {s} x (SHLLconst [1] y)) => (LEAL4 [c] {s} x y)
+(LEAL2 [c] {s} x (SHLLconst [2] y)) => (LEAL8 [c] {s} x y)
+(LEAL4 [c] {s} x (SHLLconst [1] y)) => (LEAL8 [c] {s} x y)
+
+// reverse ordering of compare instruction
+(SETL (InvertFlags x)) => (SETG x)
+(SETG (InvertFlags x)) => (SETL x)
+(SETB (InvertFlags x)) => (SETA x)
+(SETA (InvertFlags x)) => (SETB x)
+(SETLE (InvertFlags x)) => (SETGE x)
+(SETGE (InvertFlags x)) => (SETLE x)
+(SETBE (InvertFlags x)) => (SETAE x)
+(SETAE (InvertFlags x)) => (SETBE x)
+(SETEQ (InvertFlags x)) => (SETEQ x)
+(SETNE (InvertFlags x)) => (SETNE x)
+
+// sign extended loads
+// Note: The combined instruction must end up in the same block
+// as the original load. If not, we end up making a value with
+// memory type live in two different blocks, which can lead to
+// multiple memory values alive simultaneously.
+// Make sure we don't combine these ops if the load has another use.
+// This prevents a single load from being split into multiple loads
+// which then might return different values. See test/atomicload.go.
+(MOVBLSX x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBLSXload <v.Type> [off] {sym} ptr mem)
+(MOVBLZX x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBload <v.Type> [off] {sym} ptr mem)
+(MOVWLSX x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWLSXload <v.Type> [off] {sym} ptr mem)
+(MOVWLZX x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWload <v.Type> [off] {sym} ptr mem)
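The Go source these fusions target looks roughly like the following (hypothetical example); the MOVBload feeding the MOVBLSX conversion collapses into a single MOVBLSXload, subject to the single-use and same-block conditions above:

func loadInt8(p *int8) int32 { return int32(*p) }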
+
+// replace load from same location as preceding store with zero/sign extension (or copy in case of full width)
+(MOVBload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBLZX x)
+(MOVWload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVWLZX x)
+(MOVLload [off] {sym} ptr (MOVLstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x
+(MOVBLSXload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBLSX x)
+(MOVWLSXload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVWLSX x)
+
+// Fold extensions and ANDs together.
+(MOVBLZX (ANDLconst [c] x)) => (ANDLconst [c & 0xff] x)
+(MOVWLZX (ANDLconst [c] x)) => (ANDLconst [c & 0xffff] x)
+(MOVBLSX (ANDLconst [c] x)) && c & 0x80 == 0 => (ANDLconst [c & 0x7f] x)
+(MOVWLSX (ANDLconst [c] x)) && c & 0x8000 == 0 => (ANDLconst [c & 0x7fff] x)
+
+// Don't extend before storing
+(MOVWstore [off] {sym} ptr (MOVWL(S|Z)X x) mem) => (MOVWstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBL(S|Z)X x) mem) => (MOVBstore [off] {sym} ptr x mem)
+
+// fold constants into memory operations
+// Note that this is not always a good idea because if not all the uses of
+// the ADDLconst get eliminated, we still have to compute the ADDLconst and we now
+// have potentially two live values (ptr and (ADDLconst [off] ptr)) instead of one.
+// Nevertheless, let's do it!
+(MOV(L|W|B|SS|SD)load [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOV(L|W|B|SS|SD)load [off1+off2] {sym} ptr mem)
+(MOV(L|W|B|SS|SD)store [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOV(L|W|B|SS|SD)store [off1+off2] {sym} ptr val mem)
+
+((ADD|SUB|MUL|AND|OR|XOR)Lload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ ((ADD|SUB|MUL|AND|OR|XOR)Lload [off1+off2] {sym} val base mem)
+((ADD|SUB|MUL|DIV)SSload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ ((ADD|SUB|MUL|DIV)SSload [off1+off2] {sym} val base mem)
+((ADD|SUB|MUL|DIV)SDload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ ((ADD|SUB|MUL|DIV)SDload [off1+off2] {sym} val base mem)
+((ADD|SUB|AND|OR|XOR)Lmodify [off1] {sym} (ADDLconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) =>
+ ((ADD|SUB|AND|OR|XOR)Lmodify [off1+off2] {sym} base val mem)
+((ADD|AND|OR|XOR)Lconstmodify [valoff1] {sym} (ADDLconst [off2] base) mem) && valoff1.canAdd32(off2) =>
+ ((ADD|AND|OR|XOR)Lconstmodify [valoff1.addOffset32(off2)] {sym} base mem)
+
+// Fold constants into stores.
+(MOVLstore [off] {sym} ptr (MOVLconst [c]) mem) && validOff(int64(off)) =>
+ (MOVLstoreconst [makeValAndOff32(c,off)] {sym} ptr mem)
+(MOVWstore [off] {sym} ptr (MOVLconst [c]) mem) && validOff(int64(off)) =>
+ (MOVWstoreconst [makeValAndOff32(c,off)] {sym} ptr mem)
+(MOVBstore [off] {sym} ptr (MOVLconst [c]) mem) && validOff(int64(off)) =>
+ (MOVBstoreconst [makeValAndOff32(c,off)] {sym} ptr mem)
+
+// Fold address offsets into constant stores.
+(MOV(L|W|B)storeconst [sc] {s} (ADDLconst [off] ptr) mem) && sc.canAdd32(off) =>
+ (MOV(L|W|B)storeconst [sc.addOffset32(off)] {s} ptr mem)
+
+// We need to fold LEAL into the MOVx ops so that the live variable analysis knows
+// what variables are being read/written by the ops.
+// Note: we turn off this merging for operations on globals when building
+// position-independent code (when Flag_shared is set).
+// PIC needs a spare register to load the PC into. Having the LEAL be
+// a separate instruction gives us that register. Having the LEAL be
+// a separate instruction also allows it to be CSEd (which is good because
+// it compiles to a thunk call).
+(MOV(L|W|B|SS|SD|BLSX|WLSX)load [off1] {sym1} (LEAL [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)
+ && (base.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOV(L|W|B|SS|SD|BLSX|WLSX)load [off1+off2] {mergeSym(sym1,sym2)} base mem)
+
+(MOV(L|W|B|SS|SD)store [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)
+ && (base.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOV(L|W|B|SS|SD)store [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+
+(MOV(L|W|B)storeconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && sc.canAdd32(off)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOV(L|W|B)storeconst [sc.addOffset32(off)] {mergeSym(sym1, sym2)} ptr mem)
+
+((ADD|SUB|MUL|AND|OR|XOR)Lload [off1] {sym1} val (LEAL [off2] {sym2} base) mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) =>
+ ((ADD|SUB|MUL|AND|OR|XOR)Lload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+((ADD|SUB|MUL|DIV)SSload [off1] {sym1} val (LEAL [off2] {sym2} base) mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) =>
+ ((ADD|SUB|MUL|DIV)SSload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+((ADD|SUB|MUL|DIV)SDload [off1] {sym1} val (LEAL [off2] {sym2} base) mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) =>
+ ((ADD|SUB|MUL|DIV)SDload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+((ADD|SUB|AND|OR|XOR)Lmodify [off1] {sym1} (LEAL [off2] {sym2} base) val mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) =>
+ ((ADD|SUB|AND|OR|XOR)Lmodify [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+((ADD|AND|OR|XOR)Lconstmodify [valoff1] {sym1} (LEAL [off2] {sym2} base) mem)
+ && valoff1.canAdd32(off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) =>
+ ((ADD|AND|OR|XOR)Lconstmodify [valoff1.addOffset32(off2)] {mergeSym(sym1,sym2)} base mem)
+
+// Merge load/store to op
+((ADD|AND|OR|XOR|SUB|MUL)L x l:(MOVLload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|AND|OR|XOR|SUB|MUL)Lload x [off] {sym} ptr mem)
+((ADD|SUB|MUL|DIV)SD x l:(MOVSDload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|MUL|DIV)SDload x [off] {sym} ptr mem)
+((ADD|SUB|MUL|DIV)SS x l:(MOVSSload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|MUL|DIV)SSload x [off] {sym} ptr mem)
+(MOVLstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Lload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) => ((ADD|AND|OR|XOR)Lmodify [off] {sym} ptr x mem)
+(MOVLstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)L l:(MOVLload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) =>
+ ((ADD|SUB|AND|OR|XOR)Lmodify [off] {sym} ptr x mem)
+(MOVLstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Lconst [c] l:(MOVLload [off] {sym} ptr mem)) mem)
+ && y.Uses==1 && l.Uses==1 && clobber(y, l) && validValAndOff(int64(c),int64(off)) =>
+ ((ADD|AND|OR|XOR)Lconstmodify [makeValAndOff32(c,off)] {sym} ptr mem)
+
+// fold LEALs together
+(LEAL [off1] {sym1} (LEAL [off2] {sym2} x)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAL [off1+off2] {mergeSym(sym1,sym2)} x)
+
+// LEAL into LEAL1
+(LEAL1 [off1] {sym1} (LEAL [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB =>
+ (LEAL1 [off1+off2] {mergeSym(sym1,sym2)} x y)
+
+// LEAL1 into LEAL
+(LEAL [off1] {sym1} (LEAL1 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAL1 [off1+off2] {mergeSym(sym1,sym2)} x y)
+
+// LEAL into LEAL[248]
+(LEAL2 [off1] {sym1} (LEAL [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB =>
+ (LEAL2 [off1+off2] {mergeSym(sym1,sym2)} x y)
+(LEAL4 [off1] {sym1} (LEAL [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB =>
+ (LEAL4 [off1+off2] {mergeSym(sym1,sym2)} x y)
+(LEAL8 [off1] {sym1} (LEAL [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB =>
+ (LEAL8 [off1+off2] {mergeSym(sym1,sym2)} x y)
+
+// LEAL[248] into LEAL
+(LEAL [off1] {sym1} (LEAL2 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAL2 [off1+off2] {mergeSym(sym1,sym2)} x y)
+(LEAL [off1] {sym1} (LEAL4 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAL4 [off1+off2] {mergeSym(sym1,sym2)} x y)
+(LEAL [off1] {sym1} (LEAL8 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAL8 [off1+off2] {mergeSym(sym1,sym2)} x y)
+
+// LEAL[1248] into LEAL[1248]. Only some such merges are possible.
+(LEAL1 [off1] {sym1} x (LEAL1 [off2] {sym2} y y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAL2 [off1+off2] {mergeSym(sym1, sym2)} x y)
+(LEAL1 [off1] {sym1} x (LEAL1 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAL2 [off1+off2] {mergeSym(sym1, sym2)} y x)
+(LEAL2 [off1] {sym} x (LEAL1 [off2] {nil} y y)) && is32Bit(int64(off1)+2*int64(off2)) =>
+ (LEAL4 [off1+2*off2] {sym} x y)
+(LEAL4 [off1] {sym} x (LEAL1 [off2] {nil} y y)) && is32Bit(int64(off1)+4*int64(off2)) =>
+ (LEAL8 [off1+4*off2] {sym} x y)
+
+// Absorb InvertFlags into branches.
+(LT (InvertFlags cmp) yes no) => (GT cmp yes no)
+(GT (InvertFlags cmp) yes no) => (LT cmp yes no)
+(LE (InvertFlags cmp) yes no) => (GE cmp yes no)
+(GE (InvertFlags cmp) yes no) => (LE cmp yes no)
+(ULT (InvertFlags cmp) yes no) => (UGT cmp yes no)
+(UGT (InvertFlags cmp) yes no) => (ULT cmp yes no)
+(ULE (InvertFlags cmp) yes no) => (UGE cmp yes no)
+(UGE (InvertFlags cmp) yes no) => (ULE cmp yes no)
+(EQ (InvertFlags cmp) yes no) => (EQ cmp yes no)
+(NE (InvertFlags cmp) yes no) => (NE cmp yes no)
+
+// Constant comparisons.
+(CMPLconst (MOVLconst [x]) [y]) && x==y => (FlagEQ)
+(CMPLconst (MOVLconst [x]) [y]) && x<y && uint32(x)<uint32(y) => (FlagLT_ULT)
+(CMPLconst (MOVLconst [x]) [y]) && x<y && uint32(x)>uint32(y) => (FlagLT_UGT)
+(CMPLconst (MOVLconst [x]) [y]) && x>y && uint32(x)<uint32(y) => (FlagGT_ULT)
+(CMPLconst (MOVLconst [x]) [y]) && x>y && uint32(x)>uint32(y) => (FlagGT_UGT)
+
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)==y => (FlagEQ)
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)<y && uint16(x)<uint16(y) => (FlagLT_ULT)
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)<y && uint16(x)>uint16(y) => (FlagLT_UGT)
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)>y && uint16(x)<uint16(y) => (FlagGT_ULT)
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)>y && uint16(x)>uint16(y) => (FlagGT_UGT)
+
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)==y => (FlagEQ)
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)<y && uint8(x)<uint8(y) => (FlagLT_ULT)
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)<y && uint8(x)>uint8(y) => (FlagLT_UGT)
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)>y && uint8(x)<uint8(y) => (FlagGT_ULT)
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)>y && uint8(x)>uint8(y) => (FlagGT_UGT)
+
+// Other known comparisons.
+(CMPLconst (SHRLconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 32 && (1<<uint64(32-c)) <= uint64(n) => (FlagLT_ULT)
+(CMPLconst (ANDLconst _ [m]) [n]) && 0 <= m && m < n => (FlagLT_ULT)
+(CMPWconst (ANDLconst _ [m]) [n]) && 0 <= int16(m) && int16(m) < n => (FlagLT_ULT)
+(CMPBconst (ANDLconst _ [m]) [n]) && 0 <= int8(m) && int8(m) < n => (FlagLT_ULT)
+// TODO: DIVxU also.
+
+// Absorb flag constants into SBB ops.
+(SBBLcarrymask (FlagEQ)) => (MOVLconst [0])
+(SBBLcarrymask (FlagLT_ULT)) => (MOVLconst [-1])
+(SBBLcarrymask (FlagLT_UGT)) => (MOVLconst [0])
+(SBBLcarrymask (FlagGT_ULT)) => (MOVLconst [-1])
+(SBBLcarrymask (FlagGT_UGT)) => (MOVLconst [0])
+
+// Absorb flag constants into branches.
+(EQ (FlagEQ) yes no) => (First yes no)
+(EQ (FlagLT_ULT) yes no) => (First no yes)
+(EQ (FlagLT_UGT) yes no) => (First no yes)
+(EQ (FlagGT_ULT) yes no) => (First no yes)
+(EQ (FlagGT_UGT) yes no) => (First no yes)
+
+(NE (FlagEQ) yes no) => (First no yes)
+(NE (FlagLT_ULT) yes no) => (First yes no)
+(NE (FlagLT_UGT) yes no) => (First yes no)
+(NE (FlagGT_ULT) yes no) => (First yes no)
+(NE (FlagGT_UGT) yes no) => (First yes no)
+
+(LT (FlagEQ) yes no) => (First no yes)
+(LT (FlagLT_ULT) yes no) => (First yes no)
+(LT (FlagLT_UGT) yes no) => (First yes no)
+(LT (FlagGT_ULT) yes no) => (First no yes)
+(LT (FlagGT_UGT) yes no) => (First no yes)
+
+(LE (FlagEQ) yes no) => (First yes no)
+(LE (FlagLT_ULT) yes no) => (First yes no)
+(LE (FlagLT_UGT) yes no) => (First yes no)
+(LE (FlagGT_ULT) yes no) => (First no yes)
+(LE (FlagGT_UGT) yes no) => (First no yes)
+
+(GT (FlagEQ) yes no) => (First no yes)
+(GT (FlagLT_ULT) yes no) => (First no yes)
+(GT (FlagLT_UGT) yes no) => (First no yes)
+(GT (FlagGT_ULT) yes no) => (First yes no)
+(GT (FlagGT_UGT) yes no) => (First yes no)
+
+(GE (FlagEQ) yes no) => (First yes no)
+(GE (FlagLT_ULT) yes no) => (First no yes)
+(GE (FlagLT_UGT) yes no) => (First no yes)
+(GE (FlagGT_ULT) yes no) => (First yes no)
+(GE (FlagGT_UGT) yes no) => (First yes no)
+
+(ULT (FlagEQ) yes no) => (First no yes)
+(ULT (FlagLT_ULT) yes no) => (First yes no)
+(ULT (FlagLT_UGT) yes no) => (First no yes)
+(ULT (FlagGT_ULT) yes no) => (First yes no)
+(ULT (FlagGT_UGT) yes no) => (First no yes)
+
+(ULE (FlagEQ) yes no) => (First yes no)
+(ULE (FlagLT_ULT) yes no) => (First yes no)
+(ULE (FlagLT_UGT) yes no) => (First no yes)
+(ULE (FlagGT_ULT) yes no) => (First yes no)
+(ULE (FlagGT_UGT) yes no) => (First no yes)
+
+(UGT (FlagEQ) yes no) => (First no yes)
+(UGT (FlagLT_ULT) yes no) => (First no yes)
+(UGT (FlagLT_UGT) yes no) => (First yes no)
+(UGT (FlagGT_ULT) yes no) => (First no yes)
+(UGT (FlagGT_UGT) yes no) => (First yes no)
+
+(UGE (FlagEQ) yes no) => (First yes no)
+(UGE (FlagLT_ULT) yes no) => (First no yes)
+(UGE (FlagLT_UGT) yes no) => (First yes no)
+(UGE (FlagGT_ULT) yes no) => (First no yes)
+(UGE (FlagGT_UGT) yes no) => (First yes no)
+
+// Absorb flag constants into SETxx ops.
+(SETEQ (FlagEQ)) => (MOVLconst [1])
+(SETEQ (FlagLT_ULT)) => (MOVLconst [0])
+(SETEQ (FlagLT_UGT)) => (MOVLconst [0])
+(SETEQ (FlagGT_ULT)) => (MOVLconst [0])
+(SETEQ (FlagGT_UGT)) => (MOVLconst [0])
+
+(SETNE (FlagEQ)) => (MOVLconst [0])
+(SETNE (FlagLT_ULT)) => (MOVLconst [1])
+(SETNE (FlagLT_UGT)) => (MOVLconst [1])
+(SETNE (FlagGT_ULT)) => (MOVLconst [1])
+(SETNE (FlagGT_UGT)) => (MOVLconst [1])
+
+(SETL (FlagEQ)) => (MOVLconst [0])
+(SETL (FlagLT_ULT)) => (MOVLconst [1])
+(SETL (FlagLT_UGT)) => (MOVLconst [1])
+(SETL (FlagGT_ULT)) => (MOVLconst [0])
+(SETL (FlagGT_UGT)) => (MOVLconst [0])
+
+(SETLE (FlagEQ)) => (MOVLconst [1])
+(SETLE (FlagLT_ULT)) => (MOVLconst [1])
+(SETLE (FlagLT_UGT)) => (MOVLconst [1])
+(SETLE (FlagGT_ULT)) => (MOVLconst [0])
+(SETLE (FlagGT_UGT)) => (MOVLconst [0])
+
+(SETG (FlagEQ)) => (MOVLconst [0])
+(SETG (FlagLT_ULT)) => (MOVLconst [0])
+(SETG (FlagLT_UGT)) => (MOVLconst [0])
+(SETG (FlagGT_ULT)) => (MOVLconst [1])
+(SETG (FlagGT_UGT)) => (MOVLconst [1])
+
+(SETGE (FlagEQ)) => (MOVLconst [1])
+(SETGE (FlagLT_ULT)) => (MOVLconst [0])
+(SETGE (FlagLT_UGT)) => (MOVLconst [0])
+(SETGE (FlagGT_ULT)) => (MOVLconst [1])
+(SETGE (FlagGT_UGT)) => (MOVLconst [1])
+
+(SETB (FlagEQ)) => (MOVLconst [0])
+(SETB (FlagLT_ULT)) => (MOVLconst [1])
+(SETB (FlagLT_UGT)) => (MOVLconst [0])
+(SETB (FlagGT_ULT)) => (MOVLconst [1])
+(SETB (FlagGT_UGT)) => (MOVLconst [0])
+
+(SETBE (FlagEQ)) => (MOVLconst [1])
+(SETBE (FlagLT_ULT)) => (MOVLconst [1])
+(SETBE (FlagLT_UGT)) => (MOVLconst [0])
+(SETBE (FlagGT_ULT)) => (MOVLconst [1])
+(SETBE (FlagGT_UGT)) => (MOVLconst [0])
+
+(SETA (FlagEQ)) => (MOVLconst [0])
+(SETA (FlagLT_ULT)) => (MOVLconst [0])
+(SETA (FlagLT_UGT)) => (MOVLconst [1])
+(SETA (FlagGT_ULT)) => (MOVLconst [0])
+(SETA (FlagGT_UGT)) => (MOVLconst [1])
+
+(SETAE (FlagEQ)) => (MOVLconst [1])
+(SETAE (FlagLT_ULT)) => (MOVLconst [0])
+(SETAE (FlagLT_UGT)) => (MOVLconst [1])
+(SETAE (FlagGT_ULT)) => (MOVLconst [0])
+(SETAE (FlagGT_UGT)) => (MOVLconst [1])
+
+// Remove redundant *const ops
+(ADDLconst [c] x) && c==0 => x
+(SUBLconst [c] x) && c==0 => x
+(ANDLconst [c] _) && c==0 => (MOVLconst [0])
+(ANDLconst [c] x) && c==-1 => x
+(ORLconst [c] x) && c==0 => x
+(ORLconst [c] _) && c==-1 => (MOVLconst [-1])
+(XORLconst [c] x) && c==0 => x
+// TODO: since we got rid of the W/B versions, we might miss
+// things like (ANDLconst [0x100] x) which were formerly
+// (ANDBconst [0] x). Probably doesn't happen very often.
+// If we cared, we might do:
+// (ANDLconst <t> [c] x) && t.Size()==1 && int8(x)==0 => (MOVLconst [0])
+
+// Convert constant subtracts to constant adds
+(SUBLconst [c] x) => (ADDLconst [-c] x)
+
+// generic constant folding
+// TODO: more of this
+(ADDLconst [c] (MOVLconst [d])) => (MOVLconst [c+d])
+(ADDLconst [c] (ADDLconst [d] x)) => (ADDLconst [c+d] x)
+(SARLconst [c] (MOVLconst [d])) => (MOVLconst [d>>uint64(c)])
+(SARWconst [c] (MOVLconst [d])) => (MOVLconst [d>>uint64(c)])
+(SARBconst [c] (MOVLconst [d])) => (MOVLconst [d>>uint64(c)])
+(NEGL (MOVLconst [c])) => (MOVLconst [-c])
+(MULLconst [c] (MOVLconst [d])) => (MOVLconst [c*d])
+(ANDLconst [c] (MOVLconst [d])) => (MOVLconst [c&d])
+(ORLconst [c] (MOVLconst [d])) => (MOVLconst [c|d])
+(XORLconst [c] (MOVLconst [d])) => (MOVLconst [c^d])
+(NOTL (MOVLconst [c])) => (MOVLconst [^c])
+
+// generic simplifications
+// TODO: more of this
+(ADDL x (NEGL y)) => (SUBL x y)
+(SUBL x x) => (MOVLconst [0])
+(ANDL x x) => x
+(ORL x x) => x
+(XORL x x) => (MOVLconst [0])
+
+// checking AND against 0.
+(CMP(L|W|B)const l:(ANDL x y) [0]) && l.Uses==1 => (TEST(L|W|B) x y)
+(CMPLconst l:(ANDLconst [c] x) [0]) && l.Uses==1 => (TESTLconst [c] x)
+(CMPWconst l:(ANDLconst [c] x) [0]) && l.Uses==1 => (TESTWconst [int16(c)] x)
+(CMPBconst l:(ANDLconst [c] x) [0]) && l.Uses==1 => (TESTBconst [int8(c)] x)
+
+// TEST %reg,%reg is shorter than CMP
+(CMP(L|W|B)const x [0]) => (TEST(L|W|B) x x)
+
+// Convert LEAL1 back to ADDL if we can
+(LEAL1 [0] {nil} x y) => (ADDL x y)
+
+// Combining byte loads into larger (unaligned) loads.
+// There are many ways these combinations could occur. This is
+// designed to match the way encoding/binary.LittleEndian does it.
+(ORL x0:(MOVBload [i0] {s} p mem)
+ s0:(SHLLconst [8] x1:(MOVBload [i1] {s} p mem)))
+ && i1 == i0+1
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, s0)
+ => @mergePoint(b,x0,x1) (MOVWload [i0] {s} p mem)
+
+(ORL x0:(MOVBload [i] {s} p0 mem)
+ s0:(SHLLconst [8] x1:(MOVBload [i] {s} p1 mem)))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, s0)
+ => @mergePoint(b,x0,x1) (MOVWload [i] {s} p0 mem)
+
+(ORL o0:(ORL
+ x0:(MOVWload [i0] {s} p mem)
+ s0:(SHLLconst [16] x1:(MOVBload [i2] {s} p mem)))
+ s1:(SHLLconst [24] x2:(MOVBload [i3] {s} p mem)))
+ && i2 == i0+2
+ && i3 == i0+3
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && o0.Uses == 1
+ && mergePoint(b,x0,x1,x2) != nil
+ && clobber(x0, x1, x2, s0, s1, o0)
+ => @mergePoint(b,x0,x1,x2) (MOVLload [i0] {s} p mem)
+
+(ORL o0:(ORL
+ x0:(MOVWload [i] {s} p0 mem)
+ s0:(SHLLconst [16] x1:(MOVBload [i] {s} p1 mem)))
+ s1:(SHLLconst [24] x2:(MOVBload [i] {s} p2 mem)))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && o0.Uses == 1
+ && sequentialAddresses(p0, p1, 2)
+ && sequentialAddresses(p1, p2, 1)
+ && mergePoint(b,x0,x1,x2) != nil
+ && clobber(x0, x1, x2, s0, s1, o0)
+ => @mergePoint(b,x0,x1,x2) (MOVLload [i] {s} p0 mem)
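The load-combining rules above are shaped after manual little-endian decoding of the kind encoding/binary.LittleEndian compiles to; a representative (hypothetical) function they are meant to catch:

func readUint32(b []byte) uint32 {
	// Byte loads joined by SHLLconst and ORL, which the rules above merge
	// stepwise into a single (unaligned) MOVLload once bounds checks allow it.
	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}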
+
+// Combine constant stores into larger (unaligned) stores.
+(MOVBstoreconst [c] {s} p x:(MOVBstoreconst [a] {s} p mem))
+ && x.Uses == 1
+ && a.Off() + 1 == c.Off()
+ && clobber(x)
+ => (MOVWstoreconst [makeValAndOff32(int32(a.Val()&0xff | c.Val()<<8), a.Off32())] {s} p mem)
+(MOVBstoreconst [a] {s} p x:(MOVBstoreconst [c] {s} p mem))
+ && x.Uses == 1
+ && a.Off() + 1 == c.Off()
+ && clobber(x)
+ => (MOVWstoreconst [makeValAndOff32(int32(a.Val()&0xff | c.Val()<<8), a.Off32())] {s} p mem)
+
+(MOVBstoreconst [c] {s} p1 x:(MOVBstoreconst [a] {s} p0 mem))
+ && x.Uses == 1
+ && a.Off() == c.Off()
+ && sequentialAddresses(p0, p1, 1)
+ && clobber(x)
+ => (MOVWstoreconst [makeValAndOff32(int32(a.Val()&0xff | c.Val()<<8), a.Off32())] {s} p0 mem)
+(MOVBstoreconst [a] {s} p0 x:(MOVBstoreconst [c] {s} p1 mem))
+ && x.Uses == 1
+ && a.Off() == c.Off()
+ && sequentialAddresses(p0, p1, 1)
+ && clobber(x)
+ => (MOVWstoreconst [makeValAndOff32(int32(a.Val()&0xff | c.Val()<<8), a.Off32())] {s} p0 mem)
+
+(MOVWstoreconst [c] {s} p x:(MOVWstoreconst [a] {s} p mem))
+ && x.Uses == 1
+ && a.Off() + 2 == c.Off()
+ && clobber(x)
+ => (MOVLstoreconst [makeValAndOff32(int32(a.Val()&0xffff | c.Val()<<16), a.Off32())] {s} p mem)
+(MOVWstoreconst [a] {s} p x:(MOVWstoreconst [c] {s} p mem))
+ && x.Uses == 1
+ && ValAndOff(a).Off() + 2 == ValAndOff(c).Off()
+ && clobber(x)
+ => (MOVLstoreconst [makeValAndOff32(int32(a.Val()&0xffff | c.Val()<<16), a.Off32())] {s} p mem)
+
+(MOVWstoreconst [c] {s} p1 x:(MOVWstoreconst [a] {s} p0 mem))
+ && x.Uses == 1
+ && a.Off() == c.Off()
+ && sequentialAddresses(p0, p1, 2)
+ && clobber(x)
+ => (MOVLstoreconst [makeValAndOff32(int32(a.Val()&0xffff | c.Val()<<16), a.Off32())] {s} p0 mem)
+(MOVWstoreconst [a] {s} p0 x:(MOVWstoreconst [c] {s} p1 mem))
+ && x.Uses == 1
+ && a.Off() == c.Off()
+ && sequentialAddresses(p0, p1, 2)
+ && clobber(x)
+ => (MOVLstoreconst [makeValAndOff32(int32(a.Val()&0xffff | c.Val()<<16), a.Off32())] {s} p0 mem)
+
+// Combine stores into larger (unaligned) stores.
+(MOVBstore [i] {s} p (SHR(W|L)const [8] w) x:(MOVBstore [i-1] {s} p w mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstore [i-1] {s} p w mem)
+(MOVBstore [i] {s} p w x:(MOVBstore {s} [i+1] p (SHR(W|L)const [8] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstore [i] {s} p w mem)
+(MOVBstore [i] {s} p (SHRLconst [j] w) x:(MOVBstore [i-1] {s} p w0:(SHRLconst [j-8] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstore [i-1] {s} p w0 mem)
+
+(MOVBstore [i] {s} p1 (SHR(W|L)const [8] w) x:(MOVBstore [i] {s} p0 w mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && clobber(x)
+ => (MOVWstore [i] {s} p0 w mem)
+(MOVBstore [i] {s} p0 w x:(MOVBstore {s} [i] p1 (SHR(W|L)const [8] w) mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && clobber(x)
+ => (MOVWstore [i] {s} p0 w mem)
+(MOVBstore [i] {s} p1 (SHRLconst [j] w) x:(MOVBstore [i] {s} p0 w0:(SHRLconst [j-8] w) mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && clobber(x)
+ => (MOVWstore [i] {s} p0 w0 mem)
+
+(MOVWstore [i] {s} p (SHRLconst [16] w) x:(MOVWstore [i-2] {s} p w mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVLstore [i-2] {s} p w mem)
+(MOVWstore [i] {s} p (SHRLconst [j] w) x:(MOVWstore [i-2] {s} p w0:(SHRLconst [j-16] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVLstore [i-2] {s} p w0 mem)
+
+(MOVWstore [i] {s} p1 (SHRLconst [16] w) x:(MOVWstore [i] {s} p0 w mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 2)
+ && clobber(x)
+ => (MOVLstore [i] {s} p0 w mem)
+(MOVWstore [i] {s} p1 (SHRLconst [j] w) x:(MOVWstore [i] {s} p0 w0:(SHRLconst [j-16] w) mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 2)
+ && clobber(x)
+ => (MOVLstore [i] {s} p0 w0 mem)
+
+// For PIC, break floating-point constant loading into two instructions so we have
+// a register to use for holding the address of the constant pool entry.
+(MOVSSconst [c]) && config.ctxt.Flag_shared => (MOVSSconst2 (MOVSSconst1 [c]))
+(MOVSDconst [c]) && config.ctxt.Flag_shared => (MOVSDconst2 (MOVSDconst1 [c]))
+
+(CMP(L|W|B) l:(MOV(L|W|B)load {sym} [off] ptr mem) x) && canMergeLoad(v, l) && clobber(l) => (CMP(L|W|B)load {sym} [off] ptr x mem)
+(CMP(L|W|B) x l:(MOV(L|W|B)load {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (InvertFlags (CMP(L|W|B)load {sym} [off] ptr x mem))
+
+(CMP(L|W|B)const l:(MOV(L|W|B)load {sym} [off] ptr mem) [c])
+ && l.Uses == 1
+ && validValAndOff(int64(c), int64(off))
+ && clobber(l) =>
+ @l.Block (CMP(L|W|B)constload {sym} [makeValAndOff32(int32(c),int32(off))] ptr mem)
+
+(CMPLload {sym} [off] ptr (MOVLconst [c]) mem) && validValAndOff(int64(c),int64(off)) => (CMPLconstload {sym} [makeValAndOff32(c,off)] ptr mem)
+(CMPWload {sym} [off] ptr (MOVLconst [c]) mem) && validValAndOff(int64(int16(c)),int64(off)) => (CMPWconstload {sym} [makeValAndOff32(int32(int16(c)),off)] ptr mem)
+(CMPBload {sym} [off] ptr (MOVLconst [c]) mem) && validValAndOff(int64(int8(c)),int64(off)) => (CMPBconstload {sym} [makeValAndOff32(int32(int8(c)),off)] ptr mem)
+
+(MOVBload [off] {sym} (SB) _) && symIsRO(sym) => (MOVLconst [int32(read8(sym, int64(off)))])
+(MOVWload [off] {sym} (SB) _) && symIsRO(sym) => (MOVLconst [int32(read16(sym, int64(off), config.ctxt.Arch.ByteOrder))])
+(MOVLload [off] {sym} (SB) _) && symIsRO(sym) => (MOVLconst [int32(read32(sym, int64(off), config.ctxt.Arch.ByteOrder))])
diff --git a/src/cmd/compile/internal/ssa/gen/386Ops.go b/src/cmd/compile/internal/ssa/gen/386Ops.go
new file mode 100644
index 0000000..737b99c
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/386Ops.go
@@ -0,0 +1,585 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+import "strings"
+
+// Notes:
+// - Integer types live in the low portion of registers. Upper portions are junk.
+// - Boolean types use the low-order byte of a register. 0=false, 1=true.
+// Upper bytes are junk.
+// - Floating-point types live in the low natural slot of an sse2 register.
+// Unused portions are junk.
+// - We do not use AH,BH,CH,DH registers.
+// - When doing sub-register operations, we try to write the whole
+// destination register to avoid a partial-register write.
+// - Unused portions of AuxInt (or the Val portion of ValAndOff) are
+// filled by sign-extending the used portion. Users of AuxInt which interpret
+// AuxInt as unsigned (e.g. shifts) must be careful.
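A small sketch of the AuxInt sign-extension convention in the last note (helper names are illustrative, not from this package):

func auxIntForInt8(v uint8) int64 { return int64(int8(v)) } // 0xFF is stored as -1 (sign-extended)
func lowByte(aux int64) uint8     { return uint8(aux) }     // a user wanting 0xFF masks it back out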
+
+// Suffixes encode the bit width of various instructions.
+// L (long word) = 32 bit
+// W (word) = 16 bit
+// B (byte) = 8 bit
+
+// copied from ../../x86/reg.go
+var regNames386 = []string{
+ "AX",
+ "CX",
+ "DX",
+ "BX",
+ "SP",
+ "BP",
+ "SI",
+ "DI",
+ "X0",
+ "X1",
+ "X2",
+ "X3",
+ "X4",
+ "X5",
+ "X6",
+ "X7",
+
+ // If you add registers, update asyncPreempt in runtime
+
+ // pseudo-registers
+ "SB",
+}
+
+func init() {
+ // Make map from reg names to reg integers.
+ if len(regNames386) > 64 {
+ panic("too many registers")
+ }
+ num := map[string]int{}
+ for i, name := range regNames386 {
+ num[name] = i
+ }
+ buildReg := func(s string) regMask {
+ m := regMask(0)
+ for _, r := range strings.Split(s, " ") {
+ if n, ok := num[r]; ok {
+ m |= regMask(1) << uint(n)
+ continue
+ }
+ panic("register " + r + " not found")
+ }
+ return m
+ }
+
+ // Common individual register masks
+ var (
+ ax = buildReg("AX")
+ cx = buildReg("CX")
+ dx = buildReg("DX")
+ bx = buildReg("BX")
+ si = buildReg("SI")
+ gp = buildReg("AX CX DX BX BP SI DI")
+ fp = buildReg("X0 X1 X2 X3 X4 X5 X6 X7")
+ gpsp = gp | buildReg("SP")
+ gpspsb = gpsp | buildReg("SB")
+ callerSave = gp | fp
+ )
+ // Common slices of register masks
+ var (
+ gponly = []regMask{gp}
+ fponly = []regMask{fp}
+ )
+
+ // Common regInfo
+ var (
+ gp01 = regInfo{inputs: nil, outputs: gponly}
+ gp11 = regInfo{inputs: []regMask{gp}, outputs: gponly}
+ gp11sp = regInfo{inputs: []regMask{gpsp}, outputs: gponly}
+ gp11sb = regInfo{inputs: []regMask{gpspsb}, outputs: gponly}
+ gp21 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
+ gp11carry = regInfo{inputs: []regMask{gp}, outputs: []regMask{gp, 0}}
+ gp21carry = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp, 0}}
+ gp1carry1 = regInfo{inputs: []regMask{gp}, outputs: gponly}
+ gp2carry1 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
+ gp21sp = regInfo{inputs: []regMask{gpsp, gp}, outputs: gponly}
+ gp21sb = regInfo{inputs: []regMask{gpspsb, gpsp}, outputs: gponly}
+ gp21shift = regInfo{inputs: []regMask{gp, cx}, outputs: []regMask{gp}}
+ gp11div = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{ax}, clobbers: dx}
+ gp21hmul = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx}, clobbers: ax}
+ gp11mod = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{dx}, clobbers: ax}
+ gp21mul = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx, ax}}
+
+ gp2flags = regInfo{inputs: []regMask{gpsp, gpsp}}
+ gp1flags = regInfo{inputs: []regMask{gpsp}}
+ gp0flagsLoad = regInfo{inputs: []regMask{gpspsb, 0}}
+ gp1flagsLoad = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}
+ flagsgp = regInfo{inputs: nil, outputs: gponly}
+
+ readflags = regInfo{inputs: nil, outputs: gponly}
+ flagsgpax = regInfo{inputs: nil, clobbers: ax, outputs: []regMask{gp &^ ax}}
+
+ gpload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: gponly}
+ gp21load = regInfo{inputs: []regMask{gp, gpspsb, 0}, outputs: gponly}
+ gploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: gponly}
+ gp21loadidx = regInfo{inputs: []regMask{gp, gpspsb, gpsp, 0}, outputs: gponly}
+
+ gpstore = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}
+ gpstoreconst = regInfo{inputs: []regMask{gpspsb, 0}}
+ gpstoreidx = regInfo{inputs: []regMask{gpspsb, gpsp, gpsp, 0}}
+ gpstoreconstidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}
+
+ fp01 = regInfo{inputs: nil, outputs: fponly}
+ fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: fponly}
+ fp21load = regInfo{inputs: []regMask{fp, gpspsb, 0}, outputs: fponly}
+ fpgp = regInfo{inputs: fponly, outputs: gponly}
+ gpfp = regInfo{inputs: gponly, outputs: fponly}
+ fp11 = regInfo{inputs: fponly, outputs: fponly}
+ fp2flags = regInfo{inputs: []regMask{fp, fp}}
+
+ fpload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: fponly}
+ fploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: fponly}
+
+ fpstore = regInfo{inputs: []regMask{gpspsb, fp, 0}}
+ fpstoreidx = regInfo{inputs: []regMask{gpspsb, gpsp, fp, 0}}
+ )
+
+ var _386ops = []opData{
+ // fp ops
+ {name: "ADDSS", argLength: 2, reg: fp21, asm: "ADDSS", commutative: true, resultInArg0: true, usesScratch: true}, // fp32 add
+ {name: "ADDSD", argLength: 2, reg: fp21, asm: "ADDSD", commutative: true, resultInArg0: true}, // fp64 add
+ {name: "SUBSS", argLength: 2, reg: fp21, asm: "SUBSS", resultInArg0: true, usesScratch: true}, // fp32 sub
+ {name: "SUBSD", argLength: 2, reg: fp21, asm: "SUBSD", resultInArg0: true}, // fp64 sub
+ {name: "MULSS", argLength: 2, reg: fp21, asm: "MULSS", commutative: true, resultInArg0: true, usesScratch: true}, // fp32 mul
+ {name: "MULSD", argLength: 2, reg: fp21, asm: "MULSD", commutative: true, resultInArg0: true}, // fp64 mul
+ {name: "DIVSS", argLength: 2, reg: fp21, asm: "DIVSS", resultInArg0: true, usesScratch: true}, // fp32 div
+ {name: "DIVSD", argLength: 2, reg: fp21, asm: "DIVSD", resultInArg0: true}, // fp64 div
+
+ {name: "MOVSSload", argLength: 2, reg: fpload, asm: "MOVSS", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // fp32 load
+ {name: "MOVSDload", argLength: 2, reg: fpload, asm: "MOVSD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // fp64 load
+ {name: "MOVSSconst", reg: fp01, asm: "MOVSS", aux: "Float32", rematerializeable: true}, // fp32 constant
+ {name: "MOVSDconst", reg: fp01, asm: "MOVSD", aux: "Float64", rematerializeable: true}, // fp64 constant
+ {name: "MOVSSloadidx1", argLength: 3, reg: fploadidx, asm: "MOVSS", aux: "SymOff", symEffect: "Read"}, // fp32 load indexed by i
+ {name: "MOVSSloadidx4", argLength: 3, reg: fploadidx, asm: "MOVSS", aux: "SymOff", symEffect: "Read"}, // fp32 load indexed by 4*i
+ {name: "MOVSDloadidx1", argLength: 3, reg: fploadidx, asm: "MOVSD", aux: "SymOff", symEffect: "Read"}, // fp64 load indexed by i
+ {name: "MOVSDloadidx8", argLength: 3, reg: fploadidx, asm: "MOVSD", aux: "SymOff", symEffect: "Read"}, // fp64 load indexed by 8*i
+
+ {name: "MOVSSstore", argLength: 3, reg: fpstore, asm: "MOVSS", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // fp32 store
+ {name: "MOVSDstore", argLength: 3, reg: fpstore, asm: "MOVSD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // fp64 store
+ {name: "MOVSSstoreidx1", argLength: 4, reg: fpstoreidx, asm: "MOVSS", aux: "SymOff", symEffect: "Write"}, // fp32 indexed by i store
+ {name: "MOVSSstoreidx4", argLength: 4, reg: fpstoreidx, asm: "MOVSS", aux: "SymOff", symEffect: "Write"}, // fp32 indexed by 4i store
+ {name: "MOVSDstoreidx1", argLength: 4, reg: fpstoreidx, asm: "MOVSD", aux: "SymOff", symEffect: "Write"}, // fp64 indexed by i store
+ {name: "MOVSDstoreidx8", argLength: 4, reg: fpstoreidx, asm: "MOVSD", aux: "SymOff", symEffect: "Write"}, // fp64 indexed by 8i store
+
+ {name: "ADDSSload", argLength: 3, reg: fp21load, asm: "ADDSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 + tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "ADDSDload", argLength: 3, reg: fp21load, asm: "ADDSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 + tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "SUBSSload", argLength: 3, reg: fp21load, asm: "SUBSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 - tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "SUBSDload", argLength: 3, reg: fp21load, asm: "SUBSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 - tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "MULSSload", argLength: 3, reg: fp21load, asm: "MULSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 * tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "MULSDload", argLength: 3, reg: fp21load, asm: "MULSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 * tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "DIVSSload", argLength: 3, reg: fp21load, asm: "DIVSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 / tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "DIVSDload", argLength: 3, reg: fp21load, asm: "DIVSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 / tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+
+ // binary ops
+ {name: "ADDL", argLength: 2, reg: gp21sp, asm: "ADDL", commutative: true, clobberFlags: true}, // arg0 + arg1
+ {name: "ADDLconst", argLength: 1, reg: gp11sp, asm: "ADDL", aux: "Int32", typ: "UInt32", clobberFlags: true}, // arg0 + auxint
+
+ {name: "ADDLcarry", argLength: 2, reg: gp21carry, asm: "ADDL", commutative: true, resultInArg0: true}, // arg0 + arg1, generates <carry,result> pair
+ {name: "ADDLconstcarry", argLength: 1, reg: gp11carry, asm: "ADDL", aux: "Int32", resultInArg0: true}, // arg0 + auxint, generates <carry,result> pair
+ {name: "ADCL", argLength: 3, reg: gp2carry1, asm: "ADCL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0+arg1+carry(arg2), where arg2 is flags
+ {name: "ADCLconst", argLength: 2, reg: gp1carry1, asm: "ADCL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0+auxint+carry(arg1), where arg1 is flags
+
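A hedged sketch of how the carry pair above composes into wider arithmetic (my own helper, relying only on the usual unsigned-overflow rule): ADDLcarry adds the low words and produces the carry, and ADCL folds that carry into the high-word add.

    // add64on386 is an illustrative model, not compiler or runtime code.
    func add64on386(xlo, xhi, ylo, yhi uint32) (lo, hi uint32) {
        lo = xlo + ylo // ADDLcarry: low words, carry out in the flags
        var carry uint32
        if lo < xlo { // unsigned wrap-around means the low add carried
            carry = 1
        }
        hi = xhi + yhi + carry // ADCL: high words plus the incoming carry
        return
    }
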
+ {name: "SUBL", argLength: 2, reg: gp21, asm: "SUBL", resultInArg0: true, clobberFlags: true}, // arg0 - arg1
+ {name: "SUBLconst", argLength: 1, reg: gp11, asm: "SUBL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 - auxint
+
+ {name: "SUBLcarry", argLength: 2, reg: gp21carry, asm: "SUBL", resultInArg0: true}, // arg0-arg1, generates <borrow,result> pair
+ {name: "SUBLconstcarry", argLength: 1, reg: gp11carry, asm: "SUBL", aux: "Int32", resultInArg0: true}, // arg0-auxint, generates <borrow,result> pair
+ {name: "SBBL", argLength: 3, reg: gp2carry1, asm: "SBBL", resultInArg0: true, clobberFlags: true}, // arg0-arg1-borrow(arg2), where arg2 is flags
+ {name: "SBBLconst", argLength: 2, reg: gp1carry1, asm: "SBBL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0-auxint-borrow(arg1), where arg1 is flags
+
+ {name: "MULL", argLength: 2, reg: gp21, asm: "IMULL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 * arg1
+ {name: "MULLconst", argLength: 1, reg: gp11, asm: "IMUL3L", aux: "Int32", clobberFlags: true}, // arg0 * auxint
+
+ {name: "MULLU", argLength: 2, reg: regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{ax, 0}, clobbers: dx}, typ: "(UInt32,Flags)", asm: "MULL", commutative: true, clobberFlags: true}, // Let x = arg0*arg1 (full 32x32->64 unsigned multiply). Returns uint32(x), and flags set to overflow if uint32(x) != x.
+
+ {name: "HMULL", argLength: 2, reg: gp21hmul, commutative: true, asm: "IMULL", clobberFlags: true}, // (arg0 * arg1) >> width
+ {name: "HMULLU", argLength: 2, reg: gp21hmul, commutative: true, asm: "MULL", clobberFlags: true}, // (arg0 * arg1) >> width
+
+ {name: "MULLQU", argLength: 2, reg: gp21mul, commutative: true, asm: "MULL", clobberFlags: true}, // arg0 * arg1, high 32 in result[0], low 32 in result[1]
+
+ {name: "AVGLU", argLength: 2, reg: gp21, commutative: true, resultInArg0: true, clobberFlags: true}, // (arg0 + arg1) / 2 as unsigned, all 32 result bits
+
+ // For DIVL, DIVW, MODL and MODW, AuxInt non-zero means that the divisor has been proved to be not -1.
+ {name: "DIVL", argLength: 2, reg: gp11div, asm: "IDIVL", aux: "Bool", clobberFlags: true}, // arg0 / arg1
+ {name: "DIVW", argLength: 2, reg: gp11div, asm: "IDIVW", aux: "Bool", clobberFlags: true}, // arg0 / arg1
+ {name: "DIVLU", argLength: 2, reg: gp11div, asm: "DIVL", clobberFlags: true}, // arg0 / arg1
+ {name: "DIVWU", argLength: 2, reg: gp11div, asm: "DIVW", clobberFlags: true}, // arg0 / arg1
+
+ {name: "MODL", argLength: 2, reg: gp11mod, asm: "IDIVL", aux: "Bool", clobberFlags: true}, // arg0 % arg1
+ {name: "MODW", argLength: 2, reg: gp11mod, asm: "IDIVW", aux: "Bool", clobberFlags: true}, // arg0 % arg1
+ {name: "MODLU", argLength: 2, reg: gp11mod, asm: "DIVL", clobberFlags: true}, // arg0 % arg1
+ {name: "MODWU", argLength: 2, reg: gp11mod, asm: "DIVW", clobberFlags: true}, // arg0 % arg1
+
+ {name: "ANDL", argLength: 2, reg: gp21, asm: "ANDL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 & arg1
+ {name: "ANDLconst", argLength: 1, reg: gp11, asm: "ANDL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 & auxint
+
+ {name: "ORL", argLength: 2, reg: gp21, asm: "ORL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 | arg1
+ {name: "ORLconst", argLength: 1, reg: gp11, asm: "ORL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 | auxint
+
+ {name: "XORL", argLength: 2, reg: gp21, asm: "XORL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 ^ arg1
+ {name: "XORLconst", argLength: 1, reg: gp11, asm: "XORL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 ^ auxint
+
+ {name: "CMPL", argLength: 2, reg: gp2flags, asm: "CMPL", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPW", argLength: 2, reg: gp2flags, asm: "CMPW", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPB", argLength: 2, reg: gp2flags, asm: "CMPB", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPLconst", argLength: 1, reg: gp1flags, asm: "CMPL", typ: "Flags", aux: "Int32"}, // arg0 compare to auxint
+ {name: "CMPWconst", argLength: 1, reg: gp1flags, asm: "CMPW", typ: "Flags", aux: "Int16"}, // arg0 compare to auxint
+ {name: "CMPBconst", argLength: 1, reg: gp1flags, asm: "CMPB", typ: "Flags", aux: "Int8"}, // arg0 compare to auxint
+
+ // compare *(arg0+auxint+aux) to arg1 (in that order). arg2=mem.
+ {name: "CMPLload", argLength: 3, reg: gp1flagsLoad, asm: "CMPL", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+ {name: "CMPWload", argLength: 3, reg: gp1flagsLoad, asm: "CMPW", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+ {name: "CMPBload", argLength: 3, reg: gp1flagsLoad, asm: "CMPB", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+
+ // compare *(arg0+ValAndOff(AuxInt).Off()+aux) to ValAndOff(AuxInt).Val() (in that order). arg1=mem.
+ {name: "CMPLconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPL", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+ {name: "CMPWconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPW", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+ {name: "CMPBconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPB", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+
+ {name: "UCOMISS", argLength: 2, reg: fp2flags, asm: "UCOMISS", typ: "Flags", usesScratch: true}, // arg0 compare to arg1, f32
+ {name: "UCOMISD", argLength: 2, reg: fp2flags, asm: "UCOMISD", typ: "Flags", usesScratch: true}, // arg0 compare to arg1, f64
+
+ {name: "TESTL", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTL", typ: "Flags"}, // (arg0 & arg1) compare to 0
+ {name: "TESTW", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTW", typ: "Flags"}, // (arg0 & arg1) compare to 0
+ {name: "TESTB", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTB", typ: "Flags"}, // (arg0 & arg1) compare to 0
+ {name: "TESTLconst", argLength: 1, reg: gp1flags, asm: "TESTL", typ: "Flags", aux: "Int32"}, // (arg0 & auxint) compare to 0
+ {name: "TESTWconst", argLength: 1, reg: gp1flags, asm: "TESTW", typ: "Flags", aux: "Int16"}, // (arg0 & auxint) compare to 0
+ {name: "TESTBconst", argLength: 1, reg: gp1flags, asm: "TESTB", typ: "Flags", aux: "Int8"}, // (arg0 & auxint) compare to 0
+
+ {name: "SHLL", argLength: 2, reg: gp21shift, asm: "SHLL", resultInArg0: true, clobberFlags: true}, // arg0 << arg1, shift amount is mod 32
+ {name: "SHLLconst", argLength: 1, reg: gp11, asm: "SHLL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 << auxint, shift amount 0-31
+ // Note: x86 is weird, the 16- and 8-bit shifts still use all 5 bits of shift amount!
+
+ {name: "SHRL", argLength: 2, reg: gp21shift, asm: "SHRL", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> arg1, shift amount is mod 32
+ {name: "SHRW", argLength: 2, reg: gp21shift, asm: "SHRW", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> arg1, shift amount is mod 32
+ {name: "SHRB", argLength: 2, reg: gp21shift, asm: "SHRB", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> arg1, shift amount is mod 32
+ {name: "SHRLconst", argLength: 1, reg: gp11, asm: "SHRL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> auxint, shift amount 0-31
+ {name: "SHRWconst", argLength: 1, reg: gp11, asm: "SHRW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> auxint, shift amount 0-15
+ {name: "SHRBconst", argLength: 1, reg: gp11, asm: "SHRB", aux: "Int8", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> auxint, shift amount 0-7
+
+ {name: "SARL", argLength: 2, reg: gp21shift, asm: "SARL", resultInArg0: true, clobberFlags: true}, // signed arg0 >> arg1, shift amount is mod 32
+ {name: "SARW", argLength: 2, reg: gp21shift, asm: "SARW", resultInArg0: true, clobberFlags: true}, // signed arg0 >> arg1, shift amount is mod 32
+ {name: "SARB", argLength: 2, reg: gp21shift, asm: "SARB", resultInArg0: true, clobberFlags: true}, // signed arg0 >> arg1, shift amount is mod 32
+ {name: "SARLconst", argLength: 1, reg: gp11, asm: "SARL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-31
+ {name: "SARWconst", argLength: 1, reg: gp11, asm: "SARW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-15
+ {name: "SARBconst", argLength: 1, reg: gp11, asm: "SARB", aux: "Int8", resultInArg0: true, clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-7
+
+ {name: "ROLLconst", argLength: 1, reg: gp11, asm: "ROLL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-31
+ {name: "ROLWconst", argLength: 1, reg: gp11, asm: "ROLW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-15
+ {name: "ROLBconst", argLength: 1, reg: gp11, asm: "ROLB", aux: "Int8", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-7
+
+ // binary-op with a memory source operand
+ {name: "ADDLload", argLength: 3, reg: gp21load, asm: "ADDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 + tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "SUBLload", argLength: 3, reg: gp21load, asm: "SUBL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 - tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "MULLload", argLength: 3, reg: gp21load, asm: "IMULL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 * tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "ANDLload", argLength: 3, reg: gp21load, asm: "ANDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 & tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "ORLload", argLength: 3, reg: gp21load, asm: "ORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 | tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "XORLload", argLength: 3, reg: gp21load, asm: "XORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+
+ // binary-op with an indexed memory source operand
+ {name: "ADDLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ADDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 + tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem
+ {name: "SUBLloadidx4", argLength: 4, reg: gp21loadidx, asm: "SUBL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 - tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem
+ {name: "MULLloadidx4", argLength: 4, reg: gp21loadidx, asm: "IMULL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 * tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem
+ {name: "ANDLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ANDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 & tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem
+ {name: "ORLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 | tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem
+ {name: "XORLloadidx4", argLength: 4, reg: gp21loadidx, asm: "XORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem
+
+ // unary ops
+ {name: "NEGL", argLength: 1, reg: gp11, asm: "NEGL", resultInArg0: true, clobberFlags: true}, // -arg0
+
+ {name: "NOTL", argLength: 1, reg: gp11, asm: "NOTL", resultInArg0: true, clobberFlags: true}, // ^arg0
+
+ {name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
+ {name: "BSFW", argLength: 1, reg: gp11, asm: "BSFW", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
+
+ {name: "BSRL", argLength: 1, reg: gp11, asm: "BSRL", clobberFlags: true}, // index of the highest set bit in arg0 ; undef if zero
+ {name: "BSRW", argLength: 1, reg: gp11, asm: "BSRW", clobberFlags: true}, // index of the highest set bit in arg0 ; undef if zero
+
+ {name: "BSWAPL", argLength: 1, reg: gp11, asm: "BSWAPL", resultInArg0: true, clobberFlags: true}, // arg0 swap bytes
+
+ {name: "SQRTSD", argLength: 1, reg: fp11, asm: "SQRTSD"}, // sqrt(arg0)
+
+ {name: "SBBLcarrymask", argLength: 1, reg: flagsgp, asm: "SBBL"}, // (int32)(-1) if carry is set, 0 if carry is clear.
+ // Note: SBBW and SBBB are subsumed by SBBL
+
+ {name: "SETEQ", argLength: 1, reg: readflags, asm: "SETEQ"}, // extract == condition from arg0
+ {name: "SETNE", argLength: 1, reg: readflags, asm: "SETNE"}, // extract != condition from arg0
+ {name: "SETL", argLength: 1, reg: readflags, asm: "SETLT"}, // extract signed < condition from arg0
+ {name: "SETLE", argLength: 1, reg: readflags, asm: "SETLE"}, // extract signed <= condition from arg0
+ {name: "SETG", argLength: 1, reg: readflags, asm: "SETGT"}, // extract signed > condition from arg0
+ {name: "SETGE", argLength: 1, reg: readflags, asm: "SETGE"}, // extract signed >= condition from arg0
+ {name: "SETB", argLength: 1, reg: readflags, asm: "SETCS"}, // extract unsigned < condition from arg0
+ {name: "SETBE", argLength: 1, reg: readflags, asm: "SETLS"}, // extract unsigned <= condition from arg0
+ {name: "SETA", argLength: 1, reg: readflags, asm: "SETHI"}, // extract unsigned > condition from arg0
+ {name: "SETAE", argLength: 1, reg: readflags, asm: "SETCC"}, // extract unsigned >= condition from arg0
+ {name: "SETO", argLength: 1, reg: readflags, asm: "SETOS"}, // extract if overflow flag is set from arg0
+ // Need different opcodes for floating point conditions because
+ // any comparison involving a NaN is always FALSE and thus
+ // the patterns for inverting conditions cannot be used.
+ {name: "SETEQF", argLength: 1, reg: flagsgpax, asm: "SETEQ", clobberFlags: true}, // extract == condition from arg0
+ {name: "SETNEF", argLength: 1, reg: flagsgpax, asm: "SETNE", clobberFlags: true}, // extract != condition from arg0
+ {name: "SETORD", argLength: 1, reg: flagsgp, asm: "SETPC"}, // extract "ordered" (no NaN present) condition from arg0
+ {name: "SETNAN", argLength: 1, reg: flagsgp, asm: "SETPS"}, // extract "unordered" (NaN present) condition from arg0
+
+ {name: "SETGF", argLength: 1, reg: flagsgp, asm: "SETHI"}, // extract floating > condition from arg0
+ {name: "SETGEF", argLength: 1, reg: flagsgp, asm: "SETCC"}, // extract floating >= condition from arg0
+
+ {name: "MOVBLSX", argLength: 1, reg: gp11, asm: "MOVBLSX"}, // sign extend arg0 from int8 to int32
+ {name: "MOVBLZX", argLength: 1, reg: gp11, asm: "MOVBLZX"}, // zero extend arg0 from int8 to int32
+ {name: "MOVWLSX", argLength: 1, reg: gp11, asm: "MOVWLSX"}, // sign extend arg0 from int16 to int32
+ {name: "MOVWLZX", argLength: 1, reg: gp11, asm: "MOVWLZX"}, // zero extend arg0 from int16 to int32
+
+ {name: "MOVLconst", reg: gp01, asm: "MOVL", typ: "UInt32", aux: "Int32", rematerializeable: true}, // 32 low bits of auxint
+
+ {name: "CVTTSD2SL", argLength: 1, reg: fpgp, asm: "CVTTSD2SL", usesScratch: true}, // convert float64 to int32
+ {name: "CVTTSS2SL", argLength: 1, reg: fpgp, asm: "CVTTSS2SL", usesScratch: true}, // convert float32 to int32
+ {name: "CVTSL2SS", argLength: 1, reg: gpfp, asm: "CVTSL2SS", usesScratch: true}, // convert int32 to float32
+ {name: "CVTSL2SD", argLength: 1, reg: gpfp, asm: "CVTSL2SD", usesScratch: true}, // convert int32 to float64
+ {name: "CVTSD2SS", argLength: 1, reg: fp11, asm: "CVTSD2SS", usesScratch: true}, // convert float64 to float32
+ {name: "CVTSS2SD", argLength: 1, reg: fp11, asm: "CVTSS2SD"}, // convert float32 to float64
+
+ {name: "PXOR", argLength: 2, reg: fp21, asm: "PXOR", commutative: true, resultInArg0: true}, // exclusive or, applied to X regs for float negation.
+
+ {name: "LEAL", argLength: 1, reg: gp11sb, aux: "SymOff", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxint + offset encoded in aux
+ {name: "LEAL1", argLength: 2, reg: gp21sb, commutative: true, aux: "SymOff", symEffect: "Addr"}, // arg0 + arg1 + auxint + aux
+ {name: "LEAL2", argLength: 2, reg: gp21sb, aux: "SymOff", symEffect: "Addr"}, // arg0 + 2*arg1 + auxint + aux
+ {name: "LEAL4", argLength: 2, reg: gp21sb, aux: "SymOff", symEffect: "Addr"}, // arg0 + 4*arg1 + auxint + aux
+ {name: "LEAL8", argLength: 2, reg: gp21sb, aux: "SymOff", symEffect: "Addr"}, // arg0 + 8*arg1 + auxint + aux
+ // Note: LEAL{1,2,4,8} must not have OpSB as either argument.
+
+ // auxint+aux == add auxint and the offset of the symbol in aux (if any) to the effective address
+ {name: "MOVBload", argLength: 2, reg: gpload, asm: "MOVBLZX", aux: "SymOff", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load byte from arg0+auxint+aux. arg1=mem. Zero extend.
+ {name: "MOVBLSXload", argLength: 2, reg: gpload, asm: "MOVBLSX", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // ditto, sign extend to int32
+ {name: "MOVWload", argLength: 2, reg: gpload, asm: "MOVWLZX", aux: "SymOff", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load 2 bytes from arg0+auxint+aux. arg1=mem. Zero extend.
+ {name: "MOVWLSXload", argLength: 2, reg: gpload, asm: "MOVWLSX", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // ditto, sign extend to int32
+ {name: "MOVLload", argLength: 2, reg: gpload, asm: "MOVL", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load 4 bytes from arg0+auxint+aux. arg1=mem. Zero extend.
+ {name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store byte in arg1 to arg0+auxint+aux. arg2=mem
+ {name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem
+ {name: "MOVLstore", argLength: 3, reg: gpstore, asm: "MOVL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem
+
+ // direct binary-op on memory (read-modify-write)
+ {name: "ADDLmodify", argLength: 3, reg: gpstore, asm: "ADDL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) += arg1, arg2=mem
+ {name: "SUBLmodify", argLength: 3, reg: gpstore, asm: "SUBL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) -= arg1, arg2=mem
+ {name: "ANDLmodify", argLength: 3, reg: gpstore, asm: "ANDL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) &= arg1, arg2=mem
+ {name: "ORLmodify", argLength: 3, reg: gpstore, asm: "ORL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) |= arg1, arg2=mem
+ {name: "XORLmodify", argLength: 3, reg: gpstore, asm: "XORL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) ^= arg1, arg2=mem
+
+ // direct binary-op on indexed memory (read-modify-write)
+ {name: "ADDLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ADDL", aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+arg1*4+auxint+aux) += arg2, arg3=mem
+ {name: "SUBLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "SUBL", aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+arg1*4+auxint+aux) -= arg2, arg3=mem
+ {name: "ANDLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ANDL", aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+arg1*4+auxint+aux) &= arg2, arg3=mem
+ {name: "ORLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ORL", aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+arg1*4+auxint+aux) |= arg2, arg3=mem
+ {name: "XORLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "XORL", aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+arg1*4+auxint+aux) ^= arg2, arg3=mem
+
+ // direct binary-op on memory with a constant (read-modify-write)
+ {name: "ADDLconstmodify", argLength: 2, reg: gpstoreconst, asm: "ADDL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // add ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+ {name: "ANDLconstmodify", argLength: 2, reg: gpstoreconst, asm: "ANDL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // and ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+ {name: "ORLconstmodify", argLength: 2, reg: gpstoreconst, asm: "ORL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // or ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+ {name: "XORLconstmodify", argLength: 2, reg: gpstoreconst, asm: "XORL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // xor ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+
+ // direct binary-op on indexed memory with a constant (read-modify-write)
+ {name: "ADDLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ADDL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // add ValAndOff(AuxInt).Val() to arg0+arg1*4+ValAndOff(AuxInt).Off()+aux, arg2=mem
+ {name: "ANDLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ANDL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // and ValAndOff(AuxInt).Val() to arg0+arg1*4+ValAndOff(AuxInt).Off()+aux, arg2=mem
+ {name: "ORLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ORL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // or ValAndOff(AuxInt).Val() to arg0+arg1*4+ValAndOff(AuxInt).Off()+aux, arg2=mem
+ {name: "XORLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "XORL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // xor ValAndOff(AuxInt).Val() to arg0+arg1*4+ValAndOff(AuxInt).Off()+aux, arg2=mem
+
+ // indexed loads/stores
+ {name: "MOVBloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVBLZX", aux: "SymOff", symEffect: "Read"}, // load a byte from arg0+arg1+auxint+aux. arg2=mem
+ {name: "MOVWloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVWLZX", aux: "SymOff", symEffect: "Read"}, // load 2 bytes from arg0+arg1+auxint+aux. arg2=mem
+ {name: "MOVWloadidx2", argLength: 3, reg: gploadidx, asm: "MOVWLZX", aux: "SymOff", symEffect: "Read"}, // load 2 bytes from arg0+2*arg1+auxint+aux. arg2=mem
+ {name: "MOVLloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVL", aux: "SymOff", symEffect: "Read"}, // load 4 bytes from arg0+arg1+auxint+aux. arg2=mem
+ {name: "MOVLloadidx4", argLength: 3, reg: gploadidx, asm: "MOVL", aux: "SymOff", symEffect: "Read"}, // load 4 bytes from arg0+4*arg1+auxint+aux. arg2=mem
+ // TODO: sign-extending indexed loads
+ {name: "MOVBstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVB", aux: "SymOff", symEffect: "Write"}, // store byte in arg2 to arg0+arg1+auxint+aux. arg3=mem
+ {name: "MOVWstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVW", aux: "SymOff", symEffect: "Write"}, // store 2 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
+ {name: "MOVWstoreidx2", argLength: 4, reg: gpstoreidx, asm: "MOVW", aux: "SymOff", symEffect: "Write"}, // store 2 bytes in arg2 to arg0+2*arg1+auxint+aux. arg3=mem
+ {name: "MOVLstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVL", aux: "SymOff", symEffect: "Write"}, // store 4 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
+ {name: "MOVLstoreidx4", argLength: 4, reg: gpstoreidx, asm: "MOVL", aux: "SymOff", symEffect: "Write"}, // store 4 bytes in arg2 to arg0+4*arg1+auxint+aux. arg3=mem
+ // TODO: add size-mismatched indexed loads, like MOVBstoreidx4.
+
+ // For storeconst ops, the AuxInt field encodes both
+ // the value to store and an address offset of the store.
+ // Cast AuxInt to a ValAndOff to extract Val and Off fields.
+ {name: "MOVBstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVB", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low byte of ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux. arg1=mem
+ {name: "MOVWstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVW", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low 2 bytes of ...
+ {name: "MOVLstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVL", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low 4 bytes of ...
+
+ {name: "MOVBstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVB", aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low byte of ValAndOff(AuxInt).Val() to arg0+1*arg1+ValAndOff(AuxInt).Off()+aux. arg2=mem
+ {name: "MOVWstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVW", aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low 2 bytes of ... arg1 ...
+ {name: "MOVWstoreconstidx2", argLength: 3, reg: gpstoreconstidx, asm: "MOVW", aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low 2 bytes of ... 2*arg1 ...
+ {name: "MOVLstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low 4 bytes of ... arg1 ...
+ {name: "MOVLstoreconstidx4", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low 4 bytes of ... 4*arg1 ...
+
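The ValAndOff packing used by the constant stores above can be pictured with a small sketch (my own type and layout assumption, not a quote of the compiler's ssa package): the value sits in the high 32 bits of AuxInt and the offset in the low 32 bits.

    // valAndOff is an illustrative stand-in for the real ValAndOff type.
    type valAndOff int64

    func makeVO(val, off int32) valAndOff {
        return valAndOff(int64(val)<<32 | int64(uint32(off)))
    }
    func (vo valAndOff) val() int32 { return int32(vo >> 32) } // the value to store
    func (vo valAndOff) off() int32 { return int32(vo) }       // the address offset
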
+ // arg0 = pointer to start of memory to zero
+ // arg1 = value to store (will always be zero)
+ // arg2 = mem
+ // auxint = offset into duffzero code to start executing
+ // returns mem
+ {
+ name: "DUFFZERO",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("DI"), buildReg("AX")},
+ clobbers: buildReg("DI CX"),
+ // Note: CX is only clobbered when dynamic linking.
+ },
+ faultOnNilArg0: true,
+ },
+
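For the duff-device idea referenced above, a toy sketch may help (purely illustrative Go; the real duffzero is unrolled assembly in the runtime): a run of stores that execution enters part-way through, so the entry offset chosen by auxint decides how many words get cleared.

    // duffZero4 clears the last n words of a 4-word block by "entering"
    // the unrolled store sequence at the right point (hypothetical example).
    func duffZero4(p *[4]uint32, n int) {
        switch n {
        case 4:
            p[0] = 0
            fallthrough
        case 3:
            p[1] = 0
            fallthrough
        case 2:
            p[2] = 0
            fallthrough
        case 1:
            p[3] = 0
        }
    }
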
+ // arg0 = address of memory to zero
+ // arg1 = # of 4-byte words to zero
+ // arg2 = value to store (will always be zero)
+ // arg3 = mem
+ // returns mem
+ {
+ name: "REPSTOSL",
+ argLength: 4,
+ reg: regInfo{
+ inputs: []regMask{buildReg("DI"), buildReg("CX"), buildReg("AX")},
+ clobbers: buildReg("DI CX"),
+ },
+ faultOnNilArg0: true,
+ },
+
+ {name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gpsp, buildReg("DX"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+ {name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+
+ // arg0 = destination pointer
+ // arg1 = source pointer
+ // arg2 = mem
+ // auxint = offset from duffcopy symbol to call
+ // returns memory
+ {
+ name: "DUFFCOPY",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("DI"), buildReg("SI")},
+ clobbers: buildReg("DI SI CX"), // uses CX as a temporary
+ },
+ clobberFlags: true,
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // arg0 = destination pointer
+ // arg1 = source pointer
+ // arg2 = # of 4-byte words to copy
+ // arg3 = mem
+ // returns memory
+ {
+ name: "REPMOVSL",
+ argLength: 4,
+ reg: regInfo{
+ inputs: []regMask{buildReg("DI"), buildReg("SI"), buildReg("CX")},
+ clobbers: buildReg("DI SI CX"),
+ },
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // (InvertFlags (CMPL a b)) == (CMPL b a)
+ // So if we want (SETL (CMPL a b)) but we can't do that because a is a constant,
+ // then we do (SETL (InvertFlags (CMPL b a))) instead.
+ // Rewrites will convert this to (SETG (CMPL b a)).
+ // InvertFlags is a pseudo-op which can't appear in assembly output.
+ {name: "InvertFlags", argLength: 1}, // reverse direction of arg0
+
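One way to read the InvertFlags comment above is as a condition-swap table; the map below is my own illustration of which SET op each condition becomes once the compare operands are reversed (equality is unchanged, signed and unsigned orders each trade places).

    // swappedCond is illustrative only; the real swaps live in the rewrite rules.
    var swappedCond = map[string]string{
        "SETEQ": "SETEQ", "SETNE": "SETNE", // symmetric
        "SETL": "SETG", "SETLE": "SETGE", "SETG": "SETL", "SETGE": "SETLE", // signed
        "SETB": "SETA", "SETBE": "SETAE", "SETA": "SETB", "SETAE": "SETBE", // unsigned
    }
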
+ // Pseudo-ops
+ {name: "LoweredGetG", argLength: 1, reg: gp01}, // arg0=mem
+ // Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
+ // and sorts it to the very beginning of the block to prevent other
+ // use of DX (the closure pointer)
+ {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("DX")}}, zeroWidth: true},
+ // LoweredGetCallerPC evaluates to the PC to which its "caller" will return.
+ // I.e., if f calls g "calls" getcallerpc,
+ // the result should be the PC within f that g will return to.
+ // See runtime/stubs.go for a more detailed discussion.
+ {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true},
+ // LoweredGetCallerSP returns the SP of the caller of the current function.
+ {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true},
+ //arg0=ptr,arg1=mem, returns void. Faults if ptr is nil.
+ {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpsp}}, clobberFlags: true, nilCheck: true, faultOnNilArg0: true},
+
+ // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+ // It saves all GP registers if necessary, but may clobber others.
+ {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("DI"), ax}, clobbers: callerSave &^ gp}, clobberFlags: true, aux: "Sym", symEffect: "None"},
+
+ // There are three of these functions so that they can have three different register inputs.
+ // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
+ // default registers to match so we don't need to copy registers around unnecessarily.
+ {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{dx, bx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{cx, dx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{ax, cx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ // Extend ops are the same as Bounds ops except the indexes are 64-bit.
+ {name: "LoweredPanicExtendA", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{si, dx, bx}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
+ {name: "LoweredPanicExtendB", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{si, cx, dx}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
+ {name: "LoweredPanicExtendC", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{si, ax, cx}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
+
+ // Constant flag values. For any comparison, there are 5 possible
+ // outcomes: the three from the signed total order (<,==,>) and the
+ // three from the unsigned total order. The == cases overlap.
+ // Note: there's a sixth "unordered" outcome for floating-point
+ // comparisons, but we don't use such a beast yet.
+ // These ops are for temporary use by rewrite rules. They
+ // cannot appear in the generated assembly.
+ {name: "FlagEQ"}, // equal
+ {name: "FlagLT_ULT"}, // signed < and unsigned <
+ {name: "FlagLT_UGT"}, // signed < and unsigned >
+ {name: "FlagGT_UGT"}, // signed > and unsigned >
+ {name: "FlagGT_ULT"}, // signed > and unsigned <
+
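As a hedged model of how a comparison of two known constants picks one of these flag values (my own helper, mirroring the five outcomes described above rather than quoting the generated rewrite code):

    // constCompareFlag reports which flag constant CMPL of two known
    // 32-bit constants would fold to (illustrative sketch).
    func constCompareFlag(x, y int32) string {
        switch {
        case x == y:
            return "FlagEQ"
        case x < y && uint32(x) < uint32(y):
            return "FlagLT_ULT"
        case x < y: // signed less, unsigned greater
            return "FlagLT_UGT"
        case uint32(x) < uint32(y): // signed greater, unsigned less
            return "FlagGT_ULT"
        default: // signed greater, unsigned greater
            return "FlagGT_UGT"
        }
    }
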
+ // Special ops for PIC floating-point constants.
+ // MOVSXconst1 loads the address of the constant-pool entry into a register.
+ // MOVSXconst2 loads the constant from that address.
+ // MOVSXconst1 returns a pointer, but we type it as uint32 because it can never point to the Go heap.
+ {name: "MOVSSconst1", reg: gp01, typ: "UInt32", aux: "Float32"},
+ {name: "MOVSDconst1", reg: gp01, typ: "UInt32", aux: "Float64"},
+ {name: "MOVSSconst2", argLength: 1, reg: gpfp, asm: "MOVSS"},
+ {name: "MOVSDconst2", argLength: 1, reg: gpfp, asm: "MOVSD"},
+ }
+
+ var _386blocks = []blockData{
+ {name: "EQ", controls: 1},
+ {name: "NE", controls: 1},
+ {name: "LT", controls: 1},
+ {name: "LE", controls: 1},
+ {name: "GT", controls: 1},
+ {name: "GE", controls: 1},
+ {name: "OS", controls: 1},
+ {name: "OC", controls: 1},
+ {name: "ULT", controls: 1},
+ {name: "ULE", controls: 1},
+ {name: "UGT", controls: 1},
+ {name: "UGE", controls: 1},
+ {name: "EQF", controls: 1},
+ {name: "NEF", controls: 1},
+ {name: "ORD", controls: 1}, // FP, ordered comparison (parity zero)
+ {name: "NAN", controls: 1}, // FP, unordered comparison (parity one)
+ }
+
+ archs = append(archs, arch{
+ name: "386",
+ pkg: "cmd/internal/obj/x86",
+ genfile: "../../x86/ssa.go",
+ ops: _386ops,
+ blocks: _386blocks,
+ regnames: regNames386,
+ gpregmask: gp,
+ fpregmask: fp,
+ framepointerreg: int8(num["BP"]),
+ linkreg: -1, // not used
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/gen/386splitload.rules b/src/cmd/compile/internal/ssa/gen/386splitload.rules
new file mode 100644
index 0000000..ed93b90
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/386splitload.rules
@@ -0,0 +1,11 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// See the top of AMD64splitload.rules for discussion of these rules.
+
+(CMP(L|W|B)load {sym} [off] ptr x mem) => (CMP(L|W|B) (MOV(L|W|B)load {sym} [off] ptr mem) x)
+
+(CMPLconstload {sym} [vo] ptr mem) => (CMPLconst (MOVLload {sym} [vo.Off32()] ptr mem) [vo.Val32()])
+(CMPWconstload {sym} [vo] ptr mem) => (CMPWconst (MOVWload {sym} [vo.Off32()] ptr mem) [vo.Val16()])
+(CMPBconstload {sym} [vo] ptr mem) => (CMPBconst (MOVBload {sym} [vo.Off32()] ptr mem) [vo.Val8()])
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules
new file mode 100644
index 0000000..5de1e1e
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules
@@ -0,0 +1,2216 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Lowering arithmetic
+(Add(64|32|16|8) ...) => (ADD(Q|L|L|L) ...)
+(AddPtr ...) => (ADDQ ...)
+(Add(32|64)F ...) => (ADDS(S|D) ...)
+
+(Sub(64|32|16|8) ...) => (SUB(Q|L|L|L) ...)
+(SubPtr ...) => (SUBQ ...)
+(Sub(32|64)F ...) => (SUBS(S|D) ...)
+
+(Mul(64|32|16|8) ...) => (MUL(Q|L|L|L) ...)
+(Mul(32|64)F ...) => (MULS(S|D) ...)
+
+(Select0 (Mul64uover x y)) => (Select0 <typ.UInt64> (MULQU x y))
+(Select0 (Mul32uover x y)) => (Select0 <typ.UInt32> (MULLU x y))
+(Select1 (Mul(64|32)uover x y)) => (SETO (Select1 <types.TypeFlags> (MUL(Q|L)U x y)))
+
+(Hmul(64|32) ...) => (HMUL(Q|L) ...)
+(Hmul(64|32)u ...) => (HMUL(Q|L)U ...)
+
+(Div(64|32|16) [a] x y) => (Select0 (DIV(Q|L|W) [a] x y))
+(Div8 x y) => (Select0 (DIVW (SignExt8to16 x) (SignExt8to16 y)))
+(Div(64|32|16)u x y) => (Select0 (DIV(Q|L|W)U x y))
+(Div8u x y) => (Select0 (DIVWU (ZeroExt8to16 x) (ZeroExt8to16 y)))
+(Div(32|64)F ...) => (DIVS(S|D) ...)
+
+(Select0 (Add64carry x y c)) =>
+ (Select0 <typ.UInt64> (ADCQ x y (Select1 <types.TypeFlags> (NEGLflags c))))
+(Select1 (Add64carry x y c)) =>
+ (NEGQ <typ.UInt64> (SBBQcarrymask <typ.UInt64> (Select1 <types.TypeFlags> (ADCQ x y (Select1 <types.TypeFlags> (NEGLflags c))))))
+(Select0 (Sub64borrow x y c)) =>
+ (Select0 <typ.UInt64> (SBBQ x y (Select1 <types.TypeFlags> (NEGLflags c))))
+(Select1 (Sub64borrow x y c)) =>
+ (NEGQ <typ.UInt64> (SBBQcarrymask <typ.UInt64> (Select1 <types.TypeFlags> (SBBQ x y (Select1 <types.TypeFlags> (NEGLflags c))))))
+
+// Optimize ADCQ and friends
+(ADCQ x (MOVQconst [c]) carry) && is32Bit(c) => (ADCQconst x [int32(c)] carry)
+(ADCQ x y (FlagEQ)) => (ADDQcarry x y)
+(ADCQconst x [c] (FlagEQ)) => (ADDQconstcarry x [c])
+(ADDQcarry x (MOVQconst [c])) && is32Bit(c) => (ADDQconstcarry x [int32(c)])
+(SBBQ x (MOVQconst [c]) borrow) && is32Bit(c) => (SBBQconst x [int32(c)] borrow)
+(SBBQ x y (FlagEQ)) => (SUBQborrow x y)
+(SBBQconst x [c] (FlagEQ)) => (SUBQconstborrow x [c])
+(SUBQborrow x (MOVQconst [c])) && is32Bit(c) => (SUBQconstborrow x [int32(c)])
+(Select1 (NEGLflags (MOVQconst [0]))) => (FlagEQ)
+(Select1 (NEGLflags (NEGQ (SBBQcarrymask x)))) => x
+
+
+(Mul64uhilo ...) => (MULQU2 ...)
+(Div128u ...) => (DIVQU2 ...)
+
+(Avg64u ...) => (AVGQU ...)
+
+(Mod(64|32|16) [a] x y) => (Select1 (DIV(Q|L|W) [a] x y))
+(Mod8 x y) => (Select1 (DIVW (SignExt8to16 x) (SignExt8to16 y)))
+(Mod(64|32|16)u x y) => (Select1 (DIV(Q|L|W)U x y))
+(Mod8u x y) => (Select1 (DIVWU (ZeroExt8to16 x) (ZeroExt8to16 y)))
+
+(And(64|32|16|8) ...) => (AND(Q|L|L|L) ...)
+(Or(64|32|16|8) ...) => (OR(Q|L|L|L) ...)
+(Xor(64|32|16|8) ...) => (XOR(Q|L|L|L) ...)
+(Com(64|32|16|8) ...) => (NOT(Q|L|L|L) ...)
+
+(Neg(64|32|16|8) ...) => (NEG(Q|L|L|L) ...)
+(Neg32F x) => (PXOR x (MOVSSconst <typ.Float32> [float32(math.Copysign(0, -1))]))
+(Neg64F x) => (PXOR x (MOVSDconst <typ.Float64> [math.Copysign(0, -1)]))
+
+// Lowering boolean ops
+(AndB ...) => (ANDL ...)
+(OrB ...) => (ORL ...)
+(Not x) => (XORLconst [1] x)
+
+// Lowering pointer arithmetic
+(OffPtr [off] ptr) && is32Bit(off) => (ADDQconst [int32(off)] ptr)
+(OffPtr [off] ptr) => (ADDQ (MOVQconst [off]) ptr)
+
+// Lowering other arithmetic
+(Ctz64 <t> x) => (CMOVQEQ (Select0 <t> (BSFQ x)) (MOVQconst <t> [64]) (Select1 <types.TypeFlags> (BSFQ x)))
+(Ctz32 x) => (Select0 (BSFQ (BTSQconst <typ.UInt64> [32] x)))
+(Ctz16 x) => (BSFL (BTSLconst <typ.UInt32> [16] x))
+(Ctz8 x) => (BSFL (BTSLconst <typ.UInt32> [ 8] x))
+
+(Ctz64NonZero x) => (Select0 (BSFQ x))
+(Ctz32NonZero ...) => (BSFL ...)
+(Ctz16NonZero ...) => (BSFL ...)
+(Ctz8NonZero ...) => (BSFL ...)
+
+// BitLen64 of a 64 bit value x requires checking whether x == 0, since BSRQ is undefined when x == 0.
+// However, for zero-extended values, we can cheat a bit, and calculate
+// BSR(x<<1 + 1), which is guaranteed to be non-zero, and which conveniently
+// places the index of the highest set bit where we want it.
+(BitLen64 <t> x) => (ADDQconst [1] (CMOVQEQ <t> (Select0 <t> (BSRQ x)) (MOVQconst <t> [-1]) (Select1 <types.TypeFlags> (BSRQ x))))
+(BitLen32 x) => (Select0 (BSRQ (LEAQ1 <typ.UInt64> [1] (MOVLQZX <typ.UInt64> x) (MOVLQZX <typ.UInt64> x))))
+(BitLen16 x) => (BSRL (LEAL1 <typ.UInt32> [1] (MOVWQZX <typ.UInt32> x) (MOVWQZX <typ.UInt32> x)))
+(BitLen8 x) => (BSRL (LEAL1 <typ.UInt32> [1] (MOVBQZX <typ.UInt32> x) (MOVBQZX <typ.UInt32> x)))
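The shift-and-add-one trick described above can be sanity-checked with a plain Go loop (my own model, assuming BSR returns the index of the highest set bit):

    // bitLenViaBSR mimics BSR(2*x+1): the +1 makes the operand non-zero,
    // and the result lands exactly on bits.Len32(x). Illustrative only.
    func bitLenViaBSR(x uint32) int {
        v := uint64(x)<<1 | 1 // never zero, so "BSR" is well defined
        n := 0                // index of v's highest set bit
        for v > 1 {
            v >>= 1
            n++
        }
        return n // equals bits.Len32(x)
    }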
+
+(Bswap(64|32) ...) => (BSWAP(Q|L) ...)
+
+(PopCount(64|32) ...) => (POPCNT(Q|L) ...)
+(PopCount16 x) => (POPCNTL (MOVWQZX <typ.UInt32> x))
+(PopCount8 x) => (POPCNTL (MOVBQZX <typ.UInt32> x))
+
+(Sqrt ...) => (SQRTSD ...)
+
+(RoundToEven x) => (ROUNDSD [0] x)
+(Floor x) => (ROUNDSD [1] x)
+(Ceil x) => (ROUNDSD [2] x)
+(Trunc x) => (ROUNDSD [3] x)
+
+(FMA x y z) => (VFMADD231SD z x y)
+
+// Lowering extension
+// Note: we always extend to 64 bits even though some ops don't need that many result bits.
+(SignExt8to16 ...) => (MOVBQSX ...)
+(SignExt8to32 ...) => (MOVBQSX ...)
+(SignExt8to64 ...) => (MOVBQSX ...)
+(SignExt16to32 ...) => (MOVWQSX ...)
+(SignExt16to64 ...) => (MOVWQSX ...)
+(SignExt32to64 ...) => (MOVLQSX ...)
+
+(ZeroExt8to16 ...) => (MOVBQZX ...)
+(ZeroExt8to32 ...) => (MOVBQZX ...)
+(ZeroExt8to64 ...) => (MOVBQZX ...)
+(ZeroExt16to32 ...) => (MOVWQZX ...)
+(ZeroExt16to64 ...) => (MOVWQZX ...)
+(ZeroExt32to64 ...) => (MOVLQZX ...)
+
+(Slicemask <t> x) => (SARQconst (NEGQ <t> x) [63])
+
+(SpectreIndex <t> x y) => (CMOVQCC x (MOVQconst [0]) (CMPQ x y))
+(SpectreSliceIndex <t> x y) => (CMOVQHI x (MOVQconst [0]) (CMPQ x y))
+
+// Lowering truncation
+// Because we ignore high parts of registers, truncates are just copies.
+(Trunc16to8 ...) => (Copy ...)
+(Trunc32to8 ...) => (Copy ...)
+(Trunc32to16 ...) => (Copy ...)
+(Trunc64to8 ...) => (Copy ...)
+(Trunc64to16 ...) => (Copy ...)
+(Trunc64to32 ...) => (Copy ...)
+
+// Lowering float <-> int
+(Cvt32to32F ...) => (CVTSL2SS ...)
+(Cvt32to64F ...) => (CVTSL2SD ...)
+(Cvt64to32F ...) => (CVTSQ2SS ...)
+(Cvt64to64F ...) => (CVTSQ2SD ...)
+
+(Cvt32Fto32 ...) => (CVTTSS2SL ...)
+(Cvt32Fto64 ...) => (CVTTSS2SQ ...)
+(Cvt64Fto32 ...) => (CVTTSD2SL ...)
+(Cvt64Fto64 ...) => (CVTTSD2SQ ...)
+
+(Cvt32Fto64F ...) => (CVTSS2SD ...)
+(Cvt64Fto32F ...) => (CVTSD2SS ...)
+
+(Round(32|64)F ...) => (Copy ...)
+
+(CvtBoolToUint8 ...) => (Copy ...)
+
+// Lowering shifts
+// Unsigned shifts need to return 0 if shift amount is >= width of shifted value.
+// result = (arg << shift) & (shift >= argbits ? 0 : 0xffffffffffffffff)
+(Lsh64x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDQ (SHLQ <t> x y) (SBBQcarrymask <t> (CMP(Q|L|W|B)const y [64])))
+(Lsh32x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
+(Lsh16x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
+(Lsh8x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
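The formula in the comment above restated as ordinary Go (a sketch of the intended semantics, not of the generated code): the hardware shift masks its count, and the carry-mask AND forces the result to zero when the count is out of range.

    // lsh64 models the unbounded Lsh64 lowering (illustrative only).
    func lsh64(x, s uint64) uint64 {
        shifted := x << (s & 63) // SHLQ: the CPU uses the count mod 64
        var mask uint64          // SBBQcarrymask: all ones iff s < 64
        if s < 64 {
            mask = ^uint64(0)
        }
        return shifted & mask // the ANDQ in the rules above
    }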
+
+(Lsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SHLQ x y)
+(Lsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SHLL x y)
+(Lsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SHLL x y)
+(Lsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SHLL x y)
+
+(Rsh64Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDQ (SHRQ <t> x y) (SBBQcarrymask <t> (CMP(Q|L|W|B)const y [64])))
+(Rsh32Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
+(Rsh16Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRW <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [16])))
+(Rsh8Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRB <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [8])))
+
+(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SHRQ x y)
+(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SHRL x y)
+(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SHRW x y)
+(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SHRB x y)
+
+// Signed right shift needs to return 0/-1 if shift amount is >= width of shifted value.
+// We implement this by setting the shift value to -1 (all ones) if the shift value is >= width.
+(Rsh64x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (SARQ <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [64])))))
+(Rsh32x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (SARL <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [32])))))
+(Rsh16x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (SARW <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [16])))))
+(Rsh8x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (SARB <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [8])))))
+
+(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SARQ x y)
+(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SARL x y)
+(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SARW x y)
+(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SARB x y)
+
+// Lowering integer comparisons
+(Less(64|32|16|8) x y) => (SETL (CMP(Q|L|W|B) x y))
+(Less(64|32|16|8)U x y) => (SETB (CMP(Q|L|W|B) x y))
+(Leq(64|32|16|8) x y) => (SETLE (CMP(Q|L|W|B) x y))
+(Leq(64|32|16|8)U x y) => (SETBE (CMP(Q|L|W|B) x y))
+(Eq(Ptr|64|32|16|8|B) x y) => (SETEQ (CMP(Q|Q|L|W|B|B) x y))
+(Neq(Ptr|64|32|16|8|B) x y) => (SETNE (CMP(Q|Q|L|W|B|B) x y))
+
+// Lowering floating point comparisons
+// Note Go assembler gets UCOMISx operand order wrong, but it is right here
+// and the operands are reversed when generating assembly language.
+(Eq(32|64)F x y) => (SETEQF (UCOMIS(S|D) x y))
+(Neq(32|64)F x y) => (SETNEF (UCOMIS(S|D) x y))
+// Use SETGF/SETGEF with reversed operands to dodge NaN case.
+(Less(32|64)F x y) => (SETGF (UCOMIS(S|D) y x))
+(Leq(32|64)F x y) => (SETGEF (UCOMIS(S|D) y x))
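Why the operand swap above is safe can be checked in plain Go (a model of the NaN behavior, not compiler output): x < y and y > x are the same predicate, and both are false whenever either operand is NaN, which is exactly what SETGF/SETGEF of the reversed compare delivers.

    // lessF shows the identity the rules rely on (illustrative only).
    func lessF(x, y float64) bool {
        return y > x // same truth table as x < y, including the NaN-false cases
    }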
+
+// Lowering loads
+(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVQload ptr mem)
+(Load <t> ptr mem) && is32BitInt(t) => (MOVLload ptr mem)
+(Load <t> ptr mem) && is16BitInt(t) => (MOVWload ptr mem)
+(Load <t> ptr mem) && (t.IsBoolean() || is8BitInt(t)) => (MOVBload ptr mem)
+(Load <t> ptr mem) && is32BitFloat(t) => (MOVSSload ptr mem)
+(Load <t> ptr mem) && is64BitFloat(t) => (MOVSDload ptr mem)
+
+// Lowering stores
+// These more-specific FP versions of the Store pattern should come first.
+(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (MOVSDstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (MOVSSstore ptr val mem)
+
+(Store {t} ptr val mem) && t.Size() == 8 => (MOVQstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 => (MOVLstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 2 => (MOVWstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem)
+
+// Lowering moves
+(Move [0] _ _ mem) => mem
+(Move [1] dst src mem) => (MOVBstore dst (MOVBload src mem) mem)
+(Move [2] dst src mem) => (MOVWstore dst (MOVWload src mem) mem)
+(Move [4] dst src mem) => (MOVLstore dst (MOVLload src mem) mem)
+(Move [8] dst src mem) => (MOVQstore dst (MOVQload src mem) mem)
+(Move [16] dst src mem) && config.useSSE => (MOVOstore dst (MOVOload src mem) mem)
+(Move [16] dst src mem) && !config.useSSE =>
+ (MOVQstore [8] dst (MOVQload [8] src mem)
+ (MOVQstore dst (MOVQload src mem) mem))
+
+(Move [32] dst src mem) =>
+ (Move [16]
+ (OffPtr <dst.Type> dst [16])
+ (OffPtr <src.Type> src [16])
+ (Move [16] dst src mem))
+
+(Move [48] dst src mem) && config.useSSE =>
+ (Move [32]
+ (OffPtr <dst.Type> dst [16])
+ (OffPtr <src.Type> src [16])
+ (Move [16] dst src mem))
+
+(Move [64] dst src mem) && config.useSSE =>
+ (Move [32]
+ (OffPtr <dst.Type> dst [32])
+ (OffPtr <src.Type> src [32])
+ (Move [32] dst src mem))
+
+(Move [3] dst src mem) =>
+ (MOVBstore [2] dst (MOVBload [2] src mem)
+ (MOVWstore dst (MOVWload src mem) mem))
+(Move [5] dst src mem) =>
+ (MOVBstore [4] dst (MOVBload [4] src mem)
+ (MOVLstore dst (MOVLload src mem) mem))
+(Move [6] dst src mem) =>
+ (MOVWstore [4] dst (MOVWload [4] src mem)
+ (MOVLstore dst (MOVLload src mem) mem))
+(Move [7] dst src mem) =>
+ (MOVLstore [3] dst (MOVLload [3] src mem)
+ (MOVLstore dst (MOVLload src mem) mem))
+(Move [9] dst src mem) =>
+ (MOVBstore [8] dst (MOVBload [8] src mem)
+ (MOVQstore dst (MOVQload src mem) mem))
+(Move [10] dst src mem) =>
+ (MOVWstore [8] dst (MOVWload [8] src mem)
+ (MOVQstore dst (MOVQload src mem) mem))
+(Move [12] dst src mem) =>
+ (MOVLstore [8] dst (MOVLload [8] src mem)
+ (MOVQstore dst (MOVQload src mem) mem))
+(Move [s] dst src mem) && s == 11 || s >= 13 && s <= 15 =>
+ (MOVQstore [int32(s-8)] dst (MOVQload [int32(s-8)] src mem)
+ (MOVQstore dst (MOVQload src mem) mem))
+
+// Adjust moves to be a multiple of 16 bytes.
+(Move [s] dst src mem)
+ && s > 16 && s%16 != 0 && s%16 <= 8 =>
+ (Move [s-s%16]
+ (OffPtr <dst.Type> dst [s%16])
+ (OffPtr <src.Type> src [s%16])
+ (MOVQstore dst (MOVQload src mem) mem))
+(Move [s] dst src mem)
+ && s > 16 && s%16 != 0 && s%16 > 8 && config.useSSE =>
+ (Move [s-s%16]
+ (OffPtr <dst.Type> dst [s%16])
+ (OffPtr <src.Type> src [s%16])
+ (MOVOstore dst (MOVOload src mem) mem))
+(Move [s] dst src mem)
+ && s > 16 && s%16 != 0 && s%16 > 8 && !config.useSSE =>
+ (Move [s-s%16]
+ (OffPtr <dst.Type> dst [s%16])
+ (OffPtr <src.Type> src [s%16])
+ (MOVQstore [8] dst (MOVQload [8] src mem)
+ (MOVQstore dst (MOVQload src mem) mem)))
+
+// Medium copying uses a duff device.
+(Move [s] dst src mem)
+ && s > 64 && s <= 16*64 && s%16 == 0
+ && !config.noDuffDevice && logLargeCopy(v, s) =>
+ (DUFFCOPY [s] dst src mem)
+
+// Large copying uses REP MOVSQ.
+(Move [s] dst src mem) && (s > 16*64 || config.noDuffDevice) && s%8 == 0 && logLargeCopy(v, s) =>
+ (REPMOVSQ dst src (MOVQconst [s/8]) mem)
+
+// Lowering Zero instructions
+(Zero [0] _ mem) => mem
+(Zero [1] destptr mem) => (MOVBstoreconst [makeValAndOff32(0,0)] destptr mem)
+(Zero [2] destptr mem) => (MOVWstoreconst [makeValAndOff32(0,0)] destptr mem)
+(Zero [4] destptr mem) => (MOVLstoreconst [makeValAndOff32(0,0)] destptr mem)
+(Zero [8] destptr mem) => (MOVQstoreconst [makeValAndOff32(0,0)] destptr mem)
+
+(Zero [3] destptr mem) =>
+ (MOVBstoreconst [makeValAndOff32(0,2)] destptr
+ (MOVWstoreconst [makeValAndOff32(0,0)] destptr mem))
+(Zero [5] destptr mem) =>
+ (MOVBstoreconst [makeValAndOff32(0,4)] destptr
+ (MOVLstoreconst [makeValAndOff32(0,0)] destptr mem))
+(Zero [6] destptr mem) =>
+ (MOVWstoreconst [makeValAndOff32(0,4)] destptr
+ (MOVLstoreconst [makeValAndOff32(0,0)] destptr mem))
+(Zero [7] destptr mem) =>
+ (MOVLstoreconst [makeValAndOff32(0,3)] destptr
+ (MOVLstoreconst [makeValAndOff32(0,0)] destptr mem))
+
+// Strip off any fractional word zeroing.
+(Zero [s] destptr mem) && s%8 != 0 && s > 8 && !config.useSSE =>
+ (Zero [s-s%8] (OffPtr <destptr.Type> destptr [s%8])
+ (MOVQstoreconst [makeValAndOff32(0,0)] destptr mem))
+
+// Zero small numbers of words directly.
+(Zero [16] destptr mem) && !config.useSSE =>
+ (MOVQstoreconst [makeValAndOff32(0,8)] destptr
+ (MOVQstoreconst [makeValAndOff32(0,0)] destptr mem))
+(Zero [24] destptr mem) && !config.useSSE =>
+ (MOVQstoreconst [makeValAndOff32(0,16)] destptr
+ (MOVQstoreconst [makeValAndOff32(0,8)] destptr
+ (MOVQstoreconst [makeValAndOff32(0,0)] destptr mem)))
+(Zero [32] destptr mem) && !config.useSSE =>
+ (MOVQstoreconst [makeValAndOff32(0,24)] destptr
+ (MOVQstoreconst [makeValAndOff32(0,16)] destptr
+ (MOVQstoreconst [makeValAndOff32(0,8)] destptr
+ (MOVQstoreconst [makeValAndOff32(0,0)] destptr mem))))
+
+(Zero [s] destptr mem) && s > 8 && s < 16 && config.useSSE =>
+ (MOVQstoreconst [makeValAndOff32(0,int32(s-8))] destptr
+ (MOVQstoreconst [makeValAndOff32(0,0)] destptr mem))
+
+// Adjust zeros to be a multiple of 16 bytes.
+(Zero [s] destptr mem) && s%16 != 0 && s > 16 && s%16 > 8 && config.useSSE =>
+ (Zero [s-s%16] (OffPtr <destptr.Type> destptr [s%16])
+ (MOVOstore destptr (MOVOconst [0]) mem))
+
+(Zero [s] destptr mem) && s%16 != 0 && s > 16 && s%16 <= 8 && config.useSSE =>
+ (Zero [s-s%16] (OffPtr <destptr.Type> destptr [s%16])
+ (MOVQstoreconst [makeValAndOff32(0,0)] destptr mem))
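+// For example, with SSE a 24-byte zero clears the first 8 bytes with a MOVQstoreconst
+// and leaves an aligned (Zero [16] ...) for the MOVOstore rule below:
+//   (Zero [24] destptr mem)
+//     => (Zero [16] (OffPtr <destptr.Type> destptr [8])
+//          (MOVQstoreconst [makeValAndOff32(0,0)] destptr mem))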
+
+(Zero [16] destptr mem) && config.useSSE =>
+ (MOVOstore destptr (MOVOconst [0]) mem)
+(Zero [32] destptr mem) && config.useSSE =>
+ (MOVOstore (OffPtr <destptr.Type> destptr [16]) (MOVOconst [0])
+ (MOVOstore destptr (MOVOconst [0]) mem))
+(Zero [48] destptr mem) && config.useSSE =>
+ (MOVOstore (OffPtr <destptr.Type> destptr [32]) (MOVOconst [0])
+ (MOVOstore (OffPtr <destptr.Type> destptr [16]) (MOVOconst [0])
+ (MOVOstore destptr (MOVOconst [0]) mem)))
+(Zero [64] destptr mem) && config.useSSE =>
+ (MOVOstore (OffPtr <destptr.Type> destptr [48]) (MOVOconst [0])
+ (MOVOstore (OffPtr <destptr.Type> destptr [32]) (MOVOconst [0])
+ (MOVOstore (OffPtr <destptr.Type> destptr [16]) (MOVOconst [0])
+ (MOVOstore destptr (MOVOconst [0]) mem))))
+
+// Medium zeroing uses a duff device.
+(Zero [s] destptr mem)
+ && s > 64 && s <= 1024 && s%16 == 0 && !config.noDuffDevice =>
+ (DUFFZERO [s] destptr (MOVOconst [0]) mem)
+
+// Large zeroing uses REP STOSQ.
+(Zero [s] destptr mem)
+ && (s > 1024 || (config.noDuffDevice && s > 64 || !config.useSSE && s > 32))
+ && s%8 == 0 =>
+ (REPSTOSQ destptr (MOVQconst [s/8]) (MOVQconst [0]) mem)
+
+// Lowering constants
+(Const8 [c]) => (MOVLconst [int32(c)])
+(Const16 [c]) => (MOVLconst [int32(c)])
+(Const32 ...) => (MOVLconst ...)
+(Const64 ...) => (MOVQconst ...)
+(Const32F ...) => (MOVSSconst ...)
+(Const64F ...) => (MOVSDconst ...)
+(ConstNil ) => (MOVQconst [0])
+(ConstBool [c]) => (MOVLconst [b2i32(c)])
+
+// Lowering calls
+(StaticCall ...) => (CALLstatic ...)
+(ClosureCall ...) => (CALLclosure ...)
+(InterCall ...) => (CALLinter ...)
+
+// Lowering conditional moves
+// If the condition is a SETxx, we can just run a CMOV from the comparison that was
+// setting the flags.
+// Legend: HI=unsigned ABOVE, CS=unsigned BELOW, CC=unsigned ABOVE EQUAL, LS=unsigned BELOW EQUAL
+(CondSelect <t> x y (SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) cond)) && (is64BitInt(t) || isPtr(t))
+ => (CMOVQ(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) y x cond)
+(CondSelect <t> x y (SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) cond)) && is32BitInt(t)
+ => (CMOVL(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) y x cond)
+(CondSelect <t> x y (SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) cond)) && is16BitInt(t)
+ => (CMOVW(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) y x cond)
+
+// If the condition does not set the flags, we need to generate a comparison.
+(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 1
+ => (CondSelect <t> x y (MOVBQZX <typ.UInt64> check))
+(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 2
+ => (CondSelect <t> x y (MOVWQZX <typ.UInt64> check))
+(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 4
+ => (CondSelect <t> x y (MOVLQZX <typ.UInt64> check))
+
+(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 8 && (is64BitInt(t) || isPtr(t))
+ => (CMOVQNE y x (CMPQconst [0] check))
+(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 8 && is32BitInt(t)
+ => (CMOVLNE y x (CMPQconst [0] check))
+(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 8 && is16BitInt(t)
+ => (CMOVWNE y x (CMPQconst [0] check))
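+// For example, selecting on a plain bool b (1 byte, not flags) is first widened
+// and then tested against zero; for a 64-bit result this yields
+//   (CondSelect <t> x y b)
+//     => (CondSelect <t> x y (MOVBQZX <typ.UInt64> b))
+//     => (CMOVQNE y x (CMPQconst [0] (MOVBQZX <typ.UInt64> b)))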
+
+// Absorb InvertFlags
+(CMOVQ(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS) x y (InvertFlags cond))
+ => (CMOVQ(EQ|NE|GT|LT|GE|LE|CS|HI|LS|CC) x y cond)
+(CMOVL(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS) x y (InvertFlags cond))
+ => (CMOVL(EQ|NE|GT|LT|GE|LE|CS|HI|LS|CC) x y cond)
+(CMOVW(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS) x y (InvertFlags cond))
+ => (CMOVW(EQ|NE|GT|LT|GE|LE|CS|HI|LS|CC) x y cond)
+
+// Absorb constants generated during lower
+(CMOV(QEQ|QLE|QGE|QCC|QLS|LEQ|LLE|LGE|LCC|LLS|WEQ|WLE|WGE|WCC|WLS) _ x (FlagEQ)) => x
+(CMOV(QNE|QLT|QGT|QCS|QHI|LNE|LLT|LGT|LCS|LHI|WNE|WLT|WGT|WCS|WHI) y _ (FlagEQ)) => y
+(CMOV(QNE|QGT|QGE|QHI|QCC|LNE|LGT|LGE|LHI|LCC|WNE|WGT|WGE|WHI|WCC) _ x (FlagGT_UGT)) => x
+(CMOV(QEQ|QLE|QLT|QLS|QCS|LEQ|LLE|LLT|LLS|LCS|WEQ|WLE|WLT|WLS|WCS) y _ (FlagGT_UGT)) => y
+(CMOV(QNE|QGT|QGE|QLS|QCS|LNE|LGT|LGE|LLS|LCS|WNE|WGT|WGE|WLS|WCS) _ x (FlagGT_ULT)) => x
+(CMOV(QEQ|QLE|QLT|QHI|QCC|LEQ|LLE|LLT|LHI|LCC|WEQ|WLE|WLT|WHI|WCC) y _ (FlagGT_ULT)) => y
+(CMOV(QNE|QLT|QLE|QCS|QLS|LNE|LLT|LLE|LCS|LLS|WNE|WLT|WLE|WCS|WLS) _ x (FlagLT_ULT)) => x
+(CMOV(QEQ|QGT|QGE|QHI|QCC|LEQ|LGT|LGE|LHI|LCC|WEQ|WGT|WGE|WHI|WCC) y _ (FlagLT_ULT)) => y
+(CMOV(QNE|QLT|QLE|QHI|QCC|LNE|LLT|LLE|LHI|LCC|WNE|WLT|WLE|WHI|WCC) _ x (FlagLT_UGT)) => x
+(CMOV(QEQ|QGT|QGE|QCS|QLS|LEQ|LGT|LGE|LCS|LLS|WEQ|WGT|WGE|WCS|WLS) y _ (FlagLT_UGT)) => y
+
+// Miscellaneous
+(IsNonNil p) => (SETNE (TESTQ p p))
+(IsInBounds idx len) => (SETB (CMPQ idx len))
+(IsSliceInBounds idx len) => (SETBE (CMPQ idx len))
+(NilCheck ...) => (LoweredNilCheck ...)
+(GetG ...) => (LoweredGetG ...)
+(GetClosurePtr ...) => (LoweredGetClosurePtr ...)
+(GetCallerPC ...) => (LoweredGetCallerPC ...)
+(GetCallerSP ...) => (LoweredGetCallerSP ...)
+
+(HasCPUFeature {s}) => (SETNE (CMPQconst [0] (LoweredHasCPUFeature {s})))
+(Addr {sym} base) => (LEAQ {sym} base)
+(LocalAddr {sym} base _) => (LEAQ {sym} base)
+
+(MOVBstore [off] {sym} ptr y:(SETL x) mem) && y.Uses == 1 => (SETLstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr y:(SETLE x) mem) && y.Uses == 1 => (SETLEstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr y:(SETG x) mem) && y.Uses == 1 => (SETGstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr y:(SETGE x) mem) && y.Uses == 1 => (SETGEstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr y:(SETEQ x) mem) && y.Uses == 1 => (SETEQstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr y:(SETNE x) mem) && y.Uses == 1 => (SETNEstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr y:(SETB x) mem) && y.Uses == 1 => (SETBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr y:(SETBE x) mem) && y.Uses == 1 => (SETBEstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr y:(SETA x) mem) && y.Uses == 1 => (SETAstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr y:(SETAE x) mem) && y.Uses == 1 => (SETAEstore [off] {sym} ptr x mem)
+
+// block rewrites
+(If (SETL cmp) yes no) => (LT cmp yes no)
+(If (SETLE cmp) yes no) => (LE cmp yes no)
+(If (SETG cmp) yes no) => (GT cmp yes no)
+(If (SETGE cmp) yes no) => (GE cmp yes no)
+(If (SETEQ cmp) yes no) => (EQ cmp yes no)
+(If (SETNE cmp) yes no) => (NE cmp yes no)
+(If (SETB cmp) yes no) => (ULT cmp yes no)
+(If (SETBE cmp) yes no) => (ULE cmp yes no)
+(If (SETA cmp) yes no) => (UGT cmp yes no)
+(If (SETAE cmp) yes no) => (UGE cmp yes no)
+(If (SETO cmp) yes no) => (OS cmp yes no)
+
+// Special case for floating point - LF/LEF not generated
+(If (SETGF cmp) yes no) => (UGT cmp yes no)
+(If (SETGEF cmp) yes no) => (UGE cmp yes no)
+(If (SETEQF cmp) yes no) => (EQF cmp yes no)
+(If (SETNEF cmp) yes no) => (NEF cmp yes no)
+
+(If cond yes no) => (NE (TESTB cond cond) yes no)
+
+// Atomic loads. Other than preserving their ordering with respect to other loads, nothing special here.
+(AtomicLoad8 ptr mem) => (MOVBatomicload ptr mem)
+(AtomicLoad32 ptr mem) => (MOVLatomicload ptr mem)
+(AtomicLoad64 ptr mem) => (MOVQatomicload ptr mem)
+(AtomicLoadPtr ptr mem) => (MOVQatomicload ptr mem)
+
+// Atomic stores. We use XCHG to prevent the hardware reordering a subsequent load.
+// TODO: most runtime uses of atomic stores don't need that property. Use normal stores for those?
+(AtomicStore8 ptr val mem) => (Select1 (XCHGB <types.NewTuple(typ.UInt8,types.TypeMem)> val ptr mem))
+(AtomicStore32 ptr val mem) => (Select1 (XCHGL <types.NewTuple(typ.UInt32,types.TypeMem)> val ptr mem))
+(AtomicStore64 ptr val mem) => (Select1 (XCHGQ <types.NewTuple(typ.UInt64,types.TypeMem)> val ptr mem))
+(AtomicStorePtrNoWB ptr val mem) => (Select1 (XCHGQ <types.NewTuple(typ.BytePtr,types.TypeMem)> val ptr mem))
+
+// Atomic exchanges.
+(AtomicExchange32 ptr val mem) => (XCHGL val ptr mem)
+(AtomicExchange64 ptr val mem) => (XCHGQ val ptr mem)
+
+// Atomic adds.
+(AtomicAdd32 ptr val mem) => (AddTupleFirst32 val (XADDLlock val ptr mem))
+(AtomicAdd64 ptr val mem) => (AddTupleFirst64 val (XADDQlock val ptr mem))
+(Select0 <t> (AddTupleFirst32 val tuple)) => (ADDL val (Select0 <t> tuple))
+(Select1 (AddTupleFirst32 _ tuple)) => (Select1 tuple)
+(Select0 <t> (AddTupleFirst64 val tuple)) => (ADDQ val (Select0 <t> tuple))
+(Select1 (AddTupleFirst64 _ tuple)) => (Select1 tuple)
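+// (XADD returns the old memory value, while the Go atomic add returns the new
+// value, so the Select0 rules above add val back in.)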
+
+// Atomic compare and swap.
+(AtomicCompareAndSwap32 ptr old new_ mem) => (CMPXCHGLlock ptr old new_ mem)
+(AtomicCompareAndSwap64 ptr old new_ mem) => (CMPXCHGQlock ptr old new_ mem)
+
+// Atomic memory updates.
+(AtomicAnd8 ptr val mem) => (ANDBlock ptr val mem)
+(AtomicAnd32 ptr val mem) => (ANDLlock ptr val mem)
+(AtomicOr8 ptr val mem) => (ORBlock ptr val mem)
+(AtomicOr32 ptr val mem) => (ORLlock ptr val mem)
+
+// Write barrier.
+(WB ...) => (LoweredWB ...)
+
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem)
+
+// ***************************
+// Above: lowering rules
+// Below: optimizations
+// ***************************
+// TODO: Should the optimizations be a separate pass?
+
+// Fold boolean tests into blocks
+(NE (TESTB (SETL cmp) (SETL cmp)) yes no) => (LT cmp yes no)
+(NE (TESTB (SETLE cmp) (SETLE cmp)) yes no) => (LE cmp yes no)
+(NE (TESTB (SETG cmp) (SETG cmp)) yes no) => (GT cmp yes no)
+(NE (TESTB (SETGE cmp) (SETGE cmp)) yes no) => (GE cmp yes no)
+(NE (TESTB (SETEQ cmp) (SETEQ cmp)) yes no) => (EQ cmp yes no)
+(NE (TESTB (SETNE cmp) (SETNE cmp)) yes no) => (NE cmp yes no)
+(NE (TESTB (SETB cmp) (SETB cmp)) yes no) => (ULT cmp yes no)
+(NE (TESTB (SETBE cmp) (SETBE cmp)) yes no) => (ULE cmp yes no)
+(NE (TESTB (SETA cmp) (SETA cmp)) yes no) => (UGT cmp yes no)
+(NE (TESTB (SETAE cmp) (SETAE cmp)) yes no) => (UGE cmp yes no)
+(NE (TESTB (SETO cmp) (SETO cmp)) yes no) => (OS cmp yes no)
+
+// Unsigned comparisons to 0/1
+(ULT (TEST(Q|L|W|B) x x) yes no) => (First no yes)
+(UGE (TEST(Q|L|W|B) x x) yes no) => (First yes no)
+(SETB (TEST(Q|L|W|B) x x)) => (ConstBool [false])
+(SETAE (TEST(Q|L|W|B) x x)) => (ConstBool [true])
+
+// x & 1 != 0 -> x & 1
+(SETNE (TEST(B|W)const [1] x)) => (AND(L|L)const [1] x)
+(SETB (BT(L|Q)const [0] x)) => (AND(L|Q)const [1] x)
+
+// Recognize bit tests: a&(1<<b) != 0 for b suitably bounded
+// Note that BTx instructions use the carry bit, so we need to convert tests for the
+// zero flag into tests for the carry flag.
+// ULT and SETB check the carry flag; they are identical to CS and SETCS. Same, mutatis
+// mutandis, for UGE and SETAE, and CC and SETCC.
+((NE|EQ) (TESTL (SHLL (MOVLconst [1]) x) y)) => ((ULT|UGE) (BTL x y))
+((NE|EQ) (TESTQ (SHLQ (MOVQconst [1]) x) y)) => ((ULT|UGE) (BTQ x y))
+((NE|EQ) (TESTLconst [c] x)) && isUint32PowerOfTwo(int64(c))
+ => ((ULT|UGE) (BTLconst [int8(log32(c))] x))
+((NE|EQ) (TESTQconst [c] x)) && isUint64PowerOfTwo(int64(c))
+ => ((ULT|UGE) (BTQconst [int8(log32(c))] x))
+((NE|EQ) (TESTQ (MOVQconst [c]) x)) && isUint64PowerOfTwo(c)
+ => ((ULT|UGE) (BTQconst [int8(log64(c))] x))
+(SET(NE|EQ) (TESTL (SHLL (MOVLconst [1]) x) y)) => (SET(B|AE) (BTL x y))
+(SET(NE|EQ) (TESTQ (SHLQ (MOVQconst [1]) x) y)) => (SET(B|AE) (BTQ x y))
+(SET(NE|EQ) (TESTLconst [c] x)) && isUint32PowerOfTwo(int64(c))
+ => (SET(B|AE) (BTLconst [int8(log32(c))] x))
+(SET(NE|EQ) (TESTQconst [c] x)) && isUint64PowerOfTwo(int64(c))
+ => (SET(B|AE) (BTQconst [int8(log32(c))] x))
+(SET(NE|EQ) (TESTQ (MOVQconst [c]) x)) && isUint64PowerOfTwo(c)
+ => (SET(B|AE) (BTQconst [int8(log64(c))] x))
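+// For example, a test like a&(1<<b) != 0 typically reaches this point as
+// (SETNE (TESTQ (SHLQ (MOVQconst [1]) b) a)) and becomes (SETB (BTQ b a)),
+// a single bit test plus a carry-flag read.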
+// SET..store variant
+(SET(NE|EQ)store [off] {sym} ptr (TESTL (SHLL (MOVLconst [1]) x) y) mem)
+ => (SET(B|AE)store [off] {sym} ptr (BTL x y) mem)
+(SET(NE|EQ)store [off] {sym} ptr (TESTQ (SHLQ (MOVQconst [1]) x) y) mem)
+ => (SET(B|AE)store [off] {sym} ptr (BTQ x y) mem)
+(SET(NE|EQ)store [off] {sym} ptr (TESTLconst [c] x) mem) && isUint32PowerOfTwo(int64(c))
+ => (SET(B|AE)store [off] {sym} ptr (BTLconst [int8(log32(c))] x) mem)
+(SET(NE|EQ)store [off] {sym} ptr (TESTQconst [c] x) mem) && isUint64PowerOfTwo(int64(c))
+ => (SET(B|AE)store [off] {sym} ptr (BTQconst [int8(log32(c))] x) mem)
+(SET(NE|EQ)store [off] {sym} ptr (TESTQ (MOVQconst [c]) x) mem) && isUint64PowerOfTwo(c)
+ => (SET(B|AE)store [off] {sym} ptr (BTQconst [int8(log64(c))] x) mem)
+
+// Handle bit-testing in the form (a>>b)&1 != 0 by building the above rules
+// and further combining shifts.
+(BT(Q|L)const [c] (SHRQconst [d] x)) && (c+d)<64 => (BTQconst [c+d] x)
+(BT(Q|L)const [c] (SHLQconst [d] x)) && c>d => (BT(Q|L)const [c-d] x)
+(BT(Q|L)const [0] s:(SHRQ x y)) => (BTQ y x)
+(BTLconst [c] (SHRLconst [d] x)) && (c+d)<32 => (BTLconst [c+d] x)
+(BTLconst [c] (SHLLconst [d] x)) && c>d => (BTLconst [c-d] x)
+(BTLconst [0] s:(SHRL x y)) => (BTL y x)
+
+// Rewrite a & 1 != 1 into a & 1 == 0.
+// Among other things, this lets us turn (a>>b)&1 != 1 into a bit test.
+(SET(NE|EQ) (CMPLconst [1] s:(ANDLconst [1] _))) => (SET(EQ|NE) (CMPLconst [0] s))
+(SET(NE|EQ)store [off] {sym} ptr (CMPLconst [1] s:(ANDLconst [1] _)) mem) => (SET(EQ|NE)store [off] {sym} ptr (CMPLconst [0] s) mem)
+(SET(NE|EQ) (CMPQconst [1] s:(ANDQconst [1] _))) => (SET(EQ|NE) (CMPQconst [0] s))
+(SET(NE|EQ)store [off] {sym} ptr (CMPQconst [1] s:(ANDQconst [1] _)) mem) => (SET(EQ|NE)store [off] {sym} ptr (CMPQconst [0] s) mem)
+
+// Recognize bit setting (a |= 1<<b) and toggling (a ^= 1<<b)
+(OR(Q|L) (SHL(Q|L) (MOV(Q|L)const [1]) y) x) => (BTS(Q|L) x y)
+(XOR(Q|L) (SHL(Q|L) (MOV(Q|L)const [1]) y) x) => (BTC(Q|L) x y)
+(ORLmodify [off] {sym} ptr s:(SHLL (MOVLconst [1]) <t> x) mem) =>
+ (BTSLmodify [off] {sym} ptr (ANDLconst <t> [31] x) mem)
+(ORQmodify [off] {sym} ptr s:(SHLQ (MOVQconst [1]) <t> x) mem) =>
+ (BTSQmodify [off] {sym} ptr (ANDQconst <t> [63] x) mem)
+(XORLmodify [off] {sym} ptr s:(SHLL (MOVLconst [1]) <t> x) mem) =>
+ (BTCLmodify [off] {sym} ptr (ANDLconst <t> [31] x) mem)
+(XORQmodify [off] {sym} ptr s:(SHLQ (MOVQconst [1]) <t> x) mem) =>
+ (BTCQmodify [off] {sym} ptr (ANDQconst <t> [63] x) mem)
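+// For example, a |= 1<<b on a 64-bit a typically arrives as
+// (ORQ (SHLQ (MOVQconst [1]) b) a) and is rewritten to (BTSQ a b).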
+
+// Convert OR/XOR with a constant into BTS/BTC when that makes the code smaller; the
+// boundary is a constant of 128 (ORL $40,AX is 3 bytes, ORL $80,AX is 6 bytes).
+((ORQ|XORQ)const [c] x) && isUint64PowerOfTwo(int64(c)) && uint64(c) >= 128
+ => (BT(S|C)Qconst [int8(log32(c))] x)
+((ORL|XORL)const [c] x) && isUint32PowerOfTwo(int64(c)) && uint64(c) >= 128
+ => (BT(S|C)Lconst [int8(log32(c))] x)
+((ORQ|XORQ) (MOVQconst [c]) x) && isUint64PowerOfTwo(c) && uint64(c) >= 128
+ => (BT(S|C)Qconst [int8(log64(c))] x)
+((ORL|XORL) (MOVLconst [c]) x) && isUint32PowerOfTwo(int64(c)) && uint64(c) >= 128
+ => (BT(S|C)Lconst [int8(log32(c))] x)
+
+// Recognize bit clearing: a &^= 1<<b
+(AND(Q|L) (NOT(Q|L) (SHL(Q|L) (MOV(Q|L)const [1]) y)) x) => (BTR(Q|L) x y)
+(ANDQconst [c] x) && isUint64PowerOfTwo(int64(^c)) && uint64(^c) >= 128
+ => (BTRQconst [int8(log32(^c))] x)
+(ANDLconst [c] x) && isUint32PowerOfTwo(int64(^c)) && uint64(^c) >= 128
+ => (BTRLconst [int8(log32(^c))] x)
+(ANDQ (MOVQconst [c]) x) && isUint64PowerOfTwo(^c) && uint64(^c) >= 128
+ => (BTRQconst [int8(log64(^c))] x)
+(ANDL (MOVLconst [c]) x) && isUint32PowerOfTwo(int64(^c)) && uint64(^c) >= 128
+ => (BTRLconst [int8(log32(^c))] x)
+(ANDLmodify [off] {sym} ptr (NOTL s:(SHLL (MOVLconst [1]) <t> x)) mem) =>
+ (BTRLmodify [off] {sym} ptr (ANDLconst <t> [31] x) mem)
+(ANDQmodify [off] {sym} ptr (NOTQ s:(SHLQ (MOVQconst [1]) <t> x)) mem) =>
+ (BTRQmodify [off] {sym} ptr (ANDQconst <t> [63] x) mem)
+
+// Special-case bit patterns on first/last bit.
+// generic.rules changes ANDs of high-part/low-part masks into a couple of shifts,
+// for instance:
+// x & 0xFFFF0000 -> (x >> 16) << 16
+// x & 0x80000000 -> (x >> 31) << 31
+//
+// When the mask is just one bit (like the second example above), this conflicts
+// with the rules above that detect bit-testing / bit-clearing of the first/last bit.
+// We thus special-case these patterns by matching the shift forms directly.
+
+// Special case resetting first/last bit
+(SHL(L|Q)const [1] (SHR(L|Q)const [1] x))
+ => (BTR(L|Q)const [0] x)
+(SHRLconst [1] (SHLLconst [1] x))
+ => (BTRLconst [31] x)
+(SHRQconst [1] (SHLQconst [1] x))
+ => (BTRQconst [63] x)
+
+// Special case testing first/last bit (with double-shift generated by generic.rules)
+((SETNE|SETEQ|NE|EQ) (TESTQ z1:(SHLQconst [63] (SHRQconst [63] x)) z2)) && z1==z2
+ => ((SETB|SETAE|ULT|UGE) (BTQconst [63] x))
+((SETNE|SETEQ|NE|EQ) (TESTL z1:(SHLLconst [31] (SHRQconst [31] x)) z2)) && z1==z2
+ => ((SETB|SETAE|ULT|UGE) (BTQconst [31] x))
+(SET(NE|EQ)store [off] {sym} ptr (TESTQ z1:(SHLQconst [63] (SHRQconst [63] x)) z2) mem) && z1==z2
+ => (SET(B|AE)store [off] {sym} ptr (BTQconst [63] x) mem)
+(SET(NE|EQ)store [off] {sym} ptr (TESTL z1:(SHLLconst [31] (SHRLconst [31] x)) z2) mem) && z1==z2
+ => (SET(B|AE)store [off] {sym} ptr (BTLconst [31] x) mem)
+
+((SETNE|SETEQ|NE|EQ) (TESTQ z1:(SHRQconst [63] (SHLQconst [63] x)) z2)) && z1==z2
+ => ((SETB|SETAE|ULT|UGE) (BTQconst [0] x))
+((SETNE|SETEQ|NE|EQ) (TESTL z1:(SHRLconst [31] (SHLLconst [31] x)) z2)) && z1==z2
+ => ((SETB|SETAE|ULT|UGE) (BTLconst [0] x))
+(SET(NE|EQ)store [off] {sym} ptr (TESTQ z1:(SHRQconst [63] (SHLQconst [63] x)) z2) mem) && z1==z2
+ => (SET(B|AE)store [off] {sym} ptr (BTQconst [0] x) mem)
+(SET(NE|EQ)store [off] {sym} ptr (TESTL z1:(SHRLconst [31] (SHLLconst [31] x)) z2) mem) && z1==z2
+ => (SET(B|AE)store [off] {sym} ptr (BTLconst [0] x) mem)
+
+// Special-case manually testing last bit with "a>>63 != 0" (without "&1")
+((SETNE|SETEQ|NE|EQ) (TESTQ z1:(SHRQconst [63] x) z2)) && z1==z2
+ => ((SETB|SETAE|ULT|UGE) (BTQconst [63] x))
+((SETNE|SETEQ|NE|EQ) (TESTL z1:(SHRLconst [31] x) z2)) && z1==z2
+ => ((SETB|SETAE|ULT|UGE) (BTLconst [31] x))
+(SET(NE|EQ)store [off] {sym} ptr (TESTQ z1:(SHRQconst [63] x) z2) mem) && z1==z2
+ => (SET(B|AE)store [off] {sym} ptr (BTQconst [63] x) mem)
+(SET(NE|EQ)store [off] {sym} ptr (TESTL z1:(SHRLconst [31] x) z2) mem) && z1==z2
+ => (SET(B|AE)store [off] {sym} ptr (BTLconst [31] x) mem)
+
+// Fold combinations of bit ops on the same bit. An example is math.Copysign(c,-1).
+(BTS(Q|L)const [c] (BTR(Q|L)const [c] x)) => (BTS(Q|L)const [c] x)
+(BTS(Q|L)const [c] (BTC(Q|L)const [c] x)) => (BTS(Q|L)const [c] x)
+(BTR(Q|L)const [c] (BTS(Q|L)const [c] x)) => (BTR(Q|L)const [c] x)
+(BTR(Q|L)const [c] (BTC(Q|L)const [c] x)) => (BTR(Q|L)const [c] x)
+
+// Fold boolean negation into SETcc.
+(XORLconst [1] (SETNE x)) => (SETEQ x)
+(XORLconst [1] (SETEQ x)) => (SETNE x)
+(XORLconst [1] (SETL x)) => (SETGE x)
+(XORLconst [1] (SETGE x)) => (SETL x)
+(XORLconst [1] (SETLE x)) => (SETG x)
+(XORLconst [1] (SETG x)) => (SETLE x)
+(XORLconst [1] (SETB x)) => (SETAE x)
+(XORLconst [1] (SETAE x)) => (SETB x)
+(XORLconst [1] (SETBE x)) => (SETA x)
+(XORLconst [1] (SETA x)) => (SETBE x)
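+// For example, !(x < y) typically lowers to (XORLconst [1] (SETL (CMPQ x y)))
+// and folds to (SETGE (CMPQ x y)).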
+
+// Special case for floating point - LF/LEF not generated
+(NE (TESTB (SETGF cmp) (SETGF cmp)) yes no) => (UGT cmp yes no)
+(NE (TESTB (SETGEF cmp) (SETGEF cmp)) yes no) => (UGE cmp yes no)
+(NE (TESTB (SETEQF cmp) (SETEQF cmp)) yes no) => (EQF cmp yes no)
+(NE (TESTB (SETNEF cmp) (SETNEF cmp)) yes no) => (NEF cmp yes no)
+
+// Disabled because it interferes with the pattern match above and makes worse code.
+// (SETNEF x) => (ORQ (SETNE <typ.Int8> x) (SETNAN <typ.Int8> x))
+// (SETEQF x) => (ANDQ (SETEQ <typ.Int8> x) (SETORD <typ.Int8> x))
+
+// fold constants into instructions
+(ADDQ x (MOVQconst [c])) && is32Bit(c) => (ADDQconst [int32(c)] x)
+(ADDQ x (MOVLconst [c])) => (ADDQconst [c] x)
+(ADDL x (MOVLconst [c])) => (ADDLconst [c] x)
+
+(SUBQ x (MOVQconst [c])) && is32Bit(c) => (SUBQconst x [int32(c)])
+(SUBQ (MOVQconst [c]) x) && is32Bit(c) => (NEGQ (SUBQconst <v.Type> x [int32(c)]))
+(SUBL x (MOVLconst [c])) => (SUBLconst x [c])
+(SUBL (MOVLconst [c]) x) => (NEGL (SUBLconst <v.Type> x [c]))
+
+(MULQ x (MOVQconst [c])) && is32Bit(c) => (MULQconst [int32(c)] x)
+(MULL x (MOVLconst [c])) => (MULLconst [c] x)
+
+(ANDQ x (MOVQconst [c])) && is32Bit(c) => (ANDQconst [int32(c)] x)
+(ANDL x (MOVLconst [c])) => (ANDLconst [c] x)
+
+(AND(L|Q)const [c] (AND(L|Q)const [d] x)) => (AND(L|Q)const [c & d] x)
+(XOR(L|Q)const [c] (XOR(L|Q)const [d] x)) => (XOR(L|Q)const [c ^ d] x)
+(OR(L|Q)const [c] (OR(L|Q)const [d] x)) => (OR(L|Q)const [c | d] x)
+
+(BTRLconst [c] (ANDLconst [d] x)) => (ANDLconst [d &^ (1<<uint32(c))] x)
+(ANDLconst [c] (BTRLconst [d] x)) => (ANDLconst [c &^ (1<<uint32(d))] x)
+(BTRLconst [c] (BTRLconst [d] x)) => (ANDLconst [^(1<<uint32(c) | 1<<uint32(d))] x)
+
+(BTCLconst [c] (XORLconst [d] x)) => (XORLconst [d ^ 1<<uint32(c)] x)
+(XORLconst [c] (BTCLconst [d] x)) => (XORLconst [c ^ 1<<uint32(d)] x)
+(BTCLconst [c] (BTCLconst [d] x)) => (XORLconst [1<<uint32(c) | 1<<uint32(d)] x)
+
+(BTSLconst [c] (ORLconst [d] x)) => (ORLconst [d | 1<<uint32(c)] x)
+(ORLconst [c] (BTSLconst [d] x)) => (ORLconst [c | 1<<uint32(d)] x)
+(BTSLconst [c] (BTSLconst [d] x)) => (ORLconst [1<<uint32(c) | 1<<uint32(d)] x)
+
+(BTRQconst [c] (ANDQconst [d] x)) && is32Bit(int64(d) &^ (1<<uint32(c))) => (ANDQconst [d &^ (1<<uint32(c))] x)
+(ANDQconst [c] (BTRQconst [d] x)) && is32Bit(int64(c) &^ (1<<uint32(d))) => (ANDQconst [c &^ (1<<uint32(d))] x)
+(BTRQconst [c] (BTRQconst [d] x)) && is32Bit(^(1<<uint32(c) | 1<<uint32(d))) => (ANDQconst [^(1<<uint32(c) | 1<<uint32(d))] x)
+
+(BTCQconst [c] (XORQconst [d] x)) && is32Bit(int64(d) ^ 1<<uint32(c)) => (XORQconst [d ^ 1<<uint32(c)] x)
+(XORQconst [c] (BTCQconst [d] x)) && is32Bit(int64(c) ^ 1<<uint32(d)) => (XORQconst [c ^ 1<<uint32(d)] x)
+(BTCQconst [c] (BTCQconst [d] x)) && is32Bit(1<<uint32(c) ^ 1<<uint32(d)) => (XORQconst [1<<uint32(c) ^ 1<<uint32(d)] x)
+
+(BTSQconst [c] (ORQconst [d] x)) && is32Bit(int64(d) | 1<<uint32(c)) => (ORQconst [d | 1<<uint32(c)] x)
+(ORQconst [c] (BTSQconst [d] x)) && is32Bit(int64(c) | 1<<uint32(d)) => (ORQconst [c | 1<<uint32(d)] x)
+(BTSQconst [c] (BTSQconst [d] x)) && is32Bit(1<<uint32(c) | 1<<uint32(d)) => (ORQconst [1<<uint32(c) | 1<<uint32(d)] x)
+
+
+(MULLconst [c] (MULLconst [d] x)) => (MULLconst [c * d] x)
+(MULQconst [c] (MULQconst [d] x)) && is32Bit(int64(c)*int64(d)) => (MULQconst [c * d] x)
+
+(ORQ x (MOVQconst [c])) && is32Bit(c) => (ORQconst [int32(c)] x)
+(ORQ x (MOVLconst [c])) => (ORQconst [c] x)
+(ORL x (MOVLconst [c])) => (ORLconst [c] x)
+
+(XORQ x (MOVQconst [c])) && is32Bit(c) => (XORQconst [int32(c)] x)
+(XORL x (MOVLconst [c])) => (XORLconst [c] x)
+
+(SHLQ x (MOV(Q|L)const [c])) => (SHLQconst [int8(c&63)] x)
+(SHLL x (MOV(Q|L)const [c])) => (SHLLconst [int8(c&31)] x)
+
+(SHRQ x (MOV(Q|L)const [c])) => (SHRQconst [int8(c&63)] x)
+(SHRL x (MOV(Q|L)const [c])) => (SHRLconst [int8(c&31)] x)
+(SHRW x (MOV(Q|L)const [c])) && c&31 < 16 => (SHRWconst [int8(c&31)] x)
+(SHRW _ (MOV(Q|L)const [c])) && c&31 >= 16 => (MOVLconst [0])
+(SHRB x (MOV(Q|L)const [c])) && c&31 < 8 => (SHRBconst [int8(c&31)] x)
+(SHRB _ (MOV(Q|L)const [c])) && c&31 >= 8 => (MOVLconst [0])
+
+(SARQ x (MOV(Q|L)const [c])) => (SARQconst [int8(c&63)] x)
+(SARL x (MOV(Q|L)const [c])) => (SARLconst [int8(c&31)] x)
+(SARW x (MOV(Q|L)const [c])) => (SARWconst [int8(min(int64(c)&31,15))] x)
+(SARB x (MOV(Q|L)const [c])) => (SARBconst [int8(min(int64(c)&31,7))] x)
+
+
+// Operations which don't affect the low 6/5 bits of the shift amount are NOPs.
+((SHLQ|SHRQ|SARQ) x (ADDQconst [c] y)) && c & 63 == 0 => ((SHLQ|SHRQ|SARQ) x y)
+((SHLQ|SHRQ|SARQ) x (NEGQ <t> (ADDQconst [c] y))) && c & 63 == 0 => ((SHLQ|SHRQ|SARQ) x (NEGQ <t> y))
+((SHLQ|SHRQ|SARQ) x (ANDQconst [c] y)) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ) x y)
+((SHLQ|SHRQ|SARQ) x (NEGQ <t> (ANDQconst [c] y))) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ) x (NEGQ <t> y))
+
+((SHLL|SHRL|SARL) x (ADDQconst [c] y)) && c & 31 == 0 => ((SHLL|SHRL|SARL) x y)
+((SHLL|SHRL|SARL) x (NEGQ <t> (ADDQconst [c] y))) && c & 31 == 0 => ((SHLL|SHRL|SARL) x (NEGQ <t> y))
+((SHLL|SHRL|SARL) x (ANDQconst [c] y)) && c & 31 == 31 => ((SHLL|SHRL|SARL) x y)
+((SHLL|SHRL|SARL) x (NEGQ <t> (ANDQconst [c] y))) && c & 31 == 31 => ((SHLL|SHRL|SARL) x (NEGQ <t> y))
+
+((SHLQ|SHRQ|SARQ) x (ADDLconst [c] y)) && c & 63 == 0 => ((SHLQ|SHRQ|SARQ) x y)
+((SHLQ|SHRQ|SARQ) x (NEGL <t> (ADDLconst [c] y))) && c & 63 == 0 => ((SHLQ|SHRQ|SARQ) x (NEGL <t> y))
+((SHLQ|SHRQ|SARQ) x (ANDLconst [c] y)) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ) x y)
+((SHLQ|SHRQ|SARQ) x (NEGL <t> (ANDLconst [c] y))) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ) x (NEGL <t> y))
+
+((SHLL|SHRL|SARL) x (ADDLconst [c] y)) && c & 31 == 0 => ((SHLL|SHRL|SARL) x y)
+((SHLL|SHRL|SARL) x (NEGL <t> (ADDLconst [c] y))) && c & 31 == 0 => ((SHLL|SHRL|SARL) x (NEGL <t> y))
+((SHLL|SHRL|SARL) x (ANDLconst [c] y)) && c & 31 == 31 => ((SHLL|SHRL|SARL) x y)
+((SHLL|SHRL|SARL) x (NEGL <t> (ANDLconst [c] y))) && c & 31 == 31 => ((SHLL|SHRL|SARL) x (NEGL <t> y))
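+// For example, x << (s & 63) on a 64-bit x typically arrives as
+// (SHLQ x (ANDQconst [63] s)); since SHLQ already uses only the low 6 bits of
+// its shift count, the mask is dropped, leaving (SHLQ x s).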
+
+// Constant rotate instructions
+((ADDQ|ORQ|XORQ) (SHLQconst x [c]) (SHRQconst x [d])) && d==64-c => (ROLQconst x [c])
+((ADDL|ORL|XORL) (SHLLconst x [c]) (SHRLconst x [d])) && d==32-c => (ROLLconst x [c])
+
+((ADDL|ORL|XORL) <t> (SHLLconst x [c]) (SHRWconst x [d])) && d==16-c && c < 16 && t.Size() == 2 => (ROLWconst x [c])
+((ADDL|ORL|XORL) <t> (SHLLconst x [c]) (SHRBconst x [d])) && d==8-c && c < 8 && t.Size() == 1 => (ROLBconst x [c])
+
+(ROLQconst [c] (ROLQconst [d] x)) => (ROLQconst [(c+d)&63] x)
+(ROLLconst [c] (ROLLconst [d] x)) => (ROLLconst [(c+d)&31] x)
+(ROLWconst [c] (ROLWconst [d] x)) => (ROLWconst [(c+d)&15] x)
+(ROLBconst [c] (ROLBconst [d] x)) => (ROLBconst [(c+d)& 7] x)
+
+(RotateLeft8 ...) => (ROLB ...)
+(RotateLeft16 ...) => (ROLW ...)
+(RotateLeft32 ...) => (ROLL ...)
+(RotateLeft64 ...) => (ROLQ ...)
+
+// Non-constant rotates.
+// We want to issue a rotate when the Go source contains code like
+// y &= 63
+// x << y | x >> (64-y)
+// The shift rules above convert << to SHLx and >> to SHRx.
+// SHRx converts its shift argument from 64-y to -y.
+// A tricky situation occurs when y==0. Then the original code would be:
+// x << 0 | x >> 64
+// But x >> 64 is 0, not x. So there's an additional mask that is ANDed in
+// to force the second term to 0. We don't need that mask, but we must match
+// it in order to strip it out.
+(ORQ (SHLQ x y) (ANDQ (SHRQ x (NEG(Q|L) y)) (SBBQcarrymask (CMP(Q|L)const (NEG(Q|L) (ADD(Q|L)const (AND(Q|L)const y [63]) [-64])) [64])))) => (ROLQ x y)
+(ORQ (SHRQ x y) (ANDQ (SHLQ x (NEG(Q|L) y)) (SBBQcarrymask (CMP(Q|L)const (NEG(Q|L) (ADD(Q|L)const (AND(Q|L)const y [63]) [-64])) [64])))) => (RORQ x y)
+
+(ORL (SHLL x y) (ANDL (SHRL x (NEG(Q|L) y)) (SBBLcarrymask (CMP(Q|L)const (NEG(Q|L) (ADD(Q|L)const (AND(Q|L)const y [31]) [-32])) [32])))) => (ROLL x y)
+(ORL (SHRL x y) (ANDL (SHLL x (NEG(Q|L) y)) (SBBLcarrymask (CMP(Q|L)const (NEG(Q|L) (ADD(Q|L)const (AND(Q|L)const y [31]) [-32])) [32])))) => (RORL x y)
+
+// Help with rotate detection
+(CMPQconst (NEGQ (ADDQconst [-16] (ANDQconst [15] _))) [32]) => (FlagLT_ULT)
+(CMPQconst (NEGQ (ADDQconst [ -8] (ANDQconst [7] _))) [32]) => (FlagLT_ULT)
+
+(ORL (SHLL x (AND(Q|L)const y [15]))
+ (ANDL (SHRW x (NEG(Q|L) (ADD(Q|L)const (AND(Q|L)const y [15]) [-16])))
+ (SBBLcarrymask (CMP(Q|L)const (NEG(Q|L) (ADD(Q|L)const (AND(Q|L)const y [15]) [-16])) [16]))))
+ && v.Type.Size() == 2
+ => (ROLW x y)
+(ORL (SHRW x (AND(Q|L)const y [15]))
+ (SHLL x (NEG(Q|L) (ADD(Q|L)const (AND(Q|L)const y [15]) [-16]))))
+ && v.Type.Size() == 2
+ => (RORW x y)
+
+(ORL (SHLL x (AND(Q|L)const y [ 7]))
+ (ANDL (SHRB x (NEG(Q|L) (ADD(Q|L)const (AND(Q|L)const y [ 7]) [ -8])))
+ (SBBLcarrymask (CMP(Q|L)const (NEG(Q|L) (ADD(Q|L)const (AND(Q|L)const y [ 7]) [ -8])) [ 8]))))
+ && v.Type.Size() == 1
+ => (ROLB x y)
+(ORL (SHRB x (AND(Q|L)const y [ 7]))
+ (SHLL x (NEG(Q|L) (ADD(Q|L)const (AND(Q|L)const y [ 7]) [ -8]))))
+ && v.Type.Size() == 1
+ => (RORB x y)
+
+// rotate left negative = rotate right
+(ROLQ x (NEG(Q|L) y)) => (RORQ x y)
+(ROLL x (NEG(Q|L) y)) => (RORL x y)
+(ROLW x (NEG(Q|L) y)) => (RORW x y)
+(ROLB x (NEG(Q|L) y)) => (RORB x y)
+
+// rotate right negative = rotate left
+(RORQ x (NEG(Q|L) y)) => (ROLQ x y)
+(RORL x (NEG(Q|L) y)) => (ROLL x y)
+(RORW x (NEG(Q|L) y)) => (ROLW x y)
+(RORB x (NEG(Q|L) y)) => (ROLB x y)
+
+// rotate by constants
+(ROLQ x (MOV(Q|L)const [c])) => (ROLQconst [int8(c&63)] x)
+(ROLL x (MOV(Q|L)const [c])) => (ROLLconst [int8(c&31)] x)
+(ROLW x (MOV(Q|L)const [c])) => (ROLWconst [int8(c&15)] x)
+(ROLB x (MOV(Q|L)const [c])) => (ROLBconst [int8(c&7) ] x)
+
+(RORQ x (MOV(Q|L)const [c])) => (ROLQconst [int8((-c)&63)] x)
+(RORL x (MOV(Q|L)const [c])) => (ROLLconst [int8((-c)&31)] x)
+(RORW x (MOV(Q|L)const [c])) => (ROLWconst [int8((-c)&15)] x)
+(RORB x (MOV(Q|L)const [c])) => (ROLBconst [int8((-c)&7) ] x)
+
+// Constant shift simplifications
+((SHLQ|SHRQ|SARQ)const x [0]) => x
+((SHLL|SHRL|SARL)const x [0]) => x
+((SHRW|SARW)const x [0]) => x
+((SHRB|SARB)const x [0]) => x
+((ROLQ|ROLL|ROLW|ROLB)const x [0]) => x
+
+// Note: the word and byte shifts keep the low 5 bits (not the low 4 or 3 bits)
+// because the x86 instructions are defined to use all 5 bits of the shift even
+// for the small shifts. I don't think we'll ever generate a weird shift (e.g.
+// (SHRW x (MOVLconst [24]))), but just in case.
+
+(CMPQ x (MOVQconst [c])) && is32Bit(c) => (CMPQconst x [int32(c)])
+(CMPQ (MOVQconst [c]) x) && is32Bit(c) => (InvertFlags (CMPQconst x [int32(c)]))
+(CMPL x (MOVLconst [c])) => (CMPLconst x [c])
+(CMPL (MOVLconst [c]) x) => (InvertFlags (CMPLconst x [c]))
+(CMPW x (MOVLconst [c])) => (CMPWconst x [int16(c)])
+(CMPW (MOVLconst [c]) x) => (InvertFlags (CMPWconst x [int16(c)]))
+(CMPB x (MOVLconst [c])) => (CMPBconst x [int8(c)])
+(CMPB (MOVLconst [c]) x) => (InvertFlags (CMPBconst x [int8(c)]))
+
+// Canonicalize the order of arguments to comparisons - helps with CSE.
+(CMP(Q|L|W|B) x y) && x.ID > y.ID => (InvertFlags (CMP(Q|L|W|B) y x))
+
+// Using MOVZX instead of AND is cheaper.
+(AND(Q|L)const [ 0xFF] x) => (MOVBQZX x)
+(AND(Q|L)const [0xFFFF] x) => (MOVWQZX x)
+// This rule is currently invalid because 0xFFFFFFFF is not representable by a signed int32.
+// Commented out for now; it also cannot trigger, because the is32Bit guard on the
+// ANDQconst lowering rule above prevents 0xFFFFFFFF from matching (for the same reason).
+// Using an alternate form of this rule segfaults some binaries because of
+// adverse interactions with other passes.
+// (ANDQconst [0xFFFFFFFF] x) => (MOVLQZX x)
+
+// strength reduction
+// Assumes the following costs, from https://gmplib.org/~tege/x86-timing.pdf:
+// 1 - addq, shlq, leaq, negq, subq
+// 3 - imulq
+// This limits the rewrites to two instructions.
+// Note that negq always operates in-place,
+// which can require a register-register move
+// to preserve the original value,
+// so it must be used with care.
+(MUL(Q|L)const [-9] x) => (NEG(Q|L) (LEA(Q|L)8 <v.Type> x x))
+(MUL(Q|L)const [-5] x) => (NEG(Q|L) (LEA(Q|L)4 <v.Type> x x))
+(MUL(Q|L)const [-3] x) => (NEG(Q|L) (LEA(Q|L)2 <v.Type> x x))
+(MUL(Q|L)const [-1] x) => (NEG(Q|L) x)
+(MUL(Q|L)const [ 0] _) => (MOV(Q|L)const [0])
+(MUL(Q|L)const [ 1] x) => x
+(MUL(Q|L)const [ 3] x) => (LEA(Q|L)2 x x)
+(MUL(Q|L)const [ 5] x) => (LEA(Q|L)4 x x)
+(MUL(Q|L)const [ 7] x) => (LEA(Q|L)2 x (LEA(Q|L)2 <v.Type> x x))
+(MUL(Q|L)const [ 9] x) => (LEA(Q|L)8 x x)
+(MUL(Q|L)const [11] x) => (LEA(Q|L)2 x (LEA(Q|L)4 <v.Type> x x))
+(MUL(Q|L)const [13] x) => (LEA(Q|L)4 x (LEA(Q|L)2 <v.Type> x x))
+(MUL(Q|L)const [19] x) => (LEA(Q|L)2 x (LEA(Q|L)8 <v.Type> x x))
+(MUL(Q|L)const [21] x) => (LEA(Q|L)4 x (LEA(Q|L)4 <v.Type> x x))
+(MUL(Q|L)const [25] x) => (LEA(Q|L)8 x (LEA(Q|L)2 <v.Type> x x))
+(MUL(Q|L)const [27] x) => (LEA(Q|L)8 (LEA(Q|L)2 <v.Type> x x) (LEA(Q|L)2 <v.Type> x x))
+(MUL(Q|L)const [37] x) => (LEA(Q|L)4 x (LEA(Q|L)8 <v.Type> x x))
+(MUL(Q|L)const [41] x) => (LEA(Q|L)8 x (LEA(Q|L)4 <v.Type> x x))
+(MUL(Q|L)const [45] x) => (LEA(Q|L)8 (LEA(Q|L)4 <v.Type> x x) (LEA(Q|L)4 <v.Type> x x))
+(MUL(Q|L)const [73] x) => (LEA(Q|L)8 x (LEA(Q|L)8 <v.Type> x x))
+(MUL(Q|L)const [81] x) => (LEA(Q|L)8 (LEA(Q|L)8 <v.Type> x x) (LEA(Q|L)8 <v.Type> x x))
+
+(MUL(Q|L)const [c] x) && isPowerOfTwo64(int64(c)+1) && c >= 15 => (SUB(Q|L) (SHL(Q|L)const <v.Type> [int8(log64(int64(c)+1))] x) x)
+(MUL(Q|L)const [c] x) && isPowerOfTwo32(c-1) && c >= 17 => (LEA(Q|L)1 (SHL(Q|L)const <v.Type> [int8(log32(c-1))] x) x)
+(MUL(Q|L)const [c] x) && isPowerOfTwo32(c-2) && c >= 34 => (LEA(Q|L)2 (SHL(Q|L)const <v.Type> [int8(log32(c-2))] x) x)
+(MUL(Q|L)const [c] x) && isPowerOfTwo32(c-4) && c >= 68 => (LEA(Q|L)4 (SHL(Q|L)const <v.Type> [int8(log32(c-4))] x) x)
+(MUL(Q|L)const [c] x) && isPowerOfTwo32(c-8) && c >= 136 => (LEA(Q|L)8 (SHL(Q|L)const <v.Type> [int8(log32(c-8))] x) x)
+(MUL(Q|L)const [c] x) && c%3 == 0 && isPowerOfTwo32(c/3) => (SHL(Q|L)const [int8(log32(c/3))] (LEA(Q|L)2 <v.Type> x x))
+(MUL(Q|L)const [c] x) && c%5 == 0 && isPowerOfTwo32(c/5) => (SHL(Q|L)const [int8(log32(c/5))] (LEA(Q|L)4 <v.Type> x x))
+(MUL(Q|L)const [c] x) && c%9 == 0 && isPowerOfTwo32(c/9) => (SHL(Q|L)const [int8(log32(c/9))] (LEA(Q|L)8 <v.Type> x x))
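+// For example, (MULQconst [11] x) becomes (LEAQ2 x (LEAQ4 <v.Type> x x)),
+// i.e. x + 2*(x + 4*x) = 11*x, and (MULQconst [40] x) becomes
+// (SHLQconst [3] (LEAQ4 <v.Type> x x)), i.e. (x + 4*x) << 3 = 40*x.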
+
+// combine add/shift into LEAQ/LEAL
+(ADD(L|Q) x (SHL(L|Q)const [3] y)) => (LEA(L|Q)8 x y)
+(ADD(L|Q) x (SHL(L|Q)const [2] y)) => (LEA(L|Q)4 x y)
+(ADD(L|Q) x (SHL(L|Q)const [1] y)) => (LEA(L|Q)2 x y)
+(ADD(L|Q) x (ADD(L|Q) y y)) => (LEA(L|Q)2 x y)
+(ADD(L|Q) x (ADD(L|Q) x y)) => (LEA(L|Q)2 y x)
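+// For example, an address computation like base + 8*idx (indexing a slice of
+// 8-byte elements) typically arrives as (ADDQ base (SHLQconst [3] idx)) and
+// becomes a single (LEAQ8 base idx).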
+
+// combine ADDQ/ADDQconst into LEAQ1/LEAL1
+(ADD(Q|L)const [c] (ADD(Q|L) x y)) => (LEA(Q|L)1 [c] x y)
+(ADD(Q|L) (ADD(Q|L)const [c] x) y) => (LEA(Q|L)1 [c] x y)
+(ADD(Q|L)const [c] (SHL(Q|L)const [1] x)) => (LEA(Q|L)1 [c] x x)
+
+// fold ADDQ/ADDL into LEAQ/LEAL
+(ADD(Q|L)const [c] (LEA(Q|L) [d] {s} x)) && is32Bit(int64(c)+int64(d)) => (LEA(Q|L) [c+d] {s} x)
+(LEA(Q|L) [c] {s} (ADD(Q|L)const [d] x)) && is32Bit(int64(c)+int64(d)) => (LEA(Q|L) [c+d] {s} x)
+(LEA(Q|L) [c] {s} (ADD(Q|L) x y)) && x.Op != OpSB && y.Op != OpSB => (LEA(Q|L)1 [c] {s} x y)
+(ADD(Q|L) x (LEA(Q|L) [c] {s} y)) && x.Op != OpSB && y.Op != OpSB => (LEA(Q|L)1 [c] {s} x y)
+
+// fold ADDQconst/ADDLconst into LEAQx/LEALx
+(ADD(Q|L)const [c] (LEA(Q|L)1 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEA(Q|L)1 [c+d] {s} x y)
+(ADD(Q|L)const [c] (LEA(Q|L)2 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEA(Q|L)2 [c+d] {s} x y)
+(ADD(Q|L)const [c] (LEA(Q|L)4 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEA(Q|L)4 [c+d] {s} x y)
+(ADD(Q|L)const [c] (LEA(Q|L)8 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEA(Q|L)8 [c+d] {s} x y)
+(LEA(Q|L)1 [c] {s} (ADD(Q|L)const [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEA(Q|L)1 [c+d] {s} x y)
+(LEA(Q|L)2 [c] {s} (ADD(Q|L)const [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEA(Q|L)2 [c+d] {s} x y)
+(LEA(Q|L)2 [c] {s} x (ADD(Q|L)const [d] y)) && is32Bit(int64(c)+2*int64(d)) && y.Op != OpSB => (LEA(Q|L)2 [c+2*d] {s} x y)
+(LEA(Q|L)4 [c] {s} (ADD(Q|L)const [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEA(Q|L)4 [c+d] {s} x y)
+(LEA(Q|L)4 [c] {s} x (ADD(Q|L)const [d] y)) && is32Bit(int64(c)+4*int64(d)) && y.Op != OpSB => (LEA(Q|L)4 [c+4*d] {s} x y)
+(LEA(Q|L)8 [c] {s} (ADD(Q|L)const [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEA(Q|L)8 [c+d] {s} x y)
+(LEA(Q|L)8 [c] {s} x (ADD(Q|L)const [d] y)) && is32Bit(int64(c)+8*int64(d)) && y.Op != OpSB => (LEA(Q|L)8 [c+8*d] {s} x y)
+
+// fold shifts into LEAQx/LEALx
+(LEA(Q|L)1 [c] {s} x (SHL(Q|L)const [1] y)) => (LEA(Q|L)2 [c] {s} x y)
+(LEA(Q|L)1 [c] {s} x (SHL(Q|L)const [2] y)) => (LEA(Q|L)4 [c] {s} x y)
+(LEA(Q|L)1 [c] {s} x (SHL(Q|L)const [3] y)) => (LEA(Q|L)8 [c] {s} x y)
+(LEA(Q|L)2 [c] {s} x (SHL(Q|L)const [1] y)) => (LEA(Q|L)4 [c] {s} x y)
+(LEA(Q|L)2 [c] {s} x (SHL(Q|L)const [2] y)) => (LEA(Q|L)8 [c] {s} x y)
+(LEA(Q|L)4 [c] {s} x (SHL(Q|L)const [1] y)) => (LEA(Q|L)8 [c] {s} x y)
+
+// reverse ordering of compare instruction
+(SETL (InvertFlags x)) => (SETG x)
+(SETG (InvertFlags x)) => (SETL x)
+(SETB (InvertFlags x)) => (SETA x)
+(SETA (InvertFlags x)) => (SETB x)
+(SETLE (InvertFlags x)) => (SETGE x)
+(SETGE (InvertFlags x)) => (SETLE x)
+(SETBE (InvertFlags x)) => (SETAE x)
+(SETAE (InvertFlags x)) => (SETBE x)
+(SETEQ (InvertFlags x)) => (SETEQ x)
+(SETNE (InvertFlags x)) => (SETNE x)
+
+(SETLstore [off] {sym} ptr (InvertFlags x) mem) => (SETGstore [off] {sym} ptr x mem)
+(SETGstore [off] {sym} ptr (InvertFlags x) mem) => (SETLstore [off] {sym} ptr x mem)
+(SETBstore [off] {sym} ptr (InvertFlags x) mem) => (SETAstore [off] {sym} ptr x mem)
+(SETAstore [off] {sym} ptr (InvertFlags x) mem) => (SETBstore [off] {sym} ptr x mem)
+(SETLEstore [off] {sym} ptr (InvertFlags x) mem) => (SETGEstore [off] {sym} ptr x mem)
+(SETGEstore [off] {sym} ptr (InvertFlags x) mem) => (SETLEstore [off] {sym} ptr x mem)
+(SETBEstore [off] {sym} ptr (InvertFlags x) mem) => (SETAEstore [off] {sym} ptr x mem)
+(SETAEstore [off] {sym} ptr (InvertFlags x) mem) => (SETBEstore [off] {sym} ptr x mem)
+(SETEQstore [off] {sym} ptr (InvertFlags x) mem) => (SETEQstore [off] {sym} ptr x mem)
+(SETNEstore [off] {sym} ptr (InvertFlags x) mem) => (SETNEstore [off] {sym} ptr x mem)
+
+// sign extended loads
+// Note: The combined instruction must end up in the same block
+// as the original load. If not, we end up making a value with
+// memory type live in two different blocks, which can lead to
+// multiple memory values alive simultaneously.
+// Make sure we don't combine these ops if the load has another use.
+// This prevents a single load from being split into multiple loads
+// which then might return different values. See test/atomicload.go.
+(MOVBQSX x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBQSXload <v.Type> [off] {sym} ptr mem)
+(MOVBQSX x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBQSXload <v.Type> [off] {sym} ptr mem)
+(MOVBQSX x:(MOVLload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBQSXload <v.Type> [off] {sym} ptr mem)
+(MOVBQSX x:(MOVQload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBQSXload <v.Type> [off] {sym} ptr mem)
+(MOVBQZX x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBload <v.Type> [off] {sym} ptr mem)
+(MOVBQZX x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBload <v.Type> [off] {sym} ptr mem)
+(MOVBQZX x:(MOVLload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBload <v.Type> [off] {sym} ptr mem)
+(MOVBQZX x:(MOVQload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBload <v.Type> [off] {sym} ptr mem)
+(MOVWQSX x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWQSXload <v.Type> [off] {sym} ptr mem)
+(MOVWQSX x:(MOVLload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWQSXload <v.Type> [off] {sym} ptr mem)
+(MOVWQSX x:(MOVQload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWQSXload <v.Type> [off] {sym} ptr mem)
+(MOVWQZX x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWload <v.Type> [off] {sym} ptr mem)
+(MOVWQZX x:(MOVLload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWload <v.Type> [off] {sym} ptr mem)
+(MOVWQZX x:(MOVQload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWload <v.Type> [off] {sym} ptr mem)
+(MOVLQSX x:(MOVLload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVLQSXload <v.Type> [off] {sym} ptr mem)
+(MOVLQSX x:(MOVQload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVLQSXload <v.Type> [off] {sym} ptr mem)
+(MOVLQZX x:(MOVLload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVLload <v.Type> [off] {sym} ptr mem)
+(MOVLQZX x:(MOVQload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVLload <v.Type> [off] {sym} ptr mem)
+
+(MOVLQZX x) && zeroUpper32Bits(x,3) => x
+(MOVWQZX x) && zeroUpper48Bits(x,3) => x
+(MOVBQZX x) && zeroUpper56Bits(x,3) => x
+
+// replace load from same location as preceding store with zero/sign extension (or copy in case of full width)
+(MOVBload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBQZX x)
+(MOVWload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVWQZX x)
+(MOVLload [off] {sym} ptr (MOVLstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVLQZX x)
+(MOVQload [off] {sym} ptr (MOVQstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x
+(MOVBQSXload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBQSX x)
+(MOVWQSXload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVWQSX x)
+(MOVLQSXload [off] {sym} ptr (MOVLstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVLQSX x)
+
+// Fold extensions and ANDs together.
+(MOVBQZX (ANDLconst [c] x)) => (ANDLconst [c & 0xff] x)
+(MOVWQZX (ANDLconst [c] x)) => (ANDLconst [c & 0xffff] x)
+(MOVLQZX (ANDLconst [c] x)) => (ANDLconst [c] x)
+(MOVBQSX (ANDLconst [c] x)) && c & 0x80 == 0 => (ANDLconst [c & 0x7f] x)
+(MOVWQSX (ANDLconst [c] x)) && c & 0x8000 == 0 => (ANDLconst [c & 0x7fff] x)
+(MOVLQSX (ANDLconst [c] x)) && uint32(c) & 0x80000000 == 0 => (ANDLconst [c & 0x7fffffff] x)
+
+// Don't extend before storing
+(MOVLstore [off] {sym} ptr (MOVLQSX x) mem) => (MOVLstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWQSX x) mem) => (MOVWstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBQSX x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVLstore [off] {sym} ptr (MOVLQZX x) mem) => (MOVLstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWQZX x) mem) => (MOVWstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBQZX x) mem) => (MOVBstore [off] {sym} ptr x mem)
+
+// fold constants into memory operations
+// Note that this is not always a good idea because if not all the uses of
+// the ADDQconst get eliminated, we still have to compute the ADDQconst and we now
+// have potentially two live values (ptr and (ADDQconst [off] ptr)) instead of one.
+// Nevertheless, let's do it!
+(MOV(Q|L|W|B|SS|SD|O)load [off1] {sym} (ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOV(Q|L|W|B|SS|SD|O)load [off1+off2] {sym} ptr mem)
+(MOV(Q|L|W|B|SS|SD|O)store [off1] {sym} (ADDQconst [off2] ptr) val mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOV(Q|L|W|B|SS|SD|O)store [off1+off2] {sym} ptr val mem)
+(SET(L|G|B|A|LE|GE|BE|AE|EQ|NE)store [off1] {sym} (ADDQconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (SET(L|G|B|A|LE|GE|BE|AE|EQ|NE)store [off1+off2] {sym} base val mem)
+((ADD|SUB|AND|OR|XOR)Qload [off1] {sym} val (ADDQconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ ((ADD|SUB|AND|OR|XOR)Qload [off1+off2] {sym} val base mem)
+((ADD|SUB|AND|OR|XOR)Lload [off1] {sym} val (ADDQconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ ((ADD|SUB|AND|OR|XOR)Lload [off1+off2] {sym} val base mem)
+(CMP(Q|L|W|B)load [off1] {sym} (ADDQconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (CMP(Q|L|W|B)load [off1+off2] {sym} base val mem)
+(CMP(Q|L|W|B)constload [valoff1] {sym} (ADDQconst [off2] base) mem) && ValAndOff(valoff1).canAdd32(off2) =>
+ (CMP(Q|L|W|B)constload [ValAndOff(valoff1).addOffset32(off2)] {sym} base mem)
+
+((ADD|SUB|MUL|DIV)SSload [off1] {sym} val (ADDQconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ ((ADD|SUB|MUL|DIV)SSload [off1+off2] {sym} val base mem)
+((ADD|SUB|MUL|DIV)SDload [off1] {sym} val (ADDQconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ ((ADD|SUB|MUL|DIV)SDload [off1+off2] {sym} val base mem)
+((ADD|AND|OR|XOR|BTC|BTR|BTS)Qconstmodify [valoff1] {sym} (ADDQconst [off2] base) mem) && ValAndOff(valoff1).canAdd32(off2) =>
+ ((ADD|AND|OR|XOR|BTC|BTR|BTS)Qconstmodify [ValAndOff(valoff1).addOffset32(off2)] {sym} base mem)
+((ADD|AND|OR|XOR|BTC|BTR|BTS)Lconstmodify [valoff1] {sym} (ADDQconst [off2] base) mem) && ValAndOff(valoff1).canAdd32(off2) =>
+ ((ADD|AND|OR|XOR|BTC|BTR|BTS)Lconstmodify [ValAndOff(valoff1).addOffset32(off2)] {sym} base mem)
+((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Qmodify [off1] {sym} (ADDQconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) =>
+ ((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Qmodify [off1+off2] {sym} base val mem)
+((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Lmodify [off1] {sym} (ADDQconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) =>
+ ((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Lmodify [off1+off2] {sym} base val mem)
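+// For example, (MOVQload [8] {sym} (ADDQconst [16] ptr) mem) becomes
+// (MOVQload [24] {sym} ptr mem), letting the addressing mode absorb the add.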
+
+// Fold constants into stores.
+(MOVQstore [off] {sym} ptr (MOVQconst [c]) mem) && validVal(c) =>
+ (MOVQstoreconst [makeValAndOff32(int32(c),off)] {sym} ptr mem)
+(MOVLstore [off] {sym} ptr (MOV(L|Q)const [c]) mem) =>
+ (MOVLstoreconst [makeValAndOff32(int32(c),off)] {sym} ptr mem)
+(MOVWstore [off] {sym} ptr (MOV(L|Q)const [c]) mem) =>
+ (MOVWstoreconst [makeValAndOff32(int32(int16(c)),off)] {sym} ptr mem)
+(MOVBstore [off] {sym} ptr (MOV(L|Q)const [c]) mem) =>
+ (MOVBstoreconst [makeValAndOff32(int32(int8(c)),off)] {sym} ptr mem)
+
+// Fold address offsets into constant stores.
+(MOV(Q|L|W|B)storeconst [sc] {s} (ADDQconst [off] ptr) mem) && ValAndOff(sc).canAdd32(off) =>
+ (MOV(Q|L|W|B)storeconst [ValAndOff(sc).addOffset32(off)] {s} ptr mem)
+
+// We need to fold LEAQ into the MOVx ops so that the live variable analysis knows
+// what variables are being read/written by the ops.
+(MOV(Q|L|W|B|SS|SD|O|BQSX|WQSX|LQSX)load [off1] {sym1} (LEAQ [off2] {sym2} base) mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOV(Q|L|W|B|SS|SD|O|BQSX|WQSX|LQSX)load [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOV(Q|L|W|B|SS|SD|O)store [off1] {sym1} (LEAQ [off2] {sym2} base) val mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOV(Q|L|W|B|SS|SD|O)store [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOV(Q|L|W|B)storeconst [sc] {sym1} (LEAQ [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd32(off) =>
+ (MOV(Q|L|W|B)storeconst [ValAndOff(sc).addOffset32(off)] {mergeSym(sym1, sym2)} ptr mem)
+(SET(L|G|B|A|LE|GE|BE|AE|EQ|NE)store [off1] {sym1} (LEAQ [off2] {sym2} base) val mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (SET(L|G|B|A|LE|GE|BE|AE|EQ|NE)store [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+((ADD|SUB|AND|OR|XOR)Qload [off1] {sym1} val (LEAQ [off2] {sym2} base) mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ ((ADD|SUB|AND|OR|XOR)Qload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+((ADD|SUB|AND|OR|XOR)Lload [off1] {sym1} val (LEAQ [off2] {sym2} base) mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ ((ADD|SUB|AND|OR|XOR)Lload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+(CMP(Q|L|W|B)load [off1] {sym1} (LEAQ [off2] {sym2} base) val mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (CMP(Q|L|W|B)load [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(CMP(Q|L|W|B)constload [valoff1] {sym1} (LEAQ [off2] {sym2} base) mem)
+ && ValAndOff(valoff1).canAdd32(off2) && canMergeSym(sym1, sym2) =>
+ (CMP(Q|L|W|B)constload [ValAndOff(valoff1).addOffset32(off2)] {mergeSym(sym1,sym2)} base mem)
+
+((ADD|SUB|MUL|DIV)SSload [off1] {sym1} val (LEAQ [off2] {sym2} base) mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ ((ADD|SUB|MUL|DIV)SSload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+((ADD|SUB|MUL|DIV)SDload [off1] {sym1} val (LEAQ [off2] {sym2} base) mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ ((ADD|SUB|MUL|DIV)SDload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+((ADD|AND|OR|XOR|BTC|BTR|BTS)Qconstmodify [valoff1] {sym1} (LEAQ [off2] {sym2} base) mem)
+ && ValAndOff(valoff1).canAdd32(off2) && canMergeSym(sym1, sym2) =>
+ ((ADD|AND|OR|XOR|BTC|BTR|BTS)Qconstmodify [ValAndOff(valoff1).addOffset32(off2)] {mergeSym(sym1,sym2)} base mem)
+((ADD|AND|OR|XOR|BTC|BTR|BTS)Lconstmodify [valoff1] {sym1} (LEAQ [off2] {sym2} base) mem)
+ && ValAndOff(valoff1).canAdd32(off2) && canMergeSym(sym1, sym2) =>
+ ((ADD|AND|OR|XOR|BTC|BTR|BTS)Lconstmodify [ValAndOff(valoff1).addOffset32(off2)] {mergeSym(sym1,sym2)} base mem)
+((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Qmodify [off1] {sym1} (LEAQ [off2] {sym2} base) val mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ ((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Qmodify [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Lmodify [off1] {sym1} (LEAQ [off2] {sym2} base) val mem)
+ && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ ((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Lmodify [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+
+// fold LEAQs together
+(LEAQ [off1] {sym1} (LEAQ [off2] {sym2} x)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAQ [off1+off2] {mergeSym(sym1,sym2)} x)
+
+// LEAQ into LEAQ1
+(LEAQ1 [off1] {sym1} (LEAQ [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB =>
+ (LEAQ1 [off1+off2] {mergeSym(sym1,sym2)} x y)
+
+// LEAQ1 into LEAQ
+(LEAQ [off1] {sym1} (LEAQ1 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAQ1 [off1+off2] {mergeSym(sym1,sym2)} x y)
+
+// LEAQ into LEAQ[248]
+(LEAQ2 [off1] {sym1} (LEAQ [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB =>
+ (LEAQ2 [off1+off2] {mergeSym(sym1,sym2)} x y)
+(LEAQ4 [off1] {sym1} (LEAQ [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB =>
+ (LEAQ4 [off1+off2] {mergeSym(sym1,sym2)} x y)
+(LEAQ8 [off1] {sym1} (LEAQ [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB =>
+ (LEAQ8 [off1+off2] {mergeSym(sym1,sym2)} x y)
+
+// LEAQ[248] into LEAQ
+(LEAQ [off1] {sym1} (LEAQ2 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAQ2 [off1+off2] {mergeSym(sym1,sym2)} x y)
+(LEAQ [off1] {sym1} (LEAQ4 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAQ4 [off1+off2] {mergeSym(sym1,sym2)} x y)
+(LEAQ [off1] {sym1} (LEAQ8 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAQ8 [off1+off2] {mergeSym(sym1,sym2)} x y)
+
+// LEAQ[1248] into LEAQ[1248]. Only some such merges are possible.
+(LEAQ1 [off1] {sym1} x (LEAQ1 [off2] {sym2} y y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAQ2 [off1+off2] {mergeSym(sym1, sym2)} x y)
+(LEAQ1 [off1] {sym1} x (LEAQ1 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (LEAQ2 [off1+off2] {mergeSym(sym1, sym2)} y x)
+(LEAQ2 [off1] {sym1} x (LEAQ1 [off2] {sym2} y y)) && is32Bit(int64(off1)+2*int64(off2)) && sym2 == nil =>
+ (LEAQ4 [off1+2*off2] {sym1} x y)
+(LEAQ4 [off1] {sym1} x (LEAQ1 [off2] {sym2} y y)) && is32Bit(int64(off1)+4*int64(off2)) && sym2 == nil =>
+ (LEAQ8 [off1+4*off2] {sym1} x y)
+// TODO: more?
+
+// Lower LEAQ2/4/8 when the offset is a constant
+(LEAQ2 [off] {sym} x (MOV(Q|L)const [scale])) && is32Bit(int64(off)+int64(scale)*2) =>
+ (LEAQ [off+int32(scale)*2] {sym} x)
+(LEAQ4 [off] {sym} x (MOV(Q|L)const [scale])) && is32Bit(int64(off)+int64(scale)*4) =>
+ (LEAQ [off+int32(scale)*4] {sym} x)
+(LEAQ8 [off] {sym} x (MOV(Q|L)const [scale])) && is32Bit(int64(off)+int64(scale)*8) =>
+ (LEAQ [off+int32(scale)*8] {sym} x)
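+// For example, (LEAQ4 [8] {sym} x (MOVQconst [5])) becomes (LEAQ [28] {sym} x),
+// since the constant index contributes 5*4 = 20 to the offset.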
+
+// Absorb InvertFlags into branches.
+(LT (InvertFlags cmp) yes no) => (GT cmp yes no)
+(GT (InvertFlags cmp) yes no) => (LT cmp yes no)
+(LE (InvertFlags cmp) yes no) => (GE cmp yes no)
+(GE (InvertFlags cmp) yes no) => (LE cmp yes no)
+(ULT (InvertFlags cmp) yes no) => (UGT cmp yes no)
+(UGT (InvertFlags cmp) yes no) => (ULT cmp yes no)
+(ULE (InvertFlags cmp) yes no) => (UGE cmp yes no)
+(UGE (InvertFlags cmp) yes no) => (ULE cmp yes no)
+(EQ (InvertFlags cmp) yes no) => (EQ cmp yes no)
+(NE (InvertFlags cmp) yes no) => (NE cmp yes no)
+
+// Constant comparisons.
+(CMPQconst (MOVQconst [x]) [y]) && x==int64(y) => (FlagEQ)
+(CMPQconst (MOVQconst [x]) [y]) && x<int64(y) && uint64(x)<uint64(int64(y)) => (FlagLT_ULT)
+(CMPQconst (MOVQconst [x]) [y]) && x<int64(y) && uint64(x)>uint64(int64(y)) => (FlagLT_UGT)
+(CMPQconst (MOVQconst [x]) [y]) && x>int64(y) && uint64(x)<uint64(int64(y)) => (FlagGT_ULT)
+(CMPQconst (MOVQconst [x]) [y]) && x>int64(y) && uint64(x)>uint64(int64(y)) => (FlagGT_UGT)
+(CMPLconst (MOVLconst [x]) [y]) && x==y => (FlagEQ)
+(CMPLconst (MOVLconst [x]) [y]) && x<y && uint32(x)<uint32(y) => (FlagLT_ULT)
+(CMPLconst (MOVLconst [x]) [y]) && x<y && uint32(x)>uint32(y) => (FlagLT_UGT)
+(CMPLconst (MOVLconst [x]) [y]) && x>y && uint32(x)<uint32(y) => (FlagGT_ULT)
+(CMPLconst (MOVLconst [x]) [y]) && x>y && uint32(x)>uint32(y) => (FlagGT_UGT)
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)==y => (FlagEQ)
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)<y && uint16(x)<uint16(y) => (FlagLT_ULT)
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)<y && uint16(x)>uint16(y) => (FlagLT_UGT)
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)>y && uint16(x)<uint16(y) => (FlagGT_ULT)
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)>y && uint16(x)>uint16(y) => (FlagGT_UGT)
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)==y => (FlagEQ)
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)<y && uint8(x)<uint8(y) => (FlagLT_ULT)
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)<y && uint8(x)>uint8(y) => (FlagLT_UGT)
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)>y && uint8(x)<uint8(y) => (FlagGT_ULT)
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)>y && uint8(x)>uint8(y) => (FlagGT_UGT)
+
+// CMPQconst requires a 32 bit const, but we can still constant-fold 64 bit consts.
+// In theory this applies to any of the simplifications above,
+// but CMPQ is the only one I've actually seen occur.
+(CMPQ (MOVQconst [x]) (MOVQconst [y])) && x==y => (FlagEQ)
+(CMPQ (MOVQconst [x]) (MOVQconst [y])) && x<y && uint64(x)<uint64(y) => (FlagLT_ULT)
+(CMPQ (MOVQconst [x]) (MOVQconst [y])) && x<y && uint64(x)>uint64(y) => (FlagLT_UGT)
+(CMPQ (MOVQconst [x]) (MOVQconst [y])) && x>y && uint64(x)<uint64(y) => (FlagGT_ULT)
+(CMPQ (MOVQconst [x]) (MOVQconst [y])) && x>y && uint64(x)>uint64(y) => (FlagGT_UGT)
+
+// Other known comparisons.
+(CMPQconst (MOVBQZX _) [c]) && 0xFF < c => (FlagLT_ULT)
+(CMPQconst (MOVWQZX _) [c]) && 0xFFFF < c => (FlagLT_ULT)
+(CMPLconst (SHRLconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 32 && (1<<uint64(32-c)) <= uint64(n) => (FlagLT_ULT)
+(CMPQconst (SHRQconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 64 && (1<<uint64(64-c)) <= uint64(n) => (FlagLT_ULT)
+(CMPQconst (ANDQconst _ [m]) [n]) && 0 <= m && m < n => (FlagLT_ULT)
+(CMPQconst (ANDLconst _ [m]) [n]) && 0 <= m && m < n => (FlagLT_ULT)
+(CMPLconst (ANDLconst _ [m]) [n]) && 0 <= m && m < n => (FlagLT_ULT)
+(CMPWconst (ANDLconst _ [m]) [n]) && 0 <= int16(m) && int16(m) < n => (FlagLT_ULT)
+(CMPBconst (ANDLconst _ [m]) [n]) && 0 <= int8(m) && int8(m) < n => (FlagLT_ULT)
+
+// TESTQ c c sets flags like CMPQ c 0.
+(TESTQconst [c] (MOVQconst [d])) && int64(c) == d && c == 0 => (FlagEQ)
+(TESTLconst [c] (MOVLconst [c])) && c == 0 => (FlagEQ)
+(TESTQconst [c] (MOVQconst [d])) && int64(c) == d && c < 0 => (FlagLT_UGT)
+(TESTLconst [c] (MOVLconst [c])) && c < 0 => (FlagLT_UGT)
+(TESTQconst [c] (MOVQconst [d])) && int64(c) == d && c > 0 => (FlagGT_UGT)
+(TESTLconst [c] (MOVLconst [c])) && c > 0 => (FlagGT_UGT)
+
+// TODO: DIVxU also.
+
+// Absorb flag constants into SBB ops.
+(SBBQcarrymask (FlagEQ)) => (MOVQconst [0])
+(SBBQcarrymask (FlagLT_ULT)) => (MOVQconst [-1])
+(SBBQcarrymask (FlagLT_UGT)) => (MOVQconst [0])
+(SBBQcarrymask (FlagGT_ULT)) => (MOVQconst [-1])
+(SBBQcarrymask (FlagGT_UGT)) => (MOVQconst [0])
+(SBBLcarrymask (FlagEQ)) => (MOVLconst [0])
+(SBBLcarrymask (FlagLT_ULT)) => (MOVLconst [-1])
+(SBBLcarrymask (FlagLT_UGT)) => (MOVLconst [0])
+(SBBLcarrymask (FlagGT_ULT)) => (MOVLconst [-1])
+(SBBLcarrymask (FlagGT_UGT)) => (MOVLconst [0])
+
+// Absorb flag constants into branches.
+((EQ|LE|GE|ULE|UGE) (FlagEQ) yes no) => (First yes no)
+((NE|LT|GT|ULT|UGT) (FlagEQ) yes no) => (First no yes)
+((NE|LT|LE|ULT|ULE) (FlagLT_ULT) yes no) => (First yes no)
+((EQ|GT|GE|UGT|UGE) (FlagLT_ULT) yes no) => (First no yes)
+((NE|LT|LE|UGT|UGE) (FlagLT_UGT) yes no) => (First yes no)
+((EQ|GT|GE|ULT|ULE) (FlagLT_UGT) yes no) => (First no yes)
+((NE|GT|GE|ULT|ULE) (FlagGT_ULT) yes no) => (First yes no)
+((EQ|LT|LE|UGT|UGE) (FlagGT_ULT) yes no) => (First no yes)
+((NE|GT|GE|UGT|UGE) (FlagGT_UGT) yes no) => (First yes no)
+((EQ|LT|LE|ULT|ULE) (FlagGT_UGT) yes no) => (First no yes)
+
+// Absorb flag constants into SETxx ops.
+((SETEQ|SETLE|SETGE|SETBE|SETAE) (FlagEQ)) => (MOVLconst [1])
+((SETNE|SETL|SETG|SETB|SETA) (FlagEQ)) => (MOVLconst [0])
+((SETNE|SETL|SETLE|SETB|SETBE) (FlagLT_ULT)) => (MOVLconst [1])
+((SETEQ|SETG|SETGE|SETA|SETAE) (FlagLT_ULT)) => (MOVLconst [0])
+((SETNE|SETL|SETLE|SETA|SETAE) (FlagLT_UGT)) => (MOVLconst [1])
+((SETEQ|SETG|SETGE|SETB|SETBE) (FlagLT_UGT)) => (MOVLconst [0])
+((SETNE|SETG|SETGE|SETB|SETBE) (FlagGT_ULT)) => (MOVLconst [1])
+((SETEQ|SETL|SETLE|SETA|SETAE) (FlagGT_ULT)) => (MOVLconst [0])
+((SETNE|SETG|SETGE|SETA|SETAE) (FlagGT_UGT)) => (MOVLconst [1])
+((SETEQ|SETL|SETLE|SETB|SETBE) (FlagGT_UGT)) => (MOVLconst [0])
+
+(SETEQstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETEQstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETEQstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETEQstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETEQstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+
+(SETNEstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETNEstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETNEstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETNEstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETNEstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+
+(SETLstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETLstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETLstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETLstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETLstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+
+(SETLEstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETLEstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETLEstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETLEstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETLEstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+
+(SETGstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETGstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETGstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETGstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETGstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+
+(SETGEstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETGEstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETGEstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETGEstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETGEstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+
+(SETBstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETBstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETBstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETBstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETBstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+
+(SETBEstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETBEstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETBEstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETBEstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETBEstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+
+(SETAstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETAstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETAstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETAstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETAstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+
+(SETAEstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETAEstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETAEstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+(SETAEstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem)
+(SETAEstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem)
+
+// Remove redundant *const ops
+(ADDQconst [0] x) => x
+(ADDLconst [c] x) && c==0 => x
+(SUBQconst [0] x) => x
+(SUBLconst [c] x) && c==0 => x
+(ANDQconst [0] _) => (MOVQconst [0])
+(ANDLconst [c] _) && c==0 => (MOVLconst [0])
+(ANDQconst [-1] x) => x
+(ANDLconst [c] x) && c==-1 => x
+(ORQconst [0] x) => x
+(ORLconst [c] x) && c==0 => x
+(ORQconst [-1] _) => (MOVQconst [-1])
+(ORLconst [c] _) && c==-1 => (MOVLconst [-1])
+(XORQconst [0] x) => x
+(XORLconst [c] x) && c==0 => x
+// TODO: since we got rid of the W/B versions, we might miss
+// things like (ANDLconst [0x100] x) which were formerly
+// (ANDBconst [0] x). Probably doesn't happen very often.
+// If we cared, we might do:
+// (ANDLconst <t> [c] x) && t.Size()==1 && int8(c)==0 -> (MOVLconst [0])
+
+// Remove redundant ops
+// Not in generic rules, because they may appear after lowering, e.g. Slicemask
+(NEG(Q|L) (NEG(Q|L) x)) => x
+(NEG(Q|L) s:(SUB(Q|L) x y)) && s.Uses == 1 => (SUB(Q|L) y x)
+
+// Convert constant subtracts to constant adds
+(SUBQconst [c] x) && c != -(1<<31) => (ADDQconst [-c] x)
+(SUBLconst [c] x) => (ADDLconst [-c] x)
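+// (The guard on SUBQconst avoids negating -1<<31, which would not fit back
+// into the signed 32-bit AuxInt. SUBLconst needs no guard: the negation wraps
+// and the 32-bit add still gives the right result mod 2^32.)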
+
+// generic constant folding
+// TODO: more of this
+(ADDQconst [c] (MOVQconst [d])) => (MOVQconst [int64(c)+d])
+(ADDLconst [c] (MOVLconst [d])) => (MOVLconst [c+d])
+(ADDQconst [c] (ADDQconst [d] x)) && is32Bit(int64(c)+int64(d)) => (ADDQconst [c+d] x)
+(ADDLconst [c] (ADDLconst [d] x)) => (ADDLconst [c+d] x)
+(SUBQconst (MOVQconst [d]) [c]) => (MOVQconst [d-int64(c)])
+(SUBQconst (SUBQconst x [d]) [c]) && is32Bit(int64(-c)-int64(d)) => (ADDQconst [-c-d] x)
+(SARQconst [c] (MOVQconst [d])) => (MOVQconst [d>>uint64(c)])
+(SARLconst [c] (MOVQconst [d])) => (MOVQconst [int64(int32(d))>>uint64(c)])
+(SARWconst [c] (MOVQconst [d])) => (MOVQconst [int64(int16(d))>>uint64(c)])
+(SARBconst [c] (MOVQconst [d])) => (MOVQconst [int64(int8(d))>>uint64(c)])
+(NEGQ (MOVQconst [c])) => (MOVQconst [-c])
+(NEGL (MOVLconst [c])) => (MOVLconst [-c])
+(MULQconst [c] (MOVQconst [d])) => (MOVQconst [int64(c)*d])
+(MULLconst [c] (MOVLconst [d])) => (MOVLconst [c*d])
+(ANDQconst [c] (MOVQconst [d])) => (MOVQconst [int64(c)&d])
+(ANDLconst [c] (MOVLconst [d])) => (MOVLconst [c&d])
+(ORQconst [c] (MOVQconst [d])) => (MOVQconst [int64(c)|d])
+(ORLconst [c] (MOVLconst [d])) => (MOVLconst [c|d])
+(XORQconst [c] (MOVQconst [d])) => (MOVQconst [int64(c)^d])
+(XORLconst [c] (MOVLconst [d])) => (MOVLconst [c^d])
+(NOTQ (MOVQconst [c])) => (MOVQconst [^c])
+(NOTL (MOVLconst [c])) => (MOVLconst [^c])
+(BTSQconst [c] (MOVQconst [d])) => (MOVQconst [d|(1<<uint32(c))])
+(BTSLconst [c] (MOVLconst [d])) => (MOVLconst [d|(1<<uint32(c))])
+(BTRQconst [c] (MOVQconst [d])) => (MOVQconst [d&^(1<<uint32(c))])
+(BTRLconst [c] (MOVLconst [d])) => (MOVLconst [d&^(1<<uint32(c))])
+(BTCQconst [c] (MOVQconst [d])) => (MOVQconst [d^(1<<uint32(c))])
+(BTCLconst [c] (MOVLconst [d])) => (MOVLconst [d^(1<<uint32(c))])
+
+// If c or d doesn't fit into 32 bits, then we can't construct ORQconst,
+// but we can still constant-fold.
+// In theory this applies to any of the simplifications above,
+// but ORQ is the only one I've actually seen occur.
+(ORQ (MOVQconst [c]) (MOVQconst [d])) => (MOVQconst [c|d])
+
+// generic simplifications
+// TODO: more of this
+(ADDQ x (NEGQ y)) => (SUBQ x y)
+(ADDL x (NEGL y)) => (SUBL x y)
+(SUBQ x x) => (MOVQconst [0])
+(SUBL x x) => (MOVLconst [0])
+(ANDQ x x) => x
+(ANDL x x) => x
+(ORQ x x) => x
+(ORL x x) => x
+(XORQ x x) => (MOVQconst [0])
+(XORL x x) => (MOVLconst [0])
+
+(SHLLconst [d] (MOVLconst [c])) => (MOVLconst [c << uint64(d)])
+(SHLQconst [d] (MOVQconst [c])) => (MOVQconst [c << uint64(d)])
+(SHLQconst [d] (MOVLconst [c])) => (MOVQconst [int64(c) << uint64(d)])
+
+// Fold NEG into ADDconst/MULconst. Take care to keep c in 32 bit range.
+(NEGQ (ADDQconst [c] (NEGQ x))) && c != -(1<<31) => (ADDQconst [-c] x)
+(MULQconst [c] (NEGQ x)) && c != -(1<<31) => (MULQconst [-c] x)
+
+// checking AND against 0.
+(CMPQconst a:(ANDQ x y) [0]) && a.Uses == 1 => (TESTQ x y)
+(CMPLconst a:(ANDL x y) [0]) && a.Uses == 1 => (TESTL x y)
+(CMPWconst a:(ANDL x y) [0]) && a.Uses == 1 => (TESTW x y)
+(CMPBconst a:(ANDL x y) [0]) && a.Uses == 1 => (TESTB x y)
+(CMPQconst a:(ANDQconst [c] x) [0]) && a.Uses == 1 => (TESTQconst [c] x)
+(CMPLconst a:(ANDLconst [c] x) [0]) && a.Uses == 1 => (TESTLconst [c] x)
+(CMPWconst a:(ANDLconst [c] x) [0]) && a.Uses == 1 => (TESTWconst [int16(c)] x)
+(CMPBconst a:(ANDLconst [c] x) [0]) && a.Uses == 1 => (TESTBconst [int8(c)] x)
+
+// Convert TESTx to TESTxconst if possible.
+(TESTQ (MOVQconst [c]) x) && is32Bit(c) => (TESTQconst [int32(c)] x)
+(TESTL (MOVLconst [c]) x) => (TESTLconst [c] x)
+(TESTW (MOVLconst [c]) x) => (TESTWconst [int16(c)] x)
+(TESTB (MOVLconst [c]) x) => (TESTBconst [int8(c)] x)
+
+// TEST %reg,%reg is shorter than CMP
+(CMPQconst x [0]) => (TESTQ x x)
+(CMPLconst x [0]) => (TESTL x x)
+(CMPWconst x [0]) => (TESTW x x)
+(CMPBconst x [0]) => (TESTB x x)
+(TESTQconst [-1] x) && x.Op != OpAMD64MOVQconst => (TESTQ x x)
+(TESTLconst [-1] x) && x.Op != OpAMD64MOVLconst => (TESTL x x)
+(TESTWconst [-1] x) && x.Op != OpAMD64MOVLconst => (TESTW x x)
+(TESTBconst [-1] x) && x.Op != OpAMD64MOVLconst => (TESTB x x)
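+// (Illustrative only: "TESTQ AX, AX" is a byte shorter than "CMPQ AX, $0"
+// because it carries no immediate, and both leave the flags in the same state
+// when comparing against zero.)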
+
+// Convert LEAQ1 back to ADDQ if we can
+(LEAQ1 [0] x y) && v.Aux == nil => (ADDQ x y)
+
+// Combining byte loads into larger (unaligned) loads.
+// There are many ways these combinations could occur. This is
+// designed to match the way encoding/binary.LittleEndian does it.
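+// Illustrative only: binary.LittleEndian.Uint16(b) is written as
+//
+//	uint16(b[0]) | uint16(b[1])<<8
+//
+// which lowers to two MOVBloads joined by SHLconst/OR; the rules below fuse
+// that into a single MOVWload once the two addresses are known to be adjacent.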
+
+// Little-endian loads
+
+(OR(L|Q) x0:(MOVBload [i0] {s} p mem)
+ sh:(SHL(L|Q)const [8] x1:(MOVBload [i1] {s} p mem)))
+ && i1 == i0+1
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVWload [i0] {s} p mem)
+
+(OR(L|Q) x0:(MOVBload [i] {s} p0 mem)
+ sh:(SHL(L|Q)const [8] x1:(MOVBload [i] {s} p1 mem)))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVWload [i] {s} p0 mem)
+
+(OR(L|Q) x0:(MOVWload [i0] {s} p mem)
+ sh:(SHL(L|Q)const [16] x1:(MOVWload [i1] {s} p mem)))
+ && i1 == i0+2
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVLload [i0] {s} p mem)
+
+(OR(L|Q) x0:(MOVWload [i] {s} p0 mem)
+ sh:(SHL(L|Q)const [16] x1:(MOVWload [i] {s} p1 mem)))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && sequentialAddresses(p0, p1, 2)
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVLload [i] {s} p0 mem)
+
+(ORQ x0:(MOVLload [i0] {s} p mem)
+ sh:(SHLQconst [32] x1:(MOVLload [i1] {s} p mem)))
+ && i1 == i0+4
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVQload [i0] {s} p mem)
+
+(ORQ x0:(MOVLload [i] {s} p0 mem)
+ sh:(SHLQconst [32] x1:(MOVLload [i] {s} p1 mem)))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && sequentialAddresses(p0, p1, 4)
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVQload [i] {s} p0 mem)
+
+(OR(L|Q)
+ s1:(SHL(L|Q)const [j1] x1:(MOVBload [i1] {s} p mem))
+ or:(OR(L|Q)
+ s0:(SHL(L|Q)const [j0] x0:(MOVBload [i0] {s} p mem))
+ y))
+ && i1 == i0+1
+ && j1 == j0+8
+ && j0 % 16 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (OR(L|Q) <v.Type> (SHL(L|Q)const <v.Type> [j0] (MOVWload [i0] {s} p mem)) y)
+
+(OR(L|Q)
+ s1:(SHL(L|Q)const [j1] x1:(MOVBload [i] {s} p1 mem))
+ or:(OR(L|Q)
+ s0:(SHL(L|Q)const [j0] x0:(MOVBload [i] {s} p0 mem))
+ y))
+ && j1 == j0+8
+ && j0 % 16 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (OR(L|Q) <v.Type> (SHL(L|Q)const <v.Type> [j0] (MOVWload [i] {s} p0 mem)) y)
+
+(ORQ
+ s1:(SHLQconst [j1] x1:(MOVWload [i1] {s} p mem))
+ or:(ORQ
+ s0:(SHLQconst [j0] x0:(MOVWload [i0] {s} p mem))
+ y))
+ && i1 == i0+2
+ && j1 == j0+16
+ && j0 % 32 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVLload [i0] {s} p mem)) y)
+
+(ORQ
+ s1:(SHLQconst [j1] x1:(MOVWload [i] {s} p1 mem))
+ or:(ORQ
+ s0:(SHLQconst [j0] x0:(MOVWload [i] {s} p0 mem))
+ y))
+ && j1 == j0+16
+ && j0 % 32 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && sequentialAddresses(p0, p1, 2)
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVLload [i] {s} p0 mem)) y)
+
+// Big-endian loads
+
+(OR(L|Q)
+ x1:(MOVBload [i1] {s} p mem)
+ sh:(SHL(L|Q)const [8] x0:(MOVBload [i0] {s} p mem)))
+ && i1 == i0+1
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i0] {s} p mem))
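+// (ROLWconst [8] byte-swaps a 16-bit value, so a big-endian 2-byte load, the
+// pattern binary.BigEndian.Uint16 produces, becomes a plain MOVWload plus a
+// rotate.)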
+
+(OR(L|Q)
+ x1:(MOVBload [i] {s} p1 mem)
+ sh:(SHL(L|Q)const [8] x0:(MOVBload [i] {s} p0 mem)))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i] {s} p0 mem))
+
+(OR(L|Q)
+ r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p mem))
+ sh:(SHL(L|Q)const [16] r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p mem))))
+ && i1 == i0+2
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && r0.Uses == 1
+ && r1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, r0, r1, sh)
+ => @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i0] {s} p mem))
+
+(OR(L|Q)
+ r1:(ROLWconst [8] x1:(MOVWload [i] {s} p1 mem))
+ sh:(SHL(L|Q)const [16] r0:(ROLWconst [8] x0:(MOVWload [i] {s} p0 mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && r0.Uses == 1
+ && r1.Uses == 1
+ && sh.Uses == 1
+ && sequentialAddresses(p0, p1, 2)
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, r0, r1, sh)
+ => @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i] {s} p0 mem))
+
+(ORQ
+ r1:(BSWAPL x1:(MOVLload [i1] {s} p mem))
+ sh:(SHLQconst [32] r0:(BSWAPL x0:(MOVLload [i0] {s} p mem))))
+ && i1 == i0+4
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && r0.Uses == 1
+ && r1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, r0, r1, sh)
+ => @mergePoint(b,x0,x1) (BSWAPQ <v.Type> (MOVQload [i0] {s} p mem))
+
+(ORQ
+ r1:(BSWAPL x1:(MOVLload [i] {s} p1 mem))
+ sh:(SHLQconst [32] r0:(BSWAPL x0:(MOVLload [i] {s} p0 mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && r0.Uses == 1
+ && r1.Uses == 1
+ && sh.Uses == 1
+ && sequentialAddresses(p0, p1, 4)
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, r0, r1, sh)
+ => @mergePoint(b,x0,x1) (BSWAPQ <v.Type> (MOVQload [i] {s} p0 mem))
+
+(OR(L|Q)
+ s0:(SHL(L|Q)const [j0] x0:(MOVBload [i0] {s} p mem))
+ or:(OR(L|Q)
+ s1:(SHL(L|Q)const [j1] x1:(MOVBload [i1] {s} p mem))
+ y))
+ && i1 == i0+1
+ && j1 == j0-8
+ && j1 % 16 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (OR(L|Q) <v.Type> (SHL(L|Q)const <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i0] {s} p mem))) y)
+
+(OR(L|Q)
+ s0:(SHL(L|Q)const [j0] x0:(MOVBload [i] {s} p0 mem))
+ or:(OR(L|Q)
+ s1:(SHL(L|Q)const [j1] x1:(MOVBload [i] {s} p1 mem))
+ y))
+ && j1 == j0-8
+ && j1 % 16 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (OR(L|Q) <v.Type> (SHL(L|Q)const <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i] {s} p0 mem))) y)
+
+(ORQ
+ s0:(SHLQconst [j0] r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p mem)))
+ or:(ORQ
+ s1:(SHLQconst [j1] r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p mem)))
+ y))
+ && i1 == i0+2
+ && j1 == j0-16
+ && j1 % 32 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && r0.Uses == 1
+ && r1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, r0, r1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (BSWAPL <typ.UInt32> (MOVLload [i0] {s} p mem))) y)
+
+(ORQ
+ s0:(SHLQconst [j0] r0:(ROLWconst [8] x0:(MOVWload [i] {s} p0 mem)))
+ or:(ORQ
+ s1:(SHLQconst [j1] r1:(ROLWconst [8] x1:(MOVWload [i] {s} p1 mem)))
+ y))
+ && j1 == j0-16
+ && j1 % 32 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && r0.Uses == 1
+ && r1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && sequentialAddresses(p0, p1, 2)
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, r0, r1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (BSWAPL <typ.UInt32> (MOVLload [i] {s} p0 mem))) y)
+
+// Combine 2 byte stores + shift into rolw 8 + word store
+(MOVBstore [i] {s} p w
+ x0:(MOVBstore [i-1] {s} p (SHRWconst [8] w) mem))
+ && x0.Uses == 1
+ && clobber(x0)
+ => (MOVWstore [i-1] {s} p (ROLWconst <w.Type> [8] w) mem)
+(MOVBstore [i] {s} p1 w
+ x0:(MOVBstore [i] {s} p0 (SHRWconst [8] w) mem))
+ && x0.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && clobber(x0)
+ => (MOVWstore [i] {s} p0 (ROLWconst <w.Type> [8] w) mem)
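+// (Illustrative only: this is the store-side counterpart of the big-endian
+// load rules above. binary.BigEndian.PutUint16 is written as
+//
+//	b[0] = byte(v >> 8)
+//	b[1] = byte(v)
+//
+// and the two byte stores collapse into one word store of the byte-swapped
+// value.)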
+
+// Combine stores + shifts into bswap and larger (unaligned) stores
+(MOVBstore [i] {s} p w
+ x2:(MOVBstore [i-1] {s} p (SHRLconst [8] w)
+ x1:(MOVBstore [i-2] {s} p (SHRLconst [16] w)
+ x0:(MOVBstore [i-3] {s} p (SHRLconst [24] w) mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && clobber(x0, x1, x2)
+ => (MOVLstore [i-3] {s} p (BSWAPL <w.Type> w) mem)
+(MOVBstore [i] {s} p3 w
+ x2:(MOVBstore [i] {s} p2 (SHRLconst [8] w)
+ x1:(MOVBstore [i] {s} p1 (SHRLconst [16] w)
+ x0:(MOVBstore [i] {s} p0 (SHRLconst [24] w) mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && sequentialAddresses(p1, p2, 1)
+ && sequentialAddresses(p2, p3, 1)
+ && clobber(x0, x1, x2)
+ => (MOVLstore [i] {s} p0 (BSWAPL <w.Type> w) mem)
+
+(MOVBstore [i] {s} p w
+ x6:(MOVBstore [i-1] {s} p (SHRQconst [8] w)
+ x5:(MOVBstore [i-2] {s} p (SHRQconst [16] w)
+ x4:(MOVBstore [i-3] {s} p (SHRQconst [24] w)
+ x3:(MOVBstore [i-4] {s} p (SHRQconst [32] w)
+ x2:(MOVBstore [i-5] {s} p (SHRQconst [40] w)
+ x1:(MOVBstore [i-6] {s} p (SHRQconst [48] w)
+ x0:(MOVBstore [i-7] {s} p (SHRQconst [56] w) mem))))))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && x3.Uses == 1
+ && x4.Uses == 1
+ && x5.Uses == 1
+ && x6.Uses == 1
+ && clobber(x0, x1, x2, x3, x4, x5, x6)
+ => (MOVQstore [i-7] {s} p (BSWAPQ <w.Type> w) mem)
+(MOVBstore [i] {s} p7 w
+ x6:(MOVBstore [i] {s} p6 (SHRQconst [8] w)
+ x5:(MOVBstore [i] {s} p5 (SHRQconst [16] w)
+ x4:(MOVBstore [i] {s} p4 (SHRQconst [24] w)
+ x3:(MOVBstore [i] {s} p3 (SHRQconst [32] w)
+ x2:(MOVBstore [i] {s} p2 (SHRQconst [40] w)
+ x1:(MOVBstore [i] {s} p1 (SHRQconst [48] w)
+ x0:(MOVBstore [i] {s} p0 (SHRQconst [56] w) mem))))))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && x3.Uses == 1
+ && x4.Uses == 1
+ && x5.Uses == 1
+ && x6.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && sequentialAddresses(p1, p2, 1)
+ && sequentialAddresses(p2, p3, 1)
+ && sequentialAddresses(p3, p4, 1)
+ && sequentialAddresses(p4, p5, 1)
+ && sequentialAddresses(p5, p6, 1)
+ && sequentialAddresses(p6, p7, 1)
+ && clobber(x0, x1, x2, x3, x4, x5, x6)
+ => (MOVQstore [i] {s} p0 (BSWAPQ <w.Type> w) mem)
+
+// Combine constant stores into larger (unaligned) stores.
+(MOVBstoreconst [c] {s} p x:(MOVBstoreconst [a] {s} p mem))
+ && x.Uses == 1
+ && a.Off() + 1 == c.Off()
+ && clobber(x)
+ => (MOVWstoreconst [makeValAndOff64(a.Val()&0xff | c.Val()<<8, a.Off())] {s} p mem)
+(MOVBstoreconst [a] {s} p x:(MOVBstoreconst [c] {s} p mem))
+ && x.Uses == 1
+ && a.Off() + 1 == c.Off()
+ && clobber(x)
+ => (MOVWstoreconst [makeValAndOff64(a.Val()&0xff | c.Val()<<8, a.Off())] {s} p mem)
+(MOVWstoreconst [c] {s} p x:(MOVWstoreconst [a] {s} p mem))
+ && x.Uses == 1
+ && a.Off() + 2 == c.Off()
+ && clobber(x)
+ => (MOVLstoreconst [makeValAndOff64(a.Val()&0xffff | c.Val()<<16, a.Off())] {s} p mem)
+(MOVWstoreconst [a] {s} p x:(MOVWstoreconst [c] {s} p mem))
+ && x.Uses == 1
+ && a.Off() + 2 == c.Off()
+ && clobber(x)
+ => (MOVLstoreconst [makeValAndOff64(a.Val()&0xffff | c.Val()<<16, a.Off())] {s} p mem)
+(MOVLstoreconst [c] {s} p x:(MOVLstoreconst [a] {s} p mem))
+ && x.Uses == 1
+ && a.Off() + 4 == c.Off()
+ && clobber(x)
+ => (MOVQstore [a.Off32()] {s} p (MOVQconst [a.Val()&0xffffffff | c.Val()<<32]) mem)
+(MOVLstoreconst [a] {s} p x:(MOVLstoreconst [c] {s} p mem))
+ && x.Uses == 1
+ && a.Off() + 4 == c.Off()
+ && clobber(x)
+ => (MOVQstore [a.Off32()] {s} p (MOVQconst [a.Val()&0xffffffff | c.Val()<<32]) mem)
+(MOVQstoreconst [c] {s} p x:(MOVQstoreconst [c2] {s} p mem))
+ && config.useSSE
+ && x.Uses == 1
+ && c2.Off() + 8 == c.Off()
+ && c.Val() == 0
+ && c2.Val() == 0
+ && clobber(x)
+ => (MOVOstore [c2.Off32()] {s} p (MOVOconst [0]) mem)
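+// (Illustrative only: a byte store of 0x12 at offset 0 followed by a byte
+// store of 0x34 at offset 1 becomes a single word store of 0x3412 at offset 0,
+// preserving little-endian memory order.)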
+
+// Combine stores into larger (unaligned) stores. Little endian.
+(MOVBstore [i] {s} p (SHR(W|L|Q)const [8] w) x:(MOVBstore [i-1] {s} p w mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstore [i-1] {s} p w mem)
+(MOVBstore [i] {s} p w x:(MOVBstore [i+1] {s} p (SHR(W|L|Q)const [8] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstore [i] {s} p w mem)
+(MOVBstore [i] {s} p (SHR(L|Q)const [j] w) x:(MOVBstore [i-1] {s} p w0:(SHR(L|Q)const [j-8] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstore [i-1] {s} p w0 mem)
+(MOVBstore [i] {s} p1 (SHR(W|L|Q)const [8] w) x:(MOVBstore [i] {s} p0 w mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && clobber(x)
+ => (MOVWstore [i] {s} p0 w mem)
+(MOVBstore [i] {s} p0 w x:(MOVBstore [i] {s} p1 (SHR(W|L|Q)const [8] w) mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && clobber(x)
+ => (MOVWstore [i] {s} p0 w mem)
+(MOVBstore [i] {s} p1 (SHR(L|Q)const [j] w) x:(MOVBstore [i] {s} p0 w0:(SHR(L|Q)const [j-8] w) mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 1)
+ && clobber(x)
+ => (MOVWstore [i] {s} p0 w0 mem)
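+// (Illustrative only: this matches the pattern binary.LittleEndian.PutUint16
+// produces, b[0] = byte(v) followed by b[1] = byte(v >> 8).)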
+
+(MOVWstore [i] {s} p (SHR(L|Q)const [16] w) x:(MOVWstore [i-2] {s} p w mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVLstore [i-2] {s} p w mem)
+(MOVWstore [i] {s} p (SHR(L|Q)const [j] w) x:(MOVWstore [i-2] {s} p w0:(SHR(L|Q)const [j-16] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVLstore [i-2] {s} p w0 mem)
+(MOVWstore [i] {s} p1 (SHR(L|Q)const [16] w) x:(MOVWstore [i] {s} p0 w mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 2)
+ && clobber(x)
+ => (MOVLstore [i] {s} p0 w mem)
+(MOVWstore [i] {s} p1 (SHR(L|Q)const [j] w) x:(MOVWstore [i] {s} p0 w0:(SHR(L|Q)const [j-16] w) mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 2)
+ && clobber(x)
+ => (MOVLstore [i] {s} p0 w0 mem)
+
+(MOVLstore [i] {s} p (SHRQconst [32] w) x:(MOVLstore [i-4] {s} p w mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVQstore [i-4] {s} p w mem)
+(MOVLstore [i] {s} p (SHRQconst [j] w) x:(MOVLstore [i-4] {s} p w0:(SHRQconst [j-32] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVQstore [i-4] {s} p w0 mem)
+(MOVLstore [i] {s} p1 (SHRQconst [32] w) x:(MOVLstore [i] {s} p0 w mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 4)
+ && clobber(x)
+ => (MOVQstore [i] {s} p0 w mem)
+(MOVLstore [i] {s} p1 (SHRQconst [j] w) x:(MOVLstore [i] {s} p0 w0:(SHRQconst [j-32] w) mem))
+ && x.Uses == 1
+ && sequentialAddresses(p0, p1, 4)
+ && clobber(x)
+ => (MOVQstore [i] {s} p0 w0 mem)
+
+(MOVBstore [i] {s} p
+ x1:(MOVBload [j] {s2} p2 mem)
+ mem2:(MOVBstore [i-1] {s} p
+ x2:(MOVBload [j-1] {s2} p2 mem) mem))
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && mem2.Uses == 1
+ && clobber(x1, x2, mem2)
+ => (MOVWstore [i-1] {s} p (MOVWload [j-1] {s2} p2 mem) mem)
+
+(MOVWstore [i] {s} p
+ x1:(MOVWload [j] {s2} p2 mem)
+ mem2:(MOVWstore [i-2] {s} p
+ x2:(MOVWload [j-2] {s2} p2 mem) mem))
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && mem2.Uses == 1
+ && clobber(x1, x2, mem2)
+ => (MOVLstore [i-2] {s} p (MOVLload [j-2] {s2} p2 mem) mem)
+
+(MOVLstore [i] {s} p
+ x1:(MOVLload [j] {s2} p2 mem)
+ mem2:(MOVLstore [i-4] {s} p
+ x2:(MOVLload [j-4] {s2} p2 mem) mem))
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && mem2.Uses == 1
+ && clobber(x1, x2, mem2)
+ => (MOVQstore [i-4] {s} p (MOVQload [j-4] {s2} p2 mem) mem)
+
+(MOVQload [off1] {sym1} (LEAL [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVQload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVLload [off1] {sym1} (LEAL [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVLload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVWload [off1] {sym1} (LEAL [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVWload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVBload [off1] {sym1} (LEAL [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVBload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+
+(MOVQstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVQstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOVLstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVLstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOVWstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOVBstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+
+(MOVQstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && sc.canAdd32(off) =>
+ (MOVQstoreconst [sc.addOffset32(off)] {mergeSym(sym1, sym2)} ptr mem)
+(MOVLstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && sc.canAdd32(off) =>
+ (MOVLstoreconst [sc.addOffset32(off)] {mergeSym(sym1, sym2)} ptr mem)
+(MOVWstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && sc.canAdd32(off) =>
+ (MOVWstoreconst [sc.addOffset32(off)] {mergeSym(sym1, sym2)} ptr mem)
+(MOVBstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && sc.canAdd32(off) =>
+ (MOVBstoreconst [sc.addOffset32(off)] {mergeSym(sym1, sym2)} ptr mem)
+
+(MOVQload [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) => (MOVQload [off1+off2] {sym} ptr mem)
+(MOVLload [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) => (MOVLload [off1+off2] {sym} ptr mem)
+(MOVWload [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) => (MOVWload [off1+off2] {sym} ptr mem)
+(MOVBload [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) => (MOVBload [off1+off2] {sym} ptr mem)
+(MOVQstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(int64(off1)+int64(off2)) => (MOVQstore [off1+off2] {sym} ptr val mem)
+(MOVLstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(int64(off1)+int64(off2)) => (MOVLstore [off1+off2] {sym} ptr val mem)
+(MOVWstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(int64(off1)+int64(off2)) => (MOVWstore [off1+off2] {sym} ptr val mem)
+(MOVBstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(int64(off1)+int64(off2)) => (MOVBstore [off1+off2] {sym} ptr val mem)
+(MOVQstoreconst [sc] {s} (ADDLconst [off] ptr) mem) && sc.canAdd32(off) =>
+ (MOVQstoreconst [sc.addOffset32(off)] {s} ptr mem)
+(MOVLstoreconst [sc] {s} (ADDLconst [off] ptr) mem) && sc.canAdd32(off) =>
+ (MOVLstoreconst [sc.addOffset32(off)] {s} ptr mem)
+(MOVWstoreconst [sc] {s} (ADDLconst [off] ptr) mem) && sc.canAdd32(off) =>
+ (MOVWstoreconst [sc.addOffset32(off)] {s} ptr mem)
+(MOVBstoreconst [sc] {s} (ADDLconst [off] ptr) mem) && sc.canAdd32(off) =>
+ (MOVBstoreconst [sc.addOffset32(off)] {s} ptr mem)
+
+// Merge load and op
+// TODO: add indexed variants?
+((ADD|SUB|AND|OR|XOR)Q x l:(MOVQload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|AND|OR|XOR)Qload x [off] {sym} ptr mem)
+((ADD|SUB|AND|OR|XOR)L x l:(MOVLload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|AND|OR|XOR)Lload x [off] {sym} ptr mem)
+((ADD|SUB|MUL|DIV)SD x l:(MOVSDload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|MUL|DIV)SDload x [off] {sym} ptr mem)
+((ADD|SUB|MUL|DIV)SS x l:(MOVSSload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|MUL|DIV)SSload x [off] {sym} ptr mem)
+(MOVLstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Lload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) => ((ADD|AND|OR|XOR)Lmodify [off] {sym} ptr x mem)
+(MOVLstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)L l:(MOVLload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) =>
+ ((ADD|SUB|AND|OR|XOR)Lmodify [off] {sym} ptr x mem)
+(MOVLstore {sym} [off] ptr y:((BTC|BTR|BTS)L l:(MOVLload [off] {sym} ptr mem) <t> x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) =>
+ ((BTC|BTR|BTS)Lmodify [off] {sym} ptr (ANDLconst <t> [31] x) mem)
+(MOVQstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Qload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) => ((ADD|AND|OR|XOR)Qmodify [off] {sym} ptr x mem)
+(MOVQstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)Q l:(MOVQload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) =>
+ ((ADD|SUB|AND|OR|XOR)Qmodify [off] {sym} ptr x mem)
+(MOVQstore {sym} [off] ptr y:((BTC|BTR|BTS)Q l:(MOVQload [off] {sym} ptr mem) <t> x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) =>
+ ((BTC|BTR|BTS)Qmodify [off] {sym} ptr (ANDQconst <t> [63] x) mem)
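+// Illustrative only, for p *int64 and x int64:
+//
+//	y := x + *p   // the load folds into the op: ADDQload
+//	*p += x       // the read-modify-write folds further: ADDQmodify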
+
+// Merge ADDQconst and LEAQ into atomic loads.
+(MOV(Q|L|B)atomicload [off1] {sym} (ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOV(Q|L|B)atomicload [off1+off2] {sym} ptr mem)
+(MOV(Q|L|B)atomicload [off1] {sym1} (LEAQ [off2] {sym2} ptr) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOV(Q|L|B)atomicload [off1+off2] {mergeSym(sym1, sym2)} ptr mem)
+
+// Merge ADDQconst and LEAQ into atomic stores.
+(XCHGQ [off1] {sym} val (ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (XCHGQ [off1+off2] {sym} val ptr mem)
+(XCHGQ [off1] {sym1} val (LEAQ [off2] {sym2} ptr) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && ptr.Op != OpSB =>
+ (XCHGQ [off1+off2] {mergeSym(sym1,sym2)} val ptr mem)
+(XCHGL [off1] {sym} val (ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (XCHGL [off1+off2] {sym} val ptr mem)
+(XCHGL [off1] {sym1} val (LEAQ [off2] {sym2} ptr) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && ptr.Op != OpSB =>
+ (XCHGL [off1+off2] {mergeSym(sym1,sym2)} val ptr mem)
+
+// Merge ADDQconst into atomic adds.
+// TODO: merging LEAQ doesn't work, assembler doesn't like the resulting instructions.
+(XADDQlock [off1] {sym} val (ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (XADDQlock [off1+off2] {sym} val ptr mem)
+(XADDLlock [off1] {sym} val (ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (XADDLlock [off1+off2] {sym} val ptr mem)
+
+// Merge ADDQconst into atomic compare and swaps.
+// TODO: merging LEAQ doesn't work, assembler doesn't like the resulting instructions.
+(CMPXCHGQlock [off1] {sym} (ADDQconst [off2] ptr) old new_ mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (CMPXCHGQlock [off1+off2] {sym} ptr old new_ mem)
+(CMPXCHGLlock [off1] {sym} (ADDQconst [off2] ptr) old new_ mem) && is32Bit(int64(off1)+int64(off2)) =>
+ (CMPXCHGLlock [off1+off2] {sym} ptr old new_ mem)
+
+// We don't need the conditional move if we know the arg of BSF is not zero.
+(CMOVQEQ x _ (Select1 (BSFQ (ORQconst [c] _)))) && c != 0 => x
+// Extension is unnecessary for trailing zeros.
+(BSFQ (ORQconst <t> [1<<8] (MOVBQZX x))) => (BSFQ (ORQconst <t> [1<<8] x))
+(BSFQ (ORQconst <t> [1<<16] (MOVWQZX x))) => (BSFQ (ORQconst <t> [1<<16] x))
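+// (Rationale: ORing in 1<<8 or 1<<16 guarantees BSF sees a non-zero input, so
+// the CMOV that would supply the zero-input answer is dead, and junk above the
+// forced bit cannot lower the result, so the zero-extension is redundant.)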
+
+// Redundant sign/zero extensions
+// Note: see issue 21963. We have to make sure we use the right type on
+// the resulting extension (the outer type, not the inner type).
+(MOVLQSX (MOVLQSX x)) => (MOVLQSX x)
+(MOVLQSX (MOVWQSX x)) => (MOVWQSX x)
+(MOVLQSX (MOVBQSX x)) => (MOVBQSX x)
+(MOVWQSX (MOVWQSX x)) => (MOVWQSX x)
+(MOVWQSX (MOVBQSX x)) => (MOVBQSX x)
+(MOVBQSX (MOVBQSX x)) => (MOVBQSX x)
+(MOVLQZX (MOVLQZX x)) => (MOVLQZX x)
+(MOVLQZX (MOVWQZX x)) => (MOVWQZX x)
+(MOVLQZX (MOVBQZX x)) => (MOVBQZX x)
+(MOVWQZX (MOVWQZX x)) => (MOVWQZX x)
+(MOVWQZX (MOVBQZX x)) => (MOVBQZX x)
+(MOVBQZX (MOVBQZX x)) => (MOVBQZX x)
+
+(MOVQstore [off] {sym} ptr a:((ADD|AND|OR|XOR|BTC|BTR|BTS)Qconst [c] l:(MOVQload [off] {sym} ptr2 mem)) mem)
+ && isSamePtr(ptr, ptr2) && a.Uses == 1 && l.Uses == 1 && validValAndOff(int64(c),int64(off)) && clobber(l, a) =>
+ ((ADD|AND|OR|XOR|BTC|BTR|BTS)Qconstmodify {sym} [makeValAndOff32(int32(c),off)] ptr mem)
+(MOVLstore [off] {sym} ptr a:((ADD|AND|OR|XOR|BTC|BTR|BTS)Lconst [c] l:(MOVLload [off] {sym} ptr2 mem)) mem)
+ && isSamePtr(ptr, ptr2) && a.Uses == 1 && l.Uses == 1 && validValAndOff(int64(c),int64(off)) && clobber(l, a) =>
+ ((ADD|AND|OR|XOR|BTC|BTR|BTS)Lconstmodify {sym} [makeValAndOff32(int32(c),off)] ptr mem)
+
+// float <-> int register moves, with no conversion.
+// These come up when compiling math.{Float{32,64}bits,Float{32,64}frombits}.
+(MOVQload [off] {sym} ptr (MOVSDstore [off] {sym} ptr val _)) => (MOVQf2i val)
+(MOVLload [off] {sym} ptr (MOVSSstore [off] {sym} ptr val _)) => (MOVLf2i val)
+(MOVSDload [off] {sym} ptr (MOVQstore [off] {sym} ptr val _)) => (MOVQi2f val)
+(MOVSSload [off] {sym} ptr (MOVLstore [off] {sym} ptr val _)) => (MOVLi2f val)
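+// (Illustrative only: math.Float64bits(f) is essentially
+// *(*uint64)(unsafe.Pointer(&f)), i.e. a MOVSDstore followed by a MOVQload of
+// the same slot; the rules above replace the round trip through memory with a
+// direct register-class move.)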
+
+// Other load-like ops.
+(ADDQload x [off] {sym} ptr (MOVSDstore [off] {sym} ptr y _)) => (ADDQ x (MOVQf2i y))
+(ADDLload x [off] {sym} ptr (MOVSSstore [off] {sym} ptr y _)) => (ADDL x (MOVLf2i y))
+(SUBQload x [off] {sym} ptr (MOVSDstore [off] {sym} ptr y _)) => (SUBQ x (MOVQf2i y))
+(SUBLload x [off] {sym} ptr (MOVSSstore [off] {sym} ptr y _)) => (SUBL x (MOVLf2i y))
+(ANDQload x [off] {sym} ptr (MOVSDstore [off] {sym} ptr y _)) => (ANDQ x (MOVQf2i y))
+(ANDLload x [off] {sym} ptr (MOVSSstore [off] {sym} ptr y _)) => (ANDL x (MOVLf2i y))
+( ORQload x [off] {sym} ptr (MOVSDstore [off] {sym} ptr y _)) => ( ORQ x (MOVQf2i y))
+( ORLload x [off] {sym} ptr (MOVSSstore [off] {sym} ptr y _)) => ( ORL x (MOVLf2i y))
+(XORQload x [off] {sym} ptr (MOVSDstore [off] {sym} ptr y _)) => (XORQ x (MOVQf2i y))
+(XORLload x [off] {sym} ptr (MOVSSstore [off] {sym} ptr y _)) => (XORL x (MOVLf2i y))
+
+(ADDSDload x [off] {sym} ptr (MOVQstore [off] {sym} ptr y _)) => (ADDSD x (MOVQi2f y))
+(ADDSSload x [off] {sym} ptr (MOVLstore [off] {sym} ptr y _)) => (ADDSS x (MOVLi2f y))
+(SUBSDload x [off] {sym} ptr (MOVQstore [off] {sym} ptr y _)) => (SUBSD x (MOVQi2f y))
+(SUBSSload x [off] {sym} ptr (MOVLstore [off] {sym} ptr y _)) => (SUBSS x (MOVLi2f y))
+(MULSDload x [off] {sym} ptr (MOVQstore [off] {sym} ptr y _)) => (MULSD x (MOVQi2f y))
+(MULSSload x [off] {sym} ptr (MOVLstore [off] {sym} ptr y _)) => (MULSS x (MOVLi2f y))
+
+// Redirect stores to use the other register set.
+(MOVQstore [off] {sym} ptr (MOVQf2i val) mem) => (MOVSDstore [off] {sym} ptr val mem)
+(MOVLstore [off] {sym} ptr (MOVLf2i val) mem) => (MOVSSstore [off] {sym} ptr val mem)
+(MOVSDstore [off] {sym} ptr (MOVQi2f val) mem) => (MOVQstore [off] {sym} ptr val mem)
+(MOVSSstore [off] {sym} ptr (MOVLi2f val) mem) => (MOVLstore [off] {sym} ptr val mem)
+
+// Load args directly into the register class where they will be used.
+// We do this by just modifying the type of the Arg.
+(MOVQf2i <t> (Arg <u> [off] {sym})) && t.Size() == u.Size() => @b.Func.Entry (Arg <t> [off] {sym})
+(MOVLf2i <t> (Arg <u> [off] {sym})) && t.Size() == u.Size() => @b.Func.Entry (Arg <t> [off] {sym})
+(MOVQi2f <t> (Arg <u> [off] {sym})) && t.Size() == u.Size() => @b.Func.Entry (Arg <t> [off] {sym})
+(MOVLi2f <t> (Arg <u> [off] {sym})) && t.Size() == u.Size() => @b.Func.Entry (Arg <t> [off] {sym})
+
+// LEAQ is rematerializeable, so this helps to avoid register spill.
+// See issue 22947 for details
+(ADD(Q|L)const [off] x:(SP)) => (LEA(Q|L) [off] x)
+
+// HMULx is commutative, but its first argument must go in AX.
+// If possible, put a rematerializeable value in the first argument slot,
+// to reduce the odds that another value will have to be spilled
+// specifically to free up AX.
+(HMUL(Q|L) x y) && !x.rematerializeable() && y.rematerializeable() => (HMUL(Q|L) y x)
+(HMUL(Q|L)U x y) && !x.rematerializeable() && y.rematerializeable() => (HMUL(Q|L)U y x)
+
+// Fold loads into compares
+// Note: these may be undone by the flagalloc pass.
+(CMP(Q|L|W|B) l:(MOV(Q|L|W|B)load {sym} [off] ptr mem) x) && canMergeLoad(v, l) && clobber(l) => (CMP(Q|L|W|B)load {sym} [off] ptr x mem)
+(CMP(Q|L|W|B) x l:(MOV(Q|L|W|B)load {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (InvertFlags (CMP(Q|L|W|B)load {sym} [off] ptr x mem))
+
+(CMP(Q|L)const l:(MOV(Q|L)load {sym} [off] ptr mem) [c])
+ && l.Uses == 1
+ && clobber(l) =>
+@l.Block (CMP(Q|L)constload {sym} [makeValAndOff32(c,off)] ptr mem)
+(CMP(W|B)const l:(MOV(W|B)load {sym} [off] ptr mem) [c])
+ && l.Uses == 1
+ && clobber(l) =>
+@l.Block (CMP(W|B)constload {sym} [makeValAndOff32(int32(c),off)] ptr mem)
+
+(CMPQload {sym} [off] ptr (MOVQconst [c]) mem) && validValAndOff(c,int64(off)) => (CMPQconstload {sym} [makeValAndOff64(c,int64(off))] ptr mem)
+(CMPLload {sym} [off] ptr (MOVLconst [c]) mem) && validValAndOff(int64(c),int64(off)) => (CMPLconstload {sym} [makeValAndOff32(c,off)] ptr mem)
+(CMPWload {sym} [off] ptr (MOVLconst [c]) mem) && validValAndOff(int64(int16(c)),int64(off)) => (CMPWconstload {sym} [makeValAndOff32(int32(int16(c)),off)] ptr mem)
+(CMPBload {sym} [off] ptr (MOVLconst [c]) mem) && validValAndOff(int64(int8(c)),int64(off)) => (CMPBconstload {sym} [makeValAndOff32(int32(int8(c)),off)] ptr mem)
+
+(TEST(Q|L|W|B) l:(MOV(Q|L|W|B)load {sym} [off] ptr mem) l2)
+ && l == l2
+ && l.Uses == 2
+ && validValAndOff(0, int64(off))
+ && clobber(l) =>
+ @l.Block (CMP(Q|L|W|B)constload {sym} [makeValAndOff64(0, int64(off))] ptr mem)
+
+(MOVBload [off] {sym} (SB) _) && symIsRO(sym) => (MOVLconst [int32(read8(sym, int64(off)))])
+(MOVWload [off] {sym} (SB) _) && symIsRO(sym) => (MOVLconst [int32(read16(sym, int64(off), config.ctxt.Arch.ByteOrder))])
+(MOVLload [off] {sym} (SB) _) && symIsRO(sym) => (MOVQconst [int64(read32(sym, int64(off), config.ctxt.Arch.ByteOrder))])
+(MOVQload [off] {sym} (SB) _) && symIsRO(sym) => (MOVQconst [int64(read64(sym, int64(off), config.ctxt.Arch.ByteOrder))])
+(MOVOstore [dstOff] {dstSym} ptr (MOVOload [srcOff] {srcSym} (SB) _) mem) && symIsRO(srcSym) =>
+ (MOVQstore [dstOff+8] {dstSym} ptr (MOVQconst [int64(read64(srcSym, int64(srcOff)+8, config.ctxt.Arch.ByteOrder))])
+ (MOVQstore [dstOff] {dstSym} ptr (MOVQconst [int64(read64(srcSym, int64(srcOff), config.ctxt.Arch.ByteOrder))]) mem))
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
new file mode 100644
index 0000000..a87581b
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
@@ -0,0 +1,946 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+import "strings"
+
+// Notes:
+// - Integer types live in the low portion of registers. Upper portions are junk.
+// - Boolean types use the low-order byte of a register. 0=false, 1=true.
+// Upper bytes are junk.
+// - Floating-point types live in the low natural slot of an sse2 register.
+// Unused portions are junk.
+// - We do not use AH, BH, CH, DH registers.
+// - When doing sub-register operations, we try to write the whole
+// destination register to avoid a partial-register write.
+// - Unused portions of AuxInt (or the Val portion of ValAndOff) are
+// filled by sign-extending the used portion. Users of AuxInt which interpret
+// AuxInt as unsigned (e.g. shifts) must be careful.
+// - All SymOff opcodes require their offset to fit in an int32.
+
+// Suffixes encode the bit width of various instructions.
+// Q (quad word) = 64 bit
+// L (long word) = 32 bit
+// W (word) = 16 bit
+// B (byte) = 8 bit
+
+// copied from ../../amd64/reg.go
+var regNamesAMD64 = []string{
+ "AX",
+ "CX",
+ "DX",
+ "BX",
+ "SP",
+ "BP",
+ "SI",
+ "DI",
+ "R8",
+ "R9",
+ "R10",
+ "R11",
+ "R12",
+ "R13",
+ "R14",
+ "R15",
+ "X0",
+ "X1",
+ "X2",
+ "X3",
+ "X4",
+ "X5",
+ "X6",
+ "X7",
+ "X8",
+ "X9",
+ "X10",
+ "X11",
+ "X12",
+ "X13",
+ "X14",
+ "X15",
+
+ // If you add registers, update asyncPreempt in runtime
+
+ // pseudo-registers
+ "SB",
+}
+
+func init() {
+ // Make map from reg names to reg integers.
+ if len(regNamesAMD64) > 64 {
+ panic("too many registers")
+ }
+ num := map[string]int{}
+ for i, name := range regNamesAMD64 {
+ num[name] = i
+ }
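+	// buildReg turns a space-separated list of register names into a bit
+	// mask, using each name's index in regNamesAMD64 as its bit position
+	// (e.g. buildReg("AX DX") == 1<<0 | 1<<2).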
+ buildReg := func(s string) regMask {
+ m := regMask(0)
+ for _, r := range strings.Split(s, " ") {
+ if n, ok := num[r]; ok {
+ m |= regMask(1) << uint(n)
+ continue
+ }
+ panic("register " + r + " not found")
+ }
+ return m
+ }
+
+ // Common individual register masks
+ var (
+ ax = buildReg("AX")
+ cx = buildReg("CX")
+ dx = buildReg("DX")
+ bx = buildReg("BX")
+ gp = buildReg("AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15")
+ fp = buildReg("X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15")
+ gpsp = gp | buildReg("SP")
+ gpspsb = gpsp | buildReg("SB")
+ callerSave = gp | fp
+ )
+ // Common slices of register masks
+ var (
+ gponly = []regMask{gp}
+ fponly = []regMask{fp}
+ )
+
+ // Common regInfo
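+	// (Read the names as <class><#inputs><#outputs>: gp21 takes two
+	// general-purpose register inputs and produces one gp output, fp01 takes
+	// none and produces one fp output. A 0 mask marks an argument or result,
+	// such as memory or flags, that does not occupy a register.)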
+ var (
+ gp01 = regInfo{inputs: nil, outputs: gponly}
+ gp11 = regInfo{inputs: []regMask{gp}, outputs: gponly}
+ gp11sp = regInfo{inputs: []regMask{gpsp}, outputs: gponly}
+ gp11sb = regInfo{inputs: []regMask{gpspsb}, outputs: gponly}
+ gp21 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
+ gp21sp = regInfo{inputs: []regMask{gpsp, gp}, outputs: gponly}
+ gp21sb = regInfo{inputs: []regMask{gpspsb, gpsp}, outputs: gponly}
+ gp21shift = regInfo{inputs: []regMask{gp, cx}, outputs: []regMask{gp}}
+ gp11div = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{ax, dx}}
+ gp21hmul = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx}, clobbers: ax}
+ gp21flags = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp, 0}}
+ gp2flags1flags = regInfo{inputs: []regMask{gp, gp, 0}, outputs: []regMask{gp, 0}}
+
+ gp2flags = regInfo{inputs: []regMask{gpsp, gpsp}}
+ gp1flags = regInfo{inputs: []regMask{gpsp}}
+ gp0flagsLoad = regInfo{inputs: []regMask{gpspsb, 0}}
+ gp1flagsLoad = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}
+ gp2flagsLoad = regInfo{inputs: []regMask{gpspsb, gpsp, gpsp, 0}}
+ flagsgp = regInfo{inputs: nil, outputs: gponly}
+
+ gp11flags = regInfo{inputs: []regMask{gp}, outputs: []regMask{gp, 0}}
+ gp1flags1flags = regInfo{inputs: []regMask{gp, 0}, outputs: []regMask{gp, 0}}
+
+ readflags = regInfo{inputs: nil, outputs: gponly}
+ flagsgpax = regInfo{inputs: nil, clobbers: ax, outputs: []regMask{gp &^ ax}}
+
+ gpload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: gponly}
+ gp21load = regInfo{inputs: []regMask{gp, gpspsb, 0}, outputs: gponly}
+ gploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: gponly}
+ gp21loadidx = regInfo{inputs: []regMask{gp, gpspsb, gpsp, 0}, outputs: gponly}
+ gp21pax = regInfo{inputs: []regMask{gp &^ ax, gp}, outputs: []regMask{gp &^ ax}, clobbers: ax}
+
+ gpstore = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}
+ gpstoreconst = regInfo{inputs: []regMask{gpspsb, 0}}
+ gpstoreidx = regInfo{inputs: []regMask{gpspsb, gpsp, gpsp, 0}}
+ gpstoreconstidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}
+ gpstorexchg = regInfo{inputs: []regMask{gp, gpspsb, 0}, outputs: []regMask{gp}}
+ cmpxchg = regInfo{inputs: []regMask{gp, ax, gp, 0}, outputs: []regMask{gp, 0}, clobbers: ax}
+
+ fp01 = regInfo{inputs: nil, outputs: fponly}
+ fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: fponly}
+ fp31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: fponly}
+ fp21load = regInfo{inputs: []regMask{fp, gpspsb, 0}, outputs: fponly}
+ fp21loadidx = regInfo{inputs: []regMask{fp, gpspsb, gpspsb, 0}, outputs: fponly}
+ fpgp = regInfo{inputs: fponly, outputs: gponly}
+ gpfp = regInfo{inputs: gponly, outputs: fponly}
+ fp11 = regInfo{inputs: fponly, outputs: fponly}
+ fp2flags = regInfo{inputs: []regMask{fp, fp}}
+
+ fpload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: fponly}
+ fploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: fponly}
+
+ fpstore = regInfo{inputs: []regMask{gpspsb, fp, 0}}
+ fpstoreidx = regInfo{inputs: []regMask{gpspsb, gpsp, fp, 0}}
+ )
+
+ var AMD64ops = []opData{
+ // fp ops
+ {name: "ADDSS", argLength: 2, reg: fp21, asm: "ADDSS", commutative: true, resultInArg0: true}, // fp32 add
+ {name: "ADDSD", argLength: 2, reg: fp21, asm: "ADDSD", commutative: true, resultInArg0: true}, // fp64 add
+ {name: "SUBSS", argLength: 2, reg: fp21, asm: "SUBSS", resultInArg0: true}, // fp32 sub
+ {name: "SUBSD", argLength: 2, reg: fp21, asm: "SUBSD", resultInArg0: true}, // fp64 sub
+ {name: "MULSS", argLength: 2, reg: fp21, asm: "MULSS", commutative: true, resultInArg0: true}, // fp32 mul
+ {name: "MULSD", argLength: 2, reg: fp21, asm: "MULSD", commutative: true, resultInArg0: true}, // fp64 mul
+ {name: "DIVSS", argLength: 2, reg: fp21, asm: "DIVSS", resultInArg0: true}, // fp32 div
+ {name: "DIVSD", argLength: 2, reg: fp21, asm: "DIVSD", resultInArg0: true}, // fp64 div
+
+ {name: "MOVSSload", argLength: 2, reg: fpload, asm: "MOVSS", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // fp32 load
+ {name: "MOVSDload", argLength: 2, reg: fpload, asm: "MOVSD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // fp64 load
+ {name: "MOVSSconst", reg: fp01, asm: "MOVSS", aux: "Float32", rematerializeable: true}, // fp32 constant
+ {name: "MOVSDconst", reg: fp01, asm: "MOVSD", aux: "Float64", rematerializeable: true}, // fp64 constant
+ {name: "MOVSSloadidx1", argLength: 3, reg: fploadidx, asm: "MOVSS", scale: 1, aux: "SymOff", symEffect: "Read"}, // fp32 load indexed by i
+ {name: "MOVSSloadidx4", argLength: 3, reg: fploadidx, asm: "MOVSS", scale: 4, aux: "SymOff", symEffect: "Read"}, // fp32 load indexed by 4*i
+ {name: "MOVSDloadidx1", argLength: 3, reg: fploadidx, asm: "MOVSD", scale: 1, aux: "SymOff", symEffect: "Read"}, // fp64 load indexed by i
+ {name: "MOVSDloadidx8", argLength: 3, reg: fploadidx, asm: "MOVSD", scale: 8, aux: "SymOff", symEffect: "Read"}, // fp64 load indexed by 8*i
+
+ {name: "MOVSSstore", argLength: 3, reg: fpstore, asm: "MOVSS", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // fp32 store
+ {name: "MOVSDstore", argLength: 3, reg: fpstore, asm: "MOVSD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // fp64 store
+ {name: "MOVSSstoreidx1", argLength: 4, reg: fpstoreidx, asm: "MOVSS", scale: 1, aux: "SymOff", symEffect: "Write"}, // fp32 indexed by i store
+ {name: "MOVSSstoreidx4", argLength: 4, reg: fpstoreidx, asm: "MOVSS", scale: 4, aux: "SymOff", symEffect: "Write"}, // fp32 indexed by 4i store
+ {name: "MOVSDstoreidx1", argLength: 4, reg: fpstoreidx, asm: "MOVSD", scale: 1, aux: "SymOff", symEffect: "Write"}, // fp64 indexed by i store
+ {name: "MOVSDstoreidx8", argLength: 4, reg: fpstoreidx, asm: "MOVSD", scale: 8, aux: "SymOff", symEffect: "Write"}, // fp64 indexed by 8i store
+
+ {name: "ADDSSload", argLength: 3, reg: fp21load, asm: "ADDSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 + tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "ADDSDload", argLength: 3, reg: fp21load, asm: "ADDSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 + tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "SUBSSload", argLength: 3, reg: fp21load, asm: "SUBSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 - tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "SUBSDload", argLength: 3, reg: fp21load, asm: "SUBSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 - tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "MULSSload", argLength: 3, reg: fp21load, asm: "MULSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 * tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "MULSDload", argLength: 3, reg: fp21load, asm: "MULSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 * tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "DIVSSload", argLength: 3, reg: fp21load, asm: "DIVSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 / tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "DIVSDload", argLength: 3, reg: fp21load, asm: "DIVSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 / tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+
+ {name: "ADDSSloadidx1", argLength: 4, reg: fp21loadidx, asm: "ADDSS", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp32 arg0 + tmp, tmp loaded from arg1+arg2+auxint+aux, arg3 = mem
+ {name: "ADDSSloadidx4", argLength: 4, reg: fp21loadidx, asm: "ADDSS", scale: 4, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp32 arg0 + tmp, tmp loaded from arg1+4*arg2+auxint+aux, arg3 = mem
+ {name: "ADDSDloadidx1", argLength: 4, reg: fp21loadidx, asm: "ADDSD", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp64 arg0 + tmp, tmp loaded from arg1+arg2+auxint+aux, arg3 = mem
+ {name: "ADDSDloadidx8", argLength: 4, reg: fp21loadidx, asm: "ADDSD", scale: 8, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp64 arg0 + tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
+ {name: "SUBSSloadidx1", argLength: 4, reg: fp21loadidx, asm: "SUBSS", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp32 arg0 - tmp, tmp loaded from arg1+arg2+auxint+aux, arg3 = mem
+ {name: "SUBSSloadidx4", argLength: 4, reg: fp21loadidx, asm: "SUBSS", scale: 4, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp32 arg0 - tmp, tmp loaded from arg1+4*arg2+auxint+aux, arg3 = mem
+ {name: "SUBSDloadidx1", argLength: 4, reg: fp21loadidx, asm: "SUBSD", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp64 arg0 - tmp, tmp loaded from arg1+arg2+auxint+aux, arg3 = mem
+ {name: "SUBSDloadidx8", argLength: 4, reg: fp21loadidx, asm: "SUBSD", scale: 8, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp64 arg0 - tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
+ {name: "MULSSloadidx1", argLength: 4, reg: fp21loadidx, asm: "MULSS", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp32 arg0 * tmp, tmp loaded from arg1+arg2+auxint+aux, arg3 = mem
+ {name: "MULSSloadidx4", argLength: 4, reg: fp21loadidx, asm: "MULSS", scale: 4, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp32 arg0 * tmp, tmp loaded from arg1+4*arg2+auxint+aux, arg3 = mem
+ {name: "MULSDloadidx1", argLength: 4, reg: fp21loadidx, asm: "MULSD", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp64 arg0 * tmp, tmp loaded from arg1+arg2+auxint+aux, arg3 = mem
+ {name: "MULSDloadidx8", argLength: 4, reg: fp21loadidx, asm: "MULSD", scale: 8, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp64 arg0 * tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
+ {name: "DIVSSloadidx1", argLength: 4, reg: fp21loadidx, asm: "DIVSS", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp32 arg0 / tmp, tmp loaded from arg1+arg2+auxint+aux, arg3 = mem
+ {name: "DIVSSloadidx4", argLength: 4, reg: fp21loadidx, asm: "DIVSS", scale: 4, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp32 arg0 / tmp, tmp loaded from arg1+4*arg2+auxint+aux, arg3 = mem
+ {name: "DIVSDloadidx1", argLength: 4, reg: fp21loadidx, asm: "DIVSD", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp64 arg0 / tmp, tmp loaded from arg1+arg2+auxint+aux, arg3 = mem
+ {name: "DIVSDloadidx8", argLength: 4, reg: fp21loadidx, asm: "DIVSD", scale: 8, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp64 arg0 / tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
+
+ // binary ops
+ {name: "ADDQ", argLength: 2, reg: gp21sp, asm: "ADDQ", commutative: true, clobberFlags: true}, // arg0 + arg1
+ {name: "ADDL", argLength: 2, reg: gp21sp, asm: "ADDL", commutative: true, clobberFlags: true}, // arg0 + arg1
+ {name: "ADDQconst", argLength: 1, reg: gp11sp, asm: "ADDQ", aux: "Int32", typ: "UInt64", clobberFlags: true}, // arg0 + auxint
+ {name: "ADDLconst", argLength: 1, reg: gp11sp, asm: "ADDL", aux: "Int32", clobberFlags: true}, // arg0 + auxint
+ {name: "ADDQconstmodify", argLength: 2, reg: gpstoreconst, asm: "ADDQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // add ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+ {name: "ADDLconstmodify", argLength: 2, reg: gpstoreconst, asm: "ADDL", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // add ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+
+ {name: "SUBQ", argLength: 2, reg: gp21, asm: "SUBQ", resultInArg0: true, clobberFlags: true}, // arg0 - arg1
+ {name: "SUBL", argLength: 2, reg: gp21, asm: "SUBL", resultInArg0: true, clobberFlags: true}, // arg0 - arg1
+ {name: "SUBQconst", argLength: 1, reg: gp11, asm: "SUBQ", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 - auxint
+ {name: "SUBLconst", argLength: 1, reg: gp11, asm: "SUBL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 - auxint
+
+ {name: "MULQ", argLength: 2, reg: gp21, asm: "IMULQ", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 * arg1
+ {name: "MULL", argLength: 2, reg: gp21, asm: "IMULL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 * arg1
+ {name: "MULQconst", argLength: 1, reg: gp11, asm: "IMUL3Q", aux: "Int32", clobberFlags: true}, // arg0 * auxint
+ {name: "MULLconst", argLength: 1, reg: gp11, asm: "IMUL3L", aux: "Int32", clobberFlags: true}, // arg0 * auxint
+
+ {name: "MULLU", argLength: 2, reg: regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{ax, 0}, clobbers: dx}, typ: "(UInt32,Flags)", asm: "MULL", commutative: true, clobberFlags: true}, // Let x = arg0*arg1 (full 32x32->64 unsigned multiply). Returns uint32(x), and flags set to overflow if uint32(x) != x.
+ {name: "MULQU", argLength: 2, reg: regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{ax, 0}, clobbers: dx}, typ: "(UInt64,Flags)", asm: "MULQ", commutative: true, clobberFlags: true}, // Let x = arg0*arg1 (full 64x64->128 unsigned multiply). Returns uint64(x), and flags set to overflow if uint64(x) != x.
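+ // Roughly: for MULQU, hi, lo := bits.Mul64(arg0, arg1) gives the result lo with the
+ // overflow flag set iff hi != 0 (and likewise MULLU with bits.Mul32); e.g.
+ // MULLU(0x10000, 0x10000) returns 0 with overflow set, since the full product is 1<<32.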
+
+ // HMULx[U] are intentionally not marked as commutative, even though they are.
+ // This is because they have asymmetric register requirements.
+ // There are rewrite rules to try to place arguments in preferable slots.
+ {name: "HMULQ", argLength: 2, reg: gp21hmul, asm: "IMULQ", clobberFlags: true}, // (arg0 * arg1) >> width
+ {name: "HMULL", argLength: 2, reg: gp21hmul, asm: "IMULL", clobberFlags: true}, // (arg0 * arg1) >> width
+ {name: "HMULQU", argLength: 2, reg: gp21hmul, asm: "MULQ", clobberFlags: true}, // (arg0 * arg1) >> width
+ {name: "HMULLU", argLength: 2, reg: gp21hmul, asm: "MULL", clobberFlags: true}, // (arg0 * arg1) >> width
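+ // Roughly: HMULQU is the hi result of bits.Mul64(arg0, arg1), and HMULQ is the high
+ // 64 bits of the signed 128-bit product; e.g. HMULLU(0x80000000, 2) = 1 while
+ // HMULL(-1, 2) = -1.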
+
+ {name: "AVGQU", argLength: 2, reg: gp21, commutative: true, resultInArg0: true, clobberFlags: true}, // (arg0 + arg1) / 2 as unsigned, all 64 result bits
+
+ // For DIVQ, DIVL and DIVW, AuxInt non-zero means that the divisor has been proved to be not -1.
+ {name: "DIVQ", argLength: 2, reg: gp11div, typ: "(Int64,Int64)", asm: "IDIVQ", aux: "Bool", clobberFlags: true}, // [arg0 / arg1, arg0 % arg1]
+ {name: "DIVL", argLength: 2, reg: gp11div, typ: "(Int32,Int32)", asm: "IDIVL", aux: "Bool", clobberFlags: true}, // [arg0 / arg1, arg0 % arg1]
+ {name: "DIVW", argLength: 2, reg: gp11div, typ: "(Int16,Int16)", asm: "IDIVW", aux: "Bool", clobberFlags: true}, // [arg0 / arg1, arg0 % arg1]
+
+ {name: "DIVQU", argLength: 2, reg: gp11div, typ: "(UInt64,UInt64)", asm: "DIVQ", clobberFlags: true}, // [arg0 / arg1, arg0 % arg1]
+ {name: "DIVLU", argLength: 2, reg: gp11div, typ: "(UInt32,UInt32)", asm: "DIVL", clobberFlags: true}, // [arg0 / arg1, arg0 % arg1]
+ {name: "DIVWU", argLength: 2, reg: gp11div, typ: "(UInt16,UInt16)", asm: "DIVW", clobberFlags: true}, // [arg0 / arg1, arg0 % arg1]
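+ // The -1 case matters because IDIV faults when the most negative value is divided by -1
+ // (e.g. math.MinInt64 / -1), while Go defines that quotient as math.MinInt64 with a
+ // remainder of 0, so with a zero AuxInt the back end emits an extra check for a -1 divisor.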
+
+ {name: "NEGLflags", argLength: 1, reg: gp11flags, typ: "(UInt32,Flags)", asm: "NEGL", resultInArg0: true}, // -arg0, flags set for 0-arg0.
+ // The following 4 add opcodes return the low 64 bits of the sum in the first result and
+ // the carry (the 65th bit) in the carry flag.
+ {name: "ADDQcarry", argLength: 2, reg: gp21flags, typ: "(UInt64,Flags)", asm: "ADDQ", commutative: true, resultInArg0: true}, // r = arg0+arg1
+ {name: "ADCQ", argLength: 3, reg: gp2flags1flags, typ: "(UInt64,Flags)", asm: "ADCQ", commutative: true, resultInArg0: true}, // r = arg0+arg1+carry(arg2)
+ {name: "ADDQconstcarry", argLength: 1, reg: gp11flags, typ: "(UInt64,Flags)", asm: "ADDQ", aux: "Int32", resultInArg0: true}, // r = arg0+auxint
+ {name: "ADCQconst", argLength: 2, reg: gp1flags1flags, typ: "(UInt64,Flags)", asm: "ADCQ", aux: "Int32", resultInArg0: true}, // r = arg0+auxint+carry(arg1)
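+ // For example, a multi-word add such as lo, c := bits.Add64(a, b, 0) followed by
+ // bits.Add64(x, y, c) typically lowers to ADDQcarry followed by ADCQ, with the carry
+ // passed between them in the flags.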
+
+ // The following 4 subtract opcodes return the low 64 bits of the difference in the first result and
+ // the borrow (if the result is negative) in the carry flag.
+ {name: "SUBQborrow", argLength: 2, reg: gp21flags, typ: "(UInt64,Flags)", asm: "SUBQ", resultInArg0: true}, // r = arg0-arg1
+ {name: "SBBQ", argLength: 3, reg: gp2flags1flags, typ: "(UInt64,Flags)", asm: "SBBQ", resultInArg0: true}, // r = arg0-(arg1+carry(arg2))
+ {name: "SUBQconstborrow", argLength: 1, reg: gp11flags, typ: "(UInt64,Flags)", asm: "SUBQ", aux: "Int32", resultInArg0: true}, // r = arg0-auxint
+ {name: "SBBQconst", argLength: 2, reg: gp1flags1flags, typ: "(UInt64,Flags)", asm: "SBBQ", aux: "Int32", resultInArg0: true}, // r = arg0-(auxint+carry(arg1))
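+ // Analogously, d, b := bits.Sub64(a, c, 0) followed by bits.Sub64(x, y, b) typically
+ // lowers to SUBQborrow followed by SBBQ, with the borrow passed in the flags.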
+
+ {name: "MULQU2", argLength: 2, reg: regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx, ax}}, commutative: true, asm: "MULQ", clobberFlags: true}, // arg0 * arg1, returns (hi, lo)
+ {name: "DIVQU2", argLength: 3, reg: regInfo{inputs: []regMask{dx, ax, gpsp}, outputs: []regMask{ax, dx}}, asm: "DIVQ", clobberFlags: true}, // arg0:arg1 / arg2 (128-bit divided by 64-bit), returns (q, r)
+
+ {name: "ANDQ", argLength: 2, reg: gp21, asm: "ANDQ", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 & arg1
+ {name: "ANDL", argLength: 2, reg: gp21, asm: "ANDL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 & arg1
+ {name: "ANDQconst", argLength: 1, reg: gp11, asm: "ANDQ", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 & auxint
+ {name: "ANDLconst", argLength: 1, reg: gp11, asm: "ANDL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 & auxint
+ {name: "ANDQconstmodify", argLength: 2, reg: gpstoreconst, asm: "ANDQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // and ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+ {name: "ANDLconstmodify", argLength: 2, reg: gpstoreconst, asm: "ANDL", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // and ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+
+ {name: "ORQ", argLength: 2, reg: gp21, asm: "ORQ", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 | arg1
+ {name: "ORL", argLength: 2, reg: gp21, asm: "ORL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 | arg1
+ {name: "ORQconst", argLength: 1, reg: gp11, asm: "ORQ", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 | auxint
+ {name: "ORLconst", argLength: 1, reg: gp11, asm: "ORL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 | auxint
+ {name: "ORQconstmodify", argLength: 2, reg: gpstoreconst, asm: "ORQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // or ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+ {name: "ORLconstmodify", argLength: 2, reg: gpstoreconst, asm: "ORL", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // or ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+
+ {name: "XORQ", argLength: 2, reg: gp21, asm: "XORQ", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 ^ arg1
+ {name: "XORL", argLength: 2, reg: gp21, asm: "XORL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 ^ arg1
+ {name: "XORQconst", argLength: 1, reg: gp11, asm: "XORQ", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 ^ auxint
+ {name: "XORLconst", argLength: 1, reg: gp11, asm: "XORL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 ^ auxint
+ {name: "XORQconstmodify", argLength: 2, reg: gpstoreconst, asm: "XORQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // xor ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+ {name: "XORLconstmodify", argLength: 2, reg: gpstoreconst, asm: "XORL", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // xor ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+
+ {name: "CMPQ", argLength: 2, reg: gp2flags, asm: "CMPQ", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPL", argLength: 2, reg: gp2flags, asm: "CMPL", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPW", argLength: 2, reg: gp2flags, asm: "CMPW", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPB", argLength: 2, reg: gp2flags, asm: "CMPB", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPQconst", argLength: 1, reg: gp1flags, asm: "CMPQ", typ: "Flags", aux: "Int32"}, // arg0 compare to auxint
+ {name: "CMPLconst", argLength: 1, reg: gp1flags, asm: "CMPL", typ: "Flags", aux: "Int32"}, // arg0 compare to auxint
+ {name: "CMPWconst", argLength: 1, reg: gp1flags, asm: "CMPW", typ: "Flags", aux: "Int16"}, // arg0 compare to auxint
+ {name: "CMPBconst", argLength: 1, reg: gp1flags, asm: "CMPB", typ: "Flags", aux: "Int8"}, // arg0 compare to auxint
+
+ // compare *(arg0+auxint+aux) to arg1 (in that order). arg2=mem.
+ {name: "CMPQload", argLength: 3, reg: gp1flagsLoad, asm: "CMPQ", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+ {name: "CMPLload", argLength: 3, reg: gp1flagsLoad, asm: "CMPL", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+ {name: "CMPWload", argLength: 3, reg: gp1flagsLoad, asm: "CMPW", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+ {name: "CMPBload", argLength: 3, reg: gp1flagsLoad, asm: "CMPB", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+
+ // compare *(arg0+ValAndOff(AuxInt).Off()+aux) to ValAndOff(AuxInt).Val() (in that order). arg1=mem.
+ {name: "CMPQconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPQ", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+ {name: "CMPLconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPL", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+ {name: "CMPWconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPW", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+ {name: "CMPBconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPB", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
+
+ // compare *(arg0+N*arg1+auxint+aux) to arg2 (in that order). arg3=mem.
+ {name: "CMPQloadidx8", argLength: 4, reg: gp2flagsLoad, asm: "CMPQ", scale: 8, aux: "SymOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPQloadidx1", argLength: 4, reg: gp2flagsLoad, asm: "CMPQ", scale: 1, commutative: true, aux: "SymOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPLloadidx4", argLength: 4, reg: gp2flagsLoad, asm: "CMPL", scale: 4, aux: "SymOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPLloadidx1", argLength: 4, reg: gp2flagsLoad, asm: "CMPL", scale: 1, commutative: true, aux: "SymOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPWloadidx2", argLength: 4, reg: gp2flagsLoad, asm: "CMPW", scale: 2, aux: "SymOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPWloadidx1", argLength: 4, reg: gp2flagsLoad, asm: "CMPW", scale: 1, commutative: true, aux: "SymOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPBloadidx1", argLength: 4, reg: gp2flagsLoad, asm: "CMPB", scale: 1, commutative: true, aux: "SymOff", typ: "Flags", symEffect: "Read"},
+
+ // compare *(arg0+N*arg1+ValAndOff(AuxInt).Off()+aux) to ValAndOff(AuxInt).Val() (in that order). arg2=mem.
+ {name: "CMPQconstloadidx8", argLength: 3, reg: gp1flagsLoad, asm: "CMPQ", scale: 8, aux: "SymValAndOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPQconstloadidx1", argLength: 3, reg: gp1flagsLoad, asm: "CMPQ", scale: 1, commutative: true, aux: "SymValAndOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPLconstloadidx4", argLength: 3, reg: gp1flagsLoad, asm: "CMPL", scale: 4, aux: "SymValAndOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPLconstloadidx1", argLength: 3, reg: gp1flagsLoad, asm: "CMPL", scale: 1, commutative: true, aux: "SymValAndOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPWconstloadidx2", argLength: 3, reg: gp1flagsLoad, asm: "CMPW", scale: 2, aux: "SymValAndOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPWconstloadidx1", argLength: 3, reg: gp1flagsLoad, asm: "CMPW", scale: 1, commutative: true, aux: "SymValAndOff", typ: "Flags", symEffect: "Read"},
+ {name: "CMPBconstloadidx1", argLength: 3, reg: gp1flagsLoad, asm: "CMPB", scale: 1, commutative: true, aux: "SymValAndOff", typ: "Flags", symEffect: "Read"},
+
+ {name: "UCOMISS", argLength: 2, reg: fp2flags, asm: "UCOMISS", typ: "Flags"}, // arg0 compare to arg1, f32
+ {name: "UCOMISD", argLength: 2, reg: fp2flags, asm: "UCOMISD", typ: "Flags"}, // arg0 compare to arg1, f64
+
+ {name: "BTL", argLength: 2, reg: gp2flags, asm: "BTL", typ: "Flags"}, // test whether bit arg0%32 in arg1 is set
+ {name: "BTQ", argLength: 2, reg: gp2flags, asm: "BTQ", typ: "Flags"}, // test whether bit arg0%64 in arg1 is set
+ {name: "BTCL", argLength: 2, reg: gp21, asm: "BTCL", resultInArg0: true, clobberFlags: true}, // complement bit arg1%32 in arg0
+ {name: "BTCQ", argLength: 2, reg: gp21, asm: "BTCQ", resultInArg0: true, clobberFlags: true}, // complement bit arg1%64 in arg0
+ {name: "BTRL", argLength: 2, reg: gp21, asm: "BTRL", resultInArg0: true, clobberFlags: true}, // reset bit arg1%32 in arg0
+ {name: "BTRQ", argLength: 2, reg: gp21, asm: "BTRQ", resultInArg0: true, clobberFlags: true}, // reset bit arg1%64 in arg0
+ {name: "BTSL", argLength: 2, reg: gp21, asm: "BTSL", resultInArg0: true, clobberFlags: true}, // set bit arg1%32 in arg0
+ {name: "BTSQ", argLength: 2, reg: gp21, asm: "BTSQ", resultInArg0: true, clobberFlags: true}, // set bit arg1%64 in arg0
+ {name: "BTLconst", argLength: 1, reg: gp1flags, asm: "BTL", typ: "Flags", aux: "Int8"}, // test whether bit auxint in arg0 is set, 0 <= auxint < 32
+ {name: "BTQconst", argLength: 1, reg: gp1flags, asm: "BTQ", typ: "Flags", aux: "Int8"}, // test whether bit auxint in arg0 is set, 0 <= auxint < 64
+ {name: "BTCLconst", argLength: 1, reg: gp11, asm: "BTCL", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // complement bit auxint in arg0, 0 <= auxint < 32
+ {name: "BTCQconst", argLength: 1, reg: gp11, asm: "BTCQ", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // complement bit auxint in arg0, 0 <= auxint < 64
+ {name: "BTRLconst", argLength: 1, reg: gp11, asm: "BTRL", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // reset bit auxint in arg0, 0 <= auxint < 32
+ {name: "BTRQconst", argLength: 1, reg: gp11, asm: "BTRQ", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // reset bit auxint in arg0, 0 <= auxint < 64
+ {name: "BTSLconst", argLength: 1, reg: gp11, asm: "BTSL", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // set bit auxint in arg0, 0 <= auxint < 32
+ {name: "BTSQconst", argLength: 1, reg: gp11, asm: "BTSQ", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // set bit auxint in arg0, 0 <= auxint < 64
+
+ // direct bit operation on memory operand
+ //
+ // Note that these operations do not mask the bit offset (arg1), and will write beyond their expected
+ // bounds if that argument is 64 or larger (for BT*Q) or 32 or larger (for BT*L). If the compiler
+ // cannot prove that arg1 is in range, it must be explicitly masked (see e.g. the patterns that produce
+ // BT*modify from (MOVstore (BT* (MOVLload ptr mem) x) mem)).
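+ // For example, BTSQmodify with arg1 = 70 sets bit 6 of the quadword 8 bytes past
+ // arg0+auxint+aux (70 = 64+6), so an index that is not provably in range must be
+ // masked mod 64 (or mod 32 for the L forms) before these ops can be used.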
+ {name: "BTCQmodify", argLength: 3, reg: gpstore, asm: "BTCQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // complement bit arg1 in 64-bit arg0+auxint+aux, arg2=mem
+ {name: "BTCLmodify", argLength: 3, reg: gpstore, asm: "BTCL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // complement bit arg1 in 32-bit arg0+auxint+aux, arg2=mem
+ {name: "BTSQmodify", argLength: 3, reg: gpstore, asm: "BTSQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // set bit arg1 in 64-bit arg0+auxint+aux, arg2=mem
+ {name: "BTSLmodify", argLength: 3, reg: gpstore, asm: "BTSL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // set bit arg1 in 32-bit arg0+auxint+aux, arg2=mem
+ {name: "BTRQmodify", argLength: 3, reg: gpstore, asm: "BTRQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // reset bit arg1 in 64-bit arg0+auxint+aux, arg2=mem
+ {name: "BTRLmodify", argLength: 3, reg: gpstore, asm: "BTRL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // reset bit arg1 in 32-bit arg0+auxint+aux, arg2=mem
+ {name: "BTCQconstmodify", argLength: 2, reg: gpstoreconst, asm: "BTCQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // complement bit ValAndOff(AuxInt).Val() in 64-bit arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+ {name: "BTCLconstmodify", argLength: 2, reg: gpstoreconst, asm: "BTCL", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // complement bit ValAndOff(AuxInt).Val() in 32-bit arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+ {name: "BTSQconstmodify", argLength: 2, reg: gpstoreconst, asm: "BTSQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // set bit ValAndOff(AuxInt).Val() in 64-bit arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+ {name: "BTSLconstmodify", argLength: 2, reg: gpstoreconst, asm: "BTSL", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // set bit ValAndOff(AuxInt).Val() in 32-bit arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+ {name: "BTRQconstmodify", argLength: 2, reg: gpstoreconst, asm: "BTRQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // reset bit ValAndOff(AuxInt).Val() in 64-bit arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+ {name: "BTRLconstmodify", argLength: 2, reg: gpstoreconst, asm: "BTRL", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // reset bit ValAndOff(AuxInt).Val() in 32-bit arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
+
+ {name: "TESTQ", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTQ", typ: "Flags"}, // (arg0 & arg1) compare to 0
+ {name: "TESTL", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTL", typ: "Flags"}, // (arg0 & arg1) compare to 0
+ {name: "TESTW", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTW", typ: "Flags"}, // (arg0 & arg1) compare to 0
+ {name: "TESTB", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTB", typ: "Flags"}, // (arg0 & arg1) compare to 0
+ {name: "TESTQconst", argLength: 1, reg: gp1flags, asm: "TESTQ", typ: "Flags", aux: "Int32"}, // (arg0 & auxint) compare to 0
+ {name: "TESTLconst", argLength: 1, reg: gp1flags, asm: "TESTL", typ: "Flags", aux: "Int32"}, // (arg0 & auxint) compare to 0
+ {name: "TESTWconst", argLength: 1, reg: gp1flags, asm: "TESTW", typ: "Flags", aux: "Int16"}, // (arg0 & auxint) compare to 0
+ {name: "TESTBconst", argLength: 1, reg: gp1flags, asm: "TESTB", typ: "Flags", aux: "Int8"}, // (arg0 & auxint) compare to 0
+
+ {name: "SHLQ", argLength: 2, reg: gp21shift, asm: "SHLQ", resultInArg0: true, clobberFlags: true}, // arg0 << arg1, shift amount is mod 64
+ {name: "SHLL", argLength: 2, reg: gp21shift, asm: "SHLL", resultInArg0: true, clobberFlags: true}, // arg0 << arg1, shift amount is mod 32
+ {name: "SHLQconst", argLength: 1, reg: gp11, asm: "SHLQ", aux: "Int8", resultInArg0: true, clobberFlags: true}, // arg0 << auxint, shift amount 0-63
+ {name: "SHLLconst", argLength: 1, reg: gp11, asm: "SHLL", aux: "Int8", resultInArg0: true, clobberFlags: true}, // arg0 << auxint, shift amount 0-31
+ // Note: x86 is weird, the 16- and 8-bit shifts still use all 5 bits of the shift amount!
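+ // E.g. SHRW with a count of 17 really shifts by 17 (the count is only masked mod 32),
+ // so a 16-bit value comes out as 0 rather than being shifted by 17%16 = 1.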
+
+ {name: "SHRQ", argLength: 2, reg: gp21shift, asm: "SHRQ", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> arg1, shift amount is mod 64
+ {name: "SHRL", argLength: 2, reg: gp21shift, asm: "SHRL", resultInArg0: true, clobberFlags: true}, // unsigned uint32(arg0) >> arg1, shift amount is mod 32
+ {name: "SHRW", argLength: 2, reg: gp21shift, asm: "SHRW", resultInArg0: true, clobberFlags: true}, // unsigned uint16(arg0) >> arg1, shift amount is mod 32
+ {name: "SHRB", argLength: 2, reg: gp21shift, asm: "SHRB", resultInArg0: true, clobberFlags: true}, // unsigned uint8(arg0) >> arg1, shift amount is mod 32
+ {name: "SHRQconst", argLength: 1, reg: gp11, asm: "SHRQ", aux: "Int8", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> auxint, shift amount 0-63
+ {name: "SHRLconst", argLength: 1, reg: gp11, asm: "SHRL", aux: "Int8", resultInArg0: true, clobberFlags: true}, // unsigned uint32(arg0) >> auxint, shift amount 0-31
+ {name: "SHRWconst", argLength: 1, reg: gp11, asm: "SHRW", aux: "Int8", resultInArg0: true, clobberFlags: true}, // unsigned uint16(arg0) >> auxint, shift amount 0-15
+ {name: "SHRBconst", argLength: 1, reg: gp11, asm: "SHRB", aux: "Int8", resultInArg0: true, clobberFlags: true}, // unsigned uint8(arg0) >> auxint, shift amount 0-7
+
+ {name: "SARQ", argLength: 2, reg: gp21shift, asm: "SARQ", resultInArg0: true, clobberFlags: true}, // signed arg0 >> arg1, shift amount is mod 64
+ {name: "SARL", argLength: 2, reg: gp21shift, asm: "SARL", resultInArg0: true, clobberFlags: true}, // signed int32(arg0) >> arg1, shift amount is mod 32
+ {name: "SARW", argLength: 2, reg: gp21shift, asm: "SARW", resultInArg0: true, clobberFlags: true}, // signed int16(arg0) >> arg1, shift amount is mod 32
+ {name: "SARB", argLength: 2, reg: gp21shift, asm: "SARB", resultInArg0: true, clobberFlags: true}, // signed int8(arg0) >> arg1, shift amount is mod 32
+ {name: "SARQconst", argLength: 1, reg: gp11, asm: "SARQ", aux: "Int8", resultInArg0: true, clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-63
+ {name: "SARLconst", argLength: 1, reg: gp11, asm: "SARL", aux: "Int8", resultInArg0: true, clobberFlags: true}, // signed int32(arg0) >> auxint, shift amount 0-31
+ {name: "SARWconst", argLength: 1, reg: gp11, asm: "SARW", aux: "Int8", resultInArg0: true, clobberFlags: true}, // signed int16(arg0) >> auxint, shift amount 0-15
+ {name: "SARBconst", argLength: 1, reg: gp11, asm: "SARB", aux: "Int8", resultInArg0: true, clobberFlags: true}, // signed int8(arg0) >> auxint, shift amount 0-7
+
+ {name: "ROLQ", argLength: 2, reg: gp21shift, asm: "ROLQ", resultInArg0: true, clobberFlags: true}, // arg0 rotate left arg1 bits.
+ {name: "ROLL", argLength: 2, reg: gp21shift, asm: "ROLL", resultInArg0: true, clobberFlags: true}, // arg0 rotate left arg1 bits.
+ {name: "ROLW", argLength: 2, reg: gp21shift, asm: "ROLW", resultInArg0: true, clobberFlags: true}, // arg0 rotate left arg1 bits.
+ {name: "ROLB", argLength: 2, reg: gp21shift, asm: "ROLB", resultInArg0: true, clobberFlags: true}, // arg0 rotate left arg1 bits.
+ {name: "RORQ", argLength: 2, reg: gp21shift, asm: "RORQ", resultInArg0: true, clobberFlags: true}, // arg0 rotate right arg1 bits.
+ {name: "RORL", argLength: 2, reg: gp21shift, asm: "RORL", resultInArg0: true, clobberFlags: true}, // arg0 rotate right arg1 bits.
+ {name: "RORW", argLength: 2, reg: gp21shift, asm: "RORW", resultInArg0: true, clobberFlags: true}, // arg0 rotate right arg1 bits.
+ {name: "RORB", argLength: 2, reg: gp21shift, asm: "RORB", resultInArg0: true, clobberFlags: true}, // arg0 rotate right arg1 bits.
+ {name: "ROLQconst", argLength: 1, reg: gp11, asm: "ROLQ", aux: "Int8", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-63
+ {name: "ROLLconst", argLength: 1, reg: gp11, asm: "ROLL", aux: "Int8", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-31
+ {name: "ROLWconst", argLength: 1, reg: gp11, asm: "ROLW", aux: "Int8", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-15
+ {name: "ROLBconst", argLength: 1, reg: gp11, asm: "ROLB", aux: "Int8", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-7
+
+ {name: "ADDLload", argLength: 3, reg: gp21load, asm: "ADDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 + tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "ADDQload", argLength: 3, reg: gp21load, asm: "ADDQ", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 + tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "SUBQload", argLength: 3, reg: gp21load, asm: "SUBQ", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 - tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "SUBLload", argLength: 3, reg: gp21load, asm: "SUBL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 - tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "ANDLload", argLength: 3, reg: gp21load, asm: "ANDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 & tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "ANDQload", argLength: 3, reg: gp21load, asm: "ANDQ", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 & tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "ORQload", argLength: 3, reg: gp21load, asm: "ORQ", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 | tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "ORLload", argLength: 3, reg: gp21load, asm: "ORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 | tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "XORQload", argLength: 3, reg: gp21load, asm: "XORQ", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+ {name: "XORLload", argLength: 3, reg: gp21load, asm: "XORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+
+ {name: "ADDLloadidx1", argLength: 4, reg: gp21loadidx, asm: "ADDL", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 + tmp, tmp loaded from arg1+ arg2+auxint+aux, arg3 = mem
+ {name: "ADDLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ADDL", scale: 4, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 + tmp, tmp loaded from arg1+4*arg2+auxint+aux, arg3 = mem
+ {name: "ADDLloadidx8", argLength: 4, reg: gp21loadidx, asm: "ADDL", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 + tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
+ {name: "ADDQloadidx1", argLength: 4, reg: gp21loadidx, asm: "ADDQ", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 + tmp, tmp loaded from arg1+ arg2+auxint+aux, arg3 = mem
+ {name: "ADDQloadidx8", argLength: 4, reg: gp21loadidx, asm: "ADDQ", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 + tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
+ {name: "SUBLloadidx1", argLength: 4, reg: gp21loadidx, asm: "SUBL", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 - tmp, tmp loaded from arg1+ arg2+auxint+aux, arg3 = mem
+ {name: "SUBLloadidx4", argLength: 4, reg: gp21loadidx, asm: "SUBL", scale: 4, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 - tmp, tmp loaded from arg1+4*arg2+auxint+aux, arg3 = mem
+ {name: "SUBLloadidx8", argLength: 4, reg: gp21loadidx, asm: "SUBL", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 - tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
+ {name: "SUBQloadidx1", argLength: 4, reg: gp21loadidx, asm: "SUBQ", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 - tmp, tmp loaded from arg1+ arg2+auxint+aux, arg3 = mem
+ {name: "SUBQloadidx8", argLength: 4, reg: gp21loadidx, asm: "SUBQ", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 - tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
+ {name: "ANDLloadidx1", argLength: 4, reg: gp21loadidx, asm: "ANDL", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 & tmp, tmp loaded from arg1+ arg2+auxint+aux, arg3 = mem
+ {name: "ANDLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ANDL", scale: 4, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 & tmp, tmp loaded from arg1+4*arg2+auxint+aux, arg3 = mem
+ {name: "ANDLloadidx8", argLength: 4, reg: gp21loadidx, asm: "ANDL", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 & tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
+ {name: "ANDQloadidx1", argLength: 4, reg: gp21loadidx, asm: "ANDQ", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 & tmp, tmp loaded from arg1+ arg2+auxint+aux, arg3 = mem
+ {name: "ANDQloadidx8", argLength: 4, reg: gp21loadidx, asm: "ANDQ", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 & tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
+ {name: "ORLloadidx1", argLength: 4, reg: gp21loadidx, asm: "ORL", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 | tmp, tmp loaded from arg1+ arg2+auxint+aux, arg3 = mem
+ {name: "ORLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ORL", scale: 4, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 | tmp, tmp loaded from arg1+4*arg2+auxint+aux, arg3 = mem
+ {name: "ORLloadidx8", argLength: 4, reg: gp21loadidx, asm: "ORL", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 | tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
+ {name: "ORQloadidx1", argLength: 4, reg: gp21loadidx, asm: "ORQ", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 | tmp, tmp loaded from arg1+ arg2+auxint+aux, arg3 = mem
+ {name: "ORQloadidx8", argLength: 4, reg: gp21loadidx, asm: "ORQ", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 | tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
+ {name: "XORLloadidx1", argLength: 4, reg: gp21loadidx, asm: "XORL", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+ arg2+auxint+aux, arg3 = mem
+ {name: "XORLloadidx4", argLength: 4, reg: gp21loadidx, asm: "XORL", scale: 4, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+4*arg2+auxint+aux, arg3 = mem
+ {name: "XORLloadidx8", argLength: 4, reg: gp21loadidx, asm: "XORL", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
+ {name: "XORQloadidx1", argLength: 4, reg: gp21loadidx, asm: "XORQ", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+ arg2+auxint+aux, arg3 = mem
+ {name: "XORQloadidx8", argLength: 4, reg: gp21loadidx, asm: "XORQ", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
+
+ // direct binary-op on memory (read-modify-write)
+ {name: "ADDQmodify", argLength: 3, reg: gpstore, asm: "ADDQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) += arg1, arg2=mem
+ {name: "SUBQmodify", argLength: 3, reg: gpstore, asm: "SUBQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) -= arg1, arg2=mem
+ {name: "ANDQmodify", argLength: 3, reg: gpstore, asm: "ANDQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) &= arg1, arg2=mem
+ {name: "ORQmodify", argLength: 3, reg: gpstore, asm: "ORQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) |= arg1, arg2=mem
+ {name: "XORQmodify", argLength: 3, reg: gpstore, asm: "XORQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) ^= arg1, arg2=mem
+ {name: "ADDLmodify", argLength: 3, reg: gpstore, asm: "ADDL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) += arg1, arg2=mem
+ {name: "SUBLmodify", argLength: 3, reg: gpstore, asm: "SUBL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) -= arg1, arg2=mem
+ {name: "ANDLmodify", argLength: 3, reg: gpstore, asm: "ANDL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) &= arg1, arg2=mem
+ {name: "ORLmodify", argLength: 3, reg: gpstore, asm: "ORL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) |= arg1, arg2=mem
+ {name: "XORLmodify", argLength: 3, reg: gpstore, asm: "XORL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) ^= arg1, arg2=mem
+
+ {name: "ADDQmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ADDQ", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+auxint+aux) += arg2, arg3=mem
+ {name: "ADDQmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ADDQ", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+auxint+aux) += arg2, arg3=mem
+ {name: "SUBQmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "SUBQ", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+auxint+aux) -= arg2, arg3=mem
+ {name: "SUBQmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "SUBQ", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+auxint+aux) -= arg2, arg3=mem
+ {name: "ANDQmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ANDQ", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+auxint+aux) &= arg2, arg3=mem
+ {name: "ANDQmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ANDQ", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+auxint+aux) &= arg2, arg3=mem
+ {name: "ORQmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ORQ", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+auxint+aux) |= arg2, arg3=mem
+ {name: "ORQmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ORQ", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+auxint+aux) |= arg2, arg3=mem
+ {name: "XORQmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "XORQ", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+auxint+aux) ^= arg2, arg3=mem
+ {name: "XORQmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "XORQ", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+auxint+aux) ^= arg2, arg3=mem
+ {name: "ADDLmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ADDL", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+auxint+aux) += arg2, arg3=mem
+ {name: "ADDLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ADDL", scale: 4, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+4*arg1+auxint+aux) += arg2, arg3=mem
+ {name: "ADDLmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ADDL", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+auxint+aux) += arg2, arg3=mem
+ {name: "SUBLmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "SUBL", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+auxint+aux) -= arg2, arg3=mem
+ {name: "SUBLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "SUBL", scale: 4, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+4*arg1+auxint+aux) -= arg2, arg3=mem
+ {name: "SUBLmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "SUBL", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+auxint+aux) -= arg2, arg3=mem
+ {name: "ANDLmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ANDL", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+auxint+aux) &= arg2, arg3=mem
+ {name: "ANDLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ANDL", scale: 4, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+4*arg1+auxint+aux) &= arg2, arg3=mem
+ {name: "ANDLmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ANDL", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+auxint+aux) &= arg2, arg3=mem
+ {name: "ORLmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ORL", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+auxint+aux) |= arg2, arg3=mem
+ {name: "ORLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ORL", scale: 4, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+4*arg1+auxint+aux) |= arg2, arg3=mem
+ {name: "ORLmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ORL", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+auxint+aux) |= arg2, arg3=mem
+ {name: "XORLmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "XORL", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+auxint+aux) ^= arg2, arg3=mem
+ {name: "XORLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "XORL", scale: 4, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+4*arg1+auxint+aux) ^= arg2, arg3=mem
+ {name: "XORLmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "XORL", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+auxint+aux) ^= arg2, arg3=mem
+
+ {name: "ADDQconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ADDQ", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+ValAndOff(AuxInt).Off()+aux) += ValAndOff(AuxInt).Val(), arg2=mem
+ {name: "ADDQconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ADDQ", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+ValAndOff(AuxInt).Off()+aux) += ValAndOff(AuxInt).Val(), arg2=mem
+ {name: "ANDQconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ANDQ", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+ValAndOff(AuxInt).Off()+aux) &= ValAndOff(AuxInt).Val(), arg2=mem
+ {name: "ANDQconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ANDQ", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+ValAndOff(AuxInt).Off()+aux) &= ValAndOff(AuxInt).Val(), arg2=mem
+ {name: "ORQconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ORQ", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+ValAndOff(AuxInt).Off()+aux) |= ValAndOff(AuxInt).Val(), arg2=mem
+ {name: "ORQconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ORQ", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+ValAndOff(AuxInt).Off()+aux) |= ValAndOff(AuxInt).Val(), arg2=mem
+ {name: "XORQconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "XORQ", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+ValAndOff(AuxInt).Off()+aux) ^= ValAndOff(AuxInt).Val(), arg2=mem
+ {name: "XORQconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "XORQ", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+ValAndOff(AuxInt).Off()+aux) ^= ValAndOff(AuxInt).Val(), arg2=mem
+ {name: "ADDLconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ADDL", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+ValAndOff(AuxInt).Off()+aux) += ValAndOff(AuxInt).Val(), arg2=mem
+ {name: "ADDLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ADDL", scale: 4, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+4*arg1+ValAndOff(AuxInt).Off()+aux) += ValAndOff(AuxInt).Val(), arg2=mem
+ {name: "ADDLconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ADDL", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+ValAndOff(AuxInt).Off()+aux) += ValAndOff(AuxInt).Val(), arg2=mem
+ {name: "ANDLconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ANDL", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+ValAndOff(AuxInt).Off()+aux) &= ValAndOff(AuxInt).Val(), arg2=mem
+ {name: "ANDLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ANDL", scale: 4, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+4*arg1+ValAndOff(AuxInt).Off()+aux) &= ValAndOff(AuxInt).Val(), arg2=mem
+ {name: "ANDLconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ANDL", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+ValAndOff(AuxInt).Off()+aux) &= ValAndOff(AuxInt).Val(), arg2=mem
+ {name: "ORLconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ORL", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+ValAndOff(AuxInt).Off()+aux) |= ValAndOff(AuxInt).Val(), arg2=mem
+ {name: "ORLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ORL", scale: 4, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+4*arg1+ValAndOff(AuxInt).Off()+aux) |= ValAndOff(AuxInt).Val(), arg2=mem
+ {name: "ORLconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ORL", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+ValAndOff(AuxInt).Off()+aux) |= ValAndOff(AuxInt).Val(), arg2=mem
+ {name: "XORLconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "XORL", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+ValAndOff(AuxInt).Off()+aux) ^= ValAndOff(AuxInt).Val(), arg2=mem
+ {name: "XORLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "XORL", scale: 4, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+4*arg1+ValAndOff(AuxInt).Off()+aux) ^= ValAndOff(AuxInt).Val(), arg2=mem
+ {name: "XORLconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "XORL", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+ValAndOff(AuxInt).Off()+aux) ^= ValAndOff(AuxInt).Val(), arg2=mem
+
+ // unary ops
+ {name: "NEGQ", argLength: 1, reg: gp11, asm: "NEGQ", resultInArg0: true, clobberFlags: true}, // -arg0
+ {name: "NEGL", argLength: 1, reg: gp11, asm: "NEGL", resultInArg0: true, clobberFlags: true}, // -arg0
+
+ {name: "NOTQ", argLength: 1, reg: gp11, asm: "NOTQ", resultInArg0: true, clobberFlags: true}, // ^arg0
+ {name: "NOTL", argLength: 1, reg: gp11, asm: "NOTL", resultInArg0: true, clobberFlags: true}, // ^arg0
+
+ // BS{F,R}Q returns a tuple [result, flags]
+ // result is undefined if the input is zero.
+ // flags are set to "equal" if the input is zero, "not equal" otherwise.
+ // BS{F,R}L returns only the result.
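+ // For example, BSFQ(0x12) = (1, ne) and BSRQ(0x12) = (4, ne): the indexes of the lowest
+ // and highest set bits; math/bits.TrailingZeros64 and bits.Len64 are lowered via these.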
+ {name: "BSFQ", argLength: 1, reg: gp11flags, asm: "BSFQ", typ: "(UInt64,Flags)"}, // # of low-order zeroes in 64-bit arg
+ {name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL", typ: "UInt32", clobberFlags: true}, // # of low-order zeroes in 32-bit arg
+ {name: "BSRQ", argLength: 1, reg: gp11flags, asm: "BSRQ", typ: "(UInt64,Flags)"}, // index of the highest set bit in 64-bit arg
+ {name: "BSRL", argLength: 1, reg: gp11, asm: "BSRL", typ: "UInt32", clobberFlags: true}, // index of the highest set bit in 32-bit arg
+
+ // CMOV instructions: 64, 32 and 16-bit sizes.
+ // if arg2 encodes a true result, return arg1, else arg0
+ {name: "CMOVQEQ", argLength: 3, reg: gp21, asm: "CMOVQEQ", resultInArg0: true},
+ {name: "CMOVQNE", argLength: 3, reg: gp21, asm: "CMOVQNE", resultInArg0: true},
+ {name: "CMOVQLT", argLength: 3, reg: gp21, asm: "CMOVQLT", resultInArg0: true},
+ {name: "CMOVQGT", argLength: 3, reg: gp21, asm: "CMOVQGT", resultInArg0: true},
+ {name: "CMOVQLE", argLength: 3, reg: gp21, asm: "CMOVQLE", resultInArg0: true},
+ {name: "CMOVQGE", argLength: 3, reg: gp21, asm: "CMOVQGE", resultInArg0: true},
+ {name: "CMOVQLS", argLength: 3, reg: gp21, asm: "CMOVQLS", resultInArg0: true},
+ {name: "CMOVQHI", argLength: 3, reg: gp21, asm: "CMOVQHI", resultInArg0: true},
+ {name: "CMOVQCC", argLength: 3, reg: gp21, asm: "CMOVQCC", resultInArg0: true},
+ {name: "CMOVQCS", argLength: 3, reg: gp21, asm: "CMOVQCS", resultInArg0: true},
+
+ {name: "CMOVLEQ", argLength: 3, reg: gp21, asm: "CMOVLEQ", resultInArg0: true},
+ {name: "CMOVLNE", argLength: 3, reg: gp21, asm: "CMOVLNE", resultInArg0: true},
+ {name: "CMOVLLT", argLength: 3, reg: gp21, asm: "CMOVLLT", resultInArg0: true},
+ {name: "CMOVLGT", argLength: 3, reg: gp21, asm: "CMOVLGT", resultInArg0: true},
+ {name: "CMOVLLE", argLength: 3, reg: gp21, asm: "CMOVLLE", resultInArg0: true},
+ {name: "CMOVLGE", argLength: 3, reg: gp21, asm: "CMOVLGE", resultInArg0: true},
+ {name: "CMOVLLS", argLength: 3, reg: gp21, asm: "CMOVLLS", resultInArg0: true},
+ {name: "CMOVLHI", argLength: 3, reg: gp21, asm: "CMOVLHI", resultInArg0: true},
+ {name: "CMOVLCC", argLength: 3, reg: gp21, asm: "CMOVLCC", resultInArg0: true},
+ {name: "CMOVLCS", argLength: 3, reg: gp21, asm: "CMOVLCS", resultInArg0: true},
+
+ {name: "CMOVWEQ", argLength: 3, reg: gp21, asm: "CMOVWEQ", resultInArg0: true},
+ {name: "CMOVWNE", argLength: 3, reg: gp21, asm: "CMOVWNE", resultInArg0: true},
+ {name: "CMOVWLT", argLength: 3, reg: gp21, asm: "CMOVWLT", resultInArg0: true},
+ {name: "CMOVWGT", argLength: 3, reg: gp21, asm: "CMOVWGT", resultInArg0: true},
+ {name: "CMOVWLE", argLength: 3, reg: gp21, asm: "CMOVWLE", resultInArg0: true},
+ {name: "CMOVWGE", argLength: 3, reg: gp21, asm: "CMOVWGE", resultInArg0: true},
+ {name: "CMOVWLS", argLength: 3, reg: gp21, asm: "CMOVWLS", resultInArg0: true},
+ {name: "CMOVWHI", argLength: 3, reg: gp21, asm: "CMOVWHI", resultInArg0: true},
+ {name: "CMOVWCC", argLength: 3, reg: gp21, asm: "CMOVWCC", resultInArg0: true},
+ {name: "CMOVWCS", argLength: 3, reg: gp21, asm: "CMOVWCS", resultInArg0: true},
+
+ // CMOV with floating point instructions. We need separate pseudo-op to handle
+ // InvertFlags correctly, and to generate special code that handles NaN (unordered flag).
+ // NOTE: the fact that CMOV*EQF here is marked to generate CMOV*NE is not a bug. See
+ // code generation in amd64/ssa.go.
+ {name: "CMOVQEQF", argLength: 3, reg: gp21pax, asm: "CMOVQNE", resultInArg0: true},
+ {name: "CMOVQNEF", argLength: 3, reg: gp21, asm: "CMOVQNE", resultInArg0: true},
+ {name: "CMOVQGTF", argLength: 3, reg: gp21, asm: "CMOVQHI", resultInArg0: true},
+ {name: "CMOVQGEF", argLength: 3, reg: gp21, asm: "CMOVQCC", resultInArg0: true},
+ {name: "CMOVLEQF", argLength: 3, reg: gp21pax, asm: "CMOVLNE", resultInArg0: true},
+ {name: "CMOVLNEF", argLength: 3, reg: gp21, asm: "CMOVLNE", resultInArg0: true},
+ {name: "CMOVLGTF", argLength: 3, reg: gp21, asm: "CMOVLHI", resultInArg0: true},
+ {name: "CMOVLGEF", argLength: 3, reg: gp21, asm: "CMOVLCC", resultInArg0: true},
+ {name: "CMOVWEQF", argLength: 3, reg: gp21pax, asm: "CMOVWNE", resultInArg0: true},
+ {name: "CMOVWNEF", argLength: 3, reg: gp21, asm: "CMOVWNE", resultInArg0: true},
+ {name: "CMOVWGTF", argLength: 3, reg: gp21, asm: "CMOVWHI", resultInArg0: true},
+ {name: "CMOVWGEF", argLength: 3, reg: gp21, asm: "CMOVWCC", resultInArg0: true},
+
+ {name: "BSWAPQ", argLength: 1, reg: gp11, asm: "BSWAPQ", resultInArg0: true, clobberFlags: true}, // arg0 swap bytes
+ {name: "BSWAPL", argLength: 1, reg: gp11, asm: "BSWAPL", resultInArg0: true, clobberFlags: true}, // arg0 swap bytes
+
+ // POPCNT instructions aren't guaranteed to be on the target platform (they are SSE4).
+ // Any use must be preceded by a successful check of runtime.x86HasPOPCNT.
+ {name: "POPCNTQ", argLength: 1, reg: gp11, asm: "POPCNTQ", clobberFlags: true}, // count number of set bits in arg0
+ {name: "POPCNTL", argLength: 1, reg: gp11, asm: "POPCNTL", clobberFlags: true}, // count number of set bits in arg0
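+ // E.g. math/bits.OnesCount64 is compiled as a branch on runtime.x86HasPOPCNT that
+ // selects either POPCNTQ or a generic fallback.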
+
+ {name: "SQRTSD", argLength: 1, reg: fp11, asm: "SQRTSD"}, // sqrt(arg0)
+
+ // ROUNDSD instruction isn't guaranteed to be on the target platform (it is SSE4.1)
+ // Any use must be preceded by a successful check of runtime.x86HasSSE41.
+ {name: "ROUNDSD", argLength: 1, reg: fp11, aux: "Int8", asm: "ROUNDSD"}, // rounds arg0 depending on auxint, 1 means math.Floor, 2 Ceil, 3 Trunc
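+ // An auxint of 0 rounds to nearest even (math.RoundToEven); the auxint is passed
+ // through as the SSE4.1 rounding-mode immediate.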
+
+ // VFMADD231SD only exists on platforms with the FMA3 instruction set.
+ // Any use must be preceded by a successful check of runtime.support_fma.
+ {name: "VFMADD231SD", argLength: 3, reg: fp31, resultInArg0: true, asm: "VFMADD231SD"},
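+ // Roughly, it computes arg1*arg2 + arg0 with the result overwriting arg0; math.FMA
+ // is lowered through this op once that check passes.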
+
+ {name: "SBBQcarrymask", argLength: 1, reg: flagsgp, asm: "SBBQ"}, // (int64)(-1) if carry is set, 0 if carry is clear.
+ {name: "SBBLcarrymask", argLength: 1, reg: flagsgp, asm: "SBBL"}, // (int32)(-1) if carry is set, 0 if carry is clear.
+ // Note: SBBW and SBBB are subsumed by SBBL
+
+ {name: "SETEQ", argLength: 1, reg: readflags, asm: "SETEQ"}, // extract == condition from arg0
+ {name: "SETNE", argLength: 1, reg: readflags, asm: "SETNE"}, // extract != condition from arg0
+ {name: "SETL", argLength: 1, reg: readflags, asm: "SETLT"}, // extract signed < condition from arg0
+ {name: "SETLE", argLength: 1, reg: readflags, asm: "SETLE"}, // extract signed <= condition from arg0
+ {name: "SETG", argLength: 1, reg: readflags, asm: "SETGT"}, // extract signed > condition from arg0
+ {name: "SETGE", argLength: 1, reg: readflags, asm: "SETGE"}, // extract signed >= condition from arg0
+ {name: "SETB", argLength: 1, reg: readflags, asm: "SETCS"}, // extract unsigned < condition from arg0
+ {name: "SETBE", argLength: 1, reg: readflags, asm: "SETLS"}, // extract unsigned <= condition from arg0
+ {name: "SETA", argLength: 1, reg: readflags, asm: "SETHI"}, // extract unsigned > condition from arg0
+ {name: "SETAE", argLength: 1, reg: readflags, asm: "SETCC"}, // extract unsigned >= condition from arg0
+ {name: "SETO", argLength: 1, reg: readflags, asm: "SETOS"}, // extract if overflow flag is set from arg0
+ // Variants that store result to memory
+ {name: "SETEQstore", argLength: 3, reg: gpstoreconst, asm: "SETEQ", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract == condition from arg1 to arg0+auxint+aux, arg2=mem
+ {name: "SETNEstore", argLength: 3, reg: gpstoreconst, asm: "SETNE", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract != condition from arg1 to arg0+auxint+aux, arg2=mem
+ {name: "SETLstore", argLength: 3, reg: gpstoreconst, asm: "SETLT", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract signed < condition from arg1 to arg0+auxint+aux, arg2=mem
+ {name: "SETLEstore", argLength: 3, reg: gpstoreconst, asm: "SETLE", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract signed <= condition from arg1 to arg0+auxint+aux, arg2=mem
+ {name: "SETGstore", argLength: 3, reg: gpstoreconst, asm: "SETGT", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract signed > condition from arg1 to arg0+auxint+aux, arg2=mem
+ {name: "SETGEstore", argLength: 3, reg: gpstoreconst, asm: "SETGE", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract signed >= condition from arg1 to arg0+auxint+aux, arg2=mem
+ {name: "SETBstore", argLength: 3, reg: gpstoreconst, asm: "SETCS", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract unsigned < condition from arg1 to arg0+auxint+aux, arg2=mem
+ {name: "SETBEstore", argLength: 3, reg: gpstoreconst, asm: "SETLS", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract unsigned <= condition from arg1 to arg0+auxint+aux, arg2=mem
+ {name: "SETAstore", argLength: 3, reg: gpstoreconst, asm: "SETHI", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract unsigned > condition from arg1 to arg0+auxint+aux, arg2=mem
+ {name: "SETAEstore", argLength: 3, reg: gpstoreconst, asm: "SETCC", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract unsigned >= condition from arg1 to arg0+auxint+aux, arg2=mem
+ // Need different opcodes for floating point conditions because
+ // any comparison involving a NaN is always FALSE and thus
+ // the patterns for inverting conditions cannot be used.
+ {name: "SETEQF", argLength: 1, reg: flagsgpax, asm: "SETEQ", clobberFlags: true}, // extract == condition from arg0
+ {name: "SETNEF", argLength: 1, reg: flagsgpax, asm: "SETNE", clobberFlags: true}, // extract != condition from arg0
+ {name: "SETORD", argLength: 1, reg: flagsgp, asm: "SETPC"}, // extract "ordered" (no NaN present) condition from arg0
+ {name: "SETNAN", argLength: 1, reg: flagsgp, asm: "SETPS"}, // extract "unordered" (NaN present) condition from arg0
+
+ {name: "SETGF", argLength: 1, reg: flagsgp, asm: "SETHI"}, // extract floating > condition from arg0
+ {name: "SETGEF", argLength: 1, reg: flagsgp, asm: "SETCC"}, // extract floating >= condition from arg0
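+ // E.g. for a floating-point x > y, an unordered UCOMISD sets ZF, PF and CF, so SETHI
+ // (CF==0 && ZF==0) correctly yields false when either operand is NaN; inverting a <=
+ // test instead would wrongly report true for NaN operands.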
+
+ {name: "MOVBQSX", argLength: 1, reg: gp11, asm: "MOVBQSX"}, // sign extend arg0 from int8 to int64
+ {name: "MOVBQZX", argLength: 1, reg: gp11, asm: "MOVBLZX"}, // zero extend arg0 from int8 to int64
+ {name: "MOVWQSX", argLength: 1, reg: gp11, asm: "MOVWQSX"}, // sign extend arg0 from int16 to int64
+ {name: "MOVWQZX", argLength: 1, reg: gp11, asm: "MOVWLZX"}, // zero extend arg0 from int16 to int64
+ {name: "MOVLQSX", argLength: 1, reg: gp11, asm: "MOVLQSX"}, // sign extend arg0 from int32 to int64
+ {name: "MOVLQZX", argLength: 1, reg: gp11, asm: "MOVL"}, // zero extend arg0 from int32 to int64
+
+ {name: "MOVLconst", reg: gp01, asm: "MOVL", typ: "UInt32", aux: "Int32", rematerializeable: true}, // 32 low bits of auxint
+ {name: "MOVQconst", reg: gp01, asm: "MOVQ", typ: "UInt64", aux: "Int64", rematerializeable: true}, // auxint
+
+ {name: "CVTTSD2SL", argLength: 1, reg: fpgp, asm: "CVTTSD2SL"}, // convert float64 to int32
+ {name: "CVTTSD2SQ", argLength: 1, reg: fpgp, asm: "CVTTSD2SQ"}, // convert float64 to int64
+ {name: "CVTTSS2SL", argLength: 1, reg: fpgp, asm: "CVTTSS2SL"}, // convert float32 to int32
+ {name: "CVTTSS2SQ", argLength: 1, reg: fpgp, asm: "CVTTSS2SQ"}, // convert float32 to int64
+ {name: "CVTSL2SS", argLength: 1, reg: gpfp, asm: "CVTSL2SS"}, // convert int32 to float32
+ {name: "CVTSL2SD", argLength: 1, reg: gpfp, asm: "CVTSL2SD"}, // convert int32 to float64
+ {name: "CVTSQ2SS", argLength: 1, reg: gpfp, asm: "CVTSQ2SS"}, // convert int64 to float32
+ {name: "CVTSQ2SD", argLength: 1, reg: gpfp, asm: "CVTSQ2SD"}, // convert int64 to float64
+ {name: "CVTSD2SS", argLength: 1, reg: fp11, asm: "CVTSD2SS"}, // convert float64 to float32
+ {name: "CVTSS2SD", argLength: 1, reg: fp11, asm: "CVTSS2SD"}, // convert float32 to float64
+
+ // Move values between int and float registers, with no conversion.
+ // TODO: should we have generic versions of these?
+ {name: "MOVQi2f", argLength: 1, reg: gpfp, typ: "Float64"}, // move 64 bits from int to float reg
+ {name: "MOVQf2i", argLength: 1, reg: fpgp, typ: "UInt64"}, // move 64 bits from float to int reg
+ {name: "MOVLi2f", argLength: 1, reg: gpfp, typ: "Float32"}, // move 32 bits from int to float reg
+ {name: "MOVLf2i", argLength: 1, reg: fpgp, typ: "UInt32"}, // move 32 bits from float to int reg, zero extend
+
+ {name: "PXOR", argLength: 2, reg: fp21, asm: "PXOR", commutative: true, resultInArg0: true}, // exclusive or, applied to X regs for float negation.
+
+ {name: "LEAQ", argLength: 1, reg: gp11sb, asm: "LEAQ", aux: "SymOff", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxint + offset encoded in aux
+ {name: "LEAL", argLength: 1, reg: gp11sb, asm: "LEAL", aux: "SymOff", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxint + offset encoded in aux
+ {name: "LEAW", argLength: 1, reg: gp11sb, asm: "LEAW", aux: "SymOff", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxint + offset encoded in aux
+ {name: "LEAQ1", argLength: 2, reg: gp21sb, asm: "LEAQ", scale: 1, commutative: true, aux: "SymOff", symEffect: "Addr"}, // arg0 + arg1 + auxint + aux
+ {name: "LEAL1", argLength: 2, reg: gp21sb, asm: "LEAL", scale: 1, commutative: true, aux: "SymOff", symEffect: "Addr"}, // arg0 + arg1 + auxint + aux
+ {name: "LEAW1", argLength: 2, reg: gp21sb, asm: "LEAW", scale: 1, commutative: true, aux: "SymOff", symEffect: "Addr"}, // arg0 + arg1 + auxint + aux
+ {name: "LEAQ2", argLength: 2, reg: gp21sb, asm: "LEAQ", scale: 2, aux: "SymOff", symEffect: "Addr"}, // arg0 + 2*arg1 + auxint + aux
+ {name: "LEAL2", argLength: 2, reg: gp21sb, asm: "LEAL", scale: 2, aux: "SymOff", symEffect: "Addr"}, // arg0 + 2*arg1 + auxint + aux
+ {name: "LEAW2", argLength: 2, reg: gp21sb, asm: "LEAW", scale: 2, aux: "SymOff", symEffect: "Addr"}, // arg0 + 2*arg1 + auxint + aux
+ {name: "LEAQ4", argLength: 2, reg: gp21sb, asm: "LEAQ", scale: 4, aux: "SymOff", symEffect: "Addr"}, // arg0 + 4*arg1 + auxint + aux
+ {name: "LEAL4", argLength: 2, reg: gp21sb, asm: "LEAL", scale: 4, aux: "SymOff", symEffect: "Addr"}, // arg0 + 4*arg1 + auxint + aux
+ {name: "LEAW4", argLength: 2, reg: gp21sb, asm: "LEAW", scale: 4, aux: "SymOff", symEffect: "Addr"}, // arg0 + 4*arg1 + auxint + aux
+ {name: "LEAQ8", argLength: 2, reg: gp21sb, asm: "LEAQ", scale: 8, aux: "SymOff", symEffect: "Addr"}, // arg0 + 8*arg1 + auxint + aux
+ {name: "LEAL8", argLength: 2, reg: gp21sb, asm: "LEAL", scale: 8, aux: "SymOff", symEffect: "Addr"}, // arg0 + 8*arg1 + auxint + aux
+ {name: "LEAW8", argLength: 2, reg: gp21sb, asm: "LEAW", scale: 8, aux: "SymOff", symEffect: "Addr"}, // arg0 + 8*arg1 + auxint + aux
+ // Note: LEAx{1,2,4,8} must not have OpSB as either argument.
+
+ // auxint+aux == add auxint and the offset of the symbol in aux (if any) to the effective address
+ {name: "MOVBload", argLength: 2, reg: gpload, asm: "MOVBLZX", aux: "SymOff", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load byte from arg0+auxint+aux. arg1=mem. Zero extend.
+ {name: "MOVBQSXload", argLength: 2, reg: gpload, asm: "MOVBQSX", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // ditto, sign extend to int64
+ {name: "MOVWload", argLength: 2, reg: gpload, asm: "MOVWLZX", aux: "SymOff", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load 2 bytes from arg0+auxint+aux. arg1=mem. Zero extend.
+ {name: "MOVWQSXload", argLength: 2, reg: gpload, asm: "MOVWQSX", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // ditto, sign extend to int64
+ {name: "MOVLload", argLength: 2, reg: gpload, asm: "MOVL", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load 4 bytes from arg0+auxint+aux. arg1=mem. Zero extend.
+ {name: "MOVLQSXload", argLength: 2, reg: gpload, asm: "MOVLQSX", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // ditto, sign extend to int64
+ {name: "MOVQload", argLength: 2, reg: gpload, asm: "MOVQ", aux: "SymOff", typ: "UInt64", faultOnNilArg0: true, symEffect: "Read"}, // load 8 bytes from arg0+auxint+aux. arg1=mem
+ {name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store byte in arg1 to arg0+auxint+aux. arg2=mem
+ {name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem
+ {name: "MOVLstore", argLength: 3, reg: gpstore, asm: "MOVL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem
+ {name: "MOVQstore", argLength: 3, reg: gpstore, asm: "MOVQ", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes in arg1 to arg0+auxint+aux. arg2=mem
+ {name: "MOVOload", argLength: 2, reg: fpload, asm: "MOVUPS", aux: "SymOff", typ: "Int128", faultOnNilArg0: true, symEffect: "Read"}, // load 16 bytes from arg0+auxint+aux. arg1=mem
+ {name: "MOVOstore", argLength: 3, reg: fpstore, asm: "MOVUPS", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 16 bytes in arg1 to arg0+auxint+aux. arg2=mem
+
+ // indexed loads/stores
+ {name: "MOVBloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVBLZX", scale: 1, aux: "SymOff", typ: "UInt8", symEffect: "Read"}, // load a byte from arg0+arg1+auxint+aux. arg2=mem
+ {name: "MOVWloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVWLZX", scale: 1, aux: "SymOff", typ: "UInt16", symEffect: "Read"}, // load 2 bytes from arg0+arg1+auxint+aux. arg2=mem
+ {name: "MOVWloadidx2", argLength: 3, reg: gploadidx, asm: "MOVWLZX", scale: 2, aux: "SymOff", typ: "UInt16", symEffect: "Read"}, // load 2 bytes from arg0+2*arg1+auxint+aux. arg2=mem
+ {name: "MOVLloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVL", scale: 1, aux: "SymOff", typ: "UInt32", symEffect: "Read"}, // load 4 bytes from arg0+arg1+auxint+aux. arg2=mem
+ {name: "MOVLloadidx4", argLength: 3, reg: gploadidx, asm: "MOVL", scale: 4, aux: "SymOff", typ: "UInt32", symEffect: "Read"}, // load 4 bytes from arg0+4*arg1+auxint+aux. arg2=mem
+ {name: "MOVLloadidx8", argLength: 3, reg: gploadidx, asm: "MOVL", scale: 8, aux: "SymOff", typ: "UInt32", symEffect: "Read"}, // load 4 bytes from arg0+8*arg1+auxint+aux. arg2=mem
+ {name: "MOVQloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVQ", scale: 1, aux: "SymOff", typ: "UInt64", symEffect: "Read"}, // load 8 bytes from arg0+arg1+auxint+aux. arg2=mem
+ {name: "MOVQloadidx8", argLength: 3, reg: gploadidx, asm: "MOVQ", scale: 8, aux: "SymOff", typ: "UInt64", symEffect: "Read"}, // load 8 bytes from arg0+8*arg1+auxint+aux. arg2=mem
+ // TODO: sign-extending indexed loads
+ {name: "MOVBstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVB", scale: 1, aux: "SymOff", symEffect: "Write"}, // store byte in arg2 to arg0+arg1+auxint+aux. arg3=mem
+ {name: "MOVWstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVW", scale: 1, aux: "SymOff", symEffect: "Write"}, // store 2 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
+ {name: "MOVWstoreidx2", argLength: 4, reg: gpstoreidx, asm: "MOVW", scale: 2, aux: "SymOff", symEffect: "Write"}, // store 2 bytes in arg2 to arg0+2*arg1+auxint+aux. arg3=mem
+ {name: "MOVLstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVL", scale: 1, aux: "SymOff", symEffect: "Write"}, // store 4 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
+ {name: "MOVLstoreidx4", argLength: 4, reg: gpstoreidx, asm: "MOVL", scale: 4, aux: "SymOff", symEffect: "Write"}, // store 4 bytes in arg2 to arg0+4*arg1+auxint+aux. arg3=mem
+ {name: "MOVLstoreidx8", argLength: 4, reg: gpstoreidx, asm: "MOVL", scale: 8, aux: "SymOff", symEffect: "Write"}, // store 4 bytes in arg2 to arg0+8*arg1+auxint+aux. arg3=mem
+ {name: "MOVQstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVQ", scale: 1, aux: "SymOff", symEffect: "Write"}, // store 8 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
+ {name: "MOVQstoreidx8", argLength: 4, reg: gpstoreidx, asm: "MOVQ", scale: 8, aux: "SymOff", symEffect: "Write"}, // store 8 bytes in arg2 to arg0+8*arg1+auxint+aux. arg3=mem
+ // TODO: add size-mismatched indexed loads, like MOVBstoreidx4.
+
+ // For storeconst ops, the AuxInt field encodes both
+ // the value to store and an address offset of the store.
+ // Cast AuxInt to a ValAndOff to extract Val and Off fields.
+ {name: "MOVBstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVB", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low byte of ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux. arg1=mem
+ {name: "MOVWstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVW", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low 2 bytes of ...
+ {name: "MOVLstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVL", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low 4 bytes of ...
+ {name: "MOVQstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVQ", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of ...
+
+ {name: "MOVBstoreconstidx1", argLength: 3, reg: gpstoreconstidx, commutative: true, asm: "MOVB", scale: 1, aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low byte of ValAndOff(AuxInt).Val() to arg0+1*arg1+ValAndOff(AuxInt).Off()+aux. arg2=mem
+ {name: "MOVWstoreconstidx1", argLength: 3, reg: gpstoreconstidx, commutative: true, asm: "MOVW", scale: 1, aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low 2 bytes of ... arg1 ...
+ {name: "MOVWstoreconstidx2", argLength: 3, reg: gpstoreconstidx, asm: "MOVW", scale: 2, aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low 2 bytes of ... 2*arg1 ...
+ {name: "MOVLstoreconstidx1", argLength: 3, reg: gpstoreconstidx, commutative: true, asm: "MOVL", scale: 1, aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low 4 bytes of ... arg1 ...
+ {name: "MOVLstoreconstidx4", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", scale: 4, aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low 4 bytes of ... 4*arg1 ...
+ {name: "MOVQstoreconstidx1", argLength: 3, reg: gpstoreconstidx, commutative: true, asm: "MOVQ", scale: 1, aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store 8 bytes of ... arg1 ...
+ {name: "MOVQstoreconstidx8", argLength: 3, reg: gpstoreconstidx, asm: "MOVQ", scale: 8, aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store 8 bytes of ... 8*arg1 ...
+
+ // arg0 = pointer to start of memory to zero
+ // arg1 = value to store (will always be zero)
+ // arg2 = mem
+ // auxint = # of bytes to zero
+ // returns mem
+ {
+ name: "DUFFZERO",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("DI"), buildReg("X0")},
+ clobbers: buildReg("DI"),
+ },
+ faultOnNilArg0: true,
+		unsafePoint: true, // FP maintenance around DUFFZERO can be clobbered by interrupts
+ },
+ {name: "MOVOconst", reg: regInfo{nil, 0, []regMask{fp}}, typ: "Int128", aux: "Int128", rematerializeable: true},
+
+ // arg0 = address of memory to zero
+ // arg1 = # of 8-byte words to zero
+ // arg2 = value to store (will always be zero)
+ // arg3 = mem
+ // returns mem
+ {
+ name: "REPSTOSQ",
+ argLength: 4,
+ reg: regInfo{
+ inputs: []regMask{buildReg("DI"), buildReg("CX"), buildReg("AX")},
+ clobbers: buildReg("DI CX"),
+ },
+ faultOnNilArg0: true,
+ },
+
+ {name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gpsp, buildReg("DX"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+ {name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+
+ // arg0 = destination pointer
+ // arg1 = source pointer
+ // arg2 = mem
+ // auxint = # of bytes to copy, must be multiple of 16
+ // returns memory
+ {
+ name: "DUFFCOPY",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("DI"), buildReg("SI")},
+ clobbers: buildReg("DI SI X0"), // uses X0 as a temporary
+ },
+ clobberFlags: true,
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts
+ },
+
+ // arg0 = destination pointer
+ // arg1 = source pointer
+ // arg2 = # of 8-byte words to copy
+ // arg3 = mem
+ // returns memory
+ {
+ name: "REPMOVSQ",
+ argLength: 4,
+ reg: regInfo{
+ inputs: []regMask{buildReg("DI"), buildReg("SI"), buildReg("CX")},
+ clobbers: buildReg("DI SI CX"),
+ },
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // (InvertFlags (CMPQ a b)) == (CMPQ b a)
+ // So if we want (SETL (CMPQ a b)) but we can't do that because a is a constant,
+ // then we do (SETL (InvertFlags (CMPQ b a))) instead.
+ // Rewrites will convert this to (SETG (CMPQ b a)).
+ // InvertFlags is a pseudo-op which can't appear in assembly output.
+ {name: "InvertFlags", argLength: 1}, // reverse direction of arg0
+
+ // Pseudo-ops
+ {name: "LoweredGetG", argLength: 1, reg: gp01}, // arg0=mem
+ // Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
+ // and sorts it to the very beginning of the block to prevent other
+ // use of DX (the closure pointer)
+ {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("DX")}}, zeroWidth: true},
+ // LoweredGetCallerPC evaluates to the PC to which its "caller" will return.
+	// I.e., if f calls g and g calls getcallerpc,
+ // the result should be the PC within f that g will return to.
+ // See runtime/stubs.go for a more detailed discussion.
+ {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true},
+ // LoweredGetCallerSP returns the SP of the caller of the current function.
+ {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true},
+	// arg0=ptr, arg1=mem, returns void. Faults if ptr is nil.
+ {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpsp}}, clobberFlags: true, nilCheck: true, faultOnNilArg0: true},
+ // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+ // It saves all GP registers if necessary, but may clobber others.
+ {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("DI"), buildReg("AX CX DX BX BP SI R8 R9")}, clobbers: callerSave &^ gp}, clobberFlags: true, aux: "Sym", symEffect: "None"},
+
+ {name: "LoweredHasCPUFeature", argLength: 0, reg: gp01, rematerializeable: true, typ: "UInt64", aux: "Sym", symEffect: "None"},
+
+ // There are three of these functions so that they can have three different register inputs.
+ // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
+ // default registers to match so we don't need to copy registers around unnecessarily.
+ {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{dx, bx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
+ {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{cx, dx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
+ {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{ax, cx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
+
+ // Constant flag values. For any comparison, there are 5 possible
+ // outcomes: the three from the signed total order (<,==,>) and the
+ // three from the unsigned total order. The == cases overlap.
+ // Note: there's a sixth "unordered" outcome for floating-point
+ // comparisons, but we don't use such a beast yet.
+ // These ops are for temporary use by rewrite rules. They
+ // cannot appear in the generated assembly.
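+	// For example, comparing the constants 1 and 2 folds to FlagLT_ULT (less in both
+	// the signed and unsigned orders), while comparing -1 with 1 folds to FlagLT_UGT:
+	// signed -1 < 1, but as an unsigned value 0xFFFFFFFFFFFFFFFF is greater than 1.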
+ {name: "FlagEQ"}, // equal
+ {name: "FlagLT_ULT"}, // signed < and unsigned <
+ {name: "FlagLT_UGT"}, // signed < and unsigned >
+ {name: "FlagGT_UGT"}, // signed > and unsigned >
+ {name: "FlagGT_ULT"}, // signed > and unsigned <
+
+ // Atomic loads. These are just normal loads but return <value,memory> tuples
+ // so they can be properly ordered with other loads.
+ // load from arg0+auxint+aux. arg1=mem.
+ {name: "MOVBatomicload", argLength: 2, reg: gpload, asm: "MOVB", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+ {name: "MOVLatomicload", argLength: 2, reg: gpload, asm: "MOVL", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+ {name: "MOVQatomicload", argLength: 2, reg: gpload, asm: "MOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+
+ // Atomic stores and exchanges. Stores use XCHG to get the right memory ordering semantics.
+ // store arg0 to arg1+auxint+aux, arg2=mem.
+ // These ops return a tuple of <old contents of *(arg1+auxint+aux), memory>.
+ // Note: arg0 and arg1 are backwards compared to MOVLstore (to facilitate resultInArg0)!
+ {name: "XCHGB", argLength: 3, reg: gpstorexchg, asm: "XCHGB", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, hasSideEffects: true, symEffect: "RdWr"},
+ {name: "XCHGL", argLength: 3, reg: gpstorexchg, asm: "XCHGL", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, hasSideEffects: true, symEffect: "RdWr"},
+ {name: "XCHGQ", argLength: 3, reg: gpstorexchg, asm: "XCHGQ", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, hasSideEffects: true, symEffect: "RdWr"},
+
+ // Atomic adds.
+ // *(arg1+auxint+aux) += arg0. arg2=mem.
+ // Returns a tuple of <old contents of *(arg1+auxint+aux), memory>.
+ // Note: arg0 and arg1 are backwards compared to MOVLstore (to facilitate resultInArg0)!
+ {name: "XADDLlock", argLength: 3, reg: gpstorexchg, asm: "XADDL", typ: "(UInt32,Mem)", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, hasSideEffects: true, symEffect: "RdWr"},
+ {name: "XADDQlock", argLength: 3, reg: gpstorexchg, asm: "XADDQ", typ: "(UInt64,Mem)", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, hasSideEffects: true, symEffect: "RdWr"},
+ {name: "AddTupleFirst32", argLength: 2}, // arg1=tuple <x,y>. Returns <x+arg0,y>.
+ {name: "AddTupleFirst64", argLength: 2}, // arg1=tuple <x,y>. Returns <x+arg0,y>.
+
+ // Compare and swap.
+ // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory.
+ // if *(arg0+auxint+aux) == arg1 {
+ // *(arg0+auxint+aux) = arg2
+ // return (true, memory)
+ // } else {
+ // return (false, memory)
+ // }
+ // Note that these instructions also return the old value in AX, but we ignore it.
+ // TODO: have these return flags instead of bool. The current system generates:
+ // CMPXCHGQ ...
+ // SETEQ AX
+ // CMPB AX, $0
+ // JNE ...
+ // instead of just
+ // CMPXCHGQ ...
+ // JEQ ...
+ // but we can't do that because memory-using ops can't generate flags yet
+ // (flagalloc wants to move flag-generating instructions around).
+ {name: "CMPXCHGLlock", argLength: 4, reg: cmpxchg, asm: "CMPXCHGL", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"},
+ {name: "CMPXCHGQlock", argLength: 4, reg: cmpxchg, asm: "CMPXCHGQ", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"},
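+	// As an illustration, an intrinsified call such as sync/atomic.CompareAndSwapInt32
+	// is what typically lowers to CMPXCHGLlock (LOCK CMPXCHGL) here.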
+
+ // Atomic memory updates.
+ {name: "ANDBlock", argLength: 3, reg: gpstore, asm: "ANDB", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) &= arg1
+ {name: "ANDLlock", argLength: 3, reg: gpstore, asm: "ANDL", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) &= arg1
+ {name: "ORBlock", argLength: 3, reg: gpstore, asm: "ORB", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) |= arg1
+ {name: "ORLlock", argLength: 3, reg: gpstore, asm: "ORL", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) |= arg1
+ }
+
+ var AMD64blocks = []blockData{
+ {name: "EQ", controls: 1},
+ {name: "NE", controls: 1},
+ {name: "LT", controls: 1},
+ {name: "LE", controls: 1},
+ {name: "GT", controls: 1},
+ {name: "GE", controls: 1},
+ {name: "OS", controls: 1},
+ {name: "OC", controls: 1},
+ {name: "ULT", controls: 1},
+ {name: "ULE", controls: 1},
+ {name: "UGT", controls: 1},
+ {name: "UGE", controls: 1},
+ {name: "EQF", controls: 1},
+ {name: "NEF", controls: 1},
+ {name: "ORD", controls: 1}, // FP, ordered comparison (parity zero)
+ {name: "NAN", controls: 1}, // FP, unordered comparison (parity one)
+ }
+
+ archs = append(archs, arch{
+ name: "AMD64",
+ pkg: "cmd/internal/obj/x86",
+ genfile: "../../amd64/ssa.go",
+ ops: AMD64ops,
+ blocks: AMD64blocks,
+ regnames: regNamesAMD64,
+ gpregmask: gp,
+ fpregmask: fp,
+ framepointerreg: int8(num["BP"]),
+ linkreg: -1, // not used
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64splitload.rules b/src/cmd/compile/internal/ssa/gen/AMD64splitload.rules
new file mode 100644
index 0000000..a50d509
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/AMD64splitload.rules
@@ -0,0 +1,45 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file contains rules used by flagalloc and addressingmodes to
+// split a flag-generating merged load op into separate load and op.
+// Unlike with the other rules files, not all of these
+// rules will be applied to all values.
+// Rather, flagalloc will request that rules be applied
+// to a particular problematic value.
+// These are often the exact inverse of rules in AMD64.rules,
+// only with the conditions removed.
+//
+// For addressingmodes, certain single instructions are slower than the two-instruction
+// split generated here (which is different from the inputs to addressingmodes).
+// For example:
+// (CMPBconstload c (ADDQ x y)) -> (CMPBconstloadidx1 c x y) -> (CMPB c (MOVBloadidx1 x y))
+
+(CMP(Q|L|W|B)load {sym} [off] ptr x mem) => (CMP(Q|L|W|B) (MOV(Q|L|W|B)load {sym} [off] ptr mem) x)
+
+(CMP(Q|L|W|B)constload {sym} [vo] ptr mem) && vo.Val() == 0 => (TEST(Q|L|W|B) x:(MOV(Q|L|W|B)load {sym} [vo.Off32()] ptr mem) x)
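+// (A comparison against zero is rewritten to TEST r, r on the loaded value: it sets
+// the same flags as CMP r, $0 and has a shorter encoding.)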
+
+(CMPQconstload {sym} [vo] ptr mem) && vo.Val() != 0 => (CMPQconst (MOVQload {sym} [vo.Off32()] ptr mem) [vo.Val32()])
+(CMPLconstload {sym} [vo] ptr mem) && vo.Val() != 0 => (CMPLconst (MOVLload {sym} [vo.Off32()] ptr mem) [vo.Val32()])
+(CMPWconstload {sym} [vo] ptr mem) && vo.Val() != 0 => (CMPWconst (MOVWload {sym} [vo.Off32()] ptr mem) [vo.Val16()])
+(CMPBconstload {sym} [vo] ptr mem) && vo.Val() != 0 => (CMPBconst (MOVBload {sym} [vo.Off32()] ptr mem) [vo.Val8()])
+
+(CMP(Q|L|W|B)loadidx1 {sym} [off] ptr idx x mem) => (CMP(Q|L|W|B) (MOV(Q|L|W|B)loadidx1 {sym} [off] ptr idx mem) x)
+(CMPQloadidx8 {sym} [off] ptr idx x mem) => (CMPQ (MOVQloadidx8 {sym} [off] ptr idx mem) x)
+(CMPLloadidx4 {sym} [off] ptr idx x mem) => (CMPL (MOVLloadidx4 {sym} [off] ptr idx mem) x)
+(CMPWloadidx2 {sym} [off] ptr idx x mem) => (CMPW (MOVWloadidx2 {sym} [off] ptr idx mem) x)
+
+(CMP(Q|L|W|B)constloadidx1 {sym} [vo] ptr idx mem) && vo.Val() == 0 => (TEST(Q|L|W|B) x:(MOV(Q|L|W|B)loadidx1 {sym} [vo.Off32()] ptr idx mem) x)
+(CMPQconstloadidx8 {sym} [vo] ptr idx mem) && vo.Val() == 0 => (TESTQ x:(MOVQloadidx8 {sym} [vo.Off32()] ptr idx mem) x)
+(CMPLconstloadidx4 {sym} [vo] ptr idx mem) && vo.Val() == 0 => (TESTL x:(MOVLloadidx4 {sym} [vo.Off32()] ptr idx mem) x)
+(CMPWconstloadidx2 {sym} [vo] ptr idx mem) && vo.Val() == 0 => (TESTW x:(MOVWloadidx2 {sym} [vo.Off32()] ptr idx mem) x)
+
+(CMPQconstloadidx1 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPQconst (MOVQloadidx1 {sym} [vo.Off32()] ptr idx mem) [vo.Val32()])
+(CMPLconstloadidx1 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPLconst (MOVLloadidx1 {sym} [vo.Off32()] ptr idx mem) [vo.Val32()])
+(CMPWconstloadidx1 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPWconst (MOVWloadidx1 {sym} [vo.Off32()] ptr idx mem) [vo.Val16()])
+(CMPBconstloadidx1 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPBconst (MOVBloadidx1 {sym} [vo.Off32()] ptr idx mem) [vo.Val8()])
+
+(CMPQconstloadidx8 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPQconst (MOVQloadidx8 {sym} [vo.Off32()] ptr idx mem) [vo.Val32()])
+(CMPLconstloadidx4 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPLconst (MOVLloadidx4 {sym} [vo.Off32()] ptr idx mem) [vo.Val32()])
+(CMPWconstloadidx2 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPWconst (MOVWloadidx2 {sym} [vo.Off32()] ptr idx mem) [vo.Val16()])
diff --git a/src/cmd/compile/internal/ssa/gen/ARM.rules b/src/cmd/compile/internal/ssa/gen/ARM.rules
new file mode 100644
index 0000000..69989b0
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/ARM.rules
@@ -0,0 +1,1475 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+(Add(Ptr|32|16|8) ...) => (ADD ...)
+(Add(32|64)F ...) => (ADD(F|D) ...)
+(Add32carry ...) => (ADDS ...)
+(Add32withcarry ...) => (ADC ...)
+
+(Sub(Ptr|32|16|8) ...) => (SUB ...)
+(Sub(32|64)F ...) => (SUB(F|D) ...)
+(Sub32carry ...) => (SUBS ...)
+(Sub32withcarry ...) => (SBC ...)
+
+(Mul(32|16|8) ...) => (MUL ...)
+(Mul(32|64)F ...) => (MUL(F|D) ...)
+(Hmul(32|32u) ...) => (HMU(L|LU) ...)
+(Mul32uhilo ...) => (MULLU ...)
+
+(Div32 x y) =>
+ (SUB (XOR <typ.UInt32> // negate the result if one operand is negative
+ (Select0 <typ.UInt32> (CALLudiv
+ (SUB <typ.UInt32> (XOR x <typ.UInt32> (Signmask x)) (Signmask x)) // negate x if negative
+ (SUB <typ.UInt32> (XOR y <typ.UInt32> (Signmask y)) (Signmask y)))) // negate y if negative
+ (Signmask (XOR <typ.UInt32> x y))) (Signmask (XOR <typ.UInt32> x y)))
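+// Sanity check of the (x XOR Signmask(x)) - Signmask(x) absolute-value trick above:
+// for x = -7, Signmask(x) = -1, so x^-1 = 6 and 6 - (-1) = 7. With y = 2 the unsigned
+// divide yields 3, and re-applying the mask for x^y (negative) gives 3^-1 - (-1) = -3,
+// matching Go's truncated division -7/2 = -3.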
+(Div32u x y) => (Select0 <typ.UInt32> (CALLudiv x y))
+(Div16 x y) => (Div32 (SignExt16to32 x) (SignExt16to32 y))
+(Div16u x y) => (Div32u (ZeroExt16to32 x) (ZeroExt16to32 y))
+(Div8 x y) => (Div32 (SignExt8to32 x) (SignExt8to32 y))
+(Div8u x y) => (Div32u (ZeroExt8to32 x) (ZeroExt8to32 y))
+(Div(32|64)F ...) => (DIV(F|D) ...)
+
+(Mod32 x y) =>
+ (SUB (XOR <typ.UInt32> // negate the result if x is negative
+ (Select1 <typ.UInt32> (CALLudiv
+ (SUB <typ.UInt32> (XOR <typ.UInt32> x (Signmask x)) (Signmask x)) // negate x if negative
+ (SUB <typ.UInt32> (XOR <typ.UInt32> y (Signmask y)) (Signmask y)))) // negate y if negative
+ (Signmask x)) (Signmask x))
+(Mod32u x y) => (Select1 <typ.UInt32> (CALLudiv x y))
+(Mod16 x y) => (Mod32 (SignExt16to32 x) (SignExt16to32 y))
+(Mod16u x y) => (Mod32u (ZeroExt16to32 x) (ZeroExt16to32 y))
+(Mod8 x y) => (Mod32 (SignExt8to32 x) (SignExt8to32 y))
+(Mod8u x y) => (Mod32u (ZeroExt8to32 x) (ZeroExt8to32 y))
+
+// (x + y) / 2 with x>=y -> (x - y) / 2 + y
+(Avg32u <t> x y) => (ADD (SRLconst <t> (SUB <t> x y) [1]) y)
+
+(And(32|16|8) ...) => (AND ...)
+(Or(32|16|8) ...) => (OR ...)
+(Xor(32|16|8) ...) => (XOR ...)
+
+// unary ops
+(Neg(32|16|8) x) => (RSBconst [0] x)
+(Neg(32|64)F ...) => (NEG(F|D) ...)
+
+(Com(32|16|8) ...) => (MVN ...)
+
+(Sqrt ...) => (SQRTD ...)
+(Abs ...) => (ABSD ...)
+
+// TODO: optimize this for ARMv5 and ARMv6
+(Ctz32NonZero ...) => (Ctz32 ...)
+(Ctz16NonZero ...) => (Ctz32 ...)
+(Ctz8NonZero ...) => (Ctz32 ...)
+
+// count trailing zero for ARMv5 and ARMv6
+// 32 - CLZ(x&-x - 1)
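+// e.g. x = 8 (binary 1000): x&-x = 8, 8-1 = 7 (binary 0111), CLZ(7) = 29, 32-29 = 3.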
+(Ctz32 <t> x) && objabi.GOARM<=6 =>
+ (RSBconst [32] (CLZ <t> (SUBconst <t> (AND <t> x (RSBconst <t> [0] x)) [1])))
+(Ctz16 <t> x) && objabi.GOARM<=6 =>
+ (RSBconst [32] (CLZ <t> (SUBconst <typ.UInt32> (AND <typ.UInt32> (ORconst <typ.UInt32> [0x10000] x) (RSBconst <typ.UInt32> [0] (ORconst <typ.UInt32> [0x10000] x))) [1])))
+(Ctz8 <t> x) && objabi.GOARM<=6 =>
+ (RSBconst [32] (CLZ <t> (SUBconst <typ.UInt32> (AND <typ.UInt32> (ORconst <typ.UInt32> [0x100] x) (RSBconst <typ.UInt32> [0] (ORconst <typ.UInt32> [0x100] x))) [1])))
+
+// count trailing zero for ARMv7
+(Ctz32 <t> x) && objabi.GOARM==7 => (CLZ <t> (RBIT <t> x))
+(Ctz16 <t> x) && objabi.GOARM==7 => (CLZ <t> (RBIT <typ.UInt32> (ORconst <typ.UInt32> [0x10000] x)))
+(Ctz8 <t> x) && objabi.GOARM==7 => (CLZ <t> (RBIT <typ.UInt32> (ORconst <typ.UInt32> [0x100] x)))
+
+// bit length
+(BitLen32 <t> x) => (RSBconst [32] (CLZ <t> x))
+
+// byte swap for ARMv5
+// let (a, b, c, d) be the bytes of x from high to low
+// t1 = x right rotate 16 bits -- (c, d, a, b )
+// t2 = x ^ t1 -- (a^c, b^d, a^c, b^d)
+// t3 = t2 &^ 0xff0000 -- (a^c, 0, a^c, b^d)
+// t4 = t3 >> 8 -- (0, a^c, 0, a^c)
+// t5 = x right rotate 8 bits -- (d, a, b, c )
+// result = t4 ^ t5 -- (d, c, b, a )
+// using shifted ops this can be done in 4 instructions.
+(Bswap32 <t> x) && objabi.GOARM==5 =>
+ (XOR <t>
+ (SRLconst <t> (BICconst <t> (XOR <t> x (SRRconst <t> [16] x)) [0xff0000]) [8])
+ (SRRconst <t> x [8]))
+
+// byte swap for ARMv6 and above
+(Bswap32 x) && objabi.GOARM>=6 => (REV x)
+
+// boolean ops -- booleans are represented with 0=false, 1=true
+(AndB ...) => (AND ...)
+(OrB ...) => (OR ...)
+(EqB x y) => (XORconst [1] (XOR <typ.Bool> x y))
+(NeqB ...) => (XOR ...)
+(Not x) => (XORconst [1] x)
+
+// shifts
+// hardware instruction uses only the low byte of the shift
+// we compare to 256 to ensure Go semantics for large shifts
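+// (On ARM, register-specified shifts by 32..255 already yield the Go result: 0 for
+// logical shifts, sign fill for arithmetic ones. Only counts >= 256 wrap around in
+// the low byte, hence the CMPconst [256] guard.)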
+(Lsh32x32 x y) => (CMOVWHSconst (SLL <x.Type> x y) (CMPconst [256] y) [0])
+(Lsh32x16 x y) => (CMOVWHSconst (SLL <x.Type> x (ZeroExt16to32 y)) (CMPconst [256] (ZeroExt16to32 y)) [0])
+(Lsh32x8 x y) => (SLL x (ZeroExt8to32 y))
+
+(Lsh16x32 x y) => (CMOVWHSconst (SLL <x.Type> x y) (CMPconst [256] y) [0])
+(Lsh16x16 x y) => (CMOVWHSconst (SLL <x.Type> x (ZeroExt16to32 y)) (CMPconst [256] (ZeroExt16to32 y)) [0])
+(Lsh16x8 x y) => (SLL x (ZeroExt8to32 y))
+
+(Lsh8x32 x y) => (CMOVWHSconst (SLL <x.Type> x y) (CMPconst [256] y) [0])
+(Lsh8x16 x y) => (CMOVWHSconst (SLL <x.Type> x (ZeroExt16to32 y)) (CMPconst [256] (ZeroExt16to32 y)) [0])
+(Lsh8x8 x y) => (SLL x (ZeroExt8to32 y))
+
+(Rsh32Ux32 x y) => (CMOVWHSconst (SRL <x.Type> x y) (CMPconst [256] y) [0])
+(Rsh32Ux16 x y) => (CMOVWHSconst (SRL <x.Type> x (ZeroExt16to32 y)) (CMPconst [256] (ZeroExt16to32 y)) [0])
+(Rsh32Ux8 x y) => (SRL x (ZeroExt8to32 y))
+
+(Rsh16Ux32 x y) => (CMOVWHSconst (SRL <x.Type> (ZeroExt16to32 x) y) (CMPconst [256] y) [0])
+(Rsh16Ux16 x y) => (CMOVWHSconst (SRL <x.Type> (ZeroExt16to32 x) (ZeroExt16to32 y)) (CMPconst [256] (ZeroExt16to32 y)) [0])
+(Rsh16Ux8 x y) => (SRL (ZeroExt16to32 x) (ZeroExt8to32 y))
+
+(Rsh8Ux32 x y) => (CMOVWHSconst (SRL <x.Type> (ZeroExt8to32 x) y) (CMPconst [256] y) [0])
+(Rsh8Ux16 x y) => (CMOVWHSconst (SRL <x.Type> (ZeroExt8to32 x) (ZeroExt16to32 y)) (CMPconst [256] (ZeroExt16to32 y)) [0])
+(Rsh8Ux8 x y) => (SRL (ZeroExt8to32 x) (ZeroExt8to32 y))
+
+(Rsh32x32 x y) => (SRAcond x y (CMPconst [256] y))
+(Rsh32x16 x y) => (SRAcond x (ZeroExt16to32 y) (CMPconst [256] (ZeroExt16to32 y)))
+(Rsh32x8 x y) => (SRA x (ZeroExt8to32 y))
+
+(Rsh16x32 x y) => (SRAcond (SignExt16to32 x) y (CMPconst [256] y))
+(Rsh16x16 x y) => (SRAcond (SignExt16to32 x) (ZeroExt16to32 y) (CMPconst [256] (ZeroExt16to32 y)))
+(Rsh16x8 x y) => (SRA (SignExt16to32 x) (ZeroExt8to32 y))
+
+(Rsh8x32 x y) => (SRAcond (SignExt8to32 x) y (CMPconst [256] y))
+(Rsh8x16 x y) => (SRAcond (SignExt8to32 x) (ZeroExt16to32 y) (CMPconst [256] (ZeroExt16to32 y)))
+(Rsh8x8 x y) => (SRA (SignExt8to32 x) (ZeroExt8to32 y))
+
+// constant shifts
+// generic opt rewrites all constant shifts to shift by Const64
+(Lsh32x64 x (Const64 [c])) && uint64(c) < 32 => (SLLconst x [int32(c)])
+(Rsh32x64 x (Const64 [c])) && uint64(c) < 32 => (SRAconst x [int32(c)])
+(Rsh32Ux64 x (Const64 [c])) && uint64(c) < 32 => (SRLconst x [int32(c)])
+(Lsh16x64 x (Const64 [c])) && uint64(c) < 16 => (SLLconst x [int32(c)])
+(Rsh16x64 x (Const64 [c])) && uint64(c) < 16 => (SRAconst (SLLconst <typ.UInt32> x [16]) [int32(c+16)])
+(Rsh16Ux64 x (Const64 [c])) && uint64(c) < 16 => (SRLconst (SLLconst <typ.UInt32> x [16]) [int32(c+16)])
+(Lsh8x64 x (Const64 [c])) && uint64(c) < 8 => (SLLconst x [int32(c)])
+(Rsh8x64 x (Const64 [c])) && uint64(c) < 8 => (SRAconst (SLLconst <typ.UInt32> x [24]) [int32(c+24)])
+(Rsh8Ux64 x (Const64 [c])) && uint64(c) < 8 => (SRLconst (SLLconst <typ.UInt32> x [24]) [int32(c+24)])
+
+// large constant shifts
+(Lsh32x64 _ (Const64 [c])) && uint64(c) >= 32 => (Const32 [0])
+(Rsh32Ux64 _ (Const64 [c])) && uint64(c) >= 32 => (Const32 [0])
+(Lsh16x64 _ (Const64 [c])) && uint64(c) >= 16 => (Const16 [0])
+(Rsh16Ux64 _ (Const64 [c])) && uint64(c) >= 16 => (Const16 [0])
+(Lsh8x64 _ (Const64 [c])) && uint64(c) >= 8 => (Const8 [0])
+(Rsh8Ux64 _ (Const64 [c])) && uint64(c) >= 8 => (Const8 [0])
+
+// large constant signed right shift, we leave the sign bit
+(Rsh32x64 x (Const64 [c])) && uint64(c) >= 32 => (SRAconst x [31])
+(Rsh16x64 x (Const64 [c])) && uint64(c) >= 16 => (SRAconst (SLLconst <typ.UInt32> x [16]) [31])
+(Rsh8x64 x (Const64 [c])) && uint64(c) >= 8 => (SRAconst (SLLconst <typ.UInt32> x [24]) [31])
+
+// constants
+(Const(8|16|32) [val]) => (MOVWconst [int32(val)])
+(Const(32|64)F [val]) => (MOV(F|D)const [float64(val)])
+(ConstNil) => (MOVWconst [0])
+(ConstBool [b]) => (MOVWconst [b2i32(b)])
+
+// truncations
+// Because we ignore high parts of registers, truncates are just copies.
+(Trunc16to8 ...) => (Copy ...)
+(Trunc32to8 ...) => (Copy ...)
+(Trunc32to16 ...) => (Copy ...)
+
+// Zero-/Sign-extensions
+(ZeroExt8to16 ...) => (MOVBUreg ...)
+(ZeroExt8to32 ...) => (MOVBUreg ...)
+(ZeroExt16to32 ...) => (MOVHUreg ...)
+
+(SignExt8to16 ...) => (MOVBreg ...)
+(SignExt8to32 ...) => (MOVBreg ...)
+(SignExt16to32 ...) => (MOVHreg ...)
+
+(Signmask x) => (SRAconst x [31])
+(Zeromask x) => (SRAconst (RSBshiftRL <typ.Int32> x x [1]) [31]) // sign bit of uint32(x)>>1 - x
+(Slicemask <t> x) => (SRAconst (RSBconst <t> [0] x) [31])
+
+// float <-> int conversion
+(Cvt32to32F ...) => (MOVWF ...)
+(Cvt32to64F ...) => (MOVWD ...)
+(Cvt32Uto32F ...) => (MOVWUF ...)
+(Cvt32Uto64F ...) => (MOVWUD ...)
+(Cvt32Fto32 ...) => (MOVFW ...)
+(Cvt64Fto32 ...) => (MOVDW ...)
+(Cvt32Fto32U ...) => (MOVFWU ...)
+(Cvt64Fto32U ...) => (MOVDWU ...)
+(Cvt32Fto64F ...) => (MOVFD ...)
+(Cvt64Fto32F ...) => (MOVDF ...)
+
+(Round(32|64)F ...) => (Copy ...)
+
+(CvtBoolToUint8 ...) => (Copy ...)
+
+// fused-multiply-add
+(FMA x y z) => (FMULAD z x y)
+
+// comparisons
+(Eq8 x y) => (Equal (CMP (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Eq16 x y) => (Equal (CMP (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Eq32 x y) => (Equal (CMP x y))
+(EqPtr x y) => (Equal (CMP x y))
+(Eq(32|64)F x y) => (Equal (CMP(F|D) x y))
+
+(Neq8 x y) => (NotEqual (CMP (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Neq16 x y) => (NotEqual (CMP (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Neq32 x y) => (NotEqual (CMP x y))
+(NeqPtr x y) => (NotEqual (CMP x y))
+(Neq(32|64)F x y) => (NotEqual (CMP(F|D) x y))
+
+(Less8 x y) => (LessThan (CMP (SignExt8to32 x) (SignExt8to32 y)))
+(Less16 x y) => (LessThan (CMP (SignExt16to32 x) (SignExt16to32 y)))
+(Less32 x y) => (LessThan (CMP x y))
+(Less(32|64)F x y) => (GreaterThan (CMP(F|D) y x)) // reverse operands to work around NaN
+
+(Less8U x y) => (LessThanU (CMP (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Less16U x y) => (LessThanU (CMP (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Less32U x y) => (LessThanU (CMP x y))
+
+(Leq8 x y) => (LessEqual (CMP (SignExt8to32 x) (SignExt8to32 y)))
+(Leq16 x y) => (LessEqual (CMP (SignExt16to32 x) (SignExt16to32 y)))
+(Leq32 x y) => (LessEqual (CMP x y))
+(Leq(32|64)F x y) => (GreaterEqual (CMP(F|D) y x)) // reverse operands to work around NaN
+
+(Leq8U x y) => (LessEqualU (CMP (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Leq16U x y) => (LessEqualU (CMP (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Leq32U x y) => (LessEqualU (CMP x y))
+
+(OffPtr [off] ptr:(SP)) => (MOVWaddr [int32(off)] ptr)
+(OffPtr [off] ptr) => (ADDconst [int32(off)] ptr)
+
+(Addr {sym} base) => (MOVWaddr {sym} base)
+(LocalAddr {sym} base _) => (MOVWaddr {sym} base)
+
+// loads
+(Load <t> ptr mem) && t.IsBoolean() => (MOVBUload ptr mem)
+(Load <t> ptr mem) && (is8BitInt(t) && isSigned(t)) => (MOVBload ptr mem)
+(Load <t> ptr mem) && (is8BitInt(t) && !isSigned(t)) => (MOVBUload ptr mem)
+(Load <t> ptr mem) && (is16BitInt(t) && isSigned(t)) => (MOVHload ptr mem)
+(Load <t> ptr mem) && (is16BitInt(t) && !isSigned(t)) => (MOVHUload ptr mem)
+(Load <t> ptr mem) && (is32BitInt(t) || isPtr(t)) => (MOVWload ptr mem)
+(Load <t> ptr mem) && is32BitFloat(t) => (MOVFload ptr mem)
+(Load <t> ptr mem) && is64BitFloat(t) => (MOVDload ptr mem)
+
+// stores
+(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && !is32BitFloat(val.Type) => (MOVWstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (MOVFstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (MOVDstore ptr val mem)
+
+// zero instructions
+(Zero [0] _ mem) => mem
+(Zero [1] ptr mem) => (MOVBstore ptr (MOVWconst [0]) mem)
+(Zero [2] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore ptr (MOVWconst [0]) mem)
+(Zero [2] ptr mem) =>
+ (MOVBstore [1] ptr (MOVWconst [0])
+ (MOVBstore [0] ptr (MOVWconst [0]) mem))
+(Zero [4] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore ptr (MOVWconst [0]) mem)
+(Zero [4] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [2] ptr (MOVWconst [0])
+ (MOVHstore [0] ptr (MOVWconst [0]) mem))
+(Zero [4] ptr mem) =>
+ (MOVBstore [3] ptr (MOVWconst [0])
+ (MOVBstore [2] ptr (MOVWconst [0])
+ (MOVBstore [1] ptr (MOVWconst [0])
+ (MOVBstore [0] ptr (MOVWconst [0]) mem))))
+
+(Zero [3] ptr mem) =>
+ (MOVBstore [2] ptr (MOVWconst [0])
+ (MOVBstore [1] ptr (MOVWconst [0])
+ (MOVBstore [0] ptr (MOVWconst [0]) mem)))
+
+// Medium zeroing uses a duff device
+// 4 and 128 are magic constants, see runtime/mkduff.go
+(Zero [s] {t} ptr mem)
+ && s%4 == 0 && s > 4 && s <= 512
+ && t.Alignment()%4 == 0 && !config.noDuffDevice =>
+ (DUFFZERO [4 * (128 - s/4)] ptr (MOVWconst [0]) mem)
+
+// Large zeroing uses a loop
+(Zero [s] {t} ptr mem)
+ && (s > 512 || config.noDuffDevice) || t.Alignment()%4 != 0 =>
+ (LoweredZero [t.Alignment()]
+ ptr
+ (ADDconst <ptr.Type> ptr [int32(s-moveSize(t.Alignment(), config))])
+ (MOVWconst [0])
+ mem)
+
+// moves
+(Move [0] _ _ mem) => mem
+(Move [1] dst src mem) => (MOVBstore dst (MOVBUload src mem) mem)
+(Move [2] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore dst (MOVHUload src mem) mem)
+(Move [2] dst src mem) =>
+ (MOVBstore [1] dst (MOVBUload [1] src mem)
+ (MOVBstore dst (MOVBUload src mem) mem))
+(Move [4] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore dst (MOVWload src mem) mem)
+(Move [4] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [2] dst (MOVHUload [2] src mem)
+ (MOVHstore dst (MOVHUload src mem) mem))
+(Move [4] dst src mem) =>
+ (MOVBstore [3] dst (MOVBUload [3] src mem)
+ (MOVBstore [2] dst (MOVBUload [2] src mem)
+ (MOVBstore [1] dst (MOVBUload [1] src mem)
+ (MOVBstore dst (MOVBUload src mem) mem))))
+
+(Move [3] dst src mem) =>
+ (MOVBstore [2] dst (MOVBUload [2] src mem)
+ (MOVBstore [1] dst (MOVBUload [1] src mem)
+ (MOVBstore dst (MOVBUload src mem) mem)))
+
+// Medium move uses a duff device
+// 8 and 128 are magic constants, see runtime/mkduff.go
+(Move [s] {t} dst src mem)
+ && s%4 == 0 && s > 4 && s <= 512
+ && t.Alignment()%4 == 0 && !config.noDuffDevice && logLargeCopy(v, s) =>
+ (DUFFCOPY [8 * (128 - s/4)] dst src mem)
+
+// Large move uses a loop
+(Move [s] {t} dst src mem)
+ && ((s > 512 || config.noDuffDevice) || t.Alignment()%4 != 0) && logLargeCopy(v, s) =>
+ (LoweredMove [t.Alignment()]
+ dst
+ src
+ (ADDconst <src.Type> src [int32(s-moveSize(t.Alignment(), config))])
+ mem)
+
+// calls
+(StaticCall ...) => (CALLstatic ...)
+(ClosureCall ...) => (CALLclosure ...)
+(InterCall ...) => (CALLinter ...)
+
+// checks
+(NilCheck ...) => (LoweredNilCheck ...)
+(IsNonNil ptr) => (NotEqual (CMPconst [0] ptr))
+(IsInBounds idx len) => (LessThanU (CMP idx len))
+(IsSliceInBounds idx len) => (LessEqualU (CMP idx len))
+
+// pseudo-ops
+(GetClosurePtr ...) => (LoweredGetClosurePtr ...)
+(GetCallerSP ...) => (LoweredGetCallerSP ...)
+(GetCallerPC ...) => (LoweredGetCallerPC ...)
+
+// Absorb pseudo-ops into blocks.
+(If (Equal cc) yes no) => (EQ cc yes no)
+(If (NotEqual cc) yes no) => (NE cc yes no)
+(If (LessThan cc) yes no) => (LT cc yes no)
+(If (LessThanU cc) yes no) => (ULT cc yes no)
+(If (LessEqual cc) yes no) => (LE cc yes no)
+(If (LessEqualU cc) yes no) => (ULE cc yes no)
+(If (GreaterThan cc) yes no) => (GT cc yes no)
+(If (GreaterThanU cc) yes no) => (UGT cc yes no)
+(If (GreaterEqual cc) yes no) => (GE cc yes no)
+(If (GreaterEqualU cc) yes no) => (UGE cc yes no)
+
+(If cond yes no) => (NE (CMPconst [0] cond) yes no)
+
+// Absorb boolean tests into block
+(NE (CMPconst [0] (Equal cc)) yes no) => (EQ cc yes no)
+(NE (CMPconst [0] (NotEqual cc)) yes no) => (NE cc yes no)
+(NE (CMPconst [0] (LessThan cc)) yes no) => (LT cc yes no)
+(NE (CMPconst [0] (LessThanU cc)) yes no) => (ULT cc yes no)
+(NE (CMPconst [0] (LessEqual cc)) yes no) => (LE cc yes no)
+(NE (CMPconst [0] (LessEqualU cc)) yes no) => (ULE cc yes no)
+(NE (CMPconst [0] (GreaterThan cc)) yes no) => (GT cc yes no)
+(NE (CMPconst [0] (GreaterThanU cc)) yes no) => (UGT cc yes no)
+(NE (CMPconst [0] (GreaterEqual cc)) yes no) => (GE cc yes no)
+(NE (CMPconst [0] (GreaterEqualU cc)) yes no) => (UGE cc yes no)
+
+// Write barrier.
+(WB ...) => (LoweredWB ...)
+
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem)
+
+(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 0 => (LoweredPanicExtendA [kind] hi lo y mem)
+(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 1 => (LoweredPanicExtendB [kind] hi lo y mem)
+(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 2 => (LoweredPanicExtendC [kind] hi lo y mem)
+
+// Optimizations
+
+// fold offset into address
+(ADDconst [off1] (MOVWaddr [off2] {sym} ptr)) => (MOVWaddr [off1+off2] {sym} ptr)
+(SUBconst [off1] (MOVWaddr [off2] {sym} ptr)) => (MOVWaddr [off2-off1] {sym} ptr)
+
+// fold address into load/store
+(MOVBload [off1] {sym} (ADDconst [off2] ptr) mem) => (MOVBload [off1+off2] {sym} ptr mem)
+(MOVBload [off1] {sym} (SUBconst [off2] ptr) mem) => (MOVBload [off1-off2] {sym} ptr mem)
+(MOVBUload [off1] {sym} (ADDconst [off2] ptr) mem) => (MOVBUload [off1+off2] {sym} ptr mem)
+(MOVBUload [off1] {sym} (SUBconst [off2] ptr) mem) => (MOVBUload [off1-off2] {sym} ptr mem)
+(MOVHload [off1] {sym} (ADDconst [off2] ptr) mem) => (MOVHload [off1+off2] {sym} ptr mem)
+(MOVHload [off1] {sym} (SUBconst [off2] ptr) mem) => (MOVHload [off1-off2] {sym} ptr mem)
+(MOVHUload [off1] {sym} (ADDconst [off2] ptr) mem) => (MOVHUload [off1+off2] {sym} ptr mem)
+(MOVHUload [off1] {sym} (SUBconst [off2] ptr) mem) => (MOVHUload [off1-off2] {sym} ptr mem)
+(MOVWload [off1] {sym} (ADDconst [off2] ptr) mem) => (MOVWload [off1+off2] {sym} ptr mem)
+(MOVWload [off1] {sym} (SUBconst [off2] ptr) mem) => (MOVWload [off1-off2] {sym} ptr mem)
+(MOVFload [off1] {sym} (ADDconst [off2] ptr) mem) => (MOVFload [off1+off2] {sym} ptr mem)
+(MOVFload [off1] {sym} (SUBconst [off2] ptr) mem) => (MOVFload [off1-off2] {sym} ptr mem)
+(MOVDload [off1] {sym} (ADDconst [off2] ptr) mem) => (MOVDload [off1+off2] {sym} ptr mem)
+(MOVDload [off1] {sym} (SUBconst [off2] ptr) mem) => (MOVDload [off1-off2] {sym} ptr mem)
+
+(MOVBstore [off1] {sym} (ADDconst [off2] ptr) val mem) => (MOVBstore [off1+off2] {sym} ptr val mem)
+(MOVBstore [off1] {sym} (SUBconst [off2] ptr) val mem) => (MOVBstore [off1-off2] {sym} ptr val mem)
+(MOVHstore [off1] {sym} (ADDconst [off2] ptr) val mem) => (MOVHstore [off1+off2] {sym} ptr val mem)
+(MOVHstore [off1] {sym} (SUBconst [off2] ptr) val mem) => (MOVHstore [off1-off2] {sym} ptr val mem)
+(MOVWstore [off1] {sym} (ADDconst [off2] ptr) val mem) => (MOVWstore [off1+off2] {sym} ptr val mem)
+(MOVWstore [off1] {sym} (SUBconst [off2] ptr) val mem) => (MOVWstore [off1-off2] {sym} ptr val mem)
+(MOVFstore [off1] {sym} (ADDconst [off2] ptr) val mem) => (MOVFstore [off1+off2] {sym} ptr val mem)
+(MOVFstore [off1] {sym} (SUBconst [off2] ptr) val mem) => (MOVFstore [off1-off2] {sym} ptr val mem)
+(MOVDstore [off1] {sym} (ADDconst [off2] ptr) val mem) => (MOVDstore [off1+off2] {sym} ptr val mem)
+(MOVDstore [off1] {sym} (SUBconst [off2] ptr) val mem) => (MOVDstore [off1-off2] {sym} ptr val mem)
+
+(MOVBload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVBload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVBUload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVBUload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVHload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHUload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVHUload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVWload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVFload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVFload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVDload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVDload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+
+(MOVBstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) =>
+ (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVHstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) =>
+ (MOVHstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVWstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) =>
+ (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVFstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) =>
+ (MOVFstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVDstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) =>
+ (MOVDstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+
+// replace load from same location as preceding store with zero/sign extension (or copy in case of full width)
+(MOVBload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBreg x)
+(MOVBUload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBUreg x)
+(MOVHload [off] {sym} ptr (MOVHstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVHreg x)
+(MOVHUload [off] {sym} ptr (MOVHstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVHUreg x)
+(MOVWload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x
+
+(MOVFload [off] {sym} ptr (MOVFstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x
+(MOVDload [off] {sym} ptr (MOVDstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x
+
+(MOVWloadidx ptr idx (MOVWstoreidx ptr2 idx x _)) && isSamePtr(ptr, ptr2) => x
+(MOVWloadshiftLL ptr idx [c] (MOVWstoreshiftLL ptr2 idx [d] x _)) && c==d && isSamePtr(ptr, ptr2) => x
+(MOVWloadshiftRL ptr idx [c] (MOVWstoreshiftRL ptr2 idx [d] x _)) && c==d && isSamePtr(ptr, ptr2) => x
+(MOVWloadshiftRA ptr idx [c] (MOVWstoreshiftRA ptr2 idx [d] x _)) && c==d && isSamePtr(ptr, ptr2) => x
+(MOVBUloadidx ptr idx (MOVBstoreidx ptr2 idx x _)) && isSamePtr(ptr, ptr2) => (MOVBUreg x)
+(MOVBloadidx ptr idx (MOVBstoreidx ptr2 idx x _)) && isSamePtr(ptr, ptr2) => (MOVBreg x)
+(MOVHUloadidx ptr idx (MOVHstoreidx ptr2 idx x _)) && isSamePtr(ptr, ptr2) => (MOVHUreg x)
+(MOVHloadidx ptr idx (MOVHstoreidx ptr2 idx x _)) && isSamePtr(ptr, ptr2) => (MOVHreg x)
+
+// fold constant into arithmetic ops
+(ADD x (MOVWconst [c])) => (ADDconst [c] x)
+(SUB (MOVWconst [c]) x) => (RSBconst [c] x)
+(SUB x (MOVWconst [c])) => (SUBconst [c] x)
+(RSB (MOVWconst [c]) x) => (SUBconst [c] x)
+(RSB x (MOVWconst [c])) => (RSBconst [c] x)
+
+(ADDS x (MOVWconst [c])) => (ADDSconst [c] x)
+(SUBS x (MOVWconst [c])) => (SUBSconst [c] x)
+
+(ADC (MOVWconst [c]) x flags) => (ADCconst [c] x flags)
+(SBC (MOVWconst [c]) x flags) => (RSCconst [c] x flags)
+(SBC x (MOVWconst [c]) flags) => (SBCconst [c] x flags)
+
+(AND x (MOVWconst [c])) => (ANDconst [c] x)
+(OR x (MOVWconst [c])) => (ORconst [c] x)
+(XOR x (MOVWconst [c])) => (XORconst [c] x)
+(BIC x (MOVWconst [c])) => (BICconst [c] x)
+
+(SLL x (MOVWconst [c])) && 0 <= c && c < 32 => (SLLconst x [c])
+(SRL x (MOVWconst [c])) && 0 <= c && c < 32 => (SRLconst x [c])
+(SRA x (MOVWconst [c])) && 0 <= c && c < 32 => (SRAconst x [c])
+
+(CMP x (MOVWconst [c])) => (CMPconst [c] x)
+(CMP (MOVWconst [c]) x) => (InvertFlags (CMPconst [c] x))
+(CMN x (MOVWconst [c])) => (CMNconst [c] x)
+(TST x (MOVWconst [c])) => (TSTconst [c] x)
+(TEQ x (MOVWconst [c])) => (TEQconst [c] x)
+
+// Canonicalize the order of arguments to comparisons - helps with CSE.
+(CMP x y) && x.ID > y.ID => (InvertFlags (CMP y x))
+
+// don't extend after proper load
+// The MOVWreg instruction is not emitted if the src and dst registers are the same, but it ensures the type.
+(MOVBreg x:(MOVBload _ _)) => (MOVWreg x)
+(MOVBUreg x:(MOVBUload _ _)) => (MOVWreg x)
+(MOVHreg x:(MOVBload _ _)) => (MOVWreg x)
+(MOVHreg x:(MOVBUload _ _)) => (MOVWreg x)
+(MOVHreg x:(MOVHload _ _)) => (MOVWreg x)
+(MOVHUreg x:(MOVBUload _ _)) => (MOVWreg x)
+(MOVHUreg x:(MOVHUload _ _)) => (MOVWreg x)
+
+// fold extensions and ANDs together
+(MOVBUreg (ANDconst [c] x)) => (ANDconst [c&0xff] x)
+(MOVHUreg (ANDconst [c] x)) => (ANDconst [c&0xffff] x)
+(MOVBreg (ANDconst [c] x)) && c & 0x80 == 0 => (ANDconst [c&0x7f] x)
+(MOVHreg (ANDconst [c] x)) && c & 0x8000 == 0 => (ANDconst [c&0x7fff] x)
+
+// fold double extensions
+(MOVBreg x:(MOVBreg _)) => (MOVWreg x)
+(MOVBUreg x:(MOVBUreg _)) => (MOVWreg x)
+(MOVHreg x:(MOVBreg _)) => (MOVWreg x)
+(MOVHreg x:(MOVBUreg _)) => (MOVWreg x)
+(MOVHreg x:(MOVHreg _)) => (MOVWreg x)
+(MOVHUreg x:(MOVBUreg _)) => (MOVWreg x)
+(MOVHUreg x:(MOVHUreg _)) => (MOVWreg x)
+
+// don't extend before store
+(MOVBstore [off] {sym} ptr (MOVBreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+
+// if a register move has only 1 use, just use the same register without emitting an instruction
+// MOVWnop doesn't emit an instruction; it only ensures the type.
+(MOVWreg x) && x.Uses == 1 => (MOVWnop x)
+
+// mul by constant
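+// e.g. x*3 becomes (ADDshiftLL x x [1]), i.e. x + x<<1, and x*7 becomes
+// (RSBshiftLL x x [3]), i.e. x<<3 - x, avoiding a MUL entirely.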
+(MUL x (MOVWconst [c])) && int32(c) == -1 => (RSBconst [0] x)
+(MUL _ (MOVWconst [0])) => (MOVWconst [0])
+(MUL x (MOVWconst [1])) => x
+(MUL x (MOVWconst [c])) && isPowerOfTwo32(c) => (SLLconst [int32(log32(c))] x)
+(MUL x (MOVWconst [c])) && isPowerOfTwo32(c-1) && c >= 3 => (ADDshiftLL x x [int32(log32(c-1))])
+(MUL x (MOVWconst [c])) && isPowerOfTwo32(c+1) && c >= 7 => (RSBshiftLL x x [int32(log32(c+1))])
+(MUL x (MOVWconst [c])) && c%3 == 0 && isPowerOfTwo32(c/3) => (SLLconst [int32(log32(c/3))] (ADDshiftLL <x.Type> x x [1]))
+(MUL x (MOVWconst [c])) && c%5 == 0 && isPowerOfTwo32(c/5) => (SLLconst [int32(log32(c/5))] (ADDshiftLL <x.Type> x x [2]))
+(MUL x (MOVWconst [c])) && c%7 == 0 && isPowerOfTwo32(c/7) => (SLLconst [int32(log32(c/7))] (RSBshiftLL <x.Type> x x [3]))
+(MUL x (MOVWconst [c])) && c%9 == 0 && isPowerOfTwo32(c/9) => (SLLconst [int32(log32(c/9))] (ADDshiftLL <x.Type> x x [3]))
+
+(MULA x (MOVWconst [c]) a) && c == -1 => (SUB a x)
+(MULA _ (MOVWconst [0]) a) => a
+(MULA x (MOVWconst [1]) a) => (ADD x a)
+(MULA x (MOVWconst [c]) a) && isPowerOfTwo32(c) => (ADD (SLLconst <x.Type> [int32(log32(c))] x) a)
+(MULA x (MOVWconst [c]) a) && isPowerOfTwo32(c-1) && c >= 3 => (ADD (ADDshiftLL <x.Type> x x [int32(log32(c-1))]) a)
+(MULA x (MOVWconst [c]) a) && isPowerOfTwo32(c+1) && c >= 7 => (ADD (RSBshiftLL <x.Type> x x [int32(log32(c+1))]) a)
+(MULA x (MOVWconst [c]) a) && c%3 == 0 && isPowerOfTwo32(c/3) => (ADD (SLLconst <x.Type> [int32(log32(c/3))] (ADDshiftLL <x.Type> x x [1])) a)
+(MULA x (MOVWconst [c]) a) && c%5 == 0 && isPowerOfTwo32(c/5) => (ADD (SLLconst <x.Type> [int32(log32(c/5))] (ADDshiftLL <x.Type> x x [2])) a)
+(MULA x (MOVWconst [c]) a) && c%7 == 0 && isPowerOfTwo32(c/7) => (ADD (SLLconst <x.Type> [int32(log32(c/7))] (RSBshiftLL <x.Type> x x [3])) a)
+(MULA x (MOVWconst [c]) a) && c%9 == 0 && isPowerOfTwo32(c/9) => (ADD (SLLconst <x.Type> [int32(log32(c/9))] (ADDshiftLL <x.Type> x x [3])) a)
+
+(MULA (MOVWconst [c]) x a) && c == -1 => (SUB a x)
+(MULA (MOVWconst [0]) _ a) => a
+(MULA (MOVWconst [1]) x a) => (ADD x a)
+(MULA (MOVWconst [c]) x a) && isPowerOfTwo32(c) => (ADD (SLLconst <x.Type> [int32(log32(c))] x) a)
+(MULA (MOVWconst [c]) x a) && isPowerOfTwo32(c-1) && c >= 3 => (ADD (ADDshiftLL <x.Type> x x [int32(log32(c-1))]) a)
+(MULA (MOVWconst [c]) x a) && isPowerOfTwo32(c+1) && c >= 7 => (ADD (RSBshiftLL <x.Type> x x [int32(log32(c+1))]) a)
+(MULA (MOVWconst [c]) x a) && c%3 == 0 && isPowerOfTwo32(c/3) => (ADD (SLLconst <x.Type> [int32(log32(c/3))] (ADDshiftLL <x.Type> x x [1])) a)
+(MULA (MOVWconst [c]) x a) && c%5 == 0 && isPowerOfTwo32(c/5) => (ADD (SLLconst <x.Type> [int32(log32(c/5))] (ADDshiftLL <x.Type> x x [2])) a)
+(MULA (MOVWconst [c]) x a) && c%7 == 0 && isPowerOfTwo32(c/7) => (ADD (SLLconst <x.Type> [int32(log32(c/7))] (RSBshiftLL <x.Type> x x [3])) a)
+(MULA (MOVWconst [c]) x a) && c%9 == 0 && isPowerOfTwo32(c/9) => (ADD (SLLconst <x.Type> [int32(log32(c/9))] (ADDshiftLL <x.Type> x x [3])) a)
+
+(MULS x (MOVWconst [c]) a) && c == -1 => (ADD a x)
+(MULS _ (MOVWconst [0]) a) => a
+(MULS x (MOVWconst [1]) a) => (RSB x a)
+(MULS x (MOVWconst [c]) a) && isPowerOfTwo32(c) => (RSB (SLLconst <x.Type> [int32(log32(c))] x) a)
+(MULS x (MOVWconst [c]) a) && isPowerOfTwo32(c-1) && c >= 3 => (RSB (ADDshiftLL <x.Type> x x [int32(log32(c-1))]) a)
+(MULS x (MOVWconst [c]) a) && isPowerOfTwo32(c+1) && c >= 7 => (RSB (RSBshiftLL <x.Type> x x [int32(log32(c+1))]) a)
+(MULS x (MOVWconst [c]) a) && c%3 == 0 && isPowerOfTwo32(c/3) => (RSB (SLLconst <x.Type> [int32(log32(c/3))] (ADDshiftLL <x.Type> x x [1])) a)
+(MULS x (MOVWconst [c]) a) && c%5 == 0 && isPowerOfTwo32(c/5) => (RSB (SLLconst <x.Type> [int32(log32(c/5))] (ADDshiftLL <x.Type> x x [2])) a)
+(MULS x (MOVWconst [c]) a) && c%7 == 0 && isPowerOfTwo32(c/7) => (RSB (SLLconst <x.Type> [int32(log32(c/7))] (RSBshiftLL <x.Type> x x [3])) a)
+(MULS x (MOVWconst [c]) a) && c%9 == 0 && isPowerOfTwo32(c/9) => (RSB (SLLconst <x.Type> [int32(log32(c/9))] (ADDshiftLL <x.Type> x x [3])) a)
+
+(MULS (MOVWconst [c]) x a) && c == -1 => (ADD a x)
+(MULS (MOVWconst [0]) _ a) => a
+(MULS (MOVWconst [1]) x a) => (RSB x a)
+(MULS (MOVWconst [c]) x a) && isPowerOfTwo32(c) => (RSB (SLLconst <x.Type> [int32(log32(c))] x) a)
+(MULS (MOVWconst [c]) x a) && isPowerOfTwo32(c-1) && c >= 3 => (RSB (ADDshiftLL <x.Type> x x [int32(log32(c-1))]) a)
+(MULS (MOVWconst [c]) x a) && isPowerOfTwo32(c+1) && c >= 7 => (RSB (RSBshiftLL <x.Type> x x [int32(log32(c+1))]) a)
+(MULS (MOVWconst [c]) x a) && c%3 == 0 && isPowerOfTwo32(c/3) => (RSB (SLLconst <x.Type> [int32(log32(c/3))] (ADDshiftLL <x.Type> x x [1])) a)
+(MULS (MOVWconst [c]) x a) && c%5 == 0 && isPowerOfTwo32(c/5) => (RSB (SLLconst <x.Type> [int32(log32(c/5))] (ADDshiftLL <x.Type> x x [2])) a)
+(MULS (MOVWconst [c]) x a) && c%7 == 0 && isPowerOfTwo32(c/7) => (RSB (SLLconst <x.Type> [int32(log32(c/7))] (RSBshiftLL <x.Type> x x [3])) a)
+(MULS (MOVWconst [c]) x a) && c%9 == 0 && isPowerOfTwo32(c/9) => (RSB (SLLconst <x.Type> [int32(log32(c/9))] (ADDshiftLL <x.Type> x x [3])) a)
+
+// div by constant
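+// An unsigned divide by a power of two reduces to a logical right shift and its
+// remainder to a mask, e.g. x/8 => x>>3 and x%8 => x&7.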
+(Select0 (CALLudiv x (MOVWconst [1]))) => x
+(Select1 (CALLudiv _ (MOVWconst [1]))) => (MOVWconst [0])
+(Select0 (CALLudiv x (MOVWconst [c]))) && isPowerOfTwo32(c) => (SRLconst [int32(log32(c))] x)
+(Select1 (CALLudiv x (MOVWconst [c]))) && isPowerOfTwo32(c) => (ANDconst [c-1] x)
+
+// constant comparisons
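+// Comparing two constants produces a compile-time-known flag state (FlagConstant).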
+(CMPconst (MOVWconst [x]) [y]) => (FlagConstant [subFlags32(x,y)])
+(CMNconst (MOVWconst [x]) [y]) => (FlagConstant [addFlags32(x,y)])
+(TSTconst (MOVWconst [x]) [y]) => (FlagConstant [logicFlags32(x&y)])
+(TEQconst (MOVWconst [x]) [y]) => (FlagConstant [logicFlags32(x^y)])
+
+// other known comparisons
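+// e.g. a zero-extended byte is always unsigned less than any constant above 0xff;
+// subFlags32(0, 1) encodes that always-less-than result.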
+(CMPconst (MOVBUreg _) [c]) && 0xff < c => (FlagConstant [subFlags32(0, 1)])
+(CMPconst (MOVHUreg _) [c]) && 0xffff < c => (FlagConstant [subFlags32(0, 1)])
+(CMPconst (ANDconst _ [m]) [n]) && 0 <= m && m < n => (FlagConstant [subFlags32(0, 1)])
+(CMPconst (SRLconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 32 && (1<<uint32(32-c)) <= uint32(n) => (FlagConstant [subFlags32(0, 1)])
+
+// absorb flag constants into branches
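+// When the flags are a compile-time constant, the conditional branch becomes an
+// unconditional First block toward either the yes or the no successor.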
+(EQ (FlagConstant [fc]) yes no) && fc.eq() => (First yes no)
+(EQ (FlagConstant [fc]) yes no) && !fc.eq() => (First no yes)
+
+(NE (FlagConstant [fc]) yes no) && fc.ne() => (First yes no)
+(NE (FlagConstant [fc]) yes no) && !fc.ne() => (First no yes)
+
+(LT (FlagConstant [fc]) yes no) && fc.lt() => (First yes no)
+(LT (FlagConstant [fc]) yes no) && !fc.lt() => (First no yes)
+
+(LE (FlagConstant [fc]) yes no) && fc.le() => (First yes no)
+(LE (FlagConstant [fc]) yes no) && !fc.le() => (First no yes)
+
+(GT (FlagConstant [fc]) yes no) && fc.gt() => (First yes no)
+(GT (FlagConstant [fc]) yes no) && !fc.gt() => (First no yes)
+
+(GE (FlagConstant [fc]) yes no) && fc.ge() => (First yes no)
+(GE (FlagConstant [fc]) yes no) && !fc.ge() => (First no yes)
+
+(ULT (FlagConstant [fc]) yes no) && fc.ult() => (First yes no)
+(ULT (FlagConstant [fc]) yes no) && !fc.ult() => (First no yes)
+
+(ULE (FlagConstant [fc]) yes no) && fc.ule() => (First yes no)
+(ULE (FlagConstant [fc]) yes no) && !fc.ule() => (First no yes)
+
+(UGT (FlagConstant [fc]) yes no) && fc.ugt() => (First yes no)
+(UGT (FlagConstant [fc]) yes no) && !fc.ugt() => (First no yes)
+
+(UGE (FlagConstant [fc]) yes no) && fc.uge() => (First yes no)
+(UGE (FlagConstant [fc]) yes no) && !fc.uge() => (First no yes)
+
+(LTnoov (FlagConstant [fc]) yes no) && fc.ltNoov() => (First yes no)
+(LTnoov (FlagConstant [fc]) yes no) && !fc.ltNoov() => (First no yes)
+
+(LEnoov (FlagConstant [fc]) yes no) && fc.leNoov() => (First yes no)
+(LEnoov (FlagConstant [fc]) yes no) && !fc.leNoov() => (First no yes)
+
+(GTnoov (FlagConstant [fc]) yes no) && fc.gtNoov() => (First yes no)
+(GTnoov (FlagConstant [fc]) yes no) && !fc.gtNoov() => (First no yes)
+
+(GEnoov (FlagConstant [fc]) yes no) && fc.geNoov() => (First yes no)
+(GEnoov (FlagConstant [fc]) yes no) && !fc.geNoov() => (First no yes)
+
+// absorb InvertFlags into branches
+(LT (InvertFlags cmp) yes no) => (GT cmp yes no)
+(GT (InvertFlags cmp) yes no) => (LT cmp yes no)
+(LE (InvertFlags cmp) yes no) => (GE cmp yes no)
+(GE (InvertFlags cmp) yes no) => (LE cmp yes no)
+(ULT (InvertFlags cmp) yes no) => (UGT cmp yes no)
+(UGT (InvertFlags cmp) yes no) => (ULT cmp yes no)
+(ULE (InvertFlags cmp) yes no) => (UGE cmp yes no)
+(UGE (InvertFlags cmp) yes no) => (ULE cmp yes no)
+(EQ (InvertFlags cmp) yes no) => (EQ cmp yes no)
+(NE (InvertFlags cmp) yes no) => (NE cmp yes no)
+(LTnoov (InvertFlags cmp) yes no) => (GTnoov cmp yes no)
+(GEnoov (InvertFlags cmp) yes no) => (LEnoov cmp yes no)
+(LEnoov (InvertFlags cmp) yes no) => (GEnoov cmp yes no)
+(GTnoov (InvertFlags cmp) yes no) => (LTnoov cmp yes no)
+
+// absorb flag constants into boolean values
+(Equal (FlagConstant [fc])) => (MOVWconst [b2i32(fc.eq())])
+(NotEqual (FlagConstant [fc])) => (MOVWconst [b2i32(fc.ne())])
+(LessThan (FlagConstant [fc])) => (MOVWconst [b2i32(fc.lt())])
+(LessThanU (FlagConstant [fc])) => (MOVWconst [b2i32(fc.ult())])
+(LessEqual (FlagConstant [fc])) => (MOVWconst [b2i32(fc.le())])
+(LessEqualU (FlagConstant [fc])) => (MOVWconst [b2i32(fc.ule())])
+(GreaterThan (FlagConstant [fc])) => (MOVWconst [b2i32(fc.gt())])
+(GreaterThanU (FlagConstant [fc])) => (MOVWconst [b2i32(fc.ugt())])
+(GreaterEqual (FlagConstant [fc])) => (MOVWconst [b2i32(fc.ge())])
+(GreaterEqualU (FlagConstant [fc])) => (MOVWconst [b2i32(fc.uge())])
+
+// absorb InvertFlags into boolean values
+(Equal (InvertFlags x)) => (Equal x)
+(NotEqual (InvertFlags x)) => (NotEqual x)
+(LessThan (InvertFlags x)) => (GreaterThan x)
+(LessThanU (InvertFlags x)) => (GreaterThanU x)
+(GreaterThan (InvertFlags x)) => (LessThan x)
+(GreaterThanU (InvertFlags x)) => (LessThanU x)
+(LessEqual (InvertFlags x)) => (GreaterEqual x)
+(LessEqualU (InvertFlags x)) => (GreaterEqualU x)
+(GreaterEqual (InvertFlags x)) => (LessEqual x)
+(GreaterEqualU (InvertFlags x)) => (LessEqualU x)
+
+// absorb flag constants into conditional instructions
+(CMOVWLSconst _ (FlagConstant [fc]) [c]) && fc.ule() => (MOVWconst [c])
+(CMOVWLSconst x (FlagConstant [fc]) [c]) && fc.ugt() => x
+
+(CMOVWHSconst _ (FlagConstant [fc]) [c]) && fc.uge() => (MOVWconst [c])
+(CMOVWHSconst x (FlagConstant [fc]) [c]) && fc.ult() => x
+
+(CMOVWLSconst x (InvertFlags flags) [c]) => (CMOVWHSconst x flags [c])
+(CMOVWHSconst x (InvertFlags flags) [c]) => (CMOVWLSconst x flags [c])
+
+(SRAcond x _ (FlagConstant [fc])) && fc.uge() => (SRAconst x [31])
+(SRAcond x y (FlagConstant [fc])) && fc.ult() => (SRA x y)
+
+// remove redundant *const ops
+(ADDconst [0] x) => x
+(SUBconst [0] x) => x
+(ANDconst [0] _) => (MOVWconst [0])
+(ANDconst [c] x) && int32(c)==-1 => x
+(ORconst [0] x) => x
+(ORconst [c] _) && int32(c)==-1 => (MOVWconst [-1])
+(XORconst [0] x) => x
+(BICconst [0] x) => x
+(BICconst [c] _) && int32(c)==-1 => (MOVWconst [0])
+
+// generic constant folding
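+// An ARM data-processing immediate is an 8-bit value rotated right by an even
+// amount (isARMImmRot); if c does not encode but -c or ^c does, flip the op.
+// On ARMv7 a 16-bit constant can be materialized with a single MOVW, so prefer
+// the form whose constant fits in 16 bits.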
+(ADDconst [c] x) && !isARMImmRot(uint32(c)) && isARMImmRot(uint32(-c)) => (SUBconst [-c] x)
+(SUBconst [c] x) && !isARMImmRot(uint32(c)) && isARMImmRot(uint32(-c)) => (ADDconst [-c] x)
+(ANDconst [c] x) && !isARMImmRot(uint32(c)) && isARMImmRot(^uint32(c)) => (BICconst [int32(^uint32(c))] x)
+(BICconst [c] x) && !isARMImmRot(uint32(c)) && isARMImmRot(^uint32(c)) => (ANDconst [int32(^uint32(c))] x)
+(ADDconst [c] x) && objabi.GOARM==7 && !isARMImmRot(uint32(c)) && uint32(c)>0xffff && uint32(-c)<=0xffff => (SUBconst [-c] x)
+(SUBconst [c] x) && objabi.GOARM==7 && !isARMImmRot(uint32(c)) && uint32(c)>0xffff && uint32(-c)<=0xffff => (ADDconst [-c] x)
+(ANDconst [c] x) && objabi.GOARM==7 && !isARMImmRot(uint32(c)) && uint32(c)>0xffff && ^uint32(c)<=0xffff => (BICconst [int32(^uint32(c))] x)
+(BICconst [c] x) && objabi.GOARM==7 && !isARMImmRot(uint32(c)) && uint32(c)>0xffff && ^uint32(c)<=0xffff => (ANDconst [int32(^uint32(c))] x)
+(ADDconst [c] (MOVWconst [d])) => (MOVWconst [c+d])
+(ADDconst [c] (ADDconst [d] x)) => (ADDconst [c+d] x)
+(ADDconst [c] (SUBconst [d] x)) => (ADDconst [c-d] x)
+(ADDconst [c] (RSBconst [d] x)) => (RSBconst [c+d] x)
+(ADCconst [c] (ADDconst [d] x) flags) => (ADCconst [c+d] x flags)
+(ADCconst [c] (SUBconst [d] x) flags) => (ADCconst [c-d] x flags)
+(SUBconst [c] (MOVWconst [d])) => (MOVWconst [d-c])
+(SUBconst [c] (SUBconst [d] x)) => (ADDconst [-c-d] x)
+(SUBconst [c] (ADDconst [d] x)) => (ADDconst [-c+d] x)
+(SUBconst [c] (RSBconst [d] x)) => (RSBconst [-c+d] x)
+(SBCconst [c] (ADDconst [d] x) flags) => (SBCconst [c-d] x flags)
+(SBCconst [c] (SUBconst [d] x) flags) => (SBCconst [c+d] x flags)
+(RSBconst [c] (MOVWconst [d])) => (MOVWconst [c-d])
+(RSBconst [c] (RSBconst [d] x)) => (ADDconst [c-d] x)
+(RSBconst [c] (ADDconst [d] x)) => (RSBconst [c-d] x)
+(RSBconst [c] (SUBconst [d] x)) => (RSBconst [c+d] x)
+(RSCconst [c] (ADDconst [d] x) flags) => (RSCconst [c-d] x flags)
+(RSCconst [c] (SUBconst [d] x) flags) => (RSCconst [c+d] x flags)
+(SLLconst [c] (MOVWconst [d])) => (MOVWconst [d<<uint64(c)])
+(SRLconst [c] (MOVWconst [d])) => (MOVWconst [int32(uint32(d)>>uint64(c))])
+(SRAconst [c] (MOVWconst [d])) => (MOVWconst [d>>uint64(c)])
+(MUL (MOVWconst [c]) (MOVWconst [d])) => (MOVWconst [c*d])
+(MULA (MOVWconst [c]) (MOVWconst [d]) a) => (ADDconst [c*d] a)
+(MULS (MOVWconst [c]) (MOVWconst [d]) a) => (SUBconst [c*d] a)
+(Select0 (CALLudiv (MOVWconst [c]) (MOVWconst [d]))) && d != 0 => (MOVWconst [int32(uint32(c)/uint32(d))])
+(Select1 (CALLudiv (MOVWconst [c]) (MOVWconst [d]))) && d != 0 => (MOVWconst [int32(uint32(c)%uint32(d))])
+(ANDconst [c] (MOVWconst [d])) => (MOVWconst [c&d])
+(ANDconst [c] (ANDconst [d] x)) => (ANDconst [c&d] x)
+(ORconst [c] (MOVWconst [d])) => (MOVWconst [c|d])
+(ORconst [c] (ORconst [d] x)) => (ORconst [c|d] x)
+(XORconst [c] (MOVWconst [d])) => (MOVWconst [c^d])
+(XORconst [c] (XORconst [d] x)) => (XORconst [c^d] x)
+(BICconst [c] (MOVWconst [d])) => (MOVWconst [d&^c])
+(BICconst [c] (BICconst [d] x)) => (BICconst [c|d] x)
+(MVN (MOVWconst [c])) => (MOVWconst [^c])
+(MOVBreg (MOVWconst [c])) => (MOVWconst [int32(int8(c))])
+(MOVBUreg (MOVWconst [c])) => (MOVWconst [int32(uint8(c))])
+(MOVHreg (MOVWconst [c])) => (MOVWconst [int32(int16(c))])
+(MOVHUreg (MOVWconst [c])) => (MOVWconst [int32(uint16(c))])
+(MOVWreg (MOVWconst [c])) => (MOVWconst [c])
+// BFX: Width = c >> 8, LSB = c & 0xff, result = d << (32 - Width - LSB) >> (32 - Width)
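+// e.g. c = 8<<8 | 4 (Width=8, LSB=4) extracts bits 4..11, sign-extended by BFX
+// and zero-extended by BFXU.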
+(BFX [c] (MOVWconst [d])) => (MOVWconst [d<<(32-uint32(c&0xff)-uint32(c>>8))>>(32-uint32(c>>8))])
+(BFXU [c] (MOVWconst [d])) => (MOVWconst [int32(uint32(d)<<(32-uint32(c&0xff)-uint32(c>>8))>>(32-uint32(c>>8)))])
+
+// absorb shifts into ops
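+// ARM data-processing instructions can shift their second operand for free, so
+// e.g. ADD x (SLLconst [c] y) becomes a single ADDshiftLL instruction.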
+(ADD x (SLLconst [c] y)) => (ADDshiftLL x y [c])
+(ADD x (SRLconst [c] y)) => (ADDshiftRL x y [c])
+(ADD x (SRAconst [c] y)) => (ADDshiftRA x y [c])
+(ADD x (SLL y z)) => (ADDshiftLLreg x y z)
+(ADD x (SRL y z)) => (ADDshiftRLreg x y z)
+(ADD x (SRA y z)) => (ADDshiftRAreg x y z)
+(ADC x (SLLconst [c] y) flags) => (ADCshiftLL x y [c] flags)
+(ADC x (SRLconst [c] y) flags) => (ADCshiftRL x y [c] flags)
+(ADC x (SRAconst [c] y) flags) => (ADCshiftRA x y [c] flags)
+(ADC x (SLL y z) flags) => (ADCshiftLLreg x y z flags)
+(ADC x (SRL y z) flags) => (ADCshiftRLreg x y z flags)
+(ADC x (SRA y z) flags) => (ADCshiftRAreg x y z flags)
+(ADDS x (SLLconst [c] y)) => (ADDSshiftLL x y [c])
+(ADDS x (SRLconst [c] y)) => (ADDSshiftRL x y [c])
+(ADDS x (SRAconst [c] y)) => (ADDSshiftRA x y [c])
+(ADDS x (SLL y z)) => (ADDSshiftLLreg x y z)
+(ADDS x (SRL y z)) => (ADDSshiftRLreg x y z)
+(ADDS x (SRA y z)) => (ADDSshiftRAreg x y z)
+(SUB x (SLLconst [c] y)) => (SUBshiftLL x y [c])
+(SUB (SLLconst [c] y) x) => (RSBshiftLL x y [c])
+(SUB x (SRLconst [c] y)) => (SUBshiftRL x y [c])
+(SUB (SRLconst [c] y) x) => (RSBshiftRL x y [c])
+(SUB x (SRAconst [c] y)) => (SUBshiftRA x y [c])
+(SUB (SRAconst [c] y) x) => (RSBshiftRA x y [c])
+(SUB x (SLL y z)) => (SUBshiftLLreg x y z)
+(SUB (SLL y z) x) => (RSBshiftLLreg x y z)
+(SUB x (SRL y z)) => (SUBshiftRLreg x y z)
+(SUB (SRL y z) x) => (RSBshiftRLreg x y z)
+(SUB x (SRA y z)) => (SUBshiftRAreg x y z)
+(SUB (SRA y z) x) => (RSBshiftRAreg x y z)
+(SBC x (SLLconst [c] y) flags) => (SBCshiftLL x y [c] flags)
+(SBC (SLLconst [c] y) x flags) => (RSCshiftLL x y [c] flags)
+(SBC x (SRLconst [c] y) flags) => (SBCshiftRL x y [c] flags)
+(SBC (SRLconst [c] y) x flags) => (RSCshiftRL x y [c] flags)
+(SBC x (SRAconst [c] y) flags) => (SBCshiftRA x y [c] flags)
+(SBC (SRAconst [c] y) x flags) => (RSCshiftRA x y [c] flags)
+(SBC x (SLL y z) flags) => (SBCshiftLLreg x y z flags)
+(SBC (SLL y z) x flags) => (RSCshiftLLreg x y z flags)
+(SBC x (SRL y z) flags) => (SBCshiftRLreg x y z flags)
+(SBC (SRL y z) x flags) => (RSCshiftRLreg x y z flags)
+(SBC x (SRA y z) flags) => (SBCshiftRAreg x y z flags)
+(SBC (SRA y z) x flags) => (RSCshiftRAreg x y z flags)
+(SUBS x (SLLconst [c] y)) => (SUBSshiftLL x y [c])
+(SUBS (SLLconst [c] y) x) => (RSBSshiftLL x y [c])
+(SUBS x (SRLconst [c] y)) => (SUBSshiftRL x y [c])
+(SUBS (SRLconst [c] y) x) => (RSBSshiftRL x y [c])
+(SUBS x (SRAconst [c] y)) => (SUBSshiftRA x y [c])
+(SUBS (SRAconst [c] y) x) => (RSBSshiftRA x y [c])
+(SUBS x (SLL y z)) => (SUBSshiftLLreg x y z)
+(SUBS (SLL y z) x) => (RSBSshiftLLreg x y z)
+(SUBS x (SRL y z)) => (SUBSshiftRLreg x y z)
+(SUBS (SRL y z) x) => (RSBSshiftRLreg x y z)
+(SUBS x (SRA y z)) => (SUBSshiftRAreg x y z)
+(SUBS (SRA y z) x) => (RSBSshiftRAreg x y z)
+(RSB x (SLLconst [c] y)) => (RSBshiftLL x y [c])
+(RSB (SLLconst [c] y) x) => (SUBshiftLL x y [c])
+(RSB x (SRLconst [c] y)) => (RSBshiftRL x y [c])
+(RSB (SRLconst [c] y) x) => (SUBshiftRL x y [c])
+(RSB x (SRAconst [c] y)) => (RSBshiftRA x y [c])
+(RSB (SRAconst [c] y) x) => (SUBshiftRA x y [c])
+(RSB x (SLL y z)) => (RSBshiftLLreg x y z)
+(RSB (SLL y z) x) => (SUBshiftLLreg x y z)
+(RSB x (SRL y z)) => (RSBshiftRLreg x y z)
+(RSB (SRL y z) x) => (SUBshiftRLreg x y z)
+(RSB x (SRA y z)) => (RSBshiftRAreg x y z)
+(RSB (SRA y z) x) => (SUBshiftRAreg x y z)
+(AND x (SLLconst [c] y)) => (ANDshiftLL x y [c])
+(AND x (SRLconst [c] y)) => (ANDshiftRL x y [c])
+(AND x (SRAconst [c] y)) => (ANDshiftRA x y [c])
+(AND x (SLL y z)) => (ANDshiftLLreg x y z)
+(AND x (SRL y z)) => (ANDshiftRLreg x y z)
+(AND x (SRA y z)) => (ANDshiftRAreg x y z)
+(OR x (SLLconst [c] y)) => (ORshiftLL x y [c])
+(OR x (SRLconst [c] y)) => (ORshiftRL x y [c])
+(OR x (SRAconst [c] y)) => (ORshiftRA x y [c])
+(OR x (SLL y z)) => (ORshiftLLreg x y z)
+(OR x (SRL y z)) => (ORshiftRLreg x y z)
+(OR x (SRA y z)) => (ORshiftRAreg x y z)
+(XOR x (SLLconst [c] y)) => (XORshiftLL x y [c])
+(XOR x (SRLconst [c] y)) => (XORshiftRL x y [c])
+(XOR x (SRAconst [c] y)) => (XORshiftRA x y [c])
+(XOR x (SRRconst [c] y)) => (XORshiftRR x y [c])
+(XOR x (SLL y z)) => (XORshiftLLreg x y z)
+(XOR x (SRL y z)) => (XORshiftRLreg x y z)
+(XOR x (SRA y z)) => (XORshiftRAreg x y z)
+(BIC x (SLLconst [c] y)) => (BICshiftLL x y [c])
+(BIC x (SRLconst [c] y)) => (BICshiftRL x y [c])
+(BIC x (SRAconst [c] y)) => (BICshiftRA x y [c])
+(BIC x (SLL y z)) => (BICshiftLLreg x y z)
+(BIC x (SRL y z)) => (BICshiftRLreg x y z)
+(BIC x (SRA y z)) => (BICshiftRAreg x y z)
+(MVN (SLLconst [c] x)) => (MVNshiftLL x [c])
+(MVN (SRLconst [c] x)) => (MVNshiftRL x [c])
+(MVN (SRAconst [c] x)) => (MVNshiftRA x [c])
+(MVN (SLL x y)) => (MVNshiftLLreg x y)
+(MVN (SRL x y)) => (MVNshiftRLreg x y)
+(MVN (SRA x y)) => (MVNshiftRAreg x y)
+
+(CMP x (SLLconst [c] y)) => (CMPshiftLL x y [c])
+(CMP (SLLconst [c] y) x) => (InvertFlags (CMPshiftLL x y [c]))
+(CMP x (SRLconst [c] y)) => (CMPshiftRL x y [c])
+(CMP (SRLconst [c] y) x) => (InvertFlags (CMPshiftRL x y [c]))
+(CMP x (SRAconst [c] y)) => (CMPshiftRA x y [c])
+(CMP (SRAconst [c] y) x) => (InvertFlags (CMPshiftRA x y [c]))
+(CMP x (SLL y z)) => (CMPshiftLLreg x y z)
+(CMP (SLL y z) x) => (InvertFlags (CMPshiftLLreg x y z))
+(CMP x (SRL y z)) => (CMPshiftRLreg x y z)
+(CMP (SRL y z) x) => (InvertFlags (CMPshiftRLreg x y z))
+(CMP x (SRA y z)) => (CMPshiftRAreg x y z)
+(CMP (SRA y z) x) => (InvertFlags (CMPshiftRAreg x y z))
+(TST x (SLLconst [c] y)) => (TSTshiftLL x y [c])
+(TST x (SRLconst [c] y)) => (TSTshiftRL x y [c])
+(TST x (SRAconst [c] y)) => (TSTshiftRA x y [c])
+(TST x (SLL y z)) => (TSTshiftLLreg x y z)
+(TST x (SRL y z)) => (TSTshiftRLreg x y z)
+(TST x (SRA y z)) => (TSTshiftRAreg x y z)
+(TEQ x (SLLconst [c] y)) => (TEQshiftLL x y [c])
+(TEQ x (SRLconst [c] y)) => (TEQshiftRL x y [c])
+(TEQ x (SRAconst [c] y)) => (TEQshiftRA x y [c])
+(TEQ x (SLL y z)) => (TEQshiftLLreg x y z)
+(TEQ x (SRL y z)) => (TEQshiftRLreg x y z)
+(TEQ x (SRA y z)) => (TEQshiftRAreg x y z)
+(CMN x (SLLconst [c] y)) => (CMNshiftLL x y [c])
+(CMN x (SRLconst [c] y)) => (CMNshiftRL x y [c])
+(CMN x (SRAconst [c] y)) => (CMNshiftRA x y [c])
+(CMN x (SLL y z)) => (CMNshiftLLreg x y z)
+(CMN x (SRL y z)) => (CMNshiftRLreg x y z)
+(CMN x (SRA y z)) => (CMNshiftRAreg x y z)
+
+// prefer *const ops to *shift ops
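+// When the unshifted operand is a constant, use the immediate form of the op and
+// apply the shift to the other operand separately.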
+(ADDshiftLL (MOVWconst [c]) x [d]) => (ADDconst [c] (SLLconst <x.Type> x [d]))
+(ADDshiftRL (MOVWconst [c]) x [d]) => (ADDconst [c] (SRLconst <x.Type> x [d]))
+(ADDshiftRA (MOVWconst [c]) x [d]) => (ADDconst [c] (SRAconst <x.Type> x [d]))
+(ADCshiftLL (MOVWconst [c]) x [d] flags) => (ADCconst [c] (SLLconst <x.Type> x [d]) flags)
+(ADCshiftRL (MOVWconst [c]) x [d] flags) => (ADCconst [c] (SRLconst <x.Type> x [d]) flags)
+(ADCshiftRA (MOVWconst [c]) x [d] flags) => (ADCconst [c] (SRAconst <x.Type> x [d]) flags)
+(ADDSshiftLL (MOVWconst [c]) x [d]) => (ADDSconst [c] (SLLconst <x.Type> x [d]))
+(ADDSshiftRL (MOVWconst [c]) x [d]) => (ADDSconst [c] (SRLconst <x.Type> x [d]))
+(ADDSshiftRA (MOVWconst [c]) x [d]) => (ADDSconst [c] (SRAconst <x.Type> x [d]))
+(SUBshiftLL (MOVWconst [c]) x [d]) => (RSBconst [c] (SLLconst <x.Type> x [d]))
+(SUBshiftRL (MOVWconst [c]) x [d]) => (RSBconst [c] (SRLconst <x.Type> x [d]))
+(SUBshiftRA (MOVWconst [c]) x [d]) => (RSBconst [c] (SRAconst <x.Type> x [d]))
+(SBCshiftLL (MOVWconst [c]) x [d] flags) => (RSCconst [c] (SLLconst <x.Type> x [d]) flags)
+(SBCshiftRL (MOVWconst [c]) x [d] flags) => (RSCconst [c] (SRLconst <x.Type> x [d]) flags)
+(SBCshiftRA (MOVWconst [c]) x [d] flags) => (RSCconst [c] (SRAconst <x.Type> x [d]) flags)
+(SUBSshiftLL (MOVWconst [c]) x [d]) => (RSBSconst [c] (SLLconst <x.Type> x [d]))
+(SUBSshiftRL (MOVWconst [c]) x [d]) => (RSBSconst [c] (SRLconst <x.Type> x [d]))
+(SUBSshiftRA (MOVWconst [c]) x [d]) => (RSBSconst [c] (SRAconst <x.Type> x [d]))
+(RSBshiftLL (MOVWconst [c]) x [d]) => (SUBconst [c] (SLLconst <x.Type> x [d]))
+(RSBshiftRL (MOVWconst [c]) x [d]) => (SUBconst [c] (SRLconst <x.Type> x [d]))
+(RSBshiftRA (MOVWconst [c]) x [d]) => (SUBconst [c] (SRAconst <x.Type> x [d]))
+(RSCshiftLL (MOVWconst [c]) x [d] flags) => (SBCconst [c] (SLLconst <x.Type> x [d]) flags)
+(RSCshiftRL (MOVWconst [c]) x [d] flags) => (SBCconst [c] (SRLconst <x.Type> x [d]) flags)
+(RSCshiftRA (MOVWconst [c]) x [d] flags) => (SBCconst [c] (SRAconst <x.Type> x [d]) flags)
+(RSBSshiftLL (MOVWconst [c]) x [d]) => (SUBSconst [c] (SLLconst <x.Type> x [d]))
+(RSBSshiftRL (MOVWconst [c]) x [d]) => (SUBSconst [c] (SRLconst <x.Type> x [d]))
+(RSBSshiftRA (MOVWconst [c]) x [d]) => (SUBSconst [c] (SRAconst <x.Type> x [d]))
+(ANDshiftLL (MOVWconst [c]) x [d]) => (ANDconst [c] (SLLconst <x.Type> x [d]))
+(ANDshiftRL (MOVWconst [c]) x [d]) => (ANDconst [c] (SRLconst <x.Type> x [d]))
+(ANDshiftRA (MOVWconst [c]) x [d]) => (ANDconst [c] (SRAconst <x.Type> x [d]))
+(ORshiftLL (MOVWconst [c]) x [d]) => (ORconst [c] (SLLconst <x.Type> x [d]))
+(ORshiftRL (MOVWconst [c]) x [d]) => (ORconst [c] (SRLconst <x.Type> x [d]))
+(ORshiftRA (MOVWconst [c]) x [d]) => (ORconst [c] (SRAconst <x.Type> x [d]))
+(XORshiftLL (MOVWconst [c]) x [d]) => (XORconst [c] (SLLconst <x.Type> x [d]))
+(XORshiftRL (MOVWconst [c]) x [d]) => (XORconst [c] (SRLconst <x.Type> x [d]))
+(XORshiftRA (MOVWconst [c]) x [d]) => (XORconst [c] (SRAconst <x.Type> x [d]))
+(XORshiftRR (MOVWconst [c]) x [d]) => (XORconst [c] (SRRconst <x.Type> x [d]))
+(CMPshiftLL (MOVWconst [c]) x [d]) => (InvertFlags (CMPconst [c] (SLLconst <x.Type> x [d])))
+(CMPshiftRL (MOVWconst [c]) x [d]) => (InvertFlags (CMPconst [c] (SRLconst <x.Type> x [d])))
+(CMPshiftRA (MOVWconst [c]) x [d]) => (InvertFlags (CMPconst [c] (SRAconst <x.Type> x [d])))
+(TSTshiftLL (MOVWconst [c]) x [d]) => (TSTconst [c] (SLLconst <x.Type> x [d]))
+(TSTshiftRL (MOVWconst [c]) x [d]) => (TSTconst [c] (SRLconst <x.Type> x [d]))
+(TSTshiftRA (MOVWconst [c]) x [d]) => (TSTconst [c] (SRAconst <x.Type> x [d]))
+(TEQshiftLL (MOVWconst [c]) x [d]) => (TEQconst [c] (SLLconst <x.Type> x [d]))
+(TEQshiftRL (MOVWconst [c]) x [d]) => (TEQconst [c] (SRLconst <x.Type> x [d]))
+(TEQshiftRA (MOVWconst [c]) x [d]) => (TEQconst [c] (SRAconst <x.Type> x [d]))
+(CMNshiftLL (MOVWconst [c]) x [d]) => (CMNconst [c] (SLLconst <x.Type> x [d]))
+(CMNshiftRL (MOVWconst [c]) x [d]) => (CMNconst [c] (SRLconst <x.Type> x [d]))
+(CMNshiftRA (MOVWconst [c]) x [d]) => (CMNconst [c] (SRAconst <x.Type> x [d]))
+
+(ADDshiftLLreg (MOVWconst [c]) x y) => (ADDconst [c] (SLL <x.Type> x y))
+(ADDshiftRLreg (MOVWconst [c]) x y) => (ADDconst [c] (SRL <x.Type> x y))
+(ADDshiftRAreg (MOVWconst [c]) x y) => (ADDconst [c] (SRA <x.Type> x y))
+(ADCshiftLLreg (MOVWconst [c]) x y flags) => (ADCconst [c] (SLL <x.Type> x y) flags)
+(ADCshiftRLreg (MOVWconst [c]) x y flags) => (ADCconst [c] (SRL <x.Type> x y) flags)
+(ADCshiftRAreg (MOVWconst [c]) x y flags) => (ADCconst [c] (SRA <x.Type> x y) flags)
+(ADDSshiftLLreg (MOVWconst [c]) x y) => (ADDSconst [c] (SLL <x.Type> x y))
+(ADDSshiftRLreg (MOVWconst [c]) x y) => (ADDSconst [c] (SRL <x.Type> x y))
+(ADDSshiftRAreg (MOVWconst [c]) x y) => (ADDSconst [c] (SRA <x.Type> x y))
+(SUBshiftLLreg (MOVWconst [c]) x y) => (RSBconst [c] (SLL <x.Type> x y))
+(SUBshiftRLreg (MOVWconst [c]) x y) => (RSBconst [c] (SRL <x.Type> x y))
+(SUBshiftRAreg (MOVWconst [c]) x y) => (RSBconst [c] (SRA <x.Type> x y))
+(SBCshiftLLreg (MOVWconst [c]) x y flags) => (RSCconst [c] (SLL <x.Type> x y) flags)
+(SBCshiftRLreg (MOVWconst [c]) x y flags) => (RSCconst [c] (SRL <x.Type> x y) flags)
+(SBCshiftRAreg (MOVWconst [c]) x y flags) => (RSCconst [c] (SRA <x.Type> x y) flags)
+(SUBSshiftLLreg (MOVWconst [c]) x y) => (RSBSconst [c] (SLL <x.Type> x y))
+(SUBSshiftRLreg (MOVWconst [c]) x y) => (RSBSconst [c] (SRL <x.Type> x y))
+(SUBSshiftRAreg (MOVWconst [c]) x y) => (RSBSconst [c] (SRA <x.Type> x y))
+(RSBshiftLLreg (MOVWconst [c]) x y) => (SUBconst [c] (SLL <x.Type> x y))
+(RSBshiftRLreg (MOVWconst [c]) x y) => (SUBconst [c] (SRL <x.Type> x y))
+(RSBshiftRAreg (MOVWconst [c]) x y) => (SUBconst [c] (SRA <x.Type> x y))
+(RSCshiftLLreg (MOVWconst [c]) x y flags) => (SBCconst [c] (SLL <x.Type> x y) flags)
+(RSCshiftRLreg (MOVWconst [c]) x y flags) => (SBCconst [c] (SRL <x.Type> x y) flags)
+(RSCshiftRAreg (MOVWconst [c]) x y flags) => (SBCconst [c] (SRA <x.Type> x y) flags)
+(RSBSshiftLLreg (MOVWconst [c]) x y) => (SUBSconst [c] (SLL <x.Type> x y))
+(RSBSshiftRLreg (MOVWconst [c]) x y) => (SUBSconst [c] (SRL <x.Type> x y))
+(RSBSshiftRAreg (MOVWconst [c]) x y) => (SUBSconst [c] (SRA <x.Type> x y))
+(ANDshiftLLreg (MOVWconst [c]) x y) => (ANDconst [c] (SLL <x.Type> x y))
+(ANDshiftRLreg (MOVWconst [c]) x y) => (ANDconst [c] (SRL <x.Type> x y))
+(ANDshiftRAreg (MOVWconst [c]) x y) => (ANDconst [c] (SRA <x.Type> x y))
+(ORshiftLLreg (MOVWconst [c]) x y) => (ORconst [c] (SLL <x.Type> x y))
+(ORshiftRLreg (MOVWconst [c]) x y) => (ORconst [c] (SRL <x.Type> x y))
+(ORshiftRAreg (MOVWconst [c]) x y) => (ORconst [c] (SRA <x.Type> x y))
+(XORshiftLLreg (MOVWconst [c]) x y) => (XORconst [c] (SLL <x.Type> x y))
+(XORshiftRLreg (MOVWconst [c]) x y) => (XORconst [c] (SRL <x.Type> x y))
+(XORshiftRAreg (MOVWconst [c]) x y) => (XORconst [c] (SRA <x.Type> x y))
+(CMPshiftLLreg (MOVWconst [c]) x y) => (InvertFlags (CMPconst [c] (SLL <x.Type> x y)))
+(CMPshiftRLreg (MOVWconst [c]) x y) => (InvertFlags (CMPconst [c] (SRL <x.Type> x y)))
+(CMPshiftRAreg (MOVWconst [c]) x y) => (InvertFlags (CMPconst [c] (SRA <x.Type> x y)))
+(TSTshiftLLreg (MOVWconst [c]) x y) => (TSTconst [c] (SLL <x.Type> x y))
+(TSTshiftRLreg (MOVWconst [c]) x y) => (TSTconst [c] (SRL <x.Type> x y))
+(TSTshiftRAreg (MOVWconst [c]) x y) => (TSTconst [c] (SRA <x.Type> x y))
+(TEQshiftLLreg (MOVWconst [c]) x y) => (TEQconst [c] (SLL <x.Type> x y))
+(TEQshiftRLreg (MOVWconst [c]) x y) => (TEQconst [c] (SRL <x.Type> x y))
+(TEQshiftRAreg (MOVWconst [c]) x y) => (TEQconst [c] (SRA <x.Type> x y))
+(CMNshiftLLreg (MOVWconst [c]) x y) => (CMNconst [c] (SLL <x.Type> x y))
+(CMNshiftRLreg (MOVWconst [c]) x y) => (CMNconst [c] (SRL <x.Type> x y))
+(CMNshiftRAreg (MOVWconst [c]) x y) => (CMNconst [c] (SRA <x.Type> x y))
+
+// constant folding in *shift ops
+(ADDshiftLL x (MOVWconst [c]) [d]) => (ADDconst x [c<<uint64(d)])
+(ADDshiftRL x (MOVWconst [c]) [d]) => (ADDconst x [int32(uint32(c)>>uint64(d))])
+(ADDshiftRA x (MOVWconst [c]) [d]) => (ADDconst x [c>>uint64(d)])
+(ADCshiftLL x (MOVWconst [c]) [d] flags) => (ADCconst x [c<<uint64(d)] flags)
+(ADCshiftRL x (MOVWconst [c]) [d] flags) => (ADCconst x [int32(uint32(c)>>uint64(d))] flags)
+(ADCshiftRA x (MOVWconst [c]) [d] flags) => (ADCconst x [c>>uint64(d)] flags)
+(ADDSshiftLL x (MOVWconst [c]) [d]) => (ADDSconst x [c<<uint64(d)])
+(ADDSshiftRL x (MOVWconst [c]) [d]) => (ADDSconst x [int32(uint32(c)>>uint64(d))])
+(ADDSshiftRA x (MOVWconst [c]) [d]) => (ADDSconst x [c>>uint64(d)])
+(SUBshiftLL x (MOVWconst [c]) [d]) => (SUBconst x [c<<uint64(d)])
+(SUBshiftRL x (MOVWconst [c]) [d]) => (SUBconst x [int32(uint32(c)>>uint64(d))])
+(SUBshiftRA x (MOVWconst [c]) [d]) => (SUBconst x [c>>uint64(d)])
+(SBCshiftLL x (MOVWconst [c]) [d] flags) => (SBCconst x [c<<uint64(d)] flags)
+(SBCshiftRL x (MOVWconst [c]) [d] flags) => (SBCconst x [int32(uint32(c)>>uint64(d))] flags)
+(SBCshiftRA x (MOVWconst [c]) [d] flags) => (SBCconst x [c>>uint64(d)] flags)
+(SUBSshiftLL x (MOVWconst [c]) [d]) => (SUBSconst x [c<<uint64(d)])
+(SUBSshiftRL x (MOVWconst [c]) [d]) => (SUBSconst x [int32(uint32(c)>>uint64(d))])
+(SUBSshiftRA x (MOVWconst [c]) [d]) => (SUBSconst x [c>>uint64(d)])
+(RSBshiftLL x (MOVWconst [c]) [d]) => (RSBconst x [c<<uint64(d)])
+(RSBshiftRL x (MOVWconst [c]) [d]) => (RSBconst x [int32(uint32(c)>>uint64(d))])
+(RSBshiftRA x (MOVWconst [c]) [d]) => (RSBconst x [c>>uint64(d)])
+(RSCshiftLL x (MOVWconst [c]) [d] flags) => (RSCconst x [c<<uint64(d)] flags)
+(RSCshiftRL x (MOVWconst [c]) [d] flags) => (RSCconst x [int32(uint32(c)>>uint64(d))] flags)
+(RSCshiftRA x (MOVWconst [c]) [d] flags) => (RSCconst x [c>>uint64(d)] flags)
+(RSBSshiftLL x (MOVWconst [c]) [d]) => (RSBSconst x [c<<uint64(d)])
+(RSBSshiftRL x (MOVWconst [c]) [d]) => (RSBSconst x [int32(uint32(c)>>uint64(d))])
+(RSBSshiftRA x (MOVWconst [c]) [d]) => (RSBSconst x [c>>uint64(d)])
+(ANDshiftLL x (MOVWconst [c]) [d]) => (ANDconst x [c<<uint64(d)])
+(ANDshiftRL x (MOVWconst [c]) [d]) => (ANDconst x [int32(uint32(c)>>uint64(d))])
+(ANDshiftRA x (MOVWconst [c]) [d]) => (ANDconst x [c>>uint64(d)])
+(ORshiftLL x (MOVWconst [c]) [d]) => (ORconst x [c<<uint64(d)])
+(ORshiftRL x (MOVWconst [c]) [d]) => (ORconst x [int32(uint32(c)>>uint64(d))])
+(ORshiftRA x (MOVWconst [c]) [d]) => (ORconst x [c>>uint64(d)])
+(XORshiftLL x (MOVWconst [c]) [d]) => (XORconst x [c<<uint64(d)])
+(XORshiftRL x (MOVWconst [c]) [d]) => (XORconst x [int32(uint32(c)>>uint64(d))])
+(XORshiftRA x (MOVWconst [c]) [d]) => (XORconst x [c>>uint64(d)])
+(XORshiftRR x (MOVWconst [c]) [d]) => (XORconst x [int32(uint32(c)>>uint64(d)|uint32(c)<<uint64(32-d))])
+(BICshiftLL x (MOVWconst [c]) [d]) => (BICconst x [c<<uint64(d)])
+(BICshiftRL x (MOVWconst [c]) [d]) => (BICconst x [int32(uint32(c)>>uint64(d))])
+(BICshiftRA x (MOVWconst [c]) [d]) => (BICconst x [c>>uint64(d)])
+(MVNshiftLL (MOVWconst [c]) [d]) => (MOVWconst [^(c<<uint64(d))])
+(MVNshiftRL (MOVWconst [c]) [d]) => (MOVWconst [^int32(uint32(c)>>uint64(d))])
+(MVNshiftRA (MOVWconst [c]) [d]) => (MOVWconst [int32(c)>>uint64(d)])
+(CMPshiftLL x (MOVWconst [c]) [d]) => (CMPconst x [c<<uint64(d)])
+(CMPshiftRL x (MOVWconst [c]) [d]) => (CMPconst x [int32(uint32(c)>>uint64(d))])
+(CMPshiftRA x (MOVWconst [c]) [d]) => (CMPconst x [c>>uint64(d)])
+(TSTshiftLL x (MOVWconst [c]) [d]) => (TSTconst x [c<<uint64(d)])
+(TSTshiftRL x (MOVWconst [c]) [d]) => (TSTconst x [int32(uint32(c)>>uint64(d))])
+(TSTshiftRA x (MOVWconst [c]) [d]) => (TSTconst x [c>>uint64(d)])
+(TEQshiftLL x (MOVWconst [c]) [d]) => (TEQconst x [c<<uint64(d)])
+(TEQshiftRL x (MOVWconst [c]) [d]) => (TEQconst x [int32(uint32(c)>>uint64(d))])
+(TEQshiftRA x (MOVWconst [c]) [d]) => (TEQconst x [c>>uint64(d)])
+(CMNshiftLL x (MOVWconst [c]) [d]) => (CMNconst x [c<<uint64(d)])
+(CMNshiftRL x (MOVWconst [c]) [d]) => (CMNconst x [int32(uint32(c)>>uint64(d))])
+(CMNshiftRA x (MOVWconst [c]) [d]) => (CMNconst x [c>>uint64(d)])
+
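+// fold constant shift amounts in the range [0,32) back into the immediate-shift forms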
+(ADDshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ADDshiftLL x y [c])
+(ADDshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ADDshiftRL x y [c])
+(ADDshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ADDshiftRA x y [c])
+(ADCshiftLLreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (ADCshiftLL x y [c] flags)
+(ADCshiftRLreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (ADCshiftRL x y [c] flags)
+(ADCshiftRAreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (ADCshiftRA x y [c] flags)
+(ADDSshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ADDSshiftLL x y [c])
+(ADDSshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ADDSshiftRL x y [c])
+(ADDSshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ADDSshiftRA x y [c])
+(SUBshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (SUBshiftLL x y [c])
+(SUBshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (SUBshiftRL x y [c])
+(SUBshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (SUBshiftRA x y [c])
+(SBCshiftLLreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (SBCshiftLL x y [c] flags)
+(SBCshiftRLreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (SBCshiftRL x y [c] flags)
+(SBCshiftRAreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (SBCshiftRA x y [c] flags)
+(SUBSshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (SUBSshiftLL x y [c])
+(SUBSshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (SUBSshiftRL x y [c])
+(SUBSshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (SUBSshiftRA x y [c])
+(RSBshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (RSBshiftLL x y [c])
+(RSBshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (RSBshiftRL x y [c])
+(RSBshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (RSBshiftRA x y [c])
+(RSCshiftLLreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (RSCshiftLL x y [c] flags)
+(RSCshiftRLreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (RSCshiftRL x y [c] flags)
+(RSCshiftRAreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (RSCshiftRA x y [c] flags)
+(RSBSshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (RSBSshiftLL x y [c])
+(RSBSshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (RSBSshiftRL x y [c])
+(RSBSshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (RSBSshiftRA x y [c])
+(ANDshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ANDshiftLL x y [c])
+(ANDshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ANDshiftRL x y [c])
+(ANDshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ANDshiftRA x y [c])
+(ORshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ORshiftLL x y [c])
+(ORshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ORshiftRL x y [c])
+(ORshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ORshiftRA x y [c])
+(XORshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (XORshiftLL x y [c])
+(XORshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (XORshiftRL x y [c])
+(XORshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (XORshiftRA x y [c])
+(BICshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (BICshiftLL x y [c])
+(BICshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (BICshiftRL x y [c])
+(BICshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (BICshiftRA x y [c])
+(MVNshiftLLreg x (MOVWconst [c])) && 0 <= c && c < 32 => (MVNshiftLL x [c])
+(MVNshiftRLreg x (MOVWconst [c])) && 0 <= c && c < 32 => (MVNshiftRL x [c])
+(MVNshiftRAreg x (MOVWconst [c])) && 0 <= c && c < 32 => (MVNshiftRA x [c])
+(CMPshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (CMPshiftLL x y [c])
+(CMPshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (CMPshiftRL x y [c])
+(CMPshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (CMPshiftRA x y [c])
+(TSTshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (TSTshiftLL x y [c])
+(TSTshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (TSTshiftRL x y [c])
+(TSTshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (TSTshiftRA x y [c])
+(TEQshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (TEQshiftLL x y [c])
+(TEQshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (TEQshiftRL x y [c])
+(TEQshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (TEQshiftRA x y [c])
+(CMNshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (CMNshiftLL x y [c])
+(CMNshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (CMNshiftRL x y [c])
+(CMNshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (CMNshiftRA x y [c])
+
+// Generate rotates
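+// ARM has only a rotate-right shift (SRR), so a left rotate by c becomes a right
+// rotate by 32-c; (x<<c) | (x>>(32-c)) and its ADD/XOR variants match below.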
+(ADDshiftLL [c] (SRLconst x [32-c]) x) => (SRRconst [32-c] x)
+( ORshiftLL [c] (SRLconst x [32-c]) x) => (SRRconst [32-c] x)
+(XORshiftLL [c] (SRLconst x [32-c]) x) => (SRRconst [32-c] x)
+(ADDshiftRL [c] (SLLconst x [32-c]) x) => (SRRconst [ c] x)
+( ORshiftRL [c] (SLLconst x [32-c]) x) => (SRRconst [ c] x)
+(XORshiftRL [c] (SLLconst x [32-c]) x) => (SRRconst [ c] x)
+
+(RotateLeft32 x (MOVWconst [c])) => (SRRconst [-c&31] x)
+(RotateLeft16 <t> x (MOVWconst [c])) => (Or16 (Lsh16x32 <t> x (MOVWconst [c&15])) (Rsh16Ux32 <t> x (MOVWconst [-c&15])))
+(RotateLeft8 <t> x (MOVWconst [c])) => (Or8 (Lsh8x32 <t> x (MOVWconst [c&7])) (Rsh8Ux32 <t> x (MOVWconst [-c&7])))
+(RotateLeft32 x y) => (SRR x (RSBconst [0] <y.Type> y))
+
+// ((x>>8) | (x<<8)) -> (REV16 x), where x has type uint16; "|" can also be "^" or "+".
+// The UBFX instruction requires ARMv6T2 or ARMv7, while REV16 only requires ARMv6,
+// so for ARMv6 we match the SLLconst/SRLconst/ORshiftLL form instead.
+((ADDshiftLL|ORshiftLL|XORshiftLL) <typ.UInt16> [8] (BFXU <typ.UInt16> [int32(armBFAuxInt(8, 8))] x) x) => (REV16 x)
+((ADDshiftLL|ORshiftLL|XORshiftLL) <typ.UInt16> [8] (SRLconst <typ.UInt16> [24] (SLLconst [16] x)) x) && objabi.GOARM>=6 => (REV16 x)
+
+// use indexed loads and stores
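+// These forms require a zero offset and no symbol (sym == nil) so that no
+// displacement is lost when switching to register-indexed addressing.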
+(MOVWload [0] {sym} (ADD ptr idx) mem) && sym == nil => (MOVWloadidx ptr idx mem)
+(MOVWstore [0] {sym} (ADD ptr idx) val mem) && sym == nil => (MOVWstoreidx ptr idx val mem)
+(MOVWload [0] {sym} (ADDshiftLL ptr idx [c]) mem) && sym == nil => (MOVWloadshiftLL ptr idx [c] mem)
+(MOVWload [0] {sym} (ADDshiftRL ptr idx [c]) mem) && sym == nil => (MOVWloadshiftRL ptr idx [c] mem)
+(MOVWload [0] {sym} (ADDshiftRA ptr idx [c]) mem) && sym == nil => (MOVWloadshiftRA ptr idx [c] mem)
+(MOVWstore [0] {sym} (ADDshiftLL ptr idx [c]) val mem) && sym == nil => (MOVWstoreshiftLL ptr idx [c] val mem)
+(MOVWstore [0] {sym} (ADDshiftRL ptr idx [c]) val mem) && sym == nil => (MOVWstoreshiftRL ptr idx [c] val mem)
+(MOVWstore [0] {sym} (ADDshiftRA ptr idx [c]) val mem) && sym == nil => (MOVWstoreshiftRA ptr idx [c] val mem)
+(MOVBUload [0] {sym} (ADD ptr idx) mem) && sym == nil => (MOVBUloadidx ptr idx mem)
+(MOVBload [0] {sym} (ADD ptr idx) mem) && sym == nil => (MOVBloadidx ptr idx mem)
+(MOVBstore [0] {sym} (ADD ptr idx) val mem) && sym == nil => (MOVBstoreidx ptr idx val mem)
+(MOVHUload [0] {sym} (ADD ptr idx) mem) && sym == nil => (MOVHUloadidx ptr idx mem)
+(MOVHload [0] {sym} (ADD ptr idx) mem) && sym == nil => (MOVHloadidx ptr idx mem)
+(MOVHstore [0] {sym} (ADD ptr idx) val mem) && sym == nil => (MOVHstoreidx ptr idx val mem)
+
+// constant folding in indexed loads and stores
+(MOVWloadidx ptr (MOVWconst [c]) mem) => (MOVWload [c] ptr mem)
+(MOVWloadidx (MOVWconst [c]) ptr mem) => (MOVWload [c] ptr mem)
+(MOVBloadidx ptr (MOVWconst [c]) mem) => (MOVBload [c] ptr mem)
+(MOVBloadidx (MOVWconst [c]) ptr mem) => (MOVBload [c] ptr mem)
+(MOVBUloadidx ptr (MOVWconst [c]) mem) => (MOVBUload [c] ptr mem)
+(MOVBUloadidx (MOVWconst [c]) ptr mem) => (MOVBUload [c] ptr mem)
+(MOVHUloadidx ptr (MOVWconst [c]) mem) => (MOVHUload [c] ptr mem)
+(MOVHUloadidx (MOVWconst [c]) ptr mem) => (MOVHUload [c] ptr mem)
+(MOVHloadidx ptr (MOVWconst [c]) mem) => (MOVHload [c] ptr mem)
+(MOVHloadidx (MOVWconst [c]) ptr mem) => (MOVHload [c] ptr mem)
+
+(MOVWstoreidx ptr (MOVWconst [c]) val mem) => (MOVWstore [c] ptr val mem)
+(MOVWstoreidx (MOVWconst [c]) ptr val mem) => (MOVWstore [c] ptr val mem)
+(MOVBstoreidx ptr (MOVWconst [c]) val mem) => (MOVBstore [c] ptr val mem)
+(MOVBstoreidx (MOVWconst [c]) ptr val mem) => (MOVBstore [c] ptr val mem)
+(MOVHstoreidx ptr (MOVWconst [c]) val mem) => (MOVHstore [c] ptr val mem)
+(MOVHstoreidx (MOVWconst [c]) ptr val mem) => (MOVHstore [c] ptr val mem)
+
+(MOVWloadidx ptr (SLLconst idx [c]) mem) => (MOVWloadshiftLL ptr idx [c] mem)
+(MOVWloadidx (SLLconst idx [c]) ptr mem) => (MOVWloadshiftLL ptr idx [c] mem)
+(MOVWloadidx ptr (SRLconst idx [c]) mem) => (MOVWloadshiftRL ptr idx [c] mem)
+(MOVWloadidx (SRLconst idx [c]) ptr mem) => (MOVWloadshiftRL ptr idx [c] mem)
+(MOVWloadidx ptr (SRAconst idx [c]) mem) => (MOVWloadshiftRA ptr idx [c] mem)
+(MOVWloadidx (SRAconst idx [c]) ptr mem) => (MOVWloadshiftRA ptr idx [c] mem)
+
+(MOVWstoreidx ptr (SLLconst idx [c]) val mem) => (MOVWstoreshiftLL ptr idx [c] val mem)
+(MOVWstoreidx (SLLconst idx [c]) ptr val mem) => (MOVWstoreshiftLL ptr idx [c] val mem)
+(MOVWstoreidx ptr (SRLconst idx [c]) val mem) => (MOVWstoreshiftRL ptr idx [c] val mem)
+(MOVWstoreidx (SRLconst idx [c]) ptr val mem) => (MOVWstoreshiftRL ptr idx [c] val mem)
+(MOVWstoreidx ptr (SRAconst idx [c]) val mem) => (MOVWstoreshiftRA ptr idx [c] val mem)
+(MOVWstoreidx (SRAconst idx [c]) ptr val mem) => (MOVWstoreshiftRA ptr idx [c] val mem)
+
+(MOVWloadshiftLL ptr (MOVWconst [c]) [d] mem) => (MOVWload [int32(uint32(c)<<uint64(d))] ptr mem)
+(MOVWloadshiftRL ptr (MOVWconst [c]) [d] mem) => (MOVWload [int32(uint32(c)>>uint64(d))] ptr mem)
+(MOVWloadshiftRA ptr (MOVWconst [c]) [d] mem) => (MOVWload [c>>uint64(d)] ptr mem)
+
+(MOVWstoreshiftLL ptr (MOVWconst [c]) [d] val mem) => (MOVWstore [int32(uint32(c)<<uint64(d))] ptr val mem)
+(MOVWstoreshiftRL ptr (MOVWconst [c]) [d] val mem) => (MOVWstore [int32(uint32(c)>>uint64(d))] ptr val mem)
+(MOVWstoreshiftRA ptr (MOVWconst [c]) [d] val mem) => (MOVWstore [c>>uint64(d)] ptr val mem)
+
+// generic simplifications
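+// e.g. x + (0 - y) => x - y; x-x, x^x and x&^x all fold to 0.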
+(ADD x (RSBconst [0] y)) => (SUB x y)
+(ADD <t> (RSBconst [c] x) (RSBconst [d] y)) => (RSBconst [c+d] (ADD <t> x y))
+(SUB x x) => (MOVWconst [0])
+(RSB x x) => (MOVWconst [0])
+(AND x x) => x
+(OR x x) => x
+(XOR x x) => (MOVWconst [0])
+(BIC x x) => (MOVWconst [0])
+
+(ADD (MUL x y) a) => (MULA x y a)
+(SUB a (MUL x y)) && objabi.GOARM == 7 => (MULS x y a)
+(RSB (MUL x y) a) && objabi.GOARM == 7 => (MULS x y a)
+
+(NEGF (MULF x y)) && objabi.GOARM >= 6 => (NMULF x y)
+(NEGD (MULD x y)) && objabi.GOARM >= 6 => (NMULD x y)
+(MULF (NEGF x) y) && objabi.GOARM >= 6 => (NMULF x y)
+(MULD (NEGD x) y) && objabi.GOARM >= 6 => (NMULD x y)
+(NMULF (NEGF x) y) => (MULF x y)
+(NMULD (NEGD x) y) => (MULD x y)
+
+// The fused multiply-add/sub result overwrites the addend, since they share a
+// register, so the addend must have no other uses (a.Uses == 1).
+(ADDF a (MULF x y)) && a.Uses == 1 && objabi.GOARM >= 6 => (MULAF a x y)
+(ADDF a (NMULF x y)) && a.Uses == 1 && objabi.GOARM >= 6 => (MULSF a x y)
+(ADDD a (MULD x y)) && a.Uses == 1 && objabi.GOARM >= 6 => (MULAD a x y)
+(ADDD a (NMULD x y)) && a.Uses == 1 && objabi.GOARM >= 6 => (MULSD a x y)
+(SUBF a (MULF x y)) && a.Uses == 1 && objabi.GOARM >= 6 => (MULSF a x y)
+(SUBF a (NMULF x y)) && a.Uses == 1 && objabi.GOARM >= 6 => (MULAF a x y)
+(SUBD a (MULD x y)) && a.Uses == 1 && objabi.GOARM >= 6 => (MULSD a x y)
+(SUBD a (NMULD x y)) && a.Uses == 1 && objabi.GOARM >= 6 => (MULAD a x y)
+
+(AND x (MVN y)) => (BIC x y)
+
+// simplification with *shift ops
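+// e.g. (SUBshiftLL (SLLconst x [c]) x [c]) computes (x<<c) - (x<<c) and folds to 0;
+// the AND/OR forms with a matching shifted operand fold to that operand.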
+(SUBshiftLL (SLLconst x [c]) x [c]) => (MOVWconst [0])
+(SUBshiftRL (SRLconst x [c]) x [c]) => (MOVWconst [0])
+(SUBshiftRA (SRAconst x [c]) x [c]) => (MOVWconst [0])
+(RSBshiftLL (SLLconst x [c]) x [c]) => (MOVWconst [0])
+(RSBshiftRL (SRLconst x [c]) x [c]) => (MOVWconst [0])
+(RSBshiftRA (SRAconst x [c]) x [c]) => (MOVWconst [0])
+(ANDshiftLL y:(SLLconst x [c]) x [c]) => y
+(ANDshiftRL y:(SRLconst x [c]) x [c]) => y
+(ANDshiftRA y:(SRAconst x [c]) x [c]) => y
+(ORshiftLL y:(SLLconst x [c]) x [c]) => y
+(ORshiftRL y:(SRLconst x [c]) x [c]) => y
+(ORshiftRA y:(SRAconst x [c]) x [c]) => y
+(XORshiftLL (SLLconst x [c]) x [c]) => (MOVWconst [0])
+(XORshiftRL (SRLconst x [c]) x [c]) => (MOVWconst [0])
+(XORshiftRA (SRAconst x [c]) x [c]) => (MOVWconst [0])
+(BICshiftLL (SLLconst x [c]) x [c]) => (MOVWconst [0])
+(BICshiftRL (SRLconst x [c]) x [c]) => (MOVWconst [0])
+(BICshiftRA (SRAconst x [c]) x [c]) => (MOVWconst [0])
+(AND x (MVNshiftLL y [c])) => (BICshiftLL x y [c])
+(AND x (MVNshiftRL y [c])) => (BICshiftRL x y [c])
+(AND x (MVNshiftRA y [c])) => (BICshiftRA x y [c])
+
+// floating point optimizations
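+// a compare against floating-point zero uses the dedicated compare-with-zero forms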
+(CMPF x (MOVFconst [0])) => (CMPF0 x)
+(CMPD x (MOVDconst [0])) => (CMPD0 x)
+
+// bit extraction
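+// A left shift by c followed by a right shift by d (d >= c) isolates a bit field;
+// it is encoded as BFX/BFXU with LSB = d-c and Width = 32-d.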
+(SRAconst (SLLconst x [c]) [d]) && objabi.GOARM==7 && uint64(d)>=uint64(c) && uint64(d)<=31 => (BFX [(d-c)|(32-d)<<8] x)
+(SRLconst (SLLconst x [c]) [d]) && objabi.GOARM==7 && uint64(d)>=uint64(c) && uint64(d)<=31 => (BFXU [(d-c)|(32-d)<<8] x)
+
+// comparison simplification
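+// A branch on an op's result compared with 0 can reuse the op's flag-setting form.
+// EQ/NE carry over directly; the signed conditions switch to the *noov blocks,
+// which test the flags without relying on V.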
+((LT|LE|EQ|NE|GE|GT) (CMP x (RSBconst [0] y))) => ((LT|LE|EQ|NE|GE|GT) (CMN x y)) // sense of carry bit not preserved
+((LT|LE|EQ|NE|GE|GT) (CMN x (RSBconst [0] y))) => ((LT|LE|EQ|NE|GE|GT) (CMP x y)) // sense of carry bit not preserved
+(EQ (CMPconst [0] l:(SUB x y)) yes no) && l.Uses==1 => (EQ (CMP x y) yes no)
+(EQ (CMPconst [0] l:(MULS x y a)) yes no) && l.Uses==1 => (EQ (CMP a (MUL <x.Type> x y)) yes no)
+(EQ (CMPconst [0] l:(SUBconst [c] x)) yes no) && l.Uses==1 => (EQ (CMPconst [c] x) yes no)
+(EQ (CMPconst [0] l:(SUBshiftLL x y [c])) yes no) && l.Uses==1 => (EQ (CMPshiftLL x y [c]) yes no)
+(EQ (CMPconst [0] l:(SUBshiftRL x y [c])) yes no) && l.Uses==1 => (EQ (CMPshiftRL x y [c]) yes no)
+(EQ (CMPconst [0] l:(SUBshiftRA x y [c])) yes no) && l.Uses==1 => (EQ (CMPshiftRA x y [c]) yes no)
+(EQ (CMPconst [0] l:(SUBshiftLLreg x y z)) yes no) && l.Uses==1 => (EQ (CMPshiftLLreg x y z) yes no)
+(EQ (CMPconst [0] l:(SUBshiftRLreg x y z)) yes no) && l.Uses==1 => (EQ (CMPshiftRLreg x y z) yes no)
+(EQ (CMPconst [0] l:(SUBshiftRAreg x y z)) yes no) && l.Uses==1 => (EQ (CMPshiftRAreg x y z) yes no)
+(NE (CMPconst [0] l:(SUB x y)) yes no) && l.Uses==1 => (NE (CMP x y) yes no)
+(NE (CMPconst [0] l:(MULS x y a)) yes no) && l.Uses==1 => (NE (CMP a (MUL <x.Type> x y)) yes no)
+(NE (CMPconst [0] l:(SUBconst [c] x)) yes no) && l.Uses==1 => (NE (CMPconst [c] x) yes no)
+(NE (CMPconst [0] l:(SUBshiftLL x y [c])) yes no) && l.Uses==1 => (NE (CMPshiftLL x y [c]) yes no)
+(NE (CMPconst [0] l:(SUBshiftRL x y [c])) yes no) && l.Uses==1 => (NE (CMPshiftRL x y [c]) yes no)
+(NE (CMPconst [0] l:(SUBshiftRA x y [c])) yes no) && l.Uses==1 => (NE (CMPshiftRA x y [c]) yes no)
+(NE (CMPconst [0] l:(SUBshiftLLreg x y z)) yes no) && l.Uses==1 => (NE (CMPshiftLLreg x y z) yes no)
+(NE (CMPconst [0] l:(SUBshiftRLreg x y z)) yes no) && l.Uses==1 => (NE (CMPshiftRLreg x y z) yes no)
+(NE (CMPconst [0] l:(SUBshiftRAreg x y z)) yes no) && l.Uses==1 => (NE (CMPshiftRAreg x y z) yes no)
+(EQ (CMPconst [0] l:(ADD x y)) yes no) && l.Uses==1 => (EQ (CMN x y) yes no)
+(EQ (CMPconst [0] l:(MULA x y a)) yes no) && l.Uses==1 => (EQ (CMN a (MUL <x.Type> x y)) yes no)
+(EQ (CMPconst [0] l:(ADDconst [c] x)) yes no) && l.Uses==1 => (EQ (CMNconst [c] x) yes no)
+(EQ (CMPconst [0] l:(ADDshiftLL x y [c])) yes no) && l.Uses==1 => (EQ (CMNshiftLL x y [c]) yes no)
+(EQ (CMPconst [0] l:(ADDshiftRL x y [c])) yes no) && l.Uses==1 => (EQ (CMNshiftRL x y [c]) yes no)
+(EQ (CMPconst [0] l:(ADDshiftRA x y [c])) yes no) && l.Uses==1 => (EQ (CMNshiftRA x y [c]) yes no)
+(EQ (CMPconst [0] l:(ADDshiftLLreg x y z)) yes no) && l.Uses==1 => (EQ (CMNshiftLLreg x y z) yes no)
+(EQ (CMPconst [0] l:(ADDshiftRLreg x y z)) yes no) && l.Uses==1 => (EQ (CMNshiftRLreg x y z) yes no)
+(EQ (CMPconst [0] l:(ADDshiftRAreg x y z)) yes no) && l.Uses==1 => (EQ (CMNshiftRAreg x y z) yes no)
+(NE (CMPconst [0] l:(ADD x y)) yes no) && l.Uses==1 => (NE (CMN x y) yes no)
+(NE (CMPconst [0] l:(MULA x y a)) yes no) && l.Uses==1 => (NE (CMN a (MUL <x.Type> x y)) yes no)
+(NE (CMPconst [0] l:(ADDconst [c] x)) yes no) && l.Uses==1 => (NE (CMNconst [c] x) yes no)
+(NE (CMPconst [0] l:(ADDshiftLL x y [c])) yes no) && l.Uses==1 => (NE (CMNshiftLL x y [c]) yes no)
+(NE (CMPconst [0] l:(ADDshiftRL x y [c])) yes no) && l.Uses==1 => (NE (CMNshiftRL x y [c]) yes no)
+(NE (CMPconst [0] l:(ADDshiftRA x y [c])) yes no) && l.Uses==1 => (NE (CMNshiftRA x y [c]) yes no)
+(NE (CMPconst [0] l:(ADDshiftLLreg x y z)) yes no) && l.Uses==1 => (NE (CMNshiftLLreg x y z) yes no)
+(NE (CMPconst [0] l:(ADDshiftRLreg x y z)) yes no) && l.Uses==1 => (NE (CMNshiftRLreg x y z) yes no)
+(NE (CMPconst [0] l:(ADDshiftRAreg x y z)) yes no) && l.Uses==1 => (NE (CMNshiftRAreg x y z) yes no)
+(EQ (CMPconst [0] l:(AND x y)) yes no) && l.Uses==1 => (EQ (TST x y) yes no)
+(EQ (CMPconst [0] l:(ANDconst [c] x)) yes no) && l.Uses==1 => (EQ (TSTconst [c] x) yes no)
+(EQ (CMPconst [0] l:(ANDshiftLL x y [c])) yes no) && l.Uses==1 => (EQ (TSTshiftLL x y [c]) yes no)
+(EQ (CMPconst [0] l:(ANDshiftRL x y [c])) yes no) && l.Uses==1 => (EQ (TSTshiftRL x y [c]) yes no)
+(EQ (CMPconst [0] l:(ANDshiftRA x y [c])) yes no) && l.Uses==1 => (EQ (TSTshiftRA x y [c]) yes no)
+(EQ (CMPconst [0] l:(ANDshiftLLreg x y z)) yes no) && l.Uses==1 => (EQ (TSTshiftLLreg x y z) yes no)
+(EQ (CMPconst [0] l:(ANDshiftRLreg x y z)) yes no) && l.Uses==1 => (EQ (TSTshiftRLreg x y z) yes no)
+(EQ (CMPconst [0] l:(ANDshiftRAreg x y z)) yes no) && l.Uses==1 => (EQ (TSTshiftRAreg x y z) yes no)
+(NE (CMPconst [0] l:(AND x y)) yes no) && l.Uses==1 => (NE (TST x y) yes no)
+(NE (CMPconst [0] l:(ANDconst [c] x)) yes no) && l.Uses==1 => (NE (TSTconst [c] x) yes no)
+(NE (CMPconst [0] l:(ANDshiftLL x y [c])) yes no) && l.Uses==1 => (NE (TSTshiftLL x y [c]) yes no)
+(NE (CMPconst [0] l:(ANDshiftRL x y [c])) yes no) && l.Uses==1 => (NE (TSTshiftRL x y [c]) yes no)
+(NE (CMPconst [0] l:(ANDshiftRA x y [c])) yes no) && l.Uses==1 => (NE (TSTshiftRA x y [c]) yes no)
+(NE (CMPconst [0] l:(ANDshiftLLreg x y z)) yes no) && l.Uses==1 => (NE (TSTshiftLLreg x y z) yes no)
+(NE (CMPconst [0] l:(ANDshiftRLreg x y z)) yes no) && l.Uses==1 => (NE (TSTshiftRLreg x y z) yes no)
+(NE (CMPconst [0] l:(ANDshiftRAreg x y z)) yes no) && l.Uses==1 => (NE (TSTshiftRAreg x y z) yes no)
+(EQ (CMPconst [0] l:(XOR x y)) yes no) && l.Uses==1 => (EQ (TEQ x y) yes no)
+(EQ (CMPconst [0] l:(XORconst [c] x)) yes no) && l.Uses==1 => (EQ (TEQconst [c] x) yes no)
+(EQ (CMPconst [0] l:(XORshiftLL x y [c])) yes no) && l.Uses==1 => (EQ (TEQshiftLL x y [c]) yes no)
+(EQ (CMPconst [0] l:(XORshiftRL x y [c])) yes no) && l.Uses==1 => (EQ (TEQshiftRL x y [c]) yes no)
+(EQ (CMPconst [0] l:(XORshiftRA x y [c])) yes no) && l.Uses==1 => (EQ (TEQshiftRA x y [c]) yes no)
+(EQ (CMPconst [0] l:(XORshiftLLreg x y z)) yes no) && l.Uses==1 => (EQ (TEQshiftLLreg x y z) yes no)
+(EQ (CMPconst [0] l:(XORshiftRLreg x y z)) yes no) && l.Uses==1 => (EQ (TEQshiftRLreg x y z) yes no)
+(EQ (CMPconst [0] l:(XORshiftRAreg x y z)) yes no) && l.Uses==1 => (EQ (TEQshiftRAreg x y z) yes no)
+(NE (CMPconst [0] l:(XOR x y)) yes no) && l.Uses==1 => (NE (TEQ x y) yes no)
+(NE (CMPconst [0] l:(XORconst [c] x)) yes no) && l.Uses==1 => (NE (TEQconst [c] x) yes no)
+(NE (CMPconst [0] l:(XORshiftLL x y [c])) yes no) && l.Uses==1 => (NE (TEQshiftLL x y [c]) yes no)
+(NE (CMPconst [0] l:(XORshiftRL x y [c])) yes no) && l.Uses==1 => (NE (TEQshiftRL x y [c]) yes no)
+(NE (CMPconst [0] l:(XORshiftRA x y [c])) yes no) && l.Uses==1 => (NE (TEQshiftRA x y [c]) yes no)
+(NE (CMPconst [0] l:(XORshiftLLreg x y z)) yes no) && l.Uses==1 => (NE (TEQshiftLLreg x y z) yes no)
+(NE (CMPconst [0] l:(XORshiftRLreg x y z)) yes no) && l.Uses==1 => (NE (TEQshiftRLreg x y z) yes no)
+(NE (CMPconst [0] l:(XORshiftRAreg x y z)) yes no) && l.Uses==1 => (NE (TEQshiftRAreg x y z) yes no)
+(LT (CMPconst [0] l:(SUB x y)) yes no) && l.Uses==1 => (LTnoov (CMP x y) yes no)
+(LT (CMPconst [0] l:(MULS x y a)) yes no) && l.Uses==1 => (LTnoov (CMP a (MUL <x.Type> x y)) yes no)
+(LT (CMPconst [0] l:(SUBconst [c] x)) yes no) && l.Uses==1 => (LTnoov (CMPconst [c] x) yes no)
+(LT (CMPconst [0] l:(SUBshiftLL x y [c])) yes no) && l.Uses==1 => (LTnoov (CMPshiftLL x y [c]) yes no)
+(LT (CMPconst [0] l:(SUBshiftRL x y [c])) yes no) && l.Uses==1 => (LTnoov (CMPshiftRL x y [c]) yes no)
+(LT (CMPconst [0] l:(SUBshiftRA x y [c])) yes no) && l.Uses==1 => (LTnoov (CMPshiftRA x y [c]) yes no)
+(LT (CMPconst [0] l:(SUBshiftLLreg x y z)) yes no) && l.Uses==1 => (LTnoov (CMPshiftLLreg x y z) yes no)
+(LT (CMPconst [0] l:(SUBshiftRLreg x y z)) yes no) && l.Uses==1 => (LTnoov (CMPshiftRLreg x y z) yes no)
+(LT (CMPconst [0] l:(SUBshiftRAreg x y z)) yes no) && l.Uses==1 => (LTnoov (CMPshiftRAreg x y z) yes no)
+(LE (CMPconst [0] l:(SUB x y)) yes no) && l.Uses==1 => (LEnoov (CMP x y) yes no)
+(LE (CMPconst [0] l:(MULS x y a)) yes no) && l.Uses==1 => (LEnoov (CMP a (MUL <x.Type> x y)) yes no)
+(LE (CMPconst [0] l:(SUBconst [c] x)) yes no) && l.Uses==1 => (LEnoov (CMPconst [c] x) yes no)
+(LE (CMPconst [0] l:(SUBshiftLL x y [c])) yes no) && l.Uses==1 => (LEnoov (CMPshiftLL x y [c]) yes no)
+(LE (CMPconst [0] l:(SUBshiftRL x y [c])) yes no) && l.Uses==1 => (LEnoov (CMPshiftRL x y [c]) yes no)
+(LE (CMPconst [0] l:(SUBshiftRA x y [c])) yes no) && l.Uses==1 => (LEnoov (CMPshiftRA x y [c]) yes no)
+(LE (CMPconst [0] l:(SUBshiftLLreg x y z)) yes no) && l.Uses==1 => (LEnoov (CMPshiftLLreg x y z) yes no)
+(LE (CMPconst [0] l:(SUBshiftRLreg x y z)) yes no) && l.Uses==1 => (LEnoov (CMPshiftRLreg x y z) yes no)
+(LE (CMPconst [0] l:(SUBshiftRAreg x y z)) yes no) && l.Uses==1 => (LEnoov (CMPshiftRAreg x y z) yes no)
+(LT (CMPconst [0] l:(ADD x y)) yes no) && l.Uses==1 => (LTnoov (CMN x y) yes no)
+(LT (CMPconst [0] l:(MULA x y a)) yes no) && l.Uses==1 => (LTnoov (CMN a (MUL <x.Type> x y)) yes no)
+(LT (CMPconst [0] l:(ADDconst [c] x)) yes no) && l.Uses==1 => (LTnoov (CMNconst [c] x) yes no)
+(LT (CMPconst [0] l:(ADDshiftLL x y [c])) yes no) && l.Uses==1 => (LTnoov (CMNshiftLL x y [c]) yes no)
+(LT (CMPconst [0] l:(ADDshiftRL x y [c])) yes no) && l.Uses==1 => (LTnoov (CMNshiftRL x y [c]) yes no)
+(LT (CMPconst [0] l:(ADDshiftRA x y [c])) yes no) && l.Uses==1 => (LTnoov (CMNshiftRA x y [c]) yes no)
+(LT (CMPconst [0] l:(ADDshiftLLreg x y z)) yes no) && l.Uses==1 => (LTnoov (CMNshiftLLreg x y z) yes no)
+(LT (CMPconst [0] l:(ADDshiftRLreg x y z)) yes no) && l.Uses==1 => (LTnoov (CMNshiftRLreg x y z) yes no)
+(LT (CMPconst [0] l:(ADDshiftRAreg x y z)) yes no) && l.Uses==1 => (LTnoov (CMNshiftRAreg x y z) yes no)
+(LE (CMPconst [0] l:(ADD x y)) yes no) && l.Uses==1 => (LEnoov (CMN x y) yes no)
+(LE (CMPconst [0] l:(MULA x y a)) yes no) && l.Uses==1 => (LEnoov (CMN a (MUL <x.Type> x y)) yes no)
+(LE (CMPconst [0] l:(ADDconst [c] x)) yes no) && l.Uses==1 => (LEnoov (CMNconst [c] x) yes no)
+(LE (CMPconst [0] l:(ADDshiftLL x y [c])) yes no) && l.Uses==1 => (LEnoov (CMNshiftLL x y [c]) yes no)
+(LE (CMPconst [0] l:(ADDshiftRL x y [c])) yes no) && l.Uses==1 => (LEnoov (CMNshiftRL x y [c]) yes no)
+(LE (CMPconst [0] l:(ADDshiftRA x y [c])) yes no) && l.Uses==1 => (LEnoov (CMNshiftRA x y [c]) yes no)
+(LE (CMPconst [0] l:(ADDshiftLLreg x y z)) yes no) && l.Uses==1 => (LEnoov (CMNshiftLLreg x y z) yes no)
+(LE (CMPconst [0] l:(ADDshiftRLreg x y z)) yes no) && l.Uses==1 => (LEnoov (CMNshiftRLreg x y z) yes no)
+(LE (CMPconst [0] l:(ADDshiftRAreg x y z)) yes no) && l.Uses==1 => (LEnoov (CMNshiftRAreg x y z) yes no)
+(LT (CMPconst [0] l:(AND x y)) yes no) && l.Uses==1 => (LTnoov (TST x y) yes no)
+(LT (CMPconst [0] l:(ANDconst [c] x)) yes no) && l.Uses==1 => (LTnoov (TSTconst [c] x) yes no)
+(LT (CMPconst [0] l:(ANDshiftLL x y [c])) yes no) && l.Uses==1 => (LTnoov (TSTshiftLL x y [c]) yes no)
+(LT (CMPconst [0] l:(ANDshiftRL x y [c])) yes no) && l.Uses==1 => (LTnoov (TSTshiftRL x y [c]) yes no)
+(LT (CMPconst [0] l:(ANDshiftRA x y [c])) yes no) && l.Uses==1 => (LTnoov (TSTshiftRA x y [c]) yes no)
+(LT (CMPconst [0] l:(ANDshiftLLreg x y z)) yes no) && l.Uses==1 => (LTnoov (TSTshiftLLreg x y z) yes no)
+(LT (CMPconst [0] l:(ANDshiftRLreg x y z)) yes no) && l.Uses==1 => (LTnoov (TSTshiftRLreg x y z) yes no)
+(LT (CMPconst [0] l:(ANDshiftRAreg x y z)) yes no) && l.Uses==1 => (LTnoov (TSTshiftRAreg x y z) yes no)
+(LE (CMPconst [0] l:(AND x y)) yes no) && l.Uses==1 => (LEnoov (TST x y) yes no)
+(LE (CMPconst [0] l:(ANDconst [c] x)) yes no) && l.Uses==1 => (LEnoov (TSTconst [c] x) yes no)
+(LE (CMPconst [0] l:(ANDshiftLL x y [c])) yes no) && l.Uses==1 => (LEnoov (TSTshiftLL x y [c]) yes no)
+(LE (CMPconst [0] l:(ANDshiftRL x y [c])) yes no) && l.Uses==1 => (LEnoov (TSTshiftRL x y [c]) yes no)
+(LE (CMPconst [0] l:(ANDshiftRA x y [c])) yes no) && l.Uses==1 => (LEnoov (TSTshiftRA x y [c]) yes no)
+(LE (CMPconst [0] l:(ANDshiftLLreg x y z)) yes no) && l.Uses==1 => (LEnoov (TSTshiftLLreg x y z) yes no)
+(LE (CMPconst [0] l:(ANDshiftRLreg x y z)) yes no) && l.Uses==1 => (LEnoov (TSTshiftRLreg x y z) yes no)
+(LE (CMPconst [0] l:(ANDshiftRAreg x y z)) yes no) && l.Uses==1 => (LEnoov (TSTshiftRAreg x y z) yes no)
+(LT (CMPconst [0] l:(XOR x y)) yes no) && l.Uses==1 => (LTnoov (TEQ x y) yes no)
+(LT (CMPconst [0] l:(XORconst [c] x)) yes no) && l.Uses==1 => (LTnoov (TEQconst [c] x) yes no)
+(LT (CMPconst [0] l:(XORshiftLL x y [c])) yes no) && l.Uses==1 => (LTnoov (TEQshiftLL x y [c]) yes no)
+(LT (CMPconst [0] l:(XORshiftRL x y [c])) yes no) && l.Uses==1 => (LTnoov (TEQshiftRL x y [c]) yes no)
+(LT (CMPconst [0] l:(XORshiftRA x y [c])) yes no) && l.Uses==1 => (LTnoov (TEQshiftRA x y [c]) yes no)
+(LT (CMPconst [0] l:(XORshiftLLreg x y z)) yes no) && l.Uses==1 => (LTnoov (TEQshiftLLreg x y z) yes no)
+(LT (CMPconst [0] l:(XORshiftRLreg x y z)) yes no) && l.Uses==1 => (LTnoov (TEQshiftRLreg x y z) yes no)
+(LT (CMPconst [0] l:(XORshiftRAreg x y z)) yes no) && l.Uses==1 => (LTnoov (TEQshiftRAreg x y z) yes no)
+(LE (CMPconst [0] l:(XOR x y)) yes no) && l.Uses==1 => (LEnoov (TEQ x y) yes no)
+(LE (CMPconst [0] l:(XORconst [c] x)) yes no) && l.Uses==1 => (LEnoov (TEQconst [c] x) yes no)
+(LE (CMPconst [0] l:(XORshiftLL x y [c])) yes no) && l.Uses==1 => (LEnoov (TEQshiftLL x y [c]) yes no)
+(LE (CMPconst [0] l:(XORshiftRL x y [c])) yes no) && l.Uses==1 => (LEnoov (TEQshiftRL x y [c]) yes no)
+(LE (CMPconst [0] l:(XORshiftRA x y [c])) yes no) && l.Uses==1 => (LEnoov (TEQshiftRA x y [c]) yes no)
+(LE (CMPconst [0] l:(XORshiftLLreg x y z)) yes no) && l.Uses==1 => (LEnoov (TEQshiftLLreg x y z) yes no)
+(LE (CMPconst [0] l:(XORshiftRLreg x y z)) yes no) && l.Uses==1 => (LEnoov (TEQshiftRLreg x y z) yes no)
+(LE (CMPconst [0] l:(XORshiftRAreg x y z)) yes no) && l.Uses==1 => (LEnoov (TEQshiftRAreg x y z) yes no)
+(GT (CMPconst [0] l:(SUB x y)) yes no) && l.Uses==1 => (GTnoov (CMP x y) yes no)
+(GT (CMPconst [0] l:(MULS x y a)) yes no) && l.Uses==1 => (GTnoov (CMP a (MUL <x.Type> x y)) yes no)
+(GT (CMPconst [0] l:(SUBconst [c] x)) yes no) && l.Uses==1 => (GTnoov (CMPconst [c] x) yes no)
+(GT (CMPconst [0] l:(SUBshiftLL x y [c])) yes no) && l.Uses==1 => (GTnoov (CMPshiftLL x y [c]) yes no)
+(GT (CMPconst [0] l:(SUBshiftRL x y [c])) yes no) && l.Uses==1 => (GTnoov (CMPshiftRL x y [c]) yes no)
+(GT (CMPconst [0] l:(SUBshiftRA x y [c])) yes no) && l.Uses==1 => (GTnoov (CMPshiftRA x y [c]) yes no)
+(GT (CMPconst [0] l:(SUBshiftLLreg x y z)) yes no) && l.Uses==1 => (GTnoov (CMPshiftLLreg x y z) yes no)
+(GT (CMPconst [0] l:(SUBshiftRLreg x y z)) yes no) && l.Uses==1 => (GTnoov (CMPshiftRLreg x y z) yes no)
+(GT (CMPconst [0] l:(SUBshiftRAreg x y z)) yes no) && l.Uses==1 => (GTnoov (CMPshiftRAreg x y z) yes no)
+(GE (CMPconst [0] l:(SUB x y)) yes no) && l.Uses==1 => (GEnoov (CMP x y) yes no)
+(GE (CMPconst [0] l:(MULS x y a)) yes no) && l.Uses==1 => (GEnoov (CMP a (MUL <x.Type> x y)) yes no)
+(GE (CMPconst [0] l:(SUBconst [c] x)) yes no) && l.Uses==1 => (GEnoov (CMPconst [c] x) yes no)
+(GE (CMPconst [0] l:(SUBshiftLL x y [c])) yes no) && l.Uses==1 => (GEnoov (CMPshiftLL x y [c]) yes no)
+(GE (CMPconst [0] l:(SUBshiftRL x y [c])) yes no) && l.Uses==1 => (GEnoov (CMPshiftRL x y [c]) yes no)
+(GE (CMPconst [0] l:(SUBshiftRA x y [c])) yes no) && l.Uses==1 => (GEnoov (CMPshiftRA x y [c]) yes no)
+(GE (CMPconst [0] l:(SUBshiftLLreg x y z)) yes no) && l.Uses==1 => (GEnoov (CMPshiftLLreg x y z) yes no)
+(GE (CMPconst [0] l:(SUBshiftRLreg x y z)) yes no) && l.Uses==1 => (GEnoov (CMPshiftRLreg x y z) yes no)
+(GE (CMPconst [0] l:(SUBshiftRAreg x y z)) yes no) && l.Uses==1 => (GEnoov (CMPshiftRAreg x y z) yes no)
+(GT (CMPconst [0] l:(ADD x y)) yes no) && l.Uses==1 => (GTnoov (CMN x y) yes no)
+(GT (CMPconst [0] l:(ADDconst [c] x)) yes no) && l.Uses==1 => (GTnoov (CMNconst [c] x) yes no)
+(GT (CMPconst [0] l:(ADDshiftLL x y [c])) yes no) && l.Uses==1 => (GTnoov (CMNshiftLL x y [c]) yes no)
+(GT (CMPconst [0] l:(ADDshiftRL x y [c])) yes no) && l.Uses==1 => (GTnoov (CMNshiftRL x y [c]) yes no)
+(GT (CMPconst [0] l:(ADDshiftRA x y [c])) yes no) && l.Uses==1 => (GTnoov (CMNshiftRA x y [c]) yes no)
+(GT (CMPconst [0] l:(ADDshiftLLreg x y z)) yes no) && l.Uses==1 => (GTnoov (CMNshiftLLreg x y z) yes no)
+(GT (CMPconst [0] l:(ADDshiftRLreg x y z)) yes no) && l.Uses==1 => (GTnoov (CMNshiftRLreg x y z) yes no)
+(GT (CMPconst [0] l:(ADDshiftRAreg x y z)) yes no) && l.Uses==1 => (GTnoov (CMNshiftRAreg x y z) yes no)
+(GE (CMPconst [0] l:(ADD x y)) yes no) && l.Uses==1 => (GEnoov (CMN x y) yes no)
+(GE (CMPconst [0] l:(MULA x y a)) yes no) && l.Uses==1 => (GEnoov (CMN a (MUL <x.Type> x y)) yes no)
+(GE (CMPconst [0] l:(ADDconst [c] x)) yes no) && l.Uses==1 => (GEnoov (CMNconst [c] x) yes no)
+(GE (CMPconst [0] l:(ADDshiftLL x y [c])) yes no) && l.Uses==1 => (GEnoov (CMNshiftLL x y [c]) yes no)
+(GE (CMPconst [0] l:(ADDshiftRL x y [c])) yes no) && l.Uses==1 => (GEnoov (CMNshiftRL x y [c]) yes no)
+(GE (CMPconst [0] l:(ADDshiftRA x y [c])) yes no) && l.Uses==1 => (GEnoov (CMNshiftRA x y [c]) yes no)
+(GE (CMPconst [0] l:(ADDshiftLLreg x y z)) yes no) && l.Uses==1 => (GEnoov (CMNshiftLLreg x y z) yes no)
+(GE (CMPconst [0] l:(ADDshiftRLreg x y z)) yes no) && l.Uses==1 => (GEnoov (CMNshiftRLreg x y z) yes no)
+(GE (CMPconst [0] l:(ADDshiftRAreg x y z)) yes no) && l.Uses==1 => (GEnoov (CMNshiftRAreg x y z) yes no)
+(GT (CMPconst [0] l:(MULA x y a)) yes no) && l.Uses==1 => (GTnoov (CMN a (MUL <x.Type> x y)) yes no)
+(GT (CMPconst [0] l:(AND x y)) yes no) && l.Uses==1 => (GTnoov (TST x y) yes no)
+(GT (CMPconst [0] l:(ANDconst [c] x)) yes no) && l.Uses==1 => (GTnoov (TSTconst [c] x) yes no)
+(GT (CMPconst [0] l:(ANDshiftLL x y [c])) yes no) && l.Uses==1 => (GTnoov (TSTshiftLL x y [c]) yes no)
+(GT (CMPconst [0] l:(ANDshiftRL x y [c])) yes no) && l.Uses==1 => (GTnoov (TSTshiftRL x y [c]) yes no)
+(GT (CMPconst [0] l:(ANDshiftRA x y [c])) yes no) && l.Uses==1 => (GTnoov (TSTshiftRA x y [c]) yes no)
+(GT (CMPconst [0] l:(ANDshiftLLreg x y z)) yes no) && l.Uses==1 => (GTnoov (TSTshiftLLreg x y z) yes no)
+(GT (CMPconst [0] l:(ANDshiftRLreg x y z)) yes no) && l.Uses==1 => (GTnoov (TSTshiftRLreg x y z) yes no)
+(GT (CMPconst [0] l:(ANDshiftRAreg x y z)) yes no) && l.Uses==1 => (GTnoov (TSTshiftRAreg x y z) yes no)
+(GE (CMPconst [0] l:(AND x y)) yes no) && l.Uses==1 => (GEnoov (TST x y) yes no)
+(GE (CMPconst [0] l:(ANDconst [c] x)) yes no) && l.Uses==1 => (GEnoov (TSTconst [c] x) yes no)
+(GE (CMPconst [0] l:(ANDshiftLL x y [c])) yes no) && l.Uses==1 => (GEnoov (TSTshiftLL x y [c]) yes no)
+(GE (CMPconst [0] l:(ANDshiftRL x y [c])) yes no) && l.Uses==1 => (GEnoov (TSTshiftRL x y [c]) yes no)
+(GE (CMPconst [0] l:(ANDshiftRA x y [c])) yes no) && l.Uses==1 => (GEnoov (TSTshiftRA x y [c]) yes no)
+(GE (CMPconst [0] l:(ANDshiftLLreg x y z)) yes no) && l.Uses==1 => (GEnoov (TSTshiftLLreg x y z) yes no)
+(GE (CMPconst [0] l:(ANDshiftRLreg x y z)) yes no) && l.Uses==1 => (GEnoov (TSTshiftRLreg x y z) yes no)
+(GE (CMPconst [0] l:(ANDshiftRAreg x y z)) yes no) && l.Uses==1 => (GEnoov (TSTshiftRAreg x y z) yes no)
+(GT (CMPconst [0] l:(XOR x y)) yes no) && l.Uses==1 => (GTnoov (TEQ x y) yes no)
+(GT (CMPconst [0] l:(XORconst [c] x)) yes no) && l.Uses==1 => (GTnoov (TEQconst [c] x) yes no)
+(GT (CMPconst [0] l:(XORshiftLL x y [c])) yes no) && l.Uses==1 => (GTnoov (TEQshiftLL x y [c]) yes no)
+(GT (CMPconst [0] l:(XORshiftRL x y [c])) yes no) && l.Uses==1 => (GTnoov (TEQshiftRL x y [c]) yes no)
+(GT (CMPconst [0] l:(XORshiftRA x y [c])) yes no) && l.Uses==1 => (GTnoov (TEQshiftRA x y [c]) yes no)
+(GT (CMPconst [0] l:(XORshiftLLreg x y z)) yes no) && l.Uses==1 => (GTnoov (TEQshiftLLreg x y z) yes no)
+(GT (CMPconst [0] l:(XORshiftRLreg x y z)) yes no) && l.Uses==1 => (GTnoov (TEQshiftRLreg x y z) yes no)
+(GT (CMPconst [0] l:(XORshiftRAreg x y z)) yes no) && l.Uses==1 => (GTnoov (TEQshiftRAreg x y z) yes no)
+(GE (CMPconst [0] l:(XOR x y)) yes no) && l.Uses==1 => (GEnoov (TEQ x y) yes no)
+(GE (CMPconst [0] l:(XORconst [c] x)) yes no) && l.Uses==1 => (GEnoov (TEQconst [c] x) yes no)
+(GE (CMPconst [0] l:(XORshiftLL x y [c])) yes no) && l.Uses==1 => (GEnoov (TEQshiftLL x y [c]) yes no)
+(GE (CMPconst [0] l:(XORshiftRL x y [c])) yes no) && l.Uses==1 => (GEnoov (TEQshiftRL x y [c]) yes no)
+(GE (CMPconst [0] l:(XORshiftRA x y [c])) yes no) && l.Uses==1 => (GEnoov (TEQshiftRA x y [c]) yes no)
+(GE (CMPconst [0] l:(XORshiftLLreg x y z)) yes no) && l.Uses==1 => (GEnoov (TEQshiftLLreg x y z) yes no)
+(GE (CMPconst [0] l:(XORshiftRLreg x y z)) yes no) && l.Uses==1 => (GEnoov (TEQshiftRLreg x y z) yes no)
+(GE (CMPconst [0] l:(XORshiftRAreg x y z)) yes no) && l.Uses==1 => (GEnoov (TEQshiftRAreg x y z) yes no)
+
+(MOVBUload [off] {sym} (SB) _) && symIsRO(sym) => (MOVWconst [int32(read8(sym, int64(off)))])
+(MOVHUload [off] {sym} (SB) _) && symIsRO(sym) => (MOVWconst [int32(read16(sym, int64(off), config.ctxt.Arch.ByteOrder))])
+(MOVWload [off] {sym} (SB) _) && symIsRO(sym) => (MOVWconst [int32(read32(sym, int64(off), config.ctxt.Arch.ByteOrder))])
diff --git a/src/cmd/compile/internal/ssa/gen/ARM64.rules b/src/cmd/compile/internal/ssa/gen/ARM64.rules
new file mode 100644
index 0000000..80b4005
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/ARM64.rules
@@ -0,0 +1,2789 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+(Add(Ptr|64|32|16|8) ...) => (ADD ...)
+(Add(32F|64F) ...) => (FADD(S|D) ...)
+
+(Sub(Ptr|64|32|16|8) ...) => (SUB ...)
+(Sub(32F|64F) ...) => (FSUB(S|D) ...)
+
+(Mul64 ...) => (MUL ...)
+(Mul(32|16|8) ...) => (MULW ...)
+(Mul(32F|64F) ...) => (FMUL(S|D) ...)
+
+(Hmul64 ...) => (MULH ...)
+(Hmul64u ...) => (UMULH ...)
+(Hmul32 x y) => (SRAconst (MULL <typ.Int64> x y) [32])
+(Hmul32u x y) => (SRAconst (UMULL <typ.UInt64> x y) [32])
+(Mul64uhilo ...) => (LoweredMuluhilo ...)
+
+(Div64 [false] x y) => (DIV x y)
+(Div64u ...) => (UDIV ...)
+(Div32 [false] x y) => (DIVW x y)
+(Div32u ...) => (UDIVW ...)
+(Div16 [false] x y) => (DIVW (SignExt16to32 x) (SignExt16to32 y))
+(Div16u x y) => (UDIVW (ZeroExt16to32 x) (ZeroExt16to32 y))
+(Div8 x y) => (DIVW (SignExt8to32 x) (SignExt8to32 y))
+(Div8u x y) => (UDIVW (ZeroExt8to32 x) (ZeroExt8to32 y))
+(Div32F ...) => (FDIVS ...)
+(Div64F ...) => (FDIVD ...)
+
+(Mod64 x y) => (MOD x y)
+(Mod64u ...) => (UMOD ...)
+(Mod32 x y) => (MODW x y)
+(Mod32u ...) => (UMODW ...)
+(Mod16 x y) => (MODW (SignExt16to32 x) (SignExt16to32 y))
+(Mod16u x y) => (UMODW (ZeroExt16to32 x) (ZeroExt16to32 y))
+(Mod8 x y) => (MODW (SignExt8to32 x) (SignExt8to32 y))
+(Mod8u x y) => (UMODW (ZeroExt8to32 x) (ZeroExt8to32 y))
+
+// (x + y) / 2 with x>=y => (x - y) / 2 + y
+(Avg64u <t> x y) => (ADD (SRLconst <t> (SUB <t> x y) [1]) y)
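+// Illustrative sketch (not an upstream rule): with the precondition x >= y, the rewrite
+// avoids the overflow that (x + y) / 2 would suffer. In Go terms:
+//
+//	func avg64u(x, y uint64) uint64 { return (x-y)>>1 + y } // assumes x >= y
+//
+// For x = y = 1<<63, x+y wraps to 0, so (x+y)>>1 would give 0, while (x-y)>>1 + y
+// gives the correct 1<<63.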
+
+(And(64|32|16|8) ...) => (AND ...)
+(Or(64|32|16|8) ...) => (OR ...)
+(Xor(64|32|16|8) ...) => (XOR ...)
+
+// unary ops
+(Neg(64|32|16|8) ...) => (NEG ...)
+(Neg(32F|64F) ...) => (FNEG(S|D) ...)
+(Com(64|32|16|8) ...) => (MVN ...)
+
+// math package intrinsics
+(Abs ...) => (FABSD ...)
+(Sqrt ...) => (FSQRTD ...)
+(Ceil ...) => (FRINTPD ...)
+(Floor ...) => (FRINTMD ...)
+(Round ...) => (FRINTAD ...)
+(RoundToEven ...) => (FRINTND ...)
+(Trunc ...) => (FRINTZD ...)
+(FMA x y z) => (FMADDD z x y)
+
+// lowering rotates
+(RotateLeft8 <t> x (MOVDconst [c])) => (Or8 (Lsh8x64 <t> x (MOVDconst [c&7])) (Rsh8Ux64 <t> x (MOVDconst [-c&7])))
+(RotateLeft16 <t> x (MOVDconst [c])) => (Or16 (Lsh16x64 <t> x (MOVDconst [c&15])) (Rsh16Ux64 <t> x (MOVDconst [-c&15])))
+(RotateLeft32 x y) => (RORW x (NEG <y.Type> y))
+(RotateLeft64 x y) => (ROR x (NEG <y.Type> y))
+
+(Ctz(64|32|16|8)NonZero ...) => (Ctz(64|32|32|32) ...)
+
+(Ctz64 <t> x) => (CLZ (RBIT <t> x))
+(Ctz32 <t> x) => (CLZW (RBITW <t> x))
+(Ctz16 <t> x) => (CLZW <t> (RBITW <typ.UInt32> (ORconst <typ.UInt32> [0x10000] x)))
+(Ctz8 <t> x) => (CLZW <t> (RBITW <typ.UInt32> (ORconst <typ.UInt32> [0x100] x)))
+
+(PopCount64 <t> x) => (FMOVDfpgp <t> (VUADDLV <typ.Float64> (VCNT <typ.Float64> (FMOVDgpfp <typ.Float64> x))))
+(PopCount32 <t> x) => (FMOVDfpgp <t> (VUADDLV <typ.Float64> (VCNT <typ.Float64> (FMOVDgpfp <typ.Float64> (ZeroExt32to64 x)))))
+(PopCount16 <t> x) => (FMOVDfpgp <t> (VUADDLV <typ.Float64> (VCNT <typ.Float64> (FMOVDgpfp <typ.Float64> (ZeroExt16to64 x)))))
+
+// Load args directly into the register class where they will be used.
+(FMOVDgpfp <t> (Arg [off] {sym})) => @b.Func.Entry (Arg <t> [off] {sym})
+(FMOVDfpgp <t> (Arg [off] {sym})) => @b.Func.Entry (Arg <t> [off] {sym})
+
+// Similarly for stores: if we see a store right after an FPR <=> GPR move, redirect the store to use the other register set.
+(MOVDstore [off] {sym} ptr (FMOVDfpgp val) mem) => (FMOVDstore [off] {sym} ptr val mem)
+(FMOVDstore [off] {sym} ptr (FMOVDgpfp val) mem) => (MOVDstore [off] {sym} ptr val mem)
+(MOVWstore [off] {sym} ptr (FMOVSfpgp val) mem) => (FMOVSstore [off] {sym} ptr val mem)
+(FMOVSstore [off] {sym} ptr (FMOVSgpfp val) mem) => (MOVWstore [off] {sym} ptr val mem)
+
+// float <=> int register moves, with no conversion.
+// These come up when compiling math.{Float64bits, Float64frombits, Float32bits, Float32frombits}.
+(MOVDload [off] {sym} ptr (FMOVDstore [off] {sym} ptr val _)) => (FMOVDfpgp val)
+(FMOVDload [off] {sym} ptr (MOVDstore [off] {sym} ptr val _)) => (FMOVDgpfp val)
+(MOVWUload [off] {sym} ptr (FMOVSstore [off] {sym} ptr val _)) => (FMOVSfpgp val)
+(FMOVSload [off] {sym} ptr (MOVWstore [off] {sym} ptr val _)) => (FMOVSgpfp val)
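+// Illustrative example (assumed, based on the comment above): math.Float64bits is
+// implemented as a reinterpreting load, roughly
+//
+//	bits := *(*uint64)(unsafe.Pointer(&f)) // f float64, already in an FP register
+//
+// which the SSA sees as an FMOVDstore followed by a MOVDload of the same slot; the
+// first rule above collapses that store/load pair into a single FMOVDfpgp register move.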
+
+(BitLen64 x) => (SUB (MOVDconst [64]) (CLZ <typ.Int> x))
+(BitLen32 x) => (SUB (MOVDconst [32]) (CLZW <typ.Int> x))
+
+(Bswap64 ...) => (REV ...)
+(Bswap32 ...) => (REVW ...)
+
+(BitRev64 ...) => (RBIT ...)
+(BitRev32 ...) => (RBITW ...)
+(BitRev16 x) => (SRLconst [48] (RBIT <typ.UInt64> x))
+(BitRev8 x) => (SRLconst [56] (RBIT <typ.UInt64> x))
+
+// UMOD is translated into a UREM instruction, and UREM is in turn expanded into
+// UDIV and MSUB instructions. If there is already an identical UDIV just before or
+// after the UREM (as in quo, rem := z/y, z%y), that second UDIV is redundant.
+// The purpose of these rules is to let the CSE pass remove the extra UDIV.
+(UMOD <typ.UInt64> x y) => (MSUB <typ.UInt64> x y (UDIV <typ.UInt64> x y))
+(UMODW <typ.UInt32> x y) => (MSUBW <typ.UInt32> x y (UDIVW <typ.UInt32> x y))
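+// Illustrative example: for
+//
+//	quo, rem := z/y, z%y // z, y uint64
+//
+// quo lowers to UDIV z y, and by the rule above rem becomes an MSUB fed by an identical
+// UDIV z y; CSE then merges the two UDIVs, so only one division is executed.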
+
+// 64-bit addition with carry.
+(Select0 (Add64carry x y c)) => (Select0 <typ.UInt64> (ADCSflags x y (Select1 <types.TypeFlags> (ADDSconstflags [-1] c))))
+(Select1 (Add64carry x y c)) => (ADCzerocarry <typ.UInt64> (Select1 <types.TypeFlags> (ADCSflags x y (Select1 <types.TypeFlags> (ADDSconstflags [-1] c)))))
+
+// 64-bit subtraction with borrowing.
+(Select0 (Sub64borrow x y bo)) => (Select0 <typ.UInt64> (SBCSflags x y (Select1 <types.TypeFlags> (NEGSflags bo))))
+(Select1 (Sub64borrow x y bo)) => (NEG <typ.UInt64> (NGCzerocarry <typ.UInt64> (Select1 <types.TypeFlags> (SBCSflags x y (Select1 <types.TypeFlags> (NEGSflags bo))))))
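+// These forms come up for the math/bits intrinsics, e.g. (illustrative):
+//
+//	sum, carryOut := bits.Add64(x, y, carryIn)
+//
+// Select0 is the sum and Select1 the carry-out; the (ADDSconstflags [-1] c) trick turns the
+// 0/1 carry-in value into the hardware carry flag before the ADCSflags addition.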
+
+// boolean ops -- booleans are represented with 0=false, 1=true
+(AndB ...) => (AND ...)
+(OrB ...) => (OR ...)
+(EqB x y) => (XOR (MOVDconst [1]) (XOR <typ.Bool> x y))
+(NeqB ...) => (XOR ...)
+(Not x) => (XOR (MOVDconst [1]) x)
+
+// shifts
+// The hardware instruction uses only the low 6 bits of the shift amount;
+// we compare against 64 to preserve Go semantics for large shifts.
+// The rules for rotates with a non-constant shift are derived from the rules below;
+// if those rules change, please also update the rotate rules.
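+// A small Go example of the semantics being preserved (illustrative; shl is hypothetical):
+//
+//	func shl(x, s uint64) uint64 { return x << s }
+//
+// shl(1, 64) must return 0 in Go, whereas a bare SLL would use s&63 == 0 and return 1;
+// hence the CSEL on (CMPconst [64] y) below, which selects the constant 0 for shifts >= 64.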
+(Lsh64x64 <t> x y) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] y))
+(Lsh64x32 <t> x y) => (CSEL [OpARM64LessThanU] (SLL <t> x (ZeroExt32to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Lsh64x16 <t> x y) => (CSEL [OpARM64LessThanU] (SLL <t> x (ZeroExt16to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Lsh64x8 <t> x y) => (CSEL [OpARM64LessThanU] (SLL <t> x (ZeroExt8to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+(Lsh32x64 <t> x y) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] y))
+(Lsh32x32 <t> x y) => (CSEL [OpARM64LessThanU] (SLL <t> x (ZeroExt32to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Lsh32x16 <t> x y) => (CSEL [OpARM64LessThanU] (SLL <t> x (ZeroExt16to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Lsh32x8 <t> x y) => (CSEL [OpARM64LessThanU] (SLL <t> x (ZeroExt8to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+(Lsh16x64 <t> x y) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] y))
+(Lsh16x32 <t> x y) => (CSEL [OpARM64LessThanU] (SLL <t> x (ZeroExt32to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Lsh16x16 <t> x y) => (CSEL [OpARM64LessThanU] (SLL <t> x (ZeroExt16to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Lsh16x8 <t> x y) => (CSEL [OpARM64LessThanU] (SLL <t> x (ZeroExt8to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+(Lsh8x64 <t> x y) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] y))
+(Lsh8x32 <t> x y) => (CSEL [OpARM64LessThanU] (SLL <t> x (ZeroExt32to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Lsh8x16 <t> x y) => (CSEL [OpARM64LessThanU] (SLL <t> x (ZeroExt16to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Lsh8x8 <t> x y) => (CSEL [OpARM64LessThanU] (SLL <t> x (ZeroExt8to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+(Rsh64Ux64 <t> x y) => (CSEL [OpARM64LessThanU] (SRL <t> x y) (Const64 <t> [0]) (CMPconst [64] y))
+(Rsh64Ux32 <t> x y) => (CSEL [OpARM64LessThanU] (SRL <t> x (ZeroExt32to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Rsh64Ux16 <t> x y) => (CSEL [OpARM64LessThanU] (SRL <t> x (ZeroExt16to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Rsh64Ux8 <t> x y) => (CSEL [OpARM64LessThanU] (SRL <t> x (ZeroExt8to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+(Rsh32Ux64 <t> x y) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt32to64 x) y) (Const64 <t> [0]) (CMPconst [64] y))
+(Rsh32Ux32 <t> x y) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt32to64 x) (ZeroExt32to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Rsh32Ux16 <t> x y) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt32to64 x) (ZeroExt16to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Rsh32Ux8 <t> x y) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt32to64 x) (ZeroExt8to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+(Rsh16Ux64 <t> x y) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt16to64 x) y) (Const64 <t> [0]) (CMPconst [64] y))
+(Rsh16Ux32 <t> x y) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt16to64 x) (ZeroExt32to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Rsh16Ux16 <t> x y) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt16to64 x) (ZeroExt16to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Rsh16Ux8 <t> x y) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt16to64 x) (ZeroExt8to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+(Rsh8Ux64 <t> x y) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt8to64 x) y) (Const64 <t> [0]) (CMPconst [64] y))
+(Rsh8Ux32 <t> x y) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt8to64 x) (ZeroExt32to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y)))
+(Rsh8Ux16 <t> x y) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt8to64 x) (ZeroExt16to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y)))
+(Rsh8Ux8 <t> x y) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt8to64 x) (ZeroExt8to64 y)) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y)))
+
+(Rsh64x64 x y) => (SRA x (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] y)))
+(Rsh64x32 x y) => (SRA x (CSEL [OpARM64LessThanU] <y.Type> (ZeroExt32to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt32to64 y))))
+(Rsh64x16 x y) => (SRA x (CSEL [OpARM64LessThanU] <y.Type> (ZeroExt16to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt16to64 y))))
+(Rsh64x8 x y) => (SRA x (CSEL [OpARM64LessThanU] <y.Type> (ZeroExt8to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt8to64 y))))
+
+(Rsh32x64 x y) => (SRA (SignExt32to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] y)))
+(Rsh32x32 x y) => (SRA (SignExt32to64 x) (CSEL [OpARM64LessThanU] <y.Type> (ZeroExt32to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt32to64 y))))
+(Rsh32x16 x y) => (SRA (SignExt32to64 x) (CSEL [OpARM64LessThanU] <y.Type> (ZeroExt16to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt16to64 y))))
+(Rsh32x8 x y) => (SRA (SignExt32to64 x) (CSEL [OpARM64LessThanU] <y.Type> (ZeroExt8to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt8to64 y))))
+
+(Rsh16x64 x y) => (SRA (SignExt16to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] y)))
+(Rsh16x32 x y) => (SRA (SignExt16to64 x) (CSEL [OpARM64LessThanU] <y.Type> (ZeroExt32to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt32to64 y))))
+(Rsh16x16 x y) => (SRA (SignExt16to64 x) (CSEL [OpARM64LessThanU] <y.Type> (ZeroExt16to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt16to64 y))))
+(Rsh16x8 x y) => (SRA (SignExt16to64 x) (CSEL [OpARM64LessThanU] <y.Type> (ZeroExt8to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt8to64 y))))
+
+(Rsh8x64 x y) => (SRA (SignExt8to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] y)))
+(Rsh8x32 x y) => (SRA (SignExt8to64 x) (CSEL [OpARM64LessThanU] <y.Type> (ZeroExt32to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt32to64 y))))
+(Rsh8x16 x y) => (SRA (SignExt8to64 x) (CSEL [OpARM64LessThanU] <y.Type> (ZeroExt16to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt16to64 y))))
+(Rsh8x8 x y) => (SRA (SignExt8to64 x) (CSEL [OpARM64LessThanU] <y.Type> (ZeroExt8to64 y) (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt8to64 y))))
+
+// constants
+(Const(64|32|16|8) [val]) => (MOVDconst [int64(val)])
+(Const(32F|64F) [val]) => (FMOV(S|D)const [float64(val)])
+(ConstNil) => (MOVDconst [0])
+(ConstBool [b]) => (MOVDconst [b2i(b)])
+
+(Slicemask <t> x) => (SRAconst (NEG <t> x) [63])
+
+// truncations
+// Because we ignore high parts of registers, truncates are just copies.
+(Trunc16to8 ...) => (Copy ...)
+(Trunc32to8 ...) => (Copy ...)
+(Trunc32to16 ...) => (Copy ...)
+(Trunc64to8 ...) => (Copy ...)
+(Trunc64to16 ...) => (Copy ...)
+(Trunc64to32 ...) => (Copy ...)
+
+// Zero-/Sign-extensions
+(ZeroExt8to16 ...) => (MOVBUreg ...)
+(ZeroExt8to32 ...) => (MOVBUreg ...)
+(ZeroExt16to32 ...) => (MOVHUreg ...)
+(ZeroExt8to64 ...) => (MOVBUreg ...)
+(ZeroExt16to64 ...) => (MOVHUreg ...)
+(ZeroExt32to64 ...) => (MOVWUreg ...)
+
+(SignExt8to16 ...) => (MOVBreg ...)
+(SignExt8to32 ...) => (MOVBreg ...)
+(SignExt16to32 ...) => (MOVHreg ...)
+(SignExt8to64 ...) => (MOVBreg ...)
+(SignExt16to64 ...) => (MOVHreg ...)
+(SignExt32to64 ...) => (MOVWreg ...)
+
+// float <=> int conversion
+(Cvt32to32F ...) => (SCVTFWS ...)
+(Cvt32to64F ...) => (SCVTFWD ...)
+(Cvt64to32F ...) => (SCVTFS ...)
+(Cvt64to64F ...) => (SCVTFD ...)
+(Cvt32Uto32F ...) => (UCVTFWS ...)
+(Cvt32Uto64F ...) => (UCVTFWD ...)
+(Cvt64Uto32F ...) => (UCVTFS ...)
+(Cvt64Uto64F ...) => (UCVTFD ...)
+(Cvt32Fto32 ...) => (FCVTZSSW ...)
+(Cvt64Fto32 ...) => (FCVTZSDW ...)
+(Cvt32Fto64 ...) => (FCVTZSS ...)
+(Cvt64Fto64 ...) => (FCVTZSD ...)
+(Cvt32Fto32U ...) => (FCVTZUSW ...)
+(Cvt64Fto32U ...) => (FCVTZUDW ...)
+(Cvt32Fto64U ...) => (FCVTZUS ...)
+(Cvt64Fto64U ...) => (FCVTZUD ...)
+(Cvt32Fto64F ...) => (FCVTSD ...)
+(Cvt64Fto32F ...) => (FCVTDS ...)
+
+(CvtBoolToUint8 ...) => (Copy ...)
+
+(Round32F ...) => (LoweredRound32F ...)
+(Round64F ...) => (LoweredRound64F ...)
+
+// comparisons
+(Eq8 x y) => (Equal (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Eq16 x y) => (Equal (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Eq32 x y) => (Equal (CMPW x y))
+(Eq64 x y) => (Equal (CMP x y))
+(EqPtr x y) => (Equal (CMP x y))
+(Eq32F x y) => (Equal (FCMPS x y))
+(Eq64F x y) => (Equal (FCMPD x y))
+
+(Neq8 x y) => (NotEqual (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Neq16 x y) => (NotEqual (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Neq32 x y) => (NotEqual (CMPW x y))
+(Neq64 x y) => (NotEqual (CMP x y))
+(NeqPtr x y) => (NotEqual (CMP x y))
+(Neq32F x y) => (NotEqual (FCMPS x y))
+(Neq64F x y) => (NotEqual (FCMPD x y))
+
+(Less8 x y) => (LessThan (CMPW (SignExt8to32 x) (SignExt8to32 y)))
+(Less16 x y) => (LessThan (CMPW (SignExt16to32 x) (SignExt16to32 y)))
+(Less32 x y) => (LessThan (CMPW x y))
+(Less64 x y) => (LessThan (CMP x y))
+
+// Set condition flags for floating-point comparisons "x < y"
+// and "x <= y". If either or both operands are NaN, all three of
+// (x < y), (x == y) and (x > y) are false, and the ARM manual says
+// that in this case the FCMP instruction sets PSTATE.<N,Z,C,V>
+// to (0, 0, 1, 1).
+(Less32F x y) => (LessThanF (FCMPS x y))
+(Less64F x y) => (LessThanF (FCMPD x y))
+
+// For an unsigned integer x, the following rules are useful when combined with a branch:
+// 0 < x => x != 0
+// x <= 0 => x == 0
+// x < 1 => x == 0
+// 1 <= x => x != 0
+(Less(8U|16U|32U|64U) zero:(MOVDconst [0]) x) => (Neq(8|16|32|64) zero x)
+(Leq(8U|16U|32U|64U) x zero:(MOVDconst [0])) => (Eq(8|16|32|64) x zero)
+(Less(8U|16U|32U|64U) x (MOVDconst [1])) => (Eq(8|16|32|64) x (MOVDconst [0]))
+(Leq(8U|16U|32U|64U) (MOVDconst [1]) x) => (Neq(8|16|32|64) (MOVDconst [0]) x)
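+// Illustrative example:
+//
+//	if x > 0 { ... } // x uint64
+//
+// is rewritten to a branch on x != 0, which the Z/NZ rules further down can then turn into
+// a plain compare-against-zero branch instead of a full comparison against a constant.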
+
+(Less8U x y) => (LessThanU (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Less16U x y) => (LessThanU (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Less32U x y) => (LessThanU (CMPW x y))
+(Less64U x y) => (LessThanU (CMP x y))
+
+(Leq8 x y) => (LessEqual (CMPW (SignExt8to32 x) (SignExt8to32 y)))
+(Leq16 x y) => (LessEqual (CMPW (SignExt16to32 x) (SignExt16to32 y)))
+(Leq32 x y) => (LessEqual (CMPW x y))
+(Leq64 x y) => (LessEqual (CMP x y))
+
+// Refer to the comments for op Less64F above.
+(Leq32F x y) => (LessEqualF (FCMPS x y))
+(Leq64F x y) => (LessEqualF (FCMPD x y))
+
+(Leq8U x y) => (LessEqualU (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Leq16U x y) => (LessEqualU (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Leq32U x y) => (LessEqualU (CMPW x y))
+(Leq64U x y) => (LessEqualU (CMP x y))
+
+// Optimize comparison between a floating-point value and 0.0 with "FCMP $(0.0), Fn"
+(FCMPS x (FMOVSconst [0])) => (FCMPS0 x)
+(FCMPS (FMOVSconst [0]) x) => (InvertFlags (FCMPS0 x))
+(FCMPD x (FMOVDconst [0])) => (FCMPD0 x)
+(FCMPD (FMOVDconst [0]) x) => (InvertFlags (FCMPD0 x))
+
+// CSEL needs a flag-generating argument. Synthesize a CMPW if necessary.
+(CondSelect x y boolval) && flagArg(boolval) != nil => (CSEL [boolval.Op] x y flagArg(boolval))
+(CondSelect x y boolval) && flagArg(boolval) == nil => (CSEL [OpARM64NotEqual] x y (CMPWconst [0] boolval))
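+// Illustrative reading: when boolval is itself a flag-derived value (e.g. a LessThan of some
+// comparison), CSEL can reuse those flags directly with boolval's condition; otherwise boolval
+// has already been materialized as a 0/1 value, so flags are regenerated with CMPWconst [0]
+// and the selection is done on NotEqual.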
+
+(OffPtr [off] ptr:(SP)) && is32Bit(off) => (MOVDaddr [int32(off)] ptr)
+(OffPtr [off] ptr) => (ADDconst [off] ptr)
+
+(Addr {sym} base) => (MOVDaddr {sym} base)
+(LocalAddr {sym} base _) => (MOVDaddr {sym} base)
+
+// loads
+(Load <t> ptr mem) && t.IsBoolean() => (MOVBUload ptr mem)
+(Load <t> ptr mem) && (is8BitInt(t) && isSigned(t)) => (MOVBload ptr mem)
+(Load <t> ptr mem) && (is8BitInt(t) && !isSigned(t)) => (MOVBUload ptr mem)
+(Load <t> ptr mem) && (is16BitInt(t) && isSigned(t)) => (MOVHload ptr mem)
+(Load <t> ptr mem) && (is16BitInt(t) && !isSigned(t)) => (MOVHUload ptr mem)
+(Load <t> ptr mem) && (is32BitInt(t) && isSigned(t)) => (MOVWload ptr mem)
+(Load <t> ptr mem) && (is32BitInt(t) && !isSigned(t)) => (MOVWUload ptr mem)
+(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVDload ptr mem)
+(Load <t> ptr mem) && is32BitFloat(t) => (FMOVSload ptr mem)
+(Load <t> ptr mem) && is64BitFloat(t) => (FMOVDload ptr mem)
+
+// stores
+(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && !is32BitFloat(val.Type) => (MOVWstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 && !is64BitFloat(val.Type) => (MOVDstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (FMOVSstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (FMOVDstore ptr val mem)
+
+// zeroing
+(Zero [0] _ mem) => mem
+(Zero [1] ptr mem) => (MOVBstore ptr (MOVDconst [0]) mem)
+(Zero [2] ptr mem) => (MOVHstore ptr (MOVDconst [0]) mem)
+(Zero [4] ptr mem) => (MOVWstore ptr (MOVDconst [0]) mem)
+(Zero [8] ptr mem) => (MOVDstore ptr (MOVDconst [0]) mem)
+
+(Zero [3] ptr mem) =>
+ (MOVBstore [2] ptr (MOVDconst [0])
+ (MOVHstore ptr (MOVDconst [0]) mem))
+(Zero [5] ptr mem) =>
+ (MOVBstore [4] ptr (MOVDconst [0])
+ (MOVWstore ptr (MOVDconst [0]) mem))
+(Zero [6] ptr mem) =>
+ (MOVHstore [4] ptr (MOVDconst [0])
+ (MOVWstore ptr (MOVDconst [0]) mem))
+(Zero [7] ptr mem) =>
+ (MOVBstore [6] ptr (MOVDconst [0])
+ (MOVHstore [4] ptr (MOVDconst [0])
+ (MOVWstore ptr (MOVDconst [0]) mem)))
+(Zero [9] ptr mem) =>
+ (MOVBstore [8] ptr (MOVDconst [0])
+ (MOVDstore ptr (MOVDconst [0]) mem))
+(Zero [10] ptr mem) =>
+ (MOVHstore [8] ptr (MOVDconst [0])
+ (MOVDstore ptr (MOVDconst [0]) mem))
+(Zero [11] ptr mem) =>
+ (MOVBstore [10] ptr (MOVDconst [0])
+ (MOVHstore [8] ptr (MOVDconst [0])
+ (MOVDstore ptr (MOVDconst [0]) mem)))
+(Zero [12] ptr mem) =>
+ (MOVWstore [8] ptr (MOVDconst [0])
+ (MOVDstore ptr (MOVDconst [0]) mem))
+(Zero [13] ptr mem) =>
+ (MOVBstore [12] ptr (MOVDconst [0])
+ (MOVWstore [8] ptr (MOVDconst [0])
+ (MOVDstore ptr (MOVDconst [0]) mem)))
+(Zero [14] ptr mem) =>
+ (MOVHstore [12] ptr (MOVDconst [0])
+ (MOVWstore [8] ptr (MOVDconst [0])
+ (MOVDstore ptr (MOVDconst [0]) mem)))
+(Zero [15] ptr mem) =>
+ (MOVBstore [14] ptr (MOVDconst [0])
+ (MOVHstore [12] ptr (MOVDconst [0])
+ (MOVWstore [8] ptr (MOVDconst [0])
+ (MOVDstore ptr (MOVDconst [0]) mem))))
+(Zero [16] ptr mem) =>
+ (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem)
+
+(Zero [32] ptr mem) =>
+ (STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
+ (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))
+
+(Zero [48] ptr mem) =>
+ (STP [32] ptr (MOVDconst [0]) (MOVDconst [0])
+ (STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
+ (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem)))
+
+(Zero [64] ptr mem) =>
+ (STP [48] ptr (MOVDconst [0]) (MOVDconst [0])
+ (STP [32] ptr (MOVDconst [0]) (MOVDconst [0])
+ (STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
+ (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))))
+
+// strip off fractional word zeroing
+(Zero [s] ptr mem) && s%16 != 0 && s%16 <= 8 && s > 16 =>
+ (Zero [8]
+ (OffPtr <ptr.Type> ptr [s-8])
+ (Zero [s-s%16] ptr mem))
+(Zero [s] ptr mem) && s%16 != 0 && s%16 > 8 && s > 16 =>
+ (Zero [16]
+ (OffPtr <ptr.Type> ptr [s-16])
+ (Zero [s-s%16] ptr mem))
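+// Worked example (illustrative): Zero [24] has s%16 == 8, so it becomes
+// Zero [8] at offset 16 over Zero [16] at offset 0, which then lower to a
+// single 8-byte store of zero and a 16-byte STP of zeros.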
+
+// medium zeroing uses a duff device
+// 4, 16, and 64 are magic constants, see runtime/mkduff.go
+(Zero [s] ptr mem)
+ && s%16 == 0 && s > 64 && s <= 16*64
+ && !config.noDuffDevice =>
+ (DUFFZERO [4 * (64 - s/16)] ptr mem)
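+// Worked example (illustrative, assuming 64 blocks of 4 bytes each, as the magic constants
+// suggest): for s == 256, the offset is 4*(64 - 256/16) = 192, i.e. the call lands 16 blocks
+// before the end of duffzero, and those 16 blocks zero 16*16 = 256 bytes.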
+
+// large zeroing uses a loop
+(Zero [s] ptr mem)
+ && s%16 == 0 && (s > 16*64 || config.noDuffDevice) =>
+ (LoweredZero
+ ptr
+ (ADDconst <ptr.Type> [s-16] ptr)
+ mem)
+
+// moves
+(Move [0] _ _ mem) => mem
+(Move [1] dst src mem) => (MOVBstore dst (MOVBUload src mem) mem)
+(Move [2] dst src mem) => (MOVHstore dst (MOVHUload src mem) mem)
+(Move [4] dst src mem) => (MOVWstore dst (MOVWUload src mem) mem)
+(Move [8] dst src mem) => (MOVDstore dst (MOVDload src mem) mem)
+
+(Move [3] dst src mem) =>
+ (MOVBstore [2] dst (MOVBUload [2] src mem)
+ (MOVHstore dst (MOVHUload src mem) mem))
+(Move [5] dst src mem) =>
+ (MOVBstore [4] dst (MOVBUload [4] src mem)
+ (MOVWstore dst (MOVWUload src mem) mem))
+(Move [6] dst src mem) =>
+ (MOVHstore [4] dst (MOVHUload [4] src mem)
+ (MOVWstore dst (MOVWUload src mem) mem))
+(Move [7] dst src mem) =>
+ (MOVBstore [6] dst (MOVBUload [6] src mem)
+ (MOVHstore [4] dst (MOVHUload [4] src mem)
+ (MOVWstore dst (MOVWUload src mem) mem)))
+(Move [12] dst src mem) =>
+ (MOVWstore [8] dst (MOVWUload [8] src mem)
+ (MOVDstore dst (MOVDload src mem) mem))
+(Move [16] dst src mem) =>
+ (MOVDstore [8] dst (MOVDload [8] src mem)
+ (MOVDstore dst (MOVDload src mem) mem))
+(Move [24] dst src mem) =>
+ (MOVDstore [16] dst (MOVDload [16] src mem)
+ (MOVDstore [8] dst (MOVDload [8] src mem)
+ (MOVDstore dst (MOVDload src mem) mem)))
+
+// strip off fractional word move
+(Move [s] dst src mem) && s%8 != 0 && s > 8 =>
+ (Move [s%8]
+ (OffPtr <dst.Type> dst [s-s%8])
+ (OffPtr <src.Type> src [s-s%8])
+ (Move [s-s%8] dst src mem))
+
+// medium move uses a duff device
+(Move [s] dst src mem)
+ && s > 32 && s <= 16*64 && s%16 == 8
+ && !config.noDuffDevice && logLargeCopy(v, s) =>
+ (MOVDstore [int32(s-8)] dst (MOVDload [int32(s-8)] src mem)
+ (DUFFCOPY <types.TypeMem> [8*(64-(s-8)/16)] dst src mem))
+(Move [s] dst src mem)
+ && s > 32 && s <= 16*64 && s%16 == 0
+ && !config.noDuffDevice && logLargeCopy(v, s) =>
+ (DUFFCOPY [8 * (64 - s/16)] dst src mem)
+// 8 is the number of bytes to encode:
+//
+// LDP.P 16(R16), (R26, R27)
+// STP.P (R26, R27), 16(R17)
+//
+// 64 is the number of these blocks. See runtime/duff_arm64.s:duffcopy
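+// Worked example (illustrative): for s == 128 (s%16 == 0), the offset is 8*(64 - 128/16) = 448,
+// skipping 56 of the 64 LDP.P/STP.P blocks so that the remaining 8 blocks copy 8*16 = 128 bytes.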
+
+// large move uses a loop
+(Move [s] dst src mem)
+ && s > 24 && s%8 == 0 && logLargeCopy(v, s) =>
+ (LoweredMove
+ dst
+ src
+ (ADDconst <src.Type> src [s-8])
+ mem)
+
+// calls
+(StaticCall ...) => (CALLstatic ...)
+(ClosureCall ...) => (CALLclosure ...)
+(InterCall ...) => (CALLinter ...)
+
+// checks
+(NilCheck ...) => (LoweredNilCheck ...)
+(IsNonNil ptr) => (NotEqual (CMPconst [0] ptr))
+(IsInBounds idx len) => (LessThanU (CMP idx len))
+(IsSliceInBounds idx len) => (LessEqualU (CMP idx len))
+
+// pseudo-ops
+(GetClosurePtr ...) => (LoweredGetClosurePtr ...)
+(GetCallerSP ...) => (LoweredGetCallerSP ...)
+(GetCallerPC ...) => (LoweredGetCallerPC ...)
+
+// Absorb pseudo-ops into blocks.
+(If (Equal cc) yes no) => (EQ cc yes no)
+(If (NotEqual cc) yes no) => (NE cc yes no)
+(If (LessThan cc) yes no) => (LT cc yes no)
+(If (LessThanU cc) yes no) => (ULT cc yes no)
+(If (LessEqual cc) yes no) => (LE cc yes no)
+(If (LessEqualU cc) yes no) => (ULE cc yes no)
+(If (GreaterThan cc) yes no) => (GT cc yes no)
+(If (GreaterThanU cc) yes no) => (UGT cc yes no)
+(If (GreaterEqual cc) yes no) => (GE cc yes no)
+(If (GreaterEqualU cc) yes no) => (UGE cc yes no)
+(If (LessThanF cc) yes no) => (FLT cc yes no)
+(If (LessEqualF cc) yes no) => (FLE cc yes no)
+(If (GreaterThanF cc) yes no) => (FGT cc yes no)
+(If (GreaterEqualF cc) yes no) => (FGE cc yes no)
+
+(If cond yes no) => (NZ cond yes no)
+
+// atomic intrinsics
+// Note: these ops do not accept an offset.
+(AtomicLoad8 ...) => (LDARB ...)
+(AtomicLoad32 ...) => (LDARW ...)
+(AtomicLoad64 ...) => (LDAR ...)
+(AtomicLoadPtr ...) => (LDAR ...)
+
+(AtomicStore8 ...) => (STLRB ...)
+(AtomicStore32 ...) => (STLRW ...)
+(AtomicStore64 ...) => (STLR ...)
+(AtomicStorePtrNoWB ...) => (STLR ...)
+
+(AtomicExchange(32|64) ...) => (LoweredAtomicExchange(32|64) ...)
+(AtomicAdd(32|64) ...) => (LoweredAtomicAdd(32|64) ...)
+(AtomicCompareAndSwap(32|64) ...) => (LoweredAtomicCas(32|64) ...)
+
+(AtomicAdd(32|64)Variant ...) => (LoweredAtomicAdd(32|64)Variant ...)
+(AtomicExchange(32|64)Variant ...) => (LoweredAtomicExchange(32|64)Variant ...)
+(AtomicCompareAndSwap(32|64)Variant ...) => (LoweredAtomicCas(32|64)Variant ...)
+
+// Currently the updated value is not used, but we need a register to temporarily hold it.
+(AtomicAnd8 ptr val mem) => (Select1 (LoweredAtomicAnd8 ptr val mem))
+(AtomicAnd32 ptr val mem) => (Select1 (LoweredAtomicAnd32 ptr val mem))
+(AtomicOr8 ptr val mem) => (Select1 (LoweredAtomicOr8 ptr val mem))
+(AtomicOr32 ptr val mem) => (Select1 (LoweredAtomicOr32 ptr val mem))
+
+(AtomicAnd8Variant ptr val mem) => (Select1 (LoweredAtomicAnd8Variant ptr val mem))
+(AtomicAnd32Variant ptr val mem) => (Select1 (LoweredAtomicAnd32Variant ptr val mem))
+(AtomicOr8Variant ptr val mem) => (Select1 (LoweredAtomicOr8Variant ptr val mem))
+(AtomicOr32Variant ptr val mem) => (Select1 (LoweredAtomicOr32Variant ptr val mem))
+
+// Write barrier.
+(WB ...) => (LoweredWB ...)
+
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem)
+
+// Optimizations
+
+// Absorb boolean tests into block
+(NZ (Equal cc) yes no) => (EQ cc yes no)
+(NZ (NotEqual cc) yes no) => (NE cc yes no)
+(NZ (LessThan cc) yes no) => (LT cc yes no)
+(NZ (LessThanU cc) yes no) => (ULT cc yes no)
+(NZ (LessEqual cc) yes no) => (LE cc yes no)
+(NZ (LessEqualU cc) yes no) => (ULE cc yes no)
+(NZ (GreaterThan cc) yes no) => (GT cc yes no)
+(NZ (GreaterThanU cc) yes no) => (UGT cc yes no)
+(NZ (GreaterEqual cc) yes no) => (GE cc yes no)
+(NZ (GreaterEqualU cc) yes no) => (UGE cc yes no)
+(NZ (LessThanF cc) yes no) => (FLT cc yes no)
+(NZ (LessEqualF cc) yes no) => (FLE cc yes no)
+(NZ (GreaterThanF cc) yes no) => (FGT cc yes no)
+(NZ (GreaterEqualF cc) yes no) => (FGE cc yes no)
+
+(EQ (CMPWconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (EQ (TSTWconst [int32(c)] y) yes no)
+(NE (CMPWconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (NE (TSTWconst [int32(c)] y) yes no)
+(LT (CMPWconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (LT (TSTWconst [int32(c)] y) yes no)
+(LE (CMPWconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (LE (TSTWconst [int32(c)] y) yes no)
+(GT (CMPWconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (GT (TSTWconst [int32(c)] y) yes no)
+(GE (CMPWconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (GE (TSTWconst [int32(c)] y) yes no)
+
+(EQ (CMPconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (EQ (TST x y) yes no)
+(NE (CMPconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (NE (TST x y) yes no)
+(LT (CMPconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (LT (TST x y) yes no)
+(LE (CMPconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (LE (TST x y) yes no)
+(GT (CMPconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (GT (TST x y) yes no)
+(GE (CMPconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (GE (TST x y) yes no)
+
+(EQ (CMPWconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (EQ (TSTW x y) yes no)
+(NE (CMPWconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (NE (TSTW x y) yes no)
+(LT (CMPWconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (LT (TSTW x y) yes no)
+(LE (CMPWconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (LE (TSTW x y) yes no)
+(GT (CMPWconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (GT (TSTW x y) yes no)
+(GE (CMPWconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (GE (TSTW x y) yes no)
+
+(EQ (CMPconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (EQ (TSTconst [c] y) yes no)
+(NE (CMPconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (NE (TSTconst [c] y) yes no)
+(LT (CMPconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (LT (TSTconst [c] y) yes no)
+(LE (CMPconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (LE (TSTconst [c] y) yes no)
+(GT (CMPconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (GT (TSTconst [c] y) yes no)
+(GE (CMPconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (GE (TSTconst [c] y) yes no)
+
+(EQ (CMPconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (EQ (CMNconst [c] y) yes no)
+(NE (CMPconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (NE (CMNconst [c] y) yes no)
+(LT (CMPconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (LTnoov (CMNconst [c] y) yes no)
+(LE (CMPconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (LEnoov (CMNconst [c] y) yes no)
+(GT (CMPconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (GTnoov (CMNconst [c] y) yes no)
+(GE (CMPconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (GEnoov (CMNconst [c] y) yes no)
+
+(EQ (CMPWconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (EQ (CMNWconst [int32(c)] y) yes no)
+(NE (CMPWconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (NE (CMNWconst [int32(c)] y) yes no)
+(LT (CMPWconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (LTnoov (CMNWconst [int32(c)] y) yes no)
+(LE (CMPWconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (LEnoov (CMNWconst [int32(c)] y) yes no)
+(GT (CMPWconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (GTnoov (CMNWconst [int32(c)] y) yes no)
+(GE (CMPWconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (GEnoov (CMNWconst [int32(c)] y) yes no)
+
+(EQ (CMPconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (EQ (CMN x y) yes no)
+(NE (CMPconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (NE (CMN x y) yes no)
+(LT (CMPconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (LTnoov (CMN x y) yes no)
+(LE (CMPconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (LEnoov (CMN x y) yes no)
+(GT (CMPconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (GTnoov (CMN x y) yes no)
+(GE (CMPconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (GEnoov (CMN x y) yes no)
+
+(EQ (CMPWconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (EQ (CMNW x y) yes no)
+(NE (CMPWconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (NE (CMNW x y) yes no)
+(LT (CMPWconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (LTnoov (CMNW x y) yes no)
+(LE (CMPWconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (LEnoov (CMNW x y) yes no)
+(GT (CMPWconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (GTnoov (CMNW x y) yes no)
+(GE (CMPWconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (GEnoov (CMNW x y) yes no)
+
+(EQ (CMP x z:(NEG y)) yes no) && z.Uses == 1 => (EQ (CMN x y) yes no)
+(NE (CMP x z:(NEG y)) yes no) && z.Uses == 1 => (NE (CMN x y) yes no)
+(LT (CMP x z:(NEG y)) yes no) && z.Uses == 1 => (LT (CMN x y) yes no)
+(LE (CMP x z:(NEG y)) yes no) && z.Uses == 1 => (LE (CMN x y) yes no)
+(GT (CMP x z:(NEG y)) yes no) && z.Uses == 1 => (GT (CMN x y) yes no)
+(GE (CMP x z:(NEG y)) yes no) && z.Uses == 1 => (GE (CMN x y) yes no)
+
+(EQ (CMPW x z:(NEG y)) yes no) && z.Uses == 1 => (EQ (CMNW x y) yes no)
+(NE (CMPW x z:(NEG y)) yes no) && z.Uses == 1 => (NE (CMNW x y) yes no)
+(LT (CMPW x z:(NEG y)) yes no) && z.Uses == 1 => (LT (CMNW x y) yes no)
+(LE (CMPW x z:(NEG y)) yes no) && z.Uses == 1 => (LE (CMNW x y) yes no)
+(GT (CMPW x z:(NEG y)) yes no) && z.Uses == 1 => (GT (CMNW x y) yes no)
+(GE (CMPW x z:(NEG y)) yes no) && z.Uses == 1 => (GE (CMNW x y) yes no)
+
+(EQ (CMPconst [0] x) yes no) => (Z x yes no)
+(NE (CMPconst [0] x) yes no) => (NZ x yes no)
+(EQ (CMPWconst [0] x) yes no) => (ZW x yes no)
+(NE (CMPWconst [0] x) yes no) => (NZW x yes no)
+
+(EQ (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 => (EQ (CMN a (MUL <x.Type> x y)) yes no)
+(NE (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 => (NE (CMN a (MUL <x.Type> x y)) yes no)
+(LT (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 => (LTnoov (CMN a (MUL <x.Type> x y)) yes no)
+(LE (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 => (LEnoov (CMN a (MUL <x.Type> x y)) yes no)
+(GT (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 => (GTnoov (CMN a (MUL <x.Type> x y)) yes no)
+(GE (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 => (GEnoov (CMN a (MUL <x.Type> x y)) yes no)
+
+(EQ (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 => (EQ (CMP a (MUL <x.Type> x y)) yes no)
+(NE (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 => (NE (CMP a (MUL <x.Type> x y)) yes no)
+(LE (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 => (LEnoov (CMP a (MUL <x.Type> x y)) yes no)
+(LT (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 => (LTnoov (CMP a (MUL <x.Type> x y)) yes no)
+(GE (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 => (GEnoov (CMP a (MUL <x.Type> x y)) yes no)
+(GT (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 => (GTnoov (CMP a (MUL <x.Type> x y)) yes no)
+
+(EQ (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 => (EQ (CMNW a (MULW <x.Type> x y)) yes no)
+(NE (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 => (NE (CMNW a (MULW <x.Type> x y)) yes no)
+(LE (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 => (LEnoov (CMNW a (MULW <x.Type> x y)) yes no)
+(LT (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 => (LTnoov (CMNW a (MULW <x.Type> x y)) yes no)
+(GE (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 => (GEnoov (CMNW a (MULW <x.Type> x y)) yes no)
+(GT (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 => (GTnoov (CMNW a (MULW <x.Type> x y)) yes no)
+
+(EQ (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 => (EQ (CMPW a (MULW <x.Type> x y)) yes no)
+(NE (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 => (NE (CMPW a (MULW <x.Type> x y)) yes no)
+(LE (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 => (LEnoov (CMPW a (MULW <x.Type> x y)) yes no)
+(LT (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 => (LTnoov (CMPW a (MULW <x.Type> x y)) yes no)
+(GE (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 => (GEnoov (CMPW a (MULW <x.Type> x y)) yes no)
+(GT (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 => (GTnoov (CMPW a (MULW <x.Type> x y)) yes no)
+
+// Absorb bit-tests into block
+(Z (ANDconst [c] x) yes no) && oneBit(c) => (TBZ [int64(ntz64(c))] x yes no)
+(NZ (ANDconst [c] x) yes no) && oneBit(c) => (TBNZ [int64(ntz64(c))] x yes no)
+(ZW (ANDconst [c] x) yes no) && oneBit(int64(uint32(c))) => (TBZ [int64(ntz64(int64(uint32(c))))] x yes no)
+(NZW (ANDconst [c] x) yes no) && oneBit(int64(uint32(c))) => (TBNZ [int64(ntz64(int64(uint32(c))))] x yes no)
+(EQ (TSTconst [c] x) yes no) && oneBit(c) => (TBZ [int64(ntz64(c))] x yes no)
+(NE (TSTconst [c] x) yes no) && oneBit(c) => (TBNZ [int64(ntz64(c))] x yes no)
+(EQ (TSTWconst [c] x) yes no) && oneBit(int64(uint32(c))) => (TBZ [int64(ntz64(int64(uint32(c))))] x yes no)
+(NE (TSTWconst [c] x) yes no) && oneBit(int64(uint32(c))) => (TBNZ [int64(ntz64(int64(uint32(c))))] x yes no)
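+// Illustrative example: a test such as
+//
+//	if x&8 != 0 { ... } // x uint64
+//
+// reaches here as (NE (TSTconst [8] x)); since 8 has a single set bit (ntz64(8) == 3),
+// the branch becomes TBNZ [3] x, testing just bit 3.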
+
+// Test sign-bit for signed comparisons against zero
+(GE (CMPWconst [0] x) yes no) => (TBZ [31] x yes no)
+(GE (CMPconst [0] x) yes no) => (TBZ [63] x yes no)
+(LT (CMPWconst [0] x) yes no) => (TBNZ [31] x yes no)
+(LT (CMPconst [0] x) yes no) => (TBNZ [63] x yes no)
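+// Illustrative example: for a signed 64-bit x, a branch on x < 0 compares x with zero, and
+// the LT rule above replaces the CMP-and-branch with a single TBNZ on the sign bit (bit 63;
+// bit 31 for the 32-bit CMPW form).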
+
+// fold offset into address
+(ADDconst [off1] (MOVDaddr [off2] {sym} ptr)) && is32Bit(off1+int64(off2)) =>
+ (MOVDaddr [int32(off1)+off2] {sym} ptr)
+
+// fold address into load/store
+(MOVBload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVBload [off1+int32(off2)] {sym} ptr mem)
+(MOVBUload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVBUload [off1+int32(off2)] {sym} ptr mem)
+(MOVHload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVHload [off1+int32(off2)] {sym} ptr mem)
+(MOVHUload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVHUload [off1+int32(off2)] {sym} ptr mem)
+(MOVWload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVWload [off1+int32(off2)] {sym} ptr mem)
+(MOVWUload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVWUload [off1+int32(off2)] {sym} ptr mem)
+(MOVDload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVDload [off1+int32(off2)] {sym} ptr mem)
+(FMOVSload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (FMOVSload [off1+int32(off2)] {sym} ptr mem)
+(FMOVDload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (FMOVDload [off1+int32(off2)] {sym} ptr mem)
+
+// register indexed load
+(MOVDload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVDloadidx ptr idx mem)
+(MOVWUload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVWUloadidx ptr idx mem)
+(MOVWload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVWloadidx ptr idx mem)
+(MOVHUload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVHUloadidx ptr idx mem)
+(MOVHload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVHloadidx ptr idx mem)
+(MOVBUload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVBUloadidx ptr idx mem)
+(MOVBload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVBloadidx ptr idx mem)
+(FMOVSload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (FMOVSloadidx ptr idx mem)
+(FMOVDload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (FMOVDloadidx ptr idx mem)
+(MOVDloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVDload [int32(c)] ptr mem)
+(MOVDloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (MOVDload [int32(c)] ptr mem)
+(MOVWUloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVWUload [int32(c)] ptr mem)
+(MOVWUloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (MOVWUload [int32(c)] ptr mem)
+(MOVWloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVWload [int32(c)] ptr mem)
+(MOVWloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (MOVWload [int32(c)] ptr mem)
+(MOVHUloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVHUload [int32(c)] ptr mem)
+(MOVHUloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (MOVHUload [int32(c)] ptr mem)
+(MOVHloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVHload [int32(c)] ptr mem)
+(MOVHloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (MOVHload [int32(c)] ptr mem)
+(MOVBUloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVBUload [int32(c)] ptr mem)
+(MOVBUloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (MOVBUload [int32(c)] ptr mem)
+(MOVBloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVBload [int32(c)] ptr mem)
+(MOVBloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (MOVBload [int32(c)] ptr mem)
+(FMOVSloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (FMOVSload [int32(c)] ptr mem)
+(FMOVSloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (FMOVSload [int32(c)] ptr mem)
+(FMOVDloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (FMOVDload [int32(c)] ptr mem)
+(FMOVDloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (FMOVDload [int32(c)] ptr mem)
+
+// shifted register indexed load
+(MOVDload [off] {sym} (ADDshiftLL [3] ptr idx) mem) && off == 0 && sym == nil => (MOVDloadidx8 ptr idx mem)
+(MOVWUload [off] {sym} (ADDshiftLL [2] ptr idx) mem) && off == 0 && sym == nil => (MOVWUloadidx4 ptr idx mem)
+(MOVWload [off] {sym} (ADDshiftLL [2] ptr idx) mem) && off == 0 && sym == nil => (MOVWloadidx4 ptr idx mem)
+(MOVHUload [off] {sym} (ADDshiftLL [1] ptr idx) mem) && off == 0 && sym == nil => (MOVHUloadidx2 ptr idx mem)
+(MOVHload [off] {sym} (ADDshiftLL [1] ptr idx) mem) && off == 0 && sym == nil => (MOVHloadidx2 ptr idx mem)
+(MOVDloadidx ptr (SLLconst [3] idx) mem) => (MOVDloadidx8 ptr idx mem)
+(MOVWloadidx ptr (SLLconst [2] idx) mem) => (MOVWloadidx4 ptr idx mem)
+(MOVWUloadidx ptr (SLLconst [2] idx) mem) => (MOVWUloadidx4 ptr idx mem)
+(MOVHloadidx ptr (SLLconst [1] idx) mem) => (MOVHloadidx2 ptr idx mem)
+(MOVHUloadidx ptr (SLLconst [1] idx) mem) => (MOVHUloadidx2 ptr idx mem)
+(MOVHloadidx ptr (ADD idx idx) mem) => (MOVHloadidx2 ptr idx mem)
+(MOVHUloadidx ptr (ADD idx idx) mem) => (MOVHUloadidx2 ptr idx mem)
+(MOVDloadidx (SLLconst [3] idx) ptr mem) => (MOVDloadidx8 ptr idx mem)
+(MOVWloadidx (SLLconst [2] idx) ptr mem) => (MOVWloadidx4 ptr idx mem)
+(MOVWUloadidx (SLLconst [2] idx) ptr mem) => (MOVWUloadidx4 ptr idx mem)
+(MOVHloadidx (ADD idx idx) ptr mem) => (MOVHloadidx2 ptr idx mem)
+(MOVHUloadidx (ADD idx idx) ptr mem) => (MOVHUloadidx2 ptr idx mem)
+(MOVDloadidx8 ptr (MOVDconst [c]) mem) && is32Bit(c<<3) => (MOVDload [int32(c)<<3] ptr mem)
+(MOVWUloadidx4 ptr (MOVDconst [c]) mem) && is32Bit(c<<2) => (MOVWUload [int32(c)<<2] ptr mem)
+(MOVWloadidx4 ptr (MOVDconst [c]) mem) && is32Bit(c<<2) => (MOVWload [int32(c)<<2] ptr mem)
+(MOVHUloadidx2 ptr (MOVDconst [c]) mem) && is32Bit(c<<1) => (MOVHUload [int32(c)<<1] ptr mem)
+(MOVHloadidx2 ptr (MOVDconst [c]) mem) && is32Bit(c<<1) => (MOVHload [int32(c)<<1] ptr mem)
+
+(MOVBstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVBstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVHstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVHstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVWstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVWstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVDstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVDstore [off1+int32(off2)] {sym} ptr val mem)
+(STP [off1] {sym} (ADDconst [off2] ptr) val1 val2 mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (STP [off1+int32(off2)] {sym} ptr val1 val2 mem)
+(FMOVSstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (FMOVSstore [off1+int32(off2)] {sym} ptr val mem)
+(FMOVDstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (FMOVDstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVBstorezero [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVBstorezero [off1+int32(off2)] {sym} ptr mem)
+(MOVHstorezero [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVHstorezero [off1+int32(off2)] {sym} ptr mem)
+(MOVWstorezero [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVWstorezero [off1+int32(off2)] {sym} ptr mem)
+(MOVDstorezero [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVDstorezero [off1+int32(off2)] {sym} ptr mem)
+(MOVQstorezero [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2)
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVQstorezero [off1+int32(off2)] {sym} ptr mem)
+
+// register indexed store
+(MOVDstore [off] {sym} (ADD ptr idx) val mem) && off == 0 && sym == nil => (MOVDstoreidx ptr idx val mem)
+(MOVWstore [off] {sym} (ADD ptr idx) val mem) && off == 0 && sym == nil => (MOVWstoreidx ptr idx val mem)
+(MOVHstore [off] {sym} (ADD ptr idx) val mem) && off == 0 && sym == nil => (MOVHstoreidx ptr idx val mem)
+(MOVBstore [off] {sym} (ADD ptr idx) val mem) && off == 0 && sym == nil => (MOVBstoreidx ptr idx val mem)
+(FMOVDstore [off] {sym} (ADD ptr idx) val mem) && off == 0 && sym == nil => (FMOVDstoreidx ptr idx val mem)
+(FMOVSstore [off] {sym} (ADD ptr idx) val mem) && off == 0 && sym == nil => (FMOVSstoreidx ptr idx val mem)
+(MOVDstoreidx ptr (MOVDconst [c]) val mem) && is32Bit(c) => (MOVDstore [int32(c)] ptr val mem)
+(MOVDstoreidx (MOVDconst [c]) idx val mem) && is32Bit(c) => (MOVDstore [int32(c)] idx val mem)
+(MOVWstoreidx ptr (MOVDconst [c]) val mem) && is32Bit(c) => (MOVWstore [int32(c)] ptr val mem)
+(MOVWstoreidx (MOVDconst [c]) idx val mem) && is32Bit(c) => (MOVWstore [int32(c)] idx val mem)
+(MOVHstoreidx ptr (MOVDconst [c]) val mem) && is32Bit(c) => (MOVHstore [int32(c)] ptr val mem)
+(MOVHstoreidx (MOVDconst [c]) idx val mem) && is32Bit(c) => (MOVHstore [int32(c)] idx val mem)
+(MOVBstoreidx ptr (MOVDconst [c]) val mem) && is32Bit(c) => (MOVBstore [int32(c)] ptr val mem)
+(MOVBstoreidx (MOVDconst [c]) idx val mem) && is32Bit(c) => (MOVBstore [int32(c)] idx val mem)
+(FMOVDstoreidx ptr (MOVDconst [c]) val mem) && is32Bit(c) => (FMOVDstore [int32(c)] ptr val mem)
+(FMOVDstoreidx (MOVDconst [c]) idx val mem) && is32Bit(c) => (FMOVDstore [int32(c)] idx val mem)
+(FMOVSstoreidx ptr (MOVDconst [c]) val mem) && is32Bit(c) => (FMOVSstore [int32(c)] ptr val mem)
+(FMOVSstoreidx (MOVDconst [c]) idx val mem) && is32Bit(c) => (FMOVSstore [int32(c)] idx val mem)
+
+// shifted register indexed store
+(MOVDstore [off] {sym} (ADDshiftLL [3] ptr idx) val mem) && off == 0 && sym == nil => (MOVDstoreidx8 ptr idx val mem)
+(MOVWstore [off] {sym} (ADDshiftLL [2] ptr idx) val mem) && off == 0 && sym == nil => (MOVWstoreidx4 ptr idx val mem)
+(MOVHstore [off] {sym} (ADDshiftLL [1] ptr idx) val mem) && off == 0 && sym == nil => (MOVHstoreidx2 ptr idx val mem)
+(MOVDstoreidx ptr (SLLconst [3] idx) val mem) => (MOVDstoreidx8 ptr idx val mem)
+(MOVWstoreidx ptr (SLLconst [2] idx) val mem) => (MOVWstoreidx4 ptr idx val mem)
+(MOVHstoreidx ptr (SLLconst [1] idx) val mem) => (MOVHstoreidx2 ptr idx val mem)
+(MOVHstoreidx ptr (ADD idx idx) val mem) => (MOVHstoreidx2 ptr idx val mem)
+(MOVDstoreidx (SLLconst [3] idx) ptr val mem) => (MOVDstoreidx8 ptr idx val mem)
+(MOVWstoreidx (SLLconst [2] idx) ptr val mem) => (MOVWstoreidx4 ptr idx val mem)
+(MOVHstoreidx (SLLconst [1] idx) ptr val mem) => (MOVHstoreidx2 ptr idx val mem)
+(MOVHstoreidx (ADD idx idx) ptr val mem) => (MOVHstoreidx2 ptr idx val mem)
+(MOVDstoreidx8 ptr (MOVDconst [c]) val mem) && is32Bit(c<<3) => (MOVDstore [int32(c)<<3] ptr val mem)
+(MOVWstoreidx4 ptr (MOVDconst [c]) val mem) && is32Bit(c<<2) => (MOVWstore [int32(c)<<2] ptr val mem)
+(MOVHstoreidx2 ptr (MOVDconst [c]) val mem) && is32Bit(c<<1) => (MOVHstore [int32(c)<<1] ptr val mem)
+
+(MOVBload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVBload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVBUload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVBUload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVHload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHUload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVHUload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVWload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWUload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVWUload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVDload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVDload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(FMOVSload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (FMOVSload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(FMOVDload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (FMOVDload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+
+(MOVBstore [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVHstore [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVHstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVWstore [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVDstore [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVDstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(STP [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val1 val2 mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (STP [off1+off2] {mergeSym(sym1,sym2)} ptr val1 val2 mem)
+(FMOVSstore [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (FMOVSstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(FMOVDstore [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (FMOVDstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVBstorezero [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVBstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHstorezero [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVHstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWstorezero [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVWstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVDstorezero [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVDstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVQstorezero [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
+ && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
+ && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+ (MOVQstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+
+// store zero
+(MOVBstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVBstorezero [off] {sym} ptr mem)
+(MOVHstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVHstorezero [off] {sym} ptr mem)
+(MOVWstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVWstorezero [off] {sym} ptr mem)
+(MOVDstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVDstorezero [off] {sym} ptr mem)
+(STP [off] {sym} ptr (MOVDconst [0]) (MOVDconst [0]) mem) => (MOVQstorezero [off] {sym} ptr mem)
+
+// register indexed store zero
+(MOVDstorezero [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVDstorezeroidx ptr idx mem)
+(MOVWstorezero [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVWstorezeroidx ptr idx mem)
+(MOVHstorezero [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVHstorezeroidx ptr idx mem)
+(MOVBstorezero [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVBstorezeroidx ptr idx mem)
+(MOVDstoreidx ptr idx (MOVDconst [0]) mem) => (MOVDstorezeroidx ptr idx mem)
+(MOVWstoreidx ptr idx (MOVDconst [0]) mem) => (MOVWstorezeroidx ptr idx mem)
+(MOVHstoreidx ptr idx (MOVDconst [0]) mem) => (MOVHstorezeroidx ptr idx mem)
+(MOVBstoreidx ptr idx (MOVDconst [0]) mem) => (MOVBstorezeroidx ptr idx mem)
+(MOVDstorezeroidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVDstorezero [int32(c)] ptr mem)
+(MOVDstorezeroidx (MOVDconst [c]) idx mem) && is32Bit(c) => (MOVDstorezero [int32(c)] idx mem)
+(MOVWstorezeroidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVWstorezero [int32(c)] ptr mem)
+(MOVWstorezeroidx (MOVDconst [c]) idx mem) && is32Bit(c) => (MOVWstorezero [int32(c)] idx mem)
+(MOVHstorezeroidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVHstorezero [int32(c)] ptr mem)
+(MOVHstorezeroidx (MOVDconst [c]) idx mem) && is32Bit(c) => (MOVHstorezero [int32(c)] idx mem)
+(MOVBstorezeroidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVBstorezero [int32(c)] ptr mem)
+(MOVBstorezeroidx (MOVDconst [c]) idx mem) && is32Bit(c) => (MOVBstorezero [int32(c)] idx mem)
+
+// shifted register indexed store zero
+(MOVDstorezero [off] {sym} (ADDshiftLL [3] ptr idx) mem) && off == 0 && sym == nil => (MOVDstorezeroidx8 ptr idx mem)
+(MOVWstorezero [off] {sym} (ADDshiftLL [2] ptr idx) mem) && off == 0 && sym == nil => (MOVWstorezeroidx4 ptr idx mem)
+(MOVHstorezero [off] {sym} (ADDshiftLL [1] ptr idx) mem) && off == 0 && sym == nil => (MOVHstorezeroidx2 ptr idx mem)
+(MOVDstorezeroidx ptr (SLLconst [3] idx) mem) => (MOVDstorezeroidx8 ptr idx mem)
+(MOVWstorezeroidx ptr (SLLconst [2] idx) mem) => (MOVWstorezeroidx4 ptr idx mem)
+(MOVHstorezeroidx ptr (SLLconst [1] idx) mem) => (MOVHstorezeroidx2 ptr idx mem)
+(MOVHstorezeroidx ptr (ADD idx idx) mem) => (MOVHstorezeroidx2 ptr idx mem)
+(MOVDstorezeroidx (SLLconst [3] idx) ptr mem) => (MOVDstorezeroidx8 ptr idx mem)
+(MOVWstorezeroidx (SLLconst [2] idx) ptr mem) => (MOVWstorezeroidx4 ptr idx mem)
+(MOVHstorezeroidx (SLLconst [1] idx) ptr mem) => (MOVHstorezeroidx2 ptr idx mem)
+(MOVHstorezeroidx (ADD idx idx) ptr mem) => (MOVHstorezeroidx2 ptr idx mem)
+(MOVDstoreidx8 ptr idx (MOVDconst [0]) mem) => (MOVDstorezeroidx8 ptr idx mem)
+(MOVWstoreidx4 ptr idx (MOVDconst [0]) mem) => (MOVWstorezeroidx4 ptr idx mem)
+(MOVHstoreidx2 ptr idx (MOVDconst [0]) mem) => (MOVHstorezeroidx2 ptr idx mem)
+(MOVDstorezeroidx8 ptr (MOVDconst [c]) mem) && is32Bit(c<<3) => (MOVDstorezero [int32(c<<3)] ptr mem)
+(MOVWstorezeroidx4 ptr (MOVDconst [c]) mem) && is32Bit(c<<2) => (MOVWstorezero [int32(c<<2)] ptr mem)
+(MOVHstorezeroidx2 ptr (MOVDconst [c]) mem) && is32Bit(c<<1) => (MOVHstorezero [int32(c<<1)] ptr mem)
+
+// replace a load from the same location as a preceding store with zero/sign extension (or a copy in the full-width case)
+// these rules seem to interact badly with other rules, resulting in slower code
+//(MOVBload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVBreg x)
+//(MOVBUload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVBUreg x)
+//(MOVHload [off] {sym} ptr (MOVHstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVHreg x)
+//(MOVHUload [off] {sym} ptr (MOVHstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVHUreg x)
+//(MOVWload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVWreg x)
+//(MOVWUload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVWUreg x)
+//(MOVDload [off] {sym} ptr (MOVDstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> x
+//(FMOVSload [off] {sym} ptr (FMOVSstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> x
+//(FMOVDload [off] {sym} ptr (FMOVDstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> x
+
+(MOVBload [off] {sym} ptr (MOVBstorezero [off2] {sym2} ptr2 _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVDconst [0])
+(MOVBUload [off] {sym} ptr (MOVBstorezero [off2] {sym2} ptr2 _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVDconst [0])
+(MOVHload [off] {sym} ptr (MOVHstorezero [off2] {sym2} ptr2 _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVDconst [0])
+(MOVHUload [off] {sym} ptr (MOVHstorezero [off2] {sym2} ptr2 _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVDconst [0])
+(MOVWload [off] {sym} ptr (MOVWstorezero [off2] {sym2} ptr2 _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVDconst [0])
+(MOVWUload [off] {sym} ptr (MOVWstorezero [off2] {sym2} ptr2 _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVDconst [0])
+(MOVDload [off] {sym} ptr (MOVDstorezero [off2] {sym2} ptr2 _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVDconst [0])
+
+(MOVBloadidx ptr idx (MOVBstorezeroidx ptr2 idx2 _))
+ && (isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) || isSamePtr(ptr, idx2) && isSamePtr(idx, ptr2)) => (MOVDconst [0])
+(MOVBUloadidx ptr idx (MOVBstorezeroidx ptr2 idx2 _))
+ && (isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) || isSamePtr(ptr, idx2) && isSamePtr(idx, ptr2)) => (MOVDconst [0])
+(MOVHloadidx ptr idx (MOVHstorezeroidx ptr2 idx2 _))
+ && (isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) || isSamePtr(ptr, idx2) && isSamePtr(idx, ptr2)) => (MOVDconst [0])
+(MOVHUloadidx ptr idx (MOVHstorezeroidx ptr2 idx2 _))
+ && (isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) || isSamePtr(ptr, idx2) && isSamePtr(idx, ptr2)) => (MOVDconst [0])
+(MOVWloadidx ptr idx (MOVWstorezeroidx ptr2 idx2 _))
+ && (isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) || isSamePtr(ptr, idx2) && isSamePtr(idx, ptr2)) => (MOVDconst [0])
+(MOVWUloadidx ptr idx (MOVWstorezeroidx ptr2 idx2 _))
+ && (isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) || isSamePtr(ptr, idx2) && isSamePtr(idx, ptr2)) => (MOVDconst [0])
+(MOVDloadidx ptr idx (MOVDstorezeroidx ptr2 idx2 _))
+ && (isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) || isSamePtr(ptr, idx2) && isSamePtr(idx, ptr2)) => (MOVDconst [0])
+
+(MOVHloadidx2 ptr idx (MOVHstorezeroidx2 ptr2 idx2 _)) && isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) => (MOVDconst [0])
+(MOVHUloadidx2 ptr idx (MOVHstorezeroidx2 ptr2 idx2 _)) && isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) => (MOVDconst [0])
+(MOVWloadidx4 ptr idx (MOVWstorezeroidx4 ptr2 idx2 _)) && isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) => (MOVDconst [0])
+(MOVWUloadidx4 ptr idx (MOVWstorezeroidx4 ptr2 idx2 _)) && isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) => (MOVDconst [0])
+(MOVDloadidx8 ptr idx (MOVDstorezeroidx8 ptr2 idx2 _)) && isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) => (MOVDconst [0])
+
+// don't extend after a proper load; the load already sign/zero-extends
+(MOVBreg x:(MOVBload _ _)) => (MOVDreg x)
+(MOVBUreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVHreg x:(MOVBload _ _)) => (MOVDreg x)
+(MOVHreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVHreg x:(MOVHload _ _)) => (MOVDreg x)
+(MOVHUreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVHUreg x:(MOVHUload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVBload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVHload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVHUload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVWload _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVHUload _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVWUload _ _)) => (MOVDreg x)
+(MOVBreg x:(MOVBloadidx _ _ _)) => (MOVDreg x)
+(MOVBUreg x:(MOVBUloadidx _ _ _)) => (MOVDreg x)
+(MOVHreg x:(MOVBloadidx _ _ _)) => (MOVDreg x)
+(MOVHreg x:(MOVBUloadidx _ _ _)) => (MOVDreg x)
+(MOVHreg x:(MOVHloadidx _ _ _)) => (MOVDreg x)
+(MOVHUreg x:(MOVBUloadidx _ _ _)) => (MOVDreg x)
+(MOVHUreg x:(MOVHUloadidx _ _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVBloadidx _ _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVBUloadidx _ _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVHloadidx _ _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVHUloadidx _ _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVWloadidx _ _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVBUloadidx _ _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVHUloadidx _ _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVWUloadidx _ _ _)) => (MOVDreg x)
+(MOVHreg x:(MOVHloadidx2 _ _ _)) => (MOVDreg x)
+(MOVHUreg x:(MOVHUloadidx2 _ _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVHloadidx2 _ _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVHUloadidx2 _ _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVWloadidx4 _ _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVHUloadidx2 _ _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVWUloadidx4 _ _ _)) => (MOVDreg x)
+
+// fold double extensions
+(MOVBreg x:(MOVBreg _)) => (MOVDreg x)
+(MOVBUreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVHreg x:(MOVBreg _)) => (MOVDreg x)
+(MOVHreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVHreg x:(MOVHreg _)) => (MOVDreg x)
+(MOVHUreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVHUreg x:(MOVHUreg _)) => (MOVDreg x)
+(MOVWreg x:(MOVBreg _)) => (MOVDreg x)
+(MOVWreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVWreg x:(MOVHreg _)) => (MOVDreg x)
+(MOVWreg x:(MOVWreg _)) => (MOVDreg x)
+(MOVWUreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVWUreg x:(MOVHUreg _)) => (MOVDreg x)
+(MOVWUreg x:(MOVWUreg _)) => (MOVDreg x)
+
+// don't extend before store
+(MOVBstore [off] {sym} ptr (MOVBreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVWreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVWreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+(MOVBstoreidx ptr idx (MOVBreg x) mem) => (MOVBstoreidx ptr idx x mem)
+(MOVBstoreidx ptr idx (MOVBUreg x) mem) => (MOVBstoreidx ptr idx x mem)
+(MOVBstoreidx ptr idx (MOVHreg x) mem) => (MOVBstoreidx ptr idx x mem)
+(MOVBstoreidx ptr idx (MOVHUreg x) mem) => (MOVBstoreidx ptr idx x mem)
+(MOVBstoreidx ptr idx (MOVWreg x) mem) => (MOVBstoreidx ptr idx x mem)
+(MOVBstoreidx ptr idx (MOVWUreg x) mem) => (MOVBstoreidx ptr idx x mem)
+(MOVHstoreidx ptr idx (MOVHreg x) mem) => (MOVHstoreidx ptr idx x mem)
+(MOVHstoreidx ptr idx (MOVHUreg x) mem) => (MOVHstoreidx ptr idx x mem)
+(MOVHstoreidx ptr idx (MOVWreg x) mem) => (MOVHstoreidx ptr idx x mem)
+(MOVHstoreidx ptr idx (MOVWUreg x) mem) => (MOVHstoreidx ptr idx x mem)
+(MOVWstoreidx ptr idx (MOVWreg x) mem) => (MOVWstoreidx ptr idx x mem)
+(MOVWstoreidx ptr idx (MOVWUreg x) mem) => (MOVWstoreidx ptr idx x mem)
+(MOVHstoreidx2 ptr idx (MOVHreg x) mem) => (MOVHstoreidx2 ptr idx x mem)
+(MOVHstoreidx2 ptr idx (MOVHUreg x) mem) => (MOVHstoreidx2 ptr idx x mem)
+(MOVHstoreidx2 ptr idx (MOVWreg x) mem) => (MOVHstoreidx2 ptr idx x mem)
+(MOVHstoreidx2 ptr idx (MOVWUreg x) mem) => (MOVHstoreidx2 ptr idx x mem)
+(MOVWstoreidx4 ptr idx (MOVWreg x) mem) => (MOVWstoreidx4 ptr idx x mem)
+(MOVWstoreidx4 ptr idx (MOVWUreg x) mem) => (MOVWstoreidx4 ptr idx x mem)
+
+// if a register move has only 1 use, just use the same register without emitting an instruction
+// MOVDnop doesn't emit an instruction; it exists only to carry the type.
+(MOVDreg x) && x.Uses == 1 => (MOVDnop x)
+
+// fold constant into arithmetic ops
+(ADD x (MOVDconst [c])) => (ADDconst [c] x)
+(SUB x (MOVDconst [c])) => (SUBconst [c] x)
+(AND x (MOVDconst [c])) => (ANDconst [c] x)
+(OR x (MOVDconst [c])) => (ORconst [c] x)
+(XOR x (MOVDconst [c])) => (XORconst [c] x)
+(TST x (MOVDconst [c])) => (TSTconst [c] x)
+(TSTW x (MOVDconst [c])) => (TSTWconst [int32(c)] x)
+(CMN x (MOVDconst [c])) => (CMNconst [c] x)
+(CMNW x (MOVDconst [c])) => (CMNWconst [int32(c)] x)
+(BIC x (MOVDconst [c])) => (ANDconst [^c] x)
+(EON x (MOVDconst [c])) => (XORconst [^c] x)
+(ORN x (MOVDconst [c])) => (ORconst [^c] x)
+
+(SLL x (MOVDconst [c])) => (SLLconst x [c&63]) // Note: I don't think we ever generate bad constant shifts (i.e. c>=64)
+(SRL x (MOVDconst [c])) => (SRLconst x [c&63])
+(SRA x (MOVDconst [c])) => (SRAconst x [c&63])
+
+(CMP x (MOVDconst [c])) => (CMPconst [c] x)
+(CMP (MOVDconst [c]) x) => (InvertFlags (CMPconst [c] x))
+(CMPW x (MOVDconst [c])) => (CMPWconst [int32(c)] x)
+(CMPW (MOVDconst [c]) x) => (InvertFlags (CMPWconst [int32(c)] x))
+
+// Canonicalize the order of arguments to comparisons - helps with CSE.
+((CMP|CMPW) x y) && x.ID > y.ID => (InvertFlags ((CMP|CMPW) y x))
+
+// mul-neg => mneg
+(NEG (MUL x y)) => (MNEG x y)
+(NEG (MULW x y)) => (MNEGW x y)
+(MUL (NEG x) y) => (MNEG x y)
+(MULW (NEG x) y) => (MNEGW x y)
+
+// madd/msub
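+// For example, a+x*y with the product otherwise unused becomes a single MADD,
+// and a-x*y becomes a single MSUB (illustrative of the rules below).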
+(ADD a l:(MUL x y)) && l.Uses==1 && clobber(l) => (MADD a x y)
+(SUB a l:(MUL x y)) && l.Uses==1 && clobber(l) => (MSUB a x y)
+(ADD a l:(MNEG x y)) && l.Uses==1 && clobber(l) => (MSUB a x y)
+(SUB a l:(MNEG x y)) && l.Uses==1 && clobber(l) => (MADD a x y)
+
+(ADD a l:(MULW x y)) && a.Type.Size() != 8 && l.Uses==1 && clobber(l) => (MADDW a x y)
+(SUB a l:(MULW x y)) && a.Type.Size() != 8 && l.Uses==1 && clobber(l) => (MSUBW a x y)
+(ADD a l:(MNEGW x y)) && a.Type.Size() != 8 && l.Uses==1 && clobber(l) => (MSUBW a x y)
+(SUB a l:(MNEGW x y)) && a.Type.Size() != 8 && l.Uses==1 && clobber(l) => (MADDW a x y)
+
+// optimize ADCSflags, SBCSflags and friends
+(ADCSflags x y (Select1 <types.TypeFlags> (ADDSconstflags [-1] (ADCzerocarry <typ.UInt64> c)))) => (ADCSflags x y c)
+(ADCSflags x y (Select1 <types.TypeFlags> (ADDSconstflags [-1] (MOVDconst [0])))) => (ADDSflags x y)
+(SBCSflags x y (Select1 <types.TypeFlags> (NEGSflags (NEG <typ.UInt64> (NGCzerocarry <typ.UInt64> bo))))) => (SBCSflags x y bo)
+(SBCSflags x y (Select1 <types.TypeFlags> (NEGSflags (MOVDconst [0])))) => (SUBSflags x y)
+
+// mul by constant
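+// Strength reduction, e.g. x*8 => x<<3, x*5 => x + x<<2, x*7 => x<<3 - x
+// (illustrative of the rules below).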
+(MUL x (MOVDconst [-1])) => (NEG x)
+(MUL _ (MOVDconst [0])) => (MOVDconst [0])
+(MUL x (MOVDconst [1])) => x
+(MUL x (MOVDconst [c])) && isPowerOfTwo64(c) => (SLLconst [log64(c)] x)
+(MUL x (MOVDconst [c])) && isPowerOfTwo64(c-1) && c >= 3 => (ADDshiftLL x x [log64(c-1)])
+(MUL x (MOVDconst [c])) && isPowerOfTwo64(c+1) && c >= 7 => (ADDshiftLL (NEG <x.Type> x) x [log64(c+1)])
+(MUL x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) => (SLLconst [log64(c/3)] (ADDshiftLL <x.Type> x x [1]))
+(MUL x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) => (SLLconst [log64(c/5)] (ADDshiftLL <x.Type> x x [2]))
+(MUL x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) => (SLLconst [log64(c/7)] (ADDshiftLL <x.Type> (NEG <x.Type> x) x [3]))
+(MUL x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) => (SLLconst [log64(c/9)] (ADDshiftLL <x.Type> x x [3]))
+
+(MULW x (MOVDconst [c])) && int32(c)==-1 => (NEG x)
+(MULW _ (MOVDconst [c])) && int32(c)==0 => (MOVDconst [0])
+(MULW x (MOVDconst [c])) && int32(c)==1 => x
+(MULW x (MOVDconst [c])) && isPowerOfTwo64(c) => (SLLconst [log64(c)] x)
+(MULW x (MOVDconst [c])) && isPowerOfTwo64(c-1) && int32(c) >= 3 => (ADDshiftLL x x [log64(c-1)])
+(MULW x (MOVDconst [c])) && isPowerOfTwo64(c+1) && int32(c) >= 7 => (ADDshiftLL (NEG <x.Type> x) x [log64(c+1)])
+(MULW x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (SLLconst [log64(c/3)] (ADDshiftLL <x.Type> x x [1]))
+(MULW x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (SLLconst [log64(c/5)] (ADDshiftLL <x.Type> x x [2]))
+(MULW x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (SLLconst [log64(c/7)] (ADDshiftLL <x.Type> (NEG <x.Type> x) x [3]))
+(MULW x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (SLLconst [log64(c/9)] (ADDshiftLL <x.Type> x x [3]))
+
+// mneg by constant
+(MNEG x (MOVDconst [-1])) => x
+(MNEG _ (MOVDconst [0])) => (MOVDconst [0])
+(MNEG x (MOVDconst [1])) => (NEG x)
+(MNEG x (MOVDconst [c])) && isPowerOfTwo64(c) => (NEG (SLLconst <x.Type> [log64(c)] x))
+(MNEG x (MOVDconst [c])) && isPowerOfTwo64(c-1) && c >= 3 => (NEG (ADDshiftLL <x.Type> x x [log64(c-1)]))
+(MNEG x (MOVDconst [c])) && isPowerOfTwo64(c+1) && c >= 7 => (NEG (ADDshiftLL <x.Type> (NEG <x.Type> x) x [log64(c+1)]))
+(MNEG x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) => (SLLconst <x.Type> [log64(c/3)] (SUBshiftLL <x.Type> x x [2]))
+(MNEG x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) => (NEG (SLLconst <x.Type> [log64(c/5)] (ADDshiftLL <x.Type> x x [2])))
+(MNEG x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) => (SLLconst <x.Type> [log64(c/7)] (SUBshiftLL <x.Type> x x [3]))
+(MNEG x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) => (NEG (SLLconst <x.Type> [log64(c/9)] (ADDshiftLL <x.Type> x x [3])))
+
+
+(MNEGW x (MOVDconst [c])) && int32(c)==-1 => x
+(MNEGW _ (MOVDconst [c])) && int32(c)==0 => (MOVDconst [0])
+(MNEGW x (MOVDconst [c])) && int32(c)==1 => (NEG x)
+(MNEGW x (MOVDconst [c])) && isPowerOfTwo64(c) => (NEG (SLLconst <x.Type> [log64(c)] x))
+(MNEGW x (MOVDconst [c])) && isPowerOfTwo64(c-1) && int32(c) >= 3 => (NEG (ADDshiftLL <x.Type> x x [log64(c-1)]))
+(MNEGW x (MOVDconst [c])) && isPowerOfTwo64(c+1) && int32(c) >= 7 => (NEG (ADDshiftLL <x.Type> (NEG <x.Type> x) x [log64(c+1)]))
+(MNEGW x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (SLLconst <x.Type> [log64(c/3)] (SUBshiftLL <x.Type> x x [2]))
+(MNEGW x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (NEG (SLLconst <x.Type> [log64(c/5)] (ADDshiftLL <x.Type> x x [2])))
+(MNEGW x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (SLLconst <x.Type> [log64(c/7)] (SUBshiftLL <x.Type> x x [3]))
+(MNEGW x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (NEG (SLLconst <x.Type> [log64(c/9)] (ADDshiftLL <x.Type> x x [3])))
+
+
+(MADD a x (MOVDconst [-1])) => (SUB a x)
+(MADD a _ (MOVDconst [0])) => a
+(MADD a x (MOVDconst [1])) => (ADD a x)
+(MADD a x (MOVDconst [c])) && isPowerOfTwo64(c) => (ADDshiftLL a x [log64(c)])
+(MADD a x (MOVDconst [c])) && isPowerOfTwo64(c-1) && c>=3 => (ADD a (ADDshiftLL <x.Type> x x [log64(c-1)]))
+(MADD a x (MOVDconst [c])) && isPowerOfTwo64(c+1) && c>=7 => (SUB a (SUBshiftLL <x.Type> x x [log64(c+1)]))
+(MADD a x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)])
+(MADD a x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)])
+(MADD a x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)])
+(MADD a x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)])
+
+(MADD a (MOVDconst [-1]) x) => (SUB a x)
+(MADD a (MOVDconst [0]) _) => a
+(MADD a (MOVDconst [1]) x) => (ADD a x)
+(MADD a (MOVDconst [c]) x) && isPowerOfTwo64(c) => (ADDshiftLL a x [log64(c)])
+(MADD a (MOVDconst [c]) x) && isPowerOfTwo64(c-1) && c>=3 => (ADD a (ADDshiftLL <x.Type> x x [log64(c-1)]))
+(MADD a (MOVDconst [c]) x) && isPowerOfTwo64(c+1) && c>=7 => (SUB a (SUBshiftLL <x.Type> x x [log64(c+1)]))
+(MADD a (MOVDconst [c]) x) && c%3 == 0 && isPowerOfTwo64(c/3) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)])
+(MADD a (MOVDconst [c]) x) && c%5 == 0 && isPowerOfTwo64(c/5) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)])
+(MADD a (MOVDconst [c]) x) && c%7 == 0 && isPowerOfTwo64(c/7) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)])
+(MADD a (MOVDconst [c]) x) && c%9 == 0 && isPowerOfTwo64(c/9) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)])
+
+(MADDW a x (MOVDconst [c])) && int32(c)==-1 => (SUB a x)
+(MADDW a _ (MOVDconst [c])) && int32(c)==0 => a
+(MADDW a x (MOVDconst [c])) && int32(c)==1 => (ADD a x)
+(MADDW a x (MOVDconst [c])) && isPowerOfTwo64(c) => (ADDshiftLL a x [log64(c)])
+(MADDW a x (MOVDconst [c])) && isPowerOfTwo64(c-1) && int32(c)>=3 => (ADD a (ADDshiftLL <x.Type> x x [log64(c-1)]))
+(MADDW a x (MOVDconst [c])) && isPowerOfTwo64(c+1) && int32(c)>=7 => (SUB a (SUBshiftLL <x.Type> x x [log64(c+1)]))
+(MADDW a x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)])
+(MADDW a x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)])
+(MADDW a x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)])
+(MADDW a x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)])
+
+(MADDW a (MOVDconst [c]) x) && int32(c)==-1 => (SUB a x)
+(MADDW a (MOVDconst [c]) _) && int32(c)==0 => a
+(MADDW a (MOVDconst [c]) x) && int32(c)==1 => (ADD a x)
+(MADDW a (MOVDconst [c]) x) && isPowerOfTwo64(c) => (ADDshiftLL a x [log64(c)])
+(MADDW a (MOVDconst [c]) x) && isPowerOfTwo64(c-1) && int32(c)>=3 => (ADD a (ADDshiftLL <x.Type> x x [log64(c-1)]))
+(MADDW a (MOVDconst [c]) x) && isPowerOfTwo64(c+1) && int32(c)>=7 => (SUB a (SUBshiftLL <x.Type> x x [log64(c+1)]))
+(MADDW a (MOVDconst [c]) x) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)])
+(MADDW a (MOVDconst [c]) x) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)])
+(MADDW a (MOVDconst [c]) x) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)])
+(MADDW a (MOVDconst [c]) x) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)])
+
+(MSUB a x (MOVDconst [-1])) => (ADD a x)
+(MSUB a _ (MOVDconst [0])) => a
+(MSUB a x (MOVDconst [1])) => (SUB a x)
+(MSUB a x (MOVDconst [c])) && isPowerOfTwo64(c) => (SUBshiftLL a x [log64(c)])
+(MSUB a x (MOVDconst [c])) && isPowerOfTwo64(c-1) && c>=3 => (SUB a (ADDshiftLL <x.Type> x x [log64(c-1)]))
+(MSUB a x (MOVDconst [c])) && isPowerOfTwo64(c+1) && c>=7 => (ADD a (SUBshiftLL <x.Type> x x [log64(c+1)]))
+(MSUB a x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)])
+(MSUB a x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)])
+(MSUB a x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)])
+(MSUB a x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)])
+
+(MSUB a (MOVDconst [-1]) x) => (ADD a x)
+(MSUB a (MOVDconst [0]) _) => a
+(MSUB a (MOVDconst [1]) x) => (SUB a x)
+(MSUB a (MOVDconst [c]) x) && isPowerOfTwo64(c) => (SUBshiftLL a x [log64(c)])
+(MSUB a (MOVDconst [c]) x) && isPowerOfTwo64(c-1) && c>=3 => (SUB a (ADDshiftLL <x.Type> x x [log64(c-1)]))
+(MSUB a (MOVDconst [c]) x) && isPowerOfTwo64(c+1) && c>=7 => (ADD a (SUBshiftLL <x.Type> x x [log64(c+1)]))
+(MSUB a (MOVDconst [c]) x) && c%3 == 0 && isPowerOfTwo64(c/3) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)])
+(MSUB a (MOVDconst [c]) x) && c%5 == 0 && isPowerOfTwo64(c/5) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)])
+(MSUB a (MOVDconst [c]) x) && c%7 == 0 && isPowerOfTwo64(c/7) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)])
+(MSUB a (MOVDconst [c]) x) && c%9 == 0 && isPowerOfTwo64(c/9) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)])
+
+(MSUBW a x (MOVDconst [c])) && int32(c)==-1 => (ADD a x)
+(MSUBW a _ (MOVDconst [c])) && int32(c)==0 => a
+(MSUBW a x (MOVDconst [c])) && int32(c)==1 => (SUB a x)
+(MSUBW a x (MOVDconst [c])) && isPowerOfTwo64(c) => (SUBshiftLL a x [log64(c)])
+(MSUBW a x (MOVDconst [c])) && isPowerOfTwo64(c-1) && int32(c)>=3 => (SUB a (ADDshiftLL <x.Type> x x [log64(c-1)]))
+(MSUBW a x (MOVDconst [c])) && isPowerOfTwo64(c+1) && int32(c)>=7 => (ADD a (SUBshiftLL <x.Type> x x [log64(c+1)]))
+(MSUBW a x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)])
+(MSUBW a x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)])
+(MSUBW a x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)])
+(MSUBW a x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)])
+
+(MSUBW a (MOVDconst [c]) x) && int32(c)==-1 => (ADD a x)
+(MSUBW a (MOVDconst [c]) _) && int32(c)==0 => a
+(MSUBW a (MOVDconst [c]) x) && int32(c)==1 => (SUB a x)
+(MSUBW a (MOVDconst [c]) x) && isPowerOfTwo64(c) => (SUBshiftLL a x [log64(c)])
+(MSUBW a (MOVDconst [c]) x) && isPowerOfTwo64(c-1) && int32(c)>=3 => (SUB a (ADDshiftLL <x.Type> x x [log64(c-1)]))
+(MSUBW a (MOVDconst [c]) x) && isPowerOfTwo64(c+1) && int32(c)>=7 => (ADD a (SUBshiftLL <x.Type> x x [log64(c+1)]))
+(MSUBW a (MOVDconst [c]) x) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)])
+(MSUBW a (MOVDconst [c]) x) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)])
+(MSUBW a (MOVDconst [c]) x) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)])
+(MSUBW a (MOVDconst [c]) x) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)])
+
+// div by constant
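+// For unsigned operands, e.g. x/8 => x>>3 and x%8 => x&7 (illustrative).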
+(UDIV x (MOVDconst [1])) => x
+(UDIV x (MOVDconst [c])) && isPowerOfTwo64(c) => (SRLconst [log64(c)] x)
+(UDIVW x (MOVDconst [c])) && uint32(c)==1 => x
+(UDIVW x (MOVDconst [c])) && isPowerOfTwo64(c) && is32Bit(c) => (SRLconst [log64(c)] x)
+(UMOD _ (MOVDconst [1])) => (MOVDconst [0])
+(UMOD x (MOVDconst [c])) && isPowerOfTwo64(c) => (ANDconst [c-1] x)
+(UMODW _ (MOVDconst [c])) && uint32(c)==1 => (MOVDconst [0])
+(UMODW x (MOVDconst [c])) && isPowerOfTwo64(c) && is32Bit(c) => (ANDconst [c-1] x)
+
+// generic simplifications
+(ADD x (NEG y)) => (SUB x y)
+(SUB x x) => (MOVDconst [0])
+(AND x x) => x
+(OR x x) => x
+(XOR x x) => (MOVDconst [0])
+(BIC x x) => (MOVDconst [0])
+(EON x x) => (MOVDconst [-1])
+(ORN x x) => (MOVDconst [-1])
+(AND x (MVN y)) => (BIC x y)
+(XOR x (MVN y)) => (EON x y)
+(OR x (MVN y)) => (ORN x y)
+(MVN (XOR x y)) => (EON x y)
+(CSEL [cc] x (MOVDconst [0]) flag) => (CSEL0 [cc] x flag)
+(CSEL [cc] (MOVDconst [0]) y flag) => (CSEL0 [arm64Negate(cc)] y flag)
+(SUB x (SUB y z)) => (SUB (ADD <v.Type> x z) y)
+(SUB (SUB x y) z) => (SUB x (ADD <y.Type> y z))
+
+// remove redundant *const ops
+(ADDconst [0] x) => x
+(SUBconst [0] x) => x
+(ANDconst [0] _) => (MOVDconst [0])
+(ANDconst [-1] x) => x
+(ORconst [0] x) => x
+(ORconst [-1] _) => (MOVDconst [-1])
+(XORconst [0] x) => x
+(XORconst [-1] x) => (MVN x)
+
+// generic constant folding
+(ADDconst [c] (MOVDconst [d])) => (MOVDconst [c+d])
+(ADDconst [c] (ADDconst [d] x)) => (ADDconst [c+d] x)
+(ADDconst [c] (SUBconst [d] x)) => (ADDconst [c-d] x)
+(SUBconst [c] (MOVDconst [d])) => (MOVDconst [d-c])
+(SUBconst [c] (SUBconst [d] x)) => (ADDconst [-c-d] x)
+(SUBconst [c] (ADDconst [d] x)) => (ADDconst [-c+d] x)
+(SLLconst [c] (MOVDconst [d])) => (MOVDconst [d<<uint64(c)])
+(SRLconst [c] (MOVDconst [d])) => (MOVDconst [int64(uint64(d)>>uint64(c))])
+(SRAconst [c] (MOVDconst [d])) => (MOVDconst [d>>uint64(c)])
+(MUL (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c*d])
+(MULW (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [int64(int32(c)*int32(d))])
+(MNEG (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [-c*d])
+(MNEGW (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [-int64(int32(c)*int32(d))])
+(MADD (MOVDconst [c]) x y) => (ADDconst [c] (MUL <x.Type> x y))
+(MADDW (MOVDconst [c]) x y) => (ADDconst [c] (MULW <x.Type> x y))
+(MSUB (MOVDconst [c]) x y) => (ADDconst [c] (MNEG <x.Type> x y))
+(MSUBW (MOVDconst [c]) x y) => (ADDconst [c] (MNEGW <x.Type> x y))
+(MADD a (MOVDconst [c]) (MOVDconst [d])) => (ADDconst [c*d] a)
+(MADDW a (MOVDconst [c]) (MOVDconst [d])) => (ADDconst [int64(int32(c)*int32(d))] a)
+(MSUB a (MOVDconst [c]) (MOVDconst [d])) => (SUBconst [c*d] a)
+(MSUBW a (MOVDconst [c]) (MOVDconst [d])) => (SUBconst [int64(int32(c)*int32(d))] a)
+(DIV (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [c/d])
+(UDIV (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [int64(uint64(c)/uint64(d))])
+(DIVW (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [int64(int32(c)/int32(d))])
+(UDIVW (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [int64(uint32(c)/uint32(d))])
+(MOD (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [c%d])
+(UMOD (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [int64(uint64(c)%uint64(d))])
+(MODW (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [int64(int32(c)%int32(d))])
+(UMODW (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [int64(uint32(c)%uint32(d))])
+(ANDconst [c] (MOVDconst [d])) => (MOVDconst [c&d])
+(ANDconst [c] (ANDconst [d] x)) => (ANDconst [c&d] x)
+(ANDconst [c] (MOVWUreg x)) => (ANDconst [c&(1<<32-1)] x)
+(ANDconst [c] (MOVHUreg x)) => (ANDconst [c&(1<<16-1)] x)
+(ANDconst [c] (MOVBUreg x)) => (ANDconst [c&(1<<8-1)] x)
+(MOVWUreg (ANDconst [c] x)) => (ANDconst [c&(1<<32-1)] x)
+(MOVHUreg (ANDconst [c] x)) => (ANDconst [c&(1<<16-1)] x)
+(MOVBUreg (ANDconst [c] x)) => (ANDconst [c&(1<<8-1)] x)
+(ORconst [c] (MOVDconst [d])) => (MOVDconst [c|d])
+(ORconst [c] (ORconst [d] x)) => (ORconst [c|d] x)
+(XORconst [c] (MOVDconst [d])) => (MOVDconst [c^d])
+(XORconst [c] (XORconst [d] x)) => (XORconst [c^d] x)
+(MVN (MOVDconst [c])) => (MOVDconst [^c])
+(NEG (MOVDconst [c])) => (MOVDconst [-c])
+(MOVBreg (MOVDconst [c])) => (MOVDconst [int64(int8(c))])
+(MOVBUreg (MOVDconst [c])) => (MOVDconst [int64(uint8(c))])
+(MOVHreg (MOVDconst [c])) => (MOVDconst [int64(int16(c))])
+(MOVHUreg (MOVDconst [c])) => (MOVDconst [int64(uint16(c))])
+(MOVWreg (MOVDconst [c])) => (MOVDconst [int64(int32(c))])
+(MOVWUreg (MOVDconst [c])) => (MOVDconst [int64(uint32(c))])
+(MOVDreg (MOVDconst [c])) => (MOVDconst [c])
+
+// constant comparisons
+(CMPconst (MOVDconst [x]) [y]) => (FlagConstant [subFlags64(x,y)])
+(CMPWconst (MOVDconst [x]) [y]) => (FlagConstant [subFlags32(int32(x),y)])
+(TSTconst (MOVDconst [x]) [y]) => (FlagConstant [logicFlags64(x&y)])
+(TSTWconst (MOVDconst [x]) [y]) => (FlagConstant [logicFlags32(int32(x)&y)])
+(CMNconst (MOVDconst [x]) [y]) => (FlagConstant [addFlags64(x,y)])
+(CMNWconst (MOVDconst [x]) [y]) => (FlagConstant [addFlags32(int32(x),y)])
+
+// other known comparisons
+(CMPconst (MOVBUreg _) [c]) && 0xff < c => (FlagConstant [subFlags64(0,1)])
+(CMPconst (MOVHUreg _) [c]) && 0xffff < c => (FlagConstant [subFlags64(0,1)])
+(CMPconst (MOVWUreg _) [c]) && 0xffffffff < c => (FlagConstant [subFlags64(0,1)])
+(CMPconst (ANDconst _ [m]) [n]) && 0 <= m && m < n => (FlagConstant [subFlags64(0,1)])
+(CMPconst (SRLconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 63 && (1<<uint64(64-c)) <= uint64(n) => (FlagConstant [subFlags64(0,1)])
+(CMPWconst (MOVBUreg _) [c]) && 0xff < c => (FlagConstant [subFlags64(0,1)])
+(CMPWconst (MOVHUreg _) [c]) && 0xffff < c => (FlagConstant [subFlags64(0,1)])
+
+// absorb flag constants into branches
+(EQ (FlagConstant [fc]) yes no) && fc.eq() => (First yes no)
+(EQ (FlagConstant [fc]) yes no) && !fc.eq() => (First no yes)
+
+(NE (FlagConstant [fc]) yes no) && fc.ne() => (First yes no)
+(NE (FlagConstant [fc]) yes no) && !fc.ne() => (First no yes)
+
+(LT (FlagConstant [fc]) yes no) && fc.lt() => (First yes no)
+(LT (FlagConstant [fc]) yes no) && !fc.lt() => (First no yes)
+
+(LE (FlagConstant [fc]) yes no) && fc.le() => (First yes no)
+(LE (FlagConstant [fc]) yes no) && !fc.le() => (First no yes)
+
+(GT (FlagConstant [fc]) yes no) && fc.gt() => (First yes no)
+(GT (FlagConstant [fc]) yes no) && !fc.gt() => (First no yes)
+
+(GE (FlagConstant [fc]) yes no) && fc.ge() => (First yes no)
+(GE (FlagConstant [fc]) yes no) && !fc.ge() => (First no yes)
+
+(ULT (FlagConstant [fc]) yes no) && fc.ult() => (First yes no)
+(ULT (FlagConstant [fc]) yes no) && !fc.ult() => (First no yes)
+
+(ULE (FlagConstant [fc]) yes no) && fc.ule() => (First yes no)
+(ULE (FlagConstant [fc]) yes no) && !fc.ule() => (First no yes)
+
+(UGT (FlagConstant [fc]) yes no) && fc.ugt() => (First yes no)
+(UGT (FlagConstant [fc]) yes no) && !fc.ugt() => (First no yes)
+
+(UGE (FlagConstant [fc]) yes no) && fc.uge() => (First yes no)
+(UGE (FlagConstant [fc]) yes no) && !fc.uge() => (First no yes)
+
+(LTnoov (FlagConstant [fc]) yes no) && fc.ltNoov() => (First yes no)
+(LTnoov (FlagConstant [fc]) yes no) && !fc.ltNoov() => (First no yes)
+
+(LEnoov (FlagConstant [fc]) yes no) && fc.leNoov() => (First yes no)
+(LEnoov (FlagConstant [fc]) yes no) && !fc.leNoov() => (First no yes)
+
+(GTnoov (FlagConstant [fc]) yes no) && fc.gtNoov() => (First yes no)
+(GTnoov (FlagConstant [fc]) yes no) && !fc.gtNoov() => (First no yes)
+
+(GEnoov (FlagConstant [fc]) yes no) && fc.geNoov() => (First yes no)
+(GEnoov (FlagConstant [fc]) yes no) && !fc.geNoov() => (First no yes)
+
+(Z (MOVDconst [0]) yes no) => (First yes no)
+(Z (MOVDconst [c]) yes no) && c != 0 => (First no yes)
+(NZ (MOVDconst [0]) yes no) => (First no yes)
+(NZ (MOVDconst [c]) yes no) && c != 0 => (First yes no)
+(ZW (MOVDconst [c]) yes no) && int32(c) == 0 => (First yes no)
+(ZW (MOVDconst [c]) yes no) && int32(c) != 0 => (First no yes)
+(NZW (MOVDconst [c]) yes no) && int32(c) == 0 => (First no yes)
+(NZW (MOVDconst [c]) yes no) && int32(c) != 0 => (First yes no)
+
+// absorb InvertFlags into branches
+(LT (InvertFlags cmp) yes no) => (GT cmp yes no)
+(GT (InvertFlags cmp) yes no) => (LT cmp yes no)
+(LE (InvertFlags cmp) yes no) => (GE cmp yes no)
+(GE (InvertFlags cmp) yes no) => (LE cmp yes no)
+(ULT (InvertFlags cmp) yes no) => (UGT cmp yes no)
+(UGT (InvertFlags cmp) yes no) => (ULT cmp yes no)
+(ULE (InvertFlags cmp) yes no) => (UGE cmp yes no)
+(UGE (InvertFlags cmp) yes no) => (ULE cmp yes no)
+(EQ (InvertFlags cmp) yes no) => (EQ cmp yes no)
+(NE (InvertFlags cmp) yes no) => (NE cmp yes no)
+(FLT (InvertFlags cmp) yes no) => (FGT cmp yes no)
+(FGT (InvertFlags cmp) yes no) => (FLT cmp yes no)
+(FLE (InvertFlags cmp) yes no) => (FGE cmp yes no)
+(FGE (InvertFlags cmp) yes no) => (FLE cmp yes no)
+(LTnoov (InvertFlags cmp) yes no) => (GTnoov cmp yes no)
+(GEnoov (InvertFlags cmp) yes no) => (LEnoov cmp yes no)
+(LEnoov (InvertFlags cmp) yes no) => (GEnoov cmp yes no)
+(GTnoov (InvertFlags cmp) yes no) => (LTnoov cmp yes no)
+
+// absorb InvertFlags into CSEL(0)
+(CSEL [cc] x y (InvertFlags cmp)) => (CSEL [arm64Invert(cc)] x y cmp)
+(CSEL0 [cc] x (InvertFlags cmp)) => (CSEL0 [arm64Invert(cc)] x cmp)
+
+// absorb flag constants into boolean values
+(Equal (FlagConstant [fc])) => (MOVDconst [b2i(fc.eq())])
+(NotEqual (FlagConstant [fc])) => (MOVDconst [b2i(fc.ne())])
+(LessThan (FlagConstant [fc])) => (MOVDconst [b2i(fc.lt())])
+(LessThanU (FlagConstant [fc])) => (MOVDconst [b2i(fc.ult())])
+(LessEqual (FlagConstant [fc])) => (MOVDconst [b2i(fc.le())])
+(LessEqualU (FlagConstant [fc])) => (MOVDconst [b2i(fc.ule())])
+(GreaterThan (FlagConstant [fc])) => (MOVDconst [b2i(fc.gt())])
+(GreaterThanU (FlagConstant [fc])) => (MOVDconst [b2i(fc.ugt())])
+(GreaterEqual (FlagConstant [fc])) => (MOVDconst [b2i(fc.ge())])
+(GreaterEqualU (FlagConstant [fc])) => (MOVDconst [b2i(fc.uge())])
+
+// absorb InvertFlags into boolean values
+(Equal (InvertFlags x)) => (Equal x)
+(NotEqual (InvertFlags x)) => (NotEqual x)
+(LessThan (InvertFlags x)) => (GreaterThan x)
+(LessThanU (InvertFlags x)) => (GreaterThanU x)
+(GreaterThan (InvertFlags x)) => (LessThan x)
+(GreaterThanU (InvertFlags x)) => (LessThanU x)
+(LessEqual (InvertFlags x)) => (GreaterEqual x)
+(LessEqualU (InvertFlags x)) => (GreaterEqualU x)
+(GreaterEqual (InvertFlags x)) => (LessEqual x)
+(GreaterEqualU (InvertFlags x)) => (LessEqualU x)
+(LessThanF (InvertFlags x)) => (GreaterThanF x)
+(LessEqualF (InvertFlags x)) => (GreaterEqualF x)
+(GreaterThanF (InvertFlags x)) => (LessThanF x)
+(GreaterEqualF (InvertFlags x)) => (LessEqualF x)
+
+// Boolean-generating instructions always
+// zero the upper bits of the register; no need to zero-extend
+(MOVBUreg x) && x.Type.IsBoolean() => (MOVDreg x)
+
+// absorb flag constants into conditional instructions
+(CSEL [cc] x _ flag) && ccARM64Eval(cc, flag) > 0 => x
+(CSEL [cc] _ y flag) && ccARM64Eval(cc, flag) < 0 => y
+(CSEL0 [cc] x flag) && ccARM64Eval(cc, flag) > 0 => x
+(CSEL0 [cc] _ flag) && ccARM64Eval(cc, flag) < 0 => (MOVDconst [0])
+
+// absorb flags back into boolean CSEL
+(CSEL [cc] x y (CMPWconst [0] boolval)) && cc == OpARM64NotEqual && flagArg(boolval) != nil =>
+ (CSEL [boolval.Op] x y flagArg(boolval))
+(CSEL [cc] x y (CMPWconst [0] boolval)) && cc == OpARM64Equal && flagArg(boolval) != nil =>
+ (CSEL [arm64Negate(boolval.Op)] x y flagArg(boolval))
+(CSEL0 [cc] x (CMPWconst [0] boolval)) && cc == OpARM64NotEqual && flagArg(boolval) != nil =>
+ (CSEL0 [boolval.Op] x flagArg(boolval))
+(CSEL0 [cc] x (CMPWconst [0] boolval)) && cc == OpARM64Equal && flagArg(boolval) != nil =>
+ (CSEL0 [arm64Negate(boolval.Op)] x flagArg(boolval))
+
+// absorb shifts into ops
+(NEG x:(SLLconst [c] y)) && clobberIfDead(x) => (NEGshiftLL [c] y)
+(NEG x:(SRLconst [c] y)) && clobberIfDead(x) => (NEGshiftRL [c] y)
+(NEG x:(SRAconst [c] y)) && clobberIfDead(x) => (NEGshiftRA [c] y)
+(MVN x:(SLLconst [c] y)) && clobberIfDead(x) => (MVNshiftLL [c] y)
+(MVN x:(SRLconst [c] y)) && clobberIfDead(x) => (MVNshiftRL [c] y)
+(MVN x:(SRAconst [c] y)) && clobberIfDead(x) => (MVNshiftRA [c] y)
+(ADD x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (ADDshiftLL x0 y [c])
+(ADD x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (ADDshiftRL x0 y [c])
+(ADD x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (ADDshiftRA x0 y [c])
+(SUB x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (SUBshiftLL x0 y [c])
+(SUB x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (SUBshiftRL x0 y [c])
+(SUB x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (SUBshiftRA x0 y [c])
+(AND x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (ANDshiftLL x0 y [c])
+(AND x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (ANDshiftRL x0 y [c])
+(AND x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (ANDshiftRA x0 y [c])
+(OR x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (ORshiftLL x0 y [c]) // useful for combined load
+(OR x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (ORshiftRL x0 y [c])
+(OR x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (ORshiftRA x0 y [c])
+(XOR x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (XORshiftLL x0 y [c])
+(XOR x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (XORshiftRL x0 y [c])
+(XOR x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (XORshiftRA x0 y [c])
+(BIC x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (BICshiftLL x0 y [c])
+(BIC x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (BICshiftRL x0 y [c])
+(BIC x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (BICshiftRA x0 y [c])
+(ORN x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (ORNshiftLL x0 y [c])
+(ORN x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (ORNshiftRL x0 y [c])
+(ORN x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (ORNshiftRA x0 y [c])
+(EON x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (EONshiftLL x0 y [c])
+(EON x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (EONshiftRL x0 y [c])
+(EON x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (EONshiftRA x0 y [c])
+(CMP x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (CMPshiftLL x0 y [c])
+(CMP x0:(SLLconst [c] y) x1) && clobberIfDead(x0) => (InvertFlags (CMPshiftLL x1 y [c]))
+(CMP x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (CMPshiftRL x0 y [c])
+(CMP x0:(SRLconst [c] y) x1) && clobberIfDead(x0) => (InvertFlags (CMPshiftRL x1 y [c]))
+(CMP x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (CMPshiftRA x0 y [c])
+(CMP x0:(SRAconst [c] y) x1) && clobberIfDead(x0) => (InvertFlags (CMPshiftRA x1 y [c]))
+(CMN x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (CMNshiftLL x0 y [c])
+(CMN x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (CMNshiftRL x0 y [c])
+(CMN x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (CMNshiftRA x0 y [c])
+(TST x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (TSTshiftLL x0 y [c])
+(TST x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (TSTshiftRL x0 y [c])
+(TST x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (TSTshiftRA x0 y [c])
+
+// prefer *const ops to *shift ops
+(ADDshiftLL (MOVDconst [c]) x [d]) => (ADDconst [c] (SLLconst <x.Type> x [d]))
+(ADDshiftRL (MOVDconst [c]) x [d]) => (ADDconst [c] (SRLconst <x.Type> x [d]))
+(ADDshiftRA (MOVDconst [c]) x [d]) => (ADDconst [c] (SRAconst <x.Type> x [d]))
+(ANDshiftLL (MOVDconst [c]) x [d]) => (ANDconst [c] (SLLconst <x.Type> x [d]))
+(ANDshiftRL (MOVDconst [c]) x [d]) => (ANDconst [c] (SRLconst <x.Type> x [d]))
+(ANDshiftRA (MOVDconst [c]) x [d]) => (ANDconst [c] (SRAconst <x.Type> x [d]))
+(ORshiftLL (MOVDconst [c]) x [d]) => (ORconst [c] (SLLconst <x.Type> x [d]))
+(ORshiftRL (MOVDconst [c]) x [d]) => (ORconst [c] (SRLconst <x.Type> x [d]))
+(ORshiftRA (MOVDconst [c]) x [d]) => (ORconst [c] (SRAconst <x.Type> x [d]))
+(XORshiftLL (MOVDconst [c]) x [d]) => (XORconst [c] (SLLconst <x.Type> x [d]))
+(XORshiftRL (MOVDconst [c]) x [d]) => (XORconst [c] (SRLconst <x.Type> x [d]))
+(XORshiftRA (MOVDconst [c]) x [d]) => (XORconst [c] (SRAconst <x.Type> x [d]))
+(CMPshiftLL (MOVDconst [c]) x [d]) => (InvertFlags (CMPconst [c] (SLLconst <x.Type> x [d])))
+(CMPshiftRL (MOVDconst [c]) x [d]) => (InvertFlags (CMPconst [c] (SRLconst <x.Type> x [d])))
+(CMPshiftRA (MOVDconst [c]) x [d]) => (InvertFlags (CMPconst [c] (SRAconst <x.Type> x [d])))
+(CMNshiftLL (MOVDconst [c]) x [d]) => (CMNconst [c] (SLLconst <x.Type> x [d]))
+(CMNshiftRL (MOVDconst [c]) x [d]) => (CMNconst [c] (SRLconst <x.Type> x [d]))
+(CMNshiftRA (MOVDconst [c]) x [d]) => (CMNconst [c] (SRAconst <x.Type> x [d]))
+(TSTshiftLL (MOVDconst [c]) x [d]) => (TSTconst [c] (SLLconst <x.Type> x [d]))
+(TSTshiftRL (MOVDconst [c]) x [d]) => (TSTconst [c] (SRLconst <x.Type> x [d]))
+(TSTshiftRA (MOVDconst [c]) x [d]) => (TSTconst [c] (SRAconst <x.Type> x [d]))
+
+// constant folding in *shift ops
+(MVNshiftLL (MOVDconst [c]) [d]) => (MOVDconst [^int64(uint64(c)<<uint64(d))])
+(MVNshiftRL (MOVDconst [c]) [d]) => (MOVDconst [^int64(uint64(c)>>uint64(d))])
+(MVNshiftRA (MOVDconst [c]) [d]) => (MOVDconst [^(c>>uint64(d))])
+(NEGshiftLL (MOVDconst [c]) [d]) => (MOVDconst [-int64(uint64(c)<<uint64(d))])
+(NEGshiftRL (MOVDconst [c]) [d]) => (MOVDconst [-int64(uint64(c)>>uint64(d))])
+(NEGshiftRA (MOVDconst [c]) [d]) => (MOVDconst [-(c>>uint64(d))])
+(ADDshiftLL x (MOVDconst [c]) [d]) => (ADDconst x [int64(uint64(c)<<uint64(d))])
+(ADDshiftRL x (MOVDconst [c]) [d]) => (ADDconst x [int64(uint64(c)>>uint64(d))])
+(ADDshiftRA x (MOVDconst [c]) [d]) => (ADDconst x [c>>uint64(d)])
+(SUBshiftLL x (MOVDconst [c]) [d]) => (SUBconst x [int64(uint64(c)<<uint64(d))])
+(SUBshiftRL x (MOVDconst [c]) [d]) => (SUBconst x [int64(uint64(c)>>uint64(d))])
+(SUBshiftRA x (MOVDconst [c]) [d]) => (SUBconst x [c>>uint64(d)])
+(ANDshiftLL x (MOVDconst [c]) [d]) => (ANDconst x [int64(uint64(c)<<uint64(d))])
+(ANDshiftRL x (MOVDconst [c]) [d]) => (ANDconst x [int64(uint64(c)>>uint64(d))])
+(ANDshiftRA x (MOVDconst [c]) [d]) => (ANDconst x [c>>uint64(d)])
+(ORshiftLL x (MOVDconst [c]) [d]) => (ORconst x [int64(uint64(c)<<uint64(d))])
+(ORshiftRL x (MOVDconst [c]) [d]) => (ORconst x [int64(uint64(c)>>uint64(d))])
+(ORshiftRA x (MOVDconst [c]) [d]) => (ORconst x [c>>uint64(d)])
+(XORshiftLL x (MOVDconst [c]) [d]) => (XORconst x [int64(uint64(c)<<uint64(d))])
+(XORshiftRL x (MOVDconst [c]) [d]) => (XORconst x [int64(uint64(c)>>uint64(d))])
+(XORshiftRA x (MOVDconst [c]) [d]) => (XORconst x [c>>uint64(d)])
+(BICshiftLL x (MOVDconst [c]) [d]) => (ANDconst x [^int64(uint64(c)<<uint64(d))])
+(BICshiftRL x (MOVDconst [c]) [d]) => (ANDconst x [^int64(uint64(c)>>uint64(d))])
+(BICshiftRA x (MOVDconst [c]) [d]) => (ANDconst x [^(c>>uint64(d))])
+(ORNshiftLL x (MOVDconst [c]) [d]) => (ORconst x [^int64(uint64(c)<<uint64(d))])
+(ORNshiftRL x (MOVDconst [c]) [d]) => (ORconst x [^int64(uint64(c)>>uint64(d))])
+(ORNshiftRA x (MOVDconst [c]) [d]) => (ORconst x [^(c>>uint64(d))])
+(EONshiftLL x (MOVDconst [c]) [d]) => (XORconst x [^int64(uint64(c)<<uint64(d))])
+(EONshiftRL x (MOVDconst [c]) [d]) => (XORconst x [^int64(uint64(c)>>uint64(d))])
+(EONshiftRA x (MOVDconst [c]) [d]) => (XORconst x [^(c>>uint64(d))])
+(CMPshiftLL x (MOVDconst [c]) [d]) => (CMPconst x [int64(uint64(c)<<uint64(d))])
+(CMPshiftRL x (MOVDconst [c]) [d]) => (CMPconst x [int64(uint64(c)>>uint64(d))])
+(CMPshiftRA x (MOVDconst [c]) [d]) => (CMPconst x [c>>uint64(d)])
+(CMNshiftLL x (MOVDconst [c]) [d]) => (CMNconst x [int64(uint64(c)<<uint64(d))])
+(CMNshiftRL x (MOVDconst [c]) [d]) => (CMNconst x [int64(uint64(c)>>uint64(d))])
+(CMNshiftRA x (MOVDconst [c]) [d]) => (CMNconst x [c>>uint64(d)])
+(TSTshiftLL x (MOVDconst [c]) [d]) => (TSTconst x [int64(uint64(c)<<uint64(d))])
+(TSTshiftRL x (MOVDconst [c]) [d]) => (TSTconst x [int64(uint64(c)>>uint64(d))])
+(TSTshiftRA x (MOVDconst [c]) [d]) => (TSTconst x [c>>uint64(d)])
+
+// simplification with *shift ops
+(SUBshiftLL (SLLconst x [c]) x [c]) => (MOVDconst [0])
+(SUBshiftRL (SRLconst x [c]) x [c]) => (MOVDconst [0])
+(SUBshiftRA (SRAconst x [c]) x [c]) => (MOVDconst [0])
+(ANDshiftLL y:(SLLconst x [c]) x [c]) => y
+(ANDshiftRL y:(SRLconst x [c]) x [c]) => y
+(ANDshiftRA y:(SRAconst x [c]) x [c]) => y
+(ORshiftLL y:(SLLconst x [c]) x [c]) => y
+(ORshiftRL y:(SRLconst x [c]) x [c]) => y
+(ORshiftRA y:(SRAconst x [c]) x [c]) => y
+(XORshiftLL (SLLconst x [c]) x [c]) => (MOVDconst [0])
+(XORshiftRL (SRLconst x [c]) x [c]) => (MOVDconst [0])
+(XORshiftRA (SRAconst x [c]) x [c]) => (MOVDconst [0])
+(BICshiftLL (SLLconst x [c]) x [c]) => (MOVDconst [0])
+(BICshiftRL (SRLconst x [c]) x [c]) => (MOVDconst [0])
+(BICshiftRA (SRAconst x [c]) x [c]) => (MOVDconst [0])
+(EONshiftLL (SLLconst x [c]) x [c]) => (MOVDconst [-1])
+(EONshiftRL (SRLconst x [c]) x [c]) => (MOVDconst [-1])
+(EONshiftRA (SRAconst x [c]) x [c]) => (MOVDconst [-1])
+(ORNshiftLL (SLLconst x [c]) x [c]) => (MOVDconst [-1])
+(ORNshiftRL (SRLconst x [c]) x [c]) => (MOVDconst [-1])
+(ORNshiftRA (SRAconst x [c]) x [c]) => (MOVDconst [-1])
+
+// Generate rotates with const shift
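+// e.g. x<<10 | x>>54 is a left rotate by 10, i.e. a right rotate by 54 (illustrative).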
+(ADDshiftLL [c] (SRLconst x [64-c]) x) => (RORconst [64-c] x)
+( ORshiftLL [c] (SRLconst x [64-c]) x) => (RORconst [64-c] x)
+(XORshiftLL [c] (SRLconst x [64-c]) x) => (RORconst [64-c] x)
+(ADDshiftRL [c] (SLLconst x [64-c]) x) => (RORconst [ c] x)
+( ORshiftRL [c] (SLLconst x [64-c]) x) => (RORconst [ c] x)
+(XORshiftRL [c] (SLLconst x [64-c]) x) => (RORconst [ c] x)
+
+(ADDshiftLL <t> [c] (UBFX [bfc] x) x) && c < 32 && t.Size() == 4 && bfc == armBFAuxInt(32-c, c)
+ => (RORWconst [32-c] x)
+( ORshiftLL <t> [c] (UBFX [bfc] x) x) && c < 32 && t.Size() == 4 && bfc == armBFAuxInt(32-c, c)
+ => (RORWconst [32-c] x)
+(XORshiftLL <t> [c] (UBFX [bfc] x) x) && c < 32 && t.Size() == 4 && bfc == armBFAuxInt(32-c, c)
+ => (RORWconst [32-c] x)
+(ADDshiftRL <t> [c] (SLLconst x [32-c]) (MOVWUreg x)) && c < 32 && t.Size() == 4 => (RORWconst [c] x)
+( ORshiftRL <t> [c] (SLLconst x [32-c]) (MOVWUreg x)) && c < 32 && t.Size() == 4 => (RORWconst [c] x)
+(XORshiftRL <t> [c] (SLLconst x [32-c]) (MOVWUreg x)) && c < 32 && t.Size() == 4 => (RORWconst [c] x)
+
+(RORconst [c] (RORconst [d] x)) => (RORconst [(c+d)&63] x)
+(RORWconst [c] (RORWconst [d] x)) => (RORWconst [(c+d)&31] x)
+
+// Generate rotates with non-const shift.
+// These rules match the Go source code like
+// y &= 63
+// x << y | x >> (64-y)
+// "|" can also be "^" or "+".
+// Since arm64 does not have a ROL instruction, ROL(x, y) is replaced by ROR(x, -y).
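+// A minimal Go sketch of the matched source pattern (names illustrative):
+//	func rotl64(x uint64, y uint) uint64 {
+//		y &= 63
+//		return x<<y | x>>(64-y)
+//	}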
+((ADD|OR|XOR) (SLL x (ANDconst <t> [63] y))
+ (CSEL0 <typ.UInt64> [cc] (SRL <typ.UInt64> x (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y)))
+ (CMPconst [64] (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y))))) && cc == OpARM64LessThanU
+ => (ROR x (NEG <t> y))
+((ADD|OR|XOR) (SRL <typ.UInt64> x (ANDconst <t> [63] y))
+ (CSEL0 <typ.UInt64> [cc] (SLL x (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y)))
+ (CMPconst [64] (SUB <t> (MOVDconst [64]) (ANDconst <t> [63] y))))) && cc == OpARM64LessThanU
+ => (ROR x y)
+
+// These rules match the Go source code like
+// y &= 31
+// x << y | x >> (32-y)
+// "|" can also be "^" or "+".
+// Since arm64 does not have a ROLW instruction, ROLW(x, y) is replaced by RORW(x, -y).
+((ADD|OR|XOR) (SLL x (ANDconst <t> [31] y))
+ (CSEL0 <typ.UInt32> [cc] (SRL <typ.UInt32> (MOVWUreg x) (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y)))
+ (CMPconst [64] (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y))))) && cc == OpARM64LessThanU
+ => (RORW x (NEG <t> y))
+((ADD|OR|XOR) (SRL <typ.UInt32> (MOVWUreg x) (ANDconst <t> [31] y))
+ (CSEL0 <typ.UInt32> [cc] (SLL x (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y)))
+ (CMPconst [64] (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y))))) && cc == OpARM64LessThanU
+ => (RORW x y)
+
+// ((x>>8) | (x<<8)) => (REV16W x), where x has type uint16; "|" can also be "^" or "+".
+((ADDshiftLL|ORshiftLL|XORshiftLL) <typ.UInt16> [8] (UBFX <typ.UInt16> [armBFAuxInt(8, 8)] x) x) => (REV16W x)
+
+// Extract from reg pair
+(ADDshiftLL [c] (SRLconst x [64-c]) x2) => (EXTRconst [64-c] x2 x)
+( ORshiftLL [c] (SRLconst x [64-c]) x2) => (EXTRconst [64-c] x2 x)
+(XORshiftLL [c] (SRLconst x [64-c]) x2) => (EXTRconst [64-c] x2 x)
+
+(ADDshiftLL <t> [c] (UBFX [bfc] x) x2) && c < 32 && t.Size() == 4 && bfc == armBFAuxInt(32-c, c)
+ => (EXTRWconst [32-c] x2 x)
+( ORshiftLL <t> [c] (UBFX [bfc] x) x2) && c < 32 && t.Size() == 4 && bfc == armBFAuxInt(32-c, c)
+ => (EXTRWconst [32-c] x2 x)
+(XORshiftLL <t> [c] (UBFX [bfc] x) x2) && c < 32 && t.Size() == 4 && bfc == armBFAuxInt(32-c, c)
+ => (EXTRWconst [32-c] x2 x)
+
+// Rewrite special pairs of shifts to AND.
+// On ARM64 the bitmask can fit into an instruction.
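+// e.g. (x<<56)>>56 => x & 0xff, and (x>>3)<<3 => x &^ 7 (illustrative).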
+(SRLconst [c] (SLLconst [c] x)) && 0 < c && c < 64 => (ANDconst [1<<uint(64-c)-1] x) // mask out high bits
+(SLLconst [c] (SRLconst [c] x)) && 0 < c && c < 64 => (ANDconst [^(1<<uint(c)-1)] x) // mask out low bits
+
+// Special case of setting bits to 1: if every bit cleared by the AND is set again by the OR, the AND is redundant. An example is math.Copysign(c,-1).
+(ORconst [c1] (ANDconst [c2] x)) && c2|c1 == ^0 => (ORconst [c1] x)
+
+// bitfield ops
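+// SBFIZ/UBFIZ copy the low <width> bits of the source to bit <lsb>, zeroing
+// below and sign-/zero-extending above; SBFX/UBFX extract <width> bits
+// starting at bit <lsb> and sign-/zero-extend them into the result.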
+
+// sbfiz
+// (x << lc) >> rc
+(SRAconst [rc] (SLLconst [lc] x)) && lc > rc => (SBFIZ [armBFAuxInt(lc-rc, 64-lc)] x)
+(MOVWreg (SLLconst [lc] x)) && lc < 32 => (SBFIZ [armBFAuxInt(lc, 32-lc)] x)
+(MOVHreg (SLLconst [lc] x)) && lc < 16 => (SBFIZ [armBFAuxInt(lc, 16-lc)] x)
+(MOVBreg (SLLconst [lc] x)) && lc < 8 => (SBFIZ [armBFAuxInt(lc, 8-lc)] x)
+
+// sbfx
+// (x << lc) >> rc
+(SRAconst [rc] (SLLconst [lc] x)) && lc <= rc => (SBFX [armBFAuxInt(rc-lc, 64-rc)] x)
+(SRAconst [rc] (MOVWreg x)) && rc < 32 => (SBFX [armBFAuxInt(rc, 32-rc)] x)
+(SRAconst [rc] (MOVHreg x)) && rc < 16 => (SBFX [armBFAuxInt(rc, 16-rc)] x)
+(SRAconst [rc] (MOVBreg x)) && rc < 8 => (SBFX [armBFAuxInt(rc, 8-rc)] x)
+
+// sbfiz/sbfx combinations: merge shifts into bitfield ops
+(SRAconst [sc] (SBFIZ [bfc] x)) && sc < bfc.getARM64BFlsb()
+ => (SBFIZ [armBFAuxInt(bfc.getARM64BFlsb()-sc, bfc.getARM64BFwidth())] x)
+(SRAconst [sc] (SBFIZ [bfc] x)) && sc >= bfc.getARM64BFlsb()
+ && sc < bfc.getARM64BFlsb()+bfc.getARM64BFwidth()
+ => (SBFX [armBFAuxInt(sc-bfc.getARM64BFlsb(), bfc.getARM64BFlsb()+bfc.getARM64BFwidth()-sc)] x)
+
+// ubfiz
+// (x & ac) << sc
+(SLLconst [sc] (ANDconst [ac] x)) && isARM64BFMask(sc, ac, 0)
+ => (UBFIZ [armBFAuxInt(sc, arm64BFWidth(ac, 0))] x)
+(SLLconst [sc] (MOVWUreg x)) && isARM64BFMask(sc, 1<<32-1, 0) => (UBFIZ [armBFAuxInt(sc, 32)] x)
+(SLLconst [sc] (MOVHUreg x)) && isARM64BFMask(sc, 1<<16-1, 0) => (UBFIZ [armBFAuxInt(sc, 16)] x)
+(SLLconst [sc] (MOVBUreg x)) && isARM64BFMask(sc, 1<<8-1, 0) => (UBFIZ [armBFAuxInt(sc, 8)] x)
+// (x << sc) & ac
+(ANDconst [ac] (SLLconst [sc] x)) && isARM64BFMask(sc, ac, sc)
+ => (UBFIZ [armBFAuxInt(sc, arm64BFWidth(ac, sc))] x)
+(MOVWUreg (SLLconst [sc] x)) && isARM64BFMask(sc, 1<<32-1, sc)
+ => (UBFIZ [armBFAuxInt(sc, arm64BFWidth(1<<32-1, sc))] x)
+(MOVHUreg (SLLconst [sc] x)) && isARM64BFMask(sc, 1<<16-1, sc)
+ => (UBFIZ [armBFAuxInt(sc, arm64BFWidth(1<<16-1, sc))] x)
+(MOVBUreg (SLLconst [sc] x)) && isARM64BFMask(sc, 1<<8-1, sc)
+ => (UBFIZ [armBFAuxInt(sc, arm64BFWidth(1<<8-1, sc))] x)
+// (x << lc) >> rc
+(SRLconst [rc] (SLLconst [lc] x)) && lc > rc => (UBFIZ [armBFAuxInt(lc-rc, 64-lc)] x)
+
+// ubfx
+// (x >> sc) & ac
+(ANDconst [ac] (SRLconst [sc] x)) && isARM64BFMask(sc, ac, 0)
+ => (UBFX [armBFAuxInt(sc, arm64BFWidth(ac, 0))] x)
+(MOVWUreg (SRLconst [sc] x)) && isARM64BFMask(sc, 1<<32-1, 0) => (UBFX [armBFAuxInt(sc, 32)] x)
+(MOVHUreg (SRLconst [sc] x)) && isARM64BFMask(sc, 1<<16-1, 0) => (UBFX [armBFAuxInt(sc, 16)] x)
+(MOVBUreg (SRLconst [sc] x)) && isARM64BFMask(sc, 1<<8-1, 0) => (UBFX [armBFAuxInt(sc, 8)] x)
+// (x & ac) >> sc
+(SRLconst [sc] (ANDconst [ac] x)) && isARM64BFMask(sc, ac, sc)
+ => (UBFX [armBFAuxInt(sc, arm64BFWidth(ac, sc))] x)
+(SRLconst [sc] (MOVWUreg x)) && isARM64BFMask(sc, 1<<32-1, sc)
+ => (UBFX [armBFAuxInt(sc, arm64BFWidth(1<<32-1, sc))] x)
+(SRLconst [sc] (MOVHUreg x)) && isARM64BFMask(sc, 1<<16-1, sc)
+ => (UBFX [armBFAuxInt(sc, arm64BFWidth(1<<16-1, sc))] x)
+(SRLconst [sc] (MOVBUreg x)) && isARM64BFMask(sc, 1<<8-1, sc)
+ => (UBFX [armBFAuxInt(sc, arm64BFWidth(1<<8-1, sc))] x)
+// (x << lc) >> rc
+(SRLconst [rc] (SLLconst [lc] x)) && lc < rc => (UBFX [armBFAuxInt(rc-lc, 64-rc)] x)
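+// Worked example (illustrative values): (x & 0xFF) << 4 matches the ubfiz rules above with
+// sc=4, ac=0xFF and becomes UBFIZ [armBFAuxInt(4, 8)] x (insert the low 8 bits of x at
+// bit 4, rest zero); conversely (x >> 4) & 0xFF matches the ubfx rules and becomes
+// UBFX [armBFAuxInt(4, 8)] x (extract 8 bits starting at bit 4, zero-extended).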
+
+// ubfiz/ubfx combinations: merge shifts into bitfield ops
+(SRLconst [sc] (UBFX [bfc] x)) && sc < bfc.getARM64BFwidth()
+ => (UBFX [armBFAuxInt(bfc.getARM64BFlsb()+sc, bfc.getARM64BFwidth()-sc)] x)
+(UBFX [bfc] (SRLconst [sc] x)) && sc+bfc.getARM64BFwidth()+bfc.getARM64BFlsb() < 64
+ => (UBFX [armBFAuxInt(bfc.getARM64BFlsb()+sc, bfc.getARM64BFwidth())] x)
+(SLLconst [sc] (UBFIZ [bfc] x)) && sc+bfc.getARM64BFwidth()+bfc.getARM64BFlsb() < 64
+ => (UBFIZ [armBFAuxInt(bfc.getARM64BFlsb()+sc, bfc.getARM64BFwidth())] x)
+(UBFIZ [bfc] (SLLconst [sc] x)) && sc < bfc.getARM64BFwidth()
+ => (UBFIZ [armBFAuxInt(bfc.getARM64BFlsb()+sc, bfc.getARM64BFwidth()-sc)] x)
+// ((x << c1) >> c2) >> c3
+(SRLconst [sc] (UBFIZ [bfc] x)) && sc == bfc.getARM64BFlsb()
+ => (ANDconst [1<<uint(bfc.getARM64BFwidth())-1] x)
+(SRLconst [sc] (UBFIZ [bfc] x)) && sc < bfc.getARM64BFlsb()
+ => (UBFIZ [armBFAuxInt(bfc.getARM64BFlsb()-sc, bfc.getARM64BFwidth())] x)
+(SRLconst [sc] (UBFIZ [bfc] x)) && sc > bfc.getARM64BFlsb()
+ && sc < bfc.getARM64BFlsb()+bfc.getARM64BFwidth()
+ => (UBFX [armBFAuxInt(sc-bfc.getARM64BFlsb(), bfc.getARM64BFlsb()+bfc.getARM64BFwidth()-sc)] x)
+// ((x << c1) << c2) >> c3
+(UBFX [bfc] (SLLconst [sc] x)) && sc == bfc.getARM64BFlsb()
+ => (ANDconst [1<<uint(bfc.getARM64BFwidth())-1] x)
+(UBFX [bfc] (SLLconst [sc] x)) && sc < bfc.getARM64BFlsb()
+ => (UBFX [armBFAuxInt(bfc.getARM64BFlsb()-sc, bfc.getARM64BFwidth())] x)
+(UBFX [bfc] (SLLconst [sc] x)) && sc > bfc.getARM64BFlsb()
+ && sc < bfc.getARM64BFlsb()+bfc.getARM64BFwidth()
+ => (UBFIZ [armBFAuxInt(sc-bfc.getARM64BFlsb(), bfc.getARM64BFlsb()+bfc.getARM64BFwidth()-sc)] x)
+
+// bfi
+(OR (UBFIZ [bfc] x) (ANDconst [ac] y))
+ && ac == ^((1<<uint(bfc.getARM64BFwidth())-1) << uint(bfc.getARM64BFlsb()))
+ => (BFI [bfc] y x)
+(ORshiftRL [rc] (ANDconst [ac] x) (SLLconst [lc] y))
+ && lc > rc && ac == ^((1<<uint(64-lc)-1) << uint64(lc-rc))
+ => (BFI [armBFAuxInt(lc-rc, 64-lc)] x y)
+// bfxil
+(OR (UBFX [bfc] x) (ANDconst [ac] y)) && ac == ^(1<<uint(bfc.getARM64BFwidth())-1)
+ => (BFXIL [bfc] y x)
+(ORshiftLL [sc] (UBFX [bfc] x) (SRLconst [sc] y)) && sc == bfc.getARM64BFwidth()
+ => (BFXIL [bfc] y x)
+(ORshiftRL [rc] (ANDconst [ac] y) (SLLconst [lc] x)) && lc < rc && ac == ^((1<<uint(64-rc)-1))
+ => (BFXIL [armBFAuxInt(rc-lc, 64-rc)] y x)
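+// Worked example (illustrative values): with bfc = armBFAuxInt(8, 8), the bfi rule above
+// matches (y &^ 0xFF00) | ((x & 0xFF) << 8), since ac = ^(0xFF<<8); the low 8 bits of x
+// are inserted into bits 8..15 of y, a single BFI instruction.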
+
+// do combined loads
+// little endian loads
+// b[0] | b[1]<<8 => load 16-bit
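+// For example, a Go expression such as uint16(b[0]) | uint16(b[1])<<8 (the body of
+// binary.LittleEndian.Uint16) produces this OR-of-shifted-byte-loads shape, which the
+// rules below collapse into a single unaligned halfword load.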
+(ORshiftLL <t> [8]
+ y0:(MOVDnop x0:(MOVBUload [i0] {s} p mem))
+ y1:(MOVDnop x1:(MOVBUload [i1] {s} p mem)))
+ && i1 == i0+1
+ && x0.Uses == 1 && x1.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, y0, y1)
+ => @mergePoint(b,x0,x1) (MOVHUload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem)
+(ORshiftLL <t> [8]
+ y0:(MOVDnop x0:(MOVBUloadidx ptr0 idx0 mem))
+ y1:(MOVDnop x1:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x0, x1, y0, y1)
+ => @mergePoint(b,x0,x1) (MOVHUloadidx <t> ptr0 idx0 mem)
+(ORshiftLL <t> [8]
+ y0:(MOVDnop x0:(MOVBUloadidx ptr idx mem))
+ y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [1] idx) mem)))
+ && x0.Uses == 1 && x1.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, y0, y1)
+ => @mergePoint(b,x0,x1) (MOVHUloadidx <t> ptr idx mem)
+
+// b[0] | b[1]<<8 | b[2]<<16 | b[3]<<24 => load 32-bit
+(ORshiftLL <t> [24] o0:(ORshiftLL [16]
+ x0:(MOVHUload [i0] {s} p mem)
+ y1:(MOVDnop x1:(MOVBUload [i2] {s} p mem)))
+ y2:(MOVDnop x2:(MOVBUload [i3] {s} p mem)))
+ && i2 == i0+2
+ && i3 == i0+3
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && y1.Uses == 1 && y2.Uses == 1
+ && o0.Uses == 1
+ && mergePoint(b,x0,x1,x2) != nil
+ && clobber(x0, x1, x2, y1, y2, o0)
+ => @mergePoint(b,x0,x1,x2) (MOVWUload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem)
+(ORshiftLL <t> [24] o0:(ORshiftLL [16]
+ x0:(MOVHUloadidx ptr0 idx0 mem)
+ y1:(MOVDnop x1:(MOVBUload [2] {s} p1:(ADD ptr1 idx1) mem)))
+ y2:(MOVDnop x2:(MOVBUload [3] {s} p mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && y1.Uses == 1 && y2.Uses == 1
+ && o0.Uses == 1
+ && mergePoint(b,x0,x1,x2) != nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2, y1, y2, o0)
+ => @mergePoint(b,x0,x1,x2) (MOVWUloadidx <t> ptr0 idx0 mem)
+(ORshiftLL <t> [24] o0:(ORshiftLL [16]
+ x0:(MOVHUloadidx ptr idx mem)
+ y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [2] idx) mem)))
+ y2:(MOVDnop x2:(MOVBUloadidx ptr (ADDconst [3] idx) mem)))
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && y1.Uses == 1 && y2.Uses == 1
+ && o0.Uses == 1
+ && mergePoint(b,x0,x1,x2) != nil
+ && clobber(x0, x1, x2, y1, y2, o0)
+ => @mergePoint(b,x0,x1,x2) (MOVWUloadidx <t> ptr idx mem)
+(ORshiftLL <t> [24] o0:(ORshiftLL [16]
+ x0:(MOVHUloadidx2 ptr0 idx0 mem)
+ y1:(MOVDnop x1:(MOVBUload [2] {s} p1:(ADDshiftLL [1] ptr1 idx1) mem)))
+ y2:(MOVDnop x2:(MOVBUload [3] {s} p mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && y1.Uses == 1 && y2.Uses == 1
+ && o0.Uses == 1
+ && mergePoint(b,x0,x1,x2) != nil
+ && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1)
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2, y1, y2, o0)
+ => @mergePoint(b,x0,x1,x2) (MOVWUloadidx <t> ptr0 (SLLconst <idx0.Type> [1] idx0) mem)
+
+// b[0] | b[1]<<8 | b[2]<<16 | b[3]<<24 | b[4]<<32 | b[5]<<40 | b[6]<<48 | b[7]<<56 => load 64-bit
+(ORshiftLL <t> [56] o0:(ORshiftLL [48] o1:(ORshiftLL [40] o2:(ORshiftLL [32]
+ x0:(MOVWUload [i0] {s} p mem)
+ y1:(MOVDnop x1:(MOVBUload [i4] {s} p mem)))
+ y2:(MOVDnop x2:(MOVBUload [i5] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [i6] {s} p mem)))
+ y4:(MOVDnop x4:(MOVBUload [i7] {s} p mem)))
+ && i4 == i0+4
+ && i5 == i0+5
+ && i6 == i0+6
+ && i7 == i0+7
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1
+ && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && y4.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4) != nil
+ && clobber(x0, x1, x2, x3, x4, y1, y2, y3, y4, o0, o1, o2)
+ => @mergePoint(b,x0,x1,x2,x3,x4) (MOVDload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem)
+(ORshiftLL <t> [56] o0:(ORshiftLL [48] o1:(ORshiftLL [40] o2:(ORshiftLL [32]
+ x0:(MOVWUloadidx ptr0 idx0 mem)
+ y1:(MOVDnop x1:(MOVBUload [4] {s} p1:(ADD ptr1 idx1) mem)))
+ y2:(MOVDnop x2:(MOVBUload [5] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [6] {s} p mem)))
+ y4:(MOVDnop x4:(MOVBUload [7] {s} p mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1
+ && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && y4.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4) != nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2, x3, x4, y1, y2, y3, y4, o0, o1, o2)
+ => @mergePoint(b,x0,x1,x2,x3,x4) (MOVDloadidx <t> ptr0 idx0 mem)
+(ORshiftLL <t> [56] o0:(ORshiftLL [48] o1:(ORshiftLL [40] o2:(ORshiftLL [32]
+ x0:(MOVWUloadidx4 ptr0 idx0 mem)
+ y1:(MOVDnop x1:(MOVBUload [4] {s} p1:(ADDshiftLL [2] ptr1 idx1) mem)))
+ y2:(MOVDnop x2:(MOVBUload [5] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [6] {s} p mem)))
+ y4:(MOVDnop x4:(MOVBUload [7] {s} p mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1
+ && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && y4.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4) != nil
+ && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1)
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2, x3, x4, y1, y2, y3, y4, o0, o1, o2)
+ => @mergePoint(b,x0,x1,x2,x3,x4) (MOVDloadidx <t> ptr0 (SLLconst <idx0.Type> [2] idx0) mem)
+(ORshiftLL <t> [56] o0:(ORshiftLL [48] o1:(ORshiftLL [40] o2:(ORshiftLL [32]
+ x0:(MOVWUloadidx ptr idx mem)
+ y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [4] idx) mem)))
+ y2:(MOVDnop x2:(MOVBUloadidx ptr (ADDconst [5] idx) mem)))
+ y3:(MOVDnop x3:(MOVBUloadidx ptr (ADDconst [6] idx) mem)))
+ y4:(MOVDnop x4:(MOVBUloadidx ptr (ADDconst [7] idx) mem)))
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1
+ && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && y4.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4) != nil
+ && clobber(x0, x1, x2, x3, x4, y1, y2, y3, y4, o0, o1, o2)
+ => @mergePoint(b,x0,x1,x2,x3,x4) (MOVDloadidx <t> ptr idx mem)
+
+// b[3]<<24 | b[2]<<16 | b[1]<<8 | b[0] => load 32-bit
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] s0:(SLLconst [24]
+ y0:(MOVDnop x0:(MOVBUload [i3] {s} p mem)))
+ y1:(MOVDnop x1:(MOVBUload [i2] {s} p mem)))
+ y2:(MOVDnop x2:(MOVBUload [i1] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [i0] {s} p mem)))
+ && i1 == i0+1
+ && i2 == i0+2
+ && i3 == i0+3
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3) != nil
+ && clobber(x0, x1, x2, x3, y0, y1, y2, y3, o0, o1, s0)
+ => @mergePoint(b,x0,x1,x2,x3) (MOVWUload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem)
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] s0:(SLLconst [24]
+ y0:(MOVDnop x0:(MOVBUload [3] {s} p mem)))
+ y1:(MOVDnop x1:(MOVBUload [2] {s} p mem)))
+ y2:(MOVDnop x2:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem)))
+ y3:(MOVDnop x3:(MOVBUloadidx ptr0 idx0 mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3) != nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2, x3, y0, y1, y2, y3, o0, o1, s0)
+ => @mergePoint(b,x0,x1,x2,x3) (MOVWUloadidx <t> ptr0 idx0 mem)
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] s0:(SLLconst [24]
+ y0:(MOVDnop x0:(MOVBUloadidx ptr (ADDconst [3] idx) mem)))
+ y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [2] idx) mem)))
+ y2:(MOVDnop x2:(MOVBUloadidx ptr (ADDconst [1] idx) mem)))
+ y3:(MOVDnop x3:(MOVBUloadidx ptr idx mem)))
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3) != nil
+ && clobber(x0, x1, x2, x3, y0, y1, y2, y3, o0, o1, s0)
+ => @mergePoint(b,x0,x1,x2,x3) (MOVWUloadidx <t> ptr idx mem)
+
+// b[7]<<56 | b[6]<<48 | b[5]<<40 | b[4]<<32 | b[3]<<24 | b[2]<<16 | b[1]<<8 | b[0] => load 64-bit
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] o2:(ORshiftLL [24] o3:(ORshiftLL [32] o4:(ORshiftLL [40] o5:(ORshiftLL [48] s0:(SLLconst [56]
+ y0:(MOVDnop x0:(MOVBUload [i7] {s} p mem)))
+ y1:(MOVDnop x1:(MOVBUload [i6] {s} p mem)))
+ y2:(MOVDnop x2:(MOVBUload [i5] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [i4] {s} p mem)))
+ y4:(MOVDnop x4:(MOVBUload [i3] {s} p mem)))
+ y5:(MOVDnop x5:(MOVBUload [i2] {s} p mem)))
+ y6:(MOVDnop x6:(MOVBUload [i1] {s} p mem)))
+ y7:(MOVDnop x7:(MOVBUload [i0] {s} p mem)))
+ && i1 == i0+1
+ && i2 == i0+2
+ && i3 == i0+3
+ && i4 == i0+4
+ && i5 == i0+5
+ && i6 == i0+6
+ && i7 == i0+7
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && y4.Uses == 1 && y5.Uses == 1 && y6.Uses == 1 && y7.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1
+ && o4.Uses == 1 && o5.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil
+ && clobber(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7, o0, o1, o2, o3, o4, o5, s0)
+ => @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (MOVDload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem)
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] o2:(ORshiftLL [24] o3:(ORshiftLL [32] o4:(ORshiftLL [40] o5:(ORshiftLL [48] s0:(SLLconst [56]
+ y0:(MOVDnop x0:(MOVBUload [7] {s} p mem)))
+ y1:(MOVDnop x1:(MOVBUload [6] {s} p mem)))
+ y2:(MOVDnop x2:(MOVBUload [5] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [4] {s} p mem)))
+ y4:(MOVDnop x4:(MOVBUload [3] {s} p mem)))
+ y5:(MOVDnop x5:(MOVBUload [2] {s} p mem)))
+ y6:(MOVDnop x6:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem)))
+ y7:(MOVDnop x7:(MOVBUloadidx ptr0 idx0 mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && y4.Uses == 1 && y5.Uses == 1 && y6.Uses == 1 && y7.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1
+ && o4.Uses == 1 && o5.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7, o0, o1, o2, o3, o4, o5, s0)
+ => @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (MOVDloadidx <t> ptr0 idx0 mem)
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] o2:(ORshiftLL [24] o3:(ORshiftLL [32] o4:(ORshiftLL [40] o5:(ORshiftLL [48] s0:(SLLconst [56]
+ y0:(MOVDnop x0:(MOVBUloadidx ptr (ADDconst [7] idx) mem)))
+ y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [6] idx) mem)))
+ y2:(MOVDnop x2:(MOVBUloadidx ptr (ADDconst [5] idx) mem)))
+ y3:(MOVDnop x3:(MOVBUloadidx ptr (ADDconst [4] idx) mem)))
+ y4:(MOVDnop x4:(MOVBUloadidx ptr (ADDconst [3] idx) mem)))
+ y5:(MOVDnop x5:(MOVBUloadidx ptr (ADDconst [2] idx) mem)))
+ y6:(MOVDnop x6:(MOVBUloadidx ptr (ADDconst [1] idx) mem)))
+ y7:(MOVDnop x7:(MOVBUloadidx ptr idx mem)))
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && y4.Uses == 1 && y5.Uses == 1 && y6.Uses == 1 && y7.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1
+ && o4.Uses == 1 && o5.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil
+ && clobber(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7, o0, o1, o2, o3, o4, o5, s0)
+ => @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (MOVDloadidx <t> ptr idx mem)
+
+// big endian loads
+// b[1] | b[0]<<8 => load 16-bit, reverse
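+// For example, a Go expression such as uint16(b[1]) | uint16(b[0])<<8 (the body of
+// binary.BigEndian.Uint16) matches the rules below and is rewritten to a single
+// halfword load followed by a REV16W byte swap.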
+(ORshiftLL <t> [8]
+ y0:(MOVDnop x0:(MOVBUload [i1] {s} p mem))
+ y1:(MOVDnop x1:(MOVBUload [i0] {s} p mem)))
+ && i1 == i0+1
+ && x0.Uses == 1 && x1.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, y0, y1)
+ => @mergePoint(b,x0,x1) (REV16W <t> (MOVHUload <t> [i0] {s} p mem))
+(ORshiftLL <t> [8]
+ y0:(MOVDnop x0:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem))
+ y1:(MOVDnop x1:(MOVBUloadidx ptr0 idx0 mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x0, x1, y0, y1)
+ => @mergePoint(b,x0,x1) (REV16W <t> (MOVHUloadidx <t> ptr0 idx0 mem))
+(ORshiftLL <t> [8]
+ y0:(MOVDnop x0:(MOVBUloadidx ptr (ADDconst [1] idx) mem))
+ y1:(MOVDnop x1:(MOVBUloadidx ptr idx mem)))
+ && x0.Uses == 1 && x1.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, y0, y1)
+ => @mergePoint(b,x0,x1) (REV16W <t> (MOVHUloadidx <t> ptr idx mem))
+
+// b[3] | b[2]<<8 | b[1]<<16 | b[0]<<24 => load 32-bit, reverse
+(ORshiftLL <t> [24] o0:(ORshiftLL [16]
+ y0:(REV16W x0:(MOVHUload [i2] {s} p mem))
+ y1:(MOVDnop x1:(MOVBUload [i1] {s} p mem)))
+ y2:(MOVDnop x2:(MOVBUload [i0] {s} p mem)))
+ && i1 == i0+1
+ && i2 == i0+2
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1
+ && o0.Uses == 1
+ && mergePoint(b,x0,x1,x2) != nil
+ && clobber(x0, x1, x2, y0, y1, y2, o0)
+ => @mergePoint(b,x0,x1,x2) (REVW <t> (MOVWUload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem))
+(ORshiftLL <t> [24] o0:(ORshiftLL [16]
+ y0:(REV16W x0:(MOVHUload [2] {s} p mem))
+ y1:(MOVDnop x1:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem)))
+ y2:(MOVDnop x2:(MOVBUloadidx ptr0 idx0 mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1
+ && o0.Uses == 1
+ && mergePoint(b,x0,x1,x2) != nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2, y0, y1, y2, o0)
+ => @mergePoint(b,x0,x1,x2) (REVW <t> (MOVWUloadidx <t> ptr0 idx0 mem))
+(ORshiftLL <t> [24] o0:(ORshiftLL [16]
+ y0:(REV16W x0:(MOVHUloadidx ptr (ADDconst [2] idx) mem))
+ y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [1] idx) mem)))
+ y2:(MOVDnop x2:(MOVBUloadidx ptr idx mem)))
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1
+ && o0.Uses == 1
+ && mergePoint(b,x0,x1,x2) != nil
+ && clobber(x0, x1, x2, y0, y1, y2, o0)
+ => @mergePoint(b,x0,x1,x2) (REVW <t> (MOVWUloadidx <t> ptr idx mem))
+
+// b[7] | b[6]<<8 | b[5]<<16 | b[4]<<24 | b[3]<<32 | b[2]<<40 | b[1]<<48 | b[0]<<56 => load 64-bit, reverse
+(ORshiftLL <t> [56] o0:(ORshiftLL [48] o1:(ORshiftLL [40] o2:(ORshiftLL [32]
+ y0:(REVW x0:(MOVWUload [i4] {s} p mem))
+ y1:(MOVDnop x1:(MOVBUload [i3] {s} p mem)))
+ y2:(MOVDnop x2:(MOVBUload [i2] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [i1] {s} p mem)))
+ y4:(MOVDnop x4:(MOVBUload [i0] {s} p mem)))
+ && i1 == i0+1
+ && i2 == i0+2
+ && i3 == i0+3
+ && i4 == i0+4
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && y4.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4) != nil
+ && clobber(x0, x1, x2, x3, x4, y0, y1, y2, y3, y4, o0, o1, o2)
+ => @mergePoint(b,x0,x1,x2,x3,x4) (REV <t> (MOVDload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem))
+(ORshiftLL <t> [56] o0:(ORshiftLL [48] o1:(ORshiftLL [40] o2:(ORshiftLL [32]
+ y0:(REVW x0:(MOVWUload [4] {s} p mem))
+ y1:(MOVDnop x1:(MOVBUload [3] {s} p mem)))
+ y2:(MOVDnop x2:(MOVBUload [2] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem)))
+ y4:(MOVDnop x4:(MOVBUloadidx ptr0 idx0 mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && y4.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4) != nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2, x3, x4, y0, y1, y2, y3, y4, o0, o1, o2)
+ => @mergePoint(b,x0,x1,x2,x3,x4) (REV <t> (MOVDloadidx <t> ptr0 idx0 mem))
+(ORshiftLL <t> [56] o0:(ORshiftLL [48] o1:(ORshiftLL [40] o2:(ORshiftLL [32]
+ y0:(REVW x0:(MOVWUloadidx ptr (ADDconst [4] idx) mem))
+ y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [3] idx) mem)))
+ y2:(MOVDnop x2:(MOVBUloadidx ptr (ADDconst [2] idx) mem)))
+ y3:(MOVDnop x3:(MOVBUloadidx ptr (ADDconst [1] idx) mem)))
+ y4:(MOVDnop x4:(MOVBUloadidx ptr idx mem)))
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && y4.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4) != nil
+ && clobber(x0, x1, x2, x3, x4, y0, y1, y2, y3, y4, o0, o1, o2)
+ => @mergePoint(b,x0,x1,x2,x3,x4) (REV <t> (MOVDloadidx <t> ptr idx mem))
+
+// b[0]<<24 | b[1]<<16 | b[2]<<8 | b[3] => load 32-bit, reverse
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] s0:(SLLconst [24]
+ y0:(MOVDnop x0:(MOVBUload [i0] {s} p mem)))
+ y1:(MOVDnop x1:(MOVBUload [i1] {s} p mem)))
+ y2:(MOVDnop x2:(MOVBUload [i2] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [i3] {s} p mem)))
+ && i1 == i0+1
+ && i2 == i0+2
+ && i3 == i0+3
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3) != nil
+ && clobber(x0, x1, x2, x3, y0, y1, y2, y3, o0, o1, s0)
+ => @mergePoint(b,x0,x1,x2,x3) (REVW <t> (MOVWUload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem))
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] s0:(SLLconst [24]
+ y0:(MOVDnop x0:(MOVBUloadidx ptr0 idx0 mem)))
+ y1:(MOVDnop x1:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem)))
+ y2:(MOVDnop x2:(MOVBUload [2] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [3] {s} p mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3) != nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2, x3, y0, y1, y2, y3, o0, o1, s0)
+ => @mergePoint(b,x0,x1,x2,x3) (REVW <t> (MOVWUloadidx <t> ptr0 idx0 mem))
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] s0:(SLLconst [24]
+ y0:(MOVDnop x0:(MOVBUloadidx ptr idx mem)))
+ y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [1] idx) mem)))
+ y2:(MOVDnop x2:(MOVBUloadidx ptr (ADDconst [2] idx) mem)))
+ y3:(MOVDnop x3:(MOVBUloadidx ptr (ADDconst [3] idx) mem)))
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3) != nil
+ && clobber(x0, x1, x2, x3, y0, y1, y2, y3, o0, o1, s0)
+ => @mergePoint(b,x0,x1,x2,x3) (REVW <t> (MOVWUloadidx <t> ptr idx mem))
+
+// b[0]<<56 | b[1]<<48 | b[2]<<40 | b[3]<<32 | b[4]<<24 | b[5]<<16 | b[6]<<8 | b[7] => load 64-bit, reverse
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] o2:(ORshiftLL [24] o3:(ORshiftLL [32] o4:(ORshiftLL [40] o5:(ORshiftLL [48] s0:(SLLconst [56]
+ y0:(MOVDnop x0:(MOVBUload [i0] {s} p mem)))
+ y1:(MOVDnop x1:(MOVBUload [i1] {s} p mem)))
+ y2:(MOVDnop x2:(MOVBUload [i2] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [i3] {s} p mem)))
+ y4:(MOVDnop x4:(MOVBUload [i4] {s} p mem)))
+ y5:(MOVDnop x5:(MOVBUload [i5] {s} p mem)))
+ y6:(MOVDnop x6:(MOVBUload [i6] {s} p mem)))
+ y7:(MOVDnop x7:(MOVBUload [i7] {s} p mem)))
+ && i1 == i0+1
+ && i2 == i0+2
+ && i3 == i0+3
+ && i4 == i0+4
+ && i5 == i0+5
+ && i6 == i0+6
+ && i7 == i0+7
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && y4.Uses == 1 && y5.Uses == 1 && y6.Uses == 1 && y7.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1
+ && o4.Uses == 1 && o5.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil
+ && clobber(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7, o0, o1, o2, o3, o4, o5, s0)
+ => @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (REV <t> (MOVDload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem))
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] o2:(ORshiftLL [24] o3:(ORshiftLL [32] o4:(ORshiftLL [40] o5:(ORshiftLL [48] s0:(SLLconst [56]
+ y0:(MOVDnop x0:(MOVBUloadidx ptr0 idx0 mem)))
+ y1:(MOVDnop x1:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem)))
+ y2:(MOVDnop x2:(MOVBUload [2] {s} p mem)))
+ y3:(MOVDnop x3:(MOVBUload [3] {s} p mem)))
+ y4:(MOVDnop x4:(MOVBUload [4] {s} p mem)))
+ y5:(MOVDnop x5:(MOVBUload [5] {s} p mem)))
+ y6:(MOVDnop x6:(MOVBUload [6] {s} p mem)))
+ y7:(MOVDnop x7:(MOVBUload [7] {s} p mem)))
+ && s == nil
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && y4.Uses == 1 && y5.Uses == 1 && y6.Uses == 1 && y7.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1
+ && o4.Uses == 1 && o5.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7, o0, o1, o2, o3, o4, o5, s0)
+ => @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (REV <t> (MOVDloadidx <t> ptr0 idx0 mem))
+(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] o2:(ORshiftLL [24] o3:(ORshiftLL [32] o4:(ORshiftLL [40] o5:(ORshiftLL [48] s0:(SLLconst [56]
+ y0:(MOVDnop x0:(MOVBUloadidx ptr idx mem)))
+ y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [1] idx) mem)))
+ y2:(MOVDnop x2:(MOVBUloadidx ptr (ADDconst [2] idx) mem)))
+ y3:(MOVDnop x3:(MOVBUloadidx ptr (ADDconst [3] idx) mem)))
+ y4:(MOVDnop x4:(MOVBUloadidx ptr (ADDconst [4] idx) mem)))
+ y5:(MOVDnop x5:(MOVBUloadidx ptr (ADDconst [5] idx) mem)))
+ y6:(MOVDnop x6:(MOVBUloadidx ptr (ADDconst [6] idx) mem)))
+ y7:(MOVDnop x7:(MOVBUloadidx ptr (ADDconst [7] idx) mem)))
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1
+ && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1
+ && y4.Uses == 1 && y5.Uses == 1 && y6.Uses == 1 && y7.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1
+ && o4.Uses == 1 && o5.Uses == 1 && s0.Uses == 1
+ && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil
+ && clobber(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7, o0, o1, o2, o3, o4, o5, s0)
+ => @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (REV <t> (MOVDloadidx <t> ptr idx mem))
+
+// Combine zero stores into larger (unaligned) stores.
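+// For example, clearing two adjacent bytes, as in b[0], b[1] = 0, 0, produces two
+// MOVBstorezero ops at adjacent offsets, which the first rule below merges into a
+// single MOVHstorezero.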
+(MOVBstorezero [i] {s} ptr0 x:(MOVBstorezero [j] {s} ptr1 mem))
+ && x.Uses == 1
+ && areAdjacentOffsets(int64(i),int64(j),1)
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVHstorezero [int32(min(int64(i),int64(j)))] {s} ptr0 mem)
+(MOVBstorezero [1] {s} (ADD ptr0 idx0) x:(MOVBstorezeroidx ptr1 idx1 mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVHstorezeroidx ptr1 idx1 mem)
+(MOVBstorezeroidx ptr (ADDconst [1] idx) x:(MOVBstorezeroidx ptr idx mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstorezeroidx ptr idx mem)
+(MOVHstorezero [i] {s} ptr0 x:(MOVHstorezero [j] {s} ptr1 mem))
+ && x.Uses == 1
+ && areAdjacentOffsets(int64(i),int64(j),2)
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVWstorezero [int32(min(int64(i),int64(j)))] {s} ptr0 mem)
+(MOVHstorezero [2] {s} (ADD ptr0 idx0) x:(MOVHstorezeroidx ptr1 idx1 mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVWstorezeroidx ptr1 idx1 mem)
+(MOVHstorezeroidx ptr (ADDconst [2] idx) x:(MOVHstorezeroidx ptr idx mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstorezeroidx ptr idx mem)
+(MOVHstorezero [2] {s} (ADDshiftLL [1] ptr0 idx0) x:(MOVHstorezeroidx2 ptr1 idx1 mem))
+ && x.Uses == 1
+ && s == nil
+ && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1)
+ && clobber(x)
+ => (MOVWstorezeroidx ptr1 (SLLconst <idx1.Type> [1] idx1) mem)
+(MOVWstorezero [i] {s} ptr0 x:(MOVWstorezero [j] {s} ptr1 mem))
+ && x.Uses == 1
+ && areAdjacentOffsets(int64(i),int64(j),4)
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVDstorezero [int32(min(int64(i),int64(j)))] {s} ptr0 mem)
+(MOVWstorezero [4] {s} (ADD ptr0 idx0) x:(MOVWstorezeroidx ptr1 idx1 mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVDstorezeroidx ptr1 idx1 mem)
+(MOVWstorezeroidx ptr (ADDconst [4] idx) x:(MOVWstorezeroidx ptr idx mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVDstorezeroidx ptr idx mem)
+(MOVWstorezero [4] {s} (ADDshiftLL [2] ptr0 idx0) x:(MOVWstorezeroidx4 ptr1 idx1 mem))
+ && x.Uses == 1
+ && s == nil
+ && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1)
+ && clobber(x)
+ => (MOVDstorezeroidx ptr1 (SLLconst <idx1.Type> [2] idx1) mem)
+(MOVDstorezero [i] {s} ptr0 x:(MOVDstorezero [j] {s} ptr1 mem))
+ && x.Uses == 1
+ && areAdjacentOffsets(int64(i),int64(j),8)
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVQstorezero [int32(min(int64(i),int64(j)))] {s} ptr0 mem)
+(MOVDstorezero [8] {s} p0:(ADD ptr0 idx0) x:(MOVDstorezeroidx ptr1 idx1 mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVQstorezero [0] {s} p0 mem)
+(MOVDstorezero [8] {s} p0:(ADDshiftLL [3] ptr0 idx0) x:(MOVDstorezeroidx8 ptr1 idx1 mem))
+ && x.Uses == 1
+ && s == nil
+ && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1)
+ && clobber(x)
+ => (MOVQstorezero [0] {s} p0 mem)
+
+// Combine stores into larger (unaligned) stores.
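+// For example, binary.LittleEndian.PutUint16(b, w), i.e. b[0] = byte(w); b[1] = byte(w>>8),
+// produces a byte store of w and a byte store of (SRLconst [8] w) at adjacent offsets,
+// which the first rule below merges into a single MOVHstore.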
+(MOVBstore [i] {s} ptr0 (SRLconst [8] w) x:(MOVBstore [i-1] {s} ptr1 w mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVHstore [i-1] {s} ptr0 w mem)
+(MOVBstore [1] {s} (ADD ptr0 idx0) (SRLconst [8] w) x:(MOVBstoreidx ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVHstoreidx ptr1 idx1 w mem)
+(MOVBstoreidx ptr (ADDconst [1] idx) (SRLconst [8] w) x:(MOVBstoreidx ptr idx w mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstoreidx ptr idx w mem)
+(MOVBstore [i] {s} ptr0 (UBFX [armBFAuxInt(8, 8)] w) x:(MOVBstore [i-1] {s} ptr1 w mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVHstore [i-1] {s} ptr0 w mem)
+(MOVBstore [1] {s} (ADD ptr0 idx0) (UBFX [armBFAuxInt(8, 8)] w) x:(MOVBstoreidx ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVHstoreidx ptr1 idx1 w mem)
+(MOVBstore [i] {s} ptr0 (UBFX [armBFAuxInt(8, 24)] w) x:(MOVBstore [i-1] {s} ptr1 w mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVHstore [i-1] {s} ptr0 w mem)
+(MOVBstore [1] {s} (ADD ptr0 idx0) (UBFX [armBFAuxInt(8, 24)] w) x:(MOVBstoreidx ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVHstoreidx ptr1 idx1 w mem)
+(MOVBstore [i] {s} ptr0 (SRLconst [8] (MOVDreg w)) x:(MOVBstore [i-1] {s} ptr1 w mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVHstore [i-1] {s} ptr0 w mem)
+(MOVBstore [1] {s} (ADD ptr0 idx0) (SRLconst [8] (MOVDreg w)) x:(MOVBstoreidx ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVHstoreidx ptr1 idx1 w mem)
+(MOVBstore [i] {s} ptr0 (SRLconst [j] w) x:(MOVBstore [i-1] {s} ptr1 w0:(SRLconst [j-8] w) mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVHstore [i-1] {s} ptr0 w0 mem)
+(MOVBstore [1] {s} (ADD ptr0 idx0) (SRLconst [j] w) x:(MOVBstoreidx ptr1 idx1 w0:(SRLconst [j-8] w) mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVHstoreidx ptr1 idx1 w0 mem)
+(MOVBstore [i] {s} ptr0 (UBFX [bfc] w) x:(MOVBstore [i-1] {s} ptr1 w0:(UBFX [bfc2] w) mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && bfc.getARM64BFwidth() == 32 - bfc.getARM64BFlsb()
+ && bfc2.getARM64BFwidth() == 32 - bfc2.getARM64BFlsb()
+ && bfc2.getARM64BFlsb() == bfc.getARM64BFlsb() - 8
+ && clobber(x)
+ => (MOVHstore [i-1] {s} ptr0 w0 mem)
+(MOVBstore [1] {s} (ADD ptr0 idx0) (UBFX [bfc] w) x:(MOVBstoreidx ptr1 idx1 w0:(UBFX [bfc2] w) mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && bfc.getARM64BFwidth() == 32 - bfc.getARM64BFlsb()
+ && bfc2.getARM64BFwidth() == 32 - bfc2.getARM64BFlsb()
+ && bfc2.getARM64BFlsb() == bfc.getARM64BFlsb() - 8
+ && clobber(x)
+ => (MOVHstoreidx ptr1 idx1 w0 mem)
+(MOVBstore [i] {s} ptr0 (SRLconst [j] (MOVDreg w)) x:(MOVBstore [i-1] {s} ptr1 w0:(SRLconst [j-8] (MOVDreg w)) mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVHstore [i-1] {s} ptr0 w0 mem)
+(MOVBstore [1] {s} (ADD ptr0 idx0) (SRLconst [j] (MOVDreg w)) x:(MOVBstoreidx ptr1 idx1 w0:(SRLconst [j-8] (MOVDreg w)) mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVHstoreidx ptr1 idx1 w0 mem)
+(MOVHstore [i] {s} ptr0 (SRLconst [16] w) x:(MOVHstore [i-2] {s} ptr1 w mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVWstore [i-2] {s} ptr0 w mem)
+(MOVHstore [2] {s} (ADD ptr0 idx0) (SRLconst [16] w) x:(MOVHstoreidx ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVWstoreidx ptr1 idx1 w mem)
+(MOVHstoreidx ptr (ADDconst [2] idx) (SRLconst [16] w) x:(MOVHstoreidx ptr idx w mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstoreidx ptr idx w mem)
+(MOVHstore [2] {s} (ADDshiftLL [1] ptr0 idx0) (SRLconst [16] w) x:(MOVHstoreidx2 ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1)
+ && clobber(x)
+ => (MOVWstoreidx ptr1 (SLLconst <idx1.Type> [1] idx1) w mem)
+(MOVHstore [i] {s} ptr0 (UBFX [armBFAuxInt(16, 16)] w) x:(MOVHstore [i-2] {s} ptr1 w mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVWstore [i-2] {s} ptr0 w mem)
+(MOVHstore [2] {s} (ADD ptr0 idx0) (UBFX [armBFAuxInt(16, 16)] w) x:(MOVHstoreidx ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVWstoreidx ptr1 idx1 w mem)
+(MOVHstore [2] {s} (ADDshiftLL [1] ptr0 idx0) (UBFX [armBFAuxInt(16, 16)] w) x:(MOVHstoreidx2 ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1)
+ && clobber(x)
+ => (MOVWstoreidx ptr1 (SLLconst <idx1.Type> [1] idx1) w mem)
+(MOVHstore [i] {s} ptr0 (SRLconst [16] (MOVDreg w)) x:(MOVHstore [i-2] {s} ptr1 w mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVWstore [i-2] {s} ptr0 w mem)
+(MOVHstore [2] {s} (ADD ptr0 idx0) (SRLconst [16] (MOVDreg w)) x:(MOVHstoreidx ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVWstoreidx ptr1 idx1 w mem)
+(MOVHstore [2] {s} (ADDshiftLL [1] ptr0 idx0) (SRLconst [16] (MOVDreg w)) x:(MOVHstoreidx2 ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1)
+ && clobber(x)
+ => (MOVWstoreidx ptr1 (SLLconst <idx1.Type> [1] idx1) w mem)
+(MOVHstore [i] {s} ptr0 (SRLconst [j] w) x:(MOVHstore [i-2] {s} ptr1 w0:(SRLconst [j-16] w) mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVWstore [i-2] {s} ptr0 w0 mem)
+(MOVHstore [2] {s} (ADD ptr0 idx0) (SRLconst [j] w) x:(MOVHstoreidx ptr1 idx1 w0:(SRLconst [j-16] w) mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVWstoreidx ptr1 idx1 w0 mem)
+(MOVHstore [2] {s} (ADDshiftLL [1] ptr0 idx0) (SRLconst [j] w) x:(MOVHstoreidx2 ptr1 idx1 w0:(SRLconst [j-16] w) mem))
+ && x.Uses == 1
+ && s == nil
+ && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1)
+ && clobber(x)
+ => (MOVWstoreidx ptr1 (SLLconst <idx1.Type> [1] idx1) w0 mem)
+(MOVWstore [i] {s} ptr0 (SRLconst [32] w) x:(MOVWstore [i-4] {s} ptr1 w mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVDstore [i-4] {s} ptr0 w mem)
+(MOVWstore [4] {s} (ADD ptr0 idx0) (SRLconst [32] w) x:(MOVWstoreidx ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVDstoreidx ptr1 idx1 w mem)
+(MOVWstoreidx ptr (ADDconst [4] idx) (SRLconst [32] w) x:(MOVWstoreidx ptr idx w mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVDstoreidx ptr idx w mem)
+(MOVWstore [4] {s} (ADDshiftLL [2] ptr0 idx0) (SRLconst [32] w) x:(MOVWstoreidx4 ptr1 idx1 w mem))
+ && x.Uses == 1
+ && s == nil
+ && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1)
+ && clobber(x)
+ => (MOVDstoreidx ptr1 (SLLconst <idx1.Type> [2] idx1) w mem)
+(MOVWstore [i] {s} ptr0 (SRLconst [j] w) x:(MOVWstore [i-4] {s} ptr1 w0:(SRLconst [j-32] w) mem))
+ && x.Uses == 1
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ => (MOVDstore [i-4] {s} ptr0 w0 mem)
+(MOVWstore [4] {s} (ADD ptr0 idx0) (SRLconst [j] w) x:(MOVWstoreidx ptr1 idx1 w0:(SRLconst [j-32] w) mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVDstoreidx ptr1 idx1 w0 mem)
+(MOVWstore [4] {s} (ADDshiftLL [2] ptr0 idx0) (SRLconst [j] w) x:(MOVWstoreidx4 ptr1 idx1 w0:(SRLconst [j-32] w) mem))
+ && x.Uses == 1
+ && s == nil
+ && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1)
+ && clobber(x)
+ => (MOVDstoreidx ptr1 (SLLconst <idx1.Type> [2] idx1) w0 mem)
+(MOVBstore [i] {s} ptr w
+ x0:(MOVBstore [i-1] {s} ptr (SRLconst [8] w)
+ x1:(MOVBstore [i-2] {s} ptr (SRLconst [16] w)
+ x2:(MOVBstore [i-3] {s} ptr (SRLconst [24] w)
+ x3:(MOVBstore [i-4] {s} ptr (SRLconst [32] w)
+ x4:(MOVBstore [i-5] {s} ptr (SRLconst [40] w)
+ x5:(MOVBstore [i-6] {s} ptr (SRLconst [48] w)
+ x6:(MOVBstore [i-7] {s} ptr (SRLconst [56] w) mem))))))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && x3.Uses == 1
+ && x4.Uses == 1
+ && x5.Uses == 1
+ && x6.Uses == 1
+ && clobber(x0, x1, x2, x3, x4, x5, x6)
+ => (MOVDstore [i-7] {s} ptr (REV <w.Type> w) mem)
+(MOVBstore [7] {s} p w
+ x0:(MOVBstore [6] {s} p (SRLconst [8] w)
+ x1:(MOVBstore [5] {s} p (SRLconst [16] w)
+ x2:(MOVBstore [4] {s} p (SRLconst [24] w)
+ x3:(MOVBstore [3] {s} p (SRLconst [32] w)
+ x4:(MOVBstore [2] {s} p (SRLconst [40] w)
+ x5:(MOVBstore [1] {s} p1:(ADD ptr1 idx1) (SRLconst [48] w)
+ x6:(MOVBstoreidx ptr0 idx0 (SRLconst [56] w) mem))))))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && x3.Uses == 1
+ && x4.Uses == 1
+ && x5.Uses == 1
+ && x6.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2, x3, x4, x5, x6)
+ => (MOVDstoreidx ptr0 idx0 (REV <w.Type> w) mem)
+(MOVBstore [i] {s} ptr w
+ x0:(MOVBstore [i-1] {s} ptr (UBFX [armBFAuxInt(8, 24)] w)
+ x1:(MOVBstore [i-2] {s} ptr (UBFX [armBFAuxInt(16, 16)] w)
+ x2:(MOVBstore [i-3] {s} ptr (UBFX [armBFAuxInt(24, 8)] w) mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && clobber(x0, x1, x2)
+ => (MOVWstore [i-3] {s} ptr (REVW <w.Type> w) mem)
+(MOVBstore [3] {s} p w
+ x0:(MOVBstore [2] {s} p (UBFX [armBFAuxInt(8, 24)] w)
+ x1:(MOVBstore [1] {s} p1:(ADD ptr1 idx1) (UBFX [armBFAuxInt(16, 16)] w)
+ x2:(MOVBstoreidx ptr0 idx0 (UBFX [armBFAuxInt(24, 8)] w) mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2)
+ => (MOVWstoreidx ptr0 idx0 (REVW <w.Type> w) mem)
+(MOVBstoreidx ptr (ADDconst [3] idx) w
+ x0:(MOVBstoreidx ptr (ADDconst [2] idx) (UBFX [armBFAuxInt(8, 24)] w)
+ x1:(MOVBstoreidx ptr (ADDconst [1] idx) (UBFX [armBFAuxInt(16, 16)] w)
+ x2:(MOVBstoreidx ptr idx (UBFX [armBFAuxInt(24, 8)] w) mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && clobber(x0, x1, x2)
+ => (MOVWstoreidx ptr idx (REVW <w.Type> w) mem)
+(MOVBstoreidx ptr idx w
+ x0:(MOVBstoreidx ptr (ADDconst [1] idx) (UBFX [armBFAuxInt(8, 24)] w)
+ x1:(MOVBstoreidx ptr (ADDconst [2] idx) (UBFX [armBFAuxInt(16, 16)] w)
+ x2:(MOVBstoreidx ptr (ADDconst [3] idx) (UBFX [armBFAuxInt(24, 8)] w) mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && clobber(x0, x1, x2)
+ => (MOVWstoreidx ptr idx w mem)
+(MOVBstore [i] {s} ptr w
+ x0:(MOVBstore [i-1] {s} ptr (SRLconst [8] (MOVDreg w))
+ x1:(MOVBstore [i-2] {s} ptr (SRLconst [16] (MOVDreg w))
+ x2:(MOVBstore [i-3] {s} ptr (SRLconst [24] (MOVDreg w)) mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && clobber(x0, x1, x2)
+ => (MOVWstore [i-3] {s} ptr (REVW <w.Type> w) mem)
+(MOVBstore [3] {s} p w
+ x0:(MOVBstore [2] {s} p (SRLconst [8] (MOVDreg w))
+ x1:(MOVBstore [1] {s} p1:(ADD ptr1 idx1) (SRLconst [16] (MOVDreg w))
+ x2:(MOVBstoreidx ptr0 idx0 (SRLconst [24] (MOVDreg w)) mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2)
+ => (MOVWstoreidx ptr0 idx0 (REVW <w.Type> w) mem)
+(MOVBstore [i] {s} ptr w
+ x0:(MOVBstore [i-1] {s} ptr (SRLconst [8] w)
+ x1:(MOVBstore [i-2] {s} ptr (SRLconst [16] w)
+ x2:(MOVBstore [i-3] {s} ptr (SRLconst [24] w) mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && clobber(x0, x1, x2)
+ => (MOVWstore [i-3] {s} ptr (REVW <w.Type> w) mem)
+(MOVBstore [3] {s} p w
+ x0:(MOVBstore [2] {s} p (SRLconst [8] w)
+ x1:(MOVBstore [1] {s} p1:(ADD ptr1 idx1) (SRLconst [16] w)
+ x2:(MOVBstoreidx ptr0 idx0 (SRLconst [24] w) mem))))
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && x2.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && isSamePtr(p1, p)
+ && clobber(x0, x1, x2)
+ => (MOVWstoreidx ptr0 idx0 (REVW <w.Type> w) mem)
+(MOVBstore [i] {s} ptr w x:(MOVBstore [i-1] {s} ptr (SRLconst [8] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstore [i-1] {s} ptr (REV16W <w.Type> w) mem)
+(MOVBstore [1] {s} (ADD ptr1 idx1) w x:(MOVBstoreidx ptr0 idx0 (SRLconst [8] w) mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVHstoreidx ptr0 idx0 (REV16W <w.Type> w) mem)
+(MOVBstore [i] {s} ptr w x:(MOVBstore [i-1] {s} ptr (UBFX [armBFAuxInt(8, 8)] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstore [i-1] {s} ptr (REV16W <w.Type> w) mem)
+(MOVBstore [1] {s} (ADD ptr1 idx1) w x:(MOVBstoreidx ptr0 idx0 (UBFX [armBFAuxInt(8, 8)] w) mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVHstoreidx ptr0 idx0 (REV16W <w.Type> w) mem)
+(MOVBstoreidx ptr (ADDconst [1] idx) w x:(MOVBstoreidx ptr idx (UBFX [armBFAuxInt(8, 8)] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstoreidx ptr idx (REV16W <w.Type> w) mem)
+(MOVBstoreidx ptr idx w x:(MOVBstoreidx ptr (ADDconst [1] idx) (UBFX [armBFAuxInt(8, 8)] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstoreidx ptr idx w mem)
+(MOVBstore [i] {s} ptr w x:(MOVBstore [i-1] {s} ptr (SRLconst [8] (MOVDreg w)) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstore [i-1] {s} ptr (REV16W <w.Type> w) mem)
+(MOVBstore [1] {s} (ADD ptr1 idx1) w x:(MOVBstoreidx ptr0 idx0 (SRLconst [8] (MOVDreg w)) mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVHstoreidx ptr0 idx0 (REV16W <w.Type> w) mem)
+(MOVBstore [i] {s} ptr w x:(MOVBstore [i-1] {s} ptr (UBFX [armBFAuxInt(8, 24)] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstore [i-1] {s} ptr (REV16W <w.Type> w) mem)
+(MOVBstore [1] {s} (ADD ptr1 idx1) w x:(MOVBstoreidx ptr0 idx0 (UBFX [armBFAuxInt(8, 24)] w) mem))
+ && x.Uses == 1
+ && s == nil
+ && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1))
+ && clobber(x)
+ => (MOVHstoreidx ptr0 idx0 (REV16W <w.Type> w) mem)
+
+// FP simplification
+(FNEGS (FMULS x y)) => (FNMULS x y)
+(FNEGD (FMULD x y)) => (FNMULD x y)
+(FMULS (FNEGS x) y) => (FNMULS x y)
+(FMULD (FNEGD x) y) => (FNMULD x y)
+(FNEGS (FNMULS x y)) => (FMULS x y)
+(FNEGD (FNMULD x y)) => (FMULD x y)
+(FNMULS (FNEGS x) y) => (FMULS x y)
+(FNMULD (FNEGD x) y) => (FMULD x y)
+(FADDS a (FMULS x y)) => (FMADDS a x y)
+(FADDD a (FMULD x y)) => (FMADDD a x y)
+(FSUBS a (FMULS x y)) => (FMSUBS a x y)
+(FSUBD a (FMULD x y)) => (FMSUBD a x y)
+(FSUBS (FMULS x y) a) => (FNMSUBS a x y)
+(FSUBD (FMULD x y) a) => (FNMSUBD a x y)
+(FADDS a (FNMULS x y)) => (FMSUBS a x y)
+(FADDD a (FNMULD x y)) => (FMSUBD a x y)
+(FSUBS a (FNMULS x y)) => (FMADDS a x y)
+(FSUBD a (FNMULD x y)) => (FMADDD a x y)
+(FSUBS (FNMULS x y) a) => (FNMADDS a x y)
+(FSUBD (FNMULD x y) a) => (FNMADDD a x y)
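+// For example, a float64 expression a + x*y typically lowers to (FADDD a (FMULD x y)),
+// which the rules above fuse into a single FMADDD a x y; likewise a - x*y becomes
+// FMSUBD a x y.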
+
+(MOVBUload [off] {sym} (SB) _) && symIsRO(sym) => (MOVDconst [int64(read8(sym, int64(off)))])
+(MOVHUload [off] {sym} (SB) _) && symIsRO(sym) => (MOVDconst [int64(read16(sym, int64(off), config.ctxt.Arch.ByteOrder))])
+(MOVWUload [off] {sym} (SB) _) && symIsRO(sym) => (MOVDconst [int64(read32(sym, int64(off), config.ctxt.Arch.ByteOrder))])
+(MOVDload [off] {sym} (SB) _) && symIsRO(sym) => (MOVDconst [int64(read64(sym, int64(off), config.ctxt.Arch.ByteOrder))])
diff --git a/src/cmd/compile/internal/ssa/gen/ARM64Ops.go b/src/cmd/compile/internal/ssa/gen/ARM64Ops.go
new file mode 100644
index 0000000..4d1d14e
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/ARM64Ops.go
@@ -0,0 +1,762 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+import "strings"
+
+// Notes:
+// - Integer types live in the low portion of registers. Upper portions are junk.
+// - Boolean types use the low-order byte of a register. 0=false, 1=true.
+// Upper bytes are junk.
+// - *const instructions may use a constant larger than the instruction can encode.
+// In this case the assembler expands to multiple instructions and uses tmp
+// register (R27).
+
+// Suffixes encode the bit width of various instructions.
+// D (double word) = 64 bit
+// W (word) = 32 bit
+// H (half word) = 16 bit
+// HU = 16 bit unsigned
+// B (byte) = 8 bit
+// BU = 8 bit unsigned
+// S (single) = 32 bit float
+// D (double) = 64 bit float
+
+// Note: registers not used in regalloc are not included in this list,
+// so that regmask stays within int64.
+// Be careful when hand-coding regmasks.
+var regNamesARM64 = []string{
+ "R0",
+ "R1",
+ "R2",
+ "R3",
+ "R4",
+ "R5",
+ "R6",
+ "R7",
+ "R8",
+ "R9",
+ "R10",
+ "R11",
+ "R12",
+ "R13",
+ "R14",
+ "R15",
+ "R16",
+ "R17",
+ "R18", // platform register, not used
+ "R19",
+ "R20",
+ "R21",
+ "R22",
+ "R23",
+ "R24",
+ "R25",
+ "R26",
+ // R27 = REGTMP not used in regalloc
+ "g", // aka R28
+ "R29", // frame pointer, not used
+ "R30", // aka REGLINK
+ "SP", // aka R31
+
+ "F0",
+ "F1",
+ "F2",
+ "F3",
+ "F4",
+ "F5",
+ "F6",
+ "F7",
+ "F8",
+ "F9",
+ "F10",
+ "F11",
+ "F12",
+ "F13",
+ "F14",
+ "F15",
+ "F16",
+ "F17",
+ "F18",
+ "F19",
+ "F20",
+ "F21",
+ "F22",
+ "F23",
+ "F24",
+ "F25",
+ "F26",
+ "F27",
+ "F28",
+ "F29",
+ "F30",
+ "F31",
+
+ // If you add registers, update asyncPreempt in runtime.
+
+ // pseudo-registers
+ "SB",
+}
+
+func init() {
+ // Make map from reg names to reg integers.
+ if len(regNamesARM64) > 64 {
+ panic("too many registers")
+ }
+ num := map[string]int{}
+ for i, name := range regNamesARM64 {
+ num[name] = i
+ }
+ buildReg := func(s string) regMask {
+ m := regMask(0)
+ for _, r := range strings.Split(s, " ") {
+ if n, ok := num[r]; ok {
+ m |= regMask(1) << uint(n)
+ continue
+ }
+ panic("register " + r + " not found")
+ }
+ return m
+ }
+
+ // Common individual register masks
+ var (
+ gp = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30")
+ gpg = gp | buildReg("g")
+ gpsp = gp | buildReg("SP")
+ gpspg = gpg | buildReg("SP")
+ gpspsbg = gpspg | buildReg("SB")
+ fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31")
+ callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g
+ r0 = buildReg("R0")
+ r1 = buildReg("R1")
+ r2 = buildReg("R2")
+ r3 = buildReg("R3")
+ )
+ // Common regInfo
+ var (
+ gp01 = regInfo{inputs: nil, outputs: []regMask{gp}}
+ gp0flags1 = regInfo{inputs: []regMask{0}, outputs: []regMask{gp}}
+ gp11 = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}}
+ gp11sp = regInfo{inputs: []regMask{gpspg}, outputs: []regMask{gp}}
+ gp1flags = regInfo{inputs: []regMask{gpg}}
+ gp1flags1 = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}}
+ gp11flags = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp, 0}}
+ gp21 = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp}}
+ gp21nog = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}}
+ gp21flags = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp, 0}}
+ gp2flags = regInfo{inputs: []regMask{gpg, gpg}}
+ gp2flags1 = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}}
+ gp2flags1flags = regInfo{inputs: []regMask{gp, gp, 0}, outputs: []regMask{gp, 0}}
+ gp2load = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}}
+ gp22 = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp, gp}}
+ gp31 = regInfo{inputs: []regMask{gpg, gpg, gpg}, outputs: []regMask{gp}}
+ gpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}
+ gpstore = regInfo{inputs: []regMask{gpspsbg, gpg}}
+ gpstore0 = regInfo{inputs: []regMask{gpspsbg}}
+ gpstore2 = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}}
+ gpxchg = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}}
+ gpcas = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}, outputs: []regMask{gp}}
+ fp01 = regInfo{inputs: nil, outputs: []regMask{fp}}
+ fp11 = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}}
+ fpgp = regInfo{inputs: []regMask{fp}, outputs: []regMask{gp}}
+ gpfp = regInfo{inputs: []regMask{gp}, outputs: []regMask{fp}}
+ fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
+ fp31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: []regMask{fp}}
+ fp2flags = regInfo{inputs: []regMask{fp, fp}}
+ fp1flags = regInfo{inputs: []regMask{fp}}
+ fpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{fp}}
+ fp2load = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{fp}}
+ fpstore = regInfo{inputs: []regMask{gpspsbg, fp}}
+ fpstore2 = regInfo{inputs: []regMask{gpspsbg, gpg, fp}}
+ readflags = regInfo{inputs: nil, outputs: []regMask{gp}}
+ )
+ ops := []opData{
+ // binary ops
+ {name: "ADCSflags", argLength: 3, reg: gp2flags1flags, typ: "(UInt64,Flags)", asm: "ADCS", commutative: true}, // arg0+arg1+carry, set flags.
+ {name: "ADCzerocarry", argLength: 1, reg: gp0flags1, typ: "UInt64", asm: "ADC"}, // ZR+ZR+carry
+ {name: "ADD", argLength: 2, reg: gp21, asm: "ADD", commutative: true}, // arg0 + arg1
+ {name: "ADDconst", argLength: 1, reg: gp11sp, asm: "ADD", aux: "Int64"}, // arg0 + auxInt
+ {name: "ADDSconstflags", argLength: 1, reg: gp11flags, typ: "(UInt64,Flags)", asm: "ADDS", aux: "Int64"}, // arg0+auxint, set flags.
+ {name: "ADDSflags", argLength: 2, reg: gp21flags, typ: "(UInt64,Flags)", asm: "ADDS", commutative: true}, // arg0+arg1, set flags.
+ {name: "SUB", argLength: 2, reg: gp21, asm: "SUB"}, // arg0 - arg1
+ {name: "SUBconst", argLength: 1, reg: gp11, asm: "SUB", aux: "Int64"}, // arg0 - auxInt
+ {name: "SBCSflags", argLength: 3, reg: gp2flags1flags, typ: "(UInt64,Flags)", asm: "SBCS"}, // arg0-(arg1+borrowing), set flags.
+ {name: "SUBSflags", argLength: 2, reg: gp21flags, typ: "(UInt64,Flags)", asm: "SUBS"}, // arg0 - arg1, set flags.
+ {name: "MUL", argLength: 2, reg: gp21, asm: "MUL", commutative: true}, // arg0 * arg1
+ {name: "MULW", argLength: 2, reg: gp21, asm: "MULW", commutative: true}, // arg0 * arg1, 32-bit
+ {name: "MNEG", argLength: 2, reg: gp21, asm: "MNEG", commutative: true}, // -arg0 * arg1
+ {name: "MNEGW", argLength: 2, reg: gp21, asm: "MNEGW", commutative: true}, // -arg0 * arg1, 32-bit
+ {name: "MULH", argLength: 2, reg: gp21, asm: "SMULH", commutative: true}, // (arg0 * arg1) >> 64, signed
+ {name: "UMULH", argLength: 2, reg: gp21, asm: "UMULH", commutative: true}, // (arg0 * arg1) >> 64, unsigned
+ {name: "MULL", argLength: 2, reg: gp21, asm: "SMULL", commutative: true}, // arg0 * arg1, signed, 32-bit mult results in 64-bit
+ {name: "UMULL", argLength: 2, reg: gp21, asm: "UMULL", commutative: true}, // arg0 * arg1, unsigned, 32-bit mult results in 64-bit
+ {name: "DIV", argLength: 2, reg: gp21, asm: "SDIV"}, // arg0 / arg1, signed
+		{name: "UDIV", argLength: 2, reg: gp21, asm: "UDIV"},                         // arg0 / arg1, unsigned
+ {name: "DIVW", argLength: 2, reg: gp21, asm: "SDIVW"}, // arg0 / arg1, signed, 32 bit
+		{name: "UDIVW", argLength: 2, reg: gp21, asm: "UDIVW"},                       // arg0 / arg1, unsigned, 32 bit
+ {name: "MOD", argLength: 2, reg: gp21, asm: "REM"}, // arg0 % arg1, signed
+ {name: "UMOD", argLength: 2, reg: gp21, asm: "UREM"}, // arg0 % arg1, unsigned
+ {name: "MODW", argLength: 2, reg: gp21, asm: "REMW"}, // arg0 % arg1, signed, 32 bit
+ {name: "UMODW", argLength: 2, reg: gp21, asm: "UREMW"}, // arg0 % arg1, unsigned, 32 bit
+
+ {name: "FADDS", argLength: 2, reg: fp21, asm: "FADDS", commutative: true}, // arg0 + arg1
+ {name: "FADDD", argLength: 2, reg: fp21, asm: "FADDD", commutative: true}, // arg0 + arg1
+ {name: "FSUBS", argLength: 2, reg: fp21, asm: "FSUBS"}, // arg0 - arg1
+ {name: "FSUBD", argLength: 2, reg: fp21, asm: "FSUBD"}, // arg0 - arg1
+ {name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true}, // arg0 * arg1
+ {name: "FMULD", argLength: 2, reg: fp21, asm: "FMULD", commutative: true}, // arg0 * arg1
+ {name: "FNMULS", argLength: 2, reg: fp21, asm: "FNMULS", commutative: true}, // -(arg0 * arg1)
+ {name: "FNMULD", argLength: 2, reg: fp21, asm: "FNMULD", commutative: true}, // -(arg0 * arg1)
+ {name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS"}, // arg0 / arg1
+ {name: "FDIVD", argLength: 2, reg: fp21, asm: "FDIVD"}, // arg0 / arg1
+
+ {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1
+ {name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int64"}, // arg0 & auxInt
+ {name: "OR", argLength: 2, reg: gp21, asm: "ORR", commutative: true}, // arg0 | arg1
+ {name: "ORconst", argLength: 1, reg: gp11, asm: "ORR", aux: "Int64"}, // arg0 | auxInt
+ {name: "XOR", argLength: 2, reg: gp21, asm: "EOR", commutative: true}, // arg0 ^ arg1
+ {name: "XORconst", argLength: 1, reg: gp11, asm: "EOR", aux: "Int64"}, // arg0 ^ auxInt
+ {name: "BIC", argLength: 2, reg: gp21, asm: "BIC"}, // arg0 &^ arg1
+ {name: "EON", argLength: 2, reg: gp21, asm: "EON"}, // arg0 ^ ^arg1
+ {name: "ORN", argLength: 2, reg: gp21, asm: "ORN"}, // arg0 | ^arg1
+
+ {name: "LoweredMuluhilo", argLength: 2, reg: gp22, resultNotInArgs: true}, // arg0 * arg1, returns (hi, lo)
+
+ // unary ops
+ {name: "MVN", argLength: 1, reg: gp11, asm: "MVN"}, // ^arg0
+ {name: "NEG", argLength: 1, reg: gp11, asm: "NEG"}, // -arg0
+ {name: "NEGSflags", argLength: 1, reg: gp11flags, typ: "(UInt64,Flags)", asm: "NEGS"}, // -arg0, set flags.
+ {name: "NGCzerocarry", argLength: 1, reg: gp0flags1, typ: "UInt64", asm: "NGC"}, // -1 if borrowing, 0 otherwise.
+ {name: "FABSD", argLength: 1, reg: fp11, asm: "FABSD"}, // abs(arg0), float64
+ {name: "FNEGS", argLength: 1, reg: fp11, asm: "FNEGS"}, // -arg0, float32
+ {name: "FNEGD", argLength: 1, reg: fp11, asm: "FNEGD"}, // -arg0, float64
+ {name: "FSQRTD", argLength: 1, reg: fp11, asm: "FSQRTD"}, // sqrt(arg0), float64
+ {name: "REV", argLength: 1, reg: gp11, asm: "REV"}, // byte reverse, 64-bit
+ {name: "REVW", argLength: 1, reg: gp11, asm: "REVW"}, // byte reverse, 32-bit
+ {name: "REV16W", argLength: 1, reg: gp11, asm: "REV16W"}, // byte reverse in each 16-bit halfword, 32-bit
+ {name: "RBIT", argLength: 1, reg: gp11, asm: "RBIT"}, // bit reverse, 64-bit
+ {name: "RBITW", argLength: 1, reg: gp11, asm: "RBITW"}, // bit reverse, 32-bit
+ {name: "CLZ", argLength: 1, reg: gp11, asm: "CLZ"}, // count leading zero, 64-bit
+ {name: "CLZW", argLength: 1, reg: gp11, asm: "CLZW"}, // count leading zero, 32-bit
+ {name: "VCNT", argLength: 1, reg: fp11, asm: "VCNT"}, // count set bits for each 8-bit unit and store the result in each 8-bit unit
+ {name: "VUADDLV", argLength: 1, reg: fp11, asm: "VUADDLV"}, // unsigned sum of eight bytes in a 64-bit value, zero extended to 64-bit.
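+	// For example (illustrative sketch, not a rule defined in this file): a population
+	// count can be built from these two ops by counting the set bits of each byte with
+	// VCNT and then summing the eight per-byte counts with VUADDLV.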
+ {name: "LoweredRound32F", argLength: 1, reg: fp11, resultInArg0: true, zeroWidth: true},
+ {name: "LoweredRound64F", argLength: 1, reg: fp11, resultInArg0: true, zeroWidth: true},
+
+ // 3-operand, the addend comes first
+ {name: "FMADDS", argLength: 3, reg: fp31, asm: "FMADDS"}, // +arg0 + (arg1 * arg2)
+ {name: "FMADDD", argLength: 3, reg: fp31, asm: "FMADDD"}, // +arg0 + (arg1 * arg2)
+ {name: "FNMADDS", argLength: 3, reg: fp31, asm: "FNMADDS"}, // -arg0 - (arg1 * arg2)
+ {name: "FNMADDD", argLength: 3, reg: fp31, asm: "FNMADDD"}, // -arg0 - (arg1 * arg2)
+ {name: "FMSUBS", argLength: 3, reg: fp31, asm: "FMSUBS"}, // +arg0 - (arg1 * arg2)
+ {name: "FMSUBD", argLength: 3, reg: fp31, asm: "FMSUBD"}, // +arg0 - (arg1 * arg2)
+ {name: "FNMSUBS", argLength: 3, reg: fp31, asm: "FNMSUBS"}, // -arg0 + (arg1 * arg2)
+ {name: "FNMSUBD", argLength: 3, reg: fp31, asm: "FNMSUBD"}, // -arg0 + (arg1 * arg2)
+ {name: "MADD", argLength: 3, reg: gp31, asm: "MADD"}, // +arg0 + (arg1 * arg2)
+ {name: "MADDW", argLength: 3, reg: gp31, asm: "MADDW"}, // +arg0 + (arg1 * arg2), 32-bit
+ {name: "MSUB", argLength: 3, reg: gp31, asm: "MSUB"}, // +arg0 - (arg1 * arg2)
+ {name: "MSUBW", argLength: 3, reg: gp31, asm: "MSUBW"}, // +arg0 - (arg1 * arg2), 32-bit
+
+ // shifts
+ {name: "SLL", argLength: 2, reg: gp21, asm: "LSL"}, // arg0 << arg1, shift amount is mod 64
+ {name: "SLLconst", argLength: 1, reg: gp11, asm: "LSL", aux: "Int64"}, // arg0 << auxInt
+ {name: "SRL", argLength: 2, reg: gp21, asm: "LSR"}, // arg0 >> arg1, unsigned, shift amount is mod 64
+ {name: "SRLconst", argLength: 1, reg: gp11, asm: "LSR", aux: "Int64"}, // arg0 >> auxInt, unsigned
+ {name: "SRA", argLength: 2, reg: gp21, asm: "ASR"}, // arg0 >> arg1, signed, shift amount is mod 64
+ {name: "SRAconst", argLength: 1, reg: gp11, asm: "ASR", aux: "Int64"}, // arg0 >> auxInt, signed
+ {name: "ROR", argLength: 2, reg: gp21, asm: "ROR"}, // arg0 right rotate by (arg1 mod 64) bits
+ {name: "RORW", argLength: 2, reg: gp21, asm: "RORW"}, // arg0 right rotate by (arg1 mod 32) bits
+ {name: "RORconst", argLength: 1, reg: gp11, asm: "ROR", aux: "Int64"}, // arg0 right rotate by auxInt bits
+ {name: "RORWconst", argLength: 1, reg: gp11, asm: "RORW", aux: "Int64"}, // uint32(arg0) right rotate by auxInt bits
+ {name: "EXTRconst", argLength: 2, reg: gp21, asm: "EXTR", aux: "Int64"}, // extract 64 bits from arg0:arg1 starting at lsb auxInt
+ {name: "EXTRWconst", argLength: 2, reg: gp21, asm: "EXTRW", aux: "Int64"}, // extract 32 bits from arg0[31:0]:arg1[31:0] starting at lsb auxInt and zero top 32 bits
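+	// For example (illustrative, assuming the usual ARM64 EXTR semantics): with auxInt c,
+	// EXTRconst computes (arg0 << (64-c)) | (arg1 >> c) for 0 < c < 64; extracting from
+	// x:x is therefore a right rotate of x by c bits.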
+
+ // comparisons
+ {name: "CMP", argLength: 2, reg: gp2flags, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPconst", argLength: 1, reg: gp1flags, asm: "CMP", aux: "Int64", typ: "Flags"}, // arg0 compare to auxInt
+ {name: "CMPW", argLength: 2, reg: gp2flags, asm: "CMPW", typ: "Flags"}, // arg0 compare to arg1, 32 bit
+ {name: "CMPWconst", argLength: 1, reg: gp1flags, asm: "CMPW", aux: "Int32", typ: "Flags"}, // arg0 compare to auxInt, 32 bit
+ {name: "CMN", argLength: 2, reg: gp2flags, asm: "CMN", typ: "Flags", commutative: true}, // arg0 compare to -arg1
+ {name: "CMNconst", argLength: 1, reg: gp1flags, asm: "CMN", aux: "Int64", typ: "Flags"}, // arg0 compare to -auxInt
+ {name: "CMNW", argLength: 2, reg: gp2flags, asm: "CMNW", typ: "Flags", commutative: true}, // arg0 compare to -arg1, 32 bit
+ {name: "CMNWconst", argLength: 1, reg: gp1flags, asm: "CMNW", aux: "Int32", typ: "Flags"}, // arg0 compare to -auxInt, 32 bit
+ {name: "TST", argLength: 2, reg: gp2flags, asm: "TST", typ: "Flags", commutative: true}, // arg0 & arg1 compare to 0
+ {name: "TSTconst", argLength: 1, reg: gp1flags, asm: "TST", aux: "Int64", typ: "Flags"}, // arg0 & auxInt compare to 0
+ {name: "TSTW", argLength: 2, reg: gp2flags, asm: "TSTW", typ: "Flags", commutative: true}, // arg0 & arg1 compare to 0, 32 bit
+ {name: "TSTWconst", argLength: 1, reg: gp1flags, asm: "TSTW", aux: "Int32", typ: "Flags"}, // arg0 & auxInt compare to 0, 32 bit
+ {name: "FCMPS", argLength: 2, reg: fp2flags, asm: "FCMPS", typ: "Flags"}, // arg0 compare to arg1, float32
+ {name: "FCMPD", argLength: 2, reg: fp2flags, asm: "FCMPD", typ: "Flags"}, // arg0 compare to arg1, float64
+ {name: "FCMPS0", argLength: 1, reg: fp1flags, asm: "FCMPS", typ: "Flags"}, // arg0 compare to 0, float32
+ {name: "FCMPD0", argLength: 1, reg: fp1flags, asm: "FCMPD", typ: "Flags"}, // arg0 compare to 0, float64
+
+ // shifted ops
+ {name: "MVNshiftLL", argLength: 1, reg: gp11, asm: "MVN", aux: "Int64"}, // ^(arg0<<auxInt)
+ {name: "MVNshiftRL", argLength: 1, reg: gp11, asm: "MVN", aux: "Int64"}, // ^(arg0>>auxInt), unsigned shift
+ {name: "MVNshiftRA", argLength: 1, reg: gp11, asm: "MVN", aux: "Int64"}, // ^(arg0>>auxInt), signed shift
+ {name: "NEGshiftLL", argLength: 1, reg: gp11, asm: "NEG", aux: "Int64"}, // -(arg0<<auxInt)
+ {name: "NEGshiftRL", argLength: 1, reg: gp11, asm: "NEG", aux: "Int64"}, // -(arg0>>auxInt), unsigned shift
+ {name: "NEGshiftRA", argLength: 1, reg: gp11, asm: "NEG", aux: "Int64"}, // -(arg0>>auxInt), signed shift
+ {name: "ADDshiftLL", argLength: 2, reg: gp21, asm: "ADD", aux: "Int64"}, // arg0 + arg1<<auxInt
+ {name: "ADDshiftRL", argLength: 2, reg: gp21, asm: "ADD", aux: "Int64"}, // arg0 + arg1>>auxInt, unsigned shift
+ {name: "ADDshiftRA", argLength: 2, reg: gp21, asm: "ADD", aux: "Int64"}, // arg0 + arg1>>auxInt, signed shift
+ {name: "SUBshiftLL", argLength: 2, reg: gp21, asm: "SUB", aux: "Int64"}, // arg0 - arg1<<auxInt
+ {name: "SUBshiftRL", argLength: 2, reg: gp21, asm: "SUB", aux: "Int64"}, // arg0 - arg1>>auxInt, unsigned shift
+ {name: "SUBshiftRA", argLength: 2, reg: gp21, asm: "SUB", aux: "Int64"}, // arg0 - arg1>>auxInt, signed shift
+ {name: "ANDshiftLL", argLength: 2, reg: gp21, asm: "AND", aux: "Int64"}, // arg0 & (arg1<<auxInt)
+ {name: "ANDshiftRL", argLength: 2, reg: gp21, asm: "AND", aux: "Int64"}, // arg0 & (arg1>>auxInt), unsigned shift
+ {name: "ANDshiftRA", argLength: 2, reg: gp21, asm: "AND", aux: "Int64"}, // arg0 & (arg1>>auxInt), signed shift
+ {name: "ORshiftLL", argLength: 2, reg: gp21, asm: "ORR", aux: "Int64"}, // arg0 | arg1<<auxInt
+ {name: "ORshiftRL", argLength: 2, reg: gp21, asm: "ORR", aux: "Int64"}, // arg0 | arg1>>auxInt, unsigned shift
+ {name: "ORshiftRA", argLength: 2, reg: gp21, asm: "ORR", aux: "Int64"}, // arg0 | arg1>>auxInt, signed shift
+ {name: "XORshiftLL", argLength: 2, reg: gp21, asm: "EOR", aux: "Int64"}, // arg0 ^ arg1<<auxInt
+ {name: "XORshiftRL", argLength: 2, reg: gp21, asm: "EOR", aux: "Int64"}, // arg0 ^ arg1>>auxInt, unsigned shift
+ {name: "XORshiftRA", argLength: 2, reg: gp21, asm: "EOR", aux: "Int64"}, // arg0 ^ arg1>>auxInt, signed shift
+ {name: "BICshiftLL", argLength: 2, reg: gp21, asm: "BIC", aux: "Int64"}, // arg0 &^ (arg1<<auxInt)
+ {name: "BICshiftRL", argLength: 2, reg: gp21, asm: "BIC", aux: "Int64"}, // arg0 &^ (arg1>>auxInt), unsigned shift
+ {name: "BICshiftRA", argLength: 2, reg: gp21, asm: "BIC", aux: "Int64"}, // arg0 &^ (arg1>>auxInt), signed shift
+ {name: "EONshiftLL", argLength: 2, reg: gp21, asm: "EON", aux: "Int64"}, // arg0 ^ ^(arg1<<auxInt)
+ {name: "EONshiftRL", argLength: 2, reg: gp21, asm: "EON", aux: "Int64"}, // arg0 ^ ^(arg1>>auxInt), unsigned shift
+ {name: "EONshiftRA", argLength: 2, reg: gp21, asm: "EON", aux: "Int64"}, // arg0 ^ ^(arg1>>auxInt), signed shift
+ {name: "ORNshiftLL", argLength: 2, reg: gp21, asm: "ORN", aux: "Int64"}, // arg0 | ^(arg1<<auxInt)
+ {name: "ORNshiftRL", argLength: 2, reg: gp21, asm: "ORN", aux: "Int64"}, // arg0 | ^(arg1>>auxInt), unsigned shift
+ {name: "ORNshiftRA", argLength: 2, reg: gp21, asm: "ORN", aux: "Int64"}, // arg0 | ^(arg1>>auxInt), signed shift
+ {name: "CMPshiftLL", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int64", typ: "Flags"}, // arg0 compare to arg1<<auxInt
+ {name: "CMPshiftRL", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int64", typ: "Flags"}, // arg0 compare to arg1>>auxInt, unsigned shift
+ {name: "CMPshiftRA", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int64", typ: "Flags"}, // arg0 compare to arg1>>auxInt, signed shift
+ {name: "CMNshiftLL", argLength: 2, reg: gp2flags, asm: "CMN", aux: "Int64", typ: "Flags"}, // (arg0 + arg1<<auxInt) compare to 0
+ {name: "CMNshiftRL", argLength: 2, reg: gp2flags, asm: "CMN", aux: "Int64", typ: "Flags"}, // (arg0 + arg1>>auxInt) compare to 0, unsigned shift
+ {name: "CMNshiftRA", argLength: 2, reg: gp2flags, asm: "CMN", aux: "Int64", typ: "Flags"}, // (arg0 + arg1>>auxInt) compare to 0, signed shift
+ {name: "TSTshiftLL", argLength: 2, reg: gp2flags, asm: "TST", aux: "Int64", typ: "Flags"}, // (arg0 & arg1<<auxInt) compare to 0
+ {name: "TSTshiftRL", argLength: 2, reg: gp2flags, asm: "TST", aux: "Int64", typ: "Flags"}, // (arg0 & arg1>>auxInt) compare to 0, unsigned shift
+ {name: "TSTshiftRA", argLength: 2, reg: gp2flags, asm: "TST", aux: "Int64", typ: "Flags"}, // (arg0 & arg1>>auxInt) compare to 0, signed shift
+
+ // bitfield ops
+ // for all bitfield ops lsb is auxInt>>8, width is auxInt&0xff
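+	// e.g. (illustrative) auxInt = 0x0810 encodes lsb = 0x0810>>8 = 8 and width = 0x0810&0xff = 16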
+ // insert low width bits of arg1 into the result starting at bit lsb, copy other bits from arg0
+ {name: "BFI", argLength: 2, reg: gp21nog, asm: "BFI", aux: "ARM64BitField", resultInArg0: true},
+ // extract width bits of arg1 starting at bit lsb and insert at low end of result, copy other bits from arg0
+ {name: "BFXIL", argLength: 2, reg: gp21nog, asm: "BFXIL", aux: "ARM64BitField", resultInArg0: true},
+ // insert low width bits of arg0 into the result starting at bit lsb, bits to the left of the inserted bit field are set to the high/sign bit of the inserted bit field, bits to the right are zeroed
+ {name: "SBFIZ", argLength: 1, reg: gp11, asm: "SBFIZ", aux: "ARM64BitField"},
+ // extract width bits of arg0 starting at bit lsb and insert at low end of result, remaining high bits are set to the high/sign bit of the extracted bitfield
+ {name: "SBFX", argLength: 1, reg: gp11, asm: "SBFX", aux: "ARM64BitField"},
+ // insert low width bits of arg0 into the result starting at bit lsb, bits to the left and right of the inserted bit field are zeroed
+ {name: "UBFIZ", argLength: 1, reg: gp11, asm: "UBFIZ", aux: "ARM64BitField"},
+ // extract width bits of arg0 starting at bit lsb and insert at low end of result, remaining high bits are zeroed
+ {name: "UBFX", argLength: 1, reg: gp11, asm: "UBFX", aux: "ARM64BitField"},
+
+ // moves
+ {name: "MOVDconst", argLength: 0, reg: gp01, aux: "Int64", asm: "MOVD", typ: "UInt64", rematerializeable: true}, // 64 bits from auxint
+ {name: "FMOVSconst", argLength: 0, reg: fp01, aux: "Float64", asm: "FMOVS", typ: "Float32", rematerializeable: true}, // auxint as 64-bit float, convert to 32-bit float
+ {name: "FMOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "FMOVD", typ: "Float64", rematerializeable: true}, // auxint as 64-bit float
+
+ {name: "MOVDaddr", argLength: 1, reg: regInfo{inputs: []regMask{buildReg("SP") | buildReg("SB")}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVD", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB
+
+ {name: "MOVBload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVB", typ: "Int8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVBUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVBU", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVH", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVHU", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVWload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVW", typ: "Int32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVWUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVWU", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVDload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVD", typ: "UInt64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "FMOVSload", argLength: 2, reg: fpload, aux: "SymOff", asm: "FMOVS", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "FMOVDload", argLength: 2, reg: fpload, aux: "SymOff", asm: "FMOVD", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+
+ // register indexed load
+ {name: "MOVDloadidx", argLength: 3, reg: gp2load, asm: "MOVD", typ: "UInt64"}, // load 64-bit dword from arg0 + arg1, arg2 = mem.
+ {name: "MOVWloadidx", argLength: 3, reg: gp2load, asm: "MOVW", typ: "Int32"}, // load 32-bit word from arg0 + arg1, sign-extended to 64-bit, arg2=mem.
+ {name: "MOVWUloadidx", argLength: 3, reg: gp2load, asm: "MOVWU", typ: "UInt32"}, // load 32-bit word from arg0 + arg1, zero-extended to 64-bit, arg2=mem.
+ {name: "MOVHloadidx", argLength: 3, reg: gp2load, asm: "MOVH", typ: "Int16"}, // load 16-bit word from arg0 + arg1, sign-extended to 64-bit, arg2=mem.
+ {name: "MOVHUloadidx", argLength: 3, reg: gp2load, asm: "MOVHU", typ: "UInt16"}, // load 16-bit word from arg0 + arg1, zero-extended to 64-bit, arg2=mem.
+ {name: "MOVBloadidx", argLength: 3, reg: gp2load, asm: "MOVB", typ: "Int8"}, // load 8-bit word from arg0 + arg1, sign-extended to 64-bit, arg2=mem.
+ {name: "MOVBUloadidx", argLength: 3, reg: gp2load, asm: "MOVBU", typ: "UInt8"}, // load 8-bit word from arg0 + arg1, zero-extended to 64-bit, arg2=mem.
+ {name: "FMOVSloadidx", argLength: 3, reg: fp2load, asm: "FMOVS", typ: "Float32"}, // load 32-bit float from arg0 + arg1, arg2=mem.
+ {name: "FMOVDloadidx", argLength: 3, reg: fp2load, asm: "FMOVD", typ: "Float64"}, // load 64-bit float from arg0 + arg1, arg2=mem.
+
+ // shifted register indexed load
+ {name: "MOVHloadidx2", argLength: 3, reg: gp2load, asm: "MOVH", typ: "Int16"}, // load 16-bit half-word from arg0 + arg1*2, sign-extended to 64-bit, arg2=mem.
+ {name: "MOVHUloadidx2", argLength: 3, reg: gp2load, asm: "MOVHU", typ: "UInt16"}, // load 16-bit half-word from arg0 + arg1*2, zero-extended to 64-bit, arg2=mem.
+ {name: "MOVWloadidx4", argLength: 3, reg: gp2load, asm: "MOVW", typ: "Int32"}, // load 32-bit word from arg0 + arg1*4, sign-extended to 64-bit, arg2=mem.
+ {name: "MOVWUloadidx4", argLength: 3, reg: gp2load, asm: "MOVWU", typ: "UInt32"}, // load 32-bit word from arg0 + arg1*4, zero-extended to 64-bit, arg2=mem.
+ {name: "MOVDloadidx8", argLength: 3, reg: gp2load, asm: "MOVD", typ: "UInt64"}, // load 64-bit double-word from arg0 + arg1*8, arg2 = mem.
+
+ {name: "MOVBstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVHstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVWstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVDstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "STP", argLength: 4, reg: gpstore2, aux: "SymOff", asm: "STP", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 16 bytes of arg1 and arg2 to arg0 + auxInt + aux. arg3=mem.
+ {name: "FMOVSstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "FMOVS", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "FMOVDstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "FMOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+
+ // register indexed store
+ {name: "MOVBstoreidx", argLength: 4, reg: gpstore2, asm: "MOVB", typ: "Mem"}, // store 1 byte of arg2 to arg0 + arg1, arg3 = mem.
+ {name: "MOVHstoreidx", argLength: 4, reg: gpstore2, asm: "MOVH", typ: "Mem"}, // store 2 bytes of arg2 to arg0 + arg1, arg3 = mem.
+ {name: "MOVWstoreidx", argLength: 4, reg: gpstore2, asm: "MOVW", typ: "Mem"}, // store 4 bytes of arg2 to arg0 + arg1, arg3 = mem.
+ {name: "MOVDstoreidx", argLength: 4, reg: gpstore2, asm: "MOVD", typ: "Mem"}, // store 8 bytes of arg2 to arg0 + arg1, arg3 = mem.
+ {name: "FMOVSstoreidx", argLength: 4, reg: fpstore2, asm: "FMOVS", typ: "Mem"}, // store 32-bit float of arg2 to arg0 + arg1, arg3=mem.
+ {name: "FMOVDstoreidx", argLength: 4, reg: fpstore2, asm: "FMOVD", typ: "Mem"}, // store 64-bit float of arg2 to arg0 + arg1, arg3=mem.
+
+ // shifted register indexed store
+ {name: "MOVHstoreidx2", argLength: 4, reg: gpstore2, asm: "MOVH", typ: "Mem"}, // store 2 bytes of arg2 to arg0 + arg1*2, arg3 = mem.
+ {name: "MOVWstoreidx4", argLength: 4, reg: gpstore2, asm: "MOVW", typ: "Mem"}, // store 4 bytes of arg2 to arg0 + arg1*4, arg3 = mem.
+ {name: "MOVDstoreidx8", argLength: 4, reg: gpstore2, asm: "MOVD", typ: "Mem"}, // store 8 bytes of arg2 to arg0 + arg1*8, arg3 = mem.
+
+ {name: "MOVBstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of zero to arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of zero to arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVWstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of zero to arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVDstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of zero to arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVQstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "STP", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 16 bytes of zero to arg0 + auxInt + aux. arg1=mem.
+
+ // register indexed store zero
+ {name: "MOVBstorezeroidx", argLength: 3, reg: gpstore, asm: "MOVB", typ: "Mem"}, // store 1 byte of zero to arg0 + arg1, arg2 = mem.
+ {name: "MOVHstorezeroidx", argLength: 3, reg: gpstore, asm: "MOVH", typ: "Mem"}, // store 2 bytes of zero to arg0 + arg1, arg2 = mem.
+ {name: "MOVWstorezeroidx", argLength: 3, reg: gpstore, asm: "MOVW", typ: "Mem"}, // store 4 bytes of zero to arg0 + arg1, arg2 = mem.
+ {name: "MOVDstorezeroidx", argLength: 3, reg: gpstore, asm: "MOVD", typ: "Mem"}, // store 8 bytes of zero to arg0 + arg1, arg2 = mem.
+
+ // shifted register indexed store zero
+ {name: "MOVHstorezeroidx2", argLength: 3, reg: gpstore, asm: "MOVH", typ: "Mem"}, // store 2 bytes of zero to arg0 + arg1*2, arg2 = mem.
+ {name: "MOVWstorezeroidx4", argLength: 3, reg: gpstore, asm: "MOVW", typ: "Mem"}, // store 4 bytes of zero to arg0 + arg1*4, arg2 = mem.
+ {name: "MOVDstorezeroidx8", argLength: 3, reg: gpstore, asm: "MOVD", typ: "Mem"}, // store 8 bytes of zero to arg0 + arg1*8, arg2 = mem.
+
+ {name: "FMOVDgpfp", argLength: 1, reg: gpfp, asm: "FMOVD"}, // move int64 to float64 (no conversion)
+ {name: "FMOVDfpgp", argLength: 1, reg: fpgp, asm: "FMOVD"}, // move float64 to int64 (no conversion)
+	{name: "FMOVSgpfp", argLength: 1, reg: gpfp, asm: "FMOVS"}, // move 32 bits from int to float reg (no conversion)
+	{name: "FMOVSfpgp", argLength: 1, reg: fpgp, asm: "FMOVS"}, // move 32 bits from float to int reg, zero extend (no conversion)
+
+ // conversions
+ {name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB"}, // move from arg0, sign-extended from byte
+	{name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, zero-extended from byte
+ {name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVH"}, // move from arg0, sign-extended from half
+	{name: "MOVHUreg", argLength: 1, reg: gp11, asm: "MOVHU"}, // move from arg0, zero-extended from half
+ {name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW"}, // move from arg0, sign-extended from word
+	{name: "MOVWUreg", argLength: 1, reg: gp11, asm: "MOVWU"}, // move from arg0, zero-extended from word
+ {name: "MOVDreg", argLength: 1, reg: gp11, asm: "MOVD"}, // move from arg0
+
+ {name: "MOVDnop", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}, resultInArg0: true}, // nop, return arg0 in same register
+
+ {name: "SCVTFWS", argLength: 1, reg: gpfp, asm: "SCVTFWS"}, // int32 -> float32
+ {name: "SCVTFWD", argLength: 1, reg: gpfp, asm: "SCVTFWD"}, // int32 -> float64
+ {name: "UCVTFWS", argLength: 1, reg: gpfp, asm: "UCVTFWS"}, // uint32 -> float32
+ {name: "UCVTFWD", argLength: 1, reg: gpfp, asm: "UCVTFWD"}, // uint32 -> float64
+ {name: "SCVTFS", argLength: 1, reg: gpfp, asm: "SCVTFS"}, // int64 -> float32
+ {name: "SCVTFD", argLength: 1, reg: gpfp, asm: "SCVTFD"}, // int64 -> float64
+ {name: "UCVTFS", argLength: 1, reg: gpfp, asm: "UCVTFS"}, // uint64 -> float32
+ {name: "UCVTFD", argLength: 1, reg: gpfp, asm: "UCVTFD"}, // uint64 -> float64
+ {name: "FCVTZSSW", argLength: 1, reg: fpgp, asm: "FCVTZSSW"}, // float32 -> int32
+ {name: "FCVTZSDW", argLength: 1, reg: fpgp, asm: "FCVTZSDW"}, // float64 -> int32
+ {name: "FCVTZUSW", argLength: 1, reg: fpgp, asm: "FCVTZUSW"}, // float32 -> uint32
+ {name: "FCVTZUDW", argLength: 1, reg: fpgp, asm: "FCVTZUDW"}, // float64 -> uint32
+ {name: "FCVTZSS", argLength: 1, reg: fpgp, asm: "FCVTZSS"}, // float32 -> int64
+ {name: "FCVTZSD", argLength: 1, reg: fpgp, asm: "FCVTZSD"}, // float64 -> int64
+ {name: "FCVTZUS", argLength: 1, reg: fpgp, asm: "FCVTZUS"}, // float32 -> uint64
+ {name: "FCVTZUD", argLength: 1, reg: fpgp, asm: "FCVTZUD"}, // float64 -> uint64
+ {name: "FCVTSD", argLength: 1, reg: fp11, asm: "FCVTSD"}, // float32 -> float64
+ {name: "FCVTDS", argLength: 1, reg: fp11, asm: "FCVTDS"}, // float64 -> float32
+
+ // floating-point round to integral
+ {name: "FRINTAD", argLength: 1, reg: fp11, asm: "FRINTAD"},
+ {name: "FRINTMD", argLength: 1, reg: fp11, asm: "FRINTMD"},
+ {name: "FRINTND", argLength: 1, reg: fp11, asm: "FRINTND"},
+ {name: "FRINTPD", argLength: 1, reg: fp11, asm: "FRINTPD"},
+ {name: "FRINTZD", argLength: 1, reg: fp11, asm: "FRINTZD"},
+
+ // conditional instructions; auxint is
+ // one of the arm64 comparison pseudo-ops (LessThan, LessThanU, etc.)
+ {name: "CSEL", argLength: 3, reg: gp2flags1, asm: "CSEL", aux: "CCop"}, // auxint(flags) ? arg0 : arg1
+ {name: "CSEL0", argLength: 2, reg: gp1flags1, asm: "CSEL", aux: "CCop"}, // auxint(flags) ? arg0 : 0
+
+ // function calls
+ {name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gpsp, buildReg("R26"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+ {name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+
+ // pseudo-ops
+ {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpg}}, nilCheck: true, faultOnNilArg0: true}, // panic if arg0 is nil. arg1=mem.
+
+	{name: "Equal", argLength: 1, reg: readflags},            // bool, true if flags encode x==y, false otherwise.
+	{name: "NotEqual", argLength: 1, reg: readflags},         // bool, true if flags encode x!=y, false otherwise.
+	{name: "LessThan", argLength: 1, reg: readflags},         // bool, true if flags encode signed x<y, false otherwise.
+	{name: "LessEqual", argLength: 1, reg: readflags},        // bool, true if flags encode signed x<=y, false otherwise.
+	{name: "GreaterThan", argLength: 1, reg: readflags},      // bool, true if flags encode signed x>y, false otherwise.
+	{name: "GreaterEqual", argLength: 1, reg: readflags},     // bool, true if flags encode signed x>=y, false otherwise.
+	{name: "LessThanU", argLength: 1, reg: readflags},        // bool, true if flags encode unsigned x<y, false otherwise.
+	{name: "LessEqualU", argLength: 1, reg: readflags},       // bool, true if flags encode unsigned x<=y, false otherwise.
+	{name: "GreaterThanU", argLength: 1, reg: readflags},     // bool, true if flags encode unsigned x>y, false otherwise.
+	{name: "GreaterEqualU", argLength: 1, reg: readflags},    // bool, true if flags encode unsigned x>=y, false otherwise.
+	{name: "LessThanF", argLength: 1, reg: readflags},        // bool, true if flags encode floating-point x<y, false otherwise.
+	{name: "LessEqualF", argLength: 1, reg: readflags},       // bool, true if flags encode floating-point x<=y, false otherwise.
+	{name: "GreaterThanF", argLength: 1, reg: readflags},     // bool, true if flags encode floating-point x>y, false otherwise.
+	{name: "GreaterEqualF", argLength: 1, reg: readflags},    // bool, true if flags encode floating-point x>=y, false otherwise.
+	{name: "NotLessThanF", argLength: 1, reg: readflags},     // bool, true if flags encode floating-point x>=y || x is unordered with y, false otherwise.
+	{name: "NotLessEqualF", argLength: 1, reg: readflags},    // bool, true if flags encode floating-point x>y || x is unordered with y, false otherwise.
+	{name: "NotGreaterThanF", argLength: 1, reg: readflags},  // bool, true if flags encode floating-point x<=y || x is unordered with y, false otherwise.
+	{name: "NotGreaterEqualF", argLength: 1, reg: readflags}, // bool, true if flags encode floating-point x<y || x is unordered with y, false otherwise.
+ // duffzero
+ // arg0 = address of memory to zero
+ // arg1 = mem
+ // auxint = offset into duffzero code to start executing
+ // returns mem
+ // R20 changed as side effect
+ // R16 and R17 may be clobbered by linker trampoline.
+ {
+ name: "DUFFZERO",
+ aux: "Int64",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R20")},
+ clobbers: buildReg("R16 R17 R20 R30"),
+ },
+ faultOnNilArg0: true,
+ unsafePoint: true, // FP maintenance around DUFFZERO can be clobbered by interrupts
+ },
+
+ // large zeroing
+ // arg0 = address of memory to zero (in R16 aka arm64.REGRT1, changed as side effect)
+ // arg1 = address of the last 16-byte unit to zero
+ // arg2 = mem
+ // returns mem
+ // STP.P (ZR,ZR), 16(R16)
+ // CMP Rarg1, R16
+ // BLE -2(PC)
+	// Note: the end of the memory to zero may not be a valid pointer; it's a problem if it is spilled.
+	// The end minus 16 is within the area to zero, so it is ok to spill.
+ {
+ name: "LoweredZero",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R16"), gp},
+ clobbers: buildReg("R16"),
+ },
+ clobberFlags: true,
+ faultOnNilArg0: true,
+ },
+
+ // duffcopy
+ // arg0 = address of dst memory (in R21, changed as side effect)
+ // arg1 = address of src memory (in R20, changed as side effect)
+ // arg2 = mem
+ // auxint = offset into duffcopy code to start executing
+ // returns mem
+ // R20, R21 changed as side effect
+ // R16 and R17 may be clobbered by linker trampoline.
+ {
+ name: "DUFFCOPY",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R21"), buildReg("R20")},
+ clobbers: buildReg("R16 R17 R20 R21 R26 R30"),
+ },
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts
+ },
+
+ // large move
+ // arg0 = address of dst memory (in R17 aka arm64.REGRT2, changed as side effect)
+ // arg1 = address of src memory (in R16 aka arm64.REGRT1, changed as side effect)
+ // arg2 = address of the last element of src
+ // arg3 = mem
+ // returns mem
+ // MOVD.P 8(R16), Rtmp
+ // MOVD.P Rtmp, 8(R17)
+ // CMP Rarg2, R16
+ // BLE -3(PC)
+	// Note: the end of src may not be a valid pointer; it's a problem if it is spilled.
+	// The end of src minus 8 is within the area to copy, so it is ok to spill.
+ {
+ name: "LoweredMove",
+ argLength: 4,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R17"), buildReg("R16"), gp},
+ clobbers: buildReg("R16 R17"),
+ },
+ clobberFlags: true,
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
+ // and sorts it to the very beginning of the block to prevent other
+ // use of R26 (arm64.REGCTXT, the closure pointer)
+ {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("R26")}}, zeroWidth: true},
+
+ // LoweredGetCallerSP returns the SP of the caller of the current function.
+ {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true},
+
+ // LoweredGetCallerPC evaluates to the PC to which its "caller" will return.
+	// I.e., if f calls g, and g calls getcallerpc,
+ // the result should be the PC within f that g will return to.
+ // See runtime/stubs.go for a more detailed discussion.
+ {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true},
+
+ // Constant flag value.
+ // Note: there's an "unordered" outcome for floating-point
+ // comparisons, but we don't use such a beast yet.
+ // This op is for temporary use by rewrite rules. It
+ // cannot appear in the generated assembly.
+ {name: "FlagConstant", aux: "FlagConstant"},
+
+ // (InvertFlags (CMP a b)) == (CMP b a)
+ // InvertFlags is a pseudo-op which can't appear in assembly output.
+ {name: "InvertFlags", argLength: 1}, // reverse direction of arg0
+
+ // atomic loads.
+ // load from arg0. arg1=mem. auxint must be zero.
+ // returns <value,memory> so they can be properly ordered with other loads.
+ {name: "LDAR", argLength: 2, reg: gpload, asm: "LDAR", faultOnNilArg0: true},
+ {name: "LDARB", argLength: 2, reg: gpload, asm: "LDARB", faultOnNilArg0: true},
+ {name: "LDARW", argLength: 2, reg: gpload, asm: "LDARW", faultOnNilArg0: true},
+
+ // atomic stores.
+ // store arg1 to arg0. arg2=mem. returns memory. auxint must be zero.
+ {name: "STLRB", argLength: 3, reg: gpstore, asm: "STLRB", faultOnNilArg0: true, hasSideEffects: true},
+ {name: "STLR", argLength: 3, reg: gpstore, asm: "STLR", faultOnNilArg0: true, hasSideEffects: true},
+ {name: "STLRW", argLength: 3, reg: gpstore, asm: "STLRW", faultOnNilArg0: true, hasSideEffects: true},
+
+ // atomic exchange.
+ // store arg1 to arg0. arg2=mem. returns <old content of *arg0, memory>. auxint must be zero.
+ // LDAXR (Rarg0), Rout
+ // STLXR Rarg1, (Rarg0), Rtmp
+ // CBNZ Rtmp, -2(PC)
+ {name: "LoweredAtomicExchange64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicExchange32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // atomic exchange variant.
+ // store arg1 to arg0. arg2=mem. returns <old content of *arg0, memory>. auxint must be zero.
+ // SWPALD Rarg1, (Rarg0), Rout
+ {name: "LoweredAtomicExchange64Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicExchange32Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
+
+ // atomic add.
+ // *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero.
+ // LDAXR (Rarg0), Rout
+ // ADD Rarg1, Rout
+ // STLXR Rout, (Rarg0), Rtmp
+ // CBNZ Rtmp, -3(PC)
+ {name: "LoweredAtomicAdd64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicAdd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // atomic add variant.
+ // *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero.
+ // LDADDAL (Rarg0), Rarg1, Rout
+ // ADD Rarg1, Rout
+ {name: "LoweredAtomicAdd64Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicAdd32Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
+
+ // atomic compare and swap.
+ // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory. auxint must be zero.
+ // if *arg0 == arg1 {
+ // *arg0 = arg2
+ // return (true, memory)
+ // } else {
+ // return (false, memory)
+ // }
+ // LDAXR (Rarg0), Rtmp
+ // CMP Rarg1, Rtmp
+ // BNE 3(PC)
+ // STLXR Rarg2, (Rarg0), Rtmp
+ // CBNZ Rtmp, -4(PC)
+ // CSET EQ, Rout
+ {name: "LoweredAtomicCas64", argLength: 4, reg: gpcas, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicCas32", argLength: 4, reg: gpcas, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // atomic compare and swap variant.
+ // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory. auxint must be zero.
+ // if *arg0 == arg1 {
+ // *arg0 = arg2
+ // return (true, memory)
+ // } else {
+ // return (false, memory)
+ // }
+ // MOV Rarg1, Rtmp
+ // CASAL Rtmp, (Rarg0), Rarg2
+ // CMP Rarg1, Rtmp
+ // CSET EQ, Rout
+ {name: "LoweredAtomicCas64Variant", argLength: 4, reg: gpcas, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicCas32Variant", argLength: 4, reg: gpcas, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // atomic and/or.
+ // *arg0 &= (|=) arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero.
+ // LDAXR (Rarg0), Rout
+ // AND/OR Rarg1, Rout
+ // STLXR Rout, (Rarg0), Rtmp
+ // CBNZ Rtmp, -3(PC)
+ {name: "LoweredAtomicAnd8", argLength: 3, reg: gpxchg, resultNotInArgs: true, asm: "AND", typ: "(UInt8,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicAnd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, asm: "AND", typ: "(UInt32,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicOr8", argLength: 3, reg: gpxchg, resultNotInArgs: true, asm: "ORR", typ: "(UInt8,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicOr32", argLength: 3, reg: gpxchg, resultNotInArgs: true, asm: "ORR", typ: "(UInt32,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // atomic and/or variant.
+ // *arg0 &= (|=) arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero.
+ // AND:
+	//   MVN       Rarg1, Rtemp
+ // LDANDALB Rtemp, (Rarg0), Rout
+ // AND Rarg1, Rout
+ // OR:
+ // LDORALB Rarg1, (Rarg0), Rout
+ // ORR Rarg1, Rout
+ {name: "LoweredAtomicAnd8Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, typ: "(UInt8,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicAnd32Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, typ: "(UInt32,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicOr8Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, typ: "(UInt8,Mem)", faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicOr32Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, typ: "(UInt32,Mem)", faultOnNilArg0: true, hasSideEffects: true},
+
+ // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+ // It saves all GP registers if necessary,
+ // but clobbers R30 (LR) because it's a call.
+ // R16 and R17 may be clobbered by linker trampoline.
+ {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("R2"), buildReg("R3")}, clobbers: (callerSave &^ gpg) | buildReg("R16 R17 R30")}, clobberFlags: true, aux: "Sym", symEffect: "None"},
+
+ // There are three of these functions so that they can have three different register inputs.
+ // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
+ // default registers to match so we don't need to copy registers around unnecessarily.
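+	// For example (illustrative): for a full slice expression s[a:b:c], a is checked
+	// against b (C), b against c (B), and c against cap(s) (A), so keeping a, b, c in
+	// R0, R1, R2 (and cap in R3) lets the three checks share their inputs without
+	// extra register moves.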
+ {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
+ {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
+ {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r0, r1}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
+ }
+
+ blocks := []blockData{
+ {name: "EQ", controls: 1},
+ {name: "NE", controls: 1},
+ {name: "LT", controls: 1},
+ {name: "LE", controls: 1},
+ {name: "GT", controls: 1},
+ {name: "GE", controls: 1},
+ {name: "ULT", controls: 1},
+ {name: "ULE", controls: 1},
+ {name: "UGT", controls: 1},
+ {name: "UGE", controls: 1},
+ {name: "Z", controls: 1}, // Control == 0 (take a register instead of flags)
+ {name: "NZ", controls: 1}, // Control != 0
+ {name: "ZW", controls: 1}, // Control == 0, 32-bit
+ {name: "NZW", controls: 1}, // Control != 0, 32-bit
+ {name: "TBZ", controls: 1, aux: "Int64"}, // Control & (1 << AuxInt) == 0
+ {name: "TBNZ", controls: 1, aux: "Int64"}, // Control & (1 << AuxInt) != 0
+ {name: "FLT", controls: 1},
+ {name: "FLE", controls: 1},
+ {name: "FGT", controls: 1},
+ {name: "FGE", controls: 1},
+ {name: "LTnoov", controls: 1}, // 'LT' but without honoring overflow
+ {name: "LEnoov", controls: 1}, // 'LE' but without honoring overflow
+ {name: "GTnoov", controls: 1}, // 'GT' but without honoring overflow
+ {name: "GEnoov", controls: 1}, // 'GE' but without honoring overflow
+ }
+
+ archs = append(archs, arch{
+ name: "ARM64",
+ pkg: "cmd/internal/obj/arm64",
+ genfile: "../../arm64/ssa.go",
+ ops: ops,
+ blocks: blocks,
+ regnames: regNamesARM64,
+ gpregmask: gp,
+ fpregmask: fp,
+ framepointerreg: -1, // not used
+ linkreg: int8(num["R30"]),
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/gen/ARMOps.go b/src/cmd/compile/internal/ssa/gen/ARMOps.go
new file mode 100644
index 0000000..1a7eefa
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/ARMOps.go
@@ -0,0 +1,600 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+import "strings"
+
+// Notes:
+// - Integer types live in the low portion of registers. Upper portions are junk.
+// - Boolean types use the low-order byte of a register. 0=false, 1=true.
+// Upper bytes are junk.
+// - *const instructions may use a constant larger than the instruction can encode.
+// In this case the assembler expands it into multiple instructions and uses the
+// tmp register (R11).
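+// For example (sketch): an ADDconst whose constant cannot be encoded as an ARM
+// immediate might be assembled as a constant load into R11 followed by a
+// register-register ADD, which is why R11 is reserved as the assembler temporary
+// and left out of the general-purpose register mask below.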
+
+// Suffixes encode the bit width of various instructions.
+// W (word) = 32 bit
+// H (half word) = 16 bit
+// HU = 16 bit unsigned
+// B (byte) = 8 bit
+// BU = 8 bit unsigned
+// F (float) = 32 bit float
+// D (double) = 64 bit float
+
+var regNamesARM = []string{
+ "R0",
+ "R1",
+ "R2",
+ "R3",
+ "R4",
+ "R5",
+ "R6",
+ "R7",
+ "R8",
+ "R9",
+ "g", // aka R10
+ "R11", // tmp
+ "R12",
+ "SP", // aka R13
+ "R14", // link
+ "R15", // pc
+
+ "F0",
+ "F1",
+ "F2",
+ "F3",
+ "F4",
+ "F5",
+ "F6",
+ "F7",
+ "F8",
+ "F9",
+ "F10",
+ "F11",
+ "F12",
+ "F13",
+ "F14",
+ "F15", // tmp
+
+ // If you add registers, update asyncPreempt in runtime.
+
+ // pseudo-registers
+ "SB",
+}
+
+func init() {
+ // Make map from reg names to reg integers.
+ if len(regNamesARM) > 64 {
+ panic("too many registers")
+ }
+ num := map[string]int{}
+ for i, name := range regNamesARM {
+ num[name] = i
+ }
+ buildReg := func(s string) regMask {
+ m := regMask(0)
+ for _, r := range strings.Split(s, " ") {
+ if n, ok := num[r]; ok {
+ m |= regMask(1) << uint(n)
+ continue
+ }
+ panic("register " + r + " not found")
+ }
+ return m
+ }
+
+ // Common individual register masks
+ var (
+ gp = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12 R14")
+ gpg = gp | buildReg("g")
+ gpsp = gp | buildReg("SP")
+ gpspg = gpg | buildReg("SP")
+ gpspsbg = gpspg | buildReg("SB")
+ fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15")
+ callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g
+ r0 = buildReg("R0")
+ r1 = buildReg("R1")
+ r2 = buildReg("R2")
+ r3 = buildReg("R3")
+ r4 = buildReg("R4")
+ )
+ // Common regInfo
+ var (
+ gp01 = regInfo{inputs: nil, outputs: []regMask{gp}}
+ gp11 = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}}
+ gp11carry = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp, 0}}
+ gp11sp = regInfo{inputs: []regMask{gpspg}, outputs: []regMask{gp}}
+ gp1flags = regInfo{inputs: []regMask{gpg}}
+ gp1flags1 = regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}
+ gp21 = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp}}
+ gp21carry = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp, 0}}
+ gp2flags = regInfo{inputs: []regMask{gpg, gpg}}
+ gp2flags1 = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}}
+ gp22 = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp, gp}}
+ gp31 = regInfo{inputs: []regMask{gp, gp, gp}, outputs: []regMask{gp}}
+ gp31carry = regInfo{inputs: []regMask{gp, gp, gp}, outputs: []regMask{gp, 0}}
+ gp3flags = regInfo{inputs: []regMask{gp, gp, gp}}
+ gp3flags1 = regInfo{inputs: []regMask{gp, gp, gp}, outputs: []regMask{gp}}
+ gpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}
+ gpstore = regInfo{inputs: []regMask{gpspsbg, gpg}}
+ gp2load = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}}
+ gp2store = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}}
+ fp01 = regInfo{inputs: nil, outputs: []regMask{fp}}
+ fp11 = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}}
+ fp1flags = regInfo{inputs: []regMask{fp}}
+ fpgp = regInfo{inputs: []regMask{fp}, outputs: []regMask{gp}, clobbers: buildReg("F15")} // int-float conversion uses F15 as tmp
+ gpfp = regInfo{inputs: []regMask{gp}, outputs: []regMask{fp}, clobbers: buildReg("F15")}
+ fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
+ fp31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: []regMask{fp}}
+ fp2flags = regInfo{inputs: []regMask{fp, fp}}
+ fpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{fp}}
+ fpstore = regInfo{inputs: []regMask{gpspsbg, fp}}
+ readflags = regInfo{inputs: nil, outputs: []regMask{gp}}
+ )
+ ops := []opData{
+ // binary ops
+ {name: "ADD", argLength: 2, reg: gp21, asm: "ADD", commutative: true}, // arg0 + arg1
+ {name: "ADDconst", argLength: 1, reg: gp11sp, asm: "ADD", aux: "Int32"}, // arg0 + auxInt
+ {name: "SUB", argLength: 2, reg: gp21, asm: "SUB"}, // arg0 - arg1
+ {name: "SUBconst", argLength: 1, reg: gp11, asm: "SUB", aux: "Int32"}, // arg0 - auxInt
+ {name: "RSB", argLength: 2, reg: gp21, asm: "RSB"}, // arg1 - arg0
+ {name: "RSBconst", argLength: 1, reg: gp11, asm: "RSB", aux: "Int32"}, // auxInt - arg0
+ {name: "MUL", argLength: 2, reg: gp21, asm: "MUL", commutative: true}, // arg0 * arg1
+ {name: "HMUL", argLength: 2, reg: gp21, asm: "MULL", commutative: true}, // (arg0 * arg1) >> 32, signed
+ {name: "HMULU", argLength: 2, reg: gp21, asm: "MULLU", commutative: true}, // (arg0 * arg1) >> 32, unsigned
+
+ // udiv runtime call for soft division
+ // output0 = arg0/arg1, output1 = arg0%arg1
+ // see ../../../../../runtime/vlop_arm.s
+ {
+ name: "CALLudiv",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R1"), buildReg("R0")},
+ outputs: []regMask{buildReg("R0"), buildReg("R1")},
+ clobbers: buildReg("R2 R3 R14"),
+ },
+ clobberFlags: true,
+ typ: "(UInt32,UInt32)",
+ call: false, // TODO(mdempsky): Should this be true?
+ },
+
+ {name: "ADDS", argLength: 2, reg: gp21carry, asm: "ADD", commutative: true}, // arg0 + arg1, set carry flag
+ {name: "ADDSconst", argLength: 1, reg: gp11carry, asm: "ADD", aux: "Int32"}, // arg0 + auxInt, set carry flag
+ {name: "ADC", argLength: 3, reg: gp2flags1, asm: "ADC", commutative: true}, // arg0 + arg1 + carry, arg2=flags
+ {name: "ADCconst", argLength: 2, reg: gp1flags1, asm: "ADC", aux: "Int32"}, // arg0 + auxInt + carry, arg1=flags
+ {name: "SUBS", argLength: 2, reg: gp21carry, asm: "SUB"}, // arg0 - arg1, set carry flag
+ {name: "SUBSconst", argLength: 1, reg: gp11carry, asm: "SUB", aux: "Int32"}, // arg0 - auxInt, set carry flag
+ {name: "RSBSconst", argLength: 1, reg: gp11carry, asm: "RSB", aux: "Int32"}, // auxInt - arg0, set carry flag
+ {name: "SBC", argLength: 3, reg: gp2flags1, asm: "SBC"}, // arg0 - arg1 - carry, arg2=flags
+ {name: "SBCconst", argLength: 2, reg: gp1flags1, asm: "SBC", aux: "Int32"}, // arg0 - auxInt - carry, arg1=flags
+ {name: "RSCconst", argLength: 2, reg: gp1flags1, asm: "RSC", aux: "Int32"}, // auxInt - arg0 - carry, arg1=flags
+
+ {name: "MULLU", argLength: 2, reg: gp22, asm: "MULLU", commutative: true}, // arg0 * arg1, high 32 bits in out0, low 32 bits in out1
+ {name: "MULA", argLength: 3, reg: gp31, asm: "MULA"}, // arg0 * arg1 + arg2
+ {name: "MULS", argLength: 3, reg: gp31, asm: "MULS"}, // arg2 - arg0 * arg1
+
+ {name: "ADDF", argLength: 2, reg: fp21, asm: "ADDF", commutative: true}, // arg0 + arg1
+ {name: "ADDD", argLength: 2, reg: fp21, asm: "ADDD", commutative: true}, // arg0 + arg1
+ {name: "SUBF", argLength: 2, reg: fp21, asm: "SUBF"}, // arg0 - arg1
+ {name: "SUBD", argLength: 2, reg: fp21, asm: "SUBD"}, // arg0 - arg1
+ {name: "MULF", argLength: 2, reg: fp21, asm: "MULF", commutative: true}, // arg0 * arg1
+ {name: "MULD", argLength: 2, reg: fp21, asm: "MULD", commutative: true}, // arg0 * arg1
+ {name: "NMULF", argLength: 2, reg: fp21, asm: "NMULF", commutative: true}, // -(arg0 * arg1)
+ {name: "NMULD", argLength: 2, reg: fp21, asm: "NMULD", commutative: true}, // -(arg0 * arg1)
+ {name: "DIVF", argLength: 2, reg: fp21, asm: "DIVF"}, // arg0 / arg1
+ {name: "DIVD", argLength: 2, reg: fp21, asm: "DIVD"}, // arg0 / arg1
+
+ {name: "MULAF", argLength: 3, reg: fp31, asm: "MULAF", resultInArg0: true}, // arg0 + (arg1 * arg2)
+ {name: "MULAD", argLength: 3, reg: fp31, asm: "MULAD", resultInArg0: true}, // arg0 + (arg1 * arg2)
+ {name: "MULSF", argLength: 3, reg: fp31, asm: "MULSF", resultInArg0: true}, // arg0 - (arg1 * arg2)
+ {name: "MULSD", argLength: 3, reg: fp31, asm: "MULSD", resultInArg0: true}, // arg0 - (arg1 * arg2)
+
+ // FMULAD only exists on platforms with the VFPv4 instruction set.
+ // Any use must be preceded by a successful check of runtime.arm_support_vfpv4.
+ {name: "FMULAD", argLength: 3, reg: fp31, asm: "FMULAD", resultInArg0: true}, // arg0 + (arg1 * arg2)
+
+ {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1
+ {name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int32"}, // arg0 & auxInt
+ {name: "OR", argLength: 2, reg: gp21, asm: "ORR", commutative: true}, // arg0 | arg1
+ {name: "ORconst", argLength: 1, reg: gp11, asm: "ORR", aux: "Int32"}, // arg0 | auxInt
+ {name: "XOR", argLength: 2, reg: gp21, asm: "EOR", commutative: true}, // arg0 ^ arg1
+ {name: "XORconst", argLength: 1, reg: gp11, asm: "EOR", aux: "Int32"}, // arg0 ^ auxInt
+ {name: "BIC", argLength: 2, reg: gp21, asm: "BIC"}, // arg0 &^ arg1
+ {name: "BICconst", argLength: 1, reg: gp11, asm: "BIC", aux: "Int32"}, // arg0 &^ auxInt
+
+ // bit extraction, AuxInt = Width<<8 | LSB
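+	// e.g. (illustrative) AuxInt = 8<<8 | 4 = 0x0804 extracts the 8 bits starting at bit 4 of arg0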
+	{name: "BFX", argLength: 1, reg: gp11, asm: "BFX", aux: "Int32"},   // extract W bits from bit L in arg0, then sign-extend
+	{name: "BFXU", argLength: 1, reg: gp11, asm: "BFXU", aux: "Int32"}, // extract W bits from bit L in arg0, then zero-extend
+
+ // unary ops
+ {name: "MVN", argLength: 1, reg: gp11, asm: "MVN"}, // ^arg0
+
+ {name: "NEGF", argLength: 1, reg: fp11, asm: "NEGF"}, // -arg0, float32
+ {name: "NEGD", argLength: 1, reg: fp11, asm: "NEGD"}, // -arg0, float64
+ {name: "SQRTD", argLength: 1, reg: fp11, asm: "SQRTD"}, // sqrt(arg0), float64
+ {name: "ABSD", argLength: 1, reg: fp11, asm: "ABSD"}, // abs(arg0), float64
+
+ {name: "CLZ", argLength: 1, reg: gp11, asm: "CLZ"}, // count leading zero
+ {name: "REV", argLength: 1, reg: gp11, asm: "REV"}, // reverse byte order
+ {name: "REV16", argLength: 1, reg: gp11, asm: "REV16"}, // reverse byte order in 16-bit halfwords
+ {name: "RBIT", argLength: 1, reg: gp11, asm: "RBIT"}, // reverse bit order
+
+ // shifts
+ {name: "SLL", argLength: 2, reg: gp21, asm: "SLL"}, // arg0 << arg1, shift amount is mod 256
+ {name: "SLLconst", argLength: 1, reg: gp11, asm: "SLL", aux: "Int32"}, // arg0 << auxInt, 0 <= auxInt < 32
+ {name: "SRL", argLength: 2, reg: gp21, asm: "SRL"}, // arg0 >> arg1, unsigned, shift amount is mod 256
+ {name: "SRLconst", argLength: 1, reg: gp11, asm: "SRL", aux: "Int32"}, // arg0 >> auxInt, unsigned, 0 <= auxInt < 32
+ {name: "SRA", argLength: 2, reg: gp21, asm: "SRA"}, // arg0 >> arg1, signed, shift amount is mod 256
+ {name: "SRAconst", argLength: 1, reg: gp11, asm: "SRA", aux: "Int32"}, // arg0 >> auxInt, signed, 0 <= auxInt < 32
+ {name: "SRR", argLength: 2, reg: gp21}, // arg0 right rotate by arg1 bits
+ {name: "SRRconst", argLength: 1, reg: gp11, aux: "Int32"}, // arg0 right rotate by auxInt bits, 0 <= auxInt < 32
+
+	// auxInt for all of these satisfies 0 <= auxInt < 32
+ {name: "ADDshiftLL", argLength: 2, reg: gp21, asm: "ADD", aux: "Int32"}, // arg0 + arg1<<auxInt
+ {name: "ADDshiftRL", argLength: 2, reg: gp21, asm: "ADD", aux: "Int32"}, // arg0 + arg1>>auxInt, unsigned shift
+ {name: "ADDshiftRA", argLength: 2, reg: gp21, asm: "ADD", aux: "Int32"}, // arg0 + arg1>>auxInt, signed shift
+ {name: "SUBshiftLL", argLength: 2, reg: gp21, asm: "SUB", aux: "Int32"}, // arg0 - arg1<<auxInt
+ {name: "SUBshiftRL", argLength: 2, reg: gp21, asm: "SUB", aux: "Int32"}, // arg0 - arg1>>auxInt, unsigned shift
+ {name: "SUBshiftRA", argLength: 2, reg: gp21, asm: "SUB", aux: "Int32"}, // arg0 - arg1>>auxInt, signed shift
+ {name: "RSBshiftLL", argLength: 2, reg: gp21, asm: "RSB", aux: "Int32"}, // arg1<<auxInt - arg0
+ {name: "RSBshiftRL", argLength: 2, reg: gp21, asm: "RSB", aux: "Int32"}, // arg1>>auxInt - arg0, unsigned shift
+ {name: "RSBshiftRA", argLength: 2, reg: gp21, asm: "RSB", aux: "Int32"}, // arg1>>auxInt - arg0, signed shift
+ {name: "ANDshiftLL", argLength: 2, reg: gp21, asm: "AND", aux: "Int32"}, // arg0 & (arg1<<auxInt)
+ {name: "ANDshiftRL", argLength: 2, reg: gp21, asm: "AND", aux: "Int32"}, // arg0 & (arg1>>auxInt), unsigned shift
+ {name: "ANDshiftRA", argLength: 2, reg: gp21, asm: "AND", aux: "Int32"}, // arg0 & (arg1>>auxInt), signed shift
+ {name: "ORshiftLL", argLength: 2, reg: gp21, asm: "ORR", aux: "Int32"}, // arg0 | arg1<<auxInt
+ {name: "ORshiftRL", argLength: 2, reg: gp21, asm: "ORR", aux: "Int32"}, // arg0 | arg1>>auxInt, unsigned shift
+ {name: "ORshiftRA", argLength: 2, reg: gp21, asm: "ORR", aux: "Int32"}, // arg0 | arg1>>auxInt, signed shift
+ {name: "XORshiftLL", argLength: 2, reg: gp21, asm: "EOR", aux: "Int32"}, // arg0 ^ arg1<<auxInt
+ {name: "XORshiftRL", argLength: 2, reg: gp21, asm: "EOR", aux: "Int32"}, // arg0 ^ arg1>>auxInt, unsigned shift
+ {name: "XORshiftRA", argLength: 2, reg: gp21, asm: "EOR", aux: "Int32"}, // arg0 ^ arg1>>auxInt, signed shift
+ {name: "XORshiftRR", argLength: 2, reg: gp21, asm: "EOR", aux: "Int32"}, // arg0 ^ (arg1 right rotate by auxInt)
+ {name: "BICshiftLL", argLength: 2, reg: gp21, asm: "BIC", aux: "Int32"}, // arg0 &^ (arg1<<auxInt)
+ {name: "BICshiftRL", argLength: 2, reg: gp21, asm: "BIC", aux: "Int32"}, // arg0 &^ (arg1>>auxInt), unsigned shift
+ {name: "BICshiftRA", argLength: 2, reg: gp21, asm: "BIC", aux: "Int32"}, // arg0 &^ (arg1>>auxInt), signed shift
+ {name: "MVNshiftLL", argLength: 1, reg: gp11, asm: "MVN", aux: "Int32"}, // ^(arg0<<auxInt)
+ {name: "MVNshiftRL", argLength: 1, reg: gp11, asm: "MVN", aux: "Int32"}, // ^(arg0>>auxInt), unsigned shift
+ {name: "MVNshiftRA", argLength: 1, reg: gp11, asm: "MVN", aux: "Int32"}, // ^(arg0>>auxInt), signed shift
+
+ {name: "ADCshiftLL", argLength: 3, reg: gp2flags1, asm: "ADC", aux: "Int32"}, // arg0 + arg1<<auxInt + carry, arg2=flags
+ {name: "ADCshiftRL", argLength: 3, reg: gp2flags1, asm: "ADC", aux: "Int32"}, // arg0 + arg1>>auxInt + carry, unsigned shift, arg2=flags
+ {name: "ADCshiftRA", argLength: 3, reg: gp2flags1, asm: "ADC", aux: "Int32"}, // arg0 + arg1>>auxInt + carry, signed shift, arg2=flags
+ {name: "SBCshiftLL", argLength: 3, reg: gp2flags1, asm: "SBC", aux: "Int32"}, // arg0 - arg1<<auxInt - carry, arg2=flags
+ {name: "SBCshiftRL", argLength: 3, reg: gp2flags1, asm: "SBC", aux: "Int32"}, // arg0 - arg1>>auxInt - carry, unsigned shift, arg2=flags
+ {name: "SBCshiftRA", argLength: 3, reg: gp2flags1, asm: "SBC", aux: "Int32"}, // arg0 - arg1>>auxInt - carry, signed shift, arg2=flags
+ {name: "RSCshiftLL", argLength: 3, reg: gp2flags1, asm: "RSC", aux: "Int32"}, // arg1<<auxInt - arg0 - carry, arg2=flags
+ {name: "RSCshiftRL", argLength: 3, reg: gp2flags1, asm: "RSC", aux: "Int32"}, // arg1>>auxInt - arg0 - carry, unsigned shift, arg2=flags
+ {name: "RSCshiftRA", argLength: 3, reg: gp2flags1, asm: "RSC", aux: "Int32"}, // arg1>>auxInt - arg0 - carry, signed shift, arg2=flags
+
+ {name: "ADDSshiftLL", argLength: 2, reg: gp21carry, asm: "ADD", aux: "Int32"}, // arg0 + arg1<<auxInt, set carry flag
+ {name: "ADDSshiftRL", argLength: 2, reg: gp21carry, asm: "ADD", aux: "Int32"}, // arg0 + arg1>>auxInt, unsigned shift, set carry flag
+ {name: "ADDSshiftRA", argLength: 2, reg: gp21carry, asm: "ADD", aux: "Int32"}, // arg0 + arg1>>auxInt, signed shift, set carry flag
+ {name: "SUBSshiftLL", argLength: 2, reg: gp21carry, asm: "SUB", aux: "Int32"}, // arg0 - arg1<<auxInt, set carry flag
+ {name: "SUBSshiftRL", argLength: 2, reg: gp21carry, asm: "SUB", aux: "Int32"}, // arg0 - arg1>>auxInt, unsigned shift, set carry flag
+ {name: "SUBSshiftRA", argLength: 2, reg: gp21carry, asm: "SUB", aux: "Int32"}, // arg0 - arg1>>auxInt, signed shift, set carry flag
+ {name: "RSBSshiftLL", argLength: 2, reg: gp21carry, asm: "RSB", aux: "Int32"}, // arg1<<auxInt - arg0, set carry flag
+ {name: "RSBSshiftRL", argLength: 2, reg: gp21carry, asm: "RSB", aux: "Int32"}, // arg1>>auxInt - arg0, unsigned shift, set carry flag
+ {name: "RSBSshiftRA", argLength: 2, reg: gp21carry, asm: "RSB", aux: "Int32"}, // arg1>>auxInt - arg0, signed shift, set carry flag
+
+ {name: "ADDshiftLLreg", argLength: 3, reg: gp31, asm: "ADD"}, // arg0 + arg1<<arg2
+ {name: "ADDshiftRLreg", argLength: 3, reg: gp31, asm: "ADD"}, // arg0 + arg1>>arg2, unsigned shift
+ {name: "ADDshiftRAreg", argLength: 3, reg: gp31, asm: "ADD"}, // arg0 + arg1>>arg2, signed shift
+ {name: "SUBshiftLLreg", argLength: 3, reg: gp31, asm: "SUB"}, // arg0 - arg1<<arg2
+ {name: "SUBshiftRLreg", argLength: 3, reg: gp31, asm: "SUB"}, // arg0 - arg1>>arg2, unsigned shift
+ {name: "SUBshiftRAreg", argLength: 3, reg: gp31, asm: "SUB"}, // arg0 - arg1>>arg2, signed shift
+ {name: "RSBshiftLLreg", argLength: 3, reg: gp31, asm: "RSB"}, // arg1<<arg2 - arg0
+ {name: "RSBshiftRLreg", argLength: 3, reg: gp31, asm: "RSB"}, // arg1>>arg2 - arg0, unsigned shift
+ {name: "RSBshiftRAreg", argLength: 3, reg: gp31, asm: "RSB"}, // arg1>>arg2 - arg0, signed shift
+ {name: "ANDshiftLLreg", argLength: 3, reg: gp31, asm: "AND"}, // arg0 & (arg1<<arg2)
+ {name: "ANDshiftRLreg", argLength: 3, reg: gp31, asm: "AND"}, // arg0 & (arg1>>arg2), unsigned shift
+ {name: "ANDshiftRAreg", argLength: 3, reg: gp31, asm: "AND"}, // arg0 & (arg1>>arg2), signed shift
+ {name: "ORshiftLLreg", argLength: 3, reg: gp31, asm: "ORR"}, // arg0 | arg1<<arg2
+ {name: "ORshiftRLreg", argLength: 3, reg: gp31, asm: "ORR"}, // arg0 | arg1>>arg2, unsigned shift
+ {name: "ORshiftRAreg", argLength: 3, reg: gp31, asm: "ORR"}, // arg0 | arg1>>arg2, signed shift
+ {name: "XORshiftLLreg", argLength: 3, reg: gp31, asm: "EOR"}, // arg0 ^ arg1<<arg2
+ {name: "XORshiftRLreg", argLength: 3, reg: gp31, asm: "EOR"}, // arg0 ^ arg1>>arg2, unsigned shift
+ {name: "XORshiftRAreg", argLength: 3, reg: gp31, asm: "EOR"}, // arg0 ^ arg1>>arg2, signed shift
+ {name: "BICshiftLLreg", argLength: 3, reg: gp31, asm: "BIC"}, // arg0 &^ (arg1<<arg2)
+ {name: "BICshiftRLreg", argLength: 3, reg: gp31, asm: "BIC"}, // arg0 &^ (arg1>>arg2), unsigned shift
+ {name: "BICshiftRAreg", argLength: 3, reg: gp31, asm: "BIC"}, // arg0 &^ (arg1>>arg2), signed shift
+ {name: "MVNshiftLLreg", argLength: 2, reg: gp21, asm: "MVN"}, // ^(arg0<<arg1)
+ {name: "MVNshiftRLreg", argLength: 2, reg: gp21, asm: "MVN"}, // ^(arg0>>arg1), unsigned shift
+ {name: "MVNshiftRAreg", argLength: 2, reg: gp21, asm: "MVN"}, // ^(arg0>>arg1), signed shift
+
+ {name: "ADCshiftLLreg", argLength: 4, reg: gp3flags1, asm: "ADC"}, // arg0 + arg1<<arg2 + carry, arg3=flags
+ {name: "ADCshiftRLreg", argLength: 4, reg: gp3flags1, asm: "ADC"}, // arg0 + arg1>>arg2 + carry, unsigned shift, arg3=flags
+ {name: "ADCshiftRAreg", argLength: 4, reg: gp3flags1, asm: "ADC"}, // arg0 + arg1>>arg2 + carry, signed shift, arg3=flags
+ {name: "SBCshiftLLreg", argLength: 4, reg: gp3flags1, asm: "SBC"}, // arg0 - arg1<<arg2 - carry, arg3=flags
+ {name: "SBCshiftRLreg", argLength: 4, reg: gp3flags1, asm: "SBC"}, // arg0 - arg1>>arg2 - carry, unsigned shift, arg3=flags
+ {name: "SBCshiftRAreg", argLength: 4, reg: gp3flags1, asm: "SBC"}, // arg0 - arg1>>arg2 - carry, signed shift, arg3=flags
+ {name: "RSCshiftLLreg", argLength: 4, reg: gp3flags1, asm: "RSC"}, // arg1<<arg2 - arg0 - carry, arg3=flags
+ {name: "RSCshiftRLreg", argLength: 4, reg: gp3flags1, asm: "RSC"}, // arg1>>arg2 - arg0 - carry, unsigned shift, arg3=flags
+ {name: "RSCshiftRAreg", argLength: 4, reg: gp3flags1, asm: "RSC"}, // arg1>>arg2 - arg0 - carry, signed shift, arg3=flags
+
+ {name: "ADDSshiftLLreg", argLength: 3, reg: gp31carry, asm: "ADD"}, // arg0 + arg1<<arg2, set carry flag
+ {name: "ADDSshiftRLreg", argLength: 3, reg: gp31carry, asm: "ADD"}, // arg0 + arg1>>arg2, unsigned shift, set carry flag
+ {name: "ADDSshiftRAreg", argLength: 3, reg: gp31carry, asm: "ADD"}, // arg0 + arg1>>arg2, signed shift, set carry flag
+ {name: "SUBSshiftLLreg", argLength: 3, reg: gp31carry, asm: "SUB"}, // arg0 - arg1<<arg2, set carry flag
+ {name: "SUBSshiftRLreg", argLength: 3, reg: gp31carry, asm: "SUB"}, // arg0 - arg1>>arg2, unsigned shift, set carry flag
+ {name: "SUBSshiftRAreg", argLength: 3, reg: gp31carry, asm: "SUB"}, // arg0 - arg1>>arg2, signed shift, set carry flag
+ {name: "RSBSshiftLLreg", argLength: 3, reg: gp31carry, asm: "RSB"}, // arg1<<arg2 - arg0, set carry flag
+ {name: "RSBSshiftRLreg", argLength: 3, reg: gp31carry, asm: "RSB"}, // arg1>>arg2 - arg0, unsigned shift, set carry flag
+ {name: "RSBSshiftRAreg", argLength: 3, reg: gp31carry, asm: "RSB"}, // arg1>>arg2 - arg0, signed shift, set carry flag
+
+ // comparisons
+ {name: "CMP", argLength: 2, reg: gp2flags, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPconst", argLength: 1, reg: gp1flags, asm: "CMP", aux: "Int32", typ: "Flags"}, // arg0 compare to auxInt
+ {name: "CMN", argLength: 2, reg: gp2flags, asm: "CMN", typ: "Flags", commutative: true}, // arg0 compare to -arg1
+ {name: "CMNconst", argLength: 1, reg: gp1flags, asm: "CMN", aux: "Int32", typ: "Flags"}, // arg0 compare to -auxInt
+ {name: "TST", argLength: 2, reg: gp2flags, asm: "TST", typ: "Flags", commutative: true}, // arg0 & arg1 compare to 0
+ {name: "TSTconst", argLength: 1, reg: gp1flags, asm: "TST", aux: "Int32", typ: "Flags"}, // arg0 & auxInt compare to 0
+ {name: "TEQ", argLength: 2, reg: gp2flags, asm: "TEQ", typ: "Flags", commutative: true}, // arg0 ^ arg1 compare to 0
+ {name: "TEQconst", argLength: 1, reg: gp1flags, asm: "TEQ", aux: "Int32", typ: "Flags"}, // arg0 ^ auxInt compare to 0
+ {name: "CMPF", argLength: 2, reg: fp2flags, asm: "CMPF", typ: "Flags"}, // arg0 compare to arg1, float32
+ {name: "CMPD", argLength: 2, reg: fp2flags, asm: "CMPD", typ: "Flags"}, // arg0 compare to arg1, float64
+
+ {name: "CMPshiftLL", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int32", typ: "Flags"}, // arg0 compare to arg1<<auxInt
+ {name: "CMPshiftRL", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int32", typ: "Flags"}, // arg0 compare to arg1>>auxInt, unsigned shift
+ {name: "CMPshiftRA", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int32", typ: "Flags"}, // arg0 compare to arg1>>auxInt, signed shift
+ {name: "CMNshiftLL", argLength: 2, reg: gp2flags, asm: "CMN", aux: "Int32", typ: "Flags"}, // arg0 compare to -(arg1<<auxInt)
+ {name: "CMNshiftRL", argLength: 2, reg: gp2flags, asm: "CMN", aux: "Int32", typ: "Flags"}, // arg0 compare to -(arg1>>auxInt), unsigned shift
+ {name: "CMNshiftRA", argLength: 2, reg: gp2flags, asm: "CMN", aux: "Int32", typ: "Flags"}, // arg0 compare to -(arg1>>auxInt), signed shift
+ {name: "TSTshiftLL", argLength: 2, reg: gp2flags, asm: "TST", aux: "Int32", typ: "Flags"}, // arg0 & (arg1<<auxInt) compare to 0
+ {name: "TSTshiftRL", argLength: 2, reg: gp2flags, asm: "TST", aux: "Int32", typ: "Flags"}, // arg0 & (arg1>>auxInt) compare to 0, unsigned shift
+ {name: "TSTshiftRA", argLength: 2, reg: gp2flags, asm: "TST", aux: "Int32", typ: "Flags"}, // arg0 & (arg1>>auxInt) compare to 0, signed shift
+ {name: "TEQshiftLL", argLength: 2, reg: gp2flags, asm: "TEQ", aux: "Int32", typ: "Flags"}, // arg0 ^ (arg1<<auxInt) compare to 0
+ {name: "TEQshiftRL", argLength: 2, reg: gp2flags, asm: "TEQ", aux: "Int32", typ: "Flags"}, // arg0 ^ (arg1>>auxInt) compare to 0, unsigned shift
+ {name: "TEQshiftRA", argLength: 2, reg: gp2flags, asm: "TEQ", aux: "Int32", typ: "Flags"}, // arg0 ^ (arg1>>auxInt) compare to 0, signed shift
+
+ {name: "CMPshiftLLreg", argLength: 3, reg: gp3flags, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1<<arg2
+ {name: "CMPshiftRLreg", argLength: 3, reg: gp3flags, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1>>arg2, unsigned shift
+ {name: "CMPshiftRAreg", argLength: 3, reg: gp3flags, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1>>arg2, signed shift
+ {name: "CMNshiftLLreg", argLength: 3, reg: gp3flags, asm: "CMN", typ: "Flags"}, // arg0 + (arg1<<arg2) compare to 0
+ {name: "CMNshiftRLreg", argLength: 3, reg: gp3flags, asm: "CMN", typ: "Flags"}, // arg0 + (arg1>>arg2) compare to 0, unsigned shift
+ {name: "CMNshiftRAreg", argLength: 3, reg: gp3flags, asm: "CMN", typ: "Flags"}, // arg0 + (arg1>>arg2) compare to 0, signed shift
+ {name: "TSTshiftLLreg", argLength: 3, reg: gp3flags, asm: "TST", typ: "Flags"}, // arg0 & (arg1<<arg2) compare to 0
+ {name: "TSTshiftRLreg", argLength: 3, reg: gp3flags, asm: "TST", typ: "Flags"}, // arg0 & (arg1>>arg2) compare to 0, unsigned shift
+ {name: "TSTshiftRAreg", argLength: 3, reg: gp3flags, asm: "TST", typ: "Flags"}, // arg0 & (arg1>>arg2) compare to 0, signed shift
+ {name: "TEQshiftLLreg", argLength: 3, reg: gp3flags, asm: "TEQ", typ: "Flags"}, // arg0 ^ (arg1<<arg2) compare to 0
+ {name: "TEQshiftRLreg", argLength: 3, reg: gp3flags, asm: "TEQ", typ: "Flags"}, // arg0 ^ (arg1>>arg2) compare to 0, unsigned shift
+ {name: "TEQshiftRAreg", argLength: 3, reg: gp3flags, asm: "TEQ", typ: "Flags"}, // arg0 ^ (arg1>>arg2) compare to 0, signed shift
+
+ {name: "CMPF0", argLength: 1, reg: fp1flags, asm: "CMPF", typ: "Flags"}, // arg0 compare to 0, float32
+ {name: "CMPD0", argLength: 1, reg: fp1flags, asm: "CMPD", typ: "Flags"}, // arg0 compare to 0, float64
+
+ // moves
+ {name: "MOVWconst", argLength: 0, reg: gp01, aux: "Int32", asm: "MOVW", typ: "UInt32", rematerializeable: true}, // 32 low bits of auxint
+ {name: "MOVFconst", argLength: 0, reg: fp01, aux: "Float64", asm: "MOVF", typ: "Float32", rematerializeable: true}, // auxint as 64-bit float, convert to 32-bit float
+ {name: "MOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "MOVD", typ: "Float64", rematerializeable: true}, // auxint as 64-bit float
+
+ {name: "MOVWaddr", argLength: 1, reg: regInfo{inputs: []regMask{buildReg("SP") | buildReg("SB")}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVW", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB
+
+ {name: "MOVBload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVB", typ: "Int8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVBUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVBU", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVH", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVHU", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVWload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVW", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVFload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVF", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVDload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVD", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+
+ {name: "MOVBstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVHstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVWstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVFstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVF", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVDstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+
+ {name: "MOVWloadidx", argLength: 3, reg: gp2load, asm: "MOVW", typ: "UInt32"}, // load from arg0 + arg1. arg2=mem
+ {name: "MOVWloadshiftLL", argLength: 3, reg: gp2load, asm: "MOVW", aux: "Int32", typ: "UInt32"}, // load from arg0 + arg1<<auxInt. arg2=mem
+ {name: "MOVWloadshiftRL", argLength: 3, reg: gp2load, asm: "MOVW", aux: "Int32", typ: "UInt32"}, // load from arg0 + arg1>>auxInt, unsigned shift. arg2=mem
+ {name: "MOVWloadshiftRA", argLength: 3, reg: gp2load, asm: "MOVW", aux: "Int32", typ: "UInt32"}, // load from arg0 + arg1>>auxInt, signed shift. arg2=mem
+ {name: "MOVBUloadidx", argLength: 3, reg: gp2load, asm: "MOVBU", typ: "UInt8"}, // load from arg0 + arg1. arg2=mem
+ {name: "MOVBloadidx", argLength: 3, reg: gp2load, asm: "MOVB", typ: "Int8"}, // load from arg0 + arg1. arg2=mem
+ {name: "MOVHUloadidx", argLength: 3, reg: gp2load, asm: "MOVHU", typ: "UInt16"}, // load from arg0 + arg1. arg2=mem
+ {name: "MOVHloadidx", argLength: 3, reg: gp2load, asm: "MOVH", typ: "Int16"}, // load from arg0 + arg1. arg2=mem
+
+ {name: "MOVWstoreidx", argLength: 4, reg: gp2store, asm: "MOVW", typ: "Mem"}, // store arg2 to arg0 + arg1. arg3=mem
+ {name: "MOVWstoreshiftLL", argLength: 4, reg: gp2store, asm: "MOVW", aux: "Int32", typ: "Mem"}, // store arg2 to arg0 + arg1<<auxInt. arg3=mem
+ {name: "MOVWstoreshiftRL", argLength: 4, reg: gp2store, asm: "MOVW", aux: "Int32", typ: "Mem"}, // store arg2 to arg0 + arg1>>auxInt, unsigned shift. arg3=mem
+ {name: "MOVWstoreshiftRA", argLength: 4, reg: gp2store, asm: "MOVW", aux: "Int32", typ: "Mem"}, // store arg2 to arg0 + arg1>>auxInt, signed shift. arg3=mem
+ {name: "MOVBstoreidx", argLength: 4, reg: gp2store, asm: "MOVB", typ: "Mem"}, // store arg2 to arg0 + arg1. arg3=mem
+ {name: "MOVHstoreidx", argLength: 4, reg: gp2store, asm: "MOVH", typ: "Mem"}, // store arg2 to arg0 + arg1. arg3=mem
+
+ {name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVBS"}, // move from arg0, sign-extended from byte
+		{name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, zero-extended from byte
+ {name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVHS"}, // move from arg0, sign-extended from half
+		{name: "MOVHUreg", argLength: 1, reg: gp11, asm: "MOVHU"}, // move from arg0, zero-extended from half
+ {name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW"}, // move from arg0
+
+ {name: "MOVWnop", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}, resultInArg0: true}, // nop, return arg0 in same register
+
+ {name: "MOVWF", argLength: 1, reg: gpfp, asm: "MOVWF"}, // int32 -> float32
+ {name: "MOVWD", argLength: 1, reg: gpfp, asm: "MOVWD"}, // int32 -> float64
+ {name: "MOVWUF", argLength: 1, reg: gpfp, asm: "MOVWF"}, // uint32 -> float32, set U bit in the instruction
+ {name: "MOVWUD", argLength: 1, reg: gpfp, asm: "MOVWD"}, // uint32 -> float64, set U bit in the instruction
+ {name: "MOVFW", argLength: 1, reg: fpgp, asm: "MOVFW"}, // float32 -> int32
+ {name: "MOVDW", argLength: 1, reg: fpgp, asm: "MOVDW"}, // float64 -> int32
+ {name: "MOVFWU", argLength: 1, reg: fpgp, asm: "MOVFW"}, // float32 -> uint32, set U bit in the instruction
+ {name: "MOVDWU", argLength: 1, reg: fpgp, asm: "MOVDW"}, // float64 -> uint32, set U bit in the instruction
+ {name: "MOVFD", argLength: 1, reg: fp11, asm: "MOVFD"}, // float32 -> float64
+ {name: "MOVDF", argLength: 1, reg: fp11, asm: "MOVDF"}, // float64 -> float32
+
+ // conditional instructions, for lowering shifts
+ {name: "CMOVWHSconst", argLength: 2, reg: gp1flags1, asm: "MOVW", aux: "Int32", resultInArg0: true}, // replace arg0 w/ const if flags indicates HS, arg1=flags
+ {name: "CMOVWLSconst", argLength: 2, reg: gp1flags1, asm: "MOVW", aux: "Int32", resultInArg0: true}, // replace arg0 w/ const if flags indicates LS, arg1=flags
+ {name: "SRAcond", argLength: 3, reg: gp2flags1, asm: "SRA"}, // arg0 >> 31 if flags indicates HS, arg0 >> arg1 otherwise, signed shift, arg2=flags
+
+ // function calls
+ {name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gpsp, buildReg("R7"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+ {name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+
+ // pseudo-ops
+ {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpg}}, nilCheck: true, faultOnNilArg0: true}, // panic if arg0 is nil. arg1=mem.
+
+		{name: "Equal", argLength: 1, reg: readflags},         // bool, true if flags encode x==y, false otherwise.
+		{name: "NotEqual", argLength: 1, reg: readflags},      // bool, true if flags encode x!=y, false otherwise.
+		{name: "LessThan", argLength: 1, reg: readflags},      // bool, true if flags encode signed x<y, false otherwise.
+		{name: "LessEqual", argLength: 1, reg: readflags},     // bool, true if flags encode signed x<=y, false otherwise.
+		{name: "GreaterThan", argLength: 1, reg: readflags},   // bool, true if flags encode signed x>y, false otherwise.
+		{name: "GreaterEqual", argLength: 1, reg: readflags},  // bool, true if flags encode signed x>=y, false otherwise.
+		{name: "LessThanU", argLength: 1, reg: readflags},     // bool, true if flags encode unsigned x<y, false otherwise.
+		{name: "LessEqualU", argLength: 1, reg: readflags},    // bool, true if flags encode unsigned x<=y, false otherwise.
+		{name: "GreaterThanU", argLength: 1, reg: readflags},  // bool, true if flags encode unsigned x>y, false otherwise.
+		{name: "GreaterEqualU", argLength: 1, reg: readflags}, // bool, true if flags encode unsigned x>=y, false otherwise.
+
+ // duffzero (must be 4-byte aligned)
+ // arg0 = address of memory to zero (in R1, changed as side effect)
+ // arg1 = value to store (always zero)
+ // arg2 = mem
+ // auxint = offset into duffzero code to start executing
+ // returns mem
+ {
+ name: "DUFFZERO",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R1"), buildReg("R0")},
+ clobbers: buildReg("R1 R14"),
+ },
+ faultOnNilArg0: true,
+ },
+
+ // duffcopy (must be 4-byte aligned)
+ // arg0 = address of dst memory (in R2, changed as side effect)
+ // arg1 = address of src memory (in R1, changed as side effect)
+ // arg2 = mem
+ // auxint = offset into duffcopy code to start executing
+ // returns mem
+ {
+ name: "DUFFCOPY",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R2"), buildReg("R1")},
+ clobbers: buildReg("R0 R1 R2 R14"),
+ },
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // large or unaligned zeroing
+ // arg0 = address of memory to zero (in R1, changed as side effect)
+ // arg1 = address of the last element to zero
+ // arg2 = value to store (always zero)
+ // arg3 = mem
+ // returns mem
+ // MOVW.P Rarg2, 4(R1)
+ // CMP R1, Rarg1
+ // BLE -2(PC)
+ {
+ name: "LoweredZero",
+ aux: "Int64",
+ argLength: 4,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R1"), gp, gp},
+ clobbers: buildReg("R1"),
+ },
+ clobberFlags: true,
+ faultOnNilArg0: true,
+ },
+
+ // large or unaligned move
+ // arg0 = address of dst memory (in R2, changed as side effect)
+ // arg1 = address of src memory (in R1, changed as side effect)
+ // arg2 = address of the last element of src
+ // arg3 = mem
+ // returns mem
+ // MOVW.P 4(R1), Rtmp
+ // MOVW.P Rtmp, 4(R2)
+ // CMP R1, Rarg2
+ // BLE -3(PC)
+ {
+ name: "LoweredMove",
+ aux: "Int64",
+ argLength: 4,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R2"), buildReg("R1"), gp},
+ clobbers: buildReg("R1 R2"),
+ },
+ clobberFlags: true,
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
+ // and sorts it to the very beginning of the block to prevent other
+ // use of R7 (arm.REGCTXT, the closure pointer)
+ {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("R7")}}, zeroWidth: true},
+
+ // LoweredGetCallerSP returns the SP of the caller of the current function.
+ {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true},
+
+ // LoweredGetCallerPC evaluates to the PC to which its "caller" will return.
+	// I.e., if f calls g, and g calls getcallerpc,
+ // the result should be the PC within f that g will return to.
+ // See runtime/stubs.go for a more detailed discussion.
+ {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true},
+
+ // There are three of these functions so that they can have three different register inputs.
+ // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
+ // default registers to match so we don't need to copy registers around unnecessarily.
+ {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r0, r1}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ // Extend ops are the same as Bounds ops except the indexes are 64-bit.
+ {name: "LoweredPanicExtendA", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r4, r2, r3}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
+ {name: "LoweredPanicExtendB", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r4, r1, r2}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
+ {name: "LoweredPanicExtendC", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r4, r0, r1}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
+
+ // Constant flag value.
+ // Note: there's an "unordered" outcome for floating-point
+ // comparisons, but we don't use such a beast yet.
+ // This op is for temporary use by rewrite rules. It
+ // cannot appear in the generated assembly.
+ {name: "FlagConstant", aux: "FlagConstant"},
+
+ // (InvertFlags (CMP a b)) == (CMP b a)
+ // InvertFlags is a pseudo-op which can't appear in assembly output.
+ {name: "InvertFlags", argLength: 1}, // reverse direction of arg0
+
+ // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+ // It saves all GP registers if necessary,
+ // but clobbers R14 (LR) because it's a call.
+ {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("R2"), buildReg("R3")}, clobbers: (callerSave &^ gpg) | buildReg("R14")}, clobberFlags: true, aux: "Sym", symEffect: "None"},
+ }
+
+ blocks := []blockData{
+ {name: "EQ", controls: 1},
+ {name: "NE", controls: 1},
+ {name: "LT", controls: 1},
+ {name: "LE", controls: 1},
+ {name: "GT", controls: 1},
+ {name: "GE", controls: 1},
+ {name: "ULT", controls: 1},
+ {name: "ULE", controls: 1},
+ {name: "UGT", controls: 1},
+ {name: "UGE", controls: 1},
+ {name: "LTnoov", controls: 1}, // 'LT' but without honoring overflow
+ {name: "LEnoov", controls: 1}, // 'LE' but without honoring overflow
+ {name: "GTnoov", controls: 1}, // 'GT' but without honoring overflow
+ {name: "GEnoov", controls: 1}, // 'GE' but without honoring overflow
+ }
+
+ archs = append(archs, arch{
+ name: "ARM",
+ pkg: "cmd/internal/obj/arm",
+ genfile: "../../arm/ssa.go",
+ ops: ops,
+ blocks: blocks,
+ regnames: regNamesARM,
+ gpregmask: gp,
+ fpregmask: fp,
+ framepointerreg: -1, // not used
+ linkreg: int8(num["R14"]),
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/gen/MIPS.rules b/src/cmd/compile/internal/ssa/gen/MIPS.rules
new file mode 100644
index 0000000..8ad2c90
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/MIPS.rules
@@ -0,0 +1,697 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+(Add(Ptr|32|16|8) ...) => (ADD ...)
+(Add(32|64)F ...) => (ADD(F|D) ...)
+
+(Select0 (Add32carry <t> x y)) => (ADD <t.FieldType(0)> x y)
+(Select1 (Add32carry <t> x y)) => (SGTU <typ.Bool> x (ADD <t.FieldType(0)> x y))
+(Add32withcarry <t> x y c) => (ADD c (ADD <t> x y))
+
+(Sub(Ptr|32|16|8) ...) => (SUB ...)
+(Sub(32|64)F ...) => (SUB(F|D) ...)
+
+(Select0 (Sub32carry <t> x y)) => (SUB <t.FieldType(0)> x y)
+(Select1 (Sub32carry <t> x y)) => (SGTU <typ.Bool> (SUB <t.FieldType(0)> x y) x)
+(Sub32withcarry <t> x y c) => (SUB (SUB <t> x y) c)
+
+(Mul(32|16|8) ...) => (MUL ...)
+(Mul(32|64)F ...) => (MUL(F|D) ...)
+
+(Hmul(32|32u) x y) => (Select0 (MUL(T|TU) x y))
+(Mul32uhilo ...) => (MULTU ...)
+
+(Div32 x y) => (Select1 (DIV x y))
+(Div32u x y) => (Select1 (DIVU x y))
+(Div16 x y) => (Select1 (DIV (SignExt16to32 x) (SignExt16to32 y)))
+(Div16u x y) => (Select1 (DIVU (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Div8 x y) => (Select1 (DIV (SignExt8to32 x) (SignExt8to32 y)))
+(Div8u x y) => (Select1 (DIVU (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Div(32|64)F ...) => (DIV(F|D) ...)
+
+(Mod32 x y) => (Select0 (DIV x y))
+(Mod32u x y) => (Select0 (DIVU x y))
+(Mod16 x y) => (Select0 (DIV (SignExt16to32 x) (SignExt16to32 y)))
+(Mod16u x y) => (Select0 (DIVU (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Mod8 x y) => (Select0 (DIV (SignExt8to32 x) (SignExt8to32 y)))
+(Mod8u x y) => (Select0 (DIVU (ZeroExt8to32 x) (ZeroExt8to32 y)))
+
+// (x + y) / 2 with x>=y becomes (x - y) / 2 + y
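+// The sum x+y can overflow 32 bits, but with x>=y the difference x-y cannot,
+// and (x-y)/2 + y equals (x+y)/2 exactly, since x-y and x+y have the same parity.
+// For example, x=0xFFFFFFFF, y=1: (x-y)>>1 + y = 0x7FFFFFFF + 1 = 0x80000000.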
+(Avg32u <t> x y) => (ADD (SRLconst <t> (SUB <t> x y) [1]) y)
+
+(And(32|16|8) ...) => (AND ...)
+(Or(32|16|8) ...) => (OR ...)
+(Xor(32|16|8) ...) => (XOR ...)
+
+// constant shifts
+// generic opt rewrites all constant shifts to shift by Const64
+(Lsh32x64 x (Const64 [c])) && uint32(c) < 32 => (SLLconst x [int32(c)])
+(Rsh32x64 x (Const64 [c])) && uint32(c) < 32 => (SRAconst x [int32(c)])
+(Rsh32Ux64 x (Const64 [c])) && uint32(c) < 32 => (SRLconst x [int32(c)])
+(Lsh16x64 x (Const64 [c])) && uint32(c) < 16 => (SLLconst x [int32(c)])
+(Rsh16x64 x (Const64 [c])) && uint32(c) < 16 => (SRAconst (SLLconst <typ.UInt32> x [16]) [int32(c+16)])
+(Rsh16Ux64 x (Const64 [c])) && uint32(c) < 16 => (SRLconst (SLLconst <typ.UInt32> x [16]) [int32(c+16)])
+(Lsh8x64 x (Const64 [c])) && uint32(c) < 8 => (SLLconst x [int32(c)])
+(Rsh8x64 x (Const64 [c])) && uint32(c) < 8 => (SRAconst (SLLconst <typ.UInt32> x [24]) [int32(c+24)])
+(Rsh8Ux64 x (Const64 [c])) && uint32(c) < 8 => (SRLconst (SLLconst <typ.UInt32> x [24]) [int32(c+24)])
+
+// large constant shifts
+(Lsh32x64 _ (Const64 [c])) && uint32(c) >= 32 => (MOVWconst [0])
+(Rsh32Ux64 _ (Const64 [c])) && uint32(c) >= 32 => (MOVWconst [0])
+(Lsh16x64 _ (Const64 [c])) && uint32(c) >= 16 => (MOVWconst [0])
+(Rsh16Ux64 _ (Const64 [c])) && uint32(c) >= 16 => (MOVWconst [0])
+(Lsh8x64 _ (Const64 [c])) && uint32(c) >= 8 => (MOVWconst [0])
+(Rsh8Ux64 _ (Const64 [c])) && uint32(c) >= 8 => (MOVWconst [0])
+
+// for large constant signed right shifts, only the sign bit remains, so shift by 31
+(Rsh32x64 x (Const64 [c])) && uint32(c) >= 32 => (SRAconst x [31])
+(Rsh16x64 x (Const64 [c])) && uint32(c) >= 16 => (SRAconst (SLLconst <typ.UInt32> x [16]) [31])
+(Rsh8x64 x (Const64 [c])) && uint32(c) >= 8 => (SRAconst (SLLconst <typ.UInt32> x [24]) [31])
+
+// shifts
+// hardware instruction uses only the low 5 bits of the shift
+// we compare to 32 to ensure Go semantics for large shifts
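+// For example, in Lsh32x32 the condition (SGTUconst [32] y) is 1 when y < 32 (unsigned)
+// and 0 otherwise, so the CMOVZ keeps the hardware SLL result for small shift counts
+// and substitutes the constant 0 when the count is 32 or more, as Go requires.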
+(Lsh32x32 <t> x y) => (CMOVZ (SLL <t> x y) (MOVWconst [0]) (SGTUconst [32] y))
+(Lsh32x16 <t> x y) => (CMOVZ (SLL <t> x (ZeroExt16to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt16to32 y)))
+(Lsh32x8 <t> x y) => (CMOVZ (SLL <t> x (ZeroExt8to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt8to32 y)))
+
+(Lsh16x32 <t> x y) => (CMOVZ (SLL <t> x y) (MOVWconst [0]) (SGTUconst [32] y))
+(Lsh16x16 <t> x y) => (CMOVZ (SLL <t> x (ZeroExt16to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt16to32 y)))
+(Lsh16x8 <t> x y) => (CMOVZ (SLL <t> x (ZeroExt8to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt8to32 y)))
+
+(Lsh8x32 <t> x y) => (CMOVZ (SLL <t> x y) (MOVWconst [0]) (SGTUconst [32] y))
+(Lsh8x16 <t> x y) => (CMOVZ (SLL <t> x (ZeroExt16to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt16to32 y)))
+(Lsh8x8 <t> x y) => (CMOVZ (SLL <t> x (ZeroExt8to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt8to32 y)))
+
+(Rsh32Ux32 <t> x y) => (CMOVZ (SRL <t> x y) (MOVWconst [0]) (SGTUconst [32] y))
+(Rsh32Ux16 <t> x y) => (CMOVZ (SRL <t> x (ZeroExt16to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt16to32 y)))
+(Rsh32Ux8 <t> x y) => (CMOVZ (SRL <t> x (ZeroExt8to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt8to32 y)))
+
+(Rsh16Ux32 <t> x y) => (CMOVZ (SRL <t> (ZeroExt16to32 x) y) (MOVWconst [0]) (SGTUconst [32] y))
+(Rsh16Ux16 <t> x y) => (CMOVZ (SRL <t> (ZeroExt16to32 x) (ZeroExt16to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt16to32 y)))
+(Rsh16Ux8 <t> x y) => (CMOVZ (SRL <t> (ZeroExt16to32 x) (ZeroExt8to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt8to32 y)))
+
+(Rsh8Ux32 <t> x y) => (CMOVZ (SRL <t> (ZeroExt8to32 x) y) (MOVWconst [0]) (SGTUconst [32] y))
+(Rsh8Ux16 <t> x y) => (CMOVZ (SRL <t> (ZeroExt8to32 x) (ZeroExt16to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt16to32 y)))
+(Rsh8Ux8 <t> x y) => (CMOVZ (SRL <t> (ZeroExt8to32 x) (ZeroExt8to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt8to32 y)))
+
+(Rsh32x32 x y) => (SRA x ( CMOVZ <typ.UInt32> y (MOVWconst [31]) (SGTUconst [32] y)))
+(Rsh32x16 x y) => (SRA x ( CMOVZ <typ.UInt32> (ZeroExt16to32 y) (MOVWconst [31]) (SGTUconst [32] (ZeroExt16to32 y))))
+(Rsh32x8 x y) => (SRA x ( CMOVZ <typ.UInt32> (ZeroExt8to32 y) (MOVWconst [31]) (SGTUconst [32] (ZeroExt8to32 y))))
+
+(Rsh16x32 x y) => (SRA (SignExt16to32 x) ( CMOVZ <typ.UInt32> y (MOVWconst [31]) (SGTUconst [32] y)))
+(Rsh16x16 x y) => (SRA (SignExt16to32 x) ( CMOVZ <typ.UInt32> (ZeroExt16to32 y) (MOVWconst [31]) (SGTUconst [32] (ZeroExt16to32 y))))
+(Rsh16x8 x y) => (SRA (SignExt16to32 x) ( CMOVZ <typ.UInt32> (ZeroExt8to32 y) (MOVWconst [31]) (SGTUconst [32] (ZeroExt8to32 y))))
+
+(Rsh8x32 x y) => (SRA (SignExt16to32 x) ( CMOVZ <typ.UInt32> y (MOVWconst [31]) (SGTUconst [32] y)))
+(Rsh8x16 x y) => (SRA (SignExt16to32 x) ( CMOVZ <typ.UInt32> (ZeroExt16to32 y) (MOVWconst [31]) (SGTUconst [32] (ZeroExt16to32 y))))
+(Rsh8x8 x y) => (SRA (SignExt16to32 x) ( CMOVZ <typ.UInt32> (ZeroExt8to32 y) (MOVWconst [31]) (SGTUconst [32] (ZeroExt8to32 y))))
+
+// rotates
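+// A constant rotate left by c of an n-bit value is (x << (c&(n-1))) | (x >> (-c&(n-1))),
+// where -c&(n-1) == (n-c)&(n-1); e.g. for n=8, c=3 this is (x<<3) | (x>>5).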
+(RotateLeft8 <t> x (MOVWconst [c])) => (Or8 (Lsh8x32 <t> x (MOVWconst [c&7])) (Rsh8Ux32 <t> x (MOVWconst [-c&7])))
+(RotateLeft16 <t> x (MOVWconst [c])) => (Or16 (Lsh16x32 <t> x (MOVWconst [c&15])) (Rsh16Ux32 <t> x (MOVWconst [-c&15])))
+(RotateLeft32 <t> x (MOVWconst [c])) => (Or32 (Lsh32x32 <t> x (MOVWconst [c&31])) (Rsh32Ux32 <t> x (MOVWconst [-c&31])))
+(RotateLeft64 <t> x (MOVWconst [c])) => (Or64 (Lsh64x32 <t> x (MOVWconst [c&63])) (Rsh64Ux32 <t> x (MOVWconst [-c&63])))
+
+// unary ops
+(Neg(32|16|8) ...) => (NEG ...)
+(Neg(32|64)F ...) => (NEG(F|D) ...)
+
+(Com(32|16|8) x) => (NORconst [0] x)
+
+(Sqrt ...) => (SQRTD ...)
+
+// TODO: optimize this case?
+(Ctz32NonZero ...) => (Ctz32 ...)
+
+// count trailing zero
+// 32 - CLZ(x&-x - 1)
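+// x&-x isolates the lowest set bit, so (x&-x)-1 is a mask of ones below it, and
+// CLZ of that mask is 32 minus the trailing-zero count; e.g. x=8: (8&-8)-1 = 7,
+// CLZ(7) = 29, 32-29 = 3. For x=0 the mask is all ones, giving 32-0 = 32.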
+(Ctz32 <t> x) => (SUB (MOVWconst [32]) (CLZ <t> (SUBconst <t> [1] (AND <t> x (NEG <t> x)))))
+
+// bit length
+(BitLen32 <t> x) => (SUB (MOVWconst [32]) (CLZ <t> x))
+
+// boolean ops -- booleans are represented with 0=false, 1=true
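+// With that representation, x == y is 1 ^ (x ^ y) and !x is 1 ^ x,
+// which is how EqB and Not are lowered below.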
+(AndB ...) => (AND ...)
+(OrB ...) => (OR ...)
+(EqB x y) => (XORconst [1] (XOR <typ.Bool> x y))
+(NeqB ...) => (XOR ...)
+(Not x) => (XORconst [1] x)
+
+// constants
+(Const(32|16|8) [val]) => (MOVWconst [int32(val)])
+(Const(32|64)F ...) => (MOV(F|D)const ...)
+(ConstNil) => (MOVWconst [0])
+(ConstBool [b]) => (MOVWconst [b2i32(b)])
+
+// truncations
+// Because we ignore high parts of registers, truncates are just copies.
+(Trunc16to8 ...) => (Copy ...)
+(Trunc32to8 ...) => (Copy ...)
+(Trunc32to16 ...) => (Copy ...)
+
+// Zero-/Sign-extensions
+(ZeroExt8to16 ...) => (MOVBUreg ...)
+(ZeroExt8to32 ...) => (MOVBUreg ...)
+(ZeroExt16to32 ...) => (MOVHUreg ...)
+
+(SignExt8to16 ...) => (MOVBreg ...)
+(SignExt8to32 ...) => (MOVBreg ...)
+(SignExt16to32 ...) => (MOVHreg ...)
+
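+// Signmask: an arithmetic shift right by 31 spreads the sign bit, giving 0 or -1.
+// Zeromask: SGTU x 0 is 1 iff x != 0, so its negation is -1 for nonzero x and 0 otherwise.
+// Slicemask: 0 for a zero length, -1 for a positive one.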
+(Signmask x) => (SRAconst x [31])
+(Zeromask x) => (NEG (SGTU x (MOVWconst [0])))
+(Slicemask <t> x) => (SRAconst (NEG <t> x) [31])
+
+// float-int conversion
+(Cvt32to(32|64)F ...) => (MOVW(F|D) ...)
+(Cvt(32|64)Fto32 ...) => (TRUNC(F|D)W ...)
+(Cvt32Fto64F ...) => (MOVFD ...)
+(Cvt64Fto32F ...) => (MOVDF ...)
+
+(CvtBoolToUint8 ...) => (Copy ...)
+
+(Round(32|64)F ...) => (Copy ...)
+
+// comparisons
+(Eq8 x y) => (SGTUconst [1] (XOR (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Eq16 x y) => (SGTUconst [1] (XOR (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Eq32 x y) => (SGTUconst [1] (XOR x y))
+(EqPtr x y) => (SGTUconst [1] (XOR x y))
+(Eq(32|64)F x y) => (FPFlagTrue (CMPEQ(F|D) x y))
+
+(Neq8 x y) => (SGTU (XOR (ZeroExt8to32 x) (ZeroExt8to32 y)) (MOVWconst [0]))
+(Neq16 x y) => (SGTU (XOR (ZeroExt16to32 x) (ZeroExt16to32 y)) (MOVWconst [0]))
+(Neq32 x y) => (SGTU (XOR x y) (MOVWconst [0]))
+(NeqPtr x y) => (SGTU (XOR x y) (MOVWconst [0]))
+(Neq(32|64)F x y) => (FPFlagFalse (CMPEQ(F|D) x y))
+
+(Less8 x y) => (SGT (SignExt8to32 y) (SignExt8to32 x))
+(Less16 x y) => (SGT (SignExt16to32 y) (SignExt16to32 x))
+(Less32 x y) => (SGT y x)
+(Less(32|64)F x y) => (FPFlagTrue (CMPGT(F|D) y x)) // reverse operands to work around NaN
+
+(Less8U x y) => (SGTU (ZeroExt8to32 y) (ZeroExt8to32 x))
+(Less16U x y) => (SGTU (ZeroExt16to32 y) (ZeroExt16to32 x))
+(Less32U x y) => (SGTU y x)
+
+(Leq8 x y) => (XORconst [1] (SGT (SignExt8to32 x) (SignExt8to32 y)))
+(Leq16 x y) => (XORconst [1] (SGT (SignExt16to32 x) (SignExt16to32 y)))
+(Leq32 x y) => (XORconst [1] (SGT x y))
+(Leq(32|64)F x y) => (FPFlagTrue (CMPGE(F|D) y x)) // reverse operands to work around NaN
+
+(Leq8U x y) => (XORconst [1] (SGTU (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Leq16U x y) => (XORconst [1] (SGTU (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Leq32U x y) => (XORconst [1] (SGTU x y))
+
+(OffPtr [off] ptr:(SP)) => (MOVWaddr [int32(off)] ptr)
+(OffPtr [off] ptr) => (ADDconst [int32(off)] ptr)
+
+(Addr {sym} base) => (MOVWaddr {sym} base)
+(LocalAddr {sym} base _) => (MOVWaddr {sym} base)
+
+// loads
+(Load <t> ptr mem) && t.IsBoolean() => (MOVBUload ptr mem)
+(Load <t> ptr mem) && (is8BitInt(t) && isSigned(t)) => (MOVBload ptr mem)
+(Load <t> ptr mem) && (is8BitInt(t) && !isSigned(t)) => (MOVBUload ptr mem)
+(Load <t> ptr mem) && (is16BitInt(t) && isSigned(t)) => (MOVHload ptr mem)
+(Load <t> ptr mem) && (is16BitInt(t) && !isSigned(t)) => (MOVHUload ptr mem)
+(Load <t> ptr mem) && (is32BitInt(t) || isPtr(t)) => (MOVWload ptr mem)
+(Load <t> ptr mem) && is32BitFloat(t) => (MOVFload ptr mem)
+(Load <t> ptr mem) && is64BitFloat(t) => (MOVDload ptr mem)
+
+// stores
+(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && !is32BitFloat(val.Type) => (MOVWstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (MOVFstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (MOVDstore ptr val mem)
+
+// zero instructions
+(Zero [0] _ mem) => mem
+(Zero [1] ptr mem) => (MOVBstore ptr (MOVWconst [0]) mem)
+(Zero [2] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore ptr (MOVWconst [0]) mem)
+(Zero [2] ptr mem) =>
+ (MOVBstore [1] ptr (MOVWconst [0])
+ (MOVBstore [0] ptr (MOVWconst [0]) mem))
+(Zero [4] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore ptr (MOVWconst [0]) mem)
+(Zero [4] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [2] ptr (MOVWconst [0])
+ (MOVHstore [0] ptr (MOVWconst [0]) mem))
+(Zero [4] ptr mem) =>
+ (MOVBstore [3] ptr (MOVWconst [0])
+ (MOVBstore [2] ptr (MOVWconst [0])
+ (MOVBstore [1] ptr (MOVWconst [0])
+ (MOVBstore [0] ptr (MOVWconst [0]) mem))))
+(Zero [3] ptr mem) =>
+ (MOVBstore [2] ptr (MOVWconst [0])
+ (MOVBstore [1] ptr (MOVWconst [0])
+ (MOVBstore [0] ptr (MOVWconst [0]) mem)))
+(Zero [6] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [4] ptr (MOVWconst [0])
+ (MOVHstore [2] ptr (MOVWconst [0])
+ (MOVHstore [0] ptr (MOVWconst [0]) mem)))
+(Zero [8] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [4] ptr (MOVWconst [0])
+ (MOVWstore [0] ptr (MOVWconst [0]) mem))
+(Zero [12] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [8] ptr (MOVWconst [0])
+ (MOVWstore [4] ptr (MOVWconst [0])
+ (MOVWstore [0] ptr (MOVWconst [0]) mem)))
+(Zero [16] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [12] ptr (MOVWconst [0])
+ (MOVWstore [8] ptr (MOVWconst [0])
+ (MOVWstore [4] ptr (MOVWconst [0])
+ (MOVWstore [0] ptr (MOVWconst [0]) mem))))
+
+// large or unaligned zeroing uses a loop
+(Zero [s] {t} ptr mem)
+ && (s > 16 || t.Alignment()%4 != 0) =>
+ (LoweredZero [int32(t.Alignment())]
+ ptr
+ (ADDconst <ptr.Type> ptr [int32(s-moveSize(t.Alignment(), config))])
+ mem)
+
+// moves
+(Move [0] _ _ mem) => mem
+(Move [1] dst src mem) => (MOVBstore dst (MOVBUload src mem) mem)
+(Move [2] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore dst (MOVHUload src mem) mem)
+(Move [2] dst src mem) =>
+ (MOVBstore [1] dst (MOVBUload [1] src mem)
+ (MOVBstore dst (MOVBUload src mem) mem))
+(Move [4] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore dst (MOVWload src mem) mem)
+(Move [4] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [2] dst (MOVHUload [2] src mem)
+ (MOVHstore dst (MOVHUload src mem) mem))
+(Move [4] dst src mem) =>
+ (MOVBstore [3] dst (MOVBUload [3] src mem)
+ (MOVBstore [2] dst (MOVBUload [2] src mem)
+ (MOVBstore [1] dst (MOVBUload [1] src mem)
+ (MOVBstore dst (MOVBUload src mem) mem))))
+(Move [3] dst src mem) =>
+ (MOVBstore [2] dst (MOVBUload [2] src mem)
+ (MOVBstore [1] dst (MOVBUload [1] src mem)
+ (MOVBstore dst (MOVBUload src mem) mem)))
+(Move [8] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [4] dst (MOVWload [4] src mem)
+ (MOVWstore dst (MOVWload src mem) mem))
+(Move [8] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [6] dst (MOVHload [6] src mem)
+ (MOVHstore [4] dst (MOVHload [4] src mem)
+ (MOVHstore [2] dst (MOVHload [2] src mem)
+ (MOVHstore dst (MOVHload src mem) mem))))
+(Move [6] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [4] dst (MOVHload [4] src mem)
+ (MOVHstore [2] dst (MOVHload [2] src mem)
+ (MOVHstore dst (MOVHload src mem) mem)))
+(Move [12] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [8] dst (MOVWload [8] src mem)
+ (MOVWstore [4] dst (MOVWload [4] src mem)
+ (MOVWstore dst (MOVWload src mem) mem)))
+(Move [16] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [12] dst (MOVWload [12] src mem)
+ (MOVWstore [8] dst (MOVWload [8] src mem)
+ (MOVWstore [4] dst (MOVWload [4] src mem)
+ (MOVWstore dst (MOVWload src mem) mem))))
+
+
+// large or unaligned move uses a loop
+(Move [s] {t} dst src mem)
+ && (s > 16 && logLargeCopy(v, s) || t.Alignment()%4 != 0) =>
+ (LoweredMove [int32(t.Alignment())]
+ dst
+ src
+ (ADDconst <src.Type> src [int32(s-moveSize(t.Alignment(), config))])
+ mem)
+
+// calls
+(StaticCall ...) => (CALLstatic ...)
+(ClosureCall ...) => (CALLclosure ...)
+(InterCall ...) => (CALLinter ...)
+
+// atomic intrinsics
+(AtomicLoad(8|32) ...) => (LoweredAtomicLoad(8|32) ...)
+(AtomicLoadPtr ...) => (LoweredAtomicLoad32 ...)
+
+(AtomicStore(8|32) ...) => (LoweredAtomicStore(8|32) ...)
+(AtomicStorePtrNoWB ...) => (LoweredAtomicStore32 ...)
+
+(AtomicExchange32 ...) => (LoweredAtomicExchange ...)
+(AtomicAdd32 ...) => (LoweredAtomicAdd ...)
+
+(AtomicCompareAndSwap32 ...) => (LoweredAtomicCas ...)
+
+// AtomicOr8(ptr,val) => LoweredAtomicOr(ptr&^3,uint32(val) << ((ptr & 3) * 8))
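+// The byte is updated through a 32-bit atomic on its aligned word: (MOVWconst [^3]) & ptr
+// rounds the address down to a 4-byte boundary, and (ANDconst [3] ptr) << 3 is (ptr&3)*8,
+// the bit offset of the byte within that word (on little-endian; big-endian uses ptr^3 below).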
+(AtomicOr8 ptr val mem) && !config.BigEndian =>
+ (LoweredAtomicOr (AND <typ.UInt32Ptr> (MOVWconst [^3]) ptr)
+ (SLL <typ.UInt32> (ZeroExt8to32 val)
+ (SLLconst <typ.UInt32> [3]
+ (ANDconst <typ.UInt32> [3] ptr))) mem)
+
+// AtomicAnd8(ptr,val) => LoweredAtomicAnd(ptr&^3,(uint32(val) << ((ptr & 3) * 8)) | ^(uint32(0xFF) << ((ptr & 3) * 8))))
+(AtomicAnd8 ptr val mem) && !config.BigEndian =>
+ (LoweredAtomicAnd (AND <typ.UInt32Ptr> (MOVWconst [^3]) ptr)
+ (OR <typ.UInt32> (SLL <typ.UInt32> (ZeroExt8to32 val)
+ (SLLconst <typ.UInt32> [3]
+ (ANDconst <typ.UInt32> [3] ptr)))
+ (NORconst [0] <typ.UInt32> (SLL <typ.UInt32>
+ (MOVWconst [0xff]) (SLLconst <typ.UInt32> [3]
+ (ANDconst <typ.UInt32> [3] ptr))))) mem)
+
+// AtomicOr8(ptr,val) => LoweredAtomicOr(ptr&^3,uint32(val) << (((ptr^3) & 3) * 8))
+(AtomicOr8 ptr val mem) && config.BigEndian =>
+ (LoweredAtomicOr (AND <typ.UInt32Ptr> (MOVWconst [^3]) ptr)
+ (SLL <typ.UInt32> (ZeroExt8to32 val)
+ (SLLconst <typ.UInt32> [3]
+ (ANDconst <typ.UInt32> [3]
+ (XORconst <typ.UInt32> [3] ptr)))) mem)
+
+// AtomicAnd8(ptr,val) => LoweredAtomicAnd(ptr&^3,(uint32(val) << (((ptr^3) & 3) * 8)) | ^(uint32(0xFF) << (((ptr^3) & 3) * 8))))
+(AtomicAnd8 ptr val mem) && config.BigEndian =>
+ (LoweredAtomicAnd (AND <typ.UInt32Ptr> (MOVWconst [^3]) ptr)
+ (OR <typ.UInt32> (SLL <typ.UInt32> (ZeroExt8to32 val)
+ (SLLconst <typ.UInt32> [3]
+ (ANDconst <typ.UInt32> [3]
+ (XORconst <typ.UInt32> [3] ptr))))
+ (NORconst [0] <typ.UInt32> (SLL <typ.UInt32>
+ (MOVWconst [0xff]) (SLLconst <typ.UInt32> [3]
+ (ANDconst <typ.UInt32> [3]
+ (XORconst <typ.UInt32> [3] ptr)))))) mem)
+
+(AtomicAnd32 ...) => (LoweredAtomicAnd ...)
+(AtomicOr32 ...) => (LoweredAtomicOr ...)
+
+
+// checks
+(NilCheck ...) => (LoweredNilCheck ...)
+(IsNonNil ptr) => (SGTU ptr (MOVWconst [0]))
+(IsInBounds idx len) => (SGTU len idx)
+(IsSliceInBounds idx len) => (XORconst [1] (SGTU idx len))
+
+// pseudo-ops
+(GetClosurePtr ...) => (LoweredGetClosurePtr ...)
+(GetCallerSP ...) => (LoweredGetCallerSP ...)
+(GetCallerPC ...) => (LoweredGetCallerPC ...)
+
+(If cond yes no) => (NE cond yes no)
+
+// Write barrier.
+(WB ...) => (LoweredWB ...)
+
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem)
+
+(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 0 => (LoweredPanicExtendA [kind] hi lo y mem)
+(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 1 => (LoweredPanicExtendB [kind] hi lo y mem)
+(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 2 => (LoweredPanicExtendC [kind] hi lo y mem)
+
+// Optimizations
+
+// Absorb boolean tests into block
+(NE (FPFlagTrue cmp) yes no) => (FPT cmp yes no)
+(NE (FPFlagFalse cmp) yes no) => (FPF cmp yes no)
+(EQ (FPFlagTrue cmp) yes no) => (FPF cmp yes no)
+(EQ (FPFlagFalse cmp) yes no) => (FPT cmp yes no)
+(NE (XORconst [1] cmp:(SGT _ _)) yes no) => (EQ cmp yes no)
+(NE (XORconst [1] cmp:(SGTU _ _)) yes no) => (EQ cmp yes no)
+(NE (XORconst [1] cmp:(SGTconst _)) yes no) => (EQ cmp yes no)
+(NE (XORconst [1] cmp:(SGTUconst _)) yes no) => (EQ cmp yes no)
+(NE (XORconst [1] cmp:(SGTzero _)) yes no) => (EQ cmp yes no)
+(NE (XORconst [1] cmp:(SGTUzero _)) yes no) => (EQ cmp yes no)
+(EQ (XORconst [1] cmp:(SGT _ _)) yes no) => (NE cmp yes no)
+(EQ (XORconst [1] cmp:(SGTU _ _)) yes no) => (NE cmp yes no)
+(EQ (XORconst [1] cmp:(SGTconst _)) yes no) => (NE cmp yes no)
+(EQ (XORconst [1] cmp:(SGTUconst _)) yes no) => (NE cmp yes no)
+(EQ (XORconst [1] cmp:(SGTzero _)) yes no) => (NE cmp yes no)
+(EQ (XORconst [1] cmp:(SGTUzero _)) yes no) => (NE cmp yes no)
+(NE (SGTUconst [1] x) yes no) => (EQ x yes no)
+(EQ (SGTUconst [1] x) yes no) => (NE x yes no)
+(NE (SGTUzero x) yes no) => (NE x yes no)
+(EQ (SGTUzero x) yes no) => (EQ x yes no)
+(NE (SGTconst [0] x) yes no) => (LTZ x yes no)
+(EQ (SGTconst [0] x) yes no) => (GEZ x yes no)
+(NE (SGTzero x) yes no) => (GTZ x yes no)
+(EQ (SGTzero x) yes no) => (LEZ x yes no)
+
+// fold offset into address
+(ADDconst [off1] (MOVWaddr [off2] {sym} ptr)) => (MOVWaddr [off1+off2] {sym} ptr)
+
+// fold address into load/store
+(MOVBload [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVBload [off1+off2] {sym} ptr mem)
+(MOVBUload [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVBUload [off1+off2] {sym} ptr mem)
+(MOVHload [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVHload [off1+off2] {sym} ptr mem)
+(MOVHUload [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVHUload [off1+off2] {sym} ptr mem)
+(MOVWload [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVWload [off1+off2] {sym} ptr mem)
+(MOVFload [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVFload [off1+off2] {sym} ptr mem)
+(MOVDload [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVDload [off1+off2] {sym} ptr mem)
+
+(MOVBstore [off1] {sym} x:(ADDconst [off2] ptr) val mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVBstore [off1+off2] {sym} ptr val mem)
+(MOVHstore [off1] {sym} x:(ADDconst [off2] ptr) val mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVHstore [off1+off2] {sym} ptr val mem)
+(MOVWstore [off1] {sym} x:(ADDconst [off2] ptr) val mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVWstore [off1+off2] {sym} ptr val mem)
+(MOVFstore [off1] {sym} x:(ADDconst [off2] ptr) val mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVFstore [off1+off2] {sym} ptr val mem)
+(MOVDstore [off1] {sym} x:(ADDconst [off2] ptr) val mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVDstore [off1+off2] {sym} ptr val mem)
+
+(MOVBstorezero [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVBstorezero [off1+off2] {sym} ptr mem)
+(MOVHstorezero [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVHstorezero [off1+off2] {sym} ptr mem)
+(MOVWstorezero [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVWstorezero [off1+off2] {sym} ptr mem)
+
+(MOVBload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVBload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVBUload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVBUload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVHload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHUload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVHUload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVWload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVFload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVFload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVDload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVDload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+
+(MOVBstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) =>
+ (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVHstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) =>
+ (MOVHstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVWstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) =>
+ (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVFstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) =>
+ (MOVFstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVDstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) =>
+ (MOVDstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVBstorezero [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVBstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHstorezero [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVHstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWstorezero [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) =>
+ (MOVWstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+
+// replace load from same location as preceding store with zero/sign extension (or copy in case of full width)
+(MOVBload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBreg x)
+(MOVBUload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBUreg x)
+(MOVHload [off] {sym} ptr (MOVHstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVHreg x)
+(MOVHUload [off] {sym} ptr (MOVHstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVHUreg x)
+(MOVWload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x
+(MOVFload [off] {sym} ptr (MOVFstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x
+(MOVDload [off] {sym} ptr (MOVDstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x
+
+// store zero
+(MOVBstore [off] {sym} ptr (MOVWconst [0]) mem) => (MOVBstorezero [off] {sym} ptr mem)
+(MOVHstore [off] {sym} ptr (MOVWconst [0]) mem) => (MOVHstorezero [off] {sym} ptr mem)
+(MOVWstore [off] {sym} ptr (MOVWconst [0]) mem) => (MOVWstorezero [off] {sym} ptr mem)
+
+// don't extend after proper load
+(MOVBreg x:(MOVBload _ _)) => (MOVWreg x)
+(MOVBUreg x:(MOVBUload _ _)) => (MOVWreg x)
+(MOVHreg x:(MOVBload _ _)) => (MOVWreg x)
+(MOVHreg x:(MOVBUload _ _)) => (MOVWreg x)
+(MOVHreg x:(MOVHload _ _)) => (MOVWreg x)
+(MOVHUreg x:(MOVBUload _ _)) => (MOVWreg x)
+(MOVHUreg x:(MOVHUload _ _)) => (MOVWreg x)
+
+// fold double extensions
+(MOVBreg x:(MOVBreg _)) => (MOVWreg x)
+(MOVBUreg x:(MOVBUreg _)) => (MOVWreg x)
+(MOVHreg x:(MOVBreg _)) => (MOVWreg x)
+(MOVHreg x:(MOVBUreg _)) => (MOVWreg x)
+(MOVHreg x:(MOVHreg _)) => (MOVWreg x)
+(MOVHUreg x:(MOVBUreg _)) => (MOVWreg x)
+(MOVHUreg x:(MOVHUreg _)) => (MOVWreg x)
+
+// sign extended loads
+// Note: The combined instruction must end up in the same block
+// as the original load. If not, we end up making a value with
+// memory type live in two different blocks, which can lead to
+// multiple memory values alive simultaneously.
+// Make sure we don't combine these ops if the load has another use.
+// This prevents a single load from being split into multiple loads
+// which then might return different values. See test/atomicload.go.
+(MOVBreg <t> x:(MOVBUload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBload <t> [off] {sym} ptr mem)
+(MOVBUreg <t> x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBUload <t> [off] {sym} ptr mem)
+(MOVHreg <t> x:(MOVHUload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVHload <t> [off] {sym} ptr mem)
+(MOVHUreg <t> x:(MOVHload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVHUload <t> [off] {sym} ptr mem)
+
+// fold extensions and ANDs together
+(MOVBUreg (ANDconst [c] x)) => (ANDconst [c&0xff] x)
+(MOVHUreg (ANDconst [c] x)) => (ANDconst [c&0xffff] x)
+(MOVBreg (ANDconst [c] x)) && c & 0x80 == 0 => (ANDconst [c&0x7f] x)
+(MOVHreg (ANDconst [c] x)) && c & 0x8000 == 0 => (ANDconst [c&0x7fff] x)
+
+// don't extend before store
+(MOVBstore [off] {sym} ptr (MOVBreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVWreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVWreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+
+// if a register move has only 1 use, just use the same register without emitting an instruction
+// MOVWnop doesn't emit an instruction; it exists only to carry the type.
+(MOVWreg x) && x.Uses == 1 => (MOVWnop x)
+
+// fold constant into arithmetic ops
+(ADD x (MOVWconst [c])) => (ADDconst [c] x)
+(SUB x (MOVWconst [c])) => (SUBconst [c] x)
+(AND x (MOVWconst [c])) => (ANDconst [c] x)
+(OR x (MOVWconst [c])) => (ORconst [c] x)
+(XOR x (MOVWconst [c])) => (XORconst [c] x)
+(NOR x (MOVWconst [c])) => (NORconst [c] x)
+
+(SLL x (MOVWconst [c])) => (SLLconst x [c&31])
+(SRL x (MOVWconst [c])) => (SRLconst x [c&31])
+(SRA x (MOVWconst [c])) => (SRAconst x [c&31])
+
+(SGT (MOVWconst [c]) x) => (SGTconst [c] x)
+(SGTU (MOVWconst [c]) x) => (SGTUconst [c] x)
+(SGT x (MOVWconst [0])) => (SGTzero x)
+(SGTU x (MOVWconst [0])) => (SGTUzero x)
+
+// mul with constant
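+// Select1 of MULTU is the low 32 bits of the 64-bit product and Select0 the high 32 bits.
+// For c = 1<<k the product is x<<k, so the low word is x<<k and the high word is x>>(32-k);
+// e.g. c=16 (k=4) gives SLLconst [4] for Select1 and SRLconst [28] for Select0.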
+(Select1 (MULTU (MOVWconst [0]) _ )) => (MOVWconst [0])
+(Select0 (MULTU (MOVWconst [0]) _ )) => (MOVWconst [0])
+(Select1 (MULTU (MOVWconst [1]) x )) => x
+(Select0 (MULTU (MOVWconst [1]) _ )) => (MOVWconst [0])
+(Select1 (MULTU (MOVWconst [-1]) x )) => (NEG <x.Type> x)
+(Select0 (MULTU (MOVWconst [-1]) x )) => (CMOVZ (ADDconst <x.Type> [-1] x) (MOVWconst [0]) x)
+(Select1 (MULTU (MOVWconst [c]) x )) && isPowerOfTwo64(int64(uint32(c))) => (SLLconst [int32(log2uint32(int64(c)))] x)
+(Select0 (MULTU (MOVWconst [c]) x )) && isPowerOfTwo64(int64(uint32(c))) => (SRLconst [int32(32-log2uint32(int64(c)))] x)
+
+(MUL (MOVWconst [0]) _ ) => (MOVWconst [0])
+(MUL (MOVWconst [1]) x ) => x
+(MUL (MOVWconst [-1]) x ) => (NEG x)
+(MUL (MOVWconst [c]) x ) && isPowerOfTwo64(int64(uint32(c))) => (SLLconst [int32(log2uint32(int64(c)))] x)
+
+// generic simplifications
+(ADD x (NEG y)) => (SUB x y)
+(SUB x x) => (MOVWconst [0])
+(SUB (MOVWconst [0]) x) => (NEG x)
+(AND x x) => x
+(OR x x) => x
+(XOR x x) => (MOVWconst [0])
+
+// miscellaneous patterns generated by dec64
+(AND (SGTUconst [1] x) (SGTUconst [1] y)) => (SGTUconst [1] (OR <x.Type> x y))
+(OR (SGTUzero x) (SGTUzero y)) => (SGTUzero (OR <x.Type> x y))
+
+// remove redundant *const ops
+(ADDconst [0] x) => x
+(SUBconst [0] x) => x
+(ANDconst [0] _) => (MOVWconst [0])
+(ANDconst [-1] x) => x
+(ORconst [0] x) => x
+(ORconst [-1] _) => (MOVWconst [-1])
+(XORconst [0] x) => x
+(XORconst [-1] x) => (NORconst [0] x)
+
+// generic constant folding
+(ADDconst [c] (MOVWconst [d])) => (MOVWconst [int32(c+d)])
+(ADDconst [c] (ADDconst [d] x)) => (ADDconst [c+d] x)
+(ADDconst [c] (SUBconst [d] x)) => (ADDconst [c-d] x)
+(SUBconst [c] (MOVWconst [d])) => (MOVWconst [d-c])
+(SUBconst [c] (SUBconst [d] x)) => (ADDconst [-c-d] x)
+(SUBconst [c] (ADDconst [d] x)) => (ADDconst [-c+d] x)
+(SLLconst [c] (MOVWconst [d])) => (MOVWconst [d<<uint32(c)])
+(SRLconst [c] (MOVWconst [d])) => (MOVWconst [int32(uint32(d)>>uint32(c))])
+(SRAconst [c] (MOVWconst [d])) => (MOVWconst [d>>uint32(c)])
+(MUL (MOVWconst [c]) (MOVWconst [d])) => (MOVWconst [c*d])
+(Select1 (MULTU (MOVWconst [c]) (MOVWconst [d]))) => (MOVWconst [int32(uint32(c)*uint32(d))])
+(Select0 (MULTU (MOVWconst [c]) (MOVWconst [d]))) => (MOVWconst [int32((int64(uint32(c))*int64(uint32(d)))>>32)])
+(Select1 (DIV (MOVWconst [c]) (MOVWconst [d]))) && d != 0 => (MOVWconst [c/d])
+(Select1 (DIVU (MOVWconst [c]) (MOVWconst [d]))) && d != 0 => (MOVWconst [int32(uint32(c)/uint32(d))])
+(Select0 (DIV (MOVWconst [c]) (MOVWconst [d]))) && d != 0 => (MOVWconst [c%d])
+(Select0 (DIVU (MOVWconst [c]) (MOVWconst [d]))) && d != 0 => (MOVWconst [int32(uint32(c)%uint32(d))])
+(ANDconst [c] (MOVWconst [d])) => (MOVWconst [c&d])
+(ANDconst [c] (ANDconst [d] x)) => (ANDconst [c&d] x)
+(ORconst [c] (MOVWconst [d])) => (MOVWconst [c|d])
+(ORconst [c] (ORconst [d] x)) => (ORconst [c|d] x)
+(XORconst [c] (MOVWconst [d])) => (MOVWconst [c^d])
+(XORconst [c] (XORconst [d] x)) => (XORconst [c^d] x)
+(NORconst [c] (MOVWconst [d])) => (MOVWconst [^(c|d)])
+(NEG (MOVWconst [c])) => (MOVWconst [-c])
+(MOVBreg (MOVWconst [c])) => (MOVWconst [int32(int8(c))])
+(MOVBUreg (MOVWconst [c])) => (MOVWconst [int32(uint8(c))])
+(MOVHreg (MOVWconst [c])) => (MOVWconst [int32(int16(c))])
+(MOVHUreg (MOVWconst [c])) => (MOVWconst [int32(uint16(c))])
+(MOVWreg (MOVWconst [c])) => (MOVWconst [c])
+
+// constant comparisons
+(SGTconst [c] (MOVWconst [d])) && c > d => (MOVWconst [1])
+(SGTconst [c] (MOVWconst [d])) && c <= d => (MOVWconst [0])
+(SGTUconst [c] (MOVWconst [d])) && uint32(c) > uint32(d) => (MOVWconst [1])
+(SGTUconst [c] (MOVWconst [d])) && uint32(c) <= uint32(d) => (MOVWconst [0])
+(SGTzero (MOVWconst [d])) && d > 0 => (MOVWconst [1])
+(SGTzero (MOVWconst [d])) && d <= 0 => (MOVWconst [0])
+(SGTUzero (MOVWconst [d])) && d != 0 => (MOVWconst [1])
+(SGTUzero (MOVWconst [d])) && d == 0 => (MOVWconst [0])
+
+// other known comparisons
+(SGTconst [c] (MOVBreg _)) && 0x7f < c => (MOVWconst [1])
+(SGTconst [c] (MOVBreg _)) && c <= -0x80 => (MOVWconst [0])
+(SGTconst [c] (MOVBUreg _)) && 0xff < c => (MOVWconst [1])
+(SGTconst [c] (MOVBUreg _)) && c < 0 => (MOVWconst [0])
+(SGTUconst [c] (MOVBUreg _)) && 0xff < uint32(c) => (MOVWconst [1])
+(SGTconst [c] (MOVHreg _)) && 0x7fff < c => (MOVWconst [1])
+(SGTconst [c] (MOVHreg _)) && c <= -0x8000 => (MOVWconst [0])
+(SGTconst [c] (MOVHUreg _)) && 0xffff < c => (MOVWconst [1])
+(SGTconst [c] (MOVHUreg _)) && c < 0 => (MOVWconst [0])
+(SGTUconst [c] (MOVHUreg _)) && 0xffff < uint32(c) => (MOVWconst [1])
+(SGTconst [c] (ANDconst [m] _)) && 0 <= m && m < c => (MOVWconst [1])
+(SGTUconst [c] (ANDconst [m] _)) && uint32(m) < uint32(c) => (MOVWconst [1])
+(SGTconst [c] (SRLconst _ [d])) && 0 <= c && uint32(d) <= 31 && 0xffffffff>>uint32(d) < uint32(c) => (MOVWconst [1])
+(SGTUconst [c] (SRLconst _ [d])) && uint32(d) <= 31 && 0xffffffff>>uint32(d) < uint32(c) => (MOVWconst [1])
+
+// absorb constants into branches
+(EQ (MOVWconst [0]) yes no) => (First yes no)
+(EQ (MOVWconst [c]) yes no) && c != 0 => (First no yes)
+(NE (MOVWconst [0]) yes no) => (First no yes)
+(NE (MOVWconst [c]) yes no) && c != 0 => (First yes no)
+(LTZ (MOVWconst [c]) yes no) && c < 0 => (First yes no)
+(LTZ (MOVWconst [c]) yes no) && c >= 0 => (First no yes)
+(LEZ (MOVWconst [c]) yes no) && c <= 0 => (First yes no)
+(LEZ (MOVWconst [c]) yes no) && c > 0 => (First no yes)
+(GTZ (MOVWconst [c]) yes no) && c > 0 => (First yes no)
+(GTZ (MOVWconst [c]) yes no) && c <= 0 => (First no yes)
+(GEZ (MOVWconst [c]) yes no) && c >= 0 => (First yes no)
+(GEZ (MOVWconst [c]) yes no) && c < 0 => (First no yes)
+
+// conditional move
+(CMOVZ _ f (MOVWconst [0])) => f
+(CMOVZ a _ (MOVWconst [c])) && c!=0 => a
+(CMOVZzero _ (MOVWconst [0])) => (MOVWconst [0])
+(CMOVZzero a (MOVWconst [c])) && c!=0 => a
+(CMOVZ a (MOVWconst [0]) c) => (CMOVZzero a c)
+
+// atomic
+(LoweredAtomicStore32 ptr (MOVWconst [0]) mem) => (LoweredAtomicStorezero ptr mem)
+(LoweredAtomicAdd ptr (MOVWconst [c]) mem) && is16Bit(int64(c)) => (LoweredAtomicAddconst [c] ptr mem)
+
diff --git a/src/cmd/compile/internal/ssa/gen/MIPS64.rules b/src/cmd/compile/internal/ssa/gen/MIPS64.rules
new file mode 100644
index 0000000..088c9b1
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/MIPS64.rules
@@ -0,0 +1,678 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+(Add(Ptr|64|32|16|8) ...) => (ADDV ...)
+(Add(32|64)F ...) => (ADD(F|D) ...)
+
+(Sub(Ptr|64|32|16|8) ...) => (SUBV ...)
+(Sub(32|64)F ...) => (SUB(F|D) ...)
+
+(Mul(64|32|16|8) x y) => (Select1 (MULVU x y))
+(Mul(32|64)F ...) => (MUL(F|D) ...)
+(Mul64uhilo ...) => (MULVU ...)
+(Select0 (Mul64uover x y)) => (Select1 <typ.UInt64> (MULVU x y))
+(Select1 (Mul64uover x y)) => (SGTU <typ.Bool> (Select0 <typ.UInt64> (MULVU x y)) (MOVVconst <typ.UInt64> [0]))
+
+(Hmul64 x y) => (Select0 (MULV x y))
+(Hmul64u x y) => (Select0 (MULVU x y))
+(Hmul32 x y) => (SRAVconst (Select1 <typ.Int64> (MULV (SignExt32to64 x) (SignExt32to64 y))) [32])
+(Hmul32u x y) => (SRLVconst (Select1 <typ.UInt64> (MULVU (ZeroExt32to64 x) (ZeroExt32to64 y))) [32])
+
+(Div64 x y) => (Select1 (DIVV x y))
+(Div64u x y) => (Select1 (DIVVU x y))
+(Div32 x y) => (Select1 (DIVV (SignExt32to64 x) (SignExt32to64 y)))
+(Div32u x y) => (Select1 (DIVVU (ZeroExt32to64 x) (ZeroExt32to64 y)))
+(Div16 x y) => (Select1 (DIVV (SignExt16to64 x) (SignExt16to64 y)))
+(Div16u x y) => (Select1 (DIVVU (ZeroExt16to64 x) (ZeroExt16to64 y)))
+(Div8 x y) => (Select1 (DIVV (SignExt8to64 x) (SignExt8to64 y)))
+(Div8u x y) => (Select1 (DIVVU (ZeroExt8to64 x) (ZeroExt8to64 y)))
+(Div(32|64)F ...) => (DIV(F|D) ...)
+
+(Mod64 x y) => (Select0 (DIVV x y))
+(Mod64u x y) => (Select0 (DIVVU x y))
+(Mod32 x y) => (Select0 (DIVV (SignExt32to64 x) (SignExt32to64 y)))
+(Mod32u x y) => (Select0 (DIVVU (ZeroExt32to64 x) (ZeroExt32to64 y)))
+(Mod16 x y) => (Select0 (DIVV (SignExt16to64 x) (SignExt16to64 y)))
+(Mod16u x y) => (Select0 (DIVVU (ZeroExt16to64 x) (ZeroExt16to64 y)))
+(Mod8 x y) => (Select0 (DIVV (SignExt8to64 x) (SignExt8to64 y)))
+(Mod8u x y) => (Select0 (DIVVU (ZeroExt8to64 x) (ZeroExt8to64 y)))
+
+// (x + y) / 2 with x>=y => (x - y) / 2 + y
+(Avg64u <t> x y) => (ADDV (SRLVconst <t> (SUBV <t> x y) [1]) y)
+
+(And(64|32|16|8) ...) => (AND ...)
+(Or(64|32|16|8) ...) => (OR ...)
+(Xor(64|32|16|8) ...) => (XOR ...)
+
+// shifts
+// hardware instruction uses only the low 6 bits of the shift
+// we compare to 64 to ensure Go semantics for large shifts
+(Lsh64x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y))
+(Lsh64x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y)))
+(Lsh64x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y)))
+(Lsh64x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y)))
+
+(Lsh32x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y))
+(Lsh32x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y)))
+(Lsh32x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y)))
+(Lsh32x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y)))
+
+(Lsh16x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y))
+(Lsh16x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y)))
+(Lsh16x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y)))
+(Lsh16x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y)))
+
+(Lsh8x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y))
+(Lsh8x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y)))
+(Lsh8x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y)))
+(Lsh8x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y)))
+
+(Rsh64Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> x y))
+(Rsh64Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> x (ZeroExt32to64 y)))
+(Rsh64Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> x (ZeroExt16to64 y)))
+(Rsh64Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> x (ZeroExt8to64 y)))
+
+(Rsh32Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> (ZeroExt32to64 x) y))
+(Rsh32Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> (ZeroExt32to64 x) (ZeroExt32to64 y)))
+(Rsh32Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> (ZeroExt32to64 x) (ZeroExt16to64 y)))
+(Rsh32Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> (ZeroExt32to64 x) (ZeroExt8to64 y)))
+
+(Rsh16Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> (ZeroExt16to64 x) y))
+(Rsh16Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> (ZeroExt16to64 x) (ZeroExt32to64 y)))
+(Rsh16Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> (ZeroExt16to64 x) (ZeroExt16to64 y)))
+(Rsh16Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> (ZeroExt16to64 x) (ZeroExt8to64 y)))
+
+(Rsh8Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> (ZeroExt8to64 x) y))
+(Rsh8Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> (ZeroExt8to64 x) (ZeroExt32to64 y)))
+(Rsh8Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> (ZeroExt8to64 x) (ZeroExt16to64 y)))
+(Rsh8Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> (ZeroExt8to64 x) (ZeroExt8to64 y)))
+
+(Rsh64x64 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y))
+(Rsh64x32 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y)))
+(Rsh64x16 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y)))
+(Rsh64x8 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y)))
+
+(Rsh32x64 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y))
+(Rsh32x32 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y)))
+(Rsh32x16 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y)))
+(Rsh32x8 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y)))
+
+(Rsh16x64 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y))
+(Rsh16x32 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y)))
+(Rsh16x16 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y)))
+(Rsh16x8 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y)))
+
+(Rsh8x64 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y))
+(Rsh8x32 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y)))
+(Rsh8x16 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y)))
+(Rsh8x8 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y)))
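The shift lowerings above exist because the hardware shift instructions use only the low 6 bits of the count, while Go requires that a count of 64 or more yields 0 (or the sign, for arithmetic right shifts). A small branch-free sketch of what the SGTU/NEGV/AND pattern computes for an unsigned 64-bit left shift (illustrative only; lsh64 is a made-up name):

    // lsh64 reproduces Go's shift semantics the same way the rules do:
    // SGTU(64, y) is 1 when y < 64, NEGV turns that into an all-ones mask,
    // and the AND applies it to the raw (mod-64) hardware shift result.
    func lsh64(x, y uint64) uint64 {
        var mask uint64
        if y < 64 {
            mask = ^uint64(0)
        }
        return (x << (y & 63)) & mask
    }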
+
+// rotates
+(RotateLeft8 <t> x (MOVVconst [c])) => (Or8 (Lsh8x64 <t> x (MOVVconst [c&7])) (Rsh8Ux64 <t> x (MOVVconst [-c&7])))
+(RotateLeft16 <t> x (MOVVconst [c])) => (Or16 (Lsh16x64 <t> x (MOVVconst [c&15])) (Rsh16Ux64 <t> x (MOVVconst [-c&15])))
+(RotateLeft32 <t> x (MOVVconst [c])) => (Or32 (Lsh32x64 <t> x (MOVVconst [c&31])) (Rsh32Ux64 <t> x (MOVVconst [-c&31])))
+(RotateLeft64 <t> x (MOVVconst [c])) => (Or64 (Lsh64x64 <t> x (MOVVconst [c&63])) (Rsh64Ux64 <t> x (MOVVconst [-c&63])))
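The rotate rules above decompose a rotate by a constant into a left shift, a right shift, and an OR. A one-line Go equivalent for the 32-bit case (illustrative; the standard library's bits.RotateLeft32 computes the same thing):

    // rotl32 matches the RotateLeft32 rule: (x << (c&31)) | (x >> (-c&31)).
    func rotl32(x uint32, c int) uint32 {
        return x<<(uint(c)&31) | x>>(uint(-c)&31)
    }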
+
+// unary ops
+(Neg(64|32|16|8) ...) => (NEGV ...)
+(Neg(32|64)F ...) => (NEG(F|D) ...)
+
+(Com(64|32|16|8) x) => (NOR (MOVVconst [0]) x)
+
+(Sqrt ...) => (SQRTD ...)
+
+// boolean ops -- booleans are represented with 0=false, 1=true
+(AndB ...) => (AND ...)
+(OrB ...) => (OR ...)
+(EqB x y) => (XOR (MOVVconst [1]) (XOR <typ.Bool> x y))
+(NeqB ...) => (XOR ...)
+(Not x) => (XORconst [1] x)
+
+// constants
+(Const(64|32|16|8) [val]) => (MOVVconst [int64(val)])
+(Const(32|64)F [val]) => (MOV(F|D)const [float64(val)])
+(ConstNil) => (MOVVconst [0])
+(ConstBool [b]) => (MOVVconst [int64(b2i(b))])
+
+(Slicemask <t> x) => (SRAVconst (NEGV <t> x) [63])
+
+// truncations
+// Because we ignore high parts of registers, truncates are just copies.
+(Trunc16to8 ...) => (Copy ...)
+(Trunc32to8 ...) => (Copy ...)
+(Trunc32to16 ...) => (Copy ...)
+(Trunc64to8 ...) => (Copy ...)
+(Trunc64to16 ...) => (Copy ...)
+(Trunc64to32 ...) => (Copy ...)
+
+// Zero-/Sign-extensions
+(ZeroExt8to16 ...) => (MOVBUreg ...)
+(ZeroExt8to32 ...) => (MOVBUreg ...)
+(ZeroExt16to32 ...) => (MOVHUreg ...)
+(ZeroExt8to64 ...) => (MOVBUreg ...)
+(ZeroExt16to64 ...) => (MOVHUreg ...)
+(ZeroExt32to64 ...) => (MOVWUreg ...)
+
+(SignExt8to16 ...) => (MOVBreg ...)
+(SignExt8to32 ...) => (MOVBreg ...)
+(SignExt16to32 ...) => (MOVHreg ...)
+(SignExt8to64 ...) => (MOVBreg ...)
+(SignExt16to64 ...) => (MOVHreg ...)
+(SignExt32to64 ...) => (MOVWreg ...)
+
+// float <=> int conversion
+(Cvt32to32F ...) => (MOVWF ...)
+(Cvt32to64F ...) => (MOVWD ...)
+(Cvt64to32F ...) => (MOVVF ...)
+(Cvt64to64F ...) => (MOVVD ...)
+(Cvt32Fto32 ...) => (TRUNCFW ...)
+(Cvt64Fto32 ...) => (TRUNCDW ...)
+(Cvt32Fto64 ...) => (TRUNCFV ...)
+(Cvt64Fto64 ...) => (TRUNCDV ...)
+(Cvt32Fto64F ...) => (MOVFD ...)
+(Cvt64Fto32F ...) => (MOVDF ...)
+
+(CvtBoolToUint8 ...) => (Copy ...)
+
+(Round(32|64)F ...) => (Copy ...)
+
+// comparisons
+(Eq8 x y) => (SGTU (MOVVconst [1]) (XOR (ZeroExt8to64 x) (ZeroExt8to64 y)))
+(Eq16 x y) => (SGTU (MOVVconst [1]) (XOR (ZeroExt16to64 x) (ZeroExt16to64 y)))
+(Eq32 x y) => (SGTU (MOVVconst [1]) (XOR (ZeroExt32to64 x) (ZeroExt32to64 y)))
+(Eq64 x y) => (SGTU (MOVVconst [1]) (XOR x y))
+(EqPtr x y) => (SGTU (MOVVconst [1]) (XOR x y))
+(Eq(32|64)F x y) => (FPFlagTrue (CMPEQ(F|D) x y))
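The integer equality rules above rely on x == y holding exactly when x^y is zero, which the SGTU against the constant 1 then tests as an unsigned "less than 1". A tiny Go sketch (illustrative; eq64 is a made-up name):

    // eq64 mirrors the Eq64 rule: (x ^ y) < 1 as an unsigned comparison
    // is the same as x == y.
    func eq64(x, y uint64) bool {
        return (x ^ y) < 1
    }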
+
+(Neq8 x y) => (SGTU (XOR (ZeroExt8to64 x) (ZeroExt8to64 y)) (MOVVconst [0]))
+(Neq16 x y) => (SGTU (XOR (ZeroExt16to64 x) (ZeroExt16to64 y)) (MOVVconst [0]))
+(Neq32 x y) => (SGTU (XOR (ZeroExt32to64 x) (ZeroExt32to64 y)) (MOVVconst [0]))
+(Neq64 x y) => (SGTU (XOR x y) (MOVVconst [0]))
+(NeqPtr x y) => (SGTU (XOR x y) (MOVVconst [0]))
+(Neq(32|64)F x y) => (FPFlagFalse (CMPEQ(F|D) x y))
+
+(Less8 x y) => (SGT (SignExt8to64 y) (SignExt8to64 x))
+(Less16 x y) => (SGT (SignExt16to64 y) (SignExt16to64 x))
+(Less32 x y) => (SGT (SignExt32to64 y) (SignExt32to64 x))
+(Less64 x y) => (SGT y x)
+(Less(32|64)F x y) => (FPFlagTrue (CMPGT(F|D) y x)) // reverse operands to work around NaN
+
+(Less8U x y) => (SGTU (ZeroExt8to64 y) (ZeroExt8to64 x))
+(Less16U x y) => (SGTU (ZeroExt16to64 y) (ZeroExt16to64 x))
+(Less32U x y) => (SGTU (ZeroExt32to64 y) (ZeroExt32to64 x))
+(Less64U x y) => (SGTU y x)
+
+(Leq8 x y) => (XOR (MOVVconst [1]) (SGT (SignExt8to64 x) (SignExt8to64 y)))
+(Leq16 x y) => (XOR (MOVVconst [1]) (SGT (SignExt16to64 x) (SignExt16to64 y)))
+(Leq32 x y) => (XOR (MOVVconst [1]) (SGT (SignExt32to64 x) (SignExt32to64 y)))
+(Leq64 x y) => (XOR (MOVVconst [1]) (SGT x y))
+(Leq(32|64)F x y) => (FPFlagTrue (CMPGE(F|D) y x)) // reverse operands to work around NaN
+
+(Leq8U x y) => (XOR (MOVVconst [1]) (SGTU (ZeroExt8to64 x) (ZeroExt8to64 y)))
+(Leq16U x y) => (XOR (MOVVconst [1]) (SGTU (ZeroExt16to64 x) (ZeroExt16to64 y)))
+(Leq32U x y) => (XOR (MOVVconst [1]) (SGTU (ZeroExt32to64 x) (ZeroExt32to64 y)))
+(Leq64U x y) => (XOR (MOVVconst [1]) (SGTU x y))
+
+(OffPtr [off] ptr:(SP)) && is32Bit(off) => (MOVVaddr [int32(off)] ptr)
+(OffPtr [off] ptr) => (ADDVconst [off] ptr)
+
+(Addr {sym} base) => (MOVVaddr {sym} base)
+(LocalAddr {sym} base _) => (MOVVaddr {sym} base)
+
+// loads
+(Load <t> ptr mem) && t.IsBoolean() => (MOVBUload ptr mem)
+(Load <t> ptr mem) && (is8BitInt(t) && isSigned(t)) => (MOVBload ptr mem)
+(Load <t> ptr mem) && (is8BitInt(t) && !isSigned(t)) => (MOVBUload ptr mem)
+(Load <t> ptr mem) && (is16BitInt(t) && isSigned(t)) => (MOVHload ptr mem)
+(Load <t> ptr mem) && (is16BitInt(t) && !isSigned(t)) => (MOVHUload ptr mem)
+(Load <t> ptr mem) && (is32BitInt(t) && isSigned(t)) => (MOVWload ptr mem)
+(Load <t> ptr mem) && (is32BitInt(t) && !isSigned(t)) => (MOVWUload ptr mem)
+(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVVload ptr mem)
+(Load <t> ptr mem) && is32BitFloat(t) => (MOVFload ptr mem)
+(Load <t> ptr mem) && is64BitFloat(t) => (MOVDload ptr mem)
+
+// stores
+(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && !is32BitFloat(val.Type) => (MOVWstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 && !is64BitFloat(val.Type) => (MOVVstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (MOVFstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (MOVDstore ptr val mem)
+
+// zeroing
+(Zero [0] _ mem) => mem
+(Zero [1] ptr mem) => (MOVBstore ptr (MOVVconst [0]) mem)
+(Zero [2] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore ptr (MOVVconst [0]) mem)
+(Zero [2] ptr mem) =>
+ (MOVBstore [1] ptr (MOVVconst [0])
+ (MOVBstore [0] ptr (MOVVconst [0]) mem))
+(Zero [4] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore ptr (MOVVconst [0]) mem)
+(Zero [4] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [2] ptr (MOVVconst [0])
+ (MOVHstore [0] ptr (MOVVconst [0]) mem))
+(Zero [4] ptr mem) =>
+ (MOVBstore [3] ptr (MOVVconst [0])
+ (MOVBstore [2] ptr (MOVVconst [0])
+ (MOVBstore [1] ptr (MOVVconst [0])
+ (MOVBstore [0] ptr (MOVVconst [0]) mem))))
+(Zero [8] {t} ptr mem) && t.Alignment()%8 == 0 =>
+ (MOVVstore ptr (MOVVconst [0]) mem)
+(Zero [8] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [4] ptr (MOVVconst [0])
+ (MOVWstore [0] ptr (MOVVconst [0]) mem))
+(Zero [8] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [6] ptr (MOVVconst [0])
+ (MOVHstore [4] ptr (MOVVconst [0])
+ (MOVHstore [2] ptr (MOVVconst [0])
+ (MOVHstore [0] ptr (MOVVconst [0]) mem))))
+
+(Zero [3] ptr mem) =>
+ (MOVBstore [2] ptr (MOVVconst [0])
+ (MOVBstore [1] ptr (MOVVconst [0])
+ (MOVBstore [0] ptr (MOVVconst [0]) mem)))
+(Zero [6] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [4] ptr (MOVVconst [0])
+ (MOVHstore [2] ptr (MOVVconst [0])
+ (MOVHstore [0] ptr (MOVVconst [0]) mem)))
+(Zero [12] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [8] ptr (MOVVconst [0])
+ (MOVWstore [4] ptr (MOVVconst [0])
+ (MOVWstore [0] ptr (MOVVconst [0]) mem)))
+(Zero [16] {t} ptr mem) && t.Alignment()%8 == 0 =>
+ (MOVVstore [8] ptr (MOVVconst [0])
+ (MOVVstore [0] ptr (MOVVconst [0]) mem))
+(Zero [24] {t} ptr mem) && t.Alignment()%8 == 0 =>
+ (MOVVstore [16] ptr (MOVVconst [0])
+ (MOVVstore [8] ptr (MOVVconst [0])
+ (MOVVstore [0] ptr (MOVVconst [0]) mem)))
+
+// medium zeroing uses a duff device
+// 8 and 128 are magic constants; see runtime/mkduff.go
+(Zero [s] {t} ptr mem)
+ && s%8 == 0 && s > 24 && s <= 8*128
+ && t.Alignment()%8 == 0 && !config.noDuffDevice =>
+ (DUFFZERO [8 * (128 - s/8)] ptr mem)
+
+// large or unaligned zeroing uses a loop
+(Zero [s] {t} ptr mem)
+ && (s > 8*128 || config.noDuffDevice) || t.Alignment()%8 != 0 =>
+ (LoweredZero [t.Alignment()]
+ ptr
+ (ADDVconst <ptr.Type> ptr [s-moveSize(t.Alignment(), config)])
+ mem)
+
+// moves
+(Move [0] _ _ mem) => mem
+(Move [1] dst src mem) => (MOVBstore dst (MOVBload src mem) mem)
+(Move [2] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore dst (MOVHload src mem) mem)
+(Move [2] dst src mem) =>
+ (MOVBstore [1] dst (MOVBload [1] src mem)
+ (MOVBstore dst (MOVBload src mem) mem))
+(Move [4] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore dst (MOVWload src mem) mem)
+(Move [4] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [2] dst (MOVHload [2] src mem)
+ (MOVHstore dst (MOVHload src mem) mem))
+(Move [4] dst src mem) =>
+ (MOVBstore [3] dst (MOVBload [3] src mem)
+ (MOVBstore [2] dst (MOVBload [2] src mem)
+ (MOVBstore [1] dst (MOVBload [1] src mem)
+ (MOVBstore dst (MOVBload src mem) mem))))
+(Move [8] {t} dst src mem) && t.Alignment()%8 == 0 =>
+ (MOVVstore dst (MOVVload src mem) mem)
+(Move [8] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [4] dst (MOVWload [4] src mem)
+ (MOVWstore dst (MOVWload src mem) mem))
+(Move [8] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [6] dst (MOVHload [6] src mem)
+ (MOVHstore [4] dst (MOVHload [4] src mem)
+ (MOVHstore [2] dst (MOVHload [2] src mem)
+ (MOVHstore dst (MOVHload src mem) mem))))
+
+(Move [3] dst src mem) =>
+ (MOVBstore [2] dst (MOVBload [2] src mem)
+ (MOVBstore [1] dst (MOVBload [1] src mem)
+ (MOVBstore dst (MOVBload src mem) mem)))
+(Move [6] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [4] dst (MOVHload [4] src mem)
+ (MOVHstore [2] dst (MOVHload [2] src mem)
+ (MOVHstore dst (MOVHload src mem) mem)))
+(Move [12] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [8] dst (MOVWload [8] src mem)
+ (MOVWstore [4] dst (MOVWload [4] src mem)
+ (MOVWstore dst (MOVWload src mem) mem)))
+(Move [16] {t} dst src mem) && t.Alignment()%8 == 0 =>
+ (MOVVstore [8] dst (MOVVload [8] src mem)
+ (MOVVstore dst (MOVVload src mem) mem))
+(Move [24] {t} dst src mem) && t.Alignment()%8 == 0 =>
+ (MOVVstore [16] dst (MOVVload [16] src mem)
+ (MOVVstore [8] dst (MOVVload [8] src mem)
+ (MOVVstore dst (MOVVload src mem) mem)))
+
+// medium move uses a duff device
+(Move [s] {t} dst src mem)
+ && s%8 == 0 && s >= 24 && s <= 8*128 && t.Alignment()%8 == 0
+ && !config.noDuffDevice && logLargeCopy(v, s) =>
+ (DUFFCOPY [16 * (128 - s/8)] dst src mem)
+// 16 and 128 are magic constants. 16 is the number of bytes to encode:
+// MOVV (R1), R23
+// ADDV $8, R1
+// MOVV R23, (R2)
+// ADDV $8, R2
+// and 128 is the number of such blocks. See runtime/duff_mips64.s:duffcopy.
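As a worked example of the DUFFCOPY auxint formula used above (not part of the patch): each 8-byte block costs 16 bytes of code, so a copy of s bytes jumps 16*(128 - s/8) bytes into duffcopy; for s = 32 that is 16*(128-4) = 1984, leaving the last four blocks to run.

    // duffCopyOffset (made-up name) computes the DUFFCOPY auxint for an
    // aligned copy of s bytes, with s%8 == 0 and 24 <= s <= 8*128.
    func duffCopyOffset(s int64) int64 {
        return 16 * (128 - s/8)
    }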
+
+// large or unaligned move uses a loop
+(Move [s] {t} dst src mem)
+ && s > 24 && logLargeCopy(v, s) || t.Alignment()%8 != 0 =>
+ (LoweredMove [t.Alignment()]
+ dst
+ src
+ (ADDVconst <src.Type> src [s-moveSize(t.Alignment(), config)])
+ mem)
+
+// calls
+(StaticCall ...) => (CALLstatic ...)
+(ClosureCall ...) => (CALLclosure ...)
+(InterCall ...) => (CALLinter ...)
+
+// atomic intrinsics
+(AtomicLoad(8|32|64) ...) => (LoweredAtomicLoad(8|32|64) ...)
+(AtomicLoadPtr ...) => (LoweredAtomicLoad64 ...)
+
+(AtomicStore(8|32|64) ...) => (LoweredAtomicStore(8|32|64) ...)
+(AtomicStorePtrNoWB ...) => (LoweredAtomicStore64 ...)
+
+(AtomicExchange(32|64) ...) => (LoweredAtomicExchange(32|64) ...)
+
+(AtomicAdd(32|64) ...) => (LoweredAtomicAdd(32|64) ...)
+
+(AtomicCompareAndSwap(32|64) ...) => (LoweredAtomicCas(32|64) ...)
+
+// checks
+(NilCheck ...) => (LoweredNilCheck ...)
+(IsNonNil ptr) => (SGTU ptr (MOVVconst [0]))
+(IsInBounds idx len) => (SGTU len idx)
+(IsSliceInBounds idx len) => (XOR (MOVVconst [1]) (SGTU idx len))
+
+// pseudo-ops
+(GetClosurePtr ...) => (LoweredGetClosurePtr ...)
+(GetCallerSP ...) => (LoweredGetCallerSP ...)
+(GetCallerPC ...) => (LoweredGetCallerPC ...)
+
+(If cond yes no) => (NE cond yes no)
+
+// Write barrier.
+(WB ...) => (LoweredWB ...)
+
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem)
+
+// Optimizations
+
+// Absorb boolean tests into block
+(NE (FPFlagTrue cmp) yes no) => (FPT cmp yes no)
+(NE (FPFlagFalse cmp) yes no) => (FPF cmp yes no)
+(EQ (FPFlagTrue cmp) yes no) => (FPF cmp yes no)
+(EQ (FPFlagFalse cmp) yes no) => (FPT cmp yes no)
+(NE (XORconst [1] cmp:(SGT _ _)) yes no) => (EQ cmp yes no)
+(NE (XORconst [1] cmp:(SGTU _ _)) yes no) => (EQ cmp yes no)
+(NE (XORconst [1] cmp:(SGTconst _)) yes no) => (EQ cmp yes no)
+(NE (XORconst [1] cmp:(SGTUconst _)) yes no) => (EQ cmp yes no)
+(EQ (XORconst [1] cmp:(SGT _ _)) yes no) => (NE cmp yes no)
+(EQ (XORconst [1] cmp:(SGTU _ _)) yes no) => (NE cmp yes no)
+(EQ (XORconst [1] cmp:(SGTconst _)) yes no) => (NE cmp yes no)
+(EQ (XORconst [1] cmp:(SGTUconst _)) yes no) => (NE cmp yes no)
+(NE (SGTUconst [1] x) yes no) => (EQ x yes no)
+(EQ (SGTUconst [1] x) yes no) => (NE x yes no)
+(NE (SGTU x (MOVVconst [0])) yes no) => (NE x yes no)
+(EQ (SGTU x (MOVVconst [0])) yes no) => (EQ x yes no)
+(NE (SGTconst [0] x) yes no) => (LTZ x yes no)
+(EQ (SGTconst [0] x) yes no) => (GEZ x yes no)
+(NE (SGT x (MOVVconst [0])) yes no) => (GTZ x yes no)
+(EQ (SGT x (MOVVconst [0])) yes no) => (LEZ x yes no)
+
+// fold offset into address
+(ADDVconst [off1] (MOVVaddr [off2] {sym} ptr)) && is32Bit(off1+int64(off2)) => (MOVVaddr [int32(off1)+int32(off2)] {sym} ptr)
+
+// fold address into load/store
+(MOVBload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVBload [off1+int32(off2)] {sym} ptr mem)
+(MOVBUload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVBUload [off1+int32(off2)] {sym} ptr mem)
+(MOVHload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVHload [off1+int32(off2)] {sym} ptr mem)
+(MOVHUload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVHUload [off1+int32(off2)] {sym} ptr mem)
+(MOVWload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVWload [off1+int32(off2)] {sym} ptr mem)
+(MOVWUload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVWUload [off1+int32(off2)] {sym} ptr mem)
+(MOVVload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVVload [off1+int32(off2)] {sym} ptr mem)
+(MOVFload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVFload [off1+int32(off2)] {sym} ptr mem)
+(MOVDload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVDload [off1+int32(off2)] {sym} ptr mem)
+
+(MOVBstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVBstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVHstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVHstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVWstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVWstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVVstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVVstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVFstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVFstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVDstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVDstore [off1+int32(off2)] {sym} ptr val mem)
+(MOVBstorezero [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVBstorezero [off1+int32(off2)] {sym} ptr mem)
+(MOVHstorezero [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVHstorezero [off1+int32(off2)] {sym} ptr mem)
+(MOVWstorezero [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVWstorezero [off1+int32(off2)] {sym} ptr mem)
+(MOVVstorezero [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVVstorezero [off1+int32(off2)] {sym} ptr mem)
+
+(MOVBload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVBload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVBUload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVBUload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVHload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHUload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVHUload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVWload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWUload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVWUload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVVload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVVload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVFload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVFload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVDload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVDload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+
+(MOVBstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVBstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVHstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVHstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVWstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVWstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVVstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVVstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVFstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVFstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVDstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVDstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVBstorezero [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVBstorezero [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHstorezero [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVHstorezero [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWstorezero [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVWstorezero [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+(MOVVstorezero [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVVstorezero [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
+
+// store zero
+(MOVBstore [off] {sym} ptr (MOVVconst [0]) mem) => (MOVBstorezero [off] {sym} ptr mem)
+(MOVHstore [off] {sym} ptr (MOVVconst [0]) mem) => (MOVHstorezero [off] {sym} ptr mem)
+(MOVWstore [off] {sym} ptr (MOVVconst [0]) mem) => (MOVWstorezero [off] {sym} ptr mem)
+(MOVVstore [off] {sym} ptr (MOVVconst [0]) mem) => (MOVVstorezero [off] {sym} ptr mem)
+
+// don't extend after proper load
+(MOVBreg x:(MOVBload _ _)) => (MOVVreg x)
+(MOVBUreg x:(MOVBUload _ _)) => (MOVVreg x)
+(MOVHreg x:(MOVBload _ _)) => (MOVVreg x)
+(MOVHreg x:(MOVBUload _ _)) => (MOVVreg x)
+(MOVHreg x:(MOVHload _ _)) => (MOVVreg x)
+(MOVHUreg x:(MOVBUload _ _)) => (MOVVreg x)
+(MOVHUreg x:(MOVHUload _ _)) => (MOVVreg x)
+(MOVWreg x:(MOVBload _ _)) => (MOVVreg x)
+(MOVWreg x:(MOVBUload _ _)) => (MOVVreg x)
+(MOVWreg x:(MOVHload _ _)) => (MOVVreg x)
+(MOVWreg x:(MOVHUload _ _)) => (MOVVreg x)
+(MOVWreg x:(MOVWload _ _)) => (MOVVreg x)
+(MOVWUreg x:(MOVBUload _ _)) => (MOVVreg x)
+(MOVWUreg x:(MOVHUload _ _)) => (MOVVreg x)
+(MOVWUreg x:(MOVWUload _ _)) => (MOVVreg x)
+
+// fold double extensions
+(MOVBreg x:(MOVBreg _)) => (MOVVreg x)
+(MOVBUreg x:(MOVBUreg _)) => (MOVVreg x)
+(MOVHreg x:(MOVBreg _)) => (MOVVreg x)
+(MOVHreg x:(MOVBUreg _)) => (MOVVreg x)
+(MOVHreg x:(MOVHreg _)) => (MOVVreg x)
+(MOVHUreg x:(MOVBUreg _)) => (MOVVreg x)
+(MOVHUreg x:(MOVHUreg _)) => (MOVVreg x)
+(MOVWreg x:(MOVBreg _)) => (MOVVreg x)
+(MOVWreg x:(MOVBUreg _)) => (MOVVreg x)
+(MOVWreg x:(MOVHreg _)) => (MOVVreg x)
+(MOVWreg x:(MOVWreg _)) => (MOVVreg x)
+(MOVWUreg x:(MOVBUreg _)) => (MOVVreg x)
+(MOVWUreg x:(MOVHUreg _)) => (MOVVreg x)
+(MOVWUreg x:(MOVWUreg _)) => (MOVVreg x)
+
+// don't extend before store
+(MOVBstore [off] {sym} ptr (MOVBreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVWreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVWreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+
+// If a register move has only 1 use, just use the same register without emitting an instruction.
+// MOVVnop doesn't emit an instruction; it only ensures the type.
+(MOVVreg x) && x.Uses == 1 => (MOVVnop x)
+
+// fold constant into arithmetic ops
+(ADDV x (MOVVconst [c])) && is32Bit(c) => (ADDVconst [c] x)
+(SUBV x (MOVVconst [c])) && is32Bit(c) => (SUBVconst [c] x)
+(AND x (MOVVconst [c])) && is32Bit(c) => (ANDconst [c] x)
+(OR x (MOVVconst [c])) && is32Bit(c) => (ORconst [c] x)
+(XOR x (MOVVconst [c])) && is32Bit(c) => (XORconst [c] x)
+(NOR x (MOVVconst [c])) && is32Bit(c) => (NORconst [c] x)
+
+(SLLV _ (MOVVconst [c])) && uint64(c)>=64 => (MOVVconst [0])
+(SRLV _ (MOVVconst [c])) && uint64(c)>=64 => (MOVVconst [0])
+(SRAV x (MOVVconst [c])) && uint64(c)>=64 => (SRAVconst x [63])
+(SLLV x (MOVVconst [c])) => (SLLVconst x [c])
+(SRLV x (MOVVconst [c])) => (SRLVconst x [c])
+(SRAV x (MOVVconst [c])) => (SRAVconst x [c])
+
+(SGT (MOVVconst [c]) x) && is32Bit(c) => (SGTconst [c] x)
+(SGTU (MOVVconst [c]) x) && is32Bit(c) => (SGTUconst [c] x)
+
+// mul by constant
+(Select1 (MULVU x (MOVVconst [-1]))) => (NEGV x)
+(Select1 (MULVU _ (MOVVconst [0]))) => (MOVVconst [0])
+(Select1 (MULVU x (MOVVconst [1]))) => x
+(Select1 (MULVU x (MOVVconst [c]))) && isPowerOfTwo64(c) => (SLLVconst [log64(c)] x)
+
+// div by constant
+(Select1 (DIVVU x (MOVVconst [1]))) => x
+(Select1 (DIVVU x (MOVVconst [c]))) && isPowerOfTwo64(c) => (SRLVconst [log64(c)] x)
+(Select0 (DIVVU _ (MOVVconst [1]))) => (MOVVconst [0]) // mod
+(Select0 (DIVVU x (MOVVconst [c]))) && isPowerOfTwo64(c) => (ANDconst [c-1] x) // mod
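The divide-by-constant rules above use the usual power-of-two identities: for unsigned x and c == 1<<k, x/c == x>>k and x%c == x&(c-1). A minimal Go sketch, using math/bits in place of the generator's isPowerOfTwo64/log64 helpers (divModPow2 is a made-up name):

    package divexample

    import "math/bits"

    // divModPow2 assumes c is a power of two, as the rules' condition checks.
    func divModPow2(x, c uint64) (q, r uint64) {
        k := uint(bits.TrailingZeros64(c)) // log64(c)
        return x >> k, x & (c - 1)
    }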
+
+// generic simplifications
+(ADDV x (NEGV y)) => (SUBV x y)
+(SUBV x x) => (MOVVconst [0])
+(SUBV (MOVVconst [0]) x) => (NEGV x)
+(AND x x) => x
+(OR x x) => x
+(XOR x x) => (MOVVconst [0])
+
+// remove redundant *const ops
+(ADDVconst [0] x) => x
+(SUBVconst [0] x) => x
+(ANDconst [0] _) => (MOVVconst [0])
+(ANDconst [-1] x) => x
+(ORconst [0] x) => x
+(ORconst [-1] _) => (MOVVconst [-1])
+(XORconst [0] x) => x
+(XORconst [-1] x) => (NORconst [0] x)
+
+// generic constant folding
+(ADDVconst [c] (MOVVconst [d])) => (MOVVconst [c+d])
+(ADDVconst [c] (ADDVconst [d] x)) && is32Bit(c+d) => (ADDVconst [c+d] x)
+(ADDVconst [c] (SUBVconst [d] x)) && is32Bit(c-d) => (ADDVconst [c-d] x)
+(SUBVconst [c] (MOVVconst [d])) => (MOVVconst [d-c])
+(SUBVconst [c] (SUBVconst [d] x)) && is32Bit(-c-d) => (ADDVconst [-c-d] x)
+(SUBVconst [c] (ADDVconst [d] x)) && is32Bit(-c+d) => (ADDVconst [-c+d] x)
+(SLLVconst [c] (MOVVconst [d])) => (MOVVconst [d<<uint64(c)])
+(SRLVconst [c] (MOVVconst [d])) => (MOVVconst [int64(uint64(d)>>uint64(c))])
+(SRAVconst [c] (MOVVconst [d])) => (MOVVconst [d>>uint64(c)])
+(Select1 (MULVU (MOVVconst [c]) (MOVVconst [d]))) => (MOVVconst [c*d])
+(Select1 (DIVV (MOVVconst [c]) (MOVVconst [d]))) && d != 0 => (MOVVconst [c/d])
+(Select1 (DIVVU (MOVVconst [c]) (MOVVconst [d]))) && d != 0 => (MOVVconst [int64(uint64(c)/uint64(d))])
+(Select0 (DIVV (MOVVconst [c]) (MOVVconst [d]))) && d != 0 => (MOVVconst [c%d]) // mod
+(Select0 (DIVVU (MOVVconst [c]) (MOVVconst [d]))) && d != 0 => (MOVVconst [int64(uint64(c)%uint64(d))]) // mod
+(ANDconst [c] (MOVVconst [d])) => (MOVVconst [c&d])
+(ANDconst [c] (ANDconst [d] x)) => (ANDconst [c&d] x)
+(ORconst [c] (MOVVconst [d])) => (MOVVconst [c|d])
+(ORconst [c] (ORconst [d] x)) && is32Bit(c|d) => (ORconst [c|d] x)
+(XORconst [c] (MOVVconst [d])) => (MOVVconst [c^d])
+(XORconst [c] (XORconst [d] x)) && is32Bit(c^d) => (XORconst [c^d] x)
+(NORconst [c] (MOVVconst [d])) => (MOVVconst [^(c|d)])
+(NEGV (MOVVconst [c])) => (MOVVconst [-c])
+(MOVBreg (MOVVconst [c])) => (MOVVconst [int64(int8(c))])
+(MOVBUreg (MOVVconst [c])) => (MOVVconst [int64(uint8(c))])
+(MOVHreg (MOVVconst [c])) => (MOVVconst [int64(int16(c))])
+(MOVHUreg (MOVVconst [c])) => (MOVVconst [int64(uint16(c))])
+(MOVWreg (MOVVconst [c])) => (MOVVconst [int64(int32(c))])
+(MOVWUreg (MOVVconst [c])) => (MOVVconst [int64(uint32(c))])
+(MOVVreg (MOVVconst [c])) => (MOVVconst [c])
+(LoweredAtomicStore(32|64) ptr (MOVVconst [0]) mem) => (LoweredAtomicStorezero(32|64) ptr mem)
+(LoweredAtomicAdd32 ptr (MOVVconst [c]) mem) && is32Bit(c) => (LoweredAtomicAddconst32 [int32(c)] ptr mem)
+(LoweredAtomicAdd64 ptr (MOVVconst [c]) mem) && is32Bit(c) => (LoweredAtomicAddconst64 [c] ptr mem)
+
+// constant comparisons
+(SGTconst [c] (MOVVconst [d])) && c>d => (MOVVconst [1])
+(SGTconst [c] (MOVVconst [d])) && c<=d => (MOVVconst [0])
+(SGTUconst [c] (MOVVconst [d])) && uint64(c)>uint64(d) => (MOVVconst [1])
+(SGTUconst [c] (MOVVconst [d])) && uint64(c)<=uint64(d) => (MOVVconst [0])
+
+// other known comparisons
+(SGTconst [c] (MOVBreg _)) && 0x7f < c => (MOVVconst [1])
+(SGTconst [c] (MOVBreg _)) && c <= -0x80 => (MOVVconst [0])
+(SGTconst [c] (MOVBUreg _)) && 0xff < c => (MOVVconst [1])
+(SGTconst [c] (MOVBUreg _)) && c < 0 => (MOVVconst [0])
+(SGTUconst [c] (MOVBUreg _)) && 0xff < uint64(c) => (MOVVconst [1])
+(SGTconst [c] (MOVHreg _)) && 0x7fff < c => (MOVVconst [1])
+(SGTconst [c] (MOVHreg _)) && c <= -0x8000 => (MOVVconst [0])
+(SGTconst [c] (MOVHUreg _)) && 0xffff < c => (MOVVconst [1])
+(SGTconst [c] (MOVHUreg _)) && c < 0 => (MOVVconst [0])
+(SGTUconst [c] (MOVHUreg _)) && 0xffff < uint64(c) => (MOVVconst [1])
+(SGTconst [c] (MOVWUreg _)) && c < 0 => (MOVVconst [0])
+(SGTconst [c] (ANDconst [m] _)) && 0 <= m && m < c => (MOVVconst [1])
+(SGTUconst [c] (ANDconst [m] _)) && uint64(m) < uint64(c) => (MOVVconst [1])
+(SGTconst [c] (SRLVconst _ [d])) && 0 <= c && 0 < d && d <= 63 && 0xffffffffffffffff>>uint64(d) < uint64(c) => (MOVVconst [1])
+(SGTUconst [c] (SRLVconst _ [d])) && 0 < d && d <= 63 && 0xffffffffffffffff>>uint64(d) < uint64(c) => (MOVVconst [1])
+
+// absorb constants into branches
+(EQ (MOVVconst [0]) yes no) => (First yes no)
+(EQ (MOVVconst [c]) yes no) && c != 0 => (First no yes)
+(NE (MOVVconst [0]) yes no) => (First no yes)
+(NE (MOVVconst [c]) yes no) && c != 0 => (First yes no)
+(LTZ (MOVVconst [c]) yes no) && c < 0 => (First yes no)
+(LTZ (MOVVconst [c]) yes no) && c >= 0 => (First no yes)
+(LEZ (MOVVconst [c]) yes no) && c <= 0 => (First yes no)
+(LEZ (MOVVconst [c]) yes no) && c > 0 => (First no yes)
+(GTZ (MOVVconst [c]) yes no) && c > 0 => (First yes no)
+(GTZ (MOVVconst [c]) yes no) && c <= 0 => (First no yes)
+(GEZ (MOVVconst [c]) yes no) && c >= 0 => (First yes no)
+(GEZ (MOVVconst [c]) yes no) && c < 0 => (First no yes)
diff --git a/src/cmd/compile/internal/ssa/gen/MIPS64Ops.go b/src/cmd/compile/internal/ssa/gen/MIPS64Ops.go
new file mode 100644
index 0000000..e1e3933
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/MIPS64Ops.go
@@ -0,0 +1,482 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+import "strings"
+
+// Notes:
+// - Integer types live in the low portion of registers. Upper portions are junk.
+// - Boolean types use the low-order byte of a register. 0=false, 1=true.
+// Upper bytes are junk.
+// - *const instructions may use a constant larger than the instruction can encode.
+//   In this case the assembler expands it into multiple instructions, using the
+//   temporary register (R23).
+
+// Suffixes encode the bit width of various instructions.
+// V (vlong) = 64 bit
+// WU (word) = 32 bit unsigned
+// W (word) = 32 bit
+// H (half word) = 16 bit
+// HU = 16 bit unsigned
+// B (byte) = 8 bit
+// BU = 8 bit unsigned
+// F (float) = 32 bit float
+// D (double) = 64 bit float
+
+// Note: registers not used in regalloc are not included in this list,
+// so that regmask stays within int64.
+// Be careful when hand-coding regmasks.
+var regNamesMIPS64 = []string{
+ "R0", // constant 0
+ "R1",
+ "R2",
+ "R3",
+ "R4",
+ "R5",
+ "R6",
+ "R7",
+ "R8",
+ "R9",
+ "R10",
+ "R11",
+ "R12",
+ "R13",
+ "R14",
+ "R15",
+ "R16",
+ "R17",
+ "R18",
+ "R19",
+ "R20",
+ "R21",
+ "R22",
+ // R23 = REGTMP not used in regalloc
+ "R24",
+ "R25",
+ // R26 reserved by kernel
+ // R27 reserved by kernel
+ // R28 = REGSB not used in regalloc
+ "SP", // aka R29
+ "g", // aka R30
+ "R31", // aka REGLINK
+
+ "F0",
+ "F1",
+ "F2",
+ "F3",
+ "F4",
+ "F5",
+ "F6",
+ "F7",
+ "F8",
+ "F9",
+ "F10",
+ "F11",
+ "F12",
+ "F13",
+ "F14",
+ "F15",
+ "F16",
+ "F17",
+ "F18",
+ "F19",
+ "F20",
+ "F21",
+ "F22",
+ "F23",
+ "F24",
+ "F25",
+ "F26",
+ "F27",
+ "F28",
+ "F29",
+ "F30",
+ "F31",
+
+ "HI", // high bits of multiplication
+ "LO", // low bits of multiplication
+
+ // If you add registers, update asyncPreempt in runtime.
+
+ // pseudo-registers
+ "SB",
+}
+
+func init() {
+ // Make map from reg names to reg integers.
+ if len(regNamesMIPS64) > 64 {
+ panic("too many registers")
+ }
+ num := map[string]int{}
+ for i, name := range regNamesMIPS64 {
+ num[name] = i
+ }
+ buildReg := func(s string) regMask {
+ m := regMask(0)
+ for _, r := range strings.Split(s, " ") {
+ if n, ok := num[r]; ok {
+ m |= regMask(1) << uint(n)
+ continue
+ }
+ panic("register " + r + " not found")
+ }
+ return m
+ }
+
+ // Common individual register masks
+ var (
+ gp = buildReg("R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R22 R24 R25 R31")
+ gpg = gp | buildReg("g")
+ gpsp = gp | buildReg("SP")
+ gpspg = gpg | buildReg("SP")
+ gpspsbg = gpspg | buildReg("SB")
+ fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31")
+ lo = buildReg("LO")
+ hi = buildReg("HI")
+ callerSave = gp | fp | lo | hi | buildReg("g") // runtime.setg (and anything calling it) may clobber g
+ r1 = buildReg("R1")
+ r2 = buildReg("R2")
+ r3 = buildReg("R3")
+ r4 = buildReg("R4")
+ )
+ // Common regInfo
+ var (
+ gp01 = regInfo{inputs: nil, outputs: []regMask{gp}}
+ gp11 = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}}
+ gp11sp = regInfo{inputs: []regMask{gpspg}, outputs: []regMask{gp}}
+ gp21 = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp}}
+ gp2hilo = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{hi, lo}}
+ gpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}
+ gpstore = regInfo{inputs: []regMask{gpspsbg, gpg}}
+ gpstore0 = regInfo{inputs: []regMask{gpspsbg}}
+ gpxchg = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}}
+ gpcas = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}, outputs: []regMask{gp}}
+ fp01 = regInfo{inputs: nil, outputs: []regMask{fp}}
+ fp11 = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}}
+ //fp1flags = regInfo{inputs: []regMask{fp}}
+ //fpgp = regInfo{inputs: []regMask{fp}, outputs: []regMask{gp}}
+ //gpfp = regInfo{inputs: []regMask{gp}, outputs: []regMask{fp}}
+ fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
+ fp2flags = regInfo{inputs: []regMask{fp, fp}}
+ fpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{fp}}
+ fpstore = regInfo{inputs: []regMask{gpspsbg, fp}}
+ readflags = regInfo{inputs: nil, outputs: []regMask{gp}}
+ )
+ ops := []opData{
+ // binary ops
+ {name: "ADDV", argLength: 2, reg: gp21, asm: "ADDVU", commutative: true}, // arg0 + arg1
+ {name: "ADDVconst", argLength: 1, reg: gp11sp, asm: "ADDVU", aux: "Int64"}, // arg0 + auxInt. auxInt is 32-bit, also in other *const ops.
+ {name: "SUBV", argLength: 2, reg: gp21, asm: "SUBVU"}, // arg0 - arg1
+ {name: "SUBVconst", argLength: 1, reg: gp11, asm: "SUBVU", aux: "Int64"}, // arg0 - auxInt
+ {name: "MULV", argLength: 2, reg: gp2hilo, asm: "MULV", commutative: true, typ: "(Int64,Int64)"}, // arg0 * arg1, signed, results hi,lo
+ {name: "MULVU", argLength: 2, reg: gp2hilo, asm: "MULVU", commutative: true, typ: "(UInt64,UInt64)"}, // arg0 * arg1, unsigned, results hi,lo
+ {name: "DIVV", argLength: 2, reg: gp2hilo, asm: "DIVV", typ: "(Int64,Int64)"}, // arg0 / arg1, signed, results hi=arg0%arg1,lo=arg0/arg1
+	{name: "DIVVU", argLength: 2, reg: gp2hilo, asm: "DIVVU", typ: "(UInt64,UInt64)"}, // arg0 / arg1, unsigned, results hi=arg0%arg1,lo=arg0/arg1
+
+ {name: "ADDF", argLength: 2, reg: fp21, asm: "ADDF", commutative: true}, // arg0 + arg1
+ {name: "ADDD", argLength: 2, reg: fp21, asm: "ADDD", commutative: true}, // arg0 + arg1
+ {name: "SUBF", argLength: 2, reg: fp21, asm: "SUBF"}, // arg0 - arg1
+ {name: "SUBD", argLength: 2, reg: fp21, asm: "SUBD"}, // arg0 - arg1
+ {name: "MULF", argLength: 2, reg: fp21, asm: "MULF", commutative: true}, // arg0 * arg1
+ {name: "MULD", argLength: 2, reg: fp21, asm: "MULD", commutative: true}, // arg0 * arg1
+ {name: "DIVF", argLength: 2, reg: fp21, asm: "DIVF"}, // arg0 / arg1
+ {name: "DIVD", argLength: 2, reg: fp21, asm: "DIVD"}, // arg0 / arg1
+
+ {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1
+ {name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int64"}, // arg0 & auxInt
+ {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1
+ {name: "ORconst", argLength: 1, reg: gp11, asm: "OR", aux: "Int64"}, // arg0 | auxInt
+ {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true, typ: "UInt64"}, // arg0 ^ arg1
+ {name: "XORconst", argLength: 1, reg: gp11, asm: "XOR", aux: "Int64", typ: "UInt64"}, // arg0 ^ auxInt
+ {name: "NOR", argLength: 2, reg: gp21, asm: "NOR", commutative: true}, // ^(arg0 | arg1)
+ {name: "NORconst", argLength: 1, reg: gp11, asm: "NOR", aux: "Int64"}, // ^(arg0 | auxInt)
+
+ {name: "NEGV", argLength: 1, reg: gp11}, // -arg0
+ {name: "NEGF", argLength: 1, reg: fp11, asm: "NEGF"}, // -arg0, float32
+ {name: "NEGD", argLength: 1, reg: fp11, asm: "NEGD"}, // -arg0, float64
+ {name: "SQRTD", argLength: 1, reg: fp11, asm: "SQRTD"}, // sqrt(arg0), float64
+
+ // shifts
+ {name: "SLLV", argLength: 2, reg: gp21, asm: "SLLV"}, // arg0 << arg1, shift amount is mod 64
+ {name: "SLLVconst", argLength: 1, reg: gp11, asm: "SLLV", aux: "Int64"}, // arg0 << auxInt
+ {name: "SRLV", argLength: 2, reg: gp21, asm: "SRLV"}, // arg0 >> arg1, unsigned, shift amount is mod 64
+ {name: "SRLVconst", argLength: 1, reg: gp11, asm: "SRLV", aux: "Int64"}, // arg0 >> auxInt, unsigned
+ {name: "SRAV", argLength: 2, reg: gp21, asm: "SRAV"}, // arg0 >> arg1, signed, shift amount is mod 64
+ {name: "SRAVconst", argLength: 1, reg: gp11, asm: "SRAV", aux: "Int64"}, // arg0 >> auxInt, signed
+
+ // comparisons
+ {name: "SGT", argLength: 2, reg: gp21, asm: "SGT", typ: "Bool"}, // 1 if arg0 > arg1 (signed), 0 otherwise
+ {name: "SGTconst", argLength: 1, reg: gp11, asm: "SGT", aux: "Int64", typ: "Bool"}, // 1 if auxInt > arg0 (signed), 0 otherwise
+ {name: "SGTU", argLength: 2, reg: gp21, asm: "SGTU", typ: "Bool"}, // 1 if arg0 > arg1 (unsigned), 0 otherwise
+ {name: "SGTUconst", argLength: 1, reg: gp11, asm: "SGTU", aux: "Int64", typ: "Bool"}, // 1 if auxInt > arg0 (unsigned), 0 otherwise
+
+ {name: "CMPEQF", argLength: 2, reg: fp2flags, asm: "CMPEQF", typ: "Flags"}, // flags=true if arg0 = arg1, float32
+ {name: "CMPEQD", argLength: 2, reg: fp2flags, asm: "CMPEQD", typ: "Flags"}, // flags=true if arg0 = arg1, float64
+ {name: "CMPGEF", argLength: 2, reg: fp2flags, asm: "CMPGEF", typ: "Flags"}, // flags=true if arg0 >= arg1, float32
+ {name: "CMPGED", argLength: 2, reg: fp2flags, asm: "CMPGED", typ: "Flags"}, // flags=true if arg0 >= arg1, float64
+ {name: "CMPGTF", argLength: 2, reg: fp2flags, asm: "CMPGTF", typ: "Flags"}, // flags=true if arg0 > arg1, float32
+ {name: "CMPGTD", argLength: 2, reg: fp2flags, asm: "CMPGTD", typ: "Flags"}, // flags=true if arg0 > arg1, float64
+
+ // moves
+ {name: "MOVVconst", argLength: 0, reg: gp01, aux: "Int64", asm: "MOVV", typ: "UInt64", rematerializeable: true}, // auxint
+ {name: "MOVFconst", argLength: 0, reg: fp01, aux: "Float64", asm: "MOVF", typ: "Float32", rematerializeable: true}, // auxint as 64-bit float, convert to 32-bit float
+ {name: "MOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "MOVD", typ: "Float64", rematerializeable: true}, // auxint as 64-bit float
+
+ {name: "MOVVaddr", argLength: 1, reg: regInfo{inputs: []regMask{buildReg("SP") | buildReg("SB")}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVV", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB
+
+ {name: "MOVBload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVB", typ: "Int8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVBUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVBU", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVH", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVHU", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVWload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVW", typ: "Int32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVWUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVWU", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVVload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVV", typ: "UInt64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVFload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVF", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVDload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVD", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+
+ {name: "MOVBstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVHstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVWstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVVstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVV", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVFstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVF", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVDstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+
+ {name: "MOVBstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of zero to arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of zero to arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVWstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of zero to arg0 + auxInt + aux. arg1=mem.
+	{name: "MOVVstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVV", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of zero to arg0 + auxInt + aux. arg1=mem.
+
+ // conversions
+ {name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB"}, // move from arg0, sign-extended from byte
+	{name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, zero-extended from byte
+ {name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVH"}, // move from arg0, sign-extended from half
+	{name: "MOVHUreg", argLength: 1, reg: gp11, asm: "MOVHU"}, // move from arg0, zero-extended from half
+ {name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW"}, // move from arg0, sign-extended from word
+	{name: "MOVWUreg", argLength: 1, reg: gp11, asm: "MOVWU"}, // move from arg0, zero-extended from word
+ {name: "MOVVreg", argLength: 1, reg: gp11, asm: "MOVV"}, // move from arg0
+
+ {name: "MOVVnop", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}, resultInArg0: true}, // nop, return arg0 in same register
+
+ {name: "MOVWF", argLength: 1, reg: fp11, asm: "MOVWF"}, // int32 -> float32
+ {name: "MOVWD", argLength: 1, reg: fp11, asm: "MOVWD"}, // int32 -> float64
+ {name: "MOVVF", argLength: 1, reg: fp11, asm: "MOVVF"}, // int64 -> float32
+ {name: "MOVVD", argLength: 1, reg: fp11, asm: "MOVVD"}, // int64 -> float64
+ {name: "TRUNCFW", argLength: 1, reg: fp11, asm: "TRUNCFW"}, // float32 -> int32
+ {name: "TRUNCDW", argLength: 1, reg: fp11, asm: "TRUNCDW"}, // float64 -> int32
+ {name: "TRUNCFV", argLength: 1, reg: fp11, asm: "TRUNCFV"}, // float32 -> int64
+ {name: "TRUNCDV", argLength: 1, reg: fp11, asm: "TRUNCDV"}, // float64 -> int64
+ {name: "MOVFD", argLength: 1, reg: fp11, asm: "MOVFD"}, // float32 -> float64
+ {name: "MOVDF", argLength: 1, reg: fp11, asm: "MOVDF"}, // float64 -> float32
+
+ // function calls
+ {name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gpsp, buildReg("R22"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+ {name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+
+ // duffzero
+ // arg0 = address of memory to zero
+ // arg1 = mem
+ // auxint = offset into duffzero code to start executing
+ // returns mem
+ // R1 aka mips.REGRT1 changed as side effect
+ {
+ name: "DUFFZERO",
+ aux: "Int64",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{gp},
+ clobbers: buildReg("R1 R31"),
+ },
+ faultOnNilArg0: true,
+ },
+
+ // duffcopy
+ // arg0 = address of dst memory (in R2, changed as side effect)
+ // arg1 = address of src memory (in R1, changed as side effect)
+ // arg2 = mem
+ // auxint = offset into duffcopy code to start executing
+ // returns mem
+ {
+ name: "DUFFCOPY",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R2"), buildReg("R1")},
+ clobbers: buildReg("R1 R2 R31"),
+ },
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // large or unaligned zeroing
+ // arg0 = address of memory to zero (in R1, changed as side effect)
+ // arg1 = address of the last element to zero
+ // arg2 = mem
+ // auxint = alignment
+ // returns mem
+ // SUBV $8, R1
+ // MOVV R0, 8(R1)
+ // ADDV $8, R1
+ // BNE Rarg1, R1, -2(PC)
+ {
+ name: "LoweredZero",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R1"), gp},
+ clobbers: buildReg("R1"),
+ },
+ clobberFlags: true,
+ faultOnNilArg0: true,
+ },
+
+ // large or unaligned move
+ // arg0 = address of dst memory (in R2, changed as side effect)
+ // arg1 = address of src memory (in R1, changed as side effect)
+ // arg2 = address of the last element of src
+ // arg3 = mem
+ // auxint = alignment
+ // returns mem
+ // SUBV $8, R1
+ // MOVV 8(R1), Rtmp
+ // MOVV Rtmp, (R2)
+ // ADDV $8, R1
+ // ADDV $8, R2
+ // BNE Rarg2, R1, -4(PC)
+ {
+ name: "LoweredMove",
+ aux: "Int64",
+ argLength: 4,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R2"), buildReg("R1"), gp},
+ clobbers: buildReg("R1 R2"),
+ },
+ clobberFlags: true,
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // atomic loads.
+ // load from arg0. arg1=mem.
+ // returns <value,memory> so they can be properly ordered with other loads.
+ {name: "LoweredAtomicLoad8", argLength: 2, reg: gpload, faultOnNilArg0: true},
+ {name: "LoweredAtomicLoad32", argLength: 2, reg: gpload, faultOnNilArg0: true},
+ {name: "LoweredAtomicLoad64", argLength: 2, reg: gpload, faultOnNilArg0: true},
+
+ // atomic stores.
+ // store arg1 to arg0. arg2=mem. returns memory.
+ {name: "LoweredAtomicStore8", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicStore32", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicStore64", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true},
+ // store zero to arg0. arg1=mem. returns memory.
+ {name: "LoweredAtomicStorezero32", argLength: 2, reg: gpstore0, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicStorezero64", argLength: 2, reg: gpstore0, faultOnNilArg0: true, hasSideEffects: true},
+
+ // atomic exchange.
+ // store arg1 to arg0. arg2=mem. returns <old content of *arg0, memory>.
+ // SYNC
+ // LL (Rarg0), Rout
+ // MOVV Rarg1, Rtmp
+ // SC Rtmp, (Rarg0)
+ // BEQ Rtmp, -3(PC)
+ // SYNC
+ {name: "LoweredAtomicExchange32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicExchange64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // atomic add.
+ // *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>.
+ // SYNC
+ // LL (Rarg0), Rout
+ // ADDV Rarg1, Rout, Rtmp
+ // SC Rtmp, (Rarg0)
+ // BEQ Rtmp, -3(PC)
+ // SYNC
+ // ADDV Rarg1, Rout
+ {name: "LoweredAtomicAdd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicAdd64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ // *arg0 += auxint. arg1=mem. returns <new content of *arg0, memory>. auxint is 32-bit.
+ {name: "LoweredAtomicAddconst32", argLength: 2, reg: regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}, aux: "Int32", resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicAddconst64", argLength: 2, reg: regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}, aux: "Int64", resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
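+ // Editorial note (sketch, not part of the original op table): at the Go level
+ // these add ops carry the same contract as sync/atomic.AddInt32/AddInt64,
+ // i.e. the value returned is the updated contents of *arg0:
+ //	new := atomic.AddInt64(p, delta) // new == old + delta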
+
+ // atomic compare and swap.
+ // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory.
+ // if *arg0 == arg1 {
+ // *arg0 = arg2
+ // return (true, memory)
+ // } else {
+ // return (false, memory)
+ // }
+ // SYNC
+ // MOVV $0, Rout
+ // LL (Rarg0), Rtmp
+ // BNE Rtmp, Rarg1, 4(PC)
+ // MOVV Rarg2, Rout
+ // SC Rout, (Rarg0)
+ // BEQ Rout, -4(PC)
+ // SYNC
+ {name: "LoweredAtomicCas32", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicCas64", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // pseudo-ops
+ {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpg}}, nilCheck: true, faultOnNilArg0: true}, // panic if arg0 is nil. arg1=mem.
+
+ {name: "FPFlagTrue", argLength: 1, reg: readflags}, // bool, true if FP flag is true
+ {name: "FPFlagFalse", argLength: 1, reg: readflags}, // bool, true if FP flag is false
+
+ // Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
+ // and sorts it to the very beginning of the block to prevent other
+ // use of R22 (mips.REGCTXT, the closure pointer)
+ {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("R22")}}, zeroWidth: true},
+
+ // LoweredGetCallerSP returns the SP of the caller of the current function.
+ {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true},
+
+ // LoweredGetCallerPC evaluates to the PC to which its "caller" will return.
+ // I.e., if f calls g, and g calls getcallerpc,
+ // the result should be the PC within f that g will return to.
+ // See runtime/stubs.go for a more detailed discussion.
+ {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true},
+
+ // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+ // It saves all GP registers if necessary,
+ // but clobbers R31 (LR) because it's a call
+ // and R23 (REGTMP).
+ {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("R20"), buildReg("R21")}, clobbers: (callerSave &^ gpg) | buildReg("R31")}, clobberFlags: true, aux: "Sym", symEffect: "None"},
+
+ // There are three of these functions so that they can have three different register inputs.
+ // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
+ // default registers to match so we don't need to copy registers around unnecessarily.
+ {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r3, r4}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ }
+
+ blocks := []blockData{
+ {name: "EQ", controls: 1},
+ {name: "NE", controls: 1},
+ {name: "LTZ", controls: 1}, // < 0
+ {name: "LEZ", controls: 1}, // <= 0
+ {name: "GTZ", controls: 1}, // > 0
+ {name: "GEZ", controls: 1}, // >= 0
+ {name: "FPT", controls: 1}, // FP flag is true
+ {name: "FPF", controls: 1}, // FP flag is false
+ }
+
+ archs = append(archs, arch{
+ name: "MIPS64",
+ pkg: "cmd/internal/obj/mips",
+ genfile: "../../mips64/ssa.go",
+ ops: ops,
+ blocks: blocks,
+ regnames: regNamesMIPS64,
+ gpregmask: gp,
+ fpregmask: fp,
+ specialregmask: hi | lo,
+ framepointerreg: -1, // not used
+ linkreg: int8(num["R31"]),
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/gen/MIPSOps.go b/src/cmd/compile/internal/ssa/gen/MIPSOps.go
new file mode 100644
index 0000000..75ab99e
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/MIPSOps.go
@@ -0,0 +1,439 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+import "strings"
+
+// Notes:
+// - Integer types live in the low portion of registers. Upper portions are junk.
+// - Boolean types use the low-order byte of a register. 0=false, 1=true.
+// Upper bytes are junk.
+// - Unused portions of AuxInt are filled by sign-extending the used portion.
+// - *const instructions may use a constant larger than the instruction can encode.
+//   In this case the assembler expands to multiple instructions and uses the
+//   temporary register (R23).
+
+// Suffixes encode the bit width of various instructions.
+// W (word) = 32 bit
+// H (half word) = 16 bit
+// HU = 16 bit unsigned
+// B (byte) = 8 bit
+// BU = 8 bit unsigned
+// F (float) = 32 bit float
+// D (double) = 64 bit float
+
+// Note: registers not used in regalloc are not included in this list,
+// so that regmask stays within int64.
+// Be careful when hand-coding regmasks.
+var regNamesMIPS = []string{
+ "R0", // constant 0
+ "R1",
+ "R2",
+ "R3",
+ "R4",
+ "R5",
+ "R6",
+ "R7",
+ "R8",
+ "R9",
+ "R10",
+ "R11",
+ "R12",
+ "R13",
+ "R14",
+ "R15",
+ "R16",
+ "R17",
+ "R18",
+ "R19",
+ "R20",
+ "R21",
+ "R22",
+ // R23 = REGTMP, not used in regalloc
+ "R24",
+ "R25",
+ // R26 reserved by kernel
+ // R27 reserved by kernel
+ "R28",
+ "SP", // aka R29
+ "g", // aka R30
+ "R31", // REGLINK
+
+ // odd FP registers contain high parts of 64-bit FP values
+ "F0",
+ "F2",
+ "F4",
+ "F6",
+ "F8",
+ "F10",
+ "F12",
+ "F14",
+ "F16",
+ "F18",
+ "F20",
+ "F22",
+ "F24",
+ "F26",
+ "F28",
+ "F30",
+
+ "HI", // high bits of multiplication
+ "LO", // low bits of multiplication
+
+ // If you add registers, update asyncPreempt in runtime.
+
+ // pseudo-registers
+ "SB",
+}
+
+func init() {
+ // Make map from reg names to reg integers.
+ if len(regNamesMIPS) > 64 {
+ panic("too many registers")
+ }
+ num := map[string]int{}
+ for i, name := range regNamesMIPS {
+ num[name] = i
+ }
+ buildReg := func(s string) regMask {
+ m := regMask(0)
+ for _, r := range strings.Split(s, " ") {
+ if n, ok := num[r]; ok {
+ m |= regMask(1) << uint(n)
+ continue
+ }
+ panic("register " + r + " not found")
+ }
+ return m
+ }
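+ // Illustrative sketch (editorial note, not in the original source): each name
+ // in regNamesMIPS maps to one bit by position, so for example
+ //	buildReg("R1 R2") == regMask(1)<<1 | regMask(1)<<2 == 0b110
+ // and the register-class masks below are simply unions of such bits.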
+
+ // Common individual register masks
+ var (
+ gp = buildReg("R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R22 R24 R25 R28 R31")
+ gpg = gp | buildReg("g")
+ gpsp = gp | buildReg("SP")
+ gpspg = gpg | buildReg("SP")
+ gpspsbg = gpspg | buildReg("SB")
+ fp = buildReg("F0 F2 F4 F6 F8 F10 F12 F14 F16 F18 F20 F22 F24 F26 F28 F30")
+ lo = buildReg("LO")
+ hi = buildReg("HI")
+ callerSave = gp | fp | lo | hi | buildReg("g") // runtime.setg (and anything calling it) may clobber g
+ r1 = buildReg("R1")
+ r2 = buildReg("R2")
+ r3 = buildReg("R3")
+ r4 = buildReg("R4")
+ r5 = buildReg("R5")
+ )
+ // Common regInfo
+ var (
+ gp01 = regInfo{inputs: nil, outputs: []regMask{gp}}
+ gp11 = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}}
+ gp11sp = regInfo{inputs: []regMask{gpspg}, outputs: []regMask{gp}}
+ gp21 = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp}}
+ gp31 = regInfo{inputs: []regMask{gp, gp, gp}, outputs: []regMask{gp}}
+ gp2hilo = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{hi, lo}}
+ gpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}
+ gpstore = regInfo{inputs: []regMask{gpspsbg, gpg}}
+ gpxchg = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}}
+ gpcas = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}, outputs: []regMask{gp}}
+ gpstore0 = regInfo{inputs: []regMask{gpspsbg}}
+ fp01 = regInfo{inputs: nil, outputs: []regMask{fp}}
+ fp11 = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}}
+ fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
+ fp2flags = regInfo{inputs: []regMask{fp, fp}}
+ fpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{fp}}
+ fpstore = regInfo{inputs: []regMask{gpspsbg, fp}}
+ readflags = regInfo{inputs: nil, outputs: []regMask{gp}}
+ )
+ ops := []opData{
+ {name: "ADD", argLength: 2, reg: gp21, asm: "ADDU", commutative: true}, // arg0 + arg1
+ {name: "ADDconst", argLength: 1, reg: gp11sp, asm: "ADDU", aux: "Int32"}, // arg0 + auxInt
+ {name: "SUB", argLength: 2, reg: gp21, asm: "SUBU"}, // arg0 - arg1
+ {name: "SUBconst", argLength: 1, reg: gp11, asm: "SUBU", aux: "Int32"}, // arg0 - auxInt
+ {name: "MUL", argLength: 2, reg: regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp}, clobbers: hi | lo}, asm: "MUL", commutative: true}, // arg0 * arg1
+ {name: "MULT", argLength: 2, reg: gp2hilo, asm: "MUL", commutative: true, typ: "(Int32,Int32)"}, // arg0 * arg1, signed, results hi,lo
+ {name: "MULTU", argLength: 2, reg: gp2hilo, asm: "MULU", commutative: true, typ: "(UInt32,UInt32)"}, // arg0 * arg1, unsigned, results hi,lo
+ {name: "DIV", argLength: 2, reg: gp2hilo, asm: "DIV", typ: "(Int32,Int32)"}, // arg0 / arg1, signed, results hi=arg0%arg1,lo=arg0/arg1
+ {name: "DIVU", argLength: 2, reg: gp2hilo, asm: "DIVU", typ: "(UInt32,UInt32)"}, // arg0 / arg1, unsigned, results hi=arg0%arg1,lo=arg0/arg1
+
+ {name: "ADDF", argLength: 2, reg: fp21, asm: "ADDF", commutative: true}, // arg0 + arg1
+ {name: "ADDD", argLength: 2, reg: fp21, asm: "ADDD", commutative: true}, // arg0 + arg1
+ {name: "SUBF", argLength: 2, reg: fp21, asm: "SUBF"}, // arg0 - arg1
+ {name: "SUBD", argLength: 2, reg: fp21, asm: "SUBD"}, // arg0 - arg1
+ {name: "MULF", argLength: 2, reg: fp21, asm: "MULF", commutative: true}, // arg0 * arg1
+ {name: "MULD", argLength: 2, reg: fp21, asm: "MULD", commutative: true}, // arg0 * arg1
+ {name: "DIVF", argLength: 2, reg: fp21, asm: "DIVF"}, // arg0 / arg1
+ {name: "DIVD", argLength: 2, reg: fp21, asm: "DIVD"}, // arg0 / arg1
+
+ {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1
+ {name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int32"}, // arg0 & auxInt
+ {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1
+ {name: "ORconst", argLength: 1, reg: gp11, asm: "OR", aux: "Int32"}, // arg0 | auxInt
+ {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true, typ: "UInt32"}, // arg0 ^ arg1
+ {name: "XORconst", argLength: 1, reg: gp11, asm: "XOR", aux: "Int32", typ: "UInt32"}, // arg0 ^ auxInt
+ {name: "NOR", argLength: 2, reg: gp21, asm: "NOR", commutative: true}, // ^(arg0 | arg1)
+ {name: "NORconst", argLength: 1, reg: gp11, asm: "NOR", aux: "Int32"}, // ^(arg0 | auxInt)
+
+ {name: "NEG", argLength: 1, reg: gp11}, // -arg0
+ {name: "NEGF", argLength: 1, reg: fp11, asm: "NEGF"}, // -arg0, float32
+ {name: "NEGD", argLength: 1, reg: fp11, asm: "NEGD"}, // -arg0, float64
+ {name: "SQRTD", argLength: 1, reg: fp11, asm: "SQRTD"}, // sqrt(arg0), float64
+
+ // shifts
+ {name: "SLL", argLength: 2, reg: gp21, asm: "SLL"}, // arg0 << arg1, shift amount is mod 32
+ {name: "SLLconst", argLength: 1, reg: gp11, asm: "SLL", aux: "Int32"}, // arg0 << auxInt, shift amount must be 0 through 31 inclusive
+ {name: "SRL", argLength: 2, reg: gp21, asm: "SRL"}, // arg0 >> arg1, unsigned, shift amount is mod 32
+ {name: "SRLconst", argLength: 1, reg: gp11, asm: "SRL", aux: "Int32"}, // arg0 >> auxInt, shift amount must be 0 through 31 inclusive
+ {name: "SRA", argLength: 2, reg: gp21, asm: "SRA"}, // arg0 >> arg1, signed, shift amount is mod 32
+ {name: "SRAconst", argLength: 1, reg: gp11, asm: "SRA", aux: "Int32"}, // arg0 >> auxInt, signed, shift amount must be 0 through 31 inclusive
+
+ {name: "CLZ", argLength: 1, reg: gp11, asm: "CLZ"},
+
+ // comparisons
+ {name: "SGT", argLength: 2, reg: gp21, asm: "SGT", typ: "Bool"}, // 1 if arg0 > arg1 (signed), 0 otherwise
+ {name: "SGTconst", argLength: 1, reg: gp11, asm: "SGT", aux: "Int32", typ: "Bool"}, // 1 if auxInt > arg0 (signed), 0 otherwise
+ {name: "SGTzero", argLength: 1, reg: gp11, asm: "SGT", typ: "Bool"}, // 1 if arg0 > 0 (signed), 0 otherwise
+ {name: "SGTU", argLength: 2, reg: gp21, asm: "SGTU", typ: "Bool"}, // 1 if arg0 > arg1 (unsigned), 0 otherwise
+ {name: "SGTUconst", argLength: 1, reg: gp11, asm: "SGTU", aux: "Int32", typ: "Bool"}, // 1 if auxInt > arg0 (unsigned), 0 otherwise
+ {name: "SGTUzero", argLength: 1, reg: gp11, asm: "SGTU", typ: "Bool"}, // 1 if arg0 > 0 (unsigned), 0 otherwise
+
+ {name: "CMPEQF", argLength: 2, reg: fp2flags, asm: "CMPEQF", typ: "Flags"}, // flags=true if arg0 = arg1, float32
+ {name: "CMPEQD", argLength: 2, reg: fp2flags, asm: "CMPEQD", typ: "Flags"}, // flags=true if arg0 = arg1, float64
+ {name: "CMPGEF", argLength: 2, reg: fp2flags, asm: "CMPGEF", typ: "Flags"}, // flags=true if arg0 >= arg1, float32
+ {name: "CMPGED", argLength: 2, reg: fp2flags, asm: "CMPGED", typ: "Flags"}, // flags=true if arg0 >= arg1, float64
+ {name: "CMPGTF", argLength: 2, reg: fp2flags, asm: "CMPGTF", typ: "Flags"}, // flags=true if arg0 > arg1, float32
+ {name: "CMPGTD", argLength: 2, reg: fp2flags, asm: "CMPGTD", typ: "Flags"}, // flags=true if arg0 > arg1, float64
+
+ // moves
+ {name: "MOVWconst", argLength: 0, reg: gp01, aux: "Int32", asm: "MOVW", typ: "UInt32", rematerializeable: true}, // auxint
+ {name: "MOVFconst", argLength: 0, reg: fp01, aux: "Float32", asm: "MOVF", typ: "Float32", rematerializeable: true}, // auxint as 64-bit float, convert to 32-bit float
+ {name: "MOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "MOVD", typ: "Float64", rematerializeable: true}, // auxint as 64-bit float
+
+ {name: "MOVWaddr", argLength: 1, reg: regInfo{inputs: []regMask{buildReg("SP") | buildReg("SB")}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVW", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB
+
+ {name: "MOVBload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVB", typ: "Int8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVBUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVBU", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVH", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVHU", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVWload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVW", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVFload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVF", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVDload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVD", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
+
+ {name: "MOVBstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVHstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVWstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVFstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVF", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+ {name: "MOVDstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
+
+ {name: "MOVBstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of zero to arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVHstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of zero to arg0 + auxInt + aux. arg1=mem.
+ {name: "MOVWstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of zero to arg0 + auxInt + aux. arg1=mem.
+
+ // conversions
+ {name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB"}, // move from arg0, sign-extended from byte
+ {name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, zero-extended from byte
+ {name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVH"}, // move from arg0, sign-extended from half
+ {name: "MOVHUreg", argLength: 1, reg: gp11, asm: "MOVHU"}, // move from arg0, zero-extended from half
+ {name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW"}, // move from arg0
+
+ {name: "MOVWnop", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}, resultInArg0: true}, // nop, return arg0 in same register
+
+ // conditional move on zero (returns arg1 if arg2 is 0, otherwise arg0)
+ // order of parameters is reversed so we can use resultInArg0 (OpCMOVZ result arg1 arg2 -> CMOVZ arg2reg, arg1reg, resultReg)
+ {name: "CMOVZ", argLength: 3, reg: gp31, asm: "CMOVZ", resultInArg0: true},
+ {name: "CMOVZzero", argLength: 2, reg: regInfo{inputs: []regMask{gp, gpg}, outputs: []regMask{gp}}, asm: "CMOVZ", resultInArg0: true},
+
+ {name: "MOVWF", argLength: 1, reg: fp11, asm: "MOVWF"}, // int32 -> float32
+ {name: "MOVWD", argLength: 1, reg: fp11, asm: "MOVWD"}, // int32 -> float64
+ {name: "TRUNCFW", argLength: 1, reg: fp11, asm: "TRUNCFW"}, // float32 -> int32
+ {name: "TRUNCDW", argLength: 1, reg: fp11, asm: "TRUNCDW"}, // float64 -> int32
+ {name: "MOVFD", argLength: 1, reg: fp11, asm: "MOVFD"}, // float32 -> float64
+ {name: "MOVDF", argLength: 1, reg: fp11, asm: "MOVDF"}, // float64 -> float32
+
+ // function calls
+ {name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gpsp, buildReg("R22"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+ {name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+
+ // atomic ops
+
+ // load from arg0. arg1=mem.
+ // returns <value,memory> so they can be properly ordered with other loads.
+ // SYNC
+ // MOV(B|W) (Rarg0), Rout
+ // SYNC
+ {name: "LoweredAtomicLoad8", argLength: 2, reg: gpload, faultOnNilArg0: true},
+ {name: "LoweredAtomicLoad32", argLength: 2, reg: gpload, faultOnNilArg0: true},
+
+ // store arg1 to arg0. arg2=mem. returns memory.
+ // SYNC
+ // MOV(B|W) Rarg1, (Rarg0)
+ // SYNC
+ {name: "LoweredAtomicStore8", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicStore32", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicStorezero", argLength: 2, reg: gpstore0, faultOnNilArg0: true, hasSideEffects: true},
+
+ // atomic exchange.
+ // store arg1 to arg0. arg2=mem. returns <old content of *arg0, memory>.
+ // SYNC
+ // LL (Rarg0), Rout
+ // MOVW Rarg1, Rtmp
+ // SC Rtmp, (Rarg0)
+ // BEQ Rtmp, -3(PC)
+ // SYNC
+ {name: "LoweredAtomicExchange", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // atomic add.
+ // *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>.
+ // SYNC
+ // LL (Rarg0), Rout
+ // ADDU Rarg1, Rout, Rtmp
+ // SC Rtmp, (Rarg0)
+ // BEQ Rtmp, -3(PC)
+ // SYNC
+ // ADDU Rarg1, Rout
+ {name: "LoweredAtomicAdd", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicAddconst", argLength: 2, reg: regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}, aux: "Int32", resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // atomic compare and swap.
+ // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory.
+ // if *arg0 == arg1 {
+ // *arg0 = arg2
+ // return (true, memory)
+ // } else {
+ // return (false, memory)
+ // }
+ // SYNC
+ // MOVW $0, Rout
+ // LL (Rarg0), Rtmp
+ // BNE Rtmp, Rarg1, 4(PC)
+ // MOVW Rarg2, Rout
+ // SC Rout, (Rarg0)
+ // BEQ Rout, -4(PC)
+ // SYNC
+ {name: "LoweredAtomicCas", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // atomic and/or.
+ // *arg0 &= (|=) arg1. arg2=mem. returns memory.
+ // SYNC
+ // LL (Rarg0), Rtmp
+ // AND Rarg1, Rtmp
+ // SC Rtmp, (Rarg0)
+ // BEQ Rtmp, -3(PC)
+ // SYNC
+ {name: "LoweredAtomicAnd", argLength: 3, reg: gpstore, asm: "AND", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicOr", argLength: 3, reg: gpstore, asm: "OR", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // large or unaligned zeroing
+ // arg0 = address of memory to zero (in R1, changed as side effect)
+ // arg1 = address of the last element to zero
+ // arg2 = mem
+ // auxint = alignment
+ // returns mem
+ // SUBU $4, R1
+ // MOVW R0, 4(R1)
+ // ADDU $4, R1
+ // BNE Rarg1, R1, -2(PC)
+ {
+ name: "LoweredZero",
+ aux: "Int32",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R1"), gp},
+ clobbers: buildReg("R1"),
+ },
+ faultOnNilArg0: true,
+ },
+
+ // large or unaligned move
+ // arg0 = address of dst memory (in R2, changed as side effect)
+ // arg1 = address of src memory (in R1, changed as side effect)
+ // arg2 = address of the last element of src
+ // arg3 = mem
+ // auxint = alignment
+ // returns mem
+ // SUBU $4, R1
+ // MOVW 4(R1), Rtmp
+ // MOVW Rtmp, (R2)
+ // ADDU $4, R1
+ // ADDU $4, R2
+ // BNE Rarg2, R1, -4(PC)
+ {
+ name: "LoweredMove",
+ aux: "Int32",
+ argLength: 4,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R2"), buildReg("R1"), gp},
+ clobbers: buildReg("R1 R2"),
+ },
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // pseudo-ops
+ {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpg}}, nilCheck: true, faultOnNilArg0: true}, // panic if arg0 is nil. arg1=mem.
+
+ {name: "FPFlagTrue", argLength: 1, reg: readflags}, // bool, true if FP flag is true
+ {name: "FPFlagFalse", argLength: 1, reg: readflags}, // bool, true if FP flag is false
+
+ // Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
+ // and sorts it to the very beginning of the block to prevent other
+ // use of R22 (mips.REGCTXT, the closure pointer)
+ {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("R22")}}, zeroWidth: true},
+
+ // LoweredGetCallerSP returns the SP of the caller of the current function.
+ {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true},
+
+ // LoweredGetCallerPC evaluates to the PC to which its "caller" will return.
+ // I.e., if f calls g, and g calls getcallerpc,
+ // the result should be the PC within f that g will return to.
+ // See runtime/stubs.go for a more detailed discussion.
+ {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true},
+
+ // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+ // It saves all GP registers if necessary,
+ // but clobbers R31 (LR) because it's a call
+ // and R23 (REGTMP).
+ {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("R20"), buildReg("R21")}, clobbers: (callerSave &^ gpg) | buildReg("R31")}, clobberFlags: true, aux: "Sym", symEffect: "None"},
+
+ // There are three of these functions so that they can have three different register inputs.
+ // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
+ // default registers to match so we don't need to copy registers around unnecessarily.
+ {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r3, r4}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ // Extend ops are the same as Bounds ops except the indexes are 64-bit.
+ {name: "LoweredPanicExtendA", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r5, r3, r4}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
+ {name: "LoweredPanicExtendB", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r5, r2, r3}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
+ {name: "LoweredPanicExtendC", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r5, r1, r2}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
+ }
+
+ blocks := []blockData{
+ {name: "EQ", controls: 1},
+ {name: "NE", controls: 1},
+ {name: "LTZ", controls: 1}, // < 0
+ {name: "LEZ", controls: 1}, // <= 0
+ {name: "GTZ", controls: 1}, // > 0
+ {name: "GEZ", controls: 1}, // >= 0
+ {name: "FPT", controls: 1}, // FP flag is true
+ {name: "FPF", controls: 1}, // FP flag is false
+ }
+
+ archs = append(archs, arch{
+ name: "MIPS",
+ pkg: "cmd/internal/obj/mips",
+ genfile: "../../mips/ssa.go",
+ ops: ops,
+ blocks: blocks,
+ regnames: regNamesMIPS,
+ gpregmask: gp,
+ fpregmask: fp,
+ specialregmask: hi | lo,
+ framepointerreg: -1, // not used
+ linkreg: int8(num["R31"]),
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/gen/PPC64.rules b/src/cmd/compile/internal/ssa/gen/PPC64.rules
new file mode 100644
index 0000000..c064046
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/PPC64.rules
@@ -0,0 +1,1461 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Lowering arithmetic
+(Add(Ptr|64|32|16|8) ...) => (ADD ...)
+(Add64F ...) => (FADD ...)
+(Add32F ...) => (FADDS ...)
+
+(Sub(Ptr|64|32|16|8) ...) => (SUB ...)
+(Sub32F ...) => (FSUBS ...)
+(Sub64F ...) => (FSUB ...)
+
+// Combine 64 bit integer multiply and adds
+(ADD l:(MULLD x y) z) && objabi.GOPPC64 >= 9 && l.Uses == 1 && clobber(l) => (MADDLD x y z)
+
+(Mod16 x y) => (Mod32 (SignExt16to32 x) (SignExt16to32 y))
+(Mod16u x y) => (Mod32u (ZeroExt16to32 x) (ZeroExt16to32 y))
+(Mod8 x y) => (Mod32 (SignExt8to32 x) (SignExt8to32 y))
+(Mod8u x y) => (Mod32u (ZeroExt8to32 x) (ZeroExt8to32 y))
+(Mod64 x y) && objabi.GOPPC64 >= 9 => (MODSD x y)
+(Mod64 x y) && objabi.GOPPC64 <= 8 => (SUB x (MULLD y (DIVD x y)))
+(Mod64u x y) && objabi.GOPPC64 >= 9 => (MODUD x y)
+(Mod64u x y) && objabi.GOPPC64 <= 8 => (SUB x (MULLD y (DIVDU x y)))
+(Mod32 x y) && objabi.GOPPC64 >= 9 => (MODSW x y)
+(Mod32 x y) && objabi.GOPPC64 <= 8 => (SUB x (MULLW y (DIVW x y)))
+(Mod32u x y) && objabi.GOPPC64 >= 9 => (MODUW x y)
+(Mod32u x y) && objabi.GOPPC64 <= 8 => (SUB x (MULLW y (DIVWU x y)))
+
+// (x + y) / 2 with x>=y => (x - y) / 2 + y
+(Avg64u <t> x y) => (ADD (SRDconst <t> (SUB <t> x y) [1]) y)
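+// Editorial sketch (not in the original rules): when x >= y, x-y cannot
+// underflow and (x + y)/2 == y + (x - y)/2, so the possibly-overflowing sum
+// x+y is never formed. E.g. x = 2^64-2, y = 2^64-4: y + (x-y)/2 = 2^64-3,
+// the correct unsigned average.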
+
+(Add64carry ...) => (LoweredAdd64Carry ...)
+(Mul64 ...) => (MULLD ...)
+(Mul(32|16|8) ...) => (MULLW ...)
+(Mul64uhilo ...) => (LoweredMuluhilo ...)
+
+(Div64 [false] x y) => (DIVD x y)
+(Div64u ...) => (DIVDU ...)
+(Div32 [false] x y) => (DIVW x y)
+(Div32u ...) => (DIVWU ...)
+(Div16 [false] x y) => (DIVW (SignExt16to32 x) (SignExt16to32 y))
+(Div16u x y) => (DIVWU (ZeroExt16to32 x) (ZeroExt16to32 y))
+(Div8 x y) => (DIVW (SignExt8to32 x) (SignExt8to32 y))
+(Div8u x y) => (DIVWU (ZeroExt8to32 x) (ZeroExt8to32 y))
+
+(Hmul(64|64u|32|32u) ...) => (MULH(D|DU|W|WU) ...)
+
+(Mul32F ...) => (FMULS ...)
+(Mul64F ...) => (FMUL ...)
+
+(Div32F ...) => (FDIVS ...)
+(Div64F ...) => (FDIV ...)
+
+// Lowering float <=> int
+(Cvt32to32F x) => (FCFIDS (MTVSRD (SignExt32to64 x)))
+(Cvt32to64F x) => (FCFID (MTVSRD (SignExt32to64 x)))
+(Cvt64to32F x) => (FCFIDS (MTVSRD x))
+(Cvt64to64F x) => (FCFID (MTVSRD x))
+
+(Cvt32Fto32 x) => (MFVSRD (FCTIWZ x))
+(Cvt32Fto64 x) => (MFVSRD (FCTIDZ x))
+(Cvt64Fto32 x) => (MFVSRD (FCTIWZ x))
+(Cvt64Fto64 x) => (MFVSRD (FCTIDZ x))
+
+(Cvt32Fto64F ...) => (Copy ...) // Note v will have the wrong type for patterns dependent on Float32/Float64
+(Cvt64Fto32F ...) => (FRSP ...)
+
+(CvtBoolToUint8 ...) => (Copy ...)
+
+(Round(32|64)F ...) => (LoweredRound(32|64)F ...)
+
+(Sqrt ...) => (FSQRT ...)
+(Floor ...) => (FFLOOR ...)
+(Ceil ...) => (FCEIL ...)
+(Trunc ...) => (FTRUNC ...)
+(Round ...) => (FROUND ...)
+(Copysign x y) => (FCPSGN y x)
+(Abs ...) => (FABS ...)
+(FMA ...) => (FMADD ...)
+
+// Lowering extension
+// Note: we always extend to 64 bits even though some ops don't need that many result bits.
+(SignExt8to(16|32|64) ...) => (MOVBreg ...)
+(SignExt16to(32|64) ...) => (MOVHreg ...)
+(SignExt32to64 ...) => (MOVWreg ...)
+
+(ZeroExt8to(16|32|64) ...) => (MOVBZreg ...)
+(ZeroExt16to(32|64) ...) => (MOVHZreg ...)
+(ZeroExt32to64 ...) => (MOVWZreg ...)
+
+(Trunc(16|32|64)to8 <t> x) && isSigned(t) => (MOVBreg x)
+(Trunc(16|32|64)to8 x) => (MOVBZreg x)
+(Trunc(32|64)to16 <t> x) && isSigned(t) => (MOVHreg x)
+(Trunc(32|64)to16 x) => (MOVHZreg x)
+(Trunc64to32 <t> x) && isSigned(t) => (MOVWreg x)
+(Trunc64to32 x) => (MOVWZreg x)
+
+// Lowering constants
+(Const(64|32|16|8) [val]) => (MOVDconst [int64(val)])
+(Const(32|64)F ...) => (FMOV(S|D)const ...)
+(ConstNil) => (MOVDconst [0])
+(ConstBool [b]) => (MOVDconst [b2i(b)])
+
+// Constant folding
+(FABS (FMOVDconst [x])) => (FMOVDconst [math.Abs(x)])
+(FSQRT (FMOVDconst [x])) && x >= 0 => (FMOVDconst [math.Sqrt(x)])
+(FFLOOR (FMOVDconst [x])) => (FMOVDconst [math.Floor(x)])
+(FCEIL (FMOVDconst [x])) => (FMOVDconst [math.Ceil(x)])
+(FTRUNC (FMOVDconst [x])) => (FMOVDconst [math.Trunc(x)])
+
+// Rotates
+(RotateLeft8 <t> x (MOVDconst [c])) => (Or8 (Lsh8x64 <t> x (MOVDconst [c&7])) (Rsh8Ux64 <t> x (MOVDconst [-c&7])))
+(RotateLeft16 <t> x (MOVDconst [c])) => (Or16 (Lsh16x64 <t> x (MOVDconst [c&15])) (Rsh16Ux64 <t> x (MOVDconst [-c&15])))
+(RotateLeft32 x (MOVDconst [c])) => (ROTLWconst [c&31] x)
+(RotateLeft64 x (MOVDconst [c])) => (ROTLconst [c&63] x)
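+// Editorial sketch (assumed equivalence, not in the original rules): the 8-bit
+// case above is the standard two-shift expansion
+//	func rotl8(x uint8, c uint) uint8 { return x<<(c&7) | x>>(-c&7) }
+// which matches math/bits.RotateLeft8(x, int(c)); the 16-bit case is analogous,
+// while 32/64-bit constant rotates map directly to ROTLWconst/ROTLconst.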
+
+// Rotate generation with const shift
+(ADD (SLDconst x [c]) (SRDconst x [d])) && d == 64-c => (ROTLconst [c] x)
+( OR (SLDconst x [c]) (SRDconst x [d])) && d == 64-c => (ROTLconst [c] x)
+(XOR (SLDconst x [c]) (SRDconst x [d])) && d == 64-c => (ROTLconst [c] x)
+
+(ADD (SLWconst x [c]) (SRWconst x [d])) && d == 32-c => (ROTLWconst [c] x)
+( OR (SLWconst x [c]) (SRWconst x [d])) && d == 32-c => (ROTLWconst [c] x)
+(XOR (SLWconst x [c]) (SRWconst x [d])) && d == 32-c => (ROTLWconst [c] x)
+
+// Rotate generation with non-const shift
+// These match patterns from math/bits.RotateLeft(32|64), but there could be others (see the sketch after this group).
+(ADD (SLD x (ANDconst <typ.Int64> [63] y)) (SRD x (SUB <typ.UInt> (MOVDconst [64]) (ANDconst <typ.UInt> [63] y)))) => (ROTL x y)
+(ADD (SLD x (ANDconst <typ.Int64> [63] y)) (SRD x (SUBFCconst <typ.UInt> [64] (ANDconst <typ.UInt> [63] y)))) => (ROTL x y)
+( OR (SLD x (ANDconst <typ.Int64> [63] y)) (SRD x (SUB <typ.UInt> (MOVDconst [64]) (ANDconst <typ.UInt> [63] y)))) => (ROTL x y)
+( OR (SLD x (ANDconst <typ.Int64> [63] y)) (SRD x (SUBFCconst <typ.UInt> [64] (ANDconst <typ.UInt> [63] y)))) => (ROTL x y)
+(XOR (SLD x (ANDconst <typ.Int64> [63] y)) (SRD x (SUB <typ.UInt> (MOVDconst [64]) (ANDconst <typ.UInt> [63] y)))) => (ROTL x y)
+(XOR (SLD x (ANDconst <typ.Int64> [63] y)) (SRD x (SUBFCconst <typ.UInt> [64] (ANDconst <typ.UInt> [63] y)))) => (ROTL x y)
+
+
+(ADD (SLW x (ANDconst <typ.Int32> [31] y)) (SRW x (SUBFCconst <typ.UInt> [32] (ANDconst <typ.UInt> [31] y)))) => (ROTLW x y)
+(ADD (SLW x (ANDconst <typ.Int32> [31] y)) (SRW x (SUB <typ.UInt> (MOVDconst [32]) (ANDconst <typ.UInt> [31] y)))) => (ROTLW x y)
+( OR (SLW x (ANDconst <typ.Int32> [31] y)) (SRW x (SUBFCconst <typ.UInt> [32] (ANDconst <typ.UInt> [31] y)))) => (ROTLW x y)
+( OR (SLW x (ANDconst <typ.Int32> [31] y)) (SRW x (SUB <typ.UInt> (MOVDconst [32]) (ANDconst <typ.UInt> [31] y)))) => (ROTLW x y)
+(XOR (SLW x (ANDconst <typ.Int32> [31] y)) (SRW x (SUBFCconst <typ.UInt> [32] (ANDconst <typ.UInt> [31] y)))) => (ROTLW x y)
+(XOR (SLW x (ANDconst <typ.Int32> [31] y)) (SRW x (SUB <typ.UInt> (MOVDconst [32]) (ANDconst <typ.UInt> [31] y)))) => (ROTLW x y)
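+// Editorial sketch (assumption about the originating Go source, mirroring
+// math/bits.RotateLeft64): the SLD/SRD and SLW/SRW pairs above come from code
+// shaped roughly like
+//	func rotl64(x uint64, k int) uint64 {
+//		s := uint(k) & 63
+//		return x<<s | x>>(64-s) // for s == 0 the right shift by 64 yields 0 in Go
+//	}
+// and the rules collapse the whole expression into a single ROTL/ROTLW.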
+
+
+// Lowering rotates
+(RotateLeft32 x y) => (ROTLW x y)
+(RotateLeft64 x y) => (ROTL x y)
+
+// Constant rotate generation
+(ROTLW x (MOVDconst [c])) => (ROTLWconst x [c&31])
+(ROTL x (MOVDconst [c])) => (ROTLconst x [c&63])
+
+// Combine rotate and mask operations
+(ANDconst [m] (ROTLWconst [r] x)) && isPPC64WordRotateMask(m) => (RLWINM [encodePPC64RotateMask(r,m,32)] x)
+(AND (MOVDconst [m]) (ROTLWconst [r] x)) && isPPC64WordRotateMask(m) => (RLWINM [encodePPC64RotateMask(r,m,32)] x)
+(ANDconst [m] (ROTLW x r)) && isPPC64WordRotateMask(m) => (RLWNM [encodePPC64RotateMask(0,m,32)] x r)
+(AND (MOVDconst [m]) (ROTLW x r)) && isPPC64WordRotateMask(m) => (RLWNM [encodePPC64RotateMask(0,m,32)] x r)
+
+// Note, any rotated word bitmask is still a valid word bitmask.
+(ROTLWconst [r] (AND (MOVDconst [m]) x)) && isPPC64WordRotateMask(m) => (RLWINM [encodePPC64RotateMask(r,rotateLeft32(m,r),32)] x)
+(ROTLWconst [r] (ANDconst [m] x)) && isPPC64WordRotateMask(m) => (RLWINM [encodePPC64RotateMask(r,rotateLeft32(m,r),32)] x)
+
+(ANDconst [m] (SRWconst x [s])) && mergePPC64RShiftMask(m,s,32) == 0 => (MOVDconst [0])
+(ANDconst [m] (SRWconst x [s])) && mergePPC64AndSrwi(m,s) != 0 => (RLWINM [mergePPC64AndSrwi(m,s)] x)
+(AND (MOVDconst [m]) (SRWconst x [s])) && mergePPC64RShiftMask(m,s,32) == 0 => (MOVDconst [0])
+(AND (MOVDconst [m]) (SRWconst x [s])) && mergePPC64AndSrwi(m,s) != 0 => (RLWINM [mergePPC64AndSrwi(m,s)] x)
+
+(SRWconst (ANDconst [m] x) [s]) && mergePPC64RShiftMask(m>>uint(s),s,32) == 0 => (MOVDconst [0])
+(SRWconst (ANDconst [m] x) [s]) && mergePPC64AndSrwi(m>>uint(s),s) != 0 => (RLWINM [mergePPC64AndSrwi(m>>uint(s),s)] x)
+(SRWconst (AND (MOVDconst [m]) x) [s]) && mergePPC64RShiftMask(m>>uint(s),s,32) == 0 => (MOVDconst [0])
+(SRWconst (AND (MOVDconst [m]) x) [s]) && mergePPC64AndSrwi(m>>uint(s),s) != 0 => (RLWINM [mergePPC64AndSrwi(m>>uint(s),s)] x)
+
+// Merge shift right + shift left and clear left (e.g. for a table lookup)
+(CLRLSLDI [c] (SRWconst [s] x)) && mergePPC64ClrlsldiSrw(int64(c),s) != 0 => (RLWINM [mergePPC64ClrlsldiSrw(int64(c),s)] x)
+(SLDconst [l] (SRWconst [r] x)) && mergePPC64SldiSrw(l,r) != 0 => (RLWINM [mergePPC64SldiSrw(l,r)] x)
+// The following reduction also shows up frequently, e.g. in b[(x>>14)&0xFF] (see the sketch below).
+(CLRLSLDI [c] i:(RLWINM [s] x)) && mergePPC64ClrlsldiRlwinm(c,s) != 0 => (RLWINM [mergePPC64ClrlsldiRlwinm(c,s)] x)
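+// Editorial sketch of the shape referred to above (hypothetical helper, not
+// from the source tree):
+//	func lookup(tab *[256]byte, x uint64) byte { return tab[(x>>14)&0xFF] }
+// The shift-and-mask index computation is the kind of pattern these rules fold
+// into a single rotate-and-mask (RLWINM) form.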
+
+// large constant shifts
+(Lsh64x64 _ (MOVDconst [c])) && uint64(c) >= 64 => (MOVDconst [0])
+(Rsh64Ux64 _ (MOVDconst [c])) && uint64(c) >= 64 => (MOVDconst [0])
+(Lsh32x64 _ (MOVDconst [c])) && uint64(c) >= 32 => (MOVDconst [0])
+(Rsh32Ux64 _ (MOVDconst [c])) && uint64(c) >= 32 => (MOVDconst [0])
+(Lsh16x64 _ (MOVDconst [c])) && uint64(c) >= 16 => (MOVDconst [0])
+(Rsh16Ux64 _ (MOVDconst [c])) && uint64(c) >= 16 => (MOVDconst [0])
+(Lsh8x64 _ (MOVDconst [c])) && uint64(c) >= 8 => (MOVDconst [0])
+(Rsh8Ux64 _ (MOVDconst [c])) && uint64(c) >= 8 => (MOVDconst [0])
+
+// large constant signed right shift, we leave the sign bit
+(Rsh64x64 x (MOVDconst [c])) && uint64(c) >= 64 => (SRADconst x [63])
+(Rsh32x64 x (MOVDconst [c])) && uint64(c) >= 32 => (SRAWconst x [63])
+(Rsh16x64 x (MOVDconst [c])) && uint64(c) >= 16 => (SRAWconst (SignExt16to32 x) [63])
+(Rsh8x64 x (MOVDconst [c])) && uint64(c) >= 8 => (SRAWconst (SignExt8to32 x) [63])
+
+// constant shifts
+(Lsh64x64 x (MOVDconst [c])) && uint64(c) < 64 => (SLDconst x [c])
+(Rsh64x64 x (MOVDconst [c])) && uint64(c) < 64 => (SRADconst x [c])
+(Rsh64Ux64 x (MOVDconst [c])) && uint64(c) < 64 => (SRDconst x [c])
+(Lsh32x64 x (MOVDconst [c])) && uint64(c) < 32 => (SLWconst x [c])
+(Rsh32x64 x (MOVDconst [c])) && uint64(c) < 32 => (SRAWconst x [c])
+(Rsh32Ux64 x (MOVDconst [c])) && uint64(c) < 32 => (SRWconst x [c])
+(Lsh16x64 x (MOVDconst [c])) && uint64(c) < 16 => (SLWconst x [c])
+(Rsh16x64 x (MOVDconst [c])) && uint64(c) < 16 => (SRAWconst (SignExt16to32 x) [c])
+(Rsh16Ux64 x (MOVDconst [c])) && uint64(c) < 16 => (SRWconst (ZeroExt16to32 x) [c])
+(Lsh8x64 x (MOVDconst [c])) && uint64(c) < 8 => (SLWconst x [c])
+(Rsh8x64 x (MOVDconst [c])) && uint64(c) < 8 => (SRAWconst (SignExt8to32 x) [c])
+(Rsh8Ux64 x (MOVDconst [c])) && uint64(c) < 8 => (SRWconst (ZeroExt8to32 x) [c])
+
+(Lsh64x32 x (MOVDconst [c])) && uint32(c) < 64 => (SLDconst x [c&63])
+(Rsh64x32 x (MOVDconst [c])) && uint32(c) < 64 => (SRADconst x [c&63])
+(Rsh64Ux32 x (MOVDconst [c])) && uint32(c) < 64 => (SRDconst x [c&63])
+(Lsh32x32 x (MOVDconst [c])) && uint32(c) < 32 => (SLWconst x [c&31])
+(Rsh32x32 x (MOVDconst [c])) && uint32(c) < 32 => (SRAWconst x [c&31])
+(Rsh32Ux32 x (MOVDconst [c])) && uint32(c) < 32 => (SRWconst x [c&31])
+(Lsh16x32 x (MOVDconst [c])) && uint32(c) < 16 => (SLWconst x [c&31])
+(Rsh16x32 x (MOVDconst [c])) && uint32(c) < 16 => (SRAWconst (SignExt16to32 x) [c&15])
+(Rsh16Ux32 x (MOVDconst [c])) && uint32(c) < 16 => (SRWconst (ZeroExt16to32 x) [c&15])
+(Lsh8x32 x (MOVDconst [c])) && uint32(c) < 8 => (SLWconst x [c&7])
+(Rsh8x32 x (MOVDconst [c])) && uint32(c) < 8 => (SRAWconst (SignExt8to32 x) [c&7])
+(Rsh8Ux32 x (MOVDconst [c])) && uint32(c) < 8 => (SRWconst (ZeroExt8to32 x) [c&7])
+
+// Lower bounded shifts first. No need to check shift value.
+(Lsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SLD x y)
+(Lsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y)
+(Lsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y)
+(Lsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y)
+(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRD x y)
+(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW x y)
+(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW (MOVHZreg x) y)
+(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW (MOVBZreg x) y)
+(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAD x y)
+(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW x y)
+(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW (MOVHreg x) y)
+(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW (MOVBreg x) y)
+
+// non-constant rotates
+// These are subexpressions found in statements that can become rotates.
+// In these cases the shift count is known to be < 64, so the more complicated
+// expressions with mask and carry are not needed.
+(Lsh64x64 x (AND y (MOVDconst [63]))) => (SLD x (ANDconst <typ.Int64> [63] y))
+(Lsh64x64 x (ANDconst <typ.Int64> [63] y)) => (SLD x (ANDconst <typ.Int64> [63] y))
+(Rsh64Ux64 x (AND y (MOVDconst [63]))) => (SRD x (ANDconst <typ.Int64> [63] y))
+(Rsh64Ux64 x (ANDconst <typ.UInt> [63] y)) => (SRD x (ANDconst <typ.UInt> [63] y))
+(Rsh64Ux64 x (SUB <typ.UInt> (MOVDconst [64]) (ANDconst <typ.UInt> [63] y))) => (SRD x (SUB <typ.UInt> (MOVDconst [64]) (ANDconst <typ.UInt> [63] y)))
+(Rsh64Ux64 x (SUBFCconst <typ.UInt> [64] (ANDconst <typ.UInt> [63] y))) => (SRD x (SUBFCconst <typ.UInt> [64] (ANDconst <typ.UInt> [63] y)))
+(Rsh64Ux64 x (SUB <typ.UInt> (MOVDconst [64]) (AND <typ.UInt> y (MOVDconst [63])))) => (SRD x (SUB <typ.UInt> (MOVDconst [64]) (ANDconst <typ.UInt> [63] y)))
+(Rsh64Ux64 x (SUBFCconst <typ.UInt> [64] (AND <typ.UInt> y (MOVDconst [63])))) => (SRD x (SUBFCconst <typ.UInt> [64] (ANDconst <typ.UInt> [63] y)))
+(Rsh64x64 x (AND y (MOVDconst [63]))) => (SRAD x (ANDconst <typ.Int64> [63] y))
+(Rsh64x64 x (ANDconst <typ.UInt> [63] y)) => (SRAD x (ANDconst <typ.UInt> [63] y))
+(Rsh64x64 x (SUB <typ.UInt> (MOVDconst [64]) (ANDconst <typ.UInt> [63] y))) => (SRAD x (SUB <typ.UInt> (MOVDconst [64]) (ANDconst <typ.UInt> [63] y)))
+(Rsh64x64 x (SUBFCconst <typ.UInt> [64] (ANDconst <typ.UInt> [63] y))) => (SRAD x (SUBFCconst <typ.UInt> [64] (ANDconst <typ.UInt> [63] y)))
+(Rsh64x64 x (SUB <typ.UInt> (MOVDconst [64]) (AND <typ.UInt> y (MOVDconst [63])))) => (SRAD x (SUB <typ.UInt> (MOVDconst [64]) (ANDconst <typ.UInt> [63] y)))
+(Rsh64x64 x (SUBFCconst <typ.UInt> [64] (AND <typ.UInt> y (MOVDconst [63])))) => (SRAD x (SUBFCconst <typ.UInt> [64] (ANDconst <typ.UInt> [63] y)))
+
+(Lsh64x64 x y) => (SLD x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [64]))))
+(Rsh64x64 x y) => (SRAD x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [64]))))
+(Rsh64Ux64 x y) => (SRD x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [64]))))
+
+(Lsh32x64 x (AND y (MOVDconst [31]))) => (SLW x (ANDconst <typ.Int32> [31] y))
+(Lsh32x64 x (ANDconst <typ.Int32> [31] y)) => (SLW x (ANDconst <typ.Int32> [31] y))
+
+(Rsh32Ux64 x (AND y (MOVDconst [31]))) => (SRW x (ANDconst <typ.Int32> [31] y))
+(Rsh32Ux64 x (ANDconst <typ.UInt> [31] y)) => (SRW x (ANDconst <typ.UInt> [31] y))
+(Rsh32Ux64 x (SUB <typ.UInt> (MOVDconst [32]) (ANDconst <typ.UInt> [31] y))) => (SRW x (SUB <typ.UInt> (MOVDconst [32]) (ANDconst <typ.UInt> [31] y)))
+(Rsh32Ux64 x (SUBFCconst <typ.UInt> [32] (ANDconst <typ.UInt> [31] y))) => (SRW x (SUBFCconst <typ.UInt> [32] (ANDconst <typ.UInt> [31] y)))
+(Rsh32Ux64 x (SUB <typ.UInt> (MOVDconst [32]) (AND <typ.UInt> y (MOVDconst [31])))) => (SRW x (SUB <typ.UInt> (MOVDconst [32]) (ANDconst <typ.UInt> [31] y)))
+(Rsh32Ux64 x (SUBFCconst <typ.UInt> [32] (AND <typ.UInt> y (MOVDconst [31])))) => (SRW x (SUBFCconst <typ.UInt> [32] (ANDconst <typ.UInt> [31] y)))
+
+(Rsh32x64 x (AND y (MOVDconst [31]))) => (SRAW x (ANDconst <typ.Int32> [31] y))
+(Rsh32x64 x (ANDconst <typ.UInt> [31] y)) => (SRAW x (ANDconst <typ.UInt> [31] y))
+(Rsh32x64 x (SUB <typ.UInt> (MOVDconst [32]) (ANDconst <typ.UInt> [31] y))) => (SRAW x (SUB <typ.UInt> (MOVDconst [32]) (ANDconst <typ.UInt> [31] y)))
+(Rsh32x64 x (SUBFCconst <typ.UInt> [32] (ANDconst <typ.UInt> [31] y))) => (SRAW x (SUBFCconst <typ.UInt> [32] (ANDconst <typ.UInt> [31] y)))
+(Rsh32x64 x (SUB <typ.UInt> (MOVDconst [32]) (AND <typ.UInt> y (MOVDconst [31])))) => (SRAW x (SUB <typ.UInt> (MOVDconst [32]) (ANDconst <typ.UInt> [31] y)))
+(Rsh32x64 x (SUBFCconst <typ.UInt> [32] (AND <typ.UInt> y (MOVDconst [31])))) => (SRAW x (SUBFCconst <typ.UInt> [32] (ANDconst <typ.UInt> [31] y)))
+
+(Rsh32x64 x y) => (SRAW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [32]))))
+(Rsh32Ux64 x y) => (SRW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [32]))))
+(Lsh32x64 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [32]))))
+
+(Rsh16x64 x y) => (SRAW (SignExt16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16]))))
+(Rsh16Ux64 x y) => (SRW (ZeroExt16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16]))))
+(Lsh16x64 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16]))))
+
+(Rsh8x64 x y) => (SRAW (SignExt8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8]))))
+(Rsh8Ux64 x y) => (SRW (ZeroExt8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8]))))
+(Lsh8x64 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8]))))
+
+(Rsh64x32 x y) => (SRAD x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [64]))))
+(Rsh64Ux32 x y) => (SRD x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [64]))))
+(Lsh64x32 x y) => (SLD x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [64]))))
+(Rsh32x32 x y) => (SRAW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [32]))))
+(Rsh32Ux32 x y) => (SRW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [32]))))
+(Lsh32x32 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [32]))))
+
+(Rsh16x32 x y) => (SRAW (SignExt16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16]))))
+(Rsh16Ux32 x y) => (SRW (ZeroExt16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16]))))
+(Lsh16x32 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16]))))
+
+(Rsh8x32 x y) => (SRAW (SignExt8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8]))))
+(Rsh8Ux32 x y) => (SRW (ZeroExt8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8]))))
+(Lsh8x32 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8]))))
+
+
+(Rsh64x16 x y) => (SRAD x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [64]))))
+(Rsh64Ux16 x y) => (SRD x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [64]))))
+(Lsh64x16 x y) => (SLD x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [64]))))
+
+(Rsh32x16 x y) => (SRAW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [32]))))
+(Rsh32Ux16 x y) => (SRW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [32]))))
+(Lsh32x16 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [32]))))
+
+(Rsh16x16 x y) => (SRAW (SignExt16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [16]))))
+(Rsh16Ux16 x y) => (SRW (ZeroExt16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [16]))))
+(Lsh16x16 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [16]))))
+
+(Rsh8x16 x y) => (SRAW (SignExt8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [8]))))
+(Rsh8Ux16 x y) => (SRW (ZeroExt8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [8]))))
+(Lsh8x16 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [8]))))
+
+
+(Rsh64x8 x y) => (SRAD x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [64]))))
+(Rsh64Ux8 x y) => (SRD x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [64]))))
+(Lsh64x8 x y) => (SLD x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [64]))))
+
+(Rsh32x8 x y) => (SRAW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [32]))))
+(Rsh32Ux8 x y) => (SRW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [32]))))
+(Lsh32x8 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [32]))))
+
+(Rsh16x8 x y) => (SRAW (SignExt16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [16]))))
+(Rsh16Ux8 x y) => (SRW (ZeroExt16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [16]))))
+(Lsh16x8 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [16]))))
+
+(Rsh8x8 x y) => (SRAW (SignExt8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [8]))))
+(Rsh8Ux8 x y) => (SRW (ZeroExt8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [8]))))
+(Lsh8x8 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [8]))))
+
+// Cleaning up shift ops
+(ISEL [0] (ANDconst [d] y) (MOVDconst [-1]) (CMPU (ANDconst [d] y) (MOVDconst [c]))) && c >= d => (ANDconst [d] y)
+(ISEL [0] (ANDconst [d] y) (MOVDconst [-1]) (CMPUconst [c] (ANDconst [d] y))) && c >= d => (ANDconst [d] y)
+(ORN x (MOVDconst [-1])) => x
+
+(S(RAD|RD|LD) x (MOVDconst [c])) => (S(RAD|RD|LD)const [c&63 | (c>>6&1*63)] x)
+(S(RAW|RW|LW) x (MOVDconst [c])) => (S(RAW|RW|LW)const [c&31 | (c>>5&1*31)] x)
+
+(Addr {sym} base) => (MOVDaddr {sym} [0] base)
+(LocalAddr {sym} base _) => (MOVDaddr {sym} base)
+(OffPtr [off] ptr) => (ADD (MOVDconst <typ.Int64> [off]) ptr)
+
+// TODO: optimize these cases?
+(Ctz32NonZero ...) => (Ctz32 ...)
+(Ctz64NonZero ...) => (Ctz64 ...)
+
+(Ctz64 x) && objabi.GOPPC64<=8 => (POPCNTD (ANDN <typ.Int64> (ADDconst <typ.Int64> [-1] x) x))
+(Ctz64 x) => (CNTTZD x)
+(Ctz32 x) && objabi.GOPPC64<=8 => (POPCNTW (MOVWZreg (ANDN <typ.Int> (ADDconst <typ.Int> [-1] x) x)))
+(Ctz32 x) => (CNTTZW (MOVWZreg x))
+(Ctz16 x) => (POPCNTW (MOVHZreg (ANDN <typ.Int16> (ADDconst <typ.Int16> [-1] x) x)))
+(Ctz8 x) => (POPCNTB (MOVBZreg (ANDN <typ.UInt8> (ADDconst <typ.UInt8> [-1] x) x)))
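+// Editorial sketch of the identity used above (not in the original rules):
+// ANDN (x-1) x computes (x-1) &^ x, which has exactly the trailing-zero bits
+// of x set, so its population count is Ctz(x). E.g. x = 0b1000: x-1 = 0b0111,
+// (x-1)&^x = 0b0111, popcount = 3 = Ctz(x). For x == 0 the mask is all ones,
+// giving the full word width, as required.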
+
+(BitLen64 x) => (SUBFCconst [64] (CNTLZD <typ.Int> x))
+(BitLen32 x) => (SUBFCconst [32] (CNTLZW <typ.Int> x))
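+// Editorial sketch (not in the original rules): BitLen(x) == width - clz(x),
+// which is exactly what SUBFCconst [width] (CNTLZ x) computes. E.g. for 64 bits,
+// x == 1 has 63 leading zeros, so BitLen == 1; x == 0 has 64, so BitLen == 0.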
+
+(PopCount64 ...) => (POPCNTD ...)
+(PopCount32 x) => (POPCNTW (MOVWZreg x))
+(PopCount16 x) => (POPCNTW (MOVHZreg x))
+(PopCount8 x) => (POPCNTB (MOVBZreg x))
+
+(And(64|32|16|8) ...) => (AND ...)
+(Or(64|32|16|8) ...) => (OR ...)
+(Xor(64|32|16|8) ...) => (XOR ...)
+
+(Neg(64|32|16|8) ...) => (NEG ...)
+(Neg64F ...) => (FNEG ...)
+(Neg32F ...) => (FNEG ...)
+
+(Com(64|32|16|8) x) => (NOR x x)
+
+// Lowering boolean ops
+(AndB ...) => (AND ...)
+(OrB ...) => (OR ...)
+(Not x) => (XORconst [1] x)
+
+// Use ANDN for AND x NOT y: (NOR y y) computes ^y, so x &^ y lowers to a single ANDN.
+(AND x (NOR y y)) => (ANDN x y)
+
+// Lowering comparisons
+(EqB x y) => (ANDconst [1] (EQV x y))
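+// Editorial note (reasoning sketch, not in the original rules): EQV computes
+// ^(x^y); for boolean operands (0 or 1) its low bit is 1 exactly when x == y,
+// e.g. ^(1^1)&1 == 1 and ^(0^1)&1 == 0, and ANDconst [1] keeps just that bit.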
+// Extending according to the operands' signedness sets up later sign/zero-extension elision.
+(Eq8 x y) && isSigned(x.Type) && isSigned(y.Type) => (Equal (CMPW (SignExt8to32 x) (SignExt8to32 y)))
+(Eq16 x y) && isSigned(x.Type) && isSigned(y.Type) => (Equal (CMPW (SignExt16to32 x) (SignExt16to32 y)))
+(Eq8 x y) => (Equal (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Eq16 x y) => (Equal (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Eq32 x y) => (Equal (CMPW x y))
+(Eq64 x y) => (Equal (CMP x y))
+(Eq32F x y) => (Equal (FCMPU x y))
+(Eq64F x y) => (Equal (FCMPU x y))
+(EqPtr x y) => (Equal (CMP x y))
+
+(NeqB ...) => (XOR ...)
+// Like Eq8 and Eq16, prefer sign extension when it is likely to enable later elision.
+(Neq8 x y) && isSigned(x.Type) && isSigned(y.Type) => (NotEqual (CMPW (SignExt8to32 x) (SignExt8to32 y)))
+(Neq16 x y) && isSigned(x.Type) && isSigned(y.Type) => (NotEqual (CMPW (SignExt16to32 x) (SignExt16to32 y)))
+(Neq8 x y) => (NotEqual (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Neq16 x y) => (NotEqual (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Neq32 x y) => (NotEqual (CMPW x y))
+(Neq64 x y) => (NotEqual (CMP x y))
+(Neq32F x y) => (NotEqual (FCMPU x y))
+(Neq64F x y) => (NotEqual (FCMPU x y))
+(NeqPtr x y) => (NotEqual (CMP x y))
+
+(Less8 x y) => (LessThan (CMPW (SignExt8to32 x) (SignExt8to32 y)))
+(Less16 x y) => (LessThan (CMPW (SignExt16to32 x) (SignExt16to32 y)))
+(Less32 x y) => (LessThan (CMPW x y))
+(Less64 x y) => (LessThan (CMP x y))
+(Less32F x y) => (FLessThan (FCMPU x y))
+(Less64F x y) => (FLessThan (FCMPU x y))
+
+(Less8U x y) => (LessThan (CMPWU (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Less16U x y) => (LessThan (CMPWU (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Less32U x y) => (LessThan (CMPWU x y))
+(Less64U x y) => (LessThan (CMPU x y))
+
+(Leq8 x y) => (LessEqual (CMPW (SignExt8to32 x) (SignExt8to32 y)))
+(Leq16 x y) => (LessEqual (CMPW (SignExt16to32 x) (SignExt16to32 y)))
+(Leq32 x y) => (LessEqual (CMPW x y))
+(Leq64 x y) => (LessEqual (CMP x y))
+(Leq32F x y) => (FLessEqual (FCMPU x y))
+(Leq64F x y) => (FLessEqual (FCMPU x y))
+
+(Leq8U x y) => (LessEqual (CMPWU (ZeroExt8to32 x) (ZeroExt8to32 y)))
+(Leq16U x y) => (LessEqual (CMPWU (ZeroExt16to32 x) (ZeroExt16to32 y)))
+(Leq32U x y) => (LessEqual (CMPWU x y))
+(Leq64U x y) => (LessEqual (CMPU x y))
+
+// Absorb pseudo-ops into blocks.
+(If (Equal cc) yes no) => (EQ cc yes no)
+(If (NotEqual cc) yes no) => (NE cc yes no)
+(If (LessThan cc) yes no) => (LT cc yes no)
+(If (LessEqual cc) yes no) => (LE cc yes no)
+(If (GreaterThan cc) yes no) => (GT cc yes no)
+(If (GreaterEqual cc) yes no) => (GE cc yes no)
+(If (FLessThan cc) yes no) => (FLT cc yes no)
+(If (FLessEqual cc) yes no) => (FLE cc yes no)
+(If (FGreaterThan cc) yes no) => (FGT cc yes no)
+(If (FGreaterEqual cc) yes no) => (FGE cc yes no)
+
+(If cond yes no) => (NE (CMPWconst [0] cond) yes no)
+
+// Absorb boolean tests into block
+(NE (CMPWconst [0] (Equal cc)) yes no) => (EQ cc yes no)
+(NE (CMPWconst [0] (NotEqual cc)) yes no) => (NE cc yes no)
+(NE (CMPWconst [0] (LessThan cc)) yes no) => (LT cc yes no)
+(NE (CMPWconst [0] (LessEqual cc)) yes no) => (LE cc yes no)
+(NE (CMPWconst [0] (GreaterThan cc)) yes no) => (GT cc yes no)
+(NE (CMPWconst [0] (GreaterEqual cc)) yes no) => (GE cc yes no)
+(NE (CMPWconst [0] (FLessThan cc)) yes no) => (FLT cc yes no)
+(NE (CMPWconst [0] (FLessEqual cc)) yes no) => (FLE cc yes no)
+(NE (CMPWconst [0] (FGreaterThan cc)) yes no) => (FGT cc yes no)
+(NE (CMPWconst [0] (FGreaterEqual cc)) yes no) => (FGE cc yes no)
+
+// Elide compares of bit tests // TODO need to make both CC and result of ANDCC available.
+(EQ (CMPconst [0] (ANDconst [c] x)) yes no) => (EQ (ANDCCconst [c] x) yes no)
+(NE (CMPconst [0] (ANDconst [c] x)) yes no) => (NE (ANDCCconst [c] x) yes no)
+(EQ (CMPWconst [0] (ANDconst [c] x)) yes no) => (EQ (ANDCCconst [c] x) yes no)
+(NE (CMPWconst [0] (ANDconst [c] x)) yes no) => (NE (ANDCCconst [c] x) yes no)
+
+// absorb flag constants into branches
+(EQ (FlagEQ) yes no) => (First yes no)
+(EQ (FlagLT) yes no) => (First no yes)
+(EQ (FlagGT) yes no) => (First no yes)
+
+(NE (FlagEQ) yes no) => (First no yes)
+(NE (FlagLT) yes no) => (First yes no)
+(NE (FlagGT) yes no) => (First yes no)
+
+(LT (FlagEQ) yes no) => (First no yes)
+(LT (FlagLT) yes no) => (First yes no)
+(LT (FlagGT) yes no) => (First no yes)
+
+(LE (FlagEQ) yes no) => (First yes no)
+(LE (FlagLT) yes no) => (First yes no)
+(LE (FlagGT) yes no) => (First no yes)
+
+(GT (FlagEQ) yes no) => (First no yes)
+(GT (FlagLT) yes no) => (First no yes)
+(GT (FlagGT) yes no) => (First yes no)
+
+(GE (FlagEQ) yes no) => (First yes no)
+(GE (FlagLT) yes no) => (First no yes)
+(GE (FlagGT) yes no) => (First yes no)
+
+// absorb InvertFlags into branches
+(LT (InvertFlags cmp) yes no) => (GT cmp yes no)
+(GT (InvertFlags cmp) yes no) => (LT cmp yes no)
+(LE (InvertFlags cmp) yes no) => (GE cmp yes no)
+(GE (InvertFlags cmp) yes no) => (LE cmp yes no)
+(EQ (InvertFlags cmp) yes no) => (EQ cmp yes no)
+(NE (InvertFlags cmp) yes no) => (NE cmp yes no)
+
+// constant comparisons
+(CMPWconst (MOVDconst [x]) [y]) && int32(x)==int32(y) => (FlagEQ)
+(CMPWconst (MOVDconst [x]) [y]) && int32(x)<int32(y) => (FlagLT)
+(CMPWconst (MOVDconst [x]) [y]) && int32(x)>int32(y) => (FlagGT)
+
+(CMPconst (MOVDconst [x]) [y]) && x==y => (FlagEQ)
+(CMPconst (MOVDconst [x]) [y]) && x<y => (FlagLT)
+(CMPconst (MOVDconst [x]) [y]) && x>y => (FlagGT)
+
+(CMPWUconst (MOVDconst [x]) [y]) && int32(x)==int32(y) => (FlagEQ)
+(CMPWUconst (MOVDconst [x]) [y]) && uint32(x)<uint32(y) => (FlagLT)
+(CMPWUconst (MOVDconst [x]) [y]) && uint32(x)>uint32(y) => (FlagGT)
+
+(CMPUconst (MOVDconst [x]) [y]) && x==y => (FlagEQ)
+(CMPUconst (MOVDconst [x]) [y]) && uint64(x)<uint64(y) => (FlagLT)
+(CMPUconst (MOVDconst [x]) [y]) && uint64(x)>uint64(y) => (FlagGT)
+
+// other known comparisons
+//(CMPconst (MOVBUreg _) [c]) && 0xff < c => (FlagLT)
+//(CMPconst (MOVHUreg _) [c]) && 0xffff < c => (FlagLT)
+//(CMPconst (ANDconst _ [m]) [n]) && 0 <= int32(m) && int32(m) < int32(n) => (FlagLT)
+//(CMPconst (SRLconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 32 && (1<<uint32(32-c)) <= uint32(n) => (FlagLT)
+
+// absorb flag constants into boolean values
+(Equal (FlagEQ)) => (MOVDconst [1])
+(Equal (FlagLT)) => (MOVDconst [0])
+(Equal (FlagGT)) => (MOVDconst [0])
+
+(NotEqual (FlagEQ)) => (MOVDconst [0])
+(NotEqual (FlagLT)) => (MOVDconst [1])
+(NotEqual (FlagGT)) => (MOVDconst [1])
+
+(LessThan (FlagEQ)) => (MOVDconst [0])
+(LessThan (FlagLT)) => (MOVDconst [1])
+(LessThan (FlagGT)) => (MOVDconst [0])
+
+(LessEqual (FlagEQ)) => (MOVDconst [1])
+(LessEqual (FlagLT)) => (MOVDconst [1])
+(LessEqual (FlagGT)) => (MOVDconst [0])
+
+(GreaterThan (FlagEQ)) => (MOVDconst [0])
+(GreaterThan (FlagLT)) => (MOVDconst [0])
+(GreaterThan (FlagGT)) => (MOVDconst [1])
+
+(GreaterEqual (FlagEQ)) => (MOVDconst [1])
+(GreaterEqual (FlagLT)) => (MOVDconst [0])
+(GreaterEqual (FlagGT)) => (MOVDconst [1])
+
+// absorb InvertFlags into boolean values
+(Equal (InvertFlags x)) => (Equal x)
+(NotEqual (InvertFlags x)) => (NotEqual x)
+(LessThan (InvertFlags x)) => (GreaterThan x)
+(GreaterThan (InvertFlags x)) => (LessThan x)
+(LessEqual (InvertFlags x)) => (GreaterEqual x)
+(GreaterEqual (InvertFlags x)) => (LessEqual x)
+
+// Elide compares of bit tests // TODO need to make both CC and result of ANDCC available.
+((EQ|NE|LT|LE|GT|GE) (CMPconst [0] (ANDconst [c] x)) yes no) => ((EQ|NE|LT|LE|GT|GE) (ANDCCconst [c] x) yes no)
+((EQ|NE|LT|LE|GT|GE) (CMPWconst [0] (ANDconst [c] x)) yes no) => ((EQ|NE|LT|LE|GT|GE) (ANDCCconst [c] x) yes no)
+((EQ|NE|LT|LE|GT|GE) (CMPconst [0] z:(AND x y)) yes no) && z.Uses == 1 => ((EQ|NE|LT|LE|GT|GE) (ANDCC x y) yes no)
+((EQ|NE|LT|LE|GT|GE) (CMPconst [0] z:(OR x y)) yes no) && z.Uses == 1 => ((EQ|NE|LT|LE|GT|GE) (ORCC x y) yes no)
+((EQ|NE|LT|LE|GT|GE) (CMPconst [0] z:(XOR x y)) yes no) && z.Uses == 1 => ((EQ|NE|LT|LE|GT|GE) (XORCC x y) yes no)
+
+(CondSelect x y bool) && flagArg(bool) != nil => (ISEL [2] x y bool)
+(CondSelect x y bool) && flagArg(bool) == nil => (ISEL [2] x y (CMPWconst [0] bool))
+
+// Lowering loads
+(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVDload ptr mem)
+(Load <t> ptr mem) && is32BitInt(t) && isSigned(t) => (MOVWload ptr mem)
+(Load <t> ptr mem) && is32BitInt(t) && !isSigned(t) => (MOVWZload ptr mem)
+(Load <t> ptr mem) && is16BitInt(t) && isSigned(t) => (MOVHload ptr mem)
+(Load <t> ptr mem) && is16BitInt(t) && !isSigned(t) => (MOVHZload ptr mem)
+(Load <t> ptr mem) && t.IsBoolean() => (MOVBZload ptr mem)
+(Load <t> ptr mem) && is8BitInt(t) && isSigned(t) => (MOVBreg (MOVBZload ptr mem)) // PPC has no signed-byte load.
+(Load <t> ptr mem) && is8BitInt(t) && !isSigned(t) => (MOVBZload ptr mem)
+
+(Load <t> ptr mem) && is32BitFloat(t) => (FMOVSload ptr mem)
+(Load <t> ptr mem) && is64BitFloat(t) => (FMOVDload ptr mem)
+
+(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (FMOVDstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 && is32BitFloat(val.Type) => (FMOVDstore ptr val mem) // glitch from (Cvt32Fto64F x) => x -- type is wrong
+(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (FMOVSstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 && (is64BitInt(val.Type) || isPtr(val.Type)) => (MOVDstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && is32BitInt(val.Type) => (MOVWstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem)
+
+// Using Zero instead of LoweredZero allows the
+// target address to be folded where possible.
+(Zero [0] _ mem) => mem
+(Zero [1] destptr mem) => (MOVBstorezero destptr mem)
+(Zero [2] destptr mem) =>
+ (MOVHstorezero destptr mem)
+(Zero [3] destptr mem) =>
+ (MOVBstorezero [2] destptr
+ (MOVHstorezero destptr mem))
+(Zero [4] destptr mem) =>
+ (MOVWstorezero destptr mem)
+(Zero [5] destptr mem) =>
+ (MOVBstorezero [4] destptr
+ (MOVWstorezero destptr mem))
+(Zero [6] destptr mem) =>
+ (MOVHstorezero [4] destptr
+ (MOVWstorezero destptr mem))
+(Zero [7] destptr mem) =>
+ (MOVBstorezero [6] destptr
+ (MOVHstorezero [4] destptr
+ (MOVWstorezero destptr mem)))
+
+// MOVD stores use the DS instruction form, which requires the offset to be a multiple of 4
+(Zero [8] {t} destptr mem) && t.Alignment()%4 == 0 =>
+ (MOVDstorezero destptr mem)
+(Zero [8] destptr mem) =>
+ (MOVWstorezero [4] destptr
+ (MOVWstorezero [0] destptr mem))
+// Handle these cases only if aligned properly, otherwise use general case below
+(Zero [12] {t} destptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstorezero [8] destptr
+ (MOVDstorezero [0] destptr mem))
+(Zero [16] {t} destptr mem) && t.Alignment()%4 == 0 =>
+ (MOVDstorezero [8] destptr
+ (MOVDstorezero [0] destptr mem))
+(Zero [24] {t} destptr mem) && t.Alignment()%4 == 0 =>
+ (MOVDstorezero [16] destptr
+ (MOVDstorezero [8] destptr
+ (MOVDstorezero [0] destptr mem)))
+(Zero [32] {t} destptr mem) && t.Alignment()%4 == 0 =>
+ (MOVDstorezero [24] destptr
+ (MOVDstorezero [16] destptr
+ (MOVDstorezero [8] destptr
+ (MOVDstorezero [0] destptr mem))))
+
+// Handle cases not handled above
+// Lowered Short cases do not generate loops, and as a result don't clobber
+// the address registers or flags.
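+// For example (illustrative): with objabi.GOPPC64 <= 8, a 48-byte clear matches the
+// s < 64 rule below and becomes LoweredZeroShort, while larger sizes fall through to
+// the looping LoweredZero.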
+(Zero [s] ptr mem) && objabi.GOPPC64 <= 8 && s < 64 => (LoweredZeroShort [s] ptr mem)
+(Zero [s] ptr mem) && objabi.GOPPC64 <= 8 => (LoweredZero [s] ptr mem)
+(Zero [s] ptr mem) && s < 128 && objabi.GOPPC64 >= 9 => (LoweredQuadZeroShort [s] ptr mem)
+(Zero [s] ptr mem) && objabi.GOPPC64 >= 9 => (LoweredQuadZero [s] ptr mem)
+
+// moves
+// Only the MOVD and MOVW instructions require 4 byte
+// alignment in the offset field. The other MOVx instructions
+// allow any alignment.
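+// For example (illustrative): the Move [8] rules below emit MOVDload/MOVDstore only when
+// t.Alignment()%4 == 0; otherwise the 8-byte copy is split into two 4-byte word moves.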
+(Move [0] _ _ mem) => mem
+(Move [1] dst src mem) => (MOVBstore dst (MOVBZload src mem) mem)
+(Move [2] dst src mem) =>
+ (MOVHstore dst (MOVHZload src mem) mem)
+(Move [4] dst src mem) =>
+ (MOVWstore dst (MOVWZload src mem) mem)
+// MOVD loads and stores must have offsets that are a multiple of 4 (DS instruction form)
+(Move [8] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVDstore dst (MOVDload src mem) mem)
+(Move [8] dst src mem) =>
+ (MOVWstore [4] dst (MOVWZload [4] src mem)
+ (MOVWstore dst (MOVWZload src mem) mem))
+(Move [3] dst src mem) =>
+ (MOVBstore [2] dst (MOVBZload [2] src mem)
+ (MOVHstore dst (MOVHload src mem) mem))
+(Move [5] dst src mem) =>
+ (MOVBstore [4] dst (MOVBZload [4] src mem)
+ (MOVWstore dst (MOVWZload src mem) mem))
+(Move [6] dst src mem) =>
+ (MOVHstore [4] dst (MOVHZload [4] src mem)
+ (MOVWstore dst (MOVWZload src mem) mem))
+(Move [7] dst src mem) =>
+ (MOVBstore [6] dst (MOVBZload [6] src mem)
+ (MOVHstore [4] dst (MOVHZload [4] src mem)
+ (MOVWstore dst (MOVWZload src mem) mem)))
+
+// Large move uses a loop. Since the address is computed and the
+// offset is zero, any alignment can be used.
+(Move [s] dst src mem) && s > 8 && objabi.GOPPC64 <= 8 && logLargeCopy(v, s) =>
+ (LoweredMove [s] dst src mem)
+(Move [s] dst src mem) && s > 8 && s <= 64 && objabi.GOPPC64 >= 9 =>
+ (LoweredQuadMoveShort [s] dst src mem)
+(Move [s] dst src mem) && s > 8 && objabi.GOPPC64 >= 9 && logLargeCopy(v, s) =>
+ (LoweredQuadMove [s] dst src mem)
+
+// Calls
+// Lowering calls
+(StaticCall ...) => (CALLstatic ...)
+(ClosureCall ...) => (CALLclosure ...)
+(InterCall ...) => (CALLinter ...)
+
+// Miscellaneous
+(GetClosurePtr ...) => (LoweredGetClosurePtr ...)
+(GetCallerSP ...) => (LoweredGetCallerSP ...)
+(GetCallerPC ...) => (LoweredGetCallerPC ...)
+(IsNonNil ptr) => (NotEqual (CMPconst [0] ptr))
+(IsInBounds idx len) => (LessThan (CMPU idx len))
+(IsSliceInBounds idx len) => (LessEqual (CMPU idx len))
+(NilCheck ...) => (LoweredNilCheck ...)
+
+// Write barrier.
+(WB ...) => (LoweredWB ...)
+
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem)
+
+// Optimizations
+// Note that PPC "logical" immediates come in 0:15 and 16:31 unsigned immediate forms,
+// so ORconst, XORconst easily expand into a pair.
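+// For illustration (assumed assembler behavior): an ORconst whose constant has bits set in
+// both halves, e.g. 0x00FF00FF, is emitted as an ori/oris pair rather than a single instruction.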
+
+// Include very-large constants in the const-const case.
+(AND (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c&d])
+(OR (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c|d])
+(XOR (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c^d])
+(ORN (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c|^d])
+(ANDN (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c&^d])
+(NOR (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [^(c|d)])
+
+// Discover consts
+(AND x (MOVDconst [c])) && isU16Bit(c) => (ANDconst [c] x)
+(XOR x (MOVDconst [c])) && isU32Bit(c) => (XORconst [c] x)
+(OR x (MOVDconst [c])) && isU32Bit(c) => (ORconst [c] x)
+
+// Simplify consts
+(ANDconst [c] (ANDconst [d] x)) => (ANDconst [c&d] x)
+(ORconst [c] (ORconst [d] x)) => (ORconst [c|d] x)
+(XORconst [c] (XORconst [d] x)) => (XORconst [c^d] x)
+(ANDconst [-1] x) => x
+(ANDconst [0] _) => (MOVDconst [0])
+(XORconst [0] x) => x
+(ORconst [-1] _) => (MOVDconst [-1])
+(ORconst [0] x) => x
+
+// zero-extend of small and => small and
+(MOVBZreg y:(ANDconst [c] _)) && uint64(c) <= 0xFF => y
+(MOVHZreg y:(ANDconst [c] _)) && uint64(c) <= 0xFFFF => y
+(MOVWZreg y:(ANDconst [c] _)) && uint64(c) <= 0xFFFFFFFF => y
+(MOVWZreg y:(AND (MOVDconst [c]) _)) && uint64(c) <= 0xFFFFFFFF => y
+
+// sign extend of small-positive and => small-positive-and
+(MOVBreg y:(ANDconst [c] _)) && uint64(c) <= 0x7F => y
+(MOVHreg y:(ANDconst [c] _)) && uint64(c) <= 0x7FFF => y
+(MOVWreg y:(ANDconst [c] _)) && uint64(c) <= 0xFFFF => y // 0xFFFF is the largest immediate constant; regarded as a 32-bit value it is still > 0
+(MOVWreg y:(AND (MOVDconst [c]) _)) && uint64(c) <= 0x7FFFFFFF => y
+
+// small and of zero-extend => either zero-extend or small and
+(ANDconst [c] y:(MOVBZreg _)) && c&0xFF == 0xFF => y
+(ANDconst [0xFF] y:(MOVBreg _)) => y
+(ANDconst [c] y:(MOVHZreg _)) && c&0xFFFF == 0xFFFF => y
+(ANDconst [0xFFFF] y:(MOVHreg _)) => y
+
+(AND (MOVDconst [c]) y:(MOVWZreg _)) && c&0xFFFFFFFF == 0xFFFFFFFF => y
+(AND (MOVDconst [0xFFFFFFFF]) y:(MOVWreg x)) => (MOVWZreg x)
+// normal case
+(ANDconst [c] (MOV(B|BZ)reg x)) => (ANDconst [c&0xFF] x)
+(ANDconst [c] (MOV(H|HZ)reg x)) => (ANDconst [c&0xFFFF] x)
+(ANDconst [c] (MOV(W|WZ)reg x)) => (ANDconst [c&0xFFFFFFFF] x)
+
+// Eliminate unnecessary sign/zero extend following right shift
+(MOV(B|H|W)Zreg (SRWconst [c] (MOVBZreg x))) => (SRWconst [c] (MOVBZreg x))
+(MOV(H|W)Zreg (SRWconst [c] (MOVHZreg x))) => (SRWconst [c] (MOVHZreg x))
+(MOVWZreg (SRWconst [c] (MOVWZreg x))) => (SRWconst [c] (MOVWZreg x))
+(MOV(B|H|W)reg (SRAWconst [c] (MOVBreg x))) => (SRAWconst [c] (MOVBreg x))
+(MOV(H|W)reg (SRAWconst [c] (MOVHreg x))) => (SRAWconst [c] (MOVHreg x))
+(MOVWreg (SRAWconst [c] (MOVWreg x))) => (SRAWconst [c] (MOVWreg x))
+
+(MOVWZreg (SRWconst [c] x)) && sizeof(x.Type) <= 32 => (SRWconst [c] x)
+(MOVHZreg (SRWconst [c] x)) && sizeof(x.Type) <= 16 => (SRWconst [c] x)
+(MOVBZreg (SRWconst [c] x)) && sizeof(x.Type) == 8 => (SRWconst [c] x)
+(MOVWreg (SRAWconst [c] x)) && sizeof(x.Type) <= 32 => (SRAWconst [c] x)
+(MOVHreg (SRAWconst [c] x)) && sizeof(x.Type) <= 16 => (SRAWconst [c] x)
+(MOVBreg (SRAWconst [c] x)) && sizeof(x.Type) == 8 => (SRAWconst [c] x)
+
+// initial right shift will handle sign/zero extend
+(MOVBZreg (SRDconst [c] x)) && c>=56 => (SRDconst [c] x)
+(MOVBreg (SRDconst [c] x)) && c>56 => (SRDconst [c] x)
+(MOVBreg (SRDconst [c] x)) && c==56 => (SRADconst [c] x)
+(MOVBreg (SRADconst [c] x)) && c>=56 => (SRADconst [c] x)
+(MOVBZreg (SRWconst [c] x)) && c>=24 => (SRWconst [c] x)
+(MOVBreg (SRWconst [c] x)) && c>24 => (SRWconst [c] x)
+(MOVBreg (SRWconst [c] x)) && c==24 => (SRAWconst [c] x)
+(MOVBreg (SRAWconst [c] x)) && c>=24 => (SRAWconst [c] x)
+
+(MOVHZreg (SRDconst [c] x)) && c>=48 => (SRDconst [c] x)
+(MOVHreg (SRDconst [c] x)) && c>48 => (SRDconst [c] x)
+(MOVHreg (SRDconst [c] x)) && c==48 => (SRADconst [c] x)
+(MOVHreg (SRADconst [c] x)) && c>=48 => (SRADconst [c] x)
+(MOVHZreg (SRWconst [c] x)) && c>=16 => (SRWconst [c] x)
+(MOVHreg (SRWconst [c] x)) && c>16 => (SRWconst [c] x)
+(MOVHreg (SRAWconst [c] x)) && c>=16 => (SRAWconst [c] x)
+(MOVHreg (SRWconst [c] x)) && c==16 => (SRAWconst [c] x)
+
+(MOVWZreg (SRDconst [c] x)) && c>=32 => (SRDconst [c] x)
+(MOVWreg (SRDconst [c] x)) && c>32 => (SRDconst [c] x)
+(MOVWreg (SRADconst [c] x)) && c>=32 => (SRADconst [c] x)
+(MOVWreg (SRDconst [c] x)) && c==32 => (SRADconst [c] x)
+
+// Various redundant zero/sign extension combinations.
+(MOVBZreg y:(MOVBZreg _)) => y // repeat
+(MOVBreg y:(MOVBreg _)) => y // repeat
+(MOVBreg (MOVBZreg x)) => (MOVBreg x)
+(MOVBZreg (MOVBreg x)) => (MOVBZreg x)
+
+// H - there are more combinations than these
+
+(MOVHZreg y:(MOVHZreg _)) => y // repeat
+(MOVHZreg y:(MOVBZreg _)) => y // wide of narrow
+(MOVHZreg y:(MOVHBRload _ _)) => y
+
+(MOVHreg y:(MOVHreg _)) => y // repeat
+(MOVHreg y:(MOVBreg _)) => y // wide of narrow
+
+(MOVHreg y:(MOVHZreg x)) => (MOVHreg x)
+(MOVHZreg y:(MOVHreg x)) => (MOVHZreg x)
+
+// W - there are more combinations than these
+
+(MOVWZreg y:(MOVWZreg _)) => y // repeat
+(MOVWZreg y:(MOVHZreg _)) => y // wide of narrow
+(MOVWZreg y:(MOVBZreg _)) => y // wide of narrow
+(MOVWZreg y:(MOVHBRload _ _)) => y
+(MOVWZreg y:(MOVWBRload _ _)) => y
+
+(MOVWreg y:(MOVWreg _)) => y // repeat
+(MOVWreg y:(MOVHreg _)) => y // wide of narrow
+(MOVWreg y:(MOVBreg _)) => y // wide of narrow
+
+(MOVWreg y:(MOVWZreg x)) => (MOVWreg x)
+(MOVWZreg y:(MOVWreg x)) => (MOVWZreg x)
+
+// Truncate, then a logical op, then truncate again: the inner (first) truncate can be omitted when the outer truncate is of lesser or equal width.
+(MOVWZreg ((OR|XOR|AND) <t> x (MOVWZreg y))) => (MOVWZreg ((OR|XOR|AND) <t> x y))
+(MOVHZreg ((OR|XOR|AND) <t> x (MOVWZreg y))) => (MOVHZreg ((OR|XOR|AND) <t> x y))
+(MOVHZreg ((OR|XOR|AND) <t> x (MOVHZreg y))) => (MOVHZreg ((OR|XOR|AND) <t> x y))
+(MOVBZreg ((OR|XOR|AND) <t> x (MOVWZreg y))) => (MOVBZreg ((OR|XOR|AND) <t> x y))
+(MOVBZreg ((OR|XOR|AND) <t> x (MOVHZreg y))) => (MOVBZreg ((OR|XOR|AND) <t> x y))
+(MOVBZreg ((OR|XOR|AND) <t> x (MOVBZreg y))) => (MOVBZreg ((OR|XOR|AND) <t> x y))
+
+(MOV(B|H|W)Zreg z:(ANDconst [c] (MOVBZload ptr x))) => z
+(MOVBZreg z:(AND y (MOVBZload ptr x))) => z
+(MOV(H|W)Zreg z:(ANDconst [c] (MOVHZload ptr x))) => z
+(MOVHZreg z:(AND y (MOVHZload ptr x))) => z
+(MOVWZreg z:(ANDconst [c] (MOVWZload ptr x))) => z
+(MOVWZreg z:(AND y (MOVWZload ptr x))) => z
+
+// Arithmetic constant ops
+
+(ADD x (MOVDconst [c])) && is32Bit(c) => (ADDconst [c] x)
+(ADDconst [c] (ADDconst [d] x)) && is32Bit(c+d) => (ADDconst [c+d] x)
+(ADDconst [0] x) => x
+(SUB x (MOVDconst [c])) && is32Bit(-c) => (ADDconst [-c] x)
+
+(ADDconst [c] (MOVDaddr [d] {sym} x)) && is32Bit(c+int64(d)) => (MOVDaddr [int32(c+int64(d))] {sym} x)
+(ADDconst [c] x:(SP)) && is32Bit(c) => (MOVDaddr [int32(c)] x) // so it is rematerializeable
+
+(MULL(W|D) x (MOVDconst [c])) && is16Bit(c) => (MULL(W|D)const [int32(c)] x)
+
+// Subtract from (with carry, but ignored) constant.
+// Note, these clobber the carry bit.
+(SUB (MOVDconst [c]) x) && is32Bit(c) => (SUBFCconst [c] x)
+(SUBFCconst [c] (NEG x)) => (ADDconst [c] x)
+(SUBFCconst [c] (SUBFCconst [d] x)) && is32Bit(c-d) => (ADDconst [c-d] x)
+(SUBFCconst [0] x) => (NEG x)
+(ADDconst [c] (SUBFCconst [d] x)) && is32Bit(c+d) => (SUBFCconst [c+d] x)
+(NEG (ADDconst [c] x)) && is32Bit(-c) => (SUBFCconst [-c] x)
+(NEG (SUBFCconst [c] x)) && is32Bit(-c) => (ADDconst [-c] x)
+
+// Use register moves instead of stores and loads to move int<=>float values
+// Common with math Float64bits, Float64frombits
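+// For example, math.Float64bits(f) typically compiles to an FMOVDstore followed by a
+// MOVDload of the same stack slot; the first rule below collapses that pair into an
+// MFVSRD register move (and Float64frombits likewise becomes MTVSRD).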
+(MOVDload [off] {sym} ptr (FMOVDstore [off] {sym} ptr x _)) => (MFVSRD x)
+(FMOVDload [off] {sym} ptr (MOVDstore [off] {sym} ptr x _)) => (MTVSRD x)
+
+(FMOVDstore [off] {sym} ptr (MTVSRD x) mem) => (MOVDstore [off] {sym} ptr x mem)
+(MOVDstore [off] {sym} ptr (MFVSRD x) mem) => (FMOVDstore [off] {sym} ptr x mem)
+
+(MTVSRD (MOVDconst [c])) && !math.IsNaN(math.Float64frombits(uint64(c))) => (FMOVDconst [math.Float64frombits(uint64(c))])
+(MFVSRD (FMOVDconst [c])) => (MOVDconst [int64(math.Float64bits(c))])
+
+(MTVSRD x:(MOVDload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (FMOVDload [off] {sym} ptr mem)
+(MFVSRD x:(FMOVDload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVDload [off] {sym} ptr mem)
+
+// Fold offsets for stores.
+(MOVDstore [off1] {sym} (ADDconst [off2] x) val mem) && is16Bit(int64(off1)+off2) && (int64(off1)+off2)%4 == 0 => (MOVDstore [off1+int32(off2)] {sym} x val mem)
+(MOVWstore [off1] {sym} (ADDconst [off2] x) val mem) && is16Bit(int64(off1)+off2) => (MOVWstore [off1+int32(off2)] {sym} x val mem)
+(MOVHstore [off1] {sym} (ADDconst [off2] x) val mem) && is16Bit(int64(off1)+off2) => (MOVHstore [off1+int32(off2)] {sym} x val mem)
+(MOVBstore [off1] {sym} (ADDconst [off2] x) val mem) && is16Bit(int64(off1)+off2) => (MOVBstore [off1+int32(off2)] {sym} x val mem)
+
+(FMOVSstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is16Bit(int64(off1)+off2) => (FMOVSstore [off1+int32(off2)] {sym} ptr val mem)
+(FMOVDstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is16Bit(int64(off1)+off2) => (FMOVDstore [off1+int32(off2)] {sym} ptr val mem)
+
+// Fold address into load/store.
+// The assembler needs several instructions and a temp register to
+// access a global, and it reloads the temp register each time. So
+// don't fold the address of a global unless there is only one use.
+(MOVBstore [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2)
+ && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) =>
+ (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVHstore [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2)
+ && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) =>
+ (MOVHstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVWstore [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2)
+ && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) =>
+ (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(MOVDstore [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2)
+ && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) && (off1+off2)%4 == 0 =>
+ (MOVDstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+
+(FMOVSstore [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2)
+ && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) =>
+ (FMOVSstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+(FMOVDstore [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2)
+ && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) =>
+ (FMOVDstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
+
+(MOVBZload [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2)
+ && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) =>
+ (MOVBZload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHload [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2)
+ && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) =>
+ (MOVHload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHZload [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2)
+ && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) =>
+ (MOVHZload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWload [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2)
+ && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) && (off1+off2)%4 == 0 =>
+ (MOVWload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWZload [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2)
+ && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) =>
+ (MOVWZload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVDload [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2)
+ && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) && (off1+off2)%4 == 0 =>
+ (MOVDload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(FMOVSload [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2)
+ && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) =>
+ (FMOVSload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(FMOVDload [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2)
+ && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) =>
+ (FMOVDload [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+
+// Fold offsets for loads.
+(FMOVSload [off1] {sym} (ADDconst [off2] ptr) mem) && is16Bit(int64(off1)+off2) => (FMOVSload [off1+int32(off2)] {sym} ptr mem)
+(FMOVDload [off1] {sym} (ADDconst [off2] ptr) mem) && is16Bit(int64(off1)+off2) => (FMOVDload [off1+int32(off2)] {sym} ptr mem)
+
+(MOVDload [off1] {sym} (ADDconst [off2] x) mem) && is16Bit(int64(off1)+off2) && (int64(off1)+off2)%4 == 0 => (MOVDload [off1+int32(off2)] {sym} x mem)
+(MOVWload [off1] {sym} (ADDconst [off2] x) mem) && is16Bit(int64(off1)+off2) && (int64(off1)+off2)%4 == 0 => (MOVWload [off1+int32(off2)] {sym} x mem)
+(MOVWZload [off1] {sym} (ADDconst [off2] x) mem) && is16Bit(int64(off1)+off2) => (MOVWZload [off1+int32(off2)] {sym} x mem)
+(MOVHload [off1] {sym} (ADDconst [off2] x) mem) && is16Bit(int64(off1)+off2) => (MOVHload [off1+int32(off2)] {sym} x mem)
+(MOVHZload [off1] {sym} (ADDconst [off2] x) mem) && is16Bit(int64(off1)+off2) => (MOVHZload [off1+int32(off2)] {sym} x mem)
+(MOVBZload [off1] {sym} (ADDconst [off2] x) mem) && is16Bit(int64(off1)+off2) => (MOVBZload [off1+int32(off2)] {sym} x mem)
+
+// Determine load + addressing that can be done as a register indexed load
+(MOV(D|W|WZ|H|HZ|BZ)load [0] {sym} p:(ADD ptr idx) mem) && sym == nil && p.Uses == 1 => (MOV(D|W|WZ|H|HZ|BZ)loadidx ptr idx mem)
+
+// Determine indexed loads with constant values that can be done without index
+(MOV(D|W)loadidx ptr (MOVDconst [c]) mem) && is16Bit(c) && c%4 == 0 => (MOV(D|W)load [int32(c)] ptr mem)
+(MOV(WZ|H|HZ|BZ)loadidx ptr (MOVDconst [c]) mem) && is16Bit(c) => (MOV(WZ|H|HZ|BZ)load [int32(c)] ptr mem)
+(MOV(D|W)loadidx (MOVDconst [c]) ptr mem) && is16Bit(c) && c%4 == 0 => (MOV(D|W)load [int32(c)] ptr mem)
+(MOV(WZ|H|HZ|BZ)loadidx (MOVDconst [c]) ptr mem) && is16Bit(c) => (MOV(WZ|H|HZ|BZ)load [int32(c)] ptr mem)
+
+// Store of zero => storezero
+(MOVDstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVDstorezero [off] {sym} ptr mem)
+(MOVWstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVWstorezero [off] {sym} ptr mem)
+(MOVHstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVHstorezero [off] {sym} ptr mem)
+(MOVBstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVBstorezero [off] {sym} ptr mem)
+
+// Fold offsets for storezero
+(MOVDstorezero [off1] {sym} (ADDconst [off2] x) mem) && is16Bit(int64(off1)+off2) && (int64(off1)+off2)%4 == 0 =>
+ (MOVDstorezero [off1+int32(off2)] {sym} x mem)
+(MOVWstorezero [off1] {sym} (ADDconst [off2] x) mem) && is16Bit(int64(off1)+off2) =>
+ (MOVWstorezero [off1+int32(off2)] {sym} x mem)
+(MOVHstorezero [off1] {sym} (ADDconst [off2] x) mem) && is16Bit(int64(off1)+off2) =>
+ (MOVHstorezero [off1+int32(off2)] {sym} x mem)
+(MOVBstorezero [off1] {sym} (ADDconst [off2] x) mem) && is16Bit(int64(off1)+off2) =>
+ (MOVBstorezero [off1+int32(off2)] {sym} x mem)
+
+// Stores with addressing that can be done as indexed stores
+(MOV(D|W|H|B)store [0] {sym} p:(ADD ptr idx) val mem) && sym == nil && p.Uses == 1 => (MOV(D|W|H|B)storeidx ptr idx val mem)
+
+// Stores with constant index values can be done without indexed instructions
+(MOVDstoreidx ptr (MOVDconst [c]) val mem) && is16Bit(c) && c%4 == 0 => (MOVDstore [int32(c)] ptr val mem)
+(MOV(W|H|B)storeidx ptr (MOVDconst [c]) val mem) && is16Bit(c) => (MOV(W|H|B)store [int32(c)] ptr val mem)
+(MOVDstoreidx (MOVDconst [c]) ptr val mem) && is16Bit(c) && c%4 == 0 => (MOVDstore [int32(c)] ptr val mem)
+(MOV(W|H|B)storeidx (MOVDconst [c]) ptr val mem) && is16Bit(c) => (MOV(W|H|B)store [int32(c)] ptr val mem)
+
+// Fold symbols into storezero
+(MOVDstorezero [off1] {sym1} p:(MOVDaddr [off2] {sym2} x) mem) && canMergeSym(sym1,sym2)
+ && (x.Op != OpSB || p.Uses == 1) && (off1+off2)%4 == 0 =>
+ (MOVDstorezero [off1+off2] {mergeSym(sym1,sym2)} x mem)
+(MOVWstorezero [off1] {sym1} p:(MOVDaddr [off2] {sym2} x) mem) && canMergeSym(sym1,sym2)
+ && (x.Op != OpSB || p.Uses == 1) =>
+ (MOVWstorezero [off1+off2] {mergeSym(sym1,sym2)} x mem)
+(MOVHstorezero [off1] {sym1} p:(MOVDaddr [off2] {sym2} x) mem) && canMergeSym(sym1,sym2)
+ && (x.Op != OpSB || p.Uses == 1) =>
+ (MOVHstorezero [off1+off2] {mergeSym(sym1,sym2)} x mem)
+(MOVBstorezero [off1] {sym1} p:(MOVDaddr [off2] {sym2} x) mem) && canMergeSym(sym1,sym2)
+ && (x.Op != OpSB || p.Uses == 1) =>
+ (MOVBstorezero [off1+off2] {mergeSym(sym1,sym2)} x mem)
+
+// atomic intrinsics
+(AtomicLoad(8|32|64|Ptr) ptr mem) => (LoweredAtomicLoad(8|32|64|Ptr) [1] ptr mem)
+(AtomicLoadAcq(32|64) ptr mem) => (LoweredAtomicLoad(32|64) [0] ptr mem)
+
+(AtomicStore(8|32|64) ptr val mem) => (LoweredAtomicStore(8|32|64) [1] ptr val mem)
+(AtomicStoreRel(32|64) ptr val mem) => (LoweredAtomicStore(32|64) [0] ptr val mem)
+//(AtomicStorePtrNoWB ptr val mem) => (STLR ptr val mem)
+
+(AtomicExchange(32|64) ...) => (LoweredAtomicExchange(32|64) ...)
+
+(AtomicAdd(32|64) ...) => (LoweredAtomicAdd(32|64) ...)
+
+(AtomicCompareAndSwap(32|64) ptr old new_ mem) => (LoweredAtomicCas(32|64) [1] ptr old new_ mem)
+(AtomicCompareAndSwapRel32 ptr old new_ mem) => (LoweredAtomicCas32 [0] ptr old new_ mem)
+
+(AtomicAnd8 ...) => (LoweredAtomicAnd8 ...)
+(AtomicAnd32 ...) => (LoweredAtomicAnd32 ...)
+(AtomicOr8 ...) => (LoweredAtomicOr8 ...)
+(AtomicOr32 ...) => (LoweredAtomicOr32 ...)
+
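+// Slicemask yields 0 when the length is 0 and all ones otherwise: negating a nonzero
+// (nonnegative) length sets the sign bit, and the arithmetic shift by 63 then smears it
+// across the whole register.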
+(Slicemask <t> x) => (SRADconst (NEG <t> x) [63])
+
+// Note that MOV??reg returns a 64-bit int; x is not necessarily that wide.
+// This may interact with other patterns in the future. (Compare with arm64.)
+(MOV(B|H|W)Zreg x:(MOVBZload _ _)) => x
+(MOV(B|H|W)Zreg x:(MOVBZloadidx _ _ _)) => x
+(MOV(H|W)Zreg x:(MOVHZload _ _)) => x
+(MOV(H|W)Zreg x:(MOVHZloadidx _ _ _)) => x
+(MOV(H|W)reg x:(MOVHload _ _)) => x
+(MOV(H|W)reg x:(MOVHloadidx _ _ _)) => x
+(MOVWZreg x:(MOVWZload _ _)) => x
+(MOVWZreg x:(MOVWZloadidx _ _ _)) => x
+(MOVWreg x:(MOVWload _ _)) => x
+(MOVWreg x:(MOVWloadidx _ _ _)) => x
+
+// don't extend if argument is already extended
+(MOVBreg x:(Arg <t>)) && is8BitInt(t) && isSigned(t) => x
+(MOVBZreg x:(Arg <t>)) && is8BitInt(t) && !isSigned(t) => x
+(MOVHreg x:(Arg <t>)) && (is8BitInt(t) || is16BitInt(t)) && isSigned(t) => x
+(MOVHZreg x:(Arg <t>)) && (is8BitInt(t) || is16BitInt(t)) && !isSigned(t) => x
+(MOVWreg x:(Arg <t>)) && (is8BitInt(t) || is16BitInt(t) || is32BitInt(t)) && isSigned(t) => x
+(MOVWZreg x:(Arg <t>)) && (is8BitInt(t) || is16BitInt(t) || is32BitInt(t)) && !isSigned(t) => x
+
+(MOVBZreg (MOVDconst [c])) => (MOVDconst [int64(uint8(c))])
+(MOVBreg (MOVDconst [c])) => (MOVDconst [int64(int8(c))])
+(MOVHZreg (MOVDconst [c])) => (MOVDconst [int64(uint16(c))])
+(MOVHreg (MOVDconst [c])) => (MOVDconst [int64(int16(c))])
+(MOVWreg (MOVDconst [c])) => (MOVDconst [int64(int32(c))])
+(MOVWZreg (MOVDconst [c])) => (MOVDconst [int64(uint32(c))])
+
+// Implement clrsldi and clrslwi extended mnemonics as described in
+// ISA 3.0 section C.8. AuxInt field contains values needed for
+// the instructions, packed together since there is only one available.
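+// For example (illustrative reading of the first rule below): newPPC64ShiftAuxInt(c,56,63,64)
+// packs the shift amount c with a mask covering bits 56..63 of a 64-bit value, i.e. the low
+// byte kept by MOVBZreg, so the CLRLSLDI clears the upper 56 bits and shifts left by c in one op.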
+(SLDconst [c] z:(MOVBZreg x)) && c < 8 && z.Uses == 1 => (CLRLSLDI [newPPC64ShiftAuxInt(c,56,63,64)] x)
+(SLDconst [c] z:(MOVHZreg x)) && c < 16 && z.Uses == 1 => (CLRLSLDI [newPPC64ShiftAuxInt(c,48,63,64)] x)
+(SLDconst [c] z:(MOVWZreg x)) && c < 32 && z.Uses == 1 => (CLRLSLDI [newPPC64ShiftAuxInt(c,32,63,64)] x)
+
+(SLDconst [c] z:(ANDconst [d] x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) && c <= (64-getPPC64ShiftMaskLength(d)) => (CLRLSLDI [newPPC64ShiftAuxInt(c,64-getPPC64ShiftMaskLength(d),63,64)] x)
+(SLDconst [c] z:(AND (MOVDconst [d]) x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) && c<=(64-getPPC64ShiftMaskLength(d)) => (CLRLSLDI [newPPC64ShiftAuxInt(c,64-getPPC64ShiftMaskLength(d),63,64)] x)
+(SLWconst [c] z:(MOVBZreg x)) && z.Uses == 1 && c < 8 => (CLRLSLWI [newPPC64ShiftAuxInt(c,24,31,32)] x)
+(SLWconst [c] z:(MOVHZreg x)) && z.Uses == 1 && c < 16 => (CLRLSLWI [newPPC64ShiftAuxInt(c,16,31,32)] x)
+(SLWconst [c] z:(ANDconst [d] x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) && c<=(32-getPPC64ShiftMaskLength(d)) => (CLRLSLWI [newPPC64ShiftAuxInt(c,32-getPPC64ShiftMaskLength(d),31,32)] x)
+(SLWconst [c] z:(AND (MOVDconst [d]) x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) && c<=(32-getPPC64ShiftMaskLength(d)) => (CLRLSLWI [newPPC64ShiftAuxInt(c,32-getPPC64ShiftMaskLength(d),31,32)] x)
+// special case for power9
+(SL(W|D)const [c] z:(MOVWreg x)) && c < 32 && objabi.GOPPC64 >= 9 => (EXTSWSLconst [c] x)
+
+// Lose widening ops fed to stores
+(MOVBstore [off] {sym} ptr (MOV(B|BZ|H|HZ|W|WZ)reg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOV(H|HZ|W|WZ)reg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOV(W|WZ)reg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (SRWconst (MOV(H|HZ)reg x) [c]) mem) && c <= 8 => (MOVBstore [off] {sym} ptr (SRWconst <typ.UInt32> x [c]) mem)
+(MOVBstore [off] {sym} ptr (SRWconst (MOV(W|WZ)reg x) [c]) mem) && c <= 24 => (MOVBstore [off] {sym} ptr (SRWconst <typ.UInt32> x [c]) mem)
+(MOVBstoreidx ptr idx (MOV(B|BZ|H|HZ|W|WZ)reg x) mem) => (MOVBstoreidx ptr idx x mem)
+(MOVHstoreidx ptr idx (MOV(H|HZ|W|WZ)reg x) mem) => (MOVHstoreidx ptr idx x mem)
+(MOVWstoreidx ptr idx (MOV(W|WZ)reg x) mem) => (MOVWstoreidx ptr idx x mem)
+(MOVBstoreidx ptr idx (SRWconst (MOV(H|HZ)reg x) [c]) mem) && c <= 8 => (MOVBstoreidx ptr idx (SRWconst <typ.UInt32> x [c]) mem)
+(MOVBstoreidx ptr idx (SRWconst (MOV(W|WZ)reg x) [c]) mem) && c <= 24 => (MOVBstoreidx ptr idx (SRWconst <typ.UInt32> x [c]) mem)
+(MOVHBRstore {sym} ptr (MOV(H|HZ|W|WZ)reg x) mem) => (MOVHBRstore {sym} ptr x mem)
+(MOVWBRstore {sym} ptr (MOV(W|WZ)reg x) mem) => (MOVWBRstore {sym} ptr x mem)
+
+// Lose W-widening ops fed to compare-W
+(CMPW x (MOVWreg y)) => (CMPW x y)
+(CMPW (MOVWreg x) y) => (CMPW x y)
+(CMPWU x (MOVWZreg y)) => (CMPWU x y)
+(CMPWU (MOVWZreg x) y) => (CMPWU x y)
+
+(CMP x (MOVDconst [c])) && is16Bit(c) => (CMPconst x [c])
+(CMP (MOVDconst [c]) y) && is16Bit(c) => (InvertFlags (CMPconst y [c]))
+(CMPW x (MOVDconst [c])) && is16Bit(c) => (CMPWconst x [int32(c)])
+(CMPW (MOVDconst [c]) y) && is16Bit(c) => (InvertFlags (CMPWconst y [int32(c)]))
+
+(CMPU x (MOVDconst [c])) && isU16Bit(c) => (CMPUconst x [c])
+(CMPU (MOVDconst [c]) y) && isU16Bit(c) => (InvertFlags (CMPUconst y [c]))
+(CMPWU x (MOVDconst [c])) && isU16Bit(c) => (CMPWUconst x [int32(c)])
+(CMPWU (MOVDconst [c]) y) && isU16Bit(c) => (InvertFlags (CMPWUconst y [int32(c)]))
+
+// Canonicalize the order of arguments to comparisons - helps with CSE.
+((CMP|CMPW|CMPU|CMPWU) x y) && x.ID > y.ID => (InvertFlags ((CMP|CMPW|CMPU|CMPWU) y x))
+
+// ISEL auxInt values 0=LT 1=GT 2=EQ arg2 ? arg0 : arg1
+// ISEL auxInt values 4=GE 5=LE 6=NE arg2 ? arg1 : arg0
+// ISELB special case where arg0, arg1 values are 0, 1
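+// For example, the first rule below reads: (Equal cmp) becomes ISELB [2] (MOVDconst [1]) cmp,
+// i.e. select 1 when the EQ bit of cmp is set and 0 otherwise.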
+
+(Equal cmp) => (ISELB [2] (MOVDconst [1]) cmp)
+(NotEqual cmp) => (ISELB [6] (MOVDconst [1]) cmp)
+(LessThan cmp) => (ISELB [0] (MOVDconst [1]) cmp)
+(FLessThan cmp) => (ISELB [0] (MOVDconst [1]) cmp)
+(FLessEqual cmp) => (ISEL [2] (MOVDconst [1]) (ISELB [0] (MOVDconst [1]) cmp) cmp)
+(GreaterEqual cmp) => (ISELB [4] (MOVDconst [1]) cmp)
+(GreaterThan cmp) => (ISELB [1] (MOVDconst [1]) cmp)
+(FGreaterThan cmp) => (ISELB [1] (MOVDconst [1]) cmp)
+(FGreaterEqual cmp) => (ISEL [2] (MOVDconst [1]) (ISELB [1] (MOVDconst [1]) cmp) cmp)
+(LessEqual cmp) => (ISELB [5] (MOVDconst [1]) cmp)
+
+(ISELB [0] _ (FlagLT)) => (MOVDconst [1])
+(ISELB [0] _ (Flag(GT|EQ))) => (MOVDconst [0])
+(ISELB [1] _ (FlagGT)) => (MOVDconst [1])
+(ISELB [1] _ (Flag(LT|EQ))) => (MOVDconst [0])
+(ISELB [2] _ (FlagEQ)) => (MOVDconst [1])
+(ISELB [2] _ (Flag(LT|GT))) => (MOVDconst [0])
+(ISELB [4] _ (FlagLT)) => (MOVDconst [0])
+(ISELB [4] _ (Flag(GT|EQ))) => (MOVDconst [1])
+(ISELB [5] _ (FlagGT)) => (MOVDconst [0])
+(ISELB [5] _ (Flag(LT|EQ))) => (MOVDconst [1])
+(ISELB [6] _ (FlagEQ)) => (MOVDconst [0])
+(ISELB [6] _ (Flag(LT|GT))) => (MOVDconst [1])
+
+(ISEL [2] x _ (FlagEQ)) => x
+(ISEL [2] _ y (Flag(LT|GT))) => y
+
+(ISEL [6] _ y (FlagEQ)) => y
+(ISEL [6] x _ (Flag(LT|GT))) => x
+
+(ISEL [0] _ y (Flag(EQ|GT))) => y
+(ISEL [0] x _ (FlagLT)) => x
+
+(ISEL [5] _ x (Flag(EQ|LT))) => x
+(ISEL [5] y _ (FlagGT)) => y
+
+(ISEL [1] _ y (Flag(EQ|LT))) => y
+(ISEL [1] x _ (FlagGT)) => x
+
+(ISEL [4] x _ (Flag(EQ|GT))) => x
+(ISEL [4] _ y (FlagLT)) => y
+
+(ISELB [n] (MOVDconst [1]) (InvertFlags bool)) && n%4 == 0 => (ISELB [n+1] (MOVDconst [1]) bool)
+(ISELB [n] (MOVDconst [1]) (InvertFlags bool)) && n%4 == 1 => (ISELB [n-1] (MOVDconst [1]) bool)
+(ISELB [n] (MOVDconst [1]) (InvertFlags bool)) && n%4 == 2 => (ISELB [n] (MOVDconst [1]) bool)
+(ISEL [n] x y (InvertFlags bool)) && n%4 == 0 => (ISEL [n+1] x y bool)
+(ISEL [n] x y (InvertFlags bool)) && n%4 == 1 => (ISEL [n-1] x y bool)
+(ISEL [n] x y (InvertFlags bool)) && n%4 == 2 => (ISEL [n] x y bool)
+
+// A particular pattern seen in cgo code:
+(AND (MOVDconst [c]) x:(MOVBZload _ _)) => (ANDconst [c&0xFF] x)
+
+// floating point negative abs
+(FNEG (FABS x)) => (FNABS x)
+(FNEG (FNABS x)) => (FABS x)
+
+// floating-point fused multiply-add/sub
+(FADD (FMUL x y) z) => (FMADD x y z)
+(FSUB (FMUL x y) z) => (FMSUB x y z)
+(FADDS (FMULS x y) z) => (FMADDS x y z)
+(FSUBS (FMULS x y) z) => (FMSUBS x y z)
+
+
+// The following rules match the byte-at-a-time loads and stores emitted for the
+// encoding/binary functions UintXX (load) and PutUintXX (store) and replace them with
+// the single largest possible load or store.
+// Rules are marked big or little endian based on the order in which the bytes are loaded or stored,
+// not on the endianness of the machine. They are intended for little endian machines.
+// To implement them for big endian machines, most rules would have to be duplicated with the
+// resulting rule reversed, i.e. MOVHZload on little endian would be MOVHBRload on big endian,
+// and vice versa.
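+// For illustration, the load rules target Go source shaped like encoding/binary's
+// little-endian Uint16 (a sketch of the stdlib pattern, not a new rule):
+//
+//	func Uint16(b []byte) uint16 { return uint16(b[0]) | uint16(b[1])<<8 }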
+// b[0] | b[1]<<8 => load 16-bit Little endian
+(OR <t> x0:(MOVBZload [i0] {s} p mem)
+ o1:(SL(W|D)const x1:(MOVBZload [i1] {s} p mem) [8]))
+ && !config.BigEndian
+ && i1 == i0+1
+ && x0.Uses ==1 && x1.Uses == 1
+ && o1.Uses == 1
+ && mergePoint(b, x0, x1) != nil
+ && clobber(x0, x1, o1)
+ => @mergePoint(b,x0,x1) (MOVHZload <t> {s} [i0] p mem)
+
+// b[0]<<8 | b[1] => load 16-bit Big endian on Little endian arch.
+// Use byte-reverse indexed load for 2 bytes.
+(OR <t> x0:(MOVBZload [i1] {s} p mem)
+ o1:(SL(W|D)const x1:(MOVBZload [i0] {s} p mem) [8]))
+ && !config.BigEndian
+ && i1 == i0+1
+ && x0.Uses ==1 && x1.Uses == 1
+ && o1.Uses == 1
+ && mergePoint(b, x0, x1) != nil
+ && clobber(x0, x1, o1)
+ => @mergePoint(b,x0,x1) (MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem)
+
+// b[0]<<(n+8) | b[1]<<n => load 16-bit Big endian (where n%8 == 0)
+// Use byte-reverse indexed load for 2 bytes,
+// then shift left to the correct position. Used to match subrules
+// from longer rules.
+(OR <t> s0:(SL(W|D)const x0:(MOVBZload [i1] {s} p mem) [n1])
+ s1:(SL(W|D)const x1:(MOVBZload [i0] {s} p mem) [n2]))
+ && !config.BigEndian
+ && i1 == i0+1
+ && n1%8 == 0
+ && n2 == n1+8
+ && x0.Uses == 1 && x1.Uses == 1
+ && s0.Uses == 1 && s1.Uses == 1
+ && mergePoint(b, x0, x1) != nil
+ && clobber(x0, x1, s0, s1)
+ => @mergePoint(b,x0,x1) (SLDconst <t> (MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [n1])
+
+// b[0] | b[1]<<8 | b[2]<<16 | b[3]<<24 => load 32-bit Little endian
+// Use byte-reverse indexed load for 4 bytes.
+(OR <t> s1:(SL(W|D)const x2:(MOVBZload [i3] {s} p mem) [24])
+ o0:(OR <t> s0:(SL(W|D)const x1:(MOVBZload [i2] {s} p mem) [16])
+ x0:(MOVHZload [i0] {s} p mem)))
+ && !config.BigEndian
+ && i2 == i0+2
+ && i3 == i0+3
+ && x0.Uses ==1 && x1.Uses == 1 && x2.Uses == 1
+ && o0.Uses == 1
+ && s0.Uses == 1 && s1.Uses == 1
+ && mergePoint(b, x0, x1, x2) != nil
+ && clobber(x0, x1, x2, s0, s1, o0)
+ => @mergePoint(b,x0,x1,x2) (MOVWZload <t> {s} [i0] p mem)
+
+// b[0]<<24 | b[1]<<16 | b[2]<<8 | b[3] => load 32-bit Big endian order on Little endian arch
+// Use byte-reverse indexed load for 4 bytes with computed address.
+// Could be used to match subrules of a longer rule.
+(OR <t> s1:(SL(W|D)const x2:(MOVBZload [i0] {s} p mem) [24])
+ o0:(OR <t> s0:(SL(W|D)const x1:(MOVBZload [i1] {s} p mem) [16])
+ x0:(MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i2] {s} p) mem)))
+ && !config.BigEndian
+ && i1 == i0+1
+ && i2 == i0+2
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && o0.Uses == 1
+ && s0.Uses == 1 && s1.Uses == 1
+ && mergePoint(b, x0, x1, x2) != nil
+ && clobber(x0, x1, x2, s0, s1, o0)
+ => @mergePoint(b,x0,x1,x2) (MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem)
+
+// b[3] | b[2]<<8 | b[1]<<16 | b[0]<<24 => load 32-bit Big endian order on Little endian arch
+// Use byte-reverse indexed load for 4 bytes with computed address.
+// Could be used to match subrules of a longer rule.
+(OR <t> x0:(MOVBZload [i3] {s} p mem)
+ o0:(OR <t> s0:(SL(W|D)const x1:(MOVBZload [i2] {s} p mem) [8])
+ s1:(SL(W|D)const x2:(MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [16])))
+ && !config.BigEndian
+ && i2 == i0+2
+ && i3 == i0+3
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && o0.Uses == 1
+ && s0.Uses == 1 && s1.Uses == 1
+ && mergePoint(b, x0, x1, x2) != nil
+ && clobber(x0, x1, x2, s0, s1, o0)
+ => @mergePoint(b,x0,x1,x2) (MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem)
+
+// b[0]<<56 | b[1]<<48 | b[2]<<40 | b[3]<<32 => load 32-bit Big endian order on Little endian arch
+// Use byte-reverse indexed load for 4 bytes with computed address.
+// Used to match longer rules.
+(OR <t> s2:(SLDconst x2:(MOVBZload [i3] {s} p mem) [32])
+ o0:(OR <t> s1:(SLDconst x1:(MOVBZload [i2] {s} p mem) [40])
+ s0:(SLDconst x0:(MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [48])))
+ && !config.BigEndian
+ && i2 == i0+2
+ && i3 == i0+3
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && o0.Uses == 1
+ && s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1
+ && mergePoint(b, x0, x1, x2) != nil
+ && clobber(x0, x1, x2, s0, s1, s2, o0)
+ => @mergePoint(b,x0,x1,x2) (SLDconst <t> (MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [32])
+
+// b[3]<<32 | b[2]<<40 | b[1]<<48 | b[0]<<56 => load 32-bit Big endian order on Little endian arch
+// Use byte-reverse indexed load for 4 bytes with constant address.
+// Used to match longer rules.
+(OR <t> s2:(SLDconst x2:(MOVBZload [i0] {s} p mem) [56])
+ o0:(OR <t> s1:(SLDconst x1:(MOVBZload [i1] {s} p mem) [48])
+ s0:(SLDconst x0:(MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i2] {s} p) mem) [32])))
+ && !config.BigEndian
+ && i1 == i0+1
+ && i2 == i0+2
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && o0.Uses == 1
+ && s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1
+ && mergePoint(b, x0, x1, x2) != nil
+ && clobber(x0, x1, x2, s0, s1, s2, o0)
+ => @mergePoint(b,x0,x1,x2) (SLDconst <t> (MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [32])
+
+// b[0] | b[1]<<8 | b[2]<<16 | b[3]<<24 | b[4]<<32 | b[5]<<40 | b[6]<<48 | b[7]<<56 => load 64-bit Little endian
+// Rules with commutative ops and many operands will result in extremely large functions in rewritePPC64,
+// so matching shorter previously defined subrules is important.
+// Offset must be multiple of 4 for MOVD
+(OR <t> s6:(SLDconst x7:(MOVBZload [i7] {s} p mem) [56])
+ o5:(OR <t> s5:(SLDconst x6:(MOVBZload [i6] {s} p mem) [48])
+ o4:(OR <t> s4:(SLDconst x5:(MOVBZload [i5] {s} p mem) [40])
+ o3:(OR <t> s3:(SLDconst x4:(MOVBZload [i4] {s} p mem) [32])
+ x0:(MOVWZload {s} [i0] p mem)))))
+ && !config.BigEndian
+ && i0%4 == 0
+ && i4 == i0+4
+ && i5 == i0+5
+ && i6 == i0+6
+ && i7 == i0+7
+ && x0.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses ==1 && x7.Uses == 1
+ && o3.Uses == 1 && o4.Uses == 1 && o5.Uses == 1
+ && s3.Uses == 1 && s4.Uses == 1 && s5.Uses == 1 && s6.Uses == 1
+ && mergePoint(b, x0, x4, x5, x6, x7) != nil
+ && clobber(x0, x4, x5, x6, x7, s3, s4, s5, s6, o3, o4, o5)
+ => @mergePoint(b,x0,x4,x5,x6,x7) (MOVDload <t> {s} [i0] p mem)
+
+// b[7] | b[6]<<8 | b[5]<<16 | b[4]<<24 | b[3]<<32 | b[2]<<40 | b[1]<<48 | b[0]<<56 => load 64-bit Big endian ordered bytes on Little endian arch
+// Use byte-reverse indexed load of 8 bytes.
+// Rules with commutative ops and many operands can result in extremely large functions in rewritePPC64,
+// so matching shorter previously defined subrules is important.
+(OR <t> s0:(SLDconst x0:(MOVBZload [i0] {s} p mem) [56])
+ o0:(OR <t> s1:(SLDconst x1:(MOVBZload [i1] {s} p mem) [48])
+ o1:(OR <t> s2:(SLDconst x2:(MOVBZload [i2] {s} p mem) [40])
+ o2:(OR <t> s3:(SLDconst x3:(MOVBZload [i3] {s} p mem) [32])
+ x4:(MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i4] p) mem)))))
+ && !config.BigEndian
+ && i1 == i0+1
+ && i2 == i0+2
+ && i3 == i0+3
+ && i4 == i0+4
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1
+ && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1
+ && s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1
+ && mergePoint(b, x0, x1, x2, x3, x4) != nil
+ && clobber(x0, x1, x2, x3, x4, o0, o1, o2, s0, s1, s2, s3)
+ => @mergePoint(b,x0,x1,x2,x3,x4) (MOVDBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem)
+
+// b[0]<<56 | b[1]<<48 | b[2]<<40 | b[3]<<32 | b[4]<<24 | b[5]<<16 | b[6]<<8 | b[7] => load 64-bit Big endian ordered bytes on Little endian arch
+// Use byte-reverse indexed load of 8 bytes.
+// Rules with commutative ops and many operands can result in extremely large functions in rewritePPC64,
+// so matching shorter previously defined subrules is important.
+(OR <t> x7:(MOVBZload [i7] {s} p mem)
+ o5:(OR <t> s6:(SLDconst x6:(MOVBZload [i6] {s} p mem) [8])
+ o4:(OR <t> s5:(SLDconst x5:(MOVBZload [i5] {s} p mem) [16])
+ o3:(OR <t> s4:(SLDconst x4:(MOVBZload [i4] {s} p mem) [24])
+ s0:(SL(W|D)const x3:(MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [32])))))
+ && !config.BigEndian
+ && i4 == i0+4
+ && i5 == i0+5
+ && i6 == i0+6
+ && i7 == i0+7
+ && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1
+ && o3.Uses == 1 && o4.Uses == 1 && o5.Uses == 1
+ && s0.Uses == 1 && s4.Uses == 1 && s5.Uses == 1 && s6.Uses == 1
+ && mergePoint(b, x3, x4, x5, x6, x7) != nil
+ && clobber(x3, x4, x5, x6, x7, o3, o4, o5, s0, s4, s5, s6)
+ => @mergePoint(b,x3,x4,x5,x6,x7) (MOVDBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem)
+
+// 2 byte store Little endian as in:
+// b[0] = byte(v >> 16)
+// b[1] = byte(v >> 24)
+// Added for use in matching longer rules.
+(MOVBstore [i1] {s} p (SR(W|D)const w [24])
+ x0:(MOVBstore [i0] {s} p (SR(W|D)const w [16]) mem))
+ && !config.BigEndian
+ && x0.Uses == 1
+ && i1 == i0+1
+ && clobber(x0)
+ => (MOVHstore [i0] {s} p (SRWconst <typ.UInt16> w [16]) mem)
+
+// 2 byte store Little endian as in:
+// b[0] = byte(v)
+// b[1] = byte(v >> 8)
+(MOVBstore [i1] {s} p (SR(W|D)const w [8])
+ x0:(MOVBstore [i0] {s} p w mem))
+ && !config.BigEndian
+ && x0.Uses == 1
+ && i1 == i0+1
+ && clobber(x0)
+ => (MOVHstore [i0] {s} p w mem)
+
+// 4 byte store Little endian as in:
+// b[0:1] = uint16(v)
+// b[2:3] = uint16(v >> 16)
+(MOVHstore [i1] {s} p (SR(W|D)const w [16])
+ x0:(MOVHstore [i0] {s} p w mem))
+ && !config.BigEndian
+ && x0.Uses == 1
+ && i1 == i0+2
+ && clobber(x0)
+ => (MOVWstore [i0] {s} p w mem)
+
+// 4 byte store Big endian as in:
+// b[0] = byte(v >> 24)
+// b[1] = byte(v >> 16)
+// b[2] = byte(v >> 8)
+// b[3] = byte(v)
+// Use byte-reverse indexed 4 byte store.
+(MOVBstore [i3] {s} p w
+ x0:(MOVBstore [i2] {s} p (SRWconst w [8])
+ x1:(MOVBstore [i1] {s} p (SRWconst w [16])
+ x2:(MOVBstore [i0] {s} p (SRWconst w [24]) mem))))
+ && !config.BigEndian
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+ && i1 == i0+1 && i2 == i0+2 && i3 == i0+3
+ && clobber(x0, x1, x2)
+ => (MOVWBRstore (MOVDaddr <typ.Uintptr> [i0] {s} p) w mem)
+
+// The 2 byte store appears after the 4 byte store so that the
+// match for the 2 byte store is not done first.
+// If the 4 byte store is based on the 2 byte store then there are
+// variations on the MOVDaddr subrule that would require additional
+// rules to be written.
+
+// 2 byte store Big endian as in:
+// b[0] = byte(v >> 8)
+// b[1] = byte(v)
+(MOVBstore [i1] {s} p w x0:(MOVBstore [i0] {s} p (SRWconst w [8]) mem))
+ && !config.BigEndian
+ && x0.Uses == 1
+ && i1 == i0+1
+ && clobber(x0)
+ => (MOVHBRstore (MOVDaddr <typ.Uintptr> [i0] {s} p) w mem)
+
+// 8 byte store Little endian as in:
+// b[0] = byte(v)
+// b[1] = byte(v >> 8)
+// b[2] = byte(v >> 16)
+// b[3] = byte(v >> 24)
+// b[4] = byte(v >> 32)
+// b[5] = byte(v >> 40)
+// b[6] = byte(v >> 48)
+// b[7] = byte(v >> 56)
+// Built on previously defined rules
+// Offset must be multiple of 4 for MOVDstore
+(MOVBstore [i7] {s} p (SRDconst w [56])
+ x0:(MOVBstore [i6] {s} p (SRDconst w [48])
+ x1:(MOVBstore [i5] {s} p (SRDconst w [40])
+ x2:(MOVBstore [i4] {s} p (SRDconst w [32])
+ x3:(MOVWstore [i0] {s} p w mem)))))
+ && !config.BigEndian
+ && i0%4 == 0
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1
+ && i4 == i0+4 && i5 == i0+5 && i6 == i0+6 && i7 == i0+7
+ && clobber(x0, x1, x2, x3)
+ => (MOVDstore [i0] {s} p w mem)
+
+// 8 byte store Big endian as in:
+// b[0] = byte(v >> 56)
+// b[1] = byte(v >> 48)
+// b[2] = byte(v >> 40)
+// b[3] = byte(v >> 32)
+// b[4] = byte(v >> 24)
+// b[5] = byte(v >> 16)
+// b[6] = byte(v >> 8)
+// b[7] = byte(v)
+// Use byte-reverse indexed 8 byte store.
+(MOVBstore [i7] {s} p w
+ x0:(MOVBstore [i6] {s} p (SRDconst w [8])
+ x1:(MOVBstore [i5] {s} p (SRDconst w [16])
+ x2:(MOVBstore [i4] {s} p (SRDconst w [24])
+ x3:(MOVBstore [i3] {s} p (SRDconst w [32])
+ x4:(MOVBstore [i2] {s} p (SRDconst w [40])
+ x5:(MOVBstore [i1] {s} p (SRDconst w [48])
+ x6:(MOVBstore [i0] {s} p (SRDconst w [56]) mem))))))))
+ && !config.BigEndian
+ && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1
+ && i1 == i0+1 && i2 == i0+2 && i3 == i0+3 && i4 == i0+4 && i5 == i0+5 && i6 == i0+6 && i7 == i0+7
+ && clobber(x0, x1, x2, x3, x4, x5, x6)
+ => (MOVDBRstore (MOVDaddr <typ.Uintptr> [i0] {s} p) w mem)
diff --git a/src/cmd/compile/internal/ssa/gen/PPC64Ops.go b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
new file mode 100644
index 0000000..f7198b9
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
@@ -0,0 +1,717 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+import "strings"
+
+// Notes:
+// - Less-than-64-bit integer types live in the low portion of registers.
+// For now, the upper portion is junk; sign/zero-extension might be optimized in the future, but not yet.
+// - Boolean types are zero or 1; stored in a byte, but loaded with AMOVBZ so the upper bytes of a register are zero.
+// - *const instructions may use a constant larger than the instruction can encode.
+// In this case the assembler expands to multiple instructions and uses tmp
+// register (R31).
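+// For illustration (assumed expansion; the exact sequence depends on the assembler): an
+// ADDconst whose auxInt does not fit in 16 bits may be materialized into R31 first and
+// then combined with the register-register form.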
+
+var regNamesPPC64 = []string{
+ "R0", // REGZERO, not used, but simplifies counting in regalloc
+ "SP", // REGSP
+ "SB", // REGSB
+ "R3",
+ "R4",
+ "R5",
+ "R6",
+ "R7",
+ "R8",
+ "R9",
+ "R10",
+ "R11", // REGCTXT for closures
+ "R12",
+ "R13", // REGTLS
+ "R14",
+ "R15",
+ "R16",
+ "R17",
+ "R18",
+ "R19",
+ "R20",
+ "R21",
+ "R22",
+ "R23",
+ "R24",
+ "R25",
+ "R26",
+ "R27",
+ "R28",
+ "R29",
+ "g", // REGG. Using name "g" and setting Config.hasGReg makes it "just happen".
+ "R31", // REGTMP
+
+ "F0",
+ "F1",
+ "F2",
+ "F3",
+ "F4",
+ "F5",
+ "F6",
+ "F7",
+ "F8",
+ "F9",
+ "F10",
+ "F11",
+ "F12",
+ "F13",
+ "F14",
+ "F15",
+ "F16",
+ "F17",
+ "F18",
+ "F19",
+ "F20",
+ "F21",
+ "F22",
+ "F23",
+ "F24",
+ "F25",
+ "F26",
+ "F27",
+ "F28",
+ "F29",
+ "F30",
+ "F31",
+
+ // If you add registers, update asyncPreempt in runtime.
+
+ // "CR0",
+ // "CR1",
+ // "CR2",
+ // "CR3",
+ // "CR4",
+ // "CR5",
+ // "CR6",
+ // "CR7",
+
+ // "CR",
+ // "XER",
+ // "LR",
+ // "CTR",
+}
+
+func init() {
+ // Make map from reg names to reg integers.
+ if len(regNamesPPC64) > 64 {
+ panic("too many registers")
+ }
+ num := map[string]int{}
+ for i, name := range regNamesPPC64 {
+ num[name] = i
+ }
+ buildReg := func(s string) regMask {
+ m := regMask(0)
+ for _, r := range strings.Split(s, " ") {
+ if n, ok := num[r]; ok {
+ m |= regMask(1) << uint(n)
+ continue
+ }
+ panic("register " + r + " not found")
+ }
+ return m
+ }
+
+ var (
+ gp = buildReg("R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29")
+ fp = buildReg("F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26")
+ sp = buildReg("SP")
+ sb = buildReg("SB")
+ gr = buildReg("g")
+ // cr = buildReg("CR")
+ // ctr = buildReg("CTR")
+ // lr = buildReg("LR")
+ tmp = buildReg("R31")
+ ctxt = buildReg("R11")
+ callptr = buildReg("R12")
+ // tls = buildReg("R13")
+ gp01 = regInfo{inputs: nil, outputs: []regMask{gp}}
+ gp11 = regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{gp}}
+ gp21 = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp}}
+ gp21a0 = regInfo{inputs: []regMask{gp, gp | sp | sb}, outputs: []regMask{gp}}
+ gp31 = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp}}
+ gp22 = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp, gp}}
+ gp32 = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp, gp}}
+ gp1cr = regInfo{inputs: []regMask{gp | sp | sb}}
+ gp2cr = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}}
+ crgp = regInfo{inputs: nil, outputs: []regMask{gp}}
+ crgp11 = regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}
+ crgp21 = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}}
+ gpload = regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{gp}}
+ gploadidx = regInfo{inputs: []regMask{gp | sp | sb, gp}, outputs: []regMask{gp}}
+ gpstore = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}}
+ gpstoreidx = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb, gp | sp | sb}}
+ gpstorezero = regInfo{inputs: []regMask{gp | sp | sb}} // ppc64.REGZERO is reserved zero value
+ gpxchg = regInfo{inputs: []regMask{gp | sp | sb, gp}, outputs: []regMask{gp}}
+ gpcas = regInfo{inputs: []regMask{gp | sp | sb, gp, gp}, outputs: []regMask{gp}}
+ fp01 = regInfo{inputs: nil, outputs: []regMask{fp}}
+ fp11 = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}}
+ fpgp = regInfo{inputs: []regMask{fp}, outputs: []regMask{gp}}
+ gpfp = regInfo{inputs: []regMask{gp}, outputs: []regMask{fp}}
+ fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
+ fp31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: []regMask{fp}}
+ fp2cr = regInfo{inputs: []regMask{fp, fp}}
+ fpload = regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{fp}}
+ fploadidx = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}, outputs: []regMask{fp}}
+ fpstore = regInfo{inputs: []regMask{gp | sp | sb, fp}}
+ fpstoreidx = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb, fp}}
+ callerSave = regMask(gp | fp | gr)
+ r3 = buildReg("R3")
+ r4 = buildReg("R4")
+ r5 = buildReg("R5")
+ r6 = buildReg("R6")
+ )
+ ops := []opData{
+ {name: "ADD", argLength: 2, reg: gp21, asm: "ADD", commutative: true}, // arg0 + arg1
+ {name: "ADDconst", argLength: 1, reg: gp11, asm: "ADD", aux: "Int64"}, // arg0 + auxInt
+ {name: "FADD", argLength: 2, reg: fp21, asm: "FADD", commutative: true}, // arg0+arg1
+ {name: "FADDS", argLength: 2, reg: fp21, asm: "FADDS", commutative: true}, // arg0+arg1
+ {name: "SUB", argLength: 2, reg: gp21, asm: "SUB"}, // arg0-arg1
+ {name: "SUBFCconst", argLength: 1, reg: gp11, asm: "SUBC", aux: "Int64"}, // auxInt - arg0 (with carry)
+ {name: "FSUB", argLength: 2, reg: fp21, asm: "FSUB"}, // arg0-arg1
+ {name: "FSUBS", argLength: 2, reg: fp21, asm: "FSUBS"}, // arg0-arg1
+
+ {name: "MULLD", argLength: 2, reg: gp21, asm: "MULLD", typ: "Int64", commutative: true}, // arg0*arg1 (signed 64-bit)
+ {name: "MULLW", argLength: 2, reg: gp21, asm: "MULLW", typ: "Int32", commutative: true}, // arg0*arg1 (signed 32-bit)
+ {name: "MULLDconst", argLength: 1, reg: gp11, asm: "MULLD", aux: "Int32", typ: "Int64"}, // arg0*auxInt (signed 64-bit)
+ {name: "MULLWconst", argLength: 1, reg: gp11, asm: "MULLW", aux: "Int32", typ: "Int64"}, // arg0*auxInt (signed 64-bit)
+ {name: "MADDLD", argLength: 3, reg: gp31, asm: "MADDLD", typ: "Int64"}, // (arg0*arg1)+arg2 (signed 64-bit)
+
+ {name: "MULHD", argLength: 2, reg: gp21, asm: "MULHD", commutative: true}, // (arg0 * arg1) >> 64, signed
+ {name: "MULHW", argLength: 2, reg: gp21, asm: "MULHW", commutative: true}, // (arg0 * arg1) >> 32, signed
+ {name: "MULHDU", argLength: 2, reg: gp21, asm: "MULHDU", commutative: true}, // (arg0 * arg1) >> 64, unsigned
+ {name: "MULHWU", argLength: 2, reg: gp21, asm: "MULHWU", commutative: true}, // (arg0 * arg1) >> 32, unsigned
+ {name: "LoweredMuluhilo", argLength: 2, reg: gp22, resultNotInArgs: true}, // arg0 * arg1, returns (hi, lo)
+
+ {name: "FMUL", argLength: 2, reg: fp21, asm: "FMUL", commutative: true}, // arg0*arg1
+ {name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true}, // arg0*arg1
+
+ {name: "FMADD", argLength: 3, reg: fp31, asm: "FMADD"}, // arg0*arg1 + arg2
+ {name: "FMADDS", argLength: 3, reg: fp31, asm: "FMADDS"}, // arg0*arg1 + arg2
+ {name: "FMSUB", argLength: 3, reg: fp31, asm: "FMSUB"}, // arg0*arg1 - arg2
+ {name: "FMSUBS", argLength: 3, reg: fp31, asm: "FMSUBS"}, // arg0*arg1 - arg2
+
+ {name: "SRAD", argLength: 2, reg: gp21, asm: "SRAD"}, // signed arg0 >> (arg1&127), 64 bit width (note: 127, not 63!)
+ {name: "SRAW", argLength: 2, reg: gp21, asm: "SRAW"}, // signed arg0 >> (arg1&63), 32 bit width
+ {name: "SRD", argLength: 2, reg: gp21, asm: "SRD"}, // unsigned arg0 >> (arg1&127), 64 bit width
+ {name: "SRW", argLength: 2, reg: gp21, asm: "SRW"}, // unsigned arg0 >> (arg1&63), 32 bit width
+ {name: "SLD", argLength: 2, reg: gp21, asm: "SLD"}, // arg0 << (arg1&127), 64 bit width
+ {name: "SLW", argLength: 2, reg: gp21, asm: "SLW"}, // arg0 << (arg1&63), 32 bit width
+
+ {name: "ROTL", argLength: 2, reg: gp21, asm: "ROTL"}, // arg0 rotate left by arg1 mod 64
+ {name: "ROTLW", argLength: 2, reg: gp21, asm: "ROTLW"}, // uint32(arg0) rotate left by arg1 mod 32
+ // The following are ops to implement the extended mnemonics for shifts as described in section C.8 of the ISA.
+ // The constant shift values are packed into the aux int32.
+ {name: "RLDICL", argLength: 1, reg: gp11, asm: "RLDICL", aux: "Int32"}, // arg0 extract bits identified by shift params
+ {name: "CLRLSLWI", argLength: 1, reg: gp11, asm: "CLRLSLWI", aux: "Int32"}, //
+ {name: "CLRLSLDI", argLength: 1, reg: gp11, asm: "CLRLSLDI", aux: "Int32"}, //
+
+ {name: "LoweredAdd64Carry", argLength: 3, reg: gp32, resultNotInArgs: true}, // arg0 + arg1 + carry, returns (sum, carry)
+
+ {name: "SRADconst", argLength: 1, reg: gp11, asm: "SRAD", aux: "Int64"}, // signed arg0 >> auxInt, 0 <= auxInt < 64, 64 bit width
+ {name: "SRAWconst", argLength: 1, reg: gp11, asm: "SRAW", aux: "Int64"}, // signed arg0 >> auxInt, 0 <= auxInt < 32, 32 bit width
+ {name: "SRDconst", argLength: 1, reg: gp11, asm: "SRD", aux: "Int64"}, // unsigned arg0 >> auxInt, 0 <= auxInt < 64, 64 bit width
+ {name: "SRWconst", argLength: 1, reg: gp11, asm: "SRW", aux: "Int64"}, // unsigned arg0 >> auxInt, 0 <= auxInt < 32, 32 bit width
+ {name: "SLDconst", argLength: 1, reg: gp11, asm: "SLD", aux: "Int64"}, // arg0 << auxInt, 0 <= auxInt < 64, 64 bit width
+ {name: "SLWconst", argLength: 1, reg: gp11, asm: "SLW", aux: "Int64"}, // arg0 << auxInt, 0 <= auxInt < 32, 32 bit width
+
+ {name: "ROTLconst", argLength: 1, reg: gp11, asm: "ROTL", aux: "Int64"}, // arg0 rotate left by auxInt bits
+ {name: "ROTLWconst", argLength: 1, reg: gp11, asm: "ROTLW", aux: "Int64"}, // uint32(arg0) rotate left by auxInt bits
+ {name: "EXTSWSLconst", argLength: 1, reg: gp11, asm: "EXTSWSLI", aux: "Int64"},
+
+ {name: "RLWINM", argLength: 1, reg: gp11, asm: "RLWNM", aux: "Int64"}, // Rotate and mask by immediate "rlwinm". encodePPC64RotateMask describes aux
+ {name: "RLWNM", argLength: 2, reg: gp21, asm: "RLWNM", aux: "Int64"}, // Rotate and mask by "rlwnm". encodePPC64RotateMask describes aux
+ {name: "RLWMI", argLength: 2, reg: gp21a0, asm: "RLWMI", aux: "Int64", resultInArg0: true}, // "rlwimi" similar aux encoding as above
+
+ {name: "CNTLZD", argLength: 1, reg: gp11, asm: "CNTLZD", clobberFlags: true}, // count leading zeros
+ {name: "CNTLZW", argLength: 1, reg: gp11, asm: "CNTLZW", clobberFlags: true}, // count leading zeros (32 bit)
+
+ {name: "CNTTZD", argLength: 1, reg: gp11, asm: "CNTTZD"}, // count trailing zeros
+ {name: "CNTTZW", argLength: 1, reg: gp11, asm: "CNTTZW"}, // count trailing zeros (32 bit)
+
+ {name: "POPCNTD", argLength: 1, reg: gp11, asm: "POPCNTD"}, // number of set bits in arg0
+ {name: "POPCNTW", argLength: 1, reg: gp11, asm: "POPCNTW"}, // number of set bits in each word of arg0 placed in corresponding word
+ {name: "POPCNTB", argLength: 1, reg: gp11, asm: "POPCNTB"}, // number of set bits in each byte of arg0 placed in corresponding byte
+
+ {name: "FDIV", argLength: 2, reg: fp21, asm: "FDIV"}, // arg0/arg1
+ {name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS"}, // arg0/arg1
+
+ {name: "DIVD", argLength: 2, reg: gp21, asm: "DIVD", typ: "Int64"}, // arg0/arg1 (signed 64-bit)
+ {name: "DIVW", argLength: 2, reg: gp21, asm: "DIVW", typ: "Int32"}, // arg0/arg1 (signed 32-bit)
+ {name: "DIVDU", argLength: 2, reg: gp21, asm: "DIVDU", typ: "Int64"}, // arg0/arg1 (unsigned 64-bit)
+ {name: "DIVWU", argLength: 2, reg: gp21, asm: "DIVWU", typ: "Int32"}, // arg0/arg1 (unsigned 32-bit)
+
+ {name: "MODUD", argLength: 2, reg: gp21, asm: "MODUD", typ: "UInt64"}, // arg0 % arg1 (unsigned 64-bit)
+ {name: "MODSD", argLength: 2, reg: gp21, asm: "MODSD", typ: "Int64"}, // arg0 % arg1 (signed 64-bit)
+ {name: "MODUW", argLength: 2, reg: gp21, asm: "MODUW", typ: "UInt32"}, // arg0 % arg1 (unsigned 32-bit)
+ {name: "MODSW", argLength: 2, reg: gp21, asm: "MODSW", typ: "Int32"}, // arg0 % arg1 (signed 32-bit)
+ // MOD is implemented as rem := arg0 - (arg0/arg1) * arg1
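+ // A rough sketch of that identity in plain Go (illustrative only, not a
+ // compiler helper); it is just the definition of remainder under the
+ // truncated division that Go and PPC64 use:
+ //	func mod(a, b int64) int64 {
+ //		return a - (a/b)*b // same result as Go's % operator
+ //	}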
+
+ // Conversions are all float-to-float register operations. "Integer" refers to encoding in the FP register.
+ {name: "FCTIDZ", argLength: 1, reg: fp11, asm: "FCTIDZ", typ: "Float64"}, // convert float to 64-bit int round towards zero
+ {name: "FCTIWZ", argLength: 1, reg: fp11, asm: "FCTIWZ", typ: "Float64"}, // convert float to 32-bit int round towards zero
+ {name: "FCFID", argLength: 1, reg: fp11, asm: "FCFID", typ: "Float64"}, // convert 64-bit integer to float
+ {name: "FCFIDS", argLength: 1, reg: fp11, asm: "FCFIDS", typ: "Float32"}, // convert 32-bit integer to float
+ {name: "FRSP", argLength: 1, reg: fp11, asm: "FRSP", typ: "Float64"}, // round float to 32-bit value
+
+ // Movement between float and integer registers with no change in bits; accomplished with stores+loads on PPC.
+ // Because the 32-bit load-literal-bits instructions have impoverished addressability, always widen the
+ // data and use FMOVDload and FMOVDstore instead (this will also dodge endianness issues).
+ // There are optimizations that should apply -- (Xi2f64 (MOVWload (not-ADD-ptr+offset) ) ) could use
+ // the word-load instructions. (Xi2f64 (MOVDload ptr )) can be (FMOVDload ptr)
+
+ {name: "MFVSRD", argLength: 1, reg: fpgp, asm: "MFVSRD", typ: "Int64"}, // move 64 bits of F register into G register
+ {name: "MTVSRD", argLength: 1, reg: gpfp, asm: "MTVSRD", typ: "Float64"}, // move 64 bits of G register into F register
+
+ {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0&arg1
+ {name: "ANDN", argLength: 2, reg: gp21, asm: "ANDN"}, // arg0&^arg1
+ {name: "ANDCC", argLength: 2, reg: gp2cr, asm: "ANDCC", commutative: true, typ: "Flags"}, // arg0&arg1 sets CC
+ {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0|arg1
+ {name: "ORN", argLength: 2, reg: gp21, asm: "ORN"}, // arg0|^arg1
+ {name: "ORCC", argLength: 2, reg: gp2cr, asm: "ORCC", commutative: true, typ: "Flags"}, // arg0|arg1 sets CC
+ {name: "NOR", argLength: 2, reg: gp21, asm: "NOR", commutative: true}, // ^(arg0|arg1)
+ {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", typ: "Int64", commutative: true}, // arg0^arg1
+ {name: "XORCC", argLength: 2, reg: gp2cr, asm: "XORCC", commutative: true, typ: "Flags"}, // arg0^arg1 sets CC
+ {name: "EQV", argLength: 2, reg: gp21, asm: "EQV", typ: "Int64", commutative: true}, // arg0^^arg1
+ {name: "NEG", argLength: 1, reg: gp11, asm: "NEG"}, // -arg0 (integer)
+ {name: "FNEG", argLength: 1, reg: fp11, asm: "FNEG"}, // -arg0 (floating point)
+ {name: "FSQRT", argLength: 1, reg: fp11, asm: "FSQRT"}, // sqrt(arg0) (floating point)
+ {name: "FSQRTS", argLength: 1, reg: fp11, asm: "FSQRTS"}, // sqrt(arg0) (floating point, single precision)
+ {name: "FFLOOR", argLength: 1, reg: fp11, asm: "FRIM"}, // floor(arg0), float64
+ {name: "FCEIL", argLength: 1, reg: fp11, asm: "FRIP"}, // ceil(arg0), float64
+ {name: "FTRUNC", argLength: 1, reg: fp11, asm: "FRIZ"}, // trunc(arg0), float64
+ {name: "FROUND", argLength: 1, reg: fp11, asm: "FRIN"}, // round(arg0), float64
+ {name: "FABS", argLength: 1, reg: fp11, asm: "FABS"}, // abs(arg0), float64
+ {name: "FNABS", argLength: 1, reg: fp11, asm: "FNABS"}, // -abs(arg0), float64
+ {name: "FCPSGN", argLength: 2, reg: fp21, asm: "FCPSGN"}, // copysign arg0 -> arg1, float64
+
+ {name: "ORconst", argLength: 1, reg: gp11, asm: "OR", aux: "Int64"}, // arg0|aux
+ {name: "XORconst", argLength: 1, reg: gp11, asm: "XOR", aux: "Int64"}, // arg0^aux
+ {name: "ANDconst", argLength: 1, reg: regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{gp}}, asm: "ANDCC", aux: "Int64", clobberFlags: true}, // arg0&aux // and-immediate sets CC on PPC, always.
+ {name: "ANDCCconst", argLength: 1, reg: regInfo{inputs: []regMask{gp | sp | sb}}, asm: "ANDCC", aux: "Int64", typ: "Flags"}, // arg0&aux == 0 // and-immediate sets CC on PPC, always.
+
+ {name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB", typ: "Int64"}, // sign extend int8 to int64
+ {name: "MOVBZreg", argLength: 1, reg: gp11, asm: "MOVBZ", typ: "Int64"}, // zero extend uint8 to uint64
+ {name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVH", typ: "Int64"}, // sign extend int16 to int64
+ {name: "MOVHZreg", argLength: 1, reg: gp11, asm: "MOVHZ", typ: "Int64"}, // zero extend uint16 to uint64
+ {name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW", typ: "Int64"}, // sign extend int32 to int64
+ {name: "MOVWZreg", argLength: 1, reg: gp11, asm: "MOVWZ", typ: "Int64"}, // zero extend uint32 to uint64
+
+ // Load bytes in the endian order of the arch from arg0+aux+auxint into a 64 bit register.
+ {name: "MOVBZload", argLength: 2, reg: gpload, asm: "MOVBZ", aux: "SymOff", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load byte zero extend
+ {name: "MOVHload", argLength: 2, reg: gpload, asm: "MOVH", aux: "SymOff", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // load 2 bytes sign extend
+ {name: "MOVHZload", argLength: 2, reg: gpload, asm: "MOVHZ", aux: "SymOff", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load 2 bytes zero extend
+ {name: "MOVWload", argLength: 2, reg: gpload, asm: "MOVW", aux: "SymOff", typ: "Int32", faultOnNilArg0: true, symEffect: "Read"}, // load 4 bytes sign extend
+ {name: "MOVWZload", argLength: 2, reg: gpload, asm: "MOVWZ", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load 4 bytes zero extend
+ {name: "MOVDload", argLength: 2, reg: gpload, asm: "MOVD", aux: "SymOff", typ: "Int64", faultOnNilArg0: true, symEffect: "Read"}, // load 8 bytes
+
+ // Load bytes in reverse endian order of the arch from arg0 into a 64 bit register, all zero extend.
+ // The generated instructions are indexed loads with no offset field in the instruction so the aux fields are not used.
+ // In these cases the index register field is set to 0 and the full address is in the base register.
+ {name: "MOVDBRload", argLength: 2, reg: gpload, asm: "MOVDBR", aux: "SymOff", typ: "Int64", faultOnNilArg0: true, symEffect: "Read"}, // load 8 bytes reverse order
+ {name: "MOVWBRload", argLength: 2, reg: gpload, asm: "MOVWBR", aux: "SymOff", typ: "Int32", faultOnNilArg0: true, symEffect: "Read"}, // load 4 bytes zero extend reverse order
+ {name: "MOVHBRload", argLength: 2, reg: gpload, asm: "MOVHBR", aux: "SymOff", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // load 2 bytes zero extend reverse order
+
+ // In these cases an index register is used in addition to a base register
+ // Loads from memory location arg[0] + arg[1].
+ {name: "MOVBZloadidx", argLength: 3, reg: gploadidx, asm: "MOVBZ", typ: "UInt8"}, // zero extend uint8 to uint64
+ {name: "MOVHloadidx", argLength: 3, reg: gploadidx, asm: "MOVH", typ: "Int16"}, // sign extend int16 to int64
+ {name: "MOVHZloadidx", argLength: 3, reg: gploadidx, asm: "MOVHZ", typ: "UInt16"}, // zero extend uint16 to uint64
+ {name: "MOVWloadidx", argLength: 3, reg: gploadidx, asm: "MOVW", typ: "Int32"}, // sign extend int32 to int64
+ {name: "MOVWZloadidx", argLength: 3, reg: gploadidx, asm: "MOVWZ", typ: "UInt32"}, // zero extend uint32 to uint64
+ {name: "MOVDloadidx", argLength: 3, reg: gploadidx, asm: "MOVD", typ: "Int64"},
+ {name: "MOVHBRloadidx", argLength: 3, reg: gploadidx, asm: "MOVHBR", typ: "Int16"}, // sign extend int16 to int64
+ {name: "MOVWBRloadidx", argLength: 3, reg: gploadidx, asm: "MOVWBR", typ: "Int32"}, // sign extend int32 to int64
+ {name: "MOVDBRloadidx", argLength: 3, reg: gploadidx, asm: "MOVDBR", typ: "Int64"},
+ {name: "FMOVDloadidx", argLength: 3, reg: fploadidx, asm: "FMOVD", typ: "Float64"},
+ {name: "FMOVSloadidx", argLength: 3, reg: fploadidx, asm: "FMOVS", typ: "Float32"},
+
+ // Store bytes in the reverse endian order of the arch into arg0.
+ // These are indexed stores with no offset field in the instruction so the auxint fields are not used.
+ {name: "MOVDBRstore", argLength: 3, reg: gpstore, asm: "MOVDBR", aux: "Sym", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes reverse order
+ {name: "MOVWBRstore", argLength: 3, reg: gpstore, asm: "MOVWBR", aux: "Sym", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes reverse order
+ {name: "MOVHBRstore", argLength: 3, reg: gpstore, asm: "MOVHBR", aux: "Sym", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes reverse order
+
+ // Floating point loads from arg0+aux+auxint
+ {name: "FMOVDload", argLength: 2, reg: fpload, asm: "FMOVD", aux: "SymOff", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"}, // load double float
+ {name: "FMOVSload", argLength: 2, reg: fpload, asm: "FMOVS", aux: "SymOff", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"}, // load single float
+
+ // Store bytes in the endian order of the arch into arg0+aux+auxint
+ {name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store byte
+ {name: "MOVHstore", argLength: 3, reg: gpstore, asm: "MOVH", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes
+ {name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes
+ {name: "MOVDstore", argLength: 3, reg: gpstore, asm: "MOVD", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes
+
+ // Store floating point value into arg0+aux+auxint
+ {name: "FMOVDstore", argLength: 3, reg: fpstore, asm: "FMOVD", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store double float
+ {name: "FMOVSstore", argLength: 3, reg: fpstore, asm: "FMOVS", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store single float
+
+ // Stores using index and base registers
+ // Stores to arg[0] + arg[1]
+ {name: "MOVBstoreidx", argLength: 4, reg: gpstoreidx, asm: "MOVB", typ: "Mem"}, // store byte
+ {name: "MOVHstoreidx", argLength: 4, reg: gpstoreidx, asm: "MOVH", typ: "Mem"}, // store half word
+ {name: "MOVWstoreidx", argLength: 4, reg: gpstoreidx, asm: "MOVW", typ: "Mem"}, // store word
+ {name: "MOVDstoreidx", argLength: 4, reg: gpstoreidx, asm: "MOVD", typ: "Mem"}, // store double word
+ {name: "FMOVDstoreidx", argLength: 4, reg: fpstoreidx, asm: "FMOVD", typ: "Mem"}, // store double float
+ {name: "FMOVSstoreidx", argLength: 4, reg: fpstoreidx, asm: "FMOVS", typ: "Mem"}, // store single float
+ {name: "MOVHBRstoreidx", argLength: 4, reg: gpstoreidx, asm: "MOVHBR", typ: "Mem"}, // store half word reversed byte using index reg
+ {name: "MOVWBRstoreidx", argLength: 4, reg: gpstoreidx, asm: "MOVWBR", typ: "Mem"}, // store word reversed byte using index reg
+ {name: "MOVDBRstoreidx", argLength: 4, reg: gpstoreidx, asm: "MOVDBR", typ: "Mem"}, // store double word reversed byte using index reg
+
+ // The following ops store 0 into arg0+aux+auxint arg1=mem
+ {name: "MOVBstorezero", argLength: 2, reg: gpstorezero, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store zero 1 byte
+ {name: "MOVHstorezero", argLength: 2, reg: gpstorezero, asm: "MOVH", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store zero 2 bytes
+ {name: "MOVWstorezero", argLength: 2, reg: gpstorezero, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store zero 4 bytes
+ {name: "MOVDstorezero", argLength: 2, reg: gpstorezero, asm: "MOVD", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store zero 8 bytes
+
+ {name: "MOVDaddr", argLength: 1, reg: regInfo{inputs: []regMask{sp | sb | gp}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVD", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB/GP
+
+ {name: "MOVDconst", argLength: 0, reg: gp01, aux: "Int64", asm: "MOVD", typ: "Int64", rematerializeable: true}, //
+ {name: "FMOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "FMOVD", rematerializeable: true}, //
+ {name: "FMOVSconst", argLength: 0, reg: fp01, aux: "Float32", asm: "FMOVS", rematerializeable: true}, //
+ {name: "FCMPU", argLength: 2, reg: fp2cr, asm: "FCMPU", typ: "Flags"},
+
+ {name: "CMP", argLength: 2, reg: gp2cr, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPU", argLength: 2, reg: gp2cr, asm: "CMPU", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPW", argLength: 2, reg: gp2cr, asm: "CMPW", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPWU", argLength: 2, reg: gp2cr, asm: "CMPWU", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPconst", argLength: 1, reg: gp1cr, asm: "CMP", aux: "Int64", typ: "Flags"},
+ {name: "CMPUconst", argLength: 1, reg: gp1cr, asm: "CMPU", aux: "Int64", typ: "Flags"},
+ {name: "CMPWconst", argLength: 1, reg: gp1cr, asm: "CMPW", aux: "Int32", typ: "Flags"},
+ {name: "CMPWUconst", argLength: 1, reg: gp1cr, asm: "CMPWU", aux: "Int32", typ: "Flags"},
+
+ // ISEL auxInt values 0=LT 1=GT 2=EQ arg2 ? arg0 : arg1
+ // ISEL auxInt values 4=GE 5=LE 6=NE arg2 ? arg1 : arg0
+ // ISELB special case where arg0, arg1 values are 0, 1 for boolean result
+ {name: "ISEL", argLength: 3, reg: crgp21, asm: "ISEL", aux: "Int32", typ: "Int32"}, // see above
+ {name: "ISELB", argLength: 2, reg: crgp11, asm: "ISEL", aux: "Int32", typ: "Int32"}, // see above
+
+ // pseudo-ops
+ {name: "Equal", argLength: 1, reg: crgp}, // bool, true flags encode x==y false otherwise.
+ {name: "NotEqual", argLength: 1, reg: crgp}, // bool, true flags encode x!=y false otherwise.
+ {name: "LessThan", argLength: 1, reg: crgp}, // bool, true flags encode x<y false otherwise.
+ {name: "FLessThan", argLength: 1, reg: crgp}, // bool, true flags encode x<y false otherwise.
+ {name: "LessEqual", argLength: 1, reg: crgp}, // bool, true flags encode x<=y false otherwise.
+ {name: "FLessEqual", argLength: 1, reg: crgp}, // bool, true flags encode x<=y false otherwise; PPC <= === !> which is wrong for NaN
+ {name: "GreaterThan", argLength: 1, reg: crgp}, // bool, true flags encode x>y false otherwise.
+ {name: "FGreaterThan", argLength: 1, reg: crgp}, // bool, true flags encode x>y false otherwise.
+ {name: "GreaterEqual", argLength: 1, reg: crgp}, // bool, true flags encode x>=y false otherwise.
+ {name: "FGreaterEqual", argLength: 1, reg: crgp}, // bool, true flags encode x>=y false otherwise; PPC >= === !< which is wrong for NaN
+
+ // Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
+ // and sorts it to the very beginning of the block to prevent other
+ // use of the closure pointer.
+ {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{ctxt}}, zeroWidth: true},
+
+ // LoweredGetCallerSP returns the SP of the caller of the current function.
+ {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true},
+
+ // LoweredGetCallerPC evaluates to the PC to which its "caller" will return.
+ // I.e., if f calls g and g "calls" getcallerpc,
+ // the result should be the PC within f that g will return to.
+ // See runtime/stubs.go for a more detailed discussion.
+ {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true},
+
+ //arg0=ptr,arg1=mem, returns void. Faults if ptr is nil.
+ {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gp | sp | sb}, clobbers: tmp}, clobberFlags: true, nilCheck: true, faultOnNilArg0: true},
+ // Round ops to block fused-multiply-add extraction.
+ {name: "LoweredRound32F", argLength: 1, reg: fp11, resultInArg0: true, zeroWidth: true},
+ {name: "LoweredRound64F", argLength: 1, reg: fp11, resultInArg0: true, zeroWidth: true},
+
+ {name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{callptr, ctxt, 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+ {name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{callptr}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+
+ // large or unaligned zeroing
+ // arg0 = address of memory to zero (in R3, changed as side effect)
+ // returns mem
+ //
+ // a loop is generated when there is more than one iteration
+ // needed to clear 4 doublewords
+ //
+ // XXLXOR VS32,VS32,VS32
+ // MOVD $len/32,R31
+ // MOVD R31,CTR
+ // MOVD $16,R31
+ // loop:
+ // STXVD2X VS32,(R0)(R3)
+ // STXVD2X VS32,(R31)(R3)
+ // ADD R3,32
+ // BC loop
+
+ // remaining doubleword clears generated as needed
+ // MOVD R0,(R3)
+ // MOVD R0,8(R3)
+ // MOVD R0,16(R3)
+ // MOVD R0,24(R3)
+
+ // one or more of these to clear remainder < 8 bytes
+ // MOVW R0,n1(R3)
+ // MOVH R0,n2(R3)
+ // MOVB R0,n3(R3)
+ {
+ name: "LoweredZero",
+ aux: "Int64",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R20")},
+ clobbers: buildReg("R20"),
+ },
+ clobberFlags: true,
+ typ: "Mem",
+ faultOnNilArg0: true,
+ unsafePoint: true,
+ },
+ {
+ name: "LoweredZeroShort",
+ aux: "Int64",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{gp}},
+ typ: "Mem",
+ faultOnNilArg0: true,
+ unsafePoint: true,
+ },
+ {
+ name: "LoweredQuadZeroShort",
+ aux: "Int64",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{gp},
+ },
+ typ: "Mem",
+ faultOnNilArg0: true,
+ unsafePoint: true,
+ },
+ {
+ name: "LoweredQuadZero",
+ aux: "Int64",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R20")},
+ clobbers: buildReg("R20"),
+ },
+ clobberFlags: true,
+ typ: "Mem",
+ faultOnNilArg0: true,
+ unsafePoint: true,
+ },
+
+ // R31 is temp register
+ // Loop code:
+ // MOVD len/32,R31 set up loop ctr
+ // MOVD R31,CTR
+ // MOVD $16,R31 index register
+ // loop:
+ // LXVD2X (R0)(R4),VS32
+ // LXVD2X (R31)(R4),VS33
+ // ADD R4,$32 increment src
+ // STXVD2X VS32,(R0)(R3)
+ // STXVD2X VS33,(R31)(R3)
+ // ADD R3,$32 increment dst
+ // BC 16,0,loop branch ctr
+ // For this purpose, VS32 and VS33 are treated as
+ // scratch registers. Since regalloc does not
+ // track vector registers, even if they could be marked
+ // as clobbered, doing so would have no effect.
+ // TODO: If vector registers are managed by regalloc
+ // mark these as clobbered.
+ //
+ // Bytes not moved by this loop are moved
+ // with a combination of the following instructions,
+ // starting with the largest sizes and generating as
+ // many as needed, using the appropriate offset value.
+ // MOVD n(R4),R14
+ // MOVD R14,n(R3)
+ // MOVW n1(R4),R14
+ // MOVW R14,n1(R3)
+ // MOVH n2(R4),R14
+ // MOVH R14,n2(R3)
+ // MOVB n3(R4),R14
+ // MOVB R14,n3(R3)
+
+ {
+ name: "LoweredMove",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R20"), buildReg("R21")},
+ clobbers: buildReg("R20 R21"),
+ },
+ clobberFlags: true,
+ typ: "Mem",
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ unsafePoint: true,
+ },
+ {
+ name: "LoweredMoveShort",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{gp, gp},
+ },
+ typ: "Mem",
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ unsafePoint: true,
+ },
+
+ // The following is similar to the LoweredMove, but uses
+ // LXV instead of LXVD2X, which does not require an index
+ // register and will do 4 in a loop instead of only 2.
+ {
+ name: "LoweredQuadMove",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R20"), buildReg("R21")},
+ clobbers: buildReg("R20 R21"),
+ },
+ clobberFlags: true,
+ typ: "Mem",
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ unsafePoint: true,
+ },
+
+ {
+ name: "LoweredQuadMoveShort",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{gp, gp},
+ },
+ typ: "Mem",
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ unsafePoint: true,
+ },
+
+ {name: "LoweredAtomicStore8", argLength: 3, reg: gpstore, typ: "Mem", aux: "Int64", faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicStore32", argLength: 3, reg: gpstore, typ: "Mem", aux: "Int64", faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicStore64", argLength: 3, reg: gpstore, typ: "Mem", aux: "Int64", faultOnNilArg0: true, hasSideEffects: true},
+
+ {name: "LoweredAtomicLoad8", argLength: 2, reg: gpload, typ: "UInt8", aux: "Int64", clobberFlags: true, faultOnNilArg0: true},
+ {name: "LoweredAtomicLoad32", argLength: 2, reg: gpload, typ: "UInt32", aux: "Int64", clobberFlags: true, faultOnNilArg0: true},
+ {name: "LoweredAtomicLoad64", argLength: 2, reg: gpload, typ: "Int64", aux: "Int64", clobberFlags: true, faultOnNilArg0: true},
+ {name: "LoweredAtomicLoadPtr", argLength: 2, reg: gpload, typ: "Int64", aux: "Int64", clobberFlags: true, faultOnNilArg0: true},
+
+ // atomic add32, 64
+ // LWSYNC
+ // LDAR (Rarg0), Rout
+ // ADD Rarg1, Rout
+ // STDCCC Rout, (Rarg0)
+ // BNE -3(PC)
+ // return new sum
+ {name: "LoweredAtomicAdd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicAdd64", argLength: 3, reg: gpxchg, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true},
+
+ // atomic exchange32, 64
+ // LWSYNC
+ // LDAR (Rarg0), Rout
+ // STDCCC Rarg1, (Rarg0)
+ // BNE -2(PC)
+ // ISYNC
+ // return old val
+ {name: "LoweredAtomicExchange32", argLength: 3, reg: gpxchg, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicExchange64", argLength: 3, reg: gpxchg, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true},
+
+ // atomic compare and swap.
+ // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory. auxint must be zero.
+ // if *arg0 == arg1 {
+ // *arg0 = arg2
+ // return (true, memory)
+ // } else {
+ // return (false, memory)
+ // }
+ // SYNC
+ // LDAR (Rarg0), Rtmp
+ // CMP Rarg1, Rtmp
+ // BNE 3(PC)
+ // STDCCC Rarg2, (Rarg0)
+ // BNE -4(PC)
+ // CBNZ Rtmp, -4(PC)
+ // CSET EQ, Rout
+ {name: "LoweredAtomicCas64", argLength: 4, reg: gpcas, resultNotInArgs: true, aux: "Int64", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicCas32", argLength: 4, reg: gpcas, resultNotInArgs: true, aux: "Int64", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true},
+
+ // atomic 8/32 and/or.
+ // *arg0 &= (|=) arg1. arg2=mem. returns memory. auxint must be zero.
+ // LBAR/LWAT (Rarg0), Rtmp
+ // AND/OR Rarg1, Rtmp
+ // STBCCC/STWCCC Rtmp, (Rarg0), Rtmp
+ // BNE Rtmp, -3(PC)
+ {name: "LoweredAtomicAnd8", argLength: 3, reg: gpstore, asm: "AND", faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicAnd32", argLength: 3, reg: gpstore, asm: "AND", faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicOr8", argLength: 3, reg: gpstore, asm: "OR", faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicOr32", argLength: 3, reg: gpstore, asm: "OR", faultOnNilArg0: true, hasSideEffects: true},
+
+ // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+ // It preserves R0 through R17 (except special registers R1, R2, R11, R12, R13), g, and its arguments R20 and R21,
+ // but may clobber anything else, including R31 (REGTMP).
+ {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("R20"), buildReg("R21")}, clobbers: (callerSave &^ buildReg("R0 R3 R4 R5 R6 R7 R8 R9 R10 R14 R15 R16 R17 R20 R21 g")) | buildReg("R31")}, clobberFlags: true, aux: "Sym", symEffect: "None"},
+
+ // There are three of these functions so that they can have three different register inputs.
+ // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
+ // default registers to match so we don't need to copy registers around unnecessarily.
+ {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r5, r6}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r4, r5}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r3, r4}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+
+ // (InvertFlags (CMP a b)) == (CMP b a)
+ // So if we want (LessThan (CMP a b)) but we can't do that because a is a constant,
+ // then we do (LessThan (InvertFlags (CMP b a))) instead.
+ // Rewrites will convert this to (GreaterThan (CMP b a)).
+ // InvertFlags is a pseudo-op which can't appear in assembly output.
+ {name: "InvertFlags", argLength: 1}, // reverse direction of arg0
+
+ // Constant flag values. For any comparison, there are 3 possible
+ // outcomes: either the three from the signed total order (<,==,>)
+ // or the three from the unsigned total order, depending on which
+ // comparison operation was used (CMP or CMPU -- PPC is different from
+ // the other architectures, which have a single comparison producing
+ // both signed and unsigned comparison results.)
+
+ // These ops are for temporary use by rewrite rules. They
+ // cannot appear in the generated assembly.
+ {name: "FlagEQ"}, // equal
+ {name: "FlagLT"}, // signed < or unsigned <
+ {name: "FlagGT"}, // signed > or unsigned >
+ }
+
+ blocks := []blockData{
+ {name: "EQ", controls: 1},
+ {name: "NE", controls: 1},
+ {name: "LT", controls: 1},
+ {name: "LE", controls: 1},
+ {name: "GT", controls: 1},
+ {name: "GE", controls: 1},
+ {name: "FLT", controls: 1},
+ {name: "FLE", controls: 1},
+ {name: "FGT", controls: 1},
+ {name: "FGE", controls: 1},
+ }
+
+ archs = append(archs, arch{
+ name: "PPC64",
+ pkg: "cmd/internal/obj/ppc64",
+ genfile: "../../ppc64/ssa.go",
+ ops: ops,
+ blocks: blocks,
+ regnames: regNamesPPC64,
+ gpregmask: gp,
+ fpregmask: fp,
+ framepointerreg: int8(num["SP"]),
+ linkreg: -1, // not used
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/gen/README b/src/cmd/compile/internal/ssa/gen/README
new file mode 100644
index 0000000..6d2c6bb
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/README
@@ -0,0 +1,7 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+This package generates opcode tables, rewrite rules, etc. for the ssa compiler.
+Run it with go-1.13 (or above):
+ go run *.go
diff --git a/src/cmd/compile/internal/ssa/gen/RISCV64.rules b/src/cmd/compile/internal/ssa/gen/RISCV64.rules
new file mode 100644
index 0000000..4380a5e
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/RISCV64.rules
@@ -0,0 +1,737 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Optimizations TODO:
+// * Use SLTI and SLTIU for comparisons to constants, instead of SLT/SLTU with constants in registers
+// * Use the zero register instead of moving 0 into a register.
+// * Add rules to avoid generating a temp bool value for (If (SLT[U] ...) ...).
+// * Optimize left and right shift by simplifying SLTIU, Neg, and ADD for constants.
+// * Arrange for non-trivial Zero and Move lowerings to use aligned loads and stores.
+// * Eliminate zero immediate shifts, adds, etc.
+// * Avoid using Neq32 for writeBarrier.enabled checks.
+
+// Lowering arithmetic
+(Add64 ...) => (ADD ...)
+(AddPtr ...) => (ADD ...)
+(Add32 ...) => (ADD ...)
+(Add16 ...) => (ADD ...)
+(Add8 ...) => (ADD ...)
+(Add32F ...) => (FADDS ...)
+(Add64F ...) => (FADDD ...)
+
+(Sub64 ...) => (SUB ...)
+(SubPtr ...) => (SUB ...)
+(Sub32 ...) => (SUB ...)
+(Sub16 ...) => (SUB ...)
+(Sub8 ...) => (SUB ...)
+(Sub32F ...) => (FSUBS ...)
+(Sub64F ...) => (FSUBD ...)
+
+(Mul64 ...) => (MUL ...)
+(Mul32 ...) => (MULW ...)
+(Mul16 x y) => (MULW (SignExt16to32 x) (SignExt16to32 y))
+(Mul8 x y) => (MULW (SignExt8to32 x) (SignExt8to32 y))
+(Mul32F ...) => (FMULS ...)
+(Mul64F ...) => (FMULD ...)
+
+(Div32F ...) => (FDIVS ...)
+(Div64F ...) => (FDIVD ...)
+
+(Div64 x y [false]) => (DIV x y)
+(Div64u ...) => (DIVU ...)
+(Div32 x y [false]) => (DIVW x y)
+(Div32u ...) => (DIVUW ...)
+(Div16 x y [false]) => (DIVW (SignExt16to32 x) (SignExt16to32 y))
+(Div16u x y) => (DIVUW (ZeroExt16to32 x) (ZeroExt16to32 y))
+(Div8 x y) => (DIVW (SignExt8to32 x) (SignExt8to32 y))
+(Div8u x y) => (DIVUW (ZeroExt8to32 x) (ZeroExt8to32 y))
+
+(Hmul64 ...) => (MULH ...)
+(Hmul64u ...) => (MULHU ...)
+(Hmul32 x y) => (SRAI [32] (MUL (SignExt32to64 x) (SignExt32to64 y)))
+(Hmul32u x y) => (SRLI [32] (MUL (ZeroExt32to64 x) (ZeroExt32to64 y)))
+
+// (x + y) / 2 => (x / 2) + (y / 2) + (x & y & 1)
+(Avg64u <t> x y) => (ADD (ADD <t> (SRLI <t> [1] x) (SRLI <t> [1] y)) (ANDI <t> [1] (AND <t> x y)))
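+// A short Go sketch of why the identity above holds (illustrative only):
+// writing x = 2a+p and y = 2b+q with p, q in {0, 1}, floor((x+y)/2) = a+b+(p&q).
+//
+//	func avg64u(x, y uint64) uint64 {
+//		return x/2 + y/2 + (x & y & 1) // equals (x+y)/2 computed without overflow
+//	}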
+
+(Mod64 x y [false]) => (REM x y)
+(Mod64u ...) => (REMU ...)
+(Mod32 x y [false]) => (REMW x y)
+(Mod32u ...) => (REMUW ...)
+(Mod16 x y [false]) => (REMW (SignExt16to32 x) (SignExt16to32 y))
+(Mod16u x y) => (REMUW (ZeroExt16to32 x) (ZeroExt16to32 y))
+(Mod8 x y) => (REMW (SignExt8to32 x) (SignExt8to32 y))
+(Mod8u x y) => (REMUW (ZeroExt8to32 x) (ZeroExt8to32 y))
+
+(And64 ...) => (AND ...)
+(And32 ...) => (AND ...)
+(And16 ...) => (AND ...)
+(And8 ...) => (AND ...)
+
+(Or64 ...) => (OR ...)
+(Or32 ...) => (OR ...)
+(Or16 ...) => (OR ...)
+(Or8 ...) => (OR ...)
+
+(Xor64 ...) => (XOR ...)
+(Xor32 ...) => (XOR ...)
+(Xor16 ...) => (XOR ...)
+(Xor8 ...) => (XOR ...)
+
+(Neg64 ...) => (NEG ...)
+(Neg32 ...) => (NEG ...)
+(Neg16 ...) => (NEG ...)
+(Neg8 ...) => (NEG ...)
+(Neg32F ...) => (FNEGS ...)
+(Neg64F ...) => (FNEGD ...)
+
+(Com64 ...) => (NOT ...)
+(Com32 ...) => (NOT ...)
+(Com16 ...) => (NOT ...)
+(Com8 ...) => (NOT ...)
+
+(Sqrt ...) => (FSQRTD ...)
+
+// Sign and zero extension.
+
+(SignExt8to16 ...) => (MOVBreg ...)
+(SignExt8to32 ...) => (MOVBreg ...)
+(SignExt8to64 ...) => (MOVBreg ...)
+(SignExt16to32 ...) => (MOVHreg ...)
+(SignExt16to64 ...) => (MOVHreg ...)
+(SignExt32to64 ...) => (MOVWreg ...)
+
+(ZeroExt8to16 ...) => (MOVBUreg ...)
+(ZeroExt8to32 ...) => (MOVBUreg ...)
+(ZeroExt8to64 ...) => (MOVBUreg ...)
+(ZeroExt16to32 ...) => (MOVHUreg ...)
+(ZeroExt16to64 ...) => (MOVHUreg ...)
+(ZeroExt32to64 ...) => (MOVWUreg ...)
+
+(Cvt32to32F ...) => (FCVTSW ...)
+(Cvt32to64F ...) => (FCVTDW ...)
+(Cvt64to32F ...) => (FCVTSL ...)
+(Cvt64to64F ...) => (FCVTDL ...)
+
+(Cvt32Fto32 ...) => (FCVTWS ...)
+(Cvt32Fto64 ...) => (FCVTLS ...)
+(Cvt64Fto32 ...) => (FCVTWD ...)
+(Cvt64Fto64 ...) => (FCVTLD ...)
+
+(Cvt32Fto64F ...) => (FCVTDS ...)
+(Cvt64Fto32F ...) => (FCVTSD ...)
+
+(CvtBoolToUint8 ...) => (Copy ...)
+
+(Round32F ...) => (Copy ...)
+(Round64F ...) => (Copy ...)
+
+// From genericOps.go:
+// "0 if arg0 == 0, -1 if arg0 > 0, undef if arg0<0"
+//
+// Like other arches, we compute ~((x-1) >> 63), with arithmetic right shift.
+// For positive x, bit 63 of x-1 is always 0, so the result is -1.
+// For zero x, bit 63 of x-1 is 1, so the result is 0.
+//
+(Slicemask <t> x) => (NOT (SRAI <t> [63] (ADDI <t> [-1] x)))
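+//
+// Equivalent Go sketch (illustrative only; >> on int64 is an arithmetic shift in Go):
+//
+//	func slicemask(x int64) int64 {
+//		return ^((x - 1) >> 63) // -1 for x > 0, 0 for x == 0
+//	}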
+
+// Truncations
+// We ignore the unused high parts of registers, so truncates are just copies.
+(Trunc16to8 ...) => (Copy ...)
+(Trunc32to8 ...) => (Copy ...)
+(Trunc32to16 ...) => (Copy ...)
+(Trunc64to8 ...) => (Copy ...)
+(Trunc64to16 ...) => (Copy ...)
+(Trunc64to32 ...) => (Copy ...)
+
+// Shifts
+
+// SLL only considers the bottom 6 bits of y. If y >= 64, the result should
+// always be 0.
+//
+// Breaking down the operation:
+//
+// (SLL x y) generates x << (y & 63).
+//
+// If y < 64, this is the value we want. Otherwise, we want zero.
+//
+// So, we AND with -1 * uint64(y < 64), which is 0xfffff... if y < 64 and 0 otherwise.
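+//
+// A Go sketch of the net effect for the 64-bit case (illustrative only):
+//
+//	func lsh64(x, y uint64) uint64 {
+//		var mask uint64
+//		if y < 64 { // SLTIU [64] y
+//			mask = ^uint64(0) // Neg* turns the 0/1 result into 0/-1
+//		}
+//		return (x << (y & 63)) & mask // SLL result, forced to 0 when y >= 64
+//	}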
+(Lsh8x8 <t> x y) => (AND (SLL <t> x y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
+(Lsh8x16 <t> x y) => (AND (SLL <t> x y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
+(Lsh8x32 <t> x y) => (AND (SLL <t> x y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
+(Lsh8x64 <t> x y) => (AND (SLL <t> x y) (Neg8 <t> (SLTIU <t> [64] y)))
+(Lsh16x8 <t> x y) => (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
+(Lsh16x16 <t> x y) => (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
+(Lsh16x32 <t> x y) => (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
+(Lsh16x64 <t> x y) => (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] y)))
+(Lsh32x8 <t> x y) => (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
+(Lsh32x16 <t> x y) => (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
+(Lsh32x32 <t> x y) => (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
+(Lsh32x64 <t> x y) => (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] y)))
+(Lsh64x8 <t> x y) => (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
+(Lsh64x16 <t> x y) => (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
+(Lsh64x32 <t> x y) => (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
+(Lsh64x64 <t> x y) => (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] y)))
+
+// SRL only considers the bottom 6 bits of y. If y >= 64, the result should
+// always be 0. See Lsh above for a detailed description.
+(Rsh8Ux8 <t> x y) => (AND (SRL <t> (ZeroExt8to64 x) y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
+(Rsh8Ux16 <t> x y) => (AND (SRL <t> (ZeroExt8to64 x) y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
+(Rsh8Ux32 <t> x y) => (AND (SRL <t> (ZeroExt8to64 x) y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
+(Rsh8Ux64 <t> x y) => (AND (SRL <t> (ZeroExt8to64 x) y) (Neg8 <t> (SLTIU <t> [64] y)))
+(Rsh16Ux8 <t> x y) => (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
+(Rsh16Ux16 <t> x y) => (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
+(Rsh16Ux32 <t> x y) => (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
+(Rsh16Ux64 <t> x y) => (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] y)))
+(Rsh32Ux8 <t> x y) => (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
+(Rsh32Ux16 <t> x y) => (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
+(Rsh32Ux32 <t> x y) => (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
+(Rsh32Ux64 <t> x y) => (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] y)))
+(Rsh64Ux8 <t> x y) => (AND (SRL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
+(Rsh64Ux16 <t> x y) => (AND (SRL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
+(Rsh64Ux32 <t> x y) => (AND (SRL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
+(Rsh64Ux64 <t> x y) => (AND (SRL <t> x y) (Neg64 <t> (SLTIU <t> [64] y)))
+
+// SRA only considers the bottom 6 bits of y. If y >= 64, the result should
+// be either 0 or -1 based on the sign bit.
+//
+// We implement this by performing the max shift (-1) if y >= 64.
+//
+// We OR (uint64(y < 64) - 1) into y before passing it to SRA. This leaves
+// us with -1 (0xffff...) if y >= 64.
+//
+// We don't need to sign-extend the OR result, as it will be at minimum 8 bits,
+// more than the 6 bits SRA cares about.
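+//
+// A Go sketch of the saturating shift amount for the 64-bit case (illustrative only):
+//
+//	func rsh64x64(x int64, y uint64) int64 {
+//		if y >= 64 {
+//			y |= ^uint64(0) // the OR above; leaves y&63 == 63
+//		}
+//		return x >> (y & 63) // SRA uses only the low 6 bits
+//	}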
+(Rsh8x8 <t> x y) => (SRA <t> (SignExt8to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64 y)))))
+(Rsh8x16 <t> x y) => (SRA <t> (SignExt8to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y)))))
+(Rsh8x32 <t> x y) => (SRA <t> (SignExt8to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y)))))
+(Rsh8x64 <t> x y) => (SRA <t> (SignExt8to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y))))
+(Rsh16x8 <t> x y) => (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64 y)))))
+(Rsh16x16 <t> x y) => (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y)))))
+(Rsh16x32 <t> x y) => (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y)))))
+(Rsh16x64 <t> x y) => (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y))))
+(Rsh32x8 <t> x y) => (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64 y)))))
+(Rsh32x16 <t> x y) => (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y)))))
+(Rsh32x32 <t> x y) => (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y)))))
+(Rsh32x64 <t> x y) => (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y))))
+(Rsh64x8 <t> x y) => (SRA <t> x (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64 y)))))
+(Rsh64x16 <t> x y) => (SRA <t> x (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y)))))
+(Rsh64x32 <t> x y) => (SRA <t> x (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y)))))
+(Rsh64x64 <t> x y) => (SRA <t> x (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y))))
+
+// rotates
+(RotateLeft8 <t> x (MOVBconst [c])) => (Or8 (Lsh8x64 <t> x (MOVBconst [c&7])) (Rsh8Ux64 <t> x (MOVBconst [-c&7])))
+(RotateLeft16 <t> x (MOVHconst [c])) => (Or16 (Lsh16x64 <t> x (MOVHconst [c&15])) (Rsh16Ux64 <t> x (MOVHconst [-c&15])))
+(RotateLeft32 <t> x (MOVWconst [c])) => (Or32 (Lsh32x64 <t> x (MOVWconst [c&31])) (Rsh32Ux64 <t> x (MOVWconst [-c&31])))
+(RotateLeft64 <t> x (MOVDconst [c])) => (Or64 (Lsh64x64 <t> x (MOVDconst [c&63])) (Rsh64Ux64 <t> x (MOVDconst [-c&63])))
+
+(Less64 ...) => (SLT ...)
+(Less32 x y) => (SLT (SignExt32to64 x) (SignExt32to64 y))
+(Less16 x y) => (SLT (SignExt16to64 x) (SignExt16to64 y))
+(Less8 x y) => (SLT (SignExt8to64 x) (SignExt8to64 y))
+(Less64U ...) => (SLTU ...)
+(Less32U x y) => (SLTU (ZeroExt32to64 x) (ZeroExt32to64 y))
+(Less16U x y) => (SLTU (ZeroExt16to64 x) (ZeroExt16to64 y))
+(Less8U x y) => (SLTU (ZeroExt8to64 x) (ZeroExt8to64 y))
+(Less64F ...) => (FLTD ...)
+(Less32F ...) => (FLTS ...)
+
+// Convert x <= y to !(y > x).
+(Leq64 x y) => (Not (Less64 y x))
+(Leq32 x y) => (Not (Less32 y x))
+(Leq16 x y) => (Not (Less16 y x))
+(Leq8 x y) => (Not (Less8 y x))
+(Leq64U x y) => (Not (Less64U y x))
+(Leq32U x y) => (Not (Less32U y x))
+(Leq16U x y) => (Not (Less16U y x))
+(Leq8U x y) => (Not (Less8U y x))
+(Leq64F ...) => (FLED ...)
+(Leq32F ...) => (FLES ...)
+
+(EqPtr x y) => (SEQZ (SUB <x.Type> x y))
+(Eq64 x y) => (SEQZ (SUB <x.Type> x y))
+(Eq32 x y) => (SEQZ (SUBW <x.Type> x y))
+(Eq16 x y) => (SEQZ (SUB <x.Type> (ZeroExt16to64 x) (ZeroExt16to64 y)))
+(Eq8 x y) => (SEQZ (SUB <x.Type> (ZeroExt8to64 x) (ZeroExt8to64 y)))
+(Eq64F ...) => (FEQD ...)
+(Eq32F ...) => (FEQS ...)
+
+(NeqPtr x y) => (SNEZ (SUB <x.Type> x y))
+(Neq64 x y) => (SNEZ (SUB <x.Type> x y))
+(Neq32 x y) => (SNEZ (SUBW <x.Type> x y))
+(Neq16 x y) => (SNEZ (SUB <x.Type> (ZeroExt16to64 x) (ZeroExt16to64 y)))
+(Neq8 x y) => (SNEZ (SUB <x.Type> (ZeroExt8to64 x) (ZeroExt8to64 y)))
+(Neq64F ...) => (FNED ...)
+(Neq32F ...) => (FNES ...)
+
+// Loads
+(Load <t> ptr mem) && t.IsBoolean() => (MOVBUload ptr mem)
+(Load <t> ptr mem) && ( is8BitInt(t) && isSigned(t)) => (MOVBload ptr mem)
+(Load <t> ptr mem) && ( is8BitInt(t) && !isSigned(t)) => (MOVBUload ptr mem)
+(Load <t> ptr mem) && (is16BitInt(t) && isSigned(t)) => (MOVHload ptr mem)
+(Load <t> ptr mem) && (is16BitInt(t) && !isSigned(t)) => (MOVHUload ptr mem)
+(Load <t> ptr mem) && (is32BitInt(t) && isSigned(t)) => (MOVWload ptr mem)
+(Load <t> ptr mem) && (is32BitInt(t) && !isSigned(t)) => (MOVWUload ptr mem)
+(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVDload ptr mem)
+(Load <t> ptr mem) && is32BitFloat(t) => (FMOVWload ptr mem)
+(Load <t> ptr mem) && is64BitFloat(t) => (FMOVDload ptr mem)
+
+// Stores
+(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && !is32BitFloat(val.Type) => (MOVWstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 && !is64BitFloat(val.Type) => (MOVDstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (FMOVWstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (FMOVDstore ptr val mem)
+
+// We need to fold MOVaddr into the LD/MOVDstore ops so that the live variable analysis
+// knows what variables are being read/written by the ops.
+(MOVBUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVBUload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVBload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVBload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVHUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVHUload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVHload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVHload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVWUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVWUload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVWload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVWload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVDload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVDload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+
+(MOVBstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOVHstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVHstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOVWstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOVDstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOVBstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVBstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVHstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVHstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVWstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVWstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+(MOVDstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) =>
+ (MOVDstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
+
+(MOVBUload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) =>
+ (MOVBUload [off1+int32(off2)] {sym} base mem)
+(MOVBload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) =>
+ (MOVBload [off1+int32(off2)] {sym} base mem)
+(MOVHUload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) =>
+ (MOVHUload [off1+int32(off2)] {sym} base mem)
+(MOVHload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) =>
+ (MOVHload [off1+int32(off2)] {sym} base mem)
+(MOVWUload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) =>
+ (MOVWUload [off1+int32(off2)] {sym} base mem)
+(MOVWload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) =>
+ (MOVWload [off1+int32(off2)] {sym} base mem)
+(MOVDload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) =>
+ (MOVDload [off1+int32(off2)] {sym} base mem)
+
+(MOVBstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) =>
+ (MOVBstore [off1+int32(off2)] {sym} base val mem)
+(MOVHstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) =>
+ (MOVHstore [off1+int32(off2)] {sym} base val mem)
+(MOVWstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) =>
+ (MOVWstore [off1+int32(off2)] {sym} base val mem)
+(MOVDstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) =>
+ (MOVDstore [off1+int32(off2)] {sym} base val mem)
+(MOVBstorezero [off1] {sym} (ADDI [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVBstorezero [off1+int32(off2)] {sym} ptr mem)
+(MOVHstorezero [off1] {sym} (ADDI [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVHstorezero [off1+int32(off2)] {sym} ptr mem)
+(MOVWstorezero [off1] {sym} (ADDI [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVWstorezero [off1+int32(off2)] {sym} ptr mem)
+(MOVDstorezero [off1] {sym} (ADDI [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVDstorezero [off1+int32(off2)] {sym} ptr mem)
+
+// Similarly, fold ADDI into MOVaddr to avoid confusing live variable analysis
+// with OffPtr -> ADDI.
+(ADDI [c] (MOVaddr [d] {s} x)) && is32Bit(c+int64(d)) => (MOVaddr [int32(c)+d] {s} x)
+
+// Small zeroing
+(Zero [0] _ mem) => mem
+(Zero [1] ptr mem) => (MOVBstore ptr (MOVBconst [0]) mem)
+(Zero [2] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore ptr (MOVHconst [0]) mem)
+(Zero [2] ptr mem) =>
+ (MOVBstore [1] ptr (MOVBconst [0])
+ (MOVBstore ptr (MOVBconst [0]) mem))
+(Zero [4] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore ptr (MOVWconst [0]) mem)
+(Zero [4] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [2] ptr (MOVHconst [0])
+ (MOVHstore ptr (MOVHconst [0]) mem))
+(Zero [4] ptr mem) =>
+ (MOVBstore [3] ptr (MOVBconst [0])
+ (MOVBstore [2] ptr (MOVBconst [0])
+ (MOVBstore [1] ptr (MOVBconst [0])
+ (MOVBstore ptr (MOVBconst [0]) mem))))
+(Zero [8] {t} ptr mem) && t.Alignment()%8 == 0 =>
+ (MOVDstore ptr (MOVDconst [0]) mem)
+(Zero [8] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [4] ptr (MOVWconst [0])
+ (MOVWstore ptr (MOVWconst [0]) mem))
+(Zero [8] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [6] ptr (MOVHconst [0])
+ (MOVHstore [4] ptr (MOVHconst [0])
+ (MOVHstore [2] ptr (MOVHconst [0])
+ (MOVHstore ptr (MOVHconst [0]) mem))))
+
+(Zero [3] ptr mem) =>
+ (MOVBstore [2] ptr (MOVBconst [0])
+ (MOVBstore [1] ptr (MOVBconst [0])
+ (MOVBstore ptr (MOVBconst [0]) mem)))
+(Zero [6] {t} ptr mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [4] ptr (MOVHconst [0])
+ (MOVHstore [2] ptr (MOVHconst [0])
+ (MOVHstore ptr (MOVHconst [0]) mem)))
+(Zero [12] {t} ptr mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [8] ptr (MOVWconst [0])
+ (MOVWstore [4] ptr (MOVWconst [0])
+ (MOVWstore ptr (MOVWconst [0]) mem)))
+(Zero [16] {t} ptr mem) && t.Alignment()%8 == 0 =>
+ (MOVDstore [8] ptr (MOVDconst [0])
+ (MOVDstore ptr (MOVDconst [0]) mem))
+(Zero [24] {t} ptr mem) && t.Alignment()%8 == 0 =>
+ (MOVDstore [16] ptr (MOVDconst [0])
+ (MOVDstore [8] ptr (MOVDconst [0])
+ (MOVDstore ptr (MOVDconst [0]) mem)))
+(Zero [32] {t} ptr mem) && t.Alignment()%8 == 0 =>
+ (MOVDstore [24] ptr (MOVDconst [0])
+ (MOVDstore [16] ptr (MOVDconst [0])
+ (MOVDstore [8] ptr (MOVDconst [0])
+ (MOVDstore ptr (MOVDconst [0]) mem))))
+
+// Medium 8-aligned zeroing uses a Duff's device
+// 8 and 128 are magic constants, see runtime/mkduff.go
+(Zero [s] {t} ptr mem)
+ && s%8 == 0 && s <= 8*128
+ && t.Alignment()%8 == 0 && !config.noDuffDevice =>
+ (DUFFZERO [8 * (128 - s/8)] ptr mem)
+
+// Generic zeroing uses a loop
+(Zero [s] {t} ptr mem) =>
+ (LoweredZero [t.Alignment()]
+ ptr
+ (ADD <ptr.Type> ptr (MOVDconst [s-moveSize(t.Alignment(), config)]))
+ mem)
+
+(Convert ...) => (MOVconvert ...)
+
+// Checks
+(IsNonNil p) => (NeqPtr (MOVDconst [0]) p)
+(IsInBounds ...) => (Less64U ...)
+(IsSliceInBounds ...) => (Leq64U ...)
+
+// Trivial lowering
+(NilCheck ...) => (LoweredNilCheck ...)
+(GetClosurePtr ...) => (LoweredGetClosurePtr ...)
+(GetCallerSP ...) => (LoweredGetCallerSP ...)
+(GetCallerPC ...) => (LoweredGetCallerPC ...)
+
+// Write barrier.
+(WB ...) => (LoweredWB ...)
+
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem)
+
+// Small moves
+(Move [0] _ _ mem) => mem
+(Move [1] dst src mem) => (MOVBstore dst (MOVBload src mem) mem)
+(Move [2] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore dst (MOVHload src mem) mem)
+(Move [2] dst src mem) =>
+ (MOVBstore [1] dst (MOVBload [1] src mem)
+ (MOVBstore dst (MOVBload src mem) mem))
+(Move [4] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore dst (MOVWload src mem) mem)
+(Move [4] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [2] dst (MOVHload [2] src mem)
+ (MOVHstore dst (MOVHload src mem) mem))
+(Move [4] dst src mem) =>
+ (MOVBstore [3] dst (MOVBload [3] src mem)
+ (MOVBstore [2] dst (MOVBload [2] src mem)
+ (MOVBstore [1] dst (MOVBload [1] src mem)
+ (MOVBstore dst (MOVBload src mem) mem))))
+(Move [8] {t} dst src mem) && t.Alignment()%8 == 0 =>
+ (MOVDstore dst (MOVDload src mem) mem)
+(Move [8] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [4] dst (MOVWload [4] src mem)
+ (MOVWstore dst (MOVWload src mem) mem))
+(Move [8] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [6] dst (MOVHload [6] src mem)
+ (MOVHstore [4] dst (MOVHload [4] src mem)
+ (MOVHstore [2] dst (MOVHload [2] src mem)
+ (MOVHstore dst (MOVHload src mem) mem))))
+
+(Move [3] dst src mem) =>
+ (MOVBstore [2] dst (MOVBload [2] src mem)
+ (MOVBstore [1] dst (MOVBload [1] src mem)
+ (MOVBstore dst (MOVBload src mem) mem)))
+(Move [6] {t} dst src mem) && t.Alignment()%2 == 0 =>
+ (MOVHstore [4] dst (MOVHload [4] src mem)
+ (MOVHstore [2] dst (MOVHload [2] src mem)
+ (MOVHstore dst (MOVHload src mem) mem)))
+(Move [12] {t} dst src mem) && t.Alignment()%4 == 0 =>
+ (MOVWstore [8] dst (MOVWload [8] src mem)
+ (MOVWstore [4] dst (MOVWload [4] src mem)
+ (MOVWstore dst (MOVWload src mem) mem)))
+(Move [16] {t} dst src mem) && t.Alignment()%8 == 0 =>
+ (MOVDstore [8] dst (MOVDload [8] src mem)
+ (MOVDstore dst (MOVDload src mem) mem))
+(Move [24] {t} dst src mem) && t.Alignment()%8 == 0 =>
+ (MOVDstore [16] dst (MOVDload [16] src mem)
+ (MOVDstore [8] dst (MOVDload [8] src mem)
+ (MOVDstore dst (MOVDload src mem) mem)))
+(Move [32] {t} dst src mem) && t.Alignment()%8 == 0 =>
+ (MOVDstore [24] dst (MOVDload [24] src mem)
+ (MOVDstore [16] dst (MOVDload [16] src mem)
+ (MOVDstore [8] dst (MOVDload [8] src mem)
+ (MOVDstore dst (MOVDload src mem) mem))))
+
+// Medium 8-aligned move uses a Duff's device
+// 16 and 128 are magic constants, see runtime/mkduff.go
+(Move [s] {t} dst src mem)
+ && s%8 == 0 && s <= 8*128 && t.Alignment()%8 == 0
+ && !config.noDuffDevice && logLargeCopy(v, s) =>
+ (DUFFCOPY [16 * (128 - s/8)] dst src mem)
+
+// Generic move uses a loop
+(Move [s] {t} dst src mem) && (s <= 16 || logLargeCopy(v, s)) =>
+ (LoweredMove [t.Alignment()]
+ dst
+ src
+ (ADDI <src.Type> [s-moveSize(t.Alignment(), config)] src)
+ mem)
+
+// Boolean ops; 0=false, 1=true
+(AndB ...) => (AND ...)
+(OrB ...) => (OR ...)
+(EqB x y) => (SEQZ (XOR <typ.Bool> x y))
+(NeqB ...) => (XOR ...)
+(Not ...) => (SEQZ ...)
+
+// Lowering pointer arithmetic
+// TODO: Special handling for SP offsets, like ARM
+(OffPtr [off] ptr:(SP)) && is32Bit(off) => (MOVaddr [int32(off)] ptr)
+(OffPtr [off] ptr) && is32Bit(off) => (ADDI [off] ptr)
+(OffPtr [off] ptr) => (ADD (MOVDconst [off]) ptr)
+
+// TODO(jsing): Check if we actually need MOV{B,H,W}const as most platforms
+// use a single MOVDconst op.
+(Const8 ...) => (MOVBconst ...)
+(Const16 ...) => (MOVHconst ...)
+(Const32 ...) => (MOVWconst ...)
+(Const64 ...) => (MOVDconst ...)
+(Const32F [val]) => (FMVSX (MOVWconst [int32(math.Float32bits(val))]))
+(Const64F [val]) => (FMVDX (MOVDconst [int64(math.Float64bits(val))]))
+(ConstNil) => (MOVDconst [0])
+(ConstBool [val]) => (MOVBconst [int8(b2i(val))])
+
+// Convert 64 bit immediate to two 32 bit immediates, combine with add and shift.
+// The lower 32 bit immediate will be treated as signed,
+// so if it is negative, adjust for the borrow by incrementing the top half.
+// We don't have to worry about overflow from the increment,
+// because if the top half is all 1s, and int32(c) is negative,
+// then the overall constant fits in an int32.
+(MOVDconst <t> [c]) && !is32Bit(c) && int32(c) < 0 => (ADD (SLLI <t> [32] (MOVDconst [c>>32+1])) (MOVDconst [int64(int32(c))]))
+(MOVDconst <t> [c]) && !is32Bit(c) && int32(c) >= 0 => (ADD (SLLI <t> [32] (MOVDconst [c>>32+0])) (MOVDconst [int64(int32(c))]))
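For example, when int32(c) is negative the sign extension of the low half subtracts 1<<32 from the sum, which the +1 on the top half compensates for. The identity both rules rely on can be checked with a small, hypothetical helper:

func splitConst(c int64) (hi, lo int64) {
	lo = int64(int32(c)) // low 32 bits, sign extended
	hi = c >> 32
	if int32(c) < 0 {
		hi++ // compensate for the borrow introduced by sign extension
	}
	return hi, lo
}

// For any c, hi<<32 + lo == c under two's-complement wraparound, matching
// (ADD (SLLI [32] (MOVDconst [hi])) (MOVDconst [lo])) above.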
+
+(Addr {sym} base) => (MOVaddr {sym} [0] base)
+(LocalAddr {sym} base _) => (MOVaddr {sym} base)
+
+// Calls
+(StaticCall ...) => (CALLstatic ...)
+(ClosureCall ...) => (CALLclosure ...)
+(InterCall ...) => (CALLinter ...)
+
+// Atomic Intrinsics
+(AtomicLoad8 ...) => (LoweredAtomicLoad8 ...)
+(AtomicLoad32 ...) => (LoweredAtomicLoad32 ...)
+(AtomicLoad64 ...) => (LoweredAtomicLoad64 ...)
+(AtomicLoadPtr ...) => (LoweredAtomicLoad64 ...)
+
+(AtomicStore8 ...) => (LoweredAtomicStore8 ...)
+(AtomicStore32 ...) => (LoweredAtomicStore32 ...)
+(AtomicStore64 ...) => (LoweredAtomicStore64 ...)
+(AtomicStorePtrNoWB ...) => (LoweredAtomicStore64 ...)
+
+(AtomicAdd32 ...) => (LoweredAtomicAdd32 ...)
+(AtomicAdd64 ...) => (LoweredAtomicAdd64 ...)
+
+(AtomicCompareAndSwap32 ...) => (LoweredAtomicCas32 ...)
+(AtomicCompareAndSwap64 ...) => (LoweredAtomicCas64 ...)
+
+(AtomicExchange32 ...) => (LoweredAtomicExchange32 ...)
+(AtomicExchange64 ...) => (LoweredAtomicExchange64 ...)
+
+// Conditional branches
+(If cond yes no) => (BNEZ cond yes no)
+
+// Optimizations
+
+// Absorb SEQZ/SNEZ into branch.
+(BEQZ (SEQZ x) yes no) => (BNEZ x yes no)
+(BEQZ (SNEZ x) yes no) => (BEQZ x yes no)
+(BNEZ (SEQZ x) yes no) => (BEQZ x yes no)
+(BNEZ (SNEZ x) yes no) => (BNEZ x yes no)
+
+// Convert BEQZ/BNEZ into more optimal branch conditions.
+(BEQZ (SUB x y) yes no) => (BEQ x y yes no)
+(BNEZ (SUB x y) yes no) => (BNE x y yes no)
+(BEQZ (SLT x y) yes no) => (BGE x y yes no)
+(BNEZ (SLT x y) yes no) => (BLT x y yes no)
+(BEQZ (SLTU x y) yes no) => (BGEU x y yes no)
+(BNEZ (SLTU x y) yes no) => (BLTU x y yes no)
+
+// Convert branch with zero to BEQZ/BNEZ.
+(BEQ (MOVDconst [0]) cond yes no) => (BEQZ cond yes no)
+(BEQ cond (MOVDconst [0]) yes no) => (BEQZ cond yes no)
+(BNE (MOVDconst [0]) cond yes no) => (BNEZ cond yes no)
+(BNE cond (MOVDconst [0]) yes no) => (BNEZ cond yes no)
+
+// Store zero
+(MOVBstore [off] {sym} ptr (MOVBconst [0]) mem) => (MOVBstorezero [off] {sym} ptr mem)
+(MOVHstore [off] {sym} ptr (MOVHconst [0]) mem) => (MOVHstorezero [off] {sym} ptr mem)
+(MOVWstore [off] {sym} ptr (MOVWconst [0]) mem) => (MOVWstorezero [off] {sym} ptr mem)
+(MOVDstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVDstorezero [off] {sym} ptr mem)
+
+// Avoid sign/zero extension for consts.
+(MOVBreg (MOVBconst [c])) => (MOVDconst [int64(c)])
+(MOVHreg (MOVBconst [c])) => (MOVDconst [int64(c)])
+(MOVHreg (MOVHconst [c])) => (MOVDconst [int64(c)])
+(MOVWreg (MOVBconst [c])) => (MOVDconst [int64(c)])
+(MOVWreg (MOVHconst [c])) => (MOVDconst [int64(c)])
+(MOVWreg (MOVWconst [c])) => (MOVDconst [int64(c)])
+(MOVBUreg (MOVBconst [c])) => (MOVDconst [int64(uint8(c))])
+(MOVHUreg (MOVBconst [c])) => (MOVDconst [int64(uint16(c))])
+(MOVHUreg (MOVHconst [c])) => (MOVDconst [int64(uint16(c))])
+(MOVWUreg (MOVBconst [c])) => (MOVDconst [int64(uint32(c))])
+(MOVWUreg (MOVHconst [c])) => (MOVDconst [int64(uint32(c))])
+(MOVWUreg (MOVWconst [c])) => (MOVDconst [int64(uint32(c))])
+
+// Avoid sign/zero extension after properly typed load.
+(MOVBreg x:(MOVBload _ _)) => (MOVDreg x)
+(MOVHreg x:(MOVBload _ _)) => (MOVDreg x)
+(MOVHreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVHreg x:(MOVHload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVBload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVHload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVHUload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVWload _ _)) => (MOVDreg x)
+(MOVBUreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVHUreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVHUreg x:(MOVHUload _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVHUload _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVWUload _ _)) => (MOVDreg x)
+
+// Fold double extensions.
+(MOVBreg x:(MOVBreg _)) => (MOVDreg x)
+(MOVHreg x:(MOVBreg _)) => (MOVDreg x)
+(MOVHreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVHreg x:(MOVHreg _)) => (MOVDreg x)
+(MOVWreg x:(MOVBreg _)) => (MOVDreg x)
+(MOVWreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVWreg x:(MOVHreg _)) => (MOVDreg x)
+(MOVWreg x:(MOVWreg _)) => (MOVDreg x)
+(MOVBUreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVHUreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVHUreg x:(MOVHUreg _)) => (MOVDreg x)
+(MOVWUreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVWUreg x:(MOVHUreg _)) => (MOVDreg x)
+(MOVWUreg x:(MOVWUreg _)) => (MOVDreg x)
+
+// Do not extend before store.
+(MOVBstore [off] {sym} ptr (MOVBreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVWreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVWreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+
+// Replace extend after load with alternate load where possible.
+(MOVBreg <t> x:(MOVBUload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBload <t> [off] {sym} ptr mem)
+(MOVHreg <t> x:(MOVHUload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVHload <t> [off] {sym} ptr mem)
+(MOVWreg <t> x:(MOVWUload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWload <t> [off] {sym} ptr mem)
+(MOVBUreg <t> x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBUload <t> [off] {sym} ptr mem)
+(MOVHUreg <t> x:(MOVHload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVHUload <t> [off] {sym} ptr mem)
+(MOVWUreg <t> x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWUload <t> [off] {sym} ptr mem)
+
+// If a register move has only 1 use, just use the same register without emitting instruction
+// MOVnop does not emit an instruction, only for ensuring the type.
+(MOVDreg x) && x.Uses == 1 => (MOVDnop x)
+
+// Fold constant into immediate instructions where possible.
+(ADD (MOVBconst [val]) x) => (ADDI [int64(val)] x)
+(ADD (MOVHconst [val]) x) => (ADDI [int64(val)] x)
+(ADD (MOVWconst [val]) x) => (ADDI [int64(val)] x)
+(ADD (MOVDconst [val]) x) && is32Bit(val) => (ADDI [val] x)
+
+(AND (MOVBconst [val]) x) => (ANDI [int64(val)] x)
+(AND (MOVHconst [val]) x) => (ANDI [int64(val)] x)
+(AND (MOVWconst [val]) x) => (ANDI [int64(val)] x)
+(AND (MOVDconst [val]) x) && is32Bit(val) => (ANDI [val] x)
+
+(OR (MOVBconst [val]) x) => (ORI [int64(val)] x)
+(OR (MOVHconst [val]) x) => (ORI [int64(val)] x)
+(OR (MOVWconst [val]) x) => (ORI [int64(val)] x)
+(OR (MOVDconst [val]) x) && is32Bit(val) => (ORI [val] x)
+
+(XOR (MOVBconst [val]) x) => (XORI [int64(val)] x)
+(XOR (MOVHconst [val]) x) => (XORI [int64(val)] x)
+(XOR (MOVWconst [val]) x) => (XORI [int64(val)] x)
+(XOR (MOVDconst [val]) x) && is32Bit(val) => (XORI [val] x)
+
+(SLL x (MOVBconst [val])) => (SLLI [int64(val&63)] x)
+(SLL x (MOVHconst [val])) => (SLLI [int64(val&63)] x)
+(SLL x (MOVWconst [val])) => (SLLI [int64(val&63)] x)
+(SLL x (MOVDconst [val])) => (SLLI [int64(val&63)] x)
+
+(SRL x (MOVBconst [val])) => (SRLI [int64(val&63)] x)
+(SRL x (MOVHconst [val])) => (SRLI [int64(val&63)] x)
+(SRL x (MOVWconst [val])) => (SRLI [int64(val&63)] x)
+(SRL x (MOVDconst [val])) => (SRLI [int64(val&63)] x)
+
+(SRA x (MOVBconst [val])) => (SRAI [int64(val&63)] x)
+(SRA x (MOVHconst [val])) => (SRAI [int64(val&63)] x)
+(SRA x (MOVWconst [val])) => (SRAI [int64(val&63)] x)
+(SRA x (MOVDconst [val])) => (SRAI [int64(val&63)] x)
+
+// Convert subtraction of a const into ADDI with negative immediate, where possible.
+(SUB x (MOVBconst [val])) => (ADDI [-int64(val)] x)
+(SUB x (MOVHconst [val])) => (ADDI [-int64(val)] x)
+(SUB x (MOVWconst [val])) && is32Bit(-int64(val)) => (ADDI [-int64(val)] x)
+(SUB x (MOVDconst [val])) && is32Bit(-val) => (ADDI [-val] x)
+
+// Subtraction of zero.
+(SUB x (MOVBconst [0])) => x
+(SUB x (MOVHconst [0])) => x
+(SUB x (MOVWconst [0])) => x
+(SUB x (MOVDconst [0])) => x
+
+// Subtraction of zero with sign extension.
+(SUBW x (MOVWconst [0])) => (ADDIW [0] x)
+
+// Subtraction from zero.
+(SUB (MOVBconst [0]) x) => (NEG x)
+(SUB (MOVHconst [0]) x) => (NEG x)
+(SUB (MOVWconst [0]) x) => (NEG x)
+(SUB (MOVDconst [0]) x) => (NEG x)
+
+// Subtraction from zero with sign extension.
+(SUBW (MOVDconst [0]) x) => (NEGW x)
+
+// Addition of zero.
+(ADDI [0] x) => x
diff --git a/src/cmd/compile/internal/ssa/gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/gen/RISCV64Ops.go
new file mode 100644
index 0000000..f643192
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/RISCV64Ops.go
@@ -0,0 +1,464 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+import (
+ "fmt"
+)
+
+// Notes:
+// - Boolean types occupy the entire register. 0=false, 1=true.
+
+// Suffixes encode the bit width of various instructions:
+//
+// D (double word) = 64 bit int
+// W (word) = 32 bit int
+// H (half word) = 16 bit int
+// B (byte) = 8 bit int
+// S (single) = 32 bit float
+// D (double) = 64 bit float
+// L = 64 bit int, used when the opcode starts with F
+
+const (
+ riscv64REG_G = 27
+ riscv64REG_CTXT = 20
+ riscv64REG_LR = 1
+ riscv64REG_SP = 2
+ riscv64REG_TP = 4
+ riscv64REG_TMP = 31
+ riscv64REG_ZERO = 0
+)
+
+func riscv64RegName(r int) string {
+ switch {
+ case r == riscv64REG_G:
+ return "g"
+ case r == riscv64REG_SP:
+ return "SP"
+ case 0 <= r && r <= 31:
+ return fmt.Sprintf("X%d", r)
+ case 32 <= r && r <= 63:
+ return fmt.Sprintf("F%d", r-32)
+ default:
+ panic(fmt.Sprintf("unknown register %d", r))
+ }
+}
+
+func init() {
+ var regNamesRISCV64 []string
+ var gpMask, fpMask, gpgMask, gpspMask, gpspsbMask, gpspsbgMask regMask
+ regNamed := make(map[string]regMask)
+
+ // Build the list of register names, creating an appropriately indexed
+ // regMask for the gp and fp registers as we go.
+ //
+ // If name is specified, use it rather than the riscv reg number.
+ addreg := func(r int, name string) regMask {
+ mask := regMask(1) << uint(len(regNamesRISCV64))
+ if name == "" {
+ name = riscv64RegName(r)
+ }
+ regNamesRISCV64 = append(regNamesRISCV64, name)
+ regNamed[name] = mask
+ return mask
+ }
+
+ // General purpose registers.
+ for r := 0; r <= 31; r++ {
+ if r == riscv64REG_LR {
+ // LR is not used by regalloc, so we skip it to leave
+ // room for pseudo-register SB.
+ continue
+ }
+
+ mask := addreg(r, "")
+
+ // Add general purpose registers to gpMask.
+ switch r {
+ // ZERO, TP and TMP are not in any gp mask.
+ case riscv64REG_ZERO, riscv64REG_TP, riscv64REG_TMP:
+ case riscv64REG_G:
+ gpgMask |= mask
+ gpspsbgMask |= mask
+ case riscv64REG_SP:
+ gpspMask |= mask
+ gpspsbMask |= mask
+ gpspsbgMask |= mask
+ default:
+ gpMask |= mask
+ gpgMask |= mask
+ gpspMask |= mask
+ gpspsbMask |= mask
+ gpspsbgMask |= mask
+ }
+ }
+
+	// Floating point registers.
+ for r := 32; r <= 63; r++ {
+ mask := addreg(r, "")
+ fpMask |= mask
+ }
+
+ // Pseudo-register: SB
+ mask := addreg(-1, "SB")
+ gpspsbMask |= mask
+ gpspsbgMask |= mask
+
+ if len(regNamesRISCV64) > 64 {
+ // regMask is only 64 bits.
+ panic("Too many RISCV64 registers")
+ }
+
+ regCtxt := regNamed["X20"]
+ callerSave := gpMask | fpMask | regNamed["g"]
+
+ var (
+ gpstore = regInfo{inputs: []regMask{gpspsbMask, gpspMask, 0}} // SB in first input so we can load from a global, but not in second to avoid using SB as a temporary register
+ gpstore0 = regInfo{inputs: []regMask{gpspsbMask}}
+ gp01 = regInfo{outputs: []regMask{gpMask}}
+ gp11 = regInfo{inputs: []regMask{gpMask}, outputs: []regMask{gpMask}}
+ gp21 = regInfo{inputs: []regMask{gpMask, gpMask}, outputs: []regMask{gpMask}}
+ gpload = regInfo{inputs: []regMask{gpspsbMask, 0}, outputs: []regMask{gpMask}}
+ gp11sb = regInfo{inputs: []regMask{gpspsbMask}, outputs: []regMask{gpMask}}
+ gpxchg = regInfo{inputs: []regMask{gpspsbgMask, gpgMask}, outputs: []regMask{gpMask}}
+ gpcas = regInfo{inputs: []regMask{gpspsbgMask, gpgMask, gpgMask}, outputs: []regMask{gpMask}}
+
+ fp11 = regInfo{inputs: []regMask{fpMask}, outputs: []regMask{fpMask}}
+ fp21 = regInfo{inputs: []regMask{fpMask, fpMask}, outputs: []regMask{fpMask}}
+ gpfp = regInfo{inputs: []regMask{gpMask}, outputs: []regMask{fpMask}}
+ fpgp = regInfo{inputs: []regMask{fpMask}, outputs: []regMask{gpMask}}
+ fpstore = regInfo{inputs: []regMask{gpspsbMask, fpMask, 0}}
+ fpload = regInfo{inputs: []regMask{gpspsbMask, 0}, outputs: []regMask{fpMask}}
+ fp2gp = regInfo{inputs: []regMask{fpMask, fpMask}, outputs: []regMask{gpMask}}
+
+ call = regInfo{clobbers: callerSave}
+ callClosure = regInfo{inputs: []regMask{gpspMask, regCtxt, 0}, clobbers: callerSave}
+ callInter = regInfo{inputs: []regMask{gpMask}, clobbers: callerSave}
+ )
+
+ RISCV64ops := []opData{
+ {name: "ADD", argLength: 2, reg: gp21, asm: "ADD", commutative: true}, // arg0 + arg1
+ {name: "ADDI", argLength: 1, reg: gp11sb, asm: "ADDI", aux: "Int64"}, // arg0 + auxint
+ {name: "ADDIW", argLength: 1, reg: gp11, asm: "ADDIW", aux: "Int64"}, // 32 low bits of arg0 + auxint, sign extended to 64 bits
+ {name: "NEG", argLength: 1, reg: gp11, asm: "NEG"}, // -arg0
+ {name: "NEGW", argLength: 1, reg: gp11, asm: "NEGW"}, // -arg0 of 32 bits, sign extended to 64 bits
+ {name: "SUB", argLength: 2, reg: gp21, asm: "SUB"}, // arg0 - arg1
+ {name: "SUBW", argLength: 2, reg: gp21, asm: "SUBW"}, // 32 low bits of arg 0 - 32 low bits of arg 1, sign extended to 64 bits
+
+ // M extension. H means high (i.e., it returns the top bits of
+ // the result). U means unsigned. W means word (i.e., 32-bit).
+ {name: "MUL", argLength: 2, reg: gp21, asm: "MUL", commutative: true, typ: "Int64"}, // arg0 * arg1
+ {name: "MULW", argLength: 2, reg: gp21, asm: "MULW", commutative: true, typ: "Int32"},
+ {name: "MULH", argLength: 2, reg: gp21, asm: "MULH", commutative: true, typ: "Int64"},
+ {name: "MULHU", argLength: 2, reg: gp21, asm: "MULHU", commutative: true, typ: "UInt64"},
+ {name: "DIV", argLength: 2, reg: gp21, asm: "DIV", typ: "Int64"}, // arg0 / arg1
+ {name: "DIVU", argLength: 2, reg: gp21, asm: "DIVU", typ: "UInt64"},
+ {name: "DIVW", argLength: 2, reg: gp21, asm: "DIVW", typ: "Int32"},
+ {name: "DIVUW", argLength: 2, reg: gp21, asm: "DIVUW", typ: "UInt32"},
+ {name: "REM", argLength: 2, reg: gp21, asm: "REM", typ: "Int64"}, // arg0 % arg1
+ {name: "REMU", argLength: 2, reg: gp21, asm: "REMU", typ: "UInt64"},
+ {name: "REMW", argLength: 2, reg: gp21, asm: "REMW", typ: "Int32"},
+ {name: "REMUW", argLength: 2, reg: gp21, asm: "REMUW", typ: "UInt32"},
+
+ {name: "MOVaddr", argLength: 1, reg: gp11sb, asm: "MOV", aux: "SymOff", rematerializeable: true, symEffect: "RdWr"}, // arg0 + auxint + offset encoded in aux
+ // auxint+aux == add auxint and the offset of the symbol in aux (if any) to the effective address
+
+ {name: "MOVBconst", reg: gp01, asm: "MOV", typ: "UInt8", aux: "Int8", rematerializeable: true}, // 8 low bits of auxint
+ {name: "MOVHconst", reg: gp01, asm: "MOV", typ: "UInt16", aux: "Int16", rematerializeable: true}, // 16 low bits of auxint
+ {name: "MOVWconst", reg: gp01, asm: "MOV", typ: "UInt32", aux: "Int32", rematerializeable: true}, // 32 low bits of auxint
+ {name: "MOVDconst", reg: gp01, asm: "MOV", typ: "UInt64", aux: "Int64", rematerializeable: true}, // auxint
+
+ // Loads: load <size> bits from arg0+auxint+aux and extend to 64 bits; arg1=mem
+ {name: "MOVBload", argLength: 2, reg: gpload, asm: "MOVB", aux: "SymOff", typ: "Int8", faultOnNilArg0: true, symEffect: "Read"}, // 8 bits, sign extend
+ {name: "MOVHload", argLength: 2, reg: gpload, asm: "MOVH", aux: "SymOff", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // 16 bits, sign extend
+ {name: "MOVWload", argLength: 2, reg: gpload, asm: "MOVW", aux: "SymOff", typ: "Int32", faultOnNilArg0: true, symEffect: "Read"}, // 32 bits, sign extend
+ {name: "MOVDload", argLength: 2, reg: gpload, asm: "MOV", aux: "SymOff", typ: "Int64", faultOnNilArg0: true, symEffect: "Read"}, // 64 bits
+ {name: "MOVBUload", argLength: 2, reg: gpload, asm: "MOVBU", aux: "SymOff", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // 8 bits, zero extend
+ {name: "MOVHUload", argLength: 2, reg: gpload, asm: "MOVHU", aux: "SymOff", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // 16 bits, zero extend
+ {name: "MOVWUload", argLength: 2, reg: gpload, asm: "MOVWU", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // 32 bits, zero extend
+
+ // Stores: store <size> lowest bits in arg1 to arg0+auxint+aux; arg2=mem
+ {name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 8 bits
+ {name: "MOVHstore", argLength: 3, reg: gpstore, asm: "MOVH", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 16 bits
+ {name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 32 bits
+ {name: "MOVDstore", argLength: 3, reg: gpstore, asm: "MOV", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 64 bits
+
+ // Stores: store <size> of zero in arg0+auxint+aux; arg1=mem
+ {name: "MOVBstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 8 bits
+ {name: "MOVHstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 16 bits
+ {name: "MOVWstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 32 bits
+ {name: "MOVDstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOV", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 64 bits
+
+ // Conversions
+ {name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB"}, // move from arg0, sign-extended from byte
+ {name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVH"}, // move from arg0, sign-extended from half
+ {name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW"}, // move from arg0, sign-extended from word
+ {name: "MOVDreg", argLength: 1, reg: gp11, asm: "MOV"}, // move from arg0
+ {name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, unsign-extended from byte
+ {name: "MOVHUreg", argLength: 1, reg: gp11, asm: "MOVHU"}, // move from arg0, unsign-extended from half
+ {name: "MOVWUreg", argLength: 1, reg: gp11, asm: "MOVWU"}, // move from arg0, unsign-extended from word
+
+ {name: "MOVDnop", argLength: 1, reg: regInfo{inputs: []regMask{gpMask}, outputs: []regMask{gpMask}}, resultInArg0: true}, // nop, return arg0 in same register
+
+ // Shift ops
+ {name: "SLL", argLength: 2, reg: gp21, asm: "SLL"}, // arg0 << (aux1 & 63)
+ {name: "SRA", argLength: 2, reg: gp21, asm: "SRA"}, // arg0 >> (aux1 & 63), signed
+ {name: "SRL", argLength: 2, reg: gp21, asm: "SRL"}, // arg0 >> (aux1 & 63), unsigned
+ {name: "SLLI", argLength: 1, reg: gp11, asm: "SLLI", aux: "Int64"}, // arg0 << auxint, shift amount 0-63
+ {name: "SRAI", argLength: 1, reg: gp11, asm: "SRAI", aux: "Int64"}, // arg0 >> auxint, signed, shift amount 0-63
+ {name: "SRLI", argLength: 1, reg: gp11, asm: "SRLI", aux: "Int64"}, // arg0 >> auxint, unsigned, shift amount 0-63
+
+ // Bitwise ops
+ {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true}, // arg0 ^ arg1
+ {name: "XORI", argLength: 1, reg: gp11, asm: "XORI", aux: "Int64"}, // arg0 ^ auxint
+ {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1
+ {name: "ORI", argLength: 1, reg: gp11, asm: "ORI", aux: "Int64"}, // arg0 | auxint
+ {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1
+ {name: "ANDI", argLength: 1, reg: gp11, asm: "ANDI", aux: "Int64"}, // arg0 & auxint
+ {name: "NOT", argLength: 1, reg: gp11, asm: "NOT"}, // ^arg0
+
+ // Generate boolean values
+ {name: "SEQZ", argLength: 1, reg: gp11, asm: "SEQZ"}, // arg0 == 0, result is 0 or 1
+ {name: "SNEZ", argLength: 1, reg: gp11, asm: "SNEZ"}, // arg0 != 0, result is 0 or 1
+ {name: "SLT", argLength: 2, reg: gp21, asm: "SLT"}, // arg0 < arg1, result is 0 or 1
+ {name: "SLTI", argLength: 1, reg: gp11, asm: "SLTI", aux: "Int64"}, // arg0 < auxint, result is 0 or 1
+ {name: "SLTU", argLength: 2, reg: gp21, asm: "SLTU"}, // arg0 < arg1, unsigned, result is 0 or 1
+ {name: "SLTIU", argLength: 1, reg: gp11, asm: "SLTIU", aux: "Int64"}, // arg0 < auxint, unsigned, result is 0 or 1
+
+ // MOVconvert converts between pointers and integers.
+ // We have a special op for this so as to not confuse GC
+ // (particularly stack maps). It takes a memory arg so it
+ // gets correctly ordered with respect to GC safepoints.
+ {name: "MOVconvert", argLength: 2, reg: gp11, asm: "MOV"}, // arg0, but converted to int/ptr as appropriate; arg1=mem
+
+ // Calls
+ {name: "CALLstatic", argLength: 1, reg: call, aux: "CallOff", call: true}, // call static function aux.(*gc.Sym). arg0=mem, auxint=argsize, returns mem
+ {name: "CALLclosure", argLength: 3, reg: callClosure, aux: "CallOff", call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+ {name: "CALLinter", argLength: 2, reg: callInter, aux: "CallOff", call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+
+ // duffzero
+ // arg0 = address of memory to zero (in X10, changed as side effect)
+ // arg1 = mem
+ // auxint = offset into duffzero code to start executing
+ // X1 (link register) changed because of function call
+ // returns mem
+ {
+ name: "DUFFZERO",
+ aux: "Int64",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{regNamed["X10"]},
+ clobbers: regNamed["X1"] | regNamed["X10"],
+ },
+ typ: "Mem",
+ faultOnNilArg0: true,
+ },
+
+ // duffcopy
+ // arg0 = address of dst memory (in X11, changed as side effect)
+ // arg1 = address of src memory (in X10, changed as side effect)
+ // arg2 = mem
+ // auxint = offset into duffcopy code to start executing
+ // X1 (link register) changed because of function call
+ // returns mem
+ {
+ name: "DUFFCOPY",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{regNamed["X11"], regNamed["X10"]},
+ clobbers: regNamed["X1"] | regNamed["X10"] | regNamed["X11"],
+ },
+ typ: "Mem",
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // Generic moves and zeros
+
+ // general unaligned zeroing
+ // arg0 = address of memory to zero (in X5, changed as side effect)
+ // arg1 = address of the last element to zero (inclusive)
+ // arg2 = mem
+ // auxint = element size
+ // returns mem
+ // mov ZERO, (X5)
+ // ADD $sz, X5
+ // BGEU Rarg1, X5, -2(PC)
+ {
+ name: "LoweredZero",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{regNamed["X5"], gpMask},
+ clobbers: regNamed["X5"],
+ },
+ typ: "Mem",
+ faultOnNilArg0: true,
+ },
+
+ // general unaligned move
+ // arg0 = address of dst memory (in X5, changed as side effect)
+ // arg1 = address of src memory (in X6, changed as side effect)
+ // arg2 = address of the last element of src (can't be X7 as we clobber it before using arg2)
+ // arg3 = mem
+ // auxint = alignment
+ // clobbers X7 as a tmp register.
+ // returns mem
+ // mov (X6), X7
+ // mov X7, (X5)
+ // ADD $sz, X5
+ // ADD $sz, X6
+ // BGEU Rarg2, X5, -4(PC)
+ {
+ name: "LoweredMove",
+ aux: "Int64",
+ argLength: 4,
+ reg: regInfo{
+ inputs: []regMask{regNamed["X5"], regNamed["X6"], gpMask &^ regNamed["X7"]},
+ clobbers: regNamed["X5"] | regNamed["X6"] | regNamed["X7"],
+ },
+ typ: "Mem",
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // Atomic loads.
+ // load from arg0. arg1=mem.
+ // returns <value,memory> so they can be properly ordered with other loads.
+ {name: "LoweredAtomicLoad8", argLength: 2, reg: gpload, faultOnNilArg0: true},
+ {name: "LoweredAtomicLoad32", argLength: 2, reg: gpload, faultOnNilArg0: true},
+ {name: "LoweredAtomicLoad64", argLength: 2, reg: gpload, faultOnNilArg0: true},
+
+ // Atomic stores.
+ // store arg1 to arg0. arg2=mem. returns memory.
+ {name: "LoweredAtomicStore8", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicStore32", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicStore64", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true},
+
+ // Atomic exchange.
+ // store arg1 to *arg0. arg2=mem. returns <old content of *arg0, memory>.
+ {name: "LoweredAtomicExchange32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicExchange64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
+
+ // Atomic add.
+ // *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>.
+ {name: "LoweredAtomicAdd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicAdd64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
+ // Atomic compare and swap.
+ // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory.
+ // if *arg0 == arg1 {
+ // *arg0 = arg2
+ // return (true, memory)
+ // } else {
+ // return (false, memory)
+ // }
+ // MOV $0, Rout
+ // LR (Rarg0), Rtmp
+ // BNE Rtmp, Rarg1, 3(PC)
+ // SC Rarg2, (Rarg0), Rtmp
+ // BNE Rtmp, ZERO, -3(PC)
+ // MOV $1, Rout
+ {name: "LoweredAtomicCas32", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicCas64", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
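A non-atomic Go model of the semantics spelled out above; in the real sequence the LR/SC pair is what makes the comparison and store a single atomic step:

func casModel(p *uint64, old, new uint64) bool {
	// Rout starts at 0 (failure) and becomes 1 only if the store lands.
	if *p != old {
		return false
	}
	*p = new
	return true
}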
+
+ // Lowering pass-throughs
+ {name: "LoweredNilCheck", argLength: 2, faultOnNilArg0: true, nilCheck: true, reg: regInfo{inputs: []regMask{gpspMask}}}, // arg0=ptr,arg1=mem, returns void. Faults if ptr is nil.
+ {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{regCtxt}}}, // scheduler ensures only at beginning of entry block
+
+ // LoweredGetCallerSP returns the SP of the caller of the current function.
+ {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true},
+
+ // LoweredGetCallerPC evaluates to the PC to which its "caller" will return.
+		// I.e., if f calls g and g calls getcallerpc,
+ // the result should be the PC within f that g will return to.
+ // See runtime/stubs.go for a more detailed discussion.
+ {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true},
+
+ // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+ // It saves all GP registers if necessary,
+ // but clobbers RA (LR) because it's a call
+ // and T6 (REG_TMP).
+ {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{regNamed["X5"], regNamed["X6"]}, clobbers: (callerSave &^ (gpMask | regNamed["g"])) | regNamed["X1"]}, clobberFlags: true, aux: "Sym", symEffect: "None"},
+
+ // There are three of these functions so that they can have three different register inputs.
+ // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
+ // default registers to match so we don't need to copy registers around unnecessarily.
+ {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{regNamed["X7"], regNamed["X28"]}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{regNamed["X6"], regNamed["X7"]}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+ {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{regNamed["X5"], regNamed["X6"]}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+
+ // F extension.
+ {name: "FADDS", argLength: 2, reg: fp21, asm: "FADDS", commutative: true, typ: "Float32"}, // arg0 + arg1
+ {name: "FSUBS", argLength: 2, reg: fp21, asm: "FSUBS", commutative: false, typ: "Float32"}, // arg0 - arg1
+ {name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true, typ: "Float32"}, // arg0 * arg1
+ {name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS", commutative: false, typ: "Float32"}, // arg0 / arg1
+ {name: "FSQRTS", argLength: 1, reg: fp11, asm: "FSQRTS", typ: "Float32"}, // sqrt(arg0)
+ {name: "FNEGS", argLength: 1, reg: fp11, asm: "FNEGS", typ: "Float32"}, // -arg0
+ {name: "FMVSX", argLength: 1, reg: gpfp, asm: "FMVSX", typ: "Float32"}, // reinterpret arg0 as float
+ {name: "FCVTSW", argLength: 1, reg: gpfp, asm: "FCVTSW", typ: "Float32"}, // float32(low 32 bits of arg0)
+ {name: "FCVTSL", argLength: 1, reg: gpfp, asm: "FCVTSL", typ: "Float32"}, // float32(arg0)
+ {name: "FCVTWS", argLength: 1, reg: fpgp, asm: "FCVTWS", typ: "Int32"}, // int32(arg0)
+ {name: "FCVTLS", argLength: 1, reg: fpgp, asm: "FCVTLS", typ: "Int64"}, // int64(arg0)
+ {name: "FMOVWload", argLength: 2, reg: fpload, asm: "MOVF", aux: "SymOff", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"}, // load float32 from arg0+auxint+aux
+ {name: "FMOVWstore", argLength: 3, reg: fpstore, asm: "MOVF", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store float32 to arg0+auxint+aux
+ {name: "FEQS", argLength: 2, reg: fp2gp, asm: "FEQS", commutative: true}, // arg0 == arg1
+ {name: "FNES", argLength: 2, reg: fp2gp, asm: "FNES", commutative: true}, // arg0 != arg1
+ {name: "FLTS", argLength: 2, reg: fp2gp, asm: "FLTS"}, // arg0 < arg1
+ {name: "FLES", argLength: 2, reg: fp2gp, asm: "FLES"}, // arg0 <= arg1
+
+ // D extension.
+ {name: "FADDD", argLength: 2, reg: fp21, asm: "FADDD", commutative: true, typ: "Float64"}, // arg0 + arg1
+ {name: "FSUBD", argLength: 2, reg: fp21, asm: "FSUBD", commutative: false, typ: "Float64"}, // arg0 - arg1
+ {name: "FMULD", argLength: 2, reg: fp21, asm: "FMULD", commutative: true, typ: "Float64"}, // arg0 * arg1
+ {name: "FDIVD", argLength: 2, reg: fp21, asm: "FDIVD", commutative: false, typ: "Float64"}, // arg0 / arg1
+ {name: "FSQRTD", argLength: 1, reg: fp11, asm: "FSQRTD", typ: "Float64"}, // sqrt(arg0)
+ {name: "FNEGD", argLength: 1, reg: fp11, asm: "FNEGD", typ: "Float64"}, // -arg0
+ {name: "FMVDX", argLength: 1, reg: gpfp, asm: "FMVDX", typ: "Float64"}, // reinterpret arg0 as float
+ {name: "FCVTDW", argLength: 1, reg: gpfp, asm: "FCVTDW", typ: "Float64"}, // float64(low 32 bits of arg0)
+ {name: "FCVTDL", argLength: 1, reg: gpfp, asm: "FCVTDL", typ: "Float64"}, // float64(arg0)
+ {name: "FCVTWD", argLength: 1, reg: fpgp, asm: "FCVTWD", typ: "Int32"}, // int32(arg0)
+ {name: "FCVTLD", argLength: 1, reg: fpgp, asm: "FCVTLD", typ: "Int64"}, // int64(arg0)
+ {name: "FCVTDS", argLength: 1, reg: fp11, asm: "FCVTDS", typ: "Float64"}, // float64(arg0)
+ {name: "FCVTSD", argLength: 1, reg: fp11, asm: "FCVTSD", typ: "Float32"}, // float32(arg0)
+ {name: "FMOVDload", argLength: 2, reg: fpload, asm: "MOVD", aux: "SymOff", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"}, // load float64 from arg0+auxint+aux
+		{name: "FMOVDstore", argLength: 3, reg: fpstore, asm: "MOVD", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"},   // store float64 to arg0+auxint+aux
+ {name: "FEQD", argLength: 2, reg: fp2gp, asm: "FEQD", commutative: true}, // arg0 == arg1
+ {name: "FNED", argLength: 2, reg: fp2gp, asm: "FNED", commutative: true}, // arg0 != arg1
+ {name: "FLTD", argLength: 2, reg: fp2gp, asm: "FLTD"}, // arg0 < arg1
+ {name: "FLED", argLength: 2, reg: fp2gp, asm: "FLED"}, // arg0 <= arg1
+ }
+
+ RISCV64blocks := []blockData{
+ {name: "BEQ", controls: 2},
+ {name: "BNE", controls: 2},
+ {name: "BLT", controls: 2},
+ {name: "BGE", controls: 2},
+ {name: "BLTU", controls: 2},
+ {name: "BGEU", controls: 2},
+
+ {name: "BEQZ", controls: 1},
+ {name: "BNEZ", controls: 1},
+ {name: "BLEZ", controls: 1},
+ {name: "BGEZ", controls: 1},
+ {name: "BLTZ", controls: 1},
+ {name: "BGTZ", controls: 1},
+ }
+
+ archs = append(archs, arch{
+ name: "RISCV64",
+ pkg: "cmd/internal/obj/riscv",
+ genfile: "../../riscv64/ssa.go",
+ ops: RISCV64ops,
+ blocks: RISCV64blocks,
+ regnames: regNamesRISCV64,
+ gpregmask: gpMask,
+ fpregmask: fpMask,
+ framepointerreg: -1, // not used
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/gen/S390X.rules b/src/cmd/compile/internal/ssa/gen/S390X.rules
new file mode 100644
index 0000000..384f2e8
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/S390X.rules
@@ -0,0 +1,1695 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Lowering arithmetic
+(Add(64|Ptr) ...) => (ADD ...)
+(Add(32|16|8) ...) => (ADDW ...)
+(Add32F x y) => (Select0 (FADDS x y))
+(Add64F x y) => (Select0 (FADD x y))
+
+(Sub(64|Ptr) ...) => (SUB ...)
+(Sub(32|16|8) ...) => (SUBW ...)
+(Sub32F x y) => (Select0 (FSUBS x y))
+(Sub64F x y) => (Select0 (FSUB x y))
+
+(Mul64 ...) => (MULLD ...)
+(Mul(32|16|8) ...) => (MULLW ...)
+(Mul32F ...) => (FMULS ...)
+(Mul64F ...) => (FMUL ...)
+(Mul64uhilo ...) => (MLGR ...)
+
+(Div32F ...) => (FDIVS ...)
+(Div64F ...) => (FDIV ...)
+
+(Div64 x y) => (DIVD x y)
+(Div64u ...) => (DIVDU ...)
+// DIVW/DIVWU has a 64-bit dividend and a 32-bit divisor,
+// so a sign/zero extension of the dividend is required.
+(Div32 x y) => (DIVW (MOVWreg x) y)
+(Div32u x y) => (DIVWU (MOVWZreg x) y)
+(Div16 x y) => (DIVW (MOVHreg x) (MOVHreg y))
+(Div16u x y) => (DIVWU (MOVHZreg x) (MOVHZreg y))
+(Div8 x y) => (DIVW (MOVBreg x) (MOVBreg y))
+(Div8u x y) => (DIVWU (MOVBZreg x) (MOVBZreg y))
+
+(Hmul(64|64u) ...) => (MULH(D|DU) ...)
+(Hmul32 x y) => (SRDconst [32] (MULLD (MOVWreg x) (MOVWreg y)))
+(Hmul32u x y) => (SRDconst [32] (MULLD (MOVWZreg x) (MOVWZreg y)))
+
+(Mod64 x y) => (MODD x y)
+(Mod64u ...) => (MODDU ...)
+// MODW/MODWU has a 64-bit dividend and a 32-bit divisor,
+// so a sign/zero extension of the dividend is required.
+(Mod32 x y) => (MODW (MOVWreg x) y)
+(Mod32u x y) => (MODWU (MOVWZreg x) y)
+(Mod16 x y) => (MODW (MOVHreg x) (MOVHreg y))
+(Mod16u x y) => (MODWU (MOVHZreg x) (MOVHZreg y))
+(Mod8 x y) => (MODW (MOVBreg x) (MOVBreg y))
+(Mod8u x y) => (MODWU (MOVBZreg x) (MOVBZreg y))
+
+// (x + y) / 2 with x>=y -> (x - y) / 2 + y
+(Avg64u <t> x y) => (ADD (SRDconst <t> (SUB <t> x y) [1]) y)
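A Go sketch of the identity; it is valid here because, as the comment notes, the op is only used where x >= y, so x-y cannot underflow:

func avg64u(x, y uint64) uint64 {
	return (x-y)>>1 + y // == (x+y)/2 without needing a 65-bit intermediate
}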
+
+(And64 ...) => (AND ...)
+(And(32|16|8) ...) => (ANDW ...)
+
+(Or64 ...) => (OR ...)
+(Or(32|16|8) ...) => (ORW ...)
+
+(Xor64 ...) => (XOR ...)
+(Xor(32|16|8) ...) => (XORW ...)
+
+(Neg64 ...) => (NEG ...)
+(Neg(32|16|8) ...) => (NEGW ...)
+(Neg32F ...) => (FNEGS ...)
+(Neg64F ...) => (FNEG ...)
+
+(Com64 ...) => (NOT ...)
+(Com(32|16|8) ...) => (NOTW ...)
+(NOT x) => (XOR (MOVDconst [-1]) x)
+(NOTW x) => (XORWconst [-1] x)
+
+// Lowering boolean ops
+(AndB ...) => (ANDW ...)
+(OrB ...) => (ORW ...)
+(Not x) => (XORWconst [1] x)
+
+// Lowering pointer arithmetic
+(OffPtr [off] ptr:(SP)) => (MOVDaddr [int32(off)] ptr)
+(OffPtr [off] ptr) && is32Bit(off) => (ADDconst [int32(off)] ptr)
+(OffPtr [off] ptr) => (ADD (MOVDconst [off]) ptr)
+
+// TODO: optimize these cases?
+(Ctz64NonZero ...) => (Ctz64 ...)
+(Ctz32NonZero ...) => (Ctz32 ...)
+
+// Ctz(x) = 64 - findLeftmostOne((x-1)&^x)
+(Ctz64 <t> x) => (SUB (MOVDconst [64]) (FLOGR (AND <t> (SUBconst <t> [1] x) (NOT <t> x))))
+(Ctz32 <t> x) => (SUB (MOVDconst [64]) (FLOGR (MOVWZreg (ANDW <t> (SUBWconst <t> [1] x) (NOTW <t> x)))))
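FLOGR effectively acts as a 64-bit leading-zero count (yielding 64 for a zero operand), which the BitLen64 rule below relies on as well, and (x-1)&^x isolates the trailing zeros of x as a block of ones. A quick Go check of the identity:

import "math/bits"

func ctzViaFLOGR(x uint64) int {
	// Equals bits.TrailingZeros64(x), including the x == 0 case (64).
	return 64 - bits.LeadingZeros64((x-1)&^x)
}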
+
+(BitLen64 x) => (SUB (MOVDconst [64]) (FLOGR x))
+
+// POPCNT treats the input register as a vector of 8 bytes, producing
+// a population count for each individual byte. For inputs larger than
+// a single byte we therefore need to sum the individual bytes produced
+// by the POPCNT instruction. For example, the following instruction
+// sequence could be used to calculate the population count of a 4-byte
+// value:
+//
+// MOVD $0x12345678, R1 // R1=0x12345678 <-- input
+// POPCNT R1, R2 // R2=0x02030404
+// SRW $16, R2, R3 // R3=0x00000203
+// ADDW R2, R3, R4 // R4=0x02030607
+// SRW $8, R4, R5 // R5=0x00020306
+// ADDW R4, R5, R6 // R6=0x0205090d
+// MOVBZ R6, R7 // R7=0x0000000d <-- result is 13
+//
+(PopCount8 x) => (POPCNT (MOVBZreg x))
+(PopCount16 x) => (MOVBZreg (SumBytes2 (POPCNT <typ.UInt16> x)))
+(PopCount32 x) => (MOVBZreg (SumBytes4 (POPCNT <typ.UInt32> x)))
+(PopCount64 x) => (MOVBZreg (SumBytes8 (POPCNT <typ.UInt64> x)))
+
+// SumBytes{2,4,8} pseudo operations sum the values of the rightmost
+// 2, 4 or 8 bytes respectively. The result is a single byte however
+// other bytes might contain junk so a zero extension is required if
+// the desired output type is larger than 1 byte.
+(SumBytes2 x) => (ADDW (SRWconst <typ.UInt8> x [8]) x)
+(SumBytes4 x) => (SumBytes2 (ADDW <typ.UInt16> (SRWconst <typ.UInt16> x [16]) x))
+(SumBytes8 x) => (SumBytes4 (ADDW <typ.UInt32> (SRDconst <typ.UInt32> x [32]) x))
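Continuing the worked example above (POPCNT result 0x02030404), the SumBytes4/SumBytes2 fold can be modeled as:

func sumBytes4(x uint32) uint8 {
	x += x >> 16    // SumBytes4: fold the upper half-word onto the lower
	x += x >> 8     // SumBytes2: fold the upper byte onto the lower
	return uint8(x) // other bytes may hold junk, hence the final MOVBZreg
}

// sumBytes4(0x02030404) == 13, matching the instruction trace above.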
+
+(Bswap64 ...) => (MOVDBR ...)
+(Bswap32 ...) => (MOVWBR ...)
+
+// add with carry
+(Select0 (Add64carry x y c))
+ => (Select0 <typ.UInt64> (ADDE x y (Select1 <types.TypeFlags> (ADDCconst c [-1]))))
+(Select1 (Add64carry x y c))
+ => (Select0 <typ.UInt64> (ADDE (MOVDconst [0]) (MOVDconst [0]) (Select1 <types.TypeFlags> (ADDE x y (Select1 <types.TypeFlags> (ADDCconst c [-1]))))))
+
+// subtract with borrow
+(Select0 (Sub64borrow x y c))
+ => (Select0 <typ.UInt64> (SUBE x y (Select1 <types.TypeFlags> (SUBC (MOVDconst [0]) c))))
+(Select1 (Sub64borrow x y c))
+ => (NEG (Select0 <typ.UInt64> (SUBE (MOVDconst [0]) (MOVDconst [0]) (Select1 <types.TypeFlags> (SUBE x y (Select1 <types.TypeFlags> (SUBC (MOVDconst [0]) c)))))))
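These lower the generic carry/borrow ops onto flag-carrying ADDE/SUBE chains; the (ADDCconst c [-1]) sets the machine carry flag exactly when the 0/1 carry input c is 1. The reference semantics are those of math/bits:

import "math/bits"

// Select0/Select1 of Add64carry correspond to sum/carryOut, and the
// Sub64borrow pair to diff/borrowOut.
func add64carry(x, y, c uint64) (sum, carryOut uint64)    { return bits.Add64(x, y, c) }
func sub64borrow(x, y, b uint64) (diff, borrowOut uint64) { return bits.Sub64(x, y, b) }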
+
+// math package intrinsics
+(Sqrt ...) => (FSQRT ...)
+(Floor x) => (FIDBR [7] x)
+(Ceil x) => (FIDBR [6] x)
+(Trunc x) => (FIDBR [5] x)
+(RoundToEven x) => (FIDBR [4] x)
+(Round x) => (FIDBR [1] x)
+(FMA x y z) => (FMADD z x y)
+
+// Atomic loads and stores.
+// The SYNC instruction (fast-BCR-serialization) prevents store-load
+// reordering. Other sequences of memory operations (load-load,
+// store-store and load-store) are already guaranteed not to be reordered.
+(AtomicLoad(8|32|Acq32|64|Ptr) ptr mem) => (MOV(BZ|WZ|WZ|D|D)atomicload ptr mem)
+(AtomicStore(8|32|64|PtrNoWB) ptr val mem) => (SYNC (MOV(B|W|D|D)atomicstore ptr val mem))
+
+// Store-release doesn't require store-load ordering.
+(AtomicStoreRel32 ptr val mem) => (MOVWatomicstore ptr val mem)
+
+// Atomic adds.
+(AtomicAdd32 ptr val mem) => (AddTupleFirst32 val (LAA ptr val mem))
+(AtomicAdd64 ptr val mem) => (AddTupleFirst64 val (LAAG ptr val mem))
+(Select0 <t> (AddTupleFirst32 val tuple)) => (ADDW val (Select0 <t> tuple))
+(Select1 (AddTupleFirst32 _ tuple)) => (Select1 tuple)
+(Select0 <t> (AddTupleFirst64 val tuple)) => (ADD val (Select0 <t> tuple))
+(Select1 (AddTupleFirst64 _ tuple)) => (Select1 tuple)
+
+// Atomic exchanges.
+(AtomicExchange32 ptr val mem) => (LoweredAtomicExchange32 ptr val mem)
+(AtomicExchange64 ptr val mem) => (LoweredAtomicExchange64 ptr val mem)
+
+// Atomic compare and swap.
+(AtomicCompareAndSwap32 ptr old new_ mem) => (LoweredAtomicCas32 ptr old new_ mem)
+(AtomicCompareAndSwap64 ptr old new_ mem) => (LoweredAtomicCas64 ptr old new_ mem)
+
+// Atomic and: *(*uint8)(ptr) &= val
+//
+// Round pointer down to nearest word boundary and pad value with ones before
+// applying atomic AND operation to target word.
+//
+// *(*uint32)(ptr &^ 3) &= rotateleft(uint32(val) | 0xffffff00, ((3 << 3) ^ ((ptr & 3) << 3)))
+//
+(AtomicAnd8 ptr val mem)
+ => (LANfloor
+ ptr
+ (RLL <typ.UInt32>
+ (ORWconst <typ.UInt32> val [-1<<8])
+ (RXSBG <typ.UInt32> {s390x.NewRotateParams(59, 60, 3)} (MOVDconst [3<<3]) ptr))
+ mem)
+
+// Atomic or: *(*uint8)(ptr) |= val
+//
+// Round pointer down to nearest word boundary and pad value with zeros before
+// applying atomic OR operation to target word.
+//
+// *(*uint32)(ptr &^ 3) |= uint32(val) << ((3 << 3) ^ ((ptr & 3) << 3))
+//
+(AtomicOr8 ptr val mem)
+ => (LAOfloor
+ ptr
+ (SLW <typ.UInt32>
+ (MOVBZreg <typ.UInt32> val)
+ (RXSBG <typ.UInt32> {s390x.NewRotateParams(59, 60, 3)} (MOVDconst [3<<3]) ptr))
+ mem)
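A Go sketch of the word masks these two rules construct; s390x is big-endian, so the byte at ptr&3 == 0 is the most significant byte of the aligned word, and the actual read-modify-write is performed atomically by LAN/LAO on that word:

import "math/bits"

// andByteMask: val's byte at its big-endian position, ones elsewhere,
// so the word-wide AND changes only the targeted byte.
func andByteMask(val uint8, ptrLow2 uint) uint32 {
	shift := (3 << 3) ^ ((ptrLow2 & 3) << 3)
	return bits.RotateLeft32(uint32(val)|0xffffff00, int(shift))
}

// orByteMask: val's byte shifted into place, zeros elsewhere.
func orByteMask(val uint8, ptrLow2 uint) uint32 {
	shift := (3 << 3) ^ ((ptrLow2 & 3) << 3)
	return uint32(val) << shift
}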
+
+(AtomicAnd32 ...) => (LAN ...)
+(AtomicOr32 ...) => (LAO ...)
+
+// Lowering extension
+// Note: we always extend to 64 bits even though some ops don't need that many result bits.
+(SignExt8to(16|32|64) ...) => (MOVBreg ...)
+(SignExt16to(32|64) ...) => (MOVHreg ...)
+(SignExt32to64 ...) => (MOVWreg ...)
+
+(ZeroExt8to(16|32|64) ...) => (MOVBZreg ...)
+(ZeroExt16to(32|64) ...) => (MOVHZreg ...)
+(ZeroExt32to64 ...) => (MOVWZreg ...)
+
+(Slicemask <t> x) => (SRADconst (NEG <t> x) [63])
+
+// Lowering truncation
+// Because we ignore high parts of registers, truncates are just copies.
+(Trunc(16|32|64)to8 ...) => (Copy ...)
+(Trunc(32|64)to16 ...) => (Copy ...)
+(Trunc64to32 ...) => (Copy ...)
+
+// Lowering float <-> int
+(Cvt32to32F ...) => (CEFBRA ...)
+(Cvt32to64F ...) => (CDFBRA ...)
+(Cvt64to32F ...) => (CEGBRA ...)
+(Cvt64to64F ...) => (CDGBRA ...)
+
+(Cvt32Fto32 ...) => (CFEBRA ...)
+(Cvt32Fto64 ...) => (CGEBRA ...)
+(Cvt64Fto32 ...) => (CFDBRA ...)
+(Cvt64Fto64 ...) => (CGDBRA ...)
+
+// Lowering float <-> uint
+(Cvt32Uto32F ...) => (CELFBR ...)
+(Cvt32Uto64F ...) => (CDLFBR ...)
+(Cvt64Uto32F ...) => (CELGBR ...)
+(Cvt64Uto64F ...) => (CDLGBR ...)
+
+(Cvt32Fto32U ...) => (CLFEBR ...)
+(Cvt32Fto64U ...) => (CLGEBR ...)
+(Cvt64Fto32U ...) => (CLFDBR ...)
+(Cvt64Fto64U ...) => (CLGDBR ...)
+
+// Lowering float32 <-> float64
+(Cvt32Fto64F ...) => (LDEBR ...)
+(Cvt64Fto32F ...) => (LEDBR ...)
+
+(CvtBoolToUint8 ...) => (Copy ...)
+
+(Round(32|64)F ...) => (LoweredRound(32|64)F ...)
+
+// Lowering shifts
+
+// Lower bounded shifts first. No need to check shift value.
+(Lsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SLD x y)
+(Lsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y)
+(Lsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y)
+(Lsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y)
+(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRD x y)
+(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW x y)
+(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW (MOVHZreg x) y)
+(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW (MOVBZreg x) y)
+(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAD x y)
+(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW x y)
+(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW (MOVHreg x) y)
+(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW (MOVBreg x) y)
+
+// Unsigned shifts need to return 0 if shift amount is >= width of shifted value.
+// result = shift >= 64 ? 0 : arg << shift
+(Lsh(64|32|16|8)x64 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SL(D|W|W|W) <t> x y) (MOVDconst [0]) (CMPUconst y [64]))
+(Lsh(64|32|16|8)x32 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SL(D|W|W|W) <t> x y) (MOVDconst [0]) (CMPWUconst y [64]))
+(Lsh(64|32|16|8)x16 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SL(D|W|W|W) <t> x y) (MOVDconst [0]) (CMPWUconst (MOVHZreg y) [64]))
+(Lsh(64|32|16|8)x8 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SL(D|W|W|W) <t> x y) (MOVDconst [0]) (CMPWUconst (MOVBZreg y) [64]))
+
+(Rsh(64|32)Ux64 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SR(D|W) <t> x y) (MOVDconst [0]) (CMPUconst y [64]))
+(Rsh(64|32)Ux32 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SR(D|W) <t> x y) (MOVDconst [0]) (CMPWUconst y [64]))
+(Rsh(64|32)Ux16 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SR(D|W) <t> x y) (MOVDconst [0]) (CMPWUconst (MOVHZreg y) [64]))
+(Rsh(64|32)Ux8 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SR(D|W) <t> x y) (MOVDconst [0]) (CMPWUconst (MOVBZreg y) [64]))
+
+(Rsh(16|8)Ux64 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SRW <t> (MOV(H|B)Zreg x) y) (MOVDconst [0]) (CMPUconst y [64]))
+(Rsh(16|8)Ux32 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SRW <t> (MOV(H|B)Zreg x) y) (MOVDconst [0]) (CMPWUconst y [64]))
+(Rsh(16|8)Ux16 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SRW <t> (MOV(H|B)Zreg x) y) (MOVDconst [0]) (CMPWUconst (MOVHZreg y) [64]))
+(Rsh(16|8)Ux8 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SRW <t> (MOV(H|B)Zreg x) y) (MOVDconst [0]) (CMPWUconst (MOVBZreg y) [64]))
+
+// Signed right shift needs to return 0/-1 if shift amount is >= width of shifted value.
+// We implement this by setting the shift value to 63 (all ones) if the shift value is more than 63.
+// result = arg >> (shift >= 64 ? 63 : shift)
+(Rsh(64|32)x64 x y) => (SRA(D|W) x (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPUconst y [64])))
+(Rsh(64|32)x32 x y) => (SRA(D|W) x (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst y [64])))
+(Rsh(64|32)x16 x y) => (SRA(D|W) x (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst (MOVHZreg y) [64])))
+(Rsh(64|32)x8 x y) => (SRA(D|W) x (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst (MOVBZreg y) [64])))
+
+(Rsh(16|8)x64 x y) => (SRAW (MOV(H|B)reg x) (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPUconst y [64])))
+(Rsh(16|8)x32 x y) => (SRAW (MOV(H|B)reg x) (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst y [64])))
+(Rsh(16|8)x16 x y) => (SRAW (MOV(H|B)reg x) (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst (MOVHZreg y) [64])))
+(Rsh(16|8)x8 x y) => (SRAW (MOV(H|B)reg x) (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst (MOVBZreg y) [64])))
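In Go terms, the two families of rules above implement the following, shown for 64-bit operands:

// Unsigned and left shifts: the result is zero once the shift amount
// reaches the operand width.
func lsh64(x, shift uint64) uint64 {
	if shift >= 64 {
		return 0
	}
	return x << shift
}

// Signed right shifts: the amount saturates at 63, so oversized shifts
// yield 0 or -1 depending on the sign of x.
func rsh64(x int64, shift uint64) int64 {
	if shift >= 64 {
		shift = 63
	}
	return x >> shift
}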
+
+// Lowering rotates
+(RotateLeft8 <t> x (MOVDconst [c])) => (Or8 (Lsh8x64 <t> x (MOVDconst [c&7])) (Rsh8Ux64 <t> x (MOVDconst [-c&7])))
+(RotateLeft16 <t> x (MOVDconst [c])) => (Or16 (Lsh16x64 <t> x (MOVDconst [c&15])) (Rsh16Ux64 <t> x (MOVDconst [-c&15])))
+(RotateLeft32 ...) => (RLL ...)
+(RotateLeft64 ...) => (RLLG ...)
+
+// Lowering comparisons
+(Less64 x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMP x y))
+(Less32 x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPW x y))
+(Less(16|8) x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPW (MOV(H|B)reg x) (MOV(H|B)reg y)))
+(Less64U x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPU x y))
+(Less32U x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPWU x y))
+(Less(16|8)U x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPWU (MOV(H|B)Zreg x) (MOV(H|B)Zreg y)))
+(Less64F x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (FCMP x y))
+(Less32F x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (FCMPS x y))
+
+(Leq64 x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMP x y))
+(Leq32 x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPW x y))
+(Leq(16|8) x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPW (MOV(H|B)reg x) (MOV(H|B)reg y)))
+(Leq64U x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPU x y))
+(Leq32U x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPWU x y))
+(Leq(16|8)U x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPWU (MOV(H|B)Zreg x) (MOV(H|B)Zreg y)))
+(Leq64F x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (FCMP x y))
+(Leq32F x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (FCMPS x y))
+
+(Eq(64|Ptr) x y) => (LOCGR {s390x.Equal} (MOVDconst [0]) (MOVDconst [1]) (CMP x y))
+(Eq32 x y) => (LOCGR {s390x.Equal} (MOVDconst [0]) (MOVDconst [1]) (CMPW x y))
+(Eq(16|8|B) x y) => (LOCGR {s390x.Equal} (MOVDconst [0]) (MOVDconst [1]) (CMPW (MOV(H|B|B)reg x) (MOV(H|B|B)reg y)))
+(Eq64F x y) => (LOCGR {s390x.Equal} (MOVDconst [0]) (MOVDconst [1]) (FCMP x y))
+(Eq32F x y) => (LOCGR {s390x.Equal} (MOVDconst [0]) (MOVDconst [1]) (FCMPS x y))
+
+(Neq(64|Ptr) x y) => (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (CMP x y))
+(Neq32 x y) => (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPW x y))
+(Neq(16|8|B) x y) => (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPW (MOV(H|B|B)reg x) (MOV(H|B|B)reg y)))
+(Neq64F x y) => (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (FCMP x y))
+(Neq32F x y) => (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (FCMPS x y))
+
+// Lowering loads
+(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVDload ptr mem)
+(Load <t> ptr mem) && is32BitInt(t) && isSigned(t) => (MOVWload ptr mem)
+(Load <t> ptr mem) && is32BitInt(t) && !isSigned(t) => (MOVWZload ptr mem)
+(Load <t> ptr mem) && is16BitInt(t) && isSigned(t) => (MOVHload ptr mem)
+(Load <t> ptr mem) && is16BitInt(t) && !isSigned(t) => (MOVHZload ptr mem)
+(Load <t> ptr mem) && is8BitInt(t) && isSigned(t) => (MOVBload ptr mem)
+(Load <t> ptr mem) && (t.IsBoolean() || (is8BitInt(t) && !isSigned(t))) => (MOVBZload ptr mem)
+(Load <t> ptr mem) && is32BitFloat(t) => (FMOVSload ptr mem)
+(Load <t> ptr mem) && is64BitFloat(t) => (FMOVDload ptr mem)
+
+// Lowering stores
+// These more-specific FP versions of the Store pattern should come first.
+(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (FMOVDstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (FMOVSstore ptr val mem)
+
+(Store {t} ptr val mem) && t.Size() == 8 => (MOVDstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 => (MOVWstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem)
+
+// Lowering moves
+
+// Load and store for small copies.
+(Move [0] _ _ mem) => mem
+(Move [1] dst src mem) => (MOVBstore dst (MOVBZload src mem) mem)
+(Move [2] dst src mem) => (MOVHstore dst (MOVHZload src mem) mem)
+(Move [4] dst src mem) => (MOVWstore dst (MOVWZload src mem) mem)
+(Move [8] dst src mem) => (MOVDstore dst (MOVDload src mem) mem)
+(Move [16] dst src mem) =>
+ (MOVDstore [8] dst (MOVDload [8] src mem)
+ (MOVDstore dst (MOVDload src mem) mem))
+(Move [24] dst src mem) =>
+ (MOVDstore [16] dst (MOVDload [16] src mem)
+ (MOVDstore [8] dst (MOVDload [8] src mem)
+ (MOVDstore dst (MOVDload src mem) mem)))
+(Move [3] dst src mem) =>
+ (MOVBstore [2] dst (MOVBZload [2] src mem)
+ (MOVHstore dst (MOVHZload src mem) mem))
+(Move [5] dst src mem) =>
+ (MOVBstore [4] dst (MOVBZload [4] src mem)
+ (MOVWstore dst (MOVWZload src mem) mem))
+(Move [6] dst src mem) =>
+ (MOVHstore [4] dst (MOVHZload [4] src mem)
+ (MOVWstore dst (MOVWZload src mem) mem))
+(Move [7] dst src mem) =>
+ (MOVBstore [6] dst (MOVBZload [6] src mem)
+ (MOVHstore [4] dst (MOVHZload [4] src mem)
+ (MOVWstore dst (MOVWZload src mem) mem)))
+
+// MVC for other moves. Use up to 4 instructions (sizes up to 1024 bytes).
+(Move [s] dst src mem) && s > 0 && s <= 256 && logLargeCopy(v, s) =>
+ (MVC [makeValAndOff32(int32(s), 0)] dst src mem)
+(Move [s] dst src mem) && s > 256 && s <= 512 && logLargeCopy(v, s) =>
+ (MVC [makeValAndOff32(int32(s)-256, 256)] dst src (MVC [makeValAndOff32(256, 0)] dst src mem))
+(Move [s] dst src mem) && s > 512 && s <= 768 && logLargeCopy(v, s) =>
+ (MVC [makeValAndOff32(int32(s)-512, 512)] dst src (MVC [makeValAndOff32(256, 256)] dst src (MVC [makeValAndOff32(256, 0)] dst src mem)))
+(Move [s] dst src mem) && s > 768 && s <= 1024 && logLargeCopy(v, s) =>
+ (MVC [makeValAndOff32(int32(s)-768, 768)] dst src (MVC [makeValAndOff32(256, 512)] dst src (MVC [makeValAndOff32(256, 256)] dst src (MVC [makeValAndOff32(256, 0)] dst src mem))))
+
+// Move more than 1024 bytes using a loop.
+(Move [s] dst src mem) && s > 1024 && logLargeCopy(v, s) =>
+ (LoweredMove [s%256] dst src (ADD <src.Type> src (MOVDconst [(s/256)*256])) mem)
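So a large copy is split into s/256 full 256-byte blocks handled by the loop, with the auxint (s%256) covering the remaining tail; a trivial sketch of that split:

func movePlan(s int64) (loopBytes, tail int64) {
	return (s / 256) * 256, s % 256 // bytes copied by the loop, then the tail
}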
+
+// Lowering Zero instructions
+(Zero [0] _ mem) => mem
+(Zero [1] destptr mem) => (MOVBstoreconst [0] destptr mem)
+(Zero [2] destptr mem) => (MOVHstoreconst [0] destptr mem)
+(Zero [4] destptr mem) => (MOVWstoreconst [0] destptr mem)
+(Zero [8] destptr mem) => (MOVDstoreconst [0] destptr mem)
+(Zero [3] destptr mem) =>
+ (MOVBstoreconst [makeValAndOff32(0,2)] destptr
+ (MOVHstoreconst [0] destptr mem))
+(Zero [5] destptr mem) =>
+ (MOVBstoreconst [makeValAndOff32(0,4)] destptr
+ (MOVWstoreconst [0] destptr mem))
+(Zero [6] destptr mem) =>
+ (MOVHstoreconst [makeValAndOff32(0,4)] destptr
+ (MOVWstoreconst [0] destptr mem))
+(Zero [7] destptr mem) =>
+ (MOVWstoreconst [makeValAndOff32(0,3)] destptr
+ (MOVWstoreconst [0] destptr mem))
+
+(Zero [s] destptr mem) && s > 0 && s <= 1024 =>
+ (CLEAR [makeValAndOff32(int32(s), 0)] destptr mem)
+
+// Zero more than 1024 bytes using a loop.
+(Zero [s] destptr mem) && s > 1024 =>
+ (LoweredZero [s%256] destptr (ADDconst <destptr.Type> destptr [(int32(s)/256)*256]) mem)
+
+// Lowering constants
+(Const(64|32|16|8) [val]) => (MOVDconst [int64(val)])
+(Const(32|64)F ...) => (FMOV(S|D)const ...)
+(ConstNil) => (MOVDconst [0])
+(ConstBool [b]) => (MOVDconst [b2i(b)])
+
+// Lowering calls
+(StaticCall ...) => (CALLstatic ...)
+(ClosureCall ...) => (CALLclosure ...)
+(InterCall ...) => (CALLinter ...)
+
+// Miscellaneous
+(IsNonNil p) => (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPconst p [0]))
+(IsInBounds idx len) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPU idx len))
+(IsSliceInBounds idx len) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPU idx len))
+(NilCheck ...) => (LoweredNilCheck ...)
+(GetG ...) => (LoweredGetG ...)
+(GetClosurePtr ...) => (LoweredGetClosurePtr ...)
+(GetCallerSP ...) => (LoweredGetCallerSP ...)
+(GetCallerPC ...) => (LoweredGetCallerPC ...)
+(Addr {sym} base) => (MOVDaddr {sym} base)
+(LocalAddr {sym} base _) => (MOVDaddr {sym} base)
+(ITab (Load ptr mem)) => (MOVDload ptr mem)
+
+// block rewrites
+(If cond yes no) => (CLIJ {s390x.LessOrGreater} (MOVBZreg <typ.Bool> cond) [0] yes no)
+
+// Write barrier.
+(WB ...) => (LoweredWB ...)
+
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem)
+(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem)
+
+// ***************************
+// Above: lowering rules
+// Below: optimizations
+// ***************************
+// TODO: Should the optimizations be a separate pass?
+
+// Note on removing unnecessary sign/zero extensions:
+//
+// After a value is spilled it is restored using a sign- or zero-extension
+// to register-width as appropriate for its type. For example, a uint8 will
+// be restored using a MOVBZ (llgc) instruction which will zero extend the
+// 8-bit value to 64-bits.
+//
+// This is a hazard when folding sign- and zero-extensions since we need to
+// ensure not only that the value in the argument register is correctly
+// extended but also that it will still be correctly extended if it is
+// spilled and restored.
+//
+// In general this means we need type checks when the RHS of a rule is an
+// OpCopy (i.e. "(... x:(...) ...) -> x").
+
+// Merge double extensions.
+(MOV(H|HZ)reg e:(MOV(B|BZ)reg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x)
+(MOV(W|WZ)reg e:(MOV(B|BZ)reg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x)
+(MOV(W|WZ)reg e:(MOV(H|HZ)reg x)) && clobberIfDead(e) => (MOV(H|HZ)reg x)
+
+// Bypass redundant sign extensions.
+(MOV(B|BZ)reg e:(MOVBreg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x)
+(MOV(B|BZ)reg e:(MOVHreg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x)
+(MOV(B|BZ)reg e:(MOVWreg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x)
+(MOV(H|HZ)reg e:(MOVHreg x)) && clobberIfDead(e) => (MOV(H|HZ)reg x)
+(MOV(H|HZ)reg e:(MOVWreg x)) && clobberIfDead(e) => (MOV(H|HZ)reg x)
+(MOV(W|WZ)reg e:(MOVWreg x)) && clobberIfDead(e) => (MOV(W|WZ)reg x)
+
+// Bypass redundant zero extensions.
+(MOV(B|BZ)reg e:(MOVBZreg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x)
+(MOV(B|BZ)reg e:(MOVHZreg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x)
+(MOV(B|BZ)reg e:(MOVWZreg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x)
+(MOV(H|HZ)reg e:(MOVHZreg x)) && clobberIfDead(e) => (MOV(H|HZ)reg x)
+(MOV(H|HZ)reg e:(MOVWZreg x)) && clobberIfDead(e) => (MOV(H|HZ)reg x)
+(MOV(W|WZ)reg e:(MOVWZreg x)) && clobberIfDead(e) => (MOV(W|WZ)reg x)
+
+// Remove zero extensions after zero extending load.
+// Note: take care that if x is spilled it is restored correctly.
+(MOV(B|H|W)Zreg x:(MOVBZload _ _)) && (!x.Type.IsSigned() || x.Type.Size() > 1) => x
+(MOV(H|W)Zreg x:(MOVHZload _ _)) && (!x.Type.IsSigned() || x.Type.Size() > 2) => x
+(MOVWZreg x:(MOVWZload _ _)) && (!x.Type.IsSigned() || x.Type.Size() > 4) => x
+
+// Remove sign extensions after sign extending load.
+// Note: take care that if x is spilled it is restored correctly.
+(MOV(B|H|W)reg x:(MOVBload _ _)) && (x.Type.IsSigned() || x.Type.Size() == 8) => x
+(MOV(H|W)reg x:(MOVHload _ _)) && (x.Type.IsSigned() || x.Type.Size() == 8) => x
+(MOVWreg x:(MOVWload _ _)) && (x.Type.IsSigned() || x.Type.Size() == 8) => x
+
+// Remove sign extensions after zero extending load.
+// These type checks are probably unnecessary but do them anyway just in case.
+(MOV(H|W)reg x:(MOVBZload _ _)) && (!x.Type.IsSigned() || x.Type.Size() > 1) => x
+(MOVWreg x:(MOVHZload _ _)) && (!x.Type.IsSigned() || x.Type.Size() > 2) => x
+
+// Fold sign and zero extensions into loads.
+//
+// Note: The combined instruction must end up in the same block
+// as the original load. If not, we end up making a value with
+// a memory type live in two different blocks, which can lead to
+// multiple memory values alive simultaneously.
+//
+// Make sure we don't combine these ops if the load has another use.
+// This prevents a single load from being split into multiple loads
+// which then might return different values. See test/atomicload.go.
+(MOV(B|H|W)Zreg <t> x:(MOV(B|H|W)load [o] {s} p mem))
+ && x.Uses == 1
+ && clobber(x)
+ => @x.Block (MOV(B|H|W)Zload <t> [o] {s} p mem)
+(MOV(B|H|W)reg <t> x:(MOV(B|H|W)Zload [o] {s} p mem))
+ && x.Uses == 1
+ && clobber(x)
+ => @x.Block (MOV(B|H|W)load <t> [o] {s} p mem)
+
+// Remove zero extensions after argument load.
+(MOVBZreg x:(Arg <t>)) && !t.IsSigned() && t.Size() == 1 => x
+(MOVHZreg x:(Arg <t>)) && !t.IsSigned() && t.Size() <= 2 => x
+(MOVWZreg x:(Arg <t>)) && !t.IsSigned() && t.Size() <= 4 => x
+
+// Remove sign extensions after argument load.
+(MOVBreg x:(Arg <t>)) && t.IsSigned() && t.Size() == 1 => x
+(MOVHreg x:(Arg <t>)) && t.IsSigned() && t.Size() <= 2 => x
+(MOVWreg x:(Arg <t>)) && t.IsSigned() && t.Size() <= 4 => x
+
+// Fold zero extensions into constants.
+(MOVBZreg (MOVDconst [c])) => (MOVDconst [int64( uint8(c))])
+(MOVHZreg (MOVDconst [c])) => (MOVDconst [int64(uint16(c))])
+(MOVWZreg (MOVDconst [c])) => (MOVDconst [int64(uint32(c))])
+
+// Fold sign extensions into constants.
+(MOVBreg (MOVDconst [c])) => (MOVDconst [int64( int8(c))])
+(MOVHreg (MOVDconst [c])) => (MOVDconst [int64(int16(c))])
+(MOVWreg (MOVDconst [c])) => (MOVDconst [int64(int32(c))])
+
+// Remove zero extension of conditional move.
+// Note: only for MOVBZreg for now since it is added as part of 'if' statement lowering.
+(MOVBZreg x:(LOCGR (MOVDconst [c]) (MOVDconst [d]) _))
+ && int64(uint8(c)) == c
+ && int64(uint8(d)) == d
+ && (!x.Type.IsSigned() || x.Type.Size() > 1)
+ => x
+
+// Fold boolean tests into blocks.
+// Note: this must match If statement lowering.
+(CLIJ {s390x.LessOrGreater} (LOCGR {d} (MOVDconst [0]) (MOVDconst [x]) cmp) [0] yes no)
+ && int32(x) != 0
+ => (BRC {d} cmp yes no)
+
+// Canonicalize BRC condition code mask by removing impossible conditions.
+// Integer comparisons cannot generate the unordered condition.
+(BRC {c} x:((CMP|CMPW|CMPU|CMPWU) _ _) yes no) && c&s390x.Unordered != 0 => (BRC {c&^s390x.Unordered} x yes no)
+(BRC {c} x:((CMP|CMPW|CMPU|CMPWU)const _) yes no) && c&s390x.Unordered != 0 => (BRC {c&^s390x.Unordered} x yes no)
+
+// Compare-and-branch.
+// Note: bit 3 (unordered) must not be set so we mask out s390x.Unordered.
+(BRC {c} (CMP x y) yes no) => (CGRJ {c&^s390x.Unordered} x y yes no)
+(BRC {c} (CMPW x y) yes no) => (CRJ {c&^s390x.Unordered} x y yes no)
+(BRC {c} (CMPU x y) yes no) => (CLGRJ {c&^s390x.Unordered} x y yes no)
+(BRC {c} (CMPWU x y) yes no) => (CLRJ {c&^s390x.Unordered} x y yes no)
+
+// Compare-and-branch (immediate).
+// Note: bit 3 (unordered) must not be set so we mask out s390x.Unordered.
+(BRC {c} (CMPconst x [y]) yes no) && y == int32( int8(y)) => (CGIJ {c&^s390x.Unordered} x [ int8(y)] yes no)
+(BRC {c} (CMPWconst x [y]) yes no) && y == int32( int8(y)) => (CIJ {c&^s390x.Unordered} x [ int8(y)] yes no)
+(BRC {c} (CMPUconst x [y]) yes no) && y == int32(uint8(y)) => (CLGIJ {c&^s390x.Unordered} x [uint8(y)] yes no)
+(BRC {c} (CMPWUconst x [y]) yes no) && y == int32(uint8(y)) => (CLIJ {c&^s390x.Unordered} x [uint8(y)] yes no)
+
+// Absorb immediate into compare-and-branch.
+(C(R|GR)J {c} x (MOVDconst [y]) yes no) && is8Bit(y) => (C(I|GI)J {c} x [ int8(y)] yes no)
+(CL(R|GR)J {c} x (MOVDconst [y]) yes no) && isU8Bit(y) => (CL(I|GI)J {c} x [uint8(y)] yes no)
+(C(R|GR)J {c} (MOVDconst [x]) y yes no) && is8Bit(x) => (C(I|GI)J {c.ReverseComparison()} y [ int8(x)] yes no)
+(CL(R|GR)J {c} (MOVDconst [x]) y yes no) && isU8Bit(x) => (CL(I|GI)J {c.ReverseComparison()} y [uint8(x)] yes no)
+
+// Prefer comparison with immediate to compare-and-branch.
+(CGRJ {c} x (MOVDconst [y]) yes no) && !is8Bit(y) && is32Bit(y) => (BRC {c} (CMPconst x [int32(y)]) yes no)
+(CRJ {c} x (MOVDconst [y]) yes no) && !is8Bit(y) && is32Bit(y) => (BRC {c} (CMPWconst x [int32(y)]) yes no)
+(CLGRJ {c} x (MOVDconst [y]) yes no) && !isU8Bit(y) && isU32Bit(y) => (BRC {c} (CMPUconst x [int32(y)]) yes no)
+(CLRJ {c} x (MOVDconst [y]) yes no) && !isU8Bit(y) && isU32Bit(y) => (BRC {c} (CMPWUconst x [int32(y)]) yes no)
+(CGRJ {c} (MOVDconst [x]) y yes no) && !is8Bit(x) && is32Bit(x) => (BRC {c.ReverseComparison()} (CMPconst y [int32(x)]) yes no)
+(CRJ {c} (MOVDconst [x]) y yes no) && !is8Bit(x) && is32Bit(x) => (BRC {c.ReverseComparison()} (CMPWconst y [int32(x)]) yes no)
+(CLGRJ {c} (MOVDconst [x]) y yes no) && !isU8Bit(x) && isU32Bit(x) => (BRC {c.ReverseComparison()} (CMPUconst y [int32(x)]) yes no)
+(CLRJ {c} (MOVDconst [x]) y yes no) && !isU8Bit(x) && isU32Bit(x) => (BRC {c.ReverseComparison()} (CMPWUconst y [int32(x)]) yes no)
+
+// Absorb sign/zero extensions into 32-bit compare-and-branch.
+(CIJ {c} (MOV(W|WZ)reg x) [y] yes no) => (CIJ {c} x [y] yes no)
+(CLIJ {c} (MOV(W|WZ)reg x) [y] yes no) => (CLIJ {c} x [y] yes no)
+
+// Bring out-of-range signed immediates into range by varying branch condition.
+(BRC {s390x.Less} (CMPconst x [ 128]) yes no) => (CGIJ {s390x.LessOrEqual} x [ 127] yes no)
+(BRC {s390x.Less} (CMPWconst x [ 128]) yes no) => (CIJ {s390x.LessOrEqual} x [ 127] yes no)
+(BRC {s390x.LessOrEqual} (CMPconst x [-129]) yes no) => (CGIJ {s390x.Less} x [-128] yes no)
+(BRC {s390x.LessOrEqual} (CMPWconst x [-129]) yes no) => (CIJ {s390x.Less} x [-128] yes no)
+(BRC {s390x.Greater} (CMPconst x [-129]) yes no) => (CGIJ {s390x.GreaterOrEqual} x [-128] yes no)
+(BRC {s390x.Greater} (CMPWconst x [-129]) yes no) => (CIJ {s390x.GreaterOrEqual} x [-128] yes no)
+(BRC {s390x.GreaterOrEqual} (CMPconst x [ 128]) yes no) => (CGIJ {s390x.Greater} x [ 127] yes no)
+(BRC {s390x.GreaterOrEqual} (CMPWconst x [ 128]) yes no) => (CIJ {s390x.Greater} x [ 127] yes no)
+
+// Bring out-of-range unsigned immediates into range by varying branch condition.
+(BRC {s390x.Less} (CMP(WU|U)const x [256]) yes no) => (C(L|LG)IJ {s390x.LessOrEqual} x [255] yes no)
+(BRC {s390x.GreaterOrEqual} (CMP(WU|U)const x [256]) yes no) => (C(L|LG)IJ {s390x.Greater} x [255] yes no)
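+
+// For illustration only: the rewrites above rely on simple integer boundary
+// identities; a Go sketch (helper name ours) spelling them out:
+//
+//    func boundaryEquivalent(x int64, y uint64) bool {
+//        hi := (x < 128) == (x <= 127)   // signed 8-bit immediates top out at 127
+//        lo := (x <= -129) == (x < -128) // ... and bottom out at -128
+//        u := (y < 256) == (y <= 255)    // unsigned 8-bit immediates top out at 255
+//        return hi && lo && u            // always true
+//    }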
+
+// Bring out-of-range immediates into range by switching signedness (only == and !=).
+(BRC {c} (CMPconst x [y]) yes no) && y == int32(uint8(y)) && (c == s390x.Equal || c == s390x.LessOrGreater) => (CLGIJ {c} x [uint8(y)] yes no)
+(BRC {c} (CMPWconst x [y]) yes no) && y == int32(uint8(y)) && (c == s390x.Equal || c == s390x.LessOrGreater) => (CLIJ {c} x [uint8(y)] yes no)
+(BRC {c} (CMPUconst x [y]) yes no) && y == int32( int8(y)) && (c == s390x.Equal || c == s390x.LessOrGreater) => (CGIJ {c} x [ int8(y)] yes no)
+(BRC {c} (CMPWUconst x [y]) yes no) && y == int32( int8(y)) && (c == s390x.Equal || c == s390x.LessOrGreater) => (CIJ {c} x [ int8(y)] yes no)
+
+// Fold constants into instructions.
+(ADD x (MOVDconst [c])) && is32Bit(c) => (ADDconst [int32(c)] x)
+(ADDW x (MOVDconst [c])) => (ADDWconst [int32(c)] x)
+
+(SUB x (MOVDconst [c])) && is32Bit(c) => (SUBconst x [int32(c)])
+(SUB (MOVDconst [c]) x) && is32Bit(c) => (NEG (SUBconst <v.Type> x [int32(c)]))
+(SUBW x (MOVDconst [c])) => (SUBWconst x [int32(c)])
+(SUBW (MOVDconst [c]) x) => (NEGW (SUBWconst <v.Type> x [int32(c)]))
+
+(MULLD x (MOVDconst [c])) && is32Bit(c) => (MULLDconst [int32(c)] x)
+(MULLW x (MOVDconst [c])) => (MULLWconst [int32(c)] x)
+
+// NILF instructions leave the high 32 bits unchanged, which is
+// equivalent to ANDing with a mask whose leftmost 32 bits are all set.
+// TODO(mundaym): modify the assembler to accept 64-bit values
+// and use isU32Bit(^c).
+(AND x (MOVDconst [c]))
+ && s390x.NewRotateParams(0, 63, 0).OutMerge(uint64(c)) != nil
+ => (RISBGZ x {*s390x.NewRotateParams(0, 63, 0).OutMerge(uint64(c))})
+(AND x (MOVDconst [c]))
+ && is32Bit(c)
+ && c < 0
+ => (ANDconst [c] x)
+(AND x (MOVDconst [c]))
+ && is32Bit(c)
+ && c >= 0
+ => (MOVWZreg (ANDWconst <typ.UInt32> [int32(c)] x))
+
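+// For illustration only: a Go sketch (helper name ours) of why the AND rules
+// above can use ANDconst (NILF) only when the 32-bit constant is negative.
+// ANDing with the sign extension of a negative constant leaves the high
+// 32 bits of x untouched, while a non-negative constant clears them (hence
+// the MOVWZreg (ANDWconst) form):
+//
+//    func andWith32BitConst(x uint64, c int32) uint64 {
+//        if c < 0 {
+//            // high 32 bits of the mask are all ones: high half of x passes through
+//            return (x &^ 0xffffffff) | uint64(uint32(x)&uint32(c))
+//        }
+//        // high 32 bits of the mask are zero: the result fits in 32 bits
+//        return uint64(uint32(x) & uint32(c))
+//    }
+//
+// In both cases the result equals x & uint64(int64(c)).
+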
+(ANDW x (MOVDconst [c])) => (ANDWconst [int32(c)] x)
+
+((AND|ANDW)const [c] ((AND|ANDW)const [d] x)) => ((AND|ANDW)const [c&d] x)
+
+((OR|XOR) x (MOVDconst [c])) && isU32Bit(c) => ((OR|XOR)const [c] x)
+((OR|XOR)W x (MOVDconst [c])) => ((OR|XOR)Wconst [int32(c)] x)
+
+// Constant shifts.
+(S(LD|RD|RAD) x (MOVDconst [c])) => (S(LD|RD|RAD)const x [uint8(c&63)])
+(S(LW|RW|RAW) x (MOVDconst [c])) && c&32 == 0 => (S(LW|RW|RAW)const x [uint8(c&31)])
+(S(LW|RW) _ (MOVDconst [c])) && c&32 != 0 => (MOVDconst [0])
+(SRAW x (MOVDconst [c])) && c&32 != 0 => (SRAWconst x [31])
+
+// Shifts only use the rightmost 6 bits of the shift value.
+(S(LD|RD|RAD|LW|RW|RAW) x (RISBGZ y {r}))
+ && r.Amount == 0
+ && r.OutMask()&63 == 63
+ => (S(LD|RD|RAD|LW|RW|RAW) x y)
+(S(LD|RD|RAD|LW|RW|RAW) x (AND (MOVDconst [c]) y))
+ => (S(LD|RD|RAD|LW|RW|RAW) x (ANDWconst <typ.UInt32> [int32(c&63)] y))
+(S(LD|RD|RAD|LW|RW|RAW) x (ANDWconst [c] y)) && c&63 == 63
+ => (S(LD|RD|RAD|LW|RW|RAW) x y)
+(SLD x (MOV(W|H|B|WZ|HZ|BZ)reg y)) => (SLD x y)
+(SRD x (MOV(W|H|B|WZ|HZ|BZ)reg y)) => (SRD x y)
+(SRAD x (MOV(W|H|B|WZ|HZ|BZ)reg y)) => (SRAD x y)
+(SLW x (MOV(W|H|B|WZ|HZ|BZ)reg y)) => (SLW x y)
+(SRW x (MOV(W|H|B|WZ|HZ|BZ)reg y)) => (SRW x y)
+(SRAW x (MOV(W|H|B|WZ|HZ|BZ)reg y)) => (SRAW x y)
+
+// Match rotate by constant.
+(RLLG x (MOVDconst [c])) => (RISBGZ x {s390x.NewRotateParams(0, 63, uint8(c&63))})
+(RLL x (MOVDconst [c])) => (RLLconst x [uint8(c&31)])
+
+// Match rotate by constant pattern.
+((ADD|OR|XOR) (SLDconst x [c]) (SRDconst x [64-c])) => (RISBGZ x {s390x.NewRotateParams(0, 63, c)})
+((ADD|OR|XOR)W (SLWconst x [c]) (SRWconst x [32-c])) => (RLLconst x [c])
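+
+// For illustration only: a Go sketch (helper name ours, using math/bits) of
+// the shift pair that the 64-bit rule above recognizes as a rotation:
+//
+//    func isRotate(x uint64, c uint8) bool {
+//        // for 0 < c < 64, (x << c) | (x >> (64 - c)) is a left rotation by c
+//        return (x<<c)|(x>>(64-c)) == bits.RotateLeft64(x, int(c))
+//    }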
+
+// Signed 64-bit comparison with immediate.
+(CMP x (MOVDconst [c])) && is32Bit(c) => (CMPconst x [int32(c)])
+(CMP (MOVDconst [c]) x) && is32Bit(c) => (InvertFlags (CMPconst x [int32(c)]))
+
+// Unsigned 64-bit comparison with immediate.
+(CMPU x (MOVDconst [c])) && isU32Bit(c) => (CMPUconst x [int32(c)])
+(CMPU (MOVDconst [c]) x) && isU32Bit(c) => (InvertFlags (CMPUconst x [int32(c)]))
+
+// Signed and unsigned 32-bit comparison with immediate.
+(CMP(W|WU) x (MOVDconst [c])) => (CMP(W|WU)const x [int32(c)])
+(CMP(W|WU) (MOVDconst [c]) x) => (InvertFlags (CMP(W|WU)const x [int32(c)]))
+
+// Match (x >> c) << d to 'rotate then insert selected bits [into zero]'.
+(SLDconst (SRDconst x [c]) [d]) => (RISBGZ x {s390x.NewRotateParams(uint8(max8(0, int8(c-d))), 63-d, uint8(int8(d-c)&63))})
+
+// Match (x << c) >> d to 'rotate then insert selected bits [into zero]'.
+(SRDconst (SLDconst x [c]) [d]) => (RISBGZ x {s390x.NewRotateParams(d, uint8(min8(63, int8(63-c+d))), uint8(int8(c-d)&63))})
+
+// Absorb input zero extension into 'rotate then insert selected bits [into zero]'.
+(RISBGZ (MOVWZreg x) {r}) && r.InMerge(0xffffffff) != nil => (RISBGZ x {*r.InMerge(0xffffffff)})
+(RISBGZ (MOVHZreg x) {r}) && r.InMerge(0x0000ffff) != nil => (RISBGZ x {*r.InMerge(0x0000ffff)})
+(RISBGZ (MOVBZreg x) {r}) && r.InMerge(0x000000ff) != nil => (RISBGZ x {*r.InMerge(0x000000ff)})
+
+// Absorb 'rotate then insert selected bits [into zero]' into zero extension.
+(MOVWZreg (RISBGZ x {r})) && r.OutMerge(0xffffffff) != nil => (RISBGZ x {*r.OutMerge(0xffffffff)})
+(MOVHZreg (RISBGZ x {r})) && r.OutMerge(0x0000ffff) != nil => (RISBGZ x {*r.OutMerge(0x0000ffff)})
+(MOVBZreg (RISBGZ x {r})) && r.OutMerge(0x000000ff) != nil => (RISBGZ x {*r.OutMerge(0x000000ff)})
+
+// Absorb shift into 'rotate then insert selected bits [into zero]'.
+//
+// Any unsigned shift can be represented as a rotate and mask operation:
+//
+// x << c => RotateLeft64(x, c) & (^uint64(0) << c)
+// x >> c => RotateLeft64(x, -c) & (^uint64(0) >> c)
+//
+// Therefore when a shift is used as the input to a rotate then insert
+// selected bits instruction we can merge the two together. We just have
+// to be careful that the resultant mask is representable (non-zero and
+// contiguous). For example, assuming that x is variable and c, y and m
+// are constants, a shift followed by a rotate then insert selected bits
+// could be represented as:
+//
+// RotateLeft64(RotateLeft64(x, c) & (^uint64(0) << c), y) & m
+//
+// We can split the rotation by y into two, one rotate for x and one for
+// the mask:
+//
+// RotateLeft64(RotateLeft64(x, c), y) & (RotateLeft64(^uint64(0) << c, y)) & m
+//
+// The rotations of x by c followed by y can then be combined:
+//
+// RotateLeft64(x, c+y) & (RotateLeft64(^uint64(0) << c, y)) & m
+// ^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+// rotate mask
+//
+// To perform this optimization we therefore just need to check that it
+// is valid to merge the shift mask (^(uint64(0)<<c)) into the selected
+// bits mask (i.e. that the resultant mask is non-zero and contiguous).
+//
+(RISBGZ (SLDconst x [c]) {r}) && r.InMerge(^uint64(0)<<c) != nil => (RISBGZ x {(*r.InMerge(^uint64(0)<<c)).RotateLeft(c)})
+(RISBGZ (SRDconst x [c]) {r}) && r.InMerge(^uint64(0)>>c) != nil => (RISBGZ x {(*r.InMerge(^uint64(0)>>c)).RotateLeft(-c)})
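+
+// For illustration only: a Go sketch (helper name ours, using math/bits) of
+// the two rotate-and-mask identities used in the derivation above:
+//
+//    func shiftAsRotateAndMask(x uint64, c uint8) bool {
+//        left := x<<c == bits.RotateLeft64(x, int(c))&(^uint64(0)<<c)
+//        right := x>>c == bits.RotateLeft64(x, -int(c))&(^uint64(0)>>c)
+//        return left && right // holds for any x and 0 <= c < 64
+//    }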
+
+// Absorb 'rotate then insert selected bits [into zero]' into left shift.
+(SLDconst (RISBGZ x {r}) [c])
+ && s390x.NewRotateParams(0, 63-c, c).InMerge(r.OutMask()) != nil
+ => (RISBGZ x {(*s390x.NewRotateParams(0, 63-c, c).InMerge(r.OutMask())).RotateLeft(r.Amount)})
+
+// Absorb 'rotate then insert selected bits [into zero]' into right shift.
+(SRDconst (RISBGZ x {r}) [c])
+ && s390x.NewRotateParams(c, 63, -c&63).InMerge(r.OutMask()) != nil
+ => (RISBGZ x {(*s390x.NewRotateParams(c, 63, -c&63).InMerge(r.OutMask())).RotateLeft(r.Amount)})
+
+// Merge 'rotate then insert selected bits [into zero]' instructions together.
+(RISBGZ (RISBGZ x {y}) {z})
+ && z.InMerge(y.OutMask()) != nil
+ => (RISBGZ x {(*z.InMerge(y.OutMask())).RotateLeft(y.Amount)})
+
+// Convert RISBGZ into 64-bit shift (helps CSE).
+(RISBGZ x {r}) && r.End == 63 && r.Start == -r.Amount&63 => (SRDconst x [-r.Amount&63])
+(RISBGZ x {r}) && r.Start == 0 && r.End == 63-r.Amount => (SLDconst x [r.Amount])
+
+// Optimize single-bit isolation when it is known to be equivalent to
+// the most significant bit due to the mask produced by an arithmetic shift.
+// Simply isolate the most significant bit itself and place it in the
+// correct position.
+//
+// Example: (int64(x) >> 63) & 0x8 -> RISBGZ $60, $60, $4, Rsrc, Rdst
+(RISBGZ (SRADconst x [c]) {r})
+ && r.Start == r.End // single bit selected
+ && (r.Start+r.Amount)&63 <= c // equivalent to most significant bit of x
+ => (RISBGZ x {s390x.NewRotateParams(r.Start, r.Start, -r.Start&63)})
+
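+// For illustration only: a Go sketch (helper name ours, using math/bits)
+// checking the example above. Rotating x left by 4 moves the sign bit into
+// bit 3 (bit 60 counting from the MSB, as RISBGZ numbers bits), so isolating
+// that bit matches the arithmetic-shift-then-mask form:
+//
+//    func signBitToBit3(x uint64) bool {
+//        viaShiftMask := uint64(int64(x)>>63) & 0x8 // (int64(x) >> 63) & 0x8
+//        viaRotate := bits.RotateLeft64(x, 4) & 0x8 // sign bit rotated into bit 3
+//        return viaShiftMask == viaRotate           // always true
+//    }
+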
+// Canonicalize the order of arguments to comparisons - helps with CSE.
+((CMP|CMPW|CMPU|CMPWU) x y) && x.ID > y.ID => (InvertFlags ((CMP|CMPW|CMPU|CMPWU) y x))
+
+// Use sign/zero extend instead of RISBGZ.
+(RISBGZ x {r}) && r == s390x.NewRotateParams(56, 63, 0) => (MOVBZreg x)
+(RISBGZ x {r}) && r == s390x.NewRotateParams(48, 63, 0) => (MOVHZreg x)
+(RISBGZ x {r}) && r == s390x.NewRotateParams(32, 63, 0) => (MOVWZreg x)
+
+// Use sign/zero extend instead of ANDW.
+(ANDWconst [0x00ff] x) => (MOVBZreg x)
+(ANDWconst [0xffff] x) => (MOVHZreg x)
+
+// Strength reduce multiplication to the sum (or difference) of two powers of two.
+//
+// Examples:
+// 5x -> 4x + 1x
+// 10x -> 8x + 2x
+// 120x -> 128x - 8x
+// -120x -> 8x - 128x
+//
+// We know that the rightmost set bit of any non-negative value, once isolated,
+// must either be a power of 2 (because it is a single bit) or 0 (if the
+// original value is 0).
+// In all of these rules we use a rightmost bit calculation to determine one operand
+// for the addition or subtraction. We then just need to calculate if the other
+// operand is a valid power of 2 before we can match the rule.
+//
+// Notes:
+// - the generic rules have already matched single powers of two so we ignore them here
+// - isPowerOfTwo32 asserts that its argument is greater than 0
+// - c&(c-1) = clear rightmost bit
+// - c&^(c-1) = isolate rightmost bit
+
+// c = 2ˣ + 2ʸ => c - 2ˣ = 2ʸ
+(MULL(D|W)const <t> x [c]) && isPowerOfTwo32(c&(c-1))
+ => ((ADD|ADDW) (SL(D|W)const <t> x [uint8(log32(c&(c-1)))])
+ (SL(D|W)const <t> x [uint8(log32(c&^(c-1)))]))
+
+// c = 2ʸ - 2ˣ => c + 2ˣ = 2ʸ
+(MULL(D|W)const <t> x [c]) && isPowerOfTwo32(c+(c&^(c-1)))
+ => ((SUB|SUBW) (SL(D|W)const <t> x [uint8(log32(c+(c&^(c-1))))])
+ (SL(D|W)const <t> x [uint8(log32(c&^(c-1)))]))
+
+// c = 2ˣ - 2ʸ => -c + 2ˣ = 2ʸ
+(MULL(D|W)const <t> x [c]) && isPowerOfTwo32(-c+(-c&^(-c-1)))
+ => ((SUB|SUBW) (SL(D|W)const <t> x [uint8(log32(-c&^(-c-1)))])
+ (SL(D|W)const <t> x [uint8(log32(-c+(-c&^(-c-1))))]))
+
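+// For illustration only: a Go sketch (helper name ours) of the worked examples
+// above, checking the bit tricks for c = 10 (= 8 + 2) and c = 120 (= 128 - 8):
+//
+//    func strengthReduce(x int64) bool {
+//        // 10&(10-1) == 8 and 10&^(10-1) == 2, so 10*x == (x<<3)+(x<<1)
+//        sum := 10*x == x<<3+x<<1
+//        // 120+(120&^(120-1)) == 128 and 120&^(120-1) == 8, so 120*x == (x<<7)-(x<<3)
+//        diff := 120*x == x<<7-x<<3
+//        return sum && diff // holds for all x (overflow wraps identically on both sides)
+//    }
+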
+// Fold ADD into MOVDaddr. Odd offsets from SB shouldn't be folded (LARL can't handle them).
+(ADDconst [c] (MOVDaddr [d] {s} x:(SB))) && ((c+d)&1 == 0) && is32Bit(int64(c)+int64(d)) => (MOVDaddr [c+d] {s} x)
+(ADDconst [c] (MOVDaddr [d] {s} x)) && x.Op != OpSB && is20Bit(int64(c)+int64(d)) => (MOVDaddr [c+d] {s} x)
+(ADD idx (MOVDaddr [c] {s} ptr)) && ptr.Op != OpSB => (MOVDaddridx [c] {s} ptr idx)
+
+// fold ADDconst into MOVDaddrx
+(ADDconst [c] (MOVDaddridx [d] {s} x y)) && is20Bit(int64(c)+int64(d)) => (MOVDaddridx [c+d] {s} x y)
+(MOVDaddridx [c] {s} (ADDconst [d] x) y) && is20Bit(int64(c)+int64(d)) => (MOVDaddridx [c+d] {s} x y)
+(MOVDaddridx [c] {s} x (ADDconst [d] y)) && is20Bit(int64(c)+int64(d)) => (MOVDaddridx [c+d] {s} x y)
+
+// reverse ordering of compare instruction
+(LOCGR {c} x y (InvertFlags cmp)) => (LOCGR {c.ReverseComparison()} x y cmp)
+
+// replace load from same location as preceding store with copy
+(MOVDload [off] {sym} ptr1 (MOVDstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => x
+(MOVWload [off] {sym} ptr1 (MOVWstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (MOVWreg x)
+(MOVHload [off] {sym} ptr1 (MOVHstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (MOVHreg x)
+(MOVBload [off] {sym} ptr1 (MOVBstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (MOVBreg x)
+(MOVWZload [off] {sym} ptr1 (MOVWstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (MOVWZreg x)
+(MOVHZload [off] {sym} ptr1 (MOVHstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (MOVHZreg x)
+(MOVBZload [off] {sym} ptr1 (MOVBstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (MOVBZreg x)
+(MOVDload [off] {sym} ptr1 (FMOVDstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (LGDR x)
+(FMOVDload [off] {sym} ptr1 (MOVDstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (LDGR x)
+(FMOVDload [off] {sym} ptr1 (FMOVDstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => x
+(FMOVSload [off] {sym} ptr1 (FMOVSstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => x
+
+// prefer FPR <-> GPR moves over combined load ops
+(MULLDload <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) => (MULLD x (LGDR <t> y))
+(ADDload <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) => (ADD x (LGDR <t> y))
+(SUBload <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) => (SUB x (LGDR <t> y))
+(ORload <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) => (OR x (LGDR <t> y))
+(ANDload <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) => (AND x (LGDR <t> y))
+(XORload <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) => (XOR x (LGDR <t> y))
+
+// detect attempts to set/clear the sign bit
+// may need to be reworked when NIHH/OIHH are added
+(RISBGZ (LGDR <t> x) {r}) && r == s390x.NewRotateParams(1, 63, 0) => (LGDR <t> (LPDFR <x.Type> x))
+(LDGR <t> (RISBGZ x {r})) && r == s390x.NewRotateParams(1, 63, 0) => (LPDFR (LDGR <t> x))
+(OR (MOVDconst [-1<<63]) (LGDR <t> x)) => (LGDR <t> (LNDFR <x.Type> x))
+(LDGR <t> (OR (MOVDconst [-1<<63]) x)) => (LNDFR (LDGR <t> x))
+
+// detect attempts to set the sign bit with load
+(LDGR <t> x:(ORload <t1> [off] {sym} (MOVDconst [-1<<63]) ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (LNDFR <t> (LDGR <t> (MOVDload <t1> [off] {sym} ptr mem)))
+
+// detect copysign
+(OR (RISBGZ (LGDR x) {r}) (LGDR (LPDFR <t> y)))
+ && r == s390x.NewRotateParams(0, 0, 0)
+ => (LGDR (CPSDR <t> y x))
+(OR (RISBGZ (LGDR x) {r}) (MOVDconst [c]))
+ && c >= 0
+ && r == s390x.NewRotateParams(0, 0, 0)
+ => (LGDR (CPSDR <x.Type> (FMOVDconst <x.Type> [math.Float64frombits(uint64(c))]) x))
+(CPSDR y (FMOVDconst [c])) && !math.Signbit(c) => (LPDFR y)
+(CPSDR y (FMOVDconst [c])) && math.Signbit(c) => (LNDFR y)
+
+// absorb negations into set/clear sign bit
+(FNEG (LPDFR x)) => (LNDFR x)
+(FNEG (LNDFR x)) => (LPDFR x)
+(FNEGS (LPDFR x)) => (LNDFR x)
+(FNEGS (LNDFR x)) => (LPDFR x)
+
+// no need to convert float32 to float64 to set/clear sign bit
+(LEDBR (LPDFR (LDEBR x))) => (LPDFR x)
+(LEDBR (LNDFR (LDEBR x))) => (LNDFR x)
+
+// remove unnecessary FPR <-> GPR moves
+(LDGR (LGDR x)) => x
+(LGDR (LDGR x)) => x
+
+// Don't extend before storing
+(MOVWstore [off] {sym} ptr (MOVWreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWZreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHZreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBZreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+
+// Fold constants into memory operations.
+// Note that this is not always a good idea because if not all the uses of
+// the ADDconst get eliminated, we still have to compute the ADDconst and we now
+// have potentially two live values (ptr and (ADDconst [off] ptr)) instead of one.
+// Nevertheless, let's do it!
+(MOVDload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (MOVDload [off1+off2] {sym} ptr mem)
+(MOVWload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (MOVWload [off1+off2] {sym} ptr mem)
+(MOVHload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (MOVHload [off1+off2] {sym} ptr mem)
+(MOVBload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (MOVBload [off1+off2] {sym} ptr mem)
+(MOVWZload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (MOVWZload [off1+off2] {sym} ptr mem)
+(MOVHZload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (MOVHZload [off1+off2] {sym} ptr mem)
+(MOVBZload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (MOVBZload [off1+off2] {sym} ptr mem)
+(FMOVSload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (FMOVSload [off1+off2] {sym} ptr mem)
+(FMOVDload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (FMOVDload [off1+off2] {sym} ptr mem)
+
+(MOVDstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(int64(off1)+int64(off2)) => (MOVDstore [off1+off2] {sym} ptr val mem)
+(MOVWstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(int64(off1)+int64(off2)) => (MOVWstore [off1+off2] {sym} ptr val mem)
+(MOVHstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(int64(off1)+int64(off2)) => (MOVHstore [off1+off2] {sym} ptr val mem)
+(MOVBstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(int64(off1)+int64(off2)) => (MOVBstore [off1+off2] {sym} ptr val mem)
+(FMOVSstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(int64(off1)+int64(off2)) => (FMOVSstore [off1+off2] {sym} ptr val mem)
+(FMOVDstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(int64(off1)+int64(off2)) => (FMOVDstore [off1+off2] {sym} ptr val mem)
+
+(ADDload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (ADDload [off1+off2] {sym} x ptr mem)
+(ADDWload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (ADDWload [off1+off2] {sym} x ptr mem)
+(MULLDload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (MULLDload [off1+off2] {sym} x ptr mem)
+(MULLWload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (MULLWload [off1+off2] {sym} x ptr mem)
+(SUBload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (SUBload [off1+off2] {sym} x ptr mem)
+(SUBWload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (SUBWload [off1+off2] {sym} x ptr mem)
+
+(ANDload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (ANDload [off1+off2] {sym} x ptr mem)
+(ANDWload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (ANDWload [off1+off2] {sym} x ptr mem)
+(ORload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (ORload [off1+off2] {sym} x ptr mem)
+(ORWload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (ORWload [off1+off2] {sym} x ptr mem)
+(XORload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (XORload [off1+off2] {sym} x ptr mem)
+(XORWload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (XORWload [off1+off2] {sym} x ptr mem)
+
+// Fold constants into stores.
+(MOVDstore [off] {sym} ptr (MOVDconst [c]) mem) && is16Bit(c) && isU12Bit(int64(off)) && ptr.Op != OpSB =>
+ (MOVDstoreconst [makeValAndOff32(int32(c),off)] {sym} ptr mem)
+(MOVWstore [off] {sym} ptr (MOVDconst [c]) mem) && is16Bit(c) && isU12Bit(int64(off)) && ptr.Op != OpSB =>
+ (MOVWstoreconst [makeValAndOff32(int32(c),off)] {sym} ptr mem)
+(MOVHstore [off] {sym} ptr (MOVDconst [c]) mem) && isU12Bit(int64(off)) && ptr.Op != OpSB =>
+ (MOVHstoreconst [makeValAndOff32(int32(int16(c)),off)] {sym} ptr mem)
+(MOVBstore [off] {sym} ptr (MOVDconst [c]) mem) && is20Bit(int64(off)) && ptr.Op != OpSB =>
+ (MOVBstoreconst [makeValAndOff32(int32(int8(c)),off)] {sym} ptr mem)
+
+// Fold address offsets into constant stores.
+(MOVDstoreconst [sc] {s} (ADDconst [off] ptr) mem) && isU12Bit(sc.Off()+int64(off)) =>
+ (MOVDstoreconst [sc.addOffset32(off)] {s} ptr mem)
+(MOVWstoreconst [sc] {s} (ADDconst [off] ptr) mem) && isU12Bit(sc.Off()+int64(off)) =>
+ (MOVWstoreconst [sc.addOffset32(off)] {s} ptr mem)
+(MOVHstoreconst [sc] {s} (ADDconst [off] ptr) mem) && isU12Bit(sc.Off()+int64(off)) =>
+ (MOVHstoreconst [sc.addOffset32(off)] {s} ptr mem)
+(MOVBstoreconst [sc] {s} (ADDconst [off] ptr) mem) && is20Bit(sc.Off()+int64(off)) =>
+ (MOVBstoreconst [sc.addOffset32(off)] {s} ptr mem)
+
+// Merge address calculations into loads and stores.
+// Offsets from SB must not be merged into unaligned memory accesses because
+// loads/stores using PC-relative addressing directly must be aligned to the
+// size of the target.
+(MOVDload [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%8 == 0 && (off1+off2)%8 == 0)) =>
+ (MOVDload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVWZload [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%4 == 0 && (off1+off2)%4 == 0)) =>
+ (MOVWZload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVHZload [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%2 == 0 && (off1+off2)%2 == 0)) =>
+ (MOVHZload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVBZload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVBZload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(FMOVSload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (FMOVSload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(FMOVDload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (FMOVDload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+
+(MOVWload [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%4 == 0 && (off1+off2)%4 == 0)) =>
+ (MOVWload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVHload [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%2 == 0 && (off1+off2)%2 == 0)) =>
+ (MOVHload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOVBload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVBload [off1+off2] {mergeSym(sym1,sym2)} base mem)
+
+(MOVDstore [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%8 == 0 && (off1+off2)%8 == 0)) =>
+ (MOVDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOVWstore [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%4 == 0 && (off1+off2)%4 == 0)) =>
+ (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOVHstore [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%2 == 0 && (off1+off2)%2 == 0)) =>
+ (MOVHstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOVBstore [off1] {sym1} (MOVDaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(FMOVSstore [off1] {sym1} (MOVDaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (FMOVSstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(FMOVDstore [off1] {sym1} (MOVDaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+ (FMOVDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+
+(ADDload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (ADDload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+(ADDWload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (ADDWload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+(MULLDload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (MULLDload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+(MULLWload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (MULLWload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+(SUBload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (SUBload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+(SUBWload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (SUBWload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+
+(ANDload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (ANDload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+(ANDWload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (ANDWload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+(ORload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (ORload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+(ORWload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (ORWload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+(XORload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (XORload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+(XORWload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (XORWload [o1+o2] {mergeSym(s1, s2)} x ptr mem)
+
+// Cannot store constant to SB directly (no 'move relative long immediate' instructions).
+(MOVDstoreconst [sc] {sym1} (MOVDaddr [off] {sym2} ptr) mem) && ptr.Op != OpSB && canMergeSym(sym1, sym2) && sc.canAdd32(off) =>
+ (MOVDstoreconst [sc.addOffset32(off)] {mergeSym(sym1, sym2)} ptr mem)
+(MOVWstoreconst [sc] {sym1} (MOVDaddr [off] {sym2} ptr) mem) && ptr.Op != OpSB && canMergeSym(sym1, sym2) && sc.canAdd32(off) =>
+ (MOVWstoreconst [sc.addOffset32(off)] {mergeSym(sym1, sym2)} ptr mem)
+(MOVHstoreconst [sc] {sym1} (MOVDaddr [off] {sym2} ptr) mem) && ptr.Op != OpSB && canMergeSym(sym1, sym2) && sc.canAdd32(off) =>
+ (MOVHstoreconst [sc.addOffset32(off)] {mergeSym(sym1, sym2)} ptr mem)
+(MOVBstoreconst [sc] {sym1} (MOVDaddr [off] {sym2} ptr) mem) && ptr.Op != OpSB && canMergeSym(sym1, sym2) && sc.canAdd32(off) =>
+ (MOVBstoreconst [sc.addOffset32(off)] {mergeSym(sym1, sym2)} ptr mem)
+
+// MOVDaddr into MOVDaddridx
+(MOVDaddridx [off1] {sym1} (MOVDaddr [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB =>
+ (MOVDaddridx [off1+off2] {mergeSym(sym1,sym2)} x y)
+(MOVDaddridx [off1] {sym1} x (MOVDaddr [off2] {sym2} y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && y.Op != OpSB =>
+ (MOVDaddridx [off1+off2] {mergeSym(sym1,sym2)} x y)
+
+// Absorb InvertFlags into branches.
+(BRC {c} (InvertFlags cmp) yes no) => (BRC {c.ReverseComparison()} cmp yes no)
+
+// Constant comparisons.
+(CMPconst (MOVDconst [x]) [y]) && x==int64(y) => (FlagEQ)
+(CMPconst (MOVDconst [x]) [y]) && x<int64(y) => (FlagLT)
+(CMPconst (MOVDconst [x]) [y]) && x>int64(y) => (FlagGT)
+(CMPUconst (MOVDconst [x]) [y]) && uint64(x)==uint64(y) => (FlagEQ)
+(CMPUconst (MOVDconst [x]) [y]) && uint64(x)<uint64(y) => (FlagLT)
+(CMPUconst (MOVDconst [x]) [y]) && uint64(x)>uint64(y) => (FlagGT)
+
+(CMPWconst (MOVDconst [x]) [y]) && int32(x)==int32(y) => (FlagEQ)
+(CMPWconst (MOVDconst [x]) [y]) && int32(x)<int32(y) => (FlagLT)
+(CMPWconst (MOVDconst [x]) [y]) && int32(x)>int32(y) => (FlagGT)
+(CMPWUconst (MOVDconst [x]) [y]) && uint32(x)==uint32(y) => (FlagEQ)
+(CMPWUconst (MOVDconst [x]) [y]) && uint32(x)<uint32(y) => (FlagLT)
+(CMPWUconst (MOVDconst [x]) [y]) && uint32(x)>uint32(y) => (FlagGT)
+
+(CMP(W|WU)const (MOVBZreg _) [c]) && 0xff < c => (FlagLT)
+(CMP(W|WU)const (MOVHZreg _) [c]) && 0xffff < c => (FlagLT)
+
+(CMPconst (SRDconst _ [c]) [n]) && c > 0 && n < 0 => (FlagGT)
+(CMPWconst (SRWconst _ [c]) [n]) && c > 0 && n < 0 => (FlagGT)
+
+(CMPUconst (SRDconst _ [c]) [n]) && c > 0 && c < 64 && (1<<uint(64-c)) <= uint64(n) => (FlagLT)
+(CMPWUconst (SRWconst _ [c]) [n]) && c > 0 && c < 32 && (1<<uint(32-c)) <= uint32(n) => (FlagLT)
+
+(CMPWconst (ANDWconst _ [m]) [n]) && int32(m) >= 0 && int32(m) < int32(n) => (FlagLT)
+(CMPWUconst (ANDWconst _ [m]) [n]) && uint32(m) < uint32(n) => (FlagLT)
+
+(CMPconst (RISBGZ x {r}) [c]) && c > 0 && r.OutMask() < uint64(c) => (FlagLT)
+(CMPUconst (RISBGZ x {r}) [c]) && r.OutMask() < uint64(uint32(c)) => (FlagLT)
+
+// Constant compare-and-branch with immediate.
+(CGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal != 0 && int64(x) == int64(y) => (First yes no)
+(CGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less != 0 && int64(x) < int64(y) => (First yes no)
+(CGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater != 0 && int64(x) > int64(y) => (First yes no)
+(CIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal != 0 && int32(x) == int32(y) => (First yes no)
+(CIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less != 0 && int32(x) < int32(y) => (First yes no)
+(CIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater != 0 && int32(x) > int32(y) => (First yes no)
+(CLGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal != 0 && uint64(x) == uint64(y) => (First yes no)
+(CLGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less != 0 && uint64(x) < uint64(y) => (First yes no)
+(CLGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater != 0 && uint64(x) > uint64(y) => (First yes no)
+(CLIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal != 0 && uint32(x) == uint32(y) => (First yes no)
+(CLIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less != 0 && uint32(x) < uint32(y) => (First yes no)
+(CLIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater != 0 && uint32(x) > uint32(y) => (First yes no)
+(CGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal == 0 && int64(x) == int64(y) => (First no yes)
+(CGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less == 0 && int64(x) < int64(y) => (First no yes)
+(CGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater == 0 && int64(x) > int64(y) => (First no yes)
+(CIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal == 0 && int32(x) == int32(y) => (First no yes)
+(CIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less == 0 && int32(x) < int32(y) => (First no yes)
+(CIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater == 0 && int32(x) > int32(y) => (First no yes)
+(CLGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal == 0 && uint64(x) == uint64(y) => (First no yes)
+(CLGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less == 0 && uint64(x) < uint64(y) => (First no yes)
+(CLGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater == 0 && uint64(x) > uint64(y) => (First no yes)
+(CLIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal == 0 && uint32(x) == uint32(y) => (First no yes)
+(CLIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less == 0 && uint32(x) < uint32(y) => (First no yes)
+(CLIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater == 0 && uint32(x) > uint32(y) => (First no yes)
+
+// Constant compare-and-branch with immediate when unsigned comparison with zero.
+(C(L|LG)IJ {s390x.GreaterOrEqual} _ [0] yes no) => (First yes no)
+(C(L|LG)IJ {s390x.Less} _ [0] yes no) => (First no yes)
+
+// Constant compare-and-branch when operands match.
+(C(GR|R|LGR|LR)J {c} x y yes no) && x == y && c&s390x.Equal != 0 => (First yes no)
+(C(GR|R|LGR|LR)J {c} x y yes no) && x == y && c&s390x.Equal == 0 => (First no yes)
+
+// Convert 64-bit comparisons to 32-bit comparisons and signed comparisons
+// to unsigned comparisons.
+// Helps simplify constant comparison detection.
+(CM(P|PU)const (MOV(W|WZ)reg x) [c]) => (CMP(W|WU)const x [c])
+(CM(P|P|PU|PU)const x:(MOV(H|HZ|H|HZ)reg _) [c]) => (CMP(W|W|WU|WU)const x [c])
+(CM(P|P|PU|PU)const x:(MOV(B|BZ|B|BZ)reg _) [c]) => (CMP(W|W|WU|WU)const x [c])
+(CMPconst (MOV(WZ|W)reg x:(ANDWconst [m] _)) [c]) && int32(m) >= 0 && c >= 0 => (CMPWUconst x [c])
+(CMPUconst (MOV(WZ|W)reg x:(ANDWconst [m] _)) [c]) && int32(m) >= 0 => (CMPWUconst x [c])
+(CMPconst x:(SRDconst _ [c]) [n]) && c > 0 && n >= 0 => (CMPUconst x [n])
+(CMPWconst x:(SRWconst _ [c]) [n]) && c > 0 && n >= 0 => (CMPWUconst x [n])
+
+// Absorb sign and zero extensions into 32-bit comparisons.
+(CMP(W|W|WU|WU) x (MOV(W|WZ|W|WZ)reg y)) => (CMP(W|W|WU|WU) x y)
+(CMP(W|W|WU|WU) (MOV(W|WZ|W|WZ)reg x) y) => (CMP(W|W|WU|WU) x y)
+(CMP(W|W|WU|WU)const (MOV(W|WZ|W|WZ)reg x) [c]) => (CMP(W|W|WU|WU)const x [c])
+
+// Absorb flag constants into branches.
+(BRC {c} (FlagEQ) yes no) && c&s390x.Equal != 0 => (First yes no)
+(BRC {c} (FlagLT) yes no) && c&s390x.Less != 0 => (First yes no)
+(BRC {c} (FlagGT) yes no) && c&s390x.Greater != 0 => (First yes no)
+(BRC {c} (FlagOV) yes no) && c&s390x.Unordered != 0 => (First yes no)
+
+(BRC {c} (FlagEQ) yes no) && c&s390x.Equal == 0 => (First no yes)
+(BRC {c} (FlagLT) yes no) && c&s390x.Less == 0 => (First no yes)
+(BRC {c} (FlagGT) yes no) && c&s390x.Greater == 0 => (First no yes)
+(BRC {c} (FlagOV) yes no) && c&s390x.Unordered == 0 => (First no yes)
+
+// Absorb flag constants into SETxx ops.
+(LOCGR {c} _ x (FlagEQ)) && c&s390x.Equal != 0 => x
+(LOCGR {c} _ x (FlagLT)) && c&s390x.Less != 0 => x
+(LOCGR {c} _ x (FlagGT)) && c&s390x.Greater != 0 => x
+(LOCGR {c} _ x (FlagOV)) && c&s390x.Unordered != 0 => x
+
+(LOCGR {c} x _ (FlagEQ)) && c&s390x.Equal == 0 => x
+(LOCGR {c} x _ (FlagLT)) && c&s390x.Less == 0 => x
+(LOCGR {c} x _ (FlagGT)) && c&s390x.Greater == 0 => x
+(LOCGR {c} x _ (FlagOV)) && c&s390x.Unordered == 0 => x
+
+// Remove redundant *const ops
+(ADDconst [0] x) => x
+(ADDWconst [c] x) && int32(c)==0 => x
+(SUBconst [0] x) => x
+(SUBWconst [c] x) && int32(c) == 0 => x
+(ANDconst [0] _) => (MOVDconst [0])
+(ANDWconst [c] _) && int32(c)==0 => (MOVDconst [0])
+(ANDconst [-1] x) => x
+(ANDWconst [c] x) && int32(c)==-1 => x
+(ORconst [0] x) => x
+(ORWconst [c] x) && int32(c)==0 => x
+(ORconst [-1] _) => (MOVDconst [-1])
+(ORWconst [c] _) && int32(c)==-1 => (MOVDconst [-1])
+(XORconst [0] x) => x
+(XORWconst [c] x) && int32(c)==0 => x
+
+// Shifts by zero (may be inserted during multiplication strength reduction).
+((SLD|SLW|SRD|SRW|SRAD|SRAW)const x [0]) => x
+
+// Convert constant subtracts to constant adds.
+(SUBconst [c] x) && c != -(1<<31) => (ADDconst [-c] x)
+(SUBWconst [c] x) => (ADDWconst [-int32(c)] x)
+
+// generic constant folding
+// TODO: more of this
+(ADDconst [c] (MOVDconst [d])) => (MOVDconst [int64(c)+d])
+(ADDWconst [c] (MOVDconst [d])) => (MOVDconst [int64(c)+d])
+(ADDconst [c] (ADDconst [d] x)) && is32Bit(int64(c)+int64(d)) => (ADDconst [c+d] x)
+(ADDWconst [c] (ADDWconst [d] x)) => (ADDWconst [int32(c+d)] x)
+(SUBconst (MOVDconst [d]) [c]) => (MOVDconst [d-int64(c)])
+(SUBconst (SUBconst x [d]) [c]) && is32Bit(-int64(c)-int64(d)) => (ADDconst [-c-d] x)
+(SRADconst [c] (MOVDconst [d])) => (MOVDconst [d>>uint64(c)])
+(SRAWconst [c] (MOVDconst [d])) => (MOVDconst [int64(int32(d))>>uint64(c)])
+(NEG (MOVDconst [c])) => (MOVDconst [-c])
+(NEGW (MOVDconst [c])) => (MOVDconst [int64(int32(-c))])
+(MULLDconst [c] (MOVDconst [d])) => (MOVDconst [int64(c)*d])
+(MULLWconst [c] (MOVDconst [d])) => (MOVDconst [int64(c*int32(d))])
+(AND (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c&d])
+(ANDconst [c] (MOVDconst [d])) => (MOVDconst [c&d])
+(ANDWconst [c] (MOVDconst [d])) => (MOVDconst [int64(c)&d])
+(OR (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c|d])
+(ORconst [c] (MOVDconst [d])) => (MOVDconst [c|d])
+(ORWconst [c] (MOVDconst [d])) => (MOVDconst [int64(c)|d])
+(XOR (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c^d])
+(XORconst [c] (MOVDconst [d])) => (MOVDconst [c^d])
+(XORWconst [c] (MOVDconst [d])) => (MOVDconst [int64(c)^d])
+(LoweredRound32F x:(FMOVSconst)) => x
+(LoweredRound64F x:(FMOVDconst)) => x
+
+// generic simplifications
+// TODO: more of this
+(ADD x (NEG y)) => (SUB x y)
+(ADDW x (NEGW y)) => (SUBW x y)
+(SUB x x) => (MOVDconst [0])
+(SUBW x x) => (MOVDconst [0])
+(AND x x) => x
+(ANDW x x) => x
+(OR x x) => x
+(ORW x x) => x
+(XOR x x) => (MOVDconst [0])
+(XORW x x) => (MOVDconst [0])
+(NEG (ADDconst [c] (NEG x))) && c != -(1<<31) => (ADDconst [-c] x)
+(MOVBZreg (ANDWconst [m] x)) => (MOVWZreg (ANDWconst <typ.UInt32> [int32( uint8(m))] x))
+(MOVHZreg (ANDWconst [m] x)) => (MOVWZreg (ANDWconst <typ.UInt32> [int32(uint16(m))] x))
+(MOVBreg (ANDWconst [m] x)) && int8(m) >= 0 => (MOVWZreg (ANDWconst <typ.UInt32> [int32( uint8(m))] x))
+(MOVHreg (ANDWconst [m] x)) && int16(m) >= 0 => (MOVWZreg (ANDWconst <typ.UInt32> [int32(uint16(m))] x))
+
+// carry flag generation
+// (only constant fold carry of zero)
+(Select1 (ADDCconst (MOVDconst [c]) [d]))
+ && uint64(c+int64(d)) >= uint64(c) && c+int64(d) == 0
+ => (FlagEQ)
+(Select1 (ADDCconst (MOVDconst [c]) [d]))
+ && uint64(c+int64(d)) >= uint64(c) && c+int64(d) != 0
+ => (FlagLT)
+
+// borrow flag generation
+// (only constant fold borrow of zero)
+(Select1 (SUBC (MOVDconst [c]) (MOVDconst [d])))
+ && uint64(d) <= uint64(c) && c-d == 0
+ => (FlagGT)
+(Select1 (SUBC (MOVDconst [c]) (MOVDconst [d])))
+ && uint64(d) <= uint64(c) && c-d != 0
+ => (FlagOV)
+
+// add with carry
+(ADDE x y (FlagEQ)) => (ADDC x y)
+(ADDE x y (FlagLT)) => (ADDC x y)
+(ADDC x (MOVDconst [c])) && is16Bit(c) => (ADDCconst x [int16(c)])
+(Select0 (ADDCconst (MOVDconst [c]) [d])) => (MOVDconst [c+int64(d)])
+
+// subtract with borrow
+(SUBE x y (FlagGT)) => (SUBC x y)
+(SUBE x y (FlagOV)) => (SUBC x y)
+(Select0 (SUBC (MOVDconst [c]) (MOVDconst [d]))) => (MOVDconst [c-d])
+
+// collapse carry chain
+(ADDE x y (Select1 (ADDCconst [-1] (Select0 (ADDE (MOVDconst [0]) (MOVDconst [0]) c)))))
+ => (ADDE x y c)
+
+// collapse borrow chain
+(SUBE x y (Select1 (SUBC (MOVDconst [0]) (NEG (Select0 (SUBE (MOVDconst [0]) (MOVDconst [0]) c))))))
+ => (SUBE x y c)
+
+// branch on carry
+(C(G|LG)IJ {s390x.Equal} (Select0 (ADDE (MOVDconst [0]) (MOVDconst [0]) carry)) [0]) => (BRC {s390x.NoCarry} carry)
+(C(G|LG)IJ {s390x.Equal} (Select0 (ADDE (MOVDconst [0]) (MOVDconst [0]) carry)) [1]) => (BRC {s390x.Carry} carry)
+(C(G|LG)IJ {s390x.LessOrGreater} (Select0 (ADDE (MOVDconst [0]) (MOVDconst [0]) carry)) [0]) => (BRC {s390x.Carry} carry)
+(C(G|LG)IJ {s390x.LessOrGreater} (Select0 (ADDE (MOVDconst [0]) (MOVDconst [0]) carry)) [1]) => (BRC {s390x.NoCarry} carry)
+(C(G|LG)IJ {s390x.Greater} (Select0 (ADDE (MOVDconst [0]) (MOVDconst [0]) carry)) [0]) => (BRC {s390x.Carry} carry)
+
+// branch on borrow
+(C(G|LG)IJ {s390x.Equal} (NEG (Select0 (SUBE (MOVDconst [0]) (MOVDconst [0]) borrow))) [0]) => (BRC {s390x.NoBorrow} borrow)
+(C(G|LG)IJ {s390x.Equal} (NEG (Select0 (SUBE (MOVDconst [0]) (MOVDconst [0]) borrow))) [1]) => (BRC {s390x.Borrow} borrow)
+(C(G|LG)IJ {s390x.LessOrGreater} (NEG (Select0 (SUBE (MOVDconst [0]) (MOVDconst [0]) borrow))) [0]) => (BRC {s390x.Borrow} borrow)
+(C(G|LG)IJ {s390x.LessOrGreater} (NEG (Select0 (SUBE (MOVDconst [0]) (MOVDconst [0]) borrow))) [1]) => (BRC {s390x.NoBorrow} borrow)
+(C(G|LG)IJ {s390x.Greater} (NEG (Select0 (SUBE (MOVDconst [0]) (MOVDconst [0]) borrow))) [0]) => (BRC {s390x.Borrow} borrow)
+
+// fused multiply-add
+(Select0 (F(ADD|SUB) (FMUL y z) x)) => (FM(ADD|SUB) x y z)
+(Select0 (F(ADDS|SUBS) (FMULS y z) x)) => (FM(ADDS|SUBS) x y z)
+
+// Convert floating point comparisons against zero into 'load and test' instructions.
+(F(CMP|CMPS) x (FMOV(D|S)const [0.0])) => (LT(D|E)BR x)
+(F(CMP|CMPS) (FMOV(D|S)const [0.0]) x) => (InvertFlags (LT(D|E)BR <v.Type> x))
+
+// FSUB, FSUBS, FADD, FADDS now produce a condition code representing the
+// comparison of the result with 0.0. If a compare with zero instruction
+// (e.g. LTDBR) is following one of those instructions, we can use the
+// generated flag and remove the comparison instruction.
+// Note: when inserting Select1 ops we need to ensure they are in the
+// same block as their argument. We could also use @x.Block for this
+// but moving the flag generating value to a different block seems to
+// increase the likelihood that the flags value will have to be regenerated
+// by flagalloc which is not what we want.
+(LTDBR (Select0 x:(F(ADD|SUB) _ _))) && b == x.Block => (Select1 x)
+(LTEBR (Select0 x:(F(ADDS|SUBS) _ _))) && b == x.Block => (Select1 x)
+
+// Fold memory operations into operations.
+// Exclude global data (SB) because these instructions cannot handle relative addresses.
+// TODO(mundaym): indexed versions of these?
+((ADD|SUB|MULLD|AND|OR|XOR) <t> x g:(MOVDload [off] {sym} ptr mem))
+ && ptr.Op != OpSB
+ && is20Bit(int64(off))
+ && canMergeLoadClobber(v, g, x)
+ && clobber(g)
+ => ((ADD|SUB|MULLD|AND|OR|XOR)load <t> [off] {sym} x ptr mem)
+((ADD|SUB|MULL|AND|OR|XOR)W <t> x g:(MOVWload [off] {sym} ptr mem))
+ && ptr.Op != OpSB
+ && is20Bit(int64(off))
+ && canMergeLoadClobber(v, g, x)
+ && clobber(g)
+ => ((ADD|SUB|MULL|AND|OR|XOR)Wload <t> [off] {sym} x ptr mem)
+((ADD|SUB|MULL|AND|OR|XOR)W <t> x g:(MOVWZload [off] {sym} ptr mem))
+ && ptr.Op != OpSB
+ && is20Bit(int64(off))
+ && canMergeLoadClobber(v, g, x)
+ && clobber(g)
+ => ((ADD|SUB|MULL|AND|OR|XOR)Wload <t> [off] {sym} x ptr mem)
+
+// Combine constant stores into larger (unaligned) stores.
+// Avoid SB because constant stores to SB-relative offsets are emulated
+// by the assembler, and the emulation cannot handle unaligned offsets.
+(MOVBstoreconst [c] {s} p x:(MOVBstoreconst [a] {s} p mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && a.Off() + 1 == c.Off()
+ && clobber(x)
+ => (MOVHstoreconst [makeValAndOff32(c.Val32()&0xff | a.Val32()<<8, a.Off32())] {s} p mem)
+(MOVHstoreconst [c] {s} p x:(MOVHstoreconst [a] {s} p mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && a.Off() + 2 == c.Off()
+ && clobber(x)
+ => (MOVWstore [a.Off32()] {s} p (MOVDconst [int64(c.Val32()&0xffff | a.Val32()<<16)]) mem)
+(MOVWstoreconst [c] {s} p x:(MOVWstoreconst [a] {s} p mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && a.Off() + 4 == c.Off()
+ && clobber(x)
+ => (MOVDstore [a.Off32()] {s} p (MOVDconst [c.Val()&0xffffffff | a.Val()<<32]) mem)
+
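+// For illustration only: a Go sketch (helper name ours, using encoding/binary)
+// of the byte-merging arithmetic in the MOVBstoreconst rule above. s390x is
+// big-endian, so the constant stored at the lower offset becomes the high
+// byte of the merged halfword:
+//
+//    func mergeByteConsts(a, c uint8) bool {
+//        separate := [2]byte{a, c} // MOVBstoreconst [a] at off, then [c] at off+1
+//        var merged [2]byte
+//        binary.BigEndian.PutUint16(merged[:], uint16(c)|uint16(a)<<8)
+//        return separate == merged // one MOVHstoreconst writes the same bytes
+//    }
+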
+// Combine stores into larger (unaligned) stores.
+// It doesn't work on global data (based on SB) because stores with relative addressing
+// require that the memory operand be aligned.
+(MOVBstore [i] {s} p w x:(MOVBstore [i-1] {s} p (SRDconst [8] w) mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstore [i-1] {s} p w mem)
+(MOVBstore [i] {s} p w0:(SRDconst [j] w) x:(MOVBstore [i-1] {s} p (SRDconst [j+8] w) mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstore [i-1] {s} p w0 mem)
+(MOVBstore [i] {s} p w x:(MOVBstore [i-1] {s} p (SRWconst [8] w) mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstore [i-1] {s} p w mem)
+(MOVBstore [i] {s} p w0:(SRWconst [j] w) x:(MOVBstore [i-1] {s} p (SRWconst [j+8] w) mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHstore [i-1] {s} p w0 mem)
+(MOVHstore [i] {s} p w x:(MOVHstore [i-2] {s} p (SRDconst [16] w) mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstore [i-2] {s} p w mem)
+(MOVHstore [i] {s} p w0:(SRDconst [j] w) x:(MOVHstore [i-2] {s} p (SRDconst [j+16] w) mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstore [i-2] {s} p w0 mem)
+(MOVHstore [i] {s} p w x:(MOVHstore [i-2] {s} p (SRWconst [16] w) mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstore [i-2] {s} p w mem)
+(MOVHstore [i] {s} p w0:(SRWconst [j] w) x:(MOVHstore [i-2] {s} p (SRWconst [j+16] w) mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWstore [i-2] {s} p w0 mem)
+(MOVWstore [i] {s} p (SRDconst [32] w) x:(MOVWstore [i-4] {s} p w mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVDstore [i-4] {s} p w mem)
+(MOVWstore [i] {s} p w0:(SRDconst [j] w) x:(MOVWstore [i-4] {s} p (SRDconst [j+32] w) mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVDstore [i-4] {s} p w0 mem)
+
+// Combine stores into larger (unaligned) stores with the bytes reversed (little endian).
+// Store-with-bytes-reversed instructions do not support relative memory addresses,
+// so these stores can't operate on global data (SB).
+(MOVBstore [i] {s} p (SRDconst [8] w) x:(MOVBstore [i-1] {s} p w mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHBRstore [i-1] {s} p w mem)
+(MOVBstore [i] {s} p (SRDconst [j] w) x:(MOVBstore [i-1] {s} p w0:(SRDconst [j-8] w) mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHBRstore [i-1] {s} p w0 mem)
+(MOVBstore [i] {s} p (SRWconst [8] w) x:(MOVBstore [i-1] {s} p w mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHBRstore [i-1] {s} p w mem)
+(MOVBstore [i] {s} p (SRWconst [j] w) x:(MOVBstore [i-1] {s} p w0:(SRWconst [j-8] w) mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVHBRstore [i-1] {s} p w0 mem)
+(MOVHBRstore [i] {s} p (SRDconst [16] w) x:(MOVHBRstore [i-2] {s} p w mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWBRstore [i-2] {s} p w mem)
+(MOVHBRstore [i] {s} p (SRDconst [j] w) x:(MOVHBRstore [i-2] {s} p w0:(SRDconst [j-16] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWBRstore [i-2] {s} p w0 mem)
+(MOVHBRstore [i] {s} p (SRWconst [16] w) x:(MOVHBRstore [i-2] {s} p w mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWBRstore [i-2] {s} p w mem)
+(MOVHBRstore [i] {s} p (SRWconst [j] w) x:(MOVHBRstore [i-2] {s} p w0:(SRWconst [j-16] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVWBRstore [i-2] {s} p w0 mem)
+(MOVWBRstore [i] {s} p (SRDconst [32] w) x:(MOVWBRstore [i-4] {s} p w mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVDBRstore [i-4] {s} p w mem)
+(MOVWBRstore [i] {s} p (SRDconst [j] w) x:(MOVWBRstore [i-4] {s} p w0:(SRDconst [j-32] w) mem))
+ && x.Uses == 1
+ && clobber(x)
+ => (MOVDBRstore [i-4] {s} p w0 mem)
+
+// Combining byte loads into larger (unaligned) loads.
+
+// Big-endian loads
+
+(ORW x1:(MOVBZload [i1] {s} p mem)
+ sh:(SLWconst [8] x0:(MOVBZload [i0] {s} p mem)))
+ && i1 == i0+1
+ && p.Op != OpSB
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVHZload [i0] {s} p mem)
+
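+// For illustration only: a Go sketch (helper name ours, using encoding/binary)
+// of the pattern matched by these rules. Two adjacent zero-extended byte loads
+// combined with a shift and OR reproduce a big-endian halfword load:
+//
+//    func combineBytesBE(b []byte, i0 int) bool {
+//        x0 := uint16(b[i0])   // MOVBZload [i0]
+//        x1 := uint16(b[i0+1]) // MOVBZload [i1], i1 == i0+1
+//        return x1|x0<<8 == binary.BigEndian.Uint16(b[i0:]) // MOVHZload [i0]
+//    }
+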
+(OR x1:(MOVBZload [i1] {s} p mem)
+ sh:(SLDconst [8] x0:(MOVBZload [i0] {s} p mem)))
+ && i1 == i0+1
+ && p.Op != OpSB
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVHZload [i0] {s} p mem)
+
+(ORW x1:(MOVHZload [i1] {s} p mem)
+ sh:(SLWconst [16] x0:(MOVHZload [i0] {s} p mem)))
+ && i1 == i0+2
+ && p.Op != OpSB
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVWZload [i0] {s} p mem)
+
+(OR x1:(MOVHZload [i1] {s} p mem)
+ sh:(SLDconst [16] x0:(MOVHZload [i0] {s} p mem)))
+ && i1 == i0+2
+ && p.Op != OpSB
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVWZload [i0] {s} p mem)
+
+(OR x1:(MOVWZload [i1] {s} p mem)
+ sh:(SLDconst [32] x0:(MOVWZload [i0] {s} p mem)))
+ && i1 == i0+4
+ && p.Op != OpSB
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVDload [i0] {s} p mem)
+
+(ORW
+ s0:(SLWconst [j0] x0:(MOVBZload [i0] {s} p mem))
+ or:(ORW
+ s1:(SLWconst [j1] x1:(MOVBZload [i1] {s} p mem))
+ y))
+ && i1 == i0+1
+ && j1 == j0-8
+ && j1 % 16 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (ORW <v.Type> (SLWconst <v.Type> [j1] (MOVHZload [i0] {s} p mem)) y)
+
+(OR
+ s0:(SLDconst [j0] x0:(MOVBZload [i0] {s} p mem))
+ or:(OR
+ s1:(SLDconst [j1] x1:(MOVBZload [i1] {s} p mem))
+ y))
+ && i1 == i0+1
+ && j1 == j0-8
+ && j1 % 16 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (OR <v.Type> (SLDconst <v.Type> [j1] (MOVHZload [i0] {s} p mem)) y)
+
+(OR
+ s0:(SLDconst [j0] x0:(MOVHZload [i0] {s} p mem))
+ or:(OR
+ s1:(SLDconst [j1] x1:(MOVHZload [i1] {s} p mem))
+ y))
+ && i1 == i0+2
+ && j1 == j0-16
+ && j1 % 32 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (OR <v.Type> (SLDconst <v.Type> [j1] (MOVWZload [i0] {s} p mem)) y)
+
+// Little-endian loads
+
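+// Conversely, an ORW of a MOVBZload at i0 with an SLWconst [8] of a MOVBZload
+// at i0+1 computes p[i0] | p[i0+1]<<8, i.e. a little-endian 16-bit value, so it
+// is folded into a byte-reversed load such as (MOVHZreg (MOVHBRload [i0] {s} p mem)).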
+(ORW x0:(MOVBZload [i0] {s} p mem)
+ sh:(SLWconst [8] x1:(MOVBZload [i1] {s} p mem)))
+ && p.Op != OpSB
+ && i1 == i0+1
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVHZreg (MOVHBRload [i0] {s} p mem))
+
+(OR x0:(MOVBZload [i0] {s} p mem)
+ sh:(SLDconst [8] x1:(MOVBZload [i1] {s} p mem)))
+ && p.Op != OpSB
+ && i1 == i0+1
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, sh)
+ => @mergePoint(b,x0,x1) (MOVHZreg (MOVHBRload [i0] {s} p mem))
+
+(ORW r0:(MOVHZreg x0:(MOVHBRload [i0] {s} p mem))
+ sh:(SLWconst [16] r1:(MOVHZreg x1:(MOVHBRload [i1] {s} p mem))))
+ && i1 == i0+2
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && r0.Uses == 1
+ && r1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, r0, r1, sh)
+ => @mergePoint(b,x0,x1) (MOVWBRload [i0] {s} p mem)
+
+(OR r0:(MOVHZreg x0:(MOVHBRload [i0] {s} p mem))
+ sh:(SLDconst [16] r1:(MOVHZreg x1:(MOVHBRload [i1] {s} p mem))))
+ && i1 == i0+2
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && r0.Uses == 1
+ && r1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, r0, r1, sh)
+ => @mergePoint(b,x0,x1) (MOVWZreg (MOVWBRload [i0] {s} p mem))
+
+(OR r0:(MOVWZreg x0:(MOVWBRload [i0] {s} p mem))
+ sh:(SLDconst [32] r1:(MOVWZreg x1:(MOVWBRload [i1] {s} p mem))))
+ && i1 == i0+4
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && r0.Uses == 1
+ && r1.Uses == 1
+ && sh.Uses == 1
+ && mergePoint(b,x0,x1) != nil
+ && clobber(x0, x1, r0, r1, sh)
+ => @mergePoint(b,x0,x1) (MOVDBRload [i0] {s} p mem)
+
+(ORW
+ s1:(SLWconst [j1] x1:(MOVBZload [i1] {s} p mem))
+ or:(ORW
+ s0:(SLWconst [j0] x0:(MOVBZload [i0] {s} p mem))
+ y))
+ && p.Op != OpSB
+ && i1 == i0+1
+ && j1 == j0+8
+ && j0 % 16 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (ORW <v.Type> (SLWconst <v.Type> [j0] (MOVHZreg (MOVHBRload [i0] {s} p mem))) y)
+
+(OR
+ s1:(SLDconst [j1] x1:(MOVBZload [i1] {s} p mem))
+ or:(OR
+ s0:(SLDconst [j0] x0:(MOVBZload [i0] {s} p mem))
+ y))
+ && p.Op != OpSB
+ && i1 == i0+1
+ && j1 == j0+8
+ && j0 % 16 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (OR <v.Type> (SLDconst <v.Type> [j0] (MOVHZreg (MOVHBRload [i0] {s} p mem))) y)
+
+(OR
+ s1:(SLDconst [j1] r1:(MOVHZreg x1:(MOVHBRload [i1] {s} p mem)))
+ or:(OR
+ s0:(SLDconst [j0] r0:(MOVHZreg x0:(MOVHBRload [i0] {s} p mem)))
+ y))
+ && i1 == i0+2
+ && j1 == j0+16
+ && j0 % 32 == 0
+ && x0.Uses == 1
+ && x1.Uses == 1
+ && r0.Uses == 1
+ && r1.Uses == 1
+ && s0.Uses == 1
+ && s1.Uses == 1
+ && or.Uses == 1
+ && mergePoint(b,x0,x1,y) != nil
+ && clobber(x0, x1, r0, r1, s0, s1, or)
+ => @mergePoint(b,x0,x1,y) (OR <v.Type> (SLDconst <v.Type> [j0] (MOVWZreg (MOVWBRload [i0] {s} p mem))) y)
+
+// Combine stores into store multiples.
+// 32-bit
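+// For example, a MOVWstore at offset i feeding off a MOVWstore at offset i-4 of
+// the same pointer can be combined into a single STM2 at i-4; further adjacent
+// stores grow the group to STM3 and STM4.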
+(MOVWstore [i] {s} p w1 x:(MOVWstore [i-4] {s} p w0 mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && is20Bit(int64(i)-4)
+ && clobber(x)
+ => (STM2 [i-4] {s} p w0 w1 mem)
+(MOVWstore [i] {s} p w2 x:(STM2 [i-8] {s} p w0 w1 mem))
+ && x.Uses == 1
+ && is20Bit(int64(i)-8)
+ && clobber(x)
+ => (STM3 [i-8] {s} p w0 w1 w2 mem)
+(MOVWstore [i] {s} p w3 x:(STM3 [i-12] {s} p w0 w1 w2 mem))
+ && x.Uses == 1
+ && is20Bit(int64(i)-12)
+ && clobber(x)
+ => (STM4 [i-12] {s} p w0 w1 w2 w3 mem)
+(STM2 [i] {s} p w2 w3 x:(STM2 [i-8] {s} p w0 w1 mem))
+ && x.Uses == 1
+ && is20Bit(int64(i)-8)
+ && clobber(x)
+ => (STM4 [i-8] {s} p w0 w1 w2 w3 mem)
+// 64-bit
+(MOVDstore [i] {s} p w1 x:(MOVDstore [i-8] {s} p w0 mem))
+ && p.Op != OpSB
+ && x.Uses == 1
+ && is20Bit(int64(i)-8)
+ && clobber(x)
+ => (STMG2 [i-8] {s} p w0 w1 mem)
+(MOVDstore [i] {s} p w2 x:(STMG2 [i-16] {s} p w0 w1 mem))
+ && x.Uses == 1
+ && is20Bit(int64(i)-16)
+ && clobber(x)
+ => (STMG3 [i-16] {s} p w0 w1 w2 mem)
+(MOVDstore [i] {s} p w3 x:(STMG3 [i-24] {s} p w0 w1 w2 mem))
+ && x.Uses == 1
+ && is20Bit(int64(i)-24)
+ && clobber(x)
+ => (STMG4 [i-24] {s} p w0 w1 w2 w3 mem)
+(STMG2 [i] {s} p w2 w3 x:(STMG2 [i-16] {s} p w0 w1 mem))
+ && x.Uses == 1
+ && is20Bit(int64(i)-16)
+ && clobber(x)
+ => (STMG4 [i-16] {s} p w0 w1 w2 w3 mem)
+
+// Convert 32-bit store multiples into 64-bit stores.
+(STM2 [i] {s} p (SRDconst [32] x) x mem) => (MOVDstore [i] {s} p x mem)
diff --git a/src/cmd/compile/internal/ssa/gen/S390XOps.go b/src/cmd/compile/internal/ssa/gen/S390XOps.go
new file mode 100644
index 0000000..b24fd61
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/S390XOps.go
@@ -0,0 +1,816 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+import "strings"
+
+// Notes:
+// - Integer types live in the low portion of registers. Upper portions are junk.
+// - Boolean types use the low-order byte of a register. 0=false, 1=true.
+// Upper bytes are junk.
+// - When doing sub-register operations, we try to write the whole
+// destination register to avoid a partial-register write.
+// - Unused portions of AuxInt (or the Val portion of ValAndOff) are
+// filled by sign-extending the used portion. Users of AuxInt which interpret
+// AuxInt as unsigned (e.g. shifts) must be careful.
+// - The SB 'register' is implemented using instruction-relative addressing. This
+// places some limitations on when and how memory operands that are addressed
+// relative to SB can be used:
+//
+// 1. Pseudo-instructions do not always map to a single machine instruction when
+// using the SB 'register' to address data. This is because many machine
+// instructions do not have relative long (RL suffix) equivalents. For example,
+//    ADDload is assembled as AG, which has no relative long equivalent.
+//
+// 2. Loads and stores using relative addressing require the data to be aligned
+//    according to its size (8 bytes for double words, 4 bytes for words,
+// and so on).
+//
+// We can always work around these by inserting LARL instructions (load address
+// relative long) in the assembler, but typically this results in worse code
+// generation because the address can't be re-used. Inserting instructions in the
+// assembler also means clobbering the temp register, and it is a long-term goal
+// to prevent the compiler from doing this so that it can be allocated as a normal
+// register.
+//
+// For more information about the z/Architecture, the instruction set and the
+// addressing modes it supports take a look at the z/Architecture Principles of
+// Operation: http://publibfp.boulder.ibm.com/epubs/pdf/dz9zr010.pdf
+//
+// Suffixes encode the bit width of pseudo-instructions.
+// D (double word) = 64 bit (frequently omitted)
+// W (word) = 32 bit
+// H (half word) = 16 bit
+// B (byte) = 8 bit
+// S (single prec.) = 32 bit (double precision is omitted)
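+//
+// For example, MOVWZload loads a 32-bit word and zero-extends it, ADDW is a
+// 32-bit add, and FMOVS moves a single-precision float.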
+
+// copied from ../../s390x/reg.go
+var regNamesS390X = []string{
+ "R0",
+ "R1",
+ "R2",
+ "R3",
+ "R4",
+ "R5",
+ "R6",
+ "R7",
+ "R8",
+ "R9",
+ "R10",
+ "R11",
+ "R12",
+ "g", // R13
+ "R14",
+ "SP", // R15
+ "F0",
+ "F1",
+ "F2",
+ "F3",
+ "F4",
+ "F5",
+ "F6",
+ "F7",
+ "F8",
+ "F9",
+ "F10",
+ "F11",
+ "F12",
+ "F13",
+ "F14",
+ "F15",
+
+ // If you add registers, update asyncPreempt in runtime.
+
+ // pseudo-registers
+ "SB",
+}
+
+func init() {
+ // Make map from reg names to reg integers.
+ if len(regNamesS390X) > 64 {
+ panic("too many registers")
+ }
+ num := map[string]int{}
+ for i, name := range regNamesS390X {
+ num[name] = i
+ }
+ buildReg := func(s string) regMask {
+ m := regMask(0)
+ for _, r := range strings.Split(s, " ") {
+ if n, ok := num[r]; ok {
+ m |= regMask(1) << uint(n)
+ continue
+ }
+ panic("register " + r + " not found")
+ }
+ return m
+ }
+
+ // Common individual register masks
+ var (
+ sp = buildReg("SP")
+ sb = buildReg("SB")
+ r0 = buildReg("R0")
+ tmp = buildReg("R11") // R11 is used as a temporary in a small number of instructions.
+
+ // R10 is reserved by the assembler.
+ gp = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14")
+ gpg = gp | buildReg("g")
+ gpsp = gp | sp
+
+ // R0 is considered to contain the value 0 in address calculations.
+ ptr = gp &^ r0
+ ptrsp = ptr | sp
+ ptrspsb = ptrsp | sb
+
+ fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15")
+ callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g
+ r1 = buildReg("R1")
+ r2 = buildReg("R2")
+ r3 = buildReg("R3")
+ )
+ // Common slices of register masks
+ var (
+ gponly = []regMask{gp}
+ fponly = []regMask{fp}
+ )
+
+ // Common regInfo
+ var (
+ gp01 = regInfo{inputs: []regMask{}, outputs: gponly}
+ gp11 = regInfo{inputs: []regMask{gp}, outputs: gponly}
+ gp11sp = regInfo{inputs: []regMask{gpsp}, outputs: gponly}
+ gp21 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
+ gp21sp = regInfo{inputs: []regMask{gpsp, gp}, outputs: gponly}
+ gp21tmp = regInfo{inputs: []regMask{gp &^ tmp, gp &^ tmp}, outputs: []regMask{gp &^ tmp}, clobbers: tmp}
+
+ // R0 evaluates to 0 when used as the number of bits to shift
+ // so we need to exclude it from that operand.
+ sh21 = regInfo{inputs: []regMask{gp, ptr}, outputs: gponly}
+
+ addr = regInfo{inputs: []regMask{sp | sb}, outputs: gponly}
+ addridx = regInfo{inputs: []regMask{sp | sb, ptrsp}, outputs: gponly}
+
+ gp2flags = regInfo{inputs: []regMask{gpsp, gpsp}}
+ gp1flags = regInfo{inputs: []regMask{gpsp}}
+ gp2flags1 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
+ gp11flags = regInfo{inputs: []regMask{gp}, outputs: gponly}
+ gp21flags = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
+ gp2flags1flags = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
+
+ gpload = regInfo{inputs: []regMask{ptrspsb, 0}, outputs: gponly}
+ gploadidx = regInfo{inputs: []regMask{ptrspsb, ptrsp, 0}, outputs: gponly}
+ gpopload = regInfo{inputs: []regMask{gp, ptrsp, 0}, outputs: gponly}
+ gpstore = regInfo{inputs: []regMask{ptrspsb, gpsp, 0}}
+ gpstoreconst = regInfo{inputs: []regMask{ptrspsb, 0}}
+ gpstoreidx = regInfo{inputs: []regMask{ptrsp, ptrsp, gpsp, 0}}
+ gpstorebr = regInfo{inputs: []regMask{ptrsp, gpsp, 0}}
+ gpstorelaa = regInfo{inputs: []regMask{ptrspsb, gpsp, 0}, outputs: gponly}
+ gpstorelab = regInfo{inputs: []regMask{r1, gpsp, 0}, clobbers: r1}
+
+ gpmvc = regInfo{inputs: []regMask{ptrsp, ptrsp, 0}}
+
+ fp01 = regInfo{inputs: []regMask{}, outputs: fponly}
+ fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: fponly}
+ fp31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: fponly}
+ fp21clobber = regInfo{inputs: []regMask{fp, fp}, outputs: fponly}
+ fpgp = regInfo{inputs: fponly, outputs: gponly}
+ gpfp = regInfo{inputs: gponly, outputs: fponly}
+ fp11 = regInfo{inputs: fponly, outputs: fponly}
+ fp1flags = regInfo{inputs: []regMask{fp}}
+ fp11clobber = regInfo{inputs: fponly, outputs: fponly}
+ fp2flags = regInfo{inputs: []regMask{fp, fp}}
+
+ fpload = regInfo{inputs: []regMask{ptrspsb, 0}, outputs: fponly}
+ fploadidx = regInfo{inputs: []regMask{ptrsp, ptrsp, 0}, outputs: fponly}
+
+ fpstore = regInfo{inputs: []regMask{ptrspsb, fp, 0}}
+ fpstoreidx = regInfo{inputs: []regMask{ptrsp, ptrsp, fp, 0}}
+
+ sync = regInfo{inputs: []regMask{0}}
+
+ // LoweredAtomicCas may overwrite arg1, so force it to R0 for now.
+ cas = regInfo{inputs: []regMask{ptrsp, r0, gpsp, 0}, outputs: []regMask{gp, 0}, clobbers: r0}
+
+ // LoweredAtomicExchange overwrites the output before executing
+ // CS{,G}, so the output register must not be the same as the
+ // input register. For now we just force the output register to
+ // R0.
+ exchange = regInfo{inputs: []regMask{ptrsp, gpsp &^ r0, 0}, outputs: []regMask{r0, 0}}
+ )
+
+ var S390Xops = []opData{
+ // fp ops
+ {name: "FADDS", argLength: 2, reg: fp21clobber, typ: "(Float32,Flags)", asm: "FADDS", commutative: true, resultInArg0: true}, // fp32 arg0 + arg1
+ {name: "FADD", argLength: 2, reg: fp21clobber, typ: "(Float64,Flags)", asm: "FADD", commutative: true, resultInArg0: true}, // fp64 arg0 + arg1
+ {name: "FSUBS", argLength: 2, reg: fp21clobber, typ: "(Float32,Flags)", asm: "FSUBS", resultInArg0: true}, // fp32 arg0 - arg1
+ {name: "FSUB", argLength: 2, reg: fp21clobber, typ: "(Float64,Flags)", asm: "FSUB", resultInArg0: true}, // fp64 arg0 - arg1
+ {name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true, resultInArg0: true}, // fp32 arg0 * arg1
+ {name: "FMUL", argLength: 2, reg: fp21, asm: "FMUL", commutative: true, resultInArg0: true}, // fp64 arg0 * arg1
+ {name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS", resultInArg0: true}, // fp32 arg0 / arg1
+ {name: "FDIV", argLength: 2, reg: fp21, asm: "FDIV", resultInArg0: true}, // fp64 arg0 / arg1
+ {name: "FNEGS", argLength: 1, reg: fp11clobber, asm: "FNEGS", clobberFlags: true}, // fp32 -arg0
+ {name: "FNEG", argLength: 1, reg: fp11clobber, asm: "FNEG", clobberFlags: true}, // fp64 -arg0
+ {name: "FMADDS", argLength: 3, reg: fp31, asm: "FMADDS", resultInArg0: true}, // fp32 arg1 * arg2 + arg0
+ {name: "FMADD", argLength: 3, reg: fp31, asm: "FMADD", resultInArg0: true}, // fp64 arg1 * arg2 + arg0
+ {name: "FMSUBS", argLength: 3, reg: fp31, asm: "FMSUBS", resultInArg0: true}, // fp32 arg1 * arg2 - arg0
+ {name: "FMSUB", argLength: 3, reg: fp31, asm: "FMSUB", resultInArg0: true}, // fp64 arg1 * arg2 - arg0
+ {name: "LPDFR", argLength: 1, reg: fp11, asm: "LPDFR"}, // fp64/fp32 clear sign bit (load positive)
+ {name: "LNDFR", argLength: 1, reg: fp11, asm: "LNDFR"}, // fp64/fp32 set sign bit (load negative)
+ {name: "CPSDR", argLength: 2, reg: fp21, asm: "CPSDR"}, // fp64/fp32 copy arg1 sign bit to arg0
+
+ // Round to integer, float64 only.
+ //
+ // aux | rounding mode
+ // ----+-----------------------------------
+ // 1 | round to nearest, ties away from 0
+ // 4 | round to nearest, ties to even
+ // 5 | round toward 0
+ // 6 | round toward +∞
+ // 7 | round toward -∞
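+ //
+ // For example, (FIDBR [5] x) rounds x toward zero (a usage sketch following
+ // the table above).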
+ {name: "FIDBR", argLength: 1, reg: fp11, asm: "FIDBR", aux: "Int8"},
+
+ {name: "FMOVSload", argLength: 2, reg: fpload, asm: "FMOVS", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // fp32 load
+ {name: "FMOVDload", argLength: 2, reg: fpload, asm: "FMOVD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // fp64 load
+ {name: "FMOVSconst", reg: fp01, asm: "FMOVS", aux: "Float32", rematerializeable: true}, // fp32 constant
+ {name: "FMOVDconst", reg: fp01, asm: "FMOVD", aux: "Float64", rematerializeable: true}, // fp64 constant
+ {name: "FMOVSloadidx", argLength: 3, reg: fploadidx, asm: "FMOVS", aux: "SymOff", symEffect: "Read"}, // fp32 load indexed by i
+ {name: "FMOVDloadidx", argLength: 3, reg: fploadidx, asm: "FMOVD", aux: "SymOff", symEffect: "Read"}, // fp64 load indexed by i
+
+ {name: "FMOVSstore", argLength: 3, reg: fpstore, asm: "FMOVS", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // fp32 store
+ {name: "FMOVDstore", argLength: 3, reg: fpstore, asm: "FMOVD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // fp64 store
+ {name: "FMOVSstoreidx", argLength: 4, reg: fpstoreidx, asm: "FMOVS", aux: "SymOff", symEffect: "Write"}, // fp32 indexed by i store
+ {name: "FMOVDstoreidx", argLength: 4, reg: fpstoreidx, asm: "FMOVD", aux: "SymOff", symEffect: "Write"}, // fp64 indexed by i store
+
+ // binary ops
+ {name: "ADD", argLength: 2, reg: gp21sp, asm: "ADD", commutative: true, clobberFlags: true}, // arg0 + arg1
+ {name: "ADDW", argLength: 2, reg: gp21sp, asm: "ADDW", commutative: true, clobberFlags: true}, // arg0 + arg1
+ {name: "ADDconst", argLength: 1, reg: gp11sp, asm: "ADD", aux: "Int32", typ: "UInt64", clobberFlags: true}, // arg0 + auxint
+ {name: "ADDWconst", argLength: 1, reg: gp11sp, asm: "ADDW", aux: "Int32", clobberFlags: true}, // arg0 + auxint
+ {name: "ADDload", argLength: 3, reg: gpopload, asm: "ADD", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 + *arg1. arg2=mem
+ {name: "ADDWload", argLength: 3, reg: gpopload, asm: "ADDW", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 + *arg1. arg2=mem
+
+ {name: "SUB", argLength: 2, reg: gp21, asm: "SUB", clobberFlags: true}, // arg0 - arg1
+ {name: "SUBW", argLength: 2, reg: gp21, asm: "SUBW", clobberFlags: true}, // arg0 - arg1
+ {name: "SUBconst", argLength: 1, reg: gp11, asm: "SUB", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 - auxint
+ {name: "SUBWconst", argLength: 1, reg: gp11, asm: "SUBW", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 - auxint
+ {name: "SUBload", argLength: 3, reg: gpopload, asm: "SUB", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 - *arg1. arg2=mem
+ {name: "SUBWload", argLength: 3, reg: gpopload, asm: "SUBW", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 - *arg1. arg2=mem
+
+ {name: "MULLD", argLength: 2, reg: gp21, asm: "MULLD", typ: "Int64", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 * arg1
+ {name: "MULLW", argLength: 2, reg: gp21, asm: "MULLW", typ: "Int32", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 * arg1
+ {name: "MULLDconst", argLength: 1, reg: gp11, asm: "MULLD", aux: "Int32", typ: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 * auxint
+ {name: "MULLWconst", argLength: 1, reg: gp11, asm: "MULLW", aux: "Int32", typ: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 * auxint
+ {name: "MULLDload", argLength: 3, reg: gpopload, asm: "MULLD", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 * *arg1. arg2=mem
+ {name: "MULLWload", argLength: 3, reg: gpopload, asm: "MULLW", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 * *arg1. arg2=mem
+
+ {name: "MULHD", argLength: 2, reg: gp21tmp, asm: "MULHD", typ: "Int64", commutative: true, resultInArg0: true, clobberFlags: true}, // (arg0 * arg1) >> width
+ {name: "MULHDU", argLength: 2, reg: gp21tmp, asm: "MULHDU", typ: "Int64", commutative: true, resultInArg0: true, clobberFlags: true}, // (arg0 * arg1) >> width
+
+ {name: "DIVD", argLength: 2, reg: gp21tmp, asm: "DIVD", resultInArg0: true, clobberFlags: true}, // arg0 / arg1
+ {name: "DIVW", argLength: 2, reg: gp21tmp, asm: "DIVW", resultInArg0: true, clobberFlags: true}, // arg0 / arg1
+ {name: "DIVDU", argLength: 2, reg: gp21tmp, asm: "DIVDU", resultInArg0: true, clobberFlags: true}, // arg0 / arg1
+ {name: "DIVWU", argLength: 2, reg: gp21tmp, asm: "DIVWU", resultInArg0: true, clobberFlags: true}, // arg0 / arg1
+
+ {name: "MODD", argLength: 2, reg: gp21tmp, asm: "MODD", resultInArg0: true, clobberFlags: true}, // arg0 % arg1
+ {name: "MODW", argLength: 2, reg: gp21tmp, asm: "MODW", resultInArg0: true, clobberFlags: true}, // arg0 % arg1
+
+ {name: "MODDU", argLength: 2, reg: gp21tmp, asm: "MODDU", resultInArg0: true, clobberFlags: true}, // arg0 % arg1
+ {name: "MODWU", argLength: 2, reg: gp21tmp, asm: "MODWU", resultInArg0: true, clobberFlags: true}, // arg0 % arg1
+
+ {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true, clobberFlags: true}, // arg0 & arg1
+ {name: "ANDW", argLength: 2, reg: gp21, asm: "ANDW", commutative: true, clobberFlags: true}, // arg0 & arg1
+ {name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 & auxint
+ {name: "ANDWconst", argLength: 1, reg: gp11, asm: "ANDW", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 & auxint
+ {name: "ANDload", argLength: 3, reg: gpopload, asm: "AND", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 & *arg1. arg2=mem
+ {name: "ANDWload", argLength: 3, reg: gpopload, asm: "ANDW", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 & *arg1. arg2=mem
+
+ {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true, clobberFlags: true}, // arg0 | arg1
+ {name: "ORW", argLength: 2, reg: gp21, asm: "ORW", commutative: true, clobberFlags: true}, // arg0 | arg1
+ {name: "ORconst", argLength: 1, reg: gp11, asm: "OR", aux: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 | auxint
+ {name: "ORWconst", argLength: 1, reg: gp11, asm: "ORW", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 | auxint
+ {name: "ORload", argLength: 3, reg: gpopload, asm: "OR", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 | *arg1. arg2=mem
+ {name: "ORWload", argLength: 3, reg: gpopload, asm: "ORW", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 | *arg1. arg2=mem
+
+ {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true, clobberFlags: true}, // arg0 ^ arg1
+ {name: "XORW", argLength: 2, reg: gp21, asm: "XORW", commutative: true, clobberFlags: true}, // arg0 ^ arg1
+ {name: "XORconst", argLength: 1, reg: gp11, asm: "XOR", aux: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 ^ auxint
+ {name: "XORWconst", argLength: 1, reg: gp11, asm: "XORW", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 ^ auxint
+ {name: "XORload", argLength: 3, reg: gpopload, asm: "XOR", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 ^ *arg1. arg2=mem
+ {name: "XORWload", argLength: 3, reg: gpopload, asm: "XORW", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 ^ *arg1. arg2=mem
+
+ // Arithmetic ops with carry/borrow chain.
+ //
+ // A carry is represented by a condition code of 2 or 3 (GT or OV).
+ // A borrow is represented by a condition code of 0 or 1 (EQ or LT).
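+ //
+ // Sketch: a 128-bit addition can be built from ADDC on the low words (which
+ // produces a carry in the condition code) followed by ADDE on the high words
+ // with that carry as the third input.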
+ {name: "ADDC", argLength: 2, reg: gp21flags, asm: "ADDC", typ: "(UInt64,Flags)", commutative: true}, // (arg0 + arg1, carry out)
+ {name: "ADDCconst", argLength: 1, reg: gp11flags, asm: "ADDC", typ: "(UInt64,Flags)", aux: "Int16"}, // (arg0 + auxint, carry out)
+ {name: "ADDE", argLength: 3, reg: gp2flags1flags, asm: "ADDE", typ: "(UInt64,Flags)", commutative: true, resultInArg0: true}, // (arg0 + arg1 + arg2 (carry in), carry out)
+ {name: "SUBC", argLength: 2, reg: gp21flags, asm: "SUBC", typ: "(UInt64,Flags)"}, // (arg0 - arg1, borrow out)
+ {name: "SUBE", argLength: 3, reg: gp2flags1flags, asm: "SUBE", typ: "(UInt64,Flags)", resultInArg0: true}, // (arg0 - arg1 - arg2 (borrow in), borrow out)
+
+ // Comparisons.
+ {name: "CMP", argLength: 2, reg: gp2flags, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPW", argLength: 2, reg: gp2flags, asm: "CMPW", typ: "Flags"}, // arg0 compare to arg1
+
+ {name: "CMPU", argLength: 2, reg: gp2flags, asm: "CMPU", typ: "Flags"}, // arg0 compare to arg1
+ {name: "CMPWU", argLength: 2, reg: gp2flags, asm: "CMPWU", typ: "Flags"}, // arg0 compare to arg1
+
+ {name: "CMPconst", argLength: 1, reg: gp1flags, asm: "CMP", typ: "Flags", aux: "Int32"}, // arg0 compare to auxint
+ {name: "CMPWconst", argLength: 1, reg: gp1flags, asm: "CMPW", typ: "Flags", aux: "Int32"}, // arg0 compare to auxint
+ {name: "CMPUconst", argLength: 1, reg: gp1flags, asm: "CMPU", typ: "Flags", aux: "Int32"}, // arg0 compare to auxint
+ {name: "CMPWUconst", argLength: 1, reg: gp1flags, asm: "CMPWU", typ: "Flags", aux: "Int32"}, // arg0 compare to auxint
+
+ {name: "FCMPS", argLength: 2, reg: fp2flags, asm: "CEBR", typ: "Flags"}, // arg0 compare to arg1, f32
+ {name: "FCMP", argLength: 2, reg: fp2flags, asm: "FCMPU", typ: "Flags"}, // arg0 compare to arg1, f64
+ {name: "LTDBR", argLength: 1, reg: fp1flags, asm: "LTDBR", typ: "Flags"}, // arg0 compare to 0, f64
+ {name: "LTEBR", argLength: 1, reg: fp1flags, asm: "LTEBR", typ: "Flags"}, // arg0 compare to 0, f32
+
+ {name: "SLD", argLength: 2, reg: sh21, asm: "SLD"}, // arg0 << arg1, shift amount is mod 64
+ {name: "SLW", argLength: 2, reg: sh21, asm: "SLW"}, // arg0 << arg1, shift amount is mod 64
+ {name: "SLDconst", argLength: 1, reg: gp11, asm: "SLD", aux: "UInt8"}, // arg0 << auxint, shift amount 0-63
+ {name: "SLWconst", argLength: 1, reg: gp11, asm: "SLW", aux: "UInt8"}, // arg0 << auxint, shift amount 0-31
+
+ {name: "SRD", argLength: 2, reg: sh21, asm: "SRD"}, // unsigned arg0 >> arg1, shift amount is mod 64
+ {name: "SRW", argLength: 2, reg: sh21, asm: "SRW"}, // unsigned uint32(arg0) >> arg1, shift amount is mod 64
+ {name: "SRDconst", argLength: 1, reg: gp11, asm: "SRD", aux: "UInt8"}, // unsigned arg0 >> auxint, shift amount 0-63
+ {name: "SRWconst", argLength: 1, reg: gp11, asm: "SRW", aux: "UInt8"}, // unsigned uint32(arg0) >> auxint, shift amount 0-31
+
+ // Arithmetic shifts clobber flags.
+ {name: "SRAD", argLength: 2, reg: sh21, asm: "SRAD", clobberFlags: true}, // signed arg0 >> arg1, shift amount is mod 64
+ {name: "SRAW", argLength: 2, reg: sh21, asm: "SRAW", clobberFlags: true}, // signed int32(arg0) >> arg1, shift amount is mod 64
+ {name: "SRADconst", argLength: 1, reg: gp11, asm: "SRAD", aux: "UInt8", clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-63
+ {name: "SRAWconst", argLength: 1, reg: gp11, asm: "SRAW", aux: "UInt8", clobberFlags: true}, // signed int32(arg0) >> auxint, shift amount 0-31
+
+ // Rotate instructions.
+ // Note: no RLLGconst - use RISBGZ instead.
+ {name: "RLLG", argLength: 2, reg: sh21, asm: "RLLG"}, // arg0 rotate left arg1, rotate amount 0-63
+ {name: "RLL", argLength: 2, reg: sh21, asm: "RLL"}, // arg0 rotate left arg1, rotate amount 0-31
+ {name: "RLLconst", argLength: 1, reg: gp11, asm: "RLL", aux: "UInt8"}, // arg0 rotate left auxint, rotate amount 0-31
+
+ // Rotate then (and|or|xor|insert) selected bits instructions.
+ //
+ // Aux is an s390x.RotateParams struct containing Start, End and rotation
+ // Amount fields.
+ //
+ // arg1 is rotated left by the rotation amount then the bits from the start
+ // bit to the end bit (inclusive) are combined with arg0 using the logical
+ // operation specified. Bit indices are specified from left to right - the
+ // MSB is 0 and the LSB is 63.
+ //
+ // Examples:
+ // | aux |
+ // | instruction | start | end | amount | arg0 | arg1 | result |
+ // +-------------+-------+-----+--------+-----------------------+-----------------------+-----------------------+
+ // | RXSBG (XOR) | 0 | 1 | 0 | 0xffff_ffff_ffff_ffff | 0xffff_ffff_ffff_ffff | 0x3fff_ffff_ffff_ffff |
+ // | RXSBG (XOR) | 62 | 63 | 0 | 0xffff_ffff_ffff_ffff | 0xffff_ffff_ffff_ffff | 0xffff_ffff_ffff_fffc |
+ // | RXSBG (XOR) | 0 | 47 | 16 | 0xffff_ffff_ffff_ffff | 0x0000_0000_0000_ffff | 0xffff_ffff_0000_ffff |
+ // +-------------+-------+-----+--------+-----------------------+-----------------------+-----------------------+
+ //
+ {name: "RXSBG", argLength: 2, reg: gp21, asm: "RXSBG", resultInArg0: true, aux: "S390XRotateParams", clobberFlags: true}, // rotate then xor selected bits
+ {name: "RISBGZ", argLength: 1, reg: gp11, asm: "RISBGZ", aux: "S390XRotateParams", clobberFlags: true}, // rotate then insert selected bits [into zero]
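+ // For example (a sketch), RISBGZ with start=48, end=63 and amount=0 keeps only
+ // the low 16 bits of its argument and zeroes the rest, i.e. it behaves like an
+ // AND with 0xffff.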
+
+ // unary ops
+ {name: "NEG", argLength: 1, reg: gp11, asm: "NEG", clobberFlags: true}, // -arg0
+ {name: "NEGW", argLength: 1, reg: gp11, asm: "NEGW", clobberFlags: true}, // -arg0
+
+ {name: "NOT", argLength: 1, reg: gp11, resultInArg0: true, clobberFlags: true}, // ^arg0
+ {name: "NOTW", argLength: 1, reg: gp11, resultInArg0: true, clobberFlags: true}, // ^arg0
+
+ {name: "FSQRT", argLength: 1, reg: fp11, asm: "FSQRT"}, // sqrt(arg0)
+
+ // Conditional register-register moves.
+ // The aux for these values is an s390x.CCMask value representing the condition code mask.
+ {name: "LOCGR", argLength: 3, reg: gp2flags1, resultInArg0: true, asm: "LOCGR", aux: "S390XCCMask"}, // load arg1 into arg0 if the condition code in arg2 matches a masked bit in aux.
+
+ {name: "MOVBreg", argLength: 1, reg: gp11sp, asm: "MOVB", typ: "Int64"}, // sign extend arg0 from int8 to int64
+ {name: "MOVBZreg", argLength: 1, reg: gp11sp, asm: "MOVBZ", typ: "UInt64"}, // zero extend arg0 from int8 to int64
+ {name: "MOVHreg", argLength: 1, reg: gp11sp, asm: "MOVH", typ: "Int64"}, // sign extend arg0 from int16 to int64
+ {name: "MOVHZreg", argLength: 1, reg: gp11sp, asm: "MOVHZ", typ: "UInt64"}, // zero extend arg0 from int16 to int64
+ {name: "MOVWreg", argLength: 1, reg: gp11sp, asm: "MOVW", typ: "Int64"}, // sign extend arg0 from int32 to int64
+ {name: "MOVWZreg", argLength: 1, reg: gp11sp, asm: "MOVWZ", typ: "UInt64"}, // zero extend arg0 from int32 to int64
+
+ {name: "MOVDconst", reg: gp01, asm: "MOVD", typ: "UInt64", aux: "Int64", rematerializeable: true}, // auxint
+
+ {name: "LDGR", argLength: 1, reg: gpfp, asm: "LDGR"}, // move int64 to float64 (no conversion)
+ {name: "LGDR", argLength: 1, reg: fpgp, asm: "LGDR"}, // move float64 to int64 (no conversion)
+
+ {name: "CFDBRA", argLength: 1, reg: fpgp, asm: "CFDBRA", clobberFlags: true}, // convert float64 to int32
+ {name: "CGDBRA", argLength: 1, reg: fpgp, asm: "CGDBRA", clobberFlags: true}, // convert float64 to int64
+ {name: "CFEBRA", argLength: 1, reg: fpgp, asm: "CFEBRA", clobberFlags: true}, // convert float32 to int32
+ {name: "CGEBRA", argLength: 1, reg: fpgp, asm: "CGEBRA", clobberFlags: true}, // convert float32 to int64
+ {name: "CEFBRA", argLength: 1, reg: gpfp, asm: "CEFBRA", clobberFlags: true}, // convert int32 to float32
+ {name: "CDFBRA", argLength: 1, reg: gpfp, asm: "CDFBRA", clobberFlags: true}, // convert int32 to float64
+ {name: "CEGBRA", argLength: 1, reg: gpfp, asm: "CEGBRA", clobberFlags: true}, // convert int64 to float32
+ {name: "CDGBRA", argLength: 1, reg: gpfp, asm: "CDGBRA", clobberFlags: true}, // convert int64 to float64
+ {name: "CLFEBR", argLength: 1, reg: fpgp, asm: "CLFEBR", clobberFlags: true}, // convert float32 to uint32
+ {name: "CLFDBR", argLength: 1, reg: fpgp, asm: "CLFDBR", clobberFlags: true}, // convert float64 to uint32
+ {name: "CLGEBR", argLength: 1, reg: fpgp, asm: "CLGEBR", clobberFlags: true}, // convert float32 to uint64
+ {name: "CLGDBR", argLength: 1, reg: fpgp, asm: "CLGDBR", clobberFlags: true}, // convert float64 to uint64
+ {name: "CELFBR", argLength: 1, reg: gpfp, asm: "CELFBR", clobberFlags: true}, // convert uint32 to float32
+ {name: "CDLFBR", argLength: 1, reg: gpfp, asm: "CDLFBR", clobberFlags: true}, // convert uint32 to float64
+ {name: "CELGBR", argLength: 1, reg: gpfp, asm: "CELGBR", clobberFlags: true}, // convert uint64 to float32
+ {name: "CDLGBR", argLength: 1, reg: gpfp, asm: "CDLGBR", clobberFlags: true}, // convert uint64 to float64
+
+ {name: "LEDBR", argLength: 1, reg: fp11, asm: "LEDBR"}, // convert float64 to float32
+ {name: "LDEBR", argLength: 1, reg: fp11, asm: "LDEBR"}, // convert float32 to float64
+
+ {name: "MOVDaddr", argLength: 1, reg: addr, aux: "SymOff", rematerializeable: true, symEffect: "Read"}, // arg0 + auxint + offset encoded in aux
+ {name: "MOVDaddridx", argLength: 2, reg: addridx, aux: "SymOff", symEffect: "Read"}, // arg0 + arg1 + auxint + aux
+
+ // auxint+aux == add auxint and the offset of the symbol in aux (if any) to the effective address
+ {name: "MOVBZload", argLength: 2, reg: gpload, asm: "MOVBZ", aux: "SymOff", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load byte from arg0+auxint+aux. arg1=mem. Zero extend.
+ {name: "MOVBload", argLength: 2, reg: gpload, asm: "MOVB", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // ditto, sign extend to int64
+ {name: "MOVHZload", argLength: 2, reg: gpload, asm: "MOVHZ", aux: "SymOff", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load 2 bytes from arg0+auxint+aux. arg1=mem. Zero extend.
+ {name: "MOVHload", argLength: 2, reg: gpload, asm: "MOVH", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // ditto, sign extend to int64
+ {name: "MOVWZload", argLength: 2, reg: gpload, asm: "MOVWZ", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load 4 bytes from arg0+auxint+aux. arg1=mem. Zero extend.
+ {name: "MOVWload", argLength: 2, reg: gpload, asm: "MOVW", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // ditto, sign extend to int64
+ {name: "MOVDload", argLength: 2, reg: gpload, asm: "MOVD", aux: "SymOff", typ: "UInt64", faultOnNilArg0: true, symEffect: "Read"}, // load 8 bytes from arg0+auxint+aux. arg1=mem
+
+ {name: "MOVWBR", argLength: 1, reg: gp11, asm: "MOVWBR"}, // arg0 swap bytes
+ {name: "MOVDBR", argLength: 1, reg: gp11, asm: "MOVDBR"}, // arg0 swap bytes
+
+ {name: "MOVHBRload", argLength: 2, reg: gpload, asm: "MOVHBR", aux: "SymOff", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load 2 bytes from arg0+auxint+aux. arg1=mem. Reverse bytes.
+ {name: "MOVWBRload", argLength: 2, reg: gpload, asm: "MOVWBR", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load 4 bytes from arg0+auxint+aux. arg1=mem. Reverse bytes.
+ {name: "MOVDBRload", argLength: 2, reg: gpload, asm: "MOVDBR", aux: "SymOff", typ: "UInt64", faultOnNilArg0: true, symEffect: "Read"}, // load 8 bytes from arg0+auxint+aux. arg1=mem. Reverse bytes.
+
+ {name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store byte in arg1 to arg0+auxint+aux. arg2=mem
+ {name: "MOVHstore", argLength: 3, reg: gpstore, asm: "MOVH", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem
+ {name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem
+ {name: "MOVDstore", argLength: 3, reg: gpstore, asm: "MOVD", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes in arg1 to arg0+auxint+aux. arg2=mem
+ {name: "MOVHBRstore", argLength: 3, reg: gpstorebr, asm: "MOVHBR", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem. Reverse bytes.
+ {name: "MOVWBRstore", argLength: 3, reg: gpstorebr, asm: "MOVWBR", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem. Reverse bytes.
+ {name: "MOVDBRstore", argLength: 3, reg: gpstorebr, asm: "MOVDBR", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes in arg1 to arg0+auxint+aux. arg2=mem. Reverse bytes.
+
+ {name: "MVC", argLength: 3, reg: gpmvc, asm: "MVC", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, faultOnNilArg1: true, symEffect: "None"}, // arg0=destptr, arg1=srcptr, arg2=mem, auxint=size,off
+
+ // indexed loads/stores
+ {name: "MOVBZloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVBZ", aux: "SymOff", typ: "UInt8", symEffect: "Read"}, // load a byte from arg0+arg1+auxint+aux. arg2=mem. Zero extend.
+ {name: "MOVBloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVB", aux: "SymOff", typ: "Int8", symEffect: "Read"}, // load a byte from arg0+arg1+auxint+aux. arg2=mem. Sign extend.
+ {name: "MOVHZloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVHZ", aux: "SymOff", typ: "UInt16", symEffect: "Read"}, // load 2 bytes from arg0+arg1+auxint+aux. arg2=mem. Zero extend.
+ {name: "MOVHloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVH", aux: "SymOff", typ: "Int16", symEffect: "Read"}, // load 2 bytes from arg0+arg1+auxint+aux. arg2=mem. Sign extend.
+ {name: "MOVWZloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVWZ", aux: "SymOff", typ: "UInt32", symEffect: "Read"}, // load 4 bytes from arg0+arg1+auxint+aux. arg2=mem. Zero extend.
+ {name: "MOVWloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVW", aux: "SymOff", typ: "Int32", symEffect: "Read"}, // load 4 bytes from arg0+arg1+auxint+aux. arg2=mem. Sign extend.
+ {name: "MOVDloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVD", aux: "SymOff", typ: "UInt64", symEffect: "Read"}, // load 8 bytes from arg0+arg1+auxint+aux. arg2=mem
+ {name: "MOVHBRloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVHBR", aux: "SymOff", typ: "Int16", symEffect: "Read"}, // load 2 bytes from arg0+arg1+auxint+aux. arg2=mem. Reverse bytes.
+ {name: "MOVWBRloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVWBR", aux: "SymOff", typ: "Int32", symEffect: "Read"}, // load 4 bytes from arg0+arg1+auxint+aux. arg2=mem. Reverse bytes.
+ {name: "MOVDBRloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVDBR", aux: "SymOff", typ: "Int64", symEffect: "Read"}, // load 8 bytes from arg0+arg1+auxint+aux. arg2=mem. Reverse bytes.
+ {name: "MOVBstoreidx", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVB", aux: "SymOff", symEffect: "Write"}, // store byte in arg2 to arg0+arg1+auxint+aux. arg3=mem
+ {name: "MOVHstoreidx", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVH", aux: "SymOff", symEffect: "Write"}, // store 2 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
+ {name: "MOVWstoreidx", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVW", aux: "SymOff", symEffect: "Write"}, // store 4 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
+ {name: "MOVDstoreidx", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVD", aux: "SymOff", symEffect: "Write"}, // store 8 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
+ {name: "MOVHBRstoreidx", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVHBR", aux: "SymOff", symEffect: "Write"}, // store 2 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem. Reverse bytes.
+ {name: "MOVWBRstoreidx", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVWBR", aux: "SymOff", symEffect: "Write"}, // store 4 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem. Reverse bytes.
+ {name: "MOVDBRstoreidx", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVDBR", aux: "SymOff", symEffect: "Write"}, // store 8 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem. Reverse bytes.
+
+ // For storeconst ops, the AuxInt field encodes both
+ // the value to store and an address offset of the store.
+ // Cast AuxInt to a ValAndOff to extract Val and Off fields.
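+ // For example, a rewrite rule reads the two fields roughly as follows (sketch):
+ //   vo := ValAndOff(v.AuxInt)
+ //   val, off := vo.Val(), vo.Off()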
+ {name: "MOVBstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVB", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low byte of ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux. arg1=mem
+ {name: "MOVHstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVH", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low 2 bytes of ...
+ {name: "MOVWstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVW", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low 4 bytes of ...
+ {name: "MOVDstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVD", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of ...
+
+ {name: "CLEAR", argLength: 2, reg: regInfo{inputs: []regMask{ptr, 0}}, asm: "CLEAR", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Write"},
+
+ {name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{ptrsp, buildReg("R12"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+ {name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{ptr}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+
+ // (InvertFlags (CMP a b)) == (CMP b a)
+ // InvertFlags is a pseudo-op which can't appear in assembly output.
+ {name: "InvertFlags", argLength: 1}, // reverse direction of arg0
+
+ // Pseudo-ops
+ {name: "LoweredGetG", argLength: 1, reg: gp01}, // arg0=mem
+ // Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
+ // and sorts it to the very beginning of the block to prevent other
+ // use of R12 (the closure pointer)
+ {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("R12")}}, zeroWidth: true},
+ // LoweredGetCallerSP returns the SP of the caller of the current function.
+ {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true},
+ // LoweredGetCallerPC evaluates to the PC to which its "caller" will return.
+ // I.e., if f calls g and g calls getcallerpc,
+ // the result should be the PC within f that g will return to.
+ // See runtime/stubs.go for a more detailed discussion.
+ {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true},
+ // arg0=ptr, arg1=mem, returns void. Faults if ptr is nil.
+ {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{ptrsp}}, clobberFlags: true, nilCheck: true, faultOnNilArg0: true},
+ // Round ops to block fused-multiply-add extraction.
+ {name: "LoweredRound32F", argLength: 1, reg: fp11, resultInArg0: true, zeroWidth: true},
+ {name: "LoweredRound64F", argLength: 1, reg: fp11, resultInArg0: true, zeroWidth: true},
+
+ // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+ // It saves all GP registers if necessary,
+ // but clobbers R14 (LR) because it's a call.
+ {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("R2"), buildReg("R3")}, clobbers: (callerSave &^ gpg) | buildReg("R14")}, clobberFlags: true, aux: "Sym", symEffect: "None"},
+
+ // There are three of these functions so that they can have three different register inputs.
+ // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
+ // default registers to match so we don't need to copy registers around unnecessarily.
+ {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
+ {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
+ {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r0, r1}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
+
+ // Constant condition code values. The condition code can be 0, 1, 2 or 3.
+ {name: "FlagEQ"}, // CC=0 (equal)
+ {name: "FlagLT"}, // CC=1 (less than)
+ {name: "FlagGT"}, // CC=2 (greater than)
+ {name: "FlagOV"}, // CC=3 (overflow)
+
+ // Fast-BCR-serialization to ensure store-load ordering.
+ {name: "SYNC", argLength: 1, reg: sync, asm: "SYNC", typ: "Mem"},
+
+ // Atomic loads. These are just normal loads but return <value,memory> tuples
+ // so they can be properly ordered with other loads.
+ // load from arg0+auxint+aux. arg1=mem.
+ {name: "MOVBZatomicload", argLength: 2, reg: gpload, asm: "MOVBZ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+ {name: "MOVWZatomicload", argLength: 2, reg: gpload, asm: "MOVWZ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+ {name: "MOVDatomicload", argLength: 2, reg: gpload, asm: "MOVD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+
+ // Atomic stores. These are just normal stores.
+ // store arg1 to arg0+auxint+aux. arg2=mem.
+ {name: "MOVBatomicstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "Write"},
+ {name: "MOVWatomicstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "Write"},
+ {name: "MOVDatomicstore", argLength: 3, reg: gpstore, asm: "MOVD", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "Write"},
+
+ // Atomic adds.
+ // *(arg0+auxint+aux) += arg1. arg2=mem.
+ // Returns a tuple of <old contents of *(arg0+auxint+aux), memory>.
+ {name: "LAA", argLength: 3, reg: gpstorelaa, asm: "LAA", typ: "(UInt32,Mem)", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"},
+ {name: "LAAG", argLength: 3, reg: gpstorelaa, asm: "LAAG", typ: "(UInt64,Mem)", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"},
+ {name: "AddTupleFirst32", argLength: 2}, // arg1=tuple <x,y>. Returns <x+arg0,y>.
+ {name: "AddTupleFirst64", argLength: 2}, // arg1=tuple <x,y>. Returns <x+arg0,y>.
+
+ // Atomic bitwise operations.
+ // Note: 'floor' operations round the pointer down to the nearest word boundary
+ // which reflects how they are used in the runtime.
+ {name: "LAN", argLength: 3, reg: gpstore, asm: "LAN", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *arg0 &= arg1. arg2 = mem.
+ {name: "LANfloor", argLength: 3, reg: gpstorelab, asm: "LAN", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *(floor(arg0, 4)) &= arg1. arg2 = mem.
+ {name: "LAO", argLength: 3, reg: gpstore, asm: "LAO", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *arg0 |= arg1. arg2 = mem.
+ {name: "LAOfloor", argLength: 3, reg: gpstorelab, asm: "LAO", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *(floor(arg0, 4)) |= arg1. arg2 = mem.
+
+ // Compare and swap.
+ // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory.
+ // if *(arg0+auxint+aux) == arg1 {
+ // *(arg0+auxint+aux) = arg2
+ // return (true, memory)
+ // } else {
+ // return (false, memory)
+ // }
+ // Note that these instructions also return the old value in arg1, but we ignore it.
+ // TODO: have these return flags instead of bool. The current system generates:
+ // CS ...
+ // MOVD $0, ret
+ // BNE 2(PC)
+ // MOVD $1, ret
+ // CMPW ret, $0
+ // BNE ...
+ // instead of just
+ // CS ...
+ // BEQ ...
+ // but we can't do that because memory-using ops can't generate flags yet
+ // (flagalloc wants to move flag-generating instructions around).
+ {name: "LoweredAtomicCas32", argLength: 4, reg: cas, asm: "CS", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"},
+ {name: "LoweredAtomicCas64", argLength: 4, reg: cas, asm: "CSG", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"},
+
+ // Lowered atomic swaps, emulated using compare-and-swap.
+ // store arg1 to arg0+auxint+aux, arg2=mem.
+ {name: "LoweredAtomicExchange32", argLength: 3, reg: exchange, asm: "CS", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"},
+ {name: "LoweredAtomicExchange64", argLength: 3, reg: exchange, asm: "CSG", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"},
+
+ // find leftmost one
+ {
+ name: "FLOGR",
+ argLength: 1,
+ reg: regInfo{inputs: gponly, outputs: []regMask{buildReg("R0")}, clobbers: buildReg("R1")},
+ asm: "FLOGR",
+ typ: "UInt64",
+ clobberFlags: true,
+ },
+
+ // population count
+ //
+ // Counts the number of ones in each byte of arg0
+ // and places the result into the corresponding byte
+ // of the result.
+ {
+ name: "POPCNT",
+ argLength: 1,
+ reg: gp11,
+ asm: "POPCNT",
+ typ: "UInt64",
+ clobberFlags: true,
+ },
+
+ // unsigned multiplication (64x64 → 128)
+ //
+ // Multiply the two 64-bit input operands together and place the 128-bit result into
+ // an even-odd register pair. The second register in the target pair also contains
+ // one of the input operands. Since we don't currently have a way to specify an
+ // even-odd register pair we hardcode this register pair as R2:R3.
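+ // After execution R2 holds the high 64 bits of the product and R3 holds the
+ // low 64 bits.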
+ {
+ name: "MLGR",
+ argLength: 2,
+ reg: regInfo{inputs: []regMask{gp, r3}, outputs: []regMask{r2, r3}},
+ asm: "MLGR",
+ },
+
+ // pseudo operations to sum the output of the POPCNT instruction
+ {name: "SumBytes2", argLength: 1, typ: "UInt8"}, // sum the rightmost 2 bytes in arg0 ignoring overflow
+ {name: "SumBytes4", argLength: 1, typ: "UInt8"}, // sum the rightmost 4 bytes in arg0 ignoring overflow
+ {name: "SumBytes8", argLength: 1, typ: "UInt8"}, // sum all the bytes in arg0 ignoring overflow
+
+ // store multiple
+ {
+ name: "STMG2",
+ argLength: 4,
+ reg: regInfo{inputs: []regMask{ptrsp, buildReg("R1"), buildReg("R2"), 0}},
+ aux: "SymOff",
+ typ: "Mem",
+ asm: "STMG",
+ faultOnNilArg0: true,
+ symEffect: "Write",
+ clobberFlags: true, // TODO(mundaym): currently uses AGFI to handle large offsets
+ },
+ {
+ name: "STMG3",
+ argLength: 5,
+ reg: regInfo{inputs: []regMask{ptrsp, buildReg("R1"), buildReg("R2"), buildReg("R3"), 0}},
+ aux: "SymOff",
+ typ: "Mem",
+ asm: "STMG",
+ faultOnNilArg0: true,
+ symEffect: "Write",
+ clobberFlags: true, // TODO(mundaym): currently uses AGFI to handle large offsets
+ },
+ {
+ name: "STMG4",
+ argLength: 6,
+ reg: regInfo{inputs: []regMask{
+ ptrsp,
+ buildReg("R1"),
+ buildReg("R2"),
+ buildReg("R3"),
+ buildReg("R4"),
+ 0,
+ }},
+ aux: "SymOff",
+ typ: "Mem",
+ asm: "STMG",
+ faultOnNilArg0: true,
+ symEffect: "Write",
+ clobberFlags: true, // TODO(mundaym): currently uses AGFI to handle large offsets
+ },
+ {
+ name: "STM2",
+ argLength: 4,
+ reg: regInfo{inputs: []regMask{ptrsp, buildReg("R1"), buildReg("R2"), 0}},
+ aux: "SymOff",
+ typ: "Mem",
+ asm: "STMY",
+ faultOnNilArg0: true,
+ symEffect: "Write",
+ clobberFlags: true, // TODO(mundaym): currently uses AGFI to handle large offsets
+ },
+ {
+ name: "STM3",
+ argLength: 5,
+ reg: regInfo{inputs: []regMask{ptrsp, buildReg("R1"), buildReg("R2"), buildReg("R3"), 0}},
+ aux: "SymOff",
+ typ: "Mem",
+ asm: "STMY",
+ faultOnNilArg0: true,
+ symEffect: "Write",
+ clobberFlags: true, // TODO(mundaym): currently uses AGFI to handle large offsets
+ },
+ {
+ name: "STM4",
+ argLength: 6,
+ reg: regInfo{inputs: []regMask{
+ ptrsp,
+ buildReg("R1"),
+ buildReg("R2"),
+ buildReg("R3"),
+ buildReg("R4"),
+ 0,
+ }},
+ aux: "SymOff",
+ typ: "Mem",
+ asm: "STMY",
+ faultOnNilArg0: true,
+ symEffect: "Write",
+ clobberFlags: true, // TODO(mundaym): currently uses AGFI to handle large offsets
+ },
+
+ // large move
+ // auxint = remaining bytes after loop (rem)
+ // arg0 = address of dst memory (in R1, changed as a side effect)
+ // arg1 = address of src memory (in R2, changed as a side effect)
+ // arg2 = pointer to last address to move in loop + 256
+ // arg3 = mem
+ // returns mem
+ //
+ // mvc: MVC $256, 0(R2), 0(R1)
+ // MOVD $256(R1), R1
+ // MOVD $256(R2), R2
+ // CMP R2, Rarg2
+ // BNE mvc
+ // MVC $rem, 0(R2), 0(R1) // if rem > 0
+ {
+ name: "LoweredMove",
+ aux: "Int64",
+ argLength: 4,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R1"), buildReg("R2"), gpsp},
+ clobbers: buildReg("R1 R2"),
+ },
+ clobberFlags: true,
+ typ: "Mem",
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
+ // large clear
+ // auxint = remaining bytes after loop (rem)
+ // arg0 = address of dst memory (in R1, changed as a side effect)
+ // arg1 = pointer to last address to zero in loop + 256
+ // arg2 = mem
+ // returns mem
+ //
+ // clear: CLEAR $256, 0(R1)
+ // MOVD $256(R1), R1
+ //        CMP   R1, Rarg1
+ // BNE clear
+ // CLEAR $rem, 0(R1) // if rem > 0
+ {
+ name: "LoweredZero",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{buildReg("R1"), gpsp},
+ clobbers: buildReg("R1"),
+ },
+ clobberFlags: true,
+ typ: "Mem",
+ faultOnNilArg0: true,
+ },
+ }
+
+ // All blocks on s390x have their condition code mask (s390x.CCMask) as the Aux value.
+ // The condition code mask is a 4-bit mask where each bit corresponds to a condition
+ // code value. If the value of the condition code matches a bit set in the condition
+ // code mask then the first successor is executed. Otherwise the second successor is
+ // executed.
+ //
+ // | condition code value | mask bit |
+ // +----------------------+------------+
+ // | 0 (equal) | 0b1000 (8) |
+ // | 1 (less than) | 0b0100 (4) |
+ // | 2 (greater than) | 0b0010 (2) |
+ // | 3 (unordered) | 0b0001 (1) |
+ //
+ // Note that compare-and-branch instructions must not have bit 3 (0b0001) set.
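+ //
+ // For example, a branch taken on "less than or equal" uses mask 0b1100 (8|4),
+ // matching condition codes 0 (equal) and 1 (less than).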
+ var S390Xblocks = []blockData{
+ // branch on condition
+ {name: "BRC", controls: 1, aux: "S390XCCMask"}, // condition code value (flags) is Controls[0]
+
+ // compare-and-branch (register-register)
+ // - integrates comparison of Controls[0] with Controls[1]
+ // - both control values must be in general purpose registers
+ {name: "CRJ", controls: 2, aux: "S390XCCMask"}, // signed 32-bit integer comparison
+ {name: "CGRJ", controls: 2, aux: "S390XCCMask"}, // signed 64-bit integer comparison
+ {name: "CLRJ", controls: 2, aux: "S390XCCMask"}, // unsigned 32-bit integer comparison
+ {name: "CLGRJ", controls: 2, aux: "S390XCCMask"}, // unsigned 64-bit integer comparison
+
+ // compare-and-branch (register-immediate)
+ // - integrates comparison of Controls[0] with AuxInt
+ // - control value must be in a general purpose register
+ // - the AuxInt value is sign-extended for signed comparisons
+ // and zero-extended for unsigned comparisons
+ {name: "CIJ", controls: 1, aux: "S390XCCMaskInt8"}, // signed 32-bit integer comparison
+ {name: "CGIJ", controls: 1, aux: "S390XCCMaskInt8"}, // signed 64-bit integer comparison
+ {name: "CLIJ", controls: 1, aux: "S390XCCMaskUint8"}, // unsigned 32-bit integer comparison
+ {name: "CLGIJ", controls: 1, aux: "S390XCCMaskUint8"}, // unsigned 64-bit integer comparison
+ }
+
+ archs = append(archs, arch{
+ name: "S390X",
+ pkg: "cmd/internal/obj/s390x",
+ genfile: "../../s390x/ssa.go",
+ ops: S390Xops,
+ blocks: S390Xblocks,
+ regnames: regNamesS390X,
+ gpregmask: gp,
+ fpregmask: fp,
+ framepointerreg: -1, // not used
+ linkreg: int8(num["R14"]),
+ imports: []string{
+ "cmd/internal/obj/s390x",
+ },
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/gen/Wasm.rules b/src/cmd/compile/internal/ssa/gen/Wasm.rules
new file mode 100644
index 0000000..fc45cd3
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/Wasm.rules
@@ -0,0 +1,408 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Lowering arithmetic
+(Add(64|32|16|8|Ptr) ...) => (I64Add ...)
+(Add(64|32)F ...) => (F(64|32)Add ...)
+
+(Sub(64|32|16|8|Ptr) ...) => (I64Sub ...)
+(Sub(64|32)F ...) => (F(64|32)Sub ...)
+
+(Mul(64|32|16|8) ...) => (I64Mul ...)
+(Mul(64|32)F ...) => (F(64|32)Mul ...)
+
+(Div64 [false] x y) => (I64DivS x y)
+(Div32 [false] x y) => (I64DivS (SignExt32to64 x) (SignExt32to64 y))
+(Div16 [false] x y) => (I64DivS (SignExt16to64 x) (SignExt16to64 y))
+(Div8 x y) => (I64DivS (SignExt8to64 x) (SignExt8to64 y))
+(Div64u ...) => (I64DivU ...)
+(Div32u x y) => (I64DivU (ZeroExt32to64 x) (ZeroExt32to64 y))
+(Div16u x y) => (I64DivU (ZeroExt16to64 x) (ZeroExt16to64 y))
+(Div8u x y) => (I64DivU (ZeroExt8to64 x) (ZeroExt8to64 y))
+(Div(64|32)F ...) => (F(64|32)Div ...)
+
+(Mod64 [false] x y) => (I64RemS x y)
+(Mod32 [false] x y) => (I64RemS (SignExt32to64 x) (SignExt32to64 y))
+(Mod16 [false] x y) => (I64RemS (SignExt16to64 x) (SignExt16to64 y))
+(Mod8 x y) => (I64RemS (SignExt8to64 x) (SignExt8to64 y))
+(Mod64u ...) => (I64RemU ...)
+(Mod32u x y) => (I64RemU (ZeroExt32to64 x) (ZeroExt32to64 y))
+(Mod16u x y) => (I64RemU (ZeroExt16to64 x) (ZeroExt16to64 y))
+(Mod8u x y) => (I64RemU (ZeroExt8to64 x) (ZeroExt8to64 y))
+
+(And(64|32|16|8|B) ...) => (I64And ...)
+
+(Or(64|32|16|8|B) ...) => (I64Or ...)
+
+(Xor(64|32|16|8) ...) => (I64Xor ...)
+
+(Neg(64|32|16|8) x) => (I64Sub (I64Const [0]) x)
+(Neg(64|32)F ...) => (F(64|32)Neg ...)
+
+(Com(64|32|16|8) x) => (I64Xor x (I64Const [-1]))
+
+(Not ...) => (I64Eqz ...)
+
+// Lowering pointer arithmetic
+(OffPtr ...) => (I64AddConst ...)
+
+// Lowering extension
+// It is unnecessary to extend loads
+(SignExt32to64 x:(I64Load32S _ _)) => x
+(SignExt16to(64|32) x:(I64Load16S _ _)) => x
+(SignExt8to(64|32|16) x:(I64Load8S _ _)) => x
+(ZeroExt32to64 x:(I64Load32U _ _)) => x
+(ZeroExt16to(64|32) x:(I64Load16U _ _)) => x
+(ZeroExt8to(64|32|16) x:(I64Load8U _ _)) => x
+(SignExt32to64 x) && objabi.GOWASM.SignExt => (I64Extend32S x)
+(SignExt8to(64|32|16) x) && objabi.GOWASM.SignExt => (I64Extend8S x)
+(SignExt16to(64|32) x) && objabi.GOWASM.SignExt => (I64Extend16S x)
+(SignExt32to64 x) => (I64ShrS (I64Shl x (I64Const [32])) (I64Const [32]))
+(SignExt16to(64|32) x) => (I64ShrS (I64Shl x (I64Const [48])) (I64Const [48]))
+(SignExt8to(64|32|16) x) => (I64ShrS (I64Shl x (I64Const [56])) (I64Const [56]))
+(ZeroExt32to64 x) => (I64And x (I64Const [0xffffffff]))
+(ZeroExt16to(64|32) x) => (I64And x (I64Const [0xffff]))
+(ZeroExt8to(64|32|16) x) => (I64And x (I64Const [0xff]))
+
+(Slicemask x) => (I64ShrS (I64Sub (I64Const [0]) x) (I64Const [63]))
+
+// Lowering truncation
+// Because we ignore the high parts, truncates are just copies.
+(Trunc64to(32|16|8) ...) => (Copy ...)
+(Trunc32to(16|8) ...) => (Copy ...)
+(Trunc16to8 ...) => (Copy ...)
+
+// Lowering float <=> int
+(Cvt32to(64|32)F x) => (F(64|32)ConvertI64S (SignExt32to64 x))
+(Cvt64to(64|32)F ...) => (F(64|32)ConvertI64S ...)
+(Cvt32Uto(64|32)F x) => (F(64|32)ConvertI64U (ZeroExt32to64 x))
+(Cvt64Uto(64|32)F ...) => (F(64|32)ConvertI64U ...)
+
+(Cvt32Fto32 ...) => (I64TruncSatF32S ...)
+(Cvt32Fto64 ...) => (I64TruncSatF32S ...)
+(Cvt64Fto32 ...) => (I64TruncSatF64S ...)
+(Cvt64Fto64 ...) => (I64TruncSatF64S ...)
+(Cvt32Fto32U ...) => (I64TruncSatF32U ...)
+(Cvt32Fto64U ...) => (I64TruncSatF32U ...)
+(Cvt64Fto32U ...) => (I64TruncSatF64U ...)
+(Cvt64Fto64U ...) => (I64TruncSatF64U ...)
+
+(Cvt32Fto64F ...) => (F64PromoteF32 ...)
+(Cvt64Fto32F ...) => (F32DemoteF64 ...)
+
+(CvtBoolToUint8 ...) => (Copy ...)
+
+(Round32F ...) => (Copy ...)
+(Round64F ...) => (Copy ...)
+
+// Lowering shifts
+// Unsigned shifts need to return 0 if shift amount is >= width of shifted value.
+
+(Lsh64x64 x y) && shiftIsBounded(v) => (I64Shl x y)
+(Lsh64x64 x (I64Const [c])) && uint64(c) < 64 => (I64Shl x (I64Const [c]))
+(Lsh64x64 x (I64Const [c])) && uint64(c) >= 64 => (I64Const [0])
+(Lsh64x64 x y) => (Select (I64Shl x y) (I64Const [0]) (I64LtU y (I64Const [64])))
+(Lsh64x(32|16|8) [c] x y) => (Lsh64x64 [c] x (ZeroExt(32|16|8)to64 y))
+
+(Lsh32x64 ...) => (Lsh64x64 ...)
+(Lsh32x(32|16|8) [c] x y) => (Lsh64x64 [c] x (ZeroExt(32|16|8)to64 y))
+
+(Lsh16x64 ...) => (Lsh64x64 ...)
+(Lsh16x(32|16|8) [c] x y) => (Lsh64x64 [c] x (ZeroExt(32|16|8)to64 y))
+
+(Lsh8x64 ...) => (Lsh64x64 ...)
+(Lsh8x(32|16|8) [c] x y) => (Lsh64x64 [c] x (ZeroExt(32|16|8)to64 y))
+
+(Rsh64Ux64 x y) && shiftIsBounded(v) => (I64ShrU x y)
+(Rsh64Ux64 x (I64Const [c])) && uint64(c) < 64 => (I64ShrU x (I64Const [c]))
+(Rsh64Ux64 x (I64Const [c])) && uint64(c) >= 64 => (I64Const [0])
+(Rsh64Ux64 x y) => (Select (I64ShrU x y) (I64Const [0]) (I64LtU y (I64Const [64])))
+(Rsh64Ux(32|16|8) [c] x y) => (Rsh64Ux64 [c] x (ZeroExt(32|16|8)to64 y))
+
+(Rsh32Ux64 [c] x y) => (Rsh64Ux64 [c] (ZeroExt32to64 x) y)
+(Rsh32Ux(32|16|8) [c] x y) => (Rsh64Ux64 [c] (ZeroExt32to64 x) (ZeroExt(32|16|8)to64 y))
+
+(Rsh16Ux64 [c] x y) => (Rsh64Ux64 [c] (ZeroExt16to64 x) y)
+(Rsh16Ux(32|16|8) [c] x y) => (Rsh64Ux64 [c] (ZeroExt16to64 x) (ZeroExt(32|16|8)to64 y))
+
+(Rsh8Ux64 [c] x y) => (Rsh64Ux64 [c] (ZeroExt8to64 x) y)
+(Rsh8Ux(32|16|8) [c] x y) => (Rsh64Ux64 [c] (ZeroExt8to64 x) (ZeroExt(32|16|8)to64 y))
+
+// Signed right shift needs to return 0/-1 if shift amount is >= width of shifted value.
+// We implement this by setting the shift value to (width - 1) if the shift value is >= width.
+
+(Rsh64x64 x y) && shiftIsBounded(v) => (I64ShrS x y)
+(Rsh64x64 x (I64Const [c])) && uint64(c) < 64 => (I64ShrS x (I64Const [c]))
+(Rsh64x64 x (I64Const [c])) && uint64(c) >= 64 => (I64ShrS x (I64Const [63]))
+(Rsh64x64 x y) => (I64ShrS x (Select <typ.Int64> y (I64Const [63]) (I64LtU y (I64Const [64]))))
+(Rsh64x(32|16|8) [c] x y) => (Rsh64x64 [c] x (ZeroExt(32|16|8)to64 y))
+
+(Rsh32x64 [c] x y) => (Rsh64x64 [c] (SignExt32to64 x) y)
+(Rsh32x(32|16|8) [c] x y) => (Rsh64x64 [c] (SignExt32to64 x) (ZeroExt(32|16|8)to64 y))
+
+(Rsh16x64 [c] x y) => (Rsh64x64 [c] (SignExt16to64 x) y)
+(Rsh16x(32|16|8) [c] x y) => (Rsh64x64 [c] (SignExt16to64 x) (ZeroExt(32|16|8)to64 y))
+
+(Rsh8x64 [c] x y) => (Rsh64x64 [c] (SignExt8to64 x) y)
+(Rsh8x(32|16|8) [c] x y) => (Rsh64x64 [c] (SignExt8to64 x) (ZeroExt(32|16|8)to64 y))
+
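[Editor's note] The shift rules above build Go's shift semantics on top of wasm's modulo-64 shift instructions: when the shift amount is not provably in range, the result is selected against zero (unsigned) or the amount is clamped to 63 (signed). A minimal sketch of the unsigned 64-bit case in plain Go, assuming nothing beyond what the rules state (the helper name is illustrative):

    // What (Lsh64x64 x y) lowers to when the shift is not provably bounded:
    // wasm's i64.shl only uses the low 6 bits of y, so the rule wraps it in a
    // Select that forces the result to 0 once y >= 64.
    func lsh64x64(x, y uint64) uint64 {
        shifted := x << (y & 63) // i64.shl takes the shift count modulo 64
        if y < 64 {              // I64LtU y (I64Const [64])
            return shifted       // Select picks the shifted value ...
        }
        return 0                 // ... or the zero constant
    }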
+// Lowering rotates
+(RotateLeft8 <t> x (I64Const [c])) => (Or8 (Lsh8x64 <t> x (I64Const [c&7])) (Rsh8Ux64 <t> x (I64Const [-c&7])))
+(RotateLeft16 <t> x (I64Const [c])) => (Or16 (Lsh16x64 <t> x (I64Const [c&15])) (Rsh16Ux64 <t> x (I64Const [-c&15])))
+(RotateLeft32 ...) => (I32Rotl ...)
+(RotateLeft64 ...) => (I64Rotl ...)
+
+// Lowering comparisons
+(Less64 ...) => (I64LtS ...)
+(Less32 x y) => (I64LtS (SignExt32to64 x) (SignExt32to64 y))
+(Less16 x y) => (I64LtS (SignExt16to64 x) (SignExt16to64 y))
+(Less8 x y) => (I64LtS (SignExt8to64 x) (SignExt8to64 y))
+(Less64U ...) => (I64LtU ...)
+(Less32U x y) => (I64LtU (ZeroExt32to64 x) (ZeroExt32to64 y))
+(Less16U x y) => (I64LtU (ZeroExt16to64 x) (ZeroExt16to64 y))
+(Less8U x y) => (I64LtU (ZeroExt8to64 x) (ZeroExt8to64 y))
+(Less(64|32)F ...) => (F(64|32)Lt ...)
+
+(Leq64 ...) => (I64LeS ...)
+(Leq32 x y) => (I64LeS (SignExt32to64 x) (SignExt32to64 y))
+(Leq16 x y) => (I64LeS (SignExt16to64 x) (SignExt16to64 y))
+(Leq8 x y) => (I64LeS (SignExt8to64 x) (SignExt8to64 y))
+(Leq64U ...) => (I64LeU ...)
+(Leq32U x y) => (I64LeU (ZeroExt32to64 x) (ZeroExt32to64 y))
+(Leq16U x y) => (I64LeU (ZeroExt16to64 x) (ZeroExt16to64 y))
+(Leq8U x y) => (I64LeU (ZeroExt8to64 x) (ZeroExt8to64 y))
+(Leq(64|32)F ...) => (F(64|32)Le ...)
+
+(Eq64 ...) => (I64Eq ...)
+(Eq32 x y) => (I64Eq (ZeroExt32to64 x) (ZeroExt32to64 y))
+(Eq16 x y) => (I64Eq (ZeroExt16to64 x) (ZeroExt16to64 y))
+(Eq8 x y) => (I64Eq (ZeroExt8to64 x) (ZeroExt8to64 y))
+(EqB ...) => (I64Eq ...)
+(EqPtr ...) => (I64Eq ...)
+(Eq(64|32)F ...) => (F(64|32)Eq ...)
+
+(Neq64 ...) => (I64Ne ...)
+(Neq32 x y) => (I64Ne (ZeroExt32to64 x) (ZeroExt32to64 y))
+(Neq16 x y) => (I64Ne (ZeroExt16to64 x) (ZeroExt16to64 y))
+(Neq8 x y) => (I64Ne (ZeroExt8to64 x) (ZeroExt8to64 y))
+(NeqB ...) => (I64Ne ...)
+(NeqPtr ...) => (I64Ne ...)
+(Neq(64|32)F ...) => (F(64|32)Ne ...)
+
+// Lowering loads
+(Load <t> ptr mem) && is32BitFloat(t) => (F32Load ptr mem)
+(Load <t> ptr mem) && is64BitFloat(t) => (F64Load ptr mem)
+(Load <t> ptr mem) && t.Size() == 8 => (I64Load ptr mem)
+(Load <t> ptr mem) && t.Size() == 4 && !t.IsSigned() => (I64Load32U ptr mem)
+(Load <t> ptr mem) && t.Size() == 4 && t.IsSigned() => (I64Load32S ptr mem)
+(Load <t> ptr mem) && t.Size() == 2 && !t.IsSigned() => (I64Load16U ptr mem)
+(Load <t> ptr mem) && t.Size() == 2 && t.IsSigned() => (I64Load16S ptr mem)
+(Load <t> ptr mem) && t.Size() == 1 && !t.IsSigned() => (I64Load8U ptr mem)
+(Load <t> ptr mem) && t.Size() == 1 && t.IsSigned() => (I64Load8S ptr mem)
+
+// Lowering stores
+(Store {t} ptr val mem) && is64BitFloat(t) => (F64Store ptr val mem)
+(Store {t} ptr val mem) && is32BitFloat(t) => (F32Store ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 => (I64Store ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 => (I64Store32 ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 2 => (I64Store16 ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 1 => (I64Store8 ptr val mem)
+
+// Lowering moves
+(Move [0] _ _ mem) => mem
+(Move [1] dst src mem) => (I64Store8 dst (I64Load8U src mem) mem)
+(Move [2] dst src mem) => (I64Store16 dst (I64Load16U src mem) mem)
+(Move [4] dst src mem) => (I64Store32 dst (I64Load32U src mem) mem)
+(Move [8] dst src mem) => (I64Store dst (I64Load src mem) mem)
+(Move [16] dst src mem) =>
+ (I64Store [8] dst (I64Load [8] src mem)
+ (I64Store dst (I64Load src mem) mem))
+(Move [3] dst src mem) =>
+ (I64Store8 [2] dst (I64Load8U [2] src mem)
+ (I64Store16 dst (I64Load16U src mem) mem))
+(Move [5] dst src mem) =>
+ (I64Store8 [4] dst (I64Load8U [4] src mem)
+ (I64Store32 dst (I64Load32U src mem) mem))
+(Move [6] dst src mem) =>
+ (I64Store16 [4] dst (I64Load16U [4] src mem)
+ (I64Store32 dst (I64Load32U src mem) mem))
+(Move [7] dst src mem) =>
+ (I64Store32 [3] dst (I64Load32U [3] src mem)
+ (I64Store32 dst (I64Load32U src mem) mem))
+(Move [s] dst src mem) && s > 8 && s < 16 =>
+ (I64Store [s-8] dst (I64Load [s-8] src mem)
+ (I64Store dst (I64Load src mem) mem))
+
+// Adjust moves to be a multiple of 16 bytes.
+(Move [s] dst src mem)
+ && s > 16 && s%16 != 0 && s%16 <= 8 =>
+ (Move [s-s%16]
+ (OffPtr <dst.Type> dst [s%16])
+ (OffPtr <src.Type> src [s%16])
+ (I64Store dst (I64Load src mem) mem))
+(Move [s] dst src mem)
+ && s > 16 && s%16 != 0 && s%16 > 8 =>
+ (Move [s-s%16]
+ (OffPtr <dst.Type> dst [s%16])
+ (OffPtr <src.Type> src [s%16])
+ (I64Store [8] dst (I64Load [8] src mem)
+ (I64Store dst (I64Load src mem) mem)))
+
+// Large copying uses helper.
+(Move [s] dst src mem) && s%8 == 0 && logLargeCopy(v, s) =>
+ (LoweredMove [s/8] dst src mem)
+
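[Editor's note] The odd-size Move rules above avoid byte loops by issuing two possibly overlapping loads and stores; for example, Move [7] performs two 4-byte copies that overlap by one byte. A rough plain-Go equivalent of the 9..15-byte case, assuming non-overlapping src and dst as Move does (the helper name is illustrative):

    // Copy s bytes (8 < s < 16) with two 8-byte copies: the first covers
    // bytes [0,8), the second covers [s-8,s); the two ranges may overlap.
    func move9to15(dst, src []byte, s int) {
        copy(dst[:8], src[:8])       // I64Store dst (I64Load src mem) mem
        copy(dst[s-8:s], src[s-8:s]) // I64Store [s-8] dst (I64Load [s-8] src mem) ...
    }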
+// Lowering Zero instructions
+(Zero [0] _ mem) => mem
+(Zero [1] destptr mem) => (I64Store8 destptr (I64Const [0]) mem)
+(Zero [2] destptr mem) => (I64Store16 destptr (I64Const [0]) mem)
+(Zero [4] destptr mem) => (I64Store32 destptr (I64Const [0]) mem)
+(Zero [8] destptr mem) => (I64Store destptr (I64Const [0]) mem)
+
+(Zero [3] destptr mem) =>
+ (I64Store8 [2] destptr (I64Const [0])
+ (I64Store16 destptr (I64Const [0]) mem))
+(Zero [5] destptr mem) =>
+ (I64Store8 [4] destptr (I64Const [0])
+ (I64Store32 destptr (I64Const [0]) mem))
+(Zero [6] destptr mem) =>
+ (I64Store16 [4] destptr (I64Const [0])
+ (I64Store32 destptr (I64Const [0]) mem))
+(Zero [7] destptr mem) =>
+ (I64Store32 [3] destptr (I64Const [0])
+ (I64Store32 destptr (I64Const [0]) mem))
+
+// Strip off any fractional word zeroing.
+(Zero [s] destptr mem) && s%8 != 0 && s > 8 =>
+ (Zero [s-s%8] (OffPtr <destptr.Type> destptr [s%8])
+ (I64Store destptr (I64Const [0]) mem))
+
+// Zero small numbers of words directly.
+(Zero [16] destptr mem) =>
+ (I64Store [8] destptr (I64Const [0])
+ (I64Store destptr (I64Const [0]) mem))
+(Zero [24] destptr mem) =>
+ (I64Store [16] destptr (I64Const [0])
+ (I64Store [8] destptr (I64Const [0])
+ (I64Store destptr (I64Const [0]) mem)))
+(Zero [32] destptr mem) =>
+ (I64Store [24] destptr (I64Const [0])
+ (I64Store [16] destptr (I64Const [0])
+ (I64Store [8] destptr (I64Const [0])
+ (I64Store destptr (I64Const [0]) mem))))
+
+// Large zeroing uses helper.
+(Zero [s] destptr mem) && s%8 == 0 && s > 32 =>
+ (LoweredZero [s/8] destptr mem)
+
+// Lowering constants
+(Const64 ...) => (I64Const ...)
+(Const(32|16|8) [c]) => (I64Const [int64(c)])
+(Const(64|32)F ...) => (F(64|32)Const ...)
+(ConstNil) => (I64Const [0])
+(ConstBool [c]) => (I64Const [b2i(c)])
+
+// Lowering calls
+(StaticCall ...) => (LoweredStaticCall ...)
+(ClosureCall ...) => (LoweredClosureCall ...)
+(InterCall ...) => (LoweredInterCall ...)
+
+// Miscellaneous
+(Convert ...) => (LoweredConvert ...)
+(IsNonNil p) => (I64Eqz (I64Eqz p))
+(IsInBounds ...) => (I64LtU ...)
+(IsSliceInBounds ...) => (I64LeU ...)
+(NilCheck ...) => (LoweredNilCheck ...)
+(GetClosurePtr ...) => (LoweredGetClosurePtr ...)
+(GetCallerPC ...) => (LoweredGetCallerPC ...)
+(GetCallerSP ...) => (LoweredGetCallerSP ...)
+(Addr {sym} base) => (LoweredAddr {sym} [0] base)
+(LocalAddr {sym} base _) => (LoweredAddr {sym} base)
+
+// Write barrier.
+(WB ...) => (LoweredWB ...)
+
+// --- Intrinsics ---
+(Sqrt ...) => (F64Sqrt ...)
+(Trunc ...) => (F64Trunc ...)
+(Ceil ...) => (F64Ceil ...)
+(Floor ...) => (F64Floor ...)
+(RoundToEven ...) => (F64Nearest ...)
+(Abs ...) => (F64Abs ...)
+(Copysign ...) => (F64Copysign ...)
+
+(Ctz64 ...) => (I64Ctz ...)
+(Ctz32 x) => (I64Ctz (I64Or x (I64Const [0x100000000])))
+(Ctz16 x) => (I64Ctz (I64Or x (I64Const [0x10000])))
+(Ctz8 x) => (I64Ctz (I64Or x (I64Const [0x100])))
+
+(Ctz(64|32|16|8)NonZero ...) => (I64Ctz ...)
+
+(BitLen64 x) => (I64Sub (I64Const [64]) (I64Clz x))
+
+(PopCount64 ...) => (I64Popcnt ...)
+(PopCount32 x) => (I64Popcnt (ZeroExt32to64 x))
+(PopCount16 x) => (I64Popcnt (ZeroExt16to64 x))
+(PopCount8 x) => (I64Popcnt (ZeroExt8to64 x))
+
+(CondSelect ...) => (Select ...)
+
+// --- Optimizations ---
+(I64Add (I64Const [x]) (I64Const [y])) => (I64Const [x + y])
+(I64Mul (I64Const [x]) (I64Const [y])) => (I64Const [x * y])
+(I64And (I64Const [x]) (I64Const [y])) => (I64Const [x & y])
+(I64Or (I64Const [x]) (I64Const [y])) => (I64Const [x | y])
+(I64Xor (I64Const [x]) (I64Const [y])) => (I64Const [x ^ y])
+(F64Add (F64Const [x]) (F64Const [y])) => (F64Const [x + y])
+(F64Mul (F64Const [x]) (F64Const [y])) && !math.IsNaN(x * y) => (F64Const [x * y])
+(I64Eq (I64Const [x]) (I64Const [y])) && x == y => (I64Const [1])
+(I64Eq (I64Const [x]) (I64Const [y])) && x != y => (I64Const [0])
+(I64Ne (I64Const [x]) (I64Const [y])) && x == y => (I64Const [0])
+(I64Ne (I64Const [x]) (I64Const [y])) && x != y => (I64Const [1])
+
+(I64Shl (I64Const [x]) (I64Const [y])) => (I64Const [x << uint64(y)])
+(I64ShrU (I64Const [x]) (I64Const [y])) => (I64Const [int64(uint64(x) >> uint64(y))])
+(I64ShrS (I64Const [x]) (I64Const [y])) => (I64Const [x >> uint64(y)])
+
+// TODO: declare these operations as commutative and get rid of these rules?
+(I64Add (I64Const [x]) y) && y.Op != OpWasmI64Const => (I64Add y (I64Const [x]))
+(I64Mul (I64Const [x]) y) && y.Op != OpWasmI64Const => (I64Mul y (I64Const [x]))
+(I64And (I64Const [x]) y) && y.Op != OpWasmI64Const => (I64And y (I64Const [x]))
+(I64Or (I64Const [x]) y) && y.Op != OpWasmI64Const => (I64Or y (I64Const [x]))
+(I64Xor (I64Const [x]) y) && y.Op != OpWasmI64Const => (I64Xor y (I64Const [x]))
+(F64Add (F64Const [x]) y) && y.Op != OpWasmF64Const => (F64Add y (F64Const [x]))
+(F64Mul (F64Const [x]) y) && y.Op != OpWasmF64Const => (F64Mul y (F64Const [x]))
+(I64Eq (I64Const [x]) y) && y.Op != OpWasmI64Const => (I64Eq y (I64Const [x]))
+(I64Ne (I64Const [x]) y) && y.Op != OpWasmI64Const => (I64Ne y (I64Const [x]))
+
+(I64Eq x (I64Const [0])) => (I64Eqz x)
+(I64LtU (I64Const [0]) x) => (I64Eqz (I64Eqz x))
+(I64LeU x (I64Const [0])) => (I64Eqz x)
+(I64LtU x (I64Const [1])) => (I64Eqz x)
+(I64LeU (I64Const [1]) x) => (I64Eqz (I64Eqz x))
+(I64Ne x (I64Const [0])) => (I64Eqz (I64Eqz x))
+
+(I64Add x (I64Const [y])) => (I64AddConst [y] x)
+(I64AddConst [0] x) => x
+(I64Eqz (I64Eqz (I64Eqz x))) => (I64Eqz x)
+
+// folding offset into load/store
+((I64Load|I64Load32U|I64Load32S|I64Load16U|I64Load16S|I64Load8U|I64Load8S) [off] (I64AddConst [off2] ptr) mem)
+ && isU32Bit(off+off2) =>
+ ((I64Load|I64Load32U|I64Load32S|I64Load16U|I64Load16S|I64Load8U|I64Load8S) [off+off2] ptr mem)
+
+((I64Store|I64Store32|I64Store16|I64Store8) [off] (I64AddConst [off2] ptr) val mem)
+ && isU32Bit(off+off2) =>
+ ((I64Store|I64Store32|I64Store16|I64Store8) [off+off2] ptr val mem)
+
+// folding offset into address
+(I64AddConst [off] (LoweredAddr {sym} [off2] base)) && isU32Bit(off+int64(off2)) =>
+ (LoweredAddr {sym} [int32(off)+off2] base)
+(I64AddConst [off] x:(SP)) && isU32Bit(off) => (LoweredAddr [int32(off)] x) // so it is rematerializeable
+
+// transforming readonly globals into constants
+(I64Load [off] (LoweredAddr {sym} [off2] (SB)) _) && symIsRO(sym) && isU32Bit(off+int64(off2)) => (I64Const [int64(read64(sym, off+int64(off2), config.ctxt.Arch.ByteOrder))])
+(I64Load32U [off] (LoweredAddr {sym} [off2] (SB)) _) && symIsRO(sym) && isU32Bit(off+int64(off2)) => (I64Const [int64(read32(sym, off+int64(off2), config.ctxt.Arch.ByteOrder))])
+(I64Load16U [off] (LoweredAddr {sym} [off2] (SB)) _) && symIsRO(sym) && isU32Bit(off+int64(off2)) => (I64Const [int64(read16(sym, off+int64(off2), config.ctxt.Arch.ByteOrder))])
+(I64Load8U [off] (LoweredAddr {sym} [off2] (SB)) _) && symIsRO(sym) && isU32Bit(off+int64(off2)) => (I64Const [int64(read8(sym, off+int64(off2)))])
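[Editor's note] Each rule in this file becomes a match/rewrite case in the generated rewriteWasm.go. As a rough, illustrative sketch only (not the exact generated source), a simple rule such as (I64AddConst [0] x) => x compiles into a function of approximately this shape; helpers like auxIntToInt64 and Value.copyOf exist in package ssa, but the body here is simplified:

    // Approximate shape of a rulegen-emitted rewrite for (I64AddConst [0] x) => x.
    func rewriteValueWasm_OpWasmI64AddConst(v *Value) bool {
        x := v.Args[0]
        // match: (I64AddConst [0] x)
        // result: x
        if auxIntToInt64(v.AuxInt) == 0 {
            v.copyOf(x) // replace v with a copy of x
            return true
        }
        return false
    }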
diff --git a/src/cmd/compile/internal/ssa/gen/WasmOps.go b/src/cmd/compile/internal/ssa/gen/WasmOps.go
new file mode 100644
index 0000000..36c53bc
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/WasmOps.go
@@ -0,0 +1,278 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+import "strings"
+
+var regNamesWasm = []string{
+ "R0",
+ "R1",
+ "R2",
+ "R3",
+ "R4",
+ "R5",
+ "R6",
+ "R7",
+ "R8",
+ "R9",
+ "R10",
+ "R11",
+ "R12",
+ "R13",
+ "R14",
+ "R15",
+
+ "F0",
+ "F1",
+ "F2",
+ "F3",
+ "F4",
+ "F5",
+ "F6",
+ "F7",
+ "F8",
+ "F9",
+ "F10",
+ "F11",
+ "F12",
+ "F13",
+ "F14",
+ "F15",
+
+ "F16",
+ "F17",
+ "F18",
+ "F19",
+ "F20",
+ "F21",
+ "F22",
+ "F23",
+ "F24",
+ "F25",
+ "F26",
+ "F27",
+ "F28",
+ "F29",
+ "F30",
+ "F31",
+
+ "SP",
+ "g",
+
+ // pseudo-registers
+ "SB",
+}
+
+func init() {
+ // Make map from reg names to reg integers.
+ if len(regNamesWasm) > 64 {
+ panic("too many registers")
+ }
+ num := map[string]int{}
+ for i, name := range regNamesWasm {
+ num[name] = i
+ }
+ buildReg := func(s string) regMask {
+ m := regMask(0)
+ for _, r := range strings.Split(s, " ") {
+ if n, ok := num[r]; ok {
+ m |= regMask(1) << uint(n)
+ continue
+ }
+ panic("register " + r + " not found")
+ }
+ return m
+ }
+
+ var (
+ gp = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15")
+ fp32 = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15")
+ fp64 = buildReg("F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31")
+ gpsp = gp | buildReg("SP")
+ gpspsb = gpsp | buildReg("SB")
+ // The "registers", which are actually local variables, can get clobbered
+		// when we switch goroutines, because the switch unwinds the WebAssembly stack.
+ callerSave = gp | fp32 | fp64 | buildReg("g")
+ )
+
+ // Common regInfo
+ var (
+ gp01 = regInfo{inputs: nil, outputs: []regMask{gp}}
+ gp11 = regInfo{inputs: []regMask{gpsp}, outputs: []regMask{gp}}
+ gp21 = regInfo{inputs: []regMask{gpsp, gpsp}, outputs: []regMask{gp}}
+ gp31 = regInfo{inputs: []regMask{gpsp, gpsp, gpsp}, outputs: []regMask{gp}}
+ fp32_01 = regInfo{inputs: nil, outputs: []regMask{fp32}}
+ fp32_11 = regInfo{inputs: []regMask{fp32}, outputs: []regMask{fp32}}
+ fp32_21 = regInfo{inputs: []regMask{fp32, fp32}, outputs: []regMask{fp32}}
+ fp32_21gp = regInfo{inputs: []regMask{fp32, fp32}, outputs: []regMask{gp}}
+ fp64_01 = regInfo{inputs: nil, outputs: []regMask{fp64}}
+ fp64_11 = regInfo{inputs: []regMask{fp64}, outputs: []regMask{fp64}}
+ fp64_21 = regInfo{inputs: []regMask{fp64, fp64}, outputs: []regMask{fp64}}
+ fp64_21gp = regInfo{inputs: []regMask{fp64, fp64}, outputs: []regMask{gp}}
+ gpload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: []regMask{gp}}
+ gpstore = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}
+ fp32load = regInfo{inputs: []regMask{gpspsb, 0}, outputs: []regMask{fp32}}
+ fp32store = regInfo{inputs: []regMask{gpspsb, fp32, 0}}
+ fp64load = regInfo{inputs: []regMask{gpspsb, 0}, outputs: []regMask{fp64}}
+ fp64store = regInfo{inputs: []regMask{gpspsb, fp64, 0}}
+ )
+
+ var WasmOps = []opData{
+ {name: "LoweredStaticCall", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
+ {name: "LoweredClosureCall", argLength: 3, reg: regInfo{inputs: []regMask{gp, gp, 0}, clobbers: callerSave}, aux: "CallOff", call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+ {name: "LoweredInterCall", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+
+ {name: "LoweredAddr", argLength: 1, reg: gp11, aux: "SymOff", rematerializeable: true, symEffect: "Addr"}, // returns base+aux+auxint, arg0=base
+ {name: "LoweredMove", argLength: 3, reg: regInfo{inputs: []regMask{gp, gp}}, aux: "Int64"}, // large move. arg0=dst, arg1=src, arg2=mem, auxint=len/8, returns mem
+ {name: "LoweredZero", argLength: 2, reg: regInfo{inputs: []regMask{gp}}, aux: "Int64"}, // large zeroing. arg0=start, arg1=mem, auxint=len/8, returns mem
+
+ {name: "LoweredGetClosurePtr", reg: gp01}, // returns wasm.REG_CTXT, the closure pointer
+ {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true}, // returns the PC of the caller of the current function
+ {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true}, // returns the SP of the caller of the current function
+ {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gp}}, nilCheck: true, faultOnNilArg0: true}, // panic if arg0 is nil. arg1=mem
+ {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{gp, gp}}, aux: "Sym", symEffect: "None"}, // invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+
+ // LoweredConvert converts between pointers and integers.
+	// We have a special op for this so as to not confuse GC
+ // (particularly stack maps). It takes a memory arg so it
+ // gets correctly ordered with respect to GC safepoints.
+ // arg0=ptr/int arg1=mem, output=int/ptr
+ //
+ // TODO(neelance): LoweredConvert should not be necessary any more, since OpConvert does not need to be lowered any more (CL 108496).
+ {name: "LoweredConvert", argLength: 2, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}},
+
+ // The following are native WebAssembly instructions, see https://webassembly.github.io/spec/core/syntax/instructions.html
+
+ {name: "Select", asm: "Select", argLength: 3, reg: gp31}, // returns arg0 if arg2 != 0, otherwise returns arg1
+
+ {name: "I64Load8U", asm: "I64Load8U", argLength: 2, reg: gpload, aux: "Int64", typ: "UInt8"}, // read unsigned 8-bit integer from address arg0+aux, arg1=mem
+ {name: "I64Load8S", asm: "I64Load8S", argLength: 2, reg: gpload, aux: "Int64", typ: "Int8"}, // read signed 8-bit integer from address arg0+aux, arg1=mem
+ {name: "I64Load16U", asm: "I64Load16U", argLength: 2, reg: gpload, aux: "Int64", typ: "UInt16"}, // read unsigned 16-bit integer from address arg0+aux, arg1=mem
+ {name: "I64Load16S", asm: "I64Load16S", argLength: 2, reg: gpload, aux: "Int64", typ: "Int16"}, // read signed 16-bit integer from address arg0+aux, arg1=mem
+ {name: "I64Load32U", asm: "I64Load32U", argLength: 2, reg: gpload, aux: "Int64", typ: "UInt32"}, // read unsigned 32-bit integer from address arg0+aux, arg1=mem
+ {name: "I64Load32S", asm: "I64Load32S", argLength: 2, reg: gpload, aux: "Int64", typ: "Int32"}, // read signed 32-bit integer from address arg0+aux, arg1=mem
+ {name: "I64Load", asm: "I64Load", argLength: 2, reg: gpload, aux: "Int64", typ: "UInt64"}, // read 64-bit integer from address arg0+aux, arg1=mem
+ {name: "I64Store8", asm: "I64Store8", argLength: 3, reg: gpstore, aux: "Int64", typ: "Mem"}, // store 8-bit integer arg1 at address arg0+aux, arg2=mem, returns mem
+ {name: "I64Store16", asm: "I64Store16", argLength: 3, reg: gpstore, aux: "Int64", typ: "Mem"}, // store 16-bit integer arg1 at address arg0+aux, arg2=mem, returns mem
+ {name: "I64Store32", asm: "I64Store32", argLength: 3, reg: gpstore, aux: "Int64", typ: "Mem"}, // store 32-bit integer arg1 at address arg0+aux, arg2=mem, returns mem
+ {name: "I64Store", asm: "I64Store", argLength: 3, reg: gpstore, aux: "Int64", typ: "Mem"}, // store 64-bit integer arg1 at address arg0+aux, arg2=mem, returns mem
+
+ {name: "F32Load", asm: "F32Load", argLength: 2, reg: fp32load, aux: "Int64", typ: "Float32"}, // read 32-bit float from address arg0+aux, arg1=mem
+ {name: "F64Load", asm: "F64Load", argLength: 2, reg: fp64load, aux: "Int64", typ: "Float64"}, // read 64-bit float from address arg0+aux, arg1=mem
+ {name: "F32Store", asm: "F32Store", argLength: 3, reg: fp32store, aux: "Int64", typ: "Mem"}, // store 32-bit float arg1 at address arg0+aux, arg2=mem, returns mem
+ {name: "F64Store", asm: "F64Store", argLength: 3, reg: fp64store, aux: "Int64", typ: "Mem"}, // store 64-bit float arg1 at address arg0+aux, arg2=mem, returns mem
+
+ {name: "I64Const", reg: gp01, aux: "Int64", rematerializeable: true, typ: "Int64"}, // returns the constant integer aux
+ {name: "F32Const", reg: fp32_01, aux: "Float32", rematerializeable: true, typ: "Float32"}, // returns the constant float aux
+ {name: "F64Const", reg: fp64_01, aux: "Float64", rematerializeable: true, typ: "Float64"}, // returns the constant float aux
+
+ {name: "I64Eqz", asm: "I64Eqz", argLength: 1, reg: gp11, typ: "Bool"}, // arg0 == 0
+ {name: "I64Eq", asm: "I64Eq", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 == arg1
+ {name: "I64Ne", asm: "I64Ne", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 != arg1
+ {name: "I64LtS", asm: "I64LtS", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 < arg1 (signed)
+ {name: "I64LtU", asm: "I64LtU", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 < arg1 (unsigned)
+ {name: "I64GtS", asm: "I64GtS", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 > arg1 (signed)
+ {name: "I64GtU", asm: "I64GtU", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 > arg1 (unsigned)
+ {name: "I64LeS", asm: "I64LeS", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 <= arg1 (signed)
+ {name: "I64LeU", asm: "I64LeU", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 <= arg1 (unsigned)
+ {name: "I64GeS", asm: "I64GeS", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 >= arg1 (signed)
+ {name: "I64GeU", asm: "I64GeU", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 >= arg1 (unsigned)
+
+ {name: "F32Eq", asm: "F32Eq", argLength: 2, reg: fp32_21gp, typ: "Bool"}, // arg0 == arg1
+ {name: "F32Ne", asm: "F32Ne", argLength: 2, reg: fp32_21gp, typ: "Bool"}, // arg0 != arg1
+ {name: "F32Lt", asm: "F32Lt", argLength: 2, reg: fp32_21gp, typ: "Bool"}, // arg0 < arg1
+ {name: "F32Gt", asm: "F32Gt", argLength: 2, reg: fp32_21gp, typ: "Bool"}, // arg0 > arg1
+ {name: "F32Le", asm: "F32Le", argLength: 2, reg: fp32_21gp, typ: "Bool"}, // arg0 <= arg1
+ {name: "F32Ge", asm: "F32Ge", argLength: 2, reg: fp32_21gp, typ: "Bool"}, // arg0 >= arg1
+
+ {name: "F64Eq", asm: "F64Eq", argLength: 2, reg: fp64_21gp, typ: "Bool"}, // arg0 == arg1
+ {name: "F64Ne", asm: "F64Ne", argLength: 2, reg: fp64_21gp, typ: "Bool"}, // arg0 != arg1
+ {name: "F64Lt", asm: "F64Lt", argLength: 2, reg: fp64_21gp, typ: "Bool"}, // arg0 < arg1
+ {name: "F64Gt", asm: "F64Gt", argLength: 2, reg: fp64_21gp, typ: "Bool"}, // arg0 > arg1
+ {name: "F64Le", asm: "F64Le", argLength: 2, reg: fp64_21gp, typ: "Bool"}, // arg0 <= arg1
+ {name: "F64Ge", asm: "F64Ge", argLength: 2, reg: fp64_21gp, typ: "Bool"}, // arg0 >= arg1
+
+ {name: "I64Add", asm: "I64Add", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 + arg1
+ {name: "I64AddConst", asm: "I64Add", argLength: 1, reg: gp11, aux: "Int64", typ: "Int64"}, // arg0 + aux
+ {name: "I64Sub", asm: "I64Sub", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 - arg1
+ {name: "I64Mul", asm: "I64Mul", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 * arg1
+ {name: "I64DivS", asm: "I64DivS", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 / arg1 (signed)
+ {name: "I64DivU", asm: "I64DivU", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 / arg1 (unsigned)
+ {name: "I64RemS", asm: "I64RemS", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 % arg1 (signed)
+ {name: "I64RemU", asm: "I64RemU", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 % arg1 (unsigned)
+ {name: "I64And", asm: "I64And", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 & arg1
+ {name: "I64Or", asm: "I64Or", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 | arg1
+ {name: "I64Xor", asm: "I64Xor", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 ^ arg1
+ {name: "I64Shl", asm: "I64Shl", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 << (arg1 % 64)
+ {name: "I64ShrS", asm: "I64ShrS", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 >> (arg1 % 64) (signed)
+ {name: "I64ShrU", asm: "I64ShrU", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 >> (arg1 % 64) (unsigned)
+
+ {name: "F32Neg", asm: "F32Neg", argLength: 1, reg: fp32_11, typ: "Float32"}, // -arg0
+ {name: "F32Add", asm: "F32Add", argLength: 2, reg: fp32_21, typ: "Float32"}, // arg0 + arg1
+ {name: "F32Sub", asm: "F32Sub", argLength: 2, reg: fp32_21, typ: "Float32"}, // arg0 - arg1
+ {name: "F32Mul", asm: "F32Mul", argLength: 2, reg: fp32_21, typ: "Float32"}, // arg0 * arg1
+ {name: "F32Div", asm: "F32Div", argLength: 2, reg: fp32_21, typ: "Float32"}, // arg0 / arg1
+
+ {name: "F64Neg", asm: "F64Neg", argLength: 1, reg: fp64_11, typ: "Float64"}, // -arg0
+ {name: "F64Add", asm: "F64Add", argLength: 2, reg: fp64_21, typ: "Float64"}, // arg0 + arg1
+ {name: "F64Sub", asm: "F64Sub", argLength: 2, reg: fp64_21, typ: "Float64"}, // arg0 - arg1
+ {name: "F64Mul", asm: "F64Mul", argLength: 2, reg: fp64_21, typ: "Float64"}, // arg0 * arg1
+ {name: "F64Div", asm: "F64Div", argLength: 2, reg: fp64_21, typ: "Float64"}, // arg0 / arg1
+
+ {name: "I64TruncSatF64S", asm: "I64TruncSatF64S", argLength: 1, reg: regInfo{inputs: []regMask{fp64}, outputs: []regMask{gp}}, typ: "Int64"}, // truncates the float arg0 to a signed integer (saturating)
+ {name: "I64TruncSatF64U", asm: "I64TruncSatF64U", argLength: 1, reg: regInfo{inputs: []regMask{fp64}, outputs: []regMask{gp}}, typ: "Int64"}, // truncates the float arg0 to an unsigned integer (saturating)
+ {name: "I64TruncSatF32S", asm: "I64TruncSatF32S", argLength: 1, reg: regInfo{inputs: []regMask{fp32}, outputs: []regMask{gp}}, typ: "Int64"}, // truncates the float arg0 to a signed integer (saturating)
+ {name: "I64TruncSatF32U", asm: "I64TruncSatF32U", argLength: 1, reg: regInfo{inputs: []regMask{fp32}, outputs: []regMask{gp}}, typ: "Int64"}, // truncates the float arg0 to an unsigned integer (saturating)
+ {name: "F32ConvertI64S", asm: "F32ConvertI64S", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{fp32}}, typ: "Float32"}, // converts the signed integer arg0 to a float
+ {name: "F32ConvertI64U", asm: "F32ConvertI64U", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{fp32}}, typ: "Float32"}, // converts the unsigned integer arg0 to a float
+ {name: "F64ConvertI64S", asm: "F64ConvertI64S", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{fp64}}, typ: "Float64"}, // converts the signed integer arg0 to a float
+ {name: "F64ConvertI64U", asm: "F64ConvertI64U", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{fp64}}, typ: "Float64"}, // converts the unsigned integer arg0 to a float
+ {name: "F32DemoteF64", asm: "F32DemoteF64", argLength: 1, reg: regInfo{inputs: []regMask{fp64}, outputs: []regMask{fp32}}, typ: "Float32"},
+ {name: "F64PromoteF32", asm: "F64PromoteF32", argLength: 1, reg: regInfo{inputs: []regMask{fp32}, outputs: []regMask{fp64}}, typ: "Float64"},
+
+ {name: "I64Extend8S", asm: "I64Extend8S", argLength: 1, reg: gp11, typ: "Int64"}, // sign-extend arg0 from 8 to 64 bit
+ {name: "I64Extend16S", asm: "I64Extend16S", argLength: 1, reg: gp11, typ: "Int64"}, // sign-extend arg0 from 16 to 64 bit
+ {name: "I64Extend32S", asm: "I64Extend32S", argLength: 1, reg: gp11, typ: "Int64"}, // sign-extend arg0 from 32 to 64 bit
+
+ {name: "F32Sqrt", asm: "F32Sqrt", argLength: 1, reg: fp64_11, typ: "Float32"}, // sqrt(arg0)
+ {name: "F32Trunc", asm: "F32Trunc", argLength: 1, reg: fp64_11, typ: "Float32"}, // trunc(arg0)
+ {name: "F32Ceil", asm: "F32Ceil", argLength: 1, reg: fp64_11, typ: "Float32"}, // ceil(arg0)
+ {name: "F32Floor", asm: "F32Floor", argLength: 1, reg: fp64_11, typ: "Float32"}, // floor(arg0)
+ {name: "F32Nearest", asm: "F32Nearest", argLength: 1, reg: fp64_11, typ: "Float32"}, // round(arg0)
+ {name: "F32Abs", asm: "F32Abs", argLength: 1, reg: fp64_11, typ: "Float32"}, // abs(arg0)
+ {name: "F32Copysign", asm: "F32Copysign", argLength: 2, reg: fp64_21, typ: "Float32"}, // copysign(arg0, arg1)
+
+ {name: "F64Sqrt", asm: "F64Sqrt", argLength: 1, reg: fp64_11, typ: "Float64"}, // sqrt(arg0)
+ {name: "F64Trunc", asm: "F64Trunc", argLength: 1, reg: fp64_11, typ: "Float64"}, // trunc(arg0)
+ {name: "F64Ceil", asm: "F64Ceil", argLength: 1, reg: fp64_11, typ: "Float64"}, // ceil(arg0)
+ {name: "F64Floor", asm: "F64Floor", argLength: 1, reg: fp64_11, typ: "Float64"}, // floor(arg0)
+ {name: "F64Nearest", asm: "F64Nearest", argLength: 1, reg: fp64_11, typ: "Float64"}, // round(arg0)
+ {name: "F64Abs", asm: "F64Abs", argLength: 1, reg: fp64_11, typ: "Float64"}, // abs(arg0)
+ {name: "F64Copysign", asm: "F64Copysign", argLength: 2, reg: fp64_21, typ: "Float64"}, // copysign(arg0, arg1)
+
+ {name: "I64Ctz", asm: "I64Ctz", argLength: 1, reg: gp11, typ: "Int64"}, // ctz(arg0)
+ {name: "I64Clz", asm: "I64Clz", argLength: 1, reg: gp11, typ: "Int64"}, // clz(arg0)
+ {name: "I32Rotl", asm: "I32Rotl", argLength: 2, reg: gp21, typ: "Int32"}, // rotl(arg0, arg1)
+ {name: "I64Rotl", asm: "I64Rotl", argLength: 2, reg: gp21, typ: "Int64"}, // rotl(arg0, arg1)
+ {name: "I64Popcnt", asm: "I64Popcnt", argLength: 1, reg: gp11, typ: "Int64"}, // popcnt(arg0)
+ }
+
+ archs = append(archs, arch{
+ name: "Wasm",
+ pkg: "cmd/internal/obj/wasm",
+ genfile: "../../wasm/ssa.go",
+ ops: WasmOps,
+ blocks: nil,
+ regnames: regNamesWasm,
+ gpregmask: gp,
+ fpregmask: fp32 | fp64,
+ fp32regmask: fp32,
+ fp64regmask: fp64,
+ framepointerreg: -1, // not used
+ linkreg: -1, // not used
+ })
+}
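[Editor's note] buildReg above turns a space-separated register list into a bit mask with one bit per entry of regNamesWasm; the gp/fp masks handed to the arch struct are unions of those bits. A self-contained sketch of the same idea (the register names here are illustrative, not the real Wasm set):

    package main

    import (
        "fmt"
        "strings"
    )

    type regMask uint64

    func main() {
        names := []string{"R0", "R1", "R2", "SP"} // index = bit position
        num := map[string]int{}
        for i, n := range names {
            num[n] = i
        }
        buildReg := func(s string) regMask {
            m := regMask(0)
            for _, r := range strings.Split(s, " ") {
                m |= regMask(1) << uint(num[r])
            }
            return m
        }
        fmt.Printf("%04b\n", buildReg("R0 R2")) // prints 0101: one bit per register
    }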
diff --git a/src/cmd/compile/internal/ssa/gen/cover.bash b/src/cmd/compile/internal/ssa/gen/cover.bash
new file mode 100755
index 0000000..6c860fc
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/cover.bash
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+# Copyright 2020 The Go Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# A quick and dirty way to obtain code coverage from rulegen's main func. For
+# example:
+#
+# ./cover.bash && go tool cover -html=cover.out
+#
+# This script is needed to set up a temporary test file, so that we don't break
+# regular 'go run *.go' usage to run the generator.
+
+cat >main_test.go <<-EOF
+ // +build ignore
+
+ package main
+
+ import "testing"
+
+ func TestCoverage(t *testing.T) { main() }
+EOF
+
+go test -run='^TestCoverage$' -coverprofile=cover.out "$@" *.go
+
+rm -f main_test.go
diff --git a/src/cmd/compile/internal/ssa/gen/dec.rules b/src/cmd/compile/internal/ssa/gen/dec.rules
new file mode 100644
index 0000000..4c677f8
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/dec.rules
@@ -0,0 +1,92 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file contains rules to decompose builtin compound types
+// (complex,string,slice,interface) into their constituent
+// types. These rules work together with the decomposeBuiltIn
+// pass which handles phis of these types.
+
+// complex ops
+(ComplexReal (ComplexMake real _ )) => real
+(ComplexImag (ComplexMake _ imag )) => imag
+
+(Load <t> ptr mem) && t.IsComplex() && t.Size() == 8 =>
+ (ComplexMake
+ (Load <typ.Float32> ptr mem)
+ (Load <typ.Float32>
+ (OffPtr <typ.Float32Ptr> [4] ptr)
+ mem)
+ )
+(Store {t} dst (ComplexMake real imag) mem) && t.Size() == 8 =>
+ (Store {typ.Float32}
+ (OffPtr <typ.Float32Ptr> [4] dst)
+ imag
+ (Store {typ.Float32} dst real mem))
+(Load <t> ptr mem) && t.IsComplex() && t.Size() == 16 =>
+ (ComplexMake
+ (Load <typ.Float64> ptr mem)
+ (Load <typ.Float64>
+ (OffPtr <typ.Float64Ptr> [8] ptr)
+ mem)
+ )
+(Store {t} dst (ComplexMake real imag) mem) && t.Size() == 16 =>
+ (Store {typ.Float64}
+ (OffPtr <typ.Float64Ptr> [8] dst)
+ imag
+ (Store {typ.Float64} dst real mem))
+
+// string ops
+(StringPtr (StringMake ptr _)) => ptr
+(StringLen (StringMake _ len)) => len
+
+(Load <t> ptr mem) && t.IsString() =>
+ (StringMake
+ (Load <typ.BytePtr> ptr mem)
+ (Load <typ.Int>
+ (OffPtr <typ.IntPtr> [config.PtrSize] ptr)
+ mem))
+(Store dst (StringMake ptr len) mem) =>
+ (Store {typ.Int}
+ (OffPtr <typ.IntPtr> [config.PtrSize] dst)
+ len
+ (Store {typ.BytePtr} dst ptr mem))
+
+// slice ops
+(SlicePtr (SliceMake ptr _ _ )) => ptr
+(SliceLen (SliceMake _ len _)) => len
+(SliceCap (SliceMake _ _ cap)) => cap
+
+(Load <t> ptr mem) && t.IsSlice() =>
+ (SliceMake
+ (Load <t.Elem().PtrTo()> ptr mem)
+ (Load <typ.Int>
+ (OffPtr <typ.IntPtr> [config.PtrSize] ptr)
+ mem)
+ (Load <typ.Int>
+ (OffPtr <typ.IntPtr> [2*config.PtrSize] ptr)
+ mem))
+(Store {t} dst (SliceMake ptr len cap) mem) =>
+ (Store {typ.Int}
+ (OffPtr <typ.IntPtr> [2*config.PtrSize] dst)
+ cap
+ (Store {typ.Int}
+ (OffPtr <typ.IntPtr> [config.PtrSize] dst)
+ len
+ (Store {t.Elem().PtrTo()} dst ptr mem)))
+
+// interface ops
+(ITab (IMake itab _)) => itab
+(IData (IMake _ data)) => data
+
+(Load <t> ptr mem) && t.IsInterface() =>
+ (IMake
+ (Load <typ.Uintptr> ptr mem)
+ (Load <typ.BytePtr>
+ (OffPtr <typ.BytePtrPtr> [config.PtrSize] ptr)
+ mem))
+(Store dst (IMake itab data) mem) =>
+ (Store {typ.BytePtr}
+ (OffPtr <typ.BytePtrPtr> [config.PtrSize] dst)
+ data
+ (Store {typ.Uintptr} dst itab mem))
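[Editor's note] The string, slice, and interface rules above rely on the fixed word-by-word layout of those headers: a pointer word followed by integer words at multiples of config.PtrSize. A small, hedged illustration of the slice layout the rules assume, using reflect.SliceHeader on a 64-bit host:

    package main

    import (
        "fmt"
        "reflect"
        "unsafe"
    )

    func main() {
        s := make([]byte, 3, 8)
        h := (*reflect.SliceHeader)(unsafe.Pointer(&s))
        // dec.rules loads/stores exactly these three words:
        // ptr at offset 0, len at PtrSize, cap at 2*PtrSize.
        fmt.Println(h.Len, h.Cap)                                   // 3 8
        fmt.Println(unsafe.Offsetof(h.Len), unsafe.Offsetof(h.Cap)) // 8 16 on 64-bit
    }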
diff --git a/src/cmd/compile/internal/ssa/gen/dec64.rules b/src/cmd/compile/internal/ssa/gen/dec64.rules
new file mode 100644
index 0000000..9297ed8
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/dec64.rules
@@ -0,0 +1,396 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file contains rules to decompose [u]int64 types on 32-bit
+// architectures. These rules work together with the decomposeBuiltIn
+// pass which handles phis of these types.
+
+(Int64Hi (Int64Make hi _)) => hi
+(Int64Lo (Int64Make _ lo)) => lo
+
+(Load <t> ptr mem) && is64BitInt(t) && !config.BigEndian && t.IsSigned() =>
+ (Int64Make
+ (Load <typ.Int32> (OffPtr <typ.Int32Ptr> [4] ptr) mem)
+ (Load <typ.UInt32> ptr mem))
+
+(Load <t> ptr mem) && is64BitInt(t) && !config.BigEndian && !t.IsSigned() =>
+ (Int64Make
+ (Load <typ.UInt32> (OffPtr <typ.UInt32Ptr> [4] ptr) mem)
+ (Load <typ.UInt32> ptr mem))
+
+(Load <t> ptr mem) && is64BitInt(t) && config.BigEndian && t.IsSigned() =>
+ (Int64Make
+ (Load <typ.Int32> ptr mem)
+ (Load <typ.UInt32> (OffPtr <typ.UInt32Ptr> [4] ptr) mem))
+
+(Load <t> ptr mem) && is64BitInt(t) && config.BigEndian && !t.IsSigned() =>
+ (Int64Make
+ (Load <typ.UInt32> ptr mem)
+ (Load <typ.UInt32> (OffPtr <typ.UInt32Ptr> [4] ptr) mem))
+
+(Store {t} dst (Int64Make hi lo) mem) && t.Size() == 8 && !config.BigEndian =>
+ (Store {hi.Type}
+ (OffPtr <hi.Type.PtrTo()> [4] dst)
+ hi
+ (Store {lo.Type} dst lo mem))
+
+(Store {t} dst (Int64Make hi lo) mem) && t.Size() == 8 && config.BigEndian =>
+ (Store {lo.Type}
+ (OffPtr <lo.Type.PtrTo()> [4] dst)
+ lo
+ (Store {hi.Type} dst hi mem))
+
+// These rules are not enabled during decomposeBuiltin when late call expansion is in use, but they are always enabled for softFloat.
+(Arg {n} [off]) && is64BitInt(v.Type) && !config.BigEndian && v.Type.IsSigned() && !(go116lateCallExpansion && b.Func.pass.name == "decompose builtin") =>
+ (Int64Make
+ (Arg <typ.Int32> {n} [off+4])
+ (Arg <typ.UInt32> {n} [off]))
+(Arg {n} [off]) && is64BitInt(v.Type) && !config.BigEndian && !v.Type.IsSigned() && !(go116lateCallExpansion && b.Func.pass.name == "decompose builtin") =>
+ (Int64Make
+ (Arg <typ.UInt32> {n} [off+4])
+ (Arg <typ.UInt32> {n} [off]))
+
+(Arg {n} [off]) && is64BitInt(v.Type) && config.BigEndian && v.Type.IsSigned() && !(go116lateCallExpansion && b.Func.pass.name == "decompose builtin") =>
+ (Int64Make
+ (Arg <typ.Int32> {n} [off])
+ (Arg <typ.UInt32> {n} [off+4]))
+(Arg {n} [off]) && is64BitInt(v.Type) && config.BigEndian && !v.Type.IsSigned() && !(go116lateCallExpansion && b.Func.pass.name == "decompose builtin") =>
+ (Int64Make
+ (Arg <typ.UInt32> {n} [off])
+ (Arg <typ.UInt32> {n} [off+4]))
+
+(Add64 x y) =>
+ (Int64Make
+ (Add32withcarry <typ.Int32>
+ (Int64Hi x)
+ (Int64Hi y)
+ (Select1 <types.TypeFlags> (Add32carry (Int64Lo x) (Int64Lo y))))
+ (Select0 <typ.UInt32> (Add32carry (Int64Lo x) (Int64Lo y))))
+
+(Sub64 x y) =>
+ (Int64Make
+ (Sub32withcarry <typ.Int32>
+ (Int64Hi x)
+ (Int64Hi y)
+ (Select1 <types.TypeFlags> (Sub32carry (Int64Lo x) (Int64Lo y))))
+ (Select0 <typ.UInt32> (Sub32carry (Int64Lo x) (Int64Lo y))))
+
+(Mul64 x y) =>
+ (Int64Make
+ (Add32 <typ.UInt32>
+ (Mul32 <typ.UInt32> (Int64Lo x) (Int64Hi y))
+ (Add32 <typ.UInt32>
+ (Mul32 <typ.UInt32> (Int64Hi x) (Int64Lo y))
+ (Select0 <typ.UInt32> (Mul32uhilo (Int64Lo x) (Int64Lo y)))))
+ (Select1 <typ.UInt32> (Mul32uhilo (Int64Lo x) (Int64Lo y))))
+
+(And64 x y) =>
+ (Int64Make
+ (And32 <typ.UInt32> (Int64Hi x) (Int64Hi y))
+ (And32 <typ.UInt32> (Int64Lo x) (Int64Lo y)))
+
+(Or64 x y) =>
+ (Int64Make
+ (Or32 <typ.UInt32> (Int64Hi x) (Int64Hi y))
+ (Or32 <typ.UInt32> (Int64Lo x) (Int64Lo y)))
+
+(Xor64 x y) =>
+ (Int64Make
+ (Xor32 <typ.UInt32> (Int64Hi x) (Int64Hi y))
+ (Xor32 <typ.UInt32> (Int64Lo x) (Int64Lo y)))
+
+(Neg64 <t> x) => (Sub64 (Const64 <t> [0]) x)
+
+(Com64 x) =>
+ (Int64Make
+ (Com32 <typ.UInt32> (Int64Hi x))
+ (Com32 <typ.UInt32> (Int64Lo x)))
+
+// Sadly, just because we know that x is non-zero,
+// we don't know whether either component is,
+// so just treat Ctz64NonZero the same as Ctz64.
+(Ctz64NonZero ...) => (Ctz64 ...)
+
+(Ctz64 x) =>
+ (Add32 <typ.UInt32>
+ (Ctz32 <typ.UInt32> (Int64Lo x))
+ (And32 <typ.UInt32>
+ (Com32 <typ.UInt32> (Zeromask (Int64Lo x)))
+ (Ctz32 <typ.UInt32> (Int64Hi x))))
+
+(BitLen64 x) =>
+ (Add32 <typ.Int>
+ (BitLen32 <typ.Int> (Int64Hi x))
+ (BitLen32 <typ.Int>
+ (Or32 <typ.UInt32>
+ (Int64Lo x)
+ (Zeromask (Int64Hi x)))))
+
+(Bswap64 x) =>
+ (Int64Make
+ (Bswap32 <typ.UInt32> (Int64Lo x))
+ (Bswap32 <typ.UInt32> (Int64Hi x)))
+
+(SignExt32to64 x) => (Int64Make (Signmask x) x)
+(SignExt16to64 x) => (SignExt32to64 (SignExt16to32 x))
+(SignExt8to64 x) => (SignExt32to64 (SignExt8to32 x))
+
+(ZeroExt32to64 x) => (Int64Make (Const32 <typ.UInt32> [0]) x)
+(ZeroExt16to64 x) => (ZeroExt32to64 (ZeroExt16to32 x))
+(ZeroExt8to64 x) => (ZeroExt32to64 (ZeroExt8to32 x))
+
+(Trunc64to32 (Int64Make _ lo)) => lo
+(Trunc64to16 (Int64Make _ lo)) => (Trunc32to16 lo)
+(Trunc64to8 (Int64Make _ lo)) => (Trunc32to8 lo)
+// Most general
+(Trunc64to32 x) => (Int64Lo x)
+(Trunc64to16 x) => (Trunc32to16 (Int64Lo x))
+(Trunc64to8 x) => (Trunc32to8 (Int64Lo x))
+
+(Lsh32x64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const32 [0])
+(Rsh32x64 x (Int64Make (Const32 [c]) _)) && c != 0 => (Signmask x)
+(Rsh32Ux64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const32 [0])
+(Lsh16x64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const32 [0])
+(Rsh16x64 x (Int64Make (Const32 [c]) _)) && c != 0 => (Signmask (SignExt16to32 x))
+(Rsh16Ux64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const32 [0])
+(Lsh8x64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const32 [0])
+(Rsh8x64 x (Int64Make (Const32 [c]) _)) && c != 0 => (Signmask (SignExt8to32 x))
+(Rsh8Ux64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const32 [0])
+
+(Lsh32x64 [c] x (Int64Make (Const32 [0]) lo)) => (Lsh32x32 [c] x lo)
+(Rsh32x64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh32x32 [c] x lo)
+(Rsh32Ux64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh32Ux32 [c] x lo)
+(Lsh16x64 [c] x (Int64Make (Const32 [0]) lo)) => (Lsh16x32 [c] x lo)
+(Rsh16x64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh16x32 [c] x lo)
+(Rsh16Ux64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh16Ux32 [c] x lo)
+(Lsh8x64 [c] x (Int64Make (Const32 [0]) lo)) => (Lsh8x32 [c] x lo)
+(Rsh8x64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh8x32 [c] x lo)
+(Rsh8Ux64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh8Ux32 [c] x lo)
+
+(Lsh64x64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const64 [0])
+(Rsh64x64 x (Int64Make (Const32 [c]) _)) && c != 0 => (Int64Make (Signmask (Int64Hi x)) (Signmask (Int64Hi x)))
+(Rsh64Ux64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const64 [0])
+
+(Lsh64x64 [c] x (Int64Make (Const32 [0]) lo)) => (Lsh64x32 [c] x lo)
+(Rsh64x64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh64x32 [c] x lo)
+(Rsh64Ux64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh64Ux32 [c] x lo)
+
+// Turn x64 non-constant shifts into x32 shifts.
+// If the high 32 bits of the shift amount are nonzero, produce a huge shift instead.
+(Lsh64x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Lsh64x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+(Rsh64x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Rsh64x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+(Rsh64Ux64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Rsh64Ux32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+(Lsh32x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Lsh32x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+(Rsh32x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Rsh32x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+(Rsh32Ux64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Rsh32Ux32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+(Lsh16x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Lsh16x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+(Rsh16x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Rsh16x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+(Rsh16Ux64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Rsh16Ux32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+(Lsh8x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Lsh8x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+(Rsh8x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Rsh8x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+(Rsh8Ux64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
+ (Rsh8Ux32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+
+// Most general
+(Lsh64x64 x y) => (Lsh64x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh64x64 x y) => (Rsh64x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh64Ux64 x y) => (Rsh64Ux32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Lsh32x64 x y) => (Lsh32x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh32x64 x y) => (Rsh32x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh32Ux64 x y) => (Rsh32Ux32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Lsh16x64 x y) => (Lsh16x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh16x64 x y) => (Rsh16x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh16Ux64 x y) => (Rsh16Ux32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Lsh8x64 x y) => (Lsh8x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh8x64 x y) => (Rsh8x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh8Ux64 x y) => (Rsh8Ux32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+
+// Clean up constants a little
+(Or32 <typ.UInt32> (Zeromask (Const32 [c])) y) && c == 0 => y
+(Or32 <typ.UInt32> (Zeromask (Const32 [c])) y) && c != 0 => (Const32 <typ.UInt32> [-1])
+
+// 64x left shift
+// result.hi = hi<<s | lo>>(32-s) | lo<<(s-32) // >> is unsigned, large shifts result in 0
+// result.lo = lo<<s
+(Lsh64x32 x s) =>
+ (Int64Make
+ (Or32 <typ.UInt32>
+ (Or32 <typ.UInt32>
+ (Lsh32x32 <typ.UInt32> (Int64Hi x) s)
+ (Rsh32Ux32 <typ.UInt32>
+ (Int64Lo x)
+ (Sub32 <typ.UInt32> (Const32 <typ.UInt32> [32]) s)))
+ (Lsh32x32 <typ.UInt32>
+ (Int64Lo x)
+ (Sub32 <typ.UInt32> s (Const32 <typ.UInt32> [32]))))
+ (Lsh32x32 <typ.UInt32> (Int64Lo x) s))
+(Lsh64x16 x s) =>
+ (Int64Make
+ (Or32 <typ.UInt32>
+ (Or32 <typ.UInt32>
+ (Lsh32x16 <typ.UInt32> (Int64Hi x) s)
+ (Rsh32Ux16 <typ.UInt32>
+ (Int64Lo x)
+ (Sub16 <typ.UInt16> (Const16 <typ.UInt16> [32]) s)))
+ (Lsh32x16 <typ.UInt32>
+ (Int64Lo x)
+ (Sub16 <typ.UInt16> s (Const16 <typ.UInt16> [32]))))
+ (Lsh32x16 <typ.UInt32> (Int64Lo x) s))
+(Lsh64x8 x s) =>
+ (Int64Make
+ (Or32 <typ.UInt32>
+ (Or32 <typ.UInt32>
+ (Lsh32x8 <typ.UInt32> (Int64Hi x) s)
+ (Rsh32Ux8 <typ.UInt32>
+ (Int64Lo x)
+ (Sub8 <typ.UInt8> (Const8 <typ.UInt8> [32]) s)))
+ (Lsh32x8 <typ.UInt32>
+ (Int64Lo x)
+ (Sub8 <typ.UInt8> s (Const8 <typ.UInt8> [32]))))
+ (Lsh32x8 <typ.UInt32> (Int64Lo x) s))
+
+// 64x unsigned right shift
+// result.hi = hi>>s
+// result.lo = lo>>s | hi<<(32-s) | hi>>(s-32) // >> is unsigned, large shifts result in 0
+(Rsh64Ux32 x s) =>
+ (Int64Make
+ (Rsh32Ux32 <typ.UInt32> (Int64Hi x) s)
+ (Or32 <typ.UInt32>
+ (Or32 <typ.UInt32>
+ (Rsh32Ux32 <typ.UInt32> (Int64Lo x) s)
+ (Lsh32x32 <typ.UInt32>
+ (Int64Hi x)
+ (Sub32 <typ.UInt32> (Const32 <typ.UInt32> [32]) s)))
+ (Rsh32Ux32 <typ.UInt32>
+ (Int64Hi x)
+ (Sub32 <typ.UInt32> s (Const32 <typ.UInt32> [32])))))
+(Rsh64Ux16 x s) =>
+ (Int64Make
+ (Rsh32Ux16 <typ.UInt32> (Int64Hi x) s)
+ (Or32 <typ.UInt32>
+ (Or32 <typ.UInt32>
+ (Rsh32Ux16 <typ.UInt32> (Int64Lo x) s)
+ (Lsh32x16 <typ.UInt32>
+ (Int64Hi x)
+ (Sub16 <typ.UInt16> (Const16 <typ.UInt16> [32]) s)))
+ (Rsh32Ux16 <typ.UInt32>
+ (Int64Hi x)
+ (Sub16 <typ.UInt16> s (Const16 <typ.UInt16> [32])))))
+(Rsh64Ux8 x s) =>
+ (Int64Make
+ (Rsh32Ux8 <typ.UInt32> (Int64Hi x) s)
+ (Or32 <typ.UInt32>
+ (Or32 <typ.UInt32>
+ (Rsh32Ux8 <typ.UInt32> (Int64Lo x) s)
+ (Lsh32x8 <typ.UInt32>
+ (Int64Hi x)
+ (Sub8 <typ.UInt8> (Const8 <typ.UInt8> [32]) s)))
+ (Rsh32Ux8 <typ.UInt32>
+ (Int64Hi x)
+ (Sub8 <typ.UInt8> s (Const8 <typ.UInt8> [32])))))
+
+// 64x signed right shift
+// result.hi = hi>>s
+// result.lo = lo>>s | hi<<(32-s) | (hi>>(s-32))&zeromask(s>>5) // hi>>(s-32) is signed, large shifts result in 0/-1
+(Rsh64x32 x s) =>
+ (Int64Make
+ (Rsh32x32 <typ.UInt32> (Int64Hi x) s)
+ (Or32 <typ.UInt32>
+ (Or32 <typ.UInt32>
+ (Rsh32Ux32 <typ.UInt32> (Int64Lo x) s)
+ (Lsh32x32 <typ.UInt32>
+ (Int64Hi x)
+ (Sub32 <typ.UInt32> (Const32 <typ.UInt32> [32]) s)))
+ (And32 <typ.UInt32>
+ (Rsh32x32 <typ.UInt32>
+ (Int64Hi x)
+ (Sub32 <typ.UInt32> s (Const32 <typ.UInt32> [32])))
+ (Zeromask
+ (Rsh32Ux32 <typ.UInt32> s (Const32 <typ.UInt32> [5]))))))
+(Rsh64x16 x s) =>
+ (Int64Make
+ (Rsh32x16 <typ.UInt32> (Int64Hi x) s)
+ (Or32 <typ.UInt32>
+ (Or32 <typ.UInt32>
+ (Rsh32Ux16 <typ.UInt32> (Int64Lo x) s)
+ (Lsh32x16 <typ.UInt32>
+ (Int64Hi x)
+ (Sub16 <typ.UInt16> (Const16 <typ.UInt16> [32]) s)))
+ (And32 <typ.UInt32>
+ (Rsh32x16 <typ.UInt32>
+ (Int64Hi x)
+ (Sub16 <typ.UInt16> s (Const16 <typ.UInt16> [32])))
+ (Zeromask
+ (ZeroExt16to32
+ (Rsh16Ux32 <typ.UInt16> s (Const32 <typ.UInt32> [5])))))))
+(Rsh64x8 x s) =>
+ (Int64Make
+ (Rsh32x8 <typ.UInt32> (Int64Hi x) s)
+ (Or32 <typ.UInt32>
+ (Or32 <typ.UInt32>
+ (Rsh32Ux8 <typ.UInt32> (Int64Lo x) s)
+ (Lsh32x8 <typ.UInt32>
+ (Int64Hi x)
+ (Sub8 <typ.UInt8> (Const8 <typ.UInt8> [32]) s)))
+ (And32 <typ.UInt32>
+ (Rsh32x8 <typ.UInt32>
+ (Int64Hi x)
+ (Sub8 <typ.UInt8> s (Const8 <typ.UInt8> [32])))
+ (Zeromask
+ (ZeroExt8to32
+ (Rsh8Ux32 <typ.UInt8> s (Const32 <typ.UInt32> [5])))))))
+
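[Editor's note] The shift compositions above lean on the convention spelled out in their comments: a 32-bit shift by 32 or more yields 0, so the three terms of result.hi cover both the s < 32 and s >= 32 cases without branches. Written as plain Go (where over-wide shifts of unsigned values are likewise defined to produce 0), the Lsh64x32 case looks roughly like this (the helper name is illustrative):

    // Hedged sketch of the Lsh64x32 rule: hi and lo are the two halves of a
    // 64-bit value, s is the unsigned shift amount. uint32 shifts by 32 or
    // more bits produce 0 in Go, matching "large shifts result in 0" above.
    func lsh64viaHalves(hi, lo uint32, s uint) (rhi, rlo uint32) {
        rhi = hi<<s | lo>>(32-s) | lo<<(s-32)
        rlo = lo << s
        return
    }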
+(Const64 <t> [c]) && t.IsSigned() =>
+ (Int64Make (Const32 <typ.Int32> [int32(c>>32)]) (Const32 <typ.UInt32> [int32(c)]))
+(Const64 <t> [c]) && !t.IsSigned() =>
+ (Int64Make (Const32 <typ.UInt32> [int32(c>>32)]) (Const32 <typ.UInt32> [int32(c)]))
+
+(Eq64 x y) =>
+ (AndB
+ (Eq32 (Int64Hi x) (Int64Hi y))
+ (Eq32 (Int64Lo x) (Int64Lo y)))
+
+(Neq64 x y) =>
+ (OrB
+ (Neq32 (Int64Hi x) (Int64Hi y))
+ (Neq32 (Int64Lo x) (Int64Lo y)))
+
+(Less64U x y) =>
+ (OrB
+ (Less32U (Int64Hi x) (Int64Hi y))
+ (AndB
+ (Eq32 (Int64Hi x) (Int64Hi y))
+ (Less32U (Int64Lo x) (Int64Lo y))))
+
+(Leq64U x y) =>
+ (OrB
+ (Less32U (Int64Hi x) (Int64Hi y))
+ (AndB
+ (Eq32 (Int64Hi x) (Int64Hi y))
+ (Leq32U (Int64Lo x) (Int64Lo y))))
+
+(Less64 x y) =>
+ (OrB
+ (Less32 (Int64Hi x) (Int64Hi y))
+ (AndB
+ (Eq32 (Int64Hi x) (Int64Hi y))
+ (Less32U (Int64Lo x) (Int64Lo y))))
+
+(Leq64 x y) =>
+ (OrB
+ (Less32 (Int64Hi x) (Int64Hi y))
+ (AndB
+ (Eq32 (Int64Hi x) (Int64Hi y))
+ (Leq32U (Int64Lo x) (Int64Lo y))))
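[Editor's note] The 64-bit comparison rules above all follow the same pattern: compare the high words first and only consult the low words on a tie. For instance, Less64U decomposes exactly as in this plain-Go sketch (the helper name is illustrative):

    // Less64U on a 32-bit target, as encoded by the rule above:
    // x < y  iff  hi(x) < hi(y), or hi(x) == hi(y) and lo(x) < lo(y).
    func less64U(xHi, xLo, yHi, yLo uint32) bool {
        return xHi < yHi || (xHi == yHi && xLo < yLo)
    }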
diff --git a/src/cmd/compile/internal/ssa/gen/dec64Ops.go b/src/cmd/compile/internal/ssa/gen/dec64Ops.go
new file mode 100644
index 0000000..8c5883b
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/dec64Ops.go
@@ -0,0 +1,20 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+var dec64Ops = []opData{}
+
+var dec64Blocks = []blockData{}
+
+func init() {
+ archs = append(archs, arch{
+ name: "dec64",
+ ops: dec64Ops,
+ blocks: dec64Blocks,
+ generic: true,
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/gen/decArgs.rules b/src/cmd/compile/internal/ssa/gen/decArgs.rules
new file mode 100644
index 0000000..1c9a0bb
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/decArgs.rules
@@ -0,0 +1,58 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Decompose compound argument values
+// Do this early to simplify tracking names for debugging.
+
+(Arg {n} [off]) && v.Type.IsString() =>
+ (StringMake
+ (Arg <typ.BytePtr> {n} [off])
+ (Arg <typ.Int> {n} [off+int32(config.PtrSize)]))
+
+(Arg {n} [off]) && v.Type.IsSlice() =>
+ (SliceMake
+ (Arg <v.Type.Elem().PtrTo()> {n} [off])
+ (Arg <typ.Int> {n} [off+int32(config.PtrSize)])
+ (Arg <typ.Int> {n} [off+2*int32(config.PtrSize)]))
+
+(Arg {n} [off]) && v.Type.IsInterface() =>
+ (IMake
+ (Arg <typ.Uintptr> {n} [off])
+ (Arg <typ.BytePtr> {n} [off+int32(config.PtrSize)]))
+
+(Arg {n} [off]) && v.Type.IsComplex() && v.Type.Size() == 16 =>
+ (ComplexMake
+ (Arg <typ.Float64> {n} [off])
+ (Arg <typ.Float64> {n} [off+8]))
+
+(Arg {n} [off]) && v.Type.IsComplex() && v.Type.Size() == 8 =>
+ (ComplexMake
+ (Arg <typ.Float32> {n} [off])
+ (Arg <typ.Float32> {n} [off+4]))
+
+(Arg <t>) && t.IsStruct() && t.NumFields() == 0 && fe.CanSSA(t) =>
+ (StructMake0)
+(Arg <t> {n} [off]) && t.IsStruct() && t.NumFields() == 1 && fe.CanSSA(t) =>
+ (StructMake1
+ (Arg <t.FieldType(0)> {n} [off+int32(t.FieldOff(0))]))
+(Arg <t> {n} [off]) && t.IsStruct() && t.NumFields() == 2 && fe.CanSSA(t) =>
+ (StructMake2
+ (Arg <t.FieldType(0)> {n} [off+int32(t.FieldOff(0))])
+ (Arg <t.FieldType(1)> {n} [off+int32(t.FieldOff(1))]))
+(Arg <t> {n} [off]) && t.IsStruct() && t.NumFields() == 3 && fe.CanSSA(t) =>
+ (StructMake3
+ (Arg <t.FieldType(0)> {n} [off+int32(t.FieldOff(0))])
+ (Arg <t.FieldType(1)> {n} [off+int32(t.FieldOff(1))])
+ (Arg <t.FieldType(2)> {n} [off+int32(t.FieldOff(2))]))
+(Arg <t> {n} [off]) && t.IsStruct() && t.NumFields() == 4 && fe.CanSSA(t) =>
+ (StructMake4
+ (Arg <t.FieldType(0)> {n} [off+int32(t.FieldOff(0))])
+ (Arg <t.FieldType(1)> {n} [off+int32(t.FieldOff(1))])
+ (Arg <t.FieldType(2)> {n} [off+int32(t.FieldOff(2))])
+ (Arg <t.FieldType(3)> {n} [off+int32(t.FieldOff(3))]))
+
+(Arg <t>) && t.IsArray() && t.NumElem() == 0 =>
+ (ArrayMake0)
+(Arg <t> {n} [off]) && t.IsArray() && t.NumElem() == 1 && fe.CanSSA(t) =>
+ (ArrayMake1 (Arg <t.Elem()> {n} [off]))
diff --git a/src/cmd/compile/internal/ssa/gen/decArgsOps.go b/src/cmd/compile/internal/ssa/gen/decArgsOps.go
new file mode 100644
index 0000000..b73d9d3
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/decArgsOps.go
@@ -0,0 +1,20 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+var decArgsOps = []opData{}
+
+var decArgsBlocks = []blockData{}
+
+func init() {
+ archs = append(archs, arch{
+ name: "decArgs",
+ ops: decArgsOps,
+ blocks: decArgsBlocks,
+ generic: true,
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/gen/decOps.go b/src/cmd/compile/internal/ssa/gen/decOps.go
new file mode 100644
index 0000000..b826481
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/decOps.go
@@ -0,0 +1,20 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+var decOps = []opData{}
+
+var decBlocks = []blockData{}
+
+func init() {
+ archs = append(archs, arch{
+ name: "dec",
+ ops: decOps,
+ blocks: decBlocks,
+ generic: true,
+ })
+}
diff --git a/src/cmd/compile/internal/ssa/gen/generic.rules b/src/cmd/compile/internal/ssa/gen/generic.rules
new file mode 100644
index 0000000..1784923
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/generic.rules
@@ -0,0 +1,2535 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Simplifications that apply to all backend architectures. As an example, this
+// Go source code
+//
+// y := 0 * x
+//
+// can be translated into y := 0 without losing any information, which saves a
+// pointless multiplication instruction. Other .rules files in this directory
+// (for example AMD64.rules) contain rules specific to the architecture in the
+// filename. The rules here apply to every architecture.
+//
+// The code for parsing this file lives in rulegen.go; this file generates
+// ssa/rewritegeneric.go.
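+//
+// For example, the y := 0 * x simplification above corresponds to the rule
+//	(Mul(64|32|16|8) (Const(64|32|16|8) [0]) _) => (Const(64|32|16|8) [0])
+// further down in this file, which replaces any multiplication by a constant
+// zero with the zero constant itself.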
+
+// values are specified using the following format:
+// (op <type> [auxint] {aux} arg0 arg1 ...)
+// the type, aux, and auxint fields are optional
+// on the matching side
+// - the type, aux, and auxint fields must match if they are specified.
+// - the first occurrence of a variable defines that variable. Subsequent
+// uses must match (be == to) the first use.
+// - v is defined to be the value matched.
+// - an additional conditional can be provided after the match pattern with "&&".
+// on the generated side
+// - the type of the top-level expression is the same as the one on the left-hand side.
+// - the type of any subexpressions must be specified explicitly (or
+// be specified in the op's type field).
+// - auxint will be 0 if not specified.
+// - aux will be nil if not specified.
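+//
+// For example, in the rule
+//	(Add8 (Const8 [c]) (Const8 [d])) => (Const8 [c+d])
+// Add8 and Const8 are ops, c and d are variables bound to the auxint fields of
+// the matched constants, and the auxint of the generated Const8 is the Go
+// expression c+d evaluated at rewrite time.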
+
+// blocks are specified using the following format:
+// (kind controlvalue succ0 succ1 ...)
+// controlvalue must be "nil" or a value expression
+// succ* fields must be variables
+// For now, the generated successors must be a permutation of the matched successors.
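+//
+// For example, the block rule
+//	(If (Not cond) yes no) => (If cond no yes)
+// matches an If block whose control value is a Not and rewrites it to branch
+// on cond directly, with the two successors swapped.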
+
+// constant folding
+(Trunc16to8 (Const16 [c])) => (Const8 [int8(c)])
+(Trunc32to8 (Const32 [c])) => (Const8 [int8(c)])
+(Trunc32to16 (Const32 [c])) => (Const16 [int16(c)])
+(Trunc64to8 (Const64 [c])) => (Const8 [int8(c)])
+(Trunc64to16 (Const64 [c])) => (Const16 [int16(c)])
+(Trunc64to32 (Const64 [c])) => (Const32 [int32(c)])
+(Cvt64Fto32F (Const64F [c])) => (Const32F [float32(c)])
+(Cvt32Fto64F (Const32F [c])) => (Const64F [float64(c)])
+(Cvt32to32F (Const32 [c])) => (Const32F [float32(c)])
+(Cvt32to64F (Const32 [c])) => (Const64F [float64(c)])
+(Cvt64to32F (Const64 [c])) => (Const32F [float32(c)])
+(Cvt64to64F (Const64 [c])) => (Const64F [float64(c)])
+(Cvt32Fto32 (Const32F [c])) => (Const32 [int32(c)])
+(Cvt32Fto64 (Const32F [c])) => (Const64 [int64(c)])
+(Cvt64Fto32 (Const64F [c])) => (Const32 [int32(c)])
+(Cvt64Fto64 (Const64F [c])) => (Const64 [int64(c)])
+(Round32F x:(Const32F)) => x
+(Round64F x:(Const64F)) => x
+(CvtBoolToUint8 (ConstBool [false])) => (Const8 [0])
+(CvtBoolToUint8 (ConstBool [true])) => (Const8 [1])
+
+(Trunc16to8 (ZeroExt8to16 x)) => x
+(Trunc32to8 (ZeroExt8to32 x)) => x
+(Trunc32to16 (ZeroExt8to32 x)) => (ZeroExt8to16 x)
+(Trunc32to16 (ZeroExt16to32 x)) => x
+(Trunc64to8 (ZeroExt8to64 x)) => x
+(Trunc64to16 (ZeroExt8to64 x)) => (ZeroExt8to16 x)
+(Trunc64to16 (ZeroExt16to64 x)) => x
+(Trunc64to32 (ZeroExt8to64 x)) => (ZeroExt8to32 x)
+(Trunc64to32 (ZeroExt16to64 x)) => (ZeroExt16to32 x)
+(Trunc64to32 (ZeroExt32to64 x)) => x
+(Trunc16to8 (SignExt8to16 x)) => x
+(Trunc32to8 (SignExt8to32 x)) => x
+(Trunc32to16 (SignExt8to32 x)) => (SignExt8to16 x)
+(Trunc32to16 (SignExt16to32 x)) => x
+(Trunc64to8 (SignExt8to64 x)) => x
+(Trunc64to16 (SignExt8to64 x)) => (SignExt8to16 x)
+(Trunc64to16 (SignExt16to64 x)) => x
+(Trunc64to32 (SignExt8to64 x)) => (SignExt8to32 x)
+(Trunc64to32 (SignExt16to64 x)) => (SignExt16to32 x)
+(Trunc64to32 (SignExt32to64 x)) => x
+
+(ZeroExt8to16 (Const8 [c])) => (Const16 [int16( uint8(c))])
+(ZeroExt8to32 (Const8 [c])) => (Const32 [int32( uint8(c))])
+(ZeroExt8to64 (Const8 [c])) => (Const64 [int64( uint8(c))])
+(ZeroExt16to32 (Const16 [c])) => (Const32 [int32(uint16(c))])
+(ZeroExt16to64 (Const16 [c])) => (Const64 [int64(uint16(c))])
+(ZeroExt32to64 (Const32 [c])) => (Const64 [int64(uint32(c))])
+(SignExt8to16 (Const8 [c])) => (Const16 [int16(c)])
+(SignExt8to32 (Const8 [c])) => (Const32 [int32(c)])
+(SignExt8to64 (Const8 [c])) => (Const64 [int64(c)])
+(SignExt16to32 (Const16 [c])) => (Const32 [int32(c)])
+(SignExt16to64 (Const16 [c])) => (Const64 [int64(c)])
+(SignExt32to64 (Const32 [c])) => (Const64 [int64(c)])
+
+(Neg8 (Const8 [c])) => (Const8 [-c])
+(Neg16 (Const16 [c])) => (Const16 [-c])
+(Neg32 (Const32 [c])) => (Const32 [-c])
+(Neg64 (Const64 [c])) => (Const64 [-c])
+(Neg32F (Const32F [c])) && c != 0 => (Const32F [-c])
+(Neg64F (Const64F [c])) && c != 0 => (Const64F [-c])
+
+(Add8 (Const8 [c]) (Const8 [d])) => (Const8 [c+d])
+(Add16 (Const16 [c]) (Const16 [d])) => (Const16 [c+d])
+(Add32 (Const32 [c]) (Const32 [d])) => (Const32 [c+d])
+(Add64 (Const64 [c]) (Const64 [d])) => (Const64 [c+d])
+(Add32F (Const32F [c]) (Const32F [d])) && c+d == c+d => (Const32F [c+d])
+(Add64F (Const64F [c]) (Const64F [d])) && c+d == c+d => (Const64F [c+d])
+(AddPtr <t> x (Const64 [c])) => (OffPtr <t> x [c])
+(AddPtr <t> x (Const32 [c])) => (OffPtr <t> x [int64(c)])
+
+(Sub8 (Const8 [c]) (Const8 [d])) => (Const8 [c-d])
+(Sub16 (Const16 [c]) (Const16 [d])) => (Const16 [c-d])
+(Sub32 (Const32 [c]) (Const32 [d])) => (Const32 [c-d])
+(Sub64 (Const64 [c]) (Const64 [d])) => (Const64 [c-d])
+(Sub32F (Const32F [c]) (Const32F [d])) && c-d == c-d => (Const32F [c-d])
+(Sub64F (Const64F [c]) (Const64F [d])) && c-d == c-d => (Const64F [c-d])
+
+(Mul8 (Const8 [c]) (Const8 [d])) => (Const8 [c*d])
+(Mul16 (Const16 [c]) (Const16 [d])) => (Const16 [c*d])
+(Mul32 (Const32 [c]) (Const32 [d])) => (Const32 [c*d])
+(Mul64 (Const64 [c]) (Const64 [d])) => (Const64 [c*d])
+(Mul32F (Const32F [c]) (Const32F [d])) && c*d == c*d => (Const32F [c*d])
+(Mul64F (Const64F [c]) (Const64F [d])) && c*d == c*d => (Const64F [c*d])
+
+(And8 (Const8 [c]) (Const8 [d])) => (Const8 [c&d])
+(And16 (Const16 [c]) (Const16 [d])) => (Const16 [c&d])
+(And32 (Const32 [c]) (Const32 [d])) => (Const32 [c&d])
+(And64 (Const64 [c]) (Const64 [d])) => (Const64 [c&d])
+
+(Or8 (Const8 [c]) (Const8 [d])) => (Const8 [c|d])
+(Or16 (Const16 [c]) (Const16 [d])) => (Const16 [c|d])
+(Or32 (Const32 [c]) (Const32 [d])) => (Const32 [c|d])
+(Or64 (Const64 [c]) (Const64 [d])) => (Const64 [c|d])
+
+(Xor8 (Const8 [c]) (Const8 [d])) => (Const8 [c^d])
+(Xor16 (Const16 [c]) (Const16 [d])) => (Const16 [c^d])
+(Xor32 (Const32 [c]) (Const32 [d])) => (Const32 [c^d])
+(Xor64 (Const64 [c]) (Const64 [d])) => (Const64 [c^d])
+
+(Ctz64 (Const64 [c])) && config.PtrSize == 4 => (Const32 [int32(ntz64(c))])
+(Ctz32 (Const32 [c])) && config.PtrSize == 4 => (Const32 [int32(ntz32(c))])
+(Ctz16 (Const16 [c])) && config.PtrSize == 4 => (Const32 [int32(ntz16(c))])
+(Ctz8 (Const8 [c])) && config.PtrSize == 4 => (Const32 [int32(ntz8(c))])
+
+(Ctz64 (Const64 [c])) && config.PtrSize == 8 => (Const64 [int64(ntz64(c))])
+(Ctz32 (Const32 [c])) && config.PtrSize == 8 => (Const64 [int64(ntz32(c))])
+(Ctz16 (Const16 [c])) && config.PtrSize == 8 => (Const64 [int64(ntz16(c))])
+(Ctz8 (Const8 [c])) && config.PtrSize == 8 => (Const64 [int64(ntz8(c))])
+
+(Div8 (Const8 [c]) (Const8 [d])) && d != 0 => (Const8 [c/d])
+(Div16 (Const16 [c]) (Const16 [d])) && d != 0 => (Const16 [c/d])
+(Div32 (Const32 [c]) (Const32 [d])) && d != 0 => (Const32 [c/d])
+(Div64 (Const64 [c]) (Const64 [d])) && d != 0 => (Const64 [c/d])
+(Div8u (Const8 [c]) (Const8 [d])) && d != 0 => (Const8 [int8(uint8(c)/uint8(d))])
+(Div16u (Const16 [c]) (Const16 [d])) && d != 0 => (Const16 [int16(uint16(c)/uint16(d))])
+(Div32u (Const32 [c]) (Const32 [d])) && d != 0 => (Const32 [int32(uint32(c)/uint32(d))])
+(Div64u (Const64 [c]) (Const64 [d])) && d != 0 => (Const64 [int64(uint64(c)/uint64(d))])
+(Div32F (Const32F [c]) (Const32F [d])) && c/d == c/d => (Const32F [c/d])
+(Div64F (Const64F [c]) (Const64F [d])) && c/d == c/d => (Const64F [c/d])
+(Select0 (Div128u (Const64 [0]) lo y)) => (Div64u lo y)
+(Select1 (Div128u (Const64 [0]) lo y)) => (Mod64u lo y)
+
+(Not (ConstBool [c])) => (ConstBool [!c])
+
+// Convert x * 1 to x.
+(Mul(8|16|32|64) (Const(8|16|32|64) [1]) x) => x
+
+// Convert x * -1 to -x.
+(Mul(8|16|32|64) (Const(8|16|32|64) [-1]) x) => (Neg(8|16|32|64) x)
+
+// Convert multiplication by a power of two to a shift.
+(Mul8 <t> n (Const8 [c])) && isPowerOfTwo8(c) => (Lsh8x64 <t> n (Const64 <typ.UInt64> [log8(c)]))
+(Mul16 <t> n (Const16 [c])) && isPowerOfTwo16(c) => (Lsh16x64 <t> n (Const64 <typ.UInt64> [log16(c)]))
+(Mul32 <t> n (Const32 [c])) && isPowerOfTwo32(c) => (Lsh32x64 <t> n (Const64 <typ.UInt64> [log32(c)]))
+(Mul64 <t> n (Const64 [c])) && isPowerOfTwo64(c) => (Lsh64x64 <t> n (Const64 <typ.UInt64> [log64(c)]))
+(Mul8 <t> n (Const8 [c])) && t.IsSigned() && isPowerOfTwo8(-c) => (Neg8 (Lsh8x64 <t> n (Const64 <typ.UInt64> [log8(-c)])))
+(Mul16 <t> n (Const16 [c])) && t.IsSigned() && isPowerOfTwo16(-c) => (Neg16 (Lsh16x64 <t> n (Const64 <typ.UInt64> [log16(-c)])))
+(Mul32 <t> n (Const32 [c])) && t.IsSigned() && isPowerOfTwo32(-c) => (Neg32 (Lsh32x64 <t> n (Const64 <typ.UInt64> [log32(-c)])))
+(Mul64 <t> n (Const64 [c])) && t.IsSigned() && isPowerOfTwo64(-c) => (Neg64 (Lsh64x64 <t> n (Const64 <typ.UInt64> [log64(-c)])))
+
+(Mod8 (Const8 [c]) (Const8 [d])) && d != 0 => (Const8 [c % d])
+(Mod16 (Const16 [c]) (Const16 [d])) && d != 0 => (Const16 [c % d])
+(Mod32 (Const32 [c]) (Const32 [d])) && d != 0 => (Const32 [c % d])
+(Mod64 (Const64 [c]) (Const64 [d])) && d != 0 => (Const64 [c % d])
+
+(Mod8u (Const8 [c]) (Const8 [d])) && d != 0 => (Const8 [int8(uint8(c) % uint8(d))])
+(Mod16u (Const16 [c]) (Const16 [d])) && d != 0 => (Const16 [int16(uint16(c) % uint16(d))])
+(Mod32u (Const32 [c]) (Const32 [d])) && d != 0 => (Const32 [int32(uint32(c) % uint32(d))])
+(Mod64u (Const64 [c]) (Const64 [d])) && d != 0 => (Const64 [int64(uint64(c) % uint64(d))])
+
+(Lsh64x64 (Const64 [c]) (Const64 [d])) => (Const64 [c << uint64(d)])
+(Rsh64x64 (Const64 [c]) (Const64 [d])) => (Const64 [c >> uint64(d)])
+(Rsh64Ux64 (Const64 [c]) (Const64 [d])) => (Const64 [int64(uint64(c) >> uint64(d))])
+(Lsh32x64 (Const32 [c]) (Const64 [d])) => (Const32 [c << uint64(d)])
+(Rsh32x64 (Const32 [c]) (Const64 [d])) => (Const32 [c >> uint64(d)])
+(Rsh32Ux64 (Const32 [c]) (Const64 [d])) => (Const32 [int32(uint32(c) >> uint64(d))])
+(Lsh16x64 (Const16 [c]) (Const64 [d])) => (Const16 [c << uint64(d)])
+(Rsh16x64 (Const16 [c]) (Const64 [d])) => (Const16 [c >> uint64(d)])
+(Rsh16Ux64 (Const16 [c]) (Const64 [d])) => (Const16 [int16(uint16(c) >> uint64(d))])
+(Lsh8x64 (Const8 [c]) (Const64 [d])) => (Const8 [c << uint64(d)])
+(Rsh8x64 (Const8 [c]) (Const64 [d])) => (Const8 [c >> uint64(d)])
+(Rsh8Ux64 (Const8 [c]) (Const64 [d])) => (Const8 [int8(uint8(c) >> uint64(d))])
+
+// Fold IsInBounds when the range of the index cannot exceed the limit.
+(IsInBounds (ZeroExt8to32 _) (Const32 [c])) && (1 << 8) <= c => (ConstBool [true])
+(IsInBounds (ZeroExt8to64 _) (Const64 [c])) && (1 << 8) <= c => (ConstBool [true])
+(IsInBounds (ZeroExt16to32 _) (Const32 [c])) && (1 << 16) <= c => (ConstBool [true])
+(IsInBounds (ZeroExt16to64 _) (Const64 [c])) && (1 << 16) <= c => (ConstBool [true])
+(IsInBounds x x) => (ConstBool [false])
+(IsInBounds (And8 (Const8 [c]) _) (Const8 [d])) && 0 <= c && c < d => (ConstBool [true])
+(IsInBounds (ZeroExt8to16 (And8 (Const8 [c]) _)) (Const16 [d])) && 0 <= c && int16(c) < d => (ConstBool [true])
+(IsInBounds (ZeroExt8to32 (And8 (Const8 [c]) _)) (Const32 [d])) && 0 <= c && int32(c) < d => (ConstBool [true])
+(IsInBounds (ZeroExt8to64 (And8 (Const8 [c]) _)) (Const64 [d])) && 0 <= c && int64(c) < d => (ConstBool [true])
+(IsInBounds (And16 (Const16 [c]) _) (Const16 [d])) && 0 <= c && c < d => (ConstBool [true])
+(IsInBounds (ZeroExt16to32 (And16 (Const16 [c]) _)) (Const32 [d])) && 0 <= c && int32(c) < d => (ConstBool [true])
+(IsInBounds (ZeroExt16to64 (And16 (Const16 [c]) _)) (Const64 [d])) && 0 <= c && int64(c) < d => (ConstBool [true])
+(IsInBounds (And32 (Const32 [c]) _) (Const32 [d])) && 0 <= c && c < d => (ConstBool [true])
+(IsInBounds (ZeroExt32to64 (And32 (Const32 [c]) _)) (Const64 [d])) && 0 <= c && int64(c) < d => (ConstBool [true])
+(IsInBounds (And64 (Const64 [c]) _) (Const64 [d])) && 0 <= c && c < d => (ConstBool [true])
+(IsInBounds (Const32 [c]) (Const32 [d])) => (ConstBool [0 <= c && c < d])
+(IsInBounds (Const64 [c]) (Const64 [d])) => (ConstBool [0 <= c && c < d])
+// (Mod64u x y) is always between 0 (inclusive) and y (exclusive).
+(IsInBounds (Mod32u _ y) y) => (ConstBool [true])
+(IsInBounds (Mod64u _ y) y) => (ConstBool [true])
+// Right shifting an unsigned number limits its value.
+(IsInBounds (ZeroExt8to64 (Rsh8Ux64 _ (Const64 [c]))) (Const64 [d])) && 0 < c && c < 8 && 1<<uint( 8-c)-1 < d => (ConstBool [true])
+(IsInBounds (ZeroExt8to32 (Rsh8Ux64 _ (Const64 [c]))) (Const32 [d])) && 0 < c && c < 8 && 1<<uint( 8-c)-1 < d => (ConstBool [true])
+(IsInBounds (ZeroExt8to16 (Rsh8Ux64 _ (Const64 [c]))) (Const16 [d])) && 0 < c && c < 8 && 1<<uint( 8-c)-1 < d => (ConstBool [true])
+(IsInBounds (Rsh8Ux64 _ (Const64 [c])) (Const64 [d])) && 0 < c && c < 8 && 1<<uint( 8-c)-1 < d => (ConstBool [true])
+(IsInBounds (ZeroExt16to64 (Rsh16Ux64 _ (Const64 [c]))) (Const64 [d])) && 0 < c && c < 16 && 1<<uint(16-c)-1 < d => (ConstBool [true])
+(IsInBounds (ZeroExt16to32 (Rsh16Ux64 _ (Const64 [c]))) (Const64 [d])) && 0 < c && c < 16 && 1<<uint(16-c)-1 < d => (ConstBool [true])
+(IsInBounds (Rsh16Ux64 _ (Const64 [c])) (Const64 [d])) && 0 < c && c < 16 && 1<<uint(16-c)-1 < d => (ConstBool [true])
+(IsInBounds (ZeroExt32to64 (Rsh32Ux64 _ (Const64 [c]))) (Const64 [d])) && 0 < c && c < 32 && 1<<uint(32-c)-1 < d => (ConstBool [true])
+(IsInBounds (Rsh32Ux64 _ (Const64 [c])) (Const64 [d])) && 0 < c && c < 32 && 1<<uint(32-c)-1 < d => (ConstBool [true])
+(IsInBounds (Rsh64Ux64 _ (Const64 [c])) (Const64 [d])) && 0 < c && c < 64 && 1<<uint(64-c)-1 < d => (ConstBool [true])
+
+(IsSliceInBounds x x) => (ConstBool [true])
+(IsSliceInBounds (And32 (Const32 [c]) _) (Const32 [d])) && 0 <= c && c <= d => (ConstBool [true])
+(IsSliceInBounds (And64 (Const64 [c]) _) (Const64 [d])) && 0 <= c && c <= d => (ConstBool [true])
+(IsSliceInBounds (Const32 [0]) _) => (ConstBool [true])
+(IsSliceInBounds (Const64 [0]) _) => (ConstBool [true])
+(IsSliceInBounds (Const32 [c]) (Const32 [d])) => (ConstBool [0 <= c && c <= d])
+(IsSliceInBounds (Const64 [c]) (Const64 [d])) => (ConstBool [0 <= c && c <= d])
+(IsSliceInBounds (SliceLen x) (SliceCap x)) => (ConstBool [true])
+
+(Eq(64|32|16|8) x x) => (ConstBool [true])
+(EqB (ConstBool [c]) (ConstBool [d])) => (ConstBool [c == d])
+(EqB (ConstBool [false]) x) => (Not x)
+(EqB (ConstBool [true]) x) => x
+
+(Neq(64|32|16|8) x x) => (ConstBool [false])
+(NeqB (ConstBool [c]) (ConstBool [d])) => (ConstBool [c != d])
+(NeqB (ConstBool [false]) x) => x
+(NeqB (ConstBool [true]) x) => (Not x)
+(NeqB (Not x) (Not y)) => (NeqB x y)
+
+(Eq64 (Const64 <t> [c]) (Add64 (Const64 <t> [d]) x)) => (Eq64 (Const64 <t> [c-d]) x)
+(Eq32 (Const32 <t> [c]) (Add32 (Const32 <t> [d]) x)) => (Eq32 (Const32 <t> [c-d]) x)
+(Eq16 (Const16 <t> [c]) (Add16 (Const16 <t> [d]) x)) => (Eq16 (Const16 <t> [c-d]) x)
+(Eq8 (Const8 <t> [c]) (Add8 (Const8 <t> [d]) x)) => (Eq8 (Const8 <t> [c-d]) x)
+
+(Neq64 (Const64 <t> [c]) (Add64 (Const64 <t> [d]) x)) => (Neq64 (Const64 <t> [c-d]) x)
+(Neq32 (Const32 <t> [c]) (Add32 (Const32 <t> [d]) x)) => (Neq32 (Const32 <t> [c-d]) x)
+(Neq16 (Const16 <t> [c]) (Add16 (Const16 <t> [d]) x)) => (Neq16 (Const16 <t> [c-d]) x)
+(Neq8 (Const8 <t> [c]) (Add8 (Const8 <t> [d]) x)) => (Neq8 (Const8 <t> [c-d]) x)
+
+// signed integer range: ( c <= x && x (<|<=) d ) -> ( unsigned(x-c) (<|<=) unsigned(d-c) )
+(AndB (Leq64 (Const64 [c]) x) ((Less|Leq)64 x (Const64 [d]))) && d >= c => ((Less|Leq)64U (Sub64 <x.Type> x (Const64 <x.Type> [c])) (Const64 <x.Type> [d-c]))
+(AndB (Leq32 (Const32 [c]) x) ((Less|Leq)32 x (Const32 [d]))) && d >= c => ((Less|Leq)32U (Sub32 <x.Type> x (Const32 <x.Type> [c])) (Const32 <x.Type> [d-c]))
+(AndB (Leq16 (Const16 [c]) x) ((Less|Leq)16 x (Const16 [d]))) && d >= c => ((Less|Leq)16U (Sub16 <x.Type> x (Const16 <x.Type> [c])) (Const16 <x.Type> [d-c]))
+(AndB (Leq8 (Const8 [c]) x) ((Less|Leq)8 x (Const8 [d]))) && d >= c => ((Less|Leq)8U (Sub8 <x.Type> x (Const8 <x.Type> [c])) (Const8 <x.Type> [d-c]))
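+// For example, with c=10 and d=20 the pair of tests 10 <= x && x < 20 becomes
+// the single unsigned comparison unsigned(x-10) < 10: any x below 10 wraps to a
+// huge unsigned value and fails, just as it failed the original signed pair.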
+
+// signed integer range: ( c < x && x (<|<=) d ) -> ( unsigned(x-(c+1)) (<|<=) unsigned(d-(c+1)) )
+(AndB (Less64 (Const64 [c]) x) ((Less|Leq)64 x (Const64 [d]))) && d >= c+1 && c+1 > c => ((Less|Leq)64U (Sub64 <x.Type> x (Const64 <x.Type> [c+1])) (Const64 <x.Type> [d-c-1]))
+(AndB (Less32 (Const32 [c]) x) ((Less|Leq)32 x (Const32 [d]))) && d >= c+1 && c+1 > c => ((Less|Leq)32U (Sub32 <x.Type> x (Const32 <x.Type> [c+1])) (Const32 <x.Type> [d-c-1]))
+(AndB (Less16 (Const16 [c]) x) ((Less|Leq)16 x (Const16 [d]))) && d >= c+1 && c+1 > c => ((Less|Leq)16U (Sub16 <x.Type> x (Const16 <x.Type> [c+1])) (Const16 <x.Type> [d-c-1]))
+(AndB (Less8 (Const8 [c]) x) ((Less|Leq)8 x (Const8 [d]))) && d >= c+1 && c+1 > c => ((Less|Leq)8U (Sub8 <x.Type> x (Const8 <x.Type> [c+1])) (Const8 <x.Type> [d-c-1]))
+
+// unsigned integer range: ( c <= x && x (<|<=) d ) -> ( x-c (<|<=) d-c )
+(AndB (Leq64U (Const64 [c]) x) ((Less|Leq)64U x (Const64 [d]))) && uint64(d) >= uint64(c) => ((Less|Leq)64U (Sub64 <x.Type> x (Const64 <x.Type> [c])) (Const64 <x.Type> [d-c]))
+(AndB (Leq32U (Const32 [c]) x) ((Less|Leq)32U x (Const32 [d]))) && uint32(d) >= uint32(c) => ((Less|Leq)32U (Sub32 <x.Type> x (Const32 <x.Type> [c])) (Const32 <x.Type> [d-c]))
+(AndB (Leq16U (Const16 [c]) x) ((Less|Leq)16U x (Const16 [d]))) && uint16(d) >= uint16(c) => ((Less|Leq)16U (Sub16 <x.Type> x (Const16 <x.Type> [c])) (Const16 <x.Type> [d-c]))
+(AndB (Leq8U (Const8 [c]) x) ((Less|Leq)8U x (Const8 [d]))) && uint8(d) >= uint8(c) => ((Less|Leq)8U (Sub8 <x.Type> x (Const8 <x.Type> [c])) (Const8 <x.Type> [d-c]))
+
+// unsigned integer range: ( c < x && x (<|<=) d ) -> ( x-(c+1) (<|<=) d-(c+1) )
+(AndB (Less64U (Const64 [c]) x) ((Less|Leq)64U x (Const64 [d]))) && uint64(d) >= uint64(c+1) && uint64(c+1) > uint64(c) => ((Less|Leq)64U (Sub64 <x.Type> x (Const64 <x.Type> [c+1])) (Const64 <x.Type> [d-c-1]))
+(AndB (Less32U (Const32 [c]) x) ((Less|Leq)32U x (Const32 [d]))) && uint32(d) >= uint32(c+1) && uint32(c+1) > uint32(c) => ((Less|Leq)32U (Sub32 <x.Type> x (Const32 <x.Type> [c+1])) (Const32 <x.Type> [d-c-1]))
+(AndB (Less16U (Const16 [c]) x) ((Less|Leq)16U x (Const16 [d]))) && uint16(d) >= uint16(c+1) && uint16(c+1) > uint16(c) => ((Less|Leq)16U (Sub16 <x.Type> x (Const16 <x.Type> [c+1])) (Const16 <x.Type> [d-c-1]))
+(AndB (Less8U (Const8 [c]) x) ((Less|Leq)8U x (Const8 [d]))) && uint8(d) >= uint8(c+1) && uint8(c+1) > uint8(c) => ((Less|Leq)8U (Sub8 <x.Type> x (Const8 <x.Type> [c+1])) (Const8 <x.Type> [d-c-1]))
+
+// signed integer range: ( c (<|<=) x || x < d ) -> ( unsigned(c-d) (<|<=) unsigned(x-d) )
+(OrB ((Less|Leq)64 (Const64 [c]) x) (Less64 x (Const64 [d]))) && c >= d => ((Less|Leq)64U (Const64 <x.Type> [c-d]) (Sub64 <x.Type> x (Const64 <x.Type> [d])))
+(OrB ((Less|Leq)32 (Const32 [c]) x) (Less32 x (Const32 [d]))) && c >= d => ((Less|Leq)32U (Const32 <x.Type> [c-d]) (Sub32 <x.Type> x (Const32 <x.Type> [d])))
+(OrB ((Less|Leq)16 (Const16 [c]) x) (Less16 x (Const16 [d]))) && c >= d => ((Less|Leq)16U (Const16 <x.Type> [c-d]) (Sub16 <x.Type> x (Const16 <x.Type> [d])))
+(OrB ((Less|Leq)8 (Const8 [c]) x) (Less8 x (Const8 [d]))) && c >= d => ((Less|Leq)8U (Const8 <x.Type> [c-d]) (Sub8 <x.Type> x (Const8 <x.Type> [d])))
+
+// signed integer range: ( c (<|<=) x || x <= d ) -> ( unsigned(c-(d+1)) (<|<=) unsigned(x-(d+1)) )
+(OrB ((Less|Leq)64 (Const64 [c]) x) (Leq64 x (Const64 [d]))) && c >= d+1 && d+1 > d => ((Less|Leq)64U (Const64 <x.Type> [c-d-1]) (Sub64 <x.Type> x (Const64 <x.Type> [d+1])))
+(OrB ((Less|Leq)32 (Const32 [c]) x) (Leq32 x (Const32 [d]))) && c >= d+1 && d+1 > d => ((Less|Leq)32U (Const32 <x.Type> [c-d-1]) (Sub32 <x.Type> x (Const32 <x.Type> [d+1])))
+(OrB ((Less|Leq)16 (Const16 [c]) x) (Leq16 x (Const16 [d]))) && c >= d+1 && d+1 > d => ((Less|Leq)16U (Const16 <x.Type> [c-d-1]) (Sub16 <x.Type> x (Const16 <x.Type> [d+1])))
+(OrB ((Less|Leq)8 (Const8 [c]) x) (Leq8 x (Const8 [d]))) && c >= d+1 && d+1 > d => ((Less|Leq)8U (Const8 <x.Type> [c-d-1]) (Sub8 <x.Type> x (Const8 <x.Type> [d+1])))
+
+// unsigned integer range: ( c (<|<=) x || x < d ) -> ( c-d (<|<=) x-d )
+(OrB ((Less|Leq)64U (Const64 [c]) x) (Less64U x (Const64 [d]))) && uint64(c) >= uint64(d) => ((Less|Leq)64U (Const64 <x.Type> [c-d]) (Sub64 <x.Type> x (Const64 <x.Type> [d])))
+(OrB ((Less|Leq)32U (Const32 [c]) x) (Less32U x (Const32 [d]))) && uint32(c) >= uint32(d) => ((Less|Leq)32U (Const32 <x.Type> [c-d]) (Sub32 <x.Type> x (Const32 <x.Type> [d])))
+(OrB ((Less|Leq)16U (Const16 [c]) x) (Less16U x (Const16 [d]))) && uint16(c) >= uint16(d) => ((Less|Leq)16U (Const16 <x.Type> [c-d]) (Sub16 <x.Type> x (Const16 <x.Type> [d])))
+(OrB ((Less|Leq)8U (Const8 [c]) x) (Less8U x (Const8 [d]))) && uint8(c) >= uint8(d) => ((Less|Leq)8U (Const8 <x.Type> [c-d]) (Sub8 <x.Type> x (Const8 <x.Type> [d])))
+
+// unsigned integer range: ( c (<|<=) x || x <= d ) -> ( c-(d+1) (<|<=) x-(d+1) )
+(OrB ((Less|Leq)64U (Const64 [c]) x) (Leq64U x (Const64 [d]))) && uint64(c) >= uint64(d+1) && uint64(d+1) > uint64(d) => ((Less|Leq)64U (Const64 <x.Type> [c-d-1]) (Sub64 <x.Type> x (Const64 <x.Type> [d+1])))
+(OrB ((Less|Leq)32U (Const32 [c]) x) (Leq32U x (Const32 [d]))) && uint32(c) >= uint32(d+1) && uint32(d+1) > uint32(d) => ((Less|Leq)32U (Const32 <x.Type> [c-d-1]) (Sub32 <x.Type> x (Const32 <x.Type> [d+1])))
+(OrB ((Less|Leq)16U (Const16 [c]) x) (Leq16U x (Const16 [d]))) && uint16(c) >= uint16(d+1) && uint16(d+1) > uint16(d) => ((Less|Leq)16U (Const16 <x.Type> [c-d-1]) (Sub16 <x.Type> x (Const16 <x.Type> [d+1])))
+(OrB ((Less|Leq)8U (Const8 [c]) x) (Leq8U x (Const8 [d]))) && uint8(c) >= uint8(d+1) && uint8(d+1) > uint8(d) => ((Less|Leq)8U (Const8 <x.Type> [c-d-1]) (Sub8 <x.Type> x (Const8 <x.Type> [d+1])))
+
+// Canonicalize x-const to x+(-const)
+(Sub64 x (Const64 <t> [c])) && x.Op != OpConst64 => (Add64 (Const64 <t> [-c]) x)
+(Sub32 x (Const32 <t> [c])) && x.Op != OpConst32 => (Add32 (Const32 <t> [-c]) x)
+(Sub16 x (Const16 <t> [c])) && x.Op != OpConst16 => (Add16 (Const16 <t> [-c]) x)
+(Sub8 x (Const8 <t> [c])) && x.Op != OpConst8 => (Add8 (Const8 <t> [-c]) x)
+
+// fold negation into comparison operators
+(Not (Eq(64|32|16|8|B|Ptr|64F|32F) x y)) => (Neq(64|32|16|8|B|Ptr|64F|32F) x y)
+(Not (Neq(64|32|16|8|B|Ptr|64F|32F) x y)) => (Eq(64|32|16|8|B|Ptr|64F|32F) x y)
+
+(Not (Less(64|32|16|8) x y)) => (Leq(64|32|16|8) y x)
+(Not (Less(64|32|16|8)U x y)) => (Leq(64|32|16|8)U y x)
+(Not (Leq(64|32|16|8) x y)) => (Less(64|32|16|8) y x)
+(Not (Leq(64|32|16|8)U x y)) => (Less(64|32|16|8)U y x)
+
+// Distribute multiplication c * (d+x) -> c*d + c*x. Useful for:
+// a[i].b = ...; a[i+1].b = ...
+(Mul64 (Const64 <t> [c]) (Add64 <t> (Const64 <t> [d]) x)) =>
+ (Add64 (Const64 <t> [c*d]) (Mul64 <t> (Const64 <t> [c]) x))
+(Mul32 (Const32 <t> [c]) (Add32 <t> (Const32 <t> [d]) x)) =>
+ (Add32 (Const32 <t> [c*d]) (Mul32 <t> (Const32 <t> [c]) x))
+
+// Rewrite x*y ± x*z to x*(y±z)
+(Add(64|32|16|8) <t> (Mul(64|32|16|8) x y) (Mul(64|32|16|8) x z))
+ => (Mul(64|32|16|8) x (Add(64|32|16|8) <t> y z))
+(Sub(64|32|16|8) <t> (Mul(64|32|16|8) x y) (Mul(64|32|16|8) x z))
+ => (Mul(64|32|16|8) x (Sub(64|32|16|8) <t> y z))
+
+// rewrite shifts of 8/16/32 bit consts into 64 bit consts to reduce
+// the number of the other rewrite rules for const shifts
+(Lsh64x32 <t> x (Const32 [c])) => (Lsh64x64 x (Const64 <t> [int64(uint32(c))]))
+(Lsh64x16 <t> x (Const16 [c])) => (Lsh64x64 x (Const64 <t> [int64(uint16(c))]))
+(Lsh64x8 <t> x (Const8 [c])) => (Lsh64x64 x (Const64 <t> [int64(uint8(c))]))
+(Rsh64x32 <t> x (Const32 [c])) => (Rsh64x64 x (Const64 <t> [int64(uint32(c))]))
+(Rsh64x16 <t> x (Const16 [c])) => (Rsh64x64 x (Const64 <t> [int64(uint16(c))]))
+(Rsh64x8 <t> x (Const8 [c])) => (Rsh64x64 x (Const64 <t> [int64(uint8(c))]))
+(Rsh64Ux32 <t> x (Const32 [c])) => (Rsh64Ux64 x (Const64 <t> [int64(uint32(c))]))
+(Rsh64Ux16 <t> x (Const16 [c])) => (Rsh64Ux64 x (Const64 <t> [int64(uint16(c))]))
+(Rsh64Ux8 <t> x (Const8 [c])) => (Rsh64Ux64 x (Const64 <t> [int64(uint8(c))]))
+
+(Lsh32x32 <t> x (Const32 [c])) => (Lsh32x64 x (Const64 <t> [int64(uint32(c))]))
+(Lsh32x16 <t> x (Const16 [c])) => (Lsh32x64 x (Const64 <t> [int64(uint16(c))]))
+(Lsh32x8 <t> x (Const8 [c])) => (Lsh32x64 x (Const64 <t> [int64(uint8(c))]))
+(Rsh32x32 <t> x (Const32 [c])) => (Rsh32x64 x (Const64 <t> [int64(uint32(c))]))
+(Rsh32x16 <t> x (Const16 [c])) => (Rsh32x64 x (Const64 <t> [int64(uint16(c))]))
+(Rsh32x8 <t> x (Const8 [c])) => (Rsh32x64 x (Const64 <t> [int64(uint8(c))]))
+(Rsh32Ux32 <t> x (Const32 [c])) => (Rsh32Ux64 x (Const64 <t> [int64(uint32(c))]))
+(Rsh32Ux16 <t> x (Const16 [c])) => (Rsh32Ux64 x (Const64 <t> [int64(uint16(c))]))
+(Rsh32Ux8 <t> x (Const8 [c])) => (Rsh32Ux64 x (Const64 <t> [int64(uint8(c))]))
+
+(Lsh16x32 <t> x (Const32 [c])) => (Lsh16x64 x (Const64 <t> [int64(uint32(c))]))
+(Lsh16x16 <t> x (Const16 [c])) => (Lsh16x64 x (Const64 <t> [int64(uint16(c))]))
+(Lsh16x8 <t> x (Const8 [c])) => (Lsh16x64 x (Const64 <t> [int64(uint8(c))]))
+(Rsh16x32 <t> x (Const32 [c])) => (Rsh16x64 x (Const64 <t> [int64(uint32(c))]))
+(Rsh16x16 <t> x (Const16 [c])) => (Rsh16x64 x (Const64 <t> [int64(uint16(c))]))
+(Rsh16x8 <t> x (Const8 [c])) => (Rsh16x64 x (Const64 <t> [int64(uint8(c))]))
+(Rsh16Ux32 <t> x (Const32 [c])) => (Rsh16Ux64 x (Const64 <t> [int64(uint32(c))]))
+(Rsh16Ux16 <t> x (Const16 [c])) => (Rsh16Ux64 x (Const64 <t> [int64(uint16(c))]))
+(Rsh16Ux8 <t> x (Const8 [c])) => (Rsh16Ux64 x (Const64 <t> [int64(uint8(c))]))
+
+(Lsh8x32 <t> x (Const32 [c])) => (Lsh8x64 x (Const64 <t> [int64(uint32(c))]))
+(Lsh8x16 <t> x (Const16 [c])) => (Lsh8x64 x (Const64 <t> [int64(uint16(c))]))
+(Lsh8x8 <t> x (Const8 [c])) => (Lsh8x64 x (Const64 <t> [int64(uint8(c))]))
+(Rsh8x32 <t> x (Const32 [c])) => (Rsh8x64 x (Const64 <t> [int64(uint32(c))]))
+(Rsh8x16 <t> x (Const16 [c])) => (Rsh8x64 x (Const64 <t> [int64(uint16(c))]))
+(Rsh8x8 <t> x (Const8 [c])) => (Rsh8x64 x (Const64 <t> [int64(uint8(c))]))
+(Rsh8Ux32 <t> x (Const32 [c])) => (Rsh8Ux64 x (Const64 <t> [int64(uint32(c))]))
+(Rsh8Ux16 <t> x (Const16 [c])) => (Rsh8Ux64 x (Const64 <t> [int64(uint16(c))]))
+(Rsh8Ux8 <t> x (Const8 [c])) => (Rsh8Ux64 x (Const64 <t> [int64(uint8(c))]))
+
+// shifts by zero
+(Lsh(64|32|16|8)x64 x (Const64 [0])) => x
+(Rsh(64|32|16|8)x64 x (Const64 [0])) => x
+(Rsh(64|32|16|8)Ux64 x (Const64 [0])) => x
+
+// rotates by multiples of register width
+(RotateLeft64 x (Const64 [c])) && c%64 == 0 => x
+(RotateLeft32 x (Const32 [c])) && c%32 == 0 => x
+(RotateLeft16 x (Const16 [c])) && c%16 == 0 => x
+(RotateLeft8 x (Const8 [c])) && c%8 == 0 => x
+
+// zero shifted
+(Lsh64x(64|32|16|8) (Const64 [0]) _) => (Const64 [0])
+(Rsh64x(64|32|16|8) (Const64 [0]) _) => (Const64 [0])
+(Rsh64Ux(64|32|16|8) (Const64 [0]) _) => (Const64 [0])
+(Lsh32x(64|32|16|8) (Const32 [0]) _) => (Const32 [0])
+(Rsh32x(64|32|16|8) (Const32 [0]) _) => (Const32 [0])
+(Rsh32Ux(64|32|16|8) (Const32 [0]) _) => (Const32 [0])
+(Lsh16x(64|32|16|8) (Const16 [0]) _) => (Const16 [0])
+(Rsh16x(64|32|16|8) (Const16 [0]) _) => (Const16 [0])
+(Rsh16Ux(64|32|16|8) (Const16 [0]) _) => (Const16 [0])
+(Lsh8x(64|32|16|8) (Const8 [0]) _) => (Const8 [0])
+(Rsh8x(64|32|16|8) (Const8 [0]) _) => (Const8 [0])
+(Rsh8Ux(64|32|16|8) (Const8 [0]) _) => (Const8 [0])
+
+// large left shifts of all values, and right shifts of unsigned values
+((Lsh64|Rsh64U)x64 _ (Const64 [c])) && uint64(c) >= 64 => (Const64 [0])
+((Lsh32|Rsh32U)x64 _ (Const64 [c])) && uint64(c) >= 32 => (Const32 [0])
+((Lsh16|Rsh16U)x64 _ (Const64 [c])) && uint64(c) >= 16 => (Const16 [0])
+((Lsh8|Rsh8U)x64 _ (Const64 [c])) && uint64(c) >= 8 => (Const8 [0])
+
+// combine const shifts
+(Lsh64x64 <t> (Lsh64x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Lsh64x64 x (Const64 <t> [c+d]))
+(Lsh32x64 <t> (Lsh32x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Lsh32x64 x (Const64 <t> [c+d]))
+(Lsh16x64 <t> (Lsh16x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Lsh16x64 x (Const64 <t> [c+d]))
+(Lsh8x64 <t> (Lsh8x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Lsh8x64 x (Const64 <t> [c+d]))
+
+(Rsh64x64 <t> (Rsh64x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh64x64 x (Const64 <t> [c+d]))
+(Rsh32x64 <t> (Rsh32x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh32x64 x (Const64 <t> [c+d]))
+(Rsh16x64 <t> (Rsh16x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh16x64 x (Const64 <t> [c+d]))
+(Rsh8x64 <t> (Rsh8x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh8x64 x (Const64 <t> [c+d]))
+
+(Rsh64Ux64 <t> (Rsh64Ux64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh64Ux64 x (Const64 <t> [c+d]))
+(Rsh32Ux64 <t> (Rsh32Ux64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh32Ux64 x (Const64 <t> [c+d]))
+(Rsh16Ux64 <t> (Rsh16Ux64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh16Ux64 x (Const64 <t> [c+d]))
+(Rsh8Ux64 <t> (Rsh8Ux64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh8Ux64 x (Const64 <t> [c+d]))
+
+// Remove signed right shift before an unsigned right shift that extracts the sign bit.
+(Rsh8Ux64 (Rsh8x64 x _) (Const64 <t> [7] )) => (Rsh8Ux64 x (Const64 <t> [7] ))
+(Rsh16Ux64 (Rsh16x64 x _) (Const64 <t> [15])) => (Rsh16Ux64 x (Const64 <t> [15]))
+(Rsh32Ux64 (Rsh32x64 x _) (Const64 <t> [31])) => (Rsh32Ux64 x (Const64 <t> [31]))
+(Rsh64Ux64 (Rsh64x64 x _) (Const64 <t> [63])) => (Rsh64Ux64 x (Const64 <t> [63]))
+
+// ((x >> c1) << c2) >> c3
+(Rsh(64|32|16|8)Ux64 (Lsh(64|32|16|8)x64 (Rsh(64|32|16|8)Ux64 x (Const64 [c1])) (Const64 [c2])) (Const64 [c3]))
+ && uint64(c1) >= uint64(c2) && uint64(c3) >= uint64(c2) && !uaddOvf(c1-c2, c3)
+ => (Rsh(64|32|16|8)Ux64 x (Const64 <typ.UInt64> [c1-c2+c3]))
+
+// ((x << c1) >> c2) << c3
+(Lsh(64|32|16|8)x64 (Rsh(64|32|16|8)Ux64 (Lsh(64|32|16|8)x64 x (Const64 [c1])) (Const64 [c2])) (Const64 [c3]))
+ && uint64(c1) >= uint64(c2) && uint64(c3) >= uint64(c2) && !uaddOvf(c1-c2, c3)
+ => (Lsh(64|32|16|8)x64 x (Const64 <typ.UInt64> [c1-c2+c3]))
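+// For example, with c1=4, c2=2, c3=3 the unsigned form ((x >> 4) << 2) >> 3
+// collapses to x >> 5, i.e. x >> (c1-c2+c3): the left shift cannot overflow
+// because x was already shifted right by c1 >= c2 bits, and the low zero bits
+// it introduces are removed again by the final shift of c3 >= c2 bits.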
+
+// (x >> c) & uppermask = 0
+(And64 (Const64 [m]) (Rsh64Ux64 _ (Const64 [c]))) && c >= int64(64-ntz64(m)) => (Const64 [0])
+(And32 (Const32 [m]) (Rsh32Ux64 _ (Const64 [c]))) && c >= int64(32-ntz32(m)) => (Const32 [0])
+(And16 (Const16 [m]) (Rsh16Ux64 _ (Const64 [c]))) && c >= int64(16-ntz16(m)) => (Const16 [0])
+(And8 (Const8 [m]) (Rsh8Ux64 _ (Const64 [c]))) && c >= int64(8-ntz8(m)) => (Const8 [0])
+
+// (x << c) & lowermask = 0
+(And64 (Const64 [m]) (Lsh64x64 _ (Const64 [c]))) && c >= int64(64-nlz64(m)) => (Const64 [0])
+(And32 (Const32 [m]) (Lsh32x64 _ (Const64 [c]))) && c >= int64(32-nlz32(m)) => (Const32 [0])
+(And16 (Const16 [m]) (Lsh16x64 _ (Const64 [c]))) && c >= int64(16-nlz16(m)) => (Const16 [0])
+(And8 (Const8 [m]) (Lsh8x64 _ (Const64 [c]))) && c >= int64(8-nlz8(m)) => (Const8 [0])
+
+// replace shifts with zero extensions
+(Rsh16Ux64 (Lsh16x64 x (Const64 [8])) (Const64 [8])) => (ZeroExt8to16 (Trunc16to8 <typ.UInt8> x))
+(Rsh32Ux64 (Lsh32x64 x (Const64 [24])) (Const64 [24])) => (ZeroExt8to32 (Trunc32to8 <typ.UInt8> x))
+(Rsh64Ux64 (Lsh64x64 x (Const64 [56])) (Const64 [56])) => (ZeroExt8to64 (Trunc64to8 <typ.UInt8> x))
+(Rsh32Ux64 (Lsh32x64 x (Const64 [16])) (Const64 [16])) => (ZeroExt16to32 (Trunc32to16 <typ.UInt16> x))
+(Rsh64Ux64 (Lsh64x64 x (Const64 [48])) (Const64 [48])) => (ZeroExt16to64 (Trunc64to16 <typ.UInt16> x))
+(Rsh64Ux64 (Lsh64x64 x (Const64 [32])) (Const64 [32])) => (ZeroExt32to64 (Trunc64to32 <typ.UInt32> x))
+
+// replace shifts with sign extensions
+(Rsh16x64 (Lsh16x64 x (Const64 [8])) (Const64 [8])) => (SignExt8to16 (Trunc16to8 <typ.Int8> x))
+(Rsh32x64 (Lsh32x64 x (Const64 [24])) (Const64 [24])) => (SignExt8to32 (Trunc32to8 <typ.Int8> x))
+(Rsh64x64 (Lsh64x64 x (Const64 [56])) (Const64 [56])) => (SignExt8to64 (Trunc64to8 <typ.Int8> x))
+(Rsh32x64 (Lsh32x64 x (Const64 [16])) (Const64 [16])) => (SignExt16to32 (Trunc32to16 <typ.Int16> x))
+(Rsh64x64 (Lsh64x64 x (Const64 [48])) (Const64 [48])) => (SignExt16to64 (Trunc64to16 <typ.Int16> x))
+(Rsh64x64 (Lsh64x64 x (Const64 [32])) (Const64 [32])) => (SignExt32to64 (Trunc64to32 <typ.Int32> x))
+
+// constant comparisons
+(Eq(64|32|16|8) (Const(64|32|16|8) [c]) (Const(64|32|16|8) [d])) => (ConstBool [c == d])
+(Neq(64|32|16|8) (Const(64|32|16|8) [c]) (Const(64|32|16|8) [d])) => (ConstBool [c != d])
+(Less(64|32|16|8) (Const(64|32|16|8) [c]) (Const(64|32|16|8) [d])) => (ConstBool [c < d])
+(Leq(64|32|16|8) (Const(64|32|16|8) [c]) (Const(64|32|16|8) [d])) => (ConstBool [c <= d])
+
+(Less64U (Const64 [c]) (Const64 [d])) => (ConstBool [uint64(c) < uint64(d)])
+(Less32U (Const32 [c]) (Const32 [d])) => (ConstBool [uint32(c) < uint32(d)])
+(Less16U (Const16 [c]) (Const16 [d])) => (ConstBool [uint16(c) < uint16(d)])
+(Less8U (Const8 [c]) (Const8 [d])) => (ConstBool [ uint8(c) < uint8(d)])
+
+(Leq64U (Const64 [c]) (Const64 [d])) => (ConstBool [uint64(c) <= uint64(d)])
+(Leq32U (Const32 [c]) (Const32 [d])) => (ConstBool [uint32(c) <= uint32(d)])
+(Leq16U (Const16 [c]) (Const16 [d])) => (ConstBool [uint16(c) <= uint16(d)])
+(Leq8U (Const8 [c]) (Const8 [d])) => (ConstBool [ uint8(c) <= uint8(d)])
+
+(Leq8 (Const8 [0]) (And8 _ (Const8 [c]))) && c >= 0 => (ConstBool [true])
+(Leq16 (Const16 [0]) (And16 _ (Const16 [c]))) && c >= 0 => (ConstBool [true])
+(Leq32 (Const32 [0]) (And32 _ (Const32 [c]))) && c >= 0 => (ConstBool [true])
+(Leq64 (Const64 [0]) (And64 _ (Const64 [c]))) && c >= 0 => (ConstBool [true])
+
+(Leq8 (Const8 [0]) (Rsh8Ux64 _ (Const64 [c]))) && c > 0 => (ConstBool [true])
+(Leq16 (Const16 [0]) (Rsh16Ux64 _ (Const64 [c]))) && c > 0 => (ConstBool [true])
+(Leq32 (Const32 [0]) (Rsh32Ux64 _ (Const64 [c]))) && c > 0 => (ConstBool [true])
+(Leq64 (Const64 [0]) (Rsh64Ux64 _ (Const64 [c]))) && c > 0 => (ConstBool [true])
+
+// constant floating point comparisons
+(Eq32F (Const32F [c]) (Const32F [d])) => (ConstBool [c == d])
+(Eq64F (Const64F [c]) (Const64F [d])) => (ConstBool [c == d])
+(Neq32F (Const32F [c]) (Const32F [d])) => (ConstBool [c != d])
+(Neq64F (Const64F [c]) (Const64F [d])) => (ConstBool [c != d])
+(Less32F (Const32F [c]) (Const32F [d])) => (ConstBool [c < d])
+(Less64F (Const64F [c]) (Const64F [d])) => (ConstBool [c < d])
+(Leq32F (Const32F [c]) (Const32F [d])) => (ConstBool [c <= d])
+(Leq64F (Const64F [c]) (Const64F [d])) => (ConstBool [c <= d])
+
+// simplifications
+(Or(64|32|16|8) x x) => x
+(Or(64|32|16|8) (Const(64|32|16|8) [0]) x) => x
+(Or(64|32|16|8) (Const(64|32|16|8) [-1]) _) => (Const(64|32|16|8) [-1])
+
+(And(64|32|16|8) x x) => x
+(And(64|32|16|8) (Const(64|32|16|8) [-1]) x) => x
+(And(64|32|16|8) (Const(64|32|16|8) [0]) _) => (Const(64|32|16|8) [0])
+
+(Xor(64|32|16|8) x x) => (Const(64|32|16|8) [0])
+(Xor(64|32|16|8) (Const(64|32|16|8) [0]) x) => x
+
+(Add(64|32|16|8) (Const(64|32|16|8) [0]) x) => x
+(Sub(64|32|16|8) x x) => (Const(64|32|16|8) [0])
+(Mul(64|32|16|8) (Const(64|32|16|8) [0]) _) => (Const(64|32|16|8) [0])
+
+(Com(64|32|16|8) (Com(64|32|16|8) x)) => x
+(Com(64|32|16|8) (Const(64|32|16|8) [c])) => (Const(64|32|16|8) [^c])
+
+(Neg(64|32|16|8) (Sub(64|32|16|8) x y)) => (Sub(64|32|16|8) y x)
+
+// ^(x-1) == ^x+1 == -x
+(Add(64|32|16|8) (Const(64|32|16|8) [1]) (Com(64|32|16|8) x)) => (Neg(64|32|16|8) x)
+(Com(64|32|16|8) (Add(64|32|16|8) (Const(64|32|16|8) [-1]) x)) => (Neg(64|32|16|8) x)
+
+// -(-x) == x
+(Neg(64|32|16|8) (Neg(64|32|16|8) x)) => x
+
+// -^x == x+1
+(Neg(64|32|16|8) <t> (Com(64|32|16|8) x)) => (Add(64|32|16|8) (Const(64|32|16|8) <t> [1]) x)
+
+(And(64|32|16|8) x (And(64|32|16|8) x y)) => (And(64|32|16|8) x y)
+(Or(64|32|16|8) x (Or(64|32|16|8) x y)) => (Or(64|32|16|8) x y)
+(Xor(64|32|16|8) x (Xor(64|32|16|8) x y)) => y
+
+// Unsigned comparisons to zero.
+(Less(64U|32U|16U|8U) _ (Const(64|32|16|8) [0])) => (ConstBool [false])
+(Leq(64U|32U|16U|8U) (Const(64|32|16|8) [0]) _) => (ConstBool [true])
+
+// Ands clear bits. Ors set bits.
+// If a subsequent Or will set all the bits
+// that an And cleared, we can skip the And.
+// This happens in bitmasking code like:
+// x &^= 3 << shift // clear two old bits
+// x |= v << shift // set two new bits
+// when shift is a small constant and v ends up a constant 3.
+(Or8 (And8 x (Const8 [c2])) (Const8 <t> [c1])) && ^(c1 | c2) == 0 => (Or8 (Const8 <t> [c1]) x)
+(Or16 (And16 x (Const16 [c2])) (Const16 <t> [c1])) && ^(c1 | c2) == 0 => (Or16 (Const16 <t> [c1]) x)
+(Or32 (And32 x (Const32 [c2])) (Const32 <t> [c1])) && ^(c1 | c2) == 0 => (Or32 (Const32 <t> [c1]) x)
+(Or64 (And64 x (Const64 [c2])) (Const64 <t> [c1])) && ^(c1 | c2) == 0 => (Or64 (Const64 <t> [c1]) x)
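+// For example, with shift == 2 the And uses c2 = ^(3<<2) = 0xf3 and the Or uses
+// c1 = 3<<2 = 0x0c; c1|c2 covers every bit of the byte, so the And is redundant
+// and the Or can be applied to x directly.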
+
+(Trunc64to8 (And64 (Const64 [y]) x)) && y&0xFF == 0xFF => (Trunc64to8 x)
+(Trunc64to16 (And64 (Const64 [y]) x)) && y&0xFFFF == 0xFFFF => (Trunc64to16 x)
+(Trunc64to32 (And64 (Const64 [y]) x)) && y&0xFFFFFFFF == 0xFFFFFFFF => (Trunc64to32 x)
+(Trunc32to8 (And32 (Const32 [y]) x)) && y&0xFF == 0xFF => (Trunc32to8 x)
+(Trunc32to16 (And32 (Const32 [y]) x)) && y&0xFFFF == 0xFFFF => (Trunc32to16 x)
+(Trunc16to8 (And16 (Const16 [y]) x)) && y&0xFF == 0xFF => (Trunc16to8 x)
+
+(ZeroExt8to64 (Trunc64to8 x:(Rsh64Ux64 _ (Const64 [s])))) && s >= 56 => x
+(ZeroExt16to64 (Trunc64to16 x:(Rsh64Ux64 _ (Const64 [s])))) && s >= 48 => x
+(ZeroExt32to64 (Trunc64to32 x:(Rsh64Ux64 _ (Const64 [s])))) && s >= 32 => x
+(ZeroExt8to32 (Trunc32to8 x:(Rsh32Ux64 _ (Const64 [s])))) && s >= 24 => x
+(ZeroExt16to32 (Trunc32to16 x:(Rsh32Ux64 _ (Const64 [s])))) && s >= 16 => x
+(ZeroExt8to16 (Trunc16to8 x:(Rsh16Ux64 _ (Const64 [s])))) && s >= 8 => x
+
+(SignExt8to64 (Trunc64to8 x:(Rsh64x64 _ (Const64 [s])))) && s >= 56 => x
+(SignExt16to64 (Trunc64to16 x:(Rsh64x64 _ (Const64 [s])))) && s >= 48 => x
+(SignExt32to64 (Trunc64to32 x:(Rsh64x64 _ (Const64 [s])))) && s >= 32 => x
+(SignExt8to32 (Trunc32to8 x:(Rsh32x64 _ (Const64 [s])))) && s >= 24 => x
+(SignExt16to32 (Trunc32to16 x:(Rsh32x64 _ (Const64 [s])))) && s >= 16 => x
+(SignExt8to16 (Trunc16to8 x:(Rsh16x64 _ (Const64 [s])))) && s >= 8 => x
+
+(Slicemask (Const32 [x])) && x > 0 => (Const32 [-1])
+(Slicemask (Const32 [0])) => (Const32 [0])
+(Slicemask (Const64 [x])) && x > 0 => (Const64 [-1])
+(Slicemask (Const64 [0])) => (Const64 [0])
+
+// simplifications often used for lengths. e.g. len(s[i:i+5])==5
+(Sub(64|32|16|8) (Add(64|32|16|8) x y) x) => y
+(Sub(64|32|16|8) (Add(64|32|16|8) x y) y) => x
+
+// basic phi simplifications
+(Phi (Const8 [c]) (Const8 [c])) => (Const8 [c])
+(Phi (Const16 [c]) (Const16 [c])) => (Const16 [c])
+(Phi (Const32 [c]) (Const32 [c])) => (Const32 [c])
+(Phi (Const64 [c]) (Const64 [c])) => (Const64 [c])
+
+// slice and interface comparisons
+// The frontend ensures that we can only compare against nil,
+// so we need only compare the first word (interface type or slice ptr).
+(EqInter x y) => (EqPtr (ITab x) (ITab y))
+(NeqInter x y) => (NeqPtr (ITab x) (ITab y))
+(EqSlice x y) => (EqPtr (SlicePtr x) (SlicePtr y))
+(NeqSlice x y) => (NeqPtr (SlicePtr x) (SlicePtr y))
+
+// Load of store of same address, with compatibly typed value and same size
+(Load <t1> p1 (Store {t2} p2 x _))
+ && isSamePtr(p1, p2)
+ && t1.Compare(x.Type) == types.CMPeq
+ && t1.Size() == t2.Size()
+ => x
+(Load <t1> p1 (Store {t2} p2 _ (Store {t3} p3 x _)))
+ && isSamePtr(p1, p3)
+ && t1.Compare(x.Type) == types.CMPeq
+ && t1.Size() == t2.Size()
+ && disjoint(p3, t3.Size(), p2, t2.Size())
+ => x
+(Load <t1> p1 (Store {t2} p2 _ (Store {t3} p3 _ (Store {t4} p4 x _))))
+ && isSamePtr(p1, p4)
+ && t1.Compare(x.Type) == types.CMPeq
+ && t1.Size() == t2.Size()
+ && disjoint(p4, t4.Size(), p2, t2.Size())
+ && disjoint(p4, t4.Size(), p3, t3.Size())
+ => x
+(Load <t1> p1 (Store {t2} p2 _ (Store {t3} p3 _ (Store {t4} p4 _ (Store {t5} p5 x _)))))
+ && isSamePtr(p1, p5)
+ && t1.Compare(x.Type) == types.CMPeq
+ && t1.Size() == t2.Size()
+ && disjoint(p5, t5.Size(), p2, t2.Size())
+ && disjoint(p5, t5.Size(), p3, t3.Size())
+ && disjoint(p5, t5.Size(), p4, t4.Size())
+ => x
+
+// Pass constants through math.Float{32,64}bits and math.Float{32,64}frombits
+ (Load <t1> p1 (Store {t2} p2 (Const64 [x]) _)) && isSamePtr(p1,p2) && sizeof(t2) == 8 && is64BitFloat(t1) && !math.IsNaN(math.Float64frombits(uint64(x))) => (Const64F [math.Float64frombits(uint64(x))])
+ (Load <t1> p1 (Store {t2} p2 (Const32 [x]) _)) && isSamePtr(p1,p2) && sizeof(t2) == 4 && is32BitFloat(t1) && !math.IsNaN(float64(math.Float32frombits(uint32(x)))) => (Const32F [math.Float32frombits(uint32(x))])
+(Load <t1> p1 (Store {t2} p2 (Const64F [x]) _)) && isSamePtr(p1,p2) && sizeof(t2) == 8 && is64BitInt(t1) => (Const64 [int64(math.Float64bits(x))])
+(Load <t1> p1 (Store {t2} p2 (Const32F [x]) _)) && isSamePtr(p1,p2) && sizeof(t2) == 4 && is32BitInt(t1) => (Const32 [int32(math.Float32bits(x))])
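+// For example, a Store of (Const64 [0x3ff0000000000000]) followed by a float64
+// Load of the same address folds to (Const64F [1.0]), since that is the
+// IEEE-754 bit pattern of 1.0; the integer-to-float direction above is skipped
+// for NaN bit patterns.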
+
+// Float (hoist) Loads up past intervening Stores to Zeros so they can be constant folded.
+(Load <t1> op:(OffPtr [o1] p1)
+ (Store {t2} p2 _
+ mem:(Zero [n] p3 _)))
+ && o1 >= 0 && o1+t1.Size() <= n && isSamePtr(p1, p3)
+ && fe.CanSSA(t1)
+ && disjoint(op, t1.Size(), p2, t2.Size())
+ => @mem.Block (Load <t1> (OffPtr <op.Type> [o1] p3) mem)
+(Load <t1> op:(OffPtr [o1] p1)
+ (Store {t2} p2 _
+ (Store {t3} p3 _
+ mem:(Zero [n] p4 _))))
+ && o1 >= 0 && o1+t1.Size() <= n && isSamePtr(p1, p4)
+ && fe.CanSSA(t1)
+ && disjoint(op, t1.Size(), p2, t2.Size())
+ && disjoint(op, t1.Size(), p3, t3.Size())
+ => @mem.Block (Load <t1> (OffPtr <op.Type> [o1] p4) mem)
+(Load <t1> op:(OffPtr [o1] p1)
+ (Store {t2} p2 _
+ (Store {t3} p3 _
+ (Store {t4} p4 _
+ mem:(Zero [n] p5 _)))))
+ && o1 >= 0 && o1+t1.Size() <= n && isSamePtr(p1, p5)
+ && fe.CanSSA(t1)
+ && disjoint(op, t1.Size(), p2, t2.Size())
+ && disjoint(op, t1.Size(), p3, t3.Size())
+ && disjoint(op, t1.Size(), p4, t4.Size())
+ => @mem.Block (Load <t1> (OffPtr <op.Type> [o1] p5) mem)
+(Load <t1> op:(OffPtr [o1] p1)
+ (Store {t2} p2 _
+ (Store {t3} p3 _
+ (Store {t4} p4 _
+ (Store {t5} p5 _
+ mem:(Zero [n] p6 _))))))
+ && o1 >= 0 && o1+t1.Size() <= n && isSamePtr(p1, p6)
+ && fe.CanSSA(t1)
+ && disjoint(op, t1.Size(), p2, t2.Size())
+ && disjoint(op, t1.Size(), p3, t3.Size())
+ && disjoint(op, t1.Size(), p4, t4.Size())
+ && disjoint(op, t1.Size(), p5, t5.Size())
+ => @mem.Block (Load <t1> (OffPtr <op.Type> [o1] p6) mem)
+
+// Zero to Load forwarding.
+(Load <t1> (OffPtr [o] p1) (Zero [n] p2 _))
+ && t1.IsBoolean()
+ && isSamePtr(p1, p2)
+ && n >= o + 1
+ => (ConstBool [false])
+(Load <t1> (OffPtr [o] p1) (Zero [n] p2 _))
+ && is8BitInt(t1)
+ && isSamePtr(p1, p2)
+ && n >= o + 1
+ => (Const8 [0])
+(Load <t1> (OffPtr [o] p1) (Zero [n] p2 _))
+ && is16BitInt(t1)
+ && isSamePtr(p1, p2)
+ && n >= o + 2
+ => (Const16 [0])
+(Load <t1> (OffPtr [o] p1) (Zero [n] p2 _))
+ && is32BitInt(t1)
+ && isSamePtr(p1, p2)
+ && n >= o + 4
+ => (Const32 [0])
+(Load <t1> (OffPtr [o] p1) (Zero [n] p2 _))
+ && is64BitInt(t1)
+ && isSamePtr(p1, p2)
+ && n >= o + 8
+ => (Const64 [0])
+(Load <t1> (OffPtr [o] p1) (Zero [n] p2 _))
+ && is32BitFloat(t1)
+ && isSamePtr(p1, p2)
+ && n >= o + 4
+ => (Const32F [0])
+(Load <t1> (OffPtr [o] p1) (Zero [n] p2 _))
+ && is64BitFloat(t1)
+ && isSamePtr(p1, p2)
+ && n >= o + 8
+ => (Const64F [0])
+
+// Eliminate stores of values that have just been loaded from the same location.
+// We also handle the common case where there are some intermediate stores.
+(Store {t1} p1 (Load <t2> p2 mem) mem)
+ && isSamePtr(p1, p2)
+ && t2.Size() == t1.Size()
+ => mem
+(Store {t1} p1 (Load <t2> p2 oldmem) mem:(Store {t3} p3 _ oldmem))
+ && isSamePtr(p1, p2)
+ && t2.Size() == t1.Size()
+ && disjoint(p1, t1.Size(), p3, t3.Size())
+ => mem
+(Store {t1} p1 (Load <t2> p2 oldmem) mem:(Store {t3} p3 _ (Store {t4} p4 _ oldmem)))
+ && isSamePtr(p1, p2)
+ && t2.Size() == t1.Size()
+ && disjoint(p1, t1.Size(), p3, t3.Size())
+ && disjoint(p1, t1.Size(), p4, t4.Size())
+ => mem
+(Store {t1} p1 (Load <t2> p2 oldmem) mem:(Store {t3} p3 _ (Store {t4} p4 _ (Store {t5} p5 _ oldmem))))
+ && isSamePtr(p1, p2)
+ && t2.Size() == t1.Size()
+ && disjoint(p1, t1.Size(), p3, t3.Size())
+ && disjoint(p1, t1.Size(), p4, t4.Size())
+ && disjoint(p1, t1.Size(), p5, t5.Size())
+ => mem
+
+// Don't Store zeros to cleared variables.
+(Store {t} (OffPtr [o] p1) x mem:(Zero [n] p2 _))
+ && isConstZero(x)
+ && o >= 0 && t.Size() + o <= n && isSamePtr(p1, p2)
+ => mem
+(Store {t1} op:(OffPtr [o1] p1) x mem:(Store {t2} p2 _ (Zero [n] p3 _)))
+ && isConstZero(x)
+ && o1 >= 0 && t1.Size() + o1 <= n && isSamePtr(p1, p3)
+ && disjoint(op, t1.Size(), p2, t2.Size())
+ => mem
+(Store {t1} op:(OffPtr [o1] p1) x mem:(Store {t2} p2 _ (Store {t3} p3 _ (Zero [n] p4 _))))
+ && isConstZero(x)
+ && o1 >= 0 && t1.Size() + o1 <= n && isSamePtr(p1, p4)
+ && disjoint(op, t1.Size(), p2, t2.Size())
+ && disjoint(op, t1.Size(), p3, t3.Size())
+ => mem
+(Store {t1} op:(OffPtr [o1] p1) x mem:(Store {t2} p2 _ (Store {t3} p3 _ (Store {t4} p4 _ (Zero [n] p5 _)))))
+ && isConstZero(x)
+ && o1 >= 0 && t1.Size() + o1 <= n && isSamePtr(p1, p5)
+ && disjoint(op, t1.Size(), p2, t2.Size())
+ && disjoint(op, t1.Size(), p3, t3.Size())
+ && disjoint(op, t1.Size(), p4, t4.Size())
+ => mem
+
+// Collapse OffPtr
+(OffPtr (OffPtr p [b]) [a]) => (OffPtr p [a+b])
+(OffPtr p [0]) && v.Type.Compare(p.Type) == types.CMPeq => p
+
+// indexing operations
+// Note: bounds check has already been done
+(PtrIndex <t> ptr idx) && config.PtrSize == 4 && is32Bit(t.Elem().Size()) => (AddPtr ptr (Mul32 <typ.Int> idx (Const32 <typ.Int> [int32(t.Elem().Size())])))
+(PtrIndex <t> ptr idx) && config.PtrSize == 8 => (AddPtr ptr (Mul64 <typ.Int> idx (Const64 <typ.Int> [t.Elem().Size()])))
+
+// struct operations
+(StructSelect (StructMake1 x)) => x
+(StructSelect [0] (StructMake2 x _)) => x
+(StructSelect [1] (StructMake2 _ x)) => x
+(StructSelect [0] (StructMake3 x _ _)) => x
+(StructSelect [1] (StructMake3 _ x _)) => x
+(StructSelect [2] (StructMake3 _ _ x)) => x
+(StructSelect [0] (StructMake4 x _ _ _)) => x
+(StructSelect [1] (StructMake4 _ x _ _)) => x
+(StructSelect [2] (StructMake4 _ _ x _)) => x
+(StructSelect [3] (StructMake4 _ _ _ x)) => x
+
+(Load <t> _ _) && t.IsStruct() && t.NumFields() == 0 && fe.CanSSA(t) =>
+ (StructMake0)
+(Load <t> ptr mem) && t.IsStruct() && t.NumFields() == 1 && fe.CanSSA(t) =>
+ (StructMake1
+ (Load <t.FieldType(0)> (OffPtr <t.FieldType(0).PtrTo()> [0] ptr) mem))
+(Load <t> ptr mem) && t.IsStruct() && t.NumFields() == 2 && fe.CanSSA(t) =>
+ (StructMake2
+ (Load <t.FieldType(0)> (OffPtr <t.FieldType(0).PtrTo()> [0] ptr) mem)
+ (Load <t.FieldType(1)> (OffPtr <t.FieldType(1).PtrTo()> [t.FieldOff(1)] ptr) mem))
+(Load <t> ptr mem) && t.IsStruct() && t.NumFields() == 3 && fe.CanSSA(t) =>
+ (StructMake3
+ (Load <t.FieldType(0)> (OffPtr <t.FieldType(0).PtrTo()> [0] ptr) mem)
+ (Load <t.FieldType(1)> (OffPtr <t.FieldType(1).PtrTo()> [t.FieldOff(1)] ptr) mem)
+ (Load <t.FieldType(2)> (OffPtr <t.FieldType(2).PtrTo()> [t.FieldOff(2)] ptr) mem))
+(Load <t> ptr mem) && t.IsStruct() && t.NumFields() == 4 && fe.CanSSA(t) =>
+ (StructMake4
+ (Load <t.FieldType(0)> (OffPtr <t.FieldType(0).PtrTo()> [0] ptr) mem)
+ (Load <t.FieldType(1)> (OffPtr <t.FieldType(1).PtrTo()> [t.FieldOff(1)] ptr) mem)
+ (Load <t.FieldType(2)> (OffPtr <t.FieldType(2).PtrTo()> [t.FieldOff(2)] ptr) mem)
+ (Load <t.FieldType(3)> (OffPtr <t.FieldType(3).PtrTo()> [t.FieldOff(3)] ptr) mem))
+
+(StructSelect [i] x:(Load <t> ptr mem)) && !fe.CanSSA(t) =>
+ @x.Block (Load <v.Type> (OffPtr <v.Type.PtrTo()> [t.FieldOff(int(i))] ptr) mem)
+
+(Store _ (StructMake0) mem) => mem
+(Store dst (StructMake1 <t> f0) mem) =>
+ (Store {t.FieldType(0)} (OffPtr <t.FieldType(0).PtrTo()> [0] dst) f0 mem)
+(Store dst (StructMake2 <t> f0 f1) mem) =>
+ (Store {t.FieldType(1)}
+ (OffPtr <t.FieldType(1).PtrTo()> [t.FieldOff(1)] dst)
+ f1
+ (Store {t.FieldType(0)}
+ (OffPtr <t.FieldType(0).PtrTo()> [0] dst)
+ f0 mem))
+(Store dst (StructMake3 <t> f0 f1 f2) mem) =>
+ (Store {t.FieldType(2)}
+ (OffPtr <t.FieldType(2).PtrTo()> [t.FieldOff(2)] dst)
+ f2
+ (Store {t.FieldType(1)}
+ (OffPtr <t.FieldType(1).PtrTo()> [t.FieldOff(1)] dst)
+ f1
+ (Store {t.FieldType(0)}
+ (OffPtr <t.FieldType(0).PtrTo()> [0] dst)
+ f0 mem)))
+(Store dst (StructMake4 <t> f0 f1 f2 f3) mem) =>
+ (Store {t.FieldType(3)}
+ (OffPtr <t.FieldType(3).PtrTo()> [t.FieldOff(3)] dst)
+ f3
+ (Store {t.FieldType(2)}
+ (OffPtr <t.FieldType(2).PtrTo()> [t.FieldOff(2)] dst)
+ f2
+ (Store {t.FieldType(1)}
+ (OffPtr <t.FieldType(1).PtrTo()> [t.FieldOff(1)] dst)
+ f1
+ (Store {t.FieldType(0)}
+ (OffPtr <t.FieldType(0).PtrTo()> [0] dst)
+ f0 mem))))
+
+// Putting struct{*byte} and similar into direct interfaces.
+(IMake typ (StructMake1 val)) => (IMake typ val)
+(StructSelect [0] (IData x)) => (IData x)
+
+// un-SSAable values use mem->mem copies
+(Store {t} dst (Load src mem) mem) && !fe.CanSSA(t) =>
+ (Move {t} [t.Size()] dst src mem)
+(Store {t} dst (Load src mem) (VarDef {x} mem)) && !fe.CanSSA(t) =>
+ (Move {t} [t.Size()] dst src (VarDef {x} mem))
+
+// array ops
+(ArraySelect (ArrayMake1 x)) => x
+
+(Load <t> _ _) && t.IsArray() && t.NumElem() == 0 =>
+ (ArrayMake0)
+
+(Load <t> ptr mem) && t.IsArray() && t.NumElem() == 1 && fe.CanSSA(t) =>
+ (ArrayMake1 (Load <t.Elem()> ptr mem))
+
+(Store _ (ArrayMake0) mem) => mem
+(Store dst (ArrayMake1 e) mem) => (Store {e.Type} dst e mem)
+
+// Putting [1]*byte and similar into direct interfaces.
+(IMake typ (ArrayMake1 val)) => (IMake typ val)
+(ArraySelect [0] (IData x)) => (IData x)
+
+// string ops
+// Decomposing StringMake and lowering of StringPtr and StringLen
+// happens in a later pass, dec, so that these operations are available
+// to other passes for optimizations.
+(StringPtr (StringMake (Addr <t> {s} base) _)) => (Addr <t> {s} base)
+(StringLen (StringMake _ (Const64 <t> [c]))) => (Const64 <t> [c])
+(ConstString {str}) && config.PtrSize == 4 && str == "" =>
+ (StringMake (ConstNil) (Const32 <typ.Int> [0]))
+(ConstString {str}) && config.PtrSize == 8 && str == "" =>
+ (StringMake (ConstNil) (Const64 <typ.Int> [0]))
+(ConstString {str}) && config.PtrSize == 4 && str != "" =>
+ (StringMake
+ (Addr <typ.BytePtr> {fe.StringData(str)}
+ (SB))
+ (Const32 <typ.Int> [int32(len(str))]))
+(ConstString {str}) && config.PtrSize == 8 && str != "" =>
+ (StringMake
+ (Addr <typ.BytePtr> {fe.StringData(str)}
+ (SB))
+ (Const64 <typ.Int> [int64(len(str))]))
+
+// slice ops
+// Only a few slice rules are provided here. See dec.rules for
+// a more comprehensive set.
+(SliceLen (SliceMake _ (Const64 <t> [c]) _)) => (Const64 <t> [c])
+(SliceCap (SliceMake _ _ (Const64 <t> [c]))) => (Const64 <t> [c])
+(SliceLen (SliceMake _ (Const32 <t> [c]) _)) => (Const32 <t> [c])
+(SliceCap (SliceMake _ _ (Const32 <t> [c]))) => (Const32 <t> [c])
+(SlicePtr (SliceMake (SlicePtr x) _ _)) => (SlicePtr x)
+(SliceLen (SliceMake _ (SliceLen x) _)) => (SliceLen x)
+(SliceCap (SliceMake _ _ (SliceCap x))) => (SliceCap x)
+(SliceCap (SliceMake _ _ (SliceLen x))) => (SliceLen x)
+(ConstSlice) && config.PtrSize == 4 =>
+ (SliceMake
+ (ConstNil <v.Type.Elem().PtrTo()>)
+ (Const32 <typ.Int> [0])
+ (Const32 <typ.Int> [0]))
+(ConstSlice) && config.PtrSize == 8 =>
+ (SliceMake
+ (ConstNil <v.Type.Elem().PtrTo()>)
+ (Const64 <typ.Int> [0])
+ (Const64 <typ.Int> [0]))
+
+// interface ops
+(ConstInterface) =>
+ (IMake
+ (ConstNil <typ.Uintptr>)
+ (ConstNil <typ.BytePtr>))
+
+(NilCheck (GetG mem) mem) => mem
+
+(If (Not cond) yes no) => (If cond no yes)
+(If (ConstBool [c]) yes no) && c => (First yes no)
+(If (ConstBool [c]) yes no) && !c => (First no yes)
+
+// Get rid of Convert ops for pointer arithmetic on unsafe.Pointer.
+(Convert (Add(64|32) (Convert ptr mem) off) mem) => (AddPtr ptr off)
+(Convert (Convert ptr mem) mem) => ptr
+
+// strength reduction of divide by a constant.
+// See ../magic.go for a detailed description of these algorithms.
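+// For instance, unsigned 32-bit division by 3 can be computed without a divide
+// instruction as uint32((uint64(x) * 0xaaaaaaab) >> 33), which is the kind of
+// multiply-and-shift sequence the rules below generate.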
+
+// Unsigned divide by power of 2. Strength reduce to a shift.
+(Div8u n (Const8 [c])) && isPowerOfTwo8(c) => (Rsh8Ux64 n (Const64 <typ.UInt64> [log8(c)]))
+(Div16u n (Const16 [c])) && isPowerOfTwo16(c) => (Rsh16Ux64 n (Const64 <typ.UInt64> [log16(c)]))
+(Div32u n (Const32 [c])) && isPowerOfTwo32(c) => (Rsh32Ux64 n (Const64 <typ.UInt64> [log32(c)]))
+(Div64u n (Const64 [c])) && isPowerOfTwo64(c) => (Rsh64Ux64 n (Const64 <typ.UInt64> [log64(c)]))
+(Div64u n (Const64 [-1<<63])) => (Rsh64Ux64 n (Const64 <typ.UInt64> [63]))
+
+// Signed non-negative divide by power of 2.
+(Div8 n (Const8 [c])) && isNonNegative(n) && isPowerOfTwo8(c) => (Rsh8Ux64 n (Const64 <typ.UInt64> [log8(c)]))
+(Div16 n (Const16 [c])) && isNonNegative(n) && isPowerOfTwo16(c) => (Rsh16Ux64 n (Const64 <typ.UInt64> [log16(c)]))
+(Div32 n (Const32 [c])) && isNonNegative(n) && isPowerOfTwo32(c) => (Rsh32Ux64 n (Const64 <typ.UInt64> [log32(c)]))
+(Div64 n (Const64 [c])) && isNonNegative(n) && isPowerOfTwo64(c) => (Rsh64Ux64 n (Const64 <typ.UInt64> [log64(c)]))
+(Div64 n (Const64 [-1<<63])) && isNonNegative(n) => (Const64 [0])
+
+// Unsigned divide, not a power of 2. Strength reduce to a multiply.
+// For 8-bit divides, we just do a direct 9-bit by 8-bit multiply.
+(Div8u x (Const8 [c])) && umagicOK8(c) =>
+ (Trunc32to8
+ (Rsh32Ux64 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(1<<8+umagic8(c).m)])
+ (ZeroExt8to32 x))
+ (Const64 <typ.UInt64> [8+umagic8(c).s])))
+
+// For 16-bit divides on 64-bit machines, we do a direct 17-bit by 16-bit multiply.
+(Div16u x (Const16 [c])) && umagicOK16(c) && config.RegSize == 8 =>
+ (Trunc64to16
+ (Rsh64Ux64 <typ.UInt64>
+ (Mul64 <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(1<<16+umagic16(c).m)])
+ (ZeroExt16to64 x))
+ (Const64 <typ.UInt64> [16+umagic16(c).s])))
+
+// For 16-bit divides on 32-bit machines
+(Div16u x (Const16 [c])) && umagicOK16(c) && config.RegSize == 4 && umagic16(c).m&1 == 0 =>
+ (Trunc32to16
+ (Rsh32Ux64 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(1<<15+umagic16(c).m/2)])
+ (ZeroExt16to32 x))
+ (Const64 <typ.UInt64> [16+umagic16(c).s-1])))
+(Div16u x (Const16 [c])) && umagicOK16(c) && config.RegSize == 4 && c&1 == 0 =>
+ (Trunc32to16
+ (Rsh32Ux64 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(1<<15+(umagic16(c).m+1)/2)])
+ (Rsh32Ux64 <typ.UInt32> (ZeroExt16to32 x) (Const64 <typ.UInt64> [1])))
+ (Const64 <typ.UInt64> [16+umagic16(c).s-2])))
+(Div16u x (Const16 [c])) && umagicOK16(c) && config.RegSize == 4 && config.useAvg =>
+ (Trunc32to16
+ (Rsh32Ux64 <typ.UInt32>
+ (Avg32u
+ (Lsh32x64 <typ.UInt32> (ZeroExt16to32 x) (Const64 <typ.UInt64> [16]))
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(umagic16(c).m)])
+ (ZeroExt16to32 x)))
+ (Const64 <typ.UInt64> [16+umagic16(c).s-1])))
+
+// For 32-bit divides on 32-bit machines
+(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 4 && umagic32(c).m&1 == 0 && config.useHmul =>
+ (Rsh32Ux64 <typ.UInt32>
+ (Hmul32u <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(1<<31+umagic32(c).m/2)])
+ x)
+ (Const64 <typ.UInt64> [umagic32(c).s-1]))
+(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 4 && c&1 == 0 && config.useHmul =>
+ (Rsh32Ux64 <typ.UInt32>
+ (Hmul32u <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(1<<31+(umagic32(c).m+1)/2)])
+ (Rsh32Ux64 <typ.UInt32> x (Const64 <typ.UInt64> [1])))
+ (Const64 <typ.UInt64> [umagic32(c).s-2]))
+(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 4 && config.useAvg && config.useHmul =>
+ (Rsh32Ux64 <typ.UInt32>
+ (Avg32u
+ x
+ (Hmul32u <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(umagic32(c).m)])
+ x))
+ (Const64 <typ.UInt64> [umagic32(c).s-1]))
+
+// For 32-bit divides on 64-bit machines
+// We'll use a regular (non-hi) multiply for this case.
+(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 8 && umagic32(c).m&1 == 0 =>
+ (Trunc64to32
+ (Rsh64Ux64 <typ.UInt64>
+ (Mul64 <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(1<<31+umagic32(c).m/2)])
+ (ZeroExt32to64 x))
+ (Const64 <typ.UInt64> [32+umagic32(c).s-1])))
+(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 8 && c&1 == 0 =>
+ (Trunc64to32
+ (Rsh64Ux64 <typ.UInt64>
+ (Mul64 <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(1<<31+(umagic32(c).m+1)/2)])
+ (Rsh64Ux64 <typ.UInt64> (ZeroExt32to64 x) (Const64 <typ.UInt64> [1])))
+ (Const64 <typ.UInt64> [32+umagic32(c).s-2])))
+(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 8 && config.useAvg =>
+ (Trunc64to32
+ (Rsh64Ux64 <typ.UInt64>
+ (Avg64u
+ (Lsh64x64 <typ.UInt64> (ZeroExt32to64 x) (Const64 <typ.UInt64> [32]))
+ (Mul64 <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(umagic32(c).m)])
+ (ZeroExt32to64 x)))
+ (Const64 <typ.UInt64> [32+umagic32(c).s-1])))
+
+// For unsigned 64-bit divides on 32-bit machines,
+// if the constant fits in 16 bits (so that the last term
+// fits in 32 bits), convert to three 32-bit divides by a constant.
+//
+// If 1<<32 = Q * c + R
+// and x = hi << 32 + lo
+//
+// Then x = (hi/c*c + hi%c) << 32 + lo
+// = hi/c*c<<32 + hi%c<<32 + lo
+// = hi/c*c<<32 + (hi%c)*(Q*c+R) + lo/c*c + lo%c
+// = hi/c*c<<32 + (hi%c)*Q*c + lo/c*c + (hi%c*R+lo%c)
+// and x / c = (hi/c)<<32 + (hi%c)*Q + lo/c + (hi%c*R+lo%c)/c
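+//
+// As a concrete check, take c = 10 (so Q = 429496729, R = 6) and x = 5<<32 + 7:
+// (hi/c)<<32 + (hi%c)*Q + lo/c + (hi%c*R+lo%c)/c
+// = 0 + 5*429496729 + 0 + 37/10
+// = 2147483645 + 3 = 2147483648, which is indeed (5<<32 + 7) / 10.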
+(Div64u x (Const64 [c])) && c > 0 && c <= 0xFFFF && umagicOK32(int32(c)) && config.RegSize == 4 && config.useHmul =>
+ (Add64
+ (Add64 <typ.UInt64>
+ (Add64 <typ.UInt64>
+ (Lsh64x64 <typ.UInt64>
+ (ZeroExt32to64
+ (Div32u <typ.UInt32>
+ (Trunc64to32 <typ.UInt32> (Rsh64Ux64 <typ.UInt64> x (Const64 <typ.UInt64> [32])))
+ (Const32 <typ.UInt32> [int32(c)])))
+ (Const64 <typ.UInt64> [32]))
+ (ZeroExt32to64 (Div32u <typ.UInt32> (Trunc64to32 <typ.UInt32> x) (Const32 <typ.UInt32> [int32(c)]))))
+ (Mul64 <typ.UInt64>
+ (ZeroExt32to64 <typ.UInt64>
+ (Mod32u <typ.UInt32>
+ (Trunc64to32 <typ.UInt32> (Rsh64Ux64 <typ.UInt64> x (Const64 <typ.UInt64> [32])))
+ (Const32 <typ.UInt32> [int32(c)])))
+ (Const64 <typ.UInt64> [int64((1<<32)/c)])))
+ (ZeroExt32to64
+ (Div32u <typ.UInt32>
+ (Add32 <typ.UInt32>
+ (Mod32u <typ.UInt32> (Trunc64to32 <typ.UInt32> x) (Const32 <typ.UInt32> [int32(c)]))
+ (Mul32 <typ.UInt32>
+ (Mod32u <typ.UInt32>
+ (Trunc64to32 <typ.UInt32> (Rsh64Ux64 <typ.UInt64> x (Const64 <typ.UInt64> [32])))
+ (Const32 <typ.UInt32> [int32(c)]))
+ (Const32 <typ.UInt32> [int32((1<<32)%c)])))
+ (Const32 <typ.UInt32> [int32(c)]))))
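
To sanity-check the derivation above, here is a minimal, hypothetical Go sketch (the function name div64viaDiv32 and the sample divisor 1000 are illustrative only, not part of the rules) that evaluates the three-divide decomposition and compares it against a direct 64-bit division:

package main

import "fmt"

// div64viaDiv32 mirrors the rewrite: with Q = (1<<32)/c and R = (1<<32)%c,
//   x/c = (hi/c)<<32 + (hi%c)*Q + lo/c + (hi%c*R + lo%c)/c
// where hi and lo are the high and low 32-bit halves of x. c must be a
// non-power-of-two in [2, 0xFFFF] so the last numerator fits in 32 bits.
func div64viaDiv32(x uint64, c uint32) uint64 {
	hi := uint32(x >> 32)
	lo := uint32(x)
	Q := uint32((uint64(1) << 32) / uint64(c))
	R := uint32((uint64(1) << 32) % uint64(c))
	return uint64(hi/c)<<32 +
		uint64(hi%c)*uint64(Q) +
		uint64(lo/c) +
		uint64((hi%c*R+lo%c)/c)
}

func main() {
	for _, x := range []uint64{0, 12345, 1 << 40, 0xFFFFFFFFFFFFFFFF} {
		fmt.Println(div64viaDiv32(x, 1000) == x/1000) // expect true for each
	}
}
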
+
+// For 64-bit divides on 64-bit machines
+// (64-bit divides on 32-bit machines are lowered to a runtime call by the walk pass.)
+(Div64u x (Const64 [c])) && umagicOK64(c) && config.RegSize == 8 && umagic64(c).m&1 == 0 && config.useHmul =>
+ (Rsh64Ux64 <typ.UInt64>
+ (Hmul64u <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(1<<63+umagic64(c).m/2)])
+ x)
+ (Const64 <typ.UInt64> [umagic64(c).s-1]))
+(Div64u x (Const64 [c])) && umagicOK64(c) && config.RegSize == 8 && c&1 == 0 && config.useHmul =>
+ (Rsh64Ux64 <typ.UInt64>
+ (Hmul64u <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(1<<63+(umagic64(c).m+1)/2)])
+ (Rsh64Ux64 <typ.UInt64> x (Const64 <typ.UInt64> [1])))
+ (Const64 <typ.UInt64> [umagic64(c).s-2]))
+(Div64u x (Const64 [c])) && umagicOK64(c) && config.RegSize == 8 && config.useAvg && config.useHmul =>
+ (Rsh64Ux64 <typ.UInt64>
+ (Avg64u
+ x
+ (Hmul64u <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(umagic64(c).m)])
+ x))
+ (Const64 <typ.UInt64> [umagic64(c).s-1]))
+
+// Signed divide by a negative constant. Rewrite to divide by a positive constant.
+(Div8 <t> n (Const8 [c])) && c < 0 && c != -1<<7 => (Neg8 (Div8 <t> n (Const8 <t> [-c])))
+(Div16 <t> n (Const16 [c])) && c < 0 && c != -1<<15 => (Neg16 (Div16 <t> n (Const16 <t> [-c])))
+(Div32 <t> n (Const32 [c])) && c < 0 && c != -1<<31 => (Neg32 (Div32 <t> n (Const32 <t> [-c])))
+(Div64 <t> n (Const64 [c])) && c < 0 && c != -1<<63 => (Neg64 (Div64 <t> n (Const64 <t> [-c])))
+
+// Dividing by the most-negative number. Result is always 0 except
+// if the input is also the most-negative number.
+// We can detect that using the sign bit of x & -x.
+(Div8 <t> x (Const8 [-1<<7 ])) => (Rsh8Ux64 (And8 <t> x (Neg8 <t> x)) (Const64 <typ.UInt64> [7 ]))
+(Div16 <t> x (Const16 [-1<<15])) => (Rsh16Ux64 (And16 <t> x (Neg16 <t> x)) (Const64 <typ.UInt64> [15]))
+(Div32 <t> x (Const32 [-1<<31])) => (Rsh32Ux64 (And32 <t> x (Neg32 <t> x)) (Const64 <typ.UInt64> [31]))
+(Div64 <t> x (Const64 [-1<<63])) => (Rsh64Ux64 (And64 <t> x (Neg64 <t> x)) (Const64 <typ.UInt64> [63]))
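
A small hypothetical check of the sign-bit trick for the 8-bit case (divByMinInt8 is an illustrative name, not part of the rules): x & -x isolates the lowest set bit of x, so its sign bit is set only when x itself is -1<<7, and shifting that bit down yields the 0-or-1 quotient.

package main

import "fmt"

func divByMinInt8(x int8) int8 {
	// x / -128 is 1 only when x == -128, and 0 otherwise.
	// x & -x keeps the lowest set bit; its sign bit is set only for x == -128.
	return int8(uint8(x&-x) >> 7)
}

func main() {
	for _, x := range []int8{-128, -127, -1, 0, 1, 127} {
		fmt.Println(x, divByMinInt8(x), x/(-128)) // last two columns match
	}
}
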
+
+// Signed divide by power of 2.
+// n / c = n >> log(c) if n >= 0
+// = (n+c-1) >> log(c) if n < 0
+// We conditionally add c-1 by adding n>>63>>(64-log(c)) (first shift signed, second shift unsigned).
+(Div8 <t> n (Const8 [c])) && isPowerOfTwo8(c) =>
+ (Rsh8x64
+ (Add8 <t> n (Rsh8Ux64 <t> (Rsh8x64 <t> n (Const64 <typ.UInt64> [ 7])) (Const64 <typ.UInt64> [int64( 8-log8(c))])))
+ (Const64 <typ.UInt64> [int64(log8(c))]))
+(Div16 <t> n (Const16 [c])) && isPowerOfTwo16(c) =>
+ (Rsh16x64
+ (Add16 <t> n (Rsh16Ux64 <t> (Rsh16x64 <t> n (Const64 <typ.UInt64> [15])) (Const64 <typ.UInt64> [int64(16-log16(c))])))
+ (Const64 <typ.UInt64> [int64(log16(c))]))
+(Div32 <t> n (Const32 [c])) && isPowerOfTwo32(c) =>
+ (Rsh32x64
+ (Add32 <t> n (Rsh32Ux64 <t> (Rsh32x64 <t> n (Const64 <typ.UInt64> [31])) (Const64 <typ.UInt64> [int64(32-log32(c))])))
+ (Const64 <typ.UInt64> [int64(log32(c))]))
+(Div64 <t> n (Const64 [c])) && isPowerOfTwo64(c) =>
+ (Rsh64x64
+ (Add64 <t> n (Rsh64Ux64 <t> (Rsh64x64 <t> n (Const64 <typ.UInt64> [63])) (Const64 <typ.UInt64> [int64(64-log64(c))])))
+ (Const64 <typ.UInt64> [int64(log64(c))]))
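
A hypothetical sketch of the bias trick for the 64-bit case (divPow2 is an illustrative name, not taken from the rules): the sign bit is replicated by the arithmetic shift and then shifted down logically to produce the conditional c-1 addend.

package main

import "fmt"

// divPow2 computes n / c for c = 1<<log2c using the rewrite above:
// add c-1 only when n is negative, by shifting the sign bit into the
// low log2c bits (first shift arithmetic, second shift logical).
func divPow2(n int64, log2c uint) int64 {
	bias := int64(uint64(n>>63) >> (64 - log2c)) // 0 if n >= 0, c-1 if n < 0
	return (n + bias) >> log2c
}

func main() {
	for _, n := range []int64{-9, -8, -7, -1, 0, 7, 9} {
		fmt.Println(n, divPow2(n, 3), n/8) // last two columns match
	}
}
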
+
+// Signed divide, not a power of 2. Strength reduce to a multiply.
+(Div8 <t> x (Const8 [c])) && smagicOK8(c) =>
+ (Sub8 <t>
+ (Rsh32x64 <t>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(smagic8(c).m)])
+ (SignExt8to32 x))
+ (Const64 <typ.UInt64> [8+smagic8(c).s]))
+ (Rsh32x64 <t>
+ (SignExt8to32 x)
+ (Const64 <typ.UInt64> [31])))
+(Div16 <t> x (Const16 [c])) && smagicOK16(c) =>
+ (Sub16 <t>
+ (Rsh32x64 <t>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(smagic16(c).m)])
+ (SignExt16to32 x))
+ (Const64 <typ.UInt64> [16+smagic16(c).s]))
+ (Rsh32x64 <t>
+ (SignExt16to32 x)
+ (Const64 <typ.UInt64> [31])))
+(Div32 <t> x (Const32 [c])) && smagicOK32(c) && config.RegSize == 8 =>
+ (Sub32 <t>
+ (Rsh64x64 <t>
+ (Mul64 <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(smagic32(c).m)])
+ (SignExt32to64 x))
+ (Const64 <typ.UInt64> [32+smagic32(c).s]))
+ (Rsh64x64 <t>
+ (SignExt32to64 x)
+ (Const64 <typ.UInt64> [63])))
+(Div32 <t> x (Const32 [c])) && smagicOK32(c) && config.RegSize == 4 && smagic32(c).m&1 == 0 && config.useHmul =>
+ (Sub32 <t>
+ (Rsh32x64 <t>
+ (Hmul32 <t>
+ (Const32 <typ.UInt32> [int32(smagic32(c).m/2)])
+ x)
+ (Const64 <typ.UInt64> [smagic32(c).s-1]))
+ (Rsh32x64 <t>
+ x
+ (Const64 <typ.UInt64> [31])))
+(Div32 <t> x (Const32 [c])) && smagicOK32(c) && config.RegSize == 4 && smagic32(c).m&1 != 0 && config.useHmul =>
+ (Sub32 <t>
+ (Rsh32x64 <t>
+ (Add32 <t>
+ (Hmul32 <t>
+ (Const32 <typ.UInt32> [int32(smagic32(c).m)])
+ x)
+ x)
+ (Const64 <typ.UInt64> [smagic32(c).s]))
+ (Rsh32x64 <t>
+ x
+ (Const64 <typ.UInt64> [31])))
+(Div64 <t> x (Const64 [c])) && smagicOK64(c) && smagic64(c).m&1 == 0 && config.useHmul =>
+ (Sub64 <t>
+ (Rsh64x64 <t>
+ (Hmul64 <t>
+ (Const64 <typ.UInt64> [int64(smagic64(c).m/2)])
+ x)
+ (Const64 <typ.UInt64> [smagic64(c).s-1]))
+ (Rsh64x64 <t>
+ x
+ (Const64 <typ.UInt64> [63])))
+(Div64 <t> x (Const64 [c])) && smagicOK64(c) && smagic64(c).m&1 != 0 && config.useHmul =>
+ (Sub64 <t>
+ (Rsh64x64 <t>
+ (Add64 <t>
+ (Hmul64 <t>
+ (Const64 <typ.UInt64> [int64(smagic64(c).m)])
+ x)
+ x)
+ (Const64 <typ.UInt64> [smagic64(c).s]))
+ (Rsh64x64 <t>
+ x
+ (Const64 <typ.UInt64> [63])))
+
+// Unsigned mod by power of 2 constant.
+(Mod8u <t> n (Const8 [c])) && isPowerOfTwo8(c) => (And8 n (Const8 <t> [c-1]))
+(Mod16u <t> n (Const16 [c])) && isPowerOfTwo16(c) => (And16 n (Const16 <t> [c-1]))
+(Mod32u <t> n (Const32 [c])) && isPowerOfTwo32(c) => (And32 n (Const32 <t> [c-1]))
+(Mod64u <t> n (Const64 [c])) && isPowerOfTwo64(c) => (And64 n (Const64 <t> [c-1]))
+(Mod64u <t> n (Const64 [-1<<63])) => (And64 n (Const64 <t> [1<<63-1]))
+
+// Signed non-negative mod by power of 2 constant.
+(Mod8 <t> n (Const8 [c])) && isNonNegative(n) && isPowerOfTwo8(c) => (And8 n (Const8 <t> [c-1]))
+(Mod16 <t> n (Const16 [c])) && isNonNegative(n) && isPowerOfTwo16(c) => (And16 n (Const16 <t> [c-1]))
+(Mod32 <t> n (Const32 [c])) && isNonNegative(n) && isPowerOfTwo32(c) => (And32 n (Const32 <t> [c-1]))
+(Mod64 <t> n (Const64 [c])) && isNonNegative(n) && isPowerOfTwo64(c) => (And64 n (Const64 <t> [c-1]))
+(Mod64 n (Const64 [-1<<63])) && isNonNegative(n) => n
+
+// Signed mod by negative constant.
+(Mod8 <t> n (Const8 [c])) && c < 0 && c != -1<<7 => (Mod8 <t> n (Const8 <t> [-c]))
+(Mod16 <t> n (Const16 [c])) && c < 0 && c != -1<<15 => (Mod16 <t> n (Const16 <t> [-c]))
+(Mod32 <t> n (Const32 [c])) && c < 0 && c != -1<<31 => (Mod32 <t> n (Const32 <t> [-c]))
+(Mod64 <t> n (Const64 [c])) && c < 0 && c != -1<<63 => (Mod64 <t> n (Const64 <t> [-c]))
+
+// For all other mods by constants, compute A%B as A-(A/B*B).
+// This implements % with two * and a bunch of ancillary ops.
+// One of the * is free if the user's code also computes A/B.
+(Mod8 <t> x (Const8 [c])) && x.Op != OpConst8 && (c > 0 || c == -1<<7)
+ => (Sub8 x (Mul8 <t> (Div8 <t> x (Const8 <t> [c])) (Const8 <t> [c])))
+(Mod16 <t> x (Const16 [c])) && x.Op != OpConst16 && (c > 0 || c == -1<<15)
+ => (Sub16 x (Mul16 <t> (Div16 <t> x (Const16 <t> [c])) (Const16 <t> [c])))
+(Mod32 <t> x (Const32 [c])) && x.Op != OpConst32 && (c > 0 || c == -1<<31)
+ => (Sub32 x (Mul32 <t> (Div32 <t> x (Const32 <t> [c])) (Const32 <t> [c])))
+(Mod64 <t> x (Const64 [c])) && x.Op != OpConst64 && (c > 0 || c == -1<<63)
+ => (Sub64 x (Mul64 <t> (Div64 <t> x (Const64 <t> [c])) (Const64 <t> [c])))
+(Mod8u <t> x (Const8 [c])) && x.Op != OpConst8 && c > 0 && umagicOK8( c)
+ => (Sub8 x (Mul8 <t> (Div8u <t> x (Const8 <t> [c])) (Const8 <t> [c])))
+(Mod16u <t> x (Const16 [c])) && x.Op != OpConst16 && c > 0 && umagicOK16(c)
+ => (Sub16 x (Mul16 <t> (Div16u <t> x (Const16 <t> [c])) (Const16 <t> [c])))
+(Mod32u <t> x (Const32 [c])) && x.Op != OpConst32 && c > 0 && umagicOK32(c)
+ => (Sub32 x (Mul32 <t> (Div32u <t> x (Const32 <t> [c])) (Const32 <t> [c])))
+(Mod64u <t> x (Const64 [c])) && x.Op != OpConst64 && c > 0 && umagicOK64(c)
+ => (Sub64 x (Mul64 <t> (Div64u <t> x (Const64 <t> [c])) (Const64 <t> [c])))
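
A quick illustration in plain Go (not the rules themselves) of the identity being used, and of why one of the two multiplies is free when the caller also needs the quotient:

package main

import "fmt"

func main() {
	// a % b computed as a - (a/b)*b; with a constant b the division itself is
	// strength-reduced by the rules above, so the mod costs two multiplies.
	a, b := int32(-1234567), int32(7)
	q := a / b
	fmt.Println(a-q*b == a%b, q) // true; q is shared if a/b is also needed
}
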
+
+// For architectures without rotates on less than 32-bits, promote these checks to 32-bit.
+(Eq8 (Mod8u x (Const8 [c])) (Const8 [0])) && x.Op != OpConst8 && udivisibleOK8(c) && !hasSmallRotate(config) =>
+ (Eq32 (Mod32u <typ.UInt32> (ZeroExt8to32 <typ.UInt32> x) (Const32 <typ.UInt32> [int32(uint8(c))])) (Const32 <typ.UInt32> [0]))
+(Eq16 (Mod16u x (Const16 [c])) (Const16 [0])) && x.Op != OpConst16 && udivisibleOK16(c) && !hasSmallRotate(config) =>
+ (Eq32 (Mod32u <typ.UInt32> (ZeroExt16to32 <typ.UInt32> x) (Const32 <typ.UInt32> [int32(uint16(c))])) (Const32 <typ.UInt32> [0]))
+(Eq8 (Mod8 x (Const8 [c])) (Const8 [0])) && x.Op != OpConst8 && sdivisibleOK8(c) && !hasSmallRotate(config) =>
+ (Eq32 (Mod32 <typ.Int32> (SignExt8to32 <typ.Int32> x) (Const32 <typ.Int32> [int32(c)])) (Const32 <typ.Int32> [0]))
+(Eq16 (Mod16 x (Const16 [c])) (Const16 [0])) && x.Op != OpConst16 && sdivisibleOK16(c) && !hasSmallRotate(config) =>
+ (Eq32 (Mod32 <typ.Int32> (SignExt16to32 <typ.Int32> x) (Const32 <typ.Int32> [int32(c)])) (Const32 <typ.Int32> [0]))
+
+// Divisibility checks x%c == 0 convert to multiply and rotate.
+// Note, x%c == 0 is rewritten as x == c*(x/c) during the opt pass
+// where (x/c) is performed using multiplication with magic constants.
+// To rewrite x%c == 0 requires pattern matching the rewritten expression
+// and checking that the division by the same constant wasn't already calculated.
+// This check is made by counting uses of the magic constant multiplication.
+// Note that if there were an intermediate opt pass, this rule could be applied
+// directly on the Div op and magic division rewrites could be delayed to late opt.
+
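
A hypothetical worked example of the multiply-and-rotate check for one concrete divisor, c = 12 = 3<<2. The constants below are computed by hand for illustration and simply play the same roles as the m, k and max values used by the rules; they are not the output of the udivisible helpers.

package main

import (
	"fmt"
	"math/bits"
)

// For c = 12:
//   m   = multiplicative inverse of the odd part 3, mod 2^32 (3*m == 1 mod 2^32)
//   k   = number of trailing zero bits of c
//   max = (2^32 - 1) / c
// Then x%12 == 0 exactly when rotateRight(x*m, k) <= max.
const (
	m   uint32 = 0xAAAAAAAB
	k          = 2
	max uint32 = (1<<32 - 1) / 12
)

func divisibleBy12(x uint32) bool {
	return bits.RotateLeft32(x*m, -k) <= max // rotate right by k
}

func main() {
	for x := uint32(0); x < 1000; x++ {
		if divisibleBy12(x) != (x%12 == 0) {
			fmt.Println("mismatch at", x)
		}
	}
	fmt.Println("ok")
}
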
+// Unsigned divisibility checks convert to multiply and rotate.
+(Eq8 x (Mul8 (Const8 [c])
+ (Trunc32to8
+ (Rsh32Ux64
+ mul:(Mul32
+ (Const32 [m])
+ (ZeroExt8to32 x))
+ (Const64 [s])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int32(1<<8+umagic8(c).m) && s == 8+umagic8(c).s
+ && x.Op != OpConst8 && udivisibleOK8(c)
+ => (Leq8U
+ (RotateLeft8 <typ.UInt8>
+ (Mul8 <typ.UInt8>
+ (Const8 <typ.UInt8> [int8(udivisible8(c).m)])
+ x)
+ (Const8 <typ.UInt8> [int8(8-udivisible8(c).k)])
+ )
+ (Const8 <typ.UInt8> [int8(udivisible8(c).max)])
+ )
+
+(Eq16 x (Mul16 (Const16 [c])
+ (Trunc64to16
+ (Rsh64Ux64
+ mul:(Mul64
+ (Const64 [m])
+ (ZeroExt16to64 x))
+ (Const64 [s])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int64(1<<16+umagic16(c).m) && s == 16+umagic16(c).s
+ && x.Op != OpConst16 && udivisibleOK16(c)
+ => (Leq16U
+ (RotateLeft16 <typ.UInt16>
+ (Mul16 <typ.UInt16>
+ (Const16 <typ.UInt16> [int16(udivisible16(c).m)])
+ x)
+ (Const16 <typ.UInt16> [int16(16-udivisible16(c).k)])
+ )
+ (Const16 <typ.UInt16> [int16(udivisible16(c).max)])
+ )
+
+(Eq16 x (Mul16 (Const16 [c])
+ (Trunc32to16
+ (Rsh32Ux64
+ mul:(Mul32
+ (Const32 [m])
+ (ZeroExt16to32 x))
+ (Const64 [s])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int32(1<<15+umagic16(c).m/2) && s == 16+umagic16(c).s-1
+ && x.Op != OpConst16 && udivisibleOK16(c)
+ => (Leq16U
+ (RotateLeft16 <typ.UInt16>
+ (Mul16 <typ.UInt16>
+ (Const16 <typ.UInt16> [int16(udivisible16(c).m)])
+ x)
+ (Const16 <typ.UInt16> [int16(16-udivisible16(c).k)])
+ )
+ (Const16 <typ.UInt16> [int16(udivisible16(c).max)])
+ )
+
+(Eq16 x (Mul16 (Const16 [c])
+ (Trunc32to16
+ (Rsh32Ux64
+ mul:(Mul32
+ (Const32 [m])
+ (Rsh32Ux64 (ZeroExt16to32 x) (Const64 [1])))
+ (Const64 [s])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int32(1<<15+(umagic16(c).m+1)/2) && s == 16+umagic16(c).s-2
+ && x.Op != OpConst16 && udivisibleOK16(c)
+ => (Leq16U
+ (RotateLeft16 <typ.UInt16>
+ (Mul16 <typ.UInt16>
+ (Const16 <typ.UInt16> [int16(udivisible16(c).m)])
+ x)
+ (Const16 <typ.UInt16> [int16(16-udivisible16(c).k)])
+ )
+ (Const16 <typ.UInt16> [int16(udivisible16(c).max)])
+ )
+
+(Eq16 x (Mul16 (Const16 [c])
+ (Trunc32to16
+ (Rsh32Ux64
+ (Avg32u
+ (Lsh32x64 (ZeroExt16to32 x) (Const64 [16]))
+ mul:(Mul32
+ (Const32 [m])
+ (ZeroExt16to32 x)))
+ (Const64 [s])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int32(umagic16(c).m) && s == 16+umagic16(c).s-1
+ && x.Op != OpConst16 && udivisibleOK16(c)
+ => (Leq16U
+ (RotateLeft16 <typ.UInt16>
+ (Mul16 <typ.UInt16>
+ (Const16 <typ.UInt16> [int16(udivisible16(c).m)])
+ x)
+ (Const16 <typ.UInt16> [int16(16-udivisible16(c).k)])
+ )
+ (Const16 <typ.UInt16> [int16(udivisible16(c).max)])
+ )
+
+(Eq32 x (Mul32 (Const32 [c])
+ (Rsh32Ux64
+ mul:(Hmul32u
+ (Const32 [m])
+ x)
+ (Const64 [s]))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int32(1<<31+umagic32(c).m/2) && s == umagic32(c).s-1
+ && x.Op != OpConst32 && udivisibleOK32(c)
+ => (Leq32U
+ (RotateLeft32 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(udivisible32(c).m)])
+ x)
+ (Const32 <typ.UInt32> [int32(32-udivisible32(c).k)])
+ )
+ (Const32 <typ.UInt32> [int32(udivisible32(c).max)])
+ )
+
+(Eq32 x (Mul32 (Const32 [c])
+ (Rsh32Ux64
+ mul:(Hmul32u
+ (Const32 <typ.UInt32> [m])
+ (Rsh32Ux64 x (Const64 [1])))
+ (Const64 [s]))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int32(1<<31+(umagic32(c).m+1)/2) && s == umagic32(c).s-2
+ && x.Op != OpConst32 && udivisibleOK32(c)
+ => (Leq32U
+ (RotateLeft32 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(udivisible32(c).m)])
+ x)
+ (Const32 <typ.UInt32> [int32(32-udivisible32(c).k)])
+ )
+ (Const32 <typ.UInt32> [int32(udivisible32(c).max)])
+ )
+
+(Eq32 x (Mul32 (Const32 [c])
+ (Rsh32Ux64
+ (Avg32u
+ x
+ mul:(Hmul32u
+ (Const32 [m])
+ x))
+ (Const64 [s]))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int32(umagic32(c).m) && s == umagic32(c).s-1
+ && x.Op != OpConst32 && udivisibleOK32(c)
+ => (Leq32U
+ (RotateLeft32 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(udivisible32(c).m)])
+ x)
+ (Const32 <typ.UInt32> [int32(32-udivisible32(c).k)])
+ )
+ (Const32 <typ.UInt32> [int32(udivisible32(c).max)])
+ )
+
+(Eq32 x (Mul32 (Const32 [c])
+ (Trunc64to32
+ (Rsh64Ux64
+ mul:(Mul64
+ (Const64 [m])
+ (ZeroExt32to64 x))
+ (Const64 [s])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int64(1<<31+umagic32(c).m/2) && s == 32+umagic32(c).s-1
+ && x.Op != OpConst32 && udivisibleOK32(c)
+ => (Leq32U
+ (RotateLeft32 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(udivisible32(c).m)])
+ x)
+ (Const32 <typ.UInt32> [int32(32-udivisible32(c).k)])
+ )
+ (Const32 <typ.UInt32> [int32(udivisible32(c).max)])
+ )
+
+(Eq32 x (Mul32 (Const32 [c])
+ (Trunc64to32
+ (Rsh64Ux64
+ mul:(Mul64
+ (Const64 [m])
+ (Rsh64Ux64 (ZeroExt32to64 x) (Const64 [1])))
+ (Const64 [s])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int64(1<<31+(umagic32(c).m+1)/2) && s == 32+umagic32(c).s-2
+ && x.Op != OpConst32 && udivisibleOK32(c)
+ => (Leq32U
+ (RotateLeft32 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(udivisible32(c).m)])
+ x)
+ (Const32 <typ.UInt32> [int32(32-udivisible32(c).k)])
+ )
+ (Const32 <typ.UInt32> [int32(udivisible32(c).max)])
+ )
+
+(Eq32 x (Mul32 (Const32 [c])
+ (Trunc64to32
+ (Rsh64Ux64
+ (Avg64u
+ (Lsh64x64 (ZeroExt32to64 x) (Const64 [32]))
+ mul:(Mul64
+ (Const64 [m])
+ (ZeroExt32to64 x)))
+ (Const64 [s])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int64(umagic32(c).m) && s == 32+umagic32(c).s-1
+ && x.Op != OpConst32 && udivisibleOK32(c)
+ => (Leq32U
+ (RotateLeft32 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(udivisible32(c).m)])
+ x)
+ (Const32 <typ.UInt32> [int32(32-udivisible32(c).k)])
+ )
+ (Const32 <typ.UInt32> [int32(udivisible32(c).max)])
+ )
+
+(Eq64 x (Mul64 (Const64 [c])
+ (Rsh64Ux64
+ mul:(Hmul64u
+ (Const64 [m])
+ x)
+ (Const64 [s]))
+ )
+) && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int64(1<<63+umagic64(c).m/2) && s == umagic64(c).s-1
+ && x.Op != OpConst64 && udivisibleOK64(c)
+ => (Leq64U
+ (RotateLeft64 <typ.UInt64>
+ (Mul64 <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(udivisible64(c).m)])
+ x)
+ (Const64 <typ.UInt64> [64-udivisible64(c).k])
+ )
+ (Const64 <typ.UInt64> [int64(udivisible64(c).max)])
+ )
+(Eq64 x (Mul64 (Const64 [c])
+ (Rsh64Ux64
+ mul:(Hmul64u
+ (Const64 [m])
+ (Rsh64Ux64 x (Const64 [1])))
+ (Const64 [s]))
+ )
+) && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int64(1<<63+(umagic64(c).m+1)/2) && s == umagic64(c).s-2
+ && x.Op != OpConst64 && udivisibleOK64(c)
+ => (Leq64U
+ (RotateLeft64 <typ.UInt64>
+ (Mul64 <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(udivisible64(c).m)])
+ x)
+ (Const64 <typ.UInt64> [64-udivisible64(c).k])
+ )
+ (Const64 <typ.UInt64> [int64(udivisible64(c).max)])
+ )
+(Eq64 x (Mul64 (Const64 [c])
+ (Rsh64Ux64
+ (Avg64u
+ x
+ mul:(Hmul64u
+ (Const64 [m])
+ x))
+ (Const64 [s]))
+ )
+) && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int64(umagic64(c).m) && s == umagic64(c).s-1
+ && x.Op != OpConst64 && udivisibleOK64(c)
+ => (Leq64U
+ (RotateLeft64 <typ.UInt64>
+ (Mul64 <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(udivisible64(c).m)])
+ x)
+ (Const64 <typ.UInt64> [64-udivisible64(c).k])
+ )
+ (Const64 <typ.UInt64> [int64(udivisible64(c).max)])
+ )
+
+// Signed divisibility checks convert to multiply, add and rotate.
+(Eq8 x (Mul8 (Const8 [c])
+ (Sub8
+ (Rsh32x64
+ mul:(Mul32
+ (Const32 [m])
+ (SignExt8to32 x))
+ (Const64 [s]))
+ (Rsh32x64
+ (SignExt8to32 x)
+ (Const64 [31])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int32(smagic8(c).m) && s == 8+smagic8(c).s
+ && x.Op != OpConst8 && sdivisibleOK8(c)
+ => (Leq8U
+ (RotateLeft8 <typ.UInt8>
+ (Add8 <typ.UInt8>
+ (Mul8 <typ.UInt8>
+ (Const8 <typ.UInt8> [int8(sdivisible8(c).m)])
+ x)
+ (Const8 <typ.UInt8> [int8(sdivisible8(c).a)])
+ )
+ (Const8 <typ.UInt8> [int8(8-sdivisible8(c).k)])
+ )
+ (Const8 <typ.UInt8> [int8(sdivisible8(c).max)])
+ )
+
+(Eq16 x (Mul16 (Const16 [c])
+ (Sub16
+ (Rsh32x64
+ mul:(Mul32
+ (Const32 [m])
+ (SignExt16to32 x))
+ (Const64 [s]))
+ (Rsh32x64
+ (SignExt16to32 x)
+ (Const64 [31])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int32(smagic16(c).m) && s == 16+smagic16(c).s
+ && x.Op != OpConst16 && sdivisibleOK16(c)
+ => (Leq16U
+ (RotateLeft16 <typ.UInt16>
+ (Add16 <typ.UInt16>
+ (Mul16 <typ.UInt16>
+ (Const16 <typ.UInt16> [int16(sdivisible16(c).m)])
+ x)
+ (Const16 <typ.UInt16> [int16(sdivisible16(c).a)])
+ )
+ (Const16 <typ.UInt16> [int16(16-sdivisible16(c).k)])
+ )
+ (Const16 <typ.UInt16> [int16(sdivisible16(c).max)])
+ )
+
+(Eq32 x (Mul32 (Const32 [c])
+ (Sub32
+ (Rsh64x64
+ mul:(Mul64
+ (Const64 [m])
+ (SignExt32to64 x))
+ (Const64 [s]))
+ (Rsh64x64
+ (SignExt32to64 x)
+ (Const64 [63])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int64(smagic32(c).m) && s == 32+smagic32(c).s
+ && x.Op != OpConst32 && sdivisibleOK32(c)
+ => (Leq32U
+ (RotateLeft32 <typ.UInt32>
+ (Add32 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(sdivisible32(c).m)])
+ x)
+ (Const32 <typ.UInt32> [int32(sdivisible32(c).a)])
+ )
+ (Const32 <typ.UInt32> [int32(32-sdivisible32(c).k)])
+ )
+ (Const32 <typ.UInt32> [int32(sdivisible32(c).max)])
+ )
+
+(Eq32 x (Mul32 (Const32 [c])
+ (Sub32
+ (Rsh32x64
+ mul:(Hmul32
+ (Const32 [m])
+ x)
+ (Const64 [s]))
+ (Rsh32x64
+ x
+ (Const64 [31])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int32(smagic32(c).m/2) && s == smagic32(c).s-1
+ && x.Op != OpConst32 && sdivisibleOK32(c)
+ => (Leq32U
+ (RotateLeft32 <typ.UInt32>
+ (Add32 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(sdivisible32(c).m)])
+ x)
+ (Const32 <typ.UInt32> [int32(sdivisible32(c).a)])
+ )
+ (Const32 <typ.UInt32> [int32(32-sdivisible32(c).k)])
+ )
+ (Const32 <typ.UInt32> [int32(sdivisible32(c).max)])
+ )
+
+(Eq32 x (Mul32 (Const32 [c])
+ (Sub32
+ (Rsh32x64
+ (Add32
+ mul:(Hmul32
+ (Const32 [m])
+ x)
+ x)
+ (Const64 [s]))
+ (Rsh32x64
+ x
+ (Const64 [31])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int32(smagic32(c).m) && s == smagic32(c).s
+ && x.Op != OpConst32 && sdivisibleOK32(c)
+ => (Leq32U
+ (RotateLeft32 <typ.UInt32>
+ (Add32 <typ.UInt32>
+ (Mul32 <typ.UInt32>
+ (Const32 <typ.UInt32> [int32(sdivisible32(c).m)])
+ x)
+ (Const32 <typ.UInt32> [int32(sdivisible32(c).a)])
+ )
+ (Const32 <typ.UInt32> [int32(32-sdivisible32(c).k)])
+ )
+ (Const32 <typ.UInt32> [int32(sdivisible32(c).max)])
+ )
+
+(Eq64 x (Mul64 (Const64 [c])
+ (Sub64
+ (Rsh64x64
+ mul:(Hmul64
+ (Const64 [m])
+ x)
+ (Const64 [s]))
+ (Rsh64x64
+ x
+ (Const64 [63])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int64(smagic64(c).m/2) && s == smagic64(c).s-1
+ && x.Op != OpConst64 && sdivisibleOK64(c)
+ => (Leq64U
+ (RotateLeft64 <typ.UInt64>
+ (Add64 <typ.UInt64>
+ (Mul64 <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(sdivisible64(c).m)])
+ x)
+ (Const64 <typ.UInt64> [int64(sdivisible64(c).a)])
+ )
+ (Const64 <typ.UInt64> [64-sdivisible64(c).k])
+ )
+ (Const64 <typ.UInt64> [int64(sdivisible64(c).max)])
+ )
+
+(Eq64 x (Mul64 (Const64 [c])
+ (Sub64
+ (Rsh64x64
+ (Add64
+ mul:(Hmul64
+ (Const64 [m])
+ x)
+ x)
+ (Const64 [s]))
+ (Rsh64x64
+ x
+ (Const64 [63])))
+ )
+)
+ && v.Block.Func.pass.name != "opt" && mul.Uses == 1
+ && m == int64(smagic64(c).m) && s == smagic64(c).s
+ && x.Op != OpConst64 && sdivisibleOK64(c)
+ => (Leq64U
+ (RotateLeft64 <typ.UInt64>
+ (Add64 <typ.UInt64>
+ (Mul64 <typ.UInt64>
+ (Const64 <typ.UInt64> [int64(sdivisible64(c).m)])
+ x)
+ (Const64 <typ.UInt64> [int64(sdivisible64(c).a)])
+ )
+ (Const64 <typ.UInt64> [64-sdivisible64(c).k])
+ )
+ (Const64 <typ.UInt64> [int64(sdivisible64(c).max)])
+ )
+
+// Divisibility checks for signed integers by power-of-two constants are a simple mask.
+// However, we must match against the rewritten n%c == 0 -> n - c*(n/c) == 0 -> n == c*(n/c)
+// where n/c contains fixup code to handle signed n.
+((Eq8|Neq8) n (Lsh8x64
+ (Rsh8x64
+ (Add8 <t> n (Rsh8Ux64 <t> (Rsh8x64 <t> n (Const64 <typ.UInt64> [ 7])) (Const64 <typ.UInt64> [kbar])))
+ (Const64 <typ.UInt64> [k]))
+ (Const64 <typ.UInt64> [k]))
+) && k > 0 && k < 7 && kbar == 8 - k
+ => ((Eq8|Neq8) (And8 <t> n (Const8 <t> [1<<uint(k)-1])) (Const8 <t> [0]))
+
+((Eq16|Neq16) n (Lsh16x64
+ (Rsh16x64
+ (Add16 <t> n (Rsh16Ux64 <t> (Rsh16x64 <t> n (Const64 <typ.UInt64> [15])) (Const64 <typ.UInt64> [kbar])))
+ (Const64 <typ.UInt64> [k]))
+ (Const64 <typ.UInt64> [k]))
+) && k > 0 && k < 15 && kbar == 16 - k
+ => ((Eq16|Neq16) (And16 <t> n (Const16 <t> [1<<uint(k)-1])) (Const16 <t> [0]))
+
+((Eq32|Neq32) n (Lsh32x64
+ (Rsh32x64
+ (Add32 <t> n (Rsh32Ux64 <t> (Rsh32x64 <t> n (Const64 <typ.UInt64> [31])) (Const64 <typ.UInt64> [kbar])))
+ (Const64 <typ.UInt64> [k]))
+ (Const64 <typ.UInt64> [k]))
+) && k > 0 && k < 31 && kbar == 32 - k
+ => ((Eq32|Neq32) (And32 <t> n (Const32 <t> [1<<uint(k)-1])) (Const32 <t> [0]))
+
+((Eq64|Neq64) n (Lsh64x64
+ (Rsh64x64
+ (Add64 <t> n (Rsh64Ux64 <t> (Rsh64x64 <t> n (Const64 <typ.UInt64> [63])) (Const64 <typ.UInt64> [kbar])))
+ (Const64 <typ.UInt64> [k]))
+ (Const64 <typ.UInt64> [k]))
+) && k > 0 && k < 63 && kbar == 64 - k
+ => ((Eq64|Neq64) (And64 <t> n (Const64 <t> [1<<uint(k)-1])) (Const64 <t> [0]))
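
The net effect of these four rules is the familiar low-bits test, which holds for negative values as well; a short hypothetical check in plain Go:

package main

import "fmt"

func main() {
	// For a power-of-two c, n%c == 0 depends only on the low log2(c) bits,
	// including for negative n, so the signed fixup sequence collapses to a mask.
	const c = 8
	for _, n := range []int32{-32, -9, -8, -1, 0, 7, 8, 24} {
		fmt.Println(n, n%c == 0, n&(c-1) == 0) // last two columns agree
	}
}
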
+
+(Eq(8|16|32|64) s:(Sub(8|16|32|64) x y) (Const(8|16|32|64) [0])) && s.Uses == 1 => (Eq(8|16|32|64) x y)
+(Neq(8|16|32|64) s:(Sub(8|16|32|64) x y) (Const(8|16|32|64) [0])) && s.Uses == 1 => (Neq(8|16|32|64) x y)
+
+// Optimize bitsets
+(Eq8 (And8 <t> x (Const8 <t> [y])) (Const8 <t> [y])) && oneBit8(y)
+ => (Neq8 (And8 <t> x (Const8 <t> [y])) (Const8 <t> [0]))
+(Eq16 (And16 <t> x (Const16 <t> [y])) (Const16 <t> [y])) && oneBit16(y)
+ => (Neq16 (And16 <t> x (Const16 <t> [y])) (Const16 <t> [0]))
+(Eq32 (And32 <t> x (Const32 <t> [y])) (Const32 <t> [y])) && oneBit32(y)
+ => (Neq32 (And32 <t> x (Const32 <t> [y])) (Const32 <t> [0]))
+(Eq64 (And64 <t> x (Const64 <t> [y])) (Const64 <t> [y])) && oneBit64(y)
+ => (Neq64 (And64 <t> x (Const64 <t> [y])) (Const64 <t> [0]))
+(Neq8 (And8 <t> x (Const8 <t> [y])) (Const8 <t> [y])) && oneBit8(y)
+ => (Eq8 (And8 <t> x (Const8 <t> [y])) (Const8 <t> [0]))
+(Neq16 (And16 <t> x (Const16 <t> [y])) (Const16 <t> [y])) && oneBit16(y)
+ => (Eq16 (And16 <t> x (Const16 <t> [y])) (Const16 <t> [0]))
+(Neq32 (And32 <t> x (Const32 <t> [y])) (Const32 <t> [y])) && oneBit32(y)
+ => (Eq32 (And32 <t> x (Const32 <t> [y])) (Const32 <t> [0]))
+(Neq64 (And64 <t> x (Const64 <t> [y])) (Const64 <t> [y])) && oneBit64(y)
+ => (Eq64 (And64 <t> x (Const64 <t> [y])) (Const64 <t> [0]))
+
+// Reassociate expressions involving
+// constants such that constants come first,
+// exposing obvious constant-folding opportunities.
+// Reassociate (op (op y C) x) to (op C (op x y)) or similar, where C
+// is constant, which pushes constants to the outside
+// of the expression. At that point, any constant-folding
+// opportunities should be obvious.
+// Note: don't include AddPtr here! In order to maintain the
+// invariant that pointers must stay within the pointed-to object,
+// we can't pull part of a pointer computation above the AddPtr.
+// See issue 37881.
+// Note: we don't need to handle any (x-C) cases because we already rewrite
+// (x-C) to (x+(-C)).
+
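
A hypothetical illustration of the payoff in plain Go source (not a rule): once the rules below push constants outward, the C + (D + x) rules further down can fold them into a single constant.

package main

import "fmt"

// f relies only on associativity and commutativity of +; after the
// reassociation rules rewrite x + (y + 3) + 5 toward 3 + ((x + y) + 5),
// the constant-folding rules can combine 3 and 5 into 8.
func f(x, y int64) int64 {
	return (x + (y + 3)) + 5
}

func main() {
	fmt.Println(f(1, 2), 1+2+8) // both 11
}
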
+// x + (C + z) -> C + (x + z)
+(Add64 (Add64 i:(Const64 <t>) z) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (Add64 i (Add64 <t> z x))
+(Add32 (Add32 i:(Const32 <t>) z) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (Add32 i (Add32 <t> z x))
+(Add16 (Add16 i:(Const16 <t>) z) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (Add16 i (Add16 <t> z x))
+(Add8 (Add8 i:(Const8 <t>) z) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (Add8 i (Add8 <t> z x))
+
+// x + (C - z) -> C + (x - z)
+(Add64 (Sub64 i:(Const64 <t>) z) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (Add64 i (Sub64 <t> x z))
+(Add32 (Sub32 i:(Const32 <t>) z) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (Add32 i (Sub32 <t> x z))
+(Add16 (Sub16 i:(Const16 <t>) z) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (Add16 i (Sub16 <t> x z))
+(Add8 (Sub8 i:(Const8 <t>) z) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (Add8 i (Sub8 <t> x z))
+
+// x - (C - z) -> x + (z - C) -> (x + z) - C
+(Sub64 x (Sub64 i:(Const64 <t>) z)) && (z.Op != OpConst64 && x.Op != OpConst64) => (Sub64 (Add64 <t> x z) i)
+(Sub32 x (Sub32 i:(Const32 <t>) z)) && (z.Op != OpConst32 && x.Op != OpConst32) => (Sub32 (Add32 <t> x z) i)
+(Sub16 x (Sub16 i:(Const16 <t>) z)) && (z.Op != OpConst16 && x.Op != OpConst16) => (Sub16 (Add16 <t> x z) i)
+(Sub8 x (Sub8 i:(Const8 <t>) z)) && (z.Op != OpConst8 && x.Op != OpConst8) => (Sub8 (Add8 <t> x z) i)
+
+// x - (z + C) -> x + (-z - C) -> (x - z) - C
+(Sub64 x (Add64 z i:(Const64 <t>))) && (z.Op != OpConst64 && x.Op != OpConst64) => (Sub64 (Sub64 <t> x z) i)
+(Sub32 x (Add32 z i:(Const32 <t>))) && (z.Op != OpConst32 && x.Op != OpConst32) => (Sub32 (Sub32 <t> x z) i)
+(Sub16 x (Add16 z i:(Const16 <t>))) && (z.Op != OpConst16 && x.Op != OpConst16) => (Sub16 (Sub16 <t> x z) i)
+(Sub8 x (Add8 z i:(Const8 <t>))) && (z.Op != OpConst8 && x.Op != OpConst8) => (Sub8 (Sub8 <t> x z) i)
+
+// (C - z) - x -> C - (z + x)
+(Sub64 (Sub64 i:(Const64 <t>) z) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (Sub64 i (Add64 <t> z x))
+(Sub32 (Sub32 i:(Const32 <t>) z) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (Sub32 i (Add32 <t> z x))
+(Sub16 (Sub16 i:(Const16 <t>) z) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (Sub16 i (Add16 <t> z x))
+(Sub8 (Sub8 i:(Const8 <t>) z) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (Sub8 i (Add8 <t> z x))
+
+// (z + C) - x -> C + (z - x)
+(Sub64 (Add64 z i:(Const64 <t>)) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (Add64 i (Sub64 <t> z x))
+(Sub32 (Add32 z i:(Const32 <t>)) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (Add32 i (Sub32 <t> z x))
+(Sub16 (Add16 z i:(Const16 <t>)) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (Add16 i (Sub16 <t> z x))
+(Sub8 (Add8 z i:(Const8 <t>)) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (Add8 i (Sub8 <t> z x))
+
+// x & (C & z) -> C & (x & z)
+(And64 (And64 i:(Const64 <t>) z) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (And64 i (And64 <t> z x))
+(And32 (And32 i:(Const32 <t>) z) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (And32 i (And32 <t> z x))
+(And16 (And16 i:(Const16 <t>) z) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (And16 i (And16 <t> z x))
+(And8 (And8 i:(Const8 <t>) z) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (And8 i (And8 <t> z x))
+
+// x | (C | z) -> C | (x | z)
+(Or64 (Or64 i:(Const64 <t>) z) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (Or64 i (Or64 <t> z x))
+(Or32 (Or32 i:(Const32 <t>) z) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (Or32 i (Or32 <t> z x))
+(Or16 (Or16 i:(Const16 <t>) z) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (Or16 i (Or16 <t> z x))
+(Or8 (Or8 i:(Const8 <t>) z) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (Or8 i (Or8 <t> z x))
+
+// x ^ (C ^ z) -> C ^ (x ^ z)
+(Xor64 (Xor64 i:(Const64 <t>) z) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (Xor64 i (Xor64 <t> z x))
+(Xor32 (Xor32 i:(Const32 <t>) z) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (Xor32 i (Xor32 <t> z x))
+(Xor16 (Xor16 i:(Const16 <t>) z) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (Xor16 i (Xor16 <t> z x))
+(Xor8 (Xor8 i:(Const8 <t>) z) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (Xor8 i (Xor8 <t> z x))
+
+// x * (D * z) = D * (x * z)
+(Mul64 (Mul64 i:(Const64 <t>) z) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (Mul64 i (Mul64 <t> x z))
+(Mul32 (Mul32 i:(Const32 <t>) z) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (Mul32 i (Mul32 <t> x z))
+(Mul16 (Mul16 i:(Const16 <t>) z) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (Mul16 i (Mul16 <t> x z))
+(Mul8 (Mul8 i:(Const8 <t>) z) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (Mul8 i (Mul8 <t> x z))
+
+// C + (D + x) -> (C + D) + x
+(Add64 (Const64 <t> [c]) (Add64 (Const64 <t> [d]) x)) => (Add64 (Const64 <t> [c+d]) x)
+(Add32 (Const32 <t> [c]) (Add32 (Const32 <t> [d]) x)) => (Add32 (Const32 <t> [c+d]) x)
+(Add16 (Const16 <t> [c]) (Add16 (Const16 <t> [d]) x)) => (Add16 (Const16 <t> [c+d]) x)
+(Add8 (Const8 <t> [c]) (Add8 (Const8 <t> [d]) x)) => (Add8 (Const8 <t> [c+d]) x)
+
+// C + (D - x) -> (C + D) - x
+(Add64 (Const64 <t> [c]) (Sub64 (Const64 <t> [d]) x)) => (Sub64 (Const64 <t> [c+d]) x)
+(Add32 (Const32 <t> [c]) (Sub32 (Const32 <t> [d]) x)) => (Sub32 (Const32 <t> [c+d]) x)
+(Add16 (Const16 <t> [c]) (Sub16 (Const16 <t> [d]) x)) => (Sub16 (Const16 <t> [c+d]) x)
+(Add8 (Const8 <t> [c]) (Sub8 (Const8 <t> [d]) x)) => (Sub8 (Const8 <t> [c+d]) x)
+
+// C - (D - x) -> (C - D) + x
+(Sub64 (Const64 <t> [c]) (Sub64 (Const64 <t> [d]) x)) => (Add64 (Const64 <t> [c-d]) x)
+(Sub32 (Const32 <t> [c]) (Sub32 (Const32 <t> [d]) x)) => (Add32 (Const32 <t> [c-d]) x)
+(Sub16 (Const16 <t> [c]) (Sub16 (Const16 <t> [d]) x)) => (Add16 (Const16 <t> [c-d]) x)
+(Sub8 (Const8 <t> [c]) (Sub8 (Const8 <t> [d]) x)) => (Add8 (Const8 <t> [c-d]) x)
+
+// C - (D + x) -> (C - D) - x
+(Sub64 (Const64 <t> [c]) (Add64 (Const64 <t> [d]) x)) => (Sub64 (Const64 <t> [c-d]) x)
+(Sub32 (Const32 <t> [c]) (Add32 (Const32 <t> [d]) x)) => (Sub32 (Const32 <t> [c-d]) x)
+(Sub16 (Const16 <t> [c]) (Add16 (Const16 <t> [d]) x)) => (Sub16 (Const16 <t> [c-d]) x)
+(Sub8 (Const8 <t> [c]) (Add8 (Const8 <t> [d]) x)) => (Sub8 (Const8 <t> [c-d]) x)
+
+// C & (D & x) -> (C & D) & x
+(And64 (Const64 <t> [c]) (And64 (Const64 <t> [d]) x)) => (And64 (Const64 <t> [c&d]) x)
+(And32 (Const32 <t> [c]) (And32 (Const32 <t> [d]) x)) => (And32 (Const32 <t> [c&d]) x)
+(And16 (Const16 <t> [c]) (And16 (Const16 <t> [d]) x)) => (And16 (Const16 <t> [c&d]) x)
+(And8 (Const8 <t> [c]) (And8 (Const8 <t> [d]) x)) => (And8 (Const8 <t> [c&d]) x)
+
+// C | (D | x) -> (C | D) | x
+(Or64 (Const64 <t> [c]) (Or64 (Const64 <t> [d]) x)) => (Or64 (Const64 <t> [c|d]) x)
+(Or32 (Const32 <t> [c]) (Or32 (Const32 <t> [d]) x)) => (Or32 (Const32 <t> [c|d]) x)
+(Or16 (Const16 <t> [c]) (Or16 (Const16 <t> [d]) x)) => (Or16 (Const16 <t> [c|d]) x)
+(Or8 (Const8 <t> [c]) (Or8 (Const8 <t> [d]) x)) => (Or8 (Const8 <t> [c|d]) x)
+
+// C ^ (D ^ x) -> (C ^ D) ^ x
+(Xor64 (Const64 <t> [c]) (Xor64 (Const64 <t> [d]) x)) => (Xor64 (Const64 <t> [c^d]) x)
+(Xor32 (Const32 <t> [c]) (Xor32 (Const32 <t> [d]) x)) => (Xor32 (Const32 <t> [c^d]) x)
+(Xor16 (Const16 <t> [c]) (Xor16 (Const16 <t> [d]) x)) => (Xor16 (Const16 <t> [c^d]) x)
+(Xor8 (Const8 <t> [c]) (Xor8 (Const8 <t> [d]) x)) => (Xor8 (Const8 <t> [c^d]) x)
+
+// C * (D * x) = (C * D) * x
+(Mul64 (Const64 <t> [c]) (Mul64 (Const64 <t> [d]) x)) => (Mul64 (Const64 <t> [c*d]) x)
+(Mul32 (Const32 <t> [c]) (Mul32 (Const32 <t> [d]) x)) => (Mul32 (Const32 <t> [c*d]) x)
+(Mul16 (Const16 <t> [c]) (Mul16 (Const16 <t> [d]) x)) => (Mul16 (Const16 <t> [c*d]) x)
+(Mul8 (Const8 <t> [c]) (Mul8 (Const8 <t> [d]) x)) => (Mul8 (Const8 <t> [c*d]) x)
+
+// floating point optimizations
+(Mul(32|64)F x (Const(32|64)F [1])) => x
+(Mul32F x (Const32F [-1])) => (Neg32F x)
+(Mul64F x (Const64F [-1])) => (Neg64F x)
+(Mul32F x (Const32F [2])) => (Add32F x x)
+(Mul64F x (Const64F [2])) => (Add64F x x)
+
+(Div32F x (Const32F <t> [c])) && reciprocalExact32(c) => (Mul32F x (Const32F <t> [1/c]))
+(Div64F x (Const64F <t> [c])) && reciprocalExact64(c) => (Mul64F x (Const64F <t> [1/c]))
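
These two rules fire only when the reciprocal is exactly representable (essentially, when c is a power of two), because multiplying by a rounded reciprocal can change the result. A hypothetical illustration of why that restriction matters:

package main

import "fmt"

func main() {
	x := 5.0
	quarter := 1.0 / 4.0 // exact: 0.25 is representable in binary floating point
	third := 1.0 / 3.0   // rounded: 1/3 is not representable
	fmt.Println(x/4.0 == x*quarter) // true, so the rewrite is safe for c = 4
	fmt.Println(x/3.0 == x*third)   // false for this x, so no rewrite for c = 3
}
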
+
+(Sqrt (Const64F [c])) && !math.IsNaN(math.Sqrt(c)) => (Const64F [math.Sqrt(c)])
+
+// recognize runtime.newobject and don't Zero/Nilcheck it
+(Zero (Load (OffPtr [c] (SP)) mem) mem)
+ && mem.Op == OpStaticCall
+ && isSameCall(mem.Aux, "runtime.newobject")
+ && c == config.ctxt.FixedFrameSize() + config.RegSize // offset of return value
+ => mem
+(Store (Load (OffPtr [c] (SP)) mem) x mem)
+ && isConstZero(x)
+ && mem.Op == OpStaticCall
+ && isSameCall(mem.Aux, "runtime.newobject")
+ && c == config.ctxt.FixedFrameSize() + config.RegSize // offset of return value
+ => mem
+(Store (OffPtr (Load (OffPtr [c] (SP)) mem)) x mem)
+ && isConstZero(x)
+ && mem.Op == OpStaticCall
+ && isSameCall(mem.Aux, "runtime.newobject")
+ && c == config.ctxt.FixedFrameSize() + config.RegSize // offset of return value
+ => mem
+// nil checks just need to rewrite to something useless.
+// they will be deadcode eliminated soon afterwards.
+(NilCheck (Load (OffPtr [c] (SP)) (StaticCall {sym} _)) _)
+ && isSameCall(sym, "runtime.newobject")
+ && c == config.ctxt.FixedFrameSize() + config.RegSize // offset of return value
+ && warnRule(fe.Debug_checknil(), v, "removed nil check")
+ => (Invalid)
+(NilCheck (OffPtr (Load (OffPtr [c] (SP)) (StaticCall {sym} _))) _)
+ && isSameCall(sym, "runtime.newobject")
+ && c == config.ctxt.FixedFrameSize() + config.RegSize // offset of return value
+ && warnRule(fe.Debug_checknil(), v, "removed nil check")
+ => (Invalid)
+
+// for rewriting results of some late-expanded rewrites (below)
+(SelectN [0] (MakeResult a ___)) => a
+(SelectN [1] (MakeResult a b ___)) => b
+(SelectN [2] (MakeResult a b c ___)) => c
+
+// for late-expanded calls, recognize newobject and remove zeroing and nilchecks
+(Zero (SelectN [0] call:(StaticLECall _ _)) mem:(SelectN [1] call))
+ && isSameCall(call.Aux, "runtime.newobject")
+ => mem
+
+(Store (SelectN [0] call:(StaticLECall _ _)) x mem:(SelectN [1] call))
+ && isConstZero(x)
+ && isSameCall(call.Aux, "runtime.newobject")
+ => mem
+
+(Store (OffPtr (SelectN [0] call:(StaticLECall _ _))) x mem:(SelectN [1] call))
+ && isConstZero(x)
+ && isSameCall(call.Aux, "runtime.newobject")
+ => mem
+
+(NilCheck (SelectN [0] call:(StaticLECall _ _)) (SelectN [1] call))
+ && isSameCall(call.Aux, "runtime.newobject")
+ && warnRule(fe.Debug_checknil(), v, "removed nil check")
+ => (Invalid)
+
+(NilCheck (OffPtr (SelectN [0] call:(StaticLECall _ _))) (SelectN [1] call))
+ && isSameCall(call.Aux, "runtime.newobject")
+ && warnRule(fe.Debug_checknil(), v, "removed nil check")
+ => (Invalid)
+
+// for late-expanded calls, recognize memequal applied to a single constant byte
+// TODO figure out breakeven number of bytes for this optimization.
+(StaticLECall {callAux} sptr (Addr {scon} (SB)) (Const64 [1]) mem)
+ && isSameCall(callAux, "runtime.memequal")
+ && symIsRO(scon)
+ => (MakeResult (Eq8 (Load <typ.Int8> sptr mem) (Const8 <typ.Int8> [int8(read8(scon,0))])) mem)
+
+// Evaluate constant address comparisons.
+(EqPtr x x) => (ConstBool [true])
+(NeqPtr x x) => (ConstBool [false])
+(EqPtr (Addr {a} _) (Addr {b} _)) => (ConstBool [a == b])
+(EqPtr (Addr {a} _) (OffPtr [o] (Addr {b} _))) => (ConstBool [a == b && o == 0])
+(EqPtr (OffPtr [o1] (Addr {a} _)) (OffPtr [o2] (Addr {b} _))) => (ConstBool [a == b && o1 == o2])
+(NeqPtr (Addr {a} _) (Addr {b} _)) => (ConstBool [a != b])
+(NeqPtr (Addr {a} _) (OffPtr [o] (Addr {b} _))) => (ConstBool [a != b || o != 0])
+(NeqPtr (OffPtr [o1] (Addr {a} _)) (OffPtr [o2] (Addr {b} _))) => (ConstBool [a != b || o1 != o2])
+(EqPtr (LocalAddr {a} _ _) (LocalAddr {b} _ _)) => (ConstBool [a == b])
+(EqPtr (LocalAddr {a} _ _) (OffPtr [o] (LocalAddr {b} _ _))) => (ConstBool [a == b && o == 0])
+(EqPtr (OffPtr [o1] (LocalAddr {a} _ _)) (OffPtr [o2] (LocalAddr {b} _ _))) => (ConstBool [a == b && o1 == o2])
+(NeqPtr (LocalAddr {a} _ _) (LocalAddr {b} _ _)) => (ConstBool [a != b])
+(NeqPtr (LocalAddr {a} _ _) (OffPtr [o] (LocalAddr {b} _ _))) => (ConstBool [a != b || o != 0])
+(NeqPtr (OffPtr [o1] (LocalAddr {a} _ _)) (OffPtr [o2] (LocalAddr {b} _ _))) => (ConstBool [a != b || o1 != o2])
+(EqPtr (OffPtr [o1] p1) p2) && isSamePtr(p1, p2) => (ConstBool [o1 == 0])
+(NeqPtr (OffPtr [o1] p1) p2) && isSamePtr(p1, p2) => (ConstBool [o1 != 0])
+(EqPtr (OffPtr [o1] p1) (OffPtr [o2] p2)) && isSamePtr(p1, p2) => (ConstBool [o1 == o2])
+(NeqPtr (OffPtr [o1] p1) (OffPtr [o2] p2)) && isSamePtr(p1, p2) => (ConstBool [o1 != o2])
+(EqPtr (Const(32|64) [c]) (Const(32|64) [d])) => (ConstBool [c == d])
+(NeqPtr (Const(32|64) [c]) (Const(32|64) [d])) => (ConstBool [c != d])
+
+(EqPtr (LocalAddr _ _) (Addr _)) => (ConstBool [false])
+(EqPtr (OffPtr (LocalAddr _ _)) (Addr _)) => (ConstBool [false])
+(EqPtr (LocalAddr _ _) (OffPtr (Addr _))) => (ConstBool [false])
+(EqPtr (OffPtr (LocalAddr _ _)) (OffPtr (Addr _))) => (ConstBool [false])
+(NeqPtr (LocalAddr _ _) (Addr _)) => (ConstBool [true])
+(NeqPtr (OffPtr (LocalAddr _ _)) (Addr _)) => (ConstBool [true])
+(NeqPtr (LocalAddr _ _) (OffPtr (Addr _))) => (ConstBool [true])
+(NeqPtr (OffPtr (LocalAddr _ _)) (OffPtr (Addr _))) => (ConstBool [true])
+
+// Simplify address comparisons.
+(EqPtr (AddPtr p1 o1) p2) && isSamePtr(p1, p2) => (Not (IsNonNil o1))
+(NeqPtr (AddPtr p1 o1) p2) && isSamePtr(p1, p2) => (IsNonNil o1)
+(EqPtr (Const(32|64) [0]) p) => (Not (IsNonNil p))
+(NeqPtr (Const(32|64) [0]) p) => (IsNonNil p)
+(EqPtr (ConstNil) p) => (Not (IsNonNil p))
+(NeqPtr (ConstNil) p) => (IsNonNil p)
+
+// Evaluate constant user nil checks.
+(IsNonNil (ConstNil)) => (ConstBool [false])
+(IsNonNil (Const(32|64) [c])) => (ConstBool [c != 0])
+(IsNonNil (Addr _)) => (ConstBool [true])
+(IsNonNil (LocalAddr _ _)) => (ConstBool [true])
+
+// Inline small or disjoint runtime.memmove calls with constant length.
+// See the comment in op Move in genericOps.go for discussion of the type.
+(StaticCall {sym} s1:(Store _ (Const(64|32) [sz]) s2:(Store _ src s3:(Store {t} _ dst mem))))
+ && sz >= 0
+ && isSameCall(sym, "runtime.memmove")
+ && t.IsPtr() // avoids TUINTPTR, see issue 30061
+ && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1
+ && isInlinableMemmove(dst, src, int64(sz), config)
+ && clobber(s1, s2, s3)
+ => (Move {t.Elem()} [int64(sz)] dst src mem)
+
+// Inline small or disjoint runtime.memmove calls with constant length.
+// See the comment in op Move in genericOps.go for discussion of the type.
+(SelectN [0] call:(StaticLECall {sym} dst src (Const(64|32) [sz]) mem))
+ && sz >= 0
+ && call.Uses == 1 // this will exclude all calls with results
+ && isSameCall(sym, "runtime.memmove")
+ && dst.Type.IsPtr() // avoids TUINTPTR, see issue 30061
+ && isInlinableMemmove(dst, src, int64(sz), config)
+ && clobber(call)
+ => (Move {dst.Type.Elem()} [int64(sz)] dst src mem)
+
+// De-virtualize interface calls into static calls.
+// Note that (ITab (IMake)) doesn't get
+// rewritten until after the first opt pass,
+// so this rule should trigger reliably.
+(InterCall [argsize] {auxCall} (Load (OffPtr [off] (ITab (IMake (Addr {itab} (SB)) _))) _) mem) && devirt(v, auxCall, itab, off) != nil =>
+ (StaticCall [int32(argsize)] {devirt(v, auxCall, itab, off)} mem)
+
+// De-virtualize late-expanded interface calls into late-expanded static calls.
+// Note that (ITab (IMake)) doesn't get rewritten until after the first opt pass,
+// so this rule should trigger reliably.
+// devirtLECall removes the first argument, adds the devirtualized symbol to the AuxCall, and changes the opcode
+(InterLECall [argsize] {auxCall} (Load (OffPtr [off] (ITab (IMake (Addr {itab} (SB)) _))) _) ___) && devirtLESym(v, auxCall, itab, off) !=
+ nil => devirtLECall(v, devirtLESym(v, auxCall, itab, off))
+
+// Move and Zero optimizations.
+// Move source and destination may overlap.
+
+// Convert Moves into Zeros when the source is known to be zeros.
+(Move {t} [n] dst1 src mem:(Zero {t} [n] dst2 _)) && isSamePtr(src, dst2)
+ => (Zero {t} [n] dst1 mem)
+(Move {t} [n] dst1 src mem:(VarDef (Zero {t} [n] dst0 _))) && isSamePtr(src, dst0)
+ => (Zero {t} [n] dst1 mem)
+(Move {t} [n] dst (Addr {sym} (SB)) mem) && symIsROZero(sym) => (Zero {t} [n] dst mem)
+
+// Don't Store to variables that are about to be overwritten by Move/Zero.
+(Zero {t1} [n] p1 store:(Store {t2} (OffPtr [o2] p2) _ mem))
+ && isSamePtr(p1, p2) && store.Uses == 1
+ && n >= o2 + t2.Size()
+ && clobber(store)
+ => (Zero {t1} [n] p1 mem)
+(Move {t1} [n] dst1 src1 store:(Store {t2} op:(OffPtr [o2] dst2) _ mem))
+ && isSamePtr(dst1, dst2) && store.Uses == 1
+ && n >= o2 + t2.Size()
+ && disjoint(src1, n, op, t2.Size())
+ && clobber(store)
+ => (Move {t1} [n] dst1 src1 mem)
+
+// Don't Move to variables that are immediately completely overwritten.
+(Zero {t} [n] dst1 move:(Move {t} [n] dst2 _ mem))
+ && move.Uses == 1
+ && isSamePtr(dst1, dst2)
+ && clobber(move)
+ => (Zero {t} [n] dst1 mem)
+(Move {t} [n] dst1 src1 move:(Move {t} [n] dst2 _ mem))
+ && move.Uses == 1
+ && isSamePtr(dst1, dst2) && disjoint(src1, n, dst2, n)
+ && clobber(move)
+ => (Move {t} [n] dst1 src1 mem)
+(Zero {t} [n] dst1 vardef:(VarDef {x} move:(Move {t} [n] dst2 _ mem)))
+ && move.Uses == 1 && vardef.Uses == 1
+ && isSamePtr(dst1, dst2)
+ && clobber(move, vardef)
+ => (Zero {t} [n] dst1 (VarDef {x} mem))
+(Move {t} [n] dst1 src1 vardef:(VarDef {x} move:(Move {t} [n] dst2 _ mem)))
+ && move.Uses == 1 && vardef.Uses == 1
+ && isSamePtr(dst1, dst2) && disjoint(src1, n, dst2, n)
+ && clobber(move, vardef)
+ => (Move {t} [n] dst1 src1 (VarDef {x} mem))
+(Store {t1} op1:(OffPtr [o1] p1) d1
+ m2:(Store {t2} op2:(OffPtr [0] p2) d2
+ m3:(Move [n] p3 _ mem)))
+ && m2.Uses == 1 && m3.Uses == 1
+ && o1 == t2.Size()
+ && n == t2.Size() + t1.Size()
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3)
+ && clobber(m2, m3)
+ => (Store {t1} op1 d1 (Store {t2} op2 d2 mem))
+(Store {t1} op1:(OffPtr [o1] p1) d1
+ m2:(Store {t2} op2:(OffPtr [o2] p2) d2
+ m3:(Store {t3} op3:(OffPtr [0] p3) d3
+ m4:(Move [n] p4 _ mem))))
+ && m2.Uses == 1 && m3.Uses == 1 && m4.Uses == 1
+ && o2 == t3.Size()
+ && o1-o2 == t2.Size()
+ && n == t3.Size() + t2.Size() + t1.Size()
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4)
+ && clobber(m2, m3, m4)
+ => (Store {t1} op1 d1 (Store {t2} op2 d2 (Store {t3} op3 d3 mem)))
+(Store {t1} op1:(OffPtr [o1] p1) d1
+ m2:(Store {t2} op2:(OffPtr [o2] p2) d2
+ m3:(Store {t3} op3:(OffPtr [o3] p3) d3
+ m4:(Store {t4} op4:(OffPtr [0] p4) d4
+ m5:(Move [n] p5 _ mem)))))
+ && m2.Uses == 1 && m3.Uses == 1 && m4.Uses == 1 && m5.Uses == 1
+ && o3 == t4.Size()
+ && o2-o3 == t3.Size()
+ && o1-o2 == t2.Size()
+ && n == t4.Size() + t3.Size() + t2.Size() + t1.Size()
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5)
+ && clobber(m2, m3, m4, m5)
+ => (Store {t1} op1 d1 (Store {t2} op2 d2 (Store {t3} op3 d3 (Store {t4} op4 d4 mem))))
+
+// Don't Zero variables that are immediately completely overwritten
+// before being accessed.
+(Move {t} [n] dst1 src1 zero:(Zero {t} [n] dst2 mem))
+ && zero.Uses == 1
+ && isSamePtr(dst1, dst2) && disjoint(src1, n, dst2, n)
+ && clobber(zero)
+ => (Move {t} [n] dst1 src1 mem)
+(Move {t} [n] dst1 src1 vardef:(VarDef {x} zero:(Zero {t} [n] dst2 mem)))
+ && zero.Uses == 1 && vardef.Uses == 1
+ && isSamePtr(dst1, dst2) && disjoint(src1, n, dst2, n)
+ && clobber(zero, vardef)
+ => (Move {t} [n] dst1 src1 (VarDef {x} mem))
+(Store {t1} op1:(OffPtr [o1] p1) d1
+ m2:(Store {t2} op2:(OffPtr [0] p2) d2
+ m3:(Zero [n] p3 mem)))
+ && m2.Uses == 1 && m3.Uses == 1
+ && o1 == t2.Size()
+ && n == t2.Size() + t1.Size()
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3)
+ && clobber(m2, m3)
+ => (Store {t1} op1 d1 (Store {t2} op2 d2 mem))
+(Store {t1} op1:(OffPtr [o1] p1) d1
+ m2:(Store {t2} op2:(OffPtr [o2] p2) d2
+ m3:(Store {t3} op3:(OffPtr [0] p3) d3
+ m4:(Zero [n] p4 mem))))
+ && m2.Uses == 1 && m3.Uses == 1 && m4.Uses == 1
+ && o2 == t3.Size()
+ && o1-o2 == t2.Size()
+ && n == t3.Size() + t2.Size() + t1.Size()
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4)
+ && clobber(m2, m3, m4)
+ => (Store {t1} op1 d1 (Store {t2} op2 d2 (Store {t3} op3 d3 mem)))
+(Store {t1} op1:(OffPtr [o1] p1) d1
+ m2:(Store {t2} op2:(OffPtr [o2] p2) d2
+ m3:(Store {t3} op3:(OffPtr [o3] p3) d3
+ m4:(Store {t4} op4:(OffPtr [0] p4) d4
+ m5:(Zero [n] p5 mem)))))
+ && m2.Uses == 1 && m3.Uses == 1 && m4.Uses == 1 && m5.Uses == 1
+ && o3 == t4.Size()
+ && o2-o3 == t3.Size()
+ && o1-o2 == t2.Size()
+ && n == t4.Size() + t3.Size() + t2.Size() + t1.Size()
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5)
+ && clobber(m2, m3, m4, m5)
+ => (Store {t1} op1 d1 (Store {t2} op2 d2 (Store {t3} op3 d3 (Store {t4} op4 d4 mem))))
+
+// Don't Move from memory if the values are likely to already be
+// in registers.
+(Move {t1} [n] dst p1
+ mem:(Store {t2} op2:(OffPtr <tt2> [o2] p2) d1
+ (Store {t3} op3:(OffPtr <tt3> [0] p3) d2 _)))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && o2 == t3.Size()
+ && n == t2.Size() + t3.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [0] dst) d2 mem))
+(Move {t1} [n] dst p1
+ mem:(Store {t2} op2:(OffPtr <tt2> [o2] p2) d1
+ (Store {t3} op3:(OffPtr <tt3> [o3] p3) d2
+ (Store {t4} op4:(OffPtr <tt4> [0] p4) d3 _))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && t4.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && registerizable(b, t4)
+ && o3 == t4.Size()
+ && o2-o3 == t3.Size()
+ && n == t2.Size() + t3.Size() + t4.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [o3] dst) d2
+ (Store {t4} (OffPtr <tt4> [0] dst) d3 mem)))
+(Move {t1} [n] dst p1
+ mem:(Store {t2} op2:(OffPtr <tt2> [o2] p2) d1
+ (Store {t3} op3:(OffPtr <tt3> [o3] p3) d2
+ (Store {t4} op4:(OffPtr <tt4> [o4] p4) d3
+ (Store {t5} op5:(OffPtr <tt5> [0] p5) d4 _)))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && t4.Alignment() <= t1.Alignment()
+ && t5.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && registerizable(b, t4)
+ && registerizable(b, t5)
+ && o4 == t5.Size()
+ && o3-o4 == t4.Size()
+ && o2-o3 == t3.Size()
+ && n == t2.Size() + t3.Size() + t4.Size() + t5.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [o3] dst) d2
+ (Store {t4} (OffPtr <tt4> [o4] dst) d3
+ (Store {t5} (OffPtr <tt5> [0] dst) d4 mem))))
+
+// Same thing but with VarDef in the middle.
+(Move {t1} [n] dst p1
+ mem:(VarDef
+ (Store {t2} op2:(OffPtr <tt2> [o2] p2) d1
+ (Store {t3} op3:(OffPtr <tt3> [0] p3) d2 _))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && o2 == t3.Size()
+ && n == t2.Size() + t3.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [0] dst) d2 mem))
+(Move {t1} [n] dst p1
+ mem:(VarDef
+ (Store {t2} op2:(OffPtr <tt2> [o2] p2) d1
+ (Store {t3} op3:(OffPtr <tt3> [o3] p3) d2
+ (Store {t4} op4:(OffPtr <tt4> [0] p4) d3 _)))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && t4.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && registerizable(b, t4)
+ && o3 == t4.Size()
+ && o2-o3 == t3.Size()
+ && n == t2.Size() + t3.Size() + t4.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [o3] dst) d2
+ (Store {t4} (OffPtr <tt4> [0] dst) d3 mem)))
+(Move {t1} [n] dst p1
+ mem:(VarDef
+ (Store {t2} op2:(OffPtr <tt2> [o2] p2) d1
+ (Store {t3} op3:(OffPtr <tt3> [o3] p3) d2
+ (Store {t4} op4:(OffPtr <tt4> [o4] p4) d3
+ (Store {t5} op5:(OffPtr <tt5> [0] p5) d4 _))))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && t4.Alignment() <= t1.Alignment()
+ && t5.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && registerizable(b, t4)
+ && registerizable(b, t5)
+ && o4 == t5.Size()
+ && o3-o4 == t4.Size()
+ && o2-o3 == t3.Size()
+ && n == t2.Size() + t3.Size() + t4.Size() + t5.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [o3] dst) d2
+ (Store {t4} (OffPtr <tt4> [o4] dst) d3
+ (Store {t5} (OffPtr <tt5> [0] dst) d4 mem))))
+
+// Prefer to Zero and Store than to Move.
+(Move {t1} [n] dst p1
+ mem:(Store {t2} op2:(OffPtr <tt2> [o2] p2) d1
+ (Zero {t3} [n] p3 _)))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && n >= o2 + t2.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Zero {t1} [n] dst mem))
+(Move {t1} [n] dst p1
+ mem:(Store {t2} (OffPtr <tt2> [o2] p2) d1
+ (Store {t3} (OffPtr <tt3> [o3] p3) d2
+ (Zero {t4} [n] p4 _))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && t4.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && n >= o2 + t2.Size()
+ && n >= o3 + t3.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [o3] dst) d2
+ (Zero {t1} [n] dst mem)))
+(Move {t1} [n] dst p1
+ mem:(Store {t2} (OffPtr <tt2> [o2] p2) d1
+ (Store {t3} (OffPtr <tt3> [o3] p3) d2
+ (Store {t4} (OffPtr <tt4> [o4] p4) d3
+ (Zero {t5} [n] p5 _)))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && t4.Alignment() <= t1.Alignment()
+ && t5.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && registerizable(b, t4)
+ && n >= o2 + t2.Size()
+ && n >= o3 + t3.Size()
+ && n >= o4 + t4.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [o3] dst) d2
+ (Store {t4} (OffPtr <tt4> [o4] dst) d3
+ (Zero {t1} [n] dst mem))))
+(Move {t1} [n] dst p1
+ mem:(Store {t2} (OffPtr <tt2> [o2] p2) d1
+ (Store {t3} (OffPtr <tt3> [o3] p3) d2
+ (Store {t4} (OffPtr <tt4> [o4] p4) d3
+ (Store {t5} (OffPtr <tt5> [o5] p5) d4
+ (Zero {t6} [n] p6 _))))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5) && isSamePtr(p5, p6)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && t4.Alignment() <= t1.Alignment()
+ && t5.Alignment() <= t1.Alignment()
+ && t6.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && registerizable(b, t4)
+ && registerizable(b, t5)
+ && n >= o2 + t2.Size()
+ && n >= o3 + t3.Size()
+ && n >= o4 + t4.Size()
+ && n >= o5 + t5.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [o3] dst) d2
+ (Store {t4} (OffPtr <tt4> [o4] dst) d3
+ (Store {t5} (OffPtr <tt5> [o5] dst) d4
+ (Zero {t1} [n] dst mem)))))
+(Move {t1} [n] dst p1
+ mem:(VarDef
+ (Store {t2} op2:(OffPtr <tt2> [o2] p2) d1
+ (Zero {t3} [n] p3 _))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && n >= o2 + t2.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Zero {t1} [n] dst mem))
+(Move {t1} [n] dst p1
+ mem:(VarDef
+ (Store {t2} (OffPtr <tt2> [o2] p2) d1
+ (Store {t3} (OffPtr <tt3> [o3] p3) d2
+ (Zero {t4} [n] p4 _)))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && t4.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && n >= o2 + t2.Size()
+ && n >= o3 + t3.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [o3] dst) d2
+ (Zero {t1} [n] dst mem)))
+(Move {t1} [n] dst p1
+ mem:(VarDef
+ (Store {t2} (OffPtr <tt2> [o2] p2) d1
+ (Store {t3} (OffPtr <tt3> [o3] p3) d2
+ (Store {t4} (OffPtr <tt4> [o4] p4) d3
+ (Zero {t5} [n] p5 _))))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && t4.Alignment() <= t1.Alignment()
+ && t5.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && registerizable(b, t4)
+ && n >= o2 + t2.Size()
+ && n >= o3 + t3.Size()
+ && n >= o4 + t4.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [o3] dst) d2
+ (Store {t4} (OffPtr <tt4> [o4] dst) d3
+ (Zero {t1} [n] dst mem))))
+(Move {t1} [n] dst p1
+ mem:(VarDef
+ (Store {t2} (OffPtr <tt2> [o2] p2) d1
+ (Store {t3} (OffPtr <tt3> [o3] p3) d2
+ (Store {t4} (OffPtr <tt4> [o4] p4) d3
+ (Store {t5} (OffPtr <tt5> [o5] p5) d4
+ (Zero {t6} [n] p6 _)))))))
+ && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5) && isSamePtr(p5, p6)
+ && t2.Alignment() <= t1.Alignment()
+ && t3.Alignment() <= t1.Alignment()
+ && t4.Alignment() <= t1.Alignment()
+ && t5.Alignment() <= t1.Alignment()
+ && t6.Alignment() <= t1.Alignment()
+ && registerizable(b, t2)
+ && registerizable(b, t3)
+ && registerizable(b, t4)
+ && registerizable(b, t5)
+ && n >= o2 + t2.Size()
+ && n >= o3 + t3.Size()
+ && n >= o4 + t4.Size()
+ && n >= o5 + t5.Size()
+ => (Store {t2} (OffPtr <tt2> [o2] dst) d1
+ (Store {t3} (OffPtr <tt3> [o3] dst) d2
+ (Store {t4} (OffPtr <tt4> [o4] dst) d3
+ (Store {t5} (OffPtr <tt5> [o5] dst) d4
+ (Zero {t1} [n] dst mem)))))
+
+// TODO this does not fire before call expansion; is that acceptable?
+(StaticCall {sym} x) && needRaceCleanup(sym, v) => x
+
+// Collapse moving A -> B -> C into just A -> C.
+// Later passes (deadstore, elim unread auto) will remove the A -> B move, if possible.
+// This happens most commonly when B is an autotmp inserted earlier
+// during compilation to ensure correctness.
+// Take care that overlapping moves are preserved.
+// Restrict this optimization to the stack, to avoid duplicating loads from the heap;
+// see CL 145208 for discussion.
+(Move {t1} [s] dst tmp1 midmem:(Move {t2} [s] tmp2 src _))
+ && t1.Compare(t2) == types.CMPeq
+ && isSamePtr(tmp1, tmp2)
+ && isStackPtr(src) && !isVolatile(src)
+ && disjoint(src, s, tmp2, s)
+ && (disjoint(src, s, dst, s) || isInlinableMemmove(dst, src, s, config))
+ => (Move {t1} [s] dst src midmem)
+
+// Same, but for large types that require VarDefs.
+(Move {t1} [s] dst tmp1 midmem:(VarDef (Move {t2} [s] tmp2 src _)))
+ && t1.Compare(t2) == types.CMPeq
+ && isSamePtr(tmp1, tmp2)
+ && isStackPtr(src) && !isVolatile(src)
+ && disjoint(src, s, tmp2, s)
+ && (disjoint(src, s, dst, s) || isInlinableMemmove(dst, src, s, config))
+ => (Move {t1} [s] dst src midmem)
+
+// Don't zero the same bits twice.
+(Zero {t} [s] dst1 zero:(Zero {t} [s] dst2 _)) && isSamePtr(dst1, dst2) => zero
+(Zero {t} [s] dst1 vardef:(VarDef (Zero {t} [s] dst2 _))) && isSamePtr(dst1, dst2) => vardef
+
+// Elide self-moves. This only happens rarely (e.g test/fixedbugs/bug277.go).
+// However, this rule is needed to prevent the previous rule from looping forever in such cases.
+(Move dst src mem) && isSamePtr(dst, src) => mem
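
As a rough, hypothetical illustration of the Move-collapsing rules above (this Go source is not part of the patch): a large value copied through a local temporary is the typical way the A -> B -> C pattern arises, with the temporary playing the role of B. Whether the rules actually fire depends on escape analysis and on the exact SSA the front end emits, so this only sketches the shape being matched.

package main

// big is assumed to be large enough that its assignments lower to SSA Move
// ops rather than plain register loads and stores.
type big struct {
	buf [64]byte
}

//go:noinline
func copyThroughTemp(dst *big, src big) {
	tmp := src // A -> B: src (on the stack) copied into the temporary
	*dst = tmp // B -> C: temporary copied to dst; the rules above may collapse this to src -> dst
}

func main() {
	var d big
	copyThroughTemp(&d, big{})
}
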
diff --git a/src/cmd/compile/internal/ssa/gen/genericOps.go b/src/cmd/compile/internal/ssa/gen/genericOps.go
new file mode 100644
index 0000000..0a7d5dd
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/genericOps.go
@@ -0,0 +1,620 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+// Generic opcodes typically specify a width. The inputs and outputs
+// of that op are the given number of bits wide. There is no notion of
+// "sign", so Add32 can be used both for signed and unsigned 32-bit
+// addition.
+
+// Signed/unsigned is explicit with the extension ops
+// (SignExt*/ZeroExt*) and implicit as the arg to some opcodes
+// (e.g. the second argument to shifts is unsigned). If not mentioned,
+// all args take signed inputs, or don't care whether their inputs
+// are signed or unsigned.
+
+var genericOps = []opData{
+ // 2-input arithmetic
+ // Types must be consistent with Go typing. Add, for example, must take two values
+ // of the same type and produce that same type.
+ {name: "Add8", argLength: 2, commutative: true}, // arg0 + arg1
+ {name: "Add16", argLength: 2, commutative: true},
+ {name: "Add32", argLength: 2, commutative: true},
+ {name: "Add64", argLength: 2, commutative: true},
+ {name: "AddPtr", argLength: 2}, // For address calculations. arg0 is a pointer and arg1 is an int.
+ {name: "Add32F", argLength: 2, commutative: true},
+ {name: "Add64F", argLength: 2, commutative: true},
+
+ {name: "Sub8", argLength: 2}, // arg0 - arg1
+ {name: "Sub16", argLength: 2},
+ {name: "Sub32", argLength: 2},
+ {name: "Sub64", argLength: 2},
+ {name: "SubPtr", argLength: 2},
+ {name: "Sub32F", argLength: 2},
+ {name: "Sub64F", argLength: 2},
+
+ {name: "Mul8", argLength: 2, commutative: true}, // arg0 * arg1
+ {name: "Mul16", argLength: 2, commutative: true},
+ {name: "Mul32", argLength: 2, commutative: true},
+ {name: "Mul64", argLength: 2, commutative: true},
+ {name: "Mul32F", argLength: 2, commutative: true},
+ {name: "Mul64F", argLength: 2, commutative: true},
+
+ {name: "Div32F", argLength: 2}, // arg0 / arg1
+ {name: "Div64F", argLength: 2},
+
+ {name: "Hmul32", argLength: 2, commutative: true},
+ {name: "Hmul32u", argLength: 2, commutative: true},
+ {name: "Hmul64", argLength: 2, commutative: true},
+ {name: "Hmul64u", argLength: 2, commutative: true},
+
+ {name: "Mul32uhilo", argLength: 2, typ: "(UInt32,UInt32)", commutative: true}, // arg0 * arg1, returns (hi, lo)
+ {name: "Mul64uhilo", argLength: 2, typ: "(UInt64,UInt64)", commutative: true}, // arg0 * arg1, returns (hi, lo)
+
+ {name: "Mul32uover", argLength: 2, typ: "(UInt32,Bool)", commutative: true}, // Let x = arg0*arg1 (full 32x32-> 64 unsigned multiply), returns (uint32(x), (uint32(x) != x))
+ {name: "Mul64uover", argLength: 2, typ: "(UInt64,Bool)", commutative: true}, // Let x = arg0*arg1 (full 64x64->128 unsigned multiply), returns (uint64(x), (uint64(x) != x))
+
+ // Weird special instructions for use in the strength reduction of divides.
+ // These ops compute unsigned (arg0 + arg1) / 2, correct to all
+ // 32/64 bits, even when the intermediate result of the add has 33/65 bits.
+ // These ops can assume arg0 >= arg1.
+ // Note: these ops aren't commutative!
+ {name: "Avg32u", argLength: 2, typ: "UInt32"}, // 32-bit platforms only
+ {name: "Avg64u", argLength: 2, typ: "UInt64"}, // 64-bit platforms only
+
+ // For Div16, Div32 and Div64, AuxInt non-zero means that the divisor has been proved to be not -1
+ // or that the dividend is not the most negative value.
+ {name: "Div8", argLength: 2}, // arg0 / arg1, signed
+ {name: "Div8u", argLength: 2}, // arg0 / arg1, unsigned
+ {name: "Div16", argLength: 2, aux: "Bool"},
+ {name: "Div16u", argLength: 2},
+ {name: "Div32", argLength: 2, aux: "Bool"},
+ {name: "Div32u", argLength: 2},
+ {name: "Div64", argLength: 2, aux: "Bool"},
+ {name: "Div64u", argLength: 2},
+ {name: "Div128u", argLength: 3}, // arg0:arg1 / arg2 (128-bit divided by 64-bit), returns (q, r)
+
+ // For Mod16, Mod32 and Mod64, AuxInt non-zero means that the divisor has been proved to be not -1.
+ {name: "Mod8", argLength: 2}, // arg0 % arg1, signed
+ {name: "Mod8u", argLength: 2}, // arg0 % arg1, unsigned
+ {name: "Mod16", argLength: 2, aux: "Bool"},
+ {name: "Mod16u", argLength: 2},
+ {name: "Mod32", argLength: 2, aux: "Bool"},
+ {name: "Mod32u", argLength: 2},
+ {name: "Mod64", argLength: 2, aux: "Bool"},
+ {name: "Mod64u", argLength: 2},
+
+ {name: "And8", argLength: 2, commutative: true}, // arg0 & arg1
+ {name: "And16", argLength: 2, commutative: true},
+ {name: "And32", argLength: 2, commutative: true},
+ {name: "And64", argLength: 2, commutative: true},
+
+ {name: "Or8", argLength: 2, commutative: true}, // arg0 | arg1
+ {name: "Or16", argLength: 2, commutative: true},
+ {name: "Or32", argLength: 2, commutative: true},
+ {name: "Or64", argLength: 2, commutative: true},
+
+ {name: "Xor8", argLength: 2, commutative: true}, // arg0 ^ arg1
+ {name: "Xor16", argLength: 2, commutative: true},
+ {name: "Xor32", argLength: 2, commutative: true},
+ {name: "Xor64", argLength: 2, commutative: true},
+
+ // For shifts, AxB means the shifted value has A bits and the shift amount has B bits.
+ // Shift amounts are considered unsigned.
+ // If arg1 is known to be nonnegative and less than the number of bits in arg0,
+ // then auxInt may be set to 1.
+ // This enables better code generation on some platforms.
+ {name: "Lsh8x8", argLength: 2, aux: "Bool"}, // arg0 << arg1
+ {name: "Lsh8x16", argLength: 2, aux: "Bool"},
+ {name: "Lsh8x32", argLength: 2, aux: "Bool"},
+ {name: "Lsh8x64", argLength: 2, aux: "Bool"},
+ {name: "Lsh16x8", argLength: 2, aux: "Bool"},
+ {name: "Lsh16x16", argLength: 2, aux: "Bool"},
+ {name: "Lsh16x32", argLength: 2, aux: "Bool"},
+ {name: "Lsh16x64", argLength: 2, aux: "Bool"},
+ {name: "Lsh32x8", argLength: 2, aux: "Bool"},
+ {name: "Lsh32x16", argLength: 2, aux: "Bool"},
+ {name: "Lsh32x32", argLength: 2, aux: "Bool"},
+ {name: "Lsh32x64", argLength: 2, aux: "Bool"},
+ {name: "Lsh64x8", argLength: 2, aux: "Bool"},
+ {name: "Lsh64x16", argLength: 2, aux: "Bool"},
+ {name: "Lsh64x32", argLength: 2, aux: "Bool"},
+ {name: "Lsh64x64", argLength: 2, aux: "Bool"},
+
+ {name: "Rsh8x8", argLength: 2, aux: "Bool"}, // arg0 >> arg1, signed
+ {name: "Rsh8x16", argLength: 2, aux: "Bool"},
+ {name: "Rsh8x32", argLength: 2, aux: "Bool"},
+ {name: "Rsh8x64", argLength: 2, aux: "Bool"},
+ {name: "Rsh16x8", argLength: 2, aux: "Bool"},
+ {name: "Rsh16x16", argLength: 2, aux: "Bool"},
+ {name: "Rsh16x32", argLength: 2, aux: "Bool"},
+ {name: "Rsh16x64", argLength: 2, aux: "Bool"},
+ {name: "Rsh32x8", argLength: 2, aux: "Bool"},
+ {name: "Rsh32x16", argLength: 2, aux: "Bool"},
+ {name: "Rsh32x32", argLength: 2, aux: "Bool"},
+ {name: "Rsh32x64", argLength: 2, aux: "Bool"},
+ {name: "Rsh64x8", argLength: 2, aux: "Bool"},
+ {name: "Rsh64x16", argLength: 2, aux: "Bool"},
+ {name: "Rsh64x32", argLength: 2, aux: "Bool"},
+ {name: "Rsh64x64", argLength: 2, aux: "Bool"},
+
+ {name: "Rsh8Ux8", argLength: 2, aux: "Bool"}, // arg0 >> arg1, unsigned
+ {name: "Rsh8Ux16", argLength: 2, aux: "Bool"},
+ {name: "Rsh8Ux32", argLength: 2, aux: "Bool"},
+ {name: "Rsh8Ux64", argLength: 2, aux: "Bool"},
+ {name: "Rsh16Ux8", argLength: 2, aux: "Bool"},
+ {name: "Rsh16Ux16", argLength: 2, aux: "Bool"},
+ {name: "Rsh16Ux32", argLength: 2, aux: "Bool"},
+ {name: "Rsh16Ux64", argLength: 2, aux: "Bool"},
+ {name: "Rsh32Ux8", argLength: 2, aux: "Bool"},
+ {name: "Rsh32Ux16", argLength: 2, aux: "Bool"},
+ {name: "Rsh32Ux32", argLength: 2, aux: "Bool"},
+ {name: "Rsh32Ux64", argLength: 2, aux: "Bool"},
+ {name: "Rsh64Ux8", argLength: 2, aux: "Bool"},
+ {name: "Rsh64Ux16", argLength: 2, aux: "Bool"},
+ {name: "Rsh64Ux32", argLength: 2, aux: "Bool"},
+ {name: "Rsh64Ux64", argLength: 2, aux: "Bool"},
+
+ // 2-input comparisons
+ {name: "Eq8", argLength: 2, commutative: true, typ: "Bool"}, // arg0 == arg1
+ {name: "Eq16", argLength: 2, commutative: true, typ: "Bool"},
+ {name: "Eq32", argLength: 2, commutative: true, typ: "Bool"},
+ {name: "Eq64", argLength: 2, commutative: true, typ: "Bool"},
+ {name: "EqPtr", argLength: 2, commutative: true, typ: "Bool"},
+ {name: "EqInter", argLength: 2, typ: "Bool"}, // arg0 or arg1 is nil; other cases handled by frontend
+ {name: "EqSlice", argLength: 2, typ: "Bool"}, // arg0 or arg1 is nil; other cases handled by frontend
+ {name: "Eq32F", argLength: 2, commutative: true, typ: "Bool"},
+ {name: "Eq64F", argLength: 2, commutative: true, typ: "Bool"},
+
+ {name: "Neq8", argLength: 2, commutative: true, typ: "Bool"}, // arg0 != arg1
+ {name: "Neq16", argLength: 2, commutative: true, typ: "Bool"},
+ {name: "Neq32", argLength: 2, commutative: true, typ: "Bool"},
+ {name: "Neq64", argLength: 2, commutative: true, typ: "Bool"},
+ {name: "NeqPtr", argLength: 2, commutative: true, typ: "Bool"},
+ {name: "NeqInter", argLength: 2, typ: "Bool"}, // arg0 or arg1 is nil; other cases handled by frontend
+ {name: "NeqSlice", argLength: 2, typ: "Bool"}, // arg0 or arg1 is nil; other cases handled by frontend
+ {name: "Neq32F", argLength: 2, commutative: true, typ: "Bool"},
+ {name: "Neq64F", argLength: 2, commutative: true, typ: "Bool"},
+
+ {name: "Less8", argLength: 2, typ: "Bool"}, // arg0 < arg1, signed
+ {name: "Less8U", argLength: 2, typ: "Bool"}, // arg0 < arg1, unsigned
+ {name: "Less16", argLength: 2, typ: "Bool"},
+ {name: "Less16U", argLength: 2, typ: "Bool"},
+ {name: "Less32", argLength: 2, typ: "Bool"},
+ {name: "Less32U", argLength: 2, typ: "Bool"},
+ {name: "Less64", argLength: 2, typ: "Bool"},
+ {name: "Less64U", argLength: 2, typ: "Bool"},
+ {name: "Less32F", argLength: 2, typ: "Bool"},
+ {name: "Less64F", argLength: 2, typ: "Bool"},
+
+ {name: "Leq8", argLength: 2, typ: "Bool"}, // arg0 <= arg1, signed
+ {name: "Leq8U", argLength: 2, typ: "Bool"}, // arg0 <= arg1, unsigned
+ {name: "Leq16", argLength: 2, typ: "Bool"},
+ {name: "Leq16U", argLength: 2, typ: "Bool"},
+ {name: "Leq32", argLength: 2, typ: "Bool"},
+ {name: "Leq32U", argLength: 2, typ: "Bool"},
+ {name: "Leq64", argLength: 2, typ: "Bool"},
+ {name: "Leq64U", argLength: 2, typ: "Bool"},
+ {name: "Leq32F", argLength: 2, typ: "Bool"},
+ {name: "Leq64F", argLength: 2, typ: "Bool"},
+
+ // the type of a CondSelect is the same as the type of its first
+ // two arguments, which should be register-width scalars; the third
+ // argument should be a boolean
+ {name: "CondSelect", argLength: 3}, // arg2 ? arg0 : arg1
+
+ // boolean ops
+ {name: "AndB", argLength: 2, commutative: true, typ: "Bool"}, // arg0 && arg1 (not shortcircuited)
+ {name: "OrB", argLength: 2, commutative: true, typ: "Bool"}, // arg0 || arg1 (not shortcircuited)
+ {name: "EqB", argLength: 2, commutative: true, typ: "Bool"}, // arg0 == arg1
+ {name: "NeqB", argLength: 2, commutative: true, typ: "Bool"}, // arg0 != arg1
+ {name: "Not", argLength: 1, typ: "Bool"}, // !arg0, boolean
+
+ // 1-input ops
+ {name: "Neg8", argLength: 1}, // -arg0
+ {name: "Neg16", argLength: 1},
+ {name: "Neg32", argLength: 1},
+ {name: "Neg64", argLength: 1},
+ {name: "Neg32F", argLength: 1},
+ {name: "Neg64F", argLength: 1},
+
+ {name: "Com8", argLength: 1}, // ^arg0
+ {name: "Com16", argLength: 1},
+ {name: "Com32", argLength: 1},
+ {name: "Com64", argLength: 1},
+
+ {name: "Ctz8", argLength: 1}, // Count trailing (low order) zeroes (returns 0-8)
+ {name: "Ctz16", argLength: 1}, // Count trailing (low order) zeroes (returns 0-16)
+ {name: "Ctz32", argLength: 1}, // Count trailing (low order) zeroes (returns 0-32)
+ {name: "Ctz64", argLength: 1}, // Count trailing (low order) zeroes (returns 0-64)
+ {name: "Ctz8NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-7
+ {name: "Ctz16NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-15
+ {name: "Ctz32NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-31
+ {name: "Ctz64NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-63
+ {name: "BitLen8", argLength: 1}, // Number of bits in arg[0] (returns 0-8)
+ {name: "BitLen16", argLength: 1}, // Number of bits in arg[0] (returns 0-16)
+ {name: "BitLen32", argLength: 1}, // Number of bits in arg[0] (returns 0-32)
+ {name: "BitLen64", argLength: 1}, // Number of bits in arg[0] (returns 0-64)
+
+ {name: "Bswap32", argLength: 1}, // Swap bytes
+ {name: "Bswap64", argLength: 1}, // Swap bytes
+
+ {name: "BitRev8", argLength: 1}, // Reverse the bits in arg[0]
+ {name: "BitRev16", argLength: 1}, // Reverse the bits in arg[0]
+ {name: "BitRev32", argLength: 1}, // Reverse the bits in arg[0]
+ {name: "BitRev64", argLength: 1}, // Reverse the bits in arg[0]
+
+ {name: "PopCount8", argLength: 1}, // Count bits in arg[0]
+ {name: "PopCount16", argLength: 1}, // Count bits in arg[0]
+ {name: "PopCount32", argLength: 1}, // Count bits in arg[0]
+ {name: "PopCount64", argLength: 1}, // Count bits in arg[0]
+ {name: "RotateLeft8", argLength: 2}, // Rotate bits in arg[0] left by arg[1]
+ {name: "RotateLeft16", argLength: 2}, // Rotate bits in arg[0] left by arg[1]
+ {name: "RotateLeft32", argLength: 2}, // Rotate bits in arg[0] left by arg[1]
+ {name: "RotateLeft64", argLength: 2}, // Rotate bits in arg[0] left by arg[1]
+
+ // Square root, float64 only.
+ // Special cases:
+ // +∞ → +∞
+ // ±0 → ±0 (sign preserved)
+ // x<0 → NaN
+ // NaN → NaN
+ {name: "Sqrt", argLength: 1}, // √arg0
+
+ // Round to integer, float64 only.
+ // Special cases:
+ // ±∞ → ±∞ (sign preserved)
+ // ±0 → ±0 (sign preserved)
+ // NaN → NaN
+ {name: "Floor", argLength: 1}, // round arg0 toward -∞
+ {name: "Ceil", argLength: 1}, // round arg0 toward +∞
+ {name: "Trunc", argLength: 1}, // round arg0 toward 0
+ {name: "Round", argLength: 1}, // round arg0 to nearest, ties away from 0
+ {name: "RoundToEven", argLength: 1}, // round arg0 to nearest, ties to even
+
+ // Modify the sign bit
+ {name: "Abs", argLength: 1}, // absolute value arg0
+ {name: "Copysign", argLength: 2}, // copy sign from arg0 to arg1
+
+ // 3-input opcode.
+ // Fused-multiply-add, float64 only.
+ // When a*b+c is exactly zero (before rounding), then the result is +0 or -0.
+ // The 0's sign is determined according to the standard rules for the
+ // addition (-0 if both a*b and c are -0, +0 otherwise).
+ //
+ // Otherwise, when a*b+c rounds to zero, then the resulting 0's sign is
+ // determined by the sign of the exact result a*b+c.
+ // See section 6.3 in ieee754.
+ //
+ // When the multiply is an infinity times a zero, the result is NaN.
+ // See section 7.2 in ieee754.
+ {name: "FMA", argLength: 3}, // compute (a*b)+c without intermediate rounding
+
+ // Data movement. Max argument length for Phi is indefinite.
+ {name: "Phi", argLength: -1, zeroWidth: true}, // select an argument based on which predecessor block we came from
+ {name: "Copy", argLength: 1}, // output = arg0
+ // Convert converts between pointers and integers.
+ // We have a special op for this so as to not confuse GC
+ // (particularly stack maps). It takes a memory arg so it
+ // gets correctly ordered with respect to GC safepoints.
+ // It gets compiled to nothing, so its result must be in the same
+ // register as its argument. regalloc knows it can use any
+ // allocatable integer register for OpConvert.
+ // arg0=ptr/int arg1=mem, output=int/ptr
+ {name: "Convert", argLength: 2, zeroWidth: true, resultInArg0: true},
+
+ // constants. Constant values are stored in the aux or
+ // auxint fields.
+ {name: "ConstBool", aux: "Bool"}, // auxint is 0 for false and 1 for true
+ {name: "ConstString", aux: "String"}, // value is aux.(string)
+ {name: "ConstNil", typ: "BytePtr"}, // nil pointer
+ {name: "Const8", aux: "Int8"}, // auxint is sign-extended 8 bits
+ {name: "Const16", aux: "Int16"}, // auxint is sign-extended 16 bits
+ {name: "Const32", aux: "Int32"}, // auxint is sign-extended 32 bits
+ // Note: ConstX are sign-extended even when the type of the value is unsigned.
+ // For instance, uint8(0xaa) is stored as auxint=0xffffffffffffffaa.
+ {name: "Const64", aux: "Int64"}, // value is auxint
+ // Note: for both Const32F and Const64F, we disallow encoding NaNs.
+ // Signaling NaNs are tricky because if you do anything with them, they become quiet.
+ // Particularly, converting a 32 bit sNaN to 64 bit and back converts it to a qNaN.
+ // See issue 36399 and 36400.
+ // Encodings of +inf, -inf, and -0 are fine.
+ {name: "Const32F", aux: "Float32"}, // value is math.Float64frombits(uint64(auxint)) and is exactly representable as float 32
+ {name: "Const64F", aux: "Float64"}, // value is math.Float64frombits(uint64(auxint))
+ {name: "ConstInterface"}, // nil interface
+ {name: "ConstSlice"}, // nil slice
+
+ // Constant-like things
+ {name: "InitMem", zeroWidth: true}, // memory input to the function.
+ {name: "Arg", aux: "SymOff", symEffect: "Read", zeroWidth: true}, // argument to the function. aux=GCNode of arg, off = offset in that arg.
+
+ // The address of a variable. arg0 is the base pointer.
+ // If the variable is a global, the base pointer will be SB and
+ // the Aux field will be a *obj.LSym.
+ // If the variable is a local, the base pointer will be SP and
+ // the Aux field will be a *gc.Node.
+ {name: "Addr", argLength: 1, aux: "Sym", symEffect: "Addr"}, // Address of a variable. Arg0=SB. Aux identifies the variable.
+ {name: "LocalAddr", argLength: 2, aux: "Sym", symEffect: "Addr"}, // Address of a variable. Arg0=SP. Arg1=mem. Aux identifies the variable.
+
+ {name: "SP", zeroWidth: true}, // stack pointer
+ {name: "SB", typ: "Uintptr", zeroWidth: true}, // static base pointer (a.k.a. globals pointer)
+ {name: "Invalid"}, // unused value
+
+ // Memory operations
+ {name: "Load", argLength: 2}, // Load from arg0. arg1=memory
+ {name: "Dereference", argLength: 2}, // Load from arg0. arg1=memory. Helper op for arg/result passing, result is an otherwise not-SSA-able "value".
+ {name: "Store", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
+ // The source and destination of Move may overlap in some cases. See e.g.
+ // memmove inlining in generic.rules. When inlineablememmovesize (in ../rewrite.go)
+ // returns true, we must do all loads before all stores, when lowering Move.
+ // The type of Move is used for the write barrier pass to insert write barriers
+ // and for alignment on some architectures.
+ // For pointerless types, it is possible for the type to be inaccurate.
+ // For type alignment and pointer information, use the type in Aux;
+ // for type size, use the size in AuxInt.
+ // The "inline runtime.memmove" rewrite rule generates Moves with inaccurate types,
+ // such as type byte instead of the more accurate type [8]byte.
+ {name: "Move", argLength: 3, typ: "Mem", aux: "TypSize"}, // arg0=destptr, arg1=srcptr, arg2=mem, auxint=size, aux=type. Returns memory.
+ {name: "Zero", argLength: 2, typ: "Mem", aux: "TypSize"}, // arg0=destptr, arg1=mem, auxint=size, aux=type. Returns memory.
+
+ // Memory operations with write barriers.
+ // Expand to runtime calls. Write barrier will be removed if write on stack.
+ {name: "StoreWB", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
+ {name: "MoveWB", argLength: 3, typ: "Mem", aux: "TypSize"}, // arg0=destptr, arg1=srcptr, arg2=mem, auxint=size, aux=type. Returns memory.
+ {name: "ZeroWB", argLength: 2, typ: "Mem", aux: "TypSize"}, // arg0=destptr, arg1=mem, auxint=size, aux=type. Returns memory.
+
+ // WB invokes runtime.gcWriteBarrier. This is not a normal
+ // call: it takes arguments in registers, doesn't clobber
+ // general-purpose registers (the exact clobber set is
+ // arch-dependent), and is not a safe-point.
+ {name: "WB", argLength: 3, typ: "Mem", aux: "Sym", symEffect: "None"}, // arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+
+ {name: "HasCPUFeature", argLength: 0, typ: "bool", aux: "Sym", symEffect: "None"}, // aux=place that this feature flag can be loaded from
+
+ // PanicBounds and PanicExtend generate a runtime panic.
+ // Their arguments provide index values to use in panic messages.
+ // Both PanicBounds and PanicExtend have an AuxInt value from the BoundsKind type (in ../op.go).
+ // PanicBounds' index is int sized.
+ // PanicExtend's index is int64 sized. (PanicExtend is only used on 32-bit archs.)
+ {name: "PanicBounds", argLength: 3, aux: "Int64", typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory.
+ {name: "PanicExtend", argLength: 4, aux: "Int64", typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory.
+
+ // Function calls. Arguments to the call have already been written to the stack.
+ // Return values appear on the stack. The method receiver, if any, is treated
+ // as a phantom first argument.
+ // TODO(josharian): ClosureCall and InterCall should have Int32 aux
+ // to match StaticCall's 32 bit arg size limit.
+ // TODO(drchase,josharian): could the arg size limit be bundled into the rules for CallOff?
+ {name: "ClosureCall", argLength: 3, aux: "CallOff", call: true}, // arg0=code pointer, arg1=context ptr, arg2=memory. auxint=arg size. Returns memory.
+ {name: "StaticCall", argLength: 1, aux: "CallOff", call: true}, // call function aux.(*obj.LSym), arg0=memory. auxint=arg size. Returns memory.
+ {name: "InterCall", argLength: 2, aux: "CallOff", call: true}, // interface call. arg0=code pointer, arg1=memory, auxint=arg size. Returns memory.
+ {name: "ClosureLECall", argLength: -1, aux: "CallOff", call: true}, // late-expanded closure call. arg0=code pointer, arg1=context ptr, arg2..argN-1 are inputs, argN is mem. auxint = arg size. Result is tuple of result(s), plus mem.
+ {name: "StaticLECall", argLength: -1, aux: "CallOff", call: true}, // late-expanded static call function aux.(*ssa.AuxCall.Fn). arg0..argN-1 are inputs, argN is mem. auxint = arg size. Result is tuple of result(s), plus mem.
+ {name: "InterLECall", argLength: -1, aux: "CallOff", call: true}, // late-expanded interface call. arg0=code pointer, arg1..argN-1 are inputs, argN is mem. auxint = arg size. Result is tuple of result(s), plus mem.
+
+ // Conversions: signed extensions, zero (unsigned) extensions, truncations
+ {name: "SignExt8to16", argLength: 1, typ: "Int16"},
+ {name: "SignExt8to32", argLength: 1, typ: "Int32"},
+ {name: "SignExt8to64", argLength: 1, typ: "Int64"},
+ {name: "SignExt16to32", argLength: 1, typ: "Int32"},
+ {name: "SignExt16to64", argLength: 1, typ: "Int64"},
+ {name: "SignExt32to64", argLength: 1, typ: "Int64"},
+ {name: "ZeroExt8to16", argLength: 1, typ: "UInt16"},
+ {name: "ZeroExt8to32", argLength: 1, typ: "UInt32"},
+ {name: "ZeroExt8to64", argLength: 1, typ: "UInt64"},
+ {name: "ZeroExt16to32", argLength: 1, typ: "UInt32"},
+ {name: "ZeroExt16to64", argLength: 1, typ: "UInt64"},
+ {name: "ZeroExt32to64", argLength: 1, typ: "UInt64"},
+ {name: "Trunc16to8", argLength: 1},
+ {name: "Trunc32to8", argLength: 1},
+ {name: "Trunc32to16", argLength: 1},
+ {name: "Trunc64to8", argLength: 1},
+ {name: "Trunc64to16", argLength: 1},
+ {name: "Trunc64to32", argLength: 1},
+
+ {name: "Cvt32to32F", argLength: 1},
+ {name: "Cvt32to64F", argLength: 1},
+ {name: "Cvt64to32F", argLength: 1},
+ {name: "Cvt64to64F", argLength: 1},
+ {name: "Cvt32Fto32", argLength: 1},
+ {name: "Cvt32Fto64", argLength: 1},
+ {name: "Cvt64Fto32", argLength: 1},
+ {name: "Cvt64Fto64", argLength: 1},
+ {name: "Cvt32Fto64F", argLength: 1},
+ {name: "Cvt64Fto32F", argLength: 1},
+ {name: "CvtBoolToUint8", argLength: 1},
+
+ // Force rounding to precision of type.
+ {name: "Round32F", argLength: 1},
+ {name: "Round64F", argLength: 1},
+
+ // Automatically inserted safety checks
+ {name: "IsNonNil", argLength: 1, typ: "Bool"}, // arg0 != nil
+ {name: "IsInBounds", argLength: 2, typ: "Bool"}, // 0 <= arg0 < arg1. arg1 is guaranteed >= 0.
+ {name: "IsSliceInBounds", argLength: 2, typ: "Bool"}, // 0 <= arg0 <= arg1. arg1 is guaranteed >= 0.
+ {name: "NilCheck", argLength: 2, typ: "Void"}, // arg0=ptr, arg1=mem. Panics if arg0 is nil. Returns void.
+
+ // Pseudo-ops
+ {name: "GetG", argLength: 1, zeroWidth: true}, // runtime.getg() (read g pointer). arg0=mem
+ {name: "GetClosurePtr"}, // get closure pointer from dedicated register
+ {name: "GetCallerPC"}, // for getcallerpc intrinsic
+ {name: "GetCallerSP"}, // for getcallersp intrinsic
+
+ // Indexing operations
+ {name: "PtrIndex", argLength: 2}, // arg0=ptr, arg1=index. Computes ptr+sizeof(*v.type)*index, where index is extended to ptrwidth type
+ {name: "OffPtr", argLength: 1, aux: "Int64"}, // arg0 + auxint (arg0 and result are pointers)
+
+ // Slices
+ {name: "SliceMake", argLength: 3}, // arg0=ptr, arg1=len, arg2=cap
+ {name: "SlicePtr", argLength: 1, typ: "BytePtr"}, // ptr(arg0)
+ {name: "SliceLen", argLength: 1}, // len(arg0)
+ {name: "SliceCap", argLength: 1}, // cap(arg0)
+
+ // Complex (part/whole)
+ {name: "ComplexMake", argLength: 2}, // arg0=real, arg1=imag
+ {name: "ComplexReal", argLength: 1}, // real(arg0)
+ {name: "ComplexImag", argLength: 1}, // imag(arg0)
+
+ // Strings
+ {name: "StringMake", argLength: 2}, // arg0=ptr, arg1=len
+ {name: "StringPtr", argLength: 1, typ: "BytePtr"}, // ptr(arg0)
+ {name: "StringLen", argLength: 1, typ: "Int"}, // len(arg0)
+
+ // Interfaces
+ {name: "IMake", argLength: 2}, // arg0=itab, arg1=data
+ {name: "ITab", argLength: 1, typ: "Uintptr"}, // arg0=interface, returns itable field
+ {name: "IData", argLength: 1}, // arg0=interface, returns data field
+
+ // Structs
+ {name: "StructMake0"}, // Returns struct with 0 fields.
+ {name: "StructMake1", argLength: 1}, // arg0=field0. Returns struct.
+ {name: "StructMake2", argLength: 2}, // arg0,arg1=field0,field1. Returns struct.
+ {name: "StructMake3", argLength: 3}, // arg0..2=field0..2. Returns struct.
+ {name: "StructMake4", argLength: 4}, // arg0..3=field0..3. Returns struct.
+ {name: "StructSelect", argLength: 1, aux: "Int64"}, // arg0=struct, auxint=field index. Returns the auxint'th field.
+
+ // Arrays
+ {name: "ArrayMake0"}, // Returns array with 0 elements
+ {name: "ArrayMake1", argLength: 1}, // Returns array with 1 element
+ {name: "ArraySelect", argLength: 1, aux: "Int64"}, // arg0=array, auxint=index. Returns a[i].
+
+ // Spill&restore ops for the register allocator. These are
+ // semantically identical to OpCopy; they do not take/return
+ // stores like regular memory ops do. We can get away without memory
+ // args because we know there is no aliasing of spill slots on the stack.
+ {name: "StoreReg", argLength: 1},
+ {name: "LoadReg", argLength: 1},
+
+ // Used during ssa construction. Like Copy, but the arg has not been specified yet.
+ {name: "FwdRef", aux: "Sym", symEffect: "None"},
+
+ // Unknown value. Used for Values whose values don't matter because they are dead code.
+ {name: "Unknown"},
+
+ {name: "VarDef", argLength: 1, aux: "Sym", typ: "Mem", symEffect: "None", zeroWidth: true}, // aux is a *gc.Node of a variable that is about to be initialized. arg0=mem, returns mem
+ {name: "VarKill", argLength: 1, aux: "Sym", symEffect: "None"}, // aux is a *gc.Node of a variable that is known to be dead. arg0=mem, returns mem
+ // TODO: what's the difference between VarLive and KeepAlive?
+ {name: "VarLive", argLength: 1, aux: "Sym", symEffect: "Read", zeroWidth: true}, // aux is a *gc.Node of a variable that must be kept live. arg0=mem, returns mem
+ {name: "KeepAlive", argLength: 2, typ: "Mem", zeroWidth: true}, // arg[0] is a value that must be kept alive until this mark. arg[1]=mem, returns mem
+
+ // InlMark marks the start of an inlined function body. Its AuxInt field
+ // distinguishes which entry in the local inline tree it is marking.
+ {name: "InlMark", argLength: 1, aux: "Int32", typ: "Void"}, // arg[0]=mem, returns void.
+
+ // Ops for breaking 64-bit operations on 32-bit architectures
+ {name: "Int64Make", argLength: 2, typ: "UInt64"}, // arg0=hi, arg1=lo
+ {name: "Int64Hi", argLength: 1, typ: "UInt32"}, // high 32-bit of arg0
+ {name: "Int64Lo", argLength: 1, typ: "UInt32"}, // low 32-bit of arg0
+
+ {name: "Add32carry", argLength: 2, commutative: true, typ: "(UInt32,Flags)"}, // arg0 + arg1, returns (value, carry)
+ {name: "Add32withcarry", argLength: 3, commutative: true}, // arg0 + arg1 + arg2, arg2=carry (0 or 1)
+
+ {name: "Sub32carry", argLength: 2, typ: "(UInt32,Flags)"}, // arg0 - arg1, returns (value, carry)
+ {name: "Sub32withcarry", argLength: 3}, // arg0 - arg1 - arg2, arg2=carry (0 or 1)
+
+ {name: "Add64carry", argLength: 3, commutative: true, typ: "(UInt64,UInt64)"}, // arg0 + arg1 + arg2, arg2 must be 0 or 1. returns (value, value>>64)
+ {name: "Sub64borrow", argLength: 3, typ: "(UInt64,UInt64)"}, // arg0 - (arg1 + arg2), arg2 must be 0 or 1. returns (value, value>>64&1)
+
+ {name: "Signmask", argLength: 1, typ: "Int32"}, // 0 if arg0 >= 0, -1 if arg0 < 0
+ {name: "Zeromask", argLength: 1, typ: "UInt32"}, // 0 if arg0 == 0, 0xffffffff if arg0 != 0
+ {name: "Slicemask", argLength: 1}, // 0 if arg0 == 0, -1 if arg0 > 0, undef if arg0<0. Type is native int size.
+
+ {name: "SpectreIndex", argLength: 2}, // arg0 if 0 <= arg0 < arg1, 0 otherwise. Type is native int size.
+ {name: "SpectreSliceIndex", argLength: 2}, // arg0 if 0 <= arg0 <= arg1, 0 otherwise. Type is native int size.
+
+ {name: "Cvt32Uto32F", argLength: 1}, // uint32 -> float32, only used on 32-bit arch
+ {name: "Cvt32Uto64F", argLength: 1}, // uint32 -> float64, only used on 32-bit arch
+ {name: "Cvt32Fto32U", argLength: 1}, // float32 -> uint32, only used on 32-bit arch
+ {name: "Cvt64Fto32U", argLength: 1}, // float64 -> uint32, only used on 32-bit arch
+ {name: "Cvt64Uto32F", argLength: 1}, // uint64 -> float32, only used on archs that has the instruction
+ {name: "Cvt64Uto64F", argLength: 1}, // uint64 -> float64, only used on archs that has the instruction
+ {name: "Cvt32Fto64U", argLength: 1}, // float32 -> uint64, only used on archs that has the instruction
+ {name: "Cvt64Fto64U", argLength: 1}, // float64 -> uint64, only used on archs that has the instruction
+
+ // pseudo-ops for breaking Tuple
+ {name: "Select0", argLength: 1, zeroWidth: true}, // the first component of a tuple
+ {name: "Select1", argLength: 1, zeroWidth: true}, // the second component of a tuple
+ {name: "SelectN", argLength: 1, aux: "Int64"}, // arg0=result, auxint=field index. Returns the auxint'th member.
+ {name: "SelectNAddr", argLength: 1, aux: "Int64"}, // arg0=result, auxint=field index. Returns the address of auxint'th member. Used for un-SSA-able result types.
+ {name: "MakeResult", argLength: -1}, // arg0 .. are components of a "Result" (like the result from a Call). The last arg should be memory (like the result from a call).
+
+ // Atomic operations used for semantically inlining sync/atomic and
+ // runtime/internal/atomic. Atomic loads return a new memory so that
+ // the loads are properly ordered with respect to other loads and
+ // stores.
+ {name: "AtomicLoad8", argLength: 2, typ: "(UInt8,Mem)"}, // Load from arg0. arg1=memory. Returns loaded value and new memory.
+ {name: "AtomicLoad32", argLength: 2, typ: "(UInt32,Mem)"}, // Load from arg0. arg1=memory. Returns loaded value and new memory.
+ {name: "AtomicLoad64", argLength: 2, typ: "(UInt64,Mem)"}, // Load from arg0. arg1=memory. Returns loaded value and new memory.
+ {name: "AtomicLoadPtr", argLength: 2, typ: "(BytePtr,Mem)"}, // Load from arg0. arg1=memory. Returns loaded value and new memory.
+ {name: "AtomicLoadAcq32", argLength: 2, typ: "(UInt32,Mem)"}, // Load from arg0. arg1=memory. Lock acquisition, returns loaded value and new memory.
+ {name: "AtomicLoadAcq64", argLength: 2, typ: "(UInt64,Mem)"}, // Load from arg0. arg1=memory. Lock acquisition, returns loaded value and new memory.
+ {name: "AtomicStore8", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns memory.
+ {name: "AtomicStore32", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns memory.
+ {name: "AtomicStore64", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns memory.
+ {name: "AtomicStorePtrNoWB", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns memory.
+ {name: "AtomicStoreRel32", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Lock release, returns memory.
+ {name: "AtomicStoreRel64", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Lock release, returns memory.
+ {name: "AtomicExchange32", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns old contents of *arg0 and new memory.
+ {name: "AtomicExchange64", argLength: 3, typ: "(UInt64,Mem)", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns old contents of *arg0 and new memory.
+ {name: "AtomicAdd32", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true}, // Do *arg0 += arg1. arg2=memory. Returns sum and new memory.
+ {name: "AtomicAdd64", argLength: 3, typ: "(UInt64,Mem)", hasSideEffects: true}, // Do *arg0 += arg1. arg2=memory. Returns sum and new memory.
+ {name: "AtomicCompareAndSwap32", argLength: 4, typ: "(Bool,Mem)", hasSideEffects: true}, // if *arg0==arg1, then set *arg0=arg2. Returns true if store happens and new memory.
+ {name: "AtomicCompareAndSwap64", argLength: 4, typ: "(Bool,Mem)", hasSideEffects: true}, // if *arg0==arg1, then set *arg0=arg2. Returns true if store happens and new memory.
+ {name: "AtomicCompareAndSwapRel32", argLength: 4, typ: "(Bool,Mem)", hasSideEffects: true}, // if *arg0==arg1, then set *arg0=arg2. Lock release, reports whether store happens and new memory.
+ {name: "AtomicAnd8", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 &= arg1. arg2=memory. Returns memory.
+ {name: "AtomicAnd32", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 &= arg1. arg2=memory. Returns memory.
+ {name: "AtomicOr8", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 |= arg1. arg2=memory. Returns memory.
+ {name: "AtomicOr32", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 |= arg1. arg2=memory. Returns memory.
+
+ // Atomic operation variants
+ // These variants have the same semantics as above atomic operations.
+ // But they are used for generating more efficient code on certain modern machines, with run-time CPU feature detection.
+ // Currently, they are used on ARM64 only.
+ {name: "AtomicAdd32Variant", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true}, // Do *arg0 += arg1. arg2=memory. Returns sum and new memory.
+ {name: "AtomicAdd64Variant", argLength: 3, typ: "(UInt64,Mem)", hasSideEffects: true}, // Do *arg0 += arg1. arg2=memory. Returns sum and new memory.
+ {name: "AtomicExchange32Variant", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns old contents of *arg0 and new memory.
+ {name: "AtomicExchange64Variant", argLength: 3, typ: "(UInt64,Mem)", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns old contents of *arg0 and new memory.
+ {name: "AtomicCompareAndSwap32Variant", argLength: 4, typ: "(Bool,Mem)", hasSideEffects: true}, // if *arg0==arg1, then set *arg0=arg2. Returns true if store happens and new memory.
+ {name: "AtomicCompareAndSwap64Variant", argLength: 4, typ: "(Bool,Mem)", hasSideEffects: true}, // if *arg0==arg1, then set *arg0=arg2. Returns true if store happens and new memory.
+ {name: "AtomicAnd8Variant", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 &= arg1. arg2=memory. Returns memory.
+ {name: "AtomicAnd32Variant", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 &= arg1. arg2=memory. Returns memory.
+ {name: "AtomicOr8Variant", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 |= arg1. arg2=memory. Returns memory.
+ {name: "AtomicOr32Variant", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 |= arg1. arg2=memory. Returns memory.
+
+ // Clobber experiment op
+ {name: "Clobber", argLength: 0, typ: "Void", aux: "SymOff", symEffect: "None"}, // write an invalid pointer value to the given pointer slot of a stack variable
+}
+
+//    kind          controls         successors    implicit exit
+//  ------------------------------------------------------------------
+//    Exit      [return mem]                 []              yes
+//    Ret       [return mem]                 []              yes
+//    RetJmp    [return mem]                 []              yes
+//    Plain               []            [next]
+//    If   [boolean Value]      [then, else]
+//    First               []   [always, never]
+
+var genericBlocks = []blockData{
+ {name: "Plain"}, // a single successor
+ {name: "If", controls: 1}, // if Controls[0] goto Succs[0] else goto Succs[1]
+ {name: "Defer", controls: 1}, // Succs[0]=defer queued, Succs[1]=defer recovered. Controls[0] is call op (of memory type)
+ {name: "Ret", controls: 1}, // no successors, Controls[0] value is memory result
+ {name: "RetJmp", controls: 1}, // no successors, Controls[0] value is memory result, jumps to b.Aux.(*gc.Sym)
+ {name: "Exit", controls: 1}, // no successors, Controls[0] value generates a panic
+
+ // transient block state used for dead code removal
+ {name: "First"}, // 2 successors, always takes the first one (second is dead)
+}
+
+func init() {
+ archs = append(archs, arch{
+ name: "generic",
+ ops: genericOps,
+ blocks: genericBlocks,
+ generic: true,
+ })
+}
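
Every entry above follows the opData schema declared in main.go (name, argLength, commutative, typ, aux, and so on). As a hypothetical sketch, not part of this patch, a new 2-input generic op would be registered like this; the arch-specific *Ops.go and rules files would then have to lower it:

// Sketch only: "Min64" is not a real generic op in this tree; the entries
// merely show which opData fields such an op would set.
var exampleOps = []opData{
	{name: "Min64", argLength: 2, commutative: true},                 // min(arg0, arg1), signed
	{name: "Min64U", argLength: 2, commutative: true, typ: "UInt64"}, // min(arg0, arg1), unsigned
}
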
diff --git a/src/cmd/compile/internal/ssa/gen/main.go b/src/cmd/compile/internal/ssa/gen/main.go
new file mode 100644
index 0000000..dfa146a
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/main.go
@@ -0,0 +1,541 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+// The gen command generates Go code (in the parent directory) for all
+// the architecture-specific opcodes, blocks, and rewrites.
+package main
+
+import (
+ "bytes"
+ "flag"
+ "fmt"
+ "go/format"
+ "io/ioutil"
+ "log"
+ "os"
+ "path"
+ "regexp"
+ "runtime"
+ "runtime/pprof"
+ "runtime/trace"
+ "sort"
+ "strings"
+ "sync"
+)
+
+// TODO: capitalize these types, so that we can more easily tell variable names
+// apart from type names, and avoid awkward func parameters like "arch arch".
+
+type arch struct {
+ name string
+ pkg string // obj package to import for this arch.
+ genfile string // source file containing opcode code generation.
+ ops []opData
+ blocks []blockData
+ regnames []string
+ gpregmask regMask
+ fpregmask regMask
+ fp32regmask regMask
+ fp64regmask regMask
+ specialregmask regMask
+ framepointerreg int8
+ linkreg int8
+ generic bool
+ imports []string
+}
+
+type opData struct {
+ name string
+ reg regInfo
+ asm string
+ typ string // default result type
+ aux string
+ rematerializeable bool
+ argLength int32 // number of arguments, if -1, then this operation has a variable number of arguments
+ commutative bool // this operation is commutative on its first 2 arguments (e.g. addition)
+ resultInArg0 bool // (first, if a tuple) output of v and v.Args[0] must be allocated to the same register
+ resultNotInArgs bool // outputs must not be allocated to the same registers as inputs
+ clobberFlags bool // this op clobbers flags register
+ call bool // is a function call
+ nilCheck bool // this op is a nil check on arg0
+ faultOnNilArg0 bool // this op will fault if arg0 is nil (and aux encodes a small offset)
+ faultOnNilArg1 bool // this op will fault if arg1 is nil (and aux encodes a small offset)
+ usesScratch bool // this op requires scratch memory space
+ hasSideEffects bool // for "reasons", not to be eliminated. E.g., atomic store, #19182.
+ zeroWidth bool // op never translates into any machine code. example: copy, which may sometimes translate to machine code, is not zero-width.
+ unsafePoint bool // this op is an unsafe point, i.e. not safe for async preemption
+ symEffect string // effect this op has on symbol in aux
+ scale uint8 // amd64/386 indexed load scale
+}
+
+type blockData struct {
+ name string // the suffix for this block ("EQ", "LT", etc.)
+ controls int // the number of control values this type of block requires
+ aux string // the type of the Aux/AuxInt value, if any
+}
+
+type regInfo struct {
+ // inputs[i] encodes the set of registers allowed for the i'th input.
+ // Inputs that don't use registers (flags, memory, etc.) should be 0.
+ inputs []regMask
+ // clobbers encodes the set of registers that are overwritten by
+ // the instruction (other than the output registers).
+ clobbers regMask
+ // outputs[i] encodes the set of registers allowed for the i'th output.
+ outputs []regMask
+}
+
+type regMask uint64
+
+func (a arch) regMaskComment(r regMask) string {
+ var buf bytes.Buffer
+ for i := uint64(0); r != 0; i++ {
+ if r&1 != 0 {
+ if buf.Len() == 0 {
+ buf.WriteString(" //")
+ }
+ buf.WriteString(" ")
+ buf.WriteString(a.regnames[i])
+ }
+ r >>= 1
+ }
+ return buf.String()
+}
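
For reference, a standalone sketch (not part of main.go) of what regMaskComment above produces: bit i of the mask selects regnames[i], and the result is the trailing comment that appears next to each register mask in the generated opGen.go.

package main

import (
	"bytes"
	"fmt"
)

type regMask uint64

type arch struct{ regnames []string }

// regMaskComment is copied from main.go above.
func (a arch) regMaskComment(r regMask) string {
	var buf bytes.Buffer
	for i := uint64(0); r != 0; i++ {
		if r&1 != 0 {
			if buf.Len() == 0 {
				buf.WriteString(" //")
			}
			buf.WriteString(" ")
			buf.WriteString(a.regnames[i])
		}
		r >>= 1
	}
	return buf.String()
}

func main() {
	a := arch{regnames: []string{"AX", "CX", "DX", "BX"}}
	fmt.Printf("%q\n", a.regMaskComment(0b1011)) // " // AX CX BX"
}
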
+
+var archs []arch
+
+var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to `file`")
+var memprofile = flag.String("memprofile", "", "write memory profile to `file`")
+var tracefile = flag.String("trace", "", "write trace to `file`")
+
+func main() {
+ flag.Parse()
+ if *cpuprofile != "" {
+ f, err := os.Create(*cpuprofile)
+ if err != nil {
+ log.Fatal("could not create CPU profile: ", err)
+ }
+ defer f.Close()
+ if err := pprof.StartCPUProfile(f); err != nil {
+ log.Fatal("could not start CPU profile: ", err)
+ }
+ defer pprof.StopCPUProfile()
+ }
+ if *tracefile != "" {
+ f, err := os.Create(*tracefile)
+ if err != nil {
+ log.Fatalf("failed to create trace output file: %v", err)
+ }
+ defer func() {
+ if err := f.Close(); err != nil {
+ log.Fatalf("failed to close trace file: %v", err)
+ }
+ }()
+
+ if err := trace.Start(f); err != nil {
+ log.Fatalf("failed to start trace: %v", err)
+ }
+ defer trace.Stop()
+ }
+
+ sort.Sort(ArchsByName(archs))
+
+ // The generate tasks are run concurrently, since they are CPU-intensive
+ // and can easily make use of many cores on a machine.
+ //
+ // Note that there is no limit on the concurrency at the moment. On a
+ // four-core laptop at the time of writing, peak RSS usually reaches
+ // ~200MiB, which seems doable by practically any machine nowadays. If
+ // that stops being the case, we can cap this func to a fixed number of
+ // architectures being generated at once.
+
+ tasks := []func(){
+ genOp,
+ }
+ for _, a := range archs {
+ a := a // the funcs are run concurrently at a later time
+ tasks = append(tasks, func() {
+ genRules(a)
+ genSplitLoadRules(a)
+ })
+ }
+ var wg sync.WaitGroup
+ for _, task := range tasks {
+ task := task
+ wg.Add(1)
+ go func() {
+ task()
+ wg.Done()
+ }()
+ }
+ wg.Wait()
+
+ if *memprofile != "" {
+ f, err := os.Create(*memprofile)
+ if err != nil {
+ log.Fatal("could not create memory profile: ", err)
+ }
+ defer f.Close()
+ runtime.GC() // get up-to-date statistics
+ if err := pprof.WriteHeapProfile(f); err != nil {
+ log.Fatal("could not write memory profile: ", err)
+ }
+ }
+}
+
+func genOp() {
+ w := new(bytes.Buffer)
+ fmt.Fprintf(w, "// Code generated from gen/*Ops.go; DO NOT EDIT.\n")
+ fmt.Fprintln(w)
+ fmt.Fprintln(w, "package ssa")
+
+ fmt.Fprintln(w, "import (")
+ fmt.Fprintln(w, "\"cmd/internal/obj\"")
+ for _, a := range archs {
+ if a.pkg != "" {
+ fmt.Fprintf(w, "%q\n", a.pkg)
+ }
+ }
+ fmt.Fprintln(w, ")")
+
+ // generate Block* declarations
+ fmt.Fprintln(w, "const (")
+ fmt.Fprintln(w, "BlockInvalid BlockKind = iota")
+ for _, a := range archs {
+ fmt.Fprintln(w)
+ for _, d := range a.blocks {
+ fmt.Fprintf(w, "Block%s%s\n", a.Name(), d.name)
+ }
+ }
+ fmt.Fprintln(w, ")")
+
+ // generate block kind string method
+ fmt.Fprintln(w, "var blockString = [...]string{")
+ fmt.Fprintln(w, "BlockInvalid:\"BlockInvalid\",")
+ for _, a := range archs {
+ fmt.Fprintln(w)
+ for _, b := range a.blocks {
+ fmt.Fprintf(w, "Block%s%s:\"%s\",\n", a.Name(), b.name, b.name)
+ }
+ }
+ fmt.Fprintln(w, "}")
+ fmt.Fprintln(w, "func (k BlockKind) String() string {return blockString[k]}")
+
+ // generate block kind auxint method
+ fmt.Fprintln(w, "func (k BlockKind) AuxIntType() string {")
+ fmt.Fprintln(w, "switch k {")
+ for _, a := range archs {
+ for _, b := range a.blocks {
+ if b.auxIntType() == "invalid" {
+ continue
+ }
+ fmt.Fprintf(w, "case Block%s%s: return \"%s\"\n", a.Name(), b.name, b.auxIntType())
+ }
+ }
+ fmt.Fprintln(w, "}")
+ fmt.Fprintln(w, "return \"\"")
+ fmt.Fprintln(w, "}")
+
+ // generate Op* declarations
+ fmt.Fprintln(w, "const (")
+ fmt.Fprintln(w, "OpInvalid Op = iota") // make sure OpInvalid is 0.
+ for _, a := range archs {
+ fmt.Fprintln(w)
+ for _, v := range a.ops {
+ if v.name == "Invalid" {
+ continue
+ }
+ fmt.Fprintf(w, "Op%s%s\n", a.Name(), v.name)
+ }
+ }
+ fmt.Fprintln(w, ")")
+
+ // generate OpInfo table
+ fmt.Fprintln(w, "var opcodeTable = [...]opInfo{")
+ fmt.Fprintln(w, " { name: \"OpInvalid\" },")
+ for _, a := range archs {
+ fmt.Fprintln(w)
+
+ pkg := path.Base(a.pkg)
+ for _, v := range a.ops {
+ if v.name == "Invalid" {
+ continue
+ }
+ fmt.Fprintln(w, "{")
+ fmt.Fprintf(w, "name:\"%s\",\n", v.name)
+
+ // flags
+ if v.aux != "" {
+ fmt.Fprintf(w, "auxType: aux%s,\n", v.aux)
+ }
+ fmt.Fprintf(w, "argLen: %d,\n", v.argLength)
+
+ if v.rematerializeable {
+ if v.reg.clobbers != 0 {
+ log.Fatalf("%s is rematerializeable and clobbers registers", v.name)
+ }
+ if v.clobberFlags {
+ log.Fatalf("%s is rematerializeable and clobbers flags", v.name)
+ }
+ fmt.Fprintln(w, "rematerializeable: true,")
+ }
+ if v.commutative {
+ fmt.Fprintln(w, "commutative: true,")
+ }
+ if v.resultInArg0 {
+ fmt.Fprintln(w, "resultInArg0: true,")
+ // OpConvert's register mask is selected dynamically,
+ // so don't try to check it in the static table.
+ if v.name != "Convert" && v.reg.inputs[0] != v.reg.outputs[0] {
+ log.Fatalf("%s: input[0] and output[0] must use the same registers for %s", a.name, v.name)
+ }
+ if v.name != "Convert" && v.commutative && v.reg.inputs[1] != v.reg.outputs[0] {
+ log.Fatalf("%s: input[1] and output[0] must use the same registers for %s", a.name, v.name)
+ }
+ }
+ if v.resultNotInArgs {
+ fmt.Fprintln(w, "resultNotInArgs: true,")
+ }
+ if v.clobberFlags {
+ fmt.Fprintln(w, "clobberFlags: true,")
+ }
+ if v.call {
+ fmt.Fprintln(w, "call: true,")
+ }
+ if v.nilCheck {
+ fmt.Fprintln(w, "nilCheck: true,")
+ }
+ if v.faultOnNilArg0 {
+ fmt.Fprintln(w, "faultOnNilArg0: true,")
+ if v.aux != "Sym" && v.aux != "SymOff" && v.aux != "SymValAndOff" && v.aux != "Int64" && v.aux != "Int32" && v.aux != "" {
+ log.Fatalf("faultOnNilArg0 with aux %s not allowed", v.aux)
+ }
+ }
+ if v.faultOnNilArg1 {
+ fmt.Fprintln(w, "faultOnNilArg1: true,")
+ if v.aux != "Sym" && v.aux != "SymOff" && v.aux != "SymValAndOff" && v.aux != "Int64" && v.aux != "Int32" && v.aux != "" {
+ log.Fatalf("faultOnNilArg1 with aux %s not allowed", v.aux)
+ }
+ }
+ if v.usesScratch {
+ fmt.Fprintln(w, "usesScratch: true,")
+ }
+ if v.hasSideEffects {
+ fmt.Fprintln(w, "hasSideEffects: true,")
+ }
+ if v.zeroWidth {
+ fmt.Fprintln(w, "zeroWidth: true,")
+ }
+ if v.unsafePoint {
+ fmt.Fprintln(w, "unsafePoint: true,")
+ }
+ needEffect := strings.HasPrefix(v.aux, "Sym")
+ if v.symEffect != "" {
+ if !needEffect {
+ log.Fatalf("symEffect with aux %s not allowed", v.aux)
+ }
+ fmt.Fprintf(w, "symEffect: Sym%s,\n", strings.Replace(v.symEffect, ",", "|Sym", -1))
+ } else if needEffect {
+ log.Fatalf("symEffect needed for aux %s", v.aux)
+ }
+ if a.name == "generic" {
+ fmt.Fprintln(w, "generic:true,")
+ fmt.Fprintln(w, "},") // close op
+ // generic ops have no reg info or asm
+ continue
+ }
+ if v.asm != "" {
+ fmt.Fprintf(w, "asm: %s.A%s,\n", pkg, v.asm)
+ }
+ if v.scale != 0 {
+ fmt.Fprintf(w, "scale: %d,\n", v.scale)
+ }
+ fmt.Fprintln(w, "reg:regInfo{")
+
+ // Compute input allocation order. We allocate from the
+ // most to the least constrained input. This order guarantees
+ // that we will always be able to find a register.
+ var s []intPair
+ for i, r := range v.reg.inputs {
+ if r != 0 {
+ s = append(s, intPair{countRegs(r), i})
+ }
+ }
+ if len(s) > 0 {
+ sort.Sort(byKey(s))
+ fmt.Fprintln(w, "inputs: []inputInfo{")
+ for _, p := range s {
+ r := v.reg.inputs[p.val]
+ fmt.Fprintf(w, "{%d,%d},%s\n", p.val, r, a.regMaskComment(r))
+ }
+ fmt.Fprintln(w, "},")
+ }
+
+ if v.reg.clobbers > 0 {
+ fmt.Fprintf(w, "clobbers: %d,%s\n", v.reg.clobbers, a.regMaskComment(v.reg.clobbers))
+ }
+
+ // reg outputs
+ s = s[:0]
+ for i, r := range v.reg.outputs {
+ s = append(s, intPair{countRegs(r), i})
+ }
+ if len(s) > 0 {
+ sort.Sort(byKey(s))
+ fmt.Fprintln(w, "outputs: []outputInfo{")
+ for _, p := range s {
+ r := v.reg.outputs[p.val]
+ fmt.Fprintf(w, "{%d,%d},%s\n", p.val, r, a.regMaskComment(r))
+ }
+ fmt.Fprintln(w, "},")
+ }
+ fmt.Fprintln(w, "},") // close reg info
+ fmt.Fprintln(w, "},") // close op
+ }
+ }
+ fmt.Fprintln(w, "}")
+
+ fmt.Fprintln(w, "func (o Op) Asm() obj.As {return opcodeTable[o].asm}")
+ fmt.Fprintln(w, "func (o Op) Scale() int16 {return int16(opcodeTable[o].scale)}")
+
+ // generate op string method
+ fmt.Fprintln(w, "func (o Op) String() string {return opcodeTable[o].name }")
+
+ fmt.Fprintln(w, "func (o Op) UsesScratch() bool { return opcodeTable[o].usesScratch }")
+
+ fmt.Fprintln(w, "func (o Op) SymEffect() SymEffect { return opcodeTable[o].symEffect }")
+ fmt.Fprintln(w, "func (o Op) IsCall() bool { return opcodeTable[o].call }")
+ fmt.Fprintln(w, "func (o Op) HasSideEffects() bool { return opcodeTable[o].hasSideEffects }")
+ fmt.Fprintln(w, "func (o Op) UnsafePoint() bool { return opcodeTable[o].unsafePoint }")
+
+ // generate registers
+ for _, a := range archs {
+ if a.generic {
+ continue
+ }
+ fmt.Fprintf(w, "var registers%s = [...]Register {\n", a.name)
+ var gcRegN int
+ for i, r := range a.regnames {
+ pkg := a.pkg[len("cmd/internal/obj/"):]
+ var objname string // name in cmd/internal/obj/$ARCH
+ switch r {
+ case "SB":
+ // SB isn't a real register. cmd/internal/obj expects 0 in this case.
+ objname = "0"
+ case "SP":
+ objname = pkg + ".REGSP"
+ case "g":
+ objname = pkg + ".REGG"
+ default:
+ objname = pkg + ".REG_" + r
+ }
+ // Assign a GC register map index to registers
+ // that may contain pointers.
+ gcRegIdx := -1
+ if a.gpregmask&(1<<uint(i)) != 0 {
+ gcRegIdx = gcRegN
+ gcRegN++
+ }
+ fmt.Fprintf(w, " {%d, %s, %d, \"%s\"},\n", i, objname, gcRegIdx, r)
+ }
+ if gcRegN > 32 {
+ // Won't fit in a uint32 mask.
+ log.Fatalf("too many GC registers (%d > 32) on %s", gcRegN, a.name)
+ }
+ fmt.Fprintln(w, "}")
+ fmt.Fprintf(w, "var gpRegMask%s = regMask(%d)\n", a.name, a.gpregmask)
+ fmt.Fprintf(w, "var fpRegMask%s = regMask(%d)\n", a.name, a.fpregmask)
+ if a.fp32regmask != 0 {
+ fmt.Fprintf(w, "var fp32RegMask%s = regMask(%d)\n", a.name, a.fp32regmask)
+ }
+ if a.fp64regmask != 0 {
+ fmt.Fprintf(w, "var fp64RegMask%s = regMask(%d)\n", a.name, a.fp64regmask)
+ }
+ fmt.Fprintf(w, "var specialRegMask%s = regMask(%d)\n", a.name, a.specialregmask)
+ fmt.Fprintf(w, "var framepointerReg%s = int8(%d)\n", a.name, a.framepointerreg)
+ fmt.Fprintf(w, "var linkReg%s = int8(%d)\n", a.name, a.linkreg)
+ }
+
+ // gofmt result
+ b := w.Bytes()
+ var err error
+ b, err = format.Source(b)
+ if err != nil {
+ fmt.Printf("%s\n", w.Bytes())
+ panic(err)
+ }
+
+ if err := ioutil.WriteFile("../opGen.go", b, 0666); err != nil {
+ log.Fatalf("can't write output: %v\n", err)
+ }
+
+ // Check that the arch genfile handles all the arch-specific opcodes.
+ // This is very much a hack, but it is better than nothing.
+ //
+ // Do a single regexp pass to record all ops being handled in a map, and
+ // then compare that with the ops list. This is much faster than one
+ // regexp pass per opcode.
+ for _, a := range archs {
+ if a.genfile == "" {
+ continue
+ }
+
+ pattern := fmt.Sprintf(`\Wssa\.Op%s([a-zA-Z0-9_]+)\W`, a.name)
+ rxOp, err := regexp.Compile(pattern)
+ if err != nil {
+ log.Fatalf("bad opcode regexp %s: %v", pattern, err)
+ }
+
+ src, err := ioutil.ReadFile(a.genfile)
+ if err != nil {
+ log.Fatalf("can't read %s: %v", a.genfile, err)
+ }
+ seen := make(map[string]bool, len(a.ops))
+ for _, m := range rxOp.FindAllSubmatch(src, -1) {
+ seen[string(m[1])] = true
+ }
+ for _, op := range a.ops {
+ if !seen[op.name] {
+ log.Fatalf("Op%s%s has no code generation in %s", a.name, op.name, a.genfile)
+ }
+ }
+ }
+}
+
+// Name returns the name of the architecture for use in Op* and Block* enumerations.
+func (a arch) Name() string {
+ s := a.name
+ if s == "generic" {
+ s = ""
+ }
+ return s
+}
+
+// countRegs returns the number of set bits in the register mask.
+func countRegs(r regMask) int {
+ n := 0
+ for r != 0 {
+ n += int(r & 1)
+ r >>= 1
+ }
+ return n
+}
+
+// for sorting a pair of integers by key
+type intPair struct {
+ key, val int
+}
+type byKey []intPair
+
+func (a byKey) Len() int { return len(a) }
+func (a byKey) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
+func (a byKey) Less(i, j int) bool { return a[i].key < a[j].key }
+
+type ArchsByName []arch
+
+func (x ArchsByName) Len() int { return len(x) }
+func (x ArchsByName) Swap(i, j int) { x[i], x[j] = x[j], x[i] }
+func (x ArchsByName) Less(i, j int) bool { return x[i].name < x[j].name }
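
To make the input-ordering step in genOp concrete, here is a small, self-contained sketch (not part of the patch) that reuses countRegs and byKey from above: inputs are sorted ascending by register count, so the most constrained input (fewest allowed registers) is emitted, and therefore allocated, first.

package main

import (
	"fmt"
	"sort"
)

type regMask uint64

// countRegs and byKey are copied from main.go above.
func countRegs(r regMask) int {
	n := 0
	for r != 0 {
		n += int(r & 1)
		r >>= 1
	}
	return n
}

type intPair struct{ key, val int }
type byKey []intPair

func (a byKey) Len() int           { return len(a) }
func (a byKey) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a byKey) Less(i, j int) bool { return a[i].key < a[j].key }

func main() {
	// Input 0 may go in 16 registers, input 1 in only 2, so input 1 is
	// listed first in the generated inputInfo slice.
	inputs := []regMask{0xffff, 0b11}
	var s []intPair
	for i, r := range inputs {
		s = append(s, intPair{countRegs(r), i})
	}
	sort.Sort(byKey(s))
	fmt.Println(s) // [{2 1} {16 0}]
}
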
diff --git a/src/cmd/compile/internal/ssa/gen/rulegen.go b/src/cmd/compile/internal/ssa/gen/rulegen.go
new file mode 100644
index 0000000..aaf9101
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/gen/rulegen.go
@@ -0,0 +1,1856 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build gen
+
+// This program generates Go code that applies rewrite rules to a Value.
+// The generated code implements a function of type func (v *Value) bool
+// which reports whether it did something.
+// Ideas stolen from Swift: http://www.hpl.hp.com/techreports/Compaq-DEC/WRL-2000-2.html
+
+package main
+
+import (
+ "bufio"
+ "bytes"
+ "flag"
+ "fmt"
+ "go/ast"
+ "go/format"
+ "go/parser"
+ "go/printer"
+ "go/token"
+ "io"
+ "io/ioutil"
+ "log"
+ "os"
+ "path"
+ "regexp"
+ "sort"
+ "strconv"
+ "strings"
+
+ "golang.org/x/tools/go/ast/astutil"
+)
+
+// rule syntax:
+// sexpr [&& extra conditions] => [@block] sexpr
+//
+// sexpr are s-expressions (lisp-like parenthesized groupings)
+// sexpr ::= [variable:](opcode sexpr*)
+// | variable
+// | <type>
+// | [auxint]
+// | {aux}
+//
+// aux ::= variable | {code}
+// type ::= variable | {code}
+// variable ::= some token
+// opcode ::= one of the opcodes from the *Ops.go files
+
+// special rules: trailing ellipsis "..." (in the outermost sexpr?) must match on both sides of a rule.
+// a trailing "___" (three underscores) in the outermost match sexpr indicates the presence of
+// extra ignored args that need not appear in the replacement
+
+// extra conditions is just a chunk of Go that evaluates to a boolean. It may use
+// variables declared in the matching sexpr. The variable "v" is predefined to be
+// the value matched by the entire rule.
+
+// If multiple rules match, the first one in file order is selected.
+
+var (
+ genLog = flag.Bool("log", false, "generate code that logs; for debugging only")
+ addLine = flag.Bool("line", false, "add line number comment to generated rules; for debugging only")
+)
+
+type Rule struct {
+ Rule string
+ Loc string // file name & line number
+}
+
+func (r Rule) String() string {
+ return fmt.Sprintf("rule %q at %s", r.Rule, r.Loc)
+}
+
+func normalizeSpaces(s string) string {
+ return strings.Join(strings.Fields(strings.TrimSpace(s)), " ")
+}
+
+// parse returns the matching part of the rule, additional conditions, and the result.
+func (r Rule) parse() (match, cond, result string) {
+ s := strings.Split(r.Rule, "=>")
+ match = normalizeSpaces(s[0])
+ result = normalizeSpaces(s[1])
+ cond = ""
+ if i := strings.Index(match, "&&"); i >= 0 {
+ cond = normalizeSpaces(match[i+2:])
+ match = normalizeSpaces(match[:i])
+ }
+ return match, cond, result
+}
+
+func genRules(arch arch) { genRulesSuffix(arch, "") }
+func genSplitLoadRules(arch arch) { genRulesSuffix(arch, "splitload") }
+
+func genRulesSuffix(arch arch, suff string) {
+ // Open input file.
+ text, err := os.Open(arch.name + suff + ".rules")
+ if err != nil {
+ if suff == "" {
+ // All architectures must have a plain rules file.
+ log.Fatalf("can't read rule file: %v", err)
+ }
+ // Some architectures have bonus rules files that others don't share. That's fine.
+ return
+ }
+
+ // oprules contains a list of rules for each block and opcode
+ blockrules := map[string][]Rule{}
+ oprules := map[string][]Rule{}
+
+ // read rule file
+ scanner := bufio.NewScanner(text)
+ rule := ""
+ var lineno int
+ var ruleLineno int // line number of "=>"
+ for scanner.Scan() {
+ lineno++
+ line := scanner.Text()
+ if i := strings.Index(line, "//"); i >= 0 {
+ // Remove comments. Note that this isn't string safe, so
+ // it will truncate lines with // inside strings. Oh well.
+ line = line[:i]
+ }
+ rule += " " + line
+ rule = strings.TrimSpace(rule)
+ if rule == "" {
+ continue
+ }
+ if !strings.Contains(rule, "=>") {
+ continue
+ }
+ if ruleLineno == 0 {
+ ruleLineno = lineno
+ }
+ if strings.HasSuffix(rule, "=>") {
+ continue // continue on the next line
+ }
+ if n := balance(rule); n > 0 {
+ continue // open parentheses remain, continue on the next line
+ } else if n < 0 {
+ break // continuing the line can't help, and it will only make errors worse
+ }
+
+ loc := fmt.Sprintf("%s%s.rules:%d", arch.name, suff, ruleLineno)
+ for _, rule2 := range expandOr(rule) {
+ r := Rule{Rule: rule2, Loc: loc}
+ if rawop := strings.Split(rule2, " ")[0][1:]; isBlock(rawop, arch) {
+ blockrules[rawop] = append(blockrules[rawop], r)
+ continue
+ }
+ // Do fancier value op matching.
+ match, _, _ := r.parse()
+ op, oparch, _, _, _, _ := parseValue(match, arch, loc)
+ opname := fmt.Sprintf("Op%s%s", oparch, op.name)
+ oprules[opname] = append(oprules[opname], r)
+ }
+ rule = ""
+ ruleLineno = 0
+ }
+ if err := scanner.Err(); err != nil {
+ log.Fatalf("scanner failed: %v\n", err)
+ }
+ if balance(rule) != 0 {
+ log.Fatalf("%s.rules:%d: unbalanced rule: %v\n", arch.name, lineno, rule)
+ }
+
+ // Order all the ops.
+ var ops []string
+ for op := range oprules {
+ ops = append(ops, op)
+ }
+ sort.Strings(ops)
+
+ genFile := &File{Arch: arch, Suffix: suff}
+ // Main rewrite routine is a switch on v.Op.
+ fn := &Func{Kind: "Value", ArgLen: -1}
+
+ sw := &Switch{Expr: exprf("v.Op")}
+ for _, op := range ops {
+ eop, ok := parseEllipsisRules(oprules[op], arch)
+ if ok {
+ if strings.Contains(oprules[op][0].Rule, "=>") && opByName(arch, op).aux != opByName(arch, eop).aux {
+ panic(fmt.Sprintf("can't use ... for ops that have different aux types: %s and %s", op, eop))
+ }
+ swc := &Case{Expr: exprf("%s", op)}
+ swc.add(stmtf("v.Op = %s", eop))
+ swc.add(stmtf("return true"))
+ sw.add(swc)
+ continue
+ }
+
+ swc := &Case{Expr: exprf("%s", op)}
+ swc.add(stmtf("return rewriteValue%s%s_%s(v)", arch.name, suff, op))
+ sw.add(swc)
+ }
+ fn.add(sw)
+ fn.add(stmtf("return false"))
+ genFile.add(fn)
+
+ // Generate a routine per op. Note that we don't make one giant routine
+ // because it is too big for some compilers.
+ for _, op := range ops {
+ rules := oprules[op]
+ _, ok := parseEllipsisRules(oprules[op], arch)
+ if ok {
+ continue
+ }
+
+ // rr is kept between iterations, so that each rule can check
+ // that the previous rule wasn't unconditional.
+ var rr *RuleRewrite
+ fn := &Func{
+ Kind: "Value",
+ Suffix: fmt.Sprintf("_%s", op),
+ ArgLen: opByName(arch, op).argLength,
+ }
+ fn.add(declf("b", "v.Block"))
+ fn.add(declf("config", "b.Func.Config"))
+ fn.add(declf("fe", "b.Func.fe"))
+ fn.add(declf("typ", "&b.Func.Config.Types"))
+ for _, rule := range rules {
+ if rr != nil && !rr.CanFail {
+ log.Fatalf("unconditional rule %s is followed by other rules", rr.Match)
+ }
+ rr = &RuleRewrite{Loc: rule.Loc}
+ rr.Match, rr.Cond, rr.Result = rule.parse()
+ pos, _ := genMatch(rr, arch, rr.Match, fn.ArgLen >= 0)
+ if pos == "" {
+ pos = "v.Pos"
+ }
+ if rr.Cond != "" {
+ rr.add(breakf("!(%s)", rr.Cond))
+ }
+ genResult(rr, arch, rr.Result, pos)
+ if *genLog {
+ rr.add(stmtf("logRule(%q)", rule.Loc))
+ }
+ fn.add(rr)
+ }
+ if rr.CanFail {
+ fn.add(stmtf("return false"))
+ }
+ genFile.add(fn)
+ }
+
+ // Generate block rewrite function. There are only a few block types
+ // so we can make this one function with a switch.
+ fn = &Func{Kind: "Block"}
+ fn.add(declf("config", "b.Func.Config"))
+ fn.add(declf("typ", "&b.Func.Config.Types"))
+
+ sw = &Switch{Expr: exprf("b.Kind")}
+ ops = ops[:0]
+ for op := range blockrules {
+ ops = append(ops, op)
+ }
+ sort.Strings(ops)
+ for _, op := range ops {
+ name, data := getBlockInfo(op, arch)
+ swc := &Case{Expr: exprf("%s", name)}
+ for _, rule := range blockrules[op] {
+ swc.add(genBlockRewrite(rule, arch, data))
+ }
+ sw.add(swc)
+ }
+ fn.add(sw)
+ fn.add(stmtf("return false"))
+ genFile.add(fn)
+
+ // Remove unused imports and variables.
+ buf := new(bytes.Buffer)
+ fprint(buf, genFile)
+ fset := token.NewFileSet()
+ file, err := parser.ParseFile(fset, "", buf, parser.ParseComments)
+ if err != nil {
+ filename := fmt.Sprintf("%s_broken.go", arch.name)
+ if err := ioutil.WriteFile(filename, buf.Bytes(), 0644); err != nil {
+ log.Printf("failed to dump broken code to %s: %v", filename, err)
+ } else {
+ log.Printf("dumped broken code to %s", filename)
+ }
+ log.Fatalf("failed to parse generated code for arch %s: %v", arch.name, err)
+ }
+ tfile := fset.File(file.Pos())
+
+ // First, use unusedInspector to find the unused declarations by their
+ // start position.
+ u := unusedInspector{unused: make(map[token.Pos]bool)}
+ u.node(file)
+
+ // Then, delete said nodes via astutil.Apply.
+ pre := func(c *astutil.Cursor) bool {
+ node := c.Node()
+ if node == nil {
+ return true
+ }
+ if u.unused[node.Pos()] {
+ c.Delete()
+ // Unused imports and declarations use exactly
+ // one line. Prevent leaving an empty line.
+ tfile.MergeLine(tfile.Position(node.Pos()).Line)
+ return false
+ }
+ return true
+ }
+ post := func(c *astutil.Cursor) bool {
+ switch node := c.Node().(type) {
+ case *ast.GenDecl:
+ if len(node.Specs) == 0 {
+ // Don't leave a broken or empty GenDecl behind,
+ // such as "import ()".
+ c.Delete()
+ }
+ }
+ return true
+ }
+ file = astutil.Apply(file, pre, post).(*ast.File)
+
+ // Write the well-formatted source to file
+ f, err := os.Create("../rewrite" + arch.name + suff + ".go")
+ if err != nil {
+ log.Fatalf("can't write output: %v", err)
+ }
+ defer f.Close()
+ // gofmt result; use a buffered writer, as otherwise go/format spends
+ // far too much time in syscalls.
+ bw := bufio.NewWriter(f)
+ if err := format.Node(bw, fset, file); err != nil {
+ log.Fatalf("can't format output: %v", err)
+ }
+ if err := bw.Flush(); err != nil {
+ log.Fatalf("can't write output: %v", err)
+ }
+ if err := f.Close(); err != nil {
+ log.Fatalf("can't write output: %v", err)
+ }
+}
+
+// unusedInspector can be used to detect unused variables and imports in an
+// ast.Node via its node method. The result is available in the "unused" map.
+//
+// note that unusedInspector is lazy and best-effort; it only supports the node
+// types and patterns used by the rulegen program.
+type unusedInspector struct {
+ // scope is the current scope, which can never be nil when a declaration
+ // is encountered. That is, the unusedInspector.node entrypoint should
+ // generally be an entire file or block.
+ scope *scope
+
+ // unused is the resulting set of unused declared names, indexed by the
+ // starting position of the node that declared the name.
+ unused map[token.Pos]bool
+
+ // defining is the object currently being defined; this is useful so
+ // that if "foo := bar" is unused and removed, we can then detect if
+ // "bar" becomes unused as well.
+ defining *object
+}
+
+// scoped opens a new scope when called, and returns a function which closes
+// that same scope. When a scope is closed, unused variables are recorded.
+func (u *unusedInspector) scoped() func() {
+ outer := u.scope
+ u.scope = &scope{outer: outer, objects: map[string]*object{}}
+ return func() {
+ for anyUnused := true; anyUnused; {
+ anyUnused = false
+ for _, obj := range u.scope.objects {
+ if obj.numUses > 0 {
+ continue
+ }
+ u.unused[obj.pos] = true
+ for _, used := range obj.used {
+ if used.numUses--; used.numUses == 0 {
+ anyUnused = true
+ }
+ }
+ // We've decremented numUses for each of the
+ // objects in used. Zero this slice too, to keep
+ // everything consistent.
+ obj.used = nil
+ }
+ }
+ u.scope = outer
+ }
+}
+
+func (u *unusedInspector) exprs(list []ast.Expr) {
+ for _, x := range list {
+ u.node(x)
+ }
+}
+
+func (u *unusedInspector) node(node ast.Node) {
+ switch node := node.(type) {
+ case *ast.File:
+ defer u.scoped()()
+ for _, decl := range node.Decls {
+ u.node(decl)
+ }
+ case *ast.GenDecl:
+ for _, spec := range node.Specs {
+ u.node(spec)
+ }
+ case *ast.ImportSpec:
+ impPath, _ := strconv.Unquote(node.Path.Value)
+ name := path.Base(impPath)
+ u.scope.objects[name] = &object{
+ name: name,
+ pos: node.Pos(),
+ }
+ case *ast.FuncDecl:
+ u.node(node.Type)
+ if node.Body != nil {
+ u.node(node.Body)
+ }
+ case *ast.FuncType:
+ if node.Params != nil {
+ u.node(node.Params)
+ }
+ if node.Results != nil {
+ u.node(node.Results)
+ }
+ case *ast.FieldList:
+ for _, field := range node.List {
+ u.node(field)
+ }
+ case *ast.Field:
+ u.node(node.Type)
+
+ // statements
+
+ case *ast.BlockStmt:
+ defer u.scoped()()
+ for _, stmt := range node.List {
+ u.node(stmt)
+ }
+ case *ast.DeclStmt:
+ u.node(node.Decl)
+ case *ast.IfStmt:
+ if node.Init != nil {
+ u.node(node.Init)
+ }
+ u.node(node.Cond)
+ u.node(node.Body)
+ if node.Else != nil {
+ u.node(node.Else)
+ }
+ case *ast.ForStmt:
+ if node.Init != nil {
+ u.node(node.Init)
+ }
+ if node.Cond != nil {
+ u.node(node.Cond)
+ }
+ if node.Post != nil {
+ u.node(node.Post)
+ }
+ u.node(node.Body)
+ case *ast.SwitchStmt:
+ if node.Init != nil {
+ u.node(node.Init)
+ }
+ if node.Tag != nil {
+ u.node(node.Tag)
+ }
+ u.node(node.Body)
+ case *ast.CaseClause:
+ u.exprs(node.List)
+ defer u.scoped()()
+ for _, stmt := range node.Body {
+ u.node(stmt)
+ }
+ case *ast.BranchStmt:
+ case *ast.ExprStmt:
+ u.node(node.X)
+ case *ast.AssignStmt:
+ if node.Tok != token.DEFINE {
+ u.exprs(node.Rhs)
+ u.exprs(node.Lhs)
+ break
+ }
+ lhs := node.Lhs
+ if len(lhs) == 2 && lhs[1].(*ast.Ident).Name == "_" {
+ lhs = lhs[:1]
+ }
+ if len(lhs) != 1 {
+ panic("no support for := with multiple names")
+ }
+
+ name := lhs[0].(*ast.Ident)
+ obj := &object{
+ name: name.Name,
+ pos: name.NamePos,
+ }
+
+ old := u.defining
+ u.defining = obj
+ u.exprs(node.Rhs)
+ u.defining = old
+
+ u.scope.objects[name.Name] = obj
+ case *ast.ReturnStmt:
+ u.exprs(node.Results)
+ case *ast.IncDecStmt:
+ u.node(node.X)
+
+ // expressions
+
+ case *ast.CallExpr:
+ u.node(node.Fun)
+ u.exprs(node.Args)
+ case *ast.SelectorExpr:
+ u.node(node.X)
+ case *ast.UnaryExpr:
+ u.node(node.X)
+ case *ast.BinaryExpr:
+ u.node(node.X)
+ u.node(node.Y)
+ case *ast.StarExpr:
+ u.node(node.X)
+ case *ast.ParenExpr:
+ u.node(node.X)
+ case *ast.IndexExpr:
+ u.node(node.X)
+ u.node(node.Index)
+ case *ast.TypeAssertExpr:
+ u.node(node.X)
+ u.node(node.Type)
+ case *ast.Ident:
+ if obj := u.scope.Lookup(node.Name); obj != nil {
+ obj.numUses++
+ if u.defining != nil {
+ u.defining.used = append(u.defining.used, obj)
+ }
+ }
+ case *ast.BasicLit:
+ case *ast.ValueSpec:
+ u.exprs(node.Values)
+ default:
+ panic(fmt.Sprintf("unhandled node: %T", node))
+ }
+}
+
+// scope keeps track of a certain scope and its declared names, as well as the
+// outer (parent) scope.
+type scope struct {
+ outer *scope // can be nil, if this is the top-level scope
+ objects map[string]*object // indexed by each declared name
+}
+
+func (s *scope) Lookup(name string) *object {
+ if obj := s.objects[name]; obj != nil {
+ return obj
+ }
+ if s.outer == nil {
+ return nil
+ }
+ return s.outer.Lookup(name)
+}
+
+// object keeps track of a declared name, such as a variable or import.
+type object struct {
+ name string
+ pos token.Pos // start position of the node declaring the object
+
+ numUses int // number of times this object is used
+ used []*object // objects that its declaration makes use of
+}
+
+func fprint(w io.Writer, n Node) {
+ switch n := n.(type) {
+ case *File:
+ file := n
+ seenRewrite := make(map[[3]string]string)
+ fmt.Fprintf(w, "// Code generated from gen/%s%s.rules; DO NOT EDIT.\n", n.Arch.name, n.Suffix)
+ fmt.Fprintf(w, "// generated with: cd gen; go run *.go\n")
+ fmt.Fprintf(w, "\npackage ssa\n")
+ for _, path := range append([]string{
+ "fmt",
+ "math",
+ "cmd/internal/obj",
+ "cmd/internal/objabi",
+ "cmd/compile/internal/types",
+ }, n.Arch.imports...) {
+ fmt.Fprintf(w, "import %q\n", path)
+ }
+ for _, f := range n.List {
+ f := f.(*Func)
+ fmt.Fprintf(w, "func rewrite%s%s%s%s(", f.Kind, n.Arch.name, n.Suffix, f.Suffix)
+ fmt.Fprintf(w, "%c *%s) bool {\n", strings.ToLower(f.Kind)[0], f.Kind)
+ if f.Kind == "Value" && f.ArgLen > 0 {
+ for i := f.ArgLen - 1; i >= 0; i-- {
+ fmt.Fprintf(w, "v_%d := v.Args[%d]\n", i, i)
+ }
+ }
+ for _, n := range f.List {
+ fprint(w, n)
+
+ if rr, ok := n.(*RuleRewrite); ok {
+ k := [3]string{
+ normalizeMatch(rr.Match, file.Arch),
+ normalizeWhitespace(rr.Cond),
+ normalizeWhitespace(rr.Result),
+ }
+ if prev, ok := seenRewrite[k]; ok {
+ log.Fatalf("duplicate rule %s, previously seen at %s\n", rr.Loc, prev)
+ }
+ seenRewrite[k] = rr.Loc
+ }
+ }
+ fmt.Fprintf(w, "}\n")
+ }
+ case *Switch:
+ fmt.Fprintf(w, "switch ")
+ fprint(w, n.Expr)
+ fmt.Fprintf(w, " {\n")
+ for _, n := range n.List {
+ fprint(w, n)
+ }
+ fmt.Fprintf(w, "}\n")
+ case *Case:
+ fmt.Fprintf(w, "case ")
+ fprint(w, n.Expr)
+ fmt.Fprintf(w, ":\n")
+ for _, n := range n.List {
+ fprint(w, n)
+ }
+ case *RuleRewrite:
+ if *addLine {
+ fmt.Fprintf(w, "// %s\n", n.Loc)
+ }
+ fmt.Fprintf(w, "// match: %s\n", n.Match)
+ if n.Cond != "" {
+ fmt.Fprintf(w, "// cond: %s\n", n.Cond)
+ }
+ fmt.Fprintf(w, "// result: %s\n", n.Result)
+ fmt.Fprintf(w, "for %s {\n", n.Check)
+ nCommutative := 0
+ for _, n := range n.List {
+ if b, ok := n.(*CondBreak); ok {
+ b.InsideCommuteLoop = nCommutative > 0
+ }
+ fprint(w, n)
+ if loop, ok := n.(StartCommuteLoop); ok {
+ if nCommutative != loop.Depth {
+ panic("mismatch commute loop depth")
+ }
+ nCommutative++
+ }
+ }
+ fmt.Fprintf(w, "return true\n")
+ for i := 0; i < nCommutative; i++ {
+ fmt.Fprintln(w, "}")
+ }
+ if n.CommuteDepth > 0 && n.CanFail {
+ fmt.Fprint(w, "break\n")
+ }
+ fmt.Fprintf(w, "}\n")
+ case *Declare:
+ fmt.Fprintf(w, "%s := ", n.Name)
+ fprint(w, n.Value)
+ fmt.Fprintln(w)
+ case *CondBreak:
+ fmt.Fprintf(w, "if ")
+ fprint(w, n.Cond)
+ fmt.Fprintf(w, " {\n")
+ if n.InsideCommuteLoop {
+ fmt.Fprintf(w, "continue")
+ } else {
+ fmt.Fprintf(w, "break")
+ }
+ fmt.Fprintf(w, "\n}\n")
+ case ast.Node:
+ printConfig.Fprint(w, emptyFset, n)
+ if _, ok := n.(ast.Stmt); ok {
+ fmt.Fprintln(w)
+ }
+ case StartCommuteLoop:
+ fmt.Fprintf(w, "for _i%[1]d := 0; _i%[1]d <= 1; _i%[1]d, %[2]s_0, %[2]s_1 = _i%[1]d + 1, %[2]s_1, %[2]s_0 {\n", n.Depth, n.V)
+ default:
+ log.Fatalf("cannot print %T", n)
+ }
+}
+
+var printConfig = printer.Config{
+ Mode: printer.RawFormat, // we use go/format later, so skip work here
+}
+
+var emptyFset = token.NewFileSet()
+
+// Node can be a Statement or an ast.Expr.
+type Node interface{}
+
+// Statement can be one of our high-level statement struct types, or an
+// ast.Stmt under some limited circumstances.
+type Statement interface{}
+
+// BodyBase is shared by all of our statement pseudo-node types which can
+// contain other statements.
+type BodyBase struct {
+ List []Statement
+ CanFail bool
+}
+
+func (w *BodyBase) add(node Statement) {
+ var last Statement
+ if len(w.List) > 0 {
+ last = w.List[len(w.List)-1]
+ }
+ if node, ok := node.(*CondBreak); ok {
+ w.CanFail = true
+ if last, ok := last.(*CondBreak); ok {
+ // Add to the previous "if <cond> { break }" via a
+ // logical OR, which reduces verbosity.
+ last.Cond = &ast.BinaryExpr{
+ Op: token.LOR,
+ X: last.Cond,
+ Y: node.Cond,
+ }
+ return
+ }
+ }
+
+ w.List = append(w.List, node)
+}
+
+// predeclared contains globally known tokens that should not be redefined.
+var predeclared = map[string]bool{
+ "nil": true,
+ "false": true,
+ "true": true,
+}
+
+// declared reports if the body contains a Declare with the given name.
+func (w *BodyBase) declared(name string) bool {
+ if predeclared[name] {
+ // Treat predeclared names as having already been declared.
+ // This lets us use nil to match an aux field or
+ // true and false to match an auxint field.
+ return true
+ }
+ for _, s := range w.List {
+ if decl, ok := s.(*Declare); ok && decl.Name == name {
+ return true
+ }
+ }
+ return false
+}
+
+// These types define some high-level statement struct types, which can be used
+// as a Statement. This allows us to keep some node structs simpler, and have
+// higher-level nodes such as an entire rule rewrite.
+//
+// Note that ast.Expr is always used as-is; we don't declare our own expression
+// nodes.
+type (
+ File struct {
+ BodyBase // []*Func
+ Arch arch
+ Suffix string
+ }
+ Func struct {
+ BodyBase
+ Kind string // "Value" or "Block"
+ Suffix string
+ ArgLen int32 // if kind == "Value", number of args for this op
+ }
+ Switch struct {
+ BodyBase // []*Case
+ Expr ast.Expr
+ }
+ Case struct {
+ BodyBase
+ Expr ast.Expr
+ }
+ RuleRewrite struct {
+ BodyBase
+ Match, Cond, Result string // top comments
+ Check string // top-level boolean expression
+
+ Alloc int // for unique var names
+ Loc string // file name & line number of the original rule
+ CommuteDepth int // used to track depth of commute loops
+ }
+ Declare struct {
+ Name string
+ Value ast.Expr
+ }
+ CondBreak struct {
+ Cond ast.Expr
+ InsideCommuteLoop bool
+ }
+ StartCommuteLoop struct {
+ Depth int
+ V string
+ }
+)
+
+// exprf parses a Go expression generated from fmt.Sprintf, exiting with a
+// fatal error if parsing fails.
+func exprf(format string, a ...interface{}) ast.Expr {
+ src := fmt.Sprintf(format, a...)
+ expr, err := parser.ParseExpr(src)
+ if err != nil {
+ log.Fatalf("expr parse error on %q: %v", src, err)
+ }
+ return expr
+}
+
+// stmtf parses a Go statement generated from fmt.Sprintf. This function is only
+// meant for simple statements that don't have a custom Statement node declared
+// in this package, such as ast.ReturnStmt or ast.ExprStmt.
+func stmtf(format string, a ...interface{}) Statement {
+ src := fmt.Sprintf(format, a...)
+ fsrc := "package p\nfunc _() {\n" + src + "\n}\n"
+ file, err := parser.ParseFile(token.NewFileSet(), "", fsrc, 0)
+ if err != nil {
+ log.Fatalf("stmt parse error on %q: %v", src, err)
+ }
+ return file.Decls[0].(*ast.FuncDecl).Body.List[0]
+}
+
+// declf constructs a simple "name := value" declaration, using exprf for its
+// value.
+func declf(name, format string, a ...interface{}) *Declare {
+ return &Declare{name, exprf(format, a...)}
+}
+
+// breakf constructs a simple "if cond { break }" statement, using exprf for its
+// condition.
+func breakf(format string, a ...interface{}) *CondBreak {
+ return &CondBreak{Cond: exprf(format, a...)}
+}
+
+func genBlockRewrite(rule Rule, arch arch, data blockData) *RuleRewrite {
+ rr := &RuleRewrite{Loc: rule.Loc}
+ rr.Match, rr.Cond, rr.Result = rule.parse()
+ _, _, auxint, aux, s := extract(rr.Match) // remove parens, then split
+
+ // check match of control values
+ if len(s) < data.controls {
+ log.Fatalf("incorrect number of arguments in %s, got %v wanted at least %v", rule, len(s), data.controls)
+ }
+ controls := s[:data.controls]
+ pos := make([]string, data.controls)
+ for i, arg := range controls {
+ cname := fmt.Sprintf("b.Controls[%v]", i)
+ if strings.Contains(arg, "(") {
+ vname, expr := splitNameExpr(arg)
+ if vname == "" {
+ vname = fmt.Sprintf("v_%v", i)
+ }
+ rr.add(declf(vname, cname))
+ p, op := genMatch0(rr, arch, expr, vname, nil, false) // TODO: pass non-nil cnt?
+ if op != "" {
+ check := fmt.Sprintf("%s.Op == %s", cname, op)
+ if rr.Check == "" {
+ rr.Check = check
+ } else {
+ rr.Check += " && " + check
+ }
+ }
+ if p == "" {
+ p = vname + ".Pos"
+ }
+ pos[i] = p
+ } else {
+ rr.add(declf(arg, cname))
+ pos[i] = arg + ".Pos"
+ }
+ }
+ for _, e := range []struct {
+ name, field, dclType string
+ }{
+ {auxint, "AuxInt", data.auxIntType()},
+ {aux, "Aux", data.auxType()},
+ } {
+ if e.name == "" {
+ continue
+ }
+
+ if e.dclType == "" {
+ log.Fatalf("op %s has no declared type for %s", data.name, e.field)
+ }
+ if !token.IsIdentifier(e.name) || rr.declared(e.name) {
+ rr.add(breakf("%sTo%s(b.%s) != %s", unTitle(e.field), title(e.dclType), e.field, e.name))
+ } else {
+ rr.add(declf(e.name, "%sTo%s(b.%s)", unTitle(e.field), title(e.dclType), e.field))
+ }
+ }
+ if rr.Cond != "" {
+ rr.add(breakf("!(%s)", rr.Cond))
+ }
+
+ // Rule matches. Generate result.
+ outop, _, auxint, aux, t := extract(rr.Result) // remove parens, then split
+ blockName, outdata := getBlockInfo(outop, arch)
+ if len(t) < outdata.controls {
+ log.Fatalf("incorrect number of output arguments in %s, got %v wanted at least %v", rule, len(s), outdata.controls)
+ }
+
+ // Check if newsuccs is the same set as succs.
+ succs := s[data.controls:]
+ newsuccs := t[outdata.controls:]
+ m := map[string]bool{}
+ for _, succ := range succs {
+ if m[succ] {
+ log.Fatalf("can't have a repeat successor name %s in %s", succ, rule)
+ }
+ m[succ] = true
+ }
+ for _, succ := range newsuccs {
+ if !m[succ] {
+ log.Fatalf("unknown successor %s in %s", succ, rule)
+ }
+ delete(m, succ)
+ }
+ if len(m) != 0 {
+ log.Fatalf("unmatched successors %v in %s", m, rule)
+ }
+
+ var genControls [2]string
+ for i, control := range t[:outdata.controls] {
+ // Select a source position for any new control values.
+ // TODO: does it always make sense to use the source position
+ // of the original control values or should we be using the
+ // block's source position in some cases?
+ newpos := "b.Pos" // default to block's source position
+ if i < len(pos) && pos[i] != "" {
+ // Use the previous control value's source position.
+ newpos = pos[i]
+ }
+
+ // Generate a new control value (or copy an existing value).
+ genControls[i] = genResult0(rr, arch, control, false, false, newpos, nil)
+ }
+ switch outdata.controls {
+ case 0:
+ rr.add(stmtf("b.Reset(%s)", blockName))
+ case 1:
+ rr.add(stmtf("b.resetWithControl(%s, %s)", blockName, genControls[0]))
+ case 2:
+ rr.add(stmtf("b.resetWithControl2(%s, %s, %s)", blockName, genControls[0], genControls[1]))
+ default:
+ log.Fatalf("too many controls: %d", outdata.controls)
+ }
+
+ if auxint != "" {
+ // Make sure auxint value has the right type.
+ rr.add(stmtf("b.AuxInt = %sToAuxInt(%s)", unTitle(outdata.auxIntType()), auxint))
+ }
+ if aux != "" {
+ // Make sure aux value has the right type.
+ rr.add(stmtf("b.Aux = %sToAux(%s)", unTitle(outdata.auxType()), aux))
+ }
+
+ succChanged := false
+ for i := 0; i < len(succs); i++ {
+ if succs[i] != newsuccs[i] {
+ succChanged = true
+ }
+ }
+ if succChanged {
+ if len(succs) != 2 {
+ log.Fatalf("changed successors, len!=2 in %s", rule)
+ }
+ if succs[0] != newsuccs[1] || succs[1] != newsuccs[0] {
+ log.Fatalf("can only handle swapped successors in %s", rule)
+ }
+ rr.add(stmtf("b.swapSuccessors()"))
+ }
+
+ if *genLog {
+ rr.add(stmtf("logRule(%q)", rule.Loc))
+ }
+ return rr
+}
+
+// genMatch returns the variable whose source position should be used for the
+// result (or "" if no opinion), and the Op name that the top-level match checks for.
+func genMatch(rr *RuleRewrite, arch arch, match string, pregenTop bool) (pos, checkOp string) {
+ cnt := varCount(rr)
+ return genMatch0(rr, arch, match, "v", cnt, pregenTop)
+}
+
+func genMatch0(rr *RuleRewrite, arch arch, match, v string, cnt map[string]int, pregenTop bool) (pos, checkOp string) {
+ if match[0] != '(' || match[len(match)-1] != ')' {
+ log.Fatalf("%s: non-compound expr in genMatch0: %q", rr.Loc, match)
+ }
+ op, oparch, typ, auxint, aux, args := parseValue(match, arch, rr.Loc)
+
+ checkOp = fmt.Sprintf("Op%s%s", oparch, op.name)
+
+ if op.faultOnNilArg0 || op.faultOnNilArg1 {
+ // Prefer the position of an instruction which could fault.
+ pos = v + ".Pos"
+ }
+
+ // If the last argument is ___, it means "don't care about trailing arguments, really".
+ // The likely/intended use is for rewrites that are too tricky to express in the existing pattern language.
+ // Do a length check early, because long patterns fed short (ultimately not-matching) inputs
+ // would otherwise cause an indexing error during pattern matching.
+ if op.argLength == -1 {
+ l := len(args)
+ if l == 0 || args[l-1] != "___" {
+ rr.add(breakf("len(%s.Args) != %d", v, l))
+ } else if l > 1 && args[l-1] == "___" {
+ rr.add(breakf("len(%s.Args) < %d", v, l-1))
+ }
+ }
+
+ for _, e := range []struct {
+ name, field, dclType string
+ }{
+ {typ, "Type", "*types.Type"},
+ {auxint, "AuxInt", op.auxIntType()},
+ {aux, "Aux", op.auxType()},
+ } {
+ if e.name == "" {
+ continue
+ }
+
+ if e.dclType == "" {
+ log.Fatalf("op %s has no declared type for %s", op.name, e.field)
+ }
+ if !token.IsIdentifier(e.name) || rr.declared(e.name) {
+ switch e.field {
+ case "Aux":
+ rr.add(breakf("auxTo%s(%s.%s) != %s", title(e.dclType), v, e.field, e.name))
+ case "AuxInt":
+ rr.add(breakf("auxIntTo%s(%s.%s) != %s", title(e.dclType), v, e.field, e.name))
+ case "Type":
+ rr.add(breakf("%s.%s != %s", v, e.field, e.name))
+ }
+ } else {
+ switch e.field {
+ case "Aux":
+ rr.add(declf(e.name, "auxTo%s(%s.%s)", title(e.dclType), v, e.field))
+ case "AuxInt":
+ rr.add(declf(e.name, "auxIntTo%s(%s.%s)", title(e.dclType), v, e.field))
+ case "Type":
+ rr.add(declf(e.name, "%s.%s", v, e.field))
+ }
+ }
+ }
+
+ commutative := op.commutative
+ if commutative {
+ if args[0] == args[1] {
+ // When we have (Add x x), for any x,
+ // even if there are other uses of x besides these two,
+ // and even if x is not a variable,
+ // we can skip the commutative match.
+ commutative = false
+ }
+ if cnt[args[0]] == 1 && cnt[args[1]] == 1 {
+ // When we have (Add x y) with no other uses
+ // of x and y in the matching rule and condition,
+ // then we can skip the commutative match (Add y x).
+ commutative = false
+ }
+ }
+
+ if !pregenTop {
+ // Access last argument first to minimize bounds checks.
+ for n := len(args) - 1; n > 0; n-- {
+ a := args[n]
+ if a == "_" {
+ continue
+ }
+ if !rr.declared(a) && token.IsIdentifier(a) && !(commutative && len(args) == 2) {
+ rr.add(declf(a, "%s.Args[%d]", v, n))
+ // delete the last argument so it is not reprocessed
+ args = args[:n]
+ } else {
+ rr.add(stmtf("_ = %s.Args[%d]", v, n))
+ }
+ break
+ }
+ }
+ if commutative && !pregenTop {
+ for i := 0; i <= 1; i++ {
+ vname := fmt.Sprintf("%s_%d", v, i)
+ rr.add(declf(vname, "%s.Args[%d]", v, i))
+ }
+ }
+ if commutative {
+ rr.add(StartCommuteLoop{rr.CommuteDepth, v})
+ rr.CommuteDepth++
+ }
+ for i, arg := range args {
+ if arg == "_" {
+ continue
+ }
+ var rhs string
+ if (commutative && i < 2) || pregenTop {
+ rhs = fmt.Sprintf("%s_%d", v, i)
+ } else {
+ rhs = fmt.Sprintf("%s.Args[%d]", v, i)
+ }
+ if !strings.Contains(arg, "(") {
+ // leaf variable
+ if rr.declared(arg) {
+ // variable already has a definition. Check whether
+ // the old definition and the new definition match.
+ // For example, (add x x). Equality is just pointer equality
+ // on Values (so cse is important to do before lowering).
+ rr.add(breakf("%s != %s", arg, rhs))
+ } else {
+ if arg != rhs {
+ rr.add(declf(arg, "%s", rhs))
+ }
+ }
+ continue
+ }
+ // compound sexpr
+ argname, expr := splitNameExpr(arg)
+ if argname == "" {
+ argname = fmt.Sprintf("%s_%d", v, i)
+ }
+ if argname == "b" {
+ log.Fatalf("don't name args 'b', it is ambiguous with blocks")
+ }
+
+ if argname != rhs {
+ rr.add(declf(argname, "%s", rhs))
+ }
+ bexpr := exprf("%s.Op != addLater", argname)
+ rr.add(&CondBreak{Cond: bexpr})
+ argPos, argCheckOp := genMatch0(rr, arch, expr, argname, cnt, false)
+ bexpr.(*ast.BinaryExpr).Y.(*ast.Ident).Name = argCheckOp
+
+ if argPos != "" {
+ // Keep the argument in preference to the parent, as the
+ // argument is normally earlier in program flow.
+ // Keep the argument in preference to an earlier argument,
+ // as that prefers the memory argument which is also earlier
+ // in the program flow.
+ pos = argPos
+ }
+ }
+
+ return pos, checkOp
+}
+
+func genResult(rr *RuleRewrite, arch arch, result, pos string) {
+ move := result[0] == '@'
+ if move {
+ // parse @block directive
+ s := strings.SplitN(result[1:], " ", 2)
+ rr.add(stmtf("b = %s", s[0]))
+ result = s[1]
+ }
+ cse := make(map[string]string)
+ genResult0(rr, arch, result, true, move, pos, cse)
+}
+
+func genResult0(rr *RuleRewrite, arch arch, result string, top, move bool, pos string, cse map[string]string) string {
+ resname, expr := splitNameExpr(result)
+ result = expr
+ // TODO: when generating a constant result, use f.constVal to avoid
+ // introducing copies just to clean them up again.
+ if result[0] != '(' {
+ // variable
+ if top {
+ // It is not safe in general to move a variable between blocks
+ // (and particularly not a phi node).
+ // Introduce a copy.
+ rr.add(stmtf("v.copyOf(%s)", result))
+ }
+ return result
+ }
+
+ w := normalizeWhitespace(result)
+ if prev := cse[w]; prev != "" {
+ return prev
+ }
+
+ op, oparch, typ, auxint, aux, args := parseValue(result, arch, rr.Loc)
+
+ // Find the type of the variable.
+ typeOverride := typ != ""
+ if typ == "" && op.typ != "" {
+ typ = typeName(op.typ)
+ }
+
+ v := "v"
+ if top && !move {
+ rr.add(stmtf("v.reset(Op%s%s)", oparch, op.name))
+ if typeOverride {
+ rr.add(stmtf("v.Type = %s", typ))
+ }
+ } else {
+ if typ == "" {
+ log.Fatalf("sub-expression %s (op=Op%s%s) at %s must have a type", result, oparch, op.name, rr.Loc)
+ }
+ if resname == "" {
+ v = fmt.Sprintf("v%d", rr.Alloc)
+ } else {
+ v = resname
+ }
+ rr.Alloc++
+ rr.add(declf(v, "b.NewValue0(%s, Op%s%s, %s)", pos, oparch, op.name, typ))
+ if move && top {
+ // Rewrite original into a copy
+ rr.add(stmtf("v.copyOf(%s)", v))
+ }
+ }
+
+ if auxint != "" {
+ // Make sure auxint value has the right type.
+ rr.add(stmtf("%s.AuxInt = %sToAuxInt(%s)", v, unTitle(op.auxIntType()), auxint))
+ }
+ if aux != "" {
+ // Make sure aux value has the right type.
+ rr.add(stmtf("%s.Aux = %sToAux(%s)", v, unTitle(op.auxType()), aux))
+ }
+ all := new(strings.Builder)
+ for i, arg := range args {
+ x := genResult0(rr, arch, arg, false, move, pos, cse)
+ if i > 0 {
+ all.WriteString(", ")
+ }
+ all.WriteString(x)
+ }
+ switch len(args) {
+ case 0:
+ case 1:
+ rr.add(stmtf("%s.AddArg(%s)", v, all.String()))
+ default:
+ rr.add(stmtf("%s.AddArg%d(%s)", v, len(args), all.String()))
+ }
+
+ if cse != nil {
+ cse[w] = v
+ }
+ return v
+}
+
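+// split breaks s into top-level fields separated by spaces or tabs, ignoring
+// separators that appear nested inside (), <>, [], or {} groups.
+// For example (illustrative), split("Add64 <t> [c] {sym} x y") returns
+// ["Add64", "<t>", "[c]", "{sym}", "x", "y"].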
+func split(s string) []string {
+ var r []string
+
+outer:
+ for s != "" {
+ d := 0 // depth of ({[<
+ var open, close byte // opening and closing markers ({[< or )}]>
+ nonsp := false // found a non-space char so far
+ for i := 0; i < len(s); i++ {
+ switch {
+ case d == 0 && s[i] == '(':
+ open, close = '(', ')'
+ d++
+ case d == 0 && s[i] == '<':
+ open, close = '<', '>'
+ d++
+ case d == 0 && s[i] == '[':
+ open, close = '[', ']'
+ d++
+ case d == 0 && s[i] == '{':
+ open, close = '{', '}'
+ d++
+ case d == 0 && (s[i] == ' ' || s[i] == '\t'):
+ if nonsp {
+ r = append(r, strings.TrimSpace(s[:i]))
+ s = s[i:]
+ continue outer
+ }
+ case d > 0 && s[i] == open:
+ d++
+ case d > 0 && s[i] == close:
+ d--
+ default:
+ nonsp = true
+ }
+ }
+ if d != 0 {
+ log.Fatalf("imbalanced expression: %q", s)
+ }
+ if nonsp {
+ r = append(r, strings.TrimSpace(s))
+ }
+ break
+ }
+ return r
+}
+
+// isBlock reports whether this op is a block opcode.
+func isBlock(name string, arch arch) bool {
+ for _, b := range genericBlocks {
+ if b.name == name {
+ return true
+ }
+ }
+ for _, b := range arch.blocks {
+ if b.name == name {
+ return true
+ }
+ }
+ return false
+}
+
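+// extract splits a parenthesized value into its opcode and the unparsed type,
+// auxint, and aux restrictions plus args. For example (illustrative),
+// "(Op <t> [c] {sym} x y)" yields op "Op", typ "t", auxint "c", aux "sym",
+// and args ["x", "y"].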
+func extract(val string) (op, typ, auxint, aux string, args []string) {
+ val = val[1 : len(val)-1] // remove ()
+
+ // Split val up into regions.
+ // Split by spaces/tabs, except those contained in (), {}, [], or <>.
+ s := split(val)
+
+ // Extract restrictions and args.
+ op = s[0]
+ for _, a := range s[1:] {
+ switch a[0] {
+ case '<':
+ typ = a[1 : len(a)-1] // remove <>
+ case '[':
+ auxint = a[1 : len(a)-1] // remove []
+ case '{':
+ aux = a[1 : len(a)-1] // remove {}
+ default:
+ args = append(args, a)
+ }
+ }
+ return
+}
+
+// parseValue parses a parenthesized value from a rule.
+// The value can be from the match or the result side.
+// It returns the op and unparsed strings for typ, auxint, and aux restrictions and for all args.
+// oparch is the architecture that op is located in, or "" for generic.
+func parseValue(val string, arch arch, loc string) (op opData, oparch, typ, auxint, aux string, args []string) {
+ // Resolve the op.
+ var s string
+ s, typ, auxint, aux, args = extract(val)
+
+ // match reports whether x is a good op to select.
+ // If strict is true, rule generation might succeed.
+ // If strict is false, rule generation has failed,
+ // but we're trying to generate a useful error.
+ // Doing strict=true then strict=false allows
+ // precise op matching while retaining good error messages.
+ match := func(x opData, strict bool, archname string) bool {
+ if x.name != s {
+ return false
+ }
+ if x.argLength != -1 && int(x.argLength) != len(args) && (len(args) != 1 || args[0] != "...") {
+ if strict {
+ return false
+ }
+ log.Printf("%s: op %s (%s) should have %d args, has %d", loc, s, archname, x.argLength, len(args))
+ }
+ return true
+ }
+
+ for _, x := range genericOps {
+ if match(x, true, "generic") {
+ op = x
+ break
+ }
+ }
+ for _, x := range arch.ops {
+ if arch.name != "generic" && match(x, true, arch.name) {
+ if op.name != "" {
+ log.Fatalf("%s: matches for op %s found in both generic and %s", loc, op.name, arch.name)
+ }
+ op = x
+ oparch = arch.name
+ break
+ }
+ }
+
+ if op.name == "" {
+ // Failed to find the op.
+ // Run through everything again with strict=false
+ // to generate useful diagnostic messages before failing.
+ for _, x := range genericOps {
+ match(x, false, "generic")
+ }
+ for _, x := range arch.ops {
+ match(x, false, arch.name)
+ }
+ log.Fatalf("%s: unknown op %s", loc, s)
+ }
+
+ // Sanity check aux, auxint.
+ if auxint != "" && !opHasAuxInt(op) {
+ log.Fatalf("%s: op %s %s can't have auxint", loc, op.name, op.aux)
+ }
+ if aux != "" && !opHasAux(op) {
+ log.Fatalf("%s: op %s %s can't have aux", loc, op.name, op.aux)
+ }
+ return
+}
+
+func opHasAuxInt(op opData) bool {
+ switch op.aux {
+ case "Bool", "Int8", "Int16", "Int32", "Int64", "Int128", "UInt8", "Float32", "Float64",
+ "SymOff", "CallOff", "SymValAndOff", "TypSize", "ARM64BitField", "FlagConstant", "CCop":
+ return true
+ }
+ return false
+}
+
+func opHasAux(op opData) bool {
+ switch op.aux {
+ case "String", "Sym", "SymOff", "Call", "CallOff", "SymValAndOff", "Typ", "TypSize",
+ "S390XCCMask", "S390XRotateParams":
+ return true
+ }
+ return false
+}
+
+// splitNameExpr splits s-expr arg, possibly prefixed by "name:",
+// into name and the unprefixed expression.
+// For example, "x:(Foo)" yields "x", "(Foo)",
+// and "(Foo)" yields "", "(Foo)".
+func splitNameExpr(arg string) (name, expr string) {
+ colon := strings.Index(arg, ":")
+ if colon < 0 {
+ return "", arg
+ }
+ openparen := strings.Index(arg, "(")
+ if openparen < 0 {
+ log.Fatalf("splitNameExpr(%q): colon but no open parens", arg)
+ }
+ if colon > openparen {
+ // colon is inside the parens, such as in "(Foo x:(Bar))".
+ return "", arg
+ }
+ return arg[:colon], arg[colon+1:]
+}
+
+func getBlockInfo(op string, arch arch) (name string, data blockData) {
+ for _, b := range genericBlocks {
+ if b.name == op {
+ return "Block" + op, b
+ }
+ }
+ for _, b := range arch.blocks {
+ if b.name == op {
+ return "Block" + arch.name + op, b
+ }
+ }
+ log.Fatalf("could not find block data for %s", op)
+ panic("unreachable")
+}
+
+// typeName returns the string to use to generate a type.
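+// For example (illustrative), typeName("UInt32") yields "typ.UInt32", and
+// typeName("(Int64,Flags)") yields "types.NewTuple(typ.Int64, types.TypeFlags)".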
+func typeName(typ string) string {
+ if typ[0] == '(' {
+ ts := strings.Split(typ[1:len(typ)-1], ",")
+ if len(ts) != 2 {
+ log.Fatalf("Tuple expect 2 arguments")
+ }
+ return "types.NewTuple(" + typeName(ts[0]) + ", " + typeName(ts[1]) + ")"
+ }
+ switch typ {
+ case "Flags", "Mem", "Void", "Int128":
+ return "types.Type" + typ
+ default:
+ return "typ." + typ
+ }
+}
+
+// balance returns the number of unclosed '(' characters in s.
+// If a ')' appears without a corresponding '(', balance returns -1.
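+// For example (illustrative), balance("(Add64 (Const64") returns 2,
+// while balance(")(") returns -1.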
+func balance(s string) int {
+ balance := 0
+ for _, c := range s {
+ switch c {
+ case '(':
+ balance++
+ case ')':
+ balance--
+ if balance < 0 {
+ // don't allow ")(" to return 0
+ return -1
+ }
+ }
+ }
+ return balance
+}
+
+// findAllOpcode finds each parenthesized, |-separated opcode fragment of the s-expressions in a rule, returning index pairs.
+var findAllOpcode = regexp.MustCompile(`[(](\w+[|])+\w+[)]`).FindAllStringIndex
+
+// excludeFromExpansion reports whether the substring s[idx[0]:idx[1]] in a rule
+// should be disregarded as a candidate for | expansion.
+// It uses simple syntactic checks to see whether the substring
+// is inside an AuxInt expression or inside the && conditions.
+func excludeFromExpansion(s string, idx []int) bool {
+ left := s[:idx[0]]
+ if strings.LastIndexByte(left, '[') > strings.LastIndexByte(left, ']') {
+ // Inside an AuxInt expression.
+ return true
+ }
+ right := s[idx[1]:]
+ if strings.Contains(left, "&&") && strings.Contains(right, "=>") {
+ // Inside && conditions.
+ return true
+ }
+ return false
+}
+
+// expandOr converts a rule into multiple rules by expanding | ops.
+func expandOr(r string) []string {
+ // Find every occurrence of |-separated things.
+ // They look like MOV(B|W|L|Q|SS|SD)load or MOV(Q|L)loadidx(1|8).
+ // Generate rules selecting one case from each |-form.
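+ // For example (illustrative), "(Add(32|64) x y) => (ADD(L|Q) x y)" expands
+ // into "(Add32 x y) => (ADDL x y)" and "(Add64 x y) => (ADDQ x y)".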
+
+ // Count width of |-forms. They must match.
+ n := 1
+ for _, idx := range findAllOpcode(r, -1) {
+ if excludeFromExpansion(r, idx) {
+ continue
+ }
+ s := r[idx[0]:idx[1]]
+ c := strings.Count(s, "|") + 1
+ if c == 1 {
+ continue
+ }
+ if n > 1 && n != c {
+ log.Fatalf("'|' count doesn't match in %s: both %d and %d\n", r, n, c)
+ }
+ n = c
+ }
+ if n == 1 {
+ // No |-form in this rule.
+ return []string{r}
+ }
+ // Build each new rule.
+ res := make([]string, n)
+ for i := 0; i < n; i++ {
+ buf := new(strings.Builder)
+ x := 0
+ for _, idx := range findAllOpcode(r, -1) {
+ if excludeFromExpansion(r, idx) {
+ continue
+ }
+ buf.WriteString(r[x:idx[0]]) // write bytes we've skipped over so far
+ s := r[idx[0]+1 : idx[1]-1] // remove leading "(" and trailing ")"
+ buf.WriteString(strings.Split(s, "|")[i]) // write the op component for this rule
+ x = idx[1] // note that we've written more bytes
+ }
+ buf.WriteString(r[x:])
+ res[i] = buf.String()
+ }
+ return res
+}
+
+// varCount returns a map which counts the number of occurrences of
+// Value variables in the s-expression rr.Match and the Go expression rr.Cond.
+func varCount(rr *RuleRewrite) map[string]int {
+ cnt := map[string]int{}
+ varCount1(rr.Loc, rr.Match, cnt)
+ if rr.Cond != "" {
+ expr, err := parser.ParseExpr(rr.Cond)
+ if err != nil {
+ log.Fatalf("%s: failed to parse cond %q: %v", rr.Loc, rr.Cond, err)
+ }
+ ast.Inspect(expr, func(n ast.Node) bool {
+ if id, ok := n.(*ast.Ident); ok {
+ cnt[id.Name]++
+ }
+ return true
+ })
+ }
+ return cnt
+}
+
+func varCount1(loc, m string, cnt map[string]int) {
+ if m[0] == '<' || m[0] == '[' || m[0] == '{' {
+ return
+ }
+ if token.IsIdentifier(m) {
+ cnt[m]++
+ return
+ }
+ // Split up input.
+ name, expr := splitNameExpr(m)
+ if name != "" {
+ cnt[name]++
+ }
+ if expr[0] != '(' || expr[len(expr)-1] != ')' {
+ log.Fatalf("%s: non-compound expr in varCount1: %q", loc, expr)
+ }
+ s := split(expr[1 : len(expr)-1])
+ for _, arg := range s[1:] {
+ varCount1(loc, arg, cnt)
+ }
+}
+
+// normalizeWhitespace replaces 2+ whitespace sequences with a single space.
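+// For example (illustrative), "( Add64  x   y )" becomes "(Add64 x y)".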
+func normalizeWhitespace(x string) string {
+ x = strings.Join(strings.Fields(x), " ")
+ x = strings.Replace(x, "( ", "(", -1)
+ x = strings.Replace(x, " )", ")", -1)
+ x = strings.Replace(x, "[ ", "[", -1)
+ x = strings.Replace(x, " ]", "]", -1)
+ x = strings.Replace(x, ")=>", ") =>", -1)
+ return x
+}
+
+// opIsCommutative reports whether op s is commutative.
+func opIsCommutative(op string, arch arch) bool {
+ for _, x := range genericOps {
+ if op == x.name {
+ if x.commutative {
+ return true
+ }
+ break
+ }
+ }
+ if arch.name != "generic" {
+ for _, x := range arch.ops {
+ if op == x.name {
+ if x.commutative {
+ return true
+ }
+ break
+ }
+ }
+ }
+ return false
+}
+
+func normalizeMatch(m string, arch arch) string {
+ if token.IsIdentifier(m) {
+ return m
+ }
+ op, typ, auxint, aux, args := extract(m)
+ if opIsCommutative(op, arch) {
+ if args[1] < args[0] {
+ args[0], args[1] = args[1], args[0]
+ }
+ }
+ s := new(strings.Builder)
+ fmt.Fprintf(s, "%s <%s> [%s] {%s}", op, typ, auxint, aux)
+ for _, arg := range args {
+ prefix, expr := splitNameExpr(arg)
+ fmt.Fprint(s, " ", prefix, normalizeMatch(expr, arch))
+ }
+ return s.String()
+}
+
+func parseEllipsisRules(rules []Rule, arch arch) (newop string, ok bool) {
+ if len(rules) != 1 {
+ for _, r := range rules {
+ if strings.Contains(r.Rule, "...") {
+ log.Fatalf("%s: found ellipsis in rule, but there are other rules with the same op", r.Loc)
+ }
+ }
+ return "", false
+ }
+ rule := rules[0]
+ match, cond, result := rule.parse()
+ if cond != "" || !isEllipsisValue(match) || !isEllipsisValue(result) {
+ if strings.Contains(rule.Rule, "...") {
+ log.Fatalf("%s: found ellipsis in non-ellipsis rule", rule.Loc)
+ }
+ checkEllipsisRuleCandidate(rule, arch)
+ return "", false
+ }
+ op, oparch, _, _, _, _ := parseValue(result, arch, rule.Loc)
+ return fmt.Sprintf("Op%s%s", oparch, op.name), true
+}
+
+// isEllipsisValue reports whether s is of the form (OpX ...).
+func isEllipsisValue(s string) bool {
+ if len(s) < 2 || s[0] != '(' || s[len(s)-1] != ')' {
+ return false
+ }
+ c := split(s[1 : len(s)-1])
+ if len(c) != 2 || c[1] != "..." {
+ return false
+ }
+ return true
+}
+
+func checkEllipsisRuleCandidate(rule Rule, arch arch) {
+ match, cond, result := rule.parse()
+ if cond != "" {
+ return
+ }
+ op, _, _, auxint, aux, args := parseValue(match, arch, rule.Loc)
+ var auxint2, aux2 string
+ var args2 []string
+ var usingCopy string
+ var eop opData
+ if result[0] != '(' {
+ // Check for (Foo x) => x, which can be converted to (Foo ...) => (Copy ...).
+ args2 = []string{result}
+ usingCopy = " using Copy"
+ } else {
+ eop, _, _, auxint2, aux2, args2 = parseValue(result, arch, rule.Loc)
+ }
+ // Check that all restrictions in match are reproduced exactly in result.
+ if aux != aux2 || auxint != auxint2 || len(args) != len(args2) {
+ return
+ }
+ if strings.Contains(rule.Rule, "=>") && op.aux != eop.aux {
+ return
+ }
+ for i := range args {
+ if args[i] != args2[i] {
+ return
+ }
+ }
+ switch {
+ case opHasAux(op) && aux == "" && aux2 == "":
+ fmt.Printf("%s: rule silently zeros aux, either copy aux or explicitly zero\n", rule.Loc)
+ case opHasAuxInt(op) && auxint == "" && auxint2 == "":
+ fmt.Printf("%s: rule silently zeros auxint, either copy auxint or explicitly zero\n", rule.Loc)
+ default:
+ fmt.Printf("%s: possible ellipsis rule candidate%s: %q\n", rule.Loc, usingCopy, rule.Rule)
+ }
+}
+
+func opByName(arch arch, name string) opData {
+ name = name[2:]
+ for _, x := range genericOps {
+ if name == x.name {
+ return x
+ }
+ }
+ if arch.name != "generic" {
+ name = name[len(arch.name):]
+ for _, x := range arch.ops {
+ if name == x.name {
+ return x
+ }
+ }
+ }
+ log.Fatalf("failed to find op named %s in arch %s", name, arch.name)
+ panic("unreachable")
+}
+
+// auxType returns the Go type that this operation should store in its aux field.
+func (op opData) auxType() string {
+ switch op.aux {
+ case "String":
+ return "string"
+ case "Sym":
+ // Note: a Sym can be an *obj.LSym, a *gc.Node, or nil.
+ return "Sym"
+ case "SymOff":
+ return "Sym"
+ case "Call":
+ return "Call"
+ case "CallOff":
+ return "Call"
+ case "SymValAndOff":
+ return "Sym"
+ case "Typ":
+ return "*types.Type"
+ case "TypSize":
+ return "*types.Type"
+ case "S390XCCMask":
+ return "s390x.CCMask"
+ case "S390XRotateParams":
+ return "s390x.RotateParams"
+ default:
+ return "invalid"
+ }
+}
+
+// auxIntType returns the Go type that this operation should store in its auxInt field.
+func (op opData) auxIntType() string {
+ switch op.aux {
+ case "Bool":
+ return "bool"
+ case "Int8":
+ return "int8"
+ case "Int16":
+ return "int16"
+ case "Int32":
+ return "int32"
+ case "Int64":
+ return "int64"
+ case "Int128":
+ return "int128"
+ case "UInt8":
+ return "uint8"
+ case "Float32":
+ return "float32"
+ case "Float64":
+ return "float64"
+ case "CallOff":
+ return "int32"
+ case "SymOff":
+ return "int32"
+ case "SymValAndOff":
+ return "ValAndOff"
+ case "TypSize":
+ return "int64"
+ case "CCop":
+ return "Op"
+ case "FlagConstant":
+ return "flagConstant"
+ case "ARM64BitField":
+ return "arm64BitField"
+ default:
+ return "invalid"
+ }
+}
+
+// auxType returns the Go type that this block should store in its aux field.
+func (b blockData) auxType() string {
+ switch b.aux {
+ case "S390XCCMask", "S390XCCMaskInt8", "S390XCCMaskUint8":
+ return "s390x.CCMask"
+ case "S390XRotateParams":
+ return "s390x.RotateParams"
+ default:
+ return "invalid"
+ }
+}
+
+// auxIntType returns the Go type that this block should store in its auxInt field.
+func (b blockData) auxIntType() string {
+ switch b.aux {
+ case "S390XCCMaskInt8":
+ return "int8"
+ case "S390XCCMaskUint8":
+ return "uint8"
+ case "Int64":
+ return "int64"
+ default:
+ return "invalid"
+ }
+}
+
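+// title upper-cases the first letter of s, first dropping any package
+// qualifier except an s390x prefix, which is folded in for clarity
+// (so, illustratively, "s390x.CCMask" becomes "S390xCCMask").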
+func title(s string) string {
+ if i := strings.Index(s, "."); i >= 0 {
+ switch strings.ToLower(s[:i]) {
+ case "s390x": // keep arch prefix for clarity
+ s = s[:i] + s[i+1:]
+ default:
+ s = s[i+1:]
+ }
+ }
+ return strings.Title(s)
+}
+
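+// unTitle lower-cases the first letter of s, folding in or dropping a leading
+// package qualifier in the same way as title.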
+func unTitle(s string) string {
+ if i := strings.Index(s, "."); i >= 0 {
+ switch strings.ToLower(s[:i]) {
+ case "s390x": // keep arch prefix for clarity
+ s = s[:i] + s[i+1:]
+ default:
+ s = s[i+1:]
+ }
+ }
+ return strings.ToLower(s[:1]) + s[1:]
+}