diff options
Diffstat (limited to 'src/cmd/compile/internal/ssa/_gen')
39 files changed, 27621 insertions, 0 deletions
diff --git a/src/cmd/compile/internal/ssa/_gen/386.rules b/src/cmd/compile/internal/ssa/_gen/386.rules new file mode 100644 index 0000000..5e30ca9 --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/386.rules @@ -0,0 +1,1091 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Lowering arithmetic +(Add(Ptr|32|16|8) ...) => (ADDL ...) +(Add(32|64)F ...) => (ADDS(S|D) ...) +(Add32carry ...) => (ADDLcarry ...) +(Add32withcarry ...) => (ADCL ...) + +(Sub(Ptr|32|16|8) ...) => (SUBL ...) +(Sub(32|64)F ...) => (SUBS(S|D) ...) +(Sub32carry ...) => (SUBLcarry ...) +(Sub32withcarry ...) => (SBBL ...) + +(Mul(32|16|8) ...) => (MULL ...) +(Mul(32|64)F ...) => (MULS(S|D) ...) +(Mul32uhilo ...) => (MULLQU ...) + +(Select0 (Mul32uover x y)) => (Select0 <typ.UInt32> (MULLU x y)) +(Select1 (Mul32uover x y)) => (SETO (Select1 <types.TypeFlags> (MULLU x y))) + +(Avg32u ...) => (AVGLU ...) + +(Div(32|64)F ...) => (DIVS(S|D) ...) +(Div(32|32u|16|16u) ...) => (DIV(L|LU|W|WU) ...) +(Div8 x y) => (DIVW (SignExt8to16 x) (SignExt8to16 y)) +(Div8u x y) => (DIVWU (ZeroExt8to16 x) (ZeroExt8to16 y)) + +(Hmul(32|32u) ...) => (HMUL(L|LU) ...) + +(Mod(32|32u|16|16u) ...) => (MOD(L|LU|W|WU) ...) +(Mod8 x y) => (MODW (SignExt8to16 x) (SignExt8to16 y)) +(Mod8u x y) => (MODWU (ZeroExt8to16 x) (ZeroExt8to16 y)) + +(And(32|16|8) ...) => (ANDL ...) +(Or(32|16|8) ...) => (ORL ...) +(Xor(32|16|8) ...) => (XORL ...) + +(Neg(32|16|8) ...) => (NEGL ...) +(Neg32F x) => (PXOR x (MOVSSconst <typ.Float32> [float32(math.Copysign(0, -1))])) +(Neg64F x) => (PXOR x (MOVSDconst <typ.Float64> [math.Copysign(0, -1)])) + +(Com(32|16|8) ...) => (NOTL ...) + +// Lowering boolean ops +(AndB ...) => (ANDL ...) +(OrB ...) => (ORL ...) +(Not x) => (XORLconst [1] x) + +// Lowering pointer arithmetic +(OffPtr [off] ptr) => (ADDLconst [int32(off)] ptr) + +(Bswap32 ...) => (BSWAPL ...) + +(Sqrt ...) => (SQRTSD ...) +(Sqrt32 ...) => (SQRTSS ...) + +(Ctz16 x) => (BSFL (ORLconst <typ.UInt32> [0x10000] x)) +(Ctz16NonZero ...) => (BSFL ...) + +// Lowering extension +(SignExt8to16 ...) => (MOVBLSX ...) +(SignExt8to32 ...) => (MOVBLSX ...) +(SignExt16to32 ...) => (MOVWLSX ...) + +(ZeroExt8to16 ...) => (MOVBLZX ...) +(ZeroExt8to32 ...) => (MOVBLZX ...) +(ZeroExt16to32 ...) => (MOVWLZX ...) + +(Signmask x) => (SARLconst x [31]) +(Zeromask <t> x) => (XORLconst [-1] (SBBLcarrymask <t> (CMPLconst x [1]))) +(Slicemask <t> x) => (SARLconst (NEGL <t> x) [31]) + +// Lowering truncation +// Because we ignore high parts of registers, truncates are just copies. +(Trunc16to8 ...) => (Copy ...) +(Trunc32to8 ...) => (Copy ...) +(Trunc32to16 ...) => (Copy ...) + +// Lowering float-int conversions +(Cvt32to32F ...) => (CVTSL2SS ...) +(Cvt32to64F ...) => (CVTSL2SD ...) + +(Cvt32Fto32 ...) => (CVTTSS2SL ...) +(Cvt64Fto32 ...) => (CVTTSD2SL ...) + +(Cvt32Fto64F ...) => (CVTSS2SD ...) +(Cvt64Fto32F ...) => (CVTSD2SS ...) + +(Round32F ...) => (Copy ...) +(Round64F ...) => (Copy ...) + +(CvtBoolToUint8 ...) => (Copy ...) + +// Lowering shifts +// Unsigned shifts need to return 0 if shift amount is >= width of shifted value. +// result = (arg << shift) & (shift >= argbits ? 0 : 0xffffffffffffffff) +(Lsh32x(32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(L|W|B)const y [32]))) +(Lsh16x(32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(L|W|B)const y [32]))) +(Lsh8x(32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(L|W|B)const y [32]))) + +(Lsh32x(32|16|8) <t> x y) && shiftIsBounded(v) => (SHLL <t> x y) +(Lsh16x(32|16|8) <t> x y) && shiftIsBounded(v) => (SHLL <t> x y) +(Lsh8x(32|16|8) <t> x y) && shiftIsBounded(v) => (SHLL <t> x y) + +(Rsh32Ux(32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRL <t> x y) (SBBLcarrymask <t> (CMP(L|W|B)const y [32]))) +(Rsh16Ux(32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRW <t> x y) (SBBLcarrymask <t> (CMP(L|W|B)const y [16]))) +(Rsh8Ux(32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRB <t> x y) (SBBLcarrymask <t> (CMP(L|W|B)const y [8]))) + +(Rsh32Ux(32|16|8) <t> x y) && shiftIsBounded(v) => (SHRL <t> x y) +(Rsh16Ux(32|16|8) <t> x y) && shiftIsBounded(v) => (SHRW <t> x y) +(Rsh8Ux(32|16|8) <t> x y) && shiftIsBounded(v) => (SHRB <t> x y) + +// Signed right shift needs to return 0/-1 if shift amount is >= width of shifted value. +// We implement this by setting the shift value to -1 (all ones) if the shift value is >= width. + +(Rsh32x(32|16|8) <t> x y) && !shiftIsBounded(v) => (SARL <t> x (ORL <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMP(L|W|B)const y [32]))))) +(Rsh16x(32|16|8) <t> x y) && !shiftIsBounded(v) => (SARW <t> x (ORL <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMP(L|W|B)const y [16]))))) +(Rsh8x(32|16|8) <t> x y) && !shiftIsBounded(v) => (SARB <t> x (ORL <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMP(L|W|B)const y [8]))))) + +(Rsh32x(32|16|8) <t> x y) && shiftIsBounded(v) => (SARL x y) +(Rsh16x(32|16|8) <t> x y) && shiftIsBounded(v) => (SARW x y) +(Rsh8x(32|16|8) <t> x y) && shiftIsBounded(v) => (SARB x y) + +// constant shifts +// generic opt rewrites all constant shifts to shift by Const64 +(Lsh32x64 x (Const64 [c])) && uint64(c) < 32 => (SHLLconst x [int32(c)]) +(Rsh32x64 x (Const64 [c])) && uint64(c) < 32 => (SARLconst x [int32(c)]) +(Rsh32Ux64 x (Const64 [c])) && uint64(c) < 32 => (SHRLconst x [int32(c)]) +(Lsh16x64 x (Const64 [c])) && uint64(c) < 16 => (SHLLconst x [int32(c)]) +(Rsh16x64 x (Const64 [c])) && uint64(c) < 16 => (SARWconst x [int16(c)]) +(Rsh16Ux64 x (Const64 [c])) && uint64(c) < 16 => (SHRWconst x [int16(c)]) +(Lsh8x64 x (Const64 [c])) && uint64(c) < 8 => (SHLLconst x [int32(c)]) +(Rsh8x64 x (Const64 [c])) && uint64(c) < 8 => (SARBconst x [int8(c)]) +(Rsh8Ux64 x (Const64 [c])) && uint64(c) < 8 => (SHRBconst x [int8(c)]) + +// large constant shifts +(Lsh32x64 _ (Const64 [c])) && uint64(c) >= 32 => (Const32 [0]) +(Rsh32Ux64 _ (Const64 [c])) && uint64(c) >= 32 => (Const32 [0]) +(Lsh16x64 _ (Const64 [c])) && uint64(c) >= 16 => (Const16 [0]) +(Rsh16Ux64 _ (Const64 [c])) && uint64(c) >= 16 => (Const16 [0]) +(Lsh8x64 _ (Const64 [c])) && uint64(c) >= 8 => (Const8 [0]) +(Rsh8Ux64 _ (Const64 [c])) && uint64(c) >= 8 => (Const8 [0]) + +// large constant signed right shift, we leave the sign bit +(Rsh32x64 x (Const64 [c])) && uint64(c) >= 32 => (SARLconst x [31]) +(Rsh16x64 x (Const64 [c])) && uint64(c) >= 16 => (SARWconst x [15]) +(Rsh8x64 x (Const64 [c])) && uint64(c) >= 8 => (SARBconst x [7]) + +// rotates +(RotateLeft32 ...) => (ROLL ...) +(RotateLeft16 ...) => (ROLW ...) +(RotateLeft8 ...) => (ROLB ...) +// constant rotates +(ROLL x (MOVLconst [c])) => (ROLLconst [c&31] x) +(ROLW x (MOVLconst [c])) => (ROLWconst [int16(c&15)] x) +(ROLB x (MOVLconst [c])) => (ROLBconst [int8(c&7)] x) + +// Lowering comparisons +(Less32 x y) => (SETL (CMPL x y)) +(Less16 x y) => (SETL (CMPW x y)) +(Less8 x y) => (SETL (CMPB x y)) +(Less32U x y) => (SETB (CMPL x y)) +(Less16U x y) => (SETB (CMPW x y)) +(Less8U x y) => (SETB (CMPB x y)) +// Use SETGF with reversed operands to dodge NaN case +(Less64F x y) => (SETGF (UCOMISD y x)) +(Less32F x y) => (SETGF (UCOMISS y x)) + +(Leq32 x y) => (SETLE (CMPL x y)) +(Leq16 x y) => (SETLE (CMPW x y)) +(Leq8 x y) => (SETLE (CMPB x y)) +(Leq32U x y) => (SETBE (CMPL x y)) +(Leq16U x y) => (SETBE (CMPW x y)) +(Leq8U x y) => (SETBE (CMPB x y)) +// Use SETGEF with reversed operands to dodge NaN case +(Leq64F x y) => (SETGEF (UCOMISD y x)) +(Leq32F x y) => (SETGEF (UCOMISS y x)) + +(Eq32 x y) => (SETEQ (CMPL x y)) +(Eq16 x y) => (SETEQ (CMPW x y)) +(Eq8 x y) => (SETEQ (CMPB x y)) +(EqB x y) => (SETEQ (CMPB x y)) +(EqPtr x y) => (SETEQ (CMPL x y)) +(Eq64F x y) => (SETEQF (UCOMISD x y)) +(Eq32F x y) => (SETEQF (UCOMISS x y)) + +(Neq32 x y) => (SETNE (CMPL x y)) +(Neq16 x y) => (SETNE (CMPW x y)) +(Neq8 x y) => (SETNE (CMPB x y)) +(NeqB x y) => (SETNE (CMPB x y)) +(NeqPtr x y) => (SETNE (CMPL x y)) +(Neq64F x y) => (SETNEF (UCOMISD x y)) +(Neq32F x y) => (SETNEF (UCOMISS x y)) + +// Lowering loads +(Load <t> ptr mem) && (is32BitInt(t) || isPtr(t)) => (MOVLload ptr mem) +(Load <t> ptr mem) && is16BitInt(t) => (MOVWload ptr mem) +(Load <t> ptr mem) && (t.IsBoolean() || is8BitInt(t)) => (MOVBload ptr mem) +(Load <t> ptr mem) && is32BitFloat(t) => (MOVSSload ptr mem) +(Load <t> ptr mem) && is64BitFloat(t) => (MOVSDload ptr mem) + +// Lowering stores +// These more-specific FP versions of Store pattern should come first. +(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (MOVSDstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (MOVSSstore ptr val mem) + +(Store {t} ptr val mem) && t.Size() == 4 => (MOVLstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 2 => (MOVWstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem) + +// Lowering moves +(Move [0] _ _ mem) => mem +(Move [1] dst src mem) => (MOVBstore dst (MOVBload src mem) mem) +(Move [2] dst src mem) => (MOVWstore dst (MOVWload src mem) mem) +(Move [4] dst src mem) => (MOVLstore dst (MOVLload src mem) mem) +(Move [3] dst src mem) => + (MOVBstore [2] dst (MOVBload [2] src mem) + (MOVWstore dst (MOVWload src mem) mem)) +(Move [5] dst src mem) => + (MOVBstore [4] dst (MOVBload [4] src mem) + (MOVLstore dst (MOVLload src mem) mem)) +(Move [6] dst src mem) => + (MOVWstore [4] dst (MOVWload [4] src mem) + (MOVLstore dst (MOVLload src mem) mem)) +(Move [7] dst src mem) => + (MOVLstore [3] dst (MOVLload [3] src mem) + (MOVLstore dst (MOVLload src mem) mem)) +(Move [8] dst src mem) => + (MOVLstore [4] dst (MOVLload [4] src mem) + (MOVLstore dst (MOVLload src mem) mem)) + +// Adjust moves to be a multiple of 4 bytes. +(Move [s] dst src mem) + && s > 8 && s%4 != 0 => + (Move [s-s%4] + (ADDLconst <dst.Type> dst [int32(s%4)]) + (ADDLconst <src.Type> src [int32(s%4)]) + (MOVLstore dst (MOVLload src mem) mem)) + +// Medium copying uses a duff device. +(Move [s] dst src mem) + && s > 8 && s <= 4*128 && s%4 == 0 + && !config.noDuffDevice && logLargeCopy(v, s) => + (DUFFCOPY [10*(128-s/4)] dst src mem) +// 10 and 128 are magic constants. 10 is the number of bytes to encode: +// MOVL (SI), CX +// ADDL $4, SI +// MOVL CX, (DI) +// ADDL $4, DI +// and 128 is the number of such blocks. See src/runtime/duff_386.s:duffcopy. + +// Large copying uses REP MOVSL. +(Move [s] dst src mem) && (s > 4*128 || config.noDuffDevice) && s%4 == 0 && logLargeCopy(v, s) => + (REPMOVSL dst src (MOVLconst [int32(s/4)]) mem) + +// Lowering Zero instructions +(Zero [0] _ mem) => mem +(Zero [1] destptr mem) => (MOVBstoreconst [0] destptr mem) +(Zero [2] destptr mem) => (MOVWstoreconst [0] destptr mem) +(Zero [4] destptr mem) => (MOVLstoreconst [0] destptr mem) + +(Zero [3] destptr mem) => + (MOVBstoreconst [makeValAndOff(0,2)] destptr + (MOVWstoreconst [makeValAndOff(0,0)] destptr mem)) +(Zero [5] destptr mem) => + (MOVBstoreconst [makeValAndOff(0,4)] destptr + (MOVLstoreconst [makeValAndOff(0,0)] destptr mem)) +(Zero [6] destptr mem) => + (MOVWstoreconst [makeValAndOff(0,4)] destptr + (MOVLstoreconst [makeValAndOff(0,0)] destptr mem)) +(Zero [7] destptr mem) => + (MOVLstoreconst [makeValAndOff(0,3)] destptr + (MOVLstoreconst [makeValAndOff(0,0)] destptr mem)) + +// Strip off any fractional word zeroing. +(Zero [s] destptr mem) && s%4 != 0 && s > 4 => + (Zero [s-s%4] (ADDLconst destptr [int32(s%4)]) + (MOVLstoreconst [0] destptr mem)) + +// Zero small numbers of words directly. +(Zero [8] destptr mem) => + (MOVLstoreconst [makeValAndOff(0,4)] destptr + (MOVLstoreconst [makeValAndOff(0,0)] destptr mem)) +(Zero [12] destptr mem) => + (MOVLstoreconst [makeValAndOff(0,8)] destptr + (MOVLstoreconst [makeValAndOff(0,4)] destptr + (MOVLstoreconst [makeValAndOff(0,0)] destptr mem))) +(Zero [16] destptr mem) => + (MOVLstoreconst [makeValAndOff(0,12)] destptr + (MOVLstoreconst [makeValAndOff(0,8)] destptr + (MOVLstoreconst [makeValAndOff(0,4)] destptr + (MOVLstoreconst [makeValAndOff(0,0)] destptr mem)))) + +// Medium zeroing uses a duff device. +(Zero [s] destptr mem) + && s > 16 && s <= 4*128 && s%4 == 0 + && !config.noDuffDevice => + (DUFFZERO [1*(128-s/4)] destptr (MOVLconst [0]) mem) +// 1 and 128 are magic constants. 1 is the number of bytes to encode STOSL. +// 128 is the number of STOSL instructions in duffzero. +// See src/runtime/duff_386.s:duffzero. + +// Large zeroing uses REP STOSQ. +(Zero [s] destptr mem) + && (s > 4*128 || (config.noDuffDevice && s > 16)) + && s%4 == 0 => + (REPSTOSL destptr (MOVLconst [int32(s/4)]) (MOVLconst [0]) mem) + + +// Lowering constants +(Const8 [c]) => (MOVLconst [int32(c)]) +(Const16 [c]) => (MOVLconst [int32(c)]) +(Const32 ...) => (MOVLconst ...) +(Const(32|64)F ...) => (MOVS(S|D)const ...) +(ConstNil) => (MOVLconst [0]) +(ConstBool [c]) => (MOVLconst [b2i32(c)]) + +// Lowering calls +(StaticCall ...) => (CALLstatic ...) +(ClosureCall ...) => (CALLclosure ...) +(InterCall ...) => (CALLinter ...) +(TailCall ...) => (CALLtail ...) + +// Miscellaneous +(IsNonNil p) => (SETNE (TESTL p p)) +(IsInBounds idx len) => (SETB (CMPL idx len)) +(IsSliceInBounds idx len) => (SETBE (CMPL idx len)) +(NilCheck ...) => (LoweredNilCheck ...) +(GetG ...) => (LoweredGetG ...) +(GetClosurePtr ...) => (LoweredGetClosurePtr ...) +(GetCallerPC ...) => (LoweredGetCallerPC ...) +(GetCallerSP ...) => (LoweredGetCallerSP ...) +(Addr {sym} base) => (LEAL {sym} base) +(LocalAddr {sym} base _) => (LEAL {sym} base) + +// block rewrites +(If (SETL cmp) yes no) => (LT cmp yes no) +(If (SETLE cmp) yes no) => (LE cmp yes no) +(If (SETG cmp) yes no) => (GT cmp yes no) +(If (SETGE cmp) yes no) => (GE cmp yes no) +(If (SETEQ cmp) yes no) => (EQ cmp yes no) +(If (SETNE cmp) yes no) => (NE cmp yes no) +(If (SETB cmp) yes no) => (ULT cmp yes no) +(If (SETBE cmp) yes no) => (ULE cmp yes no) +(If (SETA cmp) yes no) => (UGT cmp yes no) +(If (SETAE cmp) yes no) => (UGE cmp yes no) +(If (SETO cmp) yes no) => (OS cmp yes no) + +// Special case for floating point - LF/LEF not generated +(If (SETGF cmp) yes no) => (UGT cmp yes no) +(If (SETGEF cmp) yes no) => (UGE cmp yes no) +(If (SETEQF cmp) yes no) => (EQF cmp yes no) +(If (SETNEF cmp) yes no) => (NEF cmp yes no) + +(If cond yes no) => (NE (TESTB cond cond) yes no) + +// Write barrier. +(WB ...) => (LoweredWB ...) + +(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem) +(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem) +(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem) + +(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 0 => (LoweredPanicExtendA [kind] hi lo y mem) +(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 1 => (LoweredPanicExtendB [kind] hi lo y mem) +(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 2 => (LoweredPanicExtendC [kind] hi lo y mem) + +// *************************** +// Above: lowering rules +// Below: optimizations +// *************************** +// TODO: Should the optimizations be a separate pass? + +// Fold boolean tests into blocks +(NE (TESTB (SETL cmp) (SETL cmp)) yes no) => (LT cmp yes no) +(NE (TESTB (SETLE cmp) (SETLE cmp)) yes no) => (LE cmp yes no) +(NE (TESTB (SETG cmp) (SETG cmp)) yes no) => (GT cmp yes no) +(NE (TESTB (SETGE cmp) (SETGE cmp)) yes no) => (GE cmp yes no) +(NE (TESTB (SETEQ cmp) (SETEQ cmp)) yes no) => (EQ cmp yes no) +(NE (TESTB (SETNE cmp) (SETNE cmp)) yes no) => (NE cmp yes no) +(NE (TESTB (SETB cmp) (SETB cmp)) yes no) => (ULT cmp yes no) +(NE (TESTB (SETBE cmp) (SETBE cmp)) yes no) => (ULE cmp yes no) +(NE (TESTB (SETA cmp) (SETA cmp)) yes no) => (UGT cmp yes no) +(NE (TESTB (SETAE cmp) (SETAE cmp)) yes no) => (UGE cmp yes no) +(NE (TESTB (SETO cmp) (SETO cmp)) yes no) => (OS cmp yes no) + +// Special case for floating point - LF/LEF not generated +(NE (TESTB (SETGF cmp) (SETGF cmp)) yes no) => (UGT cmp yes no) +(NE (TESTB (SETGEF cmp) (SETGEF cmp)) yes no) => (UGE cmp yes no) +(NE (TESTB (SETEQF cmp) (SETEQF cmp)) yes no) => (EQF cmp yes no) +(NE (TESTB (SETNEF cmp) (SETNEF cmp)) yes no) => (NEF cmp yes no) + +// fold constants into instructions +(ADDL x (MOVLconst [c])) => (ADDLconst [c] x) +(ADDLcarry x (MOVLconst [c])) => (ADDLconstcarry [c] x) +(ADCL x (MOVLconst [c]) f) => (ADCLconst [c] x f) + +(SUBL x (MOVLconst [c])) => (SUBLconst x [c]) +(SUBL (MOVLconst [c]) x) => (NEGL (SUBLconst <v.Type> x [c])) +(SUBLcarry x (MOVLconst [c])) => (SUBLconstcarry [c] x) +(SBBL x (MOVLconst [c]) f) => (SBBLconst [c] x f) + +(MULL x (MOVLconst [c])) => (MULLconst [c] x) +(ANDL x (MOVLconst [c])) => (ANDLconst [c] x) + +(ANDLconst [c] (ANDLconst [d] x)) => (ANDLconst [c & d] x) +(XORLconst [c] (XORLconst [d] x)) => (XORLconst [c ^ d] x) +(MULLconst [c] (MULLconst [d] x)) => (MULLconst [c * d] x) + +(ORL x (MOVLconst [c])) => (ORLconst [c] x) +(XORL x (MOVLconst [c])) => (XORLconst [c] x) + +(SHLL x (MOVLconst [c])) => (SHLLconst [c&31] x) +(SHRL x (MOVLconst [c])) => (SHRLconst [c&31] x) +(SHRW x (MOVLconst [c])) && c&31 < 16 => (SHRWconst [int16(c&31)] x) +(SHRW _ (MOVLconst [c])) && c&31 >= 16 => (MOVLconst [0]) +(SHRB x (MOVLconst [c])) && c&31 < 8 => (SHRBconst [int8(c&31)] x) +(SHRB _ (MOVLconst [c])) && c&31 >= 8 => (MOVLconst [0]) + +(SARL x (MOVLconst [c])) => (SARLconst [c&31] x) +(SARW x (MOVLconst [c])) => (SARWconst [int16(min(int64(c&31),15))] x) +(SARB x (MOVLconst [c])) => (SARBconst [int8(min(int64(c&31),7))] x) + +(SARL x (ANDLconst [31] y)) => (SARL x y) +(SHLL x (ANDLconst [31] y)) => (SHLL x y) +(SHRL x (ANDLconst [31] y)) => (SHRL x y) + +// Constant shift simplifications + +(SHLLconst x [0]) => x +(SHRLconst x [0]) => x +(SARLconst x [0]) => x + +(SHRWconst x [0]) => x +(SARWconst x [0]) => x + +(SHRBconst x [0]) => x +(SARBconst x [0]) => x + +(ROLLconst [0] x) => x +(ROLWconst [0] x) => x +(ROLBconst [0] x) => x + +// Note: the word and byte shifts keep the low 5 bits (not the low 4 or 3 bits) +// because the x86 instructions are defined to use all 5 bits of the shift even +// for the small shifts. I don't think we'll ever generate a weird shift (e.g. +// (SHRW x (MOVLconst [24])), but just in case. + +(CMPL x (MOVLconst [c])) => (CMPLconst x [c]) +(CMPL (MOVLconst [c]) x) => (InvertFlags (CMPLconst x [c])) +(CMPW x (MOVLconst [c])) => (CMPWconst x [int16(c)]) +(CMPW (MOVLconst [c]) x) => (InvertFlags (CMPWconst x [int16(c)])) +(CMPB x (MOVLconst [c])) => (CMPBconst x [int8(c)]) +(CMPB (MOVLconst [c]) x) => (InvertFlags (CMPBconst x [int8(c)])) + +// Canonicalize the order of arguments to comparisons - helps with CSE. +(CMP(L|W|B) x y) && canonLessThan(x,y) => (InvertFlags (CMP(L|W|B) y x)) + +// strength reduction +// Assumes that the following costs from https://gmplib.org/~tege/x86-timing.pdf: +// 1 - addl, shll, leal, negl, subl +// 3 - imull +// This limits the rewrites to two instructions. +// Note that negl always operates in-place, +// which can require a register-register move +// to preserve the original value, +// so it must be used with care. +(MULLconst [-9] x) => (NEGL (LEAL8 <v.Type> x x)) +(MULLconst [-5] x) => (NEGL (LEAL4 <v.Type> x x)) +(MULLconst [-3] x) => (NEGL (LEAL2 <v.Type> x x)) +(MULLconst [-1] x) => (NEGL x) +(MULLconst [0] _) => (MOVLconst [0]) +(MULLconst [1] x) => x +(MULLconst [3] x) => (LEAL2 x x) +(MULLconst [5] x) => (LEAL4 x x) +(MULLconst [7] x) => (LEAL2 x (LEAL2 <v.Type> x x)) +(MULLconst [9] x) => (LEAL8 x x) +(MULLconst [11] x) => (LEAL2 x (LEAL4 <v.Type> x x)) +(MULLconst [13] x) => (LEAL4 x (LEAL2 <v.Type> x x)) +(MULLconst [19] x) => (LEAL2 x (LEAL8 <v.Type> x x)) +(MULLconst [21] x) => (LEAL4 x (LEAL4 <v.Type> x x)) +(MULLconst [25] x) => (LEAL8 x (LEAL2 <v.Type> x x)) +(MULLconst [27] x) => (LEAL8 (LEAL2 <v.Type> x x) (LEAL2 <v.Type> x x)) +(MULLconst [37] x) => (LEAL4 x (LEAL8 <v.Type> x x)) +(MULLconst [41] x) => (LEAL8 x (LEAL4 <v.Type> x x)) +(MULLconst [45] x) => (LEAL8 (LEAL4 <v.Type> x x) (LEAL4 <v.Type> x x)) +(MULLconst [73] x) => (LEAL8 x (LEAL8 <v.Type> x x)) +(MULLconst [81] x) => (LEAL8 (LEAL8 <v.Type> x x) (LEAL8 <v.Type> x x)) + +(MULLconst [c] x) && isPowerOfTwo32(c+1) && c >= 15 => (SUBL (SHLLconst <v.Type> [int32(log32(c+1))] x) x) +(MULLconst [c] x) && isPowerOfTwo32(c-1) && c >= 17 => (LEAL1 (SHLLconst <v.Type> [int32(log32(c-1))] x) x) +(MULLconst [c] x) && isPowerOfTwo32(c-2) && c >= 34 => (LEAL2 (SHLLconst <v.Type> [int32(log32(c-2))] x) x) +(MULLconst [c] x) && isPowerOfTwo32(c-4) && c >= 68 => (LEAL4 (SHLLconst <v.Type> [int32(log32(c-4))] x) x) +(MULLconst [c] x) && isPowerOfTwo32(c-8) && c >= 136 => (LEAL8 (SHLLconst <v.Type> [int32(log32(c-8))] x) x) +(MULLconst [c] x) && c%3 == 0 && isPowerOfTwo32(c/3) => (SHLLconst [int32(log32(c/3))] (LEAL2 <v.Type> x x)) +(MULLconst [c] x) && c%5 == 0 && isPowerOfTwo32(c/5) => (SHLLconst [int32(log32(c/5))] (LEAL4 <v.Type> x x)) +(MULLconst [c] x) && c%9 == 0 && isPowerOfTwo32(c/9) => (SHLLconst [int32(log32(c/9))] (LEAL8 <v.Type> x x)) + +// combine add/shift into LEAL +(ADDL x (SHLLconst [3] y)) => (LEAL8 x y) +(ADDL x (SHLLconst [2] y)) => (LEAL4 x y) +(ADDL x (SHLLconst [1] y)) => (LEAL2 x y) +(ADDL x (ADDL y y)) => (LEAL2 x y) +(ADDL x (ADDL x y)) => (LEAL2 y x) + +// combine ADDL/ADDLconst into LEAL1 +(ADDLconst [c] (ADDL x y)) => (LEAL1 [c] x y) +(ADDL (ADDLconst [c] x) y) => (LEAL1 [c] x y) + +// fold ADDL into LEAL +(ADDLconst [c] (LEAL [d] {s} x)) && is32Bit(int64(c)+int64(d)) => (LEAL [c+d] {s} x) +(LEAL [c] {s} (ADDLconst [d] x)) && is32Bit(int64(c)+int64(d)) => (LEAL [c+d] {s} x) +(ADDLconst [c] x:(SP)) => (LEAL [c] x) // so it is rematerializeable +(LEAL [c] {s} (ADDL x y)) && x.Op != OpSB && y.Op != OpSB => (LEAL1 [c] {s} x y) +(ADDL x (LEAL [c] {s} y)) && x.Op != OpSB && y.Op != OpSB => (LEAL1 [c] {s} x y) + +// fold ADDLconst into LEALx +(ADDLconst [c] (LEAL1 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEAL1 [c+d] {s} x y) +(ADDLconst [c] (LEAL2 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEAL2 [c+d] {s} x y) +(ADDLconst [c] (LEAL4 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEAL4 [c+d] {s} x y) +(ADDLconst [c] (LEAL8 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEAL8 [c+d] {s} x y) +(LEAL1 [c] {s} (ADDLconst [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEAL1 [c+d] {s} x y) +(LEAL2 [c] {s} (ADDLconst [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEAL2 [c+d] {s} x y) +(LEAL2 [c] {s} x (ADDLconst [d] y)) && is32Bit(int64(c)+2*int64(d)) && y.Op != OpSB => (LEAL2 [c+2*d] {s} x y) +(LEAL4 [c] {s} (ADDLconst [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEAL4 [c+d] {s} x y) +(LEAL4 [c] {s} x (ADDLconst [d] y)) && is32Bit(int64(c)+4*int64(d)) && y.Op != OpSB => (LEAL4 [c+4*d] {s} x y) +(LEAL8 [c] {s} (ADDLconst [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEAL8 [c+d] {s} x y) +(LEAL8 [c] {s} x (ADDLconst [d] y)) && is32Bit(int64(c)+8*int64(d)) && y.Op != OpSB => (LEAL8 [c+8*d] {s} x y) + +// fold shifts into LEALx +(LEAL1 [c] {s} x (SHLLconst [1] y)) => (LEAL2 [c] {s} x y) +(LEAL1 [c] {s} x (SHLLconst [2] y)) => (LEAL4 [c] {s} x y) +(LEAL1 [c] {s} x (SHLLconst [3] y)) => (LEAL8 [c] {s} x y) +(LEAL2 [c] {s} x (SHLLconst [1] y)) => (LEAL4 [c] {s} x y) +(LEAL2 [c] {s} x (SHLLconst [2] y)) => (LEAL8 [c] {s} x y) +(LEAL4 [c] {s} x (SHLLconst [1] y)) => (LEAL8 [c] {s} x y) + +// reverse ordering of compare instruction +(SETL (InvertFlags x)) => (SETG x) +(SETG (InvertFlags x)) => (SETL x) +(SETB (InvertFlags x)) => (SETA x) +(SETA (InvertFlags x)) => (SETB x) +(SETLE (InvertFlags x)) => (SETGE x) +(SETGE (InvertFlags x)) => (SETLE x) +(SETBE (InvertFlags x)) => (SETAE x) +(SETAE (InvertFlags x)) => (SETBE x) +(SETEQ (InvertFlags x)) => (SETEQ x) +(SETNE (InvertFlags x)) => (SETNE x) + +// sign extended loads +// Note: The combined instruction must end up in the same block +// as the original load. If not, we end up making a value with +// memory type live in two different blocks, which can lead to +// multiple memory values alive simultaneously. +// Make sure we don't combine these ops if the load has another use. +// This prevents a single load from being split into multiple loads +// which then might return different values. See test/atomicload.go. +(MOVBLSX x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBLSXload <v.Type> [off] {sym} ptr mem) +(MOVBLZX x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBload <v.Type> [off] {sym} ptr mem) +(MOVWLSX x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWLSXload <v.Type> [off] {sym} ptr mem) +(MOVWLZX x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWload <v.Type> [off] {sym} ptr mem) + +// replace load from same location as preceding store with zero/sign extension (or copy in case of full width) +(MOVBload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBLZX x) +(MOVWload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVWLZX x) +(MOVLload [off] {sym} ptr (MOVLstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x +(MOVBLSXload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBLSX x) +(MOVWLSXload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVWLSX x) + +// Fold extensions and ANDs together. +(MOVBLZX (ANDLconst [c] x)) => (ANDLconst [c & 0xff] x) +(MOVWLZX (ANDLconst [c] x)) => (ANDLconst [c & 0xffff] x) +(MOVBLSX (ANDLconst [c] x)) && c & 0x80 == 0 => (ANDLconst [c & 0x7f] x) +(MOVWLSX (ANDLconst [c] x)) && c & 0x8000 == 0 => (ANDLconst [c & 0x7fff] x) + +// Don't extend before storing +(MOVWstore [off] {sym} ptr (MOVWL(S|Z)X x) mem) => (MOVWstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVBL(S|Z)X x) mem) => (MOVBstore [off] {sym} ptr x mem) + +// fold constants into memory operations +// Note that this is not always a good idea because if not all the uses of +// the ADDLconst get eliminated, we still have to compute the ADDLconst and we now +// have potentially two live values (ptr and (ADDLconst [off] ptr)) instead of one. +// Nevertheless, let's do it! +(MOV(L|W|B|SS|SD)load [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) => + (MOV(L|W|B|SS|SD)load [off1+off2] {sym} ptr mem) +(MOV(L|W|B|SS|SD)store [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(int64(off1)+int64(off2)) => + (MOV(L|W|B|SS|SD)store [off1+off2] {sym} ptr val mem) + +((ADD|SUB|MUL|AND|OR|XOR)Lload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) => + ((ADD|SUB|MUL|AND|OR|XOR)Lload [off1+off2] {sym} val base mem) +((ADD|SUB|MUL|DIV)SSload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) => + ((ADD|SUB|MUL|DIV)SSload [off1+off2] {sym} val base mem) +((ADD|SUB|MUL|DIV)SDload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) => + ((ADD|SUB|MUL|DIV)SDload [off1+off2] {sym} val base mem) +((ADD|SUB|AND|OR|XOR)Lmodify [off1] {sym} (ADDLconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) => + ((ADD|SUB|AND|OR|XOR)Lmodify [off1+off2] {sym} base val mem) +((ADD|AND|OR|XOR)Lconstmodify [valoff1] {sym} (ADDLconst [off2] base) mem) && valoff1.canAdd32(off2) => + ((ADD|AND|OR|XOR)Lconstmodify [valoff1.addOffset32(off2)] {sym} base mem) + +// Fold constants into stores. +(MOVLstore [off] {sym} ptr (MOVLconst [c]) mem) => + (MOVLstoreconst [makeValAndOff(c,off)] {sym} ptr mem) +(MOVWstore [off] {sym} ptr (MOVLconst [c]) mem) => + (MOVWstoreconst [makeValAndOff(c,off)] {sym} ptr mem) +(MOVBstore [off] {sym} ptr (MOVLconst [c]) mem) => + (MOVBstoreconst [makeValAndOff(c,off)] {sym} ptr mem) + +// Fold address offsets into constant stores. +(MOV(L|W|B)storeconst [sc] {s} (ADDLconst [off] ptr) mem) && sc.canAdd32(off) => + (MOV(L|W|B)storeconst [sc.addOffset32(off)] {s} ptr mem) + +// We need to fold LEAL into the MOVx ops so that the live variable analysis knows +// what variables are being read/written by the ops. +// Note: we turn off this merging for operations on globals when building +// position-independent code (when Flag_shared is set). +// PIC needs a spare register to load the PC into. Having the LEAL be +// a separate instruction gives us that register. Having the LEAL be +// a separate instruction also allows it to be CSEd (which is good because +// it compiles to a thunk call). +(MOV(L|W|B|SS|SD|BLSX|WLSX)load [off1] {sym1} (LEAL [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) + && (base.Op != OpSB || !config.ctxt.Flag_shared) => + (MOV(L|W|B|SS|SD|BLSX|WLSX)load [off1+off2] {mergeSym(sym1,sym2)} base mem) + +(MOV(L|W|B|SS|SD)store [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) + && (base.Op != OpSB || !config.ctxt.Flag_shared) => + (MOV(L|W|B|SS|SD)store [off1+off2] {mergeSym(sym1,sym2)} base val mem) + +(MOV(L|W|B)storeconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && sc.canAdd32(off) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOV(L|W|B)storeconst [sc.addOffset32(off)] {mergeSym(sym1, sym2)} ptr mem) + +((ADD|SUB|MUL|AND|OR|XOR)Lload [off1] {sym1} val (LEAL [off2] {sym2} base) mem) + && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) => + ((ADD|SUB|MUL|AND|OR|XOR)Lload [off1+off2] {mergeSym(sym1,sym2)} val base mem) +((ADD|SUB|MUL|DIV)SSload [off1] {sym1} val (LEAL [off2] {sym2} base) mem) + && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) => + ((ADD|SUB|MUL|DIV)SSload [off1+off2] {mergeSym(sym1,sym2)} val base mem) +((ADD|SUB|MUL|DIV)SDload [off1] {sym1} val (LEAL [off2] {sym2} base) mem) + && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) => + ((ADD|SUB|MUL|DIV)SDload [off1+off2] {mergeSym(sym1,sym2)} val base mem) +((ADD|SUB|AND|OR|XOR)Lmodify [off1] {sym1} (LEAL [off2] {sym2} base) val mem) + && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) => + ((ADD|SUB|AND|OR|XOR)Lmodify [off1+off2] {mergeSym(sym1,sym2)} base val mem) +((ADD|AND|OR|XOR)Lconstmodify [valoff1] {sym1} (LEAL [off2] {sym2} base) mem) + && valoff1.canAdd32(off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) => + ((ADD|AND|OR|XOR)Lconstmodify [valoff1.addOffset32(off2)] {mergeSym(sym1,sym2)} base mem) + +// Merge load/store to op +((ADD|AND|OR|XOR|SUB|MUL)L x l:(MOVLload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|AND|OR|XOR|SUB|MUL)Lload x [off] {sym} ptr mem) +((ADD|SUB|MUL|DIV)SD x l:(MOVSDload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|MUL|DIV)SDload x [off] {sym} ptr mem) +((ADD|SUB|MUL|DIV)SS x l:(MOVSSload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|MUL|DIV)SSload x [off] {sym} ptr mem) +(MOVLstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Lload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) => ((ADD|AND|OR|XOR)Lmodify [off] {sym} ptr x mem) +(MOVLstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)L l:(MOVLload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) => + ((ADD|SUB|AND|OR|XOR)Lmodify [off] {sym} ptr x mem) +(MOVLstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Lconst [c] l:(MOVLload [off] {sym} ptr mem)) mem) + && y.Uses==1 && l.Uses==1 && clobber(y, l) => + ((ADD|AND|OR|XOR)Lconstmodify [makeValAndOff(c,off)] {sym} ptr mem) + +// fold LEALs together +(LEAL [off1] {sym1} (LEAL [off2] {sym2} x)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (LEAL [off1+off2] {mergeSym(sym1,sym2)} x) + +// LEAL into LEAL1 +(LEAL1 [off1] {sym1} (LEAL [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB => + (LEAL1 [off1+off2] {mergeSym(sym1,sym2)} x y) + +// LEAL1 into LEAL +(LEAL [off1] {sym1} (LEAL1 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (LEAL1 [off1+off2] {mergeSym(sym1,sym2)} x y) + +// LEAL into LEAL[248] +(LEAL2 [off1] {sym1} (LEAL [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB => + (LEAL2 [off1+off2] {mergeSym(sym1,sym2)} x y) +(LEAL4 [off1] {sym1} (LEAL [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB => + (LEAL4 [off1+off2] {mergeSym(sym1,sym2)} x y) +(LEAL8 [off1] {sym1} (LEAL [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB => + (LEAL8 [off1+off2] {mergeSym(sym1,sym2)} x y) + +// LEAL[248] into LEAL +(LEAL [off1] {sym1} (LEAL2 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (LEAL2 [off1+off2] {mergeSym(sym1,sym2)} x y) +(LEAL [off1] {sym1} (LEAL4 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (LEAL4 [off1+off2] {mergeSym(sym1,sym2)} x y) +(LEAL [off1] {sym1} (LEAL8 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (LEAL8 [off1+off2] {mergeSym(sym1,sym2)} x y) + +// LEAL[1248] into LEAL[1248]. Only some such merges are possible. +(LEAL1 [off1] {sym1} x (LEAL1 [off2] {sym2} y y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (LEAL2 [off1+off2] {mergeSym(sym1, sym2)} x y) +(LEAL1 [off1] {sym1} x (LEAL1 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (LEAL2 [off1+off2] {mergeSym(sym1, sym2)} y x) +(LEAL2 [off1] {sym} x (LEAL1 [off2] {nil} y y)) && is32Bit(int64(off1)+2*int64(off2)) => + (LEAL4 [off1+2*off2] {sym} x y) +(LEAL4 [off1] {sym} x (LEAL1 [off2] {nil} y y)) && is32Bit(int64(off1)+4*int64(off2)) => + (LEAL8 [off1+4*off2] {sym} x y) + +// Absorb InvertFlags into branches. +(LT (InvertFlags cmp) yes no) => (GT cmp yes no) +(GT (InvertFlags cmp) yes no) => (LT cmp yes no) +(LE (InvertFlags cmp) yes no) => (GE cmp yes no) +(GE (InvertFlags cmp) yes no) => (LE cmp yes no) +(ULT (InvertFlags cmp) yes no) => (UGT cmp yes no) +(UGT (InvertFlags cmp) yes no) => (ULT cmp yes no) +(ULE (InvertFlags cmp) yes no) => (UGE cmp yes no) +(UGE (InvertFlags cmp) yes no) => (ULE cmp yes no) +(EQ (InvertFlags cmp) yes no) => (EQ cmp yes no) +(NE (InvertFlags cmp) yes no) => (NE cmp yes no) + +// Constant comparisons. +(CMPLconst (MOVLconst [x]) [y]) && x==y => (FlagEQ) +(CMPLconst (MOVLconst [x]) [y]) && x<y && uint32(x)<uint32(y) => (FlagLT_ULT) +(CMPLconst (MOVLconst [x]) [y]) && x<y && uint32(x)>uint32(y) => (FlagLT_UGT) +(CMPLconst (MOVLconst [x]) [y]) && x>y && uint32(x)<uint32(y) => (FlagGT_ULT) +(CMPLconst (MOVLconst [x]) [y]) && x>y && uint32(x)>uint32(y) => (FlagGT_UGT) + +(CMPWconst (MOVLconst [x]) [y]) && int16(x)==y => (FlagEQ) +(CMPWconst (MOVLconst [x]) [y]) && int16(x)<y && uint16(x)<uint16(y) => (FlagLT_ULT) +(CMPWconst (MOVLconst [x]) [y]) && int16(x)<y && uint16(x)>uint16(y) => (FlagLT_UGT) +(CMPWconst (MOVLconst [x]) [y]) && int16(x)>y && uint16(x)<uint16(y) => (FlagGT_ULT) +(CMPWconst (MOVLconst [x]) [y]) && int16(x)>y && uint16(x)>uint16(y) => (FlagGT_UGT) + +(CMPBconst (MOVLconst [x]) [y]) && int8(x)==y => (FlagEQ) +(CMPBconst (MOVLconst [x]) [y]) && int8(x)<y && uint8(x)<uint8(y) => (FlagLT_ULT) +(CMPBconst (MOVLconst [x]) [y]) && int8(x)<y && uint8(x)>uint8(y) => (FlagLT_UGT) +(CMPBconst (MOVLconst [x]) [y]) && int8(x)>y && uint8(x)<uint8(y) => (FlagGT_ULT) +(CMPBconst (MOVLconst [x]) [y]) && int8(x)>y && uint8(x)>uint8(y) => (FlagGT_UGT) + +// Other known comparisons. +(CMPLconst (SHRLconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 32 && (1<<uint64(32-c)) <= uint64(n) => (FlagLT_ULT) +(CMPLconst (ANDLconst _ [m]) [n]) && 0 <= m && m < n => (FlagLT_ULT) +(CMPWconst (ANDLconst _ [m]) [n]) && 0 <= int16(m) && int16(m) < n => (FlagLT_ULT) +(CMPBconst (ANDLconst _ [m]) [n]) && 0 <= int8(m) && int8(m) < n => (FlagLT_ULT) +// TODO: DIVxU also. + +// Absorb flag constants into SBB ops. +(SBBLcarrymask (FlagEQ)) => (MOVLconst [0]) +(SBBLcarrymask (FlagLT_ULT)) => (MOVLconst [-1]) +(SBBLcarrymask (FlagLT_UGT)) => (MOVLconst [0]) +(SBBLcarrymask (FlagGT_ULT)) => (MOVLconst [-1]) +(SBBLcarrymask (FlagGT_UGT)) => (MOVLconst [0]) + +// Absorb flag constants into branches. +(EQ (FlagEQ) yes no) => (First yes no) +(EQ (FlagLT_ULT) yes no) => (First no yes) +(EQ (FlagLT_UGT) yes no) => (First no yes) +(EQ (FlagGT_ULT) yes no) => (First no yes) +(EQ (FlagGT_UGT) yes no) => (First no yes) + +(NE (FlagEQ) yes no) => (First no yes) +(NE (FlagLT_ULT) yes no) => (First yes no) +(NE (FlagLT_UGT) yes no) => (First yes no) +(NE (FlagGT_ULT) yes no) => (First yes no) +(NE (FlagGT_UGT) yes no) => (First yes no) + +(LT (FlagEQ) yes no) => (First no yes) +(LT (FlagLT_ULT) yes no) => (First yes no) +(LT (FlagLT_UGT) yes no) => (First yes no) +(LT (FlagGT_ULT) yes no) => (First no yes) +(LT (FlagGT_UGT) yes no) => (First no yes) + +(LE (FlagEQ) yes no) => (First yes no) +(LE (FlagLT_ULT) yes no) => (First yes no) +(LE (FlagLT_UGT) yes no) => (First yes no) +(LE (FlagGT_ULT) yes no) => (First no yes) +(LE (FlagGT_UGT) yes no) => (First no yes) + +(GT (FlagEQ) yes no) => (First no yes) +(GT (FlagLT_ULT) yes no) => (First no yes) +(GT (FlagLT_UGT) yes no) => (First no yes) +(GT (FlagGT_ULT) yes no) => (First yes no) +(GT (FlagGT_UGT) yes no) => (First yes no) + +(GE (FlagEQ) yes no) => (First yes no) +(GE (FlagLT_ULT) yes no) => (First no yes) +(GE (FlagLT_UGT) yes no) => (First no yes) +(GE (FlagGT_ULT) yes no) => (First yes no) +(GE (FlagGT_UGT) yes no) => (First yes no) + +(ULT (FlagEQ) yes no) => (First no yes) +(ULT (FlagLT_ULT) yes no) => (First yes no) +(ULT (FlagLT_UGT) yes no) => (First no yes) +(ULT (FlagGT_ULT) yes no) => (First yes no) +(ULT (FlagGT_UGT) yes no) => (First no yes) + +(ULE (FlagEQ) yes no) => (First yes no) +(ULE (FlagLT_ULT) yes no) => (First yes no) +(ULE (FlagLT_UGT) yes no) => (First no yes) +(ULE (FlagGT_ULT) yes no) => (First yes no) +(ULE (FlagGT_UGT) yes no) => (First no yes) + +(UGT (FlagEQ) yes no) => (First no yes) +(UGT (FlagLT_ULT) yes no) => (First no yes) +(UGT (FlagLT_UGT) yes no) => (First yes no) +(UGT (FlagGT_ULT) yes no) => (First no yes) +(UGT (FlagGT_UGT) yes no) => (First yes no) + +(UGE (FlagEQ) yes no) => (First yes no) +(UGE (FlagLT_ULT) yes no) => (First no yes) +(UGE (FlagLT_UGT) yes no) => (First yes no) +(UGE (FlagGT_ULT) yes no) => (First no yes) +(UGE (FlagGT_UGT) yes no) => (First yes no) + +// Absorb flag constants into SETxx ops. +(SETEQ (FlagEQ)) => (MOVLconst [1]) +(SETEQ (FlagLT_ULT)) => (MOVLconst [0]) +(SETEQ (FlagLT_UGT)) => (MOVLconst [0]) +(SETEQ (FlagGT_ULT)) => (MOVLconst [0]) +(SETEQ (FlagGT_UGT)) => (MOVLconst [0]) + +(SETNE (FlagEQ)) => (MOVLconst [0]) +(SETNE (FlagLT_ULT)) => (MOVLconst [1]) +(SETNE (FlagLT_UGT)) => (MOVLconst [1]) +(SETNE (FlagGT_ULT)) => (MOVLconst [1]) +(SETNE (FlagGT_UGT)) => (MOVLconst [1]) + +(SETL (FlagEQ)) => (MOVLconst [0]) +(SETL (FlagLT_ULT)) => (MOVLconst [1]) +(SETL (FlagLT_UGT)) => (MOVLconst [1]) +(SETL (FlagGT_ULT)) => (MOVLconst [0]) +(SETL (FlagGT_UGT)) => (MOVLconst [0]) + +(SETLE (FlagEQ)) => (MOVLconst [1]) +(SETLE (FlagLT_ULT)) => (MOVLconst [1]) +(SETLE (FlagLT_UGT)) => (MOVLconst [1]) +(SETLE (FlagGT_ULT)) => (MOVLconst [0]) +(SETLE (FlagGT_UGT)) => (MOVLconst [0]) + +(SETG (FlagEQ)) => (MOVLconst [0]) +(SETG (FlagLT_ULT)) => (MOVLconst [0]) +(SETG (FlagLT_UGT)) => (MOVLconst [0]) +(SETG (FlagGT_ULT)) => (MOVLconst [1]) +(SETG (FlagGT_UGT)) => (MOVLconst [1]) + +(SETGE (FlagEQ)) => (MOVLconst [1]) +(SETGE (FlagLT_ULT)) => (MOVLconst [0]) +(SETGE (FlagLT_UGT)) => (MOVLconst [0]) +(SETGE (FlagGT_ULT)) => (MOVLconst [1]) +(SETGE (FlagGT_UGT)) => (MOVLconst [1]) + +(SETB (FlagEQ)) => (MOVLconst [0]) +(SETB (FlagLT_ULT)) => (MOVLconst [1]) +(SETB (FlagLT_UGT)) => (MOVLconst [0]) +(SETB (FlagGT_ULT)) => (MOVLconst [1]) +(SETB (FlagGT_UGT)) => (MOVLconst [0]) + +(SETBE (FlagEQ)) => (MOVLconst [1]) +(SETBE (FlagLT_ULT)) => (MOVLconst [1]) +(SETBE (FlagLT_UGT)) => (MOVLconst [0]) +(SETBE (FlagGT_ULT)) => (MOVLconst [1]) +(SETBE (FlagGT_UGT)) => (MOVLconst [0]) + +(SETA (FlagEQ)) => (MOVLconst [0]) +(SETA (FlagLT_ULT)) => (MOVLconst [0]) +(SETA (FlagLT_UGT)) => (MOVLconst [1]) +(SETA (FlagGT_ULT)) => (MOVLconst [0]) +(SETA (FlagGT_UGT)) => (MOVLconst [1]) + +(SETAE (FlagEQ)) => (MOVLconst [1]) +(SETAE (FlagLT_ULT)) => (MOVLconst [0]) +(SETAE (FlagLT_UGT)) => (MOVLconst [1]) +(SETAE (FlagGT_ULT)) => (MOVLconst [0]) +(SETAE (FlagGT_UGT)) => (MOVLconst [1]) + +// Remove redundant *const ops +(ADDLconst [c] x) && c==0 => x +(SUBLconst [c] x) && c==0 => x +(ANDLconst [c] _) && c==0 => (MOVLconst [0]) +(ANDLconst [c] x) && c==-1 => x +(ORLconst [c] x) && c==0 => x +(ORLconst [c] _) && c==-1 => (MOVLconst [-1]) +(XORLconst [c] x) && c==0 => x +// TODO: since we got rid of the W/B versions, we might miss +// things like (ANDLconst [0x100] x) which were formerly +// (ANDBconst [0] x). Probably doesn't happen very often. +// If we cared, we might do: +// (ANDLconst <t> [c] x) && t.Size()==1 && int8(x)==0 => (MOVLconst [0]) + +// Convert constant subtracts to constant adds +(SUBLconst [c] x) => (ADDLconst [-c] x) + +// generic constant folding +// TODO: more of this +(ADDLconst [c] (MOVLconst [d])) => (MOVLconst [c+d]) +(ADDLconst [c] (ADDLconst [d] x)) => (ADDLconst [c+d] x) +(SARLconst [c] (MOVLconst [d])) => (MOVLconst [d>>uint64(c)]) +(SARWconst [c] (MOVLconst [d])) => (MOVLconst [d>>uint64(c)]) +(SARBconst [c] (MOVLconst [d])) => (MOVLconst [d>>uint64(c)]) +(NEGL (MOVLconst [c])) => (MOVLconst [-c]) +(MULLconst [c] (MOVLconst [d])) => (MOVLconst [c*d]) +(ANDLconst [c] (MOVLconst [d])) => (MOVLconst [c&d]) +(ORLconst [c] (MOVLconst [d])) => (MOVLconst [c|d]) +(XORLconst [c] (MOVLconst [d])) => (MOVLconst [c^d]) +(NOTL (MOVLconst [c])) => (MOVLconst [^c]) + +// generic simplifications +// TODO: more of this +(ADDL x (NEGL y)) => (SUBL x y) +(SUBL x x) => (MOVLconst [0]) +(ANDL x x) => x +(ORL x x) => x +(XORL x x) => (MOVLconst [0]) + +// checking AND against 0. +(CMP(L|W|B)const l:(ANDL x y) [0]) && l.Uses==1 => (TEST(L|W|B) x y) +(CMPLconst l:(ANDLconst [c] x) [0]) && l.Uses==1 => (TESTLconst [c] x) +(CMPWconst l:(ANDLconst [c] x) [0]) && l.Uses==1 => (TESTWconst [int16(c)] x) +(CMPBconst l:(ANDLconst [c] x) [0]) && l.Uses==1 => (TESTBconst [int8(c)] x) + +// TEST %reg,%reg is shorter than CMP +(CMP(L|W|B)const x [0]) => (TEST(L|W|B) x x) + +// Convert LEAL1 back to ADDL if we can +(LEAL1 [0] {nil} x y) => (ADDL x y) + +// Combining byte loads into larger (unaligned) loads. +// There are many ways these combinations could occur. This is +// designed to match the way encoding/binary.LittleEndian does it. +(ORL x0:(MOVBload [i0] {s} p mem) + s0:(SHLLconst [8] x1:(MOVBload [i1] {s} p mem))) + && i1 == i0+1 + && x0.Uses == 1 + && x1.Uses == 1 + && s0.Uses == 1 + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, s0) + => @mergePoint(b,x0,x1) (MOVWload [i0] {s} p mem) + +(ORL x0:(MOVBload [i] {s} p0 mem) + s0:(SHLLconst [8] x1:(MOVBload [i] {s} p1 mem))) + && x0.Uses == 1 + && x1.Uses == 1 + && s0.Uses == 1 + && sequentialAddresses(p0, p1, 1) + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, s0) + => @mergePoint(b,x0,x1) (MOVWload [i] {s} p0 mem) + +(ORL o0:(ORL + x0:(MOVWload [i0] {s} p mem) + s0:(SHLLconst [16] x1:(MOVBload [i2] {s} p mem))) + s1:(SHLLconst [24] x2:(MOVBload [i3] {s} p mem))) + && i2 == i0+2 + && i3 == i0+3 + && x0.Uses == 1 + && x1.Uses == 1 + && x2.Uses == 1 + && s0.Uses == 1 + && s1.Uses == 1 + && o0.Uses == 1 + && mergePoint(b,x0,x1,x2) != nil + && clobber(x0, x1, x2, s0, s1, o0) + => @mergePoint(b,x0,x1,x2) (MOVLload [i0] {s} p mem) + +(ORL o0:(ORL + x0:(MOVWload [i] {s} p0 mem) + s0:(SHLLconst [16] x1:(MOVBload [i] {s} p1 mem))) + s1:(SHLLconst [24] x2:(MOVBload [i] {s} p2 mem))) + && x0.Uses == 1 + && x1.Uses == 1 + && x2.Uses == 1 + && s0.Uses == 1 + && s1.Uses == 1 + && o0.Uses == 1 + && sequentialAddresses(p0, p1, 2) + && sequentialAddresses(p1, p2, 1) + && mergePoint(b,x0,x1,x2) != nil + && clobber(x0, x1, x2, s0, s1, o0) + => @mergePoint(b,x0,x1,x2) (MOVLload [i] {s} p0 mem) + +// Combine constant stores into larger (unaligned) stores. +(MOVBstoreconst [c] {s} p x:(MOVBstoreconst [a] {s} p mem)) + && x.Uses == 1 + && a.Off() + 1 == c.Off() + && clobber(x) + => (MOVWstoreconst [makeValAndOff(a.Val()&0xff | c.Val()<<8, a.Off())] {s} p mem) +(MOVBstoreconst [a] {s} p x:(MOVBstoreconst [c] {s} p mem)) + && x.Uses == 1 + && a.Off() + 1 == c.Off() + && clobber(x) + => (MOVWstoreconst [makeValAndOff(a.Val()&0xff | c.Val()<<8, a.Off())] {s} p mem) + +(MOVBstoreconst [c] {s} p1 x:(MOVBstoreconst [a] {s} p0 mem)) + && x.Uses == 1 + && a.Off() == c.Off() + && sequentialAddresses(p0, p1, 1) + && clobber(x) + => (MOVWstoreconst [makeValAndOff(a.Val()&0xff | c.Val()<<8, a.Off())] {s} p0 mem) +(MOVBstoreconst [a] {s} p0 x:(MOVBstoreconst [c] {s} p1 mem)) + && x.Uses == 1 + && a.Off() == c.Off() + && sequentialAddresses(p0, p1, 1) + && clobber(x) + => (MOVWstoreconst [makeValAndOff(a.Val()&0xff | c.Val()<<8, a.Off())] {s} p0 mem) + +(MOVWstoreconst [c] {s} p x:(MOVWstoreconst [a] {s} p mem)) + && x.Uses == 1 + && a.Off() + 2 == c.Off() + && clobber(x) + => (MOVLstoreconst [makeValAndOff(a.Val()&0xffff | c.Val()<<16, a.Off())] {s} p mem) +(MOVWstoreconst [a] {s} p x:(MOVWstoreconst [c] {s} p mem)) + && x.Uses == 1 + && ValAndOff(a).Off() + 2 == ValAndOff(c).Off() + && clobber(x) + => (MOVLstoreconst [makeValAndOff(a.Val()&0xffff | c.Val()<<16, a.Off())] {s} p mem) + +(MOVWstoreconst [c] {s} p1 x:(MOVWstoreconst [a] {s} p0 mem)) + && x.Uses == 1 + && a.Off() == c.Off() + && sequentialAddresses(p0, p1, 2) + && clobber(x) + => (MOVLstoreconst [makeValAndOff(a.Val()&0xffff | c.Val()<<16, a.Off())] {s} p0 mem) +(MOVWstoreconst [a] {s} p0 x:(MOVWstoreconst [c] {s} p1 mem)) + && x.Uses == 1 + && a.Off() == c.Off() + && sequentialAddresses(p0, p1, 2) + && clobber(x) + => (MOVLstoreconst [makeValAndOff(a.Val()&0xffff | c.Val()<<16, a.Off())] {s} p0 mem) + +// Combine stores into larger (unaligned) stores. +(MOVBstore [i] {s} p (SHR(W|L)const [8] w) x:(MOVBstore [i-1] {s} p w mem)) + && x.Uses == 1 + && clobber(x) + => (MOVWstore [i-1] {s} p w mem) +(MOVBstore [i] {s} p w x:(MOVBstore {s} [i+1] p (SHR(W|L)const [8] w) mem)) + && x.Uses == 1 + && clobber(x) + => (MOVWstore [i] {s} p w mem) +(MOVBstore [i] {s} p (SHRLconst [j] w) x:(MOVBstore [i-1] {s} p w0:(SHRLconst [j-8] w) mem)) + && x.Uses == 1 + && clobber(x) + => (MOVWstore [i-1] {s} p w0 mem) + +(MOVBstore [i] {s} p1 (SHR(W|L)const [8] w) x:(MOVBstore [i] {s} p0 w mem)) + && x.Uses == 1 + && sequentialAddresses(p0, p1, 1) + && clobber(x) + => (MOVWstore [i] {s} p0 w mem) +(MOVBstore [i] {s} p0 w x:(MOVBstore {s} [i] p1 (SHR(W|L)const [8] w) mem)) + && x.Uses == 1 + && sequentialAddresses(p0, p1, 1) + && clobber(x) + => (MOVWstore [i] {s} p0 w mem) +(MOVBstore [i] {s} p1 (SHRLconst [j] w) x:(MOVBstore [i] {s} p0 w0:(SHRLconst [j-8] w) mem)) + && x.Uses == 1 + && sequentialAddresses(p0, p1, 1) + && clobber(x) + => (MOVWstore [i] {s} p0 w0 mem) + +(MOVWstore [i] {s} p (SHRLconst [16] w) x:(MOVWstore [i-2] {s} p w mem)) + && x.Uses == 1 + && clobber(x) + => (MOVLstore [i-2] {s} p w mem) +(MOVWstore [i] {s} p (SHRLconst [j] w) x:(MOVWstore [i-2] {s} p w0:(SHRLconst [j-16] w) mem)) + && x.Uses == 1 + && clobber(x) + => (MOVLstore [i-2] {s} p w0 mem) + +(MOVWstore [i] {s} p1 (SHRLconst [16] w) x:(MOVWstore [i] {s} p0 w mem)) + && x.Uses == 1 + && sequentialAddresses(p0, p1, 2) + && clobber(x) + => (MOVLstore [i] {s} p0 w mem) +(MOVWstore [i] {s} p1 (SHRLconst [j] w) x:(MOVWstore [i] {s} p0 w0:(SHRLconst [j-16] w) mem)) + && x.Uses == 1 + && sequentialAddresses(p0, p1, 2) + && clobber(x) + => (MOVLstore [i] {s} p0 w0 mem) + +// For PIC, break floating-point constant loading into two instructions so we have +// a register to use for holding the address of the constant pool entry. +(MOVSSconst [c]) && config.ctxt.Flag_shared => (MOVSSconst2 (MOVSSconst1 [c])) +(MOVSDconst [c]) && config.ctxt.Flag_shared => (MOVSDconst2 (MOVSDconst1 [c])) + +(CMP(L|W|B) l:(MOV(L|W|B)load {sym} [off] ptr mem) x) && canMergeLoad(v, l) && clobber(l) => (CMP(L|W|B)load {sym} [off] ptr x mem) +(CMP(L|W|B) x l:(MOV(L|W|B)load {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (InvertFlags (CMP(L|W|B)load {sym} [off] ptr x mem)) + +(CMP(L|W|B)const l:(MOV(L|W|B)load {sym} [off] ptr mem) [c]) + && l.Uses == 1 + && clobber(l) => + @l.Block (CMP(L|W|B)constload {sym} [makeValAndOff(int32(c),off)] ptr mem) + +(CMPLload {sym} [off] ptr (MOVLconst [c]) mem) => (CMPLconstload {sym} [makeValAndOff(c,off)] ptr mem) +(CMPWload {sym} [off] ptr (MOVLconst [c]) mem) => (CMPWconstload {sym} [makeValAndOff(int32(int16(c)),off)] ptr mem) +(CMPBload {sym} [off] ptr (MOVLconst [c]) mem) => (CMPBconstload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + +(MOVBload [off] {sym} (SB) _) && symIsRO(sym) => (MOVLconst [int32(read8(sym, int64(off)))]) +(MOVWload [off] {sym} (SB) _) && symIsRO(sym) => (MOVLconst [int32(read16(sym, int64(off), config.ctxt.Arch.ByteOrder))]) +(MOVLload [off] {sym} (SB) _) && symIsRO(sym) => (MOVLconst [int32(read32(sym, int64(off), config.ctxt.Arch.ByteOrder))]) diff --git a/src/cmd/compile/internal/ssa/_gen/386Ops.go b/src/cmd/compile/internal/ssa/_gen/386Ops.go new file mode 100644 index 0000000..c66650c --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/386Ops.go @@ -0,0 +1,588 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import "strings" + +// Notes: +// - Integer types live in the low portion of registers. Upper portions are junk. +// - Boolean types use the low-order byte of a register. 0=false, 1=true. +// Upper bytes are junk. +// - Floating-point types live in the low natural slot of an sse2 register. +// Unused portions are junk. +// - We do not use AH,BH,CH,DH registers. +// - When doing sub-register operations, we try to write the whole +// destination register to avoid a partial-register write. +// - Unused portions of AuxInt (or the Val portion of ValAndOff) are +// filled by sign-extending the used portion. Users of AuxInt which interpret +// AuxInt as unsigned (e.g. shifts) must be careful. + +// Suffixes encode the bit width of various instructions. +// L (long word) = 32 bit +// W (word) = 16 bit +// B (byte) = 8 bit + +// copied from ../../x86/reg.go +var regNames386 = []string{ + "AX", + "CX", + "DX", + "BX", + "SP", + "BP", + "SI", + "DI", + "X0", + "X1", + "X2", + "X3", + "X4", + "X5", + "X6", + "X7", + + // If you add registers, update asyncPreempt in runtime + + // pseudo-registers + "SB", +} + +func init() { + // Make map from reg names to reg integers. + if len(regNames386) > 64 { + panic("too many registers") + } + num := map[string]int{} + for i, name := range regNames386 { + num[name] = i + } + buildReg := func(s string) regMask { + m := regMask(0) + for _, r := range strings.Split(s, " ") { + if n, ok := num[r]; ok { + m |= regMask(1) << uint(n) + continue + } + panic("register " + r + " not found") + } + return m + } + + // Common individual register masks + var ( + ax = buildReg("AX") + cx = buildReg("CX") + dx = buildReg("DX") + bx = buildReg("BX") + si = buildReg("SI") + gp = buildReg("AX CX DX BX BP SI DI") + fp = buildReg("X0 X1 X2 X3 X4 X5 X6 X7") + gpsp = gp | buildReg("SP") + gpspsb = gpsp | buildReg("SB") + callerSave = gp | fp + ) + // Common slices of register masks + var ( + gponly = []regMask{gp} + fponly = []regMask{fp} + ) + + // Common regInfo + var ( + gp01 = regInfo{inputs: nil, outputs: gponly} + gp11 = regInfo{inputs: []regMask{gp}, outputs: gponly} + gp11sp = regInfo{inputs: []regMask{gpsp}, outputs: gponly} + gp11sb = regInfo{inputs: []regMask{gpspsb}, outputs: gponly} + gp21 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly} + gp11carry = regInfo{inputs: []regMask{gp}, outputs: []regMask{gp, 0}} + gp21carry = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp, 0}} + gp1carry1 = regInfo{inputs: []regMask{gp}, outputs: gponly} + gp2carry1 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly} + gp21sp = regInfo{inputs: []regMask{gpsp, gp}, outputs: gponly} + gp21sb = regInfo{inputs: []regMask{gpspsb, gpsp}, outputs: gponly} + gp21shift = regInfo{inputs: []regMask{gp, cx}, outputs: []regMask{gp}} + gp11div = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{ax}, clobbers: dx} + gp21hmul = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx}, clobbers: ax} + gp11mod = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{dx}, clobbers: ax} + gp21mul = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx, ax}} + + gp2flags = regInfo{inputs: []regMask{gpsp, gpsp}} + gp1flags = regInfo{inputs: []regMask{gpsp}} + gp0flagsLoad = regInfo{inputs: []regMask{gpspsb, 0}} + gp1flagsLoad = regInfo{inputs: []regMask{gpspsb, gpsp, 0}} + flagsgp = regInfo{inputs: nil, outputs: gponly} + + readflags = regInfo{inputs: nil, outputs: gponly} + flagsgpax = regInfo{inputs: nil, clobbers: ax, outputs: []regMask{gp &^ ax}} + + gpload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: gponly} + gp21load = regInfo{inputs: []regMask{gp, gpspsb, 0}, outputs: gponly} + gploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: gponly} + gp21loadidx = regInfo{inputs: []regMask{gp, gpspsb, gpsp, 0}, outputs: gponly} + + gpstore = regInfo{inputs: []regMask{gpspsb, gpsp, 0}} + gpstoreconst = regInfo{inputs: []regMask{gpspsb, 0}} + gpstoreidx = regInfo{inputs: []regMask{gpspsb, gpsp, gpsp, 0}} + gpstoreconstidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}} + + fp01 = regInfo{inputs: nil, outputs: fponly} + fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: fponly} + fp21load = regInfo{inputs: []regMask{fp, gpspsb, 0}, outputs: fponly} + fpgp = regInfo{inputs: fponly, outputs: gponly} + gpfp = regInfo{inputs: gponly, outputs: fponly} + fp11 = regInfo{inputs: fponly, outputs: fponly} + fp2flags = regInfo{inputs: []regMask{fp, fp}} + + fpload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: fponly} + fploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: fponly} + + fpstore = regInfo{inputs: []regMask{gpspsb, fp, 0}} + fpstoreidx = regInfo{inputs: []regMask{gpspsb, gpsp, fp, 0}} + ) + + var _386ops = []opData{ + // fp ops + {name: "ADDSS", argLength: 2, reg: fp21, asm: "ADDSS", commutative: true, resultInArg0: true}, // fp32 add + {name: "ADDSD", argLength: 2, reg: fp21, asm: "ADDSD", commutative: true, resultInArg0: true}, // fp64 add + {name: "SUBSS", argLength: 2, reg: fp21, asm: "SUBSS", resultInArg0: true}, // fp32 sub + {name: "SUBSD", argLength: 2, reg: fp21, asm: "SUBSD", resultInArg0: true}, // fp64 sub + {name: "MULSS", argLength: 2, reg: fp21, asm: "MULSS", commutative: true, resultInArg0: true}, // fp32 mul + {name: "MULSD", argLength: 2, reg: fp21, asm: "MULSD", commutative: true, resultInArg0: true}, // fp64 mul + {name: "DIVSS", argLength: 2, reg: fp21, asm: "DIVSS", resultInArg0: true}, // fp32 div + {name: "DIVSD", argLength: 2, reg: fp21, asm: "DIVSD", resultInArg0: true}, // fp64 div + + {name: "MOVSSload", argLength: 2, reg: fpload, asm: "MOVSS", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // fp32 load + {name: "MOVSDload", argLength: 2, reg: fpload, asm: "MOVSD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // fp64 load + {name: "MOVSSconst", reg: fp01, asm: "MOVSS", aux: "Float32", rematerializeable: true}, // fp32 constant + {name: "MOVSDconst", reg: fp01, asm: "MOVSD", aux: "Float64", rematerializeable: true}, // fp64 constant + {name: "MOVSSloadidx1", argLength: 3, reg: fploadidx, asm: "MOVSS", aux: "SymOff", symEffect: "Read"}, // fp32 load indexed by i + {name: "MOVSSloadidx4", argLength: 3, reg: fploadidx, asm: "MOVSS", aux: "SymOff", symEffect: "Read"}, // fp32 load indexed by 4*i + {name: "MOVSDloadidx1", argLength: 3, reg: fploadidx, asm: "MOVSD", aux: "SymOff", symEffect: "Read"}, // fp64 load indexed by i + {name: "MOVSDloadidx8", argLength: 3, reg: fploadidx, asm: "MOVSD", aux: "SymOff", symEffect: "Read"}, // fp64 load indexed by 8*i + + {name: "MOVSSstore", argLength: 3, reg: fpstore, asm: "MOVSS", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // fp32 store + {name: "MOVSDstore", argLength: 3, reg: fpstore, asm: "MOVSD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // fp64 store + {name: "MOVSSstoreidx1", argLength: 4, reg: fpstoreidx, asm: "MOVSS", aux: "SymOff", symEffect: "Write"}, // fp32 indexed by i store + {name: "MOVSSstoreidx4", argLength: 4, reg: fpstoreidx, asm: "MOVSS", aux: "SymOff", symEffect: "Write"}, // fp32 indexed by 4i store + {name: "MOVSDstoreidx1", argLength: 4, reg: fpstoreidx, asm: "MOVSD", aux: "SymOff", symEffect: "Write"}, // fp64 indexed by i store + {name: "MOVSDstoreidx8", argLength: 4, reg: fpstoreidx, asm: "MOVSD", aux: "SymOff", symEffect: "Write"}, // fp64 indexed by 8i store + + {name: "ADDSSload", argLength: 3, reg: fp21load, asm: "ADDSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 + tmp, tmp loaded from arg1+auxint+aux, arg2 = mem + {name: "ADDSDload", argLength: 3, reg: fp21load, asm: "ADDSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 + tmp, tmp loaded from arg1+auxint+aux, arg2 = mem + {name: "SUBSSload", argLength: 3, reg: fp21load, asm: "SUBSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 - tmp, tmp loaded from arg1+auxint+aux, arg2 = mem + {name: "SUBSDload", argLength: 3, reg: fp21load, asm: "SUBSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 - tmp, tmp loaded from arg1+auxint+aux, arg2 = mem + {name: "MULSSload", argLength: 3, reg: fp21load, asm: "MULSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 * tmp, tmp loaded from arg1+auxint+aux, arg2 = mem + {name: "MULSDload", argLength: 3, reg: fp21load, asm: "MULSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 * tmp, tmp loaded from arg1+auxint+aux, arg2 = mem + {name: "DIVSSload", argLength: 3, reg: fp21load, asm: "DIVSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 / tmp, tmp loaded from arg1+auxint+aux, arg2 = mem + {name: "DIVSDload", argLength: 3, reg: fp21load, asm: "DIVSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 / tmp, tmp loaded from arg1+auxint+aux, arg2 = mem + + // binary ops + {name: "ADDL", argLength: 2, reg: gp21sp, asm: "ADDL", commutative: true, clobberFlags: true}, // arg0 + arg1 + {name: "ADDLconst", argLength: 1, reg: gp11sp, asm: "ADDL", aux: "Int32", typ: "UInt32", clobberFlags: true}, // arg0 + auxint + + {name: "ADDLcarry", argLength: 2, reg: gp21carry, asm: "ADDL", commutative: true, resultInArg0: true}, // arg0 + arg1, generates <carry,result> pair + {name: "ADDLconstcarry", argLength: 1, reg: gp11carry, asm: "ADDL", aux: "Int32", resultInArg0: true}, // arg0 + auxint, generates <carry,result> pair + {name: "ADCL", argLength: 3, reg: gp2carry1, asm: "ADCL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0+arg1+carry(arg2), where arg2 is flags + {name: "ADCLconst", argLength: 2, reg: gp1carry1, asm: "ADCL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0+auxint+carry(arg1), where arg1 is flags + + {name: "SUBL", argLength: 2, reg: gp21, asm: "SUBL", resultInArg0: true, clobberFlags: true}, // arg0 - arg1 + {name: "SUBLconst", argLength: 1, reg: gp11, asm: "SUBL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 - auxint + + {name: "SUBLcarry", argLength: 2, reg: gp21carry, asm: "SUBL", resultInArg0: true}, // arg0-arg1, generates <borrow,result> pair + {name: "SUBLconstcarry", argLength: 1, reg: gp11carry, asm: "SUBL", aux: "Int32", resultInArg0: true}, // arg0-auxint, generates <borrow,result> pair + {name: "SBBL", argLength: 3, reg: gp2carry1, asm: "SBBL", resultInArg0: true, clobberFlags: true}, // arg0-arg1-borrow(arg2), where arg2 is flags + {name: "SBBLconst", argLength: 2, reg: gp1carry1, asm: "SBBL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0-auxint-borrow(arg1), where arg1 is flags + + {name: "MULL", argLength: 2, reg: gp21, asm: "IMULL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 * arg1 + {name: "MULLconst", argLength: 1, reg: gp11, asm: "IMUL3L", aux: "Int32", clobberFlags: true}, // arg0 * auxint + + {name: "MULLU", argLength: 2, reg: regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{ax, 0}, clobbers: dx}, typ: "(UInt32,Flags)", asm: "MULL", commutative: true, clobberFlags: true}, // Let x = arg0*arg1 (full 32x32->64 unsigned multiply). Returns uint32(x), and flags set to overflow if uint32(x) != x. + + {name: "HMULL", argLength: 2, reg: gp21hmul, commutative: true, asm: "IMULL", clobberFlags: true}, // (arg0 * arg1) >> width + {name: "HMULLU", argLength: 2, reg: gp21hmul, commutative: true, asm: "MULL", clobberFlags: true}, // (arg0 * arg1) >> width + + {name: "MULLQU", argLength: 2, reg: gp21mul, commutative: true, asm: "MULL", clobberFlags: true}, // arg0 * arg1, high 32 in result[0], low 32 in result[1] + + {name: "AVGLU", argLength: 2, reg: gp21, commutative: true, resultInArg0: true, clobberFlags: true}, // (arg0 + arg1) / 2 as unsigned, all 32 result bits + + // For DIVL, DIVW, MODL and MODW, AuxInt non-zero means that the divisor has been proved to be not -1. + {name: "DIVL", argLength: 2, reg: gp11div, asm: "IDIVL", aux: "Bool", clobberFlags: true}, // arg0 / arg1 + {name: "DIVW", argLength: 2, reg: gp11div, asm: "IDIVW", aux: "Bool", clobberFlags: true}, // arg0 / arg1 + {name: "DIVLU", argLength: 2, reg: gp11div, asm: "DIVL", clobberFlags: true}, // arg0 / arg1 + {name: "DIVWU", argLength: 2, reg: gp11div, asm: "DIVW", clobberFlags: true}, // arg0 / arg1 + + {name: "MODL", argLength: 2, reg: gp11mod, asm: "IDIVL", aux: "Bool", clobberFlags: true}, // arg0 % arg1 + {name: "MODW", argLength: 2, reg: gp11mod, asm: "IDIVW", aux: "Bool", clobberFlags: true}, // arg0 % arg1 + {name: "MODLU", argLength: 2, reg: gp11mod, asm: "DIVL", clobberFlags: true}, // arg0 % arg1 + {name: "MODWU", argLength: 2, reg: gp11mod, asm: "DIVW", clobberFlags: true}, // arg0 % arg1 + + {name: "ANDL", argLength: 2, reg: gp21, asm: "ANDL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 & arg1 + {name: "ANDLconst", argLength: 1, reg: gp11, asm: "ANDL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 & auxint + + {name: "ORL", argLength: 2, reg: gp21, asm: "ORL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 | arg1 + {name: "ORLconst", argLength: 1, reg: gp11, asm: "ORL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 | auxint + + {name: "XORL", argLength: 2, reg: gp21, asm: "XORL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 ^ arg1 + {name: "XORLconst", argLength: 1, reg: gp11, asm: "XORL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 ^ auxint + + {name: "CMPL", argLength: 2, reg: gp2flags, asm: "CMPL", typ: "Flags"}, // arg0 compare to arg1 + {name: "CMPW", argLength: 2, reg: gp2flags, asm: "CMPW", typ: "Flags"}, // arg0 compare to arg1 + {name: "CMPB", argLength: 2, reg: gp2flags, asm: "CMPB", typ: "Flags"}, // arg0 compare to arg1 + {name: "CMPLconst", argLength: 1, reg: gp1flags, asm: "CMPL", typ: "Flags", aux: "Int32"}, // arg0 compare to auxint + {name: "CMPWconst", argLength: 1, reg: gp1flags, asm: "CMPW", typ: "Flags", aux: "Int16"}, // arg0 compare to auxint + {name: "CMPBconst", argLength: 1, reg: gp1flags, asm: "CMPB", typ: "Flags", aux: "Int8"}, // arg0 compare to auxint + + // compare *(arg0+auxint+aux) to arg1 (in that order). arg2=mem. + {name: "CMPLload", argLength: 3, reg: gp1flagsLoad, asm: "CMPL", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true}, + {name: "CMPWload", argLength: 3, reg: gp1flagsLoad, asm: "CMPW", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true}, + {name: "CMPBload", argLength: 3, reg: gp1flagsLoad, asm: "CMPB", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true}, + + // compare *(arg0+ValAndOff(AuxInt).Off()+aux) to ValAndOff(AuxInt).Val() (in that order). arg1=mem. + {name: "CMPLconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPL", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true}, + {name: "CMPWconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPW", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true}, + {name: "CMPBconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPB", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true}, + + {name: "UCOMISS", argLength: 2, reg: fp2flags, asm: "UCOMISS", typ: "Flags"}, // arg0 compare to arg1, f32 + {name: "UCOMISD", argLength: 2, reg: fp2flags, asm: "UCOMISD", typ: "Flags"}, // arg0 compare to arg1, f64 + + {name: "TESTL", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTL", typ: "Flags"}, // (arg0 & arg1) compare to 0 + {name: "TESTW", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTW", typ: "Flags"}, // (arg0 & arg1) compare to 0 + {name: "TESTB", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTB", typ: "Flags"}, // (arg0 & arg1) compare to 0 + {name: "TESTLconst", argLength: 1, reg: gp1flags, asm: "TESTL", typ: "Flags", aux: "Int32"}, // (arg0 & auxint) compare to 0 + {name: "TESTWconst", argLength: 1, reg: gp1flags, asm: "TESTW", typ: "Flags", aux: "Int16"}, // (arg0 & auxint) compare to 0 + {name: "TESTBconst", argLength: 1, reg: gp1flags, asm: "TESTB", typ: "Flags", aux: "Int8"}, // (arg0 & auxint) compare to 0 + + {name: "SHLL", argLength: 2, reg: gp21shift, asm: "SHLL", resultInArg0: true, clobberFlags: true}, // arg0 << arg1, shift amount is mod 32 + {name: "SHLLconst", argLength: 1, reg: gp11, asm: "SHLL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 << auxint, shift amount 0-31 + // Note: x86 is weird, the 16 and 8 byte shifts still use all 5 bits of shift amount! + + {name: "SHRL", argLength: 2, reg: gp21shift, asm: "SHRL", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> arg1, shift amount is mod 32 + {name: "SHRW", argLength: 2, reg: gp21shift, asm: "SHRW", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> arg1, shift amount is mod 32 + {name: "SHRB", argLength: 2, reg: gp21shift, asm: "SHRB", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> arg1, shift amount is mod 32 + {name: "SHRLconst", argLength: 1, reg: gp11, asm: "SHRL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> auxint, shift amount 0-31 + {name: "SHRWconst", argLength: 1, reg: gp11, asm: "SHRW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> auxint, shift amount 0-15 + {name: "SHRBconst", argLength: 1, reg: gp11, asm: "SHRB", aux: "Int8", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> auxint, shift amount 0-7 + + {name: "SARL", argLength: 2, reg: gp21shift, asm: "SARL", resultInArg0: true, clobberFlags: true}, // signed arg0 >> arg1, shift amount is mod 32 + {name: "SARW", argLength: 2, reg: gp21shift, asm: "SARW", resultInArg0: true, clobberFlags: true}, // signed arg0 >> arg1, shift amount is mod 32 + {name: "SARB", argLength: 2, reg: gp21shift, asm: "SARB", resultInArg0: true, clobberFlags: true}, // signed arg0 >> arg1, shift amount is mod 32 + {name: "SARLconst", argLength: 1, reg: gp11, asm: "SARL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-31 + {name: "SARWconst", argLength: 1, reg: gp11, asm: "SARW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-15 + {name: "SARBconst", argLength: 1, reg: gp11, asm: "SARB", aux: "Int8", resultInArg0: true, clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-7 + + {name: "ROLL", argLength: 2, reg: gp21shift, asm: "ROLL", resultInArg0: true, clobberFlags: true}, // 32 bits of arg0 rotate left by arg1 + {name: "ROLW", argLength: 2, reg: gp21shift, asm: "ROLW", resultInArg0: true, clobberFlags: true}, // low 16 bits of arg0 rotate left by arg1 + {name: "ROLB", argLength: 2, reg: gp21shift, asm: "ROLB", resultInArg0: true, clobberFlags: true}, // low 8 bits of arg0 rotate left by arg1 + {name: "ROLLconst", argLength: 1, reg: gp11, asm: "ROLL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-31 + {name: "ROLWconst", argLength: 1, reg: gp11, asm: "ROLW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-15 + {name: "ROLBconst", argLength: 1, reg: gp11, asm: "ROLB", aux: "Int8", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-7 + + // binary-op with a memory source operand + {name: "ADDLload", argLength: 3, reg: gp21load, asm: "ADDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 + tmp, tmp loaded from arg1+auxint+aux, arg2 = mem + {name: "SUBLload", argLength: 3, reg: gp21load, asm: "SUBL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 - tmp, tmp loaded from arg1+auxint+aux, arg2 = mem + {name: "MULLload", argLength: 3, reg: gp21load, asm: "IMULL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 * tmp, tmp loaded from arg1+auxint+aux, arg2 = mem + {name: "ANDLload", argLength: 3, reg: gp21load, asm: "ANDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 & tmp, tmp loaded from arg1+auxint+aux, arg2 = mem + {name: "ORLload", argLength: 3, reg: gp21load, asm: "ORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 | tmp, tmp loaded from arg1+auxint+aux, arg2 = mem + {name: "XORLload", argLength: 3, reg: gp21load, asm: "XORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+auxint+aux, arg2 = mem + + // binary-op with an indexed memory source operand + {name: "ADDLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ADDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 + tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem + {name: "SUBLloadidx4", argLength: 4, reg: gp21loadidx, asm: "SUBL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 - tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem + {name: "MULLloadidx4", argLength: 4, reg: gp21loadidx, asm: "IMULL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 * tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem + {name: "ANDLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ANDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 & tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem + {name: "ORLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 | tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem + {name: "XORLloadidx4", argLength: 4, reg: gp21loadidx, asm: "XORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem + + // unary ops + {name: "NEGL", argLength: 1, reg: gp11, asm: "NEGL", resultInArg0: true, clobberFlags: true}, // -arg0 + + {name: "NOTL", argLength: 1, reg: gp11, asm: "NOTL", resultInArg0: true}, // ^arg0 + + {name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero + {name: "BSFW", argLength: 1, reg: gp11, asm: "BSFW", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero + + {name: "BSRL", argLength: 1, reg: gp11, asm: "BSRL", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero + {name: "BSRW", argLength: 1, reg: gp11, asm: "BSRW", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero + + {name: "BSWAPL", argLength: 1, reg: gp11, asm: "BSWAPL", resultInArg0: true}, // arg0 swap bytes + + {name: "SQRTSD", argLength: 1, reg: fp11, asm: "SQRTSD"}, // sqrt(arg0) + {name: "SQRTSS", argLength: 1, reg: fp11, asm: "SQRTSS"}, // sqrt(arg0), float32 + + {name: "SBBLcarrymask", argLength: 1, reg: flagsgp, asm: "SBBL"}, // (int32)(-1) if carry is set, 0 if carry is clear. + // Note: SBBW and SBBB are subsumed by SBBL + + {name: "SETEQ", argLength: 1, reg: readflags, asm: "SETEQ"}, // extract == condition from arg0 + {name: "SETNE", argLength: 1, reg: readflags, asm: "SETNE"}, // extract != condition from arg0 + {name: "SETL", argLength: 1, reg: readflags, asm: "SETLT"}, // extract signed < condition from arg0 + {name: "SETLE", argLength: 1, reg: readflags, asm: "SETLE"}, // extract signed <= condition from arg0 + {name: "SETG", argLength: 1, reg: readflags, asm: "SETGT"}, // extract signed > condition from arg0 + {name: "SETGE", argLength: 1, reg: readflags, asm: "SETGE"}, // extract signed >= condition from arg0 + {name: "SETB", argLength: 1, reg: readflags, asm: "SETCS"}, // extract unsigned < condition from arg0 + {name: "SETBE", argLength: 1, reg: readflags, asm: "SETLS"}, // extract unsigned <= condition from arg0 + {name: "SETA", argLength: 1, reg: readflags, asm: "SETHI"}, // extract unsigned > condition from arg0 + {name: "SETAE", argLength: 1, reg: readflags, asm: "SETCC"}, // extract unsigned >= condition from arg0 + {name: "SETO", argLength: 1, reg: readflags, asm: "SETOS"}, // extract if overflow flag is set from arg0 + // Need different opcodes for floating point conditions because + // any comparison involving a NaN is always FALSE and thus + // the patterns for inverting conditions cannot be used. + {name: "SETEQF", argLength: 1, reg: flagsgpax, asm: "SETEQ", clobberFlags: true}, // extract == condition from arg0 + {name: "SETNEF", argLength: 1, reg: flagsgpax, asm: "SETNE", clobberFlags: true}, // extract != condition from arg0 + {name: "SETORD", argLength: 1, reg: flagsgp, asm: "SETPC"}, // extract "ordered" (No Nan present) condition from arg0 + {name: "SETNAN", argLength: 1, reg: flagsgp, asm: "SETPS"}, // extract "unordered" (Nan present) condition from arg0 + + {name: "SETGF", argLength: 1, reg: flagsgp, asm: "SETHI"}, // extract floating > condition from arg0 + {name: "SETGEF", argLength: 1, reg: flagsgp, asm: "SETCC"}, // extract floating >= condition from arg0 + + {name: "MOVBLSX", argLength: 1, reg: gp11, asm: "MOVBLSX"}, // sign extend arg0 from int8 to int32 + {name: "MOVBLZX", argLength: 1, reg: gp11, asm: "MOVBLZX"}, // zero extend arg0 from int8 to int32 + {name: "MOVWLSX", argLength: 1, reg: gp11, asm: "MOVWLSX"}, // sign extend arg0 from int16 to int32 + {name: "MOVWLZX", argLength: 1, reg: gp11, asm: "MOVWLZX"}, // zero extend arg0 from int16 to int32 + + {name: "MOVLconst", reg: gp01, asm: "MOVL", typ: "UInt32", aux: "Int32", rematerializeable: true}, // 32 low bits of auxint + + {name: "CVTTSD2SL", argLength: 1, reg: fpgp, asm: "CVTTSD2SL"}, // convert float64 to int32 + {name: "CVTTSS2SL", argLength: 1, reg: fpgp, asm: "CVTTSS2SL"}, // convert float32 to int32 + {name: "CVTSL2SS", argLength: 1, reg: gpfp, asm: "CVTSL2SS"}, // convert int32 to float32 + {name: "CVTSL2SD", argLength: 1, reg: gpfp, asm: "CVTSL2SD"}, // convert int32 to float64 + {name: "CVTSD2SS", argLength: 1, reg: fp11, asm: "CVTSD2SS"}, // convert float64 to float32 + {name: "CVTSS2SD", argLength: 1, reg: fp11, asm: "CVTSS2SD"}, // convert float32 to float64 + + {name: "PXOR", argLength: 2, reg: fp21, asm: "PXOR", commutative: true, resultInArg0: true}, // exclusive or, applied to X regs for float negation. + + {name: "LEAL", argLength: 1, reg: gp11sb, aux: "SymOff", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxint + offset encoded in aux + {name: "LEAL1", argLength: 2, reg: gp21sb, commutative: true, aux: "SymOff", symEffect: "Addr"}, // arg0 + arg1 + auxint + aux + {name: "LEAL2", argLength: 2, reg: gp21sb, aux: "SymOff", symEffect: "Addr"}, // arg0 + 2*arg1 + auxint + aux + {name: "LEAL4", argLength: 2, reg: gp21sb, aux: "SymOff", symEffect: "Addr"}, // arg0 + 4*arg1 + auxint + aux + {name: "LEAL8", argLength: 2, reg: gp21sb, aux: "SymOff", symEffect: "Addr"}, // arg0 + 8*arg1 + auxint + aux + // Note: LEAL{1,2,4,8} must not have OpSB as either argument. + + // auxint+aux == add auxint and the offset of the symbol in aux (if any) to the effective address + {name: "MOVBload", argLength: 2, reg: gpload, asm: "MOVBLZX", aux: "SymOff", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load byte from arg0+auxint+aux. arg1=mem. Zero extend. + {name: "MOVBLSXload", argLength: 2, reg: gpload, asm: "MOVBLSX", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // ditto, sign extend to int32 + {name: "MOVWload", argLength: 2, reg: gpload, asm: "MOVWLZX", aux: "SymOff", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load 2 bytes from arg0+auxint+aux. arg1=mem. Zero extend. + {name: "MOVWLSXload", argLength: 2, reg: gpload, asm: "MOVWLSX", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // ditto, sign extend to int32 + {name: "MOVLload", argLength: 2, reg: gpload, asm: "MOVL", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load 4 bytes from arg0+auxint+aux. arg1=mem. Zero extend. + {name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store byte in arg1 to arg0+auxint+aux. arg2=mem + {name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem + {name: "MOVLstore", argLength: 3, reg: gpstore, asm: "MOVL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem + + // direct binary-op on memory (read-modify-write) + {name: "ADDLmodify", argLength: 3, reg: gpstore, asm: "ADDL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) += arg1, arg2=mem + {name: "SUBLmodify", argLength: 3, reg: gpstore, asm: "SUBL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) -= arg1, arg2=mem + {name: "ANDLmodify", argLength: 3, reg: gpstore, asm: "ANDL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) &= arg1, arg2=mem + {name: "ORLmodify", argLength: 3, reg: gpstore, asm: "ORL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) |= arg1, arg2=mem + {name: "XORLmodify", argLength: 3, reg: gpstore, asm: "XORL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) ^= arg1, arg2=mem + + // direct binary-op on indexed memory (read-modify-write) + {name: "ADDLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ADDL", aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+arg1*4+auxint+aux) += arg2, arg3=mem + {name: "SUBLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "SUBL", aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+arg1*4+auxint+aux) -= arg2, arg3=mem + {name: "ANDLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ANDL", aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+arg1*4+auxint+aux) &= arg2, arg3=mem + {name: "ORLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ORL", aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+arg1*4+auxint+aux) |= arg2, arg3=mem + {name: "XORLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "XORL", aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+arg1*4+auxint+aux) ^= arg2, arg3=mem + + // direct binary-op on memory with a constant (read-modify-write) + {name: "ADDLconstmodify", argLength: 2, reg: gpstoreconst, asm: "ADDL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // add ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem + {name: "ANDLconstmodify", argLength: 2, reg: gpstoreconst, asm: "ANDL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // and ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem + {name: "ORLconstmodify", argLength: 2, reg: gpstoreconst, asm: "ORL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // or ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem + {name: "XORLconstmodify", argLength: 2, reg: gpstoreconst, asm: "XORL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // xor ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem + + // direct binary-op on indexed memory with a constant (read-modify-write) + {name: "ADDLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ADDL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // add ValAndOff(AuxInt).Val() to arg0+arg1*4+ValAndOff(AuxInt).Off()+aux, arg2=mem + {name: "ANDLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ANDL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // and ValAndOff(AuxInt).Val() to arg0+arg1*4+ValAndOff(AuxInt).Off()+aux, arg2=mem + {name: "ORLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ORL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // or ValAndOff(AuxInt).Val() to arg0+arg1*4+ValAndOff(AuxInt).Off()+aux, arg2=mem + {name: "XORLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "XORL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // xor ValAndOff(AuxInt).Val() to arg0+arg1*4+ValAndOff(AuxInt).Off()+aux, arg2=mem + + // indexed loads/stores + {name: "MOVBloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVBLZX", aux: "SymOff", symEffect: "Read"}, // load a byte from arg0+arg1+auxint+aux. arg2=mem + {name: "MOVWloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVWLZX", aux: "SymOff", symEffect: "Read"}, // load 2 bytes from arg0+arg1+auxint+aux. arg2=mem + {name: "MOVWloadidx2", argLength: 3, reg: gploadidx, asm: "MOVWLZX", aux: "SymOff", symEffect: "Read"}, // load 2 bytes from arg0+2*arg1+auxint+aux. arg2=mem + {name: "MOVLloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVL", aux: "SymOff", symEffect: "Read"}, // load 4 bytes from arg0+arg1+auxint+aux. arg2=mem + {name: "MOVLloadidx4", argLength: 3, reg: gploadidx, asm: "MOVL", aux: "SymOff", symEffect: "Read"}, // load 4 bytes from arg0+4*arg1+auxint+aux. arg2=mem + // TODO: sign-extending indexed loads + {name: "MOVBstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVB", aux: "SymOff", symEffect: "Write"}, // store byte in arg2 to arg0+arg1+auxint+aux. arg3=mem + {name: "MOVWstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVW", aux: "SymOff", symEffect: "Write"}, // store 2 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem + {name: "MOVWstoreidx2", argLength: 4, reg: gpstoreidx, asm: "MOVW", aux: "SymOff", symEffect: "Write"}, // store 2 bytes in arg2 to arg0+2*arg1+auxint+aux. arg3=mem + {name: "MOVLstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVL", aux: "SymOff", symEffect: "Write"}, // store 4 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem + {name: "MOVLstoreidx4", argLength: 4, reg: gpstoreidx, asm: "MOVL", aux: "SymOff", symEffect: "Write"}, // store 4 bytes in arg2 to arg0+4*arg1+auxint+aux. arg3=mem + // TODO: add size-mismatched indexed loads, like MOVBstoreidx4. + + // For storeconst ops, the AuxInt field encodes both + // the value to store and an address offset of the store. + // Cast AuxInt to a ValAndOff to extract Val and Off fields. + {name: "MOVBstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVB", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low byte of ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux. arg1=mem + {name: "MOVWstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVW", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low 2 bytes of ... + {name: "MOVLstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVL", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low 4 bytes of ... + + {name: "MOVBstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVB", aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low byte of ValAndOff(AuxInt).Val() to arg0+1*arg1+ValAndOff(AuxInt).Off()+aux. arg2=mem + {name: "MOVWstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVW", aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low 2 bytes of ... arg1 ... + {name: "MOVWstoreconstidx2", argLength: 3, reg: gpstoreconstidx, asm: "MOVW", aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low 2 bytes of ... 2*arg1 ... + {name: "MOVLstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low 4 bytes of ... arg1 ... + {name: "MOVLstoreconstidx4", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low 4 bytes of ... 4*arg1 ... + + // arg0 = pointer to start of memory to zero + // arg1 = value to store (will always be zero) + // arg2 = mem + // auxint = offset into duffzero code to start executing + // returns mem + { + name: "DUFFZERO", + aux: "Int64", + argLength: 3, + reg: regInfo{ + inputs: []regMask{buildReg("DI"), buildReg("AX")}, + clobbers: buildReg("DI CX"), + // Note: CX is only clobbered when dynamic linking. + }, + faultOnNilArg0: true, + }, + + // arg0 = address of memory to zero + // arg1 = # of 4-byte words to zero + // arg2 = value to store (will always be zero) + // arg3 = mem + // returns mem + { + name: "REPSTOSL", + argLength: 4, + reg: regInfo{ + inputs: []regMask{buildReg("DI"), buildReg("CX"), buildReg("AX")}, + clobbers: buildReg("DI CX"), + }, + faultOnNilArg0: true, + }, + + {name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem + {name: "CALLtail", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem + {name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gpsp, buildReg("DX"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem + {name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem + + // arg0 = destination pointer + // arg1 = source pointer + // arg2 = mem + // auxint = offset from duffcopy symbol to call + // returns memory + { + name: "DUFFCOPY", + aux: "Int64", + argLength: 3, + reg: regInfo{ + inputs: []regMask{buildReg("DI"), buildReg("SI")}, + clobbers: buildReg("DI SI CX"), // uses CX as a temporary + }, + clobberFlags: true, + faultOnNilArg0: true, + faultOnNilArg1: true, + }, + + // arg0 = destination pointer + // arg1 = source pointer + // arg2 = # of 8-byte words to copy + // arg3 = mem + // returns memory + { + name: "REPMOVSL", + argLength: 4, + reg: regInfo{ + inputs: []regMask{buildReg("DI"), buildReg("SI"), buildReg("CX")}, + clobbers: buildReg("DI SI CX"), + }, + faultOnNilArg0: true, + faultOnNilArg1: true, + }, + + // (InvertFlags (CMPL a b)) == (CMPL b a) + // So if we want (SETL (CMPL a b)) but we can't do that because a is a constant, + // then we do (SETL (InvertFlags (CMPL b a))) instead. + // Rewrites will convert this to (SETG (CMPL b a)). + // InvertFlags is a pseudo-op which can't appear in assembly output. + {name: "InvertFlags", argLength: 1}, // reverse direction of arg0 + + // Pseudo-ops + {name: "LoweredGetG", argLength: 1, reg: gp01}, // arg0=mem + // Scheduler ensures LoweredGetClosurePtr occurs only in entry block, + // and sorts it to the very beginning of the block to prevent other + // use of DX (the closure pointer) + {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("DX")}}, zeroWidth: true}, + // LoweredGetCallerPC evaluates to the PC to which its "caller" will return. + // I.e., if f calls g "calls" getcallerpc, + // the result should be the PC within f that g will return to. + // See runtime/stubs.go for a more detailed discussion. + {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true}, + // LoweredGetCallerSP returns the SP of the caller of the current function. + {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true}, + //arg0=ptr,arg1=mem, returns void. Faults if ptr is nil. + {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpsp}}, clobberFlags: true, nilCheck: true, faultOnNilArg0: true}, + + // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier + // It saves all GP registers if necessary, but may clobber others. + {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("DI"), ax}, clobbers: callerSave &^ gp}, clobberFlags: true, aux: "Sym", symEffect: "None"}, + + // There are three of these functions so that they can have three different register inputs. + // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the + // default registers to match so we don't need to copy registers around unnecessarily. + {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{dx, bx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go). + {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{cx, dx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go). + {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{ax, cx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go). + // Extend ops are the same as Bounds ops except the indexes are 64-bit. + {name: "LoweredPanicExtendA", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{si, dx, bx}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go). + {name: "LoweredPanicExtendB", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{si, cx, dx}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go). + {name: "LoweredPanicExtendC", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{si, ax, cx}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go). + + // Constant flag values. For any comparison, there are 5 possible + // outcomes: the three from the signed total order (<,==,>) and the + // three from the unsigned total order. The == cases overlap. + // Note: there's a sixth "unordered" outcome for floating-point + // comparisons, but we don't use such a beast yet. + // These ops are for temporary use by rewrite rules. They + // cannot appear in the generated assembly. + {name: "FlagEQ"}, // equal + {name: "FlagLT_ULT"}, // signed < and unsigned < + {name: "FlagLT_UGT"}, // signed < and unsigned > + {name: "FlagGT_UGT"}, // signed > and unsigned < + {name: "FlagGT_ULT"}, // signed > and unsigned > + + // Special ops for PIC floating-point constants. + // MOVSXconst1 loads the address of the constant-pool entry into a register. + // MOVSXconst2 loads the constant from that address. + // MOVSXconst1 returns a pointer, but we type it as uint32 because it can never point to the Go heap. + {name: "MOVSSconst1", reg: gp01, typ: "UInt32", aux: "Float32"}, + {name: "MOVSDconst1", reg: gp01, typ: "UInt32", aux: "Float64"}, + {name: "MOVSSconst2", argLength: 1, reg: gpfp, asm: "MOVSS"}, + {name: "MOVSDconst2", argLength: 1, reg: gpfp, asm: "MOVSD"}, + } + + var _386blocks = []blockData{ + {name: "EQ", controls: 1}, + {name: "NE", controls: 1}, + {name: "LT", controls: 1}, + {name: "LE", controls: 1}, + {name: "GT", controls: 1}, + {name: "GE", controls: 1}, + {name: "OS", controls: 1}, + {name: "OC", controls: 1}, + {name: "ULT", controls: 1}, + {name: "ULE", controls: 1}, + {name: "UGT", controls: 1}, + {name: "UGE", controls: 1}, + {name: "EQF", controls: 1}, + {name: "NEF", controls: 1}, + {name: "ORD", controls: 1}, // FP, ordered comparison (parity zero) + {name: "NAN", controls: 1}, // FP, unordered comparison (parity one) + } + + archs = append(archs, arch{ + name: "386", + pkg: "cmd/internal/obj/x86", + genfile: "../../x86/ssa.go", + ops: _386ops, + blocks: _386blocks, + regnames: regNames386, + gpregmask: gp, + fpregmask: fp, + framepointerreg: int8(num["BP"]), + linkreg: -1, // not used + }) +} diff --git a/src/cmd/compile/internal/ssa/_gen/386splitload.rules b/src/cmd/compile/internal/ssa/_gen/386splitload.rules new file mode 100644 index 0000000..29d4f8c --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/386splitload.rules @@ -0,0 +1,11 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// See the top of AMD64splitload.rules for discussion of these rules. + +(CMP(L|W|B)load {sym} [off] ptr x mem) => (CMP(L|W|B) (MOV(L|W|B)load {sym} [off] ptr mem) x) + +(CMPLconstload {sym} [vo] ptr mem) => (CMPLconst (MOVLload {sym} [vo.Off()] ptr mem) [vo.Val()]) +(CMPWconstload {sym} [vo] ptr mem) => (CMPWconst (MOVWload {sym} [vo.Off()] ptr mem) [vo.Val16()]) +(CMPBconstload {sym} [vo] ptr mem) => (CMPBconst (MOVBload {sym} [vo.Off()] ptr mem) [vo.Val8()]) diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules new file mode 100644 index 0000000..da5ef7e --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules @@ -0,0 +1,2216 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Lowering arithmetic +(Add(64|32|16|8) ...) => (ADD(Q|L|L|L) ...) +(AddPtr ...) => (ADDQ ...) +(Add(32|64)F ...) => (ADDS(S|D) ...) + +(Sub(64|32|16|8) ...) => (SUB(Q|L|L|L) ...) +(SubPtr ...) => (SUBQ ...) +(Sub(32|64)F ...) => (SUBS(S|D) ...) + +(Mul(64|32|16|8) ...) => (MUL(Q|L|L|L) ...) +(Mul(32|64)F ...) => (MULS(S|D) ...) + +(Select0 (Mul64uover x y)) => (Select0 <typ.UInt64> (MULQU x y)) +(Select0 (Mul32uover x y)) => (Select0 <typ.UInt32> (MULLU x y)) +(Select1 (Mul(64|32)uover x y)) => (SETO (Select1 <types.TypeFlags> (MUL(Q|L)U x y))) + +(Hmul(64|32) ...) => (HMUL(Q|L) ...) +(Hmul(64|32)u ...) => (HMUL(Q|L)U ...) + +(Div(64|32|16) [a] x y) => (Select0 (DIV(Q|L|W) [a] x y)) +(Div8 x y) => (Select0 (DIVW (SignExt8to16 x) (SignExt8to16 y))) +(Div(64|32|16)u x y) => (Select0 (DIV(Q|L|W)U x y)) +(Div8u x y) => (Select0 (DIVWU (ZeroExt8to16 x) (ZeroExt8to16 y))) +(Div(32|64)F ...) => (DIVS(S|D) ...) + +(Select0 (Add64carry x y c)) => + (Select0 <typ.UInt64> (ADCQ x y (Select1 <types.TypeFlags> (NEGLflags c)))) +(Select1 (Add64carry x y c)) => + (NEGQ <typ.UInt64> (SBBQcarrymask <typ.UInt64> (Select1 <types.TypeFlags> (ADCQ x y (Select1 <types.TypeFlags> (NEGLflags c)))))) +(Select0 (Sub64borrow x y c)) => + (Select0 <typ.UInt64> (SBBQ x y (Select1 <types.TypeFlags> (NEGLflags c)))) +(Select1 (Sub64borrow x y c)) => + (NEGQ <typ.UInt64> (SBBQcarrymask <typ.UInt64> (Select1 <types.TypeFlags> (SBBQ x y (Select1 <types.TypeFlags> (NEGLflags c)))))) + +// Optimize ADCQ and friends +(ADCQ x (MOVQconst [c]) carry) && is32Bit(c) => (ADCQconst x [int32(c)] carry) +(ADCQ x y (FlagEQ)) => (ADDQcarry x y) +(ADCQconst x [c] (FlagEQ)) => (ADDQconstcarry x [c]) +(ADDQcarry x (MOVQconst [c])) && is32Bit(c) => (ADDQconstcarry x [int32(c)]) +(SBBQ x (MOVQconst [c]) borrow) && is32Bit(c) => (SBBQconst x [int32(c)] borrow) +(SBBQ x y (FlagEQ)) => (SUBQborrow x y) +(SBBQconst x [c] (FlagEQ)) => (SUBQconstborrow x [c]) +(SUBQborrow x (MOVQconst [c])) && is32Bit(c) => (SUBQconstborrow x [int32(c)]) +(Select1 (NEGLflags (MOVQconst [0]))) => (FlagEQ) +(Select1 (NEGLflags (NEGQ (SBBQcarrymask x)))) => x + + +(Mul64uhilo ...) => (MULQU2 ...) +(Div128u ...) => (DIVQU2 ...) + +(Avg64u ...) => (AVGQU ...) + +(Mod(64|32|16) [a] x y) => (Select1 (DIV(Q|L|W) [a] x y)) +(Mod8 x y) => (Select1 (DIVW (SignExt8to16 x) (SignExt8to16 y))) +(Mod(64|32|16)u x y) => (Select1 (DIV(Q|L|W)U x y)) +(Mod8u x y) => (Select1 (DIVWU (ZeroExt8to16 x) (ZeroExt8to16 y))) + +(And(64|32|16|8) ...) => (AND(Q|L|L|L) ...) +(Or(64|32|16|8) ...) => (OR(Q|L|L|L) ...) +(Xor(64|32|16|8) ...) => (XOR(Q|L|L|L) ...) +(Com(64|32|16|8) ...) => (NOT(Q|L|L|L) ...) + +(Neg(64|32|16|8) ...) => (NEG(Q|L|L|L) ...) +(Neg32F x) => (PXOR x (MOVSSconst <typ.Float32> [float32(math.Copysign(0, -1))])) +(Neg64F x) => (PXOR x (MOVSDconst <typ.Float64> [math.Copysign(0, -1)])) + +// Lowering boolean ops +(AndB ...) => (ANDL ...) +(OrB ...) => (ORL ...) +(Not x) => (XORLconst [1] x) + +// Lowering pointer arithmetic +(OffPtr [off] ptr) && is32Bit(off) => (ADDQconst [int32(off)] ptr) +(OffPtr [off] ptr) => (ADDQ (MOVQconst [off]) ptr) + +// Lowering other arithmetic +(Ctz64 x) && buildcfg.GOAMD64 >= 3 => (TZCNTQ x) +(Ctz32 x) && buildcfg.GOAMD64 >= 3 => (TZCNTL x) +(Ctz64 <t> x) && buildcfg.GOAMD64 < 3 => (CMOVQEQ (Select0 <t> (BSFQ x)) (MOVQconst <t> [64]) (Select1 <types.TypeFlags> (BSFQ x))) +(Ctz32 x) && buildcfg.GOAMD64 < 3 => (Select0 (BSFQ (BTSQconst <typ.UInt64> [32] x))) +(Ctz16 x) => (BSFL (BTSLconst <typ.UInt32> [16] x)) +(Ctz8 x) => (BSFL (BTSLconst <typ.UInt32> [ 8] x)) + +(Ctz64NonZero x) && buildcfg.GOAMD64 >= 3 => (TZCNTQ x) +(Ctz32NonZero x) && buildcfg.GOAMD64 >= 3 => (TZCNTL x) +(Ctz16NonZero x) && buildcfg.GOAMD64 >= 3 => (TZCNTL x) +(Ctz8NonZero x) && buildcfg.GOAMD64 >= 3 => (TZCNTL x) +(Ctz64NonZero x) && buildcfg.GOAMD64 < 3 => (Select0 (BSFQ x)) +(Ctz32NonZero x) && buildcfg.GOAMD64 < 3 => (BSFL x) +(Ctz16NonZero x) && buildcfg.GOAMD64 < 3 => (BSFL x) +(Ctz8NonZero x) && buildcfg.GOAMD64 < 3 => (BSFL x) + +// BitLen64 of a 64 bit value x requires checking whether x == 0, since BSRQ is undefined when x == 0. +// However, for zero-extended values, we can cheat a bit, and calculate +// BSR(x<<1 + 1), which is guaranteed to be non-zero, and which conveniently +// places the index of the highest set bit where we want it. +// For GOAMD64>=3, BitLen can be calculated by OperandSize - LZCNT(x). +(BitLen64 <t> x) && buildcfg.GOAMD64 < 3 => (ADDQconst [1] (CMOVQEQ <t> (Select0 <t> (BSRQ x)) (MOVQconst <t> [-1]) (Select1 <types.TypeFlags> (BSRQ x)))) +(BitLen32 x) && buildcfg.GOAMD64 < 3 => (Select0 (BSRQ (LEAQ1 <typ.UInt64> [1] (MOVLQZX <typ.UInt64> x) (MOVLQZX <typ.UInt64> x)))) +(BitLen16 x) && buildcfg.GOAMD64 < 3 => (BSRL (LEAL1 <typ.UInt32> [1] (MOVWQZX <typ.UInt32> x) (MOVWQZX <typ.UInt32> x))) +(BitLen8 x) && buildcfg.GOAMD64 < 3 => (BSRL (LEAL1 <typ.UInt32> [1] (MOVBQZX <typ.UInt32> x) (MOVBQZX <typ.UInt32> x))) +(BitLen64 <t> x) && buildcfg.GOAMD64 >= 3 => (NEGQ (ADDQconst <t> [-64] (LZCNTQ x))) +// Use 64-bit version to allow const-fold remove unnecessary arithmetic. +(BitLen32 <t> x) && buildcfg.GOAMD64 >= 3 => (NEGQ (ADDQconst <t> [-32] (LZCNTL x))) +(BitLen16 <t> x) && buildcfg.GOAMD64 >= 3 => (NEGQ (ADDQconst <t> [-32] (LZCNTL (MOVWQZX <x.Type> x)))) +(BitLen8 <t> x) && buildcfg.GOAMD64 >= 3 => (NEGQ (ADDQconst <t> [-32] (LZCNTL (MOVBQZX <x.Type> x)))) + +(Bswap(64|32) ...) => (BSWAP(Q|L) ...) + +(PopCount(64|32) ...) => (POPCNT(Q|L) ...) +(PopCount16 x) => (POPCNTL (MOVWQZX <typ.UInt32> x)) +(PopCount8 x) => (POPCNTL (MOVBQZX <typ.UInt32> x)) + +(Sqrt ...) => (SQRTSD ...) +(Sqrt32 ...) => (SQRTSS ...) + +(RoundToEven x) => (ROUNDSD [0] x) +(Floor x) => (ROUNDSD [1] x) +(Ceil x) => (ROUNDSD [2] x) +(Trunc x) => (ROUNDSD [3] x) + +(FMA x y z) => (VFMADD231SD z x y) + +// Lowering extension +// Note: we always extend to 64 bits even though some ops don't need that many result bits. +(SignExt8to16 ...) => (MOVBQSX ...) +(SignExt8to32 ...) => (MOVBQSX ...) +(SignExt8to64 ...) => (MOVBQSX ...) +(SignExt16to32 ...) => (MOVWQSX ...) +(SignExt16to64 ...) => (MOVWQSX ...) +(SignExt32to64 ...) => (MOVLQSX ...) + +(ZeroExt8to16 ...) => (MOVBQZX ...) +(ZeroExt8to32 ...) => (MOVBQZX ...) +(ZeroExt8to64 ...) => (MOVBQZX ...) +(ZeroExt16to32 ...) => (MOVWQZX ...) +(ZeroExt16to64 ...) => (MOVWQZX ...) +(ZeroExt32to64 ...) => (MOVLQZX ...) + +(Slicemask <t> x) => (SARQconst (NEGQ <t> x) [63]) + +(SpectreIndex <t> x y) => (CMOVQCC x (MOVQconst [0]) (CMPQ x y)) +(SpectreSliceIndex <t> x y) => (CMOVQHI x (MOVQconst [0]) (CMPQ x y)) + +// Lowering truncation +// Because we ignore high parts of registers, truncates are just copies. +(Trunc16to8 ...) => (Copy ...) +(Trunc32to8 ...) => (Copy ...) +(Trunc32to16 ...) => (Copy ...) +(Trunc64to8 ...) => (Copy ...) +(Trunc64to16 ...) => (Copy ...) +(Trunc64to32 ...) => (Copy ...) + +// Lowering float <-> int +(Cvt32to32F ...) => (CVTSL2SS ...) +(Cvt32to64F ...) => (CVTSL2SD ...) +(Cvt64to32F ...) => (CVTSQ2SS ...) +(Cvt64to64F ...) => (CVTSQ2SD ...) + +(Cvt32Fto32 ...) => (CVTTSS2SL ...) +(Cvt32Fto64 ...) => (CVTTSS2SQ ...) +(Cvt64Fto32 ...) => (CVTTSD2SL ...) +(Cvt64Fto64 ...) => (CVTTSD2SQ ...) + +(Cvt32Fto64F ...) => (CVTSS2SD ...) +(Cvt64Fto32F ...) => (CVTSD2SS ...) + +(Round(32|64)F ...) => (Copy ...) + +(CvtBoolToUint8 ...) => (Copy ...) + +// Lowering shifts +// Unsigned shifts need to return 0 if shift amount is >= width of shifted value. +// result = (arg << shift) & (shift >= argbits ? 0 : 0xffffffffffffffff) +(Lsh64x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDQ (SHLQ <t> x y) (SBBQcarrymask <t> (CMP(Q|L|W|B)const y [64]))) +(Lsh32x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32]))) +(Lsh16x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32]))) +(Lsh8x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32]))) + +(Lsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SHLQ x y) +(Lsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SHLL x y) +(Lsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SHLL x y) +(Lsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SHLL x y) + +(Rsh64Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDQ (SHRQ <t> x y) (SBBQcarrymask <t> (CMP(Q|L|W|B)const y [64]))) +(Rsh32Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32]))) +(Rsh16Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRW <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [16]))) +(Rsh8Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRB <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [8]))) + +(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SHRQ x y) +(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SHRL x y) +(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SHRW x y) +(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SHRB x y) + +// Signed right shift needs to return 0/-1 if shift amount is >= width of shifted value. +// We implement this by setting the shift value to -1 (all ones) if the shift value is >= width. +(Rsh64x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (SARQ <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [64]))))) +(Rsh32x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (SARL <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [32]))))) +(Rsh16x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (SARW <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [16]))))) +(Rsh8x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (SARB <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [8]))))) + +(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SARQ x y) +(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SARL x y) +(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SARW x y) +(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SARB x y) + +// Lowering integer comparisons +(Less(64|32|16|8) x y) => (SETL (CMP(Q|L|W|B) x y)) +(Less(64|32|16|8)U x y) => (SETB (CMP(Q|L|W|B) x y)) +(Leq(64|32|16|8) x y) => (SETLE (CMP(Q|L|W|B) x y)) +(Leq(64|32|16|8)U x y) => (SETBE (CMP(Q|L|W|B) x y)) +(Eq(Ptr|64|32|16|8|B) x y) => (SETEQ (CMP(Q|Q|L|W|B|B) x y)) +(Neq(Ptr|64|32|16|8|B) x y) => (SETNE (CMP(Q|Q|L|W|B|B) x y)) + +// Lowering floating point comparisons +// Note Go assembler gets UCOMISx operand order wrong, but it is right here +// and the operands are reversed when generating assembly language. +(Eq(32|64)F x y) => (SETEQF (UCOMIS(S|D) x y)) +(Neq(32|64)F x y) => (SETNEF (UCOMIS(S|D) x y)) +// Use SETGF/SETGEF with reversed operands to dodge NaN case. +(Less(32|64)F x y) => (SETGF (UCOMIS(S|D) y x)) +(Leq(32|64)F x y) => (SETGEF (UCOMIS(S|D) y x)) + +// Lowering loads +(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVQload ptr mem) +(Load <t> ptr mem) && is32BitInt(t) => (MOVLload ptr mem) +(Load <t> ptr mem) && is16BitInt(t) => (MOVWload ptr mem) +(Load <t> ptr mem) && (t.IsBoolean() || is8BitInt(t)) => (MOVBload ptr mem) +(Load <t> ptr mem) && is32BitFloat(t) => (MOVSSload ptr mem) +(Load <t> ptr mem) && is64BitFloat(t) => (MOVSDload ptr mem) + +// Lowering stores +// These more-specific FP versions of Store pattern should come first. +(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (MOVSDstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (MOVSSstore ptr val mem) + +(Store {t} ptr val mem) && t.Size() == 8 => (MOVQstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 4 => (MOVLstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 2 => (MOVWstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem) + +// Lowering moves +(Move [0] _ _ mem) => mem +(Move [1] dst src mem) => (MOVBstore dst (MOVBload src mem) mem) +(Move [2] dst src mem) => (MOVWstore dst (MOVWload src mem) mem) +(Move [4] dst src mem) => (MOVLstore dst (MOVLload src mem) mem) +(Move [8] dst src mem) => (MOVQstore dst (MOVQload src mem) mem) +(Move [16] dst src mem) && config.useSSE => (MOVOstore dst (MOVOload src mem) mem) +(Move [16] dst src mem) && !config.useSSE => + (MOVQstore [8] dst (MOVQload [8] src mem) + (MOVQstore dst (MOVQload src mem) mem)) + +(Move [32] dst src mem) => + (Move [16] + (OffPtr <dst.Type> dst [16]) + (OffPtr <src.Type> src [16]) + (Move [16] dst src mem)) + +(Move [48] dst src mem) && config.useSSE => + (Move [32] + (OffPtr <dst.Type> dst [16]) + (OffPtr <src.Type> src [16]) + (Move [16] dst src mem)) + +(Move [64] dst src mem) && config.useSSE => + (Move [32] + (OffPtr <dst.Type> dst [32]) + (OffPtr <src.Type> src [32]) + (Move [32] dst src mem)) + +(Move [3] dst src mem) => + (MOVBstore [2] dst (MOVBload [2] src mem) + (MOVWstore dst (MOVWload src mem) mem)) +(Move [5] dst src mem) => + (MOVBstore [4] dst (MOVBload [4] src mem) + (MOVLstore dst (MOVLload src mem) mem)) +(Move [6] dst src mem) => + (MOVWstore [4] dst (MOVWload [4] src mem) + (MOVLstore dst (MOVLload src mem) mem)) +(Move [7] dst src mem) => + (MOVLstore [3] dst (MOVLload [3] src mem) + (MOVLstore dst (MOVLload src mem) mem)) +(Move [9] dst src mem) => + (MOVBstore [8] dst (MOVBload [8] src mem) + (MOVQstore dst (MOVQload src mem) mem)) +(Move [10] dst src mem) => + (MOVWstore [8] dst (MOVWload [8] src mem) + (MOVQstore dst (MOVQload src mem) mem)) +(Move [12] dst src mem) => + (MOVLstore [8] dst (MOVLload [8] src mem) + (MOVQstore dst (MOVQload src mem) mem)) +(Move [s] dst src mem) && s == 11 || s >= 13 && s <= 15 => + (MOVQstore [int32(s-8)] dst (MOVQload [int32(s-8)] src mem) + (MOVQstore dst (MOVQload src mem) mem)) + +// Adjust moves to be a multiple of 16 bytes. +(Move [s] dst src mem) + && s > 16 && s%16 != 0 && s%16 <= 8 => + (Move [s-s%16] + (OffPtr <dst.Type> dst [s%16]) + (OffPtr <src.Type> src [s%16]) + (MOVQstore dst (MOVQload src mem) mem)) +(Move [s] dst src mem) + && s > 16 && s%16 != 0 && s%16 > 8 && config.useSSE => + (Move [s-s%16] + (OffPtr <dst.Type> dst [s%16]) + (OffPtr <src.Type> src [s%16]) + (MOVOstore dst (MOVOload src mem) mem)) +(Move [s] dst src mem) + && s > 16 && s%16 != 0 && s%16 > 8 && !config.useSSE => + (Move [s-s%16] + (OffPtr <dst.Type> dst [s%16]) + (OffPtr <src.Type> src [s%16]) + (MOVQstore [8] dst (MOVQload [8] src mem) + (MOVQstore dst (MOVQload src mem) mem))) + +// Medium copying uses a duff device. +(Move [s] dst src mem) + && s > 64 && s <= 16*64 && s%16 == 0 + && !config.noDuffDevice && logLargeCopy(v, s) => + (DUFFCOPY [s] dst src mem) + +// Large copying uses REP MOVSQ. +(Move [s] dst src mem) && (s > 16*64 || config.noDuffDevice) && s%8 == 0 && logLargeCopy(v, s) => + (REPMOVSQ dst src (MOVQconst [s/8]) mem) + +// Lowering Zero instructions +(Zero [0] _ mem) => mem +(Zero [1] destptr mem) => (MOVBstoreconst [makeValAndOff(0,0)] destptr mem) +(Zero [2] destptr mem) => (MOVWstoreconst [makeValAndOff(0,0)] destptr mem) +(Zero [4] destptr mem) => (MOVLstoreconst [makeValAndOff(0,0)] destptr mem) +(Zero [8] destptr mem) => (MOVQstoreconst [makeValAndOff(0,0)] destptr mem) + +(Zero [3] destptr mem) => + (MOVBstoreconst [makeValAndOff(0,2)] destptr + (MOVWstoreconst [makeValAndOff(0,0)] destptr mem)) +(Zero [5] destptr mem) => + (MOVBstoreconst [makeValAndOff(0,4)] destptr + (MOVLstoreconst [makeValAndOff(0,0)] destptr mem)) +(Zero [6] destptr mem) => + (MOVWstoreconst [makeValAndOff(0,4)] destptr + (MOVLstoreconst [makeValAndOff(0,0)] destptr mem)) +(Zero [7] destptr mem) => + (MOVLstoreconst [makeValAndOff(0,3)] destptr + (MOVLstoreconst [makeValAndOff(0,0)] destptr mem)) + +// Strip off any fractional word zeroing. +(Zero [s] destptr mem) && s%8 != 0 && s > 8 && !config.useSSE => + (Zero [s-s%8] (OffPtr <destptr.Type> destptr [s%8]) + (MOVQstoreconst [makeValAndOff(0,0)] destptr mem)) + +// Zero small numbers of words directly. +(Zero [16] destptr mem) && !config.useSSE => + (MOVQstoreconst [makeValAndOff(0,8)] destptr + (MOVQstoreconst [makeValAndOff(0,0)] destptr mem)) +(Zero [24] destptr mem) && !config.useSSE => + (MOVQstoreconst [makeValAndOff(0,16)] destptr + (MOVQstoreconst [makeValAndOff(0,8)] destptr + (MOVQstoreconst [makeValAndOff(0,0)] destptr mem))) +(Zero [32] destptr mem) && !config.useSSE => + (MOVQstoreconst [makeValAndOff(0,24)] destptr + (MOVQstoreconst [makeValAndOff(0,16)] destptr + (MOVQstoreconst [makeValAndOff(0,8)] destptr + (MOVQstoreconst [makeValAndOff(0,0)] destptr mem)))) + +(Zero [s] destptr mem) && s > 8 && s < 16 && config.useSSE => + (MOVQstoreconst [makeValAndOff(0,int32(s-8))] destptr + (MOVQstoreconst [makeValAndOff(0,0)] destptr mem)) + +// Adjust zeros to be a multiple of 16 bytes. +(Zero [s] destptr mem) && s%16 != 0 && s > 16 && s%16 > 8 && config.useSSE => + (Zero [s-s%16] (OffPtr <destptr.Type> destptr [s%16]) + (MOVOstoreconst [makeValAndOff(0,0)] destptr mem)) + +(Zero [s] destptr mem) && s%16 != 0 && s > 16 && s%16 <= 8 && config.useSSE => + (Zero [s-s%16] (OffPtr <destptr.Type> destptr [s%16]) + (MOVOstoreconst [makeValAndOff(0,0)] destptr mem)) + +(Zero [16] destptr mem) && config.useSSE => + (MOVOstoreconst [makeValAndOff(0,0)] destptr mem) +(Zero [32] destptr mem) && config.useSSE => + (MOVOstoreconst [makeValAndOff(0,16)] destptr + (MOVOstoreconst [makeValAndOff(0,0)] destptr mem)) +(Zero [48] destptr mem) && config.useSSE => + (MOVOstoreconst [makeValAndOff(0,32)] destptr + (MOVOstoreconst [makeValAndOff(0,16)] destptr + (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))) +(Zero [64] destptr mem) && config.useSSE => + (MOVOstoreconst [makeValAndOff(0,48)] destptr + (MOVOstoreconst [makeValAndOff(0,32)] destptr + (MOVOstoreconst [makeValAndOff(0,16)] destptr + (MOVOstoreconst [makeValAndOff(0,0)] destptr mem)))) + +// Medium zeroing uses a duff device. +(Zero [s] destptr mem) + && s > 64 && s <= 1024 && s%16 == 0 && !config.noDuffDevice => + (DUFFZERO [s] destptr mem) + +// Large zeroing uses REP STOSQ. +(Zero [s] destptr mem) + && (s > 1024 || (config.noDuffDevice && s > 64 || !config.useSSE && s > 32)) + && s%8 == 0 => + (REPSTOSQ destptr (MOVQconst [s/8]) (MOVQconst [0]) mem) + +// Lowering constants +(Const8 [c]) => (MOVLconst [int32(c)]) +(Const16 [c]) => (MOVLconst [int32(c)]) +(Const32 ...) => (MOVLconst ...) +(Const64 ...) => (MOVQconst ...) +(Const32F ...) => (MOVSSconst ...) +(Const64F ...) => (MOVSDconst ...) +(ConstNil ) => (MOVQconst [0]) +(ConstBool [c]) => (MOVLconst [b2i32(c)]) + +// Lowering calls +(StaticCall ...) => (CALLstatic ...) +(ClosureCall ...) => (CALLclosure ...) +(InterCall ...) => (CALLinter ...) +(TailCall ...) => (CALLtail ...) + +// Lowering conditional moves +// If the condition is a SETxx, we can just run a CMOV from the comparison that was +// setting the flags. +// Legend: HI=unsigned ABOVE, CS=unsigned BELOW, CC=unsigned ABOVE EQUAL, LS=unsigned BELOW EQUAL +(CondSelect <t> x y (SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) cond)) && (is64BitInt(t) || isPtr(t)) + => (CMOVQ(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) y x cond) +(CondSelect <t> x y (SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) cond)) && is32BitInt(t) + => (CMOVL(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) y x cond) +(CondSelect <t> x y (SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) cond)) && is16BitInt(t) + => (CMOVW(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) y x cond) + +// If the condition does not set the flags, we need to generate a comparison. +(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 1 + => (CondSelect <t> x y (MOVBQZX <typ.UInt64> check)) +(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 2 + => (CondSelect <t> x y (MOVWQZX <typ.UInt64> check)) +(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 4 + => (CondSelect <t> x y (MOVLQZX <typ.UInt64> check)) + +(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 8 && (is64BitInt(t) || isPtr(t)) + => (CMOVQNE y x (CMPQconst [0] check)) +(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 8 && is32BitInt(t) + => (CMOVLNE y x (CMPQconst [0] check)) +(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 8 && is16BitInt(t) + => (CMOVWNE y x (CMPQconst [0] check)) + +// Absorb InvertFlags +(CMOVQ(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS) x y (InvertFlags cond)) + => (CMOVQ(EQ|NE|GT|LT|GE|LE|CS|HI|LS|CC) x y cond) +(CMOVL(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS) x y (InvertFlags cond)) + => (CMOVL(EQ|NE|GT|LT|GE|LE|CS|HI|LS|CC) x y cond) +(CMOVW(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS) x y (InvertFlags cond)) + => (CMOVW(EQ|NE|GT|LT|GE|LE|CS|HI|LS|CC) x y cond) + +// Absorb constants generated during lower +(CMOV(QEQ|QLE|QGE|QCC|QLS|LEQ|LLE|LGE|LCC|LLS|WEQ|WLE|WGE|WCC|WLS) _ x (FlagEQ)) => x +(CMOV(QNE|QLT|QGT|QCS|QHI|LNE|LLT|LGT|LCS|LHI|WNE|WLT|WGT|WCS|WHI) y _ (FlagEQ)) => y +(CMOV(QNE|QGT|QGE|QHI|QCC|LNE|LGT|LGE|LHI|LCC|WNE|WGT|WGE|WHI|WCC) _ x (FlagGT_UGT)) => x +(CMOV(QEQ|QLE|QLT|QLS|QCS|LEQ|LLE|LLT|LLS|LCS|WEQ|WLE|WLT|WLS|WCS) y _ (FlagGT_UGT)) => y +(CMOV(QNE|QGT|QGE|QLS|QCS|LNE|LGT|LGE|LLS|LCS|WNE|WGT|WGE|WLS|WCS) _ x (FlagGT_ULT)) => x +(CMOV(QEQ|QLE|QLT|QHI|QCC|LEQ|LLE|LLT|LHI|LCC|WEQ|WLE|WLT|WHI|WCC) y _ (FlagGT_ULT)) => y +(CMOV(QNE|QLT|QLE|QCS|QLS|LNE|LLT|LLE|LCS|LLS|WNE|WLT|WLE|WCS|WLS) _ x (FlagLT_ULT)) => x +(CMOV(QEQ|QGT|QGE|QHI|QCC|LEQ|LGT|LGE|LHI|LCC|WEQ|WGT|WGE|WHI|WCC) y _ (FlagLT_ULT)) => y +(CMOV(QNE|QLT|QLE|QHI|QCC|LNE|LLT|LLE|LHI|LCC|WNE|WLT|WLE|WHI|WCC) _ x (FlagLT_UGT)) => x +(CMOV(QEQ|QGT|QGE|QCS|QLS|LEQ|LGT|LGE|LCS|LLS|WEQ|WGT|WGE|WCS|WLS) y _ (FlagLT_UGT)) => y + +// Miscellaneous +(IsNonNil p) => (SETNE (TESTQ p p)) +(IsInBounds idx len) => (SETB (CMPQ idx len)) +(IsSliceInBounds idx len) => (SETBE (CMPQ idx len)) +(NilCheck ...) => (LoweredNilCheck ...) +(GetG mem) && v.Block.Func.OwnAux.Fn.ABI() != obj.ABIInternal => (LoweredGetG mem) // only lower in old ABI. in new ABI we have a G register. +(GetClosurePtr ...) => (LoweredGetClosurePtr ...) +(GetCallerPC ...) => (LoweredGetCallerPC ...) +(GetCallerSP ...) => (LoweredGetCallerSP ...) + +(HasCPUFeature {s}) => (SETNE (CMPLconst [0] (LoweredHasCPUFeature {s}))) +(Addr {sym} base) => (LEAQ {sym} base) +(LocalAddr {sym} base _) => (LEAQ {sym} base) + +(MOVBstore [off] {sym} ptr y:(SETL x) mem) && y.Uses == 1 => (SETLstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr y:(SETLE x) mem) && y.Uses == 1 => (SETLEstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr y:(SETG x) mem) && y.Uses == 1 => (SETGstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr y:(SETGE x) mem) && y.Uses == 1 => (SETGEstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr y:(SETEQ x) mem) && y.Uses == 1 => (SETEQstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr y:(SETNE x) mem) && y.Uses == 1 => (SETNEstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr y:(SETB x) mem) && y.Uses == 1 => (SETBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr y:(SETBE x) mem) && y.Uses == 1 => (SETBEstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr y:(SETA x) mem) && y.Uses == 1 => (SETAstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr y:(SETAE x) mem) && y.Uses == 1 => (SETAEstore [off] {sym} ptr x mem) + +// block rewrites +(If (SETL cmp) yes no) => (LT cmp yes no) +(If (SETLE cmp) yes no) => (LE cmp yes no) +(If (SETG cmp) yes no) => (GT cmp yes no) +(If (SETGE cmp) yes no) => (GE cmp yes no) +(If (SETEQ cmp) yes no) => (EQ cmp yes no) +(If (SETNE cmp) yes no) => (NE cmp yes no) +(If (SETB cmp) yes no) => (ULT cmp yes no) +(If (SETBE cmp) yes no) => (ULE cmp yes no) +(If (SETA cmp) yes no) => (UGT cmp yes no) +(If (SETAE cmp) yes no) => (UGE cmp yes no) +(If (SETO cmp) yes no) => (OS cmp yes no) + +// Special case for floating point - LF/LEF not generated +(If (SETGF cmp) yes no) => (UGT cmp yes no) +(If (SETGEF cmp) yes no) => (UGE cmp yes no) +(If (SETEQF cmp) yes no) => (EQF cmp yes no) +(If (SETNEF cmp) yes no) => (NEF cmp yes no) + +(If cond yes no) => (NE (TESTB cond cond) yes no) + +(JumpTable idx) => (JUMPTABLE {makeJumpTableSym(b)} idx (LEAQ <typ.Uintptr> {makeJumpTableSym(b)} (SB))) + +// Atomic loads. Other than preserving their ordering with respect to other loads, nothing special here. +(AtomicLoad8 ptr mem) => (MOVBatomicload ptr mem) +(AtomicLoad32 ptr mem) => (MOVLatomicload ptr mem) +(AtomicLoad64 ptr mem) => (MOVQatomicload ptr mem) +(AtomicLoadPtr ptr mem) => (MOVQatomicload ptr mem) + +// Atomic stores. We use XCHG to prevent the hardware reordering a subsequent load. +// TODO: most runtime uses of atomic stores don't need that property. Use normal stores for those? +(AtomicStore8 ptr val mem) => (Select1 (XCHGB <types.NewTuple(typ.UInt8,types.TypeMem)> val ptr mem)) +(AtomicStore32 ptr val mem) => (Select1 (XCHGL <types.NewTuple(typ.UInt32,types.TypeMem)> val ptr mem)) +(AtomicStore64 ptr val mem) => (Select1 (XCHGQ <types.NewTuple(typ.UInt64,types.TypeMem)> val ptr mem)) +(AtomicStorePtrNoWB ptr val mem) => (Select1 (XCHGQ <types.NewTuple(typ.BytePtr,types.TypeMem)> val ptr mem)) + +// Atomic exchanges. +(AtomicExchange32 ptr val mem) => (XCHGL val ptr mem) +(AtomicExchange64 ptr val mem) => (XCHGQ val ptr mem) + +// Atomic adds. +(AtomicAdd32 ptr val mem) => (AddTupleFirst32 val (XADDLlock val ptr mem)) +(AtomicAdd64 ptr val mem) => (AddTupleFirst64 val (XADDQlock val ptr mem)) +(Select0 <t> (AddTupleFirst32 val tuple)) => (ADDL val (Select0 <t> tuple)) +(Select1 (AddTupleFirst32 _ tuple)) => (Select1 tuple) +(Select0 <t> (AddTupleFirst64 val tuple)) => (ADDQ val (Select0 <t> tuple)) +(Select1 (AddTupleFirst64 _ tuple)) => (Select1 tuple) + +// Atomic compare and swap. +(AtomicCompareAndSwap32 ptr old new_ mem) => (CMPXCHGLlock ptr old new_ mem) +(AtomicCompareAndSwap64 ptr old new_ mem) => (CMPXCHGQlock ptr old new_ mem) + +// Atomic memory updates. +(AtomicAnd8 ptr val mem) => (ANDBlock ptr val mem) +(AtomicAnd32 ptr val mem) => (ANDLlock ptr val mem) +(AtomicOr8 ptr val mem) => (ORBlock ptr val mem) +(AtomicOr32 ptr val mem) => (ORLlock ptr val mem) + +// Write barrier. +(WB ...) => (LoweredWB ...) + +(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem) +(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem) +(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem) + +// lowering rotates +(RotateLeft8 ...) => (ROLB ...) +(RotateLeft16 ...) => (ROLW ...) +(RotateLeft32 ...) => (ROLL ...) +(RotateLeft64 ...) => (ROLQ ...) + +// *************************** +// Above: lowering rules +// Below: optimizations +// *************************** +// TODO: Should the optimizations be a separate pass? + +// Fold boolean tests into blocks +(NE (TESTB (SETL cmp) (SETL cmp)) yes no) => (LT cmp yes no) +(NE (TESTB (SETLE cmp) (SETLE cmp)) yes no) => (LE cmp yes no) +(NE (TESTB (SETG cmp) (SETG cmp)) yes no) => (GT cmp yes no) +(NE (TESTB (SETGE cmp) (SETGE cmp)) yes no) => (GE cmp yes no) +(NE (TESTB (SETEQ cmp) (SETEQ cmp)) yes no) => (EQ cmp yes no) +(NE (TESTB (SETNE cmp) (SETNE cmp)) yes no) => (NE cmp yes no) +(NE (TESTB (SETB cmp) (SETB cmp)) yes no) => (ULT cmp yes no) +(NE (TESTB (SETBE cmp) (SETBE cmp)) yes no) => (ULE cmp yes no) +(NE (TESTB (SETA cmp) (SETA cmp)) yes no) => (UGT cmp yes no) +(NE (TESTB (SETAE cmp) (SETAE cmp)) yes no) => (UGE cmp yes no) +(NE (TESTB (SETO cmp) (SETO cmp)) yes no) => (OS cmp yes no) + +// Unsigned comparisons to 0/1 +(ULT (TEST(Q|L|W|B) x x) yes no) => (First no yes) +(UGE (TEST(Q|L|W|B) x x) yes no) => (First yes no) +(SETB (TEST(Q|L|W|B) x x)) => (ConstBool [false]) +(SETAE (TEST(Q|L|W|B) x x)) => (ConstBool [true]) + +// x & 1 != 0 -> x & 1 +(SETNE (TEST(B|W)const [1] x)) => (AND(L|L)const [1] x) +(SETB (BT(L|Q)const [0] x)) => (AND(L|Q)const [1] x) + +// Recognize bit tests: a&(1<<b) != 0 for b suitably bounded +// Note that BTx instructions use the carry bit, so we need to convert tests for zero flag +// into tests for carry flags. +// ULT and SETB check the carry flag; they are identical to CS and SETCS. Same, mutatis +// mutandis, for UGE and SETAE, and CC and SETCC. +((NE|EQ) (TESTL (SHLL (MOVLconst [1]) x) y)) => ((ULT|UGE) (BTL x y)) +((NE|EQ) (TESTQ (SHLQ (MOVQconst [1]) x) y)) => ((ULT|UGE) (BTQ x y)) +((NE|EQ) (TESTLconst [c] x)) && isUint32PowerOfTwo(int64(c)) + => ((ULT|UGE) (BTLconst [int8(log32(c))] x)) +((NE|EQ) (TESTQconst [c] x)) && isUint64PowerOfTwo(int64(c)) + => ((ULT|UGE) (BTQconst [int8(log32(c))] x)) +((NE|EQ) (TESTQ (MOVQconst [c]) x)) && isUint64PowerOfTwo(c) + => ((ULT|UGE) (BTQconst [int8(log64(c))] x)) +(SET(NE|EQ) (TESTL (SHLL (MOVLconst [1]) x) y)) => (SET(B|AE) (BTL x y)) +(SET(NE|EQ) (TESTQ (SHLQ (MOVQconst [1]) x) y)) => (SET(B|AE) (BTQ x y)) +(SET(NE|EQ) (TESTLconst [c] x)) && isUint32PowerOfTwo(int64(c)) + => (SET(B|AE) (BTLconst [int8(log32(c))] x)) +(SET(NE|EQ) (TESTQconst [c] x)) && isUint64PowerOfTwo(int64(c)) + => (SET(B|AE) (BTQconst [int8(log32(c))] x)) +(SET(NE|EQ) (TESTQ (MOVQconst [c]) x)) && isUint64PowerOfTwo(c) + => (SET(B|AE) (BTQconst [int8(log64(c))] x)) +// SET..store variant +(SET(NE|EQ)store [off] {sym} ptr (TESTL (SHLL (MOVLconst [1]) x) y) mem) + => (SET(B|AE)store [off] {sym} ptr (BTL x y) mem) +(SET(NE|EQ)store [off] {sym} ptr (TESTQ (SHLQ (MOVQconst [1]) x) y) mem) + => (SET(B|AE)store [off] {sym} ptr (BTQ x y) mem) +(SET(NE|EQ)store [off] {sym} ptr (TESTLconst [c] x) mem) && isUint32PowerOfTwo(int64(c)) + => (SET(B|AE)store [off] {sym} ptr (BTLconst [int8(log32(c))] x) mem) +(SET(NE|EQ)store [off] {sym} ptr (TESTQconst [c] x) mem) && isUint64PowerOfTwo(int64(c)) + => (SET(B|AE)store [off] {sym} ptr (BTQconst [int8(log32(c))] x) mem) +(SET(NE|EQ)store [off] {sym} ptr (TESTQ (MOVQconst [c]) x) mem) && isUint64PowerOfTwo(c) + => (SET(B|AE)store [off] {sym} ptr (BTQconst [int8(log64(c))] x) mem) + +// Handle bit-testing in the form (a>>b)&1 != 0 by building the above rules +// and further combining shifts. +(BT(Q|L)const [c] (SHRQconst [d] x)) && (c+d)<64 => (BTQconst [c+d] x) +(BT(Q|L)const [c] (SHLQconst [d] x)) && c>d => (BT(Q|L)const [c-d] x) +(BT(Q|L)const [0] s:(SHRQ x y)) => (BTQ y x) +(BTLconst [c] (SHRLconst [d] x)) && (c+d)<32 => (BTLconst [c+d] x) +(BTLconst [c] (SHLLconst [d] x)) && c>d => (BTLconst [c-d] x) +(BTLconst [0] s:(SHR(L|XL) x y)) => (BTL y x) + +// Rewrite a & 1 != 1 into a & 1 == 0. +// Among other things, this lets us turn (a>>b)&1 != 1 into a bit test. +(SET(NE|EQ) (CMPLconst [1] s:(ANDLconst [1] _))) => (SET(EQ|NE) (CMPLconst [0] s)) +(SET(NE|EQ)store [off] {sym} ptr (CMPLconst [1] s:(ANDLconst [1] _)) mem) => (SET(EQ|NE)store [off] {sym} ptr (CMPLconst [0] s) mem) +(SET(NE|EQ) (CMPQconst [1] s:(ANDQconst [1] _))) => (SET(EQ|NE) (CMPQconst [0] s)) +(SET(NE|EQ)store [off] {sym} ptr (CMPQconst [1] s:(ANDQconst [1] _)) mem) => (SET(EQ|NE)store [off] {sym} ptr (CMPQconst [0] s) mem) + +// Recognize bit setting (a |= 1<<b) and toggling (a ^= 1<<b) +(OR(Q|L) (SHL(Q|L) (MOV(Q|L)const [1]) y) x) => (BTS(Q|L) x y) +(XOR(Q|L) (SHL(Q|L) (MOV(Q|L)const [1]) y) x) => (BTC(Q|L) x y) + +// Convert ORconst into BTS, if the code gets smaller, with boundary being +// (ORL $40,AX is 3 bytes, ORL $80,AX is 6 bytes). +((ORQ|XORQ)const [c] x) && isUint64PowerOfTwo(int64(c)) && uint64(c) >= 128 + => (BT(S|C)Qconst [int8(log32(c))] x) +((ORL|XORL)const [c] x) && isUint32PowerOfTwo(int64(c)) && uint64(c) >= 128 + => (BT(S|C)Lconst [int8(log32(c))] x) +((ORQ|XORQ) (MOVQconst [c]) x) && isUint64PowerOfTwo(c) && uint64(c) >= 128 + => (BT(S|C)Qconst [int8(log64(c))] x) +((ORL|XORL) (MOVLconst [c]) x) && isUint32PowerOfTwo(int64(c)) && uint64(c) >= 128 + => (BT(S|C)Lconst [int8(log32(c))] x) + +// Recognize bit clearing: a &^= 1<<b +(AND(Q|L) (NOT(Q|L) (SHL(Q|L) (MOV(Q|L)const [1]) y)) x) => (BTR(Q|L) x y) +(ANDN(Q|L) x (SHL(Q|L) (MOV(Q|L)const [1]) y)) => (BTR(Q|L) x y) +(ANDQconst [c] x) && isUint64PowerOfTwo(int64(^c)) && uint64(^c) >= 128 + => (BTRQconst [int8(log32(^c))] x) +(ANDLconst [c] x) && isUint32PowerOfTwo(int64(^c)) && uint64(^c) >= 128 + => (BTRLconst [int8(log32(^c))] x) +(ANDQ (MOVQconst [c]) x) && isUint64PowerOfTwo(^c) && uint64(^c) >= 128 + => (BTRQconst [int8(log64(^c))] x) +(ANDL (MOVLconst [c]) x) && isUint32PowerOfTwo(int64(^c)) && uint64(^c) >= 128 + => (BTRLconst [int8(log32(^c))] x) + +// Special-case bit patterns on first/last bit. +// generic.rules changes ANDs of high-part/low-part masks into a couple of shifts, +// for instance: +// x & 0xFFFF0000 -> (x >> 16) << 16 +// x & 0x80000000 -> (x >> 31) << 31 +// +// In case the mask is just one bit (like second example above), it conflicts +// with the above rules to detect bit-testing / bit-clearing of first/last bit. +// We thus special-case them, by detecting the shift patterns. + +// Special case resetting first/last bit +(SHL(L|Q)const [1] (SHR(L|Q)const [1] x)) + => (BTR(L|Q)const [0] x) +(SHRLconst [1] (SHLLconst [1] x)) + => (BTRLconst [31] x) +(SHRQconst [1] (SHLQconst [1] x)) + => (BTRQconst [63] x) + +// Special case testing first/last bit (with double-shift generated by generic.rules) +((SETNE|SETEQ|NE|EQ) (TESTQ z1:(SHLQconst [63] (SHRQconst [63] x)) z2)) && z1==z2 + => ((SETB|SETAE|ULT|UGE) (BTQconst [63] x)) +((SETNE|SETEQ|NE|EQ) (TESTL z1:(SHLLconst [31] (SHRQconst [31] x)) z2)) && z1==z2 + => ((SETB|SETAE|ULT|UGE) (BTQconst [31] x)) +(SET(NE|EQ)store [off] {sym} ptr (TESTQ z1:(SHLQconst [63] (SHRQconst [63] x)) z2) mem) && z1==z2 + => (SET(B|AE)store [off] {sym} ptr (BTQconst [63] x) mem) +(SET(NE|EQ)store [off] {sym} ptr (TESTL z1:(SHLLconst [31] (SHRLconst [31] x)) z2) mem) && z1==z2 + => (SET(B|AE)store [off] {sym} ptr (BTLconst [31] x) mem) + +((SETNE|SETEQ|NE|EQ) (TESTQ z1:(SHRQconst [63] (SHLQconst [63] x)) z2)) && z1==z2 + => ((SETB|SETAE|ULT|UGE) (BTQconst [0] x)) +((SETNE|SETEQ|NE|EQ) (TESTL z1:(SHRLconst [31] (SHLLconst [31] x)) z2)) && z1==z2 + => ((SETB|SETAE|ULT|UGE) (BTLconst [0] x)) +(SET(NE|EQ)store [off] {sym} ptr (TESTQ z1:(SHRQconst [63] (SHLQconst [63] x)) z2) mem) && z1==z2 + => (SET(B|AE)store [off] {sym} ptr (BTQconst [0] x) mem) +(SET(NE|EQ)store [off] {sym} ptr (TESTL z1:(SHRLconst [31] (SHLLconst [31] x)) z2) mem) && z1==z2 + => (SET(B|AE)store [off] {sym} ptr (BTLconst [0] x) mem) + +// Special-case manually testing last bit with "a>>63 != 0" (without "&1") +((SETNE|SETEQ|NE|EQ) (TESTQ z1:(SHRQconst [63] x) z2)) && z1==z2 + => ((SETB|SETAE|ULT|UGE) (BTQconst [63] x)) +((SETNE|SETEQ|NE|EQ) (TESTL z1:(SHRLconst [31] x) z2)) && z1==z2 + => ((SETB|SETAE|ULT|UGE) (BTLconst [31] x)) +(SET(NE|EQ)store [off] {sym} ptr (TESTQ z1:(SHRQconst [63] x) z2) mem) && z1==z2 + => (SET(B|AE)store [off] {sym} ptr (BTQconst [63] x) mem) +(SET(NE|EQ)store [off] {sym} ptr (TESTL z1:(SHRLconst [31] x) z2) mem) && z1==z2 + => (SET(B|AE)store [off] {sym} ptr (BTLconst [31] x) mem) + +// Fold combinations of bit ops on same bit. An example is math.Copysign(c,-1) +(BTS(Q|L)const [c] (BTR(Q|L)const [c] x)) => (BTS(Q|L)const [c] x) +(BTS(Q|L)const [c] (BTC(Q|L)const [c] x)) => (BTS(Q|L)const [c] x) +(BTR(Q|L)const [c] (BTS(Q|L)const [c] x)) => (BTR(Q|L)const [c] x) +(BTR(Q|L)const [c] (BTC(Q|L)const [c] x)) => (BTR(Q|L)const [c] x) + +// Fold boolean negation into SETcc. +(XORLconst [1] (SETNE x)) => (SETEQ x) +(XORLconst [1] (SETEQ x)) => (SETNE x) +(XORLconst [1] (SETL x)) => (SETGE x) +(XORLconst [1] (SETGE x)) => (SETL x) +(XORLconst [1] (SETLE x)) => (SETG x) +(XORLconst [1] (SETG x)) => (SETLE x) +(XORLconst [1] (SETB x)) => (SETAE x) +(XORLconst [1] (SETAE x)) => (SETB x) +(XORLconst [1] (SETBE x)) => (SETA x) +(XORLconst [1] (SETA x)) => (SETBE x) + +// Special case for floating point - LF/LEF not generated +(NE (TESTB (SETGF cmp) (SETGF cmp)) yes no) => (UGT cmp yes no) +(NE (TESTB (SETGEF cmp) (SETGEF cmp)) yes no) => (UGE cmp yes no) +(NE (TESTB (SETEQF cmp) (SETEQF cmp)) yes no) => (EQF cmp yes no) +(NE (TESTB (SETNEF cmp) (SETNEF cmp)) yes no) => (NEF cmp yes no) + +// Disabled because it interferes with the pattern match above and makes worse code. +// (SETNEF x) => (ORQ (SETNE <typ.Int8> x) (SETNAN <typ.Int8> x)) +// (SETEQF x) => (ANDQ (SETEQ <typ.Int8> x) (SETORD <typ.Int8> x)) + +// fold constants into instructions +(ADDQ x (MOVQconst [c])) && is32Bit(c) => (ADDQconst [int32(c)] x) +(ADDQ x (MOVLconst [c])) => (ADDQconst [c] x) +(ADDL x (MOVLconst [c])) => (ADDLconst [c] x) + +(SUBQ x (MOVQconst [c])) && is32Bit(c) => (SUBQconst x [int32(c)]) +(SUBQ (MOVQconst [c]) x) && is32Bit(c) => (NEGQ (SUBQconst <v.Type> x [int32(c)])) +(SUBL x (MOVLconst [c])) => (SUBLconst x [c]) +(SUBL (MOVLconst [c]) x) => (NEGL (SUBLconst <v.Type> x [c])) + +(MULQ x (MOVQconst [c])) && is32Bit(c) => (MULQconst [int32(c)] x) +(MULL x (MOVLconst [c])) => (MULLconst [c] x) + +(ANDQ x (MOVQconst [c])) && is32Bit(c) => (ANDQconst [int32(c)] x) +(ANDL x (MOVLconst [c])) => (ANDLconst [c] x) + +(AND(L|Q)const [c] (AND(L|Q)const [d] x)) => (AND(L|Q)const [c & d] x) +(XOR(L|Q)const [c] (XOR(L|Q)const [d] x)) => (XOR(L|Q)const [c ^ d] x) +(OR(L|Q)const [c] (OR(L|Q)const [d] x)) => (OR(L|Q)const [c | d] x) + +(BTRLconst [c] (ANDLconst [d] x)) => (ANDLconst [d &^ (1<<uint32(c))] x) +(ANDLconst [c] (BTRLconst [d] x)) => (ANDLconst [c &^ (1<<uint32(d))] x) +(BTRLconst [c] (BTRLconst [d] x)) => (ANDLconst [^(1<<uint32(c) | 1<<uint32(d))] x) + +(BTCLconst [c] (XORLconst [d] x)) => (XORLconst [d ^ 1<<uint32(c)] x) +(XORLconst [c] (BTCLconst [d] x)) => (XORLconst [c ^ 1<<uint32(d)] x) +(BTCLconst [c] (BTCLconst [d] x)) => (XORLconst [1<<uint32(c) | 1<<uint32(d)] x) + +(BTSLconst [c] (ORLconst [d] x)) => (ORLconst [d | 1<<uint32(c)] x) +(ORLconst [c] (BTSLconst [d] x)) => (ORLconst [c | 1<<uint32(d)] x) +(BTSLconst [c] (BTSLconst [d] x)) => (ORLconst [1<<uint32(c) | 1<<uint32(d)] x) + +(BTRQconst [c] (ANDQconst [d] x)) && is32Bit(int64(d) &^ (1<<uint32(c))) => (ANDQconst [d &^ (1<<uint32(c))] x) +(ANDQconst [c] (BTRQconst [d] x)) && is32Bit(int64(c) &^ (1<<uint32(d))) => (ANDQconst [c &^ (1<<uint32(d))] x) +(BTRQconst [c] (BTRQconst [d] x)) && is32Bit(^(1<<uint32(c) | 1<<uint32(d))) => (ANDQconst [^(1<<uint32(c) | 1<<uint32(d))] x) + +(BTCQconst [c] (XORQconst [d] x)) && is32Bit(int64(d) ^ 1<<uint32(c)) => (XORQconst [d ^ 1<<uint32(c)] x) +(XORQconst [c] (BTCQconst [d] x)) && is32Bit(int64(c) ^ 1<<uint32(d)) => (XORQconst [c ^ 1<<uint32(d)] x) +(BTCQconst [c] (BTCQconst [d] x)) && is32Bit(1<<uint32(c) ^ 1<<uint32(d)) => (XORQconst [1<<uint32(c) ^ 1<<uint32(d)] x) + +(BTSQconst [c] (ORQconst [d] x)) && is32Bit(int64(d) | 1<<uint32(c)) => (ORQconst [d | 1<<uint32(c)] x) +(ORQconst [c] (BTSQconst [d] x)) && is32Bit(int64(c) | 1<<uint32(d)) => (ORQconst [c | 1<<uint32(d)] x) +(BTSQconst [c] (BTSQconst [d] x)) && is32Bit(1<<uint32(c) | 1<<uint32(d)) => (ORQconst [1<<uint32(c) | 1<<uint32(d)] x) + + +(MULLconst [c] (MULLconst [d] x)) => (MULLconst [c * d] x) +(MULQconst [c] (MULQconst [d] x)) && is32Bit(int64(c)*int64(d)) => (MULQconst [c * d] x) + +(ORQ x (MOVQconst [c])) && is32Bit(c) => (ORQconst [int32(c)] x) +(ORQ x (MOVLconst [c])) => (ORQconst [c] x) +(ORL x (MOVLconst [c])) => (ORLconst [c] x) + +(XORQ x (MOVQconst [c])) && is32Bit(c) => (XORQconst [int32(c)] x) +(XORL x (MOVLconst [c])) => (XORLconst [c] x) + +(SHLQ x (MOV(Q|L)const [c])) => (SHLQconst [int8(c&63)] x) +(SHLL x (MOV(Q|L)const [c])) => (SHLLconst [int8(c&31)] x) + +(SHRQ x (MOV(Q|L)const [c])) => (SHRQconst [int8(c&63)] x) +(SHRL x (MOV(Q|L)const [c])) => (SHRLconst [int8(c&31)] x) +(SHRW x (MOV(Q|L)const [c])) && c&31 < 16 => (SHRWconst [int8(c&31)] x) +(SHRW _ (MOV(Q|L)const [c])) && c&31 >= 16 => (MOVLconst [0]) +(SHRB x (MOV(Q|L)const [c])) && c&31 < 8 => (SHRBconst [int8(c&31)] x) +(SHRB _ (MOV(Q|L)const [c])) && c&31 >= 8 => (MOVLconst [0]) + +(SARQ x (MOV(Q|L)const [c])) => (SARQconst [int8(c&63)] x) +(SARL x (MOV(Q|L)const [c])) => (SARLconst [int8(c&31)] x) +(SARW x (MOV(Q|L)const [c])) => (SARWconst [int8(min(int64(c)&31,15))] x) +(SARB x (MOV(Q|L)const [c])) => (SARBconst [int8(min(int64(c)&31,7))] x) + +// Operations which don't affect the low 6/5 bits of the shift amount are NOPs. +((SHLQ|SHRQ|SARQ) x (ADDQconst [c] y)) && c & 63 == 0 => ((SHLQ|SHRQ|SARQ) x y) +((SHLQ|SHRQ|SARQ) x (NEGQ <t> (ADDQconst [c] y))) && c & 63 == 0 => ((SHLQ|SHRQ|SARQ) x (NEGQ <t> y)) +((SHLQ|SHRQ|SARQ) x (ANDQconst [c] y)) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ) x y) +((SHLQ|SHRQ|SARQ) x (NEGQ <t> (ANDQconst [c] y))) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ) x (NEGQ <t> y)) + +((SHLL|SHRL|SARL) x (ADDQconst [c] y)) && c & 31 == 0 => ((SHLL|SHRL|SARL) x y) +((SHLL|SHRL|SARL) x (NEGQ <t> (ADDQconst [c] y))) && c & 31 == 0 => ((SHLL|SHRL|SARL) x (NEGQ <t> y)) +((SHLL|SHRL|SARL) x (ANDQconst [c] y)) && c & 31 == 31 => ((SHLL|SHRL|SARL) x y) +((SHLL|SHRL|SARL) x (NEGQ <t> (ANDQconst [c] y))) && c & 31 == 31 => ((SHLL|SHRL|SARL) x (NEGQ <t> y)) + +((SHLQ|SHRQ|SARQ) x (ADDLconst [c] y)) && c & 63 == 0 => ((SHLQ|SHRQ|SARQ) x y) +((SHLQ|SHRQ|SARQ) x (NEGL <t> (ADDLconst [c] y))) && c & 63 == 0 => ((SHLQ|SHRQ|SARQ) x (NEGL <t> y)) +((SHLQ|SHRQ|SARQ) x (ANDLconst [c] y)) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ) x y) +((SHLQ|SHRQ|SARQ) x (NEGL <t> (ANDLconst [c] y))) && c & 63 == 63 => ((SHLQ|SHRQ|SARQ) x (NEGL <t> y)) + +((SHLL|SHRL|SARL) x (ADDLconst [c] y)) && c & 31 == 0 => ((SHLL|SHRL|SARL) x y) +((SHLL|SHRL|SARL) x (NEGL <t> (ADDLconst [c] y))) && c & 31 == 0 => ((SHLL|SHRL|SARL) x (NEGL <t> y)) +((SHLL|SHRL|SARL) x (ANDLconst [c] y)) && c & 31 == 31 => ((SHLL|SHRL|SARL) x y) +((SHLL|SHRL|SARL) x (NEGL <t> (ANDLconst [c] y))) && c & 31 == 31 => ((SHLL|SHRL|SARL) x (NEGL <t> y)) + +// rotate left negative = rotate right +(ROLQ x (NEG(Q|L) y)) => (RORQ x y) +(ROLL x (NEG(Q|L) y)) => (RORL x y) +(ROLW x (NEG(Q|L) y)) => (RORW x y) +(ROLB x (NEG(Q|L) y)) => (RORB x y) + +// rotate right negative = rotate left +(RORQ x (NEG(Q|L) y)) => (ROLQ x y) +(RORL x (NEG(Q|L) y)) => (ROLL x y) +(RORW x (NEG(Q|L) y)) => (ROLW x y) +(RORB x (NEG(Q|L) y)) => (ROLB x y) + +// rotate by constants +(ROLQ x (MOV(Q|L)const [c])) => (ROLQconst [int8(c&63)] x) +(ROLL x (MOV(Q|L)const [c])) => (ROLLconst [int8(c&31)] x) +(ROLW x (MOV(Q|L)const [c])) => (ROLWconst [int8(c&15)] x) +(ROLB x (MOV(Q|L)const [c])) => (ROLBconst [int8(c&7) ] x) + +(RORQ x (MOV(Q|L)const [c])) => (ROLQconst [int8((-c)&63)] x) +(RORL x (MOV(Q|L)const [c])) => (ROLLconst [int8((-c)&31)] x) +(RORW x (MOV(Q|L)const [c])) => (ROLWconst [int8((-c)&15)] x) +(RORB x (MOV(Q|L)const [c])) => (ROLBconst [int8((-c)&7) ] x) + +// Constant shift simplifications +((SHLQ|SHRQ|SARQ)const x [0]) => x +((SHLL|SHRL|SARL)const x [0]) => x +((SHRW|SARW)const x [0]) => x +((SHRB|SARB)const x [0]) => x +((ROLQ|ROLL|ROLW|ROLB)const x [0]) => x + +// Multi-register shifts +(ORQ (SH(R|L)Q lo bits) (SH(L|R)Q hi (NEGQ bits))) => (SH(R|L)DQ lo hi bits) +(ORQ (SH(R|L)XQ lo bits) (SH(L|R)XQ hi (NEGQ bits))) => (SH(R|L)DQ lo hi bits) + +// Note: the word and byte shifts keep the low 5 bits (not the low 4 or 3 bits) +// because the x86 instructions are defined to use all 5 bits of the shift even +// for the small shifts. I don't think we'll ever generate a weird shift (e.g. +// (SHRW x (MOVLconst [24])), but just in case. + +(CMPQ x (MOVQconst [c])) && is32Bit(c) => (CMPQconst x [int32(c)]) +(CMPQ (MOVQconst [c]) x) && is32Bit(c) => (InvertFlags (CMPQconst x [int32(c)])) +(CMPL x (MOVLconst [c])) => (CMPLconst x [c]) +(CMPL (MOVLconst [c]) x) => (InvertFlags (CMPLconst x [c])) +(CMPW x (MOVLconst [c])) => (CMPWconst x [int16(c)]) +(CMPW (MOVLconst [c]) x) => (InvertFlags (CMPWconst x [int16(c)])) +(CMPB x (MOVLconst [c])) => (CMPBconst x [int8(c)]) +(CMPB (MOVLconst [c]) x) => (InvertFlags (CMPBconst x [int8(c)])) + +// Canonicalize the order of arguments to comparisons - helps with CSE. +(CMP(Q|L|W|B) x y) && canonLessThan(x,y) => (InvertFlags (CMP(Q|L|W|B) y x)) + +// Using MOVZX instead of AND is cheaper. +(AND(Q|L)const [ 0xFF] x) => (MOVBQZX x) +(AND(Q|L)const [0xFFFF] x) => (MOVWQZX x) +// This rule is currently invalid because 0xFFFFFFFF is not representable by a signed int32. +// Commenting out for now, because it also can't trigger because of the is32bit guard on the +// ANDQconst lowering-rule, above, prevents 0xFFFFFFFF from matching (for the same reason) +// Using an alternate form of this rule segfaults some binaries because of +// adverse interactions with other passes. +// (ANDQconst [0xFFFFFFFF] x) => (MOVLQZX x) + +// strength reduction +// Assumes that the following costs from https://gmplib.org/~tege/x86-timing.pdf: +// 1 - addq, shlq, leaq, negq, subq +// 3 - imulq +// This limits the rewrites to two instructions. +// Note that negq always operates in-place, +// which can require a register-register move +// to preserve the original value, +// so it must be used with care. +(MUL(Q|L)const [-9] x) => (NEG(Q|L) (LEA(Q|L)8 <v.Type> x x)) +(MUL(Q|L)const [-5] x) => (NEG(Q|L) (LEA(Q|L)4 <v.Type> x x)) +(MUL(Q|L)const [-3] x) => (NEG(Q|L) (LEA(Q|L)2 <v.Type> x x)) +(MUL(Q|L)const [-1] x) => (NEG(Q|L) x) +(MUL(Q|L)const [ 0] _) => (MOV(Q|L)const [0]) +(MUL(Q|L)const [ 1] x) => x +(MUL(Q|L)const [ 3] x) => (LEA(Q|L)2 x x) +(MUL(Q|L)const [ 5] x) => (LEA(Q|L)4 x x) +(MUL(Q|L)const [ 7] x) => (LEA(Q|L)2 x (LEA(Q|L)2 <v.Type> x x)) +(MUL(Q|L)const [ 9] x) => (LEA(Q|L)8 x x) +(MUL(Q|L)const [11] x) => (LEA(Q|L)2 x (LEA(Q|L)4 <v.Type> x x)) +(MUL(Q|L)const [13] x) => (LEA(Q|L)4 x (LEA(Q|L)2 <v.Type> x x)) +(MUL(Q|L)const [19] x) => (LEA(Q|L)2 x (LEA(Q|L)8 <v.Type> x x)) +(MUL(Q|L)const [21] x) => (LEA(Q|L)4 x (LEA(Q|L)4 <v.Type> x x)) +(MUL(Q|L)const [25] x) => (LEA(Q|L)8 x (LEA(Q|L)2 <v.Type> x x)) +(MUL(Q|L)const [27] x) => (LEA(Q|L)8 (LEA(Q|L)2 <v.Type> x x) (LEA(Q|L)2 <v.Type> x x)) +(MUL(Q|L)const [37] x) => (LEA(Q|L)4 x (LEA(Q|L)8 <v.Type> x x)) +(MUL(Q|L)const [41] x) => (LEA(Q|L)8 x (LEA(Q|L)4 <v.Type> x x)) +(MUL(Q|L)const [45] x) => (LEA(Q|L)8 (LEA(Q|L)4 <v.Type> x x) (LEA(Q|L)4 <v.Type> x x)) +(MUL(Q|L)const [73] x) => (LEA(Q|L)8 x (LEA(Q|L)8 <v.Type> x x)) +(MUL(Q|L)const [81] x) => (LEA(Q|L)8 (LEA(Q|L)8 <v.Type> x x) (LEA(Q|L)8 <v.Type> x x)) + +(MUL(Q|L)const [c] x) && isPowerOfTwo64(int64(c)+1) && c >= 15 => (SUB(Q|L) (SHL(Q|L)const <v.Type> [int8(log64(int64(c)+1))] x) x) +(MUL(Q|L)const [c] x) && isPowerOfTwo32(c-1) && c >= 17 => (LEA(Q|L)1 (SHL(Q|L)const <v.Type> [int8(log32(c-1))] x) x) +(MUL(Q|L)const [c] x) && isPowerOfTwo32(c-2) && c >= 34 => (LEA(Q|L)2 (SHL(Q|L)const <v.Type> [int8(log32(c-2))] x) x) +(MUL(Q|L)const [c] x) && isPowerOfTwo32(c-4) && c >= 68 => (LEA(Q|L)4 (SHL(Q|L)const <v.Type> [int8(log32(c-4))] x) x) +(MUL(Q|L)const [c] x) && isPowerOfTwo32(c-8) && c >= 136 => (LEA(Q|L)8 (SHL(Q|L)const <v.Type> [int8(log32(c-8))] x) x) +(MUL(Q|L)const [c] x) && c%3 == 0 && isPowerOfTwo32(c/3) => (SHL(Q|L)const [int8(log32(c/3))] (LEA(Q|L)2 <v.Type> x x)) +(MUL(Q|L)const [c] x) && c%5 == 0 && isPowerOfTwo32(c/5) => (SHL(Q|L)const [int8(log32(c/5))] (LEA(Q|L)4 <v.Type> x x)) +(MUL(Q|L)const [c] x) && c%9 == 0 && isPowerOfTwo32(c/9) => (SHL(Q|L)const [int8(log32(c/9))] (LEA(Q|L)8 <v.Type> x x)) + +// combine add/shift into LEAQ/LEAL +(ADD(L|Q) x (SHL(L|Q)const [3] y)) => (LEA(L|Q)8 x y) +(ADD(L|Q) x (SHL(L|Q)const [2] y)) => (LEA(L|Q)4 x y) +(ADD(L|Q) x (SHL(L|Q)const [1] y)) => (LEA(L|Q)2 x y) +(ADD(L|Q) x (ADD(L|Q) y y)) => (LEA(L|Q)2 x y) +(ADD(L|Q) x (ADD(L|Q) x y)) => (LEA(L|Q)2 y x) + +// combine ADDQ/ADDQconst into LEAQ1/LEAL1 +(ADD(Q|L)const [c] (ADD(Q|L) x y)) => (LEA(Q|L)1 [c] x y) +(ADD(Q|L) (ADD(Q|L)const [c] x) y) => (LEA(Q|L)1 [c] x y) +(ADD(Q|L)const [c] (SHL(Q|L)const [1] x)) => (LEA(Q|L)1 [c] x x) + +// fold ADDQ/ADDL into LEAQ/LEAL +(ADD(Q|L)const [c] (LEA(Q|L) [d] {s} x)) && is32Bit(int64(c)+int64(d)) => (LEA(Q|L) [c+d] {s} x) +(LEA(Q|L) [c] {s} (ADD(Q|L)const [d] x)) && is32Bit(int64(c)+int64(d)) => (LEA(Q|L) [c+d] {s} x) +(LEA(Q|L) [c] {s} (ADD(Q|L) x y)) && x.Op != OpSB && y.Op != OpSB => (LEA(Q|L)1 [c] {s} x y) +(ADD(Q|L) x (LEA(Q|L) [c] {s} y)) && x.Op != OpSB && y.Op != OpSB => (LEA(Q|L)1 [c] {s} x y) + +// fold ADDQconst/ADDLconst into LEAQx/LEALx +(ADD(Q|L)const [c] (LEA(Q|L)1 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEA(Q|L)1 [c+d] {s} x y) +(ADD(Q|L)const [c] (LEA(Q|L)2 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEA(Q|L)2 [c+d] {s} x y) +(ADD(Q|L)const [c] (LEA(Q|L)4 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEA(Q|L)4 [c+d] {s} x y) +(ADD(Q|L)const [c] (LEA(Q|L)8 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEA(Q|L)8 [c+d] {s} x y) +(LEA(Q|L)1 [c] {s} (ADD(Q|L)const [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEA(Q|L)1 [c+d] {s} x y) +(LEA(Q|L)2 [c] {s} (ADD(Q|L)const [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEA(Q|L)2 [c+d] {s} x y) +(LEA(Q|L)2 [c] {s} x (ADD(Q|L)const [d] y)) && is32Bit(int64(c)+2*int64(d)) && y.Op != OpSB => (LEA(Q|L)2 [c+2*d] {s} x y) +(LEA(Q|L)4 [c] {s} (ADD(Q|L)const [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEA(Q|L)4 [c+d] {s} x y) +(LEA(Q|L)4 [c] {s} x (ADD(Q|L)const [d] y)) && is32Bit(int64(c)+4*int64(d)) && y.Op != OpSB => (LEA(Q|L)4 [c+4*d] {s} x y) +(LEA(Q|L)8 [c] {s} (ADD(Q|L)const [d] x) y) && is32Bit(int64(c)+int64(d)) && x.Op != OpSB => (LEA(Q|L)8 [c+d] {s} x y) +(LEA(Q|L)8 [c] {s} x (ADD(Q|L)const [d] y)) && is32Bit(int64(c)+8*int64(d)) && y.Op != OpSB => (LEA(Q|L)8 [c+8*d] {s} x y) + +// fold shifts into LEAQx/LEALx +(LEA(Q|L)1 [c] {s} x (SHL(Q|L)const [1] y)) => (LEA(Q|L)2 [c] {s} x y) +(LEA(Q|L)1 [c] {s} x (SHL(Q|L)const [2] y)) => (LEA(Q|L)4 [c] {s} x y) +(LEA(Q|L)1 [c] {s} x (SHL(Q|L)const [3] y)) => (LEA(Q|L)8 [c] {s} x y) +(LEA(Q|L)2 [c] {s} x (SHL(Q|L)const [1] y)) => (LEA(Q|L)4 [c] {s} x y) +(LEA(Q|L)2 [c] {s} x (SHL(Q|L)const [2] y)) => (LEA(Q|L)8 [c] {s} x y) +(LEA(Q|L)4 [c] {s} x (SHL(Q|L)const [1] y)) => (LEA(Q|L)8 [c] {s} x y) + +// reverse ordering of compare instruction +(SETL (InvertFlags x)) => (SETG x) +(SETG (InvertFlags x)) => (SETL x) +(SETB (InvertFlags x)) => (SETA x) +(SETA (InvertFlags x)) => (SETB x) +(SETLE (InvertFlags x)) => (SETGE x) +(SETGE (InvertFlags x)) => (SETLE x) +(SETBE (InvertFlags x)) => (SETAE x) +(SETAE (InvertFlags x)) => (SETBE x) +(SETEQ (InvertFlags x)) => (SETEQ x) +(SETNE (InvertFlags x)) => (SETNE x) + +(SETLstore [off] {sym} ptr (InvertFlags x) mem) => (SETGstore [off] {sym} ptr x mem) +(SETGstore [off] {sym} ptr (InvertFlags x) mem) => (SETLstore [off] {sym} ptr x mem) +(SETBstore [off] {sym} ptr (InvertFlags x) mem) => (SETAstore [off] {sym} ptr x mem) +(SETAstore [off] {sym} ptr (InvertFlags x) mem) => (SETBstore [off] {sym} ptr x mem) +(SETLEstore [off] {sym} ptr (InvertFlags x) mem) => (SETGEstore [off] {sym} ptr x mem) +(SETGEstore [off] {sym} ptr (InvertFlags x) mem) => (SETLEstore [off] {sym} ptr x mem) +(SETBEstore [off] {sym} ptr (InvertFlags x) mem) => (SETAEstore [off] {sym} ptr x mem) +(SETAEstore [off] {sym} ptr (InvertFlags x) mem) => (SETBEstore [off] {sym} ptr x mem) +(SETEQstore [off] {sym} ptr (InvertFlags x) mem) => (SETEQstore [off] {sym} ptr x mem) +(SETNEstore [off] {sym} ptr (InvertFlags x) mem) => (SETNEstore [off] {sym} ptr x mem) + +// sign extended loads +// Note: The combined instruction must end up in the same block +// as the original load. If not, we end up making a value with +// memory type live in two different blocks, which can lead to +// multiple memory values alive simultaneously. +// Make sure we don't combine these ops if the load has another use. +// This prevents a single load from being split into multiple loads +// which then might return different values. See test/atomicload.go. +(MOVBQSX x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBQSXload <v.Type> [off] {sym} ptr mem) +(MOVBQSX x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBQSXload <v.Type> [off] {sym} ptr mem) +(MOVBQSX x:(MOVLload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBQSXload <v.Type> [off] {sym} ptr mem) +(MOVBQSX x:(MOVQload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBQSXload <v.Type> [off] {sym} ptr mem) +(MOVBQZX x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBload <v.Type> [off] {sym} ptr mem) +(MOVBQZX x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBload <v.Type> [off] {sym} ptr mem) +(MOVBQZX x:(MOVLload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBload <v.Type> [off] {sym} ptr mem) +(MOVBQZX x:(MOVQload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBload <v.Type> [off] {sym} ptr mem) +(MOVWQSX x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWQSXload <v.Type> [off] {sym} ptr mem) +(MOVWQSX x:(MOVLload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWQSXload <v.Type> [off] {sym} ptr mem) +(MOVWQSX x:(MOVQload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWQSXload <v.Type> [off] {sym} ptr mem) +(MOVWQZX x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWload <v.Type> [off] {sym} ptr mem) +(MOVWQZX x:(MOVLload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWload <v.Type> [off] {sym} ptr mem) +(MOVWQZX x:(MOVQload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWload <v.Type> [off] {sym} ptr mem) +(MOVLQSX x:(MOVLload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVLQSXload <v.Type> [off] {sym} ptr mem) +(MOVLQSX x:(MOVQload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVLQSXload <v.Type> [off] {sym} ptr mem) +(MOVLQZX x:(MOVLload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVLload <v.Type> [off] {sym} ptr mem) +(MOVLQZX x:(MOVQload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVLload <v.Type> [off] {sym} ptr mem) + +(MOVLQZX x) && zeroUpper32Bits(x,3) => x +(MOVWQZX x) && zeroUpper48Bits(x,3) => x +(MOVBQZX x) && zeroUpper56Bits(x,3) => x + +// replace load from same location as preceding store with zero/sign extension (or copy in case of full width) +(MOVBload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBQZX x) +(MOVWload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVWQZX x) +(MOVLload [off] {sym} ptr (MOVLstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVLQZX x) +(MOVQload [off] {sym} ptr (MOVQstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x +(MOVBQSXload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBQSX x) +(MOVWQSXload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVWQSX x) +(MOVLQSXload [off] {sym} ptr (MOVLstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVLQSX x) + +// Fold extensions and ANDs together. +(MOVBQZX (ANDLconst [c] x)) => (ANDLconst [c & 0xff] x) +(MOVWQZX (ANDLconst [c] x)) => (ANDLconst [c & 0xffff] x) +(MOVLQZX (ANDLconst [c] x)) => (ANDLconst [c] x) +(MOVBQSX (ANDLconst [c] x)) && c & 0x80 == 0 => (ANDLconst [c & 0x7f] x) +(MOVWQSX (ANDLconst [c] x)) && c & 0x8000 == 0 => (ANDLconst [c & 0x7fff] x) +(MOVLQSX (ANDLconst [c] x)) && uint32(c) & 0x80000000 == 0 => (ANDLconst [c & 0x7fffffff] x) + +// Don't extend before storing +(MOVLstore [off] {sym} ptr (MOVLQSX x) mem) => (MOVLstore [off] {sym} ptr x mem) +(MOVWstore [off] {sym} ptr (MOVWQSX x) mem) => (MOVWstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVBQSX x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVLstore [off] {sym} ptr (MOVLQZX x) mem) => (MOVLstore [off] {sym} ptr x mem) +(MOVWstore [off] {sym} ptr (MOVWQZX x) mem) => (MOVWstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVBQZX x) mem) => (MOVBstore [off] {sym} ptr x mem) + +// fold constants into memory operations +// Note that this is not always a good idea because if not all the uses of +// the ADDQconst get eliminated, we still have to compute the ADDQconst and we now +// have potentially two live values (ptr and (ADDQconst [off] ptr)) instead of one. +// Nevertheless, let's do it! +(MOV(Q|L|W|B|SS|SD|O)load [off1] {sym} (ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) => + (MOV(Q|L|W|B|SS|SD|O)load [off1+off2] {sym} ptr mem) +(MOV(Q|L|W|B|SS|SD|O)store [off1] {sym} (ADDQconst [off2] ptr) val mem) && is32Bit(int64(off1)+int64(off2)) => + (MOV(Q|L|W|B|SS|SD|O)store [off1+off2] {sym} ptr val mem) +(SET(L|G|B|A|LE|GE|BE|AE|EQ|NE)store [off1] {sym} (ADDQconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) => + (SET(L|G|B|A|LE|GE|BE|AE|EQ|NE)store [off1+off2] {sym} base val mem) +((ADD|SUB|AND|OR|XOR)Qload [off1] {sym} val (ADDQconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) => + ((ADD|SUB|AND|OR|XOR)Qload [off1+off2] {sym} val base mem) +((ADD|SUB|AND|OR|XOR)Lload [off1] {sym} val (ADDQconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) => + ((ADD|SUB|AND|OR|XOR)Lload [off1+off2] {sym} val base mem) +(CMP(Q|L|W|B)load [off1] {sym} (ADDQconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) => + (CMP(Q|L|W|B)load [off1+off2] {sym} base val mem) +(CMP(Q|L|W|B)constload [valoff1] {sym} (ADDQconst [off2] base) mem) && ValAndOff(valoff1).canAdd32(off2) => + (CMP(Q|L|W|B)constload [ValAndOff(valoff1).addOffset32(off2)] {sym} base mem) + +((ADD|SUB|MUL|DIV)SSload [off1] {sym} val (ADDQconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) => + ((ADD|SUB|MUL|DIV)SSload [off1+off2] {sym} val base mem) +((ADD|SUB|MUL|DIV)SDload [off1] {sym} val (ADDQconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) => + ((ADD|SUB|MUL|DIV)SDload [off1+off2] {sym} val base mem) +((ADD|AND|OR|XOR)Qconstmodify [valoff1] {sym} (ADDQconst [off2] base) mem) && ValAndOff(valoff1).canAdd32(off2) => + ((ADD|AND|OR|XOR)Qconstmodify [ValAndOff(valoff1).addOffset32(off2)] {sym} base mem) +((ADD|AND|OR|XOR)Lconstmodify [valoff1] {sym} (ADDQconst [off2] base) mem) && ValAndOff(valoff1).canAdd32(off2) => + ((ADD|AND|OR|XOR)Lconstmodify [ValAndOff(valoff1).addOffset32(off2)] {sym} base mem) +((ADD|SUB|AND|OR|XOR)Qmodify [off1] {sym} (ADDQconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) => + ((ADD|SUB|AND|OR|XOR)Qmodify [off1+off2] {sym} base val mem) +((ADD|SUB|AND|OR|XOR)Lmodify [off1] {sym} (ADDQconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) => + ((ADD|SUB|AND|OR|XOR)Lmodify [off1+off2] {sym} base val mem) + +// Fold constants into stores. +(MOVQstore [off] {sym} ptr (MOVQconst [c]) mem) && validVal(c) => + (MOVQstoreconst [makeValAndOff(int32(c),off)] {sym} ptr mem) +(MOVLstore [off] {sym} ptr (MOV(L|Q)const [c]) mem) => + (MOVLstoreconst [makeValAndOff(int32(c),off)] {sym} ptr mem) +(MOVWstore [off] {sym} ptr (MOV(L|Q)const [c]) mem) => + (MOVWstoreconst [makeValAndOff(int32(int16(c)),off)] {sym} ptr mem) +(MOVBstore [off] {sym} ptr (MOV(L|Q)const [c]) mem) => + (MOVBstoreconst [makeValAndOff(int32(int8(c)),off)] {sym} ptr mem) + +// Fold address offsets into constant stores. +(MOV(Q|L|W|B|O)storeconst [sc] {s} (ADDQconst [off] ptr) mem) && ValAndOff(sc).canAdd32(off) => + (MOV(Q|L|W|B|O)storeconst [ValAndOff(sc).addOffset32(off)] {s} ptr mem) + +// We need to fold LEAQ into the MOVx ops so that the live variable analysis knows +// what variables are being read/written by the ops. +(MOV(Q|L|W|B|SS|SD|O|BQSX|WQSX|LQSX)load [off1] {sym1} (LEAQ [off2] {sym2} base) mem) + && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (MOV(Q|L|W|B|SS|SD|O|BQSX|WQSX|LQSX)load [off1+off2] {mergeSym(sym1,sym2)} base mem) +(MOV(Q|L|W|B|SS|SD|O)store [off1] {sym1} (LEAQ [off2] {sym2} base) val mem) + && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (MOV(Q|L|W|B|SS|SD|O)store [off1+off2] {mergeSym(sym1,sym2)} base val mem) +(MOV(Q|L|W|B|O)storeconst [sc] {sym1} (LEAQ [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd32(off) => + (MOV(Q|L|W|B|O)storeconst [ValAndOff(sc).addOffset32(off)] {mergeSym(sym1, sym2)} ptr mem) +(SET(L|G|B|A|LE|GE|BE|AE|EQ|NE)store [off1] {sym1} (LEAQ [off2] {sym2} base) val mem) + && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (SET(L|G|B|A|LE|GE|BE|AE|EQ|NE)store [off1+off2] {mergeSym(sym1,sym2)} base val mem) +((ADD|SUB|AND|OR|XOR)Qload [off1] {sym1} val (LEAQ [off2] {sym2} base) mem) + && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + ((ADD|SUB|AND|OR|XOR)Qload [off1+off2] {mergeSym(sym1,sym2)} val base mem) +((ADD|SUB|AND|OR|XOR)Lload [off1] {sym1} val (LEAQ [off2] {sym2} base) mem) + && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + ((ADD|SUB|AND|OR|XOR)Lload [off1+off2] {mergeSym(sym1,sym2)} val base mem) +(CMP(Q|L|W|B)load [off1] {sym1} (LEAQ [off2] {sym2} base) val mem) + && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (CMP(Q|L|W|B)load [off1+off2] {mergeSym(sym1,sym2)} base val mem) +(CMP(Q|L|W|B)constload [valoff1] {sym1} (LEAQ [off2] {sym2} base) mem) + && ValAndOff(valoff1).canAdd32(off2) && canMergeSym(sym1, sym2) => + (CMP(Q|L|W|B)constload [ValAndOff(valoff1).addOffset32(off2)] {mergeSym(sym1,sym2)} base mem) + +((ADD|SUB|MUL|DIV)SSload [off1] {sym1} val (LEAQ [off2] {sym2} base) mem) + && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + ((ADD|SUB|MUL|DIV)SSload [off1+off2] {mergeSym(sym1,sym2)} val base mem) +((ADD|SUB|MUL|DIV)SDload [off1] {sym1} val (LEAQ [off2] {sym2} base) mem) + && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + ((ADD|SUB|MUL|DIV)SDload [off1+off2] {mergeSym(sym1,sym2)} val base mem) +((ADD|AND|OR|XOR)Qconstmodify [valoff1] {sym1} (LEAQ [off2] {sym2} base) mem) + && ValAndOff(valoff1).canAdd32(off2) && canMergeSym(sym1, sym2) => + ((ADD|AND|OR|XOR)Qconstmodify [ValAndOff(valoff1).addOffset32(off2)] {mergeSym(sym1,sym2)} base mem) +((ADD|AND|OR|XOR)Lconstmodify [valoff1] {sym1} (LEAQ [off2] {sym2} base) mem) + && ValAndOff(valoff1).canAdd32(off2) && canMergeSym(sym1, sym2) => + ((ADD|AND|OR|XOR)Lconstmodify [ValAndOff(valoff1).addOffset32(off2)] {mergeSym(sym1,sym2)} base mem) +((ADD|SUB|AND|OR|XOR)Qmodify [off1] {sym1} (LEAQ [off2] {sym2} base) val mem) + && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + ((ADD|SUB|AND|OR|XOR)Qmodify [off1+off2] {mergeSym(sym1,sym2)} base val mem) +((ADD|SUB|AND|OR|XOR)Lmodify [off1] {sym1} (LEAQ [off2] {sym2} base) val mem) + && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + ((ADD|SUB|AND|OR|XOR)Lmodify [off1+off2] {mergeSym(sym1,sym2)} base val mem) + +// fold LEAQs together +(LEAQ [off1] {sym1} (LEAQ [off2] {sym2} x)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (LEAQ [off1+off2] {mergeSym(sym1,sym2)} x) + +// LEAQ into LEAQ1 +(LEAQ1 [off1] {sym1} (LEAQ [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB => + (LEAQ1 [off1+off2] {mergeSym(sym1,sym2)} x y) + +// LEAQ1 into LEAQ +(LEAQ [off1] {sym1} (LEAQ1 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (LEAQ1 [off1+off2] {mergeSym(sym1,sym2)} x y) + +// LEAQ into LEAQ[248] +(LEAQ2 [off1] {sym1} (LEAQ [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB => + (LEAQ2 [off1+off2] {mergeSym(sym1,sym2)} x y) +(LEAQ4 [off1] {sym1} (LEAQ [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB => + (LEAQ4 [off1+off2] {mergeSym(sym1,sym2)} x y) +(LEAQ8 [off1] {sym1} (LEAQ [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB => + (LEAQ8 [off1+off2] {mergeSym(sym1,sym2)} x y) + +// LEAQ[248] into LEAQ +(LEAQ [off1] {sym1} (LEAQ2 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (LEAQ2 [off1+off2] {mergeSym(sym1,sym2)} x y) +(LEAQ [off1] {sym1} (LEAQ4 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (LEAQ4 [off1+off2] {mergeSym(sym1,sym2)} x y) +(LEAQ [off1] {sym1} (LEAQ8 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (LEAQ8 [off1+off2] {mergeSym(sym1,sym2)} x y) + +// LEAQ[1248] into LEAQ[1248]. Only some such merges are possible. +(LEAQ1 [off1] {sym1} x (LEAQ1 [off2] {sym2} y y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (LEAQ2 [off1+off2] {mergeSym(sym1, sym2)} x y) +(LEAQ1 [off1] {sym1} x (LEAQ1 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (LEAQ2 [off1+off2] {mergeSym(sym1, sym2)} y x) +(LEAQ2 [off1] {sym1} x (LEAQ1 [off2] {sym2} y y)) && is32Bit(int64(off1)+2*int64(off2)) && sym2 == nil => + (LEAQ4 [off1+2*off2] {sym1} x y) +(LEAQ4 [off1] {sym1} x (LEAQ1 [off2] {sym2} y y)) && is32Bit(int64(off1)+4*int64(off2)) && sym2 == nil => + (LEAQ8 [off1+4*off2] {sym1} x y) +// TODO: more? + +// Lower LEAQ2/4/8 when the offset is a constant +(LEAQ2 [off] {sym} x (MOV(Q|L)const [scale])) && is32Bit(int64(off)+int64(scale)*2) => + (LEAQ [off+int32(scale)*2] {sym} x) +(LEAQ4 [off] {sym} x (MOV(Q|L)const [scale])) && is32Bit(int64(off)+int64(scale)*4) => + (LEAQ [off+int32(scale)*4] {sym} x) +(LEAQ8 [off] {sym} x (MOV(Q|L)const [scale])) && is32Bit(int64(off)+int64(scale)*8) => + (LEAQ [off+int32(scale)*8] {sym} x) + +// Absorb InvertFlags into branches. +(LT (InvertFlags cmp) yes no) => (GT cmp yes no) +(GT (InvertFlags cmp) yes no) => (LT cmp yes no) +(LE (InvertFlags cmp) yes no) => (GE cmp yes no) +(GE (InvertFlags cmp) yes no) => (LE cmp yes no) +(ULT (InvertFlags cmp) yes no) => (UGT cmp yes no) +(UGT (InvertFlags cmp) yes no) => (ULT cmp yes no) +(ULE (InvertFlags cmp) yes no) => (UGE cmp yes no) +(UGE (InvertFlags cmp) yes no) => (ULE cmp yes no) +(EQ (InvertFlags cmp) yes no) => (EQ cmp yes no) +(NE (InvertFlags cmp) yes no) => (NE cmp yes no) + +// Constant comparisons. +(CMPQconst (MOVQconst [x]) [y]) && x==int64(y) => (FlagEQ) +(CMPQconst (MOVQconst [x]) [y]) && x<int64(y) && uint64(x)<uint64(int64(y)) => (FlagLT_ULT) +(CMPQconst (MOVQconst [x]) [y]) && x<int64(y) && uint64(x)>uint64(int64(y)) => (FlagLT_UGT) +(CMPQconst (MOVQconst [x]) [y]) && x>int64(y) && uint64(x)<uint64(int64(y)) => (FlagGT_ULT) +(CMPQconst (MOVQconst [x]) [y]) && x>int64(y) && uint64(x)>uint64(int64(y)) => (FlagGT_UGT) +(CMPLconst (MOVLconst [x]) [y]) && x==y => (FlagEQ) +(CMPLconst (MOVLconst [x]) [y]) && x<y && uint32(x)<uint32(y) => (FlagLT_ULT) +(CMPLconst (MOVLconst [x]) [y]) && x<y && uint32(x)>uint32(y) => (FlagLT_UGT) +(CMPLconst (MOVLconst [x]) [y]) && x>y && uint32(x)<uint32(y) => (FlagGT_ULT) +(CMPLconst (MOVLconst [x]) [y]) && x>y && uint32(x)>uint32(y) => (FlagGT_UGT) +(CMPWconst (MOVLconst [x]) [y]) && int16(x)==y => (FlagEQ) +(CMPWconst (MOVLconst [x]) [y]) && int16(x)<y && uint16(x)<uint16(y) => (FlagLT_ULT) +(CMPWconst (MOVLconst [x]) [y]) && int16(x)<y && uint16(x)>uint16(y) => (FlagLT_UGT) +(CMPWconst (MOVLconst [x]) [y]) && int16(x)>y && uint16(x)<uint16(y) => (FlagGT_ULT) +(CMPWconst (MOVLconst [x]) [y]) && int16(x)>y && uint16(x)>uint16(y) => (FlagGT_UGT) +(CMPBconst (MOVLconst [x]) [y]) && int8(x)==y => (FlagEQ) +(CMPBconst (MOVLconst [x]) [y]) && int8(x)<y && uint8(x)<uint8(y) => (FlagLT_ULT) +(CMPBconst (MOVLconst [x]) [y]) && int8(x)<y && uint8(x)>uint8(y) => (FlagLT_UGT) +(CMPBconst (MOVLconst [x]) [y]) && int8(x)>y && uint8(x)<uint8(y) => (FlagGT_ULT) +(CMPBconst (MOVLconst [x]) [y]) && int8(x)>y && uint8(x)>uint8(y) => (FlagGT_UGT) + +// CMPQconst requires a 32 bit const, but we can still constant-fold 64 bit consts. +// In theory this applies to any of the simplifications above, +// but CMPQ is the only one I've actually seen occur. +(CMPQ (MOVQconst [x]) (MOVQconst [y])) && x==y => (FlagEQ) +(CMPQ (MOVQconst [x]) (MOVQconst [y])) && x<y && uint64(x)<uint64(y) => (FlagLT_ULT) +(CMPQ (MOVQconst [x]) (MOVQconst [y])) && x<y && uint64(x)>uint64(y) => (FlagLT_UGT) +(CMPQ (MOVQconst [x]) (MOVQconst [y])) && x>y && uint64(x)<uint64(y) => (FlagGT_ULT) +(CMPQ (MOVQconst [x]) (MOVQconst [y])) && x>y && uint64(x)>uint64(y) => (FlagGT_UGT) + +// Other known comparisons. +(CMPQconst (MOVBQZX _) [c]) && 0xFF < c => (FlagLT_ULT) +(CMPQconst (MOVWQZX _) [c]) && 0xFFFF < c => (FlagLT_ULT) +(CMPLconst (SHRLconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 32 && (1<<uint64(32-c)) <= uint64(n) => (FlagLT_ULT) +(CMPQconst (SHRQconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 64 && (1<<uint64(64-c)) <= uint64(n) => (FlagLT_ULT) +(CMPQconst (ANDQconst _ [m]) [n]) && 0 <= m && m < n => (FlagLT_ULT) +(CMPQconst (ANDLconst _ [m]) [n]) && 0 <= m && m < n => (FlagLT_ULT) +(CMPLconst (ANDLconst _ [m]) [n]) && 0 <= m && m < n => (FlagLT_ULT) +(CMPWconst (ANDLconst _ [m]) [n]) && 0 <= int16(m) && int16(m) < n => (FlagLT_ULT) +(CMPBconst (ANDLconst _ [m]) [n]) && 0 <= int8(m) && int8(m) < n => (FlagLT_ULT) + +// TESTQ c c sets flags like CMPQ c 0. +(TESTQconst [c] (MOVQconst [d])) && int64(c) == d && c == 0 => (FlagEQ) +(TESTLconst [c] (MOVLconst [c])) && c == 0 => (FlagEQ) +(TESTQconst [c] (MOVQconst [d])) && int64(c) == d && c < 0 => (FlagLT_UGT) +(TESTLconst [c] (MOVLconst [c])) && c < 0 => (FlagLT_UGT) +(TESTQconst [c] (MOVQconst [d])) && int64(c) == d && c > 0 => (FlagGT_UGT) +(TESTLconst [c] (MOVLconst [c])) && c > 0 => (FlagGT_UGT) + +// TODO: DIVxU also. + +// Absorb flag constants into SBB ops. +(SBBQcarrymask (FlagEQ)) => (MOVQconst [0]) +(SBBQcarrymask (FlagLT_ULT)) => (MOVQconst [-1]) +(SBBQcarrymask (FlagLT_UGT)) => (MOVQconst [0]) +(SBBQcarrymask (FlagGT_ULT)) => (MOVQconst [-1]) +(SBBQcarrymask (FlagGT_UGT)) => (MOVQconst [0]) +(SBBLcarrymask (FlagEQ)) => (MOVLconst [0]) +(SBBLcarrymask (FlagLT_ULT)) => (MOVLconst [-1]) +(SBBLcarrymask (FlagLT_UGT)) => (MOVLconst [0]) +(SBBLcarrymask (FlagGT_ULT)) => (MOVLconst [-1]) +(SBBLcarrymask (FlagGT_UGT)) => (MOVLconst [0]) + +// Absorb flag constants into branches. +((EQ|LE|GE|ULE|UGE) (FlagEQ) yes no) => (First yes no) +((NE|LT|GT|ULT|UGT) (FlagEQ) yes no) => (First no yes) +((NE|LT|LE|ULT|ULE) (FlagLT_ULT) yes no) => (First yes no) +((EQ|GT|GE|UGT|UGE) (FlagLT_ULT) yes no) => (First no yes) +((NE|LT|LE|UGT|UGE) (FlagLT_UGT) yes no) => (First yes no) +((EQ|GT|GE|ULT|ULE) (FlagLT_UGT) yes no) => (First no yes) +((NE|GT|GE|ULT|ULE) (FlagGT_ULT) yes no) => (First yes no) +((EQ|LT|LE|UGT|UGE) (FlagGT_ULT) yes no) => (First no yes) +((NE|GT|GE|UGT|UGE) (FlagGT_UGT) yes no) => (First yes no) +((EQ|LT|LE|ULT|ULE) (FlagGT_UGT) yes no) => (First no yes) + +// Absorb flag constants into SETxx ops. +((SETEQ|SETLE|SETGE|SETBE|SETAE) (FlagEQ)) => (MOVLconst [1]) +((SETNE|SETL|SETG|SETB|SETA) (FlagEQ)) => (MOVLconst [0]) +((SETNE|SETL|SETLE|SETB|SETBE) (FlagLT_ULT)) => (MOVLconst [1]) +((SETEQ|SETG|SETGE|SETA|SETAE) (FlagLT_ULT)) => (MOVLconst [0]) +((SETNE|SETL|SETLE|SETA|SETAE) (FlagLT_UGT)) => (MOVLconst [1]) +((SETEQ|SETG|SETGE|SETB|SETBE) (FlagLT_UGT)) => (MOVLconst [0]) +((SETNE|SETG|SETGE|SETB|SETBE) (FlagGT_ULT)) => (MOVLconst [1]) +((SETEQ|SETL|SETLE|SETA|SETAE) (FlagGT_ULT)) => (MOVLconst [0]) +((SETNE|SETG|SETGE|SETA|SETAE) (FlagGT_UGT)) => (MOVLconst [1]) +((SETEQ|SETL|SETLE|SETB|SETBE) (FlagGT_UGT)) => (MOVLconst [0]) + +(SETEQstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) +(SETEQstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) +(SETEQstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) +(SETEQstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) +(SETEQstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) + +(SETNEstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) +(SETNEstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) +(SETNEstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) +(SETNEstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) +(SETNEstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) + +(SETLstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) +(SETLstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) +(SETLstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) +(SETLstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) +(SETLstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) + +(SETLEstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) +(SETLEstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) +(SETLEstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) +(SETLEstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) +(SETLEstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) + +(SETGstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) +(SETGstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) +(SETGstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) +(SETGstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) +(SETGstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) + +(SETGEstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) +(SETGEstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) +(SETGEstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) +(SETGEstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) +(SETGEstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) + +(SETBstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) +(SETBstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) +(SETBstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) +(SETBstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) +(SETBstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) + +(SETBEstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) +(SETBEstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) +(SETBEstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) +(SETBEstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) +(SETBEstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) + +(SETAstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) +(SETAstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) +(SETAstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) +(SETAstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) +(SETAstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) + +(SETAEstore [off] {sym} ptr (FlagEQ) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) +(SETAEstore [off] {sym} ptr (FlagLT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) +(SETAEstore [off] {sym} ptr (FlagLT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) +(SETAEstore [off] {sym} ptr (FlagGT_ULT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [0]) mem) +(SETAEstore [off] {sym} ptr (FlagGT_UGT) mem) => (MOVBstore [off] {sym} ptr (MOVLconst <typ.UInt8> [1]) mem) + +// Remove redundant *const ops +(ADDQconst [0] x) => x +(ADDLconst [c] x) && c==0 => x +(SUBQconst [0] x) => x +(SUBLconst [c] x) && c==0 => x +(ANDQconst [0] _) => (MOVQconst [0]) +(ANDLconst [c] _) && c==0 => (MOVLconst [0]) +(ANDQconst [-1] x) => x +(ANDLconst [c] x) && c==-1 => x +(ORQconst [0] x) => x +(ORLconst [c] x) && c==0 => x +(ORQconst [-1] _) => (MOVQconst [-1]) +(ORLconst [c] _) && c==-1 => (MOVLconst [-1]) +(XORQconst [0] x) => x +(XORLconst [c] x) && c==0 => x +// TODO: since we got rid of the W/B versions, we might miss +// things like (ANDLconst [0x100] x) which were formerly +// (ANDBconst [0] x). Probably doesn't happen very often. +// If we cared, we might do: +// (ANDLconst <t> [c] x) && t.Size()==1 && int8(x)==0 -> (MOVLconst [0]) + +// Remove redundant ops +// Not in generic rules, because they may appear after lowering e. g. Slicemask +(NEG(Q|L) (NEG(Q|L) x)) => x +(NEG(Q|L) s:(SUB(Q|L) x y)) && s.Uses == 1 => (SUB(Q|L) y x) + +// Convert constant subtracts to constant adds +(SUBQconst [c] x) && c != -(1<<31) => (ADDQconst [-c] x) +(SUBLconst [c] x) => (ADDLconst [-c] x) + +// generic constant folding +// TODO: more of this +(ADDQconst [c] (MOVQconst [d])) => (MOVQconst [int64(c)+d]) +(ADDLconst [c] (MOVLconst [d])) => (MOVLconst [c+d]) +(ADDQconst [c] (ADDQconst [d] x)) && is32Bit(int64(c)+int64(d)) => (ADDQconst [c+d] x) +(ADDLconst [c] (ADDLconst [d] x)) => (ADDLconst [c+d] x) +(SUBQconst (MOVQconst [d]) [c]) => (MOVQconst [d-int64(c)]) +(SUBQconst (SUBQconst x [d]) [c]) && is32Bit(int64(-c)-int64(d)) => (ADDQconst [-c-d] x) +(SARQconst [c] (MOVQconst [d])) => (MOVQconst [d>>uint64(c)]) +(SARLconst [c] (MOVQconst [d])) => (MOVQconst [int64(int32(d))>>uint64(c)]) +(SARWconst [c] (MOVQconst [d])) => (MOVQconst [int64(int16(d))>>uint64(c)]) +(SARBconst [c] (MOVQconst [d])) => (MOVQconst [int64(int8(d))>>uint64(c)]) +(NEGQ (MOVQconst [c])) => (MOVQconst [-c]) +(NEGL (MOVLconst [c])) => (MOVLconst [-c]) +(MULQconst [c] (MOVQconst [d])) => (MOVQconst [int64(c)*d]) +(MULLconst [c] (MOVLconst [d])) => (MOVLconst [c*d]) +(ANDQconst [c] (MOVQconst [d])) => (MOVQconst [int64(c)&d]) +(ANDLconst [c] (MOVLconst [d])) => (MOVLconst [c&d]) +(ORQconst [c] (MOVQconst [d])) => (MOVQconst [int64(c)|d]) +(ORLconst [c] (MOVLconst [d])) => (MOVLconst [c|d]) +(XORQconst [c] (MOVQconst [d])) => (MOVQconst [int64(c)^d]) +(XORLconst [c] (MOVLconst [d])) => (MOVLconst [c^d]) +(NOTQ (MOVQconst [c])) => (MOVQconst [^c]) +(NOTL (MOVLconst [c])) => (MOVLconst [^c]) +(BTSQconst [c] (MOVQconst [d])) => (MOVQconst [d|(1<<uint32(c))]) +(BTSLconst [c] (MOVLconst [d])) => (MOVLconst [d|(1<<uint32(c))]) +(BTRQconst [c] (MOVQconst [d])) => (MOVQconst [d&^(1<<uint32(c))]) +(BTRLconst [c] (MOVLconst [d])) => (MOVLconst [d&^(1<<uint32(c))]) +(BTCQconst [c] (MOVQconst [d])) => (MOVQconst [d^(1<<uint32(c))]) +(BTCLconst [c] (MOVLconst [d])) => (MOVLconst [d^(1<<uint32(c))]) + +// If c or d doesn't fit into 32 bits, then we can't construct ORQconst, +// but we can still constant-fold. +// In theory this applies to any of the simplifications above, +// but ORQ is the only one I've actually seen occur. +(ORQ (MOVQconst [c]) (MOVQconst [d])) => (MOVQconst [c|d]) + +// generic simplifications +// TODO: more of this +(ADDQ x (NEGQ y)) => (SUBQ x y) +(ADDL x (NEGL y)) => (SUBL x y) +(SUBQ x x) => (MOVQconst [0]) +(SUBL x x) => (MOVLconst [0]) +(ANDQ x x) => x +(ANDL x x) => x +(ORQ x x) => x +(ORL x x) => x +(XORQ x x) => (MOVQconst [0]) +(XORL x x) => (MOVLconst [0]) + +(SHLLconst [d] (MOVLconst [c])) => (MOVLconst [c << uint64(d)]) +(SHLQconst [d] (MOVQconst [c])) => (MOVQconst [c << uint64(d)]) +(SHLQconst [d] (MOVLconst [c])) => (MOVQconst [int64(c) << uint64(d)]) + +// Fold NEG into ADDconst/MULconst. Take care to keep c in 32 bit range. +(NEGQ (ADDQconst [c] (NEGQ x))) && c != -(1<<31) => (ADDQconst [-c] x) +(MULQconst [c] (NEGQ x)) && c != -(1<<31) => (MULQconst [-c] x) + +// checking AND against 0. +(CMPQconst a:(ANDQ x y) [0]) && a.Uses == 1 => (TESTQ x y) +(CMPLconst a:(ANDL x y) [0]) && a.Uses == 1 => (TESTL x y) +(CMPWconst a:(ANDL x y) [0]) && a.Uses == 1 => (TESTW x y) +(CMPBconst a:(ANDL x y) [0]) && a.Uses == 1 => (TESTB x y) +(CMPQconst a:(ANDQconst [c] x) [0]) && a.Uses == 1 => (TESTQconst [c] x) +(CMPLconst a:(ANDLconst [c] x) [0]) && a.Uses == 1 => (TESTLconst [c] x) +(CMPWconst a:(ANDLconst [c] x) [0]) && a.Uses == 1 => (TESTWconst [int16(c)] x) +(CMPBconst a:(ANDLconst [c] x) [0]) && a.Uses == 1 => (TESTBconst [int8(c)] x) + +// Convert TESTx to TESTxconst if possible. +(TESTQ (MOVQconst [c]) x) && is32Bit(c) => (TESTQconst [int32(c)] x) +(TESTL (MOVLconst [c]) x) => (TESTLconst [c] x) +(TESTW (MOVLconst [c]) x) => (TESTWconst [int16(c)] x) +(TESTB (MOVLconst [c]) x) => (TESTBconst [int8(c)] x) + +// TEST %reg,%reg is shorter than CMP +(CMPQconst x [0]) => (TESTQ x x) +(CMPLconst x [0]) => (TESTL x x) +(CMPWconst x [0]) => (TESTW x x) +(CMPBconst x [0]) => (TESTB x x) +(TESTQconst [-1] x) && x.Op != OpAMD64MOVQconst => (TESTQ x x) +(TESTLconst [-1] x) && x.Op != OpAMD64MOVLconst => (TESTL x x) +(TESTWconst [-1] x) && x.Op != OpAMD64MOVLconst => (TESTW x x) +(TESTBconst [-1] x) && x.Op != OpAMD64MOVLconst => (TESTB x x) + +// Convert LEAQ1 back to ADDQ if we can +(LEAQ1 [0] x y) && v.Aux == nil => (ADDQ x y) + +// Combining byte loads into larger (unaligned) loads. +// There are many ways these combinations could occur. This is +// designed to match the way encoding/binary.LittleEndian does it. + +// Little-endian loads + +(OR(L|Q) x0:(MOVBload [i0] {s} p mem) + sh:(SHL(L|Q)const [8] x1:(MOVBload [i1] {s} p mem))) + && i1 == i0+1 + && x0.Uses == 1 + && x1.Uses == 1 + && sh.Uses == 1 + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, sh) + => @mergePoint(b,x0,x1) (MOVWload [i0] {s} p mem) + +(OR(L|Q) x0:(MOVBload [i] {s} p0 mem) + sh:(SHL(L|Q)const [8] x1:(MOVBload [i] {s} p1 mem))) + && x0.Uses == 1 + && x1.Uses == 1 + && sh.Uses == 1 + && sequentialAddresses(p0, p1, 1) + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, sh) + => @mergePoint(b,x0,x1) (MOVWload [i] {s} p0 mem) + +(OR(L|Q) x0:(MOVWload [i0] {s} p mem) + sh:(SHL(L|Q)const [16] x1:(MOVWload [i1] {s} p mem))) + && i1 == i0+2 + && x0.Uses == 1 + && x1.Uses == 1 + && sh.Uses == 1 + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, sh) + => @mergePoint(b,x0,x1) (MOVLload [i0] {s} p mem) + +(OR(L|Q) x0:(MOVWload [i] {s} p0 mem) + sh:(SHL(L|Q)const [16] x1:(MOVWload [i] {s} p1 mem))) + && x0.Uses == 1 + && x1.Uses == 1 + && sh.Uses == 1 + && sequentialAddresses(p0, p1, 2) + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, sh) + => @mergePoint(b,x0,x1) (MOVLload [i] {s} p0 mem) + +(ORQ x0:(MOVLload [i0] {s} p mem) + sh:(SHLQconst [32] x1:(MOVLload [i1] {s} p mem))) + && i1 == i0+4 + && x0.Uses == 1 + && x1.Uses == 1 + && sh.Uses == 1 + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, sh) + => @mergePoint(b,x0,x1) (MOVQload [i0] {s} p mem) + +(ORQ x0:(MOVLload [i] {s} p0 mem) + sh:(SHLQconst [32] x1:(MOVLload [i] {s} p1 mem))) + && x0.Uses == 1 + && x1.Uses == 1 + && sh.Uses == 1 + && sequentialAddresses(p0, p1, 4) + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, sh) + => @mergePoint(b,x0,x1) (MOVQload [i] {s} p0 mem) + +(OR(L|Q) + s1:(SHL(L|Q)const [j1] x1:(MOVBload [i1] {s} p mem)) + or:(OR(L|Q) + s0:(SHL(L|Q)const [j0] x0:(MOVBload [i0] {s} p mem)) + y)) + && i1 == i0+1 + && j1 == j0+8 + && j0 % 16 == 0 + && x0.Uses == 1 + && x1.Uses == 1 + && s0.Uses == 1 + && s1.Uses == 1 + && or.Uses == 1 + && mergePoint(b,x0,x1,y) != nil + && clobber(x0, x1, s0, s1, or) + => @mergePoint(b,x0,x1,y) (OR(L|Q) <v.Type> (SHL(L|Q)const <v.Type> [j0] (MOVWload [i0] {s} p mem)) y) + +(OR(L|Q) + s1:(SHL(L|Q)const [j1] x1:(MOVBload [i] {s} p1 mem)) + or:(OR(L|Q) + s0:(SHL(L|Q)const [j0] x0:(MOVBload [i] {s} p0 mem)) + y)) + && j1 == j0+8 + && j0 % 16 == 0 + && x0.Uses == 1 + && x1.Uses == 1 + && s0.Uses == 1 + && s1.Uses == 1 + && or.Uses == 1 + && sequentialAddresses(p0, p1, 1) + && mergePoint(b,x0,x1,y) != nil + && clobber(x0, x1, s0, s1, or) + => @mergePoint(b,x0,x1,y) (OR(L|Q) <v.Type> (SHL(L|Q)const <v.Type> [j0] (MOVWload [i] {s} p0 mem)) y) + +(ORQ + s1:(SHLQconst [j1] x1:(MOVWload [i1] {s} p mem)) + or:(ORQ + s0:(SHLQconst [j0] x0:(MOVWload [i0] {s} p mem)) + y)) + && i1 == i0+2 + && j1 == j0+16 + && j0 % 32 == 0 + && x0.Uses == 1 + && x1.Uses == 1 + && s0.Uses == 1 + && s1.Uses == 1 + && or.Uses == 1 + && mergePoint(b,x0,x1,y) != nil + && clobber(x0, x1, s0, s1, or) + => @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVLload [i0] {s} p mem)) y) + +(ORQ + s1:(SHLQconst [j1] x1:(MOVWload [i] {s} p1 mem)) + or:(ORQ + s0:(SHLQconst [j0] x0:(MOVWload [i] {s} p0 mem)) + y)) + && j1 == j0+16 + && j0 % 32 == 0 + && x0.Uses == 1 + && x1.Uses == 1 + && s0.Uses == 1 + && s1.Uses == 1 + && or.Uses == 1 + && sequentialAddresses(p0, p1, 2) + && mergePoint(b,x0,x1,y) != nil + && clobber(x0, x1, s0, s1, or) + => @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVLload [i] {s} p0 mem)) y) + +// Big-endian loads + +(OR(L|Q) + x1:(MOVBload [i1] {s} p mem) + sh:(SHL(L|Q)const [8] x0:(MOVBload [i0] {s} p mem))) + && i1 == i0+1 + && x0.Uses == 1 + && x1.Uses == 1 + && sh.Uses == 1 + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, sh) + => @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i0] {s} p mem)) + +(OR(L|Q) + x1:(MOVBload [i] {s} p1 mem) + sh:(SHL(L|Q)const [8] x0:(MOVBload [i] {s} p0 mem))) + && x0.Uses == 1 + && x1.Uses == 1 + && sh.Uses == 1 + && sequentialAddresses(p0, p1, 1) + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, sh) + => @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i] {s} p0 mem)) + +(OR(L|Q) + r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p mem)) + sh:(SHL(L|Q)const [16] r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p mem)))) + && i1 == i0+2 + && x0.Uses == 1 + && x1.Uses == 1 + && r0.Uses == 1 + && r1.Uses == 1 + && sh.Uses == 1 + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, r0, r1, sh) + => @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i0] {s} p mem)) + +(OR(L|Q) + r1:(ROLWconst [8] x1:(MOVWload [i] {s} p1 mem)) + sh:(SHL(L|Q)const [16] r0:(ROLWconst [8] x0:(MOVWload [i] {s} p0 mem)))) + && x0.Uses == 1 + && x1.Uses == 1 + && r0.Uses == 1 + && r1.Uses == 1 + && sh.Uses == 1 + && sequentialAddresses(p0, p1, 2) + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, r0, r1, sh) + => @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i] {s} p0 mem)) + +(ORQ + r1:(BSWAPL x1:(MOVLload [i1] {s} p mem)) + sh:(SHLQconst [32] r0:(BSWAPL x0:(MOVLload [i0] {s} p mem)))) + && i1 == i0+4 + && x0.Uses == 1 + && x1.Uses == 1 + && r0.Uses == 1 + && r1.Uses == 1 + && sh.Uses == 1 + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, r0, r1, sh) + => @mergePoint(b,x0,x1) (BSWAPQ <v.Type> (MOVQload [i0] {s} p mem)) + +(ORQ + r1:(BSWAPL x1:(MOVLload [i] {s} p1 mem)) + sh:(SHLQconst [32] r0:(BSWAPL x0:(MOVLload [i] {s} p0 mem)))) + && x0.Uses == 1 + && x1.Uses == 1 + && r0.Uses == 1 + && r1.Uses == 1 + && sh.Uses == 1 + && sequentialAddresses(p0, p1, 4) + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, r0, r1, sh) + => @mergePoint(b,x0,x1) (BSWAPQ <v.Type> (MOVQload [i] {s} p0 mem)) + +(OR(L|Q) + s0:(SHL(L|Q)const [j0] x0:(MOVBload [i0] {s} p mem)) + or:(OR(L|Q) + s1:(SHL(L|Q)const [j1] x1:(MOVBload [i1] {s} p mem)) + y)) + && i1 == i0+1 + && j1 == j0-8 + && j1 % 16 == 0 + && x0.Uses == 1 + && x1.Uses == 1 + && s0.Uses == 1 + && s1.Uses == 1 + && or.Uses == 1 + && mergePoint(b,x0,x1,y) != nil + && clobber(x0, x1, s0, s1, or) + => @mergePoint(b,x0,x1,y) (OR(L|Q) <v.Type> (SHL(L|Q)const <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i0] {s} p mem))) y) + +(OR(L|Q) + s0:(SHL(L|Q)const [j0] x0:(MOVBload [i] {s} p0 mem)) + or:(OR(L|Q) + s1:(SHL(L|Q)const [j1] x1:(MOVBload [i] {s} p1 mem)) + y)) + && j1 == j0-8 + && j1 % 16 == 0 + && x0.Uses == 1 + && x1.Uses == 1 + && s0.Uses == 1 + && s1.Uses == 1 + && or.Uses == 1 + && sequentialAddresses(p0, p1, 1) + && mergePoint(b,x0,x1,y) != nil + && clobber(x0, x1, s0, s1, or) + => @mergePoint(b,x0,x1,y) (OR(L|Q) <v.Type> (SHL(L|Q)const <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i] {s} p0 mem))) y) + +(ORQ + s0:(SHLQconst [j0] r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p mem))) + or:(ORQ + s1:(SHLQconst [j1] r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p mem))) + y)) + && i1 == i0+2 + && j1 == j0-16 + && j1 % 32 == 0 + && x0.Uses == 1 + && x1.Uses == 1 + && r0.Uses == 1 + && r1.Uses == 1 + && s0.Uses == 1 + && s1.Uses == 1 + && or.Uses == 1 + && mergePoint(b,x0,x1,y) != nil + && clobber(x0, x1, r0, r1, s0, s1, or) + => @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (BSWAPL <typ.UInt32> (MOVLload [i0] {s} p mem))) y) + +(ORQ + s0:(SHLQconst [j0] r0:(ROLWconst [8] x0:(MOVWload [i] {s} p0 mem))) + or:(ORQ + s1:(SHLQconst [j1] r1:(ROLWconst [8] x1:(MOVWload [i] {s} p1 mem))) + y)) + && j1 == j0-16 + && j1 % 32 == 0 + && x0.Uses == 1 + && x1.Uses == 1 + && r0.Uses == 1 + && r1.Uses == 1 + && s0.Uses == 1 + && s1.Uses == 1 + && or.Uses == 1 + && sequentialAddresses(p0, p1, 2) + && mergePoint(b,x0,x1,y) != nil + && clobber(x0, x1, r0, r1, s0, s1, or) + => @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (BSWAPL <typ.UInt32> (MOVLload [i] {s} p0 mem))) y) + +// Combine 2 byte stores + shift into rolw 8 + word store +(MOVBstore [i] {s} p w + x0:(MOVBstore [i-1] {s} p (SHRWconst [8] w) mem)) + && x0.Uses == 1 + && clobber(x0) + => (MOVWstore [i-1] {s} p (ROLWconst <typ.UInt16> [8] w) mem) +(MOVBstore [i] {s} p1 w + x0:(MOVBstore [i] {s} p0 (SHRWconst [8] w) mem)) + && x0.Uses == 1 + && sequentialAddresses(p0, p1, 1) + && clobber(x0) + => (MOVWstore [i] {s} p0 (ROLWconst <typ.UInt16> [8] w) mem) + +// Combine stores + shifts into bswap and larger (unaligned) stores +(MOVBstore [i] {s} p w + x2:(MOVBstore [i-1] {s} p (SHRLconst [8] w) + x1:(MOVBstore [i-2] {s} p (SHRLconst [16] w) + x0:(MOVBstore [i-3] {s} p (SHRLconst [24] w) mem)))) + && x0.Uses == 1 + && x1.Uses == 1 + && x2.Uses == 1 + && clobber(x0, x1, x2) + => (MOVLstore [i-3] {s} p (BSWAPL <typ.UInt32> w) mem) +(MOVBstore [i] {s} p3 w + x2:(MOVBstore [i] {s} p2 (SHRLconst [8] w) + x1:(MOVBstore [i] {s} p1 (SHRLconst [16] w) + x0:(MOVBstore [i] {s} p0 (SHRLconst [24] w) mem)))) + && x0.Uses == 1 + && x1.Uses == 1 + && x2.Uses == 1 + && sequentialAddresses(p0, p1, 1) + && sequentialAddresses(p1, p2, 1) + && sequentialAddresses(p2, p3, 1) + && clobber(x0, x1, x2) + => (MOVLstore [i] {s} p0 (BSWAPL <typ.UInt32> w) mem) + +(MOVBstore [i] {s} p w + x6:(MOVBstore [i-1] {s} p (SHRQconst [8] w) + x5:(MOVBstore [i-2] {s} p (SHRQconst [16] w) + x4:(MOVBstore [i-3] {s} p (SHRQconst [24] w) + x3:(MOVBstore [i-4] {s} p (SHRQconst [32] w) + x2:(MOVBstore [i-5] {s} p (SHRQconst [40] w) + x1:(MOVBstore [i-6] {s} p (SHRQconst [48] w) + x0:(MOVBstore [i-7] {s} p (SHRQconst [56] w) mem)))))))) + && x0.Uses == 1 + && x1.Uses == 1 + && x2.Uses == 1 + && x3.Uses == 1 + && x4.Uses == 1 + && x5.Uses == 1 + && x6.Uses == 1 + && clobber(x0, x1, x2, x3, x4, x5, x6) + => (MOVQstore [i-7] {s} p (BSWAPQ <typ.UInt64> w) mem) +(MOVBstore [i] {s} p7 w + x6:(MOVBstore [i] {s} p6 (SHRQconst [8] w) + x5:(MOVBstore [i] {s} p5 (SHRQconst [16] w) + x4:(MOVBstore [i] {s} p4 (SHRQconst [24] w) + x3:(MOVBstore [i] {s} p3 (SHRQconst [32] w) + x2:(MOVBstore [i] {s} p2 (SHRQconst [40] w) + x1:(MOVBstore [i] {s} p1 (SHRQconst [48] w) + x0:(MOVBstore [i] {s} p0 (SHRQconst [56] w) mem)))))))) + && x0.Uses == 1 + && x1.Uses == 1 + && x2.Uses == 1 + && x3.Uses == 1 + && x4.Uses == 1 + && x5.Uses == 1 + && x6.Uses == 1 + && sequentialAddresses(p0, p1, 1) + && sequentialAddresses(p1, p2, 1) + && sequentialAddresses(p2, p3, 1) + && sequentialAddresses(p3, p4, 1) + && sequentialAddresses(p4, p5, 1) + && sequentialAddresses(p5, p6, 1) + && sequentialAddresses(p6, p7, 1) + && clobber(x0, x1, x2, x3, x4, x5, x6) + => (MOVQstore [i] {s} p0 (BSWAPQ <typ.UInt64> w) mem) + +// Combine constant stores into larger (unaligned) stores. +(MOVBstoreconst [c] {s} p1 x:(MOVBstoreconst [a] {s} p0 mem)) + && x.Uses == 1 + && sequentialAddresses(p0, p1, int64(a.Off()+1-c.Off())) + && clobber(x) + => (MOVWstoreconst [makeValAndOff(a.Val()&0xff | c.Val()<<8, a.Off())] {s} p0 mem) +(MOVBstoreconst [a] {s} p0 x:(MOVBstoreconst [c] {s} p1 mem)) + && x.Uses == 1 + && sequentialAddresses(p0, p1, int64(a.Off()+1-c.Off())) + && clobber(x) + => (MOVWstoreconst [makeValAndOff(a.Val()&0xff | c.Val()<<8, a.Off())] {s} p0 mem) +(MOVWstoreconst [c] {s} p1 x:(MOVWstoreconst [a] {s} p0 mem)) + && x.Uses == 1 + && sequentialAddresses(p0, p1, int64(a.Off()+2-c.Off())) + && clobber(x) + => (MOVLstoreconst [makeValAndOff(a.Val()&0xffff | c.Val()<<16, a.Off())] {s} p0 mem) +(MOVWstoreconst [a] {s} p0 x:(MOVWstoreconst [c] {s} p1 mem)) + && x.Uses == 1 + && sequentialAddresses(p0, p1, int64(a.Off()+2-c.Off())) + && clobber(x) + => (MOVLstoreconst [makeValAndOff(a.Val()&0xffff | c.Val()<<16, a.Off())] {s} p0 mem) +(MOVLstoreconst [c] {s} p1 x:(MOVLstoreconst [a] {s} p0 mem)) + && x.Uses == 1 + && sequentialAddresses(p0, p1, int64(a.Off()+4-c.Off())) + && clobber(x) + => (MOVQstore [a.Off()] {s} p0 (MOVQconst [a.Val64()&0xffffffff | c.Val64()<<32]) mem) +(MOVLstoreconst [a] {s} p0 x:(MOVLstoreconst [c] {s} p1 mem)) + && x.Uses == 1 + && sequentialAddresses(p0, p1, int64(a.Off()+4-c.Off())) + && clobber(x) + => (MOVQstore [a.Off()] {s} p0 (MOVQconst [a.Val64()&0xffffffff | c.Val64()<<32]) mem) +(MOVQstoreconst [c] {s} p1 x:(MOVQstoreconst [a] {s} p0 mem)) + && config.useSSE + && x.Uses == 1 + && sequentialAddresses(p0, p1, int64(a.Off()+8-c.Off())) + && a.Val() == 0 + && c.Val() == 0 + && clobber(x) + => (MOVOstoreconst [makeValAndOff(0,a.Off())] {s} p0 mem) +(MOVQstoreconst [a] {s} p0 x:(MOVQstoreconst [c] {s} p1 mem)) + && config.useSSE + && x.Uses == 1 + && sequentialAddresses(p0, p1, int64(a.Off()+8-c.Off())) + && a.Val() == 0 + && c.Val() == 0 + && clobber(x) + => (MOVOstoreconst [makeValAndOff(0,a.Off())] {s} p0 mem) + +// Combine stores into larger (unaligned) stores. Little endian. +(MOVBstore [i] {s} p (SHR(W|L|Q)const [8] w) x:(MOVBstore [i-1] {s} p w mem)) + && x.Uses == 1 + && clobber(x) + => (MOVWstore [i-1] {s} p w mem) +(MOVBstore [i] {s} p w x:(MOVBstore [i+1] {s} p (SHR(W|L|Q)const [8] w) mem)) + && x.Uses == 1 + && clobber(x) + => (MOVWstore [i] {s} p w mem) +(MOVBstore [i] {s} p (SHR(L|Q)const [j] w) x:(MOVBstore [i-1] {s} p w0:(SHR(L|Q)const [j-8] w) mem)) + && x.Uses == 1 + && clobber(x) + => (MOVWstore [i-1] {s} p w0 mem) +(MOVBstore [i] {s} p1 (SHR(W|L|Q)const [8] w) x:(MOVBstore [i] {s} p0 w mem)) + && x.Uses == 1 + && sequentialAddresses(p0, p1, 1) + && clobber(x) + => (MOVWstore [i] {s} p0 w mem) +(MOVBstore [i] {s} p0 w x:(MOVBstore [i] {s} p1 (SHR(W|L|Q)const [8] w) mem)) + && x.Uses == 1 + && sequentialAddresses(p0, p1, 1) + && clobber(x) + => (MOVWstore [i] {s} p0 w mem) +(MOVBstore [i] {s} p1 (SHR(L|Q)const [j] w) x:(MOVBstore [i] {s} p0 w0:(SHR(L|Q)const [j-8] w) mem)) + && x.Uses == 1 + && sequentialAddresses(p0, p1, 1) + && clobber(x) + => (MOVWstore [i] {s} p0 w0 mem) + +(MOVWstore [i] {s} p (SHR(L|Q)const [16] w) x:(MOVWstore [i-2] {s} p w mem)) + && x.Uses == 1 + && clobber(x) + => (MOVLstore [i-2] {s} p w mem) +(MOVWstore [i] {s} p (SHR(L|Q)const [j] w) x:(MOVWstore [i-2] {s} p w0:(SHR(L|Q)const [j-16] w) mem)) + && x.Uses == 1 + && clobber(x) + => (MOVLstore [i-2] {s} p w0 mem) +(MOVWstore [i] {s} p1 (SHR(L|Q)const [16] w) x:(MOVWstore [i] {s} p0 w mem)) + && x.Uses == 1 + && sequentialAddresses(p0, p1, 2) + && clobber(x) + => (MOVLstore [i] {s} p0 w mem) +(MOVWstore [i] {s} p1 (SHR(L|Q)const [j] w) x:(MOVWstore [i] {s} p0 w0:(SHR(L|Q)const [j-16] w) mem)) + && x.Uses == 1 + && sequentialAddresses(p0, p1, 2) + && clobber(x) + => (MOVLstore [i] {s} p0 w0 mem) + +(MOVLstore [i] {s} p (SHRQconst [32] w) x:(MOVLstore [i-4] {s} p w mem)) + && x.Uses == 1 + && clobber(x) + => (MOVQstore [i-4] {s} p w mem) +(MOVLstore [i] {s} p (SHRQconst [j] w) x:(MOVLstore [i-4] {s} p w0:(SHRQconst [j-32] w) mem)) + && x.Uses == 1 + && clobber(x) + => (MOVQstore [i-4] {s} p w0 mem) +(MOVLstore [i] {s} p1 (SHRQconst [32] w) x:(MOVLstore [i] {s} p0 w mem)) + && x.Uses == 1 + && sequentialAddresses(p0, p1, 4) + && clobber(x) + => (MOVQstore [i] {s} p0 w mem) +(MOVLstore [i] {s} p1 (SHRQconst [j] w) x:(MOVLstore [i] {s} p0 w0:(SHRQconst [j-32] w) mem)) + && x.Uses == 1 + && sequentialAddresses(p0, p1, 4) + && clobber(x) + => (MOVQstore [i] {s} p0 w0 mem) + +(MOVBstore [c3] {s} p3 (SHRQconst [56] w) + x1:(MOVWstore [c2] {s} p2 (SHRQconst [40] w) + x2:(MOVLstore [c1] {s} p1 (SHRQconst [8] w) + x3:(MOVBstore [c0] {s} p0 w mem)))) + && x1.Uses == 1 + && x2.Uses == 1 + && x3.Uses == 1 + && sequentialAddresses(p0, p1, int64(1 + c0 - c1)) + && sequentialAddresses(p0, p2, int64(5 + c0 - c2)) + && sequentialAddresses(p0, p3, int64(7 + c0 - c3)) + && clobber(x1, x2, x3) + => (MOVQstore [c0] {s} p0 w mem) + +(MOVBstore [i] {s} p + x1:(MOVBload [j] {s2} p2 mem) + mem2:(MOVBstore [i-1] {s} p + x2:(MOVBload [j-1] {s2} p2 mem) mem)) + && x1.Uses == 1 + && x2.Uses == 1 + && mem2.Uses == 1 + && clobber(x1, x2, mem2) + => (MOVWstore [i-1] {s} p (MOVWload [j-1] {s2} p2 mem) mem) + +(MOVWstore [i] {s} p + x1:(MOVWload [j] {s2} p2 mem) + mem2:(MOVWstore [i-2] {s} p + x2:(MOVWload [j-2] {s2} p2 mem) mem)) + && x1.Uses == 1 + && x2.Uses == 1 + && mem2.Uses == 1 + && clobber(x1, x2, mem2) + => (MOVLstore [i-2] {s} p (MOVLload [j-2] {s2} p2 mem) mem) + +(MOVLstore [i] {s} p + x1:(MOVLload [j] {s2} p2 mem) + mem2:(MOVLstore [i-4] {s} p + x2:(MOVLload [j-4] {s2} p2 mem) mem)) + && x1.Uses == 1 + && x2.Uses == 1 + && mem2.Uses == 1 + && clobber(x1, x2, mem2) + => (MOVQstore [i-4] {s} p (MOVQload [j-4] {s2} p2 mem) mem) + +// Merge load and op +// TODO: add indexed variants? +((ADD|SUB|AND|OR|XOR)Q x l:(MOVQload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|AND|OR|XOR)Qload x [off] {sym} ptr mem) +((ADD|SUB|AND|OR|XOR)L x l:(MOVLload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|AND|OR|XOR)Lload x [off] {sym} ptr mem) +((ADD|SUB|MUL|DIV)SD x l:(MOVSDload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|MUL|DIV)SDload x [off] {sym} ptr mem) +((ADD|SUB|MUL|DIV)SS x l:(MOVSSload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|MUL|DIV)SSload x [off] {sym} ptr mem) +(MOVLstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Lload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) => ((ADD|AND|OR|XOR)Lmodify [off] {sym} ptr x mem) +(MOVLstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)L l:(MOVLload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) => + ((ADD|SUB|AND|OR|XOR)Lmodify [off] {sym} ptr x mem) +(MOVQstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Qload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) => ((ADD|AND|OR|XOR)Qmodify [off] {sym} ptr x mem) +(MOVQstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)Q l:(MOVQload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) => + ((ADD|SUB|AND|OR|XOR)Qmodify [off] {sym} ptr x mem) + +// Merge ADDQconst and LEAQ into atomic loads. +(MOV(Q|L|B)atomicload [off1] {sym} (ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) => + (MOV(Q|L|B)atomicload [off1+off2] {sym} ptr mem) +(MOV(Q|L|B)atomicload [off1] {sym1} (LEAQ [off2] {sym2} ptr) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (MOV(Q|L|B)atomicload [off1+off2] {mergeSym(sym1, sym2)} ptr mem) + +// Merge ADDQconst and LEAQ into atomic stores. +(XCHGQ [off1] {sym} val (ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) => + (XCHGQ [off1+off2] {sym} val ptr mem) +(XCHGQ [off1] {sym1} val (LEAQ [off2] {sym2} ptr) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && ptr.Op != OpSB => + (XCHGQ [off1+off2] {mergeSym(sym1,sym2)} val ptr mem) +(XCHGL [off1] {sym} val (ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) => + (XCHGL [off1+off2] {sym} val ptr mem) +(XCHGL [off1] {sym1} val (LEAQ [off2] {sym2} ptr) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && ptr.Op != OpSB => + (XCHGL [off1+off2] {mergeSym(sym1,sym2)} val ptr mem) + +// Merge ADDQconst into atomic adds. +// TODO: merging LEAQ doesn't work, assembler doesn't like the resulting instructions. +(XADDQlock [off1] {sym} val (ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) => + (XADDQlock [off1+off2] {sym} val ptr mem) +(XADDLlock [off1] {sym} val (ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) => + (XADDLlock [off1+off2] {sym} val ptr mem) + +// Merge ADDQconst into atomic compare and swaps. +// TODO: merging LEAQ doesn't work, assembler doesn't like the resulting instructions. +(CMPXCHGQlock [off1] {sym} (ADDQconst [off2] ptr) old new_ mem) && is32Bit(int64(off1)+int64(off2)) => + (CMPXCHGQlock [off1+off2] {sym} ptr old new_ mem) +(CMPXCHGLlock [off1] {sym} (ADDQconst [off2] ptr) old new_ mem) && is32Bit(int64(off1)+int64(off2)) => + (CMPXCHGLlock [off1+off2] {sym} ptr old new_ mem) + +// We don't need the conditional move if we know the arg of BSF is not zero. +(CMOVQEQ x _ (Select1 (BS(F|R)Q (ORQconst [c] _)))) && c != 0 => x +// Extension is unnecessary for trailing zeros. +(BSFQ (ORQconst <t> [1<<8] (MOVBQZX x))) => (BSFQ (ORQconst <t> [1<<8] x)) +(BSFQ (ORQconst <t> [1<<16] (MOVWQZX x))) => (BSFQ (ORQconst <t> [1<<16] x)) + +// Redundant sign/zero extensions +// Note: see issue 21963. We have to make sure we use the right type on +// the resulting extension (the outer type, not the inner type). +(MOVLQSX (MOVLQSX x)) => (MOVLQSX x) +(MOVLQSX (MOVWQSX x)) => (MOVWQSX x) +(MOVLQSX (MOVBQSX x)) => (MOVBQSX x) +(MOVWQSX (MOVWQSX x)) => (MOVWQSX x) +(MOVWQSX (MOVBQSX x)) => (MOVBQSX x) +(MOVBQSX (MOVBQSX x)) => (MOVBQSX x) +(MOVLQZX (MOVLQZX x)) => (MOVLQZX x) +(MOVLQZX (MOVWQZX x)) => (MOVWQZX x) +(MOVLQZX (MOVBQZX x)) => (MOVBQZX x) +(MOVWQZX (MOVWQZX x)) => (MOVWQZX x) +(MOVWQZX (MOVBQZX x)) => (MOVBQZX x) +(MOVBQZX (MOVBQZX x)) => (MOVBQZX x) + +(MOVQstore [off] {sym} ptr a:((ADD|AND|OR|XOR)Qconst [c] l:(MOVQload [off] {sym} ptr2 mem)) mem) + && isSamePtr(ptr, ptr2) && a.Uses == 1 && l.Uses == 1 && clobber(l, a) => + ((ADD|AND|OR|XOR)Qconstmodify {sym} [makeValAndOff(int32(c),off)] ptr mem) +(MOVLstore [off] {sym} ptr a:((ADD|AND|OR|XOR)Lconst [c] l:(MOVLload [off] {sym} ptr2 mem)) mem) + && isSamePtr(ptr, ptr2) && a.Uses == 1 && l.Uses == 1 && clobber(l, a) => + ((ADD|AND|OR|XOR)Lconstmodify {sym} [makeValAndOff(int32(c),off)] ptr mem) + +// float <-> int register moves, with no conversion. +// These come up when compiling math.{Float{32,64}bits,Float{32,64}frombits}. +(MOVQload [off] {sym} ptr (MOVSDstore [off] {sym} ptr val _)) => (MOVQf2i val) +(MOVLload [off] {sym} ptr (MOVSSstore [off] {sym} ptr val _)) => (MOVLf2i val) +(MOVSDload [off] {sym} ptr (MOVQstore [off] {sym} ptr val _)) => (MOVQi2f val) +(MOVSSload [off] {sym} ptr (MOVLstore [off] {sym} ptr val _)) => (MOVLi2f val) + +// Other load-like ops. +(ADDQload x [off] {sym} ptr (MOVSDstore [off] {sym} ptr y _)) => (ADDQ x (MOVQf2i y)) +(ADDLload x [off] {sym} ptr (MOVSSstore [off] {sym} ptr y _)) => (ADDL x (MOVLf2i y)) +(SUBQload x [off] {sym} ptr (MOVSDstore [off] {sym} ptr y _)) => (SUBQ x (MOVQf2i y)) +(SUBLload x [off] {sym} ptr (MOVSSstore [off] {sym} ptr y _)) => (SUBL x (MOVLf2i y)) +(ANDQload x [off] {sym} ptr (MOVSDstore [off] {sym} ptr y _)) => (ANDQ x (MOVQf2i y)) +(ANDLload x [off] {sym} ptr (MOVSSstore [off] {sym} ptr y _)) => (ANDL x (MOVLf2i y)) +( ORQload x [off] {sym} ptr (MOVSDstore [off] {sym} ptr y _)) => ( ORQ x (MOVQf2i y)) +( ORLload x [off] {sym} ptr (MOVSSstore [off] {sym} ptr y _)) => ( ORL x (MOVLf2i y)) +(XORQload x [off] {sym} ptr (MOVSDstore [off] {sym} ptr y _)) => (XORQ x (MOVQf2i y)) +(XORLload x [off] {sym} ptr (MOVSSstore [off] {sym} ptr y _)) => (XORL x (MOVLf2i y)) + +(ADDSDload x [off] {sym} ptr (MOVQstore [off] {sym} ptr y _)) => (ADDSD x (MOVQi2f y)) +(ADDSSload x [off] {sym} ptr (MOVLstore [off] {sym} ptr y _)) => (ADDSS x (MOVLi2f y)) +(SUBSDload x [off] {sym} ptr (MOVQstore [off] {sym} ptr y _)) => (SUBSD x (MOVQi2f y)) +(SUBSSload x [off] {sym} ptr (MOVLstore [off] {sym} ptr y _)) => (SUBSS x (MOVLi2f y)) +(MULSDload x [off] {sym} ptr (MOVQstore [off] {sym} ptr y _)) => (MULSD x (MOVQi2f y)) +(MULSSload x [off] {sym} ptr (MOVLstore [off] {sym} ptr y _)) => (MULSS x (MOVLi2f y)) + +// Redirect stores to use the other register set. +(MOVQstore [off] {sym} ptr (MOVQf2i val) mem) => (MOVSDstore [off] {sym} ptr val mem) +(MOVLstore [off] {sym} ptr (MOVLf2i val) mem) => (MOVSSstore [off] {sym} ptr val mem) +(MOVSDstore [off] {sym} ptr (MOVQi2f val) mem) => (MOVQstore [off] {sym} ptr val mem) +(MOVSSstore [off] {sym} ptr (MOVLi2f val) mem) => (MOVLstore [off] {sym} ptr val mem) + +// Load args directly into the register class where it will be used. +// We do this by just modifying the type of the Arg. +(MOVQf2i <t> (Arg <u> [off] {sym})) && t.Size() == u.Size() => @b.Func.Entry (Arg <t> [off] {sym}) +(MOVLf2i <t> (Arg <u> [off] {sym})) && t.Size() == u.Size() => @b.Func.Entry (Arg <t> [off] {sym}) +(MOVQi2f <t> (Arg <u> [off] {sym})) && t.Size() == u.Size() => @b.Func.Entry (Arg <t> [off] {sym}) +(MOVLi2f <t> (Arg <u> [off] {sym})) && t.Size() == u.Size() => @b.Func.Entry (Arg <t> [off] {sym}) + +// LEAQ is rematerializeable, so this helps to avoid register spill. +// See issue 22947 for details +(ADD(Q|L)const [off] x:(SP)) => (LEA(Q|L) [off] x) + +// HMULx is commutative, but its first argument must go in AX. +// If possible, put a rematerializeable value in the first argument slot, +// to reduce the odds that another value will be have to spilled +// specifically to free up AX. +(HMUL(Q|L) x y) && !x.rematerializeable() && y.rematerializeable() => (HMUL(Q|L) y x) +(HMUL(Q|L)U x y) && !x.rematerializeable() && y.rematerializeable() => (HMUL(Q|L)U y x) + +// Fold loads into compares +// Note: these may be undone by the flagalloc pass. +(CMP(Q|L|W|B) l:(MOV(Q|L|W|B)load {sym} [off] ptr mem) x) && canMergeLoad(v, l) && clobber(l) => (CMP(Q|L|W|B)load {sym} [off] ptr x mem) +(CMP(Q|L|W|B) x l:(MOV(Q|L|W|B)load {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (InvertFlags (CMP(Q|L|W|B)load {sym} [off] ptr x mem)) + +(CMP(Q|L)const l:(MOV(Q|L)load {sym} [off] ptr mem) [c]) + && l.Uses == 1 + && clobber(l) => +@l.Block (CMP(Q|L)constload {sym} [makeValAndOff(c,off)] ptr mem) +(CMP(W|B)const l:(MOV(W|B)load {sym} [off] ptr mem) [c]) + && l.Uses == 1 + && clobber(l) => +@l.Block (CMP(W|B)constload {sym} [makeValAndOff(int32(c),off)] ptr mem) + +(CMPQload {sym} [off] ptr (MOVQconst [c]) mem) && validVal(c) => (CMPQconstload {sym} [makeValAndOff(int32(c),off)] ptr mem) +(CMPLload {sym} [off] ptr (MOVLconst [c]) mem) => (CMPLconstload {sym} [makeValAndOff(c,off)] ptr mem) +(CMPWload {sym} [off] ptr (MOVLconst [c]) mem) => (CMPWconstload {sym} [makeValAndOff(int32(int16(c)),off)] ptr mem) +(CMPBload {sym} [off] ptr (MOVLconst [c]) mem) => (CMPBconstload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) + +(TEST(Q|L|W|B) l:(MOV(Q|L|W|B)load {sym} [off] ptr mem) l2) + && l == l2 + && l.Uses == 2 + && clobber(l) => + @l.Block (CMP(Q|L|W|B)constload {sym} [makeValAndOff(0, off)] ptr mem) + +// Convert ANDload to MOVload when we can do the AND in a containing TEST op. +// Only do when it's within the same block, so we don't have flags live across basic block boundaries. +// See issue 44228. +(TEST(Q|L) a:(AND(Q|L)load [off] {sym} x ptr mem) a) && a.Uses == 2 && a.Block == v.Block && clobber(a) => (TEST(Q|L) (MOV(Q|L)load <a.Type> [off] {sym} ptr mem) x) + +(MOVBload [off] {sym} (SB) _) && symIsRO(sym) => (MOVLconst [int32(read8(sym, int64(off)))]) +(MOVWload [off] {sym} (SB) _) && symIsRO(sym) => (MOVLconst [int32(read16(sym, int64(off), config.ctxt.Arch.ByteOrder))]) +(MOVLload [off] {sym} (SB) _) && symIsRO(sym) => (MOVQconst [int64(read32(sym, int64(off), config.ctxt.Arch.ByteOrder))]) +(MOVQload [off] {sym} (SB) _) && symIsRO(sym) => (MOVQconst [int64(read64(sym, int64(off), config.ctxt.Arch.ByteOrder))]) +(MOVOstore [dstOff] {dstSym} ptr (MOVOload [srcOff] {srcSym} (SB) _) mem) && symIsRO(srcSym) => + (MOVQstore [dstOff+8] {dstSym} ptr (MOVQconst [int64(read64(srcSym, int64(srcOff)+8, config.ctxt.Arch.ByteOrder))]) + (MOVQstore [dstOff] {dstSym} ptr (MOVQconst [int64(read64(srcSym, int64(srcOff), config.ctxt.Arch.ByteOrder))]) mem)) + +// Arch-specific inlining for small or disjoint runtime.memmove +// Match post-lowering calls, memory version. +(SelectN [0] call:(CALLstatic {sym} s1:(MOVQstoreconst _ [sc] s2:(MOVQstore _ src s3:(MOVQstore _ dst mem))))) + && sc.Val64() >= 0 + && isSameCall(sym, "runtime.memmove") + && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1 + && isInlinableMemmove(dst, src, sc.Val64(), config) + && clobber(s1, s2, s3, call) + => (Move [sc.Val64()] dst src mem) + +// Match post-lowering calls, register version. +(SelectN [0] call:(CALLstatic {sym} dst src (MOVQconst [sz]) mem)) + && sz >= 0 + && isSameCall(sym, "runtime.memmove") + && call.Uses == 1 + && isInlinableMemmove(dst, src, sz, config) + && clobber(call) + => (Move [sz] dst src mem) + +// Prefetch instructions +(PrefetchCache ...) => (PrefetchT0 ...) +(PrefetchCacheStreamed ...) => (PrefetchNTA ...) + +// CPUID feature: BMI1. +(AND(Q|L) x (NOT(Q|L) y)) && buildcfg.GOAMD64 >= 3 => (ANDN(Q|L) x y) +(AND(Q|L) x (NEG(Q|L) x)) && buildcfg.GOAMD64 >= 3 => (BLSI(Q|L) x) +(XOR(Q|L) x (ADD(Q|L)const [-1] x)) && buildcfg.GOAMD64 >= 3 => (BLSMSK(Q|L) x) +(AND(Q|L) x (ADD(Q|L)const [-1] x)) && buildcfg.GOAMD64 >= 3 => (BLSR(Q|L) x) + +(BSWAP(Q|L) (BSWAP(Q|L) p)) => p + +// CPUID feature: MOVBE. +(MOV(Q|L)store [i] {s} p x:(BSWAP(Q|L) w) mem) && x.Uses == 1 && buildcfg.GOAMD64 >= 3 => (MOVBE(Q|L)store [i] {s} p w mem) +(MOVBE(Q|L)store [i] {s} p x:(BSWAP(Q|L) w) mem) && x.Uses == 1 => (MOV(Q|L)store [i] {s} p w mem) +(BSWAP(Q|L) x:(MOV(Q|L)load [i] {s} p mem)) && x.Uses == 1 && buildcfg.GOAMD64 >= 3 => @x.Block (MOVBE(Q|L)load [i] {s} p mem) +(BSWAP(Q|L) x:(MOVBE(Q|L)load [i] {s} p mem)) && x.Uses == 1 => @x.Block (MOV(Q|L)load [i] {s} p mem) +(MOVWstore [i] {s} p x:(ROLWconst [8] w) mem) && x.Uses == 1 && buildcfg.GOAMD64 >= 3 => (MOVBEWstore [i] {s} p w mem) +(MOVBEWstore [i] {s} p x:(ROLWconst [8] w) mem) && x.Uses == 1 => (MOVWstore [i] {s} p w mem) + +(ORQ x0:(MOVBELload [i0] {s} p mem) + sh:(SHLQconst [32] x1:(MOVBELload [i1] {s} p mem))) + && i0 == i1+4 + && x0.Uses == 1 + && x1.Uses == 1 + && sh.Uses == 1 + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, sh) + => @mergePoint(b,x0,x1) (MOVBEQload [i1] {s} p mem) + +(ORQ x0:(MOVBELload [i] {s} p0 mem) + sh:(SHLQconst [32] x1:(MOVBELload [i] {s} p1 mem))) + && x0.Uses == 1 + && x1.Uses == 1 + && sh.Uses == 1 + && sequentialAddresses(p1, p0, 4) + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, sh) + => @mergePoint(b,x0,x1) (MOVBEQload [i] {s} p1 mem) + +(SAR(Q|L) l:(MOV(Q|L)load [off] {sym} ptr mem) x) && buildcfg.GOAMD64 >= 3 && canMergeLoad(v, l) && clobber(l) => (SARX(Q|L)load [off] {sym} ptr x mem) +(SHL(Q|L) l:(MOV(Q|L)load [off] {sym} ptr mem) x) && buildcfg.GOAMD64 >= 3 && canMergeLoad(v, l) && clobber(l) => (SHLX(Q|L)load [off] {sym} ptr x mem) +(SHR(Q|L) l:(MOV(Q|L)load [off] {sym} ptr mem) x) && buildcfg.GOAMD64 >= 3 && canMergeLoad(v, l) && clobber(l) => (SHRX(Q|L)load [off] {sym} ptr x mem) + +((SHL|SHR|SAR)XQload [off] {sym} ptr (MOVQconst [c]) mem) => ((SHL|SHR|SAR)Qconst [int8(c&63)] (MOVQload [off] {sym} ptr mem)) +((SHL|SHR|SAR)XQload [off] {sym} ptr (MOVLconst [c]) mem) => ((SHL|SHR|SAR)Qconst [int8(c&63)] (MOVQload [off] {sym} ptr mem)) +((SHL|SHR|SAR)XLload [off] {sym} ptr (MOVLconst [c]) mem) => ((SHL|SHR|SAR)Lconst [int8(c&31)] (MOVLload [off] {sym} ptr mem)) diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go new file mode 100644 index 0000000..cbe1f5b --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go @@ -0,0 +1,1133 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import "strings" + +// Notes: +// - Integer types live in the low portion of registers. Upper portions are junk. +// - Boolean types use the low-order byte of a register. 0=false, 1=true. +// Upper bytes are junk. +// - Floating-point types live in the low natural slot of an sse2 register. +// Unused portions are junk. +// - We do not use AH,BH,CH,DH registers. +// - When doing sub-register operations, we try to write the whole +// destination register to avoid a partial-register write. +// - Unused portions of AuxInt (or the Val portion of ValAndOff) are +// filled by sign-extending the used portion. Users of AuxInt which interpret +// AuxInt as unsigned (e.g. shifts) must be careful. +// - All SymOff opcodes require their offset to fit in an int32. + +// Suffixes encode the bit width of various instructions. +// Q (quad word) = 64 bit +// L (long word) = 32 bit +// W (word) = 16 bit +// B (byte) = 8 bit +// D (double) = 64 bit float +// S (single) = 32 bit float + +// copied from ../../amd64/reg.go +var regNamesAMD64 = []string{ + "AX", + "CX", + "DX", + "BX", + "SP", + "BP", + "SI", + "DI", + "R8", + "R9", + "R10", + "R11", + "R12", + "R13", + "g", // a.k.a. R14 + "R15", + "X0", + "X1", + "X2", + "X3", + "X4", + "X5", + "X6", + "X7", + "X8", + "X9", + "X10", + "X11", + "X12", + "X13", + "X14", + "X15", // constant 0 in ABIInternal + + // If you add registers, update asyncPreempt in runtime + + // pseudo-registers + "SB", +} + +func init() { + // Make map from reg names to reg integers. + if len(regNamesAMD64) > 64 { + panic("too many registers") + } + num := map[string]int{} + for i, name := range regNamesAMD64 { + num[name] = i + } + buildReg := func(s string) regMask { + m := regMask(0) + for _, r := range strings.Split(s, " ") { + if n, ok := num[r]; ok { + m |= regMask(1) << uint(n) + continue + } + panic("register " + r + " not found") + } + return m + } + + // Common individual register masks + var ( + ax = buildReg("AX") + cx = buildReg("CX") + dx = buildReg("DX") + bx = buildReg("BX") + gp = buildReg("AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15") + g = buildReg("g") + fp = buildReg("X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14") + x15 = buildReg("X15") + gpsp = gp | buildReg("SP") + gpspsb = gpsp | buildReg("SB") + gpspsbg = gpspsb | g + callerSave = gp | fp | g // runtime.setg (and anything calling it) may clobber g + ) + // Common slices of register masks + var ( + gponly = []regMask{gp} + fponly = []regMask{fp} + ) + + // Common regInfo + var ( + gp01 = regInfo{inputs: nil, outputs: gponly} + gp11 = regInfo{inputs: []regMask{gp}, outputs: gponly} + gp11sp = regInfo{inputs: []regMask{gpsp}, outputs: gponly} + gp11sb = regInfo{inputs: []regMask{gpspsbg}, outputs: gponly} + gp21 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly} + gp21sp = regInfo{inputs: []regMask{gpsp, gp}, outputs: gponly} + gp21sb = regInfo{inputs: []regMask{gpspsbg, gpsp}, outputs: gponly} + gp21shift = regInfo{inputs: []regMask{gp, cx}, outputs: []regMask{gp}} + gp31shift = regInfo{inputs: []regMask{gp, gp, cx}, outputs: []regMask{gp}} + gp11div = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{ax, dx}} + gp21hmul = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx}, clobbers: ax} + gp21flags = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp, 0}} + gp2flags1flags = regInfo{inputs: []regMask{gp, gp, 0}, outputs: []regMask{gp, 0}} + + gp2flags = regInfo{inputs: []regMask{gpsp, gpsp}} + gp1flags = regInfo{inputs: []regMask{gpsp}} + gp0flagsLoad = regInfo{inputs: []regMask{gpspsbg, 0}} + gp1flagsLoad = regInfo{inputs: []regMask{gpspsbg, gpsp, 0}} + gp2flagsLoad = regInfo{inputs: []regMask{gpspsbg, gpsp, gpsp, 0}} + flagsgp = regInfo{inputs: nil, outputs: gponly} + + gp11flags = regInfo{inputs: []regMask{gp}, outputs: []regMask{gp, 0}} + gp1flags1flags = regInfo{inputs: []regMask{gp, 0}, outputs: []regMask{gp, 0}} + + readflags = regInfo{inputs: nil, outputs: gponly} + + gpload = regInfo{inputs: []regMask{gpspsbg, 0}, outputs: gponly} + gp21load = regInfo{inputs: []regMask{gp, gpspsbg, 0}, outputs: gponly} + gploadidx = regInfo{inputs: []regMask{gpspsbg, gpsp, 0}, outputs: gponly} + gp21loadidx = regInfo{inputs: []regMask{gp, gpspsbg, gpsp, 0}, outputs: gponly} + gp21shxload = regInfo{inputs: []regMask{gpspsbg, gp, 0}, outputs: gponly} + gp21shxloadidx = regInfo{inputs: []regMask{gpspsbg, gpsp, gp, 0}, outputs: gponly} + + gpstore = regInfo{inputs: []regMask{gpspsbg, gpsp, 0}} + gpstoreconst = regInfo{inputs: []regMask{gpspsbg, 0}} + gpstoreidx = regInfo{inputs: []regMask{gpspsbg, gpsp, gpsp, 0}} + gpstoreconstidx = regInfo{inputs: []regMask{gpspsbg, gpsp, 0}} + gpstorexchg = regInfo{inputs: []regMask{gp, gpspsbg, 0}, outputs: []regMask{gp}} + cmpxchg = regInfo{inputs: []regMask{gp, ax, gp, 0}, outputs: []regMask{gp, 0}, clobbers: ax} + + fp01 = regInfo{inputs: nil, outputs: fponly} + fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: fponly} + fp31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: fponly} + fp21load = regInfo{inputs: []regMask{fp, gpspsbg, 0}, outputs: fponly} + fp21loadidx = regInfo{inputs: []regMask{fp, gpspsbg, gpspsb, 0}, outputs: fponly} + fpgp = regInfo{inputs: fponly, outputs: gponly} + gpfp = regInfo{inputs: gponly, outputs: fponly} + fp11 = regInfo{inputs: fponly, outputs: fponly} + fp2flags = regInfo{inputs: []regMask{fp, fp}} + + fpload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: fponly} + fploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: fponly} + + fpstore = regInfo{inputs: []regMask{gpspsb, fp, 0}} + fpstoreidx = regInfo{inputs: []regMask{gpspsb, gpsp, fp, 0}} + + prefreg = regInfo{inputs: []regMask{gpspsbg}} + ) + + var AMD64ops = []opData{ + // {ADD,SUB,MUL,DIV}Sx: floating-point arithmetic + // x==S for float32, x==D for float64 + // computes arg0 OP arg1 + {name: "ADDSS", argLength: 2, reg: fp21, asm: "ADDSS", commutative: true, resultInArg0: true}, + {name: "ADDSD", argLength: 2, reg: fp21, asm: "ADDSD", commutative: true, resultInArg0: true}, + {name: "SUBSS", argLength: 2, reg: fp21, asm: "SUBSS", resultInArg0: true}, + {name: "SUBSD", argLength: 2, reg: fp21, asm: "SUBSD", resultInArg0: true}, + {name: "MULSS", argLength: 2, reg: fp21, asm: "MULSS", commutative: true, resultInArg0: true}, + {name: "MULSD", argLength: 2, reg: fp21, asm: "MULSD", commutative: true, resultInArg0: true}, + {name: "DIVSS", argLength: 2, reg: fp21, asm: "DIVSS", resultInArg0: true}, + {name: "DIVSD", argLength: 2, reg: fp21, asm: "DIVSD", resultInArg0: true}, + + // MOVSxload: floating-point loads + // x==S for float32, x==D for float64 + // load from arg0+auxint+aux, arg1 = mem + {name: "MOVSSload", argLength: 2, reg: fpload, asm: "MOVSS", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, + {name: "MOVSDload", argLength: 2, reg: fpload, asm: "MOVSD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, + + // MOVSxconst: floatint-point constants + // x==S for float32, x==D for float64 + {name: "MOVSSconst", reg: fp01, asm: "MOVSS", aux: "Float32", rematerializeable: true}, + {name: "MOVSDconst", reg: fp01, asm: "MOVSD", aux: "Float64", rematerializeable: true}, + + // MOVSxloadidx: floating-point indexed loads + // x==S for float32, x==D for float64 + // load from arg0 + scale*arg1+auxint+aux, arg2 = mem + {name: "MOVSSloadidx1", argLength: 3, reg: fploadidx, asm: "MOVSS", scale: 1, aux: "SymOff", symEffect: "Read"}, + {name: "MOVSSloadidx4", argLength: 3, reg: fploadidx, asm: "MOVSS", scale: 4, aux: "SymOff", symEffect: "Read"}, + {name: "MOVSDloadidx1", argLength: 3, reg: fploadidx, asm: "MOVSD", scale: 1, aux: "SymOff", symEffect: "Read"}, + {name: "MOVSDloadidx8", argLength: 3, reg: fploadidx, asm: "MOVSD", scale: 8, aux: "SymOff", symEffect: "Read"}, + + // MOVSxstore: floating-point stores + // x==S for float32, x==D for float64 + // does *(arg0+auxint+aux) = arg1, arg2 = mem + {name: "MOVSSstore", argLength: 3, reg: fpstore, asm: "MOVSS", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, + {name: "MOVSDstore", argLength: 3, reg: fpstore, asm: "MOVSD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, + + // MOVSxstoreidx: floating-point indexed stores + // x==S for float32, x==D for float64 + // does *(arg0+scale*arg1+auxint+aux) = arg2, arg3 = mem + {name: "MOVSSstoreidx1", argLength: 4, reg: fpstoreidx, asm: "MOVSS", scale: 1, aux: "SymOff", symEffect: "Write"}, + {name: "MOVSSstoreidx4", argLength: 4, reg: fpstoreidx, asm: "MOVSS", scale: 4, aux: "SymOff", symEffect: "Write"}, + {name: "MOVSDstoreidx1", argLength: 4, reg: fpstoreidx, asm: "MOVSD", scale: 1, aux: "SymOff", symEffect: "Write"}, + {name: "MOVSDstoreidx8", argLength: 4, reg: fpstoreidx, asm: "MOVSD", scale: 8, aux: "SymOff", symEffect: "Write"}, + + // {ADD,SUB,MUL,DIV}Sxload: floating-point load / op combo + // x==S for float32, x==D for float64 + // computes arg0 OP *(arg1+auxint+aux), arg2=mem + {name: "ADDSSload", argLength: 3, reg: fp21load, asm: "ADDSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, + {name: "ADDSDload", argLength: 3, reg: fp21load, asm: "ADDSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, + {name: "SUBSSload", argLength: 3, reg: fp21load, asm: "SUBSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, + {name: "SUBSDload", argLength: 3, reg: fp21load, asm: "SUBSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, + {name: "MULSSload", argLength: 3, reg: fp21load, asm: "MULSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, + {name: "MULSDload", argLength: 3, reg: fp21load, asm: "MULSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, + {name: "DIVSSload", argLength: 3, reg: fp21load, asm: "DIVSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, + {name: "DIVSDload", argLength: 3, reg: fp21load, asm: "DIVSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, + + // {ADD,SUB,MUL,DIV}Sxloadidx: floating-point indexed load / op combo + // x==S for float32, x==D for float64 + // computes arg0 OP *(arg1+scale*arg2+auxint+aux), arg3=mem + {name: "ADDSSloadidx1", argLength: 4, reg: fp21loadidx, asm: "ADDSS", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, + {name: "ADDSSloadidx4", argLength: 4, reg: fp21loadidx, asm: "ADDSS", scale: 4, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, + {name: "ADDSDloadidx1", argLength: 4, reg: fp21loadidx, asm: "ADDSD", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, + {name: "ADDSDloadidx8", argLength: 4, reg: fp21loadidx, asm: "ADDSD", scale: 8, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, + {name: "SUBSSloadidx1", argLength: 4, reg: fp21loadidx, asm: "SUBSS", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, + {name: "SUBSSloadidx4", argLength: 4, reg: fp21loadidx, asm: "SUBSS", scale: 4, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, + {name: "SUBSDloadidx1", argLength: 4, reg: fp21loadidx, asm: "SUBSD", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, + {name: "SUBSDloadidx8", argLength: 4, reg: fp21loadidx, asm: "SUBSD", scale: 8, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, + {name: "MULSSloadidx1", argLength: 4, reg: fp21loadidx, asm: "MULSS", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, + {name: "MULSSloadidx4", argLength: 4, reg: fp21loadidx, asm: "MULSS", scale: 4, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, + {name: "MULSDloadidx1", argLength: 4, reg: fp21loadidx, asm: "MULSD", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, + {name: "MULSDloadidx8", argLength: 4, reg: fp21loadidx, asm: "MULSD", scale: 8, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, + {name: "DIVSSloadidx1", argLength: 4, reg: fp21loadidx, asm: "DIVSS", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, + {name: "DIVSSloadidx4", argLength: 4, reg: fp21loadidx, asm: "DIVSS", scale: 4, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, + {name: "DIVSDloadidx1", argLength: 4, reg: fp21loadidx, asm: "DIVSD", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, + {name: "DIVSDloadidx8", argLength: 4, reg: fp21loadidx, asm: "DIVSD", scale: 8, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, + + // {ADD,SUB,MUL,DIV,AND,OR,XOR}x: binary integer ops + // unadorned versions compute arg0 OP arg1 + // const versions compute arg0 OP auxint (auxint is a sign-extended 32-bit value) + // constmodify versions compute *(arg0+ValAndOff(AuxInt).Off().aux) OP= ValAndOff(AuxInt).Val(), arg1 = mem + // x==L operations zero the upper 4 bytes of the destination register (not meaningful for constmodify versions). + {name: "ADDQ", argLength: 2, reg: gp21sp, asm: "ADDQ", commutative: true, clobberFlags: true}, + {name: "ADDL", argLength: 2, reg: gp21sp, asm: "ADDL", commutative: true, clobberFlags: true}, + {name: "ADDQconst", argLength: 1, reg: gp11sp, asm: "ADDQ", aux: "Int32", typ: "UInt64", clobberFlags: true}, + {name: "ADDLconst", argLength: 1, reg: gp11sp, asm: "ADDL", aux: "Int32", clobberFlags: true}, + {name: "ADDQconstmodify", argLength: 2, reg: gpstoreconst, asm: "ADDQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, + {name: "ADDLconstmodify", argLength: 2, reg: gpstoreconst, asm: "ADDL", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, + + {name: "SUBQ", argLength: 2, reg: gp21, asm: "SUBQ", resultInArg0: true, clobberFlags: true}, + {name: "SUBL", argLength: 2, reg: gp21, asm: "SUBL", resultInArg0: true, clobberFlags: true}, + {name: "SUBQconst", argLength: 1, reg: gp11, asm: "SUBQ", aux: "Int32", resultInArg0: true, clobberFlags: true}, + {name: "SUBLconst", argLength: 1, reg: gp11, asm: "SUBL", aux: "Int32", resultInArg0: true, clobberFlags: true}, + + {name: "MULQ", argLength: 2, reg: gp21, asm: "IMULQ", commutative: true, resultInArg0: true, clobberFlags: true}, + {name: "MULL", argLength: 2, reg: gp21, asm: "IMULL", commutative: true, resultInArg0: true, clobberFlags: true}, + {name: "MULQconst", argLength: 1, reg: gp11, asm: "IMUL3Q", aux: "Int32", clobberFlags: true}, + {name: "MULLconst", argLength: 1, reg: gp11, asm: "IMUL3L", aux: "Int32", clobberFlags: true}, + + // Let x = arg0*arg1 (full 32x32->64 unsigned multiply). Returns uint32(x), and flags set to overflow if uint32(x) != x. + {name: "MULLU", argLength: 2, reg: regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{ax, 0}, clobbers: dx}, typ: "(UInt32,Flags)", asm: "MULL", commutative: true, clobberFlags: true}, + // Let x = arg0*arg1 (full 64x64->128 unsigned multiply). Returns uint64(x), and flags set to overflow if uint64(x) != x. + {name: "MULQU", argLength: 2, reg: regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{ax, 0}, clobbers: dx}, typ: "(UInt64,Flags)", asm: "MULQ", commutative: true, clobberFlags: true}, + + // HMULx[U]: computes the high bits of an integer multiply. + // computes arg0 * arg1 >> (x==L?32:64) + // The multiply is unsigned for the U versions, signed for the non-U versions. + // HMULx[U] are intentionally not marked as commutative, even though they are. + // This is because they have asymmetric register requirements. + // There are rewrite rules to try to place arguments in preferable slots. + {name: "HMULQ", argLength: 2, reg: gp21hmul, asm: "IMULQ", clobberFlags: true}, + {name: "HMULL", argLength: 2, reg: gp21hmul, asm: "IMULL", clobberFlags: true}, + {name: "HMULQU", argLength: 2, reg: gp21hmul, asm: "MULQ", clobberFlags: true}, + {name: "HMULLU", argLength: 2, reg: gp21hmul, asm: "MULL", clobberFlags: true}, + + // (arg0 + arg1) / 2 as unsigned, all 64 result bits + {name: "AVGQU", argLength: 2, reg: gp21, commutative: true, resultInArg0: true, clobberFlags: true}, + + // DIVx[U] computes [arg0 / arg1, arg0 % arg1] + // For signed versions, AuxInt non-zero means that the divisor has been proved to be not -1. + {name: "DIVQ", argLength: 2, reg: gp11div, typ: "(Int64,Int64)", asm: "IDIVQ", aux: "Bool", clobberFlags: true}, + {name: "DIVL", argLength: 2, reg: gp11div, typ: "(Int32,Int32)", asm: "IDIVL", aux: "Bool", clobberFlags: true}, + {name: "DIVW", argLength: 2, reg: gp11div, typ: "(Int16,Int16)", asm: "IDIVW", aux: "Bool", clobberFlags: true}, + {name: "DIVQU", argLength: 2, reg: gp11div, typ: "(UInt64,UInt64)", asm: "DIVQ", clobberFlags: true}, + {name: "DIVLU", argLength: 2, reg: gp11div, typ: "(UInt32,UInt32)", asm: "DIVL", clobberFlags: true}, + {name: "DIVWU", argLength: 2, reg: gp11div, typ: "(UInt16,UInt16)", asm: "DIVW", clobberFlags: true}, + + // computes -arg0, flags set for 0-arg0. + {name: "NEGLflags", argLength: 1, reg: gp11flags, typ: "(UInt32,Flags)", asm: "NEGL", resultInArg0: true}, + + // The following 4 add opcodes return the low 64 bits of the sum in the first result and + // the carry (the 65th bit) in the carry flag. + {name: "ADDQcarry", argLength: 2, reg: gp21flags, typ: "(UInt64,Flags)", asm: "ADDQ", commutative: true, resultInArg0: true}, // r = arg0+arg1 + {name: "ADCQ", argLength: 3, reg: gp2flags1flags, typ: "(UInt64,Flags)", asm: "ADCQ", commutative: true, resultInArg0: true}, // r = arg0+arg1+carry(arg2) + {name: "ADDQconstcarry", argLength: 1, reg: gp11flags, typ: "(UInt64,Flags)", asm: "ADDQ", aux: "Int32", resultInArg0: true}, // r = arg0+auxint + {name: "ADCQconst", argLength: 2, reg: gp1flags1flags, typ: "(UInt64,Flags)", asm: "ADCQ", aux: "Int32", resultInArg0: true}, // r = arg0+auxint+carry(arg1) + + // The following 4 add opcodes return the low 64 bits of the difference in the first result and + // the borrow (if the result is negative) in the carry flag. + {name: "SUBQborrow", argLength: 2, reg: gp21flags, typ: "(UInt64,Flags)", asm: "SUBQ", resultInArg0: true}, // r = arg0-arg1 + {name: "SBBQ", argLength: 3, reg: gp2flags1flags, typ: "(UInt64,Flags)", asm: "SBBQ", resultInArg0: true}, // r = arg0-(arg1+carry(arg2)) + {name: "SUBQconstborrow", argLength: 1, reg: gp11flags, typ: "(UInt64,Flags)", asm: "SUBQ", aux: "Int32", resultInArg0: true}, // r = arg0-auxint + {name: "SBBQconst", argLength: 2, reg: gp1flags1flags, typ: "(UInt64,Flags)", asm: "SBBQ", aux: "Int32", resultInArg0: true}, // r = arg0-(auxint+carry(arg1)) + + {name: "MULQU2", argLength: 2, reg: regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx, ax}}, commutative: true, asm: "MULQ", clobberFlags: true}, // arg0 * arg1, returns (hi, lo) + {name: "DIVQU2", argLength: 3, reg: regInfo{inputs: []regMask{dx, ax, gpsp}, outputs: []regMask{ax, dx}}, asm: "DIVQ", clobberFlags: true}, // arg0:arg1 / arg2 (128-bit divided by 64-bit), returns (q, r) + + {name: "ANDQ", argLength: 2, reg: gp21, asm: "ANDQ", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 & arg1 + {name: "ANDL", argLength: 2, reg: gp21, asm: "ANDL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 & arg1 + {name: "ANDQconst", argLength: 1, reg: gp11, asm: "ANDQ", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 & auxint + {name: "ANDLconst", argLength: 1, reg: gp11, asm: "ANDL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 & auxint + {name: "ANDQconstmodify", argLength: 2, reg: gpstoreconst, asm: "ANDQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // and ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem + {name: "ANDLconstmodify", argLength: 2, reg: gpstoreconst, asm: "ANDL", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // and ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem + + {name: "ORQ", argLength: 2, reg: gp21, asm: "ORQ", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 | arg1 + {name: "ORL", argLength: 2, reg: gp21, asm: "ORL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 | arg1 + {name: "ORQconst", argLength: 1, reg: gp11, asm: "ORQ", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 | auxint + {name: "ORLconst", argLength: 1, reg: gp11, asm: "ORL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 | auxint + {name: "ORQconstmodify", argLength: 2, reg: gpstoreconst, asm: "ORQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // or ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem + {name: "ORLconstmodify", argLength: 2, reg: gpstoreconst, asm: "ORL", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // or ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem + + {name: "XORQ", argLength: 2, reg: gp21, asm: "XORQ", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 ^ arg1 + {name: "XORL", argLength: 2, reg: gp21, asm: "XORL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 ^ arg1 + {name: "XORQconst", argLength: 1, reg: gp11, asm: "XORQ", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 ^ auxint + {name: "XORLconst", argLength: 1, reg: gp11, asm: "XORL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 ^ auxint + {name: "XORQconstmodify", argLength: 2, reg: gpstoreconst, asm: "XORQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // xor ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem + {name: "XORLconstmodify", argLength: 2, reg: gpstoreconst, asm: "XORL", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // xor ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem + + // CMPx: compare arg0 to arg1. + {name: "CMPQ", argLength: 2, reg: gp2flags, asm: "CMPQ", typ: "Flags"}, + {name: "CMPL", argLength: 2, reg: gp2flags, asm: "CMPL", typ: "Flags"}, + {name: "CMPW", argLength: 2, reg: gp2flags, asm: "CMPW", typ: "Flags"}, + {name: "CMPB", argLength: 2, reg: gp2flags, asm: "CMPB", typ: "Flags"}, + + // CMPxconst: compare arg0 to auxint. + {name: "CMPQconst", argLength: 1, reg: gp1flags, asm: "CMPQ", typ: "Flags", aux: "Int32"}, + {name: "CMPLconst", argLength: 1, reg: gp1flags, asm: "CMPL", typ: "Flags", aux: "Int32"}, + {name: "CMPWconst", argLength: 1, reg: gp1flags, asm: "CMPW", typ: "Flags", aux: "Int16"}, + {name: "CMPBconst", argLength: 1, reg: gp1flags, asm: "CMPB", typ: "Flags", aux: "Int8"}, + + // CMPxload: compare *(arg0+auxint+aux) to arg1 (in that order). arg2=mem. + {name: "CMPQload", argLength: 3, reg: gp1flagsLoad, asm: "CMPQ", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true}, + {name: "CMPLload", argLength: 3, reg: gp1flagsLoad, asm: "CMPL", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true}, + {name: "CMPWload", argLength: 3, reg: gp1flagsLoad, asm: "CMPW", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true}, + {name: "CMPBload", argLength: 3, reg: gp1flagsLoad, asm: "CMPB", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true}, + + // CMPxconstload: compare *(arg0+ValAndOff(AuxInt).Off()+aux) to ValAndOff(AuxInt).Val() (in that order). arg1=mem. + {name: "CMPQconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPQ", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true}, + {name: "CMPLconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPL", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true}, + {name: "CMPWconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPW", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true}, + {name: "CMPBconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPB", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true}, + + // CMPxloadidx: compare *(arg0+N*arg1+auxint+aux) to arg2 (in that order). arg3=mem. + {name: "CMPQloadidx8", argLength: 4, reg: gp2flagsLoad, asm: "CMPQ", scale: 8, aux: "SymOff", typ: "Flags", symEffect: "Read"}, + {name: "CMPQloadidx1", argLength: 4, reg: gp2flagsLoad, asm: "CMPQ", scale: 1, commutative: true, aux: "SymOff", typ: "Flags", symEffect: "Read"}, + {name: "CMPLloadidx4", argLength: 4, reg: gp2flagsLoad, asm: "CMPL", scale: 4, aux: "SymOff", typ: "Flags", symEffect: "Read"}, + {name: "CMPLloadidx1", argLength: 4, reg: gp2flagsLoad, asm: "CMPL", scale: 1, commutative: true, aux: "SymOff", typ: "Flags", symEffect: "Read"}, + {name: "CMPWloadidx2", argLength: 4, reg: gp2flagsLoad, asm: "CMPW", scale: 2, aux: "SymOff", typ: "Flags", symEffect: "Read"}, + {name: "CMPWloadidx1", argLength: 4, reg: gp2flagsLoad, asm: "CMPW", scale: 1, commutative: true, aux: "SymOff", typ: "Flags", symEffect: "Read"}, + {name: "CMPBloadidx1", argLength: 4, reg: gp2flagsLoad, asm: "CMPB", scale: 1, commutative: true, aux: "SymOff", typ: "Flags", symEffect: "Read"}, + + // CMPxconstloadidx: compare *(arg0+N*arg1+ValAndOff(AuxInt).Off()+aux) to ValAndOff(AuxInt).Val() (in that order). arg2=mem. + {name: "CMPQconstloadidx8", argLength: 3, reg: gp1flagsLoad, asm: "CMPQ", scale: 8, aux: "SymValAndOff", typ: "Flags", symEffect: "Read"}, + {name: "CMPQconstloadidx1", argLength: 3, reg: gp1flagsLoad, asm: "CMPQ", scale: 1, commutative: true, aux: "SymValAndOff", typ: "Flags", symEffect: "Read"}, + {name: "CMPLconstloadidx4", argLength: 3, reg: gp1flagsLoad, asm: "CMPL", scale: 4, aux: "SymValAndOff", typ: "Flags", symEffect: "Read"}, + {name: "CMPLconstloadidx1", argLength: 3, reg: gp1flagsLoad, asm: "CMPL", scale: 1, commutative: true, aux: "SymValAndOff", typ: "Flags", symEffect: "Read"}, + {name: "CMPWconstloadidx2", argLength: 3, reg: gp1flagsLoad, asm: "CMPW", scale: 2, aux: "SymValAndOff", typ: "Flags", symEffect: "Read"}, + {name: "CMPWconstloadidx1", argLength: 3, reg: gp1flagsLoad, asm: "CMPW", scale: 1, commutative: true, aux: "SymValAndOff", typ: "Flags", symEffect: "Read"}, + {name: "CMPBconstloadidx1", argLength: 3, reg: gp1flagsLoad, asm: "CMPB", scale: 1, commutative: true, aux: "SymValAndOff", typ: "Flags", symEffect: "Read"}, + + // UCOMISx: floating-point compare arg0 to arg1 + // x==S for float32, x==D for float64 + {name: "UCOMISS", argLength: 2, reg: fp2flags, asm: "UCOMISS", typ: "Flags"}, + {name: "UCOMISD", argLength: 2, reg: fp2flags, asm: "UCOMISD", typ: "Flags"}, + + // bit test/set/clear operations + {name: "BTL", argLength: 2, reg: gp2flags, asm: "BTL", typ: "Flags"}, // test whether bit arg0%32 in arg1 is set + {name: "BTQ", argLength: 2, reg: gp2flags, asm: "BTQ", typ: "Flags"}, // test whether bit arg0%64 in arg1 is set + {name: "BTCL", argLength: 2, reg: gp21, asm: "BTCL", resultInArg0: true, clobberFlags: true}, // complement bit arg1%32 in arg0 + {name: "BTCQ", argLength: 2, reg: gp21, asm: "BTCQ", resultInArg0: true, clobberFlags: true}, // complement bit arg1%64 in arg0 + {name: "BTRL", argLength: 2, reg: gp21, asm: "BTRL", resultInArg0: true, clobberFlags: true}, // reset bit arg1%32 in arg0 + {name: "BTRQ", argLength: 2, reg: gp21, asm: "BTRQ", resultInArg0: true, clobberFlags: true}, // reset bit arg1%64 in arg0 + {name: "BTSL", argLength: 2, reg: gp21, asm: "BTSL", resultInArg0: true, clobberFlags: true}, // set bit arg1%32 in arg0 + {name: "BTSQ", argLength: 2, reg: gp21, asm: "BTSQ", resultInArg0: true, clobberFlags: true}, // set bit arg1%64 in arg0 + {name: "BTLconst", argLength: 1, reg: gp1flags, asm: "BTL", typ: "Flags", aux: "Int8"}, // test whether bit auxint in arg0 is set, 0 <= auxint < 32 + {name: "BTQconst", argLength: 1, reg: gp1flags, asm: "BTQ", typ: "Flags", aux: "Int8"}, // test whether bit auxint in arg0 is set, 0 <= auxint < 64 + {name: "BTCLconst", argLength: 1, reg: gp11, asm: "BTCL", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // complement bit auxint in arg0, 0 <= auxint < 32 + {name: "BTCQconst", argLength: 1, reg: gp11, asm: "BTCQ", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // complement bit auxint in arg0, 0 <= auxint < 64 + {name: "BTRLconst", argLength: 1, reg: gp11, asm: "BTRL", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // reset bit auxint in arg0, 0 <= auxint < 32 + {name: "BTRQconst", argLength: 1, reg: gp11, asm: "BTRQ", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // reset bit auxint in arg0, 0 <= auxint < 64 + {name: "BTSLconst", argLength: 1, reg: gp11, asm: "BTSL", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // set bit auxint in arg0, 0 <= auxint < 32 + {name: "BTSQconst", argLength: 1, reg: gp11, asm: "BTSQ", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // set bit auxint in arg0, 0 <= auxint < 64 + + // TESTx: compare (arg0 & arg1) to 0 + {name: "TESTQ", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTQ", typ: "Flags"}, + {name: "TESTL", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTL", typ: "Flags"}, + {name: "TESTW", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTW", typ: "Flags"}, + {name: "TESTB", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTB", typ: "Flags"}, + + // TESTxconst: compare (arg0 & auxint) to 0 + {name: "TESTQconst", argLength: 1, reg: gp1flags, asm: "TESTQ", typ: "Flags", aux: "Int32"}, + {name: "TESTLconst", argLength: 1, reg: gp1flags, asm: "TESTL", typ: "Flags", aux: "Int32"}, + {name: "TESTWconst", argLength: 1, reg: gp1flags, asm: "TESTW", typ: "Flags", aux: "Int16"}, + {name: "TESTBconst", argLength: 1, reg: gp1flags, asm: "TESTB", typ: "Flags", aux: "Int8"}, + + // S{HL, HR, AR}x: shift operations + // SHL: shift left + // SHR: shift right logical (0s are shifted in from beyond the word size) + // SAR: shift right arithmetic (sign bit is shifted in from beyond the word size) + // arg0 is the value being shifted + // arg1 is the amount to shift, interpreted mod (Q=64,L=32,W=32,B=32) + // (Note: x86 is weird, the 16 and 8 byte shifts still use all 5 bits of shift amount!) + // For *const versions, use auxint instead of arg1 as the shift amount. auxint must be in the range 0 to (Q=63,L=31,W=15,B=7) inclusive. + {name: "SHLQ", argLength: 2, reg: gp21shift, asm: "SHLQ", resultInArg0: true, clobberFlags: true}, + {name: "SHLL", argLength: 2, reg: gp21shift, asm: "SHLL", resultInArg0: true, clobberFlags: true}, + {name: "SHLQconst", argLength: 1, reg: gp11, asm: "SHLQ", aux: "Int8", resultInArg0: true, clobberFlags: true}, + {name: "SHLLconst", argLength: 1, reg: gp11, asm: "SHLL", aux: "Int8", resultInArg0: true, clobberFlags: true}, + + {name: "SHRQ", argLength: 2, reg: gp21shift, asm: "SHRQ", resultInArg0: true, clobberFlags: true}, + {name: "SHRL", argLength: 2, reg: gp21shift, asm: "SHRL", resultInArg0: true, clobberFlags: true}, + {name: "SHRW", argLength: 2, reg: gp21shift, asm: "SHRW", resultInArg0: true, clobberFlags: true}, + {name: "SHRB", argLength: 2, reg: gp21shift, asm: "SHRB", resultInArg0: true, clobberFlags: true}, + {name: "SHRQconst", argLength: 1, reg: gp11, asm: "SHRQ", aux: "Int8", resultInArg0: true, clobberFlags: true}, + {name: "SHRLconst", argLength: 1, reg: gp11, asm: "SHRL", aux: "Int8", resultInArg0: true, clobberFlags: true}, + {name: "SHRWconst", argLength: 1, reg: gp11, asm: "SHRW", aux: "Int8", resultInArg0: true, clobberFlags: true}, + {name: "SHRBconst", argLength: 1, reg: gp11, asm: "SHRB", aux: "Int8", resultInArg0: true, clobberFlags: true}, + + {name: "SARQ", argLength: 2, reg: gp21shift, asm: "SARQ", resultInArg0: true, clobberFlags: true}, + {name: "SARL", argLength: 2, reg: gp21shift, asm: "SARL", resultInArg0: true, clobberFlags: true}, + {name: "SARW", argLength: 2, reg: gp21shift, asm: "SARW", resultInArg0: true, clobberFlags: true}, + {name: "SARB", argLength: 2, reg: gp21shift, asm: "SARB", resultInArg0: true, clobberFlags: true}, + {name: "SARQconst", argLength: 1, reg: gp11, asm: "SARQ", aux: "Int8", resultInArg0: true, clobberFlags: true}, + {name: "SARLconst", argLength: 1, reg: gp11, asm: "SARL", aux: "Int8", resultInArg0: true, clobberFlags: true}, + {name: "SARWconst", argLength: 1, reg: gp11, asm: "SARW", aux: "Int8", resultInArg0: true, clobberFlags: true}, + {name: "SARBconst", argLength: 1, reg: gp11, asm: "SARB", aux: "Int8", resultInArg0: true, clobberFlags: true}, + + // unsigned arg0 >> arg2, shifting in bits from arg1 (==(arg1<<64+arg0)>>arg2, keeping low 64 bits), shift amount is mod 64 + {name: "SHRDQ", argLength: 3, reg: gp31shift, asm: "SHRQ", resultInArg0: true, clobberFlags: true}, + // unsigned arg0 << arg2, shifting in bits from arg1 (==(arg0<<64+arg1)<<arg2, keeping high 64 bits), shift amount is mod 64 + {name: "SHLDQ", argLength: 3, reg: gp31shift, asm: "SHLQ", resultInArg0: true, clobberFlags: true}, + + // RO{L,R}x: rotate instructions + // computes arg0 rotate (L=left,R=right) arg1 bits. + // Bits are rotated within the low (Q=64,L=32,W=16,B=8) bits of the register. + // For *const versions use auxint instead of arg1 as the rotate amount. auxint must be in the range 0 to (Q=63,L=31,W=15,B=7) inclusive. + // x==L versions zero the upper 32 bits of the destination register. + // x==W and x==B versions leave the upper bits unspecified. + {name: "ROLQ", argLength: 2, reg: gp21shift, asm: "ROLQ", resultInArg0: true, clobberFlags: true}, + {name: "ROLL", argLength: 2, reg: gp21shift, asm: "ROLL", resultInArg0: true, clobberFlags: true}, + {name: "ROLW", argLength: 2, reg: gp21shift, asm: "ROLW", resultInArg0: true, clobberFlags: true}, + {name: "ROLB", argLength: 2, reg: gp21shift, asm: "ROLB", resultInArg0: true, clobberFlags: true}, + {name: "RORQ", argLength: 2, reg: gp21shift, asm: "RORQ", resultInArg0: true, clobberFlags: true}, + {name: "RORL", argLength: 2, reg: gp21shift, asm: "RORL", resultInArg0: true, clobberFlags: true}, + {name: "RORW", argLength: 2, reg: gp21shift, asm: "RORW", resultInArg0: true, clobberFlags: true}, + {name: "RORB", argLength: 2, reg: gp21shift, asm: "RORB", resultInArg0: true, clobberFlags: true}, + {name: "ROLQconst", argLength: 1, reg: gp11, asm: "ROLQ", aux: "Int8", resultInArg0: true, clobberFlags: true}, + {name: "ROLLconst", argLength: 1, reg: gp11, asm: "ROLL", aux: "Int8", resultInArg0: true, clobberFlags: true}, + {name: "ROLWconst", argLength: 1, reg: gp11, asm: "ROLW", aux: "Int8", resultInArg0: true, clobberFlags: true}, + {name: "ROLBconst", argLength: 1, reg: gp11, asm: "ROLB", aux: "Int8", resultInArg0: true, clobberFlags: true}, + + // [ADD,SUB,AND,OR]xload: integer load/op combo + // L = int32, Q = int64 + // x==L operations zero the upper 4 bytes of the destination register. + // computes arg0 op *(arg1+auxint+aux), arg2=mem + {name: "ADDLload", argLength: 3, reg: gp21load, asm: "ADDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, + {name: "ADDQload", argLength: 3, reg: gp21load, asm: "ADDQ", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, + {name: "SUBQload", argLength: 3, reg: gp21load, asm: "SUBQ", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, + {name: "SUBLload", argLength: 3, reg: gp21load, asm: "SUBL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, + {name: "ANDLload", argLength: 3, reg: gp21load, asm: "ANDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, + {name: "ANDQload", argLength: 3, reg: gp21load, asm: "ANDQ", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, + {name: "ORQload", argLength: 3, reg: gp21load, asm: "ORQ", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, + {name: "ORLload", argLength: 3, reg: gp21load, asm: "ORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, + {name: "XORQload", argLength: 3, reg: gp21load, asm: "XORQ", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, + {name: "XORLload", argLength: 3, reg: gp21load, asm: "XORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, + + // integer indexed load/op combo + // L = int32, Q = int64 + // L operations zero the upper 4 bytes of the destination register. + // computes arg0 op *(arg1+scale*arg2+auxint+aux), arg3=mem + {name: "ADDLloadidx1", argLength: 4, reg: gp21loadidx, asm: "ADDL", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "ADDLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ADDL", scale: 4, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "ADDLloadidx8", argLength: 4, reg: gp21loadidx, asm: "ADDL", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "ADDQloadidx1", argLength: 4, reg: gp21loadidx, asm: "ADDQ", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "ADDQloadidx8", argLength: 4, reg: gp21loadidx, asm: "ADDQ", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "SUBLloadidx1", argLength: 4, reg: gp21loadidx, asm: "SUBL", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "SUBLloadidx4", argLength: 4, reg: gp21loadidx, asm: "SUBL", scale: 4, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "SUBLloadidx8", argLength: 4, reg: gp21loadidx, asm: "SUBL", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "SUBQloadidx1", argLength: 4, reg: gp21loadidx, asm: "SUBQ", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "SUBQloadidx8", argLength: 4, reg: gp21loadidx, asm: "SUBQ", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "ANDLloadidx1", argLength: 4, reg: gp21loadidx, asm: "ANDL", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "ANDLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ANDL", scale: 4, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "ANDLloadidx8", argLength: 4, reg: gp21loadidx, asm: "ANDL", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "ANDQloadidx1", argLength: 4, reg: gp21loadidx, asm: "ANDQ", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "ANDQloadidx8", argLength: 4, reg: gp21loadidx, asm: "ANDQ", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "ORLloadidx1", argLength: 4, reg: gp21loadidx, asm: "ORL", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "ORLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ORL", scale: 4, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "ORLloadidx8", argLength: 4, reg: gp21loadidx, asm: "ORL", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "ORQloadidx1", argLength: 4, reg: gp21loadidx, asm: "ORQ", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "ORQloadidx8", argLength: 4, reg: gp21loadidx, asm: "ORQ", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "XORLloadidx1", argLength: 4, reg: gp21loadidx, asm: "XORL", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "XORLloadidx4", argLength: 4, reg: gp21loadidx, asm: "XORL", scale: 4, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "XORLloadidx8", argLength: 4, reg: gp21loadidx, asm: "XORL", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "XORQloadidx1", argLength: 4, reg: gp21loadidx, asm: "XORQ", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + {name: "XORQloadidx8", argLength: 4, reg: gp21loadidx, asm: "XORQ", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, + + // direct binary op on memory (read-modify-write) + // L = int32, Q = int64 + // does *(arg0+auxint+aux) op= arg1, arg2=mem + {name: "ADDQmodify", argLength: 3, reg: gpstore, asm: "ADDQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, + {name: "SUBQmodify", argLength: 3, reg: gpstore, asm: "SUBQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, + {name: "ANDQmodify", argLength: 3, reg: gpstore, asm: "ANDQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, + {name: "ORQmodify", argLength: 3, reg: gpstore, asm: "ORQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, + {name: "XORQmodify", argLength: 3, reg: gpstore, asm: "XORQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, + {name: "ADDLmodify", argLength: 3, reg: gpstore, asm: "ADDL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, + {name: "SUBLmodify", argLength: 3, reg: gpstore, asm: "SUBL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, + {name: "ANDLmodify", argLength: 3, reg: gpstore, asm: "ANDL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, + {name: "ORLmodify", argLength: 3, reg: gpstore, asm: "ORL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, + {name: "XORLmodify", argLength: 3, reg: gpstore, asm: "XORL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, + + // indexed direct binary op on memory. + // does *(arg0+scale*arg1+auxint+aux) op= arg2, arg3=mem + {name: "ADDQmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ADDQ", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ADDQmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ADDQ", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "SUBQmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "SUBQ", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "SUBQmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "SUBQ", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ANDQmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ANDQ", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ANDQmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ANDQ", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ORQmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ORQ", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ORQmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ORQ", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "XORQmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "XORQ", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "XORQmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "XORQ", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ADDLmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ADDL", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ADDLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ADDL", scale: 4, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ADDLmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ADDL", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "SUBLmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "SUBL", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "SUBLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "SUBL", scale: 4, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "SUBLmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "SUBL", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ANDLmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ANDL", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ANDLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ANDL", scale: 4, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ANDLmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ANDL", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ORLmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ORL", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ORLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ORL", scale: 4, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ORLmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ORL", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "XORLmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "XORL", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "XORLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "XORL", scale: 4, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "XORLmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "XORL", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + + // indexed direct binary op on memory with constant argument. + // does *(arg0+scale*arg1+ValAndOff(AuxInt).Off()+aux) op= ValAndOff(AuxInt).Val(), arg2=mem + {name: "ADDQconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ADDQ", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ADDQconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ADDQ", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ANDQconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ANDQ", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ANDQconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ANDQ", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ORQconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ORQ", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ORQconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ORQ", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "XORQconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "XORQ", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "XORQconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "XORQ", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ADDLconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ADDL", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ADDLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ADDL", scale: 4, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ADDLconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ADDL", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ANDLconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ANDL", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ANDLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ANDL", scale: 4, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ANDLconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ANDL", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ORLconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ORL", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ORLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ORL", scale: 4, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "ORLconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ORL", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "XORLconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "XORL", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "XORLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "XORL", scale: 4, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + {name: "XORLconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "XORL", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, + + // {NEG,NOT}x: unary ops + // computes [NEG:-,NOT:^]arg0 + // L = int32, Q = int64 + // L operations zero the upper 4 bytes of the destination register. + {name: "NEGQ", argLength: 1, reg: gp11, asm: "NEGQ", resultInArg0: true, clobberFlags: true}, + {name: "NEGL", argLength: 1, reg: gp11, asm: "NEGL", resultInArg0: true, clobberFlags: true}, + {name: "NOTQ", argLength: 1, reg: gp11, asm: "NOTQ", resultInArg0: true}, + {name: "NOTL", argLength: 1, reg: gp11, asm: "NOTL", resultInArg0: true}, + + // BS{F,R}Q returns a tuple [result, flags] + // result is undefined if the input is zero. + // flags are set to "equal" if the input is zero, "not equal" otherwise. + // BS{F,R}L returns only the result. + {name: "BSFQ", argLength: 1, reg: gp11flags, asm: "BSFQ", typ: "(UInt64,Flags)"}, // # of low-order zeroes in 64-bit arg + {name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL", typ: "UInt32", clobberFlags: true}, // # of low-order zeroes in 32-bit arg + {name: "BSRQ", argLength: 1, reg: gp11flags, asm: "BSRQ", typ: "(UInt64,Flags)"}, // # of high-order zeroes in 64-bit arg + {name: "BSRL", argLength: 1, reg: gp11, asm: "BSRL", typ: "UInt32", clobberFlags: true}, // # of high-order zeroes in 32-bit arg + + // CMOV instructions: 64, 32 and 16-bit sizes. + // if arg2 encodes a true result, return arg1, else arg0 + {name: "CMOVQEQ", argLength: 3, reg: gp21, asm: "CMOVQEQ", resultInArg0: true}, + {name: "CMOVQNE", argLength: 3, reg: gp21, asm: "CMOVQNE", resultInArg0: true}, + {name: "CMOVQLT", argLength: 3, reg: gp21, asm: "CMOVQLT", resultInArg0: true}, + {name: "CMOVQGT", argLength: 3, reg: gp21, asm: "CMOVQGT", resultInArg0: true}, + {name: "CMOVQLE", argLength: 3, reg: gp21, asm: "CMOVQLE", resultInArg0: true}, + {name: "CMOVQGE", argLength: 3, reg: gp21, asm: "CMOVQGE", resultInArg0: true}, + {name: "CMOVQLS", argLength: 3, reg: gp21, asm: "CMOVQLS", resultInArg0: true}, + {name: "CMOVQHI", argLength: 3, reg: gp21, asm: "CMOVQHI", resultInArg0: true}, + {name: "CMOVQCC", argLength: 3, reg: gp21, asm: "CMOVQCC", resultInArg0: true}, + {name: "CMOVQCS", argLength: 3, reg: gp21, asm: "CMOVQCS", resultInArg0: true}, + + {name: "CMOVLEQ", argLength: 3, reg: gp21, asm: "CMOVLEQ", resultInArg0: true}, + {name: "CMOVLNE", argLength: 3, reg: gp21, asm: "CMOVLNE", resultInArg0: true}, + {name: "CMOVLLT", argLength: 3, reg: gp21, asm: "CMOVLLT", resultInArg0: true}, + {name: "CMOVLGT", argLength: 3, reg: gp21, asm: "CMOVLGT", resultInArg0: true}, + {name: "CMOVLLE", argLength: 3, reg: gp21, asm: "CMOVLLE", resultInArg0: true}, + {name: "CMOVLGE", argLength: 3, reg: gp21, asm: "CMOVLGE", resultInArg0: true}, + {name: "CMOVLLS", argLength: 3, reg: gp21, asm: "CMOVLLS", resultInArg0: true}, + {name: "CMOVLHI", argLength: 3, reg: gp21, asm: "CMOVLHI", resultInArg0: true}, + {name: "CMOVLCC", argLength: 3, reg: gp21, asm: "CMOVLCC", resultInArg0: true}, + {name: "CMOVLCS", argLength: 3, reg: gp21, asm: "CMOVLCS", resultInArg0: true}, + + {name: "CMOVWEQ", argLength: 3, reg: gp21, asm: "CMOVWEQ", resultInArg0: true}, + {name: "CMOVWNE", argLength: 3, reg: gp21, asm: "CMOVWNE", resultInArg0: true}, + {name: "CMOVWLT", argLength: 3, reg: gp21, asm: "CMOVWLT", resultInArg0: true}, + {name: "CMOVWGT", argLength: 3, reg: gp21, asm: "CMOVWGT", resultInArg0: true}, + {name: "CMOVWLE", argLength: 3, reg: gp21, asm: "CMOVWLE", resultInArg0: true}, + {name: "CMOVWGE", argLength: 3, reg: gp21, asm: "CMOVWGE", resultInArg0: true}, + {name: "CMOVWLS", argLength: 3, reg: gp21, asm: "CMOVWLS", resultInArg0: true}, + {name: "CMOVWHI", argLength: 3, reg: gp21, asm: "CMOVWHI", resultInArg0: true}, + {name: "CMOVWCC", argLength: 3, reg: gp21, asm: "CMOVWCC", resultInArg0: true}, + {name: "CMOVWCS", argLength: 3, reg: gp21, asm: "CMOVWCS", resultInArg0: true}, + + // CMOV with floating point instructions. We need separate pseudo-op to handle + // InvertFlags correctly, and to generate special code that handles NaN (unordered flag). + // NOTE: the fact that CMOV*EQF here is marked to generate CMOV*NE is not a bug. See + // code generation in amd64/ssa.go. + {name: "CMOVQEQF", argLength: 3, reg: gp21, asm: "CMOVQNE", resultInArg0: true, needIntTemp: true}, + {name: "CMOVQNEF", argLength: 3, reg: gp21, asm: "CMOVQNE", resultInArg0: true}, + {name: "CMOVQGTF", argLength: 3, reg: gp21, asm: "CMOVQHI", resultInArg0: true}, + {name: "CMOVQGEF", argLength: 3, reg: gp21, asm: "CMOVQCC", resultInArg0: true}, + {name: "CMOVLEQF", argLength: 3, reg: gp21, asm: "CMOVLNE", resultInArg0: true, needIntTemp: true}, + {name: "CMOVLNEF", argLength: 3, reg: gp21, asm: "CMOVLNE", resultInArg0: true}, + {name: "CMOVLGTF", argLength: 3, reg: gp21, asm: "CMOVLHI", resultInArg0: true}, + {name: "CMOVLGEF", argLength: 3, reg: gp21, asm: "CMOVLCC", resultInArg0: true}, + {name: "CMOVWEQF", argLength: 3, reg: gp21, asm: "CMOVWNE", resultInArg0: true, needIntTemp: true}, + {name: "CMOVWNEF", argLength: 3, reg: gp21, asm: "CMOVWNE", resultInArg0: true}, + {name: "CMOVWGTF", argLength: 3, reg: gp21, asm: "CMOVWHI", resultInArg0: true}, + {name: "CMOVWGEF", argLength: 3, reg: gp21, asm: "CMOVWCC", resultInArg0: true}, + + // BSWAPx swaps the low-order (L=4,Q=8) bytes of arg0. + // Q: abcdefgh -> hgfedcba + // L: abcdefgh -> 0000hgfe (L zeros the upper 4 bytes) + {name: "BSWAPQ", argLength: 1, reg: gp11, asm: "BSWAPQ", resultInArg0: true}, + {name: "BSWAPL", argLength: 1, reg: gp11, asm: "BSWAPL", resultInArg0: true}, + + // POPCNTx counts the number of set bits in the low-order (L=32,Q=64) bits of arg0. + // POPCNTx instructions are only guaranteed to be available if GOAMD64>=v2. + // For GOAMD64<v2, any use must be preceded by a successful runtime check of runtime.x86HasPOPCNT. + {name: "POPCNTQ", argLength: 1, reg: gp11, asm: "POPCNTQ", clobberFlags: true}, + {name: "POPCNTL", argLength: 1, reg: gp11, asm: "POPCNTL", clobberFlags: true}, + + // SQRTSx computes sqrt(arg0) + // S = float32, D = float64 + {name: "SQRTSD", argLength: 1, reg: fp11, asm: "SQRTSD"}, + {name: "SQRTSS", argLength: 1, reg: fp11, asm: "SQRTSS"}, + + // ROUNDSD rounds arg0 to an integer depending on auxint + // 0 means math.RoundToEven, 1 means math.Floor, 2 math.Ceil, 3 math.Trunc + // (The result is still a float64.) + // ROUNDSD instruction is only guaraneteed to be available if GOAMD64>=v2. + // For GOAMD64<v2, any use must be preceded by a successful check of runtime.x86HasSSE41. + {name: "ROUNDSD", argLength: 1, reg: fp11, aux: "Int8", asm: "ROUNDSD"}, + + // VFMADD231SD only exists on platforms with the FMA3 instruction set. + // Any use must be preceded by a successful check of runtime.support_fma. + {name: "VFMADD231SD", argLength: 3, reg: fp31, resultInArg0: true, asm: "VFMADD231SD"}, + + {name: "SBBQcarrymask", argLength: 1, reg: flagsgp, asm: "SBBQ"}, // (int64)(-1) if carry is set, 0 if carry is clear. + {name: "SBBLcarrymask", argLength: 1, reg: flagsgp, asm: "SBBL"}, // (int32)(-1) if carry is set, 0 if carry is clear. + // Note: SBBW and SBBB are subsumed by SBBL + + {name: "SETEQ", argLength: 1, reg: readflags, asm: "SETEQ"}, // extract == condition from arg0 + {name: "SETNE", argLength: 1, reg: readflags, asm: "SETNE"}, // extract != condition from arg0 + {name: "SETL", argLength: 1, reg: readflags, asm: "SETLT"}, // extract signed < condition from arg0 + {name: "SETLE", argLength: 1, reg: readflags, asm: "SETLE"}, // extract signed <= condition from arg0 + {name: "SETG", argLength: 1, reg: readflags, asm: "SETGT"}, // extract signed > condition from arg0 + {name: "SETGE", argLength: 1, reg: readflags, asm: "SETGE"}, // extract signed >= condition from arg0 + {name: "SETB", argLength: 1, reg: readflags, asm: "SETCS"}, // extract unsigned < condition from arg0 + {name: "SETBE", argLength: 1, reg: readflags, asm: "SETLS"}, // extract unsigned <= condition from arg0 + {name: "SETA", argLength: 1, reg: readflags, asm: "SETHI"}, // extract unsigned > condition from arg0 + {name: "SETAE", argLength: 1, reg: readflags, asm: "SETCC"}, // extract unsigned >= condition from arg0 + {name: "SETO", argLength: 1, reg: readflags, asm: "SETOS"}, // extract if overflow flag is set from arg0 + // Variants that store result to memory + {name: "SETEQstore", argLength: 3, reg: gpstoreconst, asm: "SETEQ", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract == condition from arg1 to arg0+auxint+aux, arg2=mem + {name: "SETNEstore", argLength: 3, reg: gpstoreconst, asm: "SETNE", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract != condition from arg1 to arg0+auxint+aux, arg2=mem + {name: "SETLstore", argLength: 3, reg: gpstoreconst, asm: "SETLT", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract signed < condition from arg1 to arg0+auxint+aux, arg2=mem + {name: "SETLEstore", argLength: 3, reg: gpstoreconst, asm: "SETLE", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract signed <= condition from arg1 to arg0+auxint+aux, arg2=mem + {name: "SETGstore", argLength: 3, reg: gpstoreconst, asm: "SETGT", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract signed > condition from arg1 to arg0+auxint+aux, arg2=mem + {name: "SETGEstore", argLength: 3, reg: gpstoreconst, asm: "SETGE", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract signed >= condition from arg1 to arg0+auxint+aux, arg2=mem + {name: "SETBstore", argLength: 3, reg: gpstoreconst, asm: "SETCS", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract unsigned < condition from arg1 to arg0+auxint+aux, arg2=mem + {name: "SETBEstore", argLength: 3, reg: gpstoreconst, asm: "SETLS", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract unsigned <= condition from arg1 to arg0+auxint+aux, arg2=mem + {name: "SETAstore", argLength: 3, reg: gpstoreconst, asm: "SETHI", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract unsigned > condition from arg1 to arg0+auxint+aux, arg2=mem + {name: "SETAEstore", argLength: 3, reg: gpstoreconst, asm: "SETCC", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // extract unsigned >= condition from arg1 to arg0+auxint+aux, arg2=mem + // Need different opcodes for floating point conditions because + // any comparison involving a NaN is always FALSE and thus + // the patterns for inverting conditions cannot be used. + {name: "SETEQF", argLength: 1, reg: flagsgp, asm: "SETEQ", clobberFlags: true, needIntTemp: true}, // extract == condition from arg0 + {name: "SETNEF", argLength: 1, reg: flagsgp, asm: "SETNE", clobberFlags: true, needIntTemp: true}, // extract != condition from arg0 + {name: "SETORD", argLength: 1, reg: flagsgp, asm: "SETPC"}, // extract "ordered" (No Nan present) condition from arg0 + {name: "SETNAN", argLength: 1, reg: flagsgp, asm: "SETPS"}, // extract "unordered" (Nan present) condition from arg0 + + {name: "SETGF", argLength: 1, reg: flagsgp, asm: "SETHI"}, // extract floating > condition from arg0 + {name: "SETGEF", argLength: 1, reg: flagsgp, asm: "SETCC"}, // extract floating >= condition from arg0 + + {name: "MOVBQSX", argLength: 1, reg: gp11, asm: "MOVBQSX"}, // sign extend arg0 from int8 to int64 + {name: "MOVBQZX", argLength: 1, reg: gp11, asm: "MOVBLZX"}, // zero extend arg0 from int8 to int64 + {name: "MOVWQSX", argLength: 1, reg: gp11, asm: "MOVWQSX"}, // sign extend arg0 from int16 to int64 + {name: "MOVWQZX", argLength: 1, reg: gp11, asm: "MOVWLZX"}, // zero extend arg0 from int16 to int64 + {name: "MOVLQSX", argLength: 1, reg: gp11, asm: "MOVLQSX"}, // sign extend arg0 from int32 to int64 + {name: "MOVLQZX", argLength: 1, reg: gp11, asm: "MOVL"}, // zero extend arg0 from int32 to int64 + + {name: "MOVLconst", reg: gp01, asm: "MOVL", typ: "UInt32", aux: "Int32", rematerializeable: true}, // 32 low bits of auxint + {name: "MOVQconst", reg: gp01, asm: "MOVQ", typ: "UInt64", aux: "Int64", rematerializeable: true}, // auxint + + {name: "CVTTSD2SL", argLength: 1, reg: fpgp, asm: "CVTTSD2SL"}, // convert float64 to int32 + {name: "CVTTSD2SQ", argLength: 1, reg: fpgp, asm: "CVTTSD2SQ"}, // convert float64 to int64 + {name: "CVTTSS2SL", argLength: 1, reg: fpgp, asm: "CVTTSS2SL"}, // convert float32 to int32 + {name: "CVTTSS2SQ", argLength: 1, reg: fpgp, asm: "CVTTSS2SQ"}, // convert float32 to int64 + {name: "CVTSL2SS", argLength: 1, reg: gpfp, asm: "CVTSL2SS"}, // convert int32 to float32 + {name: "CVTSL2SD", argLength: 1, reg: gpfp, asm: "CVTSL2SD"}, // convert int32 to float64 + {name: "CVTSQ2SS", argLength: 1, reg: gpfp, asm: "CVTSQ2SS"}, // convert int64 to float32 + {name: "CVTSQ2SD", argLength: 1, reg: gpfp, asm: "CVTSQ2SD"}, // convert int64 to float64 + {name: "CVTSD2SS", argLength: 1, reg: fp11, asm: "CVTSD2SS"}, // convert float64 to float32 + {name: "CVTSS2SD", argLength: 1, reg: fp11, asm: "CVTSS2SD"}, // convert float32 to float64 + + // Move values between int and float registers, with no conversion. + // TODO: should we have generic versions of these? + {name: "MOVQi2f", argLength: 1, reg: gpfp, typ: "Float64"}, // move 64 bits from int to float reg + {name: "MOVQf2i", argLength: 1, reg: fpgp, typ: "UInt64"}, // move 64 bits from float to int reg + {name: "MOVLi2f", argLength: 1, reg: gpfp, typ: "Float32"}, // move 32 bits from int to float reg + {name: "MOVLf2i", argLength: 1, reg: fpgp, typ: "UInt32"}, // move 32 bits from float to int reg, zero extend + + {name: "PXOR", argLength: 2, reg: fp21, asm: "PXOR", commutative: true, resultInArg0: true}, // exclusive or, applied to X regs for float negation. + + {name: "LEAQ", argLength: 1, reg: gp11sb, asm: "LEAQ", aux: "SymOff", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxint + offset encoded in aux + {name: "LEAL", argLength: 1, reg: gp11sb, asm: "LEAL", aux: "SymOff", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxint + offset encoded in aux + {name: "LEAW", argLength: 1, reg: gp11sb, asm: "LEAW", aux: "SymOff", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxint + offset encoded in aux + + // LEAxn computes arg0 + n*arg1 + auxint + aux + // x==L zeroes the upper 4 bytes. + {name: "LEAQ1", argLength: 2, reg: gp21sb, asm: "LEAQ", scale: 1, commutative: true, aux: "SymOff", symEffect: "Addr"}, // arg0 + arg1 + auxint + aux + {name: "LEAL1", argLength: 2, reg: gp21sb, asm: "LEAL", scale: 1, commutative: true, aux: "SymOff", symEffect: "Addr"}, // arg0 + arg1 + auxint + aux + {name: "LEAW1", argLength: 2, reg: gp21sb, asm: "LEAW", scale: 1, commutative: true, aux: "SymOff", symEffect: "Addr"}, // arg0 + arg1 + auxint + aux + {name: "LEAQ2", argLength: 2, reg: gp21sb, asm: "LEAQ", scale: 2, aux: "SymOff", symEffect: "Addr"}, // arg0 + 2*arg1 + auxint + aux + {name: "LEAL2", argLength: 2, reg: gp21sb, asm: "LEAL", scale: 2, aux: "SymOff", symEffect: "Addr"}, // arg0 + 2*arg1 + auxint + aux + {name: "LEAW2", argLength: 2, reg: gp21sb, asm: "LEAW", scale: 2, aux: "SymOff", symEffect: "Addr"}, // arg0 + 2*arg1 + auxint + aux + {name: "LEAQ4", argLength: 2, reg: gp21sb, asm: "LEAQ", scale: 4, aux: "SymOff", symEffect: "Addr"}, // arg0 + 4*arg1 + auxint + aux + {name: "LEAL4", argLength: 2, reg: gp21sb, asm: "LEAL", scale: 4, aux: "SymOff", symEffect: "Addr"}, // arg0 + 4*arg1 + auxint + aux + {name: "LEAW4", argLength: 2, reg: gp21sb, asm: "LEAW", scale: 4, aux: "SymOff", symEffect: "Addr"}, // arg0 + 4*arg1 + auxint + aux + {name: "LEAQ8", argLength: 2, reg: gp21sb, asm: "LEAQ", scale: 8, aux: "SymOff", symEffect: "Addr"}, // arg0 + 8*arg1 + auxint + aux + {name: "LEAL8", argLength: 2, reg: gp21sb, asm: "LEAL", scale: 8, aux: "SymOff", symEffect: "Addr"}, // arg0 + 8*arg1 + auxint + aux + {name: "LEAW8", argLength: 2, reg: gp21sb, asm: "LEAW", scale: 8, aux: "SymOff", symEffect: "Addr"}, // arg0 + 8*arg1 + auxint + aux + // Note: LEAx{1,2,4,8} must not have OpSB as either argument. + + // MOVxload: loads + // Load (Q=8,L=4,W=2,B=1) bytes from (arg0+auxint+aux), arg1=mem. + // "+auxint+aux" == add auxint and the offset of the symbol in aux (if any) to the effective address + // Standard versions zero extend the result. SX versions sign extend the result. + {name: "MOVBload", argLength: 2, reg: gpload, asm: "MOVBLZX", aux: "SymOff", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, + {name: "MOVBQSXload", argLength: 2, reg: gpload, asm: "MOVBQSX", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, + {name: "MOVWload", argLength: 2, reg: gpload, asm: "MOVWLZX", aux: "SymOff", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, + {name: "MOVWQSXload", argLength: 2, reg: gpload, asm: "MOVWQSX", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, + {name: "MOVLload", argLength: 2, reg: gpload, asm: "MOVL", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, + {name: "MOVLQSXload", argLength: 2, reg: gpload, asm: "MOVLQSX", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, + {name: "MOVQload", argLength: 2, reg: gpload, asm: "MOVQ", aux: "SymOff", typ: "UInt64", faultOnNilArg0: true, symEffect: "Read"}, + + // MOVxstore: stores + // Store (Q=8,L=4,W=2,B=1) low bytes of arg1. + // Does *(arg0+auxint+aux) = arg1, arg2=mem. + {name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, + {name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, + {name: "MOVLstore", argLength: 3, reg: gpstore, asm: "MOVL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, + {name: "MOVQstore", argLength: 3, reg: gpstore, asm: "MOVQ", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, + + // MOVOload/store: 16 byte load/store + // These operations are only used to move data around: there is no *O arithmetic, for example. + {name: "MOVOload", argLength: 2, reg: fpload, asm: "MOVUPS", aux: "SymOff", typ: "Int128", faultOnNilArg0: true, symEffect: "Read"}, // load 16 bytes from arg0+auxint+aux. arg1=mem + {name: "MOVOstore", argLength: 3, reg: fpstore, asm: "MOVUPS", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 16 bytes in arg1 to arg0+auxint+aux. arg2=mem + + // MOVxloadidx: indexed loads + // load (Q=8,L=4,W=2,B=1) bytes from (arg0+scale*arg1+auxint+aux), arg2=mem. + // Results are zero-extended. (TODO: sign-extending indexed loads) + {name: "MOVBloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVBLZX", scale: 1, aux: "SymOff", typ: "UInt8", symEffect: "Read"}, + {name: "MOVWloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVWLZX", scale: 1, aux: "SymOff", typ: "UInt16", symEffect: "Read"}, + {name: "MOVWloadidx2", argLength: 3, reg: gploadidx, asm: "MOVWLZX", scale: 2, aux: "SymOff", typ: "UInt16", symEffect: "Read"}, + {name: "MOVLloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVL", scale: 1, aux: "SymOff", typ: "UInt32", symEffect: "Read"}, + {name: "MOVLloadidx4", argLength: 3, reg: gploadidx, asm: "MOVL", scale: 4, aux: "SymOff", typ: "UInt32", symEffect: "Read"}, + {name: "MOVLloadidx8", argLength: 3, reg: gploadidx, asm: "MOVL", scale: 8, aux: "SymOff", typ: "UInt32", symEffect: "Read"}, + {name: "MOVQloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVQ", scale: 1, aux: "SymOff", typ: "UInt64", symEffect: "Read"}, + {name: "MOVQloadidx8", argLength: 3, reg: gploadidx, asm: "MOVQ", scale: 8, aux: "SymOff", typ: "UInt64", symEffect: "Read"}, + + // MOVxstoreidx: indexed stores + // Store (Q=8,L=4,W=2,B=1) low bytes of arg2. + // Does *(arg0+scale*arg1+auxint+aux) = arg2, arg3=mem. + {name: "MOVBstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVB", scale: 1, aux: "SymOff", symEffect: "Write"}, + {name: "MOVWstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVW", scale: 1, aux: "SymOff", symEffect: "Write"}, + {name: "MOVWstoreidx2", argLength: 4, reg: gpstoreidx, asm: "MOVW", scale: 2, aux: "SymOff", symEffect: "Write"}, + {name: "MOVLstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVL", scale: 1, aux: "SymOff", symEffect: "Write"}, + {name: "MOVLstoreidx4", argLength: 4, reg: gpstoreidx, asm: "MOVL", scale: 4, aux: "SymOff", symEffect: "Write"}, + {name: "MOVLstoreidx8", argLength: 4, reg: gpstoreidx, asm: "MOVL", scale: 8, aux: "SymOff", symEffect: "Write"}, + {name: "MOVQstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVQ", scale: 1, aux: "SymOff", symEffect: "Write"}, + {name: "MOVQstoreidx8", argLength: 4, reg: gpstoreidx, asm: "MOVQ", scale: 8, aux: "SymOff", symEffect: "Write"}, + + // TODO: add size-mismatched indexed loads/stores, like MOVBstoreidx4? + + // MOVxstoreconst: constant stores + // Store (O=16,Q=8,L=4,W=2,B=1) constant bytes. + // Does *(arg0+ValAndOff(AuxInt).Off()+aux) = ValAndOff(AuxInt).Val(), arg1=mem. + // O version can only store the constant 0. + {name: "MOVBstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVB", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, + {name: "MOVWstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVW", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, + {name: "MOVLstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVL", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, + {name: "MOVQstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVQ", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, + {name: "MOVOstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVUPS", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, + + // MOVxstoreconstidx: constant indexed stores + // Store (Q=8,L=4,W=2,B=1) constant bytes. + // Does *(arg0+scale*arg1+ValAndOff(AuxInt).Off()+aux) = ValAndOff(AuxInt).Val(), arg2=mem. + {name: "MOVBstoreconstidx1", argLength: 3, reg: gpstoreconstidx, commutative: true, asm: "MOVB", scale: 1, aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, + {name: "MOVWstoreconstidx1", argLength: 3, reg: gpstoreconstidx, commutative: true, asm: "MOVW", scale: 1, aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, + {name: "MOVWstoreconstidx2", argLength: 3, reg: gpstoreconstidx, asm: "MOVW", scale: 2, aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, + {name: "MOVLstoreconstidx1", argLength: 3, reg: gpstoreconstidx, commutative: true, asm: "MOVL", scale: 1, aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, + {name: "MOVLstoreconstidx4", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", scale: 4, aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, + {name: "MOVQstoreconstidx1", argLength: 3, reg: gpstoreconstidx, commutative: true, asm: "MOVQ", scale: 1, aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, + {name: "MOVQstoreconstidx8", argLength: 3, reg: gpstoreconstidx, asm: "MOVQ", scale: 8, aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, + + // arg0 = pointer to start of memory to zero + // arg1 = mem + // auxint = # of bytes to zero + // returns mem + { + name: "DUFFZERO", + aux: "Int64", + argLength: 2, + reg: regInfo{ + inputs: []regMask{buildReg("DI")}, + clobbers: buildReg("DI"), + }, + faultOnNilArg0: true, + unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts + }, + + // arg0 = address of memory to zero + // arg1 = # of 8-byte words to zero + // arg2 = value to store (will always be zero) + // arg3 = mem + // returns mem + { + name: "REPSTOSQ", + argLength: 4, + reg: regInfo{ + inputs: []regMask{buildReg("DI"), buildReg("CX"), buildReg("AX")}, + clobbers: buildReg("DI CX"), + }, + faultOnNilArg0: true, + }, + + // With a register ABI, the actual register info for these instructions (i.e., what is used in regalloc) is augmented with per-call-site bindings of additional arguments to specific in and out registers. + {name: "CALLstatic", argLength: -1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). last arg=mem, auxint=argsize, returns mem + {name: "CALLtail", argLength: -1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). last arg=mem, auxint=argsize, returns mem + {name: "CALLclosure", argLength: -1, reg: regInfo{inputs: []regMask{gpsp, buildReg("DX"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, last arg=mem, auxint=argsize, returns mem + {name: "CALLinter", argLength: -1, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, last arg=mem, auxint=argsize, returns mem + + // arg0 = destination pointer + // arg1 = source pointer + // arg2 = mem + // auxint = # of bytes to copy, must be multiple of 16 + // returns memory + { + name: "DUFFCOPY", + aux: "Int64", + argLength: 3, + reg: regInfo{ + inputs: []regMask{buildReg("DI"), buildReg("SI")}, + clobbers: buildReg("DI SI X0"), // uses X0 as a temporary + }, + clobberFlags: true, + faultOnNilArg0: true, + faultOnNilArg1: true, + unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts + }, + + // arg0 = destination pointer + // arg1 = source pointer + // arg2 = # of 8-byte words to copy + // arg3 = mem + // returns memory + { + name: "REPMOVSQ", + argLength: 4, + reg: regInfo{ + inputs: []regMask{buildReg("DI"), buildReg("SI"), buildReg("CX")}, + clobbers: buildReg("DI SI CX"), + }, + faultOnNilArg0: true, + faultOnNilArg1: true, + }, + + // (InvertFlags (CMPQ a b)) == (CMPQ b a) + // So if we want (SETL (CMPQ a b)) but we can't do that because a is a constant, + // then we do (SETL (InvertFlags (CMPQ b a))) instead. + // Rewrites will convert this to (SETG (CMPQ b a)). + // InvertFlags is a pseudo-op which can't appear in assembly output. + {name: "InvertFlags", argLength: 1}, // reverse direction of arg0 + + // Pseudo-ops + {name: "LoweredGetG", argLength: 1, reg: gp01}, // arg0=mem + // Scheduler ensures LoweredGetClosurePtr occurs only in entry block, + // and sorts it to the very beginning of the block to prevent other + // use of DX (the closure pointer) + {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("DX")}}, zeroWidth: true}, + // LoweredGetCallerPC evaluates to the PC to which its "caller" will return. + // I.e., if f calls g "calls" getcallerpc, + // the result should be the PC within f that g will return to. + // See runtime/stubs.go for a more detailed discussion. + {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true}, + // LoweredGetCallerSP returns the SP of the caller of the current function. + {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true}, + //arg0=ptr,arg1=mem, returns void. Faults if ptr is nil. + {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpsp}}, clobberFlags: true, nilCheck: true, faultOnNilArg0: true}, + // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier + // It saves all GP registers if necessary, but may clobber others. + {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("DI"), buildReg("AX CX DX BX BP SI R8 R9")}, clobbers: callerSave &^ (gp | g)}, clobberFlags: true, aux: "Sym", symEffect: "None"}, + + {name: "LoweredHasCPUFeature", argLength: 0, reg: gp01, rematerializeable: true, typ: "UInt64", aux: "Sym", symEffect: "None"}, + + // There are three of these functions so that they can have three different register inputs. + // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the + // default registers to match so we don't need to copy registers around unnecessarily. + {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{dx, bx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go). + {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{cx, dx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go). + {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{ax, cx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go). + + // Constant flag values. For any comparison, there are 5 possible + // outcomes: the three from the signed total order (<,==,>) and the + // three from the unsigned total order. The == cases overlap. + // Note: there's a sixth "unordered" outcome for floating-point + // comparisons, but we don't use such a beast yet. + // These ops are for temporary use by rewrite rules. They + // cannot appear in the generated assembly. + {name: "FlagEQ"}, // equal + {name: "FlagLT_ULT"}, // signed < and unsigned < + {name: "FlagLT_UGT"}, // signed < and unsigned > + {name: "FlagGT_UGT"}, // signed > and unsigned > + {name: "FlagGT_ULT"}, // signed > and unsigned < + + // Atomic loads. These are just normal loads but return <value,memory> tuples + // so they can be properly ordered with other loads. + // load from arg0+auxint+aux. arg1=mem. + {name: "MOVBatomicload", argLength: 2, reg: gpload, asm: "MOVB", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, + {name: "MOVLatomicload", argLength: 2, reg: gpload, asm: "MOVL", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, + {name: "MOVQatomicload", argLength: 2, reg: gpload, asm: "MOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, + + // Atomic stores and exchanges. Stores use XCHG to get the right memory ordering semantics. + // store arg0 to arg1+auxint+aux, arg2=mem. + // These ops return a tuple of <old contents of *(arg1+auxint+aux), memory>. + // Note: arg0 and arg1 are backwards compared to MOVLstore (to facilitate resultInArg0)! + {name: "XCHGB", argLength: 3, reg: gpstorexchg, asm: "XCHGB", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, hasSideEffects: true, symEffect: "RdWr"}, + {name: "XCHGL", argLength: 3, reg: gpstorexchg, asm: "XCHGL", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, hasSideEffects: true, symEffect: "RdWr"}, + {name: "XCHGQ", argLength: 3, reg: gpstorexchg, asm: "XCHGQ", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, hasSideEffects: true, symEffect: "RdWr"}, + + // Atomic adds. + // *(arg1+auxint+aux) += arg0. arg2=mem. + // Returns a tuple of <old contents of *(arg1+auxint+aux), memory>. + // Note: arg0 and arg1 are backwards compared to MOVLstore (to facilitate resultInArg0)! + {name: "XADDLlock", argLength: 3, reg: gpstorexchg, asm: "XADDL", typ: "(UInt32,Mem)", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, hasSideEffects: true, symEffect: "RdWr"}, + {name: "XADDQlock", argLength: 3, reg: gpstorexchg, asm: "XADDQ", typ: "(UInt64,Mem)", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, hasSideEffects: true, symEffect: "RdWr"}, + {name: "AddTupleFirst32", argLength: 2}, // arg1=tuple <x,y>. Returns <x+arg0,y>. + {name: "AddTupleFirst64", argLength: 2}, // arg1=tuple <x,y>. Returns <x+arg0,y>. + + // Compare and swap. + // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory. + // if *(arg0+auxint+aux) == arg1 { + // *(arg0+auxint+aux) = arg2 + // return (true, memory) + // } else { + // return (false, memory) + // } + // Note that these instructions also return the old value in AX, but we ignore it. + // TODO: have these return flags instead of bool. The current system generates: + // CMPXCHGQ ... + // SETEQ AX + // CMPB AX, $0 + // JNE ... + // instead of just + // CMPXCHGQ ... + // JEQ ... + // but we can't do that because memory-using ops can't generate flags yet + // (flagalloc wants to move flag-generating instructions around). + {name: "CMPXCHGLlock", argLength: 4, reg: cmpxchg, asm: "CMPXCHGL", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, + {name: "CMPXCHGQlock", argLength: 4, reg: cmpxchg, asm: "CMPXCHGQ", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, + + // Atomic memory updates. + {name: "ANDBlock", argLength: 3, reg: gpstore, asm: "ANDB", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) &= arg1 + {name: "ANDLlock", argLength: 3, reg: gpstore, asm: "ANDL", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) &= arg1 + {name: "ORBlock", argLength: 3, reg: gpstore, asm: "ORB", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) |= arg1 + {name: "ORLlock", argLength: 3, reg: gpstore, asm: "ORL", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) |= arg1 + + // Prefetch instructions + // Do prefetch arg0 address. arg0=addr, arg1=memory. Instruction variant selects locality hint + {name: "PrefetchT0", argLength: 2, reg: prefreg, asm: "PREFETCHT0", hasSideEffects: true}, + {name: "PrefetchNTA", argLength: 2, reg: prefreg, asm: "PREFETCHNTA", hasSideEffects: true}, + + // CPUID feature: BMI1. + {name: "ANDNQ", argLength: 2, reg: gp21, asm: "ANDNQ", clobberFlags: true}, // arg0 &^ arg1 + {name: "ANDNL", argLength: 2, reg: gp21, asm: "ANDNL", clobberFlags: true}, // arg0 &^ arg1 + {name: "BLSIQ", argLength: 1, reg: gp11, asm: "BLSIQ", clobberFlags: true}, // arg0 & -arg0 + {name: "BLSIL", argLength: 1, reg: gp11, asm: "BLSIL", clobberFlags: true}, // arg0 & -arg0 + {name: "BLSMSKQ", argLength: 1, reg: gp11, asm: "BLSMSKQ", clobberFlags: true}, // arg0 ^ (arg0 - 1) + {name: "BLSMSKL", argLength: 1, reg: gp11, asm: "BLSMSKL", clobberFlags: true}, // arg0 ^ (arg0 - 1) + {name: "BLSRQ", argLength: 1, reg: gp11, asm: "BLSRQ", clobberFlags: true}, // arg0 & (arg0 - 1) + {name: "BLSRL", argLength: 1, reg: gp11, asm: "BLSRL", clobberFlags: true}, // arg0 & (arg0 - 1) + // count the number of trailing zero bits, prefer TZCNTQ over BSFQ, as TZCNTQ(0)==64 + // and BSFQ(0) is undefined. Same for TZCNTL(0)==32 + {name: "TZCNTQ", argLength: 1, reg: gp11, asm: "TZCNTQ", clobberFlags: true}, + {name: "TZCNTL", argLength: 1, reg: gp11, asm: "TZCNTL", clobberFlags: true}, + + // CPUID feature: LZCNT. + // count the number of leading zero bits. + {name: "LZCNTQ", argLength: 1, reg: gp11, asm: "LZCNTQ", typ: "UInt64", clobberFlags: true}, + {name: "LZCNTL", argLength: 1, reg: gp11, asm: "LZCNTL", typ: "UInt32", clobberFlags: true}, + + // CPUID feature: MOVBE + // MOVBEWload does not satisfy zero extended, so only use MOVBEWstore + {name: "MOVBEWstore", argLength: 3, reg: gpstore, asm: "MOVBEW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // swap and store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem + {name: "MOVBELload", argLength: 2, reg: gpload, asm: "MOVBEL", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load and swap 4 bytes from arg0+auxint+aux. arg1=mem. Zero extend. + {name: "MOVBELstore", argLength: 3, reg: gpstore, asm: "MOVBEL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // swap and store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem + {name: "MOVBEQload", argLength: 2, reg: gpload, asm: "MOVBEQ", aux: "SymOff", typ: "UInt64", faultOnNilArg0: true, symEffect: "Read"}, // load and swap 8 bytes from arg0+auxint+aux. arg1=mem + {name: "MOVBEQstore", argLength: 3, reg: gpstore, asm: "MOVBEQ", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // swap and store 8 bytes in arg1 to arg0+auxint+aux. arg2=mem + // indexed MOVBE loads + {name: "MOVBELloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVBEL", scale: 1, aux: "SymOff", typ: "UInt32", symEffect: "Read"}, // load and swap 4 bytes from arg0+arg1+auxint+aux. arg2=mem. Zero extend. + {name: "MOVBELloadidx4", argLength: 3, reg: gploadidx, asm: "MOVBEL", scale: 4, aux: "SymOff", typ: "UInt32", symEffect: "Read"}, // load and swap 4 bytes from arg0+4*arg1+auxint+aux. arg2=mem. Zero extend. + {name: "MOVBELloadidx8", argLength: 3, reg: gploadidx, asm: "MOVBEL", scale: 8, aux: "SymOff", typ: "UInt32", symEffect: "Read"}, // load and swap 4 bytes from arg0+8*arg1+auxint+aux. arg2=mem. Zero extend. + {name: "MOVBEQloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVBEQ", scale: 1, aux: "SymOff", typ: "UInt64", symEffect: "Read"}, // load and swap 8 bytes from arg0+arg1+auxint+aux. arg2=mem + {name: "MOVBEQloadidx8", argLength: 3, reg: gploadidx, asm: "MOVBEQ", scale: 8, aux: "SymOff", typ: "UInt64", symEffect: "Read"}, // load and swap 8 bytes from arg0+8*arg1+auxint+aux. arg2=mem + // indexed MOVBE stores + {name: "MOVBEWstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVBEW", scale: 1, aux: "SymOff", symEffect: "Write"}, // swap and store 2 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem + {name: "MOVBEWstoreidx2", argLength: 4, reg: gpstoreidx, asm: "MOVBEW", scale: 2, aux: "SymOff", symEffect: "Write"}, // swap and store 2 bytes in arg2 to arg0+2*arg1+auxint+aux. arg3=mem + {name: "MOVBELstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVBEL", scale: 1, aux: "SymOff", symEffect: "Write"}, // swap and store 4 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem + {name: "MOVBELstoreidx4", argLength: 4, reg: gpstoreidx, asm: "MOVBEL", scale: 4, aux: "SymOff", symEffect: "Write"}, // swap and store 4 bytes in arg2 to arg0+4*arg1+auxint+aux. arg3=mem + {name: "MOVBELstoreidx8", argLength: 4, reg: gpstoreidx, asm: "MOVBEL", scale: 8, aux: "SymOff", symEffect: "Write"}, // swap and store 4 bytes in arg2 to arg0+8*arg1+auxint+aux. arg3=mem + {name: "MOVBEQstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVBEQ", scale: 1, aux: "SymOff", symEffect: "Write"}, // swap and store 8 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem + {name: "MOVBEQstoreidx8", argLength: 4, reg: gpstoreidx, asm: "MOVBEQ", scale: 8, aux: "SymOff", symEffect: "Write"}, // swap and store 8 bytes in arg2 to arg0+8*arg1+auxint+aux. arg3=mem + + // CPUID feature: BMI2. + {name: "SARXQ", argLength: 2, reg: gp21, asm: "SARXQ"}, // signed arg0 >> arg1, shift amount is mod 64 + {name: "SARXL", argLength: 2, reg: gp21, asm: "SARXL"}, // signed int32(arg0) >> arg1, shift amount is mod 32 + {name: "SHLXQ", argLength: 2, reg: gp21, asm: "SHLXQ"}, // arg0 << arg1, shift amount is mod 64 + {name: "SHLXL", argLength: 2, reg: gp21, asm: "SHLXL"}, // arg0 << arg1, shift amount is mod 32 + {name: "SHRXQ", argLength: 2, reg: gp21, asm: "SHRXQ"}, // unsigned arg0 >> arg1, shift amount is mod 64 + {name: "SHRXL", argLength: 2, reg: gp21, asm: "SHRXL"}, // unsigned uint32(arg0) >> arg1, shift amount is mod 32 + + {name: "SARXLload", argLength: 3, reg: gp21shxload, asm: "SARXL", aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // signed *(arg0+auxint+aux) >> arg1, arg2=mem, shift amount is mod 32 + {name: "SARXQload", argLength: 3, reg: gp21shxload, asm: "SARXQ", aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // signed *(arg0+auxint+aux) >> arg1, arg2=mem, shift amount is mod 64 + {name: "SHLXLload", argLength: 3, reg: gp21shxload, asm: "SHLXL", aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // *(arg0+auxint+aux) << arg1, arg2=mem, shift amount is mod 32 + {name: "SHLXQload", argLength: 3, reg: gp21shxload, asm: "SHLXQ", aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // *(arg0+auxint+aux) << arg1, arg2=mem, shift amount is mod 64 + {name: "SHRXLload", argLength: 3, reg: gp21shxload, asm: "SHRXL", aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+auxint+aux) >> arg1, arg2=mem, shift amount is mod 32 + {name: "SHRXQload", argLength: 3, reg: gp21shxload, asm: "SHRXQ", aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+auxint+aux) >> arg1, arg2=mem, shift amount is mod 64 + + {name: "SARXLloadidx1", argLength: 4, reg: gp21shxloadidx, asm: "SARXL", scale: 1, aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // signed *(arg0+1*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 32 + {name: "SARXLloadidx4", argLength: 4, reg: gp21shxloadidx, asm: "SARXL", scale: 4, aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // signed *(arg0+4*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 32 + {name: "SARXLloadidx8", argLength: 4, reg: gp21shxloadidx, asm: "SARXL", scale: 8, aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // signed *(arg0+8*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 32 + {name: "SARXQloadidx1", argLength: 4, reg: gp21shxloadidx, asm: "SARXQ", scale: 1, aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // signed *(arg0+1*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 64 + {name: "SARXQloadidx8", argLength: 4, reg: gp21shxloadidx, asm: "SARXQ", scale: 8, aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // signed *(arg0+8*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 64 + {name: "SHLXLloadidx1", argLength: 4, reg: gp21shxloadidx, asm: "SHLXL", scale: 1, aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // *(arg0+1*arg1+auxint+aux) << arg2, arg3=mem, shift amount is mod 32 + {name: "SHLXLloadidx4", argLength: 4, reg: gp21shxloadidx, asm: "SHLXL", scale: 4, aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // *(arg0+4*arg1+auxint+aux) << arg2, arg3=mem, shift amount is mod 32 + {name: "SHLXLloadidx8", argLength: 4, reg: gp21shxloadidx, asm: "SHLXL", scale: 8, aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // *(arg0+8*arg1+auxint+aux) << arg2, arg3=mem, shift amount is mod 32 + {name: "SHLXQloadidx1", argLength: 4, reg: gp21shxloadidx, asm: "SHLXQ", scale: 1, aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // *(arg0+1*arg1+auxint+aux) << arg2, arg3=mem, shift amount is mod 64 + {name: "SHLXQloadidx8", argLength: 4, reg: gp21shxloadidx, asm: "SHLXQ", scale: 8, aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // *(arg0+8*arg1+auxint+aux) << arg2, arg3=mem, shift amount is mod 64 + {name: "SHRXLloadidx1", argLength: 4, reg: gp21shxloadidx, asm: "SHRXL", scale: 1, aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+1*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 32 + {name: "SHRXLloadidx4", argLength: 4, reg: gp21shxloadidx, asm: "SHRXL", scale: 4, aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+4*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 32 + {name: "SHRXLloadidx8", argLength: 4, reg: gp21shxloadidx, asm: "SHRXL", scale: 8, aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+8*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 32 + {name: "SHRXQloadidx1", argLength: 4, reg: gp21shxloadidx, asm: "SHRXQ", scale: 1, aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+1*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 64 + {name: "SHRXQloadidx8", argLength: 4, reg: gp21shxloadidx, asm: "SHRXQ", scale: 8, aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+8*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 64 + } + + var AMD64blocks = []blockData{ + {name: "EQ", controls: 1}, + {name: "NE", controls: 1}, + {name: "LT", controls: 1}, + {name: "LE", controls: 1}, + {name: "GT", controls: 1}, + {name: "GE", controls: 1}, + {name: "OS", controls: 1}, + {name: "OC", controls: 1}, + {name: "ULT", controls: 1}, + {name: "ULE", controls: 1}, + {name: "UGT", controls: 1}, + {name: "UGE", controls: 1}, + {name: "EQF", controls: 1}, + {name: "NEF", controls: 1}, + {name: "ORD", controls: 1}, // FP, ordered comparison (parity zero) + {name: "NAN", controls: 1}, // FP, unordered comparison (parity one) + + // JUMPTABLE implements jump tables. + // Aux is the symbol (an *obj.LSym) for the jump table. + // control[0] is the index into the jump table. + // control[1] is the address of the jump table (the address of the symbol stored in Aux). + {name: "JUMPTABLE", controls: 2, aux: "Sym"}, + } + + archs = append(archs, arch{ + name: "AMD64", + pkg: "cmd/internal/obj/x86", + genfile: "../../amd64/ssa.go", + ops: AMD64ops, + blocks: AMD64blocks, + regnames: regNamesAMD64, + ParamIntRegNames: "AX BX CX DI SI R8 R9 R10 R11", + ParamFloatRegNames: "X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14", + gpregmask: gp, + fpregmask: fp, + specialregmask: x15, + framepointerreg: int8(num["BP"]), + linkreg: -1, // not used + }) +} diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64latelower.rules b/src/cmd/compile/internal/ssa/_gen/AMD64latelower.rules new file mode 100644 index 0000000..a1e63d6 --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/AMD64latelower.rules @@ -0,0 +1,8 @@ +// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Prefer SARX/SHLX/SHRX instruction because it has less register restriction on the shift input.
+(SAR(Q|L) x y) && buildcfg.GOAMD64 >= 3 => (SARX(Q|L) x y)
+(SHL(Q|L) x y) && buildcfg.GOAMD64 >= 3 => (SHLX(Q|L) x y)
+(SHR(Q|L) x y) && buildcfg.GOAMD64 >= 3 => (SHRX(Q|L) x y)
diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64splitload.rules b/src/cmd/compile/internal/ssa/_gen/AMD64splitload.rules new file mode 100644 index 0000000..dd8f8ac --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/AMD64splitload.rules @@ -0,0 +1,45 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file contains rules used by flagalloc and addressingmodes to +// split a flag-generating merged load op into separate load and op. +// Unlike with the other rules files, not all of these +// rules will be applied to all values. +// Rather, flagalloc will request for rules to be applied +// to a particular problematic value. +// These are often the exact inverse of rules in AMD64.rules, +// only with the conditions removed. +// +// For addressingmodes, certain single instructions are slower than the two instruction +// split generated here (which is different from the inputs to addressingmodes). +// For example: +// (CMPBconstload c (ADDQ x y)) -> (CMPBconstloadidx1 c x y) -> (CMPB c (MOVBloadidx1 x y)) + +(CMP(Q|L|W|B)load {sym} [off] ptr x mem) => (CMP(Q|L|W|B) (MOV(Q|L|W|B)load {sym} [off] ptr mem) x) + +(CMP(Q|L|W|B)constload {sym} [vo] ptr mem) && vo.Val() == 0 => (TEST(Q|L|W|B) x:(MOV(Q|L|W|B)load {sym} [vo.Off()] ptr mem) x) + +(CMPQconstload {sym} [vo] ptr mem) && vo.Val() != 0 => (CMPQconst (MOVQload {sym} [vo.Off()] ptr mem) [vo.Val()]) +(CMPLconstload {sym} [vo] ptr mem) && vo.Val() != 0 => (CMPLconst (MOVLload {sym} [vo.Off()] ptr mem) [vo.Val()]) +(CMPWconstload {sym} [vo] ptr mem) && vo.Val() != 0 => (CMPWconst (MOVWload {sym} [vo.Off()] ptr mem) [vo.Val16()]) +(CMPBconstload {sym} [vo] ptr mem) && vo.Val() != 0 => (CMPBconst (MOVBload {sym} [vo.Off()] ptr mem) [vo.Val8()]) + +(CMP(Q|L|W|B)loadidx1 {sym} [off] ptr idx x mem) => (CMP(Q|L|W|B) (MOV(Q|L|W|B)loadidx1 {sym} [off] ptr idx mem) x) +(CMPQloadidx8 {sym} [off] ptr idx x mem) => (CMPQ (MOVQloadidx8 {sym} [off] ptr idx mem) x) +(CMPLloadidx4 {sym} [off] ptr idx x mem) => (CMPL (MOVLloadidx4 {sym} [off] ptr idx mem) x) +(CMPWloadidx2 {sym} [off] ptr idx x mem) => (CMPW (MOVWloadidx2 {sym} [off] ptr idx mem) x) + +(CMP(Q|L|W|B)constloadidx1 {sym} [vo] ptr idx mem) && vo.Val() == 0 => (TEST(Q|L|W|B) x:(MOV(Q|L|W|B)loadidx1 {sym} [vo.Off()] ptr idx mem) x) +(CMPQconstloadidx8 {sym} [vo] ptr idx mem) && vo.Val() == 0 => (TESTQ x:(MOVQloadidx8 {sym} [vo.Off()] ptr idx mem) x) +(CMPLconstloadidx4 {sym} [vo] ptr idx mem) && vo.Val() == 0 => (TESTL x:(MOVLloadidx4 {sym} [vo.Off()] ptr idx mem) x) +(CMPWconstloadidx2 {sym} [vo] ptr idx mem) && vo.Val() == 0 => (TESTW x:(MOVWloadidx2 {sym} [vo.Off()] ptr idx mem) x) + +(CMPQconstloadidx1 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPQconst (MOVQloadidx1 {sym} [vo.Off()] ptr idx mem) [vo.Val()]) +(CMPLconstloadidx1 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPLconst (MOVLloadidx1 {sym} [vo.Off()] ptr idx mem) [vo.Val()]) +(CMPWconstloadidx1 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPWconst (MOVWloadidx1 {sym} [vo.Off()] ptr idx mem) [vo.Val16()]) +(CMPBconstloadidx1 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPBconst (MOVBloadidx1 {sym} [vo.Off()] ptr idx mem) [vo.Val8()]) + +(CMPQconstloadidx8 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPQconst (MOVQloadidx8 {sym} [vo.Off()] ptr idx mem) [vo.Val()]) +(CMPLconstloadidx4 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPLconst (MOVLloadidx4 {sym} [vo.Off()] ptr idx mem) [vo.Val()]) +(CMPWconstloadidx2 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPWconst (MOVWloadidx2 {sym} [vo.Off()] ptr idx mem) [vo.Val16()]) diff --git a/src/cmd/compile/internal/ssa/_gen/ARM.rules b/src/cmd/compile/internal/ssa/_gen/ARM.rules new file mode 100644 index 0000000..e5898b0 --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/ARM.rules @@ -0,0 +1,1474 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +(Add(Ptr|32|16|8) ...) => (ADD ...) +(Add(32|64)F ...) => (ADD(F|D) ...) +(Add32carry ...) => (ADDS ...) +(Add32withcarry ...) => (ADC ...) + +(Sub(Ptr|32|16|8) ...) => (SUB ...) +(Sub(32|64)F ...) => (SUB(F|D) ...) +(Sub32carry ...) => (SUBS ...) +(Sub32withcarry ...) => (SBC ...) + +(Mul(32|16|8) ...) => (MUL ...) +(Mul(32|64)F ...) => (MUL(F|D) ...) +(Hmul(32|32u) ...) => (HMU(L|LU) ...) +(Mul32uhilo ...) => (MULLU ...) + +(Div32 x y) => + (SUB (XOR <typ.UInt32> // negate the result if one operand is negative + (Select0 <typ.UInt32> (CALLudiv + (SUB <typ.UInt32> (XOR x <typ.UInt32> (Signmask x)) (Signmask x)) // negate x if negative + (SUB <typ.UInt32> (XOR y <typ.UInt32> (Signmask y)) (Signmask y)))) // negate y if negative + (Signmask (XOR <typ.UInt32> x y))) (Signmask (XOR <typ.UInt32> x y))) +(Div32u x y) => (Select0 <typ.UInt32> (CALLudiv x y)) +(Div16 x y) => (Div32 (SignExt16to32 x) (SignExt16to32 y)) +(Div16u x y) => (Div32u (ZeroExt16to32 x) (ZeroExt16to32 y)) +(Div8 x y) => (Div32 (SignExt8to32 x) (SignExt8to32 y)) +(Div8u x y) => (Div32u (ZeroExt8to32 x) (ZeroExt8to32 y)) +(Div(32|64)F ...) => (DIV(F|D) ...) + +(Mod32 x y) => + (SUB (XOR <typ.UInt32> // negate the result if x is negative + (Select1 <typ.UInt32> (CALLudiv + (SUB <typ.UInt32> (XOR <typ.UInt32> x (Signmask x)) (Signmask x)) // negate x if negative + (SUB <typ.UInt32> (XOR <typ.UInt32> y (Signmask y)) (Signmask y)))) // negate y if negative + (Signmask x)) (Signmask x)) +(Mod32u x y) => (Select1 <typ.UInt32> (CALLudiv x y)) +(Mod16 x y) => (Mod32 (SignExt16to32 x) (SignExt16to32 y)) +(Mod16u x y) => (Mod32u (ZeroExt16to32 x) (ZeroExt16to32 y)) +(Mod8 x y) => (Mod32 (SignExt8to32 x) (SignExt8to32 y)) +(Mod8u x y) => (Mod32u (ZeroExt8to32 x) (ZeroExt8to32 y)) + +// (x + y) / 2 with x>=y -> (x - y) / 2 + y +(Avg32u <t> x y) => (ADD (SRLconst <t> (SUB <t> x y) [1]) y) + +(And(32|16|8) ...) => (AND ...) +(Or(32|16|8) ...) => (OR ...) +(Xor(32|16|8) ...) => (XOR ...) + +// unary ops +(Neg(32|16|8) x) => (RSBconst [0] x) +(Neg(32|64)F ...) => (NEG(F|D) ...) + +(Com(32|16|8) ...) => (MVN ...) + +(Sqrt ...) => (SQRTD ...) +(Sqrt32 ...) => (SQRTF ...) +(Abs ...) => (ABSD ...) + +// TODO: optimize this for ARMv5 and ARMv6 +(Ctz32NonZero ...) => (Ctz32 ...) +(Ctz16NonZero ...) => (Ctz32 ...) +(Ctz8NonZero ...) => (Ctz32 ...) + +// count trailing zero for ARMv5 and ARMv6 +// 32 - CLZ(x&-x - 1) +(Ctz32 <t> x) && buildcfg.GOARM<=6 => + (RSBconst [32] (CLZ <t> (SUBconst <t> (AND <t> x (RSBconst <t> [0] x)) [1]))) +(Ctz16 <t> x) && buildcfg.GOARM<=6 => + (RSBconst [32] (CLZ <t> (SUBconst <typ.UInt32> (AND <typ.UInt32> (ORconst <typ.UInt32> [0x10000] x) (RSBconst <typ.UInt32> [0] (ORconst <typ.UInt32> [0x10000] x))) [1]))) +(Ctz8 <t> x) && buildcfg.GOARM<=6 => + (RSBconst [32] (CLZ <t> (SUBconst <typ.UInt32> (AND <typ.UInt32> (ORconst <typ.UInt32> [0x100] x) (RSBconst <typ.UInt32> [0] (ORconst <typ.UInt32> [0x100] x))) [1]))) + +// count trailing zero for ARMv7 +(Ctz32 <t> x) && buildcfg.GOARM==7 => (CLZ <t> (RBIT <t> x)) +(Ctz16 <t> x) && buildcfg.GOARM==7 => (CLZ <t> (RBIT <typ.UInt32> (ORconst <typ.UInt32> [0x10000] x))) +(Ctz8 <t> x) && buildcfg.GOARM==7 => (CLZ <t> (RBIT <typ.UInt32> (ORconst <typ.UInt32> [0x100] x))) + +// bit length +(BitLen32 <t> x) => (RSBconst [32] (CLZ <t> x)) + +// byte swap for ARMv5 +// let (a, b, c, d) be the bytes of x from high to low +// t1 = x right rotate 16 bits -- (c, d, a, b ) +// t2 = x ^ t1 -- (a^c, b^d, a^c, b^d) +// t3 = t2 &^ 0xff0000 -- (a^c, 0, a^c, b^d) +// t4 = t3 >> 8 -- (0, a^c, 0, a^c) +// t5 = x right rotate 8 bits -- (d, a, b, c ) +// result = t4 ^ t5 -- (d, c, b, a ) +// using shifted ops this can be done in 4 instructions. +(Bswap32 <t> x) && buildcfg.GOARM==5 => + (XOR <t> + (SRLconst <t> (BICconst <t> (XOR <t> x (SRRconst <t> [16] x)) [0xff0000]) [8]) + (SRRconst <t> x [8])) + +// byte swap for ARMv6 and above +(Bswap32 x) && buildcfg.GOARM>=6 => (REV x) + +// boolean ops -- booleans are represented with 0=false, 1=true +(AndB ...) => (AND ...) +(OrB ...) => (OR ...) +(EqB x y) => (XORconst [1] (XOR <typ.Bool> x y)) +(NeqB ...) => (XOR ...) +(Not x) => (XORconst [1] x) + +// shifts +// hardware instruction uses only the low byte of the shift +// we compare to 256 to ensure Go semantics for large shifts +(Lsh32x32 x y) => (CMOVWHSconst (SLL <x.Type> x y) (CMPconst [256] y) [0]) +(Lsh32x16 x y) => (CMOVWHSconst (SLL <x.Type> x (ZeroExt16to32 y)) (CMPconst [256] (ZeroExt16to32 y)) [0]) +(Lsh32x8 x y) => (SLL x (ZeroExt8to32 y)) + +(Lsh16x32 x y) => (CMOVWHSconst (SLL <x.Type> x y) (CMPconst [256] y) [0]) +(Lsh16x16 x y) => (CMOVWHSconst (SLL <x.Type> x (ZeroExt16to32 y)) (CMPconst [256] (ZeroExt16to32 y)) [0]) +(Lsh16x8 x y) => (SLL x (ZeroExt8to32 y)) + +(Lsh8x32 x y) => (CMOVWHSconst (SLL <x.Type> x y) (CMPconst [256] y) [0]) +(Lsh8x16 x y) => (CMOVWHSconst (SLL <x.Type> x (ZeroExt16to32 y)) (CMPconst [256] (ZeroExt16to32 y)) [0]) +(Lsh8x8 x y) => (SLL x (ZeroExt8to32 y)) + +(Rsh32Ux32 x y) => (CMOVWHSconst (SRL <x.Type> x y) (CMPconst [256] y) [0]) +(Rsh32Ux16 x y) => (CMOVWHSconst (SRL <x.Type> x (ZeroExt16to32 y)) (CMPconst [256] (ZeroExt16to32 y)) [0]) +(Rsh32Ux8 x y) => (SRL x (ZeroExt8to32 y)) + +(Rsh16Ux32 x y) => (CMOVWHSconst (SRL <x.Type> (ZeroExt16to32 x) y) (CMPconst [256] y) [0]) +(Rsh16Ux16 x y) => (CMOVWHSconst (SRL <x.Type> (ZeroExt16to32 x) (ZeroExt16to32 y)) (CMPconst [256] (ZeroExt16to32 y)) [0]) +(Rsh16Ux8 x y) => (SRL (ZeroExt16to32 x) (ZeroExt8to32 y)) + +(Rsh8Ux32 x y) => (CMOVWHSconst (SRL <x.Type> (ZeroExt8to32 x) y) (CMPconst [256] y) [0]) +(Rsh8Ux16 x y) => (CMOVWHSconst (SRL <x.Type> (ZeroExt8to32 x) (ZeroExt16to32 y)) (CMPconst [256] (ZeroExt16to32 y)) [0]) +(Rsh8Ux8 x y) => (SRL (ZeroExt8to32 x) (ZeroExt8to32 y)) + +(Rsh32x32 x y) => (SRAcond x y (CMPconst [256] y)) +(Rsh32x16 x y) => (SRAcond x (ZeroExt16to32 y) (CMPconst [256] (ZeroExt16to32 y))) +(Rsh32x8 x y) => (SRA x (ZeroExt8to32 y)) + +(Rsh16x32 x y) => (SRAcond (SignExt16to32 x) y (CMPconst [256] y)) +(Rsh16x16 x y) => (SRAcond (SignExt16to32 x) (ZeroExt16to32 y) (CMPconst [256] (ZeroExt16to32 y))) +(Rsh16x8 x y) => (SRA (SignExt16to32 x) (ZeroExt8to32 y)) + +(Rsh8x32 x y) => (SRAcond (SignExt8to32 x) y (CMPconst [256] y)) +(Rsh8x16 x y) => (SRAcond (SignExt8to32 x) (ZeroExt16to32 y) (CMPconst [256] (ZeroExt16to32 y))) +(Rsh8x8 x y) => (SRA (SignExt8to32 x) (ZeroExt8to32 y)) + +// constant shifts +// generic opt rewrites all constant shifts to shift by Const64 +(Lsh32x64 x (Const64 [c])) && uint64(c) < 32 => (SLLconst x [int32(c)]) +(Rsh32x64 x (Const64 [c])) && uint64(c) < 32 => (SRAconst x [int32(c)]) +(Rsh32Ux64 x (Const64 [c])) && uint64(c) < 32 => (SRLconst x [int32(c)]) +(Lsh16x64 x (Const64 [c])) && uint64(c) < 16 => (SLLconst x [int32(c)]) +(Rsh16x64 x (Const64 [c])) && uint64(c) < 16 => (SRAconst (SLLconst <typ.UInt32> x [16]) [int32(c+16)]) +(Rsh16Ux64 x (Const64 [c])) && uint64(c) < 16 => (SRLconst (SLLconst <typ.UInt32> x [16]) [int32(c+16)]) +(Lsh8x64 x (Const64 [c])) && uint64(c) < 8 => (SLLconst x [int32(c)]) +(Rsh8x64 x (Const64 [c])) && uint64(c) < 8 => (SRAconst (SLLconst <typ.UInt32> x [24]) [int32(c+24)]) +(Rsh8Ux64 x (Const64 [c])) && uint64(c) < 8 => (SRLconst (SLLconst <typ.UInt32> x [24]) [int32(c+24)]) + +// large constant shifts +(Lsh32x64 _ (Const64 [c])) && uint64(c) >= 32 => (Const32 [0]) +(Rsh32Ux64 _ (Const64 [c])) && uint64(c) >= 32 => (Const32 [0]) +(Lsh16x64 _ (Const64 [c])) && uint64(c) >= 16 => (Const16 [0]) +(Rsh16Ux64 _ (Const64 [c])) && uint64(c) >= 16 => (Const16 [0]) +(Lsh8x64 _ (Const64 [c])) && uint64(c) >= 8 => (Const8 [0]) +(Rsh8Ux64 _ (Const64 [c])) && uint64(c) >= 8 => (Const8 [0]) + +// large constant signed right shift, we leave the sign bit +(Rsh32x64 x (Const64 [c])) && uint64(c) >= 32 => (SRAconst x [31]) +(Rsh16x64 x (Const64 [c])) && uint64(c) >= 16 => (SRAconst (SLLconst <typ.UInt32> x [16]) [31]) +(Rsh8x64 x (Const64 [c])) && uint64(c) >= 8 => (SRAconst (SLLconst <typ.UInt32> x [24]) [31]) + +// constants +(Const(8|16|32) [val]) => (MOVWconst [int32(val)]) +(Const(32|64)F [val]) => (MOV(F|D)const [float64(val)]) +(ConstNil) => (MOVWconst [0]) +(ConstBool [t]) => (MOVWconst [b2i32(t)]) + +// truncations +// Because we ignore high parts of registers, truncates are just copies. +(Trunc16to8 ...) => (Copy ...) +(Trunc32to8 ...) => (Copy ...) +(Trunc32to16 ...) => (Copy ...) + +// Zero-/Sign-extensions +(ZeroExt8to16 ...) => (MOVBUreg ...) +(ZeroExt8to32 ...) => (MOVBUreg ...) +(ZeroExt16to32 ...) => (MOVHUreg ...) + +(SignExt8to16 ...) => (MOVBreg ...) +(SignExt8to32 ...) => (MOVBreg ...) +(SignExt16to32 ...) => (MOVHreg ...) + +(Signmask x) => (SRAconst x [31]) +(Zeromask x) => (SRAconst (RSBshiftRL <typ.Int32> x x [1]) [31]) // sign bit of uint32(x)>>1 - x +(Slicemask <t> x) => (SRAconst (RSBconst <t> [0] x) [31]) + +// float <-> int conversion +(Cvt32to32F ...) => (MOVWF ...) +(Cvt32to64F ...) => (MOVWD ...) +(Cvt32Uto32F ...) => (MOVWUF ...) +(Cvt32Uto64F ...) => (MOVWUD ...) +(Cvt32Fto32 ...) => (MOVFW ...) +(Cvt64Fto32 ...) => (MOVDW ...) +(Cvt32Fto32U ...) => (MOVFWU ...) +(Cvt64Fto32U ...) => (MOVDWU ...) +(Cvt32Fto64F ...) => (MOVFD ...) +(Cvt64Fto32F ...) => (MOVDF ...) + +(Round(32|64)F ...) => (Copy ...) + +(CvtBoolToUint8 ...) => (Copy ...) + +// fused-multiply-add +(FMA x y z) => (FMULAD z x y) + +// comparisons +(Eq8 x y) => (Equal (CMP (ZeroExt8to32 x) (ZeroExt8to32 y))) +(Eq16 x y) => (Equal (CMP (ZeroExt16to32 x) (ZeroExt16to32 y))) +(Eq32 x y) => (Equal (CMP x y)) +(EqPtr x y) => (Equal (CMP x y)) +(Eq(32|64)F x y) => (Equal (CMP(F|D) x y)) + +(Neq8 x y) => (NotEqual (CMP (ZeroExt8to32 x) (ZeroExt8to32 y))) +(Neq16 x y) => (NotEqual (CMP (ZeroExt16to32 x) (ZeroExt16to32 y))) +(Neq32 x y) => (NotEqual (CMP x y)) +(NeqPtr x y) => (NotEqual (CMP x y)) +(Neq(32|64)F x y) => (NotEqual (CMP(F|D) x y)) + +(Less8 x y) => (LessThan (CMP (SignExt8to32 x) (SignExt8to32 y))) +(Less16 x y) => (LessThan (CMP (SignExt16to32 x) (SignExt16to32 y))) +(Less32 x y) => (LessThan (CMP x y)) +(Less(32|64)F x y) => (GreaterThan (CMP(F|D) y x)) // reverse operands to work around NaN + +(Less8U x y) => (LessThanU (CMP (ZeroExt8to32 x) (ZeroExt8to32 y))) +(Less16U x y) => (LessThanU (CMP (ZeroExt16to32 x) (ZeroExt16to32 y))) +(Less32U x y) => (LessThanU (CMP x y)) + +(Leq8 x y) => (LessEqual (CMP (SignExt8to32 x) (SignExt8to32 y))) +(Leq16 x y) => (LessEqual (CMP (SignExt16to32 x) (SignExt16to32 y))) +(Leq32 x y) => (LessEqual (CMP x y)) +(Leq(32|64)F x y) => (GreaterEqual (CMP(F|D) y x)) // reverse operands to work around NaN + +(Leq8U x y) => (LessEqualU (CMP (ZeroExt8to32 x) (ZeroExt8to32 y))) +(Leq16U x y) => (LessEqualU (CMP (ZeroExt16to32 x) (ZeroExt16to32 y))) +(Leq32U x y) => (LessEqualU (CMP x y)) + +(OffPtr [off] ptr:(SP)) => (MOVWaddr [int32(off)] ptr) +(OffPtr [off] ptr) => (ADDconst [int32(off)] ptr) + +(Addr {sym} base) => (MOVWaddr {sym} base) +(LocalAddr {sym} base _) => (MOVWaddr {sym} base) + +// loads +(Load <t> ptr mem) && t.IsBoolean() => (MOVBUload ptr mem) +(Load <t> ptr mem) && (is8BitInt(t) && isSigned(t)) => (MOVBload ptr mem) +(Load <t> ptr mem) && (is8BitInt(t) && !isSigned(t)) => (MOVBUload ptr mem) +(Load <t> ptr mem) && (is16BitInt(t) && isSigned(t)) => (MOVHload ptr mem) +(Load <t> ptr mem) && (is16BitInt(t) && !isSigned(t)) => (MOVHUload ptr mem) +(Load <t> ptr mem) && (is32BitInt(t) || isPtr(t)) => (MOVWload ptr mem) +(Load <t> ptr mem) && is32BitFloat(t) => (MOVFload ptr mem) +(Load <t> ptr mem) && is64BitFloat(t) => (MOVDload ptr mem) + +// stores +(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 4 && !is32BitFloat(val.Type) => (MOVWstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (MOVFstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (MOVDstore ptr val mem) + +// zero instructions +(Zero [0] _ mem) => mem +(Zero [1] ptr mem) => (MOVBstore ptr (MOVWconst [0]) mem) +(Zero [2] {t} ptr mem) && t.Alignment()%2 == 0 => + (MOVHstore ptr (MOVWconst [0]) mem) +(Zero [2] ptr mem) => + (MOVBstore [1] ptr (MOVWconst [0]) + (MOVBstore [0] ptr (MOVWconst [0]) mem)) +(Zero [4] {t} ptr mem) && t.Alignment()%4 == 0 => + (MOVWstore ptr (MOVWconst [0]) mem) +(Zero [4] {t} ptr mem) && t.Alignment()%2 == 0 => + (MOVHstore [2] ptr (MOVWconst [0]) + (MOVHstore [0] ptr (MOVWconst [0]) mem)) +(Zero [4] ptr mem) => + (MOVBstore [3] ptr (MOVWconst [0]) + (MOVBstore [2] ptr (MOVWconst [0]) + (MOVBstore [1] ptr (MOVWconst [0]) + (MOVBstore [0] ptr (MOVWconst [0]) mem)))) + +(Zero [3] ptr mem) => + (MOVBstore [2] ptr (MOVWconst [0]) + (MOVBstore [1] ptr (MOVWconst [0]) + (MOVBstore [0] ptr (MOVWconst [0]) mem))) + +// Medium zeroing uses a duff device +// 4 and 128 are magic constants, see runtime/mkduff.go +(Zero [s] {t} ptr mem) + && s%4 == 0 && s > 4 && s <= 512 + && t.Alignment()%4 == 0 && !config.noDuffDevice => + (DUFFZERO [4 * (128 - s/4)] ptr (MOVWconst [0]) mem) + +// Large zeroing uses a loop +(Zero [s] {t} ptr mem) + && (s > 512 || config.noDuffDevice) || t.Alignment()%4 != 0 => + (LoweredZero [t.Alignment()] + ptr + (ADDconst <ptr.Type> ptr [int32(s-moveSize(t.Alignment(), config))]) + (MOVWconst [0]) + mem) + +// moves +(Move [0] _ _ mem) => mem +(Move [1] dst src mem) => (MOVBstore dst (MOVBUload src mem) mem) +(Move [2] {t} dst src mem) && t.Alignment()%2 == 0 => + (MOVHstore dst (MOVHUload src mem) mem) +(Move [2] dst src mem) => + (MOVBstore [1] dst (MOVBUload [1] src mem) + (MOVBstore dst (MOVBUload src mem) mem)) +(Move [4] {t} dst src mem) && t.Alignment()%4 == 0 => + (MOVWstore dst (MOVWload src mem) mem) +(Move [4] {t} dst src mem) && t.Alignment()%2 == 0 => + (MOVHstore [2] dst (MOVHUload [2] src mem) + (MOVHstore dst (MOVHUload src mem) mem)) +(Move [4] dst src mem) => + (MOVBstore [3] dst (MOVBUload [3] src mem) + (MOVBstore [2] dst (MOVBUload [2] src mem) + (MOVBstore [1] dst (MOVBUload [1] src mem) + (MOVBstore dst (MOVBUload src mem) mem)))) + +(Move [3] dst src mem) => + (MOVBstore [2] dst (MOVBUload [2] src mem) + (MOVBstore [1] dst (MOVBUload [1] src mem) + (MOVBstore dst (MOVBUload src mem) mem))) + +// Medium move uses a duff device +// 8 and 128 are magic constants, see runtime/mkduff.go +(Move [s] {t} dst src mem) + && s%4 == 0 && s > 4 && s <= 512 + && t.Alignment()%4 == 0 && !config.noDuffDevice && logLargeCopy(v, s) => + (DUFFCOPY [8 * (128 - s/4)] dst src mem) + +// Large move uses a loop +(Move [s] {t} dst src mem) + && ((s > 512 || config.noDuffDevice) || t.Alignment()%4 != 0) && logLargeCopy(v, s) => + (LoweredMove [t.Alignment()] + dst + src + (ADDconst <src.Type> src [int32(s-moveSize(t.Alignment(), config))]) + mem) + +// calls +(StaticCall ...) => (CALLstatic ...) +(ClosureCall ...) => (CALLclosure ...) +(InterCall ...) => (CALLinter ...) +(TailCall ...) => (CALLtail ...) + +// checks +(NilCheck ...) => (LoweredNilCheck ...) +(IsNonNil ptr) => (NotEqual (CMPconst [0] ptr)) +(IsInBounds idx len) => (LessThanU (CMP idx len)) +(IsSliceInBounds idx len) => (LessEqualU (CMP idx len)) + +// pseudo-ops +(GetClosurePtr ...) => (LoweredGetClosurePtr ...) +(GetCallerSP ...) => (LoweredGetCallerSP ...) +(GetCallerPC ...) => (LoweredGetCallerPC ...) + +// Absorb pseudo-ops into blocks. +(If (Equal cc) yes no) => (EQ cc yes no) +(If (NotEqual cc) yes no) => (NE cc yes no) +(If (LessThan cc) yes no) => (LT cc yes no) +(If (LessThanU cc) yes no) => (ULT cc yes no) +(If (LessEqual cc) yes no) => (LE cc yes no) +(If (LessEqualU cc) yes no) => (ULE cc yes no) +(If (GreaterThan cc) yes no) => (GT cc yes no) +(If (GreaterThanU cc) yes no) => (UGT cc yes no) +(If (GreaterEqual cc) yes no) => (GE cc yes no) +(If (GreaterEqualU cc) yes no) => (UGE cc yes no) + +(If cond yes no) => (NE (CMPconst [0] cond) yes no) + +// Absorb boolean tests into block +(NE (CMPconst [0] (Equal cc)) yes no) => (EQ cc yes no) +(NE (CMPconst [0] (NotEqual cc)) yes no) => (NE cc yes no) +(NE (CMPconst [0] (LessThan cc)) yes no) => (LT cc yes no) +(NE (CMPconst [0] (LessThanU cc)) yes no) => (ULT cc yes no) +(NE (CMPconst [0] (LessEqual cc)) yes no) => (LE cc yes no) +(NE (CMPconst [0] (LessEqualU cc)) yes no) => (ULE cc yes no) +(NE (CMPconst [0] (GreaterThan cc)) yes no) => (GT cc yes no) +(NE (CMPconst [0] (GreaterThanU cc)) yes no) => (UGT cc yes no) +(NE (CMPconst [0] (GreaterEqual cc)) yes no) => (GE cc yes no) +(NE (CMPconst [0] (GreaterEqualU cc)) yes no) => (UGE cc yes no) + +// Write barrier. +(WB ...) => (LoweredWB ...) + +(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem) +(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem) +(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem) + +(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 0 => (LoweredPanicExtendA [kind] hi lo y mem) +(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 1 => (LoweredPanicExtendB [kind] hi lo y mem) +(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 2 => (LoweredPanicExtendC [kind] hi lo y mem) + +// Optimizations + +// fold offset into address +(ADDconst [off1] (MOVWaddr [off2] {sym} ptr)) => (MOVWaddr [off1+off2] {sym} ptr) +(SUBconst [off1] (MOVWaddr [off2] {sym} ptr)) => (MOVWaddr [off2-off1] {sym} ptr) + +// fold address into load/store +(MOVBload [off1] {sym} (ADDconst [off2] ptr) mem) => (MOVBload [off1+off2] {sym} ptr mem) +(MOVBload [off1] {sym} (SUBconst [off2] ptr) mem) => (MOVBload [off1-off2] {sym} ptr mem) +(MOVBUload [off1] {sym} (ADDconst [off2] ptr) mem) => (MOVBUload [off1+off2] {sym} ptr mem) +(MOVBUload [off1] {sym} (SUBconst [off2] ptr) mem) => (MOVBUload [off1-off2] {sym} ptr mem) +(MOVHload [off1] {sym} (ADDconst [off2] ptr) mem) => (MOVHload [off1+off2] {sym} ptr mem) +(MOVHload [off1] {sym} (SUBconst [off2] ptr) mem) => (MOVHload [off1-off2] {sym} ptr mem) +(MOVHUload [off1] {sym} (ADDconst [off2] ptr) mem) => (MOVHUload [off1+off2] {sym} ptr mem) +(MOVHUload [off1] {sym} (SUBconst [off2] ptr) mem) => (MOVHUload [off1-off2] {sym} ptr mem) +(MOVWload [off1] {sym} (ADDconst [off2] ptr) mem) => (MOVWload [off1+off2] {sym} ptr mem) +(MOVWload [off1] {sym} (SUBconst [off2] ptr) mem) => (MOVWload [off1-off2] {sym} ptr mem) +(MOVFload [off1] {sym} (ADDconst [off2] ptr) mem) => (MOVFload [off1+off2] {sym} ptr mem) +(MOVFload [off1] {sym} (SUBconst [off2] ptr) mem) => (MOVFload [off1-off2] {sym} ptr mem) +(MOVDload [off1] {sym} (ADDconst [off2] ptr) mem) => (MOVDload [off1+off2] {sym} ptr mem) +(MOVDload [off1] {sym} (SUBconst [off2] ptr) mem) => (MOVDload [off1-off2] {sym} ptr mem) + +(MOVBstore [off1] {sym} (ADDconst [off2] ptr) val mem) => (MOVBstore [off1+off2] {sym} ptr val mem) +(MOVBstore [off1] {sym} (SUBconst [off2] ptr) val mem) => (MOVBstore [off1-off2] {sym} ptr val mem) +(MOVHstore [off1] {sym} (ADDconst [off2] ptr) val mem) => (MOVHstore [off1+off2] {sym} ptr val mem) +(MOVHstore [off1] {sym} (SUBconst [off2] ptr) val mem) => (MOVHstore [off1-off2] {sym} ptr val mem) +(MOVWstore [off1] {sym} (ADDconst [off2] ptr) val mem) => (MOVWstore [off1+off2] {sym} ptr val mem) +(MOVWstore [off1] {sym} (SUBconst [off2] ptr) val mem) => (MOVWstore [off1-off2] {sym} ptr val mem) +(MOVFstore [off1] {sym} (ADDconst [off2] ptr) val mem) => (MOVFstore [off1+off2] {sym} ptr val mem) +(MOVFstore [off1] {sym} (SUBconst [off2] ptr) val mem) => (MOVFstore [off1-off2] {sym} ptr val mem) +(MOVDstore [off1] {sym} (ADDconst [off2] ptr) val mem) => (MOVDstore [off1+off2] {sym} ptr val mem) +(MOVDstore [off1] {sym} (SUBconst [off2] ptr) val mem) => (MOVDstore [off1-off2] {sym} ptr val mem) + +(MOVBload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) => + (MOVBload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVBUload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) => + (MOVBUload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVHload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) => + (MOVHload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVHUload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) => + (MOVHUload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVWload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) => + (MOVWload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVFload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) => + (MOVFload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVDload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) => + (MOVDload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) + +(MOVBstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) => + (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem) +(MOVHstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) => + (MOVHstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem) +(MOVWstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) => + (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem) +(MOVFstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) => + (MOVFstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem) +(MOVDstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) => + (MOVDstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem) + +// replace load from same location as preceding store with zero/sign extension (or copy in case of full width) +(MOVBload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBreg x) +(MOVBUload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBUreg x) +(MOVHload [off] {sym} ptr (MOVHstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVHreg x) +(MOVHUload [off] {sym} ptr (MOVHstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVHUreg x) +(MOVWload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x + +(MOVFload [off] {sym} ptr (MOVFstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x +(MOVDload [off] {sym} ptr (MOVDstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x + +(MOVWloadidx ptr idx (MOVWstoreidx ptr2 idx x _)) && isSamePtr(ptr, ptr2) => x +(MOVWloadshiftLL ptr idx [c] (MOVWstoreshiftLL ptr2 idx [d] x _)) && c==d && isSamePtr(ptr, ptr2) => x +(MOVWloadshiftRL ptr idx [c] (MOVWstoreshiftRL ptr2 idx [d] x _)) && c==d && isSamePtr(ptr, ptr2) => x +(MOVWloadshiftRA ptr idx [c] (MOVWstoreshiftRA ptr2 idx [d] x _)) && c==d && isSamePtr(ptr, ptr2) => x +(MOVBUloadidx ptr idx (MOVBstoreidx ptr2 idx x _)) && isSamePtr(ptr, ptr2) => (MOVBUreg x) +(MOVBloadidx ptr idx (MOVBstoreidx ptr2 idx x _)) && isSamePtr(ptr, ptr2) => (MOVBreg x) +(MOVHUloadidx ptr idx (MOVHstoreidx ptr2 idx x _)) && isSamePtr(ptr, ptr2) => (MOVHUreg x) +(MOVHloadidx ptr idx (MOVHstoreidx ptr2 idx x _)) && isSamePtr(ptr, ptr2) => (MOVHreg x) + +// fold constant into arithmetic ops +(ADD x (MOVWconst [c])) => (ADDconst [c] x) +(SUB (MOVWconst [c]) x) => (RSBconst [c] x) +(SUB x (MOVWconst [c])) => (SUBconst [c] x) +(RSB (MOVWconst [c]) x) => (SUBconst [c] x) +(RSB x (MOVWconst [c])) => (RSBconst [c] x) + +(ADDS x (MOVWconst [c])) => (ADDSconst [c] x) +(SUBS x (MOVWconst [c])) => (SUBSconst [c] x) + +(ADC (MOVWconst [c]) x flags) => (ADCconst [c] x flags) +(SBC (MOVWconst [c]) x flags) => (RSCconst [c] x flags) +(SBC x (MOVWconst [c]) flags) => (SBCconst [c] x flags) + +(AND x (MOVWconst [c])) => (ANDconst [c] x) +(OR x (MOVWconst [c])) => (ORconst [c] x) +(XOR x (MOVWconst [c])) => (XORconst [c] x) +(BIC x (MOVWconst [c])) => (BICconst [c] x) + +(SLL x (MOVWconst [c])) && 0 <= c && c < 32 => (SLLconst x [c]) +(SRL x (MOVWconst [c])) && 0 <= c && c < 32 => (SRLconst x [c]) +(SRA x (MOVWconst [c])) && 0 <= c && c < 32 => (SRAconst x [c]) + +(CMP x (MOVWconst [c])) => (CMPconst [c] x) +(CMP (MOVWconst [c]) x) => (InvertFlags (CMPconst [c] x)) +(CMN x (MOVWconst [c])) => (CMNconst [c] x) +(TST x (MOVWconst [c])) => (TSTconst [c] x) +(TEQ x (MOVWconst [c])) => (TEQconst [c] x) + +(SRR x (MOVWconst [c])) => (SRRconst x [c&31]) + +// Canonicalize the order of arguments to comparisons - helps with CSE. +(CMP x y) && canonLessThan(x,y) => (InvertFlags (CMP y x)) + +// don't extend after proper load +// MOVWreg instruction is not emitted if src and dst registers are same, but it ensures the type. +(MOVBreg x:(MOVBload _ _)) => (MOVWreg x) +(MOVBUreg x:(MOVBUload _ _)) => (MOVWreg x) +(MOVHreg x:(MOVBload _ _)) => (MOVWreg x) +(MOVHreg x:(MOVBUload _ _)) => (MOVWreg x) +(MOVHreg x:(MOVHload _ _)) => (MOVWreg x) +(MOVHUreg x:(MOVBUload _ _)) => (MOVWreg x) +(MOVHUreg x:(MOVHUload _ _)) => (MOVWreg x) + +// fold extensions and ANDs together +(MOVBUreg (ANDconst [c] x)) => (ANDconst [c&0xff] x) +(MOVHUreg (ANDconst [c] x)) => (ANDconst [c&0xffff] x) +(MOVBreg (ANDconst [c] x)) && c & 0x80 == 0 => (ANDconst [c&0x7f] x) +(MOVHreg (ANDconst [c] x)) && c & 0x8000 == 0 => (ANDconst [c&0x7fff] x) + +// fold double extensions +(MOVBreg x:(MOVBreg _)) => (MOVWreg x) +(MOVBUreg x:(MOVBUreg _)) => (MOVWreg x) +(MOVHreg x:(MOVBreg _)) => (MOVWreg x) +(MOVHreg x:(MOVBUreg _)) => (MOVWreg x) +(MOVHreg x:(MOVHreg _)) => (MOVWreg x) +(MOVHUreg x:(MOVBUreg _)) => (MOVWreg x) +(MOVHUreg x:(MOVHUreg _)) => (MOVWreg x) + +// don't extend before store +(MOVBstore [off] {sym} ptr (MOVBreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVBUreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVHreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOVHreg x) mem) => (MOVHstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVHstore [off] {sym} ptr x mem) + +// if a register move has only 1 use, just use the same register without emitting instruction +// MOVWnop doesn't emit instruction, only for ensuring the type. +(MOVWreg x) && x.Uses == 1 => (MOVWnop x) + +// TODO: we should be able to get rid of MOVWnop all together. +// But for now, this is enough to get rid of lots of them. +(MOVWnop (MOVWconst [c])) => (MOVWconst [c]) + +// mul by constant +(MUL x (MOVWconst [c])) && int32(c) == -1 => (RSBconst [0] x) +(MUL _ (MOVWconst [0])) => (MOVWconst [0]) +(MUL x (MOVWconst [1])) => x +(MUL x (MOVWconst [c])) && isPowerOfTwo32(c) => (SLLconst [int32(log32(c))] x) +(MUL x (MOVWconst [c])) && isPowerOfTwo32(c-1) && c >= 3 => (ADDshiftLL x x [int32(log32(c-1))]) +(MUL x (MOVWconst [c])) && isPowerOfTwo32(c+1) && c >= 7 => (RSBshiftLL x x [int32(log32(c+1))]) +(MUL x (MOVWconst [c])) && c%3 == 0 && isPowerOfTwo32(c/3) => (SLLconst [int32(log32(c/3))] (ADDshiftLL <x.Type> x x [1])) +(MUL x (MOVWconst [c])) && c%5 == 0 && isPowerOfTwo32(c/5) => (SLLconst [int32(log32(c/5))] (ADDshiftLL <x.Type> x x [2])) +(MUL x (MOVWconst [c])) && c%7 == 0 && isPowerOfTwo32(c/7) => (SLLconst [int32(log32(c/7))] (RSBshiftLL <x.Type> x x [3])) +(MUL x (MOVWconst [c])) && c%9 == 0 && isPowerOfTwo32(c/9) => (SLLconst [int32(log32(c/9))] (ADDshiftLL <x.Type> x x [3])) + +(MULA x (MOVWconst [c]) a) && c == -1 => (SUB a x) +(MULA _ (MOVWconst [0]) a) => a +(MULA x (MOVWconst [1]) a) => (ADD x a) +(MULA x (MOVWconst [c]) a) && isPowerOfTwo32(c) => (ADD (SLLconst <x.Type> [int32(log32(c))] x) a) +(MULA x (MOVWconst [c]) a) && isPowerOfTwo32(c-1) && c >= 3 => (ADD (ADDshiftLL <x.Type> x x [int32(log32(c-1))]) a) +(MULA x (MOVWconst [c]) a) && isPowerOfTwo32(c+1) && c >= 7 => (ADD (RSBshiftLL <x.Type> x x [int32(log32(c+1))]) a) +(MULA x (MOVWconst [c]) a) && c%3 == 0 && isPowerOfTwo32(c/3) => (ADD (SLLconst <x.Type> [int32(log32(c/3))] (ADDshiftLL <x.Type> x x [1])) a) +(MULA x (MOVWconst [c]) a) && c%5 == 0 && isPowerOfTwo32(c/5) => (ADD (SLLconst <x.Type> [int32(log32(c/5))] (ADDshiftLL <x.Type> x x [2])) a) +(MULA x (MOVWconst [c]) a) && c%7 == 0 && isPowerOfTwo32(c/7) => (ADD (SLLconst <x.Type> [int32(log32(c/7))] (RSBshiftLL <x.Type> x x [3])) a) +(MULA x (MOVWconst [c]) a) && c%9 == 0 && isPowerOfTwo32(c/9) => (ADD (SLLconst <x.Type> [int32(log32(c/9))] (ADDshiftLL <x.Type> x x [3])) a) + +(MULA (MOVWconst [c]) x a) && c == -1 => (SUB a x) +(MULA (MOVWconst [0]) _ a) => a +(MULA (MOVWconst [1]) x a) => (ADD x a) +(MULA (MOVWconst [c]) x a) && isPowerOfTwo32(c) => (ADD (SLLconst <x.Type> [int32(log32(c))] x) a) +(MULA (MOVWconst [c]) x a) && isPowerOfTwo32(c-1) && c >= 3 => (ADD (ADDshiftLL <x.Type> x x [int32(log32(c-1))]) a) +(MULA (MOVWconst [c]) x a) && isPowerOfTwo32(c+1) && c >= 7 => (ADD (RSBshiftLL <x.Type> x x [int32(log32(c+1))]) a) +(MULA (MOVWconst [c]) x a) && c%3 == 0 && isPowerOfTwo32(c/3) => (ADD (SLLconst <x.Type> [int32(log32(c/3))] (ADDshiftLL <x.Type> x x [1])) a) +(MULA (MOVWconst [c]) x a) && c%5 == 0 && isPowerOfTwo32(c/5) => (ADD (SLLconst <x.Type> [int32(log32(c/5))] (ADDshiftLL <x.Type> x x [2])) a) +(MULA (MOVWconst [c]) x a) && c%7 == 0 && isPowerOfTwo32(c/7) => (ADD (SLLconst <x.Type> [int32(log32(c/7))] (RSBshiftLL <x.Type> x x [3])) a) +(MULA (MOVWconst [c]) x a) && c%9 == 0 && isPowerOfTwo32(c/9) => (ADD (SLLconst <x.Type> [int32(log32(c/9))] (ADDshiftLL <x.Type> x x [3])) a) + +(MULS x (MOVWconst [c]) a) && c == -1 => (ADD a x) +(MULS _ (MOVWconst [0]) a) => a +(MULS x (MOVWconst [1]) a) => (RSB x a) +(MULS x (MOVWconst [c]) a) && isPowerOfTwo32(c) => (RSB (SLLconst <x.Type> [int32(log32(c))] x) a) +(MULS x (MOVWconst [c]) a) && isPowerOfTwo32(c-1) && c >= 3 => (RSB (ADDshiftLL <x.Type> x x [int32(log32(c-1))]) a) +(MULS x (MOVWconst [c]) a) && isPowerOfTwo32(c+1) && c >= 7 => (RSB (RSBshiftLL <x.Type> x x [int32(log32(c+1))]) a) +(MULS x (MOVWconst [c]) a) && c%3 == 0 && isPowerOfTwo32(c/3) => (RSB (SLLconst <x.Type> [int32(log32(c/3))] (ADDshiftLL <x.Type> x x [1])) a) +(MULS x (MOVWconst [c]) a) && c%5 == 0 && isPowerOfTwo32(c/5) => (RSB (SLLconst <x.Type> [int32(log32(c/5))] (ADDshiftLL <x.Type> x x [2])) a) +(MULS x (MOVWconst [c]) a) && c%7 == 0 && isPowerOfTwo32(c/7) => (RSB (SLLconst <x.Type> [int32(log32(c/7))] (RSBshiftLL <x.Type> x x [3])) a) +(MULS x (MOVWconst [c]) a) && c%9 == 0 && isPowerOfTwo32(c/9) => (RSB (SLLconst <x.Type> [int32(log32(c/9))] (ADDshiftLL <x.Type> x x [3])) a) + +(MULS (MOVWconst [c]) x a) && c == -1 => (ADD a x) +(MULS (MOVWconst [0]) _ a) => a +(MULS (MOVWconst [1]) x a) => (RSB x a) +(MULS (MOVWconst [c]) x a) && isPowerOfTwo32(c) => (RSB (SLLconst <x.Type> [int32(log32(c))] x) a) +(MULS (MOVWconst [c]) x a) && isPowerOfTwo32(c-1) && c >= 3 => (RSB (ADDshiftLL <x.Type> x x [int32(log32(c-1))]) a) +(MULS (MOVWconst [c]) x a) && isPowerOfTwo32(c+1) && c >= 7 => (RSB (RSBshiftLL <x.Type> x x [int32(log32(c+1))]) a) +(MULS (MOVWconst [c]) x a) && c%3 == 0 && isPowerOfTwo32(c/3) => (RSB (SLLconst <x.Type> [int32(log32(c/3))] (ADDshiftLL <x.Type> x x [1])) a) +(MULS (MOVWconst [c]) x a) && c%5 == 0 && isPowerOfTwo32(c/5) => (RSB (SLLconst <x.Type> [int32(log32(c/5))] (ADDshiftLL <x.Type> x x [2])) a) +(MULS (MOVWconst [c]) x a) && c%7 == 0 && isPowerOfTwo32(c/7) => (RSB (SLLconst <x.Type> [int32(log32(c/7))] (RSBshiftLL <x.Type> x x [3])) a) +(MULS (MOVWconst [c]) x a) && c%9 == 0 && isPowerOfTwo32(c/9) => (RSB (SLLconst <x.Type> [int32(log32(c/9))] (ADDshiftLL <x.Type> x x [3])) a) + +// div by constant +(Select0 (CALLudiv x (MOVWconst [1]))) => x +(Select1 (CALLudiv _ (MOVWconst [1]))) => (MOVWconst [0]) +(Select0 (CALLudiv x (MOVWconst [c]))) && isPowerOfTwo32(c) => (SRLconst [int32(log32(c))] x) +(Select1 (CALLudiv x (MOVWconst [c]))) && isPowerOfTwo32(c) => (ANDconst [c-1] x) + +// constant comparisons +(CMPconst (MOVWconst [x]) [y]) => (FlagConstant [subFlags32(x,y)]) +(CMNconst (MOVWconst [x]) [y]) => (FlagConstant [addFlags32(x,y)]) +(TSTconst (MOVWconst [x]) [y]) => (FlagConstant [logicFlags32(x&y)]) +(TEQconst (MOVWconst [x]) [y]) => (FlagConstant [logicFlags32(x^y)]) + +// other known comparisons +(CMPconst (MOVBUreg _) [c]) && 0xff < c => (FlagConstant [subFlags32(0, 1)]) +(CMPconst (MOVHUreg _) [c]) && 0xffff < c => (FlagConstant [subFlags32(0, 1)]) +(CMPconst (ANDconst _ [m]) [n]) && 0 <= m && m < n => (FlagConstant [subFlags32(0, 1)]) +(CMPconst (SRLconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 32 && (1<<uint32(32-c)) <= uint32(n) => (FlagConstant [subFlags32(0, 1)]) + +// absorb flag constants into branches +(EQ (FlagConstant [fc]) yes no) && fc.eq() => (First yes no) +(EQ (FlagConstant [fc]) yes no) && !fc.eq() => (First no yes) + +(NE (FlagConstant [fc]) yes no) && fc.ne() => (First yes no) +(NE (FlagConstant [fc]) yes no) && !fc.ne() => (First no yes) + +(LT (FlagConstant [fc]) yes no) && fc.lt() => (First yes no) +(LT (FlagConstant [fc]) yes no) && !fc.lt() => (First no yes) + +(LE (FlagConstant [fc]) yes no) && fc.le() => (First yes no) +(LE (FlagConstant [fc]) yes no) && !fc.le() => (First no yes) + +(GT (FlagConstant [fc]) yes no) && fc.gt() => (First yes no) +(GT (FlagConstant [fc]) yes no) && !fc.gt() => (First no yes) + +(GE (FlagConstant [fc]) yes no) && fc.ge() => (First yes no) +(GE (FlagConstant [fc]) yes no) && !fc.ge() => (First no yes) + +(ULT (FlagConstant [fc]) yes no) && fc.ult() => (First yes no) +(ULT (FlagConstant [fc]) yes no) && !fc.ult() => (First no yes) + +(ULE (FlagConstant [fc]) yes no) && fc.ule() => (First yes no) +(ULE (FlagConstant [fc]) yes no) && !fc.ule() => (First no yes) + +(UGT (FlagConstant [fc]) yes no) && fc.ugt() => (First yes no) +(UGT (FlagConstant [fc]) yes no) && !fc.ugt() => (First no yes) + +(UGE (FlagConstant [fc]) yes no) && fc.uge() => (First yes no) +(UGE (FlagConstant [fc]) yes no) && !fc.uge() => (First no yes) + +(LTnoov (FlagConstant [fc]) yes no) && fc.ltNoov() => (First yes no) +(LTnoov (FlagConstant [fc]) yes no) && !fc.ltNoov() => (First no yes) + +(LEnoov (FlagConstant [fc]) yes no) && fc.leNoov() => (First yes no) +(LEnoov (FlagConstant [fc]) yes no) && !fc.leNoov() => (First no yes) + +(GTnoov (FlagConstant [fc]) yes no) && fc.gtNoov() => (First yes no) +(GTnoov (FlagConstant [fc]) yes no) && !fc.gtNoov() => (First no yes) + +(GEnoov (FlagConstant [fc]) yes no) && fc.geNoov() => (First yes no) +(GEnoov (FlagConstant [fc]) yes no) && !fc.geNoov() => (First no yes) + +// absorb InvertFlags into branches +(LT (InvertFlags cmp) yes no) => (GT cmp yes no) +(GT (InvertFlags cmp) yes no) => (LT cmp yes no) +(LE (InvertFlags cmp) yes no) => (GE cmp yes no) +(GE (InvertFlags cmp) yes no) => (LE cmp yes no) +(ULT (InvertFlags cmp) yes no) => (UGT cmp yes no) +(UGT (InvertFlags cmp) yes no) => (ULT cmp yes no) +(ULE (InvertFlags cmp) yes no) => (UGE cmp yes no) +(UGE (InvertFlags cmp) yes no) => (ULE cmp yes no) +(EQ (InvertFlags cmp) yes no) => (EQ cmp yes no) +(NE (InvertFlags cmp) yes no) => (NE cmp yes no) +(LTnoov (InvertFlags cmp) yes no) => (GTnoov cmp yes no) +(GEnoov (InvertFlags cmp) yes no) => (LEnoov cmp yes no) +(LEnoov (InvertFlags cmp) yes no) => (GEnoov cmp yes no) +(GTnoov (InvertFlags cmp) yes no) => (LTnoov cmp yes no) + +// absorb flag constants into boolean values +(Equal (FlagConstant [fc])) => (MOVWconst [b2i32(fc.eq())]) +(NotEqual (FlagConstant [fc])) => (MOVWconst [b2i32(fc.ne())]) +(LessThan (FlagConstant [fc])) => (MOVWconst [b2i32(fc.lt())]) +(LessThanU (FlagConstant [fc])) => (MOVWconst [b2i32(fc.ult())]) +(LessEqual (FlagConstant [fc])) => (MOVWconst [b2i32(fc.le())]) +(LessEqualU (FlagConstant [fc])) => (MOVWconst [b2i32(fc.ule())]) +(GreaterThan (FlagConstant [fc])) => (MOVWconst [b2i32(fc.gt())]) +(GreaterThanU (FlagConstant [fc])) => (MOVWconst [b2i32(fc.ugt())]) +(GreaterEqual (FlagConstant [fc])) => (MOVWconst [b2i32(fc.ge())]) +(GreaterEqualU (FlagConstant [fc])) => (MOVWconst [b2i32(fc.uge())]) + +// absorb InvertFlags into boolean values +(Equal (InvertFlags x)) => (Equal x) +(NotEqual (InvertFlags x)) => (NotEqual x) +(LessThan (InvertFlags x)) => (GreaterThan x) +(LessThanU (InvertFlags x)) => (GreaterThanU x) +(GreaterThan (InvertFlags x)) => (LessThan x) +(GreaterThanU (InvertFlags x)) => (LessThanU x) +(LessEqual (InvertFlags x)) => (GreaterEqual x) +(LessEqualU (InvertFlags x)) => (GreaterEqualU x) +(GreaterEqual (InvertFlags x)) => (LessEqual x) +(GreaterEqualU (InvertFlags x)) => (LessEqualU x) + +// absorb flag constants into conditional instructions +(CMOVWLSconst _ (FlagConstant [fc]) [c]) && fc.ule() => (MOVWconst [c]) +(CMOVWLSconst x (FlagConstant [fc]) [c]) && fc.ugt() => x + +(CMOVWHSconst _ (FlagConstant [fc]) [c]) && fc.uge() => (MOVWconst [c]) +(CMOVWHSconst x (FlagConstant [fc]) [c]) && fc.ult() => x + +(CMOVWLSconst x (InvertFlags flags) [c]) => (CMOVWHSconst x flags [c]) +(CMOVWHSconst x (InvertFlags flags) [c]) => (CMOVWLSconst x flags [c]) + +(SRAcond x _ (FlagConstant [fc])) && fc.uge() => (SRAconst x [31]) +(SRAcond x y (FlagConstant [fc])) && fc.ult() => (SRA x y) + +// remove redundant *const ops +(ADDconst [0] x) => x +(SUBconst [0] x) => x +(ANDconst [0] _) => (MOVWconst [0]) +(ANDconst [c] x) && int32(c)==-1 => x +(ORconst [0] x) => x +(ORconst [c] _) && int32(c)==-1 => (MOVWconst [-1]) +(XORconst [0] x) => x +(BICconst [0] x) => x +(BICconst [c] _) && int32(c)==-1 => (MOVWconst [0]) + +// generic constant folding +(ADDconst [c] x) && !isARMImmRot(uint32(c)) && isARMImmRot(uint32(-c)) => (SUBconst [-c] x) +(SUBconst [c] x) && !isARMImmRot(uint32(c)) && isARMImmRot(uint32(-c)) => (ADDconst [-c] x) +(ANDconst [c] x) && !isARMImmRot(uint32(c)) && isARMImmRot(^uint32(c)) => (BICconst [int32(^uint32(c))] x) +(BICconst [c] x) && !isARMImmRot(uint32(c)) && isARMImmRot(^uint32(c)) => (ANDconst [int32(^uint32(c))] x) +(ADDconst [c] x) && buildcfg.GOARM==7 && !isARMImmRot(uint32(c)) && uint32(c)>0xffff && uint32(-c)<=0xffff => (SUBconst [-c] x) +(SUBconst [c] x) && buildcfg.GOARM==7 && !isARMImmRot(uint32(c)) && uint32(c)>0xffff && uint32(-c)<=0xffff => (ADDconst [-c] x) +(ANDconst [c] x) && buildcfg.GOARM==7 && !isARMImmRot(uint32(c)) && uint32(c)>0xffff && ^uint32(c)<=0xffff => (BICconst [int32(^uint32(c))] x) +(BICconst [c] x) && buildcfg.GOARM==7 && !isARMImmRot(uint32(c)) && uint32(c)>0xffff && ^uint32(c)<=0xffff => (ANDconst [int32(^uint32(c))] x) +(ADDconst [c] (MOVWconst [d])) => (MOVWconst [c+d]) +(ADDconst [c] (ADDconst [d] x)) => (ADDconst [c+d] x) +(ADDconst [c] (SUBconst [d] x)) => (ADDconst [c-d] x) +(ADDconst [c] (RSBconst [d] x)) => (RSBconst [c+d] x) +(ADCconst [c] (ADDconst [d] x) flags) => (ADCconst [c+d] x flags) +(ADCconst [c] (SUBconst [d] x) flags) => (ADCconst [c-d] x flags) +(SUBconst [c] (MOVWconst [d])) => (MOVWconst [d-c]) +(SUBconst [c] (SUBconst [d] x)) => (ADDconst [-c-d] x) +(SUBconst [c] (ADDconst [d] x)) => (ADDconst [-c+d] x) +(SUBconst [c] (RSBconst [d] x)) => (RSBconst [-c+d] x) +(SBCconst [c] (ADDconst [d] x) flags) => (SBCconst [c-d] x flags) +(SBCconst [c] (SUBconst [d] x) flags) => (SBCconst [c+d] x flags) +(RSBconst [c] (MOVWconst [d])) => (MOVWconst [c-d]) +(RSBconst [c] (RSBconst [d] x)) => (ADDconst [c-d] x) +(RSBconst [c] (ADDconst [d] x)) => (RSBconst [c-d] x) +(RSBconst [c] (SUBconst [d] x)) => (RSBconst [c+d] x) +(RSCconst [c] (ADDconst [d] x) flags) => (RSCconst [c-d] x flags) +(RSCconst [c] (SUBconst [d] x) flags) => (RSCconst [c+d] x flags) +(SLLconst [c] (MOVWconst [d])) => (MOVWconst [d<<uint64(c)]) +(SRLconst [c] (MOVWconst [d])) => (MOVWconst [int32(uint32(d)>>uint64(c))]) +(SRAconst [c] (MOVWconst [d])) => (MOVWconst [d>>uint64(c)]) +(MUL (MOVWconst [c]) (MOVWconst [d])) => (MOVWconst [c*d]) +(MULA (MOVWconst [c]) (MOVWconst [d]) a) => (ADDconst [c*d] a) +(MULS (MOVWconst [c]) (MOVWconst [d]) a) => (SUBconst [c*d] a) +(Select0 (CALLudiv (MOVWconst [c]) (MOVWconst [d]))) && d != 0 => (MOVWconst [int32(uint32(c)/uint32(d))]) +(Select1 (CALLudiv (MOVWconst [c]) (MOVWconst [d]))) && d != 0 => (MOVWconst [int32(uint32(c)%uint32(d))]) +(ANDconst [c] (MOVWconst [d])) => (MOVWconst [c&d]) +(ANDconst [c] (ANDconst [d] x)) => (ANDconst [c&d] x) +(ORconst [c] (MOVWconst [d])) => (MOVWconst [c|d]) +(ORconst [c] (ORconst [d] x)) => (ORconst [c|d] x) +(XORconst [c] (MOVWconst [d])) => (MOVWconst [c^d]) +(XORconst [c] (XORconst [d] x)) => (XORconst [c^d] x) +(BICconst [c] (MOVWconst [d])) => (MOVWconst [d&^c]) +(BICconst [c] (BICconst [d] x)) => (BICconst [c|d] x) +(MVN (MOVWconst [c])) => (MOVWconst [^c]) +(MOVBreg (MOVWconst [c])) => (MOVWconst [int32(int8(c))]) +(MOVBUreg (MOVWconst [c])) => (MOVWconst [int32(uint8(c))]) +(MOVHreg (MOVWconst [c])) => (MOVWconst [int32(int16(c))]) +(MOVHUreg (MOVWconst [c])) => (MOVWconst [int32(uint16(c))]) +(MOVWreg (MOVWconst [c])) => (MOVWconst [c]) +// BFX: Width = c >> 8, LSB = c & 0xff, result = d << (32 - Width - LSB) >> (32 - Width) +(BFX [c] (MOVWconst [d])) => (MOVWconst [d<<(32-uint32(c&0xff)-uint32(c>>8))>>(32-uint32(c>>8))]) +(BFXU [c] (MOVWconst [d])) => (MOVWconst [int32(uint32(d)<<(32-uint32(c&0xff)-uint32(c>>8))>>(32-uint32(c>>8)))]) + +// absorb shifts into ops +(ADD x (SLLconst [c] y)) => (ADDshiftLL x y [c]) +(ADD x (SRLconst [c] y)) => (ADDshiftRL x y [c]) +(ADD x (SRAconst [c] y)) => (ADDshiftRA x y [c]) +(ADD x (SLL y z)) => (ADDshiftLLreg x y z) +(ADD x (SRL y z)) => (ADDshiftRLreg x y z) +(ADD x (SRA y z)) => (ADDshiftRAreg x y z) +(ADC x (SLLconst [c] y) flags) => (ADCshiftLL x y [c] flags) +(ADC x (SRLconst [c] y) flags) => (ADCshiftRL x y [c] flags) +(ADC x (SRAconst [c] y) flags) => (ADCshiftRA x y [c] flags) +(ADC x (SLL y z) flags) => (ADCshiftLLreg x y z flags) +(ADC x (SRL y z) flags) => (ADCshiftRLreg x y z flags) +(ADC x (SRA y z) flags) => (ADCshiftRAreg x y z flags) +(ADDS x (SLLconst [c] y)) => (ADDSshiftLL x y [c]) +(ADDS x (SRLconst [c] y)) => (ADDSshiftRL x y [c]) +(ADDS x (SRAconst [c] y)) => (ADDSshiftRA x y [c]) +(ADDS x (SLL y z)) => (ADDSshiftLLreg x y z) +(ADDS x (SRL y z)) => (ADDSshiftRLreg x y z) +(ADDS x (SRA y z)) => (ADDSshiftRAreg x y z) +(SUB x (SLLconst [c] y)) => (SUBshiftLL x y [c]) +(SUB (SLLconst [c] y) x) => (RSBshiftLL x y [c]) +(SUB x (SRLconst [c] y)) => (SUBshiftRL x y [c]) +(SUB (SRLconst [c] y) x) => (RSBshiftRL x y [c]) +(SUB x (SRAconst [c] y)) => (SUBshiftRA x y [c]) +(SUB (SRAconst [c] y) x) => (RSBshiftRA x y [c]) +(SUB x (SLL y z)) => (SUBshiftLLreg x y z) +(SUB (SLL y z) x) => (RSBshiftLLreg x y z) +(SUB x (SRL y z)) => (SUBshiftRLreg x y z) +(SUB (SRL y z) x) => (RSBshiftRLreg x y z) +(SUB x (SRA y z)) => (SUBshiftRAreg x y z) +(SUB (SRA y z) x) => (RSBshiftRAreg x y z) +(SBC x (SLLconst [c] y) flags) => (SBCshiftLL x y [c] flags) +(SBC (SLLconst [c] y) x flags) => (RSCshiftLL x y [c] flags) +(SBC x (SRLconst [c] y) flags) => (SBCshiftRL x y [c] flags) +(SBC (SRLconst [c] y) x flags) => (RSCshiftRL x y [c] flags) +(SBC x (SRAconst [c] y) flags) => (SBCshiftRA x y [c] flags) +(SBC (SRAconst [c] y) x flags) => (RSCshiftRA x y [c] flags) +(SBC x (SLL y z) flags) => (SBCshiftLLreg x y z flags) +(SBC (SLL y z) x flags) => (RSCshiftLLreg x y z flags) +(SBC x (SRL y z) flags) => (SBCshiftRLreg x y z flags) +(SBC (SRL y z) x flags) => (RSCshiftRLreg x y z flags) +(SBC x (SRA y z) flags) => (SBCshiftRAreg x y z flags) +(SBC (SRA y z) x flags) => (RSCshiftRAreg x y z flags) +(SUBS x (SLLconst [c] y)) => (SUBSshiftLL x y [c]) +(SUBS (SLLconst [c] y) x) => (RSBSshiftLL x y [c]) +(SUBS x (SRLconst [c] y)) => (SUBSshiftRL x y [c]) +(SUBS (SRLconst [c] y) x) => (RSBSshiftRL x y [c]) +(SUBS x (SRAconst [c] y)) => (SUBSshiftRA x y [c]) +(SUBS (SRAconst [c] y) x) => (RSBSshiftRA x y [c]) +(SUBS x (SLL y z)) => (SUBSshiftLLreg x y z) +(SUBS (SLL y z) x) => (RSBSshiftLLreg x y z) +(SUBS x (SRL y z)) => (SUBSshiftRLreg x y z) +(SUBS (SRL y z) x) => (RSBSshiftRLreg x y z) +(SUBS x (SRA y z)) => (SUBSshiftRAreg x y z) +(SUBS (SRA y z) x) => (RSBSshiftRAreg x y z) +(RSB x (SLLconst [c] y)) => (RSBshiftLL x y [c]) +(RSB (SLLconst [c] y) x) => (SUBshiftLL x y [c]) +(RSB x (SRLconst [c] y)) => (RSBshiftRL x y [c]) +(RSB (SRLconst [c] y) x) => (SUBshiftRL x y [c]) +(RSB x (SRAconst [c] y)) => (RSBshiftRA x y [c]) +(RSB (SRAconst [c] y) x) => (SUBshiftRA x y [c]) +(RSB x (SLL y z)) => (RSBshiftLLreg x y z) +(RSB (SLL y z) x) => (SUBshiftLLreg x y z) +(RSB x (SRL y z)) => (RSBshiftRLreg x y z) +(RSB (SRL y z) x) => (SUBshiftRLreg x y z) +(RSB x (SRA y z)) => (RSBshiftRAreg x y z) +(RSB (SRA y z) x) => (SUBshiftRAreg x y z) +(AND x (SLLconst [c] y)) => (ANDshiftLL x y [c]) +(AND x (SRLconst [c] y)) => (ANDshiftRL x y [c]) +(AND x (SRAconst [c] y)) => (ANDshiftRA x y [c]) +(AND x (SLL y z)) => (ANDshiftLLreg x y z) +(AND x (SRL y z)) => (ANDshiftRLreg x y z) +(AND x (SRA y z)) => (ANDshiftRAreg x y z) +(OR x (SLLconst [c] y)) => (ORshiftLL x y [c]) +(OR x (SRLconst [c] y)) => (ORshiftRL x y [c]) +(OR x (SRAconst [c] y)) => (ORshiftRA x y [c]) +(OR x (SLL y z)) => (ORshiftLLreg x y z) +(OR x (SRL y z)) => (ORshiftRLreg x y z) +(OR x (SRA y z)) => (ORshiftRAreg x y z) +(XOR x (SLLconst [c] y)) => (XORshiftLL x y [c]) +(XOR x (SRLconst [c] y)) => (XORshiftRL x y [c]) +(XOR x (SRAconst [c] y)) => (XORshiftRA x y [c]) +(XOR x (SRRconst [c] y)) => (XORshiftRR x y [c]) +(XOR x (SLL y z)) => (XORshiftLLreg x y z) +(XOR x (SRL y z)) => (XORshiftRLreg x y z) +(XOR x (SRA y z)) => (XORshiftRAreg x y z) +(BIC x (SLLconst [c] y)) => (BICshiftLL x y [c]) +(BIC x (SRLconst [c] y)) => (BICshiftRL x y [c]) +(BIC x (SRAconst [c] y)) => (BICshiftRA x y [c]) +(BIC x (SLL y z)) => (BICshiftLLreg x y z) +(BIC x (SRL y z)) => (BICshiftRLreg x y z) +(BIC x (SRA y z)) => (BICshiftRAreg x y z) +(MVN (SLLconst [c] x)) => (MVNshiftLL x [c]) +(MVN (SRLconst [c] x)) => (MVNshiftRL x [c]) +(MVN (SRAconst [c] x)) => (MVNshiftRA x [c]) +(MVN (SLL x y)) => (MVNshiftLLreg x y) +(MVN (SRL x y)) => (MVNshiftRLreg x y) +(MVN (SRA x y)) => (MVNshiftRAreg x y) + +(CMP x (SLLconst [c] y)) => (CMPshiftLL x y [c]) +(CMP (SLLconst [c] y) x) => (InvertFlags (CMPshiftLL x y [c])) +(CMP x (SRLconst [c] y)) => (CMPshiftRL x y [c]) +(CMP (SRLconst [c] y) x) => (InvertFlags (CMPshiftRL x y [c])) +(CMP x (SRAconst [c] y)) => (CMPshiftRA x y [c]) +(CMP (SRAconst [c] y) x) => (InvertFlags (CMPshiftRA x y [c])) +(CMP x (SLL y z)) => (CMPshiftLLreg x y z) +(CMP (SLL y z) x) => (InvertFlags (CMPshiftLLreg x y z)) +(CMP x (SRL y z)) => (CMPshiftRLreg x y z) +(CMP (SRL y z) x) => (InvertFlags (CMPshiftRLreg x y z)) +(CMP x (SRA y z)) => (CMPshiftRAreg x y z) +(CMP (SRA y z) x) => (InvertFlags (CMPshiftRAreg x y z)) +(TST x (SLLconst [c] y)) => (TSTshiftLL x y [c]) +(TST x (SRLconst [c] y)) => (TSTshiftRL x y [c]) +(TST x (SRAconst [c] y)) => (TSTshiftRA x y [c]) +(TST x (SLL y z)) => (TSTshiftLLreg x y z) +(TST x (SRL y z)) => (TSTshiftRLreg x y z) +(TST x (SRA y z)) => (TSTshiftRAreg x y z) +(TEQ x (SLLconst [c] y)) => (TEQshiftLL x y [c]) +(TEQ x (SRLconst [c] y)) => (TEQshiftRL x y [c]) +(TEQ x (SRAconst [c] y)) => (TEQshiftRA x y [c]) +(TEQ x (SLL y z)) => (TEQshiftLLreg x y z) +(TEQ x (SRL y z)) => (TEQshiftRLreg x y z) +(TEQ x (SRA y z)) => (TEQshiftRAreg x y z) +(CMN x (SLLconst [c] y)) => (CMNshiftLL x y [c]) +(CMN x (SRLconst [c] y)) => (CMNshiftRL x y [c]) +(CMN x (SRAconst [c] y)) => (CMNshiftRA x y [c]) +(CMN x (SLL y z)) => (CMNshiftLLreg x y z) +(CMN x (SRL y z)) => (CMNshiftRLreg x y z) +(CMN x (SRA y z)) => (CMNshiftRAreg x y z) + +// prefer *const ops to *shift ops +(ADDshiftLL (MOVWconst [c]) x [d]) => (ADDconst [c] (SLLconst <x.Type> x [d])) +(ADDshiftRL (MOVWconst [c]) x [d]) => (ADDconst [c] (SRLconst <x.Type> x [d])) +(ADDshiftRA (MOVWconst [c]) x [d]) => (ADDconst [c] (SRAconst <x.Type> x [d])) +(ADCshiftLL (MOVWconst [c]) x [d] flags) => (ADCconst [c] (SLLconst <x.Type> x [d]) flags) +(ADCshiftRL (MOVWconst [c]) x [d] flags) => (ADCconst [c] (SRLconst <x.Type> x [d]) flags) +(ADCshiftRA (MOVWconst [c]) x [d] flags) => (ADCconst [c] (SRAconst <x.Type> x [d]) flags) +(ADDSshiftLL (MOVWconst [c]) x [d]) => (ADDSconst [c] (SLLconst <x.Type> x [d])) +(ADDSshiftRL (MOVWconst [c]) x [d]) => (ADDSconst [c] (SRLconst <x.Type> x [d])) +(ADDSshiftRA (MOVWconst [c]) x [d]) => (ADDSconst [c] (SRAconst <x.Type> x [d])) +(SUBshiftLL (MOVWconst [c]) x [d]) => (RSBconst [c] (SLLconst <x.Type> x [d])) +(SUBshiftRL (MOVWconst [c]) x [d]) => (RSBconst [c] (SRLconst <x.Type> x [d])) +(SUBshiftRA (MOVWconst [c]) x [d]) => (RSBconst [c] (SRAconst <x.Type> x [d])) +(SBCshiftLL (MOVWconst [c]) x [d] flags) => (RSCconst [c] (SLLconst <x.Type> x [d]) flags) +(SBCshiftRL (MOVWconst [c]) x [d] flags) => (RSCconst [c] (SRLconst <x.Type> x [d]) flags) +(SBCshiftRA (MOVWconst [c]) x [d] flags) => (RSCconst [c] (SRAconst <x.Type> x [d]) flags) +(SUBSshiftLL (MOVWconst [c]) x [d]) => (RSBSconst [c] (SLLconst <x.Type> x [d])) +(SUBSshiftRL (MOVWconst [c]) x [d]) => (RSBSconst [c] (SRLconst <x.Type> x [d])) +(SUBSshiftRA (MOVWconst [c]) x [d]) => (RSBSconst [c] (SRAconst <x.Type> x [d])) +(RSBshiftLL (MOVWconst [c]) x [d]) => (SUBconst [c] (SLLconst <x.Type> x [d])) +(RSBshiftRL (MOVWconst [c]) x [d]) => (SUBconst [c] (SRLconst <x.Type> x [d])) +(RSBshiftRA (MOVWconst [c]) x [d]) => (SUBconst [c] (SRAconst <x.Type> x [d])) +(RSCshiftLL (MOVWconst [c]) x [d] flags) => (SBCconst [c] (SLLconst <x.Type> x [d]) flags) +(RSCshiftRL (MOVWconst [c]) x [d] flags) => (SBCconst [c] (SRLconst <x.Type> x [d]) flags) +(RSCshiftRA (MOVWconst [c]) x [d] flags) => (SBCconst [c] (SRAconst <x.Type> x [d]) flags) +(RSBSshiftLL (MOVWconst [c]) x [d]) => (SUBSconst [c] (SLLconst <x.Type> x [d])) +(RSBSshiftRL (MOVWconst [c]) x [d]) => (SUBSconst [c] (SRLconst <x.Type> x [d])) +(RSBSshiftRA (MOVWconst [c]) x [d]) => (SUBSconst [c] (SRAconst <x.Type> x [d])) +(ANDshiftLL (MOVWconst [c]) x [d]) => (ANDconst [c] (SLLconst <x.Type> x [d])) +(ANDshiftRL (MOVWconst [c]) x [d]) => (ANDconst [c] (SRLconst <x.Type> x [d])) +(ANDshiftRA (MOVWconst [c]) x [d]) => (ANDconst [c] (SRAconst <x.Type> x [d])) +(ORshiftLL (MOVWconst [c]) x [d]) => (ORconst [c] (SLLconst <x.Type> x [d])) +(ORshiftRL (MOVWconst [c]) x [d]) => (ORconst [c] (SRLconst <x.Type> x [d])) +(ORshiftRA (MOVWconst [c]) x [d]) => (ORconst [c] (SRAconst <x.Type> x [d])) +(XORshiftLL (MOVWconst [c]) x [d]) => (XORconst [c] (SLLconst <x.Type> x [d])) +(XORshiftRL (MOVWconst [c]) x [d]) => (XORconst [c] (SRLconst <x.Type> x [d])) +(XORshiftRA (MOVWconst [c]) x [d]) => (XORconst [c] (SRAconst <x.Type> x [d])) +(XORshiftRR (MOVWconst [c]) x [d]) => (XORconst [c] (SRRconst <x.Type> x [d])) +(CMPshiftLL (MOVWconst [c]) x [d]) => (InvertFlags (CMPconst [c] (SLLconst <x.Type> x [d]))) +(CMPshiftRL (MOVWconst [c]) x [d]) => (InvertFlags (CMPconst [c] (SRLconst <x.Type> x [d]))) +(CMPshiftRA (MOVWconst [c]) x [d]) => (InvertFlags (CMPconst [c] (SRAconst <x.Type> x [d]))) +(TSTshiftLL (MOVWconst [c]) x [d]) => (TSTconst [c] (SLLconst <x.Type> x [d])) +(TSTshiftRL (MOVWconst [c]) x [d]) => (TSTconst [c] (SRLconst <x.Type> x [d])) +(TSTshiftRA (MOVWconst [c]) x [d]) => (TSTconst [c] (SRAconst <x.Type> x [d])) +(TEQshiftLL (MOVWconst [c]) x [d]) => (TEQconst [c] (SLLconst <x.Type> x [d])) +(TEQshiftRL (MOVWconst [c]) x [d]) => (TEQconst [c] (SRLconst <x.Type> x [d])) +(TEQshiftRA (MOVWconst [c]) x [d]) => (TEQconst [c] (SRAconst <x.Type> x [d])) +(CMNshiftLL (MOVWconst [c]) x [d]) => (CMNconst [c] (SLLconst <x.Type> x [d])) +(CMNshiftRL (MOVWconst [c]) x [d]) => (CMNconst [c] (SRLconst <x.Type> x [d])) +(CMNshiftRA (MOVWconst [c]) x [d]) => (CMNconst [c] (SRAconst <x.Type> x [d])) + +(ADDshiftLLreg (MOVWconst [c]) x y) => (ADDconst [c] (SLL <x.Type> x y)) +(ADDshiftRLreg (MOVWconst [c]) x y) => (ADDconst [c] (SRL <x.Type> x y)) +(ADDshiftRAreg (MOVWconst [c]) x y) => (ADDconst [c] (SRA <x.Type> x y)) +(ADCshiftLLreg (MOVWconst [c]) x y flags) => (ADCconst [c] (SLL <x.Type> x y) flags) +(ADCshiftRLreg (MOVWconst [c]) x y flags) => (ADCconst [c] (SRL <x.Type> x y) flags) +(ADCshiftRAreg (MOVWconst [c]) x y flags) => (ADCconst [c] (SRA <x.Type> x y) flags) +(ADDSshiftLLreg (MOVWconst [c]) x y) => (ADDSconst [c] (SLL <x.Type> x y)) +(ADDSshiftRLreg (MOVWconst [c]) x y) => (ADDSconst [c] (SRL <x.Type> x y)) +(ADDSshiftRAreg (MOVWconst [c]) x y) => (ADDSconst [c] (SRA <x.Type> x y)) +(SUBshiftLLreg (MOVWconst [c]) x y) => (RSBconst [c] (SLL <x.Type> x y)) +(SUBshiftRLreg (MOVWconst [c]) x y) => (RSBconst [c] (SRL <x.Type> x y)) +(SUBshiftRAreg (MOVWconst [c]) x y) => (RSBconst [c] (SRA <x.Type> x y)) +(SBCshiftLLreg (MOVWconst [c]) x y flags) => (RSCconst [c] (SLL <x.Type> x y) flags) +(SBCshiftRLreg (MOVWconst [c]) x y flags) => (RSCconst [c] (SRL <x.Type> x y) flags) +(SBCshiftRAreg (MOVWconst [c]) x y flags) => (RSCconst [c] (SRA <x.Type> x y) flags) +(SUBSshiftLLreg (MOVWconst [c]) x y) => (RSBSconst [c] (SLL <x.Type> x y)) +(SUBSshiftRLreg (MOVWconst [c]) x y) => (RSBSconst [c] (SRL <x.Type> x y)) +(SUBSshiftRAreg (MOVWconst [c]) x y) => (RSBSconst [c] (SRA <x.Type> x y)) +(RSBshiftLLreg (MOVWconst [c]) x y) => (SUBconst [c] (SLL <x.Type> x y)) +(RSBshiftRLreg (MOVWconst [c]) x y) => (SUBconst [c] (SRL <x.Type> x y)) +(RSBshiftRAreg (MOVWconst [c]) x y) => (SUBconst [c] (SRA <x.Type> x y)) +(RSCshiftLLreg (MOVWconst [c]) x y flags) => (SBCconst [c] (SLL <x.Type> x y) flags) +(RSCshiftRLreg (MOVWconst [c]) x y flags) => (SBCconst [c] (SRL <x.Type> x y) flags) +(RSCshiftRAreg (MOVWconst [c]) x y flags) => (SBCconst [c] (SRA <x.Type> x y) flags) +(RSBSshiftLLreg (MOVWconst [c]) x y) => (SUBSconst [c] (SLL <x.Type> x y)) +(RSBSshiftRLreg (MOVWconst [c]) x y) => (SUBSconst [c] (SRL <x.Type> x y)) +(RSBSshiftRAreg (MOVWconst [c]) x y) => (SUBSconst [c] (SRA <x.Type> x y)) +(ANDshiftLLreg (MOVWconst [c]) x y) => (ANDconst [c] (SLL <x.Type> x y)) +(ANDshiftRLreg (MOVWconst [c]) x y) => (ANDconst [c] (SRL <x.Type> x y)) +(ANDshiftRAreg (MOVWconst [c]) x y) => (ANDconst [c] (SRA <x.Type> x y)) +(ORshiftLLreg (MOVWconst [c]) x y) => (ORconst [c] (SLL <x.Type> x y)) +(ORshiftRLreg (MOVWconst [c]) x y) => (ORconst [c] (SRL <x.Type> x y)) +(ORshiftRAreg (MOVWconst [c]) x y) => (ORconst [c] (SRA <x.Type> x y)) +(XORshiftLLreg (MOVWconst [c]) x y) => (XORconst [c] (SLL <x.Type> x y)) +(XORshiftRLreg (MOVWconst [c]) x y) => (XORconst [c] (SRL <x.Type> x y)) +(XORshiftRAreg (MOVWconst [c]) x y) => (XORconst [c] (SRA <x.Type> x y)) +(CMPshiftLLreg (MOVWconst [c]) x y) => (InvertFlags (CMPconst [c] (SLL <x.Type> x y))) +(CMPshiftRLreg (MOVWconst [c]) x y) => (InvertFlags (CMPconst [c] (SRL <x.Type> x y))) +(CMPshiftRAreg (MOVWconst [c]) x y) => (InvertFlags (CMPconst [c] (SRA <x.Type> x y))) +(TSTshiftLLreg (MOVWconst [c]) x y) => (TSTconst [c] (SLL <x.Type> x y)) +(TSTshiftRLreg (MOVWconst [c]) x y) => (TSTconst [c] (SRL <x.Type> x y)) +(TSTshiftRAreg (MOVWconst [c]) x y) => (TSTconst [c] (SRA <x.Type> x y)) +(TEQshiftLLreg (MOVWconst [c]) x y) => (TEQconst [c] (SLL <x.Type> x y)) +(TEQshiftRLreg (MOVWconst [c]) x y) => (TEQconst [c] (SRL <x.Type> x y)) +(TEQshiftRAreg (MOVWconst [c]) x y) => (TEQconst [c] (SRA <x.Type> x y)) +(CMNshiftLLreg (MOVWconst [c]) x y) => (CMNconst [c] (SLL <x.Type> x y)) +(CMNshiftRLreg (MOVWconst [c]) x y) => (CMNconst [c] (SRL <x.Type> x y)) +(CMNshiftRAreg (MOVWconst [c]) x y) => (CMNconst [c] (SRA <x.Type> x y)) + +// constant folding in *shift ops +(ADDshiftLL x (MOVWconst [c]) [d]) => (ADDconst x [c<<uint64(d)]) +(ADDshiftRL x (MOVWconst [c]) [d]) => (ADDconst x [int32(uint32(c)>>uint64(d))]) +(ADDshiftRA x (MOVWconst [c]) [d]) => (ADDconst x [c>>uint64(d)]) +(ADCshiftLL x (MOVWconst [c]) [d] flags) => (ADCconst x [c<<uint64(d)] flags) +(ADCshiftRL x (MOVWconst [c]) [d] flags) => (ADCconst x [int32(uint32(c)>>uint64(d))] flags) +(ADCshiftRA x (MOVWconst [c]) [d] flags) => (ADCconst x [c>>uint64(d)] flags) +(ADDSshiftLL x (MOVWconst [c]) [d]) => (ADDSconst x [c<<uint64(d)]) +(ADDSshiftRL x (MOVWconst [c]) [d]) => (ADDSconst x [int32(uint32(c)>>uint64(d))]) +(ADDSshiftRA x (MOVWconst [c]) [d]) => (ADDSconst x [c>>uint64(d)]) +(SUBshiftLL x (MOVWconst [c]) [d]) => (SUBconst x [c<<uint64(d)]) +(SUBshiftRL x (MOVWconst [c]) [d]) => (SUBconst x [int32(uint32(c)>>uint64(d))]) +(SUBshiftRA x (MOVWconst [c]) [d]) => (SUBconst x [c>>uint64(d)]) +(SBCshiftLL x (MOVWconst [c]) [d] flags) => (SBCconst x [c<<uint64(d)] flags) +(SBCshiftRL x (MOVWconst [c]) [d] flags) => (SBCconst x [int32(uint32(c)>>uint64(d))] flags) +(SBCshiftRA x (MOVWconst [c]) [d] flags) => (SBCconst x [c>>uint64(d)] flags) +(SUBSshiftLL x (MOVWconst [c]) [d]) => (SUBSconst x [c<<uint64(d)]) +(SUBSshiftRL x (MOVWconst [c]) [d]) => (SUBSconst x [int32(uint32(c)>>uint64(d))]) +(SUBSshiftRA x (MOVWconst [c]) [d]) => (SUBSconst x [c>>uint64(d)]) +(RSBshiftLL x (MOVWconst [c]) [d]) => (RSBconst x [c<<uint64(d)]) +(RSBshiftRL x (MOVWconst [c]) [d]) => (RSBconst x [int32(uint32(c)>>uint64(d))]) +(RSBshiftRA x (MOVWconst [c]) [d]) => (RSBconst x [c>>uint64(d)]) +(RSCshiftLL x (MOVWconst [c]) [d] flags) => (RSCconst x [c<<uint64(d)] flags) +(RSCshiftRL x (MOVWconst [c]) [d] flags) => (RSCconst x [int32(uint32(c)>>uint64(d))] flags) +(RSCshiftRA x (MOVWconst [c]) [d] flags) => (RSCconst x [c>>uint64(d)] flags) +(RSBSshiftLL x (MOVWconst [c]) [d]) => (RSBSconst x [c<<uint64(d)]) +(RSBSshiftRL x (MOVWconst [c]) [d]) => (RSBSconst x [int32(uint32(c)>>uint64(d))]) +(RSBSshiftRA x (MOVWconst [c]) [d]) => (RSBSconst x [c>>uint64(d)]) +(ANDshiftLL x (MOVWconst [c]) [d]) => (ANDconst x [c<<uint64(d)]) +(ANDshiftRL x (MOVWconst [c]) [d]) => (ANDconst x [int32(uint32(c)>>uint64(d))]) +(ANDshiftRA x (MOVWconst [c]) [d]) => (ANDconst x [c>>uint64(d)]) +(ORshiftLL x (MOVWconst [c]) [d]) => (ORconst x [c<<uint64(d)]) +(ORshiftRL x (MOVWconst [c]) [d]) => (ORconst x [int32(uint32(c)>>uint64(d))]) +(ORshiftRA x (MOVWconst [c]) [d]) => (ORconst x [c>>uint64(d)]) +(XORshiftLL x (MOVWconst [c]) [d]) => (XORconst x [c<<uint64(d)]) +(XORshiftRL x (MOVWconst [c]) [d]) => (XORconst x [int32(uint32(c)>>uint64(d))]) +(XORshiftRA x (MOVWconst [c]) [d]) => (XORconst x [c>>uint64(d)]) +(XORshiftRR x (MOVWconst [c]) [d]) => (XORconst x [int32(uint32(c)>>uint64(d)|uint32(c)<<uint64(32-d))]) +(BICshiftLL x (MOVWconst [c]) [d]) => (BICconst x [c<<uint64(d)]) +(BICshiftRL x (MOVWconst [c]) [d]) => (BICconst x [int32(uint32(c)>>uint64(d))]) +(BICshiftRA x (MOVWconst [c]) [d]) => (BICconst x [c>>uint64(d)]) +(MVNshiftLL (MOVWconst [c]) [d]) => (MOVWconst [^(c<<uint64(d))]) +(MVNshiftRL (MOVWconst [c]) [d]) => (MOVWconst [^int32(uint32(c)>>uint64(d))]) +(MVNshiftRA (MOVWconst [c]) [d]) => (MOVWconst [int32(c)>>uint64(d)]) +(CMPshiftLL x (MOVWconst [c]) [d]) => (CMPconst x [c<<uint64(d)]) +(CMPshiftRL x (MOVWconst [c]) [d]) => (CMPconst x [int32(uint32(c)>>uint64(d))]) +(CMPshiftRA x (MOVWconst [c]) [d]) => (CMPconst x [c>>uint64(d)]) +(TSTshiftLL x (MOVWconst [c]) [d]) => (TSTconst x [c<<uint64(d)]) +(TSTshiftRL x (MOVWconst [c]) [d]) => (TSTconst x [int32(uint32(c)>>uint64(d))]) +(TSTshiftRA x (MOVWconst [c]) [d]) => (TSTconst x [c>>uint64(d)]) +(TEQshiftLL x (MOVWconst [c]) [d]) => (TEQconst x [c<<uint64(d)]) +(TEQshiftRL x (MOVWconst [c]) [d]) => (TEQconst x [int32(uint32(c)>>uint64(d))]) +(TEQshiftRA x (MOVWconst [c]) [d]) => (TEQconst x [c>>uint64(d)]) +(CMNshiftLL x (MOVWconst [c]) [d]) => (CMNconst x [c<<uint64(d)]) +(CMNshiftRL x (MOVWconst [c]) [d]) => (CMNconst x [int32(uint32(c)>>uint64(d))]) +(CMNshiftRA x (MOVWconst [c]) [d]) => (CMNconst x [c>>uint64(d)]) + +(ADDshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ADDshiftLL x y [c]) +(ADDshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ADDshiftRL x y [c]) +(ADDshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ADDshiftRA x y [c]) +(ADCshiftLLreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (ADCshiftLL x y [c] flags) +(ADCshiftRLreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (ADCshiftRL x y [c] flags) +(ADCshiftRAreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (ADCshiftRA x y [c] flags) +(ADDSshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ADDSshiftLL x y [c]) +(ADDSshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ADDSshiftRL x y [c]) +(ADDSshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ADDSshiftRA x y [c]) +(SUBshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (SUBshiftLL x y [c]) +(SUBshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (SUBshiftRL x y [c]) +(SUBshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (SUBshiftRA x y [c]) +(SBCshiftLLreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (SBCshiftLL x y [c] flags) +(SBCshiftRLreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (SBCshiftRL x y [c] flags) +(SBCshiftRAreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (SBCshiftRA x y [c] flags) +(SUBSshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (SUBSshiftLL x y [c]) +(SUBSshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (SUBSshiftRL x y [c]) +(SUBSshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (SUBSshiftRA x y [c]) +(RSBshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (RSBshiftLL x y [c]) +(RSBshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (RSBshiftRL x y [c]) +(RSBshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (RSBshiftRA x y [c]) +(RSCshiftLLreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (RSCshiftLL x y [c] flags) +(RSCshiftRLreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (RSCshiftRL x y [c] flags) +(RSCshiftRAreg x y (MOVWconst [c]) flags) && 0 <= c && c < 32 => (RSCshiftRA x y [c] flags) +(RSBSshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (RSBSshiftLL x y [c]) +(RSBSshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (RSBSshiftRL x y [c]) +(RSBSshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (RSBSshiftRA x y [c]) +(ANDshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ANDshiftLL x y [c]) +(ANDshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ANDshiftRL x y [c]) +(ANDshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ANDshiftRA x y [c]) +(ORshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ORshiftLL x y [c]) +(ORshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ORshiftRL x y [c]) +(ORshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (ORshiftRA x y [c]) +(XORshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (XORshiftLL x y [c]) +(XORshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (XORshiftRL x y [c]) +(XORshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (XORshiftRA x y [c]) +(BICshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (BICshiftLL x y [c]) +(BICshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (BICshiftRL x y [c]) +(BICshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (BICshiftRA x y [c]) +(MVNshiftLLreg x (MOVWconst [c])) && 0 <= c && c < 32 => (MVNshiftLL x [c]) +(MVNshiftRLreg x (MOVWconst [c])) && 0 <= c && c < 32 => (MVNshiftRL x [c]) +(MVNshiftRAreg x (MOVWconst [c])) && 0 <= c && c < 32 => (MVNshiftRA x [c]) +(CMPshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (CMPshiftLL x y [c]) +(CMPshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (CMPshiftRL x y [c]) +(CMPshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (CMPshiftRA x y [c]) +(TSTshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (TSTshiftLL x y [c]) +(TSTshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (TSTshiftRL x y [c]) +(TSTshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (TSTshiftRA x y [c]) +(TEQshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (TEQshiftLL x y [c]) +(TEQshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (TEQshiftRL x y [c]) +(TEQshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (TEQshiftRA x y [c]) +(CMNshiftLLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (CMNshiftLL x y [c]) +(CMNshiftRLreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (CMNshiftRL x y [c]) +(CMNshiftRAreg x y (MOVWconst [c])) && 0 <= c && c < 32 => (CMNshiftRA x y [c]) + +(RotateLeft16 <t> x (MOVWconst [c])) => (Or16 (Lsh16x32 <t> x (MOVWconst [c&15])) (Rsh16Ux32 <t> x (MOVWconst [-c&15]))) +(RotateLeft8 <t> x (MOVWconst [c])) => (Or8 (Lsh8x32 <t> x (MOVWconst [c&7])) (Rsh8Ux32 <t> x (MOVWconst [-c&7]))) +(RotateLeft32 x y) => (SRR x (RSBconst [0] <y.Type> y)) + +// ((x>>8) | (x<<8)) -> (REV16 x), the type of x is uint16, "|" can also be "^" or "+". +// UBFX instruction is supported by ARMv6T2, ARMv7 and above versions, REV16 is supported by +// ARMv6 and above versions. So for ARMv6, we need to match SLLconst, SRLconst and ORshiftLL. +((ADDshiftLL|ORshiftLL|XORshiftLL) <typ.UInt16> [8] (BFXU <typ.UInt16> [int32(armBFAuxInt(8, 8))] x) x) => (REV16 x) +((ADDshiftLL|ORshiftLL|XORshiftLL) <typ.UInt16> [8] (SRLconst <typ.UInt16> [24] (SLLconst [16] x)) x) && buildcfg.GOARM>=6 => (REV16 x) + +// use indexed loads and stores +(MOVWload [0] {sym} (ADD ptr idx) mem) && sym == nil => (MOVWloadidx ptr idx mem) +(MOVWstore [0] {sym} (ADD ptr idx) val mem) && sym == nil => (MOVWstoreidx ptr idx val mem) +(MOVWload [0] {sym} (ADDshiftLL ptr idx [c]) mem) && sym == nil => (MOVWloadshiftLL ptr idx [c] mem) +(MOVWload [0] {sym} (ADDshiftRL ptr idx [c]) mem) && sym == nil => (MOVWloadshiftRL ptr idx [c] mem) +(MOVWload [0] {sym} (ADDshiftRA ptr idx [c]) mem) && sym == nil => (MOVWloadshiftRA ptr idx [c] mem) +(MOVWstore [0] {sym} (ADDshiftLL ptr idx [c]) val mem) && sym == nil => (MOVWstoreshiftLL ptr idx [c] val mem) +(MOVWstore [0] {sym} (ADDshiftRL ptr idx [c]) val mem) && sym == nil => (MOVWstoreshiftRL ptr idx [c] val mem) +(MOVWstore [0] {sym} (ADDshiftRA ptr idx [c]) val mem) && sym == nil => (MOVWstoreshiftRA ptr idx [c] val mem) +(MOVBUload [0] {sym} (ADD ptr idx) mem) && sym == nil => (MOVBUloadidx ptr idx mem) +(MOVBload [0] {sym} (ADD ptr idx) mem) && sym == nil => (MOVBloadidx ptr idx mem) +(MOVBstore [0] {sym} (ADD ptr idx) val mem) && sym == nil => (MOVBstoreidx ptr idx val mem) +(MOVHUload [0] {sym} (ADD ptr idx) mem) && sym == nil => (MOVHUloadidx ptr idx mem) +(MOVHload [0] {sym} (ADD ptr idx) mem) && sym == nil => (MOVHloadidx ptr idx mem) +(MOVHstore [0] {sym} (ADD ptr idx) val mem) && sym == nil => (MOVHstoreidx ptr idx val mem) + +// constant folding in indexed loads and stores +(MOVWloadidx ptr (MOVWconst [c]) mem) => (MOVWload [c] ptr mem) +(MOVWloadidx (MOVWconst [c]) ptr mem) => (MOVWload [c] ptr mem) +(MOVBloadidx ptr (MOVWconst [c]) mem) => (MOVBload [c] ptr mem) +(MOVBloadidx (MOVWconst [c]) ptr mem) => (MOVBload [c] ptr mem) +(MOVBUloadidx ptr (MOVWconst [c]) mem) => (MOVBUload [c] ptr mem) +(MOVBUloadidx (MOVWconst [c]) ptr mem) => (MOVBUload [c] ptr mem) +(MOVHUloadidx ptr (MOVWconst [c]) mem) => (MOVHUload [c] ptr mem) +(MOVHUloadidx (MOVWconst [c]) ptr mem) => (MOVHUload [c] ptr mem) +(MOVHloadidx ptr (MOVWconst [c]) mem) => (MOVHload [c] ptr mem) +(MOVHloadidx (MOVWconst [c]) ptr mem) => (MOVHload [c] ptr mem) + +(MOVWstoreidx ptr (MOVWconst [c]) val mem) => (MOVWstore [c] ptr val mem) +(MOVWstoreidx (MOVWconst [c]) ptr val mem) => (MOVWstore [c] ptr val mem) +(MOVBstoreidx ptr (MOVWconst [c]) val mem) => (MOVBstore [c] ptr val mem) +(MOVBstoreidx (MOVWconst [c]) ptr val mem) => (MOVBstore [c] ptr val mem) +(MOVHstoreidx ptr (MOVWconst [c]) val mem) => (MOVHstore [c] ptr val mem) +(MOVHstoreidx (MOVWconst [c]) ptr val mem) => (MOVHstore [c] ptr val mem) + +(MOVWloadidx ptr (SLLconst idx [c]) mem) => (MOVWloadshiftLL ptr idx [c] mem) +(MOVWloadidx (SLLconst idx [c]) ptr mem) => (MOVWloadshiftLL ptr idx [c] mem) +(MOVWloadidx ptr (SRLconst idx [c]) mem) => (MOVWloadshiftRL ptr idx [c] mem) +(MOVWloadidx (SRLconst idx [c]) ptr mem) => (MOVWloadshiftRL ptr idx [c] mem) +(MOVWloadidx ptr (SRAconst idx [c]) mem) => (MOVWloadshiftRA ptr idx [c] mem) +(MOVWloadidx (SRAconst idx [c]) ptr mem) => (MOVWloadshiftRA ptr idx [c] mem) + +(MOVWstoreidx ptr (SLLconst idx [c]) val mem) => (MOVWstoreshiftLL ptr idx [c] val mem) +(MOVWstoreidx (SLLconst idx [c]) ptr val mem) => (MOVWstoreshiftLL ptr idx [c] val mem) +(MOVWstoreidx ptr (SRLconst idx [c]) val mem) => (MOVWstoreshiftRL ptr idx [c] val mem) +(MOVWstoreidx (SRLconst idx [c]) ptr val mem) => (MOVWstoreshiftRL ptr idx [c] val mem) +(MOVWstoreidx ptr (SRAconst idx [c]) val mem) => (MOVWstoreshiftRA ptr idx [c] val mem) +(MOVWstoreidx (SRAconst idx [c]) ptr val mem) => (MOVWstoreshiftRA ptr idx [c] val mem) + +(MOVWloadshiftLL ptr (MOVWconst [c]) [d] mem) => (MOVWload [int32(uint32(c)<<uint64(d))] ptr mem) +(MOVWloadshiftRL ptr (MOVWconst [c]) [d] mem) => (MOVWload [int32(uint32(c)>>uint64(d))] ptr mem) +(MOVWloadshiftRA ptr (MOVWconst [c]) [d] mem) => (MOVWload [c>>uint64(d)] ptr mem) + +(MOVWstoreshiftLL ptr (MOVWconst [c]) [d] val mem) => (MOVWstore [int32(uint32(c)<<uint64(d))] ptr val mem) +(MOVWstoreshiftRL ptr (MOVWconst [c]) [d] val mem) => (MOVWstore [int32(uint32(c)>>uint64(d))] ptr val mem) +(MOVWstoreshiftRA ptr (MOVWconst [c]) [d] val mem) => (MOVWstore [c>>uint64(d)] ptr val mem) + +// generic simplifications +(ADD x (RSBconst [0] y)) => (SUB x y) +(ADD <t> (RSBconst [c] x) (RSBconst [d] y)) => (RSBconst [c+d] (ADD <t> x y)) +(SUB x x) => (MOVWconst [0]) +(RSB x x) => (MOVWconst [0]) +(AND x x) => x +(OR x x) => x +(XOR x x) => (MOVWconst [0]) +(BIC x x) => (MOVWconst [0]) + +(ADD (MUL x y) a) => (MULA x y a) +(SUB a (MUL x y)) && buildcfg.GOARM == 7 => (MULS x y a) +(RSB (MUL x y) a) && buildcfg.GOARM == 7 => (MULS x y a) + +(NEGF (MULF x y)) && buildcfg.GOARM >= 6 => (NMULF x y) +(NEGD (MULD x y)) && buildcfg.GOARM >= 6 => (NMULD x y) +(MULF (NEGF x) y) && buildcfg.GOARM >= 6 => (NMULF x y) +(MULD (NEGD x) y) && buildcfg.GOARM >= 6 => (NMULD x y) +(NMULF (NEGF x) y) => (MULF x y) +(NMULD (NEGD x) y) => (MULD x y) + +// the result will overwrite the addend, since they are in the same register +(ADDF a (MULF x y)) && a.Uses == 1 && buildcfg.GOARM >= 6 => (MULAF a x y) +(ADDF a (NMULF x y)) && a.Uses == 1 && buildcfg.GOARM >= 6 => (MULSF a x y) +(ADDD a (MULD x y)) && a.Uses == 1 && buildcfg.GOARM >= 6 => (MULAD a x y) +(ADDD a (NMULD x y)) && a.Uses == 1 && buildcfg.GOARM >= 6 => (MULSD a x y) +(SUBF a (MULF x y)) && a.Uses == 1 && buildcfg.GOARM >= 6 => (MULSF a x y) +(SUBF a (NMULF x y)) && a.Uses == 1 && buildcfg.GOARM >= 6 => (MULAF a x y) +(SUBD a (MULD x y)) && a.Uses == 1 && buildcfg.GOARM >= 6 => (MULSD a x y) +(SUBD a (NMULD x y)) && a.Uses == 1 && buildcfg.GOARM >= 6 => (MULAD a x y) + +(AND x (MVN y)) => (BIC x y) + +// simplification with *shift ops +(SUBshiftLL (SLLconst x [c]) x [c]) => (MOVWconst [0]) +(SUBshiftRL (SRLconst x [c]) x [c]) => (MOVWconst [0]) +(SUBshiftRA (SRAconst x [c]) x [c]) => (MOVWconst [0]) +(RSBshiftLL (SLLconst x [c]) x [c]) => (MOVWconst [0]) +(RSBshiftRL (SRLconst x [c]) x [c]) => (MOVWconst [0]) +(RSBshiftRA (SRAconst x [c]) x [c]) => (MOVWconst [0]) +(ANDshiftLL y:(SLLconst x [c]) x [c]) => y +(ANDshiftRL y:(SRLconst x [c]) x [c]) => y +(ANDshiftRA y:(SRAconst x [c]) x [c]) => y +(ORshiftLL y:(SLLconst x [c]) x [c]) => y +(ORshiftRL y:(SRLconst x [c]) x [c]) => y +(ORshiftRA y:(SRAconst x [c]) x [c]) => y +(XORshiftLL (SLLconst x [c]) x [c]) => (MOVWconst [0]) +(XORshiftRL (SRLconst x [c]) x [c]) => (MOVWconst [0]) +(XORshiftRA (SRAconst x [c]) x [c]) => (MOVWconst [0]) +(BICshiftLL (SLLconst x [c]) x [c]) => (MOVWconst [0]) +(BICshiftRL (SRLconst x [c]) x [c]) => (MOVWconst [0]) +(BICshiftRA (SRAconst x [c]) x [c]) => (MOVWconst [0]) +(AND x (MVNshiftLL y [c])) => (BICshiftLL x y [c]) +(AND x (MVNshiftRL y [c])) => (BICshiftRL x y [c]) +(AND x (MVNshiftRA y [c])) => (BICshiftRA x y [c]) + +// floating point optimizations +(CMPF x (MOVFconst [0])) => (CMPF0 x) +(CMPD x (MOVDconst [0])) => (CMPD0 x) + +// bit extraction +(SRAconst (SLLconst x [c]) [d]) && buildcfg.GOARM==7 && uint64(d)>=uint64(c) && uint64(d)<=31 => (BFX [(d-c)|(32-d)<<8] x) +(SRLconst (SLLconst x [c]) [d]) && buildcfg.GOARM==7 && uint64(d)>=uint64(c) && uint64(d)<=31 => (BFXU [(d-c)|(32-d)<<8] x) + +// comparison simplification +((EQ|NE) (CMP x (RSBconst [0] y))) => ((EQ|NE) (CMN x y)) // sense of carry bit not preserved; see also #50854 +((EQ|NE) (CMN x (RSBconst [0] y))) => ((EQ|NE) (CMP x y)) // sense of carry bit not preserved; see also #50864 +(EQ (CMPconst [0] l:(SUB x y)) yes no) && l.Uses==1 => (EQ (CMP x y) yes no) +(EQ (CMPconst [0] l:(MULS x y a)) yes no) && l.Uses==1 => (EQ (CMP a (MUL <x.Type> x y)) yes no) +(EQ (CMPconst [0] l:(SUBconst [c] x)) yes no) && l.Uses==1 => (EQ (CMPconst [c] x) yes no) +(EQ (CMPconst [0] l:(SUBshiftLL x y [c])) yes no) && l.Uses==1 => (EQ (CMPshiftLL x y [c]) yes no) +(EQ (CMPconst [0] l:(SUBshiftRL x y [c])) yes no) && l.Uses==1 => (EQ (CMPshiftRL x y [c]) yes no) +(EQ (CMPconst [0] l:(SUBshiftRA x y [c])) yes no) && l.Uses==1 => (EQ (CMPshiftRA x y [c]) yes no) +(EQ (CMPconst [0] l:(SUBshiftLLreg x y z)) yes no) && l.Uses==1 => (EQ (CMPshiftLLreg x y z) yes no) +(EQ (CMPconst [0] l:(SUBshiftRLreg x y z)) yes no) && l.Uses==1 => (EQ (CMPshiftRLreg x y z) yes no) +(EQ (CMPconst [0] l:(SUBshiftRAreg x y z)) yes no) && l.Uses==1 => (EQ (CMPshiftRAreg x y z) yes no) +(NE (CMPconst [0] l:(SUB x y)) yes no) && l.Uses==1 => (NE (CMP x y) yes no) +(NE (CMPconst [0] l:(MULS x y a)) yes no) && l.Uses==1 => (NE (CMP a (MUL <x.Type> x y)) yes no) +(NE (CMPconst [0] l:(SUBconst [c] x)) yes no) && l.Uses==1 => (NE (CMPconst [c] x) yes no) +(NE (CMPconst [0] l:(SUBshiftLL x y [c])) yes no) && l.Uses==1 => (NE (CMPshiftLL x y [c]) yes no) +(NE (CMPconst [0] l:(SUBshiftRL x y [c])) yes no) && l.Uses==1 => (NE (CMPshiftRL x y [c]) yes no) +(NE (CMPconst [0] l:(SUBshiftRA x y [c])) yes no) && l.Uses==1 => (NE (CMPshiftRA x y [c]) yes no) +(NE (CMPconst [0] l:(SUBshiftLLreg x y z)) yes no) && l.Uses==1 => (NE (CMPshiftLLreg x y z) yes no) +(NE (CMPconst [0] l:(SUBshiftRLreg x y z)) yes no) && l.Uses==1 => (NE (CMPshiftRLreg x y z) yes no) +(NE (CMPconst [0] l:(SUBshiftRAreg x y z)) yes no) && l.Uses==1 => (NE (CMPshiftRAreg x y z) yes no) +(EQ (CMPconst [0] l:(ADD x y)) yes no) && l.Uses==1 => (EQ (CMN x y) yes no) +(EQ (CMPconst [0] l:(MULA x y a)) yes no) && l.Uses==1 => (EQ (CMN a (MUL <x.Type> x y)) yes no) +(EQ (CMPconst [0] l:(ADDconst [c] x)) yes no) && l.Uses==1 => (EQ (CMNconst [c] x) yes no) +(EQ (CMPconst [0] l:(ADDshiftLL x y [c])) yes no) && l.Uses==1 => (EQ (CMNshiftLL x y [c]) yes no) +(EQ (CMPconst [0] l:(ADDshiftRL x y [c])) yes no) && l.Uses==1 => (EQ (CMNshiftRL x y [c]) yes no) +(EQ (CMPconst [0] l:(ADDshiftRA x y [c])) yes no) && l.Uses==1 => (EQ (CMNshiftRA x y [c]) yes no) +(EQ (CMPconst [0] l:(ADDshiftLLreg x y z)) yes no) && l.Uses==1 => (EQ (CMNshiftLLreg x y z) yes no) +(EQ (CMPconst [0] l:(ADDshiftRLreg x y z)) yes no) && l.Uses==1 => (EQ (CMNshiftRLreg x y z) yes no) +(EQ (CMPconst [0] l:(ADDshiftRAreg x y z)) yes no) && l.Uses==1 => (EQ (CMNshiftRAreg x y z) yes no) +(NE (CMPconst [0] l:(ADD x y)) yes no) && l.Uses==1 => (NE (CMN x y) yes no) +(NE (CMPconst [0] l:(MULA x y a)) yes no) && l.Uses==1 => (NE (CMN a (MUL <x.Type> x y)) yes no) +(NE (CMPconst [0] l:(ADDconst [c] x)) yes no) && l.Uses==1 => (NE (CMNconst [c] x) yes no) +(NE (CMPconst [0] l:(ADDshiftLL x y [c])) yes no) && l.Uses==1 => (NE (CMNshiftLL x y [c]) yes no) +(NE (CMPconst [0] l:(ADDshiftRL x y [c])) yes no) && l.Uses==1 => (NE (CMNshiftRL x y [c]) yes no) +(NE (CMPconst [0] l:(ADDshiftRA x y [c])) yes no) && l.Uses==1 => (NE (CMNshiftRA x y [c]) yes no) +(NE (CMPconst [0] l:(ADDshiftLLreg x y z)) yes no) && l.Uses==1 => (NE (CMNshiftLLreg x y z) yes no) +(NE (CMPconst [0] l:(ADDshiftRLreg x y z)) yes no) && l.Uses==1 => (NE (CMNshiftRLreg x y z) yes no) +(NE (CMPconst [0] l:(ADDshiftRAreg x y z)) yes no) && l.Uses==1 => (NE (CMNshiftRAreg x y z) yes no) +(EQ (CMPconst [0] l:(AND x y)) yes no) && l.Uses==1 => (EQ (TST x y) yes no) +(EQ (CMPconst [0] l:(ANDconst [c] x)) yes no) && l.Uses==1 => (EQ (TSTconst [c] x) yes no) +(EQ (CMPconst [0] l:(ANDshiftLL x y [c])) yes no) && l.Uses==1 => (EQ (TSTshiftLL x y [c]) yes no) +(EQ (CMPconst [0] l:(ANDshiftRL x y [c])) yes no) && l.Uses==1 => (EQ (TSTshiftRL x y [c]) yes no) +(EQ (CMPconst [0] l:(ANDshiftRA x y [c])) yes no) && l.Uses==1 => (EQ (TSTshiftRA x y [c]) yes no) +(EQ (CMPconst [0] l:(ANDshiftLLreg x y z)) yes no) && l.Uses==1 => (EQ (TSTshiftLLreg x y z) yes no) +(EQ (CMPconst [0] l:(ANDshiftRLreg x y z)) yes no) && l.Uses==1 => (EQ (TSTshiftRLreg x y z) yes no) +(EQ (CMPconst [0] l:(ANDshiftRAreg x y z)) yes no) && l.Uses==1 => (EQ (TSTshiftRAreg x y z) yes no) +(NE (CMPconst [0] l:(AND x y)) yes no) && l.Uses==1 => (NE (TST x y) yes no) +(NE (CMPconst [0] l:(ANDconst [c] x)) yes no) && l.Uses==1 => (NE (TSTconst [c] x) yes no) +(NE (CMPconst [0] l:(ANDshiftLL x y [c])) yes no) && l.Uses==1 => (NE (TSTshiftLL x y [c]) yes no) +(NE (CMPconst [0] l:(ANDshiftRL x y [c])) yes no) && l.Uses==1 => (NE (TSTshiftRL x y [c]) yes no) +(NE (CMPconst [0] l:(ANDshiftRA x y [c])) yes no) && l.Uses==1 => (NE (TSTshiftRA x y [c]) yes no) +(NE (CMPconst [0] l:(ANDshiftLLreg x y z)) yes no) && l.Uses==1 => (NE (TSTshiftLLreg x y z) yes no) +(NE (CMPconst [0] l:(ANDshiftRLreg x y z)) yes no) && l.Uses==1 => (NE (TSTshiftRLreg x y z) yes no) +(NE (CMPconst [0] l:(ANDshiftRAreg x y z)) yes no) && l.Uses==1 => (NE (TSTshiftRAreg x y z) yes no) +(EQ (CMPconst [0] l:(XOR x y)) yes no) && l.Uses==1 => (EQ (TEQ x y) yes no) +(EQ (CMPconst [0] l:(XORconst [c] x)) yes no) && l.Uses==1 => (EQ (TEQconst [c] x) yes no) +(EQ (CMPconst [0] l:(XORshiftLL x y [c])) yes no) && l.Uses==1 => (EQ (TEQshiftLL x y [c]) yes no) +(EQ (CMPconst [0] l:(XORshiftRL x y [c])) yes no) && l.Uses==1 => (EQ (TEQshiftRL x y [c]) yes no) +(EQ (CMPconst [0] l:(XORshiftRA x y [c])) yes no) && l.Uses==1 => (EQ (TEQshiftRA x y [c]) yes no) +(EQ (CMPconst [0] l:(XORshiftLLreg x y z)) yes no) && l.Uses==1 => (EQ (TEQshiftLLreg x y z) yes no) +(EQ (CMPconst [0] l:(XORshiftRLreg x y z)) yes no) && l.Uses==1 => (EQ (TEQshiftRLreg x y z) yes no) +(EQ (CMPconst [0] l:(XORshiftRAreg x y z)) yes no) && l.Uses==1 => (EQ (TEQshiftRAreg x y z) yes no) +(NE (CMPconst [0] l:(XOR x y)) yes no) && l.Uses==1 => (NE (TEQ x y) yes no) +(NE (CMPconst [0] l:(XORconst [c] x)) yes no) && l.Uses==1 => (NE (TEQconst [c] x) yes no) +(NE (CMPconst [0] l:(XORshiftLL x y [c])) yes no) && l.Uses==1 => (NE (TEQshiftLL x y [c]) yes no) +(NE (CMPconst [0] l:(XORshiftRL x y [c])) yes no) && l.Uses==1 => (NE (TEQshiftRL x y [c]) yes no) +(NE (CMPconst [0] l:(XORshiftRA x y [c])) yes no) && l.Uses==1 => (NE (TEQshiftRA x y [c]) yes no) +(NE (CMPconst [0] l:(XORshiftLLreg x y z)) yes no) && l.Uses==1 => (NE (TEQshiftLLreg x y z) yes no) +(NE (CMPconst [0] l:(XORshiftRLreg x y z)) yes no) && l.Uses==1 => (NE (TEQshiftRLreg x y z) yes no) +(NE (CMPconst [0] l:(XORshiftRAreg x y z)) yes no) && l.Uses==1 => (NE (TEQshiftRAreg x y z) yes no) +(LT (CMPconst [0] l:(SUB x y)) yes no) && l.Uses==1 => (LTnoov (CMP x y) yes no) +(LT (CMPconst [0] l:(MULS x y a)) yes no) && l.Uses==1 => (LTnoov (CMP a (MUL <x.Type> x y)) yes no) +(LT (CMPconst [0] l:(SUBconst [c] x)) yes no) && l.Uses==1 => (LTnoov (CMPconst [c] x) yes no) +(LT (CMPconst [0] l:(SUBshiftLL x y [c])) yes no) && l.Uses==1 => (LTnoov (CMPshiftLL x y [c]) yes no) +(LT (CMPconst [0] l:(SUBshiftRL x y [c])) yes no) && l.Uses==1 => (LTnoov (CMPshiftRL x y [c]) yes no) +(LT (CMPconst [0] l:(SUBshiftRA x y [c])) yes no) && l.Uses==1 => (LTnoov (CMPshiftRA x y [c]) yes no) +(LT (CMPconst [0] l:(SUBshiftLLreg x y z)) yes no) && l.Uses==1 => (LTnoov (CMPshiftLLreg x y z) yes no) +(LT (CMPconst [0] l:(SUBshiftRLreg x y z)) yes no) && l.Uses==1 => (LTnoov (CMPshiftRLreg x y z) yes no) +(LT (CMPconst [0] l:(SUBshiftRAreg x y z)) yes no) && l.Uses==1 => (LTnoov (CMPshiftRAreg x y z) yes no) +(LE (CMPconst [0] l:(SUB x y)) yes no) && l.Uses==1 => (LEnoov (CMP x y) yes no) +(LE (CMPconst [0] l:(MULS x y a)) yes no) && l.Uses==1 => (LEnoov (CMP a (MUL <x.Type> x y)) yes no) +(LE (CMPconst [0] l:(SUBconst [c] x)) yes no) && l.Uses==1 => (LEnoov (CMPconst [c] x) yes no) +(LE (CMPconst [0] l:(SUBshiftLL x y [c])) yes no) && l.Uses==1 => (LEnoov (CMPshiftLL x y [c]) yes no) +(LE (CMPconst [0] l:(SUBshiftRL x y [c])) yes no) && l.Uses==1 => (LEnoov (CMPshiftRL x y [c]) yes no) +(LE (CMPconst [0] l:(SUBshiftRA x y [c])) yes no) && l.Uses==1 => (LEnoov (CMPshiftRA x y [c]) yes no) +(LE (CMPconst [0] l:(SUBshiftLLreg x y z)) yes no) && l.Uses==1 => (LEnoov (CMPshiftLLreg x y z) yes no) +(LE (CMPconst [0] l:(SUBshiftRLreg x y z)) yes no) && l.Uses==1 => (LEnoov (CMPshiftRLreg x y z) yes no) +(LE (CMPconst [0] l:(SUBshiftRAreg x y z)) yes no) && l.Uses==1 => (LEnoov (CMPshiftRAreg x y z) yes no) +(LT (CMPconst [0] l:(ADD x y)) yes no) && l.Uses==1 => (LTnoov (CMN x y) yes no) +(LT (CMPconst [0] l:(MULA x y a)) yes no) && l.Uses==1 => (LTnoov (CMN a (MUL <x.Type> x y)) yes no) +(LT (CMPconst [0] l:(ADDconst [c] x)) yes no) && l.Uses==1 => (LTnoov (CMNconst [c] x) yes no) +(LT (CMPconst [0] l:(ADDshiftLL x y [c])) yes no) && l.Uses==1 => (LTnoov (CMNshiftLL x y [c]) yes no) +(LT (CMPconst [0] l:(ADDshiftRL x y [c])) yes no) && l.Uses==1 => (LTnoov (CMNshiftRL x y [c]) yes no) +(LT (CMPconst [0] l:(ADDshiftRA x y [c])) yes no) && l.Uses==1 => (LTnoov (CMNshiftRA x y [c]) yes no) +(LT (CMPconst [0] l:(ADDshiftLLreg x y z)) yes no) && l.Uses==1 => (LTnoov (CMNshiftLLreg x y z) yes no) +(LT (CMPconst [0] l:(ADDshiftRLreg x y z)) yes no) && l.Uses==1 => (LTnoov (CMNshiftRLreg x y z) yes no) +(LT (CMPconst [0] l:(ADDshiftRAreg x y z)) yes no) && l.Uses==1 => (LTnoov (CMNshiftRAreg x y z) yes no) +(LE (CMPconst [0] l:(ADD x y)) yes no) && l.Uses==1 => (LEnoov (CMN x y) yes no) +(LE (CMPconst [0] l:(MULA x y a)) yes no) && l.Uses==1 => (LEnoov (CMN a (MUL <x.Type> x y)) yes no) +(LE (CMPconst [0] l:(ADDconst [c] x)) yes no) && l.Uses==1 => (LEnoov (CMNconst [c] x) yes no) +(LE (CMPconst [0] l:(ADDshiftLL x y [c])) yes no) && l.Uses==1 => (LEnoov (CMNshiftLL x y [c]) yes no) +(LE (CMPconst [0] l:(ADDshiftRL x y [c])) yes no) && l.Uses==1 => (LEnoov (CMNshiftRL x y [c]) yes no) +(LE (CMPconst [0] l:(ADDshiftRA x y [c])) yes no) && l.Uses==1 => (LEnoov (CMNshiftRA x y [c]) yes no) +(LE (CMPconst [0] l:(ADDshiftLLreg x y z)) yes no) && l.Uses==1 => (LEnoov (CMNshiftLLreg x y z) yes no) +(LE (CMPconst [0] l:(ADDshiftRLreg x y z)) yes no) && l.Uses==1 => (LEnoov (CMNshiftRLreg x y z) yes no) +(LE (CMPconst [0] l:(ADDshiftRAreg x y z)) yes no) && l.Uses==1 => (LEnoov (CMNshiftRAreg x y z) yes no) +(LT (CMPconst [0] l:(AND x y)) yes no) && l.Uses==1 => (LTnoov (TST x y) yes no) +(LT (CMPconst [0] l:(ANDconst [c] x)) yes no) && l.Uses==1 => (LTnoov (TSTconst [c] x) yes no) +(LT (CMPconst [0] l:(ANDshiftLL x y [c])) yes no) && l.Uses==1 => (LTnoov (TSTshiftLL x y [c]) yes no) +(LT (CMPconst [0] l:(ANDshiftRL x y [c])) yes no) && l.Uses==1 => (LTnoov (TSTshiftRL x y [c]) yes no) +(LT (CMPconst [0] l:(ANDshiftRA x y [c])) yes no) && l.Uses==1 => (LTnoov (TSTshiftRA x y [c]) yes no) +(LT (CMPconst [0] l:(ANDshiftLLreg x y z)) yes no) && l.Uses==1 => (LTnoov (TSTshiftLLreg x y z) yes no) +(LT (CMPconst [0] l:(ANDshiftRLreg x y z)) yes no) && l.Uses==1 => (LTnoov (TSTshiftRLreg x y z) yes no) +(LT (CMPconst [0] l:(ANDshiftRAreg x y z)) yes no) && l.Uses==1 => (LTnoov (TSTshiftRAreg x y z) yes no) +(LE (CMPconst [0] l:(AND x y)) yes no) && l.Uses==1 => (LEnoov (TST x y) yes no) +(LE (CMPconst [0] l:(ANDconst [c] x)) yes no) && l.Uses==1 => (LEnoov (TSTconst [c] x) yes no) +(LE (CMPconst [0] l:(ANDshiftLL x y [c])) yes no) && l.Uses==1 => (LEnoov (TSTshiftLL x y [c]) yes no) +(LE (CMPconst [0] l:(ANDshiftRL x y [c])) yes no) && l.Uses==1 => (LEnoov (TSTshiftRL x y [c]) yes no) +(LE (CMPconst [0] l:(ANDshiftRA x y [c])) yes no) && l.Uses==1 => (LEnoov (TSTshiftRA x y [c]) yes no) +(LE (CMPconst [0] l:(ANDshiftLLreg x y z)) yes no) && l.Uses==1 => (LEnoov (TSTshiftLLreg x y z) yes no) +(LE (CMPconst [0] l:(ANDshiftRLreg x y z)) yes no) && l.Uses==1 => (LEnoov (TSTshiftRLreg x y z) yes no) +(LE (CMPconst [0] l:(ANDshiftRAreg x y z)) yes no) && l.Uses==1 => (LEnoov (TSTshiftRAreg x y z) yes no) +(LT (CMPconst [0] l:(XOR x y)) yes no) && l.Uses==1 => (LTnoov (TEQ x y) yes no) +(LT (CMPconst [0] l:(XORconst [c] x)) yes no) && l.Uses==1 => (LTnoov (TEQconst [c] x) yes no) +(LT (CMPconst [0] l:(XORshiftLL x y [c])) yes no) && l.Uses==1 => (LTnoov (TEQshiftLL x y [c]) yes no) +(LT (CMPconst [0] l:(XORshiftRL x y [c])) yes no) && l.Uses==1 => (LTnoov (TEQshiftRL x y [c]) yes no) +(LT (CMPconst [0] l:(XORshiftRA x y [c])) yes no) && l.Uses==1 => (LTnoov (TEQshiftRA x y [c]) yes no) +(LT (CMPconst [0] l:(XORshiftLLreg x y z)) yes no) && l.Uses==1 => (LTnoov (TEQshiftLLreg x y z) yes no) +(LT (CMPconst [0] l:(XORshiftRLreg x y z)) yes no) && l.Uses==1 => (LTnoov (TEQshiftRLreg x y z) yes no) +(LT (CMPconst [0] l:(XORshiftRAreg x y z)) yes no) && l.Uses==1 => (LTnoov (TEQshiftRAreg x y z) yes no) +(LE (CMPconst [0] l:(XOR x y)) yes no) && l.Uses==1 => (LEnoov (TEQ x y) yes no) +(LE (CMPconst [0] l:(XORconst [c] x)) yes no) && l.Uses==1 => (LEnoov (TEQconst [c] x) yes no) +(LE (CMPconst [0] l:(XORshiftLL x y [c])) yes no) && l.Uses==1 => (LEnoov (TEQshiftLL x y [c]) yes no) +(LE (CMPconst [0] l:(XORshiftRL x y [c])) yes no) && l.Uses==1 => (LEnoov (TEQshiftRL x y [c]) yes no) +(LE (CMPconst [0] l:(XORshiftRA x y [c])) yes no) && l.Uses==1 => (LEnoov (TEQshiftRA x y [c]) yes no) +(LE (CMPconst [0] l:(XORshiftLLreg x y z)) yes no) && l.Uses==1 => (LEnoov (TEQshiftLLreg x y z) yes no) +(LE (CMPconst [0] l:(XORshiftRLreg x y z)) yes no) && l.Uses==1 => (LEnoov (TEQshiftRLreg x y z) yes no) +(LE (CMPconst [0] l:(XORshiftRAreg x y z)) yes no) && l.Uses==1 => (LEnoov (TEQshiftRAreg x y z) yes no) +(GT (CMPconst [0] l:(SUB x y)) yes no) && l.Uses==1 => (GTnoov (CMP x y) yes no) +(GT (CMPconst [0] l:(MULS x y a)) yes no) && l.Uses==1 => (GTnoov (CMP a (MUL <x.Type> x y)) yes no) +(GT (CMPconst [0] l:(SUBconst [c] x)) yes no) && l.Uses==1 => (GTnoov (CMPconst [c] x) yes no) +(GT (CMPconst [0] l:(SUBshiftLL x y [c])) yes no) && l.Uses==1 => (GTnoov (CMPshiftLL x y [c]) yes no) +(GT (CMPconst [0] l:(SUBshiftRL x y [c])) yes no) && l.Uses==1 => (GTnoov (CMPshiftRL x y [c]) yes no) +(GT (CMPconst [0] l:(SUBshiftRA x y [c])) yes no) && l.Uses==1 => (GTnoov (CMPshiftRA x y [c]) yes no) +(GT (CMPconst [0] l:(SUBshiftLLreg x y z)) yes no) && l.Uses==1 => (GTnoov (CMPshiftLLreg x y z) yes no) +(GT (CMPconst [0] l:(SUBshiftRLreg x y z)) yes no) && l.Uses==1 => (GTnoov (CMPshiftRLreg x y z) yes no) +(GT (CMPconst [0] l:(SUBshiftRAreg x y z)) yes no) && l.Uses==1 => (GTnoov (CMPshiftRAreg x y z) yes no) +(GE (CMPconst [0] l:(SUB x y)) yes no) && l.Uses==1 => (GEnoov (CMP x y) yes no) +(GE (CMPconst [0] l:(MULS x y a)) yes no) && l.Uses==1 => (GEnoov (CMP a (MUL <x.Type> x y)) yes no) +(GE (CMPconst [0] l:(SUBconst [c] x)) yes no) && l.Uses==1 => (GEnoov (CMPconst [c] x) yes no) +(GE (CMPconst [0] l:(SUBshiftLL x y [c])) yes no) && l.Uses==1 => (GEnoov (CMPshiftLL x y [c]) yes no) +(GE (CMPconst [0] l:(SUBshiftRL x y [c])) yes no) && l.Uses==1 => (GEnoov (CMPshiftRL x y [c]) yes no) +(GE (CMPconst [0] l:(SUBshiftRA x y [c])) yes no) && l.Uses==1 => (GEnoov (CMPshiftRA x y [c]) yes no) +(GE (CMPconst [0] l:(SUBshiftLLreg x y z)) yes no) && l.Uses==1 => (GEnoov (CMPshiftLLreg x y z) yes no) +(GE (CMPconst [0] l:(SUBshiftRLreg x y z)) yes no) && l.Uses==1 => (GEnoov (CMPshiftRLreg x y z) yes no) +(GE (CMPconst [0] l:(SUBshiftRAreg x y z)) yes no) && l.Uses==1 => (GEnoov (CMPshiftRAreg x y z) yes no) +(GT (CMPconst [0] l:(ADD x y)) yes no) && l.Uses==1 => (GTnoov (CMN x y) yes no) +(GT (CMPconst [0] l:(ADDconst [c] x)) yes no) && l.Uses==1 => (GTnoov (CMNconst [c] x) yes no) +(GT (CMPconst [0] l:(ADDshiftLL x y [c])) yes no) && l.Uses==1 => (GTnoov (CMNshiftLL x y [c]) yes no) +(GT (CMPconst [0] l:(ADDshiftRL x y [c])) yes no) && l.Uses==1 => (GTnoov (CMNshiftRL x y [c]) yes no) +(GT (CMPconst [0] l:(ADDshiftRA x y [c])) yes no) && l.Uses==1 => (GTnoov (CMNshiftRA x y [c]) yes no) +(GT (CMPconst [0] l:(ADDshiftLLreg x y z)) yes no) && l.Uses==1 => (GTnoov (CMNshiftLLreg x y z) yes no) +(GT (CMPconst [0] l:(ADDshiftRLreg x y z)) yes no) && l.Uses==1 => (GTnoov (CMNshiftRLreg x y z) yes no) +(GT (CMPconst [0] l:(ADDshiftRAreg x y z)) yes no) && l.Uses==1 => (GTnoov (CMNshiftRAreg x y z) yes no) +(GE (CMPconst [0] l:(ADD x y)) yes no) && l.Uses==1 => (GEnoov (CMN x y) yes no) +(GE (CMPconst [0] l:(MULA x y a)) yes no) && l.Uses==1 => (GEnoov (CMN a (MUL <x.Type> x y)) yes no) +(GE (CMPconst [0] l:(ADDconst [c] x)) yes no) && l.Uses==1 => (GEnoov (CMNconst [c] x) yes no) +(GE (CMPconst [0] l:(ADDshiftLL x y [c])) yes no) && l.Uses==1 => (GEnoov (CMNshiftLL x y [c]) yes no) +(GE (CMPconst [0] l:(ADDshiftRL x y [c])) yes no) && l.Uses==1 => (GEnoov (CMNshiftRL x y [c]) yes no) +(GE (CMPconst [0] l:(ADDshiftRA x y [c])) yes no) && l.Uses==1 => (GEnoov (CMNshiftRA x y [c]) yes no) +(GE (CMPconst [0] l:(ADDshiftLLreg x y z)) yes no) && l.Uses==1 => (GEnoov (CMNshiftLLreg x y z) yes no) +(GE (CMPconst [0] l:(ADDshiftRLreg x y z)) yes no) && l.Uses==1 => (GEnoov (CMNshiftRLreg x y z) yes no) +(GE (CMPconst [0] l:(ADDshiftRAreg x y z)) yes no) && l.Uses==1 => (GEnoov (CMNshiftRAreg x y z) yes no) +(GT (CMPconst [0] l:(MULA x y a)) yes no) && l.Uses==1 => (GTnoov (CMN a (MUL <x.Type> x y)) yes no) +(GT (CMPconst [0] l:(AND x y)) yes no) && l.Uses==1 => (GTnoov (TST x y) yes no) +(GT (CMPconst [0] l:(ANDconst [c] x)) yes no) && l.Uses==1 => (GTnoov (TSTconst [c] x) yes no) +(GT (CMPconst [0] l:(ANDshiftLL x y [c])) yes no) && l.Uses==1 => (GTnoov (TSTshiftLL x y [c]) yes no) +(GT (CMPconst [0] l:(ANDshiftRL x y [c])) yes no) && l.Uses==1 => (GTnoov (TSTshiftRL x y [c]) yes no) +(GT (CMPconst [0] l:(ANDshiftRA x y [c])) yes no) && l.Uses==1 => (GTnoov (TSTshiftRA x y [c]) yes no) +(GT (CMPconst [0] l:(ANDshiftLLreg x y z)) yes no) && l.Uses==1 => (GTnoov (TSTshiftLLreg x y z) yes no) +(GT (CMPconst [0] l:(ANDshiftRLreg x y z)) yes no) && l.Uses==1 => (GTnoov (TSTshiftRLreg x y z) yes no) +(GT (CMPconst [0] l:(ANDshiftRAreg x y z)) yes no) && l.Uses==1 => (GTnoov (TSTshiftRAreg x y z) yes no) +(GE (CMPconst [0] l:(AND x y)) yes no) && l.Uses==1 => (GEnoov (TST x y) yes no) +(GE (CMPconst [0] l:(ANDconst [c] x)) yes no) && l.Uses==1 => (GEnoov (TSTconst [c] x) yes no) +(GE (CMPconst [0] l:(ANDshiftLL x y [c])) yes no) && l.Uses==1 => (GEnoov (TSTshiftLL x y [c]) yes no) +(GE (CMPconst [0] l:(ANDshiftRL x y [c])) yes no) && l.Uses==1 => (GEnoov (TSTshiftRL x y [c]) yes no) +(GE (CMPconst [0] l:(ANDshiftRA x y [c])) yes no) && l.Uses==1 => (GEnoov (TSTshiftRA x y [c]) yes no) +(GE (CMPconst [0] l:(ANDshiftLLreg x y z)) yes no) && l.Uses==1 => (GEnoov (TSTshiftLLreg x y z) yes no) +(GE (CMPconst [0] l:(ANDshiftRLreg x y z)) yes no) && l.Uses==1 => (GEnoov (TSTshiftRLreg x y z) yes no) +(GE (CMPconst [0] l:(ANDshiftRAreg x y z)) yes no) && l.Uses==1 => (GEnoov (TSTshiftRAreg x y z) yes no) +(GT (CMPconst [0] l:(XOR x y)) yes no) && l.Uses==1 => (GTnoov (TEQ x y) yes no) +(GT (CMPconst [0] l:(XORconst [c] x)) yes no) && l.Uses==1 => (GTnoov (TEQconst [c] x) yes no) +(GT (CMPconst [0] l:(XORshiftLL x y [c])) yes no) && l.Uses==1 => (GTnoov (TEQshiftLL x y [c]) yes no) +(GT (CMPconst [0] l:(XORshiftRL x y [c])) yes no) && l.Uses==1 => (GTnoov (TEQshiftRL x y [c]) yes no) +(GT (CMPconst [0] l:(XORshiftRA x y [c])) yes no) && l.Uses==1 => (GTnoov (TEQshiftRA x y [c]) yes no) +(GT (CMPconst [0] l:(XORshiftLLreg x y z)) yes no) && l.Uses==1 => (GTnoov (TEQshiftLLreg x y z) yes no) +(GT (CMPconst [0] l:(XORshiftRLreg x y z)) yes no) && l.Uses==1 => (GTnoov (TEQshiftRLreg x y z) yes no) +(GT (CMPconst [0] l:(XORshiftRAreg x y z)) yes no) && l.Uses==1 => (GTnoov (TEQshiftRAreg x y z) yes no) +(GE (CMPconst [0] l:(XOR x y)) yes no) && l.Uses==1 => (GEnoov (TEQ x y) yes no) +(GE (CMPconst [0] l:(XORconst [c] x)) yes no) && l.Uses==1 => (GEnoov (TEQconst [c] x) yes no) +(GE (CMPconst [0] l:(XORshiftLL x y [c])) yes no) && l.Uses==1 => (GEnoov (TEQshiftLL x y [c]) yes no) +(GE (CMPconst [0] l:(XORshiftRL x y [c])) yes no) && l.Uses==1 => (GEnoov (TEQshiftRL x y [c]) yes no) +(GE (CMPconst [0] l:(XORshiftRA x y [c])) yes no) && l.Uses==1 => (GEnoov (TEQshiftRA x y [c]) yes no) +(GE (CMPconst [0] l:(XORshiftLLreg x y z)) yes no) && l.Uses==1 => (GEnoov (TEQshiftLLreg x y z) yes no) +(GE (CMPconst [0] l:(XORshiftRLreg x y z)) yes no) && l.Uses==1 => (GEnoov (TEQshiftRLreg x y z) yes no) +(GE (CMPconst [0] l:(XORshiftRAreg x y z)) yes no) && l.Uses==1 => (GEnoov (TEQshiftRAreg x y z) yes no) + +(MOVBUload [off] {sym} (SB) _) && symIsRO(sym) => (MOVWconst [int32(read8(sym, int64(off)))]) +(MOVHUload [off] {sym} (SB) _) && symIsRO(sym) => (MOVWconst [int32(read16(sym, int64(off), config.ctxt.Arch.ByteOrder))]) +(MOVWload [off] {sym} (SB) _) && symIsRO(sym) => (MOVWconst [int32(read32(sym, int64(off), config.ctxt.Arch.ByteOrder))]) diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64.rules b/src/cmd/compile/internal/ssa/_gen/ARM64.rules new file mode 100644 index 0000000..78a8492 --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/ARM64.rules @@ -0,0 +1,3030 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +(Add(Ptr|64|32|16|8) ...) => (ADD ...) +(Add(32F|64F) ...) => (FADD(S|D) ...) + +(Sub(Ptr|64|32|16|8) ...) => (SUB ...) +(Sub(32F|64F) ...) => (FSUB(S|D) ...) + +(Mul64 ...) => (MUL ...) +(Mul(32|16|8) ...) => (MULW ...) +(Mul(32F|64F) ...) => (FMUL(S|D) ...) + +(Hmul64 ...) => (MULH ...) +(Hmul64u ...) => (UMULH ...) +(Hmul32 x y) => (SRAconst (MULL <typ.Int64> x y) [32]) +(Hmul32u x y) => (SRAconst (UMULL <typ.UInt64> x y) [32]) +(Select0 (Mul64uhilo x y)) => (UMULH x y) +(Select1 (Mul64uhilo x y)) => (MUL x y) + +(Div64 [false] x y) => (DIV x y) +(Div64u ...) => (UDIV ...) +(Div32 [false] x y) => (DIVW x y) +(Div32u ...) => (UDIVW ...) +(Div16 [false] x y) => (DIVW (SignExt16to32 x) (SignExt16to32 y)) +(Div16u x y) => (UDIVW (ZeroExt16to32 x) (ZeroExt16to32 y)) +(Div8 x y) => (DIVW (SignExt8to32 x) (SignExt8to32 y)) +(Div8u x y) => (UDIVW (ZeroExt8to32 x) (ZeroExt8to32 y)) +(Div32F ...) => (FDIVS ...) +(Div64F ...) => (FDIVD ...) + +(Mod64 x y) => (MOD x y) +(Mod64u ...) => (UMOD ...) +(Mod32 x y) => (MODW x y) +(Mod32u ...) => (UMODW ...) +(Mod16 x y) => (MODW (SignExt16to32 x) (SignExt16to32 y)) +(Mod16u x y) => (UMODW (ZeroExt16to32 x) (ZeroExt16to32 y)) +(Mod8 x y) => (MODW (SignExt8to32 x) (SignExt8to32 y)) +(Mod8u x y) => (UMODW (ZeroExt8to32 x) (ZeroExt8to32 y)) + +// (x + y) / 2 with x>=y => (x - y) / 2 + y +(Avg64u <t> x y) => (ADD (SRLconst <t> (SUB <t> x y) [1]) y) + +(And(64|32|16|8) ...) => (AND ...) +(Or(64|32|16|8) ...) => (OR ...) +(Xor(64|32|16|8) ...) => (XOR ...) + +// unary ops +(Neg(64|32|16|8) ...) => (NEG ...) +(Neg(32F|64F) ...) => (FNEG(S|D) ...) +(Com(64|32|16|8) ...) => (MVN ...) + +// math package intrinsics +(Abs ...) => (FABSD ...) +(Sqrt ...) => (FSQRTD ...) +(Ceil ...) => (FRINTPD ...) +(Floor ...) => (FRINTMD ...) +(Round ...) => (FRINTAD ...) +(RoundToEven ...) => (FRINTND ...) +(Trunc ...) => (FRINTZD ...) +(FMA x y z) => (FMADDD z x y) + +(Sqrt32 ...) => (FSQRTS ...) + +// lowering rotates +// we do rotate detection in generic rules, if the following rules need to be changed, chcek generic rules first. +(RotateLeft8 <t> x (MOVDconst [c])) => (Or8 (Lsh8x64 <t> x (MOVDconst [c&7])) (Rsh8Ux64 <t> x (MOVDconst [-c&7]))) +(RotateLeft8 <t> x y) => (OR <t> (SLL <t> x (ANDconst <typ.Int64> [7] y)) (SRL <t> (ZeroExt8to64 x) (ANDconst <typ.Int64> [7] (NEG <typ.Int64> y)))) +(RotateLeft16 <t> x (MOVDconst [c])) => (Or16 (Lsh16x64 <t> x (MOVDconst [c&15])) (Rsh16Ux64 <t> x (MOVDconst [-c&15]))) +(RotateLeft16 <t> x y) => (RORW <t> (ORshiftLL <typ.UInt32> (ZeroExt16to32 x) (ZeroExt16to32 x) [16]) (NEG <typ.Int64> y)) +(RotateLeft32 x y) => (RORW x (NEG <y.Type> y)) +(RotateLeft64 x y) => (ROR x (NEG <y.Type> y)) + +(Ctz(64|32|16|8)NonZero ...) => (Ctz(64|32|32|32) ...) + +(Ctz64 <t> x) => (CLZ (RBIT <t> x)) +(Ctz32 <t> x) => (CLZW (RBITW <t> x)) +(Ctz16 <t> x) => (CLZW <t> (RBITW <typ.UInt32> (ORconst <typ.UInt32> [0x10000] x))) +(Ctz8 <t> x) => (CLZW <t> (RBITW <typ.UInt32> (ORconst <typ.UInt32> [0x100] x))) + +(PopCount64 <t> x) => (FMOVDfpgp <t> (VUADDLV <typ.Float64> (VCNT <typ.Float64> (FMOVDgpfp <typ.Float64> x)))) +(PopCount32 <t> x) => (FMOVDfpgp <t> (VUADDLV <typ.Float64> (VCNT <typ.Float64> (FMOVDgpfp <typ.Float64> (ZeroExt32to64 x))))) +(PopCount16 <t> x) => (FMOVDfpgp <t> (VUADDLV <typ.Float64> (VCNT <typ.Float64> (FMOVDgpfp <typ.Float64> (ZeroExt16to64 x))))) + +// Load args directly into the register class where it will be used. +(FMOVDgpfp <t> (Arg [off] {sym})) => @b.Func.Entry (Arg <t> [off] {sym}) +(FMOVDfpgp <t> (Arg [off] {sym})) => @b.Func.Entry (Arg <t> [off] {sym}) + +// Similarly for stores, if we see a store after FPR <=> GPR move, then redirect store to use the other register set. +(MOVDstore [off] {sym} ptr (FMOVDfpgp val) mem) => (FMOVDstore [off] {sym} ptr val mem) +(FMOVDstore [off] {sym} ptr (FMOVDgpfp val) mem) => (MOVDstore [off] {sym} ptr val mem) +(MOVWstore [off] {sym} ptr (FMOVSfpgp val) mem) => (FMOVSstore [off] {sym} ptr val mem) +(FMOVSstore [off] {sym} ptr (FMOVSgpfp val) mem) => (MOVWstore [off] {sym} ptr val mem) + +// float <=> int register moves, with no conversion. +// These come up when compiling math.{Float64bits, Float64frombits, Float32bits, Float32frombits}. +(MOVDload [off] {sym} ptr (FMOVDstore [off] {sym} ptr val _)) => (FMOVDfpgp val) +(FMOVDload [off] {sym} ptr (MOVDstore [off] {sym} ptr val _)) => (FMOVDgpfp val) +(MOVWUload [off] {sym} ptr (FMOVSstore [off] {sym} ptr val _)) => (FMOVSfpgp val) +(FMOVSload [off] {sym} ptr (MOVWstore [off] {sym} ptr val _)) => (FMOVSgpfp val) + +(BitLen64 x) => (SUB (MOVDconst [64]) (CLZ <typ.Int> x)) +(BitLen32 x) => (SUB (MOVDconst [32]) (CLZW <typ.Int> x)) + +(Bswap64 ...) => (REV ...) +(Bswap32 ...) => (REVW ...) + +(BitRev64 ...) => (RBIT ...) +(BitRev32 ...) => (RBITW ...) +(BitRev16 x) => (SRLconst [48] (RBIT <typ.UInt64> x)) +(BitRev8 x) => (SRLconst [56] (RBIT <typ.UInt64> x)) + +// In fact, UMOD will be translated into UREM instruction, and UREM is originally translated into +// UDIV and MSUB instructions. But if there is already an identical UDIV instruction just before or +// after UREM (case like quo, rem := z/y, z%y), then the second UDIV instruction becomes redundant. +// The purpose of this rule is to have this extra UDIV instruction removed in CSE pass. +(UMOD <typ.UInt64> x y) => (MSUB <typ.UInt64> x y (UDIV <typ.UInt64> x y)) +(UMODW <typ.UInt32> x y) => (MSUBW <typ.UInt32> x y (UDIVW <typ.UInt32> x y)) + +// 64-bit addition with carry. +(Select0 (Add64carry x y c)) => (Select0 <typ.UInt64> (ADCSflags x y (Select1 <types.TypeFlags> (ADDSconstflags [-1] c)))) +(Select1 (Add64carry x y c)) => (ADCzerocarry <typ.UInt64> (Select1 <types.TypeFlags> (ADCSflags x y (Select1 <types.TypeFlags> (ADDSconstflags [-1] c))))) + +// 64-bit subtraction with borrowing. +(Select0 (Sub64borrow x y bo)) => (Select0 <typ.UInt64> (SBCSflags x y (Select1 <types.TypeFlags> (NEGSflags bo)))) +(Select1 (Sub64borrow x y bo)) => (NEG <typ.UInt64> (NGCzerocarry <typ.UInt64> (Select1 <types.TypeFlags> (SBCSflags x y (Select1 <types.TypeFlags> (NEGSflags bo)))))) + +// boolean ops -- booleans are represented with 0=false, 1=true +(AndB ...) => (AND ...) +(OrB ...) => (OR ...) +(EqB x y) => (XOR (MOVDconst [1]) (XOR <typ.Bool> x y)) +(NeqB ...) => (XOR ...) +(Not x) => (XOR (MOVDconst [1]) x) + +// shifts +// hardware instruction uses only the low 6 bits of the shift +// we compare to 64 to ensure Go semantics for large shifts +// Rules about rotates with non-const shift are based on the following rules, +// if the following rules change, please also modify the rules based on them. + +// check shiftIsBounded first, if shift value is proved to be valid then we +// can do the shift directly. +// left shift +(Lsh(64|32|16|8)x64 <t> x y) && shiftIsBounded(v) => (SLL <t> x y) +(Lsh(64|32|16|8)x32 <t> x y) && shiftIsBounded(v) => (SLL <t> x y) +(Lsh(64|32|16|8)x16 <t> x y) && shiftIsBounded(v) => (SLL <t> x y) +(Lsh(64|32|16|8)x8 <t> x y) && shiftIsBounded(v) => (SLL <t> x y) + +// signed right shift +(Rsh64x(64|32|16|8) <t> x y) && shiftIsBounded(v) => (SRA <t> x y) +(Rsh32x(64|32|16|8) <t> x y) && shiftIsBounded(v) => (SRA <t> (SignExt32to64 x) y) +(Rsh16x(64|32|16|8) <t> x y) && shiftIsBounded(v) => (SRA <t> (SignExt16to64 x) y) +(Rsh8x(64|32|16|8) <t> x y) && shiftIsBounded(v) => (SRA <t> (SignExt8to64 x) y) + +// unsigned right shift +(Rsh64Ux(64|32|16|8) <t> x y) && shiftIsBounded(v) => (SRL <t> x y) +(Rsh32Ux(64|32|16|8) <t> x y) && shiftIsBounded(v) => (SRL <t> (ZeroExt32to64 x) y) +(Rsh16Ux(64|32|16|8) <t> x y) && shiftIsBounded(v) => (SRL <t> (ZeroExt16to64 x) y) +(Rsh8Ux(64|32|16|8) <t> x y) && shiftIsBounded(v) => (SRL <t> (ZeroExt8to64 x) y) + +// shift value may be out of range, use CMP + CSEL instead +(Lsh64x64 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] y)) +(Lsh64x32 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y))) +(Lsh64x16 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y))) +(Lsh64x8 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y))) + +(Lsh32x64 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] y)) +(Lsh32x32 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y))) +(Lsh32x16 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y))) +(Lsh32x8 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y))) + +(Lsh16x64 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] y)) +(Lsh16x32 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y))) +(Lsh16x16 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y))) +(Lsh16x8 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y))) + +(Lsh8x64 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] y)) +(Lsh8x32 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y))) +(Lsh8x16 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y))) +(Lsh8x8 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SLL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y))) + +(Rsh64Ux64 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> x y) (Const64 <t> [0]) (CMPconst [64] y)) +(Rsh64Ux32 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y))) +(Rsh64Ux16 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y))) +(Rsh64Ux8 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> x y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y))) + +(Rsh32Ux64 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt32to64 x) y) (Const64 <t> [0]) (CMPconst [64] y)) +(Rsh32Ux32 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt32to64 x) y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y))) +(Rsh32Ux16 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt32to64 x) y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y))) +(Rsh32Ux8 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt32to64 x) y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y))) + +(Rsh16Ux64 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt16to64 x) y) (Const64 <t> [0]) (CMPconst [64] y)) +(Rsh16Ux32 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt16to64 x) y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y))) +(Rsh16Ux16 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt16to64 x) y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y))) +(Rsh16Ux8 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt16to64 x) y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y))) + +(Rsh8Ux64 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt8to64 x) y) (Const64 <t> [0]) (CMPconst [64] y)) +(Rsh8Ux32 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt8to64 x) y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt32to64 y))) +(Rsh8Ux16 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt8to64 x) y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt16to64 y))) +(Rsh8Ux8 <t> x y) && !shiftIsBounded(v) => (CSEL [OpARM64LessThanU] (SRL <t> (ZeroExt8to64 x) y) (Const64 <t> [0]) (CMPconst [64] (ZeroExt8to64 y))) + +(Rsh64x64 x y) && !shiftIsBounded(v) => (SRA x (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] y))) +(Rsh64x32 x y) && !shiftIsBounded(v) => (SRA x (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt32to64 y)))) +(Rsh64x16 x y) && !shiftIsBounded(v) => (SRA x (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt16to64 y)))) +(Rsh64x8 x y) && !shiftIsBounded(v) => (SRA x (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt8to64 y)))) + +(Rsh32x64 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] y))) +(Rsh32x32 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt32to64 y)))) +(Rsh32x16 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt16to64 y)))) +(Rsh32x8 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt8to64 y)))) + +(Rsh16x64 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] y))) +(Rsh16x32 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt32to64 y)))) +(Rsh16x16 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt16to64 y)))) +(Rsh16x8 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt8to64 y)))) + +(Rsh8x64 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] y))) +(Rsh8x32 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt32to64 y)))) +(Rsh8x16 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt16to64 y)))) +(Rsh8x8 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (CSEL [OpARM64LessThanU] <y.Type> y (Const64 <y.Type> [63]) (CMPconst [64] (ZeroExt8to64 y)))) + +// constants +(Const(64|32|16|8) [val]) => (MOVDconst [int64(val)]) +(Const(32F|64F) [val]) => (FMOV(S|D)const [float64(val)]) +(ConstNil) => (MOVDconst [0]) +(ConstBool [t]) => (MOVDconst [b2i(t)]) + +(Slicemask <t> x) => (SRAconst (NEG <t> x) [63]) + +// truncations +// Because we ignore high parts of registers, truncates are just copies. +(Trunc16to8 ...) => (Copy ...) +(Trunc32to8 ...) => (Copy ...) +(Trunc32to16 ...) => (Copy ...) +(Trunc64to8 ...) => (Copy ...) +(Trunc64to16 ...) => (Copy ...) +(Trunc64to32 ...) => (Copy ...) + +// Zero-/Sign-extensions +(ZeroExt8to16 ...) => (MOVBUreg ...) +(ZeroExt8to32 ...) => (MOVBUreg ...) +(ZeroExt16to32 ...) => (MOVHUreg ...) +(ZeroExt8to64 ...) => (MOVBUreg ...) +(ZeroExt16to64 ...) => (MOVHUreg ...) +(ZeroExt32to64 ...) => (MOVWUreg ...) + +(SignExt8to16 ...) => (MOVBreg ...) +(SignExt8to32 ...) => (MOVBreg ...) +(SignExt16to32 ...) => (MOVHreg ...) +(SignExt8to64 ...) => (MOVBreg ...) +(SignExt16to64 ...) => (MOVHreg ...) +(SignExt32to64 ...) => (MOVWreg ...) + +// float <=> int conversion +(Cvt32to32F ...) => (SCVTFWS ...) +(Cvt32to64F ...) => (SCVTFWD ...) +(Cvt64to32F ...) => (SCVTFS ...) +(Cvt64to64F ...) => (SCVTFD ...) +(Cvt32Uto32F ...) => (UCVTFWS ...) +(Cvt32Uto64F ...) => (UCVTFWD ...) +(Cvt64Uto32F ...) => (UCVTFS ...) +(Cvt64Uto64F ...) => (UCVTFD ...) +(Cvt32Fto32 ...) => (FCVTZSSW ...) +(Cvt64Fto32 ...) => (FCVTZSDW ...) +(Cvt32Fto64 ...) => (FCVTZSS ...) +(Cvt64Fto64 ...) => (FCVTZSD ...) +(Cvt32Fto32U ...) => (FCVTZUSW ...) +(Cvt64Fto32U ...) => (FCVTZUDW ...) +(Cvt32Fto64U ...) => (FCVTZUS ...) +(Cvt64Fto64U ...) => (FCVTZUD ...) +(Cvt32Fto64F ...) => (FCVTSD ...) +(Cvt64Fto32F ...) => (FCVTDS ...) + +(CvtBoolToUint8 ...) => (Copy ...) + +(Round32F ...) => (LoweredRound32F ...) +(Round64F ...) => (LoweredRound64F ...) + +// comparisons +(Eq8 x y) => (Equal (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y))) +(Eq16 x y) => (Equal (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y))) +(Eq32 x y) => (Equal (CMPW x y)) +(Eq64 x y) => (Equal (CMP x y)) +(EqPtr x y) => (Equal (CMP x y)) +(Eq32F x y) => (Equal (FCMPS x y)) +(Eq64F x y) => (Equal (FCMPD x y)) + +(Neq8 x y) => (NotEqual (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y))) +(Neq16 x y) => (NotEqual (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y))) +(Neq32 x y) => (NotEqual (CMPW x y)) +(Neq64 x y) => (NotEqual (CMP x y)) +(NeqPtr x y) => (NotEqual (CMP x y)) +(Neq32F x y) => (NotEqual (FCMPS x y)) +(Neq64F x y) => (NotEqual (FCMPD x y)) + +(Less8 x y) => (LessThan (CMPW (SignExt8to32 x) (SignExt8to32 y))) +(Less16 x y) => (LessThan (CMPW (SignExt16to32 x) (SignExt16to32 y))) +(Less32 x y) => (LessThan (CMPW x y)) +(Less64 x y) => (LessThan (CMP x y)) + +// Set condition flags for floating-point comparisons "x < y" +// and "x <= y". Because if either or both of the operands are +// NaNs, all three of (x < y), (x == y) and (x > y) are false, +// and ARM Manual says FCMP instruction sets PSTATE.<N,Z,C,V> +// of this case to (0, 0, 1, 1). +(Less32F x y) => (LessThanF (FCMPS x y)) +(Less64F x y) => (LessThanF (FCMPD x y)) + +// For an unsigned integer x, the following rules are useful when combining branch +// 0 < x => x != 0 +// x <= 0 => x == 0 +// x < 1 => x == 0 +// 1 <= x => x != 0 +(Less(8U|16U|32U|64U) zero:(MOVDconst [0]) x) => (Neq(8|16|32|64) zero x) +(Leq(8U|16U|32U|64U) x zero:(MOVDconst [0])) => (Eq(8|16|32|64) x zero) +(Less(8U|16U|32U|64U) x (MOVDconst [1])) => (Eq(8|16|32|64) x (MOVDconst [0])) +(Leq(8U|16U|32U|64U) (MOVDconst [1]) x) => (Neq(8|16|32|64) (MOVDconst [0]) x) + +(Less8U x y) => (LessThanU (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y))) +(Less16U x y) => (LessThanU (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y))) +(Less32U x y) => (LessThanU (CMPW x y)) +(Less64U x y) => (LessThanU (CMP x y)) + +(Leq8 x y) => (LessEqual (CMPW (SignExt8to32 x) (SignExt8to32 y))) +(Leq16 x y) => (LessEqual (CMPW (SignExt16to32 x) (SignExt16to32 y))) +(Leq32 x y) => (LessEqual (CMPW x y)) +(Leq64 x y) => (LessEqual (CMP x y)) + +// Refer to the comments for op Less64F above. +(Leq32F x y) => (LessEqualF (FCMPS x y)) +(Leq64F x y) => (LessEqualF (FCMPD x y)) + +(Leq8U x y) => (LessEqualU (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y))) +(Leq16U x y) => (LessEqualU (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y))) +(Leq32U x y) => (LessEqualU (CMPW x y)) +(Leq64U x y) => (LessEqualU (CMP x y)) + +// Optimize comparison between a floating-point value and 0.0 with "FCMP $(0.0), Fn" +(FCMPS x (FMOVSconst [0])) => (FCMPS0 x) +(FCMPS (FMOVSconst [0]) x) => (InvertFlags (FCMPS0 x)) +(FCMPD x (FMOVDconst [0])) => (FCMPD0 x) +(FCMPD (FMOVDconst [0]) x) => (InvertFlags (FCMPD0 x)) + +// CSEL needs a flag-generating argument. Synthesize a TSTW if necessary. +(CondSelect x y boolval) && flagArg(boolval) != nil => (CSEL [boolval.Op] x y flagArg(boolval)) +(CondSelect x y boolval) && flagArg(boolval) == nil => (CSEL [OpARM64NotEqual] x y (TSTWconst [1] boolval)) + +(OffPtr [off] ptr:(SP)) && is32Bit(off) => (MOVDaddr [int32(off)] ptr) +(OffPtr [off] ptr) => (ADDconst [off] ptr) + +(Addr {sym} base) => (MOVDaddr {sym} base) +(LocalAddr {sym} base _) => (MOVDaddr {sym} base) + +// loads +(Load <t> ptr mem) && t.IsBoolean() => (MOVBUload ptr mem) +(Load <t> ptr mem) && (is8BitInt(t) && isSigned(t)) => (MOVBload ptr mem) +(Load <t> ptr mem) && (is8BitInt(t) && !isSigned(t)) => (MOVBUload ptr mem) +(Load <t> ptr mem) && (is16BitInt(t) && isSigned(t)) => (MOVHload ptr mem) +(Load <t> ptr mem) && (is16BitInt(t) && !isSigned(t)) => (MOVHUload ptr mem) +(Load <t> ptr mem) && (is32BitInt(t) && isSigned(t)) => (MOVWload ptr mem) +(Load <t> ptr mem) && (is32BitInt(t) && !isSigned(t)) => (MOVWUload ptr mem) +(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVDload ptr mem) +(Load <t> ptr mem) && is32BitFloat(t) => (FMOVSload ptr mem) +(Load <t> ptr mem) && is64BitFloat(t) => (FMOVDload ptr mem) + +// stores +(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 4 && !is32BitFloat(val.Type) => (MOVWstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 8 && !is64BitFloat(val.Type) => (MOVDstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (FMOVSstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (FMOVDstore ptr val mem) + +// zeroing +(Zero [0] _ mem) => mem +(Zero [1] ptr mem) => (MOVBstore ptr (MOVDconst [0]) mem) +(Zero [2] ptr mem) => (MOVHstore ptr (MOVDconst [0]) mem) +(Zero [4] ptr mem) => (MOVWstore ptr (MOVDconst [0]) mem) +(Zero [3] ptr mem) => + (MOVBstore [2] ptr (MOVDconst [0]) + (MOVHstore ptr (MOVDconst [0]) mem)) +(Zero [5] ptr mem) => + (MOVBstore [4] ptr (MOVDconst [0]) + (MOVWstore ptr (MOVDconst [0]) mem)) +(Zero [6] ptr mem) => + (MOVHstore [4] ptr (MOVDconst [0]) + (MOVWstore ptr (MOVDconst [0]) mem)) +(Zero [7] ptr mem) => + (MOVWstore [3] ptr (MOVDconst [0]) + (MOVWstore ptr (MOVDconst [0]) mem)) +(Zero [8] ptr mem) => (MOVDstore ptr (MOVDconst [0]) mem) +(Zero [9] ptr mem) => + (MOVBstore [8] ptr (MOVDconst [0]) + (MOVDstore ptr (MOVDconst [0]) mem)) +(Zero [10] ptr mem) => + (MOVHstore [8] ptr (MOVDconst [0]) + (MOVDstore ptr (MOVDconst [0]) mem)) +(Zero [11] ptr mem) => + (MOVDstore [3] ptr (MOVDconst [0]) + (MOVDstore ptr (MOVDconst [0]) mem)) +(Zero [12] ptr mem) => + (MOVWstore [8] ptr (MOVDconst [0]) + (MOVDstore ptr (MOVDconst [0]) mem)) +(Zero [13] ptr mem) => + (MOVDstore [5] ptr (MOVDconst [0]) + (MOVDstore ptr (MOVDconst [0]) mem)) +(Zero [14] ptr mem) => + (MOVDstore [6] ptr (MOVDconst [0]) + (MOVDstore ptr (MOVDconst [0]) mem)) +(Zero [15] ptr mem) => + (MOVDstore [7] ptr (MOVDconst [0]) + (MOVDstore ptr (MOVDconst [0]) mem)) +(Zero [16] ptr mem) => + (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem) + +(Zero [32] ptr mem) => + (STP [16] ptr (MOVDconst [0]) (MOVDconst [0]) + (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem)) + +(Zero [48] ptr mem) => + (STP [32] ptr (MOVDconst [0]) (MOVDconst [0]) + (STP [16] ptr (MOVDconst [0]) (MOVDconst [0]) + (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))) + +(Zero [64] ptr mem) => + (STP [48] ptr (MOVDconst [0]) (MOVDconst [0]) + (STP [32] ptr (MOVDconst [0]) (MOVDconst [0]) + (STP [16] ptr (MOVDconst [0]) (MOVDconst [0]) + (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem)))) + +// strip off fractional word zeroing +(Zero [s] ptr mem) && s%16 != 0 && s%16 <= 8 && s > 16 => + (Zero [8] + (OffPtr <ptr.Type> ptr [s-8]) + (Zero [s-s%16] ptr mem)) +(Zero [s] ptr mem) && s%16 != 0 && s%16 > 8 && s > 16 => + (Zero [16] + (OffPtr <ptr.Type> ptr [s-16]) + (Zero [s-s%16] ptr mem)) + +// medium zeroing uses a duff device +// 4, 16, and 64 are magic constants, see runtime/mkduff.go +(Zero [s] ptr mem) + && s%16 == 0 && s > 64 && s <= 16*64 + && !config.noDuffDevice => + (DUFFZERO [4 * (64 - s/16)] ptr mem) + +// large zeroing uses a loop +(Zero [s] ptr mem) + && s%16 == 0 && (s > 16*64 || config.noDuffDevice) => + (LoweredZero + ptr + (ADDconst <ptr.Type> [s-16] ptr) + mem) + +// moves +(Move [0] _ _ mem) => mem +(Move [1] dst src mem) => (MOVBstore dst (MOVBUload src mem) mem) +(Move [2] dst src mem) => (MOVHstore dst (MOVHUload src mem) mem) +(Move [3] dst src mem) => + (MOVBstore [2] dst (MOVBUload [2] src mem) + (MOVHstore dst (MOVHUload src mem) mem)) +(Move [4] dst src mem) => (MOVWstore dst (MOVWUload src mem) mem) +(Move [5] dst src mem) => + (MOVBstore [4] dst (MOVBUload [4] src mem) + (MOVWstore dst (MOVWUload src mem) mem)) +(Move [6] dst src mem) => + (MOVHstore [4] dst (MOVHUload [4] src mem) + (MOVWstore dst (MOVWUload src mem) mem)) +(Move [7] dst src mem) => + (MOVWstore [3] dst (MOVWUload [3] src mem) + (MOVWstore dst (MOVWUload src mem) mem)) +(Move [8] dst src mem) => (MOVDstore dst (MOVDload src mem) mem) +(Move [9] dst src mem) => + (MOVBstore [8] dst (MOVBUload [8] src mem) + (MOVDstore dst (MOVDload src mem) mem)) +(Move [10] dst src mem) => + (MOVHstore [8] dst (MOVHUload [8] src mem) + (MOVDstore dst (MOVDload src mem) mem)) +(Move [11] dst src mem) => + (MOVDstore [3] dst (MOVDload [3] src mem) + (MOVDstore dst (MOVDload src mem) mem)) +(Move [12] dst src mem) => + (MOVWstore [8] dst (MOVWUload [8] src mem) + (MOVDstore dst (MOVDload src mem) mem)) +(Move [13] dst src mem) => + (MOVDstore [5] dst (MOVDload [5] src mem) + (MOVDstore dst (MOVDload src mem) mem)) +(Move [14] dst src mem) => + (MOVDstore [6] dst (MOVDload [6] src mem) + (MOVDstore dst (MOVDload src mem) mem)) +(Move [15] dst src mem) => + (MOVDstore [7] dst (MOVDload [7] src mem) + (MOVDstore dst (MOVDload src mem) mem)) +(Move [16] dst src mem) => + (STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem) +(Move [32] dst src mem) => + (STP [16] dst (Select0 <typ.UInt64> (LDP [16] src mem)) (Select1 <typ.UInt64> (LDP [16] src mem)) + (STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem)) +(Move [48] dst src mem) => + (STP [32] dst (Select0 <typ.UInt64> (LDP [32] src mem)) (Select1 <typ.UInt64> (LDP [32] src mem)) + (STP [16] dst (Select0 <typ.UInt64> (LDP [16] src mem)) (Select1 <typ.UInt64> (LDP [16] src mem)) + (STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem))) +(Move [64] dst src mem) => + (STP [48] dst (Select0 <typ.UInt64> (LDP [48] src mem)) (Select1 <typ.UInt64> (LDP [48] src mem)) + (STP [32] dst (Select0 <typ.UInt64> (LDP [32] src mem)) (Select1 <typ.UInt64> (LDP [32] src mem)) + (STP [16] dst (Select0 <typ.UInt64> (LDP [16] src mem)) (Select1 <typ.UInt64> (LDP [16] src mem)) + (STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem)))) + +// strip off fractional word move +(Move [s] dst src mem) && s%16 != 0 && s%16 <= 8 && s > 16 => + (Move [8] + (OffPtr <dst.Type> dst [s-8]) + (OffPtr <src.Type> src [s-8]) + (Move [s-s%16] dst src mem)) +(Move [s] dst src mem) && s%16 != 0 && s%16 > 8 && s > 16 => + (Move [16] + (OffPtr <dst.Type> dst [s-16]) + (OffPtr <src.Type> src [s-16]) + (Move [s-s%16] dst src mem)) + +// medium move uses a duff device +(Move [s] dst src mem) + && s > 64 && s <= 16*64 && s%16 == 0 + && !config.noDuffDevice && logLargeCopy(v, s) => + (DUFFCOPY [8 * (64 - s/16)] dst src mem) +// 8 is the number of bytes to encode: +// +// LDP.P 16(R16), (R26, R27) +// STP.P (R26, R27), 16(R17) +// +// 64 is number of these blocks. See runtime/duff_arm64.s:duffcopy + +// large move uses a loop +(Move [s] dst src mem) + && s%16 == 0 && (s > 16*64 || config.noDuffDevice) + && logLargeCopy(v, s) => + (LoweredMove + dst + src + (ADDconst <src.Type> src [s-16]) + mem) + +// calls +(StaticCall ...) => (CALLstatic ...) +(ClosureCall ...) => (CALLclosure ...) +(InterCall ...) => (CALLinter ...) +(TailCall ...) => (CALLtail ...) + +// checks +(NilCheck ...) => (LoweredNilCheck ...) +(IsNonNil ptr) => (NotEqual (CMPconst [0] ptr)) +(IsInBounds idx len) => (LessThanU (CMP idx len)) +(IsSliceInBounds idx len) => (LessEqualU (CMP idx len)) + +// pseudo-ops +(GetClosurePtr ...) => (LoweredGetClosurePtr ...) +(GetCallerSP ...) => (LoweredGetCallerSP ...) +(GetCallerPC ...) => (LoweredGetCallerPC ...) + +// Absorb pseudo-ops into blocks. +(If (Equal cc) yes no) => (EQ cc yes no) +(If (NotEqual cc) yes no) => (NE cc yes no) +(If (LessThan cc) yes no) => (LT cc yes no) +(If (LessThanU cc) yes no) => (ULT cc yes no) +(If (LessEqual cc) yes no) => (LE cc yes no) +(If (LessEqualU cc) yes no) => (ULE cc yes no) +(If (GreaterThan cc) yes no) => (GT cc yes no) +(If (GreaterThanU cc) yes no) => (UGT cc yes no) +(If (GreaterEqual cc) yes no) => (GE cc yes no) +(If (GreaterEqualU cc) yes no) => (UGE cc yes no) +(If (LessThanF cc) yes no) => (FLT cc yes no) +(If (LessEqualF cc) yes no) => (FLE cc yes no) +(If (GreaterThanF cc) yes no) => (FGT cc yes no) +(If (GreaterEqualF cc) yes no) => (FGE cc yes no) + +(If cond yes no) => (TBNZ [0] cond yes no) + +(JumpTable idx) => (JUMPTABLE {makeJumpTableSym(b)} idx (MOVDaddr <typ.Uintptr> {makeJumpTableSym(b)} (SB))) + +// atomic intrinsics +// Note: these ops do not accept offset. +(AtomicLoad8 ...) => (LDARB ...) +(AtomicLoad32 ...) => (LDARW ...) +(AtomicLoad64 ...) => (LDAR ...) +(AtomicLoadPtr ...) => (LDAR ...) + +(AtomicStore8 ...) => (STLRB ...) +(AtomicStore32 ...) => (STLRW ...) +(AtomicStore64 ...) => (STLR ...) +(AtomicStorePtrNoWB ...) => (STLR ...) + +(AtomicExchange(32|64) ...) => (LoweredAtomicExchange(32|64) ...) +(AtomicAdd(32|64) ...) => (LoweredAtomicAdd(32|64) ...) +(AtomicCompareAndSwap(32|64) ...) => (LoweredAtomicCas(32|64) ...) + +(AtomicAdd(32|64)Variant ...) => (LoweredAtomicAdd(32|64)Variant ...) +(AtomicExchange(32|64)Variant ...) => (LoweredAtomicExchange(32|64)Variant ...) +(AtomicCompareAndSwap(32|64)Variant ...) => (LoweredAtomicCas(32|64)Variant ...) + +// Currently the updated value is not used, but we need a register to temporarily hold it. +(AtomicAnd8 ptr val mem) => (Select1 (LoweredAtomicAnd8 ptr val mem)) +(AtomicAnd32 ptr val mem) => (Select1 (LoweredAtomicAnd32 ptr val mem)) +(AtomicOr8 ptr val mem) => (Select1 (LoweredAtomicOr8 ptr val mem)) +(AtomicOr32 ptr val mem) => (Select1 (LoweredAtomicOr32 ptr val mem)) + +(AtomicAnd8Variant ptr val mem) => (Select1 (LoweredAtomicAnd8Variant ptr val mem)) +(AtomicAnd32Variant ptr val mem) => (Select1 (LoweredAtomicAnd32Variant ptr val mem)) +(AtomicOr8Variant ptr val mem) => (Select1 (LoweredAtomicOr8Variant ptr val mem)) +(AtomicOr32Variant ptr val mem) => (Select1 (LoweredAtomicOr32Variant ptr val mem)) + +// Write barrier. +(WB ...) => (LoweredWB ...) + +// Publication barrier (0xe is ST option) +(PubBarrier mem) => (DMB [0xe] mem) + +(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem) +(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem) +(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem) + +// Optimizations + +// Absorb boolean tests into block +(NZ (Equal cc) yes no) => (EQ cc yes no) +(NZ (NotEqual cc) yes no) => (NE cc yes no) +(NZ (LessThan cc) yes no) => (LT cc yes no) +(NZ (LessThanU cc) yes no) => (ULT cc yes no) +(NZ (LessEqual cc) yes no) => (LE cc yes no) +(NZ (LessEqualU cc) yes no) => (ULE cc yes no) +(NZ (GreaterThan cc) yes no) => (GT cc yes no) +(NZ (GreaterThanU cc) yes no) => (UGT cc yes no) +(NZ (GreaterEqual cc) yes no) => (GE cc yes no) +(NZ (GreaterEqualU cc) yes no) => (UGE cc yes no) +(NZ (LessThanF cc) yes no) => (FLT cc yes no) +(NZ (LessEqualF cc) yes no) => (FLE cc yes no) +(NZ (GreaterThanF cc) yes no) => (FGT cc yes no) +(NZ (GreaterEqualF cc) yes no) => (FGE cc yes no) + +(TBNZ [0] (Equal cc) yes no) => (EQ cc yes no) +(TBNZ [0] (NotEqual cc) yes no) => (NE cc yes no) +(TBNZ [0] (LessThan cc) yes no) => (LT cc yes no) +(TBNZ [0] (LessThanU cc) yes no) => (ULT cc yes no) +(TBNZ [0] (LessEqual cc) yes no) => (LE cc yes no) +(TBNZ [0] (LessEqualU cc) yes no) => (ULE cc yes no) +(TBNZ [0] (GreaterThan cc) yes no) => (GT cc yes no) +(TBNZ [0] (GreaterThanU cc) yes no) => (UGT cc yes no) +(TBNZ [0] (GreaterEqual cc) yes no) => (GE cc yes no) +(TBNZ [0] (GreaterEqualU cc) yes no) => (UGE cc yes no) +(TBNZ [0] (LessThanF cc) yes no) => (FLT cc yes no) +(TBNZ [0] (LessEqualF cc) yes no) => (FLE cc yes no) +(TBNZ [0] (GreaterThanF cc) yes no) => (FGT cc yes no) +(TBNZ [0] (GreaterEqualF cc) yes no) => (FGE cc yes no) + +(EQ (CMPconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (EQ (TST x y) yes no) +(NE (CMPconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (NE (TST x y) yes no) +(LT (CMPconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (LT (TST x y) yes no) +(LE (CMPconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (LE (TST x y) yes no) +(GT (CMPconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (GT (TST x y) yes no) +(GE (CMPconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (GE (TST x y) yes no) + +(EQ (CMPconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (EQ (TSTconst [c] y) yes no) +(NE (CMPconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (NE (TSTconst [c] y) yes no) +(LT (CMPconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (LT (TSTconst [c] y) yes no) +(LE (CMPconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (LE (TSTconst [c] y) yes no) +(GT (CMPconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (GT (TSTconst [c] y) yes no) +(GE (CMPconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (GE (TSTconst [c] y) yes no) + +(EQ (CMPWconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (EQ (TSTW x y) yes no) +(NE (CMPWconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (NE (TSTW x y) yes no) +(LT (CMPWconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (LT (TSTW x y) yes no) +(LE (CMPWconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (LE (TSTW x y) yes no) +(GT (CMPWconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (GT (TSTW x y) yes no) +(GE (CMPWconst [0] z:(AND x y)) yes no) && z.Uses == 1 => (GE (TSTW x y) yes no) + +(EQ (CMPWconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (EQ (TSTWconst [int32(c)] y) yes no) +(NE (CMPWconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (NE (TSTWconst [int32(c)] y) yes no) +(LT (CMPWconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (LT (TSTWconst [int32(c)] y) yes no) +(LE (CMPWconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (LE (TSTWconst [int32(c)] y) yes no) +(GT (CMPWconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (GT (TSTWconst [int32(c)] y) yes no) +(GE (CMPWconst [0] x:(ANDconst [c] y)) yes no) && x.Uses == 1 => (GE (TSTWconst [int32(c)] y) yes no) + +// For conditional instructions such as CSET, CSEL. +(Equal (CMPconst [0] z:(AND x y))) && z.Uses == 1 => (Equal (TST x y)) +(NotEqual (CMPconst [0] z:(AND x y))) && z.Uses == 1 => (NotEqual (TST x y)) +(LessThan (CMPconst [0] z:(AND x y))) && z.Uses == 1 => (LessThan (TST x y)) +(LessEqual (CMPconst [0] z:(AND x y))) && z.Uses == 1 => (LessEqual (TST x y)) +(GreaterThan (CMPconst [0] z:(AND x y))) && z.Uses == 1 => (GreaterThan (TST x y)) +(GreaterEqual (CMPconst [0] z:(AND x y))) && z.Uses == 1 => (GreaterEqual (TST x y)) + +(Equal (CMPWconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (Equal (TSTWconst [int32(c)] y)) +(NotEqual (CMPWconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (NotEqual (TSTWconst [int32(c)] y)) +(LessThan (CMPWconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (LessThan (TSTWconst [int32(c)] y)) +(LessEqual (CMPWconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (LessEqual (TSTWconst [int32(c)] y)) +(GreaterThan (CMPWconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (GreaterThan (TSTWconst [int32(c)] y)) +(GreaterEqual (CMPWconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (GreaterEqual (TSTWconst [int32(c)] y)) + +(Equal (CMPWconst [0] z:(AND x y))) && z.Uses == 1 => (Equal (TSTW x y)) +(NotEqual (CMPWconst [0] z:(AND x y))) && z.Uses == 1 => (NotEqual (TSTW x y)) +(LessThan (CMPWconst [0] z:(AND x y))) && z.Uses == 1 => (LessThan (TSTW x y)) +(LessEqual (CMPWconst [0] z:(AND x y))) && z.Uses == 1 => (LessEqual (TSTW x y)) +(GreaterThan (CMPWconst [0] z:(AND x y))) && z.Uses == 1 => (GreaterThan (TSTW x y)) +(GreaterEqual (CMPWconst [0] z:(AND x y))) && z.Uses == 1 => (GreaterEqual (TSTW x y)) + +(Equal (CMPconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (Equal (TSTconst [c] y)) +(NotEqual (CMPconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (NotEqual (TSTconst [c] y)) +(LessThan (CMPWconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (LessThan (TSTconst [c] y)) +(LessEqual (CMPWconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (LessEqual (TSTconst [c] y)) +(GreaterThan (CMPWconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (GreaterThan (TSTconst [c] y)) +(GreaterEqual (CMPWconst [0] x:(ANDconst [c] y))) && x.Uses == 1 => (GreaterEqual (TSTconst [c] y)) + +(EQ (CMPconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (EQ (CMNconst [c] y) yes no) +(NE (CMPconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (NE (CMNconst [c] y) yes no) +(LT (CMPconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (LTnoov (CMNconst [c] y) yes no) +(LE (CMPconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (LEnoov (CMNconst [c] y) yes no) +(GT (CMPconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (GTnoov (CMNconst [c] y) yes no) +(GE (CMPconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (GEnoov (CMNconst [c] y) yes no) + +(EQ (CMPWconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (EQ (CMNWconst [int32(c)] y) yes no) +(NE (CMPWconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (NE (CMNWconst [int32(c)] y) yes no) +(LT (CMPWconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (LTnoov (CMNWconst [int32(c)] y) yes no) +(LE (CMPWconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (LEnoov (CMNWconst [int32(c)] y) yes no) +(GT (CMPWconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (GTnoov (CMNWconst [int32(c)] y) yes no) +(GE (CMPWconst [0] x:(ADDconst [c] y)) yes no) && x.Uses == 1 => (GEnoov (CMNWconst [int32(c)] y) yes no) + +(EQ (CMPconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (EQ (CMN x y) yes no) +(NE (CMPconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (NE (CMN x y) yes no) +(LT (CMPconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (LTnoov (CMN x y) yes no) +(LE (CMPconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (LEnoov (CMN x y) yes no) +(GT (CMPconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (GTnoov (CMN x y) yes no) +(GE (CMPconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (GEnoov (CMN x y) yes no) + +(EQ (CMPWconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (EQ (CMNW x y) yes no) +(NE (CMPWconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (NE (CMNW x y) yes no) +(LT (CMPWconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (LTnoov (CMNW x y) yes no) +(LE (CMPWconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (LEnoov (CMNW x y) yes no) +(GT (CMPWconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (GTnoov (CMNW x y) yes no) +(GE (CMPWconst [0] z:(ADD x y)) yes no) && z.Uses == 1 => (GEnoov (CMNW x y) yes no) + +// CMP(x,-y) -> CMN(x,y) is only valid for unordered comparison, if y can be -1<<63 +(EQ (CMP x z:(NEG y)) yes no) && z.Uses == 1 => (EQ (CMN x y) yes no) +(NE (CMP x z:(NEG y)) yes no) && z.Uses == 1 => (NE (CMN x y) yes no) + +(Equal (CMP x z:(NEG y))) && z.Uses == 1 => (Equal (CMN x y)) +(NotEqual (CMP x z:(NEG y))) && z.Uses == 1 => (NotEqual (CMN x y)) + +// CMPW(x,-y) -> CMNW(x,y) is only valid for unordered comparison, if y can be -1<<31 +(EQ (CMPW x z:(NEG y)) yes no) && z.Uses == 1 => (EQ (CMNW x y) yes no) +(NE (CMPW x z:(NEG y)) yes no) && z.Uses == 1 => (NE (CMNW x y) yes no) + +(Equal (CMPW x z:(NEG y))) && z.Uses == 1 => (Equal (CMNW x y)) +(NotEqual (CMPW x z:(NEG y))) && z.Uses == 1 => (NotEqual (CMNW x y)) + +// For conditional instructions such as CSET, CSEL. +// TODO: add support for LT, LE, GT, GE, overflow needs to be considered. +(Equal (CMPconst [0] x:(ADDconst [c] y))) && x.Uses == 1 => (Equal (CMNconst [c] y)) +(NotEqual (CMPconst [0] x:(ADDconst [c] y))) && x.Uses == 1 => (NotEqual (CMNconst [c] y)) + +(Equal (CMPWconst [0] x:(ADDconst [c] y))) && x.Uses == 1 => (Equal (CMNWconst [int32(c)] y)) +(NotEqual (CMPWconst [0] x:(ADDconst [c] y))) && x.Uses == 1 => (NotEqual (CMNWconst [int32(c)] y)) + +(Equal (CMPconst [0] z:(ADD x y))) && z.Uses == 1 => (Equal (CMN x y)) +(NotEqual (CMPconst [0] z:(ADD x y))) && z.Uses == 1 => (NotEqual (CMN x y)) + +(Equal (CMPWconst [0] z:(ADD x y))) && z.Uses == 1 => (Equal (CMNW x y)) +(NotEqual (CMPWconst [0] z:(ADD x y))) && z.Uses == 1 => (NotEqual (CMNW x y)) + +(Equal (CMPconst [0] z:(MADD a x y))) && z.Uses==1 => (Equal (CMN a (MUL <x.Type> x y))) +(NotEqual (CMPconst [0] z:(MADD a x y))) && z.Uses==1 => (NotEqual (CMN a (MUL <x.Type> x y))) + +(Equal (CMPconst [0] z:(MSUB a x y))) && z.Uses==1 => (Equal (CMP a (MUL <x.Type> x y))) +(NotEqual (CMPconst [0] z:(MSUB a x y))) && z.Uses==1 => (NotEqual (CMP a (MUL <x.Type> x y))) + +(Equal (CMPWconst [0] z:(MADDW a x y))) && z.Uses==1 => (Equal (CMNW a (MULW <x.Type> x y))) +(NotEqual (CMPWconst [0] z:(MADDW a x y))) && z.Uses==1 => (NotEqual (CMNW a (MULW <x.Type> x y))) + +(Equal (CMPWconst [0] z:(MSUBW a x y))) && z.Uses==1 => (Equal (CMPW a (MULW <x.Type> x y))) +(NotEqual (CMPWconst [0] z:(MSUBW a x y))) && z.Uses==1 => (NotEqual (CMPW a (MULW <x.Type> x y))) + +(CMPconst [c] y) && c < 0 && c != -1<<63 => (CMNconst [-c] y) +(CMPWconst [c] y) && c < 0 && c != -1<<31 => (CMNWconst [-c] y) +(CMNconst [c] y) && c < 0 && c != -1<<63 => (CMPconst [-c] y) +(CMNWconst [c] y) && c < 0 && c != -1<<31 => (CMPWconst [-c] y) + +(EQ (CMPconst [0] x) yes no) => (Z x yes no) +(NE (CMPconst [0] x) yes no) => (NZ x yes no) +(EQ (CMPWconst [0] x) yes no) => (ZW x yes no) +(NE (CMPWconst [0] x) yes no) => (NZW x yes no) + +(EQ (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 => (EQ (CMN a (MUL <x.Type> x y)) yes no) +(NE (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 => (NE (CMN a (MUL <x.Type> x y)) yes no) +(LT (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 => (LTnoov (CMN a (MUL <x.Type> x y)) yes no) +(LE (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 => (LEnoov (CMN a (MUL <x.Type> x y)) yes no) +(GT (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 => (GTnoov (CMN a (MUL <x.Type> x y)) yes no) +(GE (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 => (GEnoov (CMN a (MUL <x.Type> x y)) yes no) + +(EQ (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 => (EQ (CMP a (MUL <x.Type> x y)) yes no) +(NE (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 => (NE (CMP a (MUL <x.Type> x y)) yes no) +(LE (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 => (LEnoov (CMP a (MUL <x.Type> x y)) yes no) +(LT (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 => (LTnoov (CMP a (MUL <x.Type> x y)) yes no) +(GE (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 => (GEnoov (CMP a (MUL <x.Type> x y)) yes no) +(GT (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 => (GTnoov (CMP a (MUL <x.Type> x y)) yes no) + +(EQ (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 => (EQ (CMNW a (MULW <x.Type> x y)) yes no) +(NE (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 => (NE (CMNW a (MULW <x.Type> x y)) yes no) +(LE (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 => (LEnoov (CMNW a (MULW <x.Type> x y)) yes no) +(LT (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 => (LTnoov (CMNW a (MULW <x.Type> x y)) yes no) +(GE (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 => (GEnoov (CMNW a (MULW <x.Type> x y)) yes no) +(GT (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 => (GTnoov (CMNW a (MULW <x.Type> x y)) yes no) + +(EQ (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 => (EQ (CMPW a (MULW <x.Type> x y)) yes no) +(NE (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 => (NE (CMPW a (MULW <x.Type> x y)) yes no) +(LE (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 => (LEnoov (CMPW a (MULW <x.Type> x y)) yes no) +(LT (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 => (LTnoov (CMPW a (MULW <x.Type> x y)) yes no) +(GE (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 => (GEnoov (CMPW a (MULW <x.Type> x y)) yes no) +(GT (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 => (GTnoov (CMPW a (MULW <x.Type> x y)) yes no) + +// Absorb bit-tests into block +(Z (ANDconst [c] x) yes no) && oneBit(c) => (TBZ [int64(ntz64(c))] x yes no) +(NZ (ANDconst [c] x) yes no) && oneBit(c) => (TBNZ [int64(ntz64(c))] x yes no) +(ZW (ANDconst [c] x) yes no) && oneBit(int64(uint32(c))) => (TBZ [int64(ntz64(int64(uint32(c))))] x yes no) +(NZW (ANDconst [c] x) yes no) && oneBit(int64(uint32(c))) => (TBNZ [int64(ntz64(int64(uint32(c))))] x yes no) +(EQ (TSTconst [c] x) yes no) && oneBit(c) => (TBZ [int64(ntz64(c))] x yes no) +(NE (TSTconst [c] x) yes no) && oneBit(c) => (TBNZ [int64(ntz64(c))] x yes no) +(EQ (TSTWconst [c] x) yes no) && oneBit(int64(uint32(c))) => (TBZ [int64(ntz64(int64(uint32(c))))] x yes no) +(NE (TSTWconst [c] x) yes no) && oneBit(int64(uint32(c))) => (TBNZ [int64(ntz64(int64(uint32(c))))] x yes no) + +// Test sign-bit for signed comparisons against zero +(GE (CMPWconst [0] x) yes no) => (TBZ [31] x yes no) +(GE (CMPconst [0] x) yes no) => (TBZ [63] x yes no) +(LT (CMPWconst [0] x) yes no) => (TBNZ [31] x yes no) +(LT (CMPconst [0] x) yes no) => (TBNZ [63] x yes no) + +// fold offset into address +(ADDconst [off1] (MOVDaddr [off2] {sym} ptr)) && is32Bit(off1+int64(off2)) => + (MOVDaddr [int32(off1)+off2] {sym} ptr) + +// fold address into load/store +(MOVBload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVBload [off1+int32(off2)] {sym} ptr mem) +(MOVBUload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVBUload [off1+int32(off2)] {sym} ptr mem) +(MOVHload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVHload [off1+int32(off2)] {sym} ptr mem) +(MOVHUload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVHUload [off1+int32(off2)] {sym} ptr mem) +(MOVWload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVWload [off1+int32(off2)] {sym} ptr mem) +(MOVWUload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVWUload [off1+int32(off2)] {sym} ptr mem) +(MOVDload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVDload [off1+int32(off2)] {sym} ptr mem) +(LDP [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (LDP [off1+int32(off2)] {sym} ptr mem) +(FMOVSload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (FMOVSload [off1+int32(off2)] {sym} ptr mem) +(FMOVDload [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (FMOVDload [off1+int32(off2)] {sym} ptr mem) + +// register indexed load +(MOVDload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVDloadidx ptr idx mem) +(MOVWUload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVWUloadidx ptr idx mem) +(MOVWload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVWloadidx ptr idx mem) +(MOVHUload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVHUloadidx ptr idx mem) +(MOVHload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVHloadidx ptr idx mem) +(MOVBUload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVBUloadidx ptr idx mem) +(MOVBload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVBloadidx ptr idx mem) +(FMOVSload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (FMOVSloadidx ptr idx mem) +(FMOVDload [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (FMOVDloadidx ptr idx mem) +(MOVDloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVDload [int32(c)] ptr mem) +(MOVDloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (MOVDload [int32(c)] ptr mem) +(MOVWUloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVWUload [int32(c)] ptr mem) +(MOVWUloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (MOVWUload [int32(c)] ptr mem) +(MOVWloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVWload [int32(c)] ptr mem) +(MOVWloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (MOVWload [int32(c)] ptr mem) +(MOVHUloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVHUload [int32(c)] ptr mem) +(MOVHUloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (MOVHUload [int32(c)] ptr mem) +(MOVHloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVHload [int32(c)] ptr mem) +(MOVHloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (MOVHload [int32(c)] ptr mem) +(MOVBUloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVBUload [int32(c)] ptr mem) +(MOVBUloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (MOVBUload [int32(c)] ptr mem) +(MOVBloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVBload [int32(c)] ptr mem) +(MOVBloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (MOVBload [int32(c)] ptr mem) +(FMOVSloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (FMOVSload [int32(c)] ptr mem) +(FMOVSloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (FMOVSload [int32(c)] ptr mem) +(FMOVDloadidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (FMOVDload [int32(c)] ptr mem) +(FMOVDloadidx (MOVDconst [c]) ptr mem) && is32Bit(c) => (FMOVDload [int32(c)] ptr mem) + +// shifted register indexed load +(MOVDload [off] {sym} (ADDshiftLL [3] ptr idx) mem) && off == 0 && sym == nil => (MOVDloadidx8 ptr idx mem) +(MOVWUload [off] {sym} (ADDshiftLL [2] ptr idx) mem) && off == 0 && sym == nil => (MOVWUloadidx4 ptr idx mem) +(MOVWload [off] {sym} (ADDshiftLL [2] ptr idx) mem) && off == 0 && sym == nil => (MOVWloadidx4 ptr idx mem) +(MOVHUload [off] {sym} (ADDshiftLL [1] ptr idx) mem) && off == 0 && sym == nil => (MOVHUloadidx2 ptr idx mem) +(MOVHload [off] {sym} (ADDshiftLL [1] ptr idx) mem) && off == 0 && sym == nil => (MOVHloadidx2 ptr idx mem) +(MOVDloadidx ptr (SLLconst [3] idx) mem) => (MOVDloadidx8 ptr idx mem) +(MOVWloadidx ptr (SLLconst [2] idx) mem) => (MOVWloadidx4 ptr idx mem) +(MOVWUloadidx ptr (SLLconst [2] idx) mem) => (MOVWUloadidx4 ptr idx mem) +(MOVHloadidx ptr (SLLconst [1] idx) mem) => (MOVHloadidx2 ptr idx mem) +(MOVHUloadidx ptr (SLLconst [1] idx) mem) => (MOVHUloadidx2 ptr idx mem) +(MOVHloadidx ptr (ADD idx idx) mem) => (MOVHloadidx2 ptr idx mem) +(MOVHUloadidx ptr (ADD idx idx) mem) => (MOVHUloadidx2 ptr idx mem) +(MOVDloadidx (SLLconst [3] idx) ptr mem) => (MOVDloadidx8 ptr idx mem) +(MOVWloadidx (SLLconst [2] idx) ptr mem) => (MOVWloadidx4 ptr idx mem) +(MOVWUloadidx (SLLconst [2] idx) ptr mem) => (MOVWUloadidx4 ptr idx mem) +(MOVHloadidx (ADD idx idx) ptr mem) => (MOVHloadidx2 ptr idx mem) +(MOVHUloadidx (ADD idx idx) ptr mem) => (MOVHUloadidx2 ptr idx mem) +(MOVDloadidx8 ptr (MOVDconst [c]) mem) && is32Bit(c<<3) => (MOVDload [int32(c)<<3] ptr mem) +(MOVWUloadidx4 ptr (MOVDconst [c]) mem) && is32Bit(c<<2) => (MOVWUload [int32(c)<<2] ptr mem) +(MOVWloadidx4 ptr (MOVDconst [c]) mem) && is32Bit(c<<2) => (MOVWload [int32(c)<<2] ptr mem) +(MOVHUloadidx2 ptr (MOVDconst [c]) mem) && is32Bit(c<<1) => (MOVHUload [int32(c)<<1] ptr mem) +(MOVHloadidx2 ptr (MOVDconst [c]) mem) && is32Bit(c<<1) => (MOVHload [int32(c)<<1] ptr mem) + +(FMOVDload [off] {sym} (ADDshiftLL [3] ptr idx) mem) && off == 0 && sym == nil => (FMOVDloadidx8 ptr idx mem) +(FMOVSload [off] {sym} (ADDshiftLL [2] ptr idx) mem) && off == 0 && sym == nil => (FMOVSloadidx4 ptr idx mem) +(FMOVDloadidx ptr (SLLconst [3] idx) mem) => (FMOVDloadidx8 ptr idx mem) +(FMOVSloadidx ptr (SLLconst [2] idx) mem) => (FMOVSloadidx4 ptr idx mem) +(FMOVDloadidx (SLLconst [3] idx) ptr mem) => (FMOVDloadidx8 ptr idx mem) +(FMOVSloadidx (SLLconst [2] idx) ptr mem) => (FMOVSloadidx4 ptr idx mem) +(FMOVDloadidx8 ptr (MOVDconst [c]) mem) && is32Bit(c<<3) => (FMOVDload ptr [int32(c)<<3] mem) +(FMOVSloadidx4 ptr (MOVDconst [c]) mem) && is32Bit(c<<2) => (FMOVSload ptr [int32(c)<<2] mem) + +(MOVBstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVBstore [off1+int32(off2)] {sym} ptr val mem) +(MOVHstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVHstore [off1+int32(off2)] {sym} ptr val mem) +(MOVWstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVWstore [off1+int32(off2)] {sym} ptr val mem) +(MOVDstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVDstore [off1+int32(off2)] {sym} ptr val mem) +(STP [off1] {sym} (ADDconst [off2] ptr) val1 val2 mem) && is32Bit(int64(off1)+off2) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (STP [off1+int32(off2)] {sym} ptr val1 val2 mem) +(FMOVSstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (FMOVSstore [off1+int32(off2)] {sym} ptr val mem) +(FMOVDstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (FMOVDstore [off1+int32(off2)] {sym} ptr val mem) +(MOVBstorezero [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVBstorezero [off1+int32(off2)] {sym} ptr mem) +(MOVHstorezero [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVHstorezero [off1+int32(off2)] {sym} ptr mem) +(MOVWstorezero [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVWstorezero [off1+int32(off2)] {sym} ptr mem) +(MOVDstorezero [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVDstorezero [off1+int32(off2)] {sym} ptr mem) +(MOVQstorezero [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVQstorezero [off1+int32(off2)] {sym} ptr mem) + +// register indexed store +(MOVDstore [off] {sym} (ADD ptr idx) val mem) && off == 0 && sym == nil => (MOVDstoreidx ptr idx val mem) +(MOVWstore [off] {sym} (ADD ptr idx) val mem) && off == 0 && sym == nil => (MOVWstoreidx ptr idx val mem) +(MOVHstore [off] {sym} (ADD ptr idx) val mem) && off == 0 && sym == nil => (MOVHstoreidx ptr idx val mem) +(MOVBstore [off] {sym} (ADD ptr idx) val mem) && off == 0 && sym == nil => (MOVBstoreidx ptr idx val mem) +(FMOVDstore [off] {sym} (ADD ptr idx) val mem) && off == 0 && sym == nil => (FMOVDstoreidx ptr idx val mem) +(FMOVSstore [off] {sym} (ADD ptr idx) val mem) && off == 0 && sym == nil => (FMOVSstoreidx ptr idx val mem) +(MOVDstoreidx ptr (MOVDconst [c]) val mem) && is32Bit(c) => (MOVDstore [int32(c)] ptr val mem) +(MOVDstoreidx (MOVDconst [c]) idx val mem) && is32Bit(c) => (MOVDstore [int32(c)] idx val mem) +(MOVWstoreidx ptr (MOVDconst [c]) val mem) && is32Bit(c) => (MOVWstore [int32(c)] ptr val mem) +(MOVWstoreidx (MOVDconst [c]) idx val mem) && is32Bit(c) => (MOVWstore [int32(c)] idx val mem) +(MOVHstoreidx ptr (MOVDconst [c]) val mem) && is32Bit(c) => (MOVHstore [int32(c)] ptr val mem) +(MOVHstoreidx (MOVDconst [c]) idx val mem) && is32Bit(c) => (MOVHstore [int32(c)] idx val mem) +(MOVBstoreidx ptr (MOVDconst [c]) val mem) && is32Bit(c) => (MOVBstore [int32(c)] ptr val mem) +(MOVBstoreidx (MOVDconst [c]) idx val mem) && is32Bit(c) => (MOVBstore [int32(c)] idx val mem) +(FMOVDstoreidx ptr (MOVDconst [c]) val mem) && is32Bit(c) => (FMOVDstore [int32(c)] ptr val mem) +(FMOVDstoreidx (MOVDconst [c]) idx val mem) && is32Bit(c) => (FMOVDstore [int32(c)] idx val mem) +(FMOVSstoreidx ptr (MOVDconst [c]) val mem) && is32Bit(c) => (FMOVSstore [int32(c)] ptr val mem) +(FMOVSstoreidx (MOVDconst [c]) idx val mem) && is32Bit(c) => (FMOVSstore [int32(c)] idx val mem) + +// shifted register indexed store +(MOVDstore [off] {sym} (ADDshiftLL [3] ptr idx) val mem) && off == 0 && sym == nil => (MOVDstoreidx8 ptr idx val mem) +(MOVWstore [off] {sym} (ADDshiftLL [2] ptr idx) val mem) && off == 0 && sym == nil => (MOVWstoreidx4 ptr idx val mem) +(MOVHstore [off] {sym} (ADDshiftLL [1] ptr idx) val mem) && off == 0 && sym == nil => (MOVHstoreidx2 ptr idx val mem) +(MOVDstoreidx ptr (SLLconst [3] idx) val mem) => (MOVDstoreidx8 ptr idx val mem) +(MOVWstoreidx ptr (SLLconst [2] idx) val mem) => (MOVWstoreidx4 ptr idx val mem) +(MOVHstoreidx ptr (SLLconst [1] idx) val mem) => (MOVHstoreidx2 ptr idx val mem) +(MOVHstoreidx ptr (ADD idx idx) val mem) => (MOVHstoreidx2 ptr idx val mem) +(MOVDstoreidx (SLLconst [3] idx) ptr val mem) => (MOVDstoreidx8 ptr idx val mem) +(MOVWstoreidx (SLLconst [2] idx) ptr val mem) => (MOVWstoreidx4 ptr idx val mem) +(MOVHstoreidx (SLLconst [1] idx) ptr val mem) => (MOVHstoreidx2 ptr idx val mem) +(MOVHstoreidx (ADD idx idx) ptr val mem) => (MOVHstoreidx2 ptr idx val mem) +(MOVDstoreidx8 ptr (MOVDconst [c]) val mem) && is32Bit(c<<3) => (MOVDstore [int32(c)<<3] ptr val mem) +(MOVWstoreidx4 ptr (MOVDconst [c]) val mem) && is32Bit(c<<2) => (MOVWstore [int32(c)<<2] ptr val mem) +(MOVHstoreidx2 ptr (MOVDconst [c]) val mem) && is32Bit(c<<1) => (MOVHstore [int32(c)<<1] ptr val mem) + +(FMOVDstore [off] {sym} (ADDshiftLL [3] ptr idx) val mem) && off == 0 && sym == nil => (FMOVDstoreidx8 ptr idx val mem) +(FMOVSstore [off] {sym} (ADDshiftLL [2] ptr idx) val mem) && off == 0 && sym == nil => (FMOVSstoreidx4 ptr idx val mem) +(FMOVDstoreidx ptr (SLLconst [3] idx) val mem) => (FMOVDstoreidx8 ptr idx val mem) +(FMOVSstoreidx ptr (SLLconst [2] idx) val mem) => (FMOVSstoreidx4 ptr idx val mem) +(FMOVDstoreidx (SLLconst [3] idx) ptr val mem) => (FMOVDstoreidx8 ptr idx val mem) +(FMOVSstoreidx (SLLconst [2] idx) ptr val mem) => (FMOVSstoreidx4 ptr idx val mem) +(FMOVDstoreidx8 ptr (MOVDconst [c]) val mem) && is32Bit(c<<3) => (FMOVDstore [int32(c)<<3] ptr val mem) +(FMOVSstoreidx4 ptr (MOVDconst [c]) val mem) && is32Bit(c<<2) => (FMOVSstore [int32(c)<<2] ptr val mem) + +(MOVBload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem) + && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVBload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVBUload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem) + && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVBUload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVHload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem) + && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVHload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVHUload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem) + && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVHUload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVWload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem) + && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVWload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVWUload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem) + && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVWUload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVDload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem) + && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVDload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(LDP [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem) + && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (LDP [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(FMOVSload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem) + && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (FMOVSload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(FMOVDload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem) + && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (FMOVDload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) + +(MOVBstore [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val mem) + && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem) +(MOVHstore [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val mem) + && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVHstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem) +(MOVWstore [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val mem) + && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem) +(MOVDstore [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val mem) + && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVDstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem) +(STP [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val1 val2 mem) + && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (STP [off1+off2] {mergeSym(sym1,sym2)} ptr val1 val2 mem) +(FMOVSstore [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val mem) + && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (FMOVSstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem) +(FMOVDstore [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val mem) + && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (FMOVDstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem) +(MOVBstorezero [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem) + && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVBstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVHstorezero [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem) + && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVHstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVWstorezero [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem) + && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVWstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVDstorezero [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem) + && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVDstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVQstorezero [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem) + && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) + && (ptr.Op != OpSB || !config.ctxt.Flag_shared) => + (MOVQstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) + +// store zero +(MOVBstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVBstorezero [off] {sym} ptr mem) +(MOVHstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVHstorezero [off] {sym} ptr mem) +(MOVWstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVWstorezero [off] {sym} ptr mem) +(MOVDstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVDstorezero [off] {sym} ptr mem) +(STP [off] {sym} ptr (MOVDconst [0]) (MOVDconst [0]) mem) => (MOVQstorezero [off] {sym} ptr mem) + +// register indexed store zero +(MOVDstorezero [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVDstorezeroidx ptr idx mem) +(MOVWstorezero [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVWstorezeroidx ptr idx mem) +(MOVHstorezero [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVHstorezeroidx ptr idx mem) +(MOVBstorezero [off] {sym} (ADD ptr idx) mem) && off == 0 && sym == nil => (MOVBstorezeroidx ptr idx mem) +(MOVDstoreidx ptr idx (MOVDconst [0]) mem) => (MOVDstorezeroidx ptr idx mem) +(MOVWstoreidx ptr idx (MOVDconst [0]) mem) => (MOVWstorezeroidx ptr idx mem) +(MOVHstoreidx ptr idx (MOVDconst [0]) mem) => (MOVHstorezeroidx ptr idx mem) +(MOVBstoreidx ptr idx (MOVDconst [0]) mem) => (MOVBstorezeroidx ptr idx mem) +(MOVDstorezeroidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVDstorezero [int32(c)] ptr mem) +(MOVDstorezeroidx (MOVDconst [c]) idx mem) && is32Bit(c) => (MOVDstorezero [int32(c)] idx mem) +(MOVWstorezeroidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVWstorezero [int32(c)] ptr mem) +(MOVWstorezeroidx (MOVDconst [c]) idx mem) && is32Bit(c) => (MOVWstorezero [int32(c)] idx mem) +(MOVHstorezeroidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVHstorezero [int32(c)] ptr mem) +(MOVHstorezeroidx (MOVDconst [c]) idx mem) && is32Bit(c) => (MOVHstorezero [int32(c)] idx mem) +(MOVBstorezeroidx ptr (MOVDconst [c]) mem) && is32Bit(c) => (MOVBstorezero [int32(c)] ptr mem) +(MOVBstorezeroidx (MOVDconst [c]) idx mem) && is32Bit(c) => (MOVBstorezero [int32(c)] idx mem) + +// shifted register indexed store zero +(MOVDstorezero [off] {sym} (ADDshiftLL [3] ptr idx) mem) && off == 0 && sym == nil => (MOVDstorezeroidx8 ptr idx mem) +(MOVWstorezero [off] {sym} (ADDshiftLL [2] ptr idx) mem) && off == 0 && sym == nil => (MOVWstorezeroidx4 ptr idx mem) +(MOVHstorezero [off] {sym} (ADDshiftLL [1] ptr idx) mem) && off == 0 && sym == nil => (MOVHstorezeroidx2 ptr idx mem) +(MOVDstorezeroidx ptr (SLLconst [3] idx) mem) => (MOVDstorezeroidx8 ptr idx mem) +(MOVWstorezeroidx ptr (SLLconst [2] idx) mem) => (MOVWstorezeroidx4 ptr idx mem) +(MOVHstorezeroidx ptr (SLLconst [1] idx) mem) => (MOVHstorezeroidx2 ptr idx mem) +(MOVHstorezeroidx ptr (ADD idx idx) mem) => (MOVHstorezeroidx2 ptr idx mem) +(MOVDstorezeroidx (SLLconst [3] idx) ptr mem) => (MOVDstorezeroidx8 ptr idx mem) +(MOVWstorezeroidx (SLLconst [2] idx) ptr mem) => (MOVWstorezeroidx4 ptr idx mem) +(MOVHstorezeroidx (SLLconst [1] idx) ptr mem) => (MOVHstorezeroidx2 ptr idx mem) +(MOVHstorezeroidx (ADD idx idx) ptr mem) => (MOVHstorezeroidx2 ptr idx mem) +(MOVDstoreidx8 ptr idx (MOVDconst [0]) mem) => (MOVDstorezeroidx8 ptr idx mem) +(MOVWstoreidx4 ptr idx (MOVDconst [0]) mem) => (MOVWstorezeroidx4 ptr idx mem) +(MOVHstoreidx2 ptr idx (MOVDconst [0]) mem) => (MOVHstorezeroidx2 ptr idx mem) +(MOVDstorezeroidx8 ptr (MOVDconst [c]) mem) && is32Bit(c<<3) => (MOVDstorezero [int32(c<<3)] ptr mem) +(MOVWstorezeroidx4 ptr (MOVDconst [c]) mem) && is32Bit(c<<2) => (MOVWstorezero [int32(c<<2)] ptr mem) +(MOVHstorezeroidx2 ptr (MOVDconst [c]) mem) && is32Bit(c<<1) => (MOVHstorezero [int32(c<<1)] ptr mem) + +// replace load from same location as preceding store with zero/sign extension (or copy in case of full width) +// these seem to have bad interaction with other rules, resulting in slower code +//(MOVBload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVBreg x) +//(MOVBUload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVBUreg x) +//(MOVHload [off] {sym} ptr (MOVHstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVHreg x) +//(MOVHUload [off] {sym} ptr (MOVHstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVHUreg x) +//(MOVWload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVWreg x) +//(MOVWUload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVWUreg x) +//(MOVDload [off] {sym} ptr (MOVDstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> x +//(LDP [off] {sym} ptr (STP [off2] {sym2} ptr2 x y _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> x y +//(FMOVSload [off] {sym} ptr (FMOVSstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> x +//(FMOVDload [off] {sym} ptr (FMOVDstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> x + +(MOVBload [off] {sym} ptr (MOVBstorezero [off2] {sym2} ptr2 _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVDconst [0]) +(MOVBUload [off] {sym} ptr (MOVBstorezero [off2] {sym2} ptr2 _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVDconst [0]) +(MOVHload [off] {sym} ptr (MOVHstorezero [off2] {sym2} ptr2 _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVDconst [0]) +(MOVHUload [off] {sym} ptr (MOVHstorezero [off2] {sym2} ptr2 _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVDconst [0]) +(MOVWload [off] {sym} ptr (MOVWstorezero [off2] {sym2} ptr2 _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVDconst [0]) +(MOVWUload [off] {sym} ptr (MOVWstorezero [off2] {sym2} ptr2 _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVDconst [0]) +(MOVDload [off] {sym} ptr (MOVDstorezero [off2] {sym2} ptr2 _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVDconst [0]) + +(MOVBloadidx ptr idx (MOVBstorezeroidx ptr2 idx2 _)) + && (isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) || isSamePtr(ptr, idx2) && isSamePtr(idx, ptr2)) => (MOVDconst [0]) +(MOVBUloadidx ptr idx (MOVBstorezeroidx ptr2 idx2 _)) + && (isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) || isSamePtr(ptr, idx2) && isSamePtr(idx, ptr2)) => (MOVDconst [0]) +(MOVHloadidx ptr idx (MOVHstorezeroidx ptr2 idx2 _)) + && (isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) || isSamePtr(ptr, idx2) && isSamePtr(idx, ptr2)) => (MOVDconst [0]) +(MOVHUloadidx ptr idx (MOVHstorezeroidx ptr2 idx2 _)) + && (isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) || isSamePtr(ptr, idx2) && isSamePtr(idx, ptr2)) => (MOVDconst [0]) +(MOVWloadidx ptr idx (MOVWstorezeroidx ptr2 idx2 _)) + && (isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) || isSamePtr(ptr, idx2) && isSamePtr(idx, ptr2)) => (MOVDconst [0]) +(MOVWUloadidx ptr idx (MOVWstorezeroidx ptr2 idx2 _)) + && (isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) || isSamePtr(ptr, idx2) && isSamePtr(idx, ptr2)) => (MOVDconst [0]) +(MOVDloadidx ptr idx (MOVDstorezeroidx ptr2 idx2 _)) + && (isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) || isSamePtr(ptr, idx2) && isSamePtr(idx, ptr2)) => (MOVDconst [0]) + +(MOVHloadidx2 ptr idx (MOVHstorezeroidx2 ptr2 idx2 _)) && isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) => (MOVDconst [0]) +(MOVHUloadidx2 ptr idx (MOVHstorezeroidx2 ptr2 idx2 _)) && isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) => (MOVDconst [0]) +(MOVWloadidx4 ptr idx (MOVWstorezeroidx4 ptr2 idx2 _)) && isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) => (MOVDconst [0]) +(MOVWUloadidx4 ptr idx (MOVWstorezeroidx4 ptr2 idx2 _)) && isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) => (MOVDconst [0]) +(MOVDloadidx8 ptr idx (MOVDstorezeroidx8 ptr2 idx2 _)) && isSamePtr(ptr, ptr2) && isSamePtr(idx, idx2) => (MOVDconst [0]) + +// don't extend after proper load +(MOVBreg x:(MOVBload _ _)) => (MOVDreg x) +(MOVBUreg x:(MOVBUload _ _)) => (MOVDreg x) +(MOVHreg x:(MOVBload _ _)) => (MOVDreg x) +(MOVHreg x:(MOVBUload _ _)) => (MOVDreg x) +(MOVHreg x:(MOVHload _ _)) => (MOVDreg x) +(MOVHUreg x:(MOVBUload _ _)) => (MOVDreg x) +(MOVHUreg x:(MOVHUload _ _)) => (MOVDreg x) +(MOVWreg x:(MOVBload _ _)) => (MOVDreg x) +(MOVWreg x:(MOVBUload _ _)) => (MOVDreg x) +(MOVWreg x:(MOVHload _ _)) => (MOVDreg x) +(MOVWreg x:(MOVHUload _ _)) => (MOVDreg x) +(MOVWreg x:(MOVWload _ _)) => (MOVDreg x) +(MOVWUreg x:(MOVBUload _ _)) => (MOVDreg x) +(MOVWUreg x:(MOVHUload _ _)) => (MOVDreg x) +(MOVWUreg x:(MOVWUload _ _)) => (MOVDreg x) +(MOVBreg x:(MOVBloadidx _ _ _)) => (MOVDreg x) +(MOVBUreg x:(MOVBUloadidx _ _ _)) => (MOVDreg x) +(MOVHreg x:(MOVBloadidx _ _ _)) => (MOVDreg x) +(MOVHreg x:(MOVBUloadidx _ _ _)) => (MOVDreg x) +(MOVHreg x:(MOVHloadidx _ _ _)) => (MOVDreg x) +(MOVHUreg x:(MOVBUloadidx _ _ _)) => (MOVDreg x) +(MOVHUreg x:(MOVHUloadidx _ _ _)) => (MOVDreg x) +(MOVWreg x:(MOVBloadidx _ _ _)) => (MOVDreg x) +(MOVWreg x:(MOVBUloadidx _ _ _)) => (MOVDreg x) +(MOVWreg x:(MOVHloadidx _ _ _)) => (MOVDreg x) +(MOVWreg x:(MOVHUloadidx _ _ _)) => (MOVDreg x) +(MOVWreg x:(MOVWloadidx _ _ _)) => (MOVDreg x) +(MOVWUreg x:(MOVBUloadidx _ _ _)) => (MOVDreg x) +(MOVWUreg x:(MOVHUloadidx _ _ _)) => (MOVDreg x) +(MOVWUreg x:(MOVWUloadidx _ _ _)) => (MOVDreg x) +(MOVHreg x:(MOVHloadidx2 _ _ _)) => (MOVDreg x) +(MOVHUreg x:(MOVHUloadidx2 _ _ _)) => (MOVDreg x) +(MOVWreg x:(MOVHloadidx2 _ _ _)) => (MOVDreg x) +(MOVWreg x:(MOVHUloadidx2 _ _ _)) => (MOVDreg x) +(MOVWreg x:(MOVWloadidx4 _ _ _)) => (MOVDreg x) +(MOVWUreg x:(MOVHUloadidx2 _ _ _)) => (MOVDreg x) +(MOVWUreg x:(MOVWUloadidx4 _ _ _)) => (MOVDreg x) + +// fold double extensions +(MOVBreg x:(MOVBreg _)) => (MOVDreg x) +(MOVBUreg x:(MOVBUreg _)) => (MOVDreg x) +(MOVHreg x:(MOVBreg _)) => (MOVDreg x) +(MOVHreg x:(MOVBUreg _)) => (MOVDreg x) +(MOVHreg x:(MOVHreg _)) => (MOVDreg x) +(MOVHUreg x:(MOVBUreg _)) => (MOVDreg x) +(MOVHUreg x:(MOVHUreg _)) => (MOVDreg x) +(MOVWreg x:(MOVBreg _)) => (MOVDreg x) +(MOVWreg x:(MOVBUreg _)) => (MOVDreg x) +(MOVWreg x:(MOVHreg _)) => (MOVDreg x) +(MOVWreg x:(MOVWreg _)) => (MOVDreg x) +(MOVWUreg x:(MOVBUreg _)) => (MOVDreg x) +(MOVWUreg x:(MOVHUreg _)) => (MOVDreg x) +(MOVWUreg x:(MOVWUreg _)) => (MOVDreg x) + +// don't extend before store +(MOVBstore [off] {sym} ptr (MOVBreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVBUreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVHreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVWreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOVHreg x) mem) => (MOVHstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVHstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOVWreg x) mem) => (MOVHstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVHstore [off] {sym} ptr x mem) +(MOVWstore [off] {sym} ptr (MOVWreg x) mem) => (MOVWstore [off] {sym} ptr x mem) +(MOVWstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVWstore [off] {sym} ptr x mem) +(MOVBstoreidx ptr idx (MOVBreg x) mem) => (MOVBstoreidx ptr idx x mem) +(MOVBstoreidx ptr idx (MOVBUreg x) mem) => (MOVBstoreidx ptr idx x mem) +(MOVBstoreidx ptr idx (MOVHreg x) mem) => (MOVBstoreidx ptr idx x mem) +(MOVBstoreidx ptr idx (MOVHUreg x) mem) => (MOVBstoreidx ptr idx x mem) +(MOVBstoreidx ptr idx (MOVWreg x) mem) => (MOVBstoreidx ptr idx x mem) +(MOVBstoreidx ptr idx (MOVWUreg x) mem) => (MOVBstoreidx ptr idx x mem) +(MOVHstoreidx ptr idx (MOVHreg x) mem) => (MOVHstoreidx ptr idx x mem) +(MOVHstoreidx ptr idx (MOVHUreg x) mem) => (MOVHstoreidx ptr idx x mem) +(MOVHstoreidx ptr idx (MOVWreg x) mem) => (MOVHstoreidx ptr idx x mem) +(MOVHstoreidx ptr idx (MOVWUreg x) mem) => (MOVHstoreidx ptr idx x mem) +(MOVWstoreidx ptr idx (MOVWreg x) mem) => (MOVWstoreidx ptr idx x mem) +(MOVWstoreidx ptr idx (MOVWUreg x) mem) => (MOVWstoreidx ptr idx x mem) +(MOVHstoreidx2 ptr idx (MOVHreg x) mem) => (MOVHstoreidx2 ptr idx x mem) +(MOVHstoreidx2 ptr idx (MOVHUreg x) mem) => (MOVHstoreidx2 ptr idx x mem) +(MOVHstoreidx2 ptr idx (MOVWreg x) mem) => (MOVHstoreidx2 ptr idx x mem) +(MOVHstoreidx2 ptr idx (MOVWUreg x) mem) => (MOVHstoreidx2 ptr idx x mem) +(MOVWstoreidx4 ptr idx (MOVWreg x) mem) => (MOVWstoreidx4 ptr idx x mem) +(MOVWstoreidx4 ptr idx (MOVWUreg x) mem) => (MOVWstoreidx4 ptr idx x mem) + +// if a register move has only 1 use, just use the same register without emitting instruction +// MOVDnop doesn't emit instruction, only for ensuring the type. +(MOVDreg x) && x.Uses == 1 => (MOVDnop x) + +// TODO: we should be able to get rid of MOVDnop all together. +// But for now, this is enough to get rid of lots of them. +(MOVDnop (MOVDconst [c])) => (MOVDconst [c]) + +// fold constant into arithmetic ops +(ADD x (MOVDconst [c])) => (ADDconst [c] x) +(SUB x (MOVDconst [c])) => (SUBconst [c] x) +(AND x (MOVDconst [c])) => (ANDconst [c] x) +(OR x (MOVDconst [c])) => (ORconst [c] x) +(XOR x (MOVDconst [c])) => (XORconst [c] x) +(TST x (MOVDconst [c])) => (TSTconst [c] x) +(TSTW x (MOVDconst [c])) => (TSTWconst [int32(c)] x) +(CMN x (MOVDconst [c])) => (CMNconst [c] x) +(CMNW x (MOVDconst [c])) => (CMNWconst [int32(c)] x) +(BIC x (MOVDconst [c])) => (ANDconst [^c] x) +(EON x (MOVDconst [c])) => (XORconst [^c] x) +(ORN x (MOVDconst [c])) => (ORconst [^c] x) + +(SLL x (MOVDconst [c])) => (SLLconst x [c&63]) +(SRL x (MOVDconst [c])) => (SRLconst x [c&63]) +(SRA x (MOVDconst [c])) => (SRAconst x [c&63]) +(SLL x (ANDconst [63] y)) => (SLL x y) +(SRL x (ANDconst [63] y)) => (SRL x y) +(SRA x (ANDconst [63] y)) => (SRA x y) + +(CMP x (MOVDconst [c])) => (CMPconst [c] x) +(CMP (MOVDconst [c]) x) => (InvertFlags (CMPconst [c] x)) +(CMPW x (MOVDconst [c])) => (CMPWconst [int32(c)] x) +(CMPW (MOVDconst [c]) x) => (InvertFlags (CMPWconst [int32(c)] x)) + +(ROR x (MOVDconst [c])) => (RORconst x [c&63]) +(RORW x (MOVDconst [c])) => (RORWconst x [c&31]) + +(ADDSflags x (MOVDconst [c])) => (ADDSconstflags [c] x) + +(ADDconst [c] y) && c < 0 => (SUBconst [-c] y) + +// Canonicalize the order of arguments to comparisons - helps with CSE. +((CMP|CMPW) x y) && canonLessThan(x,y) => (InvertFlags ((CMP|CMPW) y x)) + +// mul-neg => mneg +(NEG (MUL x y)) => (MNEG x y) +(NEG (MULW x y)) => (MNEGW x y) +(MUL (NEG x) y) => (MNEG x y) +(MULW (NEG x) y) => (MNEGW x y) + +// madd/msub +(ADD a l:(MUL x y)) && l.Uses==1 && clobber(l) => (MADD a x y) +(SUB a l:(MUL x y)) && l.Uses==1 && clobber(l) => (MSUB a x y) +(ADD a l:(MNEG x y)) && l.Uses==1 && clobber(l) => (MSUB a x y) +(SUB a l:(MNEG x y)) && l.Uses==1 && clobber(l) => (MADD a x y) + +(ADD a l:(MULW x y)) && a.Type.Size() != 8 && l.Uses==1 && clobber(l) => (MADDW a x y) +(SUB a l:(MULW x y)) && a.Type.Size() != 8 && l.Uses==1 && clobber(l) => (MSUBW a x y) +(ADD a l:(MNEGW x y)) && a.Type.Size() != 8 && l.Uses==1 && clobber(l) => (MSUBW a x y) +(SUB a l:(MNEGW x y)) && a.Type.Size() != 8 && l.Uses==1 && clobber(l) => (MADDW a x y) + +// optimize ADCSflags, SBCSflags and friends +(ADCSflags x y (Select1 <types.TypeFlags> (ADDSconstflags [-1] (ADCzerocarry <typ.UInt64> c)))) => (ADCSflags x y c) +(ADCSflags x y (Select1 <types.TypeFlags> (ADDSconstflags [-1] (MOVDconst [0])))) => (ADDSflags x y) +(SBCSflags x y (Select1 <types.TypeFlags> (NEGSflags (NEG <typ.UInt64> (NGCzerocarry <typ.UInt64> bo))))) => (SBCSflags x y bo) +(SBCSflags x y (Select1 <types.TypeFlags> (NEGSflags (MOVDconst [0])))) => (SUBSflags x y) + +// mul by constant +(MUL x (MOVDconst [-1])) => (NEG x) +(MUL _ (MOVDconst [0])) => (MOVDconst [0]) +(MUL x (MOVDconst [1])) => x +(MUL x (MOVDconst [c])) && isPowerOfTwo64(c) => (SLLconst [log64(c)] x) +(MUL x (MOVDconst [c])) && isPowerOfTwo64(c-1) && c >= 3 => (ADDshiftLL x x [log64(c-1)]) +(MUL x (MOVDconst [c])) && isPowerOfTwo64(c+1) && c >= 7 => (ADDshiftLL (NEG <x.Type> x) x [log64(c+1)]) +(MUL x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) => (SLLconst [log64(c/3)] (ADDshiftLL <x.Type> x x [1])) +(MUL x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) => (SLLconst [log64(c/5)] (ADDshiftLL <x.Type> x x [2])) +(MUL x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) => (SLLconst [log64(c/7)] (ADDshiftLL <x.Type> (NEG <x.Type> x) x [3])) +(MUL x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) => (SLLconst [log64(c/9)] (ADDshiftLL <x.Type> x x [3])) + +(MULW x (MOVDconst [c])) && int32(c)==-1 => (NEG x) +(MULW _ (MOVDconst [c])) && int32(c)==0 => (MOVDconst [0]) +(MULW x (MOVDconst [c])) && int32(c)==1 => x +(MULW x (MOVDconst [c])) && isPowerOfTwo64(c) => (SLLconst [log64(c)] x) +(MULW x (MOVDconst [c])) && isPowerOfTwo64(c-1) && int32(c) >= 3 => (ADDshiftLL x x [log64(c-1)]) +(MULW x (MOVDconst [c])) && isPowerOfTwo64(c+1) && int32(c) >= 7 => (ADDshiftLL (NEG <x.Type> x) x [log64(c+1)]) +(MULW x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (SLLconst [log64(c/3)] (ADDshiftLL <x.Type> x x [1])) +(MULW x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (SLLconst [log64(c/5)] (ADDshiftLL <x.Type> x x [2])) +(MULW x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (SLLconst [log64(c/7)] (ADDshiftLL <x.Type> (NEG <x.Type> x) x [3])) +(MULW x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (SLLconst [log64(c/9)] (ADDshiftLL <x.Type> x x [3])) + +// mneg by constant +(MNEG x (MOVDconst [-1])) => x +(MNEG _ (MOVDconst [0])) => (MOVDconst [0]) +(MNEG x (MOVDconst [1])) => (NEG x) +(MNEG x (MOVDconst [c])) && isPowerOfTwo64(c) => (NEG (SLLconst <x.Type> [log64(c)] x)) +(MNEG x (MOVDconst [c])) && isPowerOfTwo64(c-1) && c >= 3 => (NEG (ADDshiftLL <x.Type> x x [log64(c-1)])) +(MNEG x (MOVDconst [c])) && isPowerOfTwo64(c+1) && c >= 7 => (NEG (ADDshiftLL <x.Type> (NEG <x.Type> x) x [log64(c+1)])) +(MNEG x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) => (SLLconst <x.Type> [log64(c/3)] (SUBshiftLL <x.Type> x x [2])) +(MNEG x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) => (NEG (SLLconst <x.Type> [log64(c/5)] (ADDshiftLL <x.Type> x x [2]))) +(MNEG x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) => (SLLconst <x.Type> [log64(c/7)] (SUBshiftLL <x.Type> x x [3])) +(MNEG x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) => (NEG (SLLconst <x.Type> [log64(c/9)] (ADDshiftLL <x.Type> x x [3]))) + + +(MNEGW x (MOVDconst [c])) && int32(c)==-1 => x +(MNEGW _ (MOVDconst [c])) && int32(c)==0 => (MOVDconst [0]) +(MNEGW x (MOVDconst [c])) && int32(c)==1 => (NEG x) +(MNEGW x (MOVDconst [c])) && isPowerOfTwo64(c) => (NEG (SLLconst <x.Type> [log64(c)] x)) +(MNEGW x (MOVDconst [c])) && isPowerOfTwo64(c-1) && int32(c) >= 3 => (NEG (ADDshiftLL <x.Type> x x [log64(c-1)])) +(MNEGW x (MOVDconst [c])) && isPowerOfTwo64(c+1) && int32(c) >= 7 => (NEG (ADDshiftLL <x.Type> (NEG <x.Type> x) x [log64(c+1)])) +(MNEGW x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (SLLconst <x.Type> [log64(c/3)] (SUBshiftLL <x.Type> x x [2])) +(MNEGW x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (NEG (SLLconst <x.Type> [log64(c/5)] (ADDshiftLL <x.Type> x x [2]))) +(MNEGW x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (SLLconst <x.Type> [log64(c/7)] (SUBshiftLL <x.Type> x x [3])) +(MNEGW x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (NEG (SLLconst <x.Type> [log64(c/9)] (ADDshiftLL <x.Type> x x [3]))) + + +(MADD a x (MOVDconst [-1])) => (SUB a x) +(MADD a _ (MOVDconst [0])) => a +(MADD a x (MOVDconst [1])) => (ADD a x) +(MADD a x (MOVDconst [c])) && isPowerOfTwo64(c) => (ADDshiftLL a x [log64(c)]) +(MADD a x (MOVDconst [c])) && isPowerOfTwo64(c-1) && c>=3 => (ADD a (ADDshiftLL <x.Type> x x [log64(c-1)])) +(MADD a x (MOVDconst [c])) && isPowerOfTwo64(c+1) && c>=7 => (SUB a (SUBshiftLL <x.Type> x x [log64(c+1)])) +(MADD a x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)]) +(MADD a x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)]) +(MADD a x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)]) +(MADD a x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)]) + +(MADD a (MOVDconst [-1]) x) => (SUB a x) +(MADD a (MOVDconst [0]) _) => a +(MADD a (MOVDconst [1]) x) => (ADD a x) +(MADD a (MOVDconst [c]) x) && isPowerOfTwo64(c) => (ADDshiftLL a x [log64(c)]) +(MADD a (MOVDconst [c]) x) && isPowerOfTwo64(c-1) && c>=3 => (ADD a (ADDshiftLL <x.Type> x x [log64(c-1)])) +(MADD a (MOVDconst [c]) x) && isPowerOfTwo64(c+1) && c>=7 => (SUB a (SUBshiftLL <x.Type> x x [log64(c+1)])) +(MADD a (MOVDconst [c]) x) && c%3 == 0 && isPowerOfTwo64(c/3) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)]) +(MADD a (MOVDconst [c]) x) && c%5 == 0 && isPowerOfTwo64(c/5) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)]) +(MADD a (MOVDconst [c]) x) && c%7 == 0 && isPowerOfTwo64(c/7) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)]) +(MADD a (MOVDconst [c]) x) && c%9 == 0 && isPowerOfTwo64(c/9) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)]) + +(MADDW a x (MOVDconst [c])) && int32(c)==-1 => (SUB a x) +(MADDW a _ (MOVDconst [c])) && int32(c)==0 => a +(MADDW a x (MOVDconst [c])) && int32(c)==1 => (ADD a x) +(MADDW a x (MOVDconst [c])) && isPowerOfTwo64(c) => (ADDshiftLL a x [log64(c)]) +(MADDW a x (MOVDconst [c])) && isPowerOfTwo64(c-1) && int32(c)>=3 => (ADD a (ADDshiftLL <x.Type> x x [log64(c-1)])) +(MADDW a x (MOVDconst [c])) && isPowerOfTwo64(c+1) && int32(c)>=7 => (SUB a (SUBshiftLL <x.Type> x x [log64(c+1)])) +(MADDW a x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)]) +(MADDW a x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)]) +(MADDW a x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)]) +(MADDW a x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)]) + +(MADDW a (MOVDconst [c]) x) && int32(c)==-1 => (SUB a x) +(MADDW a (MOVDconst [c]) _) && int32(c)==0 => a +(MADDW a (MOVDconst [c]) x) && int32(c)==1 => (ADD a x) +(MADDW a (MOVDconst [c]) x) && isPowerOfTwo64(c) => (ADDshiftLL a x [log64(c)]) +(MADDW a (MOVDconst [c]) x) && isPowerOfTwo64(c-1) && int32(c)>=3 => (ADD a (ADDshiftLL <x.Type> x x [log64(c-1)])) +(MADDW a (MOVDconst [c]) x) && isPowerOfTwo64(c+1) && int32(c)>=7 => (SUB a (SUBshiftLL <x.Type> x x [log64(c+1)])) +(MADDW a (MOVDconst [c]) x) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)]) +(MADDW a (MOVDconst [c]) x) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)]) +(MADDW a (MOVDconst [c]) x) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)]) +(MADDW a (MOVDconst [c]) x) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)]) + +(MSUB a x (MOVDconst [-1])) => (ADD a x) +(MSUB a _ (MOVDconst [0])) => a +(MSUB a x (MOVDconst [1])) => (SUB a x) +(MSUB a x (MOVDconst [c])) && isPowerOfTwo64(c) => (SUBshiftLL a x [log64(c)]) +(MSUB a x (MOVDconst [c])) && isPowerOfTwo64(c-1) && c>=3 => (SUB a (ADDshiftLL <x.Type> x x [log64(c-1)])) +(MSUB a x (MOVDconst [c])) && isPowerOfTwo64(c+1) && c>=7 => (ADD a (SUBshiftLL <x.Type> x x [log64(c+1)])) +(MSUB a x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)]) +(MSUB a x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)]) +(MSUB a x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)]) +(MSUB a x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)]) + +(MSUB a (MOVDconst [-1]) x) => (ADD a x) +(MSUB a (MOVDconst [0]) _) => a +(MSUB a (MOVDconst [1]) x) => (SUB a x) +(MSUB a (MOVDconst [c]) x) && isPowerOfTwo64(c) => (SUBshiftLL a x [log64(c)]) +(MSUB a (MOVDconst [c]) x) && isPowerOfTwo64(c-1) && c>=3 => (SUB a (ADDshiftLL <x.Type> x x [log64(c-1)])) +(MSUB a (MOVDconst [c]) x) && isPowerOfTwo64(c+1) && c>=7 => (ADD a (SUBshiftLL <x.Type> x x [log64(c+1)])) +(MSUB a (MOVDconst [c]) x) && c%3 == 0 && isPowerOfTwo64(c/3) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)]) +(MSUB a (MOVDconst [c]) x) && c%5 == 0 && isPowerOfTwo64(c/5) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)]) +(MSUB a (MOVDconst [c]) x) && c%7 == 0 && isPowerOfTwo64(c/7) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)]) +(MSUB a (MOVDconst [c]) x) && c%9 == 0 && isPowerOfTwo64(c/9) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)]) + +(MSUBW a x (MOVDconst [c])) && int32(c)==-1 => (ADD a x) +(MSUBW a _ (MOVDconst [c])) && int32(c)==0 => a +(MSUBW a x (MOVDconst [c])) && int32(c)==1 => (SUB a x) +(MSUBW a x (MOVDconst [c])) && isPowerOfTwo64(c) => (SUBshiftLL a x [log64(c)]) +(MSUBW a x (MOVDconst [c])) && isPowerOfTwo64(c-1) && int32(c)>=3 => (SUB a (ADDshiftLL <x.Type> x x [log64(c-1)])) +(MSUBW a x (MOVDconst [c])) && isPowerOfTwo64(c+1) && int32(c)>=7 => (ADD a (SUBshiftLL <x.Type> x x [log64(c+1)])) +(MSUBW a x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)]) +(MSUBW a x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)]) +(MSUBW a x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)]) +(MSUBW a x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)]) + +(MSUBW a (MOVDconst [c]) x) && int32(c)==-1 => (ADD a x) +(MSUBW a (MOVDconst [c]) _) && int32(c)==0 => a +(MSUBW a (MOVDconst [c]) x) && int32(c)==1 => (SUB a x) +(MSUBW a (MOVDconst [c]) x) && isPowerOfTwo64(c) => (SUBshiftLL a x [log64(c)]) +(MSUBW a (MOVDconst [c]) x) && isPowerOfTwo64(c-1) && int32(c)>=3 => (SUB a (ADDshiftLL <x.Type> x x [log64(c-1)])) +(MSUBW a (MOVDconst [c]) x) && isPowerOfTwo64(c+1) && int32(c)>=7 => (ADD a (SUBshiftLL <x.Type> x x [log64(c+1)])) +(MSUBW a (MOVDconst [c]) x) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [2]) [log64(c/3)]) +(MSUBW a (MOVDconst [c]) x) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [2]) [log64(c/5)]) +(MSUBW a (MOVDconst [c]) x) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [3]) [log64(c/7)]) +(MSUBW a (MOVDconst [c]) x) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [3]) [log64(c/9)]) + +// div by constant +(UDIV x (MOVDconst [1])) => x +(UDIV x (MOVDconst [c])) && isPowerOfTwo64(c) => (SRLconst [log64(c)] x) +(UDIVW x (MOVDconst [c])) && uint32(c)==1 => x +(UDIVW x (MOVDconst [c])) && isPowerOfTwo64(c) && is32Bit(c) => (SRLconst [log64(c)] x) +(UMOD _ (MOVDconst [1])) => (MOVDconst [0]) +(UMOD x (MOVDconst [c])) && isPowerOfTwo64(c) => (ANDconst [c-1] x) +(UMODW _ (MOVDconst [c])) && uint32(c)==1 => (MOVDconst [0]) +(UMODW x (MOVDconst [c])) && isPowerOfTwo64(c) && is32Bit(c) => (ANDconst [c-1] x) + +// generic simplifications +(ADD x (NEG y)) => (SUB x y) +(SUB x x) => (MOVDconst [0]) +(AND x x) => x +(OR x x) => x +(XOR x x) => (MOVDconst [0]) +(BIC x x) => (MOVDconst [0]) +(EON x x) => (MOVDconst [-1]) +(ORN x x) => (MOVDconst [-1]) +(AND x (MVN y)) => (BIC x y) +(XOR x (MVN y)) => (EON x y) +(OR x (MVN y)) => (ORN x y) +(MVN (XOR x y)) => (EON x y) +(NEG (NEG x)) => x + +(CSEL [cc] (MOVDconst [-1]) (MOVDconst [0]) flag) => (CSETM [cc] flag) +(CSEL [cc] (MOVDconst [0]) (MOVDconst [-1]) flag) => (CSETM [arm64Negate(cc)] flag) +(CSEL [cc] x (MOVDconst [0]) flag) => (CSEL0 [cc] x flag) +(CSEL [cc] (MOVDconst [0]) y flag) => (CSEL0 [arm64Negate(cc)] y flag) +(CSEL [cc] x (ADDconst [1] a) flag) => (CSINC [cc] x a flag) +(CSEL [cc] (ADDconst [1] a) x flag) => (CSINC [arm64Negate(cc)] x a flag) +(CSEL [cc] x (MVN a) flag) => (CSINV [cc] x a flag) +(CSEL [cc] (MVN a) x flag) => (CSINV [arm64Negate(cc)] x a flag) +(CSEL [cc] x (NEG a) flag) => (CSNEG [cc] x a flag) +(CSEL [cc] (NEG a) x flag) => (CSNEG [arm64Negate(cc)] x a flag) + +(SUB x (SUB y z)) => (SUB (ADD <v.Type> x z) y) +(SUB (SUB x y) z) => (SUB x (ADD <y.Type> y z)) + +// remove redundant *const ops +(ADDconst [0] x) => x +(SUBconst [0] x) => x +(ANDconst [0] _) => (MOVDconst [0]) +(ANDconst [-1] x) => x +(ORconst [0] x) => x +(ORconst [-1] _) => (MOVDconst [-1]) +(XORconst [0] x) => x +(XORconst [-1] x) => (MVN x) + +// generic constant folding +(ADDconst [c] (MOVDconst [d])) => (MOVDconst [c+d]) +(ADDconst [c] (ADDconst [d] x)) => (ADDconst [c+d] x) +(ADDconst [c] (SUBconst [d] x)) => (ADDconst [c-d] x) +(SUBconst [c] (MOVDconst [d])) => (MOVDconst [d-c]) +(SUBconst [c] (SUBconst [d] x)) => (ADDconst [-c-d] x) +(SUBconst [c] (ADDconst [d] x)) => (ADDconst [-c+d] x) +(SLLconst [c] (MOVDconst [d])) => (MOVDconst [d<<uint64(c)]) +(SRLconst [c] (MOVDconst [d])) => (MOVDconst [int64(uint64(d)>>uint64(c))]) +(SRAconst [c] (MOVDconst [d])) => (MOVDconst [d>>uint64(c)]) +(MUL (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c*d]) +(MULW (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [int64(int32(c)*int32(d))]) +(MNEG (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [-c*d]) +(MNEGW (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [-int64(int32(c)*int32(d))]) +(MADD (MOVDconst [c]) x y) => (ADDconst [c] (MUL <x.Type> x y)) +(MADDW (MOVDconst [c]) x y) => (ADDconst [c] (MULW <x.Type> x y)) +(MSUB (MOVDconst [c]) x y) => (ADDconst [c] (MNEG <x.Type> x y)) +(MSUBW (MOVDconst [c]) x y) => (ADDconst [c] (MNEGW <x.Type> x y)) +(MADD a (MOVDconst [c]) (MOVDconst [d])) => (ADDconst [c*d] a) +(MADDW a (MOVDconst [c]) (MOVDconst [d])) => (ADDconst [int64(int32(c)*int32(d))] a) +(MSUB a (MOVDconst [c]) (MOVDconst [d])) => (SUBconst [c*d] a) +(MSUBW a (MOVDconst [c]) (MOVDconst [d])) => (SUBconst [int64(int32(c)*int32(d))] a) +(DIV (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [c/d]) +(UDIV (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [int64(uint64(c)/uint64(d))]) +(DIVW (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [int64(int32(c)/int32(d))]) +(UDIVW (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [int64(uint32(c)/uint32(d))]) +(MOD (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [c%d]) +(UMOD (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [int64(uint64(c)%uint64(d))]) +(MODW (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [int64(int32(c)%int32(d))]) +(UMODW (MOVDconst [c]) (MOVDconst [d])) && d != 0 => (MOVDconst [int64(uint32(c)%uint32(d))]) +(ANDconst [c] (MOVDconst [d])) => (MOVDconst [c&d]) +(ANDconst [c] (ANDconst [d] x)) => (ANDconst [c&d] x) +(ANDconst [c] (MOVWUreg x)) => (ANDconst [c&(1<<32-1)] x) +(ANDconst [c] (MOVHUreg x)) => (ANDconst [c&(1<<16-1)] x) +(ANDconst [c] (MOVBUreg x)) => (ANDconst [c&(1<<8-1)] x) +(MOVWUreg (ANDconst [c] x)) => (ANDconst [c&(1<<32-1)] x) +(MOVHUreg (ANDconst [c] x)) => (ANDconst [c&(1<<16-1)] x) +(MOVBUreg (ANDconst [c] x)) => (ANDconst [c&(1<<8-1)] x) +(ORconst [c] (MOVDconst [d])) => (MOVDconst [c|d]) +(ORconst [c] (ORconst [d] x)) => (ORconst [c|d] x) +(XORconst [c] (MOVDconst [d])) => (MOVDconst [c^d]) +(XORconst [c] (XORconst [d] x)) => (XORconst [c^d] x) +(MVN (MOVDconst [c])) => (MOVDconst [^c]) +(NEG (MOVDconst [c])) => (MOVDconst [-c]) +(MOVBreg (MOVDconst [c])) => (MOVDconst [int64(int8(c))]) +(MOVBUreg (MOVDconst [c])) => (MOVDconst [int64(uint8(c))]) +(MOVHreg (MOVDconst [c])) => (MOVDconst [int64(int16(c))]) +(MOVHUreg (MOVDconst [c])) => (MOVDconst [int64(uint16(c))]) +(MOVWreg (MOVDconst [c])) => (MOVDconst [int64(int32(c))]) +(MOVWUreg (MOVDconst [c])) => (MOVDconst [int64(uint32(c))]) +(MOVDreg (MOVDconst [c])) => (MOVDconst [c]) + +// constant comparisons +(CMPconst (MOVDconst [x]) [y]) => (FlagConstant [subFlags64(x,y)]) +(CMPWconst (MOVDconst [x]) [y]) => (FlagConstant [subFlags32(int32(x),y)]) +(TSTconst (MOVDconst [x]) [y]) => (FlagConstant [logicFlags64(x&y)]) +(TSTWconst (MOVDconst [x]) [y]) => (FlagConstant [logicFlags32(int32(x)&y)]) +(CMNconst (MOVDconst [x]) [y]) => (FlagConstant [addFlags64(x,y)]) +(CMNWconst (MOVDconst [x]) [y]) => (FlagConstant [addFlags32(int32(x),y)]) + +// other known comparisons +(CMPconst (MOVBUreg _) [c]) && 0xff < c => (FlagConstant [subFlags64(0,1)]) +(CMPconst (MOVHUreg _) [c]) && 0xffff < c => (FlagConstant [subFlags64(0,1)]) +(CMPconst (MOVWUreg _) [c]) && 0xffffffff < c => (FlagConstant [subFlags64(0,1)]) +(CMPconst (ANDconst _ [m]) [n]) && 0 <= m && m < n => (FlagConstant [subFlags64(0,1)]) +(CMPconst (SRLconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 63 && (1<<uint64(64-c)) <= uint64(n) => (FlagConstant [subFlags64(0,1)]) +(CMPWconst (MOVBUreg _) [c]) && 0xff < c => (FlagConstant [subFlags64(0,1)]) +(CMPWconst (MOVHUreg _) [c]) && 0xffff < c => (FlagConstant [subFlags64(0,1)]) + +// absorb flag constants into branches +(EQ (FlagConstant [fc]) yes no) && fc.eq() => (First yes no) +(EQ (FlagConstant [fc]) yes no) && !fc.eq() => (First no yes) + +(NE (FlagConstant [fc]) yes no) && fc.ne() => (First yes no) +(NE (FlagConstant [fc]) yes no) && !fc.ne() => (First no yes) + +(LT (FlagConstant [fc]) yes no) && fc.lt() => (First yes no) +(LT (FlagConstant [fc]) yes no) && !fc.lt() => (First no yes) + +(LE (FlagConstant [fc]) yes no) && fc.le() => (First yes no) +(LE (FlagConstant [fc]) yes no) && !fc.le() => (First no yes) + +(GT (FlagConstant [fc]) yes no) && fc.gt() => (First yes no) +(GT (FlagConstant [fc]) yes no) && !fc.gt() => (First no yes) + +(GE (FlagConstant [fc]) yes no) && fc.ge() => (First yes no) +(GE (FlagConstant [fc]) yes no) && !fc.ge() => (First no yes) + +(ULT (FlagConstant [fc]) yes no) && fc.ult() => (First yes no) +(ULT (FlagConstant [fc]) yes no) && !fc.ult() => (First no yes) + +(ULE (FlagConstant [fc]) yes no) && fc.ule() => (First yes no) +(ULE (FlagConstant [fc]) yes no) && !fc.ule() => (First no yes) + +(UGT (FlagConstant [fc]) yes no) && fc.ugt() => (First yes no) +(UGT (FlagConstant [fc]) yes no) && !fc.ugt() => (First no yes) + +(UGE (FlagConstant [fc]) yes no) && fc.uge() => (First yes no) +(UGE (FlagConstant [fc]) yes no) && !fc.uge() => (First no yes) + +(LTnoov (FlagConstant [fc]) yes no) && fc.ltNoov() => (First yes no) +(LTnoov (FlagConstant [fc]) yes no) && !fc.ltNoov() => (First no yes) + +(LEnoov (FlagConstant [fc]) yes no) && fc.leNoov() => (First yes no) +(LEnoov (FlagConstant [fc]) yes no) && !fc.leNoov() => (First no yes) + +(GTnoov (FlagConstant [fc]) yes no) && fc.gtNoov() => (First yes no) +(GTnoov (FlagConstant [fc]) yes no) && !fc.gtNoov() => (First no yes) + +(GEnoov (FlagConstant [fc]) yes no) && fc.geNoov() => (First yes no) +(GEnoov (FlagConstant [fc]) yes no) && !fc.geNoov() => (First no yes) + +(Z (MOVDconst [0]) yes no) => (First yes no) +(Z (MOVDconst [c]) yes no) && c != 0 => (First no yes) +(NZ (MOVDconst [0]) yes no) => (First no yes) +(NZ (MOVDconst [c]) yes no) && c != 0 => (First yes no) +(ZW (MOVDconst [c]) yes no) && int32(c) == 0 => (First yes no) +(ZW (MOVDconst [c]) yes no) && int32(c) != 0 => (First no yes) +(NZW (MOVDconst [c]) yes no) && int32(c) == 0 => (First no yes) +(NZW (MOVDconst [c]) yes no) && int32(c) != 0 => (First yes no) + +// absorb InvertFlags into branches +(LT (InvertFlags cmp) yes no) => (GT cmp yes no) +(GT (InvertFlags cmp) yes no) => (LT cmp yes no) +(LE (InvertFlags cmp) yes no) => (GE cmp yes no) +(GE (InvertFlags cmp) yes no) => (LE cmp yes no) +(ULT (InvertFlags cmp) yes no) => (UGT cmp yes no) +(UGT (InvertFlags cmp) yes no) => (ULT cmp yes no) +(ULE (InvertFlags cmp) yes no) => (UGE cmp yes no) +(UGE (InvertFlags cmp) yes no) => (ULE cmp yes no) +(EQ (InvertFlags cmp) yes no) => (EQ cmp yes no) +(NE (InvertFlags cmp) yes no) => (NE cmp yes no) +(FLT (InvertFlags cmp) yes no) => (FGT cmp yes no) +(FGT (InvertFlags cmp) yes no) => (FLT cmp yes no) +(FLE (InvertFlags cmp) yes no) => (FGE cmp yes no) +(FGE (InvertFlags cmp) yes no) => (FLE cmp yes no) +(LTnoov (InvertFlags cmp) yes no) => (GTnoov cmp yes no) +(GEnoov (InvertFlags cmp) yes no) => (LEnoov cmp yes no) +(LEnoov (InvertFlags cmp) yes no) => (GEnoov cmp yes no) +(GTnoov (InvertFlags cmp) yes no) => (LTnoov cmp yes no) + +// absorb InvertFlags into conditional instructions +(CSEL [cc] x y (InvertFlags cmp)) => (CSEL [arm64Invert(cc)] x y cmp) +(CSEL0 [cc] x (InvertFlags cmp)) => (CSEL0 [arm64Invert(cc)] x cmp) +(CSETM [cc] (InvertFlags cmp)) => (CSETM [arm64Invert(cc)] cmp) +(CSINC [cc] x y (InvertFlags cmp)) => (CSINC [arm64Invert(cc)] x y cmp) +(CSINV [cc] x y (InvertFlags cmp)) => (CSINV [arm64Invert(cc)] x y cmp) +(CSNEG [cc] x y (InvertFlags cmp)) => (CSNEG [arm64Invert(cc)] x y cmp) + +// absorb flag constants into boolean values +(Equal (FlagConstant [fc])) => (MOVDconst [b2i(fc.eq())]) +(NotEqual (FlagConstant [fc])) => (MOVDconst [b2i(fc.ne())]) +(LessThan (FlagConstant [fc])) => (MOVDconst [b2i(fc.lt())]) +(LessThanU (FlagConstant [fc])) => (MOVDconst [b2i(fc.ult())]) +(LessEqual (FlagConstant [fc])) => (MOVDconst [b2i(fc.le())]) +(LessEqualU (FlagConstant [fc])) => (MOVDconst [b2i(fc.ule())]) +(GreaterThan (FlagConstant [fc])) => (MOVDconst [b2i(fc.gt())]) +(GreaterThanU (FlagConstant [fc])) => (MOVDconst [b2i(fc.ugt())]) +(GreaterEqual (FlagConstant [fc])) => (MOVDconst [b2i(fc.ge())]) +(GreaterEqualU (FlagConstant [fc])) => (MOVDconst [b2i(fc.uge())]) + +// absorb InvertFlags into boolean values +(Equal (InvertFlags x)) => (Equal x) +(NotEqual (InvertFlags x)) => (NotEqual x) +(LessThan (InvertFlags x)) => (GreaterThan x) +(LessThanU (InvertFlags x)) => (GreaterThanU x) +(GreaterThan (InvertFlags x)) => (LessThan x) +(GreaterThanU (InvertFlags x)) => (LessThanU x) +(LessEqual (InvertFlags x)) => (GreaterEqual x) +(LessEqualU (InvertFlags x)) => (GreaterEqualU x) +(GreaterEqual (InvertFlags x)) => (LessEqual x) +(GreaterEqualU (InvertFlags x)) => (LessEqualU x) +(LessThanF (InvertFlags x)) => (GreaterThanF x) +(LessEqualF (InvertFlags x)) => (GreaterEqualF x) +(GreaterThanF (InvertFlags x)) => (LessThanF x) +(GreaterEqualF (InvertFlags x)) => (LessEqualF x) + +// Boolean-generating instructions (NOTE: NOT all boolean Values) always +// zero upper bit of the register; no need to zero-extend +(MOVBUreg x:((Equal|NotEqual|LessThan|LessThanU|LessThanF|LessEqual|LessEqualU|LessEqualF|GreaterThan|GreaterThanU|GreaterThanF|GreaterEqual|GreaterEqualU|GreaterEqualF) _)) => (MOVDreg x) + +// absorb flag constants into conditional instructions +(CSEL [cc] x _ flag) && ccARM64Eval(cc, flag) > 0 => x +(CSEL [cc] _ y flag) && ccARM64Eval(cc, flag) < 0 => y +(CSEL0 [cc] x flag) && ccARM64Eval(cc, flag) > 0 => x +(CSEL0 [cc] _ flag) && ccARM64Eval(cc, flag) < 0 => (MOVDconst [0]) +(CSNEG [cc] x _ flag) && ccARM64Eval(cc, flag) > 0 => x +(CSNEG [cc] _ y flag) && ccARM64Eval(cc, flag) < 0 => (NEG y) +(CSINV [cc] x _ flag) && ccARM64Eval(cc, flag) > 0 => x +(CSINV [cc] _ y flag) && ccARM64Eval(cc, flag) < 0 => (Not y) +(CSINC [cc] x _ flag) && ccARM64Eval(cc, flag) > 0 => x +(CSINC [cc] _ y flag) && ccARM64Eval(cc, flag) < 0 => (ADDconst [1] y) +(CSETM [cc] flag) && ccARM64Eval(cc, flag) > 0 => (MOVDconst [-1]) +(CSETM [cc] flag) && ccARM64Eval(cc, flag) < 0 => (MOVDconst [0]) + +// absorb flags back into boolean CSEL +(CSEL [cc] x y (CMPWconst [0] boolval)) && cc == OpARM64NotEqual && flagArg(boolval) != nil => + (CSEL [boolval.Op] x y flagArg(boolval)) +(CSEL [cc] x y (CMPWconst [0] boolval)) && cc == OpARM64Equal && flagArg(boolval) != nil => + (CSEL [arm64Negate(boolval.Op)] x y flagArg(boolval)) +(CSEL0 [cc] x (CMPWconst [0] boolval)) && cc == OpARM64NotEqual && flagArg(boolval) != nil => + (CSEL0 [boolval.Op] x flagArg(boolval)) +(CSEL0 [cc] x (CMPWconst [0] boolval)) && cc == OpARM64Equal && flagArg(boolval) != nil => + (CSEL0 [arm64Negate(boolval.Op)] x flagArg(boolval)) + +// absorb shifts into ops +(NEG x:(SLLconst [c] y)) && clobberIfDead(x) => (NEGshiftLL [c] y) +(NEG x:(SRLconst [c] y)) && clobberIfDead(x) => (NEGshiftRL [c] y) +(NEG x:(SRAconst [c] y)) && clobberIfDead(x) => (NEGshiftRA [c] y) +(MVN x:(SLLconst [c] y)) && clobberIfDead(x) => (MVNshiftLL [c] y) +(MVN x:(SRLconst [c] y)) && clobberIfDead(x) => (MVNshiftRL [c] y) +(MVN x:(SRAconst [c] y)) && clobberIfDead(x) => (MVNshiftRA [c] y) +(MVN x:(RORconst [c] y)) && clobberIfDead(x) => (MVNshiftRO [c] y) +(ADD x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (ADDshiftLL x0 y [c]) +(ADD x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (ADDshiftRL x0 y [c]) +(ADD x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (ADDshiftRA x0 y [c]) +(SUB x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (SUBshiftLL x0 y [c]) +(SUB x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (SUBshiftRL x0 y [c]) +(SUB x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (SUBshiftRA x0 y [c]) +(AND x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (ANDshiftLL x0 y [c]) +(AND x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (ANDshiftRL x0 y [c]) +(AND x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (ANDshiftRA x0 y [c]) +(AND x0 x1:(RORconst [c] y)) && clobberIfDead(x1) => (ANDshiftRO x0 y [c]) +(OR x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (ORshiftLL x0 y [c]) // useful for combined load +(OR x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (ORshiftRL x0 y [c]) +(OR x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (ORshiftRA x0 y [c]) +(OR x0 x1:(RORconst [c] y)) && clobberIfDead(x1) => (ORshiftRO x0 y [c]) +(XOR x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (XORshiftLL x0 y [c]) +(XOR x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (XORshiftRL x0 y [c]) +(XOR x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (XORshiftRA x0 y [c]) +(XOR x0 x1:(RORconst [c] y)) && clobberIfDead(x1) => (XORshiftRO x0 y [c]) +(BIC x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (BICshiftLL x0 y [c]) +(BIC x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (BICshiftRL x0 y [c]) +(BIC x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (BICshiftRA x0 y [c]) +(BIC x0 x1:(RORconst [c] y)) && clobberIfDead(x1) => (BICshiftRO x0 y [c]) +(ORN x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (ORNshiftLL x0 y [c]) +(ORN x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (ORNshiftRL x0 y [c]) +(ORN x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (ORNshiftRA x0 y [c]) +(ORN x0 x1:(RORconst [c] y)) && clobberIfDead(x1) => (ORNshiftRO x0 y [c]) +(EON x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (EONshiftLL x0 y [c]) +(EON x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (EONshiftRL x0 y [c]) +(EON x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (EONshiftRA x0 y [c]) +(EON x0 x1:(RORconst [c] y)) && clobberIfDead(x1) => (EONshiftRO x0 y [c]) +(CMP x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (CMPshiftLL x0 y [c]) +(CMP x0:(SLLconst [c] y) x1) && clobberIfDead(x0) => (InvertFlags (CMPshiftLL x1 y [c])) +(CMP x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (CMPshiftRL x0 y [c]) +(CMP x0:(SRLconst [c] y) x1) && clobberIfDead(x0) => (InvertFlags (CMPshiftRL x1 y [c])) +(CMP x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (CMPshiftRA x0 y [c]) +(CMP x0:(SRAconst [c] y) x1) && clobberIfDead(x0) => (InvertFlags (CMPshiftRA x1 y [c])) +(CMN x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (CMNshiftLL x0 y [c]) +(CMN x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (CMNshiftRL x0 y [c]) +(CMN x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (CMNshiftRA x0 y [c]) +(TST x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) => (TSTshiftLL x0 y [c]) +(TST x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) => (TSTshiftRL x0 y [c]) +(TST x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) => (TSTshiftRA x0 y [c]) +(TST x0 x1:(RORconst [c] y)) && clobberIfDead(x1) => (TSTshiftRO x0 y [c]) + +// prefer *const ops to *shift ops +(ADDshiftLL (MOVDconst [c]) x [d]) => (ADDconst [c] (SLLconst <x.Type> x [d])) +(ADDshiftRL (MOVDconst [c]) x [d]) => (ADDconst [c] (SRLconst <x.Type> x [d])) +(ADDshiftRA (MOVDconst [c]) x [d]) => (ADDconst [c] (SRAconst <x.Type> x [d])) +(ANDshiftLL (MOVDconst [c]) x [d]) => (ANDconst [c] (SLLconst <x.Type> x [d])) +(ANDshiftRL (MOVDconst [c]) x [d]) => (ANDconst [c] (SRLconst <x.Type> x [d])) +(ANDshiftRA (MOVDconst [c]) x [d]) => (ANDconst [c] (SRAconst <x.Type> x [d])) +(ANDshiftRO (MOVDconst [c]) x [d]) => (ANDconst [c] (RORconst <x.Type> x [d])) +(ORshiftLL (MOVDconst [c]) x [d]) => (ORconst [c] (SLLconst <x.Type> x [d])) +(ORshiftRL (MOVDconst [c]) x [d]) => (ORconst [c] (SRLconst <x.Type> x [d])) +(ORshiftRA (MOVDconst [c]) x [d]) => (ORconst [c] (SRAconst <x.Type> x [d])) +(ORshiftRO (MOVDconst [c]) x [d]) => (ORconst [c] (RORconst <x.Type> x [d])) +(XORshiftLL (MOVDconst [c]) x [d]) => (XORconst [c] (SLLconst <x.Type> x [d])) +(XORshiftRL (MOVDconst [c]) x [d]) => (XORconst [c] (SRLconst <x.Type> x [d])) +(XORshiftRA (MOVDconst [c]) x [d]) => (XORconst [c] (SRAconst <x.Type> x [d])) +(XORshiftRO (MOVDconst [c]) x [d]) => (XORconst [c] (RORconst <x.Type> x [d])) +(CMPshiftLL (MOVDconst [c]) x [d]) => (InvertFlags (CMPconst [c] (SLLconst <x.Type> x [d]))) +(CMPshiftRL (MOVDconst [c]) x [d]) => (InvertFlags (CMPconst [c] (SRLconst <x.Type> x [d]))) +(CMPshiftRA (MOVDconst [c]) x [d]) => (InvertFlags (CMPconst [c] (SRAconst <x.Type> x [d]))) +(CMNshiftLL (MOVDconst [c]) x [d]) => (CMNconst [c] (SLLconst <x.Type> x [d])) +(CMNshiftRL (MOVDconst [c]) x [d]) => (CMNconst [c] (SRLconst <x.Type> x [d])) +(CMNshiftRA (MOVDconst [c]) x [d]) => (CMNconst [c] (SRAconst <x.Type> x [d])) +(TSTshiftLL (MOVDconst [c]) x [d]) => (TSTconst [c] (SLLconst <x.Type> x [d])) +(TSTshiftRL (MOVDconst [c]) x [d]) => (TSTconst [c] (SRLconst <x.Type> x [d])) +(TSTshiftRA (MOVDconst [c]) x [d]) => (TSTconst [c] (SRAconst <x.Type> x [d])) +(TSTshiftRO (MOVDconst [c]) x [d]) => (TSTconst [c] (RORconst <x.Type> x [d])) + +// constant folding in *shift ops +(MVNshiftLL (MOVDconst [c]) [d]) => (MOVDconst [^int64(uint64(c)<<uint64(d))]) +(MVNshiftRL (MOVDconst [c]) [d]) => (MOVDconst [^int64(uint64(c)>>uint64(d))]) +(MVNshiftRA (MOVDconst [c]) [d]) => (MOVDconst [^(c>>uint64(d))]) +(MVNshiftRO (MOVDconst [c]) [d]) => (MOVDconst [^rotateRight64(c, d)]) +(NEGshiftLL (MOVDconst [c]) [d]) => (MOVDconst [-int64(uint64(c)<<uint64(d))]) +(NEGshiftRL (MOVDconst [c]) [d]) => (MOVDconst [-int64(uint64(c)>>uint64(d))]) +(NEGshiftRA (MOVDconst [c]) [d]) => (MOVDconst [-(c>>uint64(d))]) +(ADDshiftLL x (MOVDconst [c]) [d]) => (ADDconst x [int64(uint64(c)<<uint64(d))]) +(ADDshiftRL x (MOVDconst [c]) [d]) => (ADDconst x [int64(uint64(c)>>uint64(d))]) +(ADDshiftRA x (MOVDconst [c]) [d]) => (ADDconst x [c>>uint64(d)]) +(SUBshiftLL x (MOVDconst [c]) [d]) => (SUBconst x [int64(uint64(c)<<uint64(d))]) +(SUBshiftRL x (MOVDconst [c]) [d]) => (SUBconst x [int64(uint64(c)>>uint64(d))]) +(SUBshiftRA x (MOVDconst [c]) [d]) => (SUBconst x [c>>uint64(d)]) +(ANDshiftLL x (MOVDconst [c]) [d]) => (ANDconst x [int64(uint64(c)<<uint64(d))]) +(ANDshiftRL x (MOVDconst [c]) [d]) => (ANDconst x [int64(uint64(c)>>uint64(d))]) +(ANDshiftRA x (MOVDconst [c]) [d]) => (ANDconst x [c>>uint64(d)]) +(ANDshiftRO x (MOVDconst [c]) [d]) => (ANDconst x [rotateRight64(c, d)]) +(ORshiftLL x (MOVDconst [c]) [d]) => (ORconst x [int64(uint64(c)<<uint64(d))]) +(ORshiftRL x (MOVDconst [c]) [d]) => (ORconst x [int64(uint64(c)>>uint64(d))]) +(ORshiftRA x (MOVDconst [c]) [d]) => (ORconst x [c>>uint64(d)]) +(ORshiftRO x (MOVDconst [c]) [d]) => (ORconst x [rotateRight64(c, d)]) +(XORshiftLL x (MOVDconst [c]) [d]) => (XORconst x [int64(uint64(c)<<uint64(d))]) +(XORshiftRL x (MOVDconst [c]) [d]) => (XORconst x [int64(uint64(c)>>uint64(d))]) +(XORshiftRA x (MOVDconst [c]) [d]) => (XORconst x [c>>uint64(d)]) +(XORshiftRO x (MOVDconst [c]) [d]) => (XORconst x [rotateRight64(c, d)]) +(BICshiftLL x (MOVDconst [c]) [d]) => (ANDconst x [^int64(uint64(c)<<uint64(d))]) +(BICshiftRL x (MOVDconst [c]) [d]) => (ANDconst x [^int64(uint64(c)>>uint64(d))]) +(BICshiftRA x (MOVDconst [c]) [d]) => (ANDconst x [^(c>>uint64(d))]) +(BICshiftRO x (MOVDconst [c]) [d]) => (ANDconst x [^rotateRight64(c, d)]) +(ORNshiftLL x (MOVDconst [c]) [d]) => (ORconst x [^int64(uint64(c)<<uint64(d))]) +(ORNshiftRL x (MOVDconst [c]) [d]) => (ORconst x [^int64(uint64(c)>>uint64(d))]) +(ORNshiftRA x (MOVDconst [c]) [d]) => (ORconst x [^(c>>uint64(d))]) +(ORNshiftRO x (MOVDconst [c]) [d]) => (ORconst x [^rotateRight64(c, d)]) +(EONshiftLL x (MOVDconst [c]) [d]) => (XORconst x [^int64(uint64(c)<<uint64(d))]) +(EONshiftRL x (MOVDconst [c]) [d]) => (XORconst x [^int64(uint64(c)>>uint64(d))]) +(EONshiftRA x (MOVDconst [c]) [d]) => (XORconst x [^(c>>uint64(d))]) +(EONshiftRO x (MOVDconst [c]) [d]) => (XORconst x [^rotateRight64(c, d)]) +(CMPshiftLL x (MOVDconst [c]) [d]) => (CMPconst x [int64(uint64(c)<<uint64(d))]) +(CMPshiftRL x (MOVDconst [c]) [d]) => (CMPconst x [int64(uint64(c)>>uint64(d))]) +(CMPshiftRA x (MOVDconst [c]) [d]) => (CMPconst x [c>>uint64(d)]) +(CMNshiftLL x (MOVDconst [c]) [d]) => (CMNconst x [int64(uint64(c)<<uint64(d))]) +(CMNshiftRL x (MOVDconst [c]) [d]) => (CMNconst x [int64(uint64(c)>>uint64(d))]) +(CMNshiftRA x (MOVDconst [c]) [d]) => (CMNconst x [c>>uint64(d)]) +(TSTshiftLL x (MOVDconst [c]) [d]) => (TSTconst x [int64(uint64(c)<<uint64(d))]) +(TSTshiftRL x (MOVDconst [c]) [d]) => (TSTconst x [int64(uint64(c)>>uint64(d))]) +(TSTshiftRA x (MOVDconst [c]) [d]) => (TSTconst x [c>>uint64(d)]) +(TSTshiftRO x (MOVDconst [c]) [d]) => (TSTconst x [rotateRight64(c, d)]) + +// simplification with *shift ops +(SUBshiftLL (SLLconst x [c]) x [c]) => (MOVDconst [0]) +(SUBshiftRL (SRLconst x [c]) x [c]) => (MOVDconst [0]) +(SUBshiftRA (SRAconst x [c]) x [c]) => (MOVDconst [0]) +(ANDshiftLL y:(SLLconst x [c]) x [c]) => y +(ANDshiftRL y:(SRLconst x [c]) x [c]) => y +(ANDshiftRA y:(SRAconst x [c]) x [c]) => y +(ANDshiftRO y:(RORconst x [c]) x [c]) => y +(ORshiftLL y:(SLLconst x [c]) x [c]) => y +(ORshiftRL y:(SRLconst x [c]) x [c]) => y +(ORshiftRA y:(SRAconst x [c]) x [c]) => y +(ORshiftRO y:(RORconst x [c]) x [c]) => y +(XORshiftLL (SLLconst x [c]) x [c]) => (MOVDconst [0]) +(XORshiftRL (SRLconst x [c]) x [c]) => (MOVDconst [0]) +(XORshiftRA (SRAconst x [c]) x [c]) => (MOVDconst [0]) +(XORshiftRO (RORconst x [c]) x [c]) => (MOVDconst [0]) +(BICshiftLL (SLLconst x [c]) x [c]) => (MOVDconst [0]) +(BICshiftRL (SRLconst x [c]) x [c]) => (MOVDconst [0]) +(BICshiftRA (SRAconst x [c]) x [c]) => (MOVDconst [0]) +(BICshiftRO (RORconst x [c]) x [c]) => (MOVDconst [0]) +(EONshiftLL (SLLconst x [c]) x [c]) => (MOVDconst [-1]) +(EONshiftRL (SRLconst x [c]) x [c]) => (MOVDconst [-1]) +(EONshiftRA (SRAconst x [c]) x [c]) => (MOVDconst [-1]) +(EONshiftRO (RORconst x [c]) x [c]) => (MOVDconst [-1]) +(ORNshiftLL (SLLconst x [c]) x [c]) => (MOVDconst [-1]) +(ORNshiftRL (SRLconst x [c]) x [c]) => (MOVDconst [-1]) +(ORNshiftRA (SRAconst x [c]) x [c]) => (MOVDconst [-1]) +(ORNshiftRO (RORconst x [c]) x [c]) => (MOVDconst [-1]) + +// rev16w | rev16 +// ((x>>8) | (x<<8)) => (REV16W x), the type of x is uint16, "|" can also be "^" or "+". +((ADDshiftLL|ORshiftLL|XORshiftLL) <typ.UInt16> [8] (UBFX <typ.UInt16> [armBFAuxInt(8, 8)] x) x) => (REV16W x) + +// ((x & 0xff00ff00)>>8) | ((x & 0x00ff00ff)<<8), "|" can also be "^" or "+". +((ADDshiftLL|ORshiftLL|XORshiftLL) [8] (UBFX [armBFAuxInt(8, 24)] (ANDconst [c1] x)) (ANDconst [c2] x)) + && uint32(c1) == 0xff00ff00 && uint32(c2) == 0x00ff00ff + => (REV16W x) + +// ((x & 0xff00ff00ff00ff00)>>8) | ((x & 0x00ff00ff00ff00ff)<<8), "|" can also be "^" or "+". +((ADDshiftLL|ORshiftLL|XORshiftLL) [8] (SRLconst [8] (ANDconst [c1] x)) (ANDconst [c2] x)) + && (uint64(c1) == 0xff00ff00ff00ff00 && uint64(c2) == 0x00ff00ff00ff00ff) + => (REV16 x) + +// ((x & 0xff00ff00)>>8) | ((x & 0x00ff00ff)<<8), "|" can also be "^" or "+". +((ADDshiftLL|ORshiftLL|XORshiftLL) [8] (SRLconst [8] (ANDconst [c1] x)) (ANDconst [c2] x)) + && (uint64(c1) == 0xff00ff00 && uint64(c2) == 0x00ff00ff) + => (REV16 (ANDconst <x.Type> [0xffffffff] x)) + +// Extract from reg pair +(ADDshiftLL [c] (SRLconst x [64-c]) x2) => (EXTRconst [64-c] x2 x) +( ORshiftLL [c] (SRLconst x [64-c]) x2) => (EXTRconst [64-c] x2 x) +(XORshiftLL [c] (SRLconst x [64-c]) x2) => (EXTRconst [64-c] x2 x) + +(ADDshiftLL <t> [c] (UBFX [bfc] x) x2) && c < 32 && t.Size() == 4 && bfc == armBFAuxInt(32-c, c) + => (EXTRWconst [32-c] x2 x) +( ORshiftLL <t> [c] (UBFX [bfc] x) x2) && c < 32 && t.Size() == 4 && bfc == armBFAuxInt(32-c, c) + => (EXTRWconst [32-c] x2 x) +(XORshiftLL <t> [c] (UBFX [bfc] x) x2) && c < 32 && t.Size() == 4 && bfc == armBFAuxInt(32-c, c) + => (EXTRWconst [32-c] x2 x) + +// Rewrite special pairs of shifts to AND. +// On ARM64 the bitmask can fit into an instruction. +(SRLconst [c] (SLLconst [c] x)) && 0 < c && c < 64 => (ANDconst [1<<uint(64-c)-1] x) // mask out high bits +(SLLconst [c] (SRLconst [c] x)) && 0 < c && c < 64 => (ANDconst [^(1<<uint(c)-1)] x) // mask out low bits + +// Special case setting bit as 1. An example is math.Copysign(c,-1) +(ORconst [c1] (ANDconst [c2] x)) && c2|c1 == ^0 => (ORconst [c1] x) + +// If the shift amount is larger than the datasize(32, 16, 8), we can optimize to constant 0. +(MOVWUreg (SLLconst [lc] x)) && lc >= 32 => (MOVDconst [0]) +(MOVHUreg (SLLconst [lc] x)) && lc >= 16 => (MOVDconst [0]) +(MOVBUreg (SLLconst [lc] x)) && lc >= 8 => (MOVDconst [0]) + +// After zero extension, the upper (64-datasize(32|16|8)) bits are zero, we can optimiza to constant 0. +(SRLconst [rc] (MOVWUreg x)) && rc >= 32 => (MOVDconst [0]) +(SRLconst [rc] (MOVHUreg x)) && rc >= 16 => (MOVDconst [0]) +(SRLconst [rc] (MOVBUreg x)) && rc >= 8 => (MOVDconst [0]) + +// bitfield ops + +// sbfiz +// (x << lc) >> rc +(SRAconst [rc] (SLLconst [lc] x)) && lc > rc => (SBFIZ [armBFAuxInt(lc-rc, 64-lc)] x) +// int64(x << lc) +(MOVWreg (SLLconst [lc] x)) && lc < 32 => (SBFIZ [armBFAuxInt(lc, 32-lc)] x) +(MOVHreg (SLLconst [lc] x)) && lc < 16 => (SBFIZ [armBFAuxInt(lc, 16-lc)] x) +(MOVBreg (SLLconst [lc] x)) && lc < 8 => (SBFIZ [armBFAuxInt(lc, 8-lc)] x) +// int64(x) << lc +(SLLconst [lc] (MOVWreg x)) => (SBFIZ [armBFAuxInt(lc, min(32, 64-lc))] x) +(SLLconst [lc] (MOVHreg x)) => (SBFIZ [armBFAuxInt(lc, min(16, 64-lc))] x) +(SLLconst [lc] (MOVBreg x)) => (SBFIZ [armBFAuxInt(lc, min(8, 64-lc))] x) + +// sbfx +// (x << lc) >> rc +(SRAconst [rc] (SLLconst [lc] x)) && lc <= rc => (SBFX [armBFAuxInt(rc-lc, 64-rc)] x) +// int64(x) >> rc +(SRAconst [rc] (MOVWreg x)) && rc < 32 => (SBFX [armBFAuxInt(rc, 32-rc)] x) +(SRAconst [rc] (MOVHreg x)) && rc < 16 => (SBFX [armBFAuxInt(rc, 16-rc)] x) +(SRAconst [rc] (MOVBreg x)) && rc < 8 => (SBFX [armBFAuxInt(rc, 8-rc)] x) +// merge sbfx and sign-extension into sbfx +(MOVWreg (SBFX [bfc] x)) && bfc.getARM64BFwidth() <= 32 => (SBFX [bfc] x) +(MOVHreg (SBFX [bfc] x)) && bfc.getARM64BFwidth() <= 16 => (SBFX [bfc] x) +(MOVBreg (SBFX [bfc] x)) && bfc.getARM64BFwidth() <= 8 => (SBFX [bfc] x) + +// sbfiz/sbfx combinations: merge shifts into bitfield ops +(SRAconst [sc] (SBFIZ [bfc] x)) && sc < bfc.getARM64BFlsb() + => (SBFIZ [armBFAuxInt(bfc.getARM64BFlsb()-sc, bfc.getARM64BFwidth())] x) +(SRAconst [sc] (SBFIZ [bfc] x)) && sc >= bfc.getARM64BFlsb() + && sc < bfc.getARM64BFlsb()+bfc.getARM64BFwidth() + => (SBFX [armBFAuxInt(sc-bfc.getARM64BFlsb(), bfc.getARM64BFlsb()+bfc.getARM64BFwidth()-sc)] x) + +// ubfiz +// (x << lc) >> rc +(SRLconst [rc] (SLLconst [lc] x)) && lc > rc => (UBFIZ [armBFAuxInt(lc-rc, 64-lc)] x) +// uint64(x) << lc +(SLLconst [lc] (MOVWUreg x)) => (UBFIZ [armBFAuxInt(lc, min(32, 64-lc))] x) +(SLLconst [lc] (MOVHUreg x)) => (UBFIZ [armBFAuxInt(lc, min(16, 64-lc))] x) +(SLLconst [lc] (MOVBUreg x)) => (UBFIZ [armBFAuxInt(lc, min(8, 64-lc))] x) +// uint64(x << lc) +(MOVWUreg (SLLconst [lc] x)) && lc < 32 => (UBFIZ [armBFAuxInt(lc, 32-lc)] x) +(MOVHUreg (SLLconst [lc] x)) && lc < 16 => (UBFIZ [armBFAuxInt(lc, 16-lc)] x) +(MOVBUreg (SLLconst [lc] x)) && lc < 8 => (UBFIZ [armBFAuxInt(lc, 8-lc)] x) + +// merge ANDconst into ubfiz +// (x & ac) << sc +(SLLconst [sc] (ANDconst [ac] x)) && isARM64BFMask(sc, ac, 0) + => (UBFIZ [armBFAuxInt(sc, arm64BFWidth(ac, 0))] x) +// (x << sc) & ac +(ANDconst [ac] (SLLconst [sc] x)) && isARM64BFMask(sc, ac, sc) + => (UBFIZ [armBFAuxInt(sc, arm64BFWidth(ac, sc))] x) + +// ubfx +// (x << lc) >> rc +(SRLconst [rc] (SLLconst [lc] x)) && lc < rc => (UBFX [armBFAuxInt(rc-lc, 64-rc)] x) +// uint64(x) >> rc +(SRLconst [rc] (MOVWUreg x)) && rc < 32 => (UBFX [armBFAuxInt(rc, 32-rc)] x) +(SRLconst [rc] (MOVHUreg x)) && rc < 16 => (UBFX [armBFAuxInt(rc, 16-rc)] x) +(SRLconst [rc] (MOVBUreg x)) && rc < 8 => (UBFX [armBFAuxInt(rc, 8-rc)] x) +// uint64(x >> rc) +(MOVWUreg (SRLconst [rc] x)) && rc < 32 => (UBFX [armBFAuxInt(rc, 32)] x) +(MOVHUreg (SRLconst [rc] x)) && rc < 16 => (UBFX [armBFAuxInt(rc, 16)] x) +(MOVBUreg (SRLconst [rc] x)) && rc < 8 => (UBFX [armBFAuxInt(rc, 8)] x) +// merge ANDconst into ubfx +// (x >> sc) & ac +(ANDconst [ac] (SRLconst [sc] x)) && isARM64BFMask(sc, ac, 0) + => (UBFX [armBFAuxInt(sc, arm64BFWidth(ac, 0))] x) +// (x & ac) >> sc +(SRLconst [sc] (ANDconst [ac] x)) && isARM64BFMask(sc, ac, sc) + => (UBFX [armBFAuxInt(sc, arm64BFWidth(ac, sc))] x) +// merge ANDconst and ubfx into ubfx +(ANDconst [c] (UBFX [bfc] x)) && isARM64BFMask(0, c, 0) => + (UBFX [armBFAuxInt(bfc.getARM64BFlsb(), min(bfc.getARM64BFwidth(), arm64BFWidth(c, 0)))] x) +(UBFX [bfc] (ANDconst [c] x)) && isARM64BFMask(0, c, 0) && bfc.getARM64BFlsb() + bfc.getARM64BFwidth() <= arm64BFWidth(c, 0) => + (UBFX [bfc] x) +// merge ubfx and zerso-extension into ubfx +(MOVWUreg (UBFX [bfc] x)) && bfc.getARM64BFwidth() <= 32 => (UBFX [bfc] x) +(MOVHUreg (UBFX [bfc] x)) && bfc.getARM64BFwidth() <= 16 => (UBFX [bfc] x) +(MOVBUreg (UBFX [bfc] x)) && bfc.getARM64BFwidth() <= 8 => (UBFX [bfc] x) + +// ubfiz/ubfx combinations: merge shifts into bitfield ops +(SRLconst [sc] (UBFX [bfc] x)) && sc < bfc.getARM64BFwidth() + => (UBFX [armBFAuxInt(bfc.getARM64BFlsb()+sc, bfc.getARM64BFwidth()-sc)] x) +(UBFX [bfc] (SRLconst [sc] x)) && sc+bfc.getARM64BFwidth()+bfc.getARM64BFlsb() < 64 + => (UBFX [armBFAuxInt(bfc.getARM64BFlsb()+sc, bfc.getARM64BFwidth())] x) +(SLLconst [sc] (UBFIZ [bfc] x)) && sc+bfc.getARM64BFwidth()+bfc.getARM64BFlsb() < 64 + => (UBFIZ [armBFAuxInt(bfc.getARM64BFlsb()+sc, bfc.getARM64BFwidth())] x) +(UBFIZ [bfc] (SLLconst [sc] x)) && sc < bfc.getARM64BFwidth() + => (UBFIZ [armBFAuxInt(bfc.getARM64BFlsb()+sc, bfc.getARM64BFwidth()-sc)] x) +// ((x << c1) >> c2) >> c3 +(SRLconst [sc] (UBFIZ [bfc] x)) && sc == bfc.getARM64BFlsb() + => (ANDconst [1<<uint(bfc.getARM64BFwidth())-1] x) +(SRLconst [sc] (UBFIZ [bfc] x)) && sc < bfc.getARM64BFlsb() + => (UBFIZ [armBFAuxInt(bfc.getARM64BFlsb()-sc, bfc.getARM64BFwidth())] x) +(SRLconst [sc] (UBFIZ [bfc] x)) && sc > bfc.getARM64BFlsb() + && sc < bfc.getARM64BFlsb()+bfc.getARM64BFwidth() + => (UBFX [armBFAuxInt(sc-bfc.getARM64BFlsb(), bfc.getARM64BFlsb()+bfc.getARM64BFwidth()-sc)] x) +// ((x << c1) << c2) >> c3 +(UBFX [bfc] (SLLconst [sc] x)) && sc == bfc.getARM64BFlsb() + => (ANDconst [1<<uint(bfc.getARM64BFwidth())-1] x) +(UBFX [bfc] (SLLconst [sc] x)) && sc < bfc.getARM64BFlsb() + => (UBFX [armBFAuxInt(bfc.getARM64BFlsb()-sc, bfc.getARM64BFwidth())] x) +(UBFX [bfc] (SLLconst [sc] x)) && sc > bfc.getARM64BFlsb() + && sc < bfc.getARM64BFlsb()+bfc.getARM64BFwidth() + => (UBFIZ [armBFAuxInt(sc-bfc.getARM64BFlsb(), bfc.getARM64BFlsb()+bfc.getARM64BFwidth()-sc)] x) + +// bfi +(OR (UBFIZ [bfc] x) (ANDconst [ac] y)) + && ac == ^((1<<uint(bfc.getARM64BFwidth())-1) << uint(bfc.getARM64BFlsb())) + => (BFI [bfc] y x) +(ORshiftRL [rc] (ANDconst [ac] x) (SLLconst [lc] y)) + && lc > rc && ac == ^((1<<uint(64-lc)-1) << uint64(lc-rc)) + => (BFI [armBFAuxInt(lc-rc, 64-lc)] x y) +// bfxil +(OR (UBFX [bfc] x) (ANDconst [ac] y)) && ac == ^(1<<uint(bfc.getARM64BFwidth())-1) + => (BFXIL [bfc] y x) +(ORshiftLL [sc] (UBFX [bfc] x) (SRLconst [sc] y)) && sc == bfc.getARM64BFwidth() + => (BFXIL [bfc] y x) +(ORshiftRL [rc] (ANDconst [ac] y) (SLLconst [lc] x)) && lc < rc && ac == ^((1<<uint(64-rc)-1)) + => (BFXIL [armBFAuxInt(rc-lc, 64-rc)] y x) + +// do combined loads +// little endian loads +// b[0] | b[1]<<8 => load 16-bit +(ORshiftLL <t> [8] + y0:(MOVDnop x0:(MOVBUload [i0] {s} p mem)) + y1:(MOVDnop x1:(MOVBUload [i1] {s} p mem))) + && i1 == i0+1 + && x0.Uses == 1 && x1.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, y0, y1) + => @mergePoint(b,x0,x1) (MOVHUload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem) +(ORshiftLL <t> [8] + y0:(MOVDnop x0:(MOVBUloadidx ptr0 idx0 mem)) + y1:(MOVDnop x1:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem))) + && s == nil + && x0.Uses == 1 && x1.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 + && mergePoint(b,x0,x1) != nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && clobber(x0, x1, y0, y1) + => @mergePoint(b,x0,x1) (MOVHUloadidx <t> ptr0 idx0 mem) +(ORshiftLL <t> [8] + y0:(MOVDnop x0:(MOVBUloadidx ptr idx mem)) + y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [1] idx) mem))) + && x0.Uses == 1 && x1.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, y0, y1) + => @mergePoint(b,x0,x1) (MOVHUloadidx <t> ptr idx mem) + +// b[0] | b[1]<<8 | b[2]<<16 | b[3]<<24 => load 32-bit +(ORshiftLL <t> [24] o0:(ORshiftLL [16] + x0:(MOVHUload [i0] {s} p mem) + y1:(MOVDnop x1:(MOVBUload [i2] {s} p mem))) + y2:(MOVDnop x2:(MOVBUload [i3] {s} p mem))) + && i2 == i0+2 + && i3 == i0+3 + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 + && y1.Uses == 1 && y2.Uses == 1 + && o0.Uses == 1 + && mergePoint(b,x0,x1,x2) != nil + && clobber(x0, x1, x2, y1, y2, o0) + => @mergePoint(b,x0,x1,x2) (MOVWUload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem) +(ORshiftLL <t> [24] o0:(ORshiftLL [16] + x0:(MOVHUloadidx ptr0 idx0 mem) + y1:(MOVDnop x1:(MOVBUload [2] {s} p1:(ADD ptr1 idx1) mem))) + y2:(MOVDnop x2:(MOVBUload [3] {s} p mem))) + && s == nil + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 + && y1.Uses == 1 && y2.Uses == 1 + && o0.Uses == 1 + && mergePoint(b,x0,x1,x2) != nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && isSamePtr(p1, p) + && clobber(x0, x1, x2, y1, y2, o0) + => @mergePoint(b,x0,x1,x2) (MOVWUloadidx <t> ptr0 idx0 mem) +(ORshiftLL <t> [24] o0:(ORshiftLL [16] + x0:(MOVHUloadidx ptr idx mem) + y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [2] idx) mem))) + y2:(MOVDnop x2:(MOVBUloadidx ptr (ADDconst [3] idx) mem))) + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 + && y1.Uses == 1 && y2.Uses == 1 + && o0.Uses == 1 + && mergePoint(b,x0,x1,x2) != nil + && clobber(x0, x1, x2, y1, y2, o0) + => @mergePoint(b,x0,x1,x2) (MOVWUloadidx <t> ptr idx mem) +(ORshiftLL <t> [24] o0:(ORshiftLL [16] + x0:(MOVHUloadidx2 ptr0 idx0 mem) + y1:(MOVDnop x1:(MOVBUload [2] {s} p1:(ADDshiftLL [1] ptr1 idx1) mem))) + y2:(MOVDnop x2:(MOVBUload [3] {s} p mem))) + && s == nil + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 + && y1.Uses == 1 && y2.Uses == 1 + && o0.Uses == 1 + && mergePoint(b,x0,x1,x2) != nil + && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) + && isSamePtr(p1, p) + && clobber(x0, x1, x2, y1, y2, o0) + => @mergePoint(b,x0,x1,x2) (MOVWUloadidx <t> ptr0 (SLLconst <idx0.Type> [1] idx0) mem) + +// b[0] | b[1]<<8 | b[2]<<16 | b[3]<<24 | b[4]<<32 | b[5]<<40 | b[6]<<48 | b[7]<<56 => load 64-bit +(ORshiftLL <t> [56] o0:(ORshiftLL [48] o1:(ORshiftLL [40] o2:(ORshiftLL [32] + x0:(MOVWUload [i0] {s} p mem) + y1:(MOVDnop x1:(MOVBUload [i4] {s} p mem))) + y2:(MOVDnop x2:(MOVBUload [i5] {s} p mem))) + y3:(MOVDnop x3:(MOVBUload [i6] {s} p mem))) + y4:(MOVDnop x4:(MOVBUload [i7] {s} p mem))) + && i4 == i0+4 + && i5 == i0+5 + && i6 == i0+6 + && i7 == i0+7 + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 + && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && y4.Uses == 1 + && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 + && mergePoint(b,x0,x1,x2,x3,x4) != nil + && clobber(x0, x1, x2, x3, x4, y1, y2, y3, y4, o0, o1, o2) + => @mergePoint(b,x0,x1,x2,x3,x4) (MOVDload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem) +(ORshiftLL <t> [56] o0:(ORshiftLL [48] o1:(ORshiftLL [40] o2:(ORshiftLL [32] + x0:(MOVWUloadidx ptr0 idx0 mem) + y1:(MOVDnop x1:(MOVBUload [4] {s} p1:(ADD ptr1 idx1) mem))) + y2:(MOVDnop x2:(MOVBUload [5] {s} p mem))) + y3:(MOVDnop x3:(MOVBUload [6] {s} p mem))) + y4:(MOVDnop x4:(MOVBUload [7] {s} p mem))) + && s == nil + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 + && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && y4.Uses == 1 + && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 + && mergePoint(b,x0,x1,x2,x3,x4) != nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && isSamePtr(p1, p) + && clobber(x0, x1, x2, x3, x4, y1, y2, y3, y4, o0, o1, o2) + => @mergePoint(b,x0,x1,x2,x3,x4) (MOVDloadidx <t> ptr0 idx0 mem) +(ORshiftLL <t> [56] o0:(ORshiftLL [48] o1:(ORshiftLL [40] o2:(ORshiftLL [32] + x0:(MOVWUloadidx4 ptr0 idx0 mem) + y1:(MOVDnop x1:(MOVBUload [4] {s} p1:(ADDshiftLL [2] ptr1 idx1) mem))) + y2:(MOVDnop x2:(MOVBUload [5] {s} p mem))) + y3:(MOVDnop x3:(MOVBUload [6] {s} p mem))) + y4:(MOVDnop x4:(MOVBUload [7] {s} p mem))) + && s == nil + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 + && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && y4.Uses == 1 + && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 + && mergePoint(b,x0,x1,x2,x3,x4) != nil + && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) + && isSamePtr(p1, p) + && clobber(x0, x1, x2, x3, x4, y1, y2, y3, y4, o0, o1, o2) + => @mergePoint(b,x0,x1,x2,x3,x4) (MOVDloadidx <t> ptr0 (SLLconst <idx0.Type> [2] idx0) mem) +(ORshiftLL <t> [56] o0:(ORshiftLL [48] o1:(ORshiftLL [40] o2:(ORshiftLL [32] + x0:(MOVWUloadidx ptr idx mem) + y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [4] idx) mem))) + y2:(MOVDnop x2:(MOVBUloadidx ptr (ADDconst [5] idx) mem))) + y3:(MOVDnop x3:(MOVBUloadidx ptr (ADDconst [6] idx) mem))) + y4:(MOVDnop x4:(MOVBUloadidx ptr (ADDconst [7] idx) mem))) + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 + && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && y4.Uses == 1 + && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 + && mergePoint(b,x0,x1,x2,x3,x4) != nil + && clobber(x0, x1, x2, x3, x4, y1, y2, y3, y4, o0, o1, o2) + => @mergePoint(b,x0,x1,x2,x3,x4) (MOVDloadidx <t> ptr idx mem) + +// b[3]<<24 | b[2]<<16 | b[1]<<8 | b[0] => load 32-bit +(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] s0:(SLLconst [24] + y0:(MOVDnop x0:(MOVBUload [i3] {s} p mem))) + y1:(MOVDnop x1:(MOVBUload [i2] {s} p mem))) + y2:(MOVDnop x2:(MOVBUload [i1] {s} p mem))) + y3:(MOVDnop x3:(MOVBUload [i0] {s} p mem))) + && i1 == i0+1 + && i2 == i0+2 + && i3 == i0+3 + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 + && o0.Uses == 1 && o1.Uses == 1 && s0.Uses == 1 + && mergePoint(b,x0,x1,x2,x3) != nil + && clobber(x0, x1, x2, x3, y0, y1, y2, y3, o0, o1, s0) + => @mergePoint(b,x0,x1,x2,x3) (MOVWUload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem) +(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] s0:(SLLconst [24] + y0:(MOVDnop x0:(MOVBUload [3] {s} p mem))) + y1:(MOVDnop x1:(MOVBUload [2] {s} p mem))) + y2:(MOVDnop x2:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem))) + y3:(MOVDnop x3:(MOVBUloadidx ptr0 idx0 mem))) + && s == nil + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 + && o0.Uses == 1 && o1.Uses == 1 && s0.Uses == 1 + && mergePoint(b,x0,x1,x2,x3) != nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && isSamePtr(p1, p) + && clobber(x0, x1, x2, x3, y0, y1, y2, y3, o0, o1, s0) + => @mergePoint(b,x0,x1,x2,x3) (MOVWUloadidx <t> ptr0 idx0 mem) +(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] s0:(SLLconst [24] + y0:(MOVDnop x0:(MOVBUloadidx ptr (ADDconst [3] idx) mem))) + y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [2] idx) mem))) + y2:(MOVDnop x2:(MOVBUloadidx ptr (ADDconst [1] idx) mem))) + y3:(MOVDnop x3:(MOVBUloadidx ptr idx mem))) + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 + && o0.Uses == 1 && o1.Uses == 1 && s0.Uses == 1 + && mergePoint(b,x0,x1,x2,x3) != nil + && clobber(x0, x1, x2, x3, y0, y1, y2, y3, o0, o1, s0) + => @mergePoint(b,x0,x1,x2,x3) (MOVWUloadidx <t> ptr idx mem) + +// b[7]<<56 | b[6]<<48 | b[5]<<40 | b[4]<<32 | b[3]<<24 | b[2]<<16 | b[1]<<8 | b[0] => load 64-bit +(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] o2:(ORshiftLL [24] o3:(ORshiftLL [32] o4:(ORshiftLL [40] o5:(ORshiftLL [48] s0:(SLLconst [56] + y0:(MOVDnop x0:(MOVBUload [i7] {s} p mem))) + y1:(MOVDnop x1:(MOVBUload [i6] {s} p mem))) + y2:(MOVDnop x2:(MOVBUload [i5] {s} p mem))) + y3:(MOVDnop x3:(MOVBUload [i4] {s} p mem))) + y4:(MOVDnop x4:(MOVBUload [i3] {s} p mem))) + y5:(MOVDnop x5:(MOVBUload [i2] {s} p mem))) + y6:(MOVDnop x6:(MOVBUload [i1] {s} p mem))) + y7:(MOVDnop x7:(MOVBUload [i0] {s} p mem))) + && i1 == i0+1 + && i2 == i0+2 + && i3 == i0+3 + && i4 == i0+4 + && i5 == i0+5 + && i6 == i0+6 + && i7 == i0+7 + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 + && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 + && y4.Uses == 1 && y5.Uses == 1 && y6.Uses == 1 && y7.Uses == 1 + && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1 + && o4.Uses == 1 && o5.Uses == 1 && s0.Uses == 1 + && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil + && clobber(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7, o0, o1, o2, o3, o4, o5, s0) + => @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (MOVDload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem) +(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] o2:(ORshiftLL [24] o3:(ORshiftLL [32] o4:(ORshiftLL [40] o5:(ORshiftLL [48] s0:(SLLconst [56] + y0:(MOVDnop x0:(MOVBUload [7] {s} p mem))) + y1:(MOVDnop x1:(MOVBUload [6] {s} p mem))) + y2:(MOVDnop x2:(MOVBUload [5] {s} p mem))) + y3:(MOVDnop x3:(MOVBUload [4] {s} p mem))) + y4:(MOVDnop x4:(MOVBUload [3] {s} p mem))) + y5:(MOVDnop x5:(MOVBUload [2] {s} p mem))) + y6:(MOVDnop x6:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem))) + y7:(MOVDnop x7:(MOVBUloadidx ptr0 idx0 mem))) + && s == nil + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 + && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 + && y4.Uses == 1 && y5.Uses == 1 && y6.Uses == 1 && y7.Uses == 1 + && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1 + && o4.Uses == 1 && o5.Uses == 1 && s0.Uses == 1 + && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && isSamePtr(p1, p) + && clobber(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7, o0, o1, o2, o3, o4, o5, s0) + => @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (MOVDloadidx <t> ptr0 idx0 mem) +(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] o2:(ORshiftLL [24] o3:(ORshiftLL [32] o4:(ORshiftLL [40] o5:(ORshiftLL [48] s0:(SLLconst [56] + y0:(MOVDnop x0:(MOVBUloadidx ptr (ADDconst [7] idx) mem))) + y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [6] idx) mem))) + y2:(MOVDnop x2:(MOVBUloadidx ptr (ADDconst [5] idx) mem))) + y3:(MOVDnop x3:(MOVBUloadidx ptr (ADDconst [4] idx) mem))) + y4:(MOVDnop x4:(MOVBUloadidx ptr (ADDconst [3] idx) mem))) + y5:(MOVDnop x5:(MOVBUloadidx ptr (ADDconst [2] idx) mem))) + y6:(MOVDnop x6:(MOVBUloadidx ptr (ADDconst [1] idx) mem))) + y7:(MOVDnop x7:(MOVBUloadidx ptr idx mem))) + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 + && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 + && y4.Uses == 1 && y5.Uses == 1 && y6.Uses == 1 && y7.Uses == 1 + && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1 + && o4.Uses == 1 && o5.Uses == 1 && s0.Uses == 1 + && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil + && clobber(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7, o0, o1, o2, o3, o4, o5, s0) + => @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (MOVDloadidx <t> ptr idx mem) + +// big endian loads +// b[1] | b[0]<<8 => load 16-bit, reverse +(ORshiftLL <t> [8] + y0:(MOVDnop x0:(MOVBUload [i1] {s} p mem)) + y1:(MOVDnop x1:(MOVBUload [i0] {s} p mem))) + && i1 == i0+1 + && x0.Uses == 1 && x1.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, y0, y1) + => @mergePoint(b,x0,x1) (REV16W <t> (MOVHUload <t> [i0] {s} p mem)) +(ORshiftLL <t> [8] + y0:(MOVDnop x0:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem)) + y1:(MOVDnop x1:(MOVBUloadidx ptr0 idx0 mem))) + && s == nil + && x0.Uses == 1 && x1.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 + && mergePoint(b,x0,x1) != nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && clobber(x0, x1, y0, y1) + => @mergePoint(b,x0,x1) (REV16W <t> (MOVHUloadidx <t> ptr0 idx0 mem)) +(ORshiftLL <t> [8] + y0:(MOVDnop x0:(MOVBUloadidx ptr (ADDconst [1] idx) mem)) + y1:(MOVDnop x1:(MOVBUloadidx ptr idx mem))) + && x0.Uses == 1 && x1.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, y0, y1) + => @mergePoint(b,x0,x1) (REV16W <t> (MOVHUloadidx <t> ptr idx mem)) + +// b[3] | b[2]<<8 | b[1]<<16 | b[0]<<24 => load 32-bit, reverse +(ORshiftLL <t> [24] o0:(ORshiftLL [16] + y0:(REV16W x0:(MOVHUload [i2] {s} p mem)) + y1:(MOVDnop x1:(MOVBUload [i1] {s} p mem))) + y2:(MOVDnop x2:(MOVBUload [i0] {s} p mem))) + && i1 == i0+1 + && i2 == i0+2 + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 + && o0.Uses == 1 + && mergePoint(b,x0,x1,x2) != nil + && clobber(x0, x1, x2, y0, y1, y2, o0) + => @mergePoint(b,x0,x1,x2) (REVW <t> (MOVWUload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem)) +(ORshiftLL <t> [24] o0:(ORshiftLL [16] + y0:(REV16W x0:(MOVHUload [2] {s} p mem)) + y1:(MOVDnop x1:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem))) + y2:(MOVDnop x2:(MOVBUloadidx ptr0 idx0 mem))) + && s == nil + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 + && o0.Uses == 1 + && mergePoint(b,x0,x1,x2) != nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && isSamePtr(p1, p) + && clobber(x0, x1, x2, y0, y1, y2, o0) + => @mergePoint(b,x0,x1,x2) (REVW <t> (MOVWUloadidx <t> ptr0 idx0 mem)) +(ORshiftLL <t> [24] o0:(ORshiftLL [16] + y0:(REV16W x0:(MOVHUloadidx ptr (ADDconst [2] idx) mem)) + y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [1] idx) mem))) + y2:(MOVDnop x2:(MOVBUloadidx ptr idx mem))) + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 + && o0.Uses == 1 + && mergePoint(b,x0,x1,x2) != nil + && clobber(x0, x1, x2, y0, y1, y2, o0) + => @mergePoint(b,x0,x1,x2) (REVW <t> (MOVWUloadidx <t> ptr idx mem)) + +// b[7] | b[6]<<8 | b[5]<<16 | b[4]<<24 | b[3]<<32 | b[2]<<40 | b[1]<<48 | b[0]<<56 => load 64-bit, reverse +(ORshiftLL <t> [56] o0:(ORshiftLL [48] o1:(ORshiftLL [40] o2:(ORshiftLL [32] + y0:(REVW x0:(MOVWUload [i4] {s} p mem)) + y1:(MOVDnop x1:(MOVBUload [i3] {s} p mem))) + y2:(MOVDnop x2:(MOVBUload [i2] {s} p mem))) + y3:(MOVDnop x3:(MOVBUload [i1] {s} p mem))) + y4:(MOVDnop x4:(MOVBUload [i0] {s} p mem))) + && i1 == i0+1 + && i2 == i0+2 + && i3 == i0+3 + && i4 == i0+4 + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && y4.Uses == 1 + && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 + && mergePoint(b,x0,x1,x2,x3,x4) != nil + && clobber(x0, x1, x2, x3, x4, y0, y1, y2, y3, y4, o0, o1, o2) + => @mergePoint(b,x0,x1,x2,x3,x4) (REV <t> (MOVDload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem)) +(ORshiftLL <t> [56] o0:(ORshiftLL [48] o1:(ORshiftLL [40] o2:(ORshiftLL [32] + y0:(REVW x0:(MOVWUload [4] {s} p mem)) + y1:(MOVDnop x1:(MOVBUload [3] {s} p mem))) + y2:(MOVDnop x2:(MOVBUload [2] {s} p mem))) + y3:(MOVDnop x3:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem))) + y4:(MOVDnop x4:(MOVBUloadidx ptr0 idx0 mem))) + && s == nil + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && y4.Uses == 1 + && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 + && mergePoint(b,x0,x1,x2,x3,x4) != nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && isSamePtr(p1, p) + && clobber(x0, x1, x2, x3, x4, y0, y1, y2, y3, y4, o0, o1, o2) + => @mergePoint(b,x0,x1,x2,x3,x4) (REV <t> (MOVDloadidx <t> ptr0 idx0 mem)) +(ORshiftLL <t> [56] o0:(ORshiftLL [48] o1:(ORshiftLL [40] o2:(ORshiftLL [32] + y0:(REVW x0:(MOVWUloadidx ptr (ADDconst [4] idx) mem)) + y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [3] idx) mem))) + y2:(MOVDnop x2:(MOVBUloadidx ptr (ADDconst [2] idx) mem))) + y3:(MOVDnop x3:(MOVBUloadidx ptr (ADDconst [1] idx) mem))) + y4:(MOVDnop x4:(MOVBUloadidx ptr idx mem))) + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 && y4.Uses == 1 + && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 + && mergePoint(b,x0,x1,x2,x3,x4) != nil + && clobber(x0, x1, x2, x3, x4, y0, y1, y2, y3, y4, o0, o1, o2) + => @mergePoint(b,x0,x1,x2,x3,x4) (REV <t> (MOVDloadidx <t> ptr idx mem)) + +// b[0]<<24 | b[1]<<16 | b[2]<<8 | b[3] => load 32-bit, reverse +(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] s0:(SLLconst [24] + y0:(MOVDnop x0:(MOVBUload [i0] {s} p mem))) + y1:(MOVDnop x1:(MOVBUload [i1] {s} p mem))) + y2:(MOVDnop x2:(MOVBUload [i2] {s} p mem))) + y3:(MOVDnop x3:(MOVBUload [i3] {s} p mem))) + && i1 == i0+1 + && i2 == i0+2 + && i3 == i0+3 + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 + && o0.Uses == 1 && o1.Uses == 1 && s0.Uses == 1 + && mergePoint(b,x0,x1,x2,x3) != nil + && clobber(x0, x1, x2, x3, y0, y1, y2, y3, o0, o1, s0) + => @mergePoint(b,x0,x1,x2,x3) (REVW <t> (MOVWUload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem)) +(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] s0:(SLLconst [24] + y0:(MOVDnop x0:(MOVBUloadidx ptr0 idx0 mem))) + y1:(MOVDnop x1:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem))) + y2:(MOVDnop x2:(MOVBUload [2] {s} p mem))) + y3:(MOVDnop x3:(MOVBUload [3] {s} p mem))) + && s == nil + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 + && o0.Uses == 1 && o1.Uses == 1 && s0.Uses == 1 + && mergePoint(b,x0,x1,x2,x3) != nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && isSamePtr(p1, p) + && clobber(x0, x1, x2, x3, y0, y1, y2, y3, o0, o1, s0) + => @mergePoint(b,x0,x1,x2,x3) (REVW <t> (MOVWUloadidx <t> ptr0 idx0 mem)) +(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] s0:(SLLconst [24] + y0:(MOVDnop x0:(MOVBUloadidx ptr idx mem))) + y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [1] idx) mem))) + y2:(MOVDnop x2:(MOVBUloadidx ptr (ADDconst [2] idx) mem))) + y3:(MOVDnop x3:(MOVBUloadidx ptr (ADDconst [3] idx) mem))) + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 + && o0.Uses == 1 && o1.Uses == 1 && s0.Uses == 1 + && mergePoint(b,x0,x1,x2,x3) != nil + && clobber(x0, x1, x2, x3, y0, y1, y2, y3, o0, o1, s0) + => @mergePoint(b,x0,x1,x2,x3) (REVW <t> (MOVWUloadidx <t> ptr idx mem)) + +// b[0]<<56 | b[1]<<48 | b[2]<<40 | b[3]<<32 | b[4]<<24 | b[5]<<16 | b[6]<<8 | b[7] => load 64-bit, reverse +(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] o2:(ORshiftLL [24] o3:(ORshiftLL [32] o4:(ORshiftLL [40] o5:(ORshiftLL [48] s0:(SLLconst [56] + y0:(MOVDnop x0:(MOVBUload [i0] {s} p mem))) + y1:(MOVDnop x1:(MOVBUload [i1] {s} p mem))) + y2:(MOVDnop x2:(MOVBUload [i2] {s} p mem))) + y3:(MOVDnop x3:(MOVBUload [i3] {s} p mem))) + y4:(MOVDnop x4:(MOVBUload [i4] {s} p mem))) + y5:(MOVDnop x5:(MOVBUload [i5] {s} p mem))) + y6:(MOVDnop x6:(MOVBUload [i6] {s} p mem))) + y7:(MOVDnop x7:(MOVBUload [i7] {s} p mem))) + && i1 == i0+1 + && i2 == i0+2 + && i3 == i0+3 + && i4 == i0+4 + && i5 == i0+5 + && i6 == i0+6 + && i7 == i0+7 + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 + && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 + && y4.Uses == 1 && y5.Uses == 1 && y6.Uses == 1 && y7.Uses == 1 + && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1 + && o4.Uses == 1 && o5.Uses == 1 && s0.Uses == 1 + && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil + && clobber(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7, o0, o1, o2, o3, o4, o5, s0) + => @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (REV <t> (MOVDload <t> {s} (OffPtr <p.Type> [int64(i0)] p) mem)) +(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] o2:(ORshiftLL [24] o3:(ORshiftLL [32] o4:(ORshiftLL [40] o5:(ORshiftLL [48] s0:(SLLconst [56] + y0:(MOVDnop x0:(MOVBUloadidx ptr0 idx0 mem))) + y1:(MOVDnop x1:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem))) + y2:(MOVDnop x2:(MOVBUload [2] {s} p mem))) + y3:(MOVDnop x3:(MOVBUload [3] {s} p mem))) + y4:(MOVDnop x4:(MOVBUload [4] {s} p mem))) + y5:(MOVDnop x5:(MOVBUload [5] {s} p mem))) + y6:(MOVDnop x6:(MOVBUload [6] {s} p mem))) + y7:(MOVDnop x7:(MOVBUload [7] {s} p mem))) + && s == nil + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 + && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 + && y4.Uses == 1 && y5.Uses == 1 && y6.Uses == 1 && y7.Uses == 1 + && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1 + && o4.Uses == 1 && o5.Uses == 1 && s0.Uses == 1 + && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && isSamePtr(p1, p) + && clobber(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7, o0, o1, o2, o3, o4, o5, s0) + => @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (REV <t> (MOVDloadidx <t> ptr0 idx0 mem)) +(OR <t> o0:(ORshiftLL [8] o1:(ORshiftLL [16] o2:(ORshiftLL [24] o3:(ORshiftLL [32] o4:(ORshiftLL [40] o5:(ORshiftLL [48] s0:(SLLconst [56] + y0:(MOVDnop x0:(MOVBUloadidx ptr idx mem))) + y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [1] idx) mem))) + y2:(MOVDnop x2:(MOVBUloadidx ptr (ADDconst [2] idx) mem))) + y3:(MOVDnop x3:(MOVBUloadidx ptr (ADDconst [3] idx) mem))) + y4:(MOVDnop x4:(MOVBUloadidx ptr (ADDconst [4] idx) mem))) + y5:(MOVDnop x5:(MOVBUloadidx ptr (ADDconst [5] idx) mem))) + y6:(MOVDnop x6:(MOVBUloadidx ptr (ADDconst [6] idx) mem))) + y7:(MOVDnop x7:(MOVBUloadidx ptr (ADDconst [7] idx) mem))) + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 + && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1 + && y0.Uses == 1 && y1.Uses == 1 && y2.Uses == 1 && y3.Uses == 1 + && y4.Uses == 1 && y5.Uses == 1 && y6.Uses == 1 && y7.Uses == 1 + && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1 + && o4.Uses == 1 && o5.Uses == 1 && s0.Uses == 1 + && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil + && clobber(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7, o0, o1, o2, o3, o4, o5, s0) + => @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (REV <t> (MOVDloadidx <t> ptr idx mem)) + +// Combine zero stores into larger (unaligned) stores. +(MOVBstorezero [i] {s} ptr0 x:(MOVBstorezero [j] {s} ptr1 mem)) + && x.Uses == 1 + && areAdjacentOffsets(int64(i),int64(j),1) + && isSamePtr(ptr0, ptr1) + && clobber(x) + => (MOVHstorezero [int32(min(int64(i),int64(j)))] {s} ptr0 mem) +(MOVBstorezero [1] {s} (ADD ptr0 idx0) x:(MOVBstorezeroidx ptr1 idx1 mem)) + && x.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && clobber(x) + => (MOVHstorezeroidx ptr1 idx1 mem) +(MOVBstorezeroidx ptr (ADDconst [1] idx) x:(MOVBstorezeroidx ptr idx mem)) + && x.Uses == 1 + && clobber(x) + => (MOVHstorezeroidx ptr idx mem) +(MOVHstorezero [i] {s} ptr0 x:(MOVHstorezero [j] {s} ptr1 mem)) + && x.Uses == 1 + && areAdjacentOffsets(int64(i),int64(j),2) + && isSamePtr(ptr0, ptr1) + && clobber(x) + => (MOVWstorezero [int32(min(int64(i),int64(j)))] {s} ptr0 mem) +(MOVHstorezero [2] {s} (ADD ptr0 idx0) x:(MOVHstorezeroidx ptr1 idx1 mem)) + && x.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && clobber(x) + => (MOVWstorezeroidx ptr1 idx1 mem) +(MOVHstorezeroidx ptr (ADDconst [2] idx) x:(MOVHstorezeroidx ptr idx mem)) + && x.Uses == 1 + && clobber(x) + => (MOVWstorezeroidx ptr idx mem) +(MOVHstorezero [2] {s} (ADDshiftLL [1] ptr0 idx0) x:(MOVHstorezeroidx2 ptr1 idx1 mem)) + && x.Uses == 1 + && s == nil + && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) + && clobber(x) + => (MOVWstorezeroidx ptr1 (SLLconst <idx1.Type> [1] idx1) mem) +(MOVWstorezero [i] {s} ptr0 x:(MOVWstorezero [j] {s} ptr1 mem)) + && x.Uses == 1 + && areAdjacentOffsets(int64(i),int64(j),4) + && isSamePtr(ptr0, ptr1) + && clobber(x) + => (MOVDstorezero [int32(min(int64(i),int64(j)))] {s} ptr0 mem) +(MOVWstorezero [4] {s} (ADD ptr0 idx0) x:(MOVWstorezeroidx ptr1 idx1 mem)) + && x.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && clobber(x) + => (MOVDstorezeroidx ptr1 idx1 mem) +(MOVWstorezeroidx ptr (ADDconst [4] idx) x:(MOVWstorezeroidx ptr idx mem)) + && x.Uses == 1 + && clobber(x) + => (MOVDstorezeroidx ptr idx mem) +(MOVWstorezero [4] {s} (ADDshiftLL [2] ptr0 idx0) x:(MOVWstorezeroidx4 ptr1 idx1 mem)) + && x.Uses == 1 + && s == nil + && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) + && clobber(x) + => (MOVDstorezeroidx ptr1 (SLLconst <idx1.Type> [2] idx1) mem) +(MOVDstorezero [i] {s} ptr0 x:(MOVDstorezero [j] {s} ptr1 mem)) + && x.Uses == 1 + && areAdjacentOffsets(int64(i),int64(j),8) + && isSamePtr(ptr0, ptr1) + && clobber(x) + => (MOVQstorezero [int32(min(int64(i),int64(j)))] {s} ptr0 mem) +(MOVDstorezero [8] {s} p0:(ADD ptr0 idx0) x:(MOVDstorezeroidx ptr1 idx1 mem)) + && x.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && clobber(x) + => (MOVQstorezero [0] {s} p0 mem) +(MOVDstorezero [8] {s} p0:(ADDshiftLL [3] ptr0 idx0) x:(MOVDstorezeroidx8 ptr1 idx1 mem)) + && x.Uses == 1 + && s == nil + && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) + && clobber(x) + => (MOVQstorezero [0] {s} p0 mem) + +// Combine stores into larger (unaligned) stores. +(MOVBstore [i] {s} ptr0 (SRLconst [8] w) x:(MOVBstore [i-1] {s} ptr1 w mem)) + && x.Uses == 1 + && isSamePtr(ptr0, ptr1) + && clobber(x) + => (MOVHstore [i-1] {s} ptr0 w mem) +(MOVBstore [1] {s} (ADD ptr0 idx0) (SRLconst [8] w) x:(MOVBstoreidx ptr1 idx1 w mem)) + && x.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && clobber(x) + => (MOVHstoreidx ptr1 idx1 w mem) +(MOVBstoreidx ptr (ADDconst [1] idx) (SRLconst [8] w) x:(MOVBstoreidx ptr idx w mem)) + && x.Uses == 1 + && clobber(x) + => (MOVHstoreidx ptr idx w mem) +(MOVBstore [i] {s} ptr0 (UBFX [armBFAuxInt(8, 8)] w) x:(MOVBstore [i-1] {s} ptr1 w mem)) + && x.Uses == 1 + && isSamePtr(ptr0, ptr1) + && clobber(x) + => (MOVHstore [i-1] {s} ptr0 w mem) +(MOVBstore [1] {s} (ADD ptr0 idx0) (UBFX [armBFAuxInt(8, 8)] w) x:(MOVBstoreidx ptr1 idx1 w mem)) + && x.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && clobber(x) + => (MOVHstoreidx ptr1 idx1 w mem) +(MOVBstore [i] {s} ptr0 (UBFX [armBFAuxInt(8, 24)] w) x:(MOVBstore [i-1] {s} ptr1 w mem)) + && x.Uses == 1 + && isSamePtr(ptr0, ptr1) + && clobber(x) + => (MOVHstore [i-1] {s} ptr0 w mem) +(MOVBstore [1] {s} (ADD ptr0 idx0) (UBFX [armBFAuxInt(8, 24)] w) x:(MOVBstoreidx ptr1 idx1 w mem)) + && x.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && clobber(x) + => (MOVHstoreidx ptr1 idx1 w mem) +(MOVBstore [i] {s} ptr0 (SRLconst [8] (MOVDreg w)) x:(MOVBstore [i-1] {s} ptr1 w mem)) + && x.Uses == 1 + && isSamePtr(ptr0, ptr1) + && clobber(x) + => (MOVHstore [i-1] {s} ptr0 w mem) +(MOVBstore [1] {s} (ADD ptr0 idx0) (SRLconst [8] (MOVDreg w)) x:(MOVBstoreidx ptr1 idx1 w mem)) + && x.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && clobber(x) + => (MOVHstoreidx ptr1 idx1 w mem) +(MOVBstore [i] {s} ptr0 (SRLconst [j] w) x:(MOVBstore [i-1] {s} ptr1 w0:(SRLconst [j-8] w) mem)) + && x.Uses == 1 + && isSamePtr(ptr0, ptr1) + && clobber(x) + => (MOVHstore [i-1] {s} ptr0 w0 mem) +(MOVBstore [1] {s} (ADD ptr0 idx0) (SRLconst [j] w) x:(MOVBstoreidx ptr1 idx1 w0:(SRLconst [j-8] w) mem)) + && x.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && clobber(x) + => (MOVHstoreidx ptr1 idx1 w0 mem) +(MOVBstore [i] {s} ptr0 (UBFX [bfc] w) x:(MOVBstore [i-1] {s} ptr1 w0:(UBFX [bfc2] w) mem)) + && x.Uses == 1 + && isSamePtr(ptr0, ptr1) + && bfc.getARM64BFwidth() == 32 - bfc.getARM64BFlsb() + && bfc2.getARM64BFwidth() == 32 - bfc2.getARM64BFlsb() + && bfc2.getARM64BFlsb() == bfc.getARM64BFlsb() - 8 + && clobber(x) + => (MOVHstore [i-1] {s} ptr0 w0 mem) +(MOVBstore [1] {s} (ADD ptr0 idx0) (UBFX [bfc] w) x:(MOVBstoreidx ptr1 idx1 w0:(UBFX [bfc2] w) mem)) + && x.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && bfc.getARM64BFwidth() == 32 - bfc.getARM64BFlsb() + && bfc2.getARM64BFwidth() == 32 - bfc2.getARM64BFlsb() + && bfc2.getARM64BFlsb() == bfc.getARM64BFlsb() - 8 + && clobber(x) + => (MOVHstoreidx ptr1 idx1 w0 mem) +(MOVBstore [i] {s} ptr0 (SRLconst [j] (MOVDreg w)) x:(MOVBstore [i-1] {s} ptr1 w0:(SRLconst [j-8] (MOVDreg w)) mem)) + && x.Uses == 1 + && isSamePtr(ptr0, ptr1) + && clobber(x) + => (MOVHstore [i-1] {s} ptr0 w0 mem) +(MOVBstore [1] {s} (ADD ptr0 idx0) (SRLconst [j] (MOVDreg w)) x:(MOVBstoreidx ptr1 idx1 w0:(SRLconst [j-8] (MOVDreg w)) mem)) + && x.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && clobber(x) + => (MOVHstoreidx ptr1 idx1 w0 mem) +(MOVHstore [i] {s} ptr0 (SRLconst [16] w) x:(MOVHstore [i-2] {s} ptr1 w mem)) + && x.Uses == 1 + && isSamePtr(ptr0, ptr1) + && clobber(x) + => (MOVWstore [i-2] {s} ptr0 w mem) +(MOVHstore [2] {s} (ADD ptr0 idx0) (SRLconst [16] w) x:(MOVHstoreidx ptr1 idx1 w mem)) + && x.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && clobber(x) + => (MOVWstoreidx ptr1 idx1 w mem) +(MOVHstoreidx ptr (ADDconst [2] idx) (SRLconst [16] w) x:(MOVHstoreidx ptr idx w mem)) + && x.Uses == 1 + && clobber(x) + => (MOVWstoreidx ptr idx w mem) +(MOVHstore [2] {s} (ADDshiftLL [1] ptr0 idx0) (SRLconst [16] w) x:(MOVHstoreidx2 ptr1 idx1 w mem)) + && x.Uses == 1 + && s == nil + && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) + && clobber(x) + => (MOVWstoreidx ptr1 (SLLconst <idx1.Type> [1] idx1) w mem) +(MOVHstore [i] {s} ptr0 (UBFX [armBFAuxInt(16, 16)] w) x:(MOVHstore [i-2] {s} ptr1 w mem)) + && x.Uses == 1 + && isSamePtr(ptr0, ptr1) + && clobber(x) + => (MOVWstore [i-2] {s} ptr0 w mem) +(MOVHstore [2] {s} (ADD ptr0 idx0) (UBFX [armBFAuxInt(16, 16)] w) x:(MOVHstoreidx ptr1 idx1 w mem)) + && x.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && clobber(x) + => (MOVWstoreidx ptr1 idx1 w mem) +(MOVHstore [2] {s} (ADDshiftLL [1] ptr0 idx0) (UBFX [armBFAuxInt(16, 16)] w) x:(MOVHstoreidx2 ptr1 idx1 w mem)) + && x.Uses == 1 + && s == nil + && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) + && clobber(x) + => (MOVWstoreidx ptr1 (SLLconst <idx1.Type> [1] idx1) w mem) +(MOVHstore [i] {s} ptr0 (SRLconst [16] (MOVDreg w)) x:(MOVHstore [i-2] {s} ptr1 w mem)) + && x.Uses == 1 + && isSamePtr(ptr0, ptr1) + && clobber(x) + => (MOVWstore [i-2] {s} ptr0 w mem) +(MOVHstore [2] {s} (ADD ptr0 idx0) (SRLconst [16] (MOVDreg w)) x:(MOVHstoreidx ptr1 idx1 w mem)) + && x.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && clobber(x) + => (MOVWstoreidx ptr1 idx1 w mem) +(MOVHstore [2] {s} (ADDshiftLL [1] ptr0 idx0) (SRLconst [16] (MOVDreg w)) x:(MOVHstoreidx2 ptr1 idx1 w mem)) + && x.Uses == 1 + && s == nil + && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) + && clobber(x) + => (MOVWstoreidx ptr1 (SLLconst <idx1.Type> [1] idx1) w mem) +(MOVHstore [i] {s} ptr0 (SRLconst [j] w) x:(MOVHstore [i-2] {s} ptr1 w0:(SRLconst [j-16] w) mem)) + && x.Uses == 1 + && isSamePtr(ptr0, ptr1) + && clobber(x) + => (MOVWstore [i-2] {s} ptr0 w0 mem) +(MOVHstore [2] {s} (ADD ptr0 idx0) (SRLconst [j] w) x:(MOVHstoreidx ptr1 idx1 w0:(SRLconst [j-16] w) mem)) + && x.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && clobber(x) + => (MOVWstoreidx ptr1 idx1 w0 mem) +(MOVHstore [2] {s} (ADDshiftLL [1] ptr0 idx0) (SRLconst [j] w) x:(MOVHstoreidx2 ptr1 idx1 w0:(SRLconst [j-16] w) mem)) + && x.Uses == 1 + && s == nil + && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) + && clobber(x) + => (MOVWstoreidx ptr1 (SLLconst <idx1.Type> [1] idx1) w0 mem) +(MOVWstore [i] {s} ptr0 (SRLconst [32] w) x:(MOVWstore [i-4] {s} ptr1 w mem)) + && x.Uses == 1 + && isSamePtr(ptr0, ptr1) + && clobber(x) + => (MOVDstore [i-4] {s} ptr0 w mem) +(MOVWstore [4] {s} (ADD ptr0 idx0) (SRLconst [32] w) x:(MOVWstoreidx ptr1 idx1 w mem)) + && x.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && clobber(x) + => (MOVDstoreidx ptr1 idx1 w mem) +(MOVWstoreidx ptr (ADDconst [4] idx) (SRLconst [32] w) x:(MOVWstoreidx ptr idx w mem)) + && x.Uses == 1 + && clobber(x) + => (MOVDstoreidx ptr idx w mem) +(MOVWstore [4] {s} (ADDshiftLL [2] ptr0 idx0) (SRLconst [32] w) x:(MOVWstoreidx4 ptr1 idx1 w mem)) + && x.Uses == 1 + && s == nil + && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) + && clobber(x) + => (MOVDstoreidx ptr1 (SLLconst <idx1.Type> [2] idx1) w mem) +(MOVWstore [i] {s} ptr0 (SRLconst [j] w) x:(MOVWstore [i-4] {s} ptr1 w0:(SRLconst [j-32] w) mem)) + && x.Uses == 1 + && isSamePtr(ptr0, ptr1) + && clobber(x) + => (MOVDstore [i-4] {s} ptr0 w0 mem) +(MOVWstore [4] {s} (ADD ptr0 idx0) (SRLconst [j] w) x:(MOVWstoreidx ptr1 idx1 w0:(SRLconst [j-32] w) mem)) + && x.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && clobber(x) + => (MOVDstoreidx ptr1 idx1 w0 mem) +(MOVWstore [4] {s} (ADDshiftLL [2] ptr0 idx0) (SRLconst [j] w) x:(MOVWstoreidx4 ptr1 idx1 w0:(SRLconst [j-32] w) mem)) + && x.Uses == 1 + && s == nil + && isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) + && clobber(x) + => (MOVDstoreidx ptr1 (SLLconst <idx1.Type> [2] idx1) w0 mem) +(MOVBstore [i] {s} ptr w + x0:(MOVBstore [i-1] {s} ptr (SRLconst [8] w) + x1:(MOVBstore [i-2] {s} ptr (SRLconst [16] w) + x2:(MOVBstore [i-3] {s} ptr (SRLconst [24] w) + x3:(MOVBstore [i-4] {s} ptr (SRLconst [32] w) + x4:(MOVBstore [i-5] {s} ptr (SRLconst [40] w) + x5:(MOVBstore [i-6] {s} ptr (SRLconst [48] w) + x6:(MOVBstore [i-7] {s} ptr (SRLconst [56] w) mem)))))))) + && x0.Uses == 1 + && x1.Uses == 1 + && x2.Uses == 1 + && x3.Uses == 1 + && x4.Uses == 1 + && x5.Uses == 1 + && x6.Uses == 1 + && clobber(x0, x1, x2, x3, x4, x5, x6) + => (MOVDstore [i-7] {s} ptr (REV <typ.UInt64> w) mem) +(MOVBstore [7] {s} p w + x0:(MOVBstore [6] {s} p (SRLconst [8] w) + x1:(MOVBstore [5] {s} p (SRLconst [16] w) + x2:(MOVBstore [4] {s} p (SRLconst [24] w) + x3:(MOVBstore [3] {s} p (SRLconst [32] w) + x4:(MOVBstore [2] {s} p (SRLconst [40] w) + x5:(MOVBstore [1] {s} p1:(ADD ptr1 idx1) (SRLconst [48] w) + x6:(MOVBstoreidx ptr0 idx0 (SRLconst [56] w) mem)))))))) + && x0.Uses == 1 + && x1.Uses == 1 + && x2.Uses == 1 + && x3.Uses == 1 + && x4.Uses == 1 + && x5.Uses == 1 + && x6.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && isSamePtr(p1, p) + && clobber(x0, x1, x2, x3, x4, x5, x6) + => (MOVDstoreidx ptr0 idx0 (REV <typ.UInt64> w) mem) +(MOVBstore [i] {s} ptr w + x0:(MOVBstore [i-1] {s} ptr (UBFX [armBFAuxInt(8, 24)] w) + x1:(MOVBstore [i-2] {s} ptr (UBFX [armBFAuxInt(16, 16)] w) + x2:(MOVBstore [i-3] {s} ptr (UBFX [armBFAuxInt(24, 8)] w) mem)))) + && x0.Uses == 1 + && x1.Uses == 1 + && x2.Uses == 1 + && clobber(x0, x1, x2) + => (MOVWstore [i-3] {s} ptr (REVW <typ.UInt32> w) mem) +(MOVBstore [3] {s} p w + x0:(MOVBstore [2] {s} p (UBFX [armBFAuxInt(8, 24)] w) + x1:(MOVBstore [1] {s} p1:(ADD ptr1 idx1) (UBFX [armBFAuxInt(16, 16)] w) + x2:(MOVBstoreidx ptr0 idx0 (UBFX [armBFAuxInt(24, 8)] w) mem)))) + && x0.Uses == 1 + && x1.Uses == 1 + && x2.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && isSamePtr(p1, p) + && clobber(x0, x1, x2) + => (MOVWstoreidx ptr0 idx0 (REVW <typ.UInt32> w) mem) +(MOVBstoreidx ptr (ADDconst [3] idx) w + x0:(MOVBstoreidx ptr (ADDconst [2] idx) (UBFX [armBFAuxInt(8, 24)] w) + x1:(MOVBstoreidx ptr (ADDconst [1] idx) (UBFX [armBFAuxInt(16, 16)] w) + x2:(MOVBstoreidx ptr idx (UBFX [armBFAuxInt(24, 8)] w) mem)))) + && x0.Uses == 1 + && x1.Uses == 1 + && x2.Uses == 1 + && clobber(x0, x1, x2) + => (MOVWstoreidx ptr idx (REVW <typ.UInt32> w) mem) +(MOVBstoreidx ptr idx w + x0:(MOVBstoreidx ptr (ADDconst [1] idx) (UBFX [armBFAuxInt(8, 24)] w) + x1:(MOVBstoreidx ptr (ADDconst [2] idx) (UBFX [armBFAuxInt(16, 16)] w) + x2:(MOVBstoreidx ptr (ADDconst [3] idx) (UBFX [armBFAuxInt(24, 8)] w) mem)))) + && x0.Uses == 1 + && x1.Uses == 1 + && x2.Uses == 1 + && clobber(x0, x1, x2) + => (MOVWstoreidx ptr idx w mem) +(MOVBstore [i] {s} ptr w + x0:(MOVBstore [i-1] {s} ptr (SRLconst [8] (MOVDreg w)) + x1:(MOVBstore [i-2] {s} ptr (SRLconst [16] (MOVDreg w)) + x2:(MOVBstore [i-3] {s} ptr (SRLconst [24] (MOVDreg w)) mem)))) + && x0.Uses == 1 + && x1.Uses == 1 + && x2.Uses == 1 + && clobber(x0, x1, x2) + => (MOVWstore [i-3] {s} ptr (REVW <typ.UInt32> w) mem) +(MOVBstore [3] {s} p w + x0:(MOVBstore [2] {s} p (SRLconst [8] (MOVDreg w)) + x1:(MOVBstore [1] {s} p1:(ADD ptr1 idx1) (SRLconst [16] (MOVDreg w)) + x2:(MOVBstoreidx ptr0 idx0 (SRLconst [24] (MOVDreg w)) mem)))) + && x0.Uses == 1 + && x1.Uses == 1 + && x2.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && isSamePtr(p1, p) + && clobber(x0, x1, x2) + => (MOVWstoreidx ptr0 idx0 (REVW <typ.UInt32> w) mem) +(MOVBstore [i] {s} ptr w + x0:(MOVBstore [i-1] {s} ptr (SRLconst [8] w) + x1:(MOVBstore [i-2] {s} ptr (SRLconst [16] w) + x2:(MOVBstore [i-3] {s} ptr (SRLconst [24] w) mem)))) + && x0.Uses == 1 + && x1.Uses == 1 + && x2.Uses == 1 + && clobber(x0, x1, x2) + => (MOVWstore [i-3] {s} ptr (REVW <typ.UInt32> w) mem) +(MOVBstore [3] {s} p w + x0:(MOVBstore [2] {s} p (SRLconst [8] w) + x1:(MOVBstore [1] {s} p1:(ADD ptr1 idx1) (SRLconst [16] w) + x2:(MOVBstoreidx ptr0 idx0 (SRLconst [24] w) mem)))) + && x0.Uses == 1 + && x1.Uses == 1 + && x2.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && isSamePtr(p1, p) + && clobber(x0, x1, x2) + => (MOVWstoreidx ptr0 idx0 (REVW <typ.UInt32> w) mem) +(MOVBstore [i] {s} ptr w x:(MOVBstore [i-1] {s} ptr (SRLconst [8] w) mem)) + && x.Uses == 1 + && clobber(x) + => (MOVHstore [i-1] {s} ptr (REV16W <typ.UInt16> w) mem) +(MOVBstore [1] {s} (ADD ptr1 idx1) w x:(MOVBstoreidx ptr0 idx0 (SRLconst [8] w) mem)) + && x.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && clobber(x) + => (MOVHstoreidx ptr0 idx0 (REV16W <typ.UInt16> w) mem) +(MOVBstore [i] {s} ptr w x:(MOVBstore [i-1] {s} ptr (UBFX [armBFAuxInt(8, 8)] w) mem)) + && x.Uses == 1 + && clobber(x) + => (MOVHstore [i-1] {s} ptr (REV16W <typ.UInt16> w) mem) +(MOVBstore [1] {s} (ADD ptr1 idx1) w x:(MOVBstoreidx ptr0 idx0 (UBFX [armBFAuxInt(8, 8)] w) mem)) + && x.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && clobber(x) + => (MOVHstoreidx ptr0 idx0 (REV16W <typ.UInt16> w) mem) +(MOVBstoreidx ptr (ADDconst [1] idx) w x:(MOVBstoreidx ptr idx (UBFX [armBFAuxInt(8, 8)] w) mem)) + && x.Uses == 1 + && clobber(x) + => (MOVHstoreidx ptr idx (REV16W <typ.UInt16> w) mem) +(MOVBstoreidx ptr idx w x:(MOVBstoreidx ptr (ADDconst [1] idx) (UBFX [armBFAuxInt(8, 8)] w) mem)) + && x.Uses == 1 + && clobber(x) + => (MOVHstoreidx ptr idx w mem) +(MOVBstore [i] {s} ptr w x:(MOVBstore [i-1] {s} ptr (SRLconst [8] (MOVDreg w)) mem)) + && x.Uses == 1 + && clobber(x) + => (MOVHstore [i-1] {s} ptr (REV16W <typ.UInt16> w) mem) +(MOVBstore [1] {s} (ADD ptr1 idx1) w x:(MOVBstoreidx ptr0 idx0 (SRLconst [8] (MOVDreg w)) mem)) + && x.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && clobber(x) + => (MOVHstoreidx ptr0 idx0 (REV16W <typ.UInt16> w) mem) +(MOVBstore [i] {s} ptr w x:(MOVBstore [i-1] {s} ptr (UBFX [armBFAuxInt(8, 24)] w) mem)) + && x.Uses == 1 + && clobber(x) + => (MOVHstore [i-1] {s} ptr (REV16W <typ.UInt16> w) mem) +(MOVBstore [1] {s} (ADD ptr1 idx1) w x:(MOVBstoreidx ptr0 idx0 (UBFX [armBFAuxInt(8, 24)] w) mem)) + && x.Uses == 1 + && s == nil + && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) + && clobber(x) + => (MOVHstoreidx ptr0 idx0 (REV16W <typ.UInt16> w) mem) + +// FP simplification +(FNEGS (FMULS x y)) => (FNMULS x y) +(FNEGD (FMULD x y)) => (FNMULD x y) +(FMULS (FNEGS x) y) => (FNMULS x y) +(FMULD (FNEGD x) y) => (FNMULD x y) +(FNEGS (FNMULS x y)) => (FMULS x y) +(FNEGD (FNMULD x y)) => (FMULD x y) +(FNMULS (FNEGS x) y) => (FMULS x y) +(FNMULD (FNEGD x) y) => (FMULD x y) + +(FADDS a (FMULS x y)) && a.Block.Func.useFMA(v) => (FMADDS a x y) +(FADDD a (FMULD x y)) && a.Block.Func.useFMA(v) => (FMADDD a x y) +(FSUBS a (FMULS x y)) && a.Block.Func.useFMA(v) => (FMSUBS a x y) +(FSUBD a (FMULD x y)) && a.Block.Func.useFMA(v) => (FMSUBD a x y) +(FSUBS (FMULS x y) a) && a.Block.Func.useFMA(v) => (FNMSUBS a x y) +(FSUBD (FMULD x y) a) && a.Block.Func.useFMA(v) => (FNMSUBD a x y) +(FADDS a (FNMULS x y)) && a.Block.Func.useFMA(v) => (FMSUBS a x y) +(FADDD a (FNMULD x y)) && a.Block.Func.useFMA(v) => (FMSUBD a x y) +(FSUBS a (FNMULS x y)) && a.Block.Func.useFMA(v) => (FMADDS a x y) +(FSUBD a (FNMULD x y)) && a.Block.Func.useFMA(v) => (FMADDD a x y) +(FSUBS (FNMULS x y) a) && a.Block.Func.useFMA(v) => (FNMADDS a x y) +(FSUBD (FNMULD x y) a) && a.Block.Func.useFMA(v) => (FNMADDD a x y) + +(MOVBUload [off] {sym} (SB) _) && symIsRO(sym) => (MOVDconst [int64(read8(sym, int64(off)))]) +(MOVHUload [off] {sym} (SB) _) && symIsRO(sym) => (MOVDconst [int64(read16(sym, int64(off), config.ctxt.Arch.ByteOrder))]) +(MOVWUload [off] {sym} (SB) _) && symIsRO(sym) => (MOVDconst [int64(read32(sym, int64(off), config.ctxt.Arch.ByteOrder))]) +(MOVDload [off] {sym} (SB) _) && symIsRO(sym) => (MOVDconst [int64(read64(sym, int64(off), config.ctxt.Arch.ByteOrder))]) + +// Prefetch instructions (aux is option: 0 - PLDL1KEEP; 1 - PLDL1STRM) +(PrefetchCache addr mem) => (PRFM [0] addr mem) +(PrefetchCacheStreamed addr mem) => (PRFM [1] addr mem) + +// Arch-specific inlining for small or disjoint runtime.memmove +(SelectN [0] call:(CALLstatic {sym} s1:(MOVDstore _ (MOVDconst [sz]) s2:(MOVDstore _ src s3:(MOVDstore {t} _ dst mem))))) + && sz >= 0 + && isSameCall(sym, "runtime.memmove") + && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1 + && isInlinableMemmove(dst, src, sz, config) + && clobber(s1, s2, s3, call) + => (Move [sz] dst src mem) + +// Match post-lowering calls, register version. +(SelectN [0] call:(CALLstatic {sym} dst src (MOVDconst [sz]) mem)) + && sz >= 0 + && isSameCall(sym, "runtime.memmove") + && call.Uses == 1 + && isInlinableMemmove(dst, src, sz, config) + && clobber(call) + => (Move [sz] dst src mem) + +((REV|REVW) ((REV|REVW) p)) => p + +// runtime/internal/math.MulUintptr intrinsics + +(Select0 (Mul64uover x y)) => (MUL x y) +(Select1 (Mul64uover x y)) => (NotEqual (CMPconst (UMULH <typ.UInt64> x y) [0])) diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go new file mode 100644 index 0000000..f7cc47b --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go @@ -0,0 +1,794 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import "strings" + +// Notes: +// - Integer types live in the low portion of registers. Upper portions are junk. +// - Boolean types use the low-order byte of a register. 0=false, 1=true. +// Upper bytes are junk. +// - *const instructions may use a constant larger than the instruction can encode. +// In this case the assembler expands to multiple instructions and uses tmp +// register (R27). + +// Suffixes encode the bit width of various instructions. +// D (double word) = 64 bit +// W (word) = 32 bit +// H (half word) = 16 bit +// HU = 16 bit unsigned +// B (byte) = 8 bit +// BU = 8 bit unsigned +// S (single) = 32 bit float +// D (double) = 64 bit float + +// Note: registers not used in regalloc are not included in this list, +// so that regmask stays within int64 +// Be careful when hand coding regmasks. +var regNamesARM64 = []string{ + "R0", + "R1", + "R2", + "R3", + "R4", + "R5", + "R6", + "R7", + "R8", + "R9", + "R10", + "R11", + "R12", + "R13", + "R14", + "R15", + "R16", + "R17", + "R18", // platform register, not used + "R19", + "R20", + "R21", + "R22", + "R23", + "R24", + "R25", + "R26", + // R27 = REGTMP not used in regalloc + "g", // aka R28 + "R29", // frame pointer, not used + "R30", // aka REGLINK + "SP", // aka R31 + + "F0", + "F1", + "F2", + "F3", + "F4", + "F5", + "F6", + "F7", + "F8", + "F9", + "F10", + "F11", + "F12", + "F13", + "F14", + "F15", + "F16", + "F17", + "F18", + "F19", + "F20", + "F21", + "F22", + "F23", + "F24", + "F25", + "F26", + "F27", + "F28", + "F29", + "F30", + "F31", + + // If you add registers, update asyncPreempt in runtime. + + // pseudo-registers + "SB", +} + +func init() { + // Make map from reg names to reg integers. + if len(regNamesARM64) > 64 { + panic("too many registers") + } + num := map[string]int{} + for i, name := range regNamesARM64 { + num[name] = i + } + buildReg := func(s string) regMask { + m := regMask(0) + for _, r := range strings.Split(s, " ") { + if n, ok := num[r]; ok { + m |= regMask(1) << uint(n) + continue + } + panic("register " + r + " not found") + } + return m + } + + // Common individual register masks + var ( + gp = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30") + gpg = gp | buildReg("g") + gpsp = gp | buildReg("SP") + gpspg = gpg | buildReg("SP") + gpspsbg = gpspg | buildReg("SB") + fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31") + callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g + r0 = buildReg("R0") + r1 = buildReg("R1") + r2 = buildReg("R2") + r3 = buildReg("R3") + ) + // Common regInfo + var ( + gp01 = regInfo{inputs: nil, outputs: []regMask{gp}} + gp0flags1 = regInfo{inputs: []regMask{0}, outputs: []regMask{gp}} + gp11 = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}} + gp11sp = regInfo{inputs: []regMask{gpspg}, outputs: []regMask{gp}} + gp1flags = regInfo{inputs: []regMask{gpg}} + gp1flags1 = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}} + gp11flags = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp, 0}} + gp21 = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp}} + gp21nog = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}} + gp21flags = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp, 0}} + gp2flags = regInfo{inputs: []regMask{gpg, gpg}} + gp2flags1 = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}} + gp2flags1flags = regInfo{inputs: []regMask{gp, gp, 0}, outputs: []regMask{gp, 0}} + gp2load = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}} + gp31 = regInfo{inputs: []regMask{gpg, gpg, gpg}, outputs: []regMask{gp}} + gpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}} + gpload2 = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gpg, gpg}} + gpstore = regInfo{inputs: []regMask{gpspsbg, gpg}} + gpstore0 = regInfo{inputs: []regMask{gpspsbg}} + gpstore2 = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}} + gpxchg = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}} + gpcas = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}, outputs: []regMask{gp}} + fp01 = regInfo{inputs: nil, outputs: []regMask{fp}} + fp11 = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}} + fpgp = regInfo{inputs: []regMask{fp}, outputs: []regMask{gp}} + gpfp = regInfo{inputs: []regMask{gp}, outputs: []regMask{fp}} + fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}} + fp31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: []regMask{fp}} + fp2flags = regInfo{inputs: []regMask{fp, fp}} + fp1flags = regInfo{inputs: []regMask{fp}} + fpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{fp}} + fp2load = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{fp}} + fpstore = regInfo{inputs: []regMask{gpspsbg, fp}} + fpstore2 = regInfo{inputs: []regMask{gpspsbg, gpg, fp}} + readflags = regInfo{inputs: nil, outputs: []regMask{gp}} + prefreg = regInfo{inputs: []regMask{gpspsbg}} + ) + ops := []opData{ + // binary ops + {name: "ADCSflags", argLength: 3, reg: gp2flags1flags, typ: "(UInt64,Flags)", asm: "ADCS", commutative: true}, // arg0+arg1+carry, set flags. + {name: "ADCzerocarry", argLength: 1, reg: gp0flags1, typ: "UInt64", asm: "ADC"}, // ZR+ZR+carry + {name: "ADD", argLength: 2, reg: gp21, asm: "ADD", commutative: true}, // arg0 + arg1 + {name: "ADDconst", argLength: 1, reg: gp11sp, asm: "ADD", aux: "Int64"}, // arg0 + auxInt + {name: "ADDSconstflags", argLength: 1, reg: gp11flags, typ: "(UInt64,Flags)", asm: "ADDS", aux: "Int64"}, // arg0+auxint, set flags. + {name: "ADDSflags", argLength: 2, reg: gp21flags, typ: "(UInt64,Flags)", asm: "ADDS", commutative: true}, // arg0+arg1, set flags. + {name: "SUB", argLength: 2, reg: gp21, asm: "SUB"}, // arg0 - arg1 + {name: "SUBconst", argLength: 1, reg: gp11, asm: "SUB", aux: "Int64"}, // arg0 - auxInt + {name: "SBCSflags", argLength: 3, reg: gp2flags1flags, typ: "(UInt64,Flags)", asm: "SBCS"}, // arg0-(arg1+borrowing), set flags. + {name: "SUBSflags", argLength: 2, reg: gp21flags, typ: "(UInt64,Flags)", asm: "SUBS"}, // arg0 - arg1, set flags. + {name: "MUL", argLength: 2, reg: gp21, asm: "MUL", commutative: true}, // arg0 * arg1 + {name: "MULW", argLength: 2, reg: gp21, asm: "MULW", commutative: true}, // arg0 * arg1, 32-bit + {name: "MNEG", argLength: 2, reg: gp21, asm: "MNEG", commutative: true}, // -arg0 * arg1 + {name: "MNEGW", argLength: 2, reg: gp21, asm: "MNEGW", commutative: true}, // -arg0 * arg1, 32-bit + {name: "MULH", argLength: 2, reg: gp21, asm: "SMULH", commutative: true}, // (arg0 * arg1) >> 64, signed + {name: "UMULH", argLength: 2, reg: gp21, asm: "UMULH", commutative: true}, // (arg0 * arg1) >> 64, unsigned + {name: "MULL", argLength: 2, reg: gp21, asm: "SMULL", commutative: true}, // arg0 * arg1, signed, 32-bit mult results in 64-bit + {name: "UMULL", argLength: 2, reg: gp21, asm: "UMULL", commutative: true}, // arg0 * arg1, unsigned, 32-bit mult results in 64-bit + {name: "DIV", argLength: 2, reg: gp21, asm: "SDIV"}, // arg0 / arg1, signed + {name: "UDIV", argLength: 2, reg: gp21, asm: "UDIV"}, // arg0 / arg1, unsighed + {name: "DIVW", argLength: 2, reg: gp21, asm: "SDIVW"}, // arg0 / arg1, signed, 32 bit + {name: "UDIVW", argLength: 2, reg: gp21, asm: "UDIVW"}, // arg0 / arg1, unsighed, 32 bit + {name: "MOD", argLength: 2, reg: gp21, asm: "REM"}, // arg0 % arg1, signed + {name: "UMOD", argLength: 2, reg: gp21, asm: "UREM"}, // arg0 % arg1, unsigned + {name: "MODW", argLength: 2, reg: gp21, asm: "REMW"}, // arg0 % arg1, signed, 32 bit + {name: "UMODW", argLength: 2, reg: gp21, asm: "UREMW"}, // arg0 % arg1, unsigned, 32 bit + + {name: "FADDS", argLength: 2, reg: fp21, asm: "FADDS", commutative: true}, // arg0 + arg1 + {name: "FADDD", argLength: 2, reg: fp21, asm: "FADDD", commutative: true}, // arg0 + arg1 + {name: "FSUBS", argLength: 2, reg: fp21, asm: "FSUBS"}, // arg0 - arg1 + {name: "FSUBD", argLength: 2, reg: fp21, asm: "FSUBD"}, // arg0 - arg1 + {name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true}, // arg0 * arg1 + {name: "FMULD", argLength: 2, reg: fp21, asm: "FMULD", commutative: true}, // arg0 * arg1 + {name: "FNMULS", argLength: 2, reg: fp21, asm: "FNMULS", commutative: true}, // -(arg0 * arg1) + {name: "FNMULD", argLength: 2, reg: fp21, asm: "FNMULD", commutative: true}, // -(arg0 * arg1) + {name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS"}, // arg0 / arg1 + {name: "FDIVD", argLength: 2, reg: fp21, asm: "FDIVD"}, // arg0 / arg1 + + {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1 + {name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int64"}, // arg0 & auxInt + {name: "OR", argLength: 2, reg: gp21, asm: "ORR", commutative: true}, // arg0 | arg1 + {name: "ORconst", argLength: 1, reg: gp11, asm: "ORR", aux: "Int64"}, // arg0 | auxInt + {name: "XOR", argLength: 2, reg: gp21, asm: "EOR", commutative: true}, // arg0 ^ arg1 + {name: "XORconst", argLength: 1, reg: gp11, asm: "EOR", aux: "Int64"}, // arg0 ^ auxInt + {name: "BIC", argLength: 2, reg: gp21, asm: "BIC"}, // arg0 &^ arg1 + {name: "EON", argLength: 2, reg: gp21, asm: "EON"}, // arg0 ^ ^arg1 + {name: "ORN", argLength: 2, reg: gp21, asm: "ORN"}, // arg0 | ^arg1 + + // unary ops + {name: "MVN", argLength: 1, reg: gp11, asm: "MVN"}, // ^arg0 + {name: "NEG", argLength: 1, reg: gp11, asm: "NEG"}, // -arg0 + {name: "NEGSflags", argLength: 1, reg: gp11flags, typ: "(UInt64,Flags)", asm: "NEGS"}, // -arg0, set flags. + {name: "NGCzerocarry", argLength: 1, reg: gp0flags1, typ: "UInt64", asm: "NGC"}, // -1 if borrowing, 0 otherwise. + {name: "FABSD", argLength: 1, reg: fp11, asm: "FABSD"}, // abs(arg0), float64 + {name: "FNEGS", argLength: 1, reg: fp11, asm: "FNEGS"}, // -arg0, float32 + {name: "FNEGD", argLength: 1, reg: fp11, asm: "FNEGD"}, // -arg0, float64 + {name: "FSQRTD", argLength: 1, reg: fp11, asm: "FSQRTD"}, // sqrt(arg0), float64 + {name: "FSQRTS", argLength: 1, reg: fp11, asm: "FSQRTS"}, // sqrt(arg0), float32 + {name: "REV", argLength: 1, reg: gp11, asm: "REV"}, // byte reverse, 64-bit + {name: "REVW", argLength: 1, reg: gp11, asm: "REVW"}, // byte reverse, 32-bit + {name: "REV16", argLength: 1, reg: gp11, asm: "REV16"}, // byte reverse in each 16-bit halfword, 64-bit + {name: "REV16W", argLength: 1, reg: gp11, asm: "REV16W"}, // byte reverse in each 16-bit halfword, 32-bit + {name: "RBIT", argLength: 1, reg: gp11, asm: "RBIT"}, // bit reverse, 64-bit + {name: "RBITW", argLength: 1, reg: gp11, asm: "RBITW"}, // bit reverse, 32-bit + {name: "CLZ", argLength: 1, reg: gp11, asm: "CLZ"}, // count leading zero, 64-bit + {name: "CLZW", argLength: 1, reg: gp11, asm: "CLZW"}, // count leading zero, 32-bit + {name: "VCNT", argLength: 1, reg: fp11, asm: "VCNT"}, // count set bits for each 8-bit unit and store the result in each 8-bit unit + {name: "VUADDLV", argLength: 1, reg: fp11, asm: "VUADDLV"}, // unsigned sum of eight bytes in a 64-bit value, zero extended to 64-bit. + {name: "LoweredRound32F", argLength: 1, reg: fp11, resultInArg0: true, zeroWidth: true}, + {name: "LoweredRound64F", argLength: 1, reg: fp11, resultInArg0: true, zeroWidth: true}, + + // 3-operand, the addend comes first + {name: "FMADDS", argLength: 3, reg: fp31, asm: "FMADDS"}, // +arg0 + (arg1 * arg2) + {name: "FMADDD", argLength: 3, reg: fp31, asm: "FMADDD"}, // +arg0 + (arg1 * arg2) + {name: "FNMADDS", argLength: 3, reg: fp31, asm: "FNMADDS"}, // -arg0 - (arg1 * arg2) + {name: "FNMADDD", argLength: 3, reg: fp31, asm: "FNMADDD"}, // -arg0 - (arg1 * arg2) + {name: "FMSUBS", argLength: 3, reg: fp31, asm: "FMSUBS"}, // +arg0 - (arg1 * arg2) + {name: "FMSUBD", argLength: 3, reg: fp31, asm: "FMSUBD"}, // +arg0 - (arg1 * arg2) + {name: "FNMSUBS", argLength: 3, reg: fp31, asm: "FNMSUBS"}, // -arg0 + (arg1 * arg2) + {name: "FNMSUBD", argLength: 3, reg: fp31, asm: "FNMSUBD"}, // -arg0 + (arg1 * arg2) + {name: "MADD", argLength: 3, reg: gp31, asm: "MADD"}, // +arg0 + (arg1 * arg2) + {name: "MADDW", argLength: 3, reg: gp31, asm: "MADDW"}, // +arg0 + (arg1 * arg2), 32-bit + {name: "MSUB", argLength: 3, reg: gp31, asm: "MSUB"}, // +arg0 - (arg1 * arg2) + {name: "MSUBW", argLength: 3, reg: gp31, asm: "MSUBW"}, // +arg0 - (arg1 * arg2), 32-bit + + // shifts + {name: "SLL", argLength: 2, reg: gp21, asm: "LSL"}, // arg0 << arg1, shift amount is mod 64 + {name: "SLLconst", argLength: 1, reg: gp11, asm: "LSL", aux: "Int64"}, // arg0 << auxInt, auxInt should be in the range 0 to 63. + {name: "SRL", argLength: 2, reg: gp21, asm: "LSR"}, // arg0 >> arg1, unsigned, shift amount is mod 64 + {name: "SRLconst", argLength: 1, reg: gp11, asm: "LSR", aux: "Int64"}, // arg0 >> auxInt, unsigned, auxInt should be in the range 0 to 63. + {name: "SRA", argLength: 2, reg: gp21, asm: "ASR"}, // arg0 >> arg1, signed, shift amount is mod 64 + {name: "SRAconst", argLength: 1, reg: gp11, asm: "ASR", aux: "Int64"}, // arg0 >> auxInt, signed, auxInt should be in the range 0 to 63. + {name: "ROR", argLength: 2, reg: gp21, asm: "ROR"}, // arg0 right rotate by (arg1 mod 64) bits + {name: "RORW", argLength: 2, reg: gp21, asm: "RORW"}, // arg0 right rotate by (arg1 mod 32) bits + {name: "RORconst", argLength: 1, reg: gp11, asm: "ROR", aux: "Int64"}, // arg0 right rotate by auxInt bits, auxInt should be in the range 0 to 63. + {name: "RORWconst", argLength: 1, reg: gp11, asm: "RORW", aux: "Int64"}, // uint32(arg0) right rotate by auxInt bits, auxInt should be in the range 0 to 31. + {name: "EXTRconst", argLength: 2, reg: gp21, asm: "EXTR", aux: "Int64"}, // extract 64 bits from arg0:arg1 starting at lsb auxInt, auxInt should be in the range 0 to 63. + {name: "EXTRWconst", argLength: 2, reg: gp21, asm: "EXTRW", aux: "Int64"}, // extract 32 bits from arg0[31:0]:arg1[31:0] starting at lsb auxInt and zero top 32 bits, auxInt should be in the range 0 to 31. + + // comparisons + {name: "CMP", argLength: 2, reg: gp2flags, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1 + {name: "CMPconst", argLength: 1, reg: gp1flags, asm: "CMP", aux: "Int64", typ: "Flags"}, // arg0 compare to auxInt + {name: "CMPW", argLength: 2, reg: gp2flags, asm: "CMPW", typ: "Flags"}, // arg0 compare to arg1, 32 bit + {name: "CMPWconst", argLength: 1, reg: gp1flags, asm: "CMPW", aux: "Int32", typ: "Flags"}, // arg0 compare to auxInt, 32 bit + {name: "CMN", argLength: 2, reg: gp2flags, asm: "CMN", typ: "Flags", commutative: true}, // arg0 compare to -arg1, provided arg1 is not 1<<63 + {name: "CMNconst", argLength: 1, reg: gp1flags, asm: "CMN", aux: "Int64", typ: "Flags"}, // arg0 compare to -auxInt + {name: "CMNW", argLength: 2, reg: gp2flags, asm: "CMNW", typ: "Flags", commutative: true}, // arg0 compare to -arg1, 32 bit, provided arg1 is not 1<<31 + {name: "CMNWconst", argLength: 1, reg: gp1flags, asm: "CMNW", aux: "Int32", typ: "Flags"}, // arg0 compare to -auxInt, 32 bit + {name: "TST", argLength: 2, reg: gp2flags, asm: "TST", typ: "Flags", commutative: true}, // arg0 & arg1 compare to 0 + {name: "TSTconst", argLength: 1, reg: gp1flags, asm: "TST", aux: "Int64", typ: "Flags"}, // arg0 & auxInt compare to 0 + {name: "TSTW", argLength: 2, reg: gp2flags, asm: "TSTW", typ: "Flags", commutative: true}, // arg0 & arg1 compare to 0, 32 bit + {name: "TSTWconst", argLength: 1, reg: gp1flags, asm: "TSTW", aux: "Int32", typ: "Flags"}, // arg0 & auxInt compare to 0, 32 bit + {name: "FCMPS", argLength: 2, reg: fp2flags, asm: "FCMPS", typ: "Flags"}, // arg0 compare to arg1, float32 + {name: "FCMPD", argLength: 2, reg: fp2flags, asm: "FCMPD", typ: "Flags"}, // arg0 compare to arg1, float64 + {name: "FCMPS0", argLength: 1, reg: fp1flags, asm: "FCMPS", typ: "Flags"}, // arg0 compare to 0, float32 + {name: "FCMPD0", argLength: 1, reg: fp1flags, asm: "FCMPD", typ: "Flags"}, // arg0 compare to 0, float64 + + // shifted ops + {name: "MVNshiftLL", argLength: 1, reg: gp11, asm: "MVN", aux: "Int64"}, // ^(arg0<<auxInt), auxInt should be in the range 0 to 63. + {name: "MVNshiftRL", argLength: 1, reg: gp11, asm: "MVN", aux: "Int64"}, // ^(arg0>>auxInt), unsigned shift, auxInt should be in the range 0 to 63. + {name: "MVNshiftRA", argLength: 1, reg: gp11, asm: "MVN", aux: "Int64"}, // ^(arg0>>auxInt), signed shift, auxInt should be in the range 0 to 63. + {name: "MVNshiftRO", argLength: 1, reg: gp11, asm: "MVN", aux: "Int64"}, // ^(arg0 ROR auxInt), signed shift, auxInt should be in the range 0 to 63. + {name: "NEGshiftLL", argLength: 1, reg: gp11, asm: "NEG", aux: "Int64"}, // -(arg0<<auxInt), auxInt should be in the range 0 to 63. + {name: "NEGshiftRL", argLength: 1, reg: gp11, asm: "NEG", aux: "Int64"}, // -(arg0>>auxInt), unsigned shift, auxInt should be in the range 0 to 63. + {name: "NEGshiftRA", argLength: 1, reg: gp11, asm: "NEG", aux: "Int64"}, // -(arg0>>auxInt), signed shift, auxInt should be in the range 0 to 63. + {name: "ADDshiftLL", argLength: 2, reg: gp21, asm: "ADD", aux: "Int64"}, // arg0 + arg1<<auxInt, auxInt should be in the range 0 to 63. + {name: "ADDshiftRL", argLength: 2, reg: gp21, asm: "ADD", aux: "Int64"}, // arg0 + arg1>>auxInt, unsigned shift, auxInt should be in the range 0 to 63. + {name: "ADDshiftRA", argLength: 2, reg: gp21, asm: "ADD", aux: "Int64"}, // arg0 + arg1>>auxInt, signed shift, auxInt should be in the range 0 to 63. + {name: "SUBshiftLL", argLength: 2, reg: gp21, asm: "SUB", aux: "Int64"}, // arg0 - arg1<<auxInt, auxInt should be in the range 0 to 63. + {name: "SUBshiftRL", argLength: 2, reg: gp21, asm: "SUB", aux: "Int64"}, // arg0 - arg1>>auxInt, unsigned shift, auxInt should be in the range 0 to 63. + {name: "SUBshiftRA", argLength: 2, reg: gp21, asm: "SUB", aux: "Int64"}, // arg0 - arg1>>auxInt, signed shift, auxInt should be in the range 0 to 63. + {name: "ANDshiftLL", argLength: 2, reg: gp21, asm: "AND", aux: "Int64"}, // arg0 & (arg1<<auxInt), auxInt should be in the range 0 to 63. + {name: "ANDshiftRL", argLength: 2, reg: gp21, asm: "AND", aux: "Int64"}, // arg0 & (arg1>>auxInt), unsigned shift, auxInt should be in the range 0 to 63. + {name: "ANDshiftRA", argLength: 2, reg: gp21, asm: "AND", aux: "Int64"}, // arg0 & (arg1>>auxInt), signed shift, auxInt should be in the range 0 to 63. + {name: "ANDshiftRO", argLength: 2, reg: gp21, asm: "AND", aux: "Int64"}, // arg0 & (arg1 ROR auxInt), signed shift, auxInt should be in the range 0 to 63. + {name: "ORshiftLL", argLength: 2, reg: gp21, asm: "ORR", aux: "Int64"}, // arg0 | arg1<<auxInt, auxInt should be in the range 0 to 63. + {name: "ORshiftRL", argLength: 2, reg: gp21, asm: "ORR", aux: "Int64"}, // arg0 | arg1>>auxInt, unsigned shift, auxInt should be in the range 0 to 63. + {name: "ORshiftRA", argLength: 2, reg: gp21, asm: "ORR", aux: "Int64"}, // arg0 | arg1>>auxInt, signed shift, auxInt should be in the range 0 to 63. + {name: "ORshiftRO", argLength: 2, reg: gp21, asm: "ORR", aux: "Int64"}, // arg0 | arg1 ROR auxInt, signed shift, auxInt should be in the range 0 to 63. + {name: "XORshiftLL", argLength: 2, reg: gp21, asm: "EOR", aux: "Int64"}, // arg0 ^ arg1<<auxInt, auxInt should be in the range 0 to 63. + {name: "XORshiftRL", argLength: 2, reg: gp21, asm: "EOR", aux: "Int64"}, // arg0 ^ arg1>>auxInt, unsigned shift, auxInt should be in the range 0 to 63. + {name: "XORshiftRA", argLength: 2, reg: gp21, asm: "EOR", aux: "Int64"}, // arg0 ^ arg1>>auxInt, signed shift, auxInt should be in the range 0 to 63. + {name: "XORshiftRO", argLength: 2, reg: gp21, asm: "EOR", aux: "Int64"}, // arg0 ^ arg1 ROR auxInt, signed shift, auxInt should be in the range 0 to 63. + {name: "BICshiftLL", argLength: 2, reg: gp21, asm: "BIC", aux: "Int64"}, // arg0 &^ (arg1<<auxInt), auxInt should be in the range 0 to 63. + {name: "BICshiftRL", argLength: 2, reg: gp21, asm: "BIC", aux: "Int64"}, // arg0 &^ (arg1>>auxInt), unsigned shift, auxInt should be in the range 0 to 63. + {name: "BICshiftRA", argLength: 2, reg: gp21, asm: "BIC", aux: "Int64"}, // arg0 &^ (arg1>>auxInt), signed shift, auxInt should be in the range 0 to 63. + {name: "BICshiftRO", argLength: 2, reg: gp21, asm: "BIC", aux: "Int64"}, // arg0 &^ (arg1 ROR auxInt), signed shift, auxInt should be in the range 0 to 63. + {name: "EONshiftLL", argLength: 2, reg: gp21, asm: "EON", aux: "Int64"}, // arg0 ^ ^(arg1<<auxInt), auxInt should be in the range 0 to 63. + {name: "EONshiftRL", argLength: 2, reg: gp21, asm: "EON", aux: "Int64"}, // arg0 ^ ^(arg1>>auxInt), unsigned shift, auxInt should be in the range 0 to 63. + {name: "EONshiftRA", argLength: 2, reg: gp21, asm: "EON", aux: "Int64"}, // arg0 ^ ^(arg1>>auxInt), signed shift, auxInt should be in the range 0 to 63. + {name: "EONshiftRO", argLength: 2, reg: gp21, asm: "EON", aux: "Int64"}, // arg0 ^ ^(arg1 ROR auxInt), signed shift, auxInt should be in the range 0 to 63. + {name: "ORNshiftLL", argLength: 2, reg: gp21, asm: "ORN", aux: "Int64"}, // arg0 | ^(arg1<<auxInt), auxInt should be in the range 0 to 63. + {name: "ORNshiftRL", argLength: 2, reg: gp21, asm: "ORN", aux: "Int64"}, // arg0 | ^(arg1>>auxInt), unsigned shift, auxInt should be in the range 0 to 63. + {name: "ORNshiftRA", argLength: 2, reg: gp21, asm: "ORN", aux: "Int64"}, // arg0 | ^(arg1>>auxInt), signed shift, auxInt should be in the range 0 to 63. + {name: "ORNshiftRO", argLength: 2, reg: gp21, asm: "ORN", aux: "Int64"}, // arg0 | ^(arg1 ROR auxInt), signed shift, auxInt should be in the range 0 to 63. + {name: "CMPshiftLL", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int64", typ: "Flags"}, // arg0 compare to arg1<<auxInt, auxInt should be in the range 0 to 63. + {name: "CMPshiftRL", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int64", typ: "Flags"}, // arg0 compare to arg1>>auxInt, unsigned shift, auxInt should be in the range 0 to 63. + {name: "CMPshiftRA", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int64", typ: "Flags"}, // arg0 compare to arg1>>auxInt, signed shift, auxInt should be in the range 0 to 63. + {name: "CMNshiftLL", argLength: 2, reg: gp2flags, asm: "CMN", aux: "Int64", typ: "Flags"}, // (arg0 + arg1<<auxInt) compare to 0, auxInt should be in the range 0 to 63. + {name: "CMNshiftRL", argLength: 2, reg: gp2flags, asm: "CMN", aux: "Int64", typ: "Flags"}, // (arg0 + arg1>>auxInt) compare to 0, unsigned shift, auxInt should be in the range 0 to 63. + {name: "CMNshiftRA", argLength: 2, reg: gp2flags, asm: "CMN", aux: "Int64", typ: "Flags"}, // (arg0 + arg1>>auxInt) compare to 0, signed shift, auxInt should be in the range 0 to 63. + {name: "TSTshiftLL", argLength: 2, reg: gp2flags, asm: "TST", aux: "Int64", typ: "Flags"}, // (arg0 & arg1<<auxInt) compare to 0, auxInt should be in the range 0 to 63. + {name: "TSTshiftRL", argLength: 2, reg: gp2flags, asm: "TST", aux: "Int64", typ: "Flags"}, // (arg0 & arg1>>auxInt) compare to 0, unsigned shift, auxInt should be in the range 0 to 63. + {name: "TSTshiftRA", argLength: 2, reg: gp2flags, asm: "TST", aux: "Int64", typ: "Flags"}, // (arg0 & arg1>>auxInt) compare to 0, signed shift, auxInt should be in the range 0 to 63. + {name: "TSTshiftRO", argLength: 2, reg: gp2flags, asm: "TST", aux: "Int64", typ: "Flags"}, // (arg0 & arg1 ROR auxInt) compare to 0, signed shift, auxInt should be in the range 0 to 63. + + // bitfield ops + // for all bitfield ops lsb is auxInt>>8, width is auxInt&0xff + // insert low width bits of arg1 into the result starting at bit lsb, copy other bits from arg0 + {name: "BFI", argLength: 2, reg: gp21nog, asm: "BFI", aux: "ARM64BitField", resultInArg0: true}, + // extract width bits of arg1 starting at bit lsb and insert at low end of result, copy other bits from arg0 + {name: "BFXIL", argLength: 2, reg: gp21nog, asm: "BFXIL", aux: "ARM64BitField", resultInArg0: true}, + // insert low width bits of arg0 into the result starting at bit lsb, bits to the left of the inserted bit field are set to the high/sign bit of the inserted bit field, bits to the right are zeroed + {name: "SBFIZ", argLength: 1, reg: gp11, asm: "SBFIZ", aux: "ARM64BitField"}, + // extract width bits of arg0 starting at bit lsb and insert at low end of result, remaining high bits are set to the high/sign bit of the extracted bitfield + {name: "SBFX", argLength: 1, reg: gp11, asm: "SBFX", aux: "ARM64BitField"}, + // insert low width bits of arg0 into the result starting at bit lsb, bits to the left and right of the inserted bit field are zeroed + {name: "UBFIZ", argLength: 1, reg: gp11, asm: "UBFIZ", aux: "ARM64BitField"}, + // extract width bits of arg0 starting at bit lsb and insert at low end of result, remaining high bits are zeroed + {name: "UBFX", argLength: 1, reg: gp11, asm: "UBFX", aux: "ARM64BitField"}, + + // moves + {name: "MOVDconst", argLength: 0, reg: gp01, aux: "Int64", asm: "MOVD", typ: "UInt64", rematerializeable: true}, // 64 bits from auxint + {name: "FMOVSconst", argLength: 0, reg: fp01, aux: "Float64", asm: "FMOVS", typ: "Float32", rematerializeable: true}, // auxint as 64-bit float, convert to 32-bit float + {name: "FMOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "FMOVD", typ: "Float64", rematerializeable: true}, // auxint as 64-bit float + + {name: "MOVDaddr", argLength: 1, reg: regInfo{inputs: []regMask{buildReg("SP") | buildReg("SB")}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVD", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB + + {name: "MOVBload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVB", typ: "Int8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVBUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVBU", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVHload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVH", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVHUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVHU", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVWload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVW", typ: "Int32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVWUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVWU", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVDload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVD", typ: "UInt64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "LDP", argLength: 2, reg: gpload2, aux: "SymOff", asm: "LDP", typ: "(UInt64,UInt64)", faultOnNilArg0: true, symEffect: "Read"}, // load from ptr = arg0 + auxInt + aux, returns the tuple <*(*uint64)ptr, *(*uint64)(ptr+8)>. arg1=mem. + {name: "FMOVSload", argLength: 2, reg: fpload, aux: "SymOff", asm: "FMOVS", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "FMOVDload", argLength: 2, reg: fpload, aux: "SymOff", asm: "FMOVD", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + + // register indexed load + {name: "MOVDloadidx", argLength: 3, reg: gp2load, asm: "MOVD", typ: "UInt64"}, // load 64-bit dword from arg0 + arg1, arg2 = mem. + {name: "MOVWloadidx", argLength: 3, reg: gp2load, asm: "MOVW", typ: "Int32"}, // load 32-bit word from arg0 + arg1, sign-extended to 64-bit, arg2=mem. + {name: "MOVWUloadidx", argLength: 3, reg: gp2load, asm: "MOVWU", typ: "UInt32"}, // load 32-bit word from arg0 + arg1, zero-extended to 64-bit, arg2=mem. + {name: "MOVHloadidx", argLength: 3, reg: gp2load, asm: "MOVH", typ: "Int16"}, // load 16-bit word from arg0 + arg1, sign-extended to 64-bit, arg2=mem. + {name: "MOVHUloadidx", argLength: 3, reg: gp2load, asm: "MOVHU", typ: "UInt16"}, // load 16-bit word from arg0 + arg1, zero-extended to 64-bit, arg2=mem. + {name: "MOVBloadidx", argLength: 3, reg: gp2load, asm: "MOVB", typ: "Int8"}, // load 8-bit word from arg0 + arg1, sign-extended to 64-bit, arg2=mem. + {name: "MOVBUloadidx", argLength: 3, reg: gp2load, asm: "MOVBU", typ: "UInt8"}, // load 8-bit word from arg0 + arg1, zero-extended to 64-bit, arg2=mem. + {name: "FMOVSloadidx", argLength: 3, reg: fp2load, asm: "FMOVS", typ: "Float32"}, // load 32-bit float from arg0 + arg1, arg2=mem. + {name: "FMOVDloadidx", argLength: 3, reg: fp2load, asm: "FMOVD", typ: "Float64"}, // load 64-bit float from arg0 + arg1, arg2=mem. + + // shifted register indexed load + {name: "MOVHloadidx2", argLength: 3, reg: gp2load, asm: "MOVH", typ: "Int16"}, // load 16-bit half-word from arg0 + arg1*2, sign-extended to 64-bit, arg2=mem. + {name: "MOVHUloadidx2", argLength: 3, reg: gp2load, asm: "MOVHU", typ: "UInt16"}, // load 16-bit half-word from arg0 + arg1*2, zero-extended to 64-bit, arg2=mem. + {name: "MOVWloadidx4", argLength: 3, reg: gp2load, asm: "MOVW", typ: "Int32"}, // load 32-bit word from arg0 + arg1*4, sign-extended to 64-bit, arg2=mem. + {name: "MOVWUloadidx4", argLength: 3, reg: gp2load, asm: "MOVWU", typ: "UInt32"}, // load 32-bit word from arg0 + arg1*4, zero-extended to 64-bit, arg2=mem. + {name: "MOVDloadidx8", argLength: 3, reg: gp2load, asm: "MOVD", typ: "UInt64"}, // load 64-bit double-word from arg0 + arg1*8, arg2 = mem. + {name: "FMOVSloadidx4", argLength: 3, reg: fp2load, asm: "FMOVS", typ: "Float32"}, // load 32-bit float from arg0 + arg1*4, arg2 = mem. + {name: "FMOVDloadidx8", argLength: 3, reg: fp2load, asm: "FMOVD", typ: "Float64"}, // load 64-bit float from arg0 + arg1*8, arg2 = mem. + + {name: "MOVBstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of arg1 to arg0 + auxInt + aux. arg2=mem. + {name: "MOVHstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of arg1 to arg0 + auxInt + aux. arg2=mem. + {name: "MOVWstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem. + {name: "MOVDstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem. + {name: "STP", argLength: 4, reg: gpstore2, aux: "SymOff", asm: "STP", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 16 bytes of arg1 and arg2 to arg0 + auxInt + aux. arg3=mem. + {name: "FMOVSstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "FMOVS", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem. + {name: "FMOVDstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "FMOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem. + + // register indexed store + {name: "MOVBstoreidx", argLength: 4, reg: gpstore2, asm: "MOVB", typ: "Mem"}, // store 1 byte of arg2 to arg0 + arg1, arg3 = mem. + {name: "MOVHstoreidx", argLength: 4, reg: gpstore2, asm: "MOVH", typ: "Mem"}, // store 2 bytes of arg2 to arg0 + arg1, arg3 = mem. + {name: "MOVWstoreidx", argLength: 4, reg: gpstore2, asm: "MOVW", typ: "Mem"}, // store 4 bytes of arg2 to arg0 + arg1, arg3 = mem. + {name: "MOVDstoreidx", argLength: 4, reg: gpstore2, asm: "MOVD", typ: "Mem"}, // store 8 bytes of arg2 to arg0 + arg1, arg3 = mem. + {name: "FMOVSstoreidx", argLength: 4, reg: fpstore2, asm: "FMOVS", typ: "Mem"}, // store 32-bit float of arg2 to arg0 + arg1, arg3=mem. + {name: "FMOVDstoreidx", argLength: 4, reg: fpstore2, asm: "FMOVD", typ: "Mem"}, // store 64-bit float of arg2 to arg0 + arg1, arg3=mem. + + // shifted register indexed store + {name: "MOVHstoreidx2", argLength: 4, reg: gpstore2, asm: "MOVH", typ: "Mem"}, // store 2 bytes of arg2 to arg0 + arg1*2, arg3 = mem. + {name: "MOVWstoreidx4", argLength: 4, reg: gpstore2, asm: "MOVW", typ: "Mem"}, // store 4 bytes of arg2 to arg0 + arg1*4, arg3 = mem. + {name: "MOVDstoreidx8", argLength: 4, reg: gpstore2, asm: "MOVD", typ: "Mem"}, // store 8 bytes of arg2 to arg0 + arg1*8, arg3 = mem. + {name: "FMOVSstoreidx4", argLength: 4, reg: fpstore2, asm: "FMOVS", typ: "Mem"}, // store 32-bit float of arg2 to arg0 + arg1*4, arg3=mem. + {name: "FMOVDstoreidx8", argLength: 4, reg: fpstore2, asm: "FMOVD", typ: "Mem"}, // store 64-bit float of arg2 to arg0 + arg1*8, arg3=mem. + + {name: "MOVBstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of zero to arg0 + auxInt + aux. arg1=mem. + {name: "MOVHstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of zero to arg0 + auxInt + aux. arg1=mem. + {name: "MOVWstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of zero to arg0 + auxInt + aux. arg1=mem. + {name: "MOVDstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of zero to arg0 + auxInt + aux. arg1=mem. + {name: "MOVQstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "STP", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 16 bytes of zero to arg0 + auxInt + aux. arg1=mem. + + // register indexed store zero + {name: "MOVBstorezeroidx", argLength: 3, reg: gpstore, asm: "MOVB", typ: "Mem"}, // store 1 byte of zero to arg0 + arg1, arg2 = mem. + {name: "MOVHstorezeroidx", argLength: 3, reg: gpstore, asm: "MOVH", typ: "Mem"}, // store 2 bytes of zero to arg0 + arg1, arg2 = mem. + {name: "MOVWstorezeroidx", argLength: 3, reg: gpstore, asm: "MOVW", typ: "Mem"}, // store 4 bytes of zero to arg0 + arg1, arg2 = mem. + {name: "MOVDstorezeroidx", argLength: 3, reg: gpstore, asm: "MOVD", typ: "Mem"}, // store 8 bytes of zero to arg0 + arg1, arg2 = mem. + + // shifted register indexed store zero + {name: "MOVHstorezeroidx2", argLength: 3, reg: gpstore, asm: "MOVH", typ: "Mem"}, // store 2 bytes of zero to arg0 + arg1*2, arg2 = mem. + {name: "MOVWstorezeroidx4", argLength: 3, reg: gpstore, asm: "MOVW", typ: "Mem"}, // store 4 bytes of zero to arg0 + arg1*4, arg2 = mem. + {name: "MOVDstorezeroidx8", argLength: 3, reg: gpstore, asm: "MOVD", typ: "Mem"}, // store 8 bytes of zero to arg0 + arg1*8, arg2 = mem. + + {name: "FMOVDgpfp", argLength: 1, reg: gpfp, asm: "FMOVD"}, // move int64 to float64 (no conversion) + {name: "FMOVDfpgp", argLength: 1, reg: fpgp, asm: "FMOVD"}, // move float64 to int64 (no conversion) + {name: "FMOVSgpfp", argLength: 1, reg: gpfp, asm: "FMOVS"}, // move 32bits from int to float reg (no conversion) + {name: "FMOVSfpgp", argLength: 1, reg: fpgp, asm: "FMOVS"}, // move 32bits from float to int reg, zero extend (no conversion) + + // conversions + {name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB"}, // move from arg0, sign-extended from byte + {name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, unsign-extended from byte + {name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVH"}, // move from arg0, sign-extended from half + {name: "MOVHUreg", argLength: 1, reg: gp11, asm: "MOVHU"}, // move from arg0, unsign-extended from half + {name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW"}, // move from arg0, sign-extended from word + {name: "MOVWUreg", argLength: 1, reg: gp11, asm: "MOVWU"}, // move from arg0, unsign-extended from word + {name: "MOVDreg", argLength: 1, reg: gp11, asm: "MOVD"}, // move from arg0 + + {name: "MOVDnop", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}, resultInArg0: true}, // nop, return arg0 in same register + + {name: "SCVTFWS", argLength: 1, reg: gpfp, asm: "SCVTFWS"}, // int32 -> float32 + {name: "SCVTFWD", argLength: 1, reg: gpfp, asm: "SCVTFWD"}, // int32 -> float64 + {name: "UCVTFWS", argLength: 1, reg: gpfp, asm: "UCVTFWS"}, // uint32 -> float32 + {name: "UCVTFWD", argLength: 1, reg: gpfp, asm: "UCVTFWD"}, // uint32 -> float64 + {name: "SCVTFS", argLength: 1, reg: gpfp, asm: "SCVTFS"}, // int64 -> float32 + {name: "SCVTFD", argLength: 1, reg: gpfp, asm: "SCVTFD"}, // int64 -> float64 + {name: "UCVTFS", argLength: 1, reg: gpfp, asm: "UCVTFS"}, // uint64 -> float32 + {name: "UCVTFD", argLength: 1, reg: gpfp, asm: "UCVTFD"}, // uint64 -> float64 + {name: "FCVTZSSW", argLength: 1, reg: fpgp, asm: "FCVTZSSW"}, // float32 -> int32 + {name: "FCVTZSDW", argLength: 1, reg: fpgp, asm: "FCVTZSDW"}, // float64 -> int32 + {name: "FCVTZUSW", argLength: 1, reg: fpgp, asm: "FCVTZUSW"}, // float32 -> uint32 + {name: "FCVTZUDW", argLength: 1, reg: fpgp, asm: "FCVTZUDW"}, // float64 -> uint32 + {name: "FCVTZSS", argLength: 1, reg: fpgp, asm: "FCVTZSS"}, // float32 -> int64 + {name: "FCVTZSD", argLength: 1, reg: fpgp, asm: "FCVTZSD"}, // float64 -> int64 + {name: "FCVTZUS", argLength: 1, reg: fpgp, asm: "FCVTZUS"}, // float32 -> uint64 + {name: "FCVTZUD", argLength: 1, reg: fpgp, asm: "FCVTZUD"}, // float64 -> uint64 + {name: "FCVTSD", argLength: 1, reg: fp11, asm: "FCVTSD"}, // float32 -> float64 + {name: "FCVTDS", argLength: 1, reg: fp11, asm: "FCVTDS"}, // float64 -> float32 + + // floating-point round to integral + {name: "FRINTAD", argLength: 1, reg: fp11, asm: "FRINTAD"}, + {name: "FRINTMD", argLength: 1, reg: fp11, asm: "FRINTMD"}, + {name: "FRINTND", argLength: 1, reg: fp11, asm: "FRINTND"}, + {name: "FRINTPD", argLength: 1, reg: fp11, asm: "FRINTPD"}, + {name: "FRINTZD", argLength: 1, reg: fp11, asm: "FRINTZD"}, + + // conditional instructions; auxint is + // one of the arm64 comparison pseudo-ops (LessThan, LessThanU, etc.) + {name: "CSEL", argLength: 3, reg: gp2flags1, asm: "CSEL", aux: "CCop"}, // auxint(flags) ? arg0 : arg1 + {name: "CSEL0", argLength: 2, reg: gp1flags1, asm: "CSEL", aux: "CCop"}, // auxint(flags) ? arg0 : 0 + {name: "CSINC", argLength: 3, reg: gp2flags1, asm: "CSINC", aux: "CCop"}, // auxint(flags) ? arg0 : arg1 + 1 + {name: "CSINV", argLength: 3, reg: gp2flags1, asm: "CSINV", aux: "CCop"}, // auxint(flags) ? arg0 : ^arg1 + {name: "CSNEG", argLength: 3, reg: gp2flags1, asm: "CSNEG", aux: "CCop"}, // auxint(flags) ? arg0 : -arg1 + {name: "CSETM", argLength: 1, reg: readflags, asm: "CSETM", aux: "CCop"}, // auxint(flags) ? -1 : 0 + + // function calls + {name: "CALLstatic", argLength: -1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). last arg=mem, auxint=argsize, returns mem + {name: "CALLtail", argLength: -1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). last arg=mem, auxint=argsize, returns mem + {name: "CALLclosure", argLength: -1, reg: regInfo{inputs: []regMask{gpsp, buildReg("R26"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, last arg=mem, auxint=argsize, returns mem + {name: "CALLinter", argLength: -1, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, last arg=mem, auxint=argsize, returns mem + + // pseudo-ops + {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpg}}, nilCheck: true, faultOnNilArg0: true}, // panic if arg0 is nil. arg1=mem. + + {name: "Equal", argLength: 1, reg: readflags}, // bool, true flags encode x==y false otherwise. + {name: "NotEqual", argLength: 1, reg: readflags}, // bool, true flags encode x!=y false otherwise. + {name: "LessThan", argLength: 1, reg: readflags}, // bool, true flags encode signed x<y false otherwise. + {name: "LessEqual", argLength: 1, reg: readflags}, // bool, true flags encode signed x<=y false otherwise. + {name: "GreaterThan", argLength: 1, reg: readflags}, // bool, true flags encode signed x>y false otherwise. + {name: "GreaterEqual", argLength: 1, reg: readflags}, // bool, true flags encode signed x>=y false otherwise. + {name: "LessThanU", argLength: 1, reg: readflags}, // bool, true flags encode unsigned x<y false otherwise. + {name: "LessEqualU", argLength: 1, reg: readflags}, // bool, true flags encode unsigned x<=y false otherwise. + {name: "GreaterThanU", argLength: 1, reg: readflags}, // bool, true flags encode unsigned x>y false otherwise. + {name: "GreaterEqualU", argLength: 1, reg: readflags}, // bool, true flags encode unsigned x>=y false otherwise. + {name: "LessThanF", argLength: 1, reg: readflags}, // bool, true flags encode floating-point x<y false otherwise. + {name: "LessEqualF", argLength: 1, reg: readflags}, // bool, true flags encode floating-point x<=y false otherwise. + {name: "GreaterThanF", argLength: 1, reg: readflags}, // bool, true flags encode floating-point x>y false otherwise. + {name: "GreaterEqualF", argLength: 1, reg: readflags}, // bool, true flags encode floating-point x>=y false otherwise. + {name: "NotLessThanF", argLength: 1, reg: readflags}, // bool, true flags encode floating-point x>=y || x is unordered with y, false otherwise. + {name: "NotLessEqualF", argLength: 1, reg: readflags}, // bool, true flags encode floating-point x>y || x is unordered with y, false otherwise. + {name: "NotGreaterThanF", argLength: 1, reg: readflags}, // bool, true flags encode floating-point x<=y || x is unordered with y, false otherwise. + {name: "NotGreaterEqualF", argLength: 1, reg: readflags}, // bool, true flags encode floating-point x<y || x is unordered with y, false otherwise. + // duffzero + // arg0 = address of memory to zero + // arg1 = mem + // auxint = offset into duffzero code to start executing + // returns mem + // R20 changed as side effect + // R16 and R17 may be clobbered by linker trampoline. + { + name: "DUFFZERO", + aux: "Int64", + argLength: 2, + reg: regInfo{ + inputs: []regMask{buildReg("R20")}, + clobbers: buildReg("R16 R17 R20 R30"), + }, + faultOnNilArg0: true, + unsafePoint: true, // FP maintenance around DUFFZERO can be clobbered by interrupts + }, + + // large zeroing + // arg0 = address of memory to zero (in R16 aka arm64.REGRT1, changed as side effect) + // arg1 = address of the last 16-byte unit to zero + // arg2 = mem + // returns mem + // STP.P (ZR,ZR), 16(R16) + // CMP Rarg1, R16 + // BLE -2(PC) + // Note: the-end-of-the-memory may be not a valid pointer. it's a problem if it is spilled. + // the-end-of-the-memory - 16 is with the area to zero, ok to spill. + { + name: "LoweredZero", + argLength: 3, + reg: regInfo{ + inputs: []regMask{buildReg("R16"), gp}, + clobbers: buildReg("R16"), + }, + clobberFlags: true, + faultOnNilArg0: true, + }, + + // duffcopy + // arg0 = address of dst memory (in R21, changed as side effect) + // arg1 = address of src memory (in R20, changed as side effect) + // arg2 = mem + // auxint = offset into duffcopy code to start executing + // returns mem + // R20, R21 changed as side effect + // R16 and R17 may be clobbered by linker trampoline. + { + name: "DUFFCOPY", + aux: "Int64", + argLength: 3, + reg: regInfo{ + inputs: []regMask{buildReg("R21"), buildReg("R20")}, + clobbers: buildReg("R16 R17 R20 R21 R26 R30"), + }, + faultOnNilArg0: true, + faultOnNilArg1: true, + unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts + }, + + // large move + // arg0 = address of dst memory (in R17 aka arm64.REGRT2, changed as side effect) + // arg1 = address of src memory (in R16 aka arm64.REGRT1, changed as side effect) + // arg2 = address of the last element of src + // arg3 = mem + // returns mem + // LDP.P 16(R16), (R25, Rtmp) + // STP.P (R25, Rtmp), 16(R17) + // CMP Rarg2, R16 + // BLE -3(PC) + // Note: the-end-of-src may be not a valid pointer. it's a problem if it is spilled. + // the-end-of-src - 16 is within the area to copy, ok to spill. + { + name: "LoweredMove", + argLength: 4, + reg: regInfo{ + inputs: []regMask{buildReg("R17"), buildReg("R16"), gp &^ buildReg("R25")}, + clobbers: buildReg("R16 R17 R25"), + }, + clobberFlags: true, + faultOnNilArg0: true, + faultOnNilArg1: true, + }, + + // Scheduler ensures LoweredGetClosurePtr occurs only in entry block, + // and sorts it to the very beginning of the block to prevent other + // use of R26 (arm64.REGCTXT, the closure pointer) + {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("R26")}}, zeroWidth: true}, + + // LoweredGetCallerSP returns the SP of the caller of the current function. + {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true}, + + // LoweredGetCallerPC evaluates to the PC to which its "caller" will return. + // I.e., if f calls g "calls" getcallerpc, + // the result should be the PC within f that g will return to. + // See runtime/stubs.go for a more detailed discussion. + {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true}, + + // Constant flag value. + // Note: there's an "unordered" outcome for floating-point + // comparisons, but we don't use such a beast yet. + // This op is for temporary use by rewrite rules. It + // cannot appear in the generated assembly. + {name: "FlagConstant", aux: "FlagConstant"}, + + // (InvertFlags (CMP a b)) == (CMP b a) + // InvertFlags is a pseudo-op which can't appear in assembly output. + {name: "InvertFlags", argLength: 1}, // reverse direction of arg0 + + // atomic loads. + // load from arg0. arg1=mem. auxint must be zero. + // returns <value,memory> so they can be properly ordered with other loads. + {name: "LDAR", argLength: 2, reg: gpload, asm: "LDAR", faultOnNilArg0: true}, + {name: "LDARB", argLength: 2, reg: gpload, asm: "LDARB", faultOnNilArg0: true}, + {name: "LDARW", argLength: 2, reg: gpload, asm: "LDARW", faultOnNilArg0: true}, + + // atomic stores. + // store arg1 to arg0. arg2=mem. returns memory. auxint must be zero. + {name: "STLRB", argLength: 3, reg: gpstore, asm: "STLRB", faultOnNilArg0: true, hasSideEffects: true}, + {name: "STLR", argLength: 3, reg: gpstore, asm: "STLR", faultOnNilArg0: true, hasSideEffects: true}, + {name: "STLRW", argLength: 3, reg: gpstore, asm: "STLRW", faultOnNilArg0: true, hasSideEffects: true}, + + // atomic exchange. + // store arg1 to arg0. arg2=mem. returns <old content of *arg0, memory>. auxint must be zero. + // LDAXR (Rarg0), Rout + // STLXR Rarg1, (Rarg0), Rtmp + // CBNZ Rtmp, -2(PC) + {name: "LoweredAtomicExchange64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + {name: "LoweredAtomicExchange32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + + // atomic exchange variant. + // store arg1 to arg0. arg2=mem. returns <old content of *arg0, memory>. auxint must be zero. + // SWPALD Rarg1, (Rarg0), Rout + {name: "LoweredAtomicExchange64Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true}, + {name: "LoweredAtomicExchange32Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true}, + + // atomic add. + // *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero. + // LDAXR (Rarg0), Rout + // ADD Rarg1, Rout + // STLXR Rout, (Rarg0), Rtmp + // CBNZ Rtmp, -3(PC) + {name: "LoweredAtomicAdd64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + {name: "LoweredAtomicAdd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + + // atomic add variant. + // *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero. + // LDADDAL (Rarg0), Rarg1, Rout + // ADD Rarg1, Rout + {name: "LoweredAtomicAdd64Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true}, + {name: "LoweredAtomicAdd32Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true}, + + // atomic compare and swap. + // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory. auxint must be zero. + // if *arg0 == arg1 { + // *arg0 = arg2 + // return (true, memory) + // } else { + // return (false, memory) + // } + // LDAXR (Rarg0), Rtmp + // CMP Rarg1, Rtmp + // BNE 3(PC) + // STLXR Rarg2, (Rarg0), Rtmp + // CBNZ Rtmp, -4(PC) + // CSET EQ, Rout + {name: "LoweredAtomicCas64", argLength: 4, reg: gpcas, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + {name: "LoweredAtomicCas32", argLength: 4, reg: gpcas, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + + // atomic compare and swap variant. + // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory. auxint must be zero. + // if *arg0 == arg1 { + // *arg0 = arg2 + // return (true, memory) + // } else { + // return (false, memory) + // } + // MOV Rarg1, Rtmp + // CASAL Rtmp, (Rarg0), Rarg2 + // CMP Rarg1, Rtmp + // CSET EQ, Rout + {name: "LoweredAtomicCas64Variant", argLength: 4, reg: gpcas, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + {name: "LoweredAtomicCas32Variant", argLength: 4, reg: gpcas, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + + // atomic and/or. + // *arg0 &= (|=) arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero. + // LDAXR (Rarg0), Rout + // AND/OR Rarg1, Rout + // STLXR Rout, (Rarg0), Rtmp + // CBNZ Rtmp, -3(PC) + {name: "LoweredAtomicAnd8", argLength: 3, reg: gpxchg, resultNotInArgs: true, asm: "AND", typ: "(UInt8,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + {name: "LoweredAtomicAnd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, asm: "AND", typ: "(UInt32,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + {name: "LoweredAtomicOr8", argLength: 3, reg: gpxchg, resultNotInArgs: true, asm: "ORR", typ: "(UInt8,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + {name: "LoweredAtomicOr32", argLength: 3, reg: gpxchg, resultNotInArgs: true, asm: "ORR", typ: "(UInt32,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + + // atomic and/or variant. + // *arg0 &= (|=) arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero. + // AND: + // MNV Rarg1, Rtemp + // LDANDALB Rtemp, (Rarg0), Rout + // AND Rarg1, Rout + // OR: + // LDORALB Rarg1, (Rarg0), Rout + // ORR Rarg1, Rout + {name: "LoweredAtomicAnd8Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, typ: "(UInt8,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + {name: "LoweredAtomicAnd32Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, typ: "(UInt32,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + {name: "LoweredAtomicOr8Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, typ: "(UInt8,Mem)", faultOnNilArg0: true, hasSideEffects: true}, + {name: "LoweredAtomicOr32Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, typ: "(UInt32,Mem)", faultOnNilArg0: true, hasSideEffects: true}, + + // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier + // It saves all GP registers if necessary, + // but clobbers R30 (LR) because it's a call. + // R16 and R17 may be clobbered by linker trampoline. + {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("R2"), buildReg("R3")}, clobbers: (callerSave &^ gpg) | buildReg("R16 R17 R30")}, clobberFlags: true, aux: "Sym", symEffect: "None"}, + + // There are three of these functions so that they can have three different register inputs. + // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the + // default registers to match so we don't need to copy registers around unnecessarily. + {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go). + {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go). + {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r0, r1}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go). + + // Prefetch instruction + // Do prefetch arg0 address with option aux. arg0=addr, arg1=memory, aux=option. + {name: "PRFM", argLength: 2, aux: "Int64", reg: prefreg, asm: "PRFM", hasSideEffects: true}, + + // Publication barrier + {name: "DMB", argLength: 1, aux: "Int64", asm: "DMB", hasSideEffects: true}, // Do data barrier. arg0=memory, aux=option. + } + + blocks := []blockData{ + {name: "EQ", controls: 1}, + {name: "NE", controls: 1}, + {name: "LT", controls: 1}, + {name: "LE", controls: 1}, + {name: "GT", controls: 1}, + {name: "GE", controls: 1}, + {name: "ULT", controls: 1}, + {name: "ULE", controls: 1}, + {name: "UGT", controls: 1}, + {name: "UGE", controls: 1}, + {name: "Z", controls: 1}, // Control == 0 (take a register instead of flags) + {name: "NZ", controls: 1}, // Control != 0 + {name: "ZW", controls: 1}, // Control == 0, 32-bit + {name: "NZW", controls: 1}, // Control != 0, 32-bit + {name: "TBZ", controls: 1, aux: "Int64"}, // Control & (1 << AuxInt) == 0 + {name: "TBNZ", controls: 1, aux: "Int64"}, // Control & (1 << AuxInt) != 0 + {name: "FLT", controls: 1}, + {name: "FLE", controls: 1}, + {name: "FGT", controls: 1}, + {name: "FGE", controls: 1}, + {name: "LTnoov", controls: 1}, // 'LT' but without honoring overflow + {name: "LEnoov", controls: 1}, // 'LE' but without honoring overflow + {name: "GTnoov", controls: 1}, // 'GT' but without honoring overflow + {name: "GEnoov", controls: 1}, // 'GE' but without honoring overflow + + // JUMPTABLE implements jump tables. + // Aux is the symbol (an *obj.LSym) for the jump table. + // control[0] is the index into the jump table. + // control[1] is the address of the jump table (the address of the symbol stored in Aux). + {name: "JUMPTABLE", controls: 2, aux: "Sym"}, + } + + archs = append(archs, arch{ + name: "ARM64", + pkg: "cmd/internal/obj/arm64", + genfile: "../../arm64/ssa.go", + ops: ops, + blocks: blocks, + regnames: regNamesARM64, + ParamIntRegNames: "R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15", + ParamFloatRegNames: "F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15", + gpregmask: gp, + fpregmask: fp, + framepointerreg: -1, // not used + linkreg: int8(num["R30"]), + }) +} diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64latelower.rules b/src/cmd/compile/internal/ssa/_gen/ARM64latelower.rules new file mode 100644 index 0000000..d0c2099 --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/ARM64latelower.rules @@ -0,0 +1,21 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file contains rules used by the laterLower pass. +// These are often the exact inverse of rules in ARM64.rules. + +(ADDconst [c] x) && !isARM64addcon(c) => (ADD x (MOVDconst [c])) +(SUBconst [c] x) && !isARM64addcon(c) => (SUB x (MOVDconst [c])) +(ANDconst [c] x) && !isARM64bitcon(uint64(c)) => (AND x (MOVDconst [c])) +(ORconst [c] x) && !isARM64bitcon(uint64(c)) => (OR x (MOVDconst [c])) +(XORconst [c] x) && !isARM64bitcon(uint64(c)) => (XOR x (MOVDconst [c])) +(TSTconst [c] x) && !isARM64bitcon(uint64(c)) => (TST x (MOVDconst [c])) +(TSTWconst [c] x) && !isARM64bitcon(uint64(c)|uint64(c)<<32) => (TSTW x (MOVDconst [int64(c)])) + +(CMPconst [c] x) && !isARM64addcon(c) => (CMP x (MOVDconst [c])) +(CMPWconst [c] x) && !isARM64addcon(int64(c)) => (CMPW x (MOVDconst [int64(c)])) +(CMNconst [c] x) && !isARM64addcon(c) => (CMN x (MOVDconst [c])) +(CMNWconst [c] x) && !isARM64addcon(int64(c)) => (CMNW x (MOVDconst [int64(c)])) + +(ADDSconstflags [c] x) && !isARM64addcon(c) => (ADDSflags x (MOVDconst [c])) diff --git a/src/cmd/compile/internal/ssa/_gen/ARMOps.go b/src/cmd/compile/internal/ssa/_gen/ARMOps.go new file mode 100644 index 0000000..de477a2 --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/ARMOps.go @@ -0,0 +1,600 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import "strings" + +// Notes: +// - Integer types live in the low portion of registers. Upper portions are junk. +// - Boolean types use the low-order byte of a register. 0=false, 1=true. +// Upper bytes are junk. +// - *const instructions may use a constant larger than the instruction can encode. +// In this case the assembler expands to multiple instructions and uses tmp +// register (R11). + +// Suffixes encode the bit width of various instructions. +// W (word) = 32 bit +// H (half word) = 16 bit +// HU = 16 bit unsigned +// B (byte) = 8 bit +// BU = 8 bit unsigned +// F (float) = 32 bit float +// D (double) = 64 bit float + +var regNamesARM = []string{ + "R0", + "R1", + "R2", + "R3", + "R4", + "R5", + "R6", + "R7", + "R8", + "R9", + "g", // aka R10 + "R11", // tmp + "R12", + "SP", // aka R13 + "R14", // link + "R15", // pc + + "F0", + "F1", + "F2", + "F3", + "F4", + "F5", + "F6", + "F7", + "F8", + "F9", + "F10", + "F11", + "F12", + "F13", + "F14", + "F15", // tmp + + // If you add registers, update asyncPreempt in runtime. + + // pseudo-registers + "SB", +} + +func init() { + // Make map from reg names to reg integers. + if len(regNamesARM) > 64 { + panic("too many registers") + } + num := map[string]int{} + for i, name := range regNamesARM { + num[name] = i + } + buildReg := func(s string) regMask { + m := regMask(0) + for _, r := range strings.Split(s, " ") { + if n, ok := num[r]; ok { + m |= regMask(1) << uint(n) + continue + } + panic("register " + r + " not found") + } + return m + } + + // Common individual register masks + var ( + gp = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12 R14") + gpg = gp | buildReg("g") + gpsp = gp | buildReg("SP") + gpspg = gpg | buildReg("SP") + gpspsbg = gpspg | buildReg("SB") + fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15") + callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g + r0 = buildReg("R0") + r1 = buildReg("R1") + r2 = buildReg("R2") + r3 = buildReg("R3") + r4 = buildReg("R4") + ) + // Common regInfo + var ( + gp01 = regInfo{inputs: nil, outputs: []regMask{gp}} + gp11 = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}} + gp11carry = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp, 0}} + gp11sp = regInfo{inputs: []regMask{gpspg}, outputs: []regMask{gp}} + gp1flags = regInfo{inputs: []regMask{gpg}} + gp1flags1 = regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}} + gp21 = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp}} + gp21carry = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp, 0}} + gp2flags = regInfo{inputs: []regMask{gpg, gpg}} + gp2flags1 = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}} + gp22 = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp, gp}} + gp31 = regInfo{inputs: []regMask{gp, gp, gp}, outputs: []regMask{gp}} + gp31carry = regInfo{inputs: []regMask{gp, gp, gp}, outputs: []regMask{gp, 0}} + gp3flags = regInfo{inputs: []regMask{gp, gp, gp}} + gp3flags1 = regInfo{inputs: []regMask{gp, gp, gp}, outputs: []regMask{gp}} + gpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}} + gpstore = regInfo{inputs: []regMask{gpspsbg, gpg}} + gp2load = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}} + gp2store = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}} + fp01 = regInfo{inputs: nil, outputs: []regMask{fp}} + fp11 = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}} + fp1flags = regInfo{inputs: []regMask{fp}} + fpgp = regInfo{inputs: []regMask{fp}, outputs: []regMask{gp}, clobbers: buildReg("F15")} // int-float conversion uses F15 as tmp + gpfp = regInfo{inputs: []regMask{gp}, outputs: []regMask{fp}, clobbers: buildReg("F15")} + fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}} + fp31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: []regMask{fp}} + fp2flags = regInfo{inputs: []regMask{fp, fp}} + fpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{fp}} + fpstore = regInfo{inputs: []regMask{gpspsbg, fp}} + readflags = regInfo{inputs: nil, outputs: []regMask{gp}} + ) + ops := []opData{ + // binary ops + {name: "ADD", argLength: 2, reg: gp21, asm: "ADD", commutative: true}, // arg0 + arg1 + {name: "ADDconst", argLength: 1, reg: gp11sp, asm: "ADD", aux: "Int32"}, // arg0 + auxInt + {name: "SUB", argLength: 2, reg: gp21, asm: "SUB"}, // arg0 - arg1 + {name: "SUBconst", argLength: 1, reg: gp11, asm: "SUB", aux: "Int32"}, // arg0 - auxInt + {name: "RSB", argLength: 2, reg: gp21, asm: "RSB"}, // arg1 - arg0 + {name: "RSBconst", argLength: 1, reg: gp11, asm: "RSB", aux: "Int32"}, // auxInt - arg0 + {name: "MUL", argLength: 2, reg: gp21, asm: "MUL", commutative: true}, // arg0 * arg1 + {name: "HMUL", argLength: 2, reg: gp21, asm: "MULL", commutative: true}, // (arg0 * arg1) >> 32, signed + {name: "HMULU", argLength: 2, reg: gp21, asm: "MULLU", commutative: true}, // (arg0 * arg1) >> 32, unsigned + + // udiv runtime call for soft division + // output0 = arg0/arg1, output1 = arg0%arg1 + // see ../../../../../runtime/vlop_arm.s + { + name: "CALLudiv", + argLength: 2, + reg: regInfo{ + inputs: []regMask{buildReg("R1"), buildReg("R0")}, + outputs: []regMask{buildReg("R0"), buildReg("R1")}, + clobbers: buildReg("R2 R3 R12 R14"), // R14 is LR, R12 is linker trampoline scratch register + }, + clobberFlags: true, + typ: "(UInt32,UInt32)", + call: false, // TODO(mdempsky): Should this be true? + }, + + {name: "ADDS", argLength: 2, reg: gp21carry, asm: "ADD", commutative: true}, // arg0 + arg1, set carry flag + {name: "ADDSconst", argLength: 1, reg: gp11carry, asm: "ADD", aux: "Int32"}, // arg0 + auxInt, set carry flag + {name: "ADC", argLength: 3, reg: gp2flags1, asm: "ADC", commutative: true}, // arg0 + arg1 + carry, arg2=flags + {name: "ADCconst", argLength: 2, reg: gp1flags1, asm: "ADC", aux: "Int32"}, // arg0 + auxInt + carry, arg1=flags + {name: "SUBS", argLength: 2, reg: gp21carry, asm: "SUB"}, // arg0 - arg1, set carry flag + {name: "SUBSconst", argLength: 1, reg: gp11carry, asm: "SUB", aux: "Int32"}, // arg0 - auxInt, set carry flag + {name: "RSBSconst", argLength: 1, reg: gp11carry, asm: "RSB", aux: "Int32"}, // auxInt - arg0, set carry flag + {name: "SBC", argLength: 3, reg: gp2flags1, asm: "SBC"}, // arg0 - arg1 - carry, arg2=flags + {name: "SBCconst", argLength: 2, reg: gp1flags1, asm: "SBC", aux: "Int32"}, // arg0 - auxInt - carry, arg1=flags + {name: "RSCconst", argLength: 2, reg: gp1flags1, asm: "RSC", aux: "Int32"}, // auxInt - arg0 - carry, arg1=flags + + {name: "MULLU", argLength: 2, reg: gp22, asm: "MULLU", commutative: true}, // arg0 * arg1, high 32 bits in out0, low 32 bits in out1 + {name: "MULA", argLength: 3, reg: gp31, asm: "MULA"}, // arg0 * arg1 + arg2 + {name: "MULS", argLength: 3, reg: gp31, asm: "MULS"}, // arg2 - arg0 * arg1 + + {name: "ADDF", argLength: 2, reg: fp21, asm: "ADDF", commutative: true}, // arg0 + arg1 + {name: "ADDD", argLength: 2, reg: fp21, asm: "ADDD", commutative: true}, // arg0 + arg1 + {name: "SUBF", argLength: 2, reg: fp21, asm: "SUBF"}, // arg0 - arg1 + {name: "SUBD", argLength: 2, reg: fp21, asm: "SUBD"}, // arg0 - arg1 + {name: "MULF", argLength: 2, reg: fp21, asm: "MULF", commutative: true}, // arg0 * arg1 + {name: "MULD", argLength: 2, reg: fp21, asm: "MULD", commutative: true}, // arg0 * arg1 + {name: "NMULF", argLength: 2, reg: fp21, asm: "NMULF", commutative: true}, // -(arg0 * arg1) + {name: "NMULD", argLength: 2, reg: fp21, asm: "NMULD", commutative: true}, // -(arg0 * arg1) + {name: "DIVF", argLength: 2, reg: fp21, asm: "DIVF"}, // arg0 / arg1 + {name: "DIVD", argLength: 2, reg: fp21, asm: "DIVD"}, // arg0 / arg1 + + {name: "MULAF", argLength: 3, reg: fp31, asm: "MULAF", resultInArg0: true}, // arg0 + (arg1 * arg2) + {name: "MULAD", argLength: 3, reg: fp31, asm: "MULAD", resultInArg0: true}, // arg0 + (arg1 * arg2) + {name: "MULSF", argLength: 3, reg: fp31, asm: "MULSF", resultInArg0: true}, // arg0 - (arg1 * arg2) + {name: "MULSD", argLength: 3, reg: fp31, asm: "MULSD", resultInArg0: true}, // arg0 - (arg1 * arg2) + + // FMULAD only exists on platforms with the VFPv4 instruction set. + // Any use must be preceded by a successful check of runtime.arm_support_vfpv4. + {name: "FMULAD", argLength: 3, reg: fp31, asm: "FMULAD", resultInArg0: true}, // arg0 + (arg1 * arg2) + + {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1 + {name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int32"}, // arg0 & auxInt + {name: "OR", argLength: 2, reg: gp21, asm: "ORR", commutative: true}, // arg0 | arg1 + {name: "ORconst", argLength: 1, reg: gp11, asm: "ORR", aux: "Int32"}, // arg0 | auxInt + {name: "XOR", argLength: 2, reg: gp21, asm: "EOR", commutative: true}, // arg0 ^ arg1 + {name: "XORconst", argLength: 1, reg: gp11, asm: "EOR", aux: "Int32"}, // arg0 ^ auxInt + {name: "BIC", argLength: 2, reg: gp21, asm: "BIC"}, // arg0 &^ arg1 + {name: "BICconst", argLength: 1, reg: gp11, asm: "BIC", aux: "Int32"}, // arg0 &^ auxInt + + // bit extraction, AuxInt = Width<<8 | LSB + {name: "BFX", argLength: 1, reg: gp11, asm: "BFX", aux: "Int32"}, // extract W bits from bit L in arg0, then signed extend + {name: "BFXU", argLength: 1, reg: gp11, asm: "BFXU", aux: "Int32"}, // extract W bits from bit L in arg0, then unsigned extend + + // unary ops + {name: "MVN", argLength: 1, reg: gp11, asm: "MVN"}, // ^arg0 + + {name: "NEGF", argLength: 1, reg: fp11, asm: "NEGF"}, // -arg0, float32 + {name: "NEGD", argLength: 1, reg: fp11, asm: "NEGD"}, // -arg0, float64 + {name: "SQRTD", argLength: 1, reg: fp11, asm: "SQRTD"}, // sqrt(arg0), float64 + {name: "SQRTF", argLength: 1, reg: fp11, asm: "SQRTF"}, // sqrt(arg0), float32 + {name: "ABSD", argLength: 1, reg: fp11, asm: "ABSD"}, // abs(arg0), float64 + + {name: "CLZ", argLength: 1, reg: gp11, asm: "CLZ"}, // count leading zero + {name: "REV", argLength: 1, reg: gp11, asm: "REV"}, // reverse byte order + {name: "REV16", argLength: 1, reg: gp11, asm: "REV16"}, // reverse byte order in 16-bit halfwords + {name: "RBIT", argLength: 1, reg: gp11, asm: "RBIT"}, // reverse bit order + + // shifts + {name: "SLL", argLength: 2, reg: gp21, asm: "SLL"}, // arg0 << arg1, shift amount is mod 256 + {name: "SLLconst", argLength: 1, reg: gp11, asm: "SLL", aux: "Int32"}, // arg0 << auxInt, 0 <= auxInt < 32 + {name: "SRL", argLength: 2, reg: gp21, asm: "SRL"}, // arg0 >> arg1, unsigned, shift amount is mod 256 + {name: "SRLconst", argLength: 1, reg: gp11, asm: "SRL", aux: "Int32"}, // arg0 >> auxInt, unsigned, 0 <= auxInt < 32 + {name: "SRA", argLength: 2, reg: gp21, asm: "SRA"}, // arg0 >> arg1, signed, shift amount is mod 256 + {name: "SRAconst", argLength: 1, reg: gp11, asm: "SRA", aux: "Int32"}, // arg0 >> auxInt, signed, 0 <= auxInt < 32 + {name: "SRR", argLength: 2, reg: gp21}, // arg0 right rotate by arg1 bits + {name: "SRRconst", argLength: 1, reg: gp11, aux: "Int32"}, // arg0 right rotate by auxInt bits, 0 <= auxInt < 32 + + // auxInt for all of these satisfy 0 <= auxInt < 32 + {name: "ADDshiftLL", argLength: 2, reg: gp21, asm: "ADD", aux: "Int32"}, // arg0 + arg1<<auxInt + {name: "ADDshiftRL", argLength: 2, reg: gp21, asm: "ADD", aux: "Int32"}, // arg0 + arg1>>auxInt, unsigned shift + {name: "ADDshiftRA", argLength: 2, reg: gp21, asm: "ADD", aux: "Int32"}, // arg0 + arg1>>auxInt, signed shift + {name: "SUBshiftLL", argLength: 2, reg: gp21, asm: "SUB", aux: "Int32"}, // arg0 - arg1<<auxInt + {name: "SUBshiftRL", argLength: 2, reg: gp21, asm: "SUB", aux: "Int32"}, // arg0 - arg1>>auxInt, unsigned shift + {name: "SUBshiftRA", argLength: 2, reg: gp21, asm: "SUB", aux: "Int32"}, // arg0 - arg1>>auxInt, signed shift + {name: "RSBshiftLL", argLength: 2, reg: gp21, asm: "RSB", aux: "Int32"}, // arg1<<auxInt - arg0 + {name: "RSBshiftRL", argLength: 2, reg: gp21, asm: "RSB", aux: "Int32"}, // arg1>>auxInt - arg0, unsigned shift + {name: "RSBshiftRA", argLength: 2, reg: gp21, asm: "RSB", aux: "Int32"}, // arg1>>auxInt - arg0, signed shift + {name: "ANDshiftLL", argLength: 2, reg: gp21, asm: "AND", aux: "Int32"}, // arg0 & (arg1<<auxInt) + {name: "ANDshiftRL", argLength: 2, reg: gp21, asm: "AND", aux: "Int32"}, // arg0 & (arg1>>auxInt), unsigned shift + {name: "ANDshiftRA", argLength: 2, reg: gp21, asm: "AND", aux: "Int32"}, // arg0 & (arg1>>auxInt), signed shift + {name: "ORshiftLL", argLength: 2, reg: gp21, asm: "ORR", aux: "Int32"}, // arg0 | arg1<<auxInt + {name: "ORshiftRL", argLength: 2, reg: gp21, asm: "ORR", aux: "Int32"}, // arg0 | arg1>>auxInt, unsigned shift + {name: "ORshiftRA", argLength: 2, reg: gp21, asm: "ORR", aux: "Int32"}, // arg0 | arg1>>auxInt, signed shift + {name: "XORshiftLL", argLength: 2, reg: gp21, asm: "EOR", aux: "Int32"}, // arg0 ^ arg1<<auxInt + {name: "XORshiftRL", argLength: 2, reg: gp21, asm: "EOR", aux: "Int32"}, // arg0 ^ arg1>>auxInt, unsigned shift + {name: "XORshiftRA", argLength: 2, reg: gp21, asm: "EOR", aux: "Int32"}, // arg0 ^ arg1>>auxInt, signed shift + {name: "XORshiftRR", argLength: 2, reg: gp21, asm: "EOR", aux: "Int32"}, // arg0 ^ (arg1 right rotate by auxInt) + {name: "BICshiftLL", argLength: 2, reg: gp21, asm: "BIC", aux: "Int32"}, // arg0 &^ (arg1<<auxInt) + {name: "BICshiftRL", argLength: 2, reg: gp21, asm: "BIC", aux: "Int32"}, // arg0 &^ (arg1>>auxInt), unsigned shift + {name: "BICshiftRA", argLength: 2, reg: gp21, asm: "BIC", aux: "Int32"}, // arg0 &^ (arg1>>auxInt), signed shift + {name: "MVNshiftLL", argLength: 1, reg: gp11, asm: "MVN", aux: "Int32"}, // ^(arg0<<auxInt) + {name: "MVNshiftRL", argLength: 1, reg: gp11, asm: "MVN", aux: "Int32"}, // ^(arg0>>auxInt), unsigned shift + {name: "MVNshiftRA", argLength: 1, reg: gp11, asm: "MVN", aux: "Int32"}, // ^(arg0>>auxInt), signed shift + + {name: "ADCshiftLL", argLength: 3, reg: gp2flags1, asm: "ADC", aux: "Int32"}, // arg0 + arg1<<auxInt + carry, arg2=flags + {name: "ADCshiftRL", argLength: 3, reg: gp2flags1, asm: "ADC", aux: "Int32"}, // arg0 + arg1>>auxInt + carry, unsigned shift, arg2=flags + {name: "ADCshiftRA", argLength: 3, reg: gp2flags1, asm: "ADC", aux: "Int32"}, // arg0 + arg1>>auxInt + carry, signed shift, arg2=flags + {name: "SBCshiftLL", argLength: 3, reg: gp2flags1, asm: "SBC", aux: "Int32"}, // arg0 - arg1<<auxInt - carry, arg2=flags + {name: "SBCshiftRL", argLength: 3, reg: gp2flags1, asm: "SBC", aux: "Int32"}, // arg0 - arg1>>auxInt - carry, unsigned shift, arg2=flags + {name: "SBCshiftRA", argLength: 3, reg: gp2flags1, asm: "SBC", aux: "Int32"}, // arg0 - arg1>>auxInt - carry, signed shift, arg2=flags + {name: "RSCshiftLL", argLength: 3, reg: gp2flags1, asm: "RSC", aux: "Int32"}, // arg1<<auxInt - arg0 - carry, arg2=flags + {name: "RSCshiftRL", argLength: 3, reg: gp2flags1, asm: "RSC", aux: "Int32"}, // arg1>>auxInt - arg0 - carry, unsigned shift, arg2=flags + {name: "RSCshiftRA", argLength: 3, reg: gp2flags1, asm: "RSC", aux: "Int32"}, // arg1>>auxInt - arg0 - carry, signed shift, arg2=flags + + {name: "ADDSshiftLL", argLength: 2, reg: gp21carry, asm: "ADD", aux: "Int32"}, // arg0 + arg1<<auxInt, set carry flag + {name: "ADDSshiftRL", argLength: 2, reg: gp21carry, asm: "ADD", aux: "Int32"}, // arg0 + arg1>>auxInt, unsigned shift, set carry flag + {name: "ADDSshiftRA", argLength: 2, reg: gp21carry, asm: "ADD", aux: "Int32"}, // arg0 + arg1>>auxInt, signed shift, set carry flag + {name: "SUBSshiftLL", argLength: 2, reg: gp21carry, asm: "SUB", aux: "Int32"}, // arg0 - arg1<<auxInt, set carry flag + {name: "SUBSshiftRL", argLength: 2, reg: gp21carry, asm: "SUB", aux: "Int32"}, // arg0 - arg1>>auxInt, unsigned shift, set carry flag + {name: "SUBSshiftRA", argLength: 2, reg: gp21carry, asm: "SUB", aux: "Int32"}, // arg0 - arg1>>auxInt, signed shift, set carry flag + {name: "RSBSshiftLL", argLength: 2, reg: gp21carry, asm: "RSB", aux: "Int32"}, // arg1<<auxInt - arg0, set carry flag + {name: "RSBSshiftRL", argLength: 2, reg: gp21carry, asm: "RSB", aux: "Int32"}, // arg1>>auxInt - arg0, unsigned shift, set carry flag + {name: "RSBSshiftRA", argLength: 2, reg: gp21carry, asm: "RSB", aux: "Int32"}, // arg1>>auxInt - arg0, signed shift, set carry flag + + {name: "ADDshiftLLreg", argLength: 3, reg: gp31, asm: "ADD"}, // arg0 + arg1<<arg2 + {name: "ADDshiftRLreg", argLength: 3, reg: gp31, asm: "ADD"}, // arg0 + arg1>>arg2, unsigned shift + {name: "ADDshiftRAreg", argLength: 3, reg: gp31, asm: "ADD"}, // arg0 + arg1>>arg2, signed shift + {name: "SUBshiftLLreg", argLength: 3, reg: gp31, asm: "SUB"}, // arg0 - arg1<<arg2 + {name: "SUBshiftRLreg", argLength: 3, reg: gp31, asm: "SUB"}, // arg0 - arg1>>arg2, unsigned shift + {name: "SUBshiftRAreg", argLength: 3, reg: gp31, asm: "SUB"}, // arg0 - arg1>>arg2, signed shift + {name: "RSBshiftLLreg", argLength: 3, reg: gp31, asm: "RSB"}, // arg1<<arg2 - arg0 + {name: "RSBshiftRLreg", argLength: 3, reg: gp31, asm: "RSB"}, // arg1>>arg2 - arg0, unsigned shift + {name: "RSBshiftRAreg", argLength: 3, reg: gp31, asm: "RSB"}, // arg1>>arg2 - arg0, signed shift + {name: "ANDshiftLLreg", argLength: 3, reg: gp31, asm: "AND"}, // arg0 & (arg1<<arg2) + {name: "ANDshiftRLreg", argLength: 3, reg: gp31, asm: "AND"}, // arg0 & (arg1>>arg2), unsigned shift + {name: "ANDshiftRAreg", argLength: 3, reg: gp31, asm: "AND"}, // arg0 & (arg1>>arg2), signed shift + {name: "ORshiftLLreg", argLength: 3, reg: gp31, asm: "ORR"}, // arg0 | arg1<<arg2 + {name: "ORshiftRLreg", argLength: 3, reg: gp31, asm: "ORR"}, // arg0 | arg1>>arg2, unsigned shift + {name: "ORshiftRAreg", argLength: 3, reg: gp31, asm: "ORR"}, // arg0 | arg1>>arg2, signed shift + {name: "XORshiftLLreg", argLength: 3, reg: gp31, asm: "EOR"}, // arg0 ^ arg1<<arg2 + {name: "XORshiftRLreg", argLength: 3, reg: gp31, asm: "EOR"}, // arg0 ^ arg1>>arg2, unsigned shift + {name: "XORshiftRAreg", argLength: 3, reg: gp31, asm: "EOR"}, // arg0 ^ arg1>>arg2, signed shift + {name: "BICshiftLLreg", argLength: 3, reg: gp31, asm: "BIC"}, // arg0 &^ (arg1<<arg2) + {name: "BICshiftRLreg", argLength: 3, reg: gp31, asm: "BIC"}, // arg0 &^ (arg1>>arg2), unsigned shift + {name: "BICshiftRAreg", argLength: 3, reg: gp31, asm: "BIC"}, // arg0 &^ (arg1>>arg2), signed shift + {name: "MVNshiftLLreg", argLength: 2, reg: gp21, asm: "MVN"}, // ^(arg0<<arg1) + {name: "MVNshiftRLreg", argLength: 2, reg: gp21, asm: "MVN"}, // ^(arg0>>arg1), unsigned shift + {name: "MVNshiftRAreg", argLength: 2, reg: gp21, asm: "MVN"}, // ^(arg0>>arg1), signed shift + + {name: "ADCshiftLLreg", argLength: 4, reg: gp3flags1, asm: "ADC"}, // arg0 + arg1<<arg2 + carry, arg3=flags + {name: "ADCshiftRLreg", argLength: 4, reg: gp3flags1, asm: "ADC"}, // arg0 + arg1>>arg2 + carry, unsigned shift, arg3=flags + {name: "ADCshiftRAreg", argLength: 4, reg: gp3flags1, asm: "ADC"}, // arg0 + arg1>>arg2 + carry, signed shift, arg3=flags + {name: "SBCshiftLLreg", argLength: 4, reg: gp3flags1, asm: "SBC"}, // arg0 - arg1<<arg2 - carry, arg3=flags + {name: "SBCshiftRLreg", argLength: 4, reg: gp3flags1, asm: "SBC"}, // arg0 - arg1>>arg2 - carry, unsigned shift, arg3=flags + {name: "SBCshiftRAreg", argLength: 4, reg: gp3flags1, asm: "SBC"}, // arg0 - arg1>>arg2 - carry, signed shift, arg3=flags + {name: "RSCshiftLLreg", argLength: 4, reg: gp3flags1, asm: "RSC"}, // arg1<<arg2 - arg0 - carry, arg3=flags + {name: "RSCshiftRLreg", argLength: 4, reg: gp3flags1, asm: "RSC"}, // arg1>>arg2 - arg0 - carry, unsigned shift, arg3=flags + {name: "RSCshiftRAreg", argLength: 4, reg: gp3flags1, asm: "RSC"}, // arg1>>arg2 - arg0 - carry, signed shift, arg3=flags + + {name: "ADDSshiftLLreg", argLength: 3, reg: gp31carry, asm: "ADD"}, // arg0 + arg1<<arg2, set carry flag + {name: "ADDSshiftRLreg", argLength: 3, reg: gp31carry, asm: "ADD"}, // arg0 + arg1>>arg2, unsigned shift, set carry flag + {name: "ADDSshiftRAreg", argLength: 3, reg: gp31carry, asm: "ADD"}, // arg0 + arg1>>arg2, signed shift, set carry flag + {name: "SUBSshiftLLreg", argLength: 3, reg: gp31carry, asm: "SUB"}, // arg0 - arg1<<arg2, set carry flag + {name: "SUBSshiftRLreg", argLength: 3, reg: gp31carry, asm: "SUB"}, // arg0 - arg1>>arg2, unsigned shift, set carry flag + {name: "SUBSshiftRAreg", argLength: 3, reg: gp31carry, asm: "SUB"}, // arg0 - arg1>>arg2, signed shift, set carry flag + {name: "RSBSshiftLLreg", argLength: 3, reg: gp31carry, asm: "RSB"}, // arg1<<arg2 - arg0, set carry flag + {name: "RSBSshiftRLreg", argLength: 3, reg: gp31carry, asm: "RSB"}, // arg1>>arg2 - arg0, unsigned shift, set carry flag + {name: "RSBSshiftRAreg", argLength: 3, reg: gp31carry, asm: "RSB"}, // arg1>>arg2 - arg0, signed shift, set carry flag + + // comparisons + {name: "CMP", argLength: 2, reg: gp2flags, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1 + {name: "CMPconst", argLength: 1, reg: gp1flags, asm: "CMP", aux: "Int32", typ: "Flags"}, // arg0 compare to auxInt + {name: "CMN", argLength: 2, reg: gp2flags, asm: "CMN", typ: "Flags", commutative: true}, // arg0 compare to -arg1, provided arg1 is not 1<<63 + {name: "CMNconst", argLength: 1, reg: gp1flags, asm: "CMN", aux: "Int32", typ: "Flags"}, // arg0 compare to -auxInt + {name: "TST", argLength: 2, reg: gp2flags, asm: "TST", typ: "Flags", commutative: true}, // arg0 & arg1 compare to 0 + {name: "TSTconst", argLength: 1, reg: gp1flags, asm: "TST", aux: "Int32", typ: "Flags"}, // arg0 & auxInt compare to 0 + {name: "TEQ", argLength: 2, reg: gp2flags, asm: "TEQ", typ: "Flags", commutative: true}, // arg0 ^ arg1 compare to 0 + {name: "TEQconst", argLength: 1, reg: gp1flags, asm: "TEQ", aux: "Int32", typ: "Flags"}, // arg0 ^ auxInt compare to 0 + {name: "CMPF", argLength: 2, reg: fp2flags, asm: "CMPF", typ: "Flags"}, // arg0 compare to arg1, float32 + {name: "CMPD", argLength: 2, reg: fp2flags, asm: "CMPD", typ: "Flags"}, // arg0 compare to arg1, float64 + + {name: "CMPshiftLL", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int32", typ: "Flags"}, // arg0 compare to arg1<<auxInt + {name: "CMPshiftRL", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int32", typ: "Flags"}, // arg0 compare to arg1>>auxInt, unsigned shift + {name: "CMPshiftRA", argLength: 2, reg: gp2flags, asm: "CMP", aux: "Int32", typ: "Flags"}, // arg0 compare to arg1>>auxInt, signed shift + {name: "CMNshiftLL", argLength: 2, reg: gp2flags, asm: "CMN", aux: "Int32", typ: "Flags"}, // arg0 compare to -(arg1<<auxInt) + {name: "CMNshiftRL", argLength: 2, reg: gp2flags, asm: "CMN", aux: "Int32", typ: "Flags"}, // arg0 compare to -(arg1>>auxInt), unsigned shift + {name: "CMNshiftRA", argLength: 2, reg: gp2flags, asm: "CMN", aux: "Int32", typ: "Flags"}, // arg0 compare to -(arg1>>auxInt), signed shift + {name: "TSTshiftLL", argLength: 2, reg: gp2flags, asm: "TST", aux: "Int32", typ: "Flags"}, // arg0 & (arg1<<auxInt) compare to 0 + {name: "TSTshiftRL", argLength: 2, reg: gp2flags, asm: "TST", aux: "Int32", typ: "Flags"}, // arg0 & (arg1>>auxInt) compare to 0, unsigned shift + {name: "TSTshiftRA", argLength: 2, reg: gp2flags, asm: "TST", aux: "Int32", typ: "Flags"}, // arg0 & (arg1>>auxInt) compare to 0, signed shift + {name: "TEQshiftLL", argLength: 2, reg: gp2flags, asm: "TEQ", aux: "Int32", typ: "Flags"}, // arg0 ^ (arg1<<auxInt) compare to 0 + {name: "TEQshiftRL", argLength: 2, reg: gp2flags, asm: "TEQ", aux: "Int32", typ: "Flags"}, // arg0 ^ (arg1>>auxInt) compare to 0, unsigned shift + {name: "TEQshiftRA", argLength: 2, reg: gp2flags, asm: "TEQ", aux: "Int32", typ: "Flags"}, // arg0 ^ (arg1>>auxInt) compare to 0, signed shift + + {name: "CMPshiftLLreg", argLength: 3, reg: gp3flags, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1<<arg2 + {name: "CMPshiftRLreg", argLength: 3, reg: gp3flags, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1>>arg2, unsigned shift + {name: "CMPshiftRAreg", argLength: 3, reg: gp3flags, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1>>arg2, signed shift + {name: "CMNshiftLLreg", argLength: 3, reg: gp3flags, asm: "CMN", typ: "Flags"}, // arg0 + (arg1<<arg2) compare to 0 + {name: "CMNshiftRLreg", argLength: 3, reg: gp3flags, asm: "CMN", typ: "Flags"}, // arg0 + (arg1>>arg2) compare to 0, unsigned shift + {name: "CMNshiftRAreg", argLength: 3, reg: gp3flags, asm: "CMN", typ: "Flags"}, // arg0 + (arg1>>arg2) compare to 0, signed shift + {name: "TSTshiftLLreg", argLength: 3, reg: gp3flags, asm: "TST", typ: "Flags"}, // arg0 & (arg1<<arg2) compare to 0 + {name: "TSTshiftRLreg", argLength: 3, reg: gp3flags, asm: "TST", typ: "Flags"}, // arg0 & (arg1>>arg2) compare to 0, unsigned shift + {name: "TSTshiftRAreg", argLength: 3, reg: gp3flags, asm: "TST", typ: "Flags"}, // arg0 & (arg1>>arg2) compare to 0, signed shift + {name: "TEQshiftLLreg", argLength: 3, reg: gp3flags, asm: "TEQ", typ: "Flags"}, // arg0 ^ (arg1<<arg2) compare to 0 + {name: "TEQshiftRLreg", argLength: 3, reg: gp3flags, asm: "TEQ", typ: "Flags"}, // arg0 ^ (arg1>>arg2) compare to 0, unsigned shift + {name: "TEQshiftRAreg", argLength: 3, reg: gp3flags, asm: "TEQ", typ: "Flags"}, // arg0 ^ (arg1>>arg2) compare to 0, signed shift + + {name: "CMPF0", argLength: 1, reg: fp1flags, asm: "CMPF", typ: "Flags"}, // arg0 compare to 0, float32 + {name: "CMPD0", argLength: 1, reg: fp1flags, asm: "CMPD", typ: "Flags"}, // arg0 compare to 0, float64 + + // moves + {name: "MOVWconst", argLength: 0, reg: gp01, aux: "Int32", asm: "MOVW", typ: "UInt32", rematerializeable: true}, // 32 low bits of auxint + {name: "MOVFconst", argLength: 0, reg: fp01, aux: "Float64", asm: "MOVF", typ: "Float32", rematerializeable: true}, // auxint as 64-bit float, convert to 32-bit float + {name: "MOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "MOVD", typ: "Float64", rematerializeable: true}, // auxint as 64-bit float + + {name: "MOVWaddr", argLength: 1, reg: regInfo{inputs: []regMask{buildReg("SP") | buildReg("SB")}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVW", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB + + {name: "MOVBload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVB", typ: "Int8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVBUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVBU", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVHload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVH", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVHUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVHU", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVWload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVW", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVFload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVF", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVDload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVD", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + + {name: "MOVBstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of arg1 to arg0 + auxInt + aux. arg2=mem. + {name: "MOVHstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of arg1 to arg0 + auxInt + aux. arg2=mem. + {name: "MOVWstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem. + {name: "MOVFstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVF", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem. + {name: "MOVDstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem. + + {name: "MOVWloadidx", argLength: 3, reg: gp2load, asm: "MOVW", typ: "UInt32"}, // load from arg0 + arg1. arg2=mem + {name: "MOVWloadshiftLL", argLength: 3, reg: gp2load, asm: "MOVW", aux: "Int32", typ: "UInt32"}, // load from arg0 + arg1<<auxInt. arg2=mem + {name: "MOVWloadshiftRL", argLength: 3, reg: gp2load, asm: "MOVW", aux: "Int32", typ: "UInt32"}, // load from arg0 + arg1>>auxInt, unsigned shift. arg2=mem + {name: "MOVWloadshiftRA", argLength: 3, reg: gp2load, asm: "MOVW", aux: "Int32", typ: "UInt32"}, // load from arg0 + arg1>>auxInt, signed shift. arg2=mem + {name: "MOVBUloadidx", argLength: 3, reg: gp2load, asm: "MOVBU", typ: "UInt8"}, // load from arg0 + arg1. arg2=mem + {name: "MOVBloadidx", argLength: 3, reg: gp2load, asm: "MOVB", typ: "Int8"}, // load from arg0 + arg1. arg2=mem + {name: "MOVHUloadidx", argLength: 3, reg: gp2load, asm: "MOVHU", typ: "UInt16"}, // load from arg0 + arg1. arg2=mem + {name: "MOVHloadidx", argLength: 3, reg: gp2load, asm: "MOVH", typ: "Int16"}, // load from arg0 + arg1. arg2=mem + + {name: "MOVWstoreidx", argLength: 4, reg: gp2store, asm: "MOVW", typ: "Mem"}, // store arg2 to arg0 + arg1. arg3=mem + {name: "MOVWstoreshiftLL", argLength: 4, reg: gp2store, asm: "MOVW", aux: "Int32", typ: "Mem"}, // store arg2 to arg0 + arg1<<auxInt. arg3=mem + {name: "MOVWstoreshiftRL", argLength: 4, reg: gp2store, asm: "MOVW", aux: "Int32", typ: "Mem"}, // store arg2 to arg0 + arg1>>auxInt, unsigned shift. arg3=mem + {name: "MOVWstoreshiftRA", argLength: 4, reg: gp2store, asm: "MOVW", aux: "Int32", typ: "Mem"}, // store arg2 to arg0 + arg1>>auxInt, signed shift. arg3=mem + {name: "MOVBstoreidx", argLength: 4, reg: gp2store, asm: "MOVB", typ: "Mem"}, // store arg2 to arg0 + arg1. arg3=mem + {name: "MOVHstoreidx", argLength: 4, reg: gp2store, asm: "MOVH", typ: "Mem"}, // store arg2 to arg0 + arg1. arg3=mem + + {name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVBS"}, // move from arg0, sign-extended from byte + {name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, unsign-extended from byte + {name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVHS"}, // move from arg0, sign-extended from half + {name: "MOVHUreg", argLength: 1, reg: gp11, asm: "MOVHU"}, // move from arg0, unsign-extended from half + {name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW"}, // move from arg0 + + {name: "MOVWnop", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}, resultInArg0: true}, // nop, return arg0 in same register + + {name: "MOVWF", argLength: 1, reg: gpfp, asm: "MOVWF"}, // int32 -> float32 + {name: "MOVWD", argLength: 1, reg: gpfp, asm: "MOVWD"}, // int32 -> float64 + {name: "MOVWUF", argLength: 1, reg: gpfp, asm: "MOVWF"}, // uint32 -> float32, set U bit in the instruction + {name: "MOVWUD", argLength: 1, reg: gpfp, asm: "MOVWD"}, // uint32 -> float64, set U bit in the instruction + {name: "MOVFW", argLength: 1, reg: fpgp, asm: "MOVFW"}, // float32 -> int32 + {name: "MOVDW", argLength: 1, reg: fpgp, asm: "MOVDW"}, // float64 -> int32 + {name: "MOVFWU", argLength: 1, reg: fpgp, asm: "MOVFW"}, // float32 -> uint32, set U bit in the instruction + {name: "MOVDWU", argLength: 1, reg: fpgp, asm: "MOVDW"}, // float64 -> uint32, set U bit in the instruction + {name: "MOVFD", argLength: 1, reg: fp11, asm: "MOVFD"}, // float32 -> float64 + {name: "MOVDF", argLength: 1, reg: fp11, asm: "MOVDF"}, // float64 -> float32 + + // conditional instructions, for lowering shifts + {name: "CMOVWHSconst", argLength: 2, reg: gp1flags1, asm: "MOVW", aux: "Int32", resultInArg0: true}, // replace arg0 w/ const if flags indicates HS, arg1=flags + {name: "CMOVWLSconst", argLength: 2, reg: gp1flags1, asm: "MOVW", aux: "Int32", resultInArg0: true}, // replace arg0 w/ const if flags indicates LS, arg1=flags + {name: "SRAcond", argLength: 3, reg: gp2flags1, asm: "SRA"}, // arg0 >> 31 if flags indicates HS, arg0 >> arg1 otherwise, signed shift, arg2=flags + + // function calls + {name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem + {name: "CALLtail", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem + {name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gpsp, buildReg("R7"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem + {name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem + + // pseudo-ops + {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpg}}, nilCheck: true, faultOnNilArg0: true}, // panic if arg0 is nil. arg1=mem. + + {name: "Equal", argLength: 1, reg: readflags}, // bool, true flags encode x==y false otherwise. + {name: "NotEqual", argLength: 1, reg: readflags}, // bool, true flags encode x!=y false otherwise. + {name: "LessThan", argLength: 1, reg: readflags}, // bool, true flags encode signed x<y false otherwise. + {name: "LessEqual", argLength: 1, reg: readflags}, // bool, true flags encode signed x<=y false otherwise. + {name: "GreaterThan", argLength: 1, reg: readflags}, // bool, true flags encode signed x>y false otherwise. + {name: "GreaterEqual", argLength: 1, reg: readflags}, // bool, true flags encode signed x>=y false otherwise. + {name: "LessThanU", argLength: 1, reg: readflags}, // bool, true flags encode unsigned x<y false otherwise. + {name: "LessEqualU", argLength: 1, reg: readflags}, // bool, true flags encode unsigned x<=y false otherwise. + {name: "GreaterThanU", argLength: 1, reg: readflags}, // bool, true flags encode unsigned x>y false otherwise. + {name: "GreaterEqualU", argLength: 1, reg: readflags}, // bool, true flags encode unsigned x>=y false otherwise. + + // duffzero (must be 4-byte aligned) + // arg0 = address of memory to zero (in R1, changed as side effect) + // arg1 = value to store (always zero) + // arg2 = mem + // auxint = offset into duffzero code to start executing + // returns mem + { + name: "DUFFZERO", + aux: "Int64", + argLength: 3, + reg: regInfo{ + inputs: []regMask{buildReg("R1"), buildReg("R0")}, + clobbers: buildReg("R1 R12 R14"), // R14 is LR, R12 is linker trampoline scratch register + }, + faultOnNilArg0: true, + }, + + // duffcopy (must be 4-byte aligned) + // arg0 = address of dst memory (in R2, changed as side effect) + // arg1 = address of src memory (in R1, changed as side effect) + // arg2 = mem + // auxint = offset into duffcopy code to start executing + // returns mem + { + name: "DUFFCOPY", + aux: "Int64", + argLength: 3, + reg: regInfo{ + inputs: []regMask{buildReg("R2"), buildReg("R1")}, + clobbers: buildReg("R0 R1 R2 R12 R14"), // R14 is LR, R12 is linker trampoline scratch register + }, + faultOnNilArg0: true, + faultOnNilArg1: true, + }, + + // large or unaligned zeroing + // arg0 = address of memory to zero (in R1, changed as side effect) + // arg1 = address of the last element to zero + // arg2 = value to store (always zero) + // arg3 = mem + // returns mem + // MOVW.P Rarg2, 4(R1) + // CMP R1, Rarg1 + // BLE -2(PC) + { + name: "LoweredZero", + aux: "Int64", + argLength: 4, + reg: regInfo{ + inputs: []regMask{buildReg("R1"), gp, gp}, + clobbers: buildReg("R1"), + }, + clobberFlags: true, + faultOnNilArg0: true, + }, + + // large or unaligned move + // arg0 = address of dst memory (in R2, changed as side effect) + // arg1 = address of src memory (in R1, changed as side effect) + // arg2 = address of the last element of src + // arg3 = mem + // returns mem + // MOVW.P 4(R1), Rtmp + // MOVW.P Rtmp, 4(R2) + // CMP R1, Rarg2 + // BLE -3(PC) + { + name: "LoweredMove", + aux: "Int64", + argLength: 4, + reg: regInfo{ + inputs: []regMask{buildReg("R2"), buildReg("R1"), gp}, + clobbers: buildReg("R1 R2"), + }, + clobberFlags: true, + faultOnNilArg0: true, + faultOnNilArg1: true, + }, + + // Scheduler ensures LoweredGetClosurePtr occurs only in entry block, + // and sorts it to the very beginning of the block to prevent other + // use of R7 (arm.REGCTXT, the closure pointer) + {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("R7")}}, zeroWidth: true}, + + // LoweredGetCallerSP returns the SP of the caller of the current function. + {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true}, + + // LoweredGetCallerPC evaluates to the PC to which its "caller" will return. + // I.e., if f calls g "calls" getcallerpc, + // the result should be the PC within f that g will return to. + // See runtime/stubs.go for a more detailed discussion. + {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true}, + + // There are three of these functions so that they can have three different register inputs. + // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the + // default registers to match so we don't need to copy registers around unnecessarily. + {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go). + {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go). + {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r0, r1}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go). + // Extend ops are the same as Bounds ops except the indexes are 64-bit. + {name: "LoweredPanicExtendA", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r4, r2, r3}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go). + {name: "LoweredPanicExtendB", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r4, r1, r2}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go). + {name: "LoweredPanicExtendC", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r4, r0, r1}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go). + + // Constant flag value. + // Note: there's an "unordered" outcome for floating-point + // comparisons, but we don't use such a beast yet. + // This op is for temporary use by rewrite rules. It + // cannot appear in the generated assembly. + {name: "FlagConstant", aux: "FlagConstant"}, + + // (InvertFlags (CMP a b)) == (CMP b a) + // InvertFlags is a pseudo-op which can't appear in assembly output. + {name: "InvertFlags", argLength: 1}, // reverse direction of arg0 + + // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier + // It saves all GP registers if necessary, + // but clobbers R14 (LR) because it's a call, and R12 which is linker trampoline scratch register. + {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("R2"), buildReg("R3")}, clobbers: (callerSave &^ gpg) | buildReg("R12 R14")}, clobberFlags: true, aux: "Sym", symEffect: "None"}, + } + + blocks := []blockData{ + {name: "EQ", controls: 1}, + {name: "NE", controls: 1}, + {name: "LT", controls: 1}, + {name: "LE", controls: 1}, + {name: "GT", controls: 1}, + {name: "GE", controls: 1}, + {name: "ULT", controls: 1}, + {name: "ULE", controls: 1}, + {name: "UGT", controls: 1}, + {name: "UGE", controls: 1}, + {name: "LTnoov", controls: 1}, // 'LT' but without honoring overflow + {name: "LEnoov", controls: 1}, // 'LE' but without honoring overflow + {name: "GTnoov", controls: 1}, // 'GT' but without honoring overflow + {name: "GEnoov", controls: 1}, // 'GE' but without honoring overflow + } + + archs = append(archs, arch{ + name: "ARM", + pkg: "cmd/internal/obj/arm", + genfile: "../../arm/ssa.go", + ops: ops, + blocks: blocks, + regnames: regNamesARM, + gpregmask: gp, + fpregmask: fp, + framepointerreg: -1, // not used + linkreg: int8(num["R14"]), + }) +} diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules new file mode 100644 index 0000000..1caaf13 --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules @@ -0,0 +1,694 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +(Add(Ptr|64|32|16|8) ...) => (ADDV ...) +(Add(32|64)F ...) => (ADD(F|D) ...) + +(Sub(Ptr|64|32|16|8) ...) => (SUBV ...) +(Sub(32|64)F ...) => (SUB(F|D) ...) + +(Mul(64|32|16|8) x y) => (Select1 (MULVU x y)) +(Mul(32|64)F ...) => (MUL(F|D) ...) +(Mul64uhilo ...) => (MULVU ...) +(Select0 (Mul64uover x y)) => (Select1 <typ.UInt64> (MULVU x y)) +(Select1 (Mul64uover x y)) => (SGTU <typ.Bool> (Select0 <typ.UInt64> (MULVU x y)) (MOVVconst <typ.UInt64> [0])) + +(Hmul64 x y) => (Select0 (MULV x y)) +(Hmul64u x y) => (Select0 (MULVU x y)) +(Hmul32 x y) => (SRAVconst (Select1 <typ.Int64> (MULV (SignExt32to64 x) (SignExt32to64 y))) [32]) +(Hmul32u x y) => (SRLVconst (Select1 <typ.UInt64> (MULVU (ZeroExt32to64 x) (ZeroExt32to64 y))) [32]) + +(Div64 x y) => (Select1 (DIVV x y)) +(Div64u x y) => (Select1 (DIVVU x y)) +(Div32 x y) => (Select1 (DIVV (SignExt32to64 x) (SignExt32to64 y))) +(Div32u x y) => (Select1 (DIVVU (ZeroExt32to64 x) (ZeroExt32to64 y))) +(Div16 x y) => (Select1 (DIVV (SignExt16to64 x) (SignExt16to64 y))) +(Div16u x y) => (Select1 (DIVVU (ZeroExt16to64 x) (ZeroExt16to64 y))) +(Div8 x y) => (Select1 (DIVV (SignExt8to64 x) (SignExt8to64 y))) +(Div8u x y) => (Select1 (DIVVU (ZeroExt8to64 x) (ZeroExt8to64 y))) +(Div(32|64)F ...) => (DIV(F|D) ...) + +(Mod64 x y) => (Select0 (DIVV x y)) +(Mod64u x y) => (Select0 (DIVVU x y)) +(Mod32 x y) => (Select0 (DIVV (SignExt32to64 x) (SignExt32to64 y))) +(Mod32u x y) => (Select0 (DIVVU (ZeroExt32to64 x) (ZeroExt32to64 y))) +(Mod16 x y) => (Select0 (DIVV (SignExt16to64 x) (SignExt16to64 y))) +(Mod16u x y) => (Select0 (DIVVU (ZeroExt16to64 x) (ZeroExt16to64 y))) +(Mod8 x y) => (Select0 (DIVV (SignExt8to64 x) (SignExt8to64 y))) +(Mod8u x y) => (Select0 (DIVVU (ZeroExt8to64 x) (ZeroExt8to64 y))) + +(Select0 <t> (Add64carry x y c)) => (ADDV (ADDV <t> x y) c) +(Select1 <t> (Add64carry x y c)) => + (OR (SGTU <t> x s:(ADDV <t> x y)) (SGTU <t> s (ADDV <t> s c))) + +(Select0 <t> (Sub64borrow x y c)) => (SUBV (SUBV <t> x y) c) +(Select1 <t> (Sub64borrow x y c)) => + (OR (SGTU <t> s:(SUBV <t> x y) x) (SGTU <t> (SUBV <t> s c) s)) + +// (x + y) / 2 with x>=y => (x - y) / 2 + y +(Avg64u <t> x y) => (ADDV (SRLVconst <t> (SUBV <t> x y) [1]) y) + +(And(64|32|16|8) ...) => (AND ...) +(Or(64|32|16|8) ...) => (OR ...) +(Xor(64|32|16|8) ...) => (XOR ...) + +// shifts +// hardware instruction uses only the low 6 bits of the shift +// we compare to 64 to ensure Go semantics for large shifts +(Lsh64x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y)) +(Lsh64x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y))) +(Lsh64x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y))) +(Lsh64x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y))) + +(Lsh32x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y)) +(Lsh32x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y))) +(Lsh32x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y))) +(Lsh32x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y))) + +(Lsh16x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y)) +(Lsh16x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y))) +(Lsh16x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y))) +(Lsh16x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y))) + +(Lsh8x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y)) +(Lsh8x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y))) +(Lsh8x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y))) +(Lsh8x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y))) + +(Rsh64Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> x y)) +(Rsh64Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> x (ZeroExt32to64 y))) +(Rsh64Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> x (ZeroExt16to64 y))) +(Rsh64Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> x (ZeroExt8to64 y))) + +(Rsh32Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> (ZeroExt32to64 x) y)) +(Rsh32Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> (ZeroExt32to64 x) (ZeroExt32to64 y))) +(Rsh32Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> (ZeroExt32to64 x) (ZeroExt16to64 y))) +(Rsh32Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> (ZeroExt32to64 x) (ZeroExt8to64 y))) + +(Rsh16Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> (ZeroExt16to64 x) y)) +(Rsh16Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> (ZeroExt16to64 x) (ZeroExt32to64 y))) +(Rsh16Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> (ZeroExt16to64 x) (ZeroExt16to64 y))) +(Rsh16Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> (ZeroExt16to64 x) (ZeroExt8to64 y))) + +(Rsh8Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> (ZeroExt8to64 x) y)) +(Rsh8Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> (ZeroExt8to64 x) (ZeroExt32to64 y))) +(Rsh8Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> (ZeroExt8to64 x) (ZeroExt16to64 y))) +(Rsh8Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> (ZeroExt8to64 x) (ZeroExt8to64 y))) + +(Rsh64x64 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y)) +(Rsh64x32 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y))) +(Rsh64x16 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y))) +(Rsh64x8 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y))) + +(Rsh32x64 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y)) +(Rsh32x32 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y))) +(Rsh32x16 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y))) +(Rsh32x8 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y))) + +(Rsh16x64 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y)) +(Rsh16x32 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y))) +(Rsh16x16 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y))) +(Rsh16x8 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y))) + +(Rsh8x64 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y)) +(Rsh8x32 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y))) +(Rsh8x16 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y))) +(Rsh8x8 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y))) + +// rotates +(RotateLeft8 <t> x (MOVVconst [c])) => (Or8 (Lsh8x64 <t> x (MOVVconst [c&7])) (Rsh8Ux64 <t> x (MOVVconst [-c&7]))) +(RotateLeft16 <t> x (MOVVconst [c])) => (Or16 (Lsh16x64 <t> x (MOVVconst [c&15])) (Rsh16Ux64 <t> x (MOVVconst [-c&15]))) +(RotateLeft32 x y) => (ROTR x (NEGV <y.Type> y)) +(RotateLeft64 x y) => (ROTRV x (NEGV <y.Type> y)) + +// unary ops +(Neg(64|32|16|8) ...) => (NEGV ...) +(Neg(32|64)F ...) => (NEG(F|D) ...) + +(Com(64|32|16|8) x) => (NOR (MOVVconst [0]) x) + +(Sqrt ...) => (SQRTD ...) +(Sqrt32 ...) => (SQRTF ...) + +// boolean ops -- booleans are represented with 0=false, 1=true +(AndB ...) => (AND ...) +(OrB ...) => (OR ...) +(EqB x y) => (XOR (MOVVconst [1]) (XOR <typ.Bool> x y)) +(NeqB ...) => (XOR ...) +(Not x) => (XORconst [1] x) + +// constants +(Const(64|32|16|8) [val]) => (MOVVconst [int64(val)]) +(Const(32|64)F [val]) => (MOV(F|D)const [float64(val)]) +(ConstNil) => (MOVVconst [0]) +(ConstBool [t]) => (MOVVconst [int64(b2i(t))]) + +(Slicemask <t> x) => (SRAVconst (NEGV <t> x) [63]) + +// truncations +// Because we ignore high parts of registers, truncates are just copies. +(Trunc16to8 ...) => (Copy ...) +(Trunc32to8 ...) => (Copy ...) +(Trunc32to16 ...) => (Copy ...) +(Trunc64to8 ...) => (Copy ...) +(Trunc64to16 ...) => (Copy ...) +(Trunc64to32 ...) => (Copy ...) + +// Zero-/Sign-extensions +(ZeroExt8to16 ...) => (MOVBUreg ...) +(ZeroExt8to32 ...) => (MOVBUreg ...) +(ZeroExt16to32 ...) => (MOVHUreg ...) +(ZeroExt8to64 ...) => (MOVBUreg ...) +(ZeroExt16to64 ...) => (MOVHUreg ...) +(ZeroExt32to64 ...) => (MOVWUreg ...) + +(SignExt8to16 ...) => (MOVBreg ...) +(SignExt8to32 ...) => (MOVBreg ...) +(SignExt16to32 ...) => (MOVHreg ...) +(SignExt8to64 ...) => (MOVBreg ...) +(SignExt16to64 ...) => (MOVHreg ...) +(SignExt32to64 ...) => (MOVWreg ...) + +// float <=> int conversion +(Cvt32to32F ...) => (MOVWF ...) +(Cvt32to64F ...) => (MOVWD ...) +(Cvt64to32F ...) => (MOVVF ...) +(Cvt64to64F ...) => (MOVVD ...) +(Cvt32Fto32 ...) => (TRUNCFW ...) +(Cvt64Fto32 ...) => (TRUNCDW ...) +(Cvt32Fto64 ...) => (TRUNCFV ...) +(Cvt64Fto64 ...) => (TRUNCDV ...) +(Cvt32Fto64F ...) => (MOVFD ...) +(Cvt64Fto32F ...) => (MOVDF ...) + +(CvtBoolToUint8 ...) => (Copy ...) + +(Round(32|64)F ...) => (Copy ...) + +// comparisons +(Eq8 x y) => (SGTU (MOVVconst [1]) (XOR (ZeroExt8to64 x) (ZeroExt8to64 y))) +(Eq16 x y) => (SGTU (MOVVconst [1]) (XOR (ZeroExt16to64 x) (ZeroExt16to64 y))) +(Eq32 x y) => (SGTU (MOVVconst [1]) (XOR (ZeroExt32to64 x) (ZeroExt32to64 y))) +(Eq64 x y) => (SGTU (MOVVconst [1]) (XOR x y)) +(EqPtr x y) => (SGTU (MOVVconst [1]) (XOR x y)) +(Eq(32|64)F x y) => (FPFlagTrue (CMPEQ(F|D) x y)) + +(Neq8 x y) => (SGTU (XOR (ZeroExt8to64 x) (ZeroExt8to64 y)) (MOVVconst [0])) +(Neq16 x y) => (SGTU (XOR (ZeroExt16to32 x) (ZeroExt16to64 y)) (MOVVconst [0])) +(Neq32 x y) => (SGTU (XOR (ZeroExt32to64 x) (ZeroExt32to64 y)) (MOVVconst [0])) +(Neq64 x y) => (SGTU (XOR x y) (MOVVconst [0])) +(NeqPtr x y) => (SGTU (XOR x y) (MOVVconst [0])) +(Neq(32|64)F x y) => (FPFlagFalse (CMPEQ(F|D) x y)) + +(Less8 x y) => (SGT (SignExt8to64 y) (SignExt8to64 x)) +(Less16 x y) => (SGT (SignExt16to64 y) (SignExt16to64 x)) +(Less32 x y) => (SGT (SignExt32to64 y) (SignExt32to64 x)) +(Less64 x y) => (SGT y x) +(Less(32|64)F x y) => (FPFlagTrue (CMPGT(F|D) y x)) // reverse operands to work around NaN + +(Less8U x y) => (SGTU (ZeroExt8to64 y) (ZeroExt8to64 x)) +(Less16U x y) => (SGTU (ZeroExt16to64 y) (ZeroExt16to64 x)) +(Less32U x y) => (SGTU (ZeroExt32to64 y) (ZeroExt32to64 x)) +(Less64U x y) => (SGTU y x) + +(Leq8 x y) => (XOR (MOVVconst [1]) (SGT (SignExt8to64 x) (SignExt8to64 y))) +(Leq16 x y) => (XOR (MOVVconst [1]) (SGT (SignExt16to64 x) (SignExt16to64 y))) +(Leq32 x y) => (XOR (MOVVconst [1]) (SGT (SignExt32to64 x) (SignExt32to64 y))) +(Leq64 x y) => (XOR (MOVVconst [1]) (SGT x y)) +(Leq(32|64)F x y) => (FPFlagTrue (CMPGE(F|D) y x)) // reverse operands to work around NaN + +(Leq8U x y) => (XOR (MOVVconst [1]) (SGTU (ZeroExt8to64 x) (ZeroExt8to64 y))) +(Leq16U x y) => (XOR (MOVVconst [1]) (SGTU (ZeroExt16to64 x) (ZeroExt16to64 y))) +(Leq32U x y) => (XOR (MOVVconst [1]) (SGTU (ZeroExt32to64 x) (ZeroExt32to64 y))) +(Leq64U x y) => (XOR (MOVVconst [1]) (SGTU x y)) + +(OffPtr [off] ptr:(SP)) => (MOVVaddr [int32(off)] ptr) +(OffPtr [off] ptr) => (ADDVconst [off] ptr) + +(Addr {sym} base) => (MOVVaddr {sym} base) +(LocalAddr {sym} base _) => (MOVVaddr {sym} base) + +// loads +(Load <t> ptr mem) && t.IsBoolean() => (MOVBUload ptr mem) +(Load <t> ptr mem) && (is8BitInt(t) && isSigned(t)) => (MOVBload ptr mem) +(Load <t> ptr mem) && (is8BitInt(t) && !isSigned(t)) => (MOVBUload ptr mem) +(Load <t> ptr mem) && (is16BitInt(t) && isSigned(t)) => (MOVHload ptr mem) +(Load <t> ptr mem) && (is16BitInt(t) && !isSigned(t)) => (MOVHUload ptr mem) +(Load <t> ptr mem) && (is32BitInt(t) && isSigned(t)) => (MOVWload ptr mem) +(Load <t> ptr mem) && (is32BitInt(t) && !isSigned(t)) => (MOVWUload ptr mem) +(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVVload ptr mem) +(Load <t> ptr mem) && is32BitFloat(t) => (MOVFload ptr mem) +(Load <t> ptr mem) && is64BitFloat(t) => (MOVDload ptr mem) + +// stores +(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 4 && !is32BitFloat(val.Type) => (MOVWstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 8 && !is64BitFloat(val.Type) => (MOVVstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (MOVFstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (MOVDstore ptr val mem) + +// zeroing +(Zero [0] _ mem) => mem +(Zero [1] ptr mem) => (MOVBstore ptr (MOVVconst [0]) mem) +(Zero [2] {t} ptr mem) && t.Alignment()%2 == 0 => + (MOVHstore ptr (MOVVconst [0]) mem) +(Zero [2] ptr mem) => + (MOVBstore [1] ptr (MOVVconst [0]) + (MOVBstore [0] ptr (MOVVconst [0]) mem)) +(Zero [4] {t} ptr mem) && t.Alignment()%4 == 0 => + (MOVWstore ptr (MOVVconst [0]) mem) +(Zero [4] {t} ptr mem) && t.Alignment()%2 == 0 => + (MOVHstore [2] ptr (MOVVconst [0]) + (MOVHstore [0] ptr (MOVVconst [0]) mem)) +(Zero [4] ptr mem) => + (MOVBstore [3] ptr (MOVVconst [0]) + (MOVBstore [2] ptr (MOVVconst [0]) + (MOVBstore [1] ptr (MOVVconst [0]) + (MOVBstore [0] ptr (MOVVconst [0]) mem)))) +(Zero [8] {t} ptr mem) && t.Alignment()%8 == 0 => + (MOVVstore ptr (MOVVconst [0]) mem) +(Zero [8] {t} ptr mem) && t.Alignment()%4 == 0 => + (MOVWstore [4] ptr (MOVVconst [0]) + (MOVWstore [0] ptr (MOVVconst [0]) mem)) +(Zero [8] {t} ptr mem) && t.Alignment()%2 == 0 => + (MOVHstore [6] ptr (MOVVconst [0]) + (MOVHstore [4] ptr (MOVVconst [0]) + (MOVHstore [2] ptr (MOVVconst [0]) + (MOVHstore [0] ptr (MOVVconst [0]) mem)))) + +(Zero [3] ptr mem) => + (MOVBstore [2] ptr (MOVVconst [0]) + (MOVBstore [1] ptr (MOVVconst [0]) + (MOVBstore [0] ptr (MOVVconst [0]) mem))) +(Zero [6] {t} ptr mem) && t.Alignment()%2 == 0 => + (MOVHstore [4] ptr (MOVVconst [0]) + (MOVHstore [2] ptr (MOVVconst [0]) + (MOVHstore [0] ptr (MOVVconst [0]) mem))) +(Zero [12] {t} ptr mem) && t.Alignment()%4 == 0 => + (MOVWstore [8] ptr (MOVVconst [0]) + (MOVWstore [4] ptr (MOVVconst [0]) + (MOVWstore [0] ptr (MOVVconst [0]) mem))) +(Zero [16] {t} ptr mem) && t.Alignment()%8 == 0 => + (MOVVstore [8] ptr (MOVVconst [0]) + (MOVVstore [0] ptr (MOVVconst [0]) mem)) +(Zero [24] {t} ptr mem) && t.Alignment()%8 == 0 => + (MOVVstore [16] ptr (MOVVconst [0]) + (MOVVstore [8] ptr (MOVVconst [0]) + (MOVVstore [0] ptr (MOVVconst [0]) mem))) + +// medium zeroing uses a duff device +// 8, and 128 are magic constants, see runtime/mkduff.go +(Zero [s] {t} ptr mem) + && s%8 == 0 && s > 24 && s <= 8*128 + && t.Alignment()%8 == 0 && !config.noDuffDevice => + (DUFFZERO [8 * (128 - s/8)] ptr mem) + +// large or unaligned zeroing uses a loop +(Zero [s] {t} ptr mem) + && (s > 8*128 || config.noDuffDevice) || t.Alignment()%8 != 0 => + (LoweredZero [t.Alignment()] + ptr + (ADDVconst <ptr.Type> ptr [s-moveSize(t.Alignment(), config)]) + mem) + +// moves +(Move [0] _ _ mem) => mem +(Move [1] dst src mem) => (MOVBstore dst (MOVBload src mem) mem) +(Move [2] {t} dst src mem) && t.Alignment()%2 == 0 => + (MOVHstore dst (MOVHload src mem) mem) +(Move [2] dst src mem) => + (MOVBstore [1] dst (MOVBload [1] src mem) + (MOVBstore dst (MOVBload src mem) mem)) +(Move [4] {t} dst src mem) && t.Alignment()%4 == 0 => + (MOVWstore dst (MOVWload src mem) mem) +(Move [4] {t} dst src mem) && t.Alignment()%2 == 0 => + (MOVHstore [2] dst (MOVHload [2] src mem) + (MOVHstore dst (MOVHload src mem) mem)) +(Move [4] dst src mem) => + (MOVBstore [3] dst (MOVBload [3] src mem) + (MOVBstore [2] dst (MOVBload [2] src mem) + (MOVBstore [1] dst (MOVBload [1] src mem) + (MOVBstore dst (MOVBload src mem) mem)))) +(Move [8] {t} dst src mem) && t.Alignment()%8 == 0 => + (MOVVstore dst (MOVVload src mem) mem) +(Move [8] {t} dst src mem) && t.Alignment()%4 == 0 => + (MOVWstore [4] dst (MOVWload [4] src mem) + (MOVWstore dst (MOVWload src mem) mem)) +(Move [8] {t} dst src mem) && t.Alignment()%2 == 0 => + (MOVHstore [6] dst (MOVHload [6] src mem) + (MOVHstore [4] dst (MOVHload [4] src mem) + (MOVHstore [2] dst (MOVHload [2] src mem) + (MOVHstore dst (MOVHload src mem) mem)))) + +(Move [3] dst src mem) => + (MOVBstore [2] dst (MOVBload [2] src mem) + (MOVBstore [1] dst (MOVBload [1] src mem) + (MOVBstore dst (MOVBload src mem) mem))) +(Move [6] {t} dst src mem) && t.Alignment()%2 == 0 => + (MOVHstore [4] dst (MOVHload [4] src mem) + (MOVHstore [2] dst (MOVHload [2] src mem) + (MOVHstore dst (MOVHload src mem) mem))) +(Move [12] {t} dst src mem) && t.Alignment()%4 == 0 => + (MOVWstore [8] dst (MOVWload [8] src mem) + (MOVWstore [4] dst (MOVWload [4] src mem) + (MOVWstore dst (MOVWload src mem) mem))) +(Move [16] {t} dst src mem) && t.Alignment()%8 == 0 => + (MOVVstore [8] dst (MOVVload [8] src mem) + (MOVVstore dst (MOVVload src mem) mem)) +(Move [24] {t} dst src mem) && t.Alignment()%8 == 0 => + (MOVVstore [16] dst (MOVVload [16] src mem) + (MOVVstore [8] dst (MOVVload [8] src mem) + (MOVVstore dst (MOVVload src mem) mem))) + +// medium move uses a duff device +(Move [s] {t} dst src mem) + && s%8 == 0 && s >= 24 && s <= 8*128 && t.Alignment()%8 == 0 + && !config.noDuffDevice && logLargeCopy(v, s) => + (DUFFCOPY [16 * (128 - s/8)] dst src mem) +// 16 and 128 are magic constants. 16 is the number of bytes to encode: +// MOVV (R1), R23 +// ADDV $8, R1 +// MOVV R23, (R2) +// ADDV $8, R2 +// and 128 is the number of such blocks. See runtime/duff_mips64.s:duffcopy. + +// large or unaligned move uses a loop +(Move [s] {t} dst src mem) + && s > 24 && logLargeCopy(v, s) || t.Alignment()%8 != 0 => + (LoweredMove [t.Alignment()] + dst + src + (ADDVconst <src.Type> src [s-moveSize(t.Alignment(), config)]) + mem) + +// calls +(StaticCall ...) => (CALLstatic ...) +(ClosureCall ...) => (CALLclosure ...) +(InterCall ...) => (CALLinter ...) +(TailCall ...) => (CALLtail ...) + +// atomic intrinsics +(AtomicLoad(8|32|64) ...) => (LoweredAtomicLoad(8|32|64) ...) +(AtomicLoadPtr ...) => (LoweredAtomicLoad64 ...) + +(AtomicStore(8|32|64) ...) => (LoweredAtomicStore(8|32|64) ...) +(AtomicStorePtrNoWB ...) => (LoweredAtomicStore64 ...) + +(AtomicExchange(32|64) ...) => (LoweredAtomicExchange(32|64) ...) + +(AtomicAdd(32|64) ...) => (LoweredAtomicAdd(32|64) ...) + +(AtomicCompareAndSwap32 ptr old new mem) => (LoweredAtomicCas32 ptr (SignExt32to64 old) new mem) +(AtomicCompareAndSwap64 ...) => (LoweredAtomicCas64 ...) + +// checks +(NilCheck ...) => (LoweredNilCheck ...) +(IsNonNil ptr) => (SGTU ptr (MOVVconst [0])) +(IsInBounds idx len) => (SGTU len idx) +(IsSliceInBounds idx len) => (XOR (MOVVconst [1]) (SGTU idx len)) + +// pseudo-ops +(GetClosurePtr ...) => (LoweredGetClosurePtr ...) +(GetCallerSP ...) => (LoweredGetCallerSP ...) +(GetCallerPC ...) => (LoweredGetCallerPC ...) + +(If cond yes no) => (NE cond yes no) + +// Write barrier. +(WB ...) => (LoweredWB ...) + +(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem) +(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem) +(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem) + +(CondSelect <t> x y cond) => (OR (MASKEQZ <t> x cond) (MASKNEZ <t> y cond)) + +// Optimizations + +// Absorb boolean tests into block +(NE (FPFlagTrue cmp) yes no) => (FPT cmp yes no) +(NE (FPFlagFalse cmp) yes no) => (FPF cmp yes no) +(EQ (FPFlagTrue cmp) yes no) => (FPF cmp yes no) +(EQ (FPFlagFalse cmp) yes no) => (FPT cmp yes no) +(NE (XORconst [1] cmp:(SGT _ _)) yes no) => (EQ cmp yes no) +(NE (XORconst [1] cmp:(SGTU _ _)) yes no) => (EQ cmp yes no) +(NE (XORconst [1] cmp:(SGTconst _)) yes no) => (EQ cmp yes no) +(NE (XORconst [1] cmp:(SGTUconst _)) yes no) => (EQ cmp yes no) +(EQ (XORconst [1] cmp:(SGT _ _)) yes no) => (NE cmp yes no) +(EQ (XORconst [1] cmp:(SGTU _ _)) yes no) => (NE cmp yes no) +(EQ (XORconst [1] cmp:(SGTconst _)) yes no) => (NE cmp yes no) +(EQ (XORconst [1] cmp:(SGTUconst _)) yes no) => (NE cmp yes no) +(NE (SGTUconst [1] x) yes no) => (EQ x yes no) +(EQ (SGTUconst [1] x) yes no) => (NE x yes no) +(NE (SGTU x (MOVVconst [0])) yes no) => (NE x yes no) +(EQ (SGTU x (MOVVconst [0])) yes no) => (EQ x yes no) +(NE (SGTconst [0] x) yes no) => (LTZ x yes no) +(EQ (SGTconst [0] x) yes no) => (GEZ x yes no) +(NE (SGT x (MOVVconst [0])) yes no) => (GTZ x yes no) +(EQ (SGT x (MOVVconst [0])) yes no) => (LEZ x yes no) + +// fold offset into address +(ADDVconst [off1] (MOVVaddr [off2] {sym} ptr)) && is32Bit(off1+int64(off2)) => (MOVVaddr [int32(off1)+int32(off2)] {sym} ptr) + +// fold address into load/store +(MOVBload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVBload [off1+int32(off2)] {sym} ptr mem) +(MOVBUload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVBUload [off1+int32(off2)] {sym} ptr mem) +(MOVHload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVHload [off1+int32(off2)] {sym} ptr mem) +(MOVHUload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVHUload [off1+int32(off2)] {sym} ptr mem) +(MOVWload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVWload [off1+int32(off2)] {sym} ptr mem) +(MOVWUload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVWUload [off1+int32(off2)] {sym} ptr mem) +(MOVVload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVVload [off1+int32(off2)] {sym} ptr mem) +(MOVFload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVFload [off1+int32(off2)] {sym} ptr mem) +(MOVDload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVDload [off1+int32(off2)] {sym} ptr mem) + +(MOVBstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVBstore [off1+int32(off2)] {sym} ptr val mem) +(MOVHstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVHstore [off1+int32(off2)] {sym} ptr val mem) +(MOVWstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVWstore [off1+int32(off2)] {sym} ptr val mem) +(MOVVstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVVstore [off1+int32(off2)] {sym} ptr val mem) +(MOVFstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVFstore [off1+int32(off2)] {sym} ptr val mem) +(MOVDstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVDstore [off1+int32(off2)] {sym} ptr val mem) +(MOVBstorezero [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVBstorezero [off1+int32(off2)] {sym} ptr mem) +(MOVHstorezero [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVHstorezero [off1+int32(off2)] {sym} ptr mem) +(MOVWstorezero [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVWstorezero [off1+int32(off2)] {sym} ptr mem) +(MOVVstorezero [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVVstorezero [off1+int32(off2)] {sym} ptr mem) + +(MOVBload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVBload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) +(MOVBUload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVBUload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) +(MOVHload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVHload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) +(MOVHUload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVHUload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) +(MOVWload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVWload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) +(MOVWUload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVWUload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) +(MOVVload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVVload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) +(MOVFload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVFload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) +(MOVDload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVDload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) + +(MOVBstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVBstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem) +(MOVHstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVHstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem) +(MOVWstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVWstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem) +(MOVVstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVVstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem) +(MOVFstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVFstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem) +(MOVDstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVDstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem) +(MOVBstorezero [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVBstorezero [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) +(MOVHstorezero [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVHstorezero [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) +(MOVWstorezero [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVWstorezero [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) +(MOVVstorezero [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVVstorezero [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) + +(LoweredAtomicStore(32|64) ptr (MOVVconst [0]) mem) => (LoweredAtomicStorezero(32|64) ptr mem) +(LoweredAtomicAdd32 ptr (MOVVconst [c]) mem) && is32Bit(c) => (LoweredAtomicAddconst32 [int32(c)] ptr mem) +(LoweredAtomicAdd64 ptr (MOVVconst [c]) mem) && is32Bit(c) => (LoweredAtomicAddconst64 [c] ptr mem) + +// don't extend after proper load +(MOVBreg x:(MOVBload _ _)) => (MOVVreg x) +(MOVBUreg x:(MOVBUload _ _)) => (MOVVreg x) +(MOVHreg x:(MOVBload _ _)) => (MOVVreg x) +(MOVHreg x:(MOVBUload _ _)) => (MOVVreg x) +(MOVHreg x:(MOVHload _ _)) => (MOVVreg x) +(MOVHUreg x:(MOVBUload _ _)) => (MOVVreg x) +(MOVHUreg x:(MOVHUload _ _)) => (MOVVreg x) +(MOVWreg x:(MOVBload _ _)) => (MOVVreg x) +(MOVWreg x:(MOVBUload _ _)) => (MOVVreg x) +(MOVWreg x:(MOVHload _ _)) => (MOVVreg x) +(MOVWreg x:(MOVHUload _ _)) => (MOVVreg x) +(MOVWreg x:(MOVWload _ _)) => (MOVVreg x) +(MOVWUreg x:(MOVBUload _ _)) => (MOVVreg x) +(MOVWUreg x:(MOVHUload _ _)) => (MOVVreg x) +(MOVWUreg x:(MOVWUload _ _)) => (MOVVreg x) + +// fold double extensions +(MOVBreg x:(MOVBreg _)) => (MOVVreg x) +(MOVBUreg x:(MOVBUreg _)) => (MOVVreg x) +(MOVHreg x:(MOVBreg _)) => (MOVVreg x) +(MOVHreg x:(MOVBUreg _)) => (MOVVreg x) +(MOVHreg x:(MOVHreg _)) => (MOVVreg x) +(MOVHUreg x:(MOVBUreg _)) => (MOVVreg x) +(MOVHUreg x:(MOVHUreg _)) => (MOVVreg x) +(MOVWreg x:(MOVBreg _)) => (MOVVreg x) +(MOVWreg x:(MOVBUreg _)) => (MOVVreg x) +(MOVWreg x:(MOVHreg _)) => (MOVVreg x) +(MOVWreg x:(MOVWreg _)) => (MOVVreg x) +(MOVWUreg x:(MOVBUreg _)) => (MOVVreg x) +(MOVWUreg x:(MOVHUreg _)) => (MOVVreg x) +(MOVWUreg x:(MOVWUreg _)) => (MOVVreg x) + +// don't extend before store +(MOVBstore [off] {sym} ptr (MOVBreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVBUreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVHreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVWreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOVHreg x) mem) => (MOVHstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVHstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOVWreg x) mem) => (MOVHstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVHstore [off] {sym} ptr x mem) +(MOVWstore [off] {sym} ptr (MOVWreg x) mem) => (MOVWstore [off] {sym} ptr x mem) +(MOVWstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVWstore [off] {sym} ptr x mem) + +// if a register move has only 1 use, just use the same register without emitting instruction +// MOVVnop doesn't emit instruction, only for ensuring the type. +(MOVVreg x) && x.Uses == 1 => (MOVVnop x) + +// fold constant into arithmetic ops +(ADDV x (MOVVconst [c])) && is32Bit(c) => (ADDVconst [c] x) +(SUBV x (MOVVconst [c])) && is32Bit(c) => (SUBVconst [c] x) +(AND x (MOVVconst [c])) && is32Bit(c) => (ANDconst [c] x) +(OR x (MOVVconst [c])) && is32Bit(c) => (ORconst [c] x) +(XOR x (MOVVconst [c])) && is32Bit(c) => (XORconst [c] x) +(NOR x (MOVVconst [c])) && is32Bit(c) => (NORconst [c] x) + +(SLLV _ (MOVVconst [c])) && uint64(c)>=64 => (MOVVconst [0]) +(SRLV _ (MOVVconst [c])) && uint64(c)>=64 => (MOVVconst [0]) +(SRAV x (MOVVconst [c])) && uint64(c)>=64 => (SRAVconst x [63]) +(SLLV x (MOVVconst [c])) => (SLLVconst x [c]) +(SRLV x (MOVVconst [c])) => (SRLVconst x [c]) +(SRAV x (MOVVconst [c])) => (SRAVconst x [c]) +(ROTR x (MOVVconst [c])) => (ROTRconst x [c&31]) +(ROTRV x (MOVVconst [c])) => (ROTRVconst x [c&63]) + +(SGT (MOVVconst [c]) x) && is32Bit(c) => (SGTconst [c] x) +(SGTU (MOVVconst [c]) x) && is32Bit(c) => (SGTUconst [c] x) + +// mul by constant +(Select1 (MULVU x (MOVVconst [-1]))) => (NEGV x) +(Select1 (MULVU _ (MOVVconst [0]))) => (MOVVconst [0]) +(Select1 (MULVU x (MOVVconst [1]))) => x +(Select1 (MULVU x (MOVVconst [c]))) && isPowerOfTwo64(c) => (SLLVconst [log64(c)] x) + +// div by constant +(Select1 (DIVVU x (MOVVconst [1]))) => x +(Select1 (DIVVU x (MOVVconst [c]))) && isPowerOfTwo64(c) => (SRLVconst [log64(c)] x) +(Select0 (DIVVU _ (MOVVconst [1]))) => (MOVVconst [0]) // mod +(Select0 (DIVVU x (MOVVconst [c]))) && isPowerOfTwo64(c) => (ANDconst [c-1] x) // mod + +// generic simplifications +(ADDV x (NEGV y)) => (SUBV x y) +(SUBV x x) => (MOVVconst [0]) +(SUBV (MOVVconst [0]) x) => (NEGV x) +(AND x x) => x +(OR x x) => x +(XOR x x) => (MOVVconst [0]) + +// remove redundant *const ops +(ADDVconst [0] x) => x +(SUBVconst [0] x) => x +(ANDconst [0] _) => (MOVVconst [0]) +(ANDconst [-1] x) => x +(ORconst [0] x) => x +(ORconst [-1] _) => (MOVVconst [-1]) +(XORconst [0] x) => x +(XORconst [-1] x) => (NORconst [0] x) +(MASKEQZ (MOVVconst [0]) cond) => (MOVVconst [0]) +(MASKNEZ (MOVVconst [0]) cond) => (MOVVconst [0]) + +// generic constant folding +(ADDVconst [c] (MOVVconst [d])) => (MOVVconst [c+d]) +(ADDVconst [c] (ADDVconst [d] x)) && is32Bit(c+d) => (ADDVconst [c+d] x) +(ADDVconst [c] (SUBVconst [d] x)) && is32Bit(c-d) => (ADDVconst [c-d] x) +(SUBVconst [c] (MOVVconst [d])) => (MOVVconst [d-c]) +(SUBVconst [c] (SUBVconst [d] x)) && is32Bit(-c-d) => (ADDVconst [-c-d] x) +(SUBVconst [c] (ADDVconst [d] x)) && is32Bit(-c+d) => (ADDVconst [-c+d] x) +(SLLVconst [c] (MOVVconst [d])) => (MOVVconst [d<<uint64(c)]) +(SRLVconst [c] (MOVVconst [d])) => (MOVVconst [int64(uint64(d)>>uint64(c))]) +(SRAVconst [c] (MOVVconst [d])) => (MOVVconst [d>>uint64(c)]) +(Select1 (MULVU (MOVVconst [c]) (MOVVconst [d]))) => (MOVVconst [c*d]) +(Select1 (DIVV (MOVVconst [c]) (MOVVconst [d]))) && d != 0 => (MOVVconst [c/d]) +(Select1 (DIVVU (MOVVconst [c]) (MOVVconst [d]))) && d != 0 => (MOVVconst [int64(uint64(c)/uint64(d))]) +(Select0 (DIVV (MOVVconst [c]) (MOVVconst [d]))) && d != 0 => (MOVVconst [c%d]) // mod +(Select0 (DIVVU (MOVVconst [c]) (MOVVconst [d]))) && d != 0 => (MOVVconst [int64(uint64(c)%uint64(d))]) // mod +(ANDconst [c] (MOVVconst [d])) => (MOVVconst [c&d]) +(ANDconst [c] (ANDconst [d] x)) => (ANDconst [c&d] x) +(ORconst [c] (MOVVconst [d])) => (MOVVconst [c|d]) +(ORconst [c] (ORconst [d] x)) && is32Bit(c|d) => (ORconst [c|d] x) +(XORconst [c] (MOVVconst [d])) => (MOVVconst [c^d]) +(XORconst [c] (XORconst [d] x)) && is32Bit(c^d) => (XORconst [c^d] x) +(NORconst [c] (MOVVconst [d])) => (MOVVconst [^(c|d)]) +(NEGV (MOVVconst [c])) => (MOVVconst [-c]) +(MOVBreg (MOVVconst [c])) => (MOVVconst [int64(int8(c))]) +(MOVBUreg (MOVVconst [c])) => (MOVVconst [int64(uint8(c))]) +(MOVHreg (MOVVconst [c])) => (MOVVconst [int64(int16(c))]) +(MOVHUreg (MOVVconst [c])) => (MOVVconst [int64(uint16(c))]) +(MOVWreg (MOVVconst [c])) => (MOVVconst [int64(int32(c))]) +(MOVWUreg (MOVVconst [c])) => (MOVVconst [int64(uint32(c))]) +(MOVVreg (MOVVconst [c])) => (MOVVconst [c]) + +// constant comparisons +(SGTconst [c] (MOVVconst [d])) && c>d => (MOVVconst [1]) +(SGTconst [c] (MOVVconst [d])) && c<=d => (MOVVconst [0]) +(SGTUconst [c] (MOVVconst [d])) && uint64(c)>uint64(d) => (MOVVconst [1]) +(SGTUconst [c] (MOVVconst [d])) && uint64(c)<=uint64(d) => (MOVVconst [0]) + +// other known comparisons +(SGTconst [c] (MOVBreg _)) && 0x7f < c => (MOVVconst [1]) +(SGTconst [c] (MOVBreg _)) && c <= -0x80 => (MOVVconst [0]) +(SGTconst [c] (MOVBUreg _)) && 0xff < c => (MOVVconst [1]) +(SGTconst [c] (MOVBUreg _)) && c < 0 => (MOVVconst [0]) +(SGTUconst [c] (MOVBUreg _)) && 0xff < uint64(c) => (MOVVconst [1]) +(SGTconst [c] (MOVHreg _)) && 0x7fff < c => (MOVVconst [1]) +(SGTconst [c] (MOVHreg _)) && c <= -0x8000 => (MOVVconst [0]) +(SGTconst [c] (MOVHUreg _)) && 0xffff < c => (MOVVconst [1]) +(SGTconst [c] (MOVHUreg _)) && c < 0 => (MOVVconst [0]) +(SGTUconst [c] (MOVHUreg _)) && 0xffff < uint64(c) => (MOVVconst [1]) +(SGTconst [c] (MOVWUreg _)) && c < 0 => (MOVVconst [0]) +(SGTconst [c] (ANDconst [m] _)) && 0 <= m && m < c => (MOVVconst [1]) +(SGTUconst [c] (ANDconst [m] _)) && uint64(m) < uint64(c) => (MOVVconst [1]) +(SGTconst [c] (SRLVconst _ [d])) && 0 <= c && 0 < d && d <= 63 && 0xffffffffffffffff>>uint64(d) < uint64(c) => (MOVVconst [1]) +(SGTUconst [c] (SRLVconst _ [d])) && 0 < d && d <= 63 && 0xffffffffffffffff>>uint64(d) < uint64(c) => (MOVVconst [1]) + +// absorb constants into branches +(EQ (MOVVconst [0]) yes no) => (First yes no) +(EQ (MOVVconst [c]) yes no) && c != 0 => (First no yes) +(NE (MOVVconst [0]) yes no) => (First no yes) +(NE (MOVVconst [c]) yes no) && c != 0 => (First yes no) +(LTZ (MOVVconst [c]) yes no) && c < 0 => (First yes no) +(LTZ (MOVVconst [c]) yes no) && c >= 0 => (First no yes) +(LEZ (MOVVconst [c]) yes no) && c <= 0 => (First yes no) +(LEZ (MOVVconst [c]) yes no) && c > 0 => (First no yes) +(GTZ (MOVVconst [c]) yes no) && c > 0 => (First yes no) +(GTZ (MOVVconst [c]) yes no) && c <= 0 => (First no yes) +(GEZ (MOVVconst [c]) yes no) && c >= 0 => (First yes no) +(GEZ (MOVVconst [c]) yes no) && c < 0 => (First no yes) + +// SGT/SGTU with known outcomes. +(SGT x x) => (MOVVconst [0]) +(SGTU x x) => (MOVVconst [0]) diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go b/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go new file mode 100644 index 0000000..22a83fb --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go @@ -0,0 +1,484 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import "strings" + +// Notes: +// - Integer types live in the low portion of registers. Upper portions are junk. +// - Boolean types use the low-order byte of a register. 0=false, 1=true. +// Upper bytes are junk. +// - *const instructions may use a constant larger than the instruction can encode. +// In this case the assembler expands to multiple instructions and uses tmp +// register (R23). + +// Suffixes encode the bit width of various instructions. +// V (vlong) = 64 bit +// WU (word) = 32 bit unsigned +// W (word) = 32 bit +// H (half word) = 16 bit +// HU = 16 bit unsigned +// B (byte) = 8 bit +// BU = 8 bit unsigned +// F (float) = 32 bit float +// D (double) = 64 bit float + +// Note: registers not used in regalloc are not included in this list, +// so that regmask stays within int64 +// Be careful when hand coding regmasks. +var regNamesLOONG64 = []string{ + "R0", // constant 0 + "R1", + "SP", // aka R3 + "R4", + "R5", + "R6", + "R7", + "R8", + "R9", + "R10", + "R11", + "R12", + "R13", + "R14", + "R15", + "R16", + "R17", + "R18", + "R19", + "R20", + "R21", + "g", // aka R22 + "R23", + "R24", + "R25", + "R26", + "R27", + "R28", + "R29", + // R30 is REGTMP not used in regalloc + "R31", + + "F0", + "F1", + "F2", + "F3", + "F4", + "F5", + "F6", + "F7", + "F8", + "F9", + "F10", + "F11", + "F12", + "F13", + "F14", + "F15", + "F16", + "F17", + "F18", + "F19", + "F20", + "F21", + "F22", + "F23", + "F24", + "F25", + "F26", + "F27", + "F28", + "F29", + "F30", + "F31", + + // If you add registers, update asyncPreempt in runtime. + + // pseudo-registers + "SB", +} + +func init() { + // Make map from reg names to reg integers. + if len(regNamesLOONG64) > 64 { + panic("too many registers") + } + num := map[string]int{} + for i, name := range regNamesLOONG64 { + num[name] = i + } + buildReg := func(s string) regMask { + m := regMask(0) + for _, r := range strings.Split(s, " ") { + if n, ok := num[r]; ok { + m |= regMask(1) << uint(n) + continue + } + panic("register " + r + " not found") + } + return m + } + + // Common individual register masks + var ( + gp = buildReg("R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R23 R24 R25 R26 R27 R28 R29 R31") // R1 is LR, R2 is thread pointer, R3 is stack pointer, R21-unused, R22 is g, R30 is REGTMP + gps = buildReg("R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R19 R20 R23 R24 R25 R26 R27 R28 R29 R31") | buildReg("g") + gpg = gp | buildReg("g") + gpsp = gp | buildReg("SP") + gpspg = gpg | buildReg("SP") + gpspsbg = gpspg | buildReg("SB") + fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31") + callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g + r1 = buildReg("R19") + r2 = buildReg("R18") + r3 = buildReg("R17") + r4 = buildReg("R4") + ) + // Common regInfo + var ( + gp01 = regInfo{inputs: nil, outputs: []regMask{gp}} + gp11 = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}} + gp11sp = regInfo{inputs: []regMask{gpspg}, outputs: []regMask{gp}} + gp21 = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp}} + gp22 = regInfo{inputs: []regMask{gps, gps}, outputs: []regMask{gp, gp}} + gpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}} + gpstore = regInfo{inputs: []regMask{gpspsbg, gpg}} + gpstore0 = regInfo{inputs: []regMask{gpspsbg}} + gpxchg = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}} + gpcas = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}, outputs: []regMask{gp}} + fp01 = regInfo{inputs: nil, outputs: []regMask{fp}} + fp11 = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}} + fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}} + fp2flags = regInfo{inputs: []regMask{fp, fp}} + fpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{fp}} + fpstore = regInfo{inputs: []regMask{gpspsbg, fp}} + readflags = regInfo{inputs: nil, outputs: []regMask{gp}} + ) + ops := []opData{ + // binary ops + {name: "ADDV", argLength: 2, reg: gp21, asm: "ADDVU", commutative: true}, // arg0 + arg1 + {name: "ADDVconst", argLength: 1, reg: gp11sp, asm: "ADDVU", aux: "Int64"}, // arg0 + auxInt. auxInt is 32-bit, also in other *const ops. + {name: "SUBV", argLength: 2, reg: gp21, asm: "SUBVU"}, // arg0 - arg1 + {name: "SUBVconst", argLength: 1, reg: gp11, asm: "SUBVU", aux: "Int64"}, // arg0 - auxInt + + {name: "MULV", argLength: 2, reg: gp22, resultNotInArgs: true, commutative: true, typ: "(Int64,Int64)"}, // arg0 * arg1, signed + {name: "MULVU", argLength: 2, reg: gp22, resultNotInArgs: true, commutative: true, typ: "(UInt64,UInt64)"}, // arg0 * arg1, unsigned + {name: "DIVV", argLength: 2, reg: gp22, resultNotInArgs: true, typ: "(Int64,Int64)"}, // arg0 / arg1, signed + {name: "DIVVU", argLength: 2, reg: gp22, resultNotInArgs: true, typ: "(UInt64,UInt64)"}, // arg0 / arg1, unsigned + + {name: "ADDF", argLength: 2, reg: fp21, asm: "ADDF", commutative: true}, // arg0 + arg1 + {name: "ADDD", argLength: 2, reg: fp21, asm: "ADDD", commutative: true}, // arg0 + arg1 + {name: "SUBF", argLength: 2, reg: fp21, asm: "SUBF"}, // arg0 - arg1 + {name: "SUBD", argLength: 2, reg: fp21, asm: "SUBD"}, // arg0 - arg1 + {name: "MULF", argLength: 2, reg: fp21, asm: "MULF", commutative: true}, // arg0 * arg1 + {name: "MULD", argLength: 2, reg: fp21, asm: "MULD", commutative: true}, // arg0 * arg1 + {name: "DIVF", argLength: 2, reg: fp21, asm: "DIVF"}, // arg0 / arg1 + {name: "DIVD", argLength: 2, reg: fp21, asm: "DIVD"}, // arg0 / arg1 + + {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1 + {name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int64"}, // arg0 & auxInt + {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1 + {name: "ORconst", argLength: 1, reg: gp11, asm: "OR", aux: "Int64"}, // arg0 | auxInt + {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true, typ: "UInt64"}, // arg0 ^ arg1 + {name: "XORconst", argLength: 1, reg: gp11, asm: "XOR", aux: "Int64", typ: "UInt64"}, // arg0 ^ auxInt + {name: "NOR", argLength: 2, reg: gp21, asm: "NOR", commutative: true}, // ^(arg0 | arg1) + {name: "NORconst", argLength: 1, reg: gp11, asm: "NOR", aux: "Int64"}, // ^(arg0 | auxInt) + + {name: "NEGV", argLength: 1, reg: gp11}, // -arg0 + {name: "NEGF", argLength: 1, reg: fp11, asm: "NEGF"}, // -arg0, float32 + {name: "NEGD", argLength: 1, reg: fp11, asm: "NEGD"}, // -arg0, float64 + {name: "SQRTD", argLength: 1, reg: fp11, asm: "SQRTD"}, // sqrt(arg0), float64 + {name: "SQRTF", argLength: 1, reg: fp11, asm: "SQRTF"}, // sqrt(arg0), float32 + + {name: "MASKEQZ", argLength: 2, reg: gp21, asm: "MASKEQZ"}, // returns 0 if arg1 == 0, otherwise returns arg0 + {name: "MASKNEZ", argLength: 2, reg: gp21, asm: "MASKNEZ"}, // returns 0 if arg1 != 0, otherwise returns arg0 + + // shifts + {name: "SLLV", argLength: 2, reg: gp21, asm: "SLLV"}, // arg0 << arg1, shift amount is mod 64 + {name: "SLLVconst", argLength: 1, reg: gp11, asm: "SLLV", aux: "Int64"}, // arg0 << auxInt + {name: "SRLV", argLength: 2, reg: gp21, asm: "SRLV"}, // arg0 >> arg1, unsigned, shift amount is mod 64 + {name: "SRLVconst", argLength: 1, reg: gp11, asm: "SRLV", aux: "Int64"}, // arg0 >> auxInt, unsigned + {name: "SRAV", argLength: 2, reg: gp21, asm: "SRAV"}, // arg0 >> arg1, signed, shift amount is mod 64 + {name: "SRAVconst", argLength: 1, reg: gp11, asm: "SRAV", aux: "Int64"}, // arg0 >> auxInt, signed + {name: "ROTR", argLength: 2, reg: gp21, asm: "ROTR"}, // arg0 right rotate by (arg1 mod 32) bits + {name: "ROTRV", argLength: 2, reg: gp21, asm: "ROTRV"}, // arg0 right rotate by (arg1 mod 64) bits + {name: "ROTRconst", argLength: 1, reg: gp11, asm: "ROTR", aux: "Int64"}, // uint32(arg0) right rotate by auxInt bits, auxInt should be in the range 0 to 31. + {name: "ROTRVconst", argLength: 1, reg: gp11, asm: "ROTRV", aux: "Int64"}, // arg0 right rotate by auxInt bits, auxInt should be in the range 0 to 63. + + // comparisons + {name: "SGT", argLength: 2, reg: gp21, asm: "SGT", typ: "Bool"}, // 1 if arg0 > arg1 (signed), 0 otherwise + {name: "SGTconst", argLength: 1, reg: gp11, asm: "SGT", aux: "Int64", typ: "Bool"}, // 1 if auxInt > arg0 (signed), 0 otherwise + {name: "SGTU", argLength: 2, reg: gp21, asm: "SGTU", typ: "Bool"}, // 1 if arg0 > arg1 (unsigned), 0 otherwise + {name: "SGTUconst", argLength: 1, reg: gp11, asm: "SGTU", aux: "Int64", typ: "Bool"}, // 1 if auxInt > arg0 (unsigned), 0 otherwise + + {name: "CMPEQF", argLength: 2, reg: fp2flags, asm: "CMPEQF", typ: "Flags"}, // flags=true if arg0 = arg1, float32 + {name: "CMPEQD", argLength: 2, reg: fp2flags, asm: "CMPEQD", typ: "Flags"}, // flags=true if arg0 = arg1, float64 + {name: "CMPGEF", argLength: 2, reg: fp2flags, asm: "CMPGEF", typ: "Flags"}, // flags=true if arg0 >= arg1, float32 + {name: "CMPGED", argLength: 2, reg: fp2flags, asm: "CMPGED", typ: "Flags"}, // flags=true if arg0 >= arg1, float64 + {name: "CMPGTF", argLength: 2, reg: fp2flags, asm: "CMPGTF", typ: "Flags"}, // flags=true if arg0 > arg1, float32 + {name: "CMPGTD", argLength: 2, reg: fp2flags, asm: "CMPGTD", typ: "Flags"}, // flags=true if arg0 > arg1, float64 + + // moves + {name: "MOVVconst", argLength: 0, reg: gp01, aux: "Int64", asm: "MOVV", typ: "UInt64", rematerializeable: true}, // auxint + {name: "MOVFconst", argLength: 0, reg: fp01, aux: "Float64", asm: "MOVF", typ: "Float32", rematerializeable: true}, // auxint as 64-bit float, convert to 32-bit float + {name: "MOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "MOVD", typ: "Float64", rematerializeable: true}, // auxint as 64-bit float + + {name: "MOVVaddr", argLength: 1, reg: regInfo{inputs: []regMask{buildReg("SP") | buildReg("SB")}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVV", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB + + {name: "MOVBload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVB", typ: "Int8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVBUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVBU", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVHload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVH", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVHUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVHU", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVWload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVW", typ: "Int32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVWUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVWU", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVVload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVV", typ: "UInt64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVFload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVF", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVDload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVD", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + + {name: "MOVBstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of arg1 to arg0 + auxInt + aux. arg2=mem. + {name: "MOVHstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of arg1 to arg0 + auxInt + aux. arg2=mem. + {name: "MOVWstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem. + {name: "MOVVstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVV", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem. + {name: "MOVFstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVF", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem. + {name: "MOVDstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem. + + {name: "MOVBstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of zero to arg0 + auxInt + aux. arg1=mem. + {name: "MOVHstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of zero to arg0 + auxInt + aux. arg1=mem. + {name: "MOVWstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of zero to arg0 + auxInt + aux. arg1=mem. + {name: "MOVVstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVV", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of zero to arg0 + auxInt + aux. ar12=mem. + + // conversions + {name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB"}, // move from arg0, sign-extended from byte + {name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, unsign-extended from byte + {name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVH"}, // move from arg0, sign-extended from half + {name: "MOVHUreg", argLength: 1, reg: gp11, asm: "MOVHU"}, // move from arg0, unsign-extended from half + {name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW"}, // move from arg0, sign-extended from word + {name: "MOVWUreg", argLength: 1, reg: gp11, asm: "MOVWU"}, // move from arg0, unsign-extended from word + {name: "MOVVreg", argLength: 1, reg: gp11, asm: "MOVV"}, // move from arg0 + + {name: "MOVVnop", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}, resultInArg0: true}, // nop, return arg0 in same register + + {name: "MOVWF", argLength: 1, reg: fp11, asm: "MOVWF"}, // int32 -> float32 + {name: "MOVWD", argLength: 1, reg: fp11, asm: "MOVWD"}, // int32 -> float64 + {name: "MOVVF", argLength: 1, reg: fp11, asm: "MOVVF"}, // int64 -> float32 + {name: "MOVVD", argLength: 1, reg: fp11, asm: "MOVVD"}, // int64 -> float64 + {name: "TRUNCFW", argLength: 1, reg: fp11, asm: "TRUNCFW"}, // float32 -> int32 + {name: "TRUNCDW", argLength: 1, reg: fp11, asm: "TRUNCDW"}, // float64 -> int32 + {name: "TRUNCFV", argLength: 1, reg: fp11, asm: "TRUNCFV"}, // float32 -> int64 + {name: "TRUNCDV", argLength: 1, reg: fp11, asm: "TRUNCDV"}, // float64 -> int64 + {name: "MOVFD", argLength: 1, reg: fp11, asm: "MOVFD"}, // float32 -> float64 + {name: "MOVDF", argLength: 1, reg: fp11, asm: "MOVDF"}, // float64 -> float32 + + // function calls + {name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem + {name: "CALLtail", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem + {name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gpsp, buildReg("R29"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem + {name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem + + // duffzero + // arg0 = address of memory to zero + // arg1 = mem + // auxint = offset into duffzero code to start executing + // returns mem + // R19 aka loong64.REGRT1 changed as side effect + { + name: "DUFFZERO", + aux: "Int64", + argLength: 2, + reg: regInfo{ + inputs: []regMask{gp}, + clobbers: buildReg("R19 R1"), + }, + faultOnNilArg0: true, + }, + + // duffcopy + // arg0 = address of dst memory (in R20, changed as side effect) REGRT2 + // arg1 = address of src memory (in R19, changed as side effect) REGRT1 + // arg2 = mem + // auxint = offset into duffcopy code to start executing + // returns mem + { + name: "DUFFCOPY", + aux: "Int64", + argLength: 3, + reg: regInfo{ + inputs: []regMask{buildReg("R20"), buildReg("R19")}, + clobbers: buildReg("R19 R20 R1"), + }, + faultOnNilArg0: true, + faultOnNilArg1: true, + }, + + // large or unaligned zeroing + // arg0 = address of memory to zero (in R19, changed as side effect) + // arg1 = address of the last element to zero + // arg2 = mem + // auxint = alignment + // returns mem + // SUBV $8, R19 + // MOVV R0, 8(R19) + // ADDV $8, R19 + // BNE Rarg1, R19, -2(PC) + { + name: "LoweredZero", + aux: "Int64", + argLength: 3, + reg: regInfo{ + inputs: []regMask{buildReg("R19"), gp}, + clobbers: buildReg("R19"), + }, + clobberFlags: true, + faultOnNilArg0: true, + }, + + // large or unaligned move + // arg0 = address of dst memory (in R4, changed as side effect) + // arg1 = address of src memory (in R19, changed as side effect) + // arg2 = address of the last element of src + // arg3 = mem + // auxint = alignment + // returns mem + // SUBV $8, R19 + // MOVV 8(R19), Rtmp + // MOVV Rtmp, (R4) + // ADDV $8, R19 + // ADDV $8, R4 + // BNE Rarg2, R19, -4(PC) + { + name: "LoweredMove", + aux: "Int64", + argLength: 4, + reg: regInfo{ + inputs: []regMask{buildReg("R4"), buildReg("R19"), gp}, + clobbers: buildReg("R19 R4"), + }, + clobberFlags: true, + faultOnNilArg0: true, + faultOnNilArg1: true, + }, + + // atomic loads. + // load from arg0. arg1=mem. + // returns <value,memory> so they can be properly ordered with other loads. + {name: "LoweredAtomicLoad8", argLength: 2, reg: gpload, faultOnNilArg0: true}, + {name: "LoweredAtomicLoad32", argLength: 2, reg: gpload, faultOnNilArg0: true}, + {name: "LoweredAtomicLoad64", argLength: 2, reg: gpload, faultOnNilArg0: true}, + + // atomic stores. + // store arg1 to arg0. arg2=mem. returns memory. + {name: "LoweredAtomicStore8", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true}, + {name: "LoweredAtomicStore32", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true}, + {name: "LoweredAtomicStore64", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true}, + // store zero to arg0. arg1=mem. returns memory. + {name: "LoweredAtomicStorezero32", argLength: 2, reg: gpstore0, faultOnNilArg0: true, hasSideEffects: true}, + {name: "LoweredAtomicStorezero64", argLength: 2, reg: gpstore0, faultOnNilArg0: true, hasSideEffects: true}, + + // atomic exchange. + // store arg1 to arg0. arg2=mem. returns <old content of *arg0, memory>. + // DBAR + // LL (Rarg0), Rout + // MOVV Rarg1, Rtmp + // SC Rtmp, (Rarg0) + // BEQ Rtmp, -3(PC) + // DBAR + {name: "LoweredAtomicExchange32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + {name: "LoweredAtomicExchange64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + + // atomic add. + // *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>. + // DBAR + // LL (Rarg0), Rout + // ADDV Rarg1, Rout, Rtmp + // SC Rtmp, (Rarg0) + // BEQ Rtmp, -3(PC) + // DBAR + // ADDV Rarg1, Rout + {name: "LoweredAtomicAdd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + {name: "LoweredAtomicAdd64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + // *arg0 += auxint. arg1=mem. returns <new content of *arg0, memory>. auxint is 32-bit. + {name: "LoweredAtomicAddconst32", argLength: 2, reg: regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}, aux: "Int32", resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + {name: "LoweredAtomicAddconst64", argLength: 2, reg: regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}, aux: "Int64", resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + + // atomic compare and swap. + // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory. + // if *arg0 == arg1 { + // *arg0 = arg2 + // return (true, memory) + // } else { + // return (false, memory) + // } + // DBAR + // MOVV $0, Rout + // LL (Rarg0), Rtmp + // BNE Rtmp, Rarg1, 4(PC) + // MOVV Rarg2, Rout + // SC Rout, (Rarg0) + // BEQ Rout, -4(PC) + // DBAR + {name: "LoweredAtomicCas32", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + {name: "LoweredAtomicCas64", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + + // pseudo-ops + {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpg}}, nilCheck: true, faultOnNilArg0: true}, // panic if arg0 is nil. arg1=mem. + + {name: "FPFlagTrue", argLength: 1, reg: readflags}, // bool, true if FP flag is true + {name: "FPFlagFalse", argLength: 1, reg: readflags}, // bool, true if FP flag is false + + // Scheduler ensures LoweredGetClosurePtr occurs only in entry block, + // and sorts it to the very beginning of the block to prevent other + // use of R22 (loong64.REGCTXT, the closure pointer) + {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("R29")}}, zeroWidth: true}, + + // LoweredGetCallerSP returns the SP of the caller of the current function. + {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true}, + + // LoweredGetCallerPC evaluates to the PC to which its "caller" will return. + // I.e., if f calls g "calls" getcallerpc, + // the result should be the PC within f that g will return to. + // See runtime/stubs.go for a more detailed discussion. + {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true}, + + // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier + // It saves all GP registers if necessary, + // but clobbers R1 (LR) because it's a call + // and R30 (REGTMP). + {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("R27"), buildReg("R28")}, clobbers: (callerSave &^ gpg) | buildReg("R1")}, clobberFlags: true, aux: "Sym", symEffect: "None"}, + + // There are three of these functions so that they can have three different register inputs. + // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the + // default registers to match so we don't need to copy registers around unnecessarily. + {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r3, r4}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go). + {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go). + {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go). + } + + blocks := []blockData{ + {name: "EQ", controls: 1}, + {name: "NE", controls: 1}, + {name: "LTZ", controls: 1}, // < 0 + {name: "LEZ", controls: 1}, // <= 0 + {name: "GTZ", controls: 1}, // > 0 + {name: "GEZ", controls: 1}, // >= 0 + {name: "FPT", controls: 1}, // FP flag is true + {name: "FPF", controls: 1}, // FP flag is false + } + + archs = append(archs, arch{ + name: "LOONG64", + pkg: "cmd/internal/obj/loong64", + genfile: "../../loong64/ssa.go", + ops: ops, + blocks: blocks, + regnames: regNamesLOONG64, + // TODO: support register ABI on loong64 + ParamIntRegNames: "R4 R5 R6 R7 R8 R9 R10 R11", + ParamFloatRegNames: "F0 F1 F2 F3 F4 F5 F6 F7", + gpregmask: gp, + fpregmask: fp, + framepointerreg: -1, // not used + linkreg: int8(num["R1"]), + }) +} diff --git a/src/cmd/compile/internal/ssa/_gen/MIPS.rules b/src/cmd/compile/internal/ssa/_gen/MIPS.rules new file mode 100644 index 0000000..6f696da --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/MIPS.rules @@ -0,0 +1,703 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +(Add(Ptr|32|16|8) ...) => (ADD ...) +(Add(32|64)F ...) => (ADD(F|D) ...) + +(Select0 (Add32carry <t> x y)) => (ADD <t.FieldType(0)> x y) +(Select1 (Add32carry <t> x y)) => (SGTU <typ.Bool> x (ADD <t.FieldType(0)> x y)) +(Add32withcarry <t> x y c) => (ADD c (ADD <t> x y)) + +(Sub(Ptr|32|16|8) ...) => (SUB ...) +(Sub(32|64)F ...) => (SUB(F|D) ...) + +(Select0 (Sub32carry <t> x y)) => (SUB <t.FieldType(0)> x y) +(Select1 (Sub32carry <t> x y)) => (SGTU <typ.Bool> (SUB <t.FieldType(0)> x y) x) +(Sub32withcarry <t> x y c) => (SUB (SUB <t> x y) c) + +(Mul(32|16|8) ...) => (MUL ...) +(Mul(32|64)F ...) => (MUL(F|D) ...) + +(Hmul(32|32u) x y) => (Select0 (MUL(T|TU) x y)) +(Mul32uhilo ...) => (MULTU ...) + +(Div32 x y) => (Select1 (DIV x y)) +(Div32u x y) => (Select1 (DIVU x y)) +(Div16 x y) => (Select1 (DIV (SignExt16to32 x) (SignExt16to32 y))) +(Div16u x y) => (Select1 (DIVU (ZeroExt16to32 x) (ZeroExt16to32 y))) +(Div8 x y) => (Select1 (DIV (SignExt8to32 x) (SignExt8to32 y))) +(Div8u x y) => (Select1 (DIVU (ZeroExt8to32 x) (ZeroExt8to32 y))) +(Div(32|64)F ...) => (DIV(F|D) ...) + +(Mod32 x y) => (Select0 (DIV x y)) +(Mod32u x y) => (Select0 (DIVU x y)) +(Mod16 x y) => (Select0 (DIV (SignExt16to32 x) (SignExt16to32 y))) +(Mod16u x y) => (Select0 (DIVU (ZeroExt16to32 x) (ZeroExt16to32 y))) +(Mod8 x y) => (Select0 (DIV (SignExt8to32 x) (SignExt8to32 y))) +(Mod8u x y) => (Select0 (DIVU (ZeroExt8to32 x) (ZeroExt8to32 y))) + +// (x + y) / 2 with x>=y becomes (x - y) / 2 + y +(Avg32u <t> x y) => (ADD (SRLconst <t> (SUB <t> x y) [1]) y) + +(And(32|16|8) ...) => (AND ...) +(Or(32|16|8) ...) => (OR ...) +(Xor(32|16|8) ...) => (XOR ...) + +// constant shifts +// generic opt rewrites all constant shifts to shift by Const64 +(Lsh32x64 x (Const64 [c])) && uint32(c) < 32 => (SLLconst x [int32(c)]) +(Rsh32x64 x (Const64 [c])) && uint32(c) < 32 => (SRAconst x [int32(c)]) +(Rsh32Ux64 x (Const64 [c])) && uint32(c) < 32 => (SRLconst x [int32(c)]) +(Lsh16x64 x (Const64 [c])) && uint32(c) < 16 => (SLLconst x [int32(c)]) +(Rsh16x64 x (Const64 [c])) && uint32(c) < 16 => (SRAconst (SLLconst <typ.UInt32> x [16]) [int32(c+16)]) +(Rsh16Ux64 x (Const64 [c])) && uint32(c) < 16 => (SRLconst (SLLconst <typ.UInt32> x [16]) [int32(c+16)]) +(Lsh8x64 x (Const64 [c])) && uint32(c) < 8 => (SLLconst x [int32(c)]) +(Rsh8x64 x (Const64 [c])) && uint32(c) < 8 => (SRAconst (SLLconst <typ.UInt32> x [24]) [int32(c+24)]) +(Rsh8Ux64 x (Const64 [c])) && uint32(c) < 8 => (SRLconst (SLLconst <typ.UInt32> x [24]) [int32(c+24)]) + +// large constant shifts +(Lsh32x64 _ (Const64 [c])) && uint32(c) >= 32 => (MOVWconst [0]) +(Rsh32Ux64 _ (Const64 [c])) && uint32(c) >= 32 => (MOVWconst [0]) +(Lsh16x64 _ (Const64 [c])) && uint32(c) >= 16 => (MOVWconst [0]) +(Rsh16Ux64 _ (Const64 [c])) && uint32(c) >= 16 => (MOVWconst [0]) +(Lsh8x64 _ (Const64 [c])) && uint32(c) >= 8 => (MOVWconst [0]) +(Rsh8Ux64 _ (Const64 [c])) && uint32(c) >= 8 => (MOVWconst [0]) + +// large constant signed right shift, we leave the sign bit +(Rsh32x64 x (Const64 [c])) && uint32(c) >= 32 => (SRAconst x [31]) +(Rsh16x64 x (Const64 [c])) && uint32(c) >= 16 => (SRAconst (SLLconst <typ.UInt32> x [16]) [31]) +(Rsh8x64 x (Const64 [c])) && uint32(c) >= 8 => (SRAconst (SLLconst <typ.UInt32> x [24]) [31]) + +// shifts +// hardware instruction uses only the low 5 bits of the shift +// we compare to 32 to ensure Go semantics for large shifts +(Lsh32x32 <t> x y) => (CMOVZ (SLL <t> x y) (MOVWconst [0]) (SGTUconst [32] y)) +(Lsh32x16 <t> x y) => (CMOVZ (SLL <t> x (ZeroExt16to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt16to32 y))) +(Lsh32x8 <t> x y) => (CMOVZ (SLL <t> x (ZeroExt8to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt8to32 y))) + +(Lsh16x32 <t> x y) => (CMOVZ (SLL <t> x y) (MOVWconst [0]) (SGTUconst [32] y)) +(Lsh16x16 <t> x y) => (CMOVZ (SLL <t> x (ZeroExt16to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt16to32 y))) +(Lsh16x8 <t> x y) => (CMOVZ (SLL <t> x (ZeroExt8to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt8to32 y))) + +(Lsh8x32 <t> x y) => (CMOVZ (SLL <t> x y) (MOVWconst [0]) (SGTUconst [32] y)) +(Lsh8x16 <t> x y) => (CMOVZ (SLL <t> x (ZeroExt16to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt16to32 y))) +(Lsh8x8 <t> x y) => (CMOVZ (SLL <t> x (ZeroExt8to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt8to32 y))) + +(Rsh32Ux32 <t> x y) => (CMOVZ (SRL <t> x y) (MOVWconst [0]) (SGTUconst [32] y)) +(Rsh32Ux16 <t> x y) => (CMOVZ (SRL <t> x (ZeroExt16to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt16to32 y))) +(Rsh32Ux8 <t> x y) => (CMOVZ (SRL <t> x (ZeroExt8to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt8to32 y))) + +(Rsh16Ux32 <t> x y) => (CMOVZ (SRL <t> (ZeroExt16to32 x) y) (MOVWconst [0]) (SGTUconst [32] y)) +(Rsh16Ux16 <t> x y) => (CMOVZ (SRL <t> (ZeroExt16to32 x) (ZeroExt16to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt16to32 y))) +(Rsh16Ux8 <t> x y) => (CMOVZ (SRL <t> (ZeroExt16to32 x) (ZeroExt8to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt8to32 y))) + +(Rsh8Ux32 <t> x y) => (CMOVZ (SRL <t> (ZeroExt8to32 x) y) (MOVWconst [0]) (SGTUconst [32] y)) +(Rsh8Ux16 <t> x y) => (CMOVZ (SRL <t> (ZeroExt8to32 x) (ZeroExt16to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt16to32 y))) +(Rsh8Ux8 <t> x y) => (CMOVZ (SRL <t> (ZeroExt8to32 x) (ZeroExt8to32 y) ) (MOVWconst [0]) (SGTUconst [32] (ZeroExt8to32 y))) + +(Rsh32x32 x y) => (SRA x ( CMOVZ <typ.UInt32> y (MOVWconst [31]) (SGTUconst [32] y))) +(Rsh32x16 x y) => (SRA x ( CMOVZ <typ.UInt32> (ZeroExt16to32 y) (MOVWconst [31]) (SGTUconst [32] (ZeroExt16to32 y)))) +(Rsh32x8 x y) => (SRA x ( CMOVZ <typ.UInt32> (ZeroExt8to32 y) (MOVWconst [31]) (SGTUconst [32] (ZeroExt8to32 y)))) + +(Rsh16x32 x y) => (SRA (SignExt16to32 x) ( CMOVZ <typ.UInt32> y (MOVWconst [31]) (SGTUconst [32] y))) +(Rsh16x16 x y) => (SRA (SignExt16to32 x) ( CMOVZ <typ.UInt32> (ZeroExt16to32 y) (MOVWconst [31]) (SGTUconst [32] (ZeroExt16to32 y)))) +(Rsh16x8 x y) => (SRA (SignExt16to32 x) ( CMOVZ <typ.UInt32> (ZeroExt8to32 y) (MOVWconst [31]) (SGTUconst [32] (ZeroExt8to32 y)))) + +(Rsh8x32 x y) => (SRA (SignExt16to32 x) ( CMOVZ <typ.UInt32> y (MOVWconst [31]) (SGTUconst [32] y))) +(Rsh8x16 x y) => (SRA (SignExt16to32 x) ( CMOVZ <typ.UInt32> (ZeroExt16to32 y) (MOVWconst [31]) (SGTUconst [32] (ZeroExt16to32 y)))) +(Rsh8x8 x y) => (SRA (SignExt16to32 x) ( CMOVZ <typ.UInt32> (ZeroExt8to32 y) (MOVWconst [31]) (SGTUconst [32] (ZeroExt8to32 y)))) + +// rotates +(RotateLeft8 <t> x (MOVWconst [c])) => (Or8 (Lsh8x32 <t> x (MOVWconst [c&7])) (Rsh8Ux32 <t> x (MOVWconst [-c&7]))) +(RotateLeft16 <t> x (MOVWconst [c])) => (Or16 (Lsh16x32 <t> x (MOVWconst [c&15])) (Rsh16Ux32 <t> x (MOVWconst [-c&15]))) +(RotateLeft32 <t> x (MOVWconst [c])) => (Or32 (Lsh32x32 <t> x (MOVWconst [c&31])) (Rsh32Ux32 <t> x (MOVWconst [-c&31]))) +(RotateLeft64 <t> x (MOVWconst [c])) => (Or64 (Lsh64x32 <t> x (MOVWconst [c&63])) (Rsh64Ux32 <t> x (MOVWconst [-c&63]))) + +// unary ops +(Neg(32|16|8) ...) => (NEG ...) +(Neg(32|64)F ...) => (NEG(F|D) ...) + +(Com(32|16|8) x) => (NORconst [0] x) + +(Sqrt ...) => (SQRTD ...) +(Sqrt32 ...) => (SQRTF ...) + +// TODO: optimize this case? +(Ctz32NonZero ...) => (Ctz32 ...) + +// count trailing zero +// 32 - CLZ(x&-x - 1) +(Ctz32 <t> x) => (SUB (MOVWconst [32]) (CLZ <t> (SUBconst <t> [1] (AND <t> x (NEG <t> x))))) + +// bit length +(BitLen32 <t> x) => (SUB (MOVWconst [32]) (CLZ <t> x)) + +// boolean ops -- booleans are represented with 0=false, 1=true +(AndB ...) => (AND ...) +(OrB ...) => (OR ...) +(EqB x y) => (XORconst [1] (XOR <typ.Bool> x y)) +(NeqB ...) => (XOR ...) +(Not x) => (XORconst [1] x) + +// constants +(Const(32|16|8) [val]) => (MOVWconst [int32(val)]) +(Const(32|64)F ...) => (MOV(F|D)const ...) +(ConstNil) => (MOVWconst [0]) +(ConstBool [t]) => (MOVWconst [b2i32(t)]) + +// truncations +// Because we ignore high parts of registers, truncates are just copies. +(Trunc16to8 ...) => (Copy ...) +(Trunc32to8 ...) => (Copy ...) +(Trunc32to16 ...) => (Copy ...) + +// Zero-/Sign-extensions +(ZeroExt8to16 ...) => (MOVBUreg ...) +(ZeroExt8to32 ...) => (MOVBUreg ...) +(ZeroExt16to32 ...) => (MOVHUreg ...) + +(SignExt8to16 ...) => (MOVBreg ...) +(SignExt8to32 ...) => (MOVBreg ...) +(SignExt16to32 ...) => (MOVHreg ...) + +(Signmask x) => (SRAconst x [31]) +(Zeromask x) => (NEG (SGTU x (MOVWconst [0]))) +(Slicemask <t> x) => (SRAconst (NEG <t> x) [31]) + +// float-int conversion +(Cvt32to(32|64)F ...) => (MOVW(F|D) ...) +(Cvt(32|64)Fto32 ...) => (TRUNC(F|D)W ...) +(Cvt32Fto64F ...) => (MOVFD ...) +(Cvt64Fto32F ...) => (MOVDF ...) + +(CvtBoolToUint8 ...) => (Copy ...) + +(Round(32|64)F ...) => (Copy ...) + +// comparisons +(Eq8 x y) => (SGTUconst [1] (XOR (ZeroExt8to32 x) (ZeroExt8to32 y))) +(Eq16 x y) => (SGTUconst [1] (XOR (ZeroExt16to32 x) (ZeroExt16to32 y))) +(Eq32 x y) => (SGTUconst [1] (XOR x y)) +(EqPtr x y) => (SGTUconst [1] (XOR x y)) +(Eq(32|64)F x y) => (FPFlagTrue (CMPEQ(F|D) x y)) + +(Neq8 x y) => (SGTU (XOR (ZeroExt8to32 x) (ZeroExt8to32 y)) (MOVWconst [0])) +(Neq16 x y) => (SGTU (XOR (ZeroExt16to32 x) (ZeroExt16to32 y)) (MOVWconst [0])) +(Neq32 x y) => (SGTU (XOR x y) (MOVWconst [0])) +(NeqPtr x y) => (SGTU (XOR x y) (MOVWconst [0])) +(Neq(32|64)F x y) => (FPFlagFalse (CMPEQ(F|D) x y)) + +(Less8 x y) => (SGT (SignExt8to32 y) (SignExt8to32 x)) +(Less16 x y) => (SGT (SignExt16to32 y) (SignExt16to32 x)) +(Less32 x y) => (SGT y x) +(Less(32|64)F x y) => (FPFlagTrue (CMPGT(F|D) y x)) // reverse operands to work around NaN + +(Less8U x y) => (SGTU (ZeroExt8to32 y) (ZeroExt8to32 x)) +(Less16U x y) => (SGTU (ZeroExt16to32 y) (ZeroExt16to32 x)) +(Less32U x y) => (SGTU y x) + +(Leq8 x y) => (XORconst [1] (SGT (SignExt8to32 x) (SignExt8to32 y))) +(Leq16 x y) => (XORconst [1] (SGT (SignExt16to32 x) (SignExt16to32 y))) +(Leq32 x y) => (XORconst [1] (SGT x y)) +(Leq(32|64)F x y) => (FPFlagTrue (CMPGE(F|D) y x)) // reverse operands to work around NaN + +(Leq8U x y) => (XORconst [1] (SGTU (ZeroExt8to32 x) (ZeroExt8to32 y))) +(Leq16U x y) => (XORconst [1] (SGTU (ZeroExt16to32 x) (ZeroExt16to32 y))) +(Leq32U x y) => (XORconst [1] (SGTU x y)) + +(OffPtr [off] ptr:(SP)) => (MOVWaddr [int32(off)] ptr) +(OffPtr [off] ptr) => (ADDconst [int32(off)] ptr) + +(Addr {sym} base) => (MOVWaddr {sym} base) +(LocalAddr {sym} base _) => (MOVWaddr {sym} base) + +// loads +(Load <t> ptr mem) && t.IsBoolean() => (MOVBUload ptr mem) +(Load <t> ptr mem) && (is8BitInt(t) && isSigned(t)) => (MOVBload ptr mem) +(Load <t> ptr mem) && (is8BitInt(t) && !isSigned(t)) => (MOVBUload ptr mem) +(Load <t> ptr mem) && (is16BitInt(t) && isSigned(t)) => (MOVHload ptr mem) +(Load <t> ptr mem) && (is16BitInt(t) && !isSigned(t)) => (MOVHUload ptr mem) +(Load <t> ptr mem) && (is32BitInt(t) || isPtr(t)) => (MOVWload ptr mem) +(Load <t> ptr mem) && is32BitFloat(t) => (MOVFload ptr mem) +(Load <t> ptr mem) && is64BitFloat(t) => (MOVDload ptr mem) + +// stores +(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 4 && !is32BitFloat(val.Type) => (MOVWstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (MOVFstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (MOVDstore ptr val mem) + +// zero instructions +(Zero [0] _ mem) => mem +(Zero [1] ptr mem) => (MOVBstore ptr (MOVWconst [0]) mem) +(Zero [2] {t} ptr mem) && t.Alignment()%2 == 0 => + (MOVHstore ptr (MOVWconst [0]) mem) +(Zero [2] ptr mem) => + (MOVBstore [1] ptr (MOVWconst [0]) + (MOVBstore [0] ptr (MOVWconst [0]) mem)) +(Zero [4] {t} ptr mem) && t.Alignment()%4 == 0 => + (MOVWstore ptr (MOVWconst [0]) mem) +(Zero [4] {t} ptr mem) && t.Alignment()%2 == 0 => + (MOVHstore [2] ptr (MOVWconst [0]) + (MOVHstore [0] ptr (MOVWconst [0]) mem)) +(Zero [4] ptr mem) => + (MOVBstore [3] ptr (MOVWconst [0]) + (MOVBstore [2] ptr (MOVWconst [0]) + (MOVBstore [1] ptr (MOVWconst [0]) + (MOVBstore [0] ptr (MOVWconst [0]) mem)))) +(Zero [3] ptr mem) => + (MOVBstore [2] ptr (MOVWconst [0]) + (MOVBstore [1] ptr (MOVWconst [0]) + (MOVBstore [0] ptr (MOVWconst [0]) mem))) +(Zero [6] {t} ptr mem) && t.Alignment()%2 == 0 => + (MOVHstore [4] ptr (MOVWconst [0]) + (MOVHstore [2] ptr (MOVWconst [0]) + (MOVHstore [0] ptr (MOVWconst [0]) mem))) +(Zero [8] {t} ptr mem) && t.Alignment()%4 == 0 => + (MOVWstore [4] ptr (MOVWconst [0]) + (MOVWstore [0] ptr (MOVWconst [0]) mem)) +(Zero [12] {t} ptr mem) && t.Alignment()%4 == 0 => + (MOVWstore [8] ptr (MOVWconst [0]) + (MOVWstore [4] ptr (MOVWconst [0]) + (MOVWstore [0] ptr (MOVWconst [0]) mem))) +(Zero [16] {t} ptr mem) && t.Alignment()%4 == 0 => + (MOVWstore [12] ptr (MOVWconst [0]) + (MOVWstore [8] ptr (MOVWconst [0]) + (MOVWstore [4] ptr (MOVWconst [0]) + (MOVWstore [0] ptr (MOVWconst [0]) mem)))) + +// large or unaligned zeroing uses a loop +(Zero [s] {t} ptr mem) + && (s > 16 || t.Alignment()%4 != 0) => + (LoweredZero [int32(t.Alignment())] + ptr + (ADDconst <ptr.Type> ptr [int32(s-moveSize(t.Alignment(), config))]) + mem) + +// moves +(Move [0] _ _ mem) => mem +(Move [1] dst src mem) => (MOVBstore dst (MOVBUload src mem) mem) +(Move [2] {t} dst src mem) && t.Alignment()%2 == 0 => + (MOVHstore dst (MOVHUload src mem) mem) +(Move [2] dst src mem) => + (MOVBstore [1] dst (MOVBUload [1] src mem) + (MOVBstore dst (MOVBUload src mem) mem)) +(Move [4] {t} dst src mem) && t.Alignment()%4 == 0 => + (MOVWstore dst (MOVWload src mem) mem) +(Move [4] {t} dst src mem) && t.Alignment()%2 == 0 => + (MOVHstore [2] dst (MOVHUload [2] src mem) + (MOVHstore dst (MOVHUload src mem) mem)) +(Move [4] dst src mem) => + (MOVBstore [3] dst (MOVBUload [3] src mem) + (MOVBstore [2] dst (MOVBUload [2] src mem) + (MOVBstore [1] dst (MOVBUload [1] src mem) + (MOVBstore dst (MOVBUload src mem) mem)))) +(Move [3] dst src mem) => + (MOVBstore [2] dst (MOVBUload [2] src mem) + (MOVBstore [1] dst (MOVBUload [1] src mem) + (MOVBstore dst (MOVBUload src mem) mem))) +(Move [8] {t} dst src mem) && t.Alignment()%4 == 0 => + (MOVWstore [4] dst (MOVWload [4] src mem) + (MOVWstore dst (MOVWload src mem) mem)) +(Move [8] {t} dst src mem) && t.Alignment()%2 == 0 => + (MOVHstore [6] dst (MOVHload [6] src mem) + (MOVHstore [4] dst (MOVHload [4] src mem) + (MOVHstore [2] dst (MOVHload [2] src mem) + (MOVHstore dst (MOVHload src mem) mem)))) +(Move [6] {t} dst src mem) && t.Alignment()%2 == 0 => + (MOVHstore [4] dst (MOVHload [4] src mem) + (MOVHstore [2] dst (MOVHload [2] src mem) + (MOVHstore dst (MOVHload src mem) mem))) +(Move [12] {t} dst src mem) && t.Alignment()%4 == 0 => + (MOVWstore [8] dst (MOVWload [8] src mem) + (MOVWstore [4] dst (MOVWload [4] src mem) + (MOVWstore dst (MOVWload src mem) mem))) +(Move [16] {t} dst src mem) && t.Alignment()%4 == 0 => + (MOVWstore [12] dst (MOVWload [12] src mem) + (MOVWstore [8] dst (MOVWload [8] src mem) + (MOVWstore [4] dst (MOVWload [4] src mem) + (MOVWstore dst (MOVWload src mem) mem)))) + + +// large or unaligned move uses a loop +(Move [s] {t} dst src mem) + && (s > 16 && logLargeCopy(v, s) || t.Alignment()%4 != 0) => + (LoweredMove [int32(t.Alignment())] + dst + src + (ADDconst <src.Type> src [int32(s-moveSize(t.Alignment(), config))]) + mem) + +// calls +(StaticCall ...) => (CALLstatic ...) +(ClosureCall ...) => (CALLclosure ...) +(InterCall ...) => (CALLinter ...) +(TailCall ...) => (CALLtail ...) + +// atomic intrinsics +(AtomicLoad(8|32) ...) => (LoweredAtomicLoad(8|32) ...) +(AtomicLoadPtr ...) => (LoweredAtomicLoad32 ...) + +(AtomicStore(8|32) ...) => (LoweredAtomicStore(8|32) ...) +(AtomicStorePtrNoWB ...) => (LoweredAtomicStore32 ...) + +(AtomicExchange32 ...) => (LoweredAtomicExchange ...) +(AtomicAdd32 ...) => (LoweredAtomicAdd ...) + +(AtomicCompareAndSwap32 ...) => (LoweredAtomicCas ...) + +// AtomicOr8(ptr,val) => LoweredAtomicOr(ptr&^3,uint32(val) << ((ptr & 3) * 8)) +(AtomicOr8 ptr val mem) && !config.BigEndian => + (LoweredAtomicOr (AND <typ.UInt32Ptr> (MOVWconst [^3]) ptr) + (SLL <typ.UInt32> (ZeroExt8to32 val) + (SLLconst <typ.UInt32> [3] + (ANDconst <typ.UInt32> [3] ptr))) mem) + +// AtomicAnd8(ptr,val) => LoweredAtomicAnd(ptr&^3,(uint32(val) << ((ptr & 3) * 8)) | ^(uint32(0xFF) << ((ptr & 3) * 8)))) +(AtomicAnd8 ptr val mem) && !config.BigEndian => + (LoweredAtomicAnd (AND <typ.UInt32Ptr> (MOVWconst [^3]) ptr) + (OR <typ.UInt32> (SLL <typ.UInt32> (ZeroExt8to32 val) + (SLLconst <typ.UInt32> [3] + (ANDconst <typ.UInt32> [3] ptr))) + (NORconst [0] <typ.UInt32> (SLL <typ.UInt32> + (MOVWconst [0xff]) (SLLconst <typ.UInt32> [3] + (ANDconst <typ.UInt32> [3] ptr))))) mem) + +// AtomicOr8(ptr,val) => LoweredAtomicOr(ptr&^3,uint32(val) << (((ptr^3) & 3) * 8)) +(AtomicOr8 ptr val mem) && config.BigEndian => + (LoweredAtomicOr (AND <typ.UInt32Ptr> (MOVWconst [^3]) ptr) + (SLL <typ.UInt32> (ZeroExt8to32 val) + (SLLconst <typ.UInt32> [3] + (ANDconst <typ.UInt32> [3] + (XORconst <typ.UInt32> [3] ptr)))) mem) + +// AtomicAnd8(ptr,val) => LoweredAtomicAnd(ptr&^3,(uint32(val) << (((ptr^3) & 3) * 8)) | ^(uint32(0xFF) << (((ptr^3) & 3) * 8)))) +(AtomicAnd8 ptr val mem) && config.BigEndian => + (LoweredAtomicAnd (AND <typ.UInt32Ptr> (MOVWconst [^3]) ptr) + (OR <typ.UInt32> (SLL <typ.UInt32> (ZeroExt8to32 val) + (SLLconst <typ.UInt32> [3] + (ANDconst <typ.UInt32> [3] + (XORconst <typ.UInt32> [3] ptr)))) + (NORconst [0] <typ.UInt32> (SLL <typ.UInt32> + (MOVWconst [0xff]) (SLLconst <typ.UInt32> [3] + (ANDconst <typ.UInt32> [3] + (XORconst <typ.UInt32> [3] ptr)))))) mem) + +(AtomicAnd32 ...) => (LoweredAtomicAnd ...) +(AtomicOr32 ...) => (LoweredAtomicOr ...) + + +// checks +(NilCheck ...) => (LoweredNilCheck ...) +(IsNonNil ptr) => (SGTU ptr (MOVWconst [0])) +(IsInBounds idx len) => (SGTU len idx) +(IsSliceInBounds idx len) => (XORconst [1] (SGTU idx len)) + +// pseudo-ops +(GetClosurePtr ...) => (LoweredGetClosurePtr ...) +(GetCallerSP ...) => (LoweredGetCallerSP ...) +(GetCallerPC ...) => (LoweredGetCallerPC ...) + +(If cond yes no) => (NE cond yes no) + +// Write barrier. +(WB ...) => (LoweredWB ...) + +(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem) +(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem) +(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem) + +(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 0 => (LoweredPanicExtendA [kind] hi lo y mem) +(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 1 => (LoweredPanicExtendB [kind] hi lo y mem) +(PanicExtend [kind] hi lo y mem) && boundsABI(kind) == 2 => (LoweredPanicExtendC [kind] hi lo y mem) + +// Optimizations + +// Absorb boolean tests into block +(NE (FPFlagTrue cmp) yes no) => (FPT cmp yes no) +(NE (FPFlagFalse cmp) yes no) => (FPF cmp yes no) +(EQ (FPFlagTrue cmp) yes no) => (FPF cmp yes no) +(EQ (FPFlagFalse cmp) yes no) => (FPT cmp yes no) +(NE (XORconst [1] cmp:(SGT _ _)) yes no) => (EQ cmp yes no) +(NE (XORconst [1] cmp:(SGTU _ _)) yes no) => (EQ cmp yes no) +(NE (XORconst [1] cmp:(SGTconst _)) yes no) => (EQ cmp yes no) +(NE (XORconst [1] cmp:(SGTUconst _)) yes no) => (EQ cmp yes no) +(NE (XORconst [1] cmp:(SGTzero _)) yes no) => (EQ cmp yes no) +(NE (XORconst [1] cmp:(SGTUzero _)) yes no) => (EQ cmp yes no) +(EQ (XORconst [1] cmp:(SGT _ _)) yes no) => (NE cmp yes no) +(EQ (XORconst [1] cmp:(SGTU _ _)) yes no) => (NE cmp yes no) +(EQ (XORconst [1] cmp:(SGTconst _)) yes no) => (NE cmp yes no) +(EQ (XORconst [1] cmp:(SGTUconst _)) yes no) => (NE cmp yes no) +(EQ (XORconst [1] cmp:(SGTzero _)) yes no) => (NE cmp yes no) +(EQ (XORconst [1] cmp:(SGTUzero _)) yes no) => (NE cmp yes no) +(NE (SGTUconst [1] x) yes no) => (EQ x yes no) +(EQ (SGTUconst [1] x) yes no) => (NE x yes no) +(NE (SGTUzero x) yes no) => (NE x yes no) +(EQ (SGTUzero x) yes no) => (EQ x yes no) +(NE (SGTconst [0] x) yes no) => (LTZ x yes no) +(EQ (SGTconst [0] x) yes no) => (GEZ x yes no) +(NE (SGTzero x) yes no) => (GTZ x yes no) +(EQ (SGTzero x) yes no) => (LEZ x yes no) + +// fold offset into address +(ADDconst [off1] (MOVWaddr [off2] {sym} ptr)) => (MOVWaddr [off1+off2] {sym} ptr) + +// fold address into load/store +(MOVBload [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVBload [off1+off2] {sym} ptr mem) +(MOVBUload [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVBUload [off1+off2] {sym} ptr mem) +(MOVHload [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVHload [off1+off2] {sym} ptr mem) +(MOVHUload [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVHUload [off1+off2] {sym} ptr mem) +(MOVWload [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVWload [off1+off2] {sym} ptr mem) +(MOVFload [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVFload [off1+off2] {sym} ptr mem) +(MOVDload [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVDload [off1+off2] {sym} ptr mem) + +(MOVBstore [off1] {sym} x:(ADDconst [off2] ptr) val mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVBstore [off1+off2] {sym} ptr val mem) +(MOVHstore [off1] {sym} x:(ADDconst [off2] ptr) val mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVHstore [off1+off2] {sym} ptr val mem) +(MOVWstore [off1] {sym} x:(ADDconst [off2] ptr) val mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVWstore [off1+off2] {sym} ptr val mem) +(MOVFstore [off1] {sym} x:(ADDconst [off2] ptr) val mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVFstore [off1+off2] {sym} ptr val mem) +(MOVDstore [off1] {sym} x:(ADDconst [off2] ptr) val mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVDstore [off1+off2] {sym} ptr val mem) + +(MOVBstorezero [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVBstorezero [off1+off2] {sym} ptr mem) +(MOVHstorezero [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVHstorezero [off1+off2] {sym} ptr mem) +(MOVWstorezero [off1] {sym} x:(ADDconst [off2] ptr) mem) && (is16Bit(int64(off1+off2)) || x.Uses == 1) => (MOVWstorezero [off1+off2] {sym} ptr mem) + +(MOVBload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) => + (MOVBload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVBUload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) => + (MOVBUload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVHload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) => + (MOVHload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVHUload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) => + (MOVHUload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVWload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) => + (MOVWload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVFload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) => + (MOVFload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVDload [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) => + (MOVDload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) + +(MOVBstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) => + (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem) +(MOVHstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) => + (MOVHstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem) +(MOVWstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) => + (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem) +(MOVFstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) => + (MOVFstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem) +(MOVDstore [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) => + (MOVDstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem) +(MOVBstorezero [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) => + (MOVBstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVHstorezero [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) => + (MOVHstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVWstorezero [off1] {sym1} (MOVWaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) => + (MOVWstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) + +// replace load from same location as preceding store with zero/sign extension (or copy in case of full width) +(MOVBload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBreg x) +(MOVBUload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBUreg x) +(MOVHload [off] {sym} ptr (MOVHstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVHreg x) +(MOVHUload [off] {sym} ptr (MOVHstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVHUreg x) +(MOVWload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x +(MOVFload [off] {sym} ptr (MOVFstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x +(MOVDload [off] {sym} ptr (MOVDstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x + +// store zero +(MOVBstore [off] {sym} ptr (MOVWconst [0]) mem) => (MOVBstorezero [off] {sym} ptr mem) +(MOVHstore [off] {sym} ptr (MOVWconst [0]) mem) => (MOVHstorezero [off] {sym} ptr mem) +(MOVWstore [off] {sym} ptr (MOVWconst [0]) mem) => (MOVWstorezero [off] {sym} ptr mem) + +// don't extend after proper load +(MOVBreg x:(MOVBload _ _)) => (MOVWreg x) +(MOVBUreg x:(MOVBUload _ _)) => (MOVWreg x) +(MOVHreg x:(MOVBload _ _)) => (MOVWreg x) +(MOVHreg x:(MOVBUload _ _)) => (MOVWreg x) +(MOVHreg x:(MOVHload _ _)) => (MOVWreg x) +(MOVHUreg x:(MOVBUload _ _)) => (MOVWreg x) +(MOVHUreg x:(MOVHUload _ _)) => (MOVWreg x) + +// fold double extensions +(MOVBreg x:(MOVBreg _)) => (MOVWreg x) +(MOVBUreg x:(MOVBUreg _)) => (MOVWreg x) +(MOVHreg x:(MOVBreg _)) => (MOVWreg x) +(MOVHreg x:(MOVBUreg _)) => (MOVWreg x) +(MOVHreg x:(MOVHreg _)) => (MOVWreg x) +(MOVHUreg x:(MOVBUreg _)) => (MOVWreg x) +(MOVHUreg x:(MOVHUreg _)) => (MOVWreg x) + +// sign extended loads +// Note: The combined instruction must end up in the same block +// as the original load. If not, we end up making a value with +// memory type live in two different blocks, which can lead to +// multiple memory values alive simultaneously. +// Make sure we don't combine these ops if the load has another use. +// This prevents a single load from being split into multiple loads +// which then might return different values. See test/atomicload.go. +(MOVBreg <t> x:(MOVBUload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBload <t> [off] {sym} ptr mem) +(MOVBUreg <t> x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBUload <t> [off] {sym} ptr mem) +(MOVHreg <t> x:(MOVHUload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVHload <t> [off] {sym} ptr mem) +(MOVHUreg <t> x:(MOVHload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVHUload <t> [off] {sym} ptr mem) + +// fold extensions and ANDs together +(MOVBUreg (ANDconst [c] x)) => (ANDconst [c&0xff] x) +(MOVHUreg (ANDconst [c] x)) => (ANDconst [c&0xffff] x) +(MOVBreg (ANDconst [c] x)) && c & 0x80 == 0 => (ANDconst [c&0x7f] x) +(MOVHreg (ANDconst [c] x)) && c & 0x8000 == 0 => (ANDconst [c&0x7fff] x) + +// don't extend before store +(MOVBstore [off] {sym} ptr (MOVBreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVBUreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVHreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVWreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOVHreg x) mem) => (MOVHstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVHstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOVWreg x) mem) => (MOVHstore [off] {sym} ptr x mem) +(MOVWstore [off] {sym} ptr (MOVWreg x) mem) => (MOVWstore [off] {sym} ptr x mem) + +// if a register move has only 1 use, just use the same register without emitting instruction +// MOVWnop doesn't emit instruction, only for ensuring the type. +(MOVWreg x) && x.Uses == 1 => (MOVWnop x) + +// TODO: we should be able to get rid of MOVWnop all together. +// But for now, this is enough to get rid of lots of them. +(MOVWnop (MOVWconst [c])) => (MOVWconst [c]) + +// fold constant into arithmetic ops +(ADD x (MOVWconst [c])) => (ADDconst [c] x) +(SUB x (MOVWconst [c])) => (SUBconst [c] x) +(AND x (MOVWconst [c])) => (ANDconst [c] x) +(OR x (MOVWconst [c])) => (ORconst [c] x) +(XOR x (MOVWconst [c])) => (XORconst [c] x) +(NOR x (MOVWconst [c])) => (NORconst [c] x) + +(SLL x (MOVWconst [c])) => (SLLconst x [c&31]) +(SRL x (MOVWconst [c])) => (SRLconst x [c&31]) +(SRA x (MOVWconst [c])) => (SRAconst x [c&31]) + +(SGT (MOVWconst [c]) x) => (SGTconst [c] x) +(SGTU (MOVWconst [c]) x) => (SGTUconst [c] x) +(SGT x (MOVWconst [0])) => (SGTzero x) +(SGTU x (MOVWconst [0])) => (SGTUzero x) + +// mul with constant +(Select1 (MULTU (MOVWconst [0]) _ )) => (MOVWconst [0]) +(Select0 (MULTU (MOVWconst [0]) _ )) => (MOVWconst [0]) +(Select1 (MULTU (MOVWconst [1]) x )) => x +(Select0 (MULTU (MOVWconst [1]) _ )) => (MOVWconst [0]) +(Select1 (MULTU (MOVWconst [-1]) x )) => (NEG <x.Type> x) +(Select0 (MULTU (MOVWconst [-1]) x )) => (CMOVZ (ADDconst <x.Type> [-1] x) (MOVWconst [0]) x) +(Select1 (MULTU (MOVWconst [c]) x )) && isPowerOfTwo64(int64(uint32(c))) => (SLLconst [int32(log2uint32(int64(c)))] x) +(Select0 (MULTU (MOVWconst [c]) x )) && isPowerOfTwo64(int64(uint32(c))) => (SRLconst [int32(32-log2uint32(int64(c)))] x) + +(MUL (MOVWconst [0]) _ ) => (MOVWconst [0]) +(MUL (MOVWconst [1]) x ) => x +(MUL (MOVWconst [-1]) x ) => (NEG x) +(MUL (MOVWconst [c]) x ) && isPowerOfTwo64(int64(uint32(c))) => (SLLconst [int32(log2uint32(int64(c)))] x) + +// generic simplifications +(ADD x (NEG y)) => (SUB x y) +(SUB x x) => (MOVWconst [0]) +(SUB (MOVWconst [0]) x) => (NEG x) +(AND x x) => x +(OR x x) => x +(XOR x x) => (MOVWconst [0]) + +// miscellaneous patterns generated by dec64 +(AND (SGTUconst [1] x) (SGTUconst [1] y)) => (SGTUconst [1] (OR <x.Type> x y)) +(OR (SGTUzero x) (SGTUzero y)) => (SGTUzero (OR <x.Type> x y)) + +// remove redundant *const ops +(ADDconst [0] x) => x +(SUBconst [0] x) => x +(ANDconst [0] _) => (MOVWconst [0]) +(ANDconst [-1] x) => x +(ORconst [0] x) => x +(ORconst [-1] _) => (MOVWconst [-1]) +(XORconst [0] x) => x +(XORconst [-1] x) => (NORconst [0] x) + +// generic constant folding +(ADDconst [c] (MOVWconst [d])) => (MOVWconst [int32(c+d)]) +(ADDconst [c] (ADDconst [d] x)) => (ADDconst [c+d] x) +(ADDconst [c] (SUBconst [d] x)) => (ADDconst [c-d] x) +(SUBconst [c] (MOVWconst [d])) => (MOVWconst [d-c]) +(SUBconst [c] (SUBconst [d] x)) => (ADDconst [-c-d] x) +(SUBconst [c] (ADDconst [d] x)) => (ADDconst [-c+d] x) +(SLLconst [c] (MOVWconst [d])) => (MOVWconst [d<<uint32(c)]) +(SRLconst [c] (MOVWconst [d])) => (MOVWconst [int32(uint32(d)>>uint32(c))]) +(SRAconst [c] (MOVWconst [d])) => (MOVWconst [d>>uint32(c)]) +(MUL (MOVWconst [c]) (MOVWconst [d])) => (MOVWconst [c*d]) +(Select1 (MULTU (MOVWconst [c]) (MOVWconst [d]))) => (MOVWconst [int32(uint32(c)*uint32(d))]) +(Select0 (MULTU (MOVWconst [c]) (MOVWconst [d]))) => (MOVWconst [int32((int64(uint32(c))*int64(uint32(d)))>>32)]) +(Select1 (DIV (MOVWconst [c]) (MOVWconst [d]))) && d != 0 => (MOVWconst [c/d]) +(Select1 (DIVU (MOVWconst [c]) (MOVWconst [d]))) && d != 0 => (MOVWconst [int32(uint32(c)/uint32(d))]) +(Select0 (DIV (MOVWconst [c]) (MOVWconst [d]))) && d != 0 => (MOVWconst [c%d]) +(Select0 (DIVU (MOVWconst [c]) (MOVWconst [d]))) && d != 0 => (MOVWconst [int32(uint32(c)%uint32(d))]) +(ANDconst [c] (MOVWconst [d])) => (MOVWconst [c&d]) +(ANDconst [c] (ANDconst [d] x)) => (ANDconst [c&d] x) +(ORconst [c] (MOVWconst [d])) => (MOVWconst [c|d]) +(ORconst [c] (ORconst [d] x)) => (ORconst [c|d] x) +(XORconst [c] (MOVWconst [d])) => (MOVWconst [c^d]) +(XORconst [c] (XORconst [d] x)) => (XORconst [c^d] x) +(NORconst [c] (MOVWconst [d])) => (MOVWconst [^(c|d)]) +(NEG (MOVWconst [c])) => (MOVWconst [-c]) +(MOVBreg (MOVWconst [c])) => (MOVWconst [int32(int8(c))]) +(MOVBUreg (MOVWconst [c])) => (MOVWconst [int32(uint8(c))]) +(MOVHreg (MOVWconst [c])) => (MOVWconst [int32(int16(c))]) +(MOVHUreg (MOVWconst [c])) => (MOVWconst [int32(uint16(c))]) +(MOVWreg (MOVWconst [c])) => (MOVWconst [c]) + +// constant comparisons +(SGTconst [c] (MOVWconst [d])) && c > d => (MOVWconst [1]) +(SGTconst [c] (MOVWconst [d])) && c <= d => (MOVWconst [0]) +(SGTUconst [c] (MOVWconst [d])) && uint32(c) > uint32(d) => (MOVWconst [1]) +(SGTUconst [c] (MOVWconst [d])) && uint32(c) <= uint32(d) => (MOVWconst [0]) +(SGTzero (MOVWconst [d])) && d > 0 => (MOVWconst [1]) +(SGTzero (MOVWconst [d])) && d <= 0 => (MOVWconst [0]) +(SGTUzero (MOVWconst [d])) && d != 0 => (MOVWconst [1]) +(SGTUzero (MOVWconst [d])) && d == 0 => (MOVWconst [0]) + +// other known comparisons +(SGTconst [c] (MOVBreg _)) && 0x7f < c => (MOVWconst [1]) +(SGTconst [c] (MOVBreg _)) && c <= -0x80 => (MOVWconst [0]) +(SGTconst [c] (MOVBUreg _)) && 0xff < c => (MOVWconst [1]) +(SGTconst [c] (MOVBUreg _)) && c < 0 => (MOVWconst [0]) +(SGTUconst [c] (MOVBUreg _)) && 0xff < uint32(c) => (MOVWconst [1]) +(SGTconst [c] (MOVHreg _)) && 0x7fff < c => (MOVWconst [1]) +(SGTconst [c] (MOVHreg _)) && c <= -0x8000 => (MOVWconst [0]) +(SGTconst [c] (MOVHUreg _)) && 0xffff < c => (MOVWconst [1]) +(SGTconst [c] (MOVHUreg _)) && c < 0 => (MOVWconst [0]) +(SGTUconst [c] (MOVHUreg _)) && 0xffff < uint32(c) => (MOVWconst [1]) +(SGTconst [c] (ANDconst [m] _)) && 0 <= m && m < c => (MOVWconst [1]) +(SGTUconst [c] (ANDconst [m] _)) && uint32(m) < uint32(c) => (MOVWconst [1]) +(SGTconst [c] (SRLconst _ [d])) && 0 <= c && uint32(d) <= 31 && 0xffffffff>>uint32(d) < uint32(c) => (MOVWconst [1]) +(SGTUconst [c] (SRLconst _ [d])) && uint32(d) <= 31 && 0xffffffff>>uint32(d) < uint32(c) => (MOVWconst [1]) + +// absorb constants into branches +(EQ (MOVWconst [0]) yes no) => (First yes no) +(EQ (MOVWconst [c]) yes no) && c != 0 => (First no yes) +(NE (MOVWconst [0]) yes no) => (First no yes) +(NE (MOVWconst [c]) yes no) && c != 0 => (First yes no) +(LTZ (MOVWconst [c]) yes no) && c < 0 => (First yes no) +(LTZ (MOVWconst [c]) yes no) && c >= 0 => (First no yes) +(LEZ (MOVWconst [c]) yes no) && c <= 0 => (First yes no) +(LEZ (MOVWconst [c]) yes no) && c > 0 => (First no yes) +(GTZ (MOVWconst [c]) yes no) && c > 0 => (First yes no) +(GTZ (MOVWconst [c]) yes no) && c <= 0 => (First no yes) +(GEZ (MOVWconst [c]) yes no) && c >= 0 => (First yes no) +(GEZ (MOVWconst [c]) yes no) && c < 0 => (First no yes) + +// conditional move +(CMOVZ _ f (MOVWconst [0])) => f +(CMOVZ a _ (MOVWconst [c])) && c!=0 => a +(CMOVZzero _ (MOVWconst [0])) => (MOVWconst [0]) +(CMOVZzero a (MOVWconst [c])) && c!=0 => a +(CMOVZ a (MOVWconst [0]) c) => (CMOVZzero a c) + +// atomic +(LoweredAtomicStore32 ptr (MOVWconst [0]) mem) => (LoweredAtomicStorezero ptr mem) +(LoweredAtomicAdd ptr (MOVWconst [c]) mem) && is16Bit(int64(c)) => (LoweredAtomicAddconst [c] ptr mem) + diff --git a/src/cmd/compile/internal/ssa/_gen/MIPS64.rules b/src/cmd/compile/internal/ssa/_gen/MIPS64.rules new file mode 100644 index 0000000..a594df2 --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/MIPS64.rules @@ -0,0 +1,691 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +(Add(Ptr|64|32|16|8) ...) => (ADDV ...) +(Add(32|64)F ...) => (ADD(F|D) ...) + +(Sub(Ptr|64|32|16|8) ...) => (SUBV ...) +(Sub(32|64)F ...) => (SUB(F|D) ...) + +(Mul(64|32|16|8) x y) => (Select1 (MULVU x y)) +(Mul(32|64)F ...) => (MUL(F|D) ...) +(Mul64uhilo ...) => (MULVU ...) +(Select0 (Mul64uover x y)) => (Select1 <typ.UInt64> (MULVU x y)) +(Select1 (Mul64uover x y)) => (SGTU <typ.Bool> (Select0 <typ.UInt64> (MULVU x y)) (MOVVconst <typ.UInt64> [0])) + +(Hmul64 x y) => (Select0 (MULV x y)) +(Hmul64u x y) => (Select0 (MULVU x y)) +(Hmul32 x y) => (SRAVconst (Select1 <typ.Int64> (MULV (SignExt32to64 x) (SignExt32to64 y))) [32]) +(Hmul32u x y) => (SRLVconst (Select1 <typ.UInt64> (MULVU (ZeroExt32to64 x) (ZeroExt32to64 y))) [32]) + +(Div64 x y) => (Select1 (DIVV x y)) +(Div64u x y) => (Select1 (DIVVU x y)) +(Div32 x y) => (Select1 (DIVV (SignExt32to64 x) (SignExt32to64 y))) +(Div32u x y) => (Select1 (DIVVU (ZeroExt32to64 x) (ZeroExt32to64 y))) +(Div16 x y) => (Select1 (DIVV (SignExt16to64 x) (SignExt16to64 y))) +(Div16u x y) => (Select1 (DIVVU (ZeroExt16to64 x) (ZeroExt16to64 y))) +(Div8 x y) => (Select1 (DIVV (SignExt8to64 x) (SignExt8to64 y))) +(Div8u x y) => (Select1 (DIVVU (ZeroExt8to64 x) (ZeroExt8to64 y))) +(Div(32|64)F ...) => (DIV(F|D) ...) + +(Mod64 x y) => (Select0 (DIVV x y)) +(Mod64u x y) => (Select0 (DIVVU x y)) +(Mod32 x y) => (Select0 (DIVV (SignExt32to64 x) (SignExt32to64 y))) +(Mod32u x y) => (Select0 (DIVVU (ZeroExt32to64 x) (ZeroExt32to64 y))) +(Mod16 x y) => (Select0 (DIVV (SignExt16to64 x) (SignExt16to64 y))) +(Mod16u x y) => (Select0 (DIVVU (ZeroExt16to64 x) (ZeroExt16to64 y))) +(Mod8 x y) => (Select0 (DIVV (SignExt8to64 x) (SignExt8to64 y))) +(Mod8u x y) => (Select0 (DIVVU (ZeroExt8to64 x) (ZeroExt8to64 y))) + +// (x + y) / 2 with x>=y => (x - y) / 2 + y +(Avg64u <t> x y) => (ADDV (SRLVconst <t> (SUBV <t> x y) [1]) y) + +(And(64|32|16|8) ...) => (AND ...) +(Or(64|32|16|8) ...) => (OR ...) +(Xor(64|32|16|8) ...) => (XOR ...) + +// shifts +// hardware instruction uses only the low 6 bits of the shift +// we compare to 64 to ensure Go semantics for large shifts +(Lsh64x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y)) +(Lsh64x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y))) +(Lsh64x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y))) +(Lsh64x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y))) + +(Lsh32x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y)) +(Lsh32x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y))) +(Lsh32x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y))) +(Lsh32x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y))) + +(Lsh16x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y)) +(Lsh16x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y))) +(Lsh16x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y))) +(Lsh16x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y))) + +(Lsh8x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y)) +(Lsh8x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y))) +(Lsh8x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y))) +(Lsh8x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y))) + +(Rsh64Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> x y)) +(Rsh64Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> x (ZeroExt32to64 y))) +(Rsh64Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> x (ZeroExt16to64 y))) +(Rsh64Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> x (ZeroExt8to64 y))) + +(Rsh32Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> (ZeroExt32to64 x) y)) +(Rsh32Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> (ZeroExt32to64 x) (ZeroExt32to64 y))) +(Rsh32Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> (ZeroExt32to64 x) (ZeroExt16to64 y))) +(Rsh32Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> (ZeroExt32to64 x) (ZeroExt8to64 y))) + +(Rsh16Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> (ZeroExt16to64 x) y)) +(Rsh16Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> (ZeroExt16to64 x) (ZeroExt32to64 y))) +(Rsh16Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> (ZeroExt16to64 x) (ZeroExt16to64 y))) +(Rsh16Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> (ZeroExt16to64 x) (ZeroExt8to64 y))) + +(Rsh8Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> (ZeroExt8to64 x) y)) +(Rsh8Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> (ZeroExt8to64 x) (ZeroExt32to64 y))) +(Rsh8Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> (ZeroExt8to64 x) (ZeroExt16to64 y))) +(Rsh8Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> (ZeroExt8to64 x) (ZeroExt8to64 y))) + +(Rsh64x64 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y)) +(Rsh64x32 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y))) +(Rsh64x16 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y))) +(Rsh64x8 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y))) + +(Rsh32x64 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y)) +(Rsh32x32 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y))) +(Rsh32x16 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y))) +(Rsh32x8 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y))) + +(Rsh16x64 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y)) +(Rsh16x32 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y))) +(Rsh16x16 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y))) +(Rsh16x8 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y))) + +(Rsh8x64 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y)) +(Rsh8x32 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y))) +(Rsh8x16 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y))) +(Rsh8x8 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y))) + +// rotates +(RotateLeft8 <t> x (MOVVconst [c])) => (Or8 (Lsh8x64 <t> x (MOVVconst [c&7])) (Rsh8Ux64 <t> x (MOVVconst [-c&7]))) +(RotateLeft16 <t> x (MOVVconst [c])) => (Or16 (Lsh16x64 <t> x (MOVVconst [c&15])) (Rsh16Ux64 <t> x (MOVVconst [-c&15]))) +(RotateLeft32 <t> x (MOVVconst [c])) => (Or32 (Lsh32x64 <t> x (MOVVconst [c&31])) (Rsh32Ux64 <t> x (MOVVconst [-c&31]))) +(RotateLeft64 <t> x (MOVVconst [c])) => (Or64 (Lsh64x64 <t> x (MOVVconst [c&63])) (Rsh64Ux64 <t> x (MOVVconst [-c&63]))) + +// unary ops +(Neg(64|32|16|8) ...) => (NEGV ...) +(Neg(32|64)F ...) => (NEG(F|D) ...) + +(Com(64|32|16|8) x) => (NOR (MOVVconst [0]) x) + +(Sqrt ...) => (SQRTD ...) +(Sqrt32 ...) => (SQRTF ...) + +// boolean ops -- booleans are represented with 0=false, 1=true +(AndB ...) => (AND ...) +(OrB ...) => (OR ...) +(EqB x y) => (XOR (MOVVconst [1]) (XOR <typ.Bool> x y)) +(NeqB ...) => (XOR ...) +(Not x) => (XORconst [1] x) + +// constants +(Const(64|32|16|8) [val]) => (MOVVconst [int64(val)]) +(Const(32|64)F [val]) => (MOV(F|D)const [float64(val)]) +(ConstNil) => (MOVVconst [0]) +(ConstBool [t]) => (MOVVconst [int64(b2i(t))]) + +(Slicemask <t> x) => (SRAVconst (NEGV <t> x) [63]) + +// truncations +// Because we ignore high parts of registers, truncates are just copies. +(Trunc16to8 ...) => (Copy ...) +(Trunc32to8 ...) => (Copy ...) +(Trunc32to16 ...) => (Copy ...) +(Trunc64to8 ...) => (Copy ...) +(Trunc64to16 ...) => (Copy ...) +(Trunc64to32 ...) => (Copy ...) + +// Zero-/Sign-extensions +(ZeroExt8to16 ...) => (MOVBUreg ...) +(ZeroExt8to32 ...) => (MOVBUreg ...) +(ZeroExt16to32 ...) => (MOVHUreg ...) +(ZeroExt8to64 ...) => (MOVBUreg ...) +(ZeroExt16to64 ...) => (MOVHUreg ...) +(ZeroExt32to64 ...) => (MOVWUreg ...) + +(SignExt8to16 ...) => (MOVBreg ...) +(SignExt8to32 ...) => (MOVBreg ...) +(SignExt16to32 ...) => (MOVHreg ...) +(SignExt8to64 ...) => (MOVBreg ...) +(SignExt16to64 ...) => (MOVHreg ...) +(SignExt32to64 ...) => (MOVWreg ...) + +// float <=> int conversion +(Cvt32to32F ...) => (MOVWF ...) +(Cvt32to64F ...) => (MOVWD ...) +(Cvt64to32F ...) => (MOVVF ...) +(Cvt64to64F ...) => (MOVVD ...) +(Cvt32Fto32 ...) => (TRUNCFW ...) +(Cvt64Fto32 ...) => (TRUNCDW ...) +(Cvt32Fto64 ...) => (TRUNCFV ...) +(Cvt64Fto64 ...) => (TRUNCDV ...) +(Cvt32Fto64F ...) => (MOVFD ...) +(Cvt64Fto32F ...) => (MOVDF ...) + +(CvtBoolToUint8 ...) => (Copy ...) + +(Round(32|64)F ...) => (Copy ...) + +// comparisons +(Eq8 x y) => (SGTU (MOVVconst [1]) (XOR (ZeroExt8to64 x) (ZeroExt8to64 y))) +(Eq16 x y) => (SGTU (MOVVconst [1]) (XOR (ZeroExt16to64 x) (ZeroExt16to64 y))) +(Eq32 x y) => (SGTU (MOVVconst [1]) (XOR (ZeroExt32to64 x) (ZeroExt32to64 y))) +(Eq64 x y) => (SGTU (MOVVconst [1]) (XOR x y)) +(EqPtr x y) => (SGTU (MOVVconst [1]) (XOR x y)) +(Eq(32|64)F x y) => (FPFlagTrue (CMPEQ(F|D) x y)) + +(Neq8 x y) => (SGTU (XOR (ZeroExt8to64 x) (ZeroExt8to64 y)) (MOVVconst [0])) +(Neq16 x y) => (SGTU (XOR (ZeroExt16to32 x) (ZeroExt16to64 y)) (MOVVconst [0])) +(Neq32 x y) => (SGTU (XOR (ZeroExt32to64 x) (ZeroExt32to64 y)) (MOVVconst [0])) +(Neq64 x y) => (SGTU (XOR x y) (MOVVconst [0])) +(NeqPtr x y) => (SGTU (XOR x y) (MOVVconst [0])) +(Neq(32|64)F x y) => (FPFlagFalse (CMPEQ(F|D) x y)) + +(Less8 x y) => (SGT (SignExt8to64 y) (SignExt8to64 x)) +(Less16 x y) => (SGT (SignExt16to64 y) (SignExt16to64 x)) +(Less32 x y) => (SGT (SignExt32to64 y) (SignExt32to64 x)) +(Less64 x y) => (SGT y x) +(Less(32|64)F x y) => (FPFlagTrue (CMPGT(F|D) y x)) // reverse operands to work around NaN + +(Less8U x y) => (SGTU (ZeroExt8to64 y) (ZeroExt8to64 x)) +(Less16U x y) => (SGTU (ZeroExt16to64 y) (ZeroExt16to64 x)) +(Less32U x y) => (SGTU (ZeroExt32to64 y) (ZeroExt32to64 x)) +(Less64U x y) => (SGTU y x) + +(Leq8 x y) => (XOR (MOVVconst [1]) (SGT (SignExt8to64 x) (SignExt8to64 y))) +(Leq16 x y) => (XOR (MOVVconst [1]) (SGT (SignExt16to64 x) (SignExt16to64 y))) +(Leq32 x y) => (XOR (MOVVconst [1]) (SGT (SignExt32to64 x) (SignExt32to64 y))) +(Leq64 x y) => (XOR (MOVVconst [1]) (SGT x y)) +(Leq(32|64)F x y) => (FPFlagTrue (CMPGE(F|D) y x)) // reverse operands to work around NaN + +(Leq8U x y) => (XOR (MOVVconst [1]) (SGTU (ZeroExt8to64 x) (ZeroExt8to64 y))) +(Leq16U x y) => (XOR (MOVVconst [1]) (SGTU (ZeroExt16to64 x) (ZeroExt16to64 y))) +(Leq32U x y) => (XOR (MOVVconst [1]) (SGTU (ZeroExt32to64 x) (ZeroExt32to64 y))) +(Leq64U x y) => (XOR (MOVVconst [1]) (SGTU x y)) + +(OffPtr [off] ptr:(SP)) && is32Bit(off) => (MOVVaddr [int32(off)] ptr) +(OffPtr [off] ptr) => (ADDVconst [off] ptr) + +(Addr {sym} base) => (MOVVaddr {sym} base) +(LocalAddr {sym} base _) => (MOVVaddr {sym} base) + +// loads +(Load <t> ptr mem) && t.IsBoolean() => (MOVBUload ptr mem) +(Load <t> ptr mem) && (is8BitInt(t) && isSigned(t)) => (MOVBload ptr mem) +(Load <t> ptr mem) && (is8BitInt(t) && !isSigned(t)) => (MOVBUload ptr mem) +(Load <t> ptr mem) && (is16BitInt(t) && isSigned(t)) => (MOVHload ptr mem) +(Load <t> ptr mem) && (is16BitInt(t) && !isSigned(t)) => (MOVHUload ptr mem) +(Load <t> ptr mem) && (is32BitInt(t) && isSigned(t)) => (MOVWload ptr mem) +(Load <t> ptr mem) && (is32BitInt(t) && !isSigned(t)) => (MOVWUload ptr mem) +(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVVload ptr mem) +(Load <t> ptr mem) && is32BitFloat(t) => (MOVFload ptr mem) +(Load <t> ptr mem) && is64BitFloat(t) => (MOVDload ptr mem) + +// stores +(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 4 && !is32BitFloat(val.Type) => (MOVWstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 8 && !is64BitFloat(val.Type) => (MOVVstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (MOVFstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (MOVDstore ptr val mem) + +// zeroing +(Zero [0] _ mem) => mem +(Zero [1] ptr mem) => (MOVBstore ptr (MOVVconst [0]) mem) +(Zero [2] {t} ptr mem) && t.Alignment()%2 == 0 => + (MOVHstore ptr (MOVVconst [0]) mem) +(Zero [2] ptr mem) => + (MOVBstore [1] ptr (MOVVconst [0]) + (MOVBstore [0] ptr (MOVVconst [0]) mem)) +(Zero [4] {t} ptr mem) && t.Alignment()%4 == 0 => + (MOVWstore ptr (MOVVconst [0]) mem) +(Zero [4] {t} ptr mem) && t.Alignment()%2 == 0 => + (MOVHstore [2] ptr (MOVVconst [0]) + (MOVHstore [0] ptr (MOVVconst [0]) mem)) +(Zero [4] ptr mem) => + (MOVBstore [3] ptr (MOVVconst [0]) + (MOVBstore [2] ptr (MOVVconst [0]) + (MOVBstore [1] ptr (MOVVconst [0]) + (MOVBstore [0] ptr (MOVVconst [0]) mem)))) +(Zero [8] {t} ptr mem) && t.Alignment()%8 == 0 => + (MOVVstore ptr (MOVVconst [0]) mem) +(Zero [8] {t} ptr mem) && t.Alignment()%4 == 0 => + (MOVWstore [4] ptr (MOVVconst [0]) + (MOVWstore [0] ptr (MOVVconst [0]) mem)) +(Zero [8] {t} ptr mem) && t.Alignment()%2 == 0 => + (MOVHstore [6] ptr (MOVVconst [0]) + (MOVHstore [4] ptr (MOVVconst [0]) + (MOVHstore [2] ptr (MOVVconst [0]) + (MOVHstore [0] ptr (MOVVconst [0]) mem)))) + +(Zero [3] ptr mem) => + (MOVBstore [2] ptr (MOVVconst [0]) + (MOVBstore [1] ptr (MOVVconst [0]) + (MOVBstore [0] ptr (MOVVconst [0]) mem))) +(Zero [6] {t} ptr mem) && t.Alignment()%2 == 0 => + (MOVHstore [4] ptr (MOVVconst [0]) + (MOVHstore [2] ptr (MOVVconst [0]) + (MOVHstore [0] ptr (MOVVconst [0]) mem))) +(Zero [12] {t} ptr mem) && t.Alignment()%4 == 0 => + (MOVWstore [8] ptr (MOVVconst [0]) + (MOVWstore [4] ptr (MOVVconst [0]) + (MOVWstore [0] ptr (MOVVconst [0]) mem))) +(Zero [16] {t} ptr mem) && t.Alignment()%8 == 0 => + (MOVVstore [8] ptr (MOVVconst [0]) + (MOVVstore [0] ptr (MOVVconst [0]) mem)) +(Zero [24] {t} ptr mem) && t.Alignment()%8 == 0 => + (MOVVstore [16] ptr (MOVVconst [0]) + (MOVVstore [8] ptr (MOVVconst [0]) + (MOVVstore [0] ptr (MOVVconst [0]) mem))) + +// medium zeroing uses a duff device +// 8, and 128 are magic constants, see runtime/mkduff.go +(Zero [s] {t} ptr mem) + && s%8 == 0 && s > 24 && s <= 8*128 + && t.Alignment()%8 == 0 && !config.noDuffDevice => + (DUFFZERO [8 * (128 - s/8)] ptr mem) + +// large or unaligned zeroing uses a loop +(Zero [s] {t} ptr mem) + && (s > 8*128 || config.noDuffDevice) || t.Alignment()%8 != 0 => + (LoweredZero [t.Alignment()] + ptr + (ADDVconst <ptr.Type> ptr [s-moveSize(t.Alignment(), config)]) + mem) + +// moves +(Move [0] _ _ mem) => mem +(Move [1] dst src mem) => (MOVBstore dst (MOVBload src mem) mem) +(Move [2] {t} dst src mem) && t.Alignment()%2 == 0 => + (MOVHstore dst (MOVHload src mem) mem) +(Move [2] dst src mem) => + (MOVBstore [1] dst (MOVBload [1] src mem) + (MOVBstore dst (MOVBload src mem) mem)) +(Move [4] {t} dst src mem) && t.Alignment()%4 == 0 => + (MOVWstore dst (MOVWload src mem) mem) +(Move [4] {t} dst src mem) && t.Alignment()%2 == 0 => + (MOVHstore [2] dst (MOVHload [2] src mem) + (MOVHstore dst (MOVHload src mem) mem)) +(Move [4] dst src mem) => + (MOVBstore [3] dst (MOVBload [3] src mem) + (MOVBstore [2] dst (MOVBload [2] src mem) + (MOVBstore [1] dst (MOVBload [1] src mem) + (MOVBstore dst (MOVBload src mem) mem)))) +(Move [8] {t} dst src mem) && t.Alignment()%8 == 0 => + (MOVVstore dst (MOVVload src mem) mem) +(Move [8] {t} dst src mem) && t.Alignment()%4 == 0 => + (MOVWstore [4] dst (MOVWload [4] src mem) + (MOVWstore dst (MOVWload src mem) mem)) +(Move [8] {t} dst src mem) && t.Alignment()%2 == 0 => + (MOVHstore [6] dst (MOVHload [6] src mem) + (MOVHstore [4] dst (MOVHload [4] src mem) + (MOVHstore [2] dst (MOVHload [2] src mem) + (MOVHstore dst (MOVHload src mem) mem)))) + +(Move [3] dst src mem) => + (MOVBstore [2] dst (MOVBload [2] src mem) + (MOVBstore [1] dst (MOVBload [1] src mem) + (MOVBstore dst (MOVBload src mem) mem))) +(Move [6] {t} dst src mem) && t.Alignment()%2 == 0 => + (MOVHstore [4] dst (MOVHload [4] src mem) + (MOVHstore [2] dst (MOVHload [2] src mem) + (MOVHstore dst (MOVHload src mem) mem))) +(Move [12] {t} dst src mem) && t.Alignment()%4 == 0 => + (MOVWstore [8] dst (MOVWload [8] src mem) + (MOVWstore [4] dst (MOVWload [4] src mem) + (MOVWstore dst (MOVWload src mem) mem))) +(Move [16] {t} dst src mem) && t.Alignment()%8 == 0 => + (MOVVstore [8] dst (MOVVload [8] src mem) + (MOVVstore dst (MOVVload src mem) mem)) +(Move [24] {t} dst src mem) && t.Alignment()%8 == 0 => + (MOVVstore [16] dst (MOVVload [16] src mem) + (MOVVstore [8] dst (MOVVload [8] src mem) + (MOVVstore dst (MOVVload src mem) mem))) + +// medium move uses a duff device +(Move [s] {t} dst src mem) + && s%8 == 0 && s >= 24 && s <= 8*128 && t.Alignment()%8 == 0 + && !config.noDuffDevice && logLargeCopy(v, s) => + (DUFFCOPY [16 * (128 - s/8)] dst src mem) +// 16 and 128 are magic constants. 16 is the number of bytes to encode: +// MOVV (R1), R23 +// ADDV $8, R1 +// MOVV R23, (R2) +// ADDV $8, R2 +// and 128 is the number of such blocks. See runtime/duff_mips64.s:duffcopy. + +// large or unaligned move uses a loop +(Move [s] {t} dst src mem) + && s > 24 && logLargeCopy(v, s) || t.Alignment()%8 != 0 => + (LoweredMove [t.Alignment()] + dst + src + (ADDVconst <src.Type> src [s-moveSize(t.Alignment(), config)]) + mem) + +// calls +(StaticCall ...) => (CALLstatic ...) +(ClosureCall ...) => (CALLclosure ...) +(InterCall ...) => (CALLinter ...) +(TailCall ...) => (CALLtail ...) + +// atomic intrinsics +(AtomicLoad(8|32|64) ...) => (LoweredAtomicLoad(8|32|64) ...) +(AtomicLoadPtr ...) => (LoweredAtomicLoad64 ...) + +(AtomicStore(8|32|64) ...) => (LoweredAtomicStore(8|32|64) ...) +(AtomicStorePtrNoWB ...) => (LoweredAtomicStore64 ...) + +(AtomicExchange(32|64) ...) => (LoweredAtomicExchange(32|64) ...) + +(AtomicAdd(32|64) ...) => (LoweredAtomicAdd(32|64) ...) + +(AtomicCompareAndSwap32 ptr old new mem) => (LoweredAtomicCas32 ptr (SignExt32to64 old) new mem) +(AtomicCompareAndSwap64 ...) => (LoweredAtomicCas64 ...) + +// checks +(NilCheck ...) => (LoweredNilCheck ...) +(IsNonNil ptr) => (SGTU ptr (MOVVconst [0])) +(IsInBounds idx len) => (SGTU len idx) +(IsSliceInBounds idx len) => (XOR (MOVVconst [1]) (SGTU idx len)) + +// pseudo-ops +(GetClosurePtr ...) => (LoweredGetClosurePtr ...) +(GetCallerSP ...) => (LoweredGetCallerSP ...) +(GetCallerPC ...) => (LoweredGetCallerPC ...) + +(If cond yes no) => (NE cond yes no) + +// Write barrier. +(WB ...) => (LoweredWB ...) + +(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem) +(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem) +(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem) + +// Optimizations + +// Absorb boolean tests into block +(NE (FPFlagTrue cmp) yes no) => (FPT cmp yes no) +(NE (FPFlagFalse cmp) yes no) => (FPF cmp yes no) +(EQ (FPFlagTrue cmp) yes no) => (FPF cmp yes no) +(EQ (FPFlagFalse cmp) yes no) => (FPT cmp yes no) +(NE (XORconst [1] cmp:(SGT _ _)) yes no) => (EQ cmp yes no) +(NE (XORconst [1] cmp:(SGTU _ _)) yes no) => (EQ cmp yes no) +(NE (XORconst [1] cmp:(SGTconst _)) yes no) => (EQ cmp yes no) +(NE (XORconst [1] cmp:(SGTUconst _)) yes no) => (EQ cmp yes no) +(EQ (XORconst [1] cmp:(SGT _ _)) yes no) => (NE cmp yes no) +(EQ (XORconst [1] cmp:(SGTU _ _)) yes no) => (NE cmp yes no) +(EQ (XORconst [1] cmp:(SGTconst _)) yes no) => (NE cmp yes no) +(EQ (XORconst [1] cmp:(SGTUconst _)) yes no) => (NE cmp yes no) +(NE (SGTUconst [1] x) yes no) => (EQ x yes no) +(EQ (SGTUconst [1] x) yes no) => (NE x yes no) +(NE (SGTU x (MOVVconst [0])) yes no) => (NE x yes no) +(EQ (SGTU x (MOVVconst [0])) yes no) => (EQ x yes no) +(NE (SGTconst [0] x) yes no) => (LTZ x yes no) +(EQ (SGTconst [0] x) yes no) => (GEZ x yes no) +(NE (SGT x (MOVVconst [0])) yes no) => (GTZ x yes no) +(EQ (SGT x (MOVVconst [0])) yes no) => (LEZ x yes no) + +// fold offset into address +(ADDVconst [off1] (MOVVaddr [off2] {sym} ptr)) && is32Bit(off1+int64(off2)) => (MOVVaddr [int32(off1)+int32(off2)] {sym} ptr) + +// fold address into load/store +(MOVBload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVBload [off1+int32(off2)] {sym} ptr mem) +(MOVBUload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVBUload [off1+int32(off2)] {sym} ptr mem) +(MOVHload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVHload [off1+int32(off2)] {sym} ptr mem) +(MOVHUload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVHUload [off1+int32(off2)] {sym} ptr mem) +(MOVWload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVWload [off1+int32(off2)] {sym} ptr mem) +(MOVWUload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVWUload [off1+int32(off2)] {sym} ptr mem) +(MOVVload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVVload [off1+int32(off2)] {sym} ptr mem) +(MOVFload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVFload [off1+int32(off2)] {sym} ptr mem) +(MOVDload [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVDload [off1+int32(off2)] {sym} ptr mem) + +(MOVBstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVBstore [off1+int32(off2)] {sym} ptr val mem) +(MOVHstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVHstore [off1+int32(off2)] {sym} ptr val mem) +(MOVWstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVWstore [off1+int32(off2)] {sym} ptr val mem) +(MOVVstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVVstore [off1+int32(off2)] {sym} ptr val mem) +(MOVFstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVFstore [off1+int32(off2)] {sym} ptr val mem) +(MOVDstore [off1] {sym} (ADDVconst [off2] ptr) val mem) && is32Bit(int64(off1)+off2) => (MOVDstore [off1+int32(off2)] {sym} ptr val mem) +(MOVBstorezero [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVBstorezero [off1+int32(off2)] {sym} ptr mem) +(MOVHstorezero [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVHstorezero [off1+int32(off2)] {sym} ptr mem) +(MOVWstorezero [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVWstorezero [off1+int32(off2)] {sym} ptr mem) +(MOVVstorezero [off1] {sym} (ADDVconst [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVVstorezero [off1+int32(off2)] {sym} ptr mem) + +(MOVBload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVBload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) +(MOVBUload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVBUload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) +(MOVHload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVHload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) +(MOVHUload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVHUload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) +(MOVWload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVWload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) +(MOVWUload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVWUload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) +(MOVVload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVVload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) +(MOVFload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVFload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) +(MOVDload [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVDload [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) + +(MOVBstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVBstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem) +(MOVHstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVHstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem) +(MOVWstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVWstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem) +(MOVVstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVVstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem) +(MOVFstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVFstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem) +(MOVDstore [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVDstore [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr val mem) +(MOVBstorezero [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVBstorezero [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) +(MOVHstorezero [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVHstorezero [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) +(MOVWstorezero [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVWstorezero [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) +(MOVVstorezero [off1] {sym1} (MOVVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVVstorezero [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem) + +// store zero +(MOVBstore [off] {sym} ptr (MOVVconst [0]) mem) => (MOVBstorezero [off] {sym} ptr mem) +(MOVHstore [off] {sym} ptr (MOVVconst [0]) mem) => (MOVHstorezero [off] {sym} ptr mem) +(MOVWstore [off] {sym} ptr (MOVVconst [0]) mem) => (MOVWstorezero [off] {sym} ptr mem) +(MOVVstore [off] {sym} ptr (MOVVconst [0]) mem) => (MOVVstorezero [off] {sym} ptr mem) + +// don't extend after proper load +(MOVBreg x:(MOVBload _ _)) => (MOVVreg x) +(MOVBUreg x:(MOVBUload _ _)) => (MOVVreg x) +(MOVHreg x:(MOVBload _ _)) => (MOVVreg x) +(MOVHreg x:(MOVBUload _ _)) => (MOVVreg x) +(MOVHreg x:(MOVHload _ _)) => (MOVVreg x) +(MOVHUreg x:(MOVBUload _ _)) => (MOVVreg x) +(MOVHUreg x:(MOVHUload _ _)) => (MOVVreg x) +(MOVWreg x:(MOVBload _ _)) => (MOVVreg x) +(MOVWreg x:(MOVBUload _ _)) => (MOVVreg x) +(MOVWreg x:(MOVHload _ _)) => (MOVVreg x) +(MOVWreg x:(MOVHUload _ _)) => (MOVVreg x) +(MOVWreg x:(MOVWload _ _)) => (MOVVreg x) +(MOVWUreg x:(MOVBUload _ _)) => (MOVVreg x) +(MOVWUreg x:(MOVHUload _ _)) => (MOVVreg x) +(MOVWUreg x:(MOVWUload _ _)) => (MOVVreg x) + +// fold double extensions +(MOVBreg x:(MOVBreg _)) => (MOVVreg x) +(MOVBUreg x:(MOVBUreg _)) => (MOVVreg x) +(MOVHreg x:(MOVBreg _)) => (MOVVreg x) +(MOVHreg x:(MOVBUreg _)) => (MOVVreg x) +(MOVHreg x:(MOVHreg _)) => (MOVVreg x) +(MOVHUreg x:(MOVBUreg _)) => (MOVVreg x) +(MOVHUreg x:(MOVHUreg _)) => (MOVVreg x) +(MOVWreg x:(MOVBreg _)) => (MOVVreg x) +(MOVWreg x:(MOVBUreg _)) => (MOVVreg x) +(MOVWreg x:(MOVHreg _)) => (MOVVreg x) +(MOVWreg x:(MOVWreg _)) => (MOVVreg x) +(MOVWUreg x:(MOVBUreg _)) => (MOVVreg x) +(MOVWUreg x:(MOVHUreg _)) => (MOVVreg x) +(MOVWUreg x:(MOVWUreg _)) => (MOVVreg x) + +// don't extend before store +(MOVBstore [off] {sym} ptr (MOVBreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVBUreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVHreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVWreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOVHreg x) mem) => (MOVHstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVHstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOVWreg x) mem) => (MOVHstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVHstore [off] {sym} ptr x mem) +(MOVWstore [off] {sym} ptr (MOVWreg x) mem) => (MOVWstore [off] {sym} ptr x mem) +(MOVWstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVWstore [off] {sym} ptr x mem) + +// if a register move has only 1 use, just use the same register without emitting instruction +// MOVVnop doesn't emit instruction, only for ensuring the type. +(MOVVreg x) && x.Uses == 1 => (MOVVnop x) + +// TODO: we should be able to get rid of MOVVnop all together. +// But for now, this is enough to get rid of lots of them. +(MOVVnop (MOVVconst [c])) => (MOVVconst [c]) + +// fold constant into arithmetic ops +(ADDV x (MOVVconst [c])) && is32Bit(c) => (ADDVconst [c] x) +(SUBV x (MOVVconst [c])) && is32Bit(c) => (SUBVconst [c] x) +(AND x (MOVVconst [c])) && is32Bit(c) => (ANDconst [c] x) +(OR x (MOVVconst [c])) && is32Bit(c) => (ORconst [c] x) +(XOR x (MOVVconst [c])) && is32Bit(c) => (XORconst [c] x) +(NOR x (MOVVconst [c])) && is32Bit(c) => (NORconst [c] x) + +(SLLV _ (MOVVconst [c])) && uint64(c)>=64 => (MOVVconst [0]) +(SRLV _ (MOVVconst [c])) && uint64(c)>=64 => (MOVVconst [0]) +(SRAV x (MOVVconst [c])) && uint64(c)>=64 => (SRAVconst x [63]) +(SLLV x (MOVVconst [c])) => (SLLVconst x [c]) +(SRLV x (MOVVconst [c])) => (SRLVconst x [c]) +(SRAV x (MOVVconst [c])) => (SRAVconst x [c]) + +(SGT (MOVVconst [c]) x) && is32Bit(c) => (SGTconst [c] x) +(SGTU (MOVVconst [c]) x) && is32Bit(c) => (SGTUconst [c] x) + +// mul by constant +(Select1 (MULVU x (MOVVconst [-1]))) => (NEGV x) +(Select1 (MULVU _ (MOVVconst [0]))) => (MOVVconst [0]) +(Select1 (MULVU x (MOVVconst [1]))) => x +(Select1 (MULVU x (MOVVconst [c]))) && isPowerOfTwo64(c) => (SLLVconst [log64(c)] x) + +// div by constant +(Select1 (DIVVU x (MOVVconst [1]))) => x +(Select1 (DIVVU x (MOVVconst [c]))) && isPowerOfTwo64(c) => (SRLVconst [log64(c)] x) +(Select0 (DIVVU _ (MOVVconst [1]))) => (MOVVconst [0]) // mod +(Select0 (DIVVU x (MOVVconst [c]))) && isPowerOfTwo64(c) => (ANDconst [c-1] x) // mod + +// generic simplifications +(ADDV x (NEGV y)) => (SUBV x y) +(SUBV x x) => (MOVVconst [0]) +(SUBV (MOVVconst [0]) x) => (NEGV x) +(AND x x) => x +(OR x x) => x +(XOR x x) => (MOVVconst [0]) + +// remove redundant *const ops +(ADDVconst [0] x) => x +(SUBVconst [0] x) => x +(ANDconst [0] _) => (MOVVconst [0]) +(ANDconst [-1] x) => x +(ORconst [0] x) => x +(ORconst [-1] _) => (MOVVconst [-1]) +(XORconst [0] x) => x +(XORconst [-1] x) => (NORconst [0] x) + +// generic constant folding +(ADDVconst [c] (MOVVconst [d])) => (MOVVconst [c+d]) +(ADDVconst [c] (ADDVconst [d] x)) && is32Bit(c+d) => (ADDVconst [c+d] x) +(ADDVconst [c] (SUBVconst [d] x)) && is32Bit(c-d) => (ADDVconst [c-d] x) +(SUBVconst [c] (MOVVconst [d])) => (MOVVconst [d-c]) +(SUBVconst [c] (SUBVconst [d] x)) && is32Bit(-c-d) => (ADDVconst [-c-d] x) +(SUBVconst [c] (ADDVconst [d] x)) && is32Bit(-c+d) => (ADDVconst [-c+d] x) +(SLLVconst [c] (MOVVconst [d])) => (MOVVconst [d<<uint64(c)]) +(SRLVconst [c] (MOVVconst [d])) => (MOVVconst [int64(uint64(d)>>uint64(c))]) +(SRAVconst [c] (MOVVconst [d])) => (MOVVconst [d>>uint64(c)]) +(Select1 (MULVU (MOVVconst [c]) (MOVVconst [d]))) => (MOVVconst [c*d]) +(Select1 (DIVV (MOVVconst [c]) (MOVVconst [d]))) && d != 0 => (MOVVconst [c/d]) +(Select1 (DIVVU (MOVVconst [c]) (MOVVconst [d]))) && d != 0 => (MOVVconst [int64(uint64(c)/uint64(d))]) +(Select0 (DIVV (MOVVconst [c]) (MOVVconst [d]))) && d != 0 => (MOVVconst [c%d]) // mod +(Select0 (DIVVU (MOVVconst [c]) (MOVVconst [d]))) && d != 0 => (MOVVconst [int64(uint64(c)%uint64(d))]) // mod +(ANDconst [c] (MOVVconst [d])) => (MOVVconst [c&d]) +(ANDconst [c] (ANDconst [d] x)) => (ANDconst [c&d] x) +(ORconst [c] (MOVVconst [d])) => (MOVVconst [c|d]) +(ORconst [c] (ORconst [d] x)) && is32Bit(c|d) => (ORconst [c|d] x) +(XORconst [c] (MOVVconst [d])) => (MOVVconst [c^d]) +(XORconst [c] (XORconst [d] x)) && is32Bit(c^d) => (XORconst [c^d] x) +(NORconst [c] (MOVVconst [d])) => (MOVVconst [^(c|d)]) +(NEGV (MOVVconst [c])) => (MOVVconst [-c]) +(MOVBreg (MOVVconst [c])) => (MOVVconst [int64(int8(c))]) +(MOVBUreg (MOVVconst [c])) => (MOVVconst [int64(uint8(c))]) +(MOVHreg (MOVVconst [c])) => (MOVVconst [int64(int16(c))]) +(MOVHUreg (MOVVconst [c])) => (MOVVconst [int64(uint16(c))]) +(MOVWreg (MOVVconst [c])) => (MOVVconst [int64(int32(c))]) +(MOVWUreg (MOVVconst [c])) => (MOVVconst [int64(uint32(c))]) +(MOVVreg (MOVVconst [c])) => (MOVVconst [c]) +(LoweredAtomicStore(32|64) ptr (MOVVconst [0]) mem) => (LoweredAtomicStorezero(32|64) ptr mem) +(LoweredAtomicAdd32 ptr (MOVVconst [c]) mem) && is32Bit(c) => (LoweredAtomicAddconst32 [int32(c)] ptr mem) +(LoweredAtomicAdd64 ptr (MOVVconst [c]) mem) && is32Bit(c) => (LoweredAtomicAddconst64 [c] ptr mem) + +// constant comparisons +(SGTconst [c] (MOVVconst [d])) && c>d => (MOVVconst [1]) +(SGTconst [c] (MOVVconst [d])) && c<=d => (MOVVconst [0]) +(SGTUconst [c] (MOVVconst [d])) && uint64(c)>uint64(d) => (MOVVconst [1]) +(SGTUconst [c] (MOVVconst [d])) && uint64(c)<=uint64(d) => (MOVVconst [0]) + +// other known comparisons +(SGTconst [c] (MOVBreg _)) && 0x7f < c => (MOVVconst [1]) +(SGTconst [c] (MOVBreg _)) && c <= -0x80 => (MOVVconst [0]) +(SGTconst [c] (MOVBUreg _)) && 0xff < c => (MOVVconst [1]) +(SGTconst [c] (MOVBUreg _)) && c < 0 => (MOVVconst [0]) +(SGTUconst [c] (MOVBUreg _)) && 0xff < uint64(c) => (MOVVconst [1]) +(SGTconst [c] (MOVHreg _)) && 0x7fff < c => (MOVVconst [1]) +(SGTconst [c] (MOVHreg _)) && c <= -0x8000 => (MOVVconst [0]) +(SGTconst [c] (MOVHUreg _)) && 0xffff < c => (MOVVconst [1]) +(SGTconst [c] (MOVHUreg _)) && c < 0 => (MOVVconst [0]) +(SGTUconst [c] (MOVHUreg _)) && 0xffff < uint64(c) => (MOVVconst [1]) +(SGTconst [c] (MOVWUreg _)) && c < 0 => (MOVVconst [0]) +(SGTconst [c] (ANDconst [m] _)) && 0 <= m && m < c => (MOVVconst [1]) +(SGTUconst [c] (ANDconst [m] _)) && uint64(m) < uint64(c) => (MOVVconst [1]) +(SGTconst [c] (SRLVconst _ [d])) && 0 <= c && 0 < d && d <= 63 && 0xffffffffffffffff>>uint64(d) < uint64(c) => (MOVVconst [1]) +(SGTUconst [c] (SRLVconst _ [d])) && 0 < d && d <= 63 && 0xffffffffffffffff>>uint64(d) < uint64(c) => (MOVVconst [1]) + +// absorb constants into branches +(EQ (MOVVconst [0]) yes no) => (First yes no) +(EQ (MOVVconst [c]) yes no) && c != 0 => (First no yes) +(NE (MOVVconst [0]) yes no) => (First no yes) +(NE (MOVVconst [c]) yes no) && c != 0 => (First yes no) +(LTZ (MOVVconst [c]) yes no) && c < 0 => (First yes no) +(LTZ (MOVVconst [c]) yes no) && c >= 0 => (First no yes) +(LEZ (MOVVconst [c]) yes no) && c <= 0 => (First yes no) +(LEZ (MOVVconst [c]) yes no) && c > 0 => (First no yes) +(GTZ (MOVVconst [c]) yes no) && c > 0 => (First yes no) +(GTZ (MOVVconst [c]) yes no) && c <= 0 => (First no yes) +(GEZ (MOVVconst [c]) yes no) && c >= 0 => (First yes no) +(GEZ (MOVVconst [c]) yes no) && c < 0 => (First no yes) + +// fold readonly sym load +(MOVBload [off] {sym} (SB) _) && symIsRO(sym) => (MOVVconst [int64(read8(sym, int64(off)))]) +(MOVHload [off] {sym} (SB) _) && symIsRO(sym) => (MOVVconst [int64(read16(sym, int64(off), config.ctxt.Arch.ByteOrder))]) +(MOVWload [off] {sym} (SB) _) && symIsRO(sym) => (MOVVconst [int64(read32(sym, int64(off), config.ctxt.Arch.ByteOrder))]) +(MOVVload [off] {sym} (SB) _) && symIsRO(sym) => (MOVVconst [int64(read64(sym, int64(off), config.ctxt.Arch.ByteOrder))]) diff --git a/src/cmd/compile/internal/ssa/_gen/MIPS64Ops.go b/src/cmd/compile/internal/ssa/_gen/MIPS64Ops.go new file mode 100644 index 0000000..89c8772 --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/MIPS64Ops.go @@ -0,0 +1,482 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import "strings" + +// Notes: +// - Integer types live in the low portion of registers. Upper portions are junk. +// - Boolean types use the low-order byte of a register. 0=false, 1=true. +// Upper bytes are junk. +// - *const instructions may use a constant larger than the instruction can encode. +// In this case the assembler expands to multiple instructions and uses tmp +// register (R23). + +// Suffixes encode the bit width of various instructions. +// V (vlong) = 64 bit +// WU (word) = 32 bit unsigned +// W (word) = 32 bit +// H (half word) = 16 bit +// HU = 16 bit unsigned +// B (byte) = 8 bit +// BU = 8 bit unsigned +// F (float) = 32 bit float +// D (double) = 64 bit float + +// Note: registers not used in regalloc are not included in this list, +// so that regmask stays within int64 +// Be careful when hand coding regmasks. +var regNamesMIPS64 = []string{ + "R0", // constant 0 + "R1", + "R2", + "R3", + "R4", + "R5", + "R6", + "R7", + "R8", + "R9", + "R10", + "R11", + "R12", + "R13", + "R14", + "R15", + "R16", + "R17", + "R18", + "R19", + "R20", + "R21", + "R22", + // R23 = REGTMP not used in regalloc + "R24", + "R25", + // R26 reserved by kernel + // R27 reserved by kernel + // R28 = REGSB not used in regalloc + "SP", // aka R29 + "g", // aka R30 + "R31", // aka REGLINK + + "F0", + "F1", + "F2", + "F3", + "F4", + "F5", + "F6", + "F7", + "F8", + "F9", + "F10", + "F11", + "F12", + "F13", + "F14", + "F15", + "F16", + "F17", + "F18", + "F19", + "F20", + "F21", + "F22", + "F23", + "F24", + "F25", + "F26", + "F27", + "F28", + "F29", + "F30", + "F31", + + "HI", // high bits of multiplication + "LO", // low bits of multiplication + + // If you add registers, update asyncPreempt in runtime. + + // pseudo-registers + "SB", +} + +func init() { + // Make map from reg names to reg integers. + if len(regNamesMIPS64) > 64 { + panic("too many registers") + } + num := map[string]int{} + for i, name := range regNamesMIPS64 { + num[name] = i + } + buildReg := func(s string) regMask { + m := regMask(0) + for _, r := range strings.Split(s, " ") { + if n, ok := num[r]; ok { + m |= regMask(1) << uint(n) + continue + } + panic("register " + r + " not found") + } + return m + } + + // Common individual register masks + var ( + gp = buildReg("R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R22 R24 R25 R31") + gpg = gp | buildReg("g") + gpsp = gp | buildReg("SP") + gpspg = gpg | buildReg("SP") + gpspsbg = gpspg | buildReg("SB") + fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31") + lo = buildReg("LO") + hi = buildReg("HI") + callerSave = gp | fp | lo | hi | buildReg("g") // runtime.setg (and anything calling it) may clobber g + r1 = buildReg("R1") + r2 = buildReg("R2") + r3 = buildReg("R3") + r4 = buildReg("R4") + ) + // Common regInfo + var ( + gp01 = regInfo{inputs: nil, outputs: []regMask{gp}} + gp11 = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}} + gp11sp = regInfo{inputs: []regMask{gpspg}, outputs: []regMask{gp}} + gp21 = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp}} + gp2hilo = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{hi, lo}} + gpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}} + gpstore = regInfo{inputs: []regMask{gpspsbg, gpg}} + gpstore0 = regInfo{inputs: []regMask{gpspsbg}} + gpxchg = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}} + gpcas = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}, outputs: []regMask{gp}} + fp01 = regInfo{inputs: nil, outputs: []regMask{fp}} + fp11 = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}} + //fp1flags = regInfo{inputs: []regMask{fp}} + //fpgp = regInfo{inputs: []regMask{fp}, outputs: []regMask{gp}} + //gpfp = regInfo{inputs: []regMask{gp}, outputs: []regMask{fp}} + fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}} + fp2flags = regInfo{inputs: []regMask{fp, fp}} + fpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{fp}} + fpstore = regInfo{inputs: []regMask{gpspsbg, fp}} + readflags = regInfo{inputs: nil, outputs: []regMask{gp}} + ) + ops := []opData{ + // binary ops + {name: "ADDV", argLength: 2, reg: gp21, asm: "ADDVU", commutative: true}, // arg0 + arg1 + {name: "ADDVconst", argLength: 1, reg: gp11sp, asm: "ADDVU", aux: "Int64"}, // arg0 + auxInt. auxInt is 32-bit, also in other *const ops. + {name: "SUBV", argLength: 2, reg: gp21, asm: "SUBVU"}, // arg0 - arg1 + {name: "SUBVconst", argLength: 1, reg: gp11, asm: "SUBVU", aux: "Int64"}, // arg0 - auxInt + {name: "MULV", argLength: 2, reg: gp2hilo, asm: "MULV", commutative: true, typ: "(Int64,Int64)"}, // arg0 * arg1, signed, results hi,lo + {name: "MULVU", argLength: 2, reg: gp2hilo, asm: "MULVU", commutative: true, typ: "(UInt64,UInt64)"}, // arg0 * arg1, unsigned, results hi,lo + {name: "DIVV", argLength: 2, reg: gp2hilo, asm: "DIVV", typ: "(Int64,Int64)"}, // arg0 / arg1, signed, results hi=arg0%arg1,lo=arg0/arg1 + {name: "DIVVU", argLength: 2, reg: gp2hilo, asm: "DIVVU", typ: "(UInt64,UInt64)"}, // arg0 / arg1, signed, results hi=arg0%arg1,lo=arg0/arg1 + + {name: "ADDF", argLength: 2, reg: fp21, asm: "ADDF", commutative: true}, // arg0 + arg1 + {name: "ADDD", argLength: 2, reg: fp21, asm: "ADDD", commutative: true}, // arg0 + arg1 + {name: "SUBF", argLength: 2, reg: fp21, asm: "SUBF"}, // arg0 - arg1 + {name: "SUBD", argLength: 2, reg: fp21, asm: "SUBD"}, // arg0 - arg1 + {name: "MULF", argLength: 2, reg: fp21, asm: "MULF", commutative: true}, // arg0 * arg1 + {name: "MULD", argLength: 2, reg: fp21, asm: "MULD", commutative: true}, // arg0 * arg1 + {name: "DIVF", argLength: 2, reg: fp21, asm: "DIVF"}, // arg0 / arg1 + {name: "DIVD", argLength: 2, reg: fp21, asm: "DIVD"}, // arg0 / arg1 + + {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1 + {name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int64"}, // arg0 & auxInt + {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1 + {name: "ORconst", argLength: 1, reg: gp11, asm: "OR", aux: "Int64"}, // arg0 | auxInt + {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true, typ: "UInt64"}, // arg0 ^ arg1 + {name: "XORconst", argLength: 1, reg: gp11, asm: "XOR", aux: "Int64", typ: "UInt64"}, // arg0 ^ auxInt + {name: "NOR", argLength: 2, reg: gp21, asm: "NOR", commutative: true}, // ^(arg0 | arg1) + {name: "NORconst", argLength: 1, reg: gp11, asm: "NOR", aux: "Int64"}, // ^(arg0 | auxInt) + + {name: "NEGV", argLength: 1, reg: gp11}, // -arg0 + {name: "NEGF", argLength: 1, reg: fp11, asm: "NEGF"}, // -arg0, float32 + {name: "NEGD", argLength: 1, reg: fp11, asm: "NEGD"}, // -arg0, float64 + {name: "SQRTD", argLength: 1, reg: fp11, asm: "SQRTD"}, // sqrt(arg0), float64 + {name: "SQRTF", argLength: 1, reg: fp11, asm: "SQRTF"}, // sqrt(arg0), float32 + + // shifts + {name: "SLLV", argLength: 2, reg: gp21, asm: "SLLV"}, // arg0 << arg1, shift amount is mod 64 + {name: "SLLVconst", argLength: 1, reg: gp11, asm: "SLLV", aux: "Int64"}, // arg0 << auxInt + {name: "SRLV", argLength: 2, reg: gp21, asm: "SRLV"}, // arg0 >> arg1, unsigned, shift amount is mod 64 + {name: "SRLVconst", argLength: 1, reg: gp11, asm: "SRLV", aux: "Int64"}, // arg0 >> auxInt, unsigned + {name: "SRAV", argLength: 2, reg: gp21, asm: "SRAV"}, // arg0 >> arg1, signed, shift amount is mod 64 + {name: "SRAVconst", argLength: 1, reg: gp11, asm: "SRAV", aux: "Int64"}, // arg0 >> auxInt, signed + + // comparisons + {name: "SGT", argLength: 2, reg: gp21, asm: "SGT", typ: "Bool"}, // 1 if arg0 > arg1 (signed), 0 otherwise + {name: "SGTconst", argLength: 1, reg: gp11, asm: "SGT", aux: "Int64", typ: "Bool"}, // 1 if auxInt > arg0 (signed), 0 otherwise + {name: "SGTU", argLength: 2, reg: gp21, asm: "SGTU", typ: "Bool"}, // 1 if arg0 > arg1 (unsigned), 0 otherwise + {name: "SGTUconst", argLength: 1, reg: gp11, asm: "SGTU", aux: "Int64", typ: "Bool"}, // 1 if auxInt > arg0 (unsigned), 0 otherwise + + {name: "CMPEQF", argLength: 2, reg: fp2flags, asm: "CMPEQF", typ: "Flags"}, // flags=true if arg0 = arg1, float32 + {name: "CMPEQD", argLength: 2, reg: fp2flags, asm: "CMPEQD", typ: "Flags"}, // flags=true if arg0 = arg1, float64 + {name: "CMPGEF", argLength: 2, reg: fp2flags, asm: "CMPGEF", typ: "Flags"}, // flags=true if arg0 >= arg1, float32 + {name: "CMPGED", argLength: 2, reg: fp2flags, asm: "CMPGED", typ: "Flags"}, // flags=true if arg0 >= arg1, float64 + {name: "CMPGTF", argLength: 2, reg: fp2flags, asm: "CMPGTF", typ: "Flags"}, // flags=true if arg0 > arg1, float32 + {name: "CMPGTD", argLength: 2, reg: fp2flags, asm: "CMPGTD", typ: "Flags"}, // flags=true if arg0 > arg1, float64 + + // moves + {name: "MOVVconst", argLength: 0, reg: gp01, aux: "Int64", asm: "MOVV", typ: "UInt64", rematerializeable: true}, // auxint + {name: "MOVFconst", argLength: 0, reg: fp01, aux: "Float64", asm: "MOVF", typ: "Float32", rematerializeable: true}, // auxint as 64-bit float, convert to 32-bit float + {name: "MOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "MOVD", typ: "Float64", rematerializeable: true}, // auxint as 64-bit float + + {name: "MOVVaddr", argLength: 1, reg: regInfo{inputs: []regMask{buildReg("SP") | buildReg("SB")}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVV", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB + + {name: "MOVBload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVB", typ: "Int8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVBUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVBU", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVHload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVH", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVHUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVHU", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVWload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVW", typ: "Int32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVWUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVWU", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVVload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVV", typ: "UInt64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVFload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVF", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVDload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVD", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + + {name: "MOVBstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of arg1 to arg0 + auxInt + aux. arg2=mem. + {name: "MOVHstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of arg1 to arg0 + auxInt + aux. arg2=mem. + {name: "MOVWstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem. + {name: "MOVVstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVV", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem. + {name: "MOVFstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVF", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem. + {name: "MOVDstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem. + + {name: "MOVBstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of zero to arg0 + auxInt + aux. arg1=mem. + {name: "MOVHstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of zero to arg0 + auxInt + aux. arg1=mem. + {name: "MOVWstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of zero to arg0 + auxInt + aux. arg1=mem. + {name: "MOVVstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVV", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of zero to arg0 + auxInt + aux. ar12=mem. + + // conversions + {name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB"}, // move from arg0, sign-extended from byte + {name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, unsign-extended from byte + {name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVH"}, // move from arg0, sign-extended from half + {name: "MOVHUreg", argLength: 1, reg: gp11, asm: "MOVHU"}, // move from arg0, unsign-extended from half + {name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW"}, // move from arg0, sign-extended from word + {name: "MOVWUreg", argLength: 1, reg: gp11, asm: "MOVWU"}, // move from arg0, unsign-extended from word + {name: "MOVVreg", argLength: 1, reg: gp11, asm: "MOVV"}, // move from arg0 + + {name: "MOVVnop", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}, resultInArg0: true}, // nop, return arg0 in same register + + {name: "MOVWF", argLength: 1, reg: fp11, asm: "MOVWF"}, // int32 -> float32 + {name: "MOVWD", argLength: 1, reg: fp11, asm: "MOVWD"}, // int32 -> float64 + {name: "MOVVF", argLength: 1, reg: fp11, asm: "MOVVF"}, // int64 -> float32 + {name: "MOVVD", argLength: 1, reg: fp11, asm: "MOVVD"}, // int64 -> float64 + {name: "TRUNCFW", argLength: 1, reg: fp11, asm: "TRUNCFW"}, // float32 -> int32 + {name: "TRUNCDW", argLength: 1, reg: fp11, asm: "TRUNCDW"}, // float64 -> int32 + {name: "TRUNCFV", argLength: 1, reg: fp11, asm: "TRUNCFV"}, // float32 -> int64 + {name: "TRUNCDV", argLength: 1, reg: fp11, asm: "TRUNCDV"}, // float64 -> int64 + {name: "MOVFD", argLength: 1, reg: fp11, asm: "MOVFD"}, // float32 -> float64 + {name: "MOVDF", argLength: 1, reg: fp11, asm: "MOVDF"}, // float64 -> float32 + + // function calls + {name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem + {name: "CALLtail", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem + {name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gpsp, buildReg("R22"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem + {name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem + + // duffzero + // arg0 = address of memory to zero + // arg1 = mem + // auxint = offset into duffzero code to start executing + // returns mem + // R1 aka mips.REGRT1 changed as side effect + { + name: "DUFFZERO", + aux: "Int64", + argLength: 2, + reg: regInfo{ + inputs: []regMask{gp}, + clobbers: buildReg("R1 R31"), + }, + faultOnNilArg0: true, + }, + + // duffcopy + // arg0 = address of dst memory (in R2, changed as side effect) + // arg1 = address of src memory (in R1, changed as side effect) + // arg2 = mem + // auxint = offset into duffcopy code to start executing + // returns mem + { + name: "DUFFCOPY", + aux: "Int64", + argLength: 3, + reg: regInfo{ + inputs: []regMask{buildReg("R2"), buildReg("R1")}, + clobbers: buildReg("R1 R2 R31"), + }, + faultOnNilArg0: true, + faultOnNilArg1: true, + }, + + // large or unaligned zeroing + // arg0 = address of memory to zero (in R1, changed as side effect) + // arg1 = address of the last element to zero + // arg2 = mem + // auxint = alignment + // returns mem + // SUBV $8, R1 + // MOVV R0, 8(R1) + // ADDV $8, R1 + // BNE Rarg1, R1, -2(PC) + { + name: "LoweredZero", + aux: "Int64", + argLength: 3, + reg: regInfo{ + inputs: []regMask{buildReg("R1"), gp}, + clobbers: buildReg("R1"), + }, + clobberFlags: true, + faultOnNilArg0: true, + }, + + // large or unaligned move + // arg0 = address of dst memory (in R2, changed as side effect) + // arg1 = address of src memory (in R1, changed as side effect) + // arg2 = address of the last element of src + // arg3 = mem + // auxint = alignment + // returns mem + // SUBV $8, R1 + // MOVV 8(R1), Rtmp + // MOVV Rtmp, (R2) + // ADDV $8, R1 + // ADDV $8, R2 + // BNE Rarg2, R1, -4(PC) + { + name: "LoweredMove", + aux: "Int64", + argLength: 4, + reg: regInfo{ + inputs: []regMask{buildReg("R2"), buildReg("R1"), gp}, + clobbers: buildReg("R1 R2"), + }, + clobberFlags: true, + faultOnNilArg0: true, + faultOnNilArg1: true, + }, + + // atomic loads. + // load from arg0. arg1=mem. + // returns <value,memory> so they can be properly ordered with other loads. + {name: "LoweredAtomicLoad8", argLength: 2, reg: gpload, faultOnNilArg0: true}, + {name: "LoweredAtomicLoad32", argLength: 2, reg: gpload, faultOnNilArg0: true}, + {name: "LoweredAtomicLoad64", argLength: 2, reg: gpload, faultOnNilArg0: true}, + + // atomic stores. + // store arg1 to arg0. arg2=mem. returns memory. + {name: "LoweredAtomicStore8", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true}, + {name: "LoweredAtomicStore32", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true}, + {name: "LoweredAtomicStore64", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true}, + // store zero to arg0. arg1=mem. returns memory. + {name: "LoweredAtomicStorezero32", argLength: 2, reg: gpstore0, faultOnNilArg0: true, hasSideEffects: true}, + {name: "LoweredAtomicStorezero64", argLength: 2, reg: gpstore0, faultOnNilArg0: true, hasSideEffects: true}, + + // atomic exchange. + // store arg1 to arg0. arg2=mem. returns <old content of *arg0, memory>. + // SYNC + // LL (Rarg0), Rout + // MOVV Rarg1, Rtmp + // SC Rtmp, (Rarg0) + // BEQ Rtmp, -3(PC) + // SYNC + {name: "LoweredAtomicExchange32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + {name: "LoweredAtomicExchange64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + + // atomic add. + // *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>. + // SYNC + // LL (Rarg0), Rout + // ADDV Rarg1, Rout, Rtmp + // SC Rtmp, (Rarg0) + // BEQ Rtmp, -3(PC) + // SYNC + // ADDV Rarg1, Rout + {name: "LoweredAtomicAdd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + {name: "LoweredAtomicAdd64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + // *arg0 += auxint. arg1=mem. returns <new content of *arg0, memory>. auxint is 32-bit. + {name: "LoweredAtomicAddconst32", argLength: 2, reg: regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}, aux: "Int32", resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + {name: "LoweredAtomicAddconst64", argLength: 2, reg: regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}, aux: "Int64", resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + + // atomic compare and swap. + // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory. + // if *arg0 == arg1 { + // *arg0 = arg2 + // return (true, memory) + // } else { + // return (false, memory) + // } + // SYNC + // MOVV $0, Rout + // LL (Rarg0), Rtmp + // BNE Rtmp, Rarg1, 4(PC) + // MOVV Rarg2, Rout + // SC Rout, (Rarg0) + // BEQ Rout, -4(PC) + // SYNC + {name: "LoweredAtomicCas32", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + {name: "LoweredAtomicCas64", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + + // pseudo-ops + {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpg}}, nilCheck: true, faultOnNilArg0: true}, // panic if arg0 is nil. arg1=mem. + + {name: "FPFlagTrue", argLength: 1, reg: readflags}, // bool, true if FP flag is true + {name: "FPFlagFalse", argLength: 1, reg: readflags}, // bool, true if FP flag is false + + // Scheduler ensures LoweredGetClosurePtr occurs only in entry block, + // and sorts it to the very beginning of the block to prevent other + // use of R22 (mips.REGCTXT, the closure pointer) + {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("R22")}}, zeroWidth: true}, + + // LoweredGetCallerSP returns the SP of the caller of the current function. + {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true}, + + // LoweredGetCallerPC evaluates to the PC to which its "caller" will return. + // I.e., if f calls g "calls" getcallerpc, + // the result should be the PC within f that g will return to. + // See runtime/stubs.go for a more detailed discussion. + {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true}, + + // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier + // It saves all GP registers if necessary, + // but clobbers R31 (LR) because it's a call + // and R23 (REGTMP). + {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("R20"), buildReg("R21")}, clobbers: (callerSave &^ gpg) | buildReg("R31")}, clobberFlags: true, aux: "Sym", symEffect: "None"}, + + // There are three of these functions so that they can have three different register inputs. + // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the + // default registers to match so we don't need to copy registers around unnecessarily. + {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r3, r4}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go). + {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go). + {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go). + } + + blocks := []blockData{ + {name: "EQ", controls: 1}, + {name: "NE", controls: 1}, + {name: "LTZ", controls: 1}, // < 0 + {name: "LEZ", controls: 1}, // <= 0 + {name: "GTZ", controls: 1}, // > 0 + {name: "GEZ", controls: 1}, // >= 0 + {name: "FPT", controls: 1}, // FP flag is true + {name: "FPF", controls: 1}, // FP flag is false + } + + archs = append(archs, arch{ + name: "MIPS64", + pkg: "cmd/internal/obj/mips", + genfile: "../../mips64/ssa.go", + ops: ops, + blocks: blocks, + regnames: regNamesMIPS64, + gpregmask: gp, + fpregmask: fp, + specialregmask: hi | lo, + framepointerreg: -1, // not used + linkreg: int8(num["R31"]), + }) +} diff --git a/src/cmd/compile/internal/ssa/_gen/MIPSOps.go b/src/cmd/compile/internal/ssa/_gen/MIPSOps.go new file mode 100644 index 0000000..22a7a5c --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/MIPSOps.go @@ -0,0 +1,439 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import "strings" + +// Notes: +// - Integer types live in the low portion of registers. Upper portions are junk. +// - Boolean types use the low-order byte of a register. 0=false, 1=true. +// Upper bytes are junk. +// - Unused portions of AuxInt are filled by sign-extending the used portion. +// - *const instructions may use a constant larger than the instruction can encode. +// In this case the assembler expands to multiple instructions and uses tmp +// register (R23). + +// Suffixes encode the bit width of various instructions. +// W (word) = 32 bit +// H (half word) = 16 bit +// HU = 16 bit unsigned +// B (byte) = 8 bit +// BU = 8 bit unsigned +// F (float) = 32 bit float +// D (double) = 64 bit float + +// Note: registers not used in regalloc are not included in this list, +// so that regmask stays within int64 +// Be careful when hand coding regmasks. +var regNamesMIPS = []string{ + "R0", // constant 0 + "R1", + "R2", + "R3", + "R4", + "R5", + "R6", + "R7", + "R8", + "R9", + "R10", + "R11", + "R12", + "R13", + "R14", + "R15", + "R16", + "R17", + "R18", + "R19", + "R20", + "R21", + "R22", + //REGTMP + "R24", + "R25", + // R26 reserved by kernel + // R27 reserved by kernel + "R28", + "SP", // aka R29 + "g", // aka R30 + "R31", // REGLINK + + // odd FP registers contain high parts of 64-bit FP values + "F0", + "F2", + "F4", + "F6", + "F8", + "F10", + "F12", + "F14", + "F16", + "F18", + "F20", + "F22", + "F24", + "F26", + "F28", + "F30", + + "HI", // high bits of multiplication + "LO", // low bits of multiplication + + // If you add registers, update asyncPreempt in runtime. + + // pseudo-registers + "SB", +} + +func init() { + // Make map from reg names to reg integers. + if len(regNamesMIPS) > 64 { + panic("too many registers") + } + num := map[string]int{} + for i, name := range regNamesMIPS { + num[name] = i + } + buildReg := func(s string) regMask { + m := regMask(0) + for _, r := range strings.Split(s, " ") { + if n, ok := num[r]; ok { + m |= regMask(1) << uint(n) + continue + } + panic("register " + r + " not found") + } + return m + } + + // Common individual register masks + var ( + gp = buildReg("R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R22 R24 R25 R28 R31") + gpg = gp | buildReg("g") + gpsp = gp | buildReg("SP") + gpspg = gpg | buildReg("SP") + gpspsbg = gpspg | buildReg("SB") + fp = buildReg("F0 F2 F4 F6 F8 F10 F12 F14 F16 F18 F20 F22 F24 F26 F28 F30") + lo = buildReg("LO") + hi = buildReg("HI") + callerSave = gp | fp | lo | hi | buildReg("g") // runtime.setg (and anything calling it) may clobber g + r1 = buildReg("R1") + r2 = buildReg("R2") + r3 = buildReg("R3") + r4 = buildReg("R4") + r5 = buildReg("R5") + ) + // Common regInfo + var ( + gp01 = regInfo{inputs: nil, outputs: []regMask{gp}} + gp11 = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}} + gp11sp = regInfo{inputs: []regMask{gpspg}, outputs: []regMask{gp}} + gp21 = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp}} + gp31 = regInfo{inputs: []regMask{gp, gp, gp}, outputs: []regMask{gp}} + gp2hilo = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{hi, lo}} + gpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}} + gpstore = regInfo{inputs: []regMask{gpspsbg, gpg}} + gpxchg = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}} + gpcas = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}, outputs: []regMask{gp}} + gpstore0 = regInfo{inputs: []regMask{gpspsbg}} + fp01 = regInfo{inputs: nil, outputs: []regMask{fp}} + fp11 = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}} + fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}} + fp2flags = regInfo{inputs: []regMask{fp, fp}} + fpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{fp}} + fpstore = regInfo{inputs: []regMask{gpspsbg, fp}} + readflags = regInfo{inputs: nil, outputs: []regMask{gp}} + ) + ops := []opData{ + {name: "ADD", argLength: 2, reg: gp21, asm: "ADDU", commutative: true}, // arg0 + arg1 + {name: "ADDconst", argLength: 1, reg: gp11sp, asm: "ADDU", aux: "Int32"}, // arg0 + auxInt + {name: "SUB", argLength: 2, reg: gp21, asm: "SUBU"}, // arg0 - arg1 + {name: "SUBconst", argLength: 1, reg: gp11, asm: "SUBU", aux: "Int32"}, // arg0 - auxInt + {name: "MUL", argLength: 2, reg: regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp}, clobbers: hi | lo}, asm: "MUL", commutative: true}, // arg0 * arg1 + {name: "MULT", argLength: 2, reg: gp2hilo, asm: "MUL", commutative: true, typ: "(Int32,Int32)"}, // arg0 * arg1, signed, results hi,lo + {name: "MULTU", argLength: 2, reg: gp2hilo, asm: "MULU", commutative: true, typ: "(UInt32,UInt32)"}, // arg0 * arg1, unsigned, results hi,lo + {name: "DIV", argLength: 2, reg: gp2hilo, asm: "DIV", typ: "(Int32,Int32)"}, // arg0 / arg1, signed, results hi=arg0%arg1,lo=arg0/arg1 + {name: "DIVU", argLength: 2, reg: gp2hilo, asm: "DIVU", typ: "(UInt32,UInt32)"}, // arg0 / arg1, signed, results hi=arg0%arg1,lo=arg0/arg1 + + {name: "ADDF", argLength: 2, reg: fp21, asm: "ADDF", commutative: true}, // arg0 + arg1 + {name: "ADDD", argLength: 2, reg: fp21, asm: "ADDD", commutative: true}, // arg0 + arg1 + {name: "SUBF", argLength: 2, reg: fp21, asm: "SUBF"}, // arg0 - arg1 + {name: "SUBD", argLength: 2, reg: fp21, asm: "SUBD"}, // arg0 - arg1 + {name: "MULF", argLength: 2, reg: fp21, asm: "MULF", commutative: true}, // arg0 * arg1 + {name: "MULD", argLength: 2, reg: fp21, asm: "MULD", commutative: true}, // arg0 * arg1 + {name: "DIVF", argLength: 2, reg: fp21, asm: "DIVF"}, // arg0 / arg1 + {name: "DIVD", argLength: 2, reg: fp21, asm: "DIVD"}, // arg0 / arg1 + + {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1 + {name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int32"}, // arg0 & auxInt + {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1 + {name: "ORconst", argLength: 1, reg: gp11, asm: "OR", aux: "Int32"}, // arg0 | auxInt + {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true, typ: "UInt32"}, // arg0 ^ arg1 + {name: "XORconst", argLength: 1, reg: gp11, asm: "XOR", aux: "Int32", typ: "UInt32"}, // arg0 ^ auxInt + {name: "NOR", argLength: 2, reg: gp21, asm: "NOR", commutative: true}, // ^(arg0 | arg1) + {name: "NORconst", argLength: 1, reg: gp11, asm: "NOR", aux: "Int32"}, // ^(arg0 | auxInt) + + {name: "NEG", argLength: 1, reg: gp11}, // -arg0 + {name: "NEGF", argLength: 1, reg: fp11, asm: "NEGF"}, // -arg0, float32 + {name: "NEGD", argLength: 1, reg: fp11, asm: "NEGD"}, // -arg0, float64 + {name: "SQRTD", argLength: 1, reg: fp11, asm: "SQRTD"}, // sqrt(arg0), float64 + {name: "SQRTF", argLength: 1, reg: fp11, asm: "SQRTF"}, // sqrt(arg0), float32 + + // shifts + {name: "SLL", argLength: 2, reg: gp21, asm: "SLL"}, // arg0 << arg1, shift amount is mod 32 + {name: "SLLconst", argLength: 1, reg: gp11, asm: "SLL", aux: "Int32"}, // arg0 << auxInt, shift amount must be 0 through 31 inclusive + {name: "SRL", argLength: 2, reg: gp21, asm: "SRL"}, // arg0 >> arg1, unsigned, shift amount is mod 32 + {name: "SRLconst", argLength: 1, reg: gp11, asm: "SRL", aux: "Int32"}, // arg0 >> auxInt, shift amount must be 0 through 31 inclusive + {name: "SRA", argLength: 2, reg: gp21, asm: "SRA"}, // arg0 >> arg1, signed, shift amount is mod 32 + {name: "SRAconst", argLength: 1, reg: gp11, asm: "SRA", aux: "Int32"}, // arg0 >> auxInt, signed, shift amount must be 0 through 31 inclusive + + {name: "CLZ", argLength: 1, reg: gp11, asm: "CLZ"}, + + // comparisons + {name: "SGT", argLength: 2, reg: gp21, asm: "SGT", typ: "Bool"}, // 1 if arg0 > arg1 (signed), 0 otherwise + {name: "SGTconst", argLength: 1, reg: gp11, asm: "SGT", aux: "Int32", typ: "Bool"}, // 1 if auxInt > arg0 (signed), 0 otherwise + {name: "SGTzero", argLength: 1, reg: gp11, asm: "SGT", typ: "Bool"}, // 1 if arg0 > 0 (signed), 0 otherwise + {name: "SGTU", argLength: 2, reg: gp21, asm: "SGTU", typ: "Bool"}, // 1 if arg0 > arg1 (unsigned), 0 otherwise + {name: "SGTUconst", argLength: 1, reg: gp11, asm: "SGTU", aux: "Int32", typ: "Bool"}, // 1 if auxInt > arg0 (unsigned), 0 otherwise + {name: "SGTUzero", argLength: 1, reg: gp11, asm: "SGTU", typ: "Bool"}, // 1 if arg0 > 0 (unsigned), 0 otherwise + + {name: "CMPEQF", argLength: 2, reg: fp2flags, asm: "CMPEQF", typ: "Flags"}, // flags=true if arg0 = arg1, float32 + {name: "CMPEQD", argLength: 2, reg: fp2flags, asm: "CMPEQD", typ: "Flags"}, // flags=true if arg0 = arg1, float64 + {name: "CMPGEF", argLength: 2, reg: fp2flags, asm: "CMPGEF", typ: "Flags"}, // flags=true if arg0 >= arg1, float32 + {name: "CMPGED", argLength: 2, reg: fp2flags, asm: "CMPGED", typ: "Flags"}, // flags=true if arg0 >= arg1, float64 + {name: "CMPGTF", argLength: 2, reg: fp2flags, asm: "CMPGTF", typ: "Flags"}, // flags=true if arg0 > arg1, float32 + {name: "CMPGTD", argLength: 2, reg: fp2flags, asm: "CMPGTD", typ: "Flags"}, // flags=true if arg0 > arg1, float64 + + // moves + {name: "MOVWconst", argLength: 0, reg: gp01, aux: "Int32", asm: "MOVW", typ: "UInt32", rematerializeable: true}, // auxint + {name: "MOVFconst", argLength: 0, reg: fp01, aux: "Float32", asm: "MOVF", typ: "Float32", rematerializeable: true}, // auxint as 64-bit float, convert to 32-bit float + {name: "MOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "MOVD", typ: "Float64", rematerializeable: true}, // auxint as 64-bit float + + {name: "MOVWaddr", argLength: 1, reg: regInfo{inputs: []regMask{buildReg("SP") | buildReg("SB")}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVW", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB + + {name: "MOVBload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVB", typ: "Int8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVBUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVBU", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVHload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVH", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVHUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVHU", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVWload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVW", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVFload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVF", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + {name: "MOVDload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVD", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem. + + {name: "MOVBstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of arg1 to arg0 + auxInt + aux. arg2=mem. + {name: "MOVHstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of arg1 to arg0 + auxInt + aux. arg2=mem. + {name: "MOVWstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem. + {name: "MOVFstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVF", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem. + {name: "MOVDstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem. + + {name: "MOVBstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of zero to arg0 + auxInt + aux. arg1=mem. + {name: "MOVHstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of zero to arg0 + auxInt + aux. arg1=mem. + {name: "MOVWstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of zero to arg0 + auxInt + aux. arg1=mem. + + // conversions + {name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB"}, // move from arg0, sign-extended from byte + {name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, unsign-extended from byte + {name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVH"}, // move from arg0, sign-extended from half + {name: "MOVHUreg", argLength: 1, reg: gp11, asm: "MOVHU"}, // move from arg0, unsign-extended from half + {name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW"}, // move from arg0 + + {name: "MOVWnop", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}, resultInArg0: true}, // nop, return arg0 in same register + + // conditional move on zero (returns arg1 if arg2 is 0, otherwise arg0) + // order of parameters is reversed so we can use resultInArg0 (OpCMOVZ result arg1 arg2-> CMOVZ arg2reg, arg1reg, resultReg) + {name: "CMOVZ", argLength: 3, reg: gp31, asm: "CMOVZ", resultInArg0: true}, + {name: "CMOVZzero", argLength: 2, reg: regInfo{inputs: []regMask{gp, gpg}, outputs: []regMask{gp}}, asm: "CMOVZ", resultInArg0: true}, + + {name: "MOVWF", argLength: 1, reg: fp11, asm: "MOVWF"}, // int32 -> float32 + {name: "MOVWD", argLength: 1, reg: fp11, asm: "MOVWD"}, // int32 -> float64 + {name: "TRUNCFW", argLength: 1, reg: fp11, asm: "TRUNCFW"}, // float32 -> int32 + {name: "TRUNCDW", argLength: 1, reg: fp11, asm: "TRUNCDW"}, // float64 -> int32 + {name: "MOVFD", argLength: 1, reg: fp11, asm: "MOVFD"}, // float32 -> float64 + {name: "MOVDF", argLength: 1, reg: fp11, asm: "MOVDF"}, // float64 -> float32 + + // function calls + {name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem + {name: "CALLtail", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem + {name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gpsp, buildReg("R22"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem + {name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem + + // atomic ops + + // load from arg0. arg1=mem. + // returns <value,memory> so they can be properly ordered with other loads. + // SYNC + // MOV(B|W) (Rarg0), Rout + // SYNC + {name: "LoweredAtomicLoad8", argLength: 2, reg: gpload, faultOnNilArg0: true}, + {name: "LoweredAtomicLoad32", argLength: 2, reg: gpload, faultOnNilArg0: true}, + + // store arg1 to arg0. arg2=mem. returns memory. + // SYNC + // MOV(B|W) Rarg1, (Rarg0) + // SYNC + {name: "LoweredAtomicStore8", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true}, + {name: "LoweredAtomicStore32", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true}, + {name: "LoweredAtomicStorezero", argLength: 2, reg: gpstore0, faultOnNilArg0: true, hasSideEffects: true}, + + // atomic exchange. + // store arg1 to arg0. arg2=mem. returns <old content of *arg0, memory>. + // SYNC + // LL (Rarg0), Rout + // MOVW Rarg1, Rtmp + // SC Rtmp, (Rarg0) + // BEQ Rtmp, -3(PC) + // SYNC + {name: "LoweredAtomicExchange", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + + // atomic add. + // *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>. + // SYNC + // LL (Rarg0), Rout + // ADDU Rarg1, Rout, Rtmp + // SC Rtmp, (Rarg0) + // BEQ Rtmp, -3(PC) + // SYNC + // ADDU Rarg1, Rout + {name: "LoweredAtomicAdd", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + {name: "LoweredAtomicAddconst", argLength: 2, reg: regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}, aux: "Int32", resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + + // atomic compare and swap. + // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory. + // if *arg0 == arg1 { + // *arg0 = arg2 + // return (true, memory) + // } else { + // return (false, memory) + // } + // SYNC + // MOVW $0, Rout + // LL (Rarg0), Rtmp + // BNE Rtmp, Rarg1, 4(PC) + // MOVW Rarg2, Rout + // SC Rout, (Rarg0) + // BEQ Rout, -4(PC) + // SYNC + {name: "LoweredAtomicCas", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + + // atomic and/or. + // *arg0 &= (|=) arg1. arg2=mem. returns memory. + // SYNC + // LL (Rarg0), Rtmp + // AND Rarg1, Rtmp + // SC Rtmp, (Rarg0) + // BEQ Rtmp, -3(PC) + // SYNC + {name: "LoweredAtomicAnd", argLength: 3, reg: gpstore, asm: "AND", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + {name: "LoweredAtomicOr", argLength: 3, reg: gpstore, asm: "OR", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + + // large or unaligned zeroing + // arg0 = address of memory to zero (in R1, changed as side effect) + // arg1 = address of the last element to zero + // arg2 = mem + // auxint = alignment + // returns mem + // SUBU $4, R1 + // MOVW R0, 4(R1) + // ADDU $4, R1 + // BNE Rarg1, R1, -2(PC) + { + name: "LoweredZero", + aux: "Int32", + argLength: 3, + reg: regInfo{ + inputs: []regMask{buildReg("R1"), gp}, + clobbers: buildReg("R1"), + }, + faultOnNilArg0: true, + }, + + // large or unaligned move + // arg0 = address of dst memory (in R2, changed as side effect) + // arg1 = address of src memory (in R1, changed as side effect) + // arg2 = address of the last element of src + // arg3 = mem + // auxint = alignment + // returns mem + // SUBU $4, R1 + // MOVW 4(R1), Rtmp + // MOVW Rtmp, (R2) + // ADDU $4, R1 + // ADDU $4, R2 + // BNE Rarg2, R1, -4(PC) + { + name: "LoweredMove", + aux: "Int32", + argLength: 4, + reg: regInfo{ + inputs: []regMask{buildReg("R2"), buildReg("R1"), gp}, + clobbers: buildReg("R1 R2"), + }, + faultOnNilArg0: true, + faultOnNilArg1: true, + }, + + // pseudo-ops + {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpg}}, nilCheck: true, faultOnNilArg0: true}, // panic if arg0 is nil. arg1=mem. + + {name: "FPFlagTrue", argLength: 1, reg: readflags}, // bool, true if FP flag is true + {name: "FPFlagFalse", argLength: 1, reg: readflags}, // bool, true if FP flag is false + + // Scheduler ensures LoweredGetClosurePtr occurs only in entry block, + // and sorts it to the very beginning of the block to prevent other + // use of R22 (mips.REGCTXT, the closure pointer) + {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("R22")}}, zeroWidth: true}, + + // LoweredGetCallerSP returns the SP of the caller of the current function. + {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true}, + + // LoweredGetCallerPC evaluates to the PC to which its "caller" will return. + // I.e., if f calls g "calls" getcallerpc, + // the result should be the PC within f that g will return to. + // See runtime/stubs.go for a more detailed discussion. + {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true}, + + // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier + // It saves all GP registers if necessary, + // but clobbers R31 (LR) because it's a call + // and R23 (REGTMP). + {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("R20"), buildReg("R21")}, clobbers: (callerSave &^ gpg) | buildReg("R31")}, clobberFlags: true, aux: "Sym", symEffect: "None"}, + + // There are three of these functions so that they can have three different register inputs. + // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the + // default registers to match so we don't need to copy registers around unnecessarily. + {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r3, r4}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go). + {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go). + {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go). + // Extend ops are the same as Bounds ops except the indexes are 64-bit. + {name: "LoweredPanicExtendA", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r5, r3, r4}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go). + {name: "LoweredPanicExtendB", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r5, r2, r3}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go). + {name: "LoweredPanicExtendC", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r5, r1, r2}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go). + } + + blocks := []blockData{ + {name: "EQ", controls: 1}, + {name: "NE", controls: 1}, + {name: "LTZ", controls: 1}, // < 0 + {name: "LEZ", controls: 1}, // <= 0 + {name: "GTZ", controls: 1}, // > 0 + {name: "GEZ", controls: 1}, // >= 0 + {name: "FPT", controls: 1}, // FP flag is true + {name: "FPF", controls: 1}, // FP flag is false + } + + archs = append(archs, arch{ + name: "MIPS", + pkg: "cmd/internal/obj/mips", + genfile: "../../mips/ssa.go", + ops: ops, + blocks: blocks, + regnames: regNamesMIPS, + gpregmask: gp, + fpregmask: fp, + specialregmask: hi | lo, + framepointerreg: -1, // not used + linkreg: int8(num["R31"]), + }) +} diff --git a/src/cmd/compile/internal/ssa/_gen/PPC64.rules b/src/cmd/compile/internal/ssa/_gen/PPC64.rules new file mode 100644 index 0000000..3aea9ab --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/PPC64.rules @@ -0,0 +1,1274 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Lowering arithmetic +(Add(Ptr|64|32|16|8) ...) => (ADD ...) +(Add64F ...) => (FADD ...) +(Add32F ...) => (FADDS ...) + +(Sub(Ptr|64|32|16|8) ...) => (SUB ...) +(Sub32F ...) => (FSUBS ...) +(Sub64F ...) => (FSUB ...) + +// Combine 64 bit integer multiply and adds +(ADD l:(MULLD x y) z) && buildcfg.GOPPC64 >= 9 && l.Uses == 1 && clobber(l) => (MADDLD x y z) + +(Mod16 x y) => (Mod32 (SignExt16to32 x) (SignExt16to32 y)) +(Mod16u x y) => (Mod32u (ZeroExt16to32 x) (ZeroExt16to32 y)) +(Mod8 x y) => (Mod32 (SignExt8to32 x) (SignExt8to32 y)) +(Mod8u x y) => (Mod32u (ZeroExt8to32 x) (ZeroExt8to32 y)) +(Mod64 x y) && buildcfg.GOPPC64 >=9 => (MODSD x y) +(Mod64 x y) && buildcfg.GOPPC64 <=8 => (SUB x (MULLD y (DIVD x y))) +(Mod64u x y) && buildcfg.GOPPC64 >= 9 => (MODUD x y) +(Mod64u x y) && buildcfg.GOPPC64 <= 8 => (SUB x (MULLD y (DIVDU x y))) +(Mod32 x y) && buildcfg.GOPPC64 >= 9 => (MODSW x y) +(Mod32 x y) && buildcfg.GOPPC64 <= 8 => (SUB x (MULLW y (DIVW x y))) +(Mod32u x y) && buildcfg.GOPPC64 >= 9 => (MODUW x y) +(Mod32u x y) && buildcfg.GOPPC64 <= 8 => (SUB x (MULLW y (DIVWU x y))) + +// (x + y) / 2 with x>=y => (x - y) / 2 + y +(Avg64u <t> x y) => (ADD (SRDconst <t> (SUB <t> x y) [1]) y) + +(Mul64 ...) => (MULLD ...) +(Mul(32|16|8) ...) => (MULLW ...) +(Select0 (Mul64uhilo x y)) => (MULHDU x y) +(Select1 (Mul64uhilo x y)) => (MULLD x y) + +(Div64 [false] x y) => (DIVD x y) +(Div64u ...) => (DIVDU ...) +(Div32 [false] x y) => (DIVW x y) +(Div32u ...) => (DIVWU ...) +(Div16 [false] x y) => (DIVW (SignExt16to32 x) (SignExt16to32 y)) +(Div16u x y) => (DIVWU (ZeroExt16to32 x) (ZeroExt16to32 y)) +(Div8 x y) => (DIVW (SignExt8to32 x) (SignExt8to32 y)) +(Div8u x y) => (DIVWU (ZeroExt8to32 x) (ZeroExt8to32 y)) + +(Hmul(64|64u|32|32u) ...) => (MULH(D|DU|W|WU) ...) + +(Mul(32|64)F ...) => ((FMULS|FMUL) ...) + +(Div(32|64)F ...) => ((FDIVS|FDIV) ...) + +// Lowering float <=> int +(Cvt32to(32|64)F x) => ((FCFIDS|FCFID) (MTVSRD (SignExt32to64 x))) +(Cvt64to(32|64)F x) => ((FCFIDS|FCFID) (MTVSRD x)) + +(Cvt32Fto(32|64) x) => (MFVSRD (FCTI(W|D)Z x)) +(Cvt64Fto(32|64) x) => (MFVSRD (FCTI(W|D)Z x)) + +(Cvt32Fto64F ...) => (Copy ...) // Note v will have the wrong type for patterns dependent on Float32/Float64 +(Cvt64Fto32F ...) => (FRSP ...) + +(CvtBoolToUint8 ...) => (Copy ...) + +(Round(32|64)F ...) => (LoweredRound(32|64)F ...) + +(Sqrt ...) => (FSQRT ...) +(Sqrt32 ...) => (FSQRTS ...) +(Floor ...) => (FFLOOR ...) +(Ceil ...) => (FCEIL ...) +(Trunc ...) => (FTRUNC ...) +(Round ...) => (FROUND ...) +(Copysign x y) => (FCPSGN y x) +(Abs ...) => (FABS ...) +(FMA ...) => (FMADD ...) + +// Lowering extension +// Note: we always extend to 64 bits even though some ops don't need that many result bits. +(SignExt8to(16|32|64) ...) => (MOVBreg ...) +(SignExt16to(32|64) ...) => (MOVHreg ...) +(SignExt32to64 ...) => (MOVWreg ...) + +(ZeroExt8to(16|32|64) ...) => (MOVBZreg ...) +(ZeroExt16to(32|64) ...) => (MOVHZreg ...) +(ZeroExt32to64 ...) => (MOVWZreg ...) + +(Trunc(16|32|64)to8 <t> x) && isSigned(t) => (MOVBreg x) +(Trunc(16|32|64)to8 x) => (MOVBZreg x) +(Trunc(32|64)to16 <t> x) && isSigned(t) => (MOVHreg x) +(Trunc(32|64)to16 x) => (MOVHZreg x) +(Trunc64to32 <t> x) && isSigned(t) => (MOVWreg x) +(Trunc64to32 x) => (MOVWZreg x) + +// Lowering constants +(Const(64|32|16|8) [val]) => (MOVDconst [int64(val)]) +(Const(32|64)F ...) => (FMOV(S|D)const ...) +(ConstNil) => (MOVDconst [0]) +(ConstBool [t]) => (MOVDconst [b2i(t)]) + +// Carrying addition. +(Select0 (Add64carry x y c)) => (Select0 <typ.UInt64> (ADDE x y (Select1 <typ.UInt64> (ADDCconst c [-1])))) +(Select1 (Add64carry x y c)) => (ADDZEzero (Select1 <typ.UInt64> (ADDE x y (Select1 <typ.UInt64> (ADDCconst c [-1]))))) +// Fold initial carry bit if 0. +(ADDE x y (Select1 <typ.UInt64> (ADDCconst (MOVDconst [0]) [-1]))) => (ADDC x y) +// Fold transfer of CA -> GPR -> CA. Note 2 uses when feeding into a chained Add64carry. +(Select1 (ADDCconst n:(ADDZEzero x) [-1])) && n.Uses <= 2 => x + +// Borrowing subtraction. +(Select0 (Sub64borrow x y c)) => (Select0 <typ.UInt64> (SUBE x y (Select1 <typ.UInt64> (SUBCconst c [0])))) +(Select1 (Sub64borrow x y c)) => (NEG (SUBZEzero (Select1 <typ.UInt64> (SUBE x y (Select1 <typ.UInt64> (SUBCconst c [0])))))) +// Fold initial borrow bit if 0. +(SUBE x y (Select1 <typ.UInt64> (SUBCconst (MOVDconst [0]) [0]))) => (SUBC x y) +// Fold transfer of CA -> GPR -> CA. Note 2 uses when feeding into a chained Sub64borrow. +(Select1 (SUBCconst n:(NEG (SUBZEzero x)) [0])) && n.Uses <= 2 => x + +// Constant folding +(FABS (FMOVDconst [x])) => (FMOVDconst [math.Abs(x)]) +(FSQRT (FMOVDconst [x])) && x >= 0 => (FMOVDconst [math.Sqrt(x)]) +(FFLOOR (FMOVDconst [x])) => (FMOVDconst [math.Floor(x)]) +(FCEIL (FMOVDconst [x])) => (FMOVDconst [math.Ceil(x)]) +(FTRUNC (FMOVDconst [x])) => (FMOVDconst [math.Trunc(x)]) + +// Rotates +(RotateLeft8 <t> x (MOVDconst [c])) => (Or8 (Lsh8x64 <t> x (MOVDconst [c&7])) (Rsh8Ux64 <t> x (MOVDconst [-c&7]))) +(RotateLeft16 <t> x (MOVDconst [c])) => (Or16 (Lsh16x64 <t> x (MOVDconst [c&15])) (Rsh16Ux64 <t> x (MOVDconst [-c&15]))) +(RotateLeft(32|64) ...) => ((ROTLW|ROTL) ...) + +// Constant rotate generation +(ROTLW x (MOVDconst [c])) => (ROTLWconst x [c&31]) +(ROTL x (MOVDconst [c])) => (ROTLconst x [c&63]) + +// Combine rotate and mask operations +(Select0 (ANDCCconst [m] (ROTLWconst [r] x))) && isPPC64WordRotateMask(m) => (RLWINM [encodePPC64RotateMask(r,m,32)] x) +(AND (MOVDconst [m]) (ROTLWconst [r] x)) && isPPC64WordRotateMask(m) => (RLWINM [encodePPC64RotateMask(r,m,32)] x) +(Select0 (ANDCCconst [m] (ROTLW x r))) && isPPC64WordRotateMask(m) => (RLWNM [encodePPC64RotateMask(0,m,32)] x r) +(AND (MOVDconst [m]) (ROTLW x r)) && isPPC64WordRotateMask(m) => (RLWNM [encodePPC64RotateMask(0,m,32)] x r) + +// Note, any rotated word bitmask is still a valid word bitmask. +(ROTLWconst [r] (AND (MOVDconst [m]) x)) && isPPC64WordRotateMask(m) => (RLWINM [encodePPC64RotateMask(r,rotateLeft32(m,r),32)] x) +(ROTLWconst [r] (Select0 (ANDCCconst [m] x))) && isPPC64WordRotateMask(m) => (RLWINM [encodePPC64RotateMask(r,rotateLeft32(m,r),32)] x) + +(Select0 (ANDCCconst [m] (SRWconst x [s]))) && mergePPC64RShiftMask(m,s,32) == 0 => (MOVDconst [0]) +(Select0 (ANDCCconst [m] (SRWconst x [s]))) && mergePPC64AndSrwi(m,s) != 0 => (RLWINM [mergePPC64AndSrwi(m,s)] x) +(AND (MOVDconst [m]) (SRWconst x [s])) && mergePPC64RShiftMask(m,s,32) == 0 => (MOVDconst [0]) +(AND (MOVDconst [m]) (SRWconst x [s])) && mergePPC64AndSrwi(m,s) != 0 => (RLWINM [mergePPC64AndSrwi(m,s)] x) + +(SRWconst (Select0 (ANDCCconst [m] x)) [s]) && mergePPC64RShiftMask(m>>uint(s),s,32) == 0 => (MOVDconst [0]) +(SRWconst (Select0 (ANDCCconst [m] x)) [s]) && mergePPC64AndSrwi(m>>uint(s),s) != 0 => (RLWINM [mergePPC64AndSrwi(m>>uint(s),s)] x) +(SRWconst (AND (MOVDconst [m]) x) [s]) && mergePPC64RShiftMask(m>>uint(s),s,32) == 0 => (MOVDconst [0]) +(SRWconst (AND (MOVDconst [m]) x) [s]) && mergePPC64AndSrwi(m>>uint(s),s) != 0 => (RLWINM [mergePPC64AndSrwi(m>>uint(s),s)] x) + +// Merge shift right + shift left and clear left (e.g for a table lookup) +(CLRLSLDI [c] (SRWconst [s] x)) && mergePPC64ClrlsldiSrw(int64(c),s) != 0 => (RLWINM [mergePPC64ClrlsldiSrw(int64(c),s)] x) +(SLDconst [l] (SRWconst [r] x)) && mergePPC64SldiSrw(l,r) != 0 => (RLWINM [mergePPC64SldiSrw(l,r)] x) +// The following reduction shows up frequently too. e.g b[(x>>14)&0xFF] +(CLRLSLDI [c] i:(RLWINM [s] x)) && mergePPC64ClrlsldiRlwinm(c,s) != 0 => (RLWINM [mergePPC64ClrlsldiRlwinm(c,s)] x) + +// large constant signed right shift, we leave the sign bit +(Rsh64x64 x (MOVDconst [c])) && uint64(c) >= 64 => (SRADconst x [63]) +(Rsh32x64 x (MOVDconst [c])) && uint64(c) >= 32 => (SRAWconst x [63]) +(Rsh16x64 x (MOVDconst [c])) && uint64(c) >= 16 => (SRAWconst (SignExt16to32 x) [63]) +(Rsh8x64 x (MOVDconst [c])) && uint64(c) >= 8 => (SRAWconst (SignExt8to32 x) [63]) + +// constant shifts +((Lsh64|Rsh64|Rsh64U)x64 x (MOVDconst [c])) && uint64(c) < 64 => (S(L|RA|R)Dconst x [c]) +((Lsh32|Rsh32|Rsh32U)x64 x (MOVDconst [c])) && uint64(c) < 32 => (S(L|RA|R)Wconst x [c]) +((Rsh16|Rsh16U)x64 x (MOVDconst [c])) && uint64(c) < 16 => (SR(AW|W)const ((Sign|Zero)Ext16to32 x) [c]) +(Lsh16x64 x (MOVDconst [c])) && uint64(c) < 16 => (SLWconst x [c]) +((Rsh8|Rsh8U)x64 x (MOVDconst [c])) && uint64(c) < 8 => (SR(AW|W)const ((Sign|Zero)Ext8to32 x) [c]) +(Lsh8x64 x (MOVDconst [c])) && uint64(c) < 8 => (SLWconst x [c]) + +// Lower bounded shifts first. No need to check shift value. +(Lsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SLD x y) +(Lsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y) +(Lsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y) +(Lsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y) +(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRD x y) +(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW x y) +(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW (MOVHZreg x) y) +(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW (MOVBZreg x) y) +(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAD x y) +(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW x y) +(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW (MOVHreg x) y) +(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW (MOVBreg x) y) + +// non-constant rotates +// If shift > 64 then use -1 as shift count to shift all bits. +((Lsh64|Rsh64|Rsh64U)x64 x y) => (S(L|RA|R)D x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [64])))) +((Rsh32|Rsh32U|Lsh32)x64 x y) => (S(RA|R|L)W x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [32])))) + +(Rsh(16|16U)x64 x y) => (SR(AW|W) ((Sign|Zero)Ext16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16])))) +(Lsh16x64 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16])))) + +(Rsh(8|8U)x64 x y) => (SR(AW|W) ((Sign|Zero)Ext8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8])))) +(Lsh8x64 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8])))) + +((Rsh64|Rsh64U|Lsh64)x32 x y) => (S(RA|R|L)D x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [64])))) +((Rsh32|Rsh32U|Lsh32)x32 x y) => (S(RA|R|L)W x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [32])))) + +(Rsh(16|16U)x32 x y) => (SR(AW|W) ((Sign|Zero)Ext16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16])))) +(Lsh16x32 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16])))) + +(Rsh(8|8U)x32 x y) => (SR(AW|W) ((Sign|Zero)Ext8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8])))) +(Lsh8x32 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8])))) + +((Rsh64|Rsh64U|Lsh64)x16 x y) => (S(RA|R|L)D x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [64])))) + +((Rsh32|Rsh32U|Lsh32)x16 x y) => (S(RA|R|L)W x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [32])))) + +(Rsh(16|16U)x16 x y) => (S(RA|R)W ((Sign|Zero)Ext16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [16])))) +(Lsh16x16 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [16])))) + +(Rsh(8|8U)x16 x y) => (SR(AW|W) ((Sign|Zero)Ext8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [8])))) +(Lsh8x16 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [8])))) + + +((Rsh64|Rsh64U|Lsh64)x8 x y) => (S(RA|R|L)D x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [64])))) + +((Rsh32|Rsh32U|Lsh32)x8 x y) => (S(RA|R|L)W x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [32])))) + +(Rsh(16|16U)x8 x y) => (S(RA|R)W ((Sign|Zero)Ext16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [16])))) +(Lsh16x8 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [16])))) + +(Rsh(8|8U)x8 x y) => (S(RA|R)W ((Sign|Zero)Ext8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [8])))) +(Lsh8x8 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [8])))) + +// Cleaning up shift ops +(ISEL [0] (Select0 (ANDCCconst [d] y)) (MOVDconst [-1]) (CMPU (Select0 (ANDCCconst [d] y)) (MOVDconst [c]))) && c >= d => (Select0 (ANDCCconst [d] y)) +(ISEL [0] (Select0 (ANDCCconst [d] y)) (MOVDconst [-1]) (CMPUconst [c] (Select0 (ANDCCconst [d] y)))) && c >= d => (Select0 (ANDCCconst [d] y)) +(ORN x (MOVDconst [-1])) => x + +(S(RAD|RD|LD) x (MOVDconst [c])) => (S(RAD|RD|LD)const [c&63 | (c>>6&1*63)] x) +(S(RAW|RW|LW) x (MOVDconst [c])) => (S(RAW|RW|LW)const [c&31 | (c>>5&1*31)] x) + +(Addr {sym} base) => (MOVDaddr {sym} [0] base) +(LocalAddr {sym} base _) => (MOVDaddr {sym} base) +(OffPtr [off] ptr) => (ADD (MOVDconst <typ.Int64> [off]) ptr) + +// TODO: optimize these cases? +(Ctz32NonZero ...) => (Ctz32 ...) +(Ctz64NonZero ...) => (Ctz64 ...) + +(Ctz64 x) && buildcfg.GOPPC64<=8 => (POPCNTD (ANDN <typ.Int64> (ADDconst <typ.Int64> [-1] x) x)) +(Ctz64 x) => (CNTTZD x) +(Ctz32 x) && buildcfg.GOPPC64<=8 => (POPCNTW (MOVWZreg (ANDN <typ.Int> (ADDconst <typ.Int> [-1] x) x))) +(Ctz32 x) => (CNTTZW (MOVWZreg x)) +(Ctz16 x) => (POPCNTW (MOVHZreg (ANDN <typ.Int16> (ADDconst <typ.Int16> [-1] x) x))) +(Ctz8 x) => (POPCNTB (MOVBZreg (ANDN <typ.UInt8> (ADDconst <typ.UInt8> [-1] x) x))) + +(BitLen64 x) => (SUBFCconst [64] (CNTLZD <typ.Int> x)) +(BitLen32 x) => (SUBFCconst [32] (CNTLZW <typ.Int> x)) + +(PopCount64 ...) => (POPCNTD ...) +(PopCount(32|16|8) x) => (POPCNT(W|W|B) (MOV(W|H|B)Zreg x)) + +(And(64|32|16|8) ...) => (AND ...) +(Or(64|32|16|8) ...) => (OR ...) +(Xor(64|32|16|8) ...) => (XOR ...) + +(Neg(64|32|16|8) ...) => (NEG ...) +(Neg(64|32)F ...) => (FNEG ...) + +(Com(64|32|16|8) x) => (NOR x x) + +// Lowering boolean ops +(AndB ...) => (AND ...) +(OrB ...) => (OR ...) +(Not x) => (XORconst [1] x) + +// Merge logical operations +(AND x (NOR y y)) => (ANDN x y) +(OR x (NOR y y)) => (ORN x y) + +// Lowering comparisons +(EqB x y) => (Select0 <typ.Int> (ANDCCconst [1] (EQV x y))) +// Sign extension dependence on operand sign sets up for sign/zero-extension elision later +(Eq(8|16) x y) && isSigned(x.Type) && isSigned(y.Type) => (Equal (CMPW (SignExt(8|16)to32 x) (SignExt(8|16)to32 y))) +(Eq(8|16) x y) => (Equal (CMPW (ZeroExt(8|16)to32 x) (ZeroExt(8|16)to32 y))) +(Eq(32|64|Ptr) x y) => (Equal ((CMPW|CMP|CMP) x y)) +(Eq(32|64)F x y) => (Equal (FCMPU x y)) + +(NeqB ...) => (XOR ...) +// Like Eq8 and Eq16, prefer sign extension likely to enable later elision. +(Neq(8|16) x y) && isSigned(x.Type) && isSigned(y.Type) => (NotEqual (CMPW (SignExt(8|16)to32 x) (SignExt(8|16)to32 y))) +(Neq(8|16) x y) => (NotEqual (CMPW (ZeroExt(8|16)to32 x) (ZeroExt(8|16)to32 y))) +(Neq(32|64|Ptr) x y) => (NotEqual ((CMPW|CMP|CMP) x y)) +(Neq(32|64)F x y) => (NotEqual (FCMPU x y)) + +(Less(8|16) x y) => (LessThan (CMPW (SignExt(8|16)to32 x) (SignExt(8|16)to32 y))) +(Less(32|64) x y) => (LessThan ((CMPW|CMP) x y)) +(Less(32|64)F x y) => (FLessThan (FCMPU x y)) + +(Less(8|16)U x y) => (LessThan (CMPWU (ZeroExt(8|16)to32 x) (ZeroExt(8|16)to32 y))) +(Less(32|64)U x y) => (LessThan ((CMPWU|CMPU) x y)) + +(Leq(8|16) x y) => (LessEqual (CMPW (SignExt(8|16)to32 x) (SignExt(8|16)to32 y))) +(Leq(32|64) x y) => (LessEqual ((CMPW|CMP) x y)) +(Leq(32|64)F x y) => (FLessEqual (FCMPU x y)) + +(Leq(8|16)U x y) => (LessEqual (CMPWU (ZeroExt(8|16)to32 x) (ZeroExt(8|16)to32 y))) +(Leq(32|64)U x y) => (LessEqual (CMP(WU|U) x y)) + +// Absorb pseudo-ops into blocks. +(If (Equal cc) yes no) => (EQ cc yes no) +(If (NotEqual cc) yes no) => (NE cc yes no) +(If (LessThan cc) yes no) => (LT cc yes no) +(If (LessEqual cc) yes no) => (LE cc yes no) +(If (GreaterThan cc) yes no) => (GT cc yes no) +(If (GreaterEqual cc) yes no) => (GE cc yes no) +(If (FLessThan cc) yes no) => (FLT cc yes no) +(If (FLessEqual cc) yes no) => (FLE cc yes no) +(If (FGreaterThan cc) yes no) => (FGT cc yes no) +(If (FGreaterEqual cc) yes no) => (FGE cc yes no) + +(If cond yes no) => (NE (CMPWconst [0] (Select0 <typ.UInt32> (ANDCCconst [1] cond))) yes no) + +// Absorb boolean tests into block +(NE (CMPWconst [0] (Select0 (ANDCCconst [1] ((Equal|NotEqual|LessThan|LessEqual|GreaterThan|GreaterEqual) cc)))) yes no) => ((EQ|NE|LT|LE|GT|GE) cc yes no) +(NE (CMPWconst [0] (Select0 (ANDCCconst [1] ((FLessThan|FLessEqual|FGreaterThan|FGreaterEqual) cc)))) yes no) => ((FLT|FLE|FGT|FGE) cc yes no) + +// Elide compares of bit tests +((EQ|NE) (CMPconst [0] (Select0 (ANDCCconst [c] x))) yes no) => ((EQ|NE) (Select1 <types.TypeFlags> (ANDCCconst [c] x)) yes no) +((EQ|NE) (CMPWconst [0] (Select0 (ANDCCconst [c] x))) yes no) => ((EQ|NE) (Select1 <types.TypeFlags> (ANDCCconst [c] x)) yes no) + +// absorb flag constants into branches +(EQ (FlagEQ) yes no) => (First yes no) +(EQ (FlagLT) yes no) => (First no yes) +(EQ (FlagGT) yes no) => (First no yes) + +(NE (FlagEQ) yes no) => (First no yes) +(NE (FlagLT) yes no) => (First yes no) +(NE (FlagGT) yes no) => (First yes no) + +(LT (FlagEQ) yes no) => (First no yes) +(LT (FlagLT) yes no) => (First yes no) +(LT (FlagGT) yes no) => (First no yes) + +(LE (FlagEQ) yes no) => (First yes no) +(LE (FlagLT) yes no) => (First yes no) +(LE (FlagGT) yes no) => (First no yes) + +(GT (FlagEQ) yes no) => (First no yes) +(GT (FlagLT) yes no) => (First no yes) +(GT (FlagGT) yes no) => (First yes no) + +(GE (FlagEQ) yes no) => (First yes no) +(GE (FlagLT) yes no) => (First no yes) +(GE (FlagGT) yes no) => (First yes no) + +// absorb InvertFlags into branches +(LT (InvertFlags cmp) yes no) => (GT cmp yes no) +(GT (InvertFlags cmp) yes no) => (LT cmp yes no) +(LE (InvertFlags cmp) yes no) => (GE cmp yes no) +(GE (InvertFlags cmp) yes no) => (LE cmp yes no) +(EQ (InvertFlags cmp) yes no) => (EQ cmp yes no) +(NE (InvertFlags cmp) yes no) => (NE cmp yes no) + +// constant comparisons +(CMPWconst (MOVDconst [x]) [y]) && int32(x)==int32(y) => (FlagEQ) +(CMPWconst (MOVDconst [x]) [y]) && int32(x)<int32(y) => (FlagLT) +(CMPWconst (MOVDconst [x]) [y]) && int32(x)>int32(y) => (FlagGT) + +(CMPconst (MOVDconst [x]) [y]) && x==y => (FlagEQ) +(CMPconst (MOVDconst [x]) [y]) && x<y => (FlagLT) +(CMPconst (MOVDconst [x]) [y]) && x>y => (FlagGT) + +(CMPWUconst (MOVDconst [x]) [y]) && int32(x)==int32(y) => (FlagEQ) +(CMPWUconst (MOVDconst [x]) [y]) && uint32(x)<uint32(y) => (FlagLT) +(CMPWUconst (MOVDconst [x]) [y]) && uint32(x)>uint32(y) => (FlagGT) + +(CMPUconst (MOVDconst [x]) [y]) && x==y => (FlagEQ) +(CMPUconst (MOVDconst [x]) [y]) && uint64(x)<uint64(y) => (FlagLT) +(CMPUconst (MOVDconst [x]) [y]) && uint64(x)>uint64(y) => (FlagGT) + +// absorb flag constants into boolean values +(Equal (FlagEQ)) => (MOVDconst [1]) +(Equal (FlagLT)) => (MOVDconst [0]) +(Equal (FlagGT)) => (MOVDconst [0]) + +(NotEqual (FlagEQ)) => (MOVDconst [0]) +(NotEqual (FlagLT)) => (MOVDconst [1]) +(NotEqual (FlagGT)) => (MOVDconst [1]) + +(LessThan (FlagEQ)) => (MOVDconst [0]) +(LessThan (FlagLT)) => (MOVDconst [1]) +(LessThan (FlagGT)) => (MOVDconst [0]) + +(LessEqual (FlagEQ)) => (MOVDconst [1]) +(LessEqual (FlagLT)) => (MOVDconst [1]) +(LessEqual (FlagGT)) => (MOVDconst [0]) + +(GreaterThan (FlagEQ)) => (MOVDconst [0]) +(GreaterThan (FlagLT)) => (MOVDconst [0]) +(GreaterThan (FlagGT)) => (MOVDconst [1]) + +(GreaterEqual (FlagEQ)) => (MOVDconst [1]) +(GreaterEqual (FlagLT)) => (MOVDconst [0]) +(GreaterEqual (FlagGT)) => (MOVDconst [1]) + +// absorb InvertFlags into boolean values +((Equal|NotEqual|LessThan|GreaterThan|LessEqual|GreaterEqual) (InvertFlags x)) => ((Equal|NotEqual|GreaterThan|LessThan|GreaterEqual|LessEqual) x) + + +// Elide compares of bit tests +((EQ|NE|LT|LE|GT|GE) (CMPconst [0] (Select0 (ANDCCconst [c] x))) yes no) => ((EQ|NE|LT|LE|GT|GE) (Select1 <types.TypeFlags> (ANDCCconst [c] x)) yes no) +((EQ|NE|LT|LE|GT|GE) (CMPWconst [0] (Select0 (ANDCCconst [c] x))) yes no) => ((EQ|NE|LT|LE|GT|GE) (Select1 <types.TypeFlags> (ANDCCconst [c] x)) yes no) +((EQ|NE|LT|LE|GT|GE) (CMPconst [0] z:(AND x y)) yes no) && z.Uses == 1 => ((EQ|NE|LT|LE|GT|GE) (Select1 <types.TypeFlags> (ANDCC x y)) yes no) +((EQ|NE|LT|LE|GT|GE) (CMPconst [0] z:(OR x y)) yes no) && z.Uses == 1 => ((EQ|NE|LT|LE|GT|GE) (Select1 <types.TypeFlags> (ORCC x y)) yes no) +((EQ|NE|LT|LE|GT|GE) (CMPconst [0] z:(XOR x y)) yes no) && z.Uses == 1 => ((EQ|NE|LT|LE|GT|GE) (Select1 <types.TypeFlags> (XORCC x y)) yes no) + +// Only lower after bool is lowered. It should always lower. This helps ensure the folding below happens reliably. +(CondSelect x y bool) && flagArg(bool) == nil => (ISEL [6] x y (Select1 <types.TypeFlags> (ANDCCconst [1] bool))) +// Fold any CR -> GPR -> CR transfers when applying the above rule. +(ISEL [6] x y (Select1 (ANDCCconst [1] (ISELB [c] one cmp)))) => (ISEL [c] x y cmp) + +// Lowering loads +(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVDload ptr mem) +(Load <t> ptr mem) && is32BitInt(t) && isSigned(t) => (MOVWload ptr mem) +(Load <t> ptr mem) && is32BitInt(t) && !isSigned(t) => (MOVWZload ptr mem) +(Load <t> ptr mem) && is16BitInt(t) && isSigned(t) => (MOVHload ptr mem) +(Load <t> ptr mem) && is16BitInt(t) && !isSigned(t) => (MOVHZload ptr mem) +(Load <t> ptr mem) && t.IsBoolean() => (MOVBZload ptr mem) +(Load <t> ptr mem) && is8BitInt(t) && isSigned(t) => (MOVBreg (MOVBZload ptr mem)) // PPC has no signed-byte load. +(Load <t> ptr mem) && is8BitInt(t) && !isSigned(t) => (MOVBZload ptr mem) + +(Load <t> ptr mem) && is32BitFloat(t) => (FMOVSload ptr mem) +(Load <t> ptr mem) && is64BitFloat(t) => (FMOVDload ptr mem) + +(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (FMOVDstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 8 && is32BitFloat(val.Type) => (FMOVDstore ptr val mem) // glitch from (Cvt32Fto64F x) => x -- type is wrong +(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (FMOVSstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 8 && !is64BitFloat(val.Type) => (MOVDstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 4 && is32BitInt(val.Type) => (MOVWstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem) + +// Using Zero instead of LoweredZero allows the +// target address to be folded where possible. +(Zero [0] _ mem) => mem +(Zero [1] destptr mem) => (MOVBstorezero destptr mem) +(Zero [2] destptr mem) => + (MOVHstorezero destptr mem) +(Zero [3] destptr mem) => + (MOVBstorezero [2] destptr + (MOVHstorezero destptr mem)) +(Zero [4] destptr mem) => + (MOVWstorezero destptr mem) +(Zero [5] destptr mem) => + (MOVBstorezero [4] destptr + (MOVWstorezero destptr mem)) +(Zero [6] destptr mem) => + (MOVHstorezero [4] destptr + (MOVWstorezero destptr mem)) +(Zero [7] destptr mem) => + (MOVBstorezero [6] destptr + (MOVHstorezero [4] destptr + (MOVWstorezero destptr mem))) + +(Zero [8] {t} destptr mem) => (MOVDstorezero destptr mem) +(Zero [12] {t} destptr mem) => + (MOVWstorezero [8] destptr + (MOVDstorezero [0] destptr mem)) +(Zero [16] {t} destptr mem) => + (MOVDstorezero [8] destptr + (MOVDstorezero [0] destptr mem)) +(Zero [24] {t} destptr mem) => + (MOVDstorezero [16] destptr + (MOVDstorezero [8] destptr + (MOVDstorezero [0] destptr mem))) +(Zero [32] {t} destptr mem) => + (MOVDstorezero [24] destptr + (MOVDstorezero [16] destptr + (MOVDstorezero [8] destptr + (MOVDstorezero [0] destptr mem)))) + +// Handle cases not handled above +// Lowered Short cases do not generate loops, and as a result don't clobber +// the address registers or flags. +(Zero [s] ptr mem) && buildcfg.GOPPC64 <= 8 && s < 64 => (LoweredZeroShort [s] ptr mem) +(Zero [s] ptr mem) && buildcfg.GOPPC64 <= 8 => (LoweredZero [s] ptr mem) +(Zero [s] ptr mem) && s < 128 && buildcfg.GOPPC64 >= 9 => (LoweredQuadZeroShort [s] ptr mem) +(Zero [s] ptr mem) && buildcfg.GOPPC64 >= 9 => (LoweredQuadZero [s] ptr mem) + +// moves +(Move [0] _ _ mem) => mem +(Move [1] dst src mem) => (MOVBstore dst (MOVBZload src mem) mem) +(Move [2] dst src mem) => + (MOVHstore dst (MOVHZload src mem) mem) +(Move [4] dst src mem) => + (MOVWstore dst (MOVWZload src mem) mem) +// MOVD for load and store must have offsets that are multiple of 4 +(Move [8] {t} dst src mem) => + (MOVDstore dst (MOVDload src mem) mem) +(Move [3] dst src mem) => + (MOVBstore [2] dst (MOVBZload [2] src mem) + (MOVHstore dst (MOVHload src mem) mem)) +(Move [5] dst src mem) => + (MOVBstore [4] dst (MOVBZload [4] src mem) + (MOVWstore dst (MOVWZload src mem) mem)) +(Move [6] dst src mem) => + (MOVHstore [4] dst (MOVHZload [4] src mem) + (MOVWstore dst (MOVWZload src mem) mem)) +(Move [7] dst src mem) => + (MOVBstore [6] dst (MOVBZload [6] src mem) + (MOVHstore [4] dst (MOVHZload [4] src mem) + (MOVWstore dst (MOVWZload src mem) mem))) + +// Large move uses a loop. Since the address is computed and the +// offset is zero, any alignment can be used. +(Move [s] dst src mem) && s > 8 && buildcfg.GOPPC64 <= 8 && logLargeCopy(v, s) => + (LoweredMove [s] dst src mem) +(Move [s] dst src mem) && s > 8 && s <= 64 && buildcfg.GOPPC64 >= 9 => + (LoweredQuadMoveShort [s] dst src mem) +(Move [s] dst src mem) && s > 8 && buildcfg.GOPPC64 >= 9 && logLargeCopy(v, s) => + (LoweredQuadMove [s] dst src mem) + +// Calls +// Lowering calls +(StaticCall ...) => (CALLstatic ...) +(ClosureCall ...) => (CALLclosure ...) +(InterCall ...) => (CALLinter ...) +(TailCall ...) => (CALLtail ...) + +// Miscellaneous +(GetClosurePtr ...) => (LoweredGetClosurePtr ...) +(GetCallerSP ...) => (LoweredGetCallerSP ...) +(GetCallerPC ...) => (LoweredGetCallerPC ...) +(IsNonNil ptr) => (NotEqual (CMPconst [0] ptr)) +(IsInBounds idx len) => (LessThan (CMPU idx len)) +(IsSliceInBounds idx len) => (LessEqual (CMPU idx len)) +(NilCheck ...) => (LoweredNilCheck ...) + +// Write barrier. +(WB ...) => (LoweredWB ...) + +// Publication barrier as intrinsic +(PubBarrier ...) => (LoweredPubBarrier ...) + +(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem) +(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem) +(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem) + +// Optimizations +// Note that PPC "logical" immediates come in 0:15 and 16:31 unsigned immediate forms, +// so ORconst, XORconst easily expand into a pair. + +// Include very-large constants in the const-const case. +(AND (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c&d]) +(OR (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c|d]) +(XOR (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c^d]) +(ORN (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c|^d]) +(ANDN (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c&^d]) +(NOR (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [^(c|d)]) + +// Discover consts +(AND x (MOVDconst [c])) && isU16Bit(c) => (Select0 (ANDCCconst [c] x)) +(XOR x (MOVDconst [c])) && isU32Bit(c) => (XORconst [c] x) +(OR x (MOVDconst [c])) && isU32Bit(c) => (ORconst [c] x) + +// Simplify consts +(Select0 (ANDCCconst [c] (Select0 (ANDCCconst [d] x)))) => (Select0 (ANDCCconst [c&d] x)) +(ORconst [c] (ORconst [d] x)) => (ORconst [c|d] x) +(XORconst [c] (XORconst [d] x)) => (XORconst [c^d] x) +(Select0 (ANDCCconst [-1] x)) => x +(Select0 (ANDCCconst [0] _)) => (MOVDconst [0]) +(XORconst [0] x) => x +(ORconst [-1] _) => (MOVDconst [-1]) +(ORconst [0] x) => x + +// zero-extend of small and => small and +(MOVBZreg y:(Select0 (ANDCCconst [c] _))) && uint64(c) <= 0xFF => y +(MOVHZreg y:(Select0 (ANDCCconst [c] _))) && uint64(c) <= 0xFFFF => y +(MOVWZreg y:(Select0 (ANDCCconst [c] _))) && uint64(c) <= 0xFFFFFFFF => y +(MOVWZreg y:(AND (MOVDconst [c]) _)) && uint64(c) <= 0xFFFFFFFF => y + +// sign extend of small-positive and => small-positive-and +(MOVBreg y:(Select0 (ANDCCconst [c] _))) && uint64(c) <= 0x7F => y +(MOVHreg y:(Select0 (ANDCCconst [c] _))) && uint64(c) <= 0x7FFF => y +(MOVWreg y:(Select0 (ANDCCconst [c] _))) && uint64(c) <= 0xFFFF => y // 0xFFFF is largest immediate constant, when regarded as 32-bit is > 0 +(MOVWreg y:(AND (MOVDconst [c]) _)) && uint64(c) <= 0x7FFFFFFF => y + +// small and of zero-extend => either zero-extend or small and +(Select0 (ANDCCconst [c] y:(MOVBZreg _))) && c&0xFF == 0xFF => y +(Select0 (ANDCCconst [0xFF] (MOVBreg x))) => (MOVBZreg x) +(Select0 (ANDCCconst [c] y:(MOVHZreg _))) && c&0xFFFF == 0xFFFF => y +(Select0 (ANDCCconst [0xFFFF] (MOVHreg x))) => (MOVHZreg x) + +(AND (MOVDconst [c]) y:(MOVWZreg _)) && c&0xFFFFFFFF == 0xFFFFFFFF => y +(AND (MOVDconst [0xFFFFFFFF]) y:(MOVWreg x)) => (MOVWZreg x) +// normal case +(Select0 (ANDCCconst [c] (MOVBZreg x))) => (Select0 (ANDCCconst [c&0xFF] x)) +(Select0 (ANDCCconst [c] (MOVHZreg x))) => (Select0 (ANDCCconst [c&0xFFFF] x)) +(Select0 (ANDCCconst [c] (MOVWZreg x))) => (Select0 (ANDCCconst [c&0xFFFFFFFF] x)) + +// Eliminate unnecessary sign/zero extend following right shift +(MOV(B|H|W)Zreg (SRWconst [c] (MOVBZreg x))) => (SRWconst [c] (MOVBZreg x)) +(MOV(H|W)Zreg (SRWconst [c] (MOVHZreg x))) => (SRWconst [c] (MOVHZreg x)) +(MOVWZreg (SRWconst [c] (MOVWZreg x))) => (SRWconst [c] (MOVWZreg x)) +(MOV(B|H|W)reg (SRAWconst [c] (MOVBreg x))) => (SRAWconst [c] (MOVBreg x)) +(MOV(H|W)reg (SRAWconst [c] (MOVHreg x))) => (SRAWconst [c] (MOVHreg x)) +(MOVWreg (SRAWconst [c] (MOVWreg x))) => (SRAWconst [c] (MOVWreg x)) + +(MOV(WZ|W)reg (S(R|RA)Wconst [c] x)) && sizeof(x.Type) <= 32 => (S(R|RA)Wconst [c] x) +(MOV(HZ|H)reg (S(R|RA)Wconst [c] x)) && sizeof(x.Type) <= 16 => (S(R|RA)Wconst [c] x) +(MOV(BZ|B)reg (S(R|RA)Wconst [c] x)) && sizeof(x.Type) == 8 => (S(R|RA)Wconst [c] x) + +// initial right shift will handle sign/zero extend +(MOVBZreg (SRDconst [c] x)) && c>=56 => (SRDconst [c] x) +(MOVBreg (SRDconst [c] x)) && c>56 => (SRDconst [c] x) +(MOVBreg (SRDconst [c] x)) && c==56 => (SRADconst [c] x) +(MOVBreg (SRADconst [c] x)) && c>=56 => (SRADconst [c] x) +(MOVBZreg (SRWconst [c] x)) && c>=24 => (SRWconst [c] x) +(MOVBreg (SRWconst [c] x)) && c>24 => (SRWconst [c] x) +(MOVBreg (SRWconst [c] x)) && c==24 => (SRAWconst [c] x) +(MOVBreg (SRAWconst [c] x)) && c>=24 => (SRAWconst [c] x) + +(MOVHZreg (SRDconst [c] x)) && c>=48 => (SRDconst [c] x) +(MOVHreg (SRDconst [c] x)) && c>48 => (SRDconst [c] x) +(MOVHreg (SRDconst [c] x)) && c==48 => (SRADconst [c] x) +(MOVHreg (SRADconst [c] x)) && c>=48 => (SRADconst [c] x) +(MOVHZreg (SRWconst [c] x)) && c>=16 => (SRWconst [c] x) +(MOVHreg (SRWconst [c] x)) && c>16 => (SRWconst [c] x) +(MOVHreg (SRAWconst [c] x)) && c>=16 => (SRAWconst [c] x) +(MOVHreg (SRWconst [c] x)) && c==16 => (SRAWconst [c] x) + +(MOVWZreg (SRDconst [c] x)) && c>=32 => (SRDconst [c] x) +(MOVWreg (SRDconst [c] x)) && c>32 => (SRDconst [c] x) +(MOVWreg (SRADconst [c] x)) && c>=32 => (SRADconst [c] x) +(MOVWreg (SRDconst [c] x)) && c==32 => (SRADconst [c] x) + +// Various redundant zero/sign extension combinations. +(MOVBZreg y:(MOVBZreg _)) => y // repeat +(MOVBreg y:(MOVBreg _)) => y // repeat +(MOVBreg (MOVBZreg x)) => (MOVBreg x) +(MOVBZreg (MOVBreg x)) => (MOVBZreg x) + +// H - there are more combinations than these + +(MOVHZreg y:(MOV(H|B)Zreg _)) => y // repeat +(MOVHZreg y:(MOVHBRload _ _)) => y + +(MOVHreg y:(MOV(H|B)reg _)) => y // repeat + +(MOV(H|HZ)reg y:(MOV(HZ|H)reg x)) => (MOV(H|HZ)reg x) + +// W - there are more combinations than these + +(MOV(WZ|WZ|WZ|W|W|W)reg y:(MOV(WZ|HZ|BZ|W|H|B)reg _)) => y // repeat +(MOVWZreg y:(MOV(H|W)BRload _ _)) => y + +(MOV(W|WZ)reg y:(MOV(WZ|W)reg x)) => (MOV(W|WZ)reg x) + +// Truncate then logical then truncate: omit first, lesser or equal truncate +(MOVWZreg ((OR|XOR|AND) <t> x (MOVWZreg y))) => (MOVWZreg ((OR|XOR|AND) <t> x y)) +(MOVHZreg ((OR|XOR|AND) <t> x (MOVWZreg y))) => (MOVHZreg ((OR|XOR|AND) <t> x y)) +(MOVHZreg ((OR|XOR|AND) <t> x (MOVHZreg y))) => (MOVHZreg ((OR|XOR|AND) <t> x y)) +(MOVBZreg ((OR|XOR|AND) <t> x (MOVWZreg y))) => (MOVBZreg ((OR|XOR|AND) <t> x y)) +(MOVBZreg ((OR|XOR|AND) <t> x (MOVHZreg y))) => (MOVBZreg ((OR|XOR|AND) <t> x y)) +(MOVBZreg ((OR|XOR|AND) <t> x (MOVBZreg y))) => (MOVBZreg ((OR|XOR|AND) <t> x y)) + +(MOV(B|H|W)Zreg z:(Select0 (ANDCCconst [c] (MOVBZload ptr x)))) => z +(MOV(B|H|W)Zreg z:(AND y (MOV(B|H|W)Zload ptr x))) => z +(MOV(H|W)Zreg z:(Select0 (ANDCCconst [c] (MOVHZload ptr x)))) => z +(MOVWZreg z:(Select0 (ANDCCconst [c] (MOVWZload ptr x)))) => z + +// Arithmetic constant ops + +(ADD x (MOVDconst [c])) && is32Bit(c) => (ADDconst [c] x) +(ADDconst [c] (ADDconst [d] x)) && is32Bit(c+d) => (ADDconst [c+d] x) +(ADDconst [0] x) => x +(SUB x (MOVDconst [c])) && is32Bit(-c) => (ADDconst [-c] x) + +(ADDconst [c] (MOVDaddr [d] {sym} x)) && is32Bit(c+int64(d)) => (MOVDaddr [int32(c+int64(d))] {sym} x) +(ADDconst [c] x:(SP)) && is32Bit(c) => (MOVDaddr [int32(c)] x) // so it is rematerializeable + +(MULL(W|D) x (MOVDconst [c])) && is16Bit(c) => (MULL(W|D)const [int32(c)] x) + +// Subtract from (with carry, but ignored) constant. +// Note, these clobber the carry bit. +(SUB (MOVDconst [c]) x) && is32Bit(c) => (SUBFCconst [c] x) +(SUBFCconst [c] (NEG x)) => (ADDconst [c] x) +(SUBFCconst [c] (SUBFCconst [d] x)) && is32Bit(c-d) => (ADDconst [c-d] x) +(SUBFCconst [0] x) => (NEG x) +(ADDconst [c] (SUBFCconst [d] x)) && is32Bit(c+d) => (SUBFCconst [c+d] x) +(NEG (ADDconst [c] x)) && is32Bit(-c) => (SUBFCconst [-c] x) +(NEG (SUBFCconst [c] x)) && is32Bit(-c) => (ADDconst [-c] x) +(NEG (SUB x y)) => (SUB y x) +(NEG (NEG x)) => x + +// Use register moves instead of stores and loads to move int<=>float values +// Common with math Float64bits, Float64frombits +(MOVDload [off] {sym} ptr (FMOVDstore [off] {sym} ptr x _)) => (MFVSRD x) +(FMOVDload [off] {sym} ptr (MOVDstore [off] {sym} ptr x _)) => (MTVSRD x) + +(FMOVDstore [off] {sym} ptr (MTVSRD x) mem) => (MOVDstore [off] {sym} ptr x mem) +(MOVDstore [off] {sym} ptr (MFVSRD x) mem) => (FMOVDstore [off] {sym} ptr x mem) + +(MTVSRD (MOVDconst [c])) && !math.IsNaN(math.Float64frombits(uint64(c))) => (FMOVDconst [math.Float64frombits(uint64(c))]) +(MFVSRD (FMOVDconst [c])) => (MOVDconst [int64(math.Float64bits(c))]) + +(MTVSRD x:(MOVDload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (FMOVDload [off] {sym} ptr mem) +(MFVSRD x:(FMOVDload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVDload [off] {sym} ptr mem) + +// Fold offsets for stores. +(MOV(D|W|H|B)store [off1] {sym} (ADDconst [off2] x) val mem) && is16Bit(int64(off1)+off2) => (MOV(D|W|H|B)store [off1+int32(off2)] {sym} x val mem) + +(FMOV(S|D)store [off1] {sym} (ADDconst [off2] ptr) val mem) && is16Bit(int64(off1)+off2) => (FMOV(S|D)store [off1+int32(off2)] {sym} ptr val mem) + +// Fold address into load/store. +// The assembler needs to generate several instructions and use +// temp register for accessing global, and each time it will reload +// the temp register. So don't fold address of global, unless there +// is only one use. +(MOV(B|H|W|D)store [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) + && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) => + (MOV(B|H|W|D)store [off1+off2] {mergeSym(sym1,sym2)} ptr val mem) + +(FMOV(S|D)store [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) val mem) && canMergeSym(sym1,sym2) + && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) => + (FMOV(S|D)store [off1+off2] {mergeSym(sym1,sym2)} ptr val mem) + +(MOV(B|H|W)Zload [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) + && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) => + (MOV(B|H|W)Zload [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOV(H|W|D)load [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) + && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) => + (MOV(H|W|D)load [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(FMOV(S|D)load [off1] {sym1} p:(MOVDaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) + && is16Bit(int64(off1+off2)) && (ptr.Op != OpSB || p.Uses == 1) => + (FMOV(S|D)load [off1+off2] {mergeSym(sym1,sym2)} ptr mem) + +// Fold offsets for loads. +(FMOV(S|D)load [off1] {sym} (ADDconst [off2] ptr) mem) && is16Bit(int64(off1)+off2) => (FMOV(S|D)load [off1+int32(off2)] {sym} ptr mem) + +(MOV(D|W|WZ|H|HZ|BZ)load [off1] {sym} (ADDconst [off2] x) mem) && is16Bit(int64(off1)+off2) => (MOV(D|W|WZ|H|HZ|BZ)load [off1+int32(off2)] {sym} x mem) + +// Determine load + addressing that can be done as a register indexed load +(MOV(D|W|WZ|H|HZ|BZ)load [0] {sym} p:(ADD ptr idx) mem) && sym == nil && p.Uses == 1 => (MOV(D|W|WZ|H|HZ|BZ)loadidx ptr idx mem) + +// Determine if there is benefit to using a non-indexed load, since that saves the load +// of the index register. With MOVDload and MOVWload, there is no benefit if the offset +// value is not a multiple of 4, since that results in an extra instruction in the base +// register address computation. +(MOV(D|W)loadidx ptr (MOVDconst [c]) mem) && is16Bit(c) && c%4 == 0 => (MOV(D|W)load [int32(c)] ptr mem) +(MOV(WZ|H|HZ|BZ)loadidx ptr (MOVDconst [c]) mem) && is16Bit(c) => (MOV(WZ|H|HZ|BZ)load [int32(c)] ptr mem) +(MOV(D|W)loadidx (MOVDconst [c]) ptr mem) && is16Bit(c) && c%4 == 0 => (MOV(D|W)load [int32(c)] ptr mem) +(MOV(WZ|H|HZ|BZ)loadidx (MOVDconst [c]) ptr mem) && is16Bit(c) => (MOV(WZ|H|HZ|BZ)load [int32(c)] ptr mem) + +// Store of zero => storezero +(MOV(D|W|H|B)store [off] {sym} ptr (MOVDconst [0]) mem) => (MOV(D|W|H|B)storezero [off] {sym} ptr mem) + +// Fold offsets for storezero +(MOV(D|W|H|B)storezero [off1] {sym} (ADDconst [off2] x) mem) && is16Bit(int64(off1)+off2) => + (MOV(D|W|H|B)storezero [off1+int32(off2)] {sym} x mem) + +// Stores with addressing that can be done as indexed stores +(MOV(D|W|H|B)store [0] {sym} p:(ADD ptr idx) val mem) && sym == nil && p.Uses == 1 => (MOV(D|W|H|B)storeidx ptr idx val mem) + +// Stores with constant index values can be done without indexed instructions +// No need to lower the idx cases if c%4 is not 0 +(MOVDstoreidx ptr (MOVDconst [c]) val mem) && is16Bit(c) && c%4 == 0 => (MOVDstore [int32(c)] ptr val mem) +(MOV(W|H|B)storeidx ptr (MOVDconst [c]) val mem) && is16Bit(c) => (MOV(W|H|B)store [int32(c)] ptr val mem) +(MOVDstoreidx (MOVDconst [c]) ptr val mem) && is16Bit(c) && c%4 == 0 => (MOVDstore [int32(c)] ptr val mem) +(MOV(W|H|B)storeidx (MOVDconst [c]) ptr val mem) && is16Bit(c) => (MOV(W|H|B)store [int32(c)] ptr val mem) + +// Fold symbols into storezero +(MOV(D|W|H|B)storezero [off1] {sym1} p:(MOVDaddr [off2] {sym2} x) mem) && canMergeSym(sym1,sym2) + && (x.Op != OpSB || p.Uses == 1) => + (MOV(D|W|H|B)storezero [off1+off2] {mergeSym(sym1,sym2)} x mem) + +// atomic intrinsics +(AtomicLoad(8|32|64|Ptr) ptr mem) => (LoweredAtomicLoad(8|32|64|Ptr) [1] ptr mem) +(AtomicLoadAcq(32|64) ptr mem) => (LoweredAtomicLoad(32|64) [0] ptr mem) + +(AtomicStore(8|32|64) ptr val mem) => (LoweredAtomicStore(8|32|64) [1] ptr val mem) +(AtomicStoreRel(32|64) ptr val mem) => (LoweredAtomicStore(32|64) [0] ptr val mem) + +(AtomicExchange(32|64) ...) => (LoweredAtomicExchange(32|64) ...) + +(AtomicAdd(32|64) ...) => (LoweredAtomicAdd(32|64) ...) + +(AtomicCompareAndSwap(32|64) ptr old new_ mem) => (LoweredAtomicCas(32|64) [1] ptr old new_ mem) +(AtomicCompareAndSwapRel32 ptr old new_ mem) => (LoweredAtomicCas32 [0] ptr old new_ mem) + +(AtomicAnd(8|32) ...) => (LoweredAtomicAnd(8|32) ...) +(AtomicOr(8|32) ...) => (LoweredAtomicOr(8|32) ...) + +(Slicemask <t> x) => (SRADconst (NEG <t> x) [63]) + +// Note that MOV??reg returns a 64-bit int, x is not necessarily that wide +// This may interact with other patterns in the future. (Compare with arm64) +(MOV(B|H|W)Zreg x:(MOVBZload _ _)) => x +(MOV(B|H|W)Zreg x:(MOVBZloadidx _ _ _)) => x +(MOV(H|W)Zreg x:(MOVHZload _ _)) => x +(MOV(H|W)Zreg x:(MOVHZloadidx _ _ _)) => x +(MOV(H|W)reg x:(MOVHload _ _)) => x +(MOV(H|W)reg x:(MOVHloadidx _ _ _)) => x +(MOV(WZ|W)reg x:(MOV(WZ|W)load _ _)) => x +(MOV(WZ|W)reg x:(MOV(WZ|W)loadidx _ _ _)) => x +(MOV(B|W)Zreg x:(Select0 (LoweredAtomicLoad(8|32) _ _))) => x + +// don't extend if argument is already extended +(MOVBreg x:(Arg <t>)) && is8BitInt(t) && isSigned(t) => x +(MOVBZreg x:(Arg <t>)) && is8BitInt(t) && !isSigned(t) => x +(MOVHreg x:(Arg <t>)) && (is8BitInt(t) || is16BitInt(t)) && isSigned(t) => x +(MOVHZreg x:(Arg <t>)) && (is8BitInt(t) || is16BitInt(t)) && !isSigned(t) => x +(MOVWreg x:(Arg <t>)) && (is8BitInt(t) || is16BitInt(t) || is32BitInt(t)) && isSigned(t) => x +(MOVWZreg x:(Arg <t>)) && (is8BitInt(t) || is16BitInt(t) || is32BitInt(t)) && !isSigned(t) => x + +(MOVBZreg (MOVDconst [c])) => (MOVDconst [int64(uint8(c))]) +(MOVBreg (MOVDconst [c])) => (MOVDconst [int64(int8(c))]) +(MOVHZreg (MOVDconst [c])) => (MOVDconst [int64(uint16(c))]) +(MOVHreg (MOVDconst [c])) => (MOVDconst [int64(int16(c))]) +(MOVWreg (MOVDconst [c])) => (MOVDconst [int64(int32(c))]) +(MOVWZreg (MOVDconst [c])) => (MOVDconst [int64(uint32(c))]) + +// Implement clrsldi and clrslwi extended mnemonics as described in +// ISA 3.0 section C.8. AuxInt field contains values needed for +// the instructions, packed together since there is only one available. +(SLDconst [c] z:(MOVBZreg x)) && c < 8 && z.Uses == 1 => (CLRLSLDI [newPPC64ShiftAuxInt(c,56,63,64)] x) +(SLDconst [c] z:(MOVHZreg x)) && c < 16 && z.Uses == 1 => (CLRLSLDI [newPPC64ShiftAuxInt(c,48,63,64)] x) +(SLDconst [c] z:(MOVWZreg x)) && c < 32 && z.Uses == 1 => (CLRLSLDI [newPPC64ShiftAuxInt(c,32,63,64)] x) + +(SLDconst [c] z:(Select0 (ANDCCconst [d] x))) && z.Uses == 1 && isPPC64ValidShiftMask(d) && c <= (64-getPPC64ShiftMaskLength(d)) => (CLRLSLDI [newPPC64ShiftAuxInt(c,64-getPPC64ShiftMaskLength(d),63,64)] x) +(SLDconst [c] z:(AND (MOVDconst [d]) x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) && c<=(64-getPPC64ShiftMaskLength(d)) => (CLRLSLDI [newPPC64ShiftAuxInt(c,64-getPPC64ShiftMaskLength(d),63,64)] x) +(SLWconst [c] z:(MOVBZreg x)) && z.Uses == 1 && c < 8 => (CLRLSLWI [newPPC64ShiftAuxInt(c,24,31,32)] x) +(SLWconst [c] z:(MOVHZreg x)) && z.Uses == 1 && c < 16 => (CLRLSLWI [newPPC64ShiftAuxInt(c,16,31,32)] x) +(SLWconst [c] z:(Select0 (ANDCCconst [d] x))) && z.Uses == 1 && isPPC64ValidShiftMask(d) && c<=(32-getPPC64ShiftMaskLength(d)) => (CLRLSLWI [newPPC64ShiftAuxInt(c,32-getPPC64ShiftMaskLength(d),31,32)] x) +(SLWconst [c] z:(AND (MOVDconst [d]) x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) && c<=(32-getPPC64ShiftMaskLength(d)) => (CLRLSLWI [newPPC64ShiftAuxInt(c,32-getPPC64ShiftMaskLength(d),31,32)] x) +// special case for power9 +(SL(W|D)const [c] z:(MOVWreg x)) && c < 32 && buildcfg.GOPPC64 >= 9 => (EXTSWSLconst [c] x) + +// Lose widening ops fed to stores +(MOVBstore [off] {sym} ptr (MOV(B|BZ|H|HZ|W|WZ)reg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOV(H|HZ|W|WZ)reg x) mem) => (MOVHstore [off] {sym} ptr x mem) +(MOVWstore [off] {sym} ptr (MOV(W|WZ)reg x) mem) => (MOVWstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (SRWconst (MOV(H|HZ)reg x) [c]) mem) && c <= 8 => (MOVBstore [off] {sym} ptr (SRWconst <typ.UInt32> x [c]) mem) +(MOVBstore [off] {sym} ptr (SRWconst (MOV(W|WZ)reg x) [c]) mem) && c <= 24 => (MOVBstore [off] {sym} ptr (SRWconst <typ.UInt32> x [c]) mem) +(MOVBstoreidx ptr idx (MOV(B|BZ|H|HZ|W|WZ)reg x) mem) => (MOVBstoreidx ptr idx x mem) +(MOVHstoreidx ptr idx (MOV(H|HZ|W|WZ)reg x) mem) => (MOVHstoreidx ptr idx x mem) +(MOVWstoreidx ptr idx (MOV(W|WZ)reg x) mem) => (MOVWstoreidx ptr idx x mem) +(MOVBstoreidx ptr idx (SRWconst (MOV(H|HZ)reg x) [c]) mem) && c <= 8 => (MOVBstoreidx ptr idx (SRWconst <typ.UInt32> x [c]) mem) +(MOVBstoreidx ptr idx (SRWconst (MOV(W|WZ)reg x) [c]) mem) && c <= 24 => (MOVBstoreidx ptr idx (SRWconst <typ.UInt32> x [c]) mem) +(MOVHBRstore {sym} ptr (MOV(H|HZ|W|WZ)reg x) mem) => (MOVHBRstore {sym} ptr x mem) +(MOVWBRstore {sym} ptr (MOV(W|WZ)reg x) mem) => (MOVWBRstore {sym} ptr x mem) + +// Lose W-widening ops fed to compare-W +(CMP(W|WU) x (MOV(W|WZ)reg y)) => (CMP(W|WU) x y) +(CMP(W|WU) (MOV(W|WZ)reg x) y) => (CMP(W|WU) x y) + +(CMP x (MOVDconst [c])) && is16Bit(c) => (CMPconst x [c]) +(CMP (MOVDconst [c]) y) && is16Bit(c) => (InvertFlags (CMPconst y [c])) +(CMPW x (MOVDconst [c])) && is16Bit(c) => (CMPWconst x [int32(c)]) +(CMPW (MOVDconst [c]) y) && is16Bit(c) => (InvertFlags (CMPWconst y [int32(c)])) + +(CMPU x (MOVDconst [c])) && isU16Bit(c) => (CMPUconst x [c]) +(CMPU (MOVDconst [c]) y) && isU16Bit(c) => (InvertFlags (CMPUconst y [c])) +(CMPWU x (MOVDconst [c])) && isU16Bit(c) => (CMPWUconst x [int32(c)]) +(CMPWU (MOVDconst [c]) y) && isU16Bit(c) => (InvertFlags (CMPWUconst y [int32(c)])) + +// Canonicalize the order of arguments to comparisons - helps with CSE. +((CMP|CMPW|CMPU|CMPWU) x y) && canonLessThan(x,y) => (InvertFlags ((CMP|CMPW|CMPU|CMPWU) y x)) + +// ISEL auxInt values 0=LT 1=GT 2=EQ arg2 ? arg0 : arg1 +// ISEL auxInt values 4=GE 5=LE 6=NE !arg2 ? arg1 : arg0 +// ISELB special case where arg0, arg1 values are 0, 1 + +(Equal cmp) => (ISELB [2] (MOVDconst [1]) cmp) +(NotEqual cmp) => (ISELB [6] (MOVDconst [1]) cmp) +(LessThan cmp) => (ISELB [0] (MOVDconst [1]) cmp) +(FLessThan cmp) => (ISELB [0] (MOVDconst [1]) cmp) +(FLessEqual cmp) => (ISEL [2] (MOVDconst [1]) (ISELB [0] (MOVDconst [1]) cmp) cmp) +(GreaterEqual cmp) => (ISELB [4] (MOVDconst [1]) cmp) +(GreaterThan cmp) => (ISELB [1] (MOVDconst [1]) cmp) +(FGreaterThan cmp) => (ISELB [1] (MOVDconst [1]) cmp) +(FGreaterEqual cmp) => (ISEL [2] (MOVDconst [1]) (ISELB [1] (MOVDconst [1]) cmp) cmp) +(LessEqual cmp) => (ISELB [5] (MOVDconst [1]) cmp) + +(ISELB [0] _ (FlagLT)) => (MOVDconst [1]) +(ISELB [0] _ (Flag(GT|EQ))) => (MOVDconst [0]) +(ISELB [1] _ (FlagGT)) => (MOVDconst [1]) +(ISELB [1] _ (Flag(LT|EQ))) => (MOVDconst [0]) +(ISELB [2] _ (FlagEQ)) => (MOVDconst [1]) +(ISELB [2] _ (Flag(LT|GT))) => (MOVDconst [0]) +(ISELB [4] _ (FlagLT)) => (MOVDconst [0]) +(ISELB [4] _ (Flag(GT|EQ))) => (MOVDconst [1]) +(ISELB [5] _ (FlagGT)) => (MOVDconst [0]) +(ISELB [5] _ (Flag(LT|EQ))) => (MOVDconst [1]) +(ISELB [6] _ (FlagEQ)) => (MOVDconst [0]) +(ISELB [6] _ (Flag(LT|GT))) => (MOVDconst [1]) + +(ISEL [2] x _ (FlagEQ)) => x +(ISEL [2] _ y (Flag(LT|GT))) => y + +(ISEL [6] _ y (FlagEQ)) => y +(ISEL [6] x _ (Flag(LT|GT))) => x + +(ISEL [0] _ y (Flag(EQ|GT))) => y +(ISEL [0] x _ (FlagLT)) => x + +(ISEL [5] _ x (Flag(EQ|LT))) => x +(ISEL [5] y _ (FlagGT)) => y + +(ISEL [1] _ y (Flag(EQ|LT))) => y +(ISEL [1] x _ (FlagGT)) => x + +(ISEL [4] x _ (Flag(EQ|GT))) => x +(ISEL [4] _ y (FlagLT)) => y + +(ISEL [2] x y ((CMP|CMPW)const [0] (Select0 (ANDCCconst [n] z)))) => (ISEL [2] x y (Select1 <types.TypeFlags> (ANDCCconst [n] z ))) +(ISEL [6] x y ((CMP|CMPW)const [0] (Select0 (ANDCCconst [n] z)))) => (ISEL [6] x y (Select1 <types.TypeFlags> (ANDCCconst [n] z ))) +(ISELB [2] x ((CMP|CMPW)const [0] (Select0 (ANDCCconst [1] z)))) => (XORconst [1] (Select0 <typ.UInt64> (ANDCCconst [1] z ))) +(ISELB [6] x ((CMP|CMPW)const [0] (Select0 (ANDCCconst [1] z)))) => (Select0 <typ.UInt64> (ANDCCconst [1] z )) + +(ISELB [2] x (CMPWconst [0] (Select0 (ANDCCconst [n] z)))) => (ISELB [2] x (Select1 <types.TypeFlags> (ANDCCconst [n] z ))) +(ISELB [6] x (CMPWconst [0] (Select0 (ANDCCconst [n] z)))) => (ISELB [6] x (Select1 <types.TypeFlags> (ANDCCconst [n] z ))) + +// Only CMPconst for these in case AND|OR|XOR result is > 32 bits +(ISELB [2] x (CMPconst [0] a:(AND y z))) && a.Uses == 1 => (ISELB [2] x (Select1 <types.TypeFlags> (ANDCC y z ))) +(ISELB [6] x (CMPconst [0] a:(AND y z))) && a.Uses == 1 => (ISELB [6] x (Select1 <types.TypeFlags> (ANDCC y z ))) + +(ISELB [2] x (CMPconst [0] o:(OR y z))) && o.Uses == 1 => (ISELB [2] x (Select1 <types.TypeFlags> (ORCC y z ))) +(ISELB [6] x (CMPconst [0] o:(OR y z))) && o.Uses == 1 => (ISELB [6] x (Select1 <types.TypeFlags> (ORCC y z ))) + +(ISELB [2] x (CMPconst [0] a:(XOR y z))) && a.Uses == 1 => (ISELB [2] x (Select1 <types.TypeFlags> (XORCC y z ))) +(ISELB [6] x (CMPconst [0] a:(XOR y z))) && a.Uses == 1 => (ISELB [6] x (Select1 <types.TypeFlags> (XORCC y z ))) + +(ISELB [n] (MOVDconst [1]) (InvertFlags bool)) && n%4 == 0 => (ISELB [n+1] (MOVDconst [1]) bool) +(ISELB [n] (MOVDconst [1]) (InvertFlags bool)) && n%4 == 1 => (ISELB [n-1] (MOVDconst [1]) bool) +(ISELB [n] (MOVDconst [1]) (InvertFlags bool)) && n%4 == 2 => (ISELB [n] (MOVDconst [1]) bool) +(ISEL [n] x y (InvertFlags bool)) && n%4 == 0 => (ISEL [n+1] x y bool) +(ISEL [n] x y (InvertFlags bool)) && n%4 == 1 => (ISEL [n-1] x y bool) +(ISEL [n] x y (InvertFlags bool)) && n%4 == 2 => (ISEL [n] x y bool) +(XORconst [1] (ISELB [6] (MOVDconst [1]) cmp)) => (ISELB [2] (MOVDconst [1]) cmp) +(XORconst [1] (ISELB [5] (MOVDconst [1]) cmp)) => (ISELB [1] (MOVDconst [1]) cmp) +(XORconst [1] (ISELB [4] (MOVDconst [1]) cmp)) => (ISELB [0] (MOVDconst [1]) cmp) + +// A particular pattern seen in cgo code: +(AND (MOVDconst [c]) x:(MOVBZload _ _)) => (Select0 (ANDCCconst [c&0xFF] x)) + +// floating point negative abs +(FNEG (F(ABS|NABS) x)) => (F(NABS|ABS) x) + +// floating-point fused multiply-add/sub +(F(ADD|SUB) (FMUL x y) z) && x.Block.Func.useFMA(v) => (FM(ADD|SUB) x y z) +(F(ADDS|SUBS) (FMULS x y) z) && x.Block.Func.useFMA(v) => (FM(ADDS|SUBS) x y z) + +// The following statements are found in encoding/binary functions UintXX (load) and PutUintXX (store) +// and convert the statements in these functions from multiple single byte loads or stores to +// the single largest possible load or store. +// Some are marked big or little endian based on the order in which the bytes are loaded or stored, +// not on the ordering of the machine. These are intended for little endian machines. +// To implement for big endian machines, most rules would have to be duplicated but the +// resulting rule would be reversed, i. e., MOVHZload on little endian would be MOVHBRload on big endian +// and vice versa. +// b[0] | b[1]<<8 => load 16-bit Little endian +(OR <t> x0:(MOVBZload [i0] {s} p mem) + o1:(SL(W|D)const x1:(MOVBZload [i1] {s} p mem) [8])) + && !config.BigEndian + && i1 == i0+1 + && x0.Uses ==1 && x1.Uses == 1 + && o1.Uses == 1 + && mergePoint(b, x0, x1) != nil + && clobber(x0, x1, o1) + => @mergePoint(b,x0,x1) (MOVHZload <t> {s} [i0] p mem) + +// b[0]<<8 | b[1] => load 16-bit Big endian on Little endian arch. +// Use byte-reverse indexed load for 2 bytes. +(OR <t> x0:(MOVBZload [i1] {s} p mem) + o1:(SL(W|D)const x1:(MOVBZload [i0] {s} p mem) [8])) + && !config.BigEndian + && i1 == i0+1 + && x0.Uses ==1 && x1.Uses == 1 + && o1.Uses == 1 + && mergePoint(b, x0, x1) != nil + && clobber(x0, x1, o1) + => @mergePoint(b,x0,x1) (MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) + +// b[0]<<n+8 | b[1]<<n => load 16-bit Big endian (where n%8== 0) +// Use byte-reverse indexed load for 2 bytes, +// then shift left to the correct position. Used to match subrules +// from longer rules. +(OR <t> s0:(SL(W|D)const x0:(MOVBZload [i1] {s} p mem) [n1]) + s1:(SL(W|D)const x1:(MOVBZload [i0] {s} p mem) [n2])) + && !config.BigEndian + && i1 == i0+1 + && n1%8 == 0 + && n2 == n1+8 + && x0.Uses == 1 && x1.Uses == 1 + && s0.Uses == 1 && s1.Uses == 1 + && mergePoint(b, x0, x1) != nil + && clobber(x0, x1, s0, s1) + => @mergePoint(b,x0,x1) (SLDconst <t> (MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [n1]) + +// b[0] | b[1]<<8 | b[2]<<16 | b[3]<<24 => load 32-bit Little endian +// Use byte-reverse indexed load for 4 bytes. +(OR <t> s1:(SL(W|D)const x2:(MOVBZload [i3] {s} p mem) [24]) + o0:(OR <t> s0:(SL(W|D)const x1:(MOVBZload [i2] {s} p mem) [16]) + x0:(MOVHZload [i0] {s} p mem))) + && !config.BigEndian + && i2 == i0+2 + && i3 == i0+3 + && x0.Uses ==1 && x1.Uses == 1 && x2.Uses == 1 + && o0.Uses == 1 + && s0.Uses == 1 && s1.Uses == 1 + && mergePoint(b, x0, x1, x2) != nil + && clobber(x0, x1, x2, s0, s1, o0) + => @mergePoint(b,x0,x1,x2) (MOVWZload <t> {s} [i0] p mem) + +// b[0]<<24 | b[1]<<16 | b[2]<<8 | b[3] => load 32-bit Big endian order on Little endian arch +// Use byte-reverse indexed load for 4 bytes with computed address. +// Could be used to match subrules of a longer rule. +(OR <t> s1:(SL(W|D)const x2:(MOVBZload [i0] {s} p mem) [24]) + o0:(OR <t> s0:(SL(W|D)const x1:(MOVBZload [i1] {s} p mem) [16]) + x0:(MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i2] {s} p) mem))) + && !config.BigEndian + && i1 == i0+1 + && i2 == i0+2 + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 + && o0.Uses == 1 + && s0.Uses == 1 && s1.Uses == 1 + && mergePoint(b, x0, x1, x2) != nil + && clobber(x0, x1, x2, s0, s1, o0) + => @mergePoint(b,x0,x1,x2) (MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) + +// b[3] | b[2]<<8 | b[1]<<16 | b[0]<<24 => load 32-bit Big endian order on Little endian arch +// Use byte-reverse indexed load for 4 bytes with computed address. +// Could be used to match subrules of a longer rule. +(OR <t> x0:(MOVBZload [i3] {s} p mem) + o0:(OR <t> s0:(SL(W|D)const x1:(MOVBZload [i2] {s} p mem) [8]) + s1:(SL(W|D)const x2:(MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [16]))) + && !config.BigEndian + && i2 == i0+2 + && i3 == i0+3 + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 + && o0.Uses == 1 + && s0.Uses == 1 && s1.Uses == 1 + && mergePoint(b, x0, x1, x2) != nil + && clobber(x0, x1, x2, s0, s1, o0) + => @mergePoint(b,x0,x1,x2) (MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) + +// b[0]<<56 | b[1]<<48 | b[2]<<40 | b[3]<<32 => load 32-bit Big endian order on Little endian arch +// Use byte-reverse indexed load to for 4 bytes with computed address. +// Used to match longer rules. +(OR <t> s2:(SLDconst x2:(MOVBZload [i3] {s} p mem) [32]) + o0:(OR <t> s1:(SLDconst x1:(MOVBZload [i2] {s} p mem) [40]) + s0:(SLDconst x0:(MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [48]))) + && !config.BigEndian + && i2 == i0+2 + && i3 == i0+3 + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 + && o0.Uses == 1 + && s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1 + && mergePoint(b, x0, x1, x2) != nil + && clobber(x0, x1, x2, s0, s1, s2, o0) + => @mergePoint(b,x0,x1,x2) (SLDconst <t> (MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [32]) + +// b[3]<<32 | b[2]<<40 | b[1]<<48 | b[0]<<56 => load 32-bit Big endian order on Little endian arch +// Use byte-reverse indexed load for 4 bytes with constant address. +// Used to match longer rules. +(OR <t> s2:(SLDconst x2:(MOVBZload [i0] {s} p mem) [56]) + o0:(OR <t> s1:(SLDconst x1:(MOVBZload [i1] {s} p mem) [48]) + s0:(SLDconst x0:(MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i2] {s} p) mem) [32]))) + && !config.BigEndian + && i1 == i0+1 + && i2 == i0+2 + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 + && o0.Uses == 1 + && s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1 + && mergePoint(b, x0, x1, x2) != nil + && clobber(x0, x1, x2, s0, s1, s2, o0) + => @mergePoint(b,x0,x1,x2) (SLDconst <t> (MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [32]) + +// b[0] | b[1]<<8 | b[2]<<16 | b[3]<<24 | b[4] <<32 | b[5]<<40 | b[6]<<48 | b[7]<<56 => load 64-bit Little endian +// Rules with commutative ops and many operands will result in extremely large functions in rewritePPC64, +// so matching shorter previously defined subrules is important. +// Offset must be multiple of 4 for MOVD +(OR <t> s6:(SLDconst x7:(MOVBZload [i7] {s} p mem) [56]) + o5:(OR <t> s5:(SLDconst x6:(MOVBZload [i6] {s} p mem) [48]) + o4:(OR <t> s4:(SLDconst x5:(MOVBZload [i5] {s} p mem) [40]) + o3:(OR <t> s3:(SLDconst x4:(MOVBZload [i4] {s} p mem) [32]) + x0:(MOVWZload {s} [i0] p mem))))) + && !config.BigEndian + && i4 == i0+4 + && i5 == i0+5 + && i6 == i0+6 + && i7 == i0+7 + && x0.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses ==1 && x7.Uses == 1 + && o3.Uses == 1 && o4.Uses == 1 && o5.Uses == 1 + && s3.Uses == 1 && s4.Uses == 1 && s5.Uses == 1 && s6.Uses == 1 + && mergePoint(b, x0, x4, x5, x6, x7) != nil + && clobber(x0, x4, x5, x6, x7, s3, s4, s5, s6, o3, o4, o5) + => @mergePoint(b,x0,x4,x5,x6,x7) (MOVDload <t> {s} [i0] p mem) + +// b[7] | b[6]<<8 | b[5]<<16 | b[4]<<24 | b[3]<<32 | b[2]<<40 | b[1]<<48 | b[0]<<56 load 64-bit Big endian ordered bytes on Little endian arch +// Use byte-reverse indexed load of 8 bytes. +// Rules with commutative ops and many operands can result in extremely large functions in rewritePPC64, +// so matching shorter previously defined subrules is important. +(OR <t> s0:(SLDconst x0:(MOVBZload [i0] {s} p mem) [56]) + o0:(OR <t> s1:(SLDconst x1:(MOVBZload [i1] {s} p mem) [48]) + o1:(OR <t> s2:(SLDconst x2:(MOVBZload [i2] {s} p mem) [40]) + o2:(OR <t> s3:(SLDconst x3:(MOVBZload [i3] {s} p mem) [32]) + x4:(MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i4] p) mem))))) + && !config.BigEndian + && i1 == i0+1 + && i2 == i0+2 + && i3 == i0+3 + && i4 == i0+4 + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 + && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 + && s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1 + && mergePoint(b, x0, x1, x2, x3, x4) != nil + && clobber(x0, x1, x2, x3, x4, o0, o1, o2, s0, s1, s2, s3) + => @mergePoint(b,x0,x1,x2,x3,x4) (MOVDBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) + +// b[0]<<56 | b[1]<<48 | b[2]<<40 | b[3]<<32 | b[4]<<24 | b[5]<<16 | b[6]<<8 | b[7] => load 64-bit Big endian ordered bytes on Little endian arch +// Use byte-reverse indexed load of 8 bytes. +// Rules with commutative ops and many operands can result in extremely large functions in rewritePPC64, +// so matching shorter previously defined subrules is important. +(OR <t> x7:(MOVBZload [i7] {s} p mem) + o5:(OR <t> s6:(SLDconst x6:(MOVBZload [i6] {s} p mem) [8]) + o4:(OR <t> s5:(SLDconst x5:(MOVBZload [i5] {s} p mem) [16]) + o3:(OR <t> s4:(SLDconst x4:(MOVBZload [i4] {s} p mem) [24]) + s0:(SL(W|D)const x3:(MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [32]))))) + && !config.BigEndian + && i4 == i0+4 + && i5 == i0+5 + && i6 == i0+6 + && i7 == i0+7 + && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1 + && o3.Uses == 1 && o4.Uses == 1 && o5.Uses == 1 + && s0.Uses == 1 && s4.Uses == 1 && s5.Uses == 1 && s6.Uses == 1 + && mergePoint(b, x3, x4, x5, x6, x7) != nil + && clobber(x3, x4, x5, x6, x7, o3, o4, o5, s0, s4, s5, s6) + => @mergePoint(b,x3,x4,x5,x6,x7) (MOVDBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) + +// 2 byte store Little endian as in: +// b[0] = byte(v >> 16) +// b[1] = byte(v >> 24) +// Added for use in matching longer rules. +(MOVBstore [i1] {s} p (SR(W|D)const w [24]) + x0:(MOVBstore [i0] {s} p (SR(W|D)const w [16]) mem)) + && !config.BigEndian + && x0.Uses == 1 + && i1 == i0+1 + && clobber(x0) + => (MOVHstore [i0] {s} p (SRWconst <typ.UInt16> w [16]) mem) + +// 2 byte store Little endian as in: +// b[0] = byte(v) +// b[1] = byte(v >> 8) +(MOVBstore [i1] {s} p (SR(W|D)const w [8]) + x0:(MOVBstore [i0] {s} p w mem)) + && !config.BigEndian + && x0.Uses == 1 + && i1 == i0+1 + && clobber(x0) + => (MOVHstore [i0] {s} p w mem) + +// 4 byte store Little endian as in: +// b[0:1] = uint16(v) +// b[2:3] = uint16(v >> 16) +(MOVHstore [i1] {s} p (SR(W|D)const w [16]) + x0:(MOVHstore [i0] {s} p w mem)) + && !config.BigEndian + && x0.Uses == 1 + && i1 == i0+2 + && clobber(x0) + => (MOVWstore [i0] {s} p w mem) + +// 4 byte store Big endian as in: +// b[0] = byte(v >> 24) +// b[1] = byte(v >> 16) +// b[2] = byte(v >> 8) +// b[3] = byte(v) +// Use byte-reverse indexed 4 byte store. +(MOVBstore [i3] {s} p w + x0:(MOVBstore [i2] {s} p (SRWconst w [8]) + x1:(MOVBstore [i1] {s} p (SRWconst w [16]) + x2:(MOVBstore [i0] {s} p (SRWconst w [24]) mem)))) + && !config.BigEndian + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 + && i1 == i0+1 && i2 == i0+2 && i3 == i0+3 + && clobber(x0, x1, x2) + => (MOVWBRstore (MOVDaddr <typ.Uintptr> [i0] {s} p) w mem) + +// The 2 byte store appears after the 4 byte store so that the +// match for the 2 byte store is not done first. +// If the 4 byte store is based on the 2 byte store then there are +// variations on the MOVDaddr subrule that would require additional +// rules to be written. + +// 2 byte store Big endian as in: +// b[0] = byte(v >> 8) +// b[1] = byte(v) +(MOVBstore [i1] {s} p w x0:(MOVBstore [i0] {s} p (SRWconst w [8]) mem)) + && !config.BigEndian + && x0.Uses == 1 + && i1 == i0+1 + && clobber(x0) + => (MOVHBRstore (MOVDaddr <typ.Uintptr> [i0] {s} p) w mem) + +// 8 byte store Little endian as in: +// b[0] = byte(v) +// b[1] = byte(v >> 8) +// b[2] = byte(v >> 16) +// b[3] = byte(v >> 24) +// b[4] = byte(v >> 32) +// b[5] = byte(v >> 40) +// b[6] = byte(v >> 48) +// b[7] = byte(v >> 56) +// Built on previously defined rules +// Offset must be multiple of 4 for MOVDstore +(MOVBstore [i7] {s} p (SRDconst w [56]) + x0:(MOVBstore [i6] {s} p (SRDconst w [48]) + x1:(MOVBstore [i5] {s} p (SRDconst w [40]) + x2:(MOVBstore [i4] {s} p (SRDconst w [32]) + x3:(MOVWstore [i0] {s} p w mem))))) + && !config.BigEndian + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 + && i4 == i0+4 && i5 == i0+5 && i6 == i0+6 && i7 == i0+7 + && clobber(x0, x1, x2, x3) + => (MOVDstore [i0] {s} p w mem) + +// 8 byte store Big endian as in: +// b[0] = byte(v >> 56) +// b[1] = byte(v >> 48) +// b[2] = byte(v >> 40) +// b[3] = byte(v >> 32) +// b[4] = byte(v >> 24) +// b[5] = byte(v >> 16) +// b[6] = byte(v >> 8) +// b[7] = byte(v) +// Use byte-reverse indexed 8 byte store. +(MOVBstore [i7] {s} p w + x0:(MOVBstore [i6] {s} p (SRDconst w [8]) + x1:(MOVBstore [i5] {s} p (SRDconst w [16]) + x2:(MOVBstore [i4] {s} p (SRDconst w [24]) + x3:(MOVBstore [i3] {s} p (SRDconst w [32]) + x4:(MOVBstore [i2] {s} p (SRDconst w [40]) + x5:(MOVBstore [i1] {s} p (SRDconst w [48]) + x6:(MOVBstore [i0] {s} p (SRDconst w [56]) mem)))))))) + && !config.BigEndian + && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 + && i1 == i0+1 && i2 == i0+2 && i3 == i0+3 && i4 == i0+4 && i5 == i0+5 && i6 == i0+6 && i7 == i0+7 + && clobber(x0, x1, x2, x3, x4, x5, x6) + => (MOVDBRstore (MOVDaddr <typ.Uintptr> [i0] {s} p) w mem) + +// Arch-specific inlining for small or disjoint runtime.memmove +(SelectN [0] call:(CALLstatic {sym} s1:(MOVDstore _ (MOVDconst [sz]) s2:(MOVDstore _ src s3:(MOVDstore {t} _ dst mem))))) + && sz >= 0 + && isSameCall(sym, "runtime.memmove") + && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1 + && isInlinableMemmove(dst, src, sz, config) + && clobber(s1, s2, s3, call) + => (Move [sz] dst src mem) + +// Match post-lowering calls, register version. +(SelectN [0] call:(CALLstatic {sym} dst src (MOVDconst [sz]) mem)) + && sz >= 0 + && isSameCall(sym, "runtime.memmove") + && call.Uses == 1 + && isInlinableMemmove(dst, src, sz, config) + && clobber(call) + => (Move [sz] dst src mem) + +// Prefetch instructions (TH specified using aux field) +// For DCBT Ra,Rb,TH, A value of TH indicates: +// 0, hint this cache line will be used soon. (PrefetchCache) +// 16, hint this cache line will not be used for long. (PrefetchCacheStreamed) +// See ISA 3.0 Book II 4.3.2 for more detail. https://openpower.foundation/specifications/isa/ +(PrefetchCache ptr mem) => (DCBT ptr mem [0]) +(PrefetchCacheStreamed ptr mem) => (DCBT ptr mem [16]) + diff --git a/src/cmd/compile/internal/ssa/_gen/PPC64Ops.go b/src/cmd/compile/internal/ssa/_gen/PPC64Ops.go new file mode 100644 index 0000000..2d651dd --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/PPC64Ops.go @@ -0,0 +1,740 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import "strings" + +// Notes: +// - Less-than-64-bit integer types live in the low portion of registers. +// The upper portion is junk. +// - Boolean types are zero or 1; stored in a byte, with upper bytes of the register containing junk. +// - *const instructions may use a constant larger than the instruction can encode. +// In this case the assembler expands to multiple instructions and uses tmp +// register (R31). + +var regNamesPPC64 = []string{ + "R0", // REGZERO, not used, but simplifies counting in regalloc + "SP", // REGSP + "SB", // REGSB + "R3", + "R4", + "R5", + "R6", + "R7", + "R8", + "R9", + "R10", + "R11", // REGCTXT for closures + "R12", + "R13", // REGTLS + "R14", + "R15", + "R16", + "R17", + "R18", + "R19", + "R20", + "R21", + "R22", + "R23", + "R24", + "R25", + "R26", + "R27", + "R28", + "R29", + "g", // REGG. Using name "g" and setting Config.hasGReg makes it "just happen". + "R31", // REGTMP + + "F0", + "F1", + "F2", + "F3", + "F4", + "F5", + "F6", + "F7", + "F8", + "F9", + "F10", + "F11", + "F12", + "F13", + "F14", + "F15", + "F16", + "F17", + "F18", + "F19", + "F20", + "F21", + "F22", + "F23", + "F24", + "F25", + "F26", + "F27", + "F28", + "F29", + "F30", + // "F31", the allocator is limited to 64 entries. We sacrifice this FPR to support XER. + + "XER", + + // If you add registers, update asyncPreempt in runtime. + + // "CR0", + // "CR1", + // "CR2", + // "CR3", + // "CR4", + // "CR5", + // "CR6", + // "CR7", + + // "CR", + // "LR", + // "CTR", +} + +func init() { + // Make map from reg names to reg integers. + if len(regNamesPPC64) > 64 { + panic("too many registers") + } + num := map[string]int{} + for i, name := range regNamesPPC64 { + num[name] = i + } + buildReg := func(s string) regMask { + m := regMask(0) + for _, r := range strings.Split(s, " ") { + if n, ok := num[r]; ok { + m |= regMask(1) << uint(n) + continue + } + panic("register " + r + " not found") + } + return m + } + + var ( + gp = buildReg("R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29") + fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30") + sp = buildReg("SP") + sb = buildReg("SB") + gr = buildReg("g") + xer = buildReg("XER") + // cr = buildReg("CR") + // ctr = buildReg("CTR") + // lr = buildReg("LR") + tmp = buildReg("R31") + ctxt = buildReg("R11") + callptr = buildReg("R12") + // tls = buildReg("R13") + gp01 = regInfo{inputs: nil, outputs: []regMask{gp}} + gp11 = regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{gp}} + xergp = regInfo{inputs: []regMask{xer}, outputs: []regMask{gp}, clobbers: xer} + gp11cxer = regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{gp}, clobbers: xer} + gp11xer = regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{gp, xer}} + gp21 = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp}} + gp21a0 = regInfo{inputs: []regMask{gp, gp | sp | sb}, outputs: []regMask{gp}} + gp21cxer = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp}, clobbers: xer} + gp21xer = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp, xer}, clobbers: xer} + gp2xer1xer = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb, xer}, outputs: []regMask{gp, xer}, clobbers: xer} + gp31 = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp}} + gp1cr = regInfo{inputs: []regMask{gp | sp | sb}} + gp2cr = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}} + crgp = regInfo{inputs: nil, outputs: []regMask{gp}} + crgp11 = regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}} + crgp21 = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}} + gpload = regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{gp}} + gploadidx = regInfo{inputs: []regMask{gp | sp | sb, gp}, outputs: []regMask{gp}} + prefreg = regInfo{inputs: []regMask{gp | sp | sb}} + gpstore = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}} + gpstoreidx = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb, gp | sp | sb}} + gpstorezero = regInfo{inputs: []regMask{gp | sp | sb}} // ppc64.REGZERO is reserved zero value + gpxchg = regInfo{inputs: []regMask{gp | sp | sb, gp}, outputs: []regMask{gp}} + gpcas = regInfo{inputs: []regMask{gp | sp | sb, gp, gp}, outputs: []regMask{gp}} + fp01 = regInfo{inputs: nil, outputs: []regMask{fp}} + fp11 = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}} + fpgp = regInfo{inputs: []regMask{fp}, outputs: []regMask{gp}} + gpfp = regInfo{inputs: []regMask{gp}, outputs: []regMask{fp}} + fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}} + fp31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: []regMask{fp}} + fp2cr = regInfo{inputs: []regMask{fp, fp}} + fpload = regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{fp}} + fploadidx = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}, outputs: []regMask{fp}} + fpstore = regInfo{inputs: []regMask{gp | sp | sb, fp}} + fpstoreidx = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb, fp}} + callerSave = regMask(gp | fp | gr | xer) + r3 = buildReg("R3") + r4 = buildReg("R4") + r5 = buildReg("R5") + r6 = buildReg("R6") + ) + ops := []opData{ + {name: "ADD", argLength: 2, reg: gp21, asm: "ADD", commutative: true}, // arg0 + arg1 + {name: "ADDconst", argLength: 1, reg: gp11, asm: "ADD", aux: "Int64"}, // arg0 + auxInt + {name: "FADD", argLength: 2, reg: fp21, asm: "FADD", commutative: true}, // arg0+arg1 + {name: "FADDS", argLength: 2, reg: fp21, asm: "FADDS", commutative: true}, // arg0+arg1 + {name: "SUB", argLength: 2, reg: gp21, asm: "SUB"}, // arg0-arg1 + {name: "SUBFCconst", argLength: 1, reg: gp11cxer, asm: "SUBC", aux: "Int64"}, // auxInt - arg0 (carry is ignored) + {name: "FSUB", argLength: 2, reg: fp21, asm: "FSUB"}, // arg0-arg1 + {name: "FSUBS", argLength: 2, reg: fp21, asm: "FSUBS"}, // arg0-arg1 + + {name: "MULLD", argLength: 2, reg: gp21, asm: "MULLD", typ: "Int64", commutative: true}, // arg0*arg1 (signed 64-bit) + {name: "MULLW", argLength: 2, reg: gp21, asm: "MULLW", typ: "Int32", commutative: true}, // arg0*arg1 (signed 32-bit) + {name: "MULLDconst", argLength: 1, reg: gp11, asm: "MULLD", aux: "Int32", typ: "Int64"}, // arg0*auxInt (signed 64-bit) + {name: "MULLWconst", argLength: 1, reg: gp11, asm: "MULLW", aux: "Int32", typ: "Int64"}, // arg0*auxInt (signed 64-bit) + {name: "MADDLD", argLength: 3, reg: gp31, asm: "MADDLD", typ: "Int64"}, // (arg0*arg1)+arg2 (signed 64-bit) + + {name: "MULHD", argLength: 2, reg: gp21, asm: "MULHD", commutative: true}, // (arg0 * arg1) >> 64, signed + {name: "MULHW", argLength: 2, reg: gp21, asm: "MULHW", commutative: true}, // (arg0 * arg1) >> 32, signed + {name: "MULHDU", argLength: 2, reg: gp21, asm: "MULHDU", commutative: true}, // (arg0 * arg1) >> 64, unsigned + {name: "MULHWU", argLength: 2, reg: gp21, asm: "MULHWU", commutative: true}, // (arg0 * arg1) >> 32, unsigned + + {name: "FMUL", argLength: 2, reg: fp21, asm: "FMUL", commutative: true}, // arg0*arg1 + {name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true}, // arg0*arg1 + + {name: "FMADD", argLength: 3, reg: fp31, asm: "FMADD"}, // arg0*arg1 + arg2 + {name: "FMADDS", argLength: 3, reg: fp31, asm: "FMADDS"}, // arg0*arg1 + arg2 + {name: "FMSUB", argLength: 3, reg: fp31, asm: "FMSUB"}, // arg0*arg1 - arg2 + {name: "FMSUBS", argLength: 3, reg: fp31, asm: "FMSUBS"}, // arg0*arg1 - arg2 + + {name: "SRAD", argLength: 2, reg: gp21cxer, asm: "SRAD"}, // signed arg0 >> (arg1&127), 64 bit width (note: 127, not 63!) + {name: "SRAW", argLength: 2, reg: gp21cxer, asm: "SRAW"}, // signed arg0 >> (arg1&63), 32 bit width + {name: "SRD", argLength: 2, reg: gp21, asm: "SRD"}, // unsigned arg0 >> (arg1&127), 64 bit width + {name: "SRW", argLength: 2, reg: gp21, asm: "SRW"}, // unsigned arg0 >> (arg1&63), 32 bit width + {name: "SLD", argLength: 2, reg: gp21, asm: "SLD"}, // arg0 << (arg1&127), 64 bit width + {name: "SLW", argLength: 2, reg: gp21, asm: "SLW"}, // arg0 << (arg1&63), 32 bit width + + {name: "ROTL", argLength: 2, reg: gp21, asm: "ROTL"}, // arg0 rotate left by arg1 mod 64 + {name: "ROTLW", argLength: 2, reg: gp21, asm: "ROTLW"}, // uint32(arg0) rotate left by arg1 mod 32 + // The following are ops to implement the extended mnemonics for shifts as described in section C.8 of the ISA. + // The constant shift values are packed into the aux int32. + {name: "RLDICL", argLength: 1, reg: gp11, asm: "RLDICL", aux: "Int32"}, // arg0 extract bits identified by shift params" + {name: "CLRLSLWI", argLength: 1, reg: gp11, asm: "CLRLSLWI", aux: "Int32"}, // + {name: "CLRLSLDI", argLength: 1, reg: gp11, asm: "CLRLSLDI", aux: "Int32"}, // + + // Operations which consume or generate the CA (xer) + {name: "ADDC", argLength: 2, reg: gp21xer, asm: "ADDC", commutative: true, typ: "(UInt64, UInt64)"}, // arg0 + arg1 -> out, CA + {name: "SUBC", argLength: 2, reg: gp21xer, asm: "SUBC", typ: "(UInt64, UInt64)"}, // arg0 - arg1 -> out, CA + {name: "ADDCconst", argLength: 1, reg: gp11xer, asm: "ADDC", typ: "(UInt64, UInt64)", aux: "Int64"}, // arg0 + imm16 -> out, CA + {name: "SUBCconst", argLength: 1, reg: gp11xer, asm: "SUBC", typ: "(UInt64, UInt64)", aux: "Int64"}, // imm16 - arg0 -> out, CA + {name: "ADDE", argLength: 3, reg: gp2xer1xer, asm: "ADDE", typ: "(UInt64, UInt64)", commutative: true}, // arg0 + arg1 + CA (arg2) -> out, CA + {name: "SUBE", argLength: 3, reg: gp2xer1xer, asm: "SUBE", typ: "(UInt64, UInt64)"}, // arg0 - arg1 - CA (arg2) -> out, CA + {name: "ADDZEzero", argLength: 1, reg: xergp, asm: "ADDZE", typ: "UInt64"}, // CA (arg0) + $0 -> out + {name: "SUBZEzero", argLength: 1, reg: xergp, asm: "SUBZE", typ: "UInt64"}, // $0 - CA (arg0) -> out + + {name: "SRADconst", argLength: 1, reg: gp11cxer, asm: "SRAD", aux: "Int64"}, // signed arg0 >> auxInt, 0 <= auxInt < 64, 64 bit width + {name: "SRAWconst", argLength: 1, reg: gp11cxer, asm: "SRAW", aux: "Int64"}, // signed arg0 >> auxInt, 0 <= auxInt < 32, 32 bit width + {name: "SRDconst", argLength: 1, reg: gp11, asm: "SRD", aux: "Int64"}, // unsigned arg0 >> auxInt, 0 <= auxInt < 64, 64 bit width + {name: "SRWconst", argLength: 1, reg: gp11, asm: "SRW", aux: "Int64"}, // unsigned arg0 >> auxInt, 0 <= auxInt < 32, 32 bit width + {name: "SLDconst", argLength: 1, reg: gp11, asm: "SLD", aux: "Int64"}, // arg0 << auxInt, 0 <= auxInt < 64, 64 bit width + {name: "SLWconst", argLength: 1, reg: gp11, asm: "SLW", aux: "Int64"}, // arg0 << auxInt, 0 <= auxInt < 32, 32 bit width + + {name: "ROTLconst", argLength: 1, reg: gp11, asm: "ROTL", aux: "Int64"}, // arg0 rotate left by auxInt bits + {name: "ROTLWconst", argLength: 1, reg: gp11, asm: "ROTLW", aux: "Int64"}, // uint32(arg0) rotate left by auxInt bits + {name: "EXTSWSLconst", argLength: 1, reg: gp11, asm: "EXTSWSLI", aux: "Int64"}, + + {name: "RLWINM", argLength: 1, reg: gp11, asm: "RLWNM", aux: "Int64"}, // Rotate and mask by immediate "rlwinm". encodePPC64RotateMask describes aux + {name: "RLWNM", argLength: 2, reg: gp21, asm: "RLWNM", aux: "Int64"}, // Rotate and mask by "rlwnm". encodePPC64RotateMask describes aux + {name: "RLWMI", argLength: 2, reg: gp21a0, asm: "RLWMI", aux: "Int64", resultInArg0: true}, // "rlwimi" similar aux encoding as above + + {name: "CNTLZD", argLength: 1, reg: gp11, asm: "CNTLZD", clobberFlags: true}, // count leading zeros + {name: "CNTLZW", argLength: 1, reg: gp11, asm: "CNTLZW", clobberFlags: true}, // count leading zeros (32 bit) + + {name: "CNTTZD", argLength: 1, reg: gp11, asm: "CNTTZD"}, // count trailing zeros + {name: "CNTTZW", argLength: 1, reg: gp11, asm: "CNTTZW"}, // count trailing zeros (32 bit) + + {name: "POPCNTD", argLength: 1, reg: gp11, asm: "POPCNTD"}, // number of set bits in arg0 + {name: "POPCNTW", argLength: 1, reg: gp11, asm: "POPCNTW"}, // number of set bits in each word of arg0 placed in corresponding word + {name: "POPCNTB", argLength: 1, reg: gp11, asm: "POPCNTB"}, // number of set bits in each byte of arg0 placed in corresponding byte + + {name: "FDIV", argLength: 2, reg: fp21, asm: "FDIV"}, // arg0/arg1 + {name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS"}, // arg0/arg1 + + {name: "DIVD", argLength: 2, reg: gp21, asm: "DIVD", typ: "Int64"}, // arg0/arg1 (signed 64-bit) + {name: "DIVW", argLength: 2, reg: gp21, asm: "DIVW", typ: "Int32"}, // arg0/arg1 (signed 32-bit) + {name: "DIVDU", argLength: 2, reg: gp21, asm: "DIVDU", typ: "Int64"}, // arg0/arg1 (unsigned 64-bit) + {name: "DIVWU", argLength: 2, reg: gp21, asm: "DIVWU", typ: "Int32"}, // arg0/arg1 (unsigned 32-bit) + + {name: "MODUD", argLength: 2, reg: gp21, asm: "MODUD", typ: "UInt64"}, // arg0 % arg1 (unsigned 64-bit) + {name: "MODSD", argLength: 2, reg: gp21, asm: "MODSD", typ: "Int64"}, // arg0 % arg1 (signed 64-bit) + {name: "MODUW", argLength: 2, reg: gp21, asm: "MODUW", typ: "UInt32"}, // arg0 % arg1 (unsigned 32-bit) + {name: "MODSW", argLength: 2, reg: gp21, asm: "MODSW", typ: "Int32"}, // arg0 % arg1 (signed 32-bit) + // MOD is implemented as rem := arg0 - (arg0/arg1) * arg1 + + // Conversions are all float-to-float register operations. "Integer" refers to encoding in the FP register. + {name: "FCTIDZ", argLength: 1, reg: fp11, asm: "FCTIDZ", typ: "Float64"}, // convert float to 64-bit int round towards zero + {name: "FCTIWZ", argLength: 1, reg: fp11, asm: "FCTIWZ", typ: "Float64"}, // convert float to 32-bit int round towards zero + {name: "FCFID", argLength: 1, reg: fp11, asm: "FCFID", typ: "Float64"}, // convert 64-bit integer to float + {name: "FCFIDS", argLength: 1, reg: fp11, asm: "FCFIDS", typ: "Float32"}, // convert 32-bit integer to float + {name: "FRSP", argLength: 1, reg: fp11, asm: "FRSP", typ: "Float64"}, // round float to 32-bit value + + // Movement between float and integer registers with no change in bits; accomplished with stores+loads on PPC. + // Because the 32-bit load-literal-bits instructions have impoverished addressability, always widen the + // data instead and use FMOVDload and FMOVDstore instead (this will also dodge endianess issues). + // There are optimizations that should apply -- (Xi2f64 (MOVWload (not-ADD-ptr+offset) ) ) could use + // the word-load instructions. (Xi2f64 (MOVDload ptr )) can be (FMOVDload ptr) + + {name: "MFVSRD", argLength: 1, reg: fpgp, asm: "MFVSRD", typ: "Int64"}, // move 64 bits of F register into G register + {name: "MTVSRD", argLength: 1, reg: gpfp, asm: "MTVSRD", typ: "Float64"}, // move 64 bits of G register into F register + + {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0&arg1 + {name: "ANDN", argLength: 2, reg: gp21, asm: "ANDN"}, // arg0&^arg1 + {name: "ANDCC", argLength: 2, reg: gp21, asm: "ANDCC", commutative: true, clobberFlags: true, typ: "(Int64,Flags)"}, // arg0&arg1 sets CC + {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0|arg1 + {name: "ORN", argLength: 2, reg: gp21, asm: "ORN"}, // arg0|^arg1 + {name: "ORCC", argLength: 2, reg: gp21, asm: "ORCC", commutative: true, clobberFlags: true, typ: "(Int,Flags)"}, // arg0|arg1 sets CC + {name: "NOR", argLength: 2, reg: gp21, asm: "NOR", commutative: true}, // ^(arg0|arg1) + {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", typ: "Int64", commutative: true}, // arg0^arg1 + {name: "XORCC", argLength: 2, reg: gp21, asm: "XORCC", commutative: true, clobberFlags: true, typ: "(Int,Flags)"}, // arg0^arg1 sets CC + {name: "EQV", argLength: 2, reg: gp21, asm: "EQV", typ: "Int64", commutative: true}, // arg0^^arg1 + {name: "NEG", argLength: 1, reg: gp11, asm: "NEG"}, // -arg0 (integer) + {name: "FNEG", argLength: 1, reg: fp11, asm: "FNEG"}, // -arg0 (floating point) + {name: "FSQRT", argLength: 1, reg: fp11, asm: "FSQRT"}, // sqrt(arg0) (floating point) + {name: "FSQRTS", argLength: 1, reg: fp11, asm: "FSQRTS"}, // sqrt(arg0) (floating point, single precision) + {name: "FFLOOR", argLength: 1, reg: fp11, asm: "FRIM"}, // floor(arg0), float64 + {name: "FCEIL", argLength: 1, reg: fp11, asm: "FRIP"}, // ceil(arg0), float64 + {name: "FTRUNC", argLength: 1, reg: fp11, asm: "FRIZ"}, // trunc(arg0), float64 + {name: "FROUND", argLength: 1, reg: fp11, asm: "FRIN"}, // round(arg0), float64 + {name: "FABS", argLength: 1, reg: fp11, asm: "FABS"}, // abs(arg0), float64 + {name: "FNABS", argLength: 1, reg: fp11, asm: "FNABS"}, // -abs(arg0), float64 + {name: "FCPSGN", argLength: 2, reg: fp21, asm: "FCPSGN"}, // copysign arg0 -> arg1, float64 + + {name: "ORconst", argLength: 1, reg: gp11, asm: "OR", aux: "Int64"}, // arg0|aux + {name: "XORconst", argLength: 1, reg: gp11, asm: "XOR", aux: "Int64"}, // arg0^aux + {name: "ANDCCconst", argLength: 1, reg: regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{gp}}, asm: "ANDCC", aux: "Int64", clobberFlags: true, typ: "(Int,Flags)"}, // arg0&aux == 0 // and-immediate sets CC on PPC, always. + + {name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB", typ: "Int64"}, // sign extend int8 to int64 + {name: "MOVBZreg", argLength: 1, reg: gp11, asm: "MOVBZ", typ: "Int64"}, // zero extend uint8 to uint64 + {name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVH", typ: "Int64"}, // sign extend int16 to int64 + {name: "MOVHZreg", argLength: 1, reg: gp11, asm: "MOVHZ", typ: "Int64"}, // zero extend uint16 to uint64 + {name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW", typ: "Int64"}, // sign extend int32 to int64 + {name: "MOVWZreg", argLength: 1, reg: gp11, asm: "MOVWZ", typ: "Int64"}, // zero extend uint32 to uint64 + + // Load bytes in the endian order of the arch from arg0+aux+auxint into a 64 bit register. + {name: "MOVBZload", argLength: 2, reg: gpload, asm: "MOVBZ", aux: "SymOff", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load byte zero extend + {name: "MOVHload", argLength: 2, reg: gpload, asm: "MOVH", aux: "SymOff", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // load 2 bytes sign extend + {name: "MOVHZload", argLength: 2, reg: gpload, asm: "MOVHZ", aux: "SymOff", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load 2 bytes zero extend + {name: "MOVWload", argLength: 2, reg: gpload, asm: "MOVW", aux: "SymOff", typ: "Int32", faultOnNilArg0: true, symEffect: "Read"}, // load 4 bytes sign extend + {name: "MOVWZload", argLength: 2, reg: gpload, asm: "MOVWZ", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load 4 bytes zero extend + {name: "MOVDload", argLength: 2, reg: gpload, asm: "MOVD", aux: "SymOff", typ: "Int64", faultOnNilArg0: true, symEffect: "Read"}, // load 8 bytes + + // Load bytes in reverse endian order of the arch from arg0 into a 64 bit register, all zero extend. + // The generated instructions are indexed loads with no offset field in the instruction so the aux fields are not used. + // In these cases the index register field is set to 0 and the full address is in the base register. + {name: "MOVDBRload", argLength: 2, reg: gpload, asm: "MOVDBR", aux: "SymOff", typ: "Int64", faultOnNilArg0: true, symEffect: "Read"}, // load 8 bytes reverse order + {name: "MOVWBRload", argLength: 2, reg: gpload, asm: "MOVWBR", aux: "SymOff", typ: "Int32", faultOnNilArg0: true, symEffect: "Read"}, // load 4 bytes zero extend reverse order + {name: "MOVHBRload", argLength: 2, reg: gpload, asm: "MOVHBR", aux: "SymOff", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // load 2 bytes zero extend reverse order + + // In these cases an index register is used in addition to a base register + // Loads from memory location arg[0] + arg[1]. + {name: "MOVBZloadidx", argLength: 3, reg: gploadidx, asm: "MOVBZ", typ: "UInt8"}, // zero extend uint8 to uint64 + {name: "MOVHloadidx", argLength: 3, reg: gploadidx, asm: "MOVH", typ: "Int16"}, // sign extend int16 to int64 + {name: "MOVHZloadidx", argLength: 3, reg: gploadidx, asm: "MOVHZ", typ: "UInt16"}, // zero extend uint16 to uint64 + {name: "MOVWloadidx", argLength: 3, reg: gploadidx, asm: "MOVW", typ: "Int32"}, // sign extend int32 to int64 + {name: "MOVWZloadidx", argLength: 3, reg: gploadidx, asm: "MOVWZ", typ: "UInt32"}, // zero extend uint32 to uint64 + {name: "MOVDloadidx", argLength: 3, reg: gploadidx, asm: "MOVD", typ: "Int64"}, + {name: "MOVHBRloadidx", argLength: 3, reg: gploadidx, asm: "MOVHBR", typ: "Int16"}, // sign extend int16 to int64 + {name: "MOVWBRloadidx", argLength: 3, reg: gploadidx, asm: "MOVWBR", typ: "Int32"}, // sign extend int32 to int64 + {name: "MOVDBRloadidx", argLength: 3, reg: gploadidx, asm: "MOVDBR", typ: "Int64"}, + {name: "FMOVDloadidx", argLength: 3, reg: fploadidx, asm: "FMOVD", typ: "Float64"}, + {name: "FMOVSloadidx", argLength: 3, reg: fploadidx, asm: "FMOVS", typ: "Float32"}, + + // Prefetch instruction + // Do prefetch of address generated with arg0 and arg1 with option aux. arg0=addr,arg1=memory, aux=option. + {name: "DCBT", argLength: 2, aux: "Int64", reg: prefreg, asm: "DCBT", hasSideEffects: true}, + + // Store bytes in the reverse endian order of the arch into arg0. + // These are indexed stores with no offset field in the instruction so the auxint fields are not used. + {name: "MOVDBRstore", argLength: 3, reg: gpstore, asm: "MOVDBR", aux: "Sym", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes reverse order + {name: "MOVWBRstore", argLength: 3, reg: gpstore, asm: "MOVWBR", aux: "Sym", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes reverse order + {name: "MOVHBRstore", argLength: 3, reg: gpstore, asm: "MOVHBR", aux: "Sym", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes reverse order + + // Floating point loads from arg0+aux+auxint + {name: "FMOVDload", argLength: 2, reg: fpload, asm: "FMOVD", aux: "SymOff", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"}, // load double float + {name: "FMOVSload", argLength: 2, reg: fpload, asm: "FMOVS", aux: "SymOff", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"}, // load single float + + // Store bytes in the endian order of the arch into arg0+aux+auxint + {name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store byte + {name: "MOVHstore", argLength: 3, reg: gpstore, asm: "MOVH", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes + {name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes + {name: "MOVDstore", argLength: 3, reg: gpstore, asm: "MOVD", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes + + // Store floating point value into arg0+aux+auxint + {name: "FMOVDstore", argLength: 3, reg: fpstore, asm: "FMOVD", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store double flot + {name: "FMOVSstore", argLength: 3, reg: fpstore, asm: "FMOVS", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store single float + + // Stores using index and base registers + // Stores to arg[0] + arg[1] + {name: "MOVBstoreidx", argLength: 4, reg: gpstoreidx, asm: "MOVB", typ: "Mem"}, // store bye + {name: "MOVHstoreidx", argLength: 4, reg: gpstoreidx, asm: "MOVH", typ: "Mem"}, // store half word + {name: "MOVWstoreidx", argLength: 4, reg: gpstoreidx, asm: "MOVW", typ: "Mem"}, // store word + {name: "MOVDstoreidx", argLength: 4, reg: gpstoreidx, asm: "MOVD", typ: "Mem"}, // store double word + {name: "FMOVDstoreidx", argLength: 4, reg: fpstoreidx, asm: "FMOVD", typ: "Mem"}, // store double float + {name: "FMOVSstoreidx", argLength: 4, reg: fpstoreidx, asm: "FMOVS", typ: "Mem"}, // store single float + {name: "MOVHBRstoreidx", argLength: 4, reg: gpstoreidx, asm: "MOVHBR", typ: "Mem"}, // store half word reversed byte using index reg + {name: "MOVWBRstoreidx", argLength: 4, reg: gpstoreidx, asm: "MOVWBR", typ: "Mem"}, // store word reversed byte using index reg + {name: "MOVDBRstoreidx", argLength: 4, reg: gpstoreidx, asm: "MOVDBR", typ: "Mem"}, // store double word reversed byte using index reg + + // The following ops store 0 into arg0+aux+auxint arg1=mem + {name: "MOVBstorezero", argLength: 2, reg: gpstorezero, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store zero 1 byte + {name: "MOVHstorezero", argLength: 2, reg: gpstorezero, asm: "MOVH", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store zero 2 bytes + {name: "MOVWstorezero", argLength: 2, reg: gpstorezero, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store zero 4 bytes + {name: "MOVDstorezero", argLength: 2, reg: gpstorezero, asm: "MOVD", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store zero 8 bytes + + {name: "MOVDaddr", argLength: 1, reg: regInfo{inputs: []regMask{sp | sb | gp}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVD", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB/GP + + {name: "MOVDconst", argLength: 0, reg: gp01, aux: "Int64", asm: "MOVD", typ: "Int64", rematerializeable: true}, // + {name: "FMOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "FMOVD", rematerializeable: true}, // + {name: "FMOVSconst", argLength: 0, reg: fp01, aux: "Float32", asm: "FMOVS", rematerializeable: true}, // + {name: "FCMPU", argLength: 2, reg: fp2cr, asm: "FCMPU", typ: "Flags"}, + + {name: "CMP", argLength: 2, reg: gp2cr, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1 + {name: "CMPU", argLength: 2, reg: gp2cr, asm: "CMPU", typ: "Flags"}, // arg0 compare to arg1 + {name: "CMPW", argLength: 2, reg: gp2cr, asm: "CMPW", typ: "Flags"}, // arg0 compare to arg1 + {name: "CMPWU", argLength: 2, reg: gp2cr, asm: "CMPWU", typ: "Flags"}, // arg0 compare to arg1 + {name: "CMPconst", argLength: 1, reg: gp1cr, asm: "CMP", aux: "Int64", typ: "Flags"}, + {name: "CMPUconst", argLength: 1, reg: gp1cr, asm: "CMPU", aux: "Int64", typ: "Flags"}, + {name: "CMPWconst", argLength: 1, reg: gp1cr, asm: "CMPW", aux: "Int32", typ: "Flags"}, + {name: "CMPWUconst", argLength: 1, reg: gp1cr, asm: "CMPWU", aux: "Int32", typ: "Flags"}, + + // ISEL arg2 ? arg0 : arg1 + // ISELB arg1 ? arg0 : $0. arg0 is some register holding $1. + // ISELZ arg1 ? arg0 : $0 + // auxInt values 0=LT 1=GT 2=EQ 3=SO (summary overflow/unordered) 4=GE 5=LE 6=NE 7=NSO (not summary overflow/not unordered) + // Note, auxInt^4 inverts the comparison condition. For example, LT^4 becomes GE, and "ISEL [a] x y z" is equivalent to ISEL [a^4] y x z". + {name: "ISEL", argLength: 3, reg: crgp21, asm: "ISEL", aux: "Int32", typ: "Int32"}, + {name: "ISELB", argLength: 2, reg: crgp11, asm: "ISEL", aux: "Int32", typ: "Int32"}, + {name: "ISELZ", argLength: 2, reg: crgp11, asm: "ISEL", aux: "Int32"}, + + // pseudo-ops + {name: "Equal", argLength: 1, reg: crgp}, // bool, true flags encode x==y false otherwise. + {name: "NotEqual", argLength: 1, reg: crgp}, // bool, true flags encode x!=y false otherwise. + {name: "LessThan", argLength: 1, reg: crgp}, // bool, true flags encode x<y false otherwise. + {name: "FLessThan", argLength: 1, reg: crgp}, // bool, true flags encode x<y false otherwise. + {name: "LessEqual", argLength: 1, reg: crgp}, // bool, true flags encode x<=y false otherwise. + {name: "FLessEqual", argLength: 1, reg: crgp}, // bool, true flags encode x<=y false otherwise; PPC <= === !> which is wrong for NaN + {name: "GreaterThan", argLength: 1, reg: crgp}, // bool, true flags encode x>y false otherwise. + {name: "FGreaterThan", argLength: 1, reg: crgp}, // bool, true flags encode x>y false otherwise. + {name: "GreaterEqual", argLength: 1, reg: crgp}, // bool, true flags encode x>=y false otherwise. + {name: "FGreaterEqual", argLength: 1, reg: crgp}, // bool, true flags encode x>=y false otherwise.; PPC >= === !< which is wrong for NaN + + // Scheduler ensures LoweredGetClosurePtr occurs only in entry block, + // and sorts it to the very beginning of the block to prevent other + // use of the closure pointer. + {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{ctxt}}, zeroWidth: true}, + + // LoweredGetCallerSP returns the SP of the caller of the current function. + {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true}, + + // LoweredGetCallerPC evaluates to the PC to which its "caller" will return. + // I.e., if f calls g "calls" getcallerpc, + // the result should be the PC within f that g will return to. + // See runtime/stubs.go for a more detailed discussion. + {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true}, + + //arg0=ptr,arg1=mem, returns void. Faults if ptr is nil. + {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gp | sp | sb}, clobbers: tmp}, clobberFlags: true, nilCheck: true, faultOnNilArg0: true}, + // Round ops to block fused-multiply-add extraction. + {name: "LoweredRound32F", argLength: 1, reg: fp11, resultInArg0: true, zeroWidth: true}, + {name: "LoweredRound64F", argLength: 1, reg: fp11, resultInArg0: true, zeroWidth: true}, + + {name: "CALLstatic", argLength: -1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem + {name: "CALLtail", argLength: -1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem + {name: "CALLclosure", argLength: -1, reg: regInfo{inputs: []regMask{callptr, ctxt, 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem + {name: "CALLinter", argLength: -1, reg: regInfo{inputs: []regMask{callptr}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem + + // large or unaligned zeroing + // arg0 = address of memory to zero (in R3, changed as side effect) + // returns mem + // + // a loop is generated when there is more than one iteration + // needed to clear 4 doublewords + // + // XXLXOR VS32,VS32,VS32 + // MOVD $len/32,R31 + // MOVD R31,CTR + // MOVD $16,R31 + // loop: + // STXVD2X VS32,(R0)(R3) + // STXVD2X VS32,(R31),R3) + // ADD R3,32 + // BC loop + + // remaining doubleword clears generated as needed + // MOVD R0,(R3) + // MOVD R0,8(R3) + // MOVD R0,16(R3) + // MOVD R0,24(R3) + + // one or more of these to clear remainder < 8 bytes + // MOVW R0,n1(R3) + // MOVH R0,n2(R3) + // MOVB R0,n3(R3) + { + name: "LoweredZero", + aux: "Int64", + argLength: 2, + reg: regInfo{ + inputs: []regMask{buildReg("R20")}, + clobbers: buildReg("R20"), + }, + clobberFlags: true, + typ: "Mem", + faultOnNilArg0: true, + unsafePoint: true, + }, + { + name: "LoweredZeroShort", + aux: "Int64", + argLength: 2, + reg: regInfo{ + inputs: []regMask{gp}}, + typ: "Mem", + faultOnNilArg0: true, + unsafePoint: true, + }, + { + name: "LoweredQuadZeroShort", + aux: "Int64", + argLength: 2, + reg: regInfo{ + inputs: []regMask{gp}, + }, + typ: "Mem", + faultOnNilArg0: true, + unsafePoint: true, + }, + { + name: "LoweredQuadZero", + aux: "Int64", + argLength: 2, + reg: regInfo{ + inputs: []regMask{buildReg("R20")}, + clobbers: buildReg("R20"), + }, + clobberFlags: true, + typ: "Mem", + faultOnNilArg0: true, + unsafePoint: true, + }, + + // R31 is temp register + // Loop code: + // MOVD len/32,R31 set up loop ctr + // MOVD R31,CTR + // MOVD $16,R31 index register + // loop: + // LXVD2X (R0)(R4),VS32 + // LXVD2X (R31)(R4),VS33 + // ADD R4,$32 increment src + // STXVD2X VS32,(R0)(R3) + // STXVD2X VS33,(R31)(R3) + // ADD R3,$32 increment dst + // BC 16,0,loop branch ctr + // For this purpose, VS32 and VS33 are treated as + // scratch registers. Since regalloc does not + // track vector registers, even if it could be marked + // as clobbered it would have no effect. + // TODO: If vector registers are managed by regalloc + // mark these as clobbered. + // + // Bytes not moved by this loop are moved + // with a combination of the following instructions, + // starting with the largest sizes and generating as + // many as needed, using the appropriate offset value. + // MOVD n(R4),R14 + // MOVD R14,n(R3) + // MOVW n1(R4),R14 + // MOVW R14,n1(R3) + // MOVH n2(R4),R14 + // MOVH R14,n2(R3) + // MOVB n3(R4),R14 + // MOVB R14,n3(R3) + + { + name: "LoweredMove", + aux: "Int64", + argLength: 3, + reg: regInfo{ + inputs: []regMask{buildReg("R20"), buildReg("R21")}, + clobbers: buildReg("R20 R21"), + }, + clobberFlags: true, + typ: "Mem", + faultOnNilArg0: true, + faultOnNilArg1: true, + unsafePoint: true, + }, + { + name: "LoweredMoveShort", + aux: "Int64", + argLength: 3, + reg: regInfo{ + inputs: []regMask{gp, gp}, + }, + typ: "Mem", + faultOnNilArg0: true, + faultOnNilArg1: true, + unsafePoint: true, + }, + + // The following is similar to the LoweredMove, but uses + // LXV instead of LXVD2X, which does not require an index + // register and will do 4 in a loop instead of only. + { + name: "LoweredQuadMove", + aux: "Int64", + argLength: 3, + reg: regInfo{ + inputs: []regMask{buildReg("R20"), buildReg("R21")}, + clobbers: buildReg("R20 R21"), + }, + clobberFlags: true, + typ: "Mem", + faultOnNilArg0: true, + faultOnNilArg1: true, + unsafePoint: true, + }, + + { + name: "LoweredQuadMoveShort", + aux: "Int64", + argLength: 3, + reg: regInfo{ + inputs: []regMask{gp, gp}, + }, + typ: "Mem", + faultOnNilArg0: true, + faultOnNilArg1: true, + unsafePoint: true, + }, + + {name: "LoweredAtomicStore8", argLength: 3, reg: gpstore, typ: "Mem", aux: "Int64", faultOnNilArg0: true, hasSideEffects: true}, + {name: "LoweredAtomicStore32", argLength: 3, reg: gpstore, typ: "Mem", aux: "Int64", faultOnNilArg0: true, hasSideEffects: true}, + {name: "LoweredAtomicStore64", argLength: 3, reg: gpstore, typ: "Mem", aux: "Int64", faultOnNilArg0: true, hasSideEffects: true}, + + {name: "LoweredAtomicLoad8", argLength: 2, reg: gpload, typ: "UInt8", aux: "Int64", clobberFlags: true, faultOnNilArg0: true}, + {name: "LoweredAtomicLoad32", argLength: 2, reg: gpload, typ: "UInt32", aux: "Int64", clobberFlags: true, faultOnNilArg0: true}, + {name: "LoweredAtomicLoad64", argLength: 2, reg: gpload, typ: "Int64", aux: "Int64", clobberFlags: true, faultOnNilArg0: true}, + {name: "LoweredAtomicLoadPtr", argLength: 2, reg: gpload, typ: "Int64", aux: "Int64", clobberFlags: true, faultOnNilArg0: true}, + + // atomic add32, 64 + // LWSYNC + // LDAR (Rarg0), Rout + // ADD Rarg1, Rout + // STDCCC Rout, (Rarg0) + // BNE -3(PC) + // return new sum + {name: "LoweredAtomicAdd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true}, + {name: "LoweredAtomicAdd64", argLength: 3, reg: gpxchg, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true}, + + // atomic exchange32, 64 + // LWSYNC + // LDAR (Rarg0), Rout + // STDCCC Rarg1, (Rarg0) + // BNE -2(PC) + // ISYNC + // return old val + {name: "LoweredAtomicExchange32", argLength: 3, reg: gpxchg, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true}, + {name: "LoweredAtomicExchange64", argLength: 3, reg: gpxchg, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true}, + + // atomic compare and swap. + // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory. auxint must be zero. + // if *arg0 == arg1 { + // *arg0 = arg2 + // return (true, memory) + // } else { + // return (false, memory) + // } + // SYNC + // LDAR (Rarg0), Rtmp + // CMP Rarg1, Rtmp + // BNE 3(PC) + // STDCCC Rarg2, (Rarg0) + // BNE -4(PC) + // CBNZ Rtmp, -4(PC) + // CSET EQ, Rout + {name: "LoweredAtomicCas64", argLength: 4, reg: gpcas, resultNotInArgs: true, aux: "Int64", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true}, + {name: "LoweredAtomicCas32", argLength: 4, reg: gpcas, resultNotInArgs: true, aux: "Int64", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true}, + + // atomic 8/32 and/or. + // *arg0 &= (|=) arg1. arg2=mem. returns memory. auxint must be zero. + // LBAR/LWAT (Rarg0), Rtmp + // AND/OR Rarg1, Rtmp + // STBCCC/STWCCC Rtmp, (Rarg0), Rtmp + // BNE Rtmp, -3(PC) + {name: "LoweredAtomicAnd8", argLength: 3, reg: gpstore, asm: "AND", faultOnNilArg0: true, hasSideEffects: true}, + {name: "LoweredAtomicAnd32", argLength: 3, reg: gpstore, asm: "AND", faultOnNilArg0: true, hasSideEffects: true}, + {name: "LoweredAtomicOr8", argLength: 3, reg: gpstore, asm: "OR", faultOnNilArg0: true, hasSideEffects: true}, + {name: "LoweredAtomicOr32", argLength: 3, reg: gpstore, asm: "OR", faultOnNilArg0: true, hasSideEffects: true}, + + // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier + // It preserves R0 through R17 (except special registers R1, R2, R11, R12, R13), g, and its arguments R20 and R21, + // but may clobber anything else, including R31 (REGTMP). + {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("R20"), buildReg("R21")}, clobbers: (callerSave &^ buildReg("R0 R3 R4 R5 R6 R7 R8 R9 R10 R14 R15 R16 R17 R20 R21 g")) | buildReg("R31")}, clobberFlags: true, aux: "Sym", symEffect: "None"}, + + {name: "LoweredPubBarrier", argLength: 1, asm: "LWSYNC", hasSideEffects: true}, // Do data barrier. arg0=memory + // There are three of these functions so that they can have three different register inputs. + // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the + // default registers to match so we don't need to copy registers around unnecessarily. + {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r5, r6}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go). + {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r4, r5}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go). + {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r3, r4}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go). + + // (InvertFlags (CMP a b)) == (CMP b a) + // So if we want (LessThan (CMP a b)) but we can't do that because a is a constant, + // then we do (LessThan (InvertFlags (CMP b a))) instead. + // Rewrites will convert this to (GreaterThan (CMP b a)). + // InvertFlags is a pseudo-op which can't appear in assembly output. + {name: "InvertFlags", argLength: 1}, // reverse direction of arg0 + + // Constant flag values. For any comparison, there are 3 possible + // outcomes: either the three from the signed total order (<,==,>) + // or the three from the unsigned total order, depending on which + // comparison operation was used (CMP or CMPU -- PPC is different from + // the other architectures, which have a single comparison producing + // both signed and unsigned comparison results.) + + // These ops are for temporary use by rewrite rules. They + // cannot appear in the generated assembly. + {name: "FlagEQ"}, // equal + {name: "FlagLT"}, // signed < or unsigned < + {name: "FlagGT"}, // signed > or unsigned > + } + + blocks := []blockData{ + {name: "EQ", controls: 1}, + {name: "NE", controls: 1}, + {name: "LT", controls: 1}, + {name: "LE", controls: 1}, + {name: "GT", controls: 1}, + {name: "GE", controls: 1}, + {name: "FLT", controls: 1}, + {name: "FLE", controls: 1}, + {name: "FGT", controls: 1}, + {name: "FGE", controls: 1}, + } + + archs = append(archs, arch{ + name: "PPC64", + pkg: "cmd/internal/obj/ppc64", + genfile: "../../ppc64/ssa.go", + ops: ops, + blocks: blocks, + regnames: regNamesPPC64, + ParamIntRegNames: "R3 R4 R5 R6 R7 R8 R9 R10 R14 R15 R16 R17", + ParamFloatRegNames: "F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12", + gpregmask: gp, + fpregmask: fp, + specialregmask: xer, + framepointerreg: -1, + linkreg: -1, // not used + }) +} diff --git a/src/cmd/compile/internal/ssa/_gen/PPC64latelower.rules b/src/cmd/compile/internal/ssa/_gen/PPC64latelower.rules new file mode 100644 index 0000000..ada97b2 --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/PPC64latelower.rules @@ -0,0 +1,10 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file contains rules used by the laterLower pass. + +// Simplify ISEL x $0 z into ISELZ +(ISEL [a] x (MOVDconst [0]) z) => (ISELZ [a] x z) +// Simplify ISEL $0 y z into ISELZ by inverting comparison and reversing arguments. +(ISEL [a] (MOVDconst [0]) y z) => (ISELZ [a^0x4] y z) diff --git a/src/cmd/compile/internal/ssa/_gen/README b/src/cmd/compile/internal/ssa/_gen/README new file mode 100644 index 0000000..0c7ceba --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/README @@ -0,0 +1,7 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +This package generates opcode tables, rewrite rules, etc. for the ssa compiler. +Run it with go-1.13 (or above): + go run . diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules new file mode 100644 index 0000000..59f71be --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules @@ -0,0 +1,845 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Lowering arithmetic +(Add64 ...) => (ADD ...) +(AddPtr ...) => (ADD ...) +(Add32 ...) => (ADD ...) +(Add16 ...) => (ADD ...) +(Add8 ...) => (ADD ...) +(Add32F ...) => (FADDS ...) +(Add64F ...) => (FADDD ...) + +(Sub64 ...) => (SUB ...) +(SubPtr ...) => (SUB ...) +(Sub32 ...) => (SUB ...) +(Sub16 ...) => (SUB ...) +(Sub8 ...) => (SUB ...) +(Sub32F ...) => (FSUBS ...) +(Sub64F ...) => (FSUBD ...) + +(Mul64 ...) => (MUL ...) +(Mul64uhilo ...) => (LoweredMuluhilo ...) +(Mul64uover ...) => (LoweredMuluover ...) +(Mul32 ...) => (MULW ...) +(Mul16 x y) => (MULW (SignExt16to32 x) (SignExt16to32 y)) +(Mul8 x y) => (MULW (SignExt8to32 x) (SignExt8to32 y)) +(Mul32F ...) => (FMULS ...) +(Mul64F ...) => (FMULD ...) + +(Div32F ...) => (FDIVS ...) +(Div64F ...) => (FDIVD ...) + +(Div64 x y [false]) => (DIV x y) +(Div64u ...) => (DIVU ...) +(Div32 x y [false]) => (DIVW x y) +(Div32u ...) => (DIVUW ...) +(Div16 x y [false]) => (DIVW (SignExt16to32 x) (SignExt16to32 y)) +(Div16u x y) => (DIVUW (ZeroExt16to32 x) (ZeroExt16to32 y)) +(Div8 x y) => (DIVW (SignExt8to32 x) (SignExt8to32 y)) +(Div8u x y) => (DIVUW (ZeroExt8to32 x) (ZeroExt8to32 y)) + +(Hmul64 ...) => (MULH ...) +(Hmul64u ...) => (MULHU ...) +(Hmul32 x y) => (SRAI [32] (MUL (SignExt32to64 x) (SignExt32to64 y))) +(Hmul32u x y) => (SRLI [32] (MUL (ZeroExt32to64 x) (ZeroExt32to64 y))) + +(Select0 (Add64carry x y c)) => (ADD (ADD <typ.UInt64> x y) c) +(Select1 (Add64carry x y c)) => + (OR (SLTU <typ.UInt64> s:(ADD <typ.UInt64> x y) x) (SLTU <typ.UInt64> (ADD <typ.UInt64> s c) s)) + +(Select0 (Sub64borrow x y c)) => (SUB (SUB <typ.UInt64> x y) c) +(Select1 (Sub64borrow x y c)) => + (OR (SLTU <typ.UInt64> x s:(SUB <typ.UInt64> x y)) (SLTU <typ.UInt64> s (SUB <typ.UInt64> s c))) + +// (x + y) / 2 => (x / 2) + (y / 2) + (x & y & 1) +(Avg64u <t> x y) => (ADD (ADD <t> (SRLI <t> [1] x) (SRLI <t> [1] y)) (ANDI <t> [1] (AND <t> x y))) + +(Mod64 x y [false]) => (REM x y) +(Mod64u ...) => (REMU ...) +(Mod32 x y [false]) => (REMW x y) +(Mod32u ...) => (REMUW ...) +(Mod16 x y [false]) => (REMW (SignExt16to32 x) (SignExt16to32 y)) +(Mod16u x y) => (REMUW (ZeroExt16to32 x) (ZeroExt16to32 y)) +(Mod8 x y) => (REMW (SignExt8to32 x) (SignExt8to32 y)) +(Mod8u x y) => (REMUW (ZeroExt8to32 x) (ZeroExt8to32 y)) + +(And64 ...) => (AND ...) +(And32 ...) => (AND ...) +(And16 ...) => (AND ...) +(And8 ...) => (AND ...) + +(Or64 ...) => (OR ...) +(Or32 ...) => (OR ...) +(Or16 ...) => (OR ...) +(Or8 ...) => (OR ...) + +(Xor64 ...) => (XOR ...) +(Xor32 ...) => (XOR ...) +(Xor16 ...) => (XOR ...) +(Xor8 ...) => (XOR ...) + +(Neg64 ...) => (NEG ...) +(Neg32 ...) => (NEG ...) +(Neg16 ...) => (NEG ...) +(Neg8 ...) => (NEG ...) +(Neg32F ...) => (FNEGS ...) +(Neg64F ...) => (FNEGD ...) + +(Com64 ...) => (NOT ...) +(Com32 ...) => (NOT ...) +(Com16 ...) => (NOT ...) +(Com8 ...) => (NOT ...) + +(Sqrt ...) => (FSQRTD ...) +(Sqrt32 ...) => (FSQRTS ...) + +(Copysign ...) => (FSGNJD ...) + +(Abs ...) => (FABSD ...) + +(FMA ...) => (FMADDD ...) + +// Sign and zero extension. + +(SignExt8to16 ...) => (MOVBreg ...) +(SignExt8to32 ...) => (MOVBreg ...) +(SignExt8to64 ...) => (MOVBreg ...) +(SignExt16to32 ...) => (MOVHreg ...) +(SignExt16to64 ...) => (MOVHreg ...) +(SignExt32to64 ...) => (MOVWreg ...) + +(ZeroExt8to16 ...) => (MOVBUreg ...) +(ZeroExt8to32 ...) => (MOVBUreg ...) +(ZeroExt8to64 ...) => (MOVBUreg ...) +(ZeroExt16to32 ...) => (MOVHUreg ...) +(ZeroExt16to64 ...) => (MOVHUreg ...) +(ZeroExt32to64 ...) => (MOVWUreg ...) + +(Cvt32to32F ...) => (FCVTSW ...) +(Cvt32to64F ...) => (FCVTDW ...) +(Cvt64to32F ...) => (FCVTSL ...) +(Cvt64to64F ...) => (FCVTDL ...) + +(Cvt32Fto32 ...) => (FCVTWS ...) +(Cvt32Fto64 ...) => (FCVTLS ...) +(Cvt64Fto32 ...) => (FCVTWD ...) +(Cvt64Fto64 ...) => (FCVTLD ...) + +(Cvt32Fto64F ...) => (FCVTDS ...) +(Cvt64Fto32F ...) => (FCVTSD ...) + +(CvtBoolToUint8 ...) => (Copy ...) + +(Round32F ...) => (Copy ...) +(Round64F ...) => (Copy ...) + +(Slicemask <t> x) => (SRAI [63] (NEG <t> x)) + +// Truncations +// We ignore the unused high parts of registers, so truncates are just copies. +(Trunc16to8 ...) => (Copy ...) +(Trunc32to8 ...) => (Copy ...) +(Trunc32to16 ...) => (Copy ...) +(Trunc64to8 ...) => (Copy ...) +(Trunc64to16 ...) => (Copy ...) +(Trunc64to32 ...) => (Copy ...) + +// Shifts + +// SLL only considers the bottom 6 bits of y. If y > 64, the result should +// always be 0. +// +// Breaking down the operation: +// +// (SLL x y) generates x << (y & 63). +// +// If y < 64, this is the value we want. Otherwise, we want zero. +// +// So, we AND with -1 * uint64(y < 64), which is 0xfffff... if y < 64 and 0 otherwise. +(Lsh8x8 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt8to64 y)))) +(Lsh8x16 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt16to64 y)))) +(Lsh8x32 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt32to64 y)))) +(Lsh8x64 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg8 <t> (SLTIU <t> [64] y))) +(Lsh16x8 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt8to64 y)))) +(Lsh16x16 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt16to64 y)))) +(Lsh16x32 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt32to64 y)))) +(Lsh16x64 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] y))) +(Lsh32x8 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt8to64 y)))) +(Lsh32x16 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt16to64 y)))) +(Lsh32x32 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt32to64 y)))) +(Lsh32x64 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] y))) +(Lsh64x8 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt8to64 y)))) +(Lsh64x16 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt16to64 y)))) +(Lsh64x32 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt32to64 y)))) +(Lsh64x64 <t> x y) && !shiftIsBounded(v) => (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] y))) + +(Lsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SLL x y) +(Lsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SLL x y) +(Lsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SLL x y) +(Lsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SLL x y) + +// SRL only considers the bottom 6 bits of y. If y > 64, the result should +// always be 0. See Lsh above for a detailed description. +(Rsh8Ux8 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt8to64 x) y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt8to64 y)))) +(Rsh8Ux16 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt8to64 x) y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt16to64 y)))) +(Rsh8Ux32 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt8to64 x) y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt32to64 y)))) +(Rsh8Ux64 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt8to64 x) y) (Neg8 <t> (SLTIU <t> [64] y))) +(Rsh16Ux8 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt8to64 y)))) +(Rsh16Ux16 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt16to64 y)))) +(Rsh16Ux32 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt32to64 y)))) +(Rsh16Ux64 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] y))) +(Rsh32Ux8 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt8to64 y)))) +(Rsh32Ux16 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt16to64 y)))) +(Rsh32Ux32 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt32to64 y)))) +(Rsh32Ux64 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] y))) +(Rsh64Ux8 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt8to64 y)))) +(Rsh64Ux16 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt16to64 y)))) +(Rsh64Ux32 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt32to64 y)))) +(Rsh64Ux64 <t> x y) && !shiftIsBounded(v) => (AND (SRL <t> x y) (Neg64 <t> (SLTIU <t> [64] y))) + +(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL (ZeroExt8to64 x) y) +(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL (ZeroExt16to64 x) y) +(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL (ZeroExt32to64 x) y) +(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL x y) + +// SRA only considers the bottom 6 bits of y. If y > 64, the result should +// be either 0 or -1 based on the sign bit. +// +// We implement this by performing the max shift (-1) if y >= 64. +// +// We OR (uint64(y < 64) - 1) into y before passing it to SRA. This leaves +// us with -1 (0xffff...) if y >= 64. +// +// We don't need to sign-extend the OR result, as it will be at minimum 8 bits, +// more than the 6 bits SRA cares about. +(Rsh8x8 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt8to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64 y))))) +(Rsh8x16 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt8to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y))))) +(Rsh8x32 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt8to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y))))) +(Rsh8x64 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt8to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y)))) +(Rsh16x8 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64 y))))) +(Rsh16x16 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y))))) +(Rsh16x32 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y))))) +(Rsh16x64 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y)))) +(Rsh32x8 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64 y))))) +(Rsh32x16 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y))))) +(Rsh32x32 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y))))) +(Rsh32x64 <t> x y) && !shiftIsBounded(v) => (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y)))) +(Rsh64x8 <t> x y) && !shiftIsBounded(v) => (SRA <t> x (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64 y))))) +(Rsh64x16 <t> x y) && !shiftIsBounded(v) => (SRA <t> x (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y))))) +(Rsh64x32 <t> x y) && !shiftIsBounded(v) => (SRA <t> x (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y))))) +(Rsh64x64 <t> x y) && !shiftIsBounded(v) => (SRA <t> x (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y)))) + +(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA (SignExt8to64 x) y) +(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA (SignExt16to64 x) y) +(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA (SignExt32to64 x) y) +(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA x y) + +// Rotates. +(RotateLeft8 <t> x (MOVDconst [c])) => (Or8 (Lsh8x64 <t> x (MOVDconst [c&7])) (Rsh8Ux64 <t> x (MOVDconst [-c&7]))) +(RotateLeft16 <t> x (MOVDconst [c])) => (Or16 (Lsh16x64 <t> x (MOVDconst [c&15])) (Rsh16Ux64 <t> x (MOVDconst [-c&15]))) +(RotateLeft32 <t> x (MOVDconst [c])) => (Or32 (Lsh32x64 <t> x (MOVDconst [c&31])) (Rsh32Ux64 <t> x (MOVDconst [-c&31]))) +(RotateLeft64 <t> x (MOVDconst [c])) => (Or64 (Lsh64x64 <t> x (MOVDconst [c&63])) (Rsh64Ux64 <t> x (MOVDconst [-c&63]))) + +(Less64 ...) => (SLT ...) +(Less32 x y) => (SLT (SignExt32to64 x) (SignExt32to64 y)) +(Less16 x y) => (SLT (SignExt16to64 x) (SignExt16to64 y)) +(Less8 x y) => (SLT (SignExt8to64 x) (SignExt8to64 y)) +(Less64U ...) => (SLTU ...) +(Less32U x y) => (SLTU (ZeroExt32to64 x) (ZeroExt32to64 y)) +(Less16U x y) => (SLTU (ZeroExt16to64 x) (ZeroExt16to64 y)) +(Less8U x y) => (SLTU (ZeroExt8to64 x) (ZeroExt8to64 y)) +(Less64F ...) => (FLTD ...) +(Less32F ...) => (FLTS ...) + +// Convert x <= y to !(y > x). +(Leq64 x y) => (Not (Less64 y x)) +(Leq32 x y) => (Not (Less32 y x)) +(Leq16 x y) => (Not (Less16 y x)) +(Leq8 x y) => (Not (Less8 y x)) +(Leq64U x y) => (Not (Less64U y x)) +(Leq32U x y) => (Not (Less32U y x)) +(Leq16U x y) => (Not (Less16U y x)) +(Leq8U x y) => (Not (Less8U y x)) +(Leq64F ...) => (FLED ...) +(Leq32F ...) => (FLES ...) + +(EqPtr x y) => (SEQZ (SUB <typ.Uintptr> x y)) +(Eq64 x y) => (SEQZ (SUB <x.Type> x y)) +(Eq32 x y) => (SEQZ (SUB <x.Type> (ZeroExt32to64 x) (ZeroExt32to64 y))) +(Eq16 x y) => (SEQZ (SUB <x.Type> (ZeroExt16to64 x) (ZeroExt16to64 y))) +(Eq8 x y) => (SEQZ (SUB <x.Type> (ZeroExt8to64 x) (ZeroExt8to64 y))) +(Eq64F ...) => (FEQD ...) +(Eq32F ...) => (FEQS ...) + +(NeqPtr x y) => (SNEZ (SUB <typ.Uintptr> x y)) +(Neq64 x y) => (SNEZ (SUB <x.Type> x y)) +(Neq32 x y) => (SNEZ (SUB <x.Type> (ZeroExt32to64 x) (ZeroExt32to64 y))) +(Neq16 x y) => (SNEZ (SUB <x.Type> (ZeroExt16to64 x) (ZeroExt16to64 y))) +(Neq8 x y) => (SNEZ (SUB <x.Type> (ZeroExt8to64 x) (ZeroExt8to64 y))) +(Neq64F ...) => (FNED ...) +(Neq32F ...) => (FNES ...) + +// Loads +(Load <t> ptr mem) && t.IsBoolean() => (MOVBUload ptr mem) +(Load <t> ptr mem) && ( is8BitInt(t) && isSigned(t)) => (MOVBload ptr mem) +(Load <t> ptr mem) && ( is8BitInt(t) && !isSigned(t)) => (MOVBUload ptr mem) +(Load <t> ptr mem) && (is16BitInt(t) && isSigned(t)) => (MOVHload ptr mem) +(Load <t> ptr mem) && (is16BitInt(t) && !isSigned(t)) => (MOVHUload ptr mem) +(Load <t> ptr mem) && (is32BitInt(t) && isSigned(t)) => (MOVWload ptr mem) +(Load <t> ptr mem) && (is32BitInt(t) && !isSigned(t)) => (MOVWUload ptr mem) +(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVDload ptr mem) +(Load <t> ptr mem) && is32BitFloat(t) => (FMOVWload ptr mem) +(Load <t> ptr mem) && is64BitFloat(t) => (FMOVDload ptr mem) + +// Stores +(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 4 && !is32BitFloat(val.Type) => (MOVWstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 8 && !is64BitFloat(val.Type) => (MOVDstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (FMOVWstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (FMOVDstore ptr val mem) + +// We need to fold MOVaddr into the LD/MOVDstore ops so that the live variable analysis +// knows what variables are being read/written by the ops. +(MOVBUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (MOVBUload [off1+off2] {mergeSym(sym1,sym2)} base mem) +(MOVBload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (MOVBload [off1+off2] {mergeSym(sym1,sym2)} base mem) +(MOVHUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (MOVHUload [off1+off2] {mergeSym(sym1,sym2)} base mem) +(MOVHload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (MOVHload [off1+off2] {mergeSym(sym1,sym2)} base mem) +(MOVWUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (MOVWUload [off1+off2] {mergeSym(sym1,sym2)} base mem) +(MOVWload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (MOVWload [off1+off2] {mergeSym(sym1,sym2)} base mem) +(MOVDload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (MOVDload [off1+off2] {mergeSym(sym1,sym2)} base mem) + +(MOVBstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) +(MOVHstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (MOVHstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) +(MOVWstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) +(MOVDstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (MOVDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) +(MOVBstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVBstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVHstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVHstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVWstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVWstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +(MOVDstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => + (MOVDstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) + +(MOVBUload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => + (MOVBUload [off1+int32(off2)] {sym} base mem) +(MOVBload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => + (MOVBload [off1+int32(off2)] {sym} base mem) +(MOVHUload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => + (MOVHUload [off1+int32(off2)] {sym} base mem) +(MOVHload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => + (MOVHload [off1+int32(off2)] {sym} base mem) +(MOVWUload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => + (MOVWUload [off1+int32(off2)] {sym} base mem) +(MOVWload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => + (MOVWload [off1+int32(off2)] {sym} base mem) +(MOVDload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => + (MOVDload [off1+int32(off2)] {sym} base mem) + +(MOVBstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) => + (MOVBstore [off1+int32(off2)] {sym} base val mem) +(MOVHstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) => + (MOVHstore [off1+int32(off2)] {sym} base val mem) +(MOVWstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) => + (MOVWstore [off1+int32(off2)] {sym} base val mem) +(MOVDstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) => + (MOVDstore [off1+int32(off2)] {sym} base val mem) +(MOVBstorezero [off1] {sym} (ADDI [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVBstorezero [off1+int32(off2)] {sym} ptr mem) +(MOVHstorezero [off1] {sym} (ADDI [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVHstorezero [off1+int32(off2)] {sym} ptr mem) +(MOVWstorezero [off1] {sym} (ADDI [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVWstorezero [off1+int32(off2)] {sym} ptr mem) +(MOVDstorezero [off1] {sym} (ADDI [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVDstorezero [off1+int32(off2)] {sym} ptr mem) + +// Similarly, fold ADDI into MOVaddr to avoid confusing live variable analysis +// with OffPtr -> ADDI. +(ADDI [c] (MOVaddr [d] {s} x)) && is32Bit(c+int64(d)) => (MOVaddr [int32(c)+d] {s} x) + +// Small zeroing +(Zero [0] _ mem) => mem +(Zero [1] ptr mem) => (MOVBstore ptr (MOVDconst [0]) mem) +(Zero [2] {t} ptr mem) && t.Alignment()%2 == 0 => + (MOVHstore ptr (MOVDconst [0]) mem) +(Zero [2] ptr mem) => + (MOVBstore [1] ptr (MOVDconst [0]) + (MOVBstore ptr (MOVDconst [0]) mem)) +(Zero [4] {t} ptr mem) && t.Alignment()%4 == 0 => + (MOVWstore ptr (MOVDconst [0]) mem) +(Zero [4] {t} ptr mem) && t.Alignment()%2 == 0 => + (MOVHstore [2] ptr (MOVDconst [0]) + (MOVHstore ptr (MOVDconst [0]) mem)) +(Zero [4] ptr mem) => + (MOVBstore [3] ptr (MOVDconst [0]) + (MOVBstore [2] ptr (MOVDconst [0]) + (MOVBstore [1] ptr (MOVDconst [0]) + (MOVBstore ptr (MOVDconst [0]) mem)))) +(Zero [8] {t} ptr mem) && t.Alignment()%8 == 0 => + (MOVDstore ptr (MOVDconst [0]) mem) +(Zero [8] {t} ptr mem) && t.Alignment()%4 == 0 => + (MOVWstore [4] ptr (MOVDconst [0]) + (MOVWstore ptr (MOVDconst [0]) mem)) +(Zero [8] {t} ptr mem) && t.Alignment()%2 == 0 => + (MOVHstore [6] ptr (MOVDconst [0]) + (MOVHstore [4] ptr (MOVDconst [0]) + (MOVHstore [2] ptr (MOVDconst [0]) + (MOVHstore ptr (MOVDconst [0]) mem)))) + +(Zero [3] ptr mem) => + (MOVBstore [2] ptr (MOVDconst [0]) + (MOVBstore [1] ptr (MOVDconst [0]) + (MOVBstore ptr (MOVDconst [0]) mem))) +(Zero [6] {t} ptr mem) && t.Alignment()%2 == 0 => + (MOVHstore [4] ptr (MOVDconst [0]) + (MOVHstore [2] ptr (MOVDconst [0]) + (MOVHstore ptr (MOVDconst [0]) mem))) +(Zero [12] {t} ptr mem) && t.Alignment()%4 == 0 => + (MOVWstore [8] ptr (MOVDconst [0]) + (MOVWstore [4] ptr (MOVDconst [0]) + (MOVWstore ptr (MOVDconst [0]) mem))) +(Zero [16] {t} ptr mem) && t.Alignment()%8 == 0 => + (MOVDstore [8] ptr (MOVDconst [0]) + (MOVDstore ptr (MOVDconst [0]) mem)) +(Zero [24] {t} ptr mem) && t.Alignment()%8 == 0 => + (MOVDstore [16] ptr (MOVDconst [0]) + (MOVDstore [8] ptr (MOVDconst [0]) + (MOVDstore ptr (MOVDconst [0]) mem))) +(Zero [32] {t} ptr mem) && t.Alignment()%8 == 0 => + (MOVDstore [24] ptr (MOVDconst [0]) + (MOVDstore [16] ptr (MOVDconst [0]) + (MOVDstore [8] ptr (MOVDconst [0]) + (MOVDstore ptr (MOVDconst [0]) mem)))) + +// Medium 8-aligned zeroing uses a Duff's device +// 8 and 128 are magic constants, see runtime/mkduff.go +(Zero [s] {t} ptr mem) + && s%8 == 0 && s <= 8*128 + && t.Alignment()%8 == 0 && !config.noDuffDevice => + (DUFFZERO [8 * (128 - s/8)] ptr mem) + +// Generic zeroing uses a loop +(Zero [s] {t} ptr mem) => + (LoweredZero [t.Alignment()] + ptr + (ADD <ptr.Type> ptr (MOVDconst [s-moveSize(t.Alignment(), config)])) + mem) + +(Convert ...) => (MOVconvert ...) + +// Checks +(IsNonNil ...) => (SNEZ ...) +(IsInBounds ...) => (Less64U ...) +(IsSliceInBounds ...) => (Leq64U ...) + +// Trivial lowering +(NilCheck ...) => (LoweredNilCheck ...) +(GetClosurePtr ...) => (LoweredGetClosurePtr ...) +(GetCallerSP ...) => (LoweredGetCallerSP ...) +(GetCallerPC ...) => (LoweredGetCallerPC ...) + +// Write barrier. +(WB ...) => (LoweredWB ...) + +(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem) +(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem) +(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem) + +// Small moves +(Move [0] _ _ mem) => mem +(Move [1] dst src mem) => (MOVBstore dst (MOVBload src mem) mem) +(Move [2] {t} dst src mem) && t.Alignment()%2 == 0 => + (MOVHstore dst (MOVHload src mem) mem) +(Move [2] dst src mem) => + (MOVBstore [1] dst (MOVBload [1] src mem) + (MOVBstore dst (MOVBload src mem) mem)) +(Move [4] {t} dst src mem) && t.Alignment()%4 == 0 => + (MOVWstore dst (MOVWload src mem) mem) +(Move [4] {t} dst src mem) && t.Alignment()%2 == 0 => + (MOVHstore [2] dst (MOVHload [2] src mem) + (MOVHstore dst (MOVHload src mem) mem)) +(Move [4] dst src mem) => + (MOVBstore [3] dst (MOVBload [3] src mem) + (MOVBstore [2] dst (MOVBload [2] src mem) + (MOVBstore [1] dst (MOVBload [1] src mem) + (MOVBstore dst (MOVBload src mem) mem)))) +(Move [8] {t} dst src mem) && t.Alignment()%8 == 0 => + (MOVDstore dst (MOVDload src mem) mem) +(Move [8] {t} dst src mem) && t.Alignment()%4 == 0 => + (MOVWstore [4] dst (MOVWload [4] src mem) + (MOVWstore dst (MOVWload src mem) mem)) +(Move [8] {t} dst src mem) && t.Alignment()%2 == 0 => + (MOVHstore [6] dst (MOVHload [6] src mem) + (MOVHstore [4] dst (MOVHload [4] src mem) + (MOVHstore [2] dst (MOVHload [2] src mem) + (MOVHstore dst (MOVHload src mem) mem)))) + +(Move [3] dst src mem) => + (MOVBstore [2] dst (MOVBload [2] src mem) + (MOVBstore [1] dst (MOVBload [1] src mem) + (MOVBstore dst (MOVBload src mem) mem))) +(Move [6] {t} dst src mem) && t.Alignment()%2 == 0 => + (MOVHstore [4] dst (MOVHload [4] src mem) + (MOVHstore [2] dst (MOVHload [2] src mem) + (MOVHstore dst (MOVHload src mem) mem))) +(Move [12] {t} dst src mem) && t.Alignment()%4 == 0 => + (MOVWstore [8] dst (MOVWload [8] src mem) + (MOVWstore [4] dst (MOVWload [4] src mem) + (MOVWstore dst (MOVWload src mem) mem))) +(Move [16] {t} dst src mem) && t.Alignment()%8 == 0 => + (MOVDstore [8] dst (MOVDload [8] src mem) + (MOVDstore dst (MOVDload src mem) mem)) +(Move [24] {t} dst src mem) && t.Alignment()%8 == 0 => + (MOVDstore [16] dst (MOVDload [16] src mem) + (MOVDstore [8] dst (MOVDload [8] src mem) + (MOVDstore dst (MOVDload src mem) mem))) +(Move [32] {t} dst src mem) && t.Alignment()%8 == 0 => + (MOVDstore [24] dst (MOVDload [24] src mem) + (MOVDstore [16] dst (MOVDload [16] src mem) + (MOVDstore [8] dst (MOVDload [8] src mem) + (MOVDstore dst (MOVDload src mem) mem)))) + +// Medium 8-aligned move uses a Duff's device +// 16 and 128 are magic constants, see runtime/mkduff.go +(Move [s] {t} dst src mem) + && s%8 == 0 && s <= 8*128 && t.Alignment()%8 == 0 + && !config.noDuffDevice && logLargeCopy(v, s) => + (DUFFCOPY [16 * (128 - s/8)] dst src mem) + +// Generic move uses a loop +(Move [s] {t} dst src mem) && (s <= 16 || logLargeCopy(v, s)) => + (LoweredMove [t.Alignment()] + dst + src + (ADDI <src.Type> [s-moveSize(t.Alignment(), config)] src) + mem) + +// Boolean ops; 0=false, 1=true +(AndB ...) => (AND ...) +(OrB ...) => (OR ...) +(EqB x y) => (SEQZ (SUB <typ.Bool> x y)) +(NeqB x y) => (SNEZ (SUB <typ.Bool> x y)) +(Not ...) => (SEQZ ...) + +// Lowering pointer arithmetic +// TODO: Special handling for SP offsets, like ARM +(OffPtr [off] ptr:(SP)) && is32Bit(off) => (MOVaddr [int32(off)] ptr) +(OffPtr [off] ptr) && is32Bit(off) => (ADDI [off] ptr) +(OffPtr [off] ptr) => (ADD (MOVDconst [off]) ptr) + +(Const8 [val]) => (MOVDconst [int64(val)]) +(Const16 [val]) => (MOVDconst [int64(val)]) +(Const32 [val]) => (MOVDconst [int64(val)]) +(Const64 [val]) => (MOVDconst [int64(val)]) +(Const32F [val]) => (FMVSX (MOVDconst [int64(math.Float32bits(val))])) +(Const64F [val]) => (FMVDX (MOVDconst [int64(math.Float64bits(val))])) +(ConstNil) => (MOVDconst [0]) +(ConstBool [val]) => (MOVDconst [int64(b2i(val))]) + +(Addr {sym} base) => (MOVaddr {sym} [0] base) +(LocalAddr {sym} base _) => (MOVaddr {sym} base) + +// Calls +(StaticCall ...) => (CALLstatic ...) +(ClosureCall ...) => (CALLclosure ...) +(InterCall ...) => (CALLinter ...) +(TailCall ...) => (CALLtail ...) + +// Atomic Intrinsics +(AtomicLoad8 ...) => (LoweredAtomicLoad8 ...) +(AtomicLoad32 ...) => (LoweredAtomicLoad32 ...) +(AtomicLoad64 ...) => (LoweredAtomicLoad64 ...) +(AtomicLoadPtr ...) => (LoweredAtomicLoad64 ...) + +(AtomicStore8 ...) => (LoweredAtomicStore8 ...) +(AtomicStore32 ...) => (LoweredAtomicStore32 ...) +(AtomicStore64 ...) => (LoweredAtomicStore64 ...) +(AtomicStorePtrNoWB ...) => (LoweredAtomicStore64 ...) + +(AtomicAdd32 ...) => (LoweredAtomicAdd32 ...) +(AtomicAdd64 ...) => (LoweredAtomicAdd64 ...) + +// AtomicAnd8(ptr,val) => LoweredAtomicAnd32(ptr&^3, ^((uint8(val) ^ 0xff) << ((ptr & 3) * 8))) +(AtomicAnd8 ptr val mem) => + (LoweredAtomicAnd32 (ANDI <typ.Uintptr> [^3] ptr) + (NOT <typ.UInt32> (SLL <typ.UInt32> (XORI <typ.UInt32> [0xff] (ZeroExt8to32 val)) + (SLLI <typ.UInt64> [3] (ANDI <typ.UInt64> [3] ptr)))) mem) + +(AtomicAnd32 ...) => (LoweredAtomicAnd32 ...) + +(AtomicCompareAndSwap32 ptr old new mem) => (LoweredAtomicCas32 ptr (SignExt32to64 old) new mem) +(AtomicCompareAndSwap64 ...) => (LoweredAtomicCas64 ...) + +(AtomicExchange32 ...) => (LoweredAtomicExchange32 ...) +(AtomicExchange64 ...) => (LoweredAtomicExchange64 ...) + +// AtomicOr8(ptr,val) => LoweredAtomicOr32(ptr&^3, uint32(val)<<((ptr&3)*8)) +(AtomicOr8 ptr val mem) => + (LoweredAtomicOr32 (ANDI <typ.Uintptr> [^3] ptr) + (SLL <typ.UInt32> (ZeroExt8to32 val) + (SLLI <typ.UInt64> [3] (ANDI <typ.UInt64> [3] ptr))) mem) + +(AtomicOr32 ...) => (LoweredAtomicOr32 ...) + +// Conditional branches +(If cond yes no) => (BNEZ (MOVBUreg <typ.UInt64> cond) yes no) + +// Optimizations + +// Absorb SEQZ/SNEZ into branch. +(BEQZ (SEQZ x) yes no) => (BNEZ x yes no) +(BEQZ (SNEZ x) yes no) => (BEQZ x yes no) +(BNEZ (SEQZ x) yes no) => (BEQZ x yes no) +(BNEZ (SNEZ x) yes no) => (BNEZ x yes no) + +// Remove redundant NEG from BEQZ/BNEZ. +(BEQZ (NEG x) yes no) => (BEQZ x yes no) +(BNEZ (NEG x) yes no) => (BNEZ x yes no) + +// Negate comparison with FNES/FNED. +(BEQZ (FNES <t> x y) yes no) => (BNEZ (FEQS <t> x y) yes no) +(BNEZ (FNES <t> x y) yes no) => (BEQZ (FEQS <t> x y) yes no) +(BEQZ (FNED <t> x y) yes no) => (BNEZ (FEQD <t> x y) yes no) +(BNEZ (FNED <t> x y) yes no) => (BEQZ (FEQD <t> x y) yes no) + +// Convert BEQZ/BNEZ into more optimal branch conditions. +(BEQZ (SUB x y) yes no) => (BEQ x y yes no) +(BNEZ (SUB x y) yes no) => (BNE x y yes no) +(BEQZ (SLT x y) yes no) => (BGE x y yes no) +(BNEZ (SLT x y) yes no) => (BLT x y yes no) +(BEQZ (SLTU x y) yes no) => (BGEU x y yes no) +(BNEZ (SLTU x y) yes no) => (BLTU x y yes no) +(BEQZ (SLTI [x] y) yes no) => (BGE y (MOVDconst [x]) yes no) +(BNEZ (SLTI [x] y) yes no) => (BLT y (MOVDconst [x]) yes no) +(BEQZ (SLTIU [x] y) yes no) => (BGEU y (MOVDconst [x]) yes no) +(BNEZ (SLTIU [x] y) yes no) => (BLTU y (MOVDconst [x]) yes no) + +// Convert branch with zero to more optimal branch zero. +(BEQ (MOVDconst [0]) cond yes no) => (BEQZ cond yes no) +(BEQ cond (MOVDconst [0]) yes no) => (BEQZ cond yes no) +(BNE (MOVDconst [0]) cond yes no) => (BNEZ cond yes no) +(BNE cond (MOVDconst [0]) yes no) => (BNEZ cond yes no) +(BLT (MOVDconst [0]) cond yes no) => (BGTZ cond yes no) +(BLT cond (MOVDconst [0]) yes no) => (BLTZ cond yes no) +(BGE (MOVDconst [0]) cond yes no) => (BLEZ cond yes no) +(BGE cond (MOVDconst [0]) yes no) => (BGEZ cond yes no) + +// Remove redundant NEG from SEQZ/SNEZ. +(SEQZ (NEG x)) => (SEQZ x) +(SNEZ (NEG x)) => (SNEZ x) + +// Remove redundant SEQZ/SNEZ. +(SEQZ (SEQZ x)) => (SNEZ x) +(SEQZ (SNEZ x)) => (SEQZ x) +(SNEZ (SEQZ x)) => (SEQZ x) +(SNEZ (SNEZ x)) => (SNEZ x) + +// Store zero. +(MOVBstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVBstorezero [off] {sym} ptr mem) +(MOVHstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVHstorezero [off] {sym} ptr mem) +(MOVWstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVWstorezero [off] {sym} ptr mem) +(MOVDstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVDstorezero [off] {sym} ptr mem) + +// Boolean ops are already extended. +(MOVBUreg x:((FLES|FLTS|FEQS|FNES) _ _)) => x +(MOVBUreg x:((FLED|FLTD|FEQD|FNED) _ _)) => x +(MOVBUreg x:((SEQZ|SNEZ) _)) => x +(MOVBUreg x:((SLT|SLTU) _ _)) => x + +// Avoid extending when already sufficiently masked. +(MOVBreg x:(ANDI [c] y)) && c >= 0 && int64(int8(c)) == c => x +(MOVHreg x:(ANDI [c] y)) && c >= 0 && int64(int16(c)) == c => x +(MOVWreg x:(ANDI [c] y)) && c >= 0 && int64(int32(c)) == c => x +(MOVBUreg x:(ANDI [c] y)) && c >= 0 && int64(uint8(c)) == c => x +(MOVHUreg x:(ANDI [c] y)) && c >= 0 && int64(uint16(c)) == c => x +(MOVWUreg x:(ANDI [c] y)) && c >= 0 && int64(uint32(c)) == c => x + +// Combine masking and zero extension. +(MOVBUreg (ANDI [c] x)) && c < 0 => (ANDI [int64(uint8(c))] x) +(MOVHUreg (ANDI [c] x)) && c < 0 => (ANDI [int64(uint16(c))] x) +(MOVWUreg (ANDI [c] x)) && c < 0 => (AND (MOVDconst [int64(uint32(c))]) x) + +// Avoid sign/zero extension for consts. +(MOVBreg (MOVDconst [c])) => (MOVDconst [int64(int8(c))]) +(MOVHreg (MOVDconst [c])) => (MOVDconst [int64(int16(c))]) +(MOVWreg (MOVDconst [c])) => (MOVDconst [int64(int32(c))]) +(MOVBUreg (MOVDconst [c])) => (MOVDconst [int64(uint8(c))]) +(MOVHUreg (MOVDconst [c])) => (MOVDconst [int64(uint16(c))]) +(MOVWUreg (MOVDconst [c])) => (MOVDconst [int64(uint32(c))]) + +// Avoid sign/zero extension after properly typed load. +(MOVBreg x:(MOVBload _ _)) => (MOVDreg x) +(MOVHreg x:(MOVBload _ _)) => (MOVDreg x) +(MOVHreg x:(MOVBUload _ _)) => (MOVDreg x) +(MOVHreg x:(MOVHload _ _)) => (MOVDreg x) +(MOVWreg x:(MOVBload _ _)) => (MOVDreg x) +(MOVWreg x:(MOVBUload _ _)) => (MOVDreg x) +(MOVWreg x:(MOVHload _ _)) => (MOVDreg x) +(MOVWreg x:(MOVHUload _ _)) => (MOVDreg x) +(MOVWreg x:(MOVWload _ _)) => (MOVDreg x) +(MOVBUreg x:(MOVBUload _ _)) => (MOVDreg x) +(MOVHUreg x:(MOVBUload _ _)) => (MOVDreg x) +(MOVHUreg x:(MOVHUload _ _)) => (MOVDreg x) +(MOVWUreg x:(MOVBUload _ _)) => (MOVDreg x) +(MOVWUreg x:(MOVHUload _ _)) => (MOVDreg x) +(MOVWUreg x:(MOVWUload _ _)) => (MOVDreg x) + +// Avoid zero extension after properly typed atomic operation. +(MOVBUreg x:(Select0 (LoweredAtomicLoad8 _ _))) => (MOVDreg x) +(MOVBUreg x:(Select0 (LoweredAtomicCas32 _ _ _ _))) => (MOVDreg x) +(MOVBUreg x:(Select0 (LoweredAtomicCas64 _ _ _ _))) => (MOVDreg x) + +// Avoid sign extension after word arithmetic. +(MOVWreg x:(ADDIW _)) => (MOVDreg x) +(MOVWreg x:(SUBW _ _)) => (MOVDreg x) +(MOVWreg x:(NEGW _)) => (MOVDreg x) +(MOVWreg x:(MULW _ _)) => (MOVDreg x) +(MOVWreg x:(DIVW _ _)) => (MOVDreg x) +(MOVWreg x:(DIVUW _ _)) => (MOVDreg x) +(MOVWreg x:(REMW _ _)) => (MOVDreg x) +(MOVWreg x:(REMUW _ _)) => (MOVDreg x) + +// Fold double extensions. +(MOVBreg x:(MOVBreg _)) => (MOVDreg x) +(MOVHreg x:(MOVBreg _)) => (MOVDreg x) +(MOVHreg x:(MOVBUreg _)) => (MOVDreg x) +(MOVHreg x:(MOVHreg _)) => (MOVDreg x) +(MOVWreg x:(MOVBreg _)) => (MOVDreg x) +(MOVWreg x:(MOVBUreg _)) => (MOVDreg x) +(MOVWreg x:(MOVHreg _)) => (MOVDreg x) +(MOVWreg x:(MOVWreg _)) => (MOVDreg x) +(MOVBUreg x:(MOVBUreg _)) => (MOVDreg x) +(MOVHUreg x:(MOVBUreg _)) => (MOVDreg x) +(MOVHUreg x:(MOVHUreg _)) => (MOVDreg x) +(MOVWUreg x:(MOVBUreg _)) => (MOVDreg x) +(MOVWUreg x:(MOVHUreg _)) => (MOVDreg x) +(MOVWUreg x:(MOVWUreg _)) => (MOVDreg x) + +// Do not extend before store. +(MOVBstore [off] {sym} ptr (MOVBreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVHreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVWreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVBUreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOVHreg x) mem) => (MOVHstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOVWreg x) mem) => (MOVHstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVHstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVHstore [off] {sym} ptr x mem) +(MOVWstore [off] {sym} ptr (MOVWreg x) mem) => (MOVWstore [off] {sym} ptr x mem) +(MOVWstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVWstore [off] {sym} ptr x mem) + +// Replace extend after load with alternate load where possible. +(MOVBreg <t> x:(MOVBUload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBload <t> [off] {sym} ptr mem) +(MOVHreg <t> x:(MOVHUload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVHload <t> [off] {sym} ptr mem) +(MOVWreg <t> x:(MOVWUload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWload <t> [off] {sym} ptr mem) +(MOVBUreg <t> x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBUload <t> [off] {sym} ptr mem) +(MOVHUreg <t> x:(MOVHload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVHUload <t> [off] {sym} ptr mem) +(MOVWUreg <t> x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWUload <t> [off] {sym} ptr mem) + +// If a register move has only 1 use, just use the same register without emitting instruction +// MOVnop does not emit an instruction, only for ensuring the type. +(MOVDreg x) && x.Uses == 1 => (MOVDnop x) + +// TODO: we should be able to get rid of MOVDnop all together. +// But for now, this is enough to get rid of lots of them. +(MOVDnop (MOVDconst [c])) => (MOVDconst [c]) + +// Fold constant into immediate instructions where possible. +(ADD (MOVDconst [val]) x) && is32Bit(val) => (ADDI [val] x) +(AND (MOVDconst [val]) x) && is32Bit(val) => (ANDI [val] x) +(OR (MOVDconst [val]) x) && is32Bit(val) => (ORI [val] x) +(XOR (MOVDconst [val]) x) && is32Bit(val) => (XORI [val] x) +(SLL x (MOVDconst [val])) => (SLLI [int64(val&63)] x) +(SRL x (MOVDconst [val])) => (SRLI [int64(val&63)] x) +(SRA x (MOVDconst [val])) => (SRAI [int64(val&63)] x) +(SLT x (MOVDconst [val])) && val >= -2048 && val <= 2047 => (SLTI [val] x) +(SLTU x (MOVDconst [val])) && val >= -2048 && val <= 2047 => (SLTIU [val] x) + +// Convert const subtraction into ADDI with negative immediate, where possible. +(SUB x (MOVDconst [val])) && is32Bit(-val) => (ADDI [-val] x) +(SUB <t> (MOVDconst [val]) y) && is32Bit(-val) => (NEG (ADDI <t> [-val] y)) + +// Subtraction of zero. +(SUB x (MOVDconst [0])) => x +(SUBW x (MOVDconst [0])) => (ADDIW [0] x) + +// Subtraction from zero. +(SUB (MOVDconst [0]) x) => (NEG x) +(SUBW (MOVDconst [0]) x) => (NEGW x) + +// Fold negation into subtraction. +(NEG (SUB x y)) => (SUB y x) +(NEG <t> s:(ADDI [val] (SUB x y))) && s.Uses == 1 && is32Bit(-val) => (ADDI [-val] (SUB <t> y x)) + +// Double negation. +(NEG (NEG x)) => x + +// Addition of zero or two constants. +(ADDI [0] x) => x +(ADDI [x] (MOVDconst [y])) && is32Bit(x + y) => (MOVDconst [x + y]) + +// ANDI with all zeros, all ones or two constants. +(ANDI [0] x) => (MOVDconst [0]) +(ANDI [-1] x) => x +(ANDI [x] (MOVDconst [y])) => (MOVDconst [x & y]) + +// ORI with all zeroes, all ones or two constants. +(ORI [0] x) => x +(ORI [-1] x) => (MOVDconst [-1]) +(ORI [x] (MOVDconst [y])) => (MOVDconst [x | y]) + +// Combine operations with immediate. +(ADDI [x] (ADDI [y] z)) && is32Bit(x + y) => (ADDI [x + y] z) +(ANDI [x] (ANDI [y] z)) => (ANDI [x & y] z) +(ORI [x] (ORI [y] z)) => (ORI [x | y] z) + +// Negation of a constant. +(NEG (MOVDconst [x])) => (MOVDconst [-x]) +(NEGW (MOVDconst [x])) => (MOVDconst [int64(int32(-x))]) + +// Shift of a constant. +(SLLI [x] (MOVDconst [y])) && is32Bit(y << uint32(x)) => (MOVDconst [y << uint32(x)]) +(SRLI [x] (MOVDconst [y])) => (MOVDconst [int64(uint64(y) >> uint32(x))]) +(SRAI [x] (MOVDconst [y])) => (MOVDconst [int64(y) >> uint32(x)]) + +// SLTI/SLTIU with constants. +(SLTI [x] (MOVDconst [y])) => (MOVDconst [b2i(int64(y) < int64(x))]) +(SLTIU [x] (MOVDconst [y])) => (MOVDconst [b2i(uint64(y) < uint64(x))]) + +// SLTI/SLTIU with known outcomes. +(SLTI [x] (ANDI [y] _)) && y >= 0 && int64(y) < int64(x) => (MOVDconst [1]) +(SLTIU [x] (ANDI [y] _)) && y >= 0 && uint64(y) < uint64(x) => (MOVDconst [1]) +(SLTI [x] (ORI [y] _)) && y >= 0 && int64(y) >= int64(x) => (MOVDconst [0]) +(SLTIU [x] (ORI [y] _)) && y >= 0 && uint64(y) >= uint64(x) => (MOVDconst [0]) + +// SLT/SLTU with known outcomes. +(SLT x x) => (MOVDconst [0]) +(SLTU x x) => (MOVDconst [0]) + +// Deadcode for LoweredMuluhilo +(Select0 m:(LoweredMuluhilo x y)) && m.Uses == 1 => (MULHU x y) +(Select1 m:(LoweredMuluhilo x y)) && m.Uses == 1 => (MUL x y) + +// Merge negation into fused multiply-add and multiply-subtract. +// +// Key: +// +// [+ -](x * y) [+ -] z. +// _ N A S +// D U +// D B +// +// Note: multiplication commutativity handled by rule generator. +(F(MADD|NMADD|MSUB|NMSUB)D neg:(FNEGD x) y z) && neg.Uses == 1 => (F(NMADD|MADD|NMSUB|MSUB)D x y z) +(F(MADD|NMADD|MSUB|NMSUB)D x y neg:(FNEGD z)) && neg.Uses == 1 => (F(MSUB|NMSUB|MADD|NMADD)D x y z) diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go new file mode 100644 index 0000000..09b1620 --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go @@ -0,0 +1,482 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "fmt" +) + +// Notes: +// - Boolean types occupy the entire register. 0=false, 1=true. + +// Suffixes encode the bit width of various instructions: +// +// D (double word) = 64 bit int +// W (word) = 32 bit int +// H (half word) = 16 bit int +// B (byte) = 8 bit int +// S (single) = 32 bit float +// D (double) = 64 bit float +// L = 64 bit int, used when the opcode starts with F + +const ( + riscv64REG_G = 27 + riscv64REG_CTXT = 26 + riscv64REG_LR = 1 + riscv64REG_SP = 2 + riscv64REG_GP = 3 + riscv64REG_TP = 4 + riscv64REG_TMP = 31 + riscv64REG_ZERO = 0 +) + +func riscv64RegName(r int) string { + switch { + case r == riscv64REG_G: + return "g" + case r == riscv64REG_SP: + return "SP" + case 0 <= r && r <= 31: + return fmt.Sprintf("X%d", r) + case 32 <= r && r <= 63: + return fmt.Sprintf("F%d", r-32) + default: + panic(fmt.Sprintf("unknown register %d", r)) + } +} + +func init() { + var regNamesRISCV64 []string + var gpMask, fpMask, gpgMask, gpspMask, gpspsbMask, gpspsbgMask regMask + regNamed := make(map[string]regMask) + + // Build the list of register names, creating an appropriately indexed + // regMask for the gp and fp registers as we go. + // + // If name is specified, use it rather than the riscv reg number. + addreg := func(r int, name string) regMask { + mask := regMask(1) << uint(len(regNamesRISCV64)) + if name == "" { + name = riscv64RegName(r) + } + regNamesRISCV64 = append(regNamesRISCV64, name) + regNamed[name] = mask + return mask + } + + // General purpose registers. + for r := 0; r <= 31; r++ { + if r == riscv64REG_LR { + // LR is not used by regalloc, so we skip it to leave + // room for pseudo-register SB. + continue + } + + mask := addreg(r, "") + + // Add general purpose registers to gpMask. + switch r { + // ZERO, GP, TP and TMP are not in any gp mask. + case riscv64REG_ZERO, riscv64REG_GP, riscv64REG_TP, riscv64REG_TMP: + case riscv64REG_G: + gpgMask |= mask + gpspsbgMask |= mask + case riscv64REG_SP: + gpspMask |= mask + gpspsbMask |= mask + gpspsbgMask |= mask + default: + gpMask |= mask + gpgMask |= mask + gpspMask |= mask + gpspsbMask |= mask + gpspsbgMask |= mask + } + } + + // Floating pointer registers. + for r := 32; r <= 63; r++ { + mask := addreg(r, "") + fpMask |= mask + } + + // Pseudo-register: SB + mask := addreg(-1, "SB") + gpspsbMask |= mask + gpspsbgMask |= mask + + if len(regNamesRISCV64) > 64 { + // regMask is only 64 bits. + panic("Too many RISCV64 registers") + } + + regCtxt := regNamed["X26"] + callerSave := gpMask | fpMask | regNamed["g"] + + var ( + gpstore = regInfo{inputs: []regMask{gpspsbMask, gpspMask, 0}} // SB in first input so we can load from a global, but not in second to avoid using SB as a temporary register + gpstore0 = regInfo{inputs: []regMask{gpspsbMask}} + gp01 = regInfo{outputs: []regMask{gpMask}} + gp11 = regInfo{inputs: []regMask{gpMask}, outputs: []regMask{gpMask}} + gp21 = regInfo{inputs: []regMask{gpMask, gpMask}, outputs: []regMask{gpMask}} + gp22 = regInfo{inputs: []regMask{gpMask, gpMask}, outputs: []regMask{gpMask, gpMask}} + gpload = regInfo{inputs: []regMask{gpspsbMask, 0}, outputs: []regMask{gpMask}} + gp11sb = regInfo{inputs: []regMask{gpspsbMask}, outputs: []regMask{gpMask}} + gpxchg = regInfo{inputs: []regMask{gpspsbgMask, gpgMask}, outputs: []regMask{gpMask}} + gpcas = regInfo{inputs: []regMask{gpspsbgMask, gpgMask, gpgMask}, outputs: []regMask{gpMask}} + gpatomic = regInfo{inputs: []regMask{gpspsbgMask, gpgMask}} + + fp11 = regInfo{inputs: []regMask{fpMask}, outputs: []regMask{fpMask}} + fp21 = regInfo{inputs: []regMask{fpMask, fpMask}, outputs: []regMask{fpMask}} + fp31 = regInfo{inputs: []regMask{fpMask, fpMask, fpMask}, outputs: []regMask{fpMask}} + gpfp = regInfo{inputs: []regMask{gpMask}, outputs: []regMask{fpMask}} + fpgp = regInfo{inputs: []regMask{fpMask}, outputs: []regMask{gpMask}} + fpstore = regInfo{inputs: []regMask{gpspsbMask, fpMask, 0}} + fpload = regInfo{inputs: []regMask{gpspsbMask, 0}, outputs: []regMask{fpMask}} + fp2gp = regInfo{inputs: []regMask{fpMask, fpMask}, outputs: []regMask{gpMask}} + + call = regInfo{clobbers: callerSave} + callClosure = regInfo{inputs: []regMask{gpspMask, regCtxt, 0}, clobbers: callerSave} + callInter = regInfo{inputs: []regMask{gpMask}, clobbers: callerSave} + ) + + RISCV64ops := []opData{ + {name: "ADD", argLength: 2, reg: gp21, asm: "ADD", commutative: true}, // arg0 + arg1 + {name: "ADDI", argLength: 1, reg: gp11sb, asm: "ADDI", aux: "Int64"}, // arg0 + auxint + {name: "ADDIW", argLength: 1, reg: gp11, asm: "ADDIW", aux: "Int64"}, // 32 low bits of arg0 + auxint, sign extended to 64 bits + {name: "NEG", argLength: 1, reg: gp11, asm: "NEG"}, // -arg0 + {name: "NEGW", argLength: 1, reg: gp11, asm: "NEGW"}, // -arg0 of 32 bits, sign extended to 64 bits + {name: "SUB", argLength: 2, reg: gp21, asm: "SUB"}, // arg0 - arg1 + {name: "SUBW", argLength: 2, reg: gp21, asm: "SUBW"}, // 32 low bits of arg 0 - 32 low bits of arg 1, sign extended to 64 bits + + // M extension. H means high (i.e., it returns the top bits of + // the result). U means unsigned. W means word (i.e., 32-bit). + {name: "MUL", argLength: 2, reg: gp21, asm: "MUL", commutative: true, typ: "Int64"}, // arg0 * arg1 + {name: "MULW", argLength: 2, reg: gp21, asm: "MULW", commutative: true, typ: "Int32"}, + {name: "MULH", argLength: 2, reg: gp21, asm: "MULH", commutative: true, typ: "Int64"}, + {name: "MULHU", argLength: 2, reg: gp21, asm: "MULHU", commutative: true, typ: "UInt64"}, + {name: "LoweredMuluhilo", argLength: 2, reg: gp22, resultNotInArgs: true}, // arg0 * arg1, return (hi, lo) + {name: "LoweredMuluover", argLength: 2, reg: gp22, resultNotInArgs: true}, // arg0 * arg1, return (64 bits of arg0*arg1, overflow) + + {name: "DIV", argLength: 2, reg: gp21, asm: "DIV", typ: "Int64"}, // arg0 / arg1 + {name: "DIVU", argLength: 2, reg: gp21, asm: "DIVU", typ: "UInt64"}, + {name: "DIVW", argLength: 2, reg: gp21, asm: "DIVW", typ: "Int32"}, + {name: "DIVUW", argLength: 2, reg: gp21, asm: "DIVUW", typ: "UInt32"}, + {name: "REM", argLength: 2, reg: gp21, asm: "REM", typ: "Int64"}, // arg0 % arg1 + {name: "REMU", argLength: 2, reg: gp21, asm: "REMU", typ: "UInt64"}, + {name: "REMW", argLength: 2, reg: gp21, asm: "REMW", typ: "Int32"}, + {name: "REMUW", argLength: 2, reg: gp21, asm: "REMUW", typ: "UInt32"}, + + {name: "MOVaddr", argLength: 1, reg: gp11sb, asm: "MOV", aux: "SymOff", rematerializeable: true, symEffect: "RdWr"}, // arg0 + auxint + offset encoded in aux + // auxint+aux == add auxint and the offset of the symbol in aux (if any) to the effective address + + {name: "MOVDconst", reg: gp01, asm: "MOV", typ: "UInt64", aux: "Int64", rematerializeable: true}, // auxint + + // Loads: load <size> bits from arg0+auxint+aux and extend to 64 bits; arg1=mem + {name: "MOVBload", argLength: 2, reg: gpload, asm: "MOVB", aux: "SymOff", typ: "Int8", faultOnNilArg0: true, symEffect: "Read"}, // 8 bits, sign extend + {name: "MOVHload", argLength: 2, reg: gpload, asm: "MOVH", aux: "SymOff", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // 16 bits, sign extend + {name: "MOVWload", argLength: 2, reg: gpload, asm: "MOVW", aux: "SymOff", typ: "Int32", faultOnNilArg0: true, symEffect: "Read"}, // 32 bits, sign extend + {name: "MOVDload", argLength: 2, reg: gpload, asm: "MOV", aux: "SymOff", typ: "Int64", faultOnNilArg0: true, symEffect: "Read"}, // 64 bits + {name: "MOVBUload", argLength: 2, reg: gpload, asm: "MOVBU", aux: "SymOff", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // 8 bits, zero extend + {name: "MOVHUload", argLength: 2, reg: gpload, asm: "MOVHU", aux: "SymOff", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // 16 bits, zero extend + {name: "MOVWUload", argLength: 2, reg: gpload, asm: "MOVWU", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // 32 bits, zero extend + + // Stores: store <size> lowest bits in arg1 to arg0+auxint+aux; arg2=mem + {name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 8 bits + {name: "MOVHstore", argLength: 3, reg: gpstore, asm: "MOVH", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 16 bits + {name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 32 bits + {name: "MOVDstore", argLength: 3, reg: gpstore, asm: "MOV", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 64 bits + + // Stores: store <size> of zero in arg0+auxint+aux; arg1=mem + {name: "MOVBstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 8 bits + {name: "MOVHstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 16 bits + {name: "MOVWstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 32 bits + {name: "MOVDstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOV", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 64 bits + + // Conversions + {name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB"}, // move from arg0, sign-extended from byte + {name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVH"}, // move from arg0, sign-extended from half + {name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW"}, // move from arg0, sign-extended from word + {name: "MOVDreg", argLength: 1, reg: gp11, asm: "MOV"}, // move from arg0 + {name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, unsign-extended from byte + {name: "MOVHUreg", argLength: 1, reg: gp11, asm: "MOVHU"}, // move from arg0, unsign-extended from half + {name: "MOVWUreg", argLength: 1, reg: gp11, asm: "MOVWU"}, // move from arg0, unsign-extended from word + + {name: "MOVDnop", argLength: 1, reg: regInfo{inputs: []regMask{gpMask}, outputs: []regMask{gpMask}}, resultInArg0: true}, // nop, return arg0 in same register + + // Shift ops + {name: "SLL", argLength: 2, reg: gp21, asm: "SLL"}, // arg0 << (aux1 & 63) + {name: "SRA", argLength: 2, reg: gp21, asm: "SRA"}, // arg0 >> (aux1 & 63), signed + {name: "SRL", argLength: 2, reg: gp21, asm: "SRL"}, // arg0 >> (aux1 & 63), unsigned + {name: "SLLI", argLength: 1, reg: gp11, asm: "SLLI", aux: "Int64"}, // arg0 << auxint, shift amount 0-63 + {name: "SRAI", argLength: 1, reg: gp11, asm: "SRAI", aux: "Int64"}, // arg0 >> auxint, signed, shift amount 0-63 + {name: "SRLI", argLength: 1, reg: gp11, asm: "SRLI", aux: "Int64"}, // arg0 >> auxint, unsigned, shift amount 0-63 + + // Bitwise ops + {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true}, // arg0 ^ arg1 + {name: "XORI", argLength: 1, reg: gp11, asm: "XORI", aux: "Int64"}, // arg0 ^ auxint + {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1 + {name: "ORI", argLength: 1, reg: gp11, asm: "ORI", aux: "Int64"}, // arg0 | auxint + {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1 + {name: "ANDI", argLength: 1, reg: gp11, asm: "ANDI", aux: "Int64"}, // arg0 & auxint + {name: "NOT", argLength: 1, reg: gp11, asm: "NOT"}, // ^arg0 + + // Generate boolean values + {name: "SEQZ", argLength: 1, reg: gp11, asm: "SEQZ"}, // arg0 == 0, result is 0 or 1 + {name: "SNEZ", argLength: 1, reg: gp11, asm: "SNEZ"}, // arg0 != 0, result is 0 or 1 + {name: "SLT", argLength: 2, reg: gp21, asm: "SLT"}, // arg0 < arg1, result is 0 or 1 + {name: "SLTI", argLength: 1, reg: gp11, asm: "SLTI", aux: "Int64"}, // arg0 < auxint, result is 0 or 1 + {name: "SLTU", argLength: 2, reg: gp21, asm: "SLTU"}, // arg0 < arg1, unsigned, result is 0 or 1 + {name: "SLTIU", argLength: 1, reg: gp11, asm: "SLTIU", aux: "Int64"}, // arg0 < auxint, unsigned, result is 0 or 1 + + // MOVconvert converts between pointers and integers. + // We have a special op for this so as to not confuse GC + // (particularly stack maps). It takes a memory arg so it + // gets correctly ordered with respect to GC safepoints. + {name: "MOVconvert", argLength: 2, reg: gp11, asm: "MOV"}, // arg0, but converted to int/ptr as appropriate; arg1=mem + + // Calls + {name: "CALLstatic", argLength: -1, reg: call, aux: "CallOff", call: true}, // call static function aux.(*gc.Sym). last arg=mem, auxint=argsize, returns mem + {name: "CALLtail", argLength: -1, reg: call, aux: "CallOff", call: true, tailCall: true}, // tail call static function aux.(*gc.Sym). last arg=mem, auxint=argsize, returns mem + {name: "CALLclosure", argLength: -1, reg: callClosure, aux: "CallOff", call: true}, // call function via closure. arg0=codeptr, arg1=closure, last arg=mem, auxint=argsize, returns mem + {name: "CALLinter", argLength: -1, reg: callInter, aux: "CallOff", call: true}, // call fn by pointer. arg0=codeptr, last arg=mem, auxint=argsize, returns mem + + // duffzero + // arg0 = address of memory to zero (in X25, changed as side effect) + // arg1 = mem + // auxint = offset into duffzero code to start executing + // X1 (link register) changed because of function call + // returns mem + { + name: "DUFFZERO", + aux: "Int64", + argLength: 2, + reg: regInfo{ + inputs: []regMask{regNamed["X25"]}, + clobbers: regNamed["X1"] | regNamed["X25"], + }, + typ: "Mem", + faultOnNilArg0: true, + }, + + // duffcopy + // arg0 = address of dst memory (in X25, changed as side effect) + // arg1 = address of src memory (in X24, changed as side effect) + // arg2 = mem + // auxint = offset into duffcopy code to start executing + // X1 (link register) changed because of function call + // returns mem + { + name: "DUFFCOPY", + aux: "Int64", + argLength: 3, + reg: regInfo{ + inputs: []regMask{regNamed["X25"], regNamed["X24"]}, + clobbers: regNamed["X1"] | regNamed["X24"] | regNamed["X25"], + }, + typ: "Mem", + faultOnNilArg0: true, + faultOnNilArg1: true, + }, + + // Generic moves and zeros + + // general unaligned zeroing + // arg0 = address of memory to zero (in X5, changed as side effect) + // arg1 = address of the last element to zero (inclusive) + // arg2 = mem + // auxint = element size + // returns mem + // mov ZERO, (X5) + // ADD $sz, X5 + // BGEU Rarg1, X5, -2(PC) + { + name: "LoweredZero", + aux: "Int64", + argLength: 3, + reg: regInfo{ + inputs: []regMask{regNamed["X5"], gpMask}, + clobbers: regNamed["X5"], + }, + typ: "Mem", + faultOnNilArg0: true, + }, + + // general unaligned move + // arg0 = address of dst memory (in X5, changed as side effect) + // arg1 = address of src memory (in X6, changed as side effect) + // arg2 = address of the last element of src (can't be X7 as we clobber it before using arg2) + // arg3 = mem + // auxint = alignment + // clobbers X7 as a tmp register. + // returns mem + // mov (X6), X7 + // mov X7, (X5) + // ADD $sz, X5 + // ADD $sz, X6 + // BGEU Rarg2, X5, -4(PC) + { + name: "LoweredMove", + aux: "Int64", + argLength: 4, + reg: regInfo{ + inputs: []regMask{regNamed["X5"], regNamed["X6"], gpMask &^ regNamed["X7"]}, + clobbers: regNamed["X5"] | regNamed["X6"] | regNamed["X7"], + }, + typ: "Mem", + faultOnNilArg0: true, + faultOnNilArg1: true, + }, + + // Atomic loads. + // load from arg0. arg1=mem. + // returns <value,memory> so they can be properly ordered with other loads. + {name: "LoweredAtomicLoad8", argLength: 2, reg: gpload, faultOnNilArg0: true}, + {name: "LoweredAtomicLoad32", argLength: 2, reg: gpload, faultOnNilArg0: true}, + {name: "LoweredAtomicLoad64", argLength: 2, reg: gpload, faultOnNilArg0: true}, + + // Atomic stores. + // store arg1 to *arg0. arg2=mem. returns memory. + {name: "LoweredAtomicStore8", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true}, + {name: "LoweredAtomicStore32", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true}, + {name: "LoweredAtomicStore64", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true}, + + // Atomic exchange. + // store arg1 to *arg0. arg2=mem. returns <old content of *arg0, memory>. + {name: "LoweredAtomicExchange32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true}, + {name: "LoweredAtomicExchange64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true}, + + // Atomic add. + // *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>. + {name: "LoweredAtomicAdd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + {name: "LoweredAtomicAdd64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + + // Atomic compare and swap. + // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory. + // if *arg0 == arg1 { + // *arg0 = arg2 + // return (true, memory) + // } else { + // return (false, memory) + // } + // MOV $0, Rout + // LR (Rarg0), Rtmp + // BNE Rtmp, Rarg1, 3(PC) + // SC Rarg2, (Rarg0), Rtmp + // BNE Rtmp, ZERO, -3(PC) + // MOV $1, Rout + {name: "LoweredAtomicCas32", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + {name: "LoweredAtomicCas64", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true}, + + // Atomic 32 bit AND/OR. + // *arg0 &= (|=) arg1. arg2=mem. returns nil. + {name: "LoweredAtomicAnd32", argLength: 3, reg: gpatomic, asm: "AMOANDW", faultOnNilArg0: true, hasSideEffects: true}, + {name: "LoweredAtomicOr32", argLength: 3, reg: gpatomic, asm: "AMOORW", faultOnNilArg0: true, hasSideEffects: true}, + + // Lowering pass-throughs + {name: "LoweredNilCheck", argLength: 2, faultOnNilArg0: true, nilCheck: true, reg: regInfo{inputs: []regMask{gpspMask}}}, // arg0=ptr,arg1=mem, returns void. Faults if ptr is nil. + {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{regCtxt}}}, // scheduler ensures only at beginning of entry block + + // LoweredGetCallerSP returns the SP of the caller of the current function. + {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true}, + + // LoweredGetCallerPC evaluates to the PC to which its "caller" will return. + // I.e., if f calls g "calls" getcallerpc, + // the result should be the PC within f that g will return to. + // See runtime/stubs.go for a more detailed discussion. + {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true}, + + // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier + // It saves all GP registers if necessary, + // but clobbers RA (LR) because it's a call + // and T6 (REG_TMP). + {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{regNamed["X5"], regNamed["X6"]}, clobbers: (callerSave &^ (gpMask | regNamed["g"])) | regNamed["X1"]}, clobberFlags: true, aux: "Sym", symEffect: "None"}, + + // There are three of these functions so that they can have three different register inputs. + // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the + // default registers to match so we don't need to copy registers around unnecessarily. + {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{regNamed["X7"], regNamed["X28"]}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go). + {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{regNamed["X6"], regNamed["X7"]}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go). + {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{regNamed["X5"], regNamed["X6"]}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go). + + // F extension. + {name: "FADDS", argLength: 2, reg: fp21, asm: "FADDS", commutative: true, typ: "Float32"}, // arg0 + arg1 + {name: "FSUBS", argLength: 2, reg: fp21, asm: "FSUBS", commutative: false, typ: "Float32"}, // arg0 - arg1 + {name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true, typ: "Float32"}, // arg0 * arg1 + {name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS", commutative: false, typ: "Float32"}, // arg0 / arg1 + {name: "FSQRTS", argLength: 1, reg: fp11, asm: "FSQRTS", typ: "Float32"}, // sqrt(arg0) + {name: "FNEGS", argLength: 1, reg: fp11, asm: "FNEGS", typ: "Float32"}, // -arg0 + {name: "FMVSX", argLength: 1, reg: gpfp, asm: "FMVSX", typ: "Float32"}, // reinterpret arg0 as float + {name: "FCVTSW", argLength: 1, reg: gpfp, asm: "FCVTSW", typ: "Float32"}, // float32(low 32 bits of arg0) + {name: "FCVTSL", argLength: 1, reg: gpfp, asm: "FCVTSL", typ: "Float32"}, // float32(arg0) + {name: "FCVTWS", argLength: 1, reg: fpgp, asm: "FCVTWS", typ: "Int32"}, // int32(arg0) + {name: "FCVTLS", argLength: 1, reg: fpgp, asm: "FCVTLS", typ: "Int64"}, // int64(arg0) + {name: "FMOVWload", argLength: 2, reg: fpload, asm: "MOVF", aux: "SymOff", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"}, // load float32 from arg0+auxint+aux + {name: "FMOVWstore", argLength: 3, reg: fpstore, asm: "MOVF", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store float32 to arg0+auxint+aux + {name: "FEQS", argLength: 2, reg: fp2gp, asm: "FEQS", commutative: true}, // arg0 == arg1 + {name: "FNES", argLength: 2, reg: fp2gp, asm: "FNES", commutative: true}, // arg0 != arg1 + {name: "FLTS", argLength: 2, reg: fp2gp, asm: "FLTS"}, // arg0 < arg1 + {name: "FLES", argLength: 2, reg: fp2gp, asm: "FLES"}, // arg0 <= arg1 + + // D extension. + {name: "FADDD", argLength: 2, reg: fp21, asm: "FADDD", commutative: true, typ: "Float64"}, // arg0 + arg1 + {name: "FSUBD", argLength: 2, reg: fp21, asm: "FSUBD", commutative: false, typ: "Float64"}, // arg0 - arg1 + {name: "FMULD", argLength: 2, reg: fp21, asm: "FMULD", commutative: true, typ: "Float64"}, // arg0 * arg1 + {name: "FDIVD", argLength: 2, reg: fp21, asm: "FDIVD", commutative: false, typ: "Float64"}, // arg0 / arg1 + {name: "FMADDD", argLength: 3, reg: fp31, asm: "FMADDD", commutative: true, typ: "Float64"}, // (arg0 * arg1) + arg2 + {name: "FMSUBD", argLength: 3, reg: fp31, asm: "FMSUBD", commutative: true, typ: "Float64"}, // (arg0 * arg1) - arg2 + {name: "FNMADDD", argLength: 3, reg: fp31, asm: "FNMADDD", commutative: true, typ: "Float64"}, // -(arg0 * arg1) + arg2 + {name: "FNMSUBD", argLength: 3, reg: fp31, asm: "FNMSUBD", commutative: true, typ: "Float64"}, // -(arg0 * arg1) - arg2 + {name: "FSQRTD", argLength: 1, reg: fp11, asm: "FSQRTD", typ: "Float64"}, // sqrt(arg0) + {name: "FNEGD", argLength: 1, reg: fp11, asm: "FNEGD", typ: "Float64"}, // -arg0 + {name: "FABSD", argLength: 1, reg: fp11, asm: "FABSD", typ: "Float64"}, // abs(arg0) + {name: "FSGNJD", argLength: 2, reg: fp21, asm: "FSGNJD", typ: "Float64"}, // copy sign of arg1 to arg0 + {name: "FMVDX", argLength: 1, reg: gpfp, asm: "FMVDX", typ: "Float64"}, // reinterpret arg0 as float + {name: "FCVTDW", argLength: 1, reg: gpfp, asm: "FCVTDW", typ: "Float64"}, // float64(low 32 bits of arg0) + {name: "FCVTDL", argLength: 1, reg: gpfp, asm: "FCVTDL", typ: "Float64"}, // float64(arg0) + {name: "FCVTWD", argLength: 1, reg: fpgp, asm: "FCVTWD", typ: "Int32"}, // int32(arg0) + {name: "FCVTLD", argLength: 1, reg: fpgp, asm: "FCVTLD", typ: "Int64"}, // int64(arg0) + {name: "FCVTDS", argLength: 1, reg: fp11, asm: "FCVTDS", typ: "Float64"}, // float64(arg0) + {name: "FCVTSD", argLength: 1, reg: fp11, asm: "FCVTSD", typ: "Float32"}, // float32(arg0) + {name: "FMOVDload", argLength: 2, reg: fpload, asm: "MOVD", aux: "SymOff", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"}, // load float64 from arg0+auxint+aux + {name: "FMOVDstore", argLength: 3, reg: fpstore, asm: "MOVD", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store float6 to arg0+auxint+aux + {name: "FEQD", argLength: 2, reg: fp2gp, asm: "FEQD", commutative: true}, // arg0 == arg1 + {name: "FNED", argLength: 2, reg: fp2gp, asm: "FNED", commutative: true}, // arg0 != arg1 + {name: "FLTD", argLength: 2, reg: fp2gp, asm: "FLTD"}, // arg0 < arg1 + {name: "FLED", argLength: 2, reg: fp2gp, asm: "FLED"}, // arg0 <= arg1 + } + + RISCV64blocks := []blockData{ + {name: "BEQ", controls: 2}, + {name: "BNE", controls: 2}, + {name: "BLT", controls: 2}, + {name: "BGE", controls: 2}, + {name: "BLTU", controls: 2}, + {name: "BGEU", controls: 2}, + + {name: "BEQZ", controls: 1}, + {name: "BNEZ", controls: 1}, + {name: "BLEZ", controls: 1}, + {name: "BGEZ", controls: 1}, + {name: "BLTZ", controls: 1}, + {name: "BGTZ", controls: 1}, + } + + archs = append(archs, arch{ + name: "RISCV64", + pkg: "cmd/internal/obj/riscv", + genfile: "../../riscv64/ssa.go", + ops: RISCV64ops, + blocks: RISCV64blocks, + regnames: regNamesRISCV64, + gpregmask: gpMask, + fpregmask: fpMask, + framepointerreg: -1, // not used + // Integer parameters passed in register X10-X17, X8-X9, X18-X23 + ParamIntRegNames: "X10 X11 X12 X13 X14 X15 X16 X17 X8 X9 X18 X19 X20 X21 X22 X23", + // Float parameters passed in register F10-F17, F8-F9, F18-F23 + ParamFloatRegNames: "F10 F11 F12 F13 F14 F15 F16 F17 F8 F9 F18 F19 F20 F21 F22 F23", + }) +} diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64latelower.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64latelower.rules new file mode 100644 index 0000000..cd55331 --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/RISCV64latelower.rules @@ -0,0 +1,19 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Fold constant shift with extension. +(SRAI [c] (MOVBreg x)) && c < 8 => (SRAI [56+c] (SLLI <typ.Int64> [56] x)) +(SRAI [c] (MOVHreg x)) && c < 16 => (SRAI [48+c] (SLLI <typ.Int64> [48] x)) +(SRAI [c] (MOVWreg x)) && c < 32 => (SRAI [32+c] (SLLI <typ.Int64> [32] x)) +(SRLI [c] (MOVBUreg x)) && c < 8 => (SRLI [56+c] (SLLI <typ.UInt64> [56] x)) +(SRLI [c] (MOVHUreg x)) && c < 16 => (SRLI [48+c] (SLLI <typ.UInt64> [48] x)) +(SRLI [c] (MOVWUreg x)) && c < 32 => (SRLI [32+c] (SLLI <typ.UInt64> [32] x)) +(SLLI [c] (MOVBUreg x)) && c <= 56 => (SRLI [56-c] (SLLI <typ.UInt64> [56] x)) +(SLLI [c] (MOVHUreg x)) && c <= 48 => (SRLI [48-c] (SLLI <typ.UInt64> [48] x)) +(SLLI [c] (MOVWUreg x)) && c <= 32 => (SRLI [32-c] (SLLI <typ.UInt64> [32] x)) + +// Shift by zero. +(SRAI [0] x) => x +(SRLI [0] x) => x +(SLLI [0] x) => x diff --git a/src/cmd/compile/internal/ssa/_gen/S390X.rules b/src/cmd/compile/internal/ssa/_gen/S390X.rules new file mode 100644 index 0000000..e9becb2 --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/S390X.rules @@ -0,0 +1,1704 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Lowering arithmetic +(Add(64|Ptr) ...) => (ADD ...) +(Add(32|16|8) ...) => (ADDW ...) +(Add32F x y) => (Select0 (FADDS x y)) +(Add64F x y) => (Select0 (FADD x y)) + +(Sub(64|Ptr) ...) => (SUB ...) +(Sub(32|16|8) ...) => (SUBW ...) +(Sub32F x y) => (Select0 (FSUBS x y)) +(Sub64F x y) => (Select0 (FSUB x y)) + +(Mul64 ...) => (MULLD ...) +(Mul(32|16|8) ...) => (MULLW ...) +(Mul32F ...) => (FMULS ...) +(Mul64F ...) => (FMUL ...) +(Mul64uhilo ...) => (MLGR ...) + +(Div32F ...) => (FDIVS ...) +(Div64F ...) => (FDIV ...) + +(Div64 x y) => (DIVD x y) +(Div64u ...) => (DIVDU ...) +// DIVW/DIVWU has a 64-bit dividend and a 32-bit divisor, +// so a sign/zero extension of the dividend is required. +(Div32 x y) => (DIVW (MOVWreg x) y) +(Div32u x y) => (DIVWU (MOVWZreg x) y) +(Div16 x y) => (DIVW (MOVHreg x) (MOVHreg y)) +(Div16u x y) => (DIVWU (MOVHZreg x) (MOVHZreg y)) +(Div8 x y) => (DIVW (MOVBreg x) (MOVBreg y)) +(Div8u x y) => (DIVWU (MOVBZreg x) (MOVBZreg y)) + +(Hmul(64|64u) ...) => (MULH(D|DU) ...) +(Hmul32 x y) => (SRDconst [32] (MULLD (MOVWreg x) (MOVWreg y))) +(Hmul32u x y) => (SRDconst [32] (MULLD (MOVWZreg x) (MOVWZreg y))) + +(Mod64 x y) => (MODD x y) +(Mod64u ...) => (MODDU ...) +// MODW/MODWU has a 64-bit dividend and a 32-bit divisor, +// so a sign/zero extension of the dividend is required. +(Mod32 x y) => (MODW (MOVWreg x) y) +(Mod32u x y) => (MODWU (MOVWZreg x) y) +(Mod16 x y) => (MODW (MOVHreg x) (MOVHreg y)) +(Mod16u x y) => (MODWU (MOVHZreg x) (MOVHZreg y)) +(Mod8 x y) => (MODW (MOVBreg x) (MOVBreg y)) +(Mod8u x y) => (MODWU (MOVBZreg x) (MOVBZreg y)) + +// (x + y) / 2 with x>=y -> (x - y) / 2 + y +(Avg64u <t> x y) => (ADD (SRDconst <t> (SUB <t> x y) [1]) y) + +(And64 ...) => (AND ...) +(And(32|16|8) ...) => (ANDW ...) + +(Or64 ...) => (OR ...) +(Or(32|16|8) ...) => (ORW ...) + +(Xor64 ...) => (XOR ...) +(Xor(32|16|8) ...) => (XORW ...) + +(Neg64 ...) => (NEG ...) +(Neg(32|16|8) ...) => (NEGW ...) +(Neg32F ...) => (FNEGS ...) +(Neg64F ...) => (FNEG ...) + +(Com64 ...) => (NOT ...) +(Com(32|16|8) ...) => (NOTW ...) +(NOT x) => (XOR (MOVDconst [-1]) x) +(NOTW x) => (XORWconst [-1] x) + +// Lowering boolean ops +(AndB ...) => (ANDW ...) +(OrB ...) => (ORW ...) +(Not x) => (XORWconst [1] x) + +// Lowering pointer arithmetic +(OffPtr [off] ptr:(SP)) => (MOVDaddr [int32(off)] ptr) +(OffPtr [off] ptr) && is32Bit(off) => (ADDconst [int32(off)] ptr) +(OffPtr [off] ptr) => (ADD (MOVDconst [off]) ptr) + +// TODO: optimize these cases? +(Ctz64NonZero ...) => (Ctz64 ...) +(Ctz32NonZero ...) => (Ctz32 ...) + +// Ctz(x) = 64 - findLeftmostOne((x-1)&^x) +(Ctz64 <t> x) => (SUB (MOVDconst [64]) (FLOGR (AND <t> (SUBconst <t> [1] x) (NOT <t> x)))) +(Ctz32 <t> x) => (SUB (MOVDconst [64]) (FLOGR (MOVWZreg (ANDW <t> (SUBWconst <t> [1] x) (NOTW <t> x))))) + +(BitLen64 x) => (SUB (MOVDconst [64]) (FLOGR x)) + +// POPCNT treats the input register as a vector of 8 bytes, producing +// a population count for each individual byte. For inputs larger than +// a single byte we therefore need to sum the individual bytes produced +// by the POPCNT instruction. For example, the following instruction +// sequence could be used to calculate the population count of a 4-byte +// value: +// +// MOVD $0x12345678, R1 // R1=0x12345678 <-- input +// POPCNT R1, R2 // R2=0x02030404 +// SRW $16, R2, R3 // R3=0x00000203 +// ADDW R2, R3, R4 // R4=0x02030607 +// SRW $8, R4, R5 // R5=0x00020306 +// ADDW R4, R5, R6 // R6=0x0205090d +// MOVBZ R6, R7 // R7=0x0000000d <-- result is 13 +// +(PopCount8 x) => (POPCNT (MOVBZreg x)) +(PopCount16 x) => (MOVBZreg (SumBytes2 (POPCNT <typ.UInt16> x))) +(PopCount32 x) => (MOVBZreg (SumBytes4 (POPCNT <typ.UInt32> x))) +(PopCount64 x) => (MOVBZreg (SumBytes8 (POPCNT <typ.UInt64> x))) + +// SumBytes{2,4,8} pseudo operations sum the values of the rightmost +// 2, 4 or 8 bytes respectively. The result is a single byte however +// other bytes might contain junk so a zero extension is required if +// the desired output type is larger than 1 byte. +(SumBytes2 x) => (ADDW (SRWconst <typ.UInt8> x [8]) x) +(SumBytes4 x) => (SumBytes2 (ADDW <typ.UInt16> (SRWconst <typ.UInt16> x [16]) x)) +(SumBytes8 x) => (SumBytes4 (ADDW <typ.UInt32> (SRDconst <typ.UInt32> x [32]) x)) + +(Bswap64 ...) => (MOVDBR ...) +(Bswap32 ...) => (MOVWBR ...) + +// add with carry +(Select0 (Add64carry x y c)) + => (Select0 <typ.UInt64> (ADDE x y (Select1 <types.TypeFlags> (ADDCconst c [-1])))) +(Select1 (Add64carry x y c)) + => (Select0 <typ.UInt64> (ADDE (MOVDconst [0]) (MOVDconst [0]) (Select1 <types.TypeFlags> (ADDE x y (Select1 <types.TypeFlags> (ADDCconst c [-1])))))) + +// subtract with borrow +(Select0 (Sub64borrow x y c)) + => (Select0 <typ.UInt64> (SUBE x y (Select1 <types.TypeFlags> (SUBC (MOVDconst [0]) c)))) +(Select1 (Sub64borrow x y c)) + => (NEG (Select0 <typ.UInt64> (SUBE (MOVDconst [0]) (MOVDconst [0]) (Select1 <types.TypeFlags> (SUBE x y (Select1 <types.TypeFlags> (SUBC (MOVDconst [0]) c))))))) + +// math package intrinsics +(Sqrt ...) => (FSQRT ...) +(Floor x) => (FIDBR [7] x) +(Ceil x) => (FIDBR [6] x) +(Trunc x) => (FIDBR [5] x) +(RoundToEven x) => (FIDBR [4] x) +(Round x) => (FIDBR [1] x) +(FMA x y z) => (FMADD z x y) + +(Sqrt32 ...) => (FSQRTS ...) + +// Atomic loads and stores. +// The SYNC instruction (fast-BCR-serialization) prevents store-load +// reordering. Other sequences of memory operations (load-load, +// store-store and load-store) are already guaranteed not to be reordered. +(AtomicLoad(8|32|Acq32|64|Ptr) ptr mem) => (MOV(BZ|WZ|WZ|D|D)atomicload ptr mem) +(AtomicStore(8|32|64|PtrNoWB) ptr val mem) => (SYNC (MOV(B|W|D|D)atomicstore ptr val mem)) + +// Store-release doesn't require store-load ordering. +(AtomicStoreRel32 ptr val mem) => (MOVWatomicstore ptr val mem) + +// Atomic adds. +(AtomicAdd32 ptr val mem) => (AddTupleFirst32 val (LAA ptr val mem)) +(AtomicAdd64 ptr val mem) => (AddTupleFirst64 val (LAAG ptr val mem)) +(Select0 <t> (AddTupleFirst32 val tuple)) => (ADDW val (Select0 <t> tuple)) +(Select1 (AddTupleFirst32 _ tuple)) => (Select1 tuple) +(Select0 <t> (AddTupleFirst64 val tuple)) => (ADD val (Select0 <t> tuple)) +(Select1 (AddTupleFirst64 _ tuple)) => (Select1 tuple) + +// Atomic exchanges. +(AtomicExchange32 ptr val mem) => (LoweredAtomicExchange32 ptr val mem) +(AtomicExchange64 ptr val mem) => (LoweredAtomicExchange64 ptr val mem) + +// Atomic compare and swap. +(AtomicCompareAndSwap32 ptr old new_ mem) => (LoweredAtomicCas32 ptr old new_ mem) +(AtomicCompareAndSwap64 ptr old new_ mem) => (LoweredAtomicCas64 ptr old new_ mem) + +// Atomic and: *(*uint8)(ptr) &= val +// +// Round pointer down to nearest word boundary and pad value with ones before +// applying atomic AND operation to target word. +// +// *(*uint32)(ptr &^ 3) &= rotateleft(uint32(val) | 0xffffff00, ((3 << 3) ^ ((ptr & 3) << 3)) +// +(AtomicAnd8 ptr val mem) + => (LANfloor + ptr + (RLL <typ.UInt32> + (ORWconst <typ.UInt32> val [-1<<8]) + (RXSBG <typ.UInt32> {s390x.NewRotateParams(59, 60, 3)} (MOVDconst [3<<3]) ptr)) + mem) + +// Atomic or: *(*uint8)(ptr) |= val +// +// Round pointer down to nearest word boundary and pad value with zeros before +// applying atomic OR operation to target word. +// +// *(*uint32)(ptr &^ 3) |= uint32(val) << ((3 << 3) ^ ((ptr & 3) << 3)) +// +(AtomicOr8 ptr val mem) + => (LAOfloor + ptr + (SLW <typ.UInt32> + (MOVBZreg <typ.UInt32> val) + (RXSBG <typ.UInt32> {s390x.NewRotateParams(59, 60, 3)} (MOVDconst [3<<3]) ptr)) + mem) + +(AtomicAnd32 ...) => (LAN ...) +(AtomicOr32 ...) => (LAO ...) + +// Lowering extension +// Note: we always extend to 64 bits even though some ops don't need that many result bits. +(SignExt8to(16|32|64) ...) => (MOVBreg ...) +(SignExt16to(32|64) ...) => (MOVHreg ...) +(SignExt32to64 ...) => (MOVWreg ...) + +(ZeroExt8to(16|32|64) ...) => (MOVBZreg ...) +(ZeroExt16to(32|64) ...) => (MOVHZreg ...) +(ZeroExt32to64 ...) => (MOVWZreg ...) + +(Slicemask <t> x) => (SRADconst (NEG <t> x) [63]) + +// Lowering truncation +// Because we ignore high parts of registers, truncates are just copies. +(Trunc(16|32|64)to8 ...) => (Copy ...) +(Trunc(32|64)to16 ...) => (Copy ...) +(Trunc64to32 ...) => (Copy ...) + +// Lowering float <-> int +(Cvt32to32F ...) => (CEFBRA ...) +(Cvt32to64F ...) => (CDFBRA ...) +(Cvt64to32F ...) => (CEGBRA ...) +(Cvt64to64F ...) => (CDGBRA ...) + +(Cvt32Fto32 ...) => (CFEBRA ...) +(Cvt32Fto64 ...) => (CGEBRA ...) +(Cvt64Fto32 ...) => (CFDBRA ...) +(Cvt64Fto64 ...) => (CGDBRA ...) + +// Lowering float <-> uint +(Cvt32Uto32F ...) => (CELFBR ...) +(Cvt32Uto64F ...) => (CDLFBR ...) +(Cvt64Uto32F ...) => (CELGBR ...) +(Cvt64Uto64F ...) => (CDLGBR ...) + +(Cvt32Fto32U ...) => (CLFEBR ...) +(Cvt32Fto64U ...) => (CLGEBR ...) +(Cvt64Fto32U ...) => (CLFDBR ...) +(Cvt64Fto64U ...) => (CLGDBR ...) + +// Lowering float32 <-> float64 +(Cvt32Fto64F ...) => (LDEBR ...) +(Cvt64Fto32F ...) => (LEDBR ...) + +(CvtBoolToUint8 ...) => (Copy ...) + +(Round(32|64)F ...) => (LoweredRound(32|64)F ...) + +// Lowering shifts + +// Lower bounded shifts first. No need to check shift value. +(Lsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SLD x y) +(Lsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y) +(Lsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y) +(Lsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y) +(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRD x y) +(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW x y) +(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW (MOVHZreg x) y) +(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW (MOVBZreg x) y) +(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAD x y) +(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW x y) +(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW (MOVHreg x) y) +(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW (MOVBreg x) y) + +// Unsigned shifts need to return 0 if shift amount is >= width of shifted value. +// result = shift >= 64 ? 0 : arg << shift +(Lsh(64|32|16|8)x64 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SL(D|W|W|W) <t> x y) (MOVDconst [0]) (CMPUconst y [64])) +(Lsh(64|32|16|8)x32 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SL(D|W|W|W) <t> x y) (MOVDconst [0]) (CMPWUconst y [64])) +(Lsh(64|32|16|8)x16 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SL(D|W|W|W) <t> x y) (MOVDconst [0]) (CMPWUconst (MOVHZreg y) [64])) +(Lsh(64|32|16|8)x8 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SL(D|W|W|W) <t> x y) (MOVDconst [0]) (CMPWUconst (MOVBZreg y) [64])) + +(Rsh(64|32)Ux64 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SR(D|W) <t> x y) (MOVDconst [0]) (CMPUconst y [64])) +(Rsh(64|32)Ux32 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SR(D|W) <t> x y) (MOVDconst [0]) (CMPWUconst y [64])) +(Rsh(64|32)Ux16 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SR(D|W) <t> x y) (MOVDconst [0]) (CMPWUconst (MOVHZreg y) [64])) +(Rsh(64|32)Ux8 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SR(D|W) <t> x y) (MOVDconst [0]) (CMPWUconst (MOVBZreg y) [64])) + +(Rsh(16|8)Ux64 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SRW <t> (MOV(H|B)Zreg x) y) (MOVDconst [0]) (CMPUconst y [64])) +(Rsh(16|8)Ux32 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SRW <t> (MOV(H|B)Zreg x) y) (MOVDconst [0]) (CMPWUconst y [64])) +(Rsh(16|8)Ux16 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SRW <t> (MOV(H|B)Zreg x) y) (MOVDconst [0]) (CMPWUconst (MOVHZreg y) [64])) +(Rsh(16|8)Ux8 <t> x y) => (LOCGR {s390x.GreaterOrEqual} <t> (SRW <t> (MOV(H|B)Zreg x) y) (MOVDconst [0]) (CMPWUconst (MOVBZreg y) [64])) + +// Signed right shift needs to return 0/-1 if shift amount is >= width of shifted value. +// We implement this by setting the shift value to 63 (all ones) if the shift value is more than 63. +// result = arg >> (shift >= 64 ? 63 : shift) +(Rsh(64|32)x64 x y) => (SRA(D|W) x (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPUconst y [64]))) +(Rsh(64|32)x32 x y) => (SRA(D|W) x (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst y [64]))) +(Rsh(64|32)x16 x y) => (SRA(D|W) x (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst (MOVHZreg y) [64]))) +(Rsh(64|32)x8 x y) => (SRA(D|W) x (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst (MOVBZreg y) [64]))) + +(Rsh(16|8)x64 x y) => (SRAW (MOV(H|B)reg x) (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPUconst y [64]))) +(Rsh(16|8)x32 x y) => (SRAW (MOV(H|B)reg x) (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst y [64]))) +(Rsh(16|8)x16 x y) => (SRAW (MOV(H|B)reg x) (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst (MOVHZreg y) [64]))) +(Rsh(16|8)x8 x y) => (SRAW (MOV(H|B)reg x) (LOCGR {s390x.GreaterOrEqual} <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst (MOVBZreg y) [64]))) + +// Lowering rotates +(RotateLeft8 <t> x (MOVDconst [c])) => (Or8 (Lsh8x64 <t> x (MOVDconst [c&7])) (Rsh8Ux64 <t> x (MOVDconst [-c&7]))) +(RotateLeft16 <t> x (MOVDconst [c])) => (Or16 (Lsh16x64 <t> x (MOVDconst [c&15])) (Rsh16Ux64 <t> x (MOVDconst [-c&15]))) +(RotateLeft32 ...) => (RLL ...) +(RotateLeft64 ...) => (RLLG ...) + +// Lowering comparisons +(Less64 x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMP x y)) +(Less32 x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPW x y)) +(Less(16|8) x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPW (MOV(H|B)reg x) (MOV(H|B)reg y))) +(Less64U x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPU x y)) +(Less32U x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPWU x y)) +(Less(16|8)U x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPWU (MOV(H|B)Zreg x) (MOV(H|B)Zreg y))) +(Less64F x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (FCMP x y)) +(Less32F x y) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (FCMPS x y)) + +(Leq64 x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMP x y)) +(Leq32 x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPW x y)) +(Leq(16|8) x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPW (MOV(H|B)reg x) (MOV(H|B)reg y))) +(Leq64U x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPU x y)) +(Leq32U x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPWU x y)) +(Leq(16|8)U x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPWU (MOV(H|B)Zreg x) (MOV(H|B)Zreg y))) +(Leq64F x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (FCMP x y)) +(Leq32F x y) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (FCMPS x y)) + +(Eq(64|Ptr) x y) => (LOCGR {s390x.Equal} (MOVDconst [0]) (MOVDconst [1]) (CMP x y)) +(Eq32 x y) => (LOCGR {s390x.Equal} (MOVDconst [0]) (MOVDconst [1]) (CMPW x y)) +(Eq(16|8|B) x y) => (LOCGR {s390x.Equal} (MOVDconst [0]) (MOVDconst [1]) (CMPW (MOV(H|B|B)reg x) (MOV(H|B|B)reg y))) +(Eq64F x y) => (LOCGR {s390x.Equal} (MOVDconst [0]) (MOVDconst [1]) (FCMP x y)) +(Eq32F x y) => (LOCGR {s390x.Equal} (MOVDconst [0]) (MOVDconst [1]) (FCMPS x y)) + +(Neq(64|Ptr) x y) => (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (CMP x y)) +(Neq32 x y) => (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPW x y)) +(Neq(16|8|B) x y) => (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPW (MOV(H|B|B)reg x) (MOV(H|B|B)reg y))) +(Neq64F x y) => (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (FCMP x y)) +(Neq32F x y) => (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (FCMPS x y)) + +// Lowering loads +(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVDload ptr mem) +(Load <t> ptr mem) && is32BitInt(t) && isSigned(t) => (MOVWload ptr mem) +(Load <t> ptr mem) && is32BitInt(t) && !isSigned(t) => (MOVWZload ptr mem) +(Load <t> ptr mem) && is16BitInt(t) && isSigned(t) => (MOVHload ptr mem) +(Load <t> ptr mem) && is16BitInt(t) && !isSigned(t) => (MOVHZload ptr mem) +(Load <t> ptr mem) && is8BitInt(t) && isSigned(t) => (MOVBload ptr mem) +(Load <t> ptr mem) && (t.IsBoolean() || (is8BitInt(t) && !isSigned(t))) => (MOVBZload ptr mem) +(Load <t> ptr mem) && is32BitFloat(t) => (FMOVSload ptr mem) +(Load <t> ptr mem) && is64BitFloat(t) => (FMOVDload ptr mem) + +// Lowering stores +// These more-specific FP versions of Store pattern should come first. +(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (FMOVDstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (FMOVSstore ptr val mem) + +(Store {t} ptr val mem) && t.Size() == 8 => (MOVDstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 4 => (MOVWstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem) +(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem) + +// Lowering moves + +// Load and store for small copies. +(Move [0] _ _ mem) => mem +(Move [1] dst src mem) => (MOVBstore dst (MOVBZload src mem) mem) +(Move [2] dst src mem) => (MOVHstore dst (MOVHZload src mem) mem) +(Move [4] dst src mem) => (MOVWstore dst (MOVWZload src mem) mem) +(Move [8] dst src mem) => (MOVDstore dst (MOVDload src mem) mem) +(Move [16] dst src mem) => + (MOVDstore [8] dst (MOVDload [8] src mem) + (MOVDstore dst (MOVDload src mem) mem)) +(Move [24] dst src mem) => + (MOVDstore [16] dst (MOVDload [16] src mem) + (MOVDstore [8] dst (MOVDload [8] src mem) + (MOVDstore dst (MOVDload src mem) mem))) +(Move [3] dst src mem) => + (MOVBstore [2] dst (MOVBZload [2] src mem) + (MOVHstore dst (MOVHZload src mem) mem)) +(Move [5] dst src mem) => + (MOVBstore [4] dst (MOVBZload [4] src mem) + (MOVWstore dst (MOVWZload src mem) mem)) +(Move [6] dst src mem) => + (MOVHstore [4] dst (MOVHZload [4] src mem) + (MOVWstore dst (MOVWZload src mem) mem)) +(Move [7] dst src mem) => + (MOVBstore [6] dst (MOVBZload [6] src mem) + (MOVHstore [4] dst (MOVHZload [4] src mem) + (MOVWstore dst (MOVWZload src mem) mem))) + +// MVC for other moves. Use up to 4 instructions (sizes up to 1024 bytes). +(Move [s] dst src mem) && s > 0 && s <= 256 && logLargeCopy(v, s) => + (MVC [makeValAndOff(int32(s), 0)] dst src mem) +(Move [s] dst src mem) && s > 256 && s <= 512 && logLargeCopy(v, s) => + (MVC [makeValAndOff(int32(s)-256, 256)] dst src (MVC [makeValAndOff(256, 0)] dst src mem)) +(Move [s] dst src mem) && s > 512 && s <= 768 && logLargeCopy(v, s) => + (MVC [makeValAndOff(int32(s)-512, 512)] dst src (MVC [makeValAndOff(256, 256)] dst src (MVC [makeValAndOff(256, 0)] dst src mem))) +(Move [s] dst src mem) && s > 768 && s <= 1024 && logLargeCopy(v, s) => + (MVC [makeValAndOff(int32(s)-768, 768)] dst src (MVC [makeValAndOff(256, 512)] dst src (MVC [makeValAndOff(256, 256)] dst src (MVC [makeValAndOff(256, 0)] dst src mem)))) + +// Move more than 1024 bytes using a loop. +(Move [s] dst src mem) && s > 1024 && logLargeCopy(v, s) => + (LoweredMove [s%256] dst src (ADD <src.Type> src (MOVDconst [(s/256)*256])) mem) + +// Lowering Zero instructions +(Zero [0] _ mem) => mem +(Zero [1] destptr mem) => (MOVBstoreconst [0] destptr mem) +(Zero [2] destptr mem) => (MOVHstoreconst [0] destptr mem) +(Zero [4] destptr mem) => (MOVWstoreconst [0] destptr mem) +(Zero [8] destptr mem) => (MOVDstoreconst [0] destptr mem) +(Zero [3] destptr mem) => + (MOVBstoreconst [makeValAndOff(0,2)] destptr + (MOVHstoreconst [0] destptr mem)) +(Zero [5] destptr mem) => + (MOVBstoreconst [makeValAndOff(0,4)] destptr + (MOVWstoreconst [0] destptr mem)) +(Zero [6] destptr mem) => + (MOVHstoreconst [makeValAndOff(0,4)] destptr + (MOVWstoreconst [0] destptr mem)) +(Zero [7] destptr mem) => + (MOVWstoreconst [makeValAndOff(0,3)] destptr + (MOVWstoreconst [0] destptr mem)) + +(Zero [s] destptr mem) && s > 0 && s <= 1024 => + (CLEAR [makeValAndOff(int32(s), 0)] destptr mem) + +// Zero more than 1024 bytes using a loop. +(Zero [s] destptr mem) && s > 1024 => + (LoweredZero [s%256] destptr (ADDconst <destptr.Type> destptr [(int32(s)/256)*256]) mem) + +// Lowering constants +(Const(64|32|16|8) [val]) => (MOVDconst [int64(val)]) +(Const(32|64)F ...) => (FMOV(S|D)const ...) +(ConstNil) => (MOVDconst [0]) +(ConstBool [t]) => (MOVDconst [b2i(t)]) + +// Lowering calls +(StaticCall ...) => (CALLstatic ...) +(ClosureCall ...) => (CALLclosure ...) +(InterCall ...) => (CALLinter ...) +(TailCall ...) => (CALLtail ...) + +// Miscellaneous +(IsNonNil p) => (LOCGR {s390x.NotEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPconst p [0])) +(IsInBounds idx len) => (LOCGR {s390x.Less} (MOVDconst [0]) (MOVDconst [1]) (CMPU idx len)) +(IsSliceInBounds idx len) => (LOCGR {s390x.LessOrEqual} (MOVDconst [0]) (MOVDconst [1]) (CMPU idx len)) +(NilCheck ...) => (LoweredNilCheck ...) +(GetG ...) => (LoweredGetG ...) +(GetClosurePtr ...) => (LoweredGetClosurePtr ...) +(GetCallerSP ...) => (LoweredGetCallerSP ...) +(GetCallerPC ...) => (LoweredGetCallerPC ...) +(Addr {sym} base) => (MOVDaddr {sym} base) +(LocalAddr {sym} base _) => (MOVDaddr {sym} base) +(ITab (Load ptr mem)) => (MOVDload ptr mem) + +// block rewrites +(If cond yes no) => (CLIJ {s390x.LessOrGreater} (MOVBZreg <typ.Bool> cond) [0] yes no) + +// Write barrier. +(WB ...) => (LoweredWB ...) + +(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem) +(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem) +(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem) + +// *************************** +// Above: lowering rules +// Below: optimizations +// *************************** +// TODO: Should the optimizations be a separate pass? + +// Note: when removing unnecessary sign/zero extensions. +// +// After a value is spilled it is restored using a sign- or zero-extension +// to register-width as appropriate for its type. For example, a uint8 will +// be restored using a MOVBZ (llgc) instruction which will zero extend the +// 8-bit value to 64-bits. +// +// This is a hazard when folding sign- and zero-extensions since we need to +// ensure not only that the value in the argument register is correctly +// extended but also that it will still be correctly extended if it is +// spilled and restored. +// +// In general this means we need type checks when the RHS of a rule is an +// OpCopy (i.e. "(... x:(...) ...) -> x"). + +// Merge double extensions. +(MOV(H|HZ)reg e:(MOV(B|BZ)reg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x) +(MOV(W|WZ)reg e:(MOV(B|BZ)reg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x) +(MOV(W|WZ)reg e:(MOV(H|HZ)reg x)) && clobberIfDead(e) => (MOV(H|HZ)reg x) + +// Bypass redundant sign extensions. +(MOV(B|BZ)reg e:(MOVBreg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x) +(MOV(B|BZ)reg e:(MOVHreg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x) +(MOV(B|BZ)reg e:(MOVWreg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x) +(MOV(H|HZ)reg e:(MOVHreg x)) && clobberIfDead(e) => (MOV(H|HZ)reg x) +(MOV(H|HZ)reg e:(MOVWreg x)) && clobberIfDead(e) => (MOV(H|HZ)reg x) +(MOV(W|WZ)reg e:(MOVWreg x)) && clobberIfDead(e) => (MOV(W|WZ)reg x) + +// Bypass redundant zero extensions. +(MOV(B|BZ)reg e:(MOVBZreg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x) +(MOV(B|BZ)reg e:(MOVHZreg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x) +(MOV(B|BZ)reg e:(MOVWZreg x)) && clobberIfDead(e) => (MOV(B|BZ)reg x) +(MOV(H|HZ)reg e:(MOVHZreg x)) && clobberIfDead(e) => (MOV(H|HZ)reg x) +(MOV(H|HZ)reg e:(MOVWZreg x)) && clobberIfDead(e) => (MOV(H|HZ)reg x) +(MOV(W|WZ)reg e:(MOVWZreg x)) && clobberIfDead(e) => (MOV(W|WZ)reg x) + +// Remove zero extensions after zero extending load. +// Note: take care that if x is spilled it is restored correctly. +(MOV(B|H|W)Zreg x:(MOVBZload _ _)) && (!x.Type.IsSigned() || x.Type.Size() > 1) => x +(MOV(H|W)Zreg x:(MOVHZload _ _)) && (!x.Type.IsSigned() || x.Type.Size() > 2) => x +(MOVWZreg x:(MOVWZload _ _)) && (!x.Type.IsSigned() || x.Type.Size() > 4) => x + +// Remove sign extensions after sign extending load. +// Note: take care that if x is spilled it is restored correctly. +(MOV(B|H|W)reg x:(MOVBload _ _)) && (x.Type.IsSigned() || x.Type.Size() == 8) => x +(MOV(H|W)reg x:(MOVHload _ _)) && (x.Type.IsSigned() || x.Type.Size() == 8) => x +(MOVWreg x:(MOVWload _ _)) && (x.Type.IsSigned() || x.Type.Size() == 8) => x + +// Remove sign extensions after zero extending load. +// These type checks are probably unnecessary but do them anyway just in case. +(MOV(H|W)reg x:(MOVBZload _ _)) && (!x.Type.IsSigned() || x.Type.Size() > 1) => x +(MOVWreg x:(MOVHZload _ _)) && (!x.Type.IsSigned() || x.Type.Size() > 2) => x + +// Fold sign and zero extensions into loads. +// +// Note: The combined instruction must end up in the same block +// as the original load. If not, we end up making a value with +// memory type live in two different blocks, which can lead to +// multiple memory values alive simultaneously. +// +// Make sure we don't combine these ops if the load has another use. +// This prevents a single load from being split into multiple loads +// which then might return different values. See test/atomicload.go. +(MOV(B|H|W)Zreg <t> x:(MOV(B|H|W)load [o] {s} p mem)) + && x.Uses == 1 + && clobber(x) + => @x.Block (MOV(B|H|W)Zload <t> [o] {s} p mem) +(MOV(B|H|W)reg <t> x:(MOV(B|H|W)Zload [o] {s} p mem)) + && x.Uses == 1 + && clobber(x) + => @x.Block (MOV(B|H|W)load <t> [o] {s} p mem) + +// Remove zero extensions after argument load. +(MOVBZreg x:(Arg <t>)) && !t.IsSigned() && t.Size() == 1 => x +(MOVHZreg x:(Arg <t>)) && !t.IsSigned() && t.Size() <= 2 => x +(MOVWZreg x:(Arg <t>)) && !t.IsSigned() && t.Size() <= 4 => x + +// Remove sign extensions after argument load. +(MOVBreg x:(Arg <t>)) && t.IsSigned() && t.Size() == 1 => x +(MOVHreg x:(Arg <t>)) && t.IsSigned() && t.Size() <= 2 => x +(MOVWreg x:(Arg <t>)) && t.IsSigned() && t.Size() <= 4 => x + +// Fold zero extensions into constants. +(MOVBZreg (MOVDconst [c])) => (MOVDconst [int64( uint8(c))]) +(MOVHZreg (MOVDconst [c])) => (MOVDconst [int64(uint16(c))]) +(MOVWZreg (MOVDconst [c])) => (MOVDconst [int64(uint32(c))]) + +// Fold sign extensions into constants. +(MOVBreg (MOVDconst [c])) => (MOVDconst [int64( int8(c))]) +(MOVHreg (MOVDconst [c])) => (MOVDconst [int64(int16(c))]) +(MOVWreg (MOVDconst [c])) => (MOVDconst [int64(int32(c))]) + +// Remove zero extension of conditional move. +// Note: only for MOVBZreg for now since it is added as part of 'if' statement lowering. +(MOVBZreg x:(LOCGR (MOVDconst [c]) (MOVDconst [d]) _)) + && int64(uint8(c)) == c + && int64(uint8(d)) == d + && (!x.Type.IsSigned() || x.Type.Size() > 1) + => x + +// Fold boolean tests into blocks. +// Note: this must match If statement lowering. +(CLIJ {s390x.LessOrGreater} (LOCGR {d} (MOVDconst [0]) (MOVDconst [x]) cmp) [0] yes no) + && int32(x) != 0 + => (BRC {d} cmp yes no) + +// Canonicalize BRC condition code mask by removing impossible conditions. +// Integer comparisons cannot generate the unordered condition. +(BRC {c} x:((CMP|CMPW|CMPU|CMPWU) _ _) yes no) && c&s390x.Unordered != 0 => (BRC {c&^s390x.Unordered} x yes no) +(BRC {c} x:((CMP|CMPW|CMPU|CMPWU)const _) yes no) && c&s390x.Unordered != 0 => (BRC {c&^s390x.Unordered} x yes no) + +// Compare-and-branch. +// Note: bit 3 (unordered) must not be set so we mask out s390x.Unordered. +(BRC {c} (CMP x y) yes no) => (CGRJ {c&^s390x.Unordered} x y yes no) +(BRC {c} (CMPW x y) yes no) => (CRJ {c&^s390x.Unordered} x y yes no) +(BRC {c} (CMPU x y) yes no) => (CLGRJ {c&^s390x.Unordered} x y yes no) +(BRC {c} (CMPWU x y) yes no) => (CLRJ {c&^s390x.Unordered} x y yes no) + +// Compare-and-branch (immediate). +// Note: bit 3 (unordered) must not be set so we mask out s390x.Unordered. +(BRC {c} (CMPconst x [y]) yes no) && y == int32( int8(y)) => (CGIJ {c&^s390x.Unordered} x [ int8(y)] yes no) +(BRC {c} (CMPWconst x [y]) yes no) && y == int32( int8(y)) => (CIJ {c&^s390x.Unordered} x [ int8(y)] yes no) +(BRC {c} (CMPUconst x [y]) yes no) && y == int32(uint8(y)) => (CLGIJ {c&^s390x.Unordered} x [uint8(y)] yes no) +(BRC {c} (CMPWUconst x [y]) yes no) && y == int32(uint8(y)) => (CLIJ {c&^s390x.Unordered} x [uint8(y)] yes no) + +// Absorb immediate into compare-and-branch. +(C(R|GR)J {c} x (MOVDconst [y]) yes no) && is8Bit(y) => (C(I|GI)J {c} x [ int8(y)] yes no) +(CL(R|GR)J {c} x (MOVDconst [y]) yes no) && isU8Bit(y) => (CL(I|GI)J {c} x [uint8(y)] yes no) +(C(R|GR)J {c} (MOVDconst [x]) y yes no) && is8Bit(x) => (C(I|GI)J {c.ReverseComparison()} y [ int8(x)] yes no) +(CL(R|GR)J {c} (MOVDconst [x]) y yes no) && isU8Bit(x) => (CL(I|GI)J {c.ReverseComparison()} y [uint8(x)] yes no) + +// Prefer comparison with immediate to compare-and-branch. +(CGRJ {c} x (MOVDconst [y]) yes no) && !is8Bit(y) && is32Bit(y) => (BRC {c} (CMPconst x [int32(y)]) yes no) +(CRJ {c} x (MOVDconst [y]) yes no) && !is8Bit(y) && is32Bit(y) => (BRC {c} (CMPWconst x [int32(y)]) yes no) +(CLGRJ {c} x (MOVDconst [y]) yes no) && !isU8Bit(y) && isU32Bit(y) => (BRC {c} (CMPUconst x [int32(y)]) yes no) +(CLRJ {c} x (MOVDconst [y]) yes no) && !isU8Bit(y) && isU32Bit(y) => (BRC {c} (CMPWUconst x [int32(y)]) yes no) +(CGRJ {c} (MOVDconst [x]) y yes no) && !is8Bit(x) && is32Bit(x) => (BRC {c.ReverseComparison()} (CMPconst y [int32(x)]) yes no) +(CRJ {c} (MOVDconst [x]) y yes no) && !is8Bit(x) && is32Bit(x) => (BRC {c.ReverseComparison()} (CMPWconst y [int32(x)]) yes no) +(CLGRJ {c} (MOVDconst [x]) y yes no) && !isU8Bit(x) && isU32Bit(x) => (BRC {c.ReverseComparison()} (CMPUconst y [int32(x)]) yes no) +(CLRJ {c} (MOVDconst [x]) y yes no) && !isU8Bit(x) && isU32Bit(x) => (BRC {c.ReverseComparison()} (CMPWUconst y [int32(x)]) yes no) + +// Absorb sign/zero extensions into 32-bit compare-and-branch. +(CIJ {c} (MOV(W|WZ)reg x) [y] yes no) => (CIJ {c} x [y] yes no) +(CLIJ {c} (MOV(W|WZ)reg x) [y] yes no) => (CLIJ {c} x [y] yes no) + +// Bring out-of-range signed immediates into range by varying branch condition. +(BRC {s390x.Less} (CMPconst x [ 128]) yes no) => (CGIJ {s390x.LessOrEqual} x [ 127] yes no) +(BRC {s390x.Less} (CMPWconst x [ 128]) yes no) => (CIJ {s390x.LessOrEqual} x [ 127] yes no) +(BRC {s390x.LessOrEqual} (CMPconst x [-129]) yes no) => (CGIJ {s390x.Less} x [-128] yes no) +(BRC {s390x.LessOrEqual} (CMPWconst x [-129]) yes no) => (CIJ {s390x.Less} x [-128] yes no) +(BRC {s390x.Greater} (CMPconst x [-129]) yes no) => (CGIJ {s390x.GreaterOrEqual} x [-128] yes no) +(BRC {s390x.Greater} (CMPWconst x [-129]) yes no) => (CIJ {s390x.GreaterOrEqual} x [-128] yes no) +(BRC {s390x.GreaterOrEqual} (CMPconst x [ 128]) yes no) => (CGIJ {s390x.Greater} x [ 127] yes no) +(BRC {s390x.GreaterOrEqual} (CMPWconst x [ 128]) yes no) => (CIJ {s390x.Greater} x [ 127] yes no) + +// Bring out-of-range unsigned immediates into range by varying branch condition. +(BRC {s390x.Less} (CMP(WU|U)const x [256]) yes no) => (C(L|LG)IJ {s390x.LessOrEqual} x [255] yes no) +(BRC {s390x.GreaterOrEqual} (CMP(WU|U)const x [256]) yes no) => (C(L|LG)IJ {s390x.Greater} x [255] yes no) + +// Bring out-of-range immediates into range by switching signedness (only == and !=). +(BRC {c} (CMPconst x [y]) yes no) && y == int32(uint8(y)) && (c == s390x.Equal || c == s390x.LessOrGreater) => (CLGIJ {c} x [uint8(y)] yes no) +(BRC {c} (CMPWconst x [y]) yes no) && y == int32(uint8(y)) && (c == s390x.Equal || c == s390x.LessOrGreater) => (CLIJ {c} x [uint8(y)] yes no) +(BRC {c} (CMPUconst x [y]) yes no) && y == int32( int8(y)) && (c == s390x.Equal || c == s390x.LessOrGreater) => (CGIJ {c} x [ int8(y)] yes no) +(BRC {c} (CMPWUconst x [y]) yes no) && y == int32( int8(y)) && (c == s390x.Equal || c == s390x.LessOrGreater) => (CIJ {c} x [ int8(y)] yes no) + +// Fold constants into instructions. +(ADD x (MOVDconst [c])) && is32Bit(c) => (ADDconst [int32(c)] x) +(ADDW x (MOVDconst [c])) => (ADDWconst [int32(c)] x) + +(SUB x (MOVDconst [c])) && is32Bit(c) => (SUBconst x [int32(c)]) +(SUB (MOVDconst [c]) x) && is32Bit(c) => (NEG (SUBconst <v.Type> x [int32(c)])) +(SUBW x (MOVDconst [c])) => (SUBWconst x [int32(c)]) +(SUBW (MOVDconst [c]) x) => (NEGW (SUBWconst <v.Type> x [int32(c)])) + +(MULLD x (MOVDconst [c])) && is32Bit(c) => (MULLDconst [int32(c)] x) +(MULLW x (MOVDconst [c])) => (MULLWconst [int32(c)] x) + +// NILF instructions leave the high 32 bits unchanged which is +// equivalent to the leftmost 32 bits being set. +// TODO(mundaym): modify the assembler to accept 64-bit values +// and use isU32Bit(^c). +(AND x (MOVDconst [c])) + && s390x.NewRotateParams(0, 63, 0).OutMerge(uint64(c)) != nil + => (RISBGZ x {*s390x.NewRotateParams(0, 63, 0).OutMerge(uint64(c))}) +(AND x (MOVDconst [c])) + && is32Bit(c) + && c < 0 + => (ANDconst [c] x) +(AND x (MOVDconst [c])) + && is32Bit(c) + && c >= 0 + => (MOVWZreg (ANDWconst <typ.UInt32> [int32(c)] x)) + +(ANDW x (MOVDconst [c])) => (ANDWconst [int32(c)] x) + +((AND|ANDW)const [c] ((AND|ANDW)const [d] x)) => ((AND|ANDW)const [c&d] x) + +((OR|XOR) x (MOVDconst [c])) && isU32Bit(c) => ((OR|XOR)const [c] x) +((OR|XOR)W x (MOVDconst [c])) => ((OR|XOR)Wconst [int32(c)] x) + +// Constant shifts. +(S(LD|RD|RAD) x (MOVDconst [c])) => (S(LD|RD|RAD)const x [uint8(c&63)]) +(S(LW|RW|RAW) x (MOVDconst [c])) && c&32 == 0 => (S(LW|RW|RAW)const x [uint8(c&31)]) +(S(LW|RW) _ (MOVDconst [c])) && c&32 != 0 => (MOVDconst [0]) +(SRAW x (MOVDconst [c])) && c&32 != 0 => (SRAWconst x [31]) + +// Shifts only use the rightmost 6 bits of the shift value. +(S(LD|RD|RAD|LW|RW|RAW) x (RISBGZ y {r})) + && r.Amount == 0 + && r.OutMask()&63 == 63 + => (S(LD|RD|RAD|LW|RW|RAW) x y) +(S(LD|RD|RAD|LW|RW|RAW) x (AND (MOVDconst [c]) y)) + => (S(LD|RD|RAD|LW|RW|RAW) x (ANDWconst <typ.UInt32> [int32(c&63)] y)) +(S(LD|RD|RAD|LW|RW|RAW) x (ANDWconst [c] y)) && c&63 == 63 + => (S(LD|RD|RAD|LW|RW|RAW) x y) +(SLD x (MOV(W|H|B|WZ|HZ|BZ)reg y)) => (SLD x y) +(SRD x (MOV(W|H|B|WZ|HZ|BZ)reg y)) => (SRD x y) +(SRAD x (MOV(W|H|B|WZ|HZ|BZ)reg y)) => (SRAD x y) +(SLW x (MOV(W|H|B|WZ|HZ|BZ)reg y)) => (SLW x y) +(SRW x (MOV(W|H|B|WZ|HZ|BZ)reg y)) => (SRW x y) +(SRAW x (MOV(W|H|B|WZ|HZ|BZ)reg y)) => (SRAW x y) + +// Match rotate by constant. +(RLLG x (MOVDconst [c])) => (RISBGZ x {s390x.NewRotateParams(0, 63, uint8(c&63))}) +(RLL x (MOVDconst [c])) => (RLLconst x [uint8(c&31)]) + +// Signed 64-bit comparison with immediate. +(CMP x (MOVDconst [c])) && is32Bit(c) => (CMPconst x [int32(c)]) +(CMP (MOVDconst [c]) x) && is32Bit(c) => (InvertFlags (CMPconst x [int32(c)])) + +// Unsigned 64-bit comparison with immediate. +(CMPU x (MOVDconst [c])) && isU32Bit(c) => (CMPUconst x [int32(c)]) +(CMPU (MOVDconst [c]) x) && isU32Bit(c) => (InvertFlags (CMPUconst x [int32(c)])) + +// Signed and unsigned 32-bit comparison with immediate. +(CMP(W|WU) x (MOVDconst [c])) => (CMP(W|WU)const x [int32(c)]) +(CMP(W|WU) (MOVDconst [c]) x) => (InvertFlags (CMP(W|WU)const x [int32(c)])) + +// Match (x >> c) << d to 'rotate then insert selected bits [into zero]'. +(SLDconst (SRDconst x [c]) [d]) => (RISBGZ x {s390x.NewRotateParams(uint8(max8(0, int8(c-d))), 63-d, uint8(int8(d-c)&63))}) + +// Match (x << c) >> d to 'rotate then insert selected bits [into zero]'. +(SRDconst (SLDconst x [c]) [d]) => (RISBGZ x {s390x.NewRotateParams(d, uint8(min8(63, int8(63-c+d))), uint8(int8(c-d)&63))}) + +// Absorb input zero extension into 'rotate then insert selected bits [into zero]'. +(RISBGZ (MOVWZreg x) {r}) && r.InMerge(0xffffffff) != nil => (RISBGZ x {*r.InMerge(0xffffffff)}) +(RISBGZ (MOVHZreg x) {r}) && r.InMerge(0x0000ffff) != nil => (RISBGZ x {*r.InMerge(0x0000ffff)}) +(RISBGZ (MOVBZreg x) {r}) && r.InMerge(0x000000ff) != nil => (RISBGZ x {*r.InMerge(0x000000ff)}) + +// Absorb 'rotate then insert selected bits [into zero]' into zero extension. +(MOVWZreg (RISBGZ x {r})) && r.OutMerge(0xffffffff) != nil => (RISBGZ x {*r.OutMerge(0xffffffff)}) +(MOVHZreg (RISBGZ x {r})) && r.OutMerge(0x0000ffff) != nil => (RISBGZ x {*r.OutMerge(0x0000ffff)}) +(MOVBZreg (RISBGZ x {r})) && r.OutMerge(0x000000ff) != nil => (RISBGZ x {*r.OutMerge(0x000000ff)}) + +// Absorb shift into 'rotate then insert selected bits [into zero]'. +// +// Any unsigned shift can be represented as a rotate and mask operation: +// +// x << c => RotateLeft64(x, c) & (^uint64(0) << c) +// x >> c => RotateLeft64(x, -c) & (^uint64(0) >> c) +// +// Therefore when a shift is used as the input to a rotate then insert +// selected bits instruction we can merge the two together. We just have +// to be careful that the resultant mask is representable (non-zero and +// contiguous). For example, assuming that x is variable and c, y and m +// are constants, a shift followed by a rotate then insert selected bits +// could be represented as: +// +// RotateLeft64(RotateLeft64(x, c) & (^uint64(0) << c), y) & m +// +// We can split the rotation by y into two, one rotate for x and one for +// the mask: +// +// RotateLeft64(RotateLeft64(x, c), y) & (RotateLeft64(^uint64(0) << c, y)) & m +// +// The rotations of x by c followed by y can then be combined: +// +// RotateLeft64(x, c+y) & (RotateLeft64(^uint64(0) << c, y)) & m +// ^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +// rotate mask +// +// To perform this optimization we therefore just need to check that it +// is valid to merge the shift mask (^(uint64(0)<<c)) into the selected +// bits mask (i.e. that the resultant mask is non-zero and contiguous). +// +(RISBGZ (SLDconst x [c]) {r}) && r.InMerge(^uint64(0)<<c) != nil => (RISBGZ x {(*r.InMerge(^uint64(0)<<c)).RotateLeft(c)}) +(RISBGZ (SRDconst x [c]) {r}) && r.InMerge(^uint64(0)>>c) != nil => (RISBGZ x {(*r.InMerge(^uint64(0)>>c)).RotateLeft(-c)}) + +// Absorb 'rotate then insert selected bits [into zero]' into left shift. +(SLDconst (RISBGZ x {r}) [c]) + && s390x.NewRotateParams(0, 63-c, c).InMerge(r.OutMask()) != nil + => (RISBGZ x {(*s390x.NewRotateParams(0, 63-c, c).InMerge(r.OutMask())).RotateLeft(r.Amount)}) + +// Absorb 'rotate then insert selected bits [into zero]' into right shift. +(SRDconst (RISBGZ x {r}) [c]) + && s390x.NewRotateParams(c, 63, -c&63).InMerge(r.OutMask()) != nil + => (RISBGZ x {(*s390x.NewRotateParams(c, 63, -c&63).InMerge(r.OutMask())).RotateLeft(r.Amount)}) + +// Merge 'rotate then insert selected bits [into zero]' instructions together. +(RISBGZ (RISBGZ x {y}) {z}) + && z.InMerge(y.OutMask()) != nil + => (RISBGZ x {(*z.InMerge(y.OutMask())).RotateLeft(y.Amount)}) + +// Convert RISBGZ into 64-bit shift (helps CSE). +(RISBGZ x {r}) && r.End == 63 && r.Start == -r.Amount&63 => (SRDconst x [-r.Amount&63]) +(RISBGZ x {r}) && r.Start == 0 && r.End == 63-r.Amount => (SLDconst x [r.Amount]) + +// Optimize single bit isolation when it is known to be equivalent to +// the most significant bit due to mask produced by arithmetic shift. +// Simply isolate the most significant bit itself and place it in the +// correct position. +// +// Example: (int64(x) >> 63) & 0x8 -> RISBGZ $60, $60, $4, Rsrc, Rdst +(RISBGZ (SRADconst x [c]) {r}) + && r.Start == r.End // single bit selected + && (r.Start+r.Amount)&63 <= c // equivalent to most significant bit of x + => (RISBGZ x {s390x.NewRotateParams(r.Start, r.Start, -r.Start&63)}) + +// Canonicalize the order of arguments to comparisons - helps with CSE. +((CMP|CMPW|CMPU|CMPWU) x y) && canonLessThan(x,y) => (InvertFlags ((CMP|CMPW|CMPU|CMPWU) y x)) + +// Use sign/zero extend instead of RISBGZ. +(RISBGZ x {r}) && r == s390x.NewRotateParams(56, 63, 0) => (MOVBZreg x) +(RISBGZ x {r}) && r == s390x.NewRotateParams(48, 63, 0) => (MOVHZreg x) +(RISBGZ x {r}) && r == s390x.NewRotateParams(32, 63, 0) => (MOVWZreg x) + +// Use sign/zero extend instead of ANDW. +(ANDWconst [0x00ff] x) => (MOVBZreg x) +(ANDWconst [0xffff] x) => (MOVHZreg x) + +// Strength reduce multiplication to the sum (or difference) of two powers of two. +// +// Examples: +// 5x -> 4x + 1x +// 10x -> 8x + 2x +// 120x -> 128x - 8x +// -120x -> 8x - 128x +// +// We know that the rightmost bit of any positive value, once isolated, must either +// be a power of 2 (because it is a single bit) or 0 (if the original value is 0). +// In all of these rules we use a rightmost bit calculation to determine one operand +// for the addition or subtraction. We then just need to calculate if the other +// operand is a valid power of 2 before we can match the rule. +// +// Notes: +// - the generic rules have already matched single powers of two so we ignore them here +// - isPowerOfTwo32 asserts that its argument is greater than 0 +// - c&(c-1) = clear rightmost bit +// - c&^(c-1) = isolate rightmost bit + +// c = 2ˣ + 2ʸ => c - 2ˣ = 2ʸ +(MULL(D|W)const <t> x [c]) && isPowerOfTwo32(c&(c-1)) + => ((ADD|ADDW) (SL(D|W)const <t> x [uint8(log32(c&(c-1)))]) + (SL(D|W)const <t> x [uint8(log32(c&^(c-1)))])) + +// c = 2ʸ - 2ˣ => c + 2ˣ = 2ʸ +(MULL(D|W)const <t> x [c]) && isPowerOfTwo32(c+(c&^(c-1))) + => ((SUB|SUBW) (SL(D|W)const <t> x [uint8(log32(c+(c&^(c-1))))]) + (SL(D|W)const <t> x [uint8(log32(c&^(c-1)))])) + +// c = 2ˣ - 2ʸ => -c + 2ˣ = 2ʸ +(MULL(D|W)const <t> x [c]) && isPowerOfTwo32(-c+(-c&^(-c-1))) + => ((SUB|SUBW) (SL(D|W)const <t> x [uint8(log32(-c&^(-c-1)))]) + (SL(D|W)const <t> x [uint8(log32(-c+(-c&^(-c-1))))])) + +// Fold ADD into MOVDaddr. Odd offsets from SB shouldn't be folded (LARL can't handle them). +(ADDconst [c] (MOVDaddr [d] {s} x:(SB))) && ((c+d)&1 == 0) && is32Bit(int64(c)+int64(d)) => (MOVDaddr [c+d] {s} x) +(ADDconst [c] (MOVDaddr [d] {s} x)) && x.Op != OpSB && is20Bit(int64(c)+int64(d)) => (MOVDaddr [c+d] {s} x) +(ADD idx (MOVDaddr [c] {s} ptr)) && ptr.Op != OpSB => (MOVDaddridx [c] {s} ptr idx) + +// fold ADDconst into MOVDaddrx +(ADDconst [c] (MOVDaddridx [d] {s} x y)) && is20Bit(int64(c)+int64(d)) => (MOVDaddridx [c+d] {s} x y) +(MOVDaddridx [c] {s} (ADDconst [d] x) y) && is20Bit(int64(c)+int64(d)) => (MOVDaddridx [c+d] {s} x y) +(MOVDaddridx [c] {s} x (ADDconst [d] y)) && is20Bit(int64(c)+int64(d)) => (MOVDaddridx [c+d] {s} x y) + +// reverse ordering of compare instruction +(LOCGR {c} x y (InvertFlags cmp)) => (LOCGR {c.ReverseComparison()} x y cmp) + +// replace load from same location as preceding store with copy +(MOVDload [off] {sym} ptr1 (MOVDstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => x +(MOVWload [off] {sym} ptr1 (MOVWstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (MOVWreg x) +(MOVHload [off] {sym} ptr1 (MOVHstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (MOVHreg x) +(MOVBload [off] {sym} ptr1 (MOVBstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (MOVBreg x) +(MOVWZload [off] {sym} ptr1 (MOVWstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (MOVWZreg x) +(MOVHZload [off] {sym} ptr1 (MOVHstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (MOVHZreg x) +(MOVBZload [off] {sym} ptr1 (MOVBstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (MOVBZreg x) +(MOVDload [off] {sym} ptr1 (FMOVDstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (LGDR x) +(FMOVDload [off] {sym} ptr1 (MOVDstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (LDGR x) +(FMOVDload [off] {sym} ptr1 (FMOVDstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => x +(FMOVSload [off] {sym} ptr1 (FMOVSstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => x + +// prefer FPR <-> GPR moves over combined load ops +(MULLDload <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) => (MULLD x (LGDR <t> y)) +(ADDload <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) => (ADD x (LGDR <t> y)) +(SUBload <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) => (SUB x (LGDR <t> y)) +(ORload <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) => (OR x (LGDR <t> y)) +(ANDload <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) => (AND x (LGDR <t> y)) +(XORload <t> [off] {sym} x ptr1 (FMOVDstore [off] {sym} ptr2 y _)) && isSamePtr(ptr1, ptr2) => (XOR x (LGDR <t> y)) + +// detect attempts to set/clear the sign bit +// may need to be reworked when NIHH/OIHH are added +(RISBGZ (LGDR <t> x) {r}) && r == s390x.NewRotateParams(1, 63, 0) => (LGDR <t> (LPDFR <x.Type> x)) +(LDGR <t> (RISBGZ x {r})) && r == s390x.NewRotateParams(1, 63, 0) => (LPDFR (LDGR <t> x)) +(OR (MOVDconst [-1<<63]) (LGDR <t> x)) => (LGDR <t> (LNDFR <x.Type> x)) +(LDGR <t> (OR (MOVDconst [-1<<63]) x)) => (LNDFR (LDGR <t> x)) + +// detect attempts to set the sign bit with load +(LDGR <t> x:(ORload <t1> [off] {sym} (MOVDconst [-1<<63]) ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (LNDFR <t> (LDGR <t> (MOVDload <t1> [off] {sym} ptr mem))) + +// detect copysign +(OR (RISBGZ (LGDR x) {r}) (LGDR (LPDFR <t> y))) + && r == s390x.NewRotateParams(0, 0, 0) + => (LGDR (CPSDR <t> y x)) +(OR (RISBGZ (LGDR x) {r}) (MOVDconst [c])) + && c >= 0 + && r == s390x.NewRotateParams(0, 0, 0) + => (LGDR (CPSDR <x.Type> (FMOVDconst <x.Type> [math.Float64frombits(uint64(c))]) x)) +(CPSDR y (FMOVDconst [c])) && !math.Signbit(c) => (LPDFR y) +(CPSDR y (FMOVDconst [c])) && math.Signbit(c) => (LNDFR y) + +// absorb negations into set/clear sign bit +(FNEG (LPDFR x)) => (LNDFR x) +(FNEG (LNDFR x)) => (LPDFR x) +(FNEGS (LPDFR x)) => (LNDFR x) +(FNEGS (LNDFR x)) => (LPDFR x) + +// no need to convert float32 to float64 to set/clear sign bit +(LEDBR (LPDFR (LDEBR x))) => (LPDFR x) +(LEDBR (LNDFR (LDEBR x))) => (LNDFR x) + +// remove unnecessary FPR <-> GPR moves +(LDGR (LGDR x)) => x +(LGDR (LDGR x)) => x + +// Don't extend before storing +(MOVWstore [off] {sym} ptr (MOVWreg x) mem) => (MOVWstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOVHreg x) mem) => (MOVHstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVBreg x) mem) => (MOVBstore [off] {sym} ptr x mem) +(MOVWstore [off] {sym} ptr (MOVWZreg x) mem) => (MOVWstore [off] {sym} ptr x mem) +(MOVHstore [off] {sym} ptr (MOVHZreg x) mem) => (MOVHstore [off] {sym} ptr x mem) +(MOVBstore [off] {sym} ptr (MOVBZreg x) mem) => (MOVBstore [off] {sym} ptr x mem) + +// Fold constants into memory operations. +// Note that this is not always a good idea because if not all the uses of +// the ADDconst get eliminated, we still have to compute the ADDconst and we now +// have potentially two live values (ptr and (ADDconst [off] ptr)) instead of one. +// Nevertheless, let's do it! +(MOVDload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (MOVDload [off1+off2] {sym} ptr mem) +(MOVWload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (MOVWload [off1+off2] {sym} ptr mem) +(MOVHload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (MOVHload [off1+off2] {sym} ptr mem) +(MOVBload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (MOVBload [off1+off2] {sym} ptr mem) +(MOVWZload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (MOVWZload [off1+off2] {sym} ptr mem) +(MOVHZload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (MOVHZload [off1+off2] {sym} ptr mem) +(MOVBZload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (MOVBZload [off1+off2] {sym} ptr mem) +(FMOVSload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (FMOVSload [off1+off2] {sym} ptr mem) +(FMOVDload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(int64(off1)+int64(off2)) => (FMOVDload [off1+off2] {sym} ptr mem) + +(MOVDstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(int64(off1)+int64(off2)) => (MOVDstore [off1+off2] {sym} ptr val mem) +(MOVWstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(int64(off1)+int64(off2)) => (MOVWstore [off1+off2] {sym} ptr val mem) +(MOVHstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(int64(off1)+int64(off2)) => (MOVHstore [off1+off2] {sym} ptr val mem) +(MOVBstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(int64(off1)+int64(off2)) => (MOVBstore [off1+off2] {sym} ptr val mem) +(FMOVSstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(int64(off1)+int64(off2)) => (FMOVSstore [off1+off2] {sym} ptr val mem) +(FMOVDstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(int64(off1)+int64(off2)) => (FMOVDstore [off1+off2] {sym} ptr val mem) + +(ADDload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (ADDload [off1+off2] {sym} x ptr mem) +(ADDWload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (ADDWload [off1+off2] {sym} x ptr mem) +(MULLDload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (MULLDload [off1+off2] {sym} x ptr mem) +(MULLWload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (MULLWload [off1+off2] {sym} x ptr mem) +(SUBload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (SUBload [off1+off2] {sym} x ptr mem) +(SUBWload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (SUBWload [off1+off2] {sym} x ptr mem) + +(ANDload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (ANDload [off1+off2] {sym} x ptr mem) +(ANDWload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (ANDWload [off1+off2] {sym} x ptr mem) +(ORload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (ORload [off1+off2] {sym} x ptr mem) +(ORWload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (ORWload [off1+off2] {sym} x ptr mem) +(XORload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (XORload [off1+off2] {sym} x ptr mem) +(XORWload [off1] {sym} x (ADDconst [off2] ptr) mem) && ptr.Op != OpSB && is20Bit(int64(off1)+int64(off2)) => (XORWload [off1+off2] {sym} x ptr mem) + +// Fold constants into stores. +(MOVDstore [off] {sym} ptr (MOVDconst [c]) mem) && is16Bit(c) && isU12Bit(int64(off)) && ptr.Op != OpSB => + (MOVDstoreconst [makeValAndOff(int32(c),off)] {sym} ptr mem) +(MOVWstore [off] {sym} ptr (MOVDconst [c]) mem) && is16Bit(c) && isU12Bit(int64(off)) && ptr.Op != OpSB => + (MOVWstoreconst [makeValAndOff(int32(c),off)] {sym} ptr mem) +(MOVHstore [off] {sym} ptr (MOVDconst [c]) mem) && isU12Bit(int64(off)) && ptr.Op != OpSB => + (MOVHstoreconst [makeValAndOff(int32(int16(c)),off)] {sym} ptr mem) +(MOVBstore [off] {sym} ptr (MOVDconst [c]) mem) && is20Bit(int64(off)) && ptr.Op != OpSB => + (MOVBstoreconst [makeValAndOff(int32(int8(c)),off)] {sym} ptr mem) + +// Fold address offsets into constant stores. +(MOVDstoreconst [sc] {s} (ADDconst [off] ptr) mem) && isU12Bit(sc.Off64()+int64(off)) => + (MOVDstoreconst [sc.addOffset32(off)] {s} ptr mem) +(MOVWstoreconst [sc] {s} (ADDconst [off] ptr) mem) && isU12Bit(sc.Off64()+int64(off)) => + (MOVWstoreconst [sc.addOffset32(off)] {s} ptr mem) +(MOVHstoreconst [sc] {s} (ADDconst [off] ptr) mem) && isU12Bit(sc.Off64()+int64(off)) => + (MOVHstoreconst [sc.addOffset32(off)] {s} ptr mem) +(MOVBstoreconst [sc] {s} (ADDconst [off] ptr) mem) && is20Bit(sc.Off64()+int64(off)) => + (MOVBstoreconst [sc.addOffset32(off)] {s} ptr mem) + +// Merge address calculations into loads and stores. +// Offsets from SB must not be merged into unaligned memory accesses because +// loads/stores using PC-relative addressing directly must be aligned to the +// size of the target. +(MOVDload [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%8 == 0 && (off1+off2)%8 == 0)) => + (MOVDload [off1+off2] {mergeSym(sym1,sym2)} base mem) +(MOVWZload [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%4 == 0 && (off1+off2)%4 == 0)) => + (MOVWZload [off1+off2] {mergeSym(sym1,sym2)} base mem) +(MOVHZload [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%2 == 0 && (off1+off2)%2 == 0)) => + (MOVHZload [off1+off2] {mergeSym(sym1,sym2)} base mem) +(MOVBZload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (MOVBZload [off1+off2] {mergeSym(sym1,sym2)} base mem) +(FMOVSload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (FMOVSload [off1+off2] {mergeSym(sym1,sym2)} base mem) +(FMOVDload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (FMOVDload [off1+off2] {mergeSym(sym1,sym2)} base mem) + +(MOVWload [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%4 == 0 && (off1+off2)%4 == 0)) => + (MOVWload [off1+off2] {mergeSym(sym1,sym2)} base mem) +(MOVHload [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%2 == 0 && (off1+off2)%2 == 0)) => + (MOVHload [off1+off2] {mergeSym(sym1,sym2)} base mem) +(MOVBload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (MOVBload [off1+off2] {mergeSym(sym1,sym2)} base mem) + +(MOVDstore [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%8 == 0 && (off1+off2)%8 == 0)) => + (MOVDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) +(MOVWstore [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%4 == 0 && (off1+off2)%4 == 0)) => + (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) +(MOVHstore [off1] {sym1} (MOVDaddr <t> [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || (t.IsPtr() && t.Elem().Alignment()%2 == 0 && (off1+off2)%2 == 0)) => + (MOVHstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) +(MOVBstore [off1] {sym1} (MOVDaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) +(FMOVSstore [off1] {sym1} (MOVDaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (FMOVSstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) +(FMOVDstore [off1] {sym1} (MOVDaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => + (FMOVDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) + +(ADDload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (ADDload [o1+o2] {mergeSym(s1, s2)} x ptr mem) +(ADDWload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (ADDWload [o1+o2] {mergeSym(s1, s2)} x ptr mem) +(MULLDload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (MULLDload [o1+o2] {mergeSym(s1, s2)} x ptr mem) +(MULLWload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (MULLWload [o1+o2] {mergeSym(s1, s2)} x ptr mem) +(SUBload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (SUBload [o1+o2] {mergeSym(s1, s2)} x ptr mem) +(SUBWload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (SUBWload [o1+o2] {mergeSym(s1, s2)} x ptr mem) + +(ANDload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (ANDload [o1+o2] {mergeSym(s1, s2)} x ptr mem) +(ANDWload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (ANDWload [o1+o2] {mergeSym(s1, s2)} x ptr mem) +(ORload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (ORload [o1+o2] {mergeSym(s1, s2)} x ptr mem) +(ORWload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (ORWload [o1+o2] {mergeSym(s1, s2)} x ptr mem) +(XORload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (XORload [o1+o2] {mergeSym(s1, s2)} x ptr mem) +(XORWload [o1] {s1} x (MOVDaddr [o2] {s2} ptr) mem) && ptr.Op != OpSB && is20Bit(int64(o1)+int64(o2)) && canMergeSym(s1, s2) => (XORWload [o1+o2] {mergeSym(s1, s2)} x ptr mem) + +// Cannot store constant to SB directly (no 'move relative long immediate' instructions). +(MOVDstoreconst [sc] {sym1} (MOVDaddr [off] {sym2} ptr) mem) && ptr.Op != OpSB && canMergeSym(sym1, sym2) && sc.canAdd32(off) => + (MOVDstoreconst [sc.addOffset32(off)] {mergeSym(sym1, sym2)} ptr mem) +(MOVWstoreconst [sc] {sym1} (MOVDaddr [off] {sym2} ptr) mem) && ptr.Op != OpSB && canMergeSym(sym1, sym2) && sc.canAdd32(off) => + (MOVWstoreconst [sc.addOffset32(off)] {mergeSym(sym1, sym2)} ptr mem) +(MOVHstoreconst [sc] {sym1} (MOVDaddr [off] {sym2} ptr) mem) && ptr.Op != OpSB && canMergeSym(sym1, sym2) && sc.canAdd32(off) => + (MOVHstoreconst [sc.addOffset32(off)] {mergeSym(sym1, sym2)} ptr mem) +(MOVBstoreconst [sc] {sym1} (MOVDaddr [off] {sym2} ptr) mem) && ptr.Op != OpSB && canMergeSym(sym1, sym2) && sc.canAdd32(off) => + (MOVBstoreconst [sc.addOffset32(off)] {mergeSym(sym1, sym2)} ptr mem) + +// MOVDaddr into MOVDaddridx +(MOVDaddridx [off1] {sym1} (MOVDaddr [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB => + (MOVDaddridx [off1+off2] {mergeSym(sym1,sym2)} x y) +(MOVDaddridx [off1] {sym1} x (MOVDaddr [off2] {sym2} y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && y.Op != OpSB => + (MOVDaddridx [off1+off2] {mergeSym(sym1,sym2)} x y) + +// Absorb InvertFlags into branches. +(BRC {c} (InvertFlags cmp) yes no) => (BRC {c.ReverseComparison()} cmp yes no) + +// Constant comparisons. +(CMPconst (MOVDconst [x]) [y]) && x==int64(y) => (FlagEQ) +(CMPconst (MOVDconst [x]) [y]) && x<int64(y) => (FlagLT) +(CMPconst (MOVDconst [x]) [y]) && x>int64(y) => (FlagGT) +(CMPUconst (MOVDconst [x]) [y]) && uint64(x)==uint64(y) => (FlagEQ) +(CMPUconst (MOVDconst [x]) [y]) && uint64(x)<uint64(y) => (FlagLT) +(CMPUconst (MOVDconst [x]) [y]) && uint64(x)>uint64(y) => (FlagGT) + +(CMPWconst (MOVDconst [x]) [y]) && int32(x)==int32(y) => (FlagEQ) +(CMPWconst (MOVDconst [x]) [y]) && int32(x)<int32(y) => (FlagLT) +(CMPWconst (MOVDconst [x]) [y]) && int32(x)>int32(y) => (FlagGT) +(CMPWUconst (MOVDconst [x]) [y]) && uint32(x)==uint32(y) => (FlagEQ) +(CMPWUconst (MOVDconst [x]) [y]) && uint32(x)<uint32(y) => (FlagLT) +(CMPWUconst (MOVDconst [x]) [y]) && uint32(x)>uint32(y) => (FlagGT) + +(CMP(W|WU)const (MOVBZreg _) [c]) && 0xff < c => (FlagLT) +(CMP(W|WU)const (MOVHZreg _) [c]) && 0xffff < c => (FlagLT) + +(CMPconst (SRDconst _ [c]) [n]) && c > 0 && n < 0 => (FlagGT) +(CMPWconst (SRWconst _ [c]) [n]) && c > 0 && n < 0 => (FlagGT) + +(CMPUconst (SRDconst _ [c]) [n]) && c > 0 && c < 64 && (1<<uint(64-c)) <= uint64(n) => (FlagLT) +(CMPWUconst (SRWconst _ [c]) [n]) && c > 0 && c < 32 && (1<<uint(32-c)) <= uint32(n) => (FlagLT) + +(CMPWconst (ANDWconst _ [m]) [n]) && int32(m) >= 0 && int32(m) < int32(n) => (FlagLT) +(CMPWUconst (ANDWconst _ [m]) [n]) && uint32(m) < uint32(n) => (FlagLT) + +(CMPconst (RISBGZ x {r}) [c]) && c > 0 && r.OutMask() < uint64(c) => (FlagLT) +(CMPUconst (RISBGZ x {r}) [c]) && r.OutMask() < uint64(uint32(c)) => (FlagLT) + +// Constant compare-and-branch with immediate. +(CGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal != 0 && int64(x) == int64(y) => (First yes no) +(CGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less != 0 && int64(x) < int64(y) => (First yes no) +(CGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater != 0 && int64(x) > int64(y) => (First yes no) +(CIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal != 0 && int32(x) == int32(y) => (First yes no) +(CIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less != 0 && int32(x) < int32(y) => (First yes no) +(CIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater != 0 && int32(x) > int32(y) => (First yes no) +(CLGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal != 0 && uint64(x) == uint64(y) => (First yes no) +(CLGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less != 0 && uint64(x) < uint64(y) => (First yes no) +(CLGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater != 0 && uint64(x) > uint64(y) => (First yes no) +(CLIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal != 0 && uint32(x) == uint32(y) => (First yes no) +(CLIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less != 0 && uint32(x) < uint32(y) => (First yes no) +(CLIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater != 0 && uint32(x) > uint32(y) => (First yes no) +(CGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal == 0 && int64(x) == int64(y) => (First no yes) +(CGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less == 0 && int64(x) < int64(y) => (First no yes) +(CGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater == 0 && int64(x) > int64(y) => (First no yes) +(CIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal == 0 && int32(x) == int32(y) => (First no yes) +(CIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less == 0 && int32(x) < int32(y) => (First no yes) +(CIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater == 0 && int32(x) > int32(y) => (First no yes) +(CLGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal == 0 && uint64(x) == uint64(y) => (First no yes) +(CLGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less == 0 && uint64(x) < uint64(y) => (First no yes) +(CLGIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater == 0 && uint64(x) > uint64(y) => (First no yes) +(CLIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Equal == 0 && uint32(x) == uint32(y) => (First no yes) +(CLIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Less == 0 && uint32(x) < uint32(y) => (First no yes) +(CLIJ {c} (MOVDconst [x]) [y] yes no) && c&s390x.Greater == 0 && uint32(x) > uint32(y) => (First no yes) + +// Constant compare-and-branch with immediate when unsigned comparison with zero. +(C(L|LG)IJ {s390x.GreaterOrEqual} _ [0] yes no) => (First yes no) +(C(L|LG)IJ {s390x.Less} _ [0] yes no) => (First no yes) + +// Constant compare-and-branch when operands match. +(C(GR|R|LGR|LR)J {c} x y yes no) && x == y && c&s390x.Equal != 0 => (First yes no) +(C(GR|R|LGR|LR)J {c} x y yes no) && x == y && c&s390x.Equal == 0 => (First no yes) + +// Convert 64-bit comparisons to 32-bit comparisons and signed comparisons +// to unsigned comparisons. +// Helps simplify constant comparison detection. +(CM(P|PU)const (MOV(W|WZ)reg x) [c]) => (CMP(W|WU)const x [c]) +(CM(P|P|PU|PU)const x:(MOV(H|HZ|H|HZ)reg _) [c]) => (CMP(W|W|WU|WU)const x [c]) +(CM(P|P|PU|PU)const x:(MOV(B|BZ|B|BZ)reg _) [c]) => (CMP(W|W|WU|WU)const x [c]) +(CMPconst (MOV(WZ|W)reg x:(ANDWconst [m] _)) [c]) && int32(m) >= 0 && c >= 0 => (CMPWUconst x [c]) +(CMPUconst (MOV(WZ|W)reg x:(ANDWconst [m] _)) [c]) && int32(m) >= 0 => (CMPWUconst x [c]) +(CMPconst x:(SRDconst _ [c]) [n]) && c > 0 && n >= 0 => (CMPUconst x [n]) +(CMPWconst x:(SRWconst _ [c]) [n]) && c > 0 && n >= 0 => (CMPWUconst x [n]) + +// Absorb sign and zero extensions into 32-bit comparisons. +(CMP(W|W|WU|WU) x (MOV(W|WZ|W|WZ)reg y)) => (CMP(W|W|WU|WU) x y) +(CMP(W|W|WU|WU) (MOV(W|WZ|W|WZ)reg x) y) => (CMP(W|W|WU|WU) x y) +(CMP(W|W|WU|WU)const (MOV(W|WZ|W|WZ)reg x) [c]) => (CMP(W|W|WU|WU)const x [c]) + +// Absorb flag constants into branches. +(BRC {c} (FlagEQ) yes no) && c&s390x.Equal != 0 => (First yes no) +(BRC {c} (FlagLT) yes no) && c&s390x.Less != 0 => (First yes no) +(BRC {c} (FlagGT) yes no) && c&s390x.Greater != 0 => (First yes no) +(BRC {c} (FlagOV) yes no) && c&s390x.Unordered != 0 => (First yes no) + +(BRC {c} (FlagEQ) yes no) && c&s390x.Equal == 0 => (First no yes) +(BRC {c} (FlagLT) yes no) && c&s390x.Less == 0 => (First no yes) +(BRC {c} (FlagGT) yes no) && c&s390x.Greater == 0 => (First no yes) +(BRC {c} (FlagOV) yes no) && c&s390x.Unordered == 0 => (First no yes) + +// Absorb flag constants into SETxx ops. +(LOCGR {c} _ x (FlagEQ)) && c&s390x.Equal != 0 => x +(LOCGR {c} _ x (FlagLT)) && c&s390x.Less != 0 => x +(LOCGR {c} _ x (FlagGT)) && c&s390x.Greater != 0 => x +(LOCGR {c} _ x (FlagOV)) && c&s390x.Unordered != 0 => x + +(LOCGR {c} x _ (FlagEQ)) && c&s390x.Equal == 0 => x +(LOCGR {c} x _ (FlagLT)) && c&s390x.Less == 0 => x +(LOCGR {c} x _ (FlagGT)) && c&s390x.Greater == 0 => x +(LOCGR {c} x _ (FlagOV)) && c&s390x.Unordered == 0 => x + +// Remove redundant *const ops +(ADDconst [0] x) => x +(ADDWconst [c] x) && int32(c)==0 => x +(SUBconst [0] x) => x +(SUBWconst [c] x) && int32(c) == 0 => x +(ANDconst [0] _) => (MOVDconst [0]) +(ANDWconst [c] _) && int32(c)==0 => (MOVDconst [0]) +(ANDconst [-1] x) => x +(ANDWconst [c] x) && int32(c)==-1 => x +(ORconst [0] x) => x +(ORWconst [c] x) && int32(c)==0 => x +(ORconst [-1] _) => (MOVDconst [-1]) +(ORWconst [c] _) && int32(c)==-1 => (MOVDconst [-1]) +(XORconst [0] x) => x +(XORWconst [c] x) && int32(c)==0 => x + +// Shifts by zero (may be inserted during multiplication strength reduction). +((SLD|SLW|SRD|SRW|SRAD|SRAW)const x [0]) => x + +// Convert constant subtracts to constant adds. +(SUBconst [c] x) && c != -(1<<31) => (ADDconst [-c] x) +(SUBWconst [c] x) => (ADDWconst [-int32(c)] x) + +// generic constant folding +// TODO: more of this +(ADDconst [c] (MOVDconst [d])) => (MOVDconst [int64(c)+d]) +(ADDWconst [c] (MOVDconst [d])) => (MOVDconst [int64(c)+d]) +(ADDconst [c] (ADDconst [d] x)) && is32Bit(int64(c)+int64(d)) => (ADDconst [c+d] x) +(ADDWconst [c] (ADDWconst [d] x)) => (ADDWconst [int32(c+d)] x) +(SUBconst (MOVDconst [d]) [c]) => (MOVDconst [d-int64(c)]) +(SUBconst (SUBconst x [d]) [c]) && is32Bit(-int64(c)-int64(d)) => (ADDconst [-c-d] x) +(SRADconst [c] (MOVDconst [d])) => (MOVDconst [d>>uint64(c)]) +(SRAWconst [c] (MOVDconst [d])) => (MOVDconst [int64(int32(d))>>uint64(c)]) +(NEG (MOVDconst [c])) => (MOVDconst [-c]) +(NEGW (MOVDconst [c])) => (MOVDconst [int64(int32(-c))]) +(MULLDconst [c] (MOVDconst [d])) => (MOVDconst [int64(c)*d]) +(MULLWconst [c] (MOVDconst [d])) => (MOVDconst [int64(c*int32(d))]) +(AND (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c&d]) +(ANDconst [c] (MOVDconst [d])) => (MOVDconst [c&d]) +(ANDWconst [c] (MOVDconst [d])) => (MOVDconst [int64(c)&d]) +(OR (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c|d]) +(ORconst [c] (MOVDconst [d])) => (MOVDconst [c|d]) +(ORWconst [c] (MOVDconst [d])) => (MOVDconst [int64(c)|d]) +(XOR (MOVDconst [c]) (MOVDconst [d])) => (MOVDconst [c^d]) +(XORconst [c] (MOVDconst [d])) => (MOVDconst [c^d]) +(XORWconst [c] (MOVDconst [d])) => (MOVDconst [int64(c)^d]) +(LoweredRound32F x:(FMOVSconst)) => x +(LoweredRound64F x:(FMOVDconst)) => x + +// generic simplifications +// TODO: more of this +(ADD x (NEG y)) => (SUB x y) +(ADDW x (NEGW y)) => (SUBW x y) +(SUB x x) => (MOVDconst [0]) +(SUBW x x) => (MOVDconst [0]) +(AND x x) => x +(ANDW x x) => x +(OR x x) => x +(ORW x x) => x +(XOR x x) => (MOVDconst [0]) +(XORW x x) => (MOVDconst [0]) +(NEG (ADDconst [c] (NEG x))) && c != -(1<<31) => (ADDconst [-c] x) +(MOVBZreg (ANDWconst [m] x)) => (MOVWZreg (ANDWconst <typ.UInt32> [int32( uint8(m))] x)) +(MOVHZreg (ANDWconst [m] x)) => (MOVWZreg (ANDWconst <typ.UInt32> [int32(uint16(m))] x)) +(MOVBreg (ANDWconst [m] x)) && int8(m) >= 0 => (MOVWZreg (ANDWconst <typ.UInt32> [int32( uint8(m))] x)) +(MOVHreg (ANDWconst [m] x)) && int16(m) >= 0 => (MOVWZreg (ANDWconst <typ.UInt32> [int32(uint16(m))] x)) + +// carry flag generation +// (only constant fold carry of zero) +(Select1 (ADDCconst (MOVDconst [c]) [d])) + && uint64(c+int64(d)) >= uint64(c) && c+int64(d) == 0 + => (FlagEQ) +(Select1 (ADDCconst (MOVDconst [c]) [d])) + && uint64(c+int64(d)) >= uint64(c) && c+int64(d) != 0 + => (FlagLT) + +// borrow flag generation +// (only constant fold borrow of zero) +(Select1 (SUBC (MOVDconst [c]) (MOVDconst [d]))) + && uint64(d) <= uint64(c) && c-d == 0 + => (FlagGT) +(Select1 (SUBC (MOVDconst [c]) (MOVDconst [d]))) + && uint64(d) <= uint64(c) && c-d != 0 + => (FlagOV) + +// add with carry +(ADDE x y (FlagEQ)) => (ADDC x y) +(ADDE x y (FlagLT)) => (ADDC x y) +(ADDC x (MOVDconst [c])) && is16Bit(c) => (ADDCconst x [int16(c)]) +(Select0 (ADDCconst (MOVDconst [c]) [d])) => (MOVDconst [c+int64(d)]) + +// subtract with borrow +(SUBE x y (FlagGT)) => (SUBC x y) +(SUBE x y (FlagOV)) => (SUBC x y) +(Select0 (SUBC (MOVDconst [c]) (MOVDconst [d]))) => (MOVDconst [c-d]) + +// collapse carry chain +(ADDE x y (Select1 (ADDCconst [-1] (Select0 (ADDE (MOVDconst [0]) (MOVDconst [0]) c))))) + => (ADDE x y c) + +// collapse borrow chain +(SUBE x y (Select1 (SUBC (MOVDconst [0]) (NEG (Select0 (SUBE (MOVDconst [0]) (MOVDconst [0]) c)))))) + => (SUBE x y c) + +// branch on carry +(C(G|LG)IJ {s390x.Equal} (Select0 (ADDE (MOVDconst [0]) (MOVDconst [0]) carry)) [0]) => (BRC {s390x.NoCarry} carry) +(C(G|LG)IJ {s390x.Equal} (Select0 (ADDE (MOVDconst [0]) (MOVDconst [0]) carry)) [1]) => (BRC {s390x.Carry} carry) +(C(G|LG)IJ {s390x.LessOrGreater} (Select0 (ADDE (MOVDconst [0]) (MOVDconst [0]) carry)) [0]) => (BRC {s390x.Carry} carry) +(C(G|LG)IJ {s390x.LessOrGreater} (Select0 (ADDE (MOVDconst [0]) (MOVDconst [0]) carry)) [1]) => (BRC {s390x.NoCarry} carry) +(C(G|LG)IJ {s390x.Greater} (Select0 (ADDE (MOVDconst [0]) (MOVDconst [0]) carry)) [0]) => (BRC {s390x.Carry} carry) + +// branch on borrow +(C(G|LG)IJ {s390x.Equal} (NEG (Select0 (SUBE (MOVDconst [0]) (MOVDconst [0]) borrow))) [0]) => (BRC {s390x.NoBorrow} borrow) +(C(G|LG)IJ {s390x.Equal} (NEG (Select0 (SUBE (MOVDconst [0]) (MOVDconst [0]) borrow))) [1]) => (BRC {s390x.Borrow} borrow) +(C(G|LG)IJ {s390x.LessOrGreater} (NEG (Select0 (SUBE (MOVDconst [0]) (MOVDconst [0]) borrow))) [0]) => (BRC {s390x.Borrow} borrow) +(C(G|LG)IJ {s390x.LessOrGreater} (NEG (Select0 (SUBE (MOVDconst [0]) (MOVDconst [0]) borrow))) [1]) => (BRC {s390x.NoBorrow} borrow) +(C(G|LG)IJ {s390x.Greater} (NEG (Select0 (SUBE (MOVDconst [0]) (MOVDconst [0]) borrow))) [0]) => (BRC {s390x.Borrow} borrow) + +// fused multiply-add +(Select0 (F(ADD|SUB) (FMUL y z) x)) && x.Block.Func.useFMA(v) => (FM(ADD|SUB) x y z) +(Select0 (F(ADDS|SUBS) (FMULS y z) x)) && x.Block.Func.useFMA(v) => (FM(ADDS|SUBS) x y z) + +// Convert floating point comparisons against zero into 'load and test' instructions. +(F(CMP|CMPS) x (FMOV(D|S)const [0.0])) => (LT(D|E)BR x) +(F(CMP|CMPS) (FMOV(D|S)const [0.0]) x) => (InvertFlags (LT(D|E)BR <v.Type> x)) + +// FSUB, FSUBS, FADD, FADDS now produce a condition code representing the +// comparison of the result with 0.0. If a compare with zero instruction +// (e.g. LTDBR) is following one of those instructions, we can use the +// generated flag and remove the comparison instruction. +// Note: when inserting Select1 ops we need to ensure they are in the +// same block as their argument. We could also use @x.Block for this +// but moving the flag generating value to a different block seems to +// increase the likelihood that the flags value will have to be regenerated +// by flagalloc which is not what we want. +(LTDBR (Select0 x:(F(ADD|SUB) _ _))) && b == x.Block => (Select1 x) +(LTEBR (Select0 x:(F(ADDS|SUBS) _ _))) && b == x.Block => (Select1 x) + +// Fold memory operations into operations. +// Exclude global data (SB) because these instructions cannot handle relative addresses. +// TODO(mundaym): indexed versions of these? +((ADD|SUB|MULLD|AND|OR|XOR) <t> x g:(MOVDload [off] {sym} ptr mem)) + && ptr.Op != OpSB + && is20Bit(int64(off)) + && canMergeLoadClobber(v, g, x) + && clobber(g) + => ((ADD|SUB|MULLD|AND|OR|XOR)load <t> [off] {sym} x ptr mem) +((ADD|SUB|MULL|AND|OR|XOR)W <t> x g:(MOVWload [off] {sym} ptr mem)) + && ptr.Op != OpSB + && is20Bit(int64(off)) + && canMergeLoadClobber(v, g, x) + && clobber(g) + => ((ADD|SUB|MULL|AND|OR|XOR)Wload <t> [off] {sym} x ptr mem) +((ADD|SUB|MULL|AND|OR|XOR)W <t> x g:(MOVWZload [off] {sym} ptr mem)) + && ptr.Op != OpSB + && is20Bit(int64(off)) + && canMergeLoadClobber(v, g, x) + && clobber(g) + => ((ADD|SUB|MULL|AND|OR|XOR)Wload <t> [off] {sym} x ptr mem) + +// Combine constant stores into larger (unaligned) stores. +// Avoid SB because constant stores to relative offsets are +// emulated by the assembler and also can't handle unaligned offsets. +(MOVBstoreconst [c] {s} p x:(MOVBstoreconst [a] {s} p mem)) + && p.Op != OpSB + && x.Uses == 1 + && a.Off() + 1 == c.Off() + && clobber(x) + => (MOVHstoreconst [makeValAndOff(c.Val()&0xff | a.Val()<<8, a.Off())] {s} p mem) +(MOVHstoreconst [c] {s} p x:(MOVHstoreconst [a] {s} p mem)) + && p.Op != OpSB + && x.Uses == 1 + && a.Off() + 2 == c.Off() + && clobber(x) + => (MOVWstore [a.Off()] {s} p (MOVDconst [int64(c.Val()&0xffff | a.Val()<<16)]) mem) +(MOVWstoreconst [c] {s} p x:(MOVWstoreconst [a] {s} p mem)) + && p.Op != OpSB + && x.Uses == 1 + && a.Off() + 4 == c.Off() + && clobber(x) + => (MOVDstore [a.Off()] {s} p (MOVDconst [c.Val64()&0xffffffff | a.Val64()<<32]) mem) + +// Combine stores into larger (unaligned) stores. +// It doesn't work on global data (based on SB) because stores with relative addressing +// require that the memory operand be aligned. +(MOVBstore [i] {s} p w x:(MOVBstore [i-1] {s} p (SRDconst [8] w) mem)) + && p.Op != OpSB + && x.Uses == 1 + && clobber(x) + => (MOVHstore [i-1] {s} p w mem) +(MOVBstore [i] {s} p w0:(SRDconst [j] w) x:(MOVBstore [i-1] {s} p (SRDconst [j+8] w) mem)) + && p.Op != OpSB + && x.Uses == 1 + && clobber(x) + => (MOVHstore [i-1] {s} p w0 mem) +(MOVBstore [i] {s} p w x:(MOVBstore [i-1] {s} p (SRWconst [8] w) mem)) + && p.Op != OpSB + && x.Uses == 1 + && clobber(x) + => (MOVHstore [i-1] {s} p w mem) +(MOVBstore [i] {s} p w0:(SRWconst [j] w) x:(MOVBstore [i-1] {s} p (SRWconst [j+8] w) mem)) + && p.Op != OpSB + && x.Uses == 1 + && clobber(x) + => (MOVHstore [i-1] {s} p w0 mem) +(MOVHstore [i] {s} p w x:(MOVHstore [i-2] {s} p (SRDconst [16] w) mem)) + && p.Op != OpSB + && x.Uses == 1 + && clobber(x) + => (MOVWstore [i-2] {s} p w mem) +(MOVHstore [i] {s} p w0:(SRDconst [j] w) x:(MOVHstore [i-2] {s} p (SRDconst [j+16] w) mem)) + && p.Op != OpSB + && x.Uses == 1 + && clobber(x) + => (MOVWstore [i-2] {s} p w0 mem) +(MOVHstore [i] {s} p w x:(MOVHstore [i-2] {s} p (SRWconst [16] w) mem)) + && p.Op != OpSB + && x.Uses == 1 + && clobber(x) + => (MOVWstore [i-2] {s} p w mem) +(MOVHstore [i] {s} p w0:(SRWconst [j] w) x:(MOVHstore [i-2] {s} p (SRWconst [j+16] w) mem)) + && p.Op != OpSB + && x.Uses == 1 + && clobber(x) + => (MOVWstore [i-2] {s} p w0 mem) +(MOVWstore [i] {s} p (SRDconst [32] w) x:(MOVWstore [i-4] {s} p w mem)) + && p.Op != OpSB + && x.Uses == 1 + && clobber(x) + => (MOVDstore [i-4] {s} p w mem) +(MOVWstore [i] {s} p w0:(SRDconst [j] w) x:(MOVWstore [i-4] {s} p (SRDconst [j+32] w) mem)) + && p.Op != OpSB + && x.Uses == 1 + && clobber(x) + => (MOVDstore [i-4] {s} p w0 mem) + +// Combine stores into larger (unaligned) stores with the bytes reversed (little endian). +// Store-with-bytes-reversed instructions do not support relative memory addresses, +// so these stores can't operate on global data (SB). +(MOVBstore [i] {s} p (SRDconst [8] w) x:(MOVBstore [i-1] {s} p w mem)) + && p.Op != OpSB + && x.Uses == 1 + && clobber(x) + => (MOVHBRstore [i-1] {s} p w mem) +(MOVBstore [i] {s} p (SRDconst [j] w) x:(MOVBstore [i-1] {s} p w0:(SRDconst [j-8] w) mem)) + && p.Op != OpSB + && x.Uses == 1 + && clobber(x) + => (MOVHBRstore [i-1] {s} p w0 mem) +(MOVBstore [i] {s} p (SRWconst [8] w) x:(MOVBstore [i-1] {s} p w mem)) + && p.Op != OpSB + && x.Uses == 1 + && clobber(x) + => (MOVHBRstore [i-1] {s} p w mem) +(MOVBstore [i] {s} p (SRWconst [j] w) x:(MOVBstore [i-1] {s} p w0:(SRWconst [j-8] w) mem)) + && p.Op != OpSB + && x.Uses == 1 + && clobber(x) + => (MOVHBRstore [i-1] {s} p w0 mem) +(MOVHBRstore [i] {s} p (SRDconst [16] w) x:(MOVHBRstore [i-2] {s} p w mem)) + && x.Uses == 1 + && clobber(x) + => (MOVWBRstore [i-2] {s} p w mem) +(MOVHBRstore [i] {s} p (SRDconst [j] w) x:(MOVHBRstore [i-2] {s} p w0:(SRDconst [j-16] w) mem)) + && x.Uses == 1 + && clobber(x) + => (MOVWBRstore [i-2] {s} p w0 mem) +(MOVHBRstore [i] {s} p (SRWconst [16] w) x:(MOVHBRstore [i-2] {s} p w mem)) + && x.Uses == 1 + && clobber(x) + => (MOVWBRstore [i-2] {s} p w mem) +(MOVHBRstore [i] {s} p (SRWconst [j] w) x:(MOVHBRstore [i-2] {s} p w0:(SRWconst [j-16] w) mem)) + && x.Uses == 1 + && clobber(x) + => (MOVWBRstore [i-2] {s} p w0 mem) +(MOVWBRstore [i] {s} p (SRDconst [32] w) x:(MOVWBRstore [i-4] {s} p w mem)) + && x.Uses == 1 + && clobber(x) + => (MOVDBRstore [i-4] {s} p w mem) +(MOVWBRstore [i] {s} p (SRDconst [j] w) x:(MOVWBRstore [i-4] {s} p w0:(SRDconst [j-32] w) mem)) + && x.Uses == 1 + && clobber(x) + => (MOVDBRstore [i-4] {s} p w0 mem) + +(MOVBstore [7] {s} p1 (SRDconst w) + x1:(MOVHBRstore [5] {s} p1 (SRDconst w) + x2:(MOVWBRstore [1] {s} p1 (SRDconst w) + x3:(MOVBstore [0] {s} p1 w mem)))) + && x1.Uses == 1 + && x2.Uses == 1 + && x3.Uses == 1 + && clobber(x1, x2, x3) + => (MOVDBRstore {s} p1 w mem) + +// Combining byte loads into larger (unaligned) loads. + +// Big-endian loads + +(ORW x1:(MOVBZload [i1] {s} p mem) + sh:(SLWconst [8] x0:(MOVBZload [i0] {s} p mem))) + && i1 == i0+1 + && p.Op != OpSB + && x0.Uses == 1 + && x1.Uses == 1 + && sh.Uses == 1 + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, sh) + => @mergePoint(b,x0,x1) (MOVHZload [i0] {s} p mem) + +(OR x1:(MOVBZload [i1] {s} p mem) + sh:(SLDconst [8] x0:(MOVBZload [i0] {s} p mem))) + && i1 == i0+1 + && p.Op != OpSB + && x0.Uses == 1 + && x1.Uses == 1 + && sh.Uses == 1 + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, sh) + => @mergePoint(b,x0,x1) (MOVHZload [i0] {s} p mem) + +(ORW x1:(MOVHZload [i1] {s} p mem) + sh:(SLWconst [16] x0:(MOVHZload [i0] {s} p mem))) + && i1 == i0+2 + && p.Op != OpSB + && x0.Uses == 1 + && x1.Uses == 1 + && sh.Uses == 1 + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, sh) + => @mergePoint(b,x0,x1) (MOVWZload [i0] {s} p mem) + +(OR x1:(MOVHZload [i1] {s} p mem) + sh:(SLDconst [16] x0:(MOVHZload [i0] {s} p mem))) + && i1 == i0+2 + && p.Op != OpSB + && x0.Uses == 1 + && x1.Uses == 1 + && sh.Uses == 1 + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, sh) + => @mergePoint(b,x0,x1) (MOVWZload [i0] {s} p mem) + +(OR x1:(MOVWZload [i1] {s} p mem) + sh:(SLDconst [32] x0:(MOVWZload [i0] {s} p mem))) + && i1 == i0+4 + && p.Op != OpSB + && x0.Uses == 1 + && x1.Uses == 1 + && sh.Uses == 1 + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, sh) + => @mergePoint(b,x0,x1) (MOVDload [i0] {s} p mem) + +(ORW + s0:(SLWconst [j0] x0:(MOVBZload [i0] {s} p mem)) + or:(ORW + s1:(SLWconst [j1] x1:(MOVBZload [i1] {s} p mem)) + y)) + && i1 == i0+1 + && j1 == j0-8 + && j1 % 16 == 0 + && x0.Uses == 1 + && x1.Uses == 1 + && s0.Uses == 1 + && s1.Uses == 1 + && or.Uses == 1 + && mergePoint(b,x0,x1,y) != nil + && clobber(x0, x1, s0, s1, or) + => @mergePoint(b,x0,x1,y) (ORW <v.Type> (SLWconst <v.Type> [j1] (MOVHZload [i0] {s} p mem)) y) + +(OR + s0:(SLDconst [j0] x0:(MOVBZload [i0] {s} p mem)) + or:(OR + s1:(SLDconst [j1] x1:(MOVBZload [i1] {s} p mem)) + y)) + && i1 == i0+1 + && j1 == j0-8 + && j1 % 16 == 0 + && x0.Uses == 1 + && x1.Uses == 1 + && s0.Uses == 1 + && s1.Uses == 1 + && or.Uses == 1 + && mergePoint(b,x0,x1,y) != nil + && clobber(x0, x1, s0, s1, or) + => @mergePoint(b,x0,x1,y) (OR <v.Type> (SLDconst <v.Type> [j1] (MOVHZload [i0] {s} p mem)) y) + +(OR + s0:(SLDconst [j0] x0:(MOVHZload [i0] {s} p mem)) + or:(OR + s1:(SLDconst [j1] x1:(MOVHZload [i1] {s} p mem)) + y)) + && i1 == i0+2 + && j1 == j0-16 + && j1 % 32 == 0 + && x0.Uses == 1 + && x1.Uses == 1 + && s0.Uses == 1 + && s1.Uses == 1 + && or.Uses == 1 + && mergePoint(b,x0,x1,y) != nil + && clobber(x0, x1, s0, s1, or) + => @mergePoint(b,x0,x1,y) (OR <v.Type> (SLDconst <v.Type> [j1] (MOVWZload [i0] {s} p mem)) y) + +// Little-endian loads + +(ORW x0:(MOVBZload [i0] {s} p mem) + sh:(SLWconst [8] x1:(MOVBZload [i1] {s} p mem))) + && p.Op != OpSB + && i1 == i0+1 + && x0.Uses == 1 + && x1.Uses == 1 + && sh.Uses == 1 + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, sh) + => @mergePoint(b,x0,x1) (MOVHZreg (MOVHBRload [i0] {s} p mem)) + +(OR x0:(MOVBZload [i0] {s} p mem) + sh:(SLDconst [8] x1:(MOVBZload [i1] {s} p mem))) + && p.Op != OpSB + && i1 == i0+1 + && x0.Uses == 1 + && x1.Uses == 1 + && sh.Uses == 1 + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, sh) + => @mergePoint(b,x0,x1) (MOVHZreg (MOVHBRload [i0] {s} p mem)) + +(ORW r0:(MOVHZreg x0:(MOVHBRload [i0] {s} p mem)) + sh:(SLWconst [16] r1:(MOVHZreg x1:(MOVHBRload [i1] {s} p mem)))) + && i1 == i0+2 + && x0.Uses == 1 + && x1.Uses == 1 + && r0.Uses == 1 + && r1.Uses == 1 + && sh.Uses == 1 + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, r0, r1, sh) + => @mergePoint(b,x0,x1) (MOVWBRload [i0] {s} p mem) + +(OR r0:(MOVHZreg x0:(MOVHBRload [i0] {s} p mem)) + sh:(SLDconst [16] r1:(MOVHZreg x1:(MOVHBRload [i1] {s} p mem)))) + && i1 == i0+2 + && x0.Uses == 1 + && x1.Uses == 1 + && r0.Uses == 1 + && r1.Uses == 1 + && sh.Uses == 1 + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, r0, r1, sh) + => @mergePoint(b,x0,x1) (MOVWZreg (MOVWBRload [i0] {s} p mem)) + +(OR r0:(MOVWZreg x0:(MOVWBRload [i0] {s} p mem)) + sh:(SLDconst [32] r1:(MOVWZreg x1:(MOVWBRload [i1] {s} p mem)))) + && i1 == i0+4 + && x0.Uses == 1 + && x1.Uses == 1 + && r0.Uses == 1 + && r1.Uses == 1 + && sh.Uses == 1 + && mergePoint(b,x0,x1) != nil + && clobber(x0, x1, r0, r1, sh) + => @mergePoint(b,x0,x1) (MOVDBRload [i0] {s} p mem) + +(ORW + s1:(SLWconst [j1] x1:(MOVBZload [i1] {s} p mem)) + or:(ORW + s0:(SLWconst [j0] x0:(MOVBZload [i0] {s} p mem)) + y)) + && p.Op != OpSB + && i1 == i0+1 + && j1 == j0+8 + && j0 % 16 == 0 + && x0.Uses == 1 + && x1.Uses == 1 + && s0.Uses == 1 + && s1.Uses == 1 + && or.Uses == 1 + && mergePoint(b,x0,x1,y) != nil + && clobber(x0, x1, s0, s1, or) + => @mergePoint(b,x0,x1,y) (ORW <v.Type> (SLWconst <v.Type> [j0] (MOVHZreg (MOVHBRload [i0] {s} p mem))) y) + +(OR + s1:(SLDconst [j1] x1:(MOVBZload [i1] {s} p mem)) + or:(OR + s0:(SLDconst [j0] x0:(MOVBZload [i0] {s} p mem)) + y)) + && p.Op != OpSB + && i1 == i0+1 + && j1 == j0+8 + && j0 % 16 == 0 + && x0.Uses == 1 + && x1.Uses == 1 + && s0.Uses == 1 + && s1.Uses == 1 + && or.Uses == 1 + && mergePoint(b,x0,x1,y) != nil + && clobber(x0, x1, s0, s1, or) + => @mergePoint(b,x0,x1,y) (OR <v.Type> (SLDconst <v.Type> [j0] (MOVHZreg (MOVHBRload [i0] {s} p mem))) y) + +(OR + s1:(SLDconst [j1] r1:(MOVHZreg x1:(MOVHBRload [i1] {s} p mem))) + or:(OR + s0:(SLDconst [j0] r0:(MOVHZreg x0:(MOVHBRload [i0] {s} p mem))) + y)) + && i1 == i0+2 + && j1 == j0+16 + && j0 % 32 == 0 + && x0.Uses == 1 + && x1.Uses == 1 + && r0.Uses == 1 + && r1.Uses == 1 + && s0.Uses == 1 + && s1.Uses == 1 + && or.Uses == 1 + && mergePoint(b,x0,x1,y) != nil + && clobber(x0, x1, r0, r1, s0, s1, or) + => @mergePoint(b,x0,x1,y) (OR <v.Type> (SLDconst <v.Type> [j0] (MOVWZreg (MOVWBRload [i0] {s} p mem))) y) + +// Combine stores into store multiples. +// 32-bit +(MOVWstore [i] {s} p w1 x:(MOVWstore [i-4] {s} p w0 mem)) + && p.Op != OpSB + && x.Uses == 1 + && is20Bit(int64(i)-4) + && clobber(x) + => (STM2 [i-4] {s} p w0 w1 mem) +(MOVWstore [i] {s} p w2 x:(STM2 [i-8] {s} p w0 w1 mem)) + && x.Uses == 1 + && is20Bit(int64(i)-8) + && clobber(x) + => (STM3 [i-8] {s} p w0 w1 w2 mem) +(MOVWstore [i] {s} p w3 x:(STM3 [i-12] {s} p w0 w1 w2 mem)) + && x.Uses == 1 + && is20Bit(int64(i)-12) + && clobber(x) + => (STM4 [i-12] {s} p w0 w1 w2 w3 mem) +(STM2 [i] {s} p w2 w3 x:(STM2 [i-8] {s} p w0 w1 mem)) + && x.Uses == 1 + && is20Bit(int64(i)-8) + && clobber(x) + => (STM4 [i-8] {s} p w0 w1 w2 w3 mem) +// 64-bit +(MOVDstore [i] {s} p w1 x:(MOVDstore [i-8] {s} p w0 mem)) + && p.Op != OpSB + && x.Uses == 1 + && is20Bit(int64(i)-8) + && clobber(x) + => (STMG2 [i-8] {s} p w0 w1 mem) +(MOVDstore [i] {s} p w2 x:(STMG2 [i-16] {s} p w0 w1 mem)) + && x.Uses == 1 + && is20Bit(int64(i)-16) + && clobber(x) + => (STMG3 [i-16] {s} p w0 w1 w2 mem) +(MOVDstore [i] {s} p w3 x:(STMG3 [i-24] {s} p w0 w1 w2 mem)) + && x.Uses == 1 + && is20Bit(int64(i)-24) + && clobber(x) + => (STMG4 [i-24] {s} p w0 w1 w2 w3 mem) +(STMG2 [i] {s} p w2 w3 x:(STMG2 [i-16] {s} p w0 w1 mem)) + && x.Uses == 1 + && is20Bit(int64(i)-16) + && clobber(x) + => (STMG4 [i-16] {s} p w0 w1 w2 w3 mem) + +// Convert 32-bit store multiples into 64-bit stores. +(STM2 [i] {s} p (SRDconst [32] x) x mem) => (MOVDstore [i] {s} p x mem) diff --git a/src/cmd/compile/internal/ssa/_gen/S390XOps.go b/src/cmd/compile/internal/ssa/_gen/S390XOps.go new file mode 100644 index 0000000..896fdaa --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/S390XOps.go @@ -0,0 +1,817 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import "strings" + +// Notes: +// - Integer types live in the low portion of registers. Upper portions are junk. +// - Boolean types use the low-order byte of a register. 0=false, 1=true. +// Upper bytes are junk. +// - When doing sub-register operations, we try to write the whole +// destination register to avoid a partial-register write. +// - Unused portions of AuxInt (or the Val portion of ValAndOff) are +// filled by sign-extending the used portion. Users of AuxInt which interpret +// AuxInt as unsigned (e.g. shifts) must be careful. +// - The SB 'register' is implemented using instruction-relative addressing. This +// places some limitations on when and how memory operands that are addressed +// relative to SB can be used: +// +// 1. Pseudo-instructions do not always map to a single machine instruction when +// using the SB 'register' to address data. This is because many machine +// instructions do not have relative long (RL suffix) equivalents. For example, +// ADDload, which is assembled as AG. +// +// 2. Loads and stores using relative addressing require the data be aligned +// according to its size (8-bytes for double words, 4-bytes for words +// and so on). +// +// We can always work around these by inserting LARL instructions (load address +// relative long) in the assembler, but typically this results in worse code +// generation because the address can't be re-used. Inserting instructions in the +// assembler also means clobbering the temp register and it is a long-term goal +// to prevent the compiler doing this so that it can be allocated as a normal +// register. +// +// For more information about the z/Architecture, the instruction set and the +// addressing modes it supports take a look at the z/Architecture Principles of +// Operation: http://publibfp.boulder.ibm.com/epubs/pdf/dz9zr010.pdf +// +// Suffixes encode the bit width of pseudo-instructions. +// D (double word) = 64 bit (frequently omitted) +// W (word) = 32 bit +// H (half word) = 16 bit +// B (byte) = 8 bit +// S (single prec.) = 32 bit (double precision is omitted) + +// copied from ../../s390x/reg.go +var regNamesS390X = []string{ + "R0", + "R1", + "R2", + "R3", + "R4", + "R5", + "R6", + "R7", + "R8", + "R9", + "R10", + "R11", + "R12", + "g", // R13 + "R14", + "SP", // R15 + "F0", + "F1", + "F2", + "F3", + "F4", + "F5", + "F6", + "F7", + "F8", + "F9", + "F10", + "F11", + "F12", + "F13", + "F14", + "F15", + + // If you add registers, update asyncPreempt in runtime. + + //pseudo-registers + "SB", +} + +func init() { + // Make map from reg names to reg integers. + if len(regNamesS390X) > 64 { + panic("too many registers") + } + num := map[string]int{} + for i, name := range regNamesS390X { + num[name] = i + } + buildReg := func(s string) regMask { + m := regMask(0) + for _, r := range strings.Split(s, " ") { + if n, ok := num[r]; ok { + m |= regMask(1) << uint(n) + continue + } + panic("register " + r + " not found") + } + return m + } + + // Common individual register masks + var ( + sp = buildReg("SP") + sb = buildReg("SB") + r0 = buildReg("R0") + tmp = buildReg("R11") // R11 is used as a temporary in a small number of instructions. + + // R10 is reserved by the assembler. + gp = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14") + gpg = gp | buildReg("g") + gpsp = gp | sp + + // R0 is considered to contain the value 0 in address calculations. + ptr = gp &^ r0 + ptrsp = ptr | sp + ptrspsb = ptrsp | sb + + fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15") + callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g + r1 = buildReg("R1") + r2 = buildReg("R2") + r3 = buildReg("R3") + ) + // Common slices of register masks + var ( + gponly = []regMask{gp} + fponly = []regMask{fp} + ) + + // Common regInfo + var ( + gp01 = regInfo{inputs: []regMask{}, outputs: gponly} + gp11 = regInfo{inputs: []regMask{gp}, outputs: gponly} + gp11sp = regInfo{inputs: []regMask{gpsp}, outputs: gponly} + gp21 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly} + gp21sp = regInfo{inputs: []regMask{gpsp, gp}, outputs: gponly} + gp21tmp = regInfo{inputs: []regMask{gp &^ tmp, gp &^ tmp}, outputs: []regMask{gp &^ tmp}, clobbers: tmp} + + // R0 evaluates to 0 when used as the number of bits to shift + // so we need to exclude it from that operand. + sh21 = regInfo{inputs: []regMask{gp, ptr}, outputs: gponly} + + addr = regInfo{inputs: []regMask{sp | sb}, outputs: gponly} + addridx = regInfo{inputs: []regMask{sp | sb, ptrsp}, outputs: gponly} + + gp2flags = regInfo{inputs: []regMask{gpsp, gpsp}} + gp1flags = regInfo{inputs: []regMask{gpsp}} + gp2flags1 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly} + gp11flags = regInfo{inputs: []regMask{gp}, outputs: gponly} + gp21flags = regInfo{inputs: []regMask{gp, gp}, outputs: gponly} + gp2flags1flags = regInfo{inputs: []regMask{gp, gp}, outputs: gponly} + + gpload = regInfo{inputs: []regMask{ptrspsb, 0}, outputs: gponly} + gploadidx = regInfo{inputs: []regMask{ptrspsb, ptrsp, 0}, outputs: gponly} + gpopload = regInfo{inputs: []regMask{gp, ptrsp, 0}, outputs: gponly} + gpstore = regInfo{inputs: []regMask{ptrspsb, gpsp, 0}} + gpstoreconst = regInfo{inputs: []regMask{ptrspsb, 0}} + gpstoreidx = regInfo{inputs: []regMask{ptrsp, ptrsp, gpsp, 0}} + gpstorebr = regInfo{inputs: []regMask{ptrsp, gpsp, 0}} + gpstorelaa = regInfo{inputs: []regMask{ptrspsb, gpsp, 0}, outputs: gponly} + gpstorelab = regInfo{inputs: []regMask{r1, gpsp, 0}, clobbers: r1} + + gpmvc = regInfo{inputs: []regMask{ptrsp, ptrsp, 0}} + + fp01 = regInfo{inputs: []regMask{}, outputs: fponly} + fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: fponly} + fp31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: fponly} + fp21clobber = regInfo{inputs: []regMask{fp, fp}, outputs: fponly} + fpgp = regInfo{inputs: fponly, outputs: gponly} + gpfp = regInfo{inputs: gponly, outputs: fponly} + fp11 = regInfo{inputs: fponly, outputs: fponly} + fp1flags = regInfo{inputs: []regMask{fp}} + fp11clobber = regInfo{inputs: fponly, outputs: fponly} + fp2flags = regInfo{inputs: []regMask{fp, fp}} + + fpload = regInfo{inputs: []regMask{ptrspsb, 0}, outputs: fponly} + fploadidx = regInfo{inputs: []regMask{ptrsp, ptrsp, 0}, outputs: fponly} + + fpstore = regInfo{inputs: []regMask{ptrspsb, fp, 0}} + fpstoreidx = regInfo{inputs: []regMask{ptrsp, ptrsp, fp, 0}} + + sync = regInfo{inputs: []regMask{0}} + + // LoweredAtomicCas may overwrite arg1, so force it to R0 for now. + cas = regInfo{inputs: []regMask{ptrsp, r0, gpsp, 0}, outputs: []regMask{gp, 0}, clobbers: r0} + + // LoweredAtomicExchange overwrites the output before executing + // CS{,G}, so the output register must not be the same as the + // input register. For now we just force the output register to + // R0. + exchange = regInfo{inputs: []regMask{ptrsp, gpsp &^ r0, 0}, outputs: []regMask{r0, 0}} + ) + + var S390Xops = []opData{ + // fp ops + {name: "FADDS", argLength: 2, reg: fp21clobber, typ: "(Float32,Flags)", asm: "FADDS", commutative: true, resultInArg0: true}, // fp32 arg0 + arg1 + {name: "FADD", argLength: 2, reg: fp21clobber, typ: "(Float64,Flags)", asm: "FADD", commutative: true, resultInArg0: true}, // fp64 arg0 + arg1 + {name: "FSUBS", argLength: 2, reg: fp21clobber, typ: "(Float32,Flags)", asm: "FSUBS", resultInArg0: true}, // fp32 arg0 - arg1 + {name: "FSUB", argLength: 2, reg: fp21clobber, typ: "(Float64,Flags)", asm: "FSUB", resultInArg0: true}, // fp64 arg0 - arg1 + {name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true, resultInArg0: true}, // fp32 arg0 * arg1 + {name: "FMUL", argLength: 2, reg: fp21, asm: "FMUL", commutative: true, resultInArg0: true}, // fp64 arg0 * arg1 + {name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS", resultInArg0: true}, // fp32 arg0 / arg1 + {name: "FDIV", argLength: 2, reg: fp21, asm: "FDIV", resultInArg0: true}, // fp64 arg0 / arg1 + {name: "FNEGS", argLength: 1, reg: fp11clobber, asm: "FNEGS", clobberFlags: true}, // fp32 -arg0 + {name: "FNEG", argLength: 1, reg: fp11clobber, asm: "FNEG", clobberFlags: true}, // fp64 -arg0 + {name: "FMADDS", argLength: 3, reg: fp31, asm: "FMADDS", resultInArg0: true}, // fp32 arg1 * arg2 + arg0 + {name: "FMADD", argLength: 3, reg: fp31, asm: "FMADD", resultInArg0: true}, // fp64 arg1 * arg2 + arg0 + {name: "FMSUBS", argLength: 3, reg: fp31, asm: "FMSUBS", resultInArg0: true}, // fp32 arg1 * arg2 - arg0 + {name: "FMSUB", argLength: 3, reg: fp31, asm: "FMSUB", resultInArg0: true}, // fp64 arg1 * arg2 - arg0 + {name: "LPDFR", argLength: 1, reg: fp11, asm: "LPDFR"}, // fp64/fp32 set sign bit + {name: "LNDFR", argLength: 1, reg: fp11, asm: "LNDFR"}, // fp64/fp32 clear sign bit + {name: "CPSDR", argLength: 2, reg: fp21, asm: "CPSDR"}, // fp64/fp32 copy arg1 sign bit to arg0 + + // Round to integer, float64 only. + // + // aux | rounding mode + // ----+----------------------------------- + // 1 | round to nearest, ties away from 0 + // 4 | round to nearest, ties to even + // 5 | round toward 0 + // 6 | round toward +∞ + // 7 | round toward -∞ + {name: "FIDBR", argLength: 1, reg: fp11, asm: "FIDBR", aux: "Int8"}, + + {name: "FMOVSload", argLength: 2, reg: fpload, asm: "FMOVS", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // fp32 load + {name: "FMOVDload", argLength: 2, reg: fpload, asm: "FMOVD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // fp64 load + {name: "FMOVSconst", reg: fp01, asm: "FMOVS", aux: "Float32", rematerializeable: true}, // fp32 constant + {name: "FMOVDconst", reg: fp01, asm: "FMOVD", aux: "Float64", rematerializeable: true}, // fp64 constant + {name: "FMOVSloadidx", argLength: 3, reg: fploadidx, asm: "FMOVS", aux: "SymOff", symEffect: "Read"}, // fp32 load indexed by i + {name: "FMOVDloadidx", argLength: 3, reg: fploadidx, asm: "FMOVD", aux: "SymOff", symEffect: "Read"}, // fp64 load indexed by i + + {name: "FMOVSstore", argLength: 3, reg: fpstore, asm: "FMOVS", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // fp32 store + {name: "FMOVDstore", argLength: 3, reg: fpstore, asm: "FMOVD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // fp64 store + {name: "FMOVSstoreidx", argLength: 4, reg: fpstoreidx, asm: "FMOVS", aux: "SymOff", symEffect: "Write"}, // fp32 indexed by i store + {name: "FMOVDstoreidx", argLength: 4, reg: fpstoreidx, asm: "FMOVD", aux: "SymOff", symEffect: "Write"}, // fp64 indexed by i store + + // binary ops + {name: "ADD", argLength: 2, reg: gp21sp, asm: "ADD", commutative: true, clobberFlags: true}, // arg0 + arg1 + {name: "ADDW", argLength: 2, reg: gp21sp, asm: "ADDW", commutative: true, clobberFlags: true}, // arg0 + arg1 + {name: "ADDconst", argLength: 1, reg: gp11sp, asm: "ADD", aux: "Int32", typ: "UInt64", clobberFlags: true}, // arg0 + auxint + {name: "ADDWconst", argLength: 1, reg: gp11sp, asm: "ADDW", aux: "Int32", clobberFlags: true}, // arg0 + auxint + {name: "ADDload", argLength: 3, reg: gpopload, asm: "ADD", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 + *arg1. arg2=mem + {name: "ADDWload", argLength: 3, reg: gpopload, asm: "ADDW", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 + *arg1. arg2=mem + + {name: "SUB", argLength: 2, reg: gp21, asm: "SUB", clobberFlags: true}, // arg0 - arg1 + {name: "SUBW", argLength: 2, reg: gp21, asm: "SUBW", clobberFlags: true}, // arg0 - arg1 + {name: "SUBconst", argLength: 1, reg: gp11, asm: "SUB", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 - auxint + {name: "SUBWconst", argLength: 1, reg: gp11, asm: "SUBW", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 - auxint + {name: "SUBload", argLength: 3, reg: gpopload, asm: "SUB", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 - *arg1. arg2=mem + {name: "SUBWload", argLength: 3, reg: gpopload, asm: "SUBW", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 - *arg1. arg2=mem + + {name: "MULLD", argLength: 2, reg: gp21, asm: "MULLD", typ: "Int64", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 * arg1 + {name: "MULLW", argLength: 2, reg: gp21, asm: "MULLW", typ: "Int32", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 * arg1 + {name: "MULLDconst", argLength: 1, reg: gp11, asm: "MULLD", aux: "Int32", typ: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 * auxint + {name: "MULLWconst", argLength: 1, reg: gp11, asm: "MULLW", aux: "Int32", typ: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 * auxint + {name: "MULLDload", argLength: 3, reg: gpopload, asm: "MULLD", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 * *arg1. arg2=mem + {name: "MULLWload", argLength: 3, reg: gpopload, asm: "MULLW", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 * *arg1. arg2=mem + + {name: "MULHD", argLength: 2, reg: gp21tmp, asm: "MULHD", typ: "Int64", commutative: true, resultInArg0: true, clobberFlags: true}, // (arg0 * arg1) >> width + {name: "MULHDU", argLength: 2, reg: gp21tmp, asm: "MULHDU", typ: "Int64", commutative: true, resultInArg0: true, clobberFlags: true}, // (arg0 * arg1) >> width + + {name: "DIVD", argLength: 2, reg: gp21tmp, asm: "DIVD", resultInArg0: true, clobberFlags: true}, // arg0 / arg1 + {name: "DIVW", argLength: 2, reg: gp21tmp, asm: "DIVW", resultInArg0: true, clobberFlags: true}, // arg0 / arg1 + {name: "DIVDU", argLength: 2, reg: gp21tmp, asm: "DIVDU", resultInArg0: true, clobberFlags: true}, // arg0 / arg1 + {name: "DIVWU", argLength: 2, reg: gp21tmp, asm: "DIVWU", resultInArg0: true, clobberFlags: true}, // arg0 / arg1 + + {name: "MODD", argLength: 2, reg: gp21tmp, asm: "MODD", resultInArg0: true, clobberFlags: true}, // arg0 % arg1 + {name: "MODW", argLength: 2, reg: gp21tmp, asm: "MODW", resultInArg0: true, clobberFlags: true}, // arg0 % arg1 + + {name: "MODDU", argLength: 2, reg: gp21tmp, asm: "MODDU", resultInArg0: true, clobberFlags: true}, // arg0 % arg1 + {name: "MODWU", argLength: 2, reg: gp21tmp, asm: "MODWU", resultInArg0: true, clobberFlags: true}, // arg0 % arg1 + + {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true, clobberFlags: true}, // arg0 & arg1 + {name: "ANDW", argLength: 2, reg: gp21, asm: "ANDW", commutative: true, clobberFlags: true}, // arg0 & arg1 + {name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 & auxint + {name: "ANDWconst", argLength: 1, reg: gp11, asm: "ANDW", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 & auxint + {name: "ANDload", argLength: 3, reg: gpopload, asm: "AND", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 & *arg1. arg2=mem + {name: "ANDWload", argLength: 3, reg: gpopload, asm: "ANDW", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 & *arg1. arg2=mem + + {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true, clobberFlags: true}, // arg0 | arg1 + {name: "ORW", argLength: 2, reg: gp21, asm: "ORW", commutative: true, clobberFlags: true}, // arg0 | arg1 + {name: "ORconst", argLength: 1, reg: gp11, asm: "OR", aux: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 | auxint + {name: "ORWconst", argLength: 1, reg: gp11, asm: "ORW", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 | auxint + {name: "ORload", argLength: 3, reg: gpopload, asm: "OR", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 | *arg1. arg2=mem + {name: "ORWload", argLength: 3, reg: gpopload, asm: "ORW", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 | *arg1. arg2=mem + + {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true, clobberFlags: true}, // arg0 ^ arg1 + {name: "XORW", argLength: 2, reg: gp21, asm: "XORW", commutative: true, clobberFlags: true}, // arg0 ^ arg1 + {name: "XORconst", argLength: 1, reg: gp11, asm: "XOR", aux: "Int64", resultInArg0: true, clobberFlags: true}, // arg0 ^ auxint + {name: "XORWconst", argLength: 1, reg: gp11, asm: "XORW", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 ^ auxint + {name: "XORload", argLength: 3, reg: gpopload, asm: "XOR", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 ^ *arg1. arg2=mem + {name: "XORWload", argLength: 3, reg: gpopload, asm: "XORW", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 ^ *arg1. arg2=mem + + // Arithmetic ops with carry/borrow chain. + // + // A carry is represented by a condition code of 2 or 3 (GT or OV). + // A borrow is represented by a condition code of 0 or 1 (EQ or LT). + {name: "ADDC", argLength: 2, reg: gp21flags, asm: "ADDC", typ: "(UInt64,Flags)", commutative: true}, // (arg0 + arg1, carry out) + {name: "ADDCconst", argLength: 1, reg: gp11flags, asm: "ADDC", typ: "(UInt64,Flags)", aux: "Int16"}, // (arg0 + auxint, carry out) + {name: "ADDE", argLength: 3, reg: gp2flags1flags, asm: "ADDE", typ: "(UInt64,Flags)", commutative: true, resultInArg0: true}, // (arg0 + arg1 + arg2 (carry in), carry out) + {name: "SUBC", argLength: 2, reg: gp21flags, asm: "SUBC", typ: "(UInt64,Flags)"}, // (arg0 - arg1, borrow out) + {name: "SUBE", argLength: 3, reg: gp2flags1flags, asm: "SUBE", typ: "(UInt64,Flags)", resultInArg0: true}, // (arg0 - arg1 - arg2 (borrow in), borrow out) + + // Comparisons. + {name: "CMP", argLength: 2, reg: gp2flags, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1 + {name: "CMPW", argLength: 2, reg: gp2flags, asm: "CMPW", typ: "Flags"}, // arg0 compare to arg1 + + {name: "CMPU", argLength: 2, reg: gp2flags, asm: "CMPU", typ: "Flags"}, // arg0 compare to arg1 + {name: "CMPWU", argLength: 2, reg: gp2flags, asm: "CMPWU", typ: "Flags"}, // arg0 compare to arg1 + + {name: "CMPconst", argLength: 1, reg: gp1flags, asm: "CMP", typ: "Flags", aux: "Int32"}, // arg0 compare to auxint + {name: "CMPWconst", argLength: 1, reg: gp1flags, asm: "CMPW", typ: "Flags", aux: "Int32"}, // arg0 compare to auxint + {name: "CMPUconst", argLength: 1, reg: gp1flags, asm: "CMPU", typ: "Flags", aux: "Int32"}, // arg0 compare to auxint + {name: "CMPWUconst", argLength: 1, reg: gp1flags, asm: "CMPWU", typ: "Flags", aux: "Int32"}, // arg0 compare to auxint + + {name: "FCMPS", argLength: 2, reg: fp2flags, asm: "CEBR", typ: "Flags"}, // arg0 compare to arg1, f32 + {name: "FCMP", argLength: 2, reg: fp2flags, asm: "FCMPU", typ: "Flags"}, // arg0 compare to arg1, f64 + {name: "LTDBR", argLength: 1, reg: fp1flags, asm: "LTDBR", typ: "Flags"}, // arg0 compare to 0, f64 + {name: "LTEBR", argLength: 1, reg: fp1flags, asm: "LTEBR", typ: "Flags"}, // arg0 compare to 0, f32 + + {name: "SLD", argLength: 2, reg: sh21, asm: "SLD"}, // arg0 << arg1, shift amount is mod 64 + {name: "SLW", argLength: 2, reg: sh21, asm: "SLW"}, // arg0 << arg1, shift amount is mod 64 + {name: "SLDconst", argLength: 1, reg: gp11, asm: "SLD", aux: "UInt8"}, // arg0 << auxint, shift amount 0-63 + {name: "SLWconst", argLength: 1, reg: gp11, asm: "SLW", aux: "UInt8"}, // arg0 << auxint, shift amount 0-31 + + {name: "SRD", argLength: 2, reg: sh21, asm: "SRD"}, // unsigned arg0 >> arg1, shift amount is mod 64 + {name: "SRW", argLength: 2, reg: sh21, asm: "SRW"}, // unsigned uint32(arg0) >> arg1, shift amount is mod 64 + {name: "SRDconst", argLength: 1, reg: gp11, asm: "SRD", aux: "UInt8"}, // unsigned arg0 >> auxint, shift amount 0-63 + {name: "SRWconst", argLength: 1, reg: gp11, asm: "SRW", aux: "UInt8"}, // unsigned uint32(arg0) >> auxint, shift amount 0-31 + + // Arithmetic shifts clobber flags. + {name: "SRAD", argLength: 2, reg: sh21, asm: "SRAD", clobberFlags: true}, // signed arg0 >> arg1, shift amount is mod 64 + {name: "SRAW", argLength: 2, reg: sh21, asm: "SRAW", clobberFlags: true}, // signed int32(arg0) >> arg1, shift amount is mod 64 + {name: "SRADconst", argLength: 1, reg: gp11, asm: "SRAD", aux: "UInt8", clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-63 + {name: "SRAWconst", argLength: 1, reg: gp11, asm: "SRAW", aux: "UInt8", clobberFlags: true}, // signed int32(arg0) >> auxint, shift amount 0-31 + + // Rotate instructions. + // Note: no RLLGconst - use RISBGZ instead. + {name: "RLLG", argLength: 2, reg: sh21, asm: "RLLG"}, // arg0 rotate left arg1, rotate amount 0-63 + {name: "RLL", argLength: 2, reg: sh21, asm: "RLL"}, // arg0 rotate left arg1, rotate amount 0-31 + {name: "RLLconst", argLength: 1, reg: gp11, asm: "RLL", aux: "UInt8"}, // arg0 rotate left auxint, rotate amount 0-31 + + // Rotate then (and|or|xor|insert) selected bits instructions. + // + // Aux is an s390x.RotateParams struct containing Start, End and rotation + // Amount fields. + // + // arg1 is rotated left by the rotation amount then the bits from the start + // bit to the end bit (inclusive) are combined with arg0 using the logical + // operation specified. Bit indices are specified from left to right - the + // MSB is 0 and the LSB is 63. + // + // Examples: + // | aux | + // | instruction | start | end | amount | arg0 | arg1 | result | + // +-------------+-------+-----+--------+-----------------------+-----------------------+-----------------------+ + // | RXSBG (XOR) | 0 | 1 | 0 | 0xffff_ffff_ffff_ffff | 0xffff_ffff_ffff_ffff | 0x3fff_ffff_ffff_ffff | + // | RXSBG (XOR) | 62 | 63 | 0 | 0xffff_ffff_ffff_ffff | 0xffff_ffff_ffff_ffff | 0xffff_ffff_ffff_fffc | + // | RXSBG (XOR) | 0 | 47 | 16 | 0xffff_ffff_ffff_ffff | 0x0000_0000_0000_ffff | 0xffff_ffff_0000_ffff | + // +-------------+-------+-----+--------+-----------------------+-----------------------+-----------------------+ + // + {name: "RXSBG", argLength: 2, reg: gp21, asm: "RXSBG", resultInArg0: true, aux: "S390XRotateParams", clobberFlags: true}, // rotate then xor selected bits + {name: "RISBGZ", argLength: 1, reg: gp11, asm: "RISBGZ", aux: "S390XRotateParams", clobberFlags: true}, // rotate then insert selected bits [into zero] + + // unary ops + {name: "NEG", argLength: 1, reg: gp11, asm: "NEG", clobberFlags: true}, // -arg0 + {name: "NEGW", argLength: 1, reg: gp11, asm: "NEGW", clobberFlags: true}, // -arg0 + + {name: "NOT", argLength: 1, reg: gp11, resultInArg0: true, clobberFlags: true}, // ^arg0 + {name: "NOTW", argLength: 1, reg: gp11, resultInArg0: true, clobberFlags: true}, // ^arg0 + + {name: "FSQRT", argLength: 1, reg: fp11, asm: "FSQRT"}, // sqrt(arg0) + {name: "FSQRTS", argLength: 1, reg: fp11, asm: "FSQRTS"}, // sqrt(arg0), float32 + + // Conditional register-register moves. + // The aux for these values is an s390x.CCMask value representing the condition code mask. + {name: "LOCGR", argLength: 3, reg: gp2flags1, resultInArg0: true, asm: "LOCGR", aux: "S390XCCMask"}, // load arg1 into arg0 if the condition code in arg2 matches a masked bit in aux. + + {name: "MOVBreg", argLength: 1, reg: gp11sp, asm: "MOVB", typ: "Int64"}, // sign extend arg0 from int8 to int64 + {name: "MOVBZreg", argLength: 1, reg: gp11sp, asm: "MOVBZ", typ: "UInt64"}, // zero extend arg0 from int8 to int64 + {name: "MOVHreg", argLength: 1, reg: gp11sp, asm: "MOVH", typ: "Int64"}, // sign extend arg0 from int16 to int64 + {name: "MOVHZreg", argLength: 1, reg: gp11sp, asm: "MOVHZ", typ: "UInt64"}, // zero extend arg0 from int16 to int64 + {name: "MOVWreg", argLength: 1, reg: gp11sp, asm: "MOVW", typ: "Int64"}, // sign extend arg0 from int32 to int64 + {name: "MOVWZreg", argLength: 1, reg: gp11sp, asm: "MOVWZ", typ: "UInt64"}, // zero extend arg0 from int32 to int64 + + {name: "MOVDconst", reg: gp01, asm: "MOVD", typ: "UInt64", aux: "Int64", rematerializeable: true}, // auxint + + {name: "LDGR", argLength: 1, reg: gpfp, asm: "LDGR"}, // move int64 to float64 (no conversion) + {name: "LGDR", argLength: 1, reg: fpgp, asm: "LGDR"}, // move float64 to int64 (no conversion) + + {name: "CFDBRA", argLength: 1, reg: fpgp, asm: "CFDBRA", clobberFlags: true}, // convert float64 to int32 + {name: "CGDBRA", argLength: 1, reg: fpgp, asm: "CGDBRA", clobberFlags: true}, // convert float64 to int64 + {name: "CFEBRA", argLength: 1, reg: fpgp, asm: "CFEBRA", clobberFlags: true}, // convert float32 to int32 + {name: "CGEBRA", argLength: 1, reg: fpgp, asm: "CGEBRA", clobberFlags: true}, // convert float32 to int64 + {name: "CEFBRA", argLength: 1, reg: gpfp, asm: "CEFBRA", clobberFlags: true}, // convert int32 to float32 + {name: "CDFBRA", argLength: 1, reg: gpfp, asm: "CDFBRA", clobberFlags: true}, // convert int32 to float64 + {name: "CEGBRA", argLength: 1, reg: gpfp, asm: "CEGBRA", clobberFlags: true}, // convert int64 to float32 + {name: "CDGBRA", argLength: 1, reg: gpfp, asm: "CDGBRA", clobberFlags: true}, // convert int64 to float64 + {name: "CLFEBR", argLength: 1, reg: fpgp, asm: "CLFEBR", clobberFlags: true}, // convert float32 to uint32 + {name: "CLFDBR", argLength: 1, reg: fpgp, asm: "CLFDBR", clobberFlags: true}, // convert float64 to uint32 + {name: "CLGEBR", argLength: 1, reg: fpgp, asm: "CLGEBR", clobberFlags: true}, // convert float32 to uint64 + {name: "CLGDBR", argLength: 1, reg: fpgp, asm: "CLGDBR", clobberFlags: true}, // convert float64 to uint64 + {name: "CELFBR", argLength: 1, reg: gpfp, asm: "CELFBR", clobberFlags: true}, // convert uint32 to float32 + {name: "CDLFBR", argLength: 1, reg: gpfp, asm: "CDLFBR", clobberFlags: true}, // convert uint32 to float64 + {name: "CELGBR", argLength: 1, reg: gpfp, asm: "CELGBR", clobberFlags: true}, // convert uint64 to float32 + {name: "CDLGBR", argLength: 1, reg: gpfp, asm: "CDLGBR", clobberFlags: true}, // convert uint64 to float64 + + {name: "LEDBR", argLength: 1, reg: fp11, asm: "LEDBR"}, // convert float64 to float32 + {name: "LDEBR", argLength: 1, reg: fp11, asm: "LDEBR"}, // convert float32 to float64 + + {name: "MOVDaddr", argLength: 1, reg: addr, aux: "SymOff", rematerializeable: true, symEffect: "Read"}, // arg0 + auxint + offset encoded in aux + {name: "MOVDaddridx", argLength: 2, reg: addridx, aux: "SymOff", symEffect: "Read"}, // arg0 + arg1 + auxint + aux + + // auxint+aux == add auxint and the offset of the symbol in aux (if any) to the effective address + {name: "MOVBZload", argLength: 2, reg: gpload, asm: "MOVBZ", aux: "SymOff", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load byte from arg0+auxint+aux. arg1=mem. Zero extend. + {name: "MOVBload", argLength: 2, reg: gpload, asm: "MOVB", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // ditto, sign extend to int64 + {name: "MOVHZload", argLength: 2, reg: gpload, asm: "MOVHZ", aux: "SymOff", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load 2 bytes from arg0+auxint+aux. arg1=mem. Zero extend. + {name: "MOVHload", argLength: 2, reg: gpload, asm: "MOVH", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // ditto, sign extend to int64 + {name: "MOVWZload", argLength: 2, reg: gpload, asm: "MOVWZ", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load 4 bytes from arg0+auxint+aux. arg1=mem. Zero extend. + {name: "MOVWload", argLength: 2, reg: gpload, asm: "MOVW", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // ditto, sign extend to int64 + {name: "MOVDload", argLength: 2, reg: gpload, asm: "MOVD", aux: "SymOff", typ: "UInt64", faultOnNilArg0: true, symEffect: "Read"}, // load 8 bytes from arg0+auxint+aux. arg1=mem + + {name: "MOVWBR", argLength: 1, reg: gp11, asm: "MOVWBR"}, // arg0 swap bytes + {name: "MOVDBR", argLength: 1, reg: gp11, asm: "MOVDBR"}, // arg0 swap bytes + + {name: "MOVHBRload", argLength: 2, reg: gpload, asm: "MOVHBR", aux: "SymOff", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load 2 bytes from arg0+auxint+aux. arg1=mem. Reverse bytes. + {name: "MOVWBRload", argLength: 2, reg: gpload, asm: "MOVWBR", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load 4 bytes from arg0+auxint+aux. arg1=mem. Reverse bytes. + {name: "MOVDBRload", argLength: 2, reg: gpload, asm: "MOVDBR", aux: "SymOff", typ: "UInt64", faultOnNilArg0: true, symEffect: "Read"}, // load 8 bytes from arg0+auxint+aux. arg1=mem. Reverse bytes. + + {name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store byte in arg1 to arg0+auxint+aux. arg2=mem + {name: "MOVHstore", argLength: 3, reg: gpstore, asm: "MOVH", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem + {name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem + {name: "MOVDstore", argLength: 3, reg: gpstore, asm: "MOVD", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes in arg1 to arg0+auxint+aux. arg2=mem + {name: "MOVHBRstore", argLength: 3, reg: gpstorebr, asm: "MOVHBR", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem. Reverse bytes. + {name: "MOVWBRstore", argLength: 3, reg: gpstorebr, asm: "MOVWBR", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem. Reverse bytes. + {name: "MOVDBRstore", argLength: 3, reg: gpstorebr, asm: "MOVDBR", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes in arg1 to arg0+auxint+aux. arg2=mem. Reverse bytes. + + {name: "MVC", argLength: 3, reg: gpmvc, asm: "MVC", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, faultOnNilArg1: true, symEffect: "None"}, // arg0=destptr, arg1=srcptr, arg2=mem, auxint=size,off + + // indexed loads/stores + {name: "MOVBZloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVBZ", aux: "SymOff", typ: "UInt8", symEffect: "Read"}, // load a byte from arg0+arg1+auxint+aux. arg2=mem. Zero extend. + {name: "MOVBloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVB", aux: "SymOff", typ: "Int8", symEffect: "Read"}, // load a byte from arg0+arg1+auxint+aux. arg2=mem. Sign extend. + {name: "MOVHZloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVHZ", aux: "SymOff", typ: "UInt16", symEffect: "Read"}, // load 2 bytes from arg0+arg1+auxint+aux. arg2=mem. Zero extend. + {name: "MOVHloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVH", aux: "SymOff", typ: "Int16", symEffect: "Read"}, // load 2 bytes from arg0+arg1+auxint+aux. arg2=mem. Sign extend. + {name: "MOVWZloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVWZ", aux: "SymOff", typ: "UInt32", symEffect: "Read"}, // load 4 bytes from arg0+arg1+auxint+aux. arg2=mem. Zero extend. + {name: "MOVWloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVW", aux: "SymOff", typ: "Int32", symEffect: "Read"}, // load 4 bytes from arg0+arg1+auxint+aux. arg2=mem. Sign extend. + {name: "MOVDloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVD", aux: "SymOff", typ: "UInt64", symEffect: "Read"}, // load 8 bytes from arg0+arg1+auxint+aux. arg2=mem + {name: "MOVHBRloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVHBR", aux: "SymOff", typ: "Int16", symEffect: "Read"}, // load 2 bytes from arg0+arg1+auxint+aux. arg2=mem. Reverse bytes. + {name: "MOVWBRloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVWBR", aux: "SymOff", typ: "Int32", symEffect: "Read"}, // load 4 bytes from arg0+arg1+auxint+aux. arg2=mem. Reverse bytes. + {name: "MOVDBRloadidx", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVDBR", aux: "SymOff", typ: "Int64", symEffect: "Read"}, // load 8 bytes from arg0+arg1+auxint+aux. arg2=mem. Reverse bytes. + {name: "MOVBstoreidx", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVB", aux: "SymOff", symEffect: "Write"}, // store byte in arg2 to arg0+arg1+auxint+aux. arg3=mem + {name: "MOVHstoreidx", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVH", aux: "SymOff", symEffect: "Write"}, // store 2 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem + {name: "MOVWstoreidx", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVW", aux: "SymOff", symEffect: "Write"}, // store 4 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem + {name: "MOVDstoreidx", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVD", aux: "SymOff", symEffect: "Write"}, // store 8 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem + {name: "MOVHBRstoreidx", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVHBR", aux: "SymOff", symEffect: "Write"}, // store 2 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem. Reverse bytes. + {name: "MOVWBRstoreidx", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVWBR", aux: "SymOff", symEffect: "Write"}, // store 4 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem. Reverse bytes. + {name: "MOVDBRstoreidx", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVDBR", aux: "SymOff", symEffect: "Write"}, // store 8 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem. Reverse bytes. + + // For storeconst ops, the AuxInt field encodes both + // the value to store and an address offset of the store. + // Cast AuxInt to a ValAndOff to extract Val and Off fields. + {name: "MOVBstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVB", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low byte of ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux. arg1=mem + {name: "MOVHstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVH", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low 2 bytes of ... + {name: "MOVWstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVW", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low 4 bytes of ... + {name: "MOVDstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVD", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of ... + + {name: "CLEAR", argLength: 2, reg: regInfo{inputs: []regMask{ptr, 0}}, asm: "CLEAR", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Write"}, + + {name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem + {name: "CALLtail", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem + {name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{ptrsp, buildReg("R12"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem + {name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{ptr}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem + + // (InvertFlags (CMP a b)) == (CMP b a) + // InvertFlags is a pseudo-op which can't appear in assembly output. + {name: "InvertFlags", argLength: 1}, // reverse direction of arg0 + + // Pseudo-ops + {name: "LoweredGetG", argLength: 1, reg: gp01}, // arg0=mem + // Scheduler ensures LoweredGetClosurePtr occurs only in entry block, + // and sorts it to the very beginning of the block to prevent other + // use of R12 (the closure pointer) + {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("R12")}}, zeroWidth: true}, + // arg0=ptr,arg1=mem, returns void. Faults if ptr is nil. + // LoweredGetCallerSP returns the SP of the caller of the current function. + {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true}, + // LoweredGetCallerPC evaluates to the PC to which its "caller" will return. + // I.e., if f calls g "calls" getcallerpc, + // the result should be the PC within f that g will return to. + // See runtime/stubs.go for a more detailed discussion. + {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true}, + {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{ptrsp}}, clobberFlags: true, nilCheck: true, faultOnNilArg0: true}, + // Round ops to block fused-multiply-add extraction. + {name: "LoweredRound32F", argLength: 1, reg: fp11, resultInArg0: true, zeroWidth: true}, + {name: "LoweredRound64F", argLength: 1, reg: fp11, resultInArg0: true, zeroWidth: true}, + + // LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier + // It saves all GP registers if necessary, + // but clobbers R14 (LR) because it's a call, + // and also clobbers R1 as the PLT stub does. + {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("R2"), buildReg("R3")}, clobbers: (callerSave &^ gpg) | buildReg("R14") | r1}, clobberFlags: true, aux: "Sym", symEffect: "None"}, + + // There are three of these functions so that they can have three different register inputs. + // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the + // default registers to match so we don't need to copy registers around unnecessarily. + {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go). + {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go). + {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r0, r1}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go). + + // Constant condition code values. The condition code can be 0, 1, 2 or 3. + {name: "FlagEQ"}, // CC=0 (equal) + {name: "FlagLT"}, // CC=1 (less than) + {name: "FlagGT"}, // CC=2 (greater than) + {name: "FlagOV"}, // CC=3 (overflow) + + // Fast-BCR-serialization to ensure store-load ordering. + {name: "SYNC", argLength: 1, reg: sync, asm: "SYNC", typ: "Mem"}, + + // Atomic loads. These are just normal loads but return <value,memory> tuples + // so they can be properly ordered with other loads. + // load from arg0+auxint+aux. arg1=mem. + {name: "MOVBZatomicload", argLength: 2, reg: gpload, asm: "MOVBZ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, + {name: "MOVWZatomicload", argLength: 2, reg: gpload, asm: "MOVWZ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, + {name: "MOVDatomicload", argLength: 2, reg: gpload, asm: "MOVD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, + + // Atomic stores. These are just normal stores. + // store arg1 to arg0+auxint+aux. arg2=mem. + {name: "MOVBatomicstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "Write"}, + {name: "MOVWatomicstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "Write"}, + {name: "MOVDatomicstore", argLength: 3, reg: gpstore, asm: "MOVD", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "Write"}, + + // Atomic adds. + // *(arg0+auxint+aux) += arg1. arg2=mem. + // Returns a tuple of <old contents of *(arg0+auxint+aux), memory>. + {name: "LAA", argLength: 3, reg: gpstorelaa, asm: "LAA", typ: "(UInt32,Mem)", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, + {name: "LAAG", argLength: 3, reg: gpstorelaa, asm: "LAAG", typ: "(UInt64,Mem)", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, + {name: "AddTupleFirst32", argLength: 2}, // arg1=tuple <x,y>. Returns <x+arg0,y>. + {name: "AddTupleFirst64", argLength: 2}, // arg1=tuple <x,y>. Returns <x+arg0,y>. + + // Atomic bitwise operations. + // Note: 'floor' operations round the pointer down to the nearest word boundary + // which reflects how they are used in the runtime. + {name: "LAN", argLength: 3, reg: gpstore, asm: "LAN", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *arg0 &= arg1. arg2 = mem. + {name: "LANfloor", argLength: 3, reg: gpstorelab, asm: "LAN", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *(floor(arg0, 4)) &= arg1. arg2 = mem. + {name: "LAO", argLength: 3, reg: gpstore, asm: "LAO", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *arg0 |= arg1. arg2 = mem. + {name: "LAOfloor", argLength: 3, reg: gpstorelab, asm: "LAO", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *(floor(arg0, 4)) |= arg1. arg2 = mem. + + // Compare and swap. + // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory. + // if *(arg0+auxint+aux) == arg1 { + // *(arg0+auxint+aux) = arg2 + // return (true, memory) + // } else { + // return (false, memory) + // } + // Note that these instructions also return the old value in arg1, but we ignore it. + // TODO: have these return flags instead of bool. The current system generates: + // CS ... + // MOVD $0, ret + // BNE 2(PC) + // MOVD $1, ret + // CMPW ret, $0 + // BNE ... + // instead of just + // CS ... + // BEQ ... + // but we can't do that because memory-using ops can't generate flags yet + // (flagalloc wants to move flag-generating instructions around). + {name: "LoweredAtomicCas32", argLength: 4, reg: cas, asm: "CS", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, + {name: "LoweredAtomicCas64", argLength: 4, reg: cas, asm: "CSG", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, + + // Lowered atomic swaps, emulated using compare-and-swap. + // store arg1 to arg0+auxint+aux, arg2=mem. + {name: "LoweredAtomicExchange32", argLength: 3, reg: exchange, asm: "CS", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, + {name: "LoweredAtomicExchange64", argLength: 3, reg: exchange, asm: "CSG", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, + + // find leftmost one + { + name: "FLOGR", + argLength: 1, + reg: regInfo{inputs: gponly, outputs: []regMask{buildReg("R0")}, clobbers: buildReg("R1")}, + asm: "FLOGR", + typ: "UInt64", + clobberFlags: true, + }, + + // population count + // + // Counts the number of ones in each byte of arg0 + // and places the result into the corresponding byte + // of the result. + { + name: "POPCNT", + argLength: 1, + reg: gp11, + asm: "POPCNT", + typ: "UInt64", + clobberFlags: true, + }, + + // unsigned multiplication (64x64 → 128) + // + // Multiply the two 64-bit input operands together and place the 128-bit result into + // an even-odd register pair. The second register in the target pair also contains + // one of the input operands. Since we don't currently have a way to specify an + // even-odd register pair we hardcode this register pair as R2:R3. + { + name: "MLGR", + argLength: 2, + reg: regInfo{inputs: []regMask{gp, r3}, outputs: []regMask{r2, r3}}, + asm: "MLGR", + }, + + // pseudo operations to sum the output of the POPCNT instruction + {name: "SumBytes2", argLength: 1, typ: "UInt8"}, // sum the rightmost 2 bytes in arg0 ignoring overflow + {name: "SumBytes4", argLength: 1, typ: "UInt8"}, // sum the rightmost 4 bytes in arg0 ignoring overflow + {name: "SumBytes8", argLength: 1, typ: "UInt8"}, // sum all the bytes in arg0 ignoring overflow + + // store multiple + { + name: "STMG2", + argLength: 4, + reg: regInfo{inputs: []regMask{ptrsp, buildReg("R1"), buildReg("R2"), 0}}, + aux: "SymOff", + typ: "Mem", + asm: "STMG", + faultOnNilArg0: true, + symEffect: "Write", + clobberFlags: true, // TODO(mundaym): currently uses AGFI to handle large offsets + }, + { + name: "STMG3", + argLength: 5, + reg: regInfo{inputs: []regMask{ptrsp, buildReg("R1"), buildReg("R2"), buildReg("R3"), 0}}, + aux: "SymOff", + typ: "Mem", + asm: "STMG", + faultOnNilArg0: true, + symEffect: "Write", + clobberFlags: true, // TODO(mundaym): currently uses AGFI to handle large offsets + }, + { + name: "STMG4", + argLength: 6, + reg: regInfo{inputs: []regMask{ + ptrsp, + buildReg("R1"), + buildReg("R2"), + buildReg("R3"), + buildReg("R4"), + 0, + }}, + aux: "SymOff", + typ: "Mem", + asm: "STMG", + faultOnNilArg0: true, + symEffect: "Write", + clobberFlags: true, // TODO(mundaym): currently uses AGFI to handle large offsets + }, + { + name: "STM2", + argLength: 4, + reg: regInfo{inputs: []regMask{ptrsp, buildReg("R1"), buildReg("R2"), 0}}, + aux: "SymOff", + typ: "Mem", + asm: "STMY", + faultOnNilArg0: true, + symEffect: "Write", + clobberFlags: true, // TODO(mundaym): currently uses AGFI to handle large offsets + }, + { + name: "STM3", + argLength: 5, + reg: regInfo{inputs: []regMask{ptrsp, buildReg("R1"), buildReg("R2"), buildReg("R3"), 0}}, + aux: "SymOff", + typ: "Mem", + asm: "STMY", + faultOnNilArg0: true, + symEffect: "Write", + clobberFlags: true, // TODO(mundaym): currently uses AGFI to handle large offsets + }, + { + name: "STM4", + argLength: 6, + reg: regInfo{inputs: []regMask{ + ptrsp, + buildReg("R1"), + buildReg("R2"), + buildReg("R3"), + buildReg("R4"), + 0, + }}, + aux: "SymOff", + typ: "Mem", + asm: "STMY", + faultOnNilArg0: true, + symEffect: "Write", + clobberFlags: true, // TODO(mundaym): currently uses AGFI to handle large offsets + }, + + // large move + // auxint = remaining bytes after loop (rem) + // arg0 = address of dst memory (in R1, changed as a side effect) + // arg1 = address of src memory (in R2, changed as a side effect) + // arg2 = pointer to last address to move in loop + 256 + // arg3 = mem + // returns mem + // + // mvc: MVC $256, 0(R2), 0(R1) + // MOVD $256(R1), R1 + // MOVD $256(R2), R2 + // CMP R2, Rarg2 + // BNE mvc + // MVC $rem, 0(R2), 0(R1) // if rem > 0 + { + name: "LoweredMove", + aux: "Int64", + argLength: 4, + reg: regInfo{ + inputs: []regMask{buildReg("R1"), buildReg("R2"), gpsp}, + clobbers: buildReg("R1 R2"), + }, + clobberFlags: true, + typ: "Mem", + faultOnNilArg0: true, + faultOnNilArg1: true, + }, + + // large clear + // auxint = remaining bytes after loop (rem) + // arg0 = address of dst memory (in R1, changed as a side effect) + // arg1 = pointer to last address to zero in loop + 256 + // arg2 = mem + // returns mem + // + // clear: CLEAR $256, 0(R1) + // MOVD $256(R1), R1 + // CMP R1, Rarg2 + // BNE clear + // CLEAR $rem, 0(R1) // if rem > 0 + { + name: "LoweredZero", + aux: "Int64", + argLength: 3, + reg: regInfo{ + inputs: []regMask{buildReg("R1"), gpsp}, + clobbers: buildReg("R1"), + }, + clobberFlags: true, + typ: "Mem", + faultOnNilArg0: true, + }, + } + + // All blocks on s390x have their condition code mask (s390x.CCMask) as the Aux value. + // The condition code mask is a 4-bit mask where each bit corresponds to a condition + // code value. If the value of the condition code matches a bit set in the condition + // code mask then the first successor is executed. Otherwise the second successor is + // executed. + // + // | condition code value | mask bit | + // +----------------------+------------+ + // | 0 (equal) | 0b1000 (8) | + // | 1 (less than) | 0b0100 (4) | + // | 2 (greater than) | 0b0010 (2) | + // | 3 (unordered) | 0b0001 (1) | + // + // Note: that compare-and-branch instructions must not have bit 3 (0b0001) set. + var S390Xblocks = []blockData{ + // branch on condition + {name: "BRC", controls: 1, aux: "S390XCCMask"}, // condition code value (flags) is Controls[0] + + // compare-and-branch (register-register) + // - integrates comparison of Controls[0] with Controls[1] + // - both control values must be in general purpose registers + {name: "CRJ", controls: 2, aux: "S390XCCMask"}, // signed 32-bit integer comparison + {name: "CGRJ", controls: 2, aux: "S390XCCMask"}, // signed 64-bit integer comparison + {name: "CLRJ", controls: 2, aux: "S390XCCMask"}, // unsigned 32-bit integer comparison + {name: "CLGRJ", controls: 2, aux: "S390XCCMask"}, // unsigned 64-bit integer comparison + + // compare-and-branch (register-immediate) + // - integrates comparison of Controls[0] with AuxInt + // - control value must be in a general purpose register + // - the AuxInt value is sign-extended for signed comparisons + // and zero-extended for unsigned comparisons + {name: "CIJ", controls: 1, aux: "S390XCCMaskInt8"}, // signed 32-bit integer comparison + {name: "CGIJ", controls: 1, aux: "S390XCCMaskInt8"}, // signed 64-bit integer comparison + {name: "CLIJ", controls: 1, aux: "S390XCCMaskUint8"}, // unsigned 32-bit integer comparison + {name: "CLGIJ", controls: 1, aux: "S390XCCMaskUint8"}, // unsigned 64-bit integer comparison + } + + archs = append(archs, arch{ + name: "S390X", + pkg: "cmd/internal/obj/s390x", + genfile: "../../s390x/ssa.go", + ops: S390Xops, + blocks: S390Xblocks, + regnames: regNamesS390X, + gpregmask: gp, + fpregmask: fp, + framepointerreg: -1, // not used + linkreg: int8(num["R14"]), + imports: []string{ + "cmd/internal/obj/s390x", + }, + }) +} diff --git a/src/cmd/compile/internal/ssa/_gen/Wasm.rules b/src/cmd/compile/internal/ssa/_gen/Wasm.rules new file mode 100644 index 0000000..a9ed82e --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/Wasm.rules @@ -0,0 +1,396 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Lowering arithmetic +(Add(64|32|16|8|Ptr) ...) => (I64Add ...) +(Add(64|32)F ...) => (F(64|32)Add ...) + +(Sub(64|32|16|8|Ptr) ...) => (I64Sub ...) +(Sub(64|32)F ...) => (F(64|32)Sub ...) + +(Mul(64|32|16|8) ...) => (I64Mul ...) +(Mul(64|32)F ...) => (F(64|32)Mul ...) + +(Div64 [false] x y) => (I64DivS x y) +(Div32 [false] x y) => (I64DivS (SignExt32to64 x) (SignExt32to64 y)) +(Div16 [false] x y) => (I64DivS (SignExt16to64 x) (SignExt16to64 y)) +(Div8 x y) => (I64DivS (SignExt8to64 x) (SignExt8to64 y)) +(Div64u ...) => (I64DivU ...) +(Div32u x y) => (I64DivU (ZeroExt32to64 x) (ZeroExt32to64 y)) +(Div16u x y) => (I64DivU (ZeroExt16to64 x) (ZeroExt16to64 y)) +(Div8u x y) => (I64DivU (ZeroExt8to64 x) (ZeroExt8to64 y)) +(Div(64|32)F ...) => (F(64|32)Div ...) + +(Mod64 [false] x y) => (I64RemS x y) +(Mod32 [false] x y) => (I64RemS (SignExt32to64 x) (SignExt32to64 y)) +(Mod16 [false] x y) => (I64RemS (SignExt16to64 x) (SignExt16to64 y)) +(Mod8 x y) => (I64RemS (SignExt8to64 x) (SignExt8to64 y)) +(Mod64u ...) => (I64RemU ...) +(Mod32u x y) => (I64RemU (ZeroExt32to64 x) (ZeroExt32to64 y)) +(Mod16u x y) => (I64RemU (ZeroExt16to64 x) (ZeroExt16to64 y)) +(Mod8u x y) => (I64RemU (ZeroExt8to64 x) (ZeroExt8to64 y)) + +(And(64|32|16|8|B) ...) => (I64And ...) + +(Or(64|32|16|8|B) ...) => (I64Or ...) + +(Xor(64|32|16|8) ...) => (I64Xor ...) + +(Neg(64|32|16|8) x) => (I64Sub (I64Const [0]) x) +(Neg(64|32)F ...) => (F(64|32)Neg ...) + +(Com(64|32|16|8) x) => (I64Xor x (I64Const [-1])) + +(Not ...) => (I64Eqz ...) + +// Lowering pointer arithmetic +(OffPtr ...) => (I64AddConst ...) + +// Lowering extension +// It is unnecessary to extend loads +(SignExt32to64 x:(I64Load32S _ _)) => x +(SignExt16to(64|32) x:(I64Load16S _ _)) => x +(SignExt8to(64|32|16) x:(I64Load8S _ _)) => x +(ZeroExt32to64 x:(I64Load32U _ _)) => x +(ZeroExt16to(64|32) x:(I64Load16U _ _)) => x +(ZeroExt8to(64|32|16) x:(I64Load8U _ _)) => x +(SignExt32to64 x) && buildcfg.GOWASM.SignExt => (I64Extend32S x) +(SignExt8to(64|32|16) x) && buildcfg.GOWASM.SignExt => (I64Extend8S x) +(SignExt16to(64|32) x) && buildcfg.GOWASM.SignExt => (I64Extend16S x) +(SignExt32to64 x) => (I64ShrS (I64Shl x (I64Const [32])) (I64Const [32])) +(SignExt16to(64|32) x) => (I64ShrS (I64Shl x (I64Const [48])) (I64Const [48])) +(SignExt8to(64|32|16) x) => (I64ShrS (I64Shl x (I64Const [56])) (I64Const [56])) +(ZeroExt32to64 x) => (I64And x (I64Const [0xffffffff])) +(ZeroExt16to(64|32) x) => (I64And x (I64Const [0xffff])) +(ZeroExt8to(64|32|16) x) => (I64And x (I64Const [0xff])) + +(Slicemask x) => (I64ShrS (I64Sub (I64Const [0]) x) (I64Const [63])) + +// Lowering truncation +// Because we ignore the high parts, truncates are just copies. +(Trunc64to(32|16|8) ...) => (Copy ...) +(Trunc32to(16|8) ...) => (Copy ...) +(Trunc16to8 ...) => (Copy ...) + +// Lowering float <=> int +(Cvt32to(64|32)F x) => (F(64|32)ConvertI64S (SignExt32to64 x)) +(Cvt64to(64|32)F ...) => (F(64|32)ConvertI64S ...) +(Cvt32Uto(64|32)F x) => (F(64|32)ConvertI64U (ZeroExt32to64 x)) +(Cvt64Uto(64|32)F ...) => (F(64|32)ConvertI64U ...) + +(Cvt32Fto32 ...) => (I64TruncSatF32S ...) +(Cvt32Fto64 ...) => (I64TruncSatF32S ...) +(Cvt64Fto32 ...) => (I64TruncSatF64S ...) +(Cvt64Fto64 ...) => (I64TruncSatF64S ...) +(Cvt32Fto32U ...) => (I64TruncSatF32U ...) +(Cvt32Fto64U ...) => (I64TruncSatF32U ...) +(Cvt64Fto32U ...) => (I64TruncSatF64U ...) +(Cvt64Fto64U ...) => (I64TruncSatF64U ...) + +(Cvt32Fto64F ...) => (F64PromoteF32 ...) +(Cvt64Fto32F ...) => (F32DemoteF64 ...) + +(CvtBoolToUint8 ...) => (Copy ...) + +(Round32F ...) => (Copy ...) +(Round64F ...) => (Copy ...) + +// Lowering shifts +// Unsigned shifts need to return 0 if shift amount is >= width of shifted value. + +(Lsh64x64 x y) && shiftIsBounded(v) => (I64Shl x y) +(Lsh64x64 x (I64Const [c])) && uint64(c) < 64 => (I64Shl x (I64Const [c])) +(Lsh64x64 x (I64Const [c])) && uint64(c) >= 64 => (I64Const [0]) +(Lsh64x64 x y) => (Select (I64Shl x y) (I64Const [0]) (I64LtU y (I64Const [64]))) +(Lsh64x(32|16|8) [c] x y) => (Lsh64x64 [c] x (ZeroExt(32|16|8)to64 y)) + +(Lsh32x64 ...) => (Lsh64x64 ...) +(Lsh32x(32|16|8) [c] x y) => (Lsh64x64 [c] x (ZeroExt(32|16|8)to64 y)) + +(Lsh16x64 ...) => (Lsh64x64 ...) +(Lsh16x(32|16|8) [c] x y) => (Lsh64x64 [c] x (ZeroExt(32|16|8)to64 y)) + +(Lsh8x64 ...) => (Lsh64x64 ...) +(Lsh8x(32|16|8) [c] x y) => (Lsh64x64 [c] x (ZeroExt(32|16|8)to64 y)) + +(Rsh64Ux64 x y) && shiftIsBounded(v) => (I64ShrU x y) +(Rsh64Ux64 x (I64Const [c])) && uint64(c) < 64 => (I64ShrU x (I64Const [c])) +(Rsh64Ux64 x (I64Const [c])) && uint64(c) >= 64 => (I64Const [0]) +(Rsh64Ux64 x y) => (Select (I64ShrU x y) (I64Const [0]) (I64LtU y (I64Const [64]))) +(Rsh64Ux(32|16|8) [c] x y) => (Rsh64Ux64 [c] x (ZeroExt(32|16|8)to64 y)) + +(Rsh32Ux64 [c] x y) => (Rsh64Ux64 [c] (ZeroExt32to64 x) y) +(Rsh32Ux(32|16|8) [c] x y) => (Rsh64Ux64 [c] (ZeroExt32to64 x) (ZeroExt(32|16|8)to64 y)) + +(Rsh16Ux64 [c] x y) => (Rsh64Ux64 [c] (ZeroExt16to64 x) y) +(Rsh16Ux(32|16|8) [c] x y) => (Rsh64Ux64 [c] (ZeroExt16to64 x) (ZeroExt(32|16|8)to64 y)) + +(Rsh8Ux64 [c] x y) => (Rsh64Ux64 [c] (ZeroExt8to64 x) y) +(Rsh8Ux(32|16|8) [c] x y) => (Rsh64Ux64 [c] (ZeroExt8to64 x) (ZeroExt(32|16|8)to64 y)) + +// Signed right shift needs to return 0/-1 if shift amount is >= width of shifted value. +// We implement this by setting the shift value to (width - 1) if the shift value is >= width. + +(Rsh64x64 x y) && shiftIsBounded(v) => (I64ShrS x y) +(Rsh64x64 x (I64Const [c])) && uint64(c) < 64 => (I64ShrS x (I64Const [c])) +(Rsh64x64 x (I64Const [c])) && uint64(c) >= 64 => (I64ShrS x (I64Const [63])) +(Rsh64x64 x y) => (I64ShrS x (Select <typ.Int64> y (I64Const [63]) (I64LtU y (I64Const [64])))) +(Rsh64x(32|16|8) [c] x y) => (Rsh64x64 [c] x (ZeroExt(32|16|8)to64 y)) + +(Rsh32x64 [c] x y) => (Rsh64x64 [c] (SignExt32to64 x) y) +(Rsh32x(32|16|8) [c] x y) => (Rsh64x64 [c] (SignExt32to64 x) (ZeroExt(32|16|8)to64 y)) + +(Rsh16x64 [c] x y) => (Rsh64x64 [c] (SignExt16to64 x) y) +(Rsh16x(32|16|8) [c] x y) => (Rsh64x64 [c] (SignExt16to64 x) (ZeroExt(32|16|8)to64 y)) + +(Rsh8x64 [c] x y) => (Rsh64x64 [c] (SignExt8to64 x) y) +(Rsh8x(32|16|8) [c] x y) => (Rsh64x64 [c] (SignExt8to64 x) (ZeroExt(32|16|8)to64 y)) + +// Lowering rotates +(RotateLeft8 <t> x (I64Const [c])) => (Or8 (Lsh8x64 <t> x (I64Const [c&7])) (Rsh8Ux64 <t> x (I64Const [-c&7]))) +(RotateLeft16 <t> x (I64Const [c])) => (Or16 (Lsh16x64 <t> x (I64Const [c&15])) (Rsh16Ux64 <t> x (I64Const [-c&15]))) +(RotateLeft32 ...) => (I32Rotl ...) +(RotateLeft64 ...) => (I64Rotl ...) + +// Lowering comparisons +(Less64 ...) => (I64LtS ...) +(Less32 x y) => (I64LtS (SignExt32to64 x) (SignExt32to64 y)) +(Less16 x y) => (I64LtS (SignExt16to64 x) (SignExt16to64 y)) +(Less8 x y) => (I64LtS (SignExt8to64 x) (SignExt8to64 y)) +(Less64U ...) => (I64LtU ...) +(Less32U x y) => (I64LtU (ZeroExt32to64 x) (ZeroExt32to64 y)) +(Less16U x y) => (I64LtU (ZeroExt16to64 x) (ZeroExt16to64 y)) +(Less8U x y) => (I64LtU (ZeroExt8to64 x) (ZeroExt8to64 y)) +(Less(64|32)F ...) => (F(64|32)Lt ...) + +(Leq64 ...) => (I64LeS ...) +(Leq32 x y) => (I64LeS (SignExt32to64 x) (SignExt32to64 y)) +(Leq16 x y) => (I64LeS (SignExt16to64 x) (SignExt16to64 y)) +(Leq8 x y) => (I64LeS (SignExt8to64 x) (SignExt8to64 y)) +(Leq64U ...) => (I64LeU ...) +(Leq32U x y) => (I64LeU (ZeroExt32to64 x) (ZeroExt32to64 y)) +(Leq16U x y) => (I64LeU (ZeroExt16to64 x) (ZeroExt16to64 y)) +(Leq8U x y) => (I64LeU (ZeroExt8to64 x) (ZeroExt8to64 y)) +(Leq(64|32)F ...) => (F(64|32)Le ...) + +(Eq64 ...) => (I64Eq ...) +(Eq32 x y) => (I64Eq (ZeroExt32to64 x) (ZeroExt32to64 y)) +(Eq16 x y) => (I64Eq (ZeroExt16to64 x) (ZeroExt16to64 y)) +(Eq8 x y) => (I64Eq (ZeroExt8to64 x) (ZeroExt8to64 y)) +(EqB ...) => (I64Eq ...) +(EqPtr ...) => (I64Eq ...) +(Eq(64|32)F ...) => (F(64|32)Eq ...) + +(Neq64 ...) => (I64Ne ...) +(Neq32 x y) => (I64Ne (ZeroExt32to64 x) (ZeroExt32to64 y)) +(Neq16 x y) => (I64Ne (ZeroExt16to64 x) (ZeroExt16to64 y)) +(Neq8 x y) => (I64Ne (ZeroExt8to64 x) (ZeroExt8to64 y)) +(NeqB ...) => (I64Ne ...) +(NeqPtr ...) => (I64Ne ...) +(Neq(64|32)F ...) => (F(64|32)Ne ...) + +// Lowering loads +(Load <t> ptr mem) && is32BitFloat(t) => (F32Load ptr mem) +(Load <t> ptr mem) && is64BitFloat(t) => (F64Load ptr mem) +(Load <t> ptr mem) && t.Size() == 8 => (I64Load ptr mem) +(Load <t> ptr mem) && t.Size() == 4 && !t.IsSigned() => (I64Load32U ptr mem) +(Load <t> ptr mem) && t.Size() == 4 && t.IsSigned() => (I64Load32S ptr mem) +(Load <t> ptr mem) && t.Size() == 2 && !t.IsSigned() => (I64Load16U ptr mem) +(Load <t> ptr mem) && t.Size() == 2 && t.IsSigned() => (I64Load16S ptr mem) +(Load <t> ptr mem) && t.Size() == 1 && !t.IsSigned() => (I64Load8U ptr mem) +(Load <t> ptr mem) && t.Size() == 1 && t.IsSigned() => (I64Load8S ptr mem) + +// Lowering stores +(Store {t} ptr val mem) && is64BitFloat(t) => (F64Store ptr val mem) +(Store {t} ptr val mem) && is32BitFloat(t) => (F32Store ptr val mem) +(Store {t} ptr val mem) && t.Size() == 8 => (I64Store ptr val mem) +(Store {t} ptr val mem) && t.Size() == 4 => (I64Store32 ptr val mem) +(Store {t} ptr val mem) && t.Size() == 2 => (I64Store16 ptr val mem) +(Store {t} ptr val mem) && t.Size() == 1 => (I64Store8 ptr val mem) + +// Lowering moves +(Move [0] _ _ mem) => mem +(Move [1] dst src mem) => (I64Store8 dst (I64Load8U src mem) mem) +(Move [2] dst src mem) => (I64Store16 dst (I64Load16U src mem) mem) +(Move [4] dst src mem) => (I64Store32 dst (I64Load32U src mem) mem) +(Move [8] dst src mem) => (I64Store dst (I64Load src mem) mem) +(Move [16] dst src mem) => + (I64Store [8] dst (I64Load [8] src mem) + (I64Store dst (I64Load src mem) mem)) +(Move [3] dst src mem) => + (I64Store8 [2] dst (I64Load8U [2] src mem) + (I64Store16 dst (I64Load16U src mem) mem)) +(Move [5] dst src mem) => + (I64Store8 [4] dst (I64Load8U [4] src mem) + (I64Store32 dst (I64Load32U src mem) mem)) +(Move [6] dst src mem) => + (I64Store16 [4] dst (I64Load16U [4] src mem) + (I64Store32 dst (I64Load32U src mem) mem)) +(Move [7] dst src mem) => + (I64Store32 [3] dst (I64Load32U [3] src mem) + (I64Store32 dst (I64Load32U src mem) mem)) +(Move [s] dst src mem) && s > 8 && s < 16 => + (I64Store [s-8] dst (I64Load [s-8] src mem) + (I64Store dst (I64Load src mem) mem)) + +// Large copying uses helper. +(Move [s] dst src mem) && logLargeCopy(v, s) => + (LoweredMove [s] dst src mem) + +// Lowering Zero instructions +(Zero [0] _ mem) => mem +(Zero [1] destptr mem) => (I64Store8 destptr (I64Const [0]) mem) +(Zero [2] destptr mem) => (I64Store16 destptr (I64Const [0]) mem) +(Zero [4] destptr mem) => (I64Store32 destptr (I64Const [0]) mem) +(Zero [8] destptr mem) => (I64Store destptr (I64Const [0]) mem) + +(Zero [3] destptr mem) => + (I64Store8 [2] destptr (I64Const [0]) + (I64Store16 destptr (I64Const [0]) mem)) +(Zero [5] destptr mem) => + (I64Store8 [4] destptr (I64Const [0]) + (I64Store32 destptr (I64Const [0]) mem)) +(Zero [6] destptr mem) => + (I64Store16 [4] destptr (I64Const [0]) + (I64Store32 destptr (I64Const [0]) mem)) +(Zero [7] destptr mem) => + (I64Store32 [3] destptr (I64Const [0]) + (I64Store32 destptr (I64Const [0]) mem)) + +// Strip off any fractional word zeroing. +(Zero [s] destptr mem) && s%8 != 0 && s > 8 && s < 32 => + (Zero [s-s%8] (OffPtr <destptr.Type> destptr [s%8]) + (I64Store destptr (I64Const [0]) mem)) + +// Zero small numbers of words directly. +(Zero [16] destptr mem) => + (I64Store [8] destptr (I64Const [0]) + (I64Store destptr (I64Const [0]) mem)) +(Zero [24] destptr mem) => + (I64Store [16] destptr (I64Const [0]) + (I64Store [8] destptr (I64Const [0]) + (I64Store destptr (I64Const [0]) mem))) +(Zero [32] destptr mem) => + (I64Store [24] destptr (I64Const [0]) + (I64Store [16] destptr (I64Const [0]) + (I64Store [8] destptr (I64Const [0]) + (I64Store destptr (I64Const [0]) mem)))) + +// Large zeroing uses helper. +(Zero [s] destptr mem) => + (LoweredZero [s] destptr mem) + +// Lowering constants +(Const64 ...) => (I64Const ...) +(Const(32|16|8) [c]) => (I64Const [int64(c)]) +(Const(64|32)F ...) => (F(64|32)Const ...) +(ConstNil) => (I64Const [0]) +(ConstBool [c]) => (I64Const [b2i(c)]) + +// Lowering calls +(StaticCall ...) => (LoweredStaticCall ...) +(ClosureCall ...) => (LoweredClosureCall ...) +(InterCall ...) => (LoweredInterCall ...) +(TailCall ...) => (LoweredTailCall ...) + +// Miscellaneous +(Convert ...) => (LoweredConvert ...) +(IsNonNil p) => (I64Eqz (I64Eqz p)) +(IsInBounds ...) => (I64LtU ...) +(IsSliceInBounds ...) => (I64LeU ...) +(NilCheck ...) => (LoweredNilCheck ...) +(GetClosurePtr ...) => (LoweredGetClosurePtr ...) +(GetCallerPC ...) => (LoweredGetCallerPC ...) +(GetCallerSP ...) => (LoweredGetCallerSP ...) +(Addr {sym} base) => (LoweredAddr {sym} [0] base) +(LocalAddr {sym} base _) => (LoweredAddr {sym} base) + +// Write barrier. +(WB ...) => (LoweredWB ...) + +// --- Intrinsics --- +(Sqrt ...) => (F64Sqrt ...) +(Trunc ...) => (F64Trunc ...) +(Ceil ...) => (F64Ceil ...) +(Floor ...) => (F64Floor ...) +(RoundToEven ...) => (F64Nearest ...) +(Abs ...) => (F64Abs ...) +(Copysign ...) => (F64Copysign ...) + +(Sqrt32 ...) => (F32Sqrt ...) + +(Ctz64 ...) => (I64Ctz ...) +(Ctz32 x) => (I64Ctz (I64Or x (I64Const [0x100000000]))) +(Ctz16 x) => (I64Ctz (I64Or x (I64Const [0x10000]))) +(Ctz8 x) => (I64Ctz (I64Or x (I64Const [0x100]))) + +(Ctz(64|32|16|8)NonZero ...) => (I64Ctz ...) + +(BitLen64 x) => (I64Sub (I64Const [64]) (I64Clz x)) + +(PopCount64 ...) => (I64Popcnt ...) +(PopCount32 x) => (I64Popcnt (ZeroExt32to64 x)) +(PopCount16 x) => (I64Popcnt (ZeroExt16to64 x)) +(PopCount8 x) => (I64Popcnt (ZeroExt8to64 x)) + +(CondSelect ...) => (Select ...) + +// --- Optimizations --- +(I64Add (I64Const [x]) (I64Const [y])) => (I64Const [x + y]) +(I64Mul (I64Const [x]) (I64Const [y])) => (I64Const [x * y]) +(I64And (I64Const [x]) (I64Const [y])) => (I64Const [x & y]) +(I64Or (I64Const [x]) (I64Const [y])) => (I64Const [x | y]) +(I64Xor (I64Const [x]) (I64Const [y])) => (I64Const [x ^ y]) +(F64Add (F64Const [x]) (F64Const [y])) => (F64Const [x + y]) +(F64Mul (F64Const [x]) (F64Const [y])) && !math.IsNaN(x * y) => (F64Const [x * y]) +(I64Eq (I64Const [x]) (I64Const [y])) && x == y => (I64Const [1]) +(I64Eq (I64Const [x]) (I64Const [y])) && x != y => (I64Const [0]) +(I64Ne (I64Const [x]) (I64Const [y])) && x == y => (I64Const [0]) +(I64Ne (I64Const [x]) (I64Const [y])) && x != y => (I64Const [1]) + +(I64Shl (I64Const [x]) (I64Const [y])) => (I64Const [x << uint64(y)]) +(I64ShrU (I64Const [x]) (I64Const [y])) => (I64Const [int64(uint64(x) >> uint64(y))]) +(I64ShrS (I64Const [x]) (I64Const [y])) => (I64Const [x >> uint64(y)]) + +// TODO: declare these operations as commutative and get rid of these rules? +(I64Add (I64Const [x]) y) && y.Op != OpWasmI64Const => (I64Add y (I64Const [x])) +(I64Mul (I64Const [x]) y) && y.Op != OpWasmI64Const => (I64Mul y (I64Const [x])) +(I64And (I64Const [x]) y) && y.Op != OpWasmI64Const => (I64And y (I64Const [x])) +(I64Or (I64Const [x]) y) && y.Op != OpWasmI64Const => (I64Or y (I64Const [x])) +(I64Xor (I64Const [x]) y) && y.Op != OpWasmI64Const => (I64Xor y (I64Const [x])) +(F64Add (F64Const [x]) y) && y.Op != OpWasmF64Const => (F64Add y (F64Const [x])) +(F64Mul (F64Const [x]) y) && y.Op != OpWasmF64Const => (F64Mul y (F64Const [x])) +(I64Eq (I64Const [x]) y) && y.Op != OpWasmI64Const => (I64Eq y (I64Const [x])) +(I64Ne (I64Const [x]) y) && y.Op != OpWasmI64Const => (I64Ne y (I64Const [x])) + +(I64Eq x (I64Const [0])) => (I64Eqz x) +(I64LtU (I64Const [0]) x) => (I64Eqz (I64Eqz x)) +(I64LeU x (I64Const [0])) => (I64Eqz x) +(I64LtU x (I64Const [1])) => (I64Eqz x) +(I64LeU (I64Const [1]) x) => (I64Eqz (I64Eqz x)) +(I64Ne x (I64Const [0])) => (I64Eqz (I64Eqz x)) + +(I64Add x (I64Const [y])) => (I64AddConst [y] x) +(I64AddConst [0] x) => x +(I64Eqz (I64Eqz (I64Eqz x))) => (I64Eqz x) + +// folding offset into load/store +((I64Load|I64Load32U|I64Load32S|I64Load16U|I64Load16S|I64Load8U|I64Load8S) [off] (I64AddConst [off2] ptr) mem) + && isU32Bit(off+off2) => + ((I64Load|I64Load32U|I64Load32S|I64Load16U|I64Load16S|I64Load8U|I64Load8S) [off+off2] ptr mem) + +((I64Store|I64Store32|I64Store16|I64Store8) [off] (I64AddConst [off2] ptr) val mem) + && isU32Bit(off+off2) => + ((I64Store|I64Store32|I64Store16|I64Store8) [off+off2] ptr val mem) + +// folding offset into address +(I64AddConst [off] (LoweredAddr {sym} [off2] base)) && isU32Bit(off+int64(off2)) => + (LoweredAddr {sym} [int32(off)+off2] base) +(I64AddConst [off] x:(SP)) && isU32Bit(off) => (LoweredAddr [int32(off)] x) // so it is rematerializeable + +// transforming readonly globals into constants +(I64Load [off] (LoweredAddr {sym} [off2] (SB)) _) && symIsRO(sym) && isU32Bit(off+int64(off2)) => (I64Const [int64(read64(sym, off+int64(off2), config.ctxt.Arch.ByteOrder))]) +(I64Load32U [off] (LoweredAddr {sym} [off2] (SB)) _) && symIsRO(sym) && isU32Bit(off+int64(off2)) => (I64Const [int64(read32(sym, off+int64(off2), config.ctxt.Arch.ByteOrder))]) +(I64Load16U [off] (LoweredAddr {sym} [off2] (SB)) _) && symIsRO(sym) && isU32Bit(off+int64(off2)) => (I64Const [int64(read16(sym, off+int64(off2), config.ctxt.Arch.ByteOrder))]) +(I64Load8U [off] (LoweredAddr {sym} [off2] (SB)) _) && symIsRO(sym) && isU32Bit(off+int64(off2)) => (I64Const [int64(read8(sym, off+int64(off2)))]) diff --git a/src/cmd/compile/internal/ssa/_gen/WasmOps.go b/src/cmd/compile/internal/ssa/_gen/WasmOps.go new file mode 100644 index 0000000..cd127b5 --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/WasmOps.go @@ -0,0 +1,277 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import "strings" + +var regNamesWasm = []string{ + "R0", + "R1", + "R2", + "R3", + "R4", + "R5", + "R6", + "R7", + "R8", + "R9", + "R10", + "R11", + "R12", + "R13", + "R14", + "R15", + + "F0", + "F1", + "F2", + "F3", + "F4", + "F5", + "F6", + "F7", + "F8", + "F9", + "F10", + "F11", + "F12", + "F13", + "F14", + "F15", + + "F16", + "F17", + "F18", + "F19", + "F20", + "F21", + "F22", + "F23", + "F24", + "F25", + "F26", + "F27", + "F28", + "F29", + "F30", + "F31", + + "SP", + "g", + + // pseudo-registers + "SB", +} + +func init() { + // Make map from reg names to reg integers. + if len(regNamesWasm) > 64 { + panic("too many registers") + } + num := map[string]int{} + for i, name := range regNamesWasm { + num[name] = i + } + buildReg := func(s string) regMask { + m := regMask(0) + for _, r := range strings.Split(s, " ") { + if n, ok := num[r]; ok { + m |= regMask(1) << uint(n) + continue + } + panic("register " + r + " not found") + } + return m + } + + var ( + gp = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15") + fp32 = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15") + fp64 = buildReg("F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31") + gpsp = gp | buildReg("SP") + gpspsb = gpsp | buildReg("SB") + // The "registers", which are actually local variables, can get clobbered + // if we're switching goroutines, because it unwinds the WebAssembly stack. + callerSave = gp | fp32 | fp64 | buildReg("g") + ) + + // Common regInfo + var ( + gp01 = regInfo{inputs: nil, outputs: []regMask{gp}} + gp11 = regInfo{inputs: []regMask{gpsp}, outputs: []regMask{gp}} + gp21 = regInfo{inputs: []regMask{gpsp, gpsp}, outputs: []regMask{gp}} + gp31 = regInfo{inputs: []regMask{gpsp, gpsp, gpsp}, outputs: []regMask{gp}} + fp32_01 = regInfo{inputs: nil, outputs: []regMask{fp32}} + fp32_11 = regInfo{inputs: []regMask{fp32}, outputs: []regMask{fp32}} + fp32_21 = regInfo{inputs: []regMask{fp32, fp32}, outputs: []regMask{fp32}} + fp32_21gp = regInfo{inputs: []regMask{fp32, fp32}, outputs: []regMask{gp}} + fp64_01 = regInfo{inputs: nil, outputs: []regMask{fp64}} + fp64_11 = regInfo{inputs: []regMask{fp64}, outputs: []regMask{fp64}} + fp64_21 = regInfo{inputs: []regMask{fp64, fp64}, outputs: []regMask{fp64}} + fp64_21gp = regInfo{inputs: []regMask{fp64, fp64}, outputs: []regMask{gp}} + gpload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: []regMask{gp}} + gpstore = regInfo{inputs: []regMask{gpspsb, gpsp, 0}} + fp32load = regInfo{inputs: []regMask{gpspsb, 0}, outputs: []regMask{fp32}} + fp32store = regInfo{inputs: []regMask{gpspsb, fp32, 0}} + fp64load = regInfo{inputs: []regMask{gpspsb, 0}, outputs: []regMask{fp64}} + fp64store = regInfo{inputs: []regMask{gpspsb, fp64, 0}} + ) + + var WasmOps = []opData{ + {name: "LoweredStaticCall", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem + {name: "LoweredTailCall", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem + {name: "LoweredClosureCall", argLength: 3, reg: regInfo{inputs: []regMask{gp, gp, 0}, clobbers: callerSave}, aux: "CallOff", call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem + {name: "LoweredInterCall", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem + + {name: "LoweredAddr", argLength: 1, reg: gp11, aux: "SymOff", rematerializeable: true, symEffect: "Addr"}, // returns base+aux+auxint, arg0=base + {name: "LoweredMove", argLength: 3, reg: regInfo{inputs: []regMask{gp, gp}}, aux: "Int64"}, // large move. arg0=dst, arg1=src, arg2=mem, auxint=len, returns mem + {name: "LoweredZero", argLength: 2, reg: regInfo{inputs: []regMask{gp}}, aux: "Int64"}, // large zeroing. arg0=start, arg1=mem, auxint=len, returns mem + + {name: "LoweredGetClosurePtr", reg: gp01}, // returns wasm.REG_CTXT, the closure pointer + {name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true}, // returns the PC of the caller of the current function + {name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true}, // returns the SP of the caller of the current function + {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gp}}, nilCheck: true, faultOnNilArg0: true}, // panic if arg0 is nil. arg1=mem + {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{gp, gp}}, aux: "Sym", symEffect: "None"}, // invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier + + // LoweredConvert converts between pointers and integers. + // We have a special op for this so as to not confuse GCCallOff + // (particularly stack maps). It takes a memory arg so it + // gets correctly ordered with respect to GC safepoints. + // arg0=ptr/int arg1=mem, output=int/ptr + // + // TODO(neelance): LoweredConvert should not be necessary any more, since OpConvert does not need to be lowered any more (CL 108496). + {name: "LoweredConvert", argLength: 2, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}}, + + // The following are native WebAssembly instructions, see https://webassembly.github.io/spec/core/syntax/instructions.html + + {name: "Select", asm: "Select", argLength: 3, reg: gp31}, // returns arg0 if arg2 != 0, otherwise returns arg1 + + {name: "I64Load8U", asm: "I64Load8U", argLength: 2, reg: gpload, aux: "Int64", typ: "UInt8"}, // read unsigned 8-bit integer from address arg0+aux, arg1=mem + {name: "I64Load8S", asm: "I64Load8S", argLength: 2, reg: gpload, aux: "Int64", typ: "Int8"}, // read signed 8-bit integer from address arg0+aux, arg1=mem + {name: "I64Load16U", asm: "I64Load16U", argLength: 2, reg: gpload, aux: "Int64", typ: "UInt16"}, // read unsigned 16-bit integer from address arg0+aux, arg1=mem + {name: "I64Load16S", asm: "I64Load16S", argLength: 2, reg: gpload, aux: "Int64", typ: "Int16"}, // read signed 16-bit integer from address arg0+aux, arg1=mem + {name: "I64Load32U", asm: "I64Load32U", argLength: 2, reg: gpload, aux: "Int64", typ: "UInt32"}, // read unsigned 32-bit integer from address arg0+aux, arg1=mem + {name: "I64Load32S", asm: "I64Load32S", argLength: 2, reg: gpload, aux: "Int64", typ: "Int32"}, // read signed 32-bit integer from address arg0+aux, arg1=mem + {name: "I64Load", asm: "I64Load", argLength: 2, reg: gpload, aux: "Int64", typ: "UInt64"}, // read 64-bit integer from address arg0+aux, arg1=mem + {name: "I64Store8", asm: "I64Store8", argLength: 3, reg: gpstore, aux: "Int64", typ: "Mem"}, // store 8-bit integer arg1 at address arg0+aux, arg2=mem, returns mem + {name: "I64Store16", asm: "I64Store16", argLength: 3, reg: gpstore, aux: "Int64", typ: "Mem"}, // store 16-bit integer arg1 at address arg0+aux, arg2=mem, returns mem + {name: "I64Store32", asm: "I64Store32", argLength: 3, reg: gpstore, aux: "Int64", typ: "Mem"}, // store 32-bit integer arg1 at address arg0+aux, arg2=mem, returns mem + {name: "I64Store", asm: "I64Store", argLength: 3, reg: gpstore, aux: "Int64", typ: "Mem"}, // store 64-bit integer arg1 at address arg0+aux, arg2=mem, returns mem + + {name: "F32Load", asm: "F32Load", argLength: 2, reg: fp32load, aux: "Int64", typ: "Float32"}, // read 32-bit float from address arg0+aux, arg1=mem + {name: "F64Load", asm: "F64Load", argLength: 2, reg: fp64load, aux: "Int64", typ: "Float64"}, // read 64-bit float from address arg0+aux, arg1=mem + {name: "F32Store", asm: "F32Store", argLength: 3, reg: fp32store, aux: "Int64", typ: "Mem"}, // store 32-bit float arg1 at address arg0+aux, arg2=mem, returns mem + {name: "F64Store", asm: "F64Store", argLength: 3, reg: fp64store, aux: "Int64", typ: "Mem"}, // store 64-bit float arg1 at address arg0+aux, arg2=mem, returns mem + + {name: "I64Const", reg: gp01, aux: "Int64", rematerializeable: true, typ: "Int64"}, // returns the constant integer aux + {name: "F32Const", reg: fp32_01, aux: "Float32", rematerializeable: true, typ: "Float32"}, // returns the constant float aux + {name: "F64Const", reg: fp64_01, aux: "Float64", rematerializeable: true, typ: "Float64"}, // returns the constant float aux + + {name: "I64Eqz", asm: "I64Eqz", argLength: 1, reg: gp11, typ: "Bool"}, // arg0 == 0 + {name: "I64Eq", asm: "I64Eq", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 == arg1 + {name: "I64Ne", asm: "I64Ne", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 != arg1 + {name: "I64LtS", asm: "I64LtS", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 < arg1 (signed) + {name: "I64LtU", asm: "I64LtU", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 < arg1 (unsigned) + {name: "I64GtS", asm: "I64GtS", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 > arg1 (signed) + {name: "I64GtU", asm: "I64GtU", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 > arg1 (unsigned) + {name: "I64LeS", asm: "I64LeS", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 <= arg1 (signed) + {name: "I64LeU", asm: "I64LeU", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 <= arg1 (unsigned) + {name: "I64GeS", asm: "I64GeS", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 >= arg1 (signed) + {name: "I64GeU", asm: "I64GeU", argLength: 2, reg: gp21, typ: "Bool"}, // arg0 >= arg1 (unsigned) + + {name: "F32Eq", asm: "F32Eq", argLength: 2, reg: fp32_21gp, typ: "Bool"}, // arg0 == arg1 + {name: "F32Ne", asm: "F32Ne", argLength: 2, reg: fp32_21gp, typ: "Bool"}, // arg0 != arg1 + {name: "F32Lt", asm: "F32Lt", argLength: 2, reg: fp32_21gp, typ: "Bool"}, // arg0 < arg1 + {name: "F32Gt", asm: "F32Gt", argLength: 2, reg: fp32_21gp, typ: "Bool"}, // arg0 > arg1 + {name: "F32Le", asm: "F32Le", argLength: 2, reg: fp32_21gp, typ: "Bool"}, // arg0 <= arg1 + {name: "F32Ge", asm: "F32Ge", argLength: 2, reg: fp32_21gp, typ: "Bool"}, // arg0 >= arg1 + + {name: "F64Eq", asm: "F64Eq", argLength: 2, reg: fp64_21gp, typ: "Bool"}, // arg0 == arg1 + {name: "F64Ne", asm: "F64Ne", argLength: 2, reg: fp64_21gp, typ: "Bool"}, // arg0 != arg1 + {name: "F64Lt", asm: "F64Lt", argLength: 2, reg: fp64_21gp, typ: "Bool"}, // arg0 < arg1 + {name: "F64Gt", asm: "F64Gt", argLength: 2, reg: fp64_21gp, typ: "Bool"}, // arg0 > arg1 + {name: "F64Le", asm: "F64Le", argLength: 2, reg: fp64_21gp, typ: "Bool"}, // arg0 <= arg1 + {name: "F64Ge", asm: "F64Ge", argLength: 2, reg: fp64_21gp, typ: "Bool"}, // arg0 >= arg1 + + {name: "I64Add", asm: "I64Add", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 + arg1 + {name: "I64AddConst", asm: "I64Add", argLength: 1, reg: gp11, aux: "Int64", typ: "Int64"}, // arg0 + aux + {name: "I64Sub", asm: "I64Sub", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 - arg1 + {name: "I64Mul", asm: "I64Mul", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 * arg1 + {name: "I64DivS", asm: "I64DivS", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 / arg1 (signed) + {name: "I64DivU", asm: "I64DivU", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 / arg1 (unsigned) + {name: "I64RemS", asm: "I64RemS", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 % arg1 (signed) + {name: "I64RemU", asm: "I64RemU", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 % arg1 (unsigned) + {name: "I64And", asm: "I64And", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 & arg1 + {name: "I64Or", asm: "I64Or", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 | arg1 + {name: "I64Xor", asm: "I64Xor", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 ^ arg1 + {name: "I64Shl", asm: "I64Shl", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 << (arg1 % 64) + {name: "I64ShrS", asm: "I64ShrS", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 >> (arg1 % 64) (signed) + {name: "I64ShrU", asm: "I64ShrU", argLength: 2, reg: gp21, typ: "Int64"}, // arg0 >> (arg1 % 64) (unsigned) + + {name: "F32Neg", asm: "F32Neg", argLength: 1, reg: fp32_11, typ: "Float32"}, // -arg0 + {name: "F32Add", asm: "F32Add", argLength: 2, reg: fp32_21, typ: "Float32"}, // arg0 + arg1 + {name: "F32Sub", asm: "F32Sub", argLength: 2, reg: fp32_21, typ: "Float32"}, // arg0 - arg1 + {name: "F32Mul", asm: "F32Mul", argLength: 2, reg: fp32_21, typ: "Float32"}, // arg0 * arg1 + {name: "F32Div", asm: "F32Div", argLength: 2, reg: fp32_21, typ: "Float32"}, // arg0 / arg1 + + {name: "F64Neg", asm: "F64Neg", argLength: 1, reg: fp64_11, typ: "Float64"}, // -arg0 + {name: "F64Add", asm: "F64Add", argLength: 2, reg: fp64_21, typ: "Float64"}, // arg0 + arg1 + {name: "F64Sub", asm: "F64Sub", argLength: 2, reg: fp64_21, typ: "Float64"}, // arg0 - arg1 + {name: "F64Mul", asm: "F64Mul", argLength: 2, reg: fp64_21, typ: "Float64"}, // arg0 * arg1 + {name: "F64Div", asm: "F64Div", argLength: 2, reg: fp64_21, typ: "Float64"}, // arg0 / arg1 + + {name: "I64TruncSatF64S", asm: "I64TruncSatF64S", argLength: 1, reg: regInfo{inputs: []regMask{fp64}, outputs: []regMask{gp}}, typ: "Int64"}, // truncates the float arg0 to a signed integer (saturating) + {name: "I64TruncSatF64U", asm: "I64TruncSatF64U", argLength: 1, reg: regInfo{inputs: []regMask{fp64}, outputs: []regMask{gp}}, typ: "Int64"}, // truncates the float arg0 to an unsigned integer (saturating) + {name: "I64TruncSatF32S", asm: "I64TruncSatF32S", argLength: 1, reg: regInfo{inputs: []regMask{fp32}, outputs: []regMask{gp}}, typ: "Int64"}, // truncates the float arg0 to a signed integer (saturating) + {name: "I64TruncSatF32U", asm: "I64TruncSatF32U", argLength: 1, reg: regInfo{inputs: []regMask{fp32}, outputs: []regMask{gp}}, typ: "Int64"}, // truncates the float arg0 to an unsigned integer (saturating) + {name: "F32ConvertI64S", asm: "F32ConvertI64S", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{fp32}}, typ: "Float32"}, // converts the signed integer arg0 to a float + {name: "F32ConvertI64U", asm: "F32ConvertI64U", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{fp32}}, typ: "Float32"}, // converts the unsigned integer arg0 to a float + {name: "F64ConvertI64S", asm: "F64ConvertI64S", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{fp64}}, typ: "Float64"}, // converts the signed integer arg0 to a float + {name: "F64ConvertI64U", asm: "F64ConvertI64U", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{fp64}}, typ: "Float64"}, // converts the unsigned integer arg0 to a float + {name: "F32DemoteF64", asm: "F32DemoteF64", argLength: 1, reg: regInfo{inputs: []regMask{fp64}, outputs: []regMask{fp32}}, typ: "Float32"}, + {name: "F64PromoteF32", asm: "F64PromoteF32", argLength: 1, reg: regInfo{inputs: []regMask{fp32}, outputs: []regMask{fp64}}, typ: "Float64"}, + + {name: "I64Extend8S", asm: "I64Extend8S", argLength: 1, reg: gp11, typ: "Int64"}, // sign-extend arg0 from 8 to 64 bit + {name: "I64Extend16S", asm: "I64Extend16S", argLength: 1, reg: gp11, typ: "Int64"}, // sign-extend arg0 from 16 to 64 bit + {name: "I64Extend32S", asm: "I64Extend32S", argLength: 1, reg: gp11, typ: "Int64"}, // sign-extend arg0 from 32 to 64 bit + + {name: "F32Sqrt", asm: "F32Sqrt", argLength: 1, reg: fp32_11, typ: "Float32"}, // sqrt(arg0) + {name: "F32Trunc", asm: "F32Trunc", argLength: 1, reg: fp32_11, typ: "Float32"}, // trunc(arg0) + {name: "F32Ceil", asm: "F32Ceil", argLength: 1, reg: fp32_11, typ: "Float32"}, // ceil(arg0) + {name: "F32Floor", asm: "F32Floor", argLength: 1, reg: fp32_11, typ: "Float32"}, // floor(arg0) + {name: "F32Nearest", asm: "F32Nearest", argLength: 1, reg: fp32_11, typ: "Float32"}, // round(arg0) + {name: "F32Abs", asm: "F32Abs", argLength: 1, reg: fp32_11, typ: "Float32"}, // abs(arg0) + {name: "F32Copysign", asm: "F32Copysign", argLength: 2, reg: fp32_21, typ: "Float32"}, // copysign(arg0, arg1) + + {name: "F64Sqrt", asm: "F64Sqrt", argLength: 1, reg: fp64_11, typ: "Float64"}, // sqrt(arg0) + {name: "F64Trunc", asm: "F64Trunc", argLength: 1, reg: fp64_11, typ: "Float64"}, // trunc(arg0) + {name: "F64Ceil", asm: "F64Ceil", argLength: 1, reg: fp64_11, typ: "Float64"}, // ceil(arg0) + {name: "F64Floor", asm: "F64Floor", argLength: 1, reg: fp64_11, typ: "Float64"}, // floor(arg0) + {name: "F64Nearest", asm: "F64Nearest", argLength: 1, reg: fp64_11, typ: "Float64"}, // round(arg0) + {name: "F64Abs", asm: "F64Abs", argLength: 1, reg: fp64_11, typ: "Float64"}, // abs(arg0) + {name: "F64Copysign", asm: "F64Copysign", argLength: 2, reg: fp64_21, typ: "Float64"}, // copysign(arg0, arg1) + + {name: "I64Ctz", asm: "I64Ctz", argLength: 1, reg: gp11, typ: "Int64"}, // ctz(arg0) + {name: "I64Clz", asm: "I64Clz", argLength: 1, reg: gp11, typ: "Int64"}, // clz(arg0) + {name: "I32Rotl", asm: "I32Rotl", argLength: 2, reg: gp21, typ: "Int32"}, // rotl(arg0, arg1) + {name: "I64Rotl", asm: "I64Rotl", argLength: 2, reg: gp21, typ: "Int64"}, // rotl(arg0, arg1) + {name: "I64Popcnt", asm: "I64Popcnt", argLength: 1, reg: gp11, typ: "Int64"}, // popcnt(arg0) + } + + archs = append(archs, arch{ + name: "Wasm", + pkg: "cmd/internal/obj/wasm", + genfile: "../../wasm/ssa.go", + ops: WasmOps, + blocks: nil, + regnames: regNamesWasm, + gpregmask: gp, + fpregmask: fp32 | fp64, + fp32regmask: fp32, + fp64regmask: fp64, + framepointerreg: -1, // not used + linkreg: -1, // not used + }) +} diff --git a/src/cmd/compile/internal/ssa/_gen/allocators.go b/src/cmd/compile/internal/ssa/_gen/allocators.go new file mode 100644 index 0000000..0f3968c --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/allocators.go @@ -0,0 +1,198 @@ +package main + +// TODO: should we share backing storage for similarly-shaped types? +// e.g. []*Value and []*Block, or even []int32 and []bool. + +import ( + "bytes" + "fmt" + "go/format" + "io" + "log" + "os" +) + +type allocator struct { + name string // name for alloc/free functions + typ string // the type they return/accept + mak string // code to make a new object (takes power-of-2 size as fmt arg) + capacity string // code to calculate the capacity of an object. Should always report a power of 2. + resize string // code to shrink to sub-power-of-two size (takes size as fmt arg) + clear string // code for clearing object before putting it on the free list + minLog int // log_2 of minimum allocation size + maxLog int // log_2 of maximum allocation size +} + +func genAllocators() { + allocators := []allocator{ + { + name: "ValueSlice", + typ: "[]*Value", + capacity: "cap(%s)", + mak: "make([]*Value, %s)", + resize: "%s[:%s]", + clear: "for i := range %[1]s {\n%[1]s[i] = nil\n}", + minLog: 5, + maxLog: 32, + }, + { + name: "BlockSlice", + typ: "[]*Block", + capacity: "cap(%s)", + mak: "make([]*Block, %s)", + resize: "%s[:%s]", + clear: "for i := range %[1]s {\n%[1]s[i] = nil\n}", + minLog: 5, + maxLog: 32, + }, + { + name: "BoolSlice", + typ: "[]bool", + capacity: "cap(%s)", + mak: "make([]bool, %s)", + resize: "%s[:%s]", + clear: "for i := range %[1]s {\n%[1]s[i] = false\n}", + minLog: 8, + maxLog: 32, + }, + { + name: "IntSlice", + typ: "[]int", + capacity: "cap(%s)", + mak: "make([]int, %s)", + resize: "%s[:%s]", + clear: "for i := range %[1]s {\n%[1]s[i] = 0\n}", + minLog: 5, + maxLog: 32, + }, + { + name: "Int32Slice", + typ: "[]int32", + capacity: "cap(%s)", + mak: "make([]int32, %s)", + resize: "%s[:%s]", + clear: "for i := range %[1]s {\n%[1]s[i] = 0\n}", + minLog: 6, + maxLog: 32, + }, + { + name: "Int8Slice", + typ: "[]int8", + capacity: "cap(%s)", + mak: "make([]int8, %s)", + resize: "%s[:%s]", + clear: "for i := range %[1]s {\n%[1]s[i] = 0\n}", + minLog: 8, + maxLog: 32, + }, + { + name: "IDSlice", + typ: "[]ID", + capacity: "cap(%s)", + mak: "make([]ID, %s)", + resize: "%s[:%s]", + clear: "for i := range %[1]s {\n%[1]s[i] = 0\n}", + minLog: 6, + maxLog: 32, + }, + { + name: "SparseSet", + typ: "*sparseSet", + capacity: "%s.cap()", + mak: "newSparseSet(%s)", + resize: "", // larger-sized sparse sets are ok + clear: "%s.clear()", + minLog: 5, + maxLog: 32, + }, + { + name: "SparseMap", + typ: "*sparseMap", + capacity: "%s.cap()", + mak: "newSparseMap(%s)", + resize: "", // larger-sized sparse maps are ok + clear: "%s.clear()", + minLog: 5, + maxLog: 32, + }, + { + name: "SparseMapPos", + typ: "*sparseMapPos", + capacity: "%s.cap()", + mak: "newSparseMapPos(%s)", + resize: "", // larger-sized sparse maps are ok + clear: "%s.clear()", + minLog: 5, + maxLog: 32, + }, + } + + w := new(bytes.Buffer) + fmt.Fprintf(w, "// Code generated from _gen/allocators.go; DO NOT EDIT.\n") + fmt.Fprintln(w) + fmt.Fprintln(w, "package ssa") + + fmt.Fprintln(w, "import (") + fmt.Fprintln(w, "\"math/bits\"") + fmt.Fprintln(w, "\"sync\"") + fmt.Fprintln(w, ")") + for _, a := range allocators { + genAllocator(w, a) + } + // gofmt result + b := w.Bytes() + var err error + b, err = format.Source(b) + if err != nil { + fmt.Printf("%s\n", w.Bytes()) + panic(err) + } + + if err := os.WriteFile("../allocators.go", b, 0666); err != nil { + log.Fatalf("can't write output: %v\n", err) + } +} +func genAllocator(w io.Writer, a allocator) { + fmt.Fprintf(w, "var poolFree%s [%d]sync.Pool\n", a.name, a.maxLog-a.minLog) + fmt.Fprintf(w, "func (c *Cache) alloc%s(n int) %s {\n", a.name, a.typ) + fmt.Fprintf(w, "var s %s\n", a.typ) + fmt.Fprintf(w, "n2 := n\n") + fmt.Fprintf(w, "if n2 < %d { n2 = %d }\n", 1<<a.minLog, 1<<a.minLog) + fmt.Fprintf(w, "b := bits.Len(uint(n2-1))\n") + fmt.Fprintf(w, "v := poolFree%s[b-%d].Get()\n", a.name, a.minLog) + fmt.Fprintf(w, "if v == nil {\n") + fmt.Fprintf(w, " s = %s\n", fmt.Sprintf(a.mak, "1<<b")) + fmt.Fprintf(w, "} else {\n") + if a.typ[0] == '*' { + fmt.Fprintf(w, "s = v.(%s)\n", a.typ) + } else { + fmt.Fprintf(w, "sp := v.(*%s)\n", a.typ) + fmt.Fprintf(w, "s = *sp\n") + fmt.Fprintf(w, "*sp = nil\n") + fmt.Fprintf(w, "c.hdr%s = append(c.hdr%s, sp)\n", a.name, a.name) + } + fmt.Fprintf(w, "}\n") + if a.resize != "" { + fmt.Fprintf(w, "s = %s\n", fmt.Sprintf(a.resize, "s", "n")) + } + fmt.Fprintf(w, "return s\n") + fmt.Fprintf(w, "}\n") + fmt.Fprintf(w, "func (c *Cache) free%s(s %s) {\n", a.name, a.typ) + fmt.Fprintf(w, "%s\n", fmt.Sprintf(a.clear, "s")) + fmt.Fprintf(w, "b := bits.Len(uint(%s) - 1)\n", fmt.Sprintf(a.capacity, "s")) + if a.typ[0] == '*' { + fmt.Fprintf(w, "poolFree%s[b-%d].Put(s)\n", a.name, a.minLog) + } else { + fmt.Fprintf(w, "var sp *%s\n", a.typ) + fmt.Fprintf(w, "if len(c.hdr%s) == 0 {\n", a.name) + fmt.Fprintf(w, " sp = new(%s)\n", a.typ) + fmt.Fprintf(w, "} else {\n") + fmt.Fprintf(w, " sp = c.hdr%s[len(c.hdr%s)-1]\n", a.name, a.name) + fmt.Fprintf(w, " c.hdr%s[len(c.hdr%s)-1] = nil\n", a.name, a.name) + fmt.Fprintf(w, " c.hdr%s = c.hdr%s[:len(c.hdr%s)-1]\n", a.name, a.name, a.name) + fmt.Fprintf(w, "}\n") + fmt.Fprintf(w, "*sp = s\n") + fmt.Fprintf(w, "poolFree%s[b-%d].Put(sp)\n", a.name, a.minLog) + } + fmt.Fprintf(w, "}\n") +} diff --git a/src/cmd/compile/internal/ssa/_gen/cover.bash b/src/cmd/compile/internal/ssa/_gen/cover.bash new file mode 100755 index 0000000..7311cfb --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/cover.bash @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +# Copyright 2020 The Go Authors. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +# A quick and dirty way to obtain code coverage from rulegen's main func. For +# example: +# +# ./cover.bash && go tool cover -html=cover.out +# +# This script is needed to set up a temporary test file, so that we don't break +# regular 'go run .' usage to run the generator. + +cat >main_test.go <<-EOF + // +build ignore + + package main + + import "testing" + + func TestCoverage(t *testing.T) { main() } +EOF + +go test -run='^TestCoverage$' -coverprofile=cover.out "$@" *.go + +rm -f main_test.go diff --git a/src/cmd/compile/internal/ssa/_gen/dec.rules b/src/cmd/compile/internal/ssa/_gen/dec.rules new file mode 100644 index 0000000..b194898 --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/dec.rules @@ -0,0 +1,93 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file contains rules to decompose builtin compound types +// (complex,string,slice,interface) into their constituent +// types. These rules work together with the decomposeBuiltIn +// pass which handles phis of these types. + +// complex ops +(ComplexReal (ComplexMake real _ )) => real +(ComplexImag (ComplexMake _ imag )) => imag + +(Load <t> ptr mem) && t.IsComplex() && t.Size() == 8 => + (ComplexMake + (Load <typ.Float32> ptr mem) + (Load <typ.Float32> + (OffPtr <typ.Float32Ptr> [4] ptr) + mem) + ) +(Store {t} dst (ComplexMake real imag) mem) && t.Size() == 8 => + (Store {typ.Float32} + (OffPtr <typ.Float32Ptr> [4] dst) + imag + (Store {typ.Float32} dst real mem)) +(Load <t> ptr mem) && t.IsComplex() && t.Size() == 16 => + (ComplexMake + (Load <typ.Float64> ptr mem) + (Load <typ.Float64> + (OffPtr <typ.Float64Ptr> [8] ptr) + mem) + ) +(Store {t} dst (ComplexMake real imag) mem) && t.Size() == 16 => + (Store {typ.Float64} + (OffPtr <typ.Float64Ptr> [8] dst) + imag + (Store {typ.Float64} dst real mem)) + +// string ops +(StringPtr (StringMake ptr _)) => ptr +(StringLen (StringMake _ len)) => len + +(Load <t> ptr mem) && t.IsString() => + (StringMake + (Load <typ.BytePtr> ptr mem) + (Load <typ.Int> + (OffPtr <typ.IntPtr> [config.PtrSize] ptr) + mem)) +(Store dst (StringMake ptr len) mem) => + (Store {typ.Int} + (OffPtr <typ.IntPtr> [config.PtrSize] dst) + len + (Store {typ.BytePtr} dst ptr mem)) + +// slice ops +(SlicePtr (SliceMake ptr _ _ )) => ptr +(SliceLen (SliceMake _ len _)) => len +(SliceCap (SliceMake _ _ cap)) => cap +(SlicePtrUnchecked (SliceMake ptr _ _ )) => ptr + +(Load <t> ptr mem) && t.IsSlice() => + (SliceMake + (Load <t.Elem().PtrTo()> ptr mem) + (Load <typ.Int> + (OffPtr <typ.IntPtr> [config.PtrSize] ptr) + mem) + (Load <typ.Int> + (OffPtr <typ.IntPtr> [2*config.PtrSize] ptr) + mem)) +(Store {t} dst (SliceMake ptr len cap) mem) => + (Store {typ.Int} + (OffPtr <typ.IntPtr> [2*config.PtrSize] dst) + cap + (Store {typ.Int} + (OffPtr <typ.IntPtr> [config.PtrSize] dst) + len + (Store {t.Elem().PtrTo()} dst ptr mem))) + +// interface ops +(ITab (IMake itab _)) => itab +(IData (IMake _ data)) => data + +(Load <t> ptr mem) && t.IsInterface() => + (IMake + (Load <typ.Uintptr> ptr mem) + (Load <typ.BytePtr> + (OffPtr <typ.BytePtrPtr> [config.PtrSize] ptr) + mem)) +(Store dst (IMake itab data) mem) => + (Store {typ.BytePtr} + (OffPtr <typ.BytePtrPtr> [config.PtrSize] dst) + data + (Store {typ.Uintptr} dst itab mem)) diff --git a/src/cmd/compile/internal/ssa/_gen/dec64.rules b/src/cmd/compile/internal/ssa/_gen/dec64.rules new file mode 100644 index 0000000..ba776af --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/dec64.rules @@ -0,0 +1,401 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file contains rules to decompose [u]int64 types on 32-bit +// architectures. These rules work together with the decomposeBuiltIn +// pass which handles phis of these typ. + +(Int64Hi (Int64Make hi _)) => hi +(Int64Lo (Int64Make _ lo)) => lo + +(Load <t> ptr mem) && is64BitInt(t) && !config.BigEndian && t.IsSigned() => + (Int64Make + (Load <typ.Int32> (OffPtr <typ.Int32Ptr> [4] ptr) mem) + (Load <typ.UInt32> ptr mem)) + +(Load <t> ptr mem) && is64BitInt(t) && !config.BigEndian && !t.IsSigned() => + (Int64Make + (Load <typ.UInt32> (OffPtr <typ.UInt32Ptr> [4] ptr) mem) + (Load <typ.UInt32> ptr mem)) + +(Load <t> ptr mem) && is64BitInt(t) && config.BigEndian && t.IsSigned() => + (Int64Make + (Load <typ.Int32> ptr mem) + (Load <typ.UInt32> (OffPtr <typ.UInt32Ptr> [4] ptr) mem)) + +(Load <t> ptr mem) && is64BitInt(t) && config.BigEndian && !t.IsSigned() => + (Int64Make + (Load <typ.UInt32> ptr mem) + (Load <typ.UInt32> (OffPtr <typ.UInt32Ptr> [4] ptr) mem)) + +(Store {t} dst (Int64Make hi lo) mem) && t.Size() == 8 && !config.BigEndian => + (Store {hi.Type} + (OffPtr <hi.Type.PtrTo()> [4] dst) + hi + (Store {lo.Type} dst lo mem)) + +(Store {t} dst (Int64Make hi lo) mem) && t.Size() == 8 && config.BigEndian => + (Store {lo.Type} + (OffPtr <lo.Type.PtrTo()> [4] dst) + lo + (Store {hi.Type} dst hi mem)) + +// These are not enabled during decomposeBuiltin if late call expansion, but they are always enabled for softFloat +(Arg {n} [off]) && is64BitInt(v.Type) && !config.BigEndian && v.Type.IsSigned() && !(b.Func.pass.name == "decompose builtin") => + (Int64Make + (Arg <typ.Int32> {n} [off+4]) + (Arg <typ.UInt32> {n} [off])) +(Arg {n} [off]) && is64BitInt(v.Type) && !config.BigEndian && !v.Type.IsSigned() && !(b.Func.pass.name == "decompose builtin") => + (Int64Make + (Arg <typ.UInt32> {n} [off+4]) + (Arg <typ.UInt32> {n} [off])) + +(Arg {n} [off]) && is64BitInt(v.Type) && config.BigEndian && v.Type.IsSigned() && !(b.Func.pass.name == "decompose builtin") => + (Int64Make + (Arg <typ.Int32> {n} [off]) + (Arg <typ.UInt32> {n} [off+4])) +(Arg {n} [off]) && is64BitInt(v.Type) && config.BigEndian && !v.Type.IsSigned() && !(b.Func.pass.name == "decompose builtin") => + (Int64Make + (Arg <typ.UInt32> {n} [off]) + (Arg <typ.UInt32> {n} [off+4])) + +(Add64 x y) => + (Int64Make + (Add32withcarry <typ.Int32> + (Int64Hi x) + (Int64Hi y) + (Select1 <types.TypeFlags> (Add32carry (Int64Lo x) (Int64Lo y)))) + (Select0 <typ.UInt32> (Add32carry (Int64Lo x) (Int64Lo y)))) + +(Sub64 x y) => + (Int64Make + (Sub32withcarry <typ.Int32> + (Int64Hi x) + (Int64Hi y) + (Select1 <types.TypeFlags> (Sub32carry (Int64Lo x) (Int64Lo y)))) + (Select0 <typ.UInt32> (Sub32carry (Int64Lo x) (Int64Lo y)))) + +(Mul64 x y) => + (Int64Make + (Add32 <typ.UInt32> + (Mul32 <typ.UInt32> (Int64Lo x) (Int64Hi y)) + (Add32 <typ.UInt32> + (Mul32 <typ.UInt32> (Int64Hi x) (Int64Lo y)) + (Select0 <typ.UInt32> (Mul32uhilo (Int64Lo x) (Int64Lo y))))) + (Select1 <typ.UInt32> (Mul32uhilo (Int64Lo x) (Int64Lo y)))) + +(And64 x y) => + (Int64Make + (And32 <typ.UInt32> (Int64Hi x) (Int64Hi y)) + (And32 <typ.UInt32> (Int64Lo x) (Int64Lo y))) + +(Or64 x y) => + (Int64Make + (Or32 <typ.UInt32> (Int64Hi x) (Int64Hi y)) + (Or32 <typ.UInt32> (Int64Lo x) (Int64Lo y))) + +(Xor64 x y) => + (Int64Make + (Xor32 <typ.UInt32> (Int64Hi x) (Int64Hi y)) + (Xor32 <typ.UInt32> (Int64Lo x) (Int64Lo y))) + +(Neg64 <t> x) => (Sub64 (Const64 <t> [0]) x) + +(Com64 x) => + (Int64Make + (Com32 <typ.UInt32> (Int64Hi x)) + (Com32 <typ.UInt32> (Int64Lo x))) + +// Sadly, just because we know that x is non-zero, +// we don't know whether either component is, +// so just treat Ctz64NonZero the same as Ctz64. +(Ctz64NonZero ...) => (Ctz64 ...) + +(Ctz64 x) => + (Add32 <typ.UInt32> + (Ctz32 <typ.UInt32> (Int64Lo x)) + (And32 <typ.UInt32> + (Com32 <typ.UInt32> (Zeromask (Int64Lo x))) + (Ctz32 <typ.UInt32> (Int64Hi x)))) + +(BitLen64 x) => + (Add32 <typ.Int> + (BitLen32 <typ.Int> (Int64Hi x)) + (BitLen32 <typ.Int> + (Or32 <typ.UInt32> + (Int64Lo x) + (Zeromask (Int64Hi x))))) + +(Bswap64 x) => + (Int64Make + (Bswap32 <typ.UInt32> (Int64Lo x)) + (Bswap32 <typ.UInt32> (Int64Hi x))) + +(SignExt32to64 x) => (Int64Make (Signmask x) x) +(SignExt16to64 x) => (SignExt32to64 (SignExt16to32 x)) +(SignExt8to64 x) => (SignExt32to64 (SignExt8to32 x)) + +(ZeroExt32to64 x) => (Int64Make (Const32 <typ.UInt32> [0]) x) +(ZeroExt16to64 x) => (ZeroExt32to64 (ZeroExt16to32 x)) +(ZeroExt8to64 x) => (ZeroExt32to64 (ZeroExt8to32 x)) + +(Trunc64to32 (Int64Make _ lo)) => lo +(Trunc64to16 (Int64Make _ lo)) => (Trunc32to16 lo) +(Trunc64to8 (Int64Make _ lo)) => (Trunc32to8 lo) +// Most general +(Trunc64to32 x) => (Int64Lo x) +(Trunc64to16 x) => (Trunc32to16 (Int64Lo x)) +(Trunc64to8 x) => (Trunc32to8 (Int64Lo x)) + +(Lsh32x64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const32 [0]) +(Rsh32x64 x (Int64Make (Const32 [c]) _)) && c != 0 => (Signmask x) +(Rsh32Ux64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const32 [0]) +(Lsh16x64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const32 [0]) +(Rsh16x64 x (Int64Make (Const32 [c]) _)) && c != 0 => (Signmask (SignExt16to32 x)) +(Rsh16Ux64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const32 [0]) +(Lsh8x64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const32 [0]) +(Rsh8x64 x (Int64Make (Const32 [c]) _)) && c != 0 => (Signmask (SignExt8to32 x)) +(Rsh8Ux64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const32 [0]) + +(Lsh32x64 [c] x (Int64Make (Const32 [0]) lo)) => (Lsh32x32 [c] x lo) +(Rsh32x64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh32x32 [c] x lo) +(Rsh32Ux64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh32Ux32 [c] x lo) +(Lsh16x64 [c] x (Int64Make (Const32 [0]) lo)) => (Lsh16x32 [c] x lo) +(Rsh16x64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh16x32 [c] x lo) +(Rsh16Ux64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh16Ux32 [c] x lo) +(Lsh8x64 [c] x (Int64Make (Const32 [0]) lo)) => (Lsh8x32 [c] x lo) +(Rsh8x64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh8x32 [c] x lo) +(Rsh8Ux64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh8Ux32 [c] x lo) + +(Lsh64x64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const64 [0]) +(Rsh64x64 x (Int64Make (Const32 [c]) _)) && c != 0 => (Int64Make (Signmask (Int64Hi x)) (Signmask (Int64Hi x))) +(Rsh64Ux64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const64 [0]) + +(Lsh64x64 [c] x (Int64Make (Const32 [0]) lo)) => (Lsh64x32 [c] x lo) +(Rsh64x64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh64x32 [c] x lo) +(Rsh64Ux64 [c] x (Int64Make (Const32 [0]) lo)) => (Rsh64Ux32 [c] x lo) + +// turn x64 non-constant shifts to x32 shifts +// if high 32-bit of the shift is nonzero, make a huge shift +(Lsh64x64 x (Int64Make hi lo)) && hi.Op != OpConst32 => + (Lsh64x32 x (Or32 <typ.UInt32> (Zeromask hi) lo)) +(Rsh64x64 x (Int64Make hi lo)) && hi.Op != OpConst32 => + (Rsh64x32 x (Or32 <typ.UInt32> (Zeromask hi) lo)) +(Rsh64Ux64 x (Int64Make hi lo)) && hi.Op != OpConst32 => + (Rsh64Ux32 x (Or32 <typ.UInt32> (Zeromask hi) lo)) +(Lsh32x64 x (Int64Make hi lo)) && hi.Op != OpConst32 => + (Lsh32x32 x (Or32 <typ.UInt32> (Zeromask hi) lo)) +(Rsh32x64 x (Int64Make hi lo)) && hi.Op != OpConst32 => + (Rsh32x32 x (Or32 <typ.UInt32> (Zeromask hi) lo)) +(Rsh32Ux64 x (Int64Make hi lo)) && hi.Op != OpConst32 => + (Rsh32Ux32 x (Or32 <typ.UInt32> (Zeromask hi) lo)) +(Lsh16x64 x (Int64Make hi lo)) && hi.Op != OpConst32 => + (Lsh16x32 x (Or32 <typ.UInt32> (Zeromask hi) lo)) +(Rsh16x64 x (Int64Make hi lo)) && hi.Op != OpConst32 => + (Rsh16x32 x (Or32 <typ.UInt32> (Zeromask hi) lo)) +(Rsh16Ux64 x (Int64Make hi lo)) && hi.Op != OpConst32 => + (Rsh16Ux32 x (Or32 <typ.UInt32> (Zeromask hi) lo)) +(Lsh8x64 x (Int64Make hi lo)) && hi.Op != OpConst32 => + (Lsh8x32 x (Or32 <typ.UInt32> (Zeromask hi) lo)) +(Rsh8x64 x (Int64Make hi lo)) && hi.Op != OpConst32 => + (Rsh8x32 x (Or32 <typ.UInt32> (Zeromask hi) lo)) +(Rsh8Ux64 x (Int64Make hi lo)) && hi.Op != OpConst32 => + (Rsh8Ux32 x (Or32 <typ.UInt32> (Zeromask hi) lo)) + +// Most general +(Lsh64x64 x y) => (Lsh64x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y))) +(Rsh64x64 x y) => (Rsh64x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y))) +(Rsh64Ux64 x y) => (Rsh64Ux32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y))) +(Lsh32x64 x y) => (Lsh32x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y))) +(Rsh32x64 x y) => (Rsh32x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y))) +(Rsh32Ux64 x y) => (Rsh32Ux32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y))) +(Lsh16x64 x y) => (Lsh16x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y))) +(Rsh16x64 x y) => (Rsh16x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y))) +(Rsh16Ux64 x y) => (Rsh16Ux32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y))) +(Lsh8x64 x y) => (Lsh8x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y))) +(Rsh8x64 x y) => (Rsh8x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y))) +(Rsh8Ux64 x y) => (Rsh8Ux32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y))) + +(RotateLeft64 x (Int64Make hi lo)) => (RotateLeft64 x lo) +(RotateLeft32 x (Int64Make hi lo)) => (RotateLeft32 x lo) +(RotateLeft16 x (Int64Make hi lo)) => (RotateLeft16 x lo) +(RotateLeft8 x (Int64Make hi lo)) => (RotateLeft8 x lo) + +// Clean up constants a little +(Or32 <typ.UInt32> (Zeromask (Const32 [c])) y) && c == 0 => y +(Or32 <typ.UInt32> (Zeromask (Const32 [c])) y) && c != 0 => (Const32 <typ.UInt32> [-1]) + +// 64x left shift +// result.hi = hi<<s | lo>>(32-s) | lo<<(s-32) // >> is unsigned, large shifts result 0 +// result.lo = lo<<s +(Lsh64x32 x s) => + (Int64Make + (Or32 <typ.UInt32> + (Or32 <typ.UInt32> + (Lsh32x32 <typ.UInt32> (Int64Hi x) s) + (Rsh32Ux32 <typ.UInt32> + (Int64Lo x) + (Sub32 <typ.UInt32> (Const32 <typ.UInt32> [32]) s))) + (Lsh32x32 <typ.UInt32> + (Int64Lo x) + (Sub32 <typ.UInt32> s (Const32 <typ.UInt32> [32])))) + (Lsh32x32 <typ.UInt32> (Int64Lo x) s)) +(Lsh64x16 x s) => + (Int64Make + (Or32 <typ.UInt32> + (Or32 <typ.UInt32> + (Lsh32x16 <typ.UInt32> (Int64Hi x) s) + (Rsh32Ux16 <typ.UInt32> + (Int64Lo x) + (Sub16 <typ.UInt16> (Const16 <typ.UInt16> [32]) s))) + (Lsh32x16 <typ.UInt32> + (Int64Lo x) + (Sub16 <typ.UInt16> s (Const16 <typ.UInt16> [32])))) + (Lsh32x16 <typ.UInt32> (Int64Lo x) s)) +(Lsh64x8 x s) => + (Int64Make + (Or32 <typ.UInt32> + (Or32 <typ.UInt32> + (Lsh32x8 <typ.UInt32> (Int64Hi x) s) + (Rsh32Ux8 <typ.UInt32> + (Int64Lo x) + (Sub8 <typ.UInt8> (Const8 <typ.UInt8> [32]) s))) + (Lsh32x8 <typ.UInt32> + (Int64Lo x) + (Sub8 <typ.UInt8> s (Const8 <typ.UInt8> [32])))) + (Lsh32x8 <typ.UInt32> (Int64Lo x) s)) + +// 64x unsigned right shift +// result.hi = hi>>s +// result.lo = lo>>s | hi<<(32-s) | hi>>(s-32) // >> is unsigned, large shifts result 0 +(Rsh64Ux32 x s) => + (Int64Make + (Rsh32Ux32 <typ.UInt32> (Int64Hi x) s) + (Or32 <typ.UInt32> + (Or32 <typ.UInt32> + (Rsh32Ux32 <typ.UInt32> (Int64Lo x) s) + (Lsh32x32 <typ.UInt32> + (Int64Hi x) + (Sub32 <typ.UInt32> (Const32 <typ.UInt32> [32]) s))) + (Rsh32Ux32 <typ.UInt32> + (Int64Hi x) + (Sub32 <typ.UInt32> s (Const32 <typ.UInt32> [32]))))) +(Rsh64Ux16 x s) => + (Int64Make + (Rsh32Ux16 <typ.UInt32> (Int64Hi x) s) + (Or32 <typ.UInt32> + (Or32 <typ.UInt32> + (Rsh32Ux16 <typ.UInt32> (Int64Lo x) s) + (Lsh32x16 <typ.UInt32> + (Int64Hi x) + (Sub16 <typ.UInt16> (Const16 <typ.UInt16> [32]) s))) + (Rsh32Ux16 <typ.UInt32> + (Int64Hi x) + (Sub16 <typ.UInt16> s (Const16 <typ.UInt16> [32]))))) +(Rsh64Ux8 x s) => + (Int64Make + (Rsh32Ux8 <typ.UInt32> (Int64Hi x) s) + (Or32 <typ.UInt32> + (Or32 <typ.UInt32> + (Rsh32Ux8 <typ.UInt32> (Int64Lo x) s) + (Lsh32x8 <typ.UInt32> + (Int64Hi x) + (Sub8 <typ.UInt8> (Const8 <typ.UInt8> [32]) s))) + (Rsh32Ux8 <typ.UInt32> + (Int64Hi x) + (Sub8 <typ.UInt8> s (Const8 <typ.UInt8> [32]))))) + +// 64x signed right shift +// result.hi = hi>>s +// result.lo = lo>>s | hi<<(32-s) | (hi>>(s-32))&zeromask(s>>5) // hi>>(s-32) is signed, large shifts result 0/-1 +(Rsh64x32 x s) => + (Int64Make + (Rsh32x32 <typ.UInt32> (Int64Hi x) s) + (Or32 <typ.UInt32> + (Or32 <typ.UInt32> + (Rsh32Ux32 <typ.UInt32> (Int64Lo x) s) + (Lsh32x32 <typ.UInt32> + (Int64Hi x) + (Sub32 <typ.UInt32> (Const32 <typ.UInt32> [32]) s))) + (And32 <typ.UInt32> + (Rsh32x32 <typ.UInt32> + (Int64Hi x) + (Sub32 <typ.UInt32> s (Const32 <typ.UInt32> [32]))) + (Zeromask + (Rsh32Ux32 <typ.UInt32> s (Const32 <typ.UInt32> [5])))))) +(Rsh64x16 x s) => + (Int64Make + (Rsh32x16 <typ.UInt32> (Int64Hi x) s) + (Or32 <typ.UInt32> + (Or32 <typ.UInt32> + (Rsh32Ux16 <typ.UInt32> (Int64Lo x) s) + (Lsh32x16 <typ.UInt32> + (Int64Hi x) + (Sub16 <typ.UInt16> (Const16 <typ.UInt16> [32]) s))) + (And32 <typ.UInt32> + (Rsh32x16 <typ.UInt32> + (Int64Hi x) + (Sub16 <typ.UInt16> s (Const16 <typ.UInt16> [32]))) + (Zeromask + (ZeroExt16to32 + (Rsh16Ux32 <typ.UInt16> s (Const32 <typ.UInt32> [5]))))))) +(Rsh64x8 x s) => + (Int64Make + (Rsh32x8 <typ.UInt32> (Int64Hi x) s) + (Or32 <typ.UInt32> + (Or32 <typ.UInt32> + (Rsh32Ux8 <typ.UInt32> (Int64Lo x) s) + (Lsh32x8 <typ.UInt32> + (Int64Hi x) + (Sub8 <typ.UInt8> (Const8 <typ.UInt8> [32]) s))) + (And32 <typ.UInt32> + (Rsh32x8 <typ.UInt32> + (Int64Hi x) + (Sub8 <typ.UInt8> s (Const8 <typ.UInt8> [32]))) + (Zeromask + (ZeroExt8to32 + (Rsh8Ux32 <typ.UInt8> s (Const32 <typ.UInt32> [5]))))))) + +(Const64 <t> [c]) && t.IsSigned() => + (Int64Make (Const32 <typ.Int32> [int32(c>>32)]) (Const32 <typ.UInt32> [int32(c)])) +(Const64 <t> [c]) && !t.IsSigned() => + (Int64Make (Const32 <typ.UInt32> [int32(c>>32)]) (Const32 <typ.UInt32> [int32(c)])) + +(Eq64 x y) => + (AndB + (Eq32 (Int64Hi x) (Int64Hi y)) + (Eq32 (Int64Lo x) (Int64Lo y))) + +(Neq64 x y) => + (OrB + (Neq32 (Int64Hi x) (Int64Hi y)) + (Neq32 (Int64Lo x) (Int64Lo y))) + +(Less64U x y) => + (OrB + (Less32U (Int64Hi x) (Int64Hi y)) + (AndB + (Eq32 (Int64Hi x) (Int64Hi y)) + (Less32U (Int64Lo x) (Int64Lo y)))) + +(Leq64U x y) => + (OrB + (Less32U (Int64Hi x) (Int64Hi y)) + (AndB + (Eq32 (Int64Hi x) (Int64Hi y)) + (Leq32U (Int64Lo x) (Int64Lo y)))) + +(Less64 x y) => + (OrB + (Less32 (Int64Hi x) (Int64Hi y)) + (AndB + (Eq32 (Int64Hi x) (Int64Hi y)) + (Less32U (Int64Lo x) (Int64Lo y)))) + +(Leq64 x y) => + (OrB + (Less32 (Int64Hi x) (Int64Hi y)) + (AndB + (Eq32 (Int64Hi x) (Int64Hi y)) + (Leq32U (Int64Lo x) (Int64Lo y)))) diff --git a/src/cmd/compile/internal/ssa/_gen/dec64Ops.go b/src/cmd/compile/internal/ssa/_gen/dec64Ops.go new file mode 100644 index 0000000..bba218e --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/dec64Ops.go @@ -0,0 +1,18 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +var dec64Ops = []opData{} + +var dec64Blocks = []blockData{} + +func init() { + archs = append(archs, arch{ + name: "dec64", + ops: dec64Ops, + blocks: dec64Blocks, + generic: true, + }) +} diff --git a/src/cmd/compile/internal/ssa/_gen/decOps.go b/src/cmd/compile/internal/ssa/_gen/decOps.go new file mode 100644 index 0000000..0cc11cb --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/decOps.go @@ -0,0 +1,18 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +var decOps = []opData{} + +var decBlocks = []blockData{} + +func init() { + archs = append(archs, arch{ + name: "dec", + ops: decOps, + blocks: decBlocks, + generic: true, + }) +} diff --git a/src/cmd/compile/internal/ssa/_gen/generic.rules b/src/cmd/compile/internal/ssa/_gen/generic.rules new file mode 100644 index 0000000..0406fbb --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/generic.rules @@ -0,0 +1,2672 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Simplifications that apply to all backend architectures. As an example, this +// Go source code +// +// y := 0 * x +// +// can be translated into y := 0 without losing any information, which saves a +// pointless multiplication instruction. Other .rules files in this directory +// (for example AMD64.rules) contain rules specific to the architecture in the +// filename. The rules here apply to every architecture. +// +// The code for parsing this file lives in rulegen.go; this file generates +// ssa/rewritegeneric.go. + +// values are specified using the following format: +// (op <type> [auxint] {aux} arg0 arg1 ...) +// the type, aux, and auxint fields are optional +// on the matching side +// - the type, aux, and auxint fields must match if they are specified. +// - the first occurrence of a variable defines that variable. Subsequent +// uses must match (be == to) the first use. +// - v is defined to be the value matched. +// - an additional conditional can be provided after the match pattern with "&&". +// on the generated side +// - the type of the top-level expression is the same as the one on the left-hand side. +// - the type of any subexpressions must be specified explicitly (or +// be specified in the op's type field). +// - auxint will be 0 if not specified. +// - aux will be nil if not specified. + +// blocks are specified using the following format: +// (kind controlvalue succ0 succ1 ...) +// controlvalue must be "nil" or a value expression +// succ* fields must be variables +// For now, the generated successors must be a permutation of the matched successors. + +// constant folding +(Trunc16to8 (Const16 [c])) => (Const8 [int8(c)]) +(Trunc32to8 (Const32 [c])) => (Const8 [int8(c)]) +(Trunc32to16 (Const32 [c])) => (Const16 [int16(c)]) +(Trunc64to8 (Const64 [c])) => (Const8 [int8(c)]) +(Trunc64to16 (Const64 [c])) => (Const16 [int16(c)]) +(Trunc64to32 (Const64 [c])) => (Const32 [int32(c)]) +(Cvt64Fto32F (Const64F [c])) => (Const32F [float32(c)]) +(Cvt32Fto64F (Const32F [c])) => (Const64F [float64(c)]) +(Cvt32to32F (Const32 [c])) => (Const32F [float32(c)]) +(Cvt32to64F (Const32 [c])) => (Const64F [float64(c)]) +(Cvt64to32F (Const64 [c])) => (Const32F [float32(c)]) +(Cvt64to64F (Const64 [c])) => (Const64F [float64(c)]) +(Cvt32Fto32 (Const32F [c])) => (Const32 [int32(c)]) +(Cvt32Fto64 (Const32F [c])) => (Const64 [int64(c)]) +(Cvt64Fto32 (Const64F [c])) => (Const32 [int32(c)]) +(Cvt64Fto64 (Const64F [c])) => (Const64 [int64(c)]) +(Round32F x:(Const32F)) => x +(Round64F x:(Const64F)) => x +(CvtBoolToUint8 (ConstBool [false])) => (Const8 [0]) +(CvtBoolToUint8 (ConstBool [true])) => (Const8 [1]) + +(Trunc16to8 (ZeroExt8to16 x)) => x +(Trunc32to8 (ZeroExt8to32 x)) => x +(Trunc32to16 (ZeroExt8to32 x)) => (ZeroExt8to16 x) +(Trunc32to16 (ZeroExt16to32 x)) => x +(Trunc64to8 (ZeroExt8to64 x)) => x +(Trunc64to16 (ZeroExt8to64 x)) => (ZeroExt8to16 x) +(Trunc64to16 (ZeroExt16to64 x)) => x +(Trunc64to32 (ZeroExt8to64 x)) => (ZeroExt8to32 x) +(Trunc64to32 (ZeroExt16to64 x)) => (ZeroExt16to32 x) +(Trunc64to32 (ZeroExt32to64 x)) => x +(Trunc16to8 (SignExt8to16 x)) => x +(Trunc32to8 (SignExt8to32 x)) => x +(Trunc32to16 (SignExt8to32 x)) => (SignExt8to16 x) +(Trunc32to16 (SignExt16to32 x)) => x +(Trunc64to8 (SignExt8to64 x)) => x +(Trunc64to16 (SignExt8to64 x)) => (SignExt8to16 x) +(Trunc64to16 (SignExt16to64 x)) => x +(Trunc64to32 (SignExt8to64 x)) => (SignExt8to32 x) +(Trunc64to32 (SignExt16to64 x)) => (SignExt16to32 x) +(Trunc64to32 (SignExt32to64 x)) => x + +(ZeroExt8to16 (Const8 [c])) => (Const16 [int16( uint8(c))]) +(ZeroExt8to32 (Const8 [c])) => (Const32 [int32( uint8(c))]) +(ZeroExt8to64 (Const8 [c])) => (Const64 [int64( uint8(c))]) +(ZeroExt16to32 (Const16 [c])) => (Const32 [int32(uint16(c))]) +(ZeroExt16to64 (Const16 [c])) => (Const64 [int64(uint16(c))]) +(ZeroExt32to64 (Const32 [c])) => (Const64 [int64(uint32(c))]) +(SignExt8to16 (Const8 [c])) => (Const16 [int16(c)]) +(SignExt8to32 (Const8 [c])) => (Const32 [int32(c)]) +(SignExt8to64 (Const8 [c])) => (Const64 [int64(c)]) +(SignExt16to32 (Const16 [c])) => (Const32 [int32(c)]) +(SignExt16to64 (Const16 [c])) => (Const64 [int64(c)]) +(SignExt32to64 (Const32 [c])) => (Const64 [int64(c)]) + +(Neg8 (Const8 [c])) => (Const8 [-c]) +(Neg16 (Const16 [c])) => (Const16 [-c]) +(Neg32 (Const32 [c])) => (Const32 [-c]) +(Neg64 (Const64 [c])) => (Const64 [-c]) +(Neg32F (Const32F [c])) && c != 0 => (Const32F [-c]) +(Neg64F (Const64F [c])) && c != 0 => (Const64F [-c]) + +(Add8 (Const8 [c]) (Const8 [d])) => (Const8 [c+d]) +(Add16 (Const16 [c]) (Const16 [d])) => (Const16 [c+d]) +(Add32 (Const32 [c]) (Const32 [d])) => (Const32 [c+d]) +(Add64 (Const64 [c]) (Const64 [d])) => (Const64 [c+d]) +(Add32F (Const32F [c]) (Const32F [d])) && c+d == c+d => (Const32F [c+d]) +(Add64F (Const64F [c]) (Const64F [d])) && c+d == c+d => (Const64F [c+d]) +(AddPtr <t> x (Const64 [c])) => (OffPtr <t> x [c]) +(AddPtr <t> x (Const32 [c])) => (OffPtr <t> x [int64(c)]) + +(Sub8 (Const8 [c]) (Const8 [d])) => (Const8 [c-d]) +(Sub16 (Const16 [c]) (Const16 [d])) => (Const16 [c-d]) +(Sub32 (Const32 [c]) (Const32 [d])) => (Const32 [c-d]) +(Sub64 (Const64 [c]) (Const64 [d])) => (Const64 [c-d]) +(Sub32F (Const32F [c]) (Const32F [d])) && c-d == c-d => (Const32F [c-d]) +(Sub64F (Const64F [c]) (Const64F [d])) && c-d == c-d => (Const64F [c-d]) + +(Mul8 (Const8 [c]) (Const8 [d])) => (Const8 [c*d]) +(Mul16 (Const16 [c]) (Const16 [d])) => (Const16 [c*d]) +(Mul32 (Const32 [c]) (Const32 [d])) => (Const32 [c*d]) +(Mul64 (Const64 [c]) (Const64 [d])) => (Const64 [c*d]) +(Mul32F (Const32F [c]) (Const32F [d])) && c*d == c*d => (Const32F [c*d]) +(Mul64F (Const64F [c]) (Const64F [d])) && c*d == c*d => (Const64F [c*d]) + +(And8 (Const8 [c]) (Const8 [d])) => (Const8 [c&d]) +(And16 (Const16 [c]) (Const16 [d])) => (Const16 [c&d]) +(And32 (Const32 [c]) (Const32 [d])) => (Const32 [c&d]) +(And64 (Const64 [c]) (Const64 [d])) => (Const64 [c&d]) + +(Or8 (Const8 [c]) (Const8 [d])) => (Const8 [c|d]) +(Or16 (Const16 [c]) (Const16 [d])) => (Const16 [c|d]) +(Or32 (Const32 [c]) (Const32 [d])) => (Const32 [c|d]) +(Or64 (Const64 [c]) (Const64 [d])) => (Const64 [c|d]) + +(Xor8 (Const8 [c]) (Const8 [d])) => (Const8 [c^d]) +(Xor16 (Const16 [c]) (Const16 [d])) => (Const16 [c^d]) +(Xor32 (Const32 [c]) (Const32 [d])) => (Const32 [c^d]) +(Xor64 (Const64 [c]) (Const64 [d])) => (Const64 [c^d]) + +(Ctz64 (Const64 [c])) && config.PtrSize == 4 => (Const32 [int32(ntz64(c))]) +(Ctz32 (Const32 [c])) && config.PtrSize == 4 => (Const32 [int32(ntz32(c))]) +(Ctz16 (Const16 [c])) && config.PtrSize == 4 => (Const32 [int32(ntz16(c))]) +(Ctz8 (Const8 [c])) && config.PtrSize == 4 => (Const32 [int32(ntz8(c))]) + +(Ctz64 (Const64 [c])) && config.PtrSize == 8 => (Const64 [int64(ntz64(c))]) +(Ctz32 (Const32 [c])) && config.PtrSize == 8 => (Const64 [int64(ntz32(c))]) +(Ctz16 (Const16 [c])) && config.PtrSize == 8 => (Const64 [int64(ntz16(c))]) +(Ctz8 (Const8 [c])) && config.PtrSize == 8 => (Const64 [int64(ntz8(c))]) + +(Div8 (Const8 [c]) (Const8 [d])) && d != 0 => (Const8 [c/d]) +(Div16 (Const16 [c]) (Const16 [d])) && d != 0 => (Const16 [c/d]) +(Div32 (Const32 [c]) (Const32 [d])) && d != 0 => (Const32 [c/d]) +(Div64 (Const64 [c]) (Const64 [d])) && d != 0 => (Const64 [c/d]) +(Div8u (Const8 [c]) (Const8 [d])) && d != 0 => (Const8 [int8(uint8(c)/uint8(d))]) +(Div16u (Const16 [c]) (Const16 [d])) && d != 0 => (Const16 [int16(uint16(c)/uint16(d))]) +(Div32u (Const32 [c]) (Const32 [d])) && d != 0 => (Const32 [int32(uint32(c)/uint32(d))]) +(Div64u (Const64 [c]) (Const64 [d])) && d != 0 => (Const64 [int64(uint64(c)/uint64(d))]) +(Div32F (Const32F [c]) (Const32F [d])) && c/d == c/d => (Const32F [c/d]) +(Div64F (Const64F [c]) (Const64F [d])) && c/d == c/d => (Const64F [c/d]) +(Select0 (Div128u (Const64 [0]) lo y)) => (Div64u lo y) +(Select1 (Div128u (Const64 [0]) lo y)) => (Mod64u lo y) + +(Not (ConstBool [c])) => (ConstBool [!c]) + +(Floor (Const64F [c])) => (Const64F [math.Floor(c)]) +(Ceil (Const64F [c])) => (Const64F [math.Ceil(c)]) +(Trunc (Const64F [c])) => (Const64F [math.Trunc(c)]) +(RoundToEven (Const64F [c])) => (Const64F [math.RoundToEven(c)]) + +// Convert x * 1 to x. +(Mul(8|16|32|64) (Const(8|16|32|64) [1]) x) => x +(Select0 (Mul(32|64)uover (Const(32|64) [1]) x)) => x +(Select1 (Mul(32|64)uover (Const(32|64) [1]) x)) => (ConstBool [false]) + +// Convert x * -1 to -x. +(Mul(8|16|32|64) (Const(8|16|32|64) [-1]) x) => (Neg(8|16|32|64) x) + +// Convert multiplication by a power of two to a shift. +(Mul8 <t> n (Const8 [c])) && isPowerOfTwo8(c) => (Lsh8x64 <t> n (Const64 <typ.UInt64> [log8(c)])) +(Mul16 <t> n (Const16 [c])) && isPowerOfTwo16(c) => (Lsh16x64 <t> n (Const64 <typ.UInt64> [log16(c)])) +(Mul32 <t> n (Const32 [c])) && isPowerOfTwo32(c) => (Lsh32x64 <t> n (Const64 <typ.UInt64> [log32(c)])) +(Mul64 <t> n (Const64 [c])) && isPowerOfTwo64(c) => (Lsh64x64 <t> n (Const64 <typ.UInt64> [log64(c)])) +(Mul8 <t> n (Const8 [c])) && t.IsSigned() && isPowerOfTwo8(-c) => (Neg8 (Lsh8x64 <t> n (Const64 <typ.UInt64> [log8(-c)]))) +(Mul16 <t> n (Const16 [c])) && t.IsSigned() && isPowerOfTwo16(-c) => (Neg16 (Lsh16x64 <t> n (Const64 <typ.UInt64> [log16(-c)]))) +(Mul32 <t> n (Const32 [c])) && t.IsSigned() && isPowerOfTwo32(-c) => (Neg32 (Lsh32x64 <t> n (Const64 <typ.UInt64> [log32(-c)]))) +(Mul64 <t> n (Const64 [c])) && t.IsSigned() && isPowerOfTwo64(-c) => (Neg64 (Lsh64x64 <t> n (Const64 <typ.UInt64> [log64(-c)]))) + +(Mod8 (Const8 [c]) (Const8 [d])) && d != 0 => (Const8 [c % d]) +(Mod16 (Const16 [c]) (Const16 [d])) && d != 0 => (Const16 [c % d]) +(Mod32 (Const32 [c]) (Const32 [d])) && d != 0 => (Const32 [c % d]) +(Mod64 (Const64 [c]) (Const64 [d])) && d != 0 => (Const64 [c % d]) + +(Mod8u (Const8 [c]) (Const8 [d])) && d != 0 => (Const8 [int8(uint8(c) % uint8(d))]) +(Mod16u (Const16 [c]) (Const16 [d])) && d != 0 => (Const16 [int16(uint16(c) % uint16(d))]) +(Mod32u (Const32 [c]) (Const32 [d])) && d != 0 => (Const32 [int32(uint32(c) % uint32(d))]) +(Mod64u (Const64 [c]) (Const64 [d])) && d != 0 => (Const64 [int64(uint64(c) % uint64(d))]) + +(Lsh64x64 (Const64 [c]) (Const64 [d])) => (Const64 [c << uint64(d)]) +(Rsh64x64 (Const64 [c]) (Const64 [d])) => (Const64 [c >> uint64(d)]) +(Rsh64Ux64 (Const64 [c]) (Const64 [d])) => (Const64 [int64(uint64(c) >> uint64(d))]) +(Lsh32x64 (Const32 [c]) (Const64 [d])) => (Const32 [c << uint64(d)]) +(Rsh32x64 (Const32 [c]) (Const64 [d])) => (Const32 [c >> uint64(d)]) +(Rsh32Ux64 (Const32 [c]) (Const64 [d])) => (Const32 [int32(uint32(c) >> uint64(d))]) +(Lsh16x64 (Const16 [c]) (Const64 [d])) => (Const16 [c << uint64(d)]) +(Rsh16x64 (Const16 [c]) (Const64 [d])) => (Const16 [c >> uint64(d)]) +(Rsh16Ux64 (Const16 [c]) (Const64 [d])) => (Const16 [int16(uint16(c) >> uint64(d))]) +(Lsh8x64 (Const8 [c]) (Const64 [d])) => (Const8 [c << uint64(d)]) +(Rsh8x64 (Const8 [c]) (Const64 [d])) => (Const8 [c >> uint64(d)]) +(Rsh8Ux64 (Const8 [c]) (Const64 [d])) => (Const8 [int8(uint8(c) >> uint64(d))]) + +// Fold IsInBounds when the range of the index cannot exceed the limit. +(IsInBounds (ZeroExt8to32 _) (Const32 [c])) && (1 << 8) <= c => (ConstBool [true]) +(IsInBounds (ZeroExt8to64 _) (Const64 [c])) && (1 << 8) <= c => (ConstBool [true]) +(IsInBounds (ZeroExt16to32 _) (Const32 [c])) && (1 << 16) <= c => (ConstBool [true]) +(IsInBounds (ZeroExt16to64 _) (Const64 [c])) && (1 << 16) <= c => (ConstBool [true]) +(IsInBounds x x) => (ConstBool [false]) +(IsInBounds (And8 (Const8 [c]) _) (Const8 [d])) && 0 <= c && c < d => (ConstBool [true]) +(IsInBounds (ZeroExt8to16 (And8 (Const8 [c]) _)) (Const16 [d])) && 0 <= c && int16(c) < d => (ConstBool [true]) +(IsInBounds (ZeroExt8to32 (And8 (Const8 [c]) _)) (Const32 [d])) && 0 <= c && int32(c) < d => (ConstBool [true]) +(IsInBounds (ZeroExt8to64 (And8 (Const8 [c]) _)) (Const64 [d])) && 0 <= c && int64(c) < d => (ConstBool [true]) +(IsInBounds (And16 (Const16 [c]) _) (Const16 [d])) && 0 <= c && c < d => (ConstBool [true]) +(IsInBounds (ZeroExt16to32 (And16 (Const16 [c]) _)) (Const32 [d])) && 0 <= c && int32(c) < d => (ConstBool [true]) +(IsInBounds (ZeroExt16to64 (And16 (Const16 [c]) _)) (Const64 [d])) && 0 <= c && int64(c) < d => (ConstBool [true]) +(IsInBounds (And32 (Const32 [c]) _) (Const32 [d])) && 0 <= c && c < d => (ConstBool [true]) +(IsInBounds (ZeroExt32to64 (And32 (Const32 [c]) _)) (Const64 [d])) && 0 <= c && int64(c) < d => (ConstBool [true]) +(IsInBounds (And64 (Const64 [c]) _) (Const64 [d])) && 0 <= c && c < d => (ConstBool [true]) +(IsInBounds (Const32 [c]) (Const32 [d])) => (ConstBool [0 <= c && c < d]) +(IsInBounds (Const64 [c]) (Const64 [d])) => (ConstBool [0 <= c && c < d]) +// (Mod64u x y) is always between 0 (inclusive) and y (exclusive). +(IsInBounds (Mod32u _ y) y) => (ConstBool [true]) +(IsInBounds (Mod64u _ y) y) => (ConstBool [true]) +// Right shifting an unsigned number limits its value. +(IsInBounds (ZeroExt8to64 (Rsh8Ux64 _ (Const64 [c]))) (Const64 [d])) && 0 < c && c < 8 && 1<<uint( 8-c)-1 < d => (ConstBool [true]) +(IsInBounds (ZeroExt8to32 (Rsh8Ux64 _ (Const64 [c]))) (Const32 [d])) && 0 < c && c < 8 && 1<<uint( 8-c)-1 < d => (ConstBool [true]) +(IsInBounds (ZeroExt8to16 (Rsh8Ux64 _ (Const64 [c]))) (Const16 [d])) && 0 < c && c < 8 && 1<<uint( 8-c)-1 < d => (ConstBool [true]) +(IsInBounds (Rsh8Ux64 _ (Const64 [c])) (Const64 [d])) && 0 < c && c < 8 && 1<<uint( 8-c)-1 < d => (ConstBool [true]) +(IsInBounds (ZeroExt16to64 (Rsh16Ux64 _ (Const64 [c]))) (Const64 [d])) && 0 < c && c < 16 && 1<<uint(16-c)-1 < d => (ConstBool [true]) +(IsInBounds (ZeroExt16to32 (Rsh16Ux64 _ (Const64 [c]))) (Const64 [d])) && 0 < c && c < 16 && 1<<uint(16-c)-1 < d => (ConstBool [true]) +(IsInBounds (Rsh16Ux64 _ (Const64 [c])) (Const64 [d])) && 0 < c && c < 16 && 1<<uint(16-c)-1 < d => (ConstBool [true]) +(IsInBounds (ZeroExt32to64 (Rsh32Ux64 _ (Const64 [c]))) (Const64 [d])) && 0 < c && c < 32 && 1<<uint(32-c)-1 < d => (ConstBool [true]) +(IsInBounds (Rsh32Ux64 _ (Const64 [c])) (Const64 [d])) && 0 < c && c < 32 && 1<<uint(32-c)-1 < d => (ConstBool [true]) +(IsInBounds (Rsh64Ux64 _ (Const64 [c])) (Const64 [d])) && 0 < c && c < 64 && 1<<uint(64-c)-1 < d => (ConstBool [true]) + +(IsSliceInBounds x x) => (ConstBool [true]) +(IsSliceInBounds (And32 (Const32 [c]) _) (Const32 [d])) && 0 <= c && c <= d => (ConstBool [true]) +(IsSliceInBounds (And64 (Const64 [c]) _) (Const64 [d])) && 0 <= c && c <= d => (ConstBool [true]) +(IsSliceInBounds (Const32 [0]) _) => (ConstBool [true]) +(IsSliceInBounds (Const64 [0]) _) => (ConstBool [true]) +(IsSliceInBounds (Const32 [c]) (Const32 [d])) => (ConstBool [0 <= c && c <= d]) +(IsSliceInBounds (Const64 [c]) (Const64 [d])) => (ConstBool [0 <= c && c <= d]) +(IsSliceInBounds (SliceLen x) (SliceCap x)) => (ConstBool [true]) + +(Eq(64|32|16|8) x x) => (ConstBool [true]) +(EqB (ConstBool [c]) (ConstBool [d])) => (ConstBool [c == d]) +(EqB (ConstBool [false]) x) => (Not x) +(EqB (ConstBool [true]) x) => x + +(Neq(64|32|16|8) x x) => (ConstBool [false]) +(NeqB (ConstBool [c]) (ConstBool [d])) => (ConstBool [c != d]) +(NeqB (ConstBool [false]) x) => x +(NeqB (ConstBool [true]) x) => (Not x) +(NeqB (Not x) (Not y)) => (NeqB x y) + +(Eq64 (Const64 <t> [c]) (Add64 (Const64 <t> [d]) x)) => (Eq64 (Const64 <t> [c-d]) x) +(Eq32 (Const32 <t> [c]) (Add32 (Const32 <t> [d]) x)) => (Eq32 (Const32 <t> [c-d]) x) +(Eq16 (Const16 <t> [c]) (Add16 (Const16 <t> [d]) x)) => (Eq16 (Const16 <t> [c-d]) x) +(Eq8 (Const8 <t> [c]) (Add8 (Const8 <t> [d]) x)) => (Eq8 (Const8 <t> [c-d]) x) + +(Neq64 (Const64 <t> [c]) (Add64 (Const64 <t> [d]) x)) => (Neq64 (Const64 <t> [c-d]) x) +(Neq32 (Const32 <t> [c]) (Add32 (Const32 <t> [d]) x)) => (Neq32 (Const32 <t> [c-d]) x) +(Neq16 (Const16 <t> [c]) (Add16 (Const16 <t> [d]) x)) => (Neq16 (Const16 <t> [c-d]) x) +(Neq8 (Const8 <t> [c]) (Add8 (Const8 <t> [d]) x)) => (Neq8 (Const8 <t> [c-d]) x) + +// signed integer range: ( c <= x && x (<|<=) d ) -> ( unsigned(x-c) (<|<=) unsigned(d-c) ) +(AndB (Leq64 (Const64 [c]) x) ((Less|Leq)64 x (Const64 [d]))) && d >= c => ((Less|Leq)64U (Sub64 <x.Type> x (Const64 <x.Type> [c])) (Const64 <x.Type> [d-c])) +(AndB (Leq32 (Const32 [c]) x) ((Less|Leq)32 x (Const32 [d]))) && d >= c => ((Less|Leq)32U (Sub32 <x.Type> x (Const32 <x.Type> [c])) (Const32 <x.Type> [d-c])) +(AndB (Leq16 (Const16 [c]) x) ((Less|Leq)16 x (Const16 [d]))) && d >= c => ((Less|Leq)16U (Sub16 <x.Type> x (Const16 <x.Type> [c])) (Const16 <x.Type> [d-c])) +(AndB (Leq8 (Const8 [c]) x) ((Less|Leq)8 x (Const8 [d]))) && d >= c => ((Less|Leq)8U (Sub8 <x.Type> x (Const8 <x.Type> [c])) (Const8 <x.Type> [d-c])) + +// signed integer range: ( c < x && x (<|<=) d ) -> ( unsigned(x-(c+1)) (<|<=) unsigned(d-(c+1)) ) +(AndB (Less64 (Const64 [c]) x) ((Less|Leq)64 x (Const64 [d]))) && d >= c+1 && c+1 > c => ((Less|Leq)64U (Sub64 <x.Type> x (Const64 <x.Type> [c+1])) (Const64 <x.Type> [d-c-1])) +(AndB (Less32 (Const32 [c]) x) ((Less|Leq)32 x (Const32 [d]))) && d >= c+1 && c+1 > c => ((Less|Leq)32U (Sub32 <x.Type> x (Const32 <x.Type> [c+1])) (Const32 <x.Type> [d-c-1])) +(AndB (Less16 (Const16 [c]) x) ((Less|Leq)16 x (Const16 [d]))) && d >= c+1 && c+1 > c => ((Less|Leq)16U (Sub16 <x.Type> x (Const16 <x.Type> [c+1])) (Const16 <x.Type> [d-c-1])) +(AndB (Less8 (Const8 [c]) x) ((Less|Leq)8 x (Const8 [d]))) && d >= c+1 && c+1 > c => ((Less|Leq)8U (Sub8 <x.Type> x (Const8 <x.Type> [c+1])) (Const8 <x.Type> [d-c-1])) + +// unsigned integer range: ( c <= x && x (<|<=) d ) -> ( x-c (<|<=) d-c ) +(AndB (Leq64U (Const64 [c]) x) ((Less|Leq)64U x (Const64 [d]))) && uint64(d) >= uint64(c) => ((Less|Leq)64U (Sub64 <x.Type> x (Const64 <x.Type> [c])) (Const64 <x.Type> [d-c])) +(AndB (Leq32U (Const32 [c]) x) ((Less|Leq)32U x (Const32 [d]))) && uint32(d) >= uint32(c) => ((Less|Leq)32U (Sub32 <x.Type> x (Const32 <x.Type> [c])) (Const32 <x.Type> [d-c])) +(AndB (Leq16U (Const16 [c]) x) ((Less|Leq)16U x (Const16 [d]))) && uint16(d) >= uint16(c) => ((Less|Leq)16U (Sub16 <x.Type> x (Const16 <x.Type> [c])) (Const16 <x.Type> [d-c])) +(AndB (Leq8U (Const8 [c]) x) ((Less|Leq)8U x (Const8 [d]))) && uint8(d) >= uint8(c) => ((Less|Leq)8U (Sub8 <x.Type> x (Const8 <x.Type> [c])) (Const8 <x.Type> [d-c])) + +// unsigned integer range: ( c < x && x (<|<=) d ) -> ( x-(c+1) (<|<=) d-(c+1) ) +(AndB (Less64U (Const64 [c]) x) ((Less|Leq)64U x (Const64 [d]))) && uint64(d) >= uint64(c+1) && uint64(c+1) > uint64(c) => ((Less|Leq)64U (Sub64 <x.Type> x (Const64 <x.Type> [c+1])) (Const64 <x.Type> [d-c-1])) +(AndB (Less32U (Const32 [c]) x) ((Less|Leq)32U x (Const32 [d]))) && uint32(d) >= uint32(c+1) && uint32(c+1) > uint32(c) => ((Less|Leq)32U (Sub32 <x.Type> x (Const32 <x.Type> [c+1])) (Const32 <x.Type> [d-c-1])) +(AndB (Less16U (Const16 [c]) x) ((Less|Leq)16U x (Const16 [d]))) && uint16(d) >= uint16(c+1) && uint16(c+1) > uint16(c) => ((Less|Leq)16U (Sub16 <x.Type> x (Const16 <x.Type> [c+1])) (Const16 <x.Type> [d-c-1])) +(AndB (Less8U (Const8 [c]) x) ((Less|Leq)8U x (Const8 [d]))) && uint8(d) >= uint8(c+1) && uint8(c+1) > uint8(c) => ((Less|Leq)8U (Sub8 <x.Type> x (Const8 <x.Type> [c+1])) (Const8 <x.Type> [d-c-1])) + +// signed integer range: ( c (<|<=) x || x < d ) -> ( unsigned(c-d) (<|<=) unsigned(x-d) ) +(OrB ((Less|Leq)64 (Const64 [c]) x) (Less64 x (Const64 [d]))) && c >= d => ((Less|Leq)64U (Const64 <x.Type> [c-d]) (Sub64 <x.Type> x (Const64 <x.Type> [d]))) +(OrB ((Less|Leq)32 (Const32 [c]) x) (Less32 x (Const32 [d]))) && c >= d => ((Less|Leq)32U (Const32 <x.Type> [c-d]) (Sub32 <x.Type> x (Const32 <x.Type> [d]))) +(OrB ((Less|Leq)16 (Const16 [c]) x) (Less16 x (Const16 [d]))) && c >= d => ((Less|Leq)16U (Const16 <x.Type> [c-d]) (Sub16 <x.Type> x (Const16 <x.Type> [d]))) +(OrB ((Less|Leq)8 (Const8 [c]) x) (Less8 x (Const8 [d]))) && c >= d => ((Less|Leq)8U (Const8 <x.Type> [c-d]) (Sub8 <x.Type> x (Const8 <x.Type> [d]))) + +// signed integer range: ( c (<|<=) x || x <= d ) -> ( unsigned(c-(d+1)) (<|<=) unsigned(x-(d+1)) ) +(OrB ((Less|Leq)64 (Const64 [c]) x) (Leq64 x (Const64 [d]))) && c >= d+1 && d+1 > d => ((Less|Leq)64U (Const64 <x.Type> [c-d-1]) (Sub64 <x.Type> x (Const64 <x.Type> [d+1]))) +(OrB ((Less|Leq)32 (Const32 [c]) x) (Leq32 x (Const32 [d]))) && c >= d+1 && d+1 > d => ((Less|Leq)32U (Const32 <x.Type> [c-d-1]) (Sub32 <x.Type> x (Const32 <x.Type> [d+1]))) +(OrB ((Less|Leq)16 (Const16 [c]) x) (Leq16 x (Const16 [d]))) && c >= d+1 && d+1 > d => ((Less|Leq)16U (Const16 <x.Type> [c-d-1]) (Sub16 <x.Type> x (Const16 <x.Type> [d+1]))) +(OrB ((Less|Leq)8 (Const8 [c]) x) (Leq8 x (Const8 [d]))) && c >= d+1 && d+1 > d => ((Less|Leq)8U (Const8 <x.Type> [c-d-1]) (Sub8 <x.Type> x (Const8 <x.Type> [d+1]))) + +// unsigned integer range: ( c (<|<=) x || x < d ) -> ( c-d (<|<=) x-d ) +(OrB ((Less|Leq)64U (Const64 [c]) x) (Less64U x (Const64 [d]))) && uint64(c) >= uint64(d) => ((Less|Leq)64U (Const64 <x.Type> [c-d]) (Sub64 <x.Type> x (Const64 <x.Type> [d]))) +(OrB ((Less|Leq)32U (Const32 [c]) x) (Less32U x (Const32 [d]))) && uint32(c) >= uint32(d) => ((Less|Leq)32U (Const32 <x.Type> [c-d]) (Sub32 <x.Type> x (Const32 <x.Type> [d]))) +(OrB ((Less|Leq)16U (Const16 [c]) x) (Less16U x (Const16 [d]))) && uint16(c) >= uint16(d) => ((Less|Leq)16U (Const16 <x.Type> [c-d]) (Sub16 <x.Type> x (Const16 <x.Type> [d]))) +(OrB ((Less|Leq)8U (Const8 [c]) x) (Less8U x (Const8 [d]))) && uint8(c) >= uint8(d) => ((Less|Leq)8U (Const8 <x.Type> [c-d]) (Sub8 <x.Type> x (Const8 <x.Type> [d]))) + +// unsigned integer range: ( c (<|<=) x || x <= d ) -> ( c-(d+1) (<|<=) x-(d+1) ) +(OrB ((Less|Leq)64U (Const64 [c]) x) (Leq64U x (Const64 [d]))) && uint64(c) >= uint64(d+1) && uint64(d+1) > uint64(d) => ((Less|Leq)64U (Const64 <x.Type> [c-d-1]) (Sub64 <x.Type> x (Const64 <x.Type> [d+1]))) +(OrB ((Less|Leq)32U (Const32 [c]) x) (Leq32U x (Const32 [d]))) && uint32(c) >= uint32(d+1) && uint32(d+1) > uint32(d) => ((Less|Leq)32U (Const32 <x.Type> [c-d-1]) (Sub32 <x.Type> x (Const32 <x.Type> [d+1]))) +(OrB ((Less|Leq)16U (Const16 [c]) x) (Leq16U x (Const16 [d]))) && uint16(c) >= uint16(d+1) && uint16(d+1) > uint16(d) => ((Less|Leq)16U (Const16 <x.Type> [c-d-1]) (Sub16 <x.Type> x (Const16 <x.Type> [d+1]))) +(OrB ((Less|Leq)8U (Const8 [c]) x) (Leq8U x (Const8 [d]))) && uint8(c) >= uint8(d+1) && uint8(d+1) > uint8(d) => ((Less|Leq)8U (Const8 <x.Type> [c-d-1]) (Sub8 <x.Type> x (Const8 <x.Type> [d+1]))) + +// Canonicalize x-const to x+(-const) +(Sub64 x (Const64 <t> [c])) && x.Op != OpConst64 => (Add64 (Const64 <t> [-c]) x) +(Sub32 x (Const32 <t> [c])) && x.Op != OpConst32 => (Add32 (Const32 <t> [-c]) x) +(Sub16 x (Const16 <t> [c])) && x.Op != OpConst16 => (Add16 (Const16 <t> [-c]) x) +(Sub8 x (Const8 <t> [c])) && x.Op != OpConst8 => (Add8 (Const8 <t> [-c]) x) + +// fold negation into comparison operators +(Not (Eq(64|32|16|8|B|Ptr|64F|32F) x y)) => (Neq(64|32|16|8|B|Ptr|64F|32F) x y) +(Not (Neq(64|32|16|8|B|Ptr|64F|32F) x y)) => (Eq(64|32|16|8|B|Ptr|64F|32F) x y) + +(Not (Less(64|32|16|8) x y)) => (Leq(64|32|16|8) y x) +(Not (Less(64|32|16|8)U x y)) => (Leq(64|32|16|8)U y x) +(Not (Leq(64|32|16|8) x y)) => (Less(64|32|16|8) y x) +(Not (Leq(64|32|16|8)U x y)) => (Less(64|32|16|8)U y x) + +// Distribute multiplication c * (d+x) -> c*d + c*x. Useful for: +// a[i].b = ...; a[i+1].b = ... +(Mul64 (Const64 <t> [c]) (Add64 <t> (Const64 <t> [d]) x)) => + (Add64 (Const64 <t> [c*d]) (Mul64 <t> (Const64 <t> [c]) x)) +(Mul32 (Const32 <t> [c]) (Add32 <t> (Const32 <t> [d]) x)) => + (Add32 (Const32 <t> [c*d]) (Mul32 <t> (Const32 <t> [c]) x)) + +// Rewrite x*y ± x*z to x*(y±z) +(Add(64|32|16|8) <t> (Mul(64|32|16|8) x y) (Mul(64|32|16|8) x z)) + => (Mul(64|32|16|8) x (Add(64|32|16|8) <t> y z)) +(Sub(64|32|16|8) <t> (Mul(64|32|16|8) x y) (Mul(64|32|16|8) x z)) + => (Mul(64|32|16|8) x (Sub(64|32|16|8) <t> y z)) + +// rewrite shifts of 8/16/32 bit consts into 64 bit consts to reduce +// the number of the other rewrite rules for const shifts +(Lsh64x32 <t> x (Const32 [c])) => (Lsh64x64 x (Const64 <t> [int64(uint32(c))])) +(Lsh64x16 <t> x (Const16 [c])) => (Lsh64x64 x (Const64 <t> [int64(uint16(c))])) +(Lsh64x8 <t> x (Const8 [c])) => (Lsh64x64 x (Const64 <t> [int64(uint8(c))])) +(Rsh64x32 <t> x (Const32 [c])) => (Rsh64x64 x (Const64 <t> [int64(uint32(c))])) +(Rsh64x16 <t> x (Const16 [c])) => (Rsh64x64 x (Const64 <t> [int64(uint16(c))])) +(Rsh64x8 <t> x (Const8 [c])) => (Rsh64x64 x (Const64 <t> [int64(uint8(c))])) +(Rsh64Ux32 <t> x (Const32 [c])) => (Rsh64Ux64 x (Const64 <t> [int64(uint32(c))])) +(Rsh64Ux16 <t> x (Const16 [c])) => (Rsh64Ux64 x (Const64 <t> [int64(uint16(c))])) +(Rsh64Ux8 <t> x (Const8 [c])) => (Rsh64Ux64 x (Const64 <t> [int64(uint8(c))])) + +(Lsh32x32 <t> x (Const32 [c])) => (Lsh32x64 x (Const64 <t> [int64(uint32(c))])) +(Lsh32x16 <t> x (Const16 [c])) => (Lsh32x64 x (Const64 <t> [int64(uint16(c))])) +(Lsh32x8 <t> x (Const8 [c])) => (Lsh32x64 x (Const64 <t> [int64(uint8(c))])) +(Rsh32x32 <t> x (Const32 [c])) => (Rsh32x64 x (Const64 <t> [int64(uint32(c))])) +(Rsh32x16 <t> x (Const16 [c])) => (Rsh32x64 x (Const64 <t> [int64(uint16(c))])) +(Rsh32x8 <t> x (Const8 [c])) => (Rsh32x64 x (Const64 <t> [int64(uint8(c))])) +(Rsh32Ux32 <t> x (Const32 [c])) => (Rsh32Ux64 x (Const64 <t> [int64(uint32(c))])) +(Rsh32Ux16 <t> x (Const16 [c])) => (Rsh32Ux64 x (Const64 <t> [int64(uint16(c))])) +(Rsh32Ux8 <t> x (Const8 [c])) => (Rsh32Ux64 x (Const64 <t> [int64(uint8(c))])) + +(Lsh16x32 <t> x (Const32 [c])) => (Lsh16x64 x (Const64 <t> [int64(uint32(c))])) +(Lsh16x16 <t> x (Const16 [c])) => (Lsh16x64 x (Const64 <t> [int64(uint16(c))])) +(Lsh16x8 <t> x (Const8 [c])) => (Lsh16x64 x (Const64 <t> [int64(uint8(c))])) +(Rsh16x32 <t> x (Const32 [c])) => (Rsh16x64 x (Const64 <t> [int64(uint32(c))])) +(Rsh16x16 <t> x (Const16 [c])) => (Rsh16x64 x (Const64 <t> [int64(uint16(c))])) +(Rsh16x8 <t> x (Const8 [c])) => (Rsh16x64 x (Const64 <t> [int64(uint8(c))])) +(Rsh16Ux32 <t> x (Const32 [c])) => (Rsh16Ux64 x (Const64 <t> [int64(uint32(c))])) +(Rsh16Ux16 <t> x (Const16 [c])) => (Rsh16Ux64 x (Const64 <t> [int64(uint16(c))])) +(Rsh16Ux8 <t> x (Const8 [c])) => (Rsh16Ux64 x (Const64 <t> [int64(uint8(c))])) + +(Lsh8x32 <t> x (Const32 [c])) => (Lsh8x64 x (Const64 <t> [int64(uint32(c))])) +(Lsh8x16 <t> x (Const16 [c])) => (Lsh8x64 x (Const64 <t> [int64(uint16(c))])) +(Lsh8x8 <t> x (Const8 [c])) => (Lsh8x64 x (Const64 <t> [int64(uint8(c))])) +(Rsh8x32 <t> x (Const32 [c])) => (Rsh8x64 x (Const64 <t> [int64(uint32(c))])) +(Rsh8x16 <t> x (Const16 [c])) => (Rsh8x64 x (Const64 <t> [int64(uint16(c))])) +(Rsh8x8 <t> x (Const8 [c])) => (Rsh8x64 x (Const64 <t> [int64(uint8(c))])) +(Rsh8Ux32 <t> x (Const32 [c])) => (Rsh8Ux64 x (Const64 <t> [int64(uint32(c))])) +(Rsh8Ux16 <t> x (Const16 [c])) => (Rsh8Ux64 x (Const64 <t> [int64(uint16(c))])) +(Rsh8Ux8 <t> x (Const8 [c])) => (Rsh8Ux64 x (Const64 <t> [int64(uint8(c))])) + +// shifts by zero +(Lsh(64|32|16|8)x64 x (Const64 [0])) => x +(Rsh(64|32|16|8)x64 x (Const64 [0])) => x +(Rsh(64|32|16|8)Ux64 x (Const64 [0])) => x + +// rotates by multiples of register width +(RotateLeft64 x (Const64 [c])) && c%64 == 0 => x +(RotateLeft32 x (Const32 [c])) && c%32 == 0 => x +(RotateLeft16 x (Const16 [c])) && c%16 == 0 => x +(RotateLeft8 x (Const8 [c])) && c%8 == 0 => x + +// zero shifted +(Lsh64x(64|32|16|8) (Const64 [0]) _) => (Const64 [0]) +(Rsh64x(64|32|16|8) (Const64 [0]) _) => (Const64 [0]) +(Rsh64Ux(64|32|16|8) (Const64 [0]) _) => (Const64 [0]) +(Lsh32x(64|32|16|8) (Const32 [0]) _) => (Const32 [0]) +(Rsh32x(64|32|16|8) (Const32 [0]) _) => (Const32 [0]) +(Rsh32Ux(64|32|16|8) (Const32 [0]) _) => (Const32 [0]) +(Lsh16x(64|32|16|8) (Const16 [0]) _) => (Const16 [0]) +(Rsh16x(64|32|16|8) (Const16 [0]) _) => (Const16 [0]) +(Rsh16Ux(64|32|16|8) (Const16 [0]) _) => (Const16 [0]) +(Lsh8x(64|32|16|8) (Const8 [0]) _) => (Const8 [0]) +(Rsh8x(64|32|16|8) (Const8 [0]) _) => (Const8 [0]) +(Rsh8Ux(64|32|16|8) (Const8 [0]) _) => (Const8 [0]) + +// large left shifts of all values, and right shifts of unsigned values +((Lsh64|Rsh64U)x64 _ (Const64 [c])) && uint64(c) >= 64 => (Const64 [0]) +((Lsh32|Rsh32U)x64 _ (Const64 [c])) && uint64(c) >= 32 => (Const32 [0]) +((Lsh16|Rsh16U)x64 _ (Const64 [c])) && uint64(c) >= 16 => (Const16 [0]) +((Lsh8|Rsh8U)x64 _ (Const64 [c])) && uint64(c) >= 8 => (Const8 [0]) + +// combine const shifts +(Lsh64x64 <t> (Lsh64x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Lsh64x64 x (Const64 <t> [c+d])) +(Lsh32x64 <t> (Lsh32x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Lsh32x64 x (Const64 <t> [c+d])) +(Lsh16x64 <t> (Lsh16x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Lsh16x64 x (Const64 <t> [c+d])) +(Lsh8x64 <t> (Lsh8x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Lsh8x64 x (Const64 <t> [c+d])) + +(Rsh64x64 <t> (Rsh64x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh64x64 x (Const64 <t> [c+d])) +(Rsh32x64 <t> (Rsh32x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh32x64 x (Const64 <t> [c+d])) +(Rsh16x64 <t> (Rsh16x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh16x64 x (Const64 <t> [c+d])) +(Rsh8x64 <t> (Rsh8x64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh8x64 x (Const64 <t> [c+d])) + +(Rsh64Ux64 <t> (Rsh64Ux64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh64Ux64 x (Const64 <t> [c+d])) +(Rsh32Ux64 <t> (Rsh32Ux64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh32Ux64 x (Const64 <t> [c+d])) +(Rsh16Ux64 <t> (Rsh16Ux64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh16Ux64 x (Const64 <t> [c+d])) +(Rsh8Ux64 <t> (Rsh8Ux64 x (Const64 [c])) (Const64 [d])) && !uaddOvf(c,d) => (Rsh8Ux64 x (Const64 <t> [c+d])) + +// Remove signed right shift before an unsigned right shift that extracts the sign bit. +(Rsh8Ux64 (Rsh8x64 x _) (Const64 <t> [7] )) => (Rsh8Ux64 x (Const64 <t> [7] )) +(Rsh16Ux64 (Rsh16x64 x _) (Const64 <t> [15])) => (Rsh16Ux64 x (Const64 <t> [15])) +(Rsh32Ux64 (Rsh32x64 x _) (Const64 <t> [31])) => (Rsh32Ux64 x (Const64 <t> [31])) +(Rsh64Ux64 (Rsh64x64 x _) (Const64 <t> [63])) => (Rsh64Ux64 x (Const64 <t> [63])) + +// Convert x>>c<<c to x&^(1<<c-1) +(Lsh64x64 i:(Rsh(64|64U)x64 x (Const64 [c])) (Const64 [c])) && c >= 0 && c < 64 && i.Uses == 1 => (And64 x (Const64 <v.Type> [int64(-1) << c])) +(Lsh32x64 i:(Rsh(32|32U)x64 x (Const64 [c])) (Const64 [c])) && c >= 0 && c < 32 && i.Uses == 1 => (And32 x (Const32 <v.Type> [int32(-1) << c])) +(Lsh16x64 i:(Rsh(16|16U)x64 x (Const64 [c])) (Const64 [c])) && c >= 0 && c < 16 && i.Uses == 1 => (And16 x (Const16 <v.Type> [int16(-1) << c])) +(Lsh8x64 i:(Rsh(8|8U)x64 x (Const64 [c])) (Const64 [c])) && c >= 0 && c < 8 && i.Uses == 1 => (And8 x (Const8 <v.Type> [int8(-1) << c])) +// similarly for x<<c>>c +(Rsh64Ux64 i:(Lsh64x64 x (Const64 [c])) (Const64 [c])) && c >= 0 && c < 64 && i.Uses == 1 => (And64 x (Const64 <v.Type> [int64(^uint64(0)>>c)])) +(Rsh32Ux64 i:(Lsh32x64 x (Const64 [c])) (Const64 [c])) && c >= 0 && c < 32 && i.Uses == 1 => (And32 x (Const32 <v.Type> [int32(^uint32(0)>>c)])) +(Rsh16Ux64 i:(Lsh16x64 x (Const64 [c])) (Const64 [c])) && c >= 0 && c < 16 && i.Uses == 1 => (And16 x (Const16 <v.Type> [int16(^uint16(0)>>c)])) +(Rsh8Ux64 i:(Lsh8x64 x (Const64 [c])) (Const64 [c])) && c >= 0 && c < 8 && i.Uses == 1 => (And8 x (Const8 <v.Type> [int8 (^uint8 (0)>>c)])) + +// ((x >> c1) << c2) >> c3 +(Rsh(64|32|16|8)Ux64 (Lsh(64|32|16|8)x64 (Rsh(64|32|16|8)Ux64 x (Const64 [c1])) (Const64 [c2])) (Const64 [c3])) + && uint64(c1) >= uint64(c2) && uint64(c3) >= uint64(c2) && !uaddOvf(c1-c2, c3) + => (Rsh(64|32|16|8)Ux64 x (Const64 <typ.UInt64> [c1-c2+c3])) + +// ((x << c1) >> c2) << c3 +(Lsh(64|32|16|8)x64 (Rsh(64|32|16|8)Ux64 (Lsh(64|32|16|8)x64 x (Const64 [c1])) (Const64 [c2])) (Const64 [c3])) + && uint64(c1) >= uint64(c2) && uint64(c3) >= uint64(c2) && !uaddOvf(c1-c2, c3) + => (Lsh(64|32|16|8)x64 x (Const64 <typ.UInt64> [c1-c2+c3])) + +// (x >> c) & uppermask = 0 +(And64 (Const64 [m]) (Rsh64Ux64 _ (Const64 [c]))) && c >= int64(64-ntz64(m)) => (Const64 [0]) +(And32 (Const32 [m]) (Rsh32Ux64 _ (Const64 [c]))) && c >= int64(32-ntz32(m)) => (Const32 [0]) +(And16 (Const16 [m]) (Rsh16Ux64 _ (Const64 [c]))) && c >= int64(16-ntz16(m)) => (Const16 [0]) +(And8 (Const8 [m]) (Rsh8Ux64 _ (Const64 [c]))) && c >= int64(8-ntz8(m)) => (Const8 [0]) + +// (x << c) & lowermask = 0 +(And64 (Const64 [m]) (Lsh64x64 _ (Const64 [c]))) && c >= int64(64-nlz64(m)) => (Const64 [0]) +(And32 (Const32 [m]) (Lsh32x64 _ (Const64 [c]))) && c >= int64(32-nlz32(m)) => (Const32 [0]) +(And16 (Const16 [m]) (Lsh16x64 _ (Const64 [c]))) && c >= int64(16-nlz16(m)) => (Const16 [0]) +(And8 (Const8 [m]) (Lsh8x64 _ (Const64 [c]))) && c >= int64(8-nlz8(m)) => (Const8 [0]) + +// replace shifts with zero extensions +(Rsh16Ux64 (Lsh16x64 x (Const64 [8])) (Const64 [8])) => (ZeroExt8to16 (Trunc16to8 <typ.UInt8> x)) +(Rsh32Ux64 (Lsh32x64 x (Const64 [24])) (Const64 [24])) => (ZeroExt8to32 (Trunc32to8 <typ.UInt8> x)) +(Rsh64Ux64 (Lsh64x64 x (Const64 [56])) (Const64 [56])) => (ZeroExt8to64 (Trunc64to8 <typ.UInt8> x)) +(Rsh32Ux64 (Lsh32x64 x (Const64 [16])) (Const64 [16])) => (ZeroExt16to32 (Trunc32to16 <typ.UInt16> x)) +(Rsh64Ux64 (Lsh64x64 x (Const64 [48])) (Const64 [48])) => (ZeroExt16to64 (Trunc64to16 <typ.UInt16> x)) +(Rsh64Ux64 (Lsh64x64 x (Const64 [32])) (Const64 [32])) => (ZeroExt32to64 (Trunc64to32 <typ.UInt32> x)) + +// replace shifts with sign extensions +(Rsh16x64 (Lsh16x64 x (Const64 [8])) (Const64 [8])) => (SignExt8to16 (Trunc16to8 <typ.Int8> x)) +(Rsh32x64 (Lsh32x64 x (Const64 [24])) (Const64 [24])) => (SignExt8to32 (Trunc32to8 <typ.Int8> x)) +(Rsh64x64 (Lsh64x64 x (Const64 [56])) (Const64 [56])) => (SignExt8to64 (Trunc64to8 <typ.Int8> x)) +(Rsh32x64 (Lsh32x64 x (Const64 [16])) (Const64 [16])) => (SignExt16to32 (Trunc32to16 <typ.Int16> x)) +(Rsh64x64 (Lsh64x64 x (Const64 [48])) (Const64 [48])) => (SignExt16to64 (Trunc64to16 <typ.Int16> x)) +(Rsh64x64 (Lsh64x64 x (Const64 [32])) (Const64 [32])) => (SignExt32to64 (Trunc64to32 <typ.Int32> x)) + +// constant comparisons +(Eq(64|32|16|8) (Const(64|32|16|8) [c]) (Const(64|32|16|8) [d])) => (ConstBool [c == d]) +(Neq(64|32|16|8) (Const(64|32|16|8) [c]) (Const(64|32|16|8) [d])) => (ConstBool [c != d]) +(Less(64|32|16|8) (Const(64|32|16|8) [c]) (Const(64|32|16|8) [d])) => (ConstBool [c < d]) +(Leq(64|32|16|8) (Const(64|32|16|8) [c]) (Const(64|32|16|8) [d])) => (ConstBool [c <= d]) + +(Less64U (Const64 [c]) (Const64 [d])) => (ConstBool [uint64(c) < uint64(d)]) +(Less32U (Const32 [c]) (Const32 [d])) => (ConstBool [uint32(c) < uint32(d)]) +(Less16U (Const16 [c]) (Const16 [d])) => (ConstBool [uint16(c) < uint16(d)]) +(Less8U (Const8 [c]) (Const8 [d])) => (ConstBool [ uint8(c) < uint8(d)]) + +(Leq64U (Const64 [c]) (Const64 [d])) => (ConstBool [uint64(c) <= uint64(d)]) +(Leq32U (Const32 [c]) (Const32 [d])) => (ConstBool [uint32(c) <= uint32(d)]) +(Leq16U (Const16 [c]) (Const16 [d])) => (ConstBool [uint16(c) <= uint16(d)]) +(Leq8U (Const8 [c]) (Const8 [d])) => (ConstBool [ uint8(c) <= uint8(d)]) + +(Leq8 (Const8 [0]) (And8 _ (Const8 [c]))) && c >= 0 => (ConstBool [true]) +(Leq16 (Const16 [0]) (And16 _ (Const16 [c]))) && c >= 0 => (ConstBool [true]) +(Leq32 (Const32 [0]) (And32 _ (Const32 [c]))) && c >= 0 => (ConstBool [true]) +(Leq64 (Const64 [0]) (And64 _ (Const64 [c]))) && c >= 0 => (ConstBool [true]) + +(Leq8 (Const8 [0]) (Rsh8Ux64 _ (Const64 [c]))) && c > 0 => (ConstBool [true]) +(Leq16 (Const16 [0]) (Rsh16Ux64 _ (Const64 [c]))) && c > 0 => (ConstBool [true]) +(Leq32 (Const32 [0]) (Rsh32Ux64 _ (Const64 [c]))) && c > 0 => (ConstBool [true]) +(Leq64 (Const64 [0]) (Rsh64Ux64 _ (Const64 [c]))) && c > 0 => (ConstBool [true]) + +(Less(64|32|16|8) (Const(64|32|16|8) <t> [0]) x) && isNonNegative(x) => (Neq(64|32|16|8) (Const(64|32|16|8) <t> [0]) x) +(Less(64|32|16|8) x (Const(64|32|16|8) <t> [1])) && isNonNegative(x) => (Eq(64|32|16|8) (Const(64|32|16|8) <t> [0]) x) + +// constant floating point comparisons +(Eq32F (Const32F [c]) (Const32F [d])) => (ConstBool [c == d]) +(Eq64F (Const64F [c]) (Const64F [d])) => (ConstBool [c == d]) +(Neq32F (Const32F [c]) (Const32F [d])) => (ConstBool [c != d]) +(Neq64F (Const64F [c]) (Const64F [d])) => (ConstBool [c != d]) +(Less32F (Const32F [c]) (Const32F [d])) => (ConstBool [c < d]) +(Less64F (Const64F [c]) (Const64F [d])) => (ConstBool [c < d]) +(Leq32F (Const32F [c]) (Const32F [d])) => (ConstBool [c <= d]) +(Leq64F (Const64F [c]) (Const64F [d])) => (ConstBool [c <= d]) + +// simplifications +(Or(64|32|16|8) x x) => x +(Or(64|32|16|8) (Const(64|32|16|8) [0]) x) => x +(Or(64|32|16|8) (Const(64|32|16|8) [-1]) _) => (Const(64|32|16|8) [-1]) +(Or(64|32|16|8) (Com(64|32|16|8) x) x) => (Const(64|32|16|8) [-1]) + +(And(64|32|16|8) x x) => x +(And(64|32|16|8) (Const(64|32|16|8) [-1]) x) => x +(And(64|32|16|8) (Const(64|32|16|8) [0]) _) => (Const(64|32|16|8) [0]) +(And(64|32|16|8) (Com(64|32|16|8) x) x) => (Const(64|32|16|8) [0]) + +(Xor(64|32|16|8) x x) => (Const(64|32|16|8) [0]) +(Xor(64|32|16|8) (Const(64|32|16|8) [0]) x) => x +(Xor(64|32|16|8) (Com(64|32|16|8) x) x) => (Const(64|32|16|8) [-1]) + +(Add(64|32|16|8) (Const(64|32|16|8) [0]) x) => x +(Sub(64|32|16|8) x x) => (Const(64|32|16|8) [0]) +(Mul(64|32|16|8) (Const(64|32|16|8) [0]) _) => (Const(64|32|16|8) [0]) +(Select0 (Mul(64|32)uover (Const(64|32) [0]) x)) => (Const(64|32) [0]) +(Select1 (Mul(64|32)uover (Const(64|32) [0]) x)) => (ConstBool [false]) + +(Com(64|32|16|8) (Com(64|32|16|8) x)) => x +(Com(64|32|16|8) (Const(64|32|16|8) [c])) => (Const(64|32|16|8) [^c]) + +(Neg(64|32|16|8) (Sub(64|32|16|8) x y)) => (Sub(64|32|16|8) y x) +(Add(64|32|16|8) x (Neg(64|32|16|8) y)) => (Sub(64|32|16|8) x y) + +(Xor(64|32|16|8) (Const(64|32|16|8) [-1]) x) => (Com(64|32|16|8) x) + +(Sub(64|32|16|8) (Neg(64|32|16|8) x) (Com(64|32|16|8) x)) => (Const(64|32|16|8) [1]) +(Sub(64|32|16|8) (Com(64|32|16|8) x) (Neg(64|32|16|8) x)) => (Const(64|32|16|8) [-1]) +(Add(64|32|16|8) (Com(64|32|16|8) x) x) => (Const(64|32|16|8) [-1]) + +// ^(x-1) == ^x+1 == -x +(Add(64|32|16|8) (Const(64|32|16|8) [1]) (Com(64|32|16|8) x)) => (Neg(64|32|16|8) x) +(Com(64|32|16|8) (Add(64|32|16|8) (Const(64|32|16|8) [-1]) x)) => (Neg(64|32|16|8) x) + +// -(-x) == x +(Neg(64|32|16|8) (Neg(64|32|16|8) x)) => x + +// -^x == x+1 +(Neg(64|32|16|8) <t> (Com(64|32|16|8) x)) => (Add(64|32|16|8) (Const(64|32|16|8) <t> [1]) x) + +(And(64|32|16|8) x (And(64|32|16|8) x y)) => (And(64|32|16|8) x y) +(Or(64|32|16|8) x (Or(64|32|16|8) x y)) => (Or(64|32|16|8) x y) +(Xor(64|32|16|8) x (Xor(64|32|16|8) x y)) => y + +// Unsigned comparisons to zero. +(Less(64U|32U|16U|8U) _ (Const(64|32|16|8) [0])) => (ConstBool [false]) +(Leq(64U|32U|16U|8U) (Const(64|32|16|8) [0]) _) => (ConstBool [true]) + +// Ands clear bits. Ors set bits. +// If a subsequent Or will set all the bits +// that an And cleared, we can skip the And. +// This happens in bitmasking code like: +// x &^= 3 << shift // clear two old bits +// x |= v << shift // set two new bits +// when shift is a small constant and v ends up a constant 3. +(Or8 (And8 x (Const8 [c2])) (Const8 <t> [c1])) && ^(c1 | c2) == 0 => (Or8 (Const8 <t> [c1]) x) +(Or16 (And16 x (Const16 [c2])) (Const16 <t> [c1])) && ^(c1 | c2) == 0 => (Or16 (Const16 <t> [c1]) x) +(Or32 (And32 x (Const32 [c2])) (Const32 <t> [c1])) && ^(c1 | c2) == 0 => (Or32 (Const32 <t> [c1]) x) +(Or64 (And64 x (Const64 [c2])) (Const64 <t> [c1])) && ^(c1 | c2) == 0 => (Or64 (Const64 <t> [c1]) x) + +(Trunc64to8 (And64 (Const64 [y]) x)) && y&0xFF == 0xFF => (Trunc64to8 x) +(Trunc64to16 (And64 (Const64 [y]) x)) && y&0xFFFF == 0xFFFF => (Trunc64to16 x) +(Trunc64to32 (And64 (Const64 [y]) x)) && y&0xFFFFFFFF == 0xFFFFFFFF => (Trunc64to32 x) +(Trunc32to8 (And32 (Const32 [y]) x)) && y&0xFF == 0xFF => (Trunc32to8 x) +(Trunc32to16 (And32 (Const32 [y]) x)) && y&0xFFFF == 0xFFFF => (Trunc32to16 x) +(Trunc16to8 (And16 (Const16 [y]) x)) && y&0xFF == 0xFF => (Trunc16to8 x) + +(ZeroExt8to64 (Trunc64to8 x:(Rsh64Ux64 _ (Const64 [s])))) && s >= 56 => x +(ZeroExt16to64 (Trunc64to16 x:(Rsh64Ux64 _ (Const64 [s])))) && s >= 48 => x +(ZeroExt32to64 (Trunc64to32 x:(Rsh64Ux64 _ (Const64 [s])))) && s >= 32 => x +(ZeroExt8to32 (Trunc32to8 x:(Rsh32Ux64 _ (Const64 [s])))) && s >= 24 => x +(ZeroExt16to32 (Trunc32to16 x:(Rsh32Ux64 _ (Const64 [s])))) && s >= 16 => x +(ZeroExt8to16 (Trunc16to8 x:(Rsh16Ux64 _ (Const64 [s])))) && s >= 8 => x + +(SignExt8to64 (Trunc64to8 x:(Rsh64x64 _ (Const64 [s])))) && s >= 56 => x +(SignExt16to64 (Trunc64to16 x:(Rsh64x64 _ (Const64 [s])))) && s >= 48 => x +(SignExt32to64 (Trunc64to32 x:(Rsh64x64 _ (Const64 [s])))) && s >= 32 => x +(SignExt8to32 (Trunc32to8 x:(Rsh32x64 _ (Const64 [s])))) && s >= 24 => x +(SignExt16to32 (Trunc32to16 x:(Rsh32x64 _ (Const64 [s])))) && s >= 16 => x +(SignExt8to16 (Trunc16to8 x:(Rsh16x64 _ (Const64 [s])))) && s >= 8 => x + +(Slicemask (Const32 [x])) && x > 0 => (Const32 [-1]) +(Slicemask (Const32 [0])) => (Const32 [0]) +(Slicemask (Const64 [x])) && x > 0 => (Const64 [-1]) +(Slicemask (Const64 [0])) => (Const64 [0]) + +// simplifications often used for lengths. e.g. len(s[i:i+5])==5 +(Sub(64|32|16|8) (Add(64|32|16|8) x y) x) => y +(Sub(64|32|16|8) (Add(64|32|16|8) x y) y) => x +(Sub(64|32|16|8) (Sub(64|32|16|8) x y) x) => (Neg(64|32|16|8) y) +(Sub(64|32|16|8) x (Add(64|32|16|8) x y)) => (Neg(64|32|16|8) y) +(Add(64|32|16|8) x (Sub(64|32|16|8) y x)) => y +(Add(64|32|16|8) x (Add(64|32|16|8) y (Sub(64|32|16|8) z x))) => (Add(64|32|16|8) y z) + +// basic phi simplifications +(Phi (Const8 [c]) (Const8 [c])) => (Const8 [c]) +(Phi (Const16 [c]) (Const16 [c])) => (Const16 [c]) +(Phi (Const32 [c]) (Const32 [c])) => (Const32 [c]) +(Phi (Const64 [c]) (Const64 [c])) => (Const64 [c]) + +// slice and interface comparisons +// The frontend ensures that we can only compare against nil, +// so we need only compare the first word (interface type or slice ptr). +(EqInter x y) => (EqPtr (ITab x) (ITab y)) +(NeqInter x y) => (NeqPtr (ITab x) (ITab y)) +(EqSlice x y) => (EqPtr (SlicePtr x) (SlicePtr y)) +(NeqSlice x y) => (NeqPtr (SlicePtr x) (SlicePtr y)) + +// Load of store of same address, with compatibly typed value and same size +(Load <t1> p1 (Store {t2} p2 x _)) + && isSamePtr(p1, p2) + && t1.Compare(x.Type) == types.CMPeq + && t1.Size() == t2.Size() + => x +(Load <t1> p1 (Store {t2} p2 _ (Store {t3} p3 x _))) + && isSamePtr(p1, p3) + && t1.Compare(x.Type) == types.CMPeq + && t1.Size() == t2.Size() + && disjoint(p3, t3.Size(), p2, t2.Size()) + => x +(Load <t1> p1 (Store {t2} p2 _ (Store {t3} p3 _ (Store {t4} p4 x _)))) + && isSamePtr(p1, p4) + && t1.Compare(x.Type) == types.CMPeq + && t1.Size() == t2.Size() + && disjoint(p4, t4.Size(), p2, t2.Size()) + && disjoint(p4, t4.Size(), p3, t3.Size()) + => x +(Load <t1> p1 (Store {t2} p2 _ (Store {t3} p3 _ (Store {t4} p4 _ (Store {t5} p5 x _))))) + && isSamePtr(p1, p5) + && t1.Compare(x.Type) == types.CMPeq + && t1.Size() == t2.Size() + && disjoint(p5, t5.Size(), p2, t2.Size()) + && disjoint(p5, t5.Size(), p3, t3.Size()) + && disjoint(p5, t5.Size(), p4, t4.Size()) + => x + +// Pass constants through math.Float{32,64}bits and math.Float{32,64}frombits + (Load <t1> p1 (Store {t2} p2 (Const64 [x]) _)) && isSamePtr(p1,p2) && sizeof(t2) == 8 && is64BitFloat(t1) && !math.IsNaN(math.Float64frombits(uint64(x))) => (Const64F [math.Float64frombits(uint64(x))]) + (Load <t1> p1 (Store {t2} p2 (Const32 [x]) _)) && isSamePtr(p1,p2) && sizeof(t2) == 4 && is32BitFloat(t1) && !math.IsNaN(float64(math.Float32frombits(uint32(x)))) => (Const32F [math.Float32frombits(uint32(x))]) +(Load <t1> p1 (Store {t2} p2 (Const64F [x]) _)) && isSamePtr(p1,p2) && sizeof(t2) == 8 && is64BitInt(t1) => (Const64 [int64(math.Float64bits(x))]) +(Load <t1> p1 (Store {t2} p2 (Const32F [x]) _)) && isSamePtr(p1,p2) && sizeof(t2) == 4 && is32BitInt(t1) => (Const32 [int32(math.Float32bits(x))]) + +// Float Loads up to Zeros so they can be constant folded. +(Load <t1> op:(OffPtr [o1] p1) + (Store {t2} p2 _ + mem:(Zero [n] p3 _))) + && o1 >= 0 && o1+t1.Size() <= n && isSamePtr(p1, p3) + && fe.CanSSA(t1) + && disjoint(op, t1.Size(), p2, t2.Size()) + => @mem.Block (Load <t1> (OffPtr <op.Type> [o1] p3) mem) +(Load <t1> op:(OffPtr [o1] p1) + (Store {t2} p2 _ + (Store {t3} p3 _ + mem:(Zero [n] p4 _)))) + && o1 >= 0 && o1+t1.Size() <= n && isSamePtr(p1, p4) + && fe.CanSSA(t1) + && disjoint(op, t1.Size(), p2, t2.Size()) + && disjoint(op, t1.Size(), p3, t3.Size()) + => @mem.Block (Load <t1> (OffPtr <op.Type> [o1] p4) mem) +(Load <t1> op:(OffPtr [o1] p1) + (Store {t2} p2 _ + (Store {t3} p3 _ + (Store {t4} p4 _ + mem:(Zero [n] p5 _))))) + && o1 >= 0 && o1+t1.Size() <= n && isSamePtr(p1, p5) + && fe.CanSSA(t1) + && disjoint(op, t1.Size(), p2, t2.Size()) + && disjoint(op, t1.Size(), p3, t3.Size()) + && disjoint(op, t1.Size(), p4, t4.Size()) + => @mem.Block (Load <t1> (OffPtr <op.Type> [o1] p5) mem) +(Load <t1> op:(OffPtr [o1] p1) + (Store {t2} p2 _ + (Store {t3} p3 _ + (Store {t4} p4 _ + (Store {t5} p5 _ + mem:(Zero [n] p6 _)))))) + && o1 >= 0 && o1+t1.Size() <= n && isSamePtr(p1, p6) + && fe.CanSSA(t1) + && disjoint(op, t1.Size(), p2, t2.Size()) + && disjoint(op, t1.Size(), p3, t3.Size()) + && disjoint(op, t1.Size(), p4, t4.Size()) + && disjoint(op, t1.Size(), p5, t5.Size()) + => @mem.Block (Load <t1> (OffPtr <op.Type> [o1] p6) mem) + +// Zero to Load forwarding. +(Load <t1> (OffPtr [o] p1) (Zero [n] p2 _)) + && t1.IsBoolean() + && isSamePtr(p1, p2) + && n >= o + 1 + => (ConstBool [false]) +(Load <t1> (OffPtr [o] p1) (Zero [n] p2 _)) + && is8BitInt(t1) + && isSamePtr(p1, p2) + && n >= o + 1 + => (Const8 [0]) +(Load <t1> (OffPtr [o] p1) (Zero [n] p2 _)) + && is16BitInt(t1) + && isSamePtr(p1, p2) + && n >= o + 2 + => (Const16 [0]) +(Load <t1> (OffPtr [o] p1) (Zero [n] p2 _)) + && is32BitInt(t1) + && isSamePtr(p1, p2) + && n >= o + 4 + => (Const32 [0]) +(Load <t1> (OffPtr [o] p1) (Zero [n] p2 _)) + && is64BitInt(t1) + && isSamePtr(p1, p2) + && n >= o + 8 + => (Const64 [0]) +(Load <t1> (OffPtr [o] p1) (Zero [n] p2 _)) + && is32BitFloat(t1) + && isSamePtr(p1, p2) + && n >= o + 4 + => (Const32F [0]) +(Load <t1> (OffPtr [o] p1) (Zero [n] p2 _)) + && is64BitFloat(t1) + && isSamePtr(p1, p2) + && n >= o + 8 + => (Const64F [0]) + +// Eliminate stores of values that have just been loaded from the same location. +// We also handle the common case where there are some intermediate stores. +(Store {t1} p1 (Load <t2> p2 mem) mem) + && isSamePtr(p1, p2) + && t2.Size() == t1.Size() + => mem +(Store {t1} p1 (Load <t2> p2 oldmem) mem:(Store {t3} p3 _ oldmem)) + && isSamePtr(p1, p2) + && t2.Size() == t1.Size() + && disjoint(p1, t1.Size(), p3, t3.Size()) + => mem +(Store {t1} p1 (Load <t2> p2 oldmem) mem:(Store {t3} p3 _ (Store {t4} p4 _ oldmem))) + && isSamePtr(p1, p2) + && t2.Size() == t1.Size() + && disjoint(p1, t1.Size(), p3, t3.Size()) + && disjoint(p1, t1.Size(), p4, t4.Size()) + => mem +(Store {t1} p1 (Load <t2> p2 oldmem) mem:(Store {t3} p3 _ (Store {t4} p4 _ (Store {t5} p5 _ oldmem)))) + && isSamePtr(p1, p2) + && t2.Size() == t1.Size() + && disjoint(p1, t1.Size(), p3, t3.Size()) + && disjoint(p1, t1.Size(), p4, t4.Size()) + && disjoint(p1, t1.Size(), p5, t5.Size()) + => mem + +// Don't Store zeros to cleared variables. +(Store {t} (OffPtr [o] p1) x mem:(Zero [n] p2 _)) + && isConstZero(x) + && o >= 0 && t.Size() + o <= n && isSamePtr(p1, p2) + => mem +(Store {t1} op:(OffPtr [o1] p1) x mem:(Store {t2} p2 _ (Zero [n] p3 _))) + && isConstZero(x) + && o1 >= 0 && t1.Size() + o1 <= n && isSamePtr(p1, p3) + && disjoint(op, t1.Size(), p2, t2.Size()) + => mem +(Store {t1} op:(OffPtr [o1] p1) x mem:(Store {t2} p2 _ (Store {t3} p3 _ (Zero [n] p4 _)))) + && isConstZero(x) + && o1 >= 0 && t1.Size() + o1 <= n && isSamePtr(p1, p4) + && disjoint(op, t1.Size(), p2, t2.Size()) + && disjoint(op, t1.Size(), p3, t3.Size()) + => mem +(Store {t1} op:(OffPtr [o1] p1) x mem:(Store {t2} p2 _ (Store {t3} p3 _ (Store {t4} p4 _ (Zero [n] p5 _))))) + && isConstZero(x) + && o1 >= 0 && t1.Size() + o1 <= n && isSamePtr(p1, p5) + && disjoint(op, t1.Size(), p2, t2.Size()) + && disjoint(op, t1.Size(), p3, t3.Size()) + && disjoint(op, t1.Size(), p4, t4.Size()) + => mem + +// Collapse OffPtr +(OffPtr (OffPtr p [y]) [x]) => (OffPtr p [x+y]) +(OffPtr p [0]) && v.Type.Compare(p.Type) == types.CMPeq => p + +// indexing operations +// Note: bounds check has already been done +(PtrIndex <t> ptr idx) && config.PtrSize == 4 && is32Bit(t.Elem().Size()) => (AddPtr ptr (Mul32 <typ.Int> idx (Const32 <typ.Int> [int32(t.Elem().Size())]))) +(PtrIndex <t> ptr idx) && config.PtrSize == 8 => (AddPtr ptr (Mul64 <typ.Int> idx (Const64 <typ.Int> [t.Elem().Size()]))) + +// struct operations +(StructSelect (StructMake1 x)) => x +(StructSelect [0] (StructMake2 x _)) => x +(StructSelect [1] (StructMake2 _ x)) => x +(StructSelect [0] (StructMake3 x _ _)) => x +(StructSelect [1] (StructMake3 _ x _)) => x +(StructSelect [2] (StructMake3 _ _ x)) => x +(StructSelect [0] (StructMake4 x _ _ _)) => x +(StructSelect [1] (StructMake4 _ x _ _)) => x +(StructSelect [2] (StructMake4 _ _ x _)) => x +(StructSelect [3] (StructMake4 _ _ _ x)) => x + +(Load <t> _ _) && t.IsStruct() && t.NumFields() == 0 && fe.CanSSA(t) => + (StructMake0) +(Load <t> ptr mem) && t.IsStruct() && t.NumFields() == 1 && fe.CanSSA(t) => + (StructMake1 + (Load <t.FieldType(0)> (OffPtr <t.FieldType(0).PtrTo()> [0] ptr) mem)) +(Load <t> ptr mem) && t.IsStruct() && t.NumFields() == 2 && fe.CanSSA(t) => + (StructMake2 + (Load <t.FieldType(0)> (OffPtr <t.FieldType(0).PtrTo()> [0] ptr) mem) + (Load <t.FieldType(1)> (OffPtr <t.FieldType(1).PtrTo()> [t.FieldOff(1)] ptr) mem)) +(Load <t> ptr mem) && t.IsStruct() && t.NumFields() == 3 && fe.CanSSA(t) => + (StructMake3 + (Load <t.FieldType(0)> (OffPtr <t.FieldType(0).PtrTo()> [0] ptr) mem) + (Load <t.FieldType(1)> (OffPtr <t.FieldType(1).PtrTo()> [t.FieldOff(1)] ptr) mem) + (Load <t.FieldType(2)> (OffPtr <t.FieldType(2).PtrTo()> [t.FieldOff(2)] ptr) mem)) +(Load <t> ptr mem) && t.IsStruct() && t.NumFields() == 4 && fe.CanSSA(t) => + (StructMake4 + (Load <t.FieldType(0)> (OffPtr <t.FieldType(0).PtrTo()> [0] ptr) mem) + (Load <t.FieldType(1)> (OffPtr <t.FieldType(1).PtrTo()> [t.FieldOff(1)] ptr) mem) + (Load <t.FieldType(2)> (OffPtr <t.FieldType(2).PtrTo()> [t.FieldOff(2)] ptr) mem) + (Load <t.FieldType(3)> (OffPtr <t.FieldType(3).PtrTo()> [t.FieldOff(3)] ptr) mem)) + +(StructSelect [i] x:(Load <t> ptr mem)) && !fe.CanSSA(t) => + @x.Block (Load <v.Type> (OffPtr <v.Type.PtrTo()> [t.FieldOff(int(i))] ptr) mem) + +(Store _ (StructMake0) mem) => mem +(Store dst (StructMake1 <t> f0) mem) => + (Store {t.FieldType(0)} (OffPtr <t.FieldType(0).PtrTo()> [0] dst) f0 mem) +(Store dst (StructMake2 <t> f0 f1) mem) => + (Store {t.FieldType(1)} + (OffPtr <t.FieldType(1).PtrTo()> [t.FieldOff(1)] dst) + f1 + (Store {t.FieldType(0)} + (OffPtr <t.FieldType(0).PtrTo()> [0] dst) + f0 mem)) +(Store dst (StructMake3 <t> f0 f1 f2) mem) => + (Store {t.FieldType(2)} + (OffPtr <t.FieldType(2).PtrTo()> [t.FieldOff(2)] dst) + f2 + (Store {t.FieldType(1)} + (OffPtr <t.FieldType(1).PtrTo()> [t.FieldOff(1)] dst) + f1 + (Store {t.FieldType(0)} + (OffPtr <t.FieldType(0).PtrTo()> [0] dst) + f0 mem))) +(Store dst (StructMake4 <t> f0 f1 f2 f3) mem) => + (Store {t.FieldType(3)} + (OffPtr <t.FieldType(3).PtrTo()> [t.FieldOff(3)] dst) + f3 + (Store {t.FieldType(2)} + (OffPtr <t.FieldType(2).PtrTo()> [t.FieldOff(2)] dst) + f2 + (Store {t.FieldType(1)} + (OffPtr <t.FieldType(1).PtrTo()> [t.FieldOff(1)] dst) + f1 + (Store {t.FieldType(0)} + (OffPtr <t.FieldType(0).PtrTo()> [0] dst) + f0 mem)))) + +// Putting struct{*byte} and similar into direct interfaces. +(IMake _typ (StructMake1 val)) => (IMake _typ val) +(StructSelect [0] (IData x)) => (IData x) + +// un-SSAable values use mem->mem copies +(Store {t} dst (Load src mem) mem) && !fe.CanSSA(t) => + (Move {t} [t.Size()] dst src mem) +(Store {t} dst (Load src mem) (VarDef {x} mem)) && !fe.CanSSA(t) => + (Move {t} [t.Size()] dst src (VarDef {x} mem)) + +// array ops +(ArraySelect (ArrayMake1 x)) => x + +(Load <t> _ _) && t.IsArray() && t.NumElem() == 0 => + (ArrayMake0) + +(Load <t> ptr mem) && t.IsArray() && t.NumElem() == 1 && fe.CanSSA(t) => + (ArrayMake1 (Load <t.Elem()> ptr mem)) + +(Store _ (ArrayMake0) mem) => mem +(Store dst (ArrayMake1 e) mem) => (Store {e.Type} dst e mem) + +// Putting [1]*byte and similar into direct interfaces. +(IMake _typ (ArrayMake1 val)) => (IMake _typ val) +(ArraySelect [0] (IData x)) => (IData x) + +// string ops +// Decomposing StringMake and lowering of StringPtr and StringLen +// happens in a later pass, dec, so that these operations are available +// to other passes for optimizations. +(StringPtr (StringMake (Addr <t> {s} base) _)) => (Addr <t> {s} base) +(StringLen (StringMake _ (Const64 <t> [c]))) => (Const64 <t> [c]) +(ConstString {str}) && config.PtrSize == 4 && str == "" => + (StringMake (ConstNil) (Const32 <typ.Int> [0])) +(ConstString {str}) && config.PtrSize == 8 && str == "" => + (StringMake (ConstNil) (Const64 <typ.Int> [0])) +(ConstString {str}) && config.PtrSize == 4 && str != "" => + (StringMake + (Addr <typ.BytePtr> {fe.StringData(str)} + (SB)) + (Const32 <typ.Int> [int32(len(str))])) +(ConstString {str}) && config.PtrSize == 8 && str != "" => + (StringMake + (Addr <typ.BytePtr> {fe.StringData(str)} + (SB)) + (Const64 <typ.Int> [int64(len(str))])) + +// slice ops +// Only a few slice rules are provided here. See dec.rules for +// a more comprehensive set. +(SliceLen (SliceMake _ (Const64 <t> [c]) _)) => (Const64 <t> [c]) +(SliceCap (SliceMake _ _ (Const64 <t> [c]))) => (Const64 <t> [c]) +(SliceLen (SliceMake _ (Const32 <t> [c]) _)) => (Const32 <t> [c]) +(SliceCap (SliceMake _ _ (Const32 <t> [c]))) => (Const32 <t> [c]) +(SlicePtr (SliceMake (SlicePtr x) _ _)) => (SlicePtr x) +(SliceLen (SliceMake _ (SliceLen x) _)) => (SliceLen x) +(SliceCap (SliceMake _ _ (SliceCap x))) => (SliceCap x) +(SliceCap (SliceMake _ _ (SliceLen x))) => (SliceLen x) +(ConstSlice) && config.PtrSize == 4 => + (SliceMake + (ConstNil <v.Type.Elem().PtrTo()>) + (Const32 <typ.Int> [0]) + (Const32 <typ.Int> [0])) +(ConstSlice) && config.PtrSize == 8 => + (SliceMake + (ConstNil <v.Type.Elem().PtrTo()>) + (Const64 <typ.Int> [0]) + (Const64 <typ.Int> [0])) + +// interface ops +(ConstInterface) => + (IMake + (ConstNil <typ.Uintptr>) + (ConstNil <typ.BytePtr>)) + +(NilCheck (GetG mem) mem) => mem + +(If (Not cond) yes no) => (If cond no yes) +(If (ConstBool [c]) yes no) && c => (First yes no) +(If (ConstBool [c]) yes no) && !c => (First no yes) + +(Phi <t> nx:(Not x) ny:(Not y)) && nx.Uses == 1 && ny.Uses == 1 => (Not (Phi <t> x y)) + +// Get rid of Convert ops for pointer arithmetic on unsafe.Pointer. +(Convert (Add(64|32) (Convert ptr mem) off) mem) => (AddPtr ptr off) +(Convert (Convert ptr mem) mem) => ptr + +// strength reduction of divide by a constant. +// See ../magic.go for a detailed description of these algorithms. + +// Unsigned divide by power of 2. Strength reduce to a shift. +(Div8u n (Const8 [c])) && isPowerOfTwo8(c) => (Rsh8Ux64 n (Const64 <typ.UInt64> [log8(c)])) +(Div16u n (Const16 [c])) && isPowerOfTwo16(c) => (Rsh16Ux64 n (Const64 <typ.UInt64> [log16(c)])) +(Div32u n (Const32 [c])) && isPowerOfTwo32(c) => (Rsh32Ux64 n (Const64 <typ.UInt64> [log32(c)])) +(Div64u n (Const64 [c])) && isPowerOfTwo64(c) => (Rsh64Ux64 n (Const64 <typ.UInt64> [log64(c)])) +(Div64u n (Const64 [-1<<63])) => (Rsh64Ux64 n (Const64 <typ.UInt64> [63])) + +// Signed non-negative divide by power of 2. +(Div8 n (Const8 [c])) && isNonNegative(n) && isPowerOfTwo8(c) => (Rsh8Ux64 n (Const64 <typ.UInt64> [log8(c)])) +(Div16 n (Const16 [c])) && isNonNegative(n) && isPowerOfTwo16(c) => (Rsh16Ux64 n (Const64 <typ.UInt64> [log16(c)])) +(Div32 n (Const32 [c])) && isNonNegative(n) && isPowerOfTwo32(c) => (Rsh32Ux64 n (Const64 <typ.UInt64> [log32(c)])) +(Div64 n (Const64 [c])) && isNonNegative(n) && isPowerOfTwo64(c) => (Rsh64Ux64 n (Const64 <typ.UInt64> [log64(c)])) +(Div64 n (Const64 [-1<<63])) && isNonNegative(n) => (Const64 [0]) + +// Unsigned divide, not a power of 2. Strength reduce to a multiply. +// For 8-bit divides, we just do a direct 9-bit by 8-bit multiply. +(Div8u x (Const8 [c])) && umagicOK8(c) => + (Trunc32to8 + (Rsh32Ux64 <typ.UInt32> + (Mul32 <typ.UInt32> + (Const32 <typ.UInt32> [int32(1<<8+umagic8(c).m)]) + (ZeroExt8to32 x)) + (Const64 <typ.UInt64> [8+umagic8(c).s]))) + +// For 16-bit divides on 64-bit machines, we do a direct 17-bit by 16-bit multiply. +(Div16u x (Const16 [c])) && umagicOK16(c) && config.RegSize == 8 => + (Trunc64to16 + (Rsh64Ux64 <typ.UInt64> + (Mul64 <typ.UInt64> + (Const64 <typ.UInt64> [int64(1<<16+umagic16(c).m)]) + (ZeroExt16to64 x)) + (Const64 <typ.UInt64> [16+umagic16(c).s]))) + +// For 16-bit divides on 32-bit machines +(Div16u x (Const16 [c])) && umagicOK16(c) && config.RegSize == 4 && umagic16(c).m&1 == 0 => + (Trunc32to16 + (Rsh32Ux64 <typ.UInt32> + (Mul32 <typ.UInt32> + (Const32 <typ.UInt32> [int32(1<<15+umagic16(c).m/2)]) + (ZeroExt16to32 x)) + (Const64 <typ.UInt64> [16+umagic16(c).s-1]))) +(Div16u x (Const16 [c])) && umagicOK16(c) && config.RegSize == 4 && c&1 == 0 => + (Trunc32to16 + (Rsh32Ux64 <typ.UInt32> + (Mul32 <typ.UInt32> + (Const32 <typ.UInt32> [int32(1<<15+(umagic16(c).m+1)/2)]) + (Rsh32Ux64 <typ.UInt32> (ZeroExt16to32 x) (Const64 <typ.UInt64> [1]))) + (Const64 <typ.UInt64> [16+umagic16(c).s-2]))) +(Div16u x (Const16 [c])) && umagicOK16(c) && config.RegSize == 4 && config.useAvg => + (Trunc32to16 + (Rsh32Ux64 <typ.UInt32> + (Avg32u + (Lsh32x64 <typ.UInt32> (ZeroExt16to32 x) (Const64 <typ.UInt64> [16])) + (Mul32 <typ.UInt32> + (Const32 <typ.UInt32> [int32(umagic16(c).m)]) + (ZeroExt16to32 x))) + (Const64 <typ.UInt64> [16+umagic16(c).s-1]))) + +// For 32-bit divides on 32-bit machines +(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 4 && umagic32(c).m&1 == 0 && config.useHmul => + (Rsh32Ux64 <typ.UInt32> + (Hmul32u <typ.UInt32> + (Const32 <typ.UInt32> [int32(1<<31+umagic32(c).m/2)]) + x) + (Const64 <typ.UInt64> [umagic32(c).s-1])) +(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 4 && c&1 == 0 && config.useHmul => + (Rsh32Ux64 <typ.UInt32> + (Hmul32u <typ.UInt32> + (Const32 <typ.UInt32> [int32(1<<31+(umagic32(c).m+1)/2)]) + (Rsh32Ux64 <typ.UInt32> x (Const64 <typ.UInt64> [1]))) + (Const64 <typ.UInt64> [umagic32(c).s-2])) +(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 4 && config.useAvg && config.useHmul => + (Rsh32Ux64 <typ.UInt32> + (Avg32u + x + (Hmul32u <typ.UInt32> + (Const32 <typ.UInt32> [int32(umagic32(c).m)]) + x)) + (Const64 <typ.UInt64> [umagic32(c).s-1])) + +// For 32-bit divides on 64-bit machines +// We'll use a regular (non-hi) multiply for this case. +(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 8 && umagic32(c).m&1 == 0 => + (Trunc64to32 + (Rsh64Ux64 <typ.UInt64> + (Mul64 <typ.UInt64> + (Const64 <typ.UInt64> [int64(1<<31+umagic32(c).m/2)]) + (ZeroExt32to64 x)) + (Const64 <typ.UInt64> [32+umagic32(c).s-1]))) +(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 8 && c&1 == 0 => + (Trunc64to32 + (Rsh64Ux64 <typ.UInt64> + (Mul64 <typ.UInt64> + (Const64 <typ.UInt64> [int64(1<<31+(umagic32(c).m+1)/2)]) + (Rsh64Ux64 <typ.UInt64> (ZeroExt32to64 x) (Const64 <typ.UInt64> [1]))) + (Const64 <typ.UInt64> [32+umagic32(c).s-2]))) +(Div32u x (Const32 [c])) && umagicOK32(c) && config.RegSize == 8 && config.useAvg => + (Trunc64to32 + (Rsh64Ux64 <typ.UInt64> + (Avg64u + (Lsh64x64 <typ.UInt64> (ZeroExt32to64 x) (Const64 <typ.UInt64> [32])) + (Mul64 <typ.UInt64> + (Const64 <typ.UInt32> [int64(umagic32(c).m)]) + (ZeroExt32to64 x))) + (Const64 <typ.UInt64> [32+umagic32(c).s-1]))) + +// For unsigned 64-bit divides on 32-bit machines, +// if the constant fits in 16 bits (so that the last term +// fits in 32 bits), convert to three 32-bit divides by a constant. +// +// If 1<<32 = Q * c + R +// and x = hi << 32 + lo +// +// Then x = (hi/c*c + hi%c) << 32 + lo +// = hi/c*c<<32 + hi%c<<32 + lo +// = hi/c*c<<32 + (hi%c)*(Q*c+R) + lo/c*c + lo%c +// = hi/c*c<<32 + (hi%c)*Q*c + lo/c*c + (hi%c*R+lo%c) +// and x / c = (hi/c)<<32 + (hi%c)*Q + lo/c + (hi%c*R+lo%c)/c +(Div64u x (Const64 [c])) && c > 0 && c <= 0xFFFF && umagicOK32(int32(c)) && config.RegSize == 4 && config.useHmul => + (Add64 + (Add64 <typ.UInt64> + (Add64 <typ.UInt64> + (Lsh64x64 <typ.UInt64> + (ZeroExt32to64 + (Div32u <typ.UInt32> + (Trunc64to32 <typ.UInt32> (Rsh64Ux64 <typ.UInt64> x (Const64 <typ.UInt64> [32]))) + (Const32 <typ.UInt32> [int32(c)]))) + (Const64 <typ.UInt64> [32])) + (ZeroExt32to64 (Div32u <typ.UInt32> (Trunc64to32 <typ.UInt32> x) (Const32 <typ.UInt32> [int32(c)])))) + (Mul64 <typ.UInt64> + (ZeroExt32to64 <typ.UInt64> + (Mod32u <typ.UInt32> + (Trunc64to32 <typ.UInt32> (Rsh64Ux64 <typ.UInt64> x (Const64 <typ.UInt64> [32]))) + (Const32 <typ.UInt32> [int32(c)]))) + (Const64 <typ.UInt64> [int64((1<<32)/c)]))) + (ZeroExt32to64 + (Div32u <typ.UInt32> + (Add32 <typ.UInt32> + (Mod32u <typ.UInt32> (Trunc64to32 <typ.UInt32> x) (Const32 <typ.UInt32> [int32(c)])) + (Mul32 <typ.UInt32> + (Mod32u <typ.UInt32> + (Trunc64to32 <typ.UInt32> (Rsh64Ux64 <typ.UInt64> x (Const64 <typ.UInt64> [32]))) + (Const32 <typ.UInt32> [int32(c)])) + (Const32 <typ.UInt32> [int32((1<<32)%c)]))) + (Const32 <typ.UInt32> [int32(c)])))) + +// For 64-bit divides on 64-bit machines +// (64-bit divides on 32-bit machines are lowered to a runtime call by the walk pass.) +(Div64u x (Const64 [c])) && umagicOK64(c) && config.RegSize == 8 && umagic64(c).m&1 == 0 && config.useHmul => + (Rsh64Ux64 <typ.UInt64> + (Hmul64u <typ.UInt64> + (Const64 <typ.UInt64> [int64(1<<63+umagic64(c).m/2)]) + x) + (Const64 <typ.UInt64> [umagic64(c).s-1])) +(Div64u x (Const64 [c])) && umagicOK64(c) && config.RegSize == 8 && c&1 == 0 && config.useHmul => + (Rsh64Ux64 <typ.UInt64> + (Hmul64u <typ.UInt64> + (Const64 <typ.UInt64> [int64(1<<63+(umagic64(c).m+1)/2)]) + (Rsh64Ux64 <typ.UInt64> x (Const64 <typ.UInt64> [1]))) + (Const64 <typ.UInt64> [umagic64(c).s-2])) +(Div64u x (Const64 [c])) && umagicOK64(c) && config.RegSize == 8 && config.useAvg && config.useHmul => + (Rsh64Ux64 <typ.UInt64> + (Avg64u + x + (Hmul64u <typ.UInt64> + (Const64 <typ.UInt64> [int64(umagic64(c).m)]) + x)) + (Const64 <typ.UInt64> [umagic64(c).s-1])) + +// Signed divide by a negative constant. Rewrite to divide by a positive constant. +(Div8 <t> n (Const8 [c])) && c < 0 && c != -1<<7 => (Neg8 (Div8 <t> n (Const8 <t> [-c]))) +(Div16 <t> n (Const16 [c])) && c < 0 && c != -1<<15 => (Neg16 (Div16 <t> n (Const16 <t> [-c]))) +(Div32 <t> n (Const32 [c])) && c < 0 && c != -1<<31 => (Neg32 (Div32 <t> n (Const32 <t> [-c]))) +(Div64 <t> n (Const64 [c])) && c < 0 && c != -1<<63 => (Neg64 (Div64 <t> n (Const64 <t> [-c]))) + +// Dividing by the most-negative number. Result is always 0 except +// if the input is also the most-negative number. +// We can detect that using the sign bit of x & -x. +(Div8 <t> x (Const8 [-1<<7 ])) => (Rsh8Ux64 (And8 <t> x (Neg8 <t> x)) (Const64 <typ.UInt64> [7 ])) +(Div16 <t> x (Const16 [-1<<15])) => (Rsh16Ux64 (And16 <t> x (Neg16 <t> x)) (Const64 <typ.UInt64> [15])) +(Div32 <t> x (Const32 [-1<<31])) => (Rsh32Ux64 (And32 <t> x (Neg32 <t> x)) (Const64 <typ.UInt64> [31])) +(Div64 <t> x (Const64 [-1<<63])) => (Rsh64Ux64 (And64 <t> x (Neg64 <t> x)) (Const64 <typ.UInt64> [63])) + +// Signed divide by power of 2. +// n / c = n >> log(c) if n >= 0 +// = (n+c-1) >> log(c) if n < 0 +// We conditionally add c-1 by adding n>>63>>(64-log(c)) (first shift signed, second shift unsigned). +(Div8 <t> n (Const8 [c])) && isPowerOfTwo8(c) => + (Rsh8x64 + (Add8 <t> n (Rsh8Ux64 <t> (Rsh8x64 <t> n (Const64 <typ.UInt64> [ 7])) (Const64 <typ.UInt64> [int64( 8-log8(c))]))) + (Const64 <typ.UInt64> [int64(log8(c))])) +(Div16 <t> n (Const16 [c])) && isPowerOfTwo16(c) => + (Rsh16x64 + (Add16 <t> n (Rsh16Ux64 <t> (Rsh16x64 <t> n (Const64 <typ.UInt64> [15])) (Const64 <typ.UInt64> [int64(16-log16(c))]))) + (Const64 <typ.UInt64> [int64(log16(c))])) +(Div32 <t> n (Const32 [c])) && isPowerOfTwo32(c) => + (Rsh32x64 + (Add32 <t> n (Rsh32Ux64 <t> (Rsh32x64 <t> n (Const64 <typ.UInt64> [31])) (Const64 <typ.UInt64> [int64(32-log32(c))]))) + (Const64 <typ.UInt64> [int64(log32(c))])) +(Div64 <t> n (Const64 [c])) && isPowerOfTwo64(c) => + (Rsh64x64 + (Add64 <t> n (Rsh64Ux64 <t> (Rsh64x64 <t> n (Const64 <typ.UInt64> [63])) (Const64 <typ.UInt64> [int64(64-log64(c))]))) + (Const64 <typ.UInt64> [int64(log64(c))])) + +// Signed divide, not a power of 2. Strength reduce to a multiply. +(Div8 <t> x (Const8 [c])) && smagicOK8(c) => + (Sub8 <t> + (Rsh32x64 <t> + (Mul32 <typ.UInt32> + (Const32 <typ.UInt32> [int32(smagic8(c).m)]) + (SignExt8to32 x)) + (Const64 <typ.UInt64> [8+smagic8(c).s])) + (Rsh32x64 <t> + (SignExt8to32 x) + (Const64 <typ.UInt64> [31]))) +(Div16 <t> x (Const16 [c])) && smagicOK16(c) => + (Sub16 <t> + (Rsh32x64 <t> + (Mul32 <typ.UInt32> + (Const32 <typ.UInt32> [int32(smagic16(c).m)]) + (SignExt16to32 x)) + (Const64 <typ.UInt64> [16+smagic16(c).s])) + (Rsh32x64 <t> + (SignExt16to32 x) + (Const64 <typ.UInt64> [31]))) +(Div32 <t> x (Const32 [c])) && smagicOK32(c) && config.RegSize == 8 => + (Sub32 <t> + (Rsh64x64 <t> + (Mul64 <typ.UInt64> + (Const64 <typ.UInt64> [int64(smagic32(c).m)]) + (SignExt32to64 x)) + (Const64 <typ.UInt64> [32+smagic32(c).s])) + (Rsh64x64 <t> + (SignExt32to64 x) + (Const64 <typ.UInt64> [63]))) +(Div32 <t> x (Const32 [c])) && smagicOK32(c) && config.RegSize == 4 && smagic32(c).m&1 == 0 && config.useHmul => + (Sub32 <t> + (Rsh32x64 <t> + (Hmul32 <t> + (Const32 <typ.UInt32> [int32(smagic32(c).m/2)]) + x) + (Const64 <typ.UInt64> [smagic32(c).s-1])) + (Rsh32x64 <t> + x + (Const64 <typ.UInt64> [31]))) +(Div32 <t> x (Const32 [c])) && smagicOK32(c) && config.RegSize == 4 && smagic32(c).m&1 != 0 && config.useHmul => + (Sub32 <t> + (Rsh32x64 <t> + (Add32 <t> + (Hmul32 <t> + (Const32 <typ.UInt32> [int32(smagic32(c).m)]) + x) + x) + (Const64 <typ.UInt64> [smagic32(c).s])) + (Rsh32x64 <t> + x + (Const64 <typ.UInt64> [31]))) +(Div64 <t> x (Const64 [c])) && smagicOK64(c) && smagic64(c).m&1 == 0 && config.useHmul => + (Sub64 <t> + (Rsh64x64 <t> + (Hmul64 <t> + (Const64 <typ.UInt64> [int64(smagic64(c).m/2)]) + x) + (Const64 <typ.UInt64> [smagic64(c).s-1])) + (Rsh64x64 <t> + x + (Const64 <typ.UInt64> [63]))) +(Div64 <t> x (Const64 [c])) && smagicOK64(c) && smagic64(c).m&1 != 0 && config.useHmul => + (Sub64 <t> + (Rsh64x64 <t> + (Add64 <t> + (Hmul64 <t> + (Const64 <typ.UInt64> [int64(smagic64(c).m)]) + x) + x) + (Const64 <typ.UInt64> [smagic64(c).s])) + (Rsh64x64 <t> + x + (Const64 <typ.UInt64> [63]))) + +// Unsigned mod by power of 2 constant. +(Mod8u <t> n (Const8 [c])) && isPowerOfTwo8(c) => (And8 n (Const8 <t> [c-1])) +(Mod16u <t> n (Const16 [c])) && isPowerOfTwo16(c) => (And16 n (Const16 <t> [c-1])) +(Mod32u <t> n (Const32 [c])) && isPowerOfTwo32(c) => (And32 n (Const32 <t> [c-1])) +(Mod64u <t> n (Const64 [c])) && isPowerOfTwo64(c) => (And64 n (Const64 <t> [c-1])) +(Mod64u <t> n (Const64 [-1<<63])) => (And64 n (Const64 <t> [1<<63-1])) + +// Signed non-negative mod by power of 2 constant. +(Mod8 <t> n (Const8 [c])) && isNonNegative(n) && isPowerOfTwo8(c) => (And8 n (Const8 <t> [c-1])) +(Mod16 <t> n (Const16 [c])) && isNonNegative(n) && isPowerOfTwo16(c) => (And16 n (Const16 <t> [c-1])) +(Mod32 <t> n (Const32 [c])) && isNonNegative(n) && isPowerOfTwo32(c) => (And32 n (Const32 <t> [c-1])) +(Mod64 <t> n (Const64 [c])) && isNonNegative(n) && isPowerOfTwo64(c) => (And64 n (Const64 <t> [c-1])) +(Mod64 n (Const64 [-1<<63])) && isNonNegative(n) => n + +// Signed mod by negative constant. +(Mod8 <t> n (Const8 [c])) && c < 0 && c != -1<<7 => (Mod8 <t> n (Const8 <t> [-c])) +(Mod16 <t> n (Const16 [c])) && c < 0 && c != -1<<15 => (Mod16 <t> n (Const16 <t> [-c])) +(Mod32 <t> n (Const32 [c])) && c < 0 && c != -1<<31 => (Mod32 <t> n (Const32 <t> [-c])) +(Mod64 <t> n (Const64 [c])) && c < 0 && c != -1<<63 => (Mod64 <t> n (Const64 <t> [-c])) + +// All other mods by constants, do A%B = A-(A/B*B). +// This implements % with two * and a bunch of ancillary ops. +// One of the * is free if the user's code also computes A/B. +(Mod8 <t> x (Const8 [c])) && x.Op != OpConst8 && (c > 0 || c == -1<<7) + => (Sub8 x (Mul8 <t> (Div8 <t> x (Const8 <t> [c])) (Const8 <t> [c]))) +(Mod16 <t> x (Const16 [c])) && x.Op != OpConst16 && (c > 0 || c == -1<<15) + => (Sub16 x (Mul16 <t> (Div16 <t> x (Const16 <t> [c])) (Const16 <t> [c]))) +(Mod32 <t> x (Const32 [c])) && x.Op != OpConst32 && (c > 0 || c == -1<<31) + => (Sub32 x (Mul32 <t> (Div32 <t> x (Const32 <t> [c])) (Const32 <t> [c]))) +(Mod64 <t> x (Const64 [c])) && x.Op != OpConst64 && (c > 0 || c == -1<<63) + => (Sub64 x (Mul64 <t> (Div64 <t> x (Const64 <t> [c])) (Const64 <t> [c]))) +(Mod8u <t> x (Const8 [c])) && x.Op != OpConst8 && c > 0 && umagicOK8( c) + => (Sub8 x (Mul8 <t> (Div8u <t> x (Const8 <t> [c])) (Const8 <t> [c]))) +(Mod16u <t> x (Const16 [c])) && x.Op != OpConst16 && c > 0 && umagicOK16(c) + => (Sub16 x (Mul16 <t> (Div16u <t> x (Const16 <t> [c])) (Const16 <t> [c]))) +(Mod32u <t> x (Const32 [c])) && x.Op != OpConst32 && c > 0 && umagicOK32(c) + => (Sub32 x (Mul32 <t> (Div32u <t> x (Const32 <t> [c])) (Const32 <t> [c]))) +(Mod64u <t> x (Const64 [c])) && x.Op != OpConst64 && c > 0 && umagicOK64(c) + => (Sub64 x (Mul64 <t> (Div64u <t> x (Const64 <t> [c])) (Const64 <t> [c]))) + +// For architectures without rotates on less than 32-bits, promote these checks to 32-bit. +(Eq8 (Mod8u x (Const8 [c])) (Const8 [0])) && x.Op != OpConst8 && udivisibleOK8(c) && !hasSmallRotate(config) => + (Eq32 (Mod32u <typ.UInt32> (ZeroExt8to32 <typ.UInt32> x) (Const32 <typ.UInt32> [int32(uint8(c))])) (Const32 <typ.UInt32> [0])) +(Eq16 (Mod16u x (Const16 [c])) (Const16 [0])) && x.Op != OpConst16 && udivisibleOK16(c) && !hasSmallRotate(config) => + (Eq32 (Mod32u <typ.UInt32> (ZeroExt16to32 <typ.UInt32> x) (Const32 <typ.UInt32> [int32(uint16(c))])) (Const32 <typ.UInt32> [0])) +(Eq8 (Mod8 x (Const8 [c])) (Const8 [0])) && x.Op != OpConst8 && sdivisibleOK8(c) && !hasSmallRotate(config) => + (Eq32 (Mod32 <typ.Int32> (SignExt8to32 <typ.Int32> x) (Const32 <typ.Int32> [int32(c)])) (Const32 <typ.Int32> [0])) +(Eq16 (Mod16 x (Const16 [c])) (Const16 [0])) && x.Op != OpConst16 && sdivisibleOK16(c) && !hasSmallRotate(config) => + (Eq32 (Mod32 <typ.Int32> (SignExt16to32 <typ.Int32> x) (Const32 <typ.Int32> [int32(c)])) (Const32 <typ.Int32> [0])) + +// Divisibility checks x%c == 0 convert to multiply and rotate. +// Note, x%c == 0 is rewritten as x == c*(x/c) during the opt pass +// where (x/c) is performed using multiplication with magic constants. +// To rewrite x%c == 0 requires pattern matching the rewritten expression +// and checking that the division by the same constant wasn't already calculated. +// This check is made by counting uses of the magic constant multiplication. +// Note that if there were an intermediate opt pass, this rule could be applied +// directly on the Div op and magic division rewrites could be delayed to late opt. + +// Unsigned divisibility checks convert to multiply and rotate. +(Eq8 x (Mul8 (Const8 [c]) + (Trunc32to8 + (Rsh32Ux64 + mul:(Mul32 + (Const32 [m]) + (ZeroExt8to32 x)) + (Const64 [s]))) + ) +) + && v.Block.Func.pass.name != "opt" && mul.Uses == 1 + && m == int32(1<<8+umagic8(c).m) && s == 8+umagic8(c).s + && x.Op != OpConst8 && udivisibleOK8(c) + => (Leq8U + (RotateLeft8 <typ.UInt8> + (Mul8 <typ.UInt8> + (Const8 <typ.UInt8> [int8(udivisible8(c).m)]) + x) + (Const8 <typ.UInt8> [int8(8-udivisible8(c).k)]) + ) + (Const8 <typ.UInt8> [int8(udivisible8(c).max)]) + ) + +(Eq16 x (Mul16 (Const16 [c]) + (Trunc64to16 + (Rsh64Ux64 + mul:(Mul64 + (Const64 [m]) + (ZeroExt16to64 x)) + (Const64 [s]))) + ) +) + && v.Block.Func.pass.name != "opt" && mul.Uses == 1 + && m == int64(1<<16+umagic16(c).m) && s == 16+umagic16(c).s + && x.Op != OpConst16 && udivisibleOK16(c) + => (Leq16U + (RotateLeft16 <typ.UInt16> + (Mul16 <typ.UInt16> + (Const16 <typ.UInt16> [int16(udivisible16(c).m)]) + x) + (Const16 <typ.UInt16> [int16(16-udivisible16(c).k)]) + ) + (Const16 <typ.UInt16> [int16(udivisible16(c).max)]) + ) + +(Eq16 x (Mul16 (Const16 [c]) + (Trunc32to16 + (Rsh32Ux64 + mul:(Mul32 + (Const32 [m]) + (ZeroExt16to32 x)) + (Const64 [s]))) + ) +) + && v.Block.Func.pass.name != "opt" && mul.Uses == 1 + && m == int32(1<<15+umagic16(c).m/2) && s == 16+umagic16(c).s-1 + && x.Op != OpConst16 && udivisibleOK16(c) + => (Leq16U + (RotateLeft16 <typ.UInt16> + (Mul16 <typ.UInt16> + (Const16 <typ.UInt16> [int16(udivisible16(c).m)]) + x) + (Const16 <typ.UInt16> [int16(16-udivisible16(c).k)]) + ) + (Const16 <typ.UInt16> [int16(udivisible16(c).max)]) + ) + +(Eq16 x (Mul16 (Const16 [c]) + (Trunc32to16 + (Rsh32Ux64 + mul:(Mul32 + (Const32 [m]) + (Rsh32Ux64 (ZeroExt16to32 x) (Const64 [1]))) + (Const64 [s]))) + ) +) + && v.Block.Func.pass.name != "opt" && mul.Uses == 1 + && m == int32(1<<15+(umagic16(c).m+1)/2) && s == 16+umagic16(c).s-2 + && x.Op != OpConst16 && udivisibleOK16(c) + => (Leq16U + (RotateLeft16 <typ.UInt16> + (Mul16 <typ.UInt16> + (Const16 <typ.UInt16> [int16(udivisible16(c).m)]) + x) + (Const16 <typ.UInt16> [int16(16-udivisible16(c).k)]) + ) + (Const16 <typ.UInt16> [int16(udivisible16(c).max)]) + ) + +(Eq16 x (Mul16 (Const16 [c]) + (Trunc32to16 + (Rsh32Ux64 + (Avg32u + (Lsh32x64 (ZeroExt16to32 x) (Const64 [16])) + mul:(Mul32 + (Const32 [m]) + (ZeroExt16to32 x))) + (Const64 [s]))) + ) +) + && v.Block.Func.pass.name != "opt" && mul.Uses == 1 + && m == int32(umagic16(c).m) && s == 16+umagic16(c).s-1 + && x.Op != OpConst16 && udivisibleOK16(c) + => (Leq16U + (RotateLeft16 <typ.UInt16> + (Mul16 <typ.UInt16> + (Const16 <typ.UInt16> [int16(udivisible16(c).m)]) + x) + (Const16 <typ.UInt16> [int16(16-udivisible16(c).k)]) + ) + (Const16 <typ.UInt16> [int16(udivisible16(c).max)]) + ) + +(Eq32 x (Mul32 (Const32 [c]) + (Rsh32Ux64 + mul:(Hmul32u + (Const32 [m]) + x) + (Const64 [s])) + ) +) + && v.Block.Func.pass.name != "opt" && mul.Uses == 1 + && m == int32(1<<31+umagic32(c).m/2) && s == umagic32(c).s-1 + && x.Op != OpConst32 && udivisibleOK32(c) + => (Leq32U + (RotateLeft32 <typ.UInt32> + (Mul32 <typ.UInt32> + (Const32 <typ.UInt32> [int32(udivisible32(c).m)]) + x) + (Const32 <typ.UInt32> [int32(32-udivisible32(c).k)]) + ) + (Const32 <typ.UInt32> [int32(udivisible32(c).max)]) + ) + +(Eq32 x (Mul32 (Const32 [c]) + (Rsh32Ux64 + mul:(Hmul32u + (Const32 <typ.UInt32> [m]) + (Rsh32Ux64 x (Const64 [1]))) + (Const64 [s])) + ) +) + && v.Block.Func.pass.name != "opt" && mul.Uses == 1 + && m == int32(1<<31+(umagic32(c).m+1)/2) && s == umagic32(c).s-2 + && x.Op != OpConst32 && udivisibleOK32(c) + => (Leq32U + (RotateLeft32 <typ.UInt32> + (Mul32 <typ.UInt32> + (Const32 <typ.UInt32> [int32(udivisible32(c).m)]) + x) + (Const32 <typ.UInt32> [int32(32-udivisible32(c).k)]) + ) + (Const32 <typ.UInt32> [int32(udivisible32(c).max)]) + ) + +(Eq32 x (Mul32 (Const32 [c]) + (Rsh32Ux64 + (Avg32u + x + mul:(Hmul32u + (Const32 [m]) + x)) + (Const64 [s])) + ) +) + && v.Block.Func.pass.name != "opt" && mul.Uses == 1 + && m == int32(umagic32(c).m) && s == umagic32(c).s-1 + && x.Op != OpConst32 && udivisibleOK32(c) + => (Leq32U + (RotateLeft32 <typ.UInt32> + (Mul32 <typ.UInt32> + (Const32 <typ.UInt32> [int32(udivisible32(c).m)]) + x) + (Const32 <typ.UInt32> [int32(32-udivisible32(c).k)]) + ) + (Const32 <typ.UInt32> [int32(udivisible32(c).max)]) + ) + +(Eq32 x (Mul32 (Const32 [c]) + (Trunc64to32 + (Rsh64Ux64 + mul:(Mul64 + (Const64 [m]) + (ZeroExt32to64 x)) + (Const64 [s]))) + ) +) + && v.Block.Func.pass.name != "opt" && mul.Uses == 1 + && m == int64(1<<31+umagic32(c).m/2) && s == 32+umagic32(c).s-1 + && x.Op != OpConst32 && udivisibleOK32(c) + => (Leq32U + (RotateLeft32 <typ.UInt32> + (Mul32 <typ.UInt32> + (Const32 <typ.UInt32> [int32(udivisible32(c).m)]) + x) + (Const32 <typ.UInt32> [int32(32-udivisible32(c).k)]) + ) + (Const32 <typ.UInt32> [int32(udivisible32(c).max)]) + ) + +(Eq32 x (Mul32 (Const32 [c]) + (Trunc64to32 + (Rsh64Ux64 + mul:(Mul64 + (Const64 [m]) + (Rsh64Ux64 (ZeroExt32to64 x) (Const64 [1]))) + (Const64 [s]))) + ) +) + && v.Block.Func.pass.name != "opt" && mul.Uses == 1 + && m == int64(1<<31+(umagic32(c).m+1)/2) && s == 32+umagic32(c).s-2 + && x.Op != OpConst32 && udivisibleOK32(c) + => (Leq32U + (RotateLeft32 <typ.UInt32> + (Mul32 <typ.UInt32> + (Const32 <typ.UInt32> [int32(udivisible32(c).m)]) + x) + (Const32 <typ.UInt32> [int32(32-udivisible32(c).k)]) + ) + (Const32 <typ.UInt32> [int32(udivisible32(c).max)]) + ) + +(Eq32 x (Mul32 (Const32 [c]) + (Trunc64to32 + (Rsh64Ux64 + (Avg64u + (Lsh64x64 (ZeroExt32to64 x) (Const64 [32])) + mul:(Mul64 + (Const64 [m]) + (ZeroExt32to64 x))) + (Const64 [s]))) + ) +) + && v.Block.Func.pass.name != "opt" && mul.Uses == 1 + && m == int64(umagic32(c).m) && s == 32+umagic32(c).s-1 + && x.Op != OpConst32 && udivisibleOK32(c) + => (Leq32U + (RotateLeft32 <typ.UInt32> + (Mul32 <typ.UInt32> + (Const32 <typ.UInt32> [int32(udivisible32(c).m)]) + x) + (Const32 <typ.UInt32> [int32(32-udivisible32(c).k)]) + ) + (Const32 <typ.UInt32> [int32(udivisible32(c).max)]) + ) + +(Eq64 x (Mul64 (Const64 [c]) + (Rsh64Ux64 + mul:(Hmul64u + (Const64 [m]) + x) + (Const64 [s])) + ) +) && v.Block.Func.pass.name != "opt" && mul.Uses == 1 + && m == int64(1<<63+umagic64(c).m/2) && s == umagic64(c).s-1 + && x.Op != OpConst64 && udivisibleOK64(c) + => (Leq64U + (RotateLeft64 <typ.UInt64> + (Mul64 <typ.UInt64> + (Const64 <typ.UInt64> [int64(udivisible64(c).m)]) + x) + (Const64 <typ.UInt64> [64-udivisible64(c).k]) + ) + (Const64 <typ.UInt64> [int64(udivisible64(c).max)]) + ) +(Eq64 x (Mul64 (Const64 [c]) + (Rsh64Ux64 + mul:(Hmul64u + (Const64 [m]) + (Rsh64Ux64 x (Const64 [1]))) + (Const64 [s])) + ) +) && v.Block.Func.pass.name != "opt" && mul.Uses == 1 + && m == int64(1<<63+(umagic64(c).m+1)/2) && s == umagic64(c).s-2 + && x.Op != OpConst64 && udivisibleOK64(c) + => (Leq64U + (RotateLeft64 <typ.UInt64> + (Mul64 <typ.UInt64> + (Const64 <typ.UInt64> [int64(udivisible64(c).m)]) + x) + (Const64 <typ.UInt64> [64-udivisible64(c).k]) + ) + (Const64 <typ.UInt64> [int64(udivisible64(c).max)]) + ) +(Eq64 x (Mul64 (Const64 [c]) + (Rsh64Ux64 + (Avg64u + x + mul:(Hmul64u + (Const64 [m]) + x)) + (Const64 [s])) + ) +) && v.Block.Func.pass.name != "opt" && mul.Uses == 1 + && m == int64(umagic64(c).m) && s == umagic64(c).s-1 + && x.Op != OpConst64 && udivisibleOK64(c) + => (Leq64U + (RotateLeft64 <typ.UInt64> + (Mul64 <typ.UInt64> + (Const64 <typ.UInt64> [int64(udivisible64(c).m)]) + x) + (Const64 <typ.UInt64> [64-udivisible64(c).k]) + ) + (Const64 <typ.UInt64> [int64(udivisible64(c).max)]) + ) + +// Signed divisibility checks convert to multiply, add and rotate. +(Eq8 x (Mul8 (Const8 [c]) + (Sub8 + (Rsh32x64 + mul:(Mul32 + (Const32 [m]) + (SignExt8to32 x)) + (Const64 [s])) + (Rsh32x64 + (SignExt8to32 x) + (Const64 [31]))) + ) +) + && v.Block.Func.pass.name != "opt" && mul.Uses == 1 + && m == int32(smagic8(c).m) && s == 8+smagic8(c).s + && x.Op != OpConst8 && sdivisibleOK8(c) + => (Leq8U + (RotateLeft8 <typ.UInt8> + (Add8 <typ.UInt8> + (Mul8 <typ.UInt8> + (Const8 <typ.UInt8> [int8(sdivisible8(c).m)]) + x) + (Const8 <typ.UInt8> [int8(sdivisible8(c).a)]) + ) + (Const8 <typ.UInt8> [int8(8-sdivisible8(c).k)]) + ) + (Const8 <typ.UInt8> [int8(sdivisible8(c).max)]) + ) + +(Eq16 x (Mul16 (Const16 [c]) + (Sub16 + (Rsh32x64 + mul:(Mul32 + (Const32 [m]) + (SignExt16to32 x)) + (Const64 [s])) + (Rsh32x64 + (SignExt16to32 x) + (Const64 [31]))) + ) +) + && v.Block.Func.pass.name != "opt" && mul.Uses == 1 + && m == int32(smagic16(c).m) && s == 16+smagic16(c).s + && x.Op != OpConst16 && sdivisibleOK16(c) + => (Leq16U + (RotateLeft16 <typ.UInt16> + (Add16 <typ.UInt16> + (Mul16 <typ.UInt16> + (Const16 <typ.UInt16> [int16(sdivisible16(c).m)]) + x) + (Const16 <typ.UInt16> [int16(sdivisible16(c).a)]) + ) + (Const16 <typ.UInt16> [int16(16-sdivisible16(c).k)]) + ) + (Const16 <typ.UInt16> [int16(sdivisible16(c).max)]) + ) + +(Eq32 x (Mul32 (Const32 [c]) + (Sub32 + (Rsh64x64 + mul:(Mul64 + (Const64 [m]) + (SignExt32to64 x)) + (Const64 [s])) + (Rsh64x64 + (SignExt32to64 x) + (Const64 [63]))) + ) +) + && v.Block.Func.pass.name != "opt" && mul.Uses == 1 + && m == int64(smagic32(c).m) && s == 32+smagic32(c).s + && x.Op != OpConst32 && sdivisibleOK32(c) + => (Leq32U + (RotateLeft32 <typ.UInt32> + (Add32 <typ.UInt32> + (Mul32 <typ.UInt32> + (Const32 <typ.UInt32> [int32(sdivisible32(c).m)]) + x) + (Const32 <typ.UInt32> [int32(sdivisible32(c).a)]) + ) + (Const32 <typ.UInt32> [int32(32-sdivisible32(c).k)]) + ) + (Const32 <typ.UInt32> [int32(sdivisible32(c).max)]) + ) + +(Eq32 x (Mul32 (Const32 [c]) + (Sub32 + (Rsh32x64 + mul:(Hmul32 + (Const32 [m]) + x) + (Const64 [s])) + (Rsh32x64 + x + (Const64 [31]))) + ) +) + && v.Block.Func.pass.name != "opt" && mul.Uses == 1 + && m == int32(smagic32(c).m/2) && s == smagic32(c).s-1 + && x.Op != OpConst32 && sdivisibleOK32(c) + => (Leq32U + (RotateLeft32 <typ.UInt32> + (Add32 <typ.UInt32> + (Mul32 <typ.UInt32> + (Const32 <typ.UInt32> [int32(sdivisible32(c).m)]) + x) + (Const32 <typ.UInt32> [int32(sdivisible32(c).a)]) + ) + (Const32 <typ.UInt32> [int32(32-sdivisible32(c).k)]) + ) + (Const32 <typ.UInt32> [int32(sdivisible32(c).max)]) + ) + +(Eq32 x (Mul32 (Const32 [c]) + (Sub32 + (Rsh32x64 + (Add32 + mul:(Hmul32 + (Const32 [m]) + x) + x) + (Const64 [s])) + (Rsh32x64 + x + (Const64 [31]))) + ) +) + && v.Block.Func.pass.name != "opt" && mul.Uses == 1 + && m == int32(smagic32(c).m) && s == smagic32(c).s + && x.Op != OpConst32 && sdivisibleOK32(c) + => (Leq32U + (RotateLeft32 <typ.UInt32> + (Add32 <typ.UInt32> + (Mul32 <typ.UInt32> + (Const32 <typ.UInt32> [int32(sdivisible32(c).m)]) + x) + (Const32 <typ.UInt32> [int32(sdivisible32(c).a)]) + ) + (Const32 <typ.UInt32> [int32(32-sdivisible32(c).k)]) + ) + (Const32 <typ.UInt32> [int32(sdivisible32(c).max)]) + ) + +(Eq64 x (Mul64 (Const64 [c]) + (Sub64 + (Rsh64x64 + mul:(Hmul64 + (Const64 [m]) + x) + (Const64 [s])) + (Rsh64x64 + x + (Const64 [63]))) + ) +) + && v.Block.Func.pass.name != "opt" && mul.Uses == 1 + && m == int64(smagic64(c).m/2) && s == smagic64(c).s-1 + && x.Op != OpConst64 && sdivisibleOK64(c) + => (Leq64U + (RotateLeft64 <typ.UInt64> + (Add64 <typ.UInt64> + (Mul64 <typ.UInt64> + (Const64 <typ.UInt64> [int64(sdivisible64(c).m)]) + x) + (Const64 <typ.UInt64> [int64(sdivisible64(c).a)]) + ) + (Const64 <typ.UInt64> [64-sdivisible64(c).k]) + ) + (Const64 <typ.UInt64> [int64(sdivisible64(c).max)]) + ) + +(Eq64 x (Mul64 (Const64 [c]) + (Sub64 + (Rsh64x64 + (Add64 + mul:(Hmul64 + (Const64 [m]) + x) + x) + (Const64 [s])) + (Rsh64x64 + x + (Const64 [63]))) + ) +) + && v.Block.Func.pass.name != "opt" && mul.Uses == 1 + && m == int64(smagic64(c).m) && s == smagic64(c).s + && x.Op != OpConst64 && sdivisibleOK64(c) + => (Leq64U + (RotateLeft64 <typ.UInt64> + (Add64 <typ.UInt64> + (Mul64 <typ.UInt64> + (Const64 <typ.UInt64> [int64(sdivisible64(c).m)]) + x) + (Const64 <typ.UInt64> [int64(sdivisible64(c).a)]) + ) + (Const64 <typ.UInt64> [64-sdivisible64(c).k]) + ) + (Const64 <typ.UInt64> [int64(sdivisible64(c).max)]) + ) + +// Divisibility check for signed integers for power of two constant are simple mask. +// However, we must match against the rewritten n%c == 0 -> n - c*(n/c) == 0 -> n == c*(n/c) +// where n/c contains fixup code to handle signed n. +((Eq8|Neq8) n (Lsh8x64 + (Rsh8x64 + (Add8 <t> n (Rsh8Ux64 <t> (Rsh8x64 <t> n (Const64 <typ.UInt64> [ 7])) (Const64 <typ.UInt64> [kbar]))) + (Const64 <typ.UInt64> [k])) + (Const64 <typ.UInt64> [k])) +) && k > 0 && k < 7 && kbar == 8 - k + => ((Eq8|Neq8) (And8 <t> n (Const8 <t> [1<<uint(k)-1])) (Const8 <t> [0])) + +((Eq16|Neq16) n (Lsh16x64 + (Rsh16x64 + (Add16 <t> n (Rsh16Ux64 <t> (Rsh16x64 <t> n (Const64 <typ.UInt64> [15])) (Const64 <typ.UInt64> [kbar]))) + (Const64 <typ.UInt64> [k])) + (Const64 <typ.UInt64> [k])) +) && k > 0 && k < 15 && kbar == 16 - k + => ((Eq16|Neq16) (And16 <t> n (Const16 <t> [1<<uint(k)-1])) (Const16 <t> [0])) + +((Eq32|Neq32) n (Lsh32x64 + (Rsh32x64 + (Add32 <t> n (Rsh32Ux64 <t> (Rsh32x64 <t> n (Const64 <typ.UInt64> [31])) (Const64 <typ.UInt64> [kbar]))) + (Const64 <typ.UInt64> [k])) + (Const64 <typ.UInt64> [k])) +) && k > 0 && k < 31 && kbar == 32 - k + => ((Eq32|Neq32) (And32 <t> n (Const32 <t> [1<<uint(k)-1])) (Const32 <t> [0])) + +((Eq64|Neq64) n (Lsh64x64 + (Rsh64x64 + (Add64 <t> n (Rsh64Ux64 <t> (Rsh64x64 <t> n (Const64 <typ.UInt64> [63])) (Const64 <typ.UInt64> [kbar]))) + (Const64 <typ.UInt64> [k])) + (Const64 <typ.UInt64> [k])) +) && k > 0 && k < 63 && kbar == 64 - k + => ((Eq64|Neq64) (And64 <t> n (Const64 <t> [1<<uint(k)-1])) (Const64 <t> [0])) + +(Eq(8|16|32|64) s:(Sub(8|16|32|64) x y) (Const(8|16|32|64) [0])) && s.Uses == 1 => (Eq(8|16|32|64) x y) +(Neq(8|16|32|64) s:(Sub(8|16|32|64) x y) (Const(8|16|32|64) [0])) && s.Uses == 1 => (Neq(8|16|32|64) x y) + +// Optimize bitsets +(Eq8 (And8 <t> x (Const8 <t> [y])) (Const8 <t> [y])) && oneBit8(y) + => (Neq8 (And8 <t> x (Const8 <t> [y])) (Const8 <t> [0])) +(Eq16 (And16 <t> x (Const16 <t> [y])) (Const16 <t> [y])) && oneBit16(y) + => (Neq16 (And16 <t> x (Const16 <t> [y])) (Const16 <t> [0])) +(Eq32 (And32 <t> x (Const32 <t> [y])) (Const32 <t> [y])) && oneBit32(y) + => (Neq32 (And32 <t> x (Const32 <t> [y])) (Const32 <t> [0])) +(Eq64 (And64 <t> x (Const64 <t> [y])) (Const64 <t> [y])) && oneBit64(y) + => (Neq64 (And64 <t> x (Const64 <t> [y])) (Const64 <t> [0])) +(Neq8 (And8 <t> x (Const8 <t> [y])) (Const8 <t> [y])) && oneBit8(y) + => (Eq8 (And8 <t> x (Const8 <t> [y])) (Const8 <t> [0])) +(Neq16 (And16 <t> x (Const16 <t> [y])) (Const16 <t> [y])) && oneBit16(y) + => (Eq16 (And16 <t> x (Const16 <t> [y])) (Const16 <t> [0])) +(Neq32 (And32 <t> x (Const32 <t> [y])) (Const32 <t> [y])) && oneBit32(y) + => (Eq32 (And32 <t> x (Const32 <t> [y])) (Const32 <t> [0])) +(Neq64 (And64 <t> x (Const64 <t> [y])) (Const64 <t> [y])) && oneBit64(y) + => (Eq64 (And64 <t> x (Const64 <t> [y])) (Const64 <t> [0])) + +// Reassociate expressions involving +// constants such that constants come first, +// exposing obvious constant-folding opportunities. +// Reassociate (op (op y C) x) to (op C (op x y)) or similar, where C +// is constant, which pushes constants to the outside +// of the expression. At that point, any constant-folding +// opportunities should be obvious. +// Note: don't include AddPtr here! In order to maintain the +// invariant that pointers must stay within the pointed-to object, +// we can't pull part of a pointer computation above the AddPtr. +// See issue 37881. +// Note: we don't need to handle any (x-C) cases because we already rewrite +// (x-C) to (x+(-C)). + +// x + (C + z) -> C + (x + z) +(Add64 (Add64 i:(Const64 <t>) z) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (Add64 i (Add64 <t> z x)) +(Add32 (Add32 i:(Const32 <t>) z) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (Add32 i (Add32 <t> z x)) +(Add16 (Add16 i:(Const16 <t>) z) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (Add16 i (Add16 <t> z x)) +(Add8 (Add8 i:(Const8 <t>) z) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (Add8 i (Add8 <t> z x)) + +// x + (C - z) -> C + (x - z) +(Add64 (Sub64 i:(Const64 <t>) z) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (Add64 i (Sub64 <t> x z)) +(Add32 (Sub32 i:(Const32 <t>) z) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (Add32 i (Sub32 <t> x z)) +(Add16 (Sub16 i:(Const16 <t>) z) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (Add16 i (Sub16 <t> x z)) +(Add8 (Sub8 i:(Const8 <t>) z) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (Add8 i (Sub8 <t> x z)) + +// x - (C - z) -> x + (z - C) -> (x + z) - C +(Sub64 x (Sub64 i:(Const64 <t>) z)) && (z.Op != OpConst64 && x.Op != OpConst64) => (Sub64 (Add64 <t> x z) i) +(Sub32 x (Sub32 i:(Const32 <t>) z)) && (z.Op != OpConst32 && x.Op != OpConst32) => (Sub32 (Add32 <t> x z) i) +(Sub16 x (Sub16 i:(Const16 <t>) z)) && (z.Op != OpConst16 && x.Op != OpConst16) => (Sub16 (Add16 <t> x z) i) +(Sub8 x (Sub8 i:(Const8 <t>) z)) && (z.Op != OpConst8 && x.Op != OpConst8) => (Sub8 (Add8 <t> x z) i) + +// x - (z + C) -> x + (-z - C) -> (x - z) - C +(Sub64 x (Add64 z i:(Const64 <t>))) && (z.Op != OpConst64 && x.Op != OpConst64) => (Sub64 (Sub64 <t> x z) i) +(Sub32 x (Add32 z i:(Const32 <t>))) && (z.Op != OpConst32 && x.Op != OpConst32) => (Sub32 (Sub32 <t> x z) i) +(Sub16 x (Add16 z i:(Const16 <t>))) && (z.Op != OpConst16 && x.Op != OpConst16) => (Sub16 (Sub16 <t> x z) i) +(Sub8 x (Add8 z i:(Const8 <t>))) && (z.Op != OpConst8 && x.Op != OpConst8) => (Sub8 (Sub8 <t> x z) i) + +// (C - z) - x -> C - (z + x) +(Sub64 (Sub64 i:(Const64 <t>) z) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (Sub64 i (Add64 <t> z x)) +(Sub32 (Sub32 i:(Const32 <t>) z) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (Sub32 i (Add32 <t> z x)) +(Sub16 (Sub16 i:(Const16 <t>) z) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (Sub16 i (Add16 <t> z x)) +(Sub8 (Sub8 i:(Const8 <t>) z) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (Sub8 i (Add8 <t> z x)) + +// (z + C) -x -> C + (z - x) +(Sub64 (Add64 z i:(Const64 <t>)) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (Add64 i (Sub64 <t> z x)) +(Sub32 (Add32 z i:(Const32 <t>)) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (Add32 i (Sub32 <t> z x)) +(Sub16 (Add16 z i:(Const16 <t>)) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (Add16 i (Sub16 <t> z x)) +(Sub8 (Add8 z i:(Const8 <t>)) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (Add8 i (Sub8 <t> z x)) + +// x & (C & z) -> C & (x & z) +(And64 (And64 i:(Const64 <t>) z) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (And64 i (And64 <t> z x)) +(And32 (And32 i:(Const32 <t>) z) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (And32 i (And32 <t> z x)) +(And16 (And16 i:(Const16 <t>) z) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (And16 i (And16 <t> z x)) +(And8 (And8 i:(Const8 <t>) z) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (And8 i (And8 <t> z x)) + +// x | (C | z) -> C | (x | z) +(Or64 (Or64 i:(Const64 <t>) z) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (Or64 i (Or64 <t> z x)) +(Or32 (Or32 i:(Const32 <t>) z) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (Or32 i (Or32 <t> z x)) +(Or16 (Or16 i:(Const16 <t>) z) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (Or16 i (Or16 <t> z x)) +(Or8 (Or8 i:(Const8 <t>) z) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (Or8 i (Or8 <t> z x)) + +// x ^ (C ^ z) -> C ^ (x ^ z) +(Xor64 (Xor64 i:(Const64 <t>) z) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (Xor64 i (Xor64 <t> z x)) +(Xor32 (Xor32 i:(Const32 <t>) z) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (Xor32 i (Xor32 <t> z x)) +(Xor16 (Xor16 i:(Const16 <t>) z) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (Xor16 i (Xor16 <t> z x)) +(Xor8 (Xor8 i:(Const8 <t>) z) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (Xor8 i (Xor8 <t> z x)) + +// x * (D * z) = D * (x * z) +(Mul64 (Mul64 i:(Const64 <t>) z) x) && (z.Op != OpConst64 && x.Op != OpConst64) => (Mul64 i (Mul64 <t> x z)) +(Mul32 (Mul32 i:(Const32 <t>) z) x) && (z.Op != OpConst32 && x.Op != OpConst32) => (Mul32 i (Mul32 <t> x z)) +(Mul16 (Mul16 i:(Const16 <t>) z) x) && (z.Op != OpConst16 && x.Op != OpConst16) => (Mul16 i (Mul16 <t> x z)) +(Mul8 (Mul8 i:(Const8 <t>) z) x) && (z.Op != OpConst8 && x.Op != OpConst8) => (Mul8 i (Mul8 <t> x z)) + +// C + (D + x) -> (C + D) + x +(Add64 (Const64 <t> [c]) (Add64 (Const64 <t> [d]) x)) => (Add64 (Const64 <t> [c+d]) x) +(Add32 (Const32 <t> [c]) (Add32 (Const32 <t> [d]) x)) => (Add32 (Const32 <t> [c+d]) x) +(Add16 (Const16 <t> [c]) (Add16 (Const16 <t> [d]) x)) => (Add16 (Const16 <t> [c+d]) x) +(Add8 (Const8 <t> [c]) (Add8 (Const8 <t> [d]) x)) => (Add8 (Const8 <t> [c+d]) x) + +// C + (D - x) -> (C + D) - x +(Add64 (Const64 <t> [c]) (Sub64 (Const64 <t> [d]) x)) => (Sub64 (Const64 <t> [c+d]) x) +(Add32 (Const32 <t> [c]) (Sub32 (Const32 <t> [d]) x)) => (Sub32 (Const32 <t> [c+d]) x) +(Add16 (Const16 <t> [c]) (Sub16 (Const16 <t> [d]) x)) => (Sub16 (Const16 <t> [c+d]) x) +(Add8 (Const8 <t> [c]) (Sub8 (Const8 <t> [d]) x)) => (Sub8 (Const8 <t> [c+d]) x) + +// C - (D - x) -> (C - D) + x +(Sub64 (Const64 <t> [c]) (Sub64 (Const64 <t> [d]) x)) => (Add64 (Const64 <t> [c-d]) x) +(Sub32 (Const32 <t> [c]) (Sub32 (Const32 <t> [d]) x)) => (Add32 (Const32 <t> [c-d]) x) +(Sub16 (Const16 <t> [c]) (Sub16 (Const16 <t> [d]) x)) => (Add16 (Const16 <t> [c-d]) x) +(Sub8 (Const8 <t> [c]) (Sub8 (Const8 <t> [d]) x)) => (Add8 (Const8 <t> [c-d]) x) + +// C - (D + x) -> (C - D) - x +(Sub64 (Const64 <t> [c]) (Add64 (Const64 <t> [d]) x)) => (Sub64 (Const64 <t> [c-d]) x) +(Sub32 (Const32 <t> [c]) (Add32 (Const32 <t> [d]) x)) => (Sub32 (Const32 <t> [c-d]) x) +(Sub16 (Const16 <t> [c]) (Add16 (Const16 <t> [d]) x)) => (Sub16 (Const16 <t> [c-d]) x) +(Sub8 (Const8 <t> [c]) (Add8 (Const8 <t> [d]) x)) => (Sub8 (Const8 <t> [c-d]) x) + +// C & (D & x) -> (C & D) & x +(And64 (Const64 <t> [c]) (And64 (Const64 <t> [d]) x)) => (And64 (Const64 <t> [c&d]) x) +(And32 (Const32 <t> [c]) (And32 (Const32 <t> [d]) x)) => (And32 (Const32 <t> [c&d]) x) +(And16 (Const16 <t> [c]) (And16 (Const16 <t> [d]) x)) => (And16 (Const16 <t> [c&d]) x) +(And8 (Const8 <t> [c]) (And8 (Const8 <t> [d]) x)) => (And8 (Const8 <t> [c&d]) x) + +// C | (D | x) -> (C | D) | x +(Or64 (Const64 <t> [c]) (Or64 (Const64 <t> [d]) x)) => (Or64 (Const64 <t> [c|d]) x) +(Or32 (Const32 <t> [c]) (Or32 (Const32 <t> [d]) x)) => (Or32 (Const32 <t> [c|d]) x) +(Or16 (Const16 <t> [c]) (Or16 (Const16 <t> [d]) x)) => (Or16 (Const16 <t> [c|d]) x) +(Or8 (Const8 <t> [c]) (Or8 (Const8 <t> [d]) x)) => (Or8 (Const8 <t> [c|d]) x) + +// C ^ (D ^ x) -> (C ^ D) ^ x +(Xor64 (Const64 <t> [c]) (Xor64 (Const64 <t> [d]) x)) => (Xor64 (Const64 <t> [c^d]) x) +(Xor32 (Const32 <t> [c]) (Xor32 (Const32 <t> [d]) x)) => (Xor32 (Const32 <t> [c^d]) x) +(Xor16 (Const16 <t> [c]) (Xor16 (Const16 <t> [d]) x)) => (Xor16 (Const16 <t> [c^d]) x) +(Xor8 (Const8 <t> [c]) (Xor8 (Const8 <t> [d]) x)) => (Xor8 (Const8 <t> [c^d]) x) + +// C * (D * x) = (C * D) * x +(Mul64 (Const64 <t> [c]) (Mul64 (Const64 <t> [d]) x)) => (Mul64 (Const64 <t> [c*d]) x) +(Mul32 (Const32 <t> [c]) (Mul32 (Const32 <t> [d]) x)) => (Mul32 (Const32 <t> [c*d]) x) +(Mul16 (Const16 <t> [c]) (Mul16 (Const16 <t> [d]) x)) => (Mul16 (Const16 <t> [c*d]) x) +(Mul8 (Const8 <t> [c]) (Mul8 (Const8 <t> [d]) x)) => (Mul8 (Const8 <t> [c*d]) x) + +// floating point optimizations +(Mul(32|64)F x (Const(32|64)F [1])) => x +(Mul32F x (Const32F [-1])) => (Neg32F x) +(Mul64F x (Const64F [-1])) => (Neg64F x) +(Mul32F x (Const32F [2])) => (Add32F x x) +(Mul64F x (Const64F [2])) => (Add64F x x) + +(Div32F x (Const32F <t> [c])) && reciprocalExact32(c) => (Mul32F x (Const32F <t> [1/c])) +(Div64F x (Const64F <t> [c])) && reciprocalExact64(c) => (Mul64F x (Const64F <t> [1/c])) + +// rewrite single-precision sqrt expression "float32(math.Sqrt(float64(x)))" +(Cvt64Fto32F sqrt0:(Sqrt (Cvt32Fto64F x))) && sqrt0.Uses==1 => (Sqrt32 x) + +(Sqrt (Const64F [c])) && !math.IsNaN(math.Sqrt(c)) => (Const64F [math.Sqrt(c)]) + +// for rewriting results of some late-expanded rewrites (below) +(SelectN [0] (MakeResult x ___)) => x +(SelectN [1] (MakeResult x y ___)) => y +(SelectN [2] (MakeResult x y z ___)) => z + +// for late-expanded calls, recognize newobject and remove zeroing and nilchecks +(Zero (SelectN [0] call:(StaticLECall _ _)) mem:(SelectN [1] call)) + && isSameCall(call.Aux, "runtime.newobject") + => mem + +(Store (SelectN [0] call:(StaticLECall _ _)) x mem:(SelectN [1] call)) + && isConstZero(x) + && isSameCall(call.Aux, "runtime.newobject") + => mem + +(Store (OffPtr (SelectN [0] call:(StaticLECall _ _))) x mem:(SelectN [1] call)) + && isConstZero(x) + && isSameCall(call.Aux, "runtime.newobject") + => mem + +(NilCheck (SelectN [0] call:(StaticLECall _ _)) _) + && isSameCall(call.Aux, "runtime.newobject") + && warnRule(fe.Debug_checknil(), v, "removed nil check") + => (Invalid) + +(NilCheck (OffPtr (SelectN [0] call:(StaticLECall _ _))) _) + && isSameCall(call.Aux, "runtime.newobject") + && warnRule(fe.Debug_checknil(), v, "removed nil check") + => (Invalid) + +// for late-expanded calls, recognize memequal applied to a single constant byte +// Support is limited by 1, 2, 4, 8 byte sizes +(StaticLECall {callAux} sptr (Addr {scon} (SB)) (Const64 [1]) mem) + && isSameCall(callAux, "runtime.memequal") + && symIsRO(scon) + => (MakeResult (Eq8 (Load <typ.Int8> sptr mem) (Const8 <typ.Int8> [int8(read8(scon,0))])) mem) + +(StaticLECall {callAux} sptr (Addr {scon} (SB)) (Const64 [2]) mem) + && isSameCall(callAux, "runtime.memequal") + && symIsRO(scon) + && canLoadUnaligned(config) + => (MakeResult (Eq16 (Load <typ.Int16> sptr mem) (Const16 <typ.Int16> [int16(read16(scon,0,config.ctxt.Arch.ByteOrder))])) mem) + +(StaticLECall {callAux} sptr (Addr {scon} (SB)) (Const64 [4]) mem) + && isSameCall(callAux, "runtime.memequal") + && symIsRO(scon) + && canLoadUnaligned(config) + => (MakeResult (Eq32 (Load <typ.Int32> sptr mem) (Const32 <typ.Int32> [int32(read32(scon,0,config.ctxt.Arch.ByteOrder))])) mem) + +(StaticLECall {callAux} sptr (Addr {scon} (SB)) (Const64 [8]) mem) + && isSameCall(callAux, "runtime.memequal") + && symIsRO(scon) + && canLoadUnaligned(config) && config.PtrSize == 8 + => (MakeResult (Eq64 (Load <typ.Int64> sptr mem) (Const64 <typ.Int64> [int64(read64(scon,0,config.ctxt.Arch.ByteOrder))])) mem) + +// Evaluate constant address comparisons. +(EqPtr x x) => (ConstBool [true]) +(NeqPtr x x) => (ConstBool [false]) +(EqPtr (Addr {x} _) (Addr {y} _)) => (ConstBool [x == y]) +(EqPtr (Addr {x} _) (OffPtr [o] (Addr {y} _))) => (ConstBool [x == y && o == 0]) +(EqPtr (OffPtr [o1] (Addr {x} _)) (OffPtr [o2] (Addr {y} _))) => (ConstBool [x == y && o1 == o2]) +(NeqPtr (Addr {x} _) (Addr {y} _)) => (ConstBool [x != y]) +(NeqPtr (Addr {x} _) (OffPtr [o] (Addr {y} _))) => (ConstBool [x != y || o != 0]) +(NeqPtr (OffPtr [o1] (Addr {x} _)) (OffPtr [o2] (Addr {y} _))) => (ConstBool [x != y || o1 != o2]) +(EqPtr (LocalAddr {x} _ _) (LocalAddr {y} _ _)) => (ConstBool [x == y]) +(EqPtr (LocalAddr {x} _ _) (OffPtr [o] (LocalAddr {y} _ _))) => (ConstBool [x == y && o == 0]) +(EqPtr (OffPtr [o1] (LocalAddr {x} _ _)) (OffPtr [o2] (LocalAddr {y} _ _))) => (ConstBool [x == y && o1 == o2]) +(NeqPtr (LocalAddr {x} _ _) (LocalAddr {y} _ _)) => (ConstBool [x != y]) +(NeqPtr (LocalAddr {x} _ _) (OffPtr [o] (LocalAddr {y} _ _))) => (ConstBool [x != y || o != 0]) +(NeqPtr (OffPtr [o1] (LocalAddr {x} _ _)) (OffPtr [o2] (LocalAddr {y} _ _))) => (ConstBool [x != y || o1 != o2]) +(EqPtr (OffPtr [o1] p1) p2) && isSamePtr(p1, p2) => (ConstBool [o1 == 0]) +(NeqPtr (OffPtr [o1] p1) p2) && isSamePtr(p1, p2) => (ConstBool [o1 != 0]) +(EqPtr (OffPtr [o1] p1) (OffPtr [o2] p2)) && isSamePtr(p1, p2) => (ConstBool [o1 == o2]) +(NeqPtr (OffPtr [o1] p1) (OffPtr [o2] p2)) && isSamePtr(p1, p2) => (ConstBool [o1 != o2]) +(EqPtr (Const(32|64) [c]) (Const(32|64) [d])) => (ConstBool [c == d]) +(NeqPtr (Const(32|64) [c]) (Const(32|64) [d])) => (ConstBool [c != d]) + +(EqPtr (LocalAddr _ _) (Addr _)) => (ConstBool [false]) +(EqPtr (OffPtr (LocalAddr _ _)) (Addr _)) => (ConstBool [false]) +(EqPtr (LocalAddr _ _) (OffPtr (Addr _))) => (ConstBool [false]) +(EqPtr (OffPtr (LocalAddr _ _)) (OffPtr (Addr _))) => (ConstBool [false]) +(NeqPtr (LocalAddr _ _) (Addr _)) => (ConstBool [true]) +(NeqPtr (OffPtr (LocalAddr _ _)) (Addr _)) => (ConstBool [true]) +(NeqPtr (LocalAddr _ _) (OffPtr (Addr _))) => (ConstBool [true]) +(NeqPtr (OffPtr (LocalAddr _ _)) (OffPtr (Addr _))) => (ConstBool [true]) + +// Simplify address comparisons. +(EqPtr (AddPtr p1 o1) p2) && isSamePtr(p1, p2) => (Not (IsNonNil o1)) +(NeqPtr (AddPtr p1 o1) p2) && isSamePtr(p1, p2) => (IsNonNil o1) +(EqPtr (Const(32|64) [0]) p) => (Not (IsNonNil p)) +(NeqPtr (Const(32|64) [0]) p) => (IsNonNil p) +(EqPtr (ConstNil) p) => (Not (IsNonNil p)) +(NeqPtr (ConstNil) p) => (IsNonNil p) + +// Evaluate constant user nil checks. +(IsNonNil (ConstNil)) => (ConstBool [false]) +(IsNonNil (Const(32|64) [c])) => (ConstBool [c != 0]) +(IsNonNil (Addr _)) => (ConstBool [true]) +(IsNonNil (LocalAddr _ _)) => (ConstBool [true]) + +// Inline small or disjoint runtime.memmove calls with constant length. +// See the comment in op Move in genericOps.go for discussion of the type. +// +// Note that we've lost any knowledge of the type and alignment requirements +// of the source and destination. We only know the size, and that the type +// contains no pointers. +// The type of the move is not necessarily v.Args[0].Type().Elem()! +// See issue 55122 for details. +// +// Because expand calls runs after prove, constants useful to this pattern may not appear. +// Both versions need to exist; the memory and register variants. +// +// Match post-expansion calls, memory version. +(SelectN [0] call:(StaticCall {sym} s1:(Store _ (Const(64|32) [sz]) s2:(Store _ src s3:(Store {t} _ dst mem))))) + && sz >= 0 + && isSameCall(sym, "runtime.memmove") + && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1 + && isInlinableMemmove(dst, src, int64(sz), config) + && clobber(s1, s2, s3, call) + => (Move {types.Types[types.TUINT8]} [int64(sz)] dst src mem) + +// Match post-expansion calls, register version. +(SelectN [0] call:(StaticCall {sym} dst src (Const(64|32) [sz]) mem)) + && sz >= 0 + && call.Uses == 1 // this will exclude all calls with results + && isSameCall(sym, "runtime.memmove") + && isInlinableMemmove(dst, src, int64(sz), config) + && clobber(call) + => (Move {types.Types[types.TUINT8]} [int64(sz)] dst src mem) + +// Match pre-expansion calls. +(SelectN [0] call:(StaticLECall {sym} dst src (Const(64|32) [sz]) mem)) + && sz >= 0 + && call.Uses == 1 // this will exclude all calls with results + && isSameCall(sym, "runtime.memmove") + && isInlinableMemmove(dst, src, int64(sz), config) + && clobber(call) + => (Move {types.Types[types.TUINT8]} [int64(sz)] dst src mem) + +// De-virtualize late-expanded interface calls into late-expanded static calls. +// Note that (ITab (IMake)) doesn't get rewritten until after the first opt pass, +// so this rule should trigger reliably. +// devirtLECall removes the first argument, adds the devirtualized symbol to the AuxCall, and changes the opcode +(InterLECall [argsize] {auxCall} (Load (OffPtr [off] (ITab (IMake (Addr {itab} (SB)) _))) _) ___) && devirtLESym(v, auxCall, itab, off) != + nil => devirtLECall(v, devirtLESym(v, auxCall, itab, off)) + +// Move and Zero optimizations. +// Move source and destination may overlap. + +// Convert Moves into Zeros when the source is known to be zeros. +(Move {t} [n] dst1 src mem:(Zero {t} [n] dst2 _)) && isSamePtr(src, dst2) + => (Zero {t} [n] dst1 mem) +(Move {t} [n] dst1 src mem:(VarDef (Zero {t} [n] dst0 _))) && isSamePtr(src, dst0) + => (Zero {t} [n] dst1 mem) +(Move {t} [n] dst (Addr {sym} (SB)) mem) && symIsROZero(sym) => (Zero {t} [n] dst mem) + +// Don't Store to variables that are about to be overwritten by Move/Zero. +(Zero {t1} [n] p1 store:(Store {t2} (OffPtr [o2] p2) _ mem)) + && isSamePtr(p1, p2) && store.Uses == 1 + && n >= o2 + t2.Size() + && clobber(store) + => (Zero {t1} [n] p1 mem) +(Move {t1} [n] dst1 src1 store:(Store {t2} op:(OffPtr [o2] dst2) _ mem)) + && isSamePtr(dst1, dst2) && store.Uses == 1 + && n >= o2 + t2.Size() + && disjoint(src1, n, op, t2.Size()) + && clobber(store) + => (Move {t1} [n] dst1 src1 mem) + +// Don't Move to variables that are immediately completely overwritten. +(Zero {t} [n] dst1 move:(Move {t} [n] dst2 _ mem)) + && move.Uses == 1 + && isSamePtr(dst1, dst2) + && clobber(move) + => (Zero {t} [n] dst1 mem) +(Move {t} [n] dst1 src1 move:(Move {t} [n] dst2 _ mem)) + && move.Uses == 1 + && isSamePtr(dst1, dst2) && disjoint(src1, n, dst2, n) + && clobber(move) + => (Move {t} [n] dst1 src1 mem) +(Zero {t} [n] dst1 vardef:(VarDef {x} move:(Move {t} [n] dst2 _ mem))) + && move.Uses == 1 && vardef.Uses == 1 + && isSamePtr(dst1, dst2) + && clobber(move, vardef) + => (Zero {t} [n] dst1 (VarDef {x} mem)) +(Move {t} [n] dst1 src1 vardef:(VarDef {x} move:(Move {t} [n] dst2 _ mem))) + && move.Uses == 1 && vardef.Uses == 1 + && isSamePtr(dst1, dst2) && disjoint(src1, n, dst2, n) + && clobber(move, vardef) + => (Move {t} [n] dst1 src1 (VarDef {x} mem)) +(Store {t1} op1:(OffPtr [o1] p1) d1 + m2:(Store {t2} op2:(OffPtr [0] p2) d2 + m3:(Move [n] p3 _ mem))) + && m2.Uses == 1 && m3.Uses == 1 + && o1 == t2.Size() + && n == t2.Size() + t1.Size() + && isSamePtr(p1, p2) && isSamePtr(p2, p3) + && clobber(m2, m3) + => (Store {t1} op1 d1 (Store {t2} op2 d2 mem)) +(Store {t1} op1:(OffPtr [o1] p1) d1 + m2:(Store {t2} op2:(OffPtr [o2] p2) d2 + m3:(Store {t3} op3:(OffPtr [0] p3) d3 + m4:(Move [n] p4 _ mem)))) + && m2.Uses == 1 && m3.Uses == 1 && m4.Uses == 1 + && o2 == t3.Size() + && o1-o2 == t2.Size() + && n == t3.Size() + t2.Size() + t1.Size() + && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) + && clobber(m2, m3, m4) + => (Store {t1} op1 d1 (Store {t2} op2 d2 (Store {t3} op3 d3 mem))) +(Store {t1} op1:(OffPtr [o1] p1) d1 + m2:(Store {t2} op2:(OffPtr [o2] p2) d2 + m3:(Store {t3} op3:(OffPtr [o3] p3) d3 + m4:(Store {t4} op4:(OffPtr [0] p4) d4 + m5:(Move [n] p5 _ mem))))) + && m2.Uses == 1 && m3.Uses == 1 && m4.Uses == 1 && m5.Uses == 1 + && o3 == t4.Size() + && o2-o3 == t3.Size() + && o1-o2 == t2.Size() + && n == t4.Size() + t3.Size() + t2.Size() + t1.Size() + && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5) + && clobber(m2, m3, m4, m5) + => (Store {t1} op1 d1 (Store {t2} op2 d2 (Store {t3} op3 d3 (Store {t4} op4 d4 mem)))) + +// Don't Zero variables that are immediately completely overwritten +// before being accessed. +(Move {t} [n] dst1 src1 zero:(Zero {t} [n] dst2 mem)) + && zero.Uses == 1 + && isSamePtr(dst1, dst2) && disjoint(src1, n, dst2, n) + && clobber(zero) + => (Move {t} [n] dst1 src1 mem) +(Move {t} [n] dst1 src1 vardef:(VarDef {x} zero:(Zero {t} [n] dst2 mem))) + && zero.Uses == 1 && vardef.Uses == 1 + && isSamePtr(dst1, dst2) && disjoint(src1, n, dst2, n) + && clobber(zero, vardef) + => (Move {t} [n] dst1 src1 (VarDef {x} mem)) +(Store {t1} op1:(OffPtr [o1] p1) d1 + m2:(Store {t2} op2:(OffPtr [0] p2) d2 + m3:(Zero [n] p3 mem))) + && m2.Uses == 1 && m3.Uses == 1 + && o1 == t2.Size() + && n == t2.Size() + t1.Size() + && isSamePtr(p1, p2) && isSamePtr(p2, p3) + && clobber(m2, m3) + => (Store {t1} op1 d1 (Store {t2} op2 d2 mem)) +(Store {t1} op1:(OffPtr [o1] p1) d1 + m2:(Store {t2} op2:(OffPtr [o2] p2) d2 + m3:(Store {t3} op3:(OffPtr [0] p3) d3 + m4:(Zero [n] p4 mem)))) + && m2.Uses == 1 && m3.Uses == 1 && m4.Uses == 1 + && o2 == t3.Size() + && o1-o2 == t2.Size() + && n == t3.Size() + t2.Size() + t1.Size() + && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) + && clobber(m2, m3, m4) + => (Store {t1} op1 d1 (Store {t2} op2 d2 (Store {t3} op3 d3 mem))) +(Store {t1} op1:(OffPtr [o1] p1) d1 + m2:(Store {t2} op2:(OffPtr [o2] p2) d2 + m3:(Store {t3} op3:(OffPtr [o3] p3) d3 + m4:(Store {t4} op4:(OffPtr [0] p4) d4 + m5:(Zero [n] p5 mem))))) + && m2.Uses == 1 && m3.Uses == 1 && m4.Uses == 1 && m5.Uses == 1 + && o3 == t4.Size() + && o2-o3 == t3.Size() + && o1-o2 == t2.Size() + && n == t4.Size() + t3.Size() + t2.Size() + t1.Size() + && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5) + && clobber(m2, m3, m4, m5) + => (Store {t1} op1 d1 (Store {t2} op2 d2 (Store {t3} op3 d3 (Store {t4} op4 d4 mem)))) + +// Don't Move from memory if the values are likely to already be +// in registers. +(Move {t1} [n] dst p1 + mem:(Store {t2} op2:(OffPtr <tt2> [o2] p2) d1 + (Store {t3} op3:(OffPtr <tt3> [0] p3) d2 _))) + && isSamePtr(p1, p2) && isSamePtr(p2, p3) + && t2.Alignment() <= t1.Alignment() + && t3.Alignment() <= t1.Alignment() + && registerizable(b, t2) + && registerizable(b, t3) + && o2 == t3.Size() + && n == t2.Size() + t3.Size() + => (Store {t2} (OffPtr <tt2> [o2] dst) d1 + (Store {t3} (OffPtr <tt3> [0] dst) d2 mem)) +(Move {t1} [n] dst p1 + mem:(Store {t2} op2:(OffPtr <tt2> [o2] p2) d1 + (Store {t3} op3:(OffPtr <tt3> [o3] p3) d2 + (Store {t4} op4:(OffPtr <tt4> [0] p4) d3 _)))) + && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) + && t2.Alignment() <= t1.Alignment() + && t3.Alignment() <= t1.Alignment() + && t4.Alignment() <= t1.Alignment() + && registerizable(b, t2) + && registerizable(b, t3) + && registerizable(b, t4) + && o3 == t4.Size() + && o2-o3 == t3.Size() + && n == t2.Size() + t3.Size() + t4.Size() + => (Store {t2} (OffPtr <tt2> [o2] dst) d1 + (Store {t3} (OffPtr <tt3> [o3] dst) d2 + (Store {t4} (OffPtr <tt4> [0] dst) d3 mem))) +(Move {t1} [n] dst p1 + mem:(Store {t2} op2:(OffPtr <tt2> [o2] p2) d1 + (Store {t3} op3:(OffPtr <tt3> [o3] p3) d2 + (Store {t4} op4:(OffPtr <tt4> [o4] p4) d3 + (Store {t5} op5:(OffPtr <tt5> [0] p5) d4 _))))) + && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5) + && t2.Alignment() <= t1.Alignment() + && t3.Alignment() <= t1.Alignment() + && t4.Alignment() <= t1.Alignment() + && t5.Alignment() <= t1.Alignment() + && registerizable(b, t2) + && registerizable(b, t3) + && registerizable(b, t4) + && registerizable(b, t5) + && o4 == t5.Size() + && o3-o4 == t4.Size() + && o2-o3 == t3.Size() + && n == t2.Size() + t3.Size() + t4.Size() + t5.Size() + => (Store {t2} (OffPtr <tt2> [o2] dst) d1 + (Store {t3} (OffPtr <tt3> [o3] dst) d2 + (Store {t4} (OffPtr <tt4> [o4] dst) d3 + (Store {t5} (OffPtr <tt5> [0] dst) d4 mem)))) + +// Same thing but with VarDef in the middle. +(Move {t1} [n] dst p1 + mem:(VarDef + (Store {t2} op2:(OffPtr <tt2> [o2] p2) d1 + (Store {t3} op3:(OffPtr <tt3> [0] p3) d2 _)))) + && isSamePtr(p1, p2) && isSamePtr(p2, p3) + && t2.Alignment() <= t1.Alignment() + && t3.Alignment() <= t1.Alignment() + && registerizable(b, t2) + && registerizable(b, t3) + && o2 == t3.Size() + && n == t2.Size() + t3.Size() + => (Store {t2} (OffPtr <tt2> [o2] dst) d1 + (Store {t3} (OffPtr <tt3> [0] dst) d2 mem)) +(Move {t1} [n] dst p1 + mem:(VarDef + (Store {t2} op2:(OffPtr <tt2> [o2] p2) d1 + (Store {t3} op3:(OffPtr <tt3> [o3] p3) d2 + (Store {t4} op4:(OffPtr <tt4> [0] p4) d3 _))))) + && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) + && t2.Alignment() <= t1.Alignment() + && t3.Alignment() <= t1.Alignment() + && t4.Alignment() <= t1.Alignment() + && registerizable(b, t2) + && registerizable(b, t3) + && registerizable(b, t4) + && o3 == t4.Size() + && o2-o3 == t3.Size() + && n == t2.Size() + t3.Size() + t4.Size() + => (Store {t2} (OffPtr <tt2> [o2] dst) d1 + (Store {t3} (OffPtr <tt3> [o3] dst) d2 + (Store {t4} (OffPtr <tt4> [0] dst) d3 mem))) +(Move {t1} [n] dst p1 + mem:(VarDef + (Store {t2} op2:(OffPtr <tt2> [o2] p2) d1 + (Store {t3} op3:(OffPtr <tt3> [o3] p3) d2 + (Store {t4} op4:(OffPtr <tt4> [o4] p4) d3 + (Store {t5} op5:(OffPtr <tt5> [0] p5) d4 _)))))) + && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5) + && t2.Alignment() <= t1.Alignment() + && t3.Alignment() <= t1.Alignment() + && t4.Alignment() <= t1.Alignment() + && t5.Alignment() <= t1.Alignment() + && registerizable(b, t2) + && registerizable(b, t3) + && registerizable(b, t4) + && registerizable(b, t5) + && o4 == t5.Size() + && o3-o4 == t4.Size() + && o2-o3 == t3.Size() + && n == t2.Size() + t3.Size() + t4.Size() + t5.Size() + => (Store {t2} (OffPtr <tt2> [o2] dst) d1 + (Store {t3} (OffPtr <tt3> [o3] dst) d2 + (Store {t4} (OffPtr <tt4> [o4] dst) d3 + (Store {t5} (OffPtr <tt5> [0] dst) d4 mem)))) + +// Prefer to Zero and Store than to Move. +(Move {t1} [n] dst p1 + mem:(Store {t2} op2:(OffPtr <tt2> [o2] p2) d1 + (Zero {t3} [n] p3 _))) + && isSamePtr(p1, p2) && isSamePtr(p2, p3) + && t2.Alignment() <= t1.Alignment() + && t3.Alignment() <= t1.Alignment() + && registerizable(b, t2) + && n >= o2 + t2.Size() + => (Store {t2} (OffPtr <tt2> [o2] dst) d1 + (Zero {t1} [n] dst mem)) +(Move {t1} [n] dst p1 + mem:(Store {t2} (OffPtr <tt2> [o2] p2) d1 + (Store {t3} (OffPtr <tt3> [o3] p3) d2 + (Zero {t4} [n] p4 _)))) + && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) + && t2.Alignment() <= t1.Alignment() + && t3.Alignment() <= t1.Alignment() + && t4.Alignment() <= t1.Alignment() + && registerizable(b, t2) + && registerizable(b, t3) + && n >= o2 + t2.Size() + && n >= o3 + t3.Size() + => (Store {t2} (OffPtr <tt2> [o2] dst) d1 + (Store {t3} (OffPtr <tt3> [o3] dst) d2 + (Zero {t1} [n] dst mem))) +(Move {t1} [n] dst p1 + mem:(Store {t2} (OffPtr <tt2> [o2] p2) d1 + (Store {t3} (OffPtr <tt3> [o3] p3) d2 + (Store {t4} (OffPtr <tt4> [o4] p4) d3 + (Zero {t5} [n] p5 _))))) + && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5) + && t2.Alignment() <= t1.Alignment() + && t3.Alignment() <= t1.Alignment() + && t4.Alignment() <= t1.Alignment() + && t5.Alignment() <= t1.Alignment() + && registerizable(b, t2) + && registerizable(b, t3) + && registerizable(b, t4) + && n >= o2 + t2.Size() + && n >= o3 + t3.Size() + && n >= o4 + t4.Size() + => (Store {t2} (OffPtr <tt2> [o2] dst) d1 + (Store {t3} (OffPtr <tt3> [o3] dst) d2 + (Store {t4} (OffPtr <tt4> [o4] dst) d3 + (Zero {t1} [n] dst mem)))) +(Move {t1} [n] dst p1 + mem:(Store {t2} (OffPtr <tt2> [o2] p2) d1 + (Store {t3} (OffPtr <tt3> [o3] p3) d2 + (Store {t4} (OffPtr <tt4> [o4] p4) d3 + (Store {t5} (OffPtr <tt5> [o5] p5) d4 + (Zero {t6} [n] p6 _)))))) + && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5) && isSamePtr(p5, p6) + && t2.Alignment() <= t1.Alignment() + && t3.Alignment() <= t1.Alignment() + && t4.Alignment() <= t1.Alignment() + && t5.Alignment() <= t1.Alignment() + && t6.Alignment() <= t1.Alignment() + && registerizable(b, t2) + && registerizable(b, t3) + && registerizable(b, t4) + && registerizable(b, t5) + && n >= o2 + t2.Size() + && n >= o3 + t3.Size() + && n >= o4 + t4.Size() + && n >= o5 + t5.Size() + => (Store {t2} (OffPtr <tt2> [o2] dst) d1 + (Store {t3} (OffPtr <tt3> [o3] dst) d2 + (Store {t4} (OffPtr <tt4> [o4] dst) d3 + (Store {t5} (OffPtr <tt5> [o5] dst) d4 + (Zero {t1} [n] dst mem))))) +(Move {t1} [n] dst p1 + mem:(VarDef + (Store {t2} op2:(OffPtr <tt2> [o2] p2) d1 + (Zero {t3} [n] p3 _)))) + && isSamePtr(p1, p2) && isSamePtr(p2, p3) + && t2.Alignment() <= t1.Alignment() + && t3.Alignment() <= t1.Alignment() + && registerizable(b, t2) + && n >= o2 + t2.Size() + => (Store {t2} (OffPtr <tt2> [o2] dst) d1 + (Zero {t1} [n] dst mem)) +(Move {t1} [n] dst p1 + mem:(VarDef + (Store {t2} (OffPtr <tt2> [o2] p2) d1 + (Store {t3} (OffPtr <tt3> [o3] p3) d2 + (Zero {t4} [n] p4 _))))) + && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) + && t2.Alignment() <= t1.Alignment() + && t3.Alignment() <= t1.Alignment() + && t4.Alignment() <= t1.Alignment() + && registerizable(b, t2) + && registerizable(b, t3) + && n >= o2 + t2.Size() + && n >= o3 + t3.Size() + => (Store {t2} (OffPtr <tt2> [o2] dst) d1 + (Store {t3} (OffPtr <tt3> [o3] dst) d2 + (Zero {t1} [n] dst mem))) +(Move {t1} [n] dst p1 + mem:(VarDef + (Store {t2} (OffPtr <tt2> [o2] p2) d1 + (Store {t3} (OffPtr <tt3> [o3] p3) d2 + (Store {t4} (OffPtr <tt4> [o4] p4) d3 + (Zero {t5} [n] p5 _)))))) + && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5) + && t2.Alignment() <= t1.Alignment() + && t3.Alignment() <= t1.Alignment() + && t4.Alignment() <= t1.Alignment() + && t5.Alignment() <= t1.Alignment() + && registerizable(b, t2) + && registerizable(b, t3) + && registerizable(b, t4) + && n >= o2 + t2.Size() + && n >= o3 + t3.Size() + && n >= o4 + t4.Size() + => (Store {t2} (OffPtr <tt2> [o2] dst) d1 + (Store {t3} (OffPtr <tt3> [o3] dst) d2 + (Store {t4} (OffPtr <tt4> [o4] dst) d3 + (Zero {t1} [n] dst mem)))) +(Move {t1} [n] dst p1 + mem:(VarDef + (Store {t2} (OffPtr <tt2> [o2] p2) d1 + (Store {t3} (OffPtr <tt3> [o3] p3) d2 + (Store {t4} (OffPtr <tt4> [o4] p4) d3 + (Store {t5} (OffPtr <tt5> [o5] p5) d4 + (Zero {t6} [n] p6 _))))))) + && isSamePtr(p1, p2) && isSamePtr(p2, p3) && isSamePtr(p3, p4) && isSamePtr(p4, p5) && isSamePtr(p5, p6) + && t2.Alignment() <= t1.Alignment() + && t3.Alignment() <= t1.Alignment() + && t4.Alignment() <= t1.Alignment() + && t5.Alignment() <= t1.Alignment() + && t6.Alignment() <= t1.Alignment() + && registerizable(b, t2) + && registerizable(b, t3) + && registerizable(b, t4) + && registerizable(b, t5) + && n >= o2 + t2.Size() + && n >= o3 + t3.Size() + && n >= o4 + t4.Size() + && n >= o5 + t5.Size() + => (Store {t2} (OffPtr <tt2> [o2] dst) d1 + (Store {t3} (OffPtr <tt3> [o3] dst) d2 + (Store {t4} (OffPtr <tt4> [o4] dst) d3 + (Store {t5} (OffPtr <tt5> [o5] dst) d4 + (Zero {t1} [n] dst mem))))) + +(SelectN [0] call:(StaticLECall {sym} a x)) && needRaceCleanup(sym, call) && clobber(call) => x +(SelectN [0] call:(StaticLECall {sym} x)) && needRaceCleanup(sym, call) && clobber(call) => x + +// When rewriting append to growslice, we use as the the new length the result of +// growslice so that we don't have to spill/restore the new length around the growslice call. +// The exception here is that if the new length is a constant, avoiding spilling it +// is pointless and its constantness is sometimes useful for subsequent optimizations. +// See issue 56440. +// Note there are 2 rules here, one for the pre-decomposed []T result and one for +// the post-decomposed (*T,int,int) result. (The latter is generated after call expansion.) +(SliceLen (SelectN [0] (StaticLECall {sym} _ newLen:(Const(64|32)) _ _ _ _))) && isSameCall(sym, "runtime.growslice") => newLen +(SelectN [1] (StaticCall {sym} _ newLen:(Const(64|32)) _ _ _ _)) && v.Type.IsInteger() && isSameCall(sym, "runtime.growslice") => newLen + +// Collapse moving A -> B -> C into just A -> C. +// Later passes (deadstore, elim unread auto) will remove the A -> B move, if possible. +// This happens most commonly when B is an autotmp inserted earlier +// during compilation to ensure correctness. +// Take care that overlapping moves are preserved. +// Restrict this optimization to the stack, to avoid duplicating loads from the heap; +// see CL 145208 for discussion. +(Move {t1} [s] dst tmp1 midmem:(Move {t2} [s] tmp2 src _)) + && t1.Compare(t2) == types.CMPeq + && isSamePtr(tmp1, tmp2) + && isStackPtr(src) && !isVolatile(src) + && disjoint(src, s, tmp2, s) + && (disjoint(src, s, dst, s) || isInlinableMemmove(dst, src, s, config)) + => (Move {t1} [s] dst src midmem) + +// Same, but for large types that require VarDefs. +(Move {t1} [s] dst tmp1 midmem:(VarDef (Move {t2} [s] tmp2 src _))) + && t1.Compare(t2) == types.CMPeq + && isSamePtr(tmp1, tmp2) + && isStackPtr(src) && !isVolatile(src) + && disjoint(src, s, tmp2, s) + && (disjoint(src, s, dst, s) || isInlinableMemmove(dst, src, s, config)) + => (Move {t1} [s] dst src midmem) + +// Don't zero the same bits twice. +(Zero {t} [s] dst1 zero:(Zero {t} [s] dst2 _)) && isSamePtr(dst1, dst2) => zero +(Zero {t} [s] dst1 vardef:(VarDef (Zero {t} [s] dst2 _))) && isSamePtr(dst1, dst2) => vardef + +// Elide self-moves. This only happens rarely (e.g test/fixedbugs/bug277.go). +// However, this rule is needed to prevent the previous rule from looping forever in such cases. +(Move dst src mem) && isSamePtr(dst, src) => mem + +// Constant rotate detection. +((Add64|Or64|Xor64) (Lsh64x64 x z:(Const64 <t> [c])) (Rsh64Ux64 x (Const64 [d]))) && c < 64 && d == 64-c && canRotate(config, 64) => (RotateLeft64 x z) +((Add32|Or32|Xor32) (Lsh32x64 x z:(Const64 <t> [c])) (Rsh32Ux64 x (Const64 [d]))) && c < 32 && d == 32-c && canRotate(config, 32) => (RotateLeft32 x z) +((Add16|Or16|Xor16) (Lsh16x64 x z:(Const64 <t> [c])) (Rsh16Ux64 x (Const64 [d]))) && c < 16 && d == 16-c && canRotate(config, 16) => (RotateLeft16 x z) +((Add8|Or8|Xor8) (Lsh8x64 x z:(Const64 <t> [c])) (Rsh8Ux64 x (Const64 [d]))) && c < 8 && d == 8-c && canRotate(config, 8) => (RotateLeft8 x z) + +// Non-constant rotate detection. +// We use shiftIsBounded to make sure that neither of the shifts are >64. +// Note: these rules are subtle when the shift amounts are 0/64, as Go shifts +// are different from most native shifts. But it works out. +((Add64|Or64|Xor64) left:(Lsh64x64 x y) right:(Rsh64Ux64 x (Sub64 (Const64 [64]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 64) => (RotateLeft64 x y) +((Add64|Or64|Xor64) left:(Lsh64x32 x y) right:(Rsh64Ux32 x (Sub32 (Const32 [64]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 64) => (RotateLeft64 x y) +((Add64|Or64|Xor64) left:(Lsh64x16 x y) right:(Rsh64Ux16 x (Sub16 (Const16 [64]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 64) => (RotateLeft64 x y) +((Add64|Or64|Xor64) left:(Lsh64x8 x y) right:(Rsh64Ux8 x (Sub8 (Const8 [64]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 64) => (RotateLeft64 x y) + +((Add64|Or64|Xor64) right:(Rsh64Ux64 x y) left:(Lsh64x64 x z:(Sub64 (Const64 [64]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 64) => (RotateLeft64 x z) +((Add64|Or64|Xor64) right:(Rsh64Ux32 x y) left:(Lsh64x32 x z:(Sub32 (Const32 [64]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 64) => (RotateLeft64 x z) +((Add64|Or64|Xor64) right:(Rsh64Ux16 x y) left:(Lsh64x16 x z:(Sub16 (Const16 [64]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 64) => (RotateLeft64 x z) +((Add64|Or64|Xor64) right:(Rsh64Ux8 x y) left:(Lsh64x8 x z:(Sub8 (Const8 [64]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 64) => (RotateLeft64 x z) + +((Add32|Or32|Xor32) left:(Lsh32x64 x y) right:(Rsh32Ux64 x (Sub64 (Const64 [32]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 32) => (RotateLeft32 x y) +((Add32|Or32|Xor32) left:(Lsh32x32 x y) right:(Rsh32Ux32 x (Sub32 (Const32 [32]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 32) => (RotateLeft32 x y) +((Add32|Or32|Xor32) left:(Lsh32x16 x y) right:(Rsh32Ux16 x (Sub16 (Const16 [32]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 32) => (RotateLeft32 x y) +((Add32|Or32|Xor32) left:(Lsh32x8 x y) right:(Rsh32Ux8 x (Sub8 (Const8 [32]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 32) => (RotateLeft32 x y) + +((Add32|Or32|Xor32) right:(Rsh32Ux64 x y) left:(Lsh32x64 x z:(Sub64 (Const64 [32]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 32) => (RotateLeft32 x z) +((Add32|Or32|Xor32) right:(Rsh32Ux32 x y) left:(Lsh32x32 x z:(Sub32 (Const32 [32]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 32) => (RotateLeft32 x z) +((Add32|Or32|Xor32) right:(Rsh32Ux16 x y) left:(Lsh32x16 x z:(Sub16 (Const16 [32]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 32) => (RotateLeft32 x z) +((Add32|Or32|Xor32) right:(Rsh32Ux8 x y) left:(Lsh32x8 x z:(Sub8 (Const8 [32]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 32) => (RotateLeft32 x z) + +((Add16|Or16|Xor16) left:(Lsh16x64 x y) right:(Rsh16Ux64 x (Sub64 (Const64 [16]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 16) => (RotateLeft16 x y) +((Add16|Or16|Xor16) left:(Lsh16x32 x y) right:(Rsh16Ux32 x (Sub32 (Const32 [16]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 16) => (RotateLeft16 x y) +((Add16|Or16|Xor16) left:(Lsh16x16 x y) right:(Rsh16Ux16 x (Sub16 (Const16 [16]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 16) => (RotateLeft16 x y) +((Add16|Or16|Xor16) left:(Lsh16x8 x y) right:(Rsh16Ux8 x (Sub8 (Const8 [16]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 16) => (RotateLeft16 x y) + +((Add16|Or16|Xor16) right:(Rsh16Ux64 x y) left:(Lsh16x64 x z:(Sub64 (Const64 [16]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 16) => (RotateLeft16 x z) +((Add16|Or16|Xor16) right:(Rsh16Ux32 x y) left:(Lsh16x32 x z:(Sub32 (Const32 [16]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 16) => (RotateLeft16 x z) +((Add16|Or16|Xor16) right:(Rsh16Ux16 x y) left:(Lsh16x16 x z:(Sub16 (Const16 [16]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 16) => (RotateLeft16 x z) +((Add16|Or16|Xor16) right:(Rsh16Ux8 x y) left:(Lsh16x8 x z:(Sub8 (Const8 [16]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 16) => (RotateLeft16 x z) + +((Add8|Or8|Xor8) left:(Lsh8x64 x y) right:(Rsh8Ux64 x (Sub64 (Const64 [8]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 8) => (RotateLeft8 x y) +((Add8|Or8|Xor8) left:(Lsh8x32 x y) right:(Rsh8Ux32 x (Sub32 (Const32 [8]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 8) => (RotateLeft8 x y) +((Add8|Or8|Xor8) left:(Lsh8x16 x y) right:(Rsh8Ux16 x (Sub16 (Const16 [8]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 8) => (RotateLeft8 x y) +((Add8|Or8|Xor8) left:(Lsh8x8 x y) right:(Rsh8Ux8 x (Sub8 (Const8 [8]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 8) => (RotateLeft8 x y) + +((Add8|Or8|Xor8) right:(Rsh8Ux64 x y) left:(Lsh8x64 x z:(Sub64 (Const64 [8]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 8) => (RotateLeft8 x z) +((Add8|Or8|Xor8) right:(Rsh8Ux32 x y) left:(Lsh8x32 x z:(Sub32 (Const32 [8]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 8) => (RotateLeft8 x z) +((Add8|Or8|Xor8) right:(Rsh8Ux16 x y) left:(Lsh8x16 x z:(Sub16 (Const16 [8]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 8) => (RotateLeft8 x z) +((Add8|Or8|Xor8) right:(Rsh8Ux8 x y) left:(Lsh8x8 x z:(Sub8 (Const8 [8]) y))) && (shiftIsBounded(left) || shiftIsBounded(right)) && canRotate(config, 8) => (RotateLeft8 x z) + +// Rotating by y&c, with c a mask that doesn't change the bottom bits, is the same as rotating by y. +(RotateLeft64 x (And(64|32|16|8) y (Const(64|32|16|8) [c]))) && c&63 == 63 => (RotateLeft64 x y) +(RotateLeft32 x (And(64|32|16|8) y (Const(64|32|16|8) [c]))) && c&31 == 31 => (RotateLeft32 x y) +(RotateLeft16 x (And(64|32|16|8) y (Const(64|32|16|8) [c]))) && c&15 == 15 => (RotateLeft16 x y) +(RotateLeft8 x (And(64|32|16|8) y (Const(64|32|16|8) [c]))) && c&7 == 7 => (RotateLeft8 x y) + +// Rotating by -(y&c), with c a mask that doesn't change the bottom bits, is the same as rotating by -y. +(RotateLeft64 x (Neg(64|32|16|8) (And(64|32|16|8) y (Const(64|32|16|8) [c])))) && c&63 == 63 => (RotateLeft64 x (Neg(64|32|16|8) <y.Type> y)) +(RotateLeft32 x (Neg(64|32|16|8) (And(64|32|16|8) y (Const(64|32|16|8) [c])))) && c&31 == 31 => (RotateLeft32 x (Neg(64|32|16|8) <y.Type> y)) +(RotateLeft16 x (Neg(64|32|16|8) (And(64|32|16|8) y (Const(64|32|16|8) [c])))) && c&15 == 15 => (RotateLeft16 x (Neg(64|32|16|8) <y.Type> y)) +(RotateLeft8 x (Neg(64|32|16|8) (And(64|32|16|8) y (Const(64|32|16|8) [c])))) && c&7 == 7 => (RotateLeft8 x (Neg(64|32|16|8) <y.Type> y)) + +// Rotating by y+c, with c a multiple of the value width, is the same as rotating by y. +(RotateLeft64 x (Add(64|32|16|8) y (Const(64|32|16|8) [c]))) && c&63 == 0 => (RotateLeft64 x y) +(RotateLeft32 x (Add(64|32|16|8) y (Const(64|32|16|8) [c]))) && c&31 == 0 => (RotateLeft32 x y) +(RotateLeft16 x (Add(64|32|16|8) y (Const(64|32|16|8) [c]))) && c&15 == 0 => (RotateLeft16 x y) +(RotateLeft8 x (Add(64|32|16|8) y (Const(64|32|16|8) [c]))) && c&7 == 0 => (RotateLeft8 x y) + +// Rotating by c-y, with c a multiple of the value width, is the same as rotating by -y. +(RotateLeft64 x (Sub(64|32|16|8) (Const(64|32|16|8) [c]) y)) && c&63 == 0 => (RotateLeft64 x (Neg(64|32|16|8) <y.Type> y)) +(RotateLeft32 x (Sub(64|32|16|8) (Const(64|32|16|8) [c]) y)) && c&31 == 0 => (RotateLeft32 x (Neg(64|32|16|8) <y.Type> y)) +(RotateLeft16 x (Sub(64|32|16|8) (Const(64|32|16|8) [c]) y)) && c&15 == 0 => (RotateLeft16 x (Neg(64|32|16|8) <y.Type> y)) +(RotateLeft8 x (Sub(64|32|16|8) (Const(64|32|16|8) [c]) y)) && c&7 == 0 => (RotateLeft8 x (Neg(64|32|16|8) <y.Type> y)) + +// Ensure we don't do Const64 rotates in a 32-bit system. +(RotateLeft64 x (Const64 <t> [c])) && config.PtrSize == 4 => (RotateLeft64 x (Const32 <t> [int32(c)])) +(RotateLeft32 x (Const64 <t> [c])) && config.PtrSize == 4 => (RotateLeft32 x (Const32 <t> [int32(c)])) +(RotateLeft16 x (Const64 <t> [c])) && config.PtrSize == 4 => (RotateLeft16 x (Const32 <t> [int32(c)])) +(RotateLeft8 x (Const64 <t> [c])) && config.PtrSize == 4 => (RotateLeft8 x (Const32 <t> [int32(c)])) + +// Rotating by c, then by d, is the same as rotating by c+d. +// We're trading a rotate for an add, which seems generally a good choice. It is especially good when c and d are constants. +// This rule is a bit tricky as c and d might be different widths. We handle only cases where they are the same width. +(RotateLeft(64|32|16|8) (RotateLeft(64|32|16|8) x c) d) && c.Type.Size() == 8 && d.Type.Size() == 8 => (RotateLeft(64|32|16|8) x (Add64 <c.Type> c d)) +(RotateLeft(64|32|16|8) (RotateLeft(64|32|16|8) x c) d) && c.Type.Size() == 4 && d.Type.Size() == 4 => (RotateLeft(64|32|16|8) x (Add32 <c.Type> c d)) +(RotateLeft(64|32|16|8) (RotateLeft(64|32|16|8) x c) d) && c.Type.Size() == 2 && d.Type.Size() == 2 => (RotateLeft(64|32|16|8) x (Add16 <c.Type> c d)) +(RotateLeft(64|32|16|8) (RotateLeft(64|32|16|8) x c) d) && c.Type.Size() == 1 && d.Type.Size() == 1 => (RotateLeft(64|32|16|8) x (Add8 <c.Type> c d)) diff --git a/src/cmd/compile/internal/ssa/_gen/genericOps.go b/src/cmd/compile/internal/ssa/_gen/genericOps.go new file mode 100644 index 0000000..a4c8fc9 --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/genericOps.go @@ -0,0 +1,664 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +// Generic opcodes typically specify a width. The inputs and outputs +// of that op are the given number of bits wide. There is no notion of +// "sign", so Add32 can be used both for signed and unsigned 32-bit +// addition. + +// Signed/unsigned is explicit with the extension ops +// (SignExt*/ZeroExt*) and implicit as the arg to some opcodes +// (e.g. the second argument to shifts is unsigned). If not mentioned, +// all args take signed inputs, or don't care whether their inputs +// are signed or unsigned. + +var genericOps = []opData{ + // 2-input arithmetic + // Types must be consistent with Go typing. Add, for example, must take two values + // of the same type and produces that same type. + {name: "Add8", argLength: 2, commutative: true}, // arg0 + arg1 + {name: "Add16", argLength: 2, commutative: true}, + {name: "Add32", argLength: 2, commutative: true}, + {name: "Add64", argLength: 2, commutative: true}, + {name: "AddPtr", argLength: 2}, // For address calculations. arg0 is a pointer and arg1 is an int. + {name: "Add32F", argLength: 2, commutative: true}, + {name: "Add64F", argLength: 2, commutative: true}, + + {name: "Sub8", argLength: 2}, // arg0 - arg1 + {name: "Sub16", argLength: 2}, + {name: "Sub32", argLength: 2}, + {name: "Sub64", argLength: 2}, + {name: "SubPtr", argLength: 2}, + {name: "Sub32F", argLength: 2}, + {name: "Sub64F", argLength: 2}, + + {name: "Mul8", argLength: 2, commutative: true}, // arg0 * arg1 + {name: "Mul16", argLength: 2, commutative: true}, + {name: "Mul32", argLength: 2, commutative: true}, + {name: "Mul64", argLength: 2, commutative: true}, + {name: "Mul32F", argLength: 2, commutative: true}, + {name: "Mul64F", argLength: 2, commutative: true}, + + {name: "Div32F", argLength: 2}, // arg0 / arg1 + {name: "Div64F", argLength: 2}, + + {name: "Hmul32", argLength: 2, commutative: true}, + {name: "Hmul32u", argLength: 2, commutative: true}, + {name: "Hmul64", argLength: 2, commutative: true}, + {name: "Hmul64u", argLength: 2, commutative: true}, + + {name: "Mul32uhilo", argLength: 2, typ: "(UInt32,UInt32)", commutative: true}, // arg0 * arg1, returns (hi, lo) + {name: "Mul64uhilo", argLength: 2, typ: "(UInt64,UInt64)", commutative: true}, // arg0 * arg1, returns (hi, lo) + + {name: "Mul32uover", argLength: 2, typ: "(UInt32,Bool)", commutative: true}, // Let x = arg0*arg1 (full 32x32-> 64 unsigned multiply), returns (uint32(x), (uint32(x) != x)) + {name: "Mul64uover", argLength: 2, typ: "(UInt64,Bool)", commutative: true}, // Let x = arg0*arg1 (full 64x64->128 unsigned multiply), returns (uint64(x), (uint64(x) != x)) + + // Weird special instructions for use in the strength reduction of divides. + // These ops compute unsigned (arg0 + arg1) / 2, correct to all + // 32/64 bits, even when the intermediate result of the add has 33/65 bits. + // These ops can assume arg0 >= arg1. + // Note: these ops aren't commutative! + {name: "Avg32u", argLength: 2, typ: "UInt32"}, // 32-bit platforms only + {name: "Avg64u", argLength: 2, typ: "UInt64"}, // 64-bit platforms only + + // For Div16, Div32 and Div64, AuxInt non-zero means that the divisor has been proved to be not -1 + // or that the dividend is not the most negative value. + {name: "Div8", argLength: 2}, // arg0 / arg1, signed + {name: "Div8u", argLength: 2}, // arg0 / arg1, unsigned + {name: "Div16", argLength: 2, aux: "Bool"}, + {name: "Div16u", argLength: 2}, + {name: "Div32", argLength: 2, aux: "Bool"}, + {name: "Div32u", argLength: 2}, + {name: "Div64", argLength: 2, aux: "Bool"}, + {name: "Div64u", argLength: 2}, + {name: "Div128u", argLength: 3}, // arg0:arg1 / arg2 (128-bit divided by 64-bit), returns (q, r) + + // For Mod16, Mod32 and Mod64, AuxInt non-zero means that the divisor has been proved to be not -1. + {name: "Mod8", argLength: 2}, // arg0 % arg1, signed + {name: "Mod8u", argLength: 2}, // arg0 % arg1, unsigned + {name: "Mod16", argLength: 2, aux: "Bool"}, + {name: "Mod16u", argLength: 2}, + {name: "Mod32", argLength: 2, aux: "Bool"}, + {name: "Mod32u", argLength: 2}, + {name: "Mod64", argLength: 2, aux: "Bool"}, + {name: "Mod64u", argLength: 2}, + + {name: "And8", argLength: 2, commutative: true}, // arg0 & arg1 + {name: "And16", argLength: 2, commutative: true}, + {name: "And32", argLength: 2, commutative: true}, + {name: "And64", argLength: 2, commutative: true}, + + {name: "Or8", argLength: 2, commutative: true}, // arg0 | arg1 + {name: "Or16", argLength: 2, commutative: true}, + {name: "Or32", argLength: 2, commutative: true}, + {name: "Or64", argLength: 2, commutative: true}, + + {name: "Xor8", argLength: 2, commutative: true}, // arg0 ^ arg1 + {name: "Xor16", argLength: 2, commutative: true}, + {name: "Xor32", argLength: 2, commutative: true}, + {name: "Xor64", argLength: 2, commutative: true}, + + // For shifts, AxB means the shifted value has A bits and the shift amount has B bits. + // Shift amounts are considered unsigned. + // If arg1 is known to be nonnegative and less than the number of bits in arg0, + // then auxInt may be set to 1. + // This enables better code generation on some platforms. + {name: "Lsh8x8", argLength: 2, aux: "Bool"}, // arg0 << arg1 + {name: "Lsh8x16", argLength: 2, aux: "Bool"}, + {name: "Lsh8x32", argLength: 2, aux: "Bool"}, + {name: "Lsh8x64", argLength: 2, aux: "Bool"}, + {name: "Lsh16x8", argLength: 2, aux: "Bool"}, + {name: "Lsh16x16", argLength: 2, aux: "Bool"}, + {name: "Lsh16x32", argLength: 2, aux: "Bool"}, + {name: "Lsh16x64", argLength: 2, aux: "Bool"}, + {name: "Lsh32x8", argLength: 2, aux: "Bool"}, + {name: "Lsh32x16", argLength: 2, aux: "Bool"}, + {name: "Lsh32x32", argLength: 2, aux: "Bool"}, + {name: "Lsh32x64", argLength: 2, aux: "Bool"}, + {name: "Lsh64x8", argLength: 2, aux: "Bool"}, + {name: "Lsh64x16", argLength: 2, aux: "Bool"}, + {name: "Lsh64x32", argLength: 2, aux: "Bool"}, + {name: "Lsh64x64", argLength: 2, aux: "Bool"}, + + {name: "Rsh8x8", argLength: 2, aux: "Bool"}, // arg0 >> arg1, signed + {name: "Rsh8x16", argLength: 2, aux: "Bool"}, + {name: "Rsh8x32", argLength: 2, aux: "Bool"}, + {name: "Rsh8x64", argLength: 2, aux: "Bool"}, + {name: "Rsh16x8", argLength: 2, aux: "Bool"}, + {name: "Rsh16x16", argLength: 2, aux: "Bool"}, + {name: "Rsh16x32", argLength: 2, aux: "Bool"}, + {name: "Rsh16x64", argLength: 2, aux: "Bool"}, + {name: "Rsh32x8", argLength: 2, aux: "Bool"}, + {name: "Rsh32x16", argLength: 2, aux: "Bool"}, + {name: "Rsh32x32", argLength: 2, aux: "Bool"}, + {name: "Rsh32x64", argLength: 2, aux: "Bool"}, + {name: "Rsh64x8", argLength: 2, aux: "Bool"}, + {name: "Rsh64x16", argLength: 2, aux: "Bool"}, + {name: "Rsh64x32", argLength: 2, aux: "Bool"}, + {name: "Rsh64x64", argLength: 2, aux: "Bool"}, + + {name: "Rsh8Ux8", argLength: 2, aux: "Bool"}, // arg0 >> arg1, unsigned + {name: "Rsh8Ux16", argLength: 2, aux: "Bool"}, + {name: "Rsh8Ux32", argLength: 2, aux: "Bool"}, + {name: "Rsh8Ux64", argLength: 2, aux: "Bool"}, + {name: "Rsh16Ux8", argLength: 2, aux: "Bool"}, + {name: "Rsh16Ux16", argLength: 2, aux: "Bool"}, + {name: "Rsh16Ux32", argLength: 2, aux: "Bool"}, + {name: "Rsh16Ux64", argLength: 2, aux: "Bool"}, + {name: "Rsh32Ux8", argLength: 2, aux: "Bool"}, + {name: "Rsh32Ux16", argLength: 2, aux: "Bool"}, + {name: "Rsh32Ux32", argLength: 2, aux: "Bool"}, + {name: "Rsh32Ux64", argLength: 2, aux: "Bool"}, + {name: "Rsh64Ux8", argLength: 2, aux: "Bool"}, + {name: "Rsh64Ux16", argLength: 2, aux: "Bool"}, + {name: "Rsh64Ux32", argLength: 2, aux: "Bool"}, + {name: "Rsh64Ux64", argLength: 2, aux: "Bool"}, + + // 2-input comparisons + {name: "Eq8", argLength: 2, commutative: true, typ: "Bool"}, // arg0 == arg1 + {name: "Eq16", argLength: 2, commutative: true, typ: "Bool"}, + {name: "Eq32", argLength: 2, commutative: true, typ: "Bool"}, + {name: "Eq64", argLength: 2, commutative: true, typ: "Bool"}, + {name: "EqPtr", argLength: 2, commutative: true, typ: "Bool"}, + {name: "EqInter", argLength: 2, typ: "Bool"}, // arg0 or arg1 is nil; other cases handled by frontend + {name: "EqSlice", argLength: 2, typ: "Bool"}, // arg0 or arg1 is nil; other cases handled by frontend + {name: "Eq32F", argLength: 2, commutative: true, typ: "Bool"}, + {name: "Eq64F", argLength: 2, commutative: true, typ: "Bool"}, + + {name: "Neq8", argLength: 2, commutative: true, typ: "Bool"}, // arg0 != arg1 + {name: "Neq16", argLength: 2, commutative: true, typ: "Bool"}, + {name: "Neq32", argLength: 2, commutative: true, typ: "Bool"}, + {name: "Neq64", argLength: 2, commutative: true, typ: "Bool"}, + {name: "NeqPtr", argLength: 2, commutative: true, typ: "Bool"}, + {name: "NeqInter", argLength: 2, typ: "Bool"}, // arg0 or arg1 is nil; other cases handled by frontend + {name: "NeqSlice", argLength: 2, typ: "Bool"}, // arg0 or arg1 is nil; other cases handled by frontend + {name: "Neq32F", argLength: 2, commutative: true, typ: "Bool"}, + {name: "Neq64F", argLength: 2, commutative: true, typ: "Bool"}, + + {name: "Less8", argLength: 2, typ: "Bool"}, // arg0 < arg1, signed + {name: "Less8U", argLength: 2, typ: "Bool"}, // arg0 < arg1, unsigned + {name: "Less16", argLength: 2, typ: "Bool"}, + {name: "Less16U", argLength: 2, typ: "Bool"}, + {name: "Less32", argLength: 2, typ: "Bool"}, + {name: "Less32U", argLength: 2, typ: "Bool"}, + {name: "Less64", argLength: 2, typ: "Bool"}, + {name: "Less64U", argLength: 2, typ: "Bool"}, + {name: "Less32F", argLength: 2, typ: "Bool"}, + {name: "Less64F", argLength: 2, typ: "Bool"}, + + {name: "Leq8", argLength: 2, typ: "Bool"}, // arg0 <= arg1, signed + {name: "Leq8U", argLength: 2, typ: "Bool"}, // arg0 <= arg1, unsigned + {name: "Leq16", argLength: 2, typ: "Bool"}, + {name: "Leq16U", argLength: 2, typ: "Bool"}, + {name: "Leq32", argLength: 2, typ: "Bool"}, + {name: "Leq32U", argLength: 2, typ: "Bool"}, + {name: "Leq64", argLength: 2, typ: "Bool"}, + {name: "Leq64U", argLength: 2, typ: "Bool"}, + {name: "Leq32F", argLength: 2, typ: "Bool"}, + {name: "Leq64F", argLength: 2, typ: "Bool"}, + + // the type of a CondSelect is the same as the type of its first + // two arguments, which should be register-width scalars; the third + // argument should be a boolean + {name: "CondSelect", argLength: 3}, // arg2 ? arg0 : arg1 + + // boolean ops + {name: "AndB", argLength: 2, commutative: true, typ: "Bool"}, // arg0 && arg1 (not shortcircuited) + {name: "OrB", argLength: 2, commutative: true, typ: "Bool"}, // arg0 || arg1 (not shortcircuited) + {name: "EqB", argLength: 2, commutative: true, typ: "Bool"}, // arg0 == arg1 + {name: "NeqB", argLength: 2, commutative: true, typ: "Bool"}, // arg0 != arg1 + {name: "Not", argLength: 1, typ: "Bool"}, // !arg0, boolean + + // 1-input ops + {name: "Neg8", argLength: 1}, // -arg0 + {name: "Neg16", argLength: 1}, + {name: "Neg32", argLength: 1}, + {name: "Neg64", argLength: 1}, + {name: "Neg32F", argLength: 1}, + {name: "Neg64F", argLength: 1}, + + {name: "Com8", argLength: 1}, // ^arg0 + {name: "Com16", argLength: 1}, + {name: "Com32", argLength: 1}, + {name: "Com64", argLength: 1}, + + {name: "Ctz8", argLength: 1}, // Count trailing (low order) zeroes (returns 0-8) + {name: "Ctz16", argLength: 1}, // Count trailing (low order) zeroes (returns 0-16) + {name: "Ctz32", argLength: 1}, // Count trailing (low order) zeroes (returns 0-32) + {name: "Ctz64", argLength: 1}, // Count trailing (low order) zeroes (returns 0-64) + {name: "Ctz8NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-7 + {name: "Ctz16NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-15 + {name: "Ctz32NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-31 + {name: "Ctz64NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-63 + {name: "BitLen8", argLength: 1}, // Number of bits in arg[0] (returns 0-8) + {name: "BitLen16", argLength: 1}, // Number of bits in arg[0] (returns 0-16) + {name: "BitLen32", argLength: 1}, // Number of bits in arg[0] (returns 0-32) + {name: "BitLen64", argLength: 1}, // Number of bits in arg[0] (returns 0-64) + + {name: "Bswap32", argLength: 1}, // Swap bytes + {name: "Bswap64", argLength: 1}, // Swap bytes + + {name: "BitRev8", argLength: 1}, // Reverse the bits in arg[0] + {name: "BitRev16", argLength: 1}, // Reverse the bits in arg[0] + {name: "BitRev32", argLength: 1}, // Reverse the bits in arg[0] + {name: "BitRev64", argLength: 1}, // Reverse the bits in arg[0] + + {name: "PopCount8", argLength: 1}, // Count bits in arg[0] + {name: "PopCount16", argLength: 1}, // Count bits in arg[0] + {name: "PopCount32", argLength: 1}, // Count bits in arg[0] + {name: "PopCount64", argLength: 1}, // Count bits in arg[0] + + // RotateLeftX instructions rotate the X bits of arg[0] to the left + // by the low lg_2(X) bits of arg[1], interpreted as an unsigned value. + // Note that this works out regardless of the bit width or signedness of + // arg[1]. In particular, RotateLeft by x is the same as RotateRight by -x. + {name: "RotateLeft64", argLength: 2}, + {name: "RotateLeft32", argLength: 2}, + {name: "RotateLeft16", argLength: 2}, + {name: "RotateLeft8", argLength: 2}, + + // Square root. + // Special cases: + // +∞ → +∞ + // ±0 → ±0 (sign preserved) + // x<0 → NaN + // NaN → NaN + {name: "Sqrt", argLength: 1}, // √arg0 (floating point, double precision) + {name: "Sqrt32", argLength: 1}, // √arg0 (floating point, single precision) + + // Round to integer, float64 only. + // Special cases: + // ±∞ → ±∞ (sign preserved) + // ±0 → ±0 (sign preserved) + // NaN → NaN + {name: "Floor", argLength: 1}, // round arg0 toward -∞ + {name: "Ceil", argLength: 1}, // round arg0 toward +∞ + {name: "Trunc", argLength: 1}, // round arg0 toward 0 + {name: "Round", argLength: 1}, // round arg0 to nearest, ties away from 0 + {name: "RoundToEven", argLength: 1}, // round arg0 to nearest, ties to even + + // Modify the sign bit + {name: "Abs", argLength: 1}, // absolute value arg0 + {name: "Copysign", argLength: 2}, // copy sign from arg0 to arg1 + + // 3-input opcode. + // Fused-multiply-add, float64 only. + // When a*b+c is exactly zero (before rounding), then the result is +0 or -0. + // The 0's sign is determined according to the standard rules for the + // addition (-0 if both a*b and c are -0, +0 otherwise). + // + // Otherwise, when a*b+c rounds to zero, then the resulting 0's sign is + // determined by the sign of the exact result a*b+c. + // See section 6.3 in ieee754. + // + // When the multiply is an infinity times a zero, the result is NaN. + // See section 7.2 in ieee754. + {name: "FMA", argLength: 3}, // compute (a*b)+c without intermediate rounding + + // Data movement. Max argument length for Phi is indefinite. + {name: "Phi", argLength: -1, zeroWidth: true}, // select an argument based on which predecessor block we came from + {name: "Copy", argLength: 1}, // output = arg0 + // Convert converts between pointers and integers. + // We have a special op for this so as to not confuse GC + // (particularly stack maps). It takes a memory arg so it + // gets correctly ordered with respect to GC safepoints. + // It gets compiled to nothing, so its result must in the same + // register as its argument. regalloc knows it can use any + // allocatable integer register for OpConvert. + // arg0=ptr/int arg1=mem, output=int/ptr + {name: "Convert", argLength: 2, zeroWidth: true, resultInArg0: true}, + + // constants. Constant values are stored in the aux or + // auxint fields. + {name: "ConstBool", aux: "Bool"}, // auxint is 0 for false and 1 for true + {name: "ConstString", aux: "String"}, // value is aux.(string) + {name: "ConstNil", typ: "BytePtr"}, // nil pointer + {name: "Const8", aux: "Int8"}, // auxint is sign-extended 8 bits + {name: "Const16", aux: "Int16"}, // auxint is sign-extended 16 bits + {name: "Const32", aux: "Int32"}, // auxint is sign-extended 32 bits + // Note: ConstX are sign-extended even when the type of the value is unsigned. + // For instance, uint8(0xaa) is stored as auxint=0xffffffffffffffaa. + {name: "Const64", aux: "Int64"}, // value is auxint + // Note: for both Const32F and Const64F, we disallow encoding NaNs. + // Signaling NaNs are tricky because if you do anything with them, they become quiet. + // Particularly, converting a 32 bit sNaN to 64 bit and back converts it to a qNaN. + // See issue 36399 and 36400. + // Encodings of +inf, -inf, and -0 are fine. + {name: "Const32F", aux: "Float32"}, // value is math.Float64frombits(uint64(auxint)) and is exactly representable as float 32 + {name: "Const64F", aux: "Float64"}, // value is math.Float64frombits(uint64(auxint)) + {name: "ConstInterface"}, // nil interface + {name: "ConstSlice"}, // nil slice + + // Constant-like things + {name: "InitMem", zeroWidth: true}, // memory input to the function. + {name: "Arg", aux: "SymOff", symEffect: "Read", zeroWidth: true}, // argument to the function. aux=GCNode of arg, off = offset in that arg. + + // Like Arg, these are generic ops that survive lowering. AuxInt is a register index, and the actual output register for each index is defined by the architecture. + // AuxInt = integer argument index (not a register number). ABI-specified spill loc obtained from function + {name: "ArgIntReg", aux: "NameOffsetInt8", zeroWidth: true}, // argument to the function in an int reg. + {name: "ArgFloatReg", aux: "NameOffsetInt8", zeroWidth: true}, // argument to the function in a float reg. + + // The address of a variable. arg0 is the base pointer. + // If the variable is a global, the base pointer will be SB and + // the Aux field will be a *obj.LSym. + // If the variable is a local, the base pointer will be SP and + // the Aux field will be a *gc.Node. + {name: "Addr", argLength: 1, aux: "Sym", symEffect: "Addr"}, // Address of a variable. Arg0=SB. Aux identifies the variable. + {name: "LocalAddr", argLength: 2, aux: "Sym", symEffect: "Addr"}, // Address of a variable. Arg0=SP. Arg1=mem. Aux identifies the variable. + + {name: "SP", zeroWidth: true}, // stack pointer + {name: "SB", typ: "Uintptr", zeroWidth: true}, // static base pointer (a.k.a. globals pointer) + {name: "Invalid"}, // unused value + + // Memory operations + {name: "Load", argLength: 2}, // Load from arg0. arg1=memory + {name: "Dereference", argLength: 2}, // Load from arg0. arg1=memory. Helper op for arg/result passing, result is an otherwise not-SSA-able "value". + {name: "Store", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. + // Normally we require that the source and destination of Move do not overlap. + // There is an exception when we know all the loads will happen before all + // the stores. In that case, overlap is ok. See + // memmove inlining in generic.rules. When inlineablememmovesize (in ../rewrite.go) + // returns true, we must do all loads before all stores, when lowering Move. + // The type of Move is used for the write barrier pass to insert write barriers + // and for alignment on some architectures. + // For pointerless types, it is possible for the type to be inaccurate. + // For type alignment and pointer information, use the type in Aux; + // for type size, use the size in AuxInt. + // The "inline runtime.memmove" rewrite rule generates Moves with inaccurate types, + // such as type byte instead of the more accurate type [8]byte. + {name: "Move", argLength: 3, typ: "Mem", aux: "TypSize"}, // arg0=destptr, arg1=srcptr, arg2=mem, auxint=size, aux=type. Returns memory. + {name: "Zero", argLength: 2, typ: "Mem", aux: "TypSize"}, // arg0=destptr, arg1=mem, auxint=size, aux=type. Returns memory. + + // Memory operations with write barriers. + // Expand to runtime calls. Write barrier will be removed if write on stack. + {name: "StoreWB", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory. + {name: "MoveWB", argLength: 3, typ: "Mem", aux: "TypSize"}, // arg0=destptr, arg1=srcptr, arg2=mem, auxint=size, aux=type. Returns memory. + {name: "ZeroWB", argLength: 2, typ: "Mem", aux: "TypSize"}, // arg0=destptr, arg1=mem, auxint=size, aux=type. Returns memory. + + // WB invokes runtime.gcWriteBarrier. This is not a normal + // call: it takes arguments in registers, doesn't clobber + // general-purpose registers (the exact clobber set is + // arch-dependent), and is not a safe-point. + {name: "WB", argLength: 3, typ: "Mem", aux: "Sym", symEffect: "None"}, // arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier + + {name: "HasCPUFeature", argLength: 0, typ: "bool", aux: "Sym", symEffect: "None"}, // aux=place that this feature flag can be loaded from + + // PanicBounds and PanicExtend generate a runtime panic. + // Their arguments provide index values to use in panic messages. + // Both PanicBounds and PanicExtend have an AuxInt value from the BoundsKind type (in ../op.go). + // PanicBounds' index is int sized. + // PanicExtend's index is int64 sized. (PanicExtend is only used on 32-bit archs.) + {name: "PanicBounds", argLength: 3, aux: "Int64", typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. + {name: "PanicExtend", argLength: 4, aux: "Int64", typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. + + // Function calls. Arguments to the call have already been written to the stack. + // Return values appear on the stack. The method receiver, if any, is treated + // as a phantom first argument. + // TODO(josharian): ClosureCall and InterCall should have Int32 aux + // to match StaticCall's 32 bit arg size limit. + // TODO(drchase,josharian): could the arg size limit be bundled into the rules for CallOff? + + // Before lowering, LECalls receive their fixed inputs (first), memory (last), + // and a variable number of input values in the middle. + // They produce a variable number of result values. + // These values are not necessarily "SSA-able"; they can be too large, + // but in that case inputs are loaded immediately before with OpDereference, + // and outputs are stored immediately with OpStore. + // + // After call expansion, Calls have the same fixed-middle-memory arrangement of inputs, + // with the difference that the "middle" is only the register-resident inputs, + // and the non-register inputs are instead stored at ABI-defined offsets from SP + // (and the stores thread through the memory that is ultimately an input to the call). + // Outputs follow a similar pattern; register-resident outputs are the leading elements + // of a Result-typed output, with memory last, and any memory-resident outputs have been + // stored to ABI-defined locations. Each non-memory input or output fits in a register. + // + // Subsequent architecture-specific lowering only changes the opcode. + + {name: "ClosureCall", argLength: -1, aux: "CallOff", call: true}, // arg0=code pointer, arg1=context ptr, arg2..argN-1 are register inputs, argN=memory. auxint=arg size. Returns Result of register results, plus memory. + {name: "StaticCall", argLength: -1, aux: "CallOff", call: true}, // call function aux.(*obj.LSym), arg0..argN-1 are register inputs, argN=memory. auxint=arg size. Returns Result of register results, plus memory. + {name: "InterCall", argLength: -1, aux: "CallOff", call: true}, // interface call. arg0=code pointer, arg1..argN-1 are register inputs, argN=memory, auxint=arg size. Returns Result of register results, plus memory. + {name: "TailCall", argLength: -1, aux: "CallOff", call: true}, // tail call function aux.(*obj.LSym), arg0..argN-1 are register inputs, argN=memory. auxint=arg size. Returns Result of register results, plus memory. + + {name: "ClosureLECall", argLength: -1, aux: "CallOff", call: true}, // late-expanded closure call. arg0=code pointer, arg1=context ptr, arg2..argN-1 are inputs, argN is mem. auxint = arg size. Result is tuple of result(s), plus mem. + {name: "StaticLECall", argLength: -1, aux: "CallOff", call: true}, // late-expanded static call function aux.(*ssa.AuxCall.Fn). arg0..argN-1 are inputs, argN is mem. auxint = arg size. Result is tuple of result(s), plus mem. + {name: "InterLECall", argLength: -1, aux: "CallOff", call: true}, // late-expanded interface call. arg0=code pointer, arg1..argN-1 are inputs, argN is mem. auxint = arg size. Result is tuple of result(s), plus mem. + {name: "TailLECall", argLength: -1, aux: "CallOff", call: true}, // late-expanded static tail call function aux.(*ssa.AuxCall.Fn). arg0..argN-1 are inputs, argN is mem. auxint = arg size. Result is tuple of result(s), plus mem. + + // Conversions: signed extensions, zero (unsigned) extensions, truncations + {name: "SignExt8to16", argLength: 1, typ: "Int16"}, + {name: "SignExt8to32", argLength: 1, typ: "Int32"}, + {name: "SignExt8to64", argLength: 1, typ: "Int64"}, + {name: "SignExt16to32", argLength: 1, typ: "Int32"}, + {name: "SignExt16to64", argLength: 1, typ: "Int64"}, + {name: "SignExt32to64", argLength: 1, typ: "Int64"}, + {name: "ZeroExt8to16", argLength: 1, typ: "UInt16"}, + {name: "ZeroExt8to32", argLength: 1, typ: "UInt32"}, + {name: "ZeroExt8to64", argLength: 1, typ: "UInt64"}, + {name: "ZeroExt16to32", argLength: 1, typ: "UInt32"}, + {name: "ZeroExt16to64", argLength: 1, typ: "UInt64"}, + {name: "ZeroExt32to64", argLength: 1, typ: "UInt64"}, + {name: "Trunc16to8", argLength: 1}, + {name: "Trunc32to8", argLength: 1}, + {name: "Trunc32to16", argLength: 1}, + {name: "Trunc64to8", argLength: 1}, + {name: "Trunc64to16", argLength: 1}, + {name: "Trunc64to32", argLength: 1}, + + {name: "Cvt32to32F", argLength: 1}, + {name: "Cvt32to64F", argLength: 1}, + {name: "Cvt64to32F", argLength: 1}, + {name: "Cvt64to64F", argLength: 1}, + {name: "Cvt32Fto32", argLength: 1}, + {name: "Cvt32Fto64", argLength: 1}, + {name: "Cvt64Fto32", argLength: 1}, + {name: "Cvt64Fto64", argLength: 1}, + {name: "Cvt32Fto64F", argLength: 1}, + {name: "Cvt64Fto32F", argLength: 1}, + {name: "CvtBoolToUint8", argLength: 1}, + + // Force rounding to precision of type. + {name: "Round32F", argLength: 1}, + {name: "Round64F", argLength: 1}, + + // Automatically inserted safety checks + {name: "IsNonNil", argLength: 1, typ: "Bool"}, // arg0 != nil + {name: "IsInBounds", argLength: 2, typ: "Bool"}, // 0 <= arg0 < arg1. arg1 is guaranteed >= 0. + {name: "IsSliceInBounds", argLength: 2, typ: "Bool"}, // 0 <= arg0 <= arg1. arg1 is guaranteed >= 0. + {name: "NilCheck", argLength: 2, typ: "Void"}, // arg0=ptr, arg1=mem. Panics if arg0 is nil. Returns void. + + // Pseudo-ops + {name: "GetG", argLength: 1, zeroWidth: true}, // runtime.getg() (read g pointer). arg0=mem + {name: "GetClosurePtr"}, // get closure pointer from dedicated register + {name: "GetCallerPC"}, // for getcallerpc intrinsic + {name: "GetCallerSP"}, // for getcallersp intrinsic + + // Indexing operations + {name: "PtrIndex", argLength: 2}, // arg0=ptr, arg1=index. Computes ptr+sizeof(*v.type)*index, where index is extended to ptrwidth type + {name: "OffPtr", argLength: 1, aux: "Int64"}, // arg0 + auxint (arg0 and result are pointers) + + // Slices + {name: "SliceMake", argLength: 3}, // arg0=ptr, arg1=len, arg2=cap + {name: "SlicePtr", argLength: 1, typ: "BytePtr"}, // ptr(arg0) + {name: "SliceLen", argLength: 1}, // len(arg0) + {name: "SliceCap", argLength: 1}, // cap(arg0) + // SlicePtrUnchecked, like SlicePtr, extracts the pointer from a slice. + // SlicePtr values are assumed non-nil, because they are guarded by bounds checks. + // SlicePtrUnchecked values can be nil. + {name: "SlicePtrUnchecked", argLength: 1}, + + // Complex (part/whole) + {name: "ComplexMake", argLength: 2}, // arg0=real, arg1=imag + {name: "ComplexReal", argLength: 1}, // real(arg0) + {name: "ComplexImag", argLength: 1}, // imag(arg0) + + // Strings + {name: "StringMake", argLength: 2}, // arg0=ptr, arg1=len + {name: "StringPtr", argLength: 1, typ: "BytePtr"}, // ptr(arg0) + {name: "StringLen", argLength: 1, typ: "Int"}, // len(arg0) + + // Interfaces + {name: "IMake", argLength: 2}, // arg0=itab, arg1=data + {name: "ITab", argLength: 1, typ: "Uintptr"}, // arg0=interface, returns itable field + {name: "IData", argLength: 1}, // arg0=interface, returns data field + + // Structs + {name: "StructMake0"}, // Returns struct with 0 fields. + {name: "StructMake1", argLength: 1}, // arg0=field0. Returns struct. + {name: "StructMake2", argLength: 2}, // arg0,arg1=field0,field1. Returns struct. + {name: "StructMake3", argLength: 3}, // arg0..2=field0..2. Returns struct. + {name: "StructMake4", argLength: 4}, // arg0..3=field0..3. Returns struct. + {name: "StructSelect", argLength: 1, aux: "Int64"}, // arg0=struct, auxint=field index. Returns the auxint'th field. + + // Arrays + {name: "ArrayMake0"}, // Returns array with 0 elements + {name: "ArrayMake1", argLength: 1}, // Returns array with 1 element + {name: "ArraySelect", argLength: 1, aux: "Int64"}, // arg0=array, auxint=index. Returns a[i]. + + // Spill&restore ops for the register allocator. These are + // semantically identical to OpCopy; they do not take/return + // stores like regular memory ops do. We can get away without memory + // args because we know there is no aliasing of spill slots on the stack. + {name: "StoreReg", argLength: 1}, + {name: "LoadReg", argLength: 1}, + + // Used during ssa construction. Like Copy, but the arg has not been specified yet. + {name: "FwdRef", aux: "Sym", symEffect: "None"}, + + // Unknown value. Used for Values whose values don't matter because they are dead code. + {name: "Unknown"}, + + {name: "VarDef", argLength: 1, aux: "Sym", typ: "Mem", symEffect: "None", zeroWidth: true}, // aux is a *gc.Node of a variable that is about to be initialized. arg0=mem, returns mem + // TODO: what's the difference between VarLive and KeepAlive? + {name: "VarLive", argLength: 1, aux: "Sym", symEffect: "Read", zeroWidth: true}, // aux is a *gc.Node of a variable that must be kept live. arg0=mem, returns mem + {name: "KeepAlive", argLength: 2, typ: "Mem", zeroWidth: true}, // arg[0] is a value that must be kept alive until this mark. arg[1]=mem, returns mem + + // InlMark marks the start of an inlined function body. Its AuxInt field + // distinguishes which entry in the local inline tree it is marking. + {name: "InlMark", argLength: 1, aux: "Int32", typ: "Void"}, // arg[0]=mem, returns void. + + // Ops for breaking 64-bit operations on 32-bit architectures + {name: "Int64Make", argLength: 2, typ: "UInt64"}, // arg0=hi, arg1=lo + {name: "Int64Hi", argLength: 1, typ: "UInt32"}, // high 32-bit of arg0 + {name: "Int64Lo", argLength: 1, typ: "UInt32"}, // low 32-bit of arg0 + + {name: "Add32carry", argLength: 2, commutative: true, typ: "(UInt32,Flags)"}, // arg0 + arg1, returns (value, carry) + {name: "Add32withcarry", argLength: 3, commutative: true}, // arg0 + arg1 + arg2, arg2=carry (0 or 1) + + {name: "Sub32carry", argLength: 2, typ: "(UInt32,Flags)"}, // arg0 - arg1, returns (value, carry) + {name: "Sub32withcarry", argLength: 3}, // arg0 - arg1 - arg2, arg2=carry (0 or 1) + + {name: "Add64carry", argLength: 3, commutative: true, typ: "(UInt64,UInt64)"}, // arg0 + arg1 + arg2, arg2 must be 0 or 1. returns (value, value>>64) + {name: "Sub64borrow", argLength: 3, typ: "(UInt64,UInt64)"}, // arg0 - (arg1 + arg2), arg2 must be 0 or 1. returns (value, value>>64&1) + + {name: "Signmask", argLength: 1, typ: "Int32"}, // 0 if arg0 >= 0, -1 if arg0 < 0 + {name: "Zeromask", argLength: 1, typ: "UInt32"}, // 0 if arg0 == 0, 0xffffffff if arg0 != 0 + {name: "Slicemask", argLength: 1}, // 0 if arg0 == 0, -1 if arg0 > 0, undef if arg0<0. Type is native int size. + + {name: "SpectreIndex", argLength: 2}, // arg0 if 0 <= arg0 < arg1, 0 otherwise. Type is native int size. + {name: "SpectreSliceIndex", argLength: 2}, // arg0 if 0 <= arg0 <= arg1, 0 otherwise. Type is native int size. + + {name: "Cvt32Uto32F", argLength: 1}, // uint32 -> float32, only used on 32-bit arch + {name: "Cvt32Uto64F", argLength: 1}, // uint32 -> float64, only used on 32-bit arch + {name: "Cvt32Fto32U", argLength: 1}, // float32 -> uint32, only used on 32-bit arch + {name: "Cvt64Fto32U", argLength: 1}, // float64 -> uint32, only used on 32-bit arch + {name: "Cvt64Uto32F", argLength: 1}, // uint64 -> float32, only used on archs that has the instruction + {name: "Cvt64Uto64F", argLength: 1}, // uint64 -> float64, only used on archs that has the instruction + {name: "Cvt32Fto64U", argLength: 1}, // float32 -> uint64, only used on archs that has the instruction + {name: "Cvt64Fto64U", argLength: 1}, // float64 -> uint64, only used on archs that has the instruction + + // pseudo-ops for breaking Tuple + {name: "Select0", argLength: 1, zeroWidth: true}, // the first component of a tuple + {name: "Select1", argLength: 1, zeroWidth: true}, // the second component of a tuple + {name: "SelectN", argLength: 1, aux: "Int64"}, // arg0=result, auxint=field index. Returns the auxint'th member. + {name: "SelectNAddr", argLength: 1, aux: "Int64"}, // arg0=result, auxint=field index. Returns the address of auxint'th member. Used for un-SSA-able result types. + {name: "MakeResult", argLength: -1}, // arg0 .. are components of a "Result" (like the result from a Call). The last arg should be memory (like the result from a call). + + // Atomic operations used for semantically inlining sync/atomic and + // runtime/internal/atomic. Atomic loads return a new memory so that + // the loads are properly ordered with respect to other loads and + // stores. + {name: "AtomicLoad8", argLength: 2, typ: "(UInt8,Mem)"}, // Load from arg0. arg1=memory. Returns loaded value and new memory. + {name: "AtomicLoad32", argLength: 2, typ: "(UInt32,Mem)"}, // Load from arg0. arg1=memory. Returns loaded value and new memory. + {name: "AtomicLoad64", argLength: 2, typ: "(UInt64,Mem)"}, // Load from arg0. arg1=memory. Returns loaded value and new memory. + {name: "AtomicLoadPtr", argLength: 2, typ: "(BytePtr,Mem)"}, // Load from arg0. arg1=memory. Returns loaded value and new memory. + {name: "AtomicLoadAcq32", argLength: 2, typ: "(UInt32,Mem)"}, // Load from arg0. arg1=memory. Lock acquisition, returns loaded value and new memory. + {name: "AtomicLoadAcq64", argLength: 2, typ: "(UInt64,Mem)"}, // Load from arg0. arg1=memory. Lock acquisition, returns loaded value and new memory. + {name: "AtomicStore8", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns memory. + {name: "AtomicStore32", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns memory. + {name: "AtomicStore64", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns memory. + {name: "AtomicStorePtrNoWB", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns memory. + {name: "AtomicStoreRel32", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Lock release, returns memory. + {name: "AtomicStoreRel64", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Lock release, returns memory. + {name: "AtomicExchange32", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns old contents of *arg0 and new memory. + {name: "AtomicExchange64", argLength: 3, typ: "(UInt64,Mem)", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns old contents of *arg0 and new memory. + {name: "AtomicAdd32", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true}, // Do *arg0 += arg1. arg2=memory. Returns sum and new memory. + {name: "AtomicAdd64", argLength: 3, typ: "(UInt64,Mem)", hasSideEffects: true}, // Do *arg0 += arg1. arg2=memory. Returns sum and new memory. + {name: "AtomicCompareAndSwap32", argLength: 4, typ: "(Bool,Mem)", hasSideEffects: true}, // if *arg0==arg1, then set *arg0=arg2. Returns true if store happens and new memory. + {name: "AtomicCompareAndSwap64", argLength: 4, typ: "(Bool,Mem)", hasSideEffects: true}, // if *arg0==arg1, then set *arg0=arg2. Returns true if store happens and new memory. + {name: "AtomicCompareAndSwapRel32", argLength: 4, typ: "(Bool,Mem)", hasSideEffects: true}, // if *arg0==arg1, then set *arg0=arg2. Lock release, reports whether store happens and new memory. + {name: "AtomicAnd8", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 &= arg1. arg2=memory. Returns memory. + {name: "AtomicAnd32", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 &= arg1. arg2=memory. Returns memory. + {name: "AtomicOr8", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 |= arg1. arg2=memory. Returns memory. + {name: "AtomicOr32", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 |= arg1. arg2=memory. Returns memory. + + // Atomic operation variants + // These variants have the same semantics as above atomic operations. + // But they are used for generating more efficient code on certain modern machines, with run-time CPU feature detection. + // Currently, they are used on ARM64 only. + {name: "AtomicAdd32Variant", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true}, // Do *arg0 += arg1. arg2=memory. Returns sum and new memory. + {name: "AtomicAdd64Variant", argLength: 3, typ: "(UInt64,Mem)", hasSideEffects: true}, // Do *arg0 += arg1. arg2=memory. Returns sum and new memory. + {name: "AtomicExchange32Variant", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns old contents of *arg0 and new memory. + {name: "AtomicExchange64Variant", argLength: 3, typ: "(UInt64,Mem)", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns old contents of *arg0 and new memory. + {name: "AtomicCompareAndSwap32Variant", argLength: 4, typ: "(Bool,Mem)", hasSideEffects: true}, // if *arg0==arg1, then set *arg0=arg2. Returns true if store happens and new memory. + {name: "AtomicCompareAndSwap64Variant", argLength: 4, typ: "(Bool,Mem)", hasSideEffects: true}, // if *arg0==arg1, then set *arg0=arg2. Returns true if store happens and new memory. + {name: "AtomicAnd8Variant", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 &= arg1. arg2=memory. Returns memory. + {name: "AtomicAnd32Variant", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 &= arg1. arg2=memory. Returns memory. + {name: "AtomicOr8Variant", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 |= arg1. arg2=memory. Returns memory. + {name: "AtomicOr32Variant", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 |= arg1. arg2=memory. Returns memory. + + // Publication barrier + {name: "PubBarrier", argLength: 1, hasSideEffects: true}, // Do data barrier. arg0=memory. + + // Clobber experiment op + {name: "Clobber", argLength: 0, typ: "Void", aux: "SymOff", symEffect: "None"}, // write an invalid pointer value to the given pointer slot of a stack variable + {name: "ClobberReg", argLength: 0, typ: "Void"}, // clobber a register + + // Prefetch instruction + {name: "PrefetchCache", argLength: 2, hasSideEffects: true}, // Do prefetch arg0 to cache. arg0=addr, arg1=memory. + {name: "PrefetchCacheStreamed", argLength: 2, hasSideEffects: true}, // Do non-temporal or streamed prefetch arg0 to cache. arg0=addr, arg1=memory. +} + +// kind controls successors implicit exit +// ---------------------------------------------------------- +// Exit [return mem] [] yes +// Ret [return mem] [] yes +// RetJmp [return mem] [] yes +// Plain [] [next] +// If [boolean Value] [then, else] +// First [] [always, never] + +var genericBlocks = []blockData{ + {name: "Plain"}, // a single successor + {name: "If", controls: 1}, // if Controls[0] goto Succs[0] else goto Succs[1] + {name: "Defer", controls: 1}, // Succs[0]=defer queued, Succs[1]=defer recovered. Controls[0] is call op (of memory type) + {name: "Ret", controls: 1}, // no successors, Controls[0] value is memory result + {name: "RetJmp", controls: 1}, // no successors, Controls[0] value is a tail call + {name: "Exit", controls: 1}, // no successors, Controls[0] value generates a panic + {name: "JumpTable", controls: 1}, // multiple successors, the integer Controls[0] selects which one + + // transient block state used for dead code removal + {name: "First"}, // 2 successors, always takes the first one (second is dead) +} + +func init() { + archs = append(archs, arch{ + name: "generic", + ops: genericOps, + blocks: genericBlocks, + generic: true, + }) +} diff --git a/src/cmd/compile/internal/ssa/_gen/main.go b/src/cmd/compile/internal/ssa/_gen/main.go new file mode 100644 index 0000000..9251ba5 --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/main.go @@ -0,0 +1,571 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// The gen command generates Go code (in the parent directory) for all +// the architecture-specific opcodes, blocks, and rewrites. +package main + +import ( + "bytes" + "flag" + "fmt" + "go/format" + "log" + "math/bits" + "os" + "path" + "regexp" + "runtime" + "runtime/pprof" + "runtime/trace" + "sort" + "strings" + "sync" +) + +// TODO: capitalize these types, so that we can more easily tell variable names +// apart from type names, and avoid awkward func parameters like "arch arch". + +type arch struct { + name string + pkg string // obj package to import for this arch. + genfile string // source file containing opcode code generation. + ops []opData + blocks []blockData + regnames []string + ParamIntRegNames string + ParamFloatRegNames string + gpregmask regMask + fpregmask regMask + fp32regmask regMask + fp64regmask regMask + specialregmask regMask + framepointerreg int8 + linkreg int8 + generic bool + imports []string +} + +type opData struct { + name string + reg regInfo + asm string + typ string // default result type + aux string + rematerializeable bool + argLength int32 // number of arguments, if -1, then this operation has a variable number of arguments + commutative bool // this operation is commutative on its first 2 arguments (e.g. addition) + resultInArg0 bool // (first, if a tuple) output of v and v.Args[0] must be allocated to the same register + resultNotInArgs bool // outputs must not be allocated to the same registers as inputs + clobberFlags bool // this op clobbers flags register + needIntTemp bool // need a temporary free integer register + call bool // is a function call + tailCall bool // is a tail call + nilCheck bool // this op is a nil check on arg0 + faultOnNilArg0 bool // this op will fault if arg0 is nil (and aux encodes a small offset) + faultOnNilArg1 bool // this op will fault if arg1 is nil (and aux encodes a small offset) + hasSideEffects bool // for "reasons", not to be eliminated. E.g., atomic store, #19182. + zeroWidth bool // op never translates into any machine code. example: copy, which may sometimes translate to machine code, is not zero-width. + unsafePoint bool // this op is an unsafe point, i.e. not safe for async preemption + symEffect string // effect this op has on symbol in aux + scale uint8 // amd64/386 indexed load scale +} + +type blockData struct { + name string // the suffix for this block ("EQ", "LT", etc.) + controls int // the number of control values this type of block requires + aux string // the type of the Aux/AuxInt value, if any +} + +type regInfo struct { + // inputs[i] encodes the set of registers allowed for the i'th input. + // Inputs that don't use registers (flags, memory, etc.) should be 0. + inputs []regMask + // clobbers encodes the set of registers that are overwritten by + // the instruction (other than the output registers). + clobbers regMask + // outputs[i] encodes the set of registers allowed for the i'th output. + outputs []regMask +} + +type regMask uint64 + +func (a arch) regMaskComment(r regMask) string { + var buf strings.Builder + for i := uint64(0); r != 0; i++ { + if r&1 != 0 { + if buf.Len() == 0 { + buf.WriteString(" //") + } + buf.WriteString(" ") + buf.WriteString(a.regnames[i]) + } + r >>= 1 + } + return buf.String() +} + +var archs []arch + +var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to `file`") +var memprofile = flag.String("memprofile", "", "write memory profile to `file`") +var tracefile = flag.String("trace", "", "write trace to `file`") + +func main() { + flag.Parse() + if *cpuprofile != "" { + f, err := os.Create(*cpuprofile) + if err != nil { + log.Fatal("could not create CPU profile: ", err) + } + defer f.Close() + if err := pprof.StartCPUProfile(f); err != nil { + log.Fatal("could not start CPU profile: ", err) + } + defer pprof.StopCPUProfile() + } + if *tracefile != "" { + f, err := os.Create(*tracefile) + if err != nil { + log.Fatalf("failed to create trace output file: %v", err) + } + defer func() { + if err := f.Close(); err != nil { + log.Fatalf("failed to close trace file: %v", err) + } + }() + + if err := trace.Start(f); err != nil { + log.Fatalf("failed to start trace: %v", err) + } + defer trace.Stop() + } + + sort.Sort(ArchsByName(archs)) + + // The generate tasks are run concurrently, since they are CPU-intensive + // that can easily make use of many cores on a machine. + // + // Note that there is no limit on the concurrency at the moment. On a + // four-core laptop at the time of writing, peak RSS usually reaches + // ~200MiB, which seems doable by practically any machine nowadays. If + // that stops being the case, we can cap this func to a fixed number of + // architectures being generated at once. + + tasks := []func(){ + genOp, + genAllocators, + } + for _, a := range archs { + a := a // the funcs are ran concurrently at a later time + tasks = append(tasks, func() { + genRules(a) + genSplitLoadRules(a) + genLateLowerRules(a) + }) + } + var wg sync.WaitGroup + for _, task := range tasks { + task := task + wg.Add(1) + go func() { + task() + wg.Done() + }() + } + wg.Wait() + + if *memprofile != "" { + f, err := os.Create(*memprofile) + if err != nil { + log.Fatal("could not create memory profile: ", err) + } + defer f.Close() + runtime.GC() // get up-to-date statistics + if err := pprof.WriteHeapProfile(f); err != nil { + log.Fatal("could not write memory profile: ", err) + } + } +} + +func genOp() { + w := new(bytes.Buffer) + fmt.Fprintf(w, "// Code generated from _gen/*Ops.go; DO NOT EDIT.\n") + fmt.Fprintln(w) + fmt.Fprintln(w, "package ssa") + + fmt.Fprintln(w, "import (") + fmt.Fprintln(w, "\"cmd/internal/obj\"") + for _, a := range archs { + if a.pkg != "" { + fmt.Fprintf(w, "%q\n", a.pkg) + } + } + fmt.Fprintln(w, ")") + + // generate Block* declarations + fmt.Fprintln(w, "const (") + fmt.Fprintln(w, "BlockInvalid BlockKind = iota") + for _, a := range archs { + fmt.Fprintln(w) + for _, d := range a.blocks { + fmt.Fprintf(w, "Block%s%s\n", a.Name(), d.name) + } + } + fmt.Fprintln(w, ")") + + // generate block kind string method + fmt.Fprintln(w, "var blockString = [...]string{") + fmt.Fprintln(w, "BlockInvalid:\"BlockInvalid\",") + for _, a := range archs { + fmt.Fprintln(w) + for _, b := range a.blocks { + fmt.Fprintf(w, "Block%s%s:\"%s\",\n", a.Name(), b.name, b.name) + } + } + fmt.Fprintln(w, "}") + fmt.Fprintln(w, "func (k BlockKind) String() string {return blockString[k]}") + + // generate block kind auxint method + fmt.Fprintln(w, "func (k BlockKind) AuxIntType() string {") + fmt.Fprintln(w, "switch k {") + for _, a := range archs { + for _, b := range a.blocks { + if b.auxIntType() == "invalid" { + continue + } + fmt.Fprintf(w, "case Block%s%s: return \"%s\"\n", a.Name(), b.name, b.auxIntType()) + } + } + fmt.Fprintln(w, "}") + fmt.Fprintln(w, "return \"\"") + fmt.Fprintln(w, "}") + + // generate Op* declarations + fmt.Fprintln(w, "const (") + fmt.Fprintln(w, "OpInvalid Op = iota") // make sure OpInvalid is 0. + for _, a := range archs { + fmt.Fprintln(w) + for _, v := range a.ops { + if v.name == "Invalid" { + continue + } + fmt.Fprintf(w, "Op%s%s\n", a.Name(), v.name) + } + } + fmt.Fprintln(w, ")") + + // generate OpInfo table + fmt.Fprintln(w, "var opcodeTable = [...]opInfo{") + fmt.Fprintln(w, " { name: \"OpInvalid\" },") + for _, a := range archs { + fmt.Fprintln(w) + + pkg := path.Base(a.pkg) + for _, v := range a.ops { + if v.name == "Invalid" { + continue + } + fmt.Fprintln(w, "{") + fmt.Fprintf(w, "name:\"%s\",\n", v.name) + + // flags + if v.aux != "" { + fmt.Fprintf(w, "auxType: aux%s,\n", v.aux) + } + fmt.Fprintf(w, "argLen: %d,\n", v.argLength) + + if v.rematerializeable { + if v.reg.clobbers != 0 { + log.Fatalf("%s is rematerializeable and clobbers registers", v.name) + } + if v.clobberFlags { + log.Fatalf("%s is rematerializeable and clobbers flags", v.name) + } + fmt.Fprintln(w, "rematerializeable: true,") + } + if v.commutative { + fmt.Fprintln(w, "commutative: true,") + } + if v.resultInArg0 { + fmt.Fprintln(w, "resultInArg0: true,") + // OpConvert's register mask is selected dynamically, + // so don't try to check it in the static table. + if v.name != "Convert" && v.reg.inputs[0] != v.reg.outputs[0] { + log.Fatalf("%s: input[0] and output[0] must use the same registers for %s", a.name, v.name) + } + if v.name != "Convert" && v.commutative && v.reg.inputs[1] != v.reg.outputs[0] { + log.Fatalf("%s: input[1] and output[0] must use the same registers for %s", a.name, v.name) + } + } + if v.resultNotInArgs { + fmt.Fprintln(w, "resultNotInArgs: true,") + } + if v.clobberFlags { + fmt.Fprintln(w, "clobberFlags: true,") + } + if v.needIntTemp { + fmt.Fprintln(w, "needIntTemp: true,") + } + if v.call { + fmt.Fprintln(w, "call: true,") + } + if v.tailCall { + fmt.Fprintln(w, "tailCall: true,") + } + if v.nilCheck { + fmt.Fprintln(w, "nilCheck: true,") + } + if v.faultOnNilArg0 { + fmt.Fprintln(w, "faultOnNilArg0: true,") + if v.aux != "Sym" && v.aux != "SymOff" && v.aux != "SymValAndOff" && v.aux != "Int64" && v.aux != "Int32" && v.aux != "" { + log.Fatalf("faultOnNilArg0 with aux %s not allowed", v.aux) + } + } + if v.faultOnNilArg1 { + fmt.Fprintln(w, "faultOnNilArg1: true,") + if v.aux != "Sym" && v.aux != "SymOff" && v.aux != "SymValAndOff" && v.aux != "Int64" && v.aux != "Int32" && v.aux != "" { + log.Fatalf("faultOnNilArg1 with aux %s not allowed", v.aux) + } + } + if v.hasSideEffects { + fmt.Fprintln(w, "hasSideEffects: true,") + } + if v.zeroWidth { + fmt.Fprintln(w, "zeroWidth: true,") + } + if v.unsafePoint { + fmt.Fprintln(w, "unsafePoint: true,") + } + needEffect := strings.HasPrefix(v.aux, "Sym") + if v.symEffect != "" { + if !needEffect { + log.Fatalf("symEffect with aux %s not allowed", v.aux) + } + fmt.Fprintf(w, "symEffect: Sym%s,\n", strings.Replace(v.symEffect, ",", "|Sym", -1)) + } else if needEffect { + log.Fatalf("symEffect needed for aux %s", v.aux) + } + if a.name == "generic" { + fmt.Fprintln(w, "generic:true,") + fmt.Fprintln(w, "},") // close op + // generic ops have no reg info or asm + continue + } + if v.asm != "" { + fmt.Fprintf(w, "asm: %s.A%s,\n", pkg, v.asm) + } + if v.scale != 0 { + fmt.Fprintf(w, "scale: %d,\n", v.scale) + } + fmt.Fprintln(w, "reg:regInfo{") + + // Compute input allocation order. We allocate from the + // most to the least constrained input. This order guarantees + // that we will always be able to find a register. + var s []intPair + for i, r := range v.reg.inputs { + if r != 0 { + s = append(s, intPair{countRegs(r), i}) + } + } + if len(s) > 0 { + sort.Sort(byKey(s)) + fmt.Fprintln(w, "inputs: []inputInfo{") + for _, p := range s { + r := v.reg.inputs[p.val] + fmt.Fprintf(w, "{%d,%d},%s\n", p.val, r, a.regMaskComment(r)) + } + fmt.Fprintln(w, "},") + } + + if v.reg.clobbers > 0 { + fmt.Fprintf(w, "clobbers: %d,%s\n", v.reg.clobbers, a.regMaskComment(v.reg.clobbers)) + } + + // reg outputs + s = s[:0] + for i, r := range v.reg.outputs { + s = append(s, intPair{countRegs(r), i}) + } + if len(s) > 0 { + sort.Sort(byKey(s)) + fmt.Fprintln(w, "outputs: []outputInfo{") + for _, p := range s { + r := v.reg.outputs[p.val] + fmt.Fprintf(w, "{%d,%d},%s\n", p.val, r, a.regMaskComment(r)) + } + fmt.Fprintln(w, "},") + } + fmt.Fprintln(w, "},") // close reg info + fmt.Fprintln(w, "},") // close op + } + } + fmt.Fprintln(w, "}") + + fmt.Fprintln(w, "func (o Op) Asm() obj.As {return opcodeTable[o].asm}") + fmt.Fprintln(w, "func (o Op) Scale() int16 {return int16(opcodeTable[o].scale)}") + + // generate op string method + fmt.Fprintln(w, "func (o Op) String() string {return opcodeTable[o].name }") + + fmt.Fprintln(w, "func (o Op) SymEffect() SymEffect { return opcodeTable[o].symEffect }") + fmt.Fprintln(w, "func (o Op) IsCall() bool { return opcodeTable[o].call }") + fmt.Fprintln(w, "func (o Op) IsTailCall() bool { return opcodeTable[o].tailCall }") + fmt.Fprintln(w, "func (o Op) HasSideEffects() bool { return opcodeTable[o].hasSideEffects }") + fmt.Fprintln(w, "func (o Op) UnsafePoint() bool { return opcodeTable[o].unsafePoint }") + fmt.Fprintln(w, "func (o Op) ResultInArg0() bool { return opcodeTable[o].resultInArg0 }") + + // generate registers + for _, a := range archs { + if a.generic { + continue + } + fmt.Fprintf(w, "var registers%s = [...]Register {\n", a.name) + var gcRegN int + num := map[string]int8{} + for i, r := range a.regnames { + num[r] = int8(i) + pkg := a.pkg[len("cmd/internal/obj/"):] + var objname string // name in cmd/internal/obj/$ARCH + switch r { + case "SB": + // SB isn't a real register. cmd/internal/obj expects 0 in this case. + objname = "0" + case "SP": + objname = pkg + ".REGSP" + case "g": + objname = pkg + ".REGG" + default: + objname = pkg + ".REG_" + r + } + // Assign a GC register map index to registers + // that may contain pointers. + gcRegIdx := -1 + if a.gpregmask&(1<<uint(i)) != 0 { + gcRegIdx = gcRegN + gcRegN++ + } + fmt.Fprintf(w, " {%d, %s, %d, \"%s\"},\n", i, objname, gcRegIdx, r) + } + parameterRegisterList := func(paramNamesString string) []int8 { + paramNamesString = strings.TrimSpace(paramNamesString) + if paramNamesString == "" { + return nil + } + paramNames := strings.Split(paramNamesString, " ") + var paramRegs []int8 + for _, regName := range paramNames { + if regName == "" { + // forgive extra spaces + continue + } + if regNum, ok := num[regName]; ok { + paramRegs = append(paramRegs, regNum) + delete(num, regName) + } else { + log.Fatalf("parameter register %s for architecture %s not a register name (or repeated in parameter list)", regName, a.name) + } + } + return paramRegs + } + + paramIntRegs := parameterRegisterList(a.ParamIntRegNames) + paramFloatRegs := parameterRegisterList(a.ParamFloatRegNames) + + if gcRegN > 32 { + // Won't fit in a uint32 mask. + log.Fatalf("too many GC registers (%d > 32) on %s", gcRegN, a.name) + } + fmt.Fprintln(w, "}") + fmt.Fprintf(w, "var paramIntReg%s = %#v\n", a.name, paramIntRegs) + fmt.Fprintf(w, "var paramFloatReg%s = %#v\n", a.name, paramFloatRegs) + fmt.Fprintf(w, "var gpRegMask%s = regMask(%d)\n", a.name, a.gpregmask) + fmt.Fprintf(w, "var fpRegMask%s = regMask(%d)\n", a.name, a.fpregmask) + if a.fp32regmask != 0 { + fmt.Fprintf(w, "var fp32RegMask%s = regMask(%d)\n", a.name, a.fp32regmask) + } + if a.fp64regmask != 0 { + fmt.Fprintf(w, "var fp64RegMask%s = regMask(%d)\n", a.name, a.fp64regmask) + } + fmt.Fprintf(w, "var specialRegMask%s = regMask(%d)\n", a.name, a.specialregmask) + fmt.Fprintf(w, "var framepointerReg%s = int8(%d)\n", a.name, a.framepointerreg) + fmt.Fprintf(w, "var linkReg%s = int8(%d)\n", a.name, a.linkreg) + } + + // gofmt result + b := w.Bytes() + var err error + b, err = format.Source(b) + if err != nil { + fmt.Printf("%s\n", w.Bytes()) + panic(err) + } + + if err := os.WriteFile("../opGen.go", b, 0666); err != nil { + log.Fatalf("can't write output: %v\n", err) + } + + // Check that the arch genfile handles all the arch-specific opcodes. + // This is very much a hack, but it is better than nothing. + // + // Do a single regexp pass to record all ops being handled in a map, and + // then compare that with the ops list. This is much faster than one + // regexp pass per opcode. + for _, a := range archs { + if a.genfile == "" { + continue + } + + pattern := fmt.Sprintf(`\Wssa\.Op%s([a-zA-Z0-9_]+)\W`, a.name) + rxOp, err := regexp.Compile(pattern) + if err != nil { + log.Fatalf("bad opcode regexp %s: %v", pattern, err) + } + + src, err := os.ReadFile(a.genfile) + if err != nil { + log.Fatalf("can't read %s: %v", a.genfile, err) + } + seen := make(map[string]bool, len(a.ops)) + for _, m := range rxOp.FindAllSubmatch(src, -1) { + seen[string(m[1])] = true + } + for _, op := range a.ops { + if !seen[op.name] { + log.Fatalf("Op%s%s has no code generation in %s", a.name, op.name, a.genfile) + } + } + } +} + +// Name returns the name of the architecture for use in Op* and Block* enumerations. +func (a arch) Name() string { + s := a.name + if s == "generic" { + s = "" + } + return s +} + +// countRegs returns the number of set bits in the register mask. +func countRegs(r regMask) int { + return bits.OnesCount64(uint64(r)) +} + +// for sorting a pair of integers by key +type intPair struct { + key, val int +} +type byKey []intPair + +func (a byKey) Len() int { return len(a) } +func (a byKey) Swap(i, j int) { a[i], a[j] = a[j], a[i] } +func (a byKey) Less(i, j int) bool { return a[i].key < a[j].key } + +type ArchsByName []arch + +func (x ArchsByName) Len() int { return len(x) } +func (x ArchsByName) Swap(i, j int) { x[i], x[j] = x[j], x[i] } +func (x ArchsByName) Less(i, j int) bool { return x[i].name < x[j].name } diff --git a/src/cmd/compile/internal/ssa/_gen/rulegen.go b/src/cmd/compile/internal/ssa/_gen/rulegen.go new file mode 100644 index 0000000..80fa37a --- /dev/null +++ b/src/cmd/compile/internal/ssa/_gen/rulegen.go @@ -0,0 +1,1885 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This program generates Go code that applies rewrite rules to a Value. +// The generated code implements a function of type func (v *Value) bool +// which reports whether if did something. +// Ideas stolen from Swift: http://www.hpl.hp.com/techreports/Compaq-DEC/WRL-2000-2.html + +package main + +import ( + "bufio" + "bytes" + "flag" + "fmt" + "go/ast" + "go/format" + "go/parser" + "go/printer" + "go/token" + "io" + "log" + "os" + "path" + "regexp" + "sort" + "strconv" + "strings" + + "golang.org/x/tools/go/ast/astutil" +) + +// rule syntax: +// sexpr [&& extra conditions] => [@block] sexpr +// +// sexpr are s-expressions (lisp-like parenthesized groupings) +// sexpr ::= [variable:](opcode sexpr*) +// | variable +// | <type> +// | [auxint] +// | {aux} +// +// aux ::= variable | {code} +// type ::= variable | {code} +// variable ::= some token +// opcode ::= one of the opcodes from the *Ops.go files + +// special rules: trailing ellipsis "..." (in the outermost sexpr?) must match on both sides of a rule. +// trailing three underscore "___" in the outermost match sexpr indicate the presence of +// extra ignored args that need not appear in the replacement + +// extra conditions is just a chunk of Go that evaluates to a boolean. It may use +// variables declared in the matching tsexpr. The variable "v" is predefined to be +// the value matched by the entire rule. + +// If multiple rules match, the first one in file order is selected. + +var ( + genLog = flag.Bool("log", false, "generate code that logs; for debugging only") + addLine = flag.Bool("line", false, "add line number comment to generated rules; for debugging only") +) + +type Rule struct { + Rule string + Loc string // file name & line number +} + +func (r Rule) String() string { + return fmt.Sprintf("rule %q at %s", r.Rule, r.Loc) +} + +func normalizeSpaces(s string) string { + return strings.Join(strings.Fields(strings.TrimSpace(s)), " ") +} + +// parse returns the matching part of the rule, additional conditions, and the result. +func (r Rule) parse() (match, cond, result string) { + s := strings.Split(r.Rule, "=>") + match = normalizeSpaces(s[0]) + result = normalizeSpaces(s[1]) + cond = "" + if i := strings.Index(match, "&&"); i >= 0 { + cond = normalizeSpaces(match[i+2:]) + match = normalizeSpaces(match[:i]) + } + return match, cond, result +} + +func genRules(arch arch) { genRulesSuffix(arch, "") } +func genSplitLoadRules(arch arch) { genRulesSuffix(arch, "splitload") } +func genLateLowerRules(arch arch) { genRulesSuffix(arch, "latelower") } + +func genRulesSuffix(arch arch, suff string) { + // Open input file. + text, err := os.Open(arch.name + suff + ".rules") + if err != nil { + if suff == "" { + // All architectures must have a plain rules file. + log.Fatalf("can't read rule file: %v", err) + } + // Some architectures have bonus rules files that others don't share. That's fine. + return + } + + // oprules contains a list of rules for each block and opcode + blockrules := map[string][]Rule{} + oprules := map[string][]Rule{} + + // read rule file + scanner := bufio.NewScanner(text) + rule := "" + var lineno int + var ruleLineno int // line number of "=>" + for scanner.Scan() { + lineno++ + line := scanner.Text() + if i := strings.Index(line, "//"); i >= 0 { + // Remove comments. Note that this isn't string safe, so + // it will truncate lines with // inside strings. Oh well. + line = line[:i] + } + rule += " " + line + rule = strings.TrimSpace(rule) + if rule == "" { + continue + } + if !strings.Contains(rule, "=>") { + continue + } + if ruleLineno == 0 { + ruleLineno = lineno + } + if strings.HasSuffix(rule, "=>") { + continue // continue on the next line + } + if n := balance(rule); n > 0 { + continue // open parentheses remain, continue on the next line + } else if n < 0 { + break // continuing the line can't help, and it will only make errors worse + } + + loc := fmt.Sprintf("%s%s.rules:%d", arch.name, suff, ruleLineno) + for _, rule2 := range expandOr(rule) { + r := Rule{Rule: rule2, Loc: loc} + if rawop := strings.Split(rule2, " ")[0][1:]; isBlock(rawop, arch) { + blockrules[rawop] = append(blockrules[rawop], r) + continue + } + // Do fancier value op matching. + match, _, _ := r.parse() + op, oparch, _, _, _, _ := parseValue(match, arch, loc) + opname := fmt.Sprintf("Op%s%s", oparch, op.name) + oprules[opname] = append(oprules[opname], r) + } + rule = "" + ruleLineno = 0 + } + if err := scanner.Err(); err != nil { + log.Fatalf("scanner failed: %v\n", err) + } + if balance(rule) != 0 { + log.Fatalf("%s.rules:%d: unbalanced rule: %v\n", arch.name, lineno, rule) + } + + // Order all the ops. + var ops []string + for op := range oprules { + ops = append(ops, op) + } + sort.Strings(ops) + + genFile := &File{Arch: arch, Suffix: suff} + // Main rewrite routine is a switch on v.Op. + fn := &Func{Kind: "Value", ArgLen: -1} + + sw := &Switch{Expr: exprf("v.Op")} + for _, op := range ops { + eop, ok := parseEllipsisRules(oprules[op], arch) + if ok { + if strings.Contains(oprules[op][0].Rule, "=>") && opByName(arch, op).aux != opByName(arch, eop).aux { + panic(fmt.Sprintf("can't use ... for ops that have different aux types: %s and %s", op, eop)) + } + swc := &Case{Expr: exprf("%s", op)} + swc.add(stmtf("v.Op = %s", eop)) + swc.add(stmtf("return true")) + sw.add(swc) + continue + } + + swc := &Case{Expr: exprf("%s", op)} + swc.add(stmtf("return rewriteValue%s%s_%s(v)", arch.name, suff, op)) + sw.add(swc) + } + if len(sw.List) > 0 { // skip if empty + fn.add(sw) + } + fn.add(stmtf("return false")) + genFile.add(fn) + + // Generate a routine per op. Note that we don't make one giant routine + // because it is too big for some compilers. + for _, op := range ops { + rules := oprules[op] + _, ok := parseEllipsisRules(oprules[op], arch) + if ok { + continue + } + + // rr is kept between iterations, so that each rule can check + // that the previous rule wasn't unconditional. + var rr *RuleRewrite + fn := &Func{ + Kind: "Value", + Suffix: fmt.Sprintf("_%s", op), + ArgLen: opByName(arch, op).argLength, + } + fn.add(declReserved("b", "v.Block")) + fn.add(declReserved("config", "b.Func.Config")) + fn.add(declReserved("fe", "b.Func.fe")) + fn.add(declReserved("typ", "&b.Func.Config.Types")) + for _, rule := range rules { + if rr != nil && !rr.CanFail { + log.Fatalf("unconditional rule %s is followed by other rules", rr.Match) + } + rr = &RuleRewrite{Loc: rule.Loc} + rr.Match, rr.Cond, rr.Result = rule.parse() + pos, _ := genMatch(rr, arch, rr.Match, fn.ArgLen >= 0) + if pos == "" { + pos = "v.Pos" + } + if rr.Cond != "" { + rr.add(breakf("!(%s)", rr.Cond)) + } + genResult(rr, arch, rr.Result, pos) + if *genLog { + rr.add(stmtf("logRule(%q)", rule.Loc)) + } + fn.add(rr) + } + if rr.CanFail { + fn.add(stmtf("return false")) + } + genFile.add(fn) + } + + // Generate block rewrite function. There are only a few block types + // so we can make this one function with a switch. + fn = &Func{Kind: "Block"} + fn.add(declReserved("config", "b.Func.Config")) + fn.add(declReserved("typ", "&b.Func.Config.Types")) + + sw = &Switch{Expr: exprf("b.Kind")} + ops = ops[:0] + for op := range blockrules { + ops = append(ops, op) + } + sort.Strings(ops) + for _, op := range ops { + name, data := getBlockInfo(op, arch) + swc := &Case{Expr: exprf("%s", name)} + for _, rule := range blockrules[op] { + swc.add(genBlockRewrite(rule, arch, data)) + } + sw.add(swc) + } + if len(sw.List) > 0 { // skip if empty + fn.add(sw) + } + fn.add(stmtf("return false")) + genFile.add(fn) + + // Remove unused imports and variables. + buf := new(bytes.Buffer) + fprint(buf, genFile) + fset := token.NewFileSet() + file, err := parser.ParseFile(fset, "", buf, parser.ParseComments) + if err != nil { + filename := fmt.Sprintf("%s_broken.go", arch.name) + if err := os.WriteFile(filename, buf.Bytes(), 0644); err != nil { + log.Printf("failed to dump broken code to %s: %v", filename, err) + } else { + log.Printf("dumped broken code to %s", filename) + } + log.Fatalf("failed to parse generated code for arch %s: %v", arch.name, err) + } + tfile := fset.File(file.Pos()) + + // First, use unusedInspector to find the unused declarations by their + // start position. + u := unusedInspector{unused: make(map[token.Pos]bool)} + u.node(file) + + // Then, delete said nodes via astutil.Apply. + pre := func(c *astutil.Cursor) bool { + node := c.Node() + if node == nil { + return true + } + if u.unused[node.Pos()] { + c.Delete() + // Unused imports and declarations use exactly + // one line. Prevent leaving an empty line. + tfile.MergeLine(tfile.Position(node.Pos()).Line) + return false + } + return true + } + post := func(c *astutil.Cursor) bool { + switch node := c.Node().(type) { + case *ast.GenDecl: + if len(node.Specs) == 0 { + // Don't leave a broken or empty GenDecl behind, + // such as "import ()". + c.Delete() + } + } + return true + } + file = astutil.Apply(file, pre, post).(*ast.File) + + // Write the well-formatted source to file + f, err := os.Create("../rewrite" + arch.name + suff + ".go") + if err != nil { + log.Fatalf("can't write output: %v", err) + } + defer f.Close() + // gofmt result; use a buffered writer, as otherwise go/format spends + // far too much time in syscalls. + bw := bufio.NewWriter(f) + if err := format.Node(bw, fset, file); err != nil { + log.Fatalf("can't format output: %v", err) + } + if err := bw.Flush(); err != nil { + log.Fatalf("can't write output: %v", err) + } + if err := f.Close(); err != nil { + log.Fatalf("can't write output: %v", err) + } +} + +// unusedInspector can be used to detect unused variables and imports in an +// ast.Node via its node method. The result is available in the "unused" map. +// +// note that unusedInspector is lazy and best-effort; it only supports the node +// types and patterns used by the rulegen program. +type unusedInspector struct { + // scope is the current scope, which can never be nil when a declaration + // is encountered. That is, the unusedInspector.node entrypoint should + // generally be an entire file or block. + scope *scope + + // unused is the resulting set of unused declared names, indexed by the + // starting position of the node that declared the name. + unused map[token.Pos]bool + + // defining is the object currently being defined; this is useful so + // that if "foo := bar" is unused and removed, we can then detect if + // "bar" becomes unused as well. + defining *object +} + +// scoped opens a new scope when called, and returns a function which closes +// that same scope. When a scope is closed, unused variables are recorded. +func (u *unusedInspector) scoped() func() { + outer := u.scope + u.scope = &scope{outer: outer, objects: map[string]*object{}} + return func() { + for anyUnused := true; anyUnused; { + anyUnused = false + for _, obj := range u.scope.objects { + if obj.numUses > 0 { + continue + } + u.unused[obj.pos] = true + for _, used := range obj.used { + if used.numUses--; used.numUses == 0 { + anyUnused = true + } + } + // We've decremented numUses for each of the + // objects in used. Zero this slice too, to keep + // everything consistent. + obj.used = nil + } + } + u.scope = outer + } +} + +func (u *unusedInspector) exprs(list []ast.Expr) { + for _, x := range list { + u.node(x) + } +} + +func (u *unusedInspector) node(node ast.Node) { + switch node := node.(type) { + case *ast.File: + defer u.scoped()() + for _, decl := range node.Decls { + u.node(decl) + } + case *ast.GenDecl: + for _, spec := range node.Specs { + u.node(spec) + } + case *ast.ImportSpec: + impPath, _ := strconv.Unquote(node.Path.Value) + name := path.Base(impPath) + u.scope.objects[name] = &object{ + name: name, + pos: node.Pos(), + } + case *ast.FuncDecl: + u.node(node.Type) + if node.Body != nil { + u.node(node.Body) + } + case *ast.FuncType: + if node.Params != nil { + u.node(node.Params) + } + if node.Results != nil { + u.node(node.Results) + } + case *ast.FieldList: + for _, field := range node.List { + u.node(field) + } + case *ast.Field: + u.node(node.Type) + + // statements + + case *ast.BlockStmt: + defer u.scoped()() + for _, stmt := range node.List { + u.node(stmt) + } + case *ast.DeclStmt: + u.node(node.Decl) + case *ast.IfStmt: + if node.Init != nil { + u.node(node.Init) + } + u.node(node.Cond) + u.node(node.Body) + if node.Else != nil { + u.node(node.Else) + } + case *ast.ForStmt: + if node.Init != nil { + u.node(node.Init) + } + if node.Cond != nil { + u.node(node.Cond) + } + if node.Post != nil { + u.node(node.Post) + } + u.node(node.Body) + case *ast.SwitchStmt: + if node.Init != nil { + u.node(node.Init) + } + if node.Tag != nil { + u.node(node.Tag) + } + u.node(node.Body) + case *ast.CaseClause: + u.exprs(node.List) + defer u.scoped()() + for _, stmt := range node.Body { + u.node(stmt) + } + case *ast.BranchStmt: + case *ast.ExprStmt: + u.node(node.X) + case *ast.AssignStmt: + if node.Tok != token.DEFINE { + u.exprs(node.Rhs) + u.exprs(node.Lhs) + break + } + lhs := node.Lhs + if len(lhs) == 2 && lhs[1].(*ast.Ident).Name == "_" { + lhs = lhs[:1] + } + if len(lhs) != 1 { + panic("no support for := with multiple names") + } + + name := lhs[0].(*ast.Ident) + obj := &object{ + name: name.Name, + pos: name.NamePos, + } + + old := u.defining + u.defining = obj + u.exprs(node.Rhs) + u.defining = old + + u.scope.objects[name.Name] = obj + case *ast.ReturnStmt: + u.exprs(node.Results) + case *ast.IncDecStmt: + u.node(node.X) + + // expressions + + case *ast.CallExpr: + u.node(node.Fun) + u.exprs(node.Args) + case *ast.SelectorExpr: + u.node(node.X) + case *ast.UnaryExpr: + u.node(node.X) + case *ast.BinaryExpr: + u.node(node.X) + u.node(node.Y) + case *ast.StarExpr: + u.node(node.X) + case *ast.ParenExpr: + u.node(node.X) + case *ast.IndexExpr: + u.node(node.X) + u.node(node.Index) + case *ast.TypeAssertExpr: + u.node(node.X) + u.node(node.Type) + case *ast.Ident: + if obj := u.scope.Lookup(node.Name); obj != nil { + obj.numUses++ + if u.defining != nil { + u.defining.used = append(u.defining.used, obj) + } + } + case *ast.BasicLit: + case *ast.ValueSpec: + u.exprs(node.Values) + default: + panic(fmt.Sprintf("unhandled node: %T", node)) + } +} + +// scope keeps track of a certain scope and its declared names, as well as the +// outer (parent) scope. +type scope struct { + outer *scope // can be nil, if this is the top-level scope + objects map[string]*object // indexed by each declared name +} + +func (s *scope) Lookup(name string) *object { + if obj := s.objects[name]; obj != nil { + return obj + } + if s.outer == nil { + return nil + } + return s.outer.Lookup(name) +} + +// object keeps track of a declared name, such as a variable or import. +type object struct { + name string + pos token.Pos // start position of the node declaring the object + + numUses int // number of times this object is used + used []*object // objects that its declaration makes use of +} + +func fprint(w io.Writer, n Node) { + switch n := n.(type) { + case *File: + file := n + seenRewrite := make(map[[3]string]string) + fmt.Fprintf(w, "// Code generated from _gen/%s%s.rules; DO NOT EDIT.\n", n.Arch.name, n.Suffix) + fmt.Fprintf(w, "// generated with: cd _gen; go run .\n") + fmt.Fprintf(w, "\npackage ssa\n") + for _, path := range append([]string{ + "fmt", + "internal/buildcfg", + "math", + "cmd/internal/obj", + "cmd/compile/internal/base", + "cmd/compile/internal/types", + }, n.Arch.imports...) { + fmt.Fprintf(w, "import %q\n", path) + } + for _, f := range n.List { + f := f.(*Func) + fmt.Fprintf(w, "func rewrite%s%s%s%s(", f.Kind, n.Arch.name, n.Suffix, f.Suffix) + fmt.Fprintf(w, "%c *%s) bool {\n", strings.ToLower(f.Kind)[0], f.Kind) + if f.Kind == "Value" && f.ArgLen > 0 { + for i := f.ArgLen - 1; i >= 0; i-- { + fmt.Fprintf(w, "v_%d := v.Args[%d]\n", i, i) + } + } + for _, n := range f.List { + fprint(w, n) + + if rr, ok := n.(*RuleRewrite); ok { + k := [3]string{ + normalizeMatch(rr.Match, file.Arch), + normalizeWhitespace(rr.Cond), + normalizeWhitespace(rr.Result), + } + if prev, ok := seenRewrite[k]; ok { + log.Fatalf("duplicate rule %s, previously seen at %s\n", rr.Loc, prev) + } + seenRewrite[k] = rr.Loc + } + } + fmt.Fprintf(w, "}\n") + } + case *Switch: + fmt.Fprintf(w, "switch ") + fprint(w, n.Expr) + fmt.Fprintf(w, " {\n") + for _, n := range n.List { + fprint(w, n) + } + fmt.Fprintf(w, "}\n") + case *Case: + fmt.Fprintf(w, "case ") + fprint(w, n.Expr) + fmt.Fprintf(w, ":\n") + for _, n := range n.List { + fprint(w, n) + } + case *RuleRewrite: + if *addLine { + fmt.Fprintf(w, "// %s\n", n.Loc) + } + fmt.Fprintf(w, "// match: %s\n", n.Match) + if n.Cond != "" { + fmt.Fprintf(w, "// cond: %s\n", n.Cond) + } + fmt.Fprintf(w, "// result: %s\n", n.Result) + fmt.Fprintf(w, "for %s {\n", n.Check) + nCommutative := 0 + for _, n := range n.List { + if b, ok := n.(*CondBreak); ok { + b.InsideCommuteLoop = nCommutative > 0 + } + fprint(w, n) + if loop, ok := n.(StartCommuteLoop); ok { + if nCommutative != loop.Depth { + panic("mismatch commute loop depth") + } + nCommutative++ + } + } + fmt.Fprintf(w, "return true\n") + for i := 0; i < nCommutative; i++ { + fmt.Fprintln(w, "}") + } + if n.CommuteDepth > 0 && n.CanFail { + fmt.Fprint(w, "break\n") + } + fmt.Fprintf(w, "}\n") + case *Declare: + fmt.Fprintf(w, "%s := ", n.Name) + fprint(w, n.Value) + fmt.Fprintln(w) + case *CondBreak: + fmt.Fprintf(w, "if ") + fprint(w, n.Cond) + fmt.Fprintf(w, " {\n") + if n.InsideCommuteLoop { + fmt.Fprintf(w, "continue") + } else { + fmt.Fprintf(w, "break") + } + fmt.Fprintf(w, "\n}\n") + case ast.Node: + printConfig.Fprint(w, emptyFset, n) + if _, ok := n.(ast.Stmt); ok { + fmt.Fprintln(w) + } + case StartCommuteLoop: + fmt.Fprintf(w, "for _i%[1]d := 0; _i%[1]d <= 1; _i%[1]d, %[2]s_0, %[2]s_1 = _i%[1]d + 1, %[2]s_1, %[2]s_0 {\n", n.Depth, n.V) + default: + log.Fatalf("cannot print %T", n) + } +} + +var printConfig = printer.Config{ + Mode: printer.RawFormat, // we use go/format later, so skip work here +} + +var emptyFset = token.NewFileSet() + +// Node can be a Statement or an ast.Expr. +type Node interface{} + +// Statement can be one of our high-level statement struct types, or an +// ast.Stmt under some limited circumstances. +type Statement interface{} + +// BodyBase is shared by all of our statement pseudo-node types which can +// contain other statements. +type BodyBase struct { + List []Statement + CanFail bool +} + +func (w *BodyBase) add(node Statement) { + var last Statement + if len(w.List) > 0 { + last = w.List[len(w.List)-1] + } + if node, ok := node.(*CondBreak); ok { + w.CanFail = true + if last, ok := last.(*CondBreak); ok { + // Add to the previous "if <cond> { break }" via a + // logical OR, which will save verbosity. + last.Cond = &ast.BinaryExpr{ + Op: token.LOR, + X: last.Cond, + Y: node.Cond, + } + return + } + } + + w.List = append(w.List, node) +} + +// predeclared contains globally known tokens that should not be redefined. +var predeclared = map[string]bool{ + "nil": true, + "false": true, + "true": true, +} + +// declared reports if the body contains a Declare with the given name. +func (w *BodyBase) declared(name string) bool { + if predeclared[name] { + // Treat predeclared names as having already been declared. + // This lets us use nil to match an aux field or + // true and false to match an auxint field. + return true + } + for _, s := range w.List { + if decl, ok := s.(*Declare); ok && decl.Name == name { + return true + } + } + return false +} + +// These types define some high-level statement struct types, which can be used +// as a Statement. This allows us to keep some node structs simpler, and have +// higher-level nodes such as an entire rule rewrite. +// +// Note that ast.Expr is always used as-is; we don't declare our own expression +// nodes. +type ( + File struct { + BodyBase // []*Func + Arch arch + Suffix string + } + Func struct { + BodyBase + Kind string // "Value" or "Block" + Suffix string + ArgLen int32 // if kind == "Value", number of args for this op + } + Switch struct { + BodyBase // []*Case + Expr ast.Expr + } + Case struct { + BodyBase + Expr ast.Expr + } + RuleRewrite struct { + BodyBase + Match, Cond, Result string // top comments + Check string // top-level boolean expression + + Alloc int // for unique var names + Loc string // file name & line number of the original rule + CommuteDepth int // used to track depth of commute loops + } + Declare struct { + Name string + Value ast.Expr + } + CondBreak struct { + Cond ast.Expr + InsideCommuteLoop bool + } + StartCommuteLoop struct { + Depth int + V string + } +) + +// exprf parses a Go expression generated from fmt.Sprintf, panicking if an +// error occurs. +func exprf(format string, a ...interface{}) ast.Expr { + src := fmt.Sprintf(format, a...) + expr, err := parser.ParseExpr(src) + if err != nil { + log.Fatalf("expr parse error on %q: %v", src, err) + } + return expr +} + +// stmtf parses a Go statement generated from fmt.Sprintf. This function is only +// meant for simple statements that don't have a custom Statement node declared +// in this package, such as ast.ReturnStmt or ast.ExprStmt. +func stmtf(format string, a ...interface{}) Statement { + src := fmt.Sprintf(format, a...) + fsrc := "package p\nfunc _() {\n" + src + "\n}\n" + file, err := parser.ParseFile(token.NewFileSet(), "", fsrc, 0) + if err != nil { + log.Fatalf("stmt parse error on %q: %v", src, err) + } + return file.Decls[0].(*ast.FuncDecl).Body.List[0] +} + +var reservedNames = map[string]bool{ + "v": true, // Values[i], etc + "b": true, // v.Block + "config": true, // b.Func.Config + "fe": true, // b.Func.fe + "typ": true, // &b.Func.Config.Types +} + +// declf constructs a simple "name := value" declaration, +// using exprf for its value. +// +// name must not be one of reservedNames. +// This helps prevent unintended shadowing and name clashes. +// To declare a reserved name, use declReserved. +func declf(loc, name, format string, a ...interface{}) *Declare { + if reservedNames[name] { + log.Fatalf("rule %s uses the reserved name %s", loc, name) + } + return &Declare{name, exprf(format, a...)} +} + +// declReserved is like declf, but the name must be one of reservedNames. +// Calls to declReserved should generally be static and top-level. +func declReserved(name, value string) *Declare { + if !reservedNames[name] { + panic(fmt.Sprintf("declReserved call does not use a reserved name: %q", name)) + } + return &Declare{name, exprf(value)} +} + +// breakf constructs a simple "if cond { break }" statement, using exprf for its +// condition. +func breakf(format string, a ...interface{}) *CondBreak { + return &CondBreak{Cond: exprf(format, a...)} +} + +func genBlockRewrite(rule Rule, arch arch, data blockData) *RuleRewrite { + rr := &RuleRewrite{Loc: rule.Loc} + rr.Match, rr.Cond, rr.Result = rule.parse() + _, _, auxint, aux, s := extract(rr.Match) // remove parens, then split + + // check match of control values + if len(s) < data.controls { + log.Fatalf("incorrect number of arguments in %s, got %v wanted at least %v", rule, len(s), data.controls) + } + controls := s[:data.controls] + pos := make([]string, data.controls) + for i, arg := range controls { + cname := fmt.Sprintf("b.Controls[%v]", i) + if strings.Contains(arg, "(") { + vname, expr := splitNameExpr(arg) + if vname == "" { + vname = fmt.Sprintf("v_%v", i) + } + rr.add(declf(rr.Loc, vname, cname)) + p, op := genMatch0(rr, arch, expr, vname, nil, false) // TODO: pass non-nil cnt? + if op != "" { + check := fmt.Sprintf("%s.Op == %s", cname, op) + if rr.Check == "" { + rr.Check = check + } else { + rr.Check += " && " + check + } + } + if p == "" { + p = vname + ".Pos" + } + pos[i] = p + } else { + rr.add(declf(rr.Loc, arg, cname)) + pos[i] = arg + ".Pos" + } + } + for _, e := range []struct { + name, field, dclType string + }{ + {auxint, "AuxInt", data.auxIntType()}, + {aux, "Aux", data.auxType()}, + } { + if e.name == "" { + continue + } + + if e.dclType == "" { + log.Fatalf("op %s has no declared type for %s", data.name, e.field) + } + if !token.IsIdentifier(e.name) || rr.declared(e.name) { + rr.add(breakf("%sTo%s(b.%s) != %s", unTitle(e.field), title(e.dclType), e.field, e.name)) + } else { + rr.add(declf(rr.Loc, e.name, "%sTo%s(b.%s)", unTitle(e.field), title(e.dclType), e.field)) + } + } + if rr.Cond != "" { + rr.add(breakf("!(%s)", rr.Cond)) + } + + // Rule matches. Generate result. + outop, _, auxint, aux, t := extract(rr.Result) // remove parens, then split + blockName, outdata := getBlockInfo(outop, arch) + if len(t) < outdata.controls { + log.Fatalf("incorrect number of output arguments in %s, got %v wanted at least %v", rule, len(s), outdata.controls) + } + + // Check if newsuccs is the same set as succs. + succs := s[data.controls:] + newsuccs := t[outdata.controls:] + m := map[string]bool{} + for _, succ := range succs { + if m[succ] { + log.Fatalf("can't have a repeat successor name %s in %s", succ, rule) + } + m[succ] = true + } + for _, succ := range newsuccs { + if !m[succ] { + log.Fatalf("unknown successor %s in %s", succ, rule) + } + delete(m, succ) + } + if len(m) != 0 { + log.Fatalf("unmatched successors %v in %s", m, rule) + } + + var genControls [2]string + for i, control := range t[:outdata.controls] { + // Select a source position for any new control values. + // TODO: does it always make sense to use the source position + // of the original control values or should we be using the + // block's source position in some cases? + newpos := "b.Pos" // default to block's source position + if i < len(pos) && pos[i] != "" { + // Use the previous control value's source position. + newpos = pos[i] + } + + // Generate a new control value (or copy an existing value). + genControls[i] = genResult0(rr, arch, control, false, false, newpos, nil) + } + switch outdata.controls { + case 0: + rr.add(stmtf("b.Reset(%s)", blockName)) + case 1: + rr.add(stmtf("b.resetWithControl(%s, %s)", blockName, genControls[0])) + case 2: + rr.add(stmtf("b.resetWithControl2(%s, %s, %s)", blockName, genControls[0], genControls[1])) + default: + log.Fatalf("too many controls: %d", outdata.controls) + } + + if auxint != "" { + // Make sure auxint value has the right type. + rr.add(stmtf("b.AuxInt = %sToAuxInt(%s)", unTitle(outdata.auxIntType()), auxint)) + } + if aux != "" { + // Make sure aux value has the right type. + rr.add(stmtf("b.Aux = %sToAux(%s)", unTitle(outdata.auxType()), aux)) + } + + succChanged := false + for i := 0; i < len(succs); i++ { + if succs[i] != newsuccs[i] { + succChanged = true + } + } + if succChanged { + if len(succs) != 2 { + log.Fatalf("changed successors, len!=2 in %s", rule) + } + if succs[0] != newsuccs[1] || succs[1] != newsuccs[0] { + log.Fatalf("can only handle swapped successors in %s", rule) + } + rr.add(stmtf("b.swapSuccessors()")) + } + + if *genLog { + rr.add(stmtf("logRule(%q)", rule.Loc)) + } + return rr +} + +// genMatch returns the variable whose source position should be used for the +// result (or "" if no opinion), and a boolean that reports whether the match can fail. +func genMatch(rr *RuleRewrite, arch arch, match string, pregenTop bool) (pos, checkOp string) { + cnt := varCount(rr) + return genMatch0(rr, arch, match, "v", cnt, pregenTop) +} + +func genMatch0(rr *RuleRewrite, arch arch, match, v string, cnt map[string]int, pregenTop bool) (pos, checkOp string) { + if match[0] != '(' || match[len(match)-1] != ')' { + log.Fatalf("%s: non-compound expr in genMatch0: %q", rr.Loc, match) + } + op, oparch, typ, auxint, aux, args := parseValue(match, arch, rr.Loc) + + checkOp = fmt.Sprintf("Op%s%s", oparch, op.name) + + if op.faultOnNilArg0 || op.faultOnNilArg1 { + // Prefer the position of an instruction which could fault. + pos = v + ".Pos" + } + + // If the last argument is ___, it means "don't care about trailing arguments, really" + // The likely/intended use is for rewrites that are too tricky to express in the existing pattern language + // Do a length check early because long patterns fed short (ultimately not-matching) inputs will + // do an indexing error in pattern-matching. + if op.argLength == -1 { + l := len(args) + if l == 0 || args[l-1] != "___" { + rr.add(breakf("len(%s.Args) != %d", v, l)) + } else if l > 1 && args[l-1] == "___" { + rr.add(breakf("len(%s.Args) < %d", v, l-1)) + } + } + + for _, e := range []struct { + name, field, dclType string + }{ + {typ, "Type", "*types.Type"}, + {auxint, "AuxInt", op.auxIntType()}, + {aux, "Aux", op.auxType()}, + } { + if e.name == "" { + continue + } + + if e.dclType == "" { + log.Fatalf("op %s has no declared type for %s", op.name, e.field) + } + if !token.IsIdentifier(e.name) || rr.declared(e.name) { + switch e.field { + case "Aux": + rr.add(breakf("auxTo%s(%s.%s) != %s", title(e.dclType), v, e.field, e.name)) + case "AuxInt": + rr.add(breakf("auxIntTo%s(%s.%s) != %s", title(e.dclType), v, e.field, e.name)) + case "Type": + rr.add(breakf("%s.%s != %s", v, e.field, e.name)) + } + } else { + switch e.field { + case "Aux": + rr.add(declf(rr.Loc, e.name, "auxTo%s(%s.%s)", title(e.dclType), v, e.field)) + case "AuxInt": + rr.add(declf(rr.Loc, e.name, "auxIntTo%s(%s.%s)", title(e.dclType), v, e.field)) + case "Type": + rr.add(declf(rr.Loc, e.name, "%s.%s", v, e.field)) + } + } + } + + commutative := op.commutative + if commutative { + if args[0] == args[1] { + // When we have (Add x x), for any x, + // even if there are other uses of x besides these two, + // and even if x is not a variable, + // we can skip the commutative match. + commutative = false + } + if cnt[args[0]] == 1 && cnt[args[1]] == 1 { + // When we have (Add x y) with no other uses + // of x and y in the matching rule and condition, + // then we can skip the commutative match (Add y x). + commutative = false + } + } + + if !pregenTop { + // Access last argument first to minimize bounds checks. + for n := len(args) - 1; n > 0; n-- { + a := args[n] + if a == "_" { + continue + } + if !rr.declared(a) && token.IsIdentifier(a) && !(commutative && len(args) == 2) { + rr.add(declf(rr.Loc, a, "%s.Args[%d]", v, n)) + // delete the last argument so it is not reprocessed + args = args[:n] + } else { + rr.add(stmtf("_ = %s.Args[%d]", v, n)) + } + break + } + } + if commutative && !pregenTop { + for i := 0; i <= 1; i++ { + vname := fmt.Sprintf("%s_%d", v, i) + rr.add(declf(rr.Loc, vname, "%s.Args[%d]", v, i)) + } + } + if commutative { + rr.add(StartCommuteLoop{rr.CommuteDepth, v}) + rr.CommuteDepth++ + } + for i, arg := range args { + if arg == "_" { + continue + } + var rhs string + if (commutative && i < 2) || pregenTop { + rhs = fmt.Sprintf("%s_%d", v, i) + } else { + rhs = fmt.Sprintf("%s.Args[%d]", v, i) + } + if !strings.Contains(arg, "(") { + // leaf variable + if rr.declared(arg) { + // variable already has a definition. Check whether + // the old definition and the new definition match. + // For example, (add x x). Equality is just pointer equality + // on Values (so cse is important to do before lowering). + rr.add(breakf("%s != %s", arg, rhs)) + } else { + if arg != rhs { + rr.add(declf(rr.Loc, arg, "%s", rhs)) + } + } + continue + } + // compound sexpr + argname, expr := splitNameExpr(arg) + if argname == "" { + argname = fmt.Sprintf("%s_%d", v, i) + } + if argname == "b" { + log.Fatalf("don't name args 'b', it is ambiguous with blocks") + } + + if argname != rhs { + rr.add(declf(rr.Loc, argname, "%s", rhs)) + } + bexpr := exprf("%s.Op != addLater", argname) + rr.add(&CondBreak{Cond: bexpr}) + argPos, argCheckOp := genMatch0(rr, arch, expr, argname, cnt, false) + bexpr.(*ast.BinaryExpr).Y.(*ast.Ident).Name = argCheckOp + + if argPos != "" { + // Keep the argument in preference to the parent, as the + // argument is normally earlier in program flow. + // Keep the argument in preference to an earlier argument, + // as that prefers the memory argument which is also earlier + // in the program flow. + pos = argPos + } + } + + return pos, checkOp +} + +func genResult(rr *RuleRewrite, arch arch, result, pos string) { + move := result[0] == '@' + if move { + // parse @block directive + s := strings.SplitN(result[1:], " ", 2) + rr.add(stmtf("b = %s", s[0])) + result = s[1] + } + cse := make(map[string]string) + genResult0(rr, arch, result, true, move, pos, cse) +} + +func genResult0(rr *RuleRewrite, arch arch, result string, top, move bool, pos string, cse map[string]string) string { + resname, expr := splitNameExpr(result) + result = expr + // TODO: when generating a constant result, use f.constVal to avoid + // introducing copies just to clean them up again. + if result[0] != '(' { + // variable + if top { + // It in not safe in general to move a variable between blocks + // (and particularly not a phi node). + // Introduce a copy. + rr.add(stmtf("v.copyOf(%s)", result)) + } + return result + } + + w := normalizeWhitespace(result) + if prev := cse[w]; prev != "" { + return prev + } + + op, oparch, typ, auxint, aux, args := parseValue(result, arch, rr.Loc) + + // Find the type of the variable. + typeOverride := typ != "" + if typ == "" && op.typ != "" { + typ = typeName(op.typ) + } + + v := "v" + if top && !move { + rr.add(stmtf("v.reset(Op%s%s)", oparch, op.name)) + if typeOverride { + rr.add(stmtf("v.Type = %s", typ)) + } + } else { + if typ == "" { + log.Fatalf("sub-expression %s (op=Op%s%s) at %s must have a type", result, oparch, op.name, rr.Loc) + } + if resname == "" { + v = fmt.Sprintf("v%d", rr.Alloc) + } else { + v = resname + } + rr.Alloc++ + rr.add(declf(rr.Loc, v, "b.NewValue0(%s, Op%s%s, %s)", pos, oparch, op.name, typ)) + if move && top { + // Rewrite original into a copy + rr.add(stmtf("v.copyOf(%s)", v)) + } + } + + if auxint != "" { + // Make sure auxint value has the right type. + rr.add(stmtf("%s.AuxInt = %sToAuxInt(%s)", v, unTitle(op.auxIntType()), auxint)) + } + if aux != "" { + // Make sure aux value has the right type. + rr.add(stmtf("%s.Aux = %sToAux(%s)", v, unTitle(op.auxType()), aux)) + } + all := new(strings.Builder) + for i, arg := range args { + x := genResult0(rr, arch, arg, false, move, pos, cse) + if i > 0 { + all.WriteString(", ") + } + all.WriteString(x) + } + switch len(args) { + case 0: + case 1: + rr.add(stmtf("%s.AddArg(%s)", v, all.String())) + default: + rr.add(stmtf("%s.AddArg%d(%s)", v, len(args), all.String())) + } + + if cse != nil { + cse[w] = v + } + return v +} + +func split(s string) []string { + var r []string + +outer: + for s != "" { + d := 0 // depth of ({[< + var open, close byte // opening and closing markers ({[< or )}]> + nonsp := false // found a non-space char so far + for i := 0; i < len(s); i++ { + switch { + case d == 0 && s[i] == '(': + open, close = '(', ')' + d++ + case d == 0 && s[i] == '<': + open, close = '<', '>' + d++ + case d == 0 && s[i] == '[': + open, close = '[', ']' + d++ + case d == 0 && s[i] == '{': + open, close = '{', '}' + d++ + case d == 0 && (s[i] == ' ' || s[i] == '\t'): + if nonsp { + r = append(r, strings.TrimSpace(s[:i])) + s = s[i:] + continue outer + } + case d > 0 && s[i] == open: + d++ + case d > 0 && s[i] == close: + d-- + default: + nonsp = true + } + } + if d != 0 { + log.Fatalf("imbalanced expression: %q", s) + } + if nonsp { + r = append(r, strings.TrimSpace(s)) + } + break + } + return r +} + +// isBlock reports whether this op is a block opcode. +func isBlock(name string, arch arch) bool { + for _, b := range genericBlocks { + if b.name == name { + return true + } + } + for _, b := range arch.blocks { + if b.name == name { + return true + } + } + return false +} + +func extract(val string) (op, typ, auxint, aux string, args []string) { + val = val[1 : len(val)-1] // remove () + + // Split val up into regions. + // Split by spaces/tabs, except those contained in (), {}, [], or <>. + s := split(val) + + // Extract restrictions and args. + op = s[0] + for _, a := range s[1:] { + switch a[0] { + case '<': + typ = a[1 : len(a)-1] // remove <> + case '[': + auxint = a[1 : len(a)-1] // remove [] + case '{': + aux = a[1 : len(a)-1] // remove {} + default: + args = append(args, a) + } + } + return +} + +// parseValue parses a parenthesized value from a rule. +// The value can be from the match or the result side. +// It returns the op and unparsed strings for typ, auxint, and aux restrictions and for all args. +// oparch is the architecture that op is located in, or "" for generic. +func parseValue(val string, arch arch, loc string) (op opData, oparch, typ, auxint, aux string, args []string) { + // Resolve the op. + var s string + s, typ, auxint, aux, args = extract(val) + + // match reports whether x is a good op to select. + // If strict is true, rule generation might succeed. + // If strict is false, rule generation has failed, + // but we're trying to generate a useful error. + // Doing strict=true then strict=false allows + // precise op matching while retaining good error messages. + match := func(x opData, strict bool, archname string) bool { + if x.name != s { + return false + } + if x.argLength != -1 && int(x.argLength) != len(args) && (len(args) != 1 || args[0] != "...") { + if strict { + return false + } + log.Printf("%s: op %s (%s) should have %d args, has %d", loc, s, archname, x.argLength, len(args)) + } + return true + } + + for _, x := range genericOps { + if match(x, true, "generic") { + op = x + break + } + } + for _, x := range arch.ops { + if arch.name != "generic" && match(x, true, arch.name) { + if op.name != "" { + log.Fatalf("%s: matches for op %s found in both generic and %s", loc, op.name, arch.name) + } + op = x + oparch = arch.name + break + } + } + + if op.name == "" { + // Failed to find the op. + // Run through everything again with strict=false + // to generate useful diagnosic messages before failing. + for _, x := range genericOps { + match(x, false, "generic") + } + for _, x := range arch.ops { + match(x, false, arch.name) + } + log.Fatalf("%s: unknown op %s", loc, s) + } + + // Sanity check aux, auxint. + if auxint != "" && !opHasAuxInt(op) { + log.Fatalf("%s: op %s %s can't have auxint", loc, op.name, op.aux) + } + if aux != "" && !opHasAux(op) { + log.Fatalf("%s: op %s %s can't have aux", loc, op.name, op.aux) + } + return +} + +func opHasAuxInt(op opData) bool { + switch op.aux { + case "Bool", "Int8", "Int16", "Int32", "Int64", "Int128", "UInt8", "Float32", "Float64", + "SymOff", "CallOff", "SymValAndOff", "TypSize", "ARM64BitField", "FlagConstant", "CCop": + return true + } + return false +} + +func opHasAux(op opData) bool { + switch op.aux { + case "String", "Sym", "SymOff", "Call", "CallOff", "SymValAndOff", "Typ", "TypSize", + "S390XCCMask", "S390XRotateParams": + return true + } + return false +} + +// splitNameExpr splits s-expr arg, possibly prefixed by "name:", +// into name and the unprefixed expression. +// For example, "x:(Foo)" yields "x", "(Foo)", +// and "(Foo)" yields "", "(Foo)". +func splitNameExpr(arg string) (name, expr string) { + colon := strings.Index(arg, ":") + if colon < 0 { + return "", arg + } + openparen := strings.Index(arg, "(") + if openparen < 0 { + log.Fatalf("splitNameExpr(%q): colon but no open parens", arg) + } + if colon > openparen { + // colon is inside the parens, such as in "(Foo x:(Bar))". + return "", arg + } + return arg[:colon], arg[colon+1:] +} + +func getBlockInfo(op string, arch arch) (name string, data blockData) { + for _, b := range genericBlocks { + if b.name == op { + return "Block" + op, b + } + } + for _, b := range arch.blocks { + if b.name == op { + return "Block" + arch.name + op, b + } + } + log.Fatalf("could not find block data for %s", op) + panic("unreachable") +} + +// typeName returns the string to use to generate a type. +func typeName(typ string) string { + if typ[0] == '(' { + ts := strings.Split(typ[1:len(typ)-1], ",") + if len(ts) != 2 { + log.Fatalf("Tuple expect 2 arguments") + } + return "types.NewTuple(" + typeName(ts[0]) + ", " + typeName(ts[1]) + ")" + } + switch typ { + case "Flags", "Mem", "Void", "Int128": + return "types.Type" + typ + default: + return "typ." + typ + } +} + +// balance returns the number of unclosed '(' characters in s. +// If a ')' appears without a corresponding '(', balance returns -1. +func balance(s string) int { + balance := 0 + for _, c := range s { + switch c { + case '(': + balance++ + case ')': + balance-- + if balance < 0 { + // don't allow ")(" to return 0 + return -1 + } + } + } + return balance +} + +// findAllOpcode is a function to find the opcode portion of s-expressions. +var findAllOpcode = regexp.MustCompile(`[(](\w+[|])+\w+[)]`).FindAllStringIndex + +// excludeFromExpansion reports whether the substring s[idx[0]:idx[1]] in a rule +// should be disregarded as a candidate for | expansion. +// It uses simple syntactic checks to see whether the substring +// is inside an AuxInt expression or inside the && conditions. +func excludeFromExpansion(s string, idx []int) bool { + left := s[:idx[0]] + if strings.LastIndexByte(left, '[') > strings.LastIndexByte(left, ']') { + // Inside an AuxInt expression. + return true + } + right := s[idx[1]:] + if strings.Contains(left, "&&") && strings.Contains(right, "=>") { + // Inside && conditions. + return true + } + return false +} + +// expandOr converts a rule into multiple rules by expanding | ops. +func expandOr(r string) []string { + // Find every occurrence of |-separated things. + // They look like MOV(B|W|L|Q|SS|SD)load or MOV(Q|L)loadidx(1|8). + // Generate rules selecting one case from each |-form. + + // Count width of |-forms. They must match. + n := 1 + for _, idx := range findAllOpcode(r, -1) { + if excludeFromExpansion(r, idx) { + continue + } + s := r[idx[0]:idx[1]] + c := strings.Count(s, "|") + 1 + if c == 1 { + continue + } + if n > 1 && n != c { + log.Fatalf("'|' count doesn't match in %s: both %d and %d\n", r, n, c) + } + n = c + } + if n == 1 { + // No |-form in this rule. + return []string{r} + } + // Build each new rule. + res := make([]string, n) + for i := 0; i < n; i++ { + buf := new(strings.Builder) + x := 0 + for _, idx := range findAllOpcode(r, -1) { + if excludeFromExpansion(r, idx) { + continue + } + buf.WriteString(r[x:idx[0]]) // write bytes we've skipped over so far + s := r[idx[0]+1 : idx[1]-1] // remove leading "(" and trailing ")" + buf.WriteString(strings.Split(s, "|")[i]) // write the op component for this rule + x = idx[1] // note that we've written more bytes + } + buf.WriteString(r[x:]) + res[i] = buf.String() + } + return res +} + +// varCount returns a map which counts the number of occurrences of +// Value variables in the s-expression rr.Match and the Go expression rr.Cond. +func varCount(rr *RuleRewrite) map[string]int { + cnt := map[string]int{} + varCount1(rr.Loc, rr.Match, cnt) + if rr.Cond != "" { + expr, err := parser.ParseExpr(rr.Cond) + if err != nil { + log.Fatalf("%s: failed to parse cond %q: %v", rr.Loc, rr.Cond, err) + } + ast.Inspect(expr, func(n ast.Node) bool { + if id, ok := n.(*ast.Ident); ok { + cnt[id.Name]++ + } + return true + }) + } + return cnt +} + +func varCount1(loc, m string, cnt map[string]int) { + if m[0] == '<' || m[0] == '[' || m[0] == '{' { + return + } + if token.IsIdentifier(m) { + cnt[m]++ + return + } + // Split up input. + name, expr := splitNameExpr(m) + if name != "" { + cnt[name]++ + } + if expr[0] != '(' || expr[len(expr)-1] != ')' { + log.Fatalf("%s: non-compound expr in varCount1: %q", loc, expr) + } + s := split(expr[1 : len(expr)-1]) + for _, arg := range s[1:] { + varCount1(loc, arg, cnt) + } +} + +// normalizeWhitespace replaces 2+ whitespace sequences with a single space. +func normalizeWhitespace(x string) string { + x = strings.Join(strings.Fields(x), " ") + x = strings.Replace(x, "( ", "(", -1) + x = strings.Replace(x, " )", ")", -1) + x = strings.Replace(x, "[ ", "[", -1) + x = strings.Replace(x, " ]", "]", -1) + x = strings.Replace(x, ")=>", ") =>", -1) + return x +} + +// opIsCommutative reports whether op s is commutative. +func opIsCommutative(op string, arch arch) bool { + for _, x := range genericOps { + if op == x.name { + if x.commutative { + return true + } + break + } + } + if arch.name != "generic" { + for _, x := range arch.ops { + if op == x.name { + if x.commutative { + return true + } + break + } + } + } + return false +} + +func normalizeMatch(m string, arch arch) string { + if token.IsIdentifier(m) { + return m + } + op, typ, auxint, aux, args := extract(m) + if opIsCommutative(op, arch) { + if args[1] < args[0] { + args[0], args[1] = args[1], args[0] + } + } + s := new(strings.Builder) + fmt.Fprintf(s, "%s <%s> [%s] {%s}", op, typ, auxint, aux) + for _, arg := range args { + prefix, expr := splitNameExpr(arg) + fmt.Fprint(s, " ", prefix, normalizeMatch(expr, arch)) + } + return s.String() +} + +func parseEllipsisRules(rules []Rule, arch arch) (newop string, ok bool) { + if len(rules) != 1 { + for _, r := range rules { + if strings.Contains(r.Rule, "...") { + log.Fatalf("%s: found ellipsis in rule, but there are other rules with the same op", r.Loc) + } + } + return "", false + } + rule := rules[0] + match, cond, result := rule.parse() + if cond != "" || !isEllipsisValue(match) || !isEllipsisValue(result) { + if strings.Contains(rule.Rule, "...") { + log.Fatalf("%s: found ellipsis in non-ellipsis rule", rule.Loc) + } + checkEllipsisRuleCandidate(rule, arch) + return "", false + } + op, oparch, _, _, _, _ := parseValue(result, arch, rule.Loc) + return fmt.Sprintf("Op%s%s", oparch, op.name), true +} + +// isEllipsisValue reports whether s is of the form (OpX ...). +func isEllipsisValue(s string) bool { + if len(s) < 2 || s[0] != '(' || s[len(s)-1] != ')' { + return false + } + c := split(s[1 : len(s)-1]) + if len(c) != 2 || c[1] != "..." { + return false + } + return true +} + +func checkEllipsisRuleCandidate(rule Rule, arch arch) { + match, cond, result := rule.parse() + if cond != "" { + return + } + op, _, _, auxint, aux, args := parseValue(match, arch, rule.Loc) + var auxint2, aux2 string + var args2 []string + var usingCopy string + var eop opData + if result[0] != '(' { + // Check for (Foo x) => x, which can be converted to (Foo ...) => (Copy ...). + args2 = []string{result} + usingCopy = " using Copy" + } else { + eop, _, _, auxint2, aux2, args2 = parseValue(result, arch, rule.Loc) + } + // Check that all restrictions in match are reproduced exactly in result. + if aux != aux2 || auxint != auxint2 || len(args) != len(args2) { + return + } + if strings.Contains(rule.Rule, "=>") && op.aux != eop.aux { + return + } + for i := range args { + if args[i] != args2[i] { + return + } + } + switch { + case opHasAux(op) && aux == "" && aux2 == "": + fmt.Printf("%s: rule silently zeros aux, either copy aux or explicitly zero\n", rule.Loc) + case opHasAuxInt(op) && auxint == "" && auxint2 == "": + fmt.Printf("%s: rule silently zeros auxint, either copy auxint or explicitly zero\n", rule.Loc) + default: + fmt.Printf("%s: possible ellipsis rule candidate%s: %q\n", rule.Loc, usingCopy, rule.Rule) + } +} + +func opByName(arch arch, name string) opData { + name = name[2:] + for _, x := range genericOps { + if name == x.name { + return x + } + } + if arch.name != "generic" { + name = name[len(arch.name):] + for _, x := range arch.ops { + if name == x.name { + return x + } + } + } + log.Fatalf("failed to find op named %s in arch %s", name, arch.name) + panic("unreachable") +} + +// auxType returns the Go type that this operation should store in its aux field. +func (op opData) auxType() string { + switch op.aux { + case "String": + return "string" + case "Sym": + // Note: a Sym can be an *obj.LSym, a *gc.Node, or nil. + return "Sym" + case "SymOff": + return "Sym" + case "Call": + return "Call" + case "CallOff": + return "Call" + case "SymValAndOff": + return "Sym" + case "Typ": + return "*types.Type" + case "TypSize": + return "*types.Type" + case "S390XCCMask": + return "s390x.CCMask" + case "S390XRotateParams": + return "s390x.RotateParams" + default: + return "invalid" + } +} + +// auxIntType returns the Go type that this operation should store in its auxInt field. +func (op opData) auxIntType() string { + switch op.aux { + case "Bool": + return "bool" + case "Int8": + return "int8" + case "Int16": + return "int16" + case "Int32": + return "int32" + case "Int64": + return "int64" + case "Int128": + return "int128" + case "UInt8": + return "uint8" + case "Float32": + return "float32" + case "Float64": + return "float64" + case "CallOff": + return "int32" + case "SymOff": + return "int32" + case "SymValAndOff": + return "ValAndOff" + case "TypSize": + return "int64" + case "CCop": + return "Op" + case "FlagConstant": + return "flagConstant" + case "ARM64BitField": + return "arm64BitField" + default: + return "invalid" + } +} + +// auxType returns the Go type that this block should store in its aux field. +func (b blockData) auxType() string { + switch b.aux { + case "Sym": + return "Sym" + case "S390XCCMask", "S390XCCMaskInt8", "S390XCCMaskUint8": + return "s390x.CCMask" + case "S390XRotateParams": + return "s390x.RotateParams" + default: + return "invalid" + } +} + +// auxIntType returns the Go type that this block should store in its auxInt field. +func (b blockData) auxIntType() string { + switch b.aux { + case "S390XCCMaskInt8": + return "int8" + case "S390XCCMaskUint8": + return "uint8" + case "Int64": + return "int64" + default: + return "invalid" + } +} + +func title(s string) string { + if i := strings.Index(s, "."); i >= 0 { + switch strings.ToLower(s[:i]) { + case "s390x": // keep arch prefix for clarity + s = s[:i] + s[i+1:] + default: + s = s[i+1:] + } + } + return strings.Title(s) +} + +func unTitle(s string) string { + if i := strings.Index(s, "."); i >= 0 { + switch strings.ToLower(s[:i]) { + case "s390x": // keep arch prefix for clarity + s = s[:i] + s[i+1:] + default: + s = s[i+1:] + } + } + return strings.ToLower(s[:1]) + s[1:] +} |