Diffstat (limited to 'src/cmd/compile/internal/amd64')
-rw-r--r--   src/cmd/compile/internal/amd64/galign.go |   26
-rw-r--r--   src/cmd/compile/internal/amd64/ggen.go   |  137
-rw-r--r--   src/cmd/compile/internal/amd64/ssa.go    | 1334
3 files changed, 1497 insertions, 0 deletions
diff --git a/src/cmd/compile/internal/amd64/galign.go b/src/cmd/compile/internal/amd64/galign.go new file mode 100644 index 0000000..af58440 --- /dev/null +++ b/src/cmd/compile/internal/amd64/galign.go @@ -0,0 +1,26 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package amd64 + +import ( + "cmd/compile/internal/gc" + "cmd/internal/obj/x86" +) + +var leaptr = x86.ALEAQ + +func Init(arch *gc.Arch) { + arch.LinkArch = &x86.Linkamd64 + arch.REGSP = x86.REGSP + arch.MAXWIDTH = 1 << 50 + + arch.ZeroRange = zerorange + arch.Ginsnop = ginsnop + arch.Ginsnopdefer = ginsnop + + arch.SSAMarkMoves = ssaMarkMoves + arch.SSAGenValue = ssaGenValue + arch.SSAGenBlock = ssaGenBlock +} diff --git a/src/cmd/compile/internal/amd64/ggen.go b/src/cmd/compile/internal/amd64/ggen.go new file mode 100644 index 0000000..0c1456f --- /dev/null +++ b/src/cmd/compile/internal/amd64/ggen.go @@ -0,0 +1,137 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package amd64 + +import ( + "cmd/compile/internal/gc" + "cmd/internal/obj" + "cmd/internal/obj/x86" + "cmd/internal/objabi" +) + +// no floating point in note handlers on Plan 9 +var isPlan9 = objabi.GOOS == "plan9" + +// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ, +// See runtime/mkduff.go. +const ( + dzBlocks = 16 // number of MOV/ADD blocks + dzBlockLen = 4 // number of clears per block + dzBlockSize = 19 // size of instructions in a single block + dzMovSize = 4 // size of single MOV instruction w/ offset + dzLeaqSize = 4 // size of single LEAQ instruction + dzClearStep = 16 // number of bytes cleared by each MOV instruction + + dzClearLen = dzClearStep * dzBlockLen // bytes cleared by one block + dzSize = dzBlocks * dzBlockSize +) + +// dzOff returns the offset for a jump into DUFFZERO. +// b is the number of bytes to zero. +func dzOff(b int64) int64 { + off := int64(dzSize) + off -= b / dzClearLen * dzBlockSize + tailLen := b % dzClearLen + if tailLen >= dzClearStep { + off -= dzLeaqSize + dzMovSize*(tailLen/dzClearStep) + } + return off +} + +// duffzeroDI returns the pre-adjustment to DI for a call to DUFFZERO. +// b is the number of bytes to zero. 
+func dzDI(b int64) int64 { + tailLen := b % dzClearLen + if tailLen < dzClearStep { + return 0 + } + tailSteps := tailLen / dzClearStep + return -dzClearStep * (dzBlockLen - tailSteps) +} + +func zerorange(pp *gc.Progs, p *obj.Prog, off, cnt int64, state *uint32) *obj.Prog { + const ( + ax = 1 << iota + x0 + ) + + if cnt == 0 { + return p + } + + if cnt%int64(gc.Widthreg) != 0 { + // should only happen with nacl + if cnt%int64(gc.Widthptr) != 0 { + gc.Fatalf("zerorange count not a multiple of widthptr %d", cnt) + } + if *state&ax == 0 { + p = pp.Appendpp(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_AX, 0) + *state |= ax + } + p = pp.Appendpp(p, x86.AMOVL, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_MEM, x86.REG_SP, off) + off += int64(gc.Widthptr) + cnt -= int64(gc.Widthptr) + } + + if cnt == 8 { + if *state&ax == 0 { + p = pp.Appendpp(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_AX, 0) + *state |= ax + } + p = pp.Appendpp(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_MEM, x86.REG_SP, off) + } else if !isPlan9 && cnt <= int64(8*gc.Widthreg) { + if *state&x0 == 0 { + p = pp.Appendpp(p, x86.AXORPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_REG, x86.REG_X0, 0) + *state |= x0 + } + + for i := int64(0); i < cnt/16; i++ { + p = pp.Appendpp(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_MEM, x86.REG_SP, off+i*16) + } + + if cnt%16 != 0 { + p = pp.Appendpp(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_MEM, x86.REG_SP, off+cnt-int64(16)) + } + } else if !isPlan9 && (cnt <= int64(128*gc.Widthreg)) { + if *state&x0 == 0 { + p = pp.Appendpp(p, x86.AXORPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_REG, x86.REG_X0, 0) + *state |= x0 + } + p = pp.Appendpp(p, leaptr, obj.TYPE_MEM, x86.REG_SP, off+dzDI(cnt), obj.TYPE_REG, x86.REG_DI, 0) + p = pp.Appendpp(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_ADDR, 0, dzOff(cnt)) + p.To.Sym = gc.Duffzero + + if cnt%16 != 0 { + p = pp.Appendpp(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_MEM, x86.REG_DI, -int64(8)) + } + } else { + if *state&ax == 0 { + p = pp.Appendpp(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_AX, 0) + *state |= ax + } + + p = pp.Appendpp(p, x86.AMOVQ, obj.TYPE_CONST, 0, cnt/int64(gc.Widthreg), obj.TYPE_REG, x86.REG_CX, 0) + p = pp.Appendpp(p, leaptr, obj.TYPE_MEM, x86.REG_SP, off, obj.TYPE_REG, x86.REG_DI, 0) + p = pp.Appendpp(p, x86.AREP, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0) + p = pp.Appendpp(p, x86.ASTOSQ, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0) + } + + return p +} + +func ginsnop(pp *gc.Progs) *obj.Prog { + // This is a hardware nop (1-byte 0x90) instruction, + // even though we describe it as an explicit XCHGL here. + // Particularly, this does not zero the high 32 bits + // like typical *L opcodes. + // (gas assembles "xchg %eax,%eax" to 0x87 0xc0, which + // does zero the high 32 bits.) + p := pp.Prog(x86.AXCHGL) + p.From.Type = obj.TYPE_REG + p.From.Reg = x86.REG_AX + p.To.Type = obj.TYPE_REG + p.To.Reg = x86.REG_AX + return p +} diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go new file mode 100644 index 0000000..5ff05a0 --- /dev/null +++ b/src/cmd/compile/internal/amd64/ssa.go @@ -0,0 +1,1334 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
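For reference, a standalone sketch (not part of this change) of the DUFFZERO offset arithmetic used by dzOff and dzDI in ggen.go above. The constants are copied from that file; the program only prints the jump offset and DI pre-adjustment for a few representative sizes.

package main

import "fmt"

// Constants copied from ggen.go above; see runtime/mkduff.go for the layout.
const (
	dzBlocks    = 16 // number of MOV/ADD blocks
	dzBlockLen  = 4  // number of clears per block
	dzBlockSize = 19 // size of instructions in a single block
	dzMovSize   = 4  // size of single MOV instruction w/ offset
	dzLeaqSize  = 4  // size of single LEAQ instruction
	dzClearStep = 16 // number of bytes cleared by each MOV instruction

	dzClearLen = dzClearStep * dzBlockLen // bytes cleared by one block
	dzSize     = dzBlocks * dzBlockSize
)

// dzOff: jump offset into DUFFZERO for clearing b bytes (same formula as above).
func dzOff(b int64) int64 {
	off := int64(dzSize)
	off -= b / dzClearLen * dzBlockSize
	if tail := b % dzClearLen; tail >= dzClearStep {
		off -= dzLeaqSize + dzMovSize*(tail/dzClearStep)
	}
	return off
}

// dzDI: DI pre-adjustment for clearing b bytes (same formula as above).
func dzDI(b int64) int64 {
	tail := b % dzClearLen
	if tail < dzClearStep {
		return 0
	}
	return -dzClearStep * (dzBlockLen - tail/dzClearStep)
}

func main() {
	for _, b := range []int64{64, 96, 128, 240} {
		fmt.Printf("zero %4d bytes: jump offset %3d, DI adjust %4d\n", b, dzOff(b), dzDI(b))
	}
}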
+ +package amd64 + +import ( + "fmt" + "math" + + "cmd/compile/internal/gc" + "cmd/compile/internal/logopt" + "cmd/compile/internal/ssa" + "cmd/compile/internal/types" + "cmd/internal/obj" + "cmd/internal/obj/x86" +) + +// markMoves marks any MOVXconst ops that need to avoid clobbering flags. +func ssaMarkMoves(s *gc.SSAGenState, b *ssa.Block) { + flive := b.FlagsLiveAtEnd + for _, c := range b.ControlValues() { + flive = c.Type.IsFlags() || flive + } + for i := len(b.Values) - 1; i >= 0; i-- { + v := b.Values[i] + if flive && (v.Op == ssa.OpAMD64MOVLconst || v.Op == ssa.OpAMD64MOVQconst) { + // The "mark" is any non-nil Aux value. + v.Aux = v + } + if v.Type.IsFlags() { + flive = false + } + for _, a := range v.Args { + if a.Type.IsFlags() { + flive = true + } + } + } +} + +// loadByType returns the load instruction of the given type. +func loadByType(t *types.Type) obj.As { + // Avoid partial register write + if !t.IsFloat() { + switch t.Size() { + case 1: + return x86.AMOVBLZX + case 2: + return x86.AMOVWLZX + } + } + // Otherwise, there's no difference between load and store opcodes. + return storeByType(t) +} + +// storeByType returns the store instruction of the given type. +func storeByType(t *types.Type) obj.As { + width := t.Size() + if t.IsFloat() { + switch width { + case 4: + return x86.AMOVSS + case 8: + return x86.AMOVSD + } + } else { + switch width { + case 1: + return x86.AMOVB + case 2: + return x86.AMOVW + case 4: + return x86.AMOVL + case 8: + return x86.AMOVQ + } + } + panic(fmt.Sprintf("bad store type %v", t)) +} + +// moveByType returns the reg->reg move instruction of the given type. +func moveByType(t *types.Type) obj.As { + if t.IsFloat() { + // Moving the whole sse2 register is faster + // than moving just the correct low portion of it. + // There is no xmm->xmm move with 1 byte opcode, + // so use movups, which has 2 byte opcode. + return x86.AMOVUPS + } else { + switch t.Size() { + case 1: + // Avoids partial register write + return x86.AMOVL + case 2: + return x86.AMOVL + case 4: + return x86.AMOVL + case 8: + return x86.AMOVQ + case 16: + return x86.AMOVUPS // int128s are in SSE registers + default: + panic(fmt.Sprintf("bad int register width %d:%v", t.Size(), t)) + } + } +} + +// opregreg emits instructions for +// dest := dest(To) op src(From) +// and also returns the created obj.Prog so it +// may be further adjusted (offset, scale, etc). +func opregreg(s *gc.SSAGenState, op obj.As, dest, src int16) *obj.Prog { + p := s.Prog(op) + p.From.Type = obj.TYPE_REG + p.To.Type = obj.TYPE_REG + p.To.Reg = dest + p.From.Reg = src + return p +} + +// memIdx fills out a as an indexed memory reference for v. +// It assumes that the base register and the index register +// are v.Args[0].Reg() and v.Args[1].Reg(), respectively. +// The caller must still use gc.AddAux/gc.AddAux2 to handle v.Aux as necessary. +func memIdx(a *obj.Addr, v *ssa.Value) { + r, i := v.Args[0].Reg(), v.Args[1].Reg() + a.Type = obj.TYPE_MEM + a.Scale = v.Op.Scale() + if a.Scale == 1 && i == x86.REG_SP { + r, i = i, r + } + a.Reg = r + a.Index = i +} + +// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ, +// See runtime/mkduff.go. +func duffStart(size int64) int64 { + x, _ := duff(size) + return x +} +func duffAdj(size int64) int64 { + _, x := duff(size) + return x +} + +// duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes) +// required to use the duffzero mechanism for a block of the given size. 
+func duff(size int64) (int64, int64) { + if size < 32 || size > 1024 || size%dzClearStep != 0 { + panic("bad duffzero size") + } + steps := size / dzClearStep + blocks := steps / dzBlockLen + steps %= dzBlockLen + off := dzBlockSize * (dzBlocks - blocks) + var adj int64 + if steps != 0 { + off -= dzLeaqSize + off -= dzMovSize * steps + adj -= dzClearStep * (dzBlockLen - steps) + } + return off, adj +} + +func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { + switch v.Op { + case ssa.OpAMD64VFMADD231SD: + p := s.Prog(v.Op.Asm()) + p.From = obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[2].Reg()} + p.To = obj.Addr{Type: obj.TYPE_REG, Reg: v.Reg()} + p.SetFrom3(obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[1].Reg()}) + if v.Reg() != v.Args[0].Reg() { + v.Fatalf("input[0] and output not in same register %s", v.LongString()) + } + case ssa.OpAMD64ADDQ, ssa.OpAMD64ADDL: + r := v.Reg() + r1 := v.Args[0].Reg() + r2 := v.Args[1].Reg() + switch { + case r == r1: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r2 + p.To.Type = obj.TYPE_REG + p.To.Reg = r + case r == r2: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r1 + p.To.Type = obj.TYPE_REG + p.To.Reg = r + default: + var asm obj.As + if v.Op == ssa.OpAMD64ADDQ { + asm = x86.ALEAQ + } else { + asm = x86.ALEAL + } + p := s.Prog(asm) + p.From.Type = obj.TYPE_MEM + p.From.Reg = r1 + p.From.Scale = 1 + p.From.Index = r2 + p.To.Type = obj.TYPE_REG + p.To.Reg = r + } + // 2-address opcode arithmetic + case ssa.OpAMD64SUBQ, ssa.OpAMD64SUBL, + ssa.OpAMD64MULQ, ssa.OpAMD64MULL, + ssa.OpAMD64ANDQ, ssa.OpAMD64ANDL, + ssa.OpAMD64ORQ, ssa.OpAMD64ORL, + ssa.OpAMD64XORQ, ssa.OpAMD64XORL, + ssa.OpAMD64SHLQ, ssa.OpAMD64SHLL, + ssa.OpAMD64SHRQ, ssa.OpAMD64SHRL, ssa.OpAMD64SHRW, ssa.OpAMD64SHRB, + ssa.OpAMD64SARQ, ssa.OpAMD64SARL, ssa.OpAMD64SARW, ssa.OpAMD64SARB, + ssa.OpAMD64ROLQ, ssa.OpAMD64ROLL, ssa.OpAMD64ROLW, ssa.OpAMD64ROLB, + ssa.OpAMD64RORQ, ssa.OpAMD64RORL, ssa.OpAMD64RORW, ssa.OpAMD64RORB, + ssa.OpAMD64ADDSS, ssa.OpAMD64ADDSD, ssa.OpAMD64SUBSS, ssa.OpAMD64SUBSD, + ssa.OpAMD64MULSS, ssa.OpAMD64MULSD, ssa.OpAMD64DIVSS, ssa.OpAMD64DIVSD, + ssa.OpAMD64PXOR, + ssa.OpAMD64BTSL, ssa.OpAMD64BTSQ, + ssa.OpAMD64BTCL, ssa.OpAMD64BTCQ, + ssa.OpAMD64BTRL, ssa.OpAMD64BTRQ: + r := v.Reg() + if r != v.Args[0].Reg() { + v.Fatalf("input[0] and output not in same register %s", v.LongString()) + } + opregreg(s, v.Op.Asm(), r, v.Args[1].Reg()) + + case ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU: + // Arg[0] (the dividend) is in AX. + // Arg[1] (the divisor) can be in any other register. + // Result[0] (the quotient) is in AX. + // Result[1] (the remainder) is in DX. + r := v.Args[1].Reg() + + // Zero extend dividend. + c := s.Prog(x86.AXORL) + c.From.Type = obj.TYPE_REG + c.From.Reg = x86.REG_DX + c.To.Type = obj.TYPE_REG + c.To.Reg = x86.REG_DX + + // Issue divide. + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r + + case ssa.OpAMD64DIVQ, ssa.OpAMD64DIVL, ssa.OpAMD64DIVW: + // Arg[0] (the dividend) is in AX. + // Arg[1] (the divisor) can be in any other register. + // Result[0] (the quotient) is in AX. + // Result[1] (the remainder) is in DX. + r := v.Args[1].Reg() + var j1 *obj.Prog + + // CPU faults upon signed overflow, which occurs when the most + // negative int is divided by -1. Handle divide by -1 as a special case. 
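The fixup branch that follows exists because the hardware IDIV instruction faults on the most negative value divided by -1, while Go defines that result. A minimal program showing the Go-level semantics the generated code has to preserve (quotient equals the dividend, remainder is zero):

package main

import (
	"fmt"
	"math"
)

func main() {
	x, d := int64(math.MinInt64), int64(-1)
	// Must not fault: the spec defines x/-1 as x (two's-complement wraparound)
	// and x%-1 as 0, which is what the NEG/XOR fixup below produces.
	fmt.Println(x/d, x%d) // -9223372036854775808 0
}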
+ if ssa.DivisionNeedsFixUp(v) { + var c *obj.Prog + switch v.Op { + case ssa.OpAMD64DIVQ: + c = s.Prog(x86.ACMPQ) + case ssa.OpAMD64DIVL: + c = s.Prog(x86.ACMPL) + case ssa.OpAMD64DIVW: + c = s.Prog(x86.ACMPW) + } + c.From.Type = obj.TYPE_REG + c.From.Reg = r + c.To.Type = obj.TYPE_CONST + c.To.Offset = -1 + j1 = s.Prog(x86.AJEQ) + j1.To.Type = obj.TYPE_BRANCH + } + + // Sign extend dividend. + switch v.Op { + case ssa.OpAMD64DIVQ: + s.Prog(x86.ACQO) + case ssa.OpAMD64DIVL: + s.Prog(x86.ACDQ) + case ssa.OpAMD64DIVW: + s.Prog(x86.ACWD) + } + + // Issue divide. + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r + + if j1 != nil { + // Skip over -1 fixup code. + j2 := s.Prog(obj.AJMP) + j2.To.Type = obj.TYPE_BRANCH + + // Issue -1 fixup code. + // n / -1 = -n + var n1 *obj.Prog + switch v.Op { + case ssa.OpAMD64DIVQ: + n1 = s.Prog(x86.ANEGQ) + case ssa.OpAMD64DIVL: + n1 = s.Prog(x86.ANEGL) + case ssa.OpAMD64DIVW: + n1 = s.Prog(x86.ANEGW) + } + n1.To.Type = obj.TYPE_REG + n1.To.Reg = x86.REG_AX + + // n % -1 == 0 + n2 := s.Prog(x86.AXORL) + n2.From.Type = obj.TYPE_REG + n2.From.Reg = x86.REG_DX + n2.To.Type = obj.TYPE_REG + n2.To.Reg = x86.REG_DX + + // TODO(khr): issue only the -1 fixup code we need. + // For instance, if only the quotient is used, no point in zeroing the remainder. + + j1.To.SetTarget(n1) + j2.To.SetTarget(s.Pc()) + } + + case ssa.OpAMD64HMULQ, ssa.OpAMD64HMULL, ssa.OpAMD64HMULQU, ssa.OpAMD64HMULLU: + // the frontend rewrites constant division by 8/16/32 bit integers into + // HMUL by a constant + // SSA rewrites generate the 64 bit versions + + // Arg[0] is already in AX as it's the only register we allow + // and DX is the only output we care about (the high bits) + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[1].Reg() + + // IMULB puts the high portion in AH instead of DL, + // so move it to DL for consistency + if v.Type.Size() == 1 { + m := s.Prog(x86.AMOVB) + m.From.Type = obj.TYPE_REG + m.From.Reg = x86.REG_AH + m.To.Type = obj.TYPE_REG + m.To.Reg = x86.REG_DX + } + + case ssa.OpAMD64MULQU, ssa.OpAMD64MULLU: + // Arg[0] is already in AX as it's the only register we allow + // results lo in AX + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[1].Reg() + + case ssa.OpAMD64MULQU2: + // Arg[0] is already in AX as it's the only register we allow + // results hi in DX, lo in AX + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[1].Reg() + + case ssa.OpAMD64DIVQU2: + // Arg[0], Arg[1] are already in Dx, AX, as they're the only registers we allow + // results q in AX, r in DX + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[2].Reg() + + case ssa.OpAMD64AVGQU: + // compute (x+y)/2 unsigned. + // Do a 64-bit add, the overflow goes into the carry. + // Shift right once and pull the carry back into the 63rd bit. 
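A sketch of the same idea in Go, assuming math/bits: the ADDQ produces a 64-bit sum plus a carry, and the RCRQ shifts that 65-bit quantity right once, pulling the carry back in as bit 63.

package main

import (
	"fmt"
	"math/bits"
)

// avg computes (x+y)/2 without losing the bit that overflows out of 64 bits,
// mirroring the ADDQ+RCRQ sequence emitted below.
func avg(x, y uint64) uint64 {
	sum, carry := bits.Add64(x, y, 0)
	return sum>>1 | carry<<63
}

func main() {
	m := ^uint64(0) // MaxUint64
	fmt.Println(avg(m, m) == m) // true
}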
+ r := v.Reg() + if r != v.Args[0].Reg() { + v.Fatalf("input[0] and output not in same register %s", v.LongString()) + } + p := s.Prog(x86.AADDQ) + p.From.Type = obj.TYPE_REG + p.To.Type = obj.TYPE_REG + p.To.Reg = r + p.From.Reg = v.Args[1].Reg() + p = s.Prog(x86.ARCRQ) + p.From.Type = obj.TYPE_CONST + p.From.Offset = 1 + p.To.Type = obj.TYPE_REG + p.To.Reg = r + + case ssa.OpAMD64ADDQcarry, ssa.OpAMD64ADCQ: + r := v.Reg0() + r0 := v.Args[0].Reg() + r1 := v.Args[1].Reg() + switch r { + case r0: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r1 + p.To.Type = obj.TYPE_REG + p.To.Reg = r + case r1: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r0 + p.To.Type = obj.TYPE_REG + p.To.Reg = r + default: + v.Fatalf("output not in same register as an input %s", v.LongString()) + } + + case ssa.OpAMD64SUBQborrow, ssa.OpAMD64SBBQ: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[1].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg0() + + case ssa.OpAMD64ADDQconstcarry, ssa.OpAMD64ADCQconst, ssa.OpAMD64SUBQconstborrow, ssa.OpAMD64SBBQconst: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST + p.From.Offset = v.AuxInt + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg0() + + case ssa.OpAMD64ADDQconst, ssa.OpAMD64ADDLconst: + r := v.Reg() + a := v.Args[0].Reg() + if r == a { + switch v.AuxInt { + case 1: + var asm obj.As + // Software optimization manual recommends add $1,reg. + // But inc/dec is 1 byte smaller. ICC always uses inc + // Clang/GCC choose depending on flags, but prefer add. + // Experiments show that inc/dec is both a little faster + // and make a binary a little smaller. + if v.Op == ssa.OpAMD64ADDQconst { + asm = x86.AINCQ + } else { + asm = x86.AINCL + } + p := s.Prog(asm) + p.To.Type = obj.TYPE_REG + p.To.Reg = r + return + case -1: + var asm obj.As + if v.Op == ssa.OpAMD64ADDQconst { + asm = x86.ADECQ + } else { + asm = x86.ADECL + } + p := s.Prog(asm) + p.To.Type = obj.TYPE_REG + p.To.Reg = r + return + case 0x80: + // 'SUBQ $-0x80, r' is shorter to encode than + // and functionally equivalent to 'ADDQ $0x80, r'. 
+ asm := x86.ASUBL + if v.Op == ssa.OpAMD64ADDQconst { + asm = x86.ASUBQ + } + p := s.Prog(asm) + p.From.Type = obj.TYPE_CONST + p.From.Offset = -0x80 + p.To.Type = obj.TYPE_REG + p.To.Reg = r + return + + } + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST + p.From.Offset = v.AuxInt + p.To.Type = obj.TYPE_REG + p.To.Reg = r + return + } + var asm obj.As + if v.Op == ssa.OpAMD64ADDQconst { + asm = x86.ALEAQ + } else { + asm = x86.ALEAL + } + p := s.Prog(asm) + p.From.Type = obj.TYPE_MEM + p.From.Reg = a + p.From.Offset = v.AuxInt + p.To.Type = obj.TYPE_REG + p.To.Reg = r + + case ssa.OpAMD64CMOVQEQ, ssa.OpAMD64CMOVLEQ, ssa.OpAMD64CMOVWEQ, + ssa.OpAMD64CMOVQLT, ssa.OpAMD64CMOVLLT, ssa.OpAMD64CMOVWLT, + ssa.OpAMD64CMOVQNE, ssa.OpAMD64CMOVLNE, ssa.OpAMD64CMOVWNE, + ssa.OpAMD64CMOVQGT, ssa.OpAMD64CMOVLGT, ssa.OpAMD64CMOVWGT, + ssa.OpAMD64CMOVQLE, ssa.OpAMD64CMOVLLE, ssa.OpAMD64CMOVWLE, + ssa.OpAMD64CMOVQGE, ssa.OpAMD64CMOVLGE, ssa.OpAMD64CMOVWGE, + ssa.OpAMD64CMOVQHI, ssa.OpAMD64CMOVLHI, ssa.OpAMD64CMOVWHI, + ssa.OpAMD64CMOVQLS, ssa.OpAMD64CMOVLLS, ssa.OpAMD64CMOVWLS, + ssa.OpAMD64CMOVQCC, ssa.OpAMD64CMOVLCC, ssa.OpAMD64CMOVWCC, + ssa.OpAMD64CMOVQCS, ssa.OpAMD64CMOVLCS, ssa.OpAMD64CMOVWCS, + ssa.OpAMD64CMOVQGTF, ssa.OpAMD64CMOVLGTF, ssa.OpAMD64CMOVWGTF, + ssa.OpAMD64CMOVQGEF, ssa.OpAMD64CMOVLGEF, ssa.OpAMD64CMOVWGEF: + r := v.Reg() + if r != v.Args[0].Reg() { + v.Fatalf("input[0] and output not in same register %s", v.LongString()) + } + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[1].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = r + + case ssa.OpAMD64CMOVQNEF, ssa.OpAMD64CMOVLNEF, ssa.OpAMD64CMOVWNEF: + r := v.Reg() + if r != v.Args[0].Reg() { + v.Fatalf("input[0] and output not in same register %s", v.LongString()) + } + // Flag condition: ^ZERO || PARITY + // Generate: + // CMOV*NE SRC,DST + // CMOV*PS SRC,DST + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[1].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = r + var q *obj.Prog + if v.Op == ssa.OpAMD64CMOVQNEF { + q = s.Prog(x86.ACMOVQPS) + } else if v.Op == ssa.OpAMD64CMOVLNEF { + q = s.Prog(x86.ACMOVLPS) + } else { + q = s.Prog(x86.ACMOVWPS) + } + q.From.Type = obj.TYPE_REG + q.From.Reg = v.Args[1].Reg() + q.To.Type = obj.TYPE_REG + q.To.Reg = r + + case ssa.OpAMD64CMOVQEQF, ssa.OpAMD64CMOVLEQF, ssa.OpAMD64CMOVWEQF: + r := v.Reg() + if r != v.Args[0].Reg() { + v.Fatalf("input[0] and output not in same register %s", v.LongString()) + } + + // Flag condition: ZERO && !PARITY + // Generate: + // MOV SRC,AX + // CMOV*NE DST,AX + // CMOV*PC AX,DST + // + // TODO(rasky): we could generate: + // CMOV*NE DST,SRC + // CMOV*PC SRC,DST + // But this requires a way for regalloc to know that SRC might be + // clobbered by this instruction. 
+ if v.Args[1].Reg() != x86.REG_AX { + opregreg(s, moveByType(v.Type), x86.REG_AX, v.Args[1].Reg()) + } + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r + p.To.Type = obj.TYPE_REG + p.To.Reg = x86.REG_AX + var q *obj.Prog + if v.Op == ssa.OpAMD64CMOVQEQF { + q = s.Prog(x86.ACMOVQPC) + } else if v.Op == ssa.OpAMD64CMOVLEQF { + q = s.Prog(x86.ACMOVLPC) + } else { + q = s.Prog(x86.ACMOVWPC) + } + q.From.Type = obj.TYPE_REG + q.From.Reg = x86.REG_AX + q.To.Type = obj.TYPE_REG + q.To.Reg = r + + case ssa.OpAMD64MULQconst, ssa.OpAMD64MULLconst: + r := v.Reg() + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST + p.From.Offset = v.AuxInt + p.To.Type = obj.TYPE_REG + p.To.Reg = r + p.SetFrom3(obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[0].Reg()}) + + case ssa.OpAMD64SUBQconst, ssa.OpAMD64SUBLconst, + ssa.OpAMD64ANDQconst, ssa.OpAMD64ANDLconst, + ssa.OpAMD64ORQconst, ssa.OpAMD64ORLconst, + ssa.OpAMD64XORQconst, ssa.OpAMD64XORLconst, + ssa.OpAMD64SHLQconst, ssa.OpAMD64SHLLconst, + ssa.OpAMD64SHRQconst, ssa.OpAMD64SHRLconst, ssa.OpAMD64SHRWconst, ssa.OpAMD64SHRBconst, + ssa.OpAMD64SARQconst, ssa.OpAMD64SARLconst, ssa.OpAMD64SARWconst, ssa.OpAMD64SARBconst, + ssa.OpAMD64ROLQconst, ssa.OpAMD64ROLLconst, ssa.OpAMD64ROLWconst, ssa.OpAMD64ROLBconst: + r := v.Reg() + if r != v.Args[0].Reg() { + v.Fatalf("input[0] and output not in same register %s", v.LongString()) + } + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST + p.From.Offset = v.AuxInt + p.To.Type = obj.TYPE_REG + p.To.Reg = r + case ssa.OpAMD64SBBQcarrymask, ssa.OpAMD64SBBLcarrymask: + r := v.Reg() + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r + p.To.Type = obj.TYPE_REG + p.To.Reg = r + case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8, + ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8, + ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8: + p := s.Prog(v.Op.Asm()) + memIdx(&p.From, v) + o := v.Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = o + if v.AuxInt != 0 && v.Aux == nil { + // Emit an additional LEA to add the displacement instead of creating a slow 3 operand LEA. + switch v.Op { + case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8: + p = s.Prog(x86.ALEAQ) + case ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8: + p = s.Prog(x86.ALEAL) + case ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8: + p = s.Prog(x86.ALEAW) + } + p.From.Type = obj.TYPE_MEM + p.From.Reg = o + p.To.Type = obj.TYPE_REG + p.To.Reg = o + } + gc.AddAux(&p.From, v) + case ssa.OpAMD64LEAQ, ssa.OpAMD64LEAL, ssa.OpAMD64LEAW: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_MEM + p.From.Reg = v.Args[0].Reg() + gc.AddAux(&p.From, v) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + case ssa.OpAMD64CMPQ, ssa.OpAMD64CMPL, ssa.OpAMD64CMPW, ssa.OpAMD64CMPB, + ssa.OpAMD64TESTQ, ssa.OpAMD64TESTL, ssa.OpAMD64TESTW, ssa.OpAMD64TESTB, + ssa.OpAMD64BTL, ssa.OpAMD64BTQ: + opregreg(s, v.Op.Asm(), v.Args[1].Reg(), v.Args[0].Reg()) + case ssa.OpAMD64UCOMISS, ssa.OpAMD64UCOMISD: + // Go assembler has swapped operands for UCOMISx relative to CMP, + // must account for that right here. 
+ opregreg(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg()) + case ssa.OpAMD64CMPQconst, ssa.OpAMD64CMPLconst, ssa.OpAMD64CMPWconst, ssa.OpAMD64CMPBconst: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_CONST + p.To.Offset = v.AuxInt + case ssa.OpAMD64BTLconst, ssa.OpAMD64BTQconst, + ssa.OpAMD64TESTQconst, ssa.OpAMD64TESTLconst, ssa.OpAMD64TESTWconst, ssa.OpAMD64TESTBconst, + ssa.OpAMD64BTSLconst, ssa.OpAMD64BTSQconst, + ssa.OpAMD64BTCLconst, ssa.OpAMD64BTCQconst, + ssa.OpAMD64BTRLconst, ssa.OpAMD64BTRQconst: + op := v.Op + if op == ssa.OpAMD64BTQconst && v.AuxInt < 32 { + // Emit 32-bit version because it's shorter + op = ssa.OpAMD64BTLconst + } + p := s.Prog(op.Asm()) + p.From.Type = obj.TYPE_CONST + p.From.Offset = v.AuxInt + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Args[0].Reg() + case ssa.OpAMD64CMPQload, ssa.OpAMD64CMPLload, ssa.OpAMD64CMPWload, ssa.OpAMD64CMPBload: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_MEM + p.From.Reg = v.Args[0].Reg() + gc.AddAux(&p.From, v) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Args[1].Reg() + case ssa.OpAMD64CMPQconstload, ssa.OpAMD64CMPLconstload, ssa.OpAMD64CMPWconstload, ssa.OpAMD64CMPBconstload: + sc := v.AuxValAndOff() + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_MEM + p.From.Reg = v.Args[0].Reg() + gc.AddAux2(&p.From, v, sc.Off()) + p.To.Type = obj.TYPE_CONST + p.To.Offset = sc.Val() + case ssa.OpAMD64CMPQloadidx8, ssa.OpAMD64CMPQloadidx1, ssa.OpAMD64CMPLloadidx4, ssa.OpAMD64CMPLloadidx1, ssa.OpAMD64CMPWloadidx2, ssa.OpAMD64CMPWloadidx1, ssa.OpAMD64CMPBloadidx1: + p := s.Prog(v.Op.Asm()) + memIdx(&p.From, v) + gc.AddAux(&p.From, v) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Args[2].Reg() + case ssa.OpAMD64CMPQconstloadidx8, ssa.OpAMD64CMPQconstloadidx1, ssa.OpAMD64CMPLconstloadidx4, ssa.OpAMD64CMPLconstloadidx1, ssa.OpAMD64CMPWconstloadidx2, ssa.OpAMD64CMPWconstloadidx1, ssa.OpAMD64CMPBconstloadidx1: + sc := v.AuxValAndOff() + p := s.Prog(v.Op.Asm()) + memIdx(&p.From, v) + gc.AddAux2(&p.From, v, sc.Off()) + p.To.Type = obj.TYPE_CONST + p.To.Offset = sc.Val() + case ssa.OpAMD64MOVLconst, ssa.OpAMD64MOVQconst: + x := v.Reg() + + // If flags aren't live (indicated by v.Aux == nil), + // then we can rewrite MOV $0, AX into XOR AX, AX. + if v.AuxInt == 0 && v.Aux == nil { + p := s.Prog(x86.AXORL) + p.From.Type = obj.TYPE_REG + p.From.Reg = x + p.To.Type = obj.TYPE_REG + p.To.Reg = x + break + } + + asm := v.Op.Asm() + // Use MOVL to move a small constant into a register + // when the constant is positive and fits into 32 bits. + if 0 <= v.AuxInt && v.AuxInt <= (1<<32-1) { + // The upper 32bit are zeroed automatically when using MOVL. 
+ asm = x86.AMOVL + } + p := s.Prog(asm) + p.From.Type = obj.TYPE_CONST + p.From.Offset = v.AuxInt + p.To.Type = obj.TYPE_REG + p.To.Reg = x + case ssa.OpAMD64MOVSSconst, ssa.OpAMD64MOVSDconst: + x := v.Reg() + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_FCONST + p.From.Val = math.Float64frombits(uint64(v.AuxInt)) + p.To.Type = obj.TYPE_REG + p.To.Reg = x + case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload, ssa.OpAMD64MOVOload: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_MEM + p.From.Reg = v.Args[0].Reg() + gc.AddAux(&p.From, v) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + case ssa.OpAMD64MOVBloadidx1, ssa.OpAMD64MOVWloadidx1, ssa.OpAMD64MOVLloadidx1, ssa.OpAMD64MOVQloadidx1, ssa.OpAMD64MOVSSloadidx1, ssa.OpAMD64MOVSDloadidx1, + ssa.OpAMD64MOVQloadidx8, ssa.OpAMD64MOVSDloadidx8, ssa.OpAMD64MOVLloadidx8, ssa.OpAMD64MOVLloadidx4, ssa.OpAMD64MOVSSloadidx4, ssa.OpAMD64MOVWloadidx2: + p := s.Prog(v.Op.Asm()) + memIdx(&p.From, v) + gc.AddAux(&p.From, v) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + case ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore, ssa.OpAMD64MOVOstore, + ssa.OpAMD64BTCQmodify, ssa.OpAMD64BTCLmodify, ssa.OpAMD64BTRQmodify, ssa.OpAMD64BTRLmodify, ssa.OpAMD64BTSQmodify, ssa.OpAMD64BTSLmodify, + ssa.OpAMD64ADDQmodify, ssa.OpAMD64SUBQmodify, ssa.OpAMD64ANDQmodify, ssa.OpAMD64ORQmodify, ssa.OpAMD64XORQmodify, + ssa.OpAMD64ADDLmodify, ssa.OpAMD64SUBLmodify, ssa.OpAMD64ANDLmodify, ssa.OpAMD64ORLmodify, ssa.OpAMD64XORLmodify: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[1].Reg() + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + gc.AddAux(&p.To, v) + case ssa.OpAMD64MOVBstoreidx1, ssa.OpAMD64MOVWstoreidx1, ssa.OpAMD64MOVLstoreidx1, ssa.OpAMD64MOVQstoreidx1, ssa.OpAMD64MOVSSstoreidx1, ssa.OpAMD64MOVSDstoreidx1, + ssa.OpAMD64MOVQstoreidx8, ssa.OpAMD64MOVSDstoreidx8, ssa.OpAMD64MOVLstoreidx8, ssa.OpAMD64MOVSSstoreidx4, ssa.OpAMD64MOVLstoreidx4, ssa.OpAMD64MOVWstoreidx2, + ssa.OpAMD64ADDLmodifyidx1, ssa.OpAMD64ADDLmodifyidx4, ssa.OpAMD64ADDLmodifyidx8, ssa.OpAMD64ADDQmodifyidx1, ssa.OpAMD64ADDQmodifyidx8, + ssa.OpAMD64SUBLmodifyidx1, ssa.OpAMD64SUBLmodifyidx4, ssa.OpAMD64SUBLmodifyidx8, ssa.OpAMD64SUBQmodifyidx1, ssa.OpAMD64SUBQmodifyidx8, + ssa.OpAMD64ANDLmodifyidx1, ssa.OpAMD64ANDLmodifyidx4, ssa.OpAMD64ANDLmodifyidx8, ssa.OpAMD64ANDQmodifyidx1, ssa.OpAMD64ANDQmodifyidx8, + ssa.OpAMD64ORLmodifyidx1, ssa.OpAMD64ORLmodifyidx4, ssa.OpAMD64ORLmodifyidx8, ssa.OpAMD64ORQmodifyidx1, ssa.OpAMD64ORQmodifyidx8, + ssa.OpAMD64XORLmodifyidx1, ssa.OpAMD64XORLmodifyidx4, ssa.OpAMD64XORLmodifyidx8, ssa.OpAMD64XORQmodifyidx1, ssa.OpAMD64XORQmodifyidx8: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[2].Reg() + memIdx(&p.To, v) + gc.AddAux(&p.To, v) + case ssa.OpAMD64ADDQconstmodify, ssa.OpAMD64ADDLconstmodify: + sc := v.AuxValAndOff() + off := sc.Off() + val := sc.Val() + if val == 1 || val == -1 { + var asm obj.As + if v.Op == ssa.OpAMD64ADDQconstmodify { + if val == 1 { + asm = x86.AINCQ + } else { + asm = x86.ADECQ + } + } else { + if val == 1 { + asm = x86.AINCL + } else { + asm = x86.ADECL + } + } + p := s.Prog(asm) + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + gc.AddAux2(&p.To, v, off) + break + } + fallthrough + case ssa.OpAMD64ANDQconstmodify, 
ssa.OpAMD64ANDLconstmodify, ssa.OpAMD64ORQconstmodify, ssa.OpAMD64ORLconstmodify, + ssa.OpAMD64BTCQconstmodify, ssa.OpAMD64BTCLconstmodify, ssa.OpAMD64BTSQconstmodify, ssa.OpAMD64BTSLconstmodify, + ssa.OpAMD64BTRQconstmodify, ssa.OpAMD64BTRLconstmodify, ssa.OpAMD64XORQconstmodify, ssa.OpAMD64XORLconstmodify: + sc := v.AuxValAndOff() + off := sc.Off() + val := sc.Val() + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST + p.From.Offset = val + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + gc.AddAux2(&p.To, v, off) + + case ssa.OpAMD64MOVQstoreconst, ssa.OpAMD64MOVLstoreconst, ssa.OpAMD64MOVWstoreconst, ssa.OpAMD64MOVBstoreconst: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST + sc := v.AuxValAndOff() + p.From.Offset = sc.Val() + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + gc.AddAux2(&p.To, v, sc.Off()) + case ssa.OpAMD64MOVQstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx8, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx4, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx2, ssa.OpAMD64MOVBstoreconstidx1, + ssa.OpAMD64ADDLconstmodifyidx1, ssa.OpAMD64ADDLconstmodifyidx4, ssa.OpAMD64ADDLconstmodifyidx8, ssa.OpAMD64ADDQconstmodifyidx1, ssa.OpAMD64ADDQconstmodifyidx8, + ssa.OpAMD64ANDLconstmodifyidx1, ssa.OpAMD64ANDLconstmodifyidx4, ssa.OpAMD64ANDLconstmodifyidx8, ssa.OpAMD64ANDQconstmodifyidx1, ssa.OpAMD64ANDQconstmodifyidx8, + ssa.OpAMD64ORLconstmodifyidx1, ssa.OpAMD64ORLconstmodifyidx4, ssa.OpAMD64ORLconstmodifyidx8, ssa.OpAMD64ORQconstmodifyidx1, ssa.OpAMD64ORQconstmodifyidx8, + ssa.OpAMD64XORLconstmodifyidx1, ssa.OpAMD64XORLconstmodifyidx4, ssa.OpAMD64XORLconstmodifyidx8, ssa.OpAMD64XORQconstmodifyidx1, ssa.OpAMD64XORQconstmodifyidx8: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST + sc := v.AuxValAndOff() + p.From.Offset = sc.Val() + switch { + case p.As == x86.AADDQ && p.From.Offset == 1: + p.As = x86.AINCQ + p.From.Type = obj.TYPE_NONE + case p.As == x86.AADDQ && p.From.Offset == -1: + p.As = x86.ADECQ + p.From.Type = obj.TYPE_NONE + case p.As == x86.AADDL && p.From.Offset == 1: + p.As = x86.AINCL + p.From.Type = obj.TYPE_NONE + case p.As == x86.AADDL && p.From.Offset == -1: + p.As = x86.ADECL + p.From.Type = obj.TYPE_NONE + } + memIdx(&p.To, v) + gc.AddAux2(&p.To, v, sc.Off()) + case ssa.OpAMD64MOVLQSX, ssa.OpAMD64MOVWQSX, ssa.OpAMD64MOVBQSX, ssa.OpAMD64MOVLQZX, ssa.OpAMD64MOVWQZX, ssa.OpAMD64MOVBQZX, + ssa.OpAMD64CVTTSS2SL, ssa.OpAMD64CVTTSD2SL, ssa.OpAMD64CVTTSS2SQ, ssa.OpAMD64CVTTSD2SQ, + ssa.OpAMD64CVTSS2SD, ssa.OpAMD64CVTSD2SS: + opregreg(s, v.Op.Asm(), v.Reg(), v.Args[0].Reg()) + case ssa.OpAMD64CVTSL2SD, ssa.OpAMD64CVTSQ2SD, ssa.OpAMD64CVTSQ2SS, ssa.OpAMD64CVTSL2SS: + r := v.Reg() + // Break false dependency on destination register. 
+ opregreg(s, x86.AXORPS, r, r) + opregreg(s, v.Op.Asm(), r, v.Args[0].Reg()) + case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i, ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i: + var p *obj.Prog + switch v.Op { + case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i: + p = s.Prog(x86.AMOVQ) + case ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i: + p = s.Prog(x86.AMOVL) + } + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + case ssa.OpAMD64ADDQload, ssa.OpAMD64ADDLload, ssa.OpAMD64SUBQload, ssa.OpAMD64SUBLload, + ssa.OpAMD64ANDQload, ssa.OpAMD64ANDLload, ssa.OpAMD64ORQload, ssa.OpAMD64ORLload, + ssa.OpAMD64XORQload, ssa.OpAMD64XORLload, ssa.OpAMD64ADDSDload, ssa.OpAMD64ADDSSload, + ssa.OpAMD64SUBSDload, ssa.OpAMD64SUBSSload, ssa.OpAMD64MULSDload, ssa.OpAMD64MULSSload, + ssa.OpAMD64DIVSDload, ssa.OpAMD64DIVSSload: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_MEM + p.From.Reg = v.Args[1].Reg() + gc.AddAux(&p.From, v) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + if v.Reg() != v.Args[0].Reg() { + v.Fatalf("input[0] and output not in same register %s", v.LongString()) + } + case ssa.OpAMD64ADDLloadidx1, ssa.OpAMD64ADDLloadidx4, ssa.OpAMD64ADDLloadidx8, ssa.OpAMD64ADDQloadidx1, ssa.OpAMD64ADDQloadidx8, + ssa.OpAMD64SUBLloadidx1, ssa.OpAMD64SUBLloadidx4, ssa.OpAMD64SUBLloadidx8, ssa.OpAMD64SUBQloadidx1, ssa.OpAMD64SUBQloadidx8, + ssa.OpAMD64ANDLloadidx1, ssa.OpAMD64ANDLloadidx4, ssa.OpAMD64ANDLloadidx8, ssa.OpAMD64ANDQloadidx1, ssa.OpAMD64ANDQloadidx8, + ssa.OpAMD64ORLloadidx1, ssa.OpAMD64ORLloadidx4, ssa.OpAMD64ORLloadidx8, ssa.OpAMD64ORQloadidx1, ssa.OpAMD64ORQloadidx8, + ssa.OpAMD64XORLloadidx1, ssa.OpAMD64XORLloadidx4, ssa.OpAMD64XORLloadidx8, ssa.OpAMD64XORQloadidx1, ssa.OpAMD64XORQloadidx8, + ssa.OpAMD64ADDSSloadidx1, ssa.OpAMD64ADDSSloadidx4, ssa.OpAMD64ADDSDloadidx1, ssa.OpAMD64ADDSDloadidx8, + ssa.OpAMD64SUBSSloadidx1, ssa.OpAMD64SUBSSloadidx4, ssa.OpAMD64SUBSDloadidx1, ssa.OpAMD64SUBSDloadidx8, + ssa.OpAMD64MULSSloadidx1, ssa.OpAMD64MULSSloadidx4, ssa.OpAMD64MULSDloadidx1, ssa.OpAMD64MULSDloadidx8, + ssa.OpAMD64DIVSSloadidx1, ssa.OpAMD64DIVSSloadidx4, ssa.OpAMD64DIVSDloadidx1, ssa.OpAMD64DIVSDloadidx8: + p := s.Prog(v.Op.Asm()) + + r, i := v.Args[1].Reg(), v.Args[2].Reg() + p.From.Type = obj.TYPE_MEM + p.From.Scale = v.Op.Scale() + if p.From.Scale == 1 && i == x86.REG_SP { + r, i = i, r + } + p.From.Reg = r + p.From.Index = i + + gc.AddAux(&p.From, v) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + if v.Reg() != v.Args[0].Reg() { + v.Fatalf("input[0] and output not in same register %s", v.LongString()) + } + case ssa.OpAMD64DUFFZERO: + off := duffStart(v.AuxInt) + adj := duffAdj(v.AuxInt) + var p *obj.Prog + if adj != 0 { + p = s.Prog(x86.ALEAQ) + p.From.Type = obj.TYPE_MEM + p.From.Offset = adj + p.From.Reg = x86.REG_DI + p.To.Type = obj.TYPE_REG + p.To.Reg = x86.REG_DI + } + p = s.Prog(obj.ADUFFZERO) + p.To.Type = obj.TYPE_ADDR + p.To.Sym = gc.Duffzero + p.To.Offset = off + case ssa.OpAMD64MOVOconst: + if v.AuxInt != 0 { + v.Fatalf("MOVOconst can only do constant=0") + } + r := v.Reg() + opregreg(s, x86.AXORPS, r, r) + case ssa.OpAMD64DUFFCOPY: + p := s.Prog(obj.ADUFFCOPY) + p.To.Type = obj.TYPE_ADDR + p.To.Sym = gc.Duffcopy + if v.AuxInt%16 != 0 { + v.Fatalf("bad DUFFCOPY AuxInt %v", v.AuxInt) + } + p.To.Offset = 14 * (64 - v.AuxInt/16) + // 14 and 64 are magic constants. 14 is the number of bytes to encode: + // MOVUPS (SI), X0 + // ADDQ $16, SI + // MOVUPS X0, (DI) + // ADDQ $16, DI + // and 64 is the number of such blocks. 
See src/runtime/duff_amd64.s:duffcopy. + + case ssa.OpCopy: // TODO: use MOVQreg for reg->reg copies instead of OpCopy? + if v.Type.IsMemory() { + return + } + x := v.Args[0].Reg() + y := v.Reg() + if x != y { + opregreg(s, moveByType(v.Type), y, x) + } + case ssa.OpLoadReg: + if v.Type.IsFlags() { + v.Fatalf("load flags not implemented: %v", v.LongString()) + return + } + p := s.Prog(loadByType(v.Type)) + gc.AddrAuto(&p.From, v.Args[0]) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpStoreReg: + if v.Type.IsFlags() { + v.Fatalf("store flags not implemented: %v", v.LongString()) + return + } + p := s.Prog(storeByType(v.Type)) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + gc.AddrAuto(&p.To, v) + case ssa.OpAMD64LoweredHasCPUFeature: + p := s.Prog(x86.AMOVBQZX) + p.From.Type = obj.TYPE_MEM + gc.AddAux(&p.From, v) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + case ssa.OpAMD64LoweredGetClosurePtr: + // Closure pointer is DX. + gc.CheckLoweredGetClosurePtr(v) + case ssa.OpAMD64LoweredGetG: + r := v.Reg() + // See the comments in cmd/internal/obj/x86/obj6.go + // near CanUse1InsnTLS for a detailed explanation of these instructions. + if x86.CanUse1InsnTLS(gc.Ctxt) { + // MOVQ (TLS), r + p := s.Prog(x86.AMOVQ) + p.From.Type = obj.TYPE_MEM + p.From.Reg = x86.REG_TLS + p.To.Type = obj.TYPE_REG + p.To.Reg = r + } else { + // MOVQ TLS, r + // MOVQ (r)(TLS*1), r + p := s.Prog(x86.AMOVQ) + p.From.Type = obj.TYPE_REG + p.From.Reg = x86.REG_TLS + p.To.Type = obj.TYPE_REG + p.To.Reg = r + q := s.Prog(x86.AMOVQ) + q.From.Type = obj.TYPE_MEM + q.From.Reg = r + q.From.Index = x86.REG_TLS + q.From.Scale = 1 + q.To.Type = obj.TYPE_REG + q.To.Reg = r + } + case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLclosure, ssa.OpAMD64CALLinter: + s.Call(v) + + case ssa.OpAMD64LoweredGetCallerPC: + p := s.Prog(x86.AMOVQ) + p.From.Type = obj.TYPE_MEM + p.From.Offset = -8 // PC is stored 8 bytes below first parameter. + p.From.Name = obj.NAME_PARAM + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpAMD64LoweredGetCallerSP: + // caller's SP is the address of the first arg + mov := x86.AMOVQ + if gc.Widthptr == 4 { + mov = x86.AMOVL + } + p := s.Prog(mov) + p.From.Type = obj.TYPE_ADDR + p.From.Offset = -gc.Ctxt.FixedFrameSize() // 0 on amd64, just to be consistent with other architectures + p.From.Name = obj.NAME_PARAM + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpAMD64LoweredWB: + p := s.Prog(obj.ACALL) + p.To.Type = obj.TYPE_MEM + p.To.Name = obj.NAME_EXTERN + // arg0 is in DI. Set sym to match where regalloc put arg1. 
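Referring back to the DUFFCOPY case above, a tiny sketch of its entry-point arithmetic: with 64 blocks of 14 bytes each in runtime·duffcopy, a copy of n bytes (n a multiple of 16) jumps 14*(64 - n/16) bytes into the routine.

package main

import "fmt"

// duffCopyOff mirrors the offset computation in the DUFFCOPY case above.
func duffCopyOff(n int64) int64 { return 14 * (64 - n/16) }

func main() {
	fmt.Println(duffCopyOff(16), duffCopyOff(1024)) // 882 0
}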
+ p.To.Sym = gc.GCWriteBarrierReg[v.Args[1].Reg()] + + case ssa.OpAMD64LoweredPanicBoundsA, ssa.OpAMD64LoweredPanicBoundsB, ssa.OpAMD64LoweredPanicBoundsC: + p := s.Prog(obj.ACALL) + p.To.Type = obj.TYPE_MEM + p.To.Name = obj.NAME_EXTERN + p.To.Sym = gc.BoundsCheckFunc[v.AuxInt] + s.UseArgs(int64(2 * gc.Widthptr)) // space used in callee args area by assembly stubs + + case ssa.OpAMD64NEGQ, ssa.OpAMD64NEGL, + ssa.OpAMD64BSWAPQ, ssa.OpAMD64BSWAPL, + ssa.OpAMD64NOTQ, ssa.OpAMD64NOTL: + r := v.Reg() + if r != v.Args[0].Reg() { + v.Fatalf("input[0] and output not in same register %s", v.LongString()) + } + p := s.Prog(v.Op.Asm()) + p.To.Type = obj.TYPE_REG + p.To.Reg = r + + case ssa.OpAMD64NEGLflags: + r := v.Reg0() + if r != v.Args[0].Reg() { + v.Fatalf("input[0] and output not in same register %s", v.LongString()) + } + p := s.Prog(v.Op.Asm()) + p.To.Type = obj.TYPE_REG + p.To.Reg = r + + case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ, ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_REG + switch v.Op { + case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ: + p.To.Reg = v.Reg0() + case ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD: + p.To.Reg = v.Reg() + } + case ssa.OpAMD64ROUNDSD: + p := s.Prog(v.Op.Asm()) + val := v.AuxInt + // 0 means math.RoundToEven, 1 Floor, 2 Ceil, 3 Trunc + if val < 0 || val > 3 { + v.Fatalf("Invalid rounding mode") + } + p.From.Offset = val + p.From.Type = obj.TYPE_CONST + p.SetFrom3(obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[0].Reg()}) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL: + if v.Args[0].Reg() != v.Reg() { + // POPCNT on Intel has a false dependency on the destination register. + // Xor register with itself to break the dependency. 
+ p := s.Prog(x86.AXORQ) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + } + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpAMD64SETEQ, ssa.OpAMD64SETNE, + ssa.OpAMD64SETL, ssa.OpAMD64SETLE, + ssa.OpAMD64SETG, ssa.OpAMD64SETGE, + ssa.OpAMD64SETGF, ssa.OpAMD64SETGEF, + ssa.OpAMD64SETB, ssa.OpAMD64SETBE, + ssa.OpAMD64SETORD, ssa.OpAMD64SETNAN, + ssa.OpAMD64SETA, ssa.OpAMD64SETAE, + ssa.OpAMD64SETO: + p := s.Prog(v.Op.Asm()) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpAMD64SETEQstore, ssa.OpAMD64SETNEstore, + ssa.OpAMD64SETLstore, ssa.OpAMD64SETLEstore, + ssa.OpAMD64SETGstore, ssa.OpAMD64SETGEstore, + ssa.OpAMD64SETBstore, ssa.OpAMD64SETBEstore, + ssa.OpAMD64SETAstore, ssa.OpAMD64SETAEstore: + p := s.Prog(v.Op.Asm()) + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + gc.AddAux(&p.To, v) + + case ssa.OpAMD64SETNEF: + p := s.Prog(v.Op.Asm()) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + q := s.Prog(x86.ASETPS) + q.To.Type = obj.TYPE_REG + q.To.Reg = x86.REG_AX + // ORL avoids partial register write and is smaller than ORQ, used by old compiler + opregreg(s, x86.AORL, v.Reg(), x86.REG_AX) + + case ssa.OpAMD64SETEQF: + p := s.Prog(v.Op.Asm()) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + q := s.Prog(x86.ASETPC) + q.To.Type = obj.TYPE_REG + q.To.Reg = x86.REG_AX + // ANDL avoids partial register write and is smaller than ANDQ, used by old compiler + opregreg(s, x86.AANDL, v.Reg(), x86.REG_AX) + + case ssa.OpAMD64InvertFlags: + v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString()) + case ssa.OpAMD64FlagEQ, ssa.OpAMD64FlagLT_ULT, ssa.OpAMD64FlagLT_UGT, ssa.OpAMD64FlagGT_ULT, ssa.OpAMD64FlagGT_UGT: + v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString()) + case ssa.OpAMD64AddTupleFirst32, ssa.OpAMD64AddTupleFirst64: + v.Fatalf("AddTupleFirst* should never make it to codegen %v", v.LongString()) + case ssa.OpAMD64REPSTOSQ: + s.Prog(x86.AREP) + s.Prog(x86.ASTOSQ) + case ssa.OpAMD64REPMOVSQ: + s.Prog(x86.AREP) + s.Prog(x86.AMOVSQ) + case ssa.OpAMD64LoweredNilCheck: + // Issue a load which will fault if the input is nil. + // TODO: We currently use the 2-byte instruction TESTB AX, (reg). + // Should we use the 3-byte TESTB $0, (reg) instead? It is larger + // but it doesn't have false dependency on AX. + // Or maybe allocate an output register and use MOVL (reg),reg2 ? + // That trades clobbering flags for clobbering a register. 
+ p := s.Prog(x86.ATESTB) + p.From.Type = obj.TYPE_REG + p.From.Reg = x86.REG_AX + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + if logopt.Enabled() { + logopt.LogOpt(v.Pos, "nilcheck", "genssa", v.Block.Func.Name) + } + if gc.Debug_checknil != 0 && v.Pos.Line() > 1 { // v.Pos.Line()==1 in generated wrappers + gc.Warnl(v.Pos, "generated nil check") + } + case ssa.OpAMD64MOVBatomicload, ssa.OpAMD64MOVLatomicload, ssa.OpAMD64MOVQatomicload: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_MEM + p.From.Reg = v.Args[0].Reg() + gc.AddAux(&p.From, v) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg0() + case ssa.OpAMD64XCHGB, ssa.OpAMD64XCHGL, ssa.OpAMD64XCHGQ: + r := v.Reg0() + if r != v.Args[0].Reg() { + v.Fatalf("input[0] and output[0] not in same register %s", v.LongString()) + } + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[1].Reg() + gc.AddAux(&p.To, v) + case ssa.OpAMD64XADDLlock, ssa.OpAMD64XADDQlock: + r := v.Reg0() + if r != v.Args[0].Reg() { + v.Fatalf("input[0] and output[0] not in same register %s", v.LongString()) + } + s.Prog(x86.ALOCK) + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[1].Reg() + gc.AddAux(&p.To, v) + case ssa.OpAMD64CMPXCHGLlock, ssa.OpAMD64CMPXCHGQlock: + if v.Args[1].Reg() != x86.REG_AX { + v.Fatalf("input[1] not in AX %s", v.LongString()) + } + s.Prog(x86.ALOCK) + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[2].Reg() + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + gc.AddAux(&p.To, v) + p = s.Prog(x86.ASETEQ) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg0() + case ssa.OpAMD64ANDBlock, ssa.OpAMD64ANDLlock, ssa.OpAMD64ORBlock, ssa.OpAMD64ORLlock: + s.Prog(x86.ALOCK) + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[1].Reg() + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + gc.AddAux(&p.To, v) + case ssa.OpClobber: + p := s.Prog(x86.AMOVL) + p.From.Type = obj.TYPE_CONST + p.From.Offset = 0xdeaddead + p.To.Type = obj.TYPE_MEM + p.To.Reg = x86.REG_SP + gc.AddAux(&p.To, v) + p = s.Prog(x86.AMOVL) + p.From.Type = obj.TYPE_CONST + p.From.Offset = 0xdeaddead + p.To.Type = obj.TYPE_MEM + p.To.Reg = x86.REG_SP + gc.AddAux(&p.To, v) + p.To.Offset += 4 + default: + v.Fatalf("genValue not implemented: %s", v.LongString()) + } +} + +var blockJump = [...]struct { + asm, invasm obj.As +}{ + ssa.BlockAMD64EQ: {x86.AJEQ, x86.AJNE}, + ssa.BlockAMD64NE: {x86.AJNE, x86.AJEQ}, + ssa.BlockAMD64LT: {x86.AJLT, x86.AJGE}, + ssa.BlockAMD64GE: {x86.AJGE, x86.AJLT}, + ssa.BlockAMD64LE: {x86.AJLE, x86.AJGT}, + ssa.BlockAMD64GT: {x86.AJGT, x86.AJLE}, + ssa.BlockAMD64OS: {x86.AJOS, x86.AJOC}, + ssa.BlockAMD64OC: {x86.AJOC, x86.AJOS}, + ssa.BlockAMD64ULT: {x86.AJCS, x86.AJCC}, + ssa.BlockAMD64UGE: {x86.AJCC, x86.AJCS}, + ssa.BlockAMD64UGT: {x86.AJHI, x86.AJLS}, + ssa.BlockAMD64ULE: {x86.AJLS, x86.AJHI}, + ssa.BlockAMD64ORD: {x86.AJPC, x86.AJPS}, + ssa.BlockAMD64NAN: {x86.AJPS, x86.AJPC}, +} + +var eqfJumps = [2][2]gc.IndexJump{ + {{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPS, Index: 1}}, // next == b.Succs[0] + {{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPC, Index: 0}}, // next == b.Succs[1] +} +var nefJumps = [2][2]gc.IndexJump{ + {{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPC, Index: 1}}, // next == b.Succs[0] + {{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPS, Index: 0}}, // next == b.Succs[1] +} + +func ssaGenBlock(s *gc.SSAGenState, b, next *ssa.Block) { + switch b.Kind { + 
case ssa.BlockPlain: + if b.Succs[0].Block() != next { + p := s.Prog(obj.AJMP) + p.To.Type = obj.TYPE_BRANCH + s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()}) + } + case ssa.BlockDefer: + // defer returns in rax: + // 0 if we should continue executing + // 1 if we should jump to deferreturn call + p := s.Prog(x86.ATESTL) + p.From.Type = obj.TYPE_REG + p.From.Reg = x86.REG_AX + p.To.Type = obj.TYPE_REG + p.To.Reg = x86.REG_AX + p = s.Prog(x86.AJNE) + p.To.Type = obj.TYPE_BRANCH + s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[1].Block()}) + if b.Succs[0].Block() != next { + p := s.Prog(obj.AJMP) + p.To.Type = obj.TYPE_BRANCH + s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()}) + } + case ssa.BlockExit: + case ssa.BlockRet: + s.Prog(obj.ARET) + case ssa.BlockRetJmp: + p := s.Prog(obj.ARET) + p.To.Type = obj.TYPE_MEM + p.To.Name = obj.NAME_EXTERN + p.To.Sym = b.Aux.(*obj.LSym) + + case ssa.BlockAMD64EQF: + s.CombJump(b, next, &eqfJumps) + + case ssa.BlockAMD64NEF: + s.CombJump(b, next, &nefJumps) + + case ssa.BlockAMD64EQ, ssa.BlockAMD64NE, + ssa.BlockAMD64LT, ssa.BlockAMD64GE, + ssa.BlockAMD64LE, ssa.BlockAMD64GT, + ssa.BlockAMD64OS, ssa.BlockAMD64OC, + ssa.BlockAMD64ULT, ssa.BlockAMD64UGT, + ssa.BlockAMD64ULE, ssa.BlockAMD64UGE: + jmp := blockJump[b.Kind] + switch next { + case b.Succs[0].Block(): + s.Br(jmp.invasm, b.Succs[1].Block()) + case b.Succs[1].Block(): + s.Br(jmp.asm, b.Succs[0].Block()) + default: + if b.Likely != ssa.BranchUnlikely { + s.Br(jmp.asm, b.Succs[0].Block()) + s.Br(obj.AJMP, b.Succs[1].Block()) + } else { + s.Br(jmp.invasm, b.Succs[1].Block()) + s.Br(obj.AJMP, b.Succs[0].Block()) + } + } + + default: + b.Fatalf("branch not implemented: %s", b.LongString()) + } +} |