diff options
Diffstat (limited to 'src/cmd/compile/internal/amd64')
-rw-r--r-- | src/cmd/compile/internal/amd64/galign.go | 27 | ||||
-rw-r--r-- | src/cmd/compile/internal/amd64/ggen.go | 154 | ||||
-rw-r--r-- | src/cmd/compile/internal/amd64/ssa.go | 1439 | ||||
-rw-r--r-- | src/cmd/compile/internal/amd64/versions_test.go | 433 |
4 files changed, 2053 insertions, 0 deletions
diff --git a/src/cmd/compile/internal/amd64/galign.go b/src/cmd/compile/internal/amd64/galign.go new file mode 100644 index 0000000..ca44263 --- /dev/null +++ b/src/cmd/compile/internal/amd64/galign.go @@ -0,0 +1,27 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package amd64 + +import ( + "cmd/compile/internal/ssagen" + "cmd/internal/obj/x86" +) + +var leaptr = x86.ALEAQ + +func Init(arch *ssagen.ArchInfo) { + arch.LinkArch = &x86.Linkamd64 + arch.REGSP = x86.REGSP + arch.MAXWIDTH = 1 << 50 + + arch.ZeroRange = zerorange + arch.Ginsnop = ginsnop + + arch.SSAMarkMoves = ssaMarkMoves + arch.SSAGenValue = ssaGenValue + arch.SSAGenBlock = ssaGenBlock + arch.LoadRegResult = loadRegResult + arch.SpillArgReg = spillArgReg +} diff --git a/src/cmd/compile/internal/amd64/ggen.go b/src/cmd/compile/internal/amd64/ggen.go new file mode 100644 index 0000000..b8dce81 --- /dev/null +++ b/src/cmd/compile/internal/amd64/ggen.go @@ -0,0 +1,154 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package amd64 + +import ( + "cmd/compile/internal/base" + "cmd/compile/internal/ir" + "cmd/compile/internal/objw" + "cmd/compile/internal/types" + "cmd/internal/obj" + "cmd/internal/obj/x86" + "internal/buildcfg" +) + +// no floating point in note handlers on Plan 9 +var isPlan9 = buildcfg.GOOS == "plan9" + +// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ, +// See runtime/mkduff.go. +const ( + dzBlocks = 16 // number of MOV/ADD blocks + dzBlockLen = 4 // number of clears per block + dzBlockSize = 23 // size of instructions in a single block + dzMovSize = 5 // size of single MOV instruction w/ offset + dzLeaqSize = 4 // size of single LEAQ instruction + dzClearStep = 16 // number of bytes cleared by each MOV instruction + + dzClearLen = dzClearStep * dzBlockLen // bytes cleared by one block + dzSize = dzBlocks * dzBlockSize +) + +// dzOff returns the offset for a jump into DUFFZERO. +// b is the number of bytes to zero. +func dzOff(b int64) int64 { + off := int64(dzSize) + off -= b / dzClearLen * dzBlockSize + tailLen := b % dzClearLen + if tailLen >= dzClearStep { + off -= dzLeaqSize + dzMovSize*(tailLen/dzClearStep) + } + return off +} + +// duffzeroDI returns the pre-adjustment to DI for a call to DUFFZERO. +// b is the number of bytes to zero. +func dzDI(b int64) int64 { + tailLen := b % dzClearLen + if tailLen < dzClearStep { + return 0 + } + tailSteps := tailLen / dzClearStep + return -dzClearStep * (dzBlockLen - tailSteps) +} + +func zerorange(pp *objw.Progs, p *obj.Prog, off, cnt int64, state *uint32) *obj.Prog { + const ( + r13 = 1 << iota // if R13 is already zeroed. + ) + + if cnt == 0 { + return p + } + + if cnt%int64(types.RegSize) != 0 { + // should only happen with nacl + if cnt%int64(types.PtrSize) != 0 { + base.Fatalf("zerorange count not a multiple of widthptr %d", cnt) + } + if *state&r13 == 0 { + p = pp.Append(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_R13, 0) + *state |= r13 + } + p = pp.Append(p, x86.AMOVL, obj.TYPE_REG, x86.REG_R13, 0, obj.TYPE_MEM, x86.REG_SP, off) + off += int64(types.PtrSize) + cnt -= int64(types.PtrSize) + } + + if cnt == 8 { + if *state&r13 == 0 { + p = pp.Append(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_R13, 0) + *state |= r13 + } + p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R13, 0, obj.TYPE_MEM, x86.REG_SP, off) + } else if !isPlan9 && cnt <= int64(8*types.RegSize) { + for i := int64(0); i < cnt/16; i++ { + p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off+i*16) + } + + if cnt%16 != 0 { + p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off+cnt-int64(16)) + } + } else if !isPlan9 && (cnt <= int64(128*types.RegSize)) { + // Save DI to r12. With the amd64 Go register abi, DI can contain + // an incoming parameter, whereas R12 is always scratch. + p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_DI, 0, obj.TYPE_REG, x86.REG_R12, 0) + // Emit duffzero call + p = pp.Append(p, leaptr, obj.TYPE_MEM, x86.REG_SP, off+dzDI(cnt), obj.TYPE_REG, x86.REG_DI, 0) + p = pp.Append(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_ADDR, 0, dzOff(cnt)) + p.To.Sym = ir.Syms.Duffzero + if cnt%16 != 0 { + p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_DI, -int64(8)) + } + // Restore DI from r12 + p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R12, 0, obj.TYPE_REG, x86.REG_DI, 0) + + } else { + // When the register ABI is in effect, at this point in the + // prolog we may have live values in all of RAX,RDI,RCX. Save + // them off to registers before the REPSTOSQ below, then + // restore. Note that R12 and R13 are always available as + // scratch regs; here we also use R15 (this is safe to do + // since there won't be any globals accessed in the prolog). + // See rewriteToUseGot() in obj6.go for more on r15 use. + + // Save rax/rdi/rcx + p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_DI, 0, obj.TYPE_REG, x86.REG_R12, 0) + p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_REG, x86.REG_R13, 0) + p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_CX, 0, obj.TYPE_REG, x86.REG_R15, 0) + + // Set up the REPSTOSQ and kick it off. + p = pp.Append(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_AX, 0) + p = pp.Append(p, x86.AMOVQ, obj.TYPE_CONST, 0, cnt/int64(types.RegSize), obj.TYPE_REG, x86.REG_CX, 0) + p = pp.Append(p, leaptr, obj.TYPE_MEM, x86.REG_SP, off, obj.TYPE_REG, x86.REG_DI, 0) + p = pp.Append(p, x86.AREP, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0) + p = pp.Append(p, x86.ASTOSQ, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0) + + // Restore rax/rdi/rcx + p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R12, 0, obj.TYPE_REG, x86.REG_DI, 0) + p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R13, 0, obj.TYPE_REG, x86.REG_AX, 0) + p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R15, 0, obj.TYPE_REG, x86.REG_CX, 0) + + // Record the fact that r13 is no longer zero. + *state &= ^uint32(r13) + } + + return p +} + +func ginsnop(pp *objw.Progs) *obj.Prog { + // This is a hardware nop (1-byte 0x90) instruction, + // even though we describe it as an explicit XCHGL here. + // Particularly, this does not zero the high 32 bits + // like typical *L opcodes. + // (gas assembles "xchg %eax,%eax" to 0x87 0xc0, which + // does zero the high 32 bits.) + p := pp.Prog(x86.AXCHGL) + p.From.Type = obj.TYPE_REG + p.From.Reg = x86.REG_AX + p.To.Type = obj.TYPE_REG + p.To.Reg = x86.REG_AX + return p +} diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go new file mode 100644 index 0000000..6139d5e --- /dev/null +++ b/src/cmd/compile/internal/amd64/ssa.go @@ -0,0 +1,1439 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package amd64 + +import ( + "fmt" + "internal/buildcfg" + "math" + + "cmd/compile/internal/base" + "cmd/compile/internal/ir" + "cmd/compile/internal/logopt" + "cmd/compile/internal/objw" + "cmd/compile/internal/ssa" + "cmd/compile/internal/ssagen" + "cmd/compile/internal/types" + "cmd/internal/obj" + "cmd/internal/obj/x86" +) + +// ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags. +func ssaMarkMoves(s *ssagen.State, b *ssa.Block) { + flive := b.FlagsLiveAtEnd + for _, c := range b.ControlValues() { + flive = c.Type.IsFlags() || flive + } + for i := len(b.Values) - 1; i >= 0; i-- { + v := b.Values[i] + if flive && (v.Op == ssa.OpAMD64MOVLconst || v.Op == ssa.OpAMD64MOVQconst) { + // The "mark" is any non-nil Aux value. + v.Aux = v + } + if v.Type.IsFlags() { + flive = false + } + for _, a := range v.Args { + if a.Type.IsFlags() { + flive = true + } + } + } +} + +// loadByType returns the load instruction of the given type. +func loadByType(t *types.Type) obj.As { + // Avoid partial register write + if !t.IsFloat() { + switch t.Size() { + case 1: + return x86.AMOVBLZX + case 2: + return x86.AMOVWLZX + } + } + // Otherwise, there's no difference between load and store opcodes. + return storeByType(t) +} + +// storeByType returns the store instruction of the given type. +func storeByType(t *types.Type) obj.As { + width := t.Size() + if t.IsFloat() { + switch width { + case 4: + return x86.AMOVSS + case 8: + return x86.AMOVSD + } + } else { + switch width { + case 1: + return x86.AMOVB + case 2: + return x86.AMOVW + case 4: + return x86.AMOVL + case 8: + return x86.AMOVQ + case 16: + return x86.AMOVUPS + } + } + panic(fmt.Sprintf("bad store type %v", t)) +} + +// moveByType returns the reg->reg move instruction of the given type. +func moveByType(t *types.Type) obj.As { + if t.IsFloat() { + // Moving the whole sse2 register is faster + // than moving just the correct low portion of it. + // There is no xmm->xmm move with 1 byte opcode, + // so use movups, which has 2 byte opcode. + return x86.AMOVUPS + } else { + switch t.Size() { + case 1: + // Avoids partial register write + return x86.AMOVL + case 2: + return x86.AMOVL + case 4: + return x86.AMOVL + case 8: + return x86.AMOVQ + case 16: + return x86.AMOVUPS // int128s are in SSE registers + default: + panic(fmt.Sprintf("bad int register width %d:%v", t.Size(), t)) + } + } +} + +// opregreg emits instructions for +// +// dest := dest(To) op src(From) +// +// and also returns the created obj.Prog so it +// may be further adjusted (offset, scale, etc). +func opregreg(s *ssagen.State, op obj.As, dest, src int16) *obj.Prog { + p := s.Prog(op) + p.From.Type = obj.TYPE_REG + p.To.Type = obj.TYPE_REG + p.To.Reg = dest + p.From.Reg = src + return p +} + +// memIdx fills out a as an indexed memory reference for v. +// It assumes that the base register and the index register +// are v.Args[0].Reg() and v.Args[1].Reg(), respectively. +// The caller must still use gc.AddAux/gc.AddAux2 to handle v.Aux as necessary. +func memIdx(a *obj.Addr, v *ssa.Value) { + r, i := v.Args[0].Reg(), v.Args[1].Reg() + a.Type = obj.TYPE_MEM + a.Scale = v.Op.Scale() + if a.Scale == 1 && i == x86.REG_SP { + r, i = i, r + } + a.Reg = r + a.Index = i +} + +// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ, +// See runtime/mkduff.go. +func duffStart(size int64) int64 { + x, _ := duff(size) + return x +} +func duffAdj(size int64) int64 { + _, x := duff(size) + return x +} + +// duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes) +// required to use the duffzero mechanism for a block of the given size. +func duff(size int64) (int64, int64) { + if size < 32 || size > 1024 || size%dzClearStep != 0 { + panic("bad duffzero size") + } + steps := size / dzClearStep + blocks := steps / dzBlockLen + steps %= dzBlockLen + off := dzBlockSize * (dzBlocks - blocks) + var adj int64 + if steps != 0 { + off -= dzLeaqSize + off -= dzMovSize * steps + adj -= dzClearStep * (dzBlockLen - steps) + } + return off, adj +} + +func getgFromTLS(s *ssagen.State, r int16) { + // See the comments in cmd/internal/obj/x86/obj6.go + // near CanUse1InsnTLS for a detailed explanation of these instructions. + if x86.CanUse1InsnTLS(base.Ctxt) { + // MOVQ (TLS), r + p := s.Prog(x86.AMOVQ) + p.From.Type = obj.TYPE_MEM + p.From.Reg = x86.REG_TLS + p.To.Type = obj.TYPE_REG + p.To.Reg = r + } else { + // MOVQ TLS, r + // MOVQ (r)(TLS*1), r + p := s.Prog(x86.AMOVQ) + p.From.Type = obj.TYPE_REG + p.From.Reg = x86.REG_TLS + p.To.Type = obj.TYPE_REG + p.To.Reg = r + q := s.Prog(x86.AMOVQ) + q.From.Type = obj.TYPE_MEM + q.From.Reg = r + q.From.Index = x86.REG_TLS + q.From.Scale = 1 + q.To.Type = obj.TYPE_REG + q.To.Reg = r + } +} + +func ssaGenValue(s *ssagen.State, v *ssa.Value) { + switch v.Op { + case ssa.OpAMD64VFMADD231SD: + p := s.Prog(v.Op.Asm()) + p.From = obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[2].Reg()} + p.To = obj.Addr{Type: obj.TYPE_REG, Reg: v.Reg()} + p.SetFrom3Reg(v.Args[1].Reg()) + case ssa.OpAMD64ADDQ, ssa.OpAMD64ADDL: + r := v.Reg() + r1 := v.Args[0].Reg() + r2 := v.Args[1].Reg() + switch { + case r == r1: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r2 + p.To.Type = obj.TYPE_REG + p.To.Reg = r + case r == r2: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r1 + p.To.Type = obj.TYPE_REG + p.To.Reg = r + default: + var asm obj.As + if v.Op == ssa.OpAMD64ADDQ { + asm = x86.ALEAQ + } else { + asm = x86.ALEAL + } + p := s.Prog(asm) + p.From.Type = obj.TYPE_MEM + p.From.Reg = r1 + p.From.Scale = 1 + p.From.Index = r2 + p.To.Type = obj.TYPE_REG + p.To.Reg = r + } + // 2-address opcode arithmetic + case ssa.OpAMD64SUBQ, ssa.OpAMD64SUBL, + ssa.OpAMD64MULQ, ssa.OpAMD64MULL, + ssa.OpAMD64ANDQ, ssa.OpAMD64ANDL, + ssa.OpAMD64ORQ, ssa.OpAMD64ORL, + ssa.OpAMD64XORQ, ssa.OpAMD64XORL, + ssa.OpAMD64SHLQ, ssa.OpAMD64SHLL, + ssa.OpAMD64SHRQ, ssa.OpAMD64SHRL, ssa.OpAMD64SHRW, ssa.OpAMD64SHRB, + ssa.OpAMD64SARQ, ssa.OpAMD64SARL, ssa.OpAMD64SARW, ssa.OpAMD64SARB, + ssa.OpAMD64ROLQ, ssa.OpAMD64ROLL, ssa.OpAMD64ROLW, ssa.OpAMD64ROLB, + ssa.OpAMD64RORQ, ssa.OpAMD64RORL, ssa.OpAMD64RORW, ssa.OpAMD64RORB, + ssa.OpAMD64ADDSS, ssa.OpAMD64ADDSD, ssa.OpAMD64SUBSS, ssa.OpAMD64SUBSD, + ssa.OpAMD64MULSS, ssa.OpAMD64MULSD, ssa.OpAMD64DIVSS, ssa.OpAMD64DIVSD, + ssa.OpAMD64PXOR, + ssa.OpAMD64BTSL, ssa.OpAMD64BTSQ, + ssa.OpAMD64BTCL, ssa.OpAMD64BTCQ, + ssa.OpAMD64BTRL, ssa.OpAMD64BTRQ: + opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg()) + + case ssa.OpAMD64SHRDQ, ssa.OpAMD64SHLDQ: + p := s.Prog(v.Op.Asm()) + lo, hi, bits := v.Args[0].Reg(), v.Args[1].Reg(), v.Args[2].Reg() + p.From.Type = obj.TYPE_REG + p.From.Reg = bits + p.To.Type = obj.TYPE_REG + p.To.Reg = lo + p.SetFrom3Reg(hi) + + case ssa.OpAMD64BLSIQ, ssa.OpAMD64BLSIL, + ssa.OpAMD64BLSMSKQ, ssa.OpAMD64BLSMSKL, + ssa.OpAMD64BLSRQ, ssa.OpAMD64BLSRL: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpAMD64ANDNQ, ssa.OpAMD64ANDNL: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + p.SetFrom3Reg(v.Args[1].Reg()) + + case ssa.OpAMD64SARXL, ssa.OpAMD64SARXQ, + ssa.OpAMD64SHLXL, ssa.OpAMD64SHLXQ, + ssa.OpAMD64SHRXL, ssa.OpAMD64SHRXQ: + p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg()) + p.SetFrom3Reg(v.Args[0].Reg()) + + case ssa.OpAMD64SHLXLload, ssa.OpAMD64SHLXQload, + ssa.OpAMD64SHRXLload, ssa.OpAMD64SHRXQload, + ssa.OpAMD64SARXLload, ssa.OpAMD64SARXQload: + p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg()) + m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[0].Reg()} + ssagen.AddAux(&m, v) + p.SetFrom3(m) + + case ssa.OpAMD64SHLXLloadidx1, ssa.OpAMD64SHLXLloadidx4, ssa.OpAMD64SHLXLloadidx8, + ssa.OpAMD64SHRXLloadidx1, ssa.OpAMD64SHRXLloadidx4, ssa.OpAMD64SHRXLloadidx8, + ssa.OpAMD64SARXLloadidx1, ssa.OpAMD64SARXLloadidx4, ssa.OpAMD64SARXLloadidx8, + ssa.OpAMD64SHLXQloadidx1, ssa.OpAMD64SHLXQloadidx8, + ssa.OpAMD64SHRXQloadidx1, ssa.OpAMD64SHRXQloadidx8, + ssa.OpAMD64SARXQloadidx1, ssa.OpAMD64SARXQloadidx8: + p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[2].Reg()) + m := obj.Addr{Type: obj.TYPE_MEM} + memIdx(&m, v) + ssagen.AddAux(&m, v) + p.SetFrom3(m) + + case ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU: + // Arg[0] (the dividend) is in AX. + // Arg[1] (the divisor) can be in any other register. + // Result[0] (the quotient) is in AX. + // Result[1] (the remainder) is in DX. + r := v.Args[1].Reg() + + // Zero extend dividend. + opregreg(s, x86.AXORL, x86.REG_DX, x86.REG_DX) + + // Issue divide. + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r + + case ssa.OpAMD64DIVQ, ssa.OpAMD64DIVL, ssa.OpAMD64DIVW: + // Arg[0] (the dividend) is in AX. + // Arg[1] (the divisor) can be in any other register. + // Result[0] (the quotient) is in AX. + // Result[1] (the remainder) is in DX. + r := v.Args[1].Reg() + var j1 *obj.Prog + + // CPU faults upon signed overflow, which occurs when the most + // negative int is divided by -1. Handle divide by -1 as a special case. + if ssa.DivisionNeedsFixUp(v) { + var c *obj.Prog + switch v.Op { + case ssa.OpAMD64DIVQ: + c = s.Prog(x86.ACMPQ) + case ssa.OpAMD64DIVL: + c = s.Prog(x86.ACMPL) + case ssa.OpAMD64DIVW: + c = s.Prog(x86.ACMPW) + } + c.From.Type = obj.TYPE_REG + c.From.Reg = r + c.To.Type = obj.TYPE_CONST + c.To.Offset = -1 + j1 = s.Prog(x86.AJEQ) + j1.To.Type = obj.TYPE_BRANCH + } + + // Sign extend dividend. + switch v.Op { + case ssa.OpAMD64DIVQ: + s.Prog(x86.ACQO) + case ssa.OpAMD64DIVL: + s.Prog(x86.ACDQ) + case ssa.OpAMD64DIVW: + s.Prog(x86.ACWD) + } + + // Issue divide. + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r + + if j1 != nil { + // Skip over -1 fixup code. + j2 := s.Prog(obj.AJMP) + j2.To.Type = obj.TYPE_BRANCH + + // Issue -1 fixup code. + // n / -1 = -n + var n1 *obj.Prog + switch v.Op { + case ssa.OpAMD64DIVQ: + n1 = s.Prog(x86.ANEGQ) + case ssa.OpAMD64DIVL: + n1 = s.Prog(x86.ANEGL) + case ssa.OpAMD64DIVW: + n1 = s.Prog(x86.ANEGW) + } + n1.To.Type = obj.TYPE_REG + n1.To.Reg = x86.REG_AX + + // n % -1 == 0 + opregreg(s, x86.AXORL, x86.REG_DX, x86.REG_DX) + + // TODO(khr): issue only the -1 fixup code we need. + // For instance, if only the quotient is used, no point in zeroing the remainder. + + j1.To.SetTarget(n1) + j2.To.SetTarget(s.Pc()) + } + + case ssa.OpAMD64HMULQ, ssa.OpAMD64HMULL, ssa.OpAMD64HMULQU, ssa.OpAMD64HMULLU: + // the frontend rewrites constant division by 8/16/32 bit integers into + // HMUL by a constant + // SSA rewrites generate the 64 bit versions + + // Arg[0] is already in AX as it's the only register we allow + // and DX is the only output we care about (the high bits) + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[1].Reg() + + // IMULB puts the high portion in AH instead of DL, + // so move it to DL for consistency + if v.Type.Size() == 1 { + m := s.Prog(x86.AMOVB) + m.From.Type = obj.TYPE_REG + m.From.Reg = x86.REG_AH + m.To.Type = obj.TYPE_REG + m.To.Reg = x86.REG_DX + } + + case ssa.OpAMD64MULQU, ssa.OpAMD64MULLU: + // Arg[0] is already in AX as it's the only register we allow + // results lo in AX + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[1].Reg() + + case ssa.OpAMD64MULQU2: + // Arg[0] is already in AX as it's the only register we allow + // results hi in DX, lo in AX + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[1].Reg() + + case ssa.OpAMD64DIVQU2: + // Arg[0], Arg[1] are already in Dx, AX, as they're the only registers we allow + // results q in AX, r in DX + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[2].Reg() + + case ssa.OpAMD64AVGQU: + // compute (x+y)/2 unsigned. + // Do a 64-bit add, the overflow goes into the carry. + // Shift right once and pull the carry back into the 63rd bit. + p := s.Prog(x86.AADDQ) + p.From.Type = obj.TYPE_REG + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + p.From.Reg = v.Args[1].Reg() + p = s.Prog(x86.ARCRQ) + p.From.Type = obj.TYPE_CONST + p.From.Offset = 1 + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpAMD64ADDQcarry, ssa.OpAMD64ADCQ: + r := v.Reg0() + r0 := v.Args[0].Reg() + r1 := v.Args[1].Reg() + switch r { + case r0: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r1 + p.To.Type = obj.TYPE_REG + p.To.Reg = r + case r1: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r0 + p.To.Type = obj.TYPE_REG + p.To.Reg = r + default: + v.Fatalf("output not in same register as an input %s", v.LongString()) + } + + case ssa.OpAMD64SUBQborrow, ssa.OpAMD64SBBQ: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[1].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg0() + + case ssa.OpAMD64ADDQconstcarry, ssa.OpAMD64ADCQconst, ssa.OpAMD64SUBQconstborrow, ssa.OpAMD64SBBQconst: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST + p.From.Offset = v.AuxInt + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg0() + + case ssa.OpAMD64ADDQconst, ssa.OpAMD64ADDLconst: + r := v.Reg() + a := v.Args[0].Reg() + if r == a { + switch v.AuxInt { + case 1: + var asm obj.As + // Software optimization manual recommends add $1,reg. + // But inc/dec is 1 byte smaller. ICC always uses inc + // Clang/GCC choose depending on flags, but prefer add. + // Experiments show that inc/dec is both a little faster + // and make a binary a little smaller. + if v.Op == ssa.OpAMD64ADDQconst { + asm = x86.AINCQ + } else { + asm = x86.AINCL + } + p := s.Prog(asm) + p.To.Type = obj.TYPE_REG + p.To.Reg = r + return + case -1: + var asm obj.As + if v.Op == ssa.OpAMD64ADDQconst { + asm = x86.ADECQ + } else { + asm = x86.ADECL + } + p := s.Prog(asm) + p.To.Type = obj.TYPE_REG + p.To.Reg = r + return + case 0x80: + // 'SUBQ $-0x80, r' is shorter to encode than + // and functionally equivalent to 'ADDQ $0x80, r'. + asm := x86.ASUBL + if v.Op == ssa.OpAMD64ADDQconst { + asm = x86.ASUBQ + } + p := s.Prog(asm) + p.From.Type = obj.TYPE_CONST + p.From.Offset = -0x80 + p.To.Type = obj.TYPE_REG + p.To.Reg = r + return + + } + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST + p.From.Offset = v.AuxInt + p.To.Type = obj.TYPE_REG + p.To.Reg = r + return + } + var asm obj.As + if v.Op == ssa.OpAMD64ADDQconst { + asm = x86.ALEAQ + } else { + asm = x86.ALEAL + } + p := s.Prog(asm) + p.From.Type = obj.TYPE_MEM + p.From.Reg = a + p.From.Offset = v.AuxInt + p.To.Type = obj.TYPE_REG + p.To.Reg = r + + case ssa.OpAMD64CMOVQEQ, ssa.OpAMD64CMOVLEQ, ssa.OpAMD64CMOVWEQ, + ssa.OpAMD64CMOVQLT, ssa.OpAMD64CMOVLLT, ssa.OpAMD64CMOVWLT, + ssa.OpAMD64CMOVQNE, ssa.OpAMD64CMOVLNE, ssa.OpAMD64CMOVWNE, + ssa.OpAMD64CMOVQGT, ssa.OpAMD64CMOVLGT, ssa.OpAMD64CMOVWGT, + ssa.OpAMD64CMOVQLE, ssa.OpAMD64CMOVLLE, ssa.OpAMD64CMOVWLE, + ssa.OpAMD64CMOVQGE, ssa.OpAMD64CMOVLGE, ssa.OpAMD64CMOVWGE, + ssa.OpAMD64CMOVQHI, ssa.OpAMD64CMOVLHI, ssa.OpAMD64CMOVWHI, + ssa.OpAMD64CMOVQLS, ssa.OpAMD64CMOVLLS, ssa.OpAMD64CMOVWLS, + ssa.OpAMD64CMOVQCC, ssa.OpAMD64CMOVLCC, ssa.OpAMD64CMOVWCC, + ssa.OpAMD64CMOVQCS, ssa.OpAMD64CMOVLCS, ssa.OpAMD64CMOVWCS, + ssa.OpAMD64CMOVQGTF, ssa.OpAMD64CMOVLGTF, ssa.OpAMD64CMOVWGTF, + ssa.OpAMD64CMOVQGEF, ssa.OpAMD64CMOVLGEF, ssa.OpAMD64CMOVWGEF: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[1].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpAMD64CMOVQNEF, ssa.OpAMD64CMOVLNEF, ssa.OpAMD64CMOVWNEF: + // Flag condition: ^ZERO || PARITY + // Generate: + // CMOV*NE SRC,DST + // CMOV*PS SRC,DST + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[1].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + var q *obj.Prog + if v.Op == ssa.OpAMD64CMOVQNEF { + q = s.Prog(x86.ACMOVQPS) + } else if v.Op == ssa.OpAMD64CMOVLNEF { + q = s.Prog(x86.ACMOVLPS) + } else { + q = s.Prog(x86.ACMOVWPS) + } + q.From.Type = obj.TYPE_REG + q.From.Reg = v.Args[1].Reg() + q.To.Type = obj.TYPE_REG + q.To.Reg = v.Reg() + + case ssa.OpAMD64CMOVQEQF, ssa.OpAMD64CMOVLEQF, ssa.OpAMD64CMOVWEQF: + // Flag condition: ZERO && !PARITY + // Generate: + // MOV SRC,TMP + // CMOV*NE DST,TMP + // CMOV*PC TMP,DST + // + // TODO(rasky): we could generate: + // CMOV*NE DST,SRC + // CMOV*PC SRC,DST + // But this requires a way for regalloc to know that SRC might be + // clobbered by this instruction. + t := v.RegTmp() + opregreg(s, moveByType(v.Type), t, v.Args[1].Reg()) + + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = t + var q *obj.Prog + if v.Op == ssa.OpAMD64CMOVQEQF { + q = s.Prog(x86.ACMOVQPC) + } else if v.Op == ssa.OpAMD64CMOVLEQF { + q = s.Prog(x86.ACMOVLPC) + } else { + q = s.Prog(x86.ACMOVWPC) + } + q.From.Type = obj.TYPE_REG + q.From.Reg = t + q.To.Type = obj.TYPE_REG + q.To.Reg = v.Reg() + + case ssa.OpAMD64MULQconst, ssa.OpAMD64MULLconst: + r := v.Reg() + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST + p.From.Offset = v.AuxInt + p.To.Type = obj.TYPE_REG + p.To.Reg = r + p.SetFrom3Reg(v.Args[0].Reg()) + + case ssa.OpAMD64ANDQconst: + asm := v.Op.Asm() + // If the constant is positive and fits into 32 bits, use ANDL. + // This saves a few bytes of encoding. + if 0 <= v.AuxInt && v.AuxInt <= (1<<32-1) { + asm = x86.AANDL + } + p := s.Prog(asm) + p.From.Type = obj.TYPE_CONST + p.From.Offset = v.AuxInt + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpAMD64SUBQconst, ssa.OpAMD64SUBLconst, + ssa.OpAMD64ANDLconst, + ssa.OpAMD64ORQconst, ssa.OpAMD64ORLconst, + ssa.OpAMD64XORQconst, ssa.OpAMD64XORLconst, + ssa.OpAMD64SHLQconst, ssa.OpAMD64SHLLconst, + ssa.OpAMD64SHRQconst, ssa.OpAMD64SHRLconst, ssa.OpAMD64SHRWconst, ssa.OpAMD64SHRBconst, + ssa.OpAMD64SARQconst, ssa.OpAMD64SARLconst, ssa.OpAMD64SARWconst, ssa.OpAMD64SARBconst, + ssa.OpAMD64ROLQconst, ssa.OpAMD64ROLLconst, ssa.OpAMD64ROLWconst, ssa.OpAMD64ROLBconst: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST + p.From.Offset = v.AuxInt + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + case ssa.OpAMD64SBBQcarrymask, ssa.OpAMD64SBBLcarrymask: + r := v.Reg() + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r + p.To.Type = obj.TYPE_REG + p.To.Reg = r + case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8, + ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8, + ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8: + p := s.Prog(v.Op.Asm()) + memIdx(&p.From, v) + o := v.Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = o + if v.AuxInt != 0 && v.Aux == nil { + // Emit an additional LEA to add the displacement instead of creating a slow 3 operand LEA. + switch v.Op { + case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8: + p = s.Prog(x86.ALEAQ) + case ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8: + p = s.Prog(x86.ALEAL) + case ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8: + p = s.Prog(x86.ALEAW) + } + p.From.Type = obj.TYPE_MEM + p.From.Reg = o + p.To.Type = obj.TYPE_REG + p.To.Reg = o + } + ssagen.AddAux(&p.From, v) + case ssa.OpAMD64LEAQ, ssa.OpAMD64LEAL, ssa.OpAMD64LEAW: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_MEM + p.From.Reg = v.Args[0].Reg() + ssagen.AddAux(&p.From, v) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + case ssa.OpAMD64CMPQ, ssa.OpAMD64CMPL, ssa.OpAMD64CMPW, ssa.OpAMD64CMPB, + ssa.OpAMD64TESTQ, ssa.OpAMD64TESTL, ssa.OpAMD64TESTW, ssa.OpAMD64TESTB, + ssa.OpAMD64BTL, ssa.OpAMD64BTQ: + opregreg(s, v.Op.Asm(), v.Args[1].Reg(), v.Args[0].Reg()) + case ssa.OpAMD64UCOMISS, ssa.OpAMD64UCOMISD: + // Go assembler has swapped operands for UCOMISx relative to CMP, + // must account for that right here. + opregreg(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg()) + case ssa.OpAMD64CMPQconst, ssa.OpAMD64CMPLconst, ssa.OpAMD64CMPWconst, ssa.OpAMD64CMPBconst: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_CONST + p.To.Offset = v.AuxInt + case ssa.OpAMD64BTLconst, ssa.OpAMD64BTQconst, + ssa.OpAMD64TESTQconst, ssa.OpAMD64TESTLconst, ssa.OpAMD64TESTWconst, ssa.OpAMD64TESTBconst, + ssa.OpAMD64BTSLconst, ssa.OpAMD64BTSQconst, + ssa.OpAMD64BTCLconst, ssa.OpAMD64BTCQconst, + ssa.OpAMD64BTRLconst, ssa.OpAMD64BTRQconst: + op := v.Op + if op == ssa.OpAMD64BTQconst && v.AuxInt < 32 { + // Emit 32-bit version because it's shorter + op = ssa.OpAMD64BTLconst + } + p := s.Prog(op.Asm()) + p.From.Type = obj.TYPE_CONST + p.From.Offset = v.AuxInt + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Args[0].Reg() + case ssa.OpAMD64CMPQload, ssa.OpAMD64CMPLload, ssa.OpAMD64CMPWload, ssa.OpAMD64CMPBload: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_MEM + p.From.Reg = v.Args[0].Reg() + ssagen.AddAux(&p.From, v) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Args[1].Reg() + case ssa.OpAMD64CMPQconstload, ssa.OpAMD64CMPLconstload, ssa.OpAMD64CMPWconstload, ssa.OpAMD64CMPBconstload: + sc := v.AuxValAndOff() + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_MEM + p.From.Reg = v.Args[0].Reg() + ssagen.AddAux2(&p.From, v, sc.Off64()) + p.To.Type = obj.TYPE_CONST + p.To.Offset = sc.Val64() + case ssa.OpAMD64CMPQloadidx8, ssa.OpAMD64CMPQloadidx1, ssa.OpAMD64CMPLloadidx4, ssa.OpAMD64CMPLloadidx1, ssa.OpAMD64CMPWloadidx2, ssa.OpAMD64CMPWloadidx1, ssa.OpAMD64CMPBloadidx1: + p := s.Prog(v.Op.Asm()) + memIdx(&p.From, v) + ssagen.AddAux(&p.From, v) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Args[2].Reg() + case ssa.OpAMD64CMPQconstloadidx8, ssa.OpAMD64CMPQconstloadidx1, ssa.OpAMD64CMPLconstloadidx4, ssa.OpAMD64CMPLconstloadidx1, ssa.OpAMD64CMPWconstloadidx2, ssa.OpAMD64CMPWconstloadidx1, ssa.OpAMD64CMPBconstloadidx1: + sc := v.AuxValAndOff() + p := s.Prog(v.Op.Asm()) + memIdx(&p.From, v) + ssagen.AddAux2(&p.From, v, sc.Off64()) + p.To.Type = obj.TYPE_CONST + p.To.Offset = sc.Val64() + case ssa.OpAMD64MOVLconst, ssa.OpAMD64MOVQconst: + x := v.Reg() + + // If flags aren't live (indicated by v.Aux == nil), + // then we can rewrite MOV $0, AX into XOR AX, AX. + if v.AuxInt == 0 && v.Aux == nil { + opregreg(s, x86.AXORL, x, x) + break + } + + asm := v.Op.Asm() + // Use MOVL to move a small constant into a register + // when the constant is positive and fits into 32 bits. + if 0 <= v.AuxInt && v.AuxInt <= (1<<32-1) { + // The upper 32bit are zeroed automatically when using MOVL. + asm = x86.AMOVL + } + p := s.Prog(asm) + p.From.Type = obj.TYPE_CONST + p.From.Offset = v.AuxInt + p.To.Type = obj.TYPE_REG + p.To.Reg = x + case ssa.OpAMD64MOVSSconst, ssa.OpAMD64MOVSDconst: + x := v.Reg() + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_FCONST + p.From.Val = math.Float64frombits(uint64(v.AuxInt)) + p.To.Type = obj.TYPE_REG + p.To.Reg = x + case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVOload, + ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload, + ssa.OpAMD64MOVBEQload, ssa.OpAMD64MOVBELload: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_MEM + p.From.Reg = v.Args[0].Reg() + ssagen.AddAux(&p.From, v) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + case ssa.OpAMD64MOVBloadidx1, ssa.OpAMD64MOVWloadidx1, ssa.OpAMD64MOVLloadidx1, ssa.OpAMD64MOVQloadidx1, ssa.OpAMD64MOVSSloadidx1, ssa.OpAMD64MOVSDloadidx1, + ssa.OpAMD64MOVQloadidx8, ssa.OpAMD64MOVSDloadidx8, ssa.OpAMD64MOVLloadidx8, ssa.OpAMD64MOVLloadidx4, ssa.OpAMD64MOVSSloadidx4, ssa.OpAMD64MOVWloadidx2, + ssa.OpAMD64MOVBELloadidx1, ssa.OpAMD64MOVBELloadidx4, ssa.OpAMD64MOVBELloadidx8, ssa.OpAMD64MOVBEQloadidx1, ssa.OpAMD64MOVBEQloadidx8: + p := s.Prog(v.Op.Asm()) + memIdx(&p.From, v) + ssagen.AddAux(&p.From, v) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + case ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore, ssa.OpAMD64MOVOstore, + ssa.OpAMD64ADDQmodify, ssa.OpAMD64SUBQmodify, ssa.OpAMD64ANDQmodify, ssa.OpAMD64ORQmodify, ssa.OpAMD64XORQmodify, + ssa.OpAMD64ADDLmodify, ssa.OpAMD64SUBLmodify, ssa.OpAMD64ANDLmodify, ssa.OpAMD64ORLmodify, ssa.OpAMD64XORLmodify, + ssa.OpAMD64MOVBEQstore, ssa.OpAMD64MOVBELstore, ssa.OpAMD64MOVBEWstore: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[1].Reg() + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + ssagen.AddAux(&p.To, v) + case ssa.OpAMD64MOVBstoreidx1, ssa.OpAMD64MOVWstoreidx1, ssa.OpAMD64MOVLstoreidx1, ssa.OpAMD64MOVQstoreidx1, ssa.OpAMD64MOVSSstoreidx1, ssa.OpAMD64MOVSDstoreidx1, + ssa.OpAMD64MOVQstoreidx8, ssa.OpAMD64MOVSDstoreidx8, ssa.OpAMD64MOVLstoreidx8, ssa.OpAMD64MOVSSstoreidx4, ssa.OpAMD64MOVLstoreidx4, ssa.OpAMD64MOVWstoreidx2, + ssa.OpAMD64ADDLmodifyidx1, ssa.OpAMD64ADDLmodifyidx4, ssa.OpAMD64ADDLmodifyidx8, ssa.OpAMD64ADDQmodifyidx1, ssa.OpAMD64ADDQmodifyidx8, + ssa.OpAMD64SUBLmodifyidx1, ssa.OpAMD64SUBLmodifyidx4, ssa.OpAMD64SUBLmodifyidx8, ssa.OpAMD64SUBQmodifyidx1, ssa.OpAMD64SUBQmodifyidx8, + ssa.OpAMD64ANDLmodifyidx1, ssa.OpAMD64ANDLmodifyidx4, ssa.OpAMD64ANDLmodifyidx8, ssa.OpAMD64ANDQmodifyidx1, ssa.OpAMD64ANDQmodifyidx8, + ssa.OpAMD64ORLmodifyidx1, ssa.OpAMD64ORLmodifyidx4, ssa.OpAMD64ORLmodifyidx8, ssa.OpAMD64ORQmodifyidx1, ssa.OpAMD64ORQmodifyidx8, + ssa.OpAMD64XORLmodifyidx1, ssa.OpAMD64XORLmodifyidx4, ssa.OpAMD64XORLmodifyidx8, ssa.OpAMD64XORQmodifyidx1, ssa.OpAMD64XORQmodifyidx8, + ssa.OpAMD64MOVBEWstoreidx1, ssa.OpAMD64MOVBEWstoreidx2, ssa.OpAMD64MOVBELstoreidx1, ssa.OpAMD64MOVBELstoreidx4, ssa.OpAMD64MOVBELstoreidx8, ssa.OpAMD64MOVBEQstoreidx1, ssa.OpAMD64MOVBEQstoreidx8: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[2].Reg() + memIdx(&p.To, v) + ssagen.AddAux(&p.To, v) + case ssa.OpAMD64ADDQconstmodify, ssa.OpAMD64ADDLconstmodify: + sc := v.AuxValAndOff() + off := sc.Off64() + val := sc.Val() + if val == 1 || val == -1 { + var asm obj.As + if v.Op == ssa.OpAMD64ADDQconstmodify { + if val == 1 { + asm = x86.AINCQ + } else { + asm = x86.ADECQ + } + } else { + if val == 1 { + asm = x86.AINCL + } else { + asm = x86.ADECL + } + } + p := s.Prog(asm) + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + ssagen.AddAux2(&p.To, v, off) + break + } + fallthrough + case ssa.OpAMD64ANDQconstmodify, ssa.OpAMD64ANDLconstmodify, ssa.OpAMD64ORQconstmodify, ssa.OpAMD64ORLconstmodify, + ssa.OpAMD64XORQconstmodify, ssa.OpAMD64XORLconstmodify: + sc := v.AuxValAndOff() + off := sc.Off64() + val := sc.Val64() + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST + p.From.Offset = val + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + ssagen.AddAux2(&p.To, v, off) + + case ssa.OpAMD64MOVQstoreconst, ssa.OpAMD64MOVLstoreconst, ssa.OpAMD64MOVWstoreconst, ssa.OpAMD64MOVBstoreconst: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST + sc := v.AuxValAndOff() + p.From.Offset = sc.Val64() + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + ssagen.AddAux2(&p.To, v, sc.Off64()) + case ssa.OpAMD64MOVOstoreconst: + sc := v.AuxValAndOff() + if sc.Val() != 0 { + v.Fatalf("MOVO for non zero constants not implemented: %s", v.LongString()) + } + + if s.ABI != obj.ABIInternal { + // zero X15 manually + opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15) + } + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = x86.REG_X15 + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + ssagen.AddAux2(&p.To, v, sc.Off64()) + + case ssa.OpAMD64MOVQstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx8, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx4, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx2, ssa.OpAMD64MOVBstoreconstidx1, + ssa.OpAMD64ADDLconstmodifyidx1, ssa.OpAMD64ADDLconstmodifyidx4, ssa.OpAMD64ADDLconstmodifyidx8, ssa.OpAMD64ADDQconstmodifyidx1, ssa.OpAMD64ADDQconstmodifyidx8, + ssa.OpAMD64ANDLconstmodifyidx1, ssa.OpAMD64ANDLconstmodifyidx4, ssa.OpAMD64ANDLconstmodifyidx8, ssa.OpAMD64ANDQconstmodifyidx1, ssa.OpAMD64ANDQconstmodifyidx8, + ssa.OpAMD64ORLconstmodifyidx1, ssa.OpAMD64ORLconstmodifyidx4, ssa.OpAMD64ORLconstmodifyidx8, ssa.OpAMD64ORQconstmodifyidx1, ssa.OpAMD64ORQconstmodifyidx8, + ssa.OpAMD64XORLconstmodifyidx1, ssa.OpAMD64XORLconstmodifyidx4, ssa.OpAMD64XORLconstmodifyidx8, ssa.OpAMD64XORQconstmodifyidx1, ssa.OpAMD64XORQconstmodifyidx8: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST + sc := v.AuxValAndOff() + p.From.Offset = sc.Val64() + switch { + case p.As == x86.AADDQ && p.From.Offset == 1: + p.As = x86.AINCQ + p.From.Type = obj.TYPE_NONE + case p.As == x86.AADDQ && p.From.Offset == -1: + p.As = x86.ADECQ + p.From.Type = obj.TYPE_NONE + case p.As == x86.AADDL && p.From.Offset == 1: + p.As = x86.AINCL + p.From.Type = obj.TYPE_NONE + case p.As == x86.AADDL && p.From.Offset == -1: + p.As = x86.ADECL + p.From.Type = obj.TYPE_NONE + } + memIdx(&p.To, v) + ssagen.AddAux2(&p.To, v, sc.Off64()) + case ssa.OpAMD64MOVLQSX, ssa.OpAMD64MOVWQSX, ssa.OpAMD64MOVBQSX, ssa.OpAMD64MOVLQZX, ssa.OpAMD64MOVWQZX, ssa.OpAMD64MOVBQZX, + ssa.OpAMD64CVTTSS2SL, ssa.OpAMD64CVTTSD2SL, ssa.OpAMD64CVTTSS2SQ, ssa.OpAMD64CVTTSD2SQ, + ssa.OpAMD64CVTSS2SD, ssa.OpAMD64CVTSD2SS: + opregreg(s, v.Op.Asm(), v.Reg(), v.Args[0].Reg()) + case ssa.OpAMD64CVTSL2SD, ssa.OpAMD64CVTSQ2SD, ssa.OpAMD64CVTSQ2SS, ssa.OpAMD64CVTSL2SS: + r := v.Reg() + // Break false dependency on destination register. + opregreg(s, x86.AXORPS, r, r) + opregreg(s, v.Op.Asm(), r, v.Args[0].Reg()) + case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i, ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i: + var p *obj.Prog + switch v.Op { + case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i: + p = s.Prog(x86.AMOVQ) + case ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i: + p = s.Prog(x86.AMOVL) + } + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + case ssa.OpAMD64ADDQload, ssa.OpAMD64ADDLload, ssa.OpAMD64SUBQload, ssa.OpAMD64SUBLload, + ssa.OpAMD64ANDQload, ssa.OpAMD64ANDLload, ssa.OpAMD64ORQload, ssa.OpAMD64ORLload, + ssa.OpAMD64XORQload, ssa.OpAMD64XORLload, ssa.OpAMD64ADDSDload, ssa.OpAMD64ADDSSload, + ssa.OpAMD64SUBSDload, ssa.OpAMD64SUBSSload, ssa.OpAMD64MULSDload, ssa.OpAMD64MULSSload, + ssa.OpAMD64DIVSDload, ssa.OpAMD64DIVSSload: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_MEM + p.From.Reg = v.Args[1].Reg() + ssagen.AddAux(&p.From, v) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + case ssa.OpAMD64ADDLloadidx1, ssa.OpAMD64ADDLloadidx4, ssa.OpAMD64ADDLloadidx8, ssa.OpAMD64ADDQloadidx1, ssa.OpAMD64ADDQloadidx8, + ssa.OpAMD64SUBLloadidx1, ssa.OpAMD64SUBLloadidx4, ssa.OpAMD64SUBLloadidx8, ssa.OpAMD64SUBQloadidx1, ssa.OpAMD64SUBQloadidx8, + ssa.OpAMD64ANDLloadidx1, ssa.OpAMD64ANDLloadidx4, ssa.OpAMD64ANDLloadidx8, ssa.OpAMD64ANDQloadidx1, ssa.OpAMD64ANDQloadidx8, + ssa.OpAMD64ORLloadidx1, ssa.OpAMD64ORLloadidx4, ssa.OpAMD64ORLloadidx8, ssa.OpAMD64ORQloadidx1, ssa.OpAMD64ORQloadidx8, + ssa.OpAMD64XORLloadidx1, ssa.OpAMD64XORLloadidx4, ssa.OpAMD64XORLloadidx8, ssa.OpAMD64XORQloadidx1, ssa.OpAMD64XORQloadidx8, + ssa.OpAMD64ADDSSloadidx1, ssa.OpAMD64ADDSSloadidx4, ssa.OpAMD64ADDSDloadidx1, ssa.OpAMD64ADDSDloadidx8, + ssa.OpAMD64SUBSSloadidx1, ssa.OpAMD64SUBSSloadidx4, ssa.OpAMD64SUBSDloadidx1, ssa.OpAMD64SUBSDloadidx8, + ssa.OpAMD64MULSSloadidx1, ssa.OpAMD64MULSSloadidx4, ssa.OpAMD64MULSDloadidx1, ssa.OpAMD64MULSDloadidx8, + ssa.OpAMD64DIVSSloadidx1, ssa.OpAMD64DIVSSloadidx4, ssa.OpAMD64DIVSDloadidx1, ssa.OpAMD64DIVSDloadidx8: + p := s.Prog(v.Op.Asm()) + + r, i := v.Args[1].Reg(), v.Args[2].Reg() + p.From.Type = obj.TYPE_MEM + p.From.Scale = v.Op.Scale() + if p.From.Scale == 1 && i == x86.REG_SP { + r, i = i, r + } + p.From.Reg = r + p.From.Index = i + + ssagen.AddAux(&p.From, v) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + case ssa.OpAMD64DUFFZERO: + if s.ABI != obj.ABIInternal { + // zero X15 manually + opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15) + } + off := duffStart(v.AuxInt) + adj := duffAdj(v.AuxInt) + var p *obj.Prog + if adj != 0 { + p = s.Prog(x86.ALEAQ) + p.From.Type = obj.TYPE_MEM + p.From.Offset = adj + p.From.Reg = x86.REG_DI + p.To.Type = obj.TYPE_REG + p.To.Reg = x86.REG_DI + } + p = s.Prog(obj.ADUFFZERO) + p.To.Type = obj.TYPE_ADDR + p.To.Sym = ir.Syms.Duffzero + p.To.Offset = off + case ssa.OpAMD64DUFFCOPY: + p := s.Prog(obj.ADUFFCOPY) + p.To.Type = obj.TYPE_ADDR + p.To.Sym = ir.Syms.Duffcopy + if v.AuxInt%16 != 0 { + v.Fatalf("bad DUFFCOPY AuxInt %v", v.AuxInt) + } + p.To.Offset = 14 * (64 - v.AuxInt/16) + // 14 and 64 are magic constants. 14 is the number of bytes to encode: + // MOVUPS (SI), X0 + // ADDQ $16, SI + // MOVUPS X0, (DI) + // ADDQ $16, DI + // and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy. + + case ssa.OpCopy: // TODO: use MOVQreg for reg->reg copies instead of OpCopy? + if v.Type.IsMemory() { + return + } + x := v.Args[0].Reg() + y := v.Reg() + if x != y { + opregreg(s, moveByType(v.Type), y, x) + } + case ssa.OpLoadReg: + if v.Type.IsFlags() { + v.Fatalf("load flags not implemented: %v", v.LongString()) + return + } + p := s.Prog(loadByType(v.Type)) + ssagen.AddrAuto(&p.From, v.Args[0]) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpStoreReg: + if v.Type.IsFlags() { + v.Fatalf("store flags not implemented: %v", v.LongString()) + return + } + p := s.Prog(storeByType(v.Type)) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + ssagen.AddrAuto(&p.To, v) + case ssa.OpAMD64LoweredHasCPUFeature: + p := s.Prog(x86.AMOVBQZX) + p.From.Type = obj.TYPE_MEM + ssagen.AddAux(&p.From, v) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + case ssa.OpArgIntReg, ssa.OpArgFloatReg: + // The assembler needs to wrap the entry safepoint/stack growth code with spill/unspill + // The loop only runs once. + for _, ap := range v.Block.Func.RegArgs { + // Pass the spill/unspill information along to the assembler, offset by size of return PC pushed on stack. + addr := ssagen.SpillSlotAddr(ap, x86.REG_SP, v.Block.Func.Config.PtrSize) + s.FuncInfo().AddSpill( + obj.RegSpill{Reg: ap.Reg, Addr: addr, Unspill: loadByType(ap.Type), Spill: storeByType(ap.Type)}) + } + v.Block.Func.RegArgs = nil + ssagen.CheckArgReg(v) + case ssa.OpAMD64LoweredGetClosurePtr: + // Closure pointer is DX. + ssagen.CheckLoweredGetClosurePtr(v) + case ssa.OpAMD64LoweredGetG: + if s.ABI == obj.ABIInternal { + v.Fatalf("LoweredGetG should not appear in ABIInternal") + } + r := v.Reg() + getgFromTLS(s, r) + case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLtail: + if s.ABI == obj.ABI0 && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABIInternal { + // zeroing X15 when entering ABIInternal from ABI0 + if buildcfg.GOOS != "plan9" { // do not use SSE on Plan 9 + opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15) + } + // set G register from TLS + getgFromTLS(s, x86.REG_R14) + } + if v.Op == ssa.OpAMD64CALLtail { + s.TailCall(v) + break + } + s.Call(v) + if s.ABI == obj.ABIInternal && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABI0 { + // zeroing X15 when entering ABIInternal from ABI0 + if buildcfg.GOOS != "plan9" { // do not use SSE on Plan 9 + opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15) + } + // set G register from TLS + getgFromTLS(s, x86.REG_R14) + } + case ssa.OpAMD64CALLclosure, ssa.OpAMD64CALLinter: + s.Call(v) + + case ssa.OpAMD64LoweredGetCallerPC: + p := s.Prog(x86.AMOVQ) + p.From.Type = obj.TYPE_MEM + p.From.Offset = -8 // PC is stored 8 bytes below first parameter. + p.From.Name = obj.NAME_PARAM + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpAMD64LoweredGetCallerSP: + // caller's SP is the address of the first arg + mov := x86.AMOVQ + if types.PtrSize == 4 { + mov = x86.AMOVL + } + p := s.Prog(mov) + p.From.Type = obj.TYPE_ADDR + p.From.Offset = -base.Ctxt.Arch.FixedFrameSize // 0 on amd64, just to be consistent with other architectures + p.From.Name = obj.NAME_PARAM + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpAMD64LoweredWB: + p := s.Prog(obj.ACALL) + p.To.Type = obj.TYPE_MEM + p.To.Name = obj.NAME_EXTERN + // arg0 is in DI. Set sym to match where regalloc put arg1. + p.To.Sym = ssagen.GCWriteBarrierReg[v.Args[1].Reg()] + + case ssa.OpAMD64LoweredPanicBoundsA, ssa.OpAMD64LoweredPanicBoundsB, ssa.OpAMD64LoweredPanicBoundsC: + p := s.Prog(obj.ACALL) + p.To.Type = obj.TYPE_MEM + p.To.Name = obj.NAME_EXTERN + p.To.Sym = ssagen.BoundsCheckFunc[v.AuxInt] + s.UseArgs(int64(2 * types.PtrSize)) // space used in callee args area by assembly stubs + + case ssa.OpAMD64NEGQ, ssa.OpAMD64NEGL, + ssa.OpAMD64BSWAPQ, ssa.OpAMD64BSWAPL, + ssa.OpAMD64NOTQ, ssa.OpAMD64NOTL: + p := s.Prog(v.Op.Asm()) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpAMD64NEGLflags: + p := s.Prog(v.Op.Asm()) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg0() + + case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ, ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD, ssa.OpAMD64SQRTSS: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_REG + switch v.Op { + case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ: + p.To.Reg = v.Reg0() + case ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD, ssa.OpAMD64SQRTSS: + p.To.Reg = v.Reg() + } + case ssa.OpAMD64ROUNDSD: + p := s.Prog(v.Op.Asm()) + val := v.AuxInt + // 0 means math.RoundToEven, 1 Floor, 2 Ceil, 3 Trunc + if val < 0 || val > 3 { + v.Fatalf("Invalid rounding mode") + } + p.From.Offset = val + p.From.Type = obj.TYPE_CONST + p.SetFrom3Reg(v.Args[0].Reg()) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL, + ssa.OpAMD64TZCNTQ, ssa.OpAMD64TZCNTL, + ssa.OpAMD64LZCNTQ, ssa.OpAMD64LZCNTL: + if v.Args[0].Reg() != v.Reg() { + // POPCNT/TZCNT/LZCNT have a false dependency on the destination register on Intel cpus. + // TZCNT/LZCNT problem affects pre-Skylake models. See discussion at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=62011#c7. + // Xor register with itself to break the dependency. + opregreg(s, x86.AXORL, v.Reg(), v.Reg()) + } + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpAMD64SETEQ, ssa.OpAMD64SETNE, + ssa.OpAMD64SETL, ssa.OpAMD64SETLE, + ssa.OpAMD64SETG, ssa.OpAMD64SETGE, + ssa.OpAMD64SETGF, ssa.OpAMD64SETGEF, + ssa.OpAMD64SETB, ssa.OpAMD64SETBE, + ssa.OpAMD64SETORD, ssa.OpAMD64SETNAN, + ssa.OpAMD64SETA, ssa.OpAMD64SETAE, + ssa.OpAMD64SETO: + p := s.Prog(v.Op.Asm()) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpAMD64SETEQstore, ssa.OpAMD64SETNEstore, + ssa.OpAMD64SETLstore, ssa.OpAMD64SETLEstore, + ssa.OpAMD64SETGstore, ssa.OpAMD64SETGEstore, + ssa.OpAMD64SETBstore, ssa.OpAMD64SETBEstore, + ssa.OpAMD64SETAstore, ssa.OpAMD64SETAEstore: + p := s.Prog(v.Op.Asm()) + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + ssagen.AddAux(&p.To, v) + + case ssa.OpAMD64SETNEF: + t := v.RegTmp() + p := s.Prog(v.Op.Asm()) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + q := s.Prog(x86.ASETPS) + q.To.Type = obj.TYPE_REG + q.To.Reg = t + // ORL avoids partial register write and is smaller than ORQ, used by old compiler + opregreg(s, x86.AORL, v.Reg(), t) + + case ssa.OpAMD64SETEQF: + t := v.RegTmp() + p := s.Prog(v.Op.Asm()) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + q := s.Prog(x86.ASETPC) + q.To.Type = obj.TYPE_REG + q.To.Reg = t + // ANDL avoids partial register write and is smaller than ANDQ, used by old compiler + opregreg(s, x86.AANDL, v.Reg(), t) + + case ssa.OpAMD64InvertFlags: + v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString()) + case ssa.OpAMD64FlagEQ, ssa.OpAMD64FlagLT_ULT, ssa.OpAMD64FlagLT_UGT, ssa.OpAMD64FlagGT_ULT, ssa.OpAMD64FlagGT_UGT: + v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString()) + case ssa.OpAMD64AddTupleFirst32, ssa.OpAMD64AddTupleFirst64: + v.Fatalf("AddTupleFirst* should never make it to codegen %v", v.LongString()) + case ssa.OpAMD64REPSTOSQ: + s.Prog(x86.AREP) + s.Prog(x86.ASTOSQ) + case ssa.OpAMD64REPMOVSQ: + s.Prog(x86.AREP) + s.Prog(x86.AMOVSQ) + case ssa.OpAMD64LoweredNilCheck: + // Issue a load which will fault if the input is nil. + // TODO: We currently use the 2-byte instruction TESTB AX, (reg). + // Should we use the 3-byte TESTB $0, (reg) instead? It is larger + // but it doesn't have false dependency on AX. + // Or maybe allocate an output register and use MOVL (reg),reg2 ? + // That trades clobbering flags for clobbering a register. + p := s.Prog(x86.ATESTB) + p.From.Type = obj.TYPE_REG + p.From.Reg = x86.REG_AX + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + if logopt.Enabled() { + logopt.LogOpt(v.Pos, "nilcheck", "genssa", v.Block.Func.Name) + } + if base.Debug.Nil != 0 && v.Pos.Line() > 1 { // v.Pos.Line()==1 in generated wrappers + base.WarnfAt(v.Pos, "generated nil check") + } + case ssa.OpAMD64MOVBatomicload, ssa.OpAMD64MOVLatomicload, ssa.OpAMD64MOVQatomicload: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_MEM + p.From.Reg = v.Args[0].Reg() + ssagen.AddAux(&p.From, v) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg0() + case ssa.OpAMD64XCHGB, ssa.OpAMD64XCHGL, ssa.OpAMD64XCHGQ: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Reg0() + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[1].Reg() + ssagen.AddAux(&p.To, v) + case ssa.OpAMD64XADDLlock, ssa.OpAMD64XADDQlock: + s.Prog(x86.ALOCK) + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Reg0() + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[1].Reg() + ssagen.AddAux(&p.To, v) + case ssa.OpAMD64CMPXCHGLlock, ssa.OpAMD64CMPXCHGQlock: + if v.Args[1].Reg() != x86.REG_AX { + v.Fatalf("input[1] not in AX %s", v.LongString()) + } + s.Prog(x86.ALOCK) + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[2].Reg() + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + ssagen.AddAux(&p.To, v) + p = s.Prog(x86.ASETEQ) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg0() + case ssa.OpAMD64ANDBlock, ssa.OpAMD64ANDLlock, ssa.OpAMD64ORBlock, ssa.OpAMD64ORLlock: + s.Prog(x86.ALOCK) + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[1].Reg() + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + ssagen.AddAux(&p.To, v) + case ssa.OpAMD64PrefetchT0, ssa.OpAMD64PrefetchNTA: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_MEM + p.From.Reg = v.Args[0].Reg() + case ssa.OpClobber: + p := s.Prog(x86.AMOVL) + p.From.Type = obj.TYPE_CONST + p.From.Offset = 0xdeaddead + p.To.Type = obj.TYPE_MEM + p.To.Reg = x86.REG_SP + ssagen.AddAux(&p.To, v) + p = s.Prog(x86.AMOVL) + p.From.Type = obj.TYPE_CONST + p.From.Offset = 0xdeaddead + p.To.Type = obj.TYPE_MEM + p.To.Reg = x86.REG_SP + ssagen.AddAux(&p.To, v) + p.To.Offset += 4 + case ssa.OpClobberReg: + x := uint64(0xdeaddeaddeaddead) + p := s.Prog(x86.AMOVQ) + p.From.Type = obj.TYPE_CONST + p.From.Offset = int64(x) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + default: + v.Fatalf("genValue not implemented: %s", v.LongString()) + } +} + +var blockJump = [...]struct { + asm, invasm obj.As +}{ + ssa.BlockAMD64EQ: {x86.AJEQ, x86.AJNE}, + ssa.BlockAMD64NE: {x86.AJNE, x86.AJEQ}, + ssa.BlockAMD64LT: {x86.AJLT, x86.AJGE}, + ssa.BlockAMD64GE: {x86.AJGE, x86.AJLT}, + ssa.BlockAMD64LE: {x86.AJLE, x86.AJGT}, + ssa.BlockAMD64GT: {x86.AJGT, x86.AJLE}, + ssa.BlockAMD64OS: {x86.AJOS, x86.AJOC}, + ssa.BlockAMD64OC: {x86.AJOC, x86.AJOS}, + ssa.BlockAMD64ULT: {x86.AJCS, x86.AJCC}, + ssa.BlockAMD64UGE: {x86.AJCC, x86.AJCS}, + ssa.BlockAMD64UGT: {x86.AJHI, x86.AJLS}, + ssa.BlockAMD64ULE: {x86.AJLS, x86.AJHI}, + ssa.BlockAMD64ORD: {x86.AJPC, x86.AJPS}, + ssa.BlockAMD64NAN: {x86.AJPS, x86.AJPC}, +} + +var eqfJumps = [2][2]ssagen.IndexJump{ + {{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPS, Index: 1}}, // next == b.Succs[0] + {{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPC, Index: 0}}, // next == b.Succs[1] +} +var nefJumps = [2][2]ssagen.IndexJump{ + {{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPC, Index: 1}}, // next == b.Succs[0] + {{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPS, Index: 0}}, // next == b.Succs[1] +} + +func ssaGenBlock(s *ssagen.State, b, next *ssa.Block) { + switch b.Kind { + case ssa.BlockPlain: + if b.Succs[0].Block() != next { + p := s.Prog(obj.AJMP) + p.To.Type = obj.TYPE_BRANCH + s.Branches = append(s.Branches, ssagen.Branch{P: p, B: b.Succs[0].Block()}) + } + case ssa.BlockDefer: + // defer returns in rax: + // 0 if we should continue executing + // 1 if we should jump to deferreturn call + p := s.Prog(x86.ATESTL) + p.From.Type = obj.TYPE_REG + p.From.Reg = x86.REG_AX + p.To.Type = obj.TYPE_REG + p.To.Reg = x86.REG_AX + p = s.Prog(x86.AJNE) + p.To.Type = obj.TYPE_BRANCH + s.Branches = append(s.Branches, ssagen.Branch{P: p, B: b.Succs[1].Block()}) + if b.Succs[0].Block() != next { + p := s.Prog(obj.AJMP) + p.To.Type = obj.TYPE_BRANCH + s.Branches = append(s.Branches, ssagen.Branch{P: p, B: b.Succs[0].Block()}) + } + case ssa.BlockExit, ssa.BlockRetJmp: + case ssa.BlockRet: + s.Prog(obj.ARET) + + case ssa.BlockAMD64EQF: + s.CombJump(b, next, &eqfJumps) + + case ssa.BlockAMD64NEF: + s.CombJump(b, next, &nefJumps) + + case ssa.BlockAMD64EQ, ssa.BlockAMD64NE, + ssa.BlockAMD64LT, ssa.BlockAMD64GE, + ssa.BlockAMD64LE, ssa.BlockAMD64GT, + ssa.BlockAMD64OS, ssa.BlockAMD64OC, + ssa.BlockAMD64ULT, ssa.BlockAMD64UGT, + ssa.BlockAMD64ULE, ssa.BlockAMD64UGE: + jmp := blockJump[b.Kind] + switch next { + case b.Succs[0].Block(): + s.Br(jmp.invasm, b.Succs[1].Block()) + case b.Succs[1].Block(): + s.Br(jmp.asm, b.Succs[0].Block()) + default: + if b.Likely != ssa.BranchUnlikely { + s.Br(jmp.asm, b.Succs[0].Block()) + s.Br(obj.AJMP, b.Succs[1].Block()) + } else { + s.Br(jmp.invasm, b.Succs[1].Block()) + s.Br(obj.AJMP, b.Succs[0].Block()) + } + } + + case ssa.BlockAMD64JUMPTABLE: + // JMP *(TABLE)(INDEX*8) + p := s.Prog(obj.AJMP) + p.To.Type = obj.TYPE_MEM + p.To.Reg = b.Controls[1].Reg() + p.To.Index = b.Controls[0].Reg() + p.To.Scale = 8 + // Save jump tables for later resolution of the target blocks. + s.JumpTables = append(s.JumpTables, b) + + default: + b.Fatalf("branch not implemented: %s", b.LongString()) + } +} + +func loadRegResult(s *ssagen.State, f *ssa.Func, t *types.Type, reg int16, n *ir.Name, off int64) *obj.Prog { + p := s.Prog(loadByType(t)) + p.From.Type = obj.TYPE_MEM + p.From.Name = obj.NAME_AUTO + p.From.Sym = n.Linksym() + p.From.Offset = n.FrameOffset() + off + p.To.Type = obj.TYPE_REG + p.To.Reg = reg + return p +} + +func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg int16, n *ir.Name, off int64) *obj.Prog { + p = pp.Append(p, storeByType(t), obj.TYPE_REG, reg, 0, obj.TYPE_MEM, 0, n.FrameOffset()+off) + p.To.Name = obj.NAME_PARAM + p.To.Sym = n.Linksym() + p.Pos = p.Pos.WithNotStmt() + return p +} diff --git a/src/cmd/compile/internal/amd64/versions_test.go b/src/cmd/compile/internal/amd64/versions_test.go new file mode 100644 index 0000000..fc0046a --- /dev/null +++ b/src/cmd/compile/internal/amd64/versions_test.go @@ -0,0 +1,433 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// When using GOEXPERIMENT=boringcrypto, the test program links in the boringcrypto syso, +// which does not respect GOAMD64, so we skip the test if boringcrypto is enabled. +//go:build !boringcrypto + +package amd64_test + +import ( + "bufio" + "debug/elf" + "debug/macho" + "errors" + "fmt" + "go/build" + "internal/testenv" + "io" + "math" + "math/bits" + "os" + "os/exec" + "regexp" + "runtime" + "strconv" + "strings" + "testing" +) + +// Test to make sure that when building for GOAMD64=v1, we don't +// use any >v1 instructions. +func TestGoAMD64v1(t *testing.T) { + if runtime.GOARCH != "amd64" { + t.Skip("amd64-only test") + } + if runtime.GOOS != "linux" && runtime.GOOS != "darwin" { + t.Skip("test only works on elf or macho platforms") + } + for _, tag := range build.Default.ToolTags { + if tag == "amd64.v2" { + t.Skip("compiling for GOAMD64=v2 or higher") + } + } + if os.Getenv("TESTGOAMD64V1") != "" { + t.Skip("recursive call") + } + + // Make a binary which will be a modified version of the + // currently running binary. + dst, err := os.CreateTemp("", "TestGoAMD64v1") + if err != nil { + t.Fatalf("failed to create temp file: %v", err) + } + defer os.Remove(dst.Name()) + dst.Chmod(0500) // make executable + + // Clobber all the non-v1 opcodes. + opcodes := map[string]bool{} + var features []string + for feature, opcodeList := range featureToOpcodes { + if runtimeFeatures[feature] { + features = append(features, fmt.Sprintf("cpu.%s=off", feature)) + } + for _, op := range opcodeList { + opcodes[op] = true + } + } + clobber(t, os.Args[0], dst, opcodes) + if err = dst.Close(); err != nil { + t.Fatalf("can't close binary: %v", err) + } + + // Run the resulting binary. + cmd := testenv.Command(t, dst.Name()) + testenv.CleanCmdEnv(cmd) + cmd.Env = append(cmd.Env, "TESTGOAMD64V1=yes") + cmd.Env = append(cmd.Env, fmt.Sprintf("GODEBUG=%s", strings.Join(features, ","))) + out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("couldn't execute test: %s", err) + } + // Expect to see output of the form "PASS\n", unless the test binary + // was compiled for coverage (in which case there will be an extra line). + success := false + lines := strings.Split(string(out), "\n") + if len(lines) == 2 { + success = lines[0] == "PASS" && lines[1] == "" + } else if len(lines) == 3 { + success = lines[0] == "PASS" && + strings.HasPrefix(lines[1], "coverage") && lines[2] == "" + } + if !success { + t.Fatalf("test reported error: %s lines=%+v", string(out), lines) + } +} + +// Clobber copies the binary src to dst, replacing all the instructions in opcodes with +// faulting instructions. +func clobber(t *testing.T, src string, dst *os.File, opcodes map[string]bool) { + // Run objdump to get disassembly. + var re *regexp.Regexp + var disasm io.Reader + if false { + // TODO: go tool objdump doesn't disassemble the bmi1 instructions + // in question correctly. See issue 48584. + cmd := testenv.Command(t, "go", "tool", "objdump", src) + var err error + disasm, err = cmd.StdoutPipe() + if err != nil { + t.Fatal(err) + } + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + t.Cleanup(func() { + if err := cmd.Wait(); err != nil { + t.Error(err) + } + }) + re = regexp.MustCompile(`^[^:]*:[-\d]+\s+0x([\da-f]+)\s+([\da-f]+)\s+([A-Z]+)`) + } else { + // TODO: we're depending on platform-native objdump here. Hence the Skipf + // below if it doesn't run for some reason. + cmd := testenv.Command(t, "objdump", "-d", src) + var err error + disasm, err = cmd.StdoutPipe() + if err != nil { + t.Fatal(err) + } + if err := cmd.Start(); err != nil { + if errors.Is(err, exec.ErrNotFound) { + t.Skipf("can't run test due to missing objdump: %s", err) + } + t.Fatal(err) + } + t.Cleanup(func() { + if err := cmd.Wait(); err != nil { + t.Error(err) + } + }) + re = regexp.MustCompile(`^\s*([\da-f]+):\s*((?:[\da-f][\da-f] )+)\s*([a-z\d]+)`) + } + + // Find all the instruction addresses we need to edit. + virtualEdits := map[uint64]bool{} + scanner := bufio.NewScanner(disasm) + for scanner.Scan() { + line := scanner.Text() + parts := re.FindStringSubmatch(line) + if len(parts) == 0 { + continue + } + addr, err := strconv.ParseUint(parts[1], 16, 64) + if err != nil { + continue // not a hex address + } + opcode := strings.ToLower(parts[3]) + if !opcodes[opcode] { + continue + } + t.Logf("clobbering instruction %s", line) + n := (len(parts[2]) - strings.Count(parts[2], " ")) / 2 // number of bytes in instruction encoding + for i := 0; i < n; i++ { + // Only really need to make the first byte faulting, but might + // as well make all the bytes faulting. + virtualEdits[addr+uint64(i)] = true + } + } + + // Figure out where in the binary the edits must be done. + physicalEdits := map[uint64]bool{} + if e, err := elf.Open(src); err == nil { + for _, sec := range e.Sections { + vaddr := sec.Addr + paddr := sec.Offset + size := sec.Size + for a := range virtualEdits { + if a >= vaddr && a < vaddr+size { + physicalEdits[paddr+(a-vaddr)] = true + } + } + } + } else if m, err2 := macho.Open(src); err2 == nil { + for _, sec := range m.Sections { + vaddr := sec.Addr + paddr := uint64(sec.Offset) + size := sec.Size + for a := range virtualEdits { + if a >= vaddr && a < vaddr+size { + physicalEdits[paddr+(a-vaddr)] = true + } + } + } + } else { + t.Log(err) + t.Log(err2) + t.Fatal("executable format not elf or macho") + } + if len(virtualEdits) != len(physicalEdits) { + t.Fatal("couldn't find an instruction in text sections") + } + + // Copy source to destination, making edits along the way. + f, err := os.Open(src) + if err != nil { + t.Fatal(err) + } + r := bufio.NewReader(f) + w := bufio.NewWriter(dst) + a := uint64(0) + done := 0 + for { + b, err := r.ReadByte() + if err == io.EOF { + break + } + if err != nil { + t.Fatal("can't read") + } + if physicalEdits[a] { + b = 0xcc // INT3 opcode + done++ + } + err = w.WriteByte(b) + if err != nil { + t.Fatal("can't write") + } + a++ + } + if done != len(physicalEdits) { + t.Fatal("physical edits remaining") + } + w.Flush() + f.Close() +} + +func setOf(keys ...string) map[string]bool { + m := make(map[string]bool, len(keys)) + for _, key := range keys { + m[key] = true + } + return m +} + +var runtimeFeatures = setOf( + "adx", "aes", "avx", "avx2", "bmi1", "bmi2", "erms", "fma", + "pclmulqdq", "popcnt", "rdtscp", "sse3", "sse41", "sse42", "ssse3", +) + +var featureToOpcodes = map[string][]string{ + // Note: we include *q, *l, and plain opcodes here. + // go tool objdump doesn't include a [QL] on popcnt instructions, until CL 351889 + // native objdump doesn't include [QL] on linux. + "popcnt": {"popcntq", "popcntl", "popcnt"}, + "bmi1": { + "andnq", "andnl", "andn", + "blsiq", "blsil", "blsi", + "blsmskq", "blsmskl", "blsmsk", + "blsrq", "blsrl", "blsr", + "tzcntq", "tzcntl", "tzcnt", + }, + "bmi2": { + "sarxq", "sarxl", "sarx", + "shlxq", "shlxl", "shlx", + "shrxq", "shrxl", "shrx", + }, + "sse41": { + "roundsd", + "pinsrq", "pinsrl", "pinsrd", "pinsrb", "pinsr", + "pextrq", "pextrl", "pextrd", "pextrb", "pextr", + "pminsb", "pminsd", "pminuw", "pminud", // Note: ub and sw are ok. + "pmaxsb", "pmaxsd", "pmaxuw", "pmaxud", + "pmovzxbw", "pmovzxbd", "pmovzxbq", "pmovzxwd", "pmovzxwq", "pmovzxdq", + "pmovsxbw", "pmovsxbd", "pmovsxbq", "pmovsxwd", "pmovsxwq", "pmovsxdq", + "pblendvb", + }, + "fma": {"vfmadd231sd"}, + "movbe": {"movbeqq", "movbeq", "movbell", "movbel", "movbe"}, + "lzcnt": {"lzcntq", "lzcntl", "lzcnt"}, +} + +// Test to use POPCNT instruction, if available +func TestPopCnt(t *testing.T) { + for _, tt := range []struct { + x uint64 + want int + }{ + {0b00001111, 4}, + {0b00001110, 3}, + {0b00001100, 2}, + {0b00000000, 0}, + } { + if got := bits.OnesCount64(tt.x); got != tt.want { + t.Errorf("OnesCount64(%#x) = %d, want %d", tt.x, got, tt.want) + } + if got := bits.OnesCount32(uint32(tt.x)); got != tt.want { + t.Errorf("OnesCount32(%#x) = %d, want %d", tt.x, got, tt.want) + } + } +} + +// Test to use ANDN, if available +func TestAndNot(t *testing.T) { + for _, tt := range []struct { + x, y, want uint64 + }{ + {0b00001111, 0b00000011, 0b1100}, + {0b00001111, 0b00001100, 0b0011}, + {0b00000000, 0b00000000, 0b0000}, + } { + if got := tt.x &^ tt.y; got != tt.want { + t.Errorf("%#x &^ %#x = %#x, want %#x", tt.x, tt.y, got, tt.want) + } + if got := uint32(tt.x) &^ uint32(tt.y); got != uint32(tt.want) { + t.Errorf("%#x &^ %#x = %#x, want %#x", tt.x, tt.y, got, tt.want) + } + } +} + +// Test to use BLSI, if available +func TestBLSI(t *testing.T) { + for _, tt := range []struct { + x, want uint64 + }{ + {0b00001111, 0b001}, + {0b00001110, 0b010}, + {0b00001100, 0b100}, + {0b11000110, 0b010}, + {0b00000000, 0b000}, + } { + if got := tt.x & -tt.x; got != tt.want { + t.Errorf("%#x & (-%#x) = %#x, want %#x", tt.x, tt.x, got, tt.want) + } + if got := uint32(tt.x) & -uint32(tt.x); got != uint32(tt.want) { + t.Errorf("%#x & (-%#x) = %#x, want %#x", tt.x, tt.x, got, tt.want) + } + } +} + +// Test to use BLSMSK, if available +func TestBLSMSK(t *testing.T) { + for _, tt := range []struct { + x, want uint64 + }{ + {0b00001111, 0b001}, + {0b00001110, 0b011}, + {0b00001100, 0b111}, + {0b11000110, 0b011}, + {0b00000000, 1<<64 - 1}, + } { + if got := tt.x ^ (tt.x - 1); got != tt.want { + t.Errorf("%#x ^ (%#x-1) = %#x, want %#x", tt.x, tt.x, got, tt.want) + } + if got := uint32(tt.x) ^ (uint32(tt.x) - 1); got != uint32(tt.want) { + t.Errorf("%#x ^ (%#x-1) = %#x, want %#x", tt.x, tt.x, got, uint32(tt.want)) + } + } +} + +// Test to use BLSR, if available +func TestBLSR(t *testing.T) { + for _, tt := range []struct { + x, want uint64 + }{ + {0b00001111, 0b00001110}, + {0b00001110, 0b00001100}, + {0b00001100, 0b00001000}, + {0b11000110, 0b11000100}, + {0b00000000, 0b00000000}, + } { + if got := tt.x & (tt.x - 1); got != tt.want { + t.Errorf("%#x & (%#x-1) = %#x, want %#x", tt.x, tt.x, got, tt.want) + } + if got := uint32(tt.x) & (uint32(tt.x) - 1); got != uint32(tt.want) { + t.Errorf("%#x & (%#x-1) = %#x, want %#x", tt.x, tt.x, got, tt.want) + } + } +} + +func TestTrailingZeros(t *testing.T) { + for _, tt := range []struct { + x uint64 + want int + }{ + {0b00001111, 0}, + {0b00001110, 1}, + {0b00001100, 2}, + {0b00001000, 3}, + {0b00000000, 64}, + } { + if got := bits.TrailingZeros64(tt.x); got != tt.want { + t.Errorf("TrailingZeros64(%#x) = %d, want %d", tt.x, got, tt.want) + } + want := tt.want + if want == 64 { + want = 32 + } + if got := bits.TrailingZeros32(uint32(tt.x)); got != want { + t.Errorf("TrailingZeros64(%#x) = %d, want %d", tt.x, got, want) + } + } +} + +func TestRound(t *testing.T) { + for _, tt := range []struct { + x, want float64 + }{ + {1.4, 1}, + {1.5, 2}, + {1.6, 2}, + {2.4, 2}, + {2.5, 2}, + {2.6, 3}, + } { + if got := math.RoundToEven(tt.x); got != tt.want { + t.Errorf("RoundToEven(%f) = %f, want %f", tt.x, got, tt.want) + } + } +} + +func TestFMA(t *testing.T) { + for _, tt := range []struct { + x, y, z, want float64 + }{ + {2, 3, 4, 10}, + {3, 4, 5, 17}, + } { + if got := math.FMA(tt.x, tt.y, tt.z); got != tt.want { + t.Errorf("FMA(%f,%f,%f) = %f, want %f", tt.x, tt.y, tt.z, got, tt.want) + } + } +} |