diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 13:16:40 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 13:16:40 +0000 |
commit | 47ab3d4a42e9ab51c465c4322d2ec233f6324e6b (patch) | |
tree | a61a0ffd83f4a3def4b36e5c8e99630c559aa723 /src/cmd/compile/internal/ppc64 | |
parent | Initial commit. (diff) | |
download | golang-1.18-upstream.tar.xz golang-1.18-upstream.zip |
Adding upstream version 1.18.10.upstream/1.18.10upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/cmd/compile/internal/ppc64')
-rw-r--r-- | src/cmd/compile/internal/ppc64/galign.go | 29 | ||||
-rw-r--r-- | src/cmd/compile/internal/ppc64/ggen.go | 55 | ||||
-rw-r--r-- | src/cmd/compile/internal/ppc64/opt.go | 12 | ||||
-rw-r--r-- | src/cmd/compile/internal/ppc64/ssa.go | 2068 |
4 files changed, 2164 insertions, 0 deletions
diff --git a/src/cmd/compile/internal/ppc64/galign.go b/src/cmd/compile/internal/ppc64/galign.go new file mode 100644 index 0000000..20fd8ce --- /dev/null +++ b/src/cmd/compile/internal/ppc64/galign.go @@ -0,0 +1,29 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ppc64 + +import ( + "cmd/compile/internal/ssagen" + "cmd/internal/obj/ppc64" + "internal/buildcfg" +) + +func Init(arch *ssagen.ArchInfo) { + arch.LinkArch = &ppc64.Linkppc64 + if buildcfg.GOARCH == "ppc64le" { + arch.LinkArch = &ppc64.Linkppc64le + } + arch.REGSP = ppc64.REGSP + arch.MAXWIDTH = 1 << 50 + + arch.ZeroRange = zerorange + arch.Ginsnop = ginsnop + + arch.SSAMarkMoves = ssaMarkMoves + arch.SSAGenValue = ssaGenValue + arch.SSAGenBlock = ssaGenBlock + arch.LoadRegResult = loadRegResult + arch.SpillArgReg = spillArgReg +} diff --git a/src/cmd/compile/internal/ppc64/ggen.go b/src/cmd/compile/internal/ppc64/ggen.go new file mode 100644 index 0000000..3ae6422 --- /dev/null +++ b/src/cmd/compile/internal/ppc64/ggen.go @@ -0,0 +1,55 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ppc64 + +import ( + "cmd/compile/internal/base" + "cmd/compile/internal/ir" + "cmd/compile/internal/objw" + "cmd/compile/internal/types" + "cmd/internal/obj" + "cmd/internal/obj/ppc64" +) + +func zerorange(pp *objw.Progs, p *obj.Prog, off, cnt int64, _ *uint32) *obj.Prog { + if cnt == 0 { + return p + } + if cnt < int64(4*types.PtrSize) { + for i := int64(0); i < cnt; i += int64(types.PtrSize) { + p = pp.Append(p, ppc64.AMOVD, obj.TYPE_REG, ppc64.REGZERO, 0, obj.TYPE_MEM, ppc64.REGSP, base.Ctxt.FixedFrameSize()+off+i) + } + } else if cnt <= int64(128*types.PtrSize) { + p = pp.Append(p, ppc64.AADD, obj.TYPE_CONST, 0, base.Ctxt.FixedFrameSize()+off-8, obj.TYPE_REG, ppc64.REGRT1, 0) + p.Reg = ppc64.REGSP + p = pp.Append(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_MEM, 0, 0) + p.To.Name = obj.NAME_EXTERN + p.To.Sym = ir.Syms.Duffzero + p.To.Offset = 4 * (128 - cnt/int64(types.PtrSize)) + } else { + p = pp.Append(p, ppc64.AMOVD, obj.TYPE_CONST, 0, base.Ctxt.FixedFrameSize()+off-8, obj.TYPE_REG, ppc64.REGTMP, 0) + p = pp.Append(p, ppc64.AADD, obj.TYPE_REG, ppc64.REGTMP, 0, obj.TYPE_REG, ppc64.REGRT1, 0) + p.Reg = ppc64.REGSP + p = pp.Append(p, ppc64.AMOVD, obj.TYPE_CONST, 0, cnt, obj.TYPE_REG, ppc64.REGTMP, 0) + p = pp.Append(p, ppc64.AADD, obj.TYPE_REG, ppc64.REGTMP, 0, obj.TYPE_REG, ppc64.REGRT2, 0) + p.Reg = ppc64.REGRT1 + p = pp.Append(p, ppc64.AMOVDU, obj.TYPE_REG, ppc64.REGZERO, 0, obj.TYPE_MEM, ppc64.REGRT1, int64(types.PtrSize)) + p1 := p + p = pp.Append(p, ppc64.ACMP, obj.TYPE_REG, ppc64.REGRT1, 0, obj.TYPE_REG, ppc64.REGRT2, 0) + p = pp.Append(p, ppc64.ABNE, obj.TYPE_NONE, 0, 0, obj.TYPE_BRANCH, 0, 0) + p.To.SetTarget(p1) + } + + return p +} + +func ginsnop(pp *objw.Progs) *obj.Prog { + p := pp.Prog(ppc64.AOR) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_R0 + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_R0 + return p +} diff --git a/src/cmd/compile/internal/ppc64/opt.go b/src/cmd/compile/internal/ppc64/opt.go new file mode 100644 index 0000000..4f81aa9 --- /dev/null +++ b/src/cmd/compile/internal/ppc64/opt.go @@ -0,0 +1,12 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ppc64 + +// Many Power ISA arithmetic and logical instructions come in four +// standard variants. These bits let us map between variants. +const ( + V_CC = 1 << 0 // xCC (affect CR field 0 flags) + V_V = 1 << 1 // xV (affect SO and OV flags) +) diff --git a/src/cmd/compile/internal/ppc64/ssa.go b/src/cmd/compile/internal/ppc64/ssa.go new file mode 100644 index 0000000..98316c1 --- /dev/null +++ b/src/cmd/compile/internal/ppc64/ssa.go @@ -0,0 +1,2068 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ppc64 + +import ( + "cmd/compile/internal/base" + "cmd/compile/internal/ir" + "cmd/compile/internal/logopt" + "cmd/compile/internal/objw" + "cmd/compile/internal/ssa" + "cmd/compile/internal/ssagen" + "cmd/compile/internal/types" + "cmd/internal/obj" + "cmd/internal/obj/ppc64" + "internal/buildcfg" + "math" + "strings" +) + +// markMoves marks any MOVXconst ops that need to avoid clobbering flags. +func ssaMarkMoves(s *ssagen.State, b *ssa.Block) { + // flive := b.FlagsLiveAtEnd + // if b.Control != nil && b.Control.Type.IsFlags() { + // flive = true + // } + // for i := len(b.Values) - 1; i >= 0; i-- { + // v := b.Values[i] + // if flive && (v.Op == v.Op == ssa.OpPPC64MOVDconst) { + // // The "mark" is any non-nil Aux value. + // v.Aux = v + // } + // if v.Type.IsFlags() { + // flive = false + // } + // for _, a := range v.Args { + // if a.Type.IsFlags() { + // flive = true + // } + // } + // } +} + +// loadByType returns the load instruction of the given type. +func loadByType(t *types.Type) obj.As { + if t.IsFloat() { + switch t.Size() { + case 4: + return ppc64.AFMOVS + case 8: + return ppc64.AFMOVD + } + } else { + switch t.Size() { + case 1: + if t.IsSigned() { + return ppc64.AMOVB + } else { + return ppc64.AMOVBZ + } + case 2: + if t.IsSigned() { + return ppc64.AMOVH + } else { + return ppc64.AMOVHZ + } + case 4: + if t.IsSigned() { + return ppc64.AMOVW + } else { + return ppc64.AMOVWZ + } + case 8: + return ppc64.AMOVD + } + } + panic("bad load type") +} + +// storeByType returns the store instruction of the given type. +func storeByType(t *types.Type) obj.As { + if t.IsFloat() { + switch t.Size() { + case 4: + return ppc64.AFMOVS + case 8: + return ppc64.AFMOVD + } + } else { + switch t.Size() { + case 1: + return ppc64.AMOVB + case 2: + return ppc64.AMOVH + case 4: + return ppc64.AMOVW + case 8: + return ppc64.AMOVD + } + } + panic("bad store type") +} + +func ssaGenValue(s *ssagen.State, v *ssa.Value) { + switch v.Op { + case ssa.OpCopy: + t := v.Type + if t.IsMemory() { + return + } + x := v.Args[0].Reg() + y := v.Reg() + if x != y { + rt := obj.TYPE_REG + op := ppc64.AMOVD + + if t.IsFloat() { + op = ppc64.AFMOVD + } + p := s.Prog(op) + p.From.Type = rt + p.From.Reg = x + p.To.Type = rt + p.To.Reg = y + } + + case ssa.OpPPC64LoweredMuluhilo: + // MULHDU Rarg1, Rarg0, Reg0 + // MULLD Rarg1, Rarg0, Reg1 + r0 := v.Args[0].Reg() + r1 := v.Args[1].Reg() + p := s.Prog(ppc64.AMULHDU) + p.From.Type = obj.TYPE_REG + p.From.Reg = r1 + p.Reg = r0 + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg0() + p1 := s.Prog(ppc64.AMULLD) + p1.From.Type = obj.TYPE_REG + p1.From.Reg = r1 + p1.Reg = r0 + p1.To.Type = obj.TYPE_REG + p1.To.Reg = v.Reg1() + + case ssa.OpPPC64LoweredAdd64Carry: + // ADDC Rarg2, -1, Rtmp + // ADDE Rarg1, Rarg0, Reg0 + // ADDZE Rzero, Reg1 + r0 := v.Args[0].Reg() + r1 := v.Args[1].Reg() + r2 := v.Args[2].Reg() + p := s.Prog(ppc64.AADDC) + p.From.Type = obj.TYPE_CONST + p.From.Offset = -1 + p.Reg = r2 + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REGTMP + p1 := s.Prog(ppc64.AADDE) + p1.From.Type = obj.TYPE_REG + p1.From.Reg = r1 + p1.Reg = r0 + p1.To.Type = obj.TYPE_REG + p1.To.Reg = v.Reg0() + p2 := s.Prog(ppc64.AADDZE) + p2.From.Type = obj.TYPE_REG + p2.From.Reg = ppc64.REGZERO + p2.To.Type = obj.TYPE_REG + p2.To.Reg = v.Reg1() + + case ssa.OpPPC64LoweredAtomicAnd8, + ssa.OpPPC64LoweredAtomicAnd32, + ssa.OpPPC64LoweredAtomicOr8, + ssa.OpPPC64LoweredAtomicOr32: + // LWSYNC + // LBAR/LWAR (Rarg0), Rtmp + // AND/OR Rarg1, Rtmp + // STBCCC/STWCCC Rtmp, (Rarg0) + // BNE -3(PC) + ld := ppc64.ALBAR + st := ppc64.ASTBCCC + if v.Op == ssa.OpPPC64LoweredAtomicAnd32 || v.Op == ssa.OpPPC64LoweredAtomicOr32 { + ld = ppc64.ALWAR + st = ppc64.ASTWCCC + } + r0 := v.Args[0].Reg() + r1 := v.Args[1].Reg() + // LWSYNC - Assuming shared data not write-through-required nor + // caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b. + plwsync := s.Prog(ppc64.ALWSYNC) + plwsync.To.Type = obj.TYPE_NONE + // LBAR or LWAR + p := s.Prog(ld) + p.From.Type = obj.TYPE_MEM + p.From.Reg = r0 + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REGTMP + // AND/OR reg1,out + p1 := s.Prog(v.Op.Asm()) + p1.From.Type = obj.TYPE_REG + p1.From.Reg = r1 + p1.To.Type = obj.TYPE_REG + p1.To.Reg = ppc64.REGTMP + // STBCCC or STWCCC + p2 := s.Prog(st) + p2.From.Type = obj.TYPE_REG + p2.From.Reg = ppc64.REGTMP + p2.To.Type = obj.TYPE_MEM + p2.To.Reg = r0 + p2.RegTo2 = ppc64.REGTMP + // BNE retry + p3 := s.Prog(ppc64.ABNE) + p3.To.Type = obj.TYPE_BRANCH + p3.To.SetTarget(p) + + case ssa.OpPPC64LoweredAtomicAdd32, + ssa.OpPPC64LoweredAtomicAdd64: + // LWSYNC + // LDAR/LWAR (Rarg0), Rout + // ADD Rarg1, Rout + // STDCCC/STWCCC Rout, (Rarg0) + // BNE -3(PC) + // MOVW Rout,Rout (if Add32) + ld := ppc64.ALDAR + st := ppc64.ASTDCCC + if v.Op == ssa.OpPPC64LoweredAtomicAdd32 { + ld = ppc64.ALWAR + st = ppc64.ASTWCCC + } + r0 := v.Args[0].Reg() + r1 := v.Args[1].Reg() + out := v.Reg0() + // LWSYNC - Assuming shared data not write-through-required nor + // caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b. + plwsync := s.Prog(ppc64.ALWSYNC) + plwsync.To.Type = obj.TYPE_NONE + // LDAR or LWAR + p := s.Prog(ld) + p.From.Type = obj.TYPE_MEM + p.From.Reg = r0 + p.To.Type = obj.TYPE_REG + p.To.Reg = out + // ADD reg1,out + p1 := s.Prog(ppc64.AADD) + p1.From.Type = obj.TYPE_REG + p1.From.Reg = r1 + p1.To.Reg = out + p1.To.Type = obj.TYPE_REG + // STDCCC or STWCCC + p3 := s.Prog(st) + p3.From.Type = obj.TYPE_REG + p3.From.Reg = out + p3.To.Type = obj.TYPE_MEM + p3.To.Reg = r0 + // BNE retry + p4 := s.Prog(ppc64.ABNE) + p4.To.Type = obj.TYPE_BRANCH + p4.To.SetTarget(p) + + // Ensure a 32 bit result + if v.Op == ssa.OpPPC64LoweredAtomicAdd32 { + p5 := s.Prog(ppc64.AMOVWZ) + p5.To.Type = obj.TYPE_REG + p5.To.Reg = out + p5.From.Type = obj.TYPE_REG + p5.From.Reg = out + } + + case ssa.OpPPC64LoweredAtomicExchange32, + ssa.OpPPC64LoweredAtomicExchange64: + // LWSYNC + // LDAR/LWAR (Rarg0), Rout + // STDCCC/STWCCC Rout, (Rarg0) + // BNE -2(PC) + // ISYNC + ld := ppc64.ALDAR + st := ppc64.ASTDCCC + if v.Op == ssa.OpPPC64LoweredAtomicExchange32 { + ld = ppc64.ALWAR + st = ppc64.ASTWCCC + } + r0 := v.Args[0].Reg() + r1 := v.Args[1].Reg() + out := v.Reg0() + // LWSYNC - Assuming shared data not write-through-required nor + // caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b. + plwsync := s.Prog(ppc64.ALWSYNC) + plwsync.To.Type = obj.TYPE_NONE + // LDAR or LWAR + p := s.Prog(ld) + p.From.Type = obj.TYPE_MEM + p.From.Reg = r0 + p.To.Type = obj.TYPE_REG + p.To.Reg = out + // STDCCC or STWCCC + p1 := s.Prog(st) + p1.From.Type = obj.TYPE_REG + p1.From.Reg = r1 + p1.To.Type = obj.TYPE_MEM + p1.To.Reg = r0 + // BNE retry + p2 := s.Prog(ppc64.ABNE) + p2.To.Type = obj.TYPE_BRANCH + p2.To.SetTarget(p) + // ISYNC + pisync := s.Prog(ppc64.AISYNC) + pisync.To.Type = obj.TYPE_NONE + + case ssa.OpPPC64LoweredAtomicLoad8, + ssa.OpPPC64LoweredAtomicLoad32, + ssa.OpPPC64LoweredAtomicLoad64, + ssa.OpPPC64LoweredAtomicLoadPtr: + // SYNC + // MOVB/MOVD/MOVW (Rarg0), Rout + // CMP Rout,Rout + // BNE 1(PC) + // ISYNC + ld := ppc64.AMOVD + cmp := ppc64.ACMP + switch v.Op { + case ssa.OpPPC64LoweredAtomicLoad8: + ld = ppc64.AMOVBZ + case ssa.OpPPC64LoweredAtomicLoad32: + ld = ppc64.AMOVWZ + cmp = ppc64.ACMPW + } + arg0 := v.Args[0].Reg() + out := v.Reg0() + // SYNC when AuxInt == 1; otherwise, load-acquire + if v.AuxInt == 1 { + psync := s.Prog(ppc64.ASYNC) + psync.To.Type = obj.TYPE_NONE + } + // Load + p := s.Prog(ld) + p.From.Type = obj.TYPE_MEM + p.From.Reg = arg0 + p.To.Type = obj.TYPE_REG + p.To.Reg = out + // CMP + p1 := s.Prog(cmp) + p1.From.Type = obj.TYPE_REG + p1.From.Reg = out + p1.To.Type = obj.TYPE_REG + p1.To.Reg = out + // BNE + p2 := s.Prog(ppc64.ABNE) + p2.To.Type = obj.TYPE_BRANCH + // ISYNC + pisync := s.Prog(ppc64.AISYNC) + pisync.To.Type = obj.TYPE_NONE + p2.To.SetTarget(pisync) + + case ssa.OpPPC64LoweredAtomicStore8, + ssa.OpPPC64LoweredAtomicStore32, + ssa.OpPPC64LoweredAtomicStore64: + // SYNC or LWSYNC + // MOVB/MOVW/MOVD arg1,(arg0) + st := ppc64.AMOVD + switch v.Op { + case ssa.OpPPC64LoweredAtomicStore8: + st = ppc64.AMOVB + case ssa.OpPPC64LoweredAtomicStore32: + st = ppc64.AMOVW + } + arg0 := v.Args[0].Reg() + arg1 := v.Args[1].Reg() + // If AuxInt == 0, LWSYNC (Store-Release), else SYNC + // SYNC + syncOp := ppc64.ASYNC + if v.AuxInt == 0 { + syncOp = ppc64.ALWSYNC + } + psync := s.Prog(syncOp) + psync.To.Type = obj.TYPE_NONE + // Store + p := s.Prog(st) + p.To.Type = obj.TYPE_MEM + p.To.Reg = arg0 + p.From.Type = obj.TYPE_REG + p.From.Reg = arg1 + + case ssa.OpPPC64LoweredAtomicCas64, + ssa.OpPPC64LoweredAtomicCas32: + // LWSYNC + // loop: + // LDAR (Rarg0), MutexHint, Rtmp + // CMP Rarg1, Rtmp + // BNE fail + // STDCCC Rarg2, (Rarg0) + // BNE loop + // LWSYNC // Only for sequential consistency; not required in CasRel. + // MOVD $1, Rout + // BR end + // fail: + // MOVD $0, Rout + // end: + ld := ppc64.ALDAR + st := ppc64.ASTDCCC + cmp := ppc64.ACMP + if v.Op == ssa.OpPPC64LoweredAtomicCas32 { + ld = ppc64.ALWAR + st = ppc64.ASTWCCC + cmp = ppc64.ACMPW + } + r0 := v.Args[0].Reg() + r1 := v.Args[1].Reg() + r2 := v.Args[2].Reg() + out := v.Reg0() + // LWSYNC - Assuming shared data not write-through-required nor + // caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b. + plwsync1 := s.Prog(ppc64.ALWSYNC) + plwsync1.To.Type = obj.TYPE_NONE + // LDAR or LWAR + p := s.Prog(ld) + p.From.Type = obj.TYPE_MEM + p.From.Reg = r0 + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REGTMP + // If it is a Compare-and-Swap-Release operation, set the EH field with + // the release hint. + if v.AuxInt == 0 { + p.SetFrom3Const(0) + } + // CMP reg1,reg2 + p1 := s.Prog(cmp) + p1.From.Type = obj.TYPE_REG + p1.From.Reg = r1 + p1.To.Reg = ppc64.REGTMP + p1.To.Type = obj.TYPE_REG + // BNE cas_fail + p2 := s.Prog(ppc64.ABNE) + p2.To.Type = obj.TYPE_BRANCH + // STDCCC or STWCCC + p3 := s.Prog(st) + p3.From.Type = obj.TYPE_REG + p3.From.Reg = r2 + p3.To.Type = obj.TYPE_MEM + p3.To.Reg = r0 + // BNE retry + p4 := s.Prog(ppc64.ABNE) + p4.To.Type = obj.TYPE_BRANCH + p4.To.SetTarget(p) + // LWSYNC - Assuming shared data not write-through-required nor + // caching-inhibited. See Appendix B.2.1.1 in the ISA 2.07b. + // If the operation is a CAS-Release, then synchronization is not necessary. + if v.AuxInt != 0 { + plwsync2 := s.Prog(ppc64.ALWSYNC) + plwsync2.To.Type = obj.TYPE_NONE + } + // return true + p5 := s.Prog(ppc64.AMOVD) + p5.From.Type = obj.TYPE_CONST + p5.From.Offset = 1 + p5.To.Type = obj.TYPE_REG + p5.To.Reg = out + // BR done + p6 := s.Prog(obj.AJMP) + p6.To.Type = obj.TYPE_BRANCH + // return false + p7 := s.Prog(ppc64.AMOVD) + p7.From.Type = obj.TYPE_CONST + p7.From.Offset = 0 + p7.To.Type = obj.TYPE_REG + p7.To.Reg = out + p2.To.SetTarget(p7) + // done (label) + p8 := s.Prog(obj.ANOP) + p6.To.SetTarget(p8) + + case ssa.OpPPC64LoweredGetClosurePtr: + // Closure pointer is R11 (already) + ssagen.CheckLoweredGetClosurePtr(v) + + case ssa.OpPPC64LoweredGetCallerSP: + // caller's SP is FixedFrameSize below the address of the first arg + p := s.Prog(ppc64.AMOVD) + p.From.Type = obj.TYPE_ADDR + p.From.Offset = -base.Ctxt.FixedFrameSize() + p.From.Name = obj.NAME_PARAM + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpPPC64LoweredGetCallerPC: + p := s.Prog(obj.AGETCALLERPC) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpPPC64LoweredRound32F, ssa.OpPPC64LoweredRound64F: + // input is already rounded + + case ssa.OpLoadReg: + loadOp := loadByType(v.Type) + p := s.Prog(loadOp) + ssagen.AddrAuto(&p.From, v.Args[0]) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpStoreReg: + storeOp := storeByType(v.Type) + p := s.Prog(storeOp) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + ssagen.AddrAuto(&p.To, v) + + case ssa.OpArgIntReg, ssa.OpArgFloatReg: + // The assembler needs to wrap the entry safepoint/stack growth code with spill/unspill + // The loop only runs once. + for _, a := range v.Block.Func.RegArgs { + // Pass the spill/unspill information along to the assembler, offset by size of + // the saved LR slot. + addr := ssagen.SpillSlotAddr(a, ppc64.REGSP, base.Ctxt.FixedFrameSize()) + s.FuncInfo().AddSpill( + obj.RegSpill{Reg: a.Reg, Addr: addr, Unspill: loadByType(a.Type), Spill: storeByType(a.Type)}) + } + v.Block.Func.RegArgs = nil + + ssagen.CheckArgReg(v) + + case ssa.OpPPC64DIVD: + // For now, + // + // cmp arg1, -1 + // be ahead + // v = arg0 / arg1 + // b over + // ahead: v = - arg0 + // over: nop + r := v.Reg() + r0 := v.Args[0].Reg() + r1 := v.Args[1].Reg() + + p := s.Prog(ppc64.ACMP) + p.From.Type = obj.TYPE_REG + p.From.Reg = r1 + p.To.Type = obj.TYPE_CONST + p.To.Offset = -1 + + pbahead := s.Prog(ppc64.ABEQ) + pbahead.To.Type = obj.TYPE_BRANCH + + p = s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r1 + p.Reg = r0 + p.To.Type = obj.TYPE_REG + p.To.Reg = r + + pbover := s.Prog(obj.AJMP) + pbover.To.Type = obj.TYPE_BRANCH + + p = s.Prog(ppc64.ANEG) + p.To.Type = obj.TYPE_REG + p.To.Reg = r + p.From.Type = obj.TYPE_REG + p.From.Reg = r0 + pbahead.To.SetTarget(p) + + p = s.Prog(obj.ANOP) + pbover.To.SetTarget(p) + + case ssa.OpPPC64DIVW: + // word-width version of above + r := v.Reg() + r0 := v.Args[0].Reg() + r1 := v.Args[1].Reg() + + p := s.Prog(ppc64.ACMPW) + p.From.Type = obj.TYPE_REG + p.From.Reg = r1 + p.To.Type = obj.TYPE_CONST + p.To.Offset = -1 + + pbahead := s.Prog(ppc64.ABEQ) + pbahead.To.Type = obj.TYPE_BRANCH + + p = s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r1 + p.Reg = r0 + p.To.Type = obj.TYPE_REG + p.To.Reg = r + + pbover := s.Prog(obj.AJMP) + pbover.To.Type = obj.TYPE_BRANCH + + p = s.Prog(ppc64.ANEG) + p.To.Type = obj.TYPE_REG + p.To.Reg = r + p.From.Type = obj.TYPE_REG + p.From.Reg = r0 + pbahead.To.SetTarget(p) + + p = s.Prog(obj.ANOP) + pbover.To.SetTarget(p) + + case ssa.OpPPC64CLRLSLWI: + r := v.Reg() + r1 := v.Args[0].Reg() + shifts := v.AuxInt + p := s.Prog(v.Op.Asm()) + // clrlslwi ra,rs,mb,sh will become rlwinm ra,rs,sh,mb-sh,31-sh as described in ISA + p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftmb(shifts)} + p.SetFrom3Const(ssa.GetPPC64Shiftsh(shifts)) + p.Reg = r1 + p.To.Type = obj.TYPE_REG + p.To.Reg = r + + case ssa.OpPPC64CLRLSLDI: + r := v.Reg() + r1 := v.Args[0].Reg() + shifts := v.AuxInt + p := s.Prog(v.Op.Asm()) + // clrlsldi ra,rs,mb,sh will become rldic ra,rs,sh,mb-sh + p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftmb(shifts)} + p.SetFrom3Const(ssa.GetPPC64Shiftsh(shifts)) + p.Reg = r1 + p.To.Type = obj.TYPE_REG + p.To.Reg = r + + // Mask has been set as sh + case ssa.OpPPC64RLDICL: + r := v.Reg() + r1 := v.Args[0].Reg() + shifts := v.AuxInt + p := s.Prog(v.Op.Asm()) + p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftsh(shifts)} + p.SetFrom3Const(ssa.GetPPC64Shiftmb(shifts)) + p.Reg = r1 + p.To.Type = obj.TYPE_REG + p.To.Reg = r + + case ssa.OpPPC64ADD, ssa.OpPPC64FADD, ssa.OpPPC64FADDS, ssa.OpPPC64SUB, ssa.OpPPC64FSUB, ssa.OpPPC64FSUBS, + ssa.OpPPC64MULLD, ssa.OpPPC64MULLW, ssa.OpPPC64DIVDU, ssa.OpPPC64DIVWU, + ssa.OpPPC64SRAD, ssa.OpPPC64SRAW, ssa.OpPPC64SRD, ssa.OpPPC64SRW, ssa.OpPPC64SLD, ssa.OpPPC64SLW, + ssa.OpPPC64ROTL, ssa.OpPPC64ROTLW, + ssa.OpPPC64MULHD, ssa.OpPPC64MULHW, ssa.OpPPC64MULHDU, ssa.OpPPC64MULHWU, + ssa.OpPPC64FMUL, ssa.OpPPC64FMULS, ssa.OpPPC64FDIV, ssa.OpPPC64FDIVS, ssa.OpPPC64FCPSGN, + ssa.OpPPC64AND, ssa.OpPPC64OR, ssa.OpPPC64ANDN, ssa.OpPPC64ORN, ssa.OpPPC64NOR, ssa.OpPPC64XOR, ssa.OpPPC64EQV, + ssa.OpPPC64MODUD, ssa.OpPPC64MODSD, ssa.OpPPC64MODUW, ssa.OpPPC64MODSW: + r := v.Reg() + r1 := v.Args[0].Reg() + r2 := v.Args[1].Reg() + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r2 + p.Reg = r1 + p.To.Type = obj.TYPE_REG + p.To.Reg = r + + case ssa.OpPPC64ANDCC, ssa.OpPPC64ORCC, ssa.OpPPC64XORCC: + r1 := v.Args[0].Reg() + r2 := v.Args[1].Reg() + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r2 + p.Reg = r1 + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REGTMP // result is not needed + + case ssa.OpPPC64ROTLconst, ssa.OpPPC64ROTLWconst: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST + p.From.Offset = v.AuxInt + p.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + // Auxint holds encoded rotate + mask + case ssa.OpPPC64RLWINM, ssa.OpPPC64RLWMI: + rot, mb, me, _ := ssa.DecodePPC64RotateMask(v.AuxInt) + p := s.Prog(v.Op.Asm()) + p.To = obj.Addr{Type: obj.TYPE_REG, Reg: v.Reg()} + p.Reg = v.Args[0].Reg() + p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: int64(rot)} + p.SetRestArgs([]obj.Addr{{Type: obj.TYPE_CONST, Offset: mb}, {Type: obj.TYPE_CONST, Offset: me}}) + + // Auxint holds mask + case ssa.OpPPC64RLWNM: + _, mb, me, _ := ssa.DecodePPC64RotateMask(v.AuxInt) + p := s.Prog(v.Op.Asm()) + p.To = obj.Addr{Type: obj.TYPE_REG, Reg: v.Reg()} + p.Reg = v.Args[0].Reg() + p.From = obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[1].Reg()} + p.SetRestArgs([]obj.Addr{{Type: obj.TYPE_CONST, Offset: mb}, {Type: obj.TYPE_CONST, Offset: me}}) + + case ssa.OpPPC64MADDLD: + r := v.Reg() + r1 := v.Args[0].Reg() + r2 := v.Args[1].Reg() + r3 := v.Args[2].Reg() + // r = r1*r2 ± r3 + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r1 + p.Reg = r2 + p.SetFrom3Reg(r3) + p.To.Type = obj.TYPE_REG + p.To.Reg = r + + case ssa.OpPPC64FMADD, ssa.OpPPC64FMADDS, ssa.OpPPC64FMSUB, ssa.OpPPC64FMSUBS: + r := v.Reg() + r1 := v.Args[0].Reg() + r2 := v.Args[1].Reg() + r3 := v.Args[2].Reg() + // r = r1*r2 ± r3 + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r1 + p.Reg = r3 + p.SetFrom3Reg(r2) + p.To.Type = obj.TYPE_REG + p.To.Reg = r + + case ssa.OpPPC64NEG, ssa.OpPPC64FNEG, ssa.OpPPC64FSQRT, ssa.OpPPC64FSQRTS, ssa.OpPPC64FFLOOR, ssa.OpPPC64FTRUNC, ssa.OpPPC64FCEIL, + ssa.OpPPC64FCTIDZ, ssa.OpPPC64FCTIWZ, ssa.OpPPC64FCFID, ssa.OpPPC64FCFIDS, ssa.OpPPC64FRSP, ssa.OpPPC64CNTLZD, ssa.OpPPC64CNTLZW, + ssa.OpPPC64POPCNTD, ssa.OpPPC64POPCNTW, ssa.OpPPC64POPCNTB, ssa.OpPPC64MFVSRD, ssa.OpPPC64MTVSRD, ssa.OpPPC64FABS, ssa.OpPPC64FNABS, + ssa.OpPPC64FROUND, ssa.OpPPC64CNTTZW, ssa.OpPPC64CNTTZD: + r := v.Reg() + p := s.Prog(v.Op.Asm()) + p.To.Type = obj.TYPE_REG + p.To.Reg = r + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + + case ssa.OpPPC64ADDconst, ssa.OpPPC64ANDconst, ssa.OpPPC64ORconst, ssa.OpPPC64XORconst, + ssa.OpPPC64SRADconst, ssa.OpPPC64SRAWconst, ssa.OpPPC64SRDconst, ssa.OpPPC64SRWconst, + ssa.OpPPC64SLDconst, ssa.OpPPC64SLWconst, ssa.OpPPC64EXTSWSLconst, ssa.OpPPC64MULLWconst, ssa.OpPPC64MULLDconst: + p := s.Prog(v.Op.Asm()) + p.Reg = v.Args[0].Reg() + p.From.Type = obj.TYPE_CONST + p.From.Offset = v.AuxInt + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpPPC64SUBFCconst: + p := s.Prog(v.Op.Asm()) + p.SetFrom3Const(v.AuxInt) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpPPC64ANDCCconst: + p := s.Prog(v.Op.Asm()) + p.Reg = v.Args[0].Reg() + p.From.Type = obj.TYPE_CONST + p.From.Offset = v.AuxInt + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REGTMP // discard result + + case ssa.OpPPC64MOVDaddr: + switch v.Aux.(type) { + default: + v.Fatalf("aux in MOVDaddr is of unknown type %T", v.Aux) + case nil: + // If aux offset and aux int are both 0, and the same + // input and output regs are used, no instruction + // needs to be generated, since it would just be + // addi rx, rx, 0. + if v.AuxInt != 0 || v.Args[0].Reg() != v.Reg() { + p := s.Prog(ppc64.AMOVD) + p.From.Type = obj.TYPE_ADDR + p.From.Reg = v.Args[0].Reg() + p.From.Offset = v.AuxInt + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + } + + case *obj.LSym, ir.Node: + p := s.Prog(ppc64.AMOVD) + p.From.Type = obj.TYPE_ADDR + p.From.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + ssagen.AddAux(&p.From, v) + + } + + case ssa.OpPPC64MOVDconst: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST + p.From.Offset = v.AuxInt + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpPPC64FMOVDconst, ssa.OpPPC64FMOVSconst: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_FCONST + p.From.Val = math.Float64frombits(uint64(v.AuxInt)) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpPPC64FCMPU, ssa.OpPPC64CMP, ssa.OpPPC64CMPW, ssa.OpPPC64CMPU, ssa.OpPPC64CMPWU: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Args[1].Reg() + + case ssa.OpPPC64CMPconst, ssa.OpPPC64CMPUconst, ssa.OpPPC64CMPWconst, ssa.OpPPC64CMPWUconst: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_CONST + p.To.Offset = v.AuxInt + + case ssa.OpPPC64MOVBreg, ssa.OpPPC64MOVBZreg, ssa.OpPPC64MOVHreg, ssa.OpPPC64MOVHZreg, ssa.OpPPC64MOVWreg, ssa.OpPPC64MOVWZreg: + // Shift in register to required size + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + p.To.Reg = v.Reg() + p.To.Type = obj.TYPE_REG + + case ssa.OpPPC64MOVDload, ssa.OpPPC64MOVWload: + + // MOVDload and MOVWload are DS form instructions that are restricted to + // offsets that are a multiple of 4. If the offset is not a multple of 4, + // then the address of the symbol to be loaded is computed (base + offset) + // and used as the new base register and the offset field in the instruction + // can be set to zero. + + // This same problem can happen with gostrings since the final offset is not + // known yet, but could be unaligned after the relocation is resolved. + // So gostrings are handled the same way. + + // This allows the MOVDload and MOVWload to be generated in more cases and + // eliminates some offset and alignment checking in the rules file. + + fromAddr := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[0].Reg()} + ssagen.AddAux(&fromAddr, v) + + genAddr := false + + switch fromAddr.Name { + case obj.NAME_EXTERN, obj.NAME_STATIC: + // Special case for a rule combines the bytes of gostring. + // The v alignment might seem OK, but we don't want to load it + // using an offset because relocation comes later. + genAddr = strings.HasPrefix(fromAddr.Sym.Name, "go.string") || v.Type.Alignment()%4 != 0 || fromAddr.Offset%4 != 0 + default: + genAddr = fromAddr.Offset%4 != 0 + } + if genAddr { + // Load full address into the temp register. + p := s.Prog(ppc64.AMOVD) + p.From.Type = obj.TYPE_ADDR + p.From.Reg = v.Args[0].Reg() + ssagen.AddAux(&p.From, v) + // Load target using temp as base register + // and offset zero. Setting NAME_NONE + // prevents any extra offsets from being + // added. + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REGTMP + fromAddr.Reg = ppc64.REGTMP + // Clear the offset field and other + // information that might be used + // by the assembler to add to the + // final offset value. + fromAddr.Offset = 0 + fromAddr.Name = obj.NAME_NONE + fromAddr.Sym = nil + } + p := s.Prog(v.Op.Asm()) + p.From = fromAddr + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + break + + case ssa.OpPPC64MOVHload, ssa.OpPPC64MOVWZload, ssa.OpPPC64MOVBZload, ssa.OpPPC64MOVHZload, ssa.OpPPC64FMOVDload, ssa.OpPPC64FMOVSload: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_MEM + p.From.Reg = v.Args[0].Reg() + ssagen.AddAux(&p.From, v) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpPPC64MOVDBRload, ssa.OpPPC64MOVWBRload, ssa.OpPPC64MOVHBRload: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_MEM + p.From.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpPPC64MOVDBRstore, ssa.OpPPC64MOVWBRstore, ssa.OpPPC64MOVHBRstore: + p := s.Prog(v.Op.Asm()) + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[1].Reg() + + case ssa.OpPPC64MOVDloadidx, ssa.OpPPC64MOVWloadidx, ssa.OpPPC64MOVHloadidx, ssa.OpPPC64MOVWZloadidx, + ssa.OpPPC64MOVBZloadidx, ssa.OpPPC64MOVHZloadidx, ssa.OpPPC64FMOVDloadidx, ssa.OpPPC64FMOVSloadidx, + ssa.OpPPC64MOVDBRloadidx, ssa.OpPPC64MOVWBRloadidx, ssa.OpPPC64MOVHBRloadidx: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_MEM + p.From.Reg = v.Args[0].Reg() + p.From.Index = v.Args[1].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + case ssa.OpPPC64DCBT: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_MEM + p.From.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_CONST + p.To.Offset = v.AuxInt + + case ssa.OpPPC64MOVWstorezero, ssa.OpPPC64MOVHstorezero, ssa.OpPPC64MOVBstorezero: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REGZERO + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + ssagen.AddAux(&p.To, v) + + case ssa.OpPPC64MOVDstore, ssa.OpPPC64MOVDstorezero: + + // MOVDstore and MOVDstorezero become DS form instructions that are restricted + // to offset values that are a multple of 4. If the offset field is not a + // multiple of 4, then the full address of the store target is computed (base + + // offset) and used as the new base register and the offset in the instruction + // is set to 0. + + // This allows the MOVDstore and MOVDstorezero to be generated in more cases, + // and prevents checking of the offset value and alignment in the rules. + + toAddr := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[0].Reg()} + ssagen.AddAux(&toAddr, v) + + if toAddr.Offset%4 != 0 { + p := s.Prog(ppc64.AMOVD) + p.From.Type = obj.TYPE_ADDR + p.From.Reg = v.Args[0].Reg() + ssagen.AddAux(&p.From, v) + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REGTMP + toAddr.Reg = ppc64.REGTMP + // Clear the offset field and other + // information that might be used + // by the assembler to add to the + // final offset value. + toAddr.Offset = 0 + toAddr.Name = obj.NAME_NONE + toAddr.Sym = nil + } + p := s.Prog(v.Op.Asm()) + p.To = toAddr + p.From.Type = obj.TYPE_REG + if v.Op == ssa.OpPPC64MOVDstorezero { + p.From.Reg = ppc64.REGZERO + } else { + p.From.Reg = v.Args[1].Reg() + } + + case ssa.OpPPC64MOVWstore, ssa.OpPPC64MOVHstore, ssa.OpPPC64MOVBstore, ssa.OpPPC64FMOVDstore, ssa.OpPPC64FMOVSstore: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[1].Reg() + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + ssagen.AddAux(&p.To, v) + + case ssa.OpPPC64MOVDstoreidx, ssa.OpPPC64MOVWstoreidx, ssa.OpPPC64MOVHstoreidx, ssa.OpPPC64MOVBstoreidx, + ssa.OpPPC64FMOVDstoreidx, ssa.OpPPC64FMOVSstoreidx, ssa.OpPPC64MOVDBRstoreidx, ssa.OpPPC64MOVWBRstoreidx, + ssa.OpPPC64MOVHBRstoreidx: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[2].Reg() + p.To.Index = v.Args[1].Reg() + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + + case ssa.OpPPC64ISEL, ssa.OpPPC64ISELB: + // ISEL, ISELB + // AuxInt value indicates condition: 0=LT 1=GT 2=EQ 4=GE 5=LE 6=NE + // ISEL only accepts 0, 1, 2 condition values but the others can be + // achieved by swapping operand order. + // arg0 ? arg1 : arg2 with conditions LT, GT, EQ + // arg0 ? arg2 : arg1 for conditions GE, LE, NE + // ISELB is used when a boolean result is needed, returning 0 or 1 + p := s.Prog(ppc64.AISEL) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + // For ISELB, boolean result 0 or 1. Use R0 for 0 operand to avoid load. + r := obj.Addr{Type: obj.TYPE_REG, Reg: ppc64.REG_R0} + if v.Op == ssa.OpPPC64ISEL { + r.Reg = v.Args[1].Reg() + } + // AuxInt values 4,5,6 implemented with reverse operand order from 0,1,2 + if v.AuxInt > 3 { + p.Reg = r.Reg + p.SetFrom3Reg(v.Args[0].Reg()) + } else { + p.Reg = v.Args[0].Reg() + p.SetFrom3(r) + } + p.From.Type = obj.TYPE_CONST + p.From.Offset = v.AuxInt & 3 + + case ssa.OpPPC64LoweredQuadZero, ssa.OpPPC64LoweredQuadZeroShort: + // The LoweredQuad code generation + // generates STXV instructions on + // power9. The Short variation is used + // if no loop is generated. + + // sizes >= 64 generate a loop as follows: + + // Set up loop counter in CTR, used by BC + // XXLXOR clears VS32 + // XXLXOR VS32,VS32,VS32 + // MOVD len/64,REG_TMP + // MOVD REG_TMP,CTR + // loop: + // STXV VS32,0(R20) + // STXV VS32,16(R20) + // STXV VS32,32(R20) + // STXV VS32,48(R20) + // ADD $64,R20 + // BC 16, 0, loop + + // Bytes per iteration + ctr := v.AuxInt / 64 + + // Remainder bytes + rem := v.AuxInt % 64 + + // Only generate a loop if there is more + // than 1 iteration. + if ctr > 1 { + // Set up VS32 (V0) to hold 0s + p := s.Prog(ppc64.AXXLXOR) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS32 + p.Reg = ppc64.REG_VS32 + + // Set up CTR loop counter + p = s.Prog(ppc64.AMOVD) + p.From.Type = obj.TYPE_CONST + p.From.Offset = ctr + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REGTMP + + p = s.Prog(ppc64.AMOVD) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REGTMP + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_CTR + + // Don't generate padding for + // loops with few iterations. + if ctr > 3 { + p = s.Prog(obj.APCALIGN) + p.From.Type = obj.TYPE_CONST + p.From.Offset = 16 + } + + // generate 4 STXVs to zero 64 bytes + var top *obj.Prog + + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + + // Save the top of loop + if top == nil { + top = p + } + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + p.To.Offset = 16 + + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + p.To.Offset = 32 + + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + p.To.Offset = 48 + + // Increment address for the + // 64 bytes just zeroed. + p = s.Prog(ppc64.AADD) + p.Reg = v.Args[0].Reg() + p.From.Type = obj.TYPE_CONST + p.From.Offset = 64 + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Args[0].Reg() + + // Branch back to top of loop + // based on CTR + // BC with BO_BCTR generates bdnz + p = s.Prog(ppc64.ABC) + p.From.Type = obj.TYPE_CONST + p.From.Offset = ppc64.BO_BCTR + p.Reg = ppc64.REG_R0 + p.To.Type = obj.TYPE_BRANCH + p.To.SetTarget(top) + } + // When ctr == 1 the loop was not generated but + // there are at least 64 bytes to clear, so add + // that to the remainder to generate the code + // to clear those doublewords + if ctr == 1 { + rem += 64 + } + + // Clear the remainder starting at offset zero + offset := int64(0) + + if rem >= 16 && ctr <= 1 { + // If the XXLXOR hasn't already been + // generated, do it here to initialize + // VS32 (V0) to 0. + p := s.Prog(ppc64.AXXLXOR) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS32 + p.Reg = ppc64.REG_VS32 + } + // Generate STXV for 32 or 64 + // bytes. + for rem >= 32 { + p := s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + p.To.Offset = offset + + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + p.To.Offset = offset + 16 + offset += 32 + rem -= 32 + } + // Generate 16 bytes + if rem >= 16 { + p := s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + p.To.Offset = offset + offset += 16 + rem -= 16 + } + + // first clear as many doublewords as possible + // then clear remaining sizes as available + for rem > 0 { + op, size := ppc64.AMOVB, int64(1) + switch { + case rem >= 8: + op, size = ppc64.AMOVD, 8 + case rem >= 4: + op, size = ppc64.AMOVW, 4 + case rem >= 2: + op, size = ppc64.AMOVH, 2 + } + p := s.Prog(op) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_R0 + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + p.To.Offset = offset + rem -= size + offset += size + } + + case ssa.OpPPC64LoweredZero, ssa.OpPPC64LoweredZeroShort: + + // Unaligned data doesn't hurt performance + // for these instructions on power8. + + // For sizes >= 64 generate a loop as follows: + + // Set up loop counter in CTR, used by BC + // XXLXOR VS32,VS32,VS32 + // MOVD len/32,REG_TMP + // MOVD REG_TMP,CTR + // MOVD $16,REG_TMP + // loop: + // STXVD2X VS32,(R0)(R20) + // STXVD2X VS32,(R31)(R20) + // ADD $32,R20 + // BC 16, 0, loop + // + // any remainder is done as described below + + // for sizes < 64 bytes, first clear as many doublewords as possible, + // then handle the remainder + // MOVD R0,(R20) + // MOVD R0,8(R20) + // .... etc. + // + // the remainder bytes are cleared using one or more + // of the following instructions with the appropriate + // offsets depending which instructions are needed + // + // MOVW R0,n1(R20) 4 bytes + // MOVH R0,n2(R20) 2 bytes + // MOVB R0,n3(R20) 1 byte + // + // 7 bytes: MOVW, MOVH, MOVB + // 6 bytes: MOVW, MOVH + // 5 bytes: MOVW, MOVB + // 3 bytes: MOVH, MOVB + + // each loop iteration does 32 bytes + ctr := v.AuxInt / 32 + + // remainder bytes + rem := v.AuxInt % 32 + + // only generate a loop if there is more + // than 1 iteration. + if ctr > 1 { + // Set up VS32 (V0) to hold 0s + p := s.Prog(ppc64.AXXLXOR) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS32 + p.Reg = ppc64.REG_VS32 + + // Set up CTR loop counter + p = s.Prog(ppc64.AMOVD) + p.From.Type = obj.TYPE_CONST + p.From.Offset = ctr + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REGTMP + + p = s.Prog(ppc64.AMOVD) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REGTMP + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_CTR + + // Set up R31 to hold index value 16 + p = s.Prog(ppc64.AMOVD) + p.From.Type = obj.TYPE_CONST + p.From.Offset = 16 + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REGTMP + + // Don't add padding for alignment + // with few loop iterations. + if ctr > 3 { + p = s.Prog(obj.APCALIGN) + p.From.Type = obj.TYPE_CONST + p.From.Offset = 16 + } + + // generate 2 STXVD2Xs to store 16 bytes + // when this is a loop then the top must be saved + var top *obj.Prog + // This is the top of loop + + p = s.Prog(ppc64.ASTXVD2X) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + p.To.Index = ppc64.REGZERO + // Save the top of loop + if top == nil { + top = p + } + p = s.Prog(ppc64.ASTXVD2X) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + p.To.Index = ppc64.REGTMP + + // Increment address for the + // 4 doublewords just zeroed. + p = s.Prog(ppc64.AADD) + p.Reg = v.Args[0].Reg() + p.From.Type = obj.TYPE_CONST + p.From.Offset = 32 + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Args[0].Reg() + + // Branch back to top of loop + // based on CTR + // BC with BO_BCTR generates bdnz + p = s.Prog(ppc64.ABC) + p.From.Type = obj.TYPE_CONST + p.From.Offset = ppc64.BO_BCTR + p.Reg = ppc64.REG_R0 + p.To.Type = obj.TYPE_BRANCH + p.To.SetTarget(top) + } + + // when ctr == 1 the loop was not generated but + // there are at least 32 bytes to clear, so add + // that to the remainder to generate the code + // to clear those doublewords + if ctr == 1 { + rem += 32 + } + + // clear the remainder starting at offset zero + offset := int64(0) + + // first clear as many doublewords as possible + // then clear remaining sizes as available + for rem > 0 { + op, size := ppc64.AMOVB, int64(1) + switch { + case rem >= 8: + op, size = ppc64.AMOVD, 8 + case rem >= 4: + op, size = ppc64.AMOVW, 4 + case rem >= 2: + op, size = ppc64.AMOVH, 2 + } + p := s.Prog(op) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_R0 + p.To.Type = obj.TYPE_MEM + p.To.Reg = v.Args[0].Reg() + p.To.Offset = offset + rem -= size + offset += size + } + + case ssa.OpPPC64LoweredMove, ssa.OpPPC64LoweredMoveShort: + + bytesPerLoop := int64(32) + // This will be used when moving more + // than 8 bytes. Moves start with + // as many 8 byte moves as possible, then + // 4, 2, or 1 byte(s) as remaining. This will + // work and be efficient for power8 or later. + // If there are 64 or more bytes, then a + // loop is generated to move 32 bytes and + // update the src and dst addresses on each + // iteration. When < 64 bytes, the appropriate + // number of moves are generated based on the + // size. + // When moving >= 64 bytes a loop is used + // MOVD len/32,REG_TMP + // MOVD REG_TMP,CTR + // MOVD $16,REG_TMP + // top: + // LXVD2X (R0)(R21),VS32 + // LXVD2X (R31)(R21),VS33 + // ADD $32,R21 + // STXVD2X VS32,(R0)(R20) + // STXVD2X VS33,(R31)(R20) + // ADD $32,R20 + // BC 16,0,top + // Bytes not moved by this loop are moved + // with a combination of the following instructions, + // starting with the largest sizes and generating as + // many as needed, using the appropriate offset value. + // MOVD n(R21),R31 + // MOVD R31,n(R20) + // MOVW n1(R21),R31 + // MOVW R31,n1(R20) + // MOVH n2(R21),R31 + // MOVH R31,n2(R20) + // MOVB n3(R21),R31 + // MOVB R31,n3(R20) + + // Each loop iteration moves 32 bytes + ctr := v.AuxInt / bytesPerLoop + + // Remainder after the loop + rem := v.AuxInt % bytesPerLoop + + dstReg := v.Args[0].Reg() + srcReg := v.Args[1].Reg() + + // The set of registers used here, must match the clobbered reg list + // in PPC64Ops.go. + offset := int64(0) + + // top of the loop + var top *obj.Prog + // Only generate looping code when loop counter is > 1 for >= 64 bytes + if ctr > 1 { + // Set up the CTR + p := s.Prog(ppc64.AMOVD) + p.From.Type = obj.TYPE_CONST + p.From.Offset = ctr + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REGTMP + + p = s.Prog(ppc64.AMOVD) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REGTMP + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_CTR + + // Use REGTMP as index reg + p = s.Prog(ppc64.AMOVD) + p.From.Type = obj.TYPE_CONST + p.From.Offset = 16 + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REGTMP + + // Don't adding padding for + // alignment with small iteration + // counts. + if ctr > 3 { + p = s.Prog(obj.APCALIGN) + p.From.Type = obj.TYPE_CONST + p.From.Offset = 16 + } + + // Generate 16 byte loads and stores. + // Use temp register for index (16) + // on the second one. + + p = s.Prog(ppc64.ALXVD2X) + p.From.Type = obj.TYPE_MEM + p.From.Reg = srcReg + p.From.Index = ppc64.REGZERO + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS32 + if top == nil { + top = p + } + p = s.Prog(ppc64.ALXVD2X) + p.From.Type = obj.TYPE_MEM + p.From.Reg = srcReg + p.From.Index = ppc64.REGTMP + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS33 + + // increment the src reg for next iteration + p = s.Prog(ppc64.AADD) + p.Reg = srcReg + p.From.Type = obj.TYPE_CONST + p.From.Offset = bytesPerLoop + p.To.Type = obj.TYPE_REG + p.To.Reg = srcReg + + // generate 16 byte stores + p = s.Prog(ppc64.ASTXVD2X) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = dstReg + p.To.Index = ppc64.REGZERO + + p = s.Prog(ppc64.ASTXVD2X) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS33 + p.To.Type = obj.TYPE_MEM + p.To.Reg = dstReg + p.To.Index = ppc64.REGTMP + + // increment the dst reg for next iteration + p = s.Prog(ppc64.AADD) + p.Reg = dstReg + p.From.Type = obj.TYPE_CONST + p.From.Offset = bytesPerLoop + p.To.Type = obj.TYPE_REG + p.To.Reg = dstReg + + // BC with BO_BCTR generates bdnz to branch on nonzero CTR + // to loop top. + p = s.Prog(ppc64.ABC) + p.From.Type = obj.TYPE_CONST + p.From.Offset = ppc64.BO_BCTR + p.Reg = ppc64.REG_R0 + p.To.Type = obj.TYPE_BRANCH + p.To.SetTarget(top) + + // srcReg and dstReg were incremented in the loop, so + // later instructions start with offset 0. + offset = int64(0) + } + + // No loop was generated for one iteration, so + // add 32 bytes to the remainder to move those bytes. + if ctr == 1 { + rem += bytesPerLoop + } + + if rem >= 16 { + // Generate 16 byte loads and stores. + // Use temp register for index (value 16) + // on the second one. + p := s.Prog(ppc64.ALXVD2X) + p.From.Type = obj.TYPE_MEM + p.From.Reg = srcReg + p.From.Index = ppc64.REGZERO + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS32 + + p = s.Prog(ppc64.ASTXVD2X) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = dstReg + p.To.Index = ppc64.REGZERO + + offset = 16 + rem -= 16 + + if rem >= 16 { + // Use REGTMP as index reg + p := s.Prog(ppc64.AMOVD) + p.From.Type = obj.TYPE_CONST + p.From.Offset = 16 + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REGTMP + + p = s.Prog(ppc64.ALXVD2X) + p.From.Type = obj.TYPE_MEM + p.From.Reg = srcReg + p.From.Index = ppc64.REGTMP + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS32 + + p = s.Prog(ppc64.ASTXVD2X) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = dstReg + p.To.Index = ppc64.REGTMP + + offset = 32 + rem -= 16 + } + } + + // Generate all the remaining load and store pairs, starting with + // as many 8 byte moves as possible, then 4, 2, 1. + for rem > 0 { + op, size := ppc64.AMOVB, int64(1) + switch { + case rem >= 8: + op, size = ppc64.AMOVD, 8 + case rem >= 4: + op, size = ppc64.AMOVWZ, 4 + case rem >= 2: + op, size = ppc64.AMOVH, 2 + } + // Load + p := s.Prog(op) + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REGTMP + p.From.Type = obj.TYPE_MEM + p.From.Reg = srcReg + p.From.Offset = offset + + // Store + p = s.Prog(op) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REGTMP + p.To.Type = obj.TYPE_MEM + p.To.Reg = dstReg + p.To.Offset = offset + rem -= size + offset += size + } + + case ssa.OpPPC64LoweredQuadMove, ssa.OpPPC64LoweredQuadMoveShort: + bytesPerLoop := int64(64) + // This is used when moving more + // than 8 bytes on power9. Moves start with + // as many 8 byte moves as possible, then + // 4, 2, or 1 byte(s) as remaining. This will + // work and be efficient for power8 or later. + // If there are 64 or more bytes, then a + // loop is generated to move 32 bytes and + // update the src and dst addresses on each + // iteration. When < 64 bytes, the appropriate + // number of moves are generated based on the + // size. + // When moving >= 64 bytes a loop is used + // MOVD len/32,REG_TMP + // MOVD REG_TMP,CTR + // top: + // LXV 0(R21),VS32 + // LXV 16(R21),VS33 + // ADD $32,R21 + // STXV VS32,0(R20) + // STXV VS33,16(R20) + // ADD $32,R20 + // BC 16,0,top + // Bytes not moved by this loop are moved + // with a combination of the following instructions, + // starting with the largest sizes and generating as + // many as needed, using the appropriate offset value. + // MOVD n(R21),R31 + // MOVD R31,n(R20) + // MOVW n1(R21),R31 + // MOVW R31,n1(R20) + // MOVH n2(R21),R31 + // MOVH R31,n2(R20) + // MOVB n3(R21),R31 + // MOVB R31,n3(R20) + + // Each loop iteration moves 32 bytes + ctr := v.AuxInt / bytesPerLoop + + // Remainder after the loop + rem := v.AuxInt % bytesPerLoop + + dstReg := v.Args[0].Reg() + srcReg := v.Args[1].Reg() + + offset := int64(0) + + // top of the loop + var top *obj.Prog + + // Only generate looping code when loop counter is > 1 for >= 64 bytes + if ctr > 1 { + // Set up the CTR + p := s.Prog(ppc64.AMOVD) + p.From.Type = obj.TYPE_CONST + p.From.Offset = ctr + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REGTMP + + p = s.Prog(ppc64.AMOVD) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REGTMP + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_CTR + + p = s.Prog(obj.APCALIGN) + p.From.Type = obj.TYPE_CONST + p.From.Offset = 16 + + // Generate 16 byte loads and stores. + p = s.Prog(ppc64.ALXV) + p.From.Type = obj.TYPE_MEM + p.From.Reg = srcReg + p.From.Offset = offset + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS32 + if top == nil { + top = p + } + p = s.Prog(ppc64.ALXV) + p.From.Type = obj.TYPE_MEM + p.From.Reg = srcReg + p.From.Offset = offset + 16 + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS33 + + // generate 16 byte stores + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = dstReg + p.To.Offset = offset + + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS33 + p.To.Type = obj.TYPE_MEM + p.To.Reg = dstReg + p.To.Offset = offset + 16 + + // Generate 16 byte loads and stores. + p = s.Prog(ppc64.ALXV) + p.From.Type = obj.TYPE_MEM + p.From.Reg = srcReg + p.From.Offset = offset + 32 + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS32 + + p = s.Prog(ppc64.ALXV) + p.From.Type = obj.TYPE_MEM + p.From.Reg = srcReg + p.From.Offset = offset + 48 + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS33 + + // generate 16 byte stores + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = dstReg + p.To.Offset = offset + 32 + + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS33 + p.To.Type = obj.TYPE_MEM + p.To.Reg = dstReg + p.To.Offset = offset + 48 + + // increment the src reg for next iteration + p = s.Prog(ppc64.AADD) + p.Reg = srcReg + p.From.Type = obj.TYPE_CONST + p.From.Offset = bytesPerLoop + p.To.Type = obj.TYPE_REG + p.To.Reg = srcReg + + // increment the dst reg for next iteration + p = s.Prog(ppc64.AADD) + p.Reg = dstReg + p.From.Type = obj.TYPE_CONST + p.From.Offset = bytesPerLoop + p.To.Type = obj.TYPE_REG + p.To.Reg = dstReg + + // BC with BO_BCTR generates bdnz to branch on nonzero CTR + // to loop top. + p = s.Prog(ppc64.ABC) + p.From.Type = obj.TYPE_CONST + p.From.Offset = ppc64.BO_BCTR + p.Reg = ppc64.REG_R0 + p.To.Type = obj.TYPE_BRANCH + p.To.SetTarget(top) + + // srcReg and dstReg were incremented in the loop, so + // later instructions start with offset 0. + offset = int64(0) + } + + // No loop was generated for one iteration, so + // add 32 bytes to the remainder to move those bytes. + if ctr == 1 { + rem += bytesPerLoop + } + if rem >= 32 { + p := s.Prog(ppc64.ALXV) + p.From.Type = obj.TYPE_MEM + p.From.Reg = srcReg + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS32 + + p = s.Prog(ppc64.ALXV) + p.From.Type = obj.TYPE_MEM + p.From.Reg = srcReg + p.From.Offset = 16 + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS33 + + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = dstReg + + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS33 + p.To.Type = obj.TYPE_MEM + p.To.Reg = dstReg + p.To.Offset = 16 + + offset = 32 + rem -= 32 + } + + if rem >= 16 { + // Generate 16 byte loads and stores. + p := s.Prog(ppc64.ALXV) + p.From.Type = obj.TYPE_MEM + p.From.Reg = srcReg + p.From.Offset = offset + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS32 + + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = dstReg + p.To.Offset = offset + + offset += 16 + rem -= 16 + + if rem >= 16 { + p := s.Prog(ppc64.ALXV) + p.From.Type = obj.TYPE_MEM + p.From.Reg = srcReg + p.From.Offset = offset + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_VS32 + + p = s.Prog(ppc64.ASTXV) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_VS32 + p.To.Type = obj.TYPE_MEM + p.To.Reg = dstReg + p.To.Offset = offset + + offset += 16 + rem -= 16 + } + } + // Generate all the remaining load and store pairs, starting with + // as many 8 byte moves as possible, then 4, 2, 1. + for rem > 0 { + op, size := ppc64.AMOVB, int64(1) + switch { + case rem >= 8: + op, size = ppc64.AMOVD, 8 + case rem >= 4: + op, size = ppc64.AMOVWZ, 4 + case rem >= 2: + op, size = ppc64.AMOVH, 2 + } + // Load + p := s.Prog(op) + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REGTMP + p.From.Type = obj.TYPE_MEM + p.From.Reg = srcReg + p.From.Offset = offset + + // Store + p = s.Prog(op) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REGTMP + p.To.Type = obj.TYPE_MEM + p.To.Reg = dstReg + p.To.Offset = offset + rem -= size + offset += size + } + + case ssa.OpPPC64CALLstatic: + s.Call(v) + + case ssa.OpPPC64CALLtail: + s.TailCall(v) + + case ssa.OpPPC64CALLclosure, ssa.OpPPC64CALLinter: + p := s.Prog(ppc64.AMOVD) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_LR + + if v.Args[0].Reg() != ppc64.REG_R12 { + v.Fatalf("Function address for %v should be in R12 %d but is in %d", v.LongString(), ppc64.REG_R12, p.From.Reg) + } + + pp := s.Call(v) + pp.To.Reg = ppc64.REG_LR + + // Insert a hint this is not a subroutine return. + pp.SetFrom3Const(1) + + if base.Ctxt.Flag_shared { + // When compiling Go into PIC, the function we just + // called via pointer might have been implemented in + // a separate module and so overwritten the TOC + // pointer in R2; reload it. + q := s.Prog(ppc64.AMOVD) + q.From.Type = obj.TYPE_MEM + q.From.Offset = 24 + q.From.Reg = ppc64.REGSP + q.To.Type = obj.TYPE_REG + q.To.Reg = ppc64.REG_R2 + } + + case ssa.OpPPC64LoweredWB: + p := s.Prog(obj.ACALL) + p.To.Type = obj.TYPE_MEM + p.To.Name = obj.NAME_EXTERN + p.To.Sym = v.Aux.(*obj.LSym) + + case ssa.OpPPC64LoweredPanicBoundsA, ssa.OpPPC64LoweredPanicBoundsB, ssa.OpPPC64LoweredPanicBoundsC: + p := s.Prog(obj.ACALL) + p.To.Type = obj.TYPE_MEM + p.To.Name = obj.NAME_EXTERN + p.To.Sym = ssagen.BoundsCheckFunc[v.AuxInt] + s.UseArgs(16) // space used in callee args area by assembly stubs + + case ssa.OpPPC64LoweredNilCheck: + if buildcfg.GOOS == "aix" { + // CMP Rarg0, R0 + // BNE 2(PC) + // STW R0, 0(R0) + // NOP (so the BNE has somewhere to land) + + // CMP Rarg0, R0 + p := s.Prog(ppc64.ACMP) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_R0 + + // BNE 2(PC) + p2 := s.Prog(ppc64.ABNE) + p2.To.Type = obj.TYPE_BRANCH + + // STW R0, 0(R0) + // Write at 0 is forbidden and will trigger a SIGSEGV + p = s.Prog(ppc64.AMOVW) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_R0 + p.To.Type = obj.TYPE_MEM + p.To.Reg = ppc64.REG_R0 + + // NOP (so the BNE has somewhere to land) + nop := s.Prog(obj.ANOP) + p2.To.SetTarget(nop) + + } else { + // Issue a load which will fault if arg is nil. + p := s.Prog(ppc64.AMOVBZ) + p.From.Type = obj.TYPE_MEM + p.From.Reg = v.Args[0].Reg() + ssagen.AddAux(&p.From, v) + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REGTMP + } + if logopt.Enabled() { + logopt.LogOpt(v.Pos, "nilcheck", "genssa", v.Block.Func.Name) + } + if base.Debug.Nil != 0 && v.Pos.Line() > 1 { // v.Pos.Line()==1 in generated wrappers + base.WarnfAt(v.Pos, "generated nil check") + } + + // These should be resolved by rules and not make it here. + case ssa.OpPPC64Equal, ssa.OpPPC64NotEqual, ssa.OpPPC64LessThan, ssa.OpPPC64FLessThan, + ssa.OpPPC64LessEqual, ssa.OpPPC64GreaterThan, ssa.OpPPC64FGreaterThan, ssa.OpPPC64GreaterEqual, + ssa.OpPPC64FLessEqual, ssa.OpPPC64FGreaterEqual: + v.Fatalf("Pseudo-op should not make it to codegen: %s ###\n", v.LongString()) + case ssa.OpPPC64InvertFlags: + v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString()) + case ssa.OpPPC64FlagEQ, ssa.OpPPC64FlagLT, ssa.OpPPC64FlagGT: + v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString()) + case ssa.OpClobber, ssa.OpClobberReg: + // TODO: implement for clobberdead experiment. Nop is ok for now. + default: + v.Fatalf("genValue not implemented: %s", v.LongString()) + } +} + +var blockJump = [...]struct { + asm, invasm obj.As + asmeq, invasmun bool +}{ + ssa.BlockPPC64EQ: {ppc64.ABEQ, ppc64.ABNE, false, false}, + ssa.BlockPPC64NE: {ppc64.ABNE, ppc64.ABEQ, false, false}, + + ssa.BlockPPC64LT: {ppc64.ABLT, ppc64.ABGE, false, false}, + ssa.BlockPPC64GE: {ppc64.ABGE, ppc64.ABLT, false, false}, + ssa.BlockPPC64LE: {ppc64.ABLE, ppc64.ABGT, false, false}, + ssa.BlockPPC64GT: {ppc64.ABGT, ppc64.ABLE, false, false}, + + // TODO: need to work FP comparisons into block jumps + ssa.BlockPPC64FLT: {ppc64.ABLT, ppc64.ABGE, false, false}, + ssa.BlockPPC64FGE: {ppc64.ABGT, ppc64.ABLT, true, true}, // GE = GT or EQ; !GE = LT or UN + ssa.BlockPPC64FLE: {ppc64.ABLT, ppc64.ABGT, true, true}, // LE = LT or EQ; !LE = GT or UN + ssa.BlockPPC64FGT: {ppc64.ABGT, ppc64.ABLE, false, false}, +} + +func ssaGenBlock(s *ssagen.State, b, next *ssa.Block) { + switch b.Kind { + case ssa.BlockDefer: + // defer returns in R3: + // 0 if we should continue executing + // 1 if we should jump to deferreturn call + p := s.Prog(ppc64.ACMP) + p.From.Type = obj.TYPE_REG + p.From.Reg = ppc64.REG_R3 + p.To.Type = obj.TYPE_REG + p.To.Reg = ppc64.REG_R0 + + p = s.Prog(ppc64.ABNE) + p.To.Type = obj.TYPE_BRANCH + s.Branches = append(s.Branches, ssagen.Branch{P: p, B: b.Succs[1].Block()}) + if b.Succs[0].Block() != next { + p := s.Prog(obj.AJMP) + p.To.Type = obj.TYPE_BRANCH + s.Branches = append(s.Branches, ssagen.Branch{P: p, B: b.Succs[0].Block()}) + } + + case ssa.BlockPlain: + if b.Succs[0].Block() != next { + p := s.Prog(obj.AJMP) + p.To.Type = obj.TYPE_BRANCH + s.Branches = append(s.Branches, ssagen.Branch{P: p, B: b.Succs[0].Block()}) + } + case ssa.BlockExit, ssa.BlockRetJmp: + case ssa.BlockRet: + s.Prog(obj.ARET) + + case ssa.BlockPPC64EQ, ssa.BlockPPC64NE, + ssa.BlockPPC64LT, ssa.BlockPPC64GE, + ssa.BlockPPC64LE, ssa.BlockPPC64GT, + ssa.BlockPPC64FLT, ssa.BlockPPC64FGE, + ssa.BlockPPC64FLE, ssa.BlockPPC64FGT: + jmp := blockJump[b.Kind] + switch next { + case b.Succs[0].Block(): + s.Br(jmp.invasm, b.Succs[1].Block()) + if jmp.invasmun { + // TODO: The second branch is probably predict-not-taken since it is for FP unordered + s.Br(ppc64.ABVS, b.Succs[1].Block()) + } + case b.Succs[1].Block(): + s.Br(jmp.asm, b.Succs[0].Block()) + if jmp.asmeq { + s.Br(ppc64.ABEQ, b.Succs[0].Block()) + } + default: + if b.Likely != ssa.BranchUnlikely { + s.Br(jmp.asm, b.Succs[0].Block()) + if jmp.asmeq { + s.Br(ppc64.ABEQ, b.Succs[0].Block()) + } + s.Br(obj.AJMP, b.Succs[1].Block()) + } else { + s.Br(jmp.invasm, b.Succs[1].Block()) + if jmp.invasmun { + // TODO: The second branch is probably predict-not-taken since it is for FP unordered + s.Br(ppc64.ABVS, b.Succs[1].Block()) + } + s.Br(obj.AJMP, b.Succs[0].Block()) + } + } + default: + b.Fatalf("branch not implemented: %s", b.LongString()) + } +} + +func loadRegResult(s *ssagen.State, f *ssa.Func, t *types.Type, reg int16, n *ir.Name, off int64) *obj.Prog { + p := s.Prog(loadByType(t)) + p.From.Type = obj.TYPE_MEM + p.From.Name = obj.NAME_AUTO + p.From.Sym = n.Linksym() + p.From.Offset = n.FrameOffset() + off + p.To.Type = obj.TYPE_REG + p.To.Reg = reg + return p +} + +func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg int16, n *ir.Name, off int64) *obj.Prog { + p = pp.Append(p, storeByType(t), obj.TYPE_REG, reg, 0, obj.TYPE_MEM, 0, n.FrameOffset()+off) + p.To.Name = obj.NAME_PARAM + p.To.Sym = n.Linksym() + p.Pos = p.Pos.WithNotStmt() + return p +} |