diff options
Diffstat (limited to 'src/cmd/compile/internal/amd64/ggen.go')
-rw-r--r-- | src/cmd/compile/internal/amd64/ggen.go | 154 |
1 files changed, 154 insertions, 0 deletions
diff --git a/src/cmd/compile/internal/amd64/ggen.go b/src/cmd/compile/internal/amd64/ggen.go new file mode 100644 index 0000000..b8dce81 --- /dev/null +++ b/src/cmd/compile/internal/amd64/ggen.go @@ -0,0 +1,154 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package amd64 + +import ( + "cmd/compile/internal/base" + "cmd/compile/internal/ir" + "cmd/compile/internal/objw" + "cmd/compile/internal/types" + "cmd/internal/obj" + "cmd/internal/obj/x86" + "internal/buildcfg" +) + +// no floating point in note handlers on Plan 9 +var isPlan9 = buildcfg.GOOS == "plan9" + +// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ, +// See runtime/mkduff.go. +const ( + dzBlocks = 16 // number of MOV/ADD blocks + dzBlockLen = 4 // number of clears per block + dzBlockSize = 23 // size of instructions in a single block + dzMovSize = 5 // size of single MOV instruction w/ offset + dzLeaqSize = 4 // size of single LEAQ instruction + dzClearStep = 16 // number of bytes cleared by each MOV instruction + + dzClearLen = dzClearStep * dzBlockLen // bytes cleared by one block + dzSize = dzBlocks * dzBlockSize +) + +// dzOff returns the offset for a jump into DUFFZERO. +// b is the number of bytes to zero. +func dzOff(b int64) int64 { + off := int64(dzSize) + off -= b / dzClearLen * dzBlockSize + tailLen := b % dzClearLen + if tailLen >= dzClearStep { + off -= dzLeaqSize + dzMovSize*(tailLen/dzClearStep) + } + return off +} + +// duffzeroDI returns the pre-adjustment to DI for a call to DUFFZERO. +// b is the number of bytes to zero. +func dzDI(b int64) int64 { + tailLen := b % dzClearLen + if tailLen < dzClearStep { + return 0 + } + tailSteps := tailLen / dzClearStep + return -dzClearStep * (dzBlockLen - tailSteps) +} + +func zerorange(pp *objw.Progs, p *obj.Prog, off, cnt int64, state *uint32) *obj.Prog { + const ( + r13 = 1 << iota // if R13 is already zeroed. + ) + + if cnt == 0 { + return p + } + + if cnt%int64(types.RegSize) != 0 { + // should only happen with nacl + if cnt%int64(types.PtrSize) != 0 { + base.Fatalf("zerorange count not a multiple of widthptr %d", cnt) + } + if *state&r13 == 0 { + p = pp.Append(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_R13, 0) + *state |= r13 + } + p = pp.Append(p, x86.AMOVL, obj.TYPE_REG, x86.REG_R13, 0, obj.TYPE_MEM, x86.REG_SP, off) + off += int64(types.PtrSize) + cnt -= int64(types.PtrSize) + } + + if cnt == 8 { + if *state&r13 == 0 { + p = pp.Append(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_R13, 0) + *state |= r13 + } + p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R13, 0, obj.TYPE_MEM, x86.REG_SP, off) + } else if !isPlan9 && cnt <= int64(8*types.RegSize) { + for i := int64(0); i < cnt/16; i++ { + p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off+i*16) + } + + if cnt%16 != 0 { + p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off+cnt-int64(16)) + } + } else if !isPlan9 && (cnt <= int64(128*types.RegSize)) { + // Save DI to r12. With the amd64 Go register abi, DI can contain + // an incoming parameter, whereas R12 is always scratch. + p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_DI, 0, obj.TYPE_REG, x86.REG_R12, 0) + // Emit duffzero call + p = pp.Append(p, leaptr, obj.TYPE_MEM, x86.REG_SP, off+dzDI(cnt), obj.TYPE_REG, x86.REG_DI, 0) + p = pp.Append(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_ADDR, 0, dzOff(cnt)) + p.To.Sym = ir.Syms.Duffzero + if cnt%16 != 0 { + p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_DI, -int64(8)) + } + // Restore DI from r12 + p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R12, 0, obj.TYPE_REG, x86.REG_DI, 0) + + } else { + // When the register ABI is in effect, at this point in the + // prolog we may have live values in all of RAX,RDI,RCX. Save + // them off to registers before the REPSTOSQ below, then + // restore. Note that R12 and R13 are always available as + // scratch regs; here we also use R15 (this is safe to do + // since there won't be any globals accessed in the prolog). + // See rewriteToUseGot() in obj6.go for more on r15 use. + + // Save rax/rdi/rcx + p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_DI, 0, obj.TYPE_REG, x86.REG_R12, 0) + p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_REG, x86.REG_R13, 0) + p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_CX, 0, obj.TYPE_REG, x86.REG_R15, 0) + + // Set up the REPSTOSQ and kick it off. + p = pp.Append(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_AX, 0) + p = pp.Append(p, x86.AMOVQ, obj.TYPE_CONST, 0, cnt/int64(types.RegSize), obj.TYPE_REG, x86.REG_CX, 0) + p = pp.Append(p, leaptr, obj.TYPE_MEM, x86.REG_SP, off, obj.TYPE_REG, x86.REG_DI, 0) + p = pp.Append(p, x86.AREP, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0) + p = pp.Append(p, x86.ASTOSQ, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0) + + // Restore rax/rdi/rcx + p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R12, 0, obj.TYPE_REG, x86.REG_DI, 0) + p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R13, 0, obj.TYPE_REG, x86.REG_AX, 0) + p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R15, 0, obj.TYPE_REG, x86.REG_CX, 0) + + // Record the fact that r13 is no longer zero. + *state &= ^uint32(r13) + } + + return p +} + +func ginsnop(pp *objw.Progs) *obj.Prog { + // This is a hardware nop (1-byte 0x90) instruction, + // even though we describe it as an explicit XCHGL here. + // Particularly, this does not zero the high 32 bits + // like typical *L opcodes. + // (gas assembles "xchg %eax,%eax" to 0x87 0xc0, which + // does zero the high 32 bits.) + p := pp.Prog(x86.AXCHGL) + p.From.Type = obj.TYPE_REG + p.From.Reg = x86.REG_AX + p.To.Type = obj.TYPE_REG + p.To.Reg = x86.REG_AX + return p +} |