diff options
Diffstat (limited to 'src/cmd/compile/internal/ssa/schedule.go')
-rw-r--r-- | src/cmd/compile/internal/ssa/schedule.go | 632 |
1 files changed, 632 insertions, 0 deletions
diff --git a/src/cmd/compile/internal/ssa/schedule.go b/src/cmd/compile/internal/ssa/schedule.go new file mode 100644 index 0000000..4e762f7 --- /dev/null +++ b/src/cmd/compile/internal/ssa/schedule.go @@ -0,0 +1,632 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ssa + +import ( + "cmd/compile/internal/types" + "container/heap" + "sort" +) + +const ( + ScorePhi = iota // towards top of block + ScoreArg + ScoreNilCheck + ScoreReadTuple + ScoreVarDef + ScoreCarryChainTail + ScoreMemory + ScoreReadFlags + ScoreDefault + ScoreFlags + ScoreControl // towards bottom of block +) + +type ValHeap struct { + a []*Value + score []int8 +} + +func (h ValHeap) Len() int { return len(h.a) } +func (h ValHeap) Swap(i, j int) { a := h.a; a[i], a[j] = a[j], a[i] } + +func (h *ValHeap) Push(x interface{}) { + // Push and Pop use pointer receivers because they modify the slice's length, + // not just its contents. + v := x.(*Value) + h.a = append(h.a, v) +} +func (h *ValHeap) Pop() interface{} { + old := h.a + n := len(old) + x := old[n-1] + h.a = old[0 : n-1] + return x +} +func (h ValHeap) Less(i, j int) bool { + x := h.a[i] + y := h.a[j] + sx := h.score[x.ID] + sy := h.score[y.ID] + if c := sx - sy; c != 0 { + return c > 0 // higher score comes later. + } + if x.Pos != y.Pos { // Favor in-order line stepping + return x.Pos.After(y.Pos) + } + if x.Op != OpPhi { + if c := len(x.Args) - len(y.Args); c != 0 { + return c < 0 // smaller args comes later + } + } + if c := x.Uses - y.Uses; c != 0 { + return c < 0 // smaller uses come later + } + // These comparisons are fairly arbitrary. + // The goal here is stability in the face + // of unrelated changes elsewhere in the compiler. + if c := x.AuxInt - y.AuxInt; c != 0 { + return c > 0 + } + if cmp := x.Type.Compare(y.Type); cmp != types.CMPeq { + return cmp == types.CMPgt + } + return x.ID > y.ID +} + +func (op Op) isLoweredGetClosurePtr() bool { + switch op { + case OpAMD64LoweredGetClosurePtr, OpPPC64LoweredGetClosurePtr, OpARMLoweredGetClosurePtr, OpARM64LoweredGetClosurePtr, + Op386LoweredGetClosurePtr, OpMIPS64LoweredGetClosurePtr, OpLOONG64LoweredGetClosurePtr, OpS390XLoweredGetClosurePtr, OpMIPSLoweredGetClosurePtr, + OpRISCV64LoweredGetClosurePtr, OpWasmLoweredGetClosurePtr: + return true + } + return false +} + +// Schedule the Values in each Block. After this phase returns, the +// order of b.Values matters and is the order in which those values +// will appear in the assembly output. For now it generates a +// reasonable valid schedule using a priority queue. TODO(khr): +// schedule smarter. +func schedule(f *Func) { + // For each value, the number of times it is used in the block + // by values that have not been scheduled yet. + uses := f.Cache.allocInt32Slice(f.NumValues()) + defer f.Cache.freeInt32Slice(uses) + + // reusable priority queue + priq := new(ValHeap) + + // "priority" for a value + score := f.Cache.allocInt8Slice(f.NumValues()) + defer f.Cache.freeInt8Slice(score) + + // scheduling order. We queue values in this list in reverse order. + // A constant bound allows this to be stack-allocated. 64 is + // enough to cover almost every schedule call. + order := make([]*Value, 0, 64) + + // maps mem values to the next live memory value + nextMem := f.Cache.allocValueSlice(f.NumValues()) + defer f.Cache.freeValueSlice(nextMem) + // additional pretend arguments for each Value. Used to enforce load/store ordering. + additionalArgs := make([][]*Value, f.NumValues()) + + for _, b := range f.Blocks { + // Compute score. Larger numbers are scheduled closer to the end of the block. + for _, v := range b.Values { + switch { + case v.Op.isLoweredGetClosurePtr(): + // We also score GetLoweredClosurePtr as early as possible to ensure that the + // context register is not stomped. GetLoweredClosurePtr should only appear + // in the entry block where there are no phi functions, so there is no + // conflict or ambiguity here. + if b != f.Entry { + f.Fatalf("LoweredGetClosurePtr appeared outside of entry block, b=%s", b.String()) + } + score[v.ID] = ScorePhi + case v.Op == OpAMD64LoweredNilCheck || v.Op == OpPPC64LoweredNilCheck || + v.Op == OpARMLoweredNilCheck || v.Op == OpARM64LoweredNilCheck || + v.Op == Op386LoweredNilCheck || v.Op == OpMIPS64LoweredNilCheck || + v.Op == OpS390XLoweredNilCheck || v.Op == OpMIPSLoweredNilCheck || + v.Op == OpRISCV64LoweredNilCheck || v.Op == OpWasmLoweredNilCheck || + v.Op == OpLOONG64LoweredNilCheck: + // Nil checks must come before loads from the same address. + score[v.ID] = ScoreNilCheck + case v.Op == OpPhi: + // We want all the phis first. + score[v.ID] = ScorePhi + case v.Op == OpVarDef: + // We want all the vardefs next. + score[v.ID] = ScoreVarDef + case v.Op == OpArgIntReg || v.Op == OpArgFloatReg: + // In-register args must be scheduled as early as possible to ensure that the + // context register is not stomped. They should only appear in the entry block. + if b != f.Entry { + f.Fatalf("%s appeared outside of entry block, b=%s", v.Op, b.String()) + } + score[v.ID] = ScorePhi + case v.Op == OpArg: + // We want all the args as early as possible, for better debugging. + score[v.ID] = ScoreArg + case v.Type.IsMemory(): + // Schedule stores as early as possible. This tends to + // reduce register pressure. It also helps make sure + // VARDEF ops are scheduled before the corresponding LEA. + score[v.ID] = ScoreMemory + case v.Op == OpSelect0 || v.Op == OpSelect1 || v.Op == OpSelectN: + if (v.Op == OpSelect1 || v.Op == OpSelect0) && (v.Args[0].isCarry() || v.Type.IsFlags()) { + // When the Select pseudo op is being used for a carry or flag from + // a tuple then score it as ScoreFlags so it happens later. This + // prevents the bit from being clobbered before it is used. + score[v.ID] = ScoreFlags + } else { + score[v.ID] = ScoreReadTuple + } + case v.isCarry(): + if w := v.getCarryInput(); w != nil && w.Block == b { + // The producing op is not the final user of the carry bit. Its + // current score is one of unscored, Flags, or CarryChainTail. + // These occur if the producer has not been scored, another user + // of the producers carry flag was scored (there are >1 users of + // the carry out flag), or it was visited earlier and already + // scored CarryChainTail (and prove w is not a tail). + score[w.ID] = ScoreFlags + } + // Verify v has not been scored. If v has not been visited, v may be + // the final (tail) operation in a carry chain. If v is not, v will be + // rescored above when v's carry-using op is scored. When scoring is done, + // only tail operations will retain the CarryChainTail score. + if score[v.ID] != ScoreFlags { + // Score the tail of carry chain operations to a lower (earlier in the + // block) priority. This creates a priority inversion which allows only + // one chain to be scheduled, if possible. + score[v.ID] = ScoreCarryChainTail + } + case v.isFlagOp(): + // Schedule flag register generation as late as possible. + // This makes sure that we only have one live flags + // value at a time. + score[v.ID] = ScoreFlags + default: + score[v.ID] = ScoreDefault + // If we're reading flags, schedule earlier to keep flag lifetime short. + for _, a := range v.Args { + if a.isFlagOp() { + score[v.ID] = ScoreReadFlags + } + } + } + } + } + + for _, b := range f.Blocks { + // Find store chain for block. + // Store chains for different blocks overwrite each other, so + // the calculated store chain is good only for this block. + for _, v := range b.Values { + if v.Op != OpPhi && v.Type.IsMemory() { + for _, w := range v.Args { + if w.Type.IsMemory() { + nextMem[w.ID] = v + } + } + } + } + + // Compute uses. + for _, v := range b.Values { + if v.Op == OpPhi { + // If a value is used by a phi, it does not induce + // a scheduling edge because that use is from the + // previous iteration. + continue + } + for _, w := range v.Args { + if w.Block == b { + uses[w.ID]++ + } + // Any load must come before the following store. + if !v.Type.IsMemory() && w.Type.IsMemory() { + // v is a load. + s := nextMem[w.ID] + if s == nil || s.Block != b { + continue + } + additionalArgs[s.ID] = append(additionalArgs[s.ID], v) + uses[v.ID]++ + } + } + } + + for _, c := range b.ControlValues() { + // Force the control values to be scheduled at the end, + // unless they are phi values (which must be first). + // OpArg also goes first -- if it is stack it register allocates + // to a LoadReg, if it is register it is from the beginning anyway. + if score[c.ID] == ScorePhi || score[c.ID] == ScoreArg { + continue + } + score[c.ID] = ScoreControl + + // Schedule values dependent on the control values at the end. + // This reduces the number of register spills. We don't find + // all values that depend on the controls, just values with a + // direct dependency. This is cheaper and in testing there + // was no difference in the number of spills. + for _, v := range b.Values { + if v.Op != OpPhi { + for _, a := range v.Args { + if a == c { + score[v.ID] = ScoreControl + } + } + } + } + } + + // To put things into a priority queue + // The values that should come last are least. + priq.score = score + priq.a = priq.a[:0] + + // Initialize priority queue with schedulable values. + for _, v := range b.Values { + if uses[v.ID] == 0 { + heap.Push(priq, v) + } + } + + // Schedule highest priority value, update use counts, repeat. + order = order[:0] + tuples := make(map[ID][]*Value) + for priq.Len() > 0 { + // Find highest priority schedulable value. + // Note that schedule is assembled backwards. + + v := heap.Pop(priq).(*Value) + + if f.pass.debug > 1 && score[v.ID] == ScoreCarryChainTail && v.isCarry() { + // Add some debugging noise if the chain of carrying ops will not + // likely be scheduled without potential carry flag clobbers. + if !isCarryChainReady(v, uses) { + f.Warnl(v.Pos, "carry chain ending with %v not ready", v) + } + } + + // Add it to the schedule. + // Do not emit tuple-reading ops until we're ready to emit the tuple-generating op. + //TODO: maybe remove ReadTuple score above, if it does not help on performance + switch { + case v.Op == OpSelect0: + if tuples[v.Args[0].ID] == nil { + tuples[v.Args[0].ID] = make([]*Value, 2) + } + tuples[v.Args[0].ID][0] = v + case v.Op == OpSelect1: + if tuples[v.Args[0].ID] == nil { + tuples[v.Args[0].ID] = make([]*Value, 2) + } + tuples[v.Args[0].ID][1] = v + case v.Op == OpSelectN: + if tuples[v.Args[0].ID] == nil { + tuples[v.Args[0].ID] = make([]*Value, v.Args[0].Type.NumFields()) + } + tuples[v.Args[0].ID][v.AuxInt] = v + case v.Type.IsResults() && tuples[v.ID] != nil: + tup := tuples[v.ID] + for i := len(tup) - 1; i >= 0; i-- { + if tup[i] != nil { + order = append(order, tup[i]) + } + } + delete(tuples, v.ID) + order = append(order, v) + case v.Type.IsTuple() && tuples[v.ID] != nil: + if tuples[v.ID][1] != nil { + order = append(order, tuples[v.ID][1]) + } + if tuples[v.ID][0] != nil { + order = append(order, tuples[v.ID][0]) + } + delete(tuples, v.ID) + fallthrough + default: + order = append(order, v) + } + + // Update use counts of arguments. + for _, w := range v.Args { + if w.Block != b { + continue + } + uses[w.ID]-- + if uses[w.ID] == 0 { + // All uses scheduled, w is now schedulable. + heap.Push(priq, w) + } + } + for _, w := range additionalArgs[v.ID] { + uses[w.ID]-- + if uses[w.ID] == 0 { + // All uses scheduled, w is now schedulable. + heap.Push(priq, w) + } + } + } + if len(order) != len(b.Values) { + f.Fatalf("schedule does not include all values in block %s", b) + } + for i := 0; i < len(b.Values); i++ { + b.Values[i] = order[len(b.Values)-1-i] + } + } + + f.scheduled = true +} + +// storeOrder orders values with respect to stores. That is, +// if v transitively depends on store s, v is ordered after s, +// otherwise v is ordered before s. +// Specifically, values are ordered like +// +// store1 +// NilCheck that depends on store1 +// other values that depends on store1 +// store2 +// NilCheck that depends on store2 +// other values that depends on store2 +// ... +// +// The order of non-store and non-NilCheck values are undefined +// (not necessarily dependency order). This should be cheaper +// than a full scheduling as done above. +// Note that simple dependency order won't work: there is no +// dependency between NilChecks and values like IsNonNil. +// Auxiliary data structures are passed in as arguments, so +// that they can be allocated in the caller and be reused. +// This function takes care of reset them. +func storeOrder(values []*Value, sset *sparseSet, storeNumber []int32) []*Value { + if len(values) == 0 { + return values + } + + f := values[0].Block.Func + + // find all stores + + // Members of values that are store values. + // A constant bound allows this to be stack-allocated. 64 is + // enough to cover almost every storeOrder call. + stores := make([]*Value, 0, 64) + hasNilCheck := false + sset.clear() // sset is the set of stores that are used in other values + for _, v := range values { + if v.Type.IsMemory() { + stores = append(stores, v) + if v.Op == OpInitMem || v.Op == OpPhi { + continue + } + sset.add(v.MemoryArg().ID) // record that v's memory arg is used + } + if v.Op == OpNilCheck { + hasNilCheck = true + } + } + if len(stores) == 0 || !hasNilCheck && f.pass.name == "nilcheckelim" { + // there is no store, the order does not matter + return values + } + + // find last store, which is the one that is not used by other stores + var last *Value + for _, v := range stores { + if !sset.contains(v.ID) { + if last != nil { + f.Fatalf("two stores live simultaneously: %v and %v", v, last) + } + last = v + } + } + + // We assign a store number to each value. Store number is the + // index of the latest store that this value transitively depends. + // The i-th store in the current block gets store number 3*i. A nil + // check that depends on the i-th store gets store number 3*i+1. + // Other values that depends on the i-th store gets store number 3*i+2. + // Special case: 0 -- unassigned, 1 or 2 -- the latest store it depends + // is in the previous block (or no store at all, e.g. value is Const). + // First we assign the number to all stores by walking back the store chain, + // then assign the number to other values in DFS order. + count := make([]int32, 3*(len(stores)+1)) + sset.clear() // reuse sparse set to ensure that a value is pushed to stack only once + for n, w := len(stores), last; n > 0; n-- { + storeNumber[w.ID] = int32(3 * n) + count[3*n]++ + sset.add(w.ID) + if w.Op == OpInitMem || w.Op == OpPhi { + if n != 1 { + f.Fatalf("store order is wrong: there are stores before %v", w) + } + break + } + w = w.MemoryArg() + } + var stack []*Value + for _, v := range values { + if sset.contains(v.ID) { + // in sset means v is a store, or already pushed to stack, or already assigned a store number + continue + } + stack = append(stack, v) + sset.add(v.ID) + + for len(stack) > 0 { + w := stack[len(stack)-1] + if storeNumber[w.ID] != 0 { + stack = stack[:len(stack)-1] + continue + } + if w.Op == OpPhi { + // Phi value doesn't depend on store in the current block. + // Do this early to avoid dependency cycle. + storeNumber[w.ID] = 2 + count[2]++ + stack = stack[:len(stack)-1] + continue + } + + max := int32(0) // latest store dependency + argsdone := true + for _, a := range w.Args { + if a.Block != w.Block { + continue + } + if !sset.contains(a.ID) { + stack = append(stack, a) + sset.add(a.ID) + argsdone = false + break + } + if storeNumber[a.ID]/3 > max { + max = storeNumber[a.ID] / 3 + } + } + if !argsdone { + continue + } + + n := 3*max + 2 + if w.Op == OpNilCheck { + n = 3*max + 1 + } + storeNumber[w.ID] = n + count[n]++ + stack = stack[:len(stack)-1] + } + } + + // convert count to prefix sum of counts: count'[i] = sum_{j<=i} count[i] + for i := range count { + if i == 0 { + continue + } + count[i] += count[i-1] + } + if count[len(count)-1] != int32(len(values)) { + f.Fatalf("storeOrder: value is missing, total count = %d, values = %v", count[len(count)-1], values) + } + + // place values in count-indexed bins, which are in the desired store order + order := make([]*Value, len(values)) + for _, v := range values { + s := storeNumber[v.ID] + order[count[s-1]] = v + count[s-1]++ + } + + // Order nil checks in source order. We want the first in source order to trigger. + // If two are on the same line, we don't really care which happens first. + // See issue 18169. + if hasNilCheck { + start := -1 + for i, v := range order { + if v.Op == OpNilCheck { + if start == -1 { + start = i + } + } else { + if start != -1 { + sort.Sort(bySourcePos(order[start:i])) + start = -1 + } + } + } + if start != -1 { + sort.Sort(bySourcePos(order[start:])) + } + } + + return order +} + +// isFlagOp reports if v is an OP with the flag type. +func (v *Value) isFlagOp() bool { + return v.Type.IsFlags() || v.Type.IsTuple() && v.Type.FieldType(1).IsFlags() +} + +// isCarryChainReady reports whether all dependent carry ops can be scheduled after this. +func isCarryChainReady(v *Value, uses []int32) bool { + // A chain can be scheduled in it's entirety if + // the use count of each dependent op is 1. If none, + // schedule the first. + j := 1 // The first op uses[k.ID] == 0. Dependent ops are always >= 1. + for k := v; k != nil; k = k.getCarryInput() { + j += int(uses[k.ID]) - 1 + } + return j == 0 +} + +// isCarryInput reports whether v accepts a carry value as input. +func (v *Value) isCarryInput() bool { + return v.getCarryInput() != nil +} + +// isCarryOutput reports whether v generates a carry as output. +func (v *Value) isCarryOutput() bool { + // special cases for PPC64 which put their carry values in XER instead of flags + switch v.Block.Func.Config.arch { + case "ppc64", "ppc64le": + switch v.Op { + case OpPPC64SUBC, OpPPC64ADDC, OpPPC64SUBCconst, OpPPC64ADDCconst: + return true + } + return false + } + return v.isFlagOp() && v.Op != OpSelect1 +} + +// isCarryCreator reports whether op is an operation which produces a carry bit value, +// but does not consume it. +func (v *Value) isCarryCreator() bool { + return v.isCarryOutput() && !v.isCarryInput() +} + +// isCarry reports whether op consumes or creates a carry a bit value. +func (v *Value) isCarry() bool { + return v.isCarryOutput() || v.isCarryInput() +} + +// getCarryInput returns the producing *Value of the carry bit of this op, or nil if none. +func (v *Value) getCarryInput() *Value { + // special cases for PPC64 which put their carry values in XER instead of flags + switch v.Block.Func.Config.arch { + case "ppc64", "ppc64le": + switch v.Op { + case OpPPC64SUBE, OpPPC64ADDE, OpPPC64SUBZEzero, OpPPC64ADDZEzero: + // PPC64 carry dependencies are conveyed through their final argument. + // Likewise, there is always an OpSelect1 between them. + return v.Args[len(v.Args)-1].Args[0] + } + return nil + } + for _, a := range v.Args { + if !a.isFlagOp() { + continue + } + if a.Op == OpSelect1 { + a = a.Args[0] + } + return a + } + return nil +} + +type bySourcePos []*Value + +func (s bySourcePos) Len() int { return len(s) } +func (s bySourcePos) Swap(i, j int) { s[i], s[j] = s[j], s[i] } +func (s bySourcePos) Less(i, j int) bool { return s[i].Pos.Before(s[j].Pos) } |