LLVM API Documentation

R600MachineScheduler.cpp
//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief R600 Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "misched"

#include "R600MachineScheduler.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/PassManager.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {

  DAG = dag;
  TII = static_cast<const R600InstrInfo*>(DAG->TII);
  TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
  VLIW5 = !DAG->MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA();
  MRI = &DAG->MRI;
  CurInstKind = IDOther;
  CurEmitted = 0;
  OccupedSlotsMask = 31;
  InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
  InstKindLimit[IDOther] = 32;

  const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
  InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
  AluInstCount = 0;
  FetchInstCount = 0;
}

void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
                                  std::vector<SUnit *> &QDst)
{
  QDst.insert(QDst.end(), QSrc.begin(), QSrc.end());
  QSrc.clear();
}

static
unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
  assert (GPRCount && "GPRCount cannot be 0");
  return 248 / GPRCount;
}
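
// Illustrative arithmetic (editorial note, not part of the original source):
// with the 248-register budget assumed above, wavefronts that each need
// 8 GPRs allow 248 / 8 = 31 wavefronts in flight, while wavefronts needing
// 62 GPRs allow only 248 / 62 = 4, leaving far less latency-hiding
// parallelism for the hardware to exploit.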

SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
  SUnit *SU = 0;
  NextInstKind = IDOther;

  IsTopNode = false;

  // check if we might want to switch current clause type
  bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) ||
      (Available[CurInstKind].empty());
  bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
      (!Available[IDFetch].empty() || !Available[IDOther].empty());

  if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
    // We use the heuristic provided by the AMD Accelerated Parallel Processing
    // OpenCL Programming Guide:
    // The approx. number of WF that allows TEX inst to hide ALU inst is:
    // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
    float ALUFetchRatioEstimate =
        (float)(AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
        (FetchInstCount + Available[IDFetch].size());
    unsigned NeededWF = 62.5f / ALUFetchRatioEstimate;
    DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
    // We assume the local GPR requirements to be "dominated" by the
    // requirement of the TEX clause (which consumes 128-bit regs); ALU
    // instructions before and after TEX are indeed likely to consume or
    // generate values from/for the TEX clause.
    // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
    // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
    // one GPR) or TmXYZW = TnXYZW (need 2 GPRs).
    // (TODO : use RegisterPressure)
    // If we are going to use too many GPRs, we flush Fetch instructions to
    // lower register pressure on 128-bit regs.
    unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
    if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
      AllowSwitchFromAlu = true;
  }
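
  // Worked example (editorial note, not part of the original source): with
  // 100 ALU instructions around 4 pending fetches, the ratio is 100 / 4 = 25
  // and NeededWF = 62.5 / 25 = 2.5, truncated to 2. The fetch clause needs
  // about 2 * 4 = 8 GPRs, so up to 248 / 8 = 31 wavefronts fit; since
  // 2 <= 31, TEX latency is considered hidden and the ALU clause continues.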

  if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
      (!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
    // try to pick ALU
    SU = pickAlu();
    if (!SU && !PhysicalRegCopy.empty()) {
      SU = PhysicalRegCopy.front();
      PhysicalRegCopy.erase(PhysicalRegCopy.begin());
    }
    if (SU) {
      if (CurEmitted >= InstKindLimit[IDAlu])
        CurEmitted = 0;
      NextInstKind = IDAlu;
    }
  }

  if (!SU) {
    // try to pick FETCH
    SU = pickOther(IDFetch);
    if (SU)
      NextInstKind = IDFetch;
  }

  // try to pick other
  if (!SU) {
    SU = pickOther(IDOther);
    if (SU)
      NextInstKind = IDOther;
  }

  DEBUG(
      if (SU) {
        dbgs() << " ** Pick node **\n";
        SU->dump(DAG);
      } else {
        dbgs() << "NO NODE \n";
        for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
          const SUnit &S = DAG->SUnits[i];
          if (!S.isScheduled)
            S.dump(DAG);
        }
      }
  );

  return SU;
}

void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
  if (NextInstKind != CurInstKind) {
    DEBUG(dbgs() << "Instruction Type Switch\n");
    if (NextInstKind != IDAlu)
      OccupedSlotsMask |= 31;
    CurEmitted = 0;
    CurInstKind = NextInstKind;
  }

  if (CurInstKind == IDAlu) {
    AluInstCount ++;
    switch (getAluKind(SU)) {
    case AluT_XYZW:
      CurEmitted += 4;
      break;
    case AluDiscarded:
      break;
    default: {
      ++CurEmitted;
      for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(),
          E = SU->getInstr()->operands_end(); It != E; ++It) {
        MachineOperand &MO = *It;
        if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
          ++CurEmitted;
      }
    }
    }
  } else {
    ++CurEmitted;
  }

  DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");

  if (CurInstKind != IDFetch) {
    MoveUnits(Pending[IDFetch], Available[IDFetch]);
  } else
    FetchInstCount++;
}
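
// Clause accounting example (editorial note, not part of the original
// source): a DOT_4 fills a whole instruction group and advances CurEmitted
// by 4, while an ordinary ALU op advances it by 1 plus 1 per ALU_LITERAL_X
// operand, since literal constants occupy extra slots in the ALU clause and
// so count toward the getMaxAlusPerClause() limit.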

static bool
isPhysicalRegCopy(MachineInstr *MI) {
  if (MI->getOpcode() != AMDGPU::COPY)
    return false;

  return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg());
}

void R600SchedStrategy::releaseTopNode(SUnit *SU) {
  DEBUG(dbgs() << "Top Releasing ";SU->dump(DAG););
}

void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
  DEBUG(dbgs() << "Bottom Releasing ";SU->dump(DAG););
  if (isPhysicalRegCopy(SU->getInstr())) {
    PhysicalRegCopy.push_back(SU);
    return;
  }

  int IK = getInstKind(SU);

  // There is no export clause; we can schedule one as soon as it's ready.
  if (IK == IDOther)
    Available[IDOther].push_back(SU);
  else
    Pending[IK].push_back(SU);

}

bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
                                          const TargetRegisterClass *RC) const {
  if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
    return RC->contains(Reg);
  } else {
    return MRI->getRegClass(Reg) == RC;
  }
}

R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
  MachineInstr *MI = SU->getInstr();

  if (TII->isTransOnly(MI))
    return AluTrans;

  switch (MI->getOpcode()) {
  case AMDGPU::PRED_X:
    return AluPredX;
  case AMDGPU::INTERP_PAIR_XY:
  case AMDGPU::INTERP_PAIR_ZW:
  case AMDGPU::INTERP_VEC_LOAD:
  case AMDGPU::DOT_4:
    return AluT_XYZW;
  case AMDGPU::COPY:
    if (MI->getOperand(1).isUndef()) {
      // MI will become a KILL, don't consider it in scheduling
      return AluDiscarded;
    }
  default:
    break;
  }

  // Does the instruction take a whole IG ?
  // XXX: Is it possible to add a helper function in R600InstrInfo that can
  // be used here and in R600PacketizerList::isSoloInstruction() ?
  if(TII->isVector(*MI) ||
      TII->isCubeOp(MI->getOpcode()) ||
      TII->isReductionOp(MI->getOpcode()) ||
      MI->getOpcode() == AMDGPU::GROUP_BARRIER) {
    return AluT_XYZW;
  }

  if (TII->isLDSInstr(MI->getOpcode())) {
    return AluT_X;
  }

  // Is the result already assigned to a channel ?
  unsigned DestSubReg = MI->getOperand(0).getSubReg();
  switch (DestSubReg) {
  case AMDGPU::sub0:
    return AluT_X;
  case AMDGPU::sub1:
    return AluT_Y;
  case AMDGPU::sub2:
    return AluT_Z;
  case AMDGPU::sub3:
    return AluT_W;
  default:
    break;
  }

  // Is the result already member of a X/Y/Z/W class ?
  unsigned DestReg = MI->getOperand(0).getReg();
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) ||
      regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass))
    return AluT_X;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass))
    return AluT_Y;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass))
    return AluT_Z;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass))
    return AluT_W;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
    return AluT_XYZW;

  // LDS src registers cannot be used in the Trans slot.
  if (TII->readsLDSSrcReg(MI))
    return AluT_XYZW;

  return AluAny;

}
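
// Illustrative mapping (editorial note, not part of the original source): an
// instruction writing R600_TReg32_Y, or writing subregister sub1 of a 128-bit
// register, is classified AluT_Y and can only fill the Y slot of a VLIW
// group, whereas a DOT_4 or GROUP_BARRIER claims the whole X/Y/Z/W group.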

int R600SchedStrategy::getInstKind(SUnit* SU) {
  int Opcode = SU->getInstr()->getOpcode();

  if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
    return IDFetch;

  if (TII->isALUInstr(Opcode)) {
    return IDAlu;
  }

  switch (Opcode) {
  case AMDGPU::PRED_X:
  case AMDGPU::COPY:
  case AMDGPU::CONST_COPY:
  case AMDGPU::INTERP_PAIR_XY:
  case AMDGPU::INTERP_PAIR_ZW:
  case AMDGPU::INTERP_VEC_LOAD:
  case AMDGPU::DOT_4:
    return IDAlu;
  default:
    return IDOther;
  }
}

SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
  if (Q.empty())
    return NULL;
  for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
      It != E; ++It) {
    SUnit *SU = *It;
    InstructionsGroupCandidate.push_back(SU->getInstr());
    if (TII->fitsConstReadLimitations(InstructionsGroupCandidate)
        && (!AnyALU || !TII->isVectorOnly(SU->getInstr()))
    ) {
      InstructionsGroupCandidate.pop_back();
      Q.erase((It + 1).base());
      return SU;
    } else {
      InstructionsGroupCandidate.pop_back();
    }
  }
  return NULL;
}
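
// Note (editorial, not part of the original source): the queue is scanned
// from the back because erasing near the end of a std::vector is cheap, and
// (It + 1).base() is the standard idiom for converting a reverse_iterator
// into the forward iterator that erase() expects for the same element.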

void R600SchedStrategy::LoadAlu() {
  std::vector<SUnit *> &QSrc = Pending[IDAlu];
  for (unsigned i = 0, e = QSrc.size(); i < e; ++i) {
    AluKind AK = getAluKind(QSrc[i]);
    AvailableAlus[AK].push_back(QSrc[i]);
  }
  QSrc.clear();
}

void R600SchedStrategy::PrepareNextSlot() {
  DEBUG(dbgs() << "New Slot\n");
  assert (OccupedSlotsMask && "Slot wasn't filled");
  OccupedSlotsMask = 0;
//  if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
//    OccupedSlotsMask |= 16;
  InstructionsGroupCandidate.clear();
  LoadAlu();
}

void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
  int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
  if (DstIndex == -1) {
    return;
  }
  unsigned DestReg = MI->getOperand(DstIndex).getReg();
  // PressureRegister crashes if an operand is def and used in the same inst
  // and we try to constrain its regclass
  for (MachineInstr::mop_iterator It = MI->operands_begin(),
      E = MI->operands_end(); It != E; ++It) {
    MachineOperand &MO = *It;
    if (MO.isReg() && !MO.isDef() &&
        MO.getReg() == DestReg)
      return;
  }
  // Constrain the regclass of DestReg to assign it to Slot
  switch (Slot) {
  case 0:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass);
    break;
  case 1:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass);
    break;
  case 2:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass);
    break;
  case 3:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass);
    break;
  }
}
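
// Example (illustrative, not part of the original source): when an AluAny
// instruction is placed in slot 2, AssignSlot constrains its 32-bit
// destination vreg to R600_TReg32_ZRegClass, so the register allocator must
// pick a Z-channel register and the instruction stays legal in the Z lane.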

SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
  static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
  SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
  if (SlotedSU)
    return SlotedSU;
  SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
  if (UnslotedSU)
    AssignSlot(UnslotedSU->getInstr(), Slot);
  return UnslotedSU;
}

unsigned R600SchedStrategy::AvailablesAluCount() const {
  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
      AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
      AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
      AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
      AvailableAlus[AluPredX].size();
}
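
// Slot-mask legend (editorial note, not part of the original source): bits
// 0-3 of OccupedSlotsMask track the X, Y, Z and W vector slots and bit 4 the
// Trans slot, so 15 marks a full vector group, 16 a taken Trans slot, and 31
// a fully occupied five-slot VLIW group.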

SUnit* R600SchedStrategy::pickAlu() {
  while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
    if (!OccupedSlotsMask) {
      // Bottom up scheduling : PRED_X must come first
      if (!AvailableAlus[AluPredX].empty()) {
        OccupedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluPredX], false);
      }
      // Flush physical reg copies (RA will discard them)
      if (!AvailableAlus[AluDiscarded].empty()) {
        OccupedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluDiscarded], false);
      }
      // If there is a T_XYZW alu available, use it
      if (!AvailableAlus[AluT_XYZW].empty()) {
        OccupedSlotsMask |= 15;
        return PopInst(AvailableAlus[AluT_XYZW], false);
      }
    }
    bool TransSlotOccuped = OccupedSlotsMask & 16;
    if (!TransSlotOccuped && VLIW5) {
      if (!AvailableAlus[AluTrans].empty()) {
        OccupedSlotsMask |= 16;
        return PopInst(AvailableAlus[AluTrans], false);
      }
      SUnit *SU = AttemptFillSlot(3, true);
      if (SU) {
        OccupedSlotsMask |= 16;
        return SU;
      }
    }
    for (int Chan = 3; Chan > -1; --Chan) {
      bool isOccupied = OccupedSlotsMask & (1 << Chan);
      if (!isOccupied) {
        SUnit *SU = AttemptFillSlot(Chan, false);
        if (SU) {
          OccupedSlotsMask |= (1 << Chan);
          InstructionsGroupCandidate.push_back(SU->getInstr());
          return SU;
        }
      }
    }
    PrepareNextSlot();
  }
  return NULL;
}

SUnit* R600SchedStrategy::pickOther(int QID) {
  SUnit *SU = 0;
  std::vector<SUnit *> &AQ = Available[QID];

  if (AQ.empty()) {
    MoveUnits(Pending[QID], AQ);
  }
  if (!AQ.empty()) {
    SU = AQ.back();
    AQ.resize(AQ.size() - 1);
  }
  return SU;
}

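For context, this strategy is handed to LLVM's generic machine scheduler by the target. A minimal registration sketch, assuming the ScheduleDAGMI constructor and MachineSchedRegistry API of this LLVM vintage (the in-tree equivalent lives in AMDGPUTargetMachine.cpp):

#include "llvm/CodeGen/MachineScheduler.h"

static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
  // ScheduleDAGMI takes ownership of the strategy and drives its
  // pickNode()/schedNode() callbacks over each scheduling region.
  return new ScheduleDAGMI(C, new R600SchedStrategy());
}

static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
                  createR600MachineScheduler);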