LLVM API Documentation

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
BBVectorize.cpp
Go to the documentation of this file.
1 //===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements a basic-block vectorization pass. The algorithm was
11 // inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral,
12 // et al. It works by looking for chains of pairable operations and then
13 // pairing them.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #define BBV_NAME "bb-vectorize"
18 #define DEBUG_TYPE BBV_NAME
20 #include "llvm/ADT/DenseMap.h"
21 #include "llvm/ADT/DenseSet.h"
22 #include "llvm/ADT/STLExtras.h"
23 #include "llvm/ADT/SmallSet.h"
24 #include "llvm/ADT/SmallVector.h"
25 #include "llvm/ADT/Statistic.h"
26 #include "llvm/ADT/StringExtras.h"
34 #include "llvm/IR/Constants.h"
35 #include "llvm/IR/DataLayout.h"
36 #include "llvm/IR/DerivedTypes.h"
37 #include "llvm/IR/Function.h"
38 #include "llvm/IR/Instructions.h"
39 #include "llvm/IR/IntrinsicInst.h"
40 #include "llvm/IR/Intrinsics.h"
41 #include "llvm/IR/LLVMContext.h"
42 #include "llvm/IR/Metadata.h"
43 #include "llvm/IR/Type.h"
44 #include "llvm/Pass.h"
46 #include "llvm/Support/Debug.h"
50 #include <algorithm>
51 using namespace llvm;
52 
53 static cl::opt<bool>
54 IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false),
55  cl::Hidden, cl::desc("Ignore target information"));
56 
57 static cl::opt<unsigned>
58 ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
59  cl::desc("The required chain depth for vectorization"));
60 
61 static cl::opt<bool>
62 UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false),
63  cl::Hidden, cl::desc("Use the chain depth requirement with"
64  " target information"));
65 
66 static cl::opt<unsigned>
67 SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden,
68  cl::desc("The maximum search distance for instruction pairs"));
69 
70 static cl::opt<bool>
71 SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden,
72  cl::desc("Replicating one element to a pair breaks the chain"));
73 
74 static cl::opt<unsigned>
75 VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden,
76  cl::desc("The size of the native vector registers"));
77 
78 static cl::opt<unsigned>
79 MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden,
80  cl::desc("The maximum number of pairing iterations"));
81 
82 static cl::opt<bool>
83 Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden,
84  cl::desc("Don't try to form non-2^n-length vectors"));
85 
86 static cl::opt<unsigned>
87 MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden,
88  cl::desc("The maximum number of pairable instructions per group"));
89 
90 static cl::opt<unsigned>
91 MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden,
92  cl::desc("The maximum number of candidate instruction pairs per group"));
93 
94 static cl::opt<unsigned>
95 MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200),
96  cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use"
97  " a full cycle check"));
98 
99 static cl::opt<bool>
100 NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden,
101  cl::desc("Don't try to vectorize boolean (i1) values"));
102 
103 static cl::opt<bool>
104 NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden,
105  cl::desc("Don't try to vectorize integer values"));
106 
107 static cl::opt<bool>
108 NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden,
109  cl::desc("Don't try to vectorize floating-point values"));
110 
111 // FIXME: This should default to false once pointer vector support works.
112 static cl::opt<bool>
113 NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden,
114  cl::desc("Don't try to vectorize pointer values"));
115 
116 static cl::opt<bool>
117 NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden,
118  cl::desc("Don't try to vectorize casting (conversion) operations"));
119 
120 static cl::opt<bool>
121 NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden,
122  cl::desc("Don't try to vectorize floating-point math intrinsics"));
123 
124 static cl::opt<bool>
125 NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden,
126  cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"));
127 
128 static cl::opt<bool>
129 NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden,
130  cl::desc("Don't try to vectorize select instructions"));
131 
132 static cl::opt<bool>
133 NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden,
134  cl::desc("Don't try to vectorize comparison instructions"));
135 
136 static cl::opt<bool>
137 NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden,
138  cl::desc("Don't try to vectorize getelementptr instructions"));
139 
140 static cl::opt<bool>
141 NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden,
142  cl::desc("Don't try to vectorize loads and stores"));
143 
144 static cl::opt<bool>
145 AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden,
146  cl::desc("Only generate aligned loads and stores"));
147 
148 static cl::opt<bool>
149 NoMemOpBoost("bb-vectorize-no-mem-op-boost",
150  cl::init(false), cl::Hidden,
151  cl::desc("Don't boost the chain-depth contribution of loads and stores"));
152 
153 static cl::opt<bool>
154 FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden,
155  cl::desc("Use a fast instruction dependency analysis"));
156 
157 #ifndef NDEBUG
158 static cl::opt<bool>
159 DebugInstructionExamination("bb-vectorize-debug-instruction-examination",
160  cl::init(false), cl::Hidden,
161  cl::desc("When debugging is enabled, output information on the"
162  " instruction-examination process"));
163 static cl::opt<bool>
164 DebugCandidateSelection("bb-vectorize-debug-candidate-selection",
165  cl::init(false), cl::Hidden,
166  cl::desc("When debugging is enabled, output information on the"
167  " candidate-selection process"));
168 static cl::opt<bool>
169 DebugPairSelection("bb-vectorize-debug-pair-selection",
170  cl::init(false), cl::Hidden,
171  cl::desc("When debugging is enabled, output information on the"
172  " pair-selection process"));
173 static cl::opt<bool>
174 DebugCycleCheck("bb-vectorize-debug-cycle-check",
175  cl::init(false), cl::Hidden,
176  cl::desc("When debugging is enabled, output information on the"
177  " cycle-checking process"));
178 
179 static cl::opt<bool>
180 PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair",
181  cl::init(false), cl::Hidden,
182  cl::desc("When debugging is enabled, dump the basic block after"
183  " every pair is fused"));
184 #endif
185 
186 STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize");
187 
188 namespace {
189  struct BBVectorize : public BasicBlockPass {
190  static char ID; // Pass identification, replacement for typeid
191 
192  const VectorizeConfig Config;
193 
194  BBVectorize(const VectorizeConfig &C = VectorizeConfig())
195  : BasicBlockPass(ID), Config(C) {
197  }
198 
199  BBVectorize(Pass *P, const VectorizeConfig &C)
200  : BasicBlockPass(ID), Config(C) {
201  AA = &P->getAnalysis<AliasAnalysis>();
202  DT = &P->getAnalysis<DominatorTree>();
203  SE = &P->getAnalysis<ScalarEvolution>();
206  }
207 
208  typedef std::pair<Value *, Value *> ValuePair;
209  typedef std::pair<ValuePair, int> ValuePairWithCost;
210  typedef std::pair<ValuePair, size_t> ValuePairWithDepth;
211  typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair
212  typedef std::pair<VPPair, unsigned> VPPairWithType;
213 
214  AliasAnalysis *AA;
215  DominatorTree *DT;
216  ScalarEvolution *SE;
217  DataLayout *TD;
218  const TargetTransformInfo *TTI;
219 
220  // FIXME: const correct?
221 
222  bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false);
223 
224  bool getCandidatePairs(BasicBlock &BB,
225  BasicBlock::iterator &Start,
226  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
227  DenseSet<ValuePair> &FixedOrderPairs,
228  DenseMap<ValuePair, int> &CandidatePairCostSavings,
229  std::vector<Value *> &PairableInsts, bool NonPow2Len);
230 
231  // FIXME: The current implementation does not account for pairs that
232  // are connected in multiple ways. For example:
233  // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap)
234  enum PairConnectionType {
235  PairConnectionDirect,
236  PairConnectionSwap,
237  PairConnectionSplat
238  };
239 
240  void computeConnectedPairs(
241  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
242  DenseSet<ValuePair> &CandidatePairsSet,
243  std::vector<Value *> &PairableInsts,
244  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
245  DenseMap<VPPair, unsigned> &PairConnectionTypes);
246 
247  void buildDepMap(BasicBlock &BB,
248  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
249  std::vector<Value *> &PairableInsts,
250  DenseSet<ValuePair> &PairableInstUsers);
251 
252  void choosePairs(DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
253  DenseSet<ValuePair> &CandidatePairsSet,
254  DenseMap<ValuePair, int> &CandidatePairCostSavings,
255  std::vector<Value *> &PairableInsts,
256  DenseSet<ValuePair> &FixedOrderPairs,
257  DenseMap<VPPair, unsigned> &PairConnectionTypes,
258  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
259  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
260  DenseSet<ValuePair> &PairableInstUsers,
261  DenseMap<Value *, Value *>& ChosenPairs);
262 
263  void fuseChosenPairs(BasicBlock &BB,
264  std::vector<Value *> &PairableInsts,
265  DenseMap<Value *, Value *>& ChosenPairs,
266  DenseSet<ValuePair> &FixedOrderPairs,
267  DenseMap<VPPair, unsigned> &PairConnectionTypes,
268  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
269  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps);
270 
271 
272  bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
273 
274  bool areInstsCompatible(Instruction *I, Instruction *J,
275  bool IsSimpleLoadStore, bool NonPow2Len,
276  int &CostSavings, int &FixedOrder);
277 
278  bool trackUsesOfI(DenseSet<Value *> &Users,
279  AliasSetTracker &WriteSet, Instruction *I,
280  Instruction *J, bool UpdateUsers = true,
281  DenseSet<ValuePair> *LoadMoveSetPairs = 0);
282 
283  void computePairsConnectedTo(
284  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
285  DenseSet<ValuePair> &CandidatePairsSet,
286  std::vector<Value *> &PairableInsts,
287  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
288  DenseMap<VPPair, unsigned> &PairConnectionTypes,
289  ValuePair P);
290 
291  bool pairsConflict(ValuePair P, ValuePair Q,
292  DenseSet<ValuePair> &PairableInstUsers,
293  DenseMap<ValuePair, std::vector<ValuePair> >
294  *PairableInstUserMap = 0,
295  DenseSet<VPPair> *PairableInstUserPairSet = 0);
296 
297  bool pairWillFormCycle(ValuePair P,
298  DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers,
299  DenseSet<ValuePair> &CurrentPairs);
300 
301  void pruneDAGFor(
302  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
303  std::vector<Value *> &PairableInsts,
304  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
305  DenseSet<ValuePair> &PairableInstUsers,
306  DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
307  DenseSet<VPPair> &PairableInstUserPairSet,
308  DenseMap<Value *, Value *> &ChosenPairs,
310  DenseSet<ValuePair> &PrunedDAG, ValuePair J,
311  bool UseCycleCheck);
312 
313  void buildInitialDAGFor(
314  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
315  DenseSet<ValuePair> &CandidatePairsSet,
316  std::vector<Value *> &PairableInsts,
317  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
318  DenseSet<ValuePair> &PairableInstUsers,
319  DenseMap<Value *, Value *> &ChosenPairs,
320  DenseMap<ValuePair, size_t> &DAG, ValuePair J);
321 
322  void findBestDAGFor(
323  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
324  DenseSet<ValuePair> &CandidatePairsSet,
325  DenseMap<ValuePair, int> &CandidatePairCostSavings,
326  std::vector<Value *> &PairableInsts,
327  DenseSet<ValuePair> &FixedOrderPairs,
328  DenseMap<VPPair, unsigned> &PairConnectionTypes,
329  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
330  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
331  DenseSet<ValuePair> &PairableInstUsers,
332  DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
333  DenseSet<VPPair> &PairableInstUserPairSet,
334  DenseMap<Value *, Value *> &ChosenPairs,
335  DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
336  int &BestEffSize, Value *II, std::vector<Value *>&JJ,
337  bool UseCycleCheck);
338 
339  Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
340  Instruction *J, unsigned o);
341 
342  void fillNewShuffleMask(LLVMContext& Context, Instruction *J,
343  unsigned MaskOffset, unsigned NumInElem,
344  unsigned NumInElem1, unsigned IdxOffset,
345  std::vector<Constant*> &Mask);
346 
347  Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I,
348  Instruction *J);
349 
350  bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J,
351  unsigned o, Value *&LOp, unsigned numElemL,
352  Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ,
353  unsigned IdxOff = 0);
354 
355  Value *getReplacementInput(LLVMContext& Context, Instruction *I,
356  Instruction *J, unsigned o, bool IBeforeJ);
357 
358  void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
359  Instruction *J, SmallVectorImpl<Value *> &ReplacedOperands,
360  bool IBeforeJ);
361 
362  void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
363  Instruction *J, Instruction *K,
364  Instruction *&InsertionPt, Instruction *&K1,
365  Instruction *&K2);
366 
367  void collectPairLoadMoveSet(BasicBlock &BB,
368  DenseMap<Value *, Value *> &ChosenPairs,
369  DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
370  DenseSet<ValuePair> &LoadMoveSetPairs,
371  Instruction *I);
372 
373  void collectLoadMoveSet(BasicBlock &BB,
374  std::vector<Value *> &PairableInsts,
375  DenseMap<Value *, Value *> &ChosenPairs,
376  DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
377  DenseSet<ValuePair> &LoadMoveSetPairs);
378 
379  bool canMoveUsesOfIAfterJ(BasicBlock &BB,
380  DenseSet<ValuePair> &LoadMoveSetPairs,
381  Instruction *I, Instruction *J);
382 
383  void moveUsesOfIAfterJ(BasicBlock &BB,
384  DenseSet<ValuePair> &LoadMoveSetPairs,
385  Instruction *&InsertionPt,
386  Instruction *I, Instruction *J);
387 
388  void combineMetadata(Instruction *K, const Instruction *J);
389 
390  bool vectorizeBB(BasicBlock &BB) {
391  if (!DT->isReachableFromEntry(&BB)) {
392  DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() <<
393  " in " << BB.getParent()->getName() << "\n");
394  return false;
395  }
396 
397  DEBUG(if (TTI) dbgs() << "BBV: using target information\n");
398 
399  bool changed = false;
400  // Iterate a sufficient number of times to merge types of size 1 bit,
401  // then 2 bits, then 4, etc. up to half of the target vector width of the
402  // target vector register.
403  unsigned n = 1;
404  for (unsigned v = 2;
405  (TTI || v <= Config.VectorBits) &&
406  (!Config.MaxIter || n <= Config.MaxIter);
407  v *= 2, ++n) {
408  DEBUG(dbgs() << "BBV: fusing loop #" << n <<
409  " for " << BB.getName() << " in " <<
410  BB.getParent()->getName() << "...\n");
411  if (vectorizePairs(BB))
412  changed = true;
413  else
414  break;
415  }
416 
417  if (changed && !Pow2LenOnly) {
418  ++n;
419  for (; !Config.MaxIter || n <= Config.MaxIter; ++n) {
420  DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " <<
421  n << " for " << BB.getName() << " in " <<
422  BB.getParent()->getName() << "...\n");
423  if (!vectorizePairs(BB, true)) break;
424  }
425  }
426 
427  DEBUG(dbgs() << "BBV: done!\n");
428  return changed;
429  }
430 
431  virtual bool runOnBasicBlock(BasicBlock &BB) {
432  AA = &getAnalysis<AliasAnalysis>();
433  DT = &getAnalysis<DominatorTree>();
434  SE = &getAnalysis<ScalarEvolution>();
435  TD = getAnalysisIfAvailable<DataLayout>();
436  TTI = IgnoreTargetInfo ? 0 : &getAnalysis<TargetTransformInfo>();
437 
438  return vectorizeBB(BB);
439  }
440 
441  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
450  AU.setPreservesCFG();
451  }
452 
453  static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) {
454  assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() &&
455  "Cannot form vector from incompatible scalar types");
456  Type *STy = ElemTy->getScalarType();
457 
458  unsigned numElem;
459  if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) {
460  numElem = VTy->getNumElements();
461  } else {
462  numElem = 1;
463  }
464 
465  if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) {
466  numElem += VTy->getNumElements();
467  } else {
468  numElem += 1;
469  }
470 
471  return VectorType::get(STy, numElem);
472  }
473 
474  static inline void getInstructionTypes(Instruction *I,
475  Type *&T1, Type *&T2) {
476  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
477  // For stores, it is the value type, not the pointer type that matters
478  // because the value is what will come from a vector register.
479 
480  Value *IVal = SI->getValueOperand();
481  T1 = IVal->getType();
482  } else {
483  T1 = I->getType();
484  }
485 
486  if (CastInst *CI = dyn_cast<CastInst>(I))
487  T2 = CI->getSrcTy();
488  else
489  T2 = T1;
490 
491  if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
492  T2 = SI->getCondition()->getType();
493  } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) {
494  T2 = SI->getOperand(0)->getType();
495  } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) {
496  T2 = CI->getOperand(0)->getType();
497  }
498  }
499 
500  // Returns the weight associated with the provided value. A chain of
501  // candidate pairs has a length given by the sum of the weights of its
502  // members (one weight per pair; the weight of each member of the pair
503  // is assumed to be the same). This length is then compared to the
504  // chain-length threshold to determine if a given chain is significant
505  // enough to be vectorized. The length is also used in comparing
506  // candidate chains where longer chains are considered to be better.
507  // Note: when this function returns 0, the resulting instructions are
508  // not actually fused.
509  inline size_t getDepthFactor(Value *V) {
510  // InsertElement and ExtractElement have a depth factor of zero. This is
511  // for two reasons: First, they cannot be usefully fused. Second, because
512  // the pass generates a lot of these, they can confuse the simple metric
513  // used to compare the dags in the next iteration. Thus, giving them a
514  // weight of zero allows the pass to essentially ignore them in
515  // subsequent iterations when looking for vectorization opportunities
516  // while still tracking dependency chains that flow through those
517  // instructions.
518  if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V))
519  return 0;
520 
521  // Give a load or store half of the required depth so that load/store
522  // pairs will vectorize.
523  if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V)))
524  return Config.ReqChainDepth/2;
525 
526  return 1;
527  }
528 
529  // Returns the cost of the provided instruction using TTI.
530  // This does not handle loads and stores.
531  unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2) {
532  switch (Opcode) {
533  default: break;
534  case Instruction::GetElementPtr:
535  // We mark this instruction as zero-cost because scalar GEPs are usually
536  // lowered to the instruction addressing mode. At the moment we don't
537  // generate vector GEPs.
538  return 0;
539  case Instruction::Br:
540  return TTI->getCFInstrCost(Opcode);
541  case Instruction::PHI:
542  return 0;
543  case Instruction::Add:
544  case Instruction::FAdd:
545  case Instruction::Sub:
546  case Instruction::FSub:
547  case Instruction::Mul:
548  case Instruction::FMul:
549  case Instruction::UDiv:
550  case Instruction::SDiv:
551  case Instruction::FDiv:
552  case Instruction::URem:
553  case Instruction::SRem:
554  case Instruction::FRem:
555  case Instruction::Shl:
556  case Instruction::LShr:
557  case Instruction::AShr:
558  case Instruction::And:
559  case Instruction::Or:
560  case Instruction::Xor:
561  return TTI->getArithmeticInstrCost(Opcode, T1);
562  case Instruction::Select:
563  case Instruction::ICmp:
564  case Instruction::FCmp:
565  return TTI->getCmpSelInstrCost(Opcode, T1, T2);
566  case Instruction::ZExt:
567  case Instruction::SExt:
568  case Instruction::FPToUI:
569  case Instruction::FPToSI:
570  case Instruction::FPExt:
571  case Instruction::PtrToInt:
572  case Instruction::IntToPtr:
573  case Instruction::SIToFP:
574  case Instruction::UIToFP:
575  case Instruction::Trunc:
576  case Instruction::FPTrunc:
577  case Instruction::BitCast:
578  case Instruction::ShuffleVector:
579  return TTI->getCastInstrCost(Opcode, T1, T2);
580  }
581 
582  return 1;
583  }
584 
585  // This determines the relative offset of two loads or stores, returning
586  // true if the offset could be determined to be some constant value.
587  // For example, if OffsetInElmts == 1, then J accesses the memory directly
588  // after I; if OffsetInElmts == -1 then I accesses the memory
589  // directly after J.
590  bool getPairPtrInfo(Instruction *I, Instruction *J,
591  Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
592  unsigned &IAddressSpace, unsigned &JAddressSpace,
593  int64_t &OffsetInElmts, bool ComputeOffset = true) {
594  OffsetInElmts = 0;
595  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
596  LoadInst *LJ = cast<LoadInst>(J);
597  IPtr = LI->getPointerOperand();
598  JPtr = LJ->getPointerOperand();
599  IAlignment = LI->getAlignment();
600  JAlignment = LJ->getAlignment();
601  IAddressSpace = LI->getPointerAddressSpace();
602  JAddressSpace = LJ->getPointerAddressSpace();
603  } else {
604  StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J);
605  IPtr = SI->getPointerOperand();
606  JPtr = SJ->getPointerOperand();
607  IAlignment = SI->getAlignment();
608  JAlignment = SJ->getAlignment();
609  IAddressSpace = SI->getPointerAddressSpace();
610  JAddressSpace = SJ->getPointerAddressSpace();
611  }
612 
613  if (!ComputeOffset)
614  return true;
615 
616  const SCEV *IPtrSCEV = SE->getSCEV(IPtr);
617  const SCEV *JPtrSCEV = SE->getSCEV(JPtr);
618 
619  // If this is a trivial offset, then we'll get something like
620  // 1*sizeof(type). With target data, which we need anyway, this will get
621  // constant folded into a number.
622  const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV);
623  if (const SCEVConstant *ConstOffSCEV =
624  dyn_cast<SCEVConstant>(OffsetSCEV)) {
625  ConstantInt *IntOff = ConstOffSCEV->getValue();
626  int64_t Offset = IntOff->getSExtValue();
627 
628  Type *VTy = IPtr->getType()->getPointerElementType();
629  int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy);
630 
631  Type *VTy2 = JPtr->getType()->getPointerElementType();
632  if (VTy != VTy2 && Offset < 0) {
633  int64_t VTy2TSS = (int64_t) TD->getTypeStoreSize(VTy2);
634  OffsetInElmts = Offset/VTy2TSS;
635  return (abs64(Offset) % VTy2TSS) == 0;
636  }
637 
638  OffsetInElmts = Offset/VTyTSS;
639  return (abs64(Offset) % VTyTSS) == 0;
640  }
641 
642  return false;
643  }
644 
645  // Returns true if the provided CallInst represents an intrinsic that can
646  // be vectorized.
647  bool isVectorizableIntrinsic(CallInst* I) {
648  Function *F = I->getCalledFunction();
649  if (!F) return false;
650 
652  if (!IID) return false;
653 
654  switch(IID) {
655  default:
656  return false;
657  case Intrinsic::sqrt:
658  case Intrinsic::powi:
659  case Intrinsic::sin:
660  case Intrinsic::cos:
661  case Intrinsic::log:
662  case Intrinsic::log2:
663  case Intrinsic::log10:
664  case Intrinsic::exp:
665  case Intrinsic::exp2:
666  case Intrinsic::pow:
667  return Config.VectorizeMath;
668  case Intrinsic::fma:
669  case Intrinsic::fmuladd:
670  return Config.VectorizeFMA;
671  }
672  }
673 
674  bool isPureIEChain(InsertElementInst *IE) {
675  InsertElementInst *IENext = IE;
676  do {
677  if (!isa<UndefValue>(IENext->getOperand(0)) &&
678  !isa<InsertElementInst>(IENext->getOperand(0))) {
679  return false;
680  }
681  } while ((IENext =
682  dyn_cast<InsertElementInst>(IENext->getOperand(0))));
683 
684  return true;
685  }
686  };
687 
688  // This function implements one vectorization iteration on the provided
689  // basic block. It returns true if the block is changed.
690  bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) {
691  bool ShouldContinue;
693 
694  std::vector<Value *> AllPairableInsts;
695  DenseMap<Value *, Value *> AllChosenPairs;
696  DenseSet<ValuePair> AllFixedOrderPairs;
697  DenseMap<VPPair, unsigned> AllPairConnectionTypes;
698  DenseMap<ValuePair, std::vector<ValuePair> > AllConnectedPairs,
699  AllConnectedPairDeps;
700 
701  do {
702  std::vector<Value *> PairableInsts;
704  DenseSet<ValuePair> FixedOrderPairs;
705  DenseMap<ValuePair, int> CandidatePairCostSavings;
706  ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs,
707  FixedOrderPairs,
708  CandidatePairCostSavings,
709  PairableInsts, NonPow2Len);
710  if (PairableInsts.empty()) continue;
711 
712  // Build the candidate pair set for faster lookups.
713  DenseSet<ValuePair> CandidatePairsSet;
714  for (DenseMap<Value *, std::vector<Value *> >::iterator I =
715  CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I)
716  for (std::vector<Value *>::iterator J = I->second.begin(),
717  JE = I->second.end(); J != JE; ++J)
718  CandidatePairsSet.insert(ValuePair(I->first, *J));
719 
720  // Now we have a map of all of the pairable instructions and we need to
721  // select the best possible pairing. A good pairing is one such that the
722  // users of the pair are also paired. This defines a (directed) forest
723  // over the pairs such that two pairs are connected iff the second pair
724  // uses the first.
725 
726  // Note that it only matters that both members of the second pair use some
727  // element of the first pair (to allow for splatting).
728 
730  ConnectedPairDeps;
731  DenseMap<VPPair, unsigned> PairConnectionTypes;
732  computeConnectedPairs(CandidatePairs, CandidatePairsSet,
733  PairableInsts, ConnectedPairs, PairConnectionTypes);
734  if (ConnectedPairs.empty()) continue;
735 
736  for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
737  I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
738  I != IE; ++I)
739  for (std::vector<ValuePair>::iterator J = I->second.begin(),
740  JE = I->second.end(); J != JE; ++J)
741  ConnectedPairDeps[*J].push_back(I->first);
742 
743  // Build the pairable-instruction dependency map
744  DenseSet<ValuePair> PairableInstUsers;
745  buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
746 
747  // There is now a graph of the connected pairs. For each variable, pick
748  // the pairing with the largest dag meeting the depth requirement on at
749  // least one branch. Then select all pairings that are part of that dag
750  // and remove them from the list of available pairings and pairable
751  // variables.
752 
753  DenseMap<Value *, Value *> ChosenPairs;
754  choosePairs(CandidatePairs, CandidatePairsSet,
755  CandidatePairCostSavings,
756  PairableInsts, FixedOrderPairs, PairConnectionTypes,
757  ConnectedPairs, ConnectedPairDeps,
758  PairableInstUsers, ChosenPairs);
759 
760  if (ChosenPairs.empty()) continue;
761  AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(),
762  PairableInsts.end());
763  AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end());
764 
765  // Only for the chosen pairs, propagate information on fixed-order pairs,
766  // pair connections, and their types to the data structures used by the
767  // pair fusion procedures.
768  for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(),
769  IE = ChosenPairs.end(); I != IE; ++I) {
770  if (FixedOrderPairs.count(*I))
771  AllFixedOrderPairs.insert(*I);
772  else if (FixedOrderPairs.count(ValuePair(I->second, I->first)))
773  AllFixedOrderPairs.insert(ValuePair(I->second, I->first));
774 
775  for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin();
776  J != IE; ++J) {
778  PairConnectionTypes.find(VPPair(*I, *J));
779  if (K != PairConnectionTypes.end()) {
780  AllPairConnectionTypes.insert(*K);
781  } else {
782  K = PairConnectionTypes.find(VPPair(*J, *I));
783  if (K != PairConnectionTypes.end())
784  AllPairConnectionTypes.insert(*K);
785  }
786  }
787  }
788 
789  for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
790  I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
791  I != IE; ++I)
792  for (std::vector<ValuePair>::iterator J = I->second.begin(),
793  JE = I->second.end(); J != JE; ++J)
794  if (AllPairConnectionTypes.count(VPPair(I->first, *J))) {
795  AllConnectedPairs[I->first].push_back(*J);
796  AllConnectedPairDeps[*J].push_back(I->first);
797  }
798  } while (ShouldContinue);
799 
800  if (AllChosenPairs.empty()) return false;
801  NumFusedOps += AllChosenPairs.size();
802 
803  // A set of pairs has now been selected. It is now necessary to replace the
804  // paired instructions with vector instructions. For this procedure each
805  // operand must be replaced with a vector operand. This vector is formed
806  // by using build_vector on the old operands. The replaced values are then
807  // replaced with a vector_extract on the result. Subsequent optimization
808  // passes should coalesce the build/extract combinations.
809 
810  fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs,
811  AllPairConnectionTypes,
812  AllConnectedPairs, AllConnectedPairDeps);
813 
814  // It is important to cleanup here so that future iterations of this
815  // function have less work to do.
816  (void) SimplifyInstructionsInBlock(&BB, TD, AA->getTargetLibraryInfo());
817  return true;
818  }
819 
820  // This function returns true if the provided instruction is capable of being
821  // fused into a vector instruction. This determination is based only on the
822  // type and other attributes of the instruction.
823  bool BBVectorize::isInstVectorizable(Instruction *I,
824  bool &IsSimpleLoadStore) {
825  IsSimpleLoadStore = false;
826 
827  if (CallInst *C = dyn_cast<CallInst>(I)) {
828  if (!isVectorizableIntrinsic(C))
829  return false;
830  } else if (LoadInst *L = dyn_cast<LoadInst>(I)) {
831  // Vectorize simple loads if possbile:
832  IsSimpleLoadStore = L->isSimple();
833  if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
834  return false;
835  } else if (StoreInst *S = dyn_cast<StoreInst>(I)) {
836  // Vectorize simple stores if possbile:
837  IsSimpleLoadStore = S->isSimple();
838  if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
839  return false;
840  } else if (CastInst *C = dyn_cast<CastInst>(I)) {
841  // We can vectorize casts, but not casts of pointer types, etc.
842  if (!Config.VectorizeCasts)
843  return false;
844 
845  Type *SrcTy = C->getSrcTy();
846  if (!SrcTy->isSingleValueType())
847  return false;
848 
849  Type *DestTy = C->getDestTy();
850  if (!DestTy->isSingleValueType())
851  return false;
852  } else if (isa<SelectInst>(I)) {
853  if (!Config.VectorizeSelect)
854  return false;
855  } else if (isa<CmpInst>(I)) {
856  if (!Config.VectorizeCmp)
857  return false;
858  } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) {
859  if (!Config.VectorizeGEP)
860  return false;
861 
862  // Currently, vector GEPs exist only with one index.
863  if (G->getNumIndices() != 1)
864  return false;
865  } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) ||
866  isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) {
867  return false;
868  }
869 
870  // We can't vectorize memory operations without target data
871  if (TD == 0 && IsSimpleLoadStore)
872  return false;
873 
874  Type *T1, *T2;
875  getInstructionTypes(I, T1, T2);
876 
877  // Not every type can be vectorized...
878  if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) ||
880  return false;
881 
882  if (T1->getScalarSizeInBits() == 1) {
883  if (!Config.VectorizeBools)
884  return false;
885  } else {
886  if (!Config.VectorizeInts && T1->isIntOrIntVectorTy())
887  return false;
888  }
889 
890  if (T2->getScalarSizeInBits() == 1) {
891  if (!Config.VectorizeBools)
892  return false;
893  } else {
894  if (!Config.VectorizeInts && T2->isIntOrIntVectorTy())
895  return false;
896  }
897 
898  if (!Config.VectorizeFloats
899  && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy()))
900  return false;
901 
902  // Don't vectorize target-specific types.
903  if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy())
904  return false;
905  if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy())
906  return false;
907 
908  if ((!Config.VectorizePointers || TD == 0) &&
909  (T1->getScalarType()->isPointerTy() ||
910  T2->getScalarType()->isPointerTy()))
911  return false;
912 
913  if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
914  T2->getPrimitiveSizeInBits() >= Config.VectorBits))
915  return false;
916 
917  return true;
918  }
919 
920  // This function returns true if the two provided instructions are compatible
921  // (meaning that they can be fused into a vector instruction). This assumes
922  // that I has already been determined to be vectorizable and that J is not
923  // in the use dag of I.
924  bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
925  bool IsSimpleLoadStore, bool NonPow2Len,
926  int &CostSavings, int &FixedOrder) {
927  DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I <<
928  " <-> " << *J << "\n");
929 
930  CostSavings = 0;
931  FixedOrder = 0;
932 
933  // Loads and stores can be merged if they have different alignments,
934  // but are otherwise the same.
936  (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0)))
937  return false;
938 
939  Type *IT1, *IT2, *JT1, *JT2;
940  getInstructionTypes(I, IT1, IT2);
941  getInstructionTypes(J, JT1, JT2);
942  unsigned MaxTypeBits = std::max(
945  if (!TTI && MaxTypeBits > Config.VectorBits)
946  return false;
947 
948  // FIXME: handle addsub-type operations!
949 
950  if (IsSimpleLoadStore) {
951  Value *IPtr, *JPtr;
952  unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
953  int64_t OffsetInElmts = 0;
954  if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
955  IAddressSpace, JAddressSpace,
956  OffsetInElmts) && abs64(OffsetInElmts) == 1) {
957  FixedOrder = (int) OffsetInElmts;
958  unsigned BottomAlignment = IAlignment;
959  if (OffsetInElmts < 0) BottomAlignment = JAlignment;
960 
961  Type *aTypeI = isa<StoreInst>(I) ?
962  cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
963  Type *aTypeJ = isa<StoreInst>(J) ?
964  cast<StoreInst>(J)->getValueOperand()->getType() : J->getType();
965  Type *VType = getVecTypeForPair(aTypeI, aTypeJ);
966 
967  if (Config.AlignedOnly) {
968  // An aligned load or store is possible only if the instruction
969  // with the lower offset has an alignment suitable for the
970  // vector type.
971 
972  unsigned VecAlignment = TD->getPrefTypeAlignment(VType);
973  if (BottomAlignment < VecAlignment)
974  return false;
975  }
976 
977  if (TTI) {
978  unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI,
979  IAlignment, IAddressSpace);
980  unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ,
981  JAlignment, JAddressSpace);
982  unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType,
983  BottomAlignment,
984  IAddressSpace);
985 
986  ICost += TTI->getAddressComputationCost(aTypeI);
987  JCost += TTI->getAddressComputationCost(aTypeJ);
988  VCost += TTI->getAddressComputationCost(VType);
989 
990  if (VCost > ICost + JCost)
991  return false;
992 
993  // We don't want to fuse to a type that will be split, even
994  // if the two input types will also be split and there is no other
995  // associated cost.
996  unsigned VParts = TTI->getNumberOfParts(VType);
997  if (VParts > 1)
998  return false;
999  else if (!VParts && VCost == ICost + JCost)
1000  return false;
1001 
1002  CostSavings = ICost + JCost - VCost;
1003  }
1004  } else {
1005  return false;
1006  }
1007  } else if (TTI) {
1008  unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2);
1009  unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2);
1010  Type *VT1 = getVecTypeForPair(IT1, JT1),
1011  *VT2 = getVecTypeForPair(IT2, JT2);
1012 
1013  // Note that this procedure is incorrect for insert and extract element
1014  // instructions (because combining these often results in a shuffle),
1015  // but this cost is ignored (because insert and extract element
1016  // instructions are assigned a zero depth factor and are not really
1017  // fused in general).
1018  unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2);
1019 
1020  if (VCost > ICost + JCost)
1021  return false;
1022 
1023  // We don't want to fuse to a type that will be split, even
1024  // if the two input types will also be split and there is no other
1025  // associated cost.
1026  unsigned VParts1 = TTI->getNumberOfParts(VT1),
1027  VParts2 = TTI->getNumberOfParts(VT2);
1028  if (VParts1 > 1 || VParts2 > 1)
1029  return false;
1030  else if ((!VParts1 || !VParts2) && VCost == ICost + JCost)
1031  return false;
1032 
1033  CostSavings = ICost + JCost - VCost;
1034  }
1035 
1036  // The powi intrinsic is special because only the first argument is
1037  // vectorized, the second arguments must be equal.
1038  CallInst *CI = dyn_cast<CallInst>(I);
1039  Function *FI;
1040  if (CI && (FI = CI->getCalledFunction())) {
1042  if (IID == Intrinsic::powi) {
1043  Value *A1I = CI->getArgOperand(1),
1044  *A1J = cast<CallInst>(J)->getArgOperand(1);
1045  const SCEV *A1ISCEV = SE->getSCEV(A1I),
1046  *A1JSCEV = SE->getSCEV(A1J);
1047  return (A1ISCEV == A1JSCEV);
1048  }
1049 
1050  if (IID && TTI) {
1052  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
1053  Tys.push_back(CI->getArgOperand(i)->getType());
1054  unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys);
1055 
1056  Tys.clear();
1057  CallInst *CJ = cast<CallInst>(J);
1058  for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i)
1059  Tys.push_back(CJ->getArgOperand(i)->getType());
1060  unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys);
1061 
1062  Tys.clear();
1063  assert(CI->getNumArgOperands() == CJ->getNumArgOperands() &&
1064  "Intrinsic argument counts differ");
1065  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
1066  if (IID == Intrinsic::powi && i == 1)
1067  Tys.push_back(CI->getArgOperand(i)->getType());
1068  else
1069  Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(),
1070  CJ->getArgOperand(i)->getType()));
1071  }
1072 
1073  Type *RetTy = getVecTypeForPair(IT1, JT1);
1074  unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys);
1075 
1076  if (VCost > ICost + JCost)
1077  return false;
1078 
1079  // We don't want to fuse to a type that will be split, even
1080  // if the two input types will also be split and there is no other
1081  // associated cost.
1082  unsigned RetParts = TTI->getNumberOfParts(RetTy);
1083  if (RetParts > 1)
1084  return false;
1085  else if (!RetParts && VCost == ICost + JCost)
1086  return false;
1087 
1088  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
1089  if (!Tys[i]->isVectorTy())
1090  continue;
1091 
1092  unsigned NumParts = TTI->getNumberOfParts(Tys[i]);
1093  if (NumParts > 1)
1094  return false;
1095  else if (!NumParts && VCost == ICost + JCost)
1096  return false;
1097  }
1098 
1099  CostSavings = ICost + JCost - VCost;
1100  }
1101  }
1102 
1103  return true;
1104  }
1105 
1106  // Figure out whether or not J uses I and update the users and write-set
1107  // structures associated with I. Specifically, Users represents the set of
1108  // instructions that depend on I. WriteSet represents the set
1109  // of memory locations that are dependent on I. If UpdateUsers is true,
1110  // and J uses I, then Users is updated to contain J and WriteSet is updated
1111  // to contain any memory locations to which J writes. The function returns
1112  // true if J uses I. By default, alias analysis is used to determine
1113  // whether J reads from memory that overlaps with a location in WriteSet.
1114  // If LoadMoveSet is not null, then it is a previously-computed map
1115  // where the key is the memory-based user instruction and the value is
1116  // the instruction to be compared with I. So, if LoadMoveSet is provided,
1117  // then the alias analysis is not used. This is necessary because this
1118  // function is called during the process of moving instructions during
1119  // vectorization and the results of the alias analysis are not stable during
1120  // that process.
1121  bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users,
1122  AliasSetTracker &WriteSet, Instruction *I,
1123  Instruction *J, bool UpdateUsers,
1124  DenseSet<ValuePair> *LoadMoveSetPairs) {
1125  bool UsesI = false;
1126 
1127  // This instruction may already be marked as a user due, for example, to
1128  // being a member of a selected pair.
1129  if (Users.count(J))
1130  UsesI = true;
1131 
1132  if (!UsesI)
1133  for (User::op_iterator JU = J->op_begin(), JE = J->op_end();
1134  JU != JE; ++JU) {
1135  Value *V = *JU;
1136  if (I == V || Users.count(V)) {
1137  UsesI = true;
1138  break;
1139  }
1140  }
1141  if (!UsesI && J->mayReadFromMemory()) {
1142  if (LoadMoveSetPairs) {
1143  UsesI = LoadMoveSetPairs->count(ValuePair(J, I));
1144  } else {
1145  for (AliasSetTracker::iterator W = WriteSet.begin(),
1146  WE = WriteSet.end(); W != WE; ++W) {
1147  if (W->aliasesUnknownInst(J, *AA)) {
1148  UsesI = true;
1149  break;
1150  }
1151  }
1152  }
1153  }
1154 
1155  if (UsesI && UpdateUsers) {
1156  if (J->mayWriteToMemory()) WriteSet.add(J);
1157  Users.insert(J);
1158  }
1159 
1160  return UsesI;
1161  }
1162 
1163  // This function iterates over all instruction pairs in the provided
1164  // basic block and collects all candidate pairs for vectorization.
1165  bool BBVectorize::getCandidatePairs(BasicBlock &BB,
1166  BasicBlock::iterator &Start,
1167  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1168  DenseSet<ValuePair> &FixedOrderPairs,
1169  DenseMap<ValuePair, int> &CandidatePairCostSavings,
1170  std::vector<Value *> &PairableInsts, bool NonPow2Len) {
1171  size_t TotalPairs = 0;
1172  BasicBlock::iterator E = BB.end();
1173  if (Start == E) return false;
1174 
1175  bool ShouldContinue = false, IAfterStart = false;
1176  for (BasicBlock::iterator I = Start++; I != E; ++I) {
1177  if (I == Start) IAfterStart = true;
1178 
1179  bool IsSimpleLoadStore;
1180  if (!isInstVectorizable(I, IsSimpleLoadStore)) continue;
1181 
1182  // Look for an instruction with which to pair instruction *I...
1184  AliasSetTracker WriteSet(*AA);
1185  if (I->mayWriteToMemory()) WriteSet.add(I);
1186 
1187  bool JAfterStart = IAfterStart;
1189  for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) {
1190  if (J == Start) JAfterStart = true;
1191 
1192  // Determine if J uses I, if so, exit the loop.
1193  bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep);
1194  if (Config.FastDep) {
1195  // Note: For this heuristic to be effective, independent operations
1196  // must tend to be intermixed. This is likely to be true from some
1197  // kinds of grouped loop unrolling (but not the generic LLVM pass),
1198  // but otherwise may require some kind of reordering pass.
1199 
1200  // When using fast dependency analysis,
1201  // stop searching after first use:
1202  if (UsesI) break;
1203  } else {
1204  if (UsesI) continue;
1205  }
1206 
1207  // J does not use I, and comes before the first use of I, so it can be
1208  // merged with I if the instructions are compatible.
1209  int CostSavings, FixedOrder;
1210  if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len,
1211  CostSavings, FixedOrder)) continue;
1212 
1213  // J is a candidate for merging with I.
1214  if (!PairableInsts.size() ||
1215  PairableInsts[PairableInsts.size()-1] != I) {
1216  PairableInsts.push_back(I);
1217  }
1218 
1219  CandidatePairs[I].push_back(J);
1220  ++TotalPairs;
1221  if (TTI)
1222  CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J),
1223  CostSavings));
1224 
1225  if (FixedOrder == 1)
1226  FixedOrderPairs.insert(ValuePair(I, J));
1227  else if (FixedOrder == -1)
1228  FixedOrderPairs.insert(ValuePair(J, I));
1229 
1230  // The next call to this function must start after the last instruction
1231  // selected during this invocation.
1232  if (JAfterStart) {
1233  Start = llvm::next(J);
1234  IAfterStart = JAfterStart = false;
1235  }
1236 
1237  DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair "
1238  << *I << " <-> " << *J << " (cost savings: " <<
1239  CostSavings << ")\n");
1240 
1241  // If we have already found too many pairs, break here and this function
1242  // will be called again starting after the last instruction selected
1243  // during this invocation.
1244  if (PairableInsts.size() >= Config.MaxInsts ||
1245  TotalPairs >= Config.MaxPairs) {
1246  ShouldContinue = true;
1247  break;
1248  }
1249  }
1250 
1251  if (ShouldContinue)
1252  break;
1253  }
1254 
1255  DEBUG(dbgs() << "BBV: found " << PairableInsts.size()
1256  << " instructions with candidate pairs\n");
1257 
1258  return ShouldContinue;
1259  }
1260 
1261  // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that
1262  // it looks for pairs such that both members have an input which is an
1263  // output of PI or PJ.
1264  void BBVectorize::computePairsConnectedTo(
1265  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1266  DenseSet<ValuePair> &CandidatePairsSet,
1267  std::vector<Value *> &PairableInsts,
1268  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1269  DenseMap<VPPair, unsigned> &PairConnectionTypes,
1270  ValuePair P) {
1271  StoreInst *SI, *SJ;
1272 
1273  // For each possible pairing for this variable, look at the uses of
1274  // the first value...
1275  for (Value::use_iterator I = P.first->use_begin(),
1276  E = P.first->use_end(); I != E; ++I) {
1277  if (isa<LoadInst>(*I)) {
1278  // A pair cannot be connected to a load because the load only takes one
1279  // operand (the address) and it is a scalar even after vectorization.
1280  continue;
1281  } else if ((SI = dyn_cast<StoreInst>(*I)) &&
1282  P.first == SI->getPointerOperand()) {
1283  // Similarly, a pair cannot be connected to a store through its
1284  // pointer operand.
1285  continue;
1286  }
1287 
1288  // For each use of the first variable, look for uses of the second
1289  // variable...
1290  for (Value::use_iterator J = P.second->use_begin(),
1291  E2 = P.second->use_end(); J != E2; ++J) {
1292  if ((SJ = dyn_cast<StoreInst>(*J)) &&
1293  P.second == SJ->getPointerOperand())
1294  continue;
1295 
1296  // Look for <I, J>:
1297  if (CandidatePairsSet.count(ValuePair(*I, *J))) {
1298  VPPair VP(P, ValuePair(*I, *J));
1299  ConnectedPairs[VP.first].push_back(VP.second);
1300  PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect));
1301  }
1302 
1303  // Look for <J, I>:
1304  if (CandidatePairsSet.count(ValuePair(*J, *I))) {
1305  VPPair VP(P, ValuePair(*J, *I));
1306  ConnectedPairs[VP.first].push_back(VP.second);
1307  PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap));
1308  }
1309  }
1310 
1311  if (Config.SplatBreaksChain) continue;
1312  // Look for cases where just the first value in the pair is used by
1313  // both members of another pair (splatting).
1314  for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) {
1315  if ((SJ = dyn_cast<StoreInst>(*J)) &&
1316  P.first == SJ->getPointerOperand())
1317  continue;
1318 
1319  if (CandidatePairsSet.count(ValuePair(*I, *J))) {
1320  VPPair VP(P, ValuePair(*I, *J));
1321  ConnectedPairs[VP.first].push_back(VP.second);
1322  PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
1323  }
1324  }
1325  }
1326 
1327  if (Config.SplatBreaksChain) return;
1328  // Look for cases where just the second value in the pair is used by
1329  // both members of another pair (splatting).
1330  for (Value::use_iterator I = P.second->use_begin(),
1331  E = P.second->use_end(); I != E; ++I) {
1332  if (isa<LoadInst>(*I))
1333  continue;
1334  else if ((SI = dyn_cast<StoreInst>(*I)) &&
1335  P.second == SI->getPointerOperand())
1336  continue;
1337 
1338  for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) {
1339  if ((SJ = dyn_cast<StoreInst>(*J)) &&
1340  P.second == SJ->getPointerOperand())
1341  continue;
1342 
1343  if (CandidatePairsSet.count(ValuePair(*I, *J))) {
1344  VPPair VP(P, ValuePair(*I, *J));
1345  ConnectedPairs[VP.first].push_back(VP.second);
1346  PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
1347  }
1348  }
1349  }
1350  }
1351 
1352  // This function figures out which pairs are connected. Two pairs are
1353  // connected if some output of the first pair forms an input to both members
1354  // of the second pair.
1355  void BBVectorize::computeConnectedPairs(
1356  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1357  DenseSet<ValuePair> &CandidatePairsSet,
1358  std::vector<Value *> &PairableInsts,
1359  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1360  DenseMap<VPPair, unsigned> &PairConnectionTypes) {
1361  for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
1362  PE = PairableInsts.end(); PI != PE; ++PI) {
1364  CandidatePairs.find(*PI);
1365  if (PP == CandidatePairs.end())
1366  continue;
1367 
1368  for (std::vector<Value *>::iterator P = PP->second.begin(),
1369  E = PP->second.end(); P != E; ++P)
1370  computePairsConnectedTo(CandidatePairs, CandidatePairsSet,
1371  PairableInsts, ConnectedPairs,
1372  PairConnectionTypes, ValuePair(*PI, *P));
1373  }
1374 
1375  DEBUG(size_t TotalPairs = 0;
1376  for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I =
1377  ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I)
1378  TotalPairs += I->second.size();
1379  dbgs() << "BBV: found " << TotalPairs
1380  << " pair connections.\n");
1381  }
1382 
1383  // This function builds a set of use tuples such that <A, B> is in the set
1384  // if B is in the use dag of A. If B is in the use dag of A, then B
1385  // depends on the output of A.
1386  void BBVectorize::buildDepMap(
1387  BasicBlock &BB,
1388  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1389  std::vector<Value *> &PairableInsts,
1390  DenseSet<ValuePair> &PairableInstUsers) {
1391  DenseSet<Value *> IsInPair;
1392  for (DenseMap<Value *, std::vector<Value *> >::iterator C =
1393  CandidatePairs.begin(), E = CandidatePairs.end(); C != E; ++C) {
1394  IsInPair.insert(C->first);
1395  IsInPair.insert(C->second.begin(), C->second.end());
1396  }
1397 
1398  // Iterate through the basic block, recording all users of each
1399  // pairable instruction.
1400 
1401  BasicBlock::iterator E = BB.end(), EL =
1402  BasicBlock::iterator(cast<Instruction>(PairableInsts.back()));
1403  for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
1404  if (IsInPair.find(I) == IsInPair.end()) continue;
1405 
1407  AliasSetTracker WriteSet(*AA);
1408  if (I->mayWriteToMemory()) WriteSet.add(I);
1409 
1410  for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) {
1411  (void) trackUsesOfI(Users, WriteSet, I, J);
1412 
1413  if (J == EL)
1414  break;
1415  }
1416 
1417  for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end();
1418  U != E; ++U) {
1419  if (IsInPair.find(*U) == IsInPair.end()) continue;
1420  PairableInstUsers.insert(ValuePair(I, *U));
1421  }
1422 
1423  if (I == EL)
1424  break;
1425  }
1426  }
1427 
1428  // Returns true if an input to pair P is an output of pair Q and also an
1429  // input of pair Q is an output of pair P. If this is the case, then these
1430  // two pairs cannot be simultaneously fused.
1431  bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q,
1432  DenseSet<ValuePair> &PairableInstUsers,
1433  DenseMap<ValuePair, std::vector<ValuePair> > *PairableInstUserMap,
1434  DenseSet<VPPair> *PairableInstUserPairSet) {
1435  // Two pairs are in conflict if they are mutual Users of eachother.
1436  bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) ||
1437  PairableInstUsers.count(ValuePair(P.first, Q.second)) ||
1438  PairableInstUsers.count(ValuePair(P.second, Q.first)) ||
1439  PairableInstUsers.count(ValuePair(P.second, Q.second));
1440  bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) ||
1441  PairableInstUsers.count(ValuePair(Q.first, P.second)) ||
1442  PairableInstUsers.count(ValuePair(Q.second, P.first)) ||
1443  PairableInstUsers.count(ValuePair(Q.second, P.second));
1444  if (PairableInstUserMap) {
1445  // FIXME: The expensive part of the cycle check is not so much the cycle
1446  // check itself but this edge insertion procedure. This needs some
1447  // profiling and probably a different data structure.
1448  if (PUsesQ) {
1449  if (PairableInstUserPairSet->insert(VPPair(Q, P)).second)
1450  (*PairableInstUserMap)[Q].push_back(P);
1451  }
1452  if (QUsesP) {
1453  if (PairableInstUserPairSet->insert(VPPair(P, Q)).second)
1454  (*PairableInstUserMap)[P].push_back(Q);
1455  }
1456  }
1457 
1458  return (QUsesP && PUsesQ);
1459  }
1460 
1461  // This function walks the use graph of current pairs to see if, starting
1462  // from P, the walk returns to P.
1463  bool BBVectorize::pairWillFormCycle(ValuePair P,
1464  DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
1465  DenseSet<ValuePair> &CurrentPairs) {
1466  DEBUG(if (DebugCycleCheck)
1467  dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> "
1468  << *P.second << "\n");
1469  // A lookup table of visisted pairs is kept because the PairableInstUserMap
1470  // contains non-direct associations.
1471  DenseSet<ValuePair> Visited;
1473  // General depth-first post-order traversal:
1474  Q.push_back(P);
1475  do {
1476  ValuePair QTop = Q.pop_back_val();
1477  Visited.insert(QTop);
1478 
1479  DEBUG(if (DebugCycleCheck)
1480  dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> "
1481  << *QTop.second << "\n");
1483  PairableInstUserMap.find(QTop);
1484  if (QQ == PairableInstUserMap.end())
1485  continue;
1486 
1487  for (std::vector<ValuePair>::iterator C = QQ->second.begin(),
1488  CE = QQ->second.end(); C != CE; ++C) {
1489  if (*C == P) {
1490  DEBUG(dbgs()
1491  << "BBV: rejected to prevent non-trivial cycle formation: "
1492  << QTop.first << " <-> " << C->second << "\n");
1493  return true;
1494  }
1495 
1496  if (CurrentPairs.count(*C) && !Visited.count(*C))
1497  Q.push_back(*C);
1498  }
1499  } while (!Q.empty());
1500 
1501  return false;
1502  }
1503 
1504  // This function builds the initial dag of connected pairs with the
1505  // pair J at the root.
1506  void BBVectorize::buildInitialDAGFor(
1507  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1508  DenseSet<ValuePair> &CandidatePairsSet,
1509  std::vector<Value *> &PairableInsts,
1510  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1511  DenseSet<ValuePair> &PairableInstUsers,
1512  DenseMap<Value *, Value *> &ChosenPairs,
1513  DenseMap<ValuePair, size_t> &DAG, ValuePair J) {
1514  // Each of these pairs is viewed as the root node of a DAG. The DAG
1515  // is then walked (depth-first). As this happens, we keep track of
1516  // the pairs that compose the DAG and the maximum depth of the DAG.
1518  // General depth-first post-order traversal:
1519  Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
1520  do {
1521  ValuePairWithDepth QTop = Q.back();
1522 
1523  // Push each child onto the queue:
1524  bool MoreChildren = false;
1525  size_t MaxChildDepth = QTop.second;
1527  ConnectedPairs.find(QTop.first);
1528  if (QQ != ConnectedPairs.end())
1529  for (std::vector<ValuePair>::iterator k = QQ->second.begin(),
1530  ke = QQ->second.end(); k != ke; ++k) {
1531  // Make sure that this child pair is still a candidate:
1532  if (CandidatePairsSet.count(*k)) {
1534  if (C == DAG.end()) {
1535  size_t d = getDepthFactor(k->first);
1536  Q.push_back(ValuePairWithDepth(*k, QTop.second+d));
1537  MoreChildren = true;
1538  } else {
1539  MaxChildDepth = std::max(MaxChildDepth, C->second);
1540  }
1541  }
1542  }
1543 
1544  if (!MoreChildren) {
1545  // Record the current pair as part of the DAG:
1546  DAG.insert(ValuePairWithDepth(QTop.first, MaxChildDepth));
1547  Q.pop_back();
1548  }
1549  } while (!Q.empty());
1550  }
1551 
1552  // Given some initial dag, prune it by removing conflicting pairs (pairs
1553  // that cannot be simultaneously chosen for vectorization).
1554  void BBVectorize::pruneDAGFor(
1555  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1556  std::vector<Value *> &PairableInsts,
1557  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1558  DenseSet<ValuePair> &PairableInstUsers,
1559  DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
1560  DenseSet<VPPair> &PairableInstUserPairSet,
1561  DenseMap<Value *, Value *> &ChosenPairs,
1563  DenseSet<ValuePair> &PrunedDAG, ValuePair J,
1564  bool UseCycleCheck) {
1566  // General depth-first post-order traversal:
1567  Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
1568  do {
1569  ValuePairWithDepth QTop = Q.pop_back_val();
1570  PrunedDAG.insert(QTop.first);
1571 
1572  // Visit each child, pruning as necessary...
1575  ConnectedPairs.find(QTop.first);
1576  if (QQ == ConnectedPairs.end())
1577  continue;
1578 
1579  for (std::vector<ValuePair>::iterator K = QQ->second.begin(),
1580  KE = QQ->second.end(); K != KE; ++K) {
1582  if (C == DAG.end()) continue;
1583 
1584  // This child is in the DAG, now we need to make sure it is the
1585  // best of any conflicting children. There could be multiple
1586  // conflicting children, so first, determine if we're keeping
1587  // this child, then delete conflicting children as necessary.
1588 
1589  // It is also necessary to guard against pairing-induced
1590  // dependencies. Consider instructions a .. x .. y .. b
1591  // such that (a,b) are to be fused and (x,y) are to be fused
1592  // but a is an input to x and b is an output from y. This
1593  // means that y cannot be moved after b but x must be moved
1594  // after b for (a,b) to be fused. In other words, after
1595  // fusing (a,b) we have y .. a/b .. x where y is an input
1596  // to a/b and x is an output to a/b: x and y can no longer
1597  // be legally fused. To prevent this condition, we must
1598  // make sure that a child pair added to the DAG is not
1599  // both an input and output of an already-selected pair.
1600 
1601  // Pairing-induced dependencies can also form from more complicated
1602  // cycles. The pair vs. pair conflicts are easy to check, and so
1603  // that is done explicitly for "fast rejection", and because for
1604  // child vs. child conflicts, we may prefer to keep the current
1605  // pair in preference to the already-selected child.
1606  DenseSet<ValuePair> CurrentPairs;
1607 
1608  bool CanAdd = true;
1610  = BestChildren.begin(), E2 = BestChildren.end();
1611  C2 != E2; ++C2) {
1612  if (C2->first.first == C->first.first ||
1613  C2->first.first == C->first.second ||
1614  C2->first.second == C->first.first ||
1615  C2->first.second == C->first.second ||
1616  pairsConflict(C2->first, C->first, PairableInstUsers,
1617  UseCycleCheck ? &PairableInstUserMap : 0,
1618  UseCycleCheck ? &PairableInstUserPairSet : 0)) {
1619  if (C2->second >= C->second) {
1620  CanAdd = false;
1621  break;
1622  }
1623 
1624  CurrentPairs.insert(C2->first);
1625  }
1626  }
1627  if (!CanAdd) continue;
1628 
1629  // Even worse, this child could conflict with another node already
1630  // selected for the DAG. If that is the case, ignore this child.
1631  for (DenseSet<ValuePair>::iterator T = PrunedDAG.begin(),
1632  E2 = PrunedDAG.end(); T != E2; ++T) {
1633  if (T->first == C->first.first ||
1634  T->first == C->first.second ||
1635  T->second == C->first.first ||
1636  T->second == C->first.second ||
1637  pairsConflict(*T, C->first, PairableInstUsers,
1638  UseCycleCheck ? &PairableInstUserMap : 0,
1639  UseCycleCheck ? &PairableInstUserPairSet : 0)) {
1640  CanAdd = false;
1641  break;
1642  }
1643 
1644  CurrentPairs.insert(*T);
1645  }
1646  if (!CanAdd) continue;
1647 
1648  // And check the queue too...
1650  E2 = Q.end(); C2 != E2; ++C2) {
1651  if (C2->first.first == C->first.first ||
1652  C2->first.first == C->first.second ||
1653  C2->first.second == C->first.first ||
1654  C2->first.second == C->first.second ||
1655  pairsConflict(C2->first, C->first, PairableInstUsers,
1656  UseCycleCheck ? &PairableInstUserMap : 0,
1657  UseCycleCheck ? &PairableInstUserPairSet : 0)) {
1658  CanAdd = false;
1659  break;
1660  }
1661 
1662  CurrentPairs.insert(C2->first);
1663  }
1664  if (!CanAdd) continue;
1665 
1666  // Last but not least, check for a conflict with any of the
1667  // already-chosen pairs.
1669  ChosenPairs.begin(), E2 = ChosenPairs.end();
1670  C2 != E2; ++C2) {
1671  if (pairsConflict(*C2, C->first, PairableInstUsers,
1672  UseCycleCheck ? &PairableInstUserMap : 0,
1673  UseCycleCheck ? &PairableInstUserPairSet : 0)) {
1674  CanAdd = false;
1675  break;
1676  }
1677 
1678  CurrentPairs.insert(*C2);
1679  }
1680  if (!CanAdd) continue;
1681 
1682  // To check for non-trivial cycles formed by the addition of the
1683  // current pair we've formed a list of all relevant pairs, now use a
1684  // graph walk to check for a cycle. We start from the current pair and
1685  // walk the use dag to see if we again reach the current pair. If we
1686  // do, then the current pair is rejected.
1687 
1688  // FIXME: It may be more efficient to use a topological-ordering
1689  // algorithm to improve the cycle check. This should be investigated.
1690  if (UseCycleCheck &&
1691  pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs))
1692  continue;
1693 
1694  // This child can be added, but we may have chosen it in preference
1695  // to an already-selected child. Check for this here, and if a
1696  // conflict is found, then remove the previously-selected child
1697  // before adding this one in its place.
1699  = BestChildren.begin(); C2 != BestChildren.end();) {
1700  if (C2->first.first == C->first.first ||
1701  C2->first.first == C->first.second ||
1702  C2->first.second == C->first.first ||
1703  C2->first.second == C->first.second ||
1704  pairsConflict(C2->first, C->first, PairableInstUsers))
1705  C2 = BestChildren.erase(C2);
1706  else
1707  ++C2;
1708  }
1709 
1710  BestChildren.push_back(ValuePairWithDepth(C->first, C->second));
1711  }
1712 
1714  = BestChildren.begin(), E2 = BestChildren.end();
1715  C != E2; ++C) {
1716  size_t DepthF = getDepthFactor(C->first.first);
1717  Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF));
1718  }
1719  } while (!Q.empty());
1720  }
1721 
1722  // This function finds the best dag of mututally-compatible connected
1723  // pairs, given the choice of root pairs as an iterator range.
1724  void BBVectorize::findBestDAGFor(
1725  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1726  DenseSet<ValuePair> &CandidatePairsSet,
1727  DenseMap<ValuePair, int> &CandidatePairCostSavings,
1728  std::vector<Value *> &PairableInsts,
1729  DenseSet<ValuePair> &FixedOrderPairs,
1730  DenseMap<VPPair, unsigned> &PairConnectionTypes,
1731  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1732  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
1733  DenseSet<ValuePair> &PairableInstUsers,
1734  DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
1735  DenseSet<VPPair> &PairableInstUserPairSet,
1736  DenseMap<Value *, Value *> &ChosenPairs,
1737  DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
1738  int &BestEffSize, Value *II, std::vector<Value *>&JJ,
1739  bool UseCycleCheck) {
1740  for (std::vector<Value *>::iterator J = JJ.begin(), JE = JJ.end();
1741  J != JE; ++J) {
1742  ValuePair IJ(II, *J);
1743  if (!CandidatePairsSet.count(IJ))
1744  continue;
1745 
1746  // Before going any further, make sure that this pair does not
1747  // conflict with any already-selected pairs (see comment below
1748  // near the DAG pruning for more details).
1749  DenseSet<ValuePair> ChosenPairSet;
1750  bool DoesConflict = false;
1751  for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(),
1752  E = ChosenPairs.end(); C != E; ++C) {
1753  if (pairsConflict(*C, IJ, PairableInstUsers,
1754  UseCycleCheck ? &PairableInstUserMap : 0,
1755  UseCycleCheck ? &PairableInstUserPairSet : 0)) {
1756  DoesConflict = true;
1757  break;
1758  }
1759 
1760  ChosenPairSet.insert(*C);
1761  }
1762  if (DoesConflict) continue;
1763 
1764  if (UseCycleCheck &&
1765  pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet))
1766  continue;
1767 
1769  buildInitialDAGFor(CandidatePairs, CandidatePairsSet,
1770  PairableInsts, ConnectedPairs,
1771  PairableInstUsers, ChosenPairs, DAG, IJ);
1772 
1773  // Because we'll keep the child with the largest depth, the largest
1774  // depth is still the same in the unpruned DAG.
1775  size_t MaxDepth = DAG.lookup(IJ);
1776 
1777  DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {"
1778  << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
1779  MaxDepth << " and size " << DAG.size() << "\n");
1780 
1781  // At this point the DAG has been constructed, but, may contain
1782  // contradictory children (meaning that different children of
1783  // some dag node may be attempting to fuse the same instruction).
1784  // So now we walk the dag again, in the case of a conflict,
1785  // keep only the child with the largest depth. To break a tie,
1786  // favor the first child.
1787 
1788  DenseSet<ValuePair> PrunedDAG;
1789  pruneDAGFor(CandidatePairs, PairableInsts, ConnectedPairs,
1790  PairableInstUsers, PairableInstUserMap,
1791  PairableInstUserPairSet,
1792  ChosenPairs, DAG, PrunedDAG, IJ, UseCycleCheck);
1793 
1794  int EffSize = 0;
1795  if (TTI) {
1796  DenseSet<Value *> PrunedDAGInstrs;
1797  for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
1798  E = PrunedDAG.end(); S != E; ++S) {
1799  PrunedDAGInstrs.insert(S->first);
1800  PrunedDAGInstrs.insert(S->second);
1801  }
1802 
1803  // The set of pairs that have already contributed to the total cost.
1804  DenseSet<ValuePair> IncomingPairs;
1805 
1806  // If the cost model were perfect, this might not be necessary; but we
1807  // need to make sure that we don't get stuck vectorizing our own
1808  // shuffle chains.
1809  bool HasNontrivialInsts = false;
1810 
1811  // The node weights represent the cost savings associated with
1812  // fusing the pair of instructions.
1813  for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
1814  E = PrunedDAG.end(); S != E; ++S) {
1815  if (!isa<ShuffleVectorInst>(S->first) &&
1816  !isa<InsertElementInst>(S->first) &&
1817  !isa<ExtractElementInst>(S->first))
1818  HasNontrivialInsts = true;
1819 
1820  bool FlipOrder = false;
1821 
1822  if (getDepthFactor(S->first)) {
1823  int ESContrib = CandidatePairCostSavings.find(*S)->second;
1824  DEBUG(if (DebugPairSelection) dbgs() << "\tweight {"
1825  << *S->first << " <-> " << *S->second << "} = " <<
1826  ESContrib << "\n");
1827  EffSize += ESContrib;
1828  }
1829 
1830  // The edge weights contribute in a negative sense: they represent
1831  // the cost of shuffles.
1833  ConnectedPairDeps.find(*S);
1834  if (SS != ConnectedPairDeps.end()) {
1835  unsigned NumDepsDirect = 0, NumDepsSwap = 0;
1836  for (std::vector<ValuePair>::iterator T = SS->second.begin(),
1837  TE = SS->second.end(); T != TE; ++T) {
1838  VPPair Q(*S, *T);
1839  if (!PrunedDAG.count(Q.second))
1840  continue;
1842  PairConnectionTypes.find(VPPair(Q.second, Q.first));
1843  assert(R != PairConnectionTypes.end() &&
1844  "Cannot find pair connection type");
1845  if (R->second == PairConnectionDirect)
1846  ++NumDepsDirect;
1847  else if (R->second == PairConnectionSwap)
1848  ++NumDepsSwap;
1849  }
1850 
1851  // If there are more swaps than direct connections, then
1852  // the pair order will be flipped during fusion. So the real
1853  // number of swaps is the minimum number.
1854  FlipOrder = !FixedOrderPairs.count(*S) &&
1855  ((NumDepsSwap > NumDepsDirect) ||
1856  FixedOrderPairs.count(ValuePair(S->second, S->first)));
1857 
1858  for (std::vector<ValuePair>::iterator T = SS->second.begin(),
1859  TE = SS->second.end(); T != TE; ++T) {
1860  VPPair Q(*S, *T);
1861  if (!PrunedDAG.count(Q.second))
1862  continue;
1864  PairConnectionTypes.find(VPPair(Q.second, Q.first));
1865  assert(R != PairConnectionTypes.end() &&
1866  "Cannot find pair connection type");
1867  Type *Ty1 = Q.second.first->getType(),
1868  *Ty2 = Q.second.second->getType();
1869  Type *VTy = getVecTypeForPair(Ty1, Ty2);
1870  if ((R->second == PairConnectionDirect && FlipOrder) ||
1871  (R->second == PairConnectionSwap && !FlipOrder) ||
1872  R->second == PairConnectionSplat) {
1873  int ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
1874  VTy, VTy);
1875 
1876  if (VTy->getVectorNumElements() == 2) {
1877  if (R->second == PairConnectionSplat)
1878  ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
1880  else
1881  ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
1883  }
1884 
1885  DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
1886  *Q.second.first << " <-> " << *Q.second.second <<
1887  "} -> {" <<
1888  *S->first << " <-> " << *S->second << "} = " <<
1889  ESContrib << "\n");
1890  EffSize -= ESContrib;
1891  }
1892  }
1893  }
1894 
1895  // Compute the cost of outgoing edges. We assume that edges outgoing
1896  // to shuffles, inserts or extracts can be merged, and so contribute
1897  // no additional cost.
1898  if (!S->first->getType()->isVoidTy()) {
1899  Type *Ty1 = S->first->getType(),
1900  *Ty2 = S->second->getType();
1901  Type *VTy = getVecTypeForPair(Ty1, Ty2);
1902 
1903  bool NeedsExtraction = false;
1904  for (Value::use_iterator I = S->first->use_begin(),
1905  IE = S->first->use_end(); I != IE; ++I) {
1906  if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) {
1907  // Shuffle can be folded if it has no other input
1908  if (isa<UndefValue>(SI->getOperand(1)))
1909  continue;
1910  }
1911  if (isa<ExtractElementInst>(*I))
1912  continue;
1913  if (PrunedDAGInstrs.count(*I))
1914  continue;
1915  NeedsExtraction = true;
1916  break;
1917  }
1918 
1919  if (NeedsExtraction) {
1920  int ESContrib;
1921  if (Ty1->isVectorTy()) {
1922  ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
1923  Ty1, VTy);
1924  ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
1926  } else
1927  ESContrib = (int) TTI->getVectorInstrCost(
1928  Instruction::ExtractElement, VTy, 0);
1929 
1930  DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
1931  *S->first << "} = " << ESContrib << "\n");
1932  EffSize -= ESContrib;
1933  }
1934 
1935  NeedsExtraction = false;
1936  for (Value::use_iterator I = S->second->use_begin(),
1937  IE = S->second->use_end(); I != IE; ++I) {
1938  if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) {
1939  // Shuffle can be folded if it has no other input
1940  if (isa<UndefValue>(SI->getOperand(1)))
1941  continue;
1942  }
1943  if (isa<ExtractElementInst>(*I))
1944  continue;
1945  if (PrunedDAGInstrs.count(*I))
1946  continue;
1947  NeedsExtraction = true;
1948  break;
1949  }
1950 
1951  if (NeedsExtraction) {
1952  int ESContrib;
1953  if (Ty2->isVectorTy()) {
1954  ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
1955  Ty2, VTy);
1956  ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
1958  Ty1->isVectorTy() ? Ty1->getVectorNumElements() : 1, Ty2));
1959  } else
1960  ESContrib = (int) TTI->getVectorInstrCost(
1961  Instruction::ExtractElement, VTy, 1);
1962  DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
1963  *S->second << "} = " << ESContrib << "\n");
1964  EffSize -= ESContrib;
1965  }
1966  }
1967 
1968  // Compute the cost of incoming edges.
1969  if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) {
1970  Instruction *S1 = cast<Instruction>(S->first),
1971  *S2 = cast<Instruction>(S->second);
1972  for (unsigned o = 0; o < S1->getNumOperands(); ++o) {
1973  Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o);
1974 
1975  // Combining constants into vector constants (or small vector
1976  // constants into larger ones are assumed free).
1977  if (isa<Constant>(O1) && isa<Constant>(O2))
1978  continue;
1979 
1980  if (FlipOrder)
1981  std::swap(O1, O2);
1982 
1983  ValuePair VP = ValuePair(O1, O2);
1984  ValuePair VPR = ValuePair(O2, O1);
1985 
1986  // Internal edges are not handled here.
1987  if (PrunedDAG.count(VP) || PrunedDAG.count(VPR))
1988  continue;
1989 
1990  Type *Ty1 = O1->getType(),
1991  *Ty2 = O2->getType();
1992  Type *VTy = getVecTypeForPair(Ty1, Ty2);
1993 
1994  // Combining vector operations of the same type is also assumed
1995  // folded with other operations.
1996  if (Ty1 == Ty2) {
1997  // If both are insert elements, then both can be widened.
1999  *IEO2 = dyn_cast<InsertElementInst>(O2);
2000  if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2))
2001  continue;
2002  // If both are extract elements, and both have the same input
2003  // type, then they can be replaced with a shuffle
2005  *EIO2 = dyn_cast<ExtractElementInst>(O2);
2006  if (EIO1 && EIO2 &&
2007  EIO1->getOperand(0)->getType() ==
2008  EIO2->getOperand(0)->getType())
2009  continue;
2010  // If both are a shuffle with equal operand types and only two
2011  // unqiue operands, then they can be replaced with a single
2012  // shuffle
2014  *SIO2 = dyn_cast<ShuffleVectorInst>(O2);
2015  if (SIO1 && SIO2 &&
2016  SIO1->getOperand(0)->getType() ==
2017  SIO2->getOperand(0)->getType()) {
2018  SmallSet<Value *, 4> SIOps;
2019  SIOps.insert(SIO1->getOperand(0));
2020  SIOps.insert(SIO1->getOperand(1));
2021  SIOps.insert(SIO2->getOperand(0));
2022  SIOps.insert(SIO2->getOperand(1));
2023  if (SIOps.size() <= 2)
2024  continue;
2025  }
2026  }
2027 
2028  int ESContrib;
2029  // This pair has already been formed.
2030  if (IncomingPairs.count(VP)) {
2031  continue;
2032  } else if (IncomingPairs.count(VPR)) {
2033  ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
2034  VTy, VTy);
2035 
2036  if (VTy->getVectorNumElements() == 2)
2037  ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
2039  } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) {
2040  ESContrib = (int) TTI->getVectorInstrCost(
2041  Instruction::InsertElement, VTy, 0);
2042  ESContrib += (int) TTI->getVectorInstrCost(
2043  Instruction::InsertElement, VTy, 1);
2044  } else if (!Ty1->isVectorTy()) {
2045  // O1 needs to be inserted into a vector of size O2, and then
2046  // both need to be shuffled together.
2047  ESContrib = (int) TTI->getVectorInstrCost(
2048  Instruction::InsertElement, Ty2, 0);
2049  ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
2050  VTy, Ty2);
2051  } else if (!Ty2->isVectorTy()) {
2052  // O2 needs to be inserted into a vector of size O1, and then
2053  // both need to be shuffled together.
2054  ESContrib = (int) TTI->getVectorInstrCost(
2055  Instruction::InsertElement, Ty1, 0);
2056  ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
2057  VTy, Ty1);
2058  } else {
2059  Type *TyBig = Ty1, *TySmall = Ty2;
2060  if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements())
2061  std::swap(TyBig, TySmall);
2062 
2063  ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
2064  VTy, TyBig);
2065  if (TyBig != TySmall)
2066  ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
2067  TyBig, TySmall);
2068  }
2069 
2070  DEBUG(if (DebugPairSelection) dbgs() << "\tcost {"
2071  << *O1 << " <-> " << *O2 << "} = " <<
2072  ESContrib << "\n");
2073  EffSize -= ESContrib;
2074  IncomingPairs.insert(VP);
2075  }
2076  }
2077  }
2078 
2079  if (!HasNontrivialInsts) {
2080  DEBUG(if (DebugPairSelection) dbgs() <<
2081  "\tNo non-trivial instructions in DAG;"
2082  " override to zero effective size\n");
2083  EffSize = 0;
2084  }
2085  } else {
2086  for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
2087  E = PrunedDAG.end(); S != E; ++S)
2088  EffSize += (int) getDepthFactor(S->first);
2089  }
2090 
2092  dbgs() << "BBV: found pruned DAG for pair {"
2093  << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
2094  MaxDepth << " and size " << PrunedDAG.size() <<
2095  " (effective size: " << EffSize << ")\n");
2096  if (((TTI && !UseChainDepthWithTI) ||
2097  MaxDepth >= Config.ReqChainDepth) &&
2098  EffSize > 0 && EffSize > BestEffSize) {
2099  BestMaxDepth = MaxDepth;
2100  BestEffSize = EffSize;
2101  BestDAG = PrunedDAG;
2102  }
2103  }
2104  }
2105 
2106  // Given the list of candidate pairs, this function selects those
2107  // that will be fused into vector instructions.
2108  void BBVectorize::choosePairs(
2109  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
2110  DenseSet<ValuePair> &CandidatePairsSet,
2111  DenseMap<ValuePair, int> &CandidatePairCostSavings,
2112  std::vector<Value *> &PairableInsts,
2113  DenseSet<ValuePair> &FixedOrderPairs,
2114  DenseMap<VPPair, unsigned> &PairConnectionTypes,
2115  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
2116  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
2117  DenseSet<ValuePair> &PairableInstUsers,
2118  DenseMap<Value *, Value *>& ChosenPairs) {
2119  bool UseCycleCheck =
2120  CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck;
2121 
2122  DenseMap<Value *, std::vector<Value *> > CandidatePairs2;
2123  for (DenseSet<ValuePair>::iterator I = CandidatePairsSet.begin(),
2124  E = CandidatePairsSet.end(); I != E; ++I) {
2125  std::vector<Value *> &JJ = CandidatePairs2[I->second];
2126  if (JJ.empty()) JJ.reserve(32);
2127  JJ.push_back(I->first);
2128  }
2129 
2130  DenseMap<ValuePair, std::vector<ValuePair> > PairableInstUserMap;
2131  DenseSet<VPPair> PairableInstUserPairSet;
2132  for (std::vector<Value *>::iterator I = PairableInsts.begin(),
2133  E = PairableInsts.end(); I != E; ++I) {
2134  // The number of possible pairings for this variable:
2135  size_t NumChoices = CandidatePairs.lookup(*I).size();
2136  if (!NumChoices) continue;
2137 
2138  std::vector<Value *> &JJ = CandidatePairs[*I];
2139 
2140  // The best pair to choose and its dag:
2141  size_t BestMaxDepth = 0;
2142  int BestEffSize = 0;
2143  DenseSet<ValuePair> BestDAG;
2144  findBestDAGFor(CandidatePairs, CandidatePairsSet,
2145  CandidatePairCostSavings,
2146  PairableInsts, FixedOrderPairs, PairConnectionTypes,
2147  ConnectedPairs, ConnectedPairDeps,
2148  PairableInstUsers, PairableInstUserMap,
2149  PairableInstUserPairSet, ChosenPairs,
2150  BestDAG, BestMaxDepth, BestEffSize, *I, JJ,
2151  UseCycleCheck);
2152 
2153  if (BestDAG.empty())
2154  continue;
2155 
2156  // A dag has been chosen (or not) at this point. If no dag was
2157  // chosen, then this instruction, I, cannot be paired (and is no longer
2158  // considered).
2159 
2160  DEBUG(dbgs() << "BBV: selected pairs in the best DAG for: "
2161  << *cast<Instruction>(*I) << "\n");
2162 
2163  for (DenseSet<ValuePair>::iterator S = BestDAG.begin(),
2164  SE2 = BestDAG.end(); S != SE2; ++S) {
2165  // Insert the members of this dag into the list of chosen pairs.
2166  ChosenPairs.insert(ValuePair(S->first, S->second));
2167  DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " <<
2168  *S->second << "\n");
2169 
2170  // Remove all candidate pairs that have values in the chosen dag.
2171  std::vector<Value *> &KK = CandidatePairs[S->first];
2172  for (std::vector<Value *>::iterator K = KK.begin(), KE = KK.end();
2173  K != KE; ++K) {
2174  if (*K == S->second)
2175  continue;
2176 
2177  CandidatePairsSet.erase(ValuePair(S->first, *K));
2178  }
2179 
2180  std::vector<Value *> &LL = CandidatePairs2[S->second];
2181  for (std::vector<Value *>::iterator L = LL.begin(), LE = LL.end();
2182  L != LE; ++L) {
2183  if (*L == S->first)
2184  continue;
2185 
2186  CandidatePairsSet.erase(ValuePair(*L, S->second));
2187  }
2188 
2189  std::vector<Value *> &MM = CandidatePairs[S->second];
2190  for (std::vector<Value *>::iterator M = MM.begin(), ME = MM.end();
2191  M != ME; ++M) {
2192  assert(*M != S->first && "Flipped pair in candidate list?");
2193  CandidatePairsSet.erase(ValuePair(S->second, *M));
2194  }
2195 
2196  std::vector<Value *> &NN = CandidatePairs2[S->first];
2197  for (std::vector<Value *>::iterator N = NN.begin(), NE = NN.end();
2198  N != NE; ++N) {
2199  assert(*N != S->second && "Flipped pair in candidate list?");
2200  CandidatePairsSet.erase(ValuePair(*N, S->first));
2201  }
2202  }
2203  }
2204 
2205  DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n");
2206  }
2207 
2208  std::string getReplacementName(Instruction *I, bool IsInput, unsigned o,
2209  unsigned n = 0) {
2210  if (!I->hasName())
2211  return "";
2212 
2213  return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) +
2214  (n > 0 ? "." + utostr(n) : "")).str();
2215  }
2216 
2217  // Returns the value that is to be used as the pointer input to the vector
2218  // instruction that fuses I with J.
2219  Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context,
2220  Instruction *I, Instruction *J, unsigned o) {
2221  Value *IPtr, *JPtr;
2222  unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
2223  int64_t OffsetInElmts;
2224 
2225  // Note: the analysis might fail here, that is why the pair order has
2226  // been precomputed (OffsetInElmts must be unused here).
2227  (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
2228  IAddressSpace, JAddressSpace,
2229  OffsetInElmts, false);
2230 
2231  // The pointer value is taken to be the one with the lowest offset.
2232  Value *VPtr = IPtr;
2233 
2234  Type *ArgTypeI = IPtr->getType()->getPointerElementType();
2235  Type *ArgTypeJ = JPtr->getType()->getPointerElementType();
2236  Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2237  Type *VArgPtrType
2238  = PointerType::get(VArgType,
2239  IPtr->getType()->getPointerAddressSpace());
2240  return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
2241  /* insert before */ I);
2242  }
2243 
2244  void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J,
2245  unsigned MaskOffset, unsigned NumInElem,
2246  unsigned NumInElem1, unsigned IdxOffset,
2247  std::vector<Constant*> &Mask) {
2248  unsigned NumElem1 = J->getType()->getVectorNumElements();
2249  for (unsigned v = 0; v < NumElem1; ++v) {
2250  int m = cast<ShuffleVectorInst>(J)->getMaskValue(v);
2251  if (m < 0) {
2252  Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context));
2253  } else {
2254  unsigned mm = m + (int) IdxOffset;
2255  if (m >= (int) NumInElem1)
2256  mm += (int) NumInElem;
2257 
2258  Mask[v+MaskOffset] =
2259  ConstantInt::get(Type::getInt32Ty(Context), mm);
2260  }
2261  }
2262  }
2263 
2264  // Returns the value that is to be used as the vector-shuffle mask to the
2265  // vector instruction that fuses I with J.
2266  Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context,
2267  Instruction *I, Instruction *J) {
2268  // This is the shuffle mask. We need to append the second
2269  // mask to the first, and the numbers need to be adjusted.
2270 
2271  Type *ArgTypeI = I->getType();
2272  Type *ArgTypeJ = J->getType();
2273  Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2274 
2275  unsigned NumElemI = ArgTypeI->getVectorNumElements();
2276 
2277  // Get the total number of elements in the fused vector type.
2278  // By definition, this must equal the number of elements in
2279  // the final mask.
2280  unsigned NumElem = VArgType->getVectorNumElements();
2281  std::vector<Constant*> Mask(NumElem);
2282 
2283  Type *OpTypeI = I->getOperand(0)->getType();
2284  unsigned NumInElemI = OpTypeI->getVectorNumElements();
2285  Type *OpTypeJ = J->getOperand(0)->getType();
2286  unsigned NumInElemJ = OpTypeJ->getVectorNumElements();
2287 
2288  // The fused vector will be:
2289  // -----------------------------------------------------
2290  // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ |
2291  // -----------------------------------------------------
2292  // from which we'll extract NumElem total elements (where the first NumElemI
2293  // of them come from the mask in I and the remainder come from the mask
2294  // in J.
2295 
2296  // For the mask from the first pair...
2297  fillNewShuffleMask(Context, I, 0, NumInElemJ, NumInElemI,
2298  0, Mask);
2299 
2300  // For the mask from the second pair...
2301  fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ,
2302  NumInElemI, Mask);
2303 
2304  return ConstantVector::get(Mask);
2305  }
2306 
2307  bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I,
2308  Instruction *J, unsigned o, Value *&LOp,
2309  unsigned numElemL,
2310  Type *ArgTypeL, Type *ArgTypeH,
2311  bool IBeforeJ, unsigned IdxOff) {
2312  bool ExpandedIEChain = false;
2313  if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) {
2314  // If we have a pure insertelement chain, then this can be rewritten
2315  // into a chain that directly builds the larger type.
2316  if (isPureIEChain(LIE)) {
2317  SmallVector<Value *, 8> VectElemts(numElemL,
2318  UndefValue::get(ArgTypeL->getScalarType()));
2319  InsertElementInst *LIENext = LIE;
2320  do {
2321  unsigned Idx =
2322  cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue();
2323  VectElemts[Idx] = LIENext->getOperand(1);
2324  } while ((LIENext =
2325  dyn_cast<InsertElementInst>(LIENext->getOperand(0))));
2326 
2327  LIENext = 0;
2328  Value *LIEPrev = UndefValue::get(ArgTypeH);
2329  for (unsigned i = 0; i < numElemL; ++i) {
2330  if (isa<UndefValue>(VectElemts[i])) continue;
2331  LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i],
2333  i + IdxOff),
2334  getReplacementName(IBeforeJ ? I : J,
2335  true, o, i+1));
2336  LIENext->insertBefore(IBeforeJ ? J : I);
2337  LIEPrev = LIENext;
2338  }
2339 
2340  LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH);
2341  ExpandedIEChain = true;
2342  }
2343  }
2344 
2345  return ExpandedIEChain;
2346  }
2347 
2348  static unsigned getNumScalarElements(Type *Ty) {
2349  if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
2350  return VecTy->getNumElements();
2351  return 1;
2352  }
2353 
2354  // Returns the value to be used as the specified operand of the vector
2355  // instruction that fuses I with J.
2356  Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
2357  Instruction *J, unsigned o, bool IBeforeJ) {
2358  Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
2359  Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
2360 
2361  // Compute the fused vector type for this operand
2362  Type *ArgTypeI = I->getOperand(o)->getType();
2363  Type *ArgTypeJ = J->getOperand(o)->getType();
2364  VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2365 
2366  Instruction *L = I, *H = J;
2367  Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ;
2368 
2369  unsigned numElemL = getNumScalarElements(ArgTypeL);
2370  unsigned numElemH = getNumScalarElements(ArgTypeH);
2371 
2372  Value *LOp = L->getOperand(o);
2373  Value *HOp = H->getOperand(o);
2374  unsigned numElem = VArgType->getNumElements();
2375 
2376  // First, we check if we can reuse the "original" vector outputs (if these
2377  // exist). We might need a shuffle.
2382 
2383  // FIXME: If we're fusing shuffle instructions, then we can't apply this
2384  // optimization. The input vectors to the shuffle might be a different
2385  // length from the shuffle outputs. Unfortunately, the replacement
2386  // shuffle mask has already been formed, and the mask entries are sensitive
2387  // to the sizes of the inputs.
2388  bool IsSizeChangeShuffle =
2389  isa<ShuffleVectorInst>(L) &&
2390  (LOp->getType() != L->getType() || HOp->getType() != H->getType());
2391 
2392  if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) {
2393  // We can have at most two unique vector inputs.
2394  bool CanUseInputs = true;
2395  Value *I1, *I2 = 0;
2396  if (LEE) {
2397  I1 = LEE->getOperand(0);
2398  } else {
2399  I1 = LSV->getOperand(0);
2400  I2 = LSV->getOperand(1);
2401  if (I2 == I1 || isa<UndefValue>(I2))
2402  I2 = 0;
2403  }
2404 
2405  if (HEE) {
2406  Value *I3 = HEE->getOperand(0);
2407  if (!I2 && I3 != I1)
2408  I2 = I3;
2409  else if (I3 != I1 && I3 != I2)
2410  CanUseInputs = false;
2411  } else {
2412  Value *I3 = HSV->getOperand(0);
2413  if (!I2 && I3 != I1)
2414  I2 = I3;
2415  else if (I3 != I1 && I3 != I2)
2416  CanUseInputs = false;
2417 
2418  if (CanUseInputs) {
2419  Value *I4 = HSV->getOperand(1);
2420  if (!isa<UndefValue>(I4)) {
2421  if (!I2 && I4 != I1)
2422  I2 = I4;
2423  else if (I4 != I1 && I4 != I2)
2424  CanUseInputs = false;
2425  }
2426  }
2427  }
2428 
2429  if (CanUseInputs) {
2430  unsigned LOpElem =
2431  cast<Instruction>(LOp)->getOperand(0)->getType()
2432  ->getVectorNumElements();
2433 
2434  unsigned HOpElem =
2435  cast<Instruction>(HOp)->getOperand(0)->getType()
2436  ->getVectorNumElements();
2437 
2438  // We have one or two input vectors. We need to map each index of the
2439  // operands to the index of the original vector.
2440  SmallVector<std::pair<int, int>, 8> II(numElem);
2441  for (unsigned i = 0; i < numElemL; ++i) {
2442  int Idx, INum;
2443  if (LEE) {
2444  Idx =
2445  cast<ConstantInt>(LEE->getOperand(1))->getSExtValue();
2446  INum = LEE->getOperand(0) == I1 ? 0 : 1;
2447  } else {
2448  Idx = LSV->getMaskValue(i);
2449  if (Idx < (int) LOpElem) {
2450  INum = LSV->getOperand(0) == I1 ? 0 : 1;
2451  } else {
2452  Idx -= LOpElem;
2453  INum = LSV->getOperand(1) == I1 ? 0 : 1;
2454  }
2455  }
2456 
2457  II[i] = std::pair<int, int>(Idx, INum);
2458  }
2459  for (unsigned i = 0; i < numElemH; ++i) {
2460  int Idx, INum;
2461  if (HEE) {
2462  Idx =
2463  cast<ConstantInt>(HEE->getOperand(1))->getSExtValue();
2464  INum = HEE->getOperand(0) == I1 ? 0 : 1;
2465  } else {
2466  Idx = HSV->getMaskValue(i);
2467  if (Idx < (int) HOpElem) {
2468  INum = HSV->getOperand(0) == I1 ? 0 : 1;
2469  } else {
2470  Idx -= HOpElem;
2471  INum = HSV->getOperand(1) == I1 ? 0 : 1;
2472  }
2473  }
2474 
2475  II[i + numElemL] = std::pair<int, int>(Idx, INum);
2476  }
2477 
2478  // We now have an array which tells us from which index of which
2479  // input vector each element of the operand comes.
2480  VectorType *I1T = cast<VectorType>(I1->getType());
2481  unsigned I1Elem = I1T->getNumElements();
2482 
2483  if (!I2) {
2484  // In this case there is only one underlying vector input. Check for
2485  // the trivial case where we can use the input directly.
2486  if (I1Elem == numElem) {
2487  bool ElemInOrder = true;
2488  for (unsigned i = 0; i < numElem; ++i) {
2489  if (II[i].first != (int) i && II[i].first != -1) {
2490  ElemInOrder = false;
2491  break;
2492  }
2493  }
2494 
2495  if (ElemInOrder)
2496  return I1;
2497  }
2498 
2499  // A shuffle is needed.
2500  std::vector<Constant *> Mask(numElem);
2501  for (unsigned i = 0; i < numElem; ++i) {
2502  int Idx = II[i].first;
2503  if (Idx == -1)
2504  Mask[i] = UndefValue::get(Type::getInt32Ty(Context));
2505  else
2506  Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
2507  }
2508 
2509  Instruction *S =
2510  new ShuffleVectorInst(I1, UndefValue::get(I1T),
2511  ConstantVector::get(Mask),
2512  getReplacementName(IBeforeJ ? I : J,
2513  true, o));
2514  S->insertBefore(IBeforeJ ? J : I);
2515  return S;
2516  }
2517 
2518  VectorType *I2T = cast<VectorType>(I2->getType());
2519  unsigned I2Elem = I2T->getNumElements();
2520 
2521  // This input comes from two distinct vectors. The first step is to
2522  // make sure that both vectors are the same length. If not, the
2523  // smaller one will need to grow before they can be shuffled together.
2524  if (I1Elem < I2Elem) {
2525  std::vector<Constant *> Mask(I2Elem);
2526  unsigned v = 0;
2527  for (; v < I1Elem; ++v)
2528  Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2529  for (; v < I2Elem; ++v)
2530  Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2531 
2532  Instruction *NewI1 =
2533  new ShuffleVectorInst(I1, UndefValue::get(I1T),
2534  ConstantVector::get(Mask),
2535  getReplacementName(IBeforeJ ? I : J,
2536  true, o, 1));
2537  NewI1->insertBefore(IBeforeJ ? J : I);
2538  I1 = NewI1;
2539  I1T = I2T;
2540  I1Elem = I2Elem;
2541  } else if (I1Elem > I2Elem) {
2542  std::vector<Constant *> Mask(I1Elem);
2543  unsigned v = 0;
2544  for (; v < I2Elem; ++v)
2545  Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2546  for (; v < I1Elem; ++v)
2547  Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2548 
2549  Instruction *NewI2 =
2550  new ShuffleVectorInst(I2, UndefValue::get(I2T),
2551  ConstantVector::get(Mask),
2552  getReplacementName(IBeforeJ ? I : J,
2553  true, o, 1));
2554  NewI2->insertBefore(IBeforeJ ? J : I);
2555  I2 = NewI2;
2556  I2T = I1T;
2557  I2Elem = I1Elem;
2558  }
2559 
2560  // Now that both I1 and I2 are the same length we can shuffle them
2561  // together (and use the result).
2562  std::vector<Constant *> Mask(numElem);
2563  for (unsigned v = 0; v < numElem; ++v) {
2564  if (II[v].first == -1) {
2565  Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2566  } else {
2567  int Idx = II[v].first + II[v].second * I1Elem;
2568  Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
2569  }
2570  }
2571 
2572  Instruction *NewOp =
2573  new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask),
2574  getReplacementName(IBeforeJ ? I : J, true, o));
2575  NewOp->insertBefore(IBeforeJ ? J : I);
2576  return NewOp;
2577  }
2578  }
2579 
2580  Type *ArgType = ArgTypeL;
2581  if (numElemL < numElemH) {
2582  if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH,
2583  ArgTypeL, VArgType, IBeforeJ, 1)) {
2584  // This is another short-circuit case: we're combining a scalar into
2585  // a vector that is formed by an IE chain. We've just expanded the IE
2586  // chain, now insert the scalar and we're done.
2587 
2588  Instruction *S = InsertElementInst::Create(HOp, LOp, CV0,
2589  getReplacementName(IBeforeJ ? I : J, true, o));
2590  S->insertBefore(IBeforeJ ? J : I);
2591  return S;
2592  } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL,
2593  ArgTypeH, IBeforeJ)) {
2594  // The two vector inputs to the shuffle must be the same length,
2595  // so extend the smaller vector to be the same length as the larger one.
2596  Instruction *NLOp;
2597  if (numElemL > 1) {
2598 
2599  std::vector<Constant *> Mask(numElemH);
2600  unsigned v = 0;
2601  for (; v < numElemL; ++v)
2602  Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2603  for (; v < numElemH; ++v)
2604  Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2605 
2606  NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL),
2607  ConstantVector::get(Mask),
2608  getReplacementName(IBeforeJ ? I : J,
2609  true, o, 1));
2610  } else {
2611  NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0,
2612  getReplacementName(IBeforeJ ? I : J,
2613  true, o, 1));
2614  }
2615 
2616  NLOp->insertBefore(IBeforeJ ? J : I);
2617  LOp = NLOp;
2618  }
2619 
2620  ArgType = ArgTypeH;
2621  } else if (numElemL > numElemH) {
2622  if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL,
2623  ArgTypeH, VArgType, IBeforeJ)) {
2624  Instruction *S =
2625  InsertElementInst::Create(LOp, HOp,
2627  numElemL),
2628  getReplacementName(IBeforeJ ? I : J,
2629  true, o));
2630  S->insertBefore(IBeforeJ ? J : I);
2631  return S;
2632  } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH,
2633  ArgTypeL, IBeforeJ)) {
2634  Instruction *NHOp;
2635  if (numElemH > 1) {
2636  std::vector<Constant *> Mask(numElemL);
2637  unsigned v = 0;
2638  for (; v < numElemH; ++v)
2639  Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2640  for (; v < numElemL; ++v)
2641  Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2642 
2643  NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH),
2644  ConstantVector::get(Mask),
2645  getReplacementName(IBeforeJ ? I : J,
2646  true, o, 1));
2647  } else {
2648  NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0,
2649  getReplacementName(IBeforeJ ? I : J,
2650  true, o, 1));
2651  }
2652 
2653  NHOp->insertBefore(IBeforeJ ? J : I);
2654  HOp = NHOp;
2655  }
2656  }
2657 
2658  if (ArgType->isVectorTy()) {
2659  unsigned numElem = VArgType->getVectorNumElements();
2660  std::vector<Constant*> Mask(numElem);
2661  for (unsigned v = 0; v < numElem; ++v) {
2662  unsigned Idx = v;
2663  // If the low vector was expanded, we need to skip the extra
2664  // undefined entries.
2665  if (v >= numElemL && numElemH > numElemL)
2666  Idx += (numElemH - numElemL);
2667  Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
2668  }
2669 
2670  Instruction *BV = new ShuffleVectorInst(LOp, HOp,
2671  ConstantVector::get(Mask),
2672  getReplacementName(IBeforeJ ? I : J, true, o));
2673  BV->insertBefore(IBeforeJ ? J : I);
2674  return BV;
2675  }
2676 
2678  UndefValue::get(VArgType), LOp, CV0,
2679  getReplacementName(IBeforeJ ? I : J,
2680  true, o, 1));
2681  BV1->insertBefore(IBeforeJ ? J : I);
2682  Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1,
2683  getReplacementName(IBeforeJ ? I : J,
2684  true, o, 2));
2685  BV2->insertBefore(IBeforeJ ? J : I);
2686  return BV2;
2687  }
2688 
2689  // This function creates an array of values that will be used as the inputs
2690  // to the vector instruction that fuses I with J.
2691  void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
2692  Instruction *I, Instruction *J,
2693  SmallVectorImpl<Value *> &ReplacedOperands,
2694  bool IBeforeJ) {
2695  unsigned NumOperands = I->getNumOperands();
2696 
2697  for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) {
2698  // Iterate backward so that we look at the store pointer
2699  // first and know whether or not we need to flip the inputs.
2700 
2701  if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) {
2702  // This is the pointer for a load/store instruction.
2703  ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o);
2704  continue;
2705  } else if (isa<CallInst>(I)) {
2706  Function *F = cast<CallInst>(I)->getCalledFunction();
2708  if (o == NumOperands-1) {
2709  BasicBlock &BB = *I->getParent();
2710 
2711  Module *M = BB.getParent()->getParent();
2712  Type *ArgTypeI = I->getType();
2713  Type *ArgTypeJ = J->getType();
2714  Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2715 
2716  ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType);
2717  continue;
2718  } else if (IID == Intrinsic::powi && o == 1) {
2719  // The second argument of powi is a single integer and we've already
2720  // checked that both arguments are equal. As a result, we just keep
2721  // I's second argument.
2722  ReplacedOperands[o] = I->getOperand(o);
2723  continue;
2724  }
2725  } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) {
2726  ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J);
2727  continue;
2728  }
2729 
2730  ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ);
2731  }
2732  }
2733 
2734  // This function creates two values that represent the outputs of the
2735  // original I and J instructions. These are generally vector shuffles
2736  // or extracts. In many cases, these will end up being unused and, thus,
2737  // eliminated by later passes.
2738  void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
2739  Instruction *J, Instruction *K,
2740  Instruction *&InsertionPt,
2741  Instruction *&K1, Instruction *&K2) {
2742  if (isa<StoreInst>(I)) {
2743  AA->replaceWithNewValue(I, K);
2744  AA->replaceWithNewValue(J, K);
2745  } else {
2746  Type *IType = I->getType();
2747  Type *JType = J->getType();
2748 
2749  VectorType *VType = getVecTypeForPair(IType, JType);
2750  unsigned numElem = VType->getNumElements();
2751 
2752  unsigned numElemI = getNumScalarElements(IType);
2753  unsigned numElemJ = getNumScalarElements(JType);
2754 
2755  if (IType->isVectorTy()) {
2756  std::vector<Constant*> Mask1(numElemI), Mask2(numElemI);
2757  for (unsigned v = 0; v < numElemI; ++v) {
2758  Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2759  Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ+v);
2760  }
2761 
2762  K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
2763  ConstantVector::get( Mask1),
2764  getReplacementName(K, false, 1));
2765  } else {
2766  Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
2767  K1 = ExtractElementInst::Create(K, CV0,
2768  getReplacementName(K, false, 1));
2769  }
2770 
2771  if (JType->isVectorTy()) {
2772  std::vector<Constant*> Mask1(numElemJ), Mask2(numElemJ);
2773  for (unsigned v = 0; v < numElemJ; ++v) {
2774  Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2775  Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI+v);
2776  }
2777 
2778  K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
2779  ConstantVector::get( Mask2),
2780  getReplacementName(K, false, 2));
2781  } else {
2782  Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1);
2783  K2 = ExtractElementInst::Create(K, CV1,
2784  getReplacementName(K, false, 2));
2785  }
2786 
2787  K1->insertAfter(K);
2788  K2->insertAfter(K1);
2789  InsertionPt = K2;
2790  }
2791  }
2792 
2793  // Move all uses of the function I (including pairing-induced uses) after J.
2794  bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB,
2795  DenseSet<ValuePair> &LoadMoveSetPairs,
2796  Instruction *I, Instruction *J) {
2797  // Skip to the first instruction past I.
2799 
2801  AliasSetTracker WriteSet(*AA);
2802  if (I->mayWriteToMemory()) WriteSet.add(I);
2803 
2804  for (; cast<Instruction>(L) != J; ++L)
2805  (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs);
2806 
2807  assert(cast<Instruction>(L) == J &&
2808  "Tracking has not proceeded far enough to check for dependencies");
2809  // If J is now in the use set of I, then trackUsesOfI will return true
2810  // and we have a dependency cycle (and the fusing operation must abort).
2811  return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs);
2812  }
2813 
2814  // Move all uses of the function I (including pairing-induced uses) after J.
2815  void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB,
2816  DenseSet<ValuePair> &LoadMoveSetPairs,
2817  Instruction *&InsertionPt,
2818  Instruction *I, Instruction *J) {
2819  // Skip to the first instruction past I.
2821 
2823  AliasSetTracker WriteSet(*AA);
2824  if (I->mayWriteToMemory()) WriteSet.add(I);
2825 
2826  for (; cast<Instruction>(L) != J;) {
2827  if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs)) {
2828  // Move this instruction
2829  Instruction *InstToMove = L; ++L;
2830 
2831  DEBUG(dbgs() << "BBV: moving: " << *InstToMove <<
2832  " to after " << *InsertionPt << "\n");
2833  InstToMove->removeFromParent();
2834  InstToMove->insertAfter(InsertionPt);
2835  InsertionPt = InstToMove;
2836  } else {
2837  ++L;
2838  }
2839  }
2840  }
2841 
2842  // Collect all load instruction that are in the move set of a given first
2843  // pair member. These loads depend on the first instruction, I, and so need
2844  // to be moved after J (the second instruction) when the pair is fused.
2845  void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB,
2846  DenseMap<Value *, Value *> &ChosenPairs,
2847  DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
2848  DenseSet<ValuePair> &LoadMoveSetPairs,
2849  Instruction *I) {
2850  // Skip to the first instruction past I.
2852 
2854  AliasSetTracker WriteSet(*AA);
2855  if (I->mayWriteToMemory()) WriteSet.add(I);
2856 
2857  // Note: We cannot end the loop when we reach J because J could be moved
2858  // farther down the use chain by another instruction pairing. Also, J
2859  // could be before I if this is an inverted input.
2860  for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) {
2861  if (trackUsesOfI(Users, WriteSet, I, L)) {
2862  if (L->mayReadFromMemory()) {
2863  LoadMoveSet[L].push_back(I);
2864  LoadMoveSetPairs.insert(ValuePair(L, I));
2865  }
2866  }
2867  }
2868  }
2869 
2870  // In cases where both load/stores and the computation of their pointers
2871  // are chosen for vectorization, we can end up in a situation where the
2872  // aliasing analysis starts returning different query results as the
2873  // process of fusing instruction pairs continues. Because the algorithm
2874  // relies on finding the same use dags here as were found earlier, we'll
2875  // need to precompute the necessary aliasing information here and then
2876  // manually update it during the fusion process.
2877  void BBVectorize::collectLoadMoveSet(BasicBlock &BB,
2878  std::vector<Value *> &PairableInsts,
2879  DenseMap<Value *, Value *> &ChosenPairs,
2880  DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
2881  DenseSet<ValuePair> &LoadMoveSetPairs) {
2882  for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
2883  PIE = PairableInsts.end(); PI != PIE; ++PI) {
2884  DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI);
2885  if (P == ChosenPairs.end()) continue;
2886 
2887  Instruction *I = cast<Instruction>(P->first);
2888  collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet,
2889  LoadMoveSetPairs, I);
2890  }
2891  }
2892 
2893  // When the first instruction in each pair is cloned, it will inherit its
2894  // parent's metadata. This metadata must be combined with that of the other
2895  // instruction in a safe way.
2896  void BBVectorize::combineMetadata(Instruction *K, const Instruction *J) {
2898  K->getAllMetadataOtherThanDebugLoc(Metadata);
2899  for (unsigned i = 0, n = Metadata.size(); i < n; ++i) {
2900  unsigned Kind = Metadata[i].first;
2901  MDNode *JMD = J->getMetadata(Kind);
2902  MDNode *KMD = Metadata[i].second;
2903 
2904  switch (Kind) {
2905  default:
2906  K->setMetadata(Kind, 0); // Remove unknown metadata
2907  break;
2908  case LLVMContext::MD_tbaa:
2909  K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD));
2910  break;
2912  K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD));
2913  break;
2914  }
2915  }
2916  }
2917 
2918  // This function fuses the chosen instruction pairs into vector instructions,
2919  // taking care preserve any needed scalar outputs and, then, it reorders the
2920  // remaining instructions as needed (users of the first member of the pair
2921  // need to be moved to after the location of the second member of the pair
2922  // because the vector instruction is inserted in the location of the pair's
2923  // second member).
2924  void BBVectorize::fuseChosenPairs(BasicBlock &BB,
2925  std::vector<Value *> &PairableInsts,
2926  DenseMap<Value *, Value *> &ChosenPairs,
2927  DenseSet<ValuePair> &FixedOrderPairs,
2928  DenseMap<VPPair, unsigned> &PairConnectionTypes,
2929  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
2930  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps) {
2931  LLVMContext& Context = BB.getContext();
2932 
2933  // During the vectorization process, the order of the pairs to be fused
2934  // could be flipped. So we'll add each pair, flipped, into the ChosenPairs
2935  // list. After a pair is fused, the flipped pair is removed from the list.
2936  DenseSet<ValuePair> FlippedPairs;
2937  for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(),
2938  E = ChosenPairs.end(); P != E; ++P)
2939  FlippedPairs.insert(ValuePair(P->second, P->first));
2940  for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(),
2941  E = FlippedPairs.end(); P != E; ++P)
2942  ChosenPairs.insert(*P);
2943 
2945  DenseSet<ValuePair> LoadMoveSetPairs;
2946  collectLoadMoveSet(BB, PairableInsts, ChosenPairs,
2947  LoadMoveSet, LoadMoveSetPairs);
2948 
2949  DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
2950 
2951  for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
2952  DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI);
2953  if (P == ChosenPairs.end()) {
2954  ++PI;
2955  continue;
2956  }
2957 
2958  if (getDepthFactor(P->first) == 0) {
2959  // These instructions are not really fused, but are tracked as though
2960  // they are. Any case in which it would be interesting to fuse them
2961  // will be taken care of by InstCombine.
2962  --NumFusedOps;
2963  ++PI;
2964  continue;
2965  }
2966 
2967  Instruction *I = cast<Instruction>(P->first),
2968  *J = cast<Instruction>(P->second);
2969 
2970  DEBUG(dbgs() << "BBV: fusing: " << *I <<
2971  " <-> " << *J << "\n");
2972 
2973  // Remove the pair and flipped pair from the list.
2974  DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second);
2975  assert(FP != ChosenPairs.end() && "Flipped pair not found in list");
2976  ChosenPairs.erase(FP);
2977  ChosenPairs.erase(P);
2978 
2979  if (!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)) {
2980  DEBUG(dbgs() << "BBV: fusion of: " << *I <<
2981  " <-> " << *J <<
2982  " aborted because of non-trivial dependency cycle\n");
2983  --NumFusedOps;
2984  ++PI;
2985  continue;
2986  }
2987 
2988  // If the pair must have the other order, then flip it.
2989  bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I));
2990  if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) {
2991  // This pair does not have a fixed order, and so we might want to
2992  // flip it if that will yield fewer shuffles. We count the number
2993  // of dependencies connected via swaps, and those directly connected,
2994  // and flip the order if the number of swaps is greater.
2995  bool OrigOrder = true;
2997  ConnectedPairDeps.find(ValuePair(I, J));
2998  if (IJ == ConnectedPairDeps.end()) {
2999  IJ = ConnectedPairDeps.find(ValuePair(J, I));
3000  OrigOrder = false;
3001  }
3002 
3003  if (IJ != ConnectedPairDeps.end()) {
3004  unsigned NumDepsDirect = 0, NumDepsSwap = 0;
3005  for (std::vector<ValuePair>::iterator T = IJ->second.begin(),
3006  TE = IJ->second.end(); T != TE; ++T) {
3007  VPPair Q(IJ->first, *T);
3009  PairConnectionTypes.find(VPPair(Q.second, Q.first));
3010  assert(R != PairConnectionTypes.end() &&
3011  "Cannot find pair connection type");
3012  if (R->second == PairConnectionDirect)
3013  ++NumDepsDirect;
3014  else if (R->second == PairConnectionSwap)
3015  ++NumDepsSwap;
3016  }
3017 
3018  if (!OrigOrder)
3019  std::swap(NumDepsDirect, NumDepsSwap);
3020 
3021  if (NumDepsSwap > NumDepsDirect) {
3022  FlipPairOrder = true;
3023  DEBUG(dbgs() << "BBV: reordering pair: " << *I <<
3024  " <-> " << *J << "\n");
3025  }
3026  }
3027  }
3028 
3029  Instruction *L = I, *H = J;
3030  if (FlipPairOrder)
3031  std::swap(H, L);
3032 
3033  // If the pair being fused uses the opposite order from that in the pair
3034  // connection map, then we need to flip the types.
3036  ConnectedPairs.find(ValuePair(H, L));
3037  if (HL != ConnectedPairs.end())
3038  for (std::vector<ValuePair>::iterator T = HL->second.begin(),
3039  TE = HL->second.end(); T != TE; ++T) {
3040  VPPair Q(HL->first, *T);
3041  DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(Q);
3042  assert(R != PairConnectionTypes.end() &&
3043  "Cannot find pair connection type");
3044  if (R->second == PairConnectionDirect)
3045  R->second = PairConnectionSwap;
3046  else if (R->second == PairConnectionSwap)
3047  R->second = PairConnectionDirect;
3048  }
3049 
3050  bool LBeforeH = !FlipPairOrder;
3051  unsigned NumOperands = I->getNumOperands();
3052  SmallVector<Value *, 3> ReplacedOperands(NumOperands);
3053  getReplacementInputsForPair(Context, L, H, ReplacedOperands,
3054  LBeforeH);
3055 
3056  // Make a copy of the original operation, change its type to the vector
3057  // type and replace its operands with the vector operands.
3058  Instruction *K = L->clone();
3059  if (L->hasName())
3060  K->takeName(L);
3061  else if (H->hasName())
3062  K->takeName(H);
3063 
3064  if (!isa<StoreInst>(K))
3065  K->mutateType(getVecTypeForPair(L->getType(), H->getType()));
3066 
3067  combineMetadata(K, H);
3069 
3070  for (unsigned o = 0; o < NumOperands; ++o)
3071  K->setOperand(o, ReplacedOperands[o]);
3072 
3073  K->insertAfter(J);
3074 
3075  // Instruction insertion point:
3076  Instruction *InsertionPt = K;
3077  Instruction *K1 = 0, *K2 = 0;
3078  replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2);
3079 
3080  // The use dag of the first original instruction must be moved to after
3081  // the location of the second instruction. The entire use dag of the
3082  // first instruction is disjoint from the input dag of the second
3083  // (by definition), and so commutes with it.
3084 
3085  moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J);
3086 
3087  if (!isa<StoreInst>(I)) {
3088  L->replaceAllUsesWith(K1);
3089  H->replaceAllUsesWith(K2);
3090  AA->replaceWithNewValue(L, K1);
3091  AA->replaceWithNewValue(H, K2);
3092  }
3093 
3094  // Instructions that may read from memory may be in the load move set.
3095  // Once an instruction is fused, we no longer need its move set, and so
3096  // the values of the map never need to be updated. However, when a load
3097  // is fused, we need to merge the entries from both instructions in the
3098  // pair in case those instructions were in the move set of some other
3099  // yet-to-be-fused pair. The loads in question are the keys of the map.
3100  if (I->mayReadFromMemory()) {
3101  std::vector<ValuePair> NewSetMembers;
3103  LoadMoveSet.find(I);
3104  if (II != LoadMoveSet.end())
3105  for (std::vector<Value *>::iterator N = II->second.begin(),
3106  NE = II->second.end(); N != NE; ++N)
3107  NewSetMembers.push_back(ValuePair(K, *N));
3109  LoadMoveSet.find(J);
3110  if (JJ != LoadMoveSet.end())
3111  for (std::vector<Value *>::iterator N = JJ->second.begin(),
3112  NE = JJ->second.end(); N != NE; ++N)
3113  NewSetMembers.push_back(ValuePair(K, *N));
3114  for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(),
3115  AE = NewSetMembers.end(); A != AE; ++A) {
3116  LoadMoveSet[A->first].push_back(A->second);
3117  LoadMoveSetPairs.insert(*A);
3118  }
3119  }
3120 
3121  // Before removing I, set the iterator to the next instruction.
3123  if (cast<Instruction>(PI) == J)
3124  ++PI;
3125 
3126  SE->forgetValue(I);
3127  SE->forgetValue(J);
3128  I->eraseFromParent();
3129  J->eraseFromParent();
3130 
3131  DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" <<
3132  BB << "\n");
3133  }
3134 
3135  DEBUG(dbgs() << "BBV: final: \n" << BB << "\n");
3136  }
3137 }
3138 
3139 char BBVectorize::ID = 0;
3140 static const char bb_vectorize_name[] = "Basic-Block Vectorization";
3141 INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
3146 INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
3147 
3149  return new BBVectorize(C);
3150 }
3151 
3152 bool
3154  BBVectorize BBVectorizer(P, C);
3155  return BBVectorizer.vectorizeBB(BB);
3156 }
3157 
3158 //===----------------------------------------------------------------------===//
3167  VectorizeFMA = !::NoFMA;
3169  VectorizeCmp = !::NoCmp;
3170  VectorizeGEP = !::NoGEP;
3177  MaxInsts = ::MaxInsts;
3178  MaxPairs = ::MaxPairs;
3179  MaxIter = ::MaxIter;
3182  FastDep = ::FastDep;
3183 }
bool VectorizeFMA
Vectorize the fused-multiply-add intrinsic.
use_iterator use_end()
Definition: Value.h:152
static cl::opt< unsigned > MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden, cl::desc("The maximum number of candidate instruction pairs per group"))
Abstract base class of comparison instructions.
Definition: InstrTypes.h:633
STATISTIC(NumFusedOps,"Number of operations fused by bb-vectorize")
AnalysisUsage & addPreserved()
static PassRegistry * getPassRegistry()
bool hasName() const
Definition: Value.h:117
unsigned getScalarSizeInBits()
Definition: Type.cpp:135
The main container class for the LLVM Intermediate Representation.
Definition: Module.h:112
static cl::opt< bool > DebugPairSelection("bb-vectorize-debug-pair-selection", cl::init(false), cl::Hidden, cl::desc("When debugging is enabled, output information on the"" pair-selection process"))
enable_if_c<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:266
unsigned getNumOperands() const
Definition: User.h:108
virtual void getAnalysisUsage(AnalysisUsage &) const
Definition: Pass.cpp:75
void initializeBBVectorizePass(PassRegistry &)
static cl::opt< bool > NoMemOpBoost("bb-vectorize-no-mem-op-boost", cl::init(false), cl::Hidden, cl::desc("Don't boost the chain-depth contribution of loads and stores"))
static PointerType * get(Type *ElementType, unsigned AddressSpace)
Definition: Type.cpp:730
static cl::opt< bool > IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false), cl::Hidden, cl::desc("Ignore target information"))
bool VectorizeMath
Vectorize floating-point math intrinsics.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:116
MDNode - a tuple of other values.
Definition: Metadata.h:69
F(f)
iv Induction Variable Users
Definition: IVUsers.cpp:39
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: Type.cpp:218
op_iterator op_begin()
Definition: User.h:116
LoopInfoBase< BlockT, LoopT > * LI
Definition: LoopInfoImpl.h:411
static cl::opt< unsigned > VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden, cl::desc("The size of the native vector registers"))
Type * getPointerElementType() const
Definition: Type.h:373
StringRef getName() const
Definition: Value.cpp:167
void getAllMetadataOtherThanDebugLoc(SmallVectorImpl< std::pair< unsigned, MDNode * > > &MDs) const
Definition: Instruction.h:162
bool isSingleValueType() const
Definition: Type.h:259
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:167
static cl::opt< bool > AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden, cl::desc("Only generate aligned loads and stores"))
unsigned MaxCandPairsForCycleCheck
The maximum number of candidate pairs with which to use a full cycle check.
bool erase(const ValueT &V)
Definition: DenseSet.h:49
Base class of casting instructions.
Definition: InstrTypes.h:387
const APInt & getValue() const
Return the constant's value.
Definition: Constants.h:105
const_iterator end() const
AnalysisType * getAnalysisIfAvailable() const
T LLVM_ATTRIBUTE_UNUSED_RESULT pop_back_val()
Definition: SmallVector.h:430
Definition: Use.h:60
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:172
unsigned getNumArgOperands() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:923
int64_t abs64(int64_t x)
Definition: MathExtras.h:579
Check for equivalence ignoring load/store alignment.
Definition: Instruction.h:349
static ConstantInt * ExtractElement(Constant *V, Constant *Idx)
bool empty() const
Definition: DenseSet.h:33
ID
LLVM Calling Convention Representation.
Definition: CallingConv.h:26
Instruction * clone() const
#define false
Definition: ConvertUTF.c:64
static cl::opt< unsigned > MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden, cl::desc("The maximum number of pairable instructions per group"))
static const char bb_vectorize_name[]
#define G(x, y, z)
Definition: MD5.cpp:52
bool Pow2LenOnly
Don't try to form odd-length vectors.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:351
const_iterator begin() const
static cl::opt< bool > NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize floating-point math intrinsics"))
bool mayReadFromMemory() const
static cl::opt< bool > UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false), cl::Hidden, cl::desc("Use the chain depth requirement with"" target information"))
bool LLVM_ATTRIBUTE_UNUSED_RESULT empty() const
Definition: SmallVector.h:56
#define T
static cl::opt< bool > NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"))
static bool isValidElementType(Type *ElemTy)
Definition: Type.cpp:721
This class represents a no-op cast from one type to another.
static std::string utostr(uint64_t X, bool isNeg=false)
Definition: StringExtras.h:88
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Definition: Function.cpp:683
bool insert(const T &V)
Definition: SmallSet.h:59
void replaceAllUsesWith(Value *V)
Definition: Value.cpp:303
unsigned getNumElements() const
Return the number of elements in the Vector type.
Definition: DerivedTypes.h:408
Reverse the order of the vector.
void takeName(Value *V)
Definition: Value.cpp:239
bool VectorizePointers
Vectorize pointer values.
bool isPPC_FP128Ty() const
isPPC_FP128Ty - Return true if this is powerpc long double.
Definition: Type.h:158
bool VectorizeCmp
Vectorize comparison instructions.
unsigned MaxIter
The maximum number of pairing iterations.
bool VectorizeMemOps
Vectorize loads and stores.
bool SimplifyInstructionsInBlock(BasicBlock *BB, const DataLayout *TD=0, const TargetLibraryInfo *TLI=0)
Definition: Local.cpp:398
static cl::opt< bool > FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden, cl::desc("Use a fast instruction dependency analysis"))
ExtractSubvector Index indicates start offset.
static cl::opt< bool > NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize comparison instructions"))
bool isX86_MMXTy() const
isX86_MMXTy - Return true if this is X86 MMX.
Definition: Type.h:182
bool isIntOrIntVectorTy() const
Definition: Type.h:204
#define P(N)
void intersectOptionalDataWith(const Value *V)
Definition: Value.h:258
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:314
unsigned getAlignment() const
Definition: Instructions.h:301
iterator find(const ValueT &V)
Definition: DenseSet.h:113
bool AlignedOnly
Only generate aligned loads and stores.
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", Instruction *InsertBefore=0)
static cl::opt< bool > NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize floating-point values"))
void insertBefore(Instruction *InsertPos)
Definition: Instruction.cpp:78
LLVM Basic Block Representation.
Definition: BasicBlock.h:72
static cl::opt< unsigned > SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, cl::desc("The maximum search distance for instruction pairs"))
bool isVectorTy() const
Definition: Type.h:229
unsigned getIntrinsicID() const LLVM_READONLY
Definition: Function.cpp:371
bool SplatBreaksChain
Replicating one element to a pair breaks the chain.
#define H(x, y, z)
Definition: MD5.cpp:53
static cl::opt< bool > DebugInstructionExamination("bb-vectorize-debug-instruction-examination", cl::init(false), cl::Hidden, cl::desc("When debugging is enabled, output information on the"" instruction-examination process"))
APInt Or(const APInt &LHS, const APInt &RHS)
Bitwise OR function for APInt.
Definition: APInt.h:1845
APInt Xor(const APInt &LHS, const APInt &RHS)
Bitwise XOR function for APInt.
Definition: APInt.h:1850
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", Instruction *InsertBefore=0)
op_iterator op_end()
Definition: User.h:118
ItTy next(ItTy it, Dist n)
Definition: STLExtras.h:154
Value * getOperand(unsigned i) const
Definition: User.h:88
Value * getPointerOperand()
Definition: Instructions.h:223
unsigned SearchLimit
The maximum search distance for instruction pairs.
static cl::opt< bool > NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize casting (conversion) operations"))
static cl::opt< unsigned > MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use"" a full cycle check"))
bool count(const KeyT &Val) const
count - Return true if the specified key is in the map.
Definition: DenseMap.h:103
#define INITIALIZE_AG_DEPENDENCY(depName)
Definition: PassSupport.h:169
static MDNode * getMostGenericTBAA(MDNode *A, MDNode *B)
Methods for metadata merging.
bool isPointerTy() const
Definition: Type.h:220
static UndefValue * get(Type *T)
Definition: Constants.cpp:1334
iterator erase(iterator I)
Definition: SmallVector.h:478
static cl::opt< unsigned > MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden, cl::desc("The maximum number of pairing iterations"))
bool isFPOrFPVectorTy() const
Definition: Type.h:186
void setMetadata(unsigned KindID, MDNode *Node)
Definition: Metadata.cpp:589
bool mayWriteToMemory() const
unsigned MaxPairs
The maximum number of candidate instruction pairs per group.
const unsigned MaxDepth
iterator begin()
Definition: DenseSet.h:107
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:153
AnalysisType & getAnalysis() const
bool count(const ValueT &V) const
Definition: DenseSet.h:45
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:117
Class for constant integers.
Definition: Constants.h:51
bool VectorizeFloats
Vectorize floating-point values.
unsigned getVectorNumElements() const
Definition: Type.cpp:214
iterator end()
Definition: BasicBlock.h:195
Type * getType() const
Definition: Value.h:111
static cl::opt< bool > SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden, cl::desc("Replicating one element to a pair breaks the chain"))
MDNode * getMetadata(unsigned KindID) const
Definition: Instruction.h:140
bool vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C=VectorizeConfig())
Vectorize the BasicBlock.
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:164
static cl::opt< bool > NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize loads and stores"))
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
Definition: Constants.cpp:492
Function * getCalledFunction() const
void setPreservesCFG()
Definition: Pass.cpp:249
static cl::opt< bool > NoPointers("bb-vectorize-no-pointers", cl::init(true), cl::Hidden, cl::desc("Don't try to vectorize pointer values"))
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:228
bool FastDep
Use a fast instruction dependency analysis.
Vectorize configuration.
void setOperand(unsigned i, Value *Val)
Definition: User.h:92
raw_ostream & dbgs()
dbgs - Return a circular-buffered debug stream.
Definition: Debug.cpp:101
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:591
Value * getArgOperand(unsigned i) const
static cl::opt< bool > NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize getelementptr instructions"))
static cl::opt< bool > NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize boolean (i1) values"))
bool VectorizeCasts
Vectorize casting (conversion) operations.
#define BBV_NAME
Definition: BBVectorize.cpp:17
BasicBlockPass * createBBVectorizePass(const VectorizeConfig &C=VectorizeConfig())
bool VectorizeBools
Vectorize boolean values.
APInt And(const APInt &LHS, const APInt &RHS)
Bitwise AND function for APInt.
Definition: APInt.h:1840
static cl::opt< unsigned > ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, cl::desc("The required chain depth for vectorization"))
unsigned ReqChainDepth
The required chain depth for vectorization.
use_iterator use_begin()
Definition: Value.h:150
bool isX86_FP80Ty() const
isX86_FP80Ty - Return true if this is x86 long double.
Definition: Type.h:152
unsigned MaxInsts
The maximum number of pairable instructions per group.
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:241
unsigned size() const
Definition: SmallSet.h:43
static cl::opt< bool > NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize integer values"))
unsigned getAlignment() const
Definition: Instructions.h:181
bool isBinaryOp() const
Definition: Instruction.h:87
void insertAfter(Instruction *InsertPos)
Definition: Instruction.cpp:84
bool NoMemOpBoost
Don't boost the chain-depth contribution of loads and stores.
#define I(x, y, z)
Definition: MD5.cpp:54
#define N
bool VectorizeGEP
Vectorize getelementptr instructions.
static Function * getCalledFunction(const Value *V, bool LookThroughBitCast)
const Type * getScalarType() const
Definition: Type.cpp:51
static cl::opt< bool > PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair", cl::init(false), cl::Hidden, cl::desc("When debugging is enabled, dump the basic block after"" every pair is fused"))
static cl::opt< bool > DebugCycleCheck("bb-vectorize-debug-cycle-check", cl::init(false), cl::Hidden, cl::desc("When debugging is enabled, output information on the"" cycle-checking process"))
static cl::opt< bool > DebugCandidateSelection("bb-vectorize-debug-candidate-selection", cl::init(false), cl::Hidden, cl::desc("When debugging is enabled, output information on the"" candidate-selection process"))
unsigned getPrimitiveSizeInBits() const
Definition: Type.cpp:117
void mutateType(Type *Ty)
Definition: Value.h:342
VectorizeConfig()
Initialize the VectorizeConfig from command line options.
unsigned size() const
Definition: DenseSet.h:34
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:33
bool add(Value *Ptr, uint64_t Size, const MDNode *TBAAInfo)
Module * getParent()
Definition: GlobalValue.h:286
LLVM Value Representation.
Definition: Value.h:66
unsigned getOpcode() const
getOpcode() returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:83
static VectorType * get(Type *ElementType, unsigned NumElements)
Definition: Type.cpp:706
bool VectorizeInts
Vectorize integer values.
Broadcast element 0 to all other elements.
static cl::opt< bool > Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden, cl::desc("Don't try to form non-2^n-length vectors"))
static cl::opt< HelpPrinterWrapper, true, parser< bool > > HOp("help", cl::desc("Display available options (-help-hidden for more)"), cl::location(WrappedNormalPrinter), cl::ValueDisallowed)
#define DEBUG(X)
Definition: Debug.h:97
bool isSameOperationAs(const Instruction *I, unsigned flags=0) const
Determine if one instruction is the same operation as another.
iterator end()
Definition: DenseSet.h:108
iterator getFirstInsertionPt()
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:170
bool VectorizeSelect
Vectorize select instructions.
static MDNode * getMostGenericFPMath(MDNode *A, MDNode *B)
Definition: Metadata.cpp:408
int64_t getSExtValue() const
Return the sign extended value.
Definition: Constants.h:124
Value * getPointerOperand()
Definition: Instructions.h:346
const BasicBlock * getParent() const
Definition: Instruction.h:52
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:99
INITIALIZE_PASS(GlobalMerge,"global-merge","Global Merge", false, false) bool GlobalMerge const DataLayout * TD
static cl::opt< bool > NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize select instructions"))
#define T1
unsigned VectorBits
The size of the native vector registers.