X86ISelLowering.cpp
1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the interfaces that X86 uses to lower LLVM code into a
11 // selection DAG.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #define DEBUG_TYPE "x86-isel"
16 #include "X86ISelLowering.h"
17 #include "Utils/X86ShuffleDecode.h"
18 #include "X86.h"
19 #include "X86CallingConv.h"
20 #include "X86InstrBuilder.h"
21 #include "X86TargetMachine.h"
22 #include "X86TargetObjectFile.h"
23 #include "llvm/ADT/SmallSet.h"
24 #include "llvm/ADT/Statistic.h"
25 #include "llvm/ADT/StringExtras.h"
34 #include "llvm/IR/CallingConv.h"
35 #include "llvm/IR/Constants.h"
36 #include "llvm/IR/DerivedTypes.h"
37 #include "llvm/IR/Function.h"
38 #include "llvm/IR/GlobalAlias.h"
39 #include "llvm/IR/GlobalVariable.h"
40 #include "llvm/IR/Instructions.h"
41 #include "llvm/IR/Intrinsics.h"
42 #include "llvm/IR/LLVMContext.h"
43 #include "llvm/MC/MCAsmInfo.h"
44 #include "llvm/MC/MCContext.h"
45 #include "llvm/MC/MCExpr.h"
46 #include "llvm/MC/MCSymbol.h"
47 #include "llvm/Support/CallSite.h"
48 #include "llvm/Support/Debug.h"
52 #include <bitset>
53 #include <cctype>
54 using namespace llvm;
55 
56 STATISTIC(NumTailCalls, "Number of tail calls");
57 
58 // Forward declarations.
59 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
60  SDValue V2);
61 
62 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
63  SelectionDAG &DAG, SDLoc dl,
64  unsigned vectorWidth) {
65  assert((vectorWidth == 128 || vectorWidth == 256) &&
66  "Unsupported vector width");
67  EVT VT = Vec.getValueType();
68  EVT ElVT = VT.getVectorElementType();
69  unsigned Factor = VT.getSizeInBits()/vectorWidth;
70  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
71  VT.getVectorNumElements()/Factor);
72 
73  // Extract from UNDEF is UNDEF.
74  if (Vec.getOpcode() == ISD::UNDEF)
75  return DAG.getUNDEF(ResultVT);
76 
77  // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
78  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
79 
80  // This is the index of the first element of the vectorWidth-bit chunk
81  // we want.
82  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
83  * ElemsPerChunk);
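  // Worked example (illustrative): for a 256-bit vector of i32 with
  // vectorWidth == 128, ElemsPerChunk == 4; IdxVal == 5 normalizes to
  // ((5 * 32) / 128) * 4 == 4, so the extract starts at the second
  // 128-bit chunk.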
84 
85  // If the input is a buildvector just emit a smaller one.
86  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
87  return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
88  Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk);
89 
90  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
91  SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
92  VecIdx);
93 
94  return Result;
95 
96 }
97 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
98 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
99 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
100 /// instructions or a simple subregister reference. Idx is an index in the
101 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
102 /// lowering EXTRACT_VECTOR_ELT operations easier.
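/// For example (illustrative): extracting the upper half of a v8f32 passes
/// IdxVal == 4 and, with AVX available, matches VEXTRACTF128 with immediate 1.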
103 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
104  SelectionDAG &DAG, SDLoc dl) {
105  assert((Vec.getValueType().is256BitVector() ||
106  Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
107  return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
108 }
109 
110 /// Generate a DAG to grab 256-bits from a 512-bit vector.
111 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
112  SelectionDAG &DAG, SDLoc dl) {
113  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
114  return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
115 }
116 
118  unsigned IdxVal, SelectionDAG &DAG,
119  SDLoc dl, unsigned vectorWidth) {
120  assert((vectorWidth == 128 || vectorWidth == 256) &&
121  "Unsupported vector width");
122  // Inserting an UNDEF subvector leaves Result unchanged.
123  if (Vec.getOpcode() == ISD::UNDEF)
124  return Result;
125  EVT VT = Vec.getValueType();
126  EVT ElVT = VT.getVectorElementType();
127  EVT ResultVT = Result.getValueType();
128 
129  // Insert the relevant vectorWidth bits.
130  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
131 
132  // This is the index of the first element of the vectorWidth-bit chunk
133  // we want.
134  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
135  * ElemsPerChunk);
136 
137  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
138  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
139  VecIdx);
140 }
141 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
142 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
143 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
144 /// simple superregister reference. Idx is an index in the 128 bits
145 /// we want. It need not be aligned to a 128-bit boundary. That makes
146 /// lowering INSERT_VECTOR_ELT operations easier.
148  unsigned IdxVal, SelectionDAG &DAG,
149  SDLoc dl) {
150  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
151  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
152 }
153 
155  unsigned IdxVal, SelectionDAG &DAG,
156  SDLoc dl) {
157  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
158  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
159 }
160 
161 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
162 /// instructions. This is used because creating CONCAT_VECTOR nodes of
163 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
164 /// large BUILD_VECTORS.
166  unsigned NumElems, SelectionDAG &DAG,
167  SDLoc dl) {
168  SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
169  return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
170 }
171 
173  unsigned NumElems, SelectionDAG &DAG,
174  SDLoc dl) {
175  SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
176  return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
177 }
178 
180  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
181  bool is64Bit = Subtarget->is64Bit();
182 
183  if (Subtarget->isTargetEnvMacho()) {
184  if (is64Bit)
185  return new X86_64MachoTargetObjectFile();
186  return new TargetLoweringObjectFileMachO();
187  }
188 
189  if (Subtarget->isTargetLinux())
190  return new X86LinuxTargetObjectFile();
191  if (Subtarget->isTargetELF())
192  return new TargetLoweringObjectFileELF();
193  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
194  return new TargetLoweringObjectFileCOFF();
195  llvm_unreachable("unknown subtarget type");
196 }
197 
199  : TargetLowering(TM, createTLOF(TM)) {
200  Subtarget = &TM.getSubtarget<X86Subtarget>();
201  X86ScalarSSEf64 = Subtarget->hasSSE2();
202  X86ScalarSSEf32 = Subtarget->hasSSE1();
203  TD = getDataLayout();
204 
206 }
207 
209  const TargetMachine &TM = getTargetMachine();
210  static bool FirstTimeThrough = true;
211 
212  // If none of the target options have changed, then we don't need to reset the
213  // operation actions.
214  if (!FirstTimeThrough && TO == TM.Options) return;
215 
216  if (!FirstTimeThrough) {
217  // Reinitialize the actions.
218  initActions();
219  FirstTimeThrough = false;
220  }
221 
222  TO = TM.Options;
223 
224  // Set up the TargetLowering object.
225  static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
226 
227  // X86 is weird; it always uses i8 for shift amounts and setcc results.
229  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
231 
232  // For 64-bit, since we have so many registers, use the ILP scheduler; for
233  // 32-bit code, use register-pressure-specific scheduling.
234  // For Atom, always use ILP scheduling.
235  if (Subtarget->isAtom())
237  else if (Subtarget->is64Bit())
239  else
241  const X86RegisterInfo *RegInfo =
242  static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
244 
245  // Bypass expensive divides on Atom when compiling with O2
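  // (Illustrative) addBypassSlowDiv(32, 8) asks for a runtime check so that a
  // 32-bit division whose operands happen to fit in 8 bits is performed with
  // the much cheaper 8-bit divide instead.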
246  if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
247  addBypassSlowDiv(32, 8);
248  if (Subtarget->is64Bit())
249  addBypassSlowDiv(64, 16);
250  }
251 
252  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
253  // Setup Windows compiler runtime calls.
254  setLibcallName(RTLIB::SDIV_I64, "_alldiv");
255  setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
256  setLibcallName(RTLIB::SREM_I64, "_allrem");
257  setLibcallName(RTLIB::UREM_I64, "_aullrem");
258  setLibcallName(RTLIB::MUL_I64, "_allmul");
264 
265  // The _ftol2 runtime function has an unusual calling conv, which
266  // is modeled by a special pseudo-instruction.
271  }
272 
273  if (Subtarget->isTargetDarwin()) {
274  // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
275  setUseUnderscoreSetJmp(false);
277  } else if (Subtarget->isTargetMingw()) {
278  // MS runtime is weird: it exports _setjmp, but longjmp!
281  } else {
284  }
285 
286  // Set up the register classes.
287  addRegisterClass(MVT::i8, &X86::GR8RegClass);
288  addRegisterClass(MVT::i16, &X86::GR16RegClass);
289  addRegisterClass(MVT::i32, &X86::GR32RegClass);
290  if (Subtarget->is64Bit())
291  addRegisterClass(MVT::i64, &X86::GR64RegClass);
292 
294 
295  // We don't accept any truncstore of integer registers.
302 
303  // SETOEQ and SETUNE require checking two conditions.
310 
311  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
312  // operation.
316 
317  if (Subtarget->is64Bit()) {
320  } else if (!TM.Options.UseSoftFloat) {
321  // We have an algorithm for SSE2->double, and we turn this into a
322  // 64-bit FILD followed by conditional FADD for other targets.
324  // We have an algorithm for SSE2, and we turn this into a 64-bit
325  // FILD for other targets.
327  }
328 
329  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
330  // this operation.
333 
334  if (!TM.Options.UseSoftFloat) {
335  // SSE has no i16 to fp conversion, only i32
336  if (X86ScalarSSEf32) {
338  // f32 and f64 cases are Legal, f80 case is not
340  } else {
343  }
344  } else {
347  }
348 
349  // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
350  // are Legal, f80 is custom lowered.
353 
354  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
355  // this operation.
358 
359  if (X86ScalarSSEf32) {
361  // f32 and f64 cases are Legal, f80 case is not
363  } else {
366  }
367 
368  // Handle FP_TO_UINT by promoting the destination to a larger signed
369  // conversion.
373 
374  if (Subtarget->is64Bit()) {
377  } else if (!TM.Options.UseSoftFloat) {
378  // Since AVX is a superset of SSE3, only check for SSE here.
379  if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
380  // Expand FP_TO_UINT into a select.
381  // FIXME: We would like to use a Custom expander here eventually to do
382  // the optimal thing for SSE vs. the default expansion in the legalizer.
384  else
385  // With SSE3 we can use fisttpll to convert to a signed i64; without
386  // SSE, we're stuck with a fistpll.
388  }
389 
390  if (isTargetFTOL()) {
391  // Use the _ftol2 runtime function, which has a pseudo-instruction
392  // to handle its weird calling convention.
394  }
395 
396  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
397  if (!X86ScalarSSEf64) {
400  if (Subtarget->is64Bit()) {
402  // Without SSE, i64->f64 goes through memory.
404  }
405  }
406 
407  // Scalar integer divide and remainder are lowered to use operations that
408  // produce two results, to match the available instructions. This exposes
409  // the two-result form to trivial CSE, which is able to combine x/y and x%y
410  // into a single instruction.
411  //
412  // Scalar integer multiply-high is also lowered to use two-result
413  // operations, to match the available instructions. However, plain multiply
414  // (low) operations are left as Legal, as there are single-result
415  // instructions for this in x86. Using the two-result multiply instructions
416  // when both high and low results are needed must be arranged by dagcombine.
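  // For example (illustrative): computing both x / y and x % y on i32 yields a
  // single two-result divide node after CSE, which selects to one idiv.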
417  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
418  MVT VT = IntVTs[i];
425 
426  // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
431  }
432 
443  if (Subtarget->is64Bit())
453 
454  // Promote the i8 variants and force them on up to i32 which has a shorter
455  // encoding.
460  if (Subtarget->hasBMI()) {
463  if (Subtarget->is64Bit())
465  } else {
468  if (Subtarget->is64Bit())
470  }
471 
472  if (Subtarget->hasLZCNT()) {
473  // When promoting the i8 variants, force them to i32 for a shorter
474  // encoding.
481  if (Subtarget->is64Bit())
483  } else {
490  if (Subtarget->is64Bit()) {
493  }
494  }
495 
496  if (Subtarget->hasPOPCNT()) {
498  } else {
502  if (Subtarget->is64Bit())
504  }
505 
508 
509  // These should be promoted to a larger select which is supported.
511  // X86 wants to expand cmov itself.
524  if (Subtarget->is64Bit()) {
527  }
529  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
530  // SjLj exception handling but a lightweight setjmp/longjmp replacement to
531  // support continuation, user-level threading, etc. As a result, no
532  // other SjLj exception interfaces are implemented; please don't build
533  // your own exception handling based on them.
534  // LLVM/Clang supports zero-cost DWARF exception handling.
537 
538  // Darwin ABI issue.
543  if (Subtarget->is64Bit())
547  if (Subtarget->is64Bit()) {
553  }
554  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
558  if (Subtarget->is64Bit()) {
562  }
563 
564  if (Subtarget->hasSSE1())
566 
568 
569  // Expand certain atomics
570  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
571  MVT VT = IntVTs[i];
575  }
576 
577  if (!Subtarget->is64Bit()) {
590  }
591 
592  if (Subtarget->hasCmpxchg16b()) {
594  }
595 
596  // FIXME - use subtarget debug flags
597  if (!Subtarget->isTargetDarwin() &&
598  !Subtarget->isTargetELF() &&
599  !Subtarget->isTargetCygMing()) {
601  }
602 
603  if (Subtarget->is64Bit()) {
604  setExceptionPointerRegister(X86::RAX);
606  } else {
609  }
612 
615 
618 
619  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
622  if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
623  // TargetInfo::X86_64ABIBuiltinVaList
626  } else {
627  // TargetInfo::CharPtrBuiltinVaList
630  }
631 
634 
635  if (Subtarget->isOSWindows() && !Subtarget->isTargetEnvMacho())
638  else if (TM.Options.EnableSegmentedStacks)
641  else
644 
645  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
646  // f32 and f64 use SSE.
647  // Set up the FP register classes.
648  addRegisterClass(MVT::f32, &X86::FR32RegClass);
649  addRegisterClass(MVT::f64, &X86::FR64RegClass);
650 
651  // Use ANDPD to simulate FABS.
654 
655  // Use XORP to simulate FNEG.
658 
659  // Use ANDPD and ORPD to simulate FCOPYSIGN.
662 
663  // Lower this to FGETSIGNx86 plus an AND.
666 
667  // We don't support sin/cos/fmod
674 
675  // Expand FP immediates into loads from the stack, except for the special
676  // cases we handle.
677  addLegalFPImmediate(APFloat(+0.0)); // xorpd
678  addLegalFPImmediate(APFloat(+0.0f)); // xorps
679  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
680  // Use SSE for f32, x87 for f64.
681  // Set up the FP register classes.
682  addRegisterClass(MVT::f32, &X86::FR32RegClass);
683  addRegisterClass(MVT::f64, &X86::RFP64RegClass);
684 
685  // Use ANDPS to simulate FABS.
687 
688  // Use XORP to simulate FNEG.
690 
692 
693  // Use ANDPS and ORPS to simulate FCOPYSIGN.
696 
697  // We don't support sin/cos/fmod
701 
702  // Special cases we handle for FP constants.
703  addLegalFPImmediate(APFloat(+0.0f)); // xorps
704  addLegalFPImmediate(APFloat(+0.0)); // FLD0
705  addLegalFPImmediate(APFloat(+1.0)); // FLD1
706  addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
707  addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
708 
709  if (!TM.Options.UnsafeFPMath) {
713  }
714  } else if (!TM.Options.UseSoftFloat) {
715  // f32 and f64 in x87.
716  // Set up the FP register classes.
717  addRegisterClass(MVT::f64, &X86::RFP64RegClass);
718  addRegisterClass(MVT::f32, &X86::RFP32RegClass);
719 
724 
725  if (!TM.Options.UnsafeFPMath) {
732  }
733  addLegalFPImmediate(APFloat(+0.0)); // FLD0
734  addLegalFPImmediate(APFloat(+1.0)); // FLD1
735  addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
736  addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
737  addLegalFPImmediate(APFloat(+0.0f)); // FLD0
738  addLegalFPImmediate(APFloat(+1.0f)); // FLD1
739  addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
740  addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
741  }
742 
743  // We don't support FMA.
746 
747  // Long double always uses X87.
748  if (!TM.Options.UseSoftFloat) {
749  addRegisterClass(MVT::f80, &X86::RFP80RegClass);
752  {
754  addLegalFPImmediate(TmpFlt); // FLD0
755  TmpFlt.changeSign();
756  addLegalFPImmediate(TmpFlt); // FLD0/FCHS
757 
758  bool ignored;
759  APFloat TmpFlt2(+1.0);
761  &ignored);
762  addLegalFPImmediate(TmpFlt2); // FLD1
763  TmpFlt2.changeSign();
764  addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
765  }
766 
767  if (!TM.Options.UnsafeFPMath) {
771  }
772 
779  }
780 
781  // Always use a library call for pow.
785 
791 
792  // First set operation action for all vector types to either promote
793  // (for widening) or expand (for scalarization). Then we will selectively
794  // turn on ones that can be effectively codegen'd.
795  for (int i = MVT::FIRST_VECTOR_VALUETYPE;
796  i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
797  MVT VT = (MVT::SimpleValueType)i;
863  for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
864  InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
866  (MVT::SimpleValueType)InnerVT, Expand);
870  }
871 
872  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
873  // with -msoft-float, disable use of MMX as well.
874  if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
875  addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
876  // No operations on x86mmx supported, everything uses intrinsics.
877  }
878 
879  // MMX-sized vectors (other than x86mmx) are expected to be expanded
880  // into smaller operations.
910 
911  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
912  addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
913 
926  }
927 
928  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
929  addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
930 
931  // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
932  // registers cannot be used even for integer operations.
933  addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
934  addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
935  addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
936  addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
937 
956 
961 
967 
968  // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
969  for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
970  MVT VT = (MVT::SimpleValueType)i;
971  // Do not attempt to custom lower non-power-of-2 vectors
973  continue;
974  // Do not attempt to custom lower non-128-bit vectors
975  if (!VT.is128BitVector())
976  continue;
980  }
981 
988 
989  if (Subtarget->is64Bit()) {
992  }
993 
994  // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
995  for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
996  MVT VT = (MVT::SimpleValueType)i;
997 
998  // Do not attempt to promote non-128-bit vectors
999  if (!VT.is128BitVector())
1000  continue;
1001 
1012  }
1013 
1015 
1016  // Custom lower v2i64 and v2f64 selects.
1021 
1024 
1027  // As there is no 64-bit GPR available, we need to build a special custom
1028  // sequence to convert from v2i32 to v2f32.
1029  if (!Subtarget->is64Bit())
1031 
1034 
1036  }
1037 
1038  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
1049 
1060 
1061  // FIXME: Do we need to handle scalar-to-vector here?
1063 
1069 
1070  // i8 and i16 vectors are custom, because the source register and source
1071  // memory operand types are not the same width. f32 vectors are
1072  // custom since the immediate controlling the insert encodes additional
1073  // information.
1078 
1083 
1084  // FIXME: these should be Legal but that's only for the case where
1085  // the index is constant. For now custom expand to deal with that.
1086  if (Subtarget->is64Bit()) {
1089  }
1090  }
1091 
1092  if (Subtarget->hasSSE2()) {
1095 
1098 
1101 
1102  // In the customized shift lowering, the legal cases in AVX2 will be
1103  // recognized.
1106 
1109 
1111 
1114  }
1115 
1116  if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
1117  addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
1118  addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
1119  addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
1120  addRegisterClass(MVT::v8f32, &X86::VR256RegClass);
1121  addRegisterClass(MVT::v4i64, &X86::VR256RegClass);
1122  addRegisterClass(MVT::v4f64, &X86::VR256RegClass);
1123 
1127 
1140 
1153 
1155 
1160 
1163 
1165 
1168 
1171 
1174 
1176 
1181 
1185 
1190 
1203 
1204  if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
1211  }
1212 
1213  if (Subtarget->hasInt256()) {
1218 
1223 
1227  // Don't lower v32i8 because there is no 128-bit byte mul
1228 
1230 
1232  } else {
1237 
1242 
1246  // Don't lower v32i8 because there is no 128-bit byte mul
1247  }
1248 
1249  // In the customized shift lowering, the legal cases in AVX2 will be
1250  // recognized.
1253 
1256 
1258 
1259  // Custom lower several nodes for 256-bit types.
1260  for (int i = MVT::FIRST_VECTOR_VALUETYPE;
1261  i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
1262  MVT VT = (MVT::SimpleValueType)i;
1263 
1264  // Extract subvector is special because the value type
1265  // (result) is 128-bit but the source is 256-bit wide.
1266  if (VT.is128BitVector())
1268 
1269  // Do not attempt to custom lower other non-256-bit vectors
1270  if (!VT.is256BitVector())
1271  continue;
1272 
1280  }
1281 
1282  // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1283  for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
1284  MVT VT = (MVT::SimpleValueType)i;
1285 
1286  // Do not attempt to promote non-256-bit vectors
1287  if (!VT.is256BitVector())
1288  continue;
1289 
1300  }
1301  }
1302 
1303  if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
1304  addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1305  addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1306  addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1307  addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1308 
1309  addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1310  addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1311 
1318 
1325 
1335 
1340  if (Subtarget->is64Bit()) {
1345  }
1354 
1367 
1373 
1376 
1378 
1384 
1387 
1390 
1392 
1395 
1398 
1401 
1408 
1409  // Custom lower several nodes.
1410  for (int i = MVT::FIRST_VECTOR_VALUETYPE;
1411  i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
1412  MVT VT = (MVT::SimpleValueType)i;
1413 
1414  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1415  // Extract subvector is special because the value type
1416  // (result) is 256/128-bit but the source is 512-bit wide.
1417  if (VT.is128BitVector() || VT.is256BitVector())
1419 
1420  if (VT.getVectorElementType() == MVT::i1)
1422 
1423  // Do not attempt to custom lower other non-512-bit vectors
1424  if (!VT.is512BitVector())
1425  continue;
1426 
1427  if (EltSize >= 32) {
1435  }
1436  }
1437  for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1438  MVT VT = (MVT::SimpleValueType)i;
1439 
1440  // Do not attempt to promote non-512-bit vectors
1441  if (!VT.is512BitVector())
1442  continue;
1443 
1446  }
1447  }// has AVX-512
1448 
1449  // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
1450  // of this type with custom code.
1451  for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
1452  VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
1454  Custom);
1455  }
1456 
1457  // We want to custom lower some of our intrinsics.
1461 
1462  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1463  // handle type legalization for these operations here.
1464  //
1465  // FIXME: We really should do custom legalization for addition and
1466  // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1467  // than generic legalization for 64-bit multiplication-with-overflow, though.
1468  for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1469  // Add/Sub/Mul with overflow operations are custom lowered.
1470  MVT VT = IntVTs[i];
1477  }
1478 
1479  // There are no 8-bit 3-address imul/mul instructions
1482 
1483  if (!Subtarget->is64Bit()) {
1484  // These libcalls are not available in 32-bit.
1488  }
1489 
1490  // Combine sin / cos into one node or libcall if possible.
1491  if (Subtarget->hasSinCos()) {
1492  setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1493  setLibcallName(RTLIB::SINCOS_F64, "sincos");
1494  if (Subtarget->isTargetDarwin()) {
1495  // For MacOSX, we don't want the normal expansion of a libcall to
1496  // sincos. We want to issue a libcall to __sincos_stret to avoid memory
1497  // traffic.
1500  }
1501  }
1502 
1503  // We have target-specific dag combine patterns for the following nodes:
1527  if (Subtarget->is64Bit())
1530 
1532 
1533  // On Darwin, -Os means optimize for size without hurting performance, so
1534  // do not reduce the limit.
1535  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1536  MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1537  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1538  MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1539  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1540  MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1541  setPrefLoopAlignment(4); // 2^4 bytes.
1542 
1543  // Predictable cmovs don't hurt on Atom because it's in-order.
1544  PredictableSelectIsExpensive = !Subtarget->isAtom();
1545 
1546  setPrefFunctionAlignment(4); // 2^4 bytes.
1547 }
1548 
1550  if (!VT.isVector())
1551  return MVT::i8;
1552 
1553  const TargetMachine &TM = getTargetMachine();
1554  if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512())
1555  switch(VT.getVectorNumElements()) {
1556  case 8: return MVT::v8i1;
1557  case 16: return MVT::v16i1;
1558  }
1559 
1561 }
1562 
1563 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
1564 /// the desired ByVal argument alignment.
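/// For example (illustrative): a struct containing a <4 x float> field drives
/// MaxAlign up to 16, while a struct of plain i32 fields leaves it unchanged.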
1565 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1566  if (MaxAlign == 16)
1567  return;
1568  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1569  if (VTy->getBitWidth() == 128)
1570  MaxAlign = 16;
1571  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1572  unsigned EltAlign = 0;
1573  getMaxByValAlign(ATy->getElementType(), EltAlign);
1574  if (EltAlign > MaxAlign)
1575  MaxAlign = EltAlign;
1576  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1577  for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1578  unsigned EltAlign = 0;
1579  getMaxByValAlign(STy->getElementType(i), EltAlign);
1580  if (EltAlign > MaxAlign)
1581  MaxAlign = EltAlign;
1582  if (MaxAlign == 16)
1583  break;
1584  }
1585  }
1586 }
1587 
1588 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
1589 /// function arguments in the caller parameter area. For X86, aggregates
1590 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1591 /// are at 4-byte boundaries.
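/// For example (illustrative): on 32-bit x86 with SSE, a byval struct holding a
/// <4 x float> member is aligned to 16 bytes; one without vector members keeps
/// the default 4-byte alignment.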
1593  if (Subtarget->is64Bit()) {
1594  // Max of 8 and alignment of type.
1595  unsigned TyAlign = TD->getABITypeAlignment(Ty);
1596  if (TyAlign > 8)
1597  return TyAlign;
1598  return 8;
1599  }
1600 
1601  unsigned Align = 4;
1602  if (Subtarget->hasSSE1())
1603  getMaxByValAlign(Ty, Align);
1604  return Align;
1605 }
1606 
1607 /// getOptimalMemOpType - Returns the target-specific optimal type for load
1608 /// and store operations as a result of memset, memcpy, and memmove
1609 /// lowering. If DstAlign is zero, the destination alignment can satisfy any
1610 /// constraint. Similarly, if SrcAlign is zero there is no need to check it
1611 /// against the alignment requirement,
1612 /// probably because the source does not need to be loaded. If 'IsMemset' is
1613 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1614 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1615 /// source is constant so it does not need to be loaded.
1616 /// It returns EVT::Other if the type should be determined using generic
1617 /// target-independent logic.
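/// For example (illustrative): a 64-byte memcpy with 16-byte-aligned source and
/// destination on a subtarget with AVX2 returns MVT::v8i32, so the copy is
/// emitted as 256-bit vector loads and stores.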
1618 EVT
1620  unsigned DstAlign, unsigned SrcAlign,
1621  bool IsMemset, bool ZeroMemset,
1622  bool MemcpyStrSrc,
1623  MachineFunction &MF) const {
1624  const Function *F = MF.getFunction();
1625  if ((!IsMemset || ZeroMemset) &&
1626  !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
1628  if (Size >= 16 &&
1629  (Subtarget->isUnalignedMemAccessFast() ||
1630  ((DstAlign == 0 || DstAlign >= 16) &&
1631  (SrcAlign == 0 || SrcAlign >= 16)))) {
1632  if (Size >= 32) {
1633  if (Subtarget->hasInt256())
1634  return MVT::v8i32;
1635  if (Subtarget->hasFp256())
1636  return MVT::v8f32;
1637  }
1638  if (Subtarget->hasSSE2())
1639  return MVT::v4i32;
1640  if (Subtarget->hasSSE1())
1641  return MVT::v4f32;
1642  } else if (!MemcpyStrSrc && Size >= 8 &&
1643  !Subtarget->is64Bit() &&
1644  Subtarget->hasSSE2()) {
1645  // Do not use f64 to lower memcpy if source is string constant. It's
1646  // better to use i32 to avoid the loads.
1647  return MVT::f64;
1648  }
1649  }
1650  if (Subtarget->is64Bit() && Size >= 8)
1651  return MVT::i64;
1652  return MVT::i32;
1653 }
1654 
1656  if (VT == MVT::f32)
1657  return X86ScalarSSEf32;
1658  else if (VT == MVT::f64)
1659  return X86ScalarSSEf64;
1660  return true;
1661 }
1662 
1663 bool
1665  if (Fast)
1666  *Fast = Subtarget->isUnalignedMemAccessFast();
1667  return true;
1668 }
1669 
1670 /// getJumpTableEncoding - Return the entry encoding for a jump table in the
1671 /// current function. The returned value is a member of the
1672 /// MachineJumpTableInfo::JTEntryKind enum.
1674  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1675  // symbol.
1676  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1677  Subtarget->isPICStyleGOT())
1679 
1680  // Otherwise, use the normal jump table encoding heuristics.
1682 }
1683 
1684 const MCExpr *
1686  const MachineBasicBlock *MBB,
1687  unsigned uid,MCContext &Ctx) const{
1688  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1689  Subtarget->isPICStyleGOT());
1690  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1691  // entries.
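  // (Illustrative) each table entry is then emitted along the lines of
  // ".long .LBB0_7@GOTOFF" rather than as an absolute block address.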
1692  return MCSymbolRefExpr::Create(MBB->getSymbol(),
1694 }
1695 
1696 /// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
1697 /// jumptable.
1699  SelectionDAG &DAG) const {
1700  if (!Subtarget->is64Bit())
1701  // This doesn't have SDLoc associated with it, but is not really the
1702  // same as a Register.
1703  return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
1704  return Table;
1705 }
1706 
1707 /// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
1708 /// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
1709 /// MCExpr.
1712  MCContext &Ctx) const {
1713  // X86-64 uses RIP relative addressing based on the jump table label.
1714  if (Subtarget->isPICStyleRIPRel())
1715  return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1716 
1717  // Otherwise, the reference is relative to the PIC base.
1718  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1719 }
1720 
1721 // FIXME: Why is this routine here? Move to RegInfo!
1722 std::pair<const TargetRegisterClass*, uint8_t>
1724  const TargetRegisterClass *RRC = 0;
1725  uint8_t Cost = 1;
1726  switch (VT.SimpleTy) {
1727  default:
1729  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1730  RRC = Subtarget->is64Bit() ?
1731  (const TargetRegisterClass*)&X86::GR64RegClass :
1732  (const TargetRegisterClass*)&X86::GR32RegClass;
1733  break;
1734  case MVT::x86mmx:
1735  RRC = &X86::VR64RegClass;
1736  break;
1737  case MVT::f32: case MVT::f64:
1738  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1739  case MVT::v4f32: case MVT::v2f64:
1740  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1741  case MVT::v4f64:
1742  RRC = &X86::VR128RegClass;
1743  break;
1744  }
1745  return std::make_pair(RRC, Cost);
1746 }
1747 
1749  unsigned &Offset) const {
1750  if (!Subtarget->isTargetLinux())
1751  return false;
1752 
1753  if (Subtarget->is64Bit()) {
1754  // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
1755  Offset = 0x28;
1757  AddressSpace = 256;
1758  else
1759  AddressSpace = 257;
1760  } else {
1761  // %gs:0x14 on i386
1762  Offset = 0x14;
1763  AddressSpace = 256;
1764  }
1765  return true;
1766 }
1767 
1769  unsigned DestAS) const {
1770  assert(SrcAS != DestAS && "Expected different address spaces!");
1771 
1772  return SrcAS < 256 && DestAS < 256;
1773 }
1774 
1775 //===----------------------------------------------------------------------===//
1776 // Return Value Calling Convention Implementation
1777 //===----------------------------------------------------------------------===//
1778 
1779 #include "X86GenCallingConv.inc"
1780 
1781 bool
1782 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
1783  MachineFunction &MF, bool isVarArg,
1784  const SmallVectorImpl<ISD::OutputArg> &Outs,
1785  LLVMContext &Context) const {
1787  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
1788  RVLocs, Context);
1789  return CCInfo.CheckReturn(Outs, RetCC_X86);
1790 }
1791 
1792 const uint16_t *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
1793  static const uint16_t ScratchRegs[] = { X86::R11, 0 };
1794  return ScratchRegs;
1795 }
1796 
1797 SDValue
1798 X86TargetLowering::LowerReturn(SDValue Chain,
1799  CallingConv::ID CallConv, bool isVarArg,
1800  const SmallVectorImpl<ISD::OutputArg> &Outs,
1801  const SmallVectorImpl<SDValue> &OutVals,
1802  SDLoc dl, SelectionDAG &DAG) const {
1803  MachineFunction &MF = DAG.getMachineFunction();
1805 
1807  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
1808  RVLocs, *DAG.getContext());
1809  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
1810 
1811  SDValue Flag;
1812  SmallVector<SDValue, 6> RetOps;
1813  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1814  // Operand #1 = Bytes To Pop
1815  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
1816  MVT::i16));
1817 
1818  // Copy the result values into the output registers.
1819  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1820  CCValAssign &VA = RVLocs[i];
1821  assert(VA.isRegLoc() && "Can only return in registers!");
1822  SDValue ValToCopy = OutVals[i];
1823  EVT ValVT = ValToCopy.getValueType();
1824 
1825  // Promote values to the appropriate types
1826  if (VA.getLocInfo() == CCValAssign::SExt)
1827  ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
1828  else if (VA.getLocInfo() == CCValAssign::ZExt)
1829  ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
1830  else if (VA.getLocInfo() == CCValAssign::AExt)
1831  ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
1832  else if (VA.getLocInfo() == CCValAssign::BCvt)
1833  ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
1834 
1835  // If this is x86-64, and we disabled SSE, we can't return FP values,
1836  // or SSE or MMX vectors.
1837  if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
1838  VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
1839  (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
1840  report_fatal_error("SSE register return with SSE disabled");
1841  }
1842  // Likewise we can't return F64 values with SSE1 only. gcc does so, but
1843  // llvm-gcc has never done it right and no one has noticed, so this
1844  // should be OK for now.
1845  if (ValVT == MVT::f64 &&
1846  (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
1847  report_fatal_error("SSE2 register return with SSE2 disabled");
1848 
1849  // Returns in ST0/ST1 are handled specially: these are pushed as operands to
1850  // the RET instruction and handled by the FP Stackifier.
1851  if (VA.getLocReg() == X86::ST0 ||
1852  VA.getLocReg() == X86::ST1) {
1853  // If this is a copy from an xmm register to ST(0), use an FPExtend to
1854  // change the value to the FP stack register class.
1855  if (isScalarFPTypeInSSEReg(VA.getValVT()))
1856  ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
1857  RetOps.push_back(ValToCopy);
1858  // Don't emit a copytoreg.
1859  continue;
1860  }
1861 
1862  // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
1863  // which is returned in RAX / RDX.
1864  if (Subtarget->is64Bit()) {
1865  if (ValVT == MVT::x86mmx) {
1866  if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
1867  ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
1868  ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
1869  ValToCopy);
1870  // If we don't have SSE2 available, convert to v4f32 so the generated
1871  // register is legal.
1872  if (!Subtarget->hasSSE2())
1873  ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
1874  }
1875  }
1876  }
1877 
1878  Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
1879  Flag = Chain.getValue(1);
1880  RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
1881  }
1882 
1883  // The x86-64 ABIs require that for returning structs by value we copy
1884  // the sret argument into %rax/%eax (depending on ABI) for the return.
1885  // Win32 requires us to put the sret argument to %eax as well.
1886  // We saved the argument into a virtual register in the entry block,
1887  // so now we copy the value out and into %rax/%eax.
1889  (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
1890  MachineFunction &MF = DAG.getMachineFunction();
1892  unsigned Reg = FuncInfo->getSRetReturnReg();
1893  assert(Reg &&
1894  "SRetReturnReg should have been set in LowerFormalArguments().");
1895  SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
1896 
1897  unsigned RetValReg
1898  = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
1899  X86::RAX : X86::EAX;
1900  Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
1901  Flag = Chain.getValue(1);
1902 
1903  // RAX/EAX now acts like a return value.
1904  RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
1905  }
1906 
1907  RetOps[0] = Chain; // Update chain.
1908 
1909  // Add the flag if we have it.
1910  if (Flag.getNode())
1911  RetOps.push_back(Flag);
1912 
1913  return DAG.getNode(X86ISD::RET_FLAG, dl,
1914  MVT::Other, &RetOps[0], RetOps.size());
1915 }
1916 
1917 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
1918  if (N->getNumValues() != 1)
1919  return false;
1920  if (!N->hasNUsesOfValue(1, 0))
1921  return false;
1922 
1923  SDValue TCChain = Chain;
1924  SDNode *Copy = *N->use_begin();
1925  if (Copy->getOpcode() == ISD::CopyToReg) {
1926  // If the copy has a glue operand, we conservatively assume it isn't safe to
1927  // perform a tail call.
1928  if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
1929  return false;
1930  TCChain = Copy->getOperand(0);
1931  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
1932  return false;
1933 
1934  bool HasRet = false;
1935  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
1936  UI != UE; ++UI) {
1937  if (UI->getOpcode() != X86ISD::RET_FLAG)
1938  return false;
1939  HasRet = true;
1940  }
1941 
1942  if (!HasRet)
1943  return false;
1944 
1945  Chain = TCChain;
1946  return true;
1947 }
1948 
1949 MVT
1950 X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
1951  ISD::NodeType ExtendKind) const {
1952  MVT ReturnMVT;
1953  // TODO: Is this also valid on 32-bit?
1954  if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
1955  ReturnMVT = MVT::i8;
1956  else
1957  ReturnMVT = MVT::i32;
1958 
1959  MVT MinVT = getRegisterType(ReturnMVT);
1960  return VT.bitsLT(MinVT) ? MinVT : VT;
1961 }
1962 
1963 /// LowerCallResult - Lower the result values of a call into the
1964 /// appropriate copies out of appropriate physical registers.
1965 ///
1966 SDValue
1967 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1968  CallingConv::ID CallConv, bool isVarArg,
1970  SDLoc dl, SelectionDAG &DAG,
1971  SmallVectorImpl<SDValue> &InVals) const {
1972 
1973  // Assign locations to each value returned by this call.
1975  bool Is64Bit = Subtarget->is64Bit();
1976  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1977  getTargetMachine(), RVLocs, *DAG.getContext());
1978  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1979 
1980  // Copy all of the result registers out of their specified physreg.
1981  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
1982  CCValAssign &VA = RVLocs[i];
1983  EVT CopyVT = VA.getValVT();
1984 
1985  // If this is x86-64, and we disabled SSE, we can't return FP values
1986  if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
1987  ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
1988  report_fatal_error("SSE register return with SSE disabled");
1989  }
1990 
1991  SDValue Val;
1992 
1993  // If this is a call to a function that returns an fp value on the floating
1994  // point stack, we must guarantee the value is popped from the stack, so
1995  // a CopyFromReg is not good enough - the copy instruction may be eliminated
1996  // if the return value is not used. We use the FpPOP_RETVAL instruction
1997  // instead.
1998  if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
1999  // If we prefer to use the value in xmm registers, copy it out as f80 and
2000  // use a truncate to move it from fp stack reg to xmm reg.
2001  if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
2002  SDValue Ops[] = { Chain, InFlag };
2003  Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
2004  MVT::Other, MVT::Glue, Ops), 1);
2005  Val = Chain.getValue(0);
2006 
2007  // Round the f80 to the right size, which also moves it to the appropriate
2008  // xmm register.
2009  if (CopyVT != VA.getValVT())
2010  Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2011  // This truncation won't change the value.
2012  DAG.getIntPtrConstant(1));
2013  } else {
2014  Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2015  CopyVT, InFlag).getValue(1);
2016  Val = Chain.getValue(0);
2017  }
2018  InFlag = Chain.getValue(2);
2019  InVals.push_back(Val);
2020  }
2021 
2022  return Chain;
2023 }
2024 
2025 //===----------------------------------------------------------------------===//
2026 // C & StdCall & Fast Calling Convention implementation
2027 //===----------------------------------------------------------------------===//
2028 // The StdCall calling convention seems to be standard for many Windows API
2029 // routines and the like. It differs from the C calling convention just a little:
2030 // the callee cleans up the stack, not the caller. Symbols should also be
2031 // decorated in some fancy way :) It doesn't support any vector arguments.
2032 // For info on the fast calling convention, see the Fast Calling Convention
2033 // (tail call) implementation, LowerX86_32FastCCCallTo.
2034 
2035 /// CallIsStructReturn - Determines whether a call uses struct return
2036 /// semantics.
2041 };
2042 static StructReturnType
2044  if (Outs.empty())
2045  return NotStructReturn;
2046 
2047  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2048  if (!Flags.isSRet())
2049  return NotStructReturn;
2050  if (Flags.isInReg())
2051  return RegStructReturn;
2052  return StackStructReturn;
2053 }
2054 
2055 /// ArgsAreStructReturn - Determines whether a function uses struct
2056 /// return semantics.
2057 static StructReturnType
2059  if (Ins.empty())
2060  return NotStructReturn;
2061 
2062  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2063  if (!Flags.isSRet())
2064  return NotStructReturn;
2065  if (Flags.isInReg())
2066  return RegStructReturn;
2067  return StackStructReturn;
2068 }
2069 
2070 /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
2071 /// by "Src" to address "Dst" with size and alignment information specified by
2072 /// the specific parameter attribute. The copy will be passed as a byval
2073 /// function parameter.
2074 static SDValue
2077  SDLoc dl) {
2078  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
2079 
2080  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2081  /*isVolatile*/false, /*AlwaysInline=*/true,
2083 }
2084 
2085 /// IsTailCallConvention - Return true if the calling convention is one that
2086 /// supports tail call optimization.
2088  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2089  CC == CallingConv::HiPE);
2090 }
2091 
2092 /// \brief Return true if the calling convention is a C calling convention.
2094  return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
2095  CC == CallingConv::X86_64_SysV);
2096 }
2097 
2098 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2100  return false;
2101 
2102  CallSite CS(CI);
2103  CallingConv::ID CalleeCC = CS.getCallingConv();
2104  if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
2105  return false;
2106 
2107  return true;
2108 }
2109 
2110 /// FuncIsMadeTailCallSafe - Return true if the function is being made into
2111 /// a tailcall target by changing its ABI.
2113  bool GuaranteedTailCallOpt) {
2114  return GuaranteedTailCallOpt && IsTailCallConvention(CC);
2115 }
2116 
2117 SDValue
2118 X86TargetLowering::LowerMemArgument(SDValue Chain,
2119  CallingConv::ID CallConv,
2120  const SmallVectorImpl<ISD::InputArg> &Ins,
2121  SDLoc dl, SelectionDAG &DAG,
2122  const CCValAssign &VA,
2123  MachineFrameInfo *MFI,
2124  unsigned i) const {
2125  // Create the nodes corresponding to a load from this parameter slot.
2126  ISD::ArgFlagsTy Flags = Ins[i].Flags;
2127  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
2128  getTargetMachine().Options.GuaranteedTailCallOpt);
2129  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2130  EVT ValVT;
2131 
2132  // If the value is passed by pointer, the address is passed instead of the
2133  // value itself.
2134  if (VA.getLocInfo() == CCValAssign::Indirect)
2135  ValVT = VA.getLocVT();
2136  else
2137  ValVT = VA.getValVT();
2138 
2139  // FIXME: For now, all byval parameter objects are marked mutable. This can be
2140  // changed with more analysis.
2141  // In case of tail call optimization, mark all arguments mutable, since they
2142  // could be overwritten by the lowering of arguments in case of a tail call.
2143  if (Flags.isByVal()) {
2144  unsigned Bytes = Flags.getByValSize();
2145  if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2146  int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2147  return DAG.getFrameIndex(FI, getPointerTy());
2148  } else {
2149  int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2150  VA.getLocMemOffset(), isImmutable);
2151  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2152  return DAG.getLoad(ValVT, dl, Chain, FIN,
2154  false, false, false, 0);
2155  }
2156 }
2157 
2158 SDValue
2159 X86TargetLowering::LowerFormalArguments(SDValue Chain,
2160  CallingConv::ID CallConv,
2161  bool isVarArg,
2162  const SmallVectorImpl<ISD::InputArg> &Ins,
2163  SDLoc dl,
2164  SelectionDAG &DAG,
2165  SmallVectorImpl<SDValue> &InVals)
2166  const {
2167  MachineFunction &MF = DAG.getMachineFunction();
2169 
2170  const Function* Fn = MF.getFunction();
2171  if (Fn->hasExternalLinkage() &&
2172  Subtarget->isTargetCygMing() &&
2173  Fn->getName() == "main")
2174  FuncInfo->setForceFramePointer(true);
2175 
2176  MachineFrameInfo *MFI = MF.getFrameInfo();
2177  bool Is64Bit = Subtarget->is64Bit();
2178  bool IsWindows = Subtarget->isTargetWindows();
2179  bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2180 
2181  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2182  "Var args not supported with calling convention fastcc, ghc or hipe");
2183 
2184  // Assign locations to all of the incoming arguments.
2186  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
2187  ArgLocs, *DAG.getContext());
2188 
2189  // Allocate shadow area for Win64
2190  if (IsWin64)
2191  CCInfo.AllocateStack(32, 8);
2192 
2193  CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
2194 
2195  unsigned LastVal = ~0U;
2196  SDValue ArgValue;
2197  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2198  CCValAssign &VA = ArgLocs[i];
2199  // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
2200  // places.
2201  assert(VA.getValNo() != LastVal &&
2202  "Don't support value assigned to multiple locs yet");
2203  (void)LastVal;
2204  LastVal = VA.getValNo();
2205 
2206  if (VA.isRegLoc()) {
2207  EVT RegVT = VA.getLocVT();
2208  const TargetRegisterClass *RC;
2209  if (RegVT == MVT::i32)
2210  RC = &X86::GR32RegClass;
2211  else if (Is64Bit && RegVT == MVT::i64)
2212  RC = &X86::GR64RegClass;
2213  else if (RegVT == MVT::f32)
2214  RC = &X86::FR32RegClass;
2215  else if (RegVT == MVT::f64)
2216  RC = &X86::FR64RegClass;
2217  else if (RegVT.is512BitVector())
2218  RC = &X86::VR512RegClass;
2219  else if (RegVT.is256BitVector())
2220  RC = &X86::VR256RegClass;
2221  else if (RegVT.is128BitVector())
2222  RC = &X86::VR128RegClass;
2223  else if (RegVT == MVT::x86mmx)
2224  RC = &X86::VR64RegClass;
2225  else if (RegVT == MVT::v8i1)
2226  RC = &X86::VK8RegClass;
2227  else if (RegVT == MVT::v16i1)
2228  RC = &X86::VK16RegClass;
2229  else
2230  llvm_unreachable("Unknown argument type!");
2231 
2232  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2233  ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2234 
2235  // If this is an 8 or 16-bit value, it is really passed promoted to 32
2236  // bits. Insert an assert[sz]ext to capture this, then truncate to the
2237  // right size.
2238  if (VA.getLocInfo() == CCValAssign::SExt)
2239  ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2240  DAG.getValueType(VA.getValVT()));
2241  else if (VA.getLocInfo() == CCValAssign::ZExt)
2242  ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2243  DAG.getValueType(VA.getValVT()));
2244  else if (VA.getLocInfo() == CCValAssign::BCvt)
2245  ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
2246 
2247  if (VA.isExtInLoc()) {
2248  // Handle MMX values passed in XMM regs.
2249  if (RegVT.isVector())
2250  ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2251  else
2252  ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2253  }
2254  } else {
2255  assert(VA.isMemLoc());
2256  ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2257  }
2258 
2259  // If the value is passed via a pointer, do a load.
2260  if (VA.getLocInfo() == CCValAssign::Indirect)
2261  ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2262  MachinePointerInfo(), false, false, false, 0);
2263 
2264  InVals.push_back(ArgValue);
2265  }
2266 
2267  // The x86-64 ABIs require that for returning structs by value we copy
2268  // the sret argument into %rax/%eax (depending on ABI) for the return.
2269  // Win32 requires us to put the sret argument to %eax as well.
2270  // Save the argument into a virtual register so that we can access it
2271  // from the return points.
2272  if (MF.getFunction()->hasStructRetAttr() &&
2273  (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
2275  unsigned Reg = FuncInfo->getSRetReturnReg();
2276  if (!Reg) {
2277  MVT PtrTy = getPointerTy();
2278  Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2279  FuncInfo->setSRetReturnReg(Reg);
2280  }
2281  SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
2282  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2283  }
2284 
2285  unsigned StackSize = CCInfo.getNextStackOffset();
2286  // Align stack specially for tail calls.
2287  if (FuncIsMadeTailCallSafe(CallConv,
2289  StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2290 
2291  // If the function takes a variable number of arguments, make a frame index for
2292  // the start of the first vararg value... for expansion of llvm.va_start.
2293  if (isVarArg) {
2294  if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2295  CallConv != CallingConv::X86_ThisCall)) {
2296  FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
2297  }
2298  if (Is64Bit) {
2299  unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
2300 
2301  // FIXME: We should really autogenerate these arrays
2302  static const uint16_t GPR64ArgRegsWin64[] = {
2303  X86::RCX, X86::RDX, X86::R8, X86::R9
2304  };
2305  static const uint16_t GPR64ArgRegs64Bit[] = {
2306  X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2307  };
2308  static const uint16_t XMMArgRegs64Bit[] = {
2309  X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2310  X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2311  };
2312  const uint16_t *GPR64ArgRegs;
2313  unsigned NumXMMRegs = 0;
2314 
2315  if (IsWin64) {
2316  // The XMM registers which might contain var arg parameters are shadowed
2317  // by their paired GPRs, so we only need to save the GPRs to their home
2318  // slots.
2319  TotalNumIntRegs = 4;
2320  GPR64ArgRegs = GPR64ArgRegsWin64;
2321  } else {
2322  TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
2323  GPR64ArgRegs = GPR64ArgRegs64Bit;
2324 
2325  NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
2326  TotalNumXMMRegs);
2327  }
2328  unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
2329  TotalNumIntRegs);
2330 
2331  bool NoImplicitFloatOps = Fn->getAttributes().
2332  hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
2333  assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2334  "SSE register cannot be used when SSE is disabled!");
2335  assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
2336  NoImplicitFloatOps) &&
2337  "SSE register cannot be used when SSE is disabled!");
2338  if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
2339  !Subtarget->hasSSE1())
2340  // Kernel mode asks for SSE to be disabled, so don't push them
2341  // on the stack.
2342  TotalNumXMMRegs = 0;
2343 
2344  if (IsWin64) {
2346  // Get to the caller-allocated home save location. Add 8 to account
2347  // for the return address.
2348  int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2349  FuncInfo->setRegSaveFrameIndex(
2350  MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2351  // Fixup to set vararg frame on shadow area (4 x i64).
2352  if (NumIntRegs < 4)
2353  FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2354  } else {
2355  // For X86-64, if there are vararg parameters that are passed via
2356  // registers, then we must store them to their spots on the stack so
2357  // they may be loaded by dereferencing the result of va_next.
2358  FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2359  FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
2360  FuncInfo->setRegSaveFrameIndex(
2361  MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
2362  false));
2363  }
2364 
2365  // Store the integer parameter registers.
2366  SmallVector<SDValue, 8> MemOps;
2367  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2368  getPointerTy());
2369  unsigned Offset = FuncInfo->getVarArgsGPOffset();
2370  for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
2371  SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2372  DAG.getIntPtrConstant(Offset));
2373  unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
2374  &X86::GR64RegClass);
2375  SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
2376  SDValue Store =
2377  DAG.getStore(Val.getValue(1), dl, Val, FIN,
2378  MachinePointerInfo::getFixedStack(
2379  FuncInfo->getRegSaveFrameIndex(), Offset),
2380  false, false, 0);
2381  MemOps.push_back(Store);
2382  Offset += 8;
2383  }
2384 
2385  if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
2386  // Now store the XMM (fp + vector) parameter registers.
2387  SmallVector<SDValue, 11> SaveXMMOps;
2388  SaveXMMOps.push_back(Chain);
2389 
2390  unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2391  SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
2392  SaveXMMOps.push_back(ALVal);
2393 
2394  SaveXMMOps.push_back(DAG.getIntPtrConstant(
2395  FuncInfo->getRegSaveFrameIndex()));
2396  SaveXMMOps.push_back(DAG.getIntPtrConstant(
2397  FuncInfo->getVarArgsFPOffset()));
2398 
2399  for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
2400  unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
2401  &X86::VR128RegClass);
2402  SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
2403  SaveXMMOps.push_back(Val);
2404  }
2405  MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2406  MVT::Other,
2407  &SaveXMMOps[0], SaveXMMOps.size()));
2408  }
2409 
2410  if (!MemOps.empty())
2411  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2412  &MemOps[0], MemOps.size());
2413  }
2414  }
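// Illustrative note (a worked example added here, not part of the original
// source; numbers assume the SysV x86-64 ABI): for a vararg function with two
// fixed integer parameters and no fixed FP parameters, NumIntRegs is 2 and
// NumXMMRegs is 0 at this point, so the code above records
// VarArgsGPOffset = 16 and VarArgsFPOffset = 48 - exactly the initial
// gp_offset/fp_offset values that llvm.va_start later writes into the va_list -
// and builds a 6*8 + 8*16 = 176-byte register save area holding the
// RDX..R9 and XMM0..XMM7 spills.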
2415 
2416  // Some CCs need callee pop.
2417  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2418  MF.getTarget().Options.GuaranteedTailCallOpt)) {
2419  FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2420  } else {
2421  FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2422  // If this is an sret function, the return should pop the hidden pointer.
2423  if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
2424  argsAreStructReturn(Ins) == StackStructReturn)
2425  FuncInfo->setBytesToPopOnReturn(4);
2426  }
2427 
2428  if (!Is64Bit) {
2429  // RegSaveFrameIndex is X86-64 only.
2430  FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2431  if (CallConv == CallingConv::X86_FastCall ||
2432  CallConv == CallingConv::X86_ThisCall)
2433  // fastcall and thiscall functions can't have varargs.
2434  FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2435  }
2436 
2437  FuncInfo->setArgumentStackSize(StackSize);
2438 
2439  return Chain;
2440 }
2441 
2442 SDValue
2443 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2444  SDValue StackPtr, SDValue Arg,
2445  SDLoc dl, SelectionDAG &DAG,
2446  const CCValAssign &VA,
2447  ISD::ArgFlagsTy Flags) const {
2448  unsigned LocMemOffset = VA.getLocMemOffset();
2449  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
2450  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
2451  if (Flags.isByVal())
2452  return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2453 
2454  return DAG.getStore(Chain, dl, Arg, PtrOff,
2455  MachinePointerInfo::getStack(LocMemOffset),
2456  false, false, 0);
2457 }
2458 
2459 /// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
2460 /// optimization is performed and it is required.
2461 SDValue
2462 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
2463  SDValue &OutRetAddr, SDValue Chain,
2464  bool IsTailCall, bool Is64Bit,
2465  int FPDiff, SDLoc dl) const {
2466  // Adjust the Return address stack slot.
2467  EVT VT = getPointerTy();
2468  OutRetAddr = getReturnAddressFrameIndex(DAG);
2469 
2470  // Load the "old" Return address.
2471  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2472  false, false, false, 0);
2473  return SDValue(OutRetAddr.getNode(), 1);
2474 }
2475 
2476 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
2477 /// optimization is performed and it is required (FPDiff!=0).
2478 static SDValue
2480  SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
2481  unsigned SlotSize, int FPDiff, SDLoc dl) {
2482  // Store the return address to the appropriate stack slot.
2483  if (!FPDiff) return Chain;
2484  // Calculate the new stack slot for the return address.
2485  int NewReturnAddrFI =
2486  MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2487  false);
2488  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2489  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2490  MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2491  false, false, 0);
2492  return Chain;
2493 }
2494 
2495 SDValue
2496 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2497  SmallVectorImpl<SDValue> &InVals) const {
2498  SelectionDAG &DAG = CLI.DAG;
2499  SDLoc &dl = CLI.DL;
2500  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2501  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2502  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2503  SDValue Chain = CLI.Chain;
2504  SDValue Callee = CLI.Callee;
2505  CallingConv::ID CallConv = CLI.CallConv;
2506  bool &isTailCall = CLI.IsTailCall;
2507  bool isVarArg = CLI.IsVarArg;
2508 
2509  MachineFunction &MF = DAG.getMachineFunction();
2510  bool Is64Bit = Subtarget->is64Bit();
2511  bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2512  bool IsWindows = Subtarget->isTargetWindows();
2513  StructReturnType SR = callIsStructReturn(Outs);
2514  bool IsSibcall = false;
2515 
2516  if (MF.getTarget().Options.DisableTailCalls)
2517  isTailCall = false;
2518 
2519  if (isTailCall) {
2520  // Check if it's really possible to do a tail call.
2521  isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2522  isVarArg, SR != NotStructReturn,
2523  MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
2524  Outs, OutVals, Ins, DAG);
2525 
2526  // Sibcalls are automatically detected tailcalls which do not require
2527  // ABI changes.
2528  if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2529  IsSibcall = true;
2530 
2531  if (isTailCall)
2532  ++NumTailCalls;
2533  }
2534 
2535  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2536  "Var args not supported with calling convention fastcc, ghc or hipe");
2537 
2538  // Analyze operands of the call, assigning locations to each operand.
2539  SmallVector<CCValAssign, 16> ArgLocs;
2540  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
2541  ArgLocs, *DAG.getContext());
2542 
2543  // Allocate shadow area for Win64
2544  if (IsWin64)
2545  CCInfo.AllocateStack(32, 8);
2546 
2547  CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2548 
2549  // Get a count of how many bytes are to be pushed on the stack.
2550  unsigned NumBytes = CCInfo.getNextStackOffset();
2551  if (IsSibcall)
2552  // This is a sibcall. The memory operands are already available in the
2553  // caller's own stack (its incoming argument area), so nothing is pushed.
2554  NumBytes = 0;
2555  else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
2556  IsTailCallConvention(CallConv))
2557  NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2558 
2559  int FPDiff = 0;
2560  if (isTailCall && !IsSibcall) {
2561  // Lower arguments at fp - stackoffset + fpdiff.
2562  X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2563  unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2564 
2565  FPDiff = NumBytesCallerPushed - NumBytes;
2566 
2567  // Set the delta of movement of the returnaddr stackslot.
2568  // But only set if delta is greater than previous delta.
2569  if (FPDiff < X86Info->getTCReturnAddrDelta())
2570  X86Info->setTCReturnAddrDelta(FPDiff);
2571  }
2572 
2573  if (!IsSibcall)
2574  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
2575  dl);
2576 
2577  SDValue RetAddrFrIdx;
2578  // Load return address for tail calls.
2579  if (isTailCall && FPDiff)
2580  Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2581  Is64Bit, FPDiff, dl);
2582 
2583  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2584  SmallVector<SDValue, 8> MemOpChains;
2585  SDValue StackPtr;
2586 
2587  // Walk the register/memloc assignments, inserting copies/loads. In the case
2588  // of tail call optimization, arguments are handled later.
2589  const X86RegisterInfo *RegInfo =
2590  static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
2591  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2592  CCValAssign &VA = ArgLocs[i];
2593  EVT RegVT = VA.getLocVT();
2594  SDValue Arg = OutVals[i];
2595  ISD::ArgFlagsTy Flags = Outs[i].Flags;
2596  bool isByVal = Flags.isByVal();
2597 
2598  // Promote the value if needed.
2599  switch (VA.getLocInfo()) {
2600  default: llvm_unreachable("Unknown loc info!");
2601  case CCValAssign::Full: break;
2602  case CCValAssign::SExt:
2603  Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2604  break;
2605  case CCValAssign::ZExt:
2606  Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2607  break;
2608  case CCValAssign::AExt:
2609  if (RegVT.is128BitVector()) {
2610  // Special case: passing MMX values in XMM registers.
2611  Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2612  Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2613  Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2614  } else
2615  Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2616  break;
2617  case CCValAssign::BCvt:
2618  Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2619  break;
2620  case CCValAssign::Indirect: {
2621  // Store the argument.
2622  SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2623  int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2624  Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2625  MachinePointerInfo::getFixedStack(FI),
2626  false, false, 0);
2627  Arg = SpillSlot;
2628  break;
2629  }
2630  }
2631 
2632  if (VA.isRegLoc()) {
2633  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2634  if (isVarArg && IsWin64) {
2635  // Win64 ABI requires argument XMM reg to be copied to the corresponding
2636  // shadow reg if callee is a varargs function.
2637  unsigned ShadowReg = 0;
2638  switch (VA.getLocReg()) {
2639  case X86::XMM0: ShadowReg = X86::RCX; break;
2640  case X86::XMM1: ShadowReg = X86::RDX; break;
2641  case X86::XMM2: ShadowReg = X86::R8; break;
2642  case X86::XMM3: ShadowReg = X86::R9; break;
2643  }
2644  if (ShadowReg)
2645  RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2646  }
2647  } else if (!IsSibcall && (!isTailCall || isByVal)) {
2648  assert(VA.isMemLoc());
2649  if (StackPtr.getNode() == 0)
2650  StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2651  getPointerTy());
2652  MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2653  dl, DAG, VA, Flags));
2654  }
2655  }
2656 
2657  if (!MemOpChains.empty())
2658  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2659  &MemOpChains[0], MemOpChains.size());
2660 
2661  if (Subtarget->isPICStyleGOT()) {
2662  // ELF / PIC requires GOT in the EBX register before function calls via PLT
2663  // GOT pointer.
2664  if (!isTailCall) {
2665  RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2666  DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
2667  } else {
2668  // If we are tail calling and generating PIC/GOT style code load the
2669  // address of the callee into ECX. The value in ecx is used as target of
2670  // the tail jump. This is done to circumvent the ebx/callee-saved problem
2671  // for tail calls on PIC/GOT architectures. Normally we would just put the
2672  // address of GOT into ebx and then call target@PLT. But for tail calls
2673  // ebx would be restored (since ebx is callee saved) before jumping to the
2674  // target@PLT.
2675 
2676  // Note: The actual moving to ECX is done further down.
2677  GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2678  if (G && !G->getGlobal()->hasHiddenVisibility() &&
2679  !G->getGlobal()->hasProtectedVisibility())
2680  Callee = LowerGlobalAddress(Callee, DAG);
2681  else if (isa<ExternalSymbolSDNode>(Callee))
2682  Callee = LowerExternalSymbol(Callee, DAG);
2683  }
2684  }
2685 
2686  if (Is64Bit && isVarArg && !IsWin64) {
2687  // From AMD64 ABI document:
2688  // For calls that may call functions that use varargs or stdargs
2689  // (prototype-less calls or calls to functions containing ellipsis (...) in
2690  // the declaration) %al is used as hidden argument to specify the number
2691  // of SSE registers used. The contents of %al do not need to match exactly
2692  // the number of registers, but must be an upper bound on the number of SSE
2693  // registers used, in the range 0 - 8 inclusive.
2694 
2695  // Count the number of XMM registers allocated.
2696  static const uint16_t XMMArgRegs[] = {
2697  X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2698  X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2699  };
2700  unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
2701  assert((Subtarget->hasSSE1() || !NumXMMRegs)
2702  && "SSE registers cannot be used when SSE is disabled");
2703 
2704  RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
2705  DAG.getConstant(NumXMMRegs, MVT::i8)));
2706  }
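// Worked example (illustrative only, not from the original source; assumes a
// SysV x86-64 target): for a variadic call such as
//   printf("%f %f\n", double %a, double %b)
// two XMM argument registers are allocated, so NumXMMRegs == 2 here and the
// code above copies the constant 2 into AL before the call, e.g.
//   movb $2, %al
//   callq printf
// Any value >= 2 (up to 8) would also satisfy the "upper bound" rule above.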
2707 
2708  // For tail calls lower the arguments to the 'real' stack slot.
2709  if (isTailCall) {
2710  // Force all the incoming stack arguments to be loaded from the stack
2711  // before any new outgoing arguments are stored to the stack, because the
2712  // outgoing stack slots may alias the incoming argument stack slots, and
2713  // the alias isn't otherwise explicit. This is slightly more conservative
2714  // than necessary, because it means that each store effectively depends
2715  // on every argument instead of just those arguments it would clobber.
2716  SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2717 
2718  SmallVector<SDValue, 8> MemOpChains2;
2719  SDValue FIN;
2720  int FI = 0;
2721  if (getTargetMachine().Options.GuaranteedTailCallOpt) {
2722  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2723  CCValAssign &VA = ArgLocs[i];
2724  if (VA.isRegLoc())
2725  continue;
2726  assert(VA.isMemLoc());
2727  SDValue Arg = OutVals[i];
2728  ISD::ArgFlagsTy Flags = Outs[i].Flags;
2729  // Create frame index.
2730  int32_t Offset = VA.getLocMemOffset()+FPDiff;
2731  uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2732  FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
2733  FIN = DAG.getFrameIndex(FI, getPointerTy());
2734 
2735  if (Flags.isByVal()) {
2736  // Copy relative to framepointer.
2737  SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
2738  if (StackPtr.getNode() == 0)
2739  StackPtr = DAG.getCopyFromReg(Chain, dl,
2740  RegInfo->getStackRegister(),
2741  getPointerTy());
2742  Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
2743 
2744  MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2745  ArgChain,
2746  Flags, DAG, dl));
2747  } else {
2748  // Store relative to framepointer.
2749  MemOpChains2.push_back(
2750  DAG.getStore(ArgChain, dl, Arg, FIN,
2751  MachinePointerInfo::getFixedStack(FI),
2752  false, false, 0));
2753  }
2754  }
2755  }
2756 
2757  if (!MemOpChains2.empty())
2758  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2759  &MemOpChains2[0], MemOpChains2.size());
2760 
2761  // Store the return address to the appropriate stack slot.
2762  Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
2763  getPointerTy(), RegInfo->getSlotSize(),
2764  FPDiff, dl);
2765  }
2766 
2767  // Build a sequence of copy-to-reg nodes chained together with token chain
2768  // and flag operands which copy the outgoing args into registers.
2769  SDValue InFlag;
2770  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2771  Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2772  RegsToPass[i].second, InFlag);
2773  InFlag = Chain.getValue(1);
2774  }
2775 
2776  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2777  assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2778  // In the 64-bit large code model, we have to make all calls
2779  // through a register, since the call instruction's 32-bit
2780  // pc-relative offset may not be large enough to hold the whole
2781  // address.
2782  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2783  // If the callee is a GlobalAddress node (quite common, every direct call
2784  // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
2785  // it.
2786 
2787  // We should use extra load for direct calls to dllimported functions in
2788  // non-JIT mode.
2789  const GlobalValue *GV = G->getGlobal();
2790  if (!GV->hasDLLImportLinkage()) {
2791  unsigned char OpFlags = 0;
2792  bool ExtraLoad = false;
2793  unsigned WrapperKind = ISD::DELETED_NODE;
2794 
2795  // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2796  // external symbols must go through the PLT in PIC mode. If the symbol
2797  // has hidden or protected visibility, or if it is static or local, then
2798  // we don't need to use the PLT - we can directly call it.
2799  if (Subtarget->isTargetELF() &&
2800  getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
2801  GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
2802  OpFlags = X86II::MO_PLT;
2803  } else if (Subtarget->isPICStyleStubAny() &&
2804  (GV->isDeclaration() || GV->isWeakForLinker()) &&
2805  (!Subtarget->getTargetTriple().isMacOSX() ||
2806  Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2807  // PC-relative references to external symbols should go through $stub,
2808  // unless we're building with the leopard linker or later, which
2809  // automatically synthesizes these stubs.
2810  OpFlags = X86II::MO_DARWIN_STUB;
2811  } else if (Subtarget->isPICStyleRIPRel() &&
2812  isa<Function>(GV) &&
2813  cast<Function>(GV)->getAttributes().
2814  hasAttribute(AttributeSet::FunctionIndex,
2815  Attribute::NonLazyBind)) {
2816  // If the function is marked as non-lazy, generate an indirect call
2817  // which loads from the GOT directly. This avoids runtime overhead
2818  // at the cost of eager binding (and one extra byte of encoding).
2819  OpFlags = X86II::MO_GOTPCREL;
2820  WrapperKind = X86ISD::WrapperRIP;
2821  ExtraLoad = true;
2822  }
2823 
2824  Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
2825  G->getOffset(), OpFlags);
2826 
2827  // Add a wrapper if needed.
2828  if (WrapperKind != ISD::DELETED_NODE)
2829  Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
2830  // Add extra indirection if needed.
2831  if (ExtraLoad)
2832  Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
2833  MachinePointerInfo::getGOT(),
2834  false, false, false, 0);
2835  }
2836  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2837  unsigned char OpFlags = 0;
2838 
2839  // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
2840  // external symbols should go through the PLT.
2841  if (Subtarget->isTargetELF() &&
2842  getTargetMachine().getRelocationModel() == Reloc::PIC_) {
2843  OpFlags = X86II::MO_PLT;
2844  } else if (Subtarget->isPICStyleStubAny() &&
2845  (!Subtarget->getTargetTriple().isMacOSX() ||
2846  Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2847  // PC-relative references to external symbols should go through $stub,
2848  // unless we're building with the leopard linker or later, which
2849  // automatically synthesizes these stubs.
2850  OpFlags = X86II::MO_DARWIN_STUB;
2851  }
2852 
2853  Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
2854  OpFlags);
2855  }
2856 
2857  // Returns a chain & a flag for retval copy to use.
2858  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2859  SmallVector<SDValue, 8> Ops;
2860 
2861  if (!IsSibcall && isTailCall) {
2862  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
2863  DAG.getIntPtrConstant(0, true), InFlag, dl);
2864  InFlag = Chain.getValue(1);
2865  }
2866 
2867  Ops.push_back(Chain);
2868  Ops.push_back(Callee);
2869 
2870  if (isTailCall)
2871  Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
2872 
2873  // Add argument registers to the end of the list so that they are known live
2874  // into the call.
2875  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2876  Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2877  RegsToPass[i].second.getValueType()));
2878 
2879  // Add a register mask operand representing the call-preserved registers.
2880  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
2881  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
2882  assert(Mask && "Missing call preserved mask for calling convention");
2883  Ops.push_back(DAG.getRegisterMask(Mask));
2884 
2885  if (InFlag.getNode())
2886  Ops.push_back(InFlag);
2887 
2888  if (isTailCall) {
2889  // We used to do:
2890  //// If this is the first return lowered for this function, add the regs
2891  //// to the liveout set for the function.
2892  // This isn't right, although it's probably harmless on x86; liveouts
2893  // should be computed from returns not tail calls. Consider a void
2894  // function making a tail call to a function returning int.
2895  return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
2896  }
2897 
2898  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2899  InFlag = Chain.getValue(1);
2900 
2901  // Create the CALLSEQ_END node.
2902  unsigned NumBytesForCalleeToPush;
2903  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2904  getTargetMachine().Options.GuaranteedTailCallOpt))
2905  NumBytesForCalleeToPush = NumBytes; // Callee pops everything
2906  else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
2907  SR == StackStructReturn)
2908  // If this is a call to a struct-return function, the callee
2909  // pops the hidden struct pointer, so we have to push it back.
2910  // This is common for Darwin/X86, Linux & Mingw32 targets.
2911  // For MSVC Win32 targets, the caller pops the hidden struct pointer.
2912  NumBytesForCalleeToPush = 4;
2913  else
2914  NumBytesForCalleeToPush = 0; // Callee pops nothing.
2915 
2916  // Returns a flag for retval copy to use.
2917  if (!IsSibcall) {
2918  Chain = DAG.getCALLSEQ_END(Chain,
2919  DAG.getIntPtrConstant(NumBytes, true),
2920  DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2921  true),
2922  InFlag, dl);
2923  InFlag = Chain.getValue(1);
2924  }
2925 
2926  // Handle result values, copying them out of physregs into vregs that we
2927  // return.
2928  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2929  Ins, dl, DAG, InVals);
2930 }
2931 
2932 //===----------------------------------------------------------------------===//
2933 // Fast Calling Convention (tail call) implementation
2934 //===----------------------------------------------------------------------===//
2935 
2936 // Like the stdcall convention, the callee cleans up the arguments, except that
2937 // ECX is reserved for storing the tail-called function's address. Only 2 registers are
2938 // free for argument passing (inreg). Tail call optimization is performed
2939 // provided:
2940 // * tailcallopt is enabled
2941 // * caller/callee are fastcc
2942 // On X86_64 architecture with GOT-style position independent code only local
2943 // (within module) calls are supported at the moment.
2944 // To keep the stack aligned according to the platform ABI, the function
2945 // GetAlignedArgumentStackSize ensures that the argument delta is always a
2946 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
2947 // for example.) If a tail-called function (callee) has more arguments than the
2948 // caller, the caller needs to make sure that there is room to move the RETADDR
2949 // to. This is achieved by reserving an area the size of the argument delta
2950 // right after the original RETADDR, but before the saved frame pointer or the
2951 // spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4):
2952 // stack layout:
2953 // arg1
2954 // arg2
2955 // RETADDR
2956 // [ new RETADDR
2957 // move area ]
2958 // (possible EBP)
2959 // ESI
2960 // EDI
2961 // local1 ..
2962 
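// Illustrative sketch (assumed example, not taken from the original file):
// with -tailcallopt enabled, a fastcc pair like the following exercises the
// RETADDR-move logic described above, because the callee needs more argument
// stack space than the caller provides:
//
//   declare fastcc i32 @callee(i32, i32, i32, i32, i32, i32, i32, i32)
//   define fastcc i32 @caller(i32 %a, i32 %b) {
//     %r = tail call fastcc i32 @callee(i32 %a, i32 %b, i32 1, i32 2,
//                                       i32 3, i32 4, i32 5, i32 6)
//     ret i32 %r
//   }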
2963 /// GetAlignedArgumentStackSize - Round the argument stack size up so that it
2964 /// is, e.g., 16n + 12 bytes for a 16-byte alignment requirement.
2965 unsigned
2966 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2967  SelectionDAG& DAG) const {
2968  MachineFunction &MF = DAG.getMachineFunction();
2969  const TargetMachine &TM = MF.getTarget();
2970  const X86RegisterInfo *RegInfo =
2971  static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
2972  const TargetFrameLowering &TFI = *TM.getFrameLowering();
2973  unsigned StackAlignment = TFI.getStackAlignment();
2974  uint64_t AlignMask = StackAlignment - 1;
2975  int64_t Offset = StackSize;
2976  unsigned SlotSize = RegInfo->getSlotSize();
2977  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
2978  // Number smaller than 12 so just add the difference.
2979  Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2980  } else {
2981  // Mask out lower bits, add stackalignment once plus the 12 bytes.
2982  Offset = ((~AlignMask) & Offset) + StackAlignment +
2983  (StackAlignment-SlotSize);
2984  }
2985  return Offset;
2986 }
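// Worked example (illustrative, assuming StackAlignment = 16 and SlotSize = 4,
// i.e. a 32-bit target): StackSize = 20 has low bits 4 <= 12, so the first
// branch yields 20 + (12 - 4) = 28 = 16*1 + 12. StackSize = 30 has low bits
// 14 > 12, so the second branch yields (30 & ~15) + 16 + 12 = 44 = 16*2 + 12.
// In both cases the result is congruent to 12 modulo 16, so the stack is
// 16-byte aligned again once the 4-byte return address is pushed.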
2987 
2988 /// MatchingStackOffset - Return true if the given stack call argument is
2989 /// already available at the same relative position in the caller's
2990 /// incoming argument stack.
2991 static
2992 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2993  MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
2994  const X86InstrInfo *TII) {
2995  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
2996  int FI = INT_MAX;
2997  if (Arg.getOpcode() == ISD::CopyFromReg) {
2998  unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2999  if (!TargetRegisterInfo::isVirtualRegister(VR))
3000  return false;
3001  MachineInstr *Def = MRI->getVRegDef(VR);
3002  if (!Def)
3003  return false;
3004  if (!Flags.isByVal()) {
3005  if (!TII->isLoadFromStackSlot(Def, FI))
3006  return false;
3007  } else {
3008  unsigned Opcode = Def->getOpcode();
3009  if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
3010  Def->getOperand(1).isFI()) {
3011  FI = Def->getOperand(1).getIndex();
3012  Bytes = Flags.getByValSize();
3013  } else
3014  return false;
3015  }
3016  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3017  if (Flags.isByVal())
3018  // ByVal argument is passed in as a pointer but it's now being
3019  // dereferenced. e.g.
3020  // define @foo(%struct.X* %A) {
3021  // tail call @bar(%struct.X* byval %A)
3022  // }
3023  return false;
3024  SDValue Ptr = Ld->getBasePtr();
3025  FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3026  if (!FINode)
3027  return false;
3028  FI = FINode->getIndex();
3029  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3030  FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3031  FI = FINode->getIndex();
3032  Bytes = Flags.getByValSize();
3033  } else
3034  return false;
3035 
3036  assert(FI != INT_MAX);
3037  if (!MFI->isFixedObjectIndex(FI))
3038  return false;
3039  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
3040 }
3041 
3042 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
3043 /// for tail call optimization. Targets which want to do tail call
3044 /// optimization should implement this function.
3045 bool
3046 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
3047  CallingConv::ID CalleeCC,
3048  bool isVarArg,
3049  bool isCalleeStructRet,
3050  bool isCallerStructRet,
3051  Type *RetTy,
3052  const SmallVectorImpl<ISD::OutputArg> &Outs,
3053  const SmallVectorImpl<SDValue> &OutVals,
3054  const SmallVectorImpl<ISD::InputArg> &Ins,
3055  SelectionDAG &DAG) const {
3056  if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
3057  return false;
3058 
3059  // If -tailcallopt is specified, make fastcc functions tail-callable.
3060  const MachineFunction &MF = DAG.getMachineFunction();
3061  const Function *CallerF = MF.getFunction();
3062 
3063  // If the function return type is x86_fp80 and the callee return type is not,
3064  // then the FP_EXTEND of the call result is not a nop. It's not safe to
3065  // perform a tailcall optimization here.
3066  if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
3067  return false;
3068 
3069  CallingConv::ID CallerCC = CallerF->getCallingConv();
3070  bool CCMatch = CallerCC == CalleeCC;
3071  bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
3072  bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
3073 
3074  if (getTargetMachine().Options.GuaranteedTailCallOpt) {
3075  if (IsTailCallConvention(CalleeCC) && CCMatch)
3076  return true;
3077  return false;
3078  }
3079 
3080  // Look for obvious safe cases to perform tail call optimization that do not
3081  // require ABI changes. This is what gcc calls sibcall.
3082 
3083  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
3084  // emit a special epilogue.
3085  const X86RegisterInfo *RegInfo =
3086  static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
3087  if (RegInfo->needsStackRealignment(MF))
3088  return false;
3089 
3090  // Also avoid sibcall optimization if either caller or callee uses struct
3091  // return semantics.
3092  if (isCalleeStructRet || isCallerStructRet)
3093  return false;
3094 
3095  // An stdcall caller is expected to clean up its arguments; the callee
3096  // isn't going to do that.
3097  if (!CCMatch && CallerCC == CallingConv::X86_StdCall)
3098  return false;
3099 
3100  // Do not sibcall optimize vararg calls unless all arguments are passed via
3101  // registers.
3102  if (isVarArg && !Outs.empty()) {
3103 
3104  // Optimizing for varargs on Win64 is unlikely to be safe without
3105  // additional testing.
3106  if (IsCalleeWin64 || IsCallerWin64)
3107  return false;
3108 
3109  SmallVector<CCValAssign, 16> ArgLocs;
3110  CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
3111  getTargetMachine(), ArgLocs, *DAG.getContext());
3112 
3113  CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3114  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
3115  if (!ArgLocs[i].isRegLoc())
3116  return false;
3117  }
3118 
3119  // If the call result is in ST0 / ST1, it needs to be popped off the x87
3120  // stack. Therefore, if it's not used by the call it is not safe to optimize
3121  // this into a sibcall.
3122  bool Unused = false;
3123  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
3124  if (!Ins[i].Used) {
3125  Unused = true;
3126  break;
3127  }
3128  }
3129  if (Unused) {
3130  SmallVector<CCValAssign, 16> RVLocs;
3131  CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
3132  getTargetMachine(), RVLocs, *DAG.getContext());
3133  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3134  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
3135  CCValAssign &VA = RVLocs[i];
3136  if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
3137  return false;
3138  }
3139  }
3140 
3141  // If the calling conventions do not match, then we'd better make sure the
3142  // results are returned in the same way as what the caller expects.
3143  if (!CCMatch) {
3144  SmallVector<CCValAssign, 16> RVLocs1;
3145  CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
3146  getTargetMachine(), RVLocs1, *DAG.getContext());
3147  CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
3148 
3149  SmallVector<CCValAssign, 16> RVLocs2;
3150  CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
3151  getTargetMachine(), RVLocs2, *DAG.getContext());
3152  CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
3153 
3154  if (RVLocs1.size() != RVLocs2.size())
3155  return false;
3156  for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
3157  if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
3158  return false;
3159  if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
3160  return false;
3161  if (RVLocs1[i].isRegLoc()) {
3162  if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
3163  return false;
3164  } else {
3165  if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
3166  return false;
3167  }
3168  }
3169  }
3170 
3171  // If the callee takes no arguments then go on to check the results of the
3172  // call.
3173  if (!Outs.empty()) {
3174  // Check if stack adjustment is needed. For now, do not do this if any
3175  // argument is passed on the stack.
3176  SmallVector<CCValAssign, 16> ArgLocs;
3177  CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
3178  getTargetMachine(), ArgLocs, *DAG.getContext());
3179 
3180  // Allocate shadow area for Win64
3181  if (IsCalleeWin64)
3182  CCInfo.AllocateStack(32, 8);
3183 
3184  CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3185  if (CCInfo.getNextStackOffset()) {
3186  MachineFunction &MF = DAG.getMachineFunction();
3187  if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
3188  return false;
3189 
3190  // Check if the arguments are already laid out in the right way as
3191  // the caller's fixed stack objects.
3192  MachineFrameInfo *MFI = MF.getFrameInfo();
3193  const MachineRegisterInfo *MRI = &MF.getRegInfo();
3194  const X86InstrInfo *TII =
3195  ((const X86TargetMachine&)getTargetMachine()).getInstrInfo();
3196  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3197  CCValAssign &VA = ArgLocs[i];
3198  SDValue Arg = OutVals[i];
3199  ISD::ArgFlagsTy Flags = Outs[i].Flags;
3200  if (VA.getLocInfo() == CCValAssign::Indirect)
3201  return false;
3202  if (!VA.isRegLoc()) {
3203  if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3204  MFI, MRI, TII))
3205  return false;
3206  }
3207  }
3208  }
3209 
3210  // If the tailcall address may be in a register, then make sure it's
3211  // possible to register allocate for it. In 32-bit, the call address can
3212  // only target EAX, EDX, or ECX since the tail call must be scheduled after
3213  // callee-saved registers are restored. These happen to be the same
3214  // registers used to pass 'inreg' arguments so watch out for those.
3215  if (!Subtarget->is64Bit() &&
3216  ((!isa<GlobalAddressSDNode>(Callee) &&
3217  !isa<ExternalSymbolSDNode>(Callee)) ||
3218  getTargetMachine().getRelocationModel() == Reloc::PIC_)) {
3219  unsigned NumInRegs = 0;
3220  // In PIC we need an extra register to formulate the address computation
3221  // for the callee.
3222  unsigned MaxInRegs =
3223  (getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
3224 
3225  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3226  CCValAssign &VA = ArgLocs[i];
3227  if (!VA.isRegLoc())
3228  continue;
3229  unsigned Reg = VA.getLocReg();
3230  switch (Reg) {
3231  default: break;
3232  case X86::EAX: case X86::EDX: case X86::ECX:
3233  if (++NumInRegs == MaxInRegs)
3234  return false;
3235  break;
3236  }
3237  }
3238  }
3239  }
3240 
3241  return true;
3242 }
3243 
3244 FastISel *
3245 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
3246  const TargetLibraryInfo *libInfo) const {
3247  return X86::createFastISel(funcInfo, libInfo);
3248 }
3249 
3250 //===----------------------------------------------------------------------===//
3251 // Other Lowering Hooks
3252 //===----------------------------------------------------------------------===//
3253 
3254 static bool MayFoldLoad(SDValue Op) {
3255  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3256 }
3257 
3258 static bool MayFoldIntoStore(SDValue Op) {
3259  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3260 }
3261 
3262 static bool isTargetShuffle(unsigned Opcode) {
3263  switch(Opcode) {
3264  default: return false;
3265  case X86ISD::PSHUFD:
3266  case X86ISD::PSHUFHW:
3267  case X86ISD::PSHUFLW:
3268  case X86ISD::SHUFP:
3269  case X86ISD::PALIGNR:
3270  case X86ISD::MOVLHPS:
3271  case X86ISD::MOVLHPD:
3272  case X86ISD::MOVHLPS:
3273  case X86ISD::MOVLPS:
3274  case X86ISD::MOVLPD:
3275  case X86ISD::MOVSHDUP:
3276  case X86ISD::MOVSLDUP:
3277  case X86ISD::MOVDDUP:
3278  case X86ISD::MOVSS:
3279  case X86ISD::MOVSD:
3280  case X86ISD::UNPCKL:
3281  case X86ISD::UNPCKH:
3282  case X86ISD::VPERMILP:
3283  case X86ISD::VPERM2X128:
3284  case X86ISD::VPERMI:
3285  return true;
3286  }
3287 }
3288 
3289 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3290  SDValue V1, SelectionDAG &DAG) {
3291  switch(Opc) {
3292  default: llvm_unreachable("Unknown x86 shuffle node");
3293  case X86ISD::MOVSHDUP:
3294  case X86ISD::MOVSLDUP:
3295  case X86ISD::MOVDDUP:
3296  return DAG.getNode(Opc, dl, VT, V1);
3297  }
3298 }
3299 
3300 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3301  SDValue V1, unsigned TargetMask,
3302  SelectionDAG &DAG) {
3303  switch(Opc) {
3304  default: llvm_unreachable("Unknown x86 shuffle node");
3305  case X86ISD::PSHUFD:
3306  case X86ISD::PSHUFHW:
3307  case X86ISD::PSHUFLW:
3308  case X86ISD::VPERMILP:
3309  case X86ISD::VPERMI:
3310  return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
3311  }
3312 }
3313 
3314 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3315  SDValue V1, SDValue V2, unsigned TargetMask,
3316  SelectionDAG &DAG) {
3317  switch(Opc) {
3318  default: llvm_unreachable("Unknown x86 shuffle node");
3319  case X86ISD::PALIGNR:
3320  case X86ISD::SHUFP:
3321  case X86ISD::VPERM2X128:
3322  return DAG.getNode(Opc, dl, VT, V1, V2,
3323  DAG.getConstant(TargetMask, MVT::i8));
3324  }
3325 }
3326 
3327 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3328  SDValue V1, SDValue V2, SelectionDAG &DAG) {
3329  switch(Opc) {
3330  default: llvm_unreachable("Unknown x86 shuffle node");
3331  case X86ISD::MOVLHPS:
3332  case X86ISD::MOVLHPD:
3333  case X86ISD::MOVHLPS:
3334  case X86ISD::MOVLPS:
3335  case X86ISD::MOVLPD:
3336  case X86ISD::MOVSS:
3337  case X86ISD::MOVSD:
3338  case X86ISD::UNPCKL:
3339  case X86ISD::UNPCKH:
3340  return DAG.getNode(Opc, dl, VT, V1, V2);
3341  }
3342 }
3343 
3344 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3345  MachineFunction &MF = DAG.getMachineFunction();
3346  const X86RegisterInfo *RegInfo =
3347  static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
3348  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3349  int ReturnAddrIndex = FuncInfo->getRAIndex();
3350 
3351  if (ReturnAddrIndex == 0) {
3352  // Set up a frame object for the return address.
3353  unsigned SlotSize = RegInfo->getSlotSize();
3354  ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3355  -(int64_t)SlotSize,
3356  false);
3357  FuncInfo->setRAIndex(ReturnAddrIndex);
3358  }
3359 
3360  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
3361 }
3362 
3363 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3364  bool hasSymbolicDisplacement) {
3365  // Offset should fit into 32 bit immediate field.
3366  if (!isInt<32>(Offset))
3367  return false;
3368 
3369  // If we don't have a symbolic displacement - we don't have any extra
3370  // restrictions.
3371  if (!hasSymbolicDisplacement)
3372  return true;
3373 
3374  // FIXME: Some tweaks might be needed for medium code model.
3375  if (M != CodeModel::Small && M != CodeModel::Kernel)
3376  return false;
3377 
3378  // For the small code model we assume that the last object is 16MB before the
3379  // end of the 31-bit boundary. We may also accept pretty large negative
3380  // constants knowing that all objects are in the positive half of the address space.
3381  if (M == CodeModel::Small && Offset < 16*1024*1024)
3382  return true;
3383 
3384  // For the kernel code model we know that all objects reside in the negative
3385  // half of the 32-bit address space. We must not accept negative offsets, since
3386  // they may push an address out of that range, but we may accept pretty large positive ones.
3387  if (M == CodeModel::Kernel && Offset > 0)
3388  return true;
3389 
3390  return false;
3391 }
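// Illustrative examples (assumed values, added for clarity): with a symbolic
// displacement under the small code model, an offset of 1 MiB is accepted
// (below the 16 MiB guard band before the 2^31 boundary) while an offset of
// 64 MiB is rejected; under the kernel code model, +4096 is accepted but
// -4096 is rejected, since all objects live in the negative half of the
// 32-bit address space.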
3392 
3393 /// isCalleePop - Determines whether the callee is required to pop its
3394 /// own arguments. Callee pop is necessary to support tail calls.
3395 bool X86::isCalleePop(CallingConv::ID CallingConv,
3396  bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3397  if (IsVarArg)
3398  return false;
3399 
3400  switch (CallingConv) {
3401  default:
3402  return false;
3403  case CallingConv::X86_StdCall:
3404  return !is64Bit;
3405  case CallingConv::X86_FastCall:
3406  return !is64Bit;
3407  case CallingConv::X86_ThisCall:
3408  return !is64Bit;
3409  case CallingConv::Fast:
3410  return TailCallOpt;
3411  case CallingConv::GHC:
3412  return TailCallOpt;
3413  case CallingConv::HiPE:
3414  return TailCallOpt;
3415  }
3416 }
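// Illustrative note (assumed example): on a 32-bit target, a stdcall function
// such as "int __stdcall f(int, int)" pops its own 8 bytes of arguments with
// "ret $8", so isCalleePop returns true for X86_StdCall when is64Bit is false;
// for CallingConv::Fast/GHC/HiPE it returns true only when -tailcallopt is in
// effect.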
3417 
3418 /// TranslateX86CC - do a one-to-one translation of an ISD::CondCode to the
3419 /// X86-specific condition code, returning the condition code and the LHS/RHS of the
3420 /// comparison to make.
3421 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
3422  SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
3423  if (!isFP) {
3424  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3425  if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3426  // X > -1 -> X == 0, jump !sign.
3427  RHS = DAG.getConstant(0, RHS.getValueType());
3428  return X86::COND_NS;
3429  }
3430  if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3431  // X < 0 -> X == 0, jump on sign.
3432  return X86::COND_S;
3433  }
3434  if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3435  // X < 1 -> X <= 0
3436  RHS = DAG.getConstant(0, RHS.getValueType());
3437  return X86::COND_LE;
3438  }
3439  }
3440 
3441  switch (SetCCOpcode) {
3442  default: llvm_unreachable("Invalid integer condition!");
3443  case ISD::SETEQ: return X86::COND_E;
3444  case ISD::SETGT: return X86::COND_G;
3445  case ISD::SETGE: return X86::COND_GE;
3446  case ISD::SETLT: return X86::COND_L;
3447  case ISD::SETLE: return X86::COND_LE;
3448  case ISD::SETNE: return X86::COND_NE;
3449  case ISD::SETULT: return X86::COND_B;
3450  case ISD::SETUGT: return X86::COND_A;
3451  case ISD::SETULE: return X86::COND_BE;
3452  case ISD::SETUGE: return X86::COND_AE;
3453  }
3454  }
3455 
3456  // First determine if it is required or is profitable to flip the operands.
3457 
3458  // If LHS is a foldable load, but RHS is not, flip the condition.
3459  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3460  !ISD::isNON_EXTLoad(RHS.getNode())) {
3461  SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3462  std::swap(LHS, RHS);
3463  }
3464 
3465  switch (SetCCOpcode) {
3466  default: break;
3467  case ISD::SETOLT:
3468  case ISD::SETOLE:
3469  case ISD::SETUGT:
3470  case ISD::SETUGE:
3471  std::swap(LHS, RHS);
3472  break;
3473  }
3474 
3475  // On a floating point condition, the flags are set as follows:
3476  // ZF PF CF op
3477  // 0 | 0 | 0 | X > Y
3478  // 0 | 0 | 1 | X < Y
3479  // 1 | 0 | 0 | X == Y
3480  // 1 | 1 | 1 | unordered
3481  switch (SetCCOpcode) {
3482  default: llvm_unreachable("Condcode should be pre-legalized away");
3483  case ISD::SETUEQ:
3484  case ISD::SETEQ: return X86::COND_E;
3485  case ISD::SETOLT: // flipped
3486  case ISD::SETOGT:
3487  case ISD::SETGT: return X86::COND_A;
3488  case ISD::SETOLE: // flipped
3489  case ISD::SETOGE:
3490  case ISD::SETGE: return X86::COND_AE;
3491  case ISD::SETUGT: // flipped
3492  case ISD::SETULT:
3493  case ISD::SETLT: return X86::COND_B;
3494  case ISD::SETUGE: // flipped
3495  case ISD::SETULE:
3496  case ISD::SETLE: return X86::COND_BE;
3497  case ISD::SETONE:
3498  case ISD::SETNE: return X86::COND_NE;
3499  case ISD::SETUO: return X86::COND_P;
3500  case ISD::SETO: return X86::COND_NP;
3501  case ISD::SETOEQ:
3502  case ISD::SETUNE: return X86::COND_INVALID;
3503  }
3504 }
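// Illustrative note (assumed example): for a signed integer comparison such as
//   %c = icmp sgt i32 %x, -1
// the SETGT/all-ones case above rewrites RHS to 0 and returns X86::COND_NS, so
// the final code can be a simple "test %eax, %eax" followed by "jns" instead of
// materializing the -1 immediate for a full compare.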
3505 
3506 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
3507 /// code. The current x86 ISA includes the following FP cmov instructions:
3508 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3509 static bool hasFPCMov(unsigned X86CC) {
3510  switch (X86CC) {
3511  default:
3512  return false;
3513  case X86::COND_B:
3514  case X86::COND_BE:
3515  case X86::COND_E:
3516  case X86::COND_P:
3517  case X86::COND_A:
3518  case X86::COND_AE:
3519  case X86::COND_NE:
3520  case X86::COND_NP:
3521  return true;
3522  }
3523 }
3524 
3525 /// isFPImmLegal - Returns true if the target can instruction select the
3526 /// specified FP immediate natively. If false, the legalizer will
3527 /// materialize the FP immediate as a load from a constant pool.
3528 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3529  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3530  if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3531  return true;
3532  }
3533  return false;
3534 }
3535 
3536 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
3537 /// the specified range [Low, Hi).
3538 static bool isUndefOrInRange(int Val, int Low, int Hi) {
3539  return (Val < 0) || (Val >= Low && Val < Hi);
3540 }
3541 
3542 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
3543 /// specified value.
3544 static bool isUndefOrEqual(int Val, int CmpVal) {
3545  return (Val < 0 || Val == CmpVal);
3546 }
3547 
3548 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3549 /// from position Pos and ending at Pos+Size, falls within the specified
3550 /// sequential range [Low, Low+Size) or is undef.
3551 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3552  unsigned Pos, unsigned Size, int Low) {
3553  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3554  if (!isUndefOrEqual(Mask[i], Low))
3555  return false;
3556  return true;
3557 }
3558 
3559 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3560 /// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
3561 /// the second operand.
3562 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) {
3563  if (VT == MVT::v4f32 || VT == MVT::v4i32 )
3564  return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
3565  if (VT == MVT::v2f64 || VT == MVT::v2i64)
3566  return (Mask[0] < 2 && Mask[1] < 2);
3567  return false;
3568 }
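// Illustrative note (assumed example): a v4i32 shuffle mask <2, 1, 0, 3> only
// references the first operand (all indices < 4), so isPSHUFDMask returns true
// and the shuffle can be selected as a single PSHUFD (immediate 0xC6 for this
// mask), whereas <0, 4, 1, 5> reads from the second operand and is rejected.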
3569 
3570 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3571 /// is suitable for input to PSHUFHW.
3572 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3573  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3574  return false;
3575 
3576  // Lower quadword copied in order or undef.
3577  if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
3578  return false;
3579 
3580  // Upper quadword shuffled.
3581  for (unsigned i = 4; i != 8; ++i)
3582  if (!isUndefOrInRange(Mask[i], 4, 8))
3583  return false;
3584 
3585  if (VT == MVT::v16i16) {
3586  // Lower quadword copied in order or undef.
3587  if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
3588  return false;
3589 
3590  // Upper quadword shuffled.
3591  for (unsigned i = 12; i != 16; ++i)
3592  if (!isUndefOrInRange(Mask[i], 12, 16))
3593  return false;
3594  }
3595 
3596  return true;
3597 }
3598 
3599 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
3600 /// is suitable for input to PSHUFLW.
3601 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3602  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3603  return false;
3604 
3605  // Upper quadword copied in order.
3606  if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
3607  return false;
3608 
3609  // Lower quadword shuffled.
3610  for (unsigned i = 0; i != 4; ++i)
3611  if (!isUndefOrInRange(Mask[i], 0, 4))
3612  return false;
3613 
3614  if (VT == MVT::v16i16) {
3615  // Upper quadword copied in order.
3616  if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
3617  return false;
3618 
3619  // Lower quadword shuffled.
3620  for (unsigned i = 8; i != 12; ++i)
3621  if (!isUndefOrInRange(Mask[i], 8, 12))
3622  return false;
3623  }
3624 
3625  return true;
3626 }
3627 
3628 /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
3629 /// is suitable for input to PALIGNR.
3630 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
3631  const X86Subtarget *Subtarget) {
3632  if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
3633  (VT.is256BitVector() && !Subtarget->hasInt256()))
3634  return false;
3635 
3636  unsigned NumElts = VT.getVectorNumElements();
3637  unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128;
3638  unsigned NumLaneElts = NumElts/NumLanes;
3639 
3640  // Do not handle 64-bit element shuffles with palignr.
3641  if (NumLaneElts == 2)
3642  return false;
3643 
3644  for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
3645  unsigned i;
3646  for (i = 0; i != NumLaneElts; ++i) {
3647  if (Mask[i+l] >= 0)
3648  break;
3649  }
3650 
3651  // Lane is all undef, go to next lane
3652  if (i == NumLaneElts)
3653  continue;
3654 
3655  int Start = Mask[i+l];
3656 
3657  // Make sure it's in this lane in one of the sources
3658  if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
3659  !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
3660  return false;
3661 
3662  // If not lane 0, then we must match lane 0
3663  if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
3664  return false;
3665 
3666  // Correct second source to be contiguous with first source
3667  if (Start >= (int)NumElts)
3668  Start -= NumElts - NumLaneElts;
3669 
3670  // Make sure we're shifting in the right direction.
3671  if (Start <= (int)(i+l))
3672  return false;
3673 
3674  Start -= i;
3675 
3676  // Check the rest of the elements to see if they are consecutive.
3677  for (++i; i != NumLaneElts; ++i) {
3678  int Idx = Mask[i+l];
3679 
3680  // Make sure it's in this lane
3681  if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
3682  !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
3683  return false;
3684 
3685  // If not lane 0, then we must match lane 0
3686  if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
3687  return false;
3688 
3689  if (Idx >= (int)NumElts)
3690  Idx -= NumElts - NumLaneElts;
3691 
3692  if (!isUndefOrEqual(Idx, Start+i))
3693  return false;
3694 
3695  }
3696  }
3697 
3698  return true;
3699 }
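// Illustrative note (assumed example): for v16i8 with SSSE3, the mask
// <1, 2, ..., 15, 16> selects the upper 15 bytes of the first source followed
// by the first byte of the second source, i.e. a byte rotation across the two
// inputs, so isPALIGNRMask accepts it and lowering can use a single PALIGNR
// with a byte shift of 1 (operand order is fixed up during lowering).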
3700 
3701 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
3702 /// the two vector operands have swapped position.
3703 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
3704  unsigned NumElems) {
3705  for (unsigned i = 0; i != NumElems; ++i) {
3706  int idx = Mask[i];
3707  if (idx < 0)
3708  continue;
3709  else if (idx < (int)NumElems)
3710  Mask[i] = idx + NumElems;
3711  else
3712  Mask[i] = idx - NumElems;
3713  }
3714 }
3715 
3716 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
3717 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
3718 /// SHUFPS and SHUFPD. If Commuted is true, then it checks whether the sources
3719 /// are in the reverse order of what x86 shuffles want.
3720 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
3721 
3722  unsigned NumElems = VT.getVectorNumElements();
3723  unsigned NumLanes = VT.getSizeInBits()/128;
3724  unsigned NumLaneElems = NumElems/NumLanes;
3725 
3726  if (NumLaneElems != 2 && NumLaneElems != 4)
3727  return false;
3728 
3729  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
3730  bool symetricMaskRequired =
3731  (VT.getSizeInBits() >= 256) && (EltSize == 32);
3732 
3733  // VSHUFPSY divides the resulting vector into 4 chunks.
3734  // The sources are also split into 4 chunks, and each destination
3735  // chunk must come from a different source chunk.
3736  //
3737  // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0
3738  // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
3739  //
3740  // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4,
3741  // Y3..Y0, Y3..Y0, X3..X0, X3..X0
3742  //
3743  // VSHUFPDY divides the resulting vector into 4 chunks.
3744  // The sources are also split into 4 chunks, and each destination
3745  // chunk must come from a different source chunk.
3746  //
3747  // SRC1 => X3 X2 X1 X0
3748  // SRC2 => Y3 Y2 Y1 Y0
3749  //
3750  // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0
3751  //
3752  SmallVector<int, 4> MaskVal(NumLaneElems, -1);
3753  unsigned HalfLaneElems = NumLaneElems/2;
3754  for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
3755  for (unsigned i = 0; i != NumLaneElems; ++i) {
3756  int Idx = Mask[i+l];
3757  unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
3758  if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
3759  return false;
3760  // For VSHUFPSY, the mask of the second half must be the same as the
3761  // first but with the appropriate offsets. This works in the same way as
3762  // VPERMILPS works with masks.
3763  if (!symetricMaskRequired || Idx < 0)
3764  continue;
3765  if (MaskVal[i] < 0) {
3766  MaskVal[i] = Idx - l;
3767  continue;
3768  }
3769  if ((signed)(Idx - l) != MaskVal[i])
3770  return false;
3771  }
3772  }
3773 
3774  return true;
3775 }
3776 
3777 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
3778 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
3779 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
3780  if (!VT.is128BitVector())
3781  return false;
3782 
3783  unsigned NumElems = VT.getVectorNumElements();
3784 
3785  if (NumElems != 4)
3786  return false;
3787 
3788  // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
3789  return isUndefOrEqual(Mask[0], 6) &&
3790  isUndefOrEqual(Mask[1], 7) &&
3791  isUndefOrEqual(Mask[2], 2) &&
3792  isUndefOrEqual(Mask[3], 3);
3793 }
3794 
3795 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
3796 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
3797 /// <2, 3, 2, 3>
3798 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
3799  if (!VT.is128BitVector())
3800  return false;
3801 
3802  unsigned NumElems = VT.getVectorNumElements();
3803 
3804  if (NumElems != 4)
3805  return false;
3806 
3807  return isUndefOrEqual(Mask[0], 2) &&
3808  isUndefOrEqual(Mask[1], 3) &&
3809  isUndefOrEqual(Mask[2], 2) &&
3810  isUndefOrEqual(Mask[3], 3);
3811 }
3812 
3813 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
3814 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
3815 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
3816  if (!VT.is128BitVector())
3817  return false;
3818 
3819  unsigned NumElems = VT.getVectorNumElements();
3820 
3821  if (NumElems != 2 && NumElems != 4)
3822  return false;
3823 
3824  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3825  if (!isUndefOrEqual(Mask[i], i + NumElems))
3826  return false;
3827 
3828  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
3829  if (!isUndefOrEqual(Mask[i], i))
3830  return false;
3831 
3832  return true;
3833 }
3834 
3835 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
3836 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
3837 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
3838  if (!VT.is128BitVector())
3839  return false;
3840 
3841  unsigned NumElems = VT.getVectorNumElements();
3842 
3843  if (NumElems != 2 && NumElems != 4)
3844  return false;
3845 
3846  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3847  if (!isUndefOrEqual(Mask[i], i))
3848  return false;
3849 
3850  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3851  if (!isUndefOrEqual(Mask[i + e], i + NumElems))
3852  return false;
3853 
3854  return true;
3855 }
3856 
3857 //
3858 // Some special combinations that can be optimized.
3859 //
3860 static
3861 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
3862  SelectionDAG &DAG) {
3863  MVT VT = SVOp->getSimpleValueType(0);
3864  SDLoc dl(SVOp);
3865 
3866  if (VT != MVT::v8i32 && VT != MVT::v8f32)
3867  return SDValue();
3868 
3869  ArrayRef<int> Mask = SVOp->getMask();
3870 
3871  // These are the special masks that may be optimized.
3872  static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
3873  static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15};
3874  bool MatchEvenMask = true;
3875  bool MatchOddMask = true;
3876  for (int i=0; i<8; ++i) {
3877  if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
3878  MatchEvenMask = false;
3879  if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
3880  MatchOddMask = false;
3881  }
3882 
3883  if (!MatchEvenMask && !MatchOddMask)
3884  return SDValue();
3885 
3886  SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
3887 
3888  SDValue Op0 = SVOp->getOperand(0);
3889  SDValue Op1 = SVOp->getOperand(1);
3890 
3891  if (MatchEvenMask) {
3892  // Shift the second operand right by 32 bits.
3893  static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
3894  Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
3895  } else {
3896  // Shift the first operand left by 32 bits.
3897  static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
3898  Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
3899  }
3900  static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
3901  return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
3902 }
3903 
3904 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
3905 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
3906 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
3907  bool HasInt256, bool V2IsSplat = false) {
3908 
3909  assert(VT.getSizeInBits() >= 128 &&
3910  "Unsupported vector type for unpckl");
3911 
3912  // AVX defines UNPCK* to operate independently on 128-bit lanes.
3913  unsigned NumLanes;
3914  unsigned NumOf256BitLanes;
3915  unsigned NumElts = VT.getVectorNumElements();
3916  if (VT.is256BitVector()) {
3917  if (NumElts != 4 && NumElts != 8 &&
3918  (!HasInt256 || (NumElts != 16 && NumElts != 32)))
3919  return false;
3920  NumLanes = 2;
3921  NumOf256BitLanes = 1;
3922  } else if (VT.is512BitVector()) {
3923  assert(VT.getScalarType().getSizeInBits() >= 32 &&
3924  "Unsupported vector type for unpckh");
3925  NumLanes = 2;
3926  NumOf256BitLanes = 2;
3927  } else {
3928  NumLanes = 1;
3929  NumOf256BitLanes = 1;
3930  }
3931 
3932  unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
3933  unsigned NumLaneElts = NumEltsInStride/NumLanes;
3934 
3935  for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
3936  for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
3937  for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
3938  int BitI = Mask[l256*NumEltsInStride+l+i];
3939  int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
3940  if (!isUndefOrEqual(BitI, j+l256*NumElts))
3941  return false;
3942  if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
3943  return false;
3944  if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
3945  return false;
3946  }
3947  }
3948  }
3949  return true;
3950 }
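// For illustration: the canonical v4i32 UNPCKL mask accepted here is
// <0, 4, 1, 5>, which interleaves the low halves of the two sources within a
// 128-bit lane; for v8i32 the accepted mask is <0, 8, 1, 9, 4, 12, 5, 13>.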
3951 
3952 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
3953 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
3954 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
3955  bool HasInt256, bool V2IsSplat = false) {
3956  assert(VT.getSizeInBits() >= 128 &&
3957  "Unsupported vector type for unpckh");
3958 
3959  // AVX defines UNPCK* to operate independently on 128-bit lanes.
3960  unsigned NumLanes;
3961  unsigned NumOf256BitLanes;
3962  unsigned NumElts = VT.getVectorNumElements();
3963  if (VT.is256BitVector()) {
3964  if (NumElts != 4 && NumElts != 8 &&
3965  (!HasInt256 || (NumElts != 16 && NumElts != 32)))
3966  return false;
3967  NumLanes = 2;
3968  NumOf256BitLanes = 1;
3969  } else if (VT.is512BitVector()) {
3970  assert(VT.getScalarType().getSizeInBits() >= 32 &&
3971  "Unsupported vector type for unpckh");
3972  NumLanes = 2;
3973  NumOf256BitLanes = 2;
3974  } else {
3975  NumLanes = 1;
3976  NumOf256BitLanes = 1;
3977  }
3978 
3979  unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
3980  unsigned NumLaneElts = NumEltsInStride/NumLanes;
3981 
3982  for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
3983  for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
3984  for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
3985  int BitI = Mask[l256*NumEltsInStride+l+i];
3986  int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
3987  if (!isUndefOrEqual(BitI, j+l256*NumElts))
3988  return false;
3989  if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
3990  return false;
3991  if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
3992  return false;
3993  }
3994  }
3995  }
3996  return true;
3997 }
3998 
3999 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
4000 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
4001 /// <0, 0, 1, 1>
4002 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4003  unsigned NumElts = VT.getVectorNumElements();
4004  bool Is256BitVec = VT.is256BitVector();
4005 
4006  if (VT.is512BitVector())
4007  return false;
4008  assert((VT.is128BitVector() || VT.is256BitVector()) &&
4009  "Unsupported vector type for unpckl");
4010 
4011  if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
4012  (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4013  return false;
4014 
4015  // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
4016  // FIXME: Need a better way to get rid of this, there's no latency difference
4017  // between UNPCKLPD and MOVDDUP, the latter should always be checked first and
4018  // the former later. We should also remove the "_undef" special mask.
4019  if (NumElts == 4 && Is256BitVec)
4020  return false;
4021 
4022  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4023  // independently on 128-bit lanes.
4024  unsigned NumLanes = VT.getSizeInBits()/128;
4025  unsigned NumLaneElts = NumElts/NumLanes;
4026 
4027  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4028  for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4029  int BitI = Mask[l+i];
4030  int BitI1 = Mask[l+i+1];
4031 
4032  if (!isUndefOrEqual(BitI, j))
4033  return false;
4034  if (!isUndefOrEqual(BitI1, j))
4035  return false;
4036  }
4037  }
4038 
4039  return true;
4040 }
4041 
4042 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
4043 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
4044 /// <2, 2, 3, 3>
4045 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4046  unsigned NumElts = VT.getVectorNumElements();
4047 
4048  if (VT.is512BitVector())
4049  return false;
4050 
4051  assert((VT.is128BitVector() || VT.is256BitVector()) &&
4052  "Unsupported vector type for unpckh");
4053 
4054  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4055  (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4056  return false;
4057 
4058  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4059  // independently on 128-bit lanes.
4060  unsigned NumLanes = VT.getSizeInBits()/128;
4061  unsigned NumLaneElts = NumElts/NumLanes;
4062 
4063  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4064  for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4065  int BitI = Mask[l+i];
4066  int BitI1 = Mask[l+i+1];
4067  if (!isUndefOrEqual(BitI, j))
4068  return false;
4069  if (!isUndefOrEqual(BitI1, j))
4070  return false;
4071  }
4072  }
4073  return true;
4074 }
4075 
4076 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
4077 /// specifies a shuffle of elements that is suitable for input to MOVSS,
4078 /// MOVSD, and MOVD, i.e. setting the lowest element.
4079 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
4080  if (VT.getVectorElementType().getSizeInBits() < 32)
4081  return false;
4082  if (!VT.is128BitVector())
4083  return false;
4084 
4085  unsigned NumElts = VT.getVectorNumElements();
4086 
4087  if (!isUndefOrEqual(Mask[0], NumElts))
4088  return false;
4089 
4090  for (unsigned i = 1; i != NumElts; ++i)
4091  if (!isUndefOrEqual(Mask[i], i))
4092  return false;
4093 
4094  return true;
4095 }
4096 
4097 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
4098 /// as permutations between 128-bit chunks or halves. As an example: this
4099 /// shuffle below:
4100 /// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
4101 /// The first half comes from the second half of V1 and the second half from
4102 /// the second half of V2.
4103 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4104  if (!HasFp256 || !VT.is256BitVector())
4105  return false;
4106 
4107  // The shuffle result is divided into half A and half B. In total the two
4108  // sources have 4 halves, namely: C, D, E, F. The final values of A and
4109  // B must come from C, D, E or F.
4110  unsigned HalfSize = VT.getVectorNumElements()/2;
4111  bool MatchA = false, MatchB = false;
4112 
4113  // Check if A comes from one of C, D, E, F.
4114  for (unsigned Half = 0; Half != 4; ++Half) {
4115  if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
4116  MatchA = true;
4117  break;
4118  }
4119  }
4120 
4121  // Check if B comes from one of C, D, E, F.
4122  for (unsigned Half = 0; Half != 4; ++Half) {
4123  if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
4124  MatchB = true;
4125  break;
4126  }
4127  }
4128 
4129  return MatchA && MatchB;
4130 }
4131 
4132 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
4133 /// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
4134 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
4135  MVT VT = SVOp->getSimpleValueType(0);
4136 
4137  unsigned HalfSize = VT.getVectorNumElements()/2;
4138 
4139  unsigned FstHalf = 0, SndHalf = 0;
4140  for (unsigned i = 0; i < HalfSize; ++i) {
4141  if (SVOp->getMaskElt(i) > 0) {
4142  FstHalf = SVOp->getMaskElt(i)/HalfSize;
4143  break;
4144  }
4145  }
4146  for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
4147  if (SVOp->getMaskElt(i) > 0) {
4148  SndHalf = SVOp->getMaskElt(i)/HalfSize;
4149  break;
4150  }
4151  }
4152 
4153  return (FstHalf | (SndHalf << 4));
4154 }
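// For illustration: the v8i32 mask <4, 5, 6, 7, 12, 13, 14, 15> gives
// FstHalf = 4/4 = 1 and SndHalf = 12/4 = 3, so the returned immediate is
// 1 | (3 << 4) = 0x31.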
4155 
4156 // Symmetric in-lane mask. Each lane has 4 elements (for imm8)
4157 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
4158  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4159  if (EltSize < 32)
4160  return false;
4161 
4162  unsigned NumElts = VT.getVectorNumElements();
4163  Imm8 = 0;
4164  if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
4165  for (unsigned i = 0; i != NumElts; ++i) {
4166  if (Mask[i] < 0)
4167  continue;
4168  Imm8 |= Mask[i] << (i*2);
4169  }
4170  return true;
4171  }
4172 
4173  unsigned LaneSize = 4;
4174  SmallVector<int, 4> MaskVal(LaneSize, -1);
4175 
4176  for (unsigned l = 0; l != NumElts; l += LaneSize) {
4177  for (unsigned i = 0; i != LaneSize; ++i) {
4178  if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4179  return false;
4180  if (Mask[i+l] < 0)
4181  continue;
4182  if (MaskVal[i] < 0) {
4183  MaskVal[i] = Mask[i+l] - l;
4184  Imm8 |= MaskVal[i] << (i*2);
4185  continue;
4186  }
4187  if (Mask[i+l] != (signed)(MaskVal[i]+l))
4188  return false;
4189  }
4190  }
4191  return true;
4192 }
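// For illustration: a v4i64 mask <3, 1, 2, 0> takes the 256-bit/64-bit-element
// path above and yields Imm8 = 3 | (1 << 2) | (2 << 4) | (0 << 6) = 0x27.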
4193 
4194 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
4195 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
4196 /// Note that VPERMIL mask matching is different depending on whether the underlying
4197 /// type is 32 or 64. In the VPERMILPS the high half of the mask should point
4198 /// to the same elements of the low, but to the higher half of the source.
4199 /// In VPERMILPD the two lanes could be shuffled independently of each other
4200 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
4201 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
4202  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4203  if (VT.getSizeInBits() < 256 || EltSize < 32)
4204  return false;
4205  bool symmetricMaskRequired = (EltSize == 32);
4206  unsigned NumElts = VT.getVectorNumElements();
4207 
4208  unsigned NumLanes = VT.getSizeInBits()/128;
4209  unsigned LaneSize = NumElts/NumLanes;
4210  // 2 or 4 elements in one lane
4211 
4212  SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
4213  for (unsigned l = 0; l != NumElts; l += LaneSize) {
4214  for (unsigned i = 0; i != LaneSize; ++i) {
4215  if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4216  return false;
4217  if (symmetricMaskRequired) {
4218  if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
4219  ExpectedMaskVal[i] = Mask[i+l] - l;
4220  continue;
4221  }
4222  if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
4223  return false;
4224  }
4225  }
4226  }
4227  return true;
4228 }
4229 
4230 /// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of
4231 /// what x86 movss wants: movss requires the lowest element to be the lowest
4232 /// element of vector 2 and the other elements to come from vector 1 in order.
4233 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
4234  bool V2IsSplat = false, bool V2IsUndef = false) {
4235  if (!VT.is128BitVector())
4236  return false;
4237 
4238  unsigned NumOps = VT.getVectorNumElements();
4239  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
4240  return false;
4241 
4242  if (!isUndefOrEqual(Mask[0], 0))
4243  return false;
4244 
4245  for (unsigned i = 1; i != NumOps; ++i)
4246  if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
4247  (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
4248  (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
4249  return false;
4250 
4251  return true;
4252 }
4253 
4254 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4255 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
4256 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
4257 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
4258  const X86Subtarget *Subtarget) {
4259  if (!Subtarget->hasSSE3())
4260  return false;
4261 
4262  unsigned NumElems = VT.getVectorNumElements();
4263 
4264  if ((VT.is128BitVector() && NumElems != 4) ||
4265  (VT.is256BitVector() && NumElems != 8) ||
4266  (VT.is512BitVector() && NumElems != 16))
4267  return false;
4268 
4269  // "i+1" is the value the indexed mask element must have
4270  for (unsigned i = 0; i != NumElems; i += 2)
4271  if (!isUndefOrEqual(Mask[i], i+1) ||
4272  !isUndefOrEqual(Mask[i+1], i+1))
4273  return false;
4274 
4275  return true;
4276 }
4277 
4278 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4279 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
4280 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
4281 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
4282  const X86Subtarget *Subtarget) {
4283  if (!Subtarget->hasSSE3())
4284  return false;
4285 
4286  unsigned NumElems = VT.getVectorNumElements();
4287 
4288  if ((VT.is128BitVector() && NumElems != 4) ||
4289  (VT.is256BitVector() && NumElems != 8) ||
4290  (VT.is512BitVector() && NumElems != 16))
4291  return false;
4292 
4293  // "i" is the value the indexed mask element must have
4294  for (unsigned i = 0; i != NumElems; i += 2)
4295  if (!isUndefOrEqual(Mask[i], i) ||
4296  !isUndefOrEqual(Mask[i+1], i))
4297  return false;
4298 
4299  return true;
4300 }
4301 
4302 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
4303 /// specifies a shuffle of elements that is suitable for input to 256-bit
4304 /// version of MOVDDUP.
4305 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4306  if (!HasFp256 || !VT.is256BitVector())
4307  return false;
4308 
4309  unsigned NumElts = VT.getVectorNumElements();
4310  if (NumElts != 4)
4311  return false;
4312 
4313  for (unsigned i = 0; i != NumElts/2; ++i)
4314  if (!isUndefOrEqual(Mask[i], 0))
4315  return false;
4316  for (unsigned i = NumElts/2; i != NumElts; ++i)
4317  if (!isUndefOrEqual(Mask[i], NumElts/2))
4318  return false;
4319  return true;
4320 }
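// For illustration: for v4f64/v4i64 the only mask accepted here (up to undefs)
// is <0, 0, 2, 2>, i.e. each 128-bit half duplicates its low element.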
4321 
4322 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4323 /// specifies a shuffle of elements that is suitable for input to 128-bit
4324 /// version of MOVDDUP.
4325 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
4326  if (!VT.is128BitVector())
4327  return false;
4328 
4329  unsigned e = VT.getVectorNumElements() / 2;
4330  for (unsigned i = 0; i != e; ++i)
4331  if (!isUndefOrEqual(Mask[i], i))
4332  return false;
4333  for (unsigned i = 0; i != e; ++i)
4334  if (!isUndefOrEqual(Mask[e+i], i))
4335  return false;
4336  return true;
4337 }
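// For illustration: for v2f64 this accepts <0, 0>, duplicating the low
// element, and for v4i32 it accepts <0, 1, 0, 1>.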
4338 
4339 /// isVEXTRACTIndex - Return true if the specified
4340 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
4341 /// suitable for instructions that extract 128 or 256-bit vectors
4342 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4343  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4344  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4345  return false;
4346 
4347  // The index should be aligned on a vecWidth-bit boundary.
4348  uint64_t Index =
4349  cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4350 
4351  MVT VT = N->getSimpleValueType(0);
4352  unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4353  bool Result = (Index * ElSize) % vecWidth == 0;
4354 
4355  return Result;
4356 }
4357 
4358 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
4359 /// operand specifies a subvector insert that is suitable for input to
4360 /// insertion of 128 or 256-bit subvectors
4361 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4362  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4363  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4364  return false;
4365  // The index should be aligned on a vecWidth-bit boundary.
4366  uint64_t Index =
4367  cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4368 
4369  MVT VT = N->getSimpleValueType(0);
4370  unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4371  bool Result = (Index * ElSize) % vecWidth == 0;
4372 
4373  return Result;
4374 }
4375 
4376 bool X86::isVINSERT128Index(SDNode *N) {
4377  return isVINSERTIndex(N, 128);
4378 }
4379 
4380 bool X86::isVINSERT256Index(SDNode *N) {
4381  return isVINSERTIndex(N, 256);
4382 }
4383 
4384 bool X86::isVEXTRACT128Index(SDNode *N) {
4385  return isVEXTRACTIndex(N, 128);
4386 }
4387 
4388 bool X86::isVEXTRACT256Index(SDNode *N) {
4389  return isVEXTRACTIndex(N, 256);
4390 }
4391 
4392 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
4393 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
4394 /// Handles 128-bit and 256-bit.
4395 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
4396  MVT VT = N->getSimpleValueType(0);
4397 
4398  assert((VT.getSizeInBits() >= 128) &&
4399  "Unsupported vector type for PSHUF/SHUFP");
4400 
4401  // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
4402  // independently on 128-bit lanes.
4403  unsigned NumElts = VT.getVectorNumElements();
4404  unsigned NumLanes = VT.getSizeInBits()/128;
4405  unsigned NumLaneElts = NumElts/NumLanes;
4406 
4407  assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
4408  "Only supports 2, 4 or 8 elements per lane");
4409 
4410  unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
4411  unsigned Mask = 0;
4412  for (unsigned i = 0; i != NumElts; ++i) {
4413  int Elt = N->getMaskElt(i);
4414  if (Elt < 0) continue;
4415  Elt &= NumLaneElts - 1;
4416  unsigned ShAmt = (i << Shift) % 8;
4417  Mask |= Elt << ShAmt;
4418  }
4419 
4420  return Mask;
4421 }
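// For illustration: the v4i32 PSHUFD mask <3, 1, 0, 2> encodes as
// 3 | (1 << 2) | (0 << 4) | (2 << 6) = 0x87.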
4422 
4423 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
4424 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
4425 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
4426  MVT VT = N->getSimpleValueType(0);
4427 
4428  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4429  "Unsupported vector type for PSHUFHW");
4430 
4431  unsigned NumElts = VT.getVectorNumElements();
4432 
4433  unsigned Mask = 0;
4434  for (unsigned l = 0; l != NumElts; l += 8) {
4435  // 8 nodes per lane, but we only care about the last 4.
4436  for (unsigned i = 0; i < 4; ++i) {
4437  int Elt = N->getMaskElt(l+i+4);
4438  if (Elt < 0) continue;
4439  Elt &= 0x3; // only 2-bits.
4440  Mask |= Elt << (i * 2);
4441  }
4442  }
4443 
4444  return Mask;
4445 }
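// For illustration: the v8i16 mask <0, 1, 2, 3, 7, 6, 5, 4>, which reverses
// the high quadword, encodes as 3 | (2 << 2) | (1 << 4) | (0 << 6) = 0x1B.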
4446 
4447 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
4448 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
4449 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
4450  MVT VT = N->getSimpleValueType(0);
4451 
4452  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4453  "Unsupported vector type for PSHUFLW");
4454 
4455  unsigned NumElts = VT.getVectorNumElements();
4456 
4457  unsigned Mask = 0;
4458  for (unsigned l = 0; l != NumElts; l += 8) {
4459  // 8 nodes per lane, but we only care about the first 4.
4460  for (unsigned i = 0; i < 4; ++i) {
4461  int Elt = N->getMaskElt(l+i);
4462  if (Elt < 0) continue;
4463  Elt &= 0x3; // only 2-bits
4464  Mask |= Elt << (i * 2);
4465  }
4466  }
4467 
4468  return Mask;
4469 }
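// For illustration: the v8i16 mask <3, 2, 1, 0, 4, 5, 6, 7>, which reverses
// the low quadword, likewise encodes as 0x1B.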
4470 
4471 /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
4472 /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
4473 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
4474  MVT VT = SVOp->getSimpleValueType(0);
4475  unsigned EltSize = VT.is512BitVector() ? 1 :
4476  VT.getVectorElementType().getSizeInBits() >> 3;
4477 
4478  unsigned NumElts = VT.getVectorNumElements();
4479  unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
4480  unsigned NumLaneElts = NumElts/NumLanes;
4481 
4482  int Val = 0;
4483  unsigned i;
4484  for (i = 0; i != NumElts; ++i) {
4485  Val = SVOp->getMaskElt(i);
4486  if (Val >= 0)
4487  break;
4488  }
4489  if (Val >= (int)NumElts)
4490  Val -= NumElts - NumLaneElts;
4491 
4492  assert(Val - i > 0 && "PALIGNR imm should be positive");
4493  return (Val - i) * EltSize;
4494 }
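// For illustration: for the v8i16 mask <3, 4, 5, 6, 7, 8, 9, 10> the first
// defined element is 3 at position 0, so the returned byte amount is
// (3 - 0) * 2 = 6.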
4495 
4496 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4497  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4498  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4499  llvm_unreachable("Illegal extract subvector for VEXTRACT");
4500 
4501  uint64_t Index =
4502  cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4503 
4504  MVT VecVT = N->getOperand(0).getSimpleValueType();
4505  MVT ElVT = VecVT.getVectorElementType();
4506 
4507  unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4508  return Index / NumElemsPerChunk;
4509 }
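// For illustration: extracting the 128-bit chunk starting at element index 4
// of a v8f32 gives 4 / (128 / 32) = 1, i.e. the upper 128-bit half.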
4510 
4511 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4512  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4513  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4514  llvm_unreachable("Illegal insert subvector for VINSERT");
4515 
4516  uint64_t Index =
4517  cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4518 
4519  MVT VecVT = N->getSimpleValueType(0);
4520  MVT ElVT = VecVT.getVectorElementType();
4521 
4522  unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4523  return Index / NumElemsPerChunk;
4524 }
4525 
4526 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
4527 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
4528 /// and VEXTRACTI128 instructions.
4529 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4530  return getExtractVEXTRACTImmediate(N, 128);
4531 }
4532 
4533 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
4534 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
4535 /// and VEXTRACTI64x4 instructions.
4536 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
4537  return getExtractVEXTRACTImmediate(N, 256);
4538 }
4539 
4540 /// getInsertVINSERT128Immediate - Return the appropriate immediate
4541 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
4542 /// and VINSERTI128 instructions.
4543 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
4544  return getInsertVINSERTImmediate(N, 128);
4545 }
4546 
4547 /// getInsertVINSERT256Immediate - Return the appropriate immediate
4548 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
4549 /// and VINSERTI64x4 instructions.
4550 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
4551  return getInsertVINSERTImmediate(N, 256);
4552 }
4553 
4554 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
4555 /// constant +0.0.
4556 bool X86::isZeroNode(SDValue Elt) {
4557  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt))
4558  return CN->isNullValue();
4559  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
4560  return CFP->getValueAPF().isPosZero();
4561  return false;
4562 }
4563 
4564 /// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
4565 /// their permute mask.
4566 static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
4567  SelectionDAG &DAG) {
4568  MVT VT = SVOp->getSimpleValueType(0);
4569  unsigned NumElems = VT.getVectorNumElements();
4570  SmallVector<int, 8> MaskVec;
4571 
4572  for (unsigned i = 0; i != NumElems; ++i) {
4573  int Idx = SVOp->getMaskElt(i);
4574  if (Idx >= 0) {
4575  if (Idx < (int)NumElems)
4576  Idx += NumElems;
4577  else
4578  Idx -= NumElems;
4579  }
4580  MaskVec.push_back(Idx);
4581  }
4582  return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1),
4583  SVOp->getOperand(0), &MaskVec[0]);
4584 }
4585 
4586 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
4587 /// match movhlps. The lower half elements should come from upper half of
4588 /// V1 (and in order), and the upper half elements should come from the upper
4589 /// half of V2 (and in order).
4590 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
4591  if (!VT.is128BitVector())
4592  return false;
4593  if (VT.getVectorNumElements() != 4)
4594  return false;
4595  for (unsigned i = 0, e = 2; i != e; ++i)
4596  if (!isUndefOrEqual(Mask[i], i+2))
4597  return false;
4598  for (unsigned i = 2; i != 4; ++i)
4599  if (!isUndefOrEqual(Mask[i], i+4))
4600  return false;
4601  return true;
4602 }
4603 
4604 /// isScalarLoadToVector - Returns true if the node is a scalar load that
4605 /// is promoted to a vector. It also returns the LoadSDNode by reference if
4606 /// required.
4607 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
4608  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
4609  return false;
4610  N = N->getOperand(0).getNode();
4611  if (!ISD::isNON_EXTLoad(N))
4612  return false;
4613  if (LD)
4614  *LD = cast<LoadSDNode>(N);
4615  return true;
4616 }
4617 
4618 // Test whether the given value is a vector value which will be legalized
4619 // into a load.
4620 static bool WillBeConstantPoolLoad(SDNode *N) {
4621  if (N->getOpcode() != ISD::BUILD_VECTOR)
4622  return false;
4623 
4624  // Check for any non-constant elements.
4625  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
4626  switch (N->getOperand(i).getNode()->getOpcode()) {
4627  case ISD::UNDEF:
4628  case ISD::ConstantFP:
4629  case ISD::Constant:
4630  break;
4631  default:
4632  return false;
4633  }
4634 
4635  // Vectors of all-zeros and all-ones are materialized with special
4636  // instructions rather than being loaded.
4637  return !ISD::isBuildVectorAllZeros(N) &&
4638  !ISD::isBuildVectorAllOnes(N);
4639 }
4640 
4641 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
4642 /// match movlp{s|d}. The lower half elements should come from lower half of
4643 /// V1 (and in order), and the upper half elements should come from the upper
4644 /// half of V2 (and in order). And since V1 will become the source of the
4645 /// MOVLP, it must be either a vector load or a scalar load to vector.
4646 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
4647  ArrayRef<int> Mask, MVT VT) {
4648  if (!VT.is128BitVector())
4649  return false;
4650 
4651  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
4652  return false;
4653  // If V2 is a vector load, don't do this transformation. We will try to use
4654  // a load-folding shufps op instead.
4655  if (ISD::isNON_EXTLoad(V2) && !WillBeConstantPoolLoad(V2))
4656  return false;
4657 
4658  unsigned NumElems = VT.getVectorNumElements();
4659 
4660  if (NumElems != 2 && NumElems != 4)
4661  return false;
4662  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4663  if (!isUndefOrEqual(Mask[i], i))
4664  return false;
4665  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
4666  if (!isUndefOrEqual(Mask[i], i+NumElems))
4667  return false;
4668  return true;
4669 }
4670 
4671 /// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
4672 /// all the same.
4673 static bool isSplatVector(SDNode *N) {
4674  if (N->getOpcode() != ISD::BUILD_VECTOR)
4675  return false;
4676 
4677  SDValue SplatValue = N->getOperand(0);
4678  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
4679  if (N->getOperand(i) != SplatValue)
4680  return false;
4681  return true;
4682 }
4683 
4684 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
4685 /// to a zero vector.
4686 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
4687 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
4688  SDValue V1 = N->getOperand(0);
4689  SDValue V2 = N->getOperand(1);
4690  unsigned NumElems = N->getValueType(0).getVectorNumElements();
4691  for (unsigned i = 0; i != NumElems; ++i) {
4692  int Idx = N->getMaskElt(i);
4693  if (Idx >= (int)NumElems) {
4694  unsigned Opc = V2.getOpcode();
4695  if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
4696  continue;
4697  if (Opc != ISD::BUILD_VECTOR ||
4698  !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
4699  return false;
4700  } else if (Idx >= 0) {
4701  unsigned Opc = V1.getOpcode();
4702  if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
4703  continue;
4704  if (Opc != ISD::BUILD_VECTOR ||
4705  !X86::isZeroNode(V1.getOperand(Idx)))
4706  return false;
4707  }
4708  }
4709  return true;
4710 }
4711 
4712 /// getZeroVector - Returns a vector of specified type with all zero elements.
4713 ///
4714 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
4715  SelectionDAG &DAG, SDLoc dl) {
4716  assert(VT.isVector() && "Expected a vector type");
4717 
4718  // Always build SSE zero vectors as <4 x i32> bitcasted
4719  // to their dest type. This ensures they get CSE'd.
4720  SDValue Vec;
4721  if (VT.is128BitVector()) { // SSE
4722  if (Subtarget->hasSSE2()) { // SSE2
4723  SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
4724  Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4725  } else { // SSE1
4726  SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
4727  Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
4728  }
4729  } else if (VT.is256BitVector()) { // AVX
4730  if (Subtarget->hasInt256()) { // AVX2
4731  SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
4732  SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4733  Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
4734  array_lengthof(Ops));
4735  } else {
4736  // 256-bit logic and arithmetic instructions in AVX are all
4737  // floating-point, no support for integer ops. Emit fp zeroed vectors.
4738  SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
4739  SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4740  Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops,
4741  array_lengthof(Ops));
4742  }
4743  } else if (VT.is512BitVector()) { // AVX-512
4744  SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
4745  SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
4746  Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4747  Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops, 16);
4748  } else
4749  llvm_unreachable("Unexpected vector type");
4750 
4751  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
4752 }
4753 
4754 /// getOnesVector - Returns a vector of specified type with all bits set.
4755 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
4756 /// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
4757 /// Then bitcast to their original type, ensuring they get CSE'd.
4758 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
4759  SDLoc dl) {
4760  assert(VT.isVector() && "Expected a vector type");
4761 
4762  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
4763  SDValue Vec;
4764  if (VT.is256BitVector()) {
4765  if (HasInt256) { // AVX2
4766  SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4767  Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
4768  array_lengthof(Ops));
4769  } else { // AVX
4770  Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4771  Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
4772  }
4773  } else if (VT.is128BitVector()) {
4774  Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4775  } else
4776  llvm_unreachable("Unexpected vector type");
4777 
4778  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
4779 }
4780 
4781 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
4782 /// that point to V2 point to its first element.
4783 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
4784  for (unsigned i = 0; i != NumElems; ++i) {
4785  if (Mask[i] > (int)NumElems) {
4786  Mask[i] = NumElems;
4787  }
4788  }
4789 }
4790 
4791 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
4792 /// operation of specified width.
4793 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
4794  SDValue V2) {
4795  unsigned NumElems = VT.getVectorNumElements();
4796  SmallVector<int, 8> Mask;
4797  Mask.push_back(NumElems);
4798  for (unsigned i = 1; i != NumElems; ++i)
4799  Mask.push_back(i);
4800  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4801 }
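// For illustration: for v4i32 this builds the mask <4, 1, 2, 3>, taking the
// low element from V2 and the remaining elements from V1.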
4802 
4803 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
4804 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
4805  SDValue V2) {
4806  unsigned NumElems = VT.getVectorNumElements();
4807  SmallVector<int, 8> Mask;
4808  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
4809  Mask.push_back(i);
4810  Mask.push_back(i + NumElems);
4811  }
4812  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4813 }
4814 
4815 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
4816 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
4817  SDValue V2) {
4818  unsigned NumElems = VT.getVectorNumElements();
4819  SmallVector<int, 8> Mask;
4820  for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
4821  Mask.push_back(i + Half);
4822  Mask.push_back(i + NumElems + Half);
4823  }
4824  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4825 }
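// For illustration: for v4i32, getUnpackl above builds the mask <0, 4, 1, 5>
// and getUnpackh builds <2, 6, 3, 7>.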
4826 
4827 // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
4828 // a generic shuffle instruction because the target has no such instructions.
4829 // Generate shuffles which repeat i16 and i8 several times until they can be
4830 // represented by v4f32 and then be manipulated by target supported shuffles.
4831 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
4832  MVT VT = V.getSimpleValueType();
4833  int NumElems = VT.getVectorNumElements();
4834  SDLoc dl(V);
4835 
4836  while (NumElems > 4) {
4837  if (EltNo < NumElems/2) {
4838  V = getUnpackl(DAG, dl, VT, V, V);
4839  } else {
4840  V = getUnpackh(DAG, dl, VT, V, V);
4841  EltNo -= NumElems/2;
4842  }
4843  NumElems >>= 1;
4844  }
4845  return V;
4846 }
4847 
4848 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
4849 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
4850  MVT VT = V.getSimpleValueType();
4851  SDLoc dl(V);
4852 
4853  if (VT.is128BitVector()) {
4854  V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
4855  int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
4856  V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
4857  &SplatMask[0]);
4858  } else if (VT.is256BitVector()) {
4859  // To use VPERMILPS to splat scalars, the second half of indices must
4860  // refer to the higher part, which is a duplication of the lower one,
4861  // because VPERMILPS can only handle in-lane permutations.
4862  int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
4863  EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
4864 
4865  V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
4866  V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
4867  &SplatMask[0]);
4868  } else
4869  llvm_unreachable("Vector size not supported");
4870 
4871  return DAG.getNode(ISD::BITCAST, dl, VT, V);
4872 }
4873 
4874 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
4875 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
4876  MVT SrcVT = SV->getSimpleValueType(0);
4877  SDValue V1 = SV->getOperand(0);
4878  SDLoc dl(SV);
4879 
4880  int EltNo = SV->getSplatIndex();
4881  int NumElems = SrcVT.getVectorNumElements();
4882  bool Is256BitVec = SrcVT.is256BitVector();
4883 
4884  assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
4885  "Unknown how to promote splat for type");
4886 
4887  // Extract the 128-bit part containing the splat element and update
4888  // the splat element index when it refers to the higher register.
4889  if (Is256BitVec) {
4890  V1 = Extract128BitVector(V1, EltNo, DAG, dl);
4891  if (EltNo >= NumElems/2)
4892  EltNo -= NumElems/2;
4893  }
4894 
4895  // All i16 and i8 vector types can't be used directly by a generic shuffle
4896  // instruction because the target has no such instruction. Generate shuffles
4897  // which repeat i16 and i8 several times until they fit in i32, and then can
4898  // be manipulated by target supported shuffles.
4899  MVT EltVT = SrcVT.getVectorElementType();
4900  if (EltVT == MVT::i8 || EltVT == MVT::i16)
4901  V1 = PromoteSplati8i16(V1, DAG, EltNo);
4902 
4903  // Recreate the 256-bit vector and place the same 128-bit vector
4904  // into the low and high part. This is necessary because we want
4905  // to use VPERM* to shuffle the vectors
4906  if (Is256BitVec) {
4907  V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
4908  }
4909 
4910  return getLegalSplat(DAG, V1, EltNo);
4911 }
4912 
4913 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
4914 /// vector and a zero or undef vector. This produces a shuffle where the low
4915 /// element of V2 is swizzled into the zero/undef vector, landing at element
4916 /// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4917 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
4918  bool IsZero,
4919  const X86Subtarget *Subtarget,
4920  SelectionDAG &DAG) {
4921  MVT VT = V2.getSimpleValueType();
4922  SDValue V1 = IsZero
4923  ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4924  unsigned NumElems = VT.getVectorNumElements();
4925  SmallVector<int, 16> MaskVec;
4926  for (unsigned i = 0; i != NumElems; ++i)
4927  // If this is the insertion idx, put the low elt of V2 here.
4928  MaskVec.push_back(i == Idx ? NumElems : i);
4929  return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
4930 }
4931 
4932 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
4933 /// target specific opcode. Returns true if the Mask could be calculated.
4934 /// Sets IsUnary to true if it only uses one source.
4935 static bool getTargetShuffleMask(SDNode *N, MVT VT,
4936  SmallVectorImpl<int> &Mask, bool &IsUnary) {
4937  unsigned NumElems = VT.getVectorNumElements();
4938  SDValue ImmN;
4939 
4940  IsUnary = false;
4941  switch(N->getOpcode()) {
4942  case X86ISD::SHUFP:
4943  ImmN = N->getOperand(N->getNumOperands()-1);
4944  DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4945  break;
4946  case X86ISD::UNPCKH:
4947  DecodeUNPCKHMask(VT, Mask);
4948  break;
4949  case X86ISD::UNPCKL:
4950  DecodeUNPCKLMask(VT, Mask);
4951  break;
4952  case X86ISD::MOVHLPS:
4953  DecodeMOVHLPSMask(NumElems, Mask);
4954  break;
4955  case X86ISD::MOVLHPS:
4956  DecodeMOVLHPSMask(NumElems, Mask);
4957  break;
4958  case X86ISD::PALIGNR:
4959  ImmN = N->getOperand(N->getNumOperands()-1);
4960  DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4961  break;
4962  case X86ISD::PSHUFD:
4963  case X86ISD::VPERMILP:
4964  ImmN = N->getOperand(N->getNumOperands()-1);
4965  DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4966  IsUnary = true;
4967  break;
4968  case X86ISD::PSHUFHW:
4969  ImmN = N->getOperand(N->getNumOperands()-1);
4970  DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4971  IsUnary = true;
4972  break;
4973  case X86ISD::PSHUFLW:
4974  ImmN = N->getOperand(N->getNumOperands()-1);
4975  DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4976  IsUnary = true;
4977  break;
4978  case X86ISD::VPERMI:
4979  ImmN = N->getOperand(N->getNumOperands()-1);
4980  DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4981  IsUnary = true;
4982  break;
4983  case X86ISD::MOVSS:
4984  case X86ISD::MOVSD: {
4985  // The index 0 always comes from the first element of the second source,
4986  // this is why MOVSS and MOVSD are used in the first place. The other
4987  // elements come from the other positions of the first source vector
4988  Mask.push_back(NumElems);
4989  for (unsigned i = 1; i != NumElems; ++i) {
4990  Mask.push_back(i);
4991  }
4992  break;
4993  }
4994  case X86ISD::VPERM2X128:
4995  ImmN = N->getOperand(N->getNumOperands()-1);
4996  DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4997  if (Mask.empty()) return false;
4998  break;
4999  case X86ISD::MOVDDUP:
5000  case X86ISD::MOVLHPD:
5001  case X86ISD::MOVLPD:
5002  case X86ISD::MOVLPS:
5003  case X86ISD::MOVSHDUP:
5004  case X86ISD::MOVSLDUP:
5005  // Not yet implemented
5006  return false;
5007  default: llvm_unreachable("unknown target shuffle node");
5008  }
5009 
5010  return true;
5011 }
5012 
5013 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
5014 /// element of the result of the vector shuffle.
5015 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5016  unsigned Depth) {
5017  if (Depth == 6)
5018  return SDValue(); // Limit search depth.
5019 
5020  SDValue V = SDValue(N, 0);
5021  EVT VT = V.getValueType();
5022  unsigned Opcode = V.getOpcode();
5023 
5024  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5025  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5026  int Elt = SV->getMaskElt(Index);
5027 
5028  if (Elt < 0)
5029  return DAG.getUNDEF(VT.getVectorElementType());
5030 
5031  unsigned NumElems = VT.getVectorNumElements();
5032  SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5033  : SV->getOperand(1);
5034  return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5035  }
5036 
5037  // Recurse into target specific vector shuffles to find scalars.
5038  if (isTargetShuffle(Opcode)) {
5039  MVT ShufVT = V.getSimpleValueType();
5040  unsigned NumElems = ShufVT.getVectorNumElements();
5041  SmallVector<int, 16> ShuffleMask;
5042  bool IsUnary;
5043 
5044  if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
5045  return SDValue();
5046 
5047  int Elt = ShuffleMask[Index];
5048  if (Elt < 0)
5049  return DAG.getUNDEF(ShufVT.getVectorElementType());
5050 
5051  SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
5052  : N->getOperand(1);
5053  return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5054  Depth+1);
5055  }
5056 
5057  // Actual nodes that may contain scalar elements
5058  if (Opcode == ISD::BITCAST) {
5059  V = V.getOperand(0);
5060  EVT SrcVT = V.getValueType();
5061  unsigned NumElems = VT.getVectorNumElements();
5062 
5063  if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5064  return SDValue();
5065  }
5066 
5067  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5068  return (Index == 0) ? V.getOperand(0)
5069  : DAG.getUNDEF(VT.getVectorElementType());
5070 
5071  if (V.getOpcode() == ISD::BUILD_VECTOR)
5072  return V.getOperand(Index);
5073 
5074  return SDValue();
5075 }
5076 
5077 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
5078 /// shuffle operation which consecutively come from zero. The
5079 /// search can start in two different directions, from left or right.
5080 /// We count undefs as zeros until PreferredNum is reached.
5081 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
5082  unsigned NumElems, bool ZerosFromLeft,
5083  SelectionDAG &DAG,
5084  unsigned PreferredNum = -1U) {
5085  unsigned NumZeros = 0;
5086  for (unsigned i = 0; i != NumElems; ++i) {
5087  unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
5088  SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
5089  if (!Elt.getNode())
5090  break;
5091 
5092  if (X86::isZeroNode(Elt))
5093  ++NumZeros;
5094  else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
5095  NumZeros = std::min(NumZeros + 1, PreferredNum);
5096  else
5097  break;
5098  }
5099 
5100  return NumZeros;
5101 }
5102 
5103 /// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
5104 /// correspond consecutively to elements from one of the vector operands,
5105 /// starting from its index OpIdx. Also sets OpNum to the source vector operand used.
5106 static
5107 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
5108  unsigned MaskI, unsigned MaskE, unsigned OpIdx,
5109  unsigned NumElems, unsigned &OpNum) {
5110  bool SeenV1 = false;
5111  bool SeenV2 = false;
5112 
5113  for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
5114  int Idx = SVOp->getMaskElt(i);
5115  // Ignore undef indices
5116  if (Idx < 0)
5117  continue;
5118 
5119  if (Idx < (int)NumElems)
5120  SeenV1 = true;
5121  else
5122  SeenV2 = true;
5123 
5124  // Only accept consecutive elements from the same vector
5125  if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
5126  return false;
5127  }
5128 
5129  OpNum = SeenV1 ? 0 : 1;
5130  return true;
5131 }
5132 
5133 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
5134 /// logical right shift of a vector.
5135 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5136  bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5137  unsigned NumElems =
5138  SVOp->getSimpleValueType(0).getVectorNumElements();
5139  unsigned NumZeros = getNumOfConsecutiveZeros(
5140  SVOp, NumElems, false /* check zeros from right */, DAG,
5141  SVOp->getMaskElt(0));
5142  unsigned OpSrc;
5143 
5144  if (!NumZeros)
5145  return false;
5146 
5147  // Considering the elements in the mask that are not consecutive zeros,
5148  // check if they consecutively come from only one of the source vectors.
5149  //
5150  // V1 = {X, A, B, C} 0
5151  // \ \ \ /
5152  // vector_shuffle V1, V2 <1, 2, 3, X>
5153  //
5154  if (!isShuffleMaskConsecutive(SVOp,
5155  0, // Mask Start Index
5156  NumElems-NumZeros, // Mask End Index(exclusive)
5157  NumZeros, // Where to start looking in the src vector
5158  NumElems, // Number of elements in vector
5159  OpSrc)) // Which source operand ?
5160  return false;
5161 
5162  isLeft = false;
5163  ShAmt = NumZeros;
5164  ShVal = SVOp->getOperand(OpSrc);
5165  return true;
5166 }
5167 
5168 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
5169 /// logical left shift of a vector.
5170 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5171  bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5172  unsigned NumElems =
5173  SVOp->getSimpleValueType(0).getVectorNumElements();
5174  unsigned NumZeros = getNumOfConsecutiveZeros(
5175  SVOp, NumElems, true /* check zeros from left */, DAG,
5176  NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
5177  unsigned OpSrc;
5178 
5179  if (!NumZeros)
5180  return false;
5181 
5182  // Considering the elements in the mask that are not consecutive zeros,
5183  // check if they consecutively come from only one of the source vectors.
5184  //
5185  // 0 { A, B, X, X } = V2
5186  // / \ / /
5187  // vector_shuffle V1, V2 <X, X, 4, 5>
5188  //
5189  if (!isShuffleMaskConsecutive(SVOp,
5190  NumZeros, // Mask Start Index
5191  NumElems, // Mask End Index(exclusive)
5192  0, // Where to start looking in the src vector
5193  NumElems, // Number of elements in vector
5194  OpSrc)) // Which source operand ?
5195  return false;
5196 
5197  isLeft = true;
5198  ShAmt = NumZeros;
5199  ShVal = SVOp->getOperand(OpSrc);
5200  return true;
5201 }
5202 
5203 /// isVectorShift - Returns true if the shuffle can be implemented as a
5204 /// logical left or right shift of a vector.
5205 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5206  bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5207  // Although the logic below support any bitwidth size, there are no
5208  // shift instructions which handle more than 128-bit vectors.
5209  if (!SVOp->getSimpleValueType(0).is128BitVector())
5210  return false;
5211 
5212  if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
5213  isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
5214  return true;
5215 
5216  return false;
5217 }
5218 
5219 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
5220 ///
5221 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5222  unsigned NumNonZero, unsigned NumZero,
5223  SelectionDAG &DAG,
5224  const X86Subtarget* Subtarget,
5225  const TargetLowering &TLI) {
5226  if (NumNonZero > 8)
5227  return SDValue();
5228 
5229  SDLoc dl(Op);
5230  SDValue V(0, 0);
5231  bool First = true;
5232  for (unsigned i = 0; i < 16; ++i) {
5233  bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5234  if (ThisIsNonZero && First) {
5235  if (NumZero)
5236  V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5237  else
5238  V = DAG.getUNDEF(MVT::v8i16);
5239  First = false;
5240  }
5241 
5242  if ((i & 1) != 0) {
5243  SDValue ThisElt(0, 0), LastElt(0, 0);
5244  bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5245  if (LastIsNonZero) {
5246  LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5247  MVT::i16, Op.getOperand(i-1));
5248  }
5249  if (ThisIsNonZero) {
5250  ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5251  ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5252  ThisElt, DAG.getConstant(8, MVT::i8));
5253  if (LastIsNonZero)
5254  ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5255  } else
5256  ThisElt = LastElt;
5257 
5258  if (ThisElt.getNode())
5259  V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5260  DAG.getIntPtrConstant(i/2));
5261  }
5262  }
5263 
5264  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
5265 }
5266 
5267 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
5268 ///
5269 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5270  unsigned NumNonZero, unsigned NumZero,
5271  SelectionDAG &DAG,
5272  const X86Subtarget* Subtarget,
5273  const TargetLowering &TLI) {
5274  if (NumNonZero > 4)
5275  return SDValue();
5276 
5277  SDLoc dl(Op);
5278  SDValue V(0, 0);
5279  bool First = true;
5280  for (unsigned i = 0; i < 8; ++i) {
5281  bool isNonZero = (NonZeros & (1 << i)) != 0;
5282  if (isNonZero) {
5283  if (First) {
5284  if (NumZero)
5285  V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5286  else
5287  V = DAG.getUNDEF(MVT::v8i16);
5288  First = false;
5289  }
5290  V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5291  MVT::v8i16, V, Op.getOperand(i),
5292  DAG.getIntPtrConstant(i));
5293  }
5294  }
5295 
5296  return V;
5297 }
5298 
5299 /// getVShift - Return a vector logical shift node.
5300 ///
5301 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
5302  unsigned NumBits, SelectionDAG &DAG,
5303  const TargetLowering &TLI, SDLoc dl) {
5304  assert(VT.is128BitVector() && "Unknown type for VShift");
5305  EVT ShVT = MVT::v2i64;
5306  unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
5307  SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
5308  return DAG.getNode(ISD::BITCAST, dl, VT,
5309  DAG.getNode(Opc, dl, ShVT, SrcOp,
5310  DAG.getConstant(NumBits,
5311  TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
5312 }
5313 
5314 static SDValue
5315 LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl, SelectionDAG &DAG) {
5316 
5317  // Check if the scalar load can be widened into a vector load. And if
5318  // the address is "base + cst" see if the cst can be "absorbed" into
5319  // the shuffle mask.
5320  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
5321  SDValue Ptr = LD->getBasePtr();
5322  if (!ISD::isNormalLoad(LD) || LD->isVolatile())
5323  return SDValue();
5324  EVT PVT = LD->getValueType(0);
5325  if (PVT != MVT::i32 && PVT != MVT::f32)
5326  return SDValue();
5327 
5328  int FI = -1;
5329  int64_t Offset = 0;
5330  if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
5331  FI = FINode->getIndex();
5332  Offset = 0;
5333  } else if (DAG.isBaseWithConstantOffset(Ptr) &&
5334  isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
5335  FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
5336  Offset = Ptr.getConstantOperandVal(1);
5337  Ptr = Ptr.getOperand(0);
5338  } else {
5339  return SDValue();
5340  }
5341 
5342  // FIXME: 256-bit vector instructions don't require a strict alignment,
5343  // improve this code to support it better.
5344  unsigned RequiredAlign = VT.getSizeInBits()/8;
5345  SDValue Chain = LD->getChain();
5346  // Make sure the stack object alignment is at least 16 or 32.
5347  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5348  if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
5349  if (MFI->isFixedObjectIndex(FI)) {
5350  // Can't change the alignment. FIXME: It's possible to compute
5351  // the exact stack offset and reference FI + adjust offset instead.
5352  // If someone *really* cares about this. That's the way to implement it.
5353  return SDValue();
5354  } else {
5355  MFI->setObjectAlignment(FI, RequiredAlign);
5356  }
5357  }
5358 
5359  // (Offset % 16 or 32) must be a multiple of 4. The address is then
5360  // Ptr + (Offset & ~15).
5361  if (Offset < 0)
5362  return SDValue();
5363  if ((Offset % RequiredAlign) & 3)
5364  return SDValue();
5365  int64_t StartOffset = Offset & ~(RequiredAlign-1);
5366  if (StartOffset)
5367  Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
5368  Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
5369 
5370  int EltNo = (Offset - StartOffset) >> 2;
5371  unsigned NumElems = VT.getVectorNumElements();
5372 
5373  EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
5374  SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
5375  LD->getPointerInfo().getWithOffset(StartOffset),
5376  false, false, false, 0);
5377 
5378  SmallVector<int, 8> Mask;
5379  for (unsigned i = 0; i != NumElems; ++i)
5380  Mask.push_back(EltNo);
5381 
5382  return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
5383  }
5384 
5385  return SDValue();
5386 }
5387 
5388 /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
5389 /// vector of type 'VT', see if the elements can be replaced by a single large
5390 /// load which has the same value as a build_vector whose operands are 'elts'.
5391 ///
5392 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
5393 ///
5394 /// FIXME: we'd also like to handle the case where the last elements are zero
5395 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
5396 /// There's even a handy isZeroNode for that purpose.
5397 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
5398  SDLoc &DL, SelectionDAG &DAG) {
5399  EVT EltVT = VT.getVectorElementType();
5400  unsigned NumElems = Elts.size();
5401 
5402  LoadSDNode *LDBase = NULL;
5403  unsigned LastLoadedElt = -1U;
5404 
5405  // For each element in the initializer, see if we've found a load or an undef.
5406  // If we don't find an initial load element, or later load elements are
5407  // non-consecutive, bail out.
5408  for (unsigned i = 0; i < NumElems; ++i) {
5409  SDValue Elt = Elts[i];
5410 
5411  if (!Elt.getNode() ||
5412  (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
5413  return SDValue();
5414  if (!LDBase) {
5415  if (Elt.getNode()->getOpcode() == ISD::UNDEF)
5416  return SDValue();
5417  LDBase = cast<LoadSDNode>(Elt.getNode());
5418  LastLoadedElt = i;
5419  continue;
5420  }
5421  if (Elt.getOpcode() == ISD::UNDEF)
5422  continue;
5423 
5424  LoadSDNode *LD = cast<LoadSDNode>(Elt);
5425  if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
5426  return SDValue();
5427  LastLoadedElt = i;
5428  }
5429 
5430  // If we have found an entire vector of loads and undefs, then return a large
5431  // load of the entire vector width starting at the base pointer. If we found
5432  // consecutive loads for the low half, generate a vzext_load node.
5433  if (LastLoadedElt == NumElems - 1) {
5434  SDValue NewLd = SDValue();
5435  if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
5436  NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
5437  LDBase->getPointerInfo(),
5438  LDBase->isVolatile(), LDBase->isNonTemporal(),
5439  LDBase->isInvariant(), 0);
5440  NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
5441  LDBase->getPointerInfo(),
5442  LDBase->isVolatile(), LDBase->isNonTemporal(),
5443  LDBase->isInvariant(), LDBase->getAlignment());
5444 
5445  if (LDBase->hasAnyUseOfValue(1)) {
5446  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
5447  SDValue(LDBase, 1),
5448  SDValue(NewLd.getNode(), 1));
5449  DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
5450  DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
5451  SDValue(NewLd.getNode(), 1));
5452  }
5453 
5454  return NewLd;
5455  }
5456  if (NumElems == 4 && LastLoadedElt == 1 &&
5457  DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
5458  SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
5459  SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
5460  SDValue ResNode =
5461  DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
5462  array_lengthof(Ops), MVT::i64,
5463  LDBase->getPointerInfo(),
5464  LDBase->getAlignment(),
5465  false/*isVolatile*/, true/*ReadMem*/,
5466  false/*WriteMem*/);
5467 
5468  // Make sure the newly-created LOAD is in the same position as LDBase in
5469  // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
5470  // update uses of LDBase's output chain to use the TokenFactor.
5471  if (LDBase->hasAnyUseOfValue(1)) {
5472  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
5473  SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
5474  DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
5475  DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
5476  SDValue(ResNode.getNode(), 1));
5477  }
5478 
5479  return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
5480  }
5481  return SDValue();
5482 }
5483 
5484 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
5485 /// to generate a splat value for the following cases:
5486 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
5487 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
5488 /// a scalar load, or a constant.
5489 /// The VBROADCAST node is returned when a pattern is found,
5490 /// or SDValue() otherwise.
5491 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
5492  SelectionDAG &DAG) {
5493  if (!Subtarget->hasFp256())
5494  return SDValue();
5495 
5496  MVT VT = Op.getSimpleValueType();
5497  SDLoc dl(Op);
5498 
5499  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5500  "Unsupported vector type for broadcast.");
5501 
5502  SDValue Ld;
5503  bool ConstSplatVal;
5504 
5505  switch (Op.getOpcode()) {
5506  default:
5507  // Unknown pattern found.
5508  return SDValue();
5509 
5510  case ISD::BUILD_VECTOR: {
5511  // The BUILD_VECTOR node must be a splat.
5512  if (!isSplatVector(Op.getNode()))
5513  return SDValue();
5514 
5515  Ld = Op.getOperand(0);
5516  ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
5517  Ld.getOpcode() == ISD::ConstantFP);
5518 
5519  // The suspected load node has several users. Make sure that all
5520  // of its users are from the BUILD_VECTOR node.
5521  // Constants may have multiple users.
5522  if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
5523  return SDValue();
5524  break;
5525  }
5526 
5527  case ISD::VECTOR_SHUFFLE: {
5528  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5529 
5530  // Shuffles must have a splat mask where the first element is
5531  // broadcasted.
5532  if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
5533  return SDValue();
5534 
5535  SDValue Sc = Op.getOperand(0);
5536  if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
5537  Sc.getOpcode() != ISD::BUILD_VECTOR) {
5538 
5539  if (!Subtarget->hasInt256())
5540  return SDValue();
5541 
5542  // Use the register form of the broadcast instruction available on AVX2.
5543  if (VT.getSizeInBits() >= 256)
5544  Sc = Extract128BitVector(Sc, 0, DAG, dl);
5545  return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
5546  }
5547 
5548  Ld = Sc.getOperand(0);
5549  ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
5550  Ld.getOpcode() == ISD::ConstantFP);
5551 
5552  // The scalar_to_vector node and the suspected
5553  // load node must have exactly one user.
5554  // Constants may have multiple users.
5555 
5556  // AVX-512 has register version of the broadcast
5557  bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
5558  Ld.getValueType().getSizeInBits() >= 32;
5559  if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
5560  !hasRegVer))
5561  return SDValue();
5562  break;
5563  }
5564  }
5565 
5566  bool IsGE256 = (VT.getSizeInBits() >= 256);
5567 
5568  // Handle broadcasting a single constant scalar from the constant pool
5569  // into a vector. On Sandybridge it is still better to load a constant vector
5570  // from the constant pool than to broadcast it from a scalar.
5571  if (ConstSplatVal && Subtarget->hasInt256()) {
5572  EVT CVT = Ld.getValueType();
5573  assert(!CVT.isVector() && "Must not broadcast a vector type");
5574  unsigned ScalarSize = CVT.getSizeInBits();
5575 
5576  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
5577  const Constant *C = 0;
5578  if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
5579  C = CI->getConstantIntValue();
5580  else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
5581  C = CF->getConstantFPValue();
5582 
5583  assert(C && "Invalid constant type");
5584 
5585  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5586  SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
5587  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
5588  Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
5589  MachinePointerInfo::getConstantPool(),
5590  false, false, false, Alignment);
5591 
5592  return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5593  }
5594  }
5595 
5596  bool IsLoad = ISD::isNormalLoad(Ld.getNode());
5597  unsigned ScalarSize = Ld.getValueType().getSizeInBits();
5598 
5599  // Handle AVX2 in-register broadcasts.
5600  if (!IsLoad && Subtarget->hasInt256() &&
5601  (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
5602  return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5603 
5604  // The scalar source must be a normal load.
5605  if (!IsLoad)
5606  return SDValue();
5607 
5608  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))
5609  return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5610 
5611  // The integer check is needed for the 64-bit into 128-bit case, so it doesn't
5612  // match double, since there is no vbroadcastsd xmm instruction.
5613  if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
5614  if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
5615  return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5616  }
5617 
5618  // Unsupported broadcast.
5619  return SDValue();
5620 }
5621 
5622 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
5623  MVT VT = Op.getSimpleValueType();
5624 
5625  // Skip if insert_vec_elt is not supported.
5626  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5627  if (!TLI.isOperationLegal(ISD::INSERT_VECTOR_ELT, VT))
5628  return SDValue();
5629 
5630  SDLoc DL(Op);
5631  unsigned NumElems = Op.getNumOperands();
5632 
5633  SDValue VecIn1;
5634  SDValue VecIn2;
5635  SmallVector<unsigned, 4> InsertIndices;
5636  SmallVector<int, 8> Mask(NumElems, -1);
5637 
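  // VecIn1/VecIn2 are the (at most two) source vectors feeding the
  // EXTRACT_VECTOR_ELT operands; Mask accumulates the shuffle mask, and
  // InsertIndices records the few elements that must be inserted afterwards.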
5638  for (unsigned i = 0; i != NumElems; ++i) {
5639  unsigned Opc = Op.getOperand(i).getOpcode();
5640 
5641  if (Opc == ISD::UNDEF)
5642  continue;
5643 
5644  if (Opc != ISD::EXTRACT_VECTOR_ELT) {
5645  // Quit if more than 1 element needs inserting.
5646  if (InsertIndices.size() > 1)
5647  return SDValue();
5648 
5649  InsertIndices.push_back(i);
5650  continue;
5651  }
5652 
5653  SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
5654  SDValue ExtIdx = Op.getOperand(i).getOperand(1);
5655 
5656  // Quit if extracted from vector of different type.
5657  if (ExtractedFromVec.getValueType() != VT)
5658  return SDValue();
5659 
5660  // Quit if non-constant index.
5661  if (!isa<ConstantSDNode>(ExtIdx))
5662  return SDValue();
5663 
5664  if (VecIn1.getNode() == 0)
5665  VecIn1 = ExtractedFromVec;
5666  else if (VecIn1 != ExtractedFromVec) {
5667  if (VecIn2.getNode() == 0)
5668  VecIn2 = ExtractedFromVec;
5669  else if (VecIn2 != ExtractedFromVec)
5670  // Quit if more than 2 vectors to shuffle
5671  return SDValue();
5672  }
5673 
5674  unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
5675 
5676  if (ExtractedFromVec == VecIn1)
5677  Mask[i] = Idx;
5678  else if (ExtractedFromVec == VecIn2)
5679  Mask[i] = Idx + NumElems;
5680  }
5681 
5682  if (VecIn1.getNode() == 0)
5683  return SDValue();
5684 
5685  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
5686  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
5687  for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
5688  unsigned Idx = InsertIndices[i];
5689  NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
5690  DAG.getIntPtrConstant(Idx));
5691  }
5692 
5693  return NV;
5694 }
5695 
5696 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
5697 SDValue
5698 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
5699 
5700  MVT VT = Op.getSimpleValueType();
5701  assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
5702  "Unexpected type in LowerBUILD_VECTORvXi1!");
5703 
5704  SDLoc dl(Op);
5705  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
5706  SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
5707  SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
5708  Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5709  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
5710  Ops, VT.getVectorNumElements());
5711  }
5712 
5713  if (ISD::isBuildVectorAllOnes(Op.getNode())) {
5714  SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
5715  SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
5716  Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5717  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
5718  Ops, VT.getVectorNumElements());
5719  }
5720 
5721  bool AllContants = true;
5722  uint64_t Immediate = 0;
5723  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
5724  SDValue In = Op.getOperand(idx);
5725  if (In.getOpcode() == ISD::UNDEF)
5726  continue;
5727  if (!isa<ConstantSDNode>(In)) {
5728  AllContants = false;
5729  break;
5730  }
5731  if (cast<ConstantSDNode>(In)->getZExtValue())
5732  Immediate |= (1ULL << idx);
5733  }
5734 
5735  if (AllContants) {
5736  SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
5737  DAG.getConstant(Immediate, MVT::i16));
5738  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
5739  DAG.getIntPtrConstant(0));
5740  }
5741 
5742  // Splat vector (with undefs)
5743  SDValue In = Op.getOperand(0);
5744  for (unsigned i = 1, e = Op.getNumOperands(); i != e; ++i) {
5745  if (Op.getOperand(i) != In && Op.getOperand(i).getOpcode() != ISD::UNDEF)
5746  llvm_unreachable("Unsupported predicate operation");
5747  }
5748 
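  // At this point the build_vector is a splat of the single value In (modulo
  // undefs); materialize the mask as all-ones or all-zeros with a CMOV keyed
  // on the condition that computes In.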
5749  SDValue EFLAGS, X86CC;
5750  if (In.getOpcode() == ISD::SETCC) {
5751  SDValue Op0 = In.getOperand(0);
5752  SDValue Op1 = In.getOperand(1);
5753  ISD::CondCode CC = cast<CondCodeSDNode>(In.getOperand(2))->get();
5754  bool isFP = Op1.getValueType().isFloatingPoint();
5755  unsigned X86CCVal = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
5756 
5757  assert(X86CCVal != X86::COND_INVALID && "Unsupported predicate operation");
5758 
5759  X86CC = DAG.getConstant(X86CCVal, MVT::i8);
5760  EFLAGS = EmitCmp(Op0, Op1, X86CCVal, DAG);
5761  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
5762  } else if (In.getOpcode() == X86ISD::SETCC) {
5763  X86CC = In.getOperand(0);
5764  EFLAGS = In.getOperand(1);
5765  } else {
5766  // The algorithm:
5767  // Bit1 = In & 0x1
5768  // if (Bit1 != 0)
5769  // ZF = 0
5770  // else
5771  // ZF = 1
5772  // if (ZF == 0)
5773  // res = allOnes ### CMOVNE -1, %res
5774  // else
5775  // res = allZero
5776  MVT InVT = In.getSimpleValueType();
5777  SDValue Bit1 = DAG.getNode(ISD::AND, dl, InVT, In, DAG.getConstant(1, InVT));
5778  EFLAGS = EmitTest(Bit1, X86::COND_NE, DAG);
5779  X86CC = DAG.getConstant(X86::COND_NE, MVT::i8);
5780  }
5781 
5782  if (VT == MVT::v16i1) {
5783  SDValue Cst1 = DAG.getConstant(-1, MVT::i16);
5784  SDValue Cst0 = DAG.getConstant(0, MVT::i16);
5785  SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i16,
5786  Cst0, Cst1, X86CC, EFLAGS);
5787  return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp);
5788  }
5789 
5790  if (VT == MVT::v8i1) {
5791  SDValue Cst1 = DAG.getConstant(-1, MVT::i32);
5792  SDValue Cst0 = DAG.getConstant(0, MVT::i32);
5793  SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i32,
5794  Cst0, Cst1, X86CC, EFLAGS);
5795  CmovOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CmovOp);
5796  return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp);
5797  }
5798  llvm_unreachable("Unsupported predicate operation");
5799 }
5800 
5801 SDValue
5802 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
5803  SDLoc dl(Op);
5804 
5805  MVT VT = Op.getSimpleValueType();
5806  MVT ExtVT = VT.getVectorElementType();
5807  unsigned NumElems = Op.getNumOperands();
5808 
5809  // Generate vectors for predicate vectors.
5810  if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
5811  return LowerBUILD_VECTORvXi1(Op, DAG);
5812 
5813  // Vectors containing all zeros can be matched by pxor and xorps later
5814  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
5815  // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
5816  // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
5817  if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
5818  return Op;
5819 
5820  return getZeroVector(VT, Subtarget, DAG, dl);
5821  }
5822 
5823  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
5824  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
5825  // vpcmpeqd on 256-bit vectors.
5826  if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
5827  if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
5828  return Op;
5829 
5830  if (!VT.is512BitVector())
5831  return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
5832  }
5833 
5834  SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
5835  if (Broadcast.getNode())
5836  return Broadcast;
5837 
5838  unsigned EVTBits = ExtVT.getSizeInBits();
5839 
5840  unsigned NumZero = 0;
5841  unsigned NumNonZero = 0;
5842  unsigned NonZeros = 0;
5843  bool IsAllConstants = true;
5844  SmallSet<SDValue, 8> Values;
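  // Classify the operands: count zero and non-zero elements, record the
  // non-zero positions in the NonZeros bitmask, and note whether every
  // element is a constant.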
5845  for (unsigned i = 0; i < NumElems; ++i) {
5846  SDValue Elt = Op.getOperand(i);
5847  if (Elt.getOpcode() == ISD::UNDEF)
5848  continue;
5849  Values.insert(Elt);
5850  if (Elt.getOpcode() != ISD::Constant &&
5851  Elt.getOpcode() != ISD::ConstantFP)
5852  IsAllConstants = false;
5853  if (X86::isZeroNode(Elt))
5854  NumZero++;
5855  else {
5856  NonZeros |= (1 << i);
5857  NumNonZero++;
5858  }
5859  }
5860 
5861  // All undef vector. Return an UNDEF. All zero vectors were handled above.
5862  if (NumNonZero == 0)
5863  return DAG.getUNDEF(VT);
5864 
5865  // Special case for single non-zero, non-undef, element.
5866  if (NumNonZero == 1) {
5867  unsigned Idx = countTrailingZeros(NonZeros);
5868  SDValue Item = Op.getOperand(Idx);
5869 
5870  // If this is an insertion of an i64 value on x86-32, and if the top bits of
5871  // the value are obviously zero, truncate the value to i32 and do the
5872  // insertion that way. Only do this if the value is non-constant or if the
5873  // value is a constant being inserted into element 0. It is cheaper to do
5874  // a constant pool load than it is to do a movd + shuffle.
5875  if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
5876  (!IsAllConstants || Idx == 0)) {
5877  if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
5878  // Handle SSE only.
5879  assert(VT == MVT::v2i64 && "Expected an SSE value type!");
5880  EVT VecVT = MVT::v4i32;
5881  unsigned VecElts = 4;
5882 
5883  // Truncate the value (which may itself be a constant) to i32, and
5884  // convert it to a vector with movd (S2V+shuffle to zero extend).
5885  Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
5886  Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
5887  Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5888 
5889  // Now we have our 32-bit value zero extended in the low element of
5890  // a vector. If Idx != 0, swizzle it into place.
5891  if (Idx != 0) {
5892  SmallVector<int, 4> Mask;
5893  Mask.push_back(Idx);
5894  for (unsigned i = 1; i != VecElts; ++i)
5895  Mask.push_back(i);
5896  Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
5897  &Mask[0]);
5898  }
5899  return DAG.getNode(ISD::BITCAST, dl, VT, Item);
5900  }
5901  }
5902 
5903  // If we have a constant or non-constant insertion into the low element of
5904  // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
5905  // the rest of the elements. This will be matched as movd/movq/movss/movsd
5906  // depending on what the source datatype is.
5907  if (Idx == 0) {
5908  if (NumZero == 0)
5909  return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5910 
5911  if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
5912  (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
5913  if (VT.is256BitVector() || VT.is512BitVector()) {
5914  SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
5915  return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
5916  Item, DAG.getIntPtrConstant(0));
5917  }
5918  assert(VT.is128BitVector() && "Expected an SSE value type!");
5919  Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5920  // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
5921  return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5922  }
5923 
5924  if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
5925  Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
5926  Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
5927  if (VT.is256BitVector()) {
5928  SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
5929  Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
5930  } else {
5931  assert(VT.is128BitVector() && "Expected an SSE value type!");
5932  Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5933  }
5934  return DAG.getNode(ISD::BITCAST, dl, VT, Item);
5935  }
5936  }
5937 
5938  // Is it a vector logical left shift?
5939  if (NumElems == 2 && Idx == 1 &&
5940  X86::isZeroNode(Op.getOperand(0)) &&
5941  !X86::isZeroNode(Op.getOperand(1))) {
5942  unsigned NumBits = VT.getSizeInBits();
5943  return getVShift(true, VT,
5944  DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
5945  VT, Op.getOperand(1)),
5946  NumBits/2, DAG, *this, dl);
5947  }
5948 
5949  if (IsAllConstants) // Otherwise, it's better to do a constpool load.
5950  return SDValue();
5951 
5952  // Otherwise, if this is a vector with i32 or f32 elements, and the element
5953  // is a non-constant being inserted into an element other than the low one,
5954  // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
5955  // movd/movss) to move this into the low element, then shuffle it into
5956  // place.
5957  if (EVTBits == 32) {
5958  Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5959 
5960  // Turn it into a shuffle of zero and zero-extended scalar to vector.
5961  Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
5962  SmallVector<int, 8> MaskVec;
5963  for (unsigned i = 0; i != NumElems; ++i)
5964  MaskVec.push_back(i == Idx ? 0 : 1);
5965  return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
5966  }
5967  }
5968 
5969  // Splat is obviously ok. Let legalizer expand it to a shuffle.
5970  if (Values.size() == 1) {
5971  if (EVTBits == 32) {
5972  // Instead of a shuffle like this:
5973  // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
5974  // Check if it's possible to issue this instead.
5975  // shuffle (vload ptr)), undef, <1, 1, 1, 1>
5976  unsigned Idx = countTrailingZeros(NonZeros);
5977  SDValue Item = Op.getOperand(Idx);
5978  if (Op.getNode()->isOnlyUserOf(Item.getNode()))
5979  return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
5980  }
5981  return SDValue();
5982  }
5983 
5984  // A vector full of immediates; various special cases are already
5985  // handled, so this is best done with a single constant-pool load.
5986  if (IsAllConstants)
5987  return SDValue();
5988 
5989  // For AVX-length vectors, build the individual 128-bit pieces and use
5990  // shuffles to put them in place.
5991  if (VT.is256BitVector()) {
5992  SmallVector<SDValue, 32> V;
5993  for (unsigned i = 0; i != NumElems; ++i)
5994  V.push_back(Op.getOperand(i));
5995 
5996  EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
5997 
5998  // Build both the lower and upper subvector.
5999  SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
6000  SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
6001  NumElems/2);
6002 
6003  // Recreate the wider vector with the lower and upper part.
6004  return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
6005  }
6006 
6007  // Let legalizer expand 2-wide build_vectors.
6008  if (EVTBits == 64) {
6009  if (NumNonZero == 1) {
6010  // One half is zero or undef.
6011  unsigned Idx = countTrailingZeros(NonZeros);
6012  SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
6013  Op.getOperand(Idx));
6014  return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
6015  }
6016  return SDValue();
6017  }
6018 
6019  // If element VT is < 32 bits, convert it to inserts into a zero vector.
6020  if (EVTBits == 8 && NumElems == 16) {
6021  SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
6022  Subtarget, *this);
6023  if (V.getNode()) return V;
6024  }
6025 
6026  if (EVTBits == 16 && NumElems == 8) {
6027  SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
6028  Subtarget, *this);
6029  if (V.getNode()) return V;
6030  }
6031 
6032  // If element VT is == 32 bits, turn it into a number of shuffles.
6033  SmallVector<SDValue, 8> V(NumElems);
6034  if (NumElems == 4 && NumZero > 0) {
6035  for (unsigned i = 0; i < 4; ++i) {
6036  bool isZero = !(NonZeros & (1 << i));
6037  if (isZero)
6038  V[i] = getZeroVector(VT, Subtarget, DAG, dl);
6039  else
6040  V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
6041  }
6042 
6043  for (unsigned i = 0; i < 2; ++i) {
6044  switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
6045  default: break;
6046  case 0:
6047  V[i] = V[i*2]; // Must be a zero vector.
6048  break;
6049  case 1:
6050  V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
6051  break;
6052  case 2:
6053  V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
6054  break;
6055  case 3:
6056  V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
6057  break;
6058  }
6059  }
6060 
6061  bool Reverse1 = (NonZeros & 0x3) == 2;
6062  bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
6063  int MaskVec[] = {
6064  Reverse1 ? 1 : 0,
6065  Reverse1 ? 0 : 1,
6066  static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
6067  static_cast<int>(Reverse2 ? NumElems : NumElems+1)
6068  };
6069  return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
6070  }
6071 
6072  if (Values.size() > 1 && VT.is128BitVector()) {
6073  // Check for a build vector of consecutive loads.
6074  for (unsigned i = 0; i < NumElems; ++i)
6075  V[i] = Op.getOperand(i);
6076 
6077  // Check for elements which are consecutive loads.
6078  SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
6079  if (LD.getNode())
6080  return LD;
6081 
6082  // Check for a build vector from mostly shuffle plus few inserting.
6083  SDValue Sh = buildFromShuffleMostly(Op, DAG);
6084  if (Sh.getNode())
6085  return Sh;
6086 
6087  // For SSE 4.1, use insertps to put the high elements into the low element.
6088  if (getSubtarget()->hasSSE41()) {
6089  SDValue Result;
6090  if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
6091  Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
6092  else
6093  Result = DAG.getUNDEF(VT);
6094 
6095  for (unsigned i = 1; i < NumElems; ++i) {
6096  if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
6097  Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
6098  Op.getOperand(i), DAG.getIntPtrConstant(i));
6099  }
6100  return Result;
6101  }
6102 
6103  // Otherwise, expand into a number of unpckl*, start by extending each of
6104  // our (non-undef) elements to the full vector width with the element in the
6105  // bottom slot of the vector (which generates no code for SSE).
6106  for (unsigned i = 0; i < NumElems; ++i) {
6107  if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
6108  V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
6109  else
6110  V[i] = DAG.getUNDEF(VT);
6111  }
6112 
6113  // Next, we iteratively mix elements, e.g. for v4f32:
6114  // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
6115  // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
6116  // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
6117  unsigned EltStride = NumElems >> 1;
6118  while (EltStride != 0) {
6119  for (unsigned i = 0; i < EltStride; ++i) {
6120  // If V[i+EltStride] is undef and this is the first round of mixing,
6121  // then it is safe to just drop this shuffle: V[i] is already in the
6122  // right place, the one element (since it's the first round) being
6123  // inserted as undef can be dropped. This isn't safe for successive
6124  // rounds because they will permute elements within both vectors.
6125  if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
6126  EltStride == NumElems/2)
6127  continue;
6128 
6129  V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
6130  }
6131  EltStride >>= 1;
6132  }
6133  return V[0];
6134  }
6135  return SDValue();
6136 }
6137 
6138 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
6139 // to create 256-bit vectors from two other 128-bit ones.
6140 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
6141  SDLoc dl(Op);
6142  MVT ResVT = Op.getSimpleValueType();
6143 
6144  assert((ResVT.is256BitVector() ||
6145  ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
6146 
6147  SDValue V1 = Op.getOperand(0);
6148  SDValue V2 = Op.getOperand(1);
6149  unsigned NumElems = ResVT.getVectorNumElements();
6150  if(ResVT.is256BitVector())
6151  return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
6152 
6153  return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
6154 }
6155 
6156 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
6157  assert(Op.getNumOperands() == 2);
6158 
6159  // AVX/AVX-512 can use the vinsertf128 instruction to create 256-bit vectors
6160  // from two other 128-bit ones.
6161  return LowerAVXCONCAT_VECTORS(Op, DAG);
6162 }
6163 
6164 // Try to lower a shuffle node into a simple blend instruction.
6165 static SDValue
6166 LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
6167  const X86Subtarget *Subtarget, SelectionDAG &DAG) {
6168  SDValue V1 = SVOp->getOperand(0);
6169  SDValue V2 = SVOp->getOperand(1);
6170  SDLoc dl(SVOp);
6171  MVT VT = SVOp->getSimpleValueType(0);
6172  MVT EltVT = VT.getVectorElementType();
6173  unsigned NumElems = VT.getVectorNumElements();
6174 
6175  // There is no blend with immediate in AVX-512.
6176  if (VT.is512BitVector())
6177  return SDValue();
6178 
6179  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
6180  return SDValue();
6181  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
6182  return SDValue();
6183 
6184  // Check the mask for BLEND and build the value.
6185  unsigned MaskValue = 0;
6186  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
6187  unsigned NumLanes = (NumElems-1)/8 + 1;
6188  unsigned NumElemsInLane = NumElems / NumLanes;
6189 
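  // Bit i of MaskValue is set when result element i should be taken from V2;
  // with two 128-bit lanes (v16i16) the same per-lane pattern must hold in
  // both lanes or we bail out.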
6190  // Blend for v16i16 should be symmetric for both lanes.
6191  for (unsigned i = 0; i < NumElemsInLane; ++i) {
6192 
6193  int SndLaneEltIdx = (NumLanes == 2) ?
6194  SVOp->getMaskElt(i + NumElemsInLane) : -1;
6195  int EltIdx = SVOp->getMaskElt(i);
6196 
6197  if ((EltIdx < 0 || EltIdx == (int)i) &&
6198  (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
6199  continue;
6200 
6201  if (((unsigned)EltIdx == (i + NumElems)) &&
6202  (SndLaneEltIdx < 0 ||
6203  (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
6204  MaskValue |= (1<<i);
6205  else
6206  return SDValue();
6207  }
6208 
6209  // Convert i32 vectors to floating point if it is not AVX2.
6210  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
6211  MVT BlendVT = VT;
6212  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
6213  BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
6214  NumElems);
6215  V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
6216  V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
6217  }
6218 
6219  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
6220  DAG.getConstant(MaskValue, MVT::i32));
6221  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
6222 }
6223 
6224 // v8i16 shuffles - Prefer shuffles in the following order:
6225 // 1. [all] pshuflw, pshufhw, optional move
6226 // 2. [ssse3] 1 x pshufb
6227 // 3. [ssse3] 2 x pshufb + 1 x por
6228 // 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
6229 static SDValue
6230 LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
6231  SelectionDAG &DAG) {
6232  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6233  SDValue V1 = SVOp->getOperand(0);
6234  SDValue V2 = SVOp->getOperand(1);
6235  SDLoc dl(SVOp);
6236  SmallVector<int, 8> MaskVals;
6237 
6238  // Determine if more than 1 of the words in each of the low and high quadwords
6239  // of the result come from the same quadword of one of the two inputs. Undef
6240  // mask values count as coming from any quadword, for better codegen.
6241  unsigned LoQuad[] = { 0, 0, 0, 0 };
6242  unsigned HiQuad[] = { 0, 0, 0, 0 };
6243  std::bitset<4> InputQuads;
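  // LoQuad/HiQuad count how many of the four low/high result words could come
  // from each of the four input quadwords; InputQuads remembers which input
  // quadwords are referenced at all.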
6244  for (unsigned i = 0; i < 8; ++i) {
6245  unsigned *Quad = i < 4 ? LoQuad : HiQuad;
6246  int EltIdx = SVOp->getMaskElt(i);
6247  MaskVals.push_back(EltIdx);
6248  if (EltIdx < 0) {
6249  ++Quad[0];
6250  ++Quad[1];
6251  ++Quad[2];
6252  ++Quad[3];
6253  continue;
6254  }
6255  ++Quad[EltIdx / 4];
6256  InputQuads.set(EltIdx / 4);
6257  }
6258 
6259  int BestLoQuad = -1;
6260  unsigned MaxQuad = 1;
6261  for (unsigned i = 0; i < 4; ++i) {
6262  if (LoQuad[i] > MaxQuad) {
6263  BestLoQuad = i;
6264  MaxQuad = LoQuad[i];
6265  }
6266  }
6267 
6268  int BestHiQuad = -1;
6269  MaxQuad = 1;
6270  for (unsigned i = 0; i < 4; ++i) {
6271  if (HiQuad[i] > MaxQuad) {
6272  BestHiQuad = i;
6273  MaxQuad = HiQuad[i];
6274  }
6275  }
6276 
6277  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
6278  // of the two input vectors, shuffle them into one input vector so only a
6279  // single pshufb instruction is necessary. If there are more than 2 input
6280  // quads, disable the next transformation since it does not help SSSE3.
6281  bool V1Used = InputQuads[0] || InputQuads[1];
6282  bool V2Used = InputQuads[2] || InputQuads[3];
6283  if (Subtarget->hasSSSE3()) {
6284  if (InputQuads.count() == 2 && V1Used && V2Used) {
6285  BestLoQuad = InputQuads[0] ? 0 : 1;
6286  BestHiQuad = InputQuads[2] ? 2 : 3;
6287  }
6288  if (InputQuads.count() > 2) {
6289  BestLoQuad = -1;
6290  BestHiQuad = -1;
6291  }
6292  }
6293 
6294  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
6295  // the shuffle mask. If a quad is scored as -1, that means that it contains
6296  // words from all 4 input quadwords.
6297  SDValue NewV;
6298  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
6299  int MaskV[] = {
6300  BestLoQuad < 0 ? 0 : BestLoQuad,
6301  BestHiQuad < 0 ? 1 : BestHiQuad
6302  };
6303  NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
6304  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
6305  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
6306  NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
6307 
6308  // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
6309  // source words for the shuffle, to aid later transformations.
6310  bool AllWordsInNewV = true;
6311  bool InOrder[2] = { true, true };
6312  for (unsigned i = 0; i != 8; ++i) {
6313  int idx = MaskVals[i];
6314  if (idx != (int)i)
6315  InOrder[i/4] = false;
6316  if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
6317  continue;
6318  AllWordsInNewV = false;
6319  break;
6320  }
6321 
6322  bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
6323  if (AllWordsInNewV) {
6324  for (int i = 0; i != 8; ++i) {
6325  int idx = MaskVals[i];
6326  if (idx < 0)
6327  continue;
6328  idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
6329  if ((idx != i) && idx < 4)
6330  pshufhw = false;
6331  if ((idx != i) && idx > 3)
6332  pshuflw = false;
6333  }
6334  V1 = NewV;
6335  V2Used = false;
6336  BestLoQuad = 0;
6337  BestHiQuad = 1;
6338  }
6339 
6340  // If we've eliminated the use of V2, and the new mask is a pshuflw or
6341  // pshufhw, that's as cheap as it gets. Return the new shuffle.
6342  if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
6343  unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
6344  unsigned TargetMask = 0;
6345  NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
6346  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
6347  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
6348  TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
6349  getShufflePSHUFLWImmediate(SVOp);
6350  V1 = NewV.getOperand(0);
6351  return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
6352  }
6353  }
6354 
6355  // Promote splats to a larger type which usually leads to more efficient code.
6356  // FIXME: Is this true if pshufb is available?
6357  if (SVOp->isSplat())
6358  return PromoteSplat(SVOp, DAG);
6359 
6360  // If we have SSSE3, and all words of the result are from 1 input vector,
6361  // case 2 is generated, otherwise case 3 is generated. If no SSSE3
6362  // is present, fall back to case 4.
6363  if (Subtarget->hasSSSE3()) {
6364  SmallVector<SDValue,16> pshufbMask;
6365 
6366  // If we have elements from both input vectors, set the high bit of the
6367  // shuffle mask element to zero out elements that come from V2 in the V1
6368  // mask, and elements that come from V1 in the V2 mask, so that the two
6369  // results can be OR'd together.
6370  bool TwoInputs = V1Used && V2Used;
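  // Each v8i16 mask element expands to two adjacent byte indices in the
  // v16i8 pshufb mask; an index of 0x80 makes pshufb write zero to that byte.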
6371  for (unsigned i = 0; i != 8; ++i) {
6372  int EltIdx = MaskVals[i] * 2;
6373  int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx;
6374  int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1;
6375  pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
6376  pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
6377  }
6378  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
6379  V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
6380  DAG.getNode(ISD::BUILD_VECTOR, dl,
6381  MVT::v16i8, &pshufbMask[0], 16));
6382  if (!TwoInputs)
6383  return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
6384 
6385  // Calculate the shuffle mask for the second input, shuffle it, and
6386  // OR it with the first shuffled input.
6387  pshufbMask.clear();
6388  for (unsigned i = 0; i != 8; ++i) {
6389  int EltIdx = MaskVals[i] * 2;
6390  int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16;
6391  int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15;
6392  pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
6393  pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
6394  }
6395  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
6396  V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
6397  DAG.getNode(ISD::BUILD_VECTOR, dl,
6398  MVT::v16i8, &pshufbMask[0], 16));
6399  V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
6400  return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
6401  }
6402 
6403  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
6404  // and update MaskVals with new element order.
6405  std::bitset<8> InOrder;
6406  if (BestLoQuad >= 0) {
6407  int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
6408  for (int i = 0; i != 4; ++i) {
6409  int idx = MaskVals[i];
6410  if (idx < 0) {
6411  InOrder.set(i);
6412  } else if ((idx / 4) == BestLoQuad) {
6413  MaskV[i] = idx & 3;
6414  InOrder.set(i);
6415  }
6416  }
6417  NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
6418  &MaskV[0]);
6419 
6420  if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
6421  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
6422  NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
6423  NewV.getOperand(0),
6424  getShufflePSHUFLWImmediate(SVOp), DAG);
6425  }
6426  }
6427 
6428  // If BestHi >= 0, generate a pshufhw to put the high elements in order,
6429  // and update MaskVals with the new element order.
6430  if (BestHiQuad >= 0) {
6431  int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
6432  for (unsigned i = 4; i != 8; ++i) {
6433  int idx = MaskVals[i];
6434  if (idx < 0) {
6435  InOrder.set(i);
6436  } else if ((idx / 4) == BestHiQuad) {
6437  MaskV[i] = (idx & 3) + 4;
6438  InOrder.set(i);
6439  }
6440  }
6441  NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
6442  &MaskV[0]);
6443 
6444  if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
6445  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
6446  NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
6447  NewV.getOperand(0),
6448  getShufflePSHUFHWImmediate(SVOp), DAG);
6449  }
6450  }
6451 
6452  // In case BestHi & BestLo were both -1, which means each quadword has a word
6453  // from each of the four input quadwords, calculate the InOrder bitvector now
6454  // before falling through to the insert/extract cleanup.
6455  if (BestLoQuad == -1 && BestHiQuad == -1) {
6456  NewV = V1;
6457  for (int i = 0; i != 8; ++i)
6458  if (MaskVals[i] < 0 || MaskVals[i] == i)
6459  InOrder.set(i);
6460  }
6461 
6462  // The other elements are put in the right place using pextrw and pinsrw.
6463  for (unsigned i = 0; i != 8; ++i) {
6464  if (InOrder[i])
6465  continue;
6466  int EltIdx = MaskVals[i];
6467  if (EltIdx < 0)
6468  continue;
6469  SDValue ExtOp = (EltIdx < 8) ?
6470  DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
6471  DAG.getIntPtrConstant(EltIdx)) :
6472  DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
6473  DAG.getIntPtrConstant(EltIdx - 8));
6474  NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
6475  DAG.getIntPtrConstant(i));
6476  }
6477  return NewV;
6478 }
6479 
6480 // v16i8 shuffles - Prefer shuffles in the following order:
6481 // 1. [ssse3] 1 x pshufb
6482 // 2. [ssse3] 2 x pshufb + 1 x por
6483 // 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw
6484 static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
6485  const X86Subtarget* Subtarget,
6486  SelectionDAG &DAG) {
6487  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6488  SDValue V1 = SVOp->getOperand(0);
6489  SDValue V2 = SVOp->getOperand(1);
6490  SDLoc dl(SVOp);
6491  ArrayRef<int> MaskVals = SVOp->getMask();
6492 
6493  // Promote splats to a larger type which usually leads to more efficient code.
6494  // FIXME: Is this true if pshufb is available?
6495  if (SVOp->isSplat())
6496  return PromoteSplat(SVOp, DAG);
6497 
6498  // If we have SSSE3, case 1 is generated when all result bytes come from
6499  // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
6500  // present, fall back to case 3.
6501 
6502  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
6503  if (Subtarget->hasSSSE3()) {
6504  SmallVector<SDValue,16> pshufbMask;
6505 
6506  // If all result elements are from one input vector, then only translate
6507  // undef mask values to 0x80 (zero out result) in the pshufb mask.
6508  //
6509  // Otherwise, we have elements from both input vectors, and must zero out
6510  // elements that come from V2 in the first mask, and V1 in the second mask
6511  // so that we can OR them together.
6512  for (unsigned i = 0; i != 16; ++i) {
6513  int EltIdx = MaskVals[i];
6514  if (EltIdx < 0 || EltIdx >= 16)
6515  EltIdx = 0x80;
6516  pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
6517  }
6518  V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
6519  DAG.getNode(ISD::BUILD_VECTOR, dl,
6520  MVT::v16i8, &pshufbMask[0], 16));
6521 
6522  // As PSHUFB will zero elements with negative indices, it's safe to ignore
6523  // the 2nd operand if it's undefined or zero.
6524  if (V2.getOpcode() == ISD::UNDEF ||
6525  ISD::isBuildVectorAllZeros(V2.getNode()))
6526  return V1;
6527 
6528  // Calculate the shuffle mask for the second input, shuffle it, and
6529  // OR it with the first shuffled input.
6530  pshufbMask.clear();
6531  for (unsigned i = 0; i != 16; ++i) {
6532  int EltIdx = MaskVals[i];
6533  EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
6534  pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
6535  }
6536  V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
6537  DAG.getNode(ISD::BUILD_VECTOR, dl,
6538  MVT::v16i8, &pshufbMask[0], 16));
6539  return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
6540  }
6541 
6542  // No SSSE3 - Calculate the in-place words and then fix all out-of-place words
6543  // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from
6544  // the 16 different words that comprise the two doublequadword input vectors.
6545  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
6546  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
6547  SDValue NewV = V1;
6548  for (int i = 0; i != 8; ++i) {
6549  int Elt0 = MaskVals[i*2];
6550  int Elt1 = MaskVals[i*2+1];
6551 
6552  // This word of the result is all undef, skip it.
6553  if (Elt0 < 0 && Elt1 < 0)
6554  continue;
6555 
6556  // This word of the result is already in the correct place, skip it.
6557  if ((Elt0 == i*2) && (Elt1 == i*2+1))
6558  continue;
6559 
6560  SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
6561  SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
6562  SDValue InsElt;
6563 
6564  // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
6565  // together using a single extract, load it and store it.
6566  if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
6567  InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
6568  DAG.getIntPtrConstant(Elt1 / 2));
6569  NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
6570  DAG.getIntPtrConstant(i));
6571  continue;
6572  }
6573 
6574  // If Elt1 is defined, extract it from the appropriate source. If the
6575  // source byte is not also odd, shift the extracted word left 8 bits
6576  // otherwise clear the bottom 8 bits if we need to do an or.
6577  if (Elt1 >= 0) {
6578  InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
6579  DAG.getIntPtrConstant(Elt1 / 2));
6580  if ((Elt1 & 1) == 0)
6581  InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
6582  DAG.getConstant(8,
6583  TLI.getShiftAmountTy(InsElt.getValueType())));
6584  else if (Elt0 >= 0)
6585  InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
6586  DAG.getConstant(0xFF00, MVT::i16));
6587  }
6588  // If Elt0 is defined, extract it from the appropriate source. If the
6589  // source byte is not also even, shift the extracted word right 8 bits. If
6590  // Elt1 was also defined, OR the extracted values together before
6591  // inserting them in the result.
6592  if (Elt0 >= 0) {
6593  SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
6594  Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
6595  if ((Elt0 & 1) != 0)
6596  InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
6597  DAG.getConstant(8,
6598  TLI.getShiftAmountTy(InsElt0.getValueType())));
6599  else if (Elt1 >= 0)
6600  InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
6601  DAG.getConstant(0x00FF, MVT::i16));
6602  InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
6603  : InsElt0;
6604  }
6605  NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
6606  DAG.getIntPtrConstant(i));
6607  }
6608  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
6609 }
6610 
6611 // v32i8 shuffles - Translate to VPSHUFB if possible.
6612 static
6613 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
6614  const X86Subtarget *Subtarget,
6615  SelectionDAG &DAG) {
6616  MVT VT = SVOp->getSimpleValueType(0);
6617  SDValue V1 = SVOp->getOperand(0);
6618  SDValue V2 = SVOp->getOperand(1);
6619  SDLoc dl(SVOp);
6620  SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
6621 
6622  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
6623  bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
6624  bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
6625 
6626  // VPSHUFB may be generated if
6627  // (1) one of the input vectors is undefined or a zeroinitializer.
6628  // The mask value 0x80 puts 0 in the corresponding slot of the vector.
6629  // And (2) the mask indexes don't cross the 128-bit lane boundary.
6630  if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
6631  (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
6632  return SDValue();
6633 
6634  if (V1IsAllZero && !V2IsAllZero) {
6635  CommuteVectorShuffleMask(MaskVals, 32);
6636  V1 = V2;
6637  }
6638  SmallVector<SDValue, 32> pshufbMask;
6639  for (unsigned i = 0; i != 32; i++) {
6640  int EltIdx = MaskVals[i];
6641  if (EltIdx < 0 || EltIdx >= 32)
6642  EltIdx = 0x80;
6643  else {
6644  if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16))
6645  // Cross lane is not allowed.
6646  return SDValue();
6647  EltIdx &= 0xf;
6648  }
6649  pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
6650  }
6651  return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1,
6652  DAG.getNode(ISD::BUILD_VECTOR, dl,
6653  MVT::v32i8, &pshufbMask[0], 32));
6654 }
6655 
6656 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
6657 /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
6658 /// done when every pair / quad of shuffle mask elements point to elements in
6659 /// the right sequence. e.g.
6660 /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
6661 static
6662 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
6663  SelectionDAG &DAG) {
6664  MVT VT = SVOp->getSimpleValueType(0);
6665  SDLoc dl(SVOp);
6666  unsigned NumElems = VT.getVectorNumElements();
6667  MVT NewVT;
6668  unsigned Scale;
6669  switch (VT.SimpleTy) {
6670  default: llvm_unreachable("Unexpected!");
6671  case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break;
6672  case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break;
6673  case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break;
6674  case MVT::v16i8: NewVT = MVT::v4i32; Scale = 4; break;
6675  case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
6676  case MVT::v32i8: NewVT = MVT::v8i32; Scale = 4; break;
6677  }
6678 
6679  SmallVector<int, 8> MaskVec;
6680  for (unsigned i = 0; i != NumElems; i += Scale) {
6681  int StartIdx = -1;
6682  for (unsigned j = 0; j != Scale; ++j) {
6683  int EltIdx = SVOp->getMaskElt(i+j);
6684  if (EltIdx < 0)
6685  continue;
6686  if (StartIdx < 0)
6687  StartIdx = (EltIdx / Scale);
6688  if (EltIdx != (int)(StartIdx*Scale + j))
6689  return SDValue();
6690  }
6691  MaskVec.push_back(StartIdx);
6692  }
6693 
6694  SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
6695  SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
6696  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
6697 }
6698 
6699 /// getVZextMovL - Return a zero-extending vector move low node.
6700 ///
6701 static SDValue getVZextMovL(MVT VT, MVT OpVT,
6702  SDValue SrcOp, SelectionDAG &DAG,
6703  const X86Subtarget *Subtarget, SDLoc dl) {
6704  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
6705  LoadSDNode *LD = NULL;
6706  if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
6707  LD = dyn_cast<LoadSDNode>(SrcOp);
6708  if (!LD) {
6709  // movssrr and movsdrr do not clear top bits. Try to use movd, movq
6710  // instead.
6711  MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
6712  if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
6713  SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6714  SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
6715  SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
6716  // PR2108
6717  OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
6718  return DAG.getNode(ISD::BITCAST, dl, VT,
6719  DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
6720  DAG.getNode(ISD::BITCAST, dl,
6721  OpVT,
6722  SrcOp.getOperand(0)
6723  .getOperand(0))));
6724  }
6725  }
6726  }
6727 
6728  return DAG.getNode(ISD::BITCAST, dl, VT,
6729  DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
6730  DAG.getNode(ISD::BITCAST, dl,
6731  OpVT, SrcOp)));
6732 }
6733 
6734 /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
6735 /// which could not be matched by any known target specific shuffle
6736 static SDValue
6738 
6739  SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
6740  if (NewOp.getNode())
6741  return NewOp;
6742 
6743  MVT VT = SVOp->getSimpleValueType(0);
6744 
6745  unsigned NumElems = VT.getVectorNumElements();
6746  unsigned NumLaneElems = NumElems / 2;
6747 
6748  SDLoc dl(SVOp);
6749  MVT EltVT = VT.getVectorElementType();
6750  MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
6751  SDValue Output[2];
6752 
6753  SmallVector<int, 16> Mask;
6754  for (unsigned l = 0; l < 2; ++l) {
6755  // Build a shuffle mask for the output, discovering on the fly which
6756  // input vectors to use as shuffle operands (recorded in InputUsed).
6757  // If building a suitable shuffle vector proves too hard, then bail
6758  // out with UseBuildVector set.
6759  bool UseBuildVector = false;
6760  int InputUsed[2] = { -1, -1 }; // Not yet discovered.
6761  unsigned LaneStart = l * NumLaneElems;
6762  for (unsigned i = 0; i != NumLaneElems; ++i) {
6763  // The mask element. This indexes into the input.
6764  int Idx = SVOp->getMaskElt(i+LaneStart);
6765  if (Idx < 0) {
6766  // the mask element does not index into any input vector.
6767  Mask.push_back(-1);
6768  continue;
6769  }
6770 
6771  // The input vector this mask element indexes into.
6772  int Input = Idx / NumLaneElems;
6773 
6774  // Turn the index into an offset from the start of the input vector.
6775  Idx -= Input * NumLaneElems;
6776 
6777  // Find or create a shuffle vector operand to hold this input.
6778  unsigned OpNo;
6779  for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
6780  if (InputUsed[OpNo] == Input)
6781  // This input vector is already an operand.
6782  break;
6783  if (InputUsed[OpNo] < 0) {
6784  // Create a new operand for this input vector.
6785  InputUsed[OpNo] = Input;
6786  break;
6787  }
6788  }
6789 
6790  if (OpNo >= array_lengthof(InputUsed)) {
6791  // More than two input vectors used! Give up on trying to create a
6792  // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
6793  UseBuildVector = true;
6794  break;
6795  }
6796 
6797  // Add the mask index for the new shuffle vector.
6798  Mask.push_back(Idx + OpNo * NumLaneElems);
6799  }
6800 
6801  if (UseBuildVector) {
6802  SmallVector<SDValue, 16> SVOps;
6803  for (unsigned i = 0; i != NumLaneElems; ++i) {
6804  // The mask element. This indexes into the input.
6805  int Idx = SVOp->getMaskElt(i+LaneStart);
6806  if (Idx < 0) {
6807  SVOps.push_back(DAG.getUNDEF(EltVT));
6808  continue;
6809  }
6810 
6811  // The input vector this mask element indexes into.
6812  int Input = Idx / NumElems;
6813 
6814  // Turn the index into an offset from the start of the input vector.
6815  Idx -= Input * NumElems;
6816 
6817  // Extract the vector element by hand.
6818  SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
6819  SVOp->getOperand(Input),
6820  DAG.getIntPtrConstant(Idx)));
6821  }
6822 
6823  // Construct the output using a BUILD_VECTOR.
6824  Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
6825  SVOps.size());
6826  } else if (InputUsed[0] < 0) {
6827  // No input vectors were used! The result is undefined.
6828  Output[l] = DAG.getUNDEF(NVT);
6829  } else {
6830  SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
6831  (InputUsed[0] % 2) * NumLaneElems,
6832  DAG, dl);
6833  // If only one input was used, use an undefined vector for the other.
6834  SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
6835  Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
6836  (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
6837  // At least one input vector was used. Create a new shuffle vector.
6838  Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
6839  }
6840 
6841  Mask.clear();
6842  }
6843 
6844  // Concatenate the result back
6845  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
6846 }
6847 
6848 /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
6849 /// 4 elements, and match them with several different shuffle types.
6850 static SDValue
6851 LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
6852  SDValue V1 = SVOp->getOperand(0);
6853  SDValue V2 = SVOp->getOperand(1);
6854  SDLoc dl(SVOp);
6855  MVT VT = SVOp->getSimpleValueType(0);
6856 
6857  assert(VT.is128BitVector() && "Unsupported vector size");
6858 
6859  std::pair<int, int> Locs[4];
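  // Locs[i] records where result element i will come from in the intermediate
  // shuffles built below; (-1, -1) marks an undef element.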
6860  int Mask1[] = { -1, -1, -1, -1 };
6861  SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
6862 
6863  unsigned NumHi = 0;
6864  unsigned NumLo = 0;
6865  for (unsigned i = 0; i != 4; ++i) {
6866  int Idx = PermMask[i];
6867  if (Idx < 0) {
6868  Locs[i] = std::make_pair(-1, -1);
6869  } else {
6870  assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
6871  if (Idx < 4) {
6872  Locs[i] = std::make_pair(0, NumLo);
6873  Mask1[NumLo] = Idx;
6874  NumLo++;
6875  } else {
6876  Locs[i] = std::make_pair(1, NumHi);
6877  if (2+NumHi < 4)
6878  Mask1[2+NumHi] = Idx;
6879  NumHi++;
6880  }
6881  }
6882  }
6883 
6884  if (NumLo <= 2 && NumHi <= 2) {
6885  // No more than two elements come from either vector. This can be
6886  // implemented with two shuffles. The first shuffle gathers the elements,
6887  // and the second shuffle, which takes the first shuffle as both of its
6888  // vector operands, puts the elements into the right order.
6889  V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6890 
6891  int Mask2[] = { -1, -1, -1, -1 };
6892 
6893  for (unsigned i = 0; i != 4; ++i)
6894  if (Locs[i].first != -1) {
6895  unsigned Idx = (i < 2) ? 0 : 4;
6896  Idx += Locs[i].first * 2 + Locs[i].second;
6897  Mask2[i] = Idx;
6898  }
6899 
6900  return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
6901  }
6902 
6903  if (NumLo == 3 || NumHi == 3) {
6904  // Otherwise, we must have three elements from one vector, call it X, and
6905  // one element from the other, call it Y. First, use a shufps to build an
6906  // intermediate vector with the one element from Y and the element from X
6907  // that will be in the same half in the final destination (the indexes don't
6908  // matter). Then, use a shufps to build the final vector, taking the half
6909  // containing the element from Y from the intermediate, and the other half
6910  // from X.
6911  if (NumHi == 3) {
6912  // Normalize it so the 3 elements come from V1.
6913  CommuteVectorShuffleMask(PermMask, 4);
6914  std::swap(V1, V2);
6915  }
6916 
6917  // Find the element from V2.
6918  unsigned HiIndex;
6919  for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
6920  int Val = PermMask[HiIndex];
6921  if (Val < 0)
6922  continue;
6923  if (Val >= 4)
6924  break;
6925  }
6926 
6927  Mask1[0] = PermMask[HiIndex];
6928  Mask1[1] = -1;
6929  Mask1[2] = PermMask[HiIndex^1];
6930  Mask1[3] = -1;
6931  V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6932 
6933  if (HiIndex >= 2) {
6934  Mask1[0] = PermMask[0];
6935  Mask1[1] = PermMask[1];
6936  Mask1[2] = HiIndex & 1 ? 6 : 4;
6937  Mask1[3] = HiIndex & 1 ? 4 : 6;
6938  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6939  }
6940 
6941  Mask1[0] = HiIndex & 1 ? 2 : 0;
6942  Mask1[1] = HiIndex & 1 ? 0 : 2;
6943  Mask1[2] = PermMask[2];
6944  Mask1[3] = PermMask[3];
6945  if (Mask1[2] >= 0)
6946  Mask1[2] += 4;
6947  if (Mask1[3] >= 0)
6948  Mask1[3] += 4;
6949  return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
6950  }
6951 
6952  // Break it into (shuffle shuffle_hi, shuffle_lo).
6953  int LoMask[] = { -1, -1, -1, -1 };
6954  int HiMask[] = { -1, -1, -1, -1 };
6955 
6956  int *MaskPtr = LoMask;
6957  unsigned MaskIdx = 0;
6958  unsigned LoIdx = 0;
6959  unsigned HiIdx = 2;
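  // Build two intermediate shuffles: LoMask handles result elements 0-1 and
  // HiMask elements 2-3, packing V1-sourced indices into the low half and
  // V2-sourced indices into the high half of each intermediate.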
6960  for (unsigned i = 0; i != 4; ++i) {
6961  if (i == 2) {
6962  MaskPtr = HiMask;
6963  MaskIdx = 1;
6964  LoIdx = 0;
6965  HiIdx = 2;
6966  }
6967  int Idx = PermMask[i];
6968  if (Idx < 0) {
6969  Locs[i] = std::make_pair(-1, -1);
6970  } else if (Idx < 4) {
6971  Locs[i] = std::make_pair(MaskIdx, LoIdx);
6972  MaskPtr[LoIdx] = Idx;
6973  LoIdx++;
6974  } else {
6975  Locs[i] = std::make_pair(MaskIdx, HiIdx);
6976  MaskPtr[HiIdx] = Idx;
6977  HiIdx++;
6978  }
6979  }
6980 
6981  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
6982  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
6983  int MaskOps[] = { -1, -1, -1, -1 };
6984  for (unsigned i = 0; i != 4; ++i)
6985  if (Locs[i].first != -1)
6986  MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
6987  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
6988 }
6989 
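// MayFoldVectorLoad - Look through single-use bitcasts and a scalar_to_vector
// (or a two-operand BUILD_VECTOR whose second operand is undef) and return
// true if what remains is a load that instruction selection may fold.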
6990 static bool MayFoldVectorLoad(SDValue V) {
6991  while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
6992  V = V.getOperand(0);
6993 
6994  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6995  V = V.getOperand(0);
6996  if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
6997  V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
6998  // BUILD_VECTOR (load), undef
6999  V = V.getOperand(0);
7000 
7001  return MayFoldLoad(V);
7002 }
7003 
7004 static
7005 SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
7006  MVT VT = Op.getSimpleValueType();
7007 
7008  // Canonicalize to v2f64.
7009  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
7010  return DAG.getNode(ISD::BITCAST, dl, VT,
7011  getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
7012  V1, DAG));
7013 }
7014 
7015 static
7016 SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
7017  bool HasSSE2) {
7018  SDValue V1 = Op.getOperand(0);
7019  SDValue V2 = Op.getOperand(1);
7020  MVT VT = Op.getSimpleValueType();
7021 
7022  assert(VT != MVT::v2i64 && "unsupported shuffle type");
7023 
7024  if (HasSSE2 && VT == MVT::v2f64)
7025  return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
7026 
7027  // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
7028  return DAG.getNode(ISD::BITCAST, dl, VT,
7029  getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
7030  DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
7031  DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
7032 }
7033 
7034 static
7035 SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
7036  SDValue V1 = Op.getOperand(0);
7037  SDValue V2 = Op.getOperand(1);
7038  MVT VT = Op.getSimpleValueType();
7039 
7040  assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
7041  "unsupported shuffle type");
7042 
7043  if (V2.getOpcode() == ISD::UNDEF)
7044  V2 = V1;
7045 
7046  // v4i32 or v4f32
7047  return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
7048 }
7049 
7050 static
7051 SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
7052  SDValue V1 = Op.getOperand(0);
7053  SDValue V2 = Op.getOperand(1);
7054  MVT VT = Op.getSimpleValueType();
7055  unsigned NumElems = VT.getVectorNumElements();
7056 
7057  // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
7058  // operand of these instructions can only be memory, so check if there's a
7059  // potential load folding here, otherwise use SHUFPS or MOVSD to match the
7060  // same masks.
7061  bool CanFoldLoad = false;
7062 
7063  // Trivial case, when V2 comes from a load.
7064  if (MayFoldVectorLoad(V2))
7065  CanFoldLoad = true;
7066 
7067  // When V1 is a load, it can be folded later into a store in isel, example:
7068  // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
7069  // turns into:
7070  // (MOVLPSmr addr:$src1, VR128:$src2)
7071  // So, recognize this potential and also use MOVLPS or MOVLPD
7072  else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
7073  CanFoldLoad = true;
7074 
7075  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
7076  if (CanFoldLoad) {
7077  if (HasSSE2 && NumElems == 2)
7078  return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
7079 
7080  if (NumElems == 4)
7081  // If we don't care about the second element, proceed to use movss.
7082  if (SVOp->getMaskElt(1) != -1)
7083  return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
7084  }
7085 
7086  // movl and movlp will both match v2i64, but v2i64 is never matched by
7087  // movl earlier because we make it strict to avoid messing with the movlp load
7088  // folding logic (see the code above the getMOVLP call). Match it here then;
7089  // this is horrible, but it will stay like this until we move all shuffle
7090  // matching to x86-specific nodes. Note that for the 1st condition all
7091  // types are matched with movsd.
7092  if (HasSSE2) {
7093  // FIXME: isMOVLMask should be checked and matched before getMOVLP,
7094  // as to remove this logic from here, as much as possible
7095  if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
7096  return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
7097  return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
7098  }
7099 
7100  assert(VT != MVT::v4i32 && "unsupported shuffle type");
7101 
7102  // Invert the operand order and use SHUFPS to match it.
7103  return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
7104  getShuffleSHUFImmediate(SVOp), DAG);
7105 }
7106 
7107 // Reduce a vector shuffle to zext.
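// For example (an illustrative case, not the only one handled): with SSE4.1,
// the v8i16 shuffle
//   (shuffle V1, undef, <0,-1,1,-1,2,-1,3,-1>)
// is rewritten below as
//   (v8i16 (bitcast (v4i32 (X86ISD::VZEXT V1))))
// which instruction selection turns into PMOVZXWD; the undef lanes are simply
// defined to be zero.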
7108 static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
7109  SelectionDAG &DAG) {
7110  // PMOVZX is only available from SSE41.
7111  if (!Subtarget->hasSSE41())
7112  return SDValue();
7113 
7114  MVT VT = Op.getSimpleValueType();
7115 
7116  // Only AVX2 support 256-bit vector integer extending.
7117  if (!Subtarget->hasInt256() && VT.is256BitVector())
7118  return SDValue();
7119 
7120  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
7121  SDLoc DL(Op);
7122  SDValue V1 = Op.getOperand(0);
7123  SDValue V2 = Op.getOperand(1);
7124  unsigned NumElems = VT.getVectorNumElements();
7125 
7126  // Extending is a unary operation and the element type of the source vector
7127  // won't be equal to or larger than i64.
7128  if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
7129  VT.getVectorElementType() == MVT::i64)
7130  return SDValue();
7131 
7132  // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
7133  unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
7134  while ((1U << Shift) < NumElems) {
7135  if (SVOp->getMaskElt(1U << Shift) == 1)
7136  break;
7137  Shift += 1;
7138  // The maximal ratio is 8, i.e. from i8 to i64.
7139  if (Shift > 3)
7140  return SDValue();
7141  }
7142 
7143  // Check the shuffle mask.
7144  unsigned Mask = (1U << Shift) - 1;
7145  for (unsigned i = 0; i != NumElems; ++i) {
7146  int EltIdx = SVOp->getMaskElt(i);
7147  if ((i & Mask) != 0 && EltIdx != -1)
7148  return SDValue();
7149  if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
7150  return SDValue();
7151  }
7152 
7153  unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
7154  MVT NeVT = MVT::getIntegerVT(NBits);
7155  MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
7156 
7157  if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
7158  return SDValue();
7159 
7160  // Simplify the operand as it's prepared to be fed into shuffle.
7161  unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
7162  if (V1.getOpcode() == ISD::BITCAST &&
7163  V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
7164  V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7165  V1.getOperand(0).getOperand(0)
7166  .getSimpleValueType().getSizeInBits() == SignificantBits) {
7167  // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
7168  SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
7169  ConstantSDNode *CIdx =
7170  dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
7171  // If it's foldable, i.e. a normal load with a single use, let instruction
7172  // selection fold it. Otherwise, shorten the conversion sequence here.
7173  if (CIdx && CIdx->getZExtValue() == 0 &&
7174  (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) {
7175  MVT FullVT = V.getSimpleValueType();
7176  MVT V1VT = V1.getSimpleValueType();
7177  if (FullVT.getSizeInBits() > V1VT.getSizeInBits()) {
7178  // The "ext_vec_elt" node is wider than the result node.
7179  // In this case we should extract subvector from V.
7180  // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)).
7181  unsigned Ratio = FullVT.getSizeInBits() / V1VT.getSizeInBits();
7182  MVT SubVecVT = MVT::getVectorVT(FullVT.getVectorElementType(),
7183  FullVT.getVectorNumElements()/Ratio);
7184  V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
7185  DAG.getIntPtrConstant(0));
7186  }
7187  V1 = DAG.getNode(ISD::BITCAST, DL, V1VT, V);
7188  }
7189  }
7190 
7191  return DAG.getNode(ISD::BITCAST, DL, VT,
7192  DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
7193 }
7194 
7195 static SDValue
7196 NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
7197  SelectionDAG &DAG) {
7198  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
7199  MVT VT = Op.getSimpleValueType();
7200  SDLoc dl(Op);
7201  SDValue V1 = Op.getOperand(0);
7202  SDValue V2 = Op.getOperand(1);
7203 
7204  if (isZeroShuffle(SVOp))
7205  return getZeroVector(VT, Subtarget, DAG, dl);
7206 
7207  // Handle splat operations
7208  if (SVOp->isSplat()) {
7209  // Use vbroadcast whenever the splat comes from a foldable load
7210  SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
7211  if (Broadcast.getNode())
7212  return Broadcast;
7213  }
7214 
7215  // Check integer expanding shuffles.
7216  SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
7217  if (NewOp.getNode())
7218  return NewOp;
7219 
7220  // If the shuffle can be profitably rewritten as a narrower shuffle, then
7221  // do it!
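  // For example (illustrative), the v8i16 mask <2,3,0,1,6,7,4,5> moves
  // aligned pairs of i16 elements, so it can be rewritten as the v4i32
  // shuffle <1,0,3,2>, which then matches a single PSHUFD.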
7222  if (VT == MVT::v8i16 || VT == MVT::v16i8 ||
7223  VT == MVT::v16i16 || VT == MVT::v32i8) {
7224  SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
7225  if (NewOp.getNode())
7226  return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
7227  } else if ((VT == MVT::v4i32 ||
7228  (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
7229  // FIXME: Figure out a cleaner way to do this.
7230  // Try to make use of movq to zero out the top part.
7231  if (ISD::isBuildVectorAllZeros(V2.getNode())) {
7232  SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
7233  if (NewOp.getNode()) {
7234  MVT NewVT = NewOp.getSimpleValueType();
7235  if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
7236  NewVT, true, false))
7237  return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
7238  DAG, Subtarget, dl);
7239  }
7240  } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
7241  SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
7242  if (NewOp.getNode()) {
7243  MVT NewVT = NewOp.getSimpleValueType();
7244  if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
7245  return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
7246  DAG, Subtarget, dl);
7247  }
7248  }
7249  }
7250  return SDValue();
7251 }
7252 
7253 SDValue
7254 X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
7255  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
7256  SDValue V1 = Op.getOperand(0);
7257  SDValue V2 = Op.getOperand(1);
7258  MVT VT = Op.getSimpleValueType();
7259  SDLoc dl(Op);
7260  unsigned NumElems = VT.getVectorNumElements();
7261  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
7262  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
7263  bool V1IsSplat = false;
7264  bool V2IsSplat = false;
7265  bool HasSSE2 = Subtarget->hasSSE2();
7266  bool HasFp256 = Subtarget->hasFp256();
7267  bool HasInt256 = Subtarget->hasInt256();
7268  MachineFunction &MF = DAG.getMachineFunction();
7269  bool OptForSize = MF.getFunction()->getAttributes().
7270  hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
7271 
7272  assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
7273 
7274  if (V1IsUndef && V2IsUndef)
7275  return DAG.getUNDEF(VT);
7276 
7277  assert(!V1IsUndef && "Op 1 of shuffle should not be undef");
7278 
7279  // Vector shuffle lowering takes 3 steps:
7280  //
7281  // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
7282  // narrowing and commutation of operands should be handled.
7283  // 2) Matching of shuffles with known shuffle masks to x86 target specific
7284  // shuffle nodes.
7285  // 3) Rewriting of unmatched masks into new generic shuffle operations,
7286  // so the shuffle can be broken into other shuffles and the legalizer can
7287  // try the lowering again.
7288  //
7289  // The general idea is that no vector_shuffle operation should be left to
7290  // be matched during isel, all of them must be converted to a target specific
7291  // node here.
7292 
7293  // Normalize the input vectors. Here splats, zeroed vectors, profitable
7294  // narrowing and commutation of operands should be handled. The actual code
7295  // doesn't include all of those, work in progress...
7296  SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
7297  if (NewOp.getNode())
7298  return NewOp;
7299 
7300  SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
7301 
7302  // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
7303  // unpckh_undef). Only use pshufd if speed is more important than size.
7304  if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
7305  return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
7306  if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
7307  return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
7308 
7309  if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
7310  V2IsUndef && MayFoldVectorLoad(V1))
7311  return getMOVDDup(Op, dl, V1, DAG);
7312 
7313  if (isMOVHLPS_v_undef_Mask(M, VT))
7314  return getMOVHighToLow(Op, dl, DAG);
7315 
7316  // Use to match splats
7317  if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
7318  (VT == MVT::v2f64 || VT == MVT::v2i64))
7319  return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
7320 
7321  if (isPSHUFDMask(M, VT)) {
7322  // The actual implementation will match the mask in the if above and then
7323  // during isel it can match several different instructions, not only pshufd
7324  // as its name says, sad but true, emulate the behavior for now...
7325  if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
7326  return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
7327 
7328  unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
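  // For example (illustrative), a v4f32 mask of <3,2,1,0> packs into the
  // imm8 0x1B (binary 00 01 10 11): result lane i takes its source index
  // from bits [2*i+1 : 2*i] of the immediate.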
7329 
7330  if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
7331  return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
7332 
7333  if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
7334  return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask,
7335  DAG);
7336 
7337  return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
7338  TargetMask, DAG);
7339  }
7340 
7341  if (isPALIGNRMask(M, VT, Subtarget))
7342  return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
7343  getShufflePALIGNRImmediate(SVOp),
7344  DAG);
7345 
7346  // Check if this can be converted into a logical shift.
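  // For example (illustrative), the v4i32 shuffle of (V1, zero) with mask
  // <4,0,1,2> moves V1 up by one 32-bit element and shifts a zero into the
  // low lane, so it can be emitted as a single PSLLDQ $4.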
7347  bool isLeft = false;
7348  unsigned ShAmt = 0;
7349  SDValue ShVal;
7350  bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
7351  if (isShift && ShVal.hasOneUse()) {
7352  // If the shifted value has multiple uses, it may be cheaper to use
7353  // v_set0 + movlhps or movhlps, etc.
7354  MVT EltVT = VT.getVectorElementType();
7355  ShAmt *= EltVT.getSizeInBits();
7356  return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
7357  }
7358 
7359  if (isMOVLMask(M, VT)) {
7360  if (ISD::isBuildVectorAllZeros(V1.getNode()))
7361  return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
7362  if (!isMOVLPMask(M, VT)) {
7363  if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
7364  return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
7365 
7366  if (VT == MVT::v4i32 || VT == MVT::v4f32)
7367  return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
7368  }
7369  }
7370 
7371  // FIXME: fold these into legal mask.
7372  if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
7373  return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
7374 
7375  if (isMOVHLPSMask(M, VT))
7376  return getMOVHighToLow(Op, dl, DAG);
7377 
7378  if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
7379  return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
7380 
7381  if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
7382  return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
7383 
7384  if (isMOVLPMask(M, VT))
7385  return getMOVLP(Op, dl, DAG, HasSSE2);
7386 
7387  if (ShouldXformToMOVHLPS(M, VT) ||
7388  ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
7389  return CommuteVectorShuffle(SVOp, DAG);
7390 
7391  if (isShift) {
7392  // No better options. Use a vshldq / vsrldq.
7393  MVT EltVT = VT.getVectorElementType();
7394  ShAmt *= EltVT.getSizeInBits();
7395  return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
7396  }
7397 
7398  bool Commuted = false;
7399  // FIXME: This should also accept a bitcast of a splat? Be careful, not
7400  // 1,1,1,1 -> v8i16 though.
7401  V1IsSplat = isSplatVector(V1.getNode());
7402  V2IsSplat = isSplatVector(V2.getNode());
7403 
7404  // Canonicalize the splat or undef, if present, to be on the RHS.
7405  if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
7406  CommuteVectorShuffleMask(M, NumElems);
7407  std::swap(V1, V2);
7408  std::swap(V1IsSplat, V2IsSplat);
7409  Commuted = true;
7410  }
7411 
7412  if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
7413  // Shuffling low element of v1 into undef, just return v1.
7414  if (V2IsUndef)
7415  return V1;
7416  // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
7417  // the instruction selector will not match, so get a canonical MOVL with
7418  // swapped operands to undo the commute.
7419  return getMOVL(DAG, dl, VT, V2, V1);
7420  }
7421 
7422  if (isUNPCKLMask(M, VT, HasInt256))
7423  return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
7424 
7425  if (isUNPCKHMask(M, VT, HasInt256))
7426  return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
7427 
7428  if (V2IsSplat) {
7429  // Normalize the mask so all entries that point to V2 point to its first
7430  // element, then try to match unpck{h|l} again. If it matches, return a
7431  // new vector_shuffle with the corrected mask.
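    // For example (illustrative), with NumElems == 4 the mask <0,5,2,7> is
    // normalized to <0,4,2,4>, since any element of the splatted V2 can be
    // replaced by its first element.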
7432  SmallVector<int, 8> NewMask(M.begin(), M.end());
7433  NormalizeMask(NewMask, NumElems);
7434  if (isUNPCKLMask(NewMask, VT, HasInt256, true))
7435  return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
7436  if (isUNPCKHMask(NewMask, VT, HasInt256, true))
7437  return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
7438  }
7439 
7440  if (Commuted) {
7441  // Commute it back and try unpck* again.
7442  // FIXME: this seems wrong.
7443  CommuteVectorShuffleMask(M, NumElems);
7444  std::swap(V1, V2);
7445  std::swap(V1IsSplat, V2IsSplat);
7446  Commuted = false;
7447 
7448  if (isUNPCKLMask(M, VT, HasInt256))
7449  return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
7450 
7451  if (isUNPCKHMask(M, VT, HasInt256))
7452  return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
7453  }
7454 
7455  // Normalize the node to match x86 shuffle ops if needed
7456  if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
7457  return CommuteVectorShuffle(SVOp, DAG);
7458 
7459  // The checks below are all present in isShuffleMaskLegal, but they are
7460  // inlined here right now to enable us to directly emit target specific
7461  // nodes, and remove one by one until they don't return Op anymore.
7462 
7463  if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
7464  SVOp->getSplatIndex() == 0 && V2IsUndef) {
7465  if (VT == MVT::v2f64 || VT == MVT::v2i64)
7466  return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
7467  }
7468 
7469  if (isPSHUFHWMask(M, VT, HasInt256))
7470  return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
7471  getShufflePSHUFHWImmediate(SVOp),
7472  DAG);
7473 
7474  if (isPSHUFLWMask(M, VT, HasInt256))
7475  return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
7476  getShufflePSHUFLWImmediate(SVOp),
7477  DAG);
7478 
7479  if (isSHUFPMask(M, VT))
7480  return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
7481  getShuffleSHUFImmediate(SVOp), DAG);
7482 
7483  if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
7484  return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
7485  if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
7486  return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
7487 
7488  //===--------------------------------------------------------------------===//
7489  // Generate target specific nodes for 128 or 256-bit shuffles only
7490  // supported in the AVX instruction set.
7491  //
7492 
7493  // Handle VMOVDDUPY permutations
7494  if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
7495  return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
7496 
7497  // Handle VPERMILPS/D* permutations
7498  if (isVPERMILPMask(M, VT)) {
7499  if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
7500  return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
7501  getShuffleSHUFImmediate(SVOp), DAG);
7502  return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
7503  getShuffleSHUFImmediate(SVOp), DAG);
7504  }
7505 
7506  // Handle VPERM2F128/VPERM2I128 permutations
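  // For example (illustrative), a v4f64 shuffle with mask <2,3,4,5> takes the
  // high 128-bit lane of V1 and the low 128-bit lane of V2, which VPERM2F128
  // encodes as the immediate 0x21.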
7507  if (isVPERM2X128Mask(M, VT, HasFp256))
7508  return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
7509  V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
7510 
7511  SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG);
7512  if (BlendOp.getNode())
7513  return BlendOp;
7514 
7515  unsigned Imm8;
7516  if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
7517  return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
7518 
7519  if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
7520  VT.is512BitVector()) {
7521  MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
7522  MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
7523  SmallVector<SDValue, 16> permclMask;
7524  for (unsigned i = 0; i != NumElems; ++i) {
7525  permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
7526  }
7527 
7528  SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT,
7529  &permclMask[0], NumElems);
7530  if (V2IsUndef)
7531  // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
7532  return DAG.getNode(X86ISD::VPERMV, dl, VT,
7533  DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
7534  return DAG.getNode(X86ISD::VPERMV3, dl, VT,
7535  DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1, V2);
7536  }
7537 
7538  //===--------------------------------------------------------------------===//
7539  // Since no target specific shuffle was selected for this generic one,
7540  // lower it into other known shuffles. FIXME: this isn't true yet, but
7541  // this is the plan.
7542  //
7543 
7544  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
7545  if (VT == MVT::v8i16) {
7546  SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
7547  if (NewOp.getNode())
7548  return NewOp;
7549  }
7550 
7551  if (VT == MVT::v16i8) {
7552  SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
7553  if (NewOp.getNode())
7554  return NewOp;
7555  }
7556 
7557  if (VT == MVT::v32i8) {
7558  SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
7559  if (NewOp.getNode())
7560  return NewOp;
7561  }
7562 
7563  // Handle all 128-bit wide vectors with 4 elements, and match them with
7564  // several different shuffle types.
7565  if (NumElems == 4 && VT.is128BitVector())
7566  return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
7567 
7568  // Handle general 256-bit shuffles
7569  if (VT.is256BitVector())
7570  return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
7571 
7572  return SDValue();
7573 }
7574 
7575 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
7576  MVT VT = Op.getSimpleValueType();
7577  SDLoc dl(Op);
7578 
7579  if (!isa<ConstantSDNode>(Op.getOperand(1)))
7580  return SDValue();
7581 
7582  if (VT.getSizeInBits() == 8) {
7583  SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
7584  Op.getOperand(0), Op.getOperand(1));
7585  SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
7586  DAG.getValueType(VT));
7587  return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
7588  }
7589 
7590  if (VT.getSizeInBits() == 16) {
7591  unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7592  // If Idx is 0, it's cheaper to do a move instead of a pextrw.
7593  if (Idx == 0)
7594  return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
7595  DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
7596  DAG.getNode(ISD::BITCAST, dl,
7597  MVT::v4i32,
7598  Op.getOperand(0)),
7599  Op.getOperand(1)));
7600  SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
7601  Op.getOperand(0), Op.getOperand(1));
7602  SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
7603  DAG.getValueType(VT));
7604  return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
7605  }
7606 
7607  if (VT == MVT::f32) {
7608  // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
7609  // the result back to FR32 register. It's only worth matching if the
7610  // result has a single use which is a store or a bitcast to i32. And in
7611  // the case of a store, it's not worth it if the index is a constant 0,
7612  // because a MOVSSmr can be used instead, which is smaller and faster.
7613  if (!Op.hasOneUse())
7614  return SDValue();
7615  SDNode *User = *Op.getNode()->use_begin();
7616  if ((User->getOpcode() != ISD::STORE ||
7617  (isa<ConstantSDNode>(Op.getOperand(1)) &&
7618  cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
7619  (User->getOpcode() != ISD::BITCAST ||
7620  User->getValueType(0) != MVT::i32))
7621  return SDValue();
7622  SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
7623  DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
7624  Op.getOperand(0)),
7625  Op.getOperand(1));
7626  return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
7627  }
7628 
7629  if (VT == MVT::i32 || VT == MVT::i64) {
7630  // ExtractPS/pextrq works with constant index.
7631  if (isa<ConstantSDNode>(Op.getOperand(1)))
7632  return Op;
7633  }
7634  return SDValue();
7635 }
7636 
7637 SDValue
7638 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
7639  SelectionDAG &DAG) const {
7640  SDLoc dl(Op);
7641  SDValue Vec = Op.getOperand(0);
7642  MVT VecVT = Vec.getSimpleValueType();
7643  SDValue Idx = Op.getOperand(1);
7644  if (!isa<ConstantSDNode>(Idx)) {
7645  if (VecVT.is512BitVector() ||
7646  (VecVT.is256BitVector() && Subtarget->hasInt256() &&
7647  VecVT.getVectorElementType().getSizeInBits() == 32)) {
7648 
7649  MVT MaskEltVT =
7650  MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
7651  MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
7652  MaskEltVT.getSizeInBits());
7653 
7654  Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
7655  SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
7656  getZeroVector(MaskVT, Subtarget, DAG, dl),
7657  Idx, DAG.getConstant(0, getPointerTy()));
7658  SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
7659  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
7660  Perm, DAG.getConstant(0, getPointerTy()));
7661  }
7662  return SDValue();
7663  }
7664 
7665  // If this is a 256-bit vector result, first extract the 128-bit vector and
7666  // then extract the element from the 128-bit vector.
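  // For example (illustrative), extracting element 5 of a v8i32 extracts the
  // upper 128-bit half and then pulls out element 5 - 4 = 1 of that v4i32.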
7667  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
7668 
7669  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
7670  // Get the 128-bit vector.
7671  Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
7672  MVT EltVT = VecVT.getVectorElementType();
7673 
7674  unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
7675 
7676  //if (IdxVal >= NumElems/2)
7677  // IdxVal -= NumElems/2;
7678  IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
7679  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
7680  DAG.getConstant(IdxVal, MVT::i32));
7681  }
7682 
7683  assert(VecVT.is128BitVector() && "Unexpected vector length");
7684 
7685  if (Subtarget->hasSSE41()) {
7686  SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
7687  if (Res.getNode())
7688  return Res;
7689  }
7690 
7691  MVT VT = Op.getSimpleValueType();
7692  // TODO: handle v16i8.
7693  if (VT.getSizeInBits() == 16) {
7694  SDValue Vec = Op.getOperand(0);
7695  unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7696  if (Idx == 0)
7697  return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
7698  DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
7699  DAG.getNode(ISD::BITCAST, dl,
7700  MVT::v4i32, Vec),
7701  Op.getOperand(1)));
7702  // Transform it so it matches pextrw which produces a 32-bit result.
7703  MVT EltVT = MVT::i32;
7704  SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
7705  Op.getOperand(0), Op.getOperand(1));
7706  SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
7707  DAG.getValueType(VT));
7708  return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
7709  }
7710 
7711  if (VT.getSizeInBits() == 32) {
7712  unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7713  if (Idx == 0)
7714  return Op;
7715 
7716  // SHUFPS the element to the lowest double word, then movss.
7717  int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
7718  MVT VVT = Op.getOperand(0).getSimpleValueType();
7719  SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
7720  DAG.getUNDEF(VVT), Mask);
7721  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
7722  DAG.getIntPtrConstant(0));
7723  }
7724 
7725  if (VT.getSizeInBits() == 64) {
7726  // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
7727  // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
7728  // to match extract_elt for f64.
7729  unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7730  if (Idx == 0)
7731  return Op;
7732 
7733  // UNPCKHPD the element to the lowest double word, then movsd.
7734  // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
7735  // to a f64mem, the whole operation is folded into a single MOVHPDmr.
7736  int Mask[2] = { 1, -1 };
7737  MVT VVT = Op.getOperand(0).getSimpleValueType();
7738  SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
7739  DAG.getUNDEF(VVT), Mask);
7740  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
7741  DAG.getIntPtrConstant(0));
7742  }
7743 
7744  return SDValue();
7745 }
7746 
7747 static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
7748  MVT VT = Op.getSimpleValueType();
7749  MVT EltVT = VT.getVectorElementType();
7750  SDLoc dl(Op);
7751 
7752  SDValue N0 = Op.getOperand(0);
7753  SDValue N1 = Op.getOperand(1);
7754  SDValue N2 = Op.getOperand(2);
7755 
7756  if (!VT.is128BitVector())
7757  return SDValue();
7758 
7759  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
7760  isa<ConstantSDNode>(N2)) {
7761  unsigned Opc;
7762  if (VT == MVT::v8i16)
7763  Opc = X86ISD::PINSRW;
7764  else if (VT == MVT::v16i8)
7765  Opc = X86ISD::PINSRB;
7766  else
7767  Opc = X86ISD::PINSRB;
7768 
7769  // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
7770  // argument.
7771  if (N1.getValueType() != MVT::i32)
7772  N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
7773  if (N2.getValueType() != MVT::i32)
7774  N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
7775  return DAG.getNode(Opc, dl, VT, N0, N1, N2);
7776  }
7777 
7778  if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
7779  // Bits [7:6] of the constant are the source select. This will always be
7780  // zero here. The DAG Combiner may combine an extract_elt index into these
7781  // bits. For example (insert (extract, 3), 2) could be matched by putting
7782  // the '3' into bits [7:6] of X86ISD::INSERTPS.
7783  // Bits [5:4] of the constant are the destination select. This is the
7784  // value of the incoming immediate.
7785  // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
7786  // combine either bitwise AND or insert of float 0.0 to set these bits.
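    // For example (illustrative), inserting into destination lane 2 with no
    // zeroing gives countS = 0, countD = 2, zmask = 0, i.e. an INSERTPS
    // immediate of 2 << 4 = 0x20, which is what the shift below builds.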
7787  N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
7788  // Create this as a scalar to vector.
7789  N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
7790  return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
7791  }
7792 
7793  if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) {
7794  // PINSR* works with constant index.
7795  return Op;
7796  }
7797  return SDValue();
7798 }
7799 
7800 SDValue
7801 X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
7802  MVT VT = Op.getSimpleValueType();
7803  MVT EltVT = VT.getVectorElementType();
7804 
7805  SDLoc dl(Op);
7806  SDValue N0 = Op.getOperand(0);
7807  SDValue N1 = Op.getOperand(1);
7808  SDValue N2 = Op.getOperand(2);
7809 
7810  // If this is a 256-bit vector result, first extract the 128-bit vector,
7811  // insert the element into the extracted half and then place it back.
7812  if (VT.is256BitVector() || VT.is512BitVector()) {
7813  if (!isa<ConstantSDNode>(N2))
7814  return SDValue();
7815 
7816  // Get the desired 128-bit vector half.
7817  unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
7818  SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
7819 
7820  // Insert the element into the desired half.
7821  unsigned NumEltsIn128 = 128/EltVT.getSizeInBits();
7822  unsigned IdxIn128 = IdxVal - (IdxVal/NumEltsIn128) * NumEltsIn128;
7823 
7824  V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
7825  DAG.getConstant(IdxIn128, MVT::i32));
7826 
7827  // Insert the changed part back to the 256-bit vector
7828  return Insert128BitVector(N0, V, IdxVal, DAG, dl);
7829  }
7830 
7831  if (Subtarget->hasSSE41())
7832  return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
7833 
7834  if (EltVT == MVT::i8)
7835  return SDValue();
7836 
7837  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
7838  // Transform it so it matches pinsrw which expects a 16-bit value in a GR32
7839  // as its second argument.
7840  if (N1.getValueType() != MVT::i32)
7841  N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
7842  if (N2.getValueType() != MVT::i32)
7843  N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
7844  return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
7845  }
7846  return SDValue();
7847 }
7848 
7849 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
7850  SDLoc dl(Op);
7851  MVT OpVT = Op.getSimpleValueType();
7852 
7853  // If this is a 256-bit vector result, first insert into a 128-bit
7854  // vector and then insert into the 256-bit vector.
7855  if (!OpVT.is128BitVector()) {
7856  // Insert into a 128-bit vector.
7857  unsigned SizeFactor = OpVT.getSizeInBits()/128;
7858  MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
7859  OpVT.getVectorNumElements() / SizeFactor);
7860 
7861  Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
7862 
7863  // Insert the 128-bit vector.
7864  return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
7865  }
7866 
7867  if (OpVT == MVT::v1i64 &&
7868  Op.getOperand(0).getValueType() == MVT::i64)
7869  return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
7870 
7871  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
7872  assert(OpVT.is128BitVector() && "Expected an SSE type!");
7873  return DAG.getNode(ISD::BITCAST, dl, OpVT,
7874  DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
7875 }
7876 
7877 // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
7878 // a simple subregister reference or explicit instructions to grab
7879 // upper bits of a vector.
7880 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
7881  SelectionDAG &DAG) {
7882  SDLoc dl(Op);
7883  SDValue In = Op.getOperand(0);
7884  SDValue Idx = Op.getOperand(1);
7885  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
7886  MVT ResVT = Op.getSimpleValueType();
7887  MVT InVT = In.getSimpleValueType();
7888 
7889  if (Subtarget->hasFp256()) {
7890  if (ResVT.is128BitVector() &&
7891  (InVT.is256BitVector() || InVT.is512BitVector()) &&
7892  isa<ConstantSDNode>(Idx)) {
7893  return Extract128BitVector(In, IdxVal, DAG, dl);
7894  }
7895  if (ResVT.is256BitVector() && InVT.is512BitVector() &&
7896  isa<ConstantSDNode>(Idx)) {
7897  return Extract256BitVector(In, IdxVal, DAG, dl);
7898  }
7899  }
7900  return SDValue();
7901 }
7902 
7903 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
7904 // simple superregister reference or explicit instructions to insert
7905 // the upper bits of a vector.
7906 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
7907  SelectionDAG &DAG) {
7908  if (Subtarget->hasFp256()) {
7909  SDLoc dl(Op.getNode());
7910  SDValue Vec = Op.getNode()->getOperand(0);
7911  SDValue SubVec = Op.getNode()->getOperand(1);
7912  SDValue Idx = Op.getNode()->getOperand(2);
7913 
7914  if ((Op.getNode()->getSimpleValueType(0).is256BitVector() ||
7915  Op.getNode()->getSimpleValueType(0).is512BitVector()) &&
7916  SubVec.getNode()->getSimpleValueType(0).is128BitVector() &&
7917  isa<ConstantSDNode>(Idx)) {
7918  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
7919  return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
7920  }
7921 
7922  if (Op.getNode()->getSimpleValueType(0).is512BitVector() &&
7923  SubVec.getNode()->getSimpleValueType(0).is256BitVector() &&
7924  isa<ConstantSDNode>(Idx)) {
7925  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
7926  return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
7927  }
7928  }
7929  return SDValue();
7930 }
7931 
7932 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
7933 // their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
7934 // one of the above mentioned nodes. It has to be wrapped because otherwise
7935 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
7936 // be used to form an addressing mode. These wrapped nodes will be selected
7937 // into MOV32ri.
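// As an illustrative sketch, a constant pool entry CP becomes
//   (X86ISD::Wrapper (TargetConstantPool CP))
// and, in 32-bit GOT-style PIC mode, something like
//   (add GlobalBaseReg, (X86ISD::Wrapper (TargetConstantPool CP, MO_GOTOFF)))
// so the final address is formed relative to the PIC base register.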
7938 SDValue
7939 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
7940  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
7941 
7942  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7943  // global base reg.
7944  unsigned char OpFlag = 0;
7945  unsigned WrapperKind = X86ISD::Wrapper;
7946  CodeModel::Model M = getTargetMachine().getCodeModel();
7947 
7948  if (Subtarget->isPICStyleRIPRel() &&
7949  (M == CodeModel::Small || M == CodeModel::Kernel))
7950  WrapperKind = X86ISD::WrapperRIP;
7951  else if (Subtarget->isPICStyleGOT())
7952  OpFlag = X86II::MO_GOTOFF;
7953  else if (Subtarget->isPICStyleStubPIC())
7954  OpFlag = X86II::MO_PIC_BASE_OFFSET;
7955 
7956  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
7957  CP->getAlignment(),
7958  CP->getOffset(), OpFlag);
7959  SDLoc DL(CP);
7960  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7961  // With PIC, the address is actually $g + Offset.
7962  if (OpFlag) {
7963  Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7964  DAG.getNode(X86ISD::GlobalBaseReg,
7965  SDLoc(), getPointerTy()),
7966  Result);
7967  }
7968 
7969  return Result;
7970 }
7971 
7972 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
7973  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
7974 
7975  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7976  // global base reg.
7977  unsigned char OpFlag = 0;
7978  unsigned WrapperKind = X86ISD::Wrapper;
7979  CodeModel::Model M = getTargetMachine().getCodeModel();
7980 
7981  if (Subtarget->isPICStyleRIPRel() &&
7982  (M == CodeModel::Small || M == CodeModel::Kernel))
7983  WrapperKind = X86ISD::WrapperRIP;
7984  else if (Subtarget->isPICStyleGOT())
7985  OpFlag = X86II::MO_GOTOFF;
7986  else if (Subtarget->isPICStyleStubPIC())
7987  OpFlag = X86II::MO_PIC_BASE_OFFSET;
7988 
7989  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
7990  OpFlag);
7991  SDLoc DL(JT);
7992  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7993 
7994  // With PIC, the address is actually $g + Offset.
7995  if (OpFlag)
7996  Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7997  DAG.getNode(X86ISD::GlobalBaseReg,
7998  SDLoc(), getPointerTy()),
7999  Result);
8000 
8001  return Result;
8002 }
8003 
8004 SDValue
8005 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
8006  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
8007 
8008  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
8009  // global base reg.
8010  unsigned char OpFlag = 0;
8011  unsigned WrapperKind = X86ISD::Wrapper;
8012  CodeModel::Model M = getTargetMachine().getCodeModel();
8013 
8014  if (Subtarget->isPICStyleRIPRel() &&
8015  (M == CodeModel::Small || M == CodeModel::Kernel)) {
8016  if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
8017  OpFlag = X86II::MO_GOTPCREL;
8018  WrapperKind = X86ISD::WrapperRIP;
8019  } else if (Subtarget->isPICStyleGOT()) {
8020  OpFlag = X86II::MO_GOT;
8021  } else if (Subtarget->isPICStyleStubPIC()) {
8022  OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
8023  } else if (Subtarget->isPICStyleStubNoDynamic()) {
8024  OpFlag = X86II::MO_DARWIN_NONLAZY;
8025  }
8026 
8027  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
8028 
8029  SDLoc DL(Op);
8030  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
8031 
8032  // With PIC, the address is actually $g + Offset.
8033  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
8034  !Subtarget->is64Bit()) {
8035  Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
8036  DAG.getNode(X86ISD::GlobalBaseReg,
8037  SDLoc(), getPointerTy()),
8038  Result);
8039  }
8040 
8041  // For symbols that require a load from a stub to get the address, emit the
8042  // load.
8043  if (isGlobalStubReference(OpFlag))
8044  Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
8045  MachinePointerInfo::getGOT(), false, false, false, 0);
8046 
8047  return Result;
8048 }
8049 
8050 SDValue
8051 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
8052  // Create the TargetBlockAddressAddress node.
8053  unsigned char OpFlags =
8054  Subtarget->ClassifyBlockAddressReference();
8055  CodeModel::Model M = getTargetMachine().getCodeModel();
8056  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
8057  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
8058  SDLoc dl(Op);
8059  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
8060  OpFlags);
8061 
8062  if (Subtarget->isPICStyleRIPRel() &&
8063  (M == CodeModel::Small || M == CodeModel::Kernel))
8064  Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
8065  else
8066  Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
8067 
8068  // With PIC, the address is actually $g + Offset.
8069  if (isGlobalRelativeToPICBase(OpFlags)) {
8070  Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
8071  DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy()),
8072  Result);
8073  }
8074 
8075  return Result;
8076 }
8077 
8078 SDValue
8079 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
8080  int64_t Offset, SelectionDAG &DAG) const {
8081  // Create the TargetGlobalAddress node, folding in the constant
8082  // offset if it is legal.
8083  unsigned char OpFlags =
8084  Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
8085  CodeModel::Model M = getTargetMachine().getCodeModel();
8086  SDValue Result;
8087  if (OpFlags == X86II::MO_NO_FLAG &&
8088  X86::isOffsetSuitableForCodeModel(Offset, M)) {
8089  // A direct static reference to a global.
8090  Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
8091  Offset = 0;
8092  } else {
8093  Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
8094  }
8095 
8096  if (Subtarget->isPICStyleRIPRel() &&
8097  (M == CodeModel::Small || M == CodeModel::Kernel))
8098  Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
8099  else
8100  Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
8101 
8102  // With PIC, the address is actually $g + Offset.
8103  if (isGlobalRelativeToPICBase(OpFlags)) {
8104  Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
8105  DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy()),
8106  Result);
8107  }
8108 
8109  // For globals that require a load from a stub to get the address, emit the
8110  // load.
8111  if (isGlobalStubReference(OpFlags))
8112  Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
8113  MachinePointerInfo::getGOT(), false, false, false, 0);
8114 
8115  // If there was a non-zero offset that we didn't fold, create an explicit
8116  // addition for it.
8117  if (Offset != 0)
8118  Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
8119  DAG.getConstant(Offset, getPointerTy()));
8120 
8121  return Result;
8122 }
8123 
8124 SDValue
8125 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
8126  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
8127  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
8128  return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
8129 }
8130 
8131 static SDValue
8132 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
8133  SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
8134  unsigned char OperandFlags, bool LocalDynamic = false) {
8135  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
8136  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8137  SDLoc dl(GA);
8138  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
8139  GA->getValueType(0),
8140  GA->getOffset(),
8141  OperandFlags);
8142 
8144  : X86ISD::TLSADDR;
8145 
8146  if (InFlag) {
8147  SDValue Ops[] = { Chain, TGA, *InFlag };
8148  Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
8149  } else {
8150  SDValue Ops[] = { Chain, TGA };
8151  Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
8152  }
8153 
8154  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
8155  MFI->setAdjustsStack(true);
8156 
8157  SDValue Flag = Chain.getValue(1);
8158  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
8159 }
8160 
8161 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
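// The sequence emitted for this model is roughly (illustrative, IA-32 ELF):
//   leal x@tlsgd(,%ebx,1), %eax
//   call ___tls_get_addr@PLT
// with the variable's address returned in %eax.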
8162 static SDValue
8163 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
8164  const EVT PtrVT) {
8165  SDValue InFlag;
8166  SDLoc dl(GA); // ? function entry point might be better
8167  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
8168  DAG.getNode(X86ISD::GlobalBaseReg,
8169  SDLoc(), PtrVT), InFlag);
8170  InFlag = Chain.getValue(1);
8171 
8172  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
8173 }
8174 
8175 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
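// The sequence emitted for this model is roughly (illustrative, x86-64 ELF):
//   leaq x@tlsgd(%rip), %rdi
//   call __tls_get_addr@PLT
// (plus the padding the ABI prescribes so the linker can relax it), with the
// variable's address returned in %rax.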
8176 static SDValue
8177 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
8178  const EVT PtrVT) {
8179  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
8180  X86::RAX, X86II::MO_TLSGD);
8181 }
8182 
8183 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
8184  SelectionDAG &DAG,
8185  const EVT PtrVT,
8186  bool is64Bit) {
8187  SDLoc dl(GA);
8188 
8189  // Get the start address of the TLS block for this module.
8193 
8194  SDValue Base;
8195  if (is64Bit) {
8196  Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX,
8197  X86II::MO_TLSLD, /*LocalDynamic=*/true);
8198  } else {
8199  SDValue InFlag;
8200  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
8201  DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
8202  InFlag = Chain.getValue(1);
8203  Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
8204  X86II::MO_TLSLDM, /*LocalDynamic=*/true);
8205  }
8206 
8207  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
8208  // of Base.
8209 
8210  // Build x@dtpoff.
8211  unsigned char OperandFlags = X86II::MO_DTPOFF;
8212  unsigned WrapperKind = X86ISD::Wrapper;
8213  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
8214  GA->getValueType(0),
8215  GA->getOffset(), OperandFlags);
8216  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
8217 
8218  // Add x@dtpoff with the base.
8219  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
8220 }
8221 
8222 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
8223 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
8224  const EVT PtrVT, TLSModel::Model model,
8225  bool is64Bit, bool isPIC) {
8226  SDLoc dl(GA);
8227 
8228  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
8229  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
8230  is64Bit ? 257 : 256));
8231 
8233  DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
8234  MachinePointerInfo(Ptr), false, false, false, 0);
8235 
8236  unsigned char OperandFlags = 0;
8237  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
8238  // initial exec.
8239  unsigned WrapperKind = X86ISD::Wrapper;
8240  if (model == TLSModel::LocalExec) {
8241  OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
8242  } else if (model == TLSModel::InitialExec) {
8243  if (is64Bit) {
8244  OperandFlags = X86II::MO_GOTTPOFF;
8245  WrapperKind = X86ISD::WrapperRIP;
8246  } else {
8247  OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
8248  }
8249  } else {
8250  llvm_unreachable("Unexpected model");
8251  }
8252 
8253  // emit "addl x@ntpoff,%eax" (local exec)
8254  // or "addl x@indntpoff,%eax" (initial exec)
8255  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
8256  SDValue TGA =
8257  DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
8258  GA->getOffset(), OperandFlags);
8259  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
8260 
8261  if (model == TLSModel::InitialExec) {
8262  if (isPIC && !is64Bit) {
8263  Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
8264  DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
8265  Offset);
8266  }
8267 
8268  Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
8269  MachinePointerInfo::getGOT(), false, false, false, 0);
8270  }
8271 
8272  // The address of the thread local variable is the add of the thread
8273  // pointer with the offset of the variable.
8274  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
8275 }
8276 
8277 SDValue
8278 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
8279 
8280  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
8281  const GlobalValue *GV = GA->getGlobal();
8282 
8283  if (Subtarget->isTargetELF()) {
8284  TLSModel::Model model = getTargetMachine().getTLSModel(GV);
8285 
8286  switch (model) {
8287  case TLSModel::GeneralDynamic:
8288  if (Subtarget->is64Bit())
8289  return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
8290  return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
8291  case TLSModel::LocalDynamic:
8292  return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
8293  Subtarget->is64Bit());
8294  case TLSModel::InitialExec:
8295  case TLSModel::LocalExec:
8296  return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
8297  Subtarget->is64Bit(),
8298  getTargetMachine().getRelocationModel() == Reloc::PIC_);
8299  }
8300  llvm_unreachable("Unknown TLS model.");
8301  }
8302 
8303  if (Subtarget->isTargetDarwin()) {
8304  // Darwin only has one model of TLS. Lower to that.
8305  unsigned char OpFlag = 0;
8306  unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
8307  X86ISD::WrapperRIP : X86ISD::Wrapper;
8308 
8309  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
8310  // global base reg.
8311  bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
8312  !Subtarget->is64Bit();
8313  if (PIC32)
8314  OpFlag = X86II::MO_TLVP_PIC_BASE;
8315  else
8316  OpFlag = X86II::MO_TLVP;
8317  SDLoc DL(Op);
8318  SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
8319  GA->getValueType(0),
8320  GA->getOffset(), OpFlag);
8321  SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
8322 
8323  // With PIC32, the address is actually $g + Offset.
8324  if (PIC32)
8325  Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
8326  DAG.getNode(X86ISD::GlobalBaseReg,
8327  SDLoc(), getPointerTy()),
8328  Offset);
8329 
8330  // Lowering the machine isd will make sure everything is in the right
8331  // location.
8332  SDValue Chain = DAG.getEntryNode();
8333  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8334  SDValue Args[] = { Chain, Offset };
8335  Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);
8336 
8337  // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
8338  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
8339  MFI->setAdjustsStack(true);
8340 
8341  // And our return value (tls address) is in the standard call return value
8342  // location.
8343  unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
8344  return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
8345  Chain.getValue(1));
8346  }
8347 
8348  if (Subtarget->isTargetWindows() || Subtarget->isTargetMingw()) {
8349  // Just use the implicit TLS architecture
8350  // Need to generate something similar to:
8351  // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
8352  // ; from TEB
8353  // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
8354  // mov rcx, qword [rdx+rcx*8]
8355  // mov eax, .tls$:tlsvar
8356  // [rax+rcx] contains the address
8357  // Windows 64bit: gs:0x58
8358  // Windows 32bit: fs:__tls_array
8359 
8360  // If GV is an alias then use the aliasee for determining
8361  // thread-localness.
8362  if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
8363  GV = GA->resolveAliasedGlobal(false);
8364  SDLoc dl(GA);
8365  SDValue Chain = DAG.getEntryNode();
8366 
8367  // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
8368  // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
8369  // use its literal value of 0x2C.
8370  Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
8371  ? Type::getInt8PtrTy(*DAG.getContext(),
8372  256)
8373  : Type::getInt8PtrTy(*DAG.getContext(),
8374  257));
8375 
8376  SDValue TlsArray = Subtarget->is64Bit() ? DAG.getIntPtrConstant(0x58) :
8377  (Subtarget->isTargetMingw() ? DAG.getIntPtrConstant(0x2C) :
8378  DAG.getExternalSymbol("_tls_array", getPointerTy()));
8379 
8380  SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
8381  MachinePointerInfo(Ptr),
8382  false, false, false, 0);
8383 
8384  // Load the _tls_index variable
8385  SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
8386  if (Subtarget->is64Bit())
8387  IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
8388  IDX, MachinePointerInfo(), MVT::i32,
8389  false, false, 0);
8390  else
8391  IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
8392  false, false, false, 0);
8393 
8394  SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
8395  getPointerTy());
8396  IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
8397 
8398  SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
8399  res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
8400  false, false, false, 0);
8401 
8402  // Get the offset of start of .tls section
8403  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
8404  GA->getValueType(0),
8405  GA->getOffset(), X86II::MO_SECREL);
8406  SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
8407 
8408  // The address of the thread local variable is the add of the thread
8409  // pointer with the offset of the variable.
8410  return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
8411  }
8412 
8413  llvm_unreachable("TLS not implemented for this target.");
8414 }
8415 
8416 /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
8417 /// and take a 2 x i32 value to shift plus a shift amount.
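/// For example (illustrative), a 64-bit SHL split into two i32 halves is
/// lowered as Hi = SHLD(Hi, Lo, Amt) and Lo = SHL(Lo, Amt), followed by
/// CMOVs that, when bit 5 of Amt is set (Amt >= 32), replace Hi with the
/// SHL result and Lo with zero.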
8418 SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
8419  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
8420  EVT VT = Op.getValueType();
8421  unsigned VTBits = VT.getSizeInBits();
8422  SDLoc dl(Op);
8423  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
8424  SDValue ShOpLo = Op.getOperand(0);
8425  SDValue ShOpHi = Op.getOperand(1);
8426  SDValue ShAmt = Op.getOperand(2);
8427  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
8428  DAG.getConstant(VTBits - 1, MVT::i8))
8429  : DAG.getConstant(0, VT);
8430 
8431  SDValue Tmp2, Tmp3;
8432  if (Op.getOpcode() == ISD::SHL_PARTS) {
8433  Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
8434  Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
8435  } else {
8436  Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
8437  Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
8438  }
8439 
8440  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
8441  DAG.getConstant(VTBits, MVT::i8));
8442  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
8443  AndNode, DAG.getConstant(0, MVT::i8));
8444 
8445  SDValue Hi, Lo;
8446  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
8447  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
8448  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
8449 
8450  if (Op.getOpcode() == ISD::SHL_PARTS) {
8451  Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
8452  Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
8453  } else {
8454  Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
8455  Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
8456  }
8457 
8458  SDValue Ops[2] = { Lo, Hi };
8459  return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
8460 }
8461 
8462 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
8463  SelectionDAG &DAG) const {
8464  EVT SrcVT = Op.getOperand(0).getValueType();
8465 
8466  if (SrcVT.isVector())
8467  return SDValue();
8468 
8469  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
8470  "Unknown SINT_TO_FP to lower!");
8471 
8472  // These are really Legal; return the operand so the caller accepts it as
8473  // Legal.
8474  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
8475  return Op;
8476  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
8477  Subtarget->is64Bit()) {
8478  return Op;
8479  }
8480 
8481  SDLoc dl(Op);
8482  unsigned Size = SrcVT.getSizeInBits()/8;
8483  MachineFunction &MF = DAG.getMachineFunction();
8484  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
8485  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
8486  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
8487  StackSlot,
8488  MachinePointerInfo::getFixedStack(SSFI),
8489  false, false, 0);
8490  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
8491 }
8492 
8493 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
8494  SDValue StackSlot,
8495  SelectionDAG &DAG) const {
8496  // Build the FILD
8497  SDLoc DL(Op);
8498  SDVTList Tys;
8499  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
8500  if (useSSE)
8501  Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
8502  else
8503  Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
8504 
8505  unsigned ByteSize = SrcVT.getSizeInBits()/8;
8506 
8507  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
8508  MachineMemOperand *MMO;
8509  if (FI) {
8510  int SSFI = FI->getIndex();
8511  MMO =
8512  DAG.getMachineFunction()
8513  .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8514  MachineMemOperand::MOLoad, ByteSize, ByteSize);
8515  } else {
8516  MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
8517  StackSlot = StackSlot.getOperand(1);
8518  }
8519  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
8520  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
8521  X86ISD::FILD, DL,
8522  Tys, Ops, array_lengthof(Ops),
8523  SrcVT, MMO);
8524 
8525  if (useSSE) {
8526  Chain = Result.getValue(1);
8527  SDValue InFlag = Result.getValue(2);
8528 
8529  // FIXME: Currently the FST is flagged to the FILD_FLAG. This
8530  // shouldn't be necessary except that RFP cannot be live across
8531  // multiple blocks. When stackifier is fixed, they can be uncoupled.
8532  MachineFunction &MF = DAG.getMachineFunction();
8533  unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
8534  int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
8535  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
8536  Tys = DAG.getVTList(MVT::Other);
8537  SDValue Ops[] = {
8538  Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
8539  };
8540  MachineMemOperand *MMO =
8541  DAG.getMachineFunction()
8542  .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8543  MachineMemOperand::MOStore, SSFISize, SSFISize);
8544 
8545  Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
8546  Ops, array_lengthof(Ops),
8547  Op.getValueType(), MMO);
8548  Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
8549  MachinePointerInfo::getFixedStack(SSFI),
8550  false, false, false, 0);
8551  }
8552 
8553  return Result;
8554 }
8555 
8556 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
8557 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
8558  SelectionDAG &DAG) const {
8559  // This algorithm is not obvious. Here it is what we're trying to output:
8560  /*
8561  movq %rax, %xmm0
8562  punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
8563  subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
8564  #ifdef __SSE3__
8565  haddpd %xmm0, %xmm0
8566  #else
8567  pshufd $0x4e, %xmm0, %xmm1
8568  addpd %xmm1, %xmm0
8569  #endif
8570  */
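  // Why this works (an illustrative explanation): writing the input as
  // x = hi * 2^32 + lo, the punpckldq plants lo and hi in the mantissas below
  // the exponent words 0x43300000 and 0x45300000, producing the doubles
  // (2^52 + lo) and (2^84 + hi * 2^32). Subtracting c1 = { 2^52, 2^84 } then
  // leaves exactly { lo, hi * 2^32 }, and the final horizontal add yields
  // hi * 2^32 + lo rounded once to double.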
8571 
8572  SDLoc dl(Op);
8573  LLVMContext *Context = DAG.getContext();
8574 
8575  // Build some magic constants.
8576  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
8577  Constant *C0 = ConstantDataVector::get(*Context, CV0);
8578  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
8579 
8580  SmallVector<Constant*,2> CV1;
8581  CV1.push_back(
8582  ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
8583  APInt(64, 0x4330000000000000ULL))));
8584  CV1.push_back(
8585  ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
8586  APInt(64, 0x4530000000000000ULL))));
8587  Constant *C1 = ConstantVector::get(CV1);
8588  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
8589 
8590  // Load the 64-bit value into an XMM register.
8591  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
8592  Op.getOperand(0));
8593  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
8594  MachinePointerInfo::getConstantPool(),
8595  false, false, false, 16);
8596  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
8597  DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
8598  CLod0);
8599 
8600  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
8601  MachinePointerInfo::getConstantPool(),
8602  false, false, false, 16);
8603  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
8604  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
8605  SDValue Result;
8606 
8607  if (Subtarget->hasSSE3()) {
8608  // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
8609  Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
8610  } else {
8611  SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
8612  SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
8613  S2F, 0x4E, DAG);
8614  Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
8615  DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
8616  Sub);
8617  }
8618 
8619  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
8620  DAG.getIntPtrConstant(0));
8621 }
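A scalar model may make the constant-pool trick above easier to follow. The sketch below is illustrative only (the helper names bits_to_double and uint64_to_double_model are not part of this file) and assumes IEEE-754 doubles; it mirrors the punpckldq/subpd/add sequence in plain C++.

#include <cstdint>
#include <cstring>

// Reinterpret a 64-bit pattern as a double.
static double bits_to_double(uint64_t bits) {
  double d;
  std::memcpy(&d, &bits, sizeof d);
  return d;
}

// Splice the two 32-bit halves of x into the mantissas of 2^52 and 2^84,
// subtract the exponent biases (both subtractions are exact), then add once.
double uint64_to_double_model(uint64_t x) {
  uint32_t lo = static_cast<uint32_t>(x);
  uint32_t hi = static_cast<uint32_t>(x >> 32);
  double d_lo = bits_to_double(0x4330000000000000ULL | lo) -
                bits_to_double(0x4330000000000000ULL);   // == lo
  double d_hi = bits_to_double(0x4530000000000000ULL | hi) -
                bits_to_double(0x4530000000000000ULL);   // == hi * 2^32
  return d_lo + d_hi;                                    // single rounding
}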
8622 
8623 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
8624 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
8625  SelectionDAG &DAG) const {
8626  SDLoc dl(Op);
8627  // FP constant to bias correct the final result.
8628  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
8629  MVT::f64);
8630 
8631  // Load the 32-bit value into an XMM register.
8632  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
8633  Op.getOperand(0));
8634 
8635  // Zero out the upper parts of the register.
8636  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
8637 
8638  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
8639  DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
8640  DAG.getIntPtrConstant(0));
8641 
8642  // Or the load with the bias.
8643  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
8644  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
8645  DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8646  MVT::v2f64, Load)),
8647  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
8648  DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8649  MVT::v2f64, Bias)));
8650  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
8651  DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
8652  DAG.getIntPtrConstant(0));
8653 
8654  // Subtract the bias.
8655  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
8656 
8657  // Handle final rounding.
8658  EVT DestVT = Op.getValueType();
8659 
8660  if (DestVT.bitsLT(MVT::f64))
8661  return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
8662  DAG.getIntPtrConstant(0));
8663  if (DestVT.bitsGT(MVT::f64))
8664  return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
8665 
8666  // The result already has the right width; no rounding is needed.
8667  return Sub;
8668 }
8669 
8670 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
8671  SelectionDAG &DAG) const {
8672  SDValue N0 = Op.getOperand(0);
8673  EVT SVT = N0.getValueType();
8674  SDLoc dl(Op);
8675 
8676  assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
8677  SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
8678  "Custom UINT_TO_FP is not supported!");
8679 
8680  EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
8681  SVT.getVectorNumElements());
8682  return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
8683  DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
8684 }
8685 
8686 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
8687  SelectionDAG &DAG) const {
8688  SDValue N0 = Op.getOperand(0);
8689  SDLoc dl(Op);
8690 
8691  if (Op.getValueType().isVector())
8692  return lowerUINT_TO_FP_vec(Op, DAG);
8693 
8694  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
8695  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
8696  // the optimization here.
8697  if (DAG.SignBitIsZero(N0))
8698  return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
8699 
8700  EVT SrcVT = N0.getValueType();
8701  EVT DstVT = Op.getValueType();
8702  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
8703  return LowerUINT_TO_FP_i64(Op, DAG);
8704  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
8705  return LowerUINT_TO_FP_i32(Op, DAG);
8706  if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
8707  return SDValue();
8708 
8709  // Make a 64-bit buffer, and use it to build an FILD.
8710  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
8711  if (SrcVT == MVT::i32) {
8712  SDValue WordOff = DAG.getConstant(4, getPointerTy());
8713  SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
8714  getPointerTy(), StackSlot, WordOff);
8715  SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
8716  StackSlot, MachinePointerInfo(),
8717  false, false, 0);
8718  SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
8719  OffsetSlot, MachinePointerInfo(),
8720  false, false, 0);
8721  SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
8722  return Fild;
8723  }
8724 
8725  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
8726  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
8727  StackSlot, MachinePointerInfo(),
8728  false, false, 0);
8729  // For i64 source, we need to add the appropriate power of 2 if the input
8730  // was negative. This is the same as the optimization in
8731  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
8732  // we must be careful to do the computation in x87 extended precision, not
8733  // in SSE. (The generic code can't know it's OK to do this, or how to.)
8734  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
8735  MachineMemOperand *MMO =
8736  DAG.getMachineFunction()
8737  .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8738  MachineMemOperand::MOLoad, 8, 8);
8739 
8740  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
8741  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
8742  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
8743  array_lengthof(Ops), MVT::i64, MMO);
8744 
8745  APInt FF(32, 0x5F800000ULL);
8746 
8747  // Check whether the sign bit is set.
8748  SDValue SignSet = DAG.getSetCC(dl,
8749  getSetCCResultType(*DAG.getContext(), MVT::i64),
8750  Op.getOperand(0), DAG.getConstant(0, MVT::i64),
8751  ISD::SETLT);
8752 
8753  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
8754  SDValue FudgePtr = DAG.getConstantPool(
8755  ConstantInt::get(*DAG.getContext(), FF.zext(64)),
8756  getPointerTy());
8757 
8758  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
8759  SDValue Zero = DAG.getIntPtrConstant(0);
8760  SDValue Four = DAG.getIntPtrConstant(4);
8761  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
8762  Zero, Four);
8763  FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
8764 
8765  // Load the value out, extending it from f32 to f80.
8766  // FIXME: Avoid the extend by constructing the right constant pool?
8767  SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
8768  FudgePtr, MachinePointerInfo::getConstantPool(),
8769  MVT::f32, false, false, 4);
8770  // Extend everything to 80 bits to force it to be done on x87.
8771  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
8772  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
8773 }
8774 
8775 std::pair<SDValue,SDValue>
8776 X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
8777  bool IsSigned, bool IsReplace) const {
8778  SDLoc DL(Op);
8779 
8780  EVT DstTy = Op.getValueType();
8781 
8782  if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
8783  assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
8784  DstTy = MVT::i64;
8785  }
8786 
8787  assert(DstTy.getSimpleVT() <= MVT::i64 &&
8788  DstTy.getSimpleVT() >= MVT::i16 &&
8789  "Unknown FP_TO_INT to lower!");
8790 
8791  // These are really Legal.
8792  if (DstTy == MVT::i32 &&
8793  isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
8794  return std::make_pair(SDValue(), SDValue());
8795  if (Subtarget->is64Bit() &&
8796  DstTy == MVT::i64 &&
8797  isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
8798  return std::make_pair(SDValue(), SDValue());
8799 
8800  // We lower FP->int64 either into FISTP64 followed by a load from a temporary
8801  // stack slot, or into the FTOL runtime function.
8802  MachineFunction &MF = DAG.getMachineFunction();
8803  unsigned MemSize = DstTy.getSizeInBits()/8;
8804  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
8805  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
8806 
8807  unsigned Opc;
8808  if (!IsSigned && isIntegerTypeFTOL(DstTy))
8809  Opc = X86ISD::WIN_FTOL;
8810  else
8811  switch (DstTy.getSimpleVT().SimpleTy) {
8812  default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
8813  case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
8814  case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
8815  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
8816  }
8817 
8818  SDValue Chain = DAG.getEntryNode();
8819  SDValue Value = Op.getOperand(0);
8820  EVT TheVT = Op.getOperand(0).getValueType();
8821  // FIXME This causes a redundant load/store if the SSE-class value is already
8822  // in memory, such as if it is on the callstack.
8823  if (isScalarFPTypeInSSEReg(TheVT)) {
8824  assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
8825  Chain = DAG.getStore(Chain, DL, Value, StackSlot,
8826  MachinePointerInfo::getFixedStack(SSFI),
8827  false, false, 0);
8828  SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
8829  SDValue Ops[] = {
8830  Chain, StackSlot, DAG.getValueType(TheVT)
8831  };
8832 
8833  MachineMemOperand *MMO =
8834  MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8835  MachineMemOperand::MOLoad, MemSize, MemSize);
8836  Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops,
8837  array_lengthof(Ops), DstTy, MMO);
8838  Chain = Value.getValue(1);
8839  SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
8840  StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
8841  }
8842 
8843  MachineMemOperand *MMO =
8844  MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8845  MachineMemOperand::MOStore, MemSize, MemSize);
8846 
8847  if (Opc != X86ISD::WIN_FTOL) {
8848  // Build the FP_TO_INT*_IN_MEM
8849  SDValue Ops[] = { Chain, Value, StackSlot };
8850  SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
8851  Ops, array_lengthof(Ops), DstTy,
8852  MMO);
8853  return std::make_pair(FIST, StackSlot);
8854  } else {
8855  SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
8856  DAG.getVTList(MVT::Other, MVT::Glue),
8857  Chain, Value);
8858  SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
8859  MVT::i32, ftol.getValue(1));
8860  SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
8861  MVT::i32, eax.getValue(2));
8862  SDValue Ops[] = { eax, edx };
8863  SDValue pair = IsReplace
8864  ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, array_lengthof(Ops))
8865  : DAG.getMergeValues(Ops, array_lengthof(Ops), DL);
8866  return std::make_pair(pair, SDValue());
8867  }
8868 }
8869 
8870 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
8871  const X86Subtarget *Subtarget) {
8872  MVT VT = Op->getSimpleValueType(0);
8873  SDValue In = Op->getOperand(0);
8874  MVT InVT = In.getSimpleValueType();
8875  SDLoc dl(Op);
8876 
8877  // Optimize vectors in AVX mode:
8878  //
8879  // v8i16 -> v8i32
8880  // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
8881  // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
8882  // Concat upper and lower parts.
8883  //
8884  // v4i32 -> v4i64
8885  // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
8886  // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
8887  // Concat upper and lower parts.
8888  //
8889 
8890  if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
8891  ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
8892  ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
8893  return SDValue();
8894 
8895  if (Subtarget->hasInt256())
8896  return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, In);
8897 
8898  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
8899  SDValue Undef = DAG.getUNDEF(InVT);
8900  bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
8901  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
8902  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
8903 
8904  MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
8905  VT.getVectorNumElements()/2);
8906 
8907  OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
8908  OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
8909 
8910  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
8911 }
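For the zero-extend case, the unpack-with-zero idea described in the comment above can be written directly with SSE2 intrinsics. This is only an illustrative sketch (the function name is made up), showing one 128-bit half of the pattern:

#include <emmintrin.h>  // SSE2

// Zero-extend the eight u16 lanes of v to u32 by interleaving with zero,
// mirroring the vpunpcklwd/vpunpckhwd split described above.
static void zext_v8i16_to_v8i32(__m128i v, __m128i *lo, __m128i *hi) {
  const __m128i zero = _mm_setzero_si128();
  *lo = _mm_unpacklo_epi16(v, zero);  // lanes 0..3 as v4i32
  *hi = _mm_unpackhi_epi16(v, zero);  // lanes 4..7 as v4i32
}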
8912 
8913 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
8914  SelectionDAG &DAG) {
8915  MVT VT = Op->getValueType(0).getSimpleVT();
8916  SDValue In = Op->getOperand(0);
8917  MVT InVT = In.getValueType().getSimpleVT();
8918  SDLoc DL(Op);
8919  unsigned int NumElts = VT.getVectorNumElements();
8920  if (NumElts != 8 && NumElts != 16)
8921  return SDValue();
8922 
8923  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
8924  return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
8925 
8926  EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
8927  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8928  // Now we have only mask extension
8929  assert(InVT.getVectorElementType() == MVT::i1);
8930  SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
8931  const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
8932  SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
8933  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
8934  SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
8935  MachinePointerInfo::getConstantPool(),
8936  false, false, false, Alignment);
8937 
8938  SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
8939  if (VT.is512BitVector())
8940  return Brcst;
8941  return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
8942 }
8943 
8944 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
8945  SelectionDAG &DAG) {
8946  if (Subtarget->hasFp256()) {
8947  SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
8948  if (Res.getNode())
8949  return Res;
8950  }
8951 
8952  return SDValue();
8953 }
8954 
8955 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
8956  SelectionDAG &DAG) {
8957  SDLoc DL(Op);
8958  MVT VT = Op.getSimpleValueType();
8959  SDValue In = Op.getOperand(0);
8960  MVT SVT = In.getSimpleValueType();
8961 
8962  if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
8963  return LowerZERO_EXTEND_AVX512(Op, DAG);
8964 
8965  if (Subtarget->hasFp256()) {
8966  SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
8967  if (Res.getNode())
8968  return Res;
8969  }
8970 
8971  assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
8972  VT.getVectorNumElements() != SVT.getVectorNumElements());
8973  return SDValue();
8974 }
8975 
8976 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
8977  SDLoc DL(Op);
8978  MVT VT = Op.getSimpleValueType();
8979  SDValue In = Op.getOperand(0);
8980  MVT InVT = In.getSimpleValueType();
8981  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
8982  "Invalid TRUNCATE operation");
8983 
8984  if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
8985  if (VT.getVectorElementType().getSizeInBits() >=8)
8986  return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
8987 
8988  assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
8989  unsigned NumElts = InVT.getVectorNumElements();
8990  assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
8991  if (InVT.getSizeInBits() < 512) {
8992  MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
8993  In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
8994  InVT = ExtVT;
8995  }
8996  SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
8997  const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
8998  SDValue CP = DAG.getConstantPool(C, getPointerTy());
8999  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
9000  SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
9001  MachinePointerInfo::getConstantPool(),
9002  false, false, false, Alignment);
9003  SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
9004  SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
9005  return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
9006  }
9007 
9008  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
9009  // On AVX2, v4i64 -> v4i32 becomes VPERMD.
9010  if (Subtarget->hasInt256()) {
9011  static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
9012  In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
9013  In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
9014  ShufMask);
9015  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
9016  DAG.getIntPtrConstant(0));
9017  }
9018 
9019  // On AVX, v4i64 -> v4i32 becomes a sequence that uses PSHUFD and MOVLHPS.
9020  SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
9021  DAG.getIntPtrConstant(0));
9022  SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
9023  DAG.getIntPtrConstant(2));
9024 
9025  OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
9026  OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
9027 
9028  // The PSHUFD mask:
9029  static const int ShufMask1[] = {0, 2, 0, 0};
9030  SDValue Undef = DAG.getUNDEF(VT);
9031  OpLo = DAG.getVectorShuffle(VT, DL, OpLo, Undef, ShufMask1);
9032  OpHi = DAG.getVectorShuffle(VT, DL, OpHi, Undef, ShufMask1);
9033 
9034  // The MOVLHPS mask:
9035  static const int ShufMask2[] = {0, 1, 4, 5};
9036  return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask2);
9037  }
9038 
9039  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
9040  // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
9041  if (Subtarget->hasInt256()) {
9042  In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
9043 
9044  SmallVector<SDValue,32> pshufbMask;
9045  for (unsigned i = 0; i < 2; ++i) {
9046  pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
9047  pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
9048  pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
9049  pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
9050  pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
9051  pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
9052  pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
9053  pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
9054  for (unsigned j = 0; j < 8; ++j)
9055  pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
9056  }
9057  SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8,
9058  &pshufbMask[0], 32);
9059  In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
9060  In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
9061 
9062  static const int ShufMask[] = {0, 2, -1, -1};
9063  In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
9064  &ShufMask[0]);
9065  In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
9066  DAG.getIntPtrConstant(0));
9067  return DAG.getNode(ISD::BITCAST, DL, VT, In);
9068  }
9069 
9070  SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
9071  DAG.getIntPtrConstant(0));
9072 
9073  SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
9074  DAG.getIntPtrConstant(4));
9075 
9076  OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
9077  OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
9078 
9079  // The PSHUFB mask:
9080  static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
9081  -1, -1, -1, -1, -1, -1, -1, -1};
9082 
9083  SDValue Undef = DAG.getUNDEF(MVT::v16i8);
9084  OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
9085  OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
9086 
9087  OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
9088  OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
9089 
9090  // The MOVLHPS Mask:
9091  static const int ShufMask2[] = {0, 1, 4, 5};
9092  SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
9093  return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
9094  }
9095 
9096  // Handle truncation of V256 to V128 using shuffles.
9097  if (!VT.is128BitVector() || !InVT.is256BitVector())
9098  return SDValue();
9099 
9100  assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
9101 
9102  unsigned NumElems = VT.getVectorNumElements();
9103  MVT NVT = MVT::getVectorVT(VT.getVectorElementType(),
9104  NumElems * 2);
9105 
9106  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
9107  // Prepare truncation shuffle mask
9108  for (unsigned i = 0; i != NumElems; ++i)
9109  MaskVec[i] = i * 2;
9110  SDValue V = DAG.getVectorShuffle(NVT, DL,
9111  DAG.getNode(ISD::BITCAST, DL, NVT, In),
9112  DAG.getUNDEF(NVT), &MaskVec[0]);
9113  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
9114  DAG.getIntPtrConstant(0));
9115 }
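The AVX2 branch above (v4i64 -> v4i32 via VPERMD on the even dwords) has a direct intrinsics analogue. A hedged sketch, with an illustrative function name:

#include <immintrin.h>  // AVX2

// Gather the even 32-bit elements (the low half of every 64-bit lane), then
// keep the lower 128 bits - the same shuffle mask {0, 2, 4, 6, ...} as above.
static __m128i trunc_v4i64_to_v4i32(__m256i v) {
  const __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0);
  return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(v, idx));
}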
9116 
9117 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
9118  SelectionDAG &DAG) const {
9119  MVT VT = Op.getSimpleValueType();
9120  if (VT.isVector()) {
9121  if (VT == MVT::v8i16)
9122  return DAG.getNode(ISD::TRUNCATE, SDLoc(Op), VT,
9123  DAG.getNode(ISD::FP_TO_SINT, SDLoc(Op),
9124  MVT::v8i32, Op.getOperand(0)));
9125  return SDValue();
9126  }
9127 
9128  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
9129  /*IsSigned=*/ true, /*IsReplace=*/ false);
9130  SDValue FIST = Vals.first, StackSlot = Vals.second;
9131  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
9132  if (FIST.getNode() == 0) return Op;
9133 
9134  if (StackSlot.getNode())
9135  // Load the result.
9136  return DAG.getLoad(Op.getValueType(), SDLoc(Op),
9137  FIST, StackSlot, MachinePointerInfo(),
9138  false, false, false, 0);
9139 
9140  // The node is the result.
9141  return FIST;
9142 }
9143 
9144 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
9145  SelectionDAG &DAG) const {
9146  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
9147  /*IsSigned=*/ false, /*IsReplace=*/ false);
9148  SDValue FIST = Vals.first, StackSlot = Vals.second;
9149  assert(FIST.getNode() && "Unexpected failure");
9150 
9151  if (StackSlot.getNode())
9152  // Load the result.
9153  return DAG.getLoad(Op.getValueType(), SDLoc(Op),
9154  FIST, StackSlot, MachinePointerInfo(),
9155  false, false, false, 0);
9156 
9157  // The node is the result.
9158  return FIST;
9159 }
9160 
9161 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
9162  SDLoc DL(Op);
9163  MVT VT = Op.getSimpleValueType();
9164  SDValue In = Op.getOperand(0);
9165  MVT SVT = In.getSimpleValueType();
9166 
9167  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
9168 
9169  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
9170  DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
9171  In, DAG.getUNDEF(SVT)));
9172 }
9173 
9174 SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
9175  LLVMContext *Context = DAG.getContext();
9176  SDLoc dl(Op);
9177  MVT VT = Op.getSimpleValueType();
9178  MVT EltVT = VT;
9179  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
9180  if (VT.isVector()) {
9181  EltVT = VT.getVectorElementType();
9182  NumElts = VT.getVectorNumElements();
9183  }
9184  Constant *C;
9185  if (EltVT == MVT::f64)
9186  C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
9187  APInt(64, ~(1ULL << 63))));
9188  else
9189  C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
9190  APInt(32, ~(1U << 31))));
9191  C = ConstantVector::getSplat(NumElts, C);
9192  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
9193  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
9194  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
9195  MachinePointerInfo::getConstantPool(),
9196  false, false, false, Alignment);
9197  if (VT.isVector()) {
9198  MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
9199  return DAG.getNode(ISD::BITCAST, dl, VT,
9200  DAG.getNode(ISD::AND, dl, ANDVT,
9201  DAG.getNode(ISD::BITCAST, dl, ANDVT,
9202  Op.getOperand(0)),
9203  DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask)));
9204  }
9205  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
9206 }
9207 
9208 SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
9209  LLVMContext *Context = DAG.getContext();
9210  SDLoc dl(Op);
9211  MVT VT = Op.getSimpleValueType();
9212  MVT EltVT = VT;
9213  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
9214  if (VT.isVector()) {
9215  EltVT = VT.getVectorElementType();
9216  NumElts = VT.getVectorNumElements();
9217  }
9218  Constant *C;
9219  if (EltVT == MVT::f64)
9220  C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
9221  APInt(64, 1ULL << 63)));
9222  else
9223  C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
9224  APInt(32, 1U << 31)));
9225  C = ConstantVector::getSplat(NumElts, C);
9226  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
9227  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
9228  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
9229  MachinePointerInfo::getConstantPool(),
9230  false, false, false, Alignment);
9231  if (VT.isVector()) {
9232  MVT XORVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits()/64);
9233  return DAG.getNode(ISD::BITCAST, dl, VT,
9234  DAG.getNode(ISD::XOR, dl, XORVT,
9235  DAG.getNode(ISD::BITCAST, dl, XORVT,
9236  Op.getOperand(0)),
9237  DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
9238  }
9239 
9240  return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
9241 }
9242 
9243 SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
9244  LLVMContext *Context = DAG.getContext();
9245  SDValue Op0 = Op.getOperand(0);
9246  SDValue Op1 = Op.getOperand(1);
9247  SDLoc dl(Op);
9248  MVT VT = Op.getSimpleValueType();
9249  MVT SrcVT = Op1.getSimpleValueType();
9250 
9251  // If second operand is smaller, extend it first.
9252  if (SrcVT.bitsLT(VT)) {
9253  Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
9254  SrcVT = VT;
9255  }
9256  // And if it is bigger, shrink it first.
9257  if (SrcVT.bitsGT(VT)) {
9258  Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
9259  SrcVT = VT;
9260  }
9261 
9262  // At this point the operands and the result should have the same
9263  // type, and that won't be f80 since that is not custom lowered.
9264 
9265  // First get the sign bit of second operand.
9266  SmallVector<Constant*,4> CV;
9267  if (SrcVT == MVT::f64) {
9268  const fltSemantics &Sem = APFloat::IEEEdouble;
9269  CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 1ULL << 63))));
9270  CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
9271  } else {
9272  const fltSemantics &Sem = APFloat::IEEEsingle;
9273  CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 1U << 31))));
9274  CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
9275  CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
9276  CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
9277  }
9278  Constant *C = ConstantVector::get(CV);
9279  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
9280  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
9281  MachinePointerInfo::getConstantPool(),
9282  false, false, false, 16);
9283  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
9284 
9285  // Shift sign bit right or left if the two operands have different types.
9286  if (SrcVT.bitsGT(VT)) {
9287  // Op0 is MVT::f32, Op1 is MVT::f64.
9288  SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
9289  SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
9290  DAG.getConstant(32, MVT::i32));
9291  SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
9292  SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
9293  DAG.getIntPtrConstant(0));
9294  }
9295 
9296  // Clear first operand sign bit.
9297  CV.clear();
9298  if (VT == MVT::f64) {
9299  const fltSemantics &Sem = APFloat::IEEEdouble;
9300  CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
9301  APInt(64, ~(1ULL << 63)))));
9302  CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
9303  } else {
9304  const fltSemantics &Sem = APFloat::IEEEsingle;
9305  CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
9306  APInt(32, ~(1U << 31)))));
9307  CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
9308  CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
9309  CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
9310  }
9311  C = ConstantVector::get(CV);
9312  CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
9313  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
9314  MachinePointerInfo::getConstantPool(),
9315  false, false, false, 16);
9316  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
9317 
9318  // Or the value with the sign bit.
9319  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
9320 }
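LowerFCOPYSIGN above is the vector/constant-pool version of the usual bit-mask recipe for copysign. A minimal scalar sketch, assuming IEEE-754 doubles (the helper name is illustrative):

#include <cstdint>
#include <cstring>

// Clear the sign bit of the magnitude operand, keep only the sign bit of
// the sign operand, and OR the two together.
static double copysign_model(double mag, double sgn) {
  uint64_t m, s;
  std::memcpy(&m, &mag, sizeof m);
  std::memcpy(&s, &sgn, sizeof s);
  uint64_t r = (m & ~(1ULL << 63)) | (s & (1ULL << 63));
  double out;
  std::memcpy(&out, &r, sizeof out);
  return out;
}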
9321 
9322 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
9323  SDValue N0 = Op.getOperand(0);
9324  SDLoc dl(Op);
9325  MVT VT = Op.getSimpleValueType();
9326 
9327  // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
9328  SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
9329  DAG.getConstant(1, VT));
9330  return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
9331 }
9332 
9333 // LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
9334 //
9335 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
9336  SelectionDAG &DAG) {
9337  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
9338 
9339  if (!Subtarget->hasSSE41())
9340  return SDValue();
9341 
9342  if (!Op->hasOneUse())
9343  return SDValue();
9344 
9345  SDNode *N = Op.getNode();
9346  SDLoc DL(N);
9347 
9348  SmallVector<SDValue, 8> Opnds;
9349  DenseMap<SDValue, unsigned> VecInMap;
9350  EVT VT = MVT::Other;
9351 
9352  // Recognize a special case where a vector is cast into a wide integer to
9353  // test all 0s.
9354  Opnds.push_back(N->getOperand(0));
9355  Opnds.push_back(N->getOperand(1));
9356 
9357  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
9358  SDValue I = Opnds[Slot];
9359  // BFS traverse all OR'd operands.
9360  if (I->getOpcode() == ISD::OR) {
9361  Opnds.push_back(I->getOperand(0));
9362  Opnds.push_back(I->getOperand(1));
9363  // Re-evaluate the number of nodes to be traversed.
9364  e += 2; // 2 more nodes (LHS and RHS) are pushed.
9365  continue;
9366  }
9367 
9368  // Quit on a non-EXTRACT_VECTOR_ELT operand.
9369  if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9370  return SDValue();
9371 
9372  // Quit if the index is not a constant.
9373  SDValue Idx = I->getOperand(1);
9374  if (!isa<ConstantSDNode>(Idx))
9375  return SDValue();
9376 
9377  SDValue ExtractedFromVec = I->getOperand(0);
9378  DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
9379  if (M == VecInMap.end()) {
9380  VT = ExtractedFromVec.getValueType();
9381  // Quit if not 128/256-bit vector.
9382  if (!VT.is128BitVector() && !VT.is256BitVector())
9383  return SDValue();
9384  // Quit if not the same type.
9385  if (VecInMap.begin() != VecInMap.end() &&
9386  VT != VecInMap.begin()->first.getValueType())
9387  return SDValue();
9388  M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
9389  }
9390  M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
9391  }
9392 
9393  assert((VT.is128BitVector() || VT.is256BitVector()) &&
9394  "Not extracted from 128-/256-bit vector.");
9395 
9396  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
9397  SmallVector<SDValue, 8> VecIns;
9398 
9399  for (DenseMap<SDValue, unsigned>::const_iterator
9400  I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
9401  // Quit if not all elements are used.
9402  if (I->second != FullMask)
9403  return SDValue();
9404  VecIns.push_back(I->first);
9405  }
9406 
9407  EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
9408 
9409  // Cast all vectors into TestVT for PTEST.
9410  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
9411  VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
9412 
9413  // If more than one full vector is evaluated, OR them first before PTEST.
9414  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
9415  // Each iteration will OR 2 nodes and append the result until there is only
9416  // 1 node left, i.e. the final OR'd value of all vectors.
9417  SDValue LHS = VecIns[Slot];
9418  SDValue RHS = VecIns[Slot + 1];
9419  VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
9420  }
9421 
9422  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
9423  VecIns.back(), VecIns.back());
9424 }
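The end result of the transformation above is a single PTEST instead of OR-ing the extracted scalars and comparing against zero. A hedged intrinsics sketch of the same idea (the function name is illustrative):

#include <smmintrin.h>  // SSE4.1

// _mm_testz_si128(v, v) returns 1 exactly when v is all zeros, so OR-ing the
// source vectors and PTEST-ing the result checks that every element is zero.
static int all_zero_after_or(__m128i a, __m128i b) {
  __m128i v = _mm_or_si128(a, b);
  return _mm_testz_si128(v, v);
}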
9425 
9426 /// Emit nodes that will be selected as "test Op0,Op0", or something
9427 /// equivalent.
9428 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
9429  SelectionDAG &DAG) const {
9430  SDLoc dl(Op);
9431 
9432  // CF and OF aren't always set the way we want. Determine which
9433  // of these we need.
9434  bool NeedCF = false;
9435  bool NeedOF = false;
9436  switch (X86CC) {
9437  default: break;
9438  case X86::COND_A: case X86::COND_AE:
9439  case X86::COND_B: case X86::COND_BE:
9440  NeedCF = true;
9441  break;
9442  case X86::COND_G: case X86::COND_GE:
9443  case X86::COND_L: case X86::COND_LE:
9444  case X86::COND_O: case X86::COND_NO:
9445  NeedOF = true;
9446  break;
9447  }
9448 
9449  // See if we can use the EFLAGS value from the operand instead of
9450  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
9451  // we prove that the arithmetic won't overflow, we can't use OF or CF.
9452  if (Op.getResNo() != 0 || NeedOF || NeedCF)
9453  // Emit a CMP with 0, which is the TEST pattern.
9454  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
9455  DAG.getConstant(0, Op.getValueType()));
9456 
9457  unsigned Opcode = 0;
9458  unsigned NumOperands = 0;
9459 
9460  // Truncate operations may prevent the merge of the SETCC instruction
9461  // and the arithmetic instruction before it. Attempt to truncate the operands
9462  // of the arithmetic instruction and use a reduced bit-width instruction.
9463  bool NeedTruncation = false;
9464  SDValue ArithOp = Op;
9465  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
9466  SDValue Arith = Op->getOperand(0);
9467  // Both the trunc and the arithmetic op need to have one user each.
9468  if (Arith->hasOneUse())
9469  switch (Arith.getOpcode()) {
9470  default: break;
9471  case ISD::ADD:
9472  case ISD::SUB:
9473  case ISD::AND:
9474  case ISD::OR:
9475  case ISD::XOR: {
9476  NeedTruncation = true;
9477  ArithOp = Arith;
9478  }
9479  }
9480  }
9481 
9482  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
9483  // which may be the result of a CAST. We use the variable 'Op', which is the
9484  // non-casted variable when we check for possible users.
9485  switch (ArithOp.getOpcode()) {
9486  case ISD::ADD:
9487  // Due to an isel shortcoming, be conservative if this add is likely to be
9488  // selected as part of a load-modify-store instruction. When the root node
9489  // in a match is a store, isel doesn't know how to remap non-chain non-flag
9490  // uses of other nodes in the match, such as the ADD in this case. This
9491  // leads to the ADD being left around and reselected, with the result being
9492  // two adds in the output. Alas, even if none of our users are stores, that
9493  // doesn't prove we're O.K. Ergo, if we have any parents that aren't
9494  // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
9495  // climbing the DAG back to the root, and it doesn't seem to be worth the
9496  // effort.
9497  for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
9498  UE = Op.getNode()->use_end(); UI != UE; ++UI)
9499  if (UI->getOpcode() != ISD::CopyToReg &&
9500  UI->getOpcode() != ISD::SETCC &&
9501  UI->getOpcode() != ISD::STORE)
9502  goto default_case;
9503 
9504  if (ConstantSDNode *C =
9505  dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
9506  // An add of one will be selected as an INC.
9507  if (C->getAPIntValue() == 1) {
9508  Opcode = X86ISD::INC;
9509  NumOperands = 1;
9510  break;
9511  }
9512 
9513  // An add of negative one (subtract of one) will be selected as a DEC.
9514  if (C->getAPIntValue().isAllOnesValue()) {
9515  Opcode = X86ISD::DEC;
9516  NumOperands = 1;
9517  break;
9518  }
9519  }
9520 
9521  // Otherwise use a regular EFLAGS-setting add.
9522  Opcode = X86ISD::ADD;
9523  NumOperands = 2;
9524  break;
9525  case ISD::AND: {
9526  // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
9527  // because a TEST instruction will be better.
9528  bool NonFlagUse = false;
9529  for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
9530  UE = Op.getNode()->use_end(); UI != UE; ++UI) {
9531  SDNode *User = *UI;
9532  unsigned UOpNo = UI.getOperandNo();
9533  if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
9534  // Look past the truncate.
9535  UOpNo = User->use_begin().getOperandNo();
9536  User = *User->use_begin();
9537  }
9538 
9539  if (User->getOpcode() != ISD::BRCOND &&
9540  User->getOpcode() != ISD::SETCC &&
9541  !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) {
9542  NonFlagUse = true;
9543  break;
9544  }
9545  }
9546 
9547  if (!NonFlagUse)
9548  break;
9549  }
9550  // FALL THROUGH
9551  case ISD::SUB:
9552  case ISD::OR:
9553  case ISD::XOR:
9554  // Due to the ISEL shortcoming noted above, be conservative if this op is
9555  // likely to be selected as part of a load-modify-store instruction.
9556  for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
9557  UE = Op.getNode()->use_end(); UI != UE; ++UI)
9558  if (UI->getOpcode() == ISD::STORE)
9559  goto default_case;
9560 
9561  // Otherwise use a regular EFLAGS-setting instruction.
9562  switch (ArithOp.getOpcode()) {
9563  default: llvm_unreachable("unexpected operator!");
9564  case ISD::SUB: Opcode = X86ISD::SUB; break;
9565  case ISD::XOR: Opcode = X86ISD::XOR; break;
9566  case ISD::AND: Opcode = X86ISD::AND; break;
9567  case ISD::OR: {
9568  if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
9569  SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
9570  if (EFLAGS.getNode())
9571  return EFLAGS;
9572  }
9573  Opcode = X86ISD::OR;
9574  break;
9575  }
9576  }
9577 
9578  NumOperands = 2;
9579  break;
9580  case X86ISD::ADD:
9581  case X86ISD::SUB:
9582  case X86ISD::INC:
9583  case X86ISD::DEC:
9584  case X86ISD::OR:
9585  case X86ISD::XOR:
9586  case X86ISD::AND:
9587  return SDValue(Op.getNode(), 1);
9588  default:
9589  default_case:
9590  break;
9591  }
9592 
9593  // If we found that truncation is beneficial, perform the truncation and
9594  // update 'Op'.
9595  if (NeedTruncation) {
9596  EVT VT = Op.getValueType();
9597  SDValue WideVal = Op->getOperand(0);
9598  EVT WideVT = WideVal.getValueType();
9599  unsigned ConvertedOp = 0;
9600  // Use a target machine opcode to prevent further DAGCombine
9601  // optimizations that may separate the arithmetic operations
9602  // from the setcc node.
9603  switch (WideVal.getOpcode()) {
9604  default: break;
9605  case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
9606  case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
9607  case ISD::AND: ConvertedOp = X86ISD::AND; break;
9608  case ISD::OR: ConvertedOp = X86ISD::OR; break;
9609  case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
9610  }
9611 
9612  if (ConvertedOp) {
9613  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9614  if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
9615  SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
9616  SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
9617  Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
9618  }
9619  }
9620  }
9621 
9622  if (Opcode == 0)
9623  // Emit a CMP with 0, which is the TEST pattern.
9624  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
9625  DAG.getConstant(0, Op.getValueType()));
9626 
9627  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
9628  SmallVector<SDValue, 4> Ops;
9629  for (unsigned i = 0; i != NumOperands; ++i)
9630  Ops.push_back(Op.getOperand(i));
9631 
9632  SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
9633  DAG.ReplaceAllUsesWith(Op, New);
9634  return SDValue(New.getNode(), 1);
9635 }
9636 
9637 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
9638 /// equivalent.
9639 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
9640  SelectionDAG &DAG) const {
9641  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
9642  if (C->getAPIntValue() == 0)
9643  return EmitTest(Op0, X86CC, DAG);
9644 
9645  SDLoc dl(Op0);
9646  if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
9647  Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
9648  // Use SUB instead of CMP to enable CSE between SUB and CMP.
9649  SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
9650  SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
9651  Op0, Op1);
9652  return SDValue(Sub.getNode(), 1);
9653  }
9654  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
9655 }
9656 
9657 /// Convert a comparison if required by the subtarget.
9658 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
9659  SelectionDAG &DAG) const {
9660  // If the subtarget does not support the FUCOMI instruction, floating-point
9661  // comparisons have to be converted.
9662  if (Subtarget->hasCMov() ||
9663  Cmp.getOpcode() != X86ISD::CMP ||
9664  !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
9665  !Cmp.getOperand(1).getValueType().isFloatingPoint())
9666  return Cmp;
9667 
9668  // The instruction selector will select an FUCOM instruction instead of
9669  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
9670  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
9671  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
9672  SDLoc dl(Cmp);
9673  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
9674  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
9675  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
9676  DAG.getConstant(8, MVT::i8));
9677  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
9678  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
9679 }
9680 
9681 static bool isAllOnes(SDValue V) {
9682  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
9683  return C && C->isAllOnesValue();
9684 }
9685 
9686 /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
9687 /// if it's possible.
9688 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
9689  SDLoc dl, SelectionDAG &DAG) const {
9690  SDValue Op0 = And.getOperand(0);
9691  SDValue Op1 = And.getOperand(1);
9692  if (Op0.getOpcode() == ISD::TRUNCATE)
9693  Op0 = Op0.getOperand(0);
9694  if (Op1.getOpcode() == ISD::TRUNCATE)
9695  Op1 = Op1.getOperand(0);
9696 
9697  SDValue LHS, RHS;
9698  if (Op1.getOpcode() == ISD::SHL)
9699  std::swap(Op0, Op1);
9700  if (Op0.getOpcode() == ISD::SHL) {
9701  if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
9702  if (And00C->getZExtValue() == 1) {
9703  // If we looked past a truncate, check that it's only truncating away
9704  // known zeros.
9705  unsigned BitWidth = Op0.getValueSizeInBits();
9706  unsigned AndBitWidth = And.getValueSizeInBits();
9707  if (BitWidth > AndBitWidth) {
9708  APInt Zeros, Ones;
9709  DAG.ComputeMaskedBits(Op0, Zeros, Ones);
9710  if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
9711  return SDValue();
9712  }
9713  LHS = Op1;
9714  RHS = Op0.getOperand(1);
9715  }
9716  } else if (Op1.getOpcode() == ISD::Constant) {
9717  ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
9718  uint64_t AndRHSVal = AndRHS->getZExtValue();
9719  SDValue AndLHS = Op0;
9720 
9721  if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
9722  LHS = AndLHS.getOperand(0);
9723  RHS = AndLHS.getOperand(1);
9724  }
9725 
9726  // Use BT if the immediate can't be encoded in a TEST instruction.
9727  if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
9728  LHS = AndLHS;
9729  RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
9730  }
9731  }
9732 
9733  if (LHS.getNode()) {
9734  // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT
9735  // instruction. Since the shift amount is in-range-or-undefined, we know
9736  // that doing a bittest on the i32 value is ok. We extend to i32 because
9737  // the encoding for the i16 version is larger than the i32 version.
9738  // Also promote i16 to i32 for performance / code size reasons.
9739  if (LHS.getValueType() == MVT::i8 ||
9740  LHS.getValueType() == MVT::i16)
9741  LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
9742 
9743  // If the operand types disagree, extend the shift amount to match. Since
9744  // BT ignores high bits (like shifts) we can use anyextend.
9745  if (LHS.getValueType() != RHS.getValueType())
9746  RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
9747 
9748  SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
9749  unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
9750  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
9751  DAG.getConstant(Cond, MVT::i8), BT);
9752  }
9753 
9754  return SDValue();
9755 }
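The patterns LowerToBT recognizes all test a single bit, which is exactly what BT computes into CF. A small illustrative sketch of the source-level shapes involved (the function names are made up):

#include <cstdint>

static bool bit_is_clear(uint64_t x, unsigned n) {
  return (x & (1ULL << n)) == 0;   // (X & (1 << N)) == 0  ->  BT; use COND_AE
}
static bool bit_is_set(uint64_t x, unsigned n) {
  return ((x >> n) & 1) != 0;      // ((X >> N) & 1) != 0  ->  BT; use COND_B
}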
9756 
9757 /// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point
9758 /// mask CMPs.
9759 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
9760  SDValue &Op1) {
9761  unsigned SSECC;
9762  bool Swap = false;
9763 
9764  // SSE Condition code mapping:
9765  // 0 - EQ
9766  // 1 - LT
9767  // 2 - LE
9768  // 3 - UNORD
9769  // 4 - NEQ
9770  // 5 - NLT
9771  // 6 - NLE
9772  // 7 - ORD
9773  switch (SetCCOpcode) {
9774  default: llvm_unreachable("Unexpected SETCC condition");
9775  case ISD::SETOEQ:
9776  case ISD::SETEQ: SSECC = 0; break;
9777  case ISD::SETOGT:
9778  case ISD::SETGT: Swap = true; // Fallthrough
9779  case ISD::SETLT:
9780  case ISD::SETOLT: SSECC = 1; break;
9781  case ISD::SETOGE:
9782  case ISD::SETGE: Swap = true; // Fallthrough
9783  case ISD::SETLE:
9784  case ISD::SETOLE: SSECC = 2; break;
9785  case ISD::SETUO: SSECC = 3; break;
9786  case ISD::SETUNE:
9787  case ISD::SETNE: SSECC = 4; break;
9788  case ISD::SETULE: Swap = true; // Fallthrough
9789  case ISD::SETUGE: SSECC = 5; break;
9790  case ISD::SETULT: Swap = true; // Fallthrough
9791  case ISD::SETUGT: SSECC = 6; break;
9792  case ISD::SETO: SSECC = 7; break;
9793  case ISD::SETUEQ:
9794  case ISD::SETONE: SSECC = 8; break;
9795  }
9796  if (Swap)
9797  std::swap(Op0, Op1);
9798 
9799  return SSECC;
9800 }
9801 
9802 // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
9803 // ones, and then concatenate the result back.
9804 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
9805  MVT VT = Op.getSimpleValueType();
9806 
9807  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
9808  "Unsupported value type for operation");
9809 
9810  unsigned NumElems = VT.getVectorNumElements();
9811  SDLoc dl(Op);
9812  SDValue CC = Op.getOperand(2);
9813 
9814  // Extract the LHS vectors
9815  SDValue LHS = Op.getOperand(0);
9816  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
9817  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
9818 
9819  // Extract the RHS vectors
9820  SDValue RHS = Op.getOperand(1);
9821  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
9822  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
9823 
9824  // Issue the operation on the smaller types and concatenate the result back
9825  MVT EltVT = VT.getVectorElementType();
9826  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
9827  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
9828  DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
9829  DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
9830 }
9831 
9832 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
9833  SDValue Op0 = Op.getOperand(0);
9834  SDValue Op1 = Op.getOperand(1);
9835  SDValue CC = Op.getOperand(2);
9836  MVT VT = Op.getSimpleValueType();
9837 
9838  assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 32 &&
9839  Op.getValueType().getScalarType() == MVT::i1 &&
9840  "Cannot set masked compare for this operation");
9841 
9842  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
9843  SDLoc dl(Op);
9844 
9845  bool Unsigned = false;
9846  unsigned SSECC;
9847  switch (SetCCOpcode) {
9848  default: llvm_unreachable("Unexpected SETCC condition");
9849  case ISD::SETNE: SSECC = 4; break;
9850  case ISD::SETEQ: SSECC = 0; break;
9851  case ISD::SETUGT: Unsigned = true;
9852  case ISD::SETGT: SSECC = 6; break; // NLE
9853  case ISD::SETULT: Unsigned = true;
9854  case ISD::SETLT: SSECC = 1; break;
9855  case ISD::SETUGE: Unsigned = true;
9856  case ISD::SETGE: SSECC = 5; break; // NLT
9857  case ISD::SETULE: Unsigned = true;
9858  case ISD::SETLE: SSECC = 2; break;
9859  }
9860  unsigned Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
9861  return DAG.getNode(Opc, dl, VT, Op0, Op1,
9862  DAG.getConstant(SSECC, MVT::i8));
9863 
9864 }
9865 
9866 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
9867  SelectionDAG &DAG) {
9868  SDValue Op0 = Op.getOperand(0);
9869  SDValue Op1 = Op.getOperand(1);
9870  SDValue CC = Op.getOperand(2);
9871  MVT VT = Op.getSimpleValueType();
9872  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
9873  bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
9874  SDLoc dl(Op);
9875 
9876  if (isFP) {
9877 #ifndef NDEBUG
9878  EVT EltVT = Op0.getValueType().getVectorElementType();
9879  assert(EltVT == MVT::f32 || EltVT == MVT::f64);
9880 #endif
9881 
9882  unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
9883  unsigned Opc = X86ISD::CMPP;
9884  if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
9885  assert(VT.getVectorNumElements() <= 16);
9886  Opc = X86ISD::CMPM;
9887  }
9888  // In the two special cases we can't handle, emit two comparisons.
9889  if (SSECC == 8) {
9890  unsigned CC0, CC1;
9891  unsigned CombineOpc;
9892  if (SetCCOpcode == ISD::SETUEQ) {
9893  CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
9894  } else {
9895  assert(SetCCOpcode == ISD::SETONE);
9896  CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
9897  }
9898 
9899  SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
9900  DAG.getConstant(CC0, MVT::i8));
9901  SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
9902  DAG.getConstant(CC1, MVT::i8));
9903  return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
9904  }
9905  // Handle all other FP comparisons here.
9906  return DAG.getNode(Opc, dl, VT, Op0, Op1,
9907  DAG.getConstant(SSECC, MVT::i8));
9908  }
9909 
9910  // Break 256-bit integer vector compare into smaller ones.
9911  if (VT.is256BitVector() && !Subtarget->hasInt256())
9912  return Lower256IntVSETCC(Op, DAG);
9913 
9914  bool MaskResult = (VT.getVectorElementType() == MVT::i1);
9915  EVT OpVT = Op1.getValueType();
9916  if (Subtarget->hasAVX512()) {
9917  if (Op1.getValueType().is512BitVector() ||
9918  (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
9919  return LowerIntVSETCC_AVX512(Op, DAG);
9920 
9921  // In the AVX-512 architecture, setcc returns a mask with i1 elements,
9922  // but there is no compare instruction for i8 and i16 elements.
9923  // We are not dealing with 512-bit operands in this case; these
9924  // types are illegal.
9925  if (MaskResult &&
9926  (OpVT.getVectorElementType().getSizeInBits() < 32 &&
9927  OpVT.getVectorElementType().getSizeInBits() >= 8))
9928  return DAG.getNode(ISD::TRUNCATE, dl, VT,
9929  DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
9930  }
9931 
9932  // We are handling one of the integer comparisons here. Since SSE only has
9933  // GT and EQ comparisons for integers, swapping operands and multiple
9934  // operations may be required for some comparisons.
9935  unsigned Opc;
9936  bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
9937 
9938  switch (SetCCOpcode) {
9939  default: llvm_unreachable("Unexpected SETCC condition");
9940  case ISD::SETNE: Invert = true;
9941  case ISD::SETEQ: Opc = MaskResult? X86ISD::PCMPEQM: X86ISD::PCMPEQ; break;
9942  case ISD::SETLT: Swap = true;
9943  case ISD::SETGT: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT; break;
9944  case ISD::SETGE: Swap = true;
9945  case ISD::SETLE: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
9946  Invert = true; break;
9947  case ISD::SETULT: Swap = true;
9948  case ISD::SETUGT: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
9949  FlipSigns = true; break;
9950  case ISD::SETUGE: Swap = true;
9951  case ISD::SETULE: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
9952  FlipSigns = true; Invert = true; break;
9953  }
9954 
9955  // Special case: Use min/max operations for SETULE/SETUGE
9956  MVT VET = VT.getVectorElementType();
9957  bool hasMinMax =
9958  (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
9959  || (Subtarget->hasSSE2() && (VET == MVT::i8));
9960 
9961  if (hasMinMax) {
9962  switch (SetCCOpcode) {
9963  default: break;
9964  case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
9965  case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
9966  }
9967 
9968  if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
9969  }
9970 
9971  if (Swap)
9972  std::swap(Op0, Op1);
9973 
9974  // Check that the operation in question is available (most are plain SSE2,
9975  // but PCMPGTQ and PCMPEQQ have different requirements).
9976  if (VT == MVT::v2i64) {
9977  if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
9978  assert(Subtarget->hasSSE2() && "Don't know how to lower!");
9979 
9980  // First cast everything to the right type.
9981  Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
9982  Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
9983 
9984  // Since SSE has no unsigned integer comparisons, we need to flip the sign
9985  // bits of the inputs before performing those operations. The lower
9986  // compare is always unsigned.
9987  SDValue SB;
9988  if (FlipSigns) {
9989  SB = DAG.getConstant(0x80000000U, MVT::v4i32);
9990  } else {
9991  SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
9992  SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
9993  SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
9994  Sign, Zero, Sign, Zero);
9995  }
9996  Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
9997  Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
9998 
9999  // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
10000  SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
10001  SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
10002 
10003  // Create masks for only the low parts/high parts of the 64 bit integers.
10004  static const int MaskHi[] = { 1, 1, 3, 3 };
10005  static const int MaskLo[] = { 0, 0, 2, 2 };
10006  SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
10007  SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
10008  SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
10009 
10010  SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
10011  Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
10012 
10013  if (Invert)
10014  Result = DAG.getNOT(dl, Result, MVT::v4i32);
10015 
10016  return DAG.getNode(ISD::BITCAST, dl, VT, Result);
10017  }
10018 
10019  if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
10020  // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
10021  // pcmpeqd + pshufd + pand.
10022  assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
10023 
10024  // First cast everything to the right type.
10025  Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
10026  Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
10027 
10028  // Do the compare.
10029  SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
10030 
10031  // Make sure the lower and upper halves are both all-ones.
10032  static const int Mask[] = { 1, 0, 3, 2 };
10033  SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
10034  Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
10035 
10036  if (Invert)
10037  Result = DAG.getNOT(dl, Result, MVT::v4i32);
10038 
10039  return DAG.getNode(ISD::BITCAST, dl, VT, Result);
10040  }
10041  }
10042 
10043  // Since SSE has no unsigned integer comparisons, we need to flip the sign
10044  // bits of the inputs before performing those operations.
10045  if (FlipSigns) {
10046  EVT EltVT = VT.getVectorElementType();
10047  SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
10048  Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
10049  Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
10050  }
10051 
10052  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
10053 
10054  // If the logical-not of the result is required, perform that now.
10055  if (Invert)
10056  Result = DAG.getNOT(dl, Result, VT);
10057 
10058  if (MinMax)
10059  Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
10060 
10061  return Result;
10062 }
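Two scalar identities underlie the v2i64 fallbacks above: an unsigned compare is a signed compare after XOR-ing the sign bits, and a 64-bit greater-than can be assembled from 32-bit compares as (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)), with the low-half compare always unsigned. A hedged sketch (the function names are illustrative):

#include <cstdint>

static bool ugt32_via_signflip(uint32_t a, uint32_t b) {
  // Flipping the sign bit turns a signed > into an unsigned >.
  return static_cast<int32_t>(a ^ 0x80000000U) >
         static_cast<int32_t>(b ^ 0x80000000U);
}

static bool sgt64_from_halves(int32_t hi1, uint32_t lo1,
                              int32_t hi2, uint32_t lo2) {
  // Signed compare on the high halves, unsigned compare on the low halves.
  return (hi1 > hi2) || ((hi1 == hi2) && ugt32_via_signflip(lo1, lo2));
}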
10063 
10064 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
10065 
10066  MVT VT = Op.getSimpleValueType();
10067 
10068  if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
10069 
10070  assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
10071  SDValue Op0 = Op.getOperand(0);
10072  SDValue Op1 = Op.getOperand(1);
10073  SDLoc dl(Op);
10074  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
10075 
10076  // Optimize to BT if possible.
10077  // Lower (X & (1 << N)) == 0 to BT(X, N).
10078  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
10079  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
10080  if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
10081  Op1.getOpcode() == ISD::Constant &&
10082  cast<ConstantSDNode>(Op1)->isNullValue() &&
10083  (CC == ISD::SETEQ || CC == ISD::SETNE)) {
10084  SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
10085  if (NewSetCC.getNode())
10086  return NewSetCC;
10087  }
10088 
10089  // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
10090  // these.
10091  if (Op1.getOpcode() == ISD::Constant &&
10092  (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
10093  cast<ConstantSDNode>(Op1)->isNullValue()) &&
10094  (CC == ISD::SETEQ || CC == ISD::SETNE)) {
10095 
10096  // If the input is a setcc, then reuse the input setcc or use a new one with
10097  // the inverted condition.
10098  if (Op0.getOpcode() == X86ISD::SETCC) {
10100  bool Invert = (CC == ISD::SETNE) ^
10101  cast<ConstantSDNode>(Op1)->isNullValue();
10102  if (!Invert) return Op0;
10103 
10104  CCode = X86::GetOppositeBranchCondition(CCode);
10105  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
10106  DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
10107  }
10108  }
10109 
10110  bool isFP = Op1.getSimpleValueType().isFloatingPoint();
10111  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
10112  if (X86CC == X86::COND_INVALID)
10113  return SDValue();
10114 
10115  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
10116  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
10117  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
10118  DAG.getConstant(X86CC, MVT::i8), EFLAGS);
10119 }
10120 
10121 // isX86LogicalCmp - Return true if opcode is an X86 logical comparison.
10122 static bool isX86LogicalCmp(SDValue Op) {
10123  unsigned Opc = Op.getNode()->getOpcode();
10124  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
10125  Opc == X86ISD::SAHF)
10126  return true;
10127  if (Op.getResNo() == 1 &&
10128  (Opc == X86ISD::ADD ||
10129  Opc == X86ISD::SUB ||
10130  Opc == X86ISD::ADC ||
10131  Opc == X86ISD::SBB ||
10132  Opc == X86ISD::SMUL ||
10133  Opc == X86ISD::UMUL ||
10134  Opc == X86ISD::INC ||
10135  Opc == X86ISD::DEC ||
10136  Opc == X86ISD::OR ||
10137  Opc == X86ISD::XOR ||
10138  Opc == X86ISD::AND))
10139  return true;
10140 
10141  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
10142  return true;
10143 
10144  return false;
10145 }
10146 
10147 static bool isZero(SDValue V) {
10148  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
10149  return C && C->isNullValue();
10150 }
10151 
10152 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
10153  if (V.getOpcode() != ISD::TRUNCATE)
10154  return false;
10155 
10156  SDValue VOp0 = V.getOperand(0);
10157  unsigned InBits = VOp0.getValueSizeInBits();
10158  unsigned Bits = V.getValueSizeInBits();
10159  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
10160 }
10161 
10162 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10163  bool addTest = true;
10164  SDValue Cond = Op.getOperand(0);
10165  SDValue Op1 = Op.getOperand(1);
10166  SDValue Op2 = Op.getOperand(2);
10167  SDLoc DL(Op);
10168  EVT VT = Op1.getValueType();
10169  SDValue CC;
10170 
10171  // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
10172  // are available. Otherwise fp cmovs get lowered into a less efficient branch
10173  // sequence later on.
10174  if (Cond.getOpcode() == ISD::SETCC &&
10175  ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
10176  (Subtarget->hasSSE1() && VT == MVT::f32)) &&
10177  VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
10178  SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
10179  int SSECC = translateX86FSETCC(
10180  cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
10181 
10182  if (SSECC != 8) {
10183  unsigned Opcode = VT == MVT::f32 ? X86ISD::FSETCCss : X86ISD::FSETCCsd;
10184  SDValue Cmp = DAG.getNode(Opcode, DL, VT, CondOp0, CondOp1,
10185  DAG.getConstant(SSECC, MVT::i8));
10186  SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
10187  SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
10188  return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
10189  }
10190  }
10191 
10192  if (Cond.getOpcode() == ISD::SETCC) {
10193  SDValue NewCond = LowerSETCC(Cond, DAG);
10194  if (NewCond.getNode())
10195  Cond = NewCond;
10196  }
10197 
10198  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
10199  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
10200  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
10201  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
10202  if (Cond.getOpcode() == X86ISD::SETCC &&
10203  Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
10204  isZero(Cond.getOperand(1).getOperand(1))) {
10205  SDValue Cmp = Cond.getOperand(1);
10206 
10207  unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
10208 
10209  if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
10210  (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
10211  SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
10212 
10213  SDValue CmpOp0 = Cmp.getOperand(0);
10214  // Apply further optimizations for special cases
10215  // (select (x != 0), -1, 0) -> neg & sbb
10216  // (select (x == 0), 0, -1) -> neg & sbb
10217  if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
10218  if (YC->isNullValue() &&
10219  (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
10220  SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
10221  SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
10222  DAG.getConstant(0, CmpOp0.getValueType()),
10223  CmpOp0);
10224  SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
10225  DAG.getConstant(X86::COND_B, MVT::i8),
10226  SDValue(Neg.getNode(), 1));
10227  return Res;
10228  }
10229 
10230  Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
10231  CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
10232  Cmp = ConvertCmpIfNecessary(Cmp, DAG);
10233 
10234  SDValue Res = // Res = 0 or -1.
10235  DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
10236  DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
10237 
10238  if (isAllOnes(Op1) != (CondCode == X86::COND_E))
10239  Res = DAG.getNOT(DL, Res, Res.getValueType());
10240 
10241  ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
10242  if (N2C == 0 || !N2C->isNullValue())
10243  Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
10244  return Res;
10245  }
10246  }
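// Illustrative codegen for the neg & sbb special case above, e.g.
// (select (x != 0), -1, 0), with hypothetical registers (Intel syntax):
//
//   neg  eax              ; CF = (x != 0)
//   sbb  eax, eax         ; eax = CF ? -1 : 0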
10247 
10248  // Look past (and (setcc_carry (cmp ...)), 1).
10249  if (Cond.getOpcode() == ISD::AND &&
10250  Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
10251  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
10252  if (C && C->getAPIntValue() == 1)
10253  Cond = Cond.getOperand(0);
10254  }
10255 
10256  // If condition flag is set by a X86ISD::CMP, then use it as the condition
10257  // setting operand in place of the X86ISD::SETCC.
10258  unsigned CondOpcode = Cond.getOpcode();
10259  if (CondOpcode == X86ISD::SETCC ||
10260  CondOpcode == X86ISD::SETCC_CARRY) {
10261  CC = Cond.getOperand(0);
10262 
10263  SDValue Cmp = Cond.getOperand(1);
10264  unsigned Opc = Cmp.getOpcode();
10265  MVT VT = Op.getSimpleValueType();
10266 
10267  bool IllegalFPCMov = false;
10268  if (VT.isFloatingPoint() && !VT.isVector() &&
10269  !isScalarFPTypeInSSEReg(VT)) // FPStack?
10270  IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
10271 
10272  if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
10273  Opc == X86ISD::BT) { // FIXME
10274  Cond = Cmp;
10275  addTest = false;
10276  }
10277  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
10278  CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
10279  ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
10280  Cond.getOperand(0).getValueType() != MVT::i8)) {
10281  SDValue LHS = Cond.getOperand(0);
10282  SDValue RHS = Cond.getOperand(1);
10283  unsigned X86Opcode;
10284  unsigned X86Cond;
10285  SDVTList VTs;
10286  switch (CondOpcode) {
10287  case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
10288  case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
10289  case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
10290  case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
10291  case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
10292  case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
10293  default: llvm_unreachable("unexpected overflowing operator");
10294  }
10295  if (CondOpcode == ISD::UMULO)
10296  VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
10297  MVT::i32);
10298  else
10299  VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
10300 
10301  SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
10302 
10303  if (CondOpcode == ISD::UMULO)
10304  Cond = X86Op.getValue(2);
10305  else
10306  Cond = X86Op.getValue(1);
10307 
10308  CC = DAG.getConstant(X86Cond, MVT::i8);
10309  addTest = false;
10310  }
10311 
10312  if (addTest) {
10313  // Look past the truncate if the high bits are known zero.
10314  if (isTruncWithZeroHighBitsInput(Cond, DAG))
10315  Cond = Cond.getOperand(0);
10316 
10317  // We know the result of AND is compared against zero. Try to match
10318  // it to BT.
10319  if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
10320  SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
10321  if (NewSetCC.getNode()) {
10322  CC = NewSetCC.getOperand(0);
10323  Cond = NewSetCC.getOperand(1);
10324  addTest = false;
10325  }
10326  }
10327  }
10328 
10329  if (addTest) {
10330  CC = DAG.getConstant(X86::COND_NE, MVT::i8);
10331  Cond = EmitTest(Cond, X86::COND_NE, DAG);
10332  }
10333 
10334  // a < b ? -1 : 0 -> RES = ~setcc_carry
10335  // a < b ? 0 : -1 -> RES = setcc_carry
10336  // a >= b ? -1 : 0 -> RES = setcc_carry
10337  // a >= b ? 0 : -1 -> RES = ~setcc_carry
10338  if (Cond.getOpcode() == X86ISD::SUB) {
10339  Cond = ConvertCmpIfNecessary(Cond, DAG);
10340  unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
10341 
10342  if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
10343  (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
10344  SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
10345  DAG.getConstant(X86::COND_B, MVT::i8), Cond);
10346  if (isAllOnes(Op1) != (CondCode == X86::COND_B))
10347  return DAG.getNOT(DL, Res, Res.getValueType());
10348  return Res;
10349  }
10350  }
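// Illustrative codegen for "a <u b ? -1 : 0" via SETCC_CARRY
// (hypothetical registers, Intel syntax):
//
//   cmp  eax, ebx         ; CF = (a <u b)
//   sbb  ecx, ecx         ; ecx = CF ? -1 : 0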
10351 
10352  // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
10353  // widen the cmov and push the truncate through. This avoids introducing a new
10354  // branch during isel and doesn't add any extensions.
10355  if (Op.getValueType() == MVT::i8 &&
10356  Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
10357  SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
10358  if (T1.getValueType() == T2.getValueType() &&
10359  // Blacklist CopyFromReg to avoid partial register stalls.
10360  T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
10361  SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
10362  SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
10363  return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
10364  }
10365  }
10366 
10367  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
10368  // condition is true.
10369  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
10370  SDValue Ops[] = { Op2, Op1, CC, Cond };
10371  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
10372 }
10373 
10374 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) {
10375  MVT VT = Op->getSimpleValueType(0);
10376  SDValue In = Op->getOperand(0);
10377  MVT InVT = In.getSimpleValueType();
10378  SDLoc dl(Op);
10379 
10380  unsigned int NumElts = VT.getVectorNumElements();
10381  if (NumElts != 8 && NumElts != 16)
10382  return SDValue();
10383 
10384  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
10385  return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
10386 
10387  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10388  assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
10389 
10390  MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
10391  Constant *C = ConstantInt::get(*DAG.getContext(),
10392  APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
10393 
10394  SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
10395  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
10396  SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
10397  MachinePointerInfo::getConstantPool(),
10398  false, false, false, Alignment);
10399  SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
10400  if (VT.is512BitVector())
10401  return Brcst;
10402  return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
10403 }
10404 
10405 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
10406  SelectionDAG &DAG) {
10407  MVT VT = Op->getSimpleValueType(0);
10408  SDValue In = Op->getOperand(0);
10409  MVT InVT = In.getSimpleValueType();
10410  SDLoc dl(Op);
10411 
10412  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
10413  return LowerSIGN_EXTEND_AVX512(Op, DAG);
10414 
10415  if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
10416  (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
10417  (VT != MVT::v16i16 || InVT != MVT::v16i8))
10418  return SDValue();
10419 
10420  if (Subtarget->hasInt256())
10421  return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, In);
10422 
10423  // Optimize vectors in AVX mode:
10424  // sign extend v8i16 to v8i32 and
10425  // v4i32 to v4i64.
10426  //
10427  // Divide the input vector into two parts
10428  // (for v4i32 the shuffle masks will be { 0, 1, -1, -1 } and { 2, 3, -1, -1 }),
10429  // use the vpmovsx instructions to extend v4i32 -> v2i64 and v8i16 -> v4i32,
10430  // then concatenate the halves back to the original VT.
10431 
10432  unsigned NumElems = InVT.getVectorNumElements();
10433  SDValue Undef = DAG.getUNDEF(InVT);
10434 
10435  SmallVector<int,8> ShufMask1(NumElems, -1);
10436  for (unsigned i = 0; i != NumElems/2; ++i)
10437  ShufMask1[i] = i;
10438 
10439  SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
10440 
10441  SmallVector<int,8> ShufMask2(NumElems, -1);
10442  for (unsigned i = 0; i != NumElems/2; ++i)
10443  ShufMask2[i] = i + NumElems/2;
10444 
10445  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
10446 
10447  MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
10448  VT.getVectorNumElements()/2);
10449 
10450  OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
10451  OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);
10452 
10453  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
10454 }
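// Rough intrinsics-level equivalent of the split above for v8i16 -> v8i32
// without AVX2 (an approximation of what the nodes select to, not this code):
//
//   __m128i lo   = In;                                                // elts 0..3 in low half
//   __m128i hi   = _mm_shuffle_epi32(In, _MM_SHUFFLE(3, 2, 3, 2));    // elts 4..7 in low half
//   __m128i lo32 = _mm_cvtepi16_epi32(lo);                            // vpmovsxwd
//   __m128i hi32 = _mm_cvtepi16_epi32(hi);
//   __m256i res  = _mm256_insertf128_si256(_mm256_castsi128_si256(lo32), hi32, 1);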
10455 
10456 // isAndOrOfSetCCs - Return true if node is an ISD::AND or
10457 // ISD::OR of two X86ISD::SETCC nodes, each of which has no other use apart
10458 // from the AND / OR.
10459 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
10460  Opc = Op.getOpcode();
10461  if (Opc != ISD::OR && Opc != ISD::AND)
10462  return false;
10463  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
10464  Op.getOperand(0).hasOneUse() &&
10465  Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
10466  Op.getOperand(1).hasOneUse());
10467 }
10468 
10469 // isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC and
10470 // the constant 1, and that the SETCC node has a single use.
10471 static bool isXor1OfSetCC(SDValue Op) {
10472  if (Op.getOpcode() != ISD::XOR)
10473  return false;
10474  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
10475  if (N1C && N1C->getAPIntValue() == 1) {
10476  return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
10477  Op.getOperand(0).hasOneUse();
10478  }
10479  return false;
10480 }
10481 
10482 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
10483  bool addTest = true;
10484  SDValue Chain = Op.getOperand(0);
10485  SDValue Cond = Op.getOperand(1);
10486  SDValue Dest = Op.getOperand(2);
10487  SDLoc dl(Op);
10488  SDValue CC;
10489  bool Inverted = false;
10490 
10491  if (Cond.getOpcode() == ISD::SETCC) {
10492  // Check for setcc([su]{add,sub,mul}o == 0).
10493  if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
10494  isa<ConstantSDNode>(Cond.getOperand(1)) &&
10495  cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
10496  Cond.getOperand(0).getResNo() == 1 &&
10497  (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
10498  Cond.getOperand(0).getOpcode() == ISD::UADDO ||
10499  Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
10500  Cond.getOperand(0).getOpcode() == ISD::USUBO ||
10501  Cond.getOperand(0).getOpcode() == ISD::SMULO ||
10502  Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
10503  Inverted = true;
10504  Cond = Cond.getOperand(0);
10505  } else {
10506  SDValue NewCond = LowerSETCC(Cond, DAG);
10507  if (NewCond.getNode())
10508  Cond = NewCond;
10509  }
10510  }
10511 #if 0
10512  // FIXME: LowerXALUO doesn't handle these!!
10513  else if (Cond.getOpcode() == X86ISD::ADD ||
10514  Cond.getOpcode() == X86ISD::SUB ||
10515  Cond.getOpcode() == X86ISD::SMUL ||
10516  Cond.getOpcode() == X86ISD::UMUL)
10517  Cond = LowerXALUO(Cond, DAG);
10518 #endif
10519 
10520  // Look past (and (setcc_carry (cmp ...)), 1).
10521  if (Cond.getOpcode() == ISD::AND &&
10522  Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
10523  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
10524  if (C && C->getAPIntValue() == 1)
10525  Cond = Cond.getOperand(0);
10526  }
10527 
10528  // If condition flag is set by a X86ISD::CMP, then use it as the condition
10529  // setting operand in place of the X86ISD::SETCC.
10530  unsigned CondOpcode = Cond.getOpcode();
10531  if (CondOpcode == X86ISD::SETCC ||
10532  CondOpcode == X86ISD::SETCC_CARRY) {
10533  CC = Cond.getOperand(0);
10534 
10535  SDValue Cmp = Cond.getOperand(1);
10536  unsigned Opc = Cmp.getOpcode();
10537  // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
10538  if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
10539  Cond = Cmp;
10540  addTest = false;
10541  } else {
10542  switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
10543  default: break;
10544  case X86::COND_O:
10545  case X86::COND_B:
10546  // These can only come from an arithmetic instruction with overflow,
10547  // e.g. SADDO, UADDO.
10548  Cond = Cond.getNode()->getOperand(1);
10549  addTest = false;
10550  break;
10551  }
10552  }
10553  }
10554  CondOpcode = Cond.getOpcode();
10555  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
10556  CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
10557  ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
10558  Cond.getOperand(0).getValueType() != MVT::i8)) {
10559  SDValue LHS = Cond.getOperand(0);
10560  SDValue RHS = Cond.getOperand(1);
10561  unsigned X86Opcode;
10562  unsigned X86Cond;
10563  SDVTList VTs;
10564  switch (CondOpcode) {
10565  case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
10566  case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
10567  case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
10568  case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
10569  case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
10570  case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
10571  default: llvm_unreachable("unexpected overflowing operator");
10572  }
10573  if (Inverted)
10574  X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
10575  if (CondOpcode == ISD::UMULO)
10576  VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
10577  MVT::i32);
10578  else
10579  VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
10580 
10581  SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
10582 
10583  if (CondOpcode == ISD::UMULO)
10584  Cond = X86Op.getValue(2);
10585  else
10586  Cond = X86Op.getValue(1);
10587 
10588  CC = DAG.getConstant(X86Cond, MVT::i8);
10589  addTest = false;
10590  } else {
10591  unsigned CondOpc;
10592  if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
10593  SDValue Cmp = Cond.getOperand(0).getOperand(1);
10594  if (CondOpc == ISD::OR) {
10595  // Also, recognize the pattern generated by an FCMP_UNE. We can emit
10596  // two branches instead of an explicit OR instruction with a
10597  // separate test.
10598  if (Cmp == Cond.getOperand(1).getOperand(1) &&
10599  isX86LogicalCmp(Cmp)) {
10600  CC = Cond.getOperand(0).getOperand(0);
10601  Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
10602  Chain, Dest, CC, Cmp);
10603  CC = Cond.getOperand(1).getOperand(0);
10604  Cond = Cmp;
10605  addTest = false;
10606  }
10607  } else { // ISD::AND
10608  // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
10609  // two branches instead of an explicit AND instruction with a
10610  // separate test. However, we only do this if this block doesn't
10611  // have a fall-through edge, because this requires an explicit
10612  // jmp when the condition is false.
10613  if (Cmp == Cond.getOperand(1).getOperand(1) &&
10614  isX86LogicalCmp(Cmp) &&
10615  Op.getNode()->hasOneUse()) {
10616  X86::CondCode CCode =
10617  (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
10618  CCode = X86::GetOppositeBranchCondition(CCode);
10619  CC = DAG.getConstant(CCode, MVT::i8);
10620  SDNode *User = *Op.getNode()->use_begin();
10621  // Look for an unconditional branch following this conditional branch.
10622  // We need this because we need to reverse the successors in order
10623  // to implement FCMP_OEQ.
10624  if (User->getOpcode() == ISD::BR) {
10625  SDValue FalseBB = User->getOperand(1);
10626  SDNode *NewBR =
10627  DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
10628  assert(NewBR == User);
10629  (void)NewBR;
10630  Dest = FalseBB;
10631 
10632  Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
10633  Chain, Dest, CC, Cmp);
10634  X86::CondCode CCode =
10635  (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
10636  CCode = X86::GetOppositeBranchCondition(CCode);
10637  CC = DAG.getConstant(CCode, MVT::i8);
10638  Cond = Cmp;
10639  addTest = false;
10640  }
10641  }
10642  }
10643  } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
10644  // Recognize the xorb (setcc), 1 pattern. The xor inverts the condition.
10645  // It should be transformed during DAG combining, except when the condition
10646  // is set by an arithmetic-with-overflow node.
10647  X86::CondCode CCode =
10648  (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
10649  CCode = X86::GetOppositeBranchCondition(CCode);
10650  CC = DAG.getConstant(CCode, MVT::i8);
10651  Cond = Cond.getOperand(0).getOperand(1);
10652  addTest = false;
10653  } else if (Cond.getOpcode() == ISD::SETCC &&
10654  cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
10655  // For FCMP_OEQ, we can emit
10656  // two branches instead of an explicit AND instruction with a
10657  // separate test. However, we only do this if this block doesn't
10658  // have a fall-through edge, because this requires an explicit
10659  // jmp when the condition is false.
10660  if (Op.getNode()->hasOneUse()) {
10661  SDNode *User = *Op.getNode()->use_begin();
10662  // Look for an unconditional branch following this conditional branch.
10663  // We need this because we need to reverse the successors in order
10664  // to implement FCMP_OEQ.
10665  if (User->getOpcode() == ISD::BR) {
10666  SDValue FalseBB = User->getOperand(1);
10667  SDNode *NewBR =
10668  DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
10669  assert(NewBR == User);
10670  (void)NewBR;
10671  Dest = FalseBB;
10672 
10673  SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
10674  Cond.getOperand(0), Cond.getOperand(1));
10675  Cmp = ConvertCmpIfNecessary(Cmp, DAG);
10676  CC = DAG.getConstant(X86::COND_NE, MVT::i8);
10677  Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
10678  Chain, Dest, CC, Cmp);
10679  CC = DAG.getConstant(X86::COND_P, MVT::i8);
10680  Cond = Cmp;
10681  addTest = false;
10682  }
10683  }
10684  } else if (Cond.getOpcode() == ISD::SETCC &&
10685  cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
10686  // For FCMP_UNE, we can emit
10687  // two branches instead of an explicit AND instruction with a
10688  // separate test. However, we only do this if this block doesn't
10689  // have a fall-through edge, because this requires an explicit
10690  // jmp when the condition is false.
10691  if (Op.getNode()->hasOneUse()) {
10692  SDNode *User = *Op.getNode()->use_begin();
10693  // Look for an unconditional branch following this conditional branch.
10694  // We need this because we need to reverse the successors in order
10695  // to implement FCMP_UNE.
10696  if (User->getOpcode() == ISD::BR) {
10697  SDValue FalseBB = User->getOperand(1);
10698  SDNode *NewBR =
10699  DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
10700  assert(NewBR == User);
10701  (void)NewBR;
10702 
10703  SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
10704  Cond.getOperand(0), Cond.getOperand(1));
10705  Cmp = ConvertCmpIfNecessary(Cmp, DAG);
10706  CC = DAG.getConstant(X86::COND_NE, MVT::i8);
10707  Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
10708  Chain, Dest, CC, Cmp);
10709  CC = DAG.getConstant(X86::COND_NP, MVT::i8);
10710  Cond = Cmp;
10711  addTest = false;
10712  Dest = FalseBB;
10713  }
10714  }
10715  }
10716  }
10717 
10718  if (addTest) {
10719  // Look past the truncate if the high bits are known zero.
10720  if (isTruncWithZeroHighBitsInput(Cond, DAG))
10721  Cond = Cond.getOperand(0);
10722 
10723  // We know the result of AND is compared against zero. Try to match
10724  // it to BT.
10725  if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
10726  SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
10727  if (NewSetCC.getNode()) {
10728  CC = NewSetCC.getOperand(0);
10729  Cond = NewSetCC.getOperand(1);
10730  addTest = false;
10731  }
10732  }
10733  }
10734 
10735  if (addTest) {
10736  CC = DAG.getConstant(X86::COND_NE, MVT::i8);
10737  Cond = EmitTest(Cond, X86::COND_NE, DAG);
10738  }
10739  Cond = ConvertCmpIfNecessary(Cond, DAG);
10740  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
10741  Chain, Dest, CC, Cond);
10742 }
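// Illustrative branch sequence for the FCMP_OEQ case above, i.e.
// "br (a == b), T, F" on floats (hypothetical labels, Intel syntax):
//
//   ucomiss xmm0, xmm1
//   jne     F             ; not equal         -> false successor
//   jp      F             ; unordered (NaN)   -> false successor
//   jmp     T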
10743 
10744 // Lower dynamic stack allocation to an _alloca call for Cygwin/MinGW targets.
10745 // Calls to _alloca are needed to probe the stack when allocating more than 4k
10746 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
10747 // that the guard pages used by the OS virtual memory manager are allocated in
10748 // correct sequence.
10749 SDValue
10750 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
10751  SelectionDAG &DAG) const {
10752  assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() ||
10753  getTargetMachine().Options.EnableSegmentedStacks) &&
10754  "This should be used only on Windows targets or when segmented stacks "
10755  "are being used");
10756  assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
10757  SDLoc dl(Op);
10758 
10759  // Get the inputs.
10760  SDValue Chain = Op.getOperand(0);
10761  SDValue Size = Op.getOperand(1);
10762  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
10763  EVT VT = Op.getNode()->getValueType(0);
10764 
10765  bool Is64Bit = Subtarget->is64Bit();
10766  EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
10767 
10768  if (getTargetMachine().Options.EnableSegmentedStacks) {
10769  MachineFunction &MF = DAG.getMachineFunction();
10770  MachineRegisterInfo &MRI = MF.getRegInfo();
10771 
10772  if (Is64Bit) {
10773  // The 64-bit implementation of segmented stacks needs to clobber both r10
10774  // and r11. This makes it impossible to use it along with nested parameters.
10775  const Function *F = MF.getFunction();
10776 
10777  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
10778  I != E; ++I)
10779  if (I->hasNestAttr())
10780  report_fatal_error("Cannot use segmented stacks with functions that "
10781  "have nested arguments.");
10782  }
10783 
10784  const TargetRegisterClass *AddrRegClass =
10785  getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32);
10786  unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
10787  Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
10788  SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
10789  DAG.getRegister(Vreg, SPTy));
10790  SDValue Ops1[2] = { Value, Chain };
10791  return DAG.getMergeValues(Ops1, 2, dl);
10792  } else {
10793  SDValue Flag;
10794  unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
10795 
10796  Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
10797  Flag = Chain.getValue(1);
10798  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
10799 
10800  Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
10801 
10802  const X86RegisterInfo *RegInfo =
10803  static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
10804  unsigned SPReg = RegInfo->getStackRegister();
10805  SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
10806  Chain = SP.getValue(1);
10807 
10808  if (Align) {
10809  SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
10810  DAG.getConstant(-(uint64_t)Align, VT));
10811  Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
10812  }
10813 
10814  SDValue Ops1[2] = { SP, Chain };
10815  return DAG.getMergeValues(Ops1, 2, dl);
10816  }
10817 }
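// Rough shape of what the probe helper (_alloca / __chkstk) does for a large
// allocation, as illustrative pseudo-C (not part of this file):
//
//   for (char *p = sp - 4096; p > sp - size; p -= 4096)
//     *(volatile char *)p = 0;   // touch each page so guard pages grow in order
//   sp -= size;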
10818 
10819 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
10820  MachineFunction &MF = DAG.getMachineFunction();
10821  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
10822 
10823  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10824  SDLoc DL(Op);
10825 
10826  if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
10827  // vastart just stores the address of the VarArgsFrameIndex slot into the
10828  // memory location argument.
10829  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
10830  getPointerTy());
10831  return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10832  MachinePointerInfo(SV), false, false, 0);
10833  }
10834 
10835  // __va_list_tag:
10836  // gp_offset (0 - 6 * 8)
10837  // fp_offset (48 - 48 + 8 * 16)
10838  // overflow_arg_area (points to parameters passed in memory).
10839  // reg_save_area
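// Equivalently, the AMD64 ABI defines the element type of va_list as:
//
//   typedef struct {
//     unsigned int gp_offset;    // byte offset 0
//     unsigned int fp_offset;    // byte offset 4
//     void *overflow_arg_area;   // byte offset 8
//     void *reg_save_area;       // byte offset 16
//   } __va_list_tag;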
10840  SmallVector<SDValue, 8> MemOps;
10841  SDValue FIN = Op.getOperand(1);
10842  // Store gp_offset
10843  SDValue Store = DAG.getStore(Op.getOperand(0), DL,
10844  DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
10845  MVT::i32),
10846  FIN, MachinePointerInfo(SV), false, false, 0);
10847  MemOps.push_back(Store);
10848 
10849  // Store fp_offset
10850  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
10851  FIN, DAG.getIntPtrConstant(4));
10852  Store = DAG.getStore(Op.getOperand(0), DL,
10853  DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
10854  MVT::i32),
10855  FIN, MachinePointerInfo(SV, 4), false, false, 0);
10856  MemOps.push_back(Store);
10857 
10858  // Store ptr to overflow_arg_area
10859  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
10860  FIN, DAG.getIntPtrConstant(4));
10861  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
10862  getPointerTy());
10863  Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
10864  MachinePointerInfo(SV, 8),
10865  false, false, 0);
10866  MemOps.push_back(Store);
10867 
10868  // Store ptr to reg_save_area.
10869  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
10870  FIN, DAG.getIntPtrConstant(8));
10871  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
10872  getPointerTy());
10873  Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
10874  MachinePointerInfo(SV, 16), false, false, 0);
10875  MemOps.push_back(Store);
10876  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
10877  &MemOps[0], MemOps.size());
10878 }
10879 
10880 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
10881  assert(Subtarget->is64Bit() &&
10882  "LowerVAARG only handles 64-bit va_arg!");
10883  assert((Subtarget->isTargetLinux() ||
10884  Subtarget->isTargetDarwin()) &&
10885  "Unhandled target in LowerVAARG");
10886  assert(Op.getNode()->getNumOperands() == 4);
10887  SDValue Chain = Op.getOperand(0);
10888  SDValue SrcPtr = Op.getOperand(1);
10889  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10890  unsigned Align = Op.getConstantOperandVal(3);
10891  SDLoc dl(Op);
10892 
10893  EVT ArgVT = Op.getNode()->getValueType(0);
10894  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
10895  uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
10896  uint8_t ArgMode;
10897 
10898  // Decide which area this value should be read from.
10899  // TODO: Implement the AMD64 ABI in its entirety. This simple
10900  // selection mechanism works only for the basic types.
10901  if (ArgVT == MVT::f80) {
10902  llvm_unreachable("va_arg for f80 not yet implemented");
10903  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
10904  ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
10905  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
10906  ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
10907  } else {
10908  llvm_unreachable("Unhandled argument type in LowerVAARG");
10909  }
10910 
10911  if (ArgMode == 2) {
10912  // Sanity Check: Make sure using fp_offset makes sense.
10913  assert(!getTargetMachine().Options.UseSoftFloat &&
10914  !(DAG.getMachineFunction()
10915  .getFunction()->getAttributes()
10916  .hasAttribute(AttributeSet::FunctionIndex,
10917  Attribute::NoImplicitFloat)) &&
10918  Subtarget->hasSSE1());
10919  }
10920 
10921  // Insert VAARG_64 node into the DAG
10922  // VAARG_64 returns two values: Variable Argument Address, Chain
10923  SmallVector<SDValue, 11> InstOps;
10924  InstOps.push_back(Chain);
10925  InstOps.push_back(SrcPtr);
10926  InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
10927  InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
10928  InstOps.push_back(DAG.getConstant(Align, MVT::i32));
10929  SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
10930  SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
10931  VTs, &InstOps[0], InstOps.size(),
10932  MVT::i64,
10933  MachinePointerInfo(SV),
10934  /*Align=*/0,
10935  /*Volatile=*/false,
10936  /*ReadMem=*/true,
10937  /*WriteMem=*/true);
10938  Chain = VAARG.getValue(1);
10939 
10940  // Load the next argument and return it
10941  return DAG.getLoad(ArgVT, dl,
10942  Chain,
10943  VAARG,
10944  MachinePointerInfo(),
10945  false, false, false, 0);
10946 }
10947 
10948 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
10949  SelectionDAG &DAG) {
10950  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
10951  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
10952  SDValue Chain = Op.getOperand(0);
10953  SDValue DstPtr = Op.getOperand(1);
10954  SDValue SrcPtr = Op.getOperand(2);
10955  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
10956  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
10957  SDLoc DL(Op);
10958 
10959  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
10960  DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
10961  false,
10962  MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
10963 }
10964 
10965 // getTargetVShiftByConstNode - Handle vector element shifts where the shift
10966 // amount is a constant. Takes immediate version of shift as input.
10967 static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, EVT VT,
10968  SDValue SrcOp, uint64_t ShiftAmt,
10969  SelectionDAG &DAG) {
10970 
10971  // Check for ShiftAmt >= element width
10972  if (ShiftAmt >= VT.getVectorElementType().getSizeInBits()) {
10973  if (Opc == X86ISD::VSRAI)
10974  ShiftAmt = VT.getVectorElementType().getSizeInBits() - 1;
10975  else
10976  return DAG.getConstant(0, VT);
10977  }
10978 
10979  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
10980  && "Unknown target vector shift-by-constant node");
10981 
10982  return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
10983 }
10984 
10985 // getTargetVShiftNode - Handle vector element shifts where the shift amount
10986 // may or may not be a constant. Takes immediate version of shift as input.
10987 static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, EVT VT,
10988  SDValue SrcOp, SDValue ShAmt,
10989  SelectionDAG &DAG) {
10990  assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
10991 
10992  // Catch shift-by-constant.
10993  if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
10994  return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
10995  CShAmt->getZExtValue(), DAG);
10996 
10997  // Change opcode to non-immediate version
10998  switch (Opc) {
10999  default: llvm_unreachable("Unknown target vector shift node");
11000  case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
11001  case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
11002  case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
11003  }
11004 
11005  // Need to build a vector containing the shift amount.
11006  // The shift amount is 32 bits, but SSE instructions read 64 bits, so fill with 0.
11007  SDValue ShOps[4];
11008  ShOps[0] = ShAmt;
11009  ShOps[1] = DAG.getConstant(0, MVT::i32);
11010  ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32);
11011  ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
11012 
11013  // The return type has to be a 128-bit type with the same element
11014  // type as the input type.
11015  MVT EltVT = VT.getVectorElementType().getSimpleVT();
11016  EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
11017 
11018  ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
11019  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
11020 }
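// Intrinsics-level sketch of the same widening trick (hypothetical wrapper):
// the scalar amount is placed in the low 32 bits of an XMM register with the
// rest zeroed, since the variable-shift instructions read the low 64 bits.
//
//   __m128i shl_v4i32(__m128i v, int amt) {
//     __m128i amt128 = _mm_cvtsi32_si128(amt);   // {amt, 0, 0, 0}
//     return _mm_sll_epi32(v, amt128);           // pslld with a register amount
//   }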
11021 
11022 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
11023  SDLoc dl(Op);
11024  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
11025  switch (IntNo) {
11026  default: return SDValue(); // Don't custom lower most intrinsics.
11027  // Comparison intrinsics.
11052  unsigned Opc;
11053  ISD::CondCode CC;
11054  switch (IntNo) {
11055  default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
11058  Opc = X86ISD::COMI;
11059  CC = ISD::SETEQ;
11060  break;
11063  Opc = X86ISD::COMI;
11064  CC = ISD::SETLT;
11065  break;
11068  Opc = X86ISD::COMI;
11069  CC = ISD::SETLE;
11070  break;
11073  Opc = X86ISD::COMI;
11074  CC = ISD::SETGT;
11075  break;
11078  Opc = X86ISD::COMI;
11079  CC = ISD::SETGE;
11080  break;
11083  Opc = X86ISD::COMI;
11084  CC = ISD::SETNE;
11085  break;
11088  Opc = X86ISD::UCOMI;
11089  CC = ISD::SETEQ;
11090  break;
11093  Opc = X86ISD::UCOMI;
11094  CC = ISD::SETLT;
11095  break;
11098  Opc = X86ISD::UCOMI;
11099  CC = ISD::SETLE;
11100  break;
11103  Opc = X86ISD::UCOMI;
11104  CC = ISD::SETGT;
11105  break;
11108  Opc = X86ISD::UCOMI;
11109  CC = ISD::SETGE;
11110  break;
11113  Opc = X86ISD::UCOMI;
11114  CC = ISD::SETNE;
11115  break;
11116  }
11117 
11118  SDValue LHS = Op.getOperand(1);
11119  SDValue RHS = Op.getOperand(2);
11120  unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
11121  assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
11122  SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
11123  SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
11124  DAG.getConstant(X86CC, MVT::i8), Cond);
11125  return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
11126  }
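// Illustrative codegen for one of these, e.g. _mm_comieq_ss(a, b)
// (hypothetical registers, Intel syntax):
//
//   comiss xmm0, xmm1
//   sete   al             ; X86ISD::SETCC on the translated condition
//   movzx  eax, al        ; ISD::ZERO_EXTEND to i32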
11127 
11128  // Arithmetic intrinsics.
11131  return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
11132  Op.getOperand(1), Op.getOperand(2));
11133 
11134  // SSE2/AVX2 sub with unsigned saturation intrinsics
11139  return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(),
11140  Op.getOperand(1), Op.getOperand(2));
11141 
11142  // SSE3/AVX horizontal add/sub intrinsics
11159  unsigned Opcode;
11160  switch (IntNo) {
11161  default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
11166  Opcode = X86ISD::FHADD;
11167  break;
11172  Opcode = X86ISD::FHSUB;
11173  break;
11178  Opcode = X86ISD::HADD;
11179  break;
11184  Opcode = X86ISD::HSUB;
11185  break;
11186  }
11187  return DAG.getNode(Opcode, dl, Op.getValueType(),
11188  Op.getOperand(1), Op.getOperand(2));
11189  }
11190 
11191  // SSE2/SSE41/AVX2 integer max/min intrinsics.
11224  unsigned Opcode;
11225  switch (IntNo) {
11226  default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
11235  Opcode = X86ISD::UMAX;
11236  break;
11245  Opcode = X86ISD::UMIN;
11246  break;
11255  Opcode = X86ISD::SMAX;
11256  break;
11265  Opcode = X86ISD::SMIN;
11266  break;
11267  }
11268  return DAG.getNode(Opcode, dl, Op.getValueType(),
11269  Op.getOperand(1), Op.getOperand(2));
11270  }
11271 
11272  // SSE/SSE2/AVX floating point max/min intrinsics.
11285  unsigned Opcode;
11286  switch (IntNo) {
11287  default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
11294  Opcode = X86ISD::FMAX;
11295  break;
11302  Opcode = X86ISD::FMIN;
11303  break;
11304  }
11305  return DAG.getNode(Opcode, dl, Op.getValueType(),
11306  Op.getOperand(1), Op.getOperand(2));
11307  }
11308 
11309  // AVX2 variable shift intrinsics
11320  unsigned Opcode;
11321  switch (IntNo) {
11322  default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
11327  Opcode = ISD::SHL;
11328  break;
11333  Opcode = ISD::SRL;
11334  break;
11337  Opcode = ISD::SRA;
11338  break;
11339  }
11340  return DAG.getNode(Opcode, dl, Op.getValueType(),
11341  Op.getOperand(1), Op.getOperand(2));
11342  }
11343 
11346  return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
11347  Op.getOperand(1), Op.getOperand(2));
11348 
11355  return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
11356  Op.getOperand(1), Op.getOperand(2));
11357 
11359  return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(),
11360  Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
11361 
11366  return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(),
11367  Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
11368 
11371  // Operands intentionally swapped. Mask is last operand to intrinsic,
11372  // but second operand for node/instruction.
11373  return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
11374  Op.getOperand(2), Op.getOperand(1));
11375 
11380  return DAG.getNode(ISD::FSQRT, dl, Op.getValueType(), Op.getOperand(1));
11381 
11382  // ptest and testp intrinsics. The intrinsics these come from are designed to
11383  // return an integer value, not just an instruction, so lower them to the ptest
11384  // or testp pattern and a setcc for the result.
11403  bool IsTestPacked = false;
11404  unsigned X86CC;
11405  switch (IntNo) {
11406  default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
11411  IsTestPacked = true; // Fallthrough
11414  // ZF = 1
11415  X86CC = X86::COND_E;
11416  break;
11421  IsTestPacked = true; // Fallthrough
11424  // CF = 1
11425  X86CC = X86::COND_B;
11426  break;
11431  IsTestPacked = true; // Fallthrough
11434  // ZF and CF = 0
11435  X86CC = X86::COND_A;
11436  break;
11437  }
11438 
11439  SDValue LHS = Op.getOperand(1);
11440  SDValue RHS = Op.getOperand(2);
11441  unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
11442  SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
11443  SDValue CC = DAG.getConstant(X86CC, MVT::i8);
11444  SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
11445  return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
11446  }
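// Illustrative codegen for _mm_testz_si128(a, b), one of the intrinsics
// handled above (hypothetical registers, Intel syntax):
//
//   ptest xmm0, xmm1      ; ZF = ((a & b) == 0)
//   sete  al              ; X86::COND_E
//   movzx eax, al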
11447  case Intrinsic::x86_avx512_kortestz:
11448  case Intrinsic::x86_avx512_kortestc: {
11449  unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz)? X86::COND_E: X86::COND_B;
11450  SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
11451  SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
11452  SDValue CC = DAG.getConstant(X86CC, MVT::i8);
11453  SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
11454  SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
11455  return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
11456  }
11457 
11458  // SSE/AVX shift intrinsics
11475  unsigned Opcode;
11476  switch (IntNo) {
11477  default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
11484  Opcode = X86ISD::VSHL;
11485  break;
11492  Opcode = X86ISD::VSRL;
11493  break;
11498  Opcode = X86ISD::VSRA;
11499  break;
11500  }
11501  return DAG.getNode(Opcode, dl, Op.getValueType(),
11502  Op.getOperand(1), Op.getOperand(2));
11503  }
11504 
11505  // SSE/AVX immediate shift intrinsics
11522  unsigned Opcode;
11523  switch (IntNo) {
11524  default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
11531  Opcode = X86ISD::VSHLI;
11532  break;
11539  Opcode = X86ISD::VSRLI;
11540  break;
11545  Opcode = X86ISD::VSRAI;
11546  break;
11547  }
11548  return getTargetVShiftNode(Opcode, dl, Op.getValueType(),
11549  Op.getOperand(1), Op.getOperand(2), DAG);
11550  }
11551 
11562  unsigned Opcode;
11563  unsigned X86CC;
11564  switch (IntNo) {
11565  default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
11567  Opcode = X86ISD::PCMPISTRI;
11568  X86CC = X86::COND_A;
11569  break;
11571  Opcode = X86ISD::PCMPESTRI;
11572  X86CC = X86::COND_A;
11573  break;
11575  Opcode = X86ISD::PCMPISTRI;
11576  X86CC = X86::COND_B;
11577  break;
11579  Opcode = X86ISD::PCMPESTRI;
11580  X86CC = X86::COND_B;
11581  break;
11583  Opcode = X86ISD::PCMPISTRI;
11584  X86CC = X86::COND_O;
11585  break;
11587  Opcode = X86ISD::PCMPESTRI;
11588  X86CC = X86::COND_O;
11589  break;
11591  Opcode = X86ISD::PCMPISTRI;
11592  X86CC = X86::COND_S;
11593  break;
11595  Opcode = X86ISD::PCMPESTRI;
11596  X86CC = X86::COND_S;
11597  break;
11599  Opcode = X86ISD::PCMPISTRI;
11600  X86CC = X86::COND_E;
11601  break;
11603  Opcode = X86ISD::PCMPESTRI;
11604  X86CC = X86::COND_E;
11605  break;
11606  }
11607  SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
11608  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
11609  SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
11610  SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
11611  DAG.getConstant(X86CC, MVT::i8),
11612  SDValue(PCMP.getNode(), 1));
11613  return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
11614  }
11615 
11618  unsigned Opcode;
11619  if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
11620  Opcode = X86ISD::PCMPISTRI;
11621  else
11622  Opcode = X86ISD::PCMPESTRI;
11623 
11624  SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
11625  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
11626  return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
11627  }
11664  unsigned Opc;
11665  switch (IntNo) {
11666  default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
11673  Opc = X86ISD::FMADD;
11674  break;
11681  Opc = X86ISD::FMSUB;
11682  break;
11689  Opc = X86ISD::FNMADD;
11690  break;
11697  Opc = X86ISD::FNMSUB;
11698  break;
11705  Opc = X86ISD::FMADDSUB;
11706  break;
11713  Opc = X86ISD::FMSUBADD;
11714  break;
11715  }
11716 
11717  return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
11718  Op.getOperand(2), Op.getOperand(3));
11719  }
11720  }
11721 }
11722 
11723 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
11724  SDValue Base, SDValue Index,
11725  SDValue ScaleOp, SDValue Chain,
11726  const X86Subtarget * Subtarget) {
11727  SDLoc dl(Op);
11728  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
11729  assert(C && "Invalid scale type");
11730  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
11731  SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
11732  EVT MaskVT = MVT::getVectorVT(MVT::i1,
11733  Index.getSimpleValueType().getVectorNumElements());
11734  SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
11735  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
11736  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
11737  SDValue Segment = DAG.getRegister(0, MVT::i32);
11738  SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
11739  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
11740  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
11741  return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl);
11742 }
11743 
11744 static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
11745  SDValue Src, SDValue Mask, SDValue Base,
11746  SDValue Index, SDValue ScaleOp, SDValue Chain,
11747  const X86Subtarget * Subtarget) {
11748  SDLoc dl(Op);
11749  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
11750  assert(C && "Invalid scale type");
11751  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
11752  EVT MaskVT = MVT::getVectorVT(MVT::i1,
11753  Index.getSimpleValueType().getVectorNumElements());
11754  SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
11755  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
11756  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
11757  SDValue Segment = DAG.getRegister(0, MVT::i32);
11758  if (Src.getOpcode() == ISD::UNDEF)
11759  Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
11760  SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
11761  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
11762  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
11763  return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl);
11764 }
11765 
11766 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
11767  SDValue Src, SDValue Base, SDValue Index,
11768  SDValue ScaleOp, SDValue Chain) {
11769  SDLoc dl(Op);
11770  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
11771  assert(C && "Invalid scale type");
11772  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
11773  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
11774  SDValue Segment = DAG.getRegister(0, MVT::i32);
11775  EVT MaskVT = MVT::getVectorVT(MVT::i1,
11776  Index.getSimpleValueType().getVectorNumElements());
11777  SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
11778  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
11779  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
11780  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
11781  return SDValue(Res, 1);
11782 }
11783 
11784 static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
11785  SDValue Src, SDValue Mask, SDValue Base,
11786  SDValue Index, SDValue ScaleOp, SDValue Chain) {
11787  SDLoc dl(Op);
11788  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
11789  assert(C && "Invalid scale type");
11790  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
11791  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
11792  SDValue Segment = DAG.getRegister(0, MVT::i32);
11793  EVT MaskVT = MVT::getVectorVT(MVT::i1,
11794  Index.getSimpleValueType().getVectorNumElements());
11795  SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
11796  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
11797  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
11798  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
11799  return SDValue(Res, 1);
11800 }
11801 
11802 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
11803  SelectionDAG &DAG) {
11804  SDLoc dl(Op);
11805  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
11806  switch (IntNo) {
11807  default: return SDValue(); // Don't custom lower most intrinsics.
11808 
11809  // RDRAND/RDSEED intrinsics.
11810  case Intrinsic::x86_rdrand_16:
11811  case Intrinsic::x86_rdrand_32:
11812  case Intrinsic::x86_rdrand_64:
11813  case Intrinsic::x86_rdseed_16:
11814  case Intrinsic::x86_rdseed_32:
11815  case Intrinsic::x86_rdseed_64: {
11816  unsigned Opcode = (IntNo == Intrinsic::x86_rdseed_16 ||
11817  IntNo == Intrinsic::x86_rdseed_32 ||
11818  IntNo == Intrinsic::x86_rdseed_64) ? X86ISD::RDSEED :
11819  X86ISD::RDRAND;
11820  // Emit the node with the right value type.
11821  SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
11822  SDValue Result = DAG.getNode(Opcode, dl, VTs, Op.getOperand(0));
11823 
11824  // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
11825  // Otherwise return the value from Rand, which is always 0, cast to i32.
11826  SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
11827  DAG.getConstant(1, Op->getValueType(1)),
11828  DAG.getConstant(X86::COND_B, MVT::i8),
11829  SDValue(Result.getNode(), 1) };
11830  SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
11831  DAG.getVTList(Op->getValueType(1), MVT::Glue),
11832  Ops, array_lengthof(Ops));
11833 
11834  // Return { result, isValid, chain }.
11835  return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
11836  SDValue(Result.getNode(), 2));
11837  }
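// C-level usage sketch of the intrinsic this lowers (illustrative only):
//
//   unsigned int v;
//   int ok = _rdrand32_step(&v);  // ok == 1 iff the hardware set CF,
//                                 // i.e. iff v holds a valid random value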
11838  //int_gather(index, base, scale);
11847  unsigned Opc;
11848  switch (IntNo) {
11849  default: llvm_unreachable("Unexpected intrinsic!");
11850  case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break;
11851  case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break;
11852  case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break;
11853  case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break;
11854  case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break;
11855  case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break;
11856  case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break;
11857  case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break;
11858  }
11859  SDValue Chain = Op.getOperand(0);
11860  SDValue Index = Op.getOperand(2);
11861  SDValue Base = Op.getOperand(3);
11862  SDValue Scale = Op.getOperand(4);
11863  return getGatherNode(Opc, Op, DAG, Base, Index, Scale, Chain, Subtarget);
11864  }
11865  //int_gather_mask(v1, mask, index, base, scale);
11874  unsigned Opc;
11875  switch (IntNo) {
11876  default: llvm_unreachable("Unexpected intrinsic!");
11878  Opc = X86::VGATHERQPSZrm; break;
11880  Opc = X86::VGATHERQPDZrm; break;
11882  Opc = X86::VGATHERDPDZrm; break;
11884  Opc = X86::VGATHERDPSZrm; break;
11886  Opc = X86::VPGATHERQDZrm; break;
11888  Opc = X86::VPGATHERQQZrm; break;
11890  Opc = X86::VPGATHERDDZrm; break;
11892  Opc = X86::VPGATHERDQZrm; break;
11893  }
11894  SDValue Chain = Op.getOperand(0);
11895  SDValue Src = Op.getOperand(2);
11896  SDValue Mask = Op.getOperand(3);
11897  SDValue Index = Op.getOperand(4);
11898  SDValue Base = Op.getOperand(5);
11899  SDValue Scale = Op.getOperand(6);
11900  return getMGatherNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
11901  Subtarget);
11902  }
11903  //int_scatter(base, index, v1, scale);
11912  unsigned Opc;
11913  switch (IntNo) {
11914  default: llvm_unreachable("Unexpected intrinsic!");
11916  Opc = X86::VSCATTERQPDZmr; break;
11918  Opc = X86::VSCATTERQPSZmr; break;
11920  Opc = X86::VSCATTERDPDZmr; break;
11922  Opc = X86::VSCATTERDPSZmr; break;
11924  Opc = X86::VPSCATTERQDZmr; break;
11926  Opc = X86::VPSCATTERQQZmr; break;
11928  Opc = X86::VPSCATTERDQZmr; break;
11930  Opc = X86::VPSCATTERDDZmr; break;
11931  }
11932  SDValue Chain = Op.getOperand(0);
11933  SDValue Base = Op.getOperand(2);
11934  SDValue Index = Op.getOperand(3);
11935  SDValue Src = Op.getOperand(4);
11936  SDValue Scale = Op.getOperand(5);
11937  return getScatterNode(Opc, Op, DAG, Src, Base, Index, Scale, Chain);
11938  }
11939  //int_scatter_mask(base, mask, index, v1, scale);
11948  unsigned Opc;
11949  switch (IntNo) {
11950  default: llvm_unreachable("Unexpected intrinsic!");
11952  Opc = X86::VSCATTERQPDZmr; break;
11954  Opc = X86::VSCATTERQPSZmr; break;
11956  Opc = X86::VSCATTERDPDZmr; break;
11958  Opc = X86::VSCATTERDPSZmr; break;
11960  Opc = X86::VPSCATTERQDZmr; break;
11962  Opc = X86::VPSCATTERQQZmr; break;
11964  Opc = X86::VPSCATTERDQZmr; break;
11966  Opc = X86::VPSCATTERDDZmr; break;
11967  }
11968  SDValue Chain = Op.getOperand(0);
11969  SDValue Base = Op.getOperand(2);
11970  SDValue Mask = Op.getOperand(3);
11971  SDValue Index = Op.getOperand(4);
11972  SDValue Src = Op.getOperand(5);
11973  SDValue Scale = Op.getOperand(6);
11974  return getMScatterNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
11975  }
11976  // XTEST intrinsics.
11977  case Intrinsic::x86_xtest: {
11978  SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
11979  SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0));
11980  SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
11981  DAG.getConstant(X86::COND_NE, MVT::i8),
11982  InTrans);
11983  SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
11984  return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
11985  Ret, SDValue(InTrans.getNode(), 1));
11986  }
11987  }
11988 }
11989 
11990 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
11991  SelectionDAG &DAG) const {
11992  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
11993  MFI->setReturnAddressIsTaken(true);
11994 
11995  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
11996  SDLoc dl(Op);
11997  EVT PtrVT = getPointerTy();
11998 
11999  if (Depth > 0) {
12000  SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
12001  const X86RegisterInfo *RegInfo =
12002  static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
12003  SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
12004  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
12005  DAG.getNode(ISD::ADD, dl, PtrVT,
12006  FrameAddr, Offset),
12007  MachinePointerInfo(), false, false, false, 0);
12008  }
12009 
12010  // Just load the return address.
12011  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
12012  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
12013  RetAddrFI, MachinePointerInfo(), false, false, false, 0);
12014 }
12015 
12016 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
12017  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
12018  MFI->setFrameAddressIsTaken(true);
12019 
12020  EVT VT = Op.getValueType();
12021  SDLoc dl(Op); // FIXME probably not meaningful
12022  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
12023  const X86RegisterInfo *RegInfo =
12024  static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
12025  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
12026  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
12027  (FrameReg == X86::EBP && VT == MVT::i32)) &&
12028  "Invalid Frame Register!");
12029  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
12030  while (Depth--)
12031  FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
12032  MachinePointerInfo(),
12033  false, false, false, 0);
12034  return FrameAddr;
12035 }
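// The loop above mirrors the usual x86 computation of
// __builtin_frame_address(depth): each frame stores its caller's frame
// pointer at offset 0. Illustrative C sketch (names hypothetical):
//
//   void *fp = current_frame_pointer;
//   while (depth--)
//     fp = *(void **)fp;          // load the saved RBP/EBP of the parent frame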
12036 
12037 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
12038  SelectionDAG &DAG) const {
12039  const X86RegisterInfo *RegInfo =
12040  static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
12041  return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
12042 }
12043 
12044 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
12045  SDValue Chain = Op.getOperand(0);
12046  SDValue Offset = Op.getOperand(1);
12047  SDValue Handler = Op.getOperand(2);
12048  SDLoc dl (Op);
12049 
12050  EVT PtrVT = getPointerTy();
12051  const X86RegisterInfo *RegInfo =
12052  static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
12053  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
12054  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
12055  (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
12056  "Invalid Frame Register!");
12057  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
12058  unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
12059 
12060  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
12061  DAG.getIntPtrConstant(RegInfo->getSlotSize()));
12062  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
12063  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
12064  false, false, 0);
12065  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
12066 
12067  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
12068  DAG.getRegister(StoreAddrReg, PtrVT));
12069 }
12070 
12071 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
12072  SelectionDAG &DAG) const {
12073  SDLoc DL(Op);
12074  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
12075  DAG.getVTList(MVT::i32, MVT::Other),
12076  Op.getOperand(0), Op.getOperand(1));
12077 }
12078 
12079 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
12080  SelectionDAG &DAG) const {
12081  SDLoc DL(Op);
12082  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
12083  Op.getOperand(0), Op.getOperand(1));
12084 }
12085 
12086 SDValue X86TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
12087  return Op.getOperand(0);
12088 }
12089 
12090 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
12091  SelectionDAG &DAG) const {
12092  SDValue Root = Op.getOperand(0);
12093  SDValue Trmp = Op.getOperand(1); // trampoline
12094  SDValue FPtr = Op.getOperand(2); // nested function
12095  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
12096  SDLoc dl (Op);
12097 
12098  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
12099  const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
12100 
12101  if (Subtarget->is64Bit()) {
12102  SDValue OutChains[6];
12103 
12104  // Large code-model.
12105  const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
12106  const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
12107 
12108  const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
12109  const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
12110 
12111  const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
12112 
12113  // Load the pointer to the nested function into R11.
12114  unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
12115  SDValue Addr = Trmp;
12116  OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
12117  Addr, MachinePointerInfo(TrmpAddr),
12118  false, false, 0);
12119 
12120  Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
12121  DAG.getConstant(2, MVT::i64));
12122  OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
12123  MachinePointerInfo(TrmpAddr, 2),
12124  false, false, 2);
12125 
12126  // Load the 'nest' parameter value into R10.
12127  // R10 is specified in X86CallingConv.td
12128  OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
12129  Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
12130  DAG.getConstant(10, MVT::i64));
12131  OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
12132  Addr, MachinePointerInfo(TrmpAddr, 10),
12133  false, false, 0);
12134 
12135  Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
12136  DAG.getConstant(12, MVT::i64));
12137  OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
12138  MachinePointerInfo(TrmpAddr, 12),
12139  false, false, 2);
12140 
12141  // Jump to the nested function.
12142  OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
12143  Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
12144  DAG.getConstant(20, MVT::i64));
12145  OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
12146  Addr, MachinePointerInfo(TrmpAddr, 20),
12147  false, false, 0);
12148 
12149  unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
12150  Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
12151  DAG.getConstant(22, MVT::i64));
12152  OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
12153  MachinePointerInfo(TrmpAddr, 22),
12154  false, false, 0);
12155 
12156  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6);
12157  } else {
12158  const Function *Func =
12159  cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
12160  CallingConv::ID CC = Func->getCallingConv();
12161  unsigned NestReg;
12162 
12163  switch (CC) {
12164  default:
12165  llvm_unreachable("Unsupported calling convention");
12166  case CallingConv::C:
12167  case CallingConv::X86_StdCall: {
12168  // Pass 'nest' parameter in ECX.
12169  // Must be kept in sync with X86CallingConv.td
12170  NestReg = X86::ECX;
12171 
12172  // Check that ECX wasn't needed by an 'inreg' parameter.
12173  FunctionType *FTy = Func->getFunctionType();
12174  const AttributeSet &Attrs = Func->getAttributes();
12175 
12176  if (!Attrs.isEmpty() && !Func->isVarArg()) {
12177  unsigned InRegCount = 0;
12178  unsigned Idx = 1;
12179 
12180  for (FunctionType::param_iterator I = FTy->param_begin(),
12181  E = FTy->param_end(); I != E; ++I, ++Idx)
12182  if (Attrs.hasAttribute(Idx, Attribute::InReg))
12183  // FIXME: should only count parameters that are lowered to integers.
12184  InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
12185 
12186  if (InRegCount > 2) {
12187  report_fatal_error("Nest register in use - reduce number of inreg"
12188  " parameters!");
12189  }
12190  }
12191  break;
12192  }
12193  case CallingConv::X86_FastCall:
12194  case CallingConv::X86_ThisCall:
12195  case CallingConv::Fast:
12196  // Pass 'nest' parameter in EAX.
12197  // Must be kept in sync with X86CallingConv.td
12198  NestReg = X86::EAX;
12199  break;
12200  }
12201 
12202  SDValue OutChains[4];
12203  SDValue Addr, Disp;
12204 
12205  Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
12206  DAG.getConstant(10, MVT::i32));
12207  Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
12208 
12209  // This is storing the opcode for MOV32ri.
12210  const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
12211  const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
12212  OutChains[0] = DAG.getStore(Root, dl,
12213  DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
12214  Trmp, MachinePointerInfo(TrmpAddr),
12215  false, false, 0);
12216 
12217  Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
12218  DAG.getConstant(1, MVT::i32));
12219  OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
12220  MachinePointerInfo(TrmpAddr, 1),
12221  false, false, 1);
12222 
12223  const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
12224  Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
12225  DAG.getConstant(5, MVT::i32));
12226  OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
12227  MachinePointerInfo(TrmpAddr, 5),
12228  false, false, 1);
12229 
12230  Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
12231  DAG.getConstant(6, MVT::i32));
12232  OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
12233  MachinePointerInfo(TrmpAddr, 6),
12234  false, false, 1);
12235 
12236  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4);
12237  }
12238 }
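// Editorial note (not part of the original file): the stores in the 64-bit
// path above lay down this 23-byte trampoline (offsets are the store offsets;
// REX_WB == 0x49, MOV64ri|r11 == 0xBB, MOV64ri|r10 == 0xBA, ModRM == 0xE3,
// and the i16 opcode constants are stored little-endian):
//
//    0: 49 BB <FPtr, 8 bytes>    movabs r11, <nested function>
//   10: 49 BA <Nest, 8 bytes>    movabs r10, <'nest' value>
//   20: 49 FF E3                 jmpq   *r11
//
// The 32-bit path emits "mov ecx/eax, <Nest>; jmp rel32" instead, where the
// rel32 written at offset 6 is FPtr - (Trmp + 10), i.e. relative to the end
// of the 10-byte trampoline.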
12239 
12240 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
12241  SelectionDAG &DAG) const {
12242  /*
12243  The rounding mode is in bits 11:10 of FPSR, and has the following
12244  settings:
12245  00 Round to nearest
12246  01 Round to -inf
12247  10 Round to +inf
12248  11 Round to 0
12249 
12250  FLT_ROUNDS, on the other hand, expects the following:
12251  -1 Undefined
12252  0 Round to 0
12253  1 Round to nearest
12254  2 Round to +inf
12255  3 Round to -inf
12256 
12257  To perform the conversion, we do:
12258  (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
12259  */
12260 
12261  MachineFunction &MF = DAG.getMachineFunction();
12262  const TargetMachine &TM = MF.getTarget();
12263  const TargetFrameLowering &TFI = *TM.getFrameLowering();
12264  unsigned StackAlignment = TFI.getStackAlignment();
12265  EVT VT = Op.getValueType();
12266  SDLoc DL(Op);
12267 
12268  // Save FP Control Word to stack slot
12269  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
12270  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
12271 
12272  MachineMemOperand *MMO =
12273  MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
12274  MachineMemOperand::MOStore, 2, 2);
12275 
12276  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
12277  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
12278  DAG.getVTList(MVT::Other),
12279  Ops, array_lengthof(Ops), MVT::i16,
12280  MMO);
12281 
12282  // Load FP Control Word from stack slot
12283  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
12284  MachinePointerInfo(), false, false, false, 0);
12285 
12286  // Transform as necessary
12287  SDValue CWD1 =
12288  DAG.getNode(ISD::SRL, DL, MVT::i16,
12289  DAG.getNode(ISD::AND, DL, MVT::i16,
12290  CWD, DAG.getConstant(0x800, MVT::i16)),
12291  DAG.getConstant(11, MVT::i8));
12292  SDValue CWD2 =
12293  DAG.getNode(ISD::SRL, DL, MVT::i16,
12294  DAG.getNode(ISD::AND, DL, MVT::i16,
12295  CWD, DAG.getConstant(0x400, MVT::i16)),
12296  DAG.getConstant(9, MVT::i8));
12297 
12298  SDValue RetVal =
12299  DAG.getNode(ISD::AND, DL, MVT::i16,
12300  DAG.getNode(ISD::ADD, DL, MVT::i16,
12301  DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
12302  DAG.getConstant(1, MVT::i16)),
12303  DAG.getConstant(3, MVT::i16));
12304 
12305  return DAG.getNode((VT.getSizeInBits() < 16 ?
12306  ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
12307 }
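// Editorial sketch (not part of the original file): the scalar form of the
// control-word transformation built above, with all four inputs worked out.
static int fltRoundsFromFPControlWord(unsigned CWD) {
  return ((((CWD & 0x800) >> 11) | ((CWD & 0x400) >> 9)) + 1) & 3;
}
// RC bits 11:10 = 00 (nearest) -> 1, 01 (-inf) -> 3, 10 (+inf) -> 2,
// 11 (toward zero) -> 0, which is exactly the FLT_ROUNDS encoding listed in
// the comment at the top of LowerFLT_ROUNDS_.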
12308 
12309 static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
12310  EVT VT = Op.getValueType();
12311  EVT OpVT = VT;
12312  unsigned NumBits = VT.getSizeInBits();
12313  SDLoc dl(Op);
12314 
12315  Op = Op.getOperand(0);
12316  if (VT == MVT::i8) {
12317  // Zero extend to i32 since there is not an i8 bsr.
12318  OpVT = MVT::i32;
12319  Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
12320  }
12321 
12322  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
12323  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
12324  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
12325 
12326  // If src is zero (i.e. bsr sets ZF), returns NumBits.
12327  SDValue Ops[] = {
12328  Op,
12329  DAG.getConstant(NumBits+NumBits-1, OpVT),
12330  DAG.getConstant(X86::COND_E, MVT::i8),
12331  Op.getValue(1)
12332  };
12333  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
12334 
12335  // Finally xor with NumBits-1.
12336  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
12337 
12338  if (VT == MVT::i8)
12339  Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
12340  return Op;
12341 }
12342 
12343 static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
12344  EVT VT = Op.getValueType();
12345  EVT OpVT = VT;
12346  unsigned NumBits = VT.getSizeInBits();
12347  SDLoc dl(Op);
12348 
12349  Op = Op.getOperand(0);
12350  if (VT == MVT::i8) {
12351  // Zero extend to i32 since there is not an i8 bsr.
12352  OpVT = MVT::i32;
12353  Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
12354  }
12355 
12356  // Issue a bsr (scan bits in reverse).
12357  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
12358  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
12359 
12360  // And xor with NumBits-1.
12361  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
12362 
12363  if (VT == MVT::i8)
12364  Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
12365  return Op;
12366 }
12367 
12368 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
12369  EVT VT = Op.getValueType();
12370  unsigned NumBits = VT.getSizeInBits();
12371  SDLoc dl(Op);
12372  Op = Op.getOperand(0);
12373 
12374  // Issue a bsf (scan bits forward) which also sets EFLAGS.
12375  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
12376  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
12377 
12378  // If src is zero (i.e. bsf sets ZF), returns NumBits.
12379  SDValue Ops[] = {
12380  Op,
12381  DAG.getConstant(NumBits, VT),
12382  DAG.getConstant(X86::COND_E, MVT::i8),
12383  Op.getValue(1)
12384  };
12385  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops));
12386 }
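// Editorial sketch (not part of the original file): the scalar identities the
// BSR/BSF lowerings above rely on, written out for a 32-bit element.
static unsigned bsr32(unsigned X) {           // index of the highest set bit
  unsigned Idx = 0;
  while (X >>= 1) ++Idx;                      // caller guarantees X != 0
  return Idx;
}
static unsigned ctlz32(unsigned X) {
  // If X is zero the CMOV above substitutes NumBits+NumBits-1 (63), and
  // 63 ^ 31 == 32, the defined CTLZ result for zero input.
  return (X == 0 ? 63u : bsr32(X)) ^ 31u;     // == 31 - bsr(X) when X != 0
}
static unsigned cttz32(unsigned X) {
  // BSF already returns the trailing-zero count for X != 0, so LowerCTTZ only
  // needs the CMOV to substitute NumBits for zero input; there is no final XOR.
  if (X == 0) return 32;
  unsigned Idx = 0;
  while (!(X & 1)) { X >>= 1; ++Idx; }
  return Idx;
}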
12387 
12388 // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
12389 // ones, and then concatenate the result back.
12390 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
12391  EVT VT = Op.getValueType();
12392 
12393  assert(VT.is256BitVector() && VT.isInteger() &&
12394  "Unsupported value type for operation");
12395 
12396  unsigned NumElems = VT.getVectorNumElements();
12397  SDLoc dl(Op);
12398 
12399  // Extract the LHS vectors
12400  SDValue LHS = Op.getOperand(0);
12401  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
12402  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
12403 
12404  // Extract the RHS vectors
12405  SDValue RHS = Op.getOperand(1);
12406  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
12407  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
12408 
12409  MVT EltVT = VT.getVectorElementType().getSimpleVT();
12410  EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
12411 
12412  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
12413  DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
12414  DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
12415 }
12416 
12417 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
12418  assert(Op.getValueType().is256BitVector() &&
12419  Op.getValueType().isInteger() &&
12420  "Only handle AVX 256-bit vector integer operation");
12421  return Lower256IntArith(Op, DAG);
12422 }
12423 
12424 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
12425  assert(Op.getValueType().is256BitVector() &&
12426  Op.getValueType().isInteger() &&
12427  "Only handle AVX 256-bit vector integer operation");
12428  return Lower256IntArith(Op, DAG);
12429 }
12430 
12431 static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
12432  SelectionDAG &DAG) {
12433  SDLoc dl(Op);
12434  EVT VT = Op.getValueType();
12435 
12436  // Decompose 256-bit ops into smaller 128-bit ops.
12437  if (VT.is256BitVector() && !Subtarget->hasInt256())
12438  return Lower256IntArith(Op, DAG);
12439 
12440  SDValue A = Op.getOperand(0);
12441  SDValue B = Op.getOperand(1);
12442 
12443  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
12444  if (VT == MVT::v4i32) {
12445  assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
12446  "Should not custom lower when pmuldq is available!");
12447 
12448  // Extract the odd parts.
12449  static const int UnpackMask[] = { 1, -1, 3, -1 };
12450  SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
12451  SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
12452 
12453  // Multiply the even parts.
12454  SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
12455  // Now multiply odd parts.
12456  SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
12457 
12458  Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
12459  Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
12460 
12461  // Merge the two vectors back together with a shuffle. This expands into 2
12462  // shuffles.
12463  static const int ShufMask[] = { 0, 4, 2, 6 };
12464  return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
12465  }
12466 
12467  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
12468  "Only know how to lower V2I64/V4I64/V8I64 multiply");
12469 
12470  // Ahi = psrlqi(a, 32);
12471  // Bhi = psrlqi(b, 32);
12472  //
12473  // AloBlo = pmuludq(a, b);
12474  // AloBhi = pmuludq(a, Bhi);
12475  // AhiBlo = pmuludq(Ahi, b);
12476 
12477  // AloBhi = psllqi(AloBhi, 32);
12478  // AhiBlo = psllqi(AhiBlo, 32);
12479  // return AloBlo + AloBhi + AhiBlo;
12480 
12481  SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
12482  SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
12483 
12484  // Bit cast to 32-bit vectors for MULUDQ
12485  EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
12486  (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
12487  A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
12488  B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
12489  Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
12490  Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
12491 
12492  SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
12493  SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
12494  SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
12495 
12496  AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
12497  AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
12498 
12499  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
12500  return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
12501 }
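// Editorial sketch (not part of the original file): the 64x64-bit multiply
// decomposition LowerMUL builds above, written for one scalar lane.  PMULUDQ
// is a full 32x32->64 product of the low halves of each 64-bit lane, so three
// such products suffice modulo 2^64 (the hi*hi term shifts entirely out).
static unsigned long long mul64ViaPmuludq(unsigned long long A,
                                          unsigned long long B) {
  unsigned long long ALo = A & 0xffffffffULL, AHi = A >> 32;
  unsigned long long BLo = B & 0xffffffffULL, BHi = B >> 32;
  unsigned long long AloBlo = ALo * BLo;            // pmuludq(a, b)
  unsigned long long AloBhi = ALo * BHi;            // pmuludq(a, Bhi)
  unsigned long long AhiBlo = AHi * BLo;            // pmuludq(Ahi, b)
  return AloBlo + ((AloBhi + AhiBlo) << 32);        // == A * B (mod 2^64)
}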
12502 
12503 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
12504  EVT VT = Op.getValueType();
12505  EVT EltTy = VT.getVectorElementType();
12506  unsigned NumElts = VT.getVectorNumElements();
12507  SDValue N0 = Op.getOperand(0);
12508  SDLoc dl(Op);
12509 
12510  // Lower sdiv X, pow2-const.
12511  BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1));
12512  if (!C)
12513  return SDValue();
12514 
12515  APInt SplatValue, SplatUndef;
12516  unsigned SplatBitSize;
12517  bool HasAnyUndefs;
12518  if (!C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
12519  HasAnyUndefs) ||
12520  EltTy.getSizeInBits() < SplatBitSize)
12521  return SDValue();
12522 
12523  if ((SplatValue != 0) &&
12524  (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) {
12525  unsigned Lg2 = SplatValue.countTrailingZeros();
12526  // Splat the sign bit.
12527  SmallVector<SDValue, 16> Sz(NumElts,
12528  DAG.getConstant(EltTy.getSizeInBits() - 1,
12529  EltTy));
12530  SDValue SGN = DAG.getNode(ISD::SRA, dl, VT, N0,
12531  DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Sz[0],
12532  NumElts));
12533  // Add (N0 < 0) ? abs2 - 1 : 0;
12534  SmallVector<SDValue, 16> Amt(NumElts,
12535  DAG.getConstant(EltTy.getSizeInBits() - Lg2,
12536  EltTy));
12537  SDValue SRL = DAG.getNode(ISD::SRL, dl, VT, SGN,
12538  DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Amt[0],
12539  NumElts));
12540  SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL);
12541  SmallVector<SDValue, 16> Lg2Amt(NumElts, DAG.getConstant(Lg2, EltTy));
12542  SDValue SRA = DAG.getNode(ISD::SRA, dl, VT, ADD,
12543  DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Lg2Amt[0],
12544  NumElts));
12545 
12546  // If we're dividing by a positive value, we're done. Otherwise, we must
12547  // negate the result.
12548  if (SplatValue.isNonNegative())
12549  return SRA;
12550 
12551  SmallVector<SDValue, 16> V(NumElts, DAG.getConstant(0, EltTy));
12552  SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], NumElts);
12553  return DAG.getNode(ISD::SUB, dl, VT, Zero, SRA);
12554  }
12555  return SDValue();
12556 }
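// Editorial sketch (not part of the original file): the round-toward-zero
// signed division by +/-2^Lg2 that the SRA/SRL/ADD/SRA sequence above
// implements, shown for one 32-bit element.  Assumes 1 <= Lg2 <= 31 and the
// usual arithmetic behaviour of ">>" on signed values.
static int sdivByPow2(int N, unsigned Lg2, bool NegativeDivisor) {
  int Sign = N >> 31;                            // SGN: 0 or -1 (sign splat)
  unsigned Bias = (unsigned)Sign >> (32 - Lg2);  // SRL: 2^Lg2 - 1 iff N < 0
  int Quotient = (N + (int)Bias) >> Lg2;         // ADD + SRA: rounds toward 0
  return NegativeDivisor ? -Quotient : Quotient; // final SUB from zero if needed
}
// Example: sdivByPow2(-7, 2, false) == -1, matching -7 / 4 in C.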
12557 
12558 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
12559  const X86Subtarget *Subtarget) {
12560  EVT VT = Op.getValueType();
12561  SDLoc dl(Op);
12562  SDValue R = Op.getOperand(0);
12563  SDValue Amt = Op.getOperand(1);
12564 
12565  // Optimize shl/srl/sra with constant shift amount.
12566  if (isSplatVector(Amt.getNode())) {
12567  SDValue SclrAmt = Amt->getOperand(0);
12568  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
12569  uint64_t ShiftAmt = C->getZExtValue();
12570 
12571  if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
12572  (Subtarget->hasInt256() &&
12573  (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
12574  (Subtarget->hasAVX512() &&
12575  (VT == MVT::v8i64 || VT == MVT::v16i32))) {
12576  if (Op.getOpcode() == ISD::SHL)
12577  return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
12578  DAG);
12579  if (Op.getOpcode() == ISD::SRL)
12580  return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
12581  DAG);
12582  if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
12583  return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
12584  DAG);
12585  }
12586 
12587  if (VT == MVT::v16i8) {
12588  if (Op.getOpcode() == ISD::SHL) {
12589  // Make a large shift.
12590  SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
12591  MVT::v8i16, R, ShiftAmt,
12592  DAG);
12593  SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
12594  // Zero out the rightmost bits.
12595  SmallVector<SDValue, 16> V(16,
12596  DAG.getConstant(uint8_t(-1U << ShiftAmt),
12597  MVT::i8));
12598  return DAG.getNode(ISD::AND, dl, VT, SHL,
12599  DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
12600  }
12601  if (Op.getOpcode() == ISD::SRL) {
12602  // Make a large shift.
12603  SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
12604  MVT::v8i16, R, ShiftAmt,
12605  DAG);
12606  SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
12607  // Zero out the leftmost bits.
12608  SmallVector<SDValue, 16> V(16,
12609  DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
12610  MVT::i8));
12611  return DAG.getNode(ISD::AND, dl, VT, SRL,
12612  DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
12613  }
12614  if (Op.getOpcode() == ISD::SRA) {
12615  if (ShiftAmt == 7) {
12616  // R s>> 7 === R s< 0
12617  SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
12618  return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
12619  }
12620 
12621  // R s>> a === ((R u>> a) ^ m) - m
12622  SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
12623  SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
12624  MVT::i8));
12625  SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16);
12626  Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
12627  Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
12628  return Res;
12629  }
12630  llvm_unreachable("Unknown shift opcode.");
12631  }
12632 
12633  if (Subtarget->hasInt256() && VT == MVT::v32i8) {
12634  if (Op.getOpcode() == ISD::SHL) {
12635  // Make a large shift.
12636  SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
12637  MVT::v16i16, R, ShiftAmt,
12638  DAG);
12639  SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
12640  // Zero out the rightmost bits.
12641  SmallVector<SDValue, 32> V(32,
12642  DAG.getConstant(uint8_t(-1U << ShiftAmt),
12643  MVT::i8));
12644  return DAG.getNode(ISD::AND, dl, VT, SHL,
12645  DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
12646  }
12647  if (Op.getOpcode() == ISD::SRL) {
12648  // Make a large shift.
12649  SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
12650  MVT::v16i16, R, ShiftAmt,
12651  DAG);
12652  SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
12653  // Zero out the leftmost bits.
12654  SmallVector<SDValue, 32> V(32,
12655  DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
12656  MVT::i8));
12657  return DAG.getNode(ISD::AND, dl, VT, SRL,
12658  DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
12659  }
12660  if (Op.getOpcode() == ISD::SRA) {
12661  if (ShiftAmt == 7) {
12662  // R s>> 7 === R s< 0
12663  SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
12664  return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
12665  }
12666 
12667  // R s>> a === ((R u>> a) ^ m) - m
12668  SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
12669  SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
12670  MVT::i8));
12671  SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32);
12672  Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
12673  Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
12674  return Res;
12675  }
12676  llvm_unreachable("Unknown shift opcode.");
12677  }
12678  }
12679  }
12680 
12681  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
12682  if (!Subtarget->is64Bit() &&
12683  (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
12684  Amt.getOpcode() == ISD::BITCAST &&
12685  Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
12686  Amt = Amt.getOperand(0);
12687  unsigned Ratio = Amt.getValueType().getVectorNumElements() /
12688  VT.getVectorNumElements();
12689  unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
12690  uint64_t ShiftAmt = 0;
12691  for (unsigned i = 0; i != Ratio; ++i) {
12692  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
12693  if (C == 0)
12694  return SDValue();
12695  // 6 == Log2(64)
12696  ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
12697  }
12698  // Check remaining shift amounts.
12699  for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
12700  uint64_t ShAmt = 0;
12701  for (unsigned j = 0; j != Ratio; ++j) {
12702  ConstantSDNode *C =
12703  dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
12704  if (C == 0)
12705  return SDValue();
12706  // 6 == Log2(64)
12707  ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
12708  }
12709  if (ShAmt != ShiftAmt)
12710  return SDValue();
12711  }
12712  switch (Op.getOpcode()) {
12713  default:
12714  llvm_unreachable("Unknown shift opcode!");
12715  case ISD::SHL:
12716  return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
12717  DAG);
12718  case ISD::SRL:
12719  return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
12720  DAG);
12721  case ISD::SRA:
12722  return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
12723  DAG);
12724  }
12725  }
12726 
12727  return SDValue();
12728 }
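// Editorial sketch (not part of the original file): the "((R u>> a) ^ m) - m"
// identity used above to fake an arithmetic right shift of bytes, since SSE
// has no 8-bit psra.  m is where the sign bit lands after the logical shift;
// XOR-then-subtract sign-extends it back across the vacated high bits.
static signed char sra8(signed char R, unsigned A) {  // 0 <= A < 8
  unsigned char U = (unsigned char)R >> A;             // logical shift
  unsigned char M = 0x80u >> A;                        // the shifted sign bit
  return (signed char)((U ^ M) - M);                   // == R >> A, arithmetic
}
// Example: sra8(-128, 1) == -64 and sra8(64, 2) == 16.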
12729 
12730 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
12731  const X86Subtarget* Subtarget) {
12732  EVT VT = Op.getValueType();
12733  SDLoc dl(Op);
12734  SDValue R = Op.getOperand(0);
12735  SDValue Amt = Op.getOperand(1);
12736 
12737  if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
12738  VT == MVT::v4i32 || VT == MVT::v8i16 ||
12739  (Subtarget->hasInt256() &&
12740  ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
12741  VT == MVT::v8i32 || VT == MVT::v16i16)) ||
12742  (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
12743  SDValue BaseShAmt;
12744  EVT EltVT = VT.getVectorElementType();
12745 
12746  if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
12747  unsigned NumElts = VT.getVectorNumElements();
12748  unsigned i, j;
12749  for (i = 0; i != NumElts; ++i) {
12750  if (Amt.getOperand(i).getOpcode() == ISD::UNDEF)
12751  continue;
12752  break;
12753  }
12754  for (j = i; j != NumElts; ++j) {
12755  SDValue Arg = Amt.getOperand(j);
12756  if (Arg.getOpcode() == ISD::UNDEF) continue;
12757  if (Arg != Amt.getOperand(i))
12758  break;
12759  }
12760  if (i != NumElts && j == NumElts)
12761  BaseShAmt = Amt.getOperand(i);
12762  } else {
12763  if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
12764  Amt = Amt.getOperand(0);
12765  if (Amt.getOpcode() == ISD::VECTOR_SHUFFLE &&
12766  cast<ShuffleVectorSDNode>(Amt)->isSplat()) {
12767  SDValue InVec = Amt.getOperand(0);
12768  if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
12769  unsigned NumElts = InVec.getValueType().getVectorNumElements();
12770  unsigned i = 0;
12771  for (; i != NumElts; ++i) {
12772  SDValue Arg = InVec.getOperand(i);
12773  if (Arg.getOpcode() == ISD::UNDEF) continue;
12774  BaseShAmt = Arg;
12775  break;
12776  }
12777  } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
12778  if (ConstantSDNode *C =
12779  dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
12780  unsigned SplatIdx =
12781  cast<ShuffleVectorSDNode>(Amt)->getSplatIndex();
12782  if (C->getZExtValue() == SplatIdx)
12783  BaseShAmt = InVec.getOperand(1);
12784  }
12785  }
12786  if (BaseShAmt.getNode() == 0)
12787  BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt,
12788  DAG.getIntPtrConstant(0));
12789  }
12790  }
12791 
12792  if (BaseShAmt.getNode()) {
12793  if (EltVT.bitsGT(MVT::i32))
12794  BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt);
12795  else if (EltVT.bitsLT(MVT::i32))
12796  BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
12797 
12798  switch (Op.getOpcode()) {
12799  default:
12800  llvm_unreachable("Unknown shift opcode!");
12801  case ISD::SHL:
12802  switch (VT.getSimpleVT().SimpleTy) {
12803  default: return SDValue();
12804  case MVT::v2i64:
12805  case MVT::v4i32:
12806  case MVT::v8i16:
12807  case MVT::v4i64:
12808  case MVT::v8i32:
12809  case MVT::v16i16:
12810  case MVT::v16i32:
12811  case MVT::v8i64:
12812  return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
12813  }
12814  case ISD::SRA:
12815  switch (VT.getSimpleVT().SimpleTy) {
12816  default: return SDValue();
12817  case MVT::v4i32:
12818  case MVT::v8i16:
12819  case MVT::v8i32:
12820  case MVT::v16i16:
12821  case MVT::v16i32:
12822  case MVT::v8i64:
12823  return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
12824  }
12825  case ISD::SRL:
12826  switch (VT.getSimpleVT().SimpleTy) {
12827  default: return SDValue();
12828  case MVT::v2i64:
12829  case MVT::v4i32:
12830  case MVT::v8i16:
12831  case MVT::v4i64:
12832  case MVT::v8i32:
12833  case MVT::v16i16:
12834  case MVT::v16i32:
12835  case MVT::v8i64:
12836  return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
12837  }
12838  }
12839  }
12840  }
12841 
12842  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
12843  if (!Subtarget->is64Bit() &&
12844  (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
12845  (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
12846  Amt.getOpcode() == ISD::BITCAST &&
12847  Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
12848  Amt = Amt.getOperand(0);
12849  unsigned Ratio = Amt.getValueType().getVectorNumElements() /
12850  VT.getVectorNumElements();
12851  std::vector<SDValue> Vals(Ratio);
12852  for (unsigned i = 0; i != Ratio; ++i)
12853  Vals[i] = Amt.getOperand(i);
12854  for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
12855  for (unsigned j = 0; j != Ratio; ++j)
12856  if (Vals[j] != Amt.getOperand(i + j))
12857  return SDValue();
12858  }
12859  switch (Op.getOpcode()) {
12860  default:
12861  llvm_unreachable("Unknown shift opcode!");
12862  case ISD::SHL:
12863  return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
12864  case ISD::SRL:
12865  return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
12866  case ISD::SRA:
12867  return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
12868  }
12869  }
12870 
12871  return SDValue();
12872 }
12873 
12874 static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
12875  SelectionDAG &DAG) {
12876 
12877  EVT VT = Op.getValueType();
12878  SDLoc dl(Op);
12879  SDValue R = Op.getOperand(0);
12880  SDValue Amt = Op.getOperand(1);
12881  SDValue V;
12882 
12883  if (!Subtarget->hasSSE2())
12884  return SDValue();
12885 
12886  V = LowerScalarImmediateShift(Op, DAG, Subtarget);
12887  if (V.getNode())
12888  return V;
12889 
12890  V = LowerScalarVariableShift(Op, DAG, Subtarget);
12891  if (V.getNode())
12892  return V;
12893 
12894  if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
12895  return Op;
12896  // AVX2 has VPSLLV/VPSRAV/VPSRLV.
12897  if (Subtarget->hasInt256()) {
12898  if (Op.getOpcode() == ISD::SRL &&
12899  (VT == MVT::v2i64 || VT == MVT::v4i32 ||
12900  VT == MVT::v4i64 || VT == MVT::v8i32))
12901  return Op;
12902  if (Op.getOpcode() == ISD::SHL &&
12903  (VT == MVT::v2i64 || VT == MVT::v4i32 ||
12904  VT == MVT::v4i64 || VT == MVT::v8i32))
12905  return Op;
12906  if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
12907  return Op;
12908  }
12909 
12910  // Lower SHL with variable shift amount.
12911  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
12912  Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
12913 
12914  Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
12915  Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
12916  Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
12917  return DAG.getNode(ISD::MUL, dl, VT, Op, R);
12918  }
12919  if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
12920  assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
12921 
12922  // a = a << 5;
12923  Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
12924  Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
12925 
12926  // Turn 'a' into a mask suitable for VSELECT
12927  SDValue VSelM = DAG.getConstant(0x80, VT);
12928  SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
12929  OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
12930 
12931  SDValue CM1 = DAG.getConstant(0x0f, VT);
12932  SDValue CM2 = DAG.getConstant(0x3f, VT);
12933 
12934  // r = VSELECT(r, psllw(r & (char16)15, 4), a);
12935  SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
12936  M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
12937  M = DAG.getNode(ISD::BITCAST, dl, VT, M);
12938  R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
12939 
12940  // a += a
12941  Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
12942  OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
12943  OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
12944 
12945  // r = VSELECT(r, psllw(r & (char16)63, 2), a);
12946  M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
12947  M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
12948  M = DAG.getNode(ISD::BITCAST, dl, VT, M);
12949  R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
12950 
12951  // a += a
12952  Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
12953  OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
12954  OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
12955 
12956  // return VSELECT(r, r+r, a);
12957  R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
12958  DAG.getNode(ISD::ADD, dl, VT, R, R), R);
12959  return R;
12960  }
12961 
12962  // Decompose 256-bit shifts into smaller 128-bit shifts.
12963  if (VT.is256BitVector()) {
12964  unsigned NumElems = VT.getVectorNumElements();
12965  MVT EltVT = VT.getVectorElementType().getSimpleVT();
12966  EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
12967 
12968  // Extract the two vectors
12969  SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
12970  SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
12971 
12972  // Recreate the shift amount vectors
12973  SDValue Amt1, Amt2;
12974  if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
12975  // Constant shift amount
12976  SmallVector<SDValue, 4> Amt1Csts;
12977  SmallVector<SDValue, 4> Amt2Csts;
12978  for (unsigned i = 0; i != NumElems/2; ++i)
12979  Amt1Csts.push_back(Amt->getOperand(i));
12980  for (unsigned i = NumElems/2; i != NumElems; ++i)
12981  Amt2Csts.push_back(Amt->getOperand(i));
12982 
12983  Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
12984  &Amt1Csts[0], NumElems/2);
12985  Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
12986  &Amt2Csts[0], NumElems/2);
12987  } else {
12988  // Variable shift amount
12989  Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
12990  Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
12991  }
12992 
12993  // Issue new vector shifts for the smaller types
12994  V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
12995  V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
12996 
12997  // Concatenate the result back
12998  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
12999  }
13000 
13001  return SDValue();
13002 }
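// Editorial sketch (not part of the original file): the constant-free variable
// SHL trick used above for v4i32.  Adding (Amt << 23) to the bit pattern of
// 1.0f places Amt directly in the IEEE-754 exponent field, producing 2^Amt, so
// one float->int conversion plus a multiply performs R << Amt per lane.
// Assumes 32-bit unsigned/float and Amt <= 30, where 2^Amt is exact and fits.
static unsigned shlViaFloatExponent(unsigned R, unsigned Amt) {
  unsigned Bits = 0x3f800000u + (Amt << 23);     // bit pattern of (float)2^Amt
  float PowerOfTwo = 0;
  // Model the ISD::BITCAST with a byte copy of the object representation.
  unsigned char *Src = reinterpret_cast<unsigned char *>(&Bits);
  unsigned char *Dst = reinterpret_cast<unsigned char *>(&PowerOfTwo);
  for (unsigned i = 0; i != sizeof(float); ++i) Dst[i] = Src[i];
  return R * (unsigned)(int)PowerOfTwo;          // FP_TO_SINT, then the MUL
}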
13003 
13004 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
13005  // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
13006  // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
13007  // looks for this combo and may remove the "setcc" instruction if the "setcc"
13008  // has only one use.
13009  SDNode *N = Op.getNode();
13010  SDValue LHS = N->getOperand(0);
13011  SDValue RHS = N->getOperand(1);
13012  unsigned BaseOp = 0;
13013  unsigned Cond = 0;
13014  SDLoc DL(Op);
13015  switch (Op.getOpcode()) {
13016  default: llvm_unreachable("Unknown ovf instruction!");
13017  case ISD::SADDO:
13018  // An add of one will be selected as an INC. Note that INC doesn't
13019  // set CF, so we can't do this for UADDO.
13020  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
13021  if (C->isOne()) {
13022  BaseOp = X86ISD::INC;
13023  Cond = X86::COND_O;
13024  break;
13025  }
13026  BaseOp = X86ISD::ADD;
13027  Cond = X86::COND_O;
13028  break;
13029  case ISD::UADDO:
13030  BaseOp = X86ISD::ADD;
13031  Cond = X86::COND_B;
13032  break;
13033  case ISD::SSUBO:
13034  // A subtract of one will be selected as a DEC. Note that DEC doesn't
13035  // set CF, so we can't do this for USUBO.
13036  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
13037  if (C->isOne()) {
13038  BaseOp = X86ISD::DEC;
13039  Cond = X86::COND_O;
13040  break;
13041  }
13042  BaseOp = X86ISD::SUB;
13043  Cond = X86::COND_O;
13044  break;
13045  case ISD::USUBO:
13046  BaseOp = X86ISD::SUB;
13047  Cond = X86::COND_B;
13048  break;
13049  case ISD::SMULO:
13050  BaseOp = X86ISD::SMUL;
13051  Cond = X86::COND_O;
13052  break;
13053  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
13054  SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
13055  MVT::i32);
13056  SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
13057 
13058  SDValue SetCC =
13059  DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
13060  DAG.getConstant(X86::COND_O, MVT::i32),
13061  SDValue(Sum.getNode(), 2));
13062 
13063  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
13064  }
13065  }
13066 
13067  // Also sets EFLAGS.
13068  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
13069  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
13070 
13071  SDValue SetCC =
13072  DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
13073  DAG.getConstant(Cond, MVT::i32),
13074  SDValue(Sum.getNode(), 1));
13075 
13076  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
13077 }
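// Editorial sketch (not part of the original file): what the {ADD, SETCC}
// pairs built above compute for a single i32, using portable bit tests in
// place of the EFLAGS reads (COND_O for signed overflow, COND_B for carry).
// Assumes two's-complement wraparound on the unsigned-to-int conversion.
static bool saddOverflows(int A, int B, int &Sum) {      // X86ISD::ADD + COND_O
  Sum = (int)((unsigned)A + (unsigned)B);                // wrapping add
  return ((A ^ Sum) & (B ^ Sum)) < 0;                    // sign differs from both inputs
}
static bool uaddOverflows(unsigned A, unsigned B, unsigned &Sum) { // ... + COND_B
  Sum = A + B;
  return Sum < A;                                        // carry out, i.e. CF
}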
13078 
13079 SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
13080  SelectionDAG &DAG) const {
13081  SDLoc dl(Op);
13082  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
13083  EVT VT = Op.getValueType();
13084 
13085  if (!Subtarget->hasSSE2() || !VT.isVector())
13086  return SDValue();
13087 
13088  unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
13089  ExtraVT.getScalarType().getSizeInBits();
13090 
13091  switch (VT.getSimpleVT().SimpleTy) {
13092  default: return SDValue();
13093  case MVT::v8i32:
13094  case MVT::v16i16:
13095  if (!Subtarget->hasFp256())
13096  return SDValue();
13097  if (!Subtarget->hasInt256()) {
13098  // needs to be split
13099  unsigned NumElems = VT.getVectorNumElements();
13100 
13101  // Extract the LHS vectors
13102  SDValue LHS = Op.getOperand(0);
13103  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
13104  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
13105 
13106  MVT EltVT = VT.getVectorElementType().getSimpleVT();
13107  EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
13108 
13109  EVT ExtraEltVT = ExtraVT.getVectorElementType();
13110  unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
13111  ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
13112  ExtraNumElems/2);
13113  SDValue Extra = DAG.getValueType(ExtraVT);
13114 
13115  LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
13116  LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
13117 
13118  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
13119  }
13120  // fall through
13121  case MVT::v4i32:
13122  case MVT::v8i16: {
13123  // (sext (vzext x)) -> (vsext x)
13124  SDValue Op0 = Op.getOperand(0);
13125  SDValue Op00 = Op0.getOperand(0);
13126  SDValue Tmp1;
13127  // Hopefully, this VECTOR_SHUFFLE is just a VZEXT.
13128  if (Op0.getOpcode() == ISD::BITCAST &&
13129  Op00.getOpcode() == ISD::VECTOR_SHUFFLE)
13130  Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG);
13131  if (Tmp1.getNode()) {
13132  SDValue Tmp1Op0 = Tmp1.getOperand(0);
13133  assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
13134  "This optimization is invalid without a VZEXT.");
13135  return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
13136  }
13137 
13138  // If the above didn't work, then just use Shift-Left + Shift-Right.
13139  Tmp1 = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, BitsDiff,
13140  DAG);
13141  return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Tmp1, BitsDiff,
13142  DAG);
13143  }
13144  }
13145 }
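// Editorial sketch (not part of the original file): the shift-left /
// arithmetic-shift-right fallback used above, for one 32-bit lane that is
// sign-extended from its low (32 - BitsDiff) bits.  Assumes the usual
// two's-complement, arithmetic ">>" behaviour for signed values.
static int signExtendInReg32(int X, unsigned BitsDiff) { // 0 < BitsDiff < 32
  return (int)((unsigned)X << BitsDiff) >> BitsDiff;     // VSHLI then VSRAI
}
// Example: signExtendInReg32(0x80, 24) == -128 (an i8 sign-extended in an i32 lane).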
13146 
13147 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
13148  SelectionDAG &DAG) {
13149  SDLoc dl(Op);
13150  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
13151  cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
13152  SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
13153  cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
13154 
13155  // The only fence that needs an instruction is a sequentially-consistent
13156  // cross-thread fence.
13157  if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
13158  // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
13159  // no-sse2). There isn't any reason to disable it if the target processor
13160  // supports it.
13161  if (Subtarget->hasSSE2() || Subtarget->is64Bit())
13162  return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
13163 
13164  SDValue Chain = Op.getOperand(0);
13165  SDValue Zero = DAG.getConstant(0, MVT::i32);
13166  SDValue Ops[] = {
13167  DAG.getRegister(X86::ESP, MVT::i32), // Base
13168  DAG.getTargetConstant(1, MVT::i8), // Scale
13169  DAG.getRegister(0, MVT::i32), // Index
13170  DAG.getTargetConstant(0, MVT::i32), // Disp
13171  DAG.getRegister(0, MVT::i32), // Segment.
13172  Zero,
13173  Chain
13174  };
13175  SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
13176  return SDValue(Res, 0);
13177  }
13178 
13179  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
13180  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
13181 }
13182 
13183 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
13184  SelectionDAG &DAG) {
13185  EVT T = Op.getValueType();
13186  SDLoc DL(Op);
13187  unsigned Reg = 0;
13188  unsigned size = 0;
13189  switch(T.getSimpleVT().SimpleTy) {
13190  default: llvm_unreachable("Invalid value type!");
13191  case MVT::i8: Reg = X86::AL; size = 1; break;
13192  case MVT::i16: Reg = X86::AX; size = 2; break;
13193  case MVT::i32: Reg = X86::EAX; size = 4; break;
13194  case MVT::i64:
13195  assert(Subtarget->is64Bit() && "Node not type legal!");
13196  Reg = X86::RAX; size = 8;
13197  break;
13198  }
13199  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
13200  Op.getOperand(2), SDValue());
13201  SDValue Ops[] = { cpIn.getValue(0),
13202  Op.getOperand(1),
13203  Op.getOperand(3),
13204  DAG.getTargetConstant(size, MVT::i8),
13205  cpIn.getValue(1) };
13206  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
13207  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
13208  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
13209  Ops, array_lengthof(Ops), T, MMO);
13210  SDValue cpOut =
13211  DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
13212  return cpOut;
13213 }
13214 
13215 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
13216  SelectionDAG &DAG) {
13217  assert(Subtarget->is64Bit() && "Result not type legalized?");
13218  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
13219  SDValue TheChain = Op.getOperand(0);
13220  SDLoc dl(Op);
13221  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
13222  SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
13223  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
13224  rax.getValue(2));
13225  SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
13226  DAG.getConstant(32, MVT::i8));
13227  SDValue Ops[] = {
13228  DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
13229  rdx.getValue(1)
13230  };
13231  return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
13232 }
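// Editorial sketch (not part of the original file): the same RDX:RAX
// recombination the node sequence above performs, written with GCC/Clang-style
// inline assembly.  RDTSC returns the counter split across two registers even
// in 64-bit mode, hence the shift-by-32 and OR.
static unsigned long long readCycleCounter() {
  unsigned Lo, Hi;
  __asm__ __volatile__("rdtsc" : "=a"(Lo), "=d"(Hi));
  return ((unsigned long long)Hi << 32) | Lo;            // SHL by 32, then OR
}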
13233 
13234 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
13235  SelectionDAG &DAG) {
13236  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
13237  MVT DstVT = Op.getSimpleValueType();
13238  assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
13239  Subtarget->hasMMX() && "Unexpected custom BITCAST");
13240  assert((DstVT == MVT::i64 ||
13241  (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
13242  "Unexpected custom BITCAST");
13243  // i64 <=> MMX conversions are Legal.
13244  if (SrcVT==MVT::i64 && DstVT.isVector())
13245  return Op;
13246  if (DstVT==MVT::i64 && SrcVT.isVector())
13247  return Op;
13248  // MMX <=> MMX conversions are Legal.
13249  if (SrcVT.isVector() && DstVT.isVector())
13250  return Op;
13251  // All other conversions need to be expanded.
13252  return SDValue();
13253 }
13254 
13255 static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
13256  SDNode *Node = Op.getNode();
13257  SDLoc dl(Node);
13258  EVT T = Node->getValueType(0);
13259  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
13260  DAG.getConstant(0, T), Node->getOperand(2));
13261  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
13262  cast<AtomicSDNode>(Node)->getMemoryVT(),
13263  Node->getOperand(0),
13264  Node->getOperand(1), negOp,
13265  cast<AtomicSDNode>(Node)->getSrcValue(),
13266  cast<AtomicSDNode>(Node)->getAlignment(),
13267  cast<AtomicSDNode>(Node)->getOrdering(),
13268  cast<AtomicSDNode>(Node)->getSynchScope());
13269 }
13270 
13271 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
13272  SDNode *Node = Op.getNode();
13273  SDLoc dl(Node);
13274  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
13275 
13276  // Convert seq_cst store -> xchg
13277  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
13278  // FIXME: On 32-bit, store -> fist or movq would be more efficient
13279  // (The only way to get a 16-byte store is cmpxchg16b)
13280  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
13281  if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
13282  !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
13283  SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
13284  cast<AtomicSDNode>(Node)->getMemoryVT(),
13285  Node->getOperand(0),
13286  Node->getOperand(1), Node->getOperand(2),
13287  cast<AtomicSDNode>(Node)->getMemOperand(),
13288  cast<AtomicSDNode>(Node)->getOrdering(),
13289  cast<AtomicSDNode>(Node)->getSynchScope());
13290  return Swap.getValue(1);
13291  }
13292  // Other atomic stores have a simple pattern.
13293  return Op;
13294 }
13295 
13296 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
13297  EVT VT = Op.getNode()->getValueType(0);
13298 
13299  // Let legalize expand this if it isn't a legal type yet.
13300  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
13301  return SDValue();
13302 
13303  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
13304 
13305  unsigned Opc;
13306  bool ExtraOp = false;
13307  switch (Op.getOpcode()) {
13308  default: llvm_unreachable("Invalid code");
13309  case ISD::ADDC: Opc = X86ISD::ADD; break;
13310  case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
13311  case ISD::SUBC: Opc = X86ISD::SUB; break;
13312  case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
13313  }
13314 
13315  if (!ExtraOp)
13316  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
13317  Op.getOperand(1));
13318  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
13319  Op.getOperand(1), Op.getOperand(2));
13320 }
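// Editorial sketch (not part of the original file): the carry chain these
// nodes express, i.e. a two-word addition where the low half produces the
// carry (X86ISD::ADD sets CF) and the high half consumes it (X86ISD::ADC).
static void add128(unsigned long long ALo, unsigned long long AHi,
                   unsigned long long BLo, unsigned long long BHi,
                   unsigned long long &SumLo, unsigned long long &SumHi) {
  SumLo = ALo + BLo;                        // ADDC: wrapping add, produces CF
  unsigned long long Carry = SumLo < ALo;   // CF from the low-half addition
  SumHi = AHi + BHi + Carry;                // ADDE: add with carry
}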
13321 
13322 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
13323  SelectionDAG &DAG) {
13324  assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
13325 
13326  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
13327  // which returns the values as { float, float } (in XMM0) or
13328  // { double, double } (which is returned in XMM0, XMM1).
13329  SDLoc dl(Op);
13330  SDValue Arg = Op.getOperand(0);
13331  EVT ArgVT = Arg.getValueType();
13332  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
13333 
13334  TargetLowering::ArgListTy Args;
13335  TargetLowering::ArgListEntry Entry;
13336 
13337  Entry.Node = Arg;
13338  Entry.Ty = ArgTy;
13339  Entry.isSExt = false;
13340  Entry.isZExt = false;
13341  Args.push_back(Entry);
13342 
13343  bool isF64 = ArgVT == MVT::f64;
13344  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
13345  // the small struct {f32, f32} is returned in (eax, edx). For f64,
13346  // the results are returned via SRet in memory.
13347  const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
13348  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13349  SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
13350 
13351  Type *RetTy = isF64
13352  ? (Type*)StructType::get(ArgTy, ArgTy, NULL)
13353  : (Type*)VectorType::get(ArgTy, 4);
13354  TargetLowering::
13355  CallLoweringInfo CLI(DAG.getEntryNode(), RetTy,
13356  false, false, false, false, 0,
13357  CallingConv::C, /*isTailCall=*/false,
13358  /*doesNotRet=*/false, /*isReturnValueUsed*/true,
13359  Callee, Args, DAG, dl);
13360  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
13361 
13362  if (isF64)
13363  // Returned in xmm0 and xmm1.
13364  return CallResult.first;
13365 
13366  // Returned in bits 0:31 and 32:64 xmm0.
13367  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
13368  CallResult.first, DAG.getIntPtrConstant(0));
13369  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
13370  CallResult.first, DAG.getIntPtrConstant(1));
13371  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
13372  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
13373 }
13374 
13375 /// LowerOperation - Provide custom lowering hooks for some operations.
13376 ///
13377 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
13378  switch (Op.getOpcode()) {
13379  default: llvm_unreachable("Should not custom lower this!");
13380  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG);
13381  case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
13382  case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op, Subtarget, DAG);
13383  case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
13384  case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
13385  case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
13386  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
13387  case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
13388  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
13389  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
13390  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
13391  case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
13392  case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
13393  case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
13394  case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
13395  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
13396  case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
13397  case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
13398  case ISD::SHL_PARTS:
13399  case ISD::SRA_PARTS:
13400  case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
13401  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
13402  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
13403  case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
13404  case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
13405  case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
13406  case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
13407  case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
13408  case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
13409  case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
13410  case ISD::FABS: return LowerFABS(Op, DAG);
13411  case ISD::FNEG: return LowerFNEG(Op, DAG);
13412  case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
13413  case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
13414  case ISD::SETCC: return LowerSETCC(Op, DAG);
13415  case ISD::SELECT: return LowerSELECT(Op, DAG);
13416  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
13417  case ISD::JumpTable: return LowerJumpTable(Op, DAG);
13418  case ISD::VASTART: return LowerVASTART(Op, DAG);
13419  case ISD::VAARG: return LowerVAARG(Op, DAG);
13420  case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
13421  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
13422  case ISD::INTRINSIC_VOID:
13423  case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
13424  case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
13425  case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
13426  case ISD::FRAME_TO_ARGS_OFFSET:
13427  return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
13428  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
13429  case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
13430  case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
13431  case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
13432  case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
13433  case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
13434  case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
13435  case ISD::CTLZ: return LowerCTLZ(Op, DAG);
13436  case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG);
13437  case ISD::CTTZ: return LowerCTTZ(Op, DAG);
13438  case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
13439  case ISD::SRA:
13440  case ISD::SRL:
13441  case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
13442  case ISD::SADDO:
13443  case ISD::UADDO:
13444  case ISD::SSUBO:
13445  case ISD::USUBO:
13446  case ISD::SMULO:
13447  case ISD::UMULO: return LowerXALUO(Op, DAG);
13448  case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
13449  case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
13450  case ISD::ADDC:
13451  case ISD::ADDE:
13452  case ISD::SUBC:
13453  case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
13454  case ISD::ADD: return LowerADD(Op, DAG);
13455  case ISD::SUB: return LowerSUB(Op, DAG);
13456  case ISD::SDIV: return LowerSDIV(Op, DAG);
13457  case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
13458  }
13459 }
13460 
13461 static void ReplaceATOMIC_LOAD(SDNode *Node,
13462  SmallVectorImpl<SDValue> &Results,
13463  SelectionDAG &DAG) {
13464  SDLoc dl(Node);
13465  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
13466 
13467  // Convert wide load -> cmpxchg8b/cmpxchg16b
13468  // FIXME: On 32-bit, load -> fild or movq would be more efficient
13469  // (The only way to get a 16-byte load is cmpxchg16b)
13470  // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
13471  SDValue Zero = DAG.getConstant(0, VT);
13472  SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT,
13473  Node->getOperand(0),
13474  Node->getOperand(1), Zero, Zero,
13475  cast<AtomicSDNode>(Node)->getMemOperand(),
13476  cast<AtomicSDNode>(Node)->getOrdering(),
13477  cast<AtomicSDNode>(Node)->getSynchScope());
13478  Results.push_back(Swap.getValue(0));
13479  Results.push_back(Swap.getValue(1));
13480 }
13481 
13482 static void
13483 ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
13484  SelectionDAG &DAG, unsigned NewOp) {
13485  SDLoc dl(Node);
13486  assert (Node->getValueType(0) == MVT::i64 &&
13487  "Only know how to expand i64 atomics");
13488 
13489  SDValue Chain = Node->getOperand(0);
13490  SDValue In1 = Node->getOperand(1);
13491  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
13492  Node->getOperand(2), DAG.getIntPtrConstant(0));
13493  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
13494  Node->getOperand(2), DAG.getIntPtrConstant(1));
13495  SDValue Ops[] = { Chain, In1, In2L, In2H };
13496  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
13497  SDValue Result =
13498  DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, array_lengthof(Ops), MVT::i64,
13499  cast<MemSDNode>(Node)->getMemOperand());
13500  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
13501  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
13502  Results.push_back(Result.getValue(2));
13503 }
13504 
13505 /// ReplaceNodeResults - Replace a node with an illegal result type
13506 /// with a new node built out of custom code.
13507 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
13508  SmallVectorImpl<SDValue>&Results,
13509  SelectionDAG &DAG) const {
13510  SDLoc dl(N);
13511  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13512  switch (N->getOpcode()) {
13513  default:
13514  llvm_unreachable("Do not know how to custom type legalize this operation!");
13515  case ISD::SIGN_EXTEND_INREG:
13516  case ISD::ADDC:
13517  case ISD::ADDE:
13518  case ISD::SUBC:
13519  case ISD::SUBE:
13520  // We don't want to expand or promote these.
13521  return;
13522  case ISD::FP_TO_SINT:
13523  case ISD::FP_TO_UINT: {
13524  bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
13525 
13526  if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
13527  return;
13528 
13529  std::pair<SDValue,SDValue> Vals =
13530  FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
13531  SDValue FIST = Vals.first, StackSlot = Vals.second;
13532  if (FIST.getNode() != 0) {
13533  EVT VT = N->getValueType(0);
13534  // Return a load from the stack slot.
13535  if (StackSlot.getNode() != 0)
13536  Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
13537  MachinePointerInfo(),
13538  false, false, false, 0));
13539  else
13540  Results.push_back(FIST);
13541  }
13542  return;
13543  }
13544  case ISD::UINT_TO_FP: {
13545  assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
13546  if (N->getOperand(0).getValueType() != MVT::v2i32 ||
13547  N->getValueType(0) != MVT::v2f32)
13548  return;
13549  SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
13550  N->getOperand(0));
13551  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
13552  MVT::f64);
13553  SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
13554  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
13555  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
13556  Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
13557  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
13558  Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
13559  return;
13560  }
13561  case ISD::FP_ROUND: {
13562  if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
13563  return;
13564  SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
13565  Results.push_back(V);
13566  return;
13567  }
13568  case ISD::READCYCLECOUNTER: {
13569  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
13570  SDValue TheChain = N->getOperand(0);
13571  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
13572  SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
13573  rd.getValue(1));
13574  SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
13575  eax.getValue(2));
13576  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
13577  SDValue Ops[] = { eax, edx };
13578  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops,
13579  array_lengthof(Ops)));
13580  Results.push_back(edx.getValue(1));
13581  return;
13582  }
13583  case ISD::ATOMIC_CMP_SWAP: {
13584  EVT T = N->getValueType(0);
13585  assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
13586  bool Regs64bit = T == MVT::i128;
13587  EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
13588  SDValue cpInL, cpInH;
13589  cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
13590  DAG.getConstant(0, HalfT));
13591  cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
13592  DAG.getConstant(1, HalfT));
13593  cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
13594  Regs64bit ? X86::RAX : X86::EAX,
13595  cpInL, SDValue());
13596  cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
13597  Regs64bit ? X86::RDX : X86::EDX,
13598  cpInH, cpInL.getValue(1));
13599  SDValue swapInL, swapInH;
13600  swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
13601  DAG.getConstant(0, HalfT));
13602  swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
13603  DAG.getConstant(1, HalfT));
13604  swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
13605  Regs64bit ? X86::RBX : X86::EBX,
13606  swapInL, cpInH.getValue(1));
13607  swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
13608  Regs64bit ? X86::RCX : X86::ECX,
13609  swapInH, swapInL.getValue(1));
13610  SDValue Ops[] = { swapInH.getValue(0),
13611  N->getOperand(1),
13612  swapInH.getValue(1) };
13613  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
13614  MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
13615  unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
13616  X86ISD::LCMPXCHG8_DAG;
13617  SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,
13618  Ops, array_lengthof(Ops), T, MMO);
13619  SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
13620  Regs64bit ? X86::RAX : X86::EAX,
13621  HalfT, Result.getValue(1));
13622  SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
13623  Regs64bit ? X86::RDX : X86::EDX,
13624  HalfT, cpOutL.getValue(2));
13625  SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
13626  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2));
13627  Results.push_back(cpOutH.getValue(1));
13628  return;
13629  }
13630  case ISD::ATOMIC_LOAD_ADD:
13631  case ISD::ATOMIC_LOAD_AND:
13632  case ISD::ATOMIC_LOAD_NAND:
13633  case ISD::ATOMIC_LOAD_OR:
13634  case ISD::ATOMIC_LOAD_SUB:
13635  case ISD::ATOMIC_LOAD_XOR:
13636  case ISD::ATOMIC_LOAD_MAX:
13637  case ISD::ATOMIC_LOAD_MIN:
13638  case ISD::ATOMIC_LOAD_UMAX:
13639  case ISD::ATOMIC_LOAD_UMIN:
13640  case ISD::ATOMIC_SWAP: {
13641  unsigned Opc;
13642  switch (N->getOpcode()) {
13643  default: llvm_unreachable("Unexpected opcode");
13644  case ISD::ATOMIC_LOAD_ADD:
13645  Opc = X86ISD::ATOMADD64_DAG;
13646  break;
13647  case ISD::ATOMIC_LOAD_AND:
13648  Opc = X86ISD::ATOMAND64_DAG;
13649  break;
13650  case ISD::ATOMIC_LOAD_NAND:
13651  Opc = X86ISD::ATOMNAND64_DAG;
13652  break;
13653  case ISD::ATOMIC_LOAD_OR:
13654  Opc = X86ISD::ATOMOR64_DAG;
13655  break;
13656  case ISD::ATOMIC_LOAD_SUB:
13657  Opc = X86ISD::ATOMSUB64_DAG;
13658  break;
13659  case ISD::ATOMIC_LOAD_XOR:
13660  Opc = X86ISD::ATOMXOR64_DAG;
13661  break;
13662  case ISD::ATOMIC_LOAD_MAX:
13663  Opc = X86ISD::ATOMMAX64_DAG;
13664  break;
13665  case ISD::ATOMIC_LOAD_MIN:
13666  Opc = X86ISD::ATOMMIN64_DAG;
13667  break;
13668  case ISD::ATOMIC_LOAD_UMAX:
13669  Opc = X86ISD::ATOMUMAX64_DAG;
13670  break;
13671  case ISD::ATOMIC_LOAD_UMIN:
13672  Opc = X86ISD::ATOMUMIN64_DAG;
13673  break;
13674  case ISD::ATOMIC_SWAP:
13675  Opc = X86ISD::ATOMSWAP64_DAG;
13676  break;
13677  }
13678  ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc);
13679  return;
13680  }
13681  case ISD::ATOMIC_LOAD:
13682  ReplaceATOMIC_LOAD(N, Results, DAG);
13683  }
13684 }
13685 
13686 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
13687  switch (Opcode) {
13688  default: return NULL;
13689  case X86ISD::BSF: return "X86ISD::BSF";
13690  case X86ISD::BSR: return "X86ISD::BSR";
13691  case X86ISD::SHLD: return "X86ISD::SHLD";
13692  case X86ISD::SHRD: return "X86ISD::SHRD";
13693  case X86ISD::FAND: return "X86ISD::FAND";
13694  case X86ISD::FANDN: return "X86ISD::FANDN";
13695  case X86ISD::FOR: return "X86ISD::FOR";
13696  case X86ISD::FXOR: return "X86ISD::FXOR";
13697  case X86ISD::FSRL: return "X86ISD::FSRL";
13698  case X86ISD::FILD: return "X86ISD::FILD";
13699  case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
13700  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
13701  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
13702  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
13703  case X86ISD::FLD: return "X86ISD::FLD";
13704  case X86ISD::FST: return "X86ISD::FST";
13705  case X86ISD::CALL: return "X86ISD::CALL";
13706  case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
13707  case X86ISD::BT: return "X86ISD::BT";
13708  case X86ISD::CMP: return "X86ISD::CMP";
13709  case X86ISD::COMI: return "X86ISD::COMI";
13710  case X86ISD::UCOMI: return "X86ISD::UCOMI";
13711  case X86ISD::CMPM: return "X86ISD::CMPM";
13712  case X86ISD::CMPMU: return "X86ISD::CMPMU";
13713  case X86ISD::SETCC: return "X86ISD::SETCC";
13714  case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
13715  case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd";
13716  case X86ISD::FSETCCss: return "X86ISD::FSETCCss";
13717  case X86ISD::CMOV: return "X86ISD::CMOV";
13718  case X86ISD::BRCOND: return "X86ISD::BRCOND";
13719  case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
13720  case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
13721  case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
13722  case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
13723  case X86ISD::Wrapper: return "X86ISD::Wrapper";
13724  case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
13725  case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
13726  case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
13727  case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
13728  case X86ISD::PINSRB: return "X86ISD::PINSRB";
13729  case X86ISD::PINSRW: return "X86ISD::PINSRW";
13730  case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
13731  case X86ISD::ANDNP: return "X86ISD::ANDNP";
13732  case X86ISD::PSIGN: return "X86ISD::PSIGN";
13733  case X86ISD::BLENDV: return "X86ISD::BLENDV";
13734  case X86ISD::BLENDI: return "X86ISD::BLENDI";
13735  case X86ISD::SUBUS: return "X86ISD::SUBUS";
13736  case X86ISD::HADD: return "X86ISD::HADD";
13737  case X86ISD::HSUB: return "X86ISD::HSUB";
13738  case X86ISD::FHADD: return "X86ISD::FHADD";
13739  case X86ISD::FHSUB: return "X86ISD::FHSUB";
13740  case X86ISD::UMAX: return "X86ISD::UMAX";
13741  case X86ISD::UMIN: return "X86ISD::UMIN";
13742  case X86ISD::SMAX: return "X86ISD::SMAX";
13743  case X86ISD::SMIN: return "X86ISD::SMIN";
13744  case X86ISD::FMAX: return "X86ISD::FMAX";
13745  case X86ISD::FMIN: return "X86ISD::FMIN";
13746  case X86ISD::FMAXC: return "X86ISD::FMAXC";
13747  case X86ISD::FMINC: return "X86ISD::FMINC";
13748  case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
13749  case X86ISD::FRCP: return "X86ISD::FRCP";
13750  case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
13751  case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
13752  case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
13753  case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
13754  case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
13755  case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
13756  case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
13757  case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
13758  case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
13759  case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
13760  case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
13761  case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG";
13762  case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG";
13763  case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG";
13764  case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG";
13765  case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG";
13766  case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG";
13767  case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
13768  case X86ISD::VSEXT_MOVL: return "X86ISD::VSEXT_MOVL";
13769  case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
13770  case X86ISD::VZEXT: return "X86ISD::VZEXT";
13771  case X86ISD::VSEXT: return "X86ISD::VSEXT";
13772  case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
13773  case X86ISD::VTRUNCM: return "X86ISD::VTRUNCM";
13774  case X86ISD::VINSERT: return "X86ISD::VINSERT";
13775  case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
13776  case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
13777  case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
13778  case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
13779  case X86ISD::VSHL: return "X86ISD::VSHL";
13780  case X86ISD::VSRL: return "X86ISD::VSRL";
13781  case X86ISD::VSRA: return "X86ISD::VSRA";
13782  case X86ISD::VSHLI: return "X86ISD::VSHLI";
13783  case X86ISD::VSRLI: return "X86ISD::VSRLI";
13784  case X86ISD::VSRAI: return "X86ISD::VSRAI";
13785  case X86ISD::CMPP: return "X86ISD::CMPP";
13786  case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
13787  case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
13788  case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
13789  case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
13790  case X86ISD::ADD: return "X86ISD::ADD";
13791  case X86ISD::SUB: return "X86ISD::SUB";
13792  case X86ISD::ADC: return "X86ISD::ADC";
13793  case X86ISD::SBB: return "X86ISD::SBB";
13794  case X86ISD::SMUL: return "X86ISD::SMUL";
13795  case X86ISD::UMUL: return "X86ISD::UMUL";
13796  case X86ISD::INC: return "X86ISD::INC";
13797  case X86ISD::DEC: return "X86ISD::DEC";
13798  case X86ISD::OR: return "X86ISD::OR";
13799  case X86ISD::XOR: return "X86ISD::XOR";
13800  case X86ISD::AND: return "X86ISD::AND";
13801  case X86ISD::BLSI: return "X86ISD::BLSI";
13802  case X86ISD::BLSMSK: return "X86ISD::BLSMSK";
13803  case X86ISD::BLSR: return "X86ISD::BLSR";
13804  case X86ISD::BZHI: return "X86ISD::BZHI";
13805  case X86ISD::BEXTR: return "X86ISD::BEXTR";
13806  case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
13807  case X86ISD::PTEST: return "X86ISD::PTEST";
13808  case X86ISD::TESTP: return "X86ISD::TESTP";
13809  case X86ISD::TESTM: return "X86ISD::TESTM";
13810  case X86ISD::KORTEST: return "X86ISD::KORTEST";
13811  case X86ISD::KTEST: return "X86ISD::KTEST";
13812  case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
13813  case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
13814  case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
13815  case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
13816  case X86ISD::SHUFP: return "X86ISD::SHUFP";
13817  case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
13818  case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
13819  case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
13820  case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
13821  case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
13822  case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
13823  case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
13824  case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
13825  case X86ISD::MOVSD: return "X86ISD::MOVSD";
13826  case X86ISD::MOVSS: return "X86ISD::MOVSS";
13827  case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
13828  case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
13829  case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
13830  case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
13831  case X86ISD::VPERMILP: return "X86ISD::VPERMILP";
13832  case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
13833  case X86ISD::VPERMV: return "X86ISD::VPERMV";
13834  case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
13835  case X86ISD::VPERMI: return "X86ISD::VPERMI";
13836  case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
13837  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
13838  case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
13839  case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
13840  case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
13841  case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
13842  case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL";
13843  case X86ISD::SAHF: return "X86ISD::SAHF";
13844  case X86ISD::RDRAND: return "X86ISD::RDRAND";
13845  case X86ISD::RDSEED: return "X86ISD::RDSEED";
13846  case X86ISD::FMADD: return "X86ISD::FMADD";
13847  case X86ISD::FMSUB: return "X86ISD::FMSUB";
13848  case X86ISD::FNMADD: return "X86ISD::FNMADD";
13849  case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
13850  case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
13851  case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
13852  case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
13853  case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
13854  case X86ISD::XTEST: return "X86ISD::XTEST";
13855  }
13856 }
13857 
13858 // isLegalAddressingMode - Return true if the addressing mode represented
13859 // by AM is legal for this target, for a load/store of the specified type.
13860 bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
13861  Type *Ty) const {
13862  // X86 supports extremely general addressing modes.
13863  CodeModel::Model M = getTargetMachine().getCodeModel();
13864  Reloc::Model R = getTargetMachine().getRelocationModel();
13865 
13866  // X86 allows a sign-extended 32-bit immediate field as a displacement.
13867  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
13868  return false;
13869 
13870  if (AM.BaseGV) {
13871  unsigned GVFlags =
13872  Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
13873 
13874  // If a reference to this global requires an extra load, we can't fold it.
13875  if (isGlobalStubReference(GVFlags))
13876  return false;
13877 
13878  // If BaseGV requires a register for the PIC base, we cannot also have a
13879  // BaseReg specified.
13880  if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
13881  return false;
13882 
13883  // If lower 4G is not available, then we must use rip-relative addressing.
13884  if ((M != CodeModel::Small || R != Reloc::Static) &&
13885  Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
13886  return false;
13887  }
13888 
13889  switch (AM.Scale) {
13890  case 0:
13891  case 1:
13892  case 2:
13893  case 4:
13894  case 8:
13895  // These scales always work.
13896  break;
13897  case 3:
13898  case 5:
13899  case 9:
13900  // These scales are formed with basereg+scalereg. Only accept if there is
13901  // no basereg yet.
13902  if (AM.HasBaseReg)
13903  return false;
13904  break;
13905  default: // Other stuff never works.
13906  return false;
13907  }
13908 
13909  return true;
13910 }
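The scale switch above encodes the one non-obvious x86 rule: 1, 2, 4 and 8 fit directly in a SIB byte, while 3, 5 and 9 are only reachable as base + index*2/4/8, i.e. by spending the base register. A small stand-alone sketch of just that rule (hypothetical helper, not LLVM API):

// Returns true if 'Scale' can be encoded given whether a base register is
// already in use; scales 3/5/9 are synthesized as reg + reg*2/4/8.
static bool isEncodableScale(unsigned Scale, bool HasBaseReg) {
  switch (Scale) {
  case 0: case 1: case 2: case 4: case 8:
    return true;
  case 3: case 5: case 9:
    return !HasBaseReg;
  default:
    return false;
  }
}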
13911 
13912 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
13913  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
13914  return false;
13915  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
13916  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
13917  return NumBits1 > NumBits2;
13918 }
13919 
13920 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
13921  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
13922  return false;
13923 
13924  if (!isTypeLegal(EVT::getEVT(Ty1)))
13925  return false;
13926 
13927  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
13928 
13929  // Assuming the caller doesn't have a zeroext or signext return parameter,
13930  // truncation all the way down to i1 is valid.
13931  return true;
13932 }
13933 
13934 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
13935  return isInt<32>(Imm);
13936 }
13937 
13938 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
13939  // Can also use sub to handle negated immediates.
13940  return isInt<32>(Imm);
13941 }
13942 
13943 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
13944  if (!VT1.isInteger() || !VT2.isInteger())
13945  return false;
13946  unsigned NumBits1 = VT1.getSizeInBits();
13947  unsigned NumBits2 = VT2.getSizeInBits();
13948  return NumBits1 > NumBits2;
13949 }
13950 
13951 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
13952  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
13953  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
13954 }
13955 
13956 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
13957  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
13958  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
13959 }
13960 
13961 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
13962  EVT VT1 = Val.getValueType();
13963  if (isZExtFree(VT1, VT2))
13964  return true;
13965 
13966  if (Val.getOpcode() != ISD::LOAD)
13967  return false;
13968 
13969  if (!VT1.isSimple() || !VT1.isInteger() ||
13970  !VT2.isSimple() || !VT2.isInteger())
13971  return false;
13972 
13973  switch (VT1.getSimpleVT().SimpleTy) {
13974  default: break;
13975  case MVT::i8:
13976  case MVT::i16:
13977  case MVT::i32:
13978  // X86 has 8, 16, and 32-bit zero-extending loads.
13979  return true;
13980  }
13981 
13982  return false;
13983 }
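The two facts these hooks rely on are that writing a 32-bit register on x86-64 implicitly clears the upper 32 bits, and that 8/16/32-bit loads have zero-extending forms (movzbl, movzwl, plain mov). At the source level the free zero extensions look like this (illustrative only; a typical compiler emits a single mov/movzbl for each):

#include <cstdint>

// Typically a single "movl %edi, %eax": the zero extension is implicit.
uint64_t widen(uint32_t X) { return X; }

// Typically a single "movzbl (%rdi), %eax": the load zero-extends for free.
uint32_t loadByte(const uint8_t *P) { return *P; }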
13984 
13985 bool
13986 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
13987  if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
13988  return false;
13989 
13990  VT = VT.getScalarType();
13991 
13992  if (!VT.isSimple())
13993  return false;
13994 
13995  switch (VT.getSimpleVT().SimpleTy) {
13996  case MVT::f32:
13997  case MVT::f64:
13998  return true;
13999  default:
14000  break;
14001  }
14002 
14003  return false;
14004 }
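Returning true here is what permits a separate f32/f64 multiply and add to be contracted into one fused multiply-add when FMA or FMA4 is available. A sketch of the target pattern at the source level (std::fma is used only to make the fused form explicit):

#include <cmath>

// With FMA available, a*b + c can be emitted as a single vfmadd instruction.
double mulAdd(double A, double B, double C) { return std::fma(A, B, C); }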
14005 
14006 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
14007  // i16 instructions are longer (0x66 prefix) and potentially slower.
14008  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
14009 }
14010 
14011 /// isShuffleMaskLegal - Targets can use this to indicate that they only
14012 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
14013 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
14014 /// are assumed to be legal.
14015 bool
14016 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
14017  EVT VT) const {
14018  if (!VT.isSimple())
14019  return false;
14020 
14021  MVT SVT = VT.getSimpleVT();
14022 
14023  // Very little shuffling can be done for 64-bit vectors right now.
14024  if (VT.getSizeInBits() == 64)
14025  return false;
14026 
14027  // FIXME: pshufb, blends, shifts.
14028  return (SVT.getVectorNumElements() == 2 ||
14029  ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
14030  isMOVLMask(M, SVT) ||
14031  isSHUFPMask(M, SVT) ||
14032  isPSHUFDMask(M, SVT) ||
14033  isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
14034  isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
14035  isPALIGNRMask(M, SVT, Subtarget) ||
14036  isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
14037  isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
14038  isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
14039  isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()));
14040 }
14041 
14042 bool
14043 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
14044  EVT VT) const {
14045  if (!VT.isSimple())
14046  return false;
14047 
14048  MVT SVT = VT.getSimpleVT();
14049  unsigned NumElts = SVT.getVectorNumElements();
14050  // FIXME: This collection of masks seems suspect.
14051  if (NumElts == 2)
14052  return true;
14053  if (NumElts == 4 && SVT.is128BitVector()) {
14054  return (isMOVLMask(Mask, SVT) ||
14055  isCommutedMOVLMask(Mask, SVT, true) ||
14056  isSHUFPMask(Mask, SVT) ||
14057  isSHUFPMask(Mask, SVT, /* Commuted */ true));
14058  }
14059  return false;
14060 }
14061 
14062 //===----------------------------------------------------------------------===//
14063 // X86 Scheduler Hooks
14064 //===----------------------------------------------------------------------===//
14065 
14066 /// Utility function to emit xbegin specifying the start of an RTM region.
14067 static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
14068  const TargetInstrInfo *TII) {
14069  DebugLoc DL = MI->getDebugLoc();
14070 
14071  const BasicBlock *BB = MBB->getBasicBlock();
14072  MachineFunction::iterator I = MBB;
14073  ++I;
14074 
14075  // For the v = xbegin(), we generate
14076  //
14077  // thisMBB:
14078  // xbegin sinkMBB
14079  //
14080  // mainMBB:
14081  // eax = -1
14082  //
14083  // sinkMBB:
14084  // v = eax
14085 
14086  MachineBasicBlock *thisMBB = MBB;
14087  MachineFunction *MF = MBB->getParent();
14088  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
14089  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
14090  MF->insert(I, mainMBB);
14091  MF->insert(I, sinkMBB);
14092 
14093  // Transfer the remainder of BB and its successor edges to sinkMBB.
14094  sinkMBB->splice(sinkMBB->begin(), MBB,
14095  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
14096  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
14097 
14098  // thisMBB:
14099  // xbegin sinkMBB
14100  // # fallthrough to mainMBB
14101  // # abort: fall back to sinkMBB
14102  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
14103  thisMBB->addSuccessor(mainMBB);
14104  thisMBB->addSuccessor(sinkMBB);
14105 
14106  // mainMBB:
14107  // EAX = -1
14108  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
14109  mainMBB->addSuccessor(sinkMBB);
14110 
14111  // sinkMBB:
14112  // EAX is live into the sinkMBB
14113  sinkMBB->addLiveIn(X86::EAX);
14114  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
14115  TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
14116  .addReg(X86::EAX);
14117 
14118  MI->eraseFromParent();
14119  return sinkMBB;
14120 }
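For reference, the block structure built above matches the behaviour of the RTM intrinsics: _xbegin() returns _XBEGIN_STARTED (-1, the "EAX = -1" mainMBB path) when the transaction starts, and otherwise control resumes at the fallback label (sinkMBB) with the abort status in EAX. A sketch, assuming a toolchain with the RTM intrinsics available:

#include <immintrin.h>

unsigned tryTransaction() {
  unsigned Status = _xbegin();          // xbegin sinkMBB
  if (Status == _XBEGIN_STARTED) {      // fallthrough path: EAX == -1
    // ... transactional work ...
    _xend();
  }
  return Status;                        // v = eax
}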
14121 
14122 // Get CMPXCHG opcode for the specified data type.
14123 static unsigned getCmpXChgOpcode(EVT VT) {
14124  switch (VT.getSimpleVT().SimpleTy) {
14125  case MVT::i8: return X86::LCMPXCHG8;
14126  case MVT::i16: return X86::LCMPXCHG16;
14127  case MVT::i32: return X86::LCMPXCHG32;
14128  case MVT::i64: return X86::LCMPXCHG64;
14129  default:
14130  break;
14131  }
14132  llvm_unreachable("Invalid operand size!");
14133 }
14134 
14135 // Get LOAD opcode for the specified data type.
14136 static unsigned getLoadOpcode(EVT VT) {
14137  switch (VT.getSimpleVT().SimpleTy) {
14138  case MVT::i8: return X86::MOV8rm;
14139  case MVT::i16: return X86::MOV16rm;
14140  case MVT::i32: return X86::MOV32rm;
14141  case MVT::i64: return X86::MOV64rm;
14142  default:
14143  break;
14144  }
14145  llvm_unreachable("Invalid operand size!");
14146 }
14147 
14148 // Get opcode of the non-atomic one from the specified atomic instruction.
14149 static unsigned getNonAtomicOpcode(unsigned Opc) {
14150  switch (Opc) {
14151  case X86::ATOMAND8: return X86::AND8rr;
14152  case X86::ATOMAND16: return X86::AND16rr;
14153  case X86::ATOMAND32: return X86::AND32rr;
14154  case X86::ATOMAND64: return X86::AND64rr;
14155  case X86::ATOMOR8: return X86::OR8rr;
14156  case X86::ATOMOR16: return X86::OR16rr;
14157  case X86::ATOMOR32: return X86::OR32rr;
14158  case X86::ATOMOR64: return X86::OR64rr;
14159  case X86::ATOMXOR8: return X86::XOR8rr;
14160  case X86::ATOMXOR16: return X86::XOR16rr;
14161  case X86::ATOMXOR32: return X86::XOR32rr;
14162  case X86::ATOMXOR64: return X86::XOR64rr;
14163  }
14164  llvm_unreachable("Unhandled atomic-load-op opcode!");
14165 }
14166 
14167 // Get opcode of the non-atomic one from the specified atomic instruction with
14168 // extra opcode.
14169 static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc,
14170  unsigned &ExtraOpc) {
14171  switch (Opc) {
14172  case X86::ATOMNAND8: ExtraOpc = X86::NOT8r; return X86::AND8rr;
14173  case X86::ATOMNAND16: ExtraOpc = X86::NOT16r; return X86::AND16rr;
14174  case X86::ATOMNAND32: ExtraOpc = X86::NOT32r; return X86::AND32rr;
14175  case X86::ATOMNAND64: ExtraOpc = X86::NOT64r; return X86::AND64rr;
14176  case X86::ATOMMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVL32rr;
14177  case X86::ATOMMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr;
14178  case X86::ATOMMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr;
14179  case X86::ATOMMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr;
14180  case X86::ATOMMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVG32rr;
14181  case X86::ATOMMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr;
14182  case X86::ATOMMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr;
14183  case X86::ATOMMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr;
14184  case X86::ATOMUMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVB32rr;
14185  case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr;
14186  case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr;
14187  case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr;
14188  case X86::ATOMUMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVA32rr;
14189  case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr;
14190  case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr;
14191  case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr;
14192  }
14193  llvm_unreachable("Unhandled atomic-load-op opcode!");
14194 }
14195 
14196 // Get opcode of the non-atomic one from the specified atomic instruction for
14197 // 64-bit data type on 32-bit target.
14198 static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) {
14199  switch (Opc) {
14200  case X86::ATOMAND6432: HiOpc = X86::AND32rr; return X86::AND32rr;
14201  case X86::ATOMOR6432: HiOpc = X86::OR32rr; return X86::OR32rr;
14202  case X86::ATOMXOR6432: HiOpc = X86::XOR32rr; return X86::XOR32rr;
14203  case X86::ATOMADD6432: HiOpc = X86::ADC32rr; return X86::ADD32rr;
14204  case X86::ATOMSUB6432: HiOpc = X86::SBB32rr; return X86::SUB32rr;
14205  case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr;
14206  case X86::ATOMMAX6432: HiOpc = X86::SETLr; return X86::SETLr;
14207  case X86::ATOMMIN6432: HiOpc = X86::SETGr; return X86::SETGr;
14208  case X86::ATOMUMAX6432: HiOpc = X86::SETBr; return X86::SETBr;
14209  case X86::ATOMUMIN6432: HiOpc = X86::SETAr; return X86::SETAr;
14210  }
14211  llvm_unreachable("Unhandled atomic-load-op opcode!");
14212 }
14213 
14214 // Get opcode of the non-atomic one from the specified atomic instruction for
14215 // 64-bit data type on 32-bit target with extra opcode.
14216 static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc,
14217  unsigned &HiOpc,
14218  unsigned &ExtraOpc) {
14219  switch (Opc) {
14220  case X86::ATOMNAND6432:
14221  ExtraOpc = X86::NOT32r;
14222  HiOpc = X86::AND32rr;
14223  return X86::AND32rr;
14224  }
14225  llvm_unreachable("Unhandled atomic-load-op opcode!");
14226 }
14227 
14228 // Get pseudo CMOV opcode from the specified data type.
14229 static unsigned getPseudoCMOVOpc(EVT VT) {
14230  switch (VT.getSimpleVT().SimpleTy) {
14231  case MVT::i8: return X86::CMOV_GR8;
14232  case MVT::i16: return X86::CMOV_GR16;
14233  case MVT::i32: return X86::CMOV_GR32;
14234  default:
14235  break;
14236  }
14237  llvm_unreachable("Unknown CMOV opcode!");
14238 }
14239 
14240 // EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions.
14241 // They will be translated into a spin-loop or compare-exchange loop from
14242 //
14243 // ...
14244 // dst = atomic-fetch-op MI.addr, MI.val
14245 // ...
14246 //
14247 // to
14248 //
14249 // ...
14250 // t1 = LOAD MI.addr
14251 // loop:
14252 // t4 = phi(t1, t3 / loop)
14253 // t2 = OP MI.val, t4
14254 // EAX = t4
14255 // LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined]
14256 // t3 = EAX
14257 // JNE loop
14258 // sink:
14259 // dst = t3
14260 // ...
14261 MachineBasicBlock *
14262 X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
14263  MachineBasicBlock *MBB) const {
14264  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
14265  DebugLoc DL = MI->getDebugLoc();
14266 
14267  MachineFunction *MF = MBB->getParent();
14268  MachineRegisterInfo &MRI = MF->getRegInfo();
14269 
14270  const BasicBlock *BB = MBB->getBasicBlock();
14271  MachineFunction::iterator I = MBB;
14272  ++I;
14273 
14274  assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 &&
14275  "Unexpected number of operands");
14276 
14277  assert(MI->hasOneMemOperand() &&
14278  "Expected atomic-load-op to have one memoperand");
14279 
14280  // Memory Reference
14281  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
14282  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
14283 
14284  unsigned DstReg, SrcReg;
14285  unsigned MemOpndSlot;
14286 
14287  unsigned CurOp = 0;
14288 
14289  DstReg = MI->getOperand(CurOp++).getReg();
14290  MemOpndSlot = CurOp;
14291  CurOp += X86::AddrNumOperands;
14292  SrcReg = MI->getOperand(CurOp++).getReg();
14293 
14294  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
14295  MVT::SimpleValueType VT = *RC->vt_begin();
14296  unsigned t1 = MRI.createVirtualRegister(RC);
14297  unsigned t2 = MRI.createVirtualRegister(RC);
14298  unsigned t3 = MRI.createVirtualRegister(RC);
14299  unsigned t4 = MRI.createVirtualRegister(RC);
14300  unsigned PhyReg = getX86SubSuperRegister(X86::EAX, VT);
14301 
14302  unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT);
14303  unsigned LOADOpc = getLoadOpcode(VT);
14304 
14305  // For the atomic load-arith operator, we generate
14306  //
14307  // thisMBB:
14308  // t1 = LOAD [MI.addr]
14309  // mainMBB:
14310  // t4 = phi(t1 / thisMBB, t3 / mainMBB)
14311  // t1 = OP MI.val, EAX
14312  // EAX = t4
14313  // LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
14314  // t3 = EAX
14315  // JNE mainMBB
14316  // sinkMBB:
14317  // dst = t3
14318 
14319  MachineBasicBlock *thisMBB = MBB;
14320  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
14321  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
14322  MF->insert(I, mainMBB);
14323  MF->insert(I, sinkMBB);
14324 
14325  MachineInstrBuilder MIB;
14326 
14327  // Transfer the remainder of BB and its successor edges to sinkMBB.
14328  sinkMBB->splice(sinkMBB->begin(), MBB,
14329  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
14330  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
14331 
14332  // thisMBB:
14333  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1);
14334  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
14335  MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
14336  if (NewMO.isReg())
14337  NewMO.setIsKill(false);
14338  MIB.addOperand(NewMO);
14339  }
14340  for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
14341  unsigned flags = (*MMOI)->getFlags();
14342  flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
14343  MachineMemOperand *MMO =
14344  MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
14345  (*MMOI)->getSize(),
14346  (*MMOI)->getBaseAlignment(),
14347  (*MMOI)->getTBAAInfo(),
14348  (*MMOI)->getRanges());
14349  MIB.addMemOperand(MMO);
14350  }
14351 
14352  thisMBB->addSuccessor(mainMBB);
14353 
14354  // mainMBB:
14355  MachineBasicBlock *origMainMBB = mainMBB;
14356 
14357  // Add a PHI.
14358  MachineInstr *Phi = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4)
14359  .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
14360 
14361  unsigned Opc = MI->getOpcode();
14362  switch (Opc) {
14363  default:
14364  llvm_unreachable("Unhandled atomic-load-op opcode!");
14365  case X86::ATOMAND8:
14366  case X86::ATOMAND16:
14367  case X86::ATOMAND32:
14368  case X86::ATOMAND64:
14369  case X86::ATOMOR8:
14370  case X86::ATOMOR16:
14371  case X86::ATOMOR32:
14372  case X86::ATOMOR64:
14373  case X86::ATOMXOR8:
14374  case X86::ATOMXOR16:
14375  case X86::ATOMXOR32:
14376  case X86::ATOMXOR64: {
14377  unsigned ARITHOpc = getNonAtomicOpcode(Opc);
14378  BuildMI(mainMBB, DL, TII->get(ARITHOpc), t2).addReg(SrcReg)
14379  .addReg(t4);
14380  break;
14381  }
14382  case X86::ATOMNAND8:
14383  case X86::ATOMNAND16:
14384  case X86::ATOMNAND32:
14385  case X86::ATOMNAND64: {
14386  unsigned Tmp = MRI.createVirtualRegister(RC);
14387  unsigned NOTOpc;
14388  unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc);
14389  BuildMI(mainMBB, DL, TII->get(ANDOpc), Tmp).addReg(SrcReg)
14390  .addReg(t4);
14391  BuildMI(mainMBB, DL, TII->get(NOTOpc), t2).addReg(Tmp);
14392  break;
14393  }
14394  case X86::ATOMMAX8:
14395  case X86::ATOMMAX16:
14396  case X86::ATOMMAX32:
14397  case X86::ATOMMAX64:
14398  case X86::ATOMMIN8:
14399  case X86::ATOMMIN16:
14400  case X86::ATOMMIN32:
14401  case X86::ATOMMIN64:
14402  case X86::ATOMUMAX8:
14403  case X86::ATOMUMAX16:
14404  case X86::ATOMUMAX32:
14405  case X86::ATOMUMAX64:
14406  case X86::ATOMUMIN8:
14407  case X86::ATOMUMIN16:
14408  case X86::ATOMUMIN32:
14409  case X86::ATOMUMIN64: {
14410  unsigned CMPOpc;
14411  unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc);
14412 
14413  BuildMI(mainMBB, DL, TII->get(CMPOpc))
14414  .addReg(SrcReg)
14415  .addReg(t4);
14416 
14417  if (Subtarget->hasCMov()) {
14418  if (VT != MVT::i8) {
14419  // Native support
14420  BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2)
14421  .addReg(SrcReg)
14422  .addReg(t4);
14423  } else {
14424  // Promote i8 to i32 to use CMOV32
14425  const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
14426  const TargetRegisterClass *RC32 =
14427  TRI->getSubClassWithSubReg(getRegClassFor(MVT::i32), X86::sub_8bit);
14428  unsigned SrcReg32 = MRI.createVirtualRegister(RC32);
14429  unsigned AccReg32 = MRI.createVirtualRegister(RC32);
14430  unsigned Tmp = MRI.createVirtualRegister(RC32);
14431 
14432  unsigned Undef = MRI.createVirtualRegister(RC32);
14433  BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef);
14434 
14435  BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32)
14436  .addReg(Undef)
14437  .addReg(SrcReg)
14438  .addImm(X86::sub_8bit);
14439  BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32)
14440  .addReg(Undef)
14441  .addReg(t4)
14442  .addImm(X86::sub_8bit);
14443 
14444  BuildMI(mainMBB, DL, TII->get(CMOVOpc), Tmp)
14445  .addReg(SrcReg32)
14446  .addReg(AccReg32);
14447 
14448  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t2)
14449  .addReg(Tmp, 0, X86::sub_8bit);
14450  }
14451  } else {
14452  // Use pseudo select and lower them.
14453  assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) &&
14454  "Invalid atomic-load-op transformation!");
14455  unsigned SelOpc = getPseudoCMOVOpc(VT);
14456  X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc);
14457  assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!");
14458  MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t2)
14459  .addReg(SrcReg).addReg(t4)
14460  .addImm(CC);
14461  mainMBB = EmitLoweredSelect(MIB, mainMBB);
14462  // Replace the original PHI node as mainMBB is changed after CMOV
14463  // lowering.
14464  BuildMI(*origMainMBB, Phi, DL, TII->get(X86::PHI), t4)
14465  .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
14466  Phi->eraseFromParent();
14467  }
14468  break;
14469  }
14470  }
14471 
14472  // Copy PhyReg back from virtual register.
14473  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), PhyReg)
14474  .addReg(t4);
14475 
14476  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
14477  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
14478  MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
14479  if (NewMO.isReg())
14480  NewMO.setIsKill(false);
14481  MIB.addOperand(NewMO);
14482  }
14483  MIB.addReg(t2);
14484  MIB.setMemRefs(MMOBegin, MMOEnd);
14485 
14486  // Copy PhyReg back to virtual register.
14487  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3)
14488  .addReg(PhyReg);
14489 
14490  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
14491 
14492  mainMBB->addSuccessor(origMainMBB);
14493  mainMBB->addSuccessor(sinkMBB);
14494 
14495  // sinkMBB:
14496  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
14497  TII->get(TargetOpcode::COPY), DstReg)
14498  .addReg(t3);
14499 
14500  MI->eraseFromParent();
14501  return sinkMBB;
14502 }
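The machine-level loop emitted above is the classic compare-exchange retry loop. A plain C++ sketch of the same structure for a fetch-and-AND (illustrative only; 'Old' plays the role of t4/t3 and is refreshed with the observed value whenever the LCMPXCHG fails):

#include <atomic>
#include <cstdint>

uint32_t atomicFetchAnd(std::atomic<uint32_t> &A, uint32_t Val) {
  uint32_t Old = A.load();                            // t1 = LOAD [addr]
  while (!A.compare_exchange_weak(Old, Old & Val)) {  // LCMPXCHG + JNE loop
    // on failure, 'Old' now holds the current memory value (t3 = EAX)
  }
  return Old;                                         // dst = t3
}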
14503 
14504 // EmitAtomicLoadArith6432 - emit the code sequence for pseudo atomic
14505 // instructions. They will be translated into a spin-loop or compare-exchange
14506 // loop from
14507 //
14508 // ...
14509 // dst = atomic-fetch-op MI.addr, MI.val
14510 // ...
14511 //
14512 // to
14513 //
14514 // ...
14515 // t1L = LOAD [MI.addr + 0]
14516 // t1H = LOAD [MI.addr + 4]
14517 // loop:
14518 // t4L = phi(t1L, t3L / loop)
14519 // t4H = phi(t1H, t3H / loop)
14520 // t2L = OP MI.val.lo, t4L
14521 // t2H = OP MI.val.hi, t4H
14522 // EAX = t4L
14523 // EDX = t4H
14524 // EBX = t2L
14525 // ECX = t2H
14526 // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
14527 // t3L = EAX
14528 // t3H = EDX
14529 // JNE loop
14530 // sink:
14531 // dstL = t3L
14532 // dstH = t3H
14533 // ...
14534 MachineBasicBlock *
14535 X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
14536  MachineBasicBlock *MBB) const {
14537  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
14538  DebugLoc DL = MI->getDebugLoc();
14539 
14540  MachineFunction *MF = MBB->getParent();
14541  MachineRegisterInfo &MRI = MF->getRegInfo();
14542 
14543  const BasicBlock *BB = MBB->getBasicBlock();
14544  MachineFunction::iterator I = MBB;
14545  ++I;
14546 
14547  assert(MI->getNumOperands() <= X86::AddrNumOperands + 7 &&
14548  "Unexpected number of operands");
14549 
14550  assert(MI->hasOneMemOperand() &&
14551  "Expected atomic-load-op32 to have one memoperand");
14552 
14553  // Memory Reference
14554  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
14555  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
14556 
14557  unsigned DstLoReg, DstHiReg;
14558  unsigned SrcLoReg, SrcHiReg;
14559  unsigned MemOpndSlot;
14560 
14561  unsigned CurOp = 0;
14562 
14563  DstLoReg = MI->getOperand(CurOp++).getReg();
14564  DstHiReg = MI->getOperand(CurOp++).getReg();
14565  MemOpndSlot = CurOp;
14566  CurOp += X86::AddrNumOperands;
14567  SrcLoReg = MI->getOperand(CurOp++).getReg();
14568  SrcHiReg = MI->getOperand(CurOp++).getReg();
14569 
14570  const TargetRegisterClass *RC = &X86::GR32RegClass;
14571  const TargetRegisterClass *RC8 = &X86::GR8RegClass;
14572 
14573  unsigned t1L = MRI.createVirtualRegister(RC);
14574  unsigned t1H = MRI.createVirtualRegister(RC);
14575  unsigned t2L = MRI.createVirtualRegister(RC);
14576  unsigned t2H = MRI.createVirtualRegister(RC);
14577  unsigned t3L = MRI.createVirtualRegister(RC);
14578  unsigned t3H = MRI.createVirtualRegister(RC);
14579  unsigned t4L = MRI.createVirtualRegister(RC);
14580  unsigned t4H = MRI.createVirtualRegister(RC);
14581 
14582  unsigned LCMPXCHGOpc = X86::LCMPXCHG8B;
14583  unsigned LOADOpc = X86::MOV32rm;
14584 
14585  // For the atomic load-arith operator, we generate
14586  //
14587  // thisMBB:
14588  // t1L = LOAD [MI.addr + 0]
14589  // t1H = LOAD [MI.addr + 4]
14590  // mainMBB:
14591  // t4L = phi(t1L / thisMBB, t3L / mainMBB)
14592  // t4H = phi(t1H / thisMBB, t3H / mainMBB)
14593  // t2L = OP MI.val.lo, t4L
14594  // t2H = OP MI.val.hi, t4H
14595  // EBX = t2L
14596  // ECX = t2H
14597  // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
14598  // t3L = EAX
14599  // t3H = EDX
14600  // JNE loop
14601  // sinkMBB:
14602  // dstL = t3L
14603  // dstH = t3H
14604 
14605  MachineBasicBlock *thisMBB = MBB;
14606  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
14607  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
14608  MF->insert(I, mainMBB);
14609  MF->insert(I, sinkMBB);
14610 
14611  MachineInstrBuilder MIB;
14612 
14613  // Transfer the remainder of BB and its successor edges to sinkMBB.
14614  sinkMBB->splice(sinkMBB->begin(), MBB,
14615  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
14616  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
14617 
14618  // thisMBB:
14619  // Lo
14620  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1L);
14621  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
14622  MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
14623  if (NewMO.isReg())
14624  NewMO.setIsKill(false);
14625  MIB.addOperand(NewMO);
14626  }
14627  for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
14628  unsigned flags = (*MMOI)->getFlags();
14629  flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
14630  MachineMemOperand *MMO =
14631  MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
14632  (*MMOI)->getSize(),
14633  (*MMOI)->getBaseAlignment(),
14634  (*MMOI)->getTBAAInfo(),
14635  (*MMOI)->getRanges());
14636  MIB.addMemOperand(MMO);
14637  };
14638  MachineInstr *LowMI = MIB;
14639 
14640  // Hi
14641  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1H);
14642  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
14643  if (i == X86::AddrDisp) {
14644  MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32)
14645  } else {
14646  MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
14647  if (NewMO.isReg())
14648  NewMO.setIsKill(false);
14649  MIB.addOperand(NewMO);
14650  }
14651  }
14652  MIB.setMemRefs(LowMI->memoperands_begin(), LowMI->memoperands_end());
14653 
14654  thisMBB->addSuccessor(mainMBB);
14655 
14656  // mainMBB:
14657  MachineBasicBlock *origMainMBB = mainMBB;
14658 
14659  // Add PHIs.
14660  MachineInstr *PhiL = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4L)
14661  .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
14662  MachineInstr *PhiH = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4H)
14663  .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
14664 
14665  unsigned Opc = MI->getOpcode();
14666  switch (Opc) {
14667  default:
14668  llvm_unreachable("Unhandled atomic-load-op6432 opcode!");
14669  case X86::ATOMAND6432:
14670  case X86::ATOMOR6432:
14671  case X86::ATOMXOR6432:
14672  case X86::ATOMADD6432:
14673  case X86::ATOMSUB6432: {
14674  unsigned HiOpc;
14675  unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
14676  BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(t4L)
14677  .addReg(SrcLoReg);
14678  BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(t4H)
14679  .addReg(SrcHiReg);
14680  break;
14681  }
14682  case X86::ATOMNAND6432: {
14683  unsigned HiOpc, NOTOpc;
14684  unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc);
14685  unsigned TmpL = MRI.createVirtualRegister(RC);
14686  unsigned TmpH = MRI.createVirtualRegister(RC);
14687  BuildMI(mainMBB, DL, TII->get(LoOpc), TmpL).addReg(SrcLoReg)
14688  .addReg(t4L);
14689  BuildMI(mainMBB, DL, TII->get(HiOpc), TmpH).addReg(SrcHiReg)
14690  .addReg(t4H);
14691  BuildMI(mainMBB, DL, TII->get(NOTOpc), t2L).addReg(TmpL);
14692  BuildMI(mainMBB, DL, TII->get(NOTOpc), t2H).addReg(TmpH);
14693  break;
14694  }
14695  case X86::ATOMMAX6432:
14696  case X86::ATOMMIN6432:
14697  case X86::ATOMUMAX6432:
14698  case X86::ATOMUMIN6432: {
14699  unsigned HiOpc;
14700  unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
14701  unsigned cL = MRI.createVirtualRegister(RC8);
14702  unsigned cH = MRI.createVirtualRegister(RC8);
14703  unsigned cL32 = MRI.createVirtualRegister(RC);
14704  unsigned cH32 = MRI.createVirtualRegister(RC);
14705  unsigned cc = MRI.createVirtualRegister(RC);
14706  // cl := cmp src_lo, lo
14707  BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
14708  .addReg(SrcLoReg).addReg(t4L);
14709  BuildMI(mainMBB, DL, TII->get(LoOpc), cL);
14710  BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL);
14711  // ch := cmp src_hi, hi
14712  BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
14713  .addReg(SrcHiReg).addReg(t4H);
14714  BuildMI(mainMBB, DL, TII->get(HiOpc), cH);
14715  BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH);
14716  // cc := if (src_hi == hi) ? cl : ch;
14717  if (Subtarget->hasCMov()) {
14718  BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc)
14719  .addReg(cH32).addReg(cL32);
14720  } else {
14721  MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc)
14722  .addReg(cH32).addReg(cL32)
14723  .addImm(X86::COND_E);
14724  mainMBB = EmitLoweredSelect(MIB, mainMBB);
14725  }
14726  BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc);
14727  if (Subtarget->hasCMov()) {
14728  BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2L)
14729  .addReg(SrcLoReg).addReg(t4L);
14730  BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2H)
14731  .addReg(SrcHiReg).addReg(t4H);
14732  } else {
14733  MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2L)
14734  .addReg(SrcLoReg).addReg(t4L)
14735  .addImm(X86::COND_NE);
14736  mainMBB = EmitLoweredSelect(MIB, mainMBB);
14737  // As the lowered CMOV won't clobber EFLAGS, we could reuse it for the
14738  // 2nd CMOV lowering.
14739  mainMBB->addLiveIn(X86::EFLAGS);
14740  MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2H)
14741  .addReg(SrcHiReg).addReg(t4H)
14742  .addImm(X86::COND_NE);
14743  mainMBB = EmitLoweredSelect(MIB, mainMBB);
14744  // Replace the original PHI node as mainMBB is changed after CMOV
14745  // lowering.
14746  BuildMI(*origMainMBB, PhiL, DL, TII->get(X86::PHI), t4L)
14747  .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
14748  BuildMI(*origMainMBB, PhiH, DL, TII->get(X86::PHI), t4H)
14749  .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
14750  PhiL->eraseFromParent();
14751  PhiH->eraseFromParent();
14752  }
14753  break;
14754  }
14755  case X86::ATOMSWAP6432: {
14756  unsigned HiOpc;
14757  unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
14758  BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg);
14759  BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg);
14760  break;
14761  }
14762  }
14763 
14764  // Load the current value (t4H:t4L) into EDX:EAX for CMPXCHG8B
14765  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(t4L);
14766  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(t4H);
14767  // Copy ECX:EBX from t2H:t2L (the desired new value)
14768  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t2L);
14769  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t2H);
14770 
14771  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
14772  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
14773  MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
14774  if (NewMO.isReg())
14775  NewMO.setIsKill(false);
14776  MIB.addOperand(NewMO);
14777  }
14778  MIB.setMemRefs(MMOBegin, MMOEnd);
14779 
14780  // Copy EDX:EAX back to t3H:t3L
14781  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3L).addReg(X86::EAX);
14782  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3H).addReg(X86::EDX);
14783 
14784  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
14785 
14786  mainMBB->addSuccessor(origMainMBB);
14787  mainMBB->addSuccessor(sinkMBB);
14788 
14789  // sinkMBB:
14790  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
14791  TII->get(TargetOpcode::COPY), DstLoReg)
14792  .addReg(t3L);
14793  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
14794  TII->get(TargetOpcode::COPY), DstHiReg)
14795  .addReg(t3H);
14796 
14797  MI->eraseFromParent();
14798  return sinkMBB;
14799 }
14800 
14801 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
14802 // or XMM0_V32I8 in AVX all of this code can be replaced with that
14803 // in the .td file.
14804 static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
14805  const TargetInstrInfo *TII) {
14806  unsigned Opc;
14807  switch (MI->getOpcode()) {
14808  default: llvm_unreachable("illegal opcode!");
14809  case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
14810  case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
14811  case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
14812  case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
14813  case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
14814  case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
14815  case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
14816  case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
14817  }
14818 
14819  DebugLoc dl = MI->getDebugLoc();
14820  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
14821 
14822  unsigned NumArgs = MI->getNumOperands();
14823  for (unsigned i = 1; i < NumArgs; ++i) {
14824  MachineOperand &Op = MI->getOperand(i);
14825  if (!(Op.isReg() && Op.isImplicit()))
14826  MIB.addOperand(Op);
14827  }
14828  if (MI->hasOneMemOperand())
14829  MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
14830 
14831  BuildMI(*BB, MI, dl,
14832  TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
14833  .addReg(X86::XMM0);
14834 
14835  MI->eraseFromParent();
14836  return BB;
14837 }
14838 
14839 // FIXME: Custom handling because TableGen doesn't support multiple implicit
14840 // defs in an instruction pattern
14841 static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
14842  const TargetInstrInfo *TII) {
14843  unsigned Opc;
14844  switch (MI->getOpcode()) {
14845  default: llvm_unreachable("illegal opcode!");
14846  case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
14847  case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
14848  case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
14849  case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
14850  case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
14851  case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
14852  case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
14853  case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
14854  }
14855 
14856  DebugLoc dl = MI->getDebugLoc();
14857  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
14858 
14859  unsigned NumArgs = MI->getNumOperands(); // remove the results
14860  for (unsigned i = 1; i < NumArgs; ++i) {
14861  MachineOperand &Op = MI->getOperand(i);
14862  if (!(Op.isReg() && Op.isImplicit()))
14863  MIB.addOperand(Op);
14864  }
14865  if (MI->hasOneMemOperand())
14866  MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
14867 
14868  BuildMI(*BB, MI, dl,
14869  TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
14870  .addReg(X86::ECX);
14871 
14872  MI->eraseFromParent();
14873  return BB;
14874 }
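At the intrinsics level the PCMPISTRI expansion above corresponds to code like the following; the immediate mode bits here are an arbitrary choice for illustration. The index result is defined to come back in ECX, which is why the COPY from X86::ECX is emitted:

#include <nmmintrin.h>

// Returns the index in 'Haystack' of the first byte that appears in 'Needles'.
int firstMatch(__m128i Needles, __m128i Haystack) {
  return _mm_cmpistri(Needles, Haystack,
                      _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
}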
14875 
14876 static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
14877  const TargetInstrInfo *TII,
14878  const X86Subtarget* Subtarget) {
14879  DebugLoc dl = MI->getDebugLoc();
14880 
14881  // Address into RAX/EAX, other two args into ECX, EDX.
14882  unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
14883  unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
14884  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
14885  for (int i = 0; i < X86::AddrNumOperands; ++i)
14886  MIB.addOperand(MI->getOperand(i));
14887 
14888  unsigned ValOps = X86::AddrNumOperands;
14889  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
14890  .addReg(MI->getOperand(ValOps).getReg());
14891  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
14892  .addReg(MI->getOperand(ValOps+1).getReg());
14893 
14894  // The instruction doesn't actually take any operands though.
14895  BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
14896 
14897  MI->eraseFromParent(); // The pseudo is gone now.
14898  return BB;
14899 }
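The register assignment set up above (address in RAX/EAX, other arguments in ECX and EDX) is the calling convention of the SSE3 monitor intrinsic; a minimal usage sketch:

#include <pmmintrin.h>

void armMonitor(const void *Addr) {
  _mm_monitor(Addr, /*extensions=*/0, /*hints=*/0);
}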
14900 
14901 MachineBasicBlock *
14902 X86TargetLowering::EmitVAARG64WithCustomInserter(
14903  MachineInstr *MI,
14904  MachineBasicBlock *MBB) const {
14905  // Emit va_arg instruction on X86-64.
14906 
14907  // Operands to this pseudo-instruction:
14908  // 0 ) Output : destination address (reg)
14909  // 1-5) Input : va_list address (addr, i64mem)
14910  // 6 ) ArgSize : Size (in bytes) of vararg type
14911  // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
14912  // 8 ) Align : Alignment of type
14913  // 9 ) EFLAGS (implicit-def)
14914 
14915  assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
14916  assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
14917 
14918  unsigned DestReg = MI->getOperand(0).getReg();
14919  MachineOperand &Base = MI->getOperand(1);
14920  MachineOperand &Scale = MI->getOperand(2);
14921  MachineOperand &Index = MI->getOperand(3);
14922  MachineOperand &Disp = MI->getOperand(4);
14923  MachineOperand &Segment = MI->getOperand(5);
14924  unsigned ArgSize = MI->getOperand(6).getImm();
14925  unsigned ArgMode = MI->getOperand(7).getImm();
14926  unsigned Align = MI->getOperand(8).getImm();
14927 
14928  // Memory Reference
14929  assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
14930  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
14931  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
14932 
14933  // Machine Information
14934  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
14935  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
14936  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
14937  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
14938  DebugLoc DL = MI->getDebugLoc();
14939 
14940  // struct va_list {
14941  // i32 gp_offset
14942  // i32 fp_offset
14943  // i64 overflow_area (address)
14944  // i64 reg_save_area (address)
14945  // }
14946  // sizeof(va_list) = 24
14947  // alignment(va_list) = 8
14948 
14949  unsigned TotalNumIntRegs = 6;
14950  unsigned TotalNumXMMRegs = 8;
14951  bool UseGPOffset = (ArgMode == 1);
14952  bool UseFPOffset = (ArgMode == 2);
14953  unsigned MaxOffset = TotalNumIntRegs * 8 +
14954  (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
14955 
14956  /* Align ArgSize to a multiple of 8 */
14957  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
14958  bool NeedsAlign = (Align > 8);
14959 
14960  MachineBasicBlock *thisMBB = MBB;
14961  MachineBasicBlock *overflowMBB;
14962  MachineBasicBlock *offsetMBB;
14963  MachineBasicBlock *endMBB;
14964 
14965  unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
14966  unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
14967  unsigned OffsetReg = 0;
14968 
14969  if (!UseGPOffset && !UseFPOffset) {
14970  // If we only pull from the overflow region, we don't create a branch.
14971  // We don't need to alter control flow.
14972  OffsetDestReg = 0; // unused
14973  OverflowDestReg = DestReg;
14974 
14975  offsetMBB = NULL;
14976  overflowMBB = thisMBB;
14977  endMBB = thisMBB;
14978  } else {
14979  // First emit code to check if gp_offset (or fp_offset) is below the bound.
14980  // If so, pull the argument from reg_save_area. (branch to offsetMBB)
14981  // If not, pull from overflow_area. (branch to overflowMBB)
14982  //
14983  // thisMBB
14984  // | .
14985  // | .
14986  // offsetMBB overflowMBB
14987  // | .
14988  // | .
14989  // endMBB
14990 
14991  // Registers for the PHI in endMBB
14992  OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
14993  OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
14994 
14995  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
14996  MachineFunction *MF = MBB->getParent();
14997  overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
14998  offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
14999  endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
15000 
15001  MachineFunction::iterator MBBIter = MBB;
15002  ++MBBIter;
15003 
15004  // Insert the new basic blocks
15005  MF->insert(MBBIter, offsetMBB);
15006  MF->insert(MBBIter, overflowMBB);
15007  MF->insert(MBBIter, endMBB);
15008 
15009  // Transfer the remainder of MBB and its successor edges to endMBB.
15010  endMBB->splice(endMBB->begin(), thisMBB,
15011  llvm::next(MachineBasicBlock::iterator(MI)),
15012  thisMBB->end());
15013  endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
15014 
15015  // Make offsetMBB and overflowMBB successors of thisMBB
15016  thisMBB->addSuccessor(offsetMBB);
15017  thisMBB->addSuccessor(overflowMBB);
15018 
15019  // endMBB is a successor of both offsetMBB and overflowMBB
15020  offsetMBB->addSuccessor(endMBB);
15021  overflowMBB->addSuccessor(endMBB);
15022 
15023  // Load the offset value into a register
15024  OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
15025  BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
15026  .addOperand(Base)
15027  .addOperand(Scale)
15028  .addOperand(Index)
15029  .addDisp(Disp, UseFPOffset ? 4 : 0)
15030  .addOperand(Segment)
15031  .setMemRefs(MMOBegin, MMOEnd);
15032 
15033  // Check if there is enough room left to pull this argument.
15034  BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
15035  .addReg(OffsetReg)
15036  .addImm(MaxOffset + 8 - ArgSizeA8);
15037 
15038  // Branch to "overflowMBB" if offset >= max
15039  // Fall through to "offsetMBB" otherwise
15040  BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
15041  .addMBB(overflowMBB);
15042  }
15043 
15044  // In offsetMBB, emit code to use the reg_save_area.
15045  if (offsetMBB) {
15046  assert(OffsetReg != 0);
15047 
15048  // Read the reg_save_area address.
15049  unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
15050  BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
15051  .addOperand(Base)
15052  .addOperand(Scale)
15053  .addOperand(Index)
15054  .addDisp(Disp, 16)
15055  .addOperand(Segment)
15056  .setMemRefs(MMOBegin, MMOEnd);
15057 
15058  // Zero-extend the offset
15059  unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
15060  BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
15061  .addImm(0)
15062  .addReg(OffsetReg)
15063  .addImm(X86::sub_32bit);
15064 
15065  // Add the offset to the reg_save_area to get the final address.
15066  BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
15067  .addReg(OffsetReg64)
15068  .addReg(RegSaveReg);
15069 
15070  // Compute the offset for the next argument
15071  unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
15072  BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
15073  .addReg(OffsetReg)
15074  .addImm(UseFPOffset ? 16 : 8);
15075 
15076  // Store it back into the va_list.
15077  BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
15078  .addOperand(Base)
15079  .addOperand(Scale)
15080  .addOperand(Index)
15081  .addDisp(Disp, UseFPOffset ? 4 : 0)
15082  .addOperand(Segment)
15083  .addReg(NextOffsetReg)
15084  .setMemRefs(MMOBegin, MMOEnd);
15085 
15086  // Jump to endMBB
15087  BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
15088  .addMBB(endMBB);
15089  }
15090 
15091  //
15092  // Emit code to use overflow area
15093  //
15094 
15095  // Load the overflow_area address into a register.
15096  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
15097  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
15098  .addOperand(Base)
15099  .addOperand(Scale)
15100  .addOperand(Index)
15101  .addDisp(Disp, 8)
15102  .addOperand(Segment)
15103  .setMemRefs(MMOBegin, MMOEnd);
15104 
15105  // If we need to align it, do so. Otherwise, just copy the address
15106  // to OverflowDestReg.
15107  if (NeedsAlign) {
15108  // Align the overflow address
15109  assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
15110  unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
15111 
15112  // aligned_addr = (addr + (align-1)) & ~(align-1)
15113  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
15114  .addReg(OverflowAddrReg)
15115  .addImm(Align-1);
15116 
15117  BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
15118  .addReg(TmpReg)
15119  .addImm(~(uint64_t)(Align-1));
15120  } else {
15121  BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
15122  .addReg(OverflowAddrReg);
15123  }
15124 
15125  // Compute the next overflow address after this argument.
15126  // (the overflow address should be kept 8-byte aligned)
15127  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
15128  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
15129  .addReg(OverflowDestReg)
15130  .addImm(ArgSizeA8);
15131 
15132  // Store the new overflow address.
15133  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
15134  .addOperand(Base)
15135  .addOperand(Scale)
15136  .addOperand(Index)
15137  .addDisp(Disp, 8)
15138  .addOperand(Segment)
15139  .addReg(NextAddrReg)
15140  .setMemRefs(MMOBegin, MMOEnd);
15141 
15142  // If we branched, emit the PHI to the front of endMBB.
15143  if (offsetMBB) {
15144  BuildMI(*endMBB, endMBB->begin(), DL,
15145  TII->get(X86::PHI), DestReg)
15146  .addReg(OffsetDestReg).addMBB(offsetMBB)
15147  .addReg(OverflowDestReg).addMBB(overflowMBB);
15148  }
15149 
15150  // Erase the pseudo instruction
15151  MI->eraseFromParent();
15152 
15153  return endMBB;
15154 }
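Editorial aside (not part of the original source): a minimal C-level sketch of the va_list layout and branch structure the emitter above produces. The field offsets match the Disp values used in the BuildMI calls (0/4, 8, 16); the 48-byte GPR bound is the SysV AMD64 value and, like the names VaListX8664 and vaArgGPRModel, is an assumption of this sketch, since the MaxOffset computation lies outside this excerpt.

#include <cstddef>

struct VaListX8664 {            // SysV x86-64 register-save va_list
  unsigned gp_offset;           // +0:  next GPR slot within reg_save_area
  unsigned fp_offset;           // +4:  next XMM slot within reg_save_area
  void *overflow_arg_area;      // +8:  stack area for spilled arguments
  void *reg_save_area;          // +16: 6*8 bytes of GPRs, then the XMM saves
};

static void *vaArgGPRModel(VaListX8664 *ap) {
  if (ap->gp_offset + 8 <= 48) {                            // offsetMBB path
    void *addr = (char *)ap->reg_save_area + ap->gp_offset;
    ap->gp_offset += 8;                                     // next-offset store
    return addr;
  }
  void *addr = ap->overflow_arg_area;                       // overflowMBB path
  ap->overflow_arg_area = (char *)addr + 8;                 // keep 8-byte aligned
  return addr;
}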
15155 
15156 MachineBasicBlock *
15157 X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
15158  MachineInstr *MI,
15159  MachineBasicBlock *MBB) const {
15160  // Emit code to save XMM registers to the stack. The ABI says that the
15161  // number of registers to save is given in %al, so it's theoretically
15162  // possible to do an indirect jump trick to avoid saving all of them,
15163  // however this code takes a simpler approach and just executes all
15164  // of the stores if %al is non-zero. It's less code, and it's probably
15165  // easier on the hardware branch predictor, and stores aren't all that
15166  // expensive anyway.
15167 
15168  // Create the new basic blocks. One block contains all the XMM stores,
15169  // and one block is the final destination regardless of whether any
15170  // stores were performed.
15171  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
15172  MachineFunction *F = MBB->getParent();
15173  MachineFunction::iterator MBBIter = MBB;
15174  ++MBBIter;
15175  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
15176  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
15177  F->insert(MBBIter, XMMSaveMBB);
15178  F->insert(MBBIter, EndMBB);
15179 
15180  // Transfer the remainder of MBB and its successor edges to EndMBB.
15181  EndMBB->splice(EndMBB->begin(), MBB,
15182  llvm::next(MachineBasicBlock::iterator(MI)),
15183  MBB->end());
15184  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
15185 
15186  // The original block will now fall through to the XMM save block.
15187  MBB->addSuccessor(XMMSaveMBB);
15188  // The XMMSaveMBB will fall through to the end block.
15189  XMMSaveMBB->addSuccessor(EndMBB);
15190 
15191  // Now add the instructions.
15192  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
15193  DebugLoc DL = MI->getDebugLoc();
15194 
15195  unsigned CountReg = MI->getOperand(0).getReg();
15196  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
15197  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
15198 
15199  if (!Subtarget->isTargetWin64()) {
15200  // If %al is 0, branch around the XMM save block.
15201  BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
15202  BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
15203  MBB->addSuccessor(EndMBB);
15204  }
15205 
15206  unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
15207  // In the XMM save block, save all the XMM argument registers.
15208  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
15209  int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
15210  MachineMemOperand *MMO =
15212  MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
15214  /*Size=*/16, /*Align=*/16);
15215  BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
15216  .addFrameIndex(RegSaveFrameIndex)
15217  .addImm(/*Scale=*/1)
15218  .addReg(/*IndexReg=*/0)
15219  .addImm(/*Disp=*/Offset)
15220  .addReg(/*Segment=*/0)
15221  .addReg(MI->getOperand(i).getReg())
15222  .addMemOperand(MMO);
15223  }
15224 
15225  MI->eraseFromParent(); // The pseudo instruction is gone now.
15226 
15227  return EndMBB;
15228 }
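Editorial aside: a hedged C-level model of the control flow above. Per the SysV varargs convention %al carries the number of vector registers the caller used; the lowering only tests it for zero and then performs every store. Names here are illustrative, not from the source.

#include <cstring>
#include <xmmintrin.h>

static void saveXMMArgsModel(unsigned char al, char *regSaveArea,
                             long fpOffset, const __m128 xmmArgs[],
                             int numXMMArgs) {
  if (al == 0)                       // TEST8rr + JE_4: branch around the saves
    return;
  for (int i = 0; i < numXMMArgs; ++i)
    std::memcpy(regSaveArea + fpOffset + i * 16, &xmmArgs[i], 16);  // MOVAPSmr
}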
15229 
15230 // The EFLAGS operand of SelectItr might be missing a kill marker
15231 // because there were multiple uses of EFLAGS, and ISel didn't know
15232 // which to mark. Figure out whether SelectItr should have had a
15233 // kill marker, and set it if it should. Returns the correct kill
15234 // marker value.
15235 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
15236  MachineBasicBlock* BB,
15237  const TargetRegisterInfo* TRI) {
15238  // Scan forward through BB for a use/def of EFLAGS.
15239  MachineBasicBlock::iterator miI(llvm::next(SelectItr));
15240  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
15241  const MachineInstr& mi = *miI;
15242  if (mi.readsRegister(X86::EFLAGS))
15243  return false;
15244  if (mi.definesRegister(X86::EFLAGS))
15245  break; // Should have kill-flag - update below.
15246  }
15247 
15248  // If we hit the end of the block, check whether EFLAGS is live into a
15249  // successor.
15250  if (miI == BB->end()) {
15251  for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
15252  sEnd = BB->succ_end();
15253  sItr != sEnd; ++sItr) {
15254  MachineBasicBlock* succ = *sItr;
15255  if (succ->isLiveIn(X86::EFLAGS))
15256  return false;
15257  }
15258  }
15259 
15260  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
15261  // out. SelectMI should have a kill flag on EFLAGS.
15262  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
15263  return true;
15264 }
15265 
15266 MachineBasicBlock *
15267 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
15268  MachineBasicBlock *BB) const {
15269  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
15270  DebugLoc DL = MI->getDebugLoc();
15271 
15272  // To "insert" a SELECT_CC instruction, we actually have to insert the
15273  // diamond control-flow pattern. The incoming instruction knows the
15274  // destination vreg to set, the condition code register to branch on, the
15275  // true/false values to select between, and a branch opcode to use.
15276  const BasicBlock *LLVM_BB = BB->getBasicBlock();
15277  MachineFunction::iterator It = BB;
15278  ++It;
15279 
15280  // thisMBB:
15281  // ...
15282  // TrueVal = ...
15283  // cmpTY ccX, r1, r2
15284  // bCC copy1MBB
15285  // fallthrough --> copy0MBB
15286  MachineBasicBlock *thisMBB = BB;
15287  MachineFunction *F = BB->getParent();
15288  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
15289  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
15290  F->insert(It, copy0MBB);
15291  F->insert(It, sinkMBB);
15292 
15293  // If the EFLAGS register isn't dead in the terminator, then claim that it's
15294  // live into the sink and copy blocks.
15295  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
15296  if (!MI->killsRegister(X86::EFLAGS) &&
15297  !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
15298  copy0MBB->addLiveIn(X86::EFLAGS);
15299  sinkMBB->addLiveIn(X86::EFLAGS);
15300  }
15301 
15302  // Transfer the remainder of BB and its successor edges to sinkMBB.
15303  sinkMBB->splice(sinkMBB->begin(), BB,
15304  llvm::next(MachineBasicBlock::iterator(MI)),
15305  BB->end());
15306  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
15307 
15308  // Add the true and fallthrough blocks as its successors.
15309  BB->addSuccessor(copy0MBB);
15310  BB->addSuccessor(sinkMBB);
15311 
15312  // Create the conditional branch instruction.
15313  unsigned Opc =
15314  X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
15315  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
15316 
15317  // copy0MBB:
15318  // %FalseValue = ...
15319  // # fallthrough to sinkMBB
15320  copy0MBB->addSuccessor(sinkMBB);
15321 
15322  // sinkMBB:
15323  // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
15324  // ...
15325  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
15326  TII->get(X86::PHI), MI->getOperand(0).getReg())
15327  .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
15328  .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
15329 
15330  MI->eraseFromParent(); // The pseudo instruction is gone now.
15331  return sinkMBB;
15332 }
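Editorial aside: the diamond built above is the machine-level shape of this scalar select; the PHI in sinkMBB merges whichever definition reached it. A sketch only, with illustrative names.

static int loweredSelectModel(bool eflagsCond, int trueVal, int falseVal) {
  int result;
  if (eflagsCond)          // thisMBB: conditional branch on the incoming CC
    result = trueVal;
  else
    result = falseVal;     // copy0MBB: the fall-through arm
  return result;           // sinkMBB: PHI of the two reaching values
}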
15333 
15334 MachineBasicBlock *
15335 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
15336  bool Is64Bit) const {
15337  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
15338  DebugLoc DL = MI->getDebugLoc();
15339  MachineFunction *MF = BB->getParent();
15340  const BasicBlock *LLVM_BB = BB->getBasicBlock();
15341 
15342  assert(getTargetMachine().Options.EnableSegmentedStacks);
15343 
15344  unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
15345  unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
15346 
15347  // BB:
15348  // ... [Till the alloca]
15349  // If stacklet is not large enough, jump to mallocMBB
15350  //
15351  // bumpMBB:
15352  // Allocate by subtracting from RSP
15353  // Jump to continueMBB
15354  //
15355  // mallocMBB:
15356  // Allocate by call to runtime
15357  //
15358  // continueMBB:
15359  // ...
15360  // [rest of original BB]
15361  //
15362 
15363  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
15364  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
15365  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
15366 
15367  MachineRegisterInfo &MRI = MF->getRegInfo();
15368  const TargetRegisterClass *AddrRegClass =
15369  getRegClassFor(Is64Bit ? MVT::i64:MVT::i32);
15370 
15371  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
15372  bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
15373  tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
15374  SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
15375  sizeVReg = MI->getOperand(1).getReg(),
15376  physSPReg = Is64Bit ? X86::RSP : X86::ESP;
15377 
15378  MachineFunction::iterator MBBIter = BB;
15379  ++MBBIter;
15380 
15381  MF->insert(MBBIter, bumpMBB);
15382  MF->insert(MBBIter, mallocMBB);
15383  MF->insert(MBBIter, continueMBB);
15384 
15385  continueMBB->splice(continueMBB->begin(), BB, llvm::next
15386  (MachineBasicBlock::iterator(MI)), BB->end());
15387  continueMBB->transferSuccessorsAndUpdatePHIs(BB);
15388 
15389  // Add code to the main basic block to check if the stack limit has been hit,
15390  // and if so, jump to mallocMBB otherwise to bumpMBB.
15391  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
15392  BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
15393  .addReg(tmpSPVReg).addReg(sizeVReg);
15394  BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
15395  .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
15396  .addReg(SPLimitVReg);
15397  BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
15398 
15399  // bumpMBB simply decreases the stack pointer, since we know the current
15400  // stacklet has enough space.
15401  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
15402  .addReg(SPLimitVReg);
15403  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
15404  .addReg(SPLimitVReg);
15405  BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
15406 
15407  // Calls into a routine in libgcc to allocate more space from the heap.
15408  const uint32_t *RegMask =
15409  getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
15410  if (Is64Bit) {
15411  BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
15412  .addReg(sizeVReg);
15413  BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
15414  .addExternalSymbol("__morestack_allocate_stack_space")
15415  .addRegMask(RegMask)
15416  .addReg(X86::RDI, RegState::Implicit)
15417  .addReg(X86::RAX, RegState::ImplicitDefine);
15418  } else {
15419  BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
15420  .addImm(12);
15421  BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
15422  BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
15423  .addExternalSymbol("__morestack_allocate_stack_space")
15424  .addRegMask(RegMask)
15426  }
15427 
15428  if (!Is64Bit)
15429  BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
15430  .addImm(16);
15431 
15432  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
15433  .addReg(Is64Bit ? X86::RAX : X86::EAX);
15434  BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
15435 
15436  // Set up the CFG correctly.
15437  BB->addSuccessor(bumpMBB);
15438  BB->addSuccessor(mallocMBB);
15439  mallocMBB->addSuccessor(continueMBB);
15440  bumpMBB->addSuccessor(continueMBB);
15441 
15442  // Take care of the PHI nodes.
15443  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
15444  MI->getOperand(0).getReg())
15445  .addReg(mallocPtrVReg).addMBB(mallocMBB)
15446  .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
15447 
15448  // Delete the original pseudo instruction.
15449  MI->eraseFromParent();
15450 
15451  // And we're done.
15452  return continueMBB;
15453 }
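Editorial aside: a C-level sketch of the stacklet check emitted above. The stack limit lives in the thread block (FS:0x70 on 64-bit, GS:0x30 on 32-bit, as in the code); the declared libgcc prototype is an assumption of this sketch, not quoted from this file.

#include <cstddef>
extern "C" void *__morestack_allocate_stack_space(size_t size);  // assumed signature

static void *segAllocaModel(char *sp, size_t size, char *stackLimit) {
  char *newSP = sp - size;
  if (newSP >= stackLimit)                           // bumpMBB: room in the stacklet
    return newSP;
  return __morestack_allocate_stack_space(size);     // mallocMBB: grow via libgcc
}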
15454 
15455 MachineBasicBlock *
15456 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
15457  MachineBasicBlock *BB) const {
15458  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
15459  DebugLoc DL = MI->getDebugLoc();
15460 
15461  assert(!Subtarget->isTargetEnvMacho());
15462 
15463  // The lowering is pretty easy: we're just emitting the call to _alloca. The
15464  // non-trivial part is impdef of ESP.
15465 
15466  if (Subtarget->isTargetWin64()) {
15467  if (Subtarget->isTargetCygMing()) {
15468  // ___chkstk(Mingw64):
15469  // Clobbers R10, R11, RAX and EFLAGS.
15470  // Updates RSP.
15471  BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
15472  .addExternalSymbol("___chkstk")
15473  .addReg(X86::RAX, RegState::Implicit)
15474  .addReg(X86::RSP, RegState::Implicit)
15477  .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
15478  } else {
15479  // __chkstk(MSVCRT): does not update stack pointer.
15480  // Clobbers R10, R11 and EFLAGS.
15481  BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
15482  .addExternalSymbol("__chkstk")
15483  .addReg(X86::RAX, RegState::Implicit)
15484  .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
15485  // RAX has the offset to be subtracted from RSP.
15486  BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
15487  .addReg(X86::RSP)
15488  .addReg(X86::RAX);
15489  }
15490  } else {
15491  const char *StackProbeSymbol =
15492  Subtarget->isTargetWindows() ? "_chkstk" : "_alloca";
15493 
15494  BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
15495  .addExternalSymbol(StackProbeSymbol)
15500  .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
15501  }
15502 
15503  MI->eraseFromParent(); // The pseudo instruction is gone now.
15504  return BB;
15505 }
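Editorial aside on why the probe call exists at all: Windows commits stack guard pages one at a time, so an allocation that may exceed a page has to touch the intervening pages in order. The sketch below illustrates that idea only; the 4096-byte page size and the touch-every-page loop are assumptions, not the probe routine's actual code.

#include <cstddef>

static void touchPagesModel(volatile char *sp, size_t bytes) {
  const size_t pageSize = 4096;                       // assumed page size
  for (size_t off = pageSize; off < bytes; off += pageSize)
    sp[-(std::ptrdiff_t)off] = 0;                     // commit each guard page in turn
  sp[-(std::ptrdiff_t)bytes] = 0;
}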
15506 
15507 MachineBasicBlock *
15508 X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
15509  MachineBasicBlock *BB) const {
15510  // This is pretty easy. We're taking the value that we received from
15511  // our load from the relocation, sticking it in either RDI (x86-64)
15512  // or EAX and doing an indirect call. The return value will then
15513  // be in the normal return register.
15514  const X86InstrInfo *TII
15515  = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
15516  DebugLoc DL = MI->getDebugLoc();
15517  MachineFunction *F = BB->getParent();
15518 
15519  assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
15520  assert(MI->getOperand(3).isGlobal() && "This should be a global");
15521 
15522  // Get a register mask for the lowered call.
15523  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
15524  // proper register mask.
15525  const uint32_t *RegMask =
15526  getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
15527  if (Subtarget->is64Bit()) {
15528  MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
15529  TII->get(X86::MOV64rm), X86::RDI)
15530  .addReg(X86::RIP)
15531  .addImm(0).addReg(0)
15532  .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
15533  MI->getOperand(3).getTargetFlags())
15534  .addReg(0);
15535  MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
15536  addDirectMem(MIB, X86::RDI);
15537  MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
15538  } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
15539  MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
15540  TII->get(X86::MOV32rm), X86::EAX)
15541  .addReg(0)
15542  .addImm(0).addReg(0)
15543  .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
15544  MI->getOperand(3).getTargetFlags())
15545  .addReg(0);
15546  MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
15547  addDirectMem(MIB, X86::EAX);
15549  } else {
15550  MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
15551  TII->get(X86::MOV32rm), X86::EAX)
15552  .addReg(TII->getGlobalBaseReg(F))
15553  .addImm(0).addReg(0)
15554  .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
15555  MI->getOperand(3).getTargetFlags())
15556  .addReg(0);
15557  MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
15558  addDirectMem(MIB, X86::EAX);
15560  }
15561 
15562  MI->eraseFromParent(); // The pseudo instruction is gone now.
15563  return BB;
15564 }
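Editorial aside: the indirect call above goes through Darwin's thread-local descriptor. The struct below mirrors dyld's tlv_descriptor layout and is stated here as an assumption of the sketch, not quoted from this file.

struct TLVDescriptorModel {
  void *(*thunk)(TLVDescriptorModel *self);  // returns the variable's address
  unsigned long key;
  unsigned long offset;
};

static void *tlvAddressModel(TLVDescriptorModel *d) {
  return d->thunk(d);   // load the descriptor, then call through its first slot
}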
15565 
15566 MachineBasicBlock *
15567 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
15568  MachineBasicBlock *MBB) const {
15569  DebugLoc DL = MI->getDebugLoc();
15570  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
15571 
15572  MachineFunction *MF = MBB->getParent();
15573  MachineRegisterInfo &MRI = MF->getRegInfo();
15574 
15575  const BasicBlock *BB = MBB->getBasicBlock();
15576  MachineFunction::iterator I = MBB;
15577  ++I;
15578 
15579  // Memory Reference
15580  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
15581  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
15582 
15583  unsigned DstReg;
15584  unsigned MemOpndSlot = 0;
15585 
15586  unsigned CurOp = 0;
15587 
15588  DstReg = MI->getOperand(CurOp++).getReg();
15589  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
15590  assert(RC->hasType(MVT::i32) && "Invalid destination!");
15591  unsigned mainDstReg = MRI.createVirtualRegister(RC);
15592  unsigned restoreDstReg = MRI.createVirtualRegister(RC);
15593 
15594  MemOpndSlot = CurOp;
15595 
15596  MVT PVT = getPointerTy();
15597  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
15598  "Invalid Pointer Size!");
15599 
15600  // For v = setjmp(buf), we generate
15601  //
15602  // thisMBB:
15603  // buf[LabelOffset] = restoreMBB
15604  // SjLjSetup restoreMBB
15605  //
15606  // mainMBB:
15607  // v_main = 0
15608  //
15609  // sinkMBB:
15610  // v = phi(main, restore)
15611  //
15612  // restoreMBB:
15613  // v_restore = 1
15614 
15615  MachineBasicBlock *thisMBB = MBB;
15616  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
15617  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
15618  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
15619  MF->insert(I, mainMBB);
15620  MF->insert(I, sinkMBB);
15621  MF->push_back(restoreMBB);
15622 
15623  MachineInstrBuilder MIB;
15624 
15625  // Transfer the remainder of BB and its successor edges to sinkMBB.
15626  sinkMBB->splice(sinkMBB->begin(), MBB,
15627  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
15628  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
15629 
15630  // thisMBB:
15631  unsigned PtrStoreOpc = 0;
15632  unsigned LabelReg = 0;
15633  const int64_t LabelOffset = 1 * PVT.getStoreSize();
15634  Reloc::Model RM = getTargetMachine().getRelocationModel();
15635  bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) &&
15636  (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
15637 
15638  // Prepare IP either in reg or imm.
15639  if (!UseImmLabel) {
15640  PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
15641  const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
15642  LabelReg = MRI.createVirtualRegister(PtrRC);
15643  if (Subtarget->is64Bit()) {
15644  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
15645  .addReg(X86::RIP)
15646  .addImm(0)
15647  .addReg(0)
15648  .addMBB(restoreMBB)
15649  .addReg(0);
15650  } else {
15651  const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
15652  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
15653  .addReg(XII->getGlobalBaseReg(MF))
15654  .addImm(0)
15655  .addReg(0)
15656  .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
15657  .addReg(0);
15658  }
15659  } else
15660  PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
15661  // Store IP
15662  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
15663  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
15664  if (i == X86::AddrDisp)
15665  MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
15666  else
15667  MIB.addOperand(MI->getOperand(MemOpndSlot + i));
15668  }
15669  if (!UseImmLabel)
15670  MIB.addReg(LabelReg);
15671  else
15672  MIB.addMBB(restoreMBB);
15673  MIB.setMemRefs(MMOBegin, MMOEnd);
15674  // Setup
15675  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
15676  .addMBB(restoreMBB);
15677 
15678  const X86RegisterInfo *RegInfo =
15679  static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
15680  MIB.addRegMask(RegInfo->getNoPreservedMask());
15681  thisMBB->addSuccessor(mainMBB);
15682  thisMBB->addSuccessor(restoreMBB);
15683 
15684  // mainMBB:
15685  // EAX = 0
15686  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
15687  mainMBB->addSuccessor(sinkMBB);
15688 
15689  // sinkMBB:
15690  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
15691  TII->get(X86::PHI), DstReg)
15692  .addReg(mainDstReg).addMBB(mainMBB)
15693  .addReg(restoreDstReg).addMBB(restoreMBB);
15694 
15695  // restoreMBB:
15696  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
15697  BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB);
15698  restoreMBB->addSuccessor(sinkMBB);
15699 
15700  MI->eraseFromParent();
15701  return sinkMBB;
15702 }
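Editorial aside: a sketch of the pointer-sized buffer slots this lowering works with. Slots 1 and 2 correspond to LabelOffset = 1 * PVT.getStoreSize() above and SPOffset = 2 * PVT.getStoreSize() in the longjmp emitter below; treating slot 0 as the frame pointer written by the generic setjmp lowering is an assumption of this sketch.

struct EHSjLjBufferModel {
  void *framePointer;   // slot 0: reloaded first by the longjmp emitter
  void *resumeIP;       // slot 1: the address of restoreMBB stored above
  void *stackPointer;   // slot 2: restored before the indirect jump
};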
15703 
15704 MachineBasicBlock *
15705 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
15706  MachineBasicBlock *MBB) const {
15707  DebugLoc DL = MI->getDebugLoc();
15708  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
15709 
15710  MachineFunction *MF = MBB->getParent();
15711  MachineRegisterInfo &MRI = MF->getRegInfo();
15712 
15713  // Memory Reference
15714  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
15715  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
15716 
15717  MVT PVT = getPointerTy();
15718  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
15719  "Invalid Pointer Size!");
15720 
15721  const TargetRegisterClass *RC =
15722  (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
15723  unsigned Tmp = MRI.createVirtualRegister(RC);
15724  // Since FP is only updated here but NOT referenced, it's treated as GPR.
15725  const X86RegisterInfo *RegInfo =
15726  static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
15727  unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
15728  unsigned SP = RegInfo->getStackRegister();
15729 
15730  MachineInstrBuilder MIB;
15731 
15732  const int64_t LabelOffset = 1 * PVT.getStoreSize();
15733  const int64_t SPOffset = 2 * PVT.getStoreSize();
15734 
15735  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
15736  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
15737 
15738  // Reload FP
15739  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
15740  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
15741  MIB.addOperand(MI->getOperand(i));
15742  MIB.setMemRefs(MMOBegin, MMOEnd);
15743  // Reload IP
15744  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
15745  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
15746  if (i == X86::AddrDisp)
15747  MIB.addDisp(MI->getOperand(i), LabelOffset);
15748  else
15749  MIB.addOperand(MI->getOperand(i));
15750  }
15751  MIB.setMemRefs(MMOBegin, MMOEnd);
15752  // Reload SP
15753  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
15754  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
15755  if (i == X86::AddrDisp)
15756  MIB.addDisp(MI->getOperand(i), SPOffset);
15757  else
15758  MIB.addOperand(MI->getOperand(i));
15759  }
15760  MIB.setMemRefs(MMOBegin, MMOEnd);
15761  // Jump
15762  BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
15763 
15764  MI->eraseFromParent();
15765  return MBB;
15766 }
15767 
15768 MachineBasicBlock *
15769 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
15770  MachineBasicBlock *BB) const {
15771  switch (MI->getOpcode()) {
15772  default: llvm_unreachable("Unexpected instr type to insert");
15773  case X86::TAILJMPd64:
15774  case X86::TAILJMPr64:
15775  case X86::TAILJMPm64:
15776  llvm_unreachable("TAILJMP64 would not be touched here.");
15777  case X86::TCRETURNdi64:
15778  case X86::TCRETURNri64:
15779  case X86::TCRETURNmi64:
15780  return BB;
15781  case X86::WIN_ALLOCA:
15782  return EmitLoweredWinAlloca(MI, BB);
15783  case X86::SEG_ALLOCA_32:
15784  return EmitLoweredSegAlloca(MI, BB, false);
15785  case X86::SEG_ALLOCA_64:
15786  return EmitLoweredSegAlloca(MI, BB, true);
15787  case X86::TLSCall_32:
15788  case X86::TLSCall_64:
15789  return EmitLoweredTLSCall(MI, BB);
15790  case X86::CMOV_GR8:
15791  case X86::CMOV_FR32:
15792  case X86::CMOV_FR64:
15793  case X86::CMOV_V4F32:
15794  case X86::CMOV_V2F64:
15795  case X86::CMOV_V2I64:
15796  case X86::CMOV_V8F32:
15797  case X86::CMOV_V4F64:
15798  case X86::CMOV_V4I64:
15799  case X86::CMOV_V16F32:
15800  case X86::CMOV_V8F64:
15801  case X86::CMOV_V8I64:
15802  case X86::CMOV_GR16:
15803  case X86::CMOV_GR32:
15804  case X86::CMOV_RFP32:
15805  case X86::CMOV_RFP64:
15806  case X86::CMOV_RFP80:
15807  return EmitLoweredSelect(MI, BB);
15808 
15809  case X86::FP32_TO_INT16_IN_MEM:
15810  case X86::FP32_TO_INT32_IN_MEM:
15811  case X86::FP32_TO_INT64_IN_MEM:
15812  case X86::FP64_TO_INT16_IN_MEM:
15813  case X86::FP64_TO_INT32_IN_MEM:
15814  case X86::FP64_TO_INT64_IN_MEM:
15815  case X86::FP80_TO_INT16_IN_MEM:
15816  case X86::FP80_TO_INT32_IN_MEM:
15817  case X86::FP80_TO_INT64_IN_MEM: {
15818  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
15819  DebugLoc DL = MI->getDebugLoc();
15820 
15821  // Change the floating point control register to use "round towards zero"
15822  // mode when truncating to an integer value.
15823  MachineFunction *F = BB->getParent();
15824  int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
15825  addFrameReference(BuildMI(*BB, MI, DL,
15826  TII->get(X86::FNSTCW16m)), CWFrameIdx);
15827 
15828  // Load the old value of the high byte of the control word...
15829  unsigned OldCW =
15830  F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
15831  addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
15832  CWFrameIdx);
15833 
15834  // Set the high part to be round to zero...
15835  addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
15836  .addImm(0xC7F);
15837 
15838  // Reload the modified control word now...
15839  addFrameReference(BuildMI(*BB, MI, DL,
15840  TII->get(X86::FLDCW16m)), CWFrameIdx);
15841 
15842  // Restore the memory image of control word to original value
15843  addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
15844  .addReg(OldCW);
15845 
15846  // Get the X86 opcode to use.
15847  unsigned Opc;
15848  switch (MI->getOpcode()) {
15849  default: llvm_unreachable("illegal opcode!");
15850  case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
15851  case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
15852  case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
15853  case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
15854  case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
15855  case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
15856  case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
15857  case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
15858  case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
15859  }
15860 
15861  X86AddressMode AM;
15862  MachineOperand &Op = MI->getOperand(0);
15863  if (Op.isReg()) {
15864  AM.BaseType = X86AddressMode::RegBase;
15865  AM.Base.Reg = Op.getReg();
15866  } else {
15867  AM.BaseType = X86AddressMode::FrameIndexBase;
15868  AM.Base.FrameIndex = Op.getIndex();
15869  }
15870  Op = MI->getOperand(1);
15871  if (Op.isImm())
15872  AM.Scale = Op.getImm();
15873  Op = MI->getOperand(2);
15874  if (Op.isImm())
15875  AM.IndexReg = Op.getImm();
15876  Op = MI->getOperand(3);
15877  if (Op.isGlobal()) {
15878  AM.GV = Op.getGlobal();
15879  } else {
15880  AM.Disp = Op.getImm();
15881  }
15882  addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
15883  .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
15884 
15885  // Reload the original control word now.
15886  addFrameReference(BuildMI(*BB, MI, DL,
15887  TII->get(X86::FLDCW16m)), CWFrameIdx);
15888 
15889  MI->eraseFromParent(); // The pseudo instruction is gone now.
15890  return BB;
15891  }
15892  // String/text processing lowering.
15893  case X86::PCMPISTRM128REG:
15894  case X86::VPCMPISTRM128REG:
15895  case X86::PCMPISTRM128MEM:
15896  case X86::VPCMPISTRM128MEM:
15897  case X86::PCMPESTRM128REG:
15898  case X86::VPCMPESTRM128REG:
15899  case X86::PCMPESTRM128MEM:
15900  case X86::VPCMPESTRM128MEM:
15901  assert(Subtarget->hasSSE42() &&
15902  "Target must have SSE4.2 or AVX features enabled");
15903  return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo());
15904 
15905  // String/text processing lowering.
15906  case X86::PCMPISTRIREG:
15907  case X86::VPCMPISTRIREG:
15908  case X86::PCMPISTRIMEM:
15909  case X86::VPCMPISTRIMEM:
15910  case X86::PCMPESTRIREG:
15911  case X86::VPCMPESTRIREG:
15912  case X86::PCMPESTRIMEM:
15913  case X86::VPCMPESTRIMEM:
15914  assert(Subtarget->hasSSE42() &&
15915  "Target must have SSE4.2 or AVX features enabled");
15916  return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo());
15917 
15918  // Thread synchronization.
15919  case X86::MONITOR:
15920  return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget);
15921 
15922  // xbegin
15923  case X86::XBEGIN:
15924  return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo());
15925 
15926  // Atomic Lowering.
15927  case X86::ATOMAND8:
15928  case X86::ATOMAND16:
15929  case X86::ATOMAND32:
15930  case X86::ATOMAND64:
15931  // Fall through
15932  case X86::ATOMOR8:
15933  case X86::ATOMOR16:
15934  case X86::ATOMOR32:
15935  case X86::ATOMOR64:
15936  // Fall through
15937  case X86::ATOMXOR16:
15938  case X86::ATOMXOR8:
15939  case X86::ATOMXOR32:
15940  case X86::ATOMXOR64:
15941  // Fall through
15942  case X86::ATOMNAND8:
15943  case X86::ATOMNAND16:
15944  case X86::ATOMNAND32:
15945  case X86::ATOMNAND64:
15946  // Fall through
15947  case X86::ATOMMAX8:
15948  case X86::ATOMMAX16:
15949  case X86::ATOMMAX32:
15950  case X86::ATOMMAX64:
15951  // Fall through
15952  case X86::ATOMMIN8:
15953  case X86::ATOMMIN16:
15954  case X86::ATOMMIN32:
15955  case X86::ATOMMIN64:
15956  // Fall through
15957  case X86::ATOMUMAX8:
15958  case X86::ATOMUMAX16:
15959  case X86::ATOMUMAX32:
15960  case X86::ATOMUMAX64:
15961  // Fall through
15962  case X86::ATOMUMIN8:
15963  case X86::ATOMUMIN16:
15964  case X86::ATOMUMIN32:
15965  case X86::ATOMUMIN64:
15966  return EmitAtomicLoadArith(MI, BB);
15967 
15968  // This group does 64-bit operations on a 32-bit host.
15969  case X86::ATOMAND6432:
15970  case X86::ATOMOR6432:
15971  case X86::ATOMXOR6432:
15972  case X86::ATOMNAND6432:
15973  case X86::ATOMADD6432:
15974  case X86::ATOMSUB6432:
15975  case X86::ATOMMAX6432:
15976  case X86::ATOMMIN6432:
15977  case X86::ATOMUMAX6432:
15978  case X86::ATOMUMIN6432:
15979  case X86::ATOMSWAP6432:
15980  return EmitAtomicLoadArith6432(MI, BB);
15981 
15982  case X86::VASTART_SAVE_XMM_REGS:
15983  return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
15984 
15985  case X86::VAARG_64:
15986  return EmitVAARG64WithCustomInserter(MI, BB);
15987 
15988  case X86::EH_SjLj_SetJmp32:
15989  case X86::EH_SjLj_SetJmp64:
15990  return emitEHSjLjSetJmp(MI, BB);
15991 
15992  case X86::EH_SjLj_LongJmp32:
15993  case X86::EH_SjLj_LongJmp64:
15994  return emitEHSjLjLongJmp(MI, BB);
15995  }
15996 }
15997 
15998 //===----------------------------------------------------------------------===//
15999 // X86 Optimization Hooks
16000 //===----------------------------------------------------------------------===//
16001 
16002 void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
16003  APInt &KnownZero,
16004  APInt &KnownOne,
16005  const SelectionDAG &DAG,
16006  unsigned Depth) const {
16007  unsigned BitWidth = KnownZero.getBitWidth();
16008  unsigned Opc = Op.getOpcode();
16009  assert((Opc >= ISD::BUILTIN_OP_END ||
16010  Opc == ISD::INTRINSIC_WO_CHAIN ||
16011  Opc == ISD::INTRINSIC_W_CHAIN ||
16012  Opc == ISD::INTRINSIC_VOID) &&
16013  "Should use MaskedValueIsZero if you don't know whether Op"
16014  " is a target node!");
16015 
16016  KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
16017  switch (Opc) {
16018  default: break;
16019  case X86ISD::ADD:
16020  case X86ISD::SUB:
16021  case X86ISD::ADC:
16022  case X86ISD::SBB:
16023  case X86ISD::SMUL:
16024  case X86ISD::UMUL:
16025  case X86ISD::INC:
16026  case X86ISD::DEC:
16027  case X86ISD::OR:
16028  case X86ISD::XOR:
16029  case X86ISD::AND:
16030  // These nodes' second result is a boolean.
16031  if (Op.getResNo() == 0)
16032  break;
16033  // Fallthrough
16034  case X86ISD::SETCC:
16035  KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
16036  break;
16037  case ISD::INTRINSIC_WO_CHAIN: {
16038  unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
16039  unsigned NumLoBits = 0;
16040  switch (IntId) {
16041  default: break;
16042  case Intrinsic::x86_sse_movmsk_ps:
16043  case Intrinsic::x86_avx_movmsk_ps_256:
16044  case Intrinsic::x86_sse2_movmsk_pd:
16045  case Intrinsic::x86_avx_movmsk_pd_256:
16046  case Intrinsic::x86_mmx_pmovmskb:
16047  case Intrinsic::x86_sse2_pmovmskb_128:
16048  case Intrinsic::x86_avx2_pmovmskb: {
16049  // High bits of movmskp{s|d}, pmovmskb are known zero.
16050  switch (IntId) {
16051  default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
16052  case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break;
16053  case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break;
16054  case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break;
16055  case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break;
16056  case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break;
16057  case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break;
16058  case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break;
16059  }
16060  KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
16061  break;
16062  }
16063  }
16064  break;
16065  }
16066  }
16067 }
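Editorial worked example of the intrinsic case above: x86_sse_movmsk_ps packs four sign bits into a 32-bit result, so bits 4..31 are known zero. A self-contained sketch using the same APInt helper the code calls.

#include "llvm/ADT/APInt.h"

static llvm::APInt movmskKnownZeroExample() {
  const unsigned BitWidth = 32, NumLoBits = 4;   // MOVMSKPS produces 4 lanes
  // Bits [NumLoBits, BitWidth) are known zero: 0xFFFFFFF0.
  return llvm::APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
}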
16068 
16069 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
16070  unsigned Depth) const {
16071  // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
16072  if (Op.getOpcode() == X86ISD::SETCC_CARRY)
16073  return Op.getValueType().getScalarType().getSizeInBits();
16074 
16075  // Fallback case.
16076  return 1;
16077 }
16078 
16079 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
16080 /// node is a GlobalAddress + offset.
16081 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
16082  const GlobalValue* &GA,
16083  int64_t &Offset) const {
16084  if (N->getOpcode() == X86ISD::Wrapper) {
16085  if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
16086  GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
16087  Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
16088  return true;
16089  }
16090  }
16091  return TargetLowering::isGAPlusOffset(N, GA, Offset);
16092 }
16093 
16094 /// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
16095 /// same as extracting the high 128-bit part of 256-bit vector and then
16096 /// inserting the result into the low part of a new 256-bit vector
16097 static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
16098  EVT VT = SVOp->getValueType(0);
16099  unsigned NumElems = VT.getVectorNumElements();
16100 
16101  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
16102  for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
16103  if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
16104  SVOp->getMaskElt(j) >= 0)
16105  return false;
16106 
16107  return true;
16108 }
16109 
16110 /// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
16111 /// same as extracting the low 128-bit part of 256-bit vector and then
16112 /// inserting the result into the high part of a new 256-bit vector
16113 static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
16114  EVT VT = SVOp->getValueType(0);
16115  unsigned NumElems = VT.getVectorNumElements();
16116 
16117  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
16118  for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
16119  if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
16120  SVOp->getMaskElt(j) >= 0)
16121  return false;
16122 
16123  return true;
16124 }
16125 
16126 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
16127 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
16128  TargetLowering::DAGCombinerInfo &DCI,
16129  const X86Subtarget* Subtarget) {
16130  SDLoc dl(N);
16131  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
16132  SDValue V1 = SVOp->getOperand(0);
16133  SDValue V2 = SVOp->getOperand(1);
16134  EVT VT = SVOp->getValueType(0);
16135  unsigned NumElems = VT.getVectorNumElements();
16136 
16137  if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
16138  V2.getOpcode() == ISD::CONCAT_VECTORS) {
16139  //
16140  //                    0,0,0,...
16141  //                       |
16142  //     V      UNDEF    BUILD_VECTOR    UNDEF
16143  //      \      /           \           /
16144  //   CONCAT_VECTOR         CONCAT_VECTOR
16145  //          \                  /
16146  //           \                /
16147  //           RESULT: V + zero extended
16148  //
16149  if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
16150  V2.getOperand(1).getOpcode() != ISD::UNDEF ||
16151  V1.getOperand(1).getOpcode() != ISD::UNDEF)
16152  return SDValue();
16153 
16154  if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
16155  return SDValue();
16156 
16157  // To match the shuffle mask, the first half of the mask should
16158  // be exactly the first vector, and all the rest a splat with the
16159  // first element of the second one.
16160  for (unsigned i = 0; i != NumElems/2; ++i)
16161  if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
16162  !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
16163  return SDValue();
16164 
16165  // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
16166  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
16167  if (Ld->hasNUsesOfValue(1, 0)) {
16168  SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
16169  SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
16170  SDValue ResNode =
16171  DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
16172  array_lengthof(Ops),
16173  Ld->getMemoryVT(),
16174  Ld->getPointerInfo(),
16175  Ld->getAlignment(),
16176  false/*isVolatile*/, true/*ReadMem*/,
16177  false/*WriteMem*/);
16178 
16179  // Make sure the newly-created LOAD is in the same position as Ld in
16180  // terms of dependency. We create a TokenFactor for Ld and ResNode,
16181  // and update uses of Ld's output chain to use the TokenFactor.
16182  if (Ld->hasAnyUseOfValue(1)) {
16183  SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
16184  SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
16185  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
16186  DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
16187  SDValue(ResNode.getNode(), 1));
16188  }
16189 
16190  return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
16191  }
16192  }
16193 
16194  // Emit a zeroed vector and insert the desired subvector on its
16195  // first half.
16196  SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
16197  SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
16198  return DCI.CombineTo(N, InsV);
16199  }
16200 
16201  //===--------------------------------------------------------------------===//
16202  // Combine some shuffles into subvector extracts and inserts:
16203  //
16204 
16205  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
16206  if (isShuffleHigh128VectorInsertLow(SVOp)) {
16207  SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
16208  SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
16209  return DCI.CombineTo(N, InsV);
16210  }
16211 
16212  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
16213  if (isShuffleLow128VectorInsertHigh(SVOp)) {
16214  SDValue V = Extract128BitVector(V1, 0, DAG, dl);
16215  SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
16216  return DCI.CombineTo(N, InsV);
16217  }
16218 
16219  return SDValue();
16220 }
16221 
16222 /// PerformShuffleCombine - Performs several different shuffle combines.
16223 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
16224  TargetLowering::DAGCombinerInfo &DCI,
16225  const X86Subtarget *Subtarget) {
16226  SDLoc dl(N);
16227  EVT VT = N->getValueType(0);
16228 
16229  // Don't create instructions with illegal types after legalize types has run.
16230  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16231  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
16232  return SDValue();
16233 
16234  // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
16235  if (Subtarget->hasFp256() && VT.is256BitVector() &&
16236  N->getOpcode() == ISD::VECTOR_SHUFFLE)
16237  return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
16238 
16239  // Only handle 128 wide vector from here on.
16240  if (!VT.is128BitVector())
16241  return SDValue();
16242 
16243  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
16244  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
16245  // consecutive, non-overlapping, and in the right order.
16246  SmallVector<SDValue, 16> Elts;
16247  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
16248  Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
16249 
16250  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
16251 }
16252 
16253 /// PerformTruncateCombine - Converts truncate operation to
16254 /// a sequence of vector shuffle operations.
16255 /// It is possible when we truncate 256-bit vector to 128-bit vector
16256 static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
16257  TargetLowering::DAGCombinerInfo &DCI,
16258  const X86Subtarget *Subtarget) {
16259  return SDValue();
16260 }
16261 
16262 /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
16263 /// specific shuffle of a load can be folded into a single element load.
16264 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
16265 /// shuffles have been customed lowered so we need to handle those here.
16266 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
16267  TargetLowering::DAGCombinerInfo &DCI) {
16268  if (DCI.isBeforeLegalizeOps())
16269  return SDValue();
16270 
16271  SDValue InVec = N->getOperand(0);
16272  SDValue EltNo = N->getOperand(1);
16273 
16274  if (!isa<ConstantSDNode>(EltNo))
16275  return SDValue();
16276 
16277  EVT VT = InVec.getValueType();
16278 
16279  bool HasShuffleIntoBitcast = false;
16280  if (InVec.getOpcode() == ISD::BITCAST) {
16281  // Don't duplicate a load with other uses.
16282  if (!InVec.hasOneUse())
16283  return SDValue();
16284  EVT BCVT = InVec.getOperand(0).getValueType();
16285  if (BCVT.getVectorNumElements() != VT.getVectorNumElements())
16286  return SDValue();
16287  InVec = InVec.getOperand(0);
16288  HasShuffleIntoBitcast = true;
16289  }
16290 
16291  if (!isTargetShuffle(InVec.getOpcode()))
16292  return SDValue();
16293 
16294  // Don't duplicate a load with other uses.
16295  if (!InVec.hasOneUse())
16296  return SDValue();
16297 
16298  SmallVector<int, 16> ShuffleMask;
16299  bool UnaryShuffle;
16300  if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask,
16301  UnaryShuffle))
16302  return SDValue();
16303 
16304  // Select the input vector, guarding against out of range extract vector.
16305  unsigned NumElems = VT.getVectorNumElements();
16306  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
16307  int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
16308  SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
16309  : InVec.getOperand(1);
16310 
16311  // If inputs to shuffle are the same for both ops, then allow 2 uses
16312  unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
16313 
16314  if (LdNode.getOpcode() == ISD::BITCAST) {
16315  // Don't duplicate a load with other uses.
16316  if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
16317  return SDValue();
16318 
16319  AllowedUses = 1; // only allow 1 load use if we have a bitcast
16320  LdNode = LdNode.getOperand(0);
16321  }
16322 
16323  if (!ISD::isNormalLoad(LdNode.getNode()))
16324  return SDValue();
16325 
16326  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
16327 
16328  if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
16329  return SDValue();
16330 
16331  if (HasShuffleIntoBitcast) {
16332  // If there's a bitcast before the shuffle, check if the load type and
16333  // alignment is valid.
16334  unsigned Align = LN0->getAlignment();
16335  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16336  unsigned NewAlign = TLI.getDataLayout()->
16337  getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
16338 
16339  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
16340  return SDValue();
16341  }
16342 
16343  // All checks match so transform back to vector_shuffle so that DAG combiner
16344  // can finish the job
16345  SDLoc dl(N);
16346 
16347  // Create shuffle node taking into account the case that its a unary shuffle
16348  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
16349  Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl,
16350  InVec.getOperand(0), Shuffle,
16351  &ShuffleMask[0]);
16352  Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
16353  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
16354  EltNo);
16355 }
16356 
16357 /// Extract one bit from mask vector, like v16i1 or v8i1.
16358 /// AVX-512 feature.
16359 static SDValue ExtractBitFromMaskVector(SDNode *N, SelectionDAG &DAG) {
16360  SDValue Vec = N->getOperand(0);
16361  SDLoc dl(Vec);
16362  MVT VecVT = Vec.getSimpleValueType();
16363  SDValue Idx = N->getOperand(1);
16364  MVT EltVT = N->getSimpleValueType(0);
16365 
16366  assert((VecVT.getVectorElementType() == MVT::i1 && EltVT == MVT::i8) &&
16367  "Unexpected operands in ExtractBitFromMaskVector");
16368 
16369  // variable index
16370  if (!isa<ConstantSDNode>(Idx)) {
16371  MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
16372  SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
16373  SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
16374  ExtVT.getVectorElementType(), Ext);
16375  return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
16376  }
16377 
16378  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
16379 
16380  MVT ScalarVT = MVT::getIntegerVT(VecVT.getSizeInBits());
16381  unsigned MaxShift = VecVT.getSizeInBits() - 1;
16382  Vec = DAG.getNode(ISD::BITCAST, dl, ScalarVT, Vec);
16383  Vec = DAG.getNode(ISD::SHL, dl, ScalarVT, Vec,
16384  DAG.getConstant(MaxShift - IdxVal, ScalarVT));
16385  Vec = DAG.getNode(ISD::SRL, dl, ScalarVT, Vec,
16386  DAG.getConstant(MaxShift, ScalarVT));
16387 
16388  if (VecVT == MVT::v16i1) {
16389  Vec = DAG.getNode(ISD::BITCAST, dl, MVT::i16, Vec);
16390  return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Vec);
16391  }
16392  return DAG.getNode(ISD::BITCAST, dl, MVT::i8, Vec);
16393 }
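Editorial aside: a scalar sketch of the constant-index path above. The requested mask bit is shifted up to the most significant bit and then back down to bit 0; names are illustrative.

#include <cstdint>

static uint8_t extractMaskBitModel(uint16_t mask /* v16i1 viewed as i16 */,
                                   unsigned idxVal) {
  const unsigned maxShift = 15;                                // VecVT bits - 1
  uint16_t shifted = (uint16_t)(mask << (maxShift - idxVal));  // ISD::SHL
  return (uint8_t)(shifted >> maxShift);                       // ISD::SRL
}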
16394 
16395 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
16396 /// generation and convert it from being a bunch of shuffles and extracts
16397 /// to a simple store and scalar loads to extract the elements.
16398 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
16399  TargetLowering::DAGCombinerInfo &DCI) {
16400  SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
16401  if (NewOp.getNode())
16402  return NewOp;
16403 
16404  SDValue InputVector = N->getOperand(0);
16405 
16406  if (InputVector.getValueType().getVectorElementType() == MVT::i1 &&
16407  !DCI.isBeforeLegalize())
16408  return ExtractBitFromMaskVector(N, DAG);
16409 
16410  // Detect whether we are trying to convert from mmx to i32 and the bitcast
16411  // from mmx to v2i32 has a single usage.
16412  if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
16413  InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
16414  InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
16415  return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
16416  N->getValueType(0),
16417  InputVector.getNode()->getOperand(0));
16418 
16419  // Only operate on vectors of 4 elements, where the alternative shuffling
16420  // gets to be more expensive.
16421  if (InputVector.getValueType() != MVT::v4i32)
16422  return SDValue();
16423 
16424  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
16425  // single use which is a sign-extend or zero-extend, and all elements are
16426  // used.
16428  unsigned ExtractedElements = 0;
16429  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
16430  UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
16431  if (UI.getUse().getResNo() != InputVector.getResNo())
16432  return SDValue();
16433 
16434  SDNode *Extract = *UI;
16435  if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16436  return SDValue();
16437 
16438  if (Extract->getValueType(0) != MVT::i32)
16439  return SDValue();
16440  if (!Extract->hasOneUse())
16441  return SDValue();
16442  if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
16443  Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
16444  return SDValue();
16445  if (!isa<ConstantSDNode>(Extract->getOperand(1)))
16446  return SDValue();
16447 
16448  // Record which element was extracted.
16449  ExtractedElements |=
16450  1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
16451 
16452  Uses.push_back(Extract);
16453  }
16454 
16455  // If not all the elements were used, this may not be worthwhile.
16456  if (ExtractedElements != 15)
16457  return SDValue();
16458 
16459  // Ok, we've now decided to do the transformation.
16460  SDLoc dl(InputVector);
16461 
16462  // Store the value to a temporary stack slot.
16463  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
16464  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
16465  MachinePointerInfo(), false, false, 0);
16466 
16467  // Replace each use (extract) with a load of the appropriate element.
16468  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
16469  UE = Uses.end(); UI != UE; ++UI) {
16470  SDNode *Extract = *UI;
16471 
16472  // Compute the element's address.
16473  SDValue Idx = Extract->getOperand(1);
16474  unsigned EltSize =
16475  InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
16476  uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
16477  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16478  SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
16479 
16480  SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
16481  StackPtr, OffsetVal);
16482 
16483  // Load the scalar.
16484  SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
16485  ScalarAddr, MachinePointerInfo(),
16486  false, false, false, 0);
16487 
16488  // Replace the extract with the load.
16489  DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
16490  }
16491 
16492  // The replacement was made in place; don't return anything.
16493  return SDValue();
16494 }
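Editorial aside: a C-level sketch of the rewrite above for the case it targets. When all four lanes of a v4i32 are extracted and extended, the combine replaces a pile of shuffles with one spill to a stack slot plus four scalar loads.

#include <cstdint>
#include <cstring>

static void extractAllLanesModel(const uint32_t vec[4], int64_t out[4]) {
  uint32_t slot[4];
  std::memcpy(slot, vec, sizeof(slot));   // the single store to the stack temporary
  for (int i = 0; i < 4; ++i)
    out[i] = (int64_t)slot[i];            // one scalar load (plus extend) per use
}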
16495 
16496 /// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
16497 static std::pair<unsigned, bool>
16498 matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
16499  SelectionDAG &DAG, const X86Subtarget *Subtarget) {
16500  if (!VT.isVector())
16501  return std::make_pair(0, false);
16502 
16503  bool NeedSplit = false;
16504  switch (VT.getSimpleVT().SimpleTy) {
16505  default: return std::make_pair(0, false);
16506  case MVT::v32i8:
16507  case MVT::v16i16:
16508  case MVT::v8i32:
16509  if (!Subtarget->hasAVX2())
16510  NeedSplit = true;
16511  if (!Subtarget->hasAVX())
16512  return std::make_pair(0, false);
16513  break;
16514  case MVT::v16i8:
16515  case MVT::v8i16:
16516  case MVT::v4i32:
16517  if (!Subtarget->hasSSE2())
16518  return std::make_pair(0, false);
16519  }
16520 
16521  // SSE2 has only a small subset of the operations.
16522  bool hasUnsigned = Subtarget->hasSSE41() ||
16523  (Subtarget->hasSSE2() && VT == MVT::v16i8);
16524  bool hasSigned = Subtarget->hasSSE41() ||
16525  (Subtarget->hasSSE2() && VT == MVT::v8i16);
16526 
16527  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16528 
16529  unsigned Opc = 0;
16530  // Check for x CC y ? x : y.
16531  if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
16532  DAG.isEqualTo(RHS, Cond.getOperand(1))) {
16533  switch (CC) {
16534  default: break;
16535  case ISD::SETULT:
16536  case ISD::SETULE:
16537  Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
16538  case ISD::SETUGT:
16539  case ISD::SETUGE:
16540  Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
16541  case ISD::SETLT:
16542  case ISD::SETLE:
16543  Opc = hasSigned ? X86ISD::SMIN : 0; break;
16544  case ISD::SETGT:
16545  case ISD::SETGE:
16546  Opc = hasSigned ? X86ISD::SMAX : 0; break;
16547  }
16548  // Check for x CC y ? y : x -- a min/max with reversed arms.
16549  } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
16550  DAG.isEqualTo(RHS, Cond.getOperand(0))) {
16551  switch (CC) {
16552  default: break;
16553  case ISD::SETULT:
16554  case ISD::SETULE:
16555  Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
16556  case ISD::SETUGT:
16557  case ISD::SETUGE:
16558  Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
16559  case ISD::SETLT:
16560  case ISD::SETLE:
16561  Opc = hasSigned ? X86ISD::SMAX : 0; break;
16562  case ISD::SETGT:
16563  case ISD::SETGE:
16564  Opc = hasSigned ? X86ISD::SMIN : 0; break;
16565  }
16566  }
16567 
16568  return std::make_pair(Opc, NeedSplit);
16569 }
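Editorial aside: a scalar sketch of the pattern the matcher above recognizes, namely a vselect whose condition compares exactly the two selected operands. With an unsigned less-than and the operands in source order this is an unsigned min (the vector form maps to PMINUD and friends once SSE4.1 is available).

#include <cstdint>

static uint32_t uminLaneModel(uint32_t x, uint32_t y) {
  return x < y ? x : y;   // x CC y ? x : y with CC = SETULT  =>  X86ISD::UMIN
}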
16570 
16571 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
16572 /// nodes.
16573 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
16574  TargetLowering::DAGCombinerInfo &DCI,
16575  const X86Subtarget *Subtarget) {
16576  SDLoc DL(N);
16577  SDValue Cond = N->getOperand(0);
16578  // Get the LHS/RHS of the select.
16579  SDValue LHS = N->getOperand(1);
16580  SDValue RHS = N->getOperand(2);
16581  EVT VT = LHS.getValueType();
16582  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16583 
16584  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
16585  // instructions match the semantics of the common C idiom x<y?x:y but not
16586  // x<=y?x:y, because of how they handle negative zero (which can be
16587  // ignored in unsafe-math mode).
16588  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
16589  VT != MVT::f80 && TLI.isTypeLegal(VT) &&
16590  (Subtarget->hasSSE2() ||
16591  (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
16592  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16593 
16594  unsigned Opcode = 0;
16595  // Check for x CC y ? x : y.
16596  if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
16597  DAG.isEqualTo(RHS, Cond.getOperand(1))) {
16598  switch (CC) {
16599  default: break;
16600  case ISD::SETULT:
16601  // Converting this to a min would handle NaNs incorrectly, and swapping
16602  // the operands would cause it to handle comparisons between positive
16603  // and negative zero incorrectly.
16604  if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
16605  if (!DAG.getTarget().Options.UnsafeFPMath &&
16606  !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
16607  break;
16608  std::swap(LHS, RHS);
16609  }
16610  Opcode = X86ISD::FMIN;
16611  break;
16612  case ISD::SETOLE:
16613  // Converting this to a min would handle comparisons between positive
16614  // and negative zero incorrectly.
16615  if (!DAG.getTarget().Options.UnsafeFPMath &&
16616  !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
16617  break;
16618  Opcode = X86ISD::FMIN;
16619  break;
16620  case ISD::SETULE:
16621  // Converting this to a min would handle both negative zeros and NaNs
16622  // incorrectly, but we can swap the operands to fix both.
16623  std::swap(LHS, RHS);
16624  case ISD::SETOLT:
16625  case ISD::SETLT:
16626  case ISD::SETLE:
16627  Opcode = X86ISD::FMIN;
16628  break;
16629 
16630  case ISD::SETOGE:
16631  // Converting this to a max would handle comparisons between positive
16632  // and negative zero incorrectly.
16633  if (!DAG.getTarget().Options.UnsafeFPMath &&
16634  !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
16635  break;
16636  Opcode = X86ISD::FMAX;
16637  break;
16638  case ISD::SETUGT:
16639  // Converting this to a max would handle NaNs incorrectly, and swapping
16640  // the operands would cause it to handle comparisons between positive
16641  // and negative zero incorrectly.
16642  if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
16643  if (!DAG.getTarget().Options.UnsafeFPMath &&
16644  !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
16645  break;
16646  std::swap(LHS, RHS);
16647  }
16648  Opcode = X86ISD::FMAX;
16649  break;
16650  case ISD::SETUGE:
16651  // Converting this to a max would handle both negative zeros and NaNs
16652  // incorrectly, but we can swap the operands to fix both.
16653  std::swap(LHS, RHS);
16654  case ISD::SETOGT:
16655  case ISD::SETGT:
16656  case ISD::SETGE:
16657  Opcode = X86ISD::FMAX;
16658  break;
16659  }
16660  // Check for x CC y ? y : x -- a min/max with reversed arms.
16661  } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
16662  DAG.isEqualTo(RHS, Cond.getOperand(0))) {
16663  switch (CC) {
16664  default: break;
16665  case ISD::SETOGE:
16666  // Converting this to a min would handle comparisons between positive
16667  // and negative zero incorrectly, and swapping the operands would
16668  // cause it to handle NaNs incorrectly.
16669  if (!DAG.getTarget().Options.UnsafeFPMath &&
16670  !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
16671  if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
16672  break;
16673  std::swap(LHS, RHS);
16674  }
16675  Opcode = X86ISD::FMIN;
16676  break;
16677  case ISD::SETUGT:
16678  // Converting this to a min would handle NaNs incorrectly.
16679  if (!DAG.getTarget().Options.UnsafeFPMath &&
16680  (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
16681  break;
16682  Opcode = X86ISD::FMIN;
16683  break;
16684  case ISD::SETUGE:
16685  // Converting this to a min would handle both negative zeros and NaNs
16686  // incorrectly, but we can swap the operands to fix both.
16687  std::swap(LHS, RHS);
16688  case ISD::SETOGT:
16689  case ISD::SETGT:
16690  case ISD::SETGE:
16691  Opcode = X86ISD::FMIN;
16692  break;
16693 
16694  case ISD::SETULT:
16695  // Converting this to a max would handle NaNs incorrectly.
16696  if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
16697  break;
16698  Opcode = X86ISD::FMAX;
16699  break;
16700  case ISD::SETOLE:
16701  // Converting this to a max would handle comparisons between positive
16702  // and negative zero incorrectly, and swapping the operands would
16703  // cause it to handle NaNs incorrectly.
16704  if (!DAG.getTarget().Options.UnsafeFPMath &&
16705  !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
16706  if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
16707  break;
16708  std::swap(LHS, RHS);
16709  }
16710  Opcode = X86ISD::FMAX;
16711  break;
16712  case ISD::SETULE:
16713  // Converting this to a max would handle both negative zeros and NaNs
16714  // incorrectly, but we can swap the operands to fix both.
16715  std::swap(LHS, RHS);
16716  case ISD::SETOLT:
16717  case ISD::SETLT:
16718  case ISD::SETLE:
16719  Opcode = X86ISD::FMAX;
16720  break;
16721  }
16722  }
16723 
16724  if (Opcode)
16725  return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
16726  }
16727 
16728  EVT CondVT = Cond.getValueType();
16729  if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
16730  CondVT.getVectorElementType() == MVT::i1) {
16731  // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
16732  // lowering on AVX-512. In this case we convert it to
16733  // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
16734  // The same applies to all 128- and 256-bit vectors of i8 and i16.
16735  EVT OpVT = LHS.getValueType();
16736  if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
16737  (OpVT.getVectorElementType() == MVT::i8 ||
16738  OpVT.getVectorElementType() == MVT::i16)) {
16739  Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
16740  DCI.AddToWorklist(Cond.getNode());
16741  return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
16742  }
16743  }
16744  // If this is a select between two integer constants, try to do some
16745  // optimizations.
16746  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
16747  if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
16748  // Don't do this for crazy integer types.
16749  if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
16750  // If this is efficiently invertible, canonicalize the LHSC/RHSC values
16751  // so that TrueC (the true value) is larger than FalseC.
16752  bool NeedsCondInvert = false;
16753 
16754  if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
16755  // Efficiently invertible.
16756  (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
16757  (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
16758  isa<ConstantSDNode>(Cond.getOperand(1))))) {
16759  NeedsCondInvert = true;
16760  std::swap(TrueC, FalseC);
16761  }
16762 
16763  // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
16764  if (FalseC->getAPIntValue() == 0 &&
16765  TrueC->getAPIntValue().isPowerOf2()) {
16766  if (NeedsCondInvert) // Invert the condition if needed.
16767  Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
16768  DAG.getConstant(1, Cond.getValueType()));
16769 
16770  // Zero extend the condition if needed.
16771  Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
16772 
16773  unsigned ShAmt = TrueC->getAPIntValue().logBase2();
16774  return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
16775  DAG.getConstant(ShAmt, MVT::i8));
16776  }
16777 
16778  // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
16779  if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
16780  if (NeedsCondInvert) // Invert the condition if needed.
16781  Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
16782  DAG.getConstant(1, Cond.getValueType()));
16783 
16784  // Zero extend the condition if needed.
16785  Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
16786  FalseC->getValueType(0), Cond);
16787  return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
16788  SDValue(FalseC, 0));
16789  }
16790 
16791  // Optimize cases that will turn into an LEA instruction. This requires
16792  // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
16793  if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
16794  uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
16795  if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
16796 
16797  bool isFastMultiplier = false;
16798  if (Diff < 10) {
16799  switch ((unsigned char)Diff) {
16800  default: break;
16801  case 1: // result = add base, cond
16802  case 2: // result = lea base( , cond*2)
16803  case 3: // result = lea base(cond, cond*2)
16804  case 4: // result = lea base( , cond*4)
16805  case 5: // result = lea base(cond, cond*4)
16806  case 8: // result = lea base( , cond*8)
16807  case 9: // result = lea base(cond, cond*8)
16808  isFastMultiplier = true;
16809  break;
16810  }
16811  }
16812 
16813  if (isFastMultiplier) {
16814  APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
16815  if (NeedsCondInvert) // Invert the condition if needed.
16816  Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
16817  DAG.getConstant(1, Cond.getValueType()));
16818 
16819  // Zero extend the condition if needed.
16820  Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
16821  Cond);
16822  // Scale the condition by the difference.
16823  if (Diff != 1)
16824  Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
16825  DAG.getConstant(Diff, Cond.getValueType()));
16826 
16827  // Add the base if non-zero.
16828  if (FalseC->getAPIntValue() != 0)
16829  Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
16830  SDValue(FalseC, 0));
16831  return Cond;
16832  }
16833  }
16834  }
16835  }
16836 
16837  // Canonicalize max and min:
16838  // (x > y) ? x : y -> (x >= y) ? x : y
16839  // (x < y) ? x : y -> (x <= y) ? x : y
16840  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
16841  // the need for an extra compare
16842  // against zero. e.g.
16843  // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
16844  // subl %esi, %edi
16845  // testl %edi, %edi
16846  // movl $0, %eax
16847  // cmovgl %edi, %eax
16848  // =>
16849  // xorl %eax, %eax
16850  // subl %esi, %edi
16851  // cmovsl %eax, %edi
16852  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
16853  DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
16854  DAG.isEqualTo(RHS, Cond.getOperand(1))) {
16855  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16856  switch (CC) {
16857  default: break;
16858  case ISD::SETLT:
16859  case ISD::SETGT: {
16860  ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
16861  Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
16862  Cond.getOperand(0), Cond.getOperand(1), NewCC);
16863  return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
16864  }
16865  }
16866  }
16867 
16868  // Early exit check
16869  if (!TLI.isTypeLegal(VT))
16870  return SDValue();
16871 
16872  // Match VSELECTs into subs with unsigned saturation.
16873  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
16874  // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
16875  ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
16876  (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
16877  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16878 
16879  // Check if one of the arms of the VSELECT is a zero vector. If it's on the
16880  // left side, invert the predicate to simplify the logic below.
16881  SDValue Other;
16882  if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
16883  Other = RHS;
16884  CC = ISD::getSetCCInverse(CC, true);
16885  } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
16886  Other = LHS;
16887  }
16888 
16889  if (Other.getNode() && Other->getNumOperands() == 2 &&
16890  DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
16891  SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
16892  SDValue CondRHS = Cond->getOperand(1);
16893 
16894  // Look for a general sub with unsigned saturation first.
16895  // x >= y ? x-y : 0 --> subus x, y
16896  // x > y ? x-y : 0 --> subus x, y
16897  if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
16898  Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
16899  return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
16900 
16901  // If the RHS is a constant we have to reverse the const canonicalization.
16902  // x > C-1 ? x+-C : 0 --> subus x, C
16903  if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
16904  isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) {
16905  APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
16906  if (CondRHS.getConstantOperandVal(0) == -A-1)
16907  return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS,
16908  DAG.getConstant(-A, VT));
16909  }
16910 
16911  // Another special case: If C was a sign bit, the sub has been
16912  // canonicalized into a xor.
16913  // FIXME: Would it be better to use ComputeMaskedBits to determine whether
16914  // it's safe to decanonicalize the xor?
16915  // x s< 0 ? x^C : 0 --> subus x, C
16916  if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
16917  ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
16918  isSplatVector(OpRHS.getNode())) {
16919  APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
16920  if (A.isSignBit())
16921  return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
16922  }
16923  }
16924  }
16925 
16926  // Try to match a min/max vector operation.
16927  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
16928  std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
16929  unsigned Opc = ret.first;
16930  bool NeedSplit = ret.second;
16931 
16932  if (Opc && NeedSplit) {
16933  unsigned NumElems = VT.getVectorNumElements();
16934  // Extract the LHS vectors
16935  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
16936  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
16937 
16938  // Extract the RHS vectors
16939  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
16940  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
16941 
16942  // Create min/max for each subvector
16943  LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
16944  RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
16945 
16946  // Merge the result
16947  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
16948  } else if (Opc)
16949  return DAG.getNode(Opc, DL, VT, LHS, RHS);
16950  }
16951 
16952  // Simplify vector selection if the selector will be produced by CMPP*/PCMP*.
16953  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
16954  // Check if SETCC has already been promoted
16955  TLI.getSetCCResultType(*DAG.getContext(), VT) == Cond.getValueType()) {
16956 
16957  assert(Cond.getValueType().isVector() &&
16958  "vector select expects a vector selector!");
16959 
16960  EVT IntVT = Cond.getValueType();
16961  bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
16962  bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
16963 
16964  if (!TValIsAllOnes && !FValIsAllZeros) {
16965  // Try invert the condition if true value is not all 1s and false value
16966  // is not all 0s.
16967  bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
16968  bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
16969 
16970  if (TValIsAllZeros || FValIsAllOnes) {
16971  SDValue CC = Cond.getOperand(2);
16972  ISD::CondCode NewCC =
16973  ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
16974  Cond.getOperand(0).getValueType().isInteger());
16975  Cond = DAG.getSetCC(DL, IntVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
16976  std::swap(LHS, RHS);
16977  TValIsAllOnes = FValIsAllOnes;
16978  FValIsAllZeros = TValIsAllZeros;
16979  }
16980  }
16981 
16982  if (TValIsAllOnes || FValIsAllZeros) {
16983  SDValue Ret;
16984 
16985  if (TValIsAllOnes && FValIsAllZeros)
16986  Ret = Cond;
16987  else if (TValIsAllOnes)
16988  Ret = DAG.getNode(ISD::OR, DL, IntVT, Cond,
16989  DAG.getNode(ISD::BITCAST, DL, IntVT, RHS));
16990  else if (FValIsAllZeros)
16991  Ret = DAG.getNode(ISD::AND, DL, IntVT, Cond,
16992  DAG.getNode(ISD::BITCAST, DL, IntVT, LHS));
16993 
16994  return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
16995  }
16996  }
16997 
16998  // If we know that this node is legal then we know that it is going to be
16999  // matched by one of the SSE/AVX BLEND instructions. These instructions only
17000  // depend on the highest bit in each word. Try to use SimplifyDemandedBits
17001  // to simplify previous instructions.
17002  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
17003  !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
17004  unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
17005 
17006  // Don't optimize vector selects that map to mask-registers.
17007  if (BitWidth == 1)
17008  return SDValue();
17009 
17010  // Check all uses of that condition operand to check whether it will be
17011  // consumed by non-BLEND instructions, which may depend on all of the bits
17012  // being set properly.
17013  for (SDNode::use_iterator I = Cond->use_begin(),
17014  E = Cond->use_end(); I != E; ++I)
17015  if (I->getOpcode() != ISD::VSELECT)
17016  // TODO: Add other opcodes eventually lowered into BLEND.
17017  return SDValue();
17018 
17019  assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
17020  APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
17021 
17022  APInt KnownZero, KnownOne;
17023  TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
17024  DCI.isBeforeLegalizeOps());
17025  if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
17026  TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
17027  DCI.CommitTargetLoweringOpt(TLO);
17028  }
17029 
17030  return SDValue();
17031 }
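Illustrative aside (not part of the original file): the scalar source idiom that the floating-point min/max matching above targets, shown as a minimal standalone C++ sketch with hypothetical helper names. MINSS/MAXSS produce their second operand when the comparison is false, which matches x<y?x:y exactly (including for NaN) but not x<=y?x:y around signed zero, hence the isKnownNeverZero / UnsafeFPMath guards in the code.

// Maps directly onto minss/maxss: on a false compare, both the select and the
// instruction yield the second operand (y), including for NaN inputs.
float sel_min(float x, float y) { return x < y ? x : y; }
float sel_max(float x, float y) { return x > y ? x : y; }

// Not directly convertible: for x = -0.0f, y = +0.0f the select yields -0.0f,
// while minss yields its second operand, +0.0f.
float sel_min_le(float x, float y) { return x <= y ? x : y; }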
17032 
17033 // Check whether a boolean test is testing a boolean value generated by
17034 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
17035 // code.
17036 //
17037 // Simplify the following patterns:
17038 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
17039 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
17040 // to (Op EFLAGS Cond)
17041 //
17042 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
17043 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
17044 // to (Op EFLAGS !Cond)
17045 //
17046 // where Op could be BRCOND or CMOV.
17047 //
17048 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
17049  // Quit if not CMP and SUB with its value result used.
17050  if (Cmp.getOpcode() != X86ISD::CMP &&
17051  (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
17052  return SDValue();
17053 
17054  // Quit if not used as a boolean value.
17055  if (CC != X86::COND_E && CC != X86::COND_NE)
17056  return SDValue();
17057 
17058  // Check CMP operands. One of them should be 0 or 1 and the other should be
17059  // an SetCC or extended from it.
17060  SDValue Op1 = Cmp.getOperand(0);
17061  SDValue Op2 = Cmp.getOperand(1);
17062 
17063  SDValue SetCC;
17064  const ConstantSDNode* C = 0;
17065  bool needOppositeCond = (CC == X86::COND_E);
17066  bool checkAgainstTrue = false; // Is it a comparison against 1?
17067 
17068  if ((C = dyn_cast<ConstantSDNode>(Op1)))
17069  SetCC = Op2;
17070  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
17071  SetCC = Op1;
17072  else // Quit if neither operand is a constant.
17073  return SDValue();
17074 
17075  if (C->getZExtValue() == 1) {
17076  needOppositeCond = !needOppositeCond;
17077  checkAgainstTrue = true;
17078  } else if (C->getZExtValue() != 0)
17079  // Quit if the constant is neither 0 nor 1.
17080  return SDValue();
17081 
17082  bool truncatedToBoolWithAnd = false;
17083  // Skip (zext $x), (trunc $x), or (and $x, 1) node.
17084  while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
17085  SetCC.getOpcode() == ISD::TRUNCATE ||
17086  SetCC.getOpcode() == ISD::AND) {
17087  if (SetCC.getOpcode() == ISD::AND) {
17088  int OpIdx = -1;
17089  ConstantSDNode *CS;
17090  if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
17091  CS->getZExtValue() == 1)
17092  OpIdx = 1;
17093  if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
17094  CS->getZExtValue() == 1)
17095  OpIdx = 0;
17096  if (OpIdx == -1)
17097  break;
17098  SetCC = SetCC.getOperand(OpIdx);
17099  truncatedToBoolWithAnd = true;
17100  } else
17101  SetCC = SetCC.getOperand(0);
17102  }
17103 
17104  switch (SetCC.getOpcode()) {
17105  case X86ISD::SETCC_CARRY:
17106  // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
17107  // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
17108  // i.e. it's a comparison against true but the result of SETCC_CARRY is not
17109  // truncated to i1 using 'and'.
17110  if (checkAgainstTrue && !truncatedToBoolWithAnd)
17111  break;
17112  assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
17113  "Invalid use of SETCC_CARRY!");
17114  // FALL THROUGH
17115  case X86ISD::SETCC:
17116  // Set the condition code or opposite one if necessary.
17117  CC = X86::CondCode(SetCC.getConstantOperandVal(0));
17118  if (needOppositeCond)
17119  CC = X86::GetOppositeBranchCondition(CC);
17120  return SetCC.getOperand(1);
17121  case X86ISD::CMOV: {
17122  // Check whether false/true value has canonical one, i.e. 0 or 1.
17123  ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
17124  ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
17125  // Quit if true value is not a constant.
17126  if (!TVal)
17127  return SDValue();
17128  // Quit if false value is not a constant.
17129  if (!FVal) {
17130  SDValue Op = SetCC.getOperand(0);
17131  // Skip 'zext' or 'trunc' node.
17132  if (Op.getOpcode() == ISD::ZERO_EXTEND ||
17133  Op.getOpcode() == ISD::TRUNCATE)
17134  Op = Op.getOperand(0);
17135  // A special case for rdrand/rdseed, where 0 is set if false cond is
17136  // found.
17137  if ((Op.getOpcode() != X86ISD::RDRAND &&
17138  Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
17139  return SDValue();
17140  }
17141  // Quit if false value is not the constant 0 or 1.
17142  bool FValIsFalse = true;
17143  if (FVal && FVal->getZExtValue() != 0) {
17144  if (FVal->getZExtValue() != 1)
17145  return SDValue();
17146  // If FVal is 1, opposite cond is needed.
17147  needOppositeCond = !needOppositeCond;
17148  FValIsFalse = false;
17149  }
17150  // Quit if TVal is not the constant opposite of FVal.
17151  if (FValIsFalse && TVal->getZExtValue() != 1)
17152  return SDValue();
17153  if (!FValIsFalse && TVal->getZExtValue() != 0)
17154  return SDValue();
17155  CC = X86::CondCode(SetCC.getConstantOperandVal(2));
17156  if (needOppositeCond)
17157  CC = X86::GetOppositeBranchCondition(CC);
17158  return SetCC.getOperand(3);
17159  }
17160  }
17161 
17162  return SDValue();
17163 }
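Illustrative aside (not part of the original file): the kind of redundant re-test this helper eliminates, sketched as hypothetical standalone C++. A boolean produced by a SETCC is immediately compared against 0 or 1 again before a branch or CMOV; the combine forwards the original EFLAGS instead of materializing and re-testing the boolean.

int pick(int a, int b, int t, int e) {
  bool flag = (a < b);   // materialized with a SETcc of EFLAGS
  return flag ? t : e;   // naively re-tested (CMP/TEST) before the CMOV;
                         // the combine reuses the first compare's EFLAGS
}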
17164 
17165 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
17166 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
17167  TargetLowering::DAGCombinerInfo &DCI,
17168  const X86Subtarget *Subtarget) {
17169  SDLoc DL(N);
17170 
17171  // If the flag operand isn't dead, don't touch this CMOV.
17172  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
17173  return SDValue();
17174 
17175  SDValue FalseOp = N->getOperand(0);
17176  SDValue TrueOp = N->getOperand(1);
17177  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
17178  SDValue Cond = N->getOperand(3);
17179 
17180  if (CC == X86::COND_E || CC == X86::COND_NE) {
17181  switch (Cond.getOpcode()) {
17182  default: break;
17183  case X86ISD::BSR:
17184  case X86ISD::BSF:
17185  // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
17186  if (DAG.isKnownNeverZero(Cond.getOperand(0)))
17187  return (CC == X86::COND_E) ? FalseOp : TrueOp;
17188  }
17189  }
17190 
17191  SDValue Flags;
17192 
17193  Flags = checkBoolTestSetCCCombine(Cond, CC);
17194  if (Flags.getNode() &&
17195  // Extra check as FCMOV only supports a subset of X86 cond.
17196  (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
17197  SDValue Ops[] = { FalseOp, TrueOp,
17198  DAG.getConstant(CC, MVT::i8), Flags };
17199  return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
17200  Ops, array_lengthof(Ops));
17201  }
17202 
17203  // If this is a select between two integer constants, try to do some
17204  // optimizations. Note that the operands are ordered the opposite of SELECT
17205  // operands.
17206  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
17207  if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
17208  // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
17209  // larger than FalseC (the false value).
17210  if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
17211  CC = X86::GetOppositeBranchCondition(CC);
17212  std::swap(TrueC, FalseC);
17213  std::swap(TrueOp, FalseOp);
17214  }
17215 
17216  // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
17217  // This is efficient for any integer data type (including i8/i16) and
17218  // shift amount.
17219  if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
17220  Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
17221  DAG.getConstant(CC, MVT::i8), Cond);
17222 
17223  // Zero extend the condition if needed.
17224  Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
17225 
17226  unsigned ShAmt = TrueC->getAPIntValue().logBase2();
17227  Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
17228  DAG.getConstant(ShAmt, MVT::i8));
17229  if (N->getNumValues() == 2) // Dead flag value?
17230  return DCI.CombineTo(N, Cond, SDValue());
17231  return Cond;
17232  }
17233 
17234  // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
17235  // for any integer data type, including i8/i16.
17236  if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
17237  Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
17238  DAG.getConstant(CC, MVT::i8), Cond);
17239 
17240  // Zero extend the condition if needed.
17241  Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
17242  FalseC->getValueType(0), Cond);
17243  Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
17244  SDValue(FalseC, 0));
17245 
17246  if (N->getNumValues() == 2) // Dead flag value?
17247  return DCI.CombineTo(N, Cond, SDValue());
17248  return Cond;
17249  }
17250 
17251  // Optimize cases that will turn into an LEA instruction. This requires
17252  // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
17253  if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
17254  uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
17255  if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
17256 
17257  bool isFastMultiplier = false;
17258  if (Diff < 10) {
17259  switch ((unsigned char)Diff) {
17260  default: break;
17261  case 1: // result = add base, cond
17262  case 2: // result = lea base( , cond*2)
17263  case 3: // result = lea base(cond, cond*2)
17264  case 4: // result = lea base( , cond*4)
17265  case 5: // result = lea base(cond, cond*4)
17266  case 8: // result = lea base( , cond*8)
17267  case 9: // result = lea base(cond, cond*8)
17268  isFastMultiplier = true;
17269  break;
17270  }
17271  }
17272 
17273  if (isFastMultiplier) {
17274  APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
17275  Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
17276  DAG.getConstant(CC, MVT::i8), Cond);
17277  // Zero extend the condition if needed.
17278  Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
17279  Cond);
17280  // Scale the condition by the difference.
17281  if (Diff != 1)
17282  Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
17283  DAG.getConstant(Diff, Cond.getValueType()));
17284 
17285  // Add the base if non-zero.
17286  if (FalseC->getAPIntValue() != 0)
17287  Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
17288  SDValue(FalseC, 0));
17289  if (N->getNumValues() == 2) // Dead flag value?
17290  return DCI.CombineTo(N, Cond, SDValue());
17291  return Cond;
17292  }
17293  }
17294  }
17295  }
17296 
17297  // Handle these cases:
17298  // (select (x != c), e, c) -> select (x != c), e, x),
17299  // (select (x == c), c, e) -> select (x == c), x, e)
17300  // where the c is an integer constant, and the "select" is the combination
17301  // of CMOV and CMP.
17302  //
17303  // The rationale for this change is that the conditional-move from a constant
17304  // needs two instructions, whereas a conditional-move from a register needs
17305  // only one instruction.
17306  //
17307  // CAVEAT: By replacing a constant with a symbolic value, it may obscure
17308  // some instruction-combining opportunities. This opt needs to be
17309  // postponed as late as possible.
17310  //
17311  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
17312  // the DCI.xxxx conditions are provided to postpone the optimization as
17313  // late as possible.
17314 
17315  ConstantSDNode *CmpAgainst = 0;
17316  if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
17317  (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
17318  !isa<ConstantSDNode>(Cond.getOperand(0))) {
17319 
17320  if (CC == X86::COND_NE &&
17321  CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
17322  CC = X86::COND_E;
17323  std::swap(TrueOp, FalseOp);
17324  }
17325 
17326  if (CC == X86::COND_E &&
17327  CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
17328  SDValue Ops[] = { FalseOp, Cond.getOperand(0),
17329  DAG.getConstant(CC, MVT::i8), Cond };
17330  return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops,
17331  array_lengthof(Ops));
17332  }
17333  }
17334  }
17335 
17336  return SDValue();
17337 }
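Illustrative aside (not part of the original file): the constant-select shape the LEA path above handles, with a hypothetical helper name. For cond ? 5 : 2 the difference is 3, which is one of the "fast multipliers", so the select becomes setcc + lea + add rather than a branch or two constant moves.

unsigned pick5or2(bool cond) { return cond ? 5u : 2u; }
// Roughly what the combine builds:
//   t = zext(setcc(cond));   // 0 or 1
//   t = t * 3;               // lea t(t, t*2)
//   return t + 2;            // add the FalseC base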
17338 
17339 /// PerformMulCombine - Optimize a single multiply with constant into two
17340 /// in order to implement it with two cheaper instructions, e.g.
17341 /// LEA + SHL, LEA + LEA.
17342 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
17343  TargetLowering::DAGCombinerInfo &DCI) {
17344  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
17345  return SDValue();
17346 
17347  EVT VT = N->getValueType(0);
17348  if (VT != MVT::i64)
17349  return SDValue();
17350 
17351  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
17352  if (!C)
17353  return SDValue();
17354  uint64_t MulAmt = C->getZExtValue();
17355  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
17356  return SDValue();
17357 
17358  uint64_t MulAmt1 = 0;
17359  uint64_t MulAmt2 = 0;
17360  if ((MulAmt % 9) == 0) {
17361  MulAmt1 = 9;
17362  MulAmt2 = MulAmt / 9;
17363  } else if ((MulAmt % 5) == 0) {
17364  MulAmt1 = 5;
17365  MulAmt2 = MulAmt / 5;
17366  } else if ((MulAmt % 3) == 0) {
17367  MulAmt1 = 3;
17368  MulAmt2 = MulAmt / 3;
17369  }
17370  if (MulAmt2 &&
17371  (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
17372  SDLoc DL(N);
17373 
17374  if (isPowerOf2_64(MulAmt2) &&
17375  !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
17376  // If the second multiplier is pow2, issue it first. We want the multiply by
17377  // 3, 5, or 9 to be folded into the addressing mode unless the lone use
17378  // is an add.
17379  std::swap(MulAmt1, MulAmt2);
17380 
17381  SDValue NewMul;
17382  if (isPowerOf2_64(MulAmt1))
17383  NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
17384  DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
17385  else
17386  NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
17387  DAG.getConstant(MulAmt1, VT));
17388 
17389  if (isPowerOf2_64(MulAmt2))
17390  NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
17391  DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
17392  else
17393  NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
17394  DAG.getConstant(MulAmt2, VT));
17395 
17396  // Do not add new nodes to DAG combiner worklist.
17397  DCI.CombineTo(N, NewMul, false);
17398  }
17399  return SDValue();
17400 }
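Illustrative aside (not part of the original file): a concrete decomposition this combine performs, written as standalone C++ with hypothetical function names. 45 = 5 * 9, and both factors fit a single LEA addressing mode, so the i64 multiply becomes two LEAs instead of an IMUL.

#include <cstdint>   // assumed for this illustration only
uint64_t mul45(uint64_t x)     { return x * 45; }
uint64_t mul45_lea(uint64_t x) { uint64_t t = x + x * 4;   // lea (x, x*4): t = x*5
                                 return t + t * 8; }       // lea (t, t*8): 9*t = 45*x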
17401 
17402 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
17403  SDValue N0 = N->getOperand(0);
17404  SDValue N1 = N->getOperand(1);
17405  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
17406  EVT VT = N0.getValueType();
17407 
17408  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
17409  // since the result of setcc_c is all zero's or all ones.
17410  if (VT.isInteger() && !VT.isVector() &&
17411  N1C && N0.getOpcode() == ISD::AND &&
17412  N0.getOperand(1).getOpcode() == ISD::Constant) {
17413  SDValue N00 = N0.getOperand(0);
17414  if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
17415  ((N00.getOpcode() == ISD::ANY_EXTEND ||
17416  N00.getOpcode() == ISD::ZERO_EXTEND) &&
17417  N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
17418  APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
17419  APInt ShAmt = N1C->getAPIntValue();
17420  Mask = Mask.shl(ShAmt);
17421  if (Mask != 0)
17422  return DAG.getNode(ISD::AND, SDLoc(N), VT,
17423  N00, DAG.getConstant(Mask, VT));
17424  }
17425  }
17426 
17427  // Hardware support for vector shifts is sparse which makes us scalarize the
17428  // vector operations in many cases. Also, on sandybridge ADD is faster than
17429  // shl.
17430  // (shl V, 1) -> add V,V
17431  if (isSplatVector(N1.getNode())) {
17432  assert(N0.getValueType().isVector() && "Invalid vector shift type");
17434  // We shift all of the values by one. In many cases we do not have
17435  // hardware support for this operation. This is better expressed as an ADD
17436  // of two values.
17437  if (N1C && (1 == N1C->getZExtValue())) {
17438  return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
17439  }
17440  }
17441 
17442  return SDValue();
17443 }
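Illustrative aside (not part of the original file): the (shl V, 1) -> add V, V rewrite expressed with SSE2 intrinsics (availability assumed; the function name is hypothetical).

#include <immintrin.h>   // assumed for this illustration only
__m128i shl1_epi32(__m128i v) {
  // Equivalent to shifting each 32-bit lane left by one, but lowers to PADDD,
  // which is cheaper than a vector shift on several microarchitectures.
  return _mm_add_epi32(v, v);
}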
17444 
17445 /// \brief Returns a vector of 0s if the node in input is a vector logical
17446 /// shift by a constant amount which is known to be bigger than or equal
17447 /// to the vector element size in bits.
17448 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
17449  const X86Subtarget *Subtarget) {
17450  EVT VT = N->getValueType(0);
17451 
17452  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
17453  (!Subtarget->hasInt256() ||
17454  (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
17455  return SDValue();
17456 
17457  SDValue Amt = N->getOperand(1);
17458  SDLoc DL(N);
17459  if (isSplatVector(Amt.getNode())) {
17460  SDValue SclrAmt = Amt->getOperand(0);
17461  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
17462  APInt ShiftAmt = C->getAPIntValue();
17463  unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
17464 
17465  // SSE2/AVX2 logical shifts always return a vector of 0s
17466  // if the shift amount is bigger than or equal to
17467  // the element size. The constant shift amount will be
17468  // encoded as an 8-bit immediate.
17469  if (ShiftAmt.trunc(8).uge(MaxAmount))
17470  return getZeroVector(VT, Subtarget, DAG, DL);
17471  }
17472  }
17473 
17474  return SDValue();
17475 }
17476 
17477 /// PerformShiftCombine - Combine shifts.
17478 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
17479  TargetLowering::DAGCombinerInfo &DCI,
17480  const X86Subtarget *Subtarget) {
17481  if (N->getOpcode() == ISD::SHL) {
17482  SDValue V = PerformSHLCombine(N, DAG);
17483  if (V.getNode()) return V;
17484  }
17485 
17486  if (N->getOpcode() != ISD::SRA) {
17487  // Try to fold this logical shift into a zero vector.
17488  SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
17489  if (V.getNode()) return V;
17490  }
17491 
17492  return SDValue();
17493 }
17494 
17495 // CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..))
17496 // where both setccs reference the same FP CMP, and rewrite for CMPEQSS
17497 // and friends. Likewise for OR -> CMPNEQSS.
17498 static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
17499  TargetLowering::DAGCombinerInfo &DCI,
17500  const X86Subtarget *Subtarget) {
17501  unsigned opcode;
17502 
17503  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
17504  // we're requiring SSE2 for both.
17505  if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
17506  SDValue N0 = N->getOperand(0);
17507  SDValue N1 = N->getOperand(1);
17508  SDValue CMP0 = N0->getOperand(1);
17509  SDValue CMP1 = N1->getOperand(1);
17510  SDLoc DL(N);
17511 
17512  // The SETCCs should both refer to the same CMP.
17513  if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
17514  return SDValue();
17515 
17516  SDValue CMP00 = CMP0->getOperand(0);
17517  SDValue CMP01 = CMP0->getOperand(1);
17518  EVT VT = CMP00.getValueType();
17519 
17520  if (VT == MVT::f32 || VT == MVT::f64) {
17521  bool ExpectingFlags = false;
17522  // Check for any users that want flags:
17523  for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
17524  !ExpectingFlags && UI != UE; ++UI)
17525  switch (UI->getOpcode()) {
17526  default:
17527  case ISD::BR_CC:
17528  case ISD::BRCOND:
17529  case ISD::SELECT:
17530  ExpectingFlags = true;
17531  break;
17532  case ISD::CopyToReg:
17533  case ISD::SIGN_EXTEND:
17534  case ISD::ZERO_EXTEND:
17535  case ISD::ANY_EXTEND:
17536  break;
17537  }
17538 
17539  if (!ExpectingFlags) {
17540  enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
17541  enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
17542 
17543  if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
17544  X86::CondCode tmp = cc0;
17545  cc0 = cc1;
17546  cc1 = tmp;
17547  }
17548 
17549  if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
17550  (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
17551  bool is64BitFP = (CMP00.getValueType() == MVT::f64);
17552  X86ISD::NodeType NTOperator = is64BitFP ?
17553  X86ISD::FSETCCsd : X86ISD::FSETCCss;
17554  // FIXME: need symbolic constants for these magic numbers.
17555  // See X86ATTInstPrinter.cpp:printSSECC().
17556  unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
17557  SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01,
17558  DAG.getConstant(x86cc, MVT::i8));
17559  SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32,
17560  OnesOrZeroesF);
17561  SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI,
17562  DAG.getConstant(1, MVT::i32));
17563  SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
17564  return OneBitOfTruth;
17565  }
17566  }
17567  }
17568  }
17569  return SDValue();
17570 }
17571 
17572 /// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector
17573 /// so it can be folded inside ANDNP.
17574 static bool CanFoldXORWithAllOnes(const SDNode *N) {
17575  EVT VT = N->getValueType(0);
17576 
17577  // Match direct AllOnes for 128 and 256-bit vectors
17578  if (ISD::isBuildVectorAllOnes(N))
17579  return true;
17580 
17581  // Look through a bit convert.
17582  if (N->getOpcode() == ISD::BITCAST)
17583  N = N->getOperand(0).getNode();
17584 
17585  // Sometimes the operand may come from a insert_subvector building a 256-bit
17586  // allones vector
17587  if (VT.is256BitVector() &&
17588  N->getOpcode() == ISD::INSERT_SUBVECTOR) {
17589  SDValue V1 = N->getOperand(0);
17590  SDValue V2 = N->getOperand(1);
17591 
17592  if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
17593  V1.getOperand(0).getOpcode() == ISD::UNDEF &&
17594  ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
17595  ISD::isBuildVectorAllOnes(V2.getNode()))
17596  return true;
17597  }
17598 
17599  return false;
17600 }
17601 
17602 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
17603 // register. In most cases we actually compare or select YMM-sized registers
17604 // and mixing the two types creates horrible code. This method optimizes
17605 // some of the transition sequences.
17608  const X86Subtarget *Subtarget) {
17609  EVT VT = N->getValueType(0);
17610  if (!VT.is256BitVector())
17611  return SDValue();
17612 
17613  assert((N->getOpcode() == ISD::ANY_EXTEND ||
17614  N->getOpcode() == ISD::ZERO_EXTEND ||
17615  N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
17616 
17617  SDValue Narrow = N->getOperand(0);
17618  EVT NarrowVT = Narrow->getValueType(0);
17619  if (!NarrowVT.is128BitVector())
17620  return SDValue();
17621 
17622  if (Narrow->getOpcode() != ISD::XOR &&
17623  Narrow->getOpcode() != ISD::AND &&
17624  Narrow->getOpcode() != ISD::OR)
17625  return SDValue();
17626 
17627  SDValue N0 = Narrow->getOperand(0);
17628  SDValue N1 = Narrow->getOperand(1);
17629  SDLoc DL(Narrow);
17630 
17631  // The Left side has to be a trunc.
17632  if (N0.getOpcode() != ISD::TRUNCATE)
17633  return SDValue();
17634 
17635  // The type of the truncated inputs.
17636  EVT WideVT = N0->getOperand(0)->getValueType(0);
17637  if (WideVT != VT)
17638  return SDValue();
17639 
17640  // The right side has to be a 'trunc' or a constant vector.
17641  bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
17642  bool RHSConst = (isSplatVector(N1.getNode()) &&
17643  isa<ConstantSDNode>(N1->getOperand(0)));
17644  if (!RHSTrunc && !RHSConst)
17645  return SDValue();
17646 
17647  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17648 
17649  if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
17650  return SDValue();
17651 
17652  // Set N0 and N1 to hold the inputs to the new wide operation.
17653  N0 = N0->getOperand(0);
17654  if (RHSConst) {
17655  N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
17656  N1->getOperand(0));
17657  SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
17658  N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size());
17659  } else if (RHSTrunc) {
17660  N1 = N1->getOperand(0);
17661  }
17662 
17663  // Generate the wide operation.
17664  SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
17665  unsigned Opcode = N->getOpcode();
17666  switch (Opcode) {
17667  case ISD::ANY_EXTEND:
17668  return Op;
17669  case ISD::ZERO_EXTEND: {
17670  unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
17671  APInt Mask = APInt::getAllOnesValue(InBits);
17672  Mask = Mask.zext(VT.getScalarType().getSizeInBits());
17673  return DAG.getNode(ISD::AND, DL, VT,
17674  Op, DAG.getConstant(Mask, VT));
17675  }
17676  case ISD::SIGN_EXTEND:
17677  return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
17678  Op, DAG.getValueType(NarrowVT));
17679  default:
17680  llvm_unreachable("Unexpected opcode");
17681  }
17682 }
17683 
17684 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
17685  TargetLowering::DAGCombinerInfo &DCI,
17686  const X86Subtarget *Subtarget) {
17687  EVT VT = N->getValueType(0);
17688  if (DCI.isBeforeLegalizeOps())
17689  return SDValue();
17690 
17691  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
17692  if (R.getNode())
17693  return R;
17694 
17695  // Create BLSI, BLSR, and BZHI instructions
17696  // BLSI is X & (-X)
17697  // BLSR is X & (X-1)
17698  // BZHI is X & ((1 << Y) - 1)
17699  // BEXTR is ((X >> imm) & (2**size-1))
17700  if (VT == MVT::i32 || VT == MVT::i64) {
17701  SDValue N0 = N->getOperand(0);
17702  SDValue N1 = N->getOperand(1);
17703  SDLoc DL(N);
17704 
17705  if (Subtarget->hasBMI()) {
17706  // Check LHS for neg
17707  if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
17708  isZero(N0.getOperand(0)))
17709  return DAG.getNode(X86ISD::BLSI, DL, VT, N1);
17710 
17711  // Check RHS for neg
17712  if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
17713  isZero(N1.getOperand(0)))
17714  return DAG.getNode(X86ISD::BLSI, DL, VT, N0);
17715 
17716  // Check LHS for X-1
17717  if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
17718  isAllOnes(N0.getOperand(1)))
17719  return DAG.getNode(X86ISD::BLSR, DL, VT, N1);
17720 
17721  // Check RHS for X-1
17722  if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
17723  isAllOnes(N1.getOperand(1)))
17724  return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
17725  }
17726 
17727  if (Subtarget->hasBMI2()) {
17728  // Check for (and (add (shl 1, Y), -1), X)
17729  if (N0.getOpcode() == ISD::ADD && isAllOnes(N0.getOperand(1))) {
17730  SDValue N00 = N0.getOperand(0);
17731  if (N00.getOpcode() == ISD::SHL) {
17732  SDValue N001 = N00.getOperand(1);
17733  assert(N001.getValueType() == MVT::i8 && "unexpected type");
17734  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N00.getOperand(0));
17735  if (C && C->getZExtValue() == 1)
17736  return DAG.getNode(X86ISD::BZHI, DL, VT, N1, N001);
17737  }
17738  }
17739 
17740  // Check for (and X, (add (shl 1, Y), -1))
17741  if (N1.getOpcode() == ISD::ADD && isAllOnes(N1.getOperand(1))) {
17742  SDValue N10 = N1.getOperand(0);
17743  if (N10.getOpcode() == ISD::SHL) {
17744  SDValue N101 = N10.getOperand(1);
17745  assert(N101.getValueType() == MVT::i8 && "unexpected type");
17746  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N10.getOperand(0));
17747  if (C && C->getZExtValue() == 1)
17748  return DAG.getNode(X86ISD::BZHI, DL, VT, N0, N101);
17749  }
17750  }
17751  }
17752 
17753  // Check for BEXTR.
17754  if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
17755  (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
17756  ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
17757  ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
17758  if (MaskNode && ShiftNode) {
17759  uint64_t Mask = MaskNode->getZExtValue();
17760  uint64_t Shift = ShiftNode->getZExtValue();
17761  if (isMask_64(Mask)) {
17762  uint64_t MaskSize = CountPopulation_64(Mask);
17763  if (Shift + MaskSize <= VT.getSizeInBits())
17764  return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
17765  DAG.getConstant(Shift | (MaskSize << 8), VT));
17766  }
17767  }
17768  } // BEXTR
17769 
17770  return SDValue();
17771  }
17772 
17773  // Want to form ANDNP nodes:
17774  // 1) In the hopes of then easily combining them with OR and AND nodes
17775  // to form PBLEND/PSIGN.
17776  // 2) To match ANDN packed intrinsics
17777  if (VT != MVT::v2i64 && VT != MVT::v4i64)
17778  return SDValue();
17779 
17780  SDValue N0 = N->getOperand(0);
17781  SDValue N1 = N->getOperand(1);
17782  SDLoc DL(N);
17783 
17784  // Check LHS for vnot
17785  if (N0.getOpcode() == ISD::XOR &&
17786  //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
17787  CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
17788  return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
17789 
17790  // Check RHS for vnot
17791  if (N1.getOpcode() == ISD::XOR &&
17792  //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
17793  CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
17794  return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
17795 
17796  return SDValue();
17797 }
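Illustrative aside (not part of the original file): the scalar idioms that map onto the BMI/BMI2/TBM nodes formed above, as standalone C++ with hypothetical function names; actual instruction selection depends on the subtarget flags checked in the code.

#include <cstdint>   // assumed for this illustration only
uint32_t blsi (uint32_t x)             { return x & (0u - x); }          // isolate lowest set bit
uint32_t blsr (uint32_t x)             { return x & (x - 1u); }          // clear lowest set bit
uint32_t bzhi (uint32_t x, uint32_t y) { return x & ((1u << y) - 1u); }  // keep the low y bits (y < 32)
uint32_t bextr(uint32_t x)             { return (x >> 4) & 0xffu; }      // extract 8 bits starting at bit 4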
17798 
17799 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
17800  TargetLowering::DAGCombinerInfo &DCI,
17801  const X86Subtarget *Subtarget) {
17802  EVT VT = N->getValueType(0);
17803  if (DCI.isBeforeLegalizeOps())
17804  return SDValue();
17805 
17806  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
17807  if (R.getNode())
17808  return R;
17809 
17810  SDValue N0 = N->getOperand(0);
17811  SDValue N1 = N->getOperand(1);
17812 
17813  // look for psign/blend
17814  if (VT == MVT::v2i64 || VT == MVT::v4i64) {
17815  if (!Subtarget->hasSSSE3() ||
17816  (VT == MVT::v4i64 && !Subtarget->hasInt256()))
17817  return SDValue();
17818 
17819  // Canonicalize pandn to RHS
17820  if (N0.getOpcode() == X86ISD::ANDNP)
17821  std::swap(N0, N1);
17822  // or (and (m, y), (pandn m, x))
17823  if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
17824  SDValue Mask = N1.getOperand(0);
17825  SDValue X = N1.getOperand(1);
17826  SDValue Y;
17827  if (N0.getOperand(0) == Mask)
17828  Y = N0.getOperand(1);
17829  if (N0.getOperand(1) == Mask)
17830  Y = N0.getOperand(0);
17831 
17832  // Check to see if the mask appeared in both the AND and ANDNP; if not, bail.
17833  if (!Y.getNode())
17834  return SDValue();
17835 
17836  // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
17837  // Look through mask bitcast.
17838  if (Mask.getOpcode() == ISD::BITCAST)
17839  Mask = Mask.getOperand(0);
17840  if (X.getOpcode() == ISD::BITCAST)
17841  X = X.getOperand(0);
17842  if (Y.getOpcode() == ISD::BITCAST)
17843  Y = Y.getOperand(0);
17844 
17845  EVT MaskVT = Mask.getValueType();
17846 
17847  // Validate that the Mask operand is a vector sra node.
17848  // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
17849  // there is no psrai.b
17850  unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
17851  unsigned SraAmt = ~0;
17852  if (Mask.getOpcode() == ISD::SRA) {
17853  SDValue Amt = Mask.getOperand(1);
17854  if (isSplatVector(Amt.getNode())) {
17855  SDValue SclrAmt = Amt->getOperand(0);
17856  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt))
17857  SraAmt = C->getZExtValue();
17858  }
17859  } else if (Mask.getOpcode() == X86ISD::VSRAI) {
17860  SDValue SraC = Mask.getOperand(1);
17861  SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
17862  }
17863  if ((SraAmt + 1) != EltBits)
17864  return SDValue();
17865 
17866  SDLoc DL(N);
17867 
17868  // Now we know we at least have a pblendvb with the mask val. See if
17869  // we can form a psignb/w/d.
17870  // psign = x.type == y.type == mask.type && y = sub(0, x);
17871  if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
17872  ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
17873  X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
17874  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
17875  "Unsupported VT for PSIGN");
17876  Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
17877  return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
17878  }
17879  // PBLENDVB only available on SSE 4.1
17880  if (!Subtarget->hasSSE41())
17881  return SDValue();
17882 
17883  EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
17884 
17885  X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
17886  Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
17887  Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
17888  Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
17889  return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
17890  }
17891  }
17892 
17893  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
17894  return SDValue();
17895 
17896  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
17897  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
17898  std::swap(N0, N1);
17899  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
17900  return SDValue();
17901  if (!N0.hasOneUse() || !N1.hasOneUse())
17902  return SDValue();
17903 
17904  SDValue ShAmt0 = N0.getOperand(1);
17905  if (ShAmt0.getValueType() != MVT::i8)
17906  return SDValue();
17907  SDValue ShAmt1 = N1.getOperand(1);
17908  if (ShAmt1.getValueType() != MVT::i8)
17909  return SDValue();
17910  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
17911  ShAmt0 = ShAmt0.getOperand(0);
17912  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
17913  ShAmt1 = ShAmt1.getOperand(0);
17914 
17915  SDLoc DL(N);
17916  unsigned Opc = X86ISD::SHLD;
17917  SDValue Op0 = N0.getOperand(0);
17918  SDValue Op1 = N1.getOperand(0);
17919  if (ShAmt0.getOpcode() == ISD::SUB) {
17920  Opc = X86ISD::SHRD;
17921  std::swap(Op0, Op1);
17922  std::swap(ShAmt0, ShAmt1);
17923  }
17924 
17925  unsigned Bits = VT.getSizeInBits();
17926  if (ShAmt1.getOpcode() == ISD::SUB) {
17927  SDValue Sum = ShAmt1.getOperand(0);
17928  if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
17929  SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
17930  if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
17931  ShAmt1Op1 = ShAmt1Op1.getOperand(0);
17932  if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
17933  return DAG.getNode(Opc, DL, VT,
17934  Op0, Op1,
17935  DAG.getNode(ISD::TRUNCATE, DL,
17936  MVT::i8, ShAmt0));
17937  }
17938  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
17939  ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
17940  if (ShAmt0C &&
17941  ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
17942  return DAG.getNode(Opc, DL, VT,
17943  N0.getOperand(0), N1.getOperand(0),
17944  DAG.getNode(ISD::TRUNCATE, DL,
17945  MVT::i8, ShAmt0));
17946  }
17947 
17948  return SDValue();
17949 }
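Illustrative aside (not part of the original file): the double-shift pattern folded into SHLD/SHRD above, as standalone C++ with a hypothetical function name; the guard on c is needed because (y >> 64) is undefined in C++ even though SHLD handles the full funnel shift.

#include <cstdint>   // assumed for this illustration only
uint64_t shld64(uint64_t x, uint64_t y, unsigned c) {
  c &= 63;
  return c ? (x << c) | (y >> (64 - c)) : x;   // matches (or (shl x, c), (srl y, 64-c))
}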
17950 
17951 // Generate NEG and CMOV for integer abs.
17952 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
17953  EVT VT = N->getValueType(0);
17954 
17955  // Since X86 does not have CMOV for 8-bit integer, we don't convert
17956  // 8-bit integer abs to NEG and CMOV.
17957  if (VT.isInteger() && VT.getSizeInBits() == 8)
17958  return SDValue();
17959 
17960  SDValue N0 = N->getOperand(0);
17961  SDValue N1 = N->getOperand(1);
17962  SDLoc DL(N);
17963 
17964  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
17965  // and change it to SUB and CMOV.
17966  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
17967  N0.getOpcode() == ISD::ADD &&
17968  N0.getOperand(1) == N1 &&
17969  N1.getOpcode() == ISD::SRA &&
17970  N1.getOperand(0) == N0.getOperand(0))
17971  if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
17972  if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
17973  // Generate SUB & CMOV.
17974  SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
17975  DAG.getConstant(0, VT), N0.getOperand(0));
17976 
17977  SDValue Ops[] = { N0.getOperand(0), Neg,
17978  DAG.getConstant(X86::COND_GE, MVT::i8),
17979  SDValue(Neg.getNode(), 1) };
17980  return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue),
17981  Ops, array_lengthof(Ops));
17982  }
17983  return SDValue();
17984 }
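Illustrative aside (not part of the original file): the branchless-abs source pattern XOR(ADD(X, Y), Y) with Y = X >> 31 that this combine rewrites into NEG + CMOV, sketched as C++ with a hypothetical function name; unsigned arithmetic is used to sidestep the INT_MIN overflow caveat, and an arithmetic right shift is assumed (as on x86).

#include <cstdint>   // assumed for this illustration only
int32_t iabs(int32_t x) {
  uint32_t y = (uint32_t)(x >> 31);          // all ones if negative, else zero
  return (int32_t)(((uint32_t)x + y) ^ y);   // == |x|; rewritten to sub + cmov
}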
17985 
17986 // PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
17987 static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
17988  TargetLowering::DAGCombinerInfo &DCI,
17989  const X86Subtarget *Subtarget) {
17990  EVT VT = N->getValueType(0);
17991  if (DCI.isBeforeLegalizeOps())
17992  return SDValue();
17993 
17994  if (Subtarget->hasCMov()) {
17995  SDValue RV = performIntegerAbsCombine(N, DAG);
17996  if (RV.getNode())
17997  return RV;
17998  }
17999 
18000  // Try forming BMI if it is available.
18001  if (!Subtarget->hasBMI())
18002  return SDValue();
18003 
18004  if (VT != MVT::i32 && VT != MVT::i64)
18005  return SDValue();
18006 
18007  assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions");
18008 
18009  // Create BLSMSK instructions by finding X ^ (X-1)
18010  SDValue N0 = N->getOperand(0);
18011  SDValue N1 = N->getOperand(1);
18012  SDLoc DL(N);
18013 
18014  if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
18015  isAllOnes(N0.getOperand(1)))
18016  return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1);
18017 
18018  if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
18019  isAllOnes(N1.getOperand(1)))
18020  return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0);
18021 
18022  return SDValue();
18023 }
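Illustrative aside (not part of the original file): the X ^ (X - 1) idiom matched above; BLSMSK produces a mask covering the lowest set bit and everything below it. The function name is hypothetical.

#include <cstdint>   // assumed for this illustration only
uint32_t blsmsk(uint32_t x) { return x ^ (x - 1u); }   // e.g. 0b101000 -> 0b001111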
18024 
18025 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
18026 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
18027  TargetLowering::DAGCombinerInfo &DCI,
18028  const X86Subtarget *Subtarget) {
18029  LoadSDNode *Ld = cast<LoadSDNode>(N);
18030  EVT RegVT = Ld->getValueType(0);
18031  EVT MemVT = Ld->getMemoryVT();
18032  SDLoc dl(Ld);
18033  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18034  unsigned RegSz = RegVT.getSizeInBits();
18035 
18036  // On Sandybridge unaligned 256bit loads are inefficient.
18037  ISD::LoadExtType Ext = Ld->getExtensionType();
18038  unsigned Alignment = Ld->getAlignment();
18039  bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
18040  if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
18041  !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
18042  unsigned NumElems = RegVT.getVectorNumElements();
18043  if (NumElems < 2)
18044  return SDValue();
18045 
18046  SDValue Ptr = Ld->getBasePtr();
18047  SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
18048 
18049  EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
18050  NumElems/2);
18051  SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
18052  Ld->getPointerInfo(), Ld->isVolatile(),
18053  Ld->isNonTemporal(), Ld->isInvariant(),
18054  Alignment);
18055  Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
18056  SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
18057  Ld->getPointerInfo(), Ld->isVolatile(),
18058  Ld->isNonTemporal(), Ld->isInvariant(),
18059  std::min(16U, Alignment));
18060  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
18061  Load1.getValue(1),
18062  Load2.getValue(1));
18063 
18064  SDValue NewVec = DAG.getUNDEF(RegVT);
18065  NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
18066  NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
18067  return DCI.CombineTo(N, NewVec, TF, true);
18068  }
18069 
18070  // If this is a vector EXT Load then attempt to optimize it using a
18071  // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
18072  // expansion is still better than scalar code.
18073  // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll
18074  // emit a shuffle and an arithmetic shift.
18075  // TODO: It is possible to support ZExt by zeroing the undef values
18076  // during the shuffle phase or after the shuffle.
18077  if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() &&
18078  (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) {
18079  assert(MemVT != RegVT && "Cannot extend to the same type");
18080  assert(MemVT.isVector() && "Must load a vector from memory");
18081 
18082  unsigned NumElems = RegVT.getVectorNumElements();
18083  unsigned MemSz = MemVT.getSizeInBits();
18084  assert(RegSz > MemSz && "Register size must be greater than the mem size");
18085 
18086  if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256())
18087  return SDValue();
18088 
18089  // All sizes must be a power of two.
18090  if (!isPowerOf2_32(RegSz * MemSz * NumElems))
18091  return SDValue();
18092 
18093  // Attempt to load the original value using scalar loads.
18094  // Find the largest scalar type that divides the total loaded size.
18095  MVT SclrLoadTy = MVT::i8;
18096  for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
18097  tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
18098  MVT Tp = (MVT::SimpleValueType)tp;
18099  if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
18100  SclrLoadTy = Tp;
18101  }
18102  }
18103 
18104  // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
18105  if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
18106  (64 <= MemSz))
18107  SclrLoadTy = MVT::f64;
18108 
18109  // Calculate the number of scalar loads that we need to perform
18110  // in order to load our vector from memory.
18111  unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
18112  if (Ext == ISD::SEXTLOAD && NumLoads > 1)
18113  return SDValue();
18114 
18115  unsigned loadRegZize = RegSz;
18116  if (Ext == ISD::SEXTLOAD && RegSz == 256)
18117  loadRegZize /= 2;
18118 
18119  // Represent our vector as a sequence of elements which are the
18120  // largest scalar that we can load.
18121  EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
18122  loadRegZize/SclrLoadTy.getSizeInBits());
18123 
18124  // Represent the data using the same element type that is stored in
18125  // memory. In practice, we ''widen'' MemVT.
18126  EVT WideVecVT =
18127  EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
18128  loadRegZize/MemVT.getScalarType().getSizeInBits());
18129 
18130  assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
18131  "Invalid vector type");
18132 
18133  // We can't shuffle using an illegal type.
18134  if (!TLI.isTypeLegal(WideVecVT))
18135  return SDValue();
18136 
18137  SmallVector<SDValue, 8> Chains;
18138  SDValue Ptr = Ld->getBasePtr();
18139  SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8,
18140  TLI.getPointerTy());
18141  SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
18142 
18143  for (unsigned i = 0; i < NumLoads; ++i) {
18144  // Perform a single load.
18145  SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
18146  Ptr, Ld->getPointerInfo(),
18147  Ld->isVolatile(), Ld->isNonTemporal(),
18148  Ld->isInvariant(), Ld->getAlignment());
18149  Chains.push_back(ScalarLoad.getValue(1));
18150  // Create the first element type using SCALAR_TO_VECTOR in order to avoid
18151  // another round of DAGCombining.
18152  if (i == 0)
18153  Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
18154  else
18155  Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
18156  ScalarLoad, DAG.getIntPtrConstant(i));
18157 
18158  Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
18159  }
18160 
18161  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
18162  Chains.size());
18163 
18164  // Bitcast the loaded value to a vector of the original element type, in
18165  // the size of the target vector type.
18166  SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
18167  unsigned SizeRatio = RegSz/MemSz;
18168 
18169  if (Ext == ISD::SEXTLOAD) {
18170  // If we have SSE4.1 we can directly emit a VSEXT node.
18171  if (Subtarget->hasSSE41()) {
18172  SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
18173  return DCI.CombineTo(N, Sext, TF, true);
18174  }
18175 
18176  // Otherwise we'll shuffle the small elements in the high bits of the
18177  // larger type and perform an arithmetic shift. If the shift is not legal
18178  // it's better to scalarize.
18179  if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT))
18180  return SDValue();
18181 
18182  // Redistribute the loaded elements into the different locations.
18183  SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
18184  for (unsigned i = 0; i != NumElems; ++i)
18185  ShuffleVec[i*SizeRatio + SizeRatio-1] = i;
18186 
18187  SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
18188  DAG.getUNDEF(WideVecVT),
18189  &ShuffleVec[0]);
18190 
18191  Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
18192 
18193  // Build the arithmetic shift.
18194  unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
18195  MemVT.getVectorElementType().getSizeInBits();
18196  Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff,
18197  DAG.getConstant(Amt, RegVT));
18198 
18199  return DCI.CombineTo(N, Shuff, TF, true);
18200  }
18201 
18202  // Redistribute the loaded elements into the different locations.
18203  SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
18204  for (unsigned i = 0; i != NumElems; ++i)
18205  ShuffleVec[i*SizeRatio] = i;
18206 
18207  SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
18208  DAG.getUNDEF(WideVecVT),
18209  &ShuffleVec[0]);
18210 
18211  // Bitcast to the requested type.
18212  Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
18213  // Replace the original load with the new sequence
18214  // and return the new chain.
18215  return DCI.CombineTo(N, Shuff, TF, true);
18216  }
18217 
18218  return SDValue();
18219 }
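// Illustrative standalone sketch (not part of X86ISelLowering.cpp; names are
// made up). The SEXTLOAD fallback above relies on a simple scalar identity:
// place the narrow value in the high bits of a wider lane, then arithmetic-
// shift right. Assumes two's complement and an arithmetic '>>' on signed
// values (guaranteed by C++20, true on the usual compilers before that).
#include <cassert>
#include <cstdint>

static int32_t SignExtend16ViaShift(uint16_t Narrow) {
  // Analogue of ShuffleVec[i*SizeRatio + SizeRatio-1] = i followed by the SRA:
  // the 16-bit payload lands in the top half of the 32-bit lane.
  int32_t Wide = static_cast<int32_t>(static_cast<uint32_t>(Narrow) << 16);
  return Wide >> 16;
}

int main() {
  assert(SignExtend16ViaShift(0x1234) == 0x1234);
  assert(SignExtend16ViaShift(0x8000) == -32768);
  return 0;
}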
18220 
18221 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
18222 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
18223  const X86Subtarget *Subtarget) {
18224  StoreSDNode *St = cast<StoreSDNode>(N);
18225  EVT VT = St->getValue().getValueType();
18226  EVT StVT = St->getMemoryVT();
18227  SDLoc dl(St);
18228  SDValue StoredVal = St->getOperand(1);
18229  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18230 
18231  // If we are saving a concatenation of two XMM registers, perform two stores.
18232  // On Sandy Bridge, 256-bit memory operations are executed by two
18233  // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
18234  // memory operation.
18235  unsigned Alignment = St->getAlignment();
18236  bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
18237  if (VT.is256BitVector() && !Subtarget->hasInt256() &&
18238  StVT == VT && !IsAligned) {
18239  unsigned NumElems = VT.getVectorNumElements();
18240  if (NumElems < 2)
18241  return SDValue();
18242 
18243  SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
18244  SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
18245 
18246  SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
18247  SDValue Ptr0 = St->getBasePtr();
18248  SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
18249 
18250  SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
18251  St->getPointerInfo(), St->isVolatile(),
18252  St->isNonTemporal(), Alignment);
18253  SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
18254  St->getPointerInfo(), St->isVolatile(),
18255  St->isNonTemporal(),
18256  std::min(16U, Alignment));
18257  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
18258  }
18259 
18260  // Optimize trunc store (of multiple scalars) to shuffle and store.
18261  // First, pack all of the elements in one place. Next, store to memory
18262  // in fewer chunks.
18263  if (St->isTruncatingStore() && VT.isVector()) {
18264  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18265  unsigned NumElems = VT.getVectorNumElements();
18266  assert(StVT != VT && "Cannot truncate to the same type");
18267  unsigned FromSz = VT.getVectorElementType().getSizeInBits();
18268  unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
18269 
18270  // From, To sizes and ElemCount must be pow of two
18271  if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
18272  // We are going to use the original vector elt for storing.
18273  // Accumulated smaller vector elements must be a multiple of the store size.
18274  if (0 != (NumElems * FromSz) % ToSz) return SDValue();
18275 
18276  unsigned SizeRatio = FromSz / ToSz;
18277 
18278  assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
18279 
18280  // Create a type on which we perform the shuffle
18281  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
18282  StVT.getScalarType(), NumElems*SizeRatio);
18283 
18284  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
18285 
18286  SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
18287  SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
18288  for (unsigned i = 0; i != NumElems; ++i)
18289  ShuffleVec[i] = i * SizeRatio;
18290 
18291  // Can't shuffle using an illegal type.
18292  if (!TLI.isTypeLegal(WideVecVT))
18293  return SDValue();
18294 
18295  SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
18296  DAG.getUNDEF(WideVecVT),
18297  &ShuffleVec[0]);
18298  // At this point all of the data is stored at the bottom of the
18299  // register. We now need to save it to mem.
18300 
18301  // Find the largest store unit
18302  MVT StoreType = MVT::i8;
18303  for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
18304  tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
18305  MVT Tp = (MVT::SimpleValueType)tp;
18306  if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
18307  StoreType = Tp;
18308  }
18309 
18310  // On 32-bit systems, we can't store 64-bit integers directly. Try bitcasting to f64.
18311  if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
18312  (64 <= NumElems * ToSz))
18313  StoreType = MVT::f64;
18314 
18315  // Bitcast the original vector into a vector of store-size units
18316  EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
18317  StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
18318  assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
18319  SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
18320  SmallVector<SDValue, 8> Chains;
18321  SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
18322  TLI.getPointerTy());
18323  SDValue Ptr = St->getBasePtr();
18324 
18325  // Perform one or more big stores into memory.
18326  for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
18327  SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
18328  StoreType, ShuffWide,
18329  DAG.getIntPtrConstant(i));
18330  SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
18331  St->getPointerInfo(), St->isVolatile(),
18332  St->isNonTemporal(), St->getAlignment());
18333  Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
18334  Chains.push_back(Ch);
18335  }
18336 
18337  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
18338  Chains.size());
18339  }
18340 
18341  // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
18342  // the FP state in cases where an emms may be missing.
18343  // A preferable solution to the general problem is to figure out the right
18344  // places to insert EMMS. This qualifies as a quick hack.
18345 
18346  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
18347  if (VT.getSizeInBits() != 64)
18348  return SDValue();
18349 
18350  const Function *F = DAG.getMachineFunction().getFunction();
18351  bool NoImplicitFloatOps = F->getAttributes().
18352  hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
18353  bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
18354  && Subtarget->hasSSE2();
18355  if ((VT.isVector() ||
18356  (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
18357  isa<LoadSDNode>(St->getValue()) &&
18358  !cast<LoadSDNode>(St->getValue())->isVolatile() &&
18359  St->getChain().hasOneUse() && !St->isVolatile()) {
18360  SDNode* LdVal = St->getValue().getNode();
18361  LoadSDNode *Ld = 0;
18362  int TokenFactorIndex = -1;
18363  SmallVector<SDValue, 8> Ops;
18364  SDNode* ChainVal = St->getChain().getNode();
18365  // Must be a store of a load. We currently handle two cases: the load
18366  // is a direct child, and it's under an intervening TokenFactor. It is
18367  // possible to dig deeper under nested TokenFactors.
18368  if (ChainVal == LdVal)
18369  Ld = cast<LoadSDNode>(St->getChain());
18370  else if (St->getValue().hasOneUse() &&
18371  ChainVal->getOpcode() == ISD::TokenFactor) {
18372  for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
18373  if (ChainVal->getOperand(i).getNode() == LdVal) {
18374  TokenFactorIndex = i;
18375  Ld = cast<LoadSDNode>(St->getValue());
18376  } else
18377  Ops.push_back(ChainVal->getOperand(i));
18378  }
18379  }
18380 
18381  if (!Ld || !ISD::isNormalLoad(Ld))
18382  return SDValue();
18383 
18384  // If this is not the MMX case, i.e. we are just turning i64 load/store
18385  // into f64 load/store, avoid the transformation if there are multiple
18386  // uses of the loaded value.
18387  if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
18388  return SDValue();
18389 
18390  SDLoc LdDL(Ld);
18391  SDLoc StDL(N);
18392  // If we are a 64-bit capable x86, lower to a single movq load/store pair.
18393  // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
18394  // pair instead.
18395  if (Subtarget->is64Bit() || F64IsLegal) {
18396  EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
18397  SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
18398  Ld->getPointerInfo(), Ld->isVolatile(),
18399  Ld->isNonTemporal(), Ld->isInvariant(),
18400  Ld->getAlignment());
18401  SDValue NewChain = NewLd.getValue(1);
18402  if (TokenFactorIndex != -1) {
18403  Ops.push_back(NewChain);
18404  NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
18405  Ops.size());
18406  }
18407  return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
18408  St->getPointerInfo(),
18409  St->isVolatile(), St->isNonTemporal(),
18410  St->getAlignment());
18411  }
18412 
18413  // Otherwise, lower to two pairs of 32-bit loads / stores.
18414  SDValue LoAddr = Ld->getBasePtr();
18415  SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
18416  DAG.getConstant(4, MVT::i32));
18417 
18418  SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
18419  Ld->getPointerInfo(),
18420  Ld->isVolatile(), Ld->isNonTemporal(),
18421  Ld->isInvariant(), Ld->getAlignment());
18422  SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
18423  Ld->getPointerInfo().getWithOffset(4),
18424  Ld->isVolatile(), Ld->isNonTemporal(),
18425  Ld->isInvariant(),
18426  MinAlign(Ld->getAlignment(), 4));
18427 
18428  SDValue NewChain = LoLd.getValue(1);
18429  if (TokenFactorIndex != -1) {
18430  Ops.push_back(LoLd);
18431  Ops.push_back(HiLd);
18432  NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
18433  Ops.size());
18434  }
18435 
18436  LoAddr = St->getBasePtr();
18437  HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
18438  DAG.getConstant(4, MVT::i32));
18439 
18440  SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
18441  St->getPointerInfo(),
18442  St->isVolatile(), St->isNonTemporal(),
18443  St->getAlignment());
18444  SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
18445  St->getPointerInfo().getWithOffset(4),
18446  St->isVolatile(),
18447  St->isNonTemporal(),
18448  MinAlign(St->getAlignment(), 4));
18449  return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
18450  }
18451  return SDValue();
18452 }
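// Illustrative standalone sketch (not part of X86ISelLowering.cpp) of the
// truncating-store shuffle built above: with FromSz=32 and ToSz=16,
// ShuffleVec[i] = i*SizeRatio selects lanes 0, 2, 4, 6 of the widened v8i16
// view, which on a little-endian target is the low half of each original
// 32-bit element, so the truncated data ends up packed at the bottom.
#include <cstdio>
#include <vector>

int main() {
  const unsigned NumElems = 4, FromSz = 32, ToSz = 16;
  const unsigned SizeRatio = FromSz / ToSz;              // 2
  std::vector<int> ShuffleVec(NumElems * SizeRatio, -1); // -1 marks undef lanes
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i] = i * SizeRatio;
  for (int M : ShuffleVec)
    std::printf("%d ", M);                               // prints: 0 2 4 6 -1 -1 -1 -1
  std::printf("\n");
  return 0;
}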
18453 
18454 /// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal"
18455 /// and return the operands for the horizontal operation in LHS and RHS. A
18456 /// horizontal operation performs the binary operation on successive elements
18457 /// of its first operand, then on successive elements of its second operand,
18458 /// returning the resulting values in a vector. For example, if
18459 /// A = < float a0, float a1, float a2, float a3 >
18460 /// and
18461 /// B = < float b0, float b1, float b2, float b3 >
18462 /// then the result of doing a horizontal operation on A and B is
18463 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
18464 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
18465 /// A horizontal-op B, for some already available A and B, and if so then LHS is
18466 /// set to A, RHS to B, and the routine returns 'true'.
18467 /// Note that the binary operation should have the property that if one of the
18468 /// operands is UNDEF then the result is UNDEF.
18469 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
18470  // Look for the following pattern: if
18471  // A = < float a0, float a1, float a2, float a3 >
18472  // B = < float b0, float b1, float b2, float b3 >
18473  // and
18474  // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
18475  // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
18476  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
18477  // which is A horizontal-op B.
18478 
18479  // At least one of the operands should be a vector shuffle.
18480  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
18481  RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
18482  return false;
18483 
18484  MVT VT = LHS.getSimpleValueType();
18485 
18486  assert((VT.is128BitVector() || VT.is256BitVector()) &&
18487  "Unsupported vector type for horizontal add/sub");
18488 
18489  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
18490  // operate independently on 128-bit lanes.
18491  unsigned NumElts = VT.getVectorNumElements();
18492  unsigned NumLanes = VT.getSizeInBits()/128;
18493  unsigned NumLaneElts = NumElts / NumLanes;
18494  assert((NumLaneElts % 2 == 0) &&
18495  "Vector type should have an even number of elements in each lane");
18496  unsigned HalfLaneElts = NumLaneElts/2;
18497 
18498  // View LHS in the form
18499  // LHS = VECTOR_SHUFFLE A, B, LMask
18500  // If LHS is not a shuffle then pretend it is the shuffle
18501  // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
18502  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
18503  // type VT.
18504  SDValue A, B;
18505  SmallVector<int, 16> LMask(NumElts);
18506  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
18507  if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
18508  A = LHS.getOperand(0);
18509  if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
18510  B = LHS.getOperand(1);
18511  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
18512  std::copy(Mask.begin(), Mask.end(), LMask.begin());
18513  } else {
18514  if (LHS.getOpcode() != ISD::UNDEF)
18515  A = LHS;
18516  for (unsigned i = 0; i != NumElts; ++i)
18517  LMask[i] = i;
18518  }
18519 
18520  // Likewise, view RHS in the form
18521  // RHS = VECTOR_SHUFFLE C, D, RMask
18522  SDValue C, D;
18523  SmallVector<int, 16> RMask(NumElts);
18524  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
18525  if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
18526  C = RHS.getOperand(0);
18527  if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
18528  D = RHS.getOperand(1);
18529  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
18530  std::copy(Mask.begin(), Mask.end(), RMask.begin());
18531  } else {
18532  if (RHS.getOpcode() != ISD::UNDEF)
18533  C = RHS;
18534  for (unsigned i = 0; i != NumElts; ++i)
18535  RMask[i] = i;
18536  }
18537 
18538  // Check that the shuffles are both shuffling the same vectors.
18539  if (!(A == C && B == D) && !(A == D && B == C))
18540  return false;
18541 
18542  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
18543  if (!A.getNode() && !B.getNode())
18544  return false;
18545 
18546  // If A and B occur in reverse order in RHS, then "swap" them (which means
18547  // rewriting the mask).
18548  if (A != C)
18549  CommuteVectorShuffleMask(RMask, NumElts);
18550 
18551  // At this point LHS and RHS are equivalent to
18552  // LHS = VECTOR_SHUFFLE A, B, LMask
18553  // RHS = VECTOR_SHUFFLE A, B, RMask
18554  // Check that the masks correspond to performing a horizontal operation.
18555  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
18556  for (unsigned i = 0; i != NumLaneElts; ++i) {
18557  int LIdx = LMask[i+l], RIdx = RMask[i+l];
18558 
18559  // Ignore any UNDEF components.
18560  if (LIdx < 0 || RIdx < 0 ||
18561  (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
18562  (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
18563  continue;
18564 
18565  // Check that successive elements are being operated on. If not, this is
18566  // not a horizontal operation.
18567  unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
18568  int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
18569  if (!(LIdx == Index && RIdx == Index + 1) &&
18570  !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
18571  return false;
18572  }
18573  }
18574 
18575  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
18576  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
18577  return true;
18578 }
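// Illustrative standalone print-out (not part of X86ISelLowering.cpp) of the
// index pattern the loop above accepts, shown for a 256-bit v8f32 horizontal
// op: within each 128-bit lane, the low half of the result reads adjacent
// pairs from A and the high half reads adjacent pairs from B.
#include <cstdio>

int main() {
  const unsigned NumElts = 8, NumLanes = 2;
  const unsigned NumLaneElts = NumElts / NumLanes;       // 4
  const unsigned HalfLaneElts = NumLaneElts / 2;         // 2
  for (unsigned l = 0; l != NumElts; l += NumLaneElts)
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      unsigned Src = i / HalfLaneElts;                   // 0 = from A, 1 = from B
      int Index = 2 * (i % HalfLaneElts) + NumElts * Src + l;
      std::printf("result[%u] <- (%d op %d)\n", l + i, Index, Index + 1);
    }
  return 0;
}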
18579 
18580 /// PerformFADDCombine - Do target-specific dag combines on floating point adds.
18581 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
18582  const X86Subtarget *Subtarget) {
18583  EVT VT = N->getValueType(0);
18584  SDValue LHS = N->getOperand(0);
18585  SDValue RHS = N->getOperand(1);
18586 
18587  // Try to synthesize horizontal adds from adds of shuffles.
18588  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
18589  (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
18590  isHorizontalBinOp(LHS, RHS, true))
18591  return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
18592  return SDValue();
18593 }
18594 
18595 /// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
18596 static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
18597  const X86Subtarget *Subtarget) {
18598  EVT VT = N->getValueType(0);
18599  SDValue LHS = N->getOperand(0);
18600  SDValue RHS = N->getOperand(1);
18601 
18602  // Try to synthesize horizontal subs from subs of shuffles.
18603  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
18604  (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
18605  isHorizontalBinOp(LHS, RHS, false))
18606  return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
18607  return SDValue();
18608 }
18609 
18610 /// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
18611 /// X86ISD::FXOR nodes.
18612 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
18613  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
18614  // F[X]OR(0.0, x) -> x
18615  // F[X]OR(x, 0.0) -> x
18616  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
18617  if (C->getValueAPF().isPosZero())
18618  return N->getOperand(1);
18619  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
18620  if (C->getValueAPF().isPosZero())
18621  return N->getOperand(0);
18622  return SDValue();
18623 }
18624 
18625 /// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and
18626 /// X86ISD::FMAX nodes.
18627 static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
18628  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
18629 
18630  // Only perform optimizations if UnsafeMath is used.
18631  if (!DAG.getTarget().Options.UnsafeFPMath)
18632  return SDValue();
18633 
18634  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
18635  // into FMINC and FMAXC, which are Commutative operations.
18636  unsigned NewOp = 0;
18637  switch (N->getOpcode()) {
18638  default: llvm_unreachable("unknown opcode");
18639  case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
18640  case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
18641  }
18642 
18643  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
18644  N->getOperand(0), N->getOperand(1));
18645 }
18646 
18647 /// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
18648 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
18649  // FAND(0.0, x) -> 0.0
18650  // FAND(x, 0.0) -> 0.0
18651  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
18652  if (C->getValueAPF().isPosZero())
18653  return N->getOperand(0);
18654  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
18655  if (C->getValueAPF().isPosZero())
18656  return N->getOperand(1);
18657  return SDValue();
18658 }
18659 
18660 /// PerformFANDNCombine - Do target-specific dag combines on X86ISD::FANDN nodes
18661 static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
18662  // FANDN(x, 0.0) -> 0.0
18663  // FANDN(0.0, x) -> x
18664  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
18665  if (C->getValueAPF().isPosZero())
18666  return N->getOperand(1);
18667  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
18668  if (C->getValueAPF().isPosZero())
18669  return N->getOperand(1);
18670  return SDValue();
18671 }
18672 
18673 static SDValue PerformBTCombine(SDNode *N,
18674  SelectionDAG &DAG,
18675  TargetLowering::DAGCombinerInfo &DCI) {
18676  // BT ignores high bits in the bit index operand.
18677  SDValue Op1 = N->getOperand(1);
18678  if (Op1.hasOneUse()) {
18679  unsigned BitWidth = Op1.getValueSizeInBits();
18680  APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
18681  APInt KnownZero, KnownOne;
18682  TargetLowering::TargetLoweringOpt TLO(DAG, !!DCI.isBeforeLegalize(),
18683  !DCI.isBeforeLegalizeOps());
18684  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18685  if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
18686  TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
18687  DCI.CommitTargetLoweringOpt(TLO);
18688  }
18689  return SDValue();
18690 }
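// Illustrative standalone sketch (not part of X86ISelLowering.cpp) of the
// demanded-bits reasoning above: for a BT on a 32-bit register only
// Log2_32(32) = 5 low bits of the index can influence the result, so a
// constant index of 37 behaves like 5.
#include <cassert>
#include <cstdint>

int main() {
  const unsigned BitWidth = 32;
  unsigned Log2 = 0;
  while ((1u << (Log2 + 1)) <= BitWidth)
    ++Log2;                                        // floor(log2(32)) == 5
  const uint32_t DemandedMask = (1u << Log2) - 1;  // APInt::getLowBitsSet(32, 5)
  assert(DemandedMask == 0x1Fu);
  assert((37u & DemandedMask) == 5u);
  return 0;
}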
18691 
18692 static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
18693  SDValue Op = N->getOperand(0);
18694  if (Op.getOpcode() == ISD::BITCAST)
18695  Op = Op.getOperand(0);
18696  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
18697  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
18698  VT.getVectorElementType().getSizeInBits() ==
18699  OpVT.getVectorElementType().getSizeInBits()) {
18700  return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
18701  }
18702  return SDValue();
18703 }
18704 
18705 static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
18706  const X86Subtarget *Subtarget) {
18707  EVT VT = N->getValueType(0);
18708  if (!VT.isVector())
18709  return SDValue();
18710 
18711  SDValue N0 = N->getOperand(0);
18712  SDValue N1 = N->getOperand(1);
18713  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
18714  SDLoc dl(N);
18715 
18716  // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
18717  // SSE and AVX2, since there is no sign-extended shift-right
18718  // operation on a vector with 64-bit elements.
18719  //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
18720  // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
18721  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
18722  N0.getOpcode() == ISD::SIGN_EXTEND)) {
18723  SDValue N00 = N0.getOperand(0);
18724 
18725  // EXTLOAD has a better solution on AVX2,
18726  // it may be replaced with X86ISD::VSEXT node.
18727  if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
18728  if (!ISD::isNormalLoad(N00.getNode()))
18729  return SDValue();
18730 
18731  if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
18732  SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
18733  N00, N1);
18734  return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
18735  }
18736  }
18737  return SDValue();
18738 }
18739 
18740 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
18741  X86TargetLowering::DAGCombinerInfo &DCI,
18742  const X86Subtarget *Subtarget) {
18743  if (!DCI.isBeforeLegalizeOps())
18744  return SDValue();
18745 
18746  if (!Subtarget->hasFp256())
18747  return SDValue();
18748 
18749  EVT VT = N->getValueType(0);
18750  if (VT.isVector() && VT.getSizeInBits() == 256) {
18751  SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
18752  if (R.getNode())
18753  return R;
18754  }
18755 
18756  return SDValue();
18757 }
18758 
18759 static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
18760  const X86Subtarget* Subtarget) {
18761  SDLoc dl(N);
18762  EVT VT = N->getValueType(0);
18763 
18764  // Let legalize expand this if it isn't a legal type yet.
18765  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18766  return SDValue();
18767 
18768  EVT ScalarVT = VT.getScalarType();
18769  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
18770  (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
18771  return SDValue();
18772 
18773  SDValue A = N->getOperand(0);
18774  SDValue B = N->getOperand(1);
18775  SDValue C = N->getOperand(2);
18776 
18777  bool NegA = (A.getOpcode() == ISD::FNEG);
18778  bool NegB = (B.getOpcode() == ISD::FNEG);
18779  bool NegC = (C.getOpcode() == ISD::FNEG);
18780 
18781  // Negative multiplication when NegA xor NegB
18782  bool NegMul = (NegA != NegB);
18783  if (NegA)
18784  A = A.getOperand(0);
18785  if (NegB)
18786  B = B.getOperand(0);
18787  if (NegC)
18788  C = C.getOperand(0);
18789 
18790  unsigned Opcode;
18791  if (!NegMul)
18792  Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
18793  else
18794  Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
18795 
18796  return DAG.getNode(Opcode, dl, VT, A, B, C);
18797 }
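// Illustrative standalone numeric check (not part of X86ISelLowering.cpp) of
// the sign folding above: moving FNEGs onto an FMA's operands matches the
// FMADD/FMSUB/FNMADD/FNMSUB forms selected by (NegMul, NegC). Sample values
// are chosen to be exact in binary floating point.
#include <cassert>
#include <cmath>

int main() {
  const double A = 1.5, B = -2.25, C = 0.75;
  assert(std::fma(A, B, C) == (A * B) + C);        // !NegMul, !NegC -> FMADD
  assert(std::fma(A, B, -C) == (A * B) - C);       // !NegMul,  NegC -> FMSUB
  assert(std::fma(-A, B, C) == -(A * B) + C);      //  NegMul, !NegC -> FNMADD
  assert(std::fma(-A, B, -C) == -(A * B) - C);     //  NegMul,  NegC -> FNMSUB
  return 0;
}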
18798 
18799 static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
18800  X86TargetLowering::DAGCombinerInfo &DCI,
18801  const X86Subtarget *Subtarget) {
18802  // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
18803  // (and (i32 x86isd::setcc_carry), 1)
18804  // This eliminates the zext. This transformation is necessary because
18805  // ISD::SETCC is always legalized to i8.
18806  SDLoc dl(N);
18807  SDValue N0 = N->getOperand(0);
18808  EVT VT = N->getValueType(0);
18809 
18810  if (N0.getOpcode() == ISD::AND &&
18811  N0.hasOneUse() &&
18812  N0.getOperand(0).hasOneUse()) {
18813  SDValue N00 = N0.getOperand(0);
18814  if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
18815  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
18816  if (!C || C->getZExtValue() != 1)
18817  return SDValue();
18818  return DAG.getNode(ISD::AND, dl, VT,
18819  DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
18820  N00.getOperand(0), N00.getOperand(1)),
18821  DAG.getConstant(1, VT));
18822  }
18823  }
18824 
18825  if (VT.is256BitVector()) {
18826  SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
18827  if (R.getNode())
18828  return R;
18829  }
18830 
18831  return SDValue();
18832 }
18833 
18834 // Optimize x == -y --> x+y == 0
18835 // x != -y --> x+y != 0
18836 static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
18837  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
18838  SDValue LHS = N->getOperand(0);
18839  SDValue RHS = N->getOperand(1);
18840 
18841  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
18842  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
18843  if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
18844  SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
18845  LHS.getValueType(), RHS, LHS.getOperand(1));
18846  return DAG.getSetCC(SDLoc(N), N->getValueType(0),
18847  addV, DAG.getConstant(0, addV.getValueType()), CC);
18848  }
18849  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
18850  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
18851  if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
18852  SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
18853  RHS.getValueType(), LHS, RHS.getOperand(1));
18854  return DAG.getSetCC(SDLoc(N), N->getValueType(0),
18855  addV, DAG.getConstant(0, addV.getValueType()), CC);
18856  }
18857  return SDValue();
18858 }
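// Illustrative standalone check (not part of X86ISelLowering.cpp) of the
// rewrite above: in wrapping (two's complement) arithmetic, x == -y holds
// exactly when x + y == 0, which is why the compare can be re-expressed
// against the ADD.
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 1u, 42u, 0x80000000u, 0xFFFFFFFFu})
    for (uint32_t Y : {0u, 1u, 42u, 0x80000000u, 0xFFFFFFFFu})
      assert((X == 0u - Y) == (X + Y == 0u));
  return 0;
}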
18859 
18860 // Helper function of PerformSETCCCombine. It materializes "setb reg"
18861 // as "sbb reg,reg", since that form can be extended without a zext and
18862 // produces an all-ones value, which is more useful than 0/1 in some cases.
18863 static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
18864  return DAG.getNode(ISD::AND, DL, MVT::i8,
18865  DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
18866  DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
18867  DAG.getConstant(1, MVT::i8));
18868 }
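// Illustrative standalone model (not part of X86ISelLowering.cpp) of why
// "sbb reg,reg" is a useful way to materialize setb: reg - reg - CF is 0 or
// all-ones depending on the carry flag, and the AND with 1 built above
// recovers the plain 0/1 value when needed.
#include <cassert>
#include <cstdint>

static uint32_t SbbSameReg(uint32_t Reg, unsigned CF) {
  return Reg - Reg - CF;                       // wraps to 0xFFFFFFFF when CF == 1
}

int main() {
  for (unsigned CF = 0; CF != 2; ++CF) {
    const uint32_t V = SbbSameReg(0x12345678u, CF);
    assert(V == (CF ? 0xFFFFFFFFu : 0u));      // "extended" setb result
    assert((V & 1u) == CF);                    // the trailing AND 1
  }
  return 0;
}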
18869 
18870 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
18871 static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
18872  X86TargetLowering::DAGCombinerInfo &DCI,
18873  const X86Subtarget *Subtarget) {
18874  SDLoc DL(N);
18875  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
18876  SDValue EFLAGS = N->getOperand(1);
18877 
18878  if (CC == X86::COND_A) {
18879  // Try to convert COND_A into COND_B in an attempt to facilitate
18880  // materializing "setb reg".
18881  //
18882  // Do not flip "e > c", where "c" is a constant, because Cmp instruction
18883  // cannot take an immediate as its first operand.
18884  //
18885  if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
18886  EFLAGS.getValueType().isInteger() &&
18887  !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
18888  SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
18889  EFLAGS.getNode()->getVTList(),
18890  EFLAGS.getOperand(1), EFLAGS.getOperand(0));
18891  SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
18892  return MaterializeSETB(DL, NewEFLAGS, DAG);
18893  }
18894  }
18895 
18896  // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
18897  // a zext and produces an all-ones bit which is more useful than 0/1 in some
18898  // cases.
18899  if (CC == X86::COND_B)
18900  return MaterializeSETB(DL, EFLAGS, DAG);
18901 
18902  SDValue Flags;
18903 
18904  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
18905  if (Flags.getNode()) {
18906  SDValue Cond = DAG.getConstant(CC, MVT::i8);
18907  return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
18908  }
18909 
18910  return SDValue();
18911 }
18912 
18913 // Optimize branch condition evaluation.
18914 //
18915 static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
18916  X86TargetLowering::DAGCombinerInfo &DCI,
18917  const X86Subtarget *Subtarget) {
18918  SDLoc DL(N);
18919  SDValue Chain = N->getOperand(0);
18920  SDValue Dest = N->getOperand(1);
18921  SDValue EFLAGS = N->getOperand(3);
18922  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
18923 
18924  SDValue Flags;
18925 
18926  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
18927  if (Flags.getNode()) {
18928  SDValue Cond = DAG.getConstant(CC, MVT::i8);
18929  return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
18930  Flags);
18931  }
18932 
18933  return SDValue();
18934 }
18935 
18936 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
18937  const X86TargetLowering *XTLI) {
18938  SDValue Op0 = N->getOperand(0);
18939  EVT InVT = Op0->getValueType(0);
18940 
18941  // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
18942  if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
18943  SDLoc dl(N);
18944  MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
18945  SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
18946  return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
18947  }
18948 
18949  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
18950  // a 32-bit target where SSE doesn't support i64->FP operations.
18951  if (Op0.getOpcode() == ISD::LOAD) {
18952  LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
18953  EVT VT = Ld->getValueType(0);
18954  if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
18955  ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
18956  !XTLI->getSubtarget()->is64Bit() &&
18957  VT == MVT::i64) {
18958  SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
18959  Ld->getChain(), Op0, DAG);
18960  DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
18961  return FILDChain;
18962  }
18963  }
18964  return SDValue();
18965 }
18966 
18967 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
18968 static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
18969  X86TargetLowering::DAGCombinerInfo &DCI) {
18970  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
18971  // the result is either zero or one (depending on the input carry bit).
18972  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
18973  if (X86::isZeroNode(N->getOperand(0)) &&
18974  X86::isZeroNode(N->getOperand(1)) &&
18975  // We don't have a good way to replace an EFLAGS use, so only do this when
18976  // dead right now.
18977  SDValue(N, 1).use_empty()) {
18978  SDLoc DL(N);
18979  EVT VT = N->getValueType(0);
18980  SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
18981  SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
18982  DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
18983  DAG.getConstant(X86::COND_B, MVT::i8),
18984  N->getOperand(2)),
18985  DAG.getConstant(1, VT));
18986  return DCI.CombineTo(N, Res1, CarryOut);
18987  }
18988 
18989  return SDValue();
18990 }
18991 
18992 // fold (add Y, (sete X, 0)) -> adc 0, Y
18993 // (add Y, (setne X, 0)) -> sbb -1, Y
18994 // (sub (sete X, 0), Y) -> sbb 0, Y
18995 // (sub (setne X, 0), Y) -> adc -1, Y
18996 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
18997  SDLoc DL(N);
18998 
18999  // Look through ZExts.
19000  SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
19001  if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
19002  return SDValue();
19003 
19004  SDValue SetCC = Ext.getOperand(0);
19005  if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
19006  return SDValue();
19007 
19008  X86::CondCode CC = X86::CondCode(SetCC.getConstantOperandVal(0));
19009  if (CC != X86::COND_E && CC != X86::COND_NE)
19010  return SDValue();
19011 
19012  SDValue Cmp = SetCC.getOperand(1);
19013  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
19014  !X86::isZeroNode(Cmp.getOperand(1)) ||
19015  !Cmp.getOperand(0).getValueType().isInteger())
19016  return SDValue();
19017 
19018  SDValue CmpOp0 = Cmp.getOperand(0);
19019  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
19020  DAG.getConstant(1, CmpOp0.getValueType()));
19021 
19022  SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
19023  if (CC == X86::COND_NE)
19024  return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
19025  DL, OtherVal.getValueType(), OtherVal,
19026  DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
19027  return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
19028  DL, OtherVal.getValueType(), OtherVal,
19029  DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
19030 }
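// Illustrative standalone model (not part of X86ISelLowering.cpp) of the flag
// trick used above: "cmp X, 1" sets the carry flag exactly when X == 0
// (unsigned X < 1), so the zero-extended sete/setne can be folded into
// ADC/SBB as the comments before the function describe.
#include <cassert>
#include <cstdint>

static unsigned CarryOfCmpX1(uint32_t X) { return X < 1u ? 1u : 0u; }

int main() {
  const uint32_t Y = 41;
  for (uint32_t X : {0u, 1u, 7u, 0xFFFFFFFFu}) {
    const uint32_t SetE = (X == 0u) ? 1u : 0u;              // zext(sete X, 0)
    const uint32_t SetNE = 1u - SetE;                       // zext(setne X, 0)
    assert(Y + SetE == Y + 0u + CarryOfCmpX1(X));           // adc 0, Y
    assert(Y + SetNE == Y - 0xFFFFFFFFu - CarryOfCmpX1(X)); // sbb -1, Y
  }
  return 0;
}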
19031 
19032 /// PerformADDCombine - Do target-specific dag combines on integer adds.
19033 static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
19034  const X86Subtarget *Subtarget) {
19035  EVT VT = N->getValueType(0);
19036  SDValue Op0 = N->getOperand(0);
19037  SDValue Op1 = N->getOperand(1);
19038 
19039  // Try to synthesize horizontal adds from adds of shuffles.
19040  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
19041  (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
19042  isHorizontalBinOp(Op0, Op1, true))
19043  return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
19044 
19045  return OptimizeConditionalInDecrement(N, DAG);
19046 }
19047 
19048 static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
19049  const X86Subtarget *Subtarget) {
19050  SDValue Op0 = N->getOperand(0);
19051  SDValue Op1 = N->getOperand(1);
19052 
19053  // X86 can't encode an immediate LHS of a sub. See if we can push the
19054  // negation into a preceding instruction.
19055  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
19056  // If the RHS of the sub is an XOR with one use and a constant, invert the
19057  // immediate. Then add one to the LHS of the sub so we can turn
19058  // X-Y -> X+~Y+1, saving one register.
19059  if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
19060  isa<ConstantSDNode>(Op1.getOperand(1))) {
19061  APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
19062  EVT VT = Op0.getValueType();
19063  SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
19064  Op1.getOperand(0),
19065  DAG.getConstant(~XorC, VT));
19066  return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
19067  DAG.getConstant(C->getAPIntValue()+1, VT));
19068  }
19069  }
19070 
19071  // Try to synthesize horizontal subs from subs of shuffles.
19072  EVT VT = N->getValueType(0);
19073  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
19074  (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
19075  isHorizontalBinOp(Op0, Op1, true))
19076  return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
19077 
19078  return OptimizeConditionalInDecrement(N, DAG);
19079 }
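// Illustrative standalone check (not part of X86ISelLowering.cpp) of the
// immediate-LHS rewrite above: in wrapping arithmetic,
//   C - (y ^ XorC) == (y ^ ~XorC) + (C + 1),
// because y ^ ~XorC == ~(y ^ XorC) == -(y ^ XorC) - 1. Sample values only.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C = 100u, XorC = 0x0F0F0F0Fu;
  for (uint32_t Y : {0u, 1u, 0xDEADBEEFu, 0xFFFFFFFFu})
    assert(C - (Y ^ XorC) == (Y ^ ~XorC) + (C + 1u));
  return 0;
}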
19080 
19081 /// performVZEXTCombine - Do target-specific dag combines on X86ISD::VZEXT nodes.
19082 static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
19083  X86TargetLowering::DAGCombinerInfo &DCI,
19084  const X86Subtarget *Subtarget) {
19085  // (vzext (bitcast (vzext (x)) -> (vzext x)
19086  SDValue In = N->getOperand(0);
19087  while (In.getOpcode() == ISD::BITCAST)
19088  In = In.getOperand(0);
19089 
19090  if (In.getOpcode() != X86ISD::VZEXT)
19091  return SDValue();
19092 
19093  return DAG.getNode(X86ISD::VZEXT, SDLoc(N), N->getValueType(0),
19094  In.getOperand(0));
19095 }
19096 
19097 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
19098  DAGCombinerInfo &DCI) const {
19099  SelectionDAG &DAG = DCI.DAG;
19100  switch (N->getOpcode()) {
19101  default: break;
19102  case ISD::EXTRACT_VECTOR_ELT:
19103  return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
19104  case ISD::VSELECT:
19105  case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget);
19106  case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
19107  case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
19108  case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
19109  case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI);
19110  case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
19111  case ISD::SHL:
19112  case ISD::SRA:
19113  case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget);
19114  case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
19115  case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
19116  case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
19117  case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
19118  case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
19119  case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
19120  case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
19121  case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
19122  case X86ISD::FXOR:
19123  case X86ISD::FOR: return PerformFORCombine(N, DAG);
19124  case X86ISD::FMIN:
19125  case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG);
19126  case X86ISD::FAND: return PerformFANDCombine(N, DAG);
19127  case X86ISD::FANDN: return PerformFANDNCombine(N, DAG);
19128  case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
19129  case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
19130  case ISD::ANY_EXTEND:
19131  case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
19132  case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
19133  case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
19134  case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget);
19135  case ISD::SETCC: return PerformISDSETCCCombine(N, DAG);
19136  case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
19137  case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
19138  case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget);
19139  case X86ISD::SHUFP: // Handle all target specific shuffles
19140  case X86ISD::PALIGNR:
19141  case X86ISD::UNPCKH:
19142  case X86ISD::UNPCKL:
19143  case X86ISD::MOVHLPS:
19144  case X86ISD::MOVLHPS:
19145  case X86ISD::PSHUFD:
19146  case X86ISD::PSHUFHW:
19147  case X86ISD::PSHUFLW:
19148  case X86ISD::MOVSS:
19149  case X86ISD::MOVSD:
19150  case X86ISD::VPERMILP:
19151  case X86ISD::VPERM2X128:
19152  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
19153  case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
19154  }
19155 
19156  return SDValue();
19157 }
19158 
19159 /// isTypeDesirableForOp - Return true if the target has native support for
19160 /// the specified value type and it is 'desirable' to use the type for the
19161 /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
19162 /// instruction encodings are longer and some i16 instructions are slow.
19163 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
19164  if (!isTypeLegal(VT))
19165  return false;
19166  if (VT != MVT::i16)
19167  return true;
19168 
19169  switch (Opc) {
19170  default:
19171  return true;
19172  case ISD::LOAD:
19173  case ISD::SIGN_EXTEND:
19174  case ISD::ZERO_EXTEND:
19175  case ISD::ANY_EXTEND:
19176  case ISD::SHL:
19177  case ISD::SRL:
19178  case ISD::SUB:
19179  case ISD::ADD:
19180  case ISD::MUL:
19181  case ISD::AND:
19182  case ISD::OR:
19183  case ISD::XOR:
19184  return false;
19185  }
19186 }
19187 
19188 /// IsDesirableToPromoteOp - This method queries the target whether it is
19189 /// beneficial for the dag combiner to promote the specified node. If true, it
19190 /// should return the desired promotion type by reference.
19191 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
19192  EVT VT = Op.getValueType();
19193  if (VT != MVT::i16)
19194  return false;
19195 
19196  bool Promote = false;
19197  bool Commute = false;
19198  switch (Op.getOpcode()) {
19199  default: break;
19200  case ISD::LOAD: {
19201  LoadSDNode *LD = cast<LoadSDNode>(Op);
19202  // If the non-extending load has a single use and it's not live out, then it
19203  // might be folded.
19204  if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
19205  Op.hasOneUse()*/) {
19206  for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
19207  UE = Op.getNode()->use_end(); UI != UE; ++UI) {
19208  // The only case where we'd want to promote LOAD (rather than it being
19209  // promoted as an operand) is when its only use is liveout.
19210  if (UI->getOpcode() != ISD::CopyToReg)
19211  return false;
19212  }
19213  }
19214  Promote = true;
19215  break;
19216  }
19217  case ISD::SIGN_EXTEND:
19218  case ISD::ZERO_EXTEND:
19219  case ISD::ANY_EXTEND:
19220  Promote = true;
19221  break;
19222  case ISD::SHL:
19223  case ISD::SRL: {
19224  SDValue N0 = Op.getOperand(0);
19225  // Look out for (store (shl (load), x)).
19226  if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
19227  return false;
19228  Promote = true;
19229  break;
19230  }
19231  case ISD::ADD:
19232  case ISD::MUL:
19233  case ISD::AND:
19234  case ISD::OR:
19235  case ISD::XOR:
19236  Commute = true;
19237  // fallthrough
19238  case ISD::SUB: {
19239  SDValue N0 = Op.getOperand(0);
19240  SDValue N1 = Op.getOperand(1);
19241  if (!Commute && MayFoldLoad(N1))
19242  return false;
19243  // Avoid disabling potential load folding opportunities.
19244  if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
19245  return false;
19246  if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
19247  return false;
19248  Promote = true;
19249  }
19250  }
19251 
19252  PVT = MVT::i32;
19253  return Promote;
19254 }
19255 
19256 //===----------------------------------------------------------------------===//
19257 // X86 Inline Assembly Support
19258 //===----------------------------------------------------------------------===//
19259 
19260 namespace {
19261  // Helper to match a string against a sequence of whitespace-separated pieces.
19262  bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
19263  s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
19264 
19265  for (unsigned i = 0, e = args.size(); i != e; ++i) {
19266  StringRef piece(*args[i]);
19267  if (!s.startswith(piece)) // Check if the piece matches.
19268  return false;
19269 
19270  s = s.substr(piece.size());
19271  StringRef::size_type pos = s.find_first_not_of(" \t");
19272  if (pos == 0) // We matched a prefix.
19273  return false;
19274 
19275  s = s.substr(pos);
19276  }
19277 
19278  return s.empty();
19279  }
19280  const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
19281 }
19282 
19283 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
19284 
19285  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
19286  if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
19287  std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
19288  std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
19289 
19290  if (AsmPieces.size() == 3)
19291  return true;
19292  else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
19293  return true;
19294  }
19295  }
19296  return false;
19297 }
19298 
19299 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
19300  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
19301 
19302  std::string AsmStr = IA->getAsmString();
19303 
19304  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
19305  if (!Ty || Ty->getBitWidth() % 16 != 0)
19306  return false;
19307 
19308  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
19309  SmallVector<StringRef, 4> AsmPieces;
19310  SplitString(AsmStr, AsmPieces, ";\n");
19311 
19312  switch (AsmPieces.size()) {
19313  default: return false;
19314  case 1:
19315  // FIXME: this should verify that we are targeting a 486 or better. If not,
19316  // we will turn this bswap into something that will be lowered to logical
19317  // ops instead of emitting the bswap asm. For now, we don't support 486 or
19318  // lower so don't worry about this.
19319  // bswap $0
19320  if (matchAsm(AsmPieces[0], "bswap", "$0") ||
19321  matchAsm(AsmPieces[0], "bswapl", "$0") ||
19322  matchAsm(AsmPieces[0], "bswapq", "$0") ||
19323  matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
19324  matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
19325  matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
19326  // No need to check constraints, nothing other than the equivalent of
19327  // "=r,0" would be valid here.
19328  return IntrinsicLowering::LowerToByteSwap(CI);
19329  }
19330 
19331  // rorw $$8, ${0:w} --> llvm.bswap.i16
19332  if (CI->getType()->isIntegerTy(16) &&
19333  IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
19334  (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
19335  matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
19336  AsmPieces.clear();
19337  const std::string &ConstraintsStr = IA->getConstraintString();
19338  SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
19339  array_pod_sort(AsmPieces.begin(), AsmPieces.end());
19340  if (clobbersFlagRegisters(AsmPieces))
19341  return IntrinsicLowering::LowerToByteSwap(CI);
19342  }
19343  break;
19344  case 3:
19345  if (CI->getType()->isIntegerTy(32) &&
19346  IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
19347  matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
19348  matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
19349  matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
19350  AsmPieces.clear();
19351  const std::string &ConstraintsStr = IA->getConstraintString();
19352  SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
19353  array_pod_sort(AsmPieces.begin(), AsmPieces.end());
19354  if (clobbersFlagRegisters(AsmPieces))
19355  return IntrinsicLowering::LowerToByteSwap(CI);
19356  }
19357 
19358  if (CI->getType()->isIntegerTy(64)) {
19359  InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
19360  if (Constraints.size() >= 2 &&
19361  Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
19362  Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
19363  // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
19364  if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
19365  matchAsm(AsmPieces[1], "bswap", "%edx") &&
19366  matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
19367  return IntrinsicLowering::LowerToByteSwap(CI);
19368  }
19369  }
19370  break;
19371  }
19372  return false;
19373 }
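// Illustrative caller-side examples (not part of X86ISelLowering.cpp, assuming
// GCC/Clang-style inline asm on x86) of the patterns ExpandInlineAsm
// recognizes. At the IR level the first becomes the asm string "bswap $0" and
// the second "rorw $$8, ${0:w}" with the flag clobbers checked by
// clobbersFlagRegisters, so both are rewritten into llvm.bswap.* calls.
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
#include <cstdint>

static inline uint32_t BSwap32ViaAsm(uint32_t X) {
  asm("bswap %0" : "=r"(X) : "0"(X));              // matched by the "bswap $0" case
  return X;
}

static inline uint16_t BSwap16ViaAsm(uint16_t X) {
  asm("rorw $8, %w0" : "=r"(X) : "0"(X) : "cc");   // matched by the rorw/rolw case
  return X;
}
#endif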
19374 
19375 /// getConstraintType - Given a constraint letter, return the type of
19376 /// constraint it is for this target.
19377 X86TargetLowering::ConstraintType
19378 X86TargetLowering::getConstraintType(const std::string &Constraint) const {
19379  if (Constraint.size() == 1) {
19380  switch (Constraint[0]) {
19381  case 'R':
19382  case 'q':
19383  case 'Q':
19384  case 'f':
19385  case 't':
19386  case 'u':
19387  case 'y':
19388  case 'x':
19389  case 'Y':
19390  case 'l':
19391  return C_RegisterClass;
19392  case 'a':
19393  case 'b':
19394  case 'c':
19395  case 'd':
19396  case 'S':
19397  case 'D':
19398  case 'A':
19399  return C_Register;
19400  case 'I':
19401  case 'J':
19402  case 'K':
19403  case 'L':
19404  case 'M':
19405  case 'N':
19406  case 'G':
19407  case 'C':
19408  case 'e':
19409  case 'Z':
19410  return C_Other;
19411  default:
19412  break;
19413  }
19414  }
19415  return TargetLowering::getConstraintType(Constraint);
19416 }
19417 
19418 /// Examine constraint type and operand type and determine a weight value.
19419 /// This object must already have been set up with the operand type
19420 /// and the current alternative constraint selected.
19421 TargetLowering::ConstraintWeight
19422  X86TargetLowering::getSingleConstraintMatchWeight(
19423  AsmOperandInfo &info, const char *constraint) const {
19424  ConstraintWeight weight = CW_Invalid;
19425  Value *CallOperandVal = info.CallOperandVal;
19426  // If we don't have a value, we can't do a match,
19427  // but allow it at the lowest weight.
19428  if (CallOperandVal == NULL)
19429  return CW_Default;
19430  Type *type = CallOperandVal->getType();
19431  // Look at the constraint type.
19432  switch (*constraint) {
19433  default:
19434  weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
19435  case 'R':
19436  case 'q':
19437  case 'Q':
19438  case 'a':
19439  case 'b':
19440  case 'c':
19441  case 'd':
19442  case 'S':
19443  case 'D':
19444  case 'A':
19445  if (CallOperandVal->getType()->isIntegerTy())
19446  weight = CW_SpecificReg;
19447  break;
19448  case 'f':
19449  case 't':
19450  case 'u':
19451  if (type->isFloatingPointTy())
19452  weight = CW_SpecificReg;
19453  break;
19454  case 'y':
19455  if (type->isX86_MMXTy() && Subtarget->hasMMX())
19456  weight = CW_SpecificReg;
19457  break;
19458  case 'x':
19459  case 'Y':
19460  if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
19461  ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
19462  weight = CW_Register;
19463  break;
19464  case 'I':
19465  if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
19466  if (C->getZExtValue() <= 31)
19467  weight = CW_Constant;
19468  }
19469  break;
19470  case 'J':
19471  if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
19472  if (C->getZExtValue() <= 63)
19473  weight = CW_Constant;
19474  }
19475  break;
19476  case 'K':
19477  if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
19478  if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
19479  weight = CW_Constant;
19480  }
19481  break;
19482  case 'L':
19483  if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
19484  if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
19485  weight = CW_Constant;
19486  }
19487  break;
19488  case 'M':
19489  if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
19490  if (C->getZExtValue() <= 3)
19491  weight = CW_Constant;
19492  }
19493  break;
19494  case 'N':
19495  if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
19496  if (C->getZExtValue() <= 0xff)
19497  weight = CW_Constant;
19498  }
19499  break;
19500  case 'G':
19501  case 'C':
19502  if (dyn_cast<ConstantFP>(CallOperandVal)) {
19503  weight = CW_Constant;
19504  }
19505  break;
19506  case 'e':
19507  if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
19508  if ((C->getSExtValue() >= -0x80000000LL) &&
19509  (C->getSExtValue() <= 0x7fffffffLL))
19510  weight = CW_Constant;
19511  }
19512  break;
19513  case 'Z':
19514  if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
19515  if (C->getZExtValue() <= 0xffffffff)
19516  weight = CW_Constant;
19517  }
19518  break;
19519  }
19520  return weight;
19521 }
19522 
19523 /// LowerXConstraint - try to replace an X constraint, which matches anything,
19524 /// with another that has more specific requirements based on the type of the
19525 /// corresponding operand.
19526 const char *X86TargetLowering::
19527 LowerXConstraint(EVT ConstraintVT) const {
19528  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
19529  // 'f' like normal targets.
19530  if (ConstraintVT.isFloatingPoint()) {
19531  if (Subtarget->hasSSE2())
19532  return "Y";
19533  if (Subtarget->hasSSE1())
19534  return "x";
19535  }
19536 
19537  return TargetLowering::LowerXConstraint(ConstraintVT);
19538 }
19539 
19540 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
19541 /// vector. If it is invalid, don't add anything to Ops.
19542 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
19543  std::string &Constraint,
19544  std::vector<SDValue>&Ops,
19545  SelectionDAG &DAG) const {
19546  SDValue Result(0, 0);
19547 
19548  // Only support length 1 constraints for now.
19549  if (Constraint.length() > 1) return;
19550 
19551  char ConstraintLetter = Constraint[0];
19552  switch (ConstraintLetter) {
19553  default: break;
19554  case 'I':
19555  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
19556  if (C->getZExtValue() <= 31) {
19557  Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
19558  break;
19559  }
19560  }
19561  return;
19562  case 'J':
19563  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
19564  if (C->getZExtValue() <= 63) {
19565  Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
19566  break;
19567  }
19568  }
19569  return;
19570  case 'K':
19571  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
19572  if (isInt<8>(C->getSExtValue())) {
19573  Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
19574  break;
19575  }
19576  }
19577  return;
19578  case 'N':
19579  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
19580  if (C->getZExtValue() <= 255) {
19581  Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
19582  break;
19583  }
19584  }
19585  return;
19586  case 'e': {
19587  // 32-bit signed value
19588  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
19589  if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
19590  C->getSExtValue())) {
19591  // Widen to 64 bits here to get it sign extended.
19592  Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
19593  break;
19594  }
19595  // FIXME gcc accepts some relocatable values here too, but only in certain
19596  // memory models; it's complicated.
19597  }
19598  return;
19599  }
19600  case 'Z': {
19601  // 32-bit unsigned value
19602  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
19603  if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
19604  C->getZExtValue())) {
19605  Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
19606  break;
19607  }
19608  }
19609  // FIXME gcc accepts some relocatable values here too, but only in certain
19610  // memory models; it's complicated.
19611  return;
19612  }
19613  case 'i': {
19614  // Literal immediates are always ok.
19615  if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
19616  // Widen to 64 bits here to get it sign extended.
19617  Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
19618  break;
19619  }
19620 
19621  // In any sort of PIC mode addresses need to be computed at runtime by
19622  // adding in a register or some sort of table lookup. These can't
19623  // be used as immediates.
19624  if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
19625  return;
19626 
19627  // If we are in non-pic codegen mode, we allow the address of a global (with
19628  // an optional displacement) to be used with 'i'.
19629  GlobalAddressSDNode *GA = 0;
19630  int64_t Offset = 0;
19631 
19632  // Match either (GA), (GA+C), (GA+C1+C2), etc.
19633  while (1) {
19634  if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
19635  Offset += GA->getOffset();
19636  break;
19637  } else if (Op.getOpcode() == ISD::ADD) {
19638  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
19639  Offset += C->getZExtValue();
19640  Op = Op.getOperand(0);
19641  continue;
19642  }
19643  } else if (Op.getOpcode() == ISD::SUB) {
19644  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
19645  Offset += -C->getZExtValue();
19646  Op = Op.getOperand(0);
19647  continue;
19648  }
19649  }
19650 
19651  // Otherwise, this isn't something we can handle, reject it.
19652  return;
19653  }
19654 
19655  const GlobalValue *GV = GA->getGlobal();
19656  // If we require an extra load to get this address, as in PIC mode, we
19657  // can't accept it.
19658  if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
19659  getTargetMachine())))
19660  return;
19661 
19662  Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
19663  GA->getValueType(0), Offset);
19664  break;
19665  }
19666  }
19667 
19668  if (Result.getNode()) {
19669  Ops.push_back(Result);
19670  return;
19671  }
19672  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
19673 }
19674 
19675 std::pair<unsigned, const TargetRegisterClass*>
19676 X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
19677  MVT VT) const {
19678  // First, see if this is a constraint that directly corresponds to an LLVM
19679  // register class.
19680  if (Constraint.size() == 1) {
19681  // GCC Constraint Letters
19682  switch (Constraint[0]) {
19683  default: break;
19684  // TODO: Slight differences here in allocation order and leaving
19685  // RIP in the class. Do they matter any more here than they do
19686  // in the normal allocation?
19687  case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
19688  if (Subtarget->is64Bit()) {
19689  if (VT == MVT::i32 || VT == MVT::f32)
19690  return std::make_pair(0U, &X86::GR32RegClass);
19691  if (VT == MVT::i16)
19692  return std::make_pair(0U, &X86::GR16RegClass);
19693  if (VT == MVT::i8 || VT == MVT::i1)
19694  return std::make_pair(0U, &X86::GR8RegClass);
19695  if (VT == MVT::i64 || VT == MVT::f64)
19696  return std::make_pair(0U, &X86::GR64RegClass);
19697  break;
19698  }
19699  // 32-bit fallthrough
19700  case 'Q': // Q_REGS
19701  if (VT == MVT::i32 || VT == MVT::f32)
19702  return std::make_pair(0U, &X86::GR32_ABCDRegClass);
19703  if (VT == MVT::i16)
19704  return std::make_pair(0U, &X86::GR16_ABCDRegClass);
19705  if (VT == MVT::i8 || VT == MVT::i1)
19706  return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
19707  if (VT == MVT::i64)
19708  return std::make_pair(0U, &X86::GR64_ABCDRegClass);
19709  break;
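// Illustrative aside (editorial, not part of the original source): a hedged
// example of the 'q' / 'Q' distinction above, with acc and delta standing in
// for arbitrary variables:
//
//   unsigned char carry;
//   asm("addl %2, %1\n\tsetc %0" : "=q"(carry), "+r"(acc) : "r"(delta));
//
// In 64-bit mode "=q" may use any byte register (GR8); in 32-bit mode it
// falls through to the 'Q' mapping and is limited to AL/BL/CL/DL, the only
// byte registers encodable there.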
19710  case 'r': // GENERAL_REGS
19711  case 'l': // INDEX_REGS
19712  if (VT == MVT::i8 || VT == MVT::i1)
19713  return std::make_pair(0U, &X86::GR8RegClass);
19714  if (VT == MVT::i16)
19715  return std::make_pair(0U, &X86::GR16RegClass);
19716  if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
19717  return std::make_pair(0U, &X86::GR32RegClass);
19718  return std::make_pair(0U, &X86::GR64RegClass);
19719  case 'R': // LEGACY_REGS
19720  if (VT == MVT::i8 || VT == MVT::i1)
19721  return std::make_pair(0U, &X86::GR8_NOREXRegClass);
19722  if (VT == MVT::i16)
19723  return std::make_pair(0U, &X86::GR16_NOREXRegClass);
19724  if (VT == MVT::i32 || !Subtarget->is64Bit())
19725  return std::make_pair(0U, &X86::GR32_NOREXRegClass);
19726  return std::make_pair(0U, &X86::GR64_NOREXRegClass);
19727  case 'f': // FP Stack registers.
19728  // If SSE is enabled for this VT, use f80 to ensure the isel moves the
19729  // value to the correct fpstack register class.
19730  if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
19731  return std::make_pair(0U, &X86::RFP32RegClass);
19732  if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
19733  return std::make_pair(0U, &X86::RFP64RegClass);
19734  return std::make_pair(0U, &X86::RFP80RegClass);
19735  case 'y': // MMX_REGS if MMX allowed.
19736  if (!Subtarget->hasMMX()) break;
19737  return std::make_pair(0U, &X86::VR64RegClass);
19738  case 'Y': // SSE_REGS if SSE2 allowed
19739  if (!Subtarget->hasSSE2()) break;
19740  // FALL THROUGH.
19741  case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
19742  if (!Subtarget->hasSSE1()) break;
19743 
19744  switch (VT.SimpleTy) {
19745  default: break;
19746  // Scalar SSE types.
19747  case MVT::f32:
19748  case MVT::i32:
19749  return std::make_pair(0U, &X86::FR32RegClass);
19750  case MVT::f64:
19751  case MVT::i64:
19752  return std::make_pair(0U, &X86::FR64RegClass);
19753  // Vector types.
19754  case MVT::v16i8:
19755  case MVT::v8i16:
19756  case MVT::v4i32:
19757  case MVT::v2i64:
19758  case MVT::v4f32:
19759  case MVT::v2f64:
19760  return std::make_pair(0U, &X86::VR128RegClass);
19761  // AVX types.
19762  case MVT::v32i8:
19763  case MVT::v16i16:
19764  case MVT::v8i32:
19765  case MVT::v4i64:
19766  case MVT::v8f32:
19767  case MVT::v4f64:
19768  return std::make_pair(0U, &X86::VR256RegClass);
19769  case MVT::v8f64:
19770  case MVT::v16f32:
19771  case MVT::v16i32:
19772  case MVT::v8i64:
19773  return std::make_pair(0U, &X86::VR512RegClass);
19774  }
19775  break;
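// Illustrative aside (editorial, not part of the original source): a hedged
// sketch of the 'x' mapping above for a 128-bit vector type (pshufd needs
// SSE2):
//
//   typedef int v4si __attribute__((vector_size(16)));
//   static inline v4si broadcast_lane0(v4si v) {
//     v4si r;
//     // Both operands get VR128 (an XMM register) via the v4i32 case above.
//     asm("pshufd $0x00, %1, %0" : "=x"(r) : "x"(v));
//     return r;
//   }
//
// With AVX or AVX-512 available, 256-bit and 512-bit vector types take the
// VR256 / VR512 rows of the same switch instead.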
19776  }
19777  }
19778 
19779  // Use the default implementation in TargetLowering to convert the register
19780  // constraint into a member of a register class.
19781  std::pair<unsigned, const TargetRegisterClass*> Res;
19782  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
19783 
19784  // Not found as a standard register?
19785  if (Res.second == 0) {
19786  // Map "st(0)" .. "st(7)" to the corresponding ST0 .. ST7 registers.
19787  if (Constraint.size() == 7 && Constraint[0] == '{' &&
19788  tolower(Constraint[1]) == 's' &&
19789  tolower(Constraint[2]) == 't' &&
19790  Constraint[3] == '(' &&
19791  (Constraint[4] >= '0' && Constraint[4] <= '7') &&
19792  Constraint[5] == ')' &&
19793  Constraint[6] == '}') {
19794 
19795  Res.first = X86::ST0+Constraint[4]-'0';
19796  Res.second = &X86::RFP80RegClass;
19797  return Res;
19798  }
19799 
19800  // GCC allows "st(0)" to be called just plain "st".
19801  if (StringRef("{st}").equals_lower(Constraint)) {
19802  Res.first = X86::ST0;
19803  Res.second = &X86::RFP80RegClass;
19804  return Res;
19805  }
19806 
19807  // flags -> EFLAGS
19808  if (StringRef("{flags}").equals_lower(Constraint)) {
19809  Res.first = X86::EFLAGS;
19810  Res.second = &X86::CCRRegClass;
19811  return Res;
19812  }
19813 
19814  // 'A' means EAX + EDX.
19815  if (Constraint == "A") {
19816  Res.first = X86::EAX;
19817  Res.second = &X86::GR32_ADRegClass;
19818  return Res;
19819  }
19820  return Res;
19821  }
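// Illustrative aside (editorial, not part of the original source): the
// explicit-register spellings handled above typically originate from
// operands such as
//
//   unsigned long long tsc;
//   asm volatile("rdtsc" : "=A"(tsc));   // "A" = the EDX:EAX pair on 32-bit
//
// or from "{st}", "{st(3)}" and "{flags}" constraint strings, which are
// resolved here to ST0-ST7 and EFLAGS rather than through the generic
// register-name tables.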
19822 
19823  // Otherwise, check to see if this is a register class of the wrong value
19824  // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
19825  // turn into {ax},{dx}.
19826  if (Res.second->hasType(VT))
19827  return Res; // Correct type already, nothing to do.
19828 
19829  // All of the single-register GCC register classes map their values onto
19830  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we
19831  // really want an 8-bit or 32-bit register, map to the appropriate register
19832  // class and return the appropriate register.
19833  if (Res.second == &X86::GR16RegClass) {
19834  if (VT == MVT::i8 || VT == MVT::i1) {
19835  unsigned DestReg = 0;
19836  switch (Res.first) {
19837  default: break;
19838  case X86::AX: DestReg = X86::AL; break;
19839  case X86::DX: DestReg = X86::DL; break;
19840  case X86::CX: DestReg = X86::CL; break;
19841  case X86::BX: DestReg = X86::BL; break;
19842  }
19843  if (DestReg) {
19844  Res.first = DestReg;
19845  Res.second = &X86::GR8RegClass;
19846  }
19847  } else if (VT == MVT::i32 || VT == MVT::f32) {
19848  unsigned DestReg = 0;
19849  switch (Res.first) {
19850  default: break;
19851  case X86::AX: DestReg = X86::EAX; break;
19852  case X86::DX: DestReg = X86::EDX; break;
19853  case X86::CX: DestReg = X86::ECX; break;
19854  case X86::BX: DestReg = X86::EBX; break;
19855  case X86::SI: DestReg = X86::ESI; break;
19856  case X86::DI: DestReg = X86::EDI; break;
19857  case X86::BP: DestReg = X86::EBP; break;
19858  case X86::SP: DestReg = X86::ESP; break;
19859  }
19860  if (DestReg) {
19861  Res.first = DestReg;
19862  Res.second = &X86::GR32RegClass;
19863  }
19864  } else if (VT == MVT::i64 || VT == MVT::f64) {
19865  unsigned DestReg = 0;
19866  switch (Res.first) {
19867  default: break;
19868  case X86::AX: DestReg = X86::RAX; break;
19869  case X86::DX: DestReg = X86::RDX; break;
19870  case X86::CX: DestReg = X86::RCX; break;
19871  case X86::BX: DestReg = X86::RBX; break;
19872  case X86::SI: DestReg = X86::RSI; break;
19873  case X86::DI: DestReg = X86::RDI; break;
19874  case X86::BP: DestReg = X86::RBP; break;
19875  case X86::SP: DestReg = X86::RSP; break;
19876  }
19877  if (DestReg) {
19878  Res.first = DestReg;
19879  Res.second = &X86::GR64RegClass;
19880  }
19881  }
19882  } else if (Res.second == &X86::FR32RegClass ||
19883  Res.second == &X86::FR64RegClass ||
19884  Res.second == &X86::VR128RegClass ||
19885  Res.second == &X86::VR256RegClass ||
19886  Res.second == &X86::FR32XRegClass ||
19887  Res.second == &X86::FR64XRegClass ||
19888  Res.second == &X86::VR128XRegClass ||
19889  Res.second == &X86::VR256XRegClass ||
19890  Res.second == &X86::VR512RegClass) {
19891  // Handle references to XMM physical registers that got mapped into the
19892  // wrong class. This can happen with constraints like {xmm0} where the
19893  // target independent register mapper will just pick the first match it can
19894  // find, ignoring the required type.
19895 
19896  if (VT == MVT::f32 || VT == MVT::i32)
19897  Res.second = &X86::FR32RegClass;
19898  else if (VT == MVT::f64 || VT == MVT::i64)
19899  Res.second = &X86::FR64RegClass;
19900  else if (X86::VR128RegClass.hasType(VT))
19901  Res.second = &X86::VR128RegClass;
19902  else if (X86::VR256RegClass.hasType(VT))
19903  Res.second = &X86::VR256RegClass;
19904  else if (X86::VR512RegClass.hasType(VT))
19905  Res.second = &X86::VR512RegClass;
19906  }
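// Illustrative aside (editorial, not part of the original source): the size
// fix-ups above let a register spelled with its 16-bit name bind the width
// the operand actually needs. A front end such as clang typically lowers the
// "a" constraint to "{ax}", so for
//
//   int leaf = 0, a;
//   asm("cpuid" : "=a"(a) : "0"(leaf) : "ebx", "ecx", "edx");
//
// the constraint arrives here as {ax} with an i32 operand, and AX/GR16 is
// rewritten to EAX/GR32 (or RAX/GR64 for an i64 operand) instead of being
// split across a register pair. The XMM branch performs the analogous fix-up
// when a constraint like {xmm0} is paired with a wider vector type.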
19907 
19908  return Res;
19909 }
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
bool isInt< 32 >(int64_t x)
Definition: MathExtras.h:276
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low)
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget *Subtarget, const TargetLowering &TLI)
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG)
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG)
bool isVINSERT128Index(SDNode *N)
static bool isValueValidForType(Type *Ty, uint64_t V)
Determine if the value is in range for the given type.
Definition: Constants.cpp:1159
void setFrameAddressIsTaken(bool T)
bool isImplicit() const
unsigned getStackAlignment() const
unsigned GetCondBranchFromCond(CondCode CC)
static bool isPSHUFDMask(ArrayRef< int > Mask, MVT VT)
static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp)
const Value * getCalledValue() const
static bool FuncIsMadeTailCallSafe(CallingConv::ID CC, bool GuaranteedTailCallOpt)
static MVT getIntegerVT(unsigned BitWidth)
Definition: ValueTypes.h:481
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
PSIGN - Copy integer sign.
virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const
unsigned Log2_32_Ceil(uint32_t Value)
Definition: MathExtras.h:456
const MachineFunction * getParent() const
The memory access reads data.
void setVarArgsFPOffset(unsigned Offset)
SDValue getConstant(uint64_t Val, EVT VT, bool isTarget=false)
static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, SDLoc dl)
SDValue getValue(unsigned R) const
The C convention as specified in the x86-64 supplement to the System V ABI, used on most non-Windows ...
Definition: CallingConv.h:126
bool hasSSE3() const
Definition: X86Subtarget.h:261
static APInt getSignBit(unsigned BitWidth)
Get the SignBit for a specific bit width.
Definition: APInt.h:443
static SDValue LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG)
The memory access writes data.
MVT getValVT() const
PSHUFB - Shuffle 16 8-bit values within a vector.
static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo)
static SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2)
const GlobalValue * getGlobal() const
static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget *Subtarget)
static MachineBasicBlock * EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB, const TargetInstrInfo *TII)
static APInt getAllOnesValue(unsigned numBits)
Get the all-ones value.
Definition: APInt.h:450
LLVMContext * getContext() const
Definition: SelectionDAG.h:285
virtual void resetOperationActions()
Reset the operation actions based on target options.
SDValue getCopyToReg(SDValue Chain, SDLoc dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:487
static bool isPSHUFHWMask(ArrayRef< int > Mask, MVT VT, bool HasInt256)
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, SDLoc DL)
Definition: SelectionDAG.h:572
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget)
virtual bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const
bool isKnownNeverNaN(SDValue Op) const
isKnownNeverNan - Test whether the given SDValue is known to never be NaN.
Reloc::Model getRelocationModel() const
static void CommuteVectorShuffleMask(SmallVectorImpl< int > &Mask, unsigned NumElems)
LocInfo getLocInfo() const
static MVT getVectorVT(MVT VT, unsigned NumElements)
Definition: ValueTypes.h:500
bool hasOneUse() const
static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG)
static const fltSemantics IEEEdouble
Definition: APFloat.h:133
static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1)
Turns an ISD::CondCode into a value suitable for SSE floating point mask CMPs.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
static SDValue LowerADD(SDValue Op, SelectionDAG &DAG)
bool isVEXTRACT128Index(SDNode *N)
Force argument to be passed in register.
Definition: Attributes.h:76
bool hasOneUse() const
bool isTargetCygMing() const
Definition: X86Subtarget.h:329
const TargetMachine & getTargetMachine() const
SDVTList getVTList() const
Y = RRC X, rotate right via carry.
static bool isMOVHLPSMask(ArrayRef< int > Mask, MVT VT)
unsigned createVirtualRegister(const TargetRegisterClass *RegClass)
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, SDLoc dl)
static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget)
Returns a vector of 0s if the node in input is a vector logical shift by a constant amount which is k...
unsigned getPointerSize(unsigned AS=0) const
Definition: DataLayout.h:261
static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
enable_if_c<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:266
CondCode getCondFromCMovOpc(unsigned Opc)
getCondFromCmovOpc - return condition code of a CMov opcode.
static MachineBasicBlock * EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII)
Utility function to emit xbegin specifying the start of an RTM region.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:281
virtual const uint32_t * getCallPreservedMask(CallingConv::ID) const
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, unsigned f, uint64_t s, unsigned base_alignment, const MDNode *TBAAInfo=0, const MDNode *Ranges=0)
static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, unsigned Depth)
unsigned addLiveIn(unsigned PReg, const TargetRegisterClass *RC)
Type::subtype_iterator param_iterator
Definition: DerivedTypes.h:123
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const
virtual ConstraintType getConstraintType(const std::string &Constraint) const
Given a constraint, return the type of constraint it is for this target.
static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG, SDLoc dl)
iterator end() const
Definition: ArrayRef.h:98
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:327
virtual unsigned ComputeNumSignBitsForTargetNode(SDValue Op, unsigned Depth) const
void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
static bool isVirtualRegister(unsigned Reg)
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2)
bool is512BitVector() const
is512BitVector - Return true if this is a 512-bit vector type.
Definition: ValueTypes.h:229
unsigned InferPtrAlignment(SDValue Ptr) const
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Get a value with low bits set.
Definition: APInt.h:528
static PointerType * getInt32PtrTy(LLVMContext &C, unsigned AS=0)
Definition: Type.cpp:292
const GlobalValue * getGlobal() const
bool hasFMA4() const
Definition: X86Subtarget.h:278
unsigned char ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const
BLENDV - Blend where the selector is a register.
const std::string & getAsmString() const
Definition: InlineAsm.h:86
static bool isUndefOrEqual(int Val, int CmpVal)
bool hasBMI() const
Definition: X86Subtarget.h:286
static MVT getFloatingPointVT(unsigned BitWidth)
Definition: ValueTypes.h:464
static bool isMOVDDUPYMask(ArrayRef< int > Mask, MVT VT, bool HasFp256)
virtual std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(MVT VT) const
unsigned MaxOffset
unsigned getOpcode() const
virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const
void addLiveIn(unsigned Reg)
StringRef substr(size_t Start, size_t N=npos) const
Definition: StringRef.h:392
static void ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, unsigned NewOp)
Type * getTypeForEVT(LLVMContext &Context) const
Definition: ValueTypes.cpp:180
unsigned getSizeInBits() const
Definition: ValueTypes.h:359
unsigned getByValSize() const
static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB, const TargetInstrInfo *TII, const X86Subtarget *Subtarget)
void DecodeUNPCKLMask(MVT VT, SmallVectorImpl< int > &ShuffleMask)
static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG)
static bool isZero(SDValue V)
static bool isAllOnes(SDValue V)
bool hasSSE41() const
Definition: X86Subtarget.h:263
unsigned getNumOperands() const
static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG)
Type * getReturnType() const
Definition: Function.cpp:179
void setBooleanVectorContents(BooleanContent Ty)
unsigned getInsertVINSERT128Immediate(SDNode *N)
static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget)
const std::string & getConstraintString() const
Definition: InlineAsm.h:87
STATISTIC(NumTailCalls,"Number of tail calls")
unsigned getNumOperands() const
arg_iterator arg_end()
Definition: Function.h:418
unsigned getValueSizeInBits() const
virtual const MCExpr * getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const
FHSUB - Floating point horizontal sub.
const GlobalValue * GV
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *fromMBB)
const X86Subtarget * getSubtarget() const
const SDValue & getOperand(unsigned Num) const
F(f)
const Function * getFunction() const
static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG)
PerformFANDNCombine - Do target-specific dag combines on X86ISD::FANDN nodes.
static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
static TargetLoweringObjectFile * createTLOF(X86TargetMachine &TM)
static bool ShouldXformToMOVHLPS(ArrayRef< int > Mask, MVT VT)
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Definition: APFloat.h:212
static MachinePointerInfo getConstantPool()
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:454
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:61
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned char TargetFlags=0)
void ComputeMaskedBits(SDValue Op, APInt &KnownZero, APInt &KnownOne, unsigned Depth=0) const
Same for subtraction.
Definition: ISDOpcodes.h:216
static bool isSplatVector(SDNode *N)
static MachineBasicBlock * EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB, const TargetInstrInfo *TII)
unsigned getFrameRegister(const MachineFunction &MF) const
bool hasAttribute(unsigned Index, Attribute::AttrKind Kind) const
Return true if the attribute exists at the given index.
Definition: Attributes.cpp:818
unsigned getValNo() const
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG)
const SDValue & getBasePtr() const
static bool isMOVHLPS_v_undef_Mask(ArrayRef< int > Mask, MVT VT)
SDValue getConstantPool(const Constant *C, EVT VT, unsigned Align=0, int Offs=0, bool isT=false, unsigned char TargetFlags=0)
static bool MayFoldVectorLoad(SDValue V)
virtual MVT getScalarShiftAmountTy(EVT LHSTy) const
HSUB - Integer horizontal sub.
static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, const X86TargetLowering *XTLI)
bool hasCmpxchg16b() const
Definition: X86Subtarget.h:297
static bool MayFoldLoad(SDValue Op)
static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG)
PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
bool bitwiseIsEqual(const APFloat &) const
Bitwise comparison for equality (QNaNs compare equal, 0!=-0).
Definition: APFloat.cpp:755
unsigned getResNo() const
get the index which selects a specific result in the SDNode
bool bitsLT(EVT VT) const
bitsLT - Return true if this has less bits than VT.
Definition: ValueTypes.h:735
virtual bool isZExtFree(Type *Ty1, Type *Ty2) const
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask, APInt &KnownZero, APInt &KnownOne, TargetLoweringOpt &TLO, unsigned Depth=0) const
static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl)
Generate a DAG to grab 256-bits from a 512-bit vector.
bool isRegLoc() const
LLVM_ATTRIBUTE_NORETURN void report_fatal_error(const char *reason, bool gen_crash_diag=true)
bool isAllOnesValue() const
SDValue getExternalSymbol(const char *Sym, EVT VT)
static SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG)
CallingConv::ID getCallingConv() const
Definition: Function.h:161
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
EVT getValueType(Type *Ty, bool AllowUnknown=false) const
static MachinePointerInfo getFixedStack(int FI, int64_t offset=0)
virtual bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const
X86 bit-test instructions.
static Constant * getNullValue(Type *Ty)
Definition: Constants.cpp:111
static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG)
static bool isUndefOrInRange(int Val, int Low, int Hi)
static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N)
std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(MVT VT) const
bool isKnownNeverZero(SDValue Op) const
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool is64Bit)
SDValue getLoad(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, bool isVolatile, bool isNonTemporal, bool isInvariant, unsigned Alignment, const MDNode *TBAAInfo=0, const MDNode *Ranges=0)
bool isVector() const
isVector - Return true if this is a vector value type.
Definition: ValueTypes.h:661
bool isMacOSX() const
Definition: Triple.h:303
virtual const char * LowerXConstraint(EVT ConstraintVT) const
EVT getShiftAmountTy(EVT LHSTy) const
lazy value info
static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue V1, SelectionDAG &DAG)
static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG)
static const fltSemantics x87DoubleExtended
Definition: APFloat.h:136
static StructReturnType argsAreStructReturn(const SmallVectorImpl< ISD::InputArg > &Ins)
static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl)
static bool isX86LogicalCmp(SDValue Op)
SDValue getStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, bool isVolatile, bool isNonTemporal, unsigned Alignment, const MDNode *TBAAInfo=0)
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget)
static bool isPermImmMask(ArrayRef< int > Mask, MVT VT, unsigned &Imm8)
static bool MayFoldIntoStore(SDValue Op)
const HexagonInstrInfo * TII
bool hasDefaultVisibility() const
Definition: GlobalValue.h:88
static bool isVPERMILPMask(ArrayRef< int > Mask, MVT VT)
bool isTargetDarwin() const
Definition: X86Subtarget.h:311
bool isNormalStore(const SDNode *N)
static bool isUNPCKL_v_undef_Mask(ArrayRef< int > Mask, MVT VT, bool HasInt256)
static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT, unsigned SlotSize, int FPDiff, SDLoc dl)
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const
#define llvm_unreachable(msg)
bool isBuildVectorAllZeros(const SDNode *N)
EVT getValueType(unsigned ResNo) const
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG)
bool isVINSERT256Index(SDNode *N)
param_iterator param_end() const
Definition: DerivedTypes.h:125
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:280
bool isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N)
virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
static bool isGlobalStubReference(unsigned char TargetFlag)
Definition: X86InstrInfo.h:75
bool isInt< 8 >(int64_t x)
Definition: MathExtras.h:268
virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const
SDValue getTargetGlobalAddress(const GlobalValue *GV, SDLoc DL, EVT VT, int64_t offset=0, unsigned char TargetFlags=0)
Definition: SelectionDAG.h:434
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:923
const TargetRegisterClass * getRegClass(unsigned Reg) const
std::vector< MachineBasicBlock * >::iterator succ_iterator
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG)
void setCondCodeAction(ISD::CondCode CC, MVT VT, LegalizeAction Action)
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
MachinePointerInfo getWithOffset(int64_t O) const
SimpleValueType SimpleTy
Definition: ValueTypes.h:161
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
MVT getScalarType() const
Definition: ValueTypes.h:259
static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget)
const Triple & getTargetTriple() const
Definition: X86Subtarget.h:309
EVT getScalarType() const
Definition: ValueTypes.h:756
Abstract Stack Frame Information.
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG)
SynchronizationScope
Definition: Instructions.h:47
bool isFixedObjectIndex(int ObjectIdx) const
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
int getMaskElt(unsigned Idx) const
virtual bool isShuffleMaskLegal(const SmallVectorImpl< int > &Mask, EVT VT) const
unsigned getExtractVEXTRACT128Immediate(SDNode *N)
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG)
SDVTList getVTList(EVT VT)
unsigned getStoreSize() const
Definition: ValueTypes.h:433
bool needsStackRealignment(const MachineFunction &MF) const
virtual MVT getPointerTy(uint32_t=0) const
static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG)
ID
LLVM Calling Convention Representation.
Definition: CallingConv.h:26
static bool isShift(MachineInstr *MI, int Opcode, int64_t Imm)
HADD - Integer horizontal add.
static SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG)
const MachineInstrBuilder & addImm(int64_t Val) const
#define G(x, y, z)
Definition: MD5.cpp:52
bool isInteger() const
isInteger - Return true if this is an integer, or a vector integer type.
Definition: ValueTypes.h:656
X86TargetLowering(X86TargetMachine &TM)
unsigned getNumOperands() const
Definition: MachineInstr.h:265
static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, const X86Subtarget *Subtarget, SelectionDAG &DAG)
SDValue getConstantFP(double Val, EVT VT, bool isTarget=false)
SmallVector< ISD::InputArg, 32 > Ins
AtomicOrdering
Definition: Instructions.h:36
X86 compare and logical compare instructions.
EVT getVectorElementType() const
Definition: ValueTypes.h:762
bool hasMMX() const
Definition: X86Subtarget.h:258
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
SDValue getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Cmp, SDValue Swp, MachinePointerInfo PtrInfo, unsigned Alignment, AtomicOrdering Ordering, SynchronizationScope SynchScope)
virtual bool isVectorClearMaskLegal(const SmallVectorImpl< int > &Mask, EVT VT) const
virtual bool isSafeMemOpType(MVT VT) const
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
Definition: X86InstrInfo.h:92
SDValue getCALLSEQ_START(SDValue Chain, SDValue Op, SDLoc DL)
Definition: SelectionDAG.h:563
ConstraintType getConstraintType(const std::string &Constraint) const
unsigned isMacOSXVersionLT(unsigned Major, unsigned Minor=0, unsigned Micro=0) const
Definition: Triple.h:288
bool hasLZCNT() const
Definition: X86Subtarget.h:285
static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget)
PerformShuffleCombine - Performs several different shuffle combines.
unsigned getLocReg() const
static bool isVINSERTIndex(SDNode *N, unsigned vecWidth)
static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems, bool ZerosFromLeft, SelectionDAG &DAG, unsigned PreferredNum=-1U)
static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG)
bool definesRegister(unsigned Reg, const TargetRegisterInfo *TRI=NULL) const
Definition: MachineInstr.h:753
virtual const MCExpr * LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned uid, MCContext &Ctx) const
static unsigned getLoadOpcode(EVT VT)
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
size_t array_lengthof(T(&)[N])
Find the length of an array.
Definition: STLExtras.h:250
bool readsRegister(unsigned Reg, const TargetRegisterInfo *TRI=NULL) const
Definition: MachineInstr.h:724
unsigned getNumValues() const
bool LLVM_ATTRIBUTE_UNUSED_RESULT empty() const
Definition: SmallVector.h:56
SDValue getRegisterMask(const uint32_t *RegMask)
FMAXC, FMINC - Commutative FMIN and FMAX.
bool hasStructRetAttr() const
Determine if the function returns a structure through first pointer argument.
Definition: Function.h:299
SMAX, SMIN - Signed integer max and min.
int getOpcode() const
Definition: MachineInstr.h:261
#define T
enable_if_c< std::numeric_limits< T >::is_integer &&!std::numeric_limits< T >::is_signed, std::size_t >::type countTrailingZeros(T Val, ZeroBehavior ZB=ZB_Width)
Count number of 0's from the least significant bit to the most stopping at the first 1...
Definition: MathExtras.h:49
union llvm::X86AddressMode::@223 Base
bool hasTBM() const
Definition: X86Subtarget.h:280
SDValue CombineTo(SDNode *N, const std::vector< SDValue > &To, bool AddTo=true)
bool is64Bit() const
Is this x86_64? (disregarding specific ABI / programming model)
Definition: X86Subtarget.h:240
This contains information for each constraint that we are lowering.
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:176
SDValue getVectorShuffle(EVT VT, SDLoc dl, SDValue N1, SDValue N2, const int *MaskElts)
bool isMask_64(uint64_t Value)
Definition: MathExtras.h:335
bool isPICStyleRIPRel() const
Definition: X86Subtarget.h:348
static SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG)
SmallVector< ISD::OutputArg, 32 > Outs
static void NormalizeMask(SmallVectorImpl< int > &Mask, unsigned NumElems)
static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl, unsigned vectorWidth)
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC)
static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG)
virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const
bool isIntegerTypeFTOL(EVT VT) const
bool isFloatingPointTy() const
Definition: Type.h:162
const SDValue & getBasePtr() const
virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const
getSetCCResultType - Return the value type to use for ISD::SETCC.
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG)
bool isPICStyleStubAny() const
Definition: X86Subtarget.h:357
SDValue getUNDEF(EVT VT)
getUNDEF - Return an UNDEF node. UNDEF does not have a useful SDLoc.
Definition: SelectionDAG.h:585
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain)
static bool isMOVLMask(ArrayRef< int > Mask, EVT VT)
bool insert(const T &V)
Definition: SmallSet.h:59
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const
static SDValue GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LocalDynamic=false)
const APInt & getAPIntValue() const
APInt LLVM_ATTRIBUTE_UNUSED_RESULT shl(unsigned shiftAmt) const
Left-shift function.
Definition: APInt.h:856
bool hasSSSE3() const
Definition: X86Subtarget.h:262
EVT getMemoryVT() const
getMemoryVT - Return the type of the in-memory value.
#define EQ(a, b)
Definition: regexec.c:112
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG)
static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N)
CodeGenOpt::Level getOptLevel() const
int64_t getImm() const
bool isOperationLegalOrPromote(unsigned Op, EVT VT) const
static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG)
static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2)
getUnpackh - Returns a vector_shuffle node for an unpackh operation.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG)
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:109
const BasicBlock * getBasicBlock() const
static bool IsTailCallConvention(CallingConv::ID CC)
UNDEF - An undefined node.
Definition: ISDOpcodes.h:154
void setVarArgsGPOffset(unsigned Offset)
bool hasSSE2() const
Definition: X86Subtarget.h:260
static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo)
getLegalSplat - Generate a legal splat with supported x86 shuffles
static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG)
static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl)
static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG)
unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT, bool High)
static bool isSHUFPMask(ArrayRef< int > Mask, MVT VT, bool Commuted=false)
static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth)
static const MCSymbolRefExpr * Create(const MCSymbol *Symbol, MCContext &Ctx)
Definition: MCExpr.h:270
static bool isWeakForLinker(LinkageTypes Linkage)
Definition: GlobalValue.h:183
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false)
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Get a value with high bits set.
Definition: APInt.h:510
static bool isTargetShuffle(unsigned Opcode)
mmo_iterator memoperands_end() const
Definition: MachineInstr.h:292
virtual bool ExpandInlineAsm(CallInst *CI) const
unsigned char ClassifyBlockAddressReference() const
static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG)
SDNode * getNode() const
get the SDNode which holds the desired result
CondCode GetOppositeBranchCondition(X86::CondCode CC)
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
bundle_iterator< MachineInstr, instr_iterator > iterator
void setBytesToPopOnReturn(unsigned bytes)
A self-contained host- and target-independent arbitrary-precision floating-point software implementat...
Definition: APFloat.h:122
static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
bool isX86_MMXTy() const
isX86_MMXTy - Return true if this is X86 MMX.
Definition: Type.h:182
#define P(N)
bool isTypeLegal(EVT VT) const
static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG)
unsigned getTargetFlags() const
bool isNormalLoad(const SDNode *N)
virtual unsigned getJumpTableEncoding() const
void array_pod_sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:289
const MachineInstrBuilder & setMemRefs(MachineInstr::mmo_iterator b, MachineInstr::mmo_iterator e) const
static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, X86TargetLowering::DAGCombinerInfo &DCI)
bool isInteger() const
isInteger - Return true if this is an integer, or a vector integer type.
Definition: ValueTypes.h:182
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:475
bool is256BitVector() const
is256BitVector - Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:686
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *bb=0)
* if(!EatIfPresent(lltok::kw_thread_local)) return false
bool isCallingConvWin64(CallingConv::ID CC) const
Definition: X86Subtarget.h:362
virtual bool isLegalICmpImmediate(int64_t Imm) const
bool hasBMI2() const
Definition: X86Subtarget.h:287
unsigned getVectorNumElements() const
Definition: ValueTypes.h:311
APInt LLVM_ATTRIBUTE_UNUSED_RESULT trunc(unsigned width) const
Truncate to new width.
Definition: APInt.cpp:919
static SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2)
CodeModel::Model getCodeModel() const
LLVM Basic Block Representation.
Definition: BasicBlock.h:72
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG)
const SDValue & getOperand(unsigned i) const
static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, EVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG)
bool isTargetCOFF() const
Definition: X86Subtarget.h:330
static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget)
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Simple binary floating point operators.
Definition: ISDOpcodes.h:222
void setTargetDAGCombine(ISD::NodeType NT)
bool isExtInLoc() const
bool hasSinCos() const
bool isNonTemporal() const
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc)
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const
bool isPICStyleStubPIC() const
Definition: X86Subtarget.h:350
static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget)
PerformShiftCombine - Combine shifts.
bool isOperationLegalOrCustom(unsigned Op, EVT VT) const
MVT getLocVT() const
unsigned getOperandNo() const
LLVM Constant Representation.
Definition: Constant.h:41
static bool isZeroShuffle(ShuffleVectorSDNode *N)
static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG)
PromoteSplat - Splat is promoted to target supported vector shuffles.
bool isVector() const
isVector - Return true if this is a vector value type.
Definition: ValueTypes.h:190
REP_STOS - Repeat fill, corresponds to X86::REP_STOSx.
static SDValue getVZextMovL(MVT VT, MVT OpVT, SDValue SrcOp, SelectionDAG &DAG, const X86Subtarget *Subtarget, SDLoc dl)
virtual bool isTypeDesirableForOp(unsigned Opc, EVT VT) const
bool hasHiddenVisibility() const
Definition: GlobalValue.h:89
const Constant * getConstVal() const
bool isFloatingPoint() const
isFloatingPoint - Return true if this is a FP, or a vector FP type.
Definition: ValueTypes.h:174
bool hasAVX2() const
Definition: X86Subtarget.h:266
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:267
static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG)
param_iterator param_begin() const
Definition: DerivedTypes.h:124
void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
APInt Or(const APInt &LHS, const APInt &RHS)
Bitwise OR function for APInt.
Definition: APInt.h:1845
virtual bool getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void DecodePSHUFLWMask(MVT VT, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
void setBooleanContents(BooleanContent Ty)
static unsigned getCmpXChgOpcode(EVT VT)
static unsigned getNonAtomicOpcode(unsigned Opc)
static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget)
ItTy next(ItTy it, Dist n)
Definition: STLExtras.h:154
SDValue getCopyFromReg(SDValue Chain, SDLoc dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:510
static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2)
getUnpackl - Returns a vector_shuffle node for an unpackl operation.
const DataLayout * getDataLayout() const
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1252
iterator begin() const
Definition: ArrayRef.h:97
opStatus convert(const fltSemantics &, roundingMode, bool *)
Definition: APFloat.cpp:1938
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1116
bool hasOneMemOperand() const
Definition: MachineInstr.h:297
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget *Subtarget, const TargetLowering &TLI)
unsigned getOpcode() const
bool hasFp256() const
Definition: X86Subtarget.h:268
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:586
REP_MOVS - Repeat move, corresponds to X86::REP_MOVSx.
unsigned GuaranteedTailCallOpt
static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget)
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
bool isPICStyleStubNoDynamic() const
Definition: X86Subtarget.h:354
void setPrefFunctionAlignment(unsigned Align)
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:589
static bool isMOVSLDUPMask(ArrayRef< int > Mask, MVT VT, const X86Subtarget *Subtarget)
static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl< SDValue > &Elts, SDLoc &DL, SelectionDAG &DAG)
arg_iterator arg_begin()
Definition: Function.h:410
static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG)
void changeSign()
Definition: APFloat.cpp:1589
static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget)
Integer representation type.
Definition: DerivedTypes.h:37
virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
CondCode getSetCCSwappedOperands(CondCode Operation)
AddrNumOperands - Total number of operands in a memory reference.
Definition: X86BaseInfo.h:42
static SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG)
BLENDI - Blend where the selector is an immediate.
static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget)
use_iterator use_begin() const
bool isVolatile() const
const SDValue & getValue() const
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, SelectionDAG &DAG) const
virtual const char * getTargetNodeName(unsigned Opcode) const
unsigned MaxStoresPerMemmove
Specify maximum bytes of store instructions per memmove call.
bool isOSWindows() const
Definition: X86Subtarget.h:336
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:312
static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG)
MachineInstrBuilder BuildMI(MachineFunction &MF, DebugLoc DL, const MCInstrDesc &MCID)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:411
virtual bool isGAPlusOffset(SDNode *N, const GlobalValue *&GA, int64_t &Offset) const
bool isPICStyleGOT() const
Definition: X86Subtarget.h:347
unsigned CountPopulation_64(uint64_t Value)
Definition: MathExtras.h:429
virtual const TargetFrameLowering * getFrameLowering() const
static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG)
std::vector< ArgListEntry > ArgListTy
size_t find_first_not_of(char C, size_t From=0) const
Definition: StringRef.cpp:213
bool isEqualTo(SDValue A, SDValue B) const
MCSymbol * getSymbol() const
static PointerType * getInt8PtrTy(LLVMContext &C, unsigned AS=0)
Definition: Type.cpp:284
virtual EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements)
Definition: ValueTypes.h:616
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC)
static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget *Subtarget)
static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG)
uint64_t getConstantOperandVal(unsigned Num) const
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:390
void setUseUnderscoreLongJmp(bool Val)
void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC)
Set the CallingConv that should be used for the specified libcall.
const MCInstrDesc & get(unsigned Opcode) const
Definition: MCInstrInfo.h:48
SDValue CreateStackTemporary(EVT VT, unsigned minAlign=1)
static SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp, const X86Subtarget *Subtarget, SelectionDAG &DAG)
unsigned getStackRegister() const
const MachinePointerInfo & getPointerInfo() const
void setIsKill(bool Val=true)
int64_t getObjectOffset(int ObjectIdx) const
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
SDValue getMemcpy(SDValue Chain, SDLoc dl, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVol, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo)
bool isTargetWindows() const
Definition: X86Subtarget.h:326
virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const LLVM_OVERRIDE
Returns true if a cast between SrcAS and DestAS is a noop.
static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG)
bool isVEXTRACT256Index(SDNode *N)
unsigned getByValAlign() const
virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const
void SplitString(StringRef Source, SmallVectorImpl< StringRef > &OutFragments, StringRef Delimiters=" \t\n\v\f\r")
bool bitsGT(EVT VT) const
bitsGT - Return true if this has more bits than VT.
Definition: ValueTypes.h:723
static StructReturnType callIsStructReturn(const SmallVectorImpl< ISD::OutputArg > &Outs)
void setLoadExtAction(unsigned ExtType, MVT VT, LegalizeAction Action)
static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget)
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG)
ArrayRef< int > getMask() const
virtual bool isLegalAddImmediate(int64_t Imm) const
unsigned countTrailingZeros() const
Count the number of trailing zero bits.
Definition: APInt.cpp:736
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:153
SDValue getTargetConstantPool(const Constant *C, EVT VT, unsigned Align=0, int Offset=0, unsigned char TargetFlags=0)
Definition: SelectionDAG.h:451
void initActions()
Initialize all of the actions to default values.
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned char TargetFlags=0)
Definition: SelectionDAG.h:445
virtual const TargetInstrInfo * getInstrInfo() const
unsigned getABITypeAlignment(Type *Ty) const
Definition: DataLayout.cpp:582
bool isBuildVectorAllOnes(const SDNode *N)
Node predicates.
static Constant * getSplat(unsigned NumElts, Constant *Elt)
Definition: Constants.cpp:1021
SDValue getTargetConstantFP(double Val, EVT VT)
Definition: SelectionDAG.h:422
static bool WillBeConstantPoolLoad(SDNode *N)
virtual bool isNarrowingProfitable(EVT VT1, EVT VT2) const
static bool isXor1OfSetCC(SDValue Op)
static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget)
performVZEXTCombine - Performs build vector combines
SDValue getNOT(SDLoc DL, SDValue Val, EVT VT)
getNOT - Create a bitwise NOT operation as (XOR Val, -1).
static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
bool startswith(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:208
virtual unsigned getJumpTableEncoding() const
static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, const X86Subtarget *Subtarget, SelectionDAG &DAG)
Class for constant integers.
Definition: Constants.h:51
MCSymbol * getPICBaseSymbol() const
const STC & getSubtarget() const
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG)
bool isBaseWithConstantOffset(SDValue Op) const
static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc, unsigned &ExtraOpc)
void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
uint64_t getTypeAllocSize(Type *Ty) const
Definition: DataLayout.h:326
void setExceptionPointerRegister(unsigned R)
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
Definition: Type.cpp:405
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG)
bool isInvariant() const
static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl)
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
bool isTarget64BitILP32() const
Is this x86_64 with the ILP32 programming model (x32 ABI)?
Definition: X86Subtarget.h:245
bool isTargetELF() const
Definition: X86Subtarget.h:318
bool isTargetMingw() const
Definition: X86Subtarget.h:327
SDValue getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, bool isVolatile, bool isNonTemporal, unsigned Alignment, const MDNode *TBAAInfo=0)
Type * getType() const
Definition: Value.h:111
StructReturnType
static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, ArrayRef< int > Mask, MVT VT)
CCValAssign - Represent assignment of one arg/retval to a location.
static bool CanFoldXORWithAllOnes(const SDNode *N)
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG)
AddressSpace
Definition: NVPTXBaseInfo.h:22
bool hasSlowDivide() const
Definition: X86Subtarget.h:299
static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget)
PerformADDCombine - Do target-specific dag combines on integer adds.
static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, bool &isLeft, SDValue &ShVal, unsigned &ShAmt)
static bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, const X86InstrInfo *TII)
SDValue getIntPtrConstant(uint64_t Val, bool isTarget=false)
const SDValue & getChain() const
bool isTargetLinux() const
Definition: X86Subtarget.h:322
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG)
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:309
static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget)
static SDValue ExtractBitFromMaskVector(SDNode *N, SelectionDAG &DAG)
unsigned getGlobalBaseReg(MachineFunction *MF) const
bool killsRegister(unsigned Reg, const TargetRegisterInfo *TRI=NULL) const
Definition: MachineInstr.h:745
virtual const MCExpr * getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const
unsigned getExtractVEXTRACT256Immediate(SDNode *N)
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
Definition: Constants.cpp:492
virtual bool isGAPlusOffset(SDNode *N, const GlobalValue *&GA, int64_t &Offset) const
static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative)
MachineFrameInfo * getFrameInfo()
bool use_empty() const
CondCode getSetCCInverse(CondCode Operation, bool isInteger)
static Constant * get(Type *Ty, double V)
Definition: Constants.cpp:557
static MachinePointerInfo getStack(int64_t Offset)
getStack - stack pointer relative access.
static bool isUNPCKLMask(ArrayRef< int > Mask, MVT VT, bool HasInt256, bool V2IsSplat=false)
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG)
double BitsToDouble(uint64_t Bits)
Definition: MathExtras.h:479
unsigned Log2_64_Ceil(uint64_t Value)
Definition: MathExtras.h:462
bool isAllOnesValue() const
Definition: Constants.cpp:88
static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG)
unsigned Log2_32(uint32_t Value)
Definition: MathExtras.h:443
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:591
AttributeSet getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:170
static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget)
bool is128BitVector() const
is128BitVector - Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:215
ISD::LoadExtType getExtensionType() const
static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp)
Class for arbitrary precision integers.
Definition: APInt.h:75
SDValue getMemIntrinsicNode(unsigned Opcode, SDLoc dl, const EVT *VTs, unsigned NumVTs, const SDValue *Ops, unsigned NumOps, EVT MemVT, MachinePointerInfo PtrInfo, unsigned Align=0, bool Vol=false, bool ReadMem=true, bool WriteMem=true)
bool isUInt< 32 >(uint64_t x)
Definition: MathExtras.h:302
void setExceptionSelectorRegister(unsigned R)
static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth)
static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget)
int64_t getSExtValue() const
op_iterator op_begin() const
bool isIntegerTy() const
Definition: Type.h:196
static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp)
virtual const TargetRegisterClass * getRegClassFor(MVT VT) const
bool hasInt256() const
Definition: X86Subtarget.h:269
static use_iterator use_end()
const uint32_t * getNoPreservedMask() const
void setPrefLoopAlignment(unsigned Align)
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:357
bool isPowerOf2_64(uint64_t Value)
Definition: MathExtras.h:360
bool isMemLoc() const
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
static bool isUNPCKH_v_undef_Mask(ArrayRef< int > Mask, MVT VT, bool HasInt256)
SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT)
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:360
bool isUnalignedMemAccessFast() const
Definition: X86Subtarget.h:295
static cl::opt< AlignMode > Align(cl::desc("Load/store alignment support"), cl::Hidden, cl::init(DefaultAlign), cl::values(clEnumValN(DefaultAlign,"arm-default-align","Generate unaligned accesses only on hardware/OS ""combinations that are known to support them"), clEnumValN(StrictAlign,"arm-strict-align","Disallow all unaligned memory accesses"), clEnumValN(NoStrictAlign,"arm-no-strict-align","Allow unaligned memory accesses"), clEnumValEnd))
void DecodeUNPCKHMask(MVT VT, SmallVectorImpl< int > &ShuffleMask)
bool hasCMov() const
Definition: X86Subtarget.h:257
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget)
PerformFADDCombine - Do target-specific dag combines on floating point adds.
APInt And(const APInt &LHS, const APInt &RHS)
Bitwise AND function for APInt.
Definition: APInt.h:1840
bool hasPOPCNT() const
Definition: X86Subtarget.h:273
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:495
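A small sketch of the two APInt helpers listed above (getBitsSet and the APIntOps::And wrapper); the widths and bit positions are illustrative only:
#include "llvm/ADT/APInt.h"
#include <cassert>
using llvm::APInt;

void apintSketch() {
  APInt Block = APInt::getBitsSet(8, 2, 5);                   // bits [2, 5) set: 0b00011100
  assert(Block.getZExtValue() == 0x1C);
  APInt Masked = llvm::APIntOps::And(Block, APInt(8, 0x0F));  // bitwise AND of two APInts
  assert(Masked.getZExtValue() == 0x0C);
}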
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned char TargetFlags=0) const
bool hasSSE1() const
Definition: X86Subtarget.h:259
UMAX, UMIN - Unsigned integer max and min.
static std::pair< unsigned, bool > matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const X86Subtarget *Subtarget)
Matches a VSELECT onto min/max, or returns 0 if the node doesn't match.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static bool isVPERM2X128Mask(ArrayRef< int > Mask, MVT VT, bool HasFp256)
void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, EVT VT, SDValue SrcOp, SDValue ShAmt, SelectionDAG &DAG)
static MachinePointerInfo getGOT()
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc, unsigned &HiOpc, unsigned &ExtraOpc)
void setArgumentStackSize(unsigned size)
static bool isMOVDDUPMask(ArrayRef< int > Mask, MVT VT)
uint64_t MinAlign(uint64_t A, uint64_t B)
Definition: MathExtras.h:535
static bool isMOVLHPSMask(ArrayRef< int > Mask, MVT VT)
bool isAtom() const
Definition: X86Subtarget.h:307
unsigned countLeadingOnes() const
Count the number of leading one bits.
Definition: APInt.cpp:709
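countLeadingOnes counts the run of consecutive one bits starting at the most significant bit; a minimal sketch with illustrative values:
#include "llvm/ADT/APInt.h"
#include <cassert>

void leadingOnesSketch() {
  llvm::APInt V(16, 0xFF00);           // 1111 1111 0000 0000
  assert(V.countLeadingOnes() == 8);   // eight consecutive ones from the MSB
  assert(llvm::APInt(16, 0x7F00).countLeadingOnes() == 0);  // MSB clear
}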
int CreateStackObject(uint64_t Size, unsigned Alignment, bool isSS, bool MayNeedSP=false, const AllocaInst *Alloca=0)
SDValue getStackArgumentTokenFactor(SDValue Chain)
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg)
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned char TargetFlags=0)
Definition: SelectionDAG.h:481
uint64_t getConstantOperandVal(unsigned i) const
SmallVector< SDValue, 32 > OutVals
static const fltSemantics IEEEsingle
Definition: APFloat.h:132
static bool LowerToByteSwap(CallInst *CI)
bool isX86_FP80Ty() const
isX86_FP80Ty - Return true if this is x86 long double.
Definition: Type.h:152
static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD=NULL)
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:295
bool hasAnyUseOfValue(unsigned Value) const
pointer data()
data - Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:135
static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG)
bool is256BitVector() const
is256BitVector - Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:222
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
bool isLiveIn(unsigned Reg) const
MachineRegisterInfo & getRegInfo()
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:241
unsigned size() const
Definition: SmallSet.h:43
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
virtual void computeMaskedBitsForTargetNode(const SDValue Op, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth=0) const
virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
bool isDeclaration() const
Definition: Globals.cpp:66
bool equals_lower(StringRef RHS) const
equals_lower - Check for string equality, ignoring case.
Definition: StringRef.h:135
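A minimal sketch of StringRef::equals_lower, the case-insensitive comparison referenced above (the strings are illustrative only):
#include "llvm/ADT/StringRef.h"
#include <cassert>

void stringRefSketch() {
  llvm::StringRef S("BSWAP");
  assert(S.equals_lower("bswap"));  // case-insensitive equality
  assert(!S.empty());
}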
size_t size_type
Definition: StringRef.h:46
bool hasSSE42() const
Definition: X86Subtarget.h:264
virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const
static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, bool &isLeft, SDValue &ShVal, unsigned &ShAmt)
static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget)
PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget)
PerformFSUBCombine - Do target-specific dag combines on floating point subs.
void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
IMPLICIT_DEF - This is the MachineInstr-level equivalent of undef.
Definition: TargetOpcodes.h:52
static SDValue LowerShift(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG)
static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT, unsigned NumElems, SelectionDAG &DAG, SDLoc dl)
unsigned getSizeInBits() const
getSizeInBits - Return the size of the specified value type in bits.
Definition: ValueTypes.h:779
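A brief sketch (not from this file) of the value-type queries referenced in this index, using MVT::v8i32 as an arbitrary example type:
#include "llvm/CodeGen/ValueTypes.h"
#include <cassert>

void valueTypeSketch() {
  llvm::MVT VT = llvm::MVT::v8i32;
  assert(VT.is256BitVector());                    // 8 x 32 bits = 256 bits
  assert(VT.getSizeInBits() == 256);
  assert(VT.getVectorNumElements() == 8);
  assert(VT.getVectorElementType() == llvm::MVT::i32);
  assert(llvm::MVT::v4i32.is128BitVector());      // the 128-bit counterpart
}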
void ReplaceAllUsesWith(SDValue From, SDValue Op)
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, bool IsZero, const X86Subtarget *Subtarget, SelectionDAG &DAG)
static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp)
#define I(x, y, z)
Definition: MD5.cpp:54
#define N
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
ANDNP - Bitwise Logical AND NOT of Packed FP values.
FunctionType * getFunctionType() const
Definition: Function.cpp:171
static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain)
void setStackPointerRegisterToSaveRestore(unsigned R)
bool isTailCall() const
static bool IsCCallConvention(CallingConv::ID CC)
Return true if the calling convention is a C calling convention.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static unsigned getReg(const void *D, unsigned RC, unsigned RegNo)
bool hasProtectedVisibility() const
Definition: GlobalValue.h:90
bool isSignBit() const
Check if the APInt's value is returned by getSignBit.
Definition: APInt.h:399
op_iterator op_end() const
bool hasAVX512() const
Definition: X86Subtarget.h:267
uint8_t getFlags() const
getFlags - Return the MI flags bitvector.
Definition: MachineInstr.h:149
const TargetMachine & getTarget() const
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget)
PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
unsigned getSlotSize() const
MachineSDNode * getMachineNode(unsigned Opcode, SDLoc dl, EVT VT)
Same for multiplication (the overflow-aware SMULO / UMULO nodes).
Definition: ISDOpcodes.h:219
virtual const TargetRegisterInfo * getRegisterInfo() const
bool isNON_EXTLoad(const SDNode *N)
unsigned getPrimitiveSizeInBits() const
Definition: Type.cpp:117
enum llvm::X86AddressMode::@222 BaseType
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:454
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
bool hasType(EVT vt) const
static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG)
unsigned MaxStoresPerMemcpy
Specify the maximum number of store instructions per memcpy call.
static SDValue LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG)
static SDValue LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG)
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Definition: ValueTypes.cpp:275
virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const
unsigned getInsertVINSERT256Immediate(SDNode *N)
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG)
EVT getValueType() const
MachineInstr * getVRegDef(unsigned Reg) const
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget)
bool hasLocalLinkage() const
Definition: GlobalValue.h:211
static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG)
static bool isMOVLPMask(ArrayRef< int > Mask, MVT VT)
bool is128BitVector() const
is128BitVector - Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:681
bool isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, bool TailCallOpt)
TLSModel::Model getTLSModel(const GlobalValue *GV) const
unsigned EnableSegmentedStacks
unsigned getReg() const
getReg - Returns the register number.
bool isFloatingPoint() const
isFloatingPoint - Return true if this is a FP, or a vector FP type.
Definition: ValueTypes.h:651
The C convention as implemented on Windows/x86-64. This convention differs from the more common X86_6...
Definition: CallingConv.h:132
static bool isUNPCKHMask(ArrayRef< int > Mask, MVT VT, bool HasInt256, bool V2IsSplat=false)
static SDValue PerformBTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG)
static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc)
virtual FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const
void insert(iterator MBBI, MachineBasicBlock *MBB)
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
void setReturnAddressIsTaken(bool s)
bool isTargetWin64() const
Definition: X86Subtarget.h:338
bool isSimple() const
Definition: ValueTypes.h:640
unsigned getAlignment() const
virtual const TargetRegisterClass * getSubClassWithSubReg(const TargetRegisterClass *RC, unsigned Idx) const
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget)
PerformSTORECombine - Do target-specific dag combines on STORE nodes.
SDValue getZExtOrTrunc(SDValue Op, SDLoc DL, EVT VT)
uint16_t getEncodingValue(unsigned RegNo) const
Returns the encoding for RegNo.
static bool hasFPCMov(unsigned X86CC)
LLVM Value Representation.
Definition: Value.h:66
virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRegister(unsigned Reg, EVT VT)
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:444
void setUseUnderscoreSetJmp(bool Val)
FHADD - Floating point horizontal add.
static SDValue InsertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl, unsigned vectorWidth)
static unsigned getPseudoCMOVOpc(EVT VT)
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned char TargetFlags=0) const
void DecodeVPERMMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
static VectorType * get(Type *ElementType, unsigned NumElements)
Definition: Type.cpp:706
bool isTruncatingStore() const
void push_back(MachineBasicBlock *MBB)
static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG)
SDValue getValueType(EVT)
Disable implicit floating point insts.
Definition: Attributes.h:84
bool hasDLLImportLinkage() const
Definition: GlobalValue.h:212
const MachineInstrBuilder & addOperand(const MachineOperand &MO) const
uint64_t getTypeSizeInBits(Type *Ty) const
Definition: DataLayout.h:459
BasicBlockListType::iterator iterator
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:282
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
static bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, unsigned MaskI, unsigned MaskE, unsigned OpIdx, unsigned NumElems, unsigned &OpNum)
static bool getTargetShuffleMask(SDNode *N, MVT VT, SmallVectorImpl< int > &Mask, bool &IsUnary)
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign)
bool isPowerOf2_32(uint32_t Value)
Definition: MathExtras.h:354
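A short sketch of the power-of-two tests and MinAlign listed in this index; MinAlign returns the alignment still guaranteed after the two quantities are added (values are illustrative):
#include "llvm/Support/MathExtras.h"
#include <cassert>

void alignmentSketch() {
  assert(llvm::isPowerOf2_32(64));
  assert(!llvm::isPowerOf2_64(48));
  assert(llvm::MinAlign(16, 4) == 4);    // 16-byte base plus 4-byte offset is only 4-byte aligned
  assert(llvm::MinAlign(32, 32) == 32);
}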
bool isZeroNode(SDValue Elt)
vt_iterator vt_begin() const
APInt LLVM_ATTRIBUTE_UNUSED_RESULT zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:983
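zext widens an APInt to the requested bit width and zero-fills the new high bits; a minimal sketch with an invented 8-bit value:
#include "llvm/ADT/APInt.h"
#include <cassert>

void zextSketch() {
  llvm::APInt Narrow(8, 0xFF);
  llvm::APInt Wide = Narrow.zext(32);  // new high bits are zero
  assert(Wide.getBitWidth() == 32);
  assert(Wide.getZExtValue() == 0xFF);
}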
SDValue getMergeValues(const SDValue *Ops, unsigned NumOps, SDLoc dl)
getMergeValues - Create a MERGE_VALUES node from the given operands.
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
unsigned MaxStoresPerMemset
Specify the maximum number of store instructions per memset call.
const MCRegisterInfo & MRI
static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth)
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml","ocaml 3.10-compatible collector")
SDValue getTargetConstant(uint64_t Val, EVT VT)
Definition: SelectionDAG.h:408
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement=true)
SDValue getSetCC(SDLoc DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond)
Definition: SelectionDAG.h:653
unsigned getLocMemOffset() const
MVT getVectorElementType() const
Definition: ValueTypes.h:263
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG)
static bool isPALIGNRMask(ArrayRef< int > Mask, MVT VT, const X86Subtarget *Subtarget)
static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG)
SDValue getEntryNode() const
Definition: SelectionDAG.h:332
bool hasFMA() const
Definition: X86Subtarget.h:276
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool Immutable)
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
static Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
Definition: Constants.cpp:2374
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:363
#define Assert(C, M)
Definition: Lint.cpp:162
unsigned getAlignment() const
virtual SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
const MachineInstrBuilder & addReg(unsigned RegNo, unsigned flags=0, unsigned SubReg=0) const
static bool isPSHUFLWMask(ArrayRef< int > Mask, MVT VT, bool HasInt256)
MVT getSimpleValueType(unsigned ResNo) const
bool is512BitVector() const
is512BitVector - Return true if this is a 512-bit vector type.
Definition: ValueTypes.h:691
void setObjectAlignment(int ObjectIdx, unsigned Align)
setObjectAlignment - Change the alignment of the specified stack object.
virtual unsigned getByValTypeAlignment(Type *Ty) const
bool isVarArg() const
Definition: Function.cpp:175
std::vector< ConstraintInfo > ConstraintInfoVector
Definition: InlineAsm.h:118
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget)
bool isScalarFPTypeInSSEReg(EVT VT) const
static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget)
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG)
unsigned AllocateStack(unsigned Size, unsigned Align)
void addSuccessor(MachineBasicBlock *succ, uint32_t weight=0)
static bool clobbersFlagRegisters(const SmallVector< StringRef, 4 > &AsmPieces)
bool isEmpty() const
Return true if there are no attributes.
Definition: Attributes.h:345
void setMemRefs(mmo_iterator NewMemRefs, mmo_iterator NewMemRefsEnd)
static RegisterPass< NVPTXAllocaHoisting > X("alloca-hoisting","Hoisting alloca instructions in non-entry ""blocks to the entry block")
unsigned Log2_64(uint64_t Value)
Definition: MathExtras.h:449
int64_t getObjectSize(int ObjectIdx) const
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
static bool isSplatMask(const int *Mask, EVT VT)
bool isOnlyUserOf(SDNode *N) const
EVT changeVectorElementTypeToInteger() const
Definition: ValueTypes.h:626
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG)
static bool isCommutedMOVLMask(ArrayRef< int > Mask, MVT VT, bool V2IsSplat=false, bool V2IsUndef=false)
bool hasAVX() const
Definition: X86Subtarget.h:265
bool isTargetEnvMacho() const
Definition: X86Subtarget.h:334
#define T1
MVT getSimpleVT() const
Definition: ValueTypes.h:749
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG)
DebugLoc getDebugLoc() const
Definition: MachineInstr.h:244
static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, unsigned NumElems, SelectionDAG &DAG, SDLoc dl)
static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG)
static ConstraintInfoVector ParseConstraints(StringRef ConstraintString)
Definition: InlineAsm.cpp:213
static void ReplaceATOMIC_LOAD(SDNode *Node, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
static bool isMOVSHDUPMask(ArrayRef< int > Mask, MVT VT, const X86Subtarget *Subtarget)
static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget)
static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, bool &isLeft, SDValue &ShVal, unsigned &ShAmt)
bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:110
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, SDLoc dl)
uint64_t getZExtValue() const
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:291
unsigned getVectorNumElements() const
Definition: ValueTypes.h:771