LLVM API Documentation

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
YAMLParser.cpp
Go to the documentation of this file.
1 //===--- YAMLParser.cpp - Simple YAML parser ------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements a YAML parser.
11 //
12 //===----------------------------------------------------------------------===//
13 
15 #include "llvm/ADT/SmallVector.h"
16 #include "llvm/ADT/StringExtras.h"
17 #include "llvm/ADT/Twine.h"
18 #include "llvm/ADT/ilist.h"
19 #include "llvm/ADT/ilist_node.h"
22 #include "llvm/Support/SourceMgr.h"
24 
25 using namespace llvm;
26 using namespace yaml;
27 
29  UEF_UTF32_LE, ///< UTF-32 Little Endian
30  UEF_UTF32_BE, ///< UTF-32 Big Endian
31  UEF_UTF16_LE, ///< UTF-16 Little Endian
32  UEF_UTF16_BE, ///< UTF-16 Big Endian
33  UEF_UTF8, ///< UTF-8 or ascii.
34  UEF_Unknown ///< Not a valid Unicode encoding.
35 };
36 
37 /// EncodingInfo - Holds the encoding type and length of the byte order mark if
38 /// it exists. Length is in {0, 2, 3, 4}.
39 typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo;
40 
41 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
42 /// encoding form of \a Input.
43 ///
44 /// @param Input A string of length 0 or more.
45 /// @returns An EncodingInfo indicating the Unicode encoding form of the input
46 /// and how long the byte order mark is if one exists.
48  if (Input.size() == 0)
49  return std::make_pair(UEF_Unknown, 0);
50 
51  switch (uint8_t(Input[0])) {
52  case 0x00:
53  if (Input.size() >= 4) {
54  if ( Input[1] == 0
55  && uint8_t(Input[2]) == 0xFE
56  && uint8_t(Input[3]) == 0xFF)
57  return std::make_pair(UEF_UTF32_BE, 4);
58  if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0)
59  return std::make_pair(UEF_UTF32_BE, 0);
60  }
61 
62  if (Input.size() >= 2 && Input[1] != 0)
63  return std::make_pair(UEF_UTF16_BE, 0);
64  return std::make_pair(UEF_Unknown, 0);
65  case 0xFF:
66  if ( Input.size() >= 4
67  && uint8_t(Input[1]) == 0xFE
68  && Input[2] == 0
69  && Input[3] == 0)
70  return std::make_pair(UEF_UTF32_LE, 4);
71 
72  if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE)
73  return std::make_pair(UEF_UTF16_LE, 2);
74  return std::make_pair(UEF_Unknown, 0);
75  case 0xFE:
76  if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF)
77  return std::make_pair(UEF_UTF16_BE, 2);
78  return std::make_pair(UEF_Unknown, 0);
79  case 0xEF:
80  if ( Input.size() >= 3
81  && uint8_t(Input[1]) == 0xBB
82  && uint8_t(Input[2]) == 0xBF)
83  return std::make_pair(UEF_UTF8, 3);
84  return std::make_pair(UEF_Unknown, 0);
85  }
86 
87  // It could still be utf-32 or utf-16.
88  if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0)
89  return std::make_pair(UEF_UTF32_LE, 0);
90 
91  if (Input.size() >= 2 && Input[1] == 0)
92  return std::make_pair(UEF_UTF16_LE, 0);
93 
94  return std::make_pair(UEF_UTF8, 0);
95 }
96 
97 namespace llvm {
98 namespace yaml {
99 /// Pin the vtables to this file.
///
/// Each node class gets an empty, out-of-line anchor() definition here.
100 void Node::anchor() {}
101 void NullNode::anchor() {}
102 void ScalarNode::anchor() {}
103 void KeyValueNode::anchor() {}
104 void MappingNode::anchor() {}
105 void SequenceNode::anchor() {}
106 void AliasNode::anchor() {}
107 
108 /// Token - A single YAML token.
109 struct Token : ilist_node<Token> {
110  enum TokenKind {
111  TK_Error, // Uninitialized token.
132  TK_Tag
133  } Kind;
134 
135  /// A string of length 0 or more whose begin() points to the logical location
136  /// of the token in the input.
138 
139  Token() : Kind(TK_Error) {}
140 };
141 }
142 }
143 
144 namespace llvm {
145 template<>
148  return &Sentinel;
149  }
150  static void destroySentinel(Token*) {}
151 
152  Token *provideInitialHead() const { return createSentinel(); }
153  Token *ensureHead(Token*) const { return createSentinel(); }
154  static void noteHead(Token*, Token*) {}
155 
156 private:
157  mutable Token Sentinel;
158 };
159 
160 template<>
162  Token *createNode(const Token &V) {
163  return new (Alloc.Allocate<Token>()) Token(V);
164  }
165  static void deleteNode(Token *V) {}
166 
167  void addNodeToList(Token *) {}
170  ilist_iterator<Token> /*first*/,
171  ilist_iterator<Token> /*last*/) {}
172 
174 };
175 }
176 
178 
179 namespace {
180 /// @brief This struct is used to track simple keys.
181 ///
182 /// Simple keys are handled by creating an entry in SimpleKeys for each Token
183 /// which could legally be the start of a simple key. When peekNext is called,
184 /// if the Token to be returned is referenced by a SimpleKey, we continue
185 /// tokenizing until that potential simple key has either been found not to be
186 /// a simple key (we moved on to the next line or went further than 1024 chars),
187 /// or we run into a Value, at which point a Key token (and possibly others) is
188 /// inserted before the SimpleKey's Tok.
189 struct SimpleKey {
191  unsigned Column;
192  unsigned Line;
193  unsigned FlowLevel;
194  bool IsRequired;
195 
196  bool operator ==(const SimpleKey &Other) {
197  return Tok == Other.Tok;
198  }
199 };
200 }
201 
202 /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit
203 /// subsequence and the subsequence's length in code units (uint8_t).
204 /// A length of 0 represents an error.
205 typedef std::pair<uint32_t, unsigned> UTF8Decoded;
206 
208  StringRef::iterator Position= Range.begin();
209  StringRef::iterator End = Range.end();
210  // 1 byte: [0x00, 0x7f]
211  // Bit pattern: 0xxxxxxx
212  if ((*Position & 0x80) == 0) {
213  return std::make_pair(*Position, 1);
214  }
215  // 2 bytes: [0x80, 0x7ff]
216  // Bit pattern: 110xxxxx 10xxxxxx
217  if (Position + 1 != End &&
218  ((*Position & 0xE0) == 0xC0) &&
219  ((*(Position + 1) & 0xC0) == 0x80)) {
220  uint32_t codepoint = ((*Position & 0x1F) << 6) |
221  (*(Position + 1) & 0x3F);
222  if (codepoint >= 0x80)
223  return std::make_pair(codepoint, 2);
224  }
225  // 3 bytes: [0x800, 0xffff]
226  // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx
227  if (Position + 2 != End &&
228  ((*Position & 0xF0) == 0xE0) &&
229  ((*(Position + 1) & 0xC0) == 0x80) &&
230  ((*(Position + 2) & 0xC0) == 0x80)) {
231  uint32_t codepoint = ((*Position & 0x0F) << 12) |
232  ((*(Position + 1) & 0x3F) << 6) |
233  (*(Position + 2) & 0x3F);
234  // Codepoints between 0xD800 and 0xDFFF are invalid, as
235  // they are high / low surrogate halves used by UTF-16.
236  if (codepoint >= 0x800 &&
237  (codepoint < 0xD800 || codepoint > 0xDFFF))
238  return std::make_pair(codepoint, 3);
239  }
240  // 4 bytes: [0x10000, 0x10FFFF]
241  // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
242  if (Position + 3 != End &&
243  ((*Position & 0xF8) == 0xF0) &&
244  ((*(Position + 1) & 0xC0) == 0x80) &&
245  ((*(Position + 2) & 0xC0) == 0x80) &&
246  ((*(Position + 3) & 0xC0) == 0x80)) {
247  uint32_t codepoint = ((*Position & 0x07) << 18) |
248  ((*(Position + 1) & 0x3F) << 12) |
249  ((*(Position + 2) & 0x3F) << 6) |
250  (*(Position + 3) & 0x3F);
251  if (codepoint >= 0x10000 && codepoint <= 0x10FFFF)
252  return std::make_pair(codepoint, 4);
253  }
254  return std::make_pair(0, 0);
255 }
256 
257 namespace llvm {
258 namespace yaml {
259 /// @brief Scans YAML tokens from a MemoryBuffer.
260 class Scanner {
261 public:
262  Scanner(const StringRef Input, SourceMgr &SM);
263  Scanner(MemoryBuffer *Buffer, SourceMgr &SM_);
264 
265  /// @brief Parse the next token and return it without popping it.
266  Token &peekNext();
267 
268  /// @brief Parse the next token and pop it from the queue.
269  Token getNext();
270 
/// @brief Emit a diagnostic through the SourceMgr. All arguments are
/// forwarded unchanged to SourceMgr::PrintMessage.
271  void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message,
272  ArrayRef<SMRange> Ranges = None) {
273  SM.PrintMessage(Loc, Kind, Message, Ranges);
274  }
275 
276  void setError(const Twine &Message, StringRef::iterator Position) {
277  if (Current >= End)
278  Current = End - 1;
279 
280  // Don't print out more errors after the first one we encounter. The rest
281  // are just the result of the first, and have no meaning.
282  if (!Failed)
283  printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message);
284  Failed = true;
285  }
286 
/// @brief Report \a Message using the scanner's current position (Current)
/// as the error location.
287  void setError(const Twine &Message) {
288  setError(Message, Current);
289  }
290 
291  /// @brief Returns true if an error occurred while parsing.
///
/// This is the Failed flag, which setError() sets.
292  bool failed() {
293  return Failed;
294  }
295 
296 private:
/// @brief The part of the input that runs from Current up to End.
297  StringRef currentInput() {
298  return StringRef(Current, End - Current);
299  }
300 
301  /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting
302  /// at \a Position.
303  ///
304  /// If the UTF-8 code units starting at Position do not form a well-formed
305  /// code unit subsequence, then the Unicode scalar value is 0, and the length
306  /// is 0.
308  return ::decodeUTF8(StringRef(Position, End - Position));
309  }
310 
311  // The following functions are based on the grammar rules in the YAML spec. The
312  // style of the function names is meant to closely match how they are written
313  // in the spec. The number within the [] is the number of the grammar rule in
314  // the spec.
315  //
316  // See 4.2 [Production Naming Conventions] for the meaning of the prefixes.
317  //
318  // c-
319  // A production starting and ending with a special character.
320  // b-
321  // A production matching a single line break.
322  // nb-
323  // A production starting and ending with a non-break character.
324  // s-
325  // A production starting and ending with a white space character.
326  // ns-
327  // A production starting and ending with a non-space character.
328  // l-
329  // A production matching complete line(s).
330 
331  /// @brief Skip a single nb-char[27] starting at Position.
332  ///
333  /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE]
334  /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF]
335  ///
336  /// @returns The code unit after the nb-char, or Position if it's not an
337  /// nb-char.
338  StringRef::iterator skip_nb_char(StringRef::iterator Position);
339 
340  /// @brief Skip a single b-break[28] starting at Position.
341  ///
342  /// A b-break is 0xD 0xA | 0xD | 0xA
343  ///
344  /// @returns The code unit after the b-break, or Position if it's not a
345  /// b-break.
346  StringRef::iterator skip_b_break(StringRef::iterator Position);
347 
348  /// @brief Skip a single s-white[33] starting at Position.
349  ///
350  /// A s-white is 0x20 | 0x9
351  ///
352  /// @returns The code unit after the s-white, or Position if it's not a
353  /// s-white.
354  StringRef::iterator skip_s_white(StringRef::iterator Position);
355 
356  /// @brief Skip a single ns-char[34] starting at Position.
357  ///
358  /// A ns-char is nb-char - s-white
359  ///
360  /// @returns The code unit after the ns-char, or Position if it's not a
361  /// ns-char.
362  StringRef::iterator skip_ns_char(StringRef::iterator Position);
363 
364  typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator);
365  /// @brief Skip minimal well-formed code unit subsequences until Func
366  /// returns its input.
367  ///
368  /// @returns The code unit after the last minimal well-formed code unit
369  /// subsequence that Func accepted.
370  StringRef::iterator skip_while( SkipWhileFunc Func
371  , StringRef::iterator Position);
372 
373  /// @brief Scan ns-uri-char[39]s starting at Cur.
374  ///
375  /// This updates Cur and Column while scanning.
376  ///
377  /// @returns A StringRef starting at Cur which covers the longest contiguous
378  /// sequence of ns-uri-char.
379  StringRef scan_ns_uri_char();
380 
381  /// @brief Scan ns-plain-one-line[133] starting at \a Cur.
382  StringRef scan_ns_plain_one_line();
383 
384  /// @brief Consume a minimal well-formed code unit subsequence starting at
385  /// \a Cur. Return false if it is not the same Unicode scalar value as
386  /// \a Expected. This updates \a Column.
387  bool consume(uint32_t Expected);
388 
389  /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column.
390  void skip(uint32_t Distance);
391 
392  /// @brief Return true if the minimal well-formed code unit subsequence at
393  /// Pos is whitespace or a new line
394  bool isBlankOrBreak(StringRef::iterator Position);
395 
396  /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
397  void saveSimpleKeyCandidate( TokenQueueT::iterator Tok
398  , unsigned AtColumn
399  , bool IsRequired);
400 
401  /// @brief Remove simple keys that can no longer be valid simple keys.
402  ///
403  /// Invalid simple keys are not on the current line or are further than 1024
404  /// columns back.
405  void removeStaleSimpleKeyCandidates();
406 
407  /// @brief Remove all simple keys on FlowLevel \a Level.
408  void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level);
409 
410  /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd
411  /// tokens if needed.
412  bool unrollIndent(int ToColumn);
413 
414  /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint
415  /// if needed.
416  bool rollIndent( int ToColumn
418  , TokenQueueT::iterator InsertPoint);
419 
420  /// @brief Skip whitespace and comments until the start of the next token.
421  void scanToNextToken();
422 
423  /// @brief Must be the first token generated.
424  bool scanStreamStart();
425 
426  /// @brief Generate tokens needed to close out the stream.
427  bool scanStreamEnd();
428 
429  /// @brief Scan a %BLAH directive.
430  bool scanDirective();
431 
432  /// @brief Scan a ... or ---.
433  bool scanDocumentIndicator(bool IsStart);
434 
435  /// @brief Scan a [ or { and generate the proper flow collection start token.
436  bool scanFlowCollectionStart(bool IsSequence);
437 
438  /// @brief Scan a ] or } and generate the proper flow collection end token.
439  bool scanFlowCollectionEnd(bool IsSequence);
440 
441  /// @brief Scan the , that separates entries in a flow collection.
442  bool scanFlowEntry();
443 
444  /// @brief Scan the - that starts block sequence entries.
445  bool scanBlockEntry();
446 
447  /// @brief Scan an explicit ? indicating a key.
448  bool scanKey();
449 
450  /// @brief Scan an explicit : indicating a value.
451  bool scanValue();
452 
453  /// @brief Scan a quoted scalar.
454  bool scanFlowScalar(bool IsDoubleQuoted);
455 
456  /// @brief Scan an unquoted scalar.
457  bool scanPlainScalar();
458 
459  /// @brief Scan an Alias or Anchor starting with * or &.
460  bool scanAliasOrAnchor(bool IsAlias);
461 
462  /// @brief Scan a block scalar starting with | or >.
463  bool scanBlockScalar(bool IsLiteral);
464 
465  /// @brief Scan a tag of the form !stuff.
466  bool scanTag();
467 
468  /// @brief Dispatch to the next scanning function based on \a *Cur.
469  bool fetchMoreTokens();
470 
471  /// @brief The SourceMgr used for diagnostics and buffer management.
472  SourceMgr &SM;
473 
474  /// @brief The original input.
475  MemoryBuffer *InputBuffer;
476 
477  /// @brief The current position of the scanner.
478  StringRef::iterator Current;
479 
480  /// @brief The end of the input (one past the last character).
482 
483  /// @brief Current YAML indentation level in spaces.
484  int Indent;
485 
486  /// @brief Current column number in Unicode code points.
487  unsigned Column;
488 
489  /// @brief Current line number.
490  unsigned Line;
491 
492  /// @brief How deep we are in flow style containers. 0 Means at block level.
493  unsigned FlowLevel;
494 
495  /// @brief Are we at the start of the stream?
496  bool IsStartOfStream;
497 
498  /// @brief Can the next token be the start of a simple key?
499  bool IsSimpleKeyAllowed;
500 
501  /// @brief True if an error has occurred.
502  bool Failed;
503 
504  /// @brief Queue of tokens. This is required to queue up tokens while looking
505  /// for the end of a simple key. And for cases where a single character
506  /// can produce multiple tokens (e.g. BlockEnd).
507  TokenQueueT TokenQueue;
508 
509  /// @brief Indentation levels.
510  SmallVector<int, 4> Indents;
511 
512  /// @brief Potential simple keys.
513  SmallVector<SimpleKey, 4> SimpleKeys;
514 };
515 
516 } // end namespace yaml
517 } // end namespace llvm
518 
519 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result.
520 static void encodeUTF8( uint32_t UnicodeScalarValue
521  , SmallVectorImpl<char> &Result) {
522  if (UnicodeScalarValue <= 0x7F) {
523  Result.push_back(UnicodeScalarValue & 0x7F);
524  } else if (UnicodeScalarValue <= 0x7FF) {
525  uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6);
526  uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F);
527  Result.push_back(FirstByte);
528  Result.push_back(SecondByte);
529  } else if (UnicodeScalarValue <= 0xFFFF) {
530  uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12);
531  uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
532  uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F);
533  Result.push_back(FirstByte);
534  Result.push_back(SecondByte);
535  Result.push_back(ThirdByte);
536  } else if (UnicodeScalarValue <= 0x10FFFF) {
537  uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18);
538  uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12);
539  uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
540  uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F);
541  Result.push_back(FirstByte);
542  Result.push_back(SecondByte);
543  Result.push_back(ThirdByte);
544  Result.push_back(FourthByte);
545  }
546 }
547 
549  SourceMgr SM;
550  Scanner scanner(Input, SM);
551  while (true) {
552  Token T = scanner.getNext();
553  switch (T.Kind) {
555  OS << "Stream-Start: ";
556  break;
557  case Token::TK_StreamEnd:
558  OS << "Stream-End: ";
559  break;
561  OS << "Version-Directive: ";
562  break;
564  OS << "Tag-Directive: ";
565  break;
567  OS << "Document-Start: ";
568  break;
570  OS << "Document-End: ";
571  break;
573  OS << "Block-Entry: ";
574  break;
575  case Token::TK_BlockEnd:
576  OS << "Block-End: ";
577  break;
579  OS << "Block-Sequence-Start: ";
580  break;
582  OS << "Block-Mapping-Start: ";
583  break;
584  case Token::TK_FlowEntry:
585  OS << "Flow-Entry: ";
586  break;
588  OS << "Flow-Sequence-Start: ";
589  break;
591  OS << "Flow-Sequence-End: ";
592  break;
594  OS << "Flow-Mapping-Start: ";
595  break;
597  OS << "Flow-Mapping-End: ";
598  break;
599  case Token::TK_Key:
600  OS << "Key: ";
601  break;
602  case Token::TK_Value:
603  OS << "Value: ";
604  break;
605  case Token::TK_Scalar:
606  OS << "Scalar: ";
607  break;
608  case Token::TK_Alias:
609  OS << "Alias: ";
610  break;
611  case Token::TK_Anchor:
612  OS << "Anchor: ";
613  break;
614  case Token::TK_Tag:
615  OS << "Tag: ";
616  break;
617  case Token::TK_Error:
618  break;
619  }
620  OS << T.Range << "\n";
621  if (T.Kind == Token::TK_StreamEnd)
622  break;
623  else if (T.Kind == Token::TK_Error)
624  return false;
625  }
626  return true;
627 }
628 
630  llvm::SourceMgr SM;
631  llvm::yaml::Scanner scanner(Input, SM);
632  for (;;) {
633  llvm::yaml::Token T = scanner.getNext();
634  if (T.Kind == Token::TK_StreamEnd)
635  break;
636  else if (T.Kind == Token::TK_Error)
637  return false;
638  }
639  return true;
640 }
641 
643  std::string EscapedInput;
644  for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) {
645  if (*i == '\\')
646  EscapedInput += "\\\\";
647  else if (*i == '"')
648  EscapedInput += "\\\"";
649  else if (*i == 0)
650  EscapedInput += "\\0";
651  else if (*i == 0x07)
652  EscapedInput += "\\a";
653  else if (*i == 0x08)
654  EscapedInput += "\\b";
655  else if (*i == 0x09)
656  EscapedInput += "\\t";
657  else if (*i == 0x0A)
658  EscapedInput += "\\n";
659  else if (*i == 0x0B)
660  EscapedInput += "\\v";
661  else if (*i == 0x0C)
662  EscapedInput += "\\f";
663  else if (*i == 0x0D)
664  EscapedInput += "\\r";
665  else if (*i == 0x1B)
666  EscapedInput += "\\e";
667  else if ((unsigned char)*i < 0x20) { // Control characters not handled above.
668  std::string HexStr = utohexstr(*i);
669  EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
670  } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence.
671  UTF8Decoded UnicodeScalarValue
672  = decodeUTF8(StringRef(i, Input.end() - i));
673  if (UnicodeScalarValue.second == 0) {
674  // Found invalid char.
675  SmallString<4> Val;
676  encodeUTF8(0xFFFD, Val);
677  EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end());
678  // FIXME: Error reporting.
679  return EscapedInput;
680  }
681  if (UnicodeScalarValue.first == 0x85)
682  EscapedInput += "\\N";
683  else if (UnicodeScalarValue.first == 0xA0)
684  EscapedInput += "\\_";
685  else if (UnicodeScalarValue.first == 0x2028)
686  EscapedInput += "\\L";
687  else if (UnicodeScalarValue.first == 0x2029)
688  EscapedInput += "\\P";
689  else {
690  std::string HexStr = utohexstr(UnicodeScalarValue.first);
691  if (HexStr.size() <= 2)
692  EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
693  else if (HexStr.size() <= 4)
694  EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr;
695  else if (HexStr.size() <= 8)
696  EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr;
697  }
698  i += UnicodeScalarValue.second - 1;
699  } else
700  EscapedInput.push_back(*i);
701  }
702  return EscapedInput;
703 }
704 
706  : SM(sm)
707  , Indent(-1)
708  , Column(0)
709  , Line(0)
710  , FlowLevel(0)
711  , IsStartOfStream(true)
712  , IsSimpleKeyAllowed(true)
713  , Failed(false) {
714  InputBuffer = MemoryBuffer::getMemBuffer(Input, "YAML");
715  SM.AddNewSourceBuffer(InputBuffer, SMLoc());
716  Current = InputBuffer->getBufferStart();
717  End = InputBuffer->getBufferEnd();
718 }
719 
721  : SM(SM_)
722  , InputBuffer(Buffer)
723  , Current(InputBuffer->getBufferStart())
724  , End(InputBuffer->getBufferEnd())
725  , Indent(-1)
726  , Column(0)
727  , Line(0)
728  , FlowLevel(0)
729  , IsStartOfStream(true)
730  , IsSimpleKeyAllowed(true)
731  , Failed(false) {
732  SM.AddNewSourceBuffer(InputBuffer, SMLoc());
733 }
734 
736  // If the current token is a possible simple key, keep parsing until we
737  // can confirm.
738  bool NeedMore = false;
739  while (true) {
740  if (TokenQueue.empty() || NeedMore) {
741  if (!fetchMoreTokens()) {
742  TokenQueue.clear();
743  TokenQueue.push_back(Token());
744  return TokenQueue.front();
745  }
746  }
747  assert(!TokenQueue.empty() &&
748  "fetchMoreTokens lied about getting tokens!");
749 
750  removeStaleSimpleKeyCandidates();
751  SimpleKey SK;
752  SK.Tok = TokenQueue.front();
753  if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK)
754  == SimpleKeys.end())
755  break;
756  else
757  NeedMore = true;
758  }
759  return TokenQueue.front();
760 }
761 
763  Token Ret = peekNext();
764  // TokenQueue can be empty if there was an error getting the next token.
765  if (!TokenQueue.empty())
766  TokenQueue.pop_front();
767 
768  // There cannot be any referenced Token's if the TokenQueue is empty. So do a
769  // quick deallocation of them all.
770  if (TokenQueue.empty()) {
771  TokenQueue.Alloc.Reset();
772  }
773 
774  return Ret;
775 }
776 
777 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) {
778  if (Position == End)
779  return Position;
780  // Check 7 bit c-printable - b-char.
781  if ( *Position == 0x09
782  || (*Position >= 0x20 && *Position <= 0x7E))
783  return Position + 1;
784 
785  // Check for valid UTF-8.
786  if (uint8_t(*Position) & 0x80) {
787  UTF8Decoded u8d = decodeUTF8(Position);
788  if ( u8d.second != 0
789  && u8d.first != 0xFEFF
790  && ( u8d.first == 0x85
791  || ( u8d.first >= 0xA0
792  && u8d.first <= 0xD7FF)
793  || ( u8d.first >= 0xE000
794  && u8d.first <= 0xFFFD)
795  || ( u8d.first >= 0x10000
796  && u8d.first <= 0x10FFFF)))
797  return Position + u8d.second;
798  }
799  return Position;
800 }
801 
802 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) {
803  if (Position == End)
804  return Position;
805  if (*Position == 0x0D) {
806  if (Position + 1 != End && *(Position + 1) == 0x0A)
807  return Position + 2;
808  return Position + 1;
809  }
810 
811  if (*Position == 0x0A)
812  return Position + 1;
813  return Position;
814 }
815 
816 
817 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) {
818  if (Position == End)
819  return Position;
820  if (*Position == ' ' || *Position == '\t')
821  return Position + 1;
822  return Position;
823 }
824 
825 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) {
826  if (Position == End)
827  return Position;
828  if (*Position == ' ' || *Position == '\t')
829  return Position;
830  return skip_nb_char(Position);
831 }
832 
833 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func
834  , StringRef::iterator Position) {
835  while (true) {
836  StringRef::iterator i = (this->*Func)(Position);
837  if (i == Position)
838  break;
839  Position = i;
840  }
841  return Position;
842 }
843 
/// @brief Return true if \a C is an ns-hex-digit[36]: a decimal digit or one
/// of the letters a-f / A-F.
///
/// Letters beyond 'f' / 'F' are not hexadecimal digits and are rejected, so a
/// sequence such as "%zz" is not treated as a percent escape.
static bool is_ns_hex_digit(const char C) {
  return (C >= '0' && C <= '9')
      || (C >= 'a' && C <= 'f')
      || (C >= 'A' && C <= 'F');
}
849 
/// @brief Return true if \a C is an ns-word-char[38]: a decimal digit, an
/// ASCII letter, or '-'.
///
/// Digits are part of the production; without them a URI such as
/// "tag:yaml.org,2002:str" would stop being scanned at the first digit.
static bool is_ns_word_char(const char C) {
  return C == '-'
      || (C >= '0' && C <= '9')
      || (C >= 'a' && C <= 'z')
      || (C >= 'A' && C <= 'Z');
}
855 
856 StringRef Scanner::scan_ns_uri_char() {
857  StringRef::iterator Start = Current;
858  while (true) {
859  if (Current == End)
860  break;
861  if (( *Current == '%'
862  && Current + 2 < End
863  && is_ns_hex_digit(*(Current + 1))
864  && is_ns_hex_digit(*(Current + 2)))
865  || is_ns_word_char(*Current)
866  || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]")
867  != StringRef::npos) {
868  ++Current;
869  ++Column;
870  } else
871  break;
872  }
873  return StringRef(Start, Current - Start);
874 }
875 
876 StringRef Scanner::scan_ns_plain_one_line() {
877  StringRef::iterator start = Current;
878  // The first character must already be verified.
879  ++Current;
880  while (true) {
881  if (Current == End) {
882  break;
883  } else if (*Current == ':') {
884  // Check if the next character is a ns-char.
885  if (Current + 1 == End)
886  break;
887  StringRef::iterator i = skip_ns_char(Current + 1);
888  if (Current + 1 != i) {
889  Current = i;
890  Column += 2; // Consume both the ':' and ns-char.
891  } else
892  break;
893  } else if (*Current == '#') {
894  // Check if the previous character was a ns-char.
895  // The & 0x80 check is to check for the trailing byte of a utf-8
896  if (*(Current - 1) & 0x80 || skip_ns_char(Current - 1) == Current) {
897  ++Current;
898  ++Column;
899  } else
900  break;
901  } else {
902  StringRef::iterator i = skip_nb_char(Current);
903  if (i == Current)
904  break;
905  Current = i;
906  ++Column;
907  }
908  }
909  return StringRef(start, Current - start);
910 }
911 
912 bool Scanner::consume(uint32_t Expected) {
913  if (Expected >= 0x80)
914  report_fatal_error("Not dealing with this yet");
915  if (Current == End)
916  return false;
917  if (uint8_t(*Current) >= 0x80)
918  report_fatal_error("Not dealing with this yet");
919  if (uint8_t(*Current) == Expected) {
920  ++Current;
921  ++Column;
922  return true;
923  }
924  return false;
925 }
926 
927 void Scanner::skip(uint32_t Distance) {
928  Current += Distance;
929  Column += Distance;
930  assert(Current <= End && "Skipped past the end");
931 }
932 
933 bool Scanner::isBlankOrBreak(StringRef::iterator Position) {
934  if (Position == End)
935  return false;
936  if ( *Position == ' ' || *Position == '\t'
937  || *Position == '\r' || *Position == '\n')
938  return true;
939  return false;
940 }
941 
942 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok
943  , unsigned AtColumn
944  , bool IsRequired) {
945  if (IsSimpleKeyAllowed) {
946  SimpleKey SK;
947  SK.Tok = Tok;
948  SK.Line = Line;
949  SK.Column = AtColumn;
950  SK.IsRequired = IsRequired;
951  SK.FlowLevel = FlowLevel;
952  SimpleKeys.push_back(SK);
953  }
954 }
955 
956 void Scanner::removeStaleSimpleKeyCandidates() {
957  for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin();
958  i != SimpleKeys.end();) {
959  if (i->Line != Line || i->Column + 1024 < Column) {
960  if (i->IsRequired)
961  setError( "Could not find expected : for simple key"
962  , i->Tok->Range.begin());
963  i = SimpleKeys.erase(i);
964  } else
965  ++i;
966  }
967 }
968 
969 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) {
970  if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level)
971  SimpleKeys.pop_back();
972 }
973 
974 bool Scanner::unrollIndent(int ToColumn) {
975  Token T;
976  // Indentation is ignored in flow.
977  if (FlowLevel != 0)
978  return true;
979 
980  while (Indent > ToColumn) {
982  T.Range = StringRef(Current, 1);
983  TokenQueue.push_back(T);
984  Indent = Indents.pop_back_val();
985  }
986 
987  return true;
988 }
989 
990 bool Scanner::rollIndent( int ToColumn
992  , TokenQueueT::iterator InsertPoint) {
993  if (FlowLevel)
994  return true;
995  if (Indent < ToColumn) {
996  Indents.push_back(Indent);
997  Indent = ToColumn;
998 
999  Token T;
1000  T.Kind = Kind;
1001  T.Range = StringRef(Current, 0);
1002  TokenQueue.insert(InsertPoint, T);
1003  }
1004  return true;
1005 }
1006 
1007 void Scanner::scanToNextToken() {
1008  while (true) {
1009  while (*Current == ' ' || *Current == '\t') {
1010  skip(1);
1011  }
1012 
1013  // Skip comment.
1014  if (*Current == '#') {
1015  while (true) {
1016  // This may skip more than one byte, thus Column is only incremented
1017  // for code points.
1018  StringRef::iterator i = skip_nb_char(Current);
1019  if (i == Current)
1020  break;
1021  Current = i;
1022  ++Column;
1023  }
1024  }
1025 
1026  // Skip EOL.
1027  StringRef::iterator i = skip_b_break(Current);
1028  if (i == Current)
1029  break;
1030  Current = i;
1031  ++Line;
1032  Column = 0;
1033  // New lines may start a simple key.
1034  if (!FlowLevel)
1035  IsSimpleKeyAllowed = true;
1036  }
1037 }
1038 
1039 bool Scanner::scanStreamStart() {
1040  IsStartOfStream = false;
1041 
1042  EncodingInfo EI = getUnicodeEncoding(currentInput());
1043 
1044  Token T;
1046  T.Range = StringRef(Current, EI.second);
1047  TokenQueue.push_back(T);
1048  Current += EI.second;
1049  return true;
1050 }
1051 
1052 bool Scanner::scanStreamEnd() {
1053  // Force an ending new line if one isn't present.
1054  if (Column != 0) {
1055  Column = 0;
1056  ++Line;
1057  }
1058 
1059  unrollIndent(-1);
1060  SimpleKeys.clear();
1061  IsSimpleKeyAllowed = false;
1062 
1063  Token T;
1065  T.Range = StringRef(Current, 0);
1066  TokenQueue.push_back(T);
1067  return true;
1068 }
1069 
1070 bool Scanner::scanDirective() {
1071  // Reset the indentation level.
1072  unrollIndent(-1);
1073  SimpleKeys.clear();
1074  IsSimpleKeyAllowed = false;
1075 
1076  StringRef::iterator Start = Current;
1077  consume('%');
1078  StringRef::iterator NameStart = Current;
1079  Current = skip_while(&Scanner::skip_ns_char, Current);
1080  StringRef Name(NameStart, Current - NameStart);
1081  Current = skip_while(&Scanner::skip_s_white, Current);
1082 
1083  Token T;
1084  if (Name == "YAML") {
1085  Current = skip_while(&Scanner::skip_ns_char, Current);
1087  T.Range = StringRef(Start, Current - Start);
1088  TokenQueue.push_back(T);
1089  return true;
1090  } else if(Name == "TAG") {
1091  Current = skip_while(&Scanner::skip_ns_char, Current);
1092  Current = skip_while(&Scanner::skip_s_white, Current);
1093  Current = skip_while(&Scanner::skip_ns_char, Current);
1095  T.Range = StringRef(Start, Current - Start);
1096  TokenQueue.push_back(T);
1097  return true;
1098  }
1099  return false;
1100 }
1101 
1102 bool Scanner::scanDocumentIndicator(bool IsStart) {
1103  unrollIndent(-1);
1104  SimpleKeys.clear();
1105  IsSimpleKeyAllowed = false;
1106 
1107  Token T;
1109  T.Range = StringRef(Current, 3);
1110  skip(3);
1111  TokenQueue.push_back(T);
1112  return true;
1113 }
1114 
1115 bool Scanner::scanFlowCollectionStart(bool IsSequence) {
1116  Token T;
1117  T.Kind = IsSequence ? Token::TK_FlowSequenceStart
1119  T.Range = StringRef(Current, 1);
1120  skip(1);
1121  TokenQueue.push_back(T);
1122 
1123  // [ and { may begin a simple key.
1124  saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false);
1125 
1126  // And may also be followed by a simple key.
1127  IsSimpleKeyAllowed = true;
1128  ++FlowLevel;
1129  return true;
1130 }
1131 
1132 bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
1133  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1134  IsSimpleKeyAllowed = false;
1135  Token T;
1136  T.Kind = IsSequence ? Token::TK_FlowSequenceEnd
1138  T.Range = StringRef(Current, 1);
1139  skip(1);
1140  TokenQueue.push_back(T);
1141  if (FlowLevel)
1142  --FlowLevel;
1143  return true;
1144 }
1145 
1146 bool Scanner::scanFlowEntry() {
1147  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1148  IsSimpleKeyAllowed = true;
1149  Token T;
1151  T.Range = StringRef(Current, 1);
1152  skip(1);
1153  TokenQueue.push_back(T);
1154  return true;
1155 }
1156 
1157 bool Scanner::scanBlockEntry() {
1158  rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end());
1159  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1160  IsSimpleKeyAllowed = true;
1161  Token T;
1163  T.Range = StringRef(Current, 1);
1164  skip(1);
1165  TokenQueue.push_back(T);
1166  return true;
1167 }
1168 
1169 bool Scanner::scanKey() {
1170  if (!FlowLevel)
1171  rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
1172 
1173  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1174  IsSimpleKeyAllowed = !FlowLevel;
1175 
1176  Token T;
1177  T.Kind = Token::TK_Key;
1178  T.Range = StringRef(Current, 1);
1179  skip(1);
1180  TokenQueue.push_back(T);
1181  return true;
1182 }
1183 
1184 bool Scanner::scanValue() {
1185  // If the previous token could have been a simple key, insert the key token
1186  // into the token queue.
1187  if (!SimpleKeys.empty()) {
1188  SimpleKey SK = SimpleKeys.pop_back_val();
1189  Token T;
1190  T.Kind = Token::TK_Key;
1191  T.Range = SK.Tok->Range;
1192  TokenQueueT::iterator i, e;
1193  for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) {
1194  if (i == SK.Tok)
1195  break;
1196  }
1197  assert(i != e && "SimpleKey not in token queue!");
1198  i = TokenQueue.insert(i, T);
1199 
1200  // We may also need to add a Block-Mapping-Start token.
1201  rollIndent(SK.Column, Token::TK_BlockMappingStart, i);
1202 
1203  IsSimpleKeyAllowed = false;
1204  } else {
1205  if (!FlowLevel)
1206  rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
1207  IsSimpleKeyAllowed = !FlowLevel;
1208  }
1209 
1210  Token T;
1211  T.Kind = Token::TK_Value;
1212  T.Range = StringRef(Current, 1);
1213  skip(1);
1214  TokenQueue.push_back(T);
1215  return true;
1216 }
1217 
1218 // Forbidding inlining improves performance by roughly 20%.
1219 // FIXME: Remove once llvm optimizes this to the faster version without hints.
1220 LLVM_ATTRIBUTE_NOINLINE static bool
1222 
1223 // Returns whether a character at 'Position' was escaped with a leading '\'.
1224 // 'First' specifies the position of the first character in the string.
1226  StringRef::iterator Position) {
1227  assert(Position - 1 >= First);
1228  StringRef::iterator I = Position - 1;
1229  // We calculate the number of consecutive '\'s before the current position
1230  // by iterating backwards through our string.
1231  while (I >= First && *I == '\\') --I;
1232  // (Position - 1 - I) now contains the number of '\'s before the current
1233  // position. If it is odd, the character at 'Position' was escaped.
1234  return (Position - 1 - I) % 2 == 1;
1235 }
1236 
1237 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
1238  StringRef::iterator Start = Current;
1239  unsigned ColStart = Column;
1240  if (IsDoubleQuoted) {
1241  do {
1242  ++Current;
1243  while (Current != End && *Current != '"')
1244  ++Current;
1245  // Repeat until the previous character was not a '\' or was an escaped
1246  // backslash.
1247  } while ( Current != End
1248  && *(Current - 1) == '\\'
1249  && wasEscaped(Start + 1, Current));
1250  } else {
1251  skip(1);
1252  while (true) {
1253  // Skip a ' followed by another '.
1254  if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') {
1255  skip(2);
1256  continue;
1257  } else if (*Current == '\'')
1258  break;
1259  StringRef::iterator i = skip_nb_char(Current);
1260  if (i == Current) {
1261  i = skip_b_break(Current);
1262  if (i == Current)
1263  break;
1264  Current = i;
1265  Column = 0;
1266  ++Line;
1267  } else {
1268  if (i == End)
1269  break;
1270  Current = i;
1271  ++Column;
1272  }
1273  }
1274  }
1275 
1276  if (Current == End) {
1277  setError("Expected quote at end of scalar", Current);
1278  return false;
1279  }
1280 
1281  skip(1); // Skip ending quote.
1282  Token T;
1283  T.Kind = Token::TK_Scalar;
1284  T.Range = StringRef(Start, Current - Start);
1285  TokenQueue.push_back(T);
1286 
1287  saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
1288 
1289  IsSimpleKeyAllowed = false;
1290 
1291  return true;
1292 }
1293 
/// Scan a plain (unquoted) scalar starting at Current and queue a TK_Scalar
/// token for it.
///
/// The scalar may continue across blanks and line breaks. In block context it
/// ends once a continuation line is indented less than Indent + 1.
///
/// @returns true on success; false, with an error set, on an unexpected ':'
///          inside a flow collection, a tab in the indentation, or an empty
///          scalar.
1294 bool Scanner::scanPlainScalar() {
1295  StringRef::iterator Start = Current;
1296  unsigned ColStart = Column;
// Becomes non-zero once a line break has been seen while eating blanks.
1297  unsigned LeadingBlanks = 0;
1298  assert(Indent >= -1 && "Indent must be >= -1 !");
// Minimum column a continuation line must reach in block context.
1299  unsigned indent = static_cast<unsigned>(Indent + 1);
// Each iteration consumes one run of non-blank characters and then the
// blanks/breaks that follow it.
1300  while (true) {
// A '#' at the start of a run ends the scalar.
1301  if (*Current == '#')
1302  break;
1303 
1304  while (!isBlankOrBreak(Current)) {
// Inside a flow collection a ':' must be followed by a blank, break or ','.
1305  if ( FlowLevel && *Current == ':'
1306  && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) {
1307  setError("Found unexpected ':' while scanning a plain scalar", Current);
1308  return false;
1309  }
1310 
1311  // Check for the end of the plain scalar.
1312  if ( (*Current == ':' && isBlankOrBreak(Current + 1))
1313  || ( FlowLevel
1314  && (StringRef(Current, 1).find_first_of(",:?[]{}")
1315  != StringRef::npos)))
1316  break;
1317 
1318  StringRef::iterator i = skip_nb_char(Current);
1319  if (i == Current)
1320  break;
1321  Current = i;
1322  ++Column;
1323  }
1324 
1325  // Are we at the end?
1326  if (!isBlankOrBreak(Current))
1327  break;
1328 
1329  // Eat blanks.
// Tmp looks ahead; Current is only advanced past the blanks if the scalar
// turns out to continue. Column and Line are updated as Tmp advances.
1330  StringRef::iterator Tmp = Current;
1331  while (isBlankOrBreak(Tmp)) {
1332  StringRef::iterator i = skip_s_white(Tmp);
1333  if (i != Tmp) {
1334  if (LeadingBlanks && (Column < indent) && *Tmp == '\t') {
1335  setError("Found invalid tab character in indentation", Tmp);
1336  return false;
1337  }
1338  Tmp = i;
1339  ++Column;
1340  } else {
1341  i = skip_b_break(Tmp);
1342  if (!LeadingBlanks)
1343  LeadingBlanks = 1;
1344  Tmp = i;
1345  Column = 0;
1346  ++Line;
1347  }
1348  }
1349 
// In block context a line indented less than required ends the scalar; the
// trailing blanks are left out of the token.
1350  if (!FlowLevel && Column < indent)
1351  break;
1352 
1353  Current = Tmp;
1354  }
1355  if (Start == Current) {
1356  setError("Got empty plain scalar", Start);
1357  return false;
1358  }
1359  Token T;
1360  T.Kind = Token::TK_Scalar;
1361  T.Range = StringRef(Start, Current - Start);
1362  TokenQueue.push_back(T);
1363 
1364  // Plain scalars can be simple keys.
1365  saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
1366 
1367  IsSimpleKeyAllowed = false;
1368 
1369  return true;
1370 }
1371 
1372 bool Scanner::scanAliasOrAnchor(bool IsAlias) {
1373  StringRef::iterator Start = Current;
1374  unsigned ColStart = Column;
1375  skip(1);
1376  while(true) {
1377  if ( *Current == '[' || *Current == ']'
1378  || *Current == '{' || *Current == '}'
1379  || *Current == ','
1380  || *Current == ':')
1381  break;
1382  StringRef::iterator i = skip_ns_char(Current);
1383  if (i == Current)
1384  break;
1385  Current = i;
1386  ++Column;
1387  }
1388 
1389  if (Start == Current) {
1390  setError("Got empty alias or anchor", Start);
1391  return false;
1392  }
1393 
1394  Token T;
1395  T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor;
1396  T.Range = StringRef(Start, Current - Start);
1397  TokenQueue.push_back(T);
1398 
1399  // Alias and anchors can be simple keys.
1400  saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
1401 
1402  IsSimpleKeyAllowed = false;
1403 
1404  return true;
1405 }
1406 
1407 bool Scanner::scanBlockScalar(bool IsLiteral) {
1408  StringRef::iterator Start = Current;
1409  skip(1); // Eat | or >
1410  while(true) {
1411  StringRef::iterator i = skip_nb_char(Current);
1412  if (i == Current) {
1413  if (Column == 0)
1414  break;
1415  i = skip_b_break(Current);
1416  if (i != Current) {
1417  // We got a line break.
1418  Column = 0;
1419  ++Line;
1420  Current = i;
1421  continue;
1422  } else {
1423  // There was an error, which should already have been printed out.
1424  return false;
1425  }
1426  }
1427  Current = i;
1428  ++Column;
1429  }
1430 
1431  if (Start == Current) {
1432  setError("Got empty block scalar", Start);
1433  return false;
1434  }
1435 
1436  Token T;
1437  T.Kind = Token::TK_Scalar;
1438  T.Range = StringRef(Start, Current - Start);
1439  TokenQueue.push_back(T);
1440  return true;
1441 }
1442 
1443 bool Scanner::scanTag() {
1444  StringRef::iterator Start = Current;
1445  unsigned ColStart = Column;
1446  skip(1); // Eat !.
1447  if (Current == End || isBlankOrBreak(Current)); // An empty tag.
1448  else if (*Current == '<') {
1449  skip(1);
1450  scan_ns_uri_char();
1451  if (!consume('>'))
1452  return false;
1453  } else {
1454  // FIXME: Actually parse the c-ns-shorthand-tag rule.
1455  Current = skip_while(&Scanner::skip_ns_char, Current);
1456  }
1457 
1458  Token T;
1459  T.Kind = Token::TK_Tag;
1460  T.Range = StringRef(Start, Current - Start);
1461  TokenQueue.push_back(T);
1462 
1463  // Tags can be simple keys.
1464  saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
1465 
1466  IsSimpleKeyAllowed = false;
1467 
1468  return true;
1469 }
1470 
1471 bool Scanner::fetchMoreTokens() {
1472  if (IsStartOfStream)
1473  return scanStreamStart();
1474 
1475  scanToNextToken();
1476 
1477  if (Current == End)
1478  return scanStreamEnd();
1479 
1480  removeStaleSimpleKeyCandidates();
1481 
1482  unrollIndent(Column);
1483 
1484  if (Column == 0 && *Current == '%')
1485  return scanDirective();
1486 
1487  if (Column == 0 && Current + 4 <= End
1488  && *Current == '-'
1489  && *(Current + 1) == '-'
1490  && *(Current + 2) == '-'
1491  && (Current + 3 == End || isBlankOrBreak(Current + 3)))
1492  return scanDocumentIndicator(true);
1493 
1494  if (Column == 0 && Current + 4 <= End
1495  && *Current == '.'
1496  && *(Current + 1) == '.'
1497  && *(Current + 2) == '.'
1498  && (Current + 3 == End || isBlankOrBreak(Current + 3)))
1499  return scanDocumentIndicator(false);
1500 
1501  if (*Current == '[')
1502  return scanFlowCollectionStart(true);
1503 
1504  if (*Current == '{')
1505  return scanFlowCollectionStart(false);
1506 
1507  if (*Current == ']')
1508  return scanFlowCollectionEnd(true);
1509 
1510  if (*Current == '}')
1511  return scanFlowCollectionEnd(false);
1512 
1513  if (*Current == ',')
1514  return scanFlowEntry();
1515 
1516  if (*Current == '-' && isBlankOrBreak(Current + 1))
1517  return scanBlockEntry();
1518 
1519  if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1)))
1520  return scanKey();
1521 
1522  if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1)))
1523  return scanValue();
1524 
1525  if (*Current == '*')
1526  return scanAliasOrAnchor(true);
1527 
1528  if (*Current == '&')
1529  return scanAliasOrAnchor(false);
1530 
1531  if (*Current == '!')
1532  return scanTag();
1533 
1534  if (*Current == '|' && !FlowLevel)
1535  return scanBlockScalar(true);
1536 
1537  if (*Current == '>' && !FlowLevel)
1538  return scanBlockScalar(false);
1539 
1540  if (*Current == '\'')
1541  return scanFlowScalar(false);
1542 
1543  if (*Current == '"')
1544  return scanFlowScalar(true);
1545 
1546  // Get a plain scalar.
1547  StringRef FirstChar(Current, 1);
1548  if (!(isBlankOrBreak(Current)
1549  || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos)
1550  || (*Current == '-' && !isBlankOrBreak(Current + 1))
1551  || (!FlowLevel && (*Current == '?' || *Current == ':')
1552  && isBlankOrBreak(Current + 1))
1553  || (!FlowLevel && *Current == ':'
1554  && Current + 2 < End
1555  && *(Current + 1) == ':'
1556  && !isBlankOrBreak(Current + 2)))
1557  return scanPlainScalar();
1558 
1559  setError("Unrecognized character while tokenizing.");
1560  return false;
1561 }
1562 
1564  : scanner(new Scanner(Input, SM))
1565  , CurrentDoc(0) {}
1566 
1568  : scanner(new Scanner(InputBuffer, SM))
1569  , CurrentDoc(0) {}
1570 
1572 
1573 bool Stream::failed() { return scanner->failed(); }
1574 
1575 void Stream::printError(Node *N, const Twine &Msg) {
1576  SmallVector<SMRange, 1> Ranges;
1577  Ranges.push_back(N->getSourceRange());
1578  scanner->printError( N->getSourceRange().Start
1580  , Msg
1581  , Ranges);
1582 }
1583 
1585  if (CurrentDoc)
1586  report_fatal_error("Can only iterate over the stream once");
1587 
1588  // Skip Stream-Start.
1589  scanner->getNext();
1590 
1591  CurrentDoc.reset(new Document(*this));
1592  return document_iterator(CurrentDoc);
1593 }
1594 
1596  return document_iterator();
1597 }
1598 
1600  for (document_iterator i = begin(), e = end(); i != e; ++i)
1601  i->skip();
1602 }
1603 
1605  : Doc(D)
1606  , TypeID(Type)
1607  , Anchor(A)
1608  , Tag(T) {
1609  SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin());
1610  SourceRange = SMRange(Start, Start);
1611 }
1612 
1613 std::string Node::getVerbatimTag() const {
1614  StringRef Raw = getRawTag();
1615  if (!Raw.empty() && Raw != "!") {
1616  std::string Ret;
1617  if (Raw.find_last_of('!') == 0) {
1618  Ret = Doc->getTagMap().find("!")->second;
1619  Ret += Raw.substr(1);
1620  return llvm_move(Ret);
1621  } else if (Raw.startswith("!!")) {
1622  Ret = Doc->getTagMap().find("!!")->second;
1623  Ret += Raw.substr(2);
1624  return llvm_move(Ret);
1625  } else {
1626  StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1);
1627  std::map<StringRef, StringRef>::const_iterator It =
1628  Doc->getTagMap().find(TagHandle);
1629  if (It != Doc->getTagMap().end())
1630  Ret = It->second;
1631  else {
1632  Token T;
1633  T.Kind = Token::TK_Tag;
1634  T.Range = TagHandle;
1635  setError(Twine("Unknown tag handle ") + TagHandle, T);
1636  }
1637  Ret += Raw.substr(Raw.find_last_of('!') + 1);
1638  return llvm_move(Ret);
1639  }
1640  }
1641 
1642  switch (getType()) {
1643  case NK_Null:
1644  return "tag:yaml.org,2002:null";
1645  case NK_Scalar:
1646  // TODO: Tag resolution.
1647  return "tag:yaml.org,2002:str";
1648  case NK_Mapping:
1649  return "tag:yaml.org,2002:map";
1650  case NK_Sequence:
1651  return "tag:yaml.org,2002:seq";
1652  }
1653 
1654  return "";
1655 }
1656 
1657 Token &Node::peekNext() {
1658  return Doc->peekNext();
1659 }
1660 
1661 Token Node::getNext() {
1662  return Doc->getNext();
1663 }
1664 
1665 Node *Node::parseBlockNode() {
1666  return Doc->parseBlockNode();
1667 }
1668 
1669 BumpPtrAllocator &Node::getAllocator() {
1670  return Doc->NodeAllocator;
1671 }
1672 
1673 void Node::setError(const Twine &Msg, Token &Tok) const {
1674  Doc->setError(Msg, Tok);
1675 }
1676 
1677 bool Node::failed() const {
1678  return Doc->failed();
1679 }
1680 
1681 
1682 
1683 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
1684  // TODO: Handle newlines properly. We need to remove leading whitespace.
1685  if (Value[0] == '"') { // Double quoted.
1686  // Pull off the leading and trailing "s.
1687  StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
1688  // Search for characters that would require unescaping the value.
1689  StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
1690  if (i != StringRef::npos)
1691  return unescapeDoubleQuoted(UnquotedValue, i, Storage);
1692  return UnquotedValue;
1693  } else if (Value[0] == '\'') { // Single quoted.
1694  // Pull off the leading and trailing 's.
1695  StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
1696  StringRef::size_type i = UnquotedValue.find('\'');
1697  if (i != StringRef::npos) {
1698  // We're going to need Storage.
1699  Storage.clear();
1700  Storage.reserve(UnquotedValue.size());
1701  for (; i != StringRef::npos; i = UnquotedValue.find('\'')) {
1702  StringRef Valid(UnquotedValue.begin(), i);
1703  Storage.insert(Storage.end(), Valid.begin(), Valid.end());
1704  Storage.push_back('\'');
1705  UnquotedValue = UnquotedValue.substr(i + 2);
1706  }
1707  Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
1708  return StringRef(Storage.begin(), Storage.size());
1709  }
1710  return UnquotedValue;
1711  }
1712  // Plain or block.
1713  return Value.rtrim(" ");
1714 }
1715 
1716 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
1718  , SmallVectorImpl<char> &Storage)
1719  const {
1720  // Use Storage to build proper value.
1721  Storage.clear();
1722  Storage.reserve(UnquotedValue.size());
1723  for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) {
1724  // Insert all previous chars into Storage.
1725  StringRef Valid(UnquotedValue.begin(), i);
1726  Storage.insert(Storage.end(), Valid.begin(), Valid.end());
1727  // Chop off inserted chars.
1728  UnquotedValue = UnquotedValue.substr(i);
1729 
1730  assert(!UnquotedValue.empty() && "Can't be empty!");
1731 
1732  // Parse escape or line break.
1733  switch (UnquotedValue[0]) {
1734  case '\r':
1735  case '\n':
1736  Storage.push_back('\n');
1737  if ( UnquotedValue.size() > 1
1738  && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
1739  UnquotedValue = UnquotedValue.substr(1);
1740  UnquotedValue = UnquotedValue.substr(1);
1741  break;
1742  default:
1743  if (UnquotedValue.size() == 1)
1744  // TODO: Report error.
1745  break;
1746  UnquotedValue = UnquotedValue.substr(1);
1747  switch (UnquotedValue[0]) {
1748  default: {
1749  Token T;
1750  T.Range = StringRef(UnquotedValue.begin(), 1);
1751  setError("Unrecognized escape code!", T);
1752  return "";
1753  }
1754  case '\r':
1755  case '\n':
1756  // Remove the new line.
1757  if ( UnquotedValue.size() > 1
1758  && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
1759  UnquotedValue = UnquotedValue.substr(1);
1760  // If this was just a single byte newline, it will get skipped
1761  // below.
1762  break;
1763  case '0':
1764  Storage.push_back(0x00);
1765  break;
1766  case 'a':
1767  Storage.push_back(0x07);
1768  break;
1769  case 'b':
1770  Storage.push_back(0x08);
1771  break;
1772  case 't':
1773  case 0x09:
1774  Storage.push_back(0x09);
1775  break;
1776  case 'n':
1777  Storage.push_back(0x0A);
1778  break;
1779  case 'v':
1780  Storage.push_back(0x0B);
1781  break;
1782  case 'f':
1783  Storage.push_back(0x0C);
1784  break;
1785  case 'r':
1786  Storage.push_back(0x0D);
1787  break;
1788  case 'e':
1789  Storage.push_back(0x1B);
1790  break;
1791  case ' ':
1792  Storage.push_back(0x20);
1793  break;
1794  case '"':
1795  Storage.push_back(0x22);
1796  break;
1797  case '/':
1798  Storage.push_back(0x2F);
1799  break;
1800  case '\\':
1801  Storage.push_back(0x5C);
1802  break;
1803  case 'N':
1804  encodeUTF8(0x85, Storage);
1805  break;
1806  case '_':
1807  encodeUTF8(0xA0, Storage);
1808  break;
1809  case 'L':
1810  encodeUTF8(0x2028, Storage);
1811  break;
1812  case 'P':
1813  encodeUTF8(0x2029, Storage);
1814  break;
1815  case 'x': {
1816  if (UnquotedValue.size() < 3)
1817  // TODO: Report error.
1818  break;
1819  unsigned int UnicodeScalarValue;
1820  if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
1821  // TODO: Report error.
1822  UnicodeScalarValue = 0xFFFD;
1823  encodeUTF8(UnicodeScalarValue, Storage);
1824  UnquotedValue = UnquotedValue.substr(2);
1825  break;
1826  }
1827  case 'u': {
1828  if (UnquotedValue.size() < 5)
1829  // TODO: Report error.
1830  break;
1831  unsigned int UnicodeScalarValue;
1832  if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
1833  // TODO: Report error.
1834  UnicodeScalarValue = 0xFFFD;
1835  encodeUTF8(UnicodeScalarValue, Storage);
1836  UnquotedValue = UnquotedValue.substr(4);
1837  break;
1838  }
1839  case 'U': {
1840  if (UnquotedValue.size() < 9)
1841  // TODO: Report error.
1842  break;
1843  unsigned int UnicodeScalarValue;
1844  if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue))
1845  // TODO: Report error.
1846  UnicodeScalarValue = 0xFFFD;
1847  encodeUTF8(UnicodeScalarValue, Storage);
1848  UnquotedValue = UnquotedValue.substr(8);
1849  break;
1850  }
1851  }
1852  UnquotedValue = UnquotedValue.substr(1);
1853  }
1854  }
1855  Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
1856  return StringRef(Storage.begin(), Storage.size());
1857 }
1858 
1859 Node *KeyValueNode::getKey() {
1860  if (Key)
1861  return Key;
1862  // Handle implicit null keys.
1863  {
1864  Token &t = peekNext();
1865  if ( t.Kind == Token::TK_BlockEnd
1866  || t.Kind == Token::TK_Value
1867  || t.Kind == Token::TK_Error) {
1868  return Key = new (getAllocator()) NullNode(Doc);
1869  }
1870  if (t.Kind == Token::TK_Key)
1871  getNext(); // skip TK_Key.
1872  }
1873 
1874  // Handle explicit null keys.
1875  Token &t = peekNext();
1876  if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) {
1877  return Key = new (getAllocator()) NullNode(Doc);
1878  }
1879 
1880  // We've got a normal key.
1881  return Key = parseBlockNode();
1882 }
1883 
1884 Node *KeyValueNode::getValue() {
1885  if (Value)
1886  return Value;
1887  getKey()->skip();
1888  if (failed())
1889  return Value = new (getAllocator()) NullNode(Doc);
1890 
1891  // Handle implicit null values.
1892  {
1893  Token &t = peekNext();
1894  if ( t.Kind == Token::TK_BlockEnd
1895  || t.Kind == Token::TK_FlowMappingEnd
1896  || t.Kind == Token::TK_Key
1897  || t.Kind == Token::TK_FlowEntry
1898  || t.Kind == Token::TK_Error) {
1899  return Value = new (getAllocator()) NullNode(Doc);
1900  }
1901 
1902  if (t.Kind != Token::TK_Value) {
1903  setError("Unexpected token in Key Value.", t);
1904  return Value = new (getAllocator()) NullNode(Doc);
1905  }
1906  getNext(); // skip TK_Value.
1907  }
1908 
1909  // Handle explicit null values.
1910  Token &t = peekNext();
1911  if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) {
1912  return Value = new (getAllocator()) NullNode(Doc);
1913  }
1914 
1915  // We got a normal value.
1916  return Value = parseBlockNode();
1917 }
1918 
1919 void MappingNode::increment() {
1920  if (failed()) {
1921  IsAtEnd = true;
1922  CurrentEntry = 0;
1923  return;
1924  }
1925  if (CurrentEntry) {
1926  CurrentEntry->skip();
1927  if (Type == MT_Inline) {
1928  IsAtEnd = true;
1929  CurrentEntry = 0;
1930  return;
1931  }
1932  }
1933  Token T = peekNext();
1934  if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) {
1935  // KeyValueNode eats the TK_Key. That way it can detect null keys.
1936  CurrentEntry = new (getAllocator()) KeyValueNode(Doc);
1937  } else if (Type == MT_Block) {
1938  switch (T.Kind) {
1939  case Token::TK_BlockEnd:
1940  getNext();
1941  IsAtEnd = true;
1942  CurrentEntry = 0;
1943  break;
1944  default:
1945  setError("Unexpected token. Expected Key or Block End", T);
1946  case Token::TK_Error:
1947  IsAtEnd = true;
1948  CurrentEntry = 0;
1949  }
1950  } else {
1951  switch (T.Kind) {
1952  case Token::TK_FlowEntry:
1953  // Eat the flow entry and recurse.
1954  getNext();
1955  return increment();
1956  case Token::TK_FlowMappingEnd:
1957  getNext();
1958  case Token::TK_Error:
1959  // Set this to end iterator.
1960  IsAtEnd = true;
1961  CurrentEntry = 0;
1962  break;
1963  default:
1964  setError( "Unexpected token. Expected Key, Flow Entry, or Flow "
1965  "Mapping End."
1966  , T);
1967  IsAtEnd = true;
1968  CurrentEntry = 0;
1969  }
1970  }
1971 }
1972 
1973 void SequenceNode::increment() {
1974  if (failed()) {
1975  IsAtEnd = true;
1976  CurrentEntry = 0;
1977  return;
1978  }
1979  if (CurrentEntry)
1980  CurrentEntry->skip();
1981  Token T = peekNext();
1982  if (SeqType == ST_Block) {
1983  switch (T.Kind) {
1984  case Token::TK_BlockEntry:
1985  getNext();
1986  CurrentEntry = parseBlockNode();
1987  if (CurrentEntry == 0) { // An error occurred.
1988  IsAtEnd = true;
1989  CurrentEntry = 0;
1990  }
1991  break;
1992  case Token::TK_BlockEnd:
1993  getNext();
1994  IsAtEnd = true;
1995  CurrentEntry = 0;
1996  break;
1997  default:
1998  setError( "Unexpected token. Expected Block Entry or Block End."
1999  , T);
2000  case Token::TK_Error:
2001  IsAtEnd = true;
2002  CurrentEntry = 0;
2003  }
2004  } else if (SeqType == ST_Indentless) {
2005  switch (T.Kind) {
2006  case Token::TK_BlockEntry:
2007  getNext();
2008  CurrentEntry = parseBlockNode();
2009  if (CurrentEntry == 0) { // An error occurred.
2010  IsAtEnd = true;
2011  CurrentEntry = 0;
2012  }
2013  break;
2014  default:
2015  case Token::TK_Error:
2016  IsAtEnd = true;
2017  CurrentEntry = 0;
2018  }
2019  } else if (SeqType == ST_Flow) {
2020  switch (T.Kind) {
2021  case Token::TK_FlowEntry:
2022  // Eat the flow entry and recurse.
2023  getNext();
2024  WasPreviousTokenFlowEntry = true;
2025  return increment();
2026  case Token::TK_FlowSequenceEnd:
2027  getNext();
2028  case Token::TK_Error:
2029  // Set this to end iterator.
2030  IsAtEnd = true;
2031  CurrentEntry = 0;
2032  break;
2033  case Token::TK_StreamEnd:
2034  case Token::TK_DocumentEnd:
2035  case Token::TK_DocumentStart:
2036  setError("Could not find closing ]!", T);
2037  // Set this to end iterator.
2038  IsAtEnd = true;
2039  CurrentEntry = 0;
2040  break;
2041  default:
2042  if (!WasPreviousTokenFlowEntry) {
2043  setError("Expected , between entries!", T);
2044  IsAtEnd = true;
2045  CurrentEntry = 0;
2046  break;
2047  }
2048  // Otherwise it must be a flow entry.
2049  CurrentEntry = parseBlockNode();
2050  if (!CurrentEntry) {
2051  IsAtEnd = true;
2052  }
2053  WasPreviousTokenFlowEntry = false;
2054  break;
2055  }
2056  }
2057 }
2058 
2059 Document::Document(Stream &S) : stream(S), Root(0) {
2060  // Tag maps starts with two default mappings.
2061  TagMap["!"] = "!";
2062  TagMap["!!"] = "tag:yaml.org,2002:";
2063 
2064  if (parseDirectives())
2065  expectToken(Token::TK_DocumentStart);
2066  Token &T = peekNext();
2067  if (T.Kind == Token::TK_DocumentStart)
2068  getNext();
2069 }
2070 
2072  if (stream.scanner->failed())
2073  return false;
2074  if (!Root)
2075  getRoot();
2076  Root->skip();
2077  Token &T = peekNext();
2078  if (T.Kind == Token::TK_StreamEnd)
2079  return false;
2080  if (T.Kind == Token::TK_DocumentEnd) {
2081  getNext();
2082  return skip();
2083  }
2084  return true;
2085 }
2086 
2087 Token &Document::peekNext() {
2088  return stream.scanner->peekNext();
2089 }
2090 
2091 Token Document::getNext() {
2092  return stream.scanner->getNext();
2093 }
2094 
2095 void Document::setError(const Twine &Message, Token &Location) const {
2096  stream.scanner->setError(Message, Location.Range.begin());
2097 }
2098 
2099 bool Document::failed() const {
2100  return stream.scanner->failed();
2101 }
2102 
2104  Token T = peekNext();
2105  // Handle properties.
2106  Token AnchorInfo;
2107  Token TagInfo;
2108 parse_property:
2109  switch (T.Kind) {
2110  case Token::TK_Alias:
2111  getNext();
2112  return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1));
2113  case Token::TK_Anchor:
2114  if (AnchorInfo.Kind == Token::TK_Anchor) {
2115  setError("Already encountered an anchor for this node!", T);
2116  return 0;
2117  }
2118  AnchorInfo = getNext(); // Consume TK_Anchor.
2119  T = peekNext();
2120  goto parse_property;
2121  case Token::TK_Tag:
2122  if (TagInfo.Kind == Token::TK_Tag) {
2123  setError("Already encountered a tag for this node!", T);
2124  return 0;
2125  }
2126  TagInfo = getNext(); // Consume TK_Tag.
2127  T = peekNext();
2128  goto parse_property;
2129  default:
2130  break;
2131  }
2132 
2133  switch (T.Kind) {
2134  case Token::TK_BlockEntry:
2135  // We got an unindented BlockEntry sequence. This is not terminated with
2136  // a BlockEnd.
2137  // Don't eat the TK_BlockEntry, SequenceNode needs it.
2138  return new (NodeAllocator) SequenceNode( stream.CurrentDoc
2139  , AnchorInfo.Range.substr(1)
2140  , TagInfo.Range
2143  getNext();
2144  return new (NodeAllocator)
2145  SequenceNode( stream.CurrentDoc
2146  , AnchorInfo.Range.substr(1)
2147  , TagInfo.Range
2150  getNext();
2151  return new (NodeAllocator)
2152  MappingNode( stream.CurrentDoc
2153  , AnchorInfo.Range.substr(1)
2154  , TagInfo.Range
2157  getNext();
2158  return new (NodeAllocator)
2159  SequenceNode( stream.CurrentDoc
2160  , AnchorInfo.Range.substr(1)
2161  , TagInfo.Range
2164  getNext();
2165  return new (NodeAllocator)
2166  MappingNode( stream.CurrentDoc
2167  , AnchorInfo.Range.substr(1)
2168  , TagInfo.Range
2170  case Token::TK_Scalar:
2171  getNext();
2172  return new (NodeAllocator)
2173  ScalarNode( stream.CurrentDoc
2174  , AnchorInfo.Range.substr(1)
2175  , TagInfo.Range
2176  , T.Range);
2177  case Token::TK_Key:
2178  // Don't eat the TK_Key, KeyValueNode expects it.
2179  return new (NodeAllocator)
2180  MappingNode( stream.CurrentDoc
2181  , AnchorInfo.Range.substr(1)
2182  , TagInfo.Range
2185  case Token::TK_DocumentEnd:
2186  case Token::TK_StreamEnd:
2187  default:
2188  // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not
2189  // !!null null.
2190  return new (NodeAllocator) NullNode(stream.CurrentDoc);
2191  case Token::TK_Error:
2192  return 0;
2193  }
2194  llvm_unreachable("Control flow shouldn't reach here.");
2195  return 0;
2196 }
2197 
2198 bool Document::parseDirectives() {
2199  bool isDirective = false;
2200  while (true) {
2201  Token T = peekNext();
2202  if (T.Kind == Token::TK_TagDirective) {
2203  parseTAGDirective();
2204  isDirective = true;
2205  } else if (T.Kind == Token::TK_VersionDirective) {
2206  parseYAMLDirective();
2207  isDirective = true;
2208  } else
2209  break;
2210  }
2211  return isDirective;
2212 }
2213 
2214 void Document::parseYAMLDirective() {
2215  getNext(); // Eat %YAML <version>
2216 }
2217 
2218 void Document::parseTAGDirective() {
2219  Token Tag = getNext(); // %TAG <handle> <prefix>
2220  StringRef T = Tag.Range;
2221  // Strip %TAG
2222  T = T.substr(T.find_first_of(" \t")).ltrim(" \t");
2223  std::size_t HandleEnd = T.find_first_of(" \t");
2224  StringRef TagHandle = T.substr(0, HandleEnd);
2225  StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t");
2226  TagMap[TagHandle] = TagPrefix;
2227 }
2228 
2229 bool Document::expectToken(int TK) {
2230  Token T = getNext();
2231  if (T.Kind != TK) {
2232  setError("Unexpected token", T);
2233  return false;
2234  }
2235  return true;
2236 }
static MemoryBuffer * getMemBuffer(StringRef InputData, StringRef BufferName="", bool RequiresNullTerminator=true)
static void destroySentinel(Token *)
Definition: YAMLParser.cpp:150
Node(unsigned int Type, OwningPtr< Document > &, StringRef Anchor, StringRef Tag)
void push_back(const T &Elt)
Definition: SmallVector.h:236
OwningPtr< Document > & Doc
Definition: YAMLParser.h:160
void reserve(unsigned N)
Definition: SmallVector.h:425
std::string getVerbatimTag() const
Get the verbatim tag for a given Node. This performs tag resolution and substitution.
iplist< Token >::iterator iterator
Definition: ilist.h:642
size_t size() const
size - Get the string size.
Definition: StringRef.h:113
const char * getBufferStart() const
Definition: MemoryBuffer.h:51
Not a valid Unicode encoding.
Definition: YAMLParser.cpp:34
bool scanTokens(StringRef Input)
Scans all tokens in input without outputting anything. This is used for benchmarking the tokenizer...
Definition: YAMLParser.cpp:629
size_t find(char C, size_t From=0) const
Definition: StringRef.h:233
StringRef getRawTag() const
Get the tag as it was written in the document. This does not perform tag resolution.
Definition: YAMLParser.h:128
Represents a YAML sequence created from either a block sequence or a flow sequence.
Definition: YAMLParser.h:398
static LLVM_ATTRIBUTE_NOINLINE bool wasEscaped(StringRef::iterator First, StringRef::iterator Position)
StringRef substr(size_t Start, size_t N=npos) const
Definition: StringRef.h:392
iterator insert(iterator I, const T &Elt)
Definition: SmallVector.h:537
UTF-8 or ascii.
Definition: YAMLParser.cpp:33
Node * parseBlockNode()
FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys=None)
Definition: Function.cpp:657
std::pair< uint32_t, unsigned > UTF8Decoded
The Unicode scalar value of a UTF-8 minimal well-formed code unit subsequence and the subsequence's l...
Definition: YAMLParser.cpp:205
iterator begin()
Definition: ilist.h:359
document_iterator begin()
Represents an alias to a Node with an anchor.
Definition: YAMLParser.h:454
void skip(CollectionType &C)
Definition: YAMLParser.h:332
UTF-32 Little Endian.
Definition: YAMLParser.cpp:29
document_iterator end()
LLVM_ATTRIBUTE_NORETURN void report_fatal_error(const char *reason, bool gen_crash_diag=true)
void setError(const Twine &Message, Token &Location) const
SMLoc Start
Definition: SMLoc.h:49
static std::string utohexstr(uint64_t X)
Definition: StringExtras.h:67
#define llvm_move(value)
Definition: Compiler.h:108
static bool is_ns_hex_digit(const char C)
Definition: YAMLParser.cpp:844
T LLVM_ATTRIBUTE_UNUSED_RESULT pop_back_val()
Definition: SmallVector.h:430
bool failed()
Returns true if an error occurred while parsing.
Definition: YAMLParser.cpp:292
#define llvm_unreachable(msg)
TypeID
Definition: Type.h:53
Token * ensureHead(Token *) const
Definition: YAMLParser.cpp:153
static EncodingInfo getUnicodeEncoding(StringRef Input)
Definition: YAMLParser.cpp:47
#define false
Definition: ConvertUTF.c:64
void clear()
Definition: ilist.h:550
static void noteHead(Token *, Token *)
Definition: YAMLParser.cpp:154
virtual void skip()
Definition: YAMLParser.h:145
A key and value pair. While not technically a Node under the YAML representation graph, it is easier to treat them this way.
Definition: YAMLParser.h:236
bool LLVM_ATTRIBUTE_UNUSED_RESULT empty() const
Definition: SmallVector.h:56
Node * getRoot()
Parse and return the root level node.
Definition: YAMLParser.h:485
#define T
iterator begin() const
Definition: StringRef.h:97
bool dumpTokens(StringRef Input, raw_ostream &)
Dump all the tokens in this stream to OS.
Definition: YAMLParser.cpp:548
#define true
Definition: ConvertUTF.c:65
void printError(Node *N, const Twine &Msg)
Stream(StringRef Input, SourceMgr &)
This keeps a reference to the string referenced by Input.
iterator insert(iterator where, const NodeTy &val)
Definition: ilist.h:664
Token * createNode(const Token &V)
Definition: YAMLParser.cpp:162
std::string escape(StringRef Input)
Escape Input for a double quoted scalar.
Definition: YAMLParser.cpp:642
void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, ArrayRef< SMRange > Ranges=None)
Definition: YAMLParser.cpp:271
Token getNext()
Parse the next token and pop it from the queue.
Definition: YAMLParser.cpp:762
enable_if_c< std::numeric_limits< T >::is_signed, bool >::type getAsInteger(unsigned Radix, T &Result) const
Definition: StringRef.h:337
iterator erase(iterator I)
Definition: SmallVector.h:478
Scanner(const StringRef Input, SourceMgr &SM)
Definition: YAMLParser.cpp:705
A scalar node is an opaque datum that can be presented as a series of zero or more Unicode scalar val...
Definition: YAMLParser.h:194
UTF-16 Little Endian.
Definition: YAMLParser.cpp:31
A null value.
Definition: YAMLParser.h:178
void setError(const Twine &Message, StringRef::iterator Position)
Definition: YAMLParser.cpp:276
const char * iterator
Definition: StringRef.h:43
size_t find_last_of(char C, size_t From=npos) const
Definition: StringRef.h:291
void transferNodesFromList(ilist_node_traits &, ilist_iterator< Token >, ilist_iterator< Token >)
Definition: YAMLParser.cpp:169
bool startswith(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:208
bool skip()
Finish parsing the current document and return true if there are more. Return false otherwise...
enum llvm::yaml::Token::TokenKind Kind
Token & peekNext()
Parse the next token and return it without popping it.
Definition: YAMLParser.cpp:735
Token & peekNext()
This class represents a YAML stream potentially containing multiple documents.
Definition: YAMLParser.h:79
#define LLVM_ATTRIBUTE_NOINLINE
Definition: Compiler.h:254
bool LLVM_ATTRIBUTE_UNUSED_RESULT empty() const
Definition: ilist.h:385
ilist< Token > TokenQueueT
Definition: YAMLParser.cpp:177
static void encodeUTF8(uint32_t UnicodeScalarValue, SmallVectorImpl< char > &Result)
encodeUTF8 - Encode UnicodeScalarValue in UTF-8 and append it to result.
Definition: YAMLParser.cpp:520
void setError(const Twine &Message)
Definition: YAMLParser.cpp:287
static UTF8Decoded decodeUTF8(StringRef Range)
Definition: YAMLParser.cpp:207
reference front()
Definition: ilist.h:390
static SMLoc getFromPointer(const char *Ptr)
Definition: SMLoc.h:35
SMRange getSourceRange() const
Definition: YAMLParser.h:134
static const size_t npos
Definition: StringRef.h:45
size_t size_type
Definition: StringRef.h:46
#define I(x, y, z)
Definition: MD5.cpp:54
#define N
Token - A single YAML token.
Definition: YAMLParser.cpp:109
Represents a YAML map created from either a block map or a flow map.
Definition: YAMLParser.h:348
static void deleteNode(Token *V)
Definition: YAMLParser.cpp:165
size_t find_first_of(char C, size_t From=0) const
Definition: StringRef.h:269
reference back()
Definition: ilist.h:398
const char * getBufferEnd() const
Definition: MemoryBuffer.h:52
UTF-16 Big Endian.
Definition: YAMLParser.cpp:32
Scans YAML tokens from a MemoryBuffer.
Definition: YAMLParser.cpp:260
std::pair< UnicodeEncodingForm, unsigned > EncodingInfo
Definition: YAMLParser.cpp:39
Iterator abstraction for Documents over a Stream.
Definition: YAMLParser.h:532
iterator end()
Definition: ilist.h:367
SMRange SourceRange
Definition: YAMLParser.h:161
LLVM Value Representation.
Definition: Value.h:66
void pop_front()
Definition: ilist.h:555
friend class Document
Definition: YAMLParser.h:103
iterator end() const
Definition: StringRef.h:99
bool operator==(uint64_t V1, const APInt &V2)
Definition: APInt.h:1684
void PrintMessage(raw_ostream &OS, SMLoc Loc, DiagKind Kind, const Twine &Msg, ArrayRef< SMRange > Ranges=None, ArrayRef< SMFixIt > FixIts=None, bool ShowColors=true) const
Definition: SourceMgr.cpp:214
Represents a location in source code.
Definition: SMLoc.h:23
size_t AddNewSourceBuffer(MemoryBuffer *F, SMLoc IncludeLoc)
Definition: SourceMgr.h:112
An inline mapping node is used for "[key: value]".
Definition: YAMLParser.h:354
UTF-32 Big Endian.
Definition: YAMLParser.cpp:30
Node * parseBlockNode()
Root for parsing a node. Returns a single node.
StringRef ltrim(StringRef Chars=" \t\n\v\f\r") const
Definition: StringRef.h:498
void push_back(const NodeTy &val)
Definition: ilist.h:671
UnicodeEncodingForm
Definition: YAMLParser.cpp:28
bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:110
Abstract base class for all Nodes.
Definition: YAMLParser.h:107
static bool is_ns_word_char(const char C)
Definition: YAMLParser.cpp:850