LLVM API Documentation

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
AsmLexer.cpp
Go to the documentation of this file.
1 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This class implements the lexer for assembly files.
11 //
12 //===----------------------------------------------------------------------===//
13 
15 #include "llvm/MC/MCAsmInfo.h"
17 #include "llvm/Support/SMLoc.h"
18 #include <cctype>
19 #include <cerrno>
20 #include <cstdio>
21 #include <cstdlib>
22 using namespace llvm;
23 
24 AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI) {
25  CurBuf = NULL;
26  CurPtr = NULL;
27  isAtStartOfLine = true;
28 }
29 
// NOTE(review): listing line 30 (the line that opens this definition) is absent
// from this extract. Given its position directly after the constructor it is
// presumably `AsmLexer::~AsmLexer() {` -- confirm against the original file.
31 }
32 
33 void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) {
34  CurBuf = buf;
35 
36  if (ptr)
37  CurPtr = ptr;
38  else
39  CurPtr = CurBuf->getBufferStart();
40 
41  TokStart = 0;
42 }
43 
44 /// ReturnError - Set the error to the specified string at the specified
45 /// location. This is defined to always return AsmToken::Error.
///
/// \param Loc  Position the diagnostic refers to; the returned token is a
///             zero-length StringRef anchored at this pointer.
/// \param Msg  Diagnostic text.
/// \returns an AsmToken of kind AsmToken::Error.
//
// NOTE(review): listing line 47 is absent from this extract, so the statement
// that actually records Msg is not visible here. The cross-reference index at
// the end of the listing mentions SetError(const SMLoc &, const std::string &)
// and SMLoc::getFromPointer(const char *); presumably line 47 calls them --
// confirm against the original AsmLexer.cpp.
46 AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
48 
49  return AsmToken(AsmToken::Error, StringRef(Loc, 0));
50 }
51 
52 int AsmLexer::getNextChar() {
53  char CurChar = *CurPtr++;
54  switch (CurChar) {
55  default:
56  return (unsigned char)CurChar;
57  case 0:
58  // A nul character in the stream is either the end of the current buffer or
59  // a random nul in the file. Disambiguate that here.
60  if (CurPtr-1 != CurBuf->getBufferEnd())
61  return 0; // Just whitespace.
62 
63  // Otherwise, return end of file.
64  --CurPtr; // Another call to lex will return EOF again.
65  return EOF;
66  }
67 }
68 
69 /// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)?
70 ///
71 /// The leading integral digit sequence and dot should have already been
72 /// consumed, some or all of the fractional digit sequence *can* have been
73 /// consumed.
74 AsmToken AsmLexer::LexFloatLiteral() {
75  // Skip the fractional digit sequence.
76  while (isdigit(*CurPtr))
77  ++CurPtr;
78 
79  // Check for exponent; we intentionally accept a slighlty wider set of
80  // literals here and rely on the upstream client to reject invalid ones (e.g.,
81  // "1e+").
82  if (*CurPtr == 'e' || *CurPtr == 'E') {
83  ++CurPtr;
84  if (*CurPtr == '-' || *CurPtr == '+')
85  ++CurPtr;
86  while (isdigit(*CurPtr))
87  ++CurPtr;
88  }
89 
90  return AsmToken(AsmToken::Real,
91  StringRef(TokStart, CurPtr - TokStart));
92 }
93 
94 /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
95 /// while making sure there are enough actual digits around for the constant to
96 /// be valid.
97 ///
98 /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
99 /// before we get here.
100 AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
101  assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
102  "unexpected parse state in floating hex");
103  bool NoFracDigits = true;
104 
105  // Skip the fractional part if there is one
106  if (*CurPtr == '.') {
107  ++CurPtr;
108 
109  const char *FracStart = CurPtr;
110  while (isxdigit(*CurPtr))
111  ++CurPtr;
112 
113  NoFracDigits = CurPtr == FracStart;
114  }
115 
116  if (NoIntDigits && NoFracDigits)
117  return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
118  "expected at least one significand digit");
119 
120  // Make sure we do have some kind of proper exponent part
121  if (*CurPtr != 'p' && *CurPtr != 'P')
122  return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
123  "expected exponent part 'p'");
124  ++CurPtr;
125 
126  if (*CurPtr == '+' || *CurPtr == '-')
127  ++CurPtr;
128 
129  // N.b. exponent digits are *not* hex
130  const char *ExpStart = CurPtr;
131  while (isdigit(*CurPtr))
132  ++CurPtr;
133 
134  if (CurPtr == ExpStart)
135  return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
136  "expected at least one exponent digit");
137 
138  return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
139 }
140 
/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]*
///
/// IsIdentifierChar - Return true if \p c belongs to the trailing character
/// class of the grammar above, i.e. [a-zA-Z0-9_$.@?].
static bool IsIdentifierChar(char c) {
  // isalnum() is only defined for EOF and values representable as unsigned
  // char; convert first so that bytes >= 0x80 are not passed as negative ints.
  return isalnum(static_cast<unsigned char>(c)) || c == '_' || c == '$' ||
         c == '.' || c == '@' || c == '?';
}
// LexIdentifier - Lex the rest of an identifier whose first character has
// already been consumed. If that first character was '.' and a digit follows,
// the text may instead be a floating literal such as ".5", in which case the
// work is handed to LexFloatLiteral().
//
// NOTE(review): listing lines 160 and 162 are absent from this extract, so the
// body of the lone-'.' special case and the final return statement are not
// visible here. Presumably they return a dedicated token for "." and an
// identifier token spanning TokStart..CurPtr -- confirm against the original
// file.
145 AsmToken AsmLexer::LexIdentifier() {
146  // Check for floating point literals.
147  if (CurPtr[-1] == '.' && isdigit(*CurPtr)) {
148  // Disambiguate a .1243foo identifier from a floating literal.
149  while (isdigit(*CurPtr))
150  ++CurPtr;
    // After the digits: an exponent marker or any non-identifier character
    // means this was a number; otherwise keep lexing it as an identifier.
151  if (*CurPtr == 'e' || *CurPtr == 'E' || !IsIdentifierChar(*CurPtr))
152  return LexFloatLiteral();
153  }
154 
155  while (IsIdentifierChar(*CurPtr))
156  ++CurPtr;
157 
158  // Handle . as a special case.
    // The condition below holds when the whole token is the single character '.'.
159  if (CurPtr == TokStart+1 && TokStart[0] == '.')
161 
163 }
164 
165 /// LexSlash: Slash: /
166 /// C-Style Comment: /* ... */
167 AsmToken AsmLexer::LexSlash() {
168  switch (*CurPtr) {
169  case '*': break; // C style comment.
170  case '/': return ++CurPtr, LexLineComment();
171  default: return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1));
172  }
173 
174  // C Style comment.
175  ++CurPtr; // skip the star.
176  while (1) {
177  int CurChar = getNextChar();
178  switch (CurChar) {
179  case EOF:
180  return ReturnError(TokStart, "unterminated comment");
181  case '*':
182  // End of the comment?
183  if (CurPtr[0] != '/') break;
184 
185  ++CurPtr; // End the */.
186  return LexToken();
187  }
188  }
189 }
190 
191 /// LexLineComment: Comment: #[^\n]*
192 /// : //[^\n]*
193 AsmToken AsmLexer::LexLineComment() {
194  // FIXME: This is broken if we happen to a comment at the end of a file, which
195  // was .included, and which doesn't end with a newline.
196  int CurChar = getNextChar();
197  while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
198  CurChar = getNextChar();
199 
200  if (CurChar == EOF)
201  return AsmToken(AsmToken::Eof, StringRef(CurPtr, 0));
202  return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0));
203 }
204 
/// SkipIgnoredIntegerSuffix - Advance \p CurPtr past an integer type suffix.
///
/// An optional 'U' followed by up to two 'L' characters is consumed, which
/// covers the ULL, UL, U, L and LL suffixes. Only upper-case letters match.
///
/// \param CurPtr  Cursor into nul-terminated text; updated in place.
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  // Each letter is optional but must appear in this order.
  static const char Expected[] = {'U', 'L', 'L'};

  for (unsigned Idx = 0; Idx != sizeof(Expected); ++Idx)
    if (*CurPtr == Expected[Idx])
      ++CurPtr;
}
214 
/// doLookAhead - Look ahead to search for the first non-hex digit; if it is
/// [hH], then we treat the integer as a hexadecimal, possibly with leading
/// zeroes.
///
/// \param CurPtr        Cursor into nul-terminated text; updated in place.
///                      For a hex literal it is left on the [hH] marker.
///                      Otherwise it is left on the first character that is
///                      not a decimal digit.
/// \param DefaultRadix  Radix to report when no [hH] marker is found.
/// \returns 16 when the run of hex digits is followed by [hH], otherwise
/// \p DefaultRadix.
static unsigned doLookAhead(const char *&CurPtr, unsigned DefaultRadix) {
  const char *FirstHex = 0; // First [a-fA-F] seen, if any.
  const char *LookAhead = CurPtr;

  for (;; ++LookAhead) {
    // The <cctype> classifiers are only defined for EOF and unsigned char
    // values, so convert before classifying (bytes >= 0x80 would otherwise be
    // passed as negative ints).
    const unsigned char C = static_cast<unsigned char>(*LookAhead);
    if (isdigit(C))
      continue;
    if (!isxdigit(C))
      break;
    if (!FirstHex)
      FirstHex = LookAhead;
  }

  const bool isHex = *LookAhead == 'h' || *LookAhead == 'H';

  // Without the marker, letters do not belong to the number: back up to the
  // first one so that only the decimal digits are taken.
  CurPtr = (isHex || !FirstHex) ? LookAhead : FirstHex;
  return isHex ? 16 : DefaultRadix;
}
237 
238 /// LexDigit: First character is [0-9].
239 /// Local Label: [0-9][:]
240 /// Forward/Backward Label: [0-9][fb]
241 /// Binary integer: 0b[01]+
242 /// Octal integer: 0[0-7]+
243 /// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
244 /// Decimal integer: [1-9][0-9]*
///
/// The first digit has already been consumed (it is CurPtr[-1]). The function
/// dispatches on that digit and the character after it: anything not starting
/// with '0' (or "0.") takes the decimal/float path, "0b" the binary path, "0x"
/// the hex path, and everything else is octal unless a trailing [hH] makes it
/// hexadecimal.
//
// NOTE(review): listing line 336 is absent from this extract; it is the first
// line of the return statement that ends the "0x" branch, so only the
// continuation "(int64_t)Result);" is visible there.
245 AsmToken AsmLexer::LexDigit() {
246  // Decimal integer: [1-9][0-9]*
247  if (CurPtr[-1] != '0' || CurPtr[0] == '.') {
248  unsigned Radix = doLookAhead(CurPtr, 10);
249  bool isHex = Radix == 16;
250  // Check for floating point literals.
    // NOTE(review): only a lower-case 'e' is tested here, whereas
    // LexFloatLiteral() accepts both 'e' and 'E' -- confirm this is intended.
251  if (!isHex && (*CurPtr == '.' || *CurPtr == 'e')) {
252  ++CurPtr;
253  return LexFloatLiteral();
254  }
255 
256  StringRef Result(TokStart, CurPtr - TokStart);
257 
258  long long Value;
259  if (Result.getAsInteger(Radix, Value)) {
260  // Allow positive values that are too large to fit into a signed 64-bit
261  // integer, but that do fit in an unsigned one, we just convert them over.
262  unsigned long long UValue;
263  if (Result.getAsInteger(Radix, UValue))
264  return ReturnError(TokStart, !isHex ? "invalid decimal number" :
265  "invalid hexdecimal number");
266  Value = (long long)UValue;
267  }
268 
269  // Consume the [bB][hH].
    // NOTE(review): doLookAhead() returns either 16 or the default passed in
    // (10 here), so the Radix == 2 test below cannot be true on this path.
270  if (Radix == 2 || Radix == 16)
271  ++CurPtr;
272 
273  // The darwin/x86 (and x86-64) assembler accepts and ignores type
274  // suffices on integer literals.
275  SkipIgnoredIntegerSuffix(CurPtr);
276 
277  return AsmToken(AsmToken::Integer, Result, Value);
278  }
279 
280  if (*CurPtr == 'b') {
281  ++CurPtr;
282  // See if we actually have "0b" as part of something like "jmp 0b\n"
283  if (!isdigit(CurPtr[0])) {
    // Put the 'b' back: the token is just the integer 0.
284  --CurPtr;
285  StringRef Result(TokStart, CurPtr - TokStart);
286  return AsmToken(AsmToken::Integer, Result, 0);
287  }
288  const char *NumStart = CurPtr;
289  while (CurPtr[0] == '0' || CurPtr[0] == '1')
290  ++CurPtr;
291 
292  // Requires at least one binary digit.
293  if (CurPtr == NumStart)
294  return ReturnError(TokStart, "invalid binary number");
295 
296  StringRef Result(TokStart, CurPtr - TokStart);
297 
298  long long Value;
    // substr(2) drops the leading "0b" before the digits are converted.
299  if (Result.substr(2).getAsInteger(2, Value))
300  return ReturnError(TokStart, "invalid binary number");
301 
302  // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
303  // suffixes on integer literals.
304  SkipIgnoredIntegerSuffix(CurPtr);
305 
306  return AsmToken(AsmToken::Integer, Result, Value);
307  }
308 
309  if (*CurPtr == 'x') {
310  ++CurPtr;
311  const char *NumStart = CurPtr;
312  while (isxdigit(CurPtr[0]))
313  ++CurPtr;
314 
315  // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
316  // diagnosed by LexHexFloatLiteral).
317  if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
318  return LexHexFloatLiteral(NumStart == CurPtr);
319 
320  // Otherwise requires at least one hex digit.
321  if (CurPtr == NumStart)
322  return ReturnError(CurPtr-2, "invalid hexadecimal number");
323 
324  unsigned long long Result;
325  if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
326  return ReturnError(TokStart, "invalid hexadecimal number");
327 
328  // Consume the optional [hH].
329  if (*CurPtr == 'h' || *CurPtr == 'H')
330  ++CurPtr;
331 
332  // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
333  // suffixes on integer literals.
334  SkipIgnoredIntegerSuffix(CurPtr);
335 
    // NOTE(review): the opening of this return statement (listing line 336) is
    // missing from the extract.
337  (int64_t)Result);
338  }
339 
340  // Either octal or hexadecimal.
341  long long Value;
342  unsigned Radix = doLookAhead(CurPtr, 8);
343  bool isHex = Radix == 16;
344  StringRef Result(TokStart, CurPtr - TokStart);
345  if (Result.getAsInteger(Radix, Value))
346  return ReturnError(TokStart, !isHex ? "invalid octal number" :
347  "invalid hexdecimal number");
348 
349  // Consume the [hH].
350  if (Radix == 16)
351  ++CurPtr;
352 
353  // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
354  // suffixes on integer literals.
355  SkipIgnoredIntegerSuffix(CurPtr);
356 
357  return AsmToken(AsmToken::Integer, Result, Value);
358 }
359 
360 /// LexSingleQuote: Integer: 'b'
361 AsmToken AsmLexer::LexSingleQuote() {
362  int CurChar = getNextChar();
363 
364  if (CurChar == '\\')
365  CurChar = getNextChar();
366 
367  if (CurChar == EOF)
368  return ReturnError(TokStart, "unterminated single quote");
369 
370  CurChar = getNextChar();
371 
372  if (CurChar != '\'')
373  return ReturnError(TokStart, "single quote way too long");
374 
375  // The idea here being that 'c' is basically just an integral
376  // constant.
377  StringRef Res = StringRef(TokStart,CurPtr - TokStart);
378  long long Value;
379 
380  if (Res.startswith("\'\\")) {
381  char theChar = Res[2];
382  switch (theChar) {
383  default: Value = theChar; break;
384  case '\'': Value = '\''; break;
385  case 't': Value = '\t'; break;
386  case 'n': Value = '\n'; break;
387  case 'b': Value = '\b'; break;
388  }
389  } else
390  Value = TokStart[1];
391 
392  return AsmToken(AsmToken::Integer, Res, Value);
393 }
394 
395 
396 /// LexQuote: String: "..."
///
/// Called after the opening '"' has been consumed. Scans forward to the
/// closing '"', treating a backslash as escaping the character that follows it,
/// and reports "unterminated string constant" if the buffer ends first.
//
// NOTE(review): listing line 412 is absent from this extract, so the statement
// that builds the token for a successfully terminated string is not visible.
// Presumably it returns a string token spanning TokStart..CurPtr -- confirm
// against the original file.
397 AsmToken AsmLexer::LexQuote() {
398  int CurChar = getNextChar();
399  // TODO: does gas allow multiline string constants?
400  while (CurChar != '"') {
401  if (CurChar == '\\') {
402  // Allow \", etc.
403  CurChar = getNextChar();
404  }
405 
406  if (CurChar == EOF)
407  return ReturnError(TokStart, "unterminated string constant");
408 
409  CurChar = getNextChar();
410  }
411 
413 }
414 
// LexUntilEndOfStatement - Return the raw text from the current position up
// to, but not including, the first comment start, statement separator, '\n',
// '\r', or the nul at the end of the buffer. TokStart is moved to the starting
// position and CurPtr is left on the terminating character.
//
// NOTE(review): the signature (listing line 415) is absent from this extract;
// the cross-reference index at the end of the listing gives it as
// `StringRef LexUntilEndOfStatement()`.
416  TokStart = CurPtr;
417 
418  while (!isAtStartOfComment(*CurPtr) && // Start of line comment.
419  !isAtStatementSeparator(CurPtr) && // End of statement marker.
420  *CurPtr != '\n' &&
421  *CurPtr != '\r' &&
    // A nul only ends the scan when it is the one at the end of the buffer.
422  (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) {
423  ++CurPtr;
424  }
425  return StringRef(TokStart, CurPtr-TokStart);
426 }
427 
// LexUntilEndOfLine - Return the raw text from the current position up to, but
// not including, the first '\n', '\r', or the nul at the end of the buffer.
// Unlike the statement-level variant, comment starts and statement separators
// are not treated as terminators here. TokStart is moved to the starting
// position.
//
// NOTE(review): the signature (listing line 428) is absent from this extract;
// the cross-reference index at the end of the listing gives it as
// `StringRef LexUntilEndOfLine()`.
429  TokStart = CurPtr;
430 
431  while (*CurPtr != '\n' &&
432  *CurPtr != '\r' &&
    // A nul only ends the scan when it is the one at the end of the buffer.
433  (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) {
434  ++CurPtr;
435  }
436  return StringRef(TokStart, CurPtr-TokStart);
437 }
438 
// isAtStartOfComment - Return true if the given character equals the first
// character of MAI.getCommentString().
//
// NOTE(review): the signature (listing line 439) is absent from this extract;
// the cross-reference index at the end of the listing gives it as
// `bool isAtStartOfComment(char Char)`.
440  // FIXME: This won't work for multi-character comment indicators like "//".
441  return Char == *MAI.getCommentString();
442 }
443 
444 bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
445  return strncmp(Ptr, MAI.getSeparatorString(),
446  strlen(MAI.getSeparatorString())) == 0;
447 }
448 
// LexToken - Read the next token and return it. One character is always
// consumed first. Comment starts, end-of-file handling and whitespace are dealt
// with up front, and the remaining characters are dispatched through the switch
// below to single-character tokens, two-character operators, or one of the
// Lex* helpers.
//
// NOTE(review): this extract is missing listing lines 449 (the signature; the
// cross-reference index at the end of the listing names the function
// LexToken()), 459, 463, 465, 466, 473, 498 and 503. Several branches below
// therefore appear without their condition or return statement; inline notes
// mark each gap. Confirm against the original AsmLexer.cpp before relying on
// this text.
450  TokStart = CurPtr;
451  // This always consumes at least one character.
452  int CurChar = getNextChar();
453 
454  if (isAtStartOfComment(CurChar)) {
455  // If this comment starts with a '#', then return the Hash token and let
456  // the assembler parser see if it can be parsed as a cpp line filename
457  // comment. We do this only if we are at the start of a line.
458  if (CurChar == '#' && isAtStartOfLine)
    // NOTE(review): listing line 459, the statement controlled by the `if`
    // above, is missing.
460  isAtStartOfLine = true;
461  return LexLineComment();
462  }
    // NOTE(review): listing line 463 is missing; presumably it opens an
    // `if (isAtStatementSeparator(TokStart))` block -- confirm.
    // One separator character was already consumed by getNextChar(), hence the
    // "- 1" when skipping the rest of the separator string.
464  CurPtr += strlen(MAI.getSeparatorString()) - 1;
    // NOTE(review): listing lines 465-466 (the return for this branch) are
    // missing.
467  }
468 
469  // If we're missing a newline at EOF, make sure we still get an
470  // EndOfStatement token before the Eof token.
471  if (CurChar == EOF && !isAtStartOfLine) {
472  isAtStartOfLine = true;
    // NOTE(review): listing line 473 (the return for this branch) is missing.
474  }
475 
476  isAtStartOfLine = false;
477  switch (CurChar) {
478  default:
479  // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
480  if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
481  return LexIdentifier();
482 
483  // Unknown character, emit an error.
484  return ReturnError(TokStart, "invalid character in input");
485  case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
486  case 0:
487  case ' ':
488  case '\t':
489  if (SkipSpace) {
490  // Ignore whitespace.
491  return LexToken();
492  } else {
    // Measure the whole run of spaces and tabs so it can form one token.
493  int len = 1;
494  while (*CurPtr==' ' || *CurPtr=='\t') {
495  CurPtr++;
496  len++;
497  }
    // NOTE(review): listing line 498 (the return that uses `len`) is missing.
499  }
500  case '\n': // FALL THROUGH.
501  case '\r':
502  isAtStartOfLine = true;
    // NOTE(review): listing line 503 (the return for the newline case) is
    // missing.
504  case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
505  case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
506  case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
507  case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
508  case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
509  case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
510  case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
511  case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
512  case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
513  case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
514  case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
515  case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
516  case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
517  case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1));
518  case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
    // The operators below peek at the next character to tell one-character
    // tokens from their two-character forms.
519  case '=':
520  if (*CurPtr == '=')
521  return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
522  return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
523  case '|':
524  if (*CurPtr == '|')
525  return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
526  return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
527  case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
528  case '&':
529  if (*CurPtr == '&')
530  return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
531  return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
532  case '!':
533  if (*CurPtr == '=')
534  return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
535  return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
536  case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
537  case '/': return LexSlash();
538  case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
539  case '\'': return LexSingleQuote();
540  case '"': return LexQuote();
541  case '0': case '1': case '2': case '3': case '4':
542  case '5': case '6': case '7': case '8': case '9':
543  return LexDigit();
544  case '<':
545  switch (*CurPtr) {
546  case '<': return ++CurPtr, AsmToken(AsmToken::LessLess,
547  StringRef(TokStart, 2));
548  case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual,
549  StringRef(TokStart, 2));
550  case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater,
551  StringRef(TokStart, 2));
552  default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
553  }
554  case '>':
555  switch (*CurPtr) {
556  case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater,
557  StringRef(TokStart, 2));
558  case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual,
559  StringRef(TokStart, 2));
560  default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
561  }
562 
563  // TODO: Quoted identifiers (objc methods etc)
564  // local labels: [0-9][:]
565  // Forward/backward labels: [0-9][fb]
566  // Integers, fp constants, character constants.
567  }
568 }
static bool IsIdentifierChar(char c)
LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]*.
Definition: AsmLexer.cpp:142
const char * getBufferStart() const
Definition: MemoryBuffer.h:51
int isdigit(int c);
virtual AsmToken LexToken()
LexToken - Read the next token and return its code.
Definition: AsmLexer.cpp:449
virtual StringRef LexUntilEndOfStatement()
Definition: AsmLexer.cpp:415
AsmToken - Target independent representation for an assembler token.
Definition: MCAsmLexer.h:21
StringRef LexUntilEndOfLine()
Definition: AsmLexer.cpp:428
const char * TokStart
Definition: MCAsmLexer.h:119
bool isAtStatementSeparator(const char *Ptr)
Definition: AsmLexer.cpp:444
bool startswith(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:208
size_t strlen(const char *s);
static unsigned doLookAhead(const char *&CurPtr, unsigned DefaultRadix)
Definition: AsmLexer.cpp:217
static SMLoc getFromPointer(const char *Ptr)
Definition: SMLoc.h:35
void setBuffer(const MemoryBuffer *buf, const char *ptr=NULL)
Definition: AsmLexer.cpp:33
const char * getSeparatorString() const
Definition: MCAsmInfo.h:414
void SetError(const SMLoc &errLoc, const std::string &err)
Definition: MCAsmLexer.h:126
const char * getBufferEnd() const
Definition: MemoryBuffer.h:52
LLVM Value Representation.
Definition: Value.h:66
static void SkipIgnoredIntegerSuffix(const char *&CurPtr)
Definition: AsmLexer.cpp:205
int strncmp(const char *s1, const char *s2, size_t n);
bool isAtStartOfComment(char Char)
Definition: AsmLexer.cpp:439
const char * getCommentString() const
Definition: MCAsmInfo.h:420