LLVM API Documentation

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
regcomp.c
Go to the documentation of this file.
1 /*-
2  * This code is derived from OpenBSD's libc/regex, original license follows:
3  *
4  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5  * Copyright (c) 1992, 1993, 1994
6  * The Regents of the University of California. All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * Henry Spencer.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  * notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  * notice, this list of conditions and the following disclaimer in the
18  * documentation and/or other materials provided with the distribution.
19  * 3. Neither the name of the University nor the names of its contributors
20  * may be used to endorse or promote products derived from this software
21  * without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
36  */
37 
38 #include <sys/types.h>
39 #include <stdio.h>
40 #include <string.h>
41 #include <ctype.h>
42 #include <limits.h>
43 #include <stdlib.h>
44 #include "regex_impl.h"
45 
46 #include "regutils.h"
47 #include "regex2.h"
48 
49 #include "regcclass.h"
50 #include "regcname.h"
51 
52 /*
53  * parse structure, passed up and down to avoid global variables and
54  * other clumsinesses
55  */
56 struct parse {
57  char *next; /* next character in RE */
58  char *end; /* end of string (-> NUL normally) */
59  int error; /* has an error been seen? */
60  sop *strip; /* malloced strip */
61  sopno ssize; /* malloced strip size (allocated) */
62  sopno slen; /* malloced strip length (used) */
63  int ncsalloc; /* number of csets allocated */
64  struct re_guts *g;
65 # define NPAREN 10 /* we need to remember () 1-9 for back refs */
66  sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
67  sopno pend[NPAREN]; /* -> ) ([0] unused) */
68 };
69 
70 static void p_ere(struct parse *, int);
71 static void p_ere_exp(struct parse *);
72 static void p_str(struct parse *);
73 static void p_bre(struct parse *, int, int);
74 static int p_simp_re(struct parse *, int);
75 static int p_count(struct parse *);
76 static void p_bracket(struct parse *);
77 static void p_b_term(struct parse *, cset *);
78 static void p_b_cclass(struct parse *, cset *);
79 static void p_b_eclass(struct parse *, cset *);
80 static char p_b_symbol(struct parse *);
81 static char p_b_coll_elem(struct parse *, int);
82 static char othercase(int);
83 static void bothcases(struct parse *, int);
84 static void ordinary(struct parse *, int);
85 static void nonnewline(struct parse *);
86 static void repeat(struct parse *, sopno, int, int);
87 static int seterr(struct parse *, int);
88 static cset *allocset(struct parse *);
89 static void freeset(struct parse *, cset *);
90 static int freezeset(struct parse *, cset *);
91 static int firstch(struct parse *, cset *);
92 static int nch(struct parse *, cset *);
93 static void mcadd(struct parse *, cset *, const char *);
94 static void mcinvert(struct parse *, cset *);
95 static void mccase(struct parse *, cset *);
96 static int isinsets(struct re_guts *, int);
97 static int samesets(struct re_guts *, int, int);
98 static void categorize(struct parse *, struct re_guts *);
99 static sopno dupl(struct parse *, sopno, sopno);
100 static void doemit(struct parse *, sop, size_t);
101 static void doinsert(struct parse *, sop, size_t, sopno);
102 static void dofwd(struct parse *, sopno, sop);
103 static void enlarge(struct parse *, sopno);
104 static void stripsnug(struct parse *, struct re_guts *);
105 static void findmust(struct parse *, struct re_guts *);
106 static sopno pluscount(struct parse *, struct re_guts *);
107 
108 static char nuls[10]; /* place to point scanner in event of error */
109 
110 /*
111  * macros for use with parse structure
112  * BEWARE: these know that the parse structure is named `p' !!!
113  */
114 #define PEEK() (*p->next)
115 #define PEEK2() (*(p->next+1))
116 #define MORE() (p->next < p->end)
117 #define MORE2() (p->next+1 < p->end)
118 #define SEE(c) (MORE() && PEEK() == (c))
119 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
120 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
121 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
122 #define NEXT() (p->next++)
123 #define NEXT2() (p->next += 2)
124 #define NEXTn(n) (p->next += (n))
125 #define GETNEXT() (*p->next++)
126 #define SETERROR(e) seterr(p, (e))
127 #define REQUIRE(co, e) (void)((co) || SETERROR(e))
128 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
129 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
130 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
131 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
132 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
133 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
134 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
135 #define HERE() (p->slen)
136 #define THERE() (p->slen - 1)
137 #define THERETHERE() (p->slen - 2)
138 #define DROP(n) (p->slen -= (n))
139 
140 #ifdef _POSIX2_RE_DUP_MAX
141 #define DUPMAX _POSIX2_RE_DUP_MAX
142 #else
143 #define DUPMAX 255
144 #endif
145 #define INFINITY (DUPMAX + 1)
146 
147 #ifndef NDEBUG
148 static int never = 0; /* for use in asserts; shuts lint up */
149 #else
150 #define never 0 /* some <assert.h>s have bugs too */
151 #endif
152 
153 /*
154  - llvm_regcomp - interface for parser and compilation
155  */
156 int /* 0 success, otherwise REG_something */
157 llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags)
158 {
159  struct parse pa;
160  struct re_guts *g;
161  struct parse *p = &pa;
162  int i;
163  size_t len;
164 #ifdef REDEBUG
165 # define GOODFLAGS(f) (f)
166 #else
167 # define GOODFLAGS(f) ((f)&~REG_DUMP)
168 #endif
169 
170  cflags = GOODFLAGS(cflags);
171  if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
172  return(REG_INVARG);
173 
174  if (cflags&REG_PEND) {
175  if (preg->re_endp < pattern)
176  return(REG_INVARG);
177  len = preg->re_endp - pattern;
178  } else
179  len = strlen((const char *)pattern);
180 
181  /* do the mallocs early so failure handling is easy */
182  g = (struct re_guts *)malloc(sizeof(struct re_guts) +
183  (NC-1)*sizeof(cat_t));
184  if (g == NULL)
185  return(REG_ESPACE);
186  p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
187  p->strip = (sop *)calloc(p->ssize, sizeof(sop));
188  p->slen = 0;
189  if (p->strip == NULL) {
190  free((char *)g);
191  return(REG_ESPACE);
192  }
193 
194  /* set things up */
195  p->g = g;
196  p->next = (char *)pattern; /* convenience; we do not modify it */
197  p->end = p->next + len;
198  p->error = 0;
199  p->ncsalloc = 0;
200  for (i = 0; i < NPAREN; i++) {
201  p->pbegin[i] = 0;
202  p->pend[i] = 0;
203  }
204  g->csetsize = NC;
205  g->sets = NULL;
206  g->setbits = NULL;
207  g->ncsets = 0;
208  g->cflags = cflags;
209  g->iflags = 0;
210  g->nbol = 0;
211  g->neol = 0;
212  g->must = NULL;
213  g->mlen = 0;
214  g->nsub = 0;
215  g->ncategories = 1; /* category 0 is "everything else" */
216  g->categories = &g->catspace[-(CHAR_MIN)];
217  (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
218  g->backrefs = 0;
219 
220  /* do it */
221  EMIT(OEND, 0);
222  g->firststate = THERE();
223  if (cflags&REG_EXTENDED)
224  p_ere(p, OUT);
225  else if (cflags&REG_NOSPEC)
226  p_str(p);
227  else
228  p_bre(p, OUT, OUT);
229  EMIT(OEND, 0);
230  g->laststate = THERE();
231 
232  /* tidy up loose ends and fill things in */
233  categorize(p, g);
234  stripsnug(p, g);
235  findmust(p, g);
236  g->nplus = pluscount(p, g);
237  g->magic = MAGIC2;
238  preg->re_nsub = g->nsub;
239  preg->re_g = g;
240  preg->re_magic = MAGIC1;
241 #ifndef REDEBUG
242  /* not debugging, so can't rely on the assert() in llvm_regexec() */
243  if (g->iflags&REGEX_BAD)
245 #endif
246 
247  /* win or lose, we're done */
248  if (p->error != 0) /* lose */
249  llvm_regfree(preg);
250  return(p->error);
251 }
252 
253 /*
254  - p_ere - ERE parser top level, concatenation and alternation
255  */
256 static void
257 p_ere(struct parse *p, int stop) /* character this ERE should end at */
258 {
259  char c;
260  sopno prevback = 0;
261  sopno prevfwd = 0;
262  sopno conc;
263  int first = 1; /* is this the first alternative? */
264 
265  for (;;) {
266  /* do a bunch of concatenated expressions */
267  conc = HERE();
268  while (MORE() && (c = PEEK()) != '|' && c != stop)
269  p_ere_exp(p);
270  REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
271 
272  if (!EAT('|'))
273  break; /* NOTE BREAK OUT */
274 
275  if (first) {
276  INSERT(OCH_, conc); /* offset is wrong */
277  prevfwd = conc;
278  prevback = conc;
279  first = 0;
280  }
281  ASTERN(OOR1, prevback);
282  prevback = THERE();
283  AHEAD(prevfwd); /* fix previous offset */
284  prevfwd = HERE();
285  EMIT(OOR2, 0); /* offset is very wrong */
286  }
287 
288  if (!first) { /* tail-end fixups */
289  AHEAD(prevfwd);
290  ASTERN(O_CH, prevback);
291  }
292 
293  assert(!MORE() || SEE(stop));
294 }
295 
296 /*
297  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
298  */
299 static void
300 p_ere_exp(struct parse *p)
301 {
302  char c;
303  sopno pos;
304  int count;
305  int count2;
306  int backrefnum;
307  sopno subno;
308  int wascaret = 0;
309 
310  assert(MORE()); /* caller should have ensured this */
311  c = GETNEXT();
312 
313  pos = HERE();
314  switch (c) {
315  case '(':
316  REQUIRE(MORE(), REG_EPAREN);
317  p->g->nsub++;
318  subno = p->g->nsub;
319  if (subno < NPAREN)
320  p->pbegin[subno] = HERE();
321  EMIT(OLPAREN, subno);
322  if (!SEE(')'))
323  p_ere(p, ')');
324  if (subno < NPAREN) {
325  p->pend[subno] = HERE();
326  assert(p->pend[subno] != 0);
327  }
328  EMIT(ORPAREN, subno);
329  MUSTEAT(')', REG_EPAREN);
330  break;
331 #ifndef POSIX_MISTAKE
332  case ')': /* happens only if no current unmatched ( */
333  /*
334  * You may ask, why the ifndef? Because I didn't notice
335  * this until slightly too late for 1003.2, and none of the
336  * other 1003.2 regular-expression reviewers noticed it at
337  * all. So an unmatched ) is legal POSIX, at least until
338  * we can get it fixed.
339  */
341  break;
342 #endif
343  case '^':
344  EMIT(OBOL, 0);
345  p->g->iflags |= USEBOL;
346  p->g->nbol++;
347  wascaret = 1;
348  break;
349  case '$':
350  EMIT(OEOL, 0);
351  p->g->iflags |= USEEOL;
352  p->g->neol++;
353  break;
354  case '|':
356  break;
357  case '*':
358  case '+':
359  case '?':
361  break;
362  case '.':
363  if (p->g->cflags&REG_NEWLINE)
364  nonnewline(p);
365  else
366  EMIT(OANY, 0);
367  break;
368  case '[':
369  p_bracket(p);
370  break;
371  case '\\':
373  c = GETNEXT();
374  if (c >= '1' && c <= '9') {
375  /* \[0-9] is taken to be a back-reference to a previously specified
376  * matching group. backrefnum will hold the number. The matching
377  * group must exist (i.e. if \4 is found there must have been at
378  * least 4 matching groups specified in the pattern previously).
379  */
380  backrefnum = c - '0';
381  if (p->pend[backrefnum] == 0) {
383  break;
384  }
385 
386  /* Make sure everything checks out and emit the sequence
387  * that marks a back-reference to the parse structure.
388  */
389  assert(backrefnum <= p->g->nsub);
390  EMIT(OBACK_, backrefnum);
391  assert(p->pbegin[backrefnum] != 0);
392  assert(OP(p->strip[p->pbegin[backrefnum]]) != OLPAREN);
393  assert(OP(p->strip[p->pend[backrefnum]]) != ORPAREN);
394  (void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]);
395  EMIT(O_BACK, backrefnum);
396  p->g->backrefs = 1;
397  } else {
398  /* Other chars are simply themselves when escaped with a backslash.
399  */
400  ordinary(p, c);
401  }
402  break;
403  case '{': /* okay as ordinary except if digit follows */
404  REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
405  /* FALLTHROUGH */
406  default:
407  ordinary(p, c);
408  break;
409  }
410 
411  if (!MORE())
412  return;
413  c = PEEK();
414  /* we call { a repetition if followed by a digit */
415  if (!( c == '*' || c == '+' || c == '?' ||
416  (c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
417  return; /* no repetition, we're done */
418  NEXT();
419 
420  REQUIRE(!wascaret, REG_BADRPT);
421  switch (c) {
422  case '*': /* implemented as +? */
423  /* this case does not require the (y|) trick, noKLUDGE */
424  INSERT(OPLUS_, pos);
425  ASTERN(O_PLUS, pos);
426  INSERT(OQUEST_, pos);
427  ASTERN(O_QUEST, pos);
428  break;
429  case '+':
430  INSERT(OPLUS_, pos);
431  ASTERN(O_PLUS, pos);
432  break;
433  case '?':
434  /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
435  INSERT(OCH_, pos); /* offset slightly wrong */
436  ASTERN(OOR1, pos); /* this one's right */
437  AHEAD(pos); /* fix the OCH_ */
438  EMIT(OOR2, 0); /* offset very wrong... */
439  AHEAD(THERE()); /* ...so fix it */
440  ASTERN(O_CH, THERETHERE());
441  break;
442  case '{':
443  count = p_count(p);
444  if (EAT(',')) {
445  if (isdigit((uch)PEEK())) {
446  count2 = p_count(p);
447  REQUIRE(count <= count2, REG_BADBR);
448  } else /* single number with comma */
449  count2 = INFINITY;
450  } else /* just a single number */
451  count2 = count;
452  repeat(p, pos, count, count2);
453  if (!EAT('}')) { /* error heuristics */
454  while (MORE() && PEEK() != '}')
455  NEXT();
456  REQUIRE(MORE(), REG_EBRACE);
458  }
459  break;
460  }
461 
462  if (!MORE())
463  return;
464  c = PEEK();
465  if (!( c == '*' || c == '+' || c == '?' ||
466  (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
467  return;
469 }
470 
471 /*
472  - p_str - string (no metacharacters) "parser"
473  */
474 static void
475 p_str(struct parse *p)
476 {
477  REQUIRE(MORE(), REG_EMPTY);
478  while (MORE())
479  ordinary(p, GETNEXT());
480 }
481 
482 /*
483  - p_bre - BRE parser top level, anchoring and concatenation
484  * Giving end1 as OUT essentially eliminates the end1/end2 check.
485  *
486  * This implementation is a bit of a kludge, in that a trailing $ is first
487  * taken as an ordinary character and then revised to be an anchor. The
488  * only undesirable side effect is that '$' gets included as a character
489  * category in such cases. This is fairly harmless; not worth fixing.
490  * The amount of lookahead needed to avoid this kludge is excessive.
491  */
492 static void
493 p_bre(struct parse *p,
494  int end1, /* first terminating character */
495  int end2) /* second terminating character */
496 {
497  sopno start = HERE();
498  int first = 1; /* first subexpression? */
499  int wasdollar = 0;
500 
501  if (EAT('^')) {
502  EMIT(OBOL, 0);
503  p->g->iflags |= USEBOL;
504  p->g->nbol++;
505  }
506  while (MORE() && !SEETWO(end1, end2)) {
507  wasdollar = p_simp_re(p, first);
508  first = 0;
509  }
510  if (wasdollar) { /* oops, that was a trailing anchor */
511  DROP(1);
512  EMIT(OEOL, 0);
513  p->g->iflags |= USEEOL;
514  p->g->neol++;
515  }
516 
517  REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
518 }
519 
520 /*
521  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
522  */
523 static int /* was the simple RE an unbackslashed $? */
524 p_simp_re(struct parse *p,
525  int starordinary) /* is a leading * an ordinary character? */
526 {
527  int c;
528  int count;
529  int count2;
530  sopno pos;
531  int i;
532  sopno subno;
533 # define BACKSL (1<<CHAR_BIT)
534 
535  pos = HERE(); /* repetion op, if any, covers from here */
536 
537  assert(MORE()); /* caller should have ensured this */
538  c = GETNEXT();
539  if (c == '\\') {
541  c = BACKSL | GETNEXT();
542  }
543  switch (c) {
544  case '.':
545  if (p->g->cflags&REG_NEWLINE)
546  nonnewline(p);
547  else
548  EMIT(OANY, 0);
549  break;
550  case '[':
551  p_bracket(p);
552  break;
553  case BACKSL|'{':
555  break;
556  case BACKSL|'(':
557  p->g->nsub++;
558  subno = p->g->nsub;
559  if (subno < NPAREN)
560  p->pbegin[subno] = HERE();
561  EMIT(OLPAREN, subno);
562  /* the MORE here is an error heuristic */
563  if (MORE() && !SEETWO('\\', ')'))
564  p_bre(p, '\\', ')');
565  if (subno < NPAREN) {
566  p->pend[subno] = HERE();
567  assert(p->pend[subno] != 0);
568  }
569  EMIT(ORPAREN, subno);
570  REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
571  break;
572  case BACKSL|')': /* should not get here -- must be user */
573  case BACKSL|'}':
575  break;
576  case BACKSL|'1':
577  case BACKSL|'2':
578  case BACKSL|'3':
579  case BACKSL|'4':
580  case BACKSL|'5':
581  case BACKSL|'6':
582  case BACKSL|'7':
583  case BACKSL|'8':
584  case BACKSL|'9':
585  i = (c&~BACKSL) - '0';
586  assert(i < NPAREN);
587  if (p->pend[i] != 0) {
588  assert(i <= p->g->nsub);
589  EMIT(OBACK_, i);
590  assert(p->pbegin[i] != 0);
591  assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
592  assert(OP(p->strip[p->pend[i]]) == ORPAREN);
593  (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
594  EMIT(O_BACK, i);
595  } else
597  p->g->backrefs = 1;
598  break;
599  case '*':
600  REQUIRE(starordinary, REG_BADRPT);
601  /* FALLTHROUGH */
602  default:
603  ordinary(p, (char)c);
604  break;
605  }
606 
607  if (EAT('*')) { /* implemented as +? */
608  /* this case does not require the (y|) trick, noKLUDGE */
609  INSERT(OPLUS_, pos);
610  ASTERN(O_PLUS, pos);
611  INSERT(OQUEST_, pos);
612  ASTERN(O_QUEST, pos);
613  } else if (EATTWO('\\', '{')) {
614  count = p_count(p);
615  if (EAT(',')) {
616  if (MORE() && isdigit((uch)PEEK())) {
617  count2 = p_count(p);
618  REQUIRE(count <= count2, REG_BADBR);
619  } else /* single number with comma */
620  count2 = INFINITY;
621  } else /* just a single number */
622  count2 = count;
623  repeat(p, pos, count, count2);
624  if (!EATTWO('\\', '}')) { /* error heuristics */
625  while (MORE() && !SEETWO('\\', '}'))
626  NEXT();
627  REQUIRE(MORE(), REG_EBRACE);
629  }
630  } else if (c == '$') /* $ (but not \$) ends it */
631  return(1);
632 
633  return(0);
634 }
635 
636 /*
637  - p_count - parse a repetition count
638  */
639 static int /* the value */
640 p_count(struct parse *p)
641 {
642  int count = 0;
643  int ndigits = 0;
644 
645  while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
646  count = count*10 + (GETNEXT() - '0');
647  ndigits++;
648  }
649 
650  REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
651  return(count);
652 }
653 
654 /*
655  - p_bracket - parse a bracketed character list
656  *
657  * Note a significant property of this code: if the allocset() did SETERROR,
658  * no set operations are done.
659  */
660 static void
661 p_bracket(struct parse *p)
662 {
663  cset *cs;
664  int invert = 0;
665 
666  /* Dept of Truly Sickening Special-Case Kludges */
667  if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
668  EMIT(OBOW, 0);
669  NEXTn(6);
670  return;
671  }
672  if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
673  EMIT(OEOW, 0);
674  NEXTn(6);
675  return;
676  }
677 
678  if ((cs = allocset(p)) == NULL) {
679  /* allocset did set error status in p */
680  return;
681  }
682 
683  if (EAT('^'))
684  invert++; /* make note to invert set at end */
685  if (EAT(']'))
686  CHadd(cs, ']');
687  else if (EAT('-'))
688  CHadd(cs, '-');
689  while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
690  p_b_term(p, cs);
691  if (EAT('-'))
692  CHadd(cs, '-');
693  MUSTEAT(']', REG_EBRACK);
694 
695  if (p->error != 0) { /* don't mess things up further */
696  freeset(p, cs);
697  return;
698  }
699 
700  if (p->g->cflags&REG_ICASE) {
701  int i;
702  int ci;
703 
704  for (i = p->g->csetsize - 1; i >= 0; i--)
705  if (CHIN(cs, i) && isalpha(i)) {
706  ci = othercase(i);
707  if (ci != i)
708  CHadd(cs, ci);
709  }
710  if (cs->multis != NULL)
711  mccase(p, cs);
712  }
713  if (invert) {
714  int i;
715 
716  for (i = p->g->csetsize - 1; i >= 0; i--)
717  if (CHIN(cs, i))
718  CHsub(cs, i);
719  else
720  CHadd(cs, i);
721  if (p->g->cflags&REG_NEWLINE)
722  CHsub(cs, '\n');
723  if (cs->multis != NULL)
724  mcinvert(p, cs);
725  }
726 
727  assert(cs->multis == NULL); /* xxx */
728 
729  if (nch(p, cs) == 1) { /* optimize singleton sets */
730  ordinary(p, firstch(p, cs));
731  freeset(p, cs);
732  } else
733  EMIT(OANYOF, freezeset(p, cs));
734 }
735 
736 /*
737  - p_b_term - parse one term of a bracketed character list
738  */
739 static void
740 p_b_term(struct parse *p, cset *cs)
741 {
742  char c;
743  char start, finish;
744  int i;
745 
746  /* classify what we've got */
747  switch ((MORE()) ? PEEK() : '\0') {
748  case '[':
749  c = (MORE2()) ? PEEK2() : '\0';
750  break;
751  case '-':
753  return; /* NOTE RETURN */
754  break;
755  default:
756  c = '\0';
757  break;
758  }
759 
760  switch (c) {
761  case ':': /* character class */
762  NEXT2();
763  REQUIRE(MORE(), REG_EBRACK);
764  c = PEEK();
765  REQUIRE(c != '-' && c != ']', REG_ECTYPE);
766  p_b_cclass(p, cs);
767  REQUIRE(MORE(), REG_EBRACK);
768  REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
769  break;
770  case '=': /* equivalence class */
771  NEXT2();
772  REQUIRE(MORE(), REG_EBRACK);
773  c = PEEK();
774  REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
775  p_b_eclass(p, cs);
776  REQUIRE(MORE(), REG_EBRACK);
777  REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
778  break;
779  default: /* symbol, ordinary character, or range */
780 /* xxx revision needed for multichar stuff */
781  start = p_b_symbol(p);
782  if (SEE('-') && MORE2() && PEEK2() != ']') {
783  /* range */
784  NEXT();
785  if (EAT('-'))
786  finish = '-';
787  else
788  finish = p_b_symbol(p);
789  } else
790  finish = start;
791 /* xxx what about signed chars here... */
792  REQUIRE(start <= finish, REG_ERANGE);
793  for (i = start; i <= finish; i++)
794  CHadd(cs, i);
795  break;
796  }
797 }
798 
799 /*
800  - p_b_cclass - parse a character-class name and deal with it
801  */
802 static void
803 p_b_cclass(struct parse *p, cset *cs)
804 {
805  char *sp = p->next;
806  struct cclass *cp;
807  size_t len;
808  const char *u;
809  char c;
810 
811  while (MORE() && isalpha((uch)PEEK()))
812  NEXT();
813  len = p->next - sp;
814  for (cp = cclasses; cp->name != NULL; cp++)
815  if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
816  break;
817  if (cp->name == NULL) {
818  /* oops, didn't find it */
820  return;
821  }
822 
823  u = cp->chars;
824  while ((c = *u++) != '\0')
825  CHadd(cs, c);
826  for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
827  MCadd(p, cs, u);
828 }
829 
830 /*
831  - p_b_eclass - parse an equivalence-class name and deal with it
832  *
833  * This implementation is incomplete. xxx
834  */
835 static void
836 p_b_eclass(struct parse *p, cset *cs)
837 {
838  char c;
839 
840  c = p_b_coll_elem(p, '=');
841  CHadd(cs, c);
842 }
843 
844 /*
845  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
846  */
847 static char /* value of symbol */
848 p_b_symbol(struct parse *p)
849 {
850  char value;
851 
852  REQUIRE(MORE(), REG_EBRACK);
853  if (!EATTWO('[', '.'))
854  return(GETNEXT());
855 
856  /* collating symbol */
857  value = p_b_coll_elem(p, '.');
858  REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
859  return(value);
860 }
861 
862 /*
863  - p_b_coll_elem - parse a collating-element name and look it up
864  */
865 static char /* value of collating element */
867  int endc) /* name ended by endc,']' */
868 {
869  char *sp = p->next;
870  struct cname *cp;
871  int len;
872 
873  while (MORE() && !SEETWO(endc, ']'))
874  NEXT();
875  if (!MORE()) {
877  return(0);
878  }
879  len = p->next - sp;
880  for (cp = cnames; cp->name != NULL; cp++)
881  if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
882  return(cp->code); /* known name */
883  if (len == 1)
884  return(*sp); /* single character */
885  SETERROR(REG_ECOLLATE); /* neither */
886  return(0);
887 }
888 
889 /*
890  - othercase - return the case counterpart of an alphabetic
891  */
892 static char /* if no counterpart, return ch */
893 othercase(int ch)
894 {
895  ch = (uch)ch;
896  assert(isalpha(ch));
897  if (isupper(ch))
898  return ((uch)tolower(ch));
899  else if (islower(ch))
900  return ((uch)toupper(ch));
901  else /* peculiar, but could happen */
902  return(ch);
903 }
904 
905 /*
906  - bothcases - emit a dualcase version of a two-case character
907  *
908  * Boy, is this implementation ever a kludge...
909  */
910 static void
911 bothcases(struct parse *p, int ch)
912 {
913  char *oldnext = p->next;
914  char *oldend = p->end;
915  char bracket[3];
916 
917  ch = (uch)ch;
918  assert(othercase(ch) != ch); /* p_bracket() would recurse */
919  p->next = bracket;
920  p->end = bracket+2;
921  bracket[0] = ch;
922  bracket[1] = ']';
923  bracket[2] = '\0';
924  p_bracket(p);
925  assert(p->next == bracket+2);
926  p->next = oldnext;
927  p->end = oldend;
928 }
929 
930 /*
931  - ordinary - emit an ordinary character
932  */
933 static void
934 ordinary(struct parse *p, int ch)
935 {
936  cat_t *cap = p->g->categories;
937 
938  if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
939  bothcases(p, ch);
940  else {
941  EMIT(OCHAR, (uch)ch);
942  if (cap[ch] == 0)
943  cap[ch] = p->g->ncategories++;
944  }
945 }
946 
947 /*
948  - nonnewline - emit REG_NEWLINE version of OANY
949  *
950  * Boy, is this implementation ever a kludge...
951  */
952 static void
953 nonnewline(struct parse *p)
954 {
955  char *oldnext = p->next;
956  char *oldend = p->end;
957  char bracket[4];
958 
959  p->next = bracket;
960  p->end = bracket+3;
961  bracket[0] = '^';
962  bracket[1] = '\n';
963  bracket[2] = ']';
964  bracket[3] = '\0';
965  p_bracket(p);
966  assert(p->next == bracket+3);
967  p->next = oldnext;
968  p->end = oldend;
969 }
970 
971 /*
972  - repeat - generate code for a bounded repetition, recursively if needed
973  */
974 static void
975 repeat(struct parse *p,
976  sopno start, /* operand from here to end of strip */
977  int from, /* repeated from this number */
978  int to) /* to this number of times (maybe INFINITY) */
979 {
980  sopno finish = HERE();
981 # define N 2
982 # define INF 3
983 # define REP(f, t) ((f)*8 + (t))
984 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
985  sopno copy;
986 
987  if (p->error != 0) /* head off possible runaway recursion */
988  return;
989 
990  assert(from <= to);
991 
992  switch (REP(MAP(from), MAP(to))) {
993  case REP(0, 0): /* must be user doing this */
994  DROP(finish-start); /* drop the operand */
995  break;
996  case REP(0, 1): /* as x{1,1}? */
997  case REP(0, N): /* as x{1,n}? */
998  case REP(0, INF): /* as x{1,}? */
999  /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1000  INSERT(OCH_, start); /* offset is wrong... */
1001  repeat(p, start+1, 1, to);
1002  ASTERN(OOR1, start);
1003  AHEAD(start); /* ... fix it */
1004  EMIT(OOR2, 0);
1005  AHEAD(THERE());
1006  ASTERN(O_CH, THERETHERE());
1007  break;
1008  case REP(1, 1): /* trivial case */
1009  /* done */
1010  break;
1011  case REP(1, N): /* as x?x{1,n-1} */
1012  /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1013  INSERT(OCH_, start);
1014  ASTERN(OOR1, start);
1015  AHEAD(start);
1016  EMIT(OOR2, 0); /* offset very wrong... */
1017  AHEAD(THERE()); /* ...so fix it */
1018  ASTERN(O_CH, THERETHERE());
1019  copy = dupl(p, start+1, finish+1);
1020  assert(copy == finish+4);
1021  repeat(p, copy, 1, to-1);
1022  break;
1023  case REP(1, INF): /* as x+ */
1024  INSERT(OPLUS_, start);
1025  ASTERN(O_PLUS, start);
1026  break;
1027  case REP(N, N): /* as xx{m-1,n-1} */
1028  copy = dupl(p, start, finish);
1029  repeat(p, copy, from-1, to-1);
1030  break;
1031  case REP(N, INF): /* as xx{n-1,INF} */
1032  copy = dupl(p, start, finish);
1033  repeat(p, copy, from-1, to);
1034  break;
1035  default: /* "can't happen" */
1036  SETERROR(REG_ASSERT); /* just in case */
1037  break;
1038  }
1039 }
1040 
1041 /*
1042  - seterr - set an error condition
1043  */
1044 static int /* useless but makes type checking happy */
1045 seterr(struct parse *p, int e)
1046 {
1047  if (p->error == 0) /* keep earliest error condition */
1048  p->error = e;
1049  p->next = nuls; /* try to bring things to a halt */
1050  p->end = nuls;
1051  return(0); /* make the return value well-defined */
1052 }
1053 
1054 /*
1055  - allocset - allocate a set of characters for []
1056  */
1057 static cset *
1058 allocset(struct parse *p)
1059 {
1060  int no = p->g->ncsets++;
1061  size_t nc;
1062  size_t nbytes;
1063  cset *cs;
1064  size_t css = (size_t)p->g->csetsize;
1065  int i;
1066 
1067  if (no >= p->ncsalloc) { /* need another column of space */
1068  void *ptr;
1069 
1070  p->ncsalloc += CHAR_BIT;
1071  nc = p->ncsalloc;
1072  assert(nc % CHAR_BIT == 0);
1073  nbytes = nc / CHAR_BIT * css;
1074 
1075  ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset));
1076  if (ptr == NULL)
1077  goto nomem;
1078  p->g->sets = ptr;
1079 
1080  ptr = (uch *)realloc((char *)p->g->setbits, nbytes);
1081  if (ptr == NULL)
1082  goto nomem;
1083  p->g->setbits = ptr;
1084 
1085  for (i = 0; i < no; i++)
1086  p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1087 
1088  (void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
1089  }
1090  /* XXX should not happen */
1091  if (p->g->sets == NULL || p->g->setbits == NULL)
1092  goto nomem;
1093 
1094  cs = &p->g->sets[no];
1095  cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1096  cs->mask = 1 << ((no) % CHAR_BIT);
1097  cs->hash = 0;
1098  cs->smultis = 0;
1099  cs->multis = NULL;
1100 
1101  return(cs);
1102 nomem:
1103  free(p->g->sets);
1104  p->g->sets = NULL;
1105  free(p->g->setbits);
1106  p->g->setbits = NULL;
1107 
1109  /* caller's responsibility not to do set ops */
1110  return(NULL);
1111 }
1112 
1113 /*
1114  - freeset - free a now-unused set
1115  */
1116 static void
1117 freeset(struct parse *p, cset *cs)
1118 {
1119  size_t i;
1120  cset *top = &p->g->sets[p->g->ncsets];
1121  size_t css = (size_t)p->g->csetsize;
1122 
1123  for (i = 0; i < css; i++)
1124  CHsub(cs, i);
1125  if (cs == top-1) /* recover only the easy case */
1126  p->g->ncsets--;
1127 }
1128 
1129 /*
1130  - freezeset - final processing on a set of characters
1131  *
1132  * The main task here is merging identical sets. This is usually a waste
1133  * of time (although the hash code minimizes the overhead), but can win
1134  * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
1135  * is done using addition rather than xor -- all ASCII [aA] sets xor to
1136  * the same value!
1137  */
1138 static int /* set number */
1139 freezeset(struct parse *p, cset *cs)
1140 {
1141  uch h = cs->hash;
1142  size_t i;
1143  cset *top = &p->g->sets[p->g->ncsets];
1144  cset *cs2;
1145  size_t css = (size_t)p->g->csetsize;
1146 
1147  /* look for an earlier one which is the same */
1148  for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1149  if (cs2->hash == h && cs2 != cs) {
1150  /* maybe */
1151  for (i = 0; i < css; i++)
1152  if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1153  break; /* no */
1154  if (i == css)
1155  break; /* yes */
1156  }
1157 
1158  if (cs2 < top) { /* found one */
1159  freeset(p, cs);
1160  cs = cs2;
1161  }
1162 
1163  return((int)(cs - p->g->sets));
1164 }
1165 
1166 /*
1167  - firstch - return first character in a set (which must have at least one)
1168  */
1169 static int /* character; there is no "none" value */
1170 firstch(struct parse *p, cset *cs)
1171 {
1172  size_t i;
1173  size_t css = (size_t)p->g->csetsize;
1174 
1175  for (i = 0; i < css; i++)
1176  if (CHIN(cs, i))
1177  return((char)i);
1178  assert(never);
1179  return(0); /* arbitrary */
1180 }
1181 
1182 /*
1183  - nch - number of characters in a set
1184  */
1185 static int
1186 nch(struct parse *p, cset *cs)
1187 {
1188  size_t i;
1189  size_t css = (size_t)p->g->csetsize;
1190  int n = 0;
1191 
1192  for (i = 0; i < css; i++)
1193  if (CHIN(cs, i))
1194  n++;
1195  return(n);
1196 }
1197 
1198 /*
1199  - mcadd - add a collating element to a cset
1200  */
1201 static void
1202 mcadd( struct parse *p, cset *cs, const char *cp)
1203 {
1204  size_t oldend = cs->smultis;
1205  void *np;
1206 
1207  cs->smultis += strlen(cp) + 1;
1208  np = realloc(cs->multis, cs->smultis);
1209  if (np == NULL) {
1210  if (cs->multis)
1211  free(cs->multis);
1212  cs->multis = NULL;
1214  return;
1215  }
1216  cs->multis = np;
1217 
1218  llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
1219 }
1220 
1221 /*
1222  - mcinvert - invert the list of collating elements in a cset
1223  *
1224  * This would have to know the set of possibilities. Implementation
1225  * is deferred.
1226  */
1227 /* ARGSUSED */
1228 static void
1229 mcinvert(struct parse *p, cset *cs)
1230 {
1231  assert(cs->multis == NULL); /* xxx */
1232 }
1233 
1234 /*
1235  - mccase - add case counterparts of the list of collating elements in a cset
1236  *
1237  * This would have to know the set of possibilities. Implementation
1238  * is deferred.
1239  */
1240 /* ARGSUSED */
1241 static void
1242 mccase(struct parse *p, cset *cs)
1243 {
1244  assert(cs->multis == NULL); /* xxx */
1245 }
1246 
1247 /*
1248  - isinsets - is this character in any sets?
1249  */
1250 static int /* predicate */
1251 isinsets(struct re_guts *g, int c)
1252 {
1253  uch *col;
1254  int i;
1255  int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1256  unsigned uc = (uch)c;
1257 
1258  for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1259  if (col[uc] != 0)
1260  return(1);
1261  return(0);
1262 }
1263 
1264 /*
1265  - samesets - are these two characters in exactly the same sets?
1266  */
1267 static int /* predicate */
1268 samesets(struct re_guts *g, int c1, int c2)
1269 {
1270  uch *col;
1271  int i;
1272  int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1273  unsigned uc1 = (uch)c1;
1274  unsigned uc2 = (uch)c2;
1275 
1276  for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1277  if (col[uc1] != col[uc2])
1278  return(0);
1279  return(1);
1280 }
1281 
1282 /*
1283  - categorize - sort out character categories
1284  */
1285 static void
1286 categorize(struct parse *p, struct re_guts *g)
1287 {
1288  cat_t *cats = g->categories;
1289  int c;
1290  int c2;
1291  cat_t cat;
1292 
1293  /* avoid making error situations worse */
1294  if (p->error != 0)
1295  return;
1296 
1297  for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1298  if (cats[c] == 0 && isinsets(g, c)) {
1299  cat = g->ncategories++;
1300  cats[c] = cat;
1301  for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1302  if (cats[c2] == 0 && samesets(g, c, c2))
1303  cats[c2] = cat;
1304  }
1305 }
1306 
1307 /*
1308  - dupl - emit a duplicate of a bunch of sops
1309  */
1310 static sopno /* start of duplicate */
1311 dupl(struct parse *p,
1312  sopno start, /* from here */
1313  sopno finish) /* to this less one */
1314 {
1315  sopno ret = HERE();
1316  sopno len = finish - start;
1317 
1318  assert(finish >= start);
1319  if (len == 0)
1320  return(ret);
1321  enlarge(p, p->ssize + len); /* this many unexpected additions */
1322  assert(p->ssize >= p->slen + len);
1323  (void) memmove((char *)(p->strip + p->slen),
1324  (char *)(p->strip + start), (size_t)len*sizeof(sop));
1325  p->slen += len;
1326  return(ret);
1327 }
1328 
1329 /*
1330  - doemit - emit a strip operator
1331  *
1332  * It might seem better to implement this as a macro with a function as
1333  * hard-case backup, but it's just too big and messy unless there are
1334  * some changes to the data structures. Maybe later.
1335  */
1336 static void
1337 doemit(struct parse *p, sop op, size_t opnd)
1338 {
1339  /* avoid making error situations worse */
1340  if (p->error != 0)
1341  return;
1342 
1343  /* deal with oversize operands ("can't happen", more or less) */
1344  assert(opnd < 1<<OPSHIFT);
1345 
1346  /* deal with undersized strip */
1347  if (p->slen >= p->ssize)
1348  enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
1349  assert(p->slen < p->ssize);
1350 
1351  /* finally, it's all reduced to the easy case */
1352  p->strip[p->slen++] = SOP(op, opnd);
1353 }
1354 
1355 /*
1356  - doinsert - insert a sop into the strip
1357  */
1358 static void
1359 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1360 {
1361  sopno sn;
1362  sop s;
1363  int i;
1364 
1365  /* avoid making error situations worse */
1366  if (p->error != 0)
1367  return;
1368 
1369  sn = HERE();
1370  EMIT(op, opnd); /* do checks, ensure space */
1371  assert(HERE() == sn+1);
1372  s = p->strip[sn];
1373 
1374  /* adjust paren pointers */
1375  assert(pos > 0);
1376  for (i = 1; i < NPAREN; i++) {
1377  if (p->pbegin[i] >= pos) {
1378  p->pbegin[i]++;
1379  }
1380  if (p->pend[i] >= pos) {
1381  p->pend[i]++;
1382  }
1383  }
1384 
1385  memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1386  (HERE()-pos-1)*sizeof(sop));
1387  p->strip[pos] = s;
1388 }
1389 
1390 /*
1391  - dofwd - complete a forward reference
1392  */
1393 static void
1394 dofwd(struct parse *p, sopno pos, sop value)
1395 {
1396  /* avoid making error situations worse */
1397  if (p->error != 0)
1398  return;
1399 
1400  assert(value < 1<<OPSHIFT);
1401  p->strip[pos] = OP(p->strip[pos]) | value;
1402 }
1403 
1404 /*
1405  - enlarge - enlarge the strip
1406  */
1407 static void
1408 enlarge(struct parse *p, sopno size)
1409 {
1410  sop *sp;
1411 
1412  if (p->ssize >= size)
1413  return;
1414 
1415  sp = (sop *)realloc(p->strip, size*sizeof(sop));
1416  if (sp == NULL) {
1418  return;
1419  }
1420  p->strip = sp;
1421  p->ssize = size;
1422 }
1423 
1424 /*
1425  - stripsnug - compact the strip
1426  */
1427 static void
1428 stripsnug(struct parse *p, struct re_guts *g)
1429 {
1430  g->nstates = p->slen;
1431  g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1432  if (g->strip == NULL) {
1434  g->strip = p->strip;
1435  }
1436 }
1437 
1438 /*
1439  - findmust - fill in must and mlen with longest mandatory literal string
1440  *
1441  * This algorithm could do fancy things like analyzing the operands of |
1442  * for common subsequences. Someday. This code is simple and finds most
1443  * of the interesting cases.
1444  *
1445  * Note that must and mlen got initialized during setup.
1446  */
1447 static void
1448 findmust(struct parse *p, struct re_guts *g)
1449 {
1450  sop *scan;
1451  sop *start = 0; /* start initialized in the default case, after that */
1452  sop *newstart = 0; /* newstart was initialized in the OCHAR case */
1453  sopno newlen;
1454  sop s;
1455  char *cp;
1456  sopno i;
1457 
1458  /* avoid making error situations worse */
1459  if (p->error != 0)
1460  return;
1461 
1462  /* find the longest OCHAR sequence in strip */
1463  newlen = 0;
1464  scan = g->strip + 1;
1465  do {
1466  s = *scan++;
1467  switch (OP(s)) {
1468  case OCHAR: /* sequence member */
1469  if (newlen == 0) /* new sequence */
1470  newstart = scan - 1;
1471  newlen++;
1472  break;
1473  case OPLUS_: /* things that don't break one */
1474  case OLPAREN:
1475  case ORPAREN:
1476  break;
1477  case OQUEST_: /* things that must be skipped */
1478  case OCH_:
1479  scan--;
1480  do {
1481  scan += OPND(s);
1482  s = *scan;
1483  /* assert() interferes w debug printouts */
1484  if (OP(s) != O_QUEST && OP(s) != O_CH &&
1485  OP(s) != OOR2) {
1486  g->iflags |= REGEX_BAD;
1487  return;
1488  }
1489  } while (OP(s) != O_QUEST && OP(s) != O_CH);
1490  /* fallthrough */
1491  default: /* things that break a sequence */
1492  if (newlen > g->mlen) { /* ends one */
1493  start = newstart;
1494  g->mlen = newlen;
1495  }
1496  newlen = 0;
1497  break;
1498  }
1499  } while (OP(s) != OEND);
1500 
1501  if (g->mlen == 0) /* there isn't one */
1502  return;
1503 
1504  /* turn it into a character string */
1505  g->must = malloc((size_t)g->mlen + 1);
1506  if (g->must == NULL) { /* argh; just forget it */
1507  g->mlen = 0;
1508  return;
1509  }
1510  cp = g->must;
1511  scan = start;
1512  for (i = g->mlen; i > 0; i--) {
1513  while (OP(s = *scan++) != OCHAR)
1514  continue;
1515  assert(cp < g->must + g->mlen);
1516  *cp++ = (char)OPND(s);
1517  }
1518  assert(cp == g->must + g->mlen);
1519  *cp++ = '\0'; /* just on general principles */
1520 }
1521 
1522 /*
1523  - pluscount - count + nesting
1524  */
1525 static sopno /* nesting depth */
1526 pluscount(struct parse *p, struct re_guts *g)
1527 {
1528  sop *scan;
1529  sop s;
1530  sopno plusnest = 0;
1531  sopno maxnest = 0;
1532 
1533  if (p->error != 0)
1534  return(0); /* there may not be an OEND */
1535 
1536  scan = g->strip + 1;
1537  do {
1538  s = *scan++;
1539  switch (OP(s)) {
1540  case OPLUS_:
1541  plusnest++;
1542  break;
1543  case O_PLUS:
1544  if (plusnest > maxnest)
1545  maxnest = plusnest;
1546  plusnest--;
1547  break;
1548  }
1549  } while (OP(s) != OEND);
1550  if (plusnest != 0)
1551  g->iflags |= REGEX_BAD;
1552  return(maxnest);
1553 }
static void nonnewline(struct parse *)
Definition: regcomp.c:953
static void p_ere_exp(struct parse *)
Definition: regcomp.c:300
size_t re_nsub
Definition: regex_impl.h:50
#define OUT
Definition: regex2.h:156
#define REG_NOSPEC
Definition: regex_impl.h:61
#define REG_INVARG
Definition: regex_impl.h:81
#define OBOL
Definition: regex2.h:74
sopno nplus
Definition: regex2.h:150
static int p_simp_re(struct parse *, int)
Definition: regcomp.c:524
#define EAT(c)
Definition: regcomp.c:120
sopno slen
Definition: regcomp.c:62
static int firstch(struct parse *, cset *)
Definition: regcomp.c:1170
const char * name
Definition: regcclass.h:42
static struct cname cnames[]
#define BACKSL
#define OEOL
Definition: regex2.h:75
static int samesets(struct re_guts *, int, int)
Definition: regcomp.c:1268
static int freezeset(struct parse *, cset *)
Definition: regcomp.c:1139
long sopno
Definition: regex2.h:63
char * multis
Definition: regex2.h:110
#define ASTERN(sop, pos)
Definition: regcomp.c:134
#define REP(f, t)
int ncategories
Definition: regex2.h:144
int backrefs
Definition: regex2.h:149
#define THERE()
Definition: regcomp.c:136
#define REG_NEWLINE
Definition: regex_impl.h:60
static void mccase(struct parse *, cset *)
Definition: regcomp.c:1242
#define REG_BADRPT
Definition: regex_impl.h:78
#define MAGIC2
Definition: regex2.h:128
#define OOR2
Definition: regex2.h:88
#define OPLUS_
Definition: regex2.h:80
int mlen
Definition: regex2.h:147
static void p_b_term(struct parse *, cset *)
Definition: regcomp.c:740
#define REG_EESCAPE
Definition: regex_impl.h:70
static void categorize(struct parse *, struct re_guts *)
Definition: regcomp.c:1286
#define EMIT(op, sopnd)
Definition: regcomp.c:131
sopno ssize
Definition: regcomp.c:61
#define GETNEXT()
Definition: regcomp.c:125
static void findmust(struct parse *, struct re_guts *)
Definition: regcomp.c:1448
int llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags)
Definition: regcomp.c:157
int csetsize
Definition: regex2.h:130
int isdigit(int c);
#define DUPMAX
Definition: regcomp.c:143
static int seterr(struct parse *, int)
Definition: regcomp.c:1045
#define REG_EPAREN
Definition: regex_impl.h:73
#define NEXT()
Definition: regcomp.c:122
#define OPSHIFT
Definition: regex2.h:66
void llvm_regfree(llvm_regex_t *)
Definition: regfree.c:50
sop * strip
Definition: regex2.h:129
static cset * allocset(struct parse *)
Definition: regcomp.c:1058
#define O_CH
Definition: regex2.h:89
#define REG_EMPTY
Definition: regex_impl.h:79
int re_magic
Definition: regex_impl.h:49
const char * name
Definition: regcname.h:40
int cflags
Definition: regex2.h:134
static void stripsnug(struct parse *, struct re_guts *)
Definition: regcomp.c:1428
static void dofwd(struct parse *, sopno, sop)
Definition: regcomp.c:1394
Definition: regcname.h:39
#define REG_EXTENDED
Definition: regex_impl.h:57
sopno pend[NPAREN]
Definition: regcomp.c:67
static void doemit(struct parse *, sop, size_t)
Definition: regcomp.c:1337
#define CHIN(cs, c)
Definition: regex2.h:115
static void bothcases(struct parse *, int)
Definition: regcomp.c:911
#define REG_EBRACE
Definition: regex_impl.h:74
int ncsets
Definition: regex2.h:131
#define MCadd(p, cs, cp)
Definition: regex2.h:116
char * must
Definition: regex2.h:146
#define CHadd(cs, c)
Definition: regex2.h:113
#define NEXTn(n)
Definition: regcomp.c:124
static int nch(struct parse *, cset *)
Definition: regcomp.c:1186
Definition: regcomp.c:56
sopno firststate
Definition: regex2.h:136
char * next
Definition: regcomp.c:57
#define OEND
Definition: regex2.h:72
sopno pbegin[NPAREN]
Definition: regcomp.c:66
const char * re_endp
Definition: regex_impl.h:51
#define OQUEST_
Definition: regex2.h:82
#define SOP(op, opnd)
Definition: regex2.h:69
int error
Definition: regcomp.c:59
struct re_guts * g
Definition: regcomp.c:64
size_t llvm_strlcpy(char *dst, const char *src, size_t siz)
Definition: regstrlcpy.c:29
#define NPAREN
Definition: regcomp.c:65
#define O_BACK
Definition: regex2.h:79
#define SETERROR(e)
Definition: regcomp.c:126
static void enlarge(struct parse *, sopno)
Definition: regcomp.c:1408
#define AHEAD(pos)
Definition: regcomp.c:133
uch hash
Definition: regex2.h:108
static void p_ere(struct parse *, int)
Definition: regcomp.c:257
sopno laststate
Definition: regex2.h:137
sopno nstates
Definition: regex2.h:135
#define CHsub(cs, c)
Definition: regex2.h:114
#define REG_ICASE
Definition: regex_impl.h:58
uch * ptr
Definition: regex2.h:106
Definition: regex2.h:105
size_t smultis
Definition: regex2.h:109
#define REG_ECOLLATE
Definition: regex_impl.h:68
#define HERE()
Definition: regcomp.c:135
* if(!EatIfPresent(lltok::kw_thread_local)) return false
#define ORPAREN
Definition: regex2.h:85
#define REG_BADBR
Definition: regex_impl.h:75
char code
Definition: regcname.h:41
static void mcadd(struct parse *, cset *, const char *)
Definition: regcomp.c:1202
#define MAGIC1
Definition: regex2.h:41
uch * setbits
Definition: regex2.h:133
void free(void *ptr);
void *realloc(void *ptr, size_t size);
static void doinsert(struct parse *, sop, size_t, sopno)
Definition: regcomp.c:1359
#define OBOW
Definition: regex2.h:90
int neol
Definition: regex2.h:143
int iflags
Definition: regex2.h:138
for(unsigned i=0, e=MI->getNumOperands();i!=e;++i)
char * end
Definition: regcomp.c:58
#define GOODFLAGS(f)
#define INFINITY
Definition: regcomp.c:145
#define DROP(n)
Definition: regcomp.c:138
#define OANY
Definition: regex2.h:76
sop * strip
Definition: regcomp.c:60
#define REG_EBRACK
Definition: regex_impl.h:72
struct re_guts * re_g
Definition: regex_impl.h:52
uch mask
Definition: regex2.h:107
static char nuls[10]
Definition: regcomp.c:108
unsigned char uch
Definition: regutils.h:40
#define REGEX_BAD
Definition: regex2.h:141
#define OLPAREN
Definition: regex2.h:84
int ncsalloc
Definition: regcomp.c:63
static void p_b_cclass(struct parse *, cset *)
Definition: regcomp.c:803
unsigned char cat_t
Definition: regex2.h:121
#define SEETWO(a, b)
Definition: regcomp.c:119
const char * multis
Definition: regcclass.h:44
#define EATTWO(a, b)
Definition: regcomp.c:121
static void p_bracket(struct parse *)
Definition: regcomp.c:661
static void freeset(struct parse *, cset *)
Definition: regcomp.c:1117
cset * sets
Definition: regex2.h:132
#define MORE()
Definition: regcomp.c:116
static char p_b_symbol(struct parse *)
Definition: regcomp.c:848
static void ordinary(struct parse *, int)
Definition: regcomp.c:934
#define NC
Definition: regutils.h:39
size_t strlen(const char *s);
#define MAP(n)
#define OANYOF
Definition: regex2.h:77
static void p_bre(struct parse *, int, int)
Definition: regcomp.c:493
#define MORE2()
Definition: regcomp.c:117
#define REG_ERANGE
Definition: regex_impl.h:76
#define SEE(c)
Definition: regcomp.c:118
static int p_count(struct parse *)
Definition: regcomp.c:640
static struct cclass cclasses[]
#define OEOW
Definition: regex2.h:91
#define USEEOL
Definition: regex2.h:140
#define REG_PEND
Definition: regex_impl.h:62
#define OBACK_
Definition: regex2.h:78
void *malloc(size_t size);
#define USEBOL
Definition: regex2.h:139
#define THERETHERE()
Definition: regcomp.c:137
static char p_b_coll_elem(struct parse *, int)
Definition: regcomp.c:866
static void p_b_eclass(struct parse *, cset *)
Definition: regcomp.c:836
static char othercase(int)
Definition: regcomp.c:893
#define PEEK2()
Definition: regcomp.c:115
#define REG_ECTYPE
Definition: regex_impl.h:69
#define O_QUEST
Definition: regex2.h:83
#define N
static sopno dupl(struct parse *, sopno, sopno)
Definition: regcomp.c:1311
cat_t * categories
Definition: regex2.h:145
static void repeat(struct parse *, sopno, int, int)
Definition: regcomp.c:975
static sopno pluscount(struct parse *, struct re_guts *)
Definition: regcomp.c:1526
size_t nsub
Definition: regex2.h:148
#define OCH_
Definition: regex2.h:86
int magic
Definition: regex2.h:127
static void p_str(struct parse *)
Definition: regcomp.c:475
#define MUSTEAT(c, e)
Definition: regcomp.c:129
static int isinsets(struct re_guts *, int)
Definition: regcomp.c:1251
#define never
Definition: regcomp.c:150
#define OPND(n)
Definition: regex2.h:68
#define INF
const char * chars
Definition: regcclass.h:43
#define REG_ASSERT
Definition: regex_impl.h:80
#define OCHAR
Definition: regex2.h:73
int strncmp(const char *s1, const char *s2, size_t n);
#define PEEK()
Definition: regcomp.c:114
#define O_PLUS
Definition: regex2.h:81
#define REQUIRE(co, e)
Definition: regcomp.c:127
#define NEXT2()
Definition: regcomp.c:123
void *calloc(size_t count, size_t size);
unsigned long sop
Definition: regex2.h:62
#define INSERT(op, pos)
Definition: regcomp.c:132
#define REG_ESPACE
Definition: regex_impl.h:77
#define OP(n)
Definition: regex2.h:67
#define OOR1
Definition: regex2.h:87
cat_t catspace[1]
Definition: regex2.h:152
static void mcinvert(struct parse *, cset *)
Definition: regcomp.c:1229
int nbol
Definition: regex2.h:142
#define REG_ESUBREG
Definition: regex_impl.h:71