1/****************************************************************
2Copyright (C) Lucent Technologies 1997
3All Rights Reserved
4
5Permission to use, copy, modify, and distribute this software and
6its documentation for any purpose and without fee is hereby
7granted, provided that the above copyright notice appear in all
8copies and that both that the copyright notice and this
9permission notice and warranty disclaimer appear in supporting
10documentation, and that the name Lucent Technologies or any of
11its entities not be used in advertising or publicity pertaining
12to distribution of the software without specific, written prior
13permission.
14
15LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22THIS SOFTWARE.
23****************************************************************/
24
25/* lasciate ogne speranza, voi ch'intrate. */
26
27#define DEBUG
28
29#include <ctype.h>
30#include <limits.h>
31#include <stdio.h>
32#include <string.h>
33#include <stdlib.h>
34#include "awk.h"
35#include "awkgram.tab.h"
36
37#define MAXLIN 22
38
39#define type(v) (v)->nobj /* badly overloaded here */
40#define info(v) (v)->ntype /* badly overloaded here */
41#define left(v) (v)->narg[0]
42#define right(v) (v)->narg[1]
43#define parent(v) (v)->nnext
44
45#define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
46#define ELEAF case EMPTYRE: /* empty string in regexp */
47#define UNARY case STAR: case PLUS: case QUEST:
48
49/* encoding in tree Nodes:
50 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
51 left is index, right contains value or pointer to value
52 unary (STAR, PLUS, QUEST): left is child, right is null
53 binary (CAT, OR): left and right are children
54 parent contains pointer to parent
55*/
56
57
58int *setvec;
59int *tmpset;
60int maxsetvec = 0;
61
62int rtok; /* next token in current re */
63int rlxval;
64static const uschar *rlxstr;
65static const uschar *prestr; /* current position in current re */
66static const uschar *lastre; /* origin of last re */
67static const uschar *lastatom; /* origin of last Atom */
68static const uschar *starttok;
69static const uschar *basestr; /* starts with original, replaced during
70 repetition processing */
71static const uschar *firstbasestr;
72
73static int setcnt;
74static int poscnt;
75
76const char *patbeg;
77int patlen;
78
79#define NFA 128 /* cache this many dynamic fa's */
80fa *fatab[NFA];
81int nfatab = 0; /* entries in fatab */
82
83extern int u8_nextlen(const char *s);
84
85
86/* utf-8 mechanism:
87
88 For most of Awk, utf-8 strings just "work", since they look like
89 null-terminated sequences of 8-bit bytes.
90
91 Functions like length(), index(), and substr() have to operate
92 in units of utf-8 characters. The u8_* functions in run.c
93 handle this.
94
95 Regular expressions are more complicated, since the basic
96 mechanism of the goto table used 8-bit byte indices into the
97 gototab entries to compute the next state. Unicode is a lot
98 bigger, so the gototab entries are now structs with a character
99 and a next state. These are sorted by code point and binary
100 searched.
101
102 Throughout the RE mechanism in b.c, utf-8 characters are
103 converted to their utf-32 value. This mostly shows up in
104 cclenter, which expands character class ranges like a-z and now
105 alpha-omega. The size of a gototab array is still about 256.
106 This should be dynamic, but for now things work ok for a single
107 code page of Unicode, which is the most likely case.
108
109 The code changes are localized in run.c and b.c. I have added a
110 handful of functions to somewhat better hide the implementation,
111 but a lot more could be done.
112
113 */
114
115static int entry_cmp(const void *l, const void *r);
116static int get_gototab(fa*, int, int);
117static int set_gototab(fa*, int, int, int);
118static void clear_gototab(fa*, int);
119extern int u8_rune(int *, const char *);
120
121static int *
122intalloc(size_t n, const char *f)
123{
124 int *p = (int *) calloc(n, sizeof(int));
125 if (p == NULL)
126 overflo(f);
127 return p;
128}
129
130static void
131resizesetvec(const char *f)
132{
133 if (maxsetvec == 0)
134 maxsetvec = MAXLIN;
135 else
136 maxsetvec *= 4;
137 setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
138 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
139 if (setvec == NULL || tmpset == NULL)
140 overflo(f);
141}
142
143static void
144resize_state(fa *f, int state)
145{
146 gtt *p;
147 uschar *p2;
148 int **p3;
149 int i, new_count;
150
151 if (++state < f->state_count)
152 return;
153
154 new_count = state + 10; /* needs to be tuned */
155
156 p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt));
157 if (p == NULL)
158 goto out;
159 f->gototab = p;
160
161 p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
162 if (p2 == NULL)
163 goto out;
164 f->out = p2;
165
166 p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
167 if (p3 == NULL)
168 goto out;
169 f->posns = p3;
170
171 for (i = f->state_count; i < new_count; ++i) {
172 memset(f->gototab[i].direct, 0, sizeof(f->gototab[i].direct));
173 f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
174 if (f->gototab[i].entries == NULL)
175 goto out;
176 f->gototab[i].allocated = NCHARS;
177 f->gototab[i].inuse = 0;
178 f->out[i] = 0;
179 f->posns[i] = NULL;
180 }
181 f->state_count = new_count;
182 return;
183out:
184 overflo(__func__);
185}
186
187fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
188{
189 int i, use, nuse;
190 fa *pfa;
191 static int now = 1;
192
193 if (setvec == NULL) { /* first time through any RE */
194 resizesetvec(__func__);
195 }
196
197 if (compile_time != RUNNING) /* a constant for sure */
198 return mkdfa(s, anchor);
199 for (i = 0; i < nfatab; i++) /* is it there already? */
200 if (fatab[i]->anchor == anchor
201 && strcmp((const char *) fatab[i]->restr, s) == 0) {
202 fatab[i]->use = now++;
203 return fatab[i];
204 }
205 pfa = mkdfa(s, anchor);
206 if (nfatab < NFA) { /* room for another */
207 fatab[nfatab] = pfa;
208 fatab[nfatab]->use = now++;
209 nfatab++;
210 return pfa;
211 }
212 use = fatab[0]->use; /* replace least-recently used */
213 nuse = 0;
214 for (i = 1; i < nfatab; i++)
215 if (fatab[i]->use < use) {
216 use = fatab[i]->use;
217 nuse = i;
218 }
219 freefa(fatab[nuse]);
220 fatab[nuse] = pfa;
221 pfa->use = now++;
222 return pfa;
223}
224
225fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */
226 /* anchor = true for anchored matches, else false */
227{
228 Node *p, *p1;
229 fa *f;
230
231 firstbasestr = (const uschar *) s;
232 basestr = firstbasestr;
233 p = reparse(s);
234 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
235 /* put ALL STAR in front of reg. exp. */
236 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
237 /* put FINAL after reg. exp. */
238
239 poscnt = 0;
240 penter(p1); /* enter parent pointers and leaf indices */
241 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
242 overflo(__func__);
243 f->accept = poscnt-1; /* penter has computed number of positions in re */
244 cfoll(f, p1); /* set up follow sets */
245 freetr(p1);
246 resize_state(f, 1);
247 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
248 f->posns[1] = intalloc(1, __func__);
249 *f->posns[1] = 0;
250 f->initstat = makeinit(f, anchor);
251 f->anchor = anchor;
252 f->restr = (uschar *) tostring(s);
253 if (firstbasestr != basestr) {
254 if (basestr)
255 xfree(basestr);
256 }
257 return f;
258}
259
260int makeinit(fa *f, bool anchor)
261{
262 int i, k;
263
264 f->curstat = 2;
265 f->out[2] = 0;
266 k = *(f->re[0].lfollow);
267 xfree(f->posns[2]);
268 f->posns[2] = intalloc(k + 1, __func__);
269 for (i = 0; i <= k; i++) {
270 (f->posns[2])[i] = (f->re[0].lfollow)[i];
271 }
272 if ((f->posns[2])[1] == f->accept)
273 f->out[2] = 1;
274 clear_gototab(f, 2);
275 f->curstat = cgoto(f, 2, HAT);
276 if (anchor) {
277 *f->posns[2] = k-1; /* leave out position 0 */
278 for (i = 0; i < k; i++) {
279 (f->posns[0])[i] = (f->posns[2])[i];
280 }
281
282 f->out[0] = f->out[2];
283 if (f->curstat != 2)
284 --(*f->posns[f->curstat]);
285 }
286 return f->curstat;
287}
288
289void penter(Node *p) /* set up parent pointers and leaf indices */
290{
291 switch (type(p)) {
292 ELEAF
293 LEAF
294 info(p) = poscnt;
295 poscnt++;
296 break;
297 UNARY
298 penter(left(p));
299 parent(left(p)) = p;
300 break;
301 case CAT:
302 case OR:
303 penter(left(p));
304 penter(right(p));
305 parent(left(p)) = p;
306 parent(right(p)) = p;
307 break;
308 case ZERO:
309 break;
310 default: /* can't happen */
311 FATAL("can't happen: unknown type %d in penter", type(p));
312 break;
313 }
314}
315
316void freetr(Node *p) /* free parse tree */
317{
318 switch (type(p)) {
319 ELEAF
320 LEAF
321 xfree(p);
322 break;
323 UNARY
324 case ZERO:
325 freetr(left(p));
326 xfree(p);
327 break;
328 case CAT:
329 case OR:
330 freetr(left(p));
331 freetr(right(p));
332 xfree(p);
333 break;
334 default: /* can't happen */
335 FATAL("can't happen: unknown type %d in freetr", type(p));
336 break;
337 }
338}
339
340/* in the parsing of regular expressions, metacharacters like . have */
341/* to be seen literally; \056 is not a metacharacter. */
342
343int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */
344{ /* only pick up one 8-bit byte (2 chars) */
345 const uschar *p;
346 int n = 0;
347 int i;
348
349 for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
350 if (isdigit((int) *p))
351 n = 16 * n + *p - '0';
352 else if (*p >= 'a' && *p <= 'f')
353 n = 16 * n + *p - 'a' + 10;
354 else if (*p >= 'A' && *p <= 'F')
355 n = 16 * n + *p - 'A' + 10;
356 }
357 *pp = p;
358 return n;
359}
360
361
362
363#define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
364
365int quoted(const uschar **pp) /* pick up next thing after a \\ */
366 /* and increment *pp */
367{
368 const uschar *p = *pp;
369 int c;
370
371/* BUG: should advance by utf-8 char even if makes no sense */
372
373 switch ((c = *p++)) {
374 case 't':
375 c = '\t';
376 break;
377 case 'n':
378 c = '\n';
379 break;
380 case 'f':
381 c = '\f';
382 break;
383 case 'r':
384 c = '\r';
385 break;
386 case 'b':
387 c = '\b';
388 break;
389 case 'v':
390 c = '\v';
391 break;
392 case 'a':
393 c = '\a';
394 break;
395 case '\\':
396 c = '\\';
397 break;
398 case 'x': /* 2 hex digits follow */
399 c = hexstr(&p, 2); /* this adds a null if number is invalid */
400 break;
401 case 'u': /* unicode char number up to 8 hex digits */
402 c = hexstr(&p, 8);
403 break;
404 default:
405 if (isoctdigit(c)) { /* \d \dd \ddd */
406 int n = c - '0';
407 if (isoctdigit(*p)) {
408 n = 8 * n + *p++ - '0';
409 if (isoctdigit(*p))
410 n = 8 * n + *p++ - '0';
411 }
412 c = n;
413 }
414 }
415
416 *pp = p;
417 return c;
418}
419
420int *cclenter(const char *argp) /* add a character class */
421{
422 int i, c, c2;
423 int n;
424 const uschar *p = (const uschar *) argp;
425 int *bp, *retp;
426 static int *buf = NULL;
427 static int bufsz = 100;
428
429 if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
430 FATAL("out of space for character class [%.10s...] 1", p);
431 bp = buf;
432 for (i = 0; *p != 0; ) {
433 n = u8_rune(&c, (const char *) p);
434 p += n;
435 if (c == '\\') {
436 c = quoted(&p);
437 } else if (c == '-' && i > 0 && bp[-1] != 0) {
438 if (*p != 0) {
439 c = bp[-1];
440 /* c2 = *p++; */
441 n = u8_rune(&c2, (const char *) p);
442 p += n;
443 if (c2 == '\\')
444 c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
445 if (c > c2) { /* empty; ignore */
446 bp--;
447 i--;
448 continue;
449 }
450 while (c < c2) {
451 if (i >= bufsz) {
452 bufsz *= 2;
453 buf = (int *) realloc(buf, bufsz * sizeof(int));
454 if (buf == NULL)
455 FATAL("out of space for character class [%.10s...] 2", p);
456 bp = buf + i;
457 }
458 *bp++ = ++c;
459 i++;
460 }
461 continue;
462 }
463 }
464 if (i >= bufsz) {
465 bufsz *= 2;
466 buf = (int *) realloc(buf, bufsz * sizeof(int));
467 if (buf == NULL)
468 FATAL("out of space for character class [%.10s...] 2", p);
469 bp = buf + i;
470 }
471 *bp++ = c;
472 i++;
473 }
474 *bp = 0;
475 /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
476 /* xfree(op); BUG: what are we freeing here? */
477 retp = (int *) calloc(bp-buf+1, sizeof(int));
478 for (i = 0; i < bp-buf+1; i++)
479 retp[i] = buf[i];
480 return retp;
481}
482
483void overflo(const char *s)
484{
485 FATAL("regular expression too big: out of space in %.30s...", s);
486}
487
488void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
489{
490 int i;
491 int *p;
492
493 switch (type(v)) {
494 ELEAF
495 LEAF
496 f->re[info(v)].ltype = type(v);
497 f->re[info(v)].lval.np = right(v);
498 while (f->accept >= maxsetvec) { /* guessing here! */
499 resizesetvec(__func__);
500 }
501 for (i = 0; i <= f->accept; i++)
502 setvec[i] = 0;
503 setcnt = 0;
504 follow(v); /* computes setvec and setcnt */
505 p = intalloc(setcnt + 1, __func__);
506 f->re[info(v)].lfollow = p;
507 *p = setcnt;
508 for (i = f->accept; i >= 0; i--)
509 if (setvec[i] == 1)
510 *++p = i;
511 break;
512 UNARY
513 cfoll(f,left(v));
514 break;
515 case CAT:
516 case OR:
517 cfoll(f,left(v));
518 cfoll(f,right(v));
519 break;
520 case ZERO:
521 break;
522 default: /* can't happen */
523 FATAL("can't happen: unknown type %d in cfoll", type(v));
524 }
525}
526
527int first(Node *p) /* collects initially active leaves of p into setvec */
528 /* returns 0 if p matches empty string */
529{
530 int b, lp;
531
532 switch (type(p)) {
533 ELEAF
534 LEAF
535 lp = info(p); /* look for high-water mark of subscripts */
536 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
537 resizesetvec(__func__);
538 }
539 if (type(p) == EMPTYRE) {
540 setvec[lp] = 0;
541 return(0);
542 }
543 if (setvec[lp] != 1) {
544 setvec[lp] = 1;
545 setcnt++;
546 }
547 if (type(p) == CCL && (*(int *) right(p)) == 0)
548 return(0); /* empty CCL */
549 return(1);
550 case PLUS:
551 if (first(left(p)) == 0)
552 return(0);
553 return(1);
554 case STAR:
555 case QUEST:
556 first(left(p));
557 return(0);
558 case CAT:
559 if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
560 return(1);
561 case OR:
562 b = first(right(p));
563 if (first(left(p)) == 0 || b == 0) return(0);
564 return(1);
565 case ZERO:
566 return 0;
567 }
568 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
569 return(-1);
570}
571
572void follow(Node *v) /* collects leaves that can follow v into setvec */
573{
574 Node *p;
575
576 if (type(v) == FINAL)
577 return;
578 p = parent(v);
579 switch (type(p)) {
580 case STAR:
581 case PLUS:
582 first(v);
583 follow(p);
584 return;
585
586 case OR:
587 case QUEST:
588 follow(p);
589 return;
590
591 case CAT:
592 if (v == left(p)) { /* v is left child of p */
593 if (first(right(p)) == 0) {
594 follow(p);
595 return;
596 }
597 } else /* v is right child */
598 follow(p);
599 return;
600 }
601}
602
603int member(int c, int *sarg) /* is c in s? */
604{
605 int *s = (int *) sarg;
606
607 while (*s)
608 if (c == *s++)
609 return(1);
610 return(0);
611}
612
613static void resize_gototab(fa *f, int state)
614{
615 size_t new_size = f->gototab[state].allocated * 2;
616 gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
617 if (p == NULL)
618 overflo(__func__);
619
620 // need to initialize the new memory to zero
621 size_t orig_size = f->gototab[state].allocated; // 2nd half of new mem is this size
622 memset(p + orig_size, 0, orig_size * sizeof(gtte)); // clean it out
623
624 f->gototab[state].allocated = new_size; // update gototab info
625 f->gototab[state].entries = p;
626}
627
628static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
629{
630 gtte key;
631 gtte *item;
632
633 if ((unsigned)ch < GOTO_DIRECT)
634 return f->gototab[state].direct[ch];
635
636 key.ch = ch;
637 key.state = 0; /* irrelevant */
638 item = (gtte *) bsearch(& key, f->gototab[state].entries,
639 f->gototab[state].inuse, sizeof(gtte),
640 entry_cmp);
641
642 if (item == NULL)
643 return 0;
644 else
645 return item->state;
646}
647
648static int entry_cmp(const void *l, const void *r)
649{
650 const gtte *left, *right;
651
652 left = (const gtte *) l;
653 right = (const gtte *) r;
654
655 return left->ch - right->ch;
656}
657
658static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
659{
660 if ((unsigned)ch < GOTO_DIRECT) {
661 f->gototab[state].direct[ch] = val;
662 return val;
663 }
664
665 if (f->gototab[state].inuse == 0) {
666 f->gototab[state].entries[0].ch = ch;
667 f->gototab[state].entries[0].state = val;
668 f->gototab[state].inuse++;
669 return val;
670 } else if ((unsigned)ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
671 // not seen yet, insert and return
672 gtt *tab = & f->gototab[state];
673 if (tab->inuse + 1 >= tab->allocated)
674 resize_gototab(f, state);
675
676 f->gototab[state].entries[f->gototab[state].inuse].ch = ch;
677 f->gototab[state].entries[f->gototab[state].inuse].state = val;
678 f->gototab[state].inuse++;
679 return val;
680 } else {
681 // maybe we have it, maybe we don't
682 gtte key;
683 gtte *item;
684
685 key.ch = ch;
686 key.state = 0; /* irrelevant */
687 item = (gtte *) bsearch(& key, f->gototab[state].entries,
688 f->gototab[state].inuse, sizeof(gtte),
689 entry_cmp);
690
691 if (item != NULL) {
692 // we have it, update state and return
693 item->state = val;
694 return item->state;
695 }
696 // otherwise, fall through to insert and reallocate.
697 }
698
699 gtt *tab = & f->gototab[state];
700 if (tab->inuse + 1 >= tab->allocated)
701 resize_gototab(f, state);
702 f->gototab[state].entries[tab->inuse].ch = ch;
703 f->gototab[state].entries[tab->inuse].state = val;
704 ++tab->inuse;
705
706 qsort(f->gototab[state].entries,
707 f->gototab[state].inuse, sizeof(gtte), entry_cmp);
708
709 return val; /* not used anywhere at the moment */
710}
711
712static void clear_gototab(fa *f, int state)
713{
714 memset(f->gototab[state].direct, 0, sizeof(f->gototab[state].direct));
715 memset(f->gototab[state].entries, 0,
716 f->gototab[state].allocated * sizeof(gtte));
717 f->gototab[state].inuse = 0;
718}
719
720int match(fa *f, const char *p0) /* shortest match ? */
721{
722 int s, ns;
723 int n;
724 int rune;
725 const uschar *p = (const uschar *) p0;
726
727 /* return pmatch(f, p0); does it matter whether longest or shortest? */
728
729 s = f->initstat;
730 assert (s < f->state_count);
731
732 if (f->out[s])
733 return(1);
734 do {
735 /* assert(*p < NCHARS); */
736 n = u8_rune(&rune, (const char *) p);
737 if ((ns = get_gototab(f, s, rune)) != 0)
738 s = ns;
739 else
740 s = cgoto(f, s, rune);
741 if (f->out[s])
742 return(1);
743 if (*p == 0)
744 break;
745 p += n;
746 } while (1); /* was *p++ != 0 */
747 return(0);
748}
749
750int pmatch(fa *f, const char *p0) /* longest match, for sub */
751{
752 int s, ns;
753 int n;
754 int rune;
755 const uschar *p = (const uschar *) p0;
756 const uschar *q;
757
758 s = f->initstat;
759 assert(s < f->state_count);
760
761 patbeg = (const char *)p;
762 patlen = -1;
763 do {
764 q = p;
765 do {
766 if (f->out[s]) /* final state */
767 patlen = q-p;
768 /* assert(*q < NCHARS); */
769 n = u8_rune(&rune, (const char *) q);
770 if ((ns = get_gototab(f, s, rune)) != 0)
771 s = ns;
772 else
773 s = cgoto(f, s, rune);
774
775 assert(s < f->state_count);
776
777 if (s == 1) { /* no transition */
778 if (patlen >= 0) {
779 patbeg = (const char *) p;
780 return(1);
781 }
782 else
783 goto nextin; /* no match */
784 }
785 if (*q == 0)
786 break;
787 q += n;
788 } while (1);
789 q++; /* was *q++ */
790 if (f->out[s])
791 patlen = q-p-1; /* don't count $ */
792 if (patlen >= 0) {
793 patbeg = (const char *) p;
794 return(1);
795 }
796 nextin:
797 s = 2;
798 if (*p == 0)
799 break;
800 n = u8_rune(&rune, (const char *) p);
801 p += n;
802 } while (1); /* was *p++ */
803 return (0);
804}
805
806int nematch(fa *f, const char *p0) /* non-empty match, for sub */
807{
808 int s, ns;
809 int n;
810 int rune;
811 const uschar *p = (const uschar *) p0;
812 const uschar *q;
813
814 s = f->initstat;
815 assert(s < f->state_count);
816
817 patbeg = (const char *)p;
818 patlen = -1;
819 while (*p) {
820 q = p;
821 do {
822 if (f->out[s]) /* final state */
823 patlen = q-p;
824 /* assert(*q < NCHARS); */
825 n = u8_rune(&rune, (const char *) q);
826 if ((ns = get_gototab(f, s, rune)) != 0)
827 s = ns;
828 else
829 s = cgoto(f, s, rune);
830 if (s == 1) { /* no transition */
831 if (patlen > 0) {
832 patbeg = (const char *) p;
833 return(1);
834 } else
835 goto nnextin; /* no nonempty match */
836 }
837 if (*q == 0)
838 break;
839 q += n;
840 } while (1);
841 q++;
842 if (f->out[s])
843 patlen = q-p-1; /* don't count $ */
844 if (patlen > 0 ) {
845 patbeg = (const char *) p;
846 return(1);
847 }
848 nnextin:
849 s = 2;
850 p++;
851 }
852 return (0);
853}
854
855
856/*
857 * NAME
858 * fnematch
859 *
860 * DESCRIPTION
861 * A stream-fed version of nematch which transfers characters to a
862 * null-terminated buffer. All characters up to and including the last
863 * character of the matching text or EOF are placed in the buffer. If
864 * a match is found, patbeg and patlen are set appropriately.
865 *
866 * RETURN VALUES
867 * false No match found.
868 * true Match found.
869 */
870
871bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
872{
873 char *i, *j, *k, *buf = *pbuf;
874 int bufsize = *pbufsize;
875 int c, n, ns, s;
876
877 s = pfa->initstat;
878 patlen = 0;
879
880 /*
881 * buf <= i <= j <= k <= buf+bufsize
882 *
883 * i: origin of active substring
884 * j: current character
885 * k: destination of the next getc
886 */
887
888 i = j = k = buf;
889
890 do {
891 /*
892 * Call u8_rune with at least awk_mb_cur_max ahead in
893 * the buffer until EOF interferes.
894 */
895 if (k - j < (int)awk_mb_cur_max) {
896 if (k + awk_mb_cur_max > buf + bufsize) {
897 char *obuf = buf;
898 adjbuf((char **) &buf, &bufsize,
899 bufsize + awk_mb_cur_max,
900 quantum, 0, "fnematch");
901
902 /* buf resized, maybe moved. update pointers */
903 *pbufsize = bufsize;
904 if (obuf != buf) {
905 i = buf + (i - obuf);
906 j = buf + (j - obuf);
907 k = buf + (k - obuf);
908 *pbuf = buf;
909 if (patlen)
910 patbeg = buf + (patbeg - obuf);
911 }
912 }
913 for (n = awk_mb_cur_max ; n > 0; n--) {
914 *k++ = (c = getc(f)) != EOF ? c : 0;
915 if (c == EOF) {
916 if (ferror(f))
917 FATAL("fnematch: getc error");
918 break;
919 }
920 }
921 }
922
923 j += u8_rune(&c, j);
924
925 if ((ns = get_gototab(pfa, s, c)) != 0)
926 s = ns;
927 else
928 s = cgoto(pfa, s, c);
929
930 if (pfa->out[s]) { /* final state */
931 patbeg = i;
932 patlen = j - i;
933 if (c == 0) /* don't count $ */
934 patlen--;
935 }
936
937 if (c && s != 1)
938 continue; /* origin i still viable, next j */
939 if (patlen)
940 break; /* best match found */
941
942 /* no match at origin i, next i and start over */
943 i += u8_rune(&c, i);
944 if (c == 0)
945 break; /* no match */
946 j = i;
947 s = 2;
948 } while (1);
949
950 if (patlen) {
951 /*
952 * Under no circumstances is the last character fed to
953 * the automaton part of the match. It is EOF's nullbyte,
954 * or it sent the automaton into a state with no further
955 * transitions available (s==1), or both. Room for a
956 * terminating nullbyte is guaranteed.
957 *
958 * ungetc any chars after the end of matching text
959 * (except for EOF's nullbyte, if present) and null
960 * terminate the buffer.
961 */
962 do
963 if (*--k && ungetc(*k, f) == EOF)
964 FATAL("unable to ungetc '%c'", *k);
965 while (k > patbeg + patlen);
966 *k = '\0';
967 return true;
968 }
969 else
970 return false;
971}
972
973Node *reparse(const char *p) /* parses regular expression pointed to by p */
974{ /* uses relex() to scan regular expression */
975 Node *np;
976
977 DPRINTF("reparse <%s>\n", p);
978 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
979 rtok = relex();
980 /* GNU compatibility: an empty regexp matches anything */
981 if (rtok == '\0') {
982 /* FATAL("empty regular expression"); previous */
983 return(op2(EMPTYRE, NIL, NIL));
984 }
985 np = regexp();
986 if (rtok != '\0')
987 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
988 return(np);
989}
990
991Node *regexp(void) /* top-level parse of reg expr */
992{
993 return (alt(concat(primary())));
994}
995
996Node *primary(void)
997{
998 Node *np;
999 int savelastatom;
1000
1001 switch (rtok) {
1002 case CHAR:
1003 lastatom = starttok;
1004 np = op2(CHAR, NIL, itonp(rlxval));
1005 rtok = relex();
1006 return (unary(np));
1007 case ALL:
1008 rtok = relex();
1009 return (unary(op2(ALL, NIL, NIL)));
1010 case EMPTYRE:
1011 rtok = relex();
1012 return (unary(op2(EMPTYRE, NIL, NIL)));
1013 case DOT:
1014 lastatom = starttok;
1015 rtok = relex();
1016 return (unary(op2(DOT, NIL, NIL)));
1017 case CCL:
1018 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
1019 lastatom = starttok;
1020 rtok = relex();
1021 return (unary(np));
1022 case NCCL:
1023 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1024 lastatom = starttok;
1025 rtok = relex();
1026 return (unary(np));
1027 case '^':
1028 rtok = relex();
1029 return (unary(op2(CHAR, NIL, itonp(HAT))));
1030 case '$':
1031 rtok = relex();
1032 return (unary(op2(CHAR, NIL, NIL)));
1033 case '(':
1034 lastatom = starttok;
1035 savelastatom = starttok - basestr; /* Retain over recursion */
1036 rtok = relex();
1037 if (rtok == ')') { /* special pleading for () */
1038 rtok = relex();
1039 return unary(op2(CCL, NIL, (Node *) cclenter("")));
1040 }
1041 np = regexp();
1042 if (rtok == ')') {
1043 lastatom = basestr + savelastatom; /* Restore */
1044 rtok = relex();
1045 return (unary(np));
1046 }
1047 else {
1048 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1049 break;
1050 }
1051 default:
1052 FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1053 }
1054 return 0; /*NOTREACHED*/
1055}
1056
1057Node *concat(Node *np)
1058{
1059 switch (rtok) {
1060 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1061 return (concat(op2(CAT, np, primary())));
1062 case EMPTYRE:
1063 rtok = relex();
1064 return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1065 primary())));
1066 }
1067 return (np);
1068}
1069
1070Node *alt(Node *np)
1071{
1072 if (rtok == OR) {
1073 rtok = relex();
1074 return (alt(op2(OR, np, concat(primary()))));
1075 }
1076 return (np);
1077}
1078
1079Node *unary(Node *np)
1080{
1081 switch (rtok) {
1082 case STAR:
1083 rtok = relex();
1084 return (unary(op2(STAR, np, NIL)));
1085 case PLUS:
1086 rtok = relex();
1087 return (unary(op2(PLUS, np, NIL)));
1088 case QUEST:
1089 rtok = relex();
1090 return (unary(op2(QUEST, np, NIL)));
1091 case ZERO:
1092 rtok = relex();
1093 return (unary(op2(ZERO, np, NIL)));
1094 default:
1095 return (np);
1096 }
1097}
1098
1099/*
1100 * Character class definitions conformant to the POSIX locale as
1101 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1102 * and operating character sets are both ASCII (ISO646) or supersets
1103 * thereof.
1104 *
1105 * Note that to avoid overflowing the temporary buffer used in
1106 * relex(), the expanded character class (prior to range expansion)
1107 * must be less than twice the size of their full name.
1108 */
1109
1110/* Because isblank doesn't show up in any of the header files on any
1111 * system i use, it's defined here. if some other locale has a richer
1112 * definition of "blank", define HAS_ISBLANK and provide your own
1113 * version.
1114 * the parentheses here are an attempt to find a path through the maze
1115 * of macro definition and/or function and/or version provided. thanks
1116 * to nelson beebe for the suggestion; let's see if it works everywhere.
1117 */
1118
1119/* #define HAS_ISBLANK */
1120#ifndef HAS_ISBLANK
1121
1122int (xisblank)(int c)
1123{
1124 return c==' ' || c=='\t';
1125}
1126
1127#endif
1128
1129static const struct charclass {
1130 const char *cc_name;
1131 int cc_namelen;
1132 int (*cc_func)(int);
1133} charclasses[] = {
1134 { "alnum", 5, isalnum },
1135 { "alpha", 5, isalpha },
1136#ifndef HAS_ISBLANK
1137 { "blank", 5, xisblank },
1138#else
1139 { "blank", 5, isblank },
1140#endif
1141 { "cntrl", 5, iscntrl },
1142 { "digit", 5, isdigit },
1143 { "graph", 5, isgraph },
1144 { "lower", 5, islower },
1145 { "print", 5, isprint },
1146 { "punct", 5, ispunct },
1147 { "space", 5, isspace },
1148 { "upper", 5, isupper },
1149 { "xdigit", 6, isxdigit },
1150 { NULL, 0, NULL },
1151};
1152
1153#define REPEAT_SIMPLE 0
1154#define REPEAT_PLUS_APPENDED 1
1155#define REPEAT_WITH_Q 2
1156#define REPEAT_ZERO 3
1157
1158static int
1159replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1160 int atomlen, int firstnum, int secondnum, int special_case)
1161{
1162 int i, j;
1163 uschar *buf = 0;
1164 int ret = 1;
1165 int init_q = (firstnum == 0); /* first added char will be ? */
1166 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
1167 int prefix_length = reptok - basestr; /* prefix includes first rep */
1168 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */
1169 int size = prefix_length + suffix_length;
1170
1171 if (firstnum > 1) { /* add room for reps 2 through firstnum */
1172 size += atomlen*(firstnum-1);
1173 }
1174
1175 /* Adjust size of buffer for special cases */
1176 if (special_case == REPEAT_PLUS_APPENDED) {
1177 size++; /* for the final + */
1178 } else if (special_case == REPEAT_WITH_Q) {
1179 size += init_q + (atomlen+1)* (n_q_reps-init_q);
1180 } else if (special_case == REPEAT_ZERO) {
1181 size += 2; /* just a null ERE: () */
1182 }
1183 if ((buf = (uschar *) malloc(size + 1)) == NULL)
1184 FATAL("out of space in reg expr %.10s..", lastre);
1185 memcpy(buf, basestr, prefix_length); /* copy prefix */
1186 j = prefix_length;
1187 if (special_case == REPEAT_ZERO) {
1188 j -= atomlen;
1189 buf[j++] = '(';
1190 buf[j++] = ')';
1191 }
1192 for (i = 1; i < firstnum; i++) { /* copy x reps */
1193 memcpy(&buf[j], atom, atomlen);
1194 j += atomlen;
1195 }
1196 if (special_case == REPEAT_PLUS_APPENDED) {
1197 buf[j++] = '+';
1198 } else if (special_case == REPEAT_WITH_Q) {
1199 if (init_q)
1200 buf[j++] = '?';
1201 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */
1202 memcpy(&buf[j], atom, atomlen);
1203 j += atomlen;
1204 buf[j++] = '?';
1205 }
1206 }
1207 memcpy(&buf[j], reptok+reptoklen, suffix_length);
1208 j += suffix_length;
1209 buf[j] = '\0';
1210 /* free old basestr */
1211 if (firstbasestr != basestr) {
1212 if (basestr)
1213 xfree(basestr);
1214 }
1215 basestr = buf;
1216 prestr = buf + prefix_length;
1217 if (special_case == REPEAT_ZERO) {
1218 prestr -= atomlen;
1219 ret++;
1220 }
1221 return ret;
1222}
1223
1224static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1225 int atomlen, int firstnum, int secondnum)
1226{
1227 if (atom == NULL)
1228 return 0;
1229
1230 /*
1231 In general, the repetition specifier or "bound" is replaced here
1232 by an equivalent ERE string, repeating the immediately previous atom
1233 and appending ? and + as needed. Note that the first copy of the
1234 atom is left in place, except in the special_case of a zero-repeat
1235 (i.e., {0}).
1236 */
1237 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
1238 if (firstnum < 2) {
1239 /* 0 or 1: should be handled before you get here */
1240 FATAL("internal error");
1241 } else {
1242 return replace_repeat(reptok, reptoklen, atom, atomlen,
1243 firstnum, secondnum, REPEAT_PLUS_APPENDED);
1244 }
1245 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
1246 if (firstnum == 0) { /* {0} or {0,0} */
1247 /* This case is unusual because the resulting
1248 replacement string might actually be SMALLER than
1249 the original ERE */
1250 return replace_repeat(reptok, reptoklen, atom, atomlen,
1251 firstnum, secondnum, REPEAT_ZERO);
1252 } else { /* (firstnum >= 1) */
1253 return replace_repeat(reptok, reptoklen, atom, atomlen,
1254 firstnum, secondnum, REPEAT_SIMPLE);
1255 }
1256 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
1257 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
1258 return replace_repeat(reptok, reptoklen, atom, atomlen,
1259 firstnum, secondnum, REPEAT_WITH_Q);
1260 } else { /* Error - shouldn't be here (n>m) */
1261 FATAL("internal error");
1262 }
1263 return 0;
1264}
1265
1266int relex(void) /* lexical analyzer for reparse */
1267{
1268 int c, n;
1269 int cflag;
1270 static uschar *buf = NULL;
1271 static int bufsz = 100;
1272 uschar *bp;
1273 const struct charclass *cc;
1274 int i;
1275 int num, m;
1276 bool commafound, digitfound;
1277 const uschar *startreptok;
1278 static int parens = 0;
1279
1280rescan:
1281 starttok = prestr;
1282
1283 if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1284 prestr += n;
1285 starttok = prestr;
1286 return CHAR;
1287 }
1288
1289 switch (c = *prestr++) {
1290 case '|': return OR;
1291 case '*': return STAR;
1292 case '+': return PLUS;
1293 case '?': return QUEST;
1294 case '.': return DOT;
1295 case '\0': prestr--; return '\0';
1296 case '^':
1297 case '$':
1298 return c;
1299 case '(':
1300 parens++;
1301 return c;
1302 case ')':
1303 if (parens) {
1304 parens--;
1305 return c;
1306 }
1307 /* unmatched close parenthesis; per POSIX, treat as literal */
1308 rlxval = c;
1309 return CHAR;
1310 case '\\':
1311 rlxval = quoted(&prestr);
1312 return CHAR;
1313 default:
1314 rlxval = c;
1315 return CHAR;
1316 case '[':
1317 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1318 FATAL("out of space in reg expr %.10s..", lastre);
1319 bp = buf;
1320 if (*prestr == '^') {
1321 cflag = 1;
1322 prestr++;
1323 }
1324 else
1325 cflag = 0;
1326 n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */
1327 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1328 FATAL("out of space for reg expr %.10s...", lastre);
1329 for (; ; ) {
1330 if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1331 for (i = 0; i < n; i++)
1332 *bp++ = *prestr++;
1333 continue;
1334 }
1335 if ((c = *prestr++) == '\\') {
1336 *bp++ = '\\';
1337 if ((c = *prestr++) == '\0')
1338 FATAL("nonterminated character class %.20s...", lastre);
1339 *bp++ = c;
1340 /* } else if (c == '\n') { */
1341 /* FATAL("newline in character class %.20s...", lastre); */
1342 } else if (c == '[' && *prestr == ':') {
1343 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1344 for (cc = charclasses; cc->cc_name; cc++)
1345 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1346 break;
1347 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1348 prestr[2 + cc->cc_namelen] == ']') {
1349 prestr += cc->cc_namelen + 3;
1350 /*
1351 * BUG: We begin at 1, instead of 0, since we
1352 * would otherwise prematurely terminate the
1353 * string for classes like [[:cntrl:]]. This
1354 * means that we can't match the NUL character,
1355 * not without first adapting the entire
1356 * program to track each string's length.
1357 */
1358 for (i = 1; i <= UCHAR_MAX; i++) {
1359 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1360 FATAL("out of space for reg expr %.10s...", lastre);
1361 if (cc->cc_func(i)) {
1362 /* escape backslash */
1363 if (i == '\\') {
1364 *bp++ = '\\';
1365 n++;
1366 }
1367
1368 *bp++ = i;
1369 n++;
1370 }
1371 }
1372 } else
1373 *bp++ = c;
1374 } else if (c == '[' && *prestr == '.') {
1375 char collate_char;
1376 prestr++;
1377 collate_char = *prestr++;
1378 if (*prestr == '.' && prestr[1] == ']') {
1379 prestr += 2;
1380 /* Found it: map via locale TBD: for
1381 now, simply return this char. This
1382 is sufficient to pass conformance
1383 test awk.ex 156
1384 */
1385 if (*prestr == ']') {
1386 prestr++;
1387 rlxval = collate_char;
1388 return CHAR;
1389 }
1390 }
1391 } else if (c == '[' && *prestr == '=') {
1392 char equiv_char;
1393 prestr++;
1394 equiv_char = *prestr++;
1395 if (*prestr == '=' && prestr[1] == ']') {
1396 prestr += 2;
1397 /* Found it: map via locale TBD: for now
1398 simply return this char. This is
1399 sufficient to pass conformance test
1400 awk.ex 156
1401 */
1402 if (*prestr == ']') {
1403 prestr++;
1404 rlxval = equiv_char;
1405 return CHAR;
1406 }
1407 }
1408 } else if (c == '\0') {
1409 FATAL("nonterminated character class %.20s", lastre);
1410 } else if (bp == buf) { /* 1st char is special */
1411 *bp++ = c;
1412 } else if (c == ']') {
1413 *bp++ = 0;
1414 rlxstr = (uschar *) tostring((char *) buf);
1415 if (cflag == 0)
1416 return CCL;
1417 else
1418 return NCCL;
1419 } else
1420 *bp++ = c;
1421 }
1422 break;
1423 case '{':
1424 if (isdigit((int) *(prestr))) {
1425 num = 0; /* Process as a repetition */
1426 n = -1; m = -1;
1427 commafound = false;
1428 digitfound = false;
1429 startreptok = prestr-1;
1430 /* Remember start of previous atom here ? */
1431 } else { /* just a { char, not a repetition */
1432 rlxval = c;
1433 return CHAR;
1434 }
1435 for (; ; ) {
1436 if ((c = *prestr++) == '}') {
1437 if (commafound) {
1438 if (digitfound) { /* {n,m} */
1439 m = num;
1440 if (m < n)
1441 FATAL("illegal repetition expression: class %.20s",
1442 lastre);
1443 if (n == 0 && m == 1) {
1444 return QUEST;
1445 }
1446 } else { /* {n,} */
1447 if (n == 0)
1448 return STAR;
1449 else if (n == 1)
1450 return PLUS;
1451 }
1452 } else {
1453 if (digitfound) { /* {n} same as {n,n} */
1454 n = num;
1455 m = num;
1456 } else { /* {} */
1457 FATAL("illegal repetition expression: class %.20s",
1458 lastre);
1459 }
1460 }
1461 if (repeat(starttok, prestr-starttok, lastatom,
1462 startreptok - lastatom, n, m) > 0) {
1463 if (n == 0 && m == 0) {
1464 return ZERO;
1465 }
1466 /* must rescan input for next token */
1467 goto rescan;
1468 }
1469 /* Failed to replace: eat up {...} characters
1470 and treat like just PLUS */
1471 return PLUS;
1472 } else if (c == '\0') {
1473 FATAL("nonterminated character class %.20s",
1474 lastre);
1475 } else if (isdigit(c)) {
1476 num = 10 * num + c - '0';
1477 if (num > 255)
1478 FATAL("repetition count %.20s too large",
1479 lastre);
1480 digitfound = true;
1481 } else if (c == ',') {
1482 if (commafound)
1483 FATAL("illegal repetition expression: class %.20s",
1484 lastre);
1485 /* looking for {n,} or {n,m} */
1486 commafound = true;
1487 n = num;
1488 digitfound = false; /* reset */
1489 num = 0;
1490 } else {
1491 FATAL("illegal repetition expression: class %.20s",
1492 lastre);
1493 }
1494 }
1495 break;
1496 }
1497}
1498
1499int cgoto(fa *f, int s, int c)
1500{
1501 int *p, *q;
1502 int i, j, k;
1503
1504 /* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */
1505 while (f->accept >= maxsetvec) { /* guessing here! */
1506 resizesetvec(__func__);
1507 }
1508 for (i = 0; i <= f->accept; i++)
1509 setvec[i] = 0;
1510 setcnt = 0;
1511 resize_state(f, s);
1512 /* compute positions of gototab[s,c] into setvec */
1513 p = f->posns[s];
1514 for (i = 1; i <= *p; i++) {
1515 if ((k = f->re[p[i]].ltype) != FINAL) {
1516 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1517 || (k == DOT && c != 0 && c != HAT)
1518 || (k == ALL && c != 0)
1519 || (k == EMPTYRE && c != 0)
1520 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1521 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1522 q = f->re[p[i]].lfollow;
1523 for (j = 1; j <= *q; j++) {
1524 if (q[j] >= maxsetvec) {
1525 resizesetvec(__func__);
1526 }
1527 if (setvec[q[j]] == 0) {
1528 setcnt++;
1529 setvec[q[j]] = 1;
1530 }
1531 }
1532 }
1533 }
1534 }
1535 /* determine if setvec is a previous state */
1536 tmpset[0] = setcnt;
1537 j = 1;
1538 for (i = f->accept; i >= 0; i--)
1539 if (setvec[i]) {
1540 tmpset[j++] = i;
1541 }
1542 resize_state(f, f->curstat > s ? f->curstat : s);
1543 /* tmpset == previous state? */
1544 for (i = 1; i <= f->curstat; i++) {
1545 p = f->posns[i];
1546 if ((k = tmpset[0]) != p[0])
1547 goto different;
1548 for (j = 1; j <= k; j++)
1549 if (tmpset[j] != p[j])
1550 goto different;
1551 /* setvec is state i */
1552 if (c != HAT)
1553 set_gototab(f, s, c, i);
1554 return i;
1555 different:;
1556 }
1557
1558 /* add tmpset to current set of states */
1559 ++(f->curstat);
1560 resize_state(f, f->curstat);
1561 clear_gototab(f, f->curstat);
1562 xfree(f->posns[f->curstat]);
1563 p = intalloc(setcnt + 1, __func__);
1564
1565 f->posns[f->curstat] = p;
1566 if (c != HAT)
1567 set_gototab(f, s, c, f->curstat);
1568 for (i = 0; i <= setcnt; i++)
1569 p[i] = tmpset[i];
1570 if (setvec[f->accept])
1571 f->out[f->curstat] = 1;
1572 else
1573 f->out[f->curstat] = 0;
1574 return f->curstat;
1575}
1576
1577
1578void freefa(fa *f) /* free a finite automaton */
1579{
1580 int i;
1581
1582 if (f == NULL)
1583 return;
1584 for (i = 0; i < f->state_count; i++)
1585 xfree(f->gototab[i].entries);
1586 xfree(f->gototab);
1587 for (i = 0; i <= f->curstat; i++)
1588 xfree(f->posns[i]);
1589 for (i = 0; i <= f->accept; i++) {
1590 xfree(f->re[i].lfollow);
1591 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1592 xfree(f->re[i].lval.np);
1593 }
1594 xfree(f->restr);
1595 xfree(f->out);
1596 xfree(f->posns);
1597 xfree(f->gototab);
1598 xfree(f);
1599}