master xplshn/aruu / cmd / posix / awk / b.c
   1/****************************************************************
   2Copyright (C) Lucent Technologies 1997
   3All Rights Reserved
   4
   5Permission to use, copy, modify, and distribute this software and
   6its documentation for any purpose and without fee is hereby
   7granted, provided that the above copyright notice appear in all
   8copies and that both that the copyright notice and this
   9permission notice and warranty disclaimer appear in supporting
  10documentation, and that the name Lucent Technologies or any of
  11its entities not be used in advertising or publicity pertaining
  12to distribution of the software without specific, written prior
  13permission.
  14
  15LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
  16INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
  17IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
  18SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
  20IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
  21ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
  22THIS SOFTWARE.
  23****************************************************************/
  24
  25/* lasciate ogne speranza, voi ch'intrate. */
  26
  27#define	DEBUG
  28
  29#include <ctype.h>
  30#include <limits.h>
  31#include <stdio.h>
  32#include <string.h>
  33#include <stdlib.h>
  34#include "awk.h"
  35#include "awkgram.tab.h"
  36
  37#define MAXLIN 22
  38
  39#define type(v)		(v)->nobj	/* badly overloaded here */
  40#define info(v)		(v)->ntype	/* badly overloaded here */
  41#define left(v)		(v)->narg[0]
  42#define right(v)	(v)->narg[1]
  43#define parent(v)	(v)->nnext
  44
  45#define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
  46#define ELEAF	case EMPTYRE:		/* empty string in regexp */
  47#define UNARY	case STAR: case PLUS: case QUEST:
  48
  49/* encoding in tree Nodes:
  50	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
  51		left is index, right contains value or pointer to value
  52	unary (STAR, PLUS, QUEST): left is child, right is null
  53	binary (CAT, OR): left and right are children
  54	parent contains pointer to parent
  55*/
  56
  57
  58int	*setvec;
  59int	*tmpset;
  60int	maxsetvec = 0;
  61
  62int	rtok;		/* next token in current re */
  63int	rlxval;
  64static const uschar	*rlxstr;
  65static const uschar	*prestr;	/* current position in current re */
  66static const uschar	*lastre;	/* origin of last re */
  67static const uschar	*lastatom;	/* origin of last Atom */
  68static const uschar	*starttok;
  69static const uschar 	*basestr;	/* starts with original, replaced during
  70				   repetition processing */
  71static const uschar 	*firstbasestr;
  72
  73static	int setcnt;
  74static	int poscnt;
  75
  76const char	*patbeg;
  77int	patlen;
  78
  79#define	NFA	128	/* cache this many dynamic fa's */
  80fa	*fatab[NFA];
  81int	nfatab	= 0;	/* entries in fatab */
  82
  83extern int u8_nextlen(const char *s);
  84
  85
  86/* utf-8 mechanism:
  87
  88   For most of Awk, utf-8 strings just "work", since they look like
  89   null-terminated sequences of 8-bit bytes.
  90
  91   Functions like length(), index(), and substr() have to operate
  92   in units of utf-8 characters.  The u8_* functions in run.c
  93   handle this.
  94
  95   Regular expressions are more complicated, since the basic
  96   mechanism of the goto table used 8-bit byte indices into the
  97   gototab entries to compute the next state.  Unicode is a lot
  98   bigger, so the gototab entries are now structs with a character
  99   and a next state. These are sorted by code point and binary
 100   searched.
 101
 102   Throughout the RE mechanism in b.c, utf-8 characters are
 103   converted to their utf-32 value.  This mostly shows up in
 104   cclenter, which expands character class ranges like a-z and now
 105   alpha-omega.  The size of a gototab array is still about 256.
 106   This should be dynamic, but for now things work ok for a single
 107   code page of Unicode, which is the most likely case.
 108
 109   The code changes are localized in run.c and b.c.  I have added a
 110   handful of functions to somewhat better hide the implementation,
 111   but a lot more could be done.
 112
 113 */
 114
 115static int entry_cmp(const void *l, const void *r);
 116static int get_gototab(fa*, int, int);
 117static int set_gototab(fa*, int, int, int);
 118static void clear_gototab(fa*, int);
 119extern int u8_rune(int *, const char *);
 120
 121static int *
 122intalloc(size_t n, const char *f)
 123{
 124	int *p = (int *) calloc(n, sizeof(int));
 125	if (p == NULL)
 126		overflo(f);
 127	return p;
 128}
 129
 130static void
 131resizesetvec(const char *f)
 132{
 133	if (maxsetvec == 0)
 134		maxsetvec = MAXLIN;
 135	else
 136		maxsetvec *= 4;
 137	setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
 138	tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
 139	if (setvec == NULL || tmpset == NULL)
 140		overflo(f);
 141}
 142
 143static void
 144resize_state(fa *f, int state)
 145{
 146	gtt *p;
 147	uschar *p2;
 148	int **p3;
 149	int i, new_count;
 150
 151	if (++state < f->state_count)
 152		return;
 153
 154	new_count = state + 10; /* needs to be tuned */
 155
 156	p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt));
 157	if (p == NULL)
 158		goto out;
 159	f->gototab = p;
 160
 161	p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
 162	if (p2 == NULL)
 163		goto out;
 164	f->out = p2;
 165
 166	p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
 167	if (p3 == NULL)
 168		goto out;
 169	f->posns = p3;
 170
 171	for (i = f->state_count; i < new_count; ++i) {
 172		memset(f->gototab[i].direct, 0, sizeof(f->gototab[i].direct));
 173		f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
 174		if (f->gototab[i].entries == NULL)
 175			goto out;
 176		f->gototab[i].allocated = NCHARS;
 177		f->gototab[i].inuse = 0;
 178		f->out[i] = 0;
 179		f->posns[i] = NULL;
 180	}
 181	f->state_count = new_count;
 182	return;
 183out:
 184	overflo(__func__);
 185}
 186
 187fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
 188{
 189	int i, use, nuse;
 190	fa *pfa;
 191	static int now = 1;
 192
 193	if (setvec == NULL) {	/* first time through any RE */
 194		resizesetvec(__func__);
 195	}
 196
 197	if (compile_time != RUNNING)	/* a constant for sure */
 198		return mkdfa(s, anchor);
 199	for (i = 0; i < nfatab; i++)	/* is it there already? */
 200		if (fatab[i]->anchor == anchor
 201		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
 202			fatab[i]->use = now++;
 203			return fatab[i];
 204		}
 205	pfa = mkdfa(s, anchor);
 206	if (nfatab < NFA) {	/* room for another */
 207		fatab[nfatab] = pfa;
 208		fatab[nfatab]->use = now++;
 209		nfatab++;
 210		return pfa;
 211	}
 212	use = fatab[0]->use;	/* replace least-recently used */
 213	nuse = 0;
 214	for (i = 1; i < nfatab; i++)
 215		if (fatab[i]->use < use) {
 216			use = fatab[i]->use;
 217			nuse = i;
 218		}
 219	freefa(fatab[nuse]);
 220	fatab[nuse] = pfa;
 221	pfa->use = now++;
 222	return pfa;
 223}
 224
 225fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
 226				/* anchor = true for anchored matches, else false */
 227{
 228	Node *p, *p1;
 229	fa *f;
 230
 231	firstbasestr = (const uschar *) s;
 232	basestr = firstbasestr;
 233	p = reparse(s);
 234	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
 235		/* put ALL STAR in front of reg.  exp. */
 236	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
 237		/* put FINAL after reg.  exp. */
 238
 239	poscnt = 0;
 240	penter(p1);	/* enter parent pointers and leaf indices */
 241	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
 242		overflo(__func__);
 243	f->accept = poscnt-1;	/* penter has computed number of positions in re */
 244	cfoll(f, p1);	/* set up follow sets */
 245	freetr(p1);
 246	resize_state(f, 1);
 247	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
 248	f->posns[1] = intalloc(1, __func__);
 249	*f->posns[1] = 0;
 250	f->initstat = makeinit(f, anchor);
 251	f->anchor = anchor;
 252	f->restr = (uschar *) tostring(s);
 253	if (firstbasestr != basestr) {
 254		if (basestr)
 255			xfree(basestr);
 256	}
 257	return f;
 258}
 259
 260int makeinit(fa *f, bool anchor)
 261{
 262	int i, k;
 263
 264	f->curstat = 2;
 265	f->out[2] = 0;
 266	k = *(f->re[0].lfollow);
 267	xfree(f->posns[2]);
 268	f->posns[2] = intalloc(k + 1,  __func__);
 269	for (i = 0; i <= k; i++) {
 270		(f->posns[2])[i] = (f->re[0].lfollow)[i];
 271	}
 272	if ((f->posns[2])[1] == f->accept)
 273		f->out[2] = 1;
 274	clear_gototab(f, 2);
 275	f->curstat = cgoto(f, 2, HAT);
 276	if (anchor) {
 277		*f->posns[2] = k-1;	/* leave out position 0 */
 278		for (i = 0; i < k; i++) {
 279			(f->posns[0])[i] = (f->posns[2])[i];
 280		}
 281
 282		f->out[0] = f->out[2];
 283		if (f->curstat != 2)
 284			--(*f->posns[f->curstat]);
 285	}
 286	return f->curstat;
 287}
 288
 289void penter(Node *p)	/* set up parent pointers and leaf indices */
 290{
 291	switch (type(p)) {
 292	ELEAF
 293	LEAF
 294		info(p) = poscnt;
 295		poscnt++;
 296		break;
 297	UNARY
 298		penter(left(p));
 299		parent(left(p)) = p;
 300		break;
 301	case CAT:
 302	case OR:
 303		penter(left(p));
 304		penter(right(p));
 305		parent(left(p)) = p;
 306		parent(right(p)) = p;
 307		break;
 308	case ZERO:
 309		break;
 310	default:	/* can't happen */
 311		FATAL("can't happen: unknown type %d in penter", type(p));
 312		break;
 313	}
 314}
 315
 316void freetr(Node *p)	/* free parse tree */
 317{
 318	switch (type(p)) {
 319	ELEAF
 320	LEAF
 321		xfree(p);
 322		break;
 323	UNARY
 324	case ZERO:
 325		freetr(left(p));
 326		xfree(p);
 327		break;
 328	case CAT:
 329	case OR:
 330		freetr(left(p));
 331		freetr(right(p));
 332		xfree(p);
 333		break;
 334	default:	/* can't happen */
 335		FATAL("can't happen: unknown type %d in freetr", type(p));
 336		break;
 337	}
 338}
 339
 340/* in the parsing of regular expressions, metacharacters like . have */
 341/* to be seen literally;  \056 is not a metacharacter. */
 342
 343int hexstr(const uschar **pp, int max)	/* find and eval hex string at pp, return new p */
 344{			/* only pick up one 8-bit byte (2 chars) */
 345	const uschar *p;
 346	int n = 0;
 347	int i;
 348
 349	for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
 350		if (isdigit((int) *p))
 351			n = 16 * n + *p - '0';
 352		else if (*p >= 'a' && *p <= 'f')
 353			n = 16 * n + *p - 'a' + 10;
 354		else if (*p >= 'A' && *p <= 'F')
 355			n = 16 * n + *p - 'A' + 10;
 356	}
 357	*pp = p;
 358	return n;
 359}
 360
 361
 362
 363#define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
 364
 365int quoted(const uschar **pp)	/* pick up next thing after a \\ */
 366			/* and increment *pp */
 367{
 368	const uschar *p = *pp;
 369	int c;
 370
 371/* BUG: should advance by utf-8 char even if makes no sense */
 372
 373	switch ((c = *p++)) {
 374	case 't':
 375		c = '\t';
 376		break;
 377	case 'n':
 378		c = '\n';
 379		break;
 380	case 'f':
 381		c = '\f';
 382		break;
 383	case 'r':
 384		c = '\r';
 385		break;
 386	case 'b':
 387		c = '\b';
 388		break;
 389	case 'v':
 390		c = '\v';
 391		break;
 392	case 'a':
 393		c = '\a';
 394		break;
 395	case '\\':
 396		c = '\\';
 397		break;
 398	case 'x': /* 2 hex digits follow */
 399		c = hexstr(&p, 2); /* this adds a null if number is invalid */
 400		break;
 401	case 'u': /* unicode char number up to 8 hex digits */
 402		c = hexstr(&p, 8);
 403		break;
 404	default:
 405		if (isoctdigit(c)) { /* \d \dd \ddd */
 406			int n = c - '0';
 407			if (isoctdigit(*p)) {
 408				n = 8 * n + *p++ - '0';
 409				if (isoctdigit(*p))
 410					n = 8 * n + *p++ - '0';
 411			}
 412			c = n;
 413		}
 414	}
 415
 416	*pp = p;
 417	return c;
 418}
 419
 420int *cclenter(const char *argp)	/* add a character class */
 421{
 422	int i, c, c2;
 423	int n;
 424	const uschar *p = (const uschar *) argp;
 425	int *bp, *retp;
 426	static int *buf = NULL;
 427	static int bufsz = 100;
 428
 429	if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
 430		FATAL("out of space for character class [%.10s...] 1", p);
 431	bp = buf;
 432	for (i = 0; *p != 0; ) {
 433		n = u8_rune(&c, (const char *) p);
 434		p += n;
 435		if (c == '\\') {
 436			c = quoted(&p);
 437		} else if (c == '-' && i > 0 && bp[-1] != 0) {
 438			if (*p != 0) {
 439				c = bp[-1];
 440				/* c2 = *p++; */
 441				n = u8_rune(&c2, (const char *) p);
 442				p += n;
 443				if (c2 == '\\')
 444					c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
 445				if (c > c2) {	/* empty; ignore */
 446					bp--;
 447					i--;
 448					continue;
 449				}
 450				while (c < c2) {
 451					if (i >= bufsz) {
 452						bufsz *= 2;
 453						buf = (int *) realloc(buf, bufsz * sizeof(int));
 454						if (buf == NULL)
 455							FATAL("out of space for character class [%.10s...] 2", p);
 456						bp = buf + i;
 457					}
 458					*bp++ = ++c;
 459					i++;
 460				}
 461				continue;
 462			}
 463		}
 464		if (i >= bufsz) {
 465			bufsz *= 2;
 466			buf = (int *) realloc(buf, bufsz * sizeof(int));
 467			if (buf == NULL)
 468				FATAL("out of space for character class [%.10s...] 2", p);
 469			bp = buf + i;
 470		}
 471		*bp++ = c;
 472		i++;
 473	}
 474	*bp = 0;
 475	/* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
 476	/* xfree(op);  BUG: what are we freeing here? */
 477	retp = (int *) calloc(bp-buf+1, sizeof(int));
 478	for (i = 0; i < bp-buf+1; i++)
 479		retp[i] = buf[i];
 480	return retp;
 481}
 482
 483void overflo(const char *s)
 484{
 485	FATAL("regular expression too big: out of space in %.30s...", s);
 486}
 487
 488void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
 489{
 490	int i;
 491	int *p;
 492
 493	switch (type(v)) {
 494	ELEAF
 495	LEAF
 496		f->re[info(v)].ltype = type(v);
 497		f->re[info(v)].lval.np = right(v);
 498		while (f->accept >= maxsetvec) {	/* guessing here! */
 499			resizesetvec(__func__);
 500		}
 501		for (i = 0; i <= f->accept; i++)
 502			setvec[i] = 0;
 503		setcnt = 0;
 504		follow(v);	/* computes setvec and setcnt */
 505		p = intalloc(setcnt + 1, __func__);
 506		f->re[info(v)].lfollow = p;
 507		*p = setcnt;
 508		for (i = f->accept; i >= 0; i--)
 509			if (setvec[i] == 1)
 510				*++p = i;
 511		break;
 512	UNARY
 513		cfoll(f,left(v));
 514		break;
 515	case CAT:
 516	case OR:
 517		cfoll(f,left(v));
 518		cfoll(f,right(v));
 519		break;
 520	case ZERO:
 521		break;
 522	default:	/* can't happen */
 523		FATAL("can't happen: unknown type %d in cfoll", type(v));
 524	}
 525}
 526
 527int first(Node *p)	/* collects initially active leaves of p into setvec */
 528			/* returns 0 if p matches empty string */
 529{
 530	int b, lp;
 531
 532	switch (type(p)) {
 533	ELEAF
 534	LEAF
 535		lp = info(p);	/* look for high-water mark of subscripts */
 536		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
 537			resizesetvec(__func__);
 538		}
 539		if (type(p) == EMPTYRE) {
 540			setvec[lp] = 0;
 541			return(0);
 542		}
 543		if (setvec[lp] != 1) {
 544			setvec[lp] = 1;
 545			setcnt++;
 546		}
 547		if (type(p) == CCL && (*(int *) right(p)) == 0)
 548			return(0);		/* empty CCL */
 549		return(1);
 550	case PLUS:
 551		if (first(left(p)) == 0)
 552			return(0);
 553		return(1);
 554	case STAR:
 555	case QUEST:
 556		first(left(p));
 557		return(0);
 558	case CAT:
 559		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
 560		return(1);
 561	case OR:
 562		b = first(right(p));
 563		if (first(left(p)) == 0 || b == 0) return(0);
 564		return(1);
 565	case ZERO:
 566		return 0;
 567	}
 568	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
 569	return(-1);
 570}
 571
 572void follow(Node *v)	/* collects leaves that can follow v into setvec */
 573{
 574	Node *p;
 575
 576	if (type(v) == FINAL)
 577		return;
 578	p = parent(v);
 579	switch (type(p)) {
 580	case STAR:
 581	case PLUS:
 582		first(v);
 583		follow(p);
 584		return;
 585
 586	case OR:
 587	case QUEST:
 588		follow(p);
 589		return;
 590
 591	case CAT:
 592		if (v == left(p)) {	/* v is left child of p */
 593			if (first(right(p)) == 0) {
 594				follow(p);
 595				return;
 596			}
 597		} else		/* v is right child */
 598			follow(p);
 599		return;
 600	}
 601}
 602
 603int member(int c, int *sarg)	/* is c in s? */
 604{
 605	int *s = (int *) sarg;
 606
 607	while (*s)
 608		if (c == *s++)
 609			return(1);
 610	return(0);
 611}
 612
 613static void resize_gototab(fa *f, int state)
 614{
 615	size_t new_size = f->gototab[state].allocated * 2;
 616	gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
 617	if (p == NULL)
 618		overflo(__func__);
 619
 620	// need to initialize the new memory to zero
 621	size_t orig_size = f->gototab[state].allocated;		// 2nd half of new mem is this size
 622	memset(p + orig_size, 0, orig_size * sizeof(gtte));	// clean it out
 623
 624	f->gototab[state].allocated = new_size;			// update gototab info
 625	f->gototab[state].entries = p;
 626}
 627
 628static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
 629{
 630	gtte key;
 631	gtte *item;
 632
 633	if ((unsigned)ch < GOTO_DIRECT)
 634		return f->gototab[state].direct[ch];
 635
 636	key.ch = ch;
 637	key.state = 0;	/* irrelevant */
 638	item = (gtte *) bsearch(& key, f->gototab[state].entries,
 639			f->gototab[state].inuse, sizeof(gtte),
 640			entry_cmp);
 641
 642	if (item == NULL)
 643		return 0;
 644	else
 645		return item->state;
 646}
 647
 648static int entry_cmp(const void *l, const void *r)
 649{
 650	const gtte *left, *right;
 651
 652	left = (const gtte *) l;
 653	right = (const gtte *) r;
 654
 655	return left->ch - right->ch;
 656}
 657
 658static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
 659{
 660	if ((unsigned)ch < GOTO_DIRECT) {
 661		f->gototab[state].direct[ch] = val;
 662		return val;
 663	}
 664
 665	if (f->gototab[state].inuse == 0) {
 666		f->gototab[state].entries[0].ch = ch;
 667		f->gototab[state].entries[0].state = val;
 668		f->gototab[state].inuse++;
 669		return val;
 670	} else if ((unsigned)ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
 671		// not seen yet, insert and return
 672		gtt *tab = & f->gototab[state];
 673		if (tab->inuse + 1 >= tab->allocated)
 674			resize_gototab(f, state);
 675
 676		f->gototab[state].entries[f->gototab[state].inuse].ch = ch;
 677		f->gototab[state].entries[f->gototab[state].inuse].state = val;
 678		f->gototab[state].inuse++;
 679		return val;
 680	} else {
 681		// maybe we have it, maybe we don't
 682		gtte key;
 683		gtte *item;
 684
 685		key.ch = ch;
 686		key.state = 0;	/* irrelevant */
 687		item = (gtte *) bsearch(& key, f->gototab[state].entries,
 688				f->gototab[state].inuse, sizeof(gtte),
 689				entry_cmp);
 690
 691		if (item != NULL) {
 692			// we have it, update state and return
 693			item->state = val;
 694			return item->state;
 695		}
 696		// otherwise, fall through to insert and reallocate.
 697	}
 698
 699	gtt *tab = & f->gototab[state];
 700	if (tab->inuse + 1 >= tab->allocated)
 701		resize_gototab(f, state);
 702	f->gototab[state].entries[tab->inuse].ch = ch;
 703	f->gototab[state].entries[tab->inuse].state = val;
 704	++tab->inuse;
 705
 706	qsort(f->gototab[state].entries,
 707		f->gototab[state].inuse, sizeof(gtte), entry_cmp);
 708
 709	return val; /* not used anywhere at the moment */
 710}
 711
 712static void clear_gototab(fa *f, int state)
 713{
 714	memset(f->gototab[state].direct, 0, sizeof(f->gototab[state].direct));
 715	memset(f->gototab[state].entries, 0,
 716		f->gototab[state].allocated * sizeof(gtte));
 717	f->gototab[state].inuse = 0;
 718}
 719
 720int match(fa *f, const char *p0)	/* shortest match ? */
 721{
 722	int s, ns;
 723	int n;
 724	int rune;
 725	const uschar *p = (const uschar *) p0;
 726
 727	/* return pmatch(f, p0); does it matter whether longest or shortest? */
 728
 729	s = f->initstat;
 730	assert (s < f->state_count);
 731
 732	if (f->out[s])
 733		return(1);
 734	do {
 735		/* assert(*p < NCHARS); */
 736		n = u8_rune(&rune, (const char *) p);
 737		if ((ns = get_gototab(f, s, rune)) != 0)
 738			s = ns;
 739		else
 740			s = cgoto(f, s, rune);
 741		if (f->out[s])
 742			return(1);
 743		if (*p == 0)
 744			break;
 745		p += n;
 746	} while (1);  /* was *p++ != 0 */
 747	return(0);
 748}
 749
 750int pmatch(fa *f, const char *p0)	/* longest match, for sub */
 751{
 752	int s, ns;
 753	int n;
 754	int rune;
 755	const uschar *p = (const uschar *) p0;
 756	const uschar *q;
 757
 758	s = f->initstat;
 759	assert(s < f->state_count);
 760
 761	patbeg = (const char *)p;
 762	patlen = -1;
 763	do {
 764		q = p;
 765		do {
 766			if (f->out[s])		/* final state */
 767				patlen = q-p;
 768			/* assert(*q < NCHARS); */
 769			n = u8_rune(&rune, (const char *) q);
 770			if ((ns = get_gototab(f, s, rune)) != 0)
 771				s = ns;
 772			else
 773				s = cgoto(f, s, rune);
 774
 775			assert(s < f->state_count);
 776
 777			if (s == 1) {	/* no transition */
 778				if (patlen >= 0) {
 779					patbeg = (const char *) p;
 780					return(1);
 781				}
 782				else
 783					goto nextin;	/* no match */
 784			}
 785			if (*q == 0)
 786				break;
 787			q += n;
 788		} while (1);
 789		q++;  /* was *q++ */
 790		if (f->out[s])
 791			patlen = q-p-1;	/* don't count $ */
 792		if (patlen >= 0) {
 793			patbeg = (const char *) p;
 794			return(1);
 795		}
 796	nextin:
 797		s = 2;
 798		if (*p == 0)
 799			break;
 800		n = u8_rune(&rune, (const char *) p);
 801		p += n;
 802	} while (1); /* was *p++ */
 803	return (0);
 804}
 805
 806int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
 807{
 808	int s, ns;
 809	int n;
 810	int rune;
 811	const uschar *p = (const uschar *) p0;
 812	const uschar *q;
 813
 814	s = f->initstat;
 815	assert(s < f->state_count);
 816
 817	patbeg = (const char *)p;
 818	patlen = -1;
 819	while (*p) {
 820		q = p;
 821		do {
 822			if (f->out[s])		/* final state */
 823				patlen = q-p;
 824			/* assert(*q < NCHARS); */
 825			n = u8_rune(&rune, (const char *) q);
 826			if ((ns = get_gototab(f, s, rune)) != 0)
 827				s = ns;
 828			else
 829				s = cgoto(f, s, rune);
 830			if (s == 1) {	/* no transition */
 831				if (patlen > 0) {
 832					patbeg = (const char *) p;
 833					return(1);
 834				} else
 835					goto nnextin;	/* no nonempty match */
 836			}
 837			if (*q == 0)
 838				break;
 839			q += n;
 840		} while (1);
 841		q++;
 842		if (f->out[s])
 843			patlen = q-p-1;	/* don't count $ */
 844		if (patlen > 0 ) {
 845			patbeg = (const char *) p;
 846			return(1);
 847		}
 848	nnextin:
 849		s = 2;
 850		p++;
 851	}
 852	return (0);
 853}
 854
 855
 856/*
 857 * NAME
 858 *     fnematch
 859 *
 860 * DESCRIPTION
 861 *     A stream-fed version of nematch which transfers characters to a
 862 *     null-terminated buffer. All characters up to and including the last
 863 *     character of the matching text or EOF are placed in the buffer. If
 864 *     a match is found, patbeg and patlen are set appropriately.
 865 *
 866 * RETURN VALUES
 867 *     false    No match found.
 868 *     true     Match found.
 869 */
 870
 871bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
 872{
 873	char *i, *j, *k, *buf = *pbuf;
 874	int bufsize = *pbufsize;
 875	int c, n, ns, s;
 876
 877	s = pfa->initstat;
 878	patlen = 0;
 879
 880	/*
 881	 * buf <= i <= j <= k <= buf+bufsize
 882	 *
 883	 * i: origin of active substring
 884	 * j: current character
 885	 * k: destination of the next getc
 886	 */
 887
 888	i = j = k = buf;
 889
 890	do {
 891		/*
 892		 * Call u8_rune with at least awk_mb_cur_max ahead in
 893		 * the buffer until EOF interferes.
 894		 */
 895		if (k - j < (int)awk_mb_cur_max) {
 896			if (k + awk_mb_cur_max > buf + bufsize) {
 897				char *obuf = buf;
 898				adjbuf((char **) &buf, &bufsize,
 899				    bufsize + awk_mb_cur_max,
 900				    quantum, 0, "fnematch");
 901
 902				/* buf resized, maybe moved. update pointers */
 903				*pbufsize = bufsize;
 904				if (obuf != buf) {
 905					i = buf + (i - obuf);
 906					j = buf + (j - obuf);
 907					k = buf + (k - obuf);
 908					*pbuf = buf;
 909					if (patlen)
 910						patbeg = buf + (patbeg - obuf);
 911				}
 912			}
 913			for (n = awk_mb_cur_max ; n > 0; n--) {
 914				*k++ = (c = getc(f)) != EOF ? c : 0;
 915				if (c == EOF) {
 916					if (ferror(f))
 917						FATAL("fnematch: getc error");
 918					break;
 919				}
 920			}
 921		}
 922
 923		j += u8_rune(&c, j);
 924
 925		if ((ns = get_gototab(pfa, s, c)) != 0)
 926			s = ns;
 927		else
 928			s = cgoto(pfa, s, c);
 929
 930		if (pfa->out[s]) {	/* final state */
 931			patbeg = i;
 932			patlen = j - i;
 933			if (c == 0)	/* don't count $ */
 934				patlen--;
 935		}
 936
 937		if (c && s != 1)
 938			continue;  /* origin i still viable, next j */
 939		if (patlen)
 940			break;     /* best match found */
 941
 942		/* no match at origin i, next i and start over */
 943		i += u8_rune(&c, i);
 944		if (c == 0)
 945			break;    /* no match */
 946		j = i;
 947		s = 2;
 948	} while (1);
 949
 950	if (patlen) {
 951		/*
 952		 * Under no circumstances is the last character fed to
 953		 * the automaton part of the match. It is EOF's nullbyte,
 954		 * or it sent the automaton into a state with no further
 955		 * transitions available (s==1), or both. Room for a
 956		 * terminating nullbyte is guaranteed.
 957		 *
 958		 * ungetc any chars after the end of matching text
 959		 * (except for EOF's nullbyte, if present) and null
 960		 * terminate the buffer.
 961		 */
 962		do
 963			if (*--k && ungetc(*k, f) == EOF)
 964				FATAL("unable to ungetc '%c'", *k);
 965		while (k > patbeg + patlen);
 966		*k = '\0';
 967		return true;
 968	}
 969	else
 970		return false;
 971}
 972
 973Node *reparse(const char *p)	/* parses regular expression pointed to by p */
 974{			/* uses relex() to scan regular expression */
 975	Node *np;
 976
 977	DPRINTF("reparse <%s>\n", p);
 978	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
 979	rtok = relex();
 980	/* GNU compatibility: an empty regexp matches anything */
 981	if (rtok == '\0') {
 982		/* FATAL("empty regular expression"); previous */
 983		return(op2(EMPTYRE, NIL, NIL));
 984	}
 985	np = regexp();
 986	if (rtok != '\0')
 987		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
 988	return(np);
 989}
 990
 991Node *regexp(void)	/* top-level parse of reg expr */
 992{
 993	return (alt(concat(primary())));
 994}
 995
 996Node *primary(void)
 997{
 998	Node *np;
 999	int savelastatom;
1000
1001	switch (rtok) {
1002	case CHAR:
1003		lastatom = starttok;
1004		np = op2(CHAR, NIL, itonp(rlxval));
1005		rtok = relex();
1006		return (unary(np));
1007	case ALL:
1008		rtok = relex();
1009		return (unary(op2(ALL, NIL, NIL)));
1010	case EMPTYRE:
1011		rtok = relex();
1012		return (unary(op2(EMPTYRE, NIL, NIL)));
1013	case DOT:
1014		lastatom = starttok;
1015		rtok = relex();
1016		return (unary(op2(DOT, NIL, NIL)));
1017	case CCL:
1018		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
1019		lastatom = starttok;
1020		rtok = relex();
1021		return (unary(np));
1022	case NCCL:
1023		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1024		lastatom = starttok;
1025		rtok = relex();
1026		return (unary(np));
1027	case '^':
1028		rtok = relex();
1029		return (unary(op2(CHAR, NIL, itonp(HAT))));
1030	case '$':
1031		rtok = relex();
1032		return (unary(op2(CHAR, NIL, NIL)));
1033	case '(':
1034		lastatom = starttok;
1035		savelastatom = starttok - basestr; /* Retain over recursion */
1036		rtok = relex();
1037		if (rtok == ')') {	/* special pleading for () */
1038			rtok = relex();
1039			return unary(op2(CCL, NIL, (Node *) cclenter("")));
1040		}
1041		np = regexp();
1042		if (rtok == ')') {
1043			lastatom = basestr + savelastatom; /* Restore */
1044			rtok = relex();
1045			return (unary(np));
1046		}
1047		else {
1048			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1049			break;
1050		}
1051	default:
1052		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1053	}
1054	return 0;	/*NOTREACHED*/
1055}
1056
1057Node *concat(Node *np)
1058{
1059	switch (rtok) {
1060	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1061		return (concat(op2(CAT, np, primary())));
1062	case EMPTYRE:
1063		rtok = relex();
1064		return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1065				primary())));
1066	}
1067	return (np);
1068}
1069
1070Node *alt(Node *np)
1071{
1072	if (rtok == OR) {
1073		rtok = relex();
1074		return (alt(op2(OR, np, concat(primary()))));
1075	}
1076	return (np);
1077}
1078
1079Node *unary(Node *np)
1080{
1081	switch (rtok) {
1082	case STAR:
1083		rtok = relex();
1084		return (unary(op2(STAR, np, NIL)));
1085	case PLUS:
1086		rtok = relex();
1087		return (unary(op2(PLUS, np, NIL)));
1088	case QUEST:
1089		rtok = relex();
1090		return (unary(op2(QUEST, np, NIL)));
1091	case ZERO:
1092		rtok = relex();
1093		return (unary(op2(ZERO, np, NIL)));
1094	default:
1095		return (np);
1096	}
1097}
1098
1099/*
1100 * Character class definitions conformant to the POSIX locale as
1101 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1102 * and operating character sets are both ASCII (ISO646) or supersets
1103 * thereof.
1104 *
1105 * Note that to avoid overflowing the temporary buffer used in
1106 * relex(), the expanded character class (prior to range expansion)
1107 * must be less than twice the size of their full name.
1108 */
1109
1110/* Because isblank doesn't show up in any of the header files on any
1111 * system i use, it's defined here.  if some other locale has a richer
1112 * definition of "blank", define HAS_ISBLANK and provide your own
1113 * version.
1114 * the parentheses here are an attempt to find a path through the maze
1115 * of macro definition and/or function and/or version provided.  thanks
1116 * to nelson beebe for the suggestion; let's see if it works everywhere.
1117 */
1118
1119/* #define HAS_ISBLANK */
1120#ifndef HAS_ISBLANK
1121
1122int (xisblank)(int c)
1123{
1124	return c==' ' || c=='\t';
1125}
1126
1127#endif
1128
1129static const struct charclass {
1130	const char *cc_name;
1131	int cc_namelen;
1132	int (*cc_func)(int);
1133} charclasses[] = {
1134	{ "alnum",	5,	isalnum },
1135	{ "alpha",	5,	isalpha },
1136#ifndef HAS_ISBLANK
1137	{ "blank",	5,	xisblank },
1138#else
1139	{ "blank",	5,	isblank },
1140#endif
1141	{ "cntrl",	5,	iscntrl },
1142	{ "digit",	5,	isdigit },
1143	{ "graph",	5,	isgraph },
1144	{ "lower",	5,	islower },
1145	{ "print",	5,	isprint },
1146	{ "punct",	5,	ispunct },
1147	{ "space",	5,	isspace },
1148	{ "upper",	5,	isupper },
1149	{ "xdigit",	6,	isxdigit },
1150	{ NULL,		0,	NULL },
1151};
1152
1153#define REPEAT_SIMPLE		0
1154#define REPEAT_PLUS_APPENDED	1
1155#define REPEAT_WITH_Q		2
1156#define REPEAT_ZERO		3
1157
1158static int
1159replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1160	       int atomlen, int firstnum, int secondnum, int special_case)
1161{
1162	int i, j;
1163	uschar *buf = 0;
1164	int ret = 1;
1165	int init_q = (firstnum == 0);		/* first added char will be ? */
1166	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
1167	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
1168	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
1169	int size = prefix_length +  suffix_length;
1170
1171	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
1172		size += atomlen*(firstnum-1);
1173	}
1174
1175	/* Adjust size of buffer for special cases */
1176	if (special_case == REPEAT_PLUS_APPENDED) {
1177		size++;		/* for the final + */
1178	} else if (special_case == REPEAT_WITH_Q) {
1179		size += init_q + (atomlen+1)* (n_q_reps-init_q);
1180	} else if (special_case == REPEAT_ZERO) {
1181		size += 2;	/* just a null ERE: () */
1182	}
1183	if ((buf = (uschar *) malloc(size + 1)) == NULL)
1184		FATAL("out of space in reg expr %.10s..", lastre);
1185	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
1186	j = prefix_length;
1187	if (special_case == REPEAT_ZERO) {
1188		j -= atomlen;
1189		buf[j++] = '(';
1190		buf[j++] = ')';
1191	}
1192	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
1193		memcpy(&buf[j], atom, atomlen);
1194		j += atomlen;
1195	}
1196	if (special_case == REPEAT_PLUS_APPENDED) {
1197		buf[j++] = '+';
1198	} else if (special_case == REPEAT_WITH_Q) {
1199		if (init_q)
1200			buf[j++] = '?';
1201		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
1202			memcpy(&buf[j], atom, atomlen);
1203			j += atomlen;
1204			buf[j++] = '?';
1205		}
1206	}
1207	memcpy(&buf[j], reptok+reptoklen, suffix_length);
1208	j += suffix_length;
1209	buf[j] = '\0';
1210	/* free old basestr */
1211	if (firstbasestr != basestr) {
1212		if (basestr)
1213			xfree(basestr);
1214	}
1215	basestr = buf;
1216	prestr  = buf + prefix_length;
1217	if (special_case == REPEAT_ZERO) {
1218		prestr  -= atomlen;
1219		ret++;
1220	}
1221	return ret;
1222}
1223
1224static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1225		  int atomlen, int firstnum, int secondnum)
1226{
1227	if (atom == NULL)
1228		return 0;
1229
1230	/*
1231	   In general, the repetition specifier or "bound" is replaced here
1232	   by an equivalent ERE string, repeating the immediately previous atom
1233	   and appending ? and + as needed. Note that the first copy of the
1234	   atom is left in place, except in the special_case of a zero-repeat
1235	   (i.e., {0}).
1236	 */
1237	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
1238		if (firstnum < 2) {
1239			/* 0 or 1: should be handled before you get here */
1240			FATAL("internal error");
1241		} else {
1242			return replace_repeat(reptok, reptoklen, atom, atomlen,
1243				firstnum, secondnum, REPEAT_PLUS_APPENDED);
1244		}
1245	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
1246		if (firstnum == 0) {	/* {0} or {0,0} */
1247			/* This case is unusual because the resulting
1248			   replacement string might actually be SMALLER than
1249			   the original ERE */
1250			return replace_repeat(reptok, reptoklen, atom, atomlen,
1251					firstnum, secondnum, REPEAT_ZERO);
1252		} else {		/* (firstnum >= 1) */
1253			return replace_repeat(reptok, reptoklen, atom, atomlen,
1254					firstnum, secondnum, REPEAT_SIMPLE);
1255		}
1256	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
1257		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
1258		return replace_repeat(reptok, reptoklen, atom, atomlen,
1259					firstnum, secondnum, REPEAT_WITH_Q);
1260	} else {	/* Error - shouldn't be here (n>m) */
1261		FATAL("internal error");
1262	}
1263	return 0;
1264}
1265
1266int relex(void)		/* lexical analyzer for reparse */
1267{
1268	int c, n;
1269	int cflag;
1270	static uschar *buf = NULL;
1271	static int bufsz = 100;
1272	uschar *bp;
1273	const struct charclass *cc;
1274	int i;
1275	int num, m;
1276	bool commafound, digitfound;
1277	const uschar *startreptok;
1278	static int parens = 0;
1279
1280rescan:
1281	starttok = prestr;
1282
1283	if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1284		prestr += n;
1285		starttok = prestr;
1286		return CHAR;
1287	}
1288
1289	switch (c = *prestr++) {
1290	case '|': return OR;
1291	case '*': return STAR;
1292	case '+': return PLUS;
1293	case '?': return QUEST;
1294	case '.': return DOT;
1295	case '\0': prestr--; return '\0';
1296	case '^':
1297	case '$':
1298		return c;
1299	case '(':
1300		parens++;
1301		return c;
1302	case ')':
1303		if (parens) {
1304			parens--;
1305			return c;
1306		}
1307		/* unmatched close parenthesis; per POSIX, treat as literal */
1308		rlxval = c;
1309		return CHAR;
1310	case '\\':
1311		rlxval = quoted(&prestr);
1312		return CHAR;
1313	default:
1314		rlxval = c;
1315		return CHAR;
1316	case '[':
1317		if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1318			FATAL("out of space in reg expr %.10s..", lastre);
1319		bp = buf;
1320		if (*prestr == '^') {
1321			cflag = 1;
1322			prestr++;
1323		}
1324		else
1325			cflag = 0;
1326		n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2.  what value? */
1327		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1328			FATAL("out of space for reg expr %.10s...", lastre);
1329		for (; ; ) {
1330			if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1331				for (i = 0; i < n; i++)
1332					*bp++ = *prestr++;
1333				continue;
1334			}
1335			if ((c = *prestr++) == '\\') {
1336				*bp++ = '\\';
1337				if ((c = *prestr++) == '\0')
1338					FATAL("nonterminated character class %.20s...", lastre);
1339				*bp++ = c;
1340			/* } else if (c == '\n') { */
1341			/* 	FATAL("newline in character class %.20s...", lastre); */
1342			} else if (c == '[' && *prestr == ':') {
1343				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1344				for (cc = charclasses; cc->cc_name; cc++)
1345					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1346						break;
1347				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1348				    prestr[2 + cc->cc_namelen] == ']') {
1349					prestr += cc->cc_namelen + 3;
1350					/*
1351					 * BUG: We begin at 1, instead of 0, since we
1352					 * would otherwise prematurely terminate the
1353					 * string for classes like [[:cntrl:]]. This
1354					 * means that we can't match the NUL character,
1355					 * not without first adapting the entire
1356					 * program to track each string's length.
1357					 */
1358					for (i = 1; i <= UCHAR_MAX; i++) {
1359						if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1360						    FATAL("out of space for reg expr %.10s...", lastre);
1361						if (cc->cc_func(i)) {
1362							/* escape backslash */
1363							if (i == '\\') {
1364								*bp++ = '\\';
1365								n++;
1366							}
1367
1368							*bp++ = i;
1369							n++;
1370						}
1371					}
1372				} else
1373					*bp++ = c;
1374			} else if (c == '[' && *prestr == '.') {
1375				char collate_char;
1376				prestr++;
1377				collate_char = *prestr++;
1378				if (*prestr == '.' && prestr[1] == ']') {
1379					prestr += 2;
1380					/* Found it: map via locale TBD: for
1381					   now, simply return this char.  This
1382					   is sufficient to pass conformance
1383					   test awk.ex 156
1384					 */
1385					if (*prestr == ']') {
1386						prestr++;
1387						rlxval = collate_char;
1388						return CHAR;
1389					}
1390				}
1391			} else if (c == '[' && *prestr == '=') {
1392				char equiv_char;
1393				prestr++;
1394				equiv_char = *prestr++;
1395				if (*prestr == '=' && prestr[1] == ']') {
1396					prestr += 2;
1397					/* Found it: map via locale TBD: for now
1398					   simply return this char. This is
1399					   sufficient to pass conformance test
1400					   awk.ex 156
1401					 */
1402					if (*prestr == ']') {
1403						prestr++;
1404						rlxval = equiv_char;
1405						return CHAR;
1406					}
1407				}
1408			} else if (c == '\0') {
1409				FATAL("nonterminated character class %.20s", lastre);
1410			} else if (bp == buf) {	/* 1st char is special */
1411				*bp++ = c;
1412			} else if (c == ']') {
1413				*bp++ = 0;
1414				rlxstr = (uschar *) tostring((char *) buf);
1415				if (cflag == 0)
1416					return CCL;
1417				else
1418					return NCCL;
1419			} else
1420				*bp++ = c;
1421		}
1422		break;
1423	case '{':
1424		if (isdigit((int) *(prestr))) {
1425			num = 0;	/* Process as a repetition */
1426			n = -1; m = -1;
1427			commafound = false;
1428			digitfound = false;
1429			startreptok = prestr-1;
1430			/* Remember start of previous atom here ? */
1431		} else {        	/* just a { char, not a repetition */
1432			rlxval = c;
1433			return CHAR;
1434                }
1435		for (; ; ) {
1436			if ((c = *prestr++) == '}') {
1437				if (commafound) {
1438					if (digitfound) { /* {n,m} */
1439						m = num;
1440						if (m < n)
1441							FATAL("illegal repetition expression: class %.20s",
1442								lastre);
1443						if (n == 0 && m == 1) {
1444							return QUEST;
1445						}
1446					} else {	/* {n,} */
1447						if (n == 0)
1448							return STAR;
1449						else if (n == 1)
1450							return PLUS;
1451					}
1452				} else {
1453					if (digitfound) { /* {n} same as {n,n} */
1454						n = num;
1455						m = num;
1456					} else {	/* {} */
1457						FATAL("illegal repetition expression: class %.20s",
1458							lastre);
1459					}
1460				}
1461				if (repeat(starttok, prestr-starttok, lastatom,
1462					   startreptok - lastatom, n, m) > 0) {
1463					if (n == 0 && m == 0) {
1464						return ZERO;
1465					}
1466					/* must rescan input for next token */
1467					goto rescan;
1468				}
1469				/* Failed to replace: eat up {...} characters
1470				   and treat like just PLUS */
1471				return PLUS;
1472			} else if (c == '\0') {
1473				FATAL("nonterminated character class %.20s",
1474					lastre);
1475			} else if (isdigit(c)) {
1476				num = 10 * num + c - '0';
1477				if (num > 255)
1478					FATAL("repetition count %.20s too large",
1479						lastre);
1480				digitfound = true;
1481			} else if (c == ',') {
1482				if (commafound)
1483					FATAL("illegal repetition expression: class %.20s",
1484						lastre);
1485				/* looking for {n,} or {n,m} */
1486				commafound = true;
1487				n = num;
1488				digitfound = false; /* reset */
1489				num = 0;
1490			} else {
1491				FATAL("illegal repetition expression: class %.20s",
1492					lastre);
1493			}
1494		}
1495		break;
1496	}
1497}
1498
1499int cgoto(fa *f, int s, int c)
1500{
1501	int *p, *q;
1502	int i, j, k;
1503
1504	/* assert(c == HAT || c < NCHARS);  BUG: seg fault if disable test */
1505	while (f->accept >= maxsetvec) {	/* guessing here! */
1506		resizesetvec(__func__);
1507	}
1508	for (i = 0; i <= f->accept; i++)
1509		setvec[i] = 0;
1510	setcnt = 0;
1511	resize_state(f, s);
1512	/* compute positions of gototab[s,c] into setvec */
1513	p = f->posns[s];
1514	for (i = 1; i <= *p; i++) {
1515		if ((k = f->re[p[i]].ltype) != FINAL) {
1516			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1517			 || (k == DOT && c != 0 && c != HAT)
1518			 || (k == ALL && c != 0)
1519			 || (k == EMPTYRE && c != 0)
1520			 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1521			 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1522				q = f->re[p[i]].lfollow;
1523				for (j = 1; j <= *q; j++) {
1524					if (q[j] >= maxsetvec) {
1525						resizesetvec(__func__);
1526					}
1527					if (setvec[q[j]] == 0) {
1528						setcnt++;
1529						setvec[q[j]] = 1;
1530					}
1531				}
1532			}
1533		}
1534	}
1535	/* determine if setvec is a previous state */
1536	tmpset[0] = setcnt;
1537	j = 1;
1538	for (i = f->accept; i >= 0; i--)
1539		if (setvec[i]) {
1540			tmpset[j++] = i;
1541		}
1542	resize_state(f, f->curstat > s ? f->curstat : s);
1543	/* tmpset == previous state? */
1544	for (i = 1; i <= f->curstat; i++) {
1545		p = f->posns[i];
1546		if ((k = tmpset[0]) != p[0])
1547			goto different;
1548		for (j = 1; j <= k; j++)
1549			if (tmpset[j] != p[j])
1550				goto different;
1551		/* setvec is state i */
1552		if (c != HAT)
1553			set_gototab(f, s, c, i);
1554		return i;
1555	  different:;
1556	}
1557
1558	/* add tmpset to current set of states */
1559	++(f->curstat);
1560	resize_state(f, f->curstat);
1561	clear_gototab(f, f->curstat);
1562	xfree(f->posns[f->curstat]);
1563	p = intalloc(setcnt + 1, __func__);
1564
1565	f->posns[f->curstat] = p;
1566	if (c != HAT)
1567		set_gototab(f, s, c, f->curstat);
1568	for (i = 0; i <= setcnt; i++)
1569		p[i] = tmpset[i];
1570	if (setvec[f->accept])
1571		f->out[f->curstat] = 1;
1572	else
1573		f->out[f->curstat] = 0;
1574	return f->curstat;
1575}
1576
1577
1578void freefa(fa *f)	/* free a finite automaton */
1579{
1580	int i;
1581
1582	if (f == NULL)
1583		return;
1584	for (i = 0; i < f->state_count; i++)
1585		xfree(f->gototab[i].entries);
1586	xfree(f->gototab);
1587	for (i = 0; i <= f->curstat; i++)
1588		xfree(f->posns[i]);
1589	for (i = 0; i <= f->accept; i++) {
1590		xfree(f->re[i].lfollow);
1591		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1592			xfree(f->re[i].lval.np);
1593	}
1594	xfree(f->restr);
1595	xfree(f->out);
1596	xfree(f->posns);
1597	xfree(f->gototab);
1598	xfree(f);
1599}