master xplshn/aruu / cmd / posix / awk / lex.c
  1/****************************************************************
  2Copyright (C) Lucent Technologies 1997
  3All Rights Reserved
  4
  5Permission to use, copy, modify, and distribute this software and
  6its documentation for any purpose and without fee is hereby
  7granted, provided that the above copyright notice appear in all
  8copies and that both that the copyright notice and this
  9permission notice and warranty disclaimer appear in supporting
 10documentation, and that the name Lucent Technologies or any of
 11its entities not be used in advertising or publicity pertaining
 12to distribution of the software without specific, written prior
 13permission.
 14
 15LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
 16INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
 17IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
 18SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
 20IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
 21ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
 22THIS SOFTWARE.
 23****************************************************************/
 24
 25#include <stdio.h>
 26#include <stdlib.h>
 27#include <string.h>
 28#include <ctype.h>
 29#include "awk.h"
 30#include "awkgram.tab.h"
 31
extern YYSTYPE	yylval;	/* semantic value handed to the parser with each token; defined outside this file */
extern bool	infunc;	/* defined outside this file; presumably true while a function body is being parsed -- confirm */

int	lineno	= 1;	/* current source line; incremented by yylex() and string() as they consume line ends */
int	bracecnt = 0;	/* '{' seen so far minus '}' seen so far */
int	brackcnt  = 0;	/* '[' seen so far minus ']' seen so far */
int	parencnt = 0;	/* '(' seen so far minus ')' seen so far */
 39
/* One row of the reserved-word table consulted by word(). */
typedef struct Keyword {
	const char *word;	/* spelling as written in the program text */
	int	sub;	/* value word() stores in yylval.i when this keyword is matched */
	int	type;	/* token code word() returns for this keyword */
} Keyword;
 45
/*
 * Reserved words and built-in function names recognized by word().
 * Entries must stay in strcmp() order because lookups go through
 * binsearch().  Most built-in functions share the BLTIN token type and
 * are distinguished by their sub field.
 */
const Keyword keywords[] = {	/* keep sorted: binary searched */
	{ "BEGIN",	XBEGIN,		XBEGIN },
	{ "END",	XEND,		XEND },
	{ "NF",		VARNF,		VARNF },
	{ "atan2",	FATAN,		BLTIN },
	{ "break",	BREAK,		BREAK },
	{ "close",	CLOSE,		CLOSE },
	{ "continue",	CONTINUE,	CONTINUE },
	{ "cos",	FCOS,		BLTIN },
	{ "delete",	DELETE,		DELETE },
	{ "do",		DO,		DO },
	{ "else",	ELSE,		ELSE },
	{ "exit",	EXIT,		EXIT },
	{ "exp",	FEXP,		BLTIN },
	{ "fflush",	FFLUSH,		BLTIN },
	{ "for",	FOR,		FOR },
	{ "func",	FUNC,		FUNC },
	{ "function",	FUNC,		FUNC },
	{ "getline",	GETLINE,	GETLINE },
	{ "gsub",	GSUB,		GSUB },
	{ "if",		IF,		IF },
	{ "in",		IN,		IN },
	{ "index",	INDEX,		INDEX },
	{ "int",	FINT,		BLTIN },
	{ "length",	FLENGTH,	BLTIN },
	{ "log",	FLOG,		BLTIN },
	{ "match",	MATCHFCN,	MATCHFCN },
	{ "next",	NEXT,		NEXT },
	{ "nextfile",	NEXTFILE,	NEXTFILE },
	{ "print",	PRINT,		PRINT },
	{ "printf",	PRINTF,		PRINTF },
	{ "rand",	FRAND,		BLTIN },
	{ "return",	RETURN,		RETURN },
	{ "sin",	FSIN,		BLTIN },
	{ "split",	SPLIT,		SPLIT },
	{ "sprintf",	SPRINTF,	SPRINTF },
	{ "sqrt",	FSQRT,		BLTIN },
	{ "srand",	FSRAND,		BLTIN },
	{ "sub",	SUB,		SUB },
	{ "substr",	SUBSTR,		SUBSTR },
	{ "system",	FSYSTEM,	BLTIN },
	{ "tolower",	FTOLOWER,	BLTIN },
	{ "toupper",	FTOUPPER,	BLTIN },
	{ "while",	WHILE,		WHILE },
};
 91
 92#define	RET(x)	{ if(dbg)printf("lex %s\n", tokname(x)); return(x); }
 93
 94static int peek(void)
 95{
 96	int c = input();
 97	unput(c);
 98	return c;
 99}
100
101static int gettok(char **pbuf, int *psz)	/* get next input token */
102{
103	int c, retc;
104	char *buf = *pbuf;
105	int sz = *psz;
106	char *bp = buf;
107
108	c = input();
109	if (c == 0)
110		return 0;
111	buf[0] = c;
112	buf[1] = 0;
113	if (!isalnum(c) && c != '.' && c != '_')
114		return c;
115
116	*bp++ = c;
117	if (isalpha(c) || c == '_') {	/* it's a varname */
118		for ( ; (c = input()) != 0; ) {
119			if (bp-buf >= sz)
120				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
121					FATAL( "out of space for name %.10s...", buf );
122			if (isalnum(c) || c == '_')
123				*bp++ = c;
124			else {
125				*bp = 0;
126				unput(c);
127				break;
128			}
129		}
130		*bp = 0;
131		retc = 'a';	/* alphanumeric */
132	} else {	/* maybe it's a number, but could be . */
133		char *rem;
134		/* read input until can't be a number */
135		for ( ; (c = input()) != 0; ) {
136			if (bp-buf >= sz)
137				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
138					FATAL( "out of space for number %.10s...", buf );
139			if (isdigit(c) || c == 'e' || c == 'E'
140			  || c == '.' || c == '+' || c == '-')
141				*bp++ = c;
142			else {
143				unput(c);
144				break;
145			}
146		}
147		*bp = 0;
148		strtod(buf, &rem);	/* parse the number */
149		if (rem == buf) {	/* it wasn't a valid number at all */
150			buf[1] = 0;	/* return one character as token */
151			retc = (uschar)buf[0];	/* character is its own type */
152			unputstr(rem+1); /* put rest back for later */
153		} else {	/* some prefix was a number */
154			unputstr(rem);	/* put rest back for later */
155			rem[0] = 0;	/* truncate buf after number part */
156			retc = '0';	/* type is number */
157		}
158	}
159	*pbuf = buf;
160	*psz = sz;
161	return retc;
162}
163
int	word(char *);	/* classify a name: keyword, function argument, call or variable */
int	string(void);	/* lex the remainder of a "..." literal */
int	regexpr(void);	/* lex the body of a /.../ regular expression */
bool	sc	= false;	/* true => return a } right now */
bool	reg	= false;	/* true => return a REGEXPR now */
169
170int yylex(void)
171{
172	int c;
173	static char *buf = NULL;
174	static int bufsize = 5; /* BUG: setting this small causes core dump! */
175
176	if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
177		FATAL( "out of space in yylex" );
178	if (sc) {
179		sc = false;
180		RET('}');
181	}
182	if (reg) {
183		reg = false;
184		return regexpr();
185	}
186	for (;;) {
187		c = gettok(&buf, &bufsize);
188		if (c == 0)
189			return 0;
190		if (isalpha(c) || c == '_')
191			return word(buf);
192		if (isdigit(c)) {
193			char *cp = tostring(buf);
194			double result;
195
196			if (is_number(cp, & result))
197				yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab);
198			else
199				yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab);
200			free(cp);
201			/* should this also have STR set? */
202			RET(NUMBER);
203		}
204
205		yylval.i = c;
206		switch (c) {
207		case '\n':	/* {EOL} */
208			lineno++;
209			RET(NL);
210		case '\r':	/* assume \n is coming */
211		case ' ':	/* {WS}+ */
212		case '\t':
213			break;
214		case '#':	/* #.* strip comments */
215			while ((c = input()) != '\n' && c != 0)
216				;
217			unput(c);
218			break;
219		case ';':
220			RET(';');
221		case '\\':
222			if (peek() == '\n') {
223				input();
224				lineno++;
225			} else if (peek() == '\r') {
226				input(); input();	/* \n */
227				lineno++;
228			} else {
229				RET(c);
230			}
231			break;
232		case '&':
233			if (peek() == '&') {
234				input(); RET(AND);
235			} else
236				RET('&');
237		case '|':
238			if (peek() == '|') {
239				input(); RET(BOR);
240			} else
241				RET('|');
242		case '!':
243			if (peek() == '=') {
244				input(); yylval.i = NE; RET(NE);
245			} else if (peek() == '~') {
246				input(); yylval.i = NOTMATCH; RET(MATCHOP);
247			} else
248				RET(NOT);
249		case '~':
250			yylval.i = MATCH;
251			RET(MATCHOP);
252		case '<':
253			if (peek() == '=') {
254				input(); yylval.i = LE; RET(LE);
255			} else {
256				yylval.i = LT; RET(LT);
257			}
258		case '=':
259			if (peek() == '=') {
260				input(); yylval.i = EQ; RET(EQ);
261			} else {
262				yylval.i = ASSIGN; RET(ASGNOP);
263			}
264		case '>':
265			if (peek() == '=') {
266				input(); yylval.i = GE; RET(GE);
267			} else if (peek() == '>') {
268				input(); yylval.i = APPEND; RET(APPEND);
269			} else {
270				yylval.i = GT; RET(GT);
271			}
272		case '+':
273			if (peek() == '+') {
274				input(); yylval.i = INCR; RET(INCR);
275			} else if (peek() == '=') {
276				input(); yylval.i = ADDEQ; RET(ASGNOP);
277			} else
278				RET('+');
279		case '-':
280			if (peek() == '-') {
281				input(); yylval.i = DECR; RET(DECR);
282			} else if (peek() == '=') {
283				input(); yylval.i = SUBEQ; RET(ASGNOP);
284			} else
285				RET('-');
286		case '*':
287			if (peek() == '=') {	/* *= */
288				input(); yylval.i = MULTEQ; RET(ASGNOP);
289			} else if (peek() == '*') {	/* ** or **= */
290				input();	/* eat 2nd * */
291				if (peek() == '=') {
292					input(); yylval.i = POWEQ; RET(ASGNOP);
293				} else {
294					RET(POWER);
295				}
296			} else
297				RET('*');
298		case '/':
299			RET('/');
300		case '%':
301			if (peek() == '=') {
302				input(); yylval.i = MODEQ; RET(ASGNOP);
303			} else
304				RET('%');
305		case '^':
306			if (peek() == '=') {
307				input(); yylval.i = POWEQ; RET(ASGNOP);
308			} else
309				RET(POWER);
310
311		case '$':
312			/* BUG: awkward, if not wrong */
313			c = gettok(&buf, &bufsize);
314			if (isalpha(c)) {
315				if (strcmp(buf, "NF") == 0) {	/* very special */
316					unputstr("(NF)");
317					RET(INDIRECT);
318				}
319				c = peek();
320				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
321					unputstr(buf);
322					RET(INDIRECT);
323				}
324				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
325				RET(IVAR);
326			} else if (c == 0) {	/*  */
327				SYNTAX( "unexpected end of input after $" );
328				RET(';');
329			} else {
330				unputstr(buf);
331				RET(INDIRECT);
332			}
333
334		case '}':
335			if (--bracecnt < 0)
336				SYNTAX( "extra }" );
337			sc = true;
338			RET(';');
339		case ']':
340			if (--brackcnt < 0)
341				SYNTAX( "extra ]" );
342			RET(']');
343		case ')':
344			if (--parencnt < 0)
345				SYNTAX( "extra )" );
346			RET(')');
347		case '{':
348			bracecnt++;
349			RET('{');
350		case '[':
351			brackcnt++;
352			RET('[');
353		case '(':
354			parencnt++;
355			RET('(');
356
357		case '"':
358			return string();	/* BUG: should be like tran.c ? */
359
360		default:
361			RET(c);
362		}
363	}
364}
365
366extern int runetochar(char *str, int c);
367
368int string(void)
369{
370	int c, n;
371	char *s, *bp;
372	static char *buf = NULL;
373	static int bufsz = 500;
374
375	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
376		FATAL("out of space for strings");
377	for (bp = buf; (c = input()) != '"'; ) {
378		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
379			FATAL("out of space for string %.10s...", buf);
380		switch (c) {
381		case '\n':
382		case '\r':
383		case 0:
384			*bp = '\0';
385			SYNTAX( "non-terminated string %.10s...", buf );
386			if (c == 0)	/* hopeless */
387				FATAL( "giving up" );
388			lineno++;
389			break;
390		case '\\':
391			c = input();
392			switch (c) {
393			case '\n': break;
394			case '"': *bp++ = '"'; break;
395			case 'n': *bp++ = '\n'; break;
396			case 't': *bp++ = '\t'; break;
397			case 'f': *bp++ = '\f'; break;
398			case 'r': *bp++ = '\r'; break;
399			case 'b': *bp++ = '\b'; break;
400			case 'v': *bp++ = '\v'; break;
401			case 'a': *bp++ = '\a'; break;
402			case '\\': *bp++ = '\\'; break;
403
404			case '0': case '1': case '2': /* octal: \d \dd \ddd */
405			case '3': case '4': case '5': case '6': case '7':
406				n = c - '0';
407				if ((c = peek()) >= '0' && c < '8') {
408					n = 8 * n + input() - '0';
409					if ((c = peek()) >= '0' && c < '8')
410						n = 8 * n + input() - '0';
411				}
412				*bp++ = n;
413				break;
414
415			case 'x':	/* hex  \x0-9a-fA-F (exactly two) */
416			    {
417				int i;
418
419				if (!isxdigit(peek())) {
420					unput(c);
421					break;
422				}
423				n = 0;
424				for (i = 0; i < 2; i++) {
425					c = input();
426					if (c == 0)
427						break;
428					if (isxdigit(c)) {
429						c = tolower(c);
430						n *= 16;
431						if (isdigit(c))
432							n += (c - '0');
433						else
434							n += 10 + (c - 'a');
435					} else {
436						unput(c);
437						break;
438					}
439				}
440				if (i)
441					*bp++ = n;
442				break;
443			    }
444
445			case 'u':	/* utf  \u0-9a-fA-F (1..8) */
446			    {
447				int i;
448
449				n = 0;
450				for (i = 0; i < 8; i++) {
451					c = input();
452					if (!isxdigit(c) || c == 0)
453						break;
454					c = tolower(c);
455					n *= 16;
456					if (isdigit(c))
457						n += (c - '0');
458					else
459						n += 10 + (c - 'a');
460				}
461				unput(c);
462				bp += runetochar(bp, n);
463				break;
464			    }
465
466			default:
467				*bp++ = c;
468				break;
469			}
470			break;
471		default:
472			*bp++ = c;
473			break;
474		}
475	}
476	*bp = 0;
477	s = tostring(buf);
478	*bp++ = ' '; *bp++ = '\0';
479	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
480	free(s);
481	RET(STRING);
482}
483
484
485static int binsearch(char *w, const Keyword *kp, int n)
486{
487	int cond, low, mid, high;
488
489	low = 0;
490	high = n - 1;
491	while (low <= high) {
492		mid = (low + high) / 2;
493		if ((cond = strcmp(w, kp[mid].word)) < 0)
494			high = mid - 1;
495		else if (cond > 0)
496			low = mid + 1;
497		else
498			return mid;
499	}
500	return -1;
501}
502
503int word(char *w)
504{
505	const Keyword *kp;
506	int c, n;
507
508	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
509	if (n != -1) {	/* found in table */
510		kp = keywords + n;
511		yylval.i = kp->sub;
512		switch (kp->type) {	/* special handling */
513		case BLTIN:
514			if (kp->sub == FSYSTEM && safe)
515				SYNTAX( "system is unsafe" );
516			RET(kp->type);
517		case FUNC:
518			if (infunc)
519				SYNTAX( "illegal nested function" );
520			RET(kp->type);
521		case RETURN:
522			if (!infunc)
523				SYNTAX( "return not in function" );
524			RET(kp->type);
525		case VARNF:
526			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
527			RET(VARNF);
528		default:
529			RET(kp->type);
530		}
531	}
532	c = peek();	/* look for '(' */
533	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
534		yylval.i = n;
535		RET(ARG);
536	} else {
537		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
538		if (c == '(') {
539			RET(CALL);
540		} else {
541			RET(VAR);
542		}
543	}
544}
545
546void startreg(void)	/* next call to yylex will return a regular expression */
547{
548	reg = true;
549}
550
551int regexpr(void)
552{
553	int c;
554	static char *buf = NULL;
555	static int bufsz = 500;
556	char *bp;
557
558	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
559		FATAL("out of space for reg expr");
560	bp = buf;
561	for ( ; (c = input()) != '/' && c != 0; ) {
562		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
563			FATAL("out of space for reg expr %.10s...", buf);
564		if (c == '\n') {
565			*bp = '\0';
566			SYNTAX( "newline in regular expression %.10s...", buf );
567			unput('\n');
568			break;
569		} else if (c == '\\') {
570			*bp++ = '\\';
571			*bp++ = input();
572		} else {
573			*bp++ = c;
574		}
575	}
576	*bp = 0;
577	if (c == 0)
578		SYNTAX("non-terminated regular expression %.10s...", buf);
579	yylval.s = tostring(buf);
580	unput('/');
581	RET(REGEXPR);
582}
583
584/* low-level lexical stuff, sort of inherited from lex */
585
char	ebuf[300];	/* record of characters read: input() stores each one here, wrapping at the end */
char	*ep = ebuf;	/* current position in ebuf: advanced by input(), moved back by unput() */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed-back character */
FILE	*yyin = NULL;	/* not referenced by the code visible in this file */
591
592int input(void)	/* get next lexical input character */
593{
594	int c;
595	extern char *lexprog;
596
597	if (yysptr > yysbuf)
598		c = (uschar)*--yysptr;
599	else if (lexprog != NULL) {	/* awk '...' */
600		if ((c = (uschar)*lexprog) != 0)
601			lexprog++;
602	} else				/* awk -f ... */
603		c = pgetc();
604	if (c == EOF)
605		c = 0;
606	if (ep >= ebuf + sizeof ebuf)
607		ep = ebuf;
608	*ep = c;
609	if (c != 0) {
610		ep++;
611	}
612	return (c);
613}
614
615void unput(int c)	/* put lexical character back on input */
616{
617	if (yysptr >= yysbuf + sizeof(yysbuf))
618		FATAL("pushed back too much: %.20s...", yysbuf);
619	*yysptr++ = c;
620	if (--ep < ebuf)
621		ep = ebuf + sizeof(ebuf) - 1;
622}
623
/*
 * unputstr - push the string s back on the input so that subsequent
 * input() calls return its characters in their original order.
 *
 * Characters are pushed last to first because pushback is last-in,
 * first-out.  An empty string pushes nothing.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	/* count down in size_t: no narrowing to int, no reliance on wraparound */
	size_t n = strlen(s);

	while (n > 0)
		unput(s[--n]);
}