1/****************************************************************
2Copyright (C) Lucent Technologies 1997
3All Rights Reserved
4
5Permission to use, copy, modify, and distribute this software and
6its documentation for any purpose and without fee is hereby
7granted, provided that the above copyright notice appear in all
8copies and that both that the copyright notice and this
9permission notice and warranty disclaimer appear in supporting
10documentation, and that the name Lucent Technologies or any of
11its entities not be used in advertising or publicity pertaining
12to distribution of the software without specific, written prior
13permission.
14
15LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22THIS SOFTWARE.
23****************************************************************/
24
25#include <stdio.h>
26#include <stdlib.h>
27#include <string.h>
28#include <ctype.h>
29#include "awk.h"
30#include "awkgram.tab.h"
31
32extern YYSTYPE yylval;
33extern bool infunc;
34
35int lineno = 1;
36int bracecnt = 0;
37int brackcnt = 0;
38int parencnt = 0;
39
40typedef struct Keyword {
41 const char *word;
42 int sub;
43 int type;
44} Keyword;
45
46const Keyword keywords[] = { /* keep sorted: binary searched */
47 { "BEGIN", XBEGIN, XBEGIN },
48 { "END", XEND, XEND },
49 { "NF", VARNF, VARNF },
50 { "atan2", FATAN, BLTIN },
51 { "break", BREAK, BREAK },
52 { "close", CLOSE, CLOSE },
53 { "continue", CONTINUE, CONTINUE },
54 { "cos", FCOS, BLTIN },
55 { "delete", DELETE, DELETE },
56 { "do", DO, DO },
57 { "else", ELSE, ELSE },
58 { "exit", EXIT, EXIT },
59 { "exp", FEXP, BLTIN },
60 { "fflush", FFLUSH, BLTIN },
61 { "for", FOR, FOR },
62 { "func", FUNC, FUNC },
63 { "function", FUNC, FUNC },
64 { "getline", GETLINE, GETLINE },
65 { "gsub", GSUB, GSUB },
66 { "if", IF, IF },
67 { "in", IN, IN },
68 { "index", INDEX, INDEX },
69 { "int", FINT, BLTIN },
70 { "length", FLENGTH, BLTIN },
71 { "log", FLOG, BLTIN },
72 { "match", MATCHFCN, MATCHFCN },
73 { "next", NEXT, NEXT },
74 { "nextfile", NEXTFILE, NEXTFILE },
75 { "print", PRINT, PRINT },
76 { "printf", PRINTF, PRINTF },
77 { "rand", FRAND, BLTIN },
78 { "return", RETURN, RETURN },
79 { "sin", FSIN, BLTIN },
80 { "split", SPLIT, SPLIT },
81 { "sprintf", SPRINTF, SPRINTF },
82 { "sqrt", FSQRT, BLTIN },
83 { "srand", FSRAND, BLTIN },
84 { "sub", SUB, SUB },
85 { "substr", SUBSTR, SUBSTR },
86 { "system", FSYSTEM, BLTIN },
87 { "tolower", FTOLOWER, BLTIN },
88 { "toupper", FTOUPPER, BLTIN },
89 { "while", WHILE, WHILE },
90};
91
92#define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); }
93
94static int peek(void)
95{
96 int c = input();
97 unput(c);
98 return c;
99}
100
101static int gettok(char **pbuf, int *psz) /* get next input token */
102{
103 int c, retc;
104 char *buf = *pbuf;
105 int sz = *psz;
106 char *bp = buf;
107
108 c = input();
109 if (c == 0)
110 return 0;
111 buf[0] = c;
112 buf[1] = 0;
113 if (!isalnum(c) && c != '.' && c != '_')
114 return c;
115
116 *bp++ = c;
117 if (isalpha(c) || c == '_') { /* it's a varname */
118 for ( ; (c = input()) != 0; ) {
119 if (bp-buf >= sz)
120 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
121 FATAL( "out of space for name %.10s...", buf );
122 if (isalnum(c) || c == '_')
123 *bp++ = c;
124 else {
125 *bp = 0;
126 unput(c);
127 break;
128 }
129 }
130 *bp = 0;
131 retc = 'a'; /* alphanumeric */
132 } else { /* maybe it's a number, but could be . */
133 char *rem;
134 /* read input until can't be a number */
135 for ( ; (c = input()) != 0; ) {
136 if (bp-buf >= sz)
137 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
138 FATAL( "out of space for number %.10s...", buf );
139 if (isdigit(c) || c == 'e' || c == 'E'
140 || c == '.' || c == '+' || c == '-')
141 *bp++ = c;
142 else {
143 unput(c);
144 break;
145 }
146 }
147 *bp = 0;
148 strtod(buf, &rem); /* parse the number */
149 if (rem == buf) { /* it wasn't a valid number at all */
150 buf[1] = 0; /* return one character as token */
151 retc = (uschar)buf[0]; /* character is its own type */
152 unputstr(rem+1); /* put rest back for later */
153 } else { /* some prefix was a number */
154 unputstr(rem); /* put rest back for later */
155 rem[0] = 0; /* truncate buf after number part */
156 retc = '0'; /* type is number */
157 }
158 }
159 *pbuf = buf;
160 *psz = sz;
161 return retc;
162}
163
164int word(char *);
165int string(void);
166int regexpr(void);
167bool sc = false; /* true => return a } right now */
168bool reg = false; /* true => return a REGEXPR now */
169
170int yylex(void)
171{
172 int c;
173 static char *buf = NULL;
174 static int bufsize = 5; /* BUG: setting this small causes core dump! */
175
176 if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
177 FATAL( "out of space in yylex" );
178 if (sc) {
179 sc = false;
180 RET('}');
181 }
182 if (reg) {
183 reg = false;
184 return regexpr();
185 }
186 for (;;) {
187 c = gettok(&buf, &bufsize);
188 if (c == 0)
189 return 0;
190 if (isalpha(c) || c == '_')
191 return word(buf);
192 if (isdigit(c)) {
193 char *cp = tostring(buf);
194 double result;
195
196 if (is_number(cp, & result))
197 yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab);
198 else
199 yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab);
200 free(cp);
201 /* should this also have STR set? */
202 RET(NUMBER);
203 }
204
205 yylval.i = c;
206 switch (c) {
207 case '\n': /* {EOL} */
208 lineno++;
209 RET(NL);
210 case '\r': /* assume \n is coming */
211 case ' ': /* {WS}+ */
212 case '\t':
213 break;
214 case '#': /* #.* strip comments */
215 while ((c = input()) != '\n' && c != 0)
216 ;
217 unput(c);
218 break;
219 case ';':
220 RET(';');
221 case '\\':
222 if (peek() == '\n') {
223 input();
224 lineno++;
225 } else if (peek() == '\r') {
226 input(); input(); /* \n */
227 lineno++;
228 } else {
229 RET(c);
230 }
231 break;
232 case '&':
233 if (peek() == '&') {
234 input(); RET(AND);
235 } else
236 RET('&');
237 case '|':
238 if (peek() == '|') {
239 input(); RET(BOR);
240 } else
241 RET('|');
242 case '!':
243 if (peek() == '=') {
244 input(); yylval.i = NE; RET(NE);
245 } else if (peek() == '~') {
246 input(); yylval.i = NOTMATCH; RET(MATCHOP);
247 } else
248 RET(NOT);
249 case '~':
250 yylval.i = MATCH;
251 RET(MATCHOP);
252 case '<':
253 if (peek() == '=') {
254 input(); yylval.i = LE; RET(LE);
255 } else {
256 yylval.i = LT; RET(LT);
257 }
258 case '=':
259 if (peek() == '=') {
260 input(); yylval.i = EQ; RET(EQ);
261 } else {
262 yylval.i = ASSIGN; RET(ASGNOP);
263 }
264 case '>':
265 if (peek() == '=') {
266 input(); yylval.i = GE; RET(GE);
267 } else if (peek() == '>') {
268 input(); yylval.i = APPEND; RET(APPEND);
269 } else {
270 yylval.i = GT; RET(GT);
271 }
272 case '+':
273 if (peek() == '+') {
274 input(); yylval.i = INCR; RET(INCR);
275 } else if (peek() == '=') {
276 input(); yylval.i = ADDEQ; RET(ASGNOP);
277 } else
278 RET('+');
279 case '-':
280 if (peek() == '-') {
281 input(); yylval.i = DECR; RET(DECR);
282 } else if (peek() == '=') {
283 input(); yylval.i = SUBEQ; RET(ASGNOP);
284 } else
285 RET('-');
286 case '*':
287 if (peek() == '=') { /* *= */
288 input(); yylval.i = MULTEQ; RET(ASGNOP);
289 } else if (peek() == '*') { /* ** or **= */
290 input(); /* eat 2nd * */
291 if (peek() == '=') {
292 input(); yylval.i = POWEQ; RET(ASGNOP);
293 } else {
294 RET(POWER);
295 }
296 } else
297 RET('*');
298 case '/':
299 RET('/');
300 case '%':
301 if (peek() == '=') {
302 input(); yylval.i = MODEQ; RET(ASGNOP);
303 } else
304 RET('%');
305 case '^':
306 if (peek() == '=') {
307 input(); yylval.i = POWEQ; RET(ASGNOP);
308 } else
309 RET(POWER);
310
311 case '$':
312 /* BUG: awkward, if not wrong */
313 c = gettok(&buf, &bufsize);
314 if (isalpha(c)) {
315 if (strcmp(buf, "NF") == 0) { /* very special */
316 unputstr("(NF)");
317 RET(INDIRECT);
318 }
319 c = peek();
320 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
321 unputstr(buf);
322 RET(INDIRECT);
323 }
324 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
325 RET(IVAR);
326 } else if (c == 0) { /* */
327 SYNTAX( "unexpected end of input after $" );
328 RET(';');
329 } else {
330 unputstr(buf);
331 RET(INDIRECT);
332 }
333
334 case '}':
335 if (--bracecnt < 0)
336 SYNTAX( "extra }" );
337 sc = true;
338 RET(';');
339 case ']':
340 if (--brackcnt < 0)
341 SYNTAX( "extra ]" );
342 RET(']');
343 case ')':
344 if (--parencnt < 0)
345 SYNTAX( "extra )" );
346 RET(')');
347 case '{':
348 bracecnt++;
349 RET('{');
350 case '[':
351 brackcnt++;
352 RET('[');
353 case '(':
354 parencnt++;
355 RET('(');
356
357 case '"':
358 return string(); /* BUG: should be like tran.c ? */
359
360 default:
361 RET(c);
362 }
363 }
364}
365
366extern int runetochar(char *str, int c);
367
368int string(void)
369{
370 int c, n;
371 char *s, *bp;
372 static char *buf = NULL;
373 static int bufsz = 500;
374
375 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
376 FATAL("out of space for strings");
377 for (bp = buf; (c = input()) != '"'; ) {
378 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
379 FATAL("out of space for string %.10s...", buf);
380 switch (c) {
381 case '\n':
382 case '\r':
383 case 0:
384 *bp = '\0';
385 SYNTAX( "non-terminated string %.10s...", buf );
386 if (c == 0) /* hopeless */
387 FATAL( "giving up" );
388 lineno++;
389 break;
390 case '\\':
391 c = input();
392 switch (c) {
393 case '\n': break;
394 case '"': *bp++ = '"'; break;
395 case 'n': *bp++ = '\n'; break;
396 case 't': *bp++ = '\t'; break;
397 case 'f': *bp++ = '\f'; break;
398 case 'r': *bp++ = '\r'; break;
399 case 'b': *bp++ = '\b'; break;
400 case 'v': *bp++ = '\v'; break;
401 case 'a': *bp++ = '\a'; break;
402 case '\\': *bp++ = '\\'; break;
403
404 case '0': case '1': case '2': /* octal: \d \dd \ddd */
405 case '3': case '4': case '5': case '6': case '7':
406 n = c - '0';
407 if ((c = peek()) >= '0' && c < '8') {
408 n = 8 * n + input() - '0';
409 if ((c = peek()) >= '0' && c < '8')
410 n = 8 * n + input() - '0';
411 }
412 *bp++ = n;
413 break;
414
415 case 'x': /* hex \x0-9a-fA-F (exactly two) */
416 {
417 int i;
418
419 if (!isxdigit(peek())) {
420 unput(c);
421 break;
422 }
423 n = 0;
424 for (i = 0; i < 2; i++) {
425 c = input();
426 if (c == 0)
427 break;
428 if (isxdigit(c)) {
429 c = tolower(c);
430 n *= 16;
431 if (isdigit(c))
432 n += (c - '0');
433 else
434 n += 10 + (c - 'a');
435 } else {
436 unput(c);
437 break;
438 }
439 }
440 if (i)
441 *bp++ = n;
442 break;
443 }
444
445 case 'u': /* utf \u0-9a-fA-F (1..8) */
446 {
447 int i;
448
449 n = 0;
450 for (i = 0; i < 8; i++) {
451 c = input();
452 if (!isxdigit(c) || c == 0)
453 break;
454 c = tolower(c);
455 n *= 16;
456 if (isdigit(c))
457 n += (c - '0');
458 else
459 n += 10 + (c - 'a');
460 }
461 unput(c);
462 bp += runetochar(bp, n);
463 break;
464 }
465
466 default:
467 *bp++ = c;
468 break;
469 }
470 break;
471 default:
472 *bp++ = c;
473 break;
474 }
475 }
476 *bp = 0;
477 s = tostring(buf);
478 *bp++ = ' '; *bp++ = '\0';
479 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
480 free(s);
481 RET(STRING);
482}
483
484
485static int binsearch(char *w, const Keyword *kp, int n)
486{
487 int cond, low, mid, high;
488
489 low = 0;
490 high = n - 1;
491 while (low <= high) {
492 mid = (low + high) / 2;
493 if ((cond = strcmp(w, kp[mid].word)) < 0)
494 high = mid - 1;
495 else if (cond > 0)
496 low = mid + 1;
497 else
498 return mid;
499 }
500 return -1;
501}
502
503int word(char *w)
504{
505 const Keyword *kp;
506 int c, n;
507
508 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
509 if (n != -1) { /* found in table */
510 kp = keywords + n;
511 yylval.i = kp->sub;
512 switch (kp->type) { /* special handling */
513 case BLTIN:
514 if (kp->sub == FSYSTEM && safe)
515 SYNTAX( "system is unsafe" );
516 RET(kp->type);
517 case FUNC:
518 if (infunc)
519 SYNTAX( "illegal nested function" );
520 RET(kp->type);
521 case RETURN:
522 if (!infunc)
523 SYNTAX( "return not in function" );
524 RET(kp->type);
525 case VARNF:
526 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
527 RET(VARNF);
528 default:
529 RET(kp->type);
530 }
531 }
532 c = peek(); /* look for '(' */
533 if (c != '(' && infunc && (n=isarg(w)) >= 0) {
534 yylval.i = n;
535 RET(ARG);
536 } else {
537 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
538 if (c == '(') {
539 RET(CALL);
540 } else {
541 RET(VAR);
542 }
543 }
544}
545
546void startreg(void) /* next call to yylex will return a regular expression */
547{
548 reg = true;
549}
550
551int regexpr(void)
552{
553 int c;
554 static char *buf = NULL;
555 static int bufsz = 500;
556 char *bp;
557
558 if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
559 FATAL("out of space for reg expr");
560 bp = buf;
561 for ( ; (c = input()) != '/' && c != 0; ) {
562 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
563 FATAL("out of space for reg expr %.10s...", buf);
564 if (c == '\n') {
565 *bp = '\0';
566 SYNTAX( "newline in regular expression %.10s...", buf );
567 unput('\n');
568 break;
569 } else if (c == '\\') {
570 *bp++ = '\\';
571 *bp++ = input();
572 } else {
573 *bp++ = c;
574 }
575 }
576 *bp = 0;
577 if (c == 0)
578 SYNTAX("non-terminated regular expression %.10s...", buf);
579 yylval.s = tostring(buf);
580 unput('/');
581 RET(REGEXPR);
582}
583
584/* low-level lexical stuff, sort of inherited from lex */
585
586char ebuf[300];
587char *ep = ebuf;
588char yysbuf[100]; /* pushback buffer */
589char *yysptr = yysbuf;
590FILE *yyin = NULL;
591
592int input(void) /* get next lexical input character */
593{
594 int c;
595 extern char *lexprog;
596
597 if (yysptr > yysbuf)
598 c = (uschar)*--yysptr;
599 else if (lexprog != NULL) { /* awk '...' */
600 if ((c = (uschar)*lexprog) != 0)
601 lexprog++;
602 } else /* awk -f ... */
603 c = pgetc();
604 if (c == EOF)
605 c = 0;
606 if (ep >= ebuf + sizeof ebuf)
607 ep = ebuf;
608 *ep = c;
609 if (c != 0) {
610 ep++;
611 }
612 return (c);
613}
614
615void unput(int c) /* put lexical character back on input */
616{
617 if (yysptr >= yysbuf + sizeof(yysbuf))
618 FATAL("pushed back too much: %.20s...", yysbuf);
619 *yysptr++ = c;
620 if (--ep < ebuf)
621 ep = ebuf + sizeof(ebuf) - 1;
622}
623
624void unputstr(const char *s) /* put a string back on input */
625{
626 int i;
627
628 for (i = strlen(s)-1; i >= 0; i--)
629 unput(s[i]);
630}