master xplshn/aruu / cmd / posix / sed.c
   1
   2
   3/* FIXME: summary
   4 * decide whether we enforce valid UTF-8, right now it's enforced in certain
   5 *     parts of the script, but not the input...
   6 * nul bytes cause explosions due to use of libc string functions. thoughts?
   7 * lack of newline at end of file, currently we add one. what should we do?
   8 * allow "\\t" for "\t" etc. in regex? in replacement text?
   9 * POSIX says don't flush on N when out of input, but GNU and busybox do.
  10 */
  11
  12#include "config.h"
  13#include "utf.h"
  14#include "util.h"
  15
  16#include <ctype.h>
  17#include <errno.h>
  18#include <libgen.h>
  19#include <regex.h>
  20#include <stdlib.h>
  21#include <string.h>
  22#include <sys/stat.h>
  23#include <unistd.h>
  24
  25/* Types */
  26
/* Growable array of untyped pointers.
 * Used as a queue for scheduled writes and as a stack for {, :, b, t.
 * Some users store integer offsets cast to void * instead of real pointers
 * (see push() call sites).
 */
typedef struct {
	void **data; /* element storage, grown by push() */
	size_t size; /* number of elements currently stored */
	size_t cap;  /* number of elements data has room for */
} Vec;
  33
/* Growable C string. str is always NUL-terminated once allocated; cap is the
 * number of bytes allocated for str. cap == 0 is treated as "not allocated
 * yet" by the append helpers.
 * FIXME: does it make sense to keep track of length? or just rely on libc
 *        string functions? If we want to support nul bytes everything changes
 */
typedef struct {
	char  *str; /* NUL-terminated contents */
	size_t cap; /* allocated size of str in bytes */
} String;
  42
typedef struct Cmd Cmd;
/* Static description of one sed function (one entry of the fns[] table). */
typedef struct {
	void  (*fn)(Cmd *);             /* executes the command; NULL marks an unused table slot */
	char *(*getarg)(Cmd *, char *); /* parses the argument text, returns pointer past it; may be NULL */
	void  (*freearg)(Cmd *);        /* releases what getarg allocated; may be NULL */
	unsigned char naddr;            /* maximum number of addresses the command accepts */
} Fninfo;
  50
/* One address of a command. Which member of u is meaningful depends on type:
 * lineno for LINE, re for REGEX; the other types carry no payload.
 */
typedef struct {
	union {
		size_t   lineno;
		regex_t *re;
	} u;
	enum {
		IGNORE, /* empty address, ignore        */
		EVERY , /* every line                   */
		LINE  , /* line number                  */
		LAST  , /* last line ($)                */
		REGEX , /* use included regex           */
		LASTRE, /* use most recently used regex */
	} type;
} Addr;
  65
/* Address range of a command (zero, one or two addresses).
 * DISCUSS: naddr is not strictly necessary, but very helpful
 * naddr == 0 iff beg.type == EVERY  && end.type == IGNORE
 * naddr == 1 iff beg.type != IGNORE && end.type == IGNORE
 * naddr == 2 iff beg.type != IGNORE && end.type != IGNORE
 */
typedef struct {
	Addr          beg;   /* first address (EVERY when none was given) */
	Addr          end;   /* second address (IGNORE when absent) */
	unsigned char naddr; /* number of addresses given, computed by make_range() */
} Range;
  76
/* Parsed argument of the s command, filled in by get_s_arg(). */
typedef struct {
	regex_t      *re; /* if NULL use last regex */
	String        repl; /* replacement text, may be built up over several script lines */
	FILE         *file; /* target of the w flag, NULL if the flag was not given */
	size_t        occurrence; /* 0 for all (g flag) */
	Rune          delim; /* delimiter, remembered so continuation lines can find the end */
	unsigned int  p:1; /* p flag was given */
} Sarg;
  85
/* Parsed argument of the y command: two NUL-terminated Rune arrays with the
 * same number of runes (enforced by get_y_arg()).
 */
typedef struct {
	Rune *set1; /* runes to look for */
	Rune *set2; /* replacement rune at the same index */
} Yarg;
  90
/* Parsed argument shared by the a, c, i and r commands. */
typedef struct {
	String str; /* a,c,i text. r file path */
	void  (*print)(char *, FILE *); /* check_puts for a, write_file for r, unused for c,i */
} ACIRarg;
  95
/* One compiled sed command: its address range, its function descriptor and
 * the function specific argument. Which member of u is valid is decided by
 * fninfo (and, for b/t, by whether labels have been resolved yet).
 */
struct Cmd {
	Range   range;
	Fninfo *fninfo;
	union {
		Cmd      *jump;   /* used for   b,t when running  */
		char     *label;  /* used for :,b,t when building */
		ptrdiff_t offset; /* used for { (pointers break during realloc) */
		FILE     *file;   /* used for w */

		/* FIXME: Should the following be in the union? or pointers and malloc? */
		Sarg      s;
		Yarg      y;
		ACIRarg   acir;
	} u; /* I find your lack of anonymous unions disturbing */
	unsigned int in_match:1; /* cleared at compile time; presumably set while inside a 2-address range -- confirm in in_range() */
	unsigned int negate  :1; /* command was prefixed with '!' */
};
 113
/* Files for w command (and s' w flag).
 * get_w_arg() keeps one entry per distinct path so that several commands
 * writing to the same path share one stream.
 */
typedef struct {
	char *path; /* path as written in the script */
	FILE *file; /* stream opened with mode "w" */
} Wfile;
 119
 120/*
 121 * Function Declarations
 122 */
 123
/* Dynamically allocated arrays and strings
 * resize() is the single (re)allocation primitive; push/pop operate on Vec,
 * the str* helpers on String.
 */
static void resize(void **ptr, size_t *nmemb, size_t size, size_t new_nmemb, void **next);
static void *pop(Vec *v);
static void push(Vec *v, void *p);
static void stracat(String *dst, char *src);
static void strnacat(String *dst, char *src, size_t n);
static void stracpy(String *dst, char *src);

/* Cleanup and errors
 * (leprintf() is defined before its first use and has no prototype here)
 */
static void usage(void);

/* Parsing functions and related utilities */
static void compile(char *s, int isfile);
static int read_line(FILE *f, String *s);
static char *make_range(Range *range, char *s);
static char *make_addr(Addr *addr, char *s);
static char *find_delim(char *s, Rune delim, int do_brackets);
static char *chompr(char *s, Rune rune);
static char *chomp(char *s);
static Rune *strtorunes(char *s, size_t nrunes);
static long stol(char *s, char **endp);
static size_t escapes(char *beg, char *end, Rune delim, int n_newline);
static size_t echarntorune(Rune *r, char *s, size_t n);
static void insert_labels(void);

/* Get and Free arg and related utilities
 * get_*_arg functions are the Fninfo.getarg callbacks: they read a command's
 * argument from the script text and return a pointer just past it.
 */
static char *get_aci_arg(Cmd *c, char *s);
static void aci_append(Cmd *c, char *s);
static void free_acir_arg(Cmd *c);
static char *get_bt_arg(Cmd *c, char *s);
static char *get_r_arg(Cmd *c, char *s);
static char *get_s_arg(Cmd *c, char *s);
static void free_s_arg(Cmd *c);
static char *get_w_arg(Cmd *c, char *s);
static char *get_y_arg(Cmd *c, char *s);
static void free_y_arg(Cmd *c);
static char *get_colon_arg(Cmd *c, char *s);
static char *get_lbrace_arg(Cmd *c, char *s);
static char *get_rbrace_arg(Cmd *c, char *s);
static char *semicolon_arg(char *s);

/* Running */
static void run(void);
static int in_range(Cmd *c);
static int match_addr(Addr *a);
static int next_file(void);
static int is_eof(FILE *f);
static void do_writes(void);
static void write_file(char *path, FILE *out);
static void check_puts(char *s, FILE *f);
static void write_patt(char *s, FILE *f);
static void update_ranges(Cmd *beg, Cmd *end);

/* Sed functions
 * One cmd_* per sed function; these are the Fninfo.fn callbacks in fns[].
 */
static void cmd_y(Cmd *c);
static void cmd_x(Cmd *c);
static void cmd_w(Cmd *c);
static void cmd_t(Cmd *c);
static void cmd_s(Cmd *c);
static void cmd_r(Cmd *c);
static void cmd_q(Cmd *c);
static void cmd_P(Cmd *c);
static void cmd_p(Cmd *c);
static void cmd_N(Cmd *c);
static void cmd_n(Cmd *c);
static void cmd_l(Cmd *c);
static void cmd_i(Cmd *c);
static void cmd_H(Cmd *c);
static void cmd_h(Cmd *c);
static void cmd_G(Cmd *c);
static void cmd_g(Cmd *c);
static void cmd_D(Cmd *c);
static void cmd_d(Cmd *c);
static void cmd_c(Cmd *c);
static void cmd_b(Cmd *c);
static void cmd_a(Cmd *c);
static void cmd_colon(Cmd *c);
static void cmd_equal(Cmd *c);
static void cmd_lbrace(Cmd *c);
static void cmd_rbrace(Cmd *c);
static void cmd_last(Cmd *c);

/* Actions */
static void new_line(void);
static void app_line(void);
static void new_next(void);
static void old_next(void);
 211
 212/*
 213 * Globals
 214 */
 215static Vec braces, labels, branches; /* holds ptrdiff_t. addrs of {, :, bt */
 216static Vec writes; /* holds cmd*. writes scheduled by a and r commands */
 217static Vec wfiles; /* holds Wfile*. files for w and s///w commands */
 218
 219static Cmd   *prog, *pc; /* Program, program counter */
 220static size_t pcap;
 221static size_t lineno;
 222#if FEATURE_SED_PRESERVE_NEWLINE
 223static int hadnl = 1;
 224#endif
 225
 226static regex_t *lastre; /* last used regex for empty regex search */
 227static char   **files;  /* list of file names from argv */
 228static FILE    *file;   /* current file we are reading */
 229static int      ret;    /* exit status */
 230
 231static String patt, hold, genbuf;
 232
 233static struct {
 234	unsigned int n       :1; /* -n (no print) */
 235	unsigned int E       :1; /* -E (extended re) */
 236	unsigned int s       :1; /* s/// replacement happened */
 237	unsigned int aci_cont:1; /* a,c,i text continuation */
 238	unsigned int s_cont  :1; /* s/// replacement text continuation */
 239	unsigned int halt    :1; /* halt execution */
 240} gflags;
 241
/* Dispatch table indexed by the (ASCII) command character.
 * Columns: function that runs the command, argument parser (NULL if the
 * command takes no argument), argument destructor (NULL if nothing to free),
 * maximum number of addresses accepted. Slots not listed are zero-filled, so
 * a NULL fn means "no such command" (checked in compile()).
 *
 * FIXME: move character inside Fninfo and only use 26*sizeof(Fninfo) instead of 127*sizeof(Fninfo) bytes
 */
static Fninfo fns[] = {
	['a'] = { cmd_a     , get_aci_arg   , free_acir_arg , 1 }, /* schedule write of text for later                                                      */
	['b'] = { cmd_b     , get_bt_arg    , NULL          , 2 }, /* branch to label char *label when building, Cmd *jump when running                     */
	['c'] = { cmd_c     , get_aci_arg   , free_acir_arg , 2 }, /* delete pattern space. at 0 or 1 addr or end of 2 addr, write text                     */
	['d'] = { cmd_d     , NULL          , NULL          , 2 }, /* delete pattern space                                                                  */
	['D'] = { cmd_D     , NULL          , NULL          , 2 }, /* delete to first newline and start new cycle without reading (if no newline, d)        */
	['g'] = { cmd_g     , NULL          , NULL          , 2 }, /* replace pattern space with hold space                                                 */
	['G'] = { cmd_G     , NULL          , NULL          , 2 }, /* append newline and hold space to pattern space                                        */
	['h'] = { cmd_h     , NULL          , NULL          , 2 }, /* replace hold space with pattern space                                                 */
	['H'] = { cmd_H     , NULL          , NULL          , 2 }, /* append newline and pattern space to hold space                                        */
	['i'] = { cmd_i     , get_aci_arg   , free_acir_arg , 1 }, /* write text                                                                            */
	['l'] = { cmd_l     , NULL          , NULL          , 2 }, /* write pattern space in 'visually unambiguous form'                                    */
	['n'] = { cmd_n     , NULL          , NULL          , 2 }, /* write pattern space (unless -n) read to replace pattern space (if no input, quit)     */
	['N'] = { cmd_N     , NULL          , NULL          , 2 }, /* append to pattern space separated by newline, line number changes (if no input, quit) */
	['p'] = { cmd_p     , NULL          , NULL          , 2 }, /* write pattern space                                                                   */
	['P'] = { cmd_P     , NULL          , NULL          , 2 }, /* write pattern space up to first newline                                               */
	['q'] = { cmd_q     , NULL          , NULL          , 1 }, /* quit                                                                                  */
	['r'] = { cmd_r     , get_r_arg     , free_acir_arg , 1 }, /* write contents of file (unable to open/read treated as empty file)                    */
	['s'] = { cmd_s     , get_s_arg     , free_s_arg    , 2 }, /* find/replace/all that crazy s stuff                                                   */
	['t'] = { cmd_t     , get_bt_arg    , NULL          , 2 }, /* if s/// succeeded (since input or last t) branch to label (branch to end if no label) */
	['w'] = { cmd_w     , get_w_arg     , NULL          , 2 }, /* append pattern space to file                                                          */
	['x'] = { cmd_x     , NULL          , NULL          , 2 }, /* exchange pattern and hold spaces                                                      */
	['y'] = { cmd_y     , get_y_arg     , free_y_arg    , 2 }, /* replace runes in set1 with runes in set2                                              */
	[':'] = { cmd_colon , get_colon_arg , NULL          , 0 }, /* defines label for later b and t commands                                              */
	['='] = { cmd_equal , NULL          , NULL          , 1 }, /* printf("%d\n", line_number);                                                          */
	['{'] = { cmd_lbrace, get_lbrace_arg, NULL          , 2 }, /* if we match, run commands, otherwise jump to close                                    */
	['}'] = { cmd_rbrace, get_rbrace_arg, NULL          , 0 }, /* noop, hold onto open for ease of building scripts                                     */

	[0x7f] = { NULL, NULL, NULL, 0 }, /* index is checked with isascii(3p). fill out rest of array */
};
 273
 274/*
 275 * Function Definitions
 276 */
 277
 278/* given memory pointed to by *ptr that currently holds *nmemb members of size
 279 * size, realloc to hold new_nmemb members, return new_nmemb in *memb and one
 280 * past old end in *next. if realloc fails...explode
 281 */
 282static void
 283resize(void **ptr, size_t *nmemb, size_t size, size_t new_nmemb, void **next)
 284{
 285	void *n, *tmp;
 286
 287	if (new_nmemb) {
 288		tmp = ereallocarray(*ptr, new_nmemb, size);
 289	} else { /* turns out realloc(*ptr, 0) != free(*ptr) */
 290		free(*ptr);
 291		tmp = NULL;
 292	}
 293	n = (char *)tmp + *nmemb * size;
 294	*nmemb = new_nmemb;
 295	*ptr   = tmp;
 296	if (next)
 297		*next = n;
 298}
 299
 300static void *
 301pop(Vec *v)
 302{
 303	if (!v->size)
 304		return NULL;
 305	return v->data[--v->size];
 306}
 307
 308static void
 309push(Vec *v, void *p)
 310{
 311	if (v->size == v->cap)
 312		resize((void **)&v->data, &v->cap, sizeof(*v->data), v->cap * 2 + 1, NULL);
 313	v->data[v->size++] = p;
 314}
 315
 316static void
 317stracat(String *dst, char *src)
 318{
 319	int new = !dst->cap;
 320	size_t len;
 321
 322	len = (new ? 0 : strlen(dst->str)) + strlen(src) + 1;
 323	if (dst->cap < len)
 324		resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL);
 325	if (new)
 326		*dst->str = '\0';
 327	strcat(dst->str, src);
 328}
 329
 330static void
 331strnacat(String *dst, char *src, size_t n)
 332{
 333	int new = !dst->cap;
 334	size_t len;
 335
 336	len = strlen(src);
 337	len = (new ? 0 : strlen(dst->str)) + MIN(n, len) + 1;
 338	if (dst->cap < len)
 339		resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL);
 340	if (new)
 341		*dst->str = '\0';
 342	strlcat(dst->str, src, len);
 343}
 344
 345static void
 346stracpy(String *dst, char *src)
 347{
 348	size_t len;
 349
 350	len = strlen(src) + 1;
 351	if (dst->cap < len)
 352		resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL);
 353	strcpy(dst->str, src);
 354}
 355
 356static void
 357leprintf(char *s)
 358{
 359	if (errno)
 360		eprintf("%zu: %s: %s\n", lineno, s, strerror(errno));
 361	else
 362		eprintf("%zu: %s\n", lineno, s);
 363}
 364
 365/* FIXME: write usage message */
 366#if FEATURE_SED_INPLACE
 367static int iflag = 0;
 368static char *backup_suffix = NULL;
 369
 370static int
 371create_temp_file(const char *orig_path, char **temp_path)
 372{
 373	char *dir, *dircopy, *tmpl;
 374	int fd;
 375
 376	dircopy = estrdup(orig_path);
 377	dir = dirname(dircopy);
 378	tmpl = emalloc(strlen(dir) + 16);
 379	sprintf(tmpl, "%s/sedtmpXXXXXX", dir);
 380	free(dircopy);
 381
 382	fd = mkstemp(tmpl);
 383	if (fd < 0) {
 384		free(tmpl);
 385		return -1;
 386	}
 387	*temp_path = tmpl;
 388	return fd;
 389}
 390#endif
 391
/* Report the command line synopsis through eprintf(). */
static void
usage(void)
{
	eprintf("usage: sed [-nrE] script [file ...]\n"
	        "       sed [-nrE] -e script [-e script] ... [-f scriptfile] ... [file ...]\n"
	        "       sed [-nrE] [-e script] ... -f scriptfile [-f scriptfile] ... [file ...]\n");
}
 399
 400/* Differences from POSIX
 401 * we allows semicolons and trailing blanks inside {}
 402 * we allow spaces after ! (and in between !s)
 403 * we allow extended regular expressions (-E)
 404 */
 405static void
 406compile(char *s, int isfile)
 407{
 408	FILE *f;
 409
 410	if (isfile) {
 411		f = fopen(s, "r");
 412		if (!f)
 413			eprintf("fopen %s:", s);
 414	} else {
 415		if (!*s) /* empty string script */
 416			return;
 417		f = fmemopen(s, strlen(s), "r");
 418		if (!f)
 419			eprintf("fmemopen:");
 420	}
 421
 422	/* NOTE: get arg functions can't use genbuf */
 423	while (read_line(f, &genbuf) != EOF) {
 424		s = genbuf.str;
 425
 426		/* if the first two characters of the script are "#n" default output shall be suppressed */
 427		if (++lineno == 1 && *s == '#' && s[1] == 'n') {
 428			gflags.n = 1;
 429			continue;
 430		}
 431
 432		if (gflags.aci_cont) {
 433			aci_append(pc - 1, s);
 434			continue;
 435		}
 436		if (gflags.s_cont)
 437			s = (pc - 1)->fninfo->getarg(pc - 1, s);
 438
 439		while (*s) {
 440			s = chompr(s, ';');
 441			if (!*s || *s == '#')
 442				break;
 443
 444			if ((size_t)(pc - prog) == pcap)
 445				resize((void **)&prog, &pcap, sizeof(*prog), pcap * 2 + 1, (void **)&pc);
 446
 447			pc->range.beg.type = pc->range.end.type = IGNORE;
 448			pc->fninfo = NULL;
 449			pc->in_match = 0;
 450
 451			s = make_range(&pc->range, s);
 452			s = chomp(s);
 453			pc->negate = *s == '!';
 454			s = chompr(s, '!');
 455
 456			if (!isascii(*s) || !(pc->fninfo = &fns[(unsigned)*s])->fn)
 457				leprintf("bad sed function");
 458			if (pc->range.naddr > pc->fninfo->naddr)
 459				leprintf("wrong number of addresses");
 460			s++;
 461
 462			if (pc->fninfo->getarg)
 463				s = pc->fninfo->getarg(pc, s);
 464
 465			pc++;
 466		}
 467	}
 468
 469	fshut(f, s);
 470}
 471
 472/* FIXME: if we decide to honor lack of trailing newline, set/clear a global
 473 * flag when reading a line
 474 */
 475static int
 476read_line(FILE *f, String *s)
 477{
 478	ssize_t len;
 479
 480	if (!f)
 481		return EOF;
 482
 483	if ((len = getline(&s->str, &s->cap, f)) < 0) {
 484		if (ferror(f))
 485			eprintf("getline:");
 486		return EOF;
 487	}
 488#if FEATURE_SED_PRESERVE_NEWLINE
 489	if (len > 0)
 490		hadnl = (s->str[len - 1] == '\n');
 491#endif
 492	if (s->str[--len] == '\n')
 493		s->str[len] = '\0';
 494	return 0;
 495}
 496
 497/* read first range from s, return pointer to one past end of range */
 498static char *
 499make_range(Range *range, char *s)
 500{
 501	s = make_addr(&range->beg, s);
 502
 503	if (*s == ',')
 504		s = make_addr(&range->end, s + 1);
 505	else
 506		range->end.type = IGNORE;
 507
 508	if      (range->beg.type == EVERY  && range->end.type == IGNORE) range->naddr = 0;
 509	else if (range->beg.type != IGNORE && range->end.type == IGNORE) range->naddr = 1;
 510	else if (range->beg.type != IGNORE && range->end.type != IGNORE) range->naddr = 2;
 511	else leprintf("this is impossible...");
 512
 513	return s;
 514}
 515
 516/* read first addr from s, return pointer to one past end of addr */
 517static char *
 518make_addr(Addr *addr, char *s)
 519{
 520	Rune r;
 521	char *p = s + strlen(s);
 522	size_t rlen = echarntorune(&r, s, p - s);
 523
 524	if (r == '$') {
 525		addr->type = LAST;
 526		s += rlen;
 527	} else if (isdigitrune(r)) {
 528		addr->type = LINE;
 529		addr->u.lineno = stol(s, &s);
 530	} else if (r == '/' || r == '\\') {
 531		Rune delim;
 532		if (r == '\\') {
 533			s += rlen;
 534			rlen = echarntorune(&r, s, p - s);
 535		}
 536		if (r == '\\')
 537			leprintf("bad delimiter '\\'");
 538		delim = r;
 539		s += rlen;
 540		rlen = echarntorune(&r, s, p - s);
 541		if (r == delim) {
 542			addr->type = LASTRE;
 543			s += rlen;
 544		} else {
 545			addr->type = REGEX;
 546			p = find_delim(s, delim, 1);
 547			if (!*p)
 548				leprintf("unclosed regex");
 549			p -= escapes(s, p, delim, 0);
 550			*p++ = '\0';
 551			addr->u.re = emalloc(sizeof(*addr->u.re));
 552			eregcomp(addr->u.re, s, gflags.E ? REG_EXTENDED : 0);
 553			s = p;
 554		}
 555	} else {
 556		addr->type = EVERY;
 557	}
 558
 559	return s;
 560}
 561
/* return pointer to first delim in s that is not escaped
 * and if do_brackets is set, not in [] (note possible [::], [..], [==], inside [])
 * return pointer to trailing nul byte if no delim found
 *
 * s is scanned rune by rune; invalid UTF-8 is reported by echarntorune().
 * s itself is not modified.
 *
 * any escaped character that is not special is just itself (POSIX undefined)
 * FIXME: pull out into some util thing, will be useful for ed as well
 */
static char *
find_delim(char *s, Rune delim, int do_brackets)
{
	enum {
		OUTSIDE         , /* not in brackets */
		BRACKETS_OPENING, /* last char was first [ or last two were first [^ */
		BRACKETS_INSIDE , /* inside [] */
		INSIDE_OPENING  , /* inside [] and last char was [ */
		CLASS_INSIDE    , /* inside class [::], or collating element [..] or [==], inside [] */
		CLASS_CLOSING   , /* inside class [::], or collating element [..] or [==], and last character was the respective : . or = */
	} state = OUTSIDE;

	Rune r, c = 0; /* no c won't be used uninitialized, shutup -Wall */
	size_t rlen;
	int escape = 0; /* previous rune was an unescaped backslash (only tracked OUTSIDE) */
	char *end = s + strlen(s);

	for (; *s; s += rlen) {
		rlen = echarntorune(&r, s, end - s);

		/* right after the opening [: a ^ keeps us in the opening state and a
		 * ] is taken literally; any other rune falls through to the chain
		 * below with the state already advanced to BRACKETS_INSIDE */
		if      (state == BRACKETS_OPENING       &&  r == '^'  ) {                            continue; }
		else if (state == BRACKETS_OPENING       &&  r == ']'  ) { state  = BRACKETS_INSIDE ; continue; }
		else if (state == BRACKETS_OPENING                     ) { state  = BRACKETS_INSIDE ;           }

		/* c remembers which of : . = opened the inner class so that only the
		 * matching character followed by ] closes it */
		if      (state == CLASS_CLOSING          &&  r == ']'  ) { state  = BRACKETS_INSIDE ;           }
		else if (state == CLASS_CLOSING                        ) { state  = CLASS_INSIDE    ;           }
		else if (state == CLASS_INSIDE           &&  r ==  c   ) { state  = CLASS_CLOSING   ;           }
		else if (state == INSIDE_OPENING         && (r == ':'  ||
		                                             r == '.'  ||
		                                             r == '=') ) { state  = CLASS_INSIDE    ; c = r;    }
		else if (state == INSIDE_OPENING         &&  r == ']'  ) { state  = OUTSIDE         ;           }
		else if (state == INSIDE_OPENING                       ) { state  = BRACKETS_INSIDE ;           }
		else if (state == BRACKETS_INSIDE        &&  r == '['  ) { state  = INSIDE_OPENING  ;           }
		else if (state == BRACKETS_INSIDE        &&  r == ']'  ) { state  = OUTSIDE         ;           }
		else if (state == OUTSIDE                &&  escape    ) { escape = 0               ;           }
		else if (state == OUTSIDE                &&  r == '\\' ) { escape = 1               ;           }
		else if (state == OUTSIDE                &&  r == delim) return s;
		else if (state == OUTSIDE && do_brackets &&  r == '['  ) { state  = BRACKETS_OPENING;           }
	}
	return s; /* no delimiter: s points at the terminating nul byte */
}
 610
 611static char *
 612chomp(char *s)
 613{
 614	return chompr(s, 0);
 615}
 616
 617/* eat all leading whitespace and occurrences of rune */
 618static char *
 619chompr(char *s, Rune rune)
 620{
 621	Rune   r;
 622	size_t rlen;
 623	char  *end = s + strlen(s);
 624
 625	while (*s && (rlen = echarntorune(&r, s, end - s)) && (isspacerune(r) || r == rune))
 626		s += rlen;
 627	return s;
 628}
 629
 630/* convert first nrunes Runes from UTF-8 string s in allocated Rune*
 631 * NOTE: sequence must be valid UTF-8, check first */
 632static Rune *
 633strtorunes(char *s, size_t nrunes)
 634{
 635	Rune *rs, *rp;
 636
 637	rp = rs = ereallocarray(NULL, nrunes + 1, sizeof(*rs));
 638
 639	while (nrunes--)
 640		s += chartorune(rp++, s);
 641
 642	*rp = '\0';
 643	return rs;
 644}
 645
 646static long
 647stol(char *s, char **endp)
 648{
 649	long n;
 650	errno = 0;
 651	n = strtol(s, endp, 10);
 652
 653	if (errno)
 654		leprintf("strtol:");
 655	if (*endp == s)
 656		leprintf("strtol: invalid number");
 657
 658	return n;
 659}
 660
/* from beg to end replace "\\d" with "d" and "\\n" with "\n" (where d is delim)
 * if delim is 'n' and n_newline is 0 then "\\n" is replaced with "n" (normal)
 * if delim is 'n' and n_newline is 1 then "\\n" is replaced with "\n" (y command)
 * if delim is 0 all escaped characters represent themselves (aci text)
 * memmove rest of string (beyond end) into place
 * return the number of converted escapes (backslashes removed)
 *
 * The rewrite happens in place: src reads the original text, dst trails it
 * and writes the shortened text. Callers subtract the return value from
 * their `end` pointer to find where the old end landed.
 * NOTE(review): with delim == 0 removed backslashes are not counted in the
 * return value; the visible caller (aci_append) ignores it.
 *
 * FIXME: this has had too many corner cases slapped on and is ugly. rewrite better
 */
static size_t
escapes(char *beg, char *end, Rune delim, int n_newline)
{
	size_t num = 0;
	char *src = beg, *dst = beg;

	while (src < end) {
		/* handle escaped backslash specially so we don't think the second
		 * backslash is escaping something */
		if (*src == '\\' && src[1] == '\\') {
			*dst++ = *src++;
			if (delim)
				*dst++ = *src++; /* keep the pair for the regex/replacement parser */
			else
				src++;           /* aci text: collapse to a single backslash */
		} else if (*src == '\\' && !delim) {
			src++; /* aci text: drop the backslash, next char is copied as is */
		} else if (*src == '\\' && src[1]) {
			Rune r;
			size_t rlen;
			num++; /* assume the backslash goes away; undone below if it is kept */
			src++;
			rlen = echarntorune(&r, src, end - src);

			if (r == 'n' && delim == 'n') {
				*src = n_newline ? '\n' : 'n'; /* src so we can still memmove() */
			} else if (r == 'n') {
				*src = '\n';
			} else if (r != delim) {
				/* not an escape we convert: keep the backslash */
				*dst++ = '\\';
				num--;
			}

			memmove(dst, src, rlen);
			dst += rlen;
			src += rlen;
		} else {
			*dst++ = *src++;
		}
	}
	/* close the gap: move the tail (including the nul byte) down to dst */
	memmove(dst, src, strlen(src) + 1);
	return num;
}
 712
 713static size_t
 714echarntorune(Rune *r, char *s, size_t n)
 715{
 716	size_t rlen = charntorune(r, s, n);
 717	if (!rlen || *r == Runeerror)
 718		leprintf("invalid UTF-8");
 719	return rlen;
 720}
 721
 722static void
 723insert_labels(void)
 724{
 725	size_t i;
 726	Cmd *from, *to;
 727
 728	while (branches.size) {
 729		from = prog + (ptrdiff_t)pop(&branches);
 730
 731		if (!from->u.label) {/* no label branch to end of script */
 732			from->u.jump = pc - 1;
 733		} else {
 734			for (i = 0; i < labels.size; i++) {
 735				to = prog + (ptrdiff_t)labels.data[i];
 736				if (!strcmp(from->u.label, to->u.label)) {
 737					from->u.jump = to;
 738					break;
 739				}
 740			}
 741			if (i == labels.size)
 742				leprintf("bad label");
 743		}
 744	}
 745}
 746
 747/*
 748 * Getargs / Freeargs
 749 * Read argument from s, return pointer to one past last character of argument
 750 */
 751
 752/* POSIX compliant
 753 * i\
 754 * foobar
 755 *
 756 * also allow the following non POSIX compliant
 757 * i        # empty line
 758 * ifoobar
 759 * ifoobar\
 760 * baz
 761 *
 762 * FIXME: GNU and busybox discard leading spaces
 763 * i  foobar
 764 * i foobar
 765 * ifoobar
 766 * are equivalent in GNU and busybox. We don't. Should we?
 767 */
 768static char *
 769get_aci_arg(Cmd *c, char *s)
 770{
 771	c->u.acir.print = check_puts;
 772	c->u.acir.str = (String){ NULL, 0 };
 773
 774	gflags.aci_cont = !!*s; /* no continue flag if empty string */
 775
 776	/* neither empty string nor POSIX compliant */
 777	if (*s && !(*s == '\\' && !s[1]))
 778		aci_append(c, s);
 779
 780	return s + strlen(s);
 781}
 782
 783static void
 784aci_append(Cmd *c, char *s)
 785{
 786	char *end = s + strlen(s), *p = end;
 787
 788	gflags.aci_cont = 0;
 789	while (--p >= s && *p == '\\')
 790		gflags.aci_cont = !gflags.aci_cont;
 791
 792	if (gflags.aci_cont)
 793		*--end = '\n';
 794
 795	escapes(s, end, 0, 0);
 796	stracat(&c->u.acir.str, s);
 797}
 798
 799static void
 800free_acir_arg(Cmd *c)
 801{
 802	free(c->u.acir.str.str);
 803}
 804
 805/* POSIX dictates that label is rest of line, including semicolons, trailing
 806 * whitespace, closing braces, etc. and can be limited to 8 bytes
 807 *
 808 * I allow a semicolon or closing brace to terminate a label name, it's not
 809 * POSIX compliant, but it's useful and every sed version I've tried to date
 810 * does the same.
 811 *
 812 * FIXME: POSIX dictates that leading whitespace is ignored but trailing
 813 * whitespace is not. This is annoying and we should probably get rid of it.
 814 */
 815static char *
 816get_bt_arg(Cmd *c, char *s)
 817{
 818	char *p = semicolon_arg(s = chomp(s));
 819
 820	if (p != s) {
 821		c->u.label = estrndup(s, p - s);
 822	} else {
 823		c->u.label = NULL;
 824	}
 825
 826	push(&branches, (void *)(c - prog));
 827
 828	return p;
 829}
 830
 831/* POSIX dictates file name is rest of line including semicolons, trailing
 832 * whitespace, closing braces, etc. and file name must be preceded by a space
 833 *
 834 * I allow a semicolon or closing brace to terminate a file name and don't
 835 * enforce leading space.
 836 *
 837 * FIXME: decide whether trailing whitespace should be included and fix
 838 * accordingly
 839 */
 840static char *
 841get_r_arg(Cmd *c, char *s)
 842{
 843	char *p = semicolon_arg(s = chomp(s));
 844
 845	if (p == s)
 846		leprintf("no file name");
 847
 848	c->u.acir.str.str = estrndup(s, p - s);
 849	c->u.acir.print = write_file;
 850
 851	return p;
 852}
 853
 854/* we allow "\\n" in replacement text to mean "\n" (undefined in POSIX)
 855 *
 856 * FIXME: allow other escapes in regex and replacement? if so change escapes()
 857 */
 858static char *
 859get_s_arg(Cmd *c, char *s)
 860{
 861	Rune delim, r;
 862	Cmd buf;
 863	char *p;
 864	int esc, lastre;
 865
 866	/* s/Find/Replace/Flags */
 867
 868	/* Find */
 869	if (!gflags.s_cont) { /* NOT continuing from literal newline in replacement text */
 870		lastre = 0;
 871		c->u.s.repl = (String){ NULL, 0 };
 872		c->u.s.occurrence = 1;
 873		c->u.s.file = NULL;
 874		c->u.s.p = 0;
 875
 876		if (!*s || *s == '\\')
 877			leprintf("bad delimiter");
 878
 879		p = s + strlen(s);
 880		s += echarntorune(&delim, s, p - s);
 881		c->u.s.delim = delim;
 882
 883		echarntorune(&r, s, p - s);
 884		if (r == delim) /* empty regex */
 885			lastre = 1;
 886
 887		p = find_delim(s, delim, 1);
 888		if (!*p)
 889			leprintf("missing second delimiter");
 890		p -= escapes(s, p, delim, 0);
 891		*p = '\0';
 892
 893		if (lastre) {
 894			c->u.s.re = NULL;
 895		} else {
 896			c->u.s.re = emalloc(sizeof(*c->u.s.re));
 897			/* FIXME: different eregcomp that calls fatal */
 898			eregcomp(c->u.s.re, s, gflags.E ? REG_EXTENDED : 0);
 899		}
 900		s = p + runelen(delim);
 901	}
 902
 903	/* Replace */
 904	delim = c->u.s.delim;
 905
 906	p = find_delim(s, delim, 0);
 907	p -= escapes(s, p, delim, 0);
 908	if (!*p) { /* no third delimiter */
 909		/* FIXME: same backslash counting as aci_append() */
 910		if (p[-1] != '\\')
 911			leprintf("missing third delimiter or <backslash><newline>");
 912		p[-1] = '\n';
 913		gflags.s_cont = 1;
 914	} else {
 915		gflags.s_cont = 0;
 916	}
 917
 918	/* check for bad references in replacement text */
 919	*p = '\0';
 920	for (esc = 0, p = s; *p; p++) {
 921		if (esc) {
 922			esc = 0;
 923			if (isdigit(*p) && c->u.s.re && (size_t)(*p - '0') > c->u.s.re->re_nsub)
 924				leprintf("back reference number greater than number of groups");
 925		} else if (*p == '\\') {
 926			esc = 1;
 927		}
 928	}
 929	stracat(&c->u.s.repl, s);
 930
 931	if (gflags.s_cont)
 932		return p;
 933
 934	s = p + runelen(delim);
 935
 936	/* Flags */
 937	p = semicolon_arg(s = chomp(s));
 938
 939	/* FIXME: currently for simplicity take last of g or occurrence flags and
 940	 *        ignore multiple p flags. need to fix that */
 941	for (; s < p; s++) {
 942		if (isdigit(*s)) {
 943			c->u.s.occurrence = stol(s, &s);
 944			s--; /* for loop will advance pointer */
 945		} else {
 946			switch (*s) {
 947			case 'g': c->u.s.occurrence = 0; break;
 948			case 'p': c->u.s.p = 1;          break;
 949			case 'w':
 950				/* must be last flag, take everything up to newline/semicolon
 951				 * s == p after this */
 952				s = get_w_arg(&buf, chomp(s+1));
 953				c->u.s.file = buf.u.file;
 954				break;
 955			}
 956		}
 957	}
 958	return p;
 959}
 960
 961static void
 962free_s_arg(Cmd *c)
 963{
 964	if (c->u.s.re)
 965		regfree(c->u.s.re);
 966	free(c->u.s.re);
 967	free(c->u.s.repl.str);
 968}
 969
 970/* see get_r_arg notes */
 971static char *
 972get_w_arg(Cmd *c, char *s)
 973{
 974	char *p = semicolon_arg(s = chomp(s));
 975	Wfile *w, **wp;
 976
 977	if (p == s)
 978		leprintf("no file name");
 979
 980	for (wp = (Wfile **)wfiles.data; (size_t)(wp - (Wfile **)wfiles.data) < wfiles.size; wp++) {
 981		if (strlen((*wp)->path) == (size_t)(p - s) && !strncmp(s, (*wp)->path, p - s)) {
 982			c->u.file = (*wp)->file;
 983			return p;
 984		}
 985	}
 986
 987	w = emalloc(sizeof(*w));
 988	w->path = estrndup(s, p - s);
 989
 990	if (!(w->file = fopen(w->path, "w")))
 991		leprintf("fopen failed");
 992
 993	c->u.file = w->file;
 994
 995	push(&wfiles, w);
 996	return p;
 997}
 998
 999static char *
1000get_y_arg(Cmd *c, char *s)
1001{
1002	Rune delim;
1003	char *p = s + strlen(s);
1004	size_t rlen = echarntorune(&delim, s, p - s);
1005	size_t nrunes1, nrunes2;
1006
1007	c->u.y.set1 = c->u.y.set2 = NULL;
1008
1009	s += rlen;
1010	p = find_delim(s, delim, 0);
1011	p -= escapes(s, p, delim, 1);
1012	nrunes1 = utfnlen(s, p - s);
1013	c->u.y.set1 = strtorunes(s, nrunes1);
1014
1015	s = p + rlen;
1016	p = find_delim(s, delim, 0);
1017	p -= escapes(s, p, delim, 1);
1018	nrunes2 = utfnlen(s, p - s);
1019
1020	if (nrunes1 != nrunes2)
1021		leprintf("different set lengths");
1022
1023	c->u.y.set2 = strtorunes(s, utfnlen(s, p - s));
1024
1025	return p + rlen;
1026}
1027
1028static void
1029free_y_arg(Cmd *c)
1030{
1031	free(c->u.y.set1);
1032	free(c->u.y.set2);
1033}
1034
1035/* see get_bt_arg notes */
1036static char *
1037get_colon_arg(Cmd *c, char *s)
1038{
1039	char *p = semicolon_arg(s = chomp(s));
1040
1041	if (p == s)
1042		leprintf("no label name");
1043
1044	c->u.label = estrndup(s, p - s);
1045	push(&labels, (void *)(c - prog));
1046	return p;
1047}
1048
1049static char *
1050get_lbrace_arg(Cmd *c, char *s)
1051{
1052	push(&braces, (void *)(c - prog));
1053	return s;
1054}
1055
1056static char *
1057get_rbrace_arg(Cmd *c, char *s)
1058{
1059	Cmd *lbrace;
1060
1061	if (!braces.size)
1062		leprintf("extra }");
1063
1064	lbrace = prog + (ptrdiff_t)pop(&braces);
1065	lbrace->u.offset = c - prog;
1066	return s;
1067}
1068
/* s points to the beginning of an argument that may be terminated by a
 * semicolon, or by a closing brace so that ; is not required before }
 * return a pointer to that terminator, or to the nul byte ending the string
 * when there is none
 * FIXME: decide whether or not to eat trailing whitespace for arguments that
 *        we allow semicolon/brace termination that POSIX doesn't
 *        b, r, t, w, :
 *        POSIX says trailing whitespace is part of label name, file name, etc.
 *        we should probably eat it
 */
static char *
semicolon_arg(char *s)
{
	/* length of the leading run that holds neither ';' nor '}' */
	return s + strcspn(s, ";}");
}
1086
1087static void
1088run(void)
1089{
1090	lineno = 0;
1091	if (braces.size)
1092		leprintf("extra {");
1093
1094	/* genbuf has already been initialized, patt will be in new_line
1095	 * (or we'll halt) */
1096	stracpy(&hold, "");
1097
1098	insert_labels();
1099	next_file();
1100	new_line();
1101
1102	for (pc = prog; !gflags.halt; pc++)
1103		pc->fninfo->fn(pc);
1104}
1105
1106/* return true if we are in range for c, set c->in_match appropriately */
1107static int
1108in_range(Cmd *c)
1109{
1110	if (match_addr(&c->range.beg)) {
1111		if (c->range.naddr == 2) {
1112			if (c->range.end.type == LINE && c->range.end.u.lineno <= lineno)
1113				c->in_match = 0;
1114			else
1115				c->in_match = 1;
1116		}
1117		return !c->negate;
1118	}
1119	if (c->in_match && match_addr(&c->range.end)) {
1120		c->in_match = 0;
1121		return !c->negate;
1122	}
1123	return c->in_match ^ c->negate;
1124}
1125
1126/* return true if addr matches current line */
1127static int
1128match_addr(Addr *a)
1129{
1130	switch (a->type) {
1131	default:
1132	case IGNORE: return 0;
1133	case EVERY: return 1;
1134	case LINE: return lineno == a->u.lineno;
1135	case LAST:
1136		while (is_eof(file) && !next_file())
1137			;
1138		return !file;
1139	case REGEX:
1140		lastre = a->u.re;
1141		return !regexec(a->u.re, patt.str, 0, NULL, 0);
1142	case LASTRE:
1143		if (!lastre)
1144			leprintf("no previous regex");
1145		return !regexec(lastre, patt.str, 0, NULL, 0);
1146	}
1147}
1148
1149/* move to next input file
1150 * stdin if first call and no files
1151 * return 0 for success and 1 for no more files
1152 */
1153static int
1154next_file(void)
1155{
1156	static unsigned char first = 1;
1157
1158	if (file == stdin)
1159		clearerr(file);
1160	else if (file)
1161		fshut(file, "<file>");
1162	/* given no files, default to stdin */
1163	file = first && !*files ? stdin : NULL;
1164	first = 0;
1165
1166	while (!file && *files) {
1167		if (!strcmp(*files, "-")) {
1168			file = stdin;
1169		} else if (!(file = fopen(*files, "r"))) {
1170			/* warn this file didn't open, but move on to next */
1171			weprintf("fopen %s:", *files);
1172			ret = 1;
1173		}
1174		files++;
1175	}
1176
1177	return !file;
1178}
1179
/* test if stream is at EOF, a NULL stream counts as being at EOF
 * the next byte is read and pushed back to find out
 */
static int
is_eof(FILE *f)
{
	int c;

	if (!f || feof(f))
		return 1;

	c = fgetc(f);
	if (c == EOF) {
		if (ferror(f))
			eprintf("fgetc:");
		return 1;
	}

	if (ungetc(c, f) == EOF)
		eprintf("ungetc EOF\n");
	return 0;
}
1197
1198/* perform writes that were scheduled
1199 * for aci this is check_puts(string, stdout)
1200 * for r this is write_file(path, stdout)
1201 */
1202static void
1203do_writes(void)
1204{
1205	Cmd *c;
1206	size_t i;
1207
1208	for (i = 0; i < writes.size; i++) {
1209		c = writes.data[i];
1210		c->u.acir.print(c->u.acir.str.str, stdout);
1211	}
1212	writes.size = 0;
1213}
1214
1215/* used for r's u.acir.print()
1216 * FIXME: something like util's concat() would be better
1217 */
1218static void
1219write_file(char *path, FILE *out)
1220{
1221	FILE *in = fopen(path, "r");
1222	if (!in) /* no file is treated as empty file */
1223		return;
1224
1225	while (read_line(in, &genbuf) != EOF)
1226		check_puts(genbuf.str, out);
1227
1228	fshut(in, path);
1229}
1230
/* write s (skipped when NULL) and then a newline to f, write errors are
 * handed to eprintf
 */
static void
check_puts(char *s, FILE *f)
{
	if (s != NULL) {
		if (fputs(s, f) == EOF)
			eprintf("fputs:");
	}
	if (fputs("\n", f) == EOF)
		eprintf("fputs:");
}
1239
/* write pattern space text s to f
 * with FEATURE_SED_PRESERVE_NEWLINE the trailing newline is only written
 * when hadnl is set, otherwise this is plain check_puts
 */
static void
write_patt(char *s, FILE *f)
{
#if FEATURE_SED_PRESERVE_NEWLINE
	if (s != NULL && fputs(s, f) == EOF)
		eprintf("fputs:");
	if (hadnl && fputs("\n", f) == EOF)
		eprintf("fputs:");
#else
	check_puts(s, f);
#endif
}
1254
1255/* iterate from beg to end updating ranges so we don't miss any commands
1256 * e.g. sed -n '1d;1,3p' should still print lines 2 and 3
1257 */
1258static void
1259update_ranges(Cmd *beg, Cmd *end)
1260{
1261	while (beg < end)
1262		in_range(beg++);
1263}
1264
1265/*
1266 * Sed functions
1267 */
1268static void
1269cmd_a(Cmd *c)
1270{
1271	if (in_range(c))
1272		push(&writes, c);
1273}
1274
1275static void
1276cmd_b(Cmd *c)
1277{
1278	if (!in_range(c))
1279		return;
1280
1281	/* if we jump backwards update to end, otherwise update to destination */
1282	update_ranges(c + 1, c->u.jump > c ? c->u.jump : prog + pcap);
1283	pc = c->u.jump;
1284}
1285
1286static void
1287cmd_c(Cmd *c)
1288{
1289	if (!in_range(c))
1290		return;
1291
1292	/* write the text on the last line of the match */
1293	if (!c->in_match)
1294		check_puts(c->u.acir.str.str, stdout);
1295	/* otherwise start the next cycle without printing pattern space
1296	 * effectively deleting the text */
1297	new_next();
1298}
1299
1300static void
1301cmd_d(Cmd *c)
1302{
1303	if (!in_range(c))
1304		return;
1305
1306	new_next();
1307}
1308
1309static void
1310cmd_D(Cmd *c)
1311{
1312	char *p;
1313
1314	if (!in_range(c))
1315		return;
1316
1317	if ((p = strchr(patt.str, '\n'))) {
1318		p++;
1319		memmove(patt.str, p, strlen(p) + 1);
1320		old_next();
1321	} else {
1322		new_next();
1323	}
1324}
1325
1326static void
1327cmd_g(Cmd *c)
1328{
1329	if (in_range(c))
1330		stracpy(&patt, hold.str);
1331}
1332
1333static void
1334cmd_G(Cmd *c)
1335{
1336	if (!in_range(c))
1337		return;
1338
1339	stracat(&patt, "\n");
1340	stracat(&patt, hold.str);
1341}
1342
1343static void
1344cmd_h(Cmd *c)
1345{
1346	if (in_range(c))
1347		stracpy(&hold, patt.str);
1348}
1349
1350static void
1351cmd_H(Cmd *c)
1352{
1353	if (!in_range(c))
1354		return;
1355
1356	stracat(&hold, "\n");
1357	stracat(&hold, patt.str);
1358}
1359
1360static void
1361cmd_i(Cmd *c)
1362{
1363	if (in_range(c))
1364		check_puts(c->u.acir.str.str, stdout);
1365}
1366
1367/* I think it makes sense to print invalid UTF-8 sequences in octal to satisfy
1368 * the "visually unambiguous form" sed(1p)
1369 */
1370static void
1371cmd_l(Cmd *c)
1372{
1373	Rune   r;
1374	char  *p, *end;
1375	size_t rlen;
1376
1377	char *escapes[] = { /* FIXME: 7 entries and search instead of 127 */
1378		['\\'] = "\\\\", ['\a'] = "\\a", ['\b'] = "\\b",
1379		['\f'] = "\\f" , ['\r'] = "\\r", ['\t'] = "\\t",
1380		['\v'] = "\\v" , [0x7f] = NULL, /* fill out the table */
1381	};
1382
1383	if (!in_range(c))
1384		return;
1385
1386	/* FIXME: line wrapping. sed(1p) says "length at which folding occurs is
1387	 * unspecified, but should be appropraite for the output device"
1388	 * just wrap at 80 Runes?
1389	 */
1390	for (p = patt.str, end = p + strlen(p); p < end; p += rlen) {
1391		if (isascii(*p) && escapes[(unsigned int)*p]) {
1392			fputs(escapes[(unsigned int)*p], stdout);
1393			rlen = 1;
1394		} else if (!(rlen = charntorune(&r, p, end - p))) {
1395			/* ran out of chars, print the bytes of the short sequence */
1396			for (; p < end; p++)
1397				printf("\\%03hho", (unsigned char)*p);
1398			break;
1399		} else if (r == Runeerror) {
1400			for (; rlen; rlen--, p++)
1401				printf("\\%03hho", (unsigned char)*p);
1402		} else {
1403			while (fwrite(p, rlen, 1, stdout) < 1 && errno == EINTR)
1404				;
1405			if (ferror(stdout))
1406				eprintf("fwrite:");
1407		}
1408	}
1409	check_puts("$", stdout);
1410}
1411
1412static void
1413cmd_n(Cmd *c)
1414{
1415	if (!in_range(c))
1416		return;
1417
1418	if (!gflags.n)
1419		write_patt(patt.str, stdout);
1420	do_writes();
1421	new_line();
1422}
1423
1424static void
1425cmd_N(Cmd *c)
1426{
1427	if (!in_range(c))
1428		return;
1429	do_writes();
1430	app_line();
1431}
1432
1433static void
1434cmd_p(Cmd *c)
1435{
1436	if (in_range(c))
1437		write_patt(patt.str, stdout);
1438}
1439
1440static void
1441cmd_P(Cmd *c)
1442{
1443	char *p;
1444
1445	if (!in_range(c))
1446		return;
1447
1448	if ((p = strchr(patt.str, '\n')))
1449		*p = '\0';
1450
1451	write_patt(patt.str, stdout);
1452
1453	if (p)
1454		*p = '\n';
1455}
1456
1457static void
1458cmd_q(Cmd *c)
1459{
1460	if (!in_range(c))
1461		return;
1462
1463	if (!gflags.n)
1464		check_puts(patt.str, stdout);
1465	do_writes();
1466	gflags.halt = 1;
1467}
1468
1469static void
1470cmd_r(Cmd *c)
1471{
1472	if (in_range(c))
1473		push(&writes, c);
1474}
1475
1476static void
1477cmd_s(Cmd *c)
1478{
1479	String tmp;
1480	Rune r;
1481	size_t plen, rlen, len;
1482	char *p, *s, *end;
1483	unsigned int matches = 0, last_empty = 1, qflag = 0, cflags = 0;
1484	regex_t *re;
1485	regmatch_t *rm, *pmatch = NULL;
1486
1487	if (!in_range(c))
1488		return;
1489
1490	if (!c->u.s.re && !lastre)
1491		leprintf("no previous regex");
1492
1493	re = c->u.s.re ? c->u.s.re : lastre;
1494	lastre = re;
1495
1496	plen = re->re_nsub + 1;
1497	pmatch = ereallocarray(NULL, plen, sizeof(regmatch_t));
1498
1499	*genbuf.str = '\0';
1500	s = patt.str;
1501
1502	while (!qflag && !regexec(re, s, plen, pmatch, cflags)) {
1503		cflags = REG_NOTBOL; /* match against beginning of line first time, but not again */
1504		if (!*s) /* match against empty string first time, but not again */
1505			qflag = 1;
1506
1507		/* don't substitute if last match was not empty but this one is.
1508		 * s_a*_._g
1509		 * foobar -> .f.o.o.b.r.
1510		 */
1511		if ((last_empty || pmatch[0].rm_eo) &&
1512		    (++matches == c->u.s.occurrence || !c->u.s.occurrence)) {
1513			/* copy over everything before the match */
1514			strnacat(&genbuf, s, pmatch[0].rm_so);
1515
1516			/* copy over replacement text, taking into account &, backreferences, and \ escapes */
1517			for (p = c->u.s.repl.str, len = strcspn(p, "\\&"); *p; len = strcspn(++p, "\\&")) {
1518				strnacat(&genbuf, p, len);
1519				p += len;
1520				switch (*p) {
1521				default:
1522					leprintf("this shouldn't be possible");
1523					break;
1524				case '\0':
1525					/* we're at the end, back up one so the ++p will put us on
1526					 * the null byte to break out of the loop */
1527					--p;
1528					break;
1529				case '&':
1530					strnacat(&genbuf, s + pmatch[0].rm_so, pmatch[0].rm_eo - pmatch[0].rm_so);
1531					break;
1532				case '\\':
1533					if (isdigit(*++p)) { /* backreference */
1534						/* only need to check here if using lastre, otherwise we checked when building */
1535						if (!c->u.s.re && (size_t)(*p - '0') > re->re_nsub)
1536							leprintf("back reference number greater than number of groups");
1537						rm = &pmatch[*p - '0'];
1538						strnacat(&genbuf, s + rm->rm_so, rm->rm_eo - rm->rm_so);
1539					} else { /* character after backslash taken literally (well one byte, but it works) */
1540						strnacat(&genbuf, p, 1);
1541					}
1542					break;
1543				}
1544			}
1545		} else {
1546			/* not replacing, copy over everything up to and including the match */
1547			strnacat(&genbuf, s, pmatch[0].rm_eo);
1548		}
1549
1550		if (!pmatch[0].rm_eo) { /* empty match, advance one rune and add it to output */
1551			end = s + strlen(s);
1552			rlen = charntorune(&r, s, end - s);
1553
1554			if (!rlen) { /* ran out of bytes, copy short sequence */
1555				stracat(&genbuf, s);
1556				s = end;
1557			} else { /* copy whether or not it's a good rune */
1558				strnacat(&genbuf, s, rlen);
1559				s += rlen;
1560			}
1561		}
1562		last_empty = !pmatch[0].rm_eo;
1563		s += pmatch[0].rm_eo;
1564	}
1565	free(pmatch);
1566
1567	if (!(matches && matches >= c->u.s.occurrence)) /* no replacement */
1568		return;
1569
1570	gflags.s = 1;
1571
1572	stracat(&genbuf, s);
1573
1574	tmp    = patt;
1575	patt   = genbuf;
1576	genbuf = tmp;
1577
1578	if (c->u.s.p)
1579		write_patt(patt.str, stdout);
1580	if (c->u.s.file)
1581		write_patt(patt.str, c->u.s.file);
1582}
1583
1584static void
1585cmd_t(Cmd *c)
1586{
1587	if (!in_range(c) || !gflags.s)
1588		return;
1589
1590	/* if we jump backwards update to end, otherwise update to destination */
1591	update_ranges(c + 1, c->u.jump > c ? c->u.jump : prog + pcap);
1592	pc = c->u.jump;
1593	gflags.s = 0;
1594}
1595
1596static void
1597cmd_w(Cmd *c)
1598{
1599	if (in_range(c))
1600		write_patt(patt.str, c->u.file);
1601}
1602
1603static void
1604cmd_x(Cmd *c)
1605{
1606	String tmp;
1607
1608	if (!in_range(c))
1609		return;
1610
1611	tmp  = patt;
1612	patt = hold;
1613	hold = tmp;
1614}
1615
1616static void
1617cmd_y(Cmd *c)
1618{
1619	String tmp;
1620	Rune r, *rp;
1621	size_t n, rlen;
1622	char *s, *end, buf[UTFmax];
1623
1624	if (!in_range(c))
1625		return;
1626
1627	*genbuf.str = '\0';
1628	for (s = patt.str, end = s + strlen(s); *s; s += rlen) {
1629		if (!(rlen = charntorune(&r, s, end - s))) { /* ran out of chars, copy rest */
1630			stracat(&genbuf, s);
1631			break;
1632		} else if (r == Runeerror) { /* bad UTF-8 sequence, copy bytes */
1633			strnacat(&genbuf, s, rlen);
1634		} else {
1635			for (rp = c->u.y.set1; *rp; rp++)
1636				if (*rp == r)
1637					break;
1638			if (*rp) { /* found r in set1, replace with Rune from set2 */
1639				n = runetochar(buf, c->u.y.set2 + (rp - c->u.y.set1));
1640				strnacat(&genbuf, buf, n);
1641			} else {
1642				strnacat(&genbuf, s, rlen);
1643			}
1644		}
1645	}
1646	tmp    = patt;
1647	patt   = genbuf;
1648	genbuf = tmp;
1649}
1650
/* : executing a label does nothing */
static void
cmd_colon(Cmd *c)
{
	(void)c; /* unused */
}
1656
1657static void
1658cmd_equal(Cmd *c)
1659{
1660	if (in_range(c))
1661		printf("%zu\n", lineno);
1662}
1663
1664static void
1665cmd_lbrace(Cmd *c)
1666{
1667	Cmd *jump;
1668
1669	if (in_range(c))
1670		return;
1671
1672	/* update ranges on all commands we skip */
1673	jump = prog + c->u.offset;
1674	update_ranges(c + 1, jump);
1675	pc = jump;
1676}
1677
/* }: executing the end of a block does nothing */
static void
cmd_rbrace(Cmd *c)
{
	(void)c; /* unused */
}
1683
1684/* not actually a sed function, but acts like one, put in last spot of script */
1685static void
1686cmd_last(Cmd *c)
1687{
1688	(void)c;
1689	if (!gflags.n)
1690		write_patt(patt.str, stdout);
1691	do_writes();
1692	new_next();
1693}
1694
1695/*
1696 * Actions
1697 */
1698
1699/* read new line, continue current cycle */
1700static void
1701new_line(void)
1702{
1703	while (read_line(file, &patt) == EOF) {
1704		if (next_file()) {
1705			gflags.halt = 1;
1706			return;
1707		}
1708	}
1709	gflags.s = 0;
1710	lineno++;
1711}
1712
1713/* append new line, continue current cycle
1714 * FIXME: used for N, POSIX specifies do not print pattern space when out of
1715 *        input, but GNU does so busybox does as well. Currently we don't.
1716 *        Should we?
1717 */
1718static void
1719app_line(void)
1720{
1721	while (read_line(file, &genbuf) == EOF) {
1722		if (next_file()) {
1723			gflags.halt = 1;
1724			return;
1725		}
1726	}
1727
1728	stracat(&patt, "\n");
1729	stracat(&patt, genbuf.str);
1730	gflags.s = 0;
1731	lineno++;
1732}
1733
1734/* read new line, start new cycle */
1735static void
1736new_next(void)
1737{
1738	*patt.str = '\0';
1739	update_ranges(pc + 1, prog + pcap);
1740	new_line();
1741	pc = prog - 1;
1742}
1743
1744/* keep old pattern space, start new cycle */
1745static void
1746old_next(void)
1747{
1748	update_ranges(pc + 1, prog + pcap);
1749	pc = prog - 1;
1750}
1751
1752// ?man sed: stream editor
1753// ?man arguments: script [file ...
1754// ?man stream editor for filtering and transforming text
1755int
1756main(int argc, char *argv[])
1757{
1758	char *arg;
1759	int script = 0;
1760
1761	ARGBEGIN {
1762	// ?man -n: print line numbers or counts
1763	case 'n':
1764		gflags.n = 1;
1765		break;
1766	// ?man -r: operate recursively
1767	case 'r':
1768	// ?man -E: specify option flag
1769	case 'E':
1770		gflags.E = 1;
1771		break;
1772	// ?man -e:str: specify expression or pattern
1773	case 'e':
1774		arg = EARGF(usage());
1775		compile(arg, 0);
1776		script = 1;
1777		break;
1778	// ?man -f:str: force the operation
1779	case 'f':
1780		arg = EARGF(usage());
1781		compile(arg, 1);
1782		script = 1;
1783		break;
1784#if FEATURE_SED_INPLACE
1785	// ?man -i: interactive mode or prompt for confirmation
1786	case 'i':
1787		iflag = 1;
1788		if (argv[0][1] != '\0') {
1789			backup_suffix = &argv[0][1];
1790			brk_ = 1;
1791		} else {
1792			backup_suffix = "";
1793		}
1794		break;
1795#endif
1796	default : usage();
1797	} ARGEND
1798
1799	/* no script to run */
1800	if (!script && !argc)
1801		usage();
1802
1803	/* no script yet, next argument is script */
1804	if (!script)
1805		compile(*argv++, 0);
1806
1807	/* shrink/grow memory to fit and add our last instruction */
1808	resize((void **)&prog, &pcap, sizeof(*prog), pc - prog + 1, NULL);
1809	pc = prog + pcap - 1;
1810	pc->fninfo = &(Fninfo){ cmd_last, NULL, NULL, 0 };
1811
1812#if FEATURE_SED_INPLACE
1813	if (iflag) {
1814		char *single_file[2] = { NULL, NULL };
1815		char **orig_files = argv;
1816		int i;
1817
1818		if (!*orig_files)
1819			eprintf("no input files\n");
1820
1821		for (i = 0; orig_files[i]; i++) {
1822			char *temp_path = NULL;
1823			int temp_fd;
1824			int real_stdout;
1825			struct stat st;
1826			Cmd *c;
1827
1828			if (strcmp(orig_files[i], "-") == 0) {
1829				weprintf("cannot edit stdin in-place\n");
1830				ret = 1;
1831				continue;
1832			}
1833
1834			if (stat(orig_files[i], &st) < 0) {
1835				weprintf("stat %s:", orig_files[i]);
1836				ret = 1;
1837				continue;
1838			}
1839
1840			temp_fd = create_temp_file(orig_files[i], &temp_path);
1841			if (temp_fd < 0) {
1842				weprintf("create_temp_file:");
1843				ret = 1;
1844				continue;
1845			}
1846
1847			real_stdout = dup(1);
1848			if (real_stdout < 0) {
1849				weprintf("dup stdout:");
1850				close(temp_fd);
1851				free(temp_path);
1852				ret = 1;
1853				continue;
1854			}
1855			if (dup2(temp_fd, 1) < 0) {
1856				weprintf("dup2 stdout:");
1857				close(temp_fd);
1858				close(real_stdout);
1859				free(temp_path);
1860				ret = 1;
1861				continue;
1862			}
1863			close(temp_fd);
1864
1865			single_file[0] = orig_files[i];
1866			files = single_file;
1867
1868			/* reset state for next file */
1869			lineno = 0;
1870			gflags.halt = 0;
1871			stracpy(&hold, "");
1872			stracpy(&patt, "");
1873			writes.size = 0;
1874			for (c = prog; c->fninfo->fn != cmd_last; c++) {
1875				c->in_match = 0;
1876			}
1877
1878			run();
1879
1880			fflush(stdout);
1881			dup2(real_stdout, 1);
1882			close(real_stdout);
1883
1884			if (backup_suffix && *backup_suffix) {
1885				char *backup_path = emalloc(strlen(orig_files[i]) + strlen(backup_suffix) + 1);
1886				sprintf(backup_path, "%s%s", orig_files[i], backup_suffix);
1887				if (rename(orig_files[i], backup_path) < 0) {
1888					weprintf("rename %s to %s:", orig_files[i], backup_path);
1889					unlink(temp_path);
1890					free(backup_path);
1891					free(temp_path);
1892					ret = 1;
1893					continue;
1894				}
1895				free(backup_path);
1896			} else {
1897				unlink(orig_files[i]);
1898			}
1899
1900			if (rename(temp_path, orig_files[i]) < 0) {
1901				weprintf("rename %s to %s:", temp_path, orig_files[i]);
1902				unlink(temp_path);
1903				free(temp_path);
1904				ret = 1;
1905				continue;
1906			}
1907
1908			chmod(orig_files[i], st.st_mode);
1909			chown(orig_files[i], st.st_uid, st.st_gid);
1910
1911			free(temp_path);
1912		}
1913	} else
1914#endif
1915	{
1916		files = argv;
1917		run();
1918	}
1919
1920	ret |= fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>");
1921
1922	return ret;
1923}