master xplshn/aruu / cmd / posix / join.c
  1/* See LICENSE file for copyright and license details. */
  2
  3
  4#include <ctype.h>
  5#include <stdint.h>
  6#include <stdio.h>
  7#include <stdlib.h>
  8#include <string.h>
  9
 10#include "text.h"
 11#include "utf.h"
 12#include "util.h"
 13
/* Sizing constants for the growable arrays (fields, span lines, specs). */
enum {
	INIT = 1,   /* element count used for every initial allocation */
	GROW = 2,   /* factor by which a full array's capacity is multiplied */
};
 18
/* Values for the `reset` argument of addtospan(). */
enum {
	EXPAND = 0,   /* append the new line after the lines already in the span */
	RESET  = 1,   /* set the span's line count to zero before storing the new line */
};
 23
/* Returned by jlinecmp() when neither line has its join field. */
enum { FIELD_ERROR = -2, };
 25
/*
 * One field of an input line.  The bytes live inside the owning line's
 * text buffer; they are written with fwrite() using `len`, so the field
 * is not expected to be NUL-terminated on its own.
 */
struct field {
	char *s;      /* first byte of the field */
	size_t len;   /* number of bytes in the field */
};
 30
/* An input line together with its split-out fields. */
struct jline {
	struct line text;       /* the line's bytes (data) and length (len) */
	size_t nf;              /* number of entries used in `fields` */
	size_t maxf;            /* number of entries allocated in `fields` */
	struct field *fields;   /* growable array of fields, see addfield() */
};
 37
/* One element of the -o output list, as parsed by makespec(). */
struct spec {
	size_t fileno;   /* 1 or 2 selects a file; 0 selects the join field */
	size_t fldno;    /* zero-based field index within that file's line */
};
 42
/* Growable list of output specs built from the -o argument. */
struct outlist {
	size_t ns;              /* number of specs stored */
	size_t maxs;            /* number of slots allocated in `specs` */
	struct spec **specs;    /* array of pointers to individually allocated specs */
};
 48
/* Growable list of lines read from one input file. */
struct span {
	size_t nl;              /* number of lines counted as part of the span */
	size_t maxl;            /* number of slots allocated in `lines` */
	struct jline **lines;   /* array of pointers to individually allocated lines */
};
 54
/* Option state shared by the whole file; set while parsing arguments in main(). */
static char *sep = NULL;                /* -t separator, or NULL for blank-separated input */
static char *replace = NULL;            /* -e string printed for fields missing from a line */
static const char defaultofs = ' ';     /* output separator used when no -t was given */
static const int jfield = 1;            /* POSIX default join field */
static int unpairsa = 0, unpairsb = 0;  /* print unpairable lines of file 1 / file 2 */
static int oflag = 0;                   /* nonzero once -o was given */
static int pairs = 1;                   /* print joined pairs; cleared by -v */
static size_t seplen;                   /* length of sep as returned by unescape() */
static struct outlist output;           /* output list filled in by -o */
 64
 65static void
 66usage(void)
 67{
 68	eprintf("usage: %s [-1 field] [-2 field] [-o list] [-e string] "
 69	        "[-a | -v fileno] [-t delim] file1 file2\n", argv0);
 70}
 71
 72static void
 73prfield(struct field *fp)
 74{
 75	if (fwrite(fp->s, 1, fp->len, stdout) != fp->len)
 76		eprintf("fwrite:");
 77}
 78
 79static void
 80prsep(void)
 81{
 82	if (sep)
 83		fwrite(sep, 1, seplen, stdout);
 84	else
 85		putchar(defaultofs);
 86}
 87
 88static void
 89swaplines(struct jline *la, struct jline *lb)
 90{
 91	struct jline tmp;
 92
 93	tmp = *la;
 94	*la = *lb;
 95	*lb = tmp;
 96}
 97
 98static void
 99prjoin(struct jline *la, struct jline *lb, size_t jfa, size_t jfb)
100{
101	struct spec *sp;
102	struct field *joinfield;
103	size_t i;
104
105	if (jfa >= la->nf || jfb >= lb->nf)
106		return;
107
108	joinfield = &la->fields[jfa];
109
110	if (oflag) {
111		for (i = 0; i < output.ns; i++) {
112			sp = output.specs[i];
113
114			if (sp->fileno == 1) {
115				if (sp->fldno < la->nf)
116					prfield(&la->fields[sp->fldno]);
117				else if (replace)
118					fputs(replace, stdout);
119			} else if (sp->fileno == 2) {
120				if (sp->fldno < lb->nf)
121					prfield(&lb->fields[sp->fldno]);
122				else if (replace)
123					fputs(replace, stdout);
124			} else if (sp->fileno == 0) {
125				prfield(joinfield);
126			}
127
128			if (i < output.ns - 1)
129				prsep();
130		}
131	} else {
132		prfield(joinfield);
133		prsep();
134
135		for (i = 0; i < la->nf; i++) {
136			if (i != jfa) {
137				prfield(&la->fields[i]);
138				prsep();
139			}
140		}
141		for (i = 0; i < lb->nf; i++) {
142			if (i != jfb) {
143				prfield(&lb->fields[i]);
144				if (i < lb->nf - 1)
145					prsep();
146			}
147		}
148	}
149	putchar('\n');
150}
151
152static void
153prline(struct jline *lp)
154{
155	if (fwrite(lp->text.data, 1, lp->text.len, stdout) != lp->text.len)
156		eprintf("fwrite:");
157	putchar('\n');
158}
159
160static int
161jlinecmp(struct jline *la, struct jline *lb, size_t jfa, size_t jfb)
162{
163	int status;
164
165	/* return FIELD_ERROR if both lines are short */
166	if (jfa >= la->nf) {
167		status = (jfb >= lb->nf) ? FIELD_ERROR : -1;
168	} else if (jfb >= lb->nf) {
169		status = 1;
170	} else {
171		status = memcmp(la->fields[jfa].s, lb->fields[jfb].s,
172		                MAX(la->fields[jfa].len, lb->fields[jfb].len));
173		LIMIT(status, -1, 1);
174	}
175
176	return status;
177}
178
179static void
180addfield(struct jline *lp, char *sp, size_t len)
181{
182	if (lp->nf >= lp->maxf) {
183		lp->fields = ereallocarray(lp->fields, (GROW * lp->maxf),
184		        sizeof(struct field));
185		lp->maxf *= GROW;
186	}
187	lp->fields[lp->nf].s = sp;
188	lp->fields[lp->nf].len = len;
189	lp->nf++;
190}
191
192static void
193prspanjoin(struct span *spa, struct span *spb, size_t jfa, size_t jfb)
194{
195	size_t i, j;
196
197	for (i = 0; i < (spa->nl - 1); i++)
198		for (j = 0; j < (spb->nl - 1); j++)
199			prjoin(spa->lines[i], spb->lines[j], jfa, jfb);
200}
201
202static struct jline *
203makeline(char *s, size_t len)
204{
205	struct jline *lp;
206	char *tmp;
207	size_t i, end;
208
209	if (s[len - 1] == '\n')
210		s[--len] = '\0';
211
212	lp = ereallocarray(NULL, INIT, sizeof(struct jline));
213	lp->text.data = s;
214	lp->text.len = len;
215	lp->fields = ereallocarray(NULL, INIT, sizeof(struct field));
216	lp->nf = 0;
217	lp->maxf = INIT;
218
219	for (i = 0; i < lp->text.len && isblank(lp->text.data[i]); i++)
220		;
221	while (i < lp->text.len) {
222		if (sep) {
223			if ((lp->text.len - i) < seplen ||
224			    !(tmp = memmem(lp->text.data + i,
225			                   lp->text.len - i, sep, seplen))) {
226				goto eol;
227			}
228			end = tmp - lp->text.data;
229			addfield(lp, lp->text.data + i, end - i);
230			i = end + seplen;
231		} else {
232			for (end = i; !(isblank(lp->text.data[end])); end++) {
233				if (end + 1 == lp->text.len)
234					goto eol;
235			}
236			addfield(lp, lp->text.data + i, end - i);
237			for (i = end; isblank(lp->text.data[i]); i++)
238				;
239		}
240	}
241eol:
242	addfield(lp, lp->text.data + i, lp->text.len - i);
243
244	return lp;
245}
246
247static int
248addtospan(struct span *sp, FILE *fp, int reset)
249{
250	char *newl = NULL;
251	ssize_t len;
252	size_t size = 0;
253
254	if ((len = getline(&newl, &size, fp)) < 0) {
255		if (ferror(fp))
256			eprintf("getline:");
257		else
258			return 0;
259	}
260
261	if (reset)
262		sp->nl = 0;
263
264	if (sp->nl >= sp->maxl) {
265		sp->lines = ereallocarray(sp->lines, (GROW * sp->maxl),
266		        sizeof(struct jline *));
267		sp->maxl *= GROW;
268	}
269
270	sp->lines[sp->nl] = makeline(newl, len);
271	sp->nl++;
272	return 1;
273}
274
275static void
276initspan(struct span *sp)
277{
278	sp->nl = 0;
279	sp->maxl = INIT;
280	sp->lines = ereallocarray(NULL, INIT, sizeof(struct jline *));
281}
282
283static void
284freespan(struct span *sp)
285{
286	size_t i;
287
288	for (i = 0; i < sp->nl; i++) {
289		free(sp->lines[i]->fields);
290		free(sp->lines[i]->text.data);
291	}
292	free(sp->lines);
293}
294
295static void
296initolist(struct outlist *olp)
297{
298	olp->ns = 0;
299	olp->maxs = 1;
300	olp->specs = ereallocarray(NULL, INIT, sizeof(struct spec *));
301}
302
303static void
304addspec(struct outlist *olp, struct spec *sp)
305{
306	if (olp->ns >= olp->maxs) {
307		olp->specs = ereallocarray(olp->specs, (GROW * olp->maxs),
308		        sizeof(struct spec *));
309		olp->maxs *= GROW;
310	}
311	olp->specs[olp->ns] = sp;
312	olp->ns++;
313}
314
315static struct spec *
316makespec(char *s)
317{
318	struct spec *sp;
319	int fileno;
320	size_t fldno;
321
322	if (!strcmp(s, "0")) {   /* join field must be 0 and nothing else */
323		fileno = 0;
324		fldno = 0;
325	} else if ((s[0] == '1' || s[0] == '2') && s[1] == '.') {
326		fileno = s[0] - '0';
327		fldno = estrtonum(&s[2], 1, MIN((unsigned long long)LLONG_MAX, (unsigned long long)SIZE_MAX)) - 1;
328	} else {
329		eprintf("%s: invalid format\n", s);
330	}
331
332	sp = ereallocarray(NULL, INIT, sizeof(struct spec));
333	sp->fileno = fileno;
334	sp->fldno = fldno;
335	return sp;
336}
337
/*
 * Split s in place at each comma, space or tab and append one spec per
 * piece to the output list.  s is modified: each delimiter found is
 * overwritten with a NUL byte.
 */
static void
makeolist(struct outlist *olp, char *s)
{
	char *cur, *next;

	for (cur = s; cur; cur = next) {
		next = strpbrk(cur, ", \t");
		if (next)
			*next++ = '\0';
		addspec(olp, makespec(cur));
	}
}
352
353static void
354freespecs(struct outlist *olp)
355{
356	size_t i;
357
358	for (i = 0; i < olp->ns; i++)
359		free(olp->specs[i]);
360}
361
362static void
363join(FILE *fa, FILE *fb, size_t jfa, size_t jfb)
364{
365	struct span spa, spb;
366	int cmp, eofa, eofb;
367
368	initspan(&spa);
369	initspan(&spb);
370	cmp = eofa = eofb = 0;
371
372	addtospan(&spa, fa, RESET);
373	addtospan(&spb, fb, RESET);
374
375	while (spa.nl && spb.nl) {
376		if ((cmp = jlinecmp(spa.lines[0], spb.lines[0], jfa, jfb)) < 0) {
377			if (unpairsa)
378				prline(spa.lines[0]);
379			if (!addtospan(&spa, fa, RESET)) {
380				if (unpairsb) {    /* a is EOF'd; print the rest of b */
381					do
382						prline(spb.lines[0]);
383					while (addtospan(&spb, fb, RESET));
384				}
385				eofa = eofb = 1;
386			} else {
387				continue;
388			}
389		} else if (cmp > 0) {
390			if (unpairsb)
391				prline(spb.lines[0]);
392			if (!addtospan(&spb, fb, RESET)) {
393				if (unpairsa) {    /* b is EOF'd; print the rest of a */
394					do
395						prline(spa.lines[0]);
396					while (addtospan(&spa, fa, RESET));
397				}
398				eofa = eofb = 1;
399			} else {
400				continue;
401			}
402		} else if (cmp == 0) {
403			/* read all consecutive matching lines from a */
404			do {
405				if (!addtospan(&spa, fa, EXPAND)) {
406					eofa = 1;
407					spa.nl++;
408					break;
409				}
410			} while (jlinecmp(spa.lines[spa.nl-1], spb.lines[0], jfa, jfb) == 0);
411
412			/* read all consecutive matching lines from b */
413			do {
414				if (!addtospan(&spb, fb, EXPAND)) {
415					eofb = 1;
416					spb.nl++;
417					break;
418				}
419			} while (jlinecmp(spa.lines[0], spb.lines[spb.nl-1], jfa, jfb) == 0);
420
421			if (pairs)
422				prspanjoin(&spa, &spb, jfa, jfb);
423
424		} else {      /* FIELD_ERROR: both lines lacked join fields */
425			if (unpairsa)
426				prline(spa.lines[0]);
427			if (unpairsb)
428				prline(spb.lines[0]);
429			eofa = addtospan(&spa, fa, RESET) ? 0 : 1;
430			eofb = addtospan(&spb, fb, RESET) ? 0 : 1;
431			if (!eofa && !eofb)
432				continue;
433		}
434
435		if (eofa) {
436			spa.nl = 0;
437		} else {
438			swaplines(spa.lines[0], spa.lines[spa.nl - 1]);   /* ugly */
439			spa.nl = 1;
440		}
441
442		if (eofb) {
443			spb.nl = 0;
444		} else {
445			swaplines(spb.lines[0], spb.lines[spb.nl - 1]);   /* ugly */
446			spb.nl = 1;
447		}
448	}
449	freespan(&spa);
450	freespan(&spb);
451}
452
453
454// ?man join: join lines on common field
455// ?man join lines of two sorted files on a common field
456int
457main(int argc, char *argv[])
458{
459	size_t jf[2] = { jfield, jfield, };
460	FILE *fp[2];
461	int ret = 0, n;
462	char *fno;
463
464	ARGBEGIN {
465	// ?man -1:num: specify option flag
466	case '1':
467		jf[0] = estrtonum(EARGF(usage()), 1, MIN((unsigned long long)LLONG_MAX, (unsigned long long)SIZE_MAX));
468		break;
469	// ?man -2:num: specify option flag
470	case '2':
471		jf[1] = estrtonum(EARGF(usage()), 1, MIN((unsigned long long)LLONG_MAX, (unsigned long long)SIZE_MAX));
472		break;
473	// ?man -a:str: print or show all entries
474	case 'a':
475		fno = EARGF(usage());
476		if (strcmp(fno, "1") == 0)
477			unpairsa = 1;
478		else if (strcmp(fno, "2") == 0)
479			unpairsb = 1;
480		else
481			usage();
482		break;
483	// ?man -e:str: specify expression or pattern
484	case 'e':
485		replace = EARGF(usage());
486		break;
487	// ?man -o:str: specify output file
488	case 'o':
489		oflag = 1;
490		initolist(&output);
491		makeolist(&output, EARGF(usage()));
492		break;
493	// ?man -t:str: sort or specify timestamp
494	case 't':
495		sep = EARGF(usage());
496		break;
497	// ?man -v:str: verbose mode; show progress
498	case 'v':
499		pairs = 0;
500		fno = EARGF(usage());
501		if (strcmp(fno, "1") == 0)
502			unpairsa = 1;
503		else if (strcmp(fno, "2") == 0)
504			unpairsb = 1;
505		else
506			usage();
507		break;
508	default:
509		usage();
510	} ARGEND
511
512	if (sep)
513		seplen = unescape(sep);
514
515	if (argc != 2)
516		usage();
517
518	for (n = 0; n < 2; n++) {
519		if (!strcmp(argv[n], "-")) {
520			argv[n] = "<stdin>";
521			fp[n] = stdin;
522		} else if (!(fp[n] = fopen(argv[n], "r"))) {
523			eprintf("fopen %s:", argv[n]);
524		}
525	}
526
527	jf[0]--;
528	jf[1]--;
529
530	join(fp[0], fp[1], jf[0], jf[1]);
531
532	if (oflag)
533		freespecs(&output);
534
535	if (fshut(fp[0], argv[0]) | (fp[0] != fp[1] && fshut(fp[1], argv[1])) |
536	    fshut(stdout, "<stdout>"))
537		ret = 2;
538
539	return ret;
540}