master xplshn/aruu / cmd / posix / sort.c
  1/* See LICENSE file for copyright and license details. */
  2
  3
  4#include "queue.h"
  5#include "text.h"
  6#include "utf.h"
  7#include "util.h"
  8
  9#include <ctype.h>
 10#include <stdio.h>
 11#include <stdlib.h>
 12#include <string.h>
 13
 14struct keydef {
 15	int start_column;
 16	int end_column;
 17	int start_char;
 18	int end_char;
 19	int flags;
 20	TAILQ_ENTRY(keydef) entry;
 21};
 22
 23struct column {
 24	struct line line;
 25	size_t cap;
 26};
 27
 28enum {
 29	MOD_N      = 1 << 0,
 30	MOD_STARTB = 1 << 1,
 31	MOD_ENDB   = 1 << 2,
 32	MOD_R      = 1 << 3,
 33	MOD_D      = 1 << 4,
 34	MOD_F      = 1 << 5,
 35	MOD_I      = 1 << 6,
 36};
 37
 38static TAILQ_HEAD(kdhead, keydef) kdhead = TAILQ_HEAD_INITIALIZER(kdhead);
 39
 40static int Cflag = 0, cflag = 0, uflag = 0;
 41#if FEATURE_SORT_BIG
 42static int zflag = 0;
 43#endif
 44#if FEATURE_SORT_STABLE
 45static int sflag = 0;
 46#endif
 47static char *fieldsep = NULL;
 48static size_t fieldseplen = 0;
 49static struct column col1, col2;
 50
 51static void
 52skipblank(struct line *a)
 53{
 54	while (a->len && (*(a->data) == ' ' || *(a->data) == '\t')) {
 55		a->data++;
 56		a->len--;
 57	}
 58}
 59
 60static void
 61skipnonblank(struct line *a)
 62{
 63	while (a->len && (*(a->data) != '\n' && *(a->data) != ' ' &&
 64	                  *(a->data) != '\t')) {
 65		a->data++;
 66		a->len--;
 67	}
 68}
 69
 70static void
 71skipcolumn(struct line *a, int skip_to_next_col)
 72{
 73	char *s;
 74
 75	if (fieldsep) {
 76		if ((s = memmem(a->data, a->len, fieldsep, fieldseplen))) {
 77			if (skip_to_next_col)
 78				s += fieldseplen;
 79			a->len -= s - a->data;
 80			a->data = s;
 81		} else {
 82			a->data += a->len - 1;
 83			a->len = 1;
 84		}
 85	} else {
 86		skipblank(a);
 87		skipnonblank(a);
 88	}
 89}
 90
 91static void
 92columns(struct line *line, const struct keydef *kd, struct column *col)
 93{
 94	Rune r;
 95	struct line start, end;
 96	size_t utflen, rlen;
 97	int i;
 98
 99	start.data = line->data;
100	start.len = line->len;
101	for (i = 1; i < kd->start_column; i++)
102		skipcolumn(&start, 1);
103	if (kd->flags & MOD_STARTB)
104		skipblank(&start);
105	for (utflen = 0; start.len > 1 && utflen < (size_t)(kd->start_char - 1);) {
106		rlen = chartorune(&r, start.data);
107		start.data += rlen;
108		start.len -= rlen;
109		utflen++;
110	}
111
112	end.data = line->data;
113	end.len = line->len;
114	if (kd->end_column) {
115		for (i = 1; i < kd->end_column; i++)
116			skipcolumn(&end, 1);
117		if (kd->flags & MOD_ENDB)
118			skipblank(&end);
119		if (kd->end_char) {
120			for (utflen = 0; end.len > 1 && utflen < (size_t)kd->end_char;) {
121				rlen = chartorune(&r, end.data);
122				end.data += rlen;
123				end.len -= rlen;
124				utflen++;
125			}
126		} else {
127			skipcolumn(&end, 0);
128		}
129	} else {
130		end.data += end.len - 1;
131		end.len = 1;
132	}
133	col->line.len = MAX(0, end.data - start.data);
134	if (!(col->line.data) || col->cap < col->line.len + 1) {
135		free(col->line.data);
136		col->line.data = emalloc(col->line.len + 1);
137		col->cap = col->line.len + 1;
138	}
139	memcpy(col->line.data, start.data, col->line.len);
140	col->line.data[col->line.len] = '\0';
141}
142
143static int
144skipmodcmp(struct line *a, struct line *b, int flags)
145{
146	Rune r1, r2;
147	size_t offa = 0, offb = 0;
148
149	do {
150		offa += chartorune(&r1, a->data + offa);
151		offb += chartorune(&r2, b->data + offb);
152
153		if (flags & MOD_D && flags & MOD_I) {
154			while (offa < a->len && ((!isblankrune(r1) &&
155			       !isalnumrune(r1)) || (!isprintrune(r1))))
156				offa += chartorune(&r1, a->data + offa);
157			while (offb < b->len && ((!isblankrune(r2) &&
158			       !isalnumrune(r2)) || (!isprintrune(r2))))
159				offb += chartorune(&r2, b->data + offb);
160		}
161		else if (flags & MOD_D) {
162			while (offa < a->len && !isblankrune(r1) &&
163			       !isalnumrune(r1))
164				offa += chartorune(&r1, a->data + offa);
165			while (offb < b->len && !isblankrune(r2) &&
166			       !isalnumrune(r2))
167				offb += chartorune(&r2, b->data + offb);
168		}
169		else if (flags & MOD_I) {
170			while (offa < a->len && !isprintrune(r1))
171				offa += chartorune(&r1, a->data + offa);
172			while (offb < b->len && !isprintrune(r2))
173				offb += chartorune(&r2, b->data + offb);
174		}
175		if (flags & MOD_F) {
176			r1 = toupperrune(r1);
177			r2 = toupperrune(r2);
178		}
179	} while (r1 && r1 == r2);
180
181	return r1 - r2;
182}
183
184#if FEATURE_SORT_BIG
185static void
186getlines_z(FILE *fp, struct linebuf *b)
187{
188	char *line = NULL;
189	size_t size = 0, linelen = 0;
190	ssize_t len;
191
192	while ((len = getdelim(&line, &size, '\0', fp)) > 0) {
193		if (++b->nlines > b->capacity) {
194			b->capacity += 512;
195			b->lines = ereallocarray(b->lines, b->capacity, sizeof(*b->lines));
196		}
197		linelen = len;
198		b->lines[b->nlines - 1].data = memcpy(emalloc(linelen + 1), line, linelen + 1);
199		b->lines[b->nlines - 1].len = linelen;
200	}
201	free(line);
202	if (b->lines && b->nlines && linelen && b->lines[b->nlines - 1].data[linelen - 1] != '\0') {
203		b->lines[b->nlines - 1].data = erealloc(b->lines[b->nlines - 1].data, linelen + 2);
204		b->lines[b->nlines - 1].data[linelen] = '\0';
205		b->lines[b->nlines - 1].data[linelen + 1] = '\0';
206		b->lines[b->nlines - 1].len++;
207	}
208}
209#endif
210
211static int
212slinecmp(struct line *a, struct line *b)
213{
214	int res = 0;
215	double x, y;
216	struct keydef *kd;
217
218	TAILQ_FOREACH(kd, &kdhead, entry) {
219		columns(a, kd, &col1);
220		columns(b, kd, &col2);
221
222		/* if -u is given, don't use default key definition
223		 * unless it is the only one */
224		if (uflag && kd == TAILQ_LAST(&kdhead, kdhead) &&
225		    TAILQ_LAST(&kdhead, kdhead) != TAILQ_FIRST(&kdhead)) {
226			res = 0;
227		} else if (kd->flags & MOD_N) {
228			x = strtod(col1.line.data, NULL);
229			y = strtod(col2.line.data, NULL);
230			res = (x < y) ? -1 : (x > y);
231		} else if (kd->flags & (MOD_D | MOD_F | MOD_I)) {
232			res = skipmodcmp(&col1.line, &col2.line, kd->flags);
233		} else {
234			res = linecmp(&col1.line, &col2.line);
235		}
236
237		if (kd->flags & MOD_R)
238			res = -res;
239		if (res)
240			break;
241	}
242
243	return res;
244}
245
246#if FEATURE_SORT_STABLE
247struct sort_item {
248	struct line line;
249	size_t index;
250};
251
252static int
253slinecmp_stable(const struct sort_item *a, const struct sort_item *b)
254{
255	int res = slinecmp((struct line *)&a->line, (struct line *)&b->line);
256	if (res == 0)
257		return (a->index < b->index) ? -1 : 1;
258	return res;
259}
260#endif
261
262static int
263check(FILE *fp, const char *fname)
264{
265	static struct line prev, cur, tmp;
266	static size_t prevsize, cursize, tmpsize;
267	ssize_t len;
268	int delim = '\n';
269
270#if FEATURE_SORT_BIG
271	if (zflag)
272		delim = '\0';
273#endif
274
275	if (!prev.data) {
276		if ((len = getdelim(&prev.data, &prevsize, delim, fp)) < 0) {
277			if (feof(fp))
278				return 0;
279			eprintf("getdelim:");
280		}
281		prev.len = len;
282	}
283	while ((len = getdelim(&cur.data, &cursize, delim, fp)) > 0) {
284		cur.len = len;
285		if (uflag > slinecmp(&cur, &prev)) {
286			if (!Cflag) {
287				weprintf("disorder %s: ", fname);
288				fwrite(cur.data, 1, cur.len, stderr);
289			}
290			return 1;
291		}
292		tmp = cur;
293		tmpsize = cursize;
294		cur = prev;
295		cursize = prevsize;
296		prev = tmp;
297		prevsize = tmpsize;
298	}
299
300	return 0;
301}
302
303static int
304parse_flags(char **s, int *flags, int bflag)
305{
306	while (isalpha((int)**s)) {
307		switch (*((*s)++)) {
308		// ?man -b: specify block size or base directory
309	case 'b':
310			*flags |= bflag;
311			break;
312		// ?man -d: specify directory
313	case 'd':
314			*flags |= MOD_D;
315			break;
316		// ?man -f: force the operation
317	case 'f':
318			*flags |= MOD_F;
319			break;
320		// ?man -i: interactive mode or prompt for confirmation
321	case 'i':
322			*flags |= MOD_I;
323			break;
324		// ?man -n: print line numbers or counts
325	case 'n':
326			*flags |= MOD_N;
327			break;
328		// ?man -r: operate recursively
329	case 'r':
330			*flags |= MOD_R;
331			break;
332		default:
333			return -1;
334		}
335	}
336
337	return 0;
338}
339
340static void
341addkeydef(char *kdstr, int flags)
342{
343	struct keydef *kd;
344
345	kd = enmalloc(2, sizeof(*kd));
346
347	/* parse key definition kdstr with format
348	 * start_column[.start_char][flags][,end_column[.end_char][flags]]
349	 */
350	kd->start_column = 1;
351	kd->start_char = 1;
352	kd->end_column = 0; /* 0 means end of line */
353	kd->end_char = 0;   /* 0 means end of column */
354	kd->flags = flags;
355
356	if ((kd->start_column = strtol(kdstr, &kdstr, 10)) < 1)
357		enprintf(2, "invalid start column in key definition\n");
358
359	if (*kdstr == '.') {
360		if ((kd->start_char = strtol(kdstr + 1, &kdstr, 10)) < 1)
361			enprintf(2, "invalid start character in key "
362			         "definition\n");
363	}
364	if (parse_flags(&kdstr, &kd->flags, MOD_STARTB) < 0)
365		enprintf(2, "invalid start flags in key definition\n");
366
367	if (*kdstr == ',') {
368		if ((kd->end_column = strtol(kdstr + 1, &kdstr, 10)) < 0)
369			enprintf(2, "invalid end column in key definition\n");
370		if (*kdstr == '.') {
371			if ((kd->end_char = strtol(kdstr + 1, &kdstr, 10)) < 0)
372				enprintf(2, "invalid end character in key "
373				         "definition\n");
374		}
375		if (parse_flags(&kdstr, &kd->flags, MOD_ENDB) < 0)
376			enprintf(2, "invalid end flags in key definition\n");
377	}
378
379	if (*kdstr != '\0')
380		enprintf(2, "invalid key definition\n");
381
382	TAILQ_INSERT_TAIL(&kdhead, kd, entry);
383}
384
385static void
386usage(void)
387{
388	enprintf(2, "usage: %s [-Cbcdfimnru"
389#if FEATURE_SORT_STABLE
390	         "s"
391#endif
392#if FEATURE_SORT_BIG
393	         "z"
394#endif
395	         "] [-o outfile] [-t delim] [-k def]... [file ...]\n", argv0);
396}
397
398// ?man sort: sort lines
399// ?man arguments: -Cbcdfimnru
400// ?man sort or merge lines of text files
401int
402main(int argc, char *argv[])
403{
404	FILE *fp, *ofp = stdout;
405	struct linebuf linebuf = EMPTY_LINEBUF;
406	size_t i;
407	int global_flags = 0, ret = 0;
408	char *outfile = NULL;
409
410	ARGBEGIN {
411	// ?man -C: specify option flag
412	case 'C':
413		Cflag = 1;
414		break;
415	// ?man -b: specify block size or base directory
416	case 'b':
417		global_flags |= MOD_STARTB | MOD_ENDB;
418		break;
419	// ?man -c: print count or perform stdout action
420	case 'c':
421		cflag = 1;
422		break;
423	// ?man -d: specify directory
424	case 'd':
425		global_flags |= MOD_D;
426		break;
427	// ?man -f: force the operation
428	case 'f':
429		global_flags |= MOD_F;
430		break;
431	// ?man -i: interactive mode or prompt for confirmation
432	case 'i':
433		global_flags |= MOD_I;
434		break;
435	// ?man -k:str: specify option flag
436	case 'k':
437		addkeydef(EARGF(usage()), global_flags);
438		break;
439	// ?man -m: specify mode or limit
440	case 'm':
441		/* more or less for free, but for performance-reasons,
442		 * we should keep this flag in mind and maybe some later
443		 * day implement it properly so we don't run out of memory
444		 * while merging large sorted files.
445		 */
446		break;
447	// ?man -n: print line numbers or counts
448	case 'n':
449		global_flags |= MOD_N;
450		break;
451	// ?man -o:file: specify output file
452	case 'o':
453		outfile = EARGF(usage());
454		break;
455	// ?man -r: operate recursively
456	case 'r':
457		global_flags |= MOD_R;
458		break;
459#if FEATURE_SORT_STABLE
460	// ?man -s: silent mode or print summary
461	case 's':
462		sflag = 1;
463		break;
464#endif
465	// ?man -t:str: sort or specify timestamp
466	case 't':
467		fieldsep = EARGF(usage());
468		if (!*fieldsep)
469			eprintf("empty delimiter\n");
470		fieldseplen = unescape(fieldsep);
471		break;
472	// ?man -u: unbuffered output
473	case 'u':
474		uflag = 1;
475		break;
476#if FEATURE_SORT_BIG
477	// ?man -z: specify option flag
478	case 'z':
479		zflag = 1;
480		break;
481#endif
482	default:
483		usage();
484	} ARGEND
485
486	/* -b shall only apply to custom key definitions */
487	if (TAILQ_EMPTY(&kdhead) && global_flags)
488		addkeydef("1", global_flags & ~(MOD_STARTB | MOD_ENDB));
489	if (TAILQ_EMPTY(&kdhead) || (!Cflag && !cflag))
490		addkeydef("1", global_flags & MOD_R);
491
492	if (!argc) {
493		if (Cflag || cflag) {
494			if (check(stdin, "<stdin>") && !ret)
495				ret = 1;
496		} else {
497#if FEATURE_SORT_BIG
498			if (zflag)
499				getlines_z(stdin, &linebuf);
500			else
501#endif
502				getlines(stdin, &linebuf);
503		}
504	} else for (; *argv; argc--, argv++) {
505		if (!strcmp(*argv, "-")) {
506			*argv = "<stdin>";
507			fp = stdin;
508		} else if (!(fp = fopen(*argv, "r"))) {
509			enprintf(2, "fopen %s:", *argv);
510			continue;
511		}
512		if (Cflag || cflag) {
513			if (check(fp, *argv) && !ret)
514				ret = 1;
515		} else {
516#if FEATURE_SORT_BIG
517			if (zflag)
518				getlines_z(fp, &linebuf);
519			else
520#endif
521				getlines(fp, &linebuf);
522		}
523		if (fp != stdin && fshut(fp, *argv))
524			ret = 2;
525	}
526
527	if (!Cflag && !cflag) {
528		if (outfile && !(ofp = fopen(outfile, "w")))
529			eprintf("fopen %s:", outfile);
530
531#if FEATURE_SORT_STABLE
532		if (sflag) {
533			struct sort_item *items = ecalloc(linebuf.nlines, sizeof(*items));
534			for (i = 0; i < linebuf.nlines; i++) {
535				items[i].line = linebuf.lines[i];
536				items[i].index = i;
537			}
538			qsort(items, linebuf.nlines, sizeof(*items),
539			      (int (*)(const void *, const void *))slinecmp_stable);
540			for (i = 0; i < linebuf.nlines; i++) {
541				linebuf.lines[i] = items[i].line;
542			}
543			free(items);
544		} else {
545#endif
546			qsort(linebuf.lines, linebuf.nlines, sizeof(*linebuf.lines),
547			      (int (*)(const void *, const void *))slinecmp);
548#if FEATURE_SORT_STABLE
549		}
550#endif
551
552		for (i = 0; i < linebuf.nlines; i++) {
553			if (!uflag || i == 0 ||
554			    slinecmp(&linebuf.lines[i], &linebuf.lines[i - 1])) {
555				fwrite(linebuf.lines[i].data, 1,
556				       linebuf.lines[i].len, ofp);
557			}
558		}
559	}
560
561	if (fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>") |
562	    fshut(stderr, "<stderr>"))
563		ret = 2;
564
565	return ret;
566}