master xplshn/aruu / cmd / net / wget.c
  1/* see license file for copyright and license details */
  2
  3
  4#include "util.h"
  5#include "arg.h"
  6#include "tls.h"
  7
  8#include <arpa/inet.h>
  9#include <ctype.h>
 10#include <errno.h>
 11#include <fcntl.h>
 12#include <netdb.h>
 13#include <netinet/in.h>
 14#include <stdio.h>
 15#include <stdlib.h>
 16#include <string.h>
 17#include <sys/socket.h>
 18#include <sys/stat.h>
 19#include <sys/types.h>
 20#include <unistd.h>
 21
 22struct Stream {
 23	struct TlsSocket *ts;
 24	char buf[8192];
 25	size_t len;
 26	size_t idx;
 27};
 28
 29static int qflag = 0;
 30static int Sflag = 0;
 31static int cflag = 0;
 32static int spider = 0;
 33static int no_check_certificate = 0;
 34static int timeout_sec = 900;
 35static char *Pflag = NULL;
 36static char *Oflag = NULL;
 37static char *user_agent = "wget/aruu";
 38static char *post_data = NULL;
 39static char *post_file = NULL;
 40static char **custom_headers = NULL;
 41static size_t custom_headers_num = 0;
 42
 43static void
 44usage(void)
 45{
 46	eprintf("usage: %s [-cqS] [-O file] [-P dir] [-T timeout] [-U user_agent] "
 47	        "[-post-data data] [-post-file file] [-header header] "
 48	        "[-no-check-certificate] [-spider] url\n", argv0);
 49}
 50
 51static void
 52add_header(const char *hdr)
 53{
 54	custom_headers = ereallocarray(custom_headers, custom_headers_num + 1, sizeof(*custom_headers));
 55	custom_headers[custom_headers_num++] = estrdup(hdr);
 56}
 57
 58static int
 59dial(const char *host, const char *port)
 60{
 61	struct addrinfo hints, *res, *rp;
 62	int fd = -1, r;
 63
 64	memset(&hints, 0, sizeof(hints));
 65	hints.ai_family = AF_UNSPEC;
 66	hints.ai_socktype = SOCK_STREAM;
 67
 68	r = getaddrinfo(host, port, &hints, &res);
 69	if (r != 0) {
 70		if (!qflag)
 71			weprintf("getaddrinfo %s:%s: %s\n", host, port, gai_strerror(r));
 72		return -1;
 73	}
 74
 75	for (rp = res; rp; rp = rp->ai_next) {
 76		fd = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol);
 77		if (fd < 0)
 78			continue;
 79		if (timeout_sec > 0) {
 80			struct timeval tv;
 81			tv.tv_sec = timeout_sec;
 82			tv.tv_usec = 0;
 83			setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv));
 84			setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 85		}
 86		if (connect(fd, rp->ai_addr, rp->ai_addrlen) == 0)
 87			break;
 88		close(fd);
 89		fd = -1;
 90	}
 91
 92	freeaddrinfo(res);
 93	return fd;
 94}
 95
 96static void
 97parse_url(char *url, char **host, char **port, char **path, int *is_tls)
 98{
 99	char *p, *ss;
100
101	*is_tls = 0;
102	if (strncasecmp(url, "http://", 7) == 0) {
103		url += 7;
104	} else if (strncasecmp(url, "https://", 8) == 0) {
105		url += 8;
106		*is_tls = 1;
107	} else {
108		eprintf("unsupported protocol or invalid url: %s\n", url);
109	}
110
111	*host = url;
112	p = strchr(url, '/');
113	if (p) {
114		*p = '\0';
115		*path = p + 1;
116	} else {
117		*path = "";
118	}
119
120	/* handle ipv6 brackets or host:port */
121	if (**host == '[') {
122		(*host)++;
123		ss = strchr(*host, ']');
124		if (ss) {
125			*ss = '\0';
126			ss++;
127			if (*ss == ':')
128				*port = ss + 1;
129			else
130				*port = *is_tls ? "443" : "80";
131		} else {
132			eprintf("invalid ipv6 literal: %s\n", *host);
133		}
134	} else {
135		p = strrchr(*host, ':');
136		if (p) {
137			*p = '\0';
138			*port = p + 1;
139		} else {
140			*port = *is_tls ? "443" : "80";
141		}
142	}
143}
144
/*
 * Look up a header field in a block of header lines.
 *
 * headers is a NUL-terminated string of lines separated by '\n' (a '\r'
 * before it is tolerated); name is the field name including its trailing
 * ':' (e.g. "Location:") and is compared case-insensitively against the
 * start of each line.
 *
 * Returns a newly allocated copy of the first matching field's value with
 * leading and trailing spaces/tabs removed, or NULL when no line matches.
 * The caller frees the result.
 */
static char *
find_header(const char *headers, const char *name)
{
	const char *line, *val;
	size_t name_len = strlen(name);
	size_t val_len;

	line = headers;
	while (line && *line) {
		if (strncasecmp(line, name, name_len) == 0) {
			val = line + name_len;
			while (*val == ' ' || *val == '\t')
				val++;
			val_len = strcspn(val, "\r\n");
			/* optional whitespace after the value is not part of it */
			while (val_len > 0 &&
			       (val[val_len - 1] == ' ' || val[val_len - 1] == '\t'))
				val_len--;
			return estrndup(val, val_len);
		}
		line = strchr(line, '\n');
		if (line)
			line++;
	}
	return NULL;
}
166
167static int
168stream_getc(struct Stream *s)
169{
170	ssize_t r;
171
172	if (s->idx < s->len) {
173		return (unsigned char)s->buf[s->idx++];
174	}
175	s->idx = 0;
176	r = tls_read(s->ts, s->buf, sizeof(s->buf));
177	if (r <= 0) {
178		s->len = 0;
179		return EOF;
180	}
181	s->len = (size_t)r;
182	return (unsigned char)s->buf[s->idx++];
183}
184
185static size_t
186stream_read(struct Stream *s, void *ptr, size_t size)
187{
188	size_t total = 0;
189	size_t n;
190	char *p = ptr;
191	ssize_t r;
192
193	while (total < size) {
194		if (s->idx < s->len) {
195			n = MIN(size - total, s->len - s->idx);
196			memcpy(p + total, s->buf + s->idx, n);
197			s->idx += n;
198			total += n;
199		} else {
200			s->idx = 0;
201			r = tls_read(s->ts, s->buf, sizeof(s->buf));
202			if (r <= 0) {
203				s->len = 0;
204				break;
205			}
206			s->len = (size_t)r;
207		}
208	}
209	return total;
210}
211
/*
 * Decode a "Transfer-Encoding: chunked" body from s and write the payload
 * to out_fd.
 *
 * Each chunk starts with a line holding its size in hexadecimal (anything
 * after the digits, such as a chunk extension, is ignored), followed by
 * that many bytes and a CRLF.  A size of zero ends the body.
 *
 * A size line that does not start with a hex number, or whose value is
 * negative or out of range, is reported as an error rather than being
 * read as "size 0", which would silently truncate the download.
 * Premature end of stream and write failures are reported through
 * eprintf() as well.
 */
static void
read_chunked(struct Stream *s, int out_fd)
{
	char line[128];
	char chunk_buf[8192];
	char *end;
	size_t line_len, n, want;
	long long chunk_size, remaining;
	int c;

	for (;;) {
		/* read the size line; '\r' is dropped and overlong lines are cut */
		line_len = 0;
		for (;;) {
			c = stream_getc(s);
			if (c == EOF)
				eprintf("unexpected end of file reading chunk size\n");
			if (c == '\n')
				break;
			if (c != '\r' && line_len < sizeof(line) - 1)
				line[line_len++] = c;
		}
		line[line_len] = '\0';

		errno = 0;
		chunk_size = strtoll(line, &end, 16);
		if (end == line || chunk_size < 0 || errno == ERANGE)
			eprintf("invalid chunk size: %s\n", line);

		if (chunk_size == 0) {
			/* skip the CRLF that follows the last-chunk line */
			stream_getc(s);
			stream_getc(s);
			break;
		}

		remaining = chunk_size;
		while (remaining > 0) {
			want = sizeof(chunk_buf);
			if (remaining < (long long)sizeof(chunk_buf))
				want = (size_t)remaining;
			n = stream_read(s, chunk_buf, want);
			if (n == 0)
				eprintf("unexpected end of file in chunk data\n");
			if (writeall(out_fd, chunk_buf, n) < 0)
				eprintf("write output:\n");
			remaining -= n;
		}

		/* skip the CRLF that terminates the chunk data */
		stream_getc(s);
		stream_getc(s);
	}
}
257
/*
 * Copy a non-chunked body from s to out_fd.
 *
 * With content_len >= 0 exactly that many bytes are copied and an early
 * end of stream is an error.  With a negative content_len the body is
 * copied until the stream ends.
 */
static void
read_non_chunked(struct Stream *s, int out_fd, long long content_len)
{
	char io[8192];
	int bounded = content_len >= 0;
	long long left = content_len;
	size_t want, got;

	for (;;) {
		if (bounded && left <= 0)
			break;

		want = sizeof(io);
		if (bounded && left < (long long)sizeof(io))
			want = (size_t)left;

		got = stream_read(s, io, want);
		if (got == 0) {
			if (bounded)
				eprintf("unexpected end of file\n");
			break;
		}

		if (writeall(out_fd, io, got) < 0)
			eprintf("write output:\n");
		if (bounded)
			left -= got;
	}
}
281
/*
 * Format a piece of the request printf-style and send it with tls_write().
 *
 * Output that fits is formatted in a 1024-byte stack buffer.  Longer
 * output (long paths, user agents or custom headers) is formatted again
 * into a heap buffer of the exact size: vsnprintf() returns the length the
 * full output needs, so passing that length together with the truncated
 * stack buffer would make tls_write() read past its end.
 *
 * Nothing is sent when formatting fails or produces no output.  The result
 * of tls_write() is not checked here.
 */
static void
req_printf(struct TlsSocket *ts, const char *fmt, ...)
{
	va_list ap;
	char buf[1024];
	char *big;
	int len;

	va_start(ap, fmt);
	len = vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);
	if (len <= 0)
		return;

	if ((size_t)len < sizeof(buf)) {
		tls_write(ts, buf, len);
		return;
	}

	/* truncated: redo the formatting into a buffer that is large enough */
	big = malloc((size_t)len + 1);
	if (!big) {
		perror("malloc");
		exit(1);
	}
	va_start(ap, fmt);
	vsnprintf(big, (size_t)len + 1, fmt, ap);
	va_end(ap);
	tls_write(ts, big, len);
	free(big);
}
295
296// ?man wget: retrieve files from the web
297// ?man arguments: url
298// ?man download files over http or https
299int
300main(int argc, char *argv[])
301{
302	struct Stream s;
303	char *url, *host, *port, *path, *loc;
304	char *curr_host, *curr_port, *curr_path;
305	char *new_url;
306	char *cl_str;
307	char *te_str;
308	char *header_end;
309	char *out_name;
310	int redirects = 0;
311	int max_redirects = 20;
312	int sock_fd = -1;
313	int out_fd = 1;
314	int chunked;
315	int status;
316	long long content_len;
317	size_t total_read;
318	ssize_t n;
319	size_t dir_len;
320	char *last_slash;
321	int is_tls = 0;
322	struct TlsSocket *tls_sock = NULL;
323	off_t resume_offset = 0;
324	int out_mode = O_WRONLY | O_CREAT | O_TRUNC;
325	long long post_len = 0;
326	int post_fd = -1;
327	size_t i;
328
329	ARGBEGIN {
330	// ?man -O:str: specify output file path
331	case 'O':
332		Oflag = EARGF(usage());
333		break;
334	// ?man -P:str: specify output directory prefix
335	case 'P':
336		Pflag = EARGF(usage());
337		break;
338	// ?man -T:num: set network read and connect timeout
339	case 'T':
340		timeout_sec = estrtonum(EARGF(usage()), 0, 100000);
341		break;
342	// ?man -U:str: set User-Agent header
343	case 'U':
344		user_agent = EARGF(usage());
345		break;
346	// ?man -c: continue retrieval of aborted transfer
347	case 'c':
348		cflag = 1;
349		break;
350	// ?man -q: quiet mode to suppress stderr output
351	case 'q':
352		qflag = 1;
353		break;
354	// ?man -S: print server response headers to stderr
355	case 'S':
356		Sflag = 1;
357		break;
358	// ?man --: specify - option
359	case '-':
360		if (strcmp(argv[0], "-no-check-certificate") == 0) {
361			no_check_certificate = 1;
362			brk_ = 1;
363		} else if (strncmp(argv[0], "-header=", 8) == 0) {
364			add_header(argv[0] + 8);
365			brk_ = 1;
366		} else if (strcmp(argv[0], "-header") == 0) {
367			brk_ = 1;
368			if (!argv[1])
369				usage();
370			add_header(argv[1]);
371			argv++;
372			argc--;
373		} else if (strncmp(argv[0], "-post-data=", 11) == 0) {
374			post_data = argv[0] + 11;
375			brk_ = 1;
376		} else if (strcmp(argv[0], "-post-data") == 0) {
377			brk_ = 1;
378			if (!argv[1])
379				usage();
380			post_data = argv[1];
381			argv++;
382			argc--;
383		} else if (strncmp(argv[0], "-post-file=", 11) == 0) {
384			post_file = argv[0] + 11;
385			brk_ = 1;
386		} else if (strcmp(argv[0], "-post-file") == 0) {
387			brk_ = 1;
388			if (!argv[1])
389				usage();
390			post_file = argv[1];
391			argv++;
392			argc--;
393		} else if (strcmp(argv[0], "-spider") == 0) {
394			spider = 1;
395			brk_ = 1;
396		} else {
397			usage();
398		}
399		break;
400	default:
401		usage();
402	} ARGEND
403
404	if (argc < 1)
405		usage();
406
407	url = estrdup(argv[0]);
408
409	/* determine output filename early to check for resume */
410	out_name = NULL;
411	if (Oflag) {
412		out_name = Oflag;
413	} else {
414		last_slash = strrchr(url, '/');
415		if (last_slash && *(last_slash + 1))
416			out_name = last_slash + 1;
417		else
418			out_name = "index.html";
419
420		if (Pflag) {
421			char *tmp = emalloc(strlen(Pflag) + 1 + strlen(out_name) + 1);
422			sprintf(tmp, "%s/%s", Pflag, out_name);
423			out_name = tmp;
424		}
425	}
426
427	if (cflag && out_name && strcmp(out_name, "-") != 0) {
428		struct stat st;
429		if (stat(out_name, &st) == 0 && S_ISREG(st.st_mode)) {
430			resume_offset = st.st_size;
431		}
432	}
433
434	if (post_data) {
435		post_len = strlen(post_data);
436	} else if (post_file) {
437		struct stat st;
438		post_fd = open(post_file, O_RDONLY);
439		if (post_fd < 0)
440			eprintf("open %s:\n", post_file);
441		if (fstat(post_fd, &st) < 0)
442			eprintf("stat %s:\n", post_file);
443		post_len = st.st_size;
444	}
445
446	while (!tls_sock) {
447		if (redirects > max_redirects)
448			eprintf("too many redirects\n");
449
450		curr_host = curr_port = curr_path = NULL;
451		parse_url(url, &curr_host, &curr_port, &curr_path, &is_tls);
452
453		host = estrdup(curr_host);
454		port = estrdup(curr_port);
455		path = estrdup(curr_path);
456
457		sock_fd = dial(host, port);
458		if (sock_fd < 0)
459			eprintf("failed to connect to %s:%s\n", host, port);
460
461		tls_sock = tls_connect(sock_fd, host, !no_check_certificate, is_tls);
462		if (!tls_sock) {
463			close(sock_fd);
464			eprintf("failed to establish TLS connection with %s\n", host);
465		}
466
467		/* send http request */
468		const char *method = spider ? "HEAD" : ((post_data || post_file) ? "POST" : "GET");
469		req_printf(tls_sock, "%s /%s HTTP/1.1\r\n", method, path);
470		req_printf(tls_sock, "Host: %s\r\n", host);
471		req_printf(tls_sock, "User-Agent: %s\r\n", user_agent);
472		req_printf(tls_sock, "Connection: close\r\n");
473
474		if (resume_offset > 0) {
475			req_printf(tls_sock, "Range: bytes=%lld-\r\n", (long long)resume_offset);
476		}
477
478		if (post_data || post_file) {
479			int has_ct = 0;
480			for (i = 0; i < custom_headers_num; i++) {
481				if (strncasecmp(custom_headers[i], "Content-Type:", 13) == 0) {
482					has_ct = 1;
483					break;
484				}
485			}
486			if (!has_ct) {
487				req_printf(tls_sock, "Content-Type: application/x-www-form-urlencoded\r\n");
488			}
489			req_printf(tls_sock, "Content-Length: %lld\r\n", post_len);
490		}
491
492		for (i = 0; i < custom_headers_num; i++) {
493			req_printf(tls_sock, "%s\r\n", custom_headers[i]);
494		}
495
496		req_printf(tls_sock, "\r\n");
497
498		if (post_data) {
499			tls_write(tls_sock, post_data, strlen(post_data));
500		} else if (post_file) {
501			char io_buf[8192];
502			ssize_t r;
503			while ((r = read(post_fd, io_buf, sizeof(io_buf))) > 0) {
504				if (tls_write(tls_sock, io_buf, r) < 0) {
505					eprintf("failed to write post data:\n");
506				}
507			}
508			close(post_fd);
509			post_fd = -1;
510		}
511
512		/* read headers */
513		total_read = 0;
514		header_end = NULL;
515		memset(s.buf, 0, sizeof(s.buf));
516		while (total_read < sizeof(s.buf) - 1) {
517			n = tls_read(tls_sock, s.buf + total_read, sizeof(s.buf) - 1 - total_read);
518			if (n <= 0) {
519				if (n < 0)
520					eprintf("read socket:\n");
521				else
522					eprintf("connection closed by server\n");
523			}
524			total_read += n;
525			s.buf[total_read] = '\0';
526			header_end = strstr(s.buf, "\r\n\r\n");
527			if (header_end)
528				break;
529		}
530
531		if (!header_end)
532			eprintf("http header too large or not found\n");
533
534		*header_end = '\0';
535		s.ts = tls_sock;
536		s.len = total_read;
537		s.idx = (header_end + 4) - s.buf;
538
539		if (Sflag) {
540			fprintf(stderr, "%s\n\n", s.buf);
541		}
542
543		if (strncasecmp(s.buf, "HTTP/1.1 ", 9) != 0 &&
544		    strncasecmp(s.buf, "HTTP/1.0 ", 9) != 0) {
545			eprintf("invalid http response: %s\n", s.buf);
546		}
547		status = atoi(s.buf + 9);
548
549		if (status >= 300 && status < 400) {
550			loc = find_header(s.buf, "Location:");
551			if (!loc)
552				eprintf("redirect response without location header\n");
553
554			if (strncasecmp(loc, "http://", 7) == 0 ||
555			    strncasecmp(loc, "https://", 8) == 0) {
556				new_url = estrdup(loc);
557			} else if (loc[0] == '/') {
558				new_url = emalloc(8 + strlen(host) + strlen(port) + strlen(loc) + 2);
559				sprintf(new_url, "%s://%s:%s%s", is_tls ? "https" : "http", host, port, loc);
560			} else {
561				last_slash = strrchr(path, '/');
562				dir_len = 0;
563				if (last_slash)
564					dir_len = last_slash - path + 1;
565				new_url = emalloc(8 + strlen(host) + strlen(port) + 1 + dir_len + strlen(loc) + 2);
566				sprintf(new_url, "%s://%s:%s/", is_tls ? "https" : "http", host, port);
567				if (dir_len > 0)
568					strncat(new_url, path, dir_len);
569				strcat(new_url, loc);
570			}
571
572			free(loc);
573			free(url);
574			url = new_url;
575			tls_close(tls_sock, 1);
576			tls_sock = NULL;
577			redirects++;
578		} else if (status == 206) {
579			out_mode = O_WRONLY | O_CREAT | O_APPEND;
580		} else if (status == 200) {
581			out_mode = O_WRONLY | O_CREAT | O_TRUNC;
582		} else if (status == 416) {
583			if (!qflag)
584				weprintf("file already fully retrieved or range invalid\n");
585			tls_close(tls_sock, 1);
586			free(url);
587			free(host);
588			free(port);
589			free(path);
590			return 0;
591		} else {
592			eprintf("server returned status: %d\n", status);
593		}
594
595		free(host);
596		free(port);
597		free(path);
598	}
599
600	if (spider) {
601		tls_close(tls_sock, 1);
602		free(url);
603		return 0;
604	}
605
606	cl_str = find_header(s.buf, "Content-Length:");
607	content_len = -1;
608	if (cl_str) {
609		content_len = strtoll(cl_str, NULL, 10);
610		free(cl_str);
611	}
612
613	te_str = find_header(s.buf, "Transfer-Encoding:");
614	chunked = 0;
615	if (te_str) {
616		if (strcasecmp(te_str, "chunked") == 0)
617			chunked = 1;
618		free(te_str);
619	}
620
621	if (strcmp(out_name, "-") != 0) {
622		out_fd = open(out_name, out_mode, 0644);
623		if (out_fd < 0)
624			eprintf("open %s:\n", out_name);
625	}
626
627	if (chunked)
628		read_chunked(&s, out_fd);
629	else
630		read_non_chunked(&s, out_fd, content_len);
631
632	tls_close(tls_sock, 1);
633	if (out_fd != 1)
634		close(out_fd);
635	if (Oflag != out_name && Pflag)
636		free(out_name);
637	free(url);
638
639	for (i = 0; i < custom_headers_num; i++) {
640		free(custom_headers[i]);
641	}
642	free(custom_headers);
643
644	return 0;
645}