commit 653ae91

shrub  ·  2026-05-17 21:50:22 +0000 UTC
parent f22f742
use cleaner awk
2 files changed,  +102, -6
R README => README.md
+0, -0
+102, -6
  1@@ -26,12 +26,108 @@ raw)
  2 body)
  3 	in="$1"
  4 	out="$2"
  5-	awk 'BEGIN{inb=0;skip=0} \
  6-		/<[Bb][Oo][Dd][Yy][^>]*>/{inb=1; sub(/^.*<[Bb][Oo][Dd][Yy][^>]*>/,""); if(length($0))print; next} \
  7-		/<\/[Bb][Oo][Dd][Yy]>/ {sub(/<\/[Bb][Oo][Dd][Yy]>.*/,""); if(inb&&length($0))print; inb=0; next} \
  8-		{ if(!inb)next; if($0 ~ /<table class="head">/){skip=1;next} if($0 ~ /<table class="foot">/){skip=1;next} if(skip && $0 ~ /<\/table>/){skip=0;next} if(!skip)print }' "$in" \
  9-	| perl -0777 -pe 's{<div class="Bd[^"]*\bLi\b[^"]*">\s*<pre>(.*?)</pre>\s*</div>}{my $r=$1;my @l=grep{$_!~/^\s*$/}split/\n/,$r;my $ok=@l&&!grep{$_!~/^\s*&lt;.*&gt;\s*$/}@l;if($ok){$r=~s/&lt;/</g;$r=~s/&gt;/>/g;$r=~s/&quot;/"/g;$r}else{qq{<div class="Bd Li"><pre>$1</pre></div>}}}gse' \
 10-	> "$out"
 11+	awk '
  12+		function lower(s) { return tolower(s) }  # Lowercased copy of s; the main rule matches tags against it.
 13+		function decode_html_entities(s) {
 14+			gsub(/&lt;/, "<", s)
 15+			gsub(/&gt;/, ">", s)
 16+			gsub(/&quot;/, "\"", s)
 17+			gsub(/&amp;/, "&", s)
 18+			return s
 19+		}
  20+		function resetblock() {  # Leave literal-display state and empty both line buffers.
  21+			in_literal_div = 0
  22+			in_pre = 0
  23+			lit_n = 0  # Only the counts are zeroed; lit[] and pre[] are read up to the count only.
  24+			pre_n = 0
  25+		}
  26+		function push_litline(s) { lit[++lit_n] = s }  # Append s to the raw-line buffer lit[1..lit_n].
  27+		function push_preline(s) { pre[++pre_n] = s }  # Append s to the <pre> text buffer pre[1..pre_n].
 28+		function flushblock(    i, line, non_empty, all_htmlish) {
 29+			non_empty = 0
 30+			all_htmlish = 1
 31+			for (i = 1; i <= pre_n; i++) {
 32+				line = pre[i]
 33+				if (line ~ /^[[:space:]]*$/) continue
 34+				non_empty++
 35+				if (line !~ /^[[:space:]]*&lt;.*&gt;[[:space:]]*$/) all_htmlish = 0
 36+			}
 37+			if (non_empty > 0 && all_htmlish) {
 38+				for (i = 1; i <= pre_n; i++) print decode_html_entities(pre[i])
 39+			} else {
 40+				for (i = 1; i <= lit_n; i++) print lit[i]
 41+			}
 42+			resetblock()
 43+		}
  44+		BEGIN {  # Start outside <body>, with no table being skipped and empty block buffers.
  45+			in_body = 0
  46+			skip_table = 0
  47+			resetblock()
  48+		}
 49+		{
 50+			line = $0
 51+			lc = lower(line)
 52+
 53+			if (!in_body) {
 54+				if (lc ~ /<body[^>]*>/) {
 55+					in_body = 1
 56+					sub(/^.*<body[^>]*>/, "", line)
 57+					if (length(line)) $0 = line
 58+					else next
 59+				} else next
 60+			}
 61+
 62+			if (in_body && lc ~ /<\/body>/) {
 63+				sub(/<\/body>.*/, "", line)
 64+				if (in_literal_div) {
 65+					if (length(line)) push_litline(line)
 66+					flushblock()
 67+				} else if (length(line)) print line
 68+				in_body = 0
 69+				next
 70+			}
 71+
 72+			if (in_literal_div) {
 73+				push_litline(line)
 74+				if (!in_pre && lc ~ /<pre>/) {
 75+					in_pre = 1
 76+					line_after_pre = line
 77+					sub(/^.*<pre>/, "", line_after_pre)
 78+					if (length(line_after_pre)) push_preline(line_after_pre)
 79+					next
 80+				}
 81+				if (in_pre && lc ~ /<\/pre>/) {
 82+					line_before_pre_end = line
 83+					sub(/<\/pre>.*/, "", line_before_pre_end)
 84+					if (length(line_before_pre_end)) push_preline(line_before_pre_end)
 85+					in_pre = 0
 86+				} else if (in_pre) push_preline(line)
 87+				if (lc ~ /<\/div>/) flushblock()
 88+				next
 89+			}
 90+
 91+			if (lc ~ /<table class="head">/ || lc ~ /<table class="foot">/) {
 92+				skip_table = 1
 93+				next
 94+			}
 95+			if (skip_table) {
 96+				if (lc ~ /<\/table>/) skip_table = 0
 97+				next
 98+			}
 99+
100+			if (lc ~ /<div class="bd[^"]*li[^"]*">/) {
101+				in_literal_div = 1
102+				push_litline(line)
103+				if (lc ~ /<pre>/) in_pre = 1
104+				next
105+			}
106+
107+			print line
108+		}
 109+		END {  # Input ended inside a literal display: emit what was buffered.
 110+			if (in_literal_div) flushblock()
 111+		}
112+	' "$in" > "$out"
113 	;;
114 sidebar)
115 	out="$1"