commit 653ae91
shrub
·
2026-05-17 21:50:22 +0000 UTC
parent f22f742
use cleaner awk
2 files changed,
+102,
-6
R README =>
README.md
+0,
-0
+102,
-6
1@@ -26,12 +26,108 @@ raw)
2 body)
3 in="$1"
4 out="$2"
5- awk 'BEGIN{inb=0;skip=0} \
6- /<[Bb][Oo][Dd][Yy][^>]*>/{inb=1; sub(/^.*<[Bb][Oo][Dd][Yy][^>]*>/,""); if(length($0))print; next} \
7- /<\/[Bb][Oo][Dd][Yy]>/ {sub(/<\/[Bb][Oo][Dd][Yy]>.*/,""); if(inb&&length($0))print; inb=0; next} \
8- { if(!inb)next; if($0 ~ /<table class="head">/){skip=1;next} if($0 ~ /<table class="foot">/){skip=1;next} if(skip && $0 ~ /<\/table>/){skip=0;next} if(!skip)print }' "$in" \
9- | perl -0777 -pe 's{<div class="Bd[^"]*\bLi\b[^"]*">\s*<pre>(.*?)</pre>\s*</div>}{my $r=$1;my @l=grep{$_!~/^\s*$/}split/\n/,$r;my $ok=@l&&!grep{$_!~/^\s*<.*>\s*$/}@l;if($ok){$r=~s/&lt;/</g;$r=~s/&gt;/>/g;$r=~s/&quot;/"/g;$r}else{qq{<div class="Bd Li"><pre>$1</pre></div>}}}gse' \
10- > "$out"
11+ awk '
12+ function lower(s) { return tolower(s) }
13+ function decode_html_entities(s) {
14+ gsub(/&lt;/, "<", s)
15+ gsub(/&gt;/, ">", s)
16+ gsub(/&quot;/, "\"", s)
17+ gsub(/&amp;/, "&", s)
18+ return s
19+ }
20+ function resetblock() {
21+ in_literal_div = 0
22+ in_pre = 0
23+ lit_n = 0
24+ pre_n = 0
25+ }
26+ function push_litline(s) { lit[++lit_n] = s }
27+ function push_preline(s) { pre[++pre_n] = s }
28+ function flushblock( i, line, non_empty, all_htmlish) {
29+ non_empty = 0
30+ all_htmlish = 1
31+ for (i = 1; i <= pre_n; i++) {
32+ line = pre[i]
33+ if (line ~ /^[[:space:]]*$/) continue
34+ non_empty++
35+ if (line !~ /^[[:space:]]*<.*>[[:space:]]*$/) all_htmlish = 0
36+ }
37+ if (non_empty > 0 && all_htmlish) {
38+ for (i = 1; i <= pre_n; i++) print decode_html_entities(pre[i])
39+ } else {
40+ for (i = 1; i <= lit_n; i++) print lit[i]
41+ }
42+ resetblock()
43+ }
44+ BEGIN {
45+ in_body = 0
46+ skip_table = 0
47+ resetblock()
48+ }
49+ {
50+ line = $0
51+ lc = lower(line)
52+
53+ if (!in_body) {
54+ if (lc ~ /<body[^>]*>/) {
55+ in_body = 1
56+ sub(/^.*<body[^>]*>/, "", line)
57+ if (length(line)) $0 = line
58+ else next
59+ } else next
60+ }
61+
62+ if (in_body && lc ~ /<\/body>/) {
63+ sub(/<\/body>.*/, "", line)
64+ if (in_literal_div) {
65+ if (length(line)) push_litline(line)
66+ flushblock()
67+ } else if (length(line)) print line
68+ in_body = 0
69+ next
70+ }
71+
72+ if (in_literal_div) {
73+ push_litline(line)
74+ if (!in_pre && lc ~ /<pre>/) {
75+ in_pre = 1
76+ line_after_pre = line
77+ sub(/^.*<pre>/, "", line_after_pre)
78+ if (length(line_after_pre)) push_preline(line_after_pre)
79+ next
80+ }
81+ if (in_pre && lc ~ /<\/pre>/) {
82+ line_before_pre_end = line
83+ sub(/<\/pre>.*/, "", line_before_pre_end)
84+ if (length(line_before_pre_end)) push_preline(line_before_pre_end)
85+ in_pre = 0
86+ } else if (in_pre) push_preline(line)
87+ if (lc ~ /<\/div>/) flushblock()
88+ next
89+ }
90+
91+ if (lc ~ /<table class="head">/ || lc ~ /<table class="foot">/) {
92+ skip_table = 1
93+ next
94+ }
95+ if (skip_table) {
96+ if (lc ~ /<\/table>/) skip_table = 0
97+ next
98+ }
99+
100+ if (lc ~ /<div class="bd[^"]*li[^"]*">/) {
101+ in_literal_div = 1
102+ push_litline(line)
103+ if (lc ~ /<pre>/) in_pre = 1
104+ next
105+ }
106+
107+ print line
108+ }
109+ END {
110+ if (in_literal_div) flushblock()
111+ }
112+ ' "$in" > "$out"
113 ;;
114 sidebar)
115 out="$1"