summary refs log tree commit diff
path: root/report/normalize-html.pl
diff options
context:
space:
mode:
Diffstat (limited to 'report/normalize-html.pl')
-rwxr-xr-x report/normalize-html.pl 29
1 files changed, 29 insertions, 0 deletions
diff --git a/report/normalize-html.pl b/report/normalize-html.pl
new file mode 100755
index 0000000..7245a4e
--- /dev/null
+++ b/report/normalize-html.pl
@@ -0,0 +1,29 @@
+#!/usr/bin/perl
+# Usage: normalize-html.pl INFILE [OUTFILE] -- tidy generated HTML; prints to STDOUT without OUTFILE.
+use strict;
+use warnings;
+use File::Slurp;
+
+die "usage: $0 INFILE [OUTFILE]\n" unless @ARGV;
+$_ = read_file( $ARGV[0] ); # global $_ on purpose: lexical "my $_" was removed in perl 5.24
+
+s!( )+! !g; # filler horizontal space; NOTE(review): the char in ( ) may originally be a NBSP - confirm
+s!(\s*<br/>\s*)+!<br/>!g; # filler vertical space: collapse runs of <br/> into one
+s,<br/>(?=(?:</b>)?</p>),,ig; # trailing <br/> right before the end of a paragraph
+s,<p>\s*</p>,,ig; # empty paragraphs
+s!\s*<b>\s*</b>\s*!!g; # empty bold runs, with the whitespace around them
+s!<a name="?\d+"?></a>!!gi; # page dividers
+s!\.(ft\d+)\{font\-size:52px;.*?\K<p\b[^>]*\bclass="\1"[^>]*><b>(.*?)</b></p>!<h1>$2</h1>!is; # first bold paragraph in a 52px class -> H1
+s!\.(ft\d+)\{font\-size:19px;.*?\K<p\b[^>]*\bclass="\1"[^>]*><b>(.*?)</b></p>!<h2 class="front">$2</h2>!isg; # 19px class -> front H2
+s!\.(ft\d+)\{font\-size:16px;.*?\K<p\b[^>]*\bclass="\1"[^>]*><b>(.*?)</b></p>!<h3 class="front">$2</h3>!isg; # 16px class -> front H3
+s!<p\b[^>]*><b>([^<\s][^<]*?)</b></p>!<h2>$1</h2>!isg; # remaining all-bold paragraphs -> H2
+s,<p\b[^>]*>(?!<b>),<p>,isg; # strip attributes from paragraphs that do not open with <b>
+s,<style.*?</style>,,isg; # style blocks; must run after the font-size rules above, which read them
+s,<!-- .*? -->,,isg; # HTML comments
+s,<div id="page[^>]*>,,isg; # opening tags of the per-page wrapper divs
+s,</div>,,isg; # every closing div tag
+
+$ARGV[1] ? write_file( $ARGV[1], $_ ) : print;
+
+# Possible follow-up step, not run by this script:
+#pandoc --normalize --no-wrap --parse-raw -f html -t markdown -o "$outfile"