summaryrefslogtreecommitdiff
path: root/mkhtm2html-1
diff options
context:
space:
mode:
Diffstat (limited to 'mkhtm2html-1')
-rwxr-xr-xmkhtm2html-149
1 files changed, 49 insertions, 0 deletions
diff --git a/mkhtm2html-1 b/mkhtm2html-1
new file mode 100755
index 0000000..6f37a46
--- /dev/null
+++ b/mkhtm2html-1
@@ -0,0 +1,49 @@
+#!/usr/bin/perl
+
+# normalize HTML
+
+use Getopt::Long;
+use File::Slurp;
+
+use strict;
+use warnings;
+
+# Command line: [--force|-f] STEM
+# Reads STEM.htm into the topic variable; every substitution further down
+# edits that text in place.
+my $force;    # set by --force/-f; not consulted in the visible script
+GetOptions( 'force|f' => \$force ) or die "usage: $0 [--force] STEM\n";
+
+my $stem = shift;
+die "usage: $0 [--force] STEM\n" unless defined $stem;
+
+# Lexical $_ ("my $_") was removed in Perl 5.24 and no longer compiles, so
+# the global topic variable is localized instead.
+local $_ = read_file( $stem . '.htm' );
+
+# whitespace
+# NOTE(review): as displayed, pattern and replacement are both a single plain
+# space, which would make this substitution a no-op. Presumably the pattern
+# originally held a non-breaking space (or similar) that was lost in
+# rendering -- confirm against the original file.
+s/ / /mg;
+
+# page header
+# Delete every <P> element whose attributes contain ";top:62px;" or
+# ";top:63px;" and whose content has no nested tags, together with the
+# whitespace that follows it. Presumably that vertical position is where the
+# converter places the running page header -- verify against the input.
+s{<P\b[^>]*;top:6[23]px;[^>]*>[^<]*</P>\s*}{}mg;
+
+# footnote
+# The classes to act on are read from the style rules inside the document
+# itself: every ".ftN" selector directly followed by "{font-size:7px;" (first
+# pass) or "{font-size:6px;" (second pass). The brace is written as \{ so it
+# is an unambiguous literal on every Perl version; unescaped literal braces
+# in patterns have been deprecated.
+#
+# First pass: remove a "(" that ends one paragraph, the digits-only paragraph
+# carrying the class, and the opening tag plus ")" of the next paragraph, so
+# the text on either side ends up in a single paragraph. Presumably these are
+# in-text footnote references -- verify against the input.
+my @reference_classes = /(?<=\.)(ft\d+)(?=\{font-size:7px;)/mg;
+foreach my $class (@reference_classes) {
+    s{\(</P>\s*<P\b[^>]*\sclass="$class">\d+</P>\s*<P\b[^>]*>\)}{}mg;
+}
+
+# Second pass: remove a paragraph holding only "(", the digits-only paragraph
+# carrying the class, and the tag-free paragraph starting with ") ", plus any
+# trailing whitespace. Presumably these are the footnote bodies -- verify.
+# The class list is collected only now, after the first pass has run, which
+# keeps the original evaluation order.
+my @note_classes = /(?<=\.)(ft\d+)(?=\{font-size:6px;)/mg;
+foreach my $class (@note_classes) {
+    s{<P\b[^>]*>\(</P>\s*<P\b[^>]*\sclass="$class">\d+</P>\s*<P\b[^>]*>\) [^<]*</P>\s*}{}mg;
+}
+
+# unwrap similarly styled bolded paragraphs
+# When a bold, tag-free paragraph is followed by another bold paragraph with
+# the same class attribute value, the "</b></P> ... <P ...><b>" seam between
+# them is replaced by one space. \K keeps everything matched before the seam
+# (opening tag and text of the first paragraph) out of the replacement.
+# NOTE(review): the opening tag of the second paragraph is consumed by the
+# match, so a run of three or more such paragraphs appears to be merged only
+# pairwise in a single pass -- verify whether that matters for the input.
+s{<P\b[^>]*class="([^"]+)"[^>]*><b>[^<]+\K</b></P>\s*<P\b[^>]*class="\1"[^>]*><b>}{ }mg;
+
+# headline
+# The first three rules turn a label paragraph plus the paragraph that
+# follows it into "<H1>label</H1>\n<H2>text</H2>". The text part cannot
+# contain a newline (no /s flag) and cannot extend past a "</P".
+#   - plain "TITLE x" followed by a bold paragraph
+s{<P\b[^>]*>(TITLE \S+)</P>\s*<P\b[^>]*><b>((?:(?!</P\b).)+.)</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
+#   - italic "SECTION x" followed by a bold-italic paragraph
+s{<P\b[^>]*><i>(SECTION \S+)</i></P>\s*<P\b[^>]*><i><b>((?:(?!</P\b).)+.)</b></i></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
+#   - italic "Article x" or "ANNEX x" followed by a bold paragraph
+s{<P\b[^>]*><i>((?:Article|ANNEX) \S+)</i></P>\s*<P\b[^>]*><b>((?:(?!</P\b).)+.)</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
+# A paragraph opening with "N. " (digits, dot, space): the number moves into
+# an <H3> placed before the paragraph; the opening tag is kept and the
+# "N. " prefix is dropped from the paragraph text.
+s{(<P\b[^>]*>)(\d+)\. }{<H3>$2</H3>\n$1}mg;
+# A paragraph consisting solely of "(x)" with one lowercase letter becomes
+# <H4>x</H4>.
+s{<P\b[^>]*>\(([a-z])\)</P>}{<H4>$1</H4>}mg;
+# A paragraph opening with "(ivx...) " (letters i, v, x only, then a space):
+# the numeral moves into an <H5> placed before the paragraph.
+s{(<P\b[^>]*>)\(([ivx]+)\) }{<H5>$2</H5>\n$1}mg;
+# A paragraph consisting solely of such a numeral becomes <H5>.
+# NOTE(review): stand-alone "(i)", "(v)" and "(x)" have already been turned
+# into <H4> by the single-letter rule above, so this rule only ever sees
+# numerals of two or more letters -- confirm that this is intended.
+s{<P\b[^>]*>\(([ivx]+)\)</P>}{<H5>$1</H5>}mg;
+
+# unwrap
+# Rejoin hyphenated words: a "-" that follows a non-space character is
+# removed together with the "<br/>" or paragraph break ("</P> ... <P ...>")
+# after it, but only when the next character is a lowercase letter.
+s{(?<=\S)-(<br/>|</P>\s*<P\b[^>]*>)(?=[[:lower:]])}{}mg;
+# Every remaining "<br/>", with any whitespace around it, becomes one space.
+s{\s*<br/>\s*}{ }mg;
+
+# Save the transformed markup next to the input as STEM.html, then report
+# completion on standard output.
+my $html_path = "$stem.html";
+write_file( $html_path, $_ );
+
+print "DONE: $0 stem $stem\n";