summaryrefslogtreecommitdiff
path: root/mkhtm2html-2
diff options
context:
space:
mode:
authorJonas Smedegaard <dr@jones.dk>2013-05-10 20:46:04 +0200
committerJonas Smedegaard <dr@jones.dk>2013-05-10 20:46:04 +0200
commit27d338dec0428bc22e2838eb8641c6e0d1681e22 (patch)
tree18eeb76e189ce03838dd9b23194f1c707b1eabbb /mkhtm2html-2
Include mk* scripts
Diffstat (limited to 'mkhtm2html-2')
-rwxr-xr-xmkhtm2html-239
1 files changed, 39 insertions, 0 deletions
diff --git a/mkhtm2html-2 b/mkhtm2html-2
new file mode 100755
index 0000000..ae18391
--- /dev/null
+++ b/mkhtm2html-2
@@ -0,0 +1,39 @@
+#!/usr/bin/perl
+
+# normalize HTML
+
+use Getopt::Long;
+use File::Slurp;
+
+use strict;
+use warnings;
+
+my $force;    # set by --force/-f; accepted on the command line but not consulted by this script
+GetOptions( 'force|f' => \$force ) or die "usage: $0 [--force|-f] STEM\n";
+
+defined( my $stem = shift @ARGV ) or die "usage: $0 [--force|-f] STEM\n";    # STEM names STEM.htm
+local $_ = read_file( $stem . '.htm' );    # slurp into package $_ ("my $_" no longer compiles on perl >= 5.24)
+
+# whitespace: squeeze each run of &#160; entities and/or horizontal whitespace into a single space
+s/(?:&#160;|\h)+/ /mg;
+
+# preamble: drop everything from the start through the last ">HAVE ADOPTED THIS REGULATION:</P>" and trailing whitespace
+s{.*>HAVE ADOPTED THIS REGULATION:</P>\s*}{}s;
+
+# page header: delete <P> elements whose attributes contain ;top:1172px; or ;top:1187px; (content may not span a newline)
+s{<P\b[^>]*;top:(?:1172|1187)px;[^>]*>(?:(?!</P\b).)+.</P>\s*}{}mg;
+
+# headline: rewrite lead-in and SECTION/Article paragraphs as <H1>/<H2>/<H3>; these rules match on <br/>, so they run before unwrap
+s{<P\b[^>]*>(?:In Title \S+, the following Section \S+ is inserted:</P>\s*<P\b[^>]*>)?\'?(SECTION \S+)<br/><b>((?:(?!</P\b).)+.)</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;    # [lead-in +] SECTION x<br/><b>title</b> -> H1 + H2
+s{<P\b[^>]*><i>\'?(Article \S+)</i></P>}{<H1>$1</H1>}mg;    # italic "Article x" paragraph -> H1
+s{<P\b[^>]*>(?:Article \S+ is replaced by the following:</P>\s*<P\b[^>]*>)?\'?(Article \S+)<br/><b>((?:(?!</P\b).)+.)</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;    # [lead-in +] Article x<br/><b>title</b> -> H1 + H2
+s{<P\b[^>]*>(Article \S+) is amended as follows:</P>}{<H1>$1</H1>}mg;    # "Article x is amended as follows:" -> H1
+s{<P\b[^>]*>(?:paragraph \S+ is replaced by the following:</P>\s*)(<P\b[^>]*>)\'?(\d+)\. }{<H3>$2</H3>\n$1}mg;    # lead-in dropped; the leading "N. " of the next <P> becomes <H3>N</H3>
+s{<P\b[^>]*>In (Article \S+), paragraph (\S+) is replaced by the following:</P>\s*(<P\b[^>]*>)\'?(\2)\. }{<H1>$1</H1>\n<H3>$2</H3>\n$3}mg;    # same with an Article lead-in: H1 + H3; \2 demands the next <P> start with that same number
+
+# unwrap: replace each remaining <br/>, together with the whitespace around it, by one space
+s{\s*<br/>\s*}{ }mg;
+
+my $html_path = $stem . '.html';    # result file: the stem plus an .html suffix
+write_file( $html_path, $_ );       # store the normalized markup held in $_
+print "DONE: $0 stem $stem\n";      # completion notice on standard output