From 27d338dec0428bc22e2838eb8641c6e0d1681e22 Mon Sep 17 00:00:00 2001
From: Jonas Smedegaard
Date: Fri, 10 May 2013 20:46:04 +0200
Subject: Include mk* scripts
---
mkhtm2html-1 | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 49 insertions(+)
create mode 100755 mkhtm2html-1
(limited to 'mkhtm2html-1')
diff --git a/mkhtm2html-1 b/mkhtm2html-1
new file mode 100755
index 0000000..6f37a46
--- /dev/null
+++ b/mkhtm2html-1
@@ -0,0 +1,49 @@
+#!/usr/bin/perl
+
+# normalize HTML
+
+use Getopt::Long;
+use File::Slurp;
+
+use strict;
+use warnings;
+
+# Command line: [--force|-f] STEM
+# $force is bound to the flag so that it is actually populated; nothing in the
+# code visible here consults it yet.
+my $force;
+GetOptions( 'force|f' => \$force )
+    or die "Usage: $0 [--force] STEM\n";
+
+# STEM is the input path without its ".htm" extension.
+my $stem = shift;
+die "Usage: $0 [--force] STEM\n"
+    unless defined $stem and length $stem;
+
+# Slurp the document into $_ so the bare s{}{} statements that follow operate
+# on it.  "my $_" was removed in Perl 5.24, hence the localised global.
+local $_ = read_file( $stem . '.htm' );
+
+# whitespace: turn non-breaking spaces into plain spaces, since later patterns
+# match literal spaces.  The blank is spelled out explicitly (as entities and
+# as its UTF-8 byte pair) because an invisible literal in the pattern cannot
+# be told apart from an ordinary space.
+# NOTE(review): assumes the input is UTF-8 read as raw bytes -- confirm.
+s/&(?:nbsp|\#160);|\xC2\xA0/ /g;
+
+# page header: delete every match carrying ;top:62px; or ;top:63px; -- NOTE(review): tags in this pattern appear stripped by the patch viewer; confirm against the real script
+s{]*;top:6[23]px;[^>]*>[^<]*
\s*}{}mg;
+
+# footnote: delete parenthesised numbers whose element carries a small-font class -- NOTE(review): tags in these patterns appear stripped by the patch viewer
+foreach my $class ( map /(?<=\.)(ft\d+)(?={font-size:7px;)/mg, $_ ) {    # each ftN name directly followed by "{font-size:7px;"
+ s{\(
\s*]*\sclass="$class">\d+
\s*]*>\)}{}mg;
+};
+foreach my $class ( map /(?<=\.)(ft\d+)(?={font-size:6px;)/mg, $_ ) {    # same for 6px; here the text after "(N) " is removed as well
+ s{
]*>\(
\s*]*\sclass="$class">\d+
\s*]*>\) [^<]*
\s*}{}mg;
+};
+
+# unwrap similarly styled bolded paragraphs: a boundary between neighbours sharing one class (\1) becomes a single space; \K keeps what precedes it -- NOTE(review): tags look stripped
+s{]*class="([^"]+)"[^>]*>[^<]+\K
\s*]*class="\1"[^>]*>}{ }mg;
+
+# headline: rewrite heading and list-label elements -- NOTE(review): tags in the patterns and replacements appear stripped by the patch viewer; confirm against the real script
+s{
]*>(TITLE \S+)
\s*]*>((?:(?!
}{$1
\n$2
}mg;    # "TITLE <token>" is $1; $2 comes from what follows it
+s{]*>(SECTION \S+)
\s*]*>((?:(?!
}{$1
\n$2
}mg;    # "SECTION <token>" is $1; $2 comes from what follows it
+s{]*>((?:Article|ANNEX) \S+)
\s*]*>((?:(?!
}{$1
\n$2
}mg;    # "Article <token>" or "ANNEX <token>" is $1
+s{(]*>)(\d+)\. }{
$2
\n$1}mg;    # leading "N. ": the number ($2) is emitted ahead of $1
+s{]*>\(([a-z])\)
}{$1
}mg;    # "(a)": a single lower-case letter label
+s{(]*>)\(([ivx]+)\) }{
$2
\n$1}mg;    # leading "(iv) ": the numeral ($2) is emitted ahead of $1
+s{]*>\(([ivx]+)\)
}{$1
}mg;    # "(iv)" alone; NOTE(review): (i), (v), (x) may already be consumed by the [a-z] rule above -- confirm
+
+# unwrap: drop a hyphen (and the break after it) between a non-blank and a lower-case letter, then turn remaining breaks into one space -- NOTE(review): tags look stripped
+s{(?<=\S)-(
|\s*]*>)(?=[[:lower:]])}{}mg;
+s{\s*
\s*}{ }mg;
+
+# Write the transformed text to STEM.html.
+my $html_path = $stem . '.html';
+write_file( $html_path, $_ );
+
+# Report completion, naming this script and the stem it processed.
+print "DONE: $0 stem $stem\n";
--
cgit v1.2.3