diff options
author | Jonas Smedegaard <dr@jones.dk> | 2013-05-10 20:46:04 +0200 |
---|---|---|
committer | Jonas Smedegaard <dr@jones.dk> | 2013-05-10 20:46:04 +0200 |
commit | 27d338dec0428bc22e2838eb8641c6e0d1681e22 (patch) | |
tree | 18eeb76e189ce03838dd9b23194f1c707b1eabbb /mkhtm2html-default |
Include mk* scripts
Diffstat (limited to 'mkhtm2html-default')
-rwxr-xr-x | mkhtm2html-default | 63 |
1 files changed, 63 insertions, 0 deletions
#!/usr/bin/perl

# mkhtm2html-default - normalize HTML
#
# Usage: mkhtm2html-default [--force|-f] STEM
#
# Reads STEM.htm, runs it through normalize_html() and writes the result to
# STEM.html.  The input markup (class="ftN", ";top:NNNNpx;" styles) is
# presumably produced by a PDF-to-HTML converter -- TODO confirm.

use Getopt::Long;
use File::Slurp;

use strict;
use warnings;

# Run the conversion only when executed as a script, so that the file can
# also be loaded (require/do) to call normalize_html() directly.
main() unless caller;

# Command-line entry point: parse options, convert STEM.htm into STEM.html.
# Dies with a usage message when STEM is missing; read_file()/write_file()
# report their own I/O failures.
sub main {

    # --force/-f is accepted for command-line compatibility; nothing below
    # consults it.  It is bound to a lexical instead of leaking into the
    # package global $opt_force.
    my $force;
    GetOptions( 'force|f' => \$force );

    my $stem = shift @ARGV;
    defined $stem && length $stem
        or die "Usage: $0 [--force|-f] STEM\n";

    my $html = read_file( $stem . '.htm' );
    write_file( $stem . '.html', normalize_html($html) );

    print "DONE: $0 stem $stem\n";
    return;
}

# Apply the ordered chain of regex clean-ups to an HTML string and return the
# normalized copy.  The order matters: the small-font class scan needs the
# CSS that the later "document headers" step strips out.
#
#   $html - document text as read from STEM.htm
#
# Returns the rewritten document text.
sub normalize_html {
    my ($html) = @_;

    # Topicalize: every bare s/// below operates on $html through $_.
    # (The original used "my $_", which was removed in Perl 5.24.)
    for ($html) {

        # whitespace
        # NOTE(review): as transcribed this replaces a space with a space;
        # the first character may originally have been a non-breaking space
        # or an entity -- confirm against the repository copy.
        s/ / /mg;
        s{<P\b[^>]*>\s*</P>\s*}{}mg;    # drop empty paragraphs

        # page header: paragraphs positioned at top:1NNNpx directly after a
        # <DIV ...> opener (the opener itself is kept via \K)
        s{<DIV\b[^>]*>\s*\K(?:<P\b[^>]*;top:1\d{3}px;[^>]*>(?:<[bi]>)?[^<]+(?:</[bi]>)?</P>\s*)+}{}mg;

        # footnote: a blank-looking paragraph followed by one or more
        # (bold marker paragraph, text paragraph) pairs just before </DIV>
        s{<P\b[^>]*>\h+</P>\s*(?:<P\b[^>]*><b>\S+</b></P>\s*<P\b[^>]*>((?:(?!</P\b).)+.)</P>\s*)+(?=</DIV>)}{}mg;

        # For every CSS class declared as ".ftN{font-size:7px;", delete
        # "(" ... ")" groups wrapping a digits-only paragraph of that class
        # (presumably footnote reference marks -- TODO confirm).  The brace
        # is escaped so the pattern does not trip the unescaped-"{" regex
        # deprecation; it matches the same text.
        my @tiny_classes = /(?<=\.)(ft\d+)(?=\{font-size:7px;)/mg;
        foreach my $class (@tiny_classes) {
            s{\(</P>\s*<P\s[^>]*\sclass="$class">\d+</P>\s*<P\s[^>]*>\)}{}mg;
        }

        # document headers: the <HEAD> section, and everything from a
        # closing </DIV></BODY></HTML> up to and including the next <DIV ...>
        s{<HEAD>.*?</HEAD>\s*}{}msg;
        s{</DIV>\n</BODY>\n</HTML>.*?<DIV\b[^>]*>\s*}{}msg;

        # unwrap similarly styled bolded paragraphs
        s{<P\b[^>]*class="([^"]+)"[^>]*><b>[^<]+\K</b></P>\s*<P\b[^>]*class="\1"[^>]*><b>}{ }mg;

        # headline: a label paragraph plus the following title paragraph
        # become <H1>/<H2>; numbered and lettered markers become <H3>..<H5>
        s{<P\b[^>]*>(TITLE \S+)</P>\s*<P\b[^>]*><b>((?:(?!</P\b).)+.)</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
        s{<P\b[^>]*><i>(SECTION \S+)</i></P>\s*<P\b[^>]*><i><b>((?:(?!</P\b).)+.)</b></i></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
        s{<P\b[^>]*><i>((?:Article|ANNEX) \S+)</i></P>\s*<P\b[^>]*><b>((?:(?!</P\b).)+.)\s*</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
        s{<P\b[^>]*><i>((?:Article|ANNEX) \S+)\s*</i></P>\s*<P\b[^>]*><i><b>((?:(?!</P\b).)+?.)\s*</b>\s*</i></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
        s{<P\b[^>]*>(\d+)\.</P>\s*}{<H3>$1</H3>\n}mg;

        # NOTE(review): the transcribed pattern contains a line break after
        # "\. ", written here as \n; the original character could not be
        # determined from this rendering -- confirm against the repository.
        s{(<P\b[^>]*>)(\d+)\. \n}{<H3>$2</H3>\n$1}mg;
        s{<P\b[^>]*>\(([a-z])\)</P>}{<H4>$1</H4>}mg;
        s{(<P\b[^>]*>)\(([ivx]+)\) }{<H5>$2</H5>\n$1}mg;
        s{<P\b[^>]*>\(([ivx]+)\)</P>}{<H5>$1</H5>}mg;

        # unwrap: re-join words hyphenated across a <br/> or paragraph
        # break, then merge consecutive paragraphs sharing an ftN class
        # (one pass each for 1-, 2- and 3-digit class numbers, since the
        # look-behind must be fixed-width)
        s{(?<=[[:lower:]])-(?:<br/>|</P>\s*<P\b[^>]*>)(?=[[:lower:]])}{}mg;
        #s{<P\b[^>]*class="([^"]+)"[^>]*>[^<]+\K\s*</P>\s*<P\b[^>]*class="\1"[^>]*>}{ }mg;
        s{(?<=class="(ft\d{1})">)[^<]+?\K\s*</P>\s*<P\b[^>]*class="\1"[^>]*>}{ }mg;
        s{(?<=class="(ft\d{2})">)[^<]+?\K\s*</P>\s*<P\b[^>]*class="\1"[^>]*>}{ }mg;
        s{(?<=class="(ft\d{3})">)[^<]+?\K\s*</P>\s*<P\b[^>]*class="\1"[^>]*>}{ }mg;
        #s{<P\b[^>]*class="([^"]+)"[^>]*>([^<]+?)\s*</P>\s*(?=<P\b[^>]*class="\1"[^>]*>)}{$2 }mg;
        s{\s*<br/>\s*}{ }mg;

        # styling: strip all attributes from the remaining <P ...> tags
        s{<P\b[^>]*>}{<P>}mg;
    }

    return $html;
}

1;