From 27d338dec0428bc22e2838eb8641c6e0d1681e22 Mon Sep 17 00:00:00 2001 From: Jonas Smedegaard Date: Fri, 10 May 2013 20:46:04 +0200 Subject: Include mk* scripts --- mkhtm2html-default | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100755 mkhtm2html-default (limited to 'mkhtm2html-default') diff --git a/mkhtm2html-default b/mkhtm2html-default new file mode 100755 index 0000000..c2589a2 --- /dev/null +++ b/mkhtm2html-default @@ -0,0 +1,63 @@ +#!/usr/bin/perl + +# normalize HTML + +use Getopt::Long; +use File::Slurp; + +use strict; +use warnings; + +my $force; +GetOptions ("force|f"); + +my $stem = shift; +my $_ = read_file( $stem . '.htm' ); + +# whitespace +s/ / /mg; +s{]*>\s*

\s*}{}mg; + +# page header +s{]*>\s*\K(?:]*;top:1\d{3}px;[^>]*>(?:<[bi]>)?[^<]+(?:)?

\s*)+}{}mg; + +# footnote +s{]*>\h+

\s*(?:]*>\S+

\s*]*>((?:(?!\s*)+(?=)}{}mg; + +foreach my $class ( map /(?<=\.)(ft\d+)(?={font-size:7px;)/mg, $_ ) { + s{\(

\s*]*\sclass="$class">\d+

\s*]*>\)}{}mg; +}; + +# document headers +s{.*?\s*}{}msg; +s{\n\n.*?]*>\s*}{}msg; + +# unwrap similarly styled bolded paragraphs +s{]*class="([^"]+)"[^>]*>[^<]+\K

\s*]*class="\1"[^>]*>}{ }mg; + +# headline +s{]*>(TITLE \S+)

\s*]*>((?:(?!

}{

$1

\n

$2

}mg; +s{]*>(SECTION \S+)

\s*]*>((?:(?!

}{

$1

\n

$2

}mg; +s{]*>((?:Article|ANNEX) \S+)

\s*]*>((?:(?!

}{

$1

\n

$2

}mg; +s{]*>((?:Article|ANNEX) \S+)\s*

\s*]*>((?:(?!\s*

}{

$1

\n

$2

}mg; +s{]*>(\d+)\.

\s*}{

$1

\n}mg; +s{(]*>)(\d+)\. }{

$2

\n$1}mg; +s{]*>\(([a-z])\)

}{

$1

}mg; +s{(]*>)\(([ivx]+)\) }{
$2
\n$1}mg; +s{]*>\(([ivx]+)\)

}{
$1
}mg; + +# unwrap +s{(?<=[[:lower:]])-(?:
|

\s*]*>)(?=[[:lower:]])}{}mg; +#s{]*class="([^"]+)"[^>]*>[^<]+\K\s*

\s*]*class="\1"[^>]*>}{ }mg; +s{(?<=class="(ft\d{1})">)[^<]+?\K\s*

\s*]*class="\1"[^>]*>}{ }mg; +s{(?<=class="(ft\d{2})">)[^<]+?\K\s*

\s*]*class="\1"[^>]*>}{ }mg; +s{(?<=class="(ft\d{3})">)[^<]+?\K\s*

\s*]*class="\1"[^>]*>}{ }mg; +#s{]*class="([^"]+)"[^>]*>([^<]+?)\s*

\s*(?=]*class="\1"[^>]*>)}{$2 }mg; +s{\s*
\s*}{ }mg; + +# styling +s{]*>}{

}mg; + +write_file( $stem . '.html', $_ ); + +print "DONE: $0 stem $stem\n"; -- cgit v1.2.3