#!/usr/bin/perl # normalize HTML # # TODO: settle on a specific order of header levels # TODO: handle more subheaders use Getopt::Long; use File::Slurp; use strict; use warnings; my $force; GetOptions ("force|f"); my $stem = shift; my $_ = read_file( $stem . '.htm' ); # whitespace s/ / /mg; s{]*>\s*

\s*}{}mg; # page header s{]*>\s*\K(?:]*;top:1\d{3}px;[^>]*>(?:<[bi]>)?[^<]+(?:)?

\s*)+}{}mg; # footnote s{]*>\h+

\s*(?:]*>\S+

\s*]*>((?:(?!\s*)+(?=)}{}mg; foreach my $class ( map /(?<=\.)(ft\d+)(?={font-size:7px;)/mg, $_ ) { s{\(

\s*]*\sclass="$class">\d+

\s*]*>\)}{}mg; }; # drop document headers s{.*?\s*}{}msg; s{\n\n.*?]*>\s*}{}msg; # unwrap similarly styled bolded paragraphs s{]*class="([^"]+)"[^>]*>[^<]+\K

\s*]*class="\1"[^>]*>}{ }mg; # headline s{]*>(TITLE \S+)

\s*]*>((?:(?!

}{

$1

\n

$2

}mg; s{]*>(SECTION \S+)

\s*]*>((?:(?!

}{

$1

\n

$2

}mg; s{]*>((?:Article|ANNEX) \S+)

\s*]*>((?:(?!

}{

$1

\n

$2

}mg; s{]*>((?:Article|ANNEX) \S+)\s*

\s*]*>((?:(?!\s*

}{

$1

\n

$2

}mg; s{]*>(\d+)\.

\s*}{

$1

\n}mg; s{(]*>)(\d+)\. }{

$2

\n$1}mg; s{]*>\(([a-z])\)

}{

$1

}mg; s{(]*>)\(([ivx]+)\) }{
$2
\n$1}mg; s{]*>\(([ivx]+)\)

}{
$1
}mg; # unwrap s{(?<=[[:lower:]])-(?:
|

\s*]*>)(?=[[:lower:]])}{}mg; #s{]*class="([^"]+)"[^>]*>[^<]+\K\s*

\s*]*class="\1"[^>]*>}{ }mg; s{(?<=class="(ft\d{1})">)[^<]+?\K\s*

\s*]*class="\1"[^>]*>}{ }mg; s{(?<=class="(ft\d{2})">)[^<]+?\K\s*

\s*]*class="\1"[^>]*>}{ }mg; s{(?<=class="(ft\d{3})">)[^<]+?\K\s*

\s*]*class="\1"[^>]*>}{ }mg; #s{]*class="([^"]+)"[^>]*>([^<]+?)\s*

\s*(?=]*class="\1"[^>]*>)}{$2 }mg; s{\s*
\s*}{ }mg; # drop styling s{]*>}{

}mg; write_file( $stem . '.html', $_ ); print "DONE: $0 stem $stem\n";