#!/usr/bin/perl # normalize HTML # # TODO: strip comments use Getopt::Long; use File::Slurp; use strict; use warnings; my $force; GetOptions ("force|f"); my $stem = shift; my $_ = read_file( $stem . '.htm' ); # whitespace s/ / /mg; # page header s{
]*;top:6[23]px;[^>]*>[^<]*
\s*}{}mg; # footnote foreach my $class ( map /(?<=\.)(ft\d+)(?={font-size:7px;)/mg, $_ ) { s{\(\s*]*\sclass="$class">\d+
\s*]*>\)}{}mg; }; foreach my $class ( map /(?<=\.)(ft\d+)(?={font-size:6px;)/mg, $_ ) { s{
]*>\(
\s*]*\sclass="$class">\d+
\s*]*>\) [^<]*
\s*}{}mg; }; # unwrap similarly styled bolded paragraphs s{]*class="([^"]+)"[^>]*>[^<]+\K
\s*]*class="\1"[^>]*>}{ }mg; # headline s{
]*>(TITLE \S+)
\s*]*>((?:(?!
}{]*>(SECTION \S+)
\s*]*>((?:(?!
}{]*>((?:Article|ANNEX) \S+)
\s*]*>((?:(?!
}{]*>)(\d+)\. }{
]*>\(([a-z])\)
}{]*>)\(([ivx]+)\) }{
]*>\(([ivx]+)\)
}{]*>)(?=[[:lower:]])}{}mg;
# NOTE(review): everything above this note looks like the output of an
# HTML-to-text conversion rather than the script as it was written. Do not
# hand-patch it; restore the rules from version control.
#   * The statements that follow the shebang share its physical line, so Perl
#     treats all of them (the `use` lines, the option parsing, the read_file
#     call) as comment text.
#   * Most patterns begin with a bare `]*`, i.e. the tail of a character
#     class whose opening part is missing -- presumably a `<tag[^>` prefix
#     that was stripped as if it were markup; TODO confirm against the
#     original.
#   * From the "headline" rule onward the pattern/replacement pairs are fused
#     (`}{` sequences with no statement between them) and the `((?:(?!`
#     groups are never closed.
# NOTE(review): defects to fix once the original text is restored:
#   * `my $_` declares a lexical $_; that feature was removed in Perl 5.24,
#     so the declaration no longer compiles there. Read the file into a named
#     scalar and bind each substitution to it with `=~` instead.
#   * `GetOptions ("force|f")` names no destination, so Getopt::Long stores
#     the flag in the package variable $opt_force and `my $force` is never
#     set. $force is also never read in the visible code. Pass
#     `'force|f' => \$force`, or drop the option.
#   * `my $stem = shift;` is used without checking that an argument was
#     supplied; with no argument the script would try to read '.htm'.
# Collapse every run of whitespace that contains at least one line break
# into a single space, across the whole of $_ (/g). The pattern uses neither
# ^ nor $, so the /m flag has no effect here.
# NOTE(review): the line break inside the pattern is a literal newline; it
# may originally have been a tag (e.g. a line-break element) lost in a
# text conversion of this file -- confirm against the original script.
s{\s*
\s*}{ }mg;
# Write the rewritten text in $_ to "<stem>.html" through write_file
# (File::Slurp is named in the script's `use` list).
write_file( $stem . '.html', $_ );
# Report completion on STDOUT, naming the running script ($0) and the stem.
print "DONE: $0 stem $stem\n";