#!/usr/bin/perl

# normalize HTML
#
# TODO: strip <!-- ... --> comments

use Getopt::Long;
use File::Slurp;

use strict;
use warnings;

# Usage: SCRIPT [--force|-f] STEM
#
# Reads STEM.htm, applies the substitutions below to the whole document held
# in $html, and writes the result to STEM.html.

# --force/-f is parsed so that existing invocations keep working; nothing
# below reads $force yet.
my $force;
GetOptions( 'force|f' => \$force )
	or die "usage: $0 [--force] STEM\n";

my $stem = shift;
die "usage: $0 [--force] STEM\n"
	unless defined $stem && length $stem;

# The document is kept in a named lexical: `my $_` (lexical $_) was an
# experimental feature that was removed in Perl 5.24 and no longer compiles.
my $html = read_file( $stem . '.htm' );

# whitespace: replace non-breaking-space entities with plain spaces
$html =~ s/&#160;/ /mg;

# page header: drop paragraphs positioned at top:62px or top:63px
$html =~ s{<P\b[^>]*;top:6[23]px;[^>]*>[^<]*</P>\s*}{}mg;

# footnote
# The class names are collected from style rules of the form
# ".ftN{font-size:7px;" / ".ftN{font-size:6px;", so this has to happen
# before the <HEAD> sections are dropped further down.
# The literal "{" is escaped because newer perls reject or warn about an
# unescaped left brace in many regex contexts.
my @classes_7px = $html =~ /(?<=\.)(ft\d+)(?=\{font-size:7px;)/mg;
foreach my $class (@classes_7px) {
	# "(" + number paragraph in a 7px class + ")" split over three paragraphs
	$html =~ s{\(</P>\s*<P\b[^>]*\sclass="$class">\d+</P>\s*<P\b[^>]*>\)}{}mg;
}
# Collected only after the 7px pass, in the same order as before.
my @classes_6px = $html =~ /(?<=\.)(ft\d+)(?=\{font-size:6px;)/mg;
foreach my $class (@classes_6px) {
	# whole "(" / number / ") text" paragraph triple whose number uses a 6px class
	$html =~ s{<P\b[^>]*>\(</P>\s*<P\b[^>]*\sclass="$class">\d+</P>\s*<P\b[^>]*>\) [^<]*</P>\s*}{}mg;
}

# drop document headers
$html =~ s{<HEAD>.*?</HEAD>\s*}{}msg;
# ... and everything from one document's closing tags up to and including
# the next opening <DIV ...>
$html =~ s{</DIV>\n</BODY>\n</HTML>.*?<DIV\b[^>]*>\s*}{}msg;

# unwrap similarly styled bolded paragraphs
# \K keeps the first paragraph's opening tags and text; only the seam between
# two bold paragraphs of the same class is replaced by a single space.
$html =~ s{<P\b[^>]*class="([^"]+)"[^>]*><b>[^<]+\K</b></P>\s*<P\b[^>]*class="\1"[^>]*><b>}{ }mg;

# headline
# "TITLE x" / "SECTION x" / "Article x" / "ANNEX x" followed by its caption
# paragraph becomes an <H1>/<H2> pair.
$html =~ s{<P\b[^>]*>(TITLE \S+)</P>\s*<P\b[^>]*><b>((?:(?!</P\b).)+.)</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
$html =~ s{<P\b[^>]*><i>(SECTION \S+)</i></P>\s*<P\b[^>]*><i><b>((?:(?!</P\b).)+.)</b></i></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
$html =~ s{<P\b[^>]*><i>((?:Article|ANNEX) \S+)</i></P>\s*<P\b[^>]*><b>((?:(?!</P\b).)+.)</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
# "N. " at the start of a paragraph -> <H3>N</H3> in front of that paragraph
$html =~ s{(<P\b[^>]*>)(\d+)\. }{<H3>$2</H3>\n$1}mg;
# a paragraph consisting only of "(a)" .. "(z)" -> <H4>
$html =~ s{<P\b[^>]*>\(([a-z])\)</P>}{<H4>$1</H4>}mg;
# "(i)", "(iv)", ... either leading a paragraph or standing alone -> <H5>
$html =~ s{(<P\b[^>]*>)\(([ivx]+)\) }{<H5>$2</H5>\n$1}mg;
$html =~ s{<P\b[^>]*>\(([ivx]+)\)</P>}{<H5>$1</H5>}mg;

# unwrap
# Remove a hyphen plus the line/paragraph break after it when the text
# continues with a lower-case letter, then turn remaining <br/> into spaces.
$html =~ s{(?<=\S)-(<br/>|</P>\s*<P\b[^>]*>)(?=[[:lower:]])}{}mg;
$html =~ s{\s*<br/>\s*}{ }mg;

# drop styling: strip every attribute from <P ...> tags
$html =~ s{<P\b[^>]*>}{<P>}mg;

write_file( $stem . '.html', $html );

print "DONE: $0 stem $stem\n";