#!/usr/bin/perl

# normalize HTML

use Getopt::Long;
use File::Slurp;

use strict;
use warnings;

my $force;
GetOptions ("force|f");

my $stem = shift;
my $_ = read_file( $stem . '.htm' );

# whitespace
s/(?: |\h)+/ /mg;

# preamble
s{.*>HAVE ADOPTED THIS REGULATION:</P>\s*}{}s;

# page header
s{<P\b[^>]*;top:(?:1172|1187)px;[^>]*>(?:(?!</P\b).)+.</P>\s*}{}mg;

# headline
s{<P\b[^>]*>(?:In Title \S+, the following Section \S+ is inserted:</P>\s*<P\b[^>]*>)?\'?(SECTION \S+)<br/><b>((?:(?!</P\b).)+.)</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
s{<P\b[^>]*><i>\'?(Article \S+)</i></P>}{<H1>$1</H1>}mg;
s{<P\b[^>]*>(?:Article \S+ is replaced by the following:</P>\s*<P\b[^>]*>)?\'?(Article \S+)<br/><b>((?:(?!</P\b).)+.)</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
s{<P\b[^>]*>(Article \S+) is amended as follows:</P>}{<H1>$1</H1>}mg;
s{<P\b[^>]*>(?:paragraph \S+ is replaced by the following:</P>\s*)(<P\b[^>]*>)\'?(\d+)\. }{<H3>$2</H3>\n$1}mg;
s{<P\b[^>]*>In (Article \S+), paragraph (\S+) is replaced by the following:</P>\s*(<P\b[^>]*>)\'?(\2)\. }{<H1>$1</H1>\n<H3>$2</H3>\n$3}mg;

# unwrap
s{\s*<br/>\s*}{ }mg;

write_file( $stem . '.html', $_ );

print "DONE: $0 stem $stem\n";