#!/usr/bin/perl
# normalize HTML
#
# Usage: normalize [--force|-f] STEM
#
# Reads STEM.htm, applies the substitutions below to the whole document held
# in one string, and writes the result to STEM.html.

use strict;
use warnings;

use Getopt::Long;
use File::Slurp;

my $usage = "Usage: $0 [--force|-f] stem\n";

# --force/-f is parsed and stored here; nothing further down consults it yet.
my $force;
GetOptions( 'force|f' => \$force )
    or die $usage;

# The first non-option argument is the file name without its extension.
my $stem = shift;
die $usage unless defined $stem;

# A named lexical is used rather than `my $_`: lexical $_ was removed from
# Perl in 5.24 and no longer compiles.
my $html = read_file( $stem . '.htm' );

# whitespace: collapse each run of horizontal whitespace to a single space
$html =~ s/(?: |\h)+/ /mg;

# preamble: drop everything up to and including the last
# ">HAVE ADOPTED THIS REGULATION:</P>" (greedy .* with /s spans newlines)
$html =~ s{.*>HAVE ADOPTED THIS REGULATION:</P>\s*}{}s;

# page header: remove <P> elements whose attributes contain
# ";top:1172px;" or ";top:1187px;"
$html =~ s{<P\b[^>]*;top:(?:1172|1187)px;[^>]*>(?:(?!</P\b).)+.</P>\s*}{}mg;

# headline
# "SECTION x<br/><b>title</b>", optionally preceded by an
# "In Title ..., the following Section ... is inserted:" paragraph
# becomes <H1>SECTION x</H1> + <H2>title</H2>.
$html =~ s{<P\b[^>]*>(?:In Title \S+, the following Section \S+ is inserted:</P>\s*<P\b[^>]*>)?\'?(SECTION \S+)<br/><b>((?:(?!</P\b).)+.)</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;

# An italic "Article x" paragraph becomes <H1>Article x</H1>.
$html =~ s{<P\b[^>]*><i>\'?(Article \S+)</i></P>}{<H1>$1</H1>}mg;

# "Article x<br/><b>title</b>", optionally preceded by an
# "Article ... is replaced by the following:" paragraph
# becomes <H1>Article x</H1> + <H2>title</H2>.
$html =~ s{<P\b[^>]*>(?:Article \S+ is replaced by the following:</P>\s*<P\b[^>]*>)?\'?(Article \S+)<br/><b>((?:(?!</P\b).)+.)</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;

# "Article x is amended as follows:" becomes <H1>Article x</H1>.
$html =~ s{<P\b[^>]*>(Article \S+) is amended as follows:</P>}{<H1>$1</H1>}mg;

# "paragraph n is replaced by the following:" followed by a paragraph that
# starts with "N. ": emit <H3>N</H3>, keep the opening <P ...> tag of the
# following paragraph and drop its leading "N. " prefix.
$html =~ s{<P\b[^>]*>(?:paragraph \S+ is replaced by the following:</P>\s*)(<P\b[^>]*>)\'?(\d+)\. }{<H3>$2</H3>\n$1}mg;

# "In Article x, paragraph n is replaced by the following:" followed by a
# paragraph starting with the same "n. " (backreference \2): emit
# <H1>Article x</H1> + <H3>n</H3>, keep the following <P ...> tag and drop
# the "n. " prefix.
$html =~ s{<P\b[^>]*>In (Article \S+), paragraph (\S+) is replaced by the following:</P>\s*(<P\b[^>]*>)\'?(\2)\. }{<H1>$1</H1>\n<H3>$2</H3>\n$3}mg;

# unwrap: replace each <br/> and the whitespace around it with one space
$html =~ s{\s*<br/>\s*}{ }mg;

write_file( $stem . '.html', $html );

print "DONE: $0 stem $stem\n";