blob: ae183917217c99693565615a17786352edd5db64 (
plain)
#!/usr/bin/perl
# normalize HTML
#
# Usage: <script> [--force|-f] STEM
#
# Reads STEM.htm, applies the substitutions in normalize_html() and writes
# the result to STEM.html, then prints a "DONE" line on standard output.
use strict;
use warnings;
use Getopt::Long;
use File::Slurp;

# Apply the normalization substitutions to one HTML document.
#
# Takes the document as a single string and returns the rewritten string;
# the argument itself is not modified.  The substitutions run in a fixed
# order, and later ones see the output of earlier ones.
#
# None of the /mg patterns use /s, so "." never matches a newline there:
# a paragraph is only rewritten when its matched text sits on one line.
sub normalize_html {
    my ($html) = @_;

    # whitespace: squeeze every run of horizontal whitespace to one space
    # (newlines are left alone, \h does not match them).
    # NOTE(review): the literal-space alternative is already covered by \h;
    # it may originally have been a non-breaking space or entity -- confirm.
    $html =~ s/(?: |\h)+/ /mg;

    # preamble: drop everything up to and including the last
    # ">HAVE ADOPTED THIS REGULATION:</P>" (greedy .* with /s) plus the
    # whitespace that follows it.
    $html =~ s{.*>HAVE ADOPTED THIS REGULATION:</P>\s*}{}s;

    # page header: delete <P> elements whose attributes contain
    # ";top:1172px;" or ";top:1187px;".
    $html =~ s{<P\b[^>]*;top:(?:1172|1187)px;[^>]*>(?:(?!</P\b).)+.</P>\s*}{}mg;

    # headline: "SECTION x<br/><b>title</b>" becomes <H1>/<H2>; an
    # immediately preceding "In Title ..., the following Section ... is
    # inserted:" paragraph is swallowed.
    $html =~ s{<P\b[^>]*>(?:In Title \S+, the following Section \S+ is inserted:</P>\s*<P\b[^>]*>)?\'?(SECTION \S+)<br/><b>((?:(?!</P\b).)+.)</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;

    # An italic "Article x" paragraph becomes a bare <H1>.
    $html =~ s{<P\b[^>]*><i>\'?(Article \S+)</i></P>}{<H1>$1</H1>}mg;

    # "Article x<br/><b>title</b>" becomes <H1>/<H2>; an immediately
    # preceding "Article ... is replaced by the following:" paragraph is
    # swallowed.
    $html =~ s{<P\b[^>]*>(?:Article \S+ is replaced by the following:</P>\s*<P\b[^>]*>)?\'?(Article \S+)<br/><b>((?:(?!</P\b).)+.)</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;

    # "Article x is amended as follows:" becomes <H1>Article x</H1>.
    $html =~ s{<P\b[^>]*>(Article \S+) is amended as follows:</P>}{<H1>$1</H1>}mg;

    # "paragraph n is replaced by the following:" followed by a paragraph
    # starting "n. ": emit <H3>n</H3>, keep the second <P ...> tag and drop
    # the leading "n. " (and optional quote) from its text.
    $html =~ s{<P\b[^>]*>(?:paragraph \S+ is replaced by the following:</P>\s*)(<P\b[^>]*>)\'?(\d+)\. }{<H3>$2</H3>\n$1}mg;

    # Same, for the "In Article x, paragraph n is replaced ..." form; the
    # \2 backreference requires the following paragraph to start with the
    # same paragraph token.
    $html =~ s{<P\b[^>]*>In (Article \S+), paragraph (\S+) is replaced by the following:</P>\s*(<P\b[^>]*>)\'?(\2)\. }{<H1>$1</H1>\n<H3>$2</H3>\n$3}mg;

    # unwrap: each <br/> and the whitespace around it becomes one space.
    $html =~ s{\s*<br/>\s*}{ }mg;

    return $html;
}

# Command-line entry point: parse options, convert STEM.htm to STEM.html.
sub main {
    my $usage = "usage: $0 [--force|-f] stem\n";

    # --force/-f is accepted and stored, but nothing below consults it.
    my $force;
    GetOptions( 'force|f' => \$force ) or die $usage;

    my $stem = shift @ARGV;
    die $usage unless defined $stem && length $stem;

    # A named lexical is used instead of "my $_": lexical $_ was removed
    # in Perl 5.24 and no longer compiles.
    my $html = read_file( $stem . '.htm' );
    write_file( $stem . '.html', normalize_html($html) );
    print "DONE: $0 stem $stem\n";
    return;
}

# Run only when executed as a script, so the file can also be loaded
# (e.g. by a test) without touching any files.
main() unless caller;
|