#!/usr/bin/perl
# normalize HTML
#
# Usage: SCRIPT [--force|-f] STEM
#
# Reads the file STEM.htm into $_ and applies a series of substitutions
# to it; every substitution below has an empty replacement, i.e. it
# deletes whatever its pattern matches.
#
# TODO: settle on a specific order of header levels
# TODO: handle more subheaders

use strict;
use warnings;
use Getopt::Long;
use File::Slurp;

# Bind the flag explicitly: without a destination Getopt::Long stores the
# value in the package global $opt_force and $force would stay undef.
my $force;
GetOptions( "force|f" => \$force )
    or die "usage: $0 [--force|-f] STEM\n";

# STEM is the input file name without its '.htm' extension.
my $stem = shift;
defined $stem
    or die "usage: $0 [--force|-f] STEM\n";

# All s/// statements below work on $_.  Lexical "my $_" was removed in
# Perl 5.24 (it is a compile-time error there), so localize the global.
local $_ = read_file( $stem . '.htm' );

# NOTE(review): several patterns below start with "]*>" or contain an
# unmatched ")" -- it looks as if the literal HTML tag text has been lost
# from them.  They are reproduced unchanged; restore the tag names from
# the original script before relying on them.

# whitespace
# NOTE(review): pattern and replacement both render as a plain space here,
# which would be a no-op; presumably the pattern was a non-breaking space
# or an entity -- confirm.
s/ / /mg;
s{
]*>\s*
\s*}{}mg;

# page header
s{]*;top:1\d{3}px;[^>]*>(?:<[bi]>)?[^<]+(?:[bi]>)?
\s*)+}{}mg;

# footnote
# NOTE(review): $class is interpolated here and a block is closed by the
# "};" below, but neither the declaration of $class nor the opening of
# that block is visible -- verify against the original script.
s{]*>\h+
\s*(?:]*>\S+
\s*]*>((?:(?!
\s*)+(?=]*\sclass="$class">\d+
\s*]*>\)}{}mg;
};

# drop document headers
s{
.*?\s*}{}msg;
s{\n