From e9a1b2ed8d2b676090e835ecd030b48ecd121811 Mon Sep 17 00:00:00 2001 From: Jonas Smedegaard Date: Tue, 6 Nov 2007 17:21:07 +0100 Subject: Use local Locale::Po4a::Text tweaked to not wrap paragraphs containing headlines, blockquotes, triple-quotes or html tags. --- Locale/Po4a/Text.pm | 260 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 260 insertions(+) create mode 100644 Locale/Po4a/Text.pm (limited to 'Locale') diff --git a/Locale/Po4a/Text.pm b/Locale/Po4a/Text.pm new file mode 100644 index 0000000..836d2c1 --- /dev/null +++ b/Locale/Po4a/Text.pm @@ -0,0 +1,260 @@ +#!/usr/bin/perl -w + +# Po4a::Text.pm +# +# extract and translate translatable strings from a text documents +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +# +######################################################################## + +=head1 NAME + +Locale::Po4a::Text - Convert text documents from/to PO files + +=head1 DESCRIPTION + +The po4a (po for anything) project goal is to ease translations (and more +interestingly, the maintenance of translations) using gettext tools on +areas where they were not expected like documentation. + +Locale::Po4a::Text is a module to help the translation of text documents into +other [human] languages. 
+ +Paragraphs are split on empty lines (or lines containing only spaces or +tabs). + +If a paragraph contains a line starting with a space (or tab), this +paragraph won't be rewrapped. + +=cut + +package Locale::Po4a::Text; + +use 5.006; +use strict; +use warnings; + +require Exporter; +use vars qw(@ISA @EXPORT); +@ISA = qw(Locale::Po4a::TransTractor); +@EXPORT = qw(); + +use Locale::Po4a::TransTractor; +use Locale::Po4a::Common; + +=head1 OPTIONS ACCEPTED BY THIS MODULE + +These are this module's particular options: + +=over + +=item B<nobullets> + +Deactivate detection of bullets. + +By default, when a bullet is detected, the bullet paragraph is not considered +as a verbatim paragraph (with the no-wrap flag in the PO file), but the module +rewraps this paragraph in the generated PO file and in the translation. + +=cut + +my $bullets = 1; + +=item B<debianchangelog> + +Handle the header and footer of +released versions, which only contain non-translatable information. + +=cut + +my $debianchangelog = 0; + +=item B<markdown> + +Handle some special markup in Markdown-formatted texts. 
=cut

# Markdown handling is already enabled by default in this copy of the module,
# so passing the "markdown" option to initialize() does not change anything.
my $markdown = 1;

# initialize(%options)
#
# Reads the module options and updates the file-scoped flags accordingly:
#   nobullets        -> $bullets = 0          (disable bullet detection)
#   debianchangelog  -> $debianchangelog = 1  (skip changelog header/trailer)
#   markdown         -> $markdown = 1         (Markdown-aware paragraph breaks)
# An option counts as "given" as soon as its value is defined.
#
# NOTE(review): the flags are file-scoped and are never reset here, so a flag
# set for one object stays set for every object handled later in the same
# process -- confirm this is acceptable for the callers.
sub initialize {
    my $self = shift;
    my %options = @_;

    # Only 'nobullets' is recorded in the object's option table.
    $self->{options}{'nobullets'}='';

    if (defined $options{'nobullets'}) {
        $bullets = 0;
    }

    if (defined $options{'debianchangelog'}) {
        $debianchangelog=1;
    }

    if (defined $options{'markdown'}) {
        $markdown=1;
    }
}

# parse()
#
# Reads the input line by line through $self->shiftline(), groups the lines
# into paragraphs and hands every finished paragraph to do_paragraph().
#
# A paragraph ends at:
#   * a Debian changelog header or trailer line (only with debianchangelog);
#     these lines are pushed to the output as they are,
#   * a line that is empty or contains only white space,
#   * a line made only of '=', '_' or '-' characters,
#   * with markdown enabled, a line that starts with '#' or '>', contains
#     '<' or '>', or starts or ends with '"""'.
# In the last two cases the line itself is the final line of the paragraph,
# which is then emitted without rewrapping.
#
# $wrapped_mode says whether the current paragraph may be rewrapped; it is
# cleared by a line starting with white space, by a paragraph starting with
# '*' or a number followed by '.' or ')' and a space, and by any run of
# three blanks.
sub parse {
    my $self = shift;
    my ($line,$ref);
    my $paragraph="";
    my $wrapped_mode = 1;
    my $expect_header = 1;
    ($line,$ref)=$self->shiftline();
    while (defined($line)) {
        chomp($line);
        $self->{ref}="$ref";
        if ($debianchangelog and
            $expect_header and
            $line =~ /^(\w[-+0-9a-z.]*)\ \(([^\(\) \t]+)\)   # src, version
                       \s+([-+0-9a-z.]+);                     # distribution
                       \s*urgency\s*\=\s*(.*\S)\s*$/ix) {     #
            # Found header: flush the pending paragraph and copy the header
            # line untouched.
            do_paragraph($self,$paragraph,$wrapped_mode);
            $paragraph="";
            $self->pushline("$line\n");
            $expect_header=0;
        } elsif ($debianchangelog and
                 # Debian changelogs put two spaces between "<email>" and the
                 # date, so one or more spaces are accepted at that position.
                 $line =~ m/^ \-\- (.*) <(.*)> +((\w+\,\s*)?\d{1,2}\s+\w+\s+\d{4}\s+\d{1,2}:\d\d:\d\d\s+[-+]\d{4}(\s+\([^\\\(\)]\))?)$/) {
            # Found trailer
            do_paragraph($self,$paragraph,$wrapped_mode);
            $paragraph="";
            $self->pushline("$line\n");
            $expect_header=1;
        } elsif ($line =~ /^\s*$/) {
            # Break paragraphs on lines containing only spaces
            do_paragraph($self,$paragraph,$wrapped_mode);
            # A rewrapped, non-empty paragraph gets an extra newline pushed
            # after it.
            $self->pushline("\n") unless (   $wrapped_mode == 0
                                          or $paragraph eq "");
            $paragraph="";
            $wrapped_mode = 1;
            $self->pushline($line."\n");
        } elsif (   $line =~ /^=*$/
                 or $line =~ /^_*$/
                 or $line =~ /^-*$/) {
            # Line made only of '=', '_' or '-': it closes the paragraph,
            # which is emitted without rewrapping.
            $wrapped_mode = 0;
            $paragraph .= $line."\n";
            do_paragraph($self,$paragraph,$wrapped_mode);
            $paragraph="";
            $wrapped_mode = 1;
        } elsif ($markdown and
                 (   $line =~ m/^#/            # headline
                  or $line =~ m/^>/            # blockquote
                  or $line =~ m/[<>]/          # maybe html
                  or $line =~ m/^"""/          # textblock inside macro end
                  or $line =~ m/"""$/)) {      # textblock inside macro begin
            # Markdown-specific line: it closes the paragraph, which is
            # emitted without rewrapping.
            $wrapped_mode = 0;
            $paragraph .= $line."\n";
            do_paragraph($self,$paragraph,$wrapped_mode);
            $paragraph="";
            $wrapped_mode = 1;
        } else {
            if ($line =~ /^\s/) {
                # A line starting by a space indicates a non-wrap
                # paragraph
                $wrapped_mode = 0;
            }
            $paragraph .= $line."\n";
        }
        # paragraphs starting by a bullet, or numbered
        # or paragraphs with a line containing many consecutive spaces
        # (three or more)
        # are considered as verbatim paragraphs
        $wrapped_mode = 0 if (   $paragraph =~ m/^(\*|[0-9]+[.)] )/s
                              or $paragraph =~ m/[ \t][ \t][ \t]/s);
        ($line,$ref)=$self->shiftline();
    }
    # Flush whatever is left once the input is exhausted.
    if (length $paragraph) {
        do_paragraph($self,$paragraph,$wrapped_mode);
    }
}

# do_paragraph($self, $paragraph, $wrap)
#
# Translates one paragraph and pushes the result to the output.
#   $paragraph  text of the paragraph; nothing happens when it is empty
#   $wrap       passed as the "wrap" option when the paragraph is translated
#               as plain text
#
# With bullet detection enabled, leading bullet items are peeled off one at a
# time. Each item is translated on its own with wrapping forced on, and the
# bullet and the continuation indent are put back around the translation.
# Whatever cannot be handled as a bullet item is translated as "Plain text".
sub do_paragraph {
    my ($self, $paragraph, $wrap) = (shift, shift, shift);
    return if ($paragraph eq "");

    if ($bullets) {
        # Detect bullets
        # |   * blah blah
        # |     blah
        # |     ^-- aligned
        #
        # Other bullets supported:
        # - blah         o blah         + blah
        # 1. blah       1) blah       (1) blah
        TEST_BULLET:
        while ($paragraph =~ m/^(\s*)((?:[-*o+]|([0-9]+[.\)])|\([0-9]+\))\s+)([^\n]*\n)(.*)$/s) {
            my $para = $5;
            my $bullet = $2;
            my $indent1 = $1;
            my $indent2 = "$1".(' ' x length $bullet);
            my $text = $4;
            # Collect the continuation lines aligned with the item's text,
            # as long as the rest does not contain a bullet at that indent.
            while (    $para !~ m/\Q$indent2\E(?:[-*o+]|([0-9]+[.\)])|\([0-9]+\))\s+/
                   and $para =~ s/^\Q$indent2\E(\S[^\n]*\n)//s) {
                $text .= $1;
            }
            # TODO: detect if a line starts with the same bullet
            # Text with a run of three or more blanks between words is not
            # handled as a bullet item.
            last TEST_BULLET if ($text =~ m/\S[ \t][ \t][ \t]+\S/s);

            # The remainder must be empty or begin with the same kind of
            # bullet (any number is accepted in place of the first one).
            my $bullet_regex = quotemeta($indent1.$bullet);
            $bullet_regex =~ s/[0-9]+/\\d\+/;
            last TEST_BULLET unless ($para eq '' or $para =~ m/^$bullet_regex\S/s);

            my $trans = $self->translate($text,
                                         $self->{ref},
                                         "Bullet: '$indent1$bullet'",
                                         "wrap" => 1,
                                         "wrapcol" => - (length $indent2));
            # Put the bullet back in front of the first line and indent the
            # following lines to match.
            $trans =~ s/^/$indent1$bullet/s;
            $trans =~ s/\n(.)/\n$indent2$1/sg;
            $self->pushline( $trans."\n" );
            return if ($para eq '');

            # Another bullet
            $paragraph = $para;
        }
    }
    # TODO: detect indented paragraphs

    $self->pushline( $self->translate($paragraph,
                                      $self->{ref},
                                      "Plain text",
                                      "wrap" => $wrap) );
}

1;

=head1 STATUS OF THIS
MODULE + +Tested successfully on simple text files and NEWS.Debian files. + +=head1 AUTHORS + + Nicolas François + +=head1 COPYRIGHT AND LICENSE + + Copyright 2005,2007 by Nicolas FRANÇOIS . + +This program is free software; you may redistribute it and/or modify it +under the terms of GPL (see the COPYING file). -- cgit v1.2.3