- #!/usr/bin/perl -w
- # Po4a::Text.pm
- #
- # extract and translate translatable strings from text documents
- #
- # This program is free software; you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or
- # (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program; if not, write to the Free Software
- # Foundation, Inc.,
- # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- #
- ########################################################################
- =head1 NAME
- Locale::Po4a::Text - Convert text documents from/to PO files
- =head1 DESCRIPTION
- The po4a (po for anything) project goal is to ease translations (and more
- interestingly, the maintenance of translations) using gettext tools on
- areas where they were not expected like documentation.
- Locale::Po4a::Text is a module to help the translation of text documents into
- other [human] languages.
- Paragraphs are split on empty lines (or lines containing only spaces or
- tabulations).
- If a paragraph contains a line starting with a space (or tabulation), this
- paragraph won't be rewrapped.
- =cut
- package Locale::Po4a::Text;
- use 5.006;
- use strict;
- use warnings;
- require Exporter;
- use vars qw(@ISA @EXPORT);
- @ISA = qw(Locale::Po4a::TransTractor);
- @EXPORT = qw();
- use Locale::Po4a::TransTractor;
- use Locale::Po4a::Common;
- =head1 OPTIONS ACCEPTED BY THIS MODULE
- These are this module's particular options:
- =over
- =item B<nobullets>
- Deactivate detection of bullets.
- By default, when a bullet is detected, the bullet paragraph is not considered
- as a verbatim paragraph (with the no-wrap flag in the PO file), but the module
- rewraps this paragraph in the generated PO file and in the translation.
- =cut
- my $bullets = 1; # on by default; initialize() sets it to 0 for 'nobullets', do_paragraph() reads it
- =item B<debianchangelog>
- Handle the header and footer of
- released versions, which only contain non-translatable information.
- =cut
- my $debianchangelog = 0; # off by default; initialize() sets it to 1 for 'debianchangelog', parse() reads it
- =item B<fortunes>
- Handle the fortunes format, which separates fortunes with a line
- consisting of '%' or '%%', and uses '%%' as the beginning of a comment.
- =cut
- my $fortunes = 0; # off by default; initialize() sets it to 1 for 'fortunes', parse() reads it
- =item B<markdown>
- Handle some special markup in Markdown-formatted texts.
- =cut
- my $markdown = 0; # off by default; initialize() sets it to 1 for 'markdown', parse() reads it
# Read the module-specific options and set the file-level parser flags.
#
# Arguments: the invocant, followed by an option => value list.  An option
# counts as selected as soon as its value is defined, whatever that value is.
#
# The flags ($bullets, $debianchangelog, $fortunes, $markdown) are lexicals
# shared by the whole file, not per-object data.  Each one is therefore
# recomputed from the options on every call instead of only being switched
# on: otherwise an option selected for one instance would silently remain in
# force for every instance initialized later in the same process.
sub initialize {
    my $self    = shift;
    my %options = @_;

    # Record 'nobullets' in the instance's option table (unchanged behavior).
    $self->{options}{'nobullets'} = '';

    # 'nobullets' is a negative option: selecting it turns detection off.
    $bullets         = defined $options{'nobullets'}       ? 0 : 1;
    $debianchangelog = defined $options{'debianchangelog'} ? 1 : 0;
    $fortunes        = defined $options{'fortunes'}        ? 1 : 0;
    $markdown        = defined $options{'markdown'}        ? 1 : 0;

    return;
}
# Read the input document line by line, cut it into paragraphs and pass each
# paragraph to do_paragraph(); lines that only carry structure (blank lines,
# separators, changelog headers and trailers, ...) are pushed to the output
# as they are.
#
# Arguments: the invocant only.  Lines and their "file:line" references come
# from $self->shiftline(); output goes through $self->pushline() and
# do_paragraph().  The reference of the line being handled is stored in
# $self->{ref}.
#
# State kept while scanning:
#   $paragraph        - text accumulated for the current paragraph
#   $wrapped_mode     - 1 while the paragraph may be rewrapped, 0 otherwise
#   $expect_header    - 1 while a Debian changelog header line is acceptable
#   $end_of_paragraph - set by a branch to flush the paragraph after the
#                       per-line checks have run
sub parse {
    my $self = shift;

    my $paragraph        = "";
    my $wrapped_mode     = 1;
    my $expect_header    = 1;
    my $end_of_paragraph = 0;

    my ($line, $ref) = $self->shiftline();

    # Empty input: nothing to translate.  Returning here also avoids running
    # the file-name bookkeeping below on an undefined reference.
    return unless defined $line;

    my $file = $ref;
    $file =~ s/:[0-9]+$//;

    while (defined($line)) {
        # Take the file name from the match result itself.  Reading $1 after
        # an unchecked match would use a stale or undefined capture whenever
        # the reference has no ":<line>" suffix; fall back to the whole
        # reference, which is also what the initial computation of $file
        # yields in that case.
        my $current_file = ($ref =~ m/^(.*):[0-9]+$/) ? $1 : $ref;
        if ($current_file ne $file) {
            # A new input file starts: flush and reset the scanner state.
            $file = $current_file;
            do_paragraph($self, $paragraph, $wrapped_mode);
            $paragraph     = "";
            $wrapped_mode  = 1;
            $expect_header = 1;
        }

        # TODO: preserve original line ends throughout the code instead
        chomp($line);
        $self->{ref} = "$ref";

        if ($debianchangelog and
            $expect_header and
            $line =~ /^(\w[-+0-9a-z.]*)\ \(([^\(\) \t]+)\) # src, version
                      \s+([-+0-9a-z.]+);                   # distribution
                      \s*urgency\s*\=\s*(.*\S)\s*$/ix) {
            # Changelog entry header: copied untranslated.
            do_paragraph($self, $paragraph, $wrapped_mode);
            $paragraph = "";
            $self->pushline("$line\n");
            $expect_header = 0;
        } elsif ($debianchangelog and
                 $line =~ m/^ \-\- (.*) <(.*)> {1,2}((\w+\,\s*)?\d{1,2}\s+\w+\s+\d{4}\s+\d{1,2}:\d\d:\d\d\s+[-+]\d{4}(\s+\([^\\\(\)]\))?)$/) {
            # Found trailer.  The changelog format puts two spaces between
            # the maintainer address and the date, so two must be accepted;
            # a single space is still tolerated.
            do_paragraph($self, $paragraph, $wrapped_mode);
            $paragraph = "";
            $self->pushline("$line\n");
            $expect_header = 1;
        } elsif ($fortunes and
                 $line =~ m/^%%?\s*$/) {
            # Found end of fortune
            do_paragraph($self, $paragraph, $wrapped_mode);
            # FIXME: test if this is still needed when always adding
            #        newline in do_paragraph()
            $self->pushline("\n") unless (   $wrapped_mode == 0
                                          or $paragraph eq "");
            $paragraph    = "";
            $wrapped_mode = 1;
            $self->pushline("$line\n");
        } elsif ($line =~ /^\s*$/) {
            # Break paragraphs on lines containing only spaces
            do_paragraph($self, $paragraph, $wrapped_mode);
            $paragraph    = "";
            $wrapped_mode = 1;
            $self->pushline($line."\n");
        } elsif ($line =~ /^-- $/) {
            # Break paragraphs on email signature hint
            do_paragraph($self, $paragraph, $wrapped_mode);
            $paragraph    = "";
            $wrapped_mode = 1;
            $self->pushline($line."\n");
        } elsif (   $line =~ /^=+$/
                 or $line =~ /^_+$/
                 or $line =~ /^-+$/) {
            # A line made only of "=", "_" or "-" closes the paragraph it
            # follows; the whole paragraph is then handled unwrapped.
            $wrapped_mode = 0;
            $paragraph .= $line."\n";
            do_paragraph($self, $paragraph, $wrapped_mode);
            $paragraph    = "";
            $wrapped_mode = 1;
        } elsif ($markdown and
                 (   $line =~ /^\s*\[\[\!\S+\s*$/         # macro begin
                  or $line =~ /^\s*"""\s*\]\]\s*$/)) {    # """ textblock inside macro end
            # Avoid translating Markdown lines containing only markup
            do_paragraph($self, $paragraph, $wrapped_mode);
            $paragraph    = "";
            $wrapped_mode = 1;
            $self->pushline("$line\n");
        } elsif ($markdown and
                 (   $line =~ /^#/                            # headline
                  or $line =~ /^\s*\[\[\!\S[^\]]*\]\]\s*$/)) { # sole macro
            # Preserve some Markdown markup as a single line
            do_paragraph($self, $paragraph, $wrapped_mode);
            $paragraph        = "$line\n";
            $wrapped_mode     = 0;
            $end_of_paragraph = 1;
        } elsif ($markdown and
                 ($line =~ /^"""/)) { # """ textblock inside macro end
            # Markdown markup needing separation _before_ this line
            do_paragraph($self, $paragraph, $wrapped_mode);
            $paragraph    = "$line\n";
            $wrapped_mode = 1;
        } else {
            if ($line =~ /^\s/) {
                # A line starting by a space indicates a non-wrap
                # paragraph
                $wrapped_mode = 0;
            }
            if ($markdown and
                (   $line =~ /\S {2,}$/   # explicit newline (two or more trailing spaces)
                 or $line =~ /"""$/)) {   # """ textblock inside macro begin
                # Markdown markup needing separation _after_ this line
                $end_of_paragraph = 1;
            }
            if ($fortunes) {
                # Drop everything from "%%" to the end of the line.
                $line =~ s/%%(.*)$//;
            }
            # TODO: comments
            $paragraph .= $line."\n";
        }

        # paragraphs starting by a bullet, or numbered
        # or paragraphs with a line containing many consecutive spaces
        # (more than 3)
        # are considered as verbatim paragraphs
        $wrapped_mode = 0 if (   $paragraph =~ m/^(\*|[0-9]+[.)] )/s
                              or $paragraph =~ m/[ \t][ \t][ \t]/s);

        if ($markdown) {
            # Some Markdown markup can (or might) not survive wrapping
            $wrapped_mode = 0 if (
                   $paragraph =~ /^>/ms                     # blockquote
                or $paragraph =~ /^( {8}|\t)/ms             # monospaced
                or $paragraph =~ /^\$(\S+[{}]\S*\s*)+/ms    # Xapian macro
                or $paragraph =~ /<(?![a-z]+[:@])/ms        # maybe html (tags but not wiki <URI>)
                or $paragraph =~ /^[^<]+>/ms                # maybe html (tag with vertical space)
                or $paragraph =~ /^\s*\[\[\!\S[^\]]+$/ms    # macro begin
            );
        }

        if ($end_of_paragraph) {
            do_paragraph($self, $paragraph, $wrapped_mode);
            $paragraph        = "";
            $wrapped_mode     = 1;
            $end_of_paragraph = 0;
        }

        ($line, $ref) = $self->shiftline();
    }

    # Flush whatever is left once the input is exhausted.
    if (length $paragraph) {
        do_paragraph($self, $paragraph, $wrapped_mode);
    }
}
# Translate one paragraph and push the result to the output document.
#
#   $self      - parser object; its translate() and pushline() methods and
#                its {ref} field are used
#   $paragraph - text of the paragraph; an empty string is ignored
#   $wrap      - value given as the "wrap" flag when the paragraph is
#                translated as plain text
#
# While bullet detection is enabled, list items found at the start of the
# paragraph are translated one at a time, each without its marker, and
# re-emitted with the marker and a hanging indent.  Whatever cannot be
# handled as a list item is translated in one piece as "Plain text".
sub do_paragraph {
    my ($self, $paragraph, $wrap) = @_;
    return if $paragraph eq "";

    if ($bullets) {
        # An item is: optional indentation, a marker, white space, the rest
        # of the line, then continuation lines aligned under the item text.
        # Supported markers:  -  *  o  +  1.  1)  (1)
        ITEM:
        while ($paragraph =~ m/^(\s*)((?:[-*o+]|([0-9]+[.\)])|\([0-9]+\))\s+)([^\n]*\n)(.*)$/s) {
            my ($lead, $marker, $item, $rest) = ($1, $2, $4, $5);
            # Continuation lines are indented by the width of lead + marker.
            my $hang = $lead . (' ' x length $marker);

            # Pull continuation lines into the item, but only as long as no
            # indented marker shows up in the remaining text.
            while ($rest !~ m/$hang(?:[-*o+]|([0-9]+[.\)])|\([0-9]+\))\s+/
                   and $rest =~ s/^$hang(\S[^\n]*\n)//s) {
                $item .= $1;
            }

            # TODO: detect if a line starts with the same bullet
            # Wide gaps inside the item: give up on list handling.
            last ITEM if $item =~ m/\S[ \t][ \t][ \t]+\S/s;

            # What follows must be nothing, or an item with the same lead
            # and marker (any number where the marker has one).
            my $same_marker = quotemeta($lead.$marker);
            $same_marker =~ s/[0-9]+/\\d\+/;
            last ITEM unless ($rest eq '' or $rest =~ m/^$same_marker\S/s);

            my $trans = $self->translate($item,
                                         $self->{ref},
                                         "Bullet: '$lead$marker'",
                                         "wrap" => 1,
                                         "wrapcol" => - (length $hang));
            # Put the marker back and indent the following lines under it.
            $trans =~ s/^/$lead$marker/s;
            $trans =~ s/\n(.)/\n$hang$1/sg;
            $self->pushline( $trans."\n" );

            return if $rest eq '';

            # Another bullet: go round again with the remaining text.
            $paragraph = $rest;
        }
    }

    # TODO: detect indented paragraphs
    my $transfinal = $self->translate($paragraph,
                                      $self->{ref},
                                      "Plain text",
                                      "wrap" => $wrap);

    # TODO: preserve original line ends throughout the code instead
    # Make the output end with a newline, dropping one trailing newline
    # from the translation first.
    chomp $transfinal;
    $self->pushline( $transfinal . "\n" );
}
- 1;
- =head1 STATUS OF THIS MODULE
- Tested successfully on simple text files and NEWS.Debian files.
- =head1 AUTHORS
- Nicolas François <nicolas.francois@centraliens.net>
- =head1 COPYRIGHT AND LICENSE
- Copyright 2005-2008 by Nicolas FRANÇOIS <nicolas.francois@centraliens.net>.
- This program is free software; you may redistribute it and/or modify it
- under the terms of GPL (see the COPYING file).
|