Bug#845297: converting translation metadata

2018-05-15 Thread Steve McIntyre
On Wed, May 16, 2018 at 01:37:25AM +0100, Steve McIntyre wrote:
>On Wed, May 16, 2018 at 12:05:17AM +0100, Steve McIntyre wrote:
>
>Initial script attached.

After some more discussion on irc with pabs, I've tweaked the script
to use file hashes instead of commit hashes. It's a little slower, but
perfectly usable (~90s runtime on my laptop). Here's the new version,
to allow for comparison.

We'll still need to update the rest of our tools, but step 1 looks OK
I think. Bedtime for now... :-)


-- 
Steve McIntyre, Cambridge, UK.st...@einval.com
  Getting a SCSI chain working is perfectly simple if you remember that there
  must be exactly three terminations: one on one end of the cable, one on the
  far end, and the goat, terminated over the SCSI chain with a silver-handled
  knife whilst burning *black* candles. --- Anthony DeBoer
#!/usr/bin/perl

# This script walks the webwml tree to look for translated files. It
# looks for the wml::debian::translation-check header to see if a file
# is a translation of an original, then checks for the revision
# status of the master document.
#
# Part of the effort to switch from CVS to Git
#
# Originally written 2018 by Steve McIntyre <93...@debian.org>
# © Copyright 2018 Software in the public interest, Inc.
# This program is released under the GNU General Public License, v2.

use strict;
use warnings;

use Getopt::Long;
use Data::Dumper;
use File::Spec::Functions;
use File::Find;
use lib ($0 =~ m|(.*)/|, $1 or ".") ."/Perl";
use Webwml::TransCheck;

# Script-wide state.
# NOTE(review): the code that parses the command line is not visible in this
# excerpt; the option names below are taken from the usage() text - confirm.

# Presumably set by --help.
my $help = 0;
# Verbosity level: vlog() prints when this is >= 1, vvlog() when it is >= 2.
my $verbose = 0;
# Presumably set by --dry-run.
my $dry_run = 0;
# Presumably the path given with --revisions=REVISIONS.
my $revs_file = "";
# Filled in by parse_revisions():
#   $rev_map{FILE}{CVS_VERSION}{"commit_hash"} = hash from the revisions file
my %rev_map;

# Print the command-line help text and terminate the program with exit
# status 0. This sub never returns to its caller.
sub usage {
    # Single-quoted heredoc: the text is emitted verbatim, no interpolation
    my $help_text = <<'EOT';
Usage: switch_to_git_translations.pl [options]
Options:
  --help display this message
  --verbose  run verbosely
  --dry-run  do not modify translation-check headers
  --revisions=REVISIONS  location of the cvs2git revisions map file

Find all wml files under the current directory, updating revisions for
translations.
EOT
    print $help_text;
    exit(0);
}

# Log a very verbose message: print the first argument followed by a
# newline on STDOUT, but only when $verbose is 2 or higher.
sub vvlog {
    my ($message) = @_;
    print STDOUT $message . "\n" if $verbose >= 2;
}

# Log a verbose message: print the first argument followed by a newline
# on STDOUT, but only when $verbose is 1 or higher.
sub vlog {
    my ($message) = @_;
    print STDOUT $message . "\n" if $verbose >= 1;
}

# Parse the revisions file for use, building a hash of the git and cvs
# versions for each file.
#
# Argument: path of the revisions map file. Every line must have the form
#   <file> <cvs version> <hex hash>
# with the three fields separated by single spaces.
#
# Side effect: fills the file-level %rev_map so that
#   $rev_map{FILE}{CVS_VERSION}{"commit_hash"}
# holds the hash listed for that CVS version of FILE.
#
# Dies if the file cannot be opened, or if any line does not match the
# expected format (the message carries the offending line number).
sub parse_revisions
{
    my $revs_file = shift;

    # Lexical handle, so this cannot clash with any other handle named IN
    open(my $in, "<", $revs_file)
        or die "Can't open revisions file \"$revs_file\" for reading: $!\n";

    while (my $line = <$in>) {
        chomp $line;

        # List assignment in scalar context yields the number of captures,
        # i.e. 0 (false) when the line does not match
        my ($file, $cvs_ver, $commit_hash) =
            $line =~ m,^(\S+) ([.\d]+) ([[:xdigit:]]+)$,
            or die "Failed to parse revisions file at line $.\n";

        $rev_map{$file}{$cvs_ver}{"commit_hash"} = $commit_hash;
        vvlog("Found file $file with CVS version $cvs_ver in commit hash $commit_hash");
    }
    close $in;

    vlog("Parsed revisions file \"$revs_file\", found revisions for " . scalar(keys %rev_map) . " files");
}

# Return a list of filenames with the given extension.
#
# Arguments:
#   $dir - directory to search recursively (using File::Find)
#   $ext - extension without the leading dot, e.g. "wml"; it is matched
#          literally, so regex metacharacters in it have no special meaning
#
# Returns the names (as reported in $File::Find::name) of all plain files
# below $dir whose name ends in ".$ext", with a leading "./" removed.
#
# Dies if either argument is missing or false.
sub find_files_ext
{
    my $dir = shift or die('Internal error: No dir specified');
    my $ext = shift or die('Internal error: No ext specified');

    # Compile once; \Q..\E keeps characters such as "." or "+" in the
    # extension from being interpreted as regex syntax
    my $ext_re = qr/\.\Q$ext\E$/;

    my @files;
    find(
        sub {
            return unless -f $_ and $_ =~ $ext_re;
            my $filename = $File::Find::name;
            # Strip only a *leading* "./". An unanchored match would also
            # eat a "./" in the middle of the path (directory "foo./").
            $filename =~ s,^\./,,;
            push @files, $filename;
        },
        $dir
    );
    return @files;
}

# Update the translation-check metadata header in a wml file.
#
# Arguments:
#   $file     - path of the wml file; it is rewritten in place
#   $revision - revision string currently held in the translation= attribute
#   $hash     - value to store there instead
#
# On every line that starts with "#use wml::debian::translation-check", the
# first translation=REVISION (with or without double quotes) is changed to
# translation=HASH. All other lines are written back unchanged.
#
# Dies if the file cannot be opened for reading or writing, or if closing
# the rewritten file fails.
sub update_wml_file_metadata
{
    my $file = shift;
    my $revision = shift;
    my $hash = shift;
    my $text = "";

    open(my $in, "<", $file) or die "Can't open $file for reading: $!\n";
    while (my $line = <$in>) {
        if ($line =~ m/^#use wml::debian::translation-check/) {
            # \Q..\E: the dots in a revision like "1.5" must match literally.
            # The lookahead stops "1.5" from matching the front of "1.50".
            $line =~ s/(translation="?)\Q$revision\E(?![.\d])("?)/$1$hash$2/;
        }
        $text .= $line;
    }
    close($in);

    open(my $out, ">", $file) or die "Can't open $file for writing: $!\n";
    print $out $text;
    # Buffered write errors only surface when the handle is closed
    close($out) or die "Can't close $file after writing: $!\n";
}

# Parse a wml file, and see if there's a translation-check header. If
# so, use the rev_map data to switch the translation information from
# the cvs version to the git hash *if available*. If it's not
# available, report an error.
sub parse_wml_file
{
my $file = shift;
my $info = 0; # Do we have any translation header info at all?
my $tc = Webwml::TransCheck->new("$file") or die "Failed transcheck: $!\n";
vlog("Looking at wml file $file");
my $target_lang = "english";
my $maint = $tc->maintainer();
if (defined($maint)) {
	vvlog("  Maintainer: $maint");
	$info += 1;
}
my $revision = $tc->revision();
if (defined($revision)) {
	vvlog("  Revision: $revision");
	$info += 1;
}
my $original = $tc->original();
if (defined($original)) {
	vvlog("  Original: $original");
	$info += 1;
	$target_lang = $original;
}
my $mindelta = $tc->mindelta();
if (defined($mindelta)) {
	vvlog("  Mindelta: $mindelta");
	$info += 1;
}
my $maxdelta = $tc->maxdelta();
if (defined($maxdelta)) {
	vvlog("  Maxdelta: $maxdelta");
	$info += 1;
}
  

Bug#845297: converting translation metadata

2018-05-15 Thread Steve McIntyre
On Wed, May 16, 2018 at 12:05:17AM +0100, Steve McIntyre wrote:
>I'm writing a script switch_to_git_translations.pl to walk through all
>the wml files and switch from cvs revision numbers to git revision
>numbers. I'm doing consistency checks as I slowly develop the
>script, for the sake of paranoia :-).
>
>I've found that there are some files that appear to have broken
>translation-check metadata. I've mentioned some in IRC, but for
>completeness we have a few more listed here.
>
>Laura and Thomas have already fixed some of these (as tagged
>here). I'm about to fix the rest myself and push the fixes.

These are all fixed now, thanks Thomas!

OK, so one more thing. In the wiki page about this area:

  "Note that there can be multiple levels of translation-check headers
   chaining through different files, for example:"

Thinking about this, that's actually irrelevant to this work. I *can*
track through a chain of dependencies here (tedious, but possible -
may potentially involve checking out different versions of files from
git), but I really don't think we need to at all. Feel free to try and
convince me otherwise!

Initial script attached.

Checking the git diff output after running the script without
--dry-run (i.e. making changes), I can see a few more files which I
think look bogus for their translation-check metadata. They've all got
multiple translation-check lines in the header, with (maybe?)
conflicting data:

tack:~/debian/www/test_webwml_cvs2git$ git diff --stat | grep -v 2
 french/consultants/xpile.wml  |   4 ++--
 japanese/international/Vietnamese.wml |   4 ++--
 russian/consultants/xpile.wml |   4 ++--
 russian/international/Croatian/index.wml  |   4 ++--
 russian/legal/anssi.wml   |   4 ++--
 43566 files changed, 43578 insertions(+), 43578 deletions(-)

For the sake of 5 files, I'm tempted to (again) just fix up the
metadata in CVS to remove any ambiguity here.

-- 
Steve McIntyre, Cambridge, UK.st...@einval.com
Who needs computer imagery when you've got Brian Blessed?
#!/usr/bin/perl

# This script walks the webwml tree to look for translated files. It
# looks for the wml::debian::translation-check header to see if a file
# is a translation of an original, then checks for the revision
# status of the master document.
#
# Part of the effort to switch from CVS to Git
#
# Originally written 2018 by Steve McIntyre <93...@debian.org>
# © Copyright 2018 Software in the public interest, Inc.
# This program is released under the GNU General Public License, v2.

use strict;
use warnings;

use Getopt::Long;
use Data::Dumper;
use File::Spec::Functions;
use File::Find;
use lib ($0 =~ m|(.*)/|, $1 or ".") ."/Perl";
use Webwml::TransCheck;

# Script-wide state.
# NOTE(review): the code that parses the command line is not visible in this
# excerpt; the option names below are taken from the usage() text - confirm.

# Presumably set by --help.
my $help = 0;
# Verbosity level: vlog() prints when this is >= 1, vvlog() when it is >= 2.
my $verbose = 0;
# Presumably set by --dry-run.
my $dry_run = 0;
# Presumably the path given with --revisions=REVISIONS.
my $revs_file = "";
# Filled in by parse_revisions():
#   $rev_map{FILE}{CVS_VERSION}{"git_hash"} = hash from the revisions file
my %rev_map;

# Print the command-line help text and terminate the program with exit
# status 0. This sub never returns to its caller.
sub usage {
    # Single-quoted heredoc: the text is emitted verbatim, no interpolation
    my $help_text = <<'EOT';
Usage: switch_to_git_translations.pl [options]
Options:
  --help display this message
  --verbose  run verbosely
  --dry-run  do not modify translation-check headers
  --revisions=REVISIONS  location of the cvs2git revisions map file

Find all wml files under the current directory, updating revisions for
translations.
EOT
    print $help_text;
    exit(0);
}

# Log a very verbose message: print the first argument followed by a
# newline on STDOUT, but only when $verbose is 2 or higher.
sub vvlog {
    my ($message) = @_;
    print STDOUT $message . "\n" if $verbose >= 2;
}

# Log a verbose message: print the first argument followed by a newline
# on STDOUT, but only when $verbose is 1 or higher.
sub vlog {
    my ($message) = @_;
    print STDOUT $message . "\n" if $verbose >= 1;
}

# Parse the revisions file for use, building a hash of the git and cvs
# versions for each file.
#
# Argument: path of the revisions map file. Every line must have the form
#   <file> <cvs version> <hex hash>
# with the three fields separated by single spaces.
#
# Side effect: fills the file-level %rev_map so that
#   $rev_map{FILE}{CVS_VERSION}{"git_hash"}
# holds the hash listed for that CVS version of FILE.
#
# Dies if the file cannot be opened, or if any line does not match the
# expected format (the message carries the offending line number).
sub parse_revisions
{
    my $revs_file = shift;

    # Lexical handle, so this cannot clash with any other handle named IN
    open(my $in, "<", $revs_file)
        or die "Can't open revisions file \"$revs_file\" for reading: $!\n";

    while (my $line = <$in>) {
        chomp $line;

        # List assignment in scalar context yields the number of captures,
        # i.e. 0 (false) when the line does not match
        my ($file, $cvs_ver, $git_hash) =
            $line =~ m,^(\S+) ([.\d]+) ([[:xdigit:]]+)$,
            or die "Failed to parse revisions file at line $.\n";

        $rev_map{$file}{$cvs_ver}{"git_hash"} = $git_hash;
#       $rev_map{"$file"}{"$git_hash"}{"cvs_ver"} = $cvs_ver;
        vvlog("Found file $file with CVS version $cvs_ver in git hash $git_hash");
    }
    close $in;

    vlog("Parsed revisions file \"$revs_file\", found revisions for " . scalar(keys %rev_map) . " files");
}

# Return a list of filenames with the given extension.
#
# Arguments:
#   $dir - directory to search recursively (using File::Find)
#   $ext - extension without the leading dot, e.g. "wml"; it is matched
#          literally, so regex metacharacters in it have no special meaning
#
# Returns the names (as reported in $File::Find::name) of all plain files
# below $dir whose name ends in ".$ext", with a leading "./" removed.
#
# Dies if either argument is missing or false.
sub find_files_ext
{
    my $dir = shift or die('Internal error: No dir specified');
    my $ext = shift or die('Internal error: No ext specified');

    # Compile once; \Q..\E keeps characters such as "." or "+" in the
    # extension from being interpreted as regex syntax
    my $ext_re = qr/\.\Q$ext\E$/;

    my @files;
    find(
        sub {
            return unless -f $_ and $_ =~ $ext_re;
            my $filename = $File::Find::name;
            # Strip only a *leading* "./". An unanchored match would also
            # eat a "./" in the middle of the path (directory "foo./").
            $filename =~ s,^\./,,;
            push @files, $filename;
        },
        $dir
    );
    return @files;
}

# Update the translation-check metadata header in a wml file
sub update_wml_file_metadata
{
my $file = shift;
my $revision = shift;
my $hash = shift;
my $text = "";

open (IN, "< $file") or die "Can't open $file for reading: $!\n";
while () {
	if (m/^#use