Hi,
Attached is a patch against svn r1127 which adds a ReplicationPolicy
which aims to put a copy of a file on a different network to the
original. There are also some tests, which are useful for getting an idea of
how it works.
By default a "different network" is one on a different /16 block
(255.255.0.0) to the host storing the original copy. That behaviour
should be OK for most, but if you need to change it, you can define
"network zones" with server settings and give yourself lots of different
netmasks, see the POD.
Read server/doc/pluggable-replication-policies.txt for the gory details
on how to change your network policy for a class.
This patch doesn't include the earlier bugfix to Class.pm.
It was a bit nasty having to change Config.pm to allow new server
settings.
Missing from this is a change to the behaviour of mogilefs when putting
a new file. That's next on my list.
Cheers.
diff -urP trunk_r1127/server/lib/MogileFS/Config.pm trunk_r1127_patched/server/lib/MogileFS/Config.pm
--- trunk_r1127/server/lib/MogileFS/Config.pm 2007-05-17 10:42:53.000000000 +0100
+++ trunk_r1127_patched/server/lib/MogileFS/Config.pm 2007-10-03 19:04:46.000000000 +0100
@@ -300,12 +300,20 @@
die "Doesn't match acceptable format.";
};
};
-
+ my $valid_netmask = sub {
+ my $n = Net::Netmask->new2($_[0]);
+ die "Doesn't match an acceptable netmask" unless $n;
+ };
+
# let slave settings go through unmodified, for now.
if ($key =~ /^slave_/) { return $del_if_blank };
if ($key eq "enable_rebalance") { return $bool };
if ($key eq "memcache_servers") { return $any };
+ # ReplicationPolicy::MultipleNetworks
+ if ($key eq 'network_zones') { return $any };
+ if ($key =~ /^zone_/) { return $valid_netmask };
+
if ($key eq "rebalance_policy") { return sub {
my $v = shift;
return undef unless $v;
diff -urP trunk_r1127/server/lib/MogileFS/ReplicationPolicy/MultipleNetworks.pm trunk_r1127_patched/server/lib/MogileFS/ReplicationPolicy/MultipleNetworks.pm
--- trunk_r1127/server/lib/MogileFS/ReplicationPolicy/MultipleNetworks.pm 1970-01-01 01:00:00.000000000 +0100
+++ trunk_r1127_patched/server/lib/MogileFS/ReplicationPolicy/MultipleNetworks.pm 2007-10-03 19:22:06.000000000 +0100
@@ -0,0 +1,270 @@
+package MogileFS::ReplicationPolicy::MultipleNetworks;
+
+# Replication policy that places copies of a file on devices whose hosts
+# sit on different networks (by default different /16 blocks; overridable
+# via the "network_zones" / "zone_*" server settings -- see the POD).
+
+use strict;
+use base 'MogileFS::ReplicationPolicy';
+use Net::Netmask;  # network_for_ip() constructs/matches Net::Netmask objects
+use MogileFS::Util qw(weighted_list);
+use MogileFS::ReplicationRequest qw(ALL_GOOD TOO_GOOD TEMP_NO_ANSWER);
+
+# Marker stored in %skip_host by replicate_to() to distinguish "host is on
+# a network that already holds a copy" from "host itself already holds one".
+sub AVOIDNETWORK { return "AVOIDNETWORK"; }
+
+# Construct a policy instance; $mindevcount is the minimum number of
+# devices a file should live on (may be empty, see new_from_policy_args).
+sub new {
+    my ($class, $mindevcount) = @_;
+    return bless {
+        mindevcount => $mindevcount,
+    }, $class;
+}
+
+# Parse the "(N)" argument list from a policy string such as
+# "MultipleNetworks(3)".
+sub new_from_policy_args {
+    my ($class, $argref) = @_;
+    # Note: "MultipleNetworks()" is okay, in which case the 'mindevcount'
+    # on the class is used.  (see below)
+    $$argref =~ s/^\s* \( \s* (\d*) \s* \) \s*//x
+        or die "$class failed to parse args: $$argref";
+    return $class->new($1);
+}
+
+# Accessor for the configured minimum device count.
+sub mindevcount { $_[0]{mindevcount} }
+
+# Decide where (if anywhere) another copy of $fid should go.
+#
+# Named args:
+#   fid      - fid scalar to copy
+#   on_devs  - arrayref of MogileFS::Device objects the file already lives on
+#   all_devs - hashref of { devid => MogileFS::Device }
+#   failed   - hashref of { devid => 1 } of failed attempts this round
+#   min      - (old-style) minimum replica count; the policy's own
+#              mindevcount, when set, takes precedence
+#
+# Returns ALL_GOOD / TOO_GOOD / TEMP_NO_ANSWER, or a
+# MogileFS::ReplicationRequest listing ideal and desperate destinations.
+sub replicate_to {
+    my ($self, %args) = @_;
+
+    my $fid      = delete $args{fid};      # fid scalar to copy
+    my $on_devs  = delete $args{on_devs};  # arrayref of device objects
+    my $all_devs = delete $args{all_devs}; # hashref of { devid => MogileFS::Device }
+    my $failed   = delete $args{failed};   # hashref of { devid => 1 } of failed attempts this round
+
+    # old-style
+    my $min = delete $args{min};
+    $min = $self->{mindevcount} || $min;
+
+    warn "Unknown parameters: " . join(", ", sort keys %args) if %args;
+    die "Missing parameters" unless $on_devs && $all_devs && $failed && $fid;
+
+    # number of devices we currently live on
+    my $already_on = @$on_devs;
+
+    # a silly special case, bail out early.
+    return ALL_GOOD if $min == 1 && $already_on;
+
+    # total disks available which are candidates for having files on them
+    my $total_disks = scalar grep { $_->dstate->should_have_files } values %$all_devs;
+
+    # if we have two copies and that's all the disks there are
+    # anywhere, be happy enough
+    return ALL_GOOD if $already_on >= 2 && $already_on == $total_disks;
+
+    # see which and how many unique hosts/networks we're already on.
+    my %on_dev;
+    my %on_host;
+    my %on_network;
+    foreach my $dev (@$on_devs) {
+        $on_host{$dev->hostid} = 1;
+        $on_dev{$dev->id} = 1;
+
+        my $on_ip = $dev->host->ip;
+        if ($on_ip) {
+            my $network = network_for_ip($on_ip);
+            $on_network{$network->desc} = $network;
+        }
+    }
+
+    my $uniq_hosts_on    = scalar keys %on_host;
+    my $uniq_networks_on = scalar keys %on_network || 1;
+
+    my ($total_uniq_hosts, $total_uniq_networks) = unique_hosts_and_networks($all_devs);
+
+    # target as many networks as we can, but not more than min
+    my $target_networks = ($min < $total_uniq_networks) ? $min : $total_uniq_networks;
+
+    # we're never good if our copies aren't on as many networks as possible
+    if (($target_networks / $uniq_networks_on) <= 1) {
+        return TOO_GOOD if $uniq_hosts_on >  $min;
+        return TOO_GOOD if $uniq_hosts_on == $min && $already_on > $min;
+
+        return ALL_GOOD if $uniq_hosts_on == $min;
+        return ALL_GOOD if $uniq_hosts_on >= $total_uniq_hosts && $already_on >= $min;
+    }
+
+    # if there are more hosts we're not on yet, we want to exclude devices we're already
+    # on from our applicable host search.
+    # also exclude hosts on networks we're already on
+    my @skip_network = values %on_network;
+    my %skip_host; # hostid => 1 (already on host) or AVOIDNETWORK (same network)
+    if ($uniq_hosts_on < $total_uniq_hosts) {
+        %skip_host = %on_host;
+
+        if (@skip_network) {
+            # work out hosts from the devs passed to us
+            my %seen_host;
+            foreach my $device (values %$all_devs) {
+                next if ($seen_host{$device->host->id}++);
+
+                foreach my $disliked_network (@skip_network) {
+                    if (($disliked_network->match($device->host->ip)) and
+                        (not $skip_host{$device->host->id})) {
+                        $skip_host{$device->host->id} = AVOIDNETWORK;
+                    }
+                }
+            }
+        }
+    }
+
+    # candidate destinations, weighted by free space
+    my @all_dests = weighted_list map {
+        [$_, 100 * $_->percent_free]
+    } grep {
+        ! $on_dev{$_->devid} &&
+        ! $failed->{$_->devid} &&
+        $_->should_get_replicated_files
+    } MogileFS::Device->devices;
+
+    return TEMP_NO_ANSWER unless @all_dests;
+
+    my @ideal = grep { ! $skip_host{$_->hostid} } @all_dests;
+    # wrong network is less desperate than wrong host.  The '|| ""' guards
+    # against an undef-warning for hosts absent from %skip_host entirely.
+    my @network_desp = grep { ($skip_host{$_->hostid} || '') eq AVOIDNETWORK } @all_dests;
+    my @host_desp    = grep { $skip_host{$_->hostid} &&
+                              $skip_host{$_->hostid} ne AVOIDNETWORK } @all_dests;
+
+    my @desp = (@network_desp, @host_desp);
+
+    return MogileFS::ReplicationRequest->new(
+        ideal     => \@ideal,
+        desperate => \@desp,
+    );
+}
+
+# can't just scalar keys %cache to count networks
+# might include networks for which we have no hosts yet
+sub unique_hosts_and_networks {
+    my ($devs) = @_;
+
+    my (%host, %netmask);
+    foreach my $dev (values %$devs) {
+        next unless $dev->dstate->should_get_repl_files;
+
+        $host{$dev->hostid}++;
+        $netmask{ network_for_ip($dev->host->ip)->desc }++;
+    }
+
+    # report at least one network even when no device had an ip
+    return (scalar keys %host, scalar keys %netmask || 1);
+}
+
+
+{
+    my %cache;   # '192.168.0.0/24' => Net::Netmask->new2('192.168.0.0/24');
+    my $age = 0; # increments every time we look; 0 forces a (re)build
+
+    # turn a server ip into a network
+    # defaults to /16 ranges
+    # this can be overridden with a "zone_$location" setting per network "zone" and
+    # a lookup field listing all "zones"
+    # e.g.
+    # mogadm settings set network_zones location1,location2
+    # mogadm settings set zone_location1 192.168.0.0/24
+    # mogadm settings set zone_location2 10.0.0.0/24
+    # zone names and netmasks must be unique
+    sub network_for_ip {
+        my ($ip) = @_;
+
+        if (not $ip) { # can happen in testing
+            return Net::Netmask->new('default');
+        }
+
+        # build the cache on first use, and refresh it occasionally so
+        # server-setting changes are eventually picked up
+        if (($age == 0) or ($age++ > 500)) {
+            clear_and_build_cache();
+            $age = 1;
+        }
+
+        my $network;
+        foreach my $zone (keys %cache) {
+            if ($cache{$zone}->match($ip)) {
+                $network = $cache{$zone};
+            }
+        }
+
+        if (not $network) {
+            # no configured zone matched; fall back to the ip's /16.
+            # anchor and escape the dot so we really take the first two
+            # octets (the unanchored, unescaped '.' was fragile)
+            ($network) = ($ip =~ m/^(\d+\.\d+)\./);
+            $network .= '/16'; # default
+            $network = Net::Netmask->new2($network);
+        }
+
+        return $network;
+    }
+
+    # (re)load the zone netmasks from the "network_zones" server settings
+    sub clear_and_build_cache {
+        undef %cache;
+
+        my @zones = split(",", MogileFS::Config->server_setting("network_zones"));
+
+        foreach my $zone (@zones) {
+            my $netmask = MogileFS::Config->server_setting("zone_" . $zone);
+
+            if (not $netmask) {
+                warn "couldn't find network_zone <<zone_" . $zone . ">> check your server settings";
+                next;
+            }
+
+            if ($cache{$netmask}) {
+                warn "duplicate netmask <$netmask> in network zones. check your server settings";
+            }
+
+            my $parsed = Net::Netmask->new2($netmask);
+
+            # new2() returns undef on a bad mask; never store undef in
+            # %cache or a later ->match() call would die.  Checking the
+            # return value also avoids reading a stale errstr().
+            if (not $parsed) {
+                warn "couldn't parse <$zone> as a netmask. error was <" . Net::Netmask::errstr() .
+                    ">. check your server settings";
+                next;
+            }
+
+            $cache{$netmask} = $parsed;
+        }
+    }
+
+    sub stuff_cache { # for testing, or it'll try the db
+        my ($self, $ip, $netmask) = @_;
+
+        $cache{$ip} = $netmask;
+        $age = 1;
+    }
+}
+
+1;
+
+# Local Variables:
+# mode: perl
+# c-basic-indent: 4
+# indent-tabs-mode: nil
+# End:
+
+__END__
+
+=head1 NAME
+
+MogileFS::ReplicationPolicy::MultipleNetworks
+
+=head1 RULES
+
+This policy tries to put files onto devices which are on different networks, if that isn't possible then devices on the same network are returned as "desperate" options.
+
+We aim to have as many copies as we can on unique networks, if there are 2 copies on one network and none on another, with a min of 2, we will still over-replicate to the other network. When called from the rebalancer we will therefore rebalance across networks and reduce the correct copy.
+
+By default we class 2 hosts as being on 2 different networks if they are on different /16 networks (255.255.0.0). This can be controlled using server settings, with a list of network "zones", and then a definition of a netmask for each "zone".
+
+mogadm settings set network_zones location1,location2
+mogadm settings set zone_location1 192.168.0.0/24
+mogadm settings set zone_location2 10.0.0.0/24
+
+Zone names and netmasks must each be unique.
+
+=head1 SEE ALSO
+
+L<MogileFS::Worker::Replicate>
+
+L<MogileFS::ReplicationPolicy>
+
+L<MogileFS::ReplicationRequest>
diff -urP trunk_r1127/server/t/multiple-networks-replpol.t trunk_r1127_patched/server/t/multiple-networks-replpol.t
--- trunk_r1127/server/t/multiple-networks-replpol.t 1970-01-01 01:00:00.000000000 +0100
+++ trunk_r1127_patched/server/t/multiple-networks-replpol.t 2007-10-03 19:14:43.000000000 +0100
@@ -0,0 +1,198 @@
+#!/usr/bin/perl
+
+# Tests for MogileFS::ReplicationPolicy::MultipleNetworks: first re-checks
+# the MultipleHosts-style behaviour (no ips), then the network-aware logic.
+
+use strict;
+use warnings;
+use Test::More;
+use FindBin qw($Bin);
+use Net::Netmask;
+
+use MogileFS::Server;
+use MogileFS::Util qw(error_code);
+use MogileFS::ReplicationPolicy::MultipleNetworks;
+require "$Bin/lib/mogtestlib.pl";
+
+plan tests => 25;
+
+# need just the one, so we only have to stuff the cache once
+# (stuff_cache primes the policy's ip->network cache so tests avoid the db)
+my $polclass = "MogileFS::ReplicationPolicy::MultipleNetworks";
+my $pol = $polclass->new;
+
+# test that the MultipleHosts stuff still works
+# we cope when there are no ips
+
+# already good.
+is(rr("min=2 h1[d1=X d2=_] h2[d3=X d4=_]"),
+ "all_good", "all good");
+
+# need to get it onto host2...
+is(rr("min=2 h1[d1=X d2=_] h2[d3=_ d4=_]"),
+ "ideal(3,4)", "need host2");
+
+# still needs to be on host2, even though 2 copies on host1
+is(rr("min=2 h1[d1=X d2=X] h2[d3=_ d4=_]"),
+ "ideal(3,4)", "need host2, even though 2 on host1");
+
+# anywhere will do. (can happen on, say, rebalance)
+is(rr("min=2 h1[d1=_ d2=_] h2[d3=_ d4=_]"),
+ "ideal(1,2,3,4)", "anywhere");
+
+# should desperately try d2, since host2 is down
+is(rr("min=2 h1[d1=X d2=_] h2=down[d3=_ d4=_]"),
+ "desperate(2)");
+
+# should try host3, since host2 is down
+is(rr("min=2 h1[d1=X d2=_] h2=down[d3=_ d4=_] h3[d5=_ d6=_]"),
+ "ideal(5,6)");
+
+# need a copy on a non-dead disk on host1
+is(rr("min=2 h1[d1=_ d2=X,dead] h2=alive[d3=X d4=_]"),
+ "ideal(1)");
+
+# this is an ideal move, since we only have 2 unique hosts:
+is(rr("min=3 h1[d1=_ d2=X] h2[d3=X d4=_]"),
+ "ideal(1,4)");
+
+# ... but if we have a 3rd host, it's gotta be there
+is(rr("min=3 h1[d1=_ d2=X] h2[d3=X d4=_] h3[d5=_]"),
+ "ideal(5)");
+
+# ... unless that host is down, in which case it's back to 1/4,
+# but desperately
+is(rr("min=3 h1[d1=_ d2=X] h2[d3=X d4=_] h3=down[d5=_]"),
+ "desperate(1,4)");
+
+# too good, uniq hosts > min
+is(rr("min=2 h1[d1=X d2=_] h2[d3=X d4=_] h3[d5=X]"),
+ "too_good");
+
+# too good, but with uniq hosts == min
+is(rr("min=2 h1[d1=X d2=X] h2[d3=X d4=_]"),
+ "too_good");
+
+# be happy with 3 copies, even though two are on same host (that's our max unique hosts)
+is(rr("min=3 h1[d1=_ d2=X] h2[d3=X d4=X]"),
+ "all_good");
+
+##
+##
+# actual network policy tests
+# #a.b.c.d# strings are host-ip annotations consumed by rr() below
+my ($ad1, $ad2) = ("#192.168.0.2#" ,"#192.168.0.3#" );
+my ($ad3, $ad4) = ("#10.0.0.2#" ,"#10.0.0.3#" );
+my ($ad5, $ad6) = ("#146.101.246.2#","#146.101.142.130#");
+
+# stuff the cache with the default, otherwise it'll go to the db
+$pol->stuff_cache('192.168.0.2' , Net::Netmask->new('192.168.0.0/16'));
+$pol->stuff_cache('192.168.0.3' , Net::Netmask->new('192.168.0.0/16'));
+$pol->stuff_cache('10.0.0.2' , Net::Netmask->new('10.0.0.0/16'));
+$pol->stuff_cache('10.0.0.3' , Net::Netmask->new('10.0.0.0/16'));
+$pol->stuff_cache('146.101.246.2' , Net::Netmask->new('146.101.0.0/16'));
+$pol->stuff_cache('146.101.142.130', Net::Netmask->new('146.101.0.0/16'));
+
+# retest some multiple Host logic all on the same network
+# already good. (there's only one network)
+is(rr("min=2 h1[d1=X d2=_]$ad1 h2[d3=X d4=_]$ad2"),
+ "all_good", "all good");
+
+# need to get it onto host2...
+is(rr("min=2 h1[d1=X d2=_]$ad1 h2[d3=_ d4=_]$ad2"),
+ "desperate(2,3,4)", "need host2");
+
+# still needs to be on host2, even though 2 copies on host1
+is(rr("min=2 h1[d1=X d2=X]$ad1 h2[d3=_ d4=_]$ad2"),
+ "desperate(3,4)", "need host2, even though 2 on host1");
+
+# target another network
+is(rr("min=2 h1[d1=_ d2=X]$ad1 h2[d3=_ d4=_]$ad2 h3[d5=_ d6=_]$ad3 h4[d7=_ d8=_]$ad4"),
+ "ideal(5,6,7,8)","target other network"); # no device 3 or 4 (or 1) in the ideal
+
+# other network down
+is(rr("min=2 h1[d1=_ d2=X]$ad1 h2[d3=_ d4=_]$ad2 h3=down[d5=_ d6=_]$ad3 h4=down[d7=_ d8=_]$ad4"),
+ "desperate(1,3,4)", "desperate this network");
+
+is(rr("min=2 h1[d1=_ d2=X]$ad1 h2[d3=_ d4=_]$ad2 h3[d5=_ d6=_]$ad3 h4[d7=_ d8=_]$ad5"),
+ "ideal(5,6,7,8)","include both other networks with three networks");
+
+is(rr("min=2 h1[d1=_ d2=X]$ad1 h2[d3=_ d4=_]$ad2 h3=down[d5=_ d6=_]$ad3 h4[d7=_ d8=_]$ad5"),
+ "ideal(7,8)","one of three networks down");
+
+is(rr("min=2 h1[d1=_ d2=X,dead]$ad1 h2=alive[d3=_ d4=_]$ad2 h3=alive[d5=X d6=_]$ad3"),
+ "ideal(1,3,4)","dead copies don't exclude a network");
+
+is(rr("min=2 h1[d1=_ d2=X]$ad1 h2[d3=_ d4=_]$ad2 h3[d5=X d6=_]$ad3"),
+ "all_good","enough copies on different networks");
+
+is(rr("min=2 h1[d1=_ d2=X]$ad1 h2[d3=X d4=X]$ad2"),
+ "too_good","3 copies on 2 networks with a min of 2 is too good");
+
+# too many copies on one network, not enough on another, want to over-replicate
+is(rr("min=2 h1[d1=X d2=X]$ad1 h2[d3=X d4=X]$ad2 h3[d5=_ d6=_]$ad3 h4[d7=_ d8=_]$ad4"),
+ "ideal(5,6,7,8)", "more than min hosts, but all on one network");
+
+# mess with netmasks
+$pol->stuff_cache('146.101.246.2' , Net::Netmask->new('146.101.246.0/24'));
+$pol->stuff_cache('146.101.142.130', Net::Netmask->new('146.101.142.0/24'));
+
+is(rr("min=2 h1[d1=_ d2=X]$ad6 h2[d3=_ d4=_]$ad5 h3[d5=_ d6=_]$ad4 h4[d7=_ d8=_]$ad3"),
+ "ideal(3,4,5,6,7,8)","target other network"); # ad5 and ad6 are no longer the same network
+
+# rr($state) - build an in-memory host/device fixture from a compact string
+# description, run the policy's replicate_to() against it, and return the
+# ReplicationRequest as a string ("ideal(3,4)", "all_good", etc).
+#
+# State syntax: "min=N h<id>[=<status>][d<id>=X|_[,<status>] ...][#ip#]"
+# where X means the file already lives on that device.
+sub rr {
+ my ($state) = @_;
+ my $ostate = $state; # original
+
+ # reset the in-memory test singletons between calls
+ MogileFS::Host->t_wipe_singletons;
+ MogileFS::Device->t_wipe_singletons;
+ MogileFS::Config->set_config_no_broadcast("min_free_space", 100);
+
+ my $min = 2;
+ if ($state =~ s/^\bmin=(\d+)\b//) {
+ $min = $1;
+ }
+
+ my $hosts = {};
+ my $devs = {};
+ my $on_devs = [];
+
+ my $parse_error = sub {
+ die "Can't parse:\n $ostate\n"
+ };
+ # consume one "hN[=opts][devices][#ip#]" host declaration per iteration
+ while ($state =~ s/\bh(\d+)(?:=(.+?))?\[(.+?)\](#\d+\.\d+\.\d+\.\d+\.?#)?//) {
+ my ($n, $opts, $devstr, $ip) = ($1, $2, $3, $4);
+ $opts ||= "";
+ die "dup host $n" if $hosts->{$n};
+
+# print "1 2 3 4 : <<$1>> <<$2>> <<$3>> <<$4>>\n";
+# print "$state\n";
+
+ my $h = $hosts->{$n} = MogileFS::Host->of_hostid($n);
+ $h->t_init($opts || "alive");
+ if ($ip) {
+ $ip =~ s/#//g;
+ # $h->set_ip($ip); # can't do, is persistent
+ # poke the ip straight into the object instead
+ $h->{hostip} = $ip;
+ }
+
+ foreach my $ddecl (split(/\s+/, $devstr)) {
+ $ddecl =~ /^d(\d+)=([_X])(?:,(\w+))?$/
+ or $parse_error->();
+ my ($dn, $on_not, $status) = ($1, $2, $3);
+ die "dup device $dn" if $devs->{$dn};
+ my $d = $devs->{$dn} = MogileFS::Device->of_devid($dn);
+ $status ||= "alive";
+ $d->t_init($h->id, $status);
+ # "X" marks an existing copy (only counted if the disk can hold files)
+ if ($on_not eq "X" && $d->dstate->should_have_files) {
+ push @$on_devs, $d;
+ }
+ }
+ }
+ # anything left over means the state string didn't parse cleanly
+ $parse_error->() if $state =~ /\S/;
+
+ my $rr = $pol->replicate_to(
+ fid => 1,
+ on_devs => $on_devs,
+ all_devs => $devs,
+ failed => {},
+ min => $min,
+ );
+ return $rr->t_as_string;
+}
+