On 3/15/07, Mustafa A. Hashmi <[EMAIL PROTECTED]> wrote:
David:
On 3/14/07, David Vasil <[EMAIL PROTECTED]> wrote:
> What are people doing for failover (at the lustre layer) under 1.4.X
> series lustre? Specifically the failing of OSTs between a failed host
> and its failover pair.
The failover bit is easily controlled via lconf and grouping of nodes. The
issue, as you list it further on:
> Under 1.4.9 I have found that the --group feature to lconf does not
> appear to work. Likewise I have had issues with "lconf --cleanup
> --force --service <ost> <config file>" trying to unload all of lustre
> modules on a running OSS (which leaves the OSS in somewhat of a bad
> state).
is exactly what I am facing as well. Recently on a 1.4.9 cluster while
trying to 'fail-back' an ost to the primary oss, the secondary oss which had
taken over services refused to give them up. Unfortunately time on that
cluster was limited for me and I am relegated to setting 1.4.9 up on a few
new systems to carry on testing.
I will update you (and all) within 2 days, hopefully. If anyone else can chime
in on what David and I may be doing wrong given the lconf commands listed
above, it would greatly help.
Regards,
--
Okay, so from my understanding we are trying the same things on a
production cluster with 1.4.6 (ish), and we had to rework lactive to
use config files since we no longer needed ldap. From my
understanding, failover is supposed to happen using lactive and is
kind of a manual process. (1.6 takes care of all this with magic
and the mgs.)
So I had to patch lactive and lustredb.py to use config files as well as ldap.
======= lactive patch =========
--- b/lactive 2007-02-22 13:59:49.000000000 -0800
+++ a/lactive 2007-03-15 08:47:05.987903000 -0700
@@ -31,6 +31,7 @@
import sys, getopt, types
import string, os
import ldap
+import xml.dom.minidom
from stat import S_IROTH, S_IRGRP
PYMOD_DIR = [ "/usr/lib64/lustre/python", "/usr/lib/lustre/python" ]
@@ -52,6 +53,7 @@
('group', "The group of devices to update", PARAM),
('active', "The active node name", PARAM),
('pwfile', "File containing password", PARAM),
+ ('config_file', "Config file to use instead of ldap", PARAM)
]
def fatal(*args):
@@ -59,44 +61,57 @@
print "! " + msg
sys.exit(1)
+class stdoutWriter:
+ def stdoutWrite(self, text):
+ print text
+
cl = Lustre.Options("lactive","", lactive_options)
config, args = cl.parse(sys.argv[1:])
if not (config.group or config.active):
- fatal("Must specify both group and active node.")
+ fatal("Must specify both group and active node.")
+
+if not config.config_file:
+ if not config.config:
+ fatal("Missing config")
+
+ if config.pwfile:
+ try:
+ pwperm = os.stat(config.pwfile)[0]
+ pwreadable = pwperm & (S_IRGRP | S_IROTH)
+ if pwreadable:
+ if pwreadable == (S_IRGRP | S_IROTH):
+ readable_by = "group and others"
+ elif pwreadable == S_IRGRP:
+ readable_by = "group"
+ else:
+ readable_by = "others"
+ print "WARNING: Password file %s is readable by %s" % (
+ config.pwfile, readable_by)
+
+ pwfile = open(config.pwfile, "r")
+ pw = string.strip(pwfile.readline())
+ pwfile.close()
+ except Exception, e:
+ fatal("Can't read secret from pwfile %s: %s" % (config.pwfile, e))
+ else:
+ print "no pwfile specified, binding anonymously"
+ pw = ""
-if not config.config:
- fatal("Missing config")
+ base = "config=%s,fs=lustre" % (config.config,)
-if config.pwfile:
try:
- pwperm = os.stat(config.pwfile)[0]
- pwreadable = pwperm & (S_IRGRP | S_IROTH)
- if pwreadable:
- if pwreadable == (S_IRGRP | S_IROTH):
- readable_by = "group and others"
- elif pwreadable == S_IRGRP:
- readable_by = "group"
- else:
- readable_by = "others"
- print "WARNING: Password file %s is readable by %s" % (
- config.pwfile, readable_by)
-
- pwfile = open(config.pwfile, "r")
- pw = string.strip(pwfile.readline())
- pwfile.close()
- except Exception, e:
- fatal("Can't read secret from pwfile %s: %s" % (config.pwfile, e))
+ db = Lustre.LustreDB_LDAP('', {}, base=base, pw = pw, url =
config.ldapurl, update = 1)
+ except Lustre.error.LconfError, e:
+ print e
+ sys.exit(1)
else:
- print "no pwfile specified, binding anonymously"
- pw = ""
-
-base = "config=%s,fs=lustre" % (config.config,)
-try:
- db = Lustre.LustreDB_LDAP('', {}, base=base, pw = pw, url =
config.ldapurl, update = 1)
-except Lustre.error.LconfError, e:
- print e
- sys.exit(1)
+ try:
+ dom = xml.dom.minidom.parse(config.config_file)
+ db = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
+ except Lustre.error.LconfError, e:
+ print e
+ sys.exit(1)
active_node = db.lookup_name(config.active)
if not active_node:
@@ -118,7 +133,13 @@
config.active, new_active_uuid))
db.update_active(tgtuuid, new_active_uuid)
-
-
-
-
+if config.config_file:
+ fp = open(config.config_file+".new", "w")
+ try:
+ db.dom_node.writexml(fp)
+ except ImportError, e:
+ print e
+ print "Bah!!! there was an import error!!!"
+ sys.exit(1)
+ os.rename(config.config_file, config.config_file+"~")
+ os.rename(config.config_file+".new", config.config_file)
============ end lactive patch =========================
============ lustredb.py patch =========================
--- a/lustredb.py 2007-03-14 09:57:20.000000000 -0700
+++ b/lustredb.py 2007-03-15 08:51:52.028975000 -0700
@@ -384,7 +384,10 @@
return ret
def _update_active(self, tgt, new):
- raise Lustre.LconfError("updates not implemented for XML")
+ node = self.xmllookup_by_uuid(self.dom_node, tgt)
+ children = node.getElementsByTagName("active_ref")
+ active_ref = children[0]
+ active_ref.setAttribute("uuidref", new)
# ================================================================
# LDAP Support
=============== end lustredb.py patch =========================
_______________________________________________
Lustre-discuss mailing list
[email protected]
https://mail.clusterfs.com/mailman/listinfo/lustre-discuss