Here's a followup:

Something is wrong and I don't know what. confignetwork exited with status 
code = 1, so it failed. But xcat.log does say what has failed:

[root@ceph01-ib0 ~]# cat /var/log/xcat/xcat.log
Running Kickstart Post-installation script...
Mon Jun 14 14:39:02 -03 2021 [info]: xcat.deployment: Executing post.xcat to 
prepare for firstbooting ...
Mon Jun 14 14:39:41 -03 2021 [info]: xcat.deployment: trying to download 
postscripts from 172.26.255.254...
Mon Jun 14 14:39:42 -03 2021 [info]: xcat.deployment: postscripts downloaded 
successfully
Mon Jun 14 14:39:42 -03 2021 [info]: xcat.deployment: trying to get 
mypostscript from 172.26.255.254...
Mon Jun 14 14:39:42 -03 2021 [info]: xcat.deployment.postscript: Running 
postscript: syslog
grep: /etc/rsyslog.d/remote.conf: No such file or directory
grep: /etc/rsyslog.d/remote.conf: No such file or directory
Mon Jun 14 14:39:42 -03 2021 [info]: xcat.deployment.postscript: postscript 
syslog return with 0
Mon Jun 14 14:39:42 -03 2021 [info]: xcat.deployment.postscript: Running 
postscript: remoteshell

Unable to load host key: /etc/ssh/ssh_host_ed25519_key
Mon Jun 14 14:39:44 -03 2021 [info]: xcat.deployment.postscript: postscript 
remoteshell return with 0
Mon Jun 14 14:39:44 -03 2021 [info]: xcat.deployment.postscript: Running 
postscript: syncfiles
Mon Jun 14 14:39:45 -03 2021 [info]: xcat.deployment.postscript: postscript 
syncfiles return with 0
Mon Jun 14 14:39:45 -03 2021 [info]: xcat.deployment.postscript: Running 
postscript: confignetwork
[I]: back up /etc/sysconfig/network-scripts to 
/etc/sysconfig/network-scripts.xcatbak
[I]: All valid nics and device list:
[I]: ib0
[I]: bond0 ens1f0np0@ens1f1np1
[I]: bond0.1010 bond0
[I]: bond0.123 bond0
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
configure nic and its device : ib0
[I]: Call configib for IB nics: ib0, ports:
[I]: NMCLI_USED=2 NIC_IBNICS=ib0 NIC_IBAPORTS= configib
[E]:Error: configib failed.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
configure nic and its device : bond0 ens1f0np0@ens1f1np1
[I]: create_bond_interface ifname=bond0 slave_ports=ens1f0np0,ens1f1np1 
slave_type=ethernet
[I]: Pickup xcatnet, "management", from NICNETWORKS for interface "bond0".
[I]: ip link set bond0 down
[I]: [bond.down] >> 7: bond0: <BROADCAST,MULTICAST,MASTER> mtu 1500 qdisc noop 
state DOWN mode DEFAULT group default qlen 1000
[I]: [bond.down] >>     link/ether 96:c4:61:22:a2:d3 brd ff:ff:ff:ff:ff:ff
[I]: [bond.slavesAft] >>
[I]: ip link set ens1f0np0 down
[I]: [slave]: >> 3: ens1f0np0: <BROADCAST,MULTICAST> mtu 1500 qdisc mq state 
DOWN mode DEFAULT group default qlen 1000
[I]: [slave]: >>     link/ether bc:97:e1:ea:08:b0 brd ff:ff:ff:ff:ff:ff
[I]: create_persistent_ifcfg ifname=ens1f0np0 
inattrs=ONBOOT=yes,USERCTL=no,TYPE=Ethernet,SLAVE=yes,MASTER=bond0,BOOTPROTO=none,MTU=1500
['ifcfg-ens1f0np0']
[I]: >> ONBOOT="yes"
[I]: >> USERCTL="no"
[I]: >> TYPE="Ethernet"
[I]: >> SLAVE="yes"
[I]: >> MASTER="bond0"
[I]: >> BOOTPROTO="static"
[I]: >> MTU="1500"
[I]: >> DEVICE="ens1f0np0"
[I]: >> NAME="ens1f0np0"
[I]: ip link set ens1f1np1 down
[I]: [slave]: >> 5: ens1f1np1: <BROADCAST,MULTICAST> mtu 1500 qdisc mq state 
DOWN mode DEFAULT group default qlen 1000
[I]: [slave]: >>     link/ether bc:97:e1:ea:08:b1 brd ff:ff:ff:ff:ff:ff
[I]: create_persistent_ifcfg ifname=ens1f1np1 
inattrs=ONBOOT=yes,USERCTL=no,TYPE=Ethernet,SLAVE=yes,MASTER=bond0,BOOTPROTO=none,MTU=1500
['ifcfg-ens1f1np1']
[I]: >> ONBOOT="yes"
[I]: >> USERCTL="no"
[I]: >> TYPE="Ethernet"
[I]: >> SLAVE="yes"
[I]: >> MASTER="bond0"
[I]: >> BOOTPROTO="static"
[I]: >> MTU="1500"
[I]: >> DEVICE="ens1f1np1"
[I]: >> NAME="ens1f1np1"
[I]: [bond.slavesNew] >> ens1f0np0 ens1f1np1
[I]: ip link set bond0 up
[I]: [ip.link] >> 7: bond0: 
<BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP mode 
DEFAULT group default qlen 1000
[I]: [ip.link] >>     link/ether bc:97:e1:ea:08:b0 brd 
ff:ff:ff:ff:ff:ff
[I]: create_persistent_ifcfg ifname=bond0 xcatnet=management 
inattrs=ONBOOT=yes,USERCTL=no,TYPE=Bond,BONDING_MASTER=yes,BONDING_OPTS='mode=802.3ad
 miimon=100',BOOTPROTO=none,DHCLIENTARGS='-timeout 200',MTU=1500
['ifcfg-bond0']
[I]: >> ONBOOT="yes"
[I]: >> USERCTL="no"
[I]: >> TYPE="Bond"
[I]: >> BONDING_MASTER="yes"
[I]: >> BONDING_OPTS="mode=802.3ad miimon=100"
[I]: >> BOOTPROTO="static"
[I]: >> DHCLIENTARGS="-timeout 200"
[I]: >> MTU="1500"
[I]: >> DEVICE="bond0"
[I]: >> IPADDR="172.26.254.1"
[I]: >> NETMASK="255.255.0.0"
[I]: >> NAME="bond0"
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
configure nic and its device : bond0.1010 bond0
[I]: create_vlan_interface ifname=bond0 vlanid=1010
[I]: Pickup xcatnet, "ceph", from NICNETWORKS for interface "bond0".
[I]: ip link add link bond0 name bond0.1010 type vlan id 1010
RTNETLINK answers: Numerical result out of range
[I]: ip link set bond0.1010 up
[I]: create_persistent_ifcfg ifname=bond0.1010 xcatnet=ceph 
inattrs=ONBOOT=yes,USERCTL=no,VLAN=yes,MTU=9216
['ifcfg-bond0.1010']
[I]: >> ONBOOT="yes"
[I]: >> USERCTL="no"
[I]: >> VLAN="yes"
[I]: >> MTU="9216"
[I]: >> DEVICE="bond0.1010"
[I]: >> BOOTPROTO="static"
[I]: >> IPADDR="10.0.254.1"
[I]: >> NETMASK="255.255.255.0"
[I]: >> NAME="bond0.1010"
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
configure nic and its device : bond0.123 bond0
[I]: create_vlan_interface ifname=bond0 vlanid=123
[I]: Pickup xcatnet, "ceph-sync", from NICNETWORKS for interface "bond0".
[I]: ip link add link bond0 name bond0.123 type vlan id 123
RTNETLINK answers: Numerical result out of range
[I]: ip link set bond0.123 up
[I]: State of "bond0.123" was "UNKNOWN" instead of expected "UP". Wait 0 of 200 
with interval 1.
[I]: create_persistent_ifcfg ifname=bond0.123 xcatnet=ceph-sync 
inattrs=ONBOOT=yes,USERCTL=no,VLAN=yes,MTU=9216
['ifcfg-bond0.123']
[I]: >> ONBOOT="yes"
[I]: >> USERCTL="no"
[I]: >> VLAN="yes"
[I]: >> MTU="9216"
[I]: >> DEVICE="bond0.123"
[I]: >> BOOTPROTO="static"
[I]: >> IPADDR="192.168.168.21"
[I]: >> NETMASK="255.255.255.0"
[I]: >> NAME="bond0.123"
Mon Jun 14 14:39:54 -03 2021 [info]: xcat.deployment.postscript: postscript 
confignetwork return with 1

The mkresolvconf trick worked only on the first install. After a reboot, 
resolv.conf was overwritten, probably by NetworkManager?

Thanks,
Vinícius.

On 14 Jun 2021, at 14:10, Vinícius Ferrão via xCAT-user 
<xcat-user@lists.sourceforge.net<mailto:xcat-user@lists.sourceforge.net>> wrote:

Hi Georgios,

Thanks for the tip. I've never heard of mkresolvconf in more than 5 years using 
xCAT. But yes, it does generate the /etc/resolv.conf correctly when issuing the 
updatenode command, but not while the machine is booting.

Something may be overwriting the /etc/resolv.conf file.

But there's still a tricky behaviour. The compute node is selecting its name 
from the InfiniBand interface, which is incorrect. It should use its name from 
the management interface. That was happening before the mkresolvconf trick 
anyway, and again, I don't have any idea why this is happening.

Thanks,
Vinícius.


On 14 Jun 2021, at 07:10, Nikolis, Georgios 
<georgios.niko...@charite.de<mailto:georgios.niko...@charite.de>> wrote:

I think you need to add the mkresolvconf postscript into the node's 
postbootscripts.
Does "updatenode <node> -P mkresolvconf" repair nameserver configuration on an 
affected node?
Cheers,
Georgios


--

Georgios Nikolis
Charité – Universitätsmedizin Berlin
Geschäftsbereich IT | Scientific Computing

Campus Charité Mitte
Rahel-Hirsch-Weg 5 | Ebene 02 | Raum 073
Charitéplatz 1 | 10117 Berlin

georgios.niko...@charite.de<mailto:georgios.niko...@charite.de>
https://www.charite.de<https://www.charite.de/>


On Mon, 2021-06-14 at 05:41 +0000, Vinícius Ferrão via xCAT-user wrote:
Hello,

For unknown reasons nodes that I've installed with rinstall (using stateful 
method) didn't get the nameserver section in resolv.conf, basically leaving the 
node without any name resolution.

As specified in the documentation 
https://xcat-docs.readthedocs.io/en/stable/advanced/domain_name_resolution/domain_name_resolution.html,
it should be generated if nameservers and domain are provided in the site 
table: "The resolv.conf files for the compute nodes will be created 
automatically using the domain and nameservers values set in the xCAT network 
or site definition."

Both are defined but it still didn't generate it correctly.

[root@headnode ~]# lsdef -t site clustersite | egrep "nameserver|forward|domain"
    domain=cluster.domain.tld
    forwarders=1.1.1.1
    nameservers=172.26.255.254

I even tried adding the nameservers to the network definition, but it was a no 
go:

[root@headnode ~]# lsdef -t network management
Object name: management
    gateway=<xcatmaster>
    mask=255.255.0.0
    mgtifname=bond0
    mtu=1500
    nameservers=172.26.255.254
    net=172.26.0.0
    tftpserver=<xcatmaster>

Is there anything that I can do to debug this?

Thanks,
Vinícius.

PS: Here's full data from a given node and the networks.

[root@headnode ~]# lsdef ceph01
Object name: ceph01
    arch=x86_64
    bmc=172.25.254.1
    bmcpassword=calvin
    bmcusername=root
    cons=ipmi
    consoleenabled=1
    currchain=boot
    currstate=install ol8.4.0-x86_64-compute
    groups=ceph,all
    ip=172.26.254.1
    mac=bc:97:e1:ea:08:b0
    mgt=ipmi
    netboot=xnba
    nicdevices.bond0.123=bond0
    nicdevices.bond0.1010=bond0
    nicdevices.bond0=ens1f0np0|ens1f1np1
    nichostnamesuffixes.bond0.1010=-ceph
    nichostnamesuffixes.bond0.123=-cephsync
    nicips.ib0=172.27.254.1
    nicips.bond0=172.26.254.1
    nicips.bond0.1010=10.0.10.21
    nicips.bond0.123=192.168.168.21
    nicnetworks.bond0.123=ceph-sync
    nicnetworks.ib0=application
    nicnetworks.bond0.1010=ceph
    nicnetworks.bond0=management
    nictypes.ib0=Infiniband
    nictypes.ens1f0np0=ethernet
    nictypes.bond0.1010=vlan
    nictypes.bond0=bond
    nictypes.ens1f1np1=ethernet
    nictypes.bond0.123=vlan
    os=ol8.4.0
    postbootscripts=otherpkgs,confignics
    
    postscripts=syslog,remoteshell,syncfiles,confignetwork,versatushpc/postinstall-ceph
    profile=compute
    provmethod=ol8.4.0-x86_64-install-ceph
    serialport=0
    serialspeed=115200
    status=booted
    statustime=06-14-2021 02:37:04
    updatestatus=synced
    updatestatustime=06-14-2021 02:01:55

[root@headnode ~]# lsdef -t network
application  (network)
ceph  (network)
ceph-sync  (network)
libvirt  (network)
management  (network)
service  (network)
site  (network)



_______________________________________________

xCAT-user mailing list

<mailto:xCAT-user@lists.sourceforge.net>

xCAT-user@lists.sourceforge.net


<https://lists.sourceforge.net/lists/listinfo/xcat-user>

https://lists.sourceforge.net/lists/listinfo/xcat-user


_______________________________________________
xCAT-user mailing list
xCAT-user@lists.sourceforge.net<mailto:xCAT-user@lists.sourceforge.net>
https://lists.sourceforge.net/lists/listinfo/xcat-user

_______________________________________________
xCAT-user mailing list
xCAT-user@lists.sourceforge.net<mailto:xCAT-user@lists.sourceforge.net>
https://lists.sourceforge.net/lists/listinfo/xcat-user

_______________________________________________
xCAT-user mailing list
xCAT-user@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/xcat-user

Reply via email to