The attached files can be used to test the torus-2QoS routing engine using ibsim.
fabric-torus-5x5x5 contains a fabric description that ibsim can read.
Once ibsim is running, run opensm like this:
opensm --config opensm.conf --torus_config torus-2QoS-5x5x5.conf
or
opensm --config opensm.conf --torus_config torus-2QoS-5x5x5.conf \
-Q --qos_policy_file qos-policy-torus-5x5x5.conf
-- Jim
fabric-torus-5x5x5.bz2
Description: application/bzip
# Limit the maximal operational VLs
max_op_vls 8

# The number of seconds between subnet sweeps (0 disables it)
sweep_interval 10

# Routing engine
# Multiple routing engines can be specified separated by
# commas so that specific ordering of routing algorithms will
# be tried if earlier routing engines fail.
# Supported engines: minhop, updn, file, ftree, lash, dor
routing_engine torus-2QoS,no_fallback

# Use unicast routing cache (use FALSE if unsure)
use_ucast_cache TRUE

# Force flush of the log file after each log message
force_log_flush TRUE

# Log file to be used
log_file /dev/tty

# console [off|local|loopback|socket]
console loopback

# Telnet port for console (default 10000)
console_port 10000

# QoS default options
# Note that for OFED > 1.3, this information can also be in qos-policy.conf.
# However, it may be good to have it here also for torus-2QoS, as this will
# change the defaults even if not using QoS.
qos_max_vls 8
qos_high_limit 0
qos_vlarb_high 0:0,1:0,2:0,3:0,4:0,5:0,6:0,7:0,8:0
qos_vlarb_low 0:64,1:64,2:64,3:64,4:64,5:64,6:64,7:64,8:64
qos_sl2vl (null)
# This is a QoS configuration for the torus-2QoS routing engine.
# As it supports only 2 levels of QoS, via SL bit 3, we should configure
# only SLs 0 and 8. Based on that torus-2QoS will pick the appropriate
# SL value to provide deadlock-free routing for both QoS levels.
port-groups
port-group
name: Service_nodes
port-name: "H_0_0_0_0/P1" # E.g. admin
port-name: "H_0_0_1_0/P1" # E.g. NFS server
port-name: "H_0_0_2_0/P1" # E.g. boot server
port-name: "H_0_0_3_0/P1" # E.g. login node
end-port-group
port-group
name: Lustre_nodes
port-name: "H_0_0_4_0/P1" # E.g. MDS
port-name: "H_0_1_0_0/P1" # E.g. OSS
port-name: "H_0_1_1_0/P1" # E.g. OSS
port-name: "H_0_1_2_0/P1" # E.g. OSS
port-name: "H_0_1_3_0/P1" # E.g. OSS
port-name: "H_0_1_4_0/P1" # E.g. OSS
end-port-group
port-group
name: Compute_nodes
port-name: "H_0_2_0_0/P1"
port-name: "H_0_2_1_0/P1"
port-name: "H_0_2_2_0/P1"
port-name: "H_0_2_3_0/P1"
port-name: "H_0_2_4_0/P1"
port-name: "H_0_3_0_0/P1"
port-name: "H_0_3_1_0/P1"
port-name: "H_0_3_2_0/P1"
port-name: "H_0_3_3_0/P1"
port-name: "H_0_3_4_0/P1"
port-name: "H_0_4_0_0/P1"
port-name: "H_0_4_1_0/P1"
port-name: "H_0_4_2_0/P1"
port-name: "H_0_4_3_0/P1"
port-name: "H_0_4_4_0/P1"
port-name: "H_1_0_0_0/P1"
port-name: "H_1_0_1_0/P1"
port-name: "H_1_0_2_0/P1"
port-name: "H_1_0_3_0/P1"
port-name: "H_1_0_4_0/P1"
port-name: "H_1_1_0_0/P1"
port-name: "H_1_1_1_0/P1"
port-name: "H_1_1_2_0/P1"
port-name: "H_1_1_3_0/P1"
port-name: "H_1_1_4_0/P1"
port-name: "H_1_2_0_0/P1"
port-name: "H_1_2_1_0/P1"
port-name: "H_1_2_2_0/P1"
port-name: "H_1_2_3_0/P1"
port-name: "H_1_2_4_0/P1"
port-name: "H_1_3_0_0/P1"
port-name: "H_1_3_1_0/P1"
port-name: "H_1_3_2_0/P1"
port-name: "H_1_3_3_0/P1"
port-name: "H_1_3_4_0/P1"
port-name: "H_1_4_0_0/P1"
port-name: "H_1_4_1_0/P1"
port-name: "H_1_4_2_0/P1"
port-name: "H_1_4_3_0/P1"
port-name: "H_1_4_4_0/P1"
port-name: "H_2_0_0_0/P1"
port-name: "H_2_0_1_0/P1"
port-name: "H_2_0_2_0/P1"
port-name: "H_2_0_3_0/P1"
port-name: "H_2_0_4_0/P1"
port-name: "H_2_1_0_0/P1"
port-name: "H_2_1_1_0/P1"
port-name: "H_2_1_2_0/P1"
port-name: "H_2_1_3_0/P1"
port-name: "H_2_1_4_0/P1"
port-name: "H_2_2_0_0/P1"
port-name: "H_2_2_1_0/P1"
port-name: "H_2_2_2_0/P1"
port-name: "H_2_2_3_0/P1"
port-name: "H_2_2_4_0/P1"
port-name: "H_2_3_0_0/P1"
port-name: "H_2_3_1_0/P1"
port-name: "H_2_3_2_0/P1"
port-name: "H_2_3_3_0/P1"
port-name: "H_2_3_4_0/P1"
port-name: "H_2_4_0_0/P1"
port-name: "H_2_4_1_0/P1"
port-name: "H_2_4_2_0/P1"
port-name: "H_2_4_3_0/P1"
port-name: "H_2_4_4_0/P1"
port-name: "H_3_0_0_0/P1"
port-name: "H_3_0_1_0/P1"
port-name: "H_3_0_2_0/P1"
port-name: "H_3_0_3_0/P1"
port-name: "H_3_0_4_0/P1"
port-name: "H_3_1_0_0/P1"
port-name: "H_3_1_1_0/P1"
port-name: "H_3_1_2_0/P1"
port-name: "H_3_1_3_0/P1"
port-name: "H_3_1_4_0/P1"
port-name: "H_3_2_0_0/P1"
port-name: "H_3_2_1_0/P1"
port-name: "H_3_2_2_0/P1"
port-name: "H_3_2_3_0/P1"
port-name: "H_3_2_4_0/P1"
port-name: "H_3_3_0_0/P1"
port-name: "H_3_3_1_0/P1"
port-name: "H_3_3_2_0/P1"
port-name: "H_3_3_3_0/P1"
port-name: "H_3_3_4_0/P1"
port-name: "H_3_4_0_0/P1"
port-name: "H_3_4_1_0/P1"
port-name: "H_3_4_2_0/P1"
port-name: "H_3_4_3_0/P1"
port-name: "H_3_4_4_0/P1"
port-name: "H_4_0_0_0/P1"
port-name: "H_4_0_1_0/P1"
port-name: "H_4_0_2_0/P1"
port-name: "H_4_0_3_0/P1"
port-name: "H_4_0_4_0/P1"
port-name: "H_4_1_0_0/P1"
port-name: "H_4_1_1_0/P1"
port-name: "H_4_1_2_0/P1"
port-name: "H_4_1_3_0/P1"
port-name: "H_4_1_4_0/P1"
port-name: "H_4_2_0_0/P1"
port-name: "H_4_2_1_0/P1"
port-name: "H_4_2_2_0/P1"
port-name: "H_4_2_3_0/P1"
port-name: "H_4_2_4_0/P1"
port-name: "H_4_3_0_0/P1"
port-name: "H_4_3_1_0/P1"
port-name: "H_4_3_2_0/P1"
port-name: "H_4_3_3_0/P1"
port-name: "H_4_3_4_0/P1"
port-name: "H_4_4_0_0/P1"
port-name: "H_4_4_1_0/P1"
port-name: "H_4_4_2_0/P1"
port-name: "H_4_4_3_0/P1"
port-name: "H_4_4_4_0/P1"
end-port-group
port-group
name: All_ports
node-type: ALL
end-port-group
end-port-groups
#
# The default VL arbitration setup will not be quite right for
# torus-2QoS, so set up something more appropriate.
#
# All the SLs for a given QoS level need to have equal traffic priority.
# Since SLs 0-7 map to VLs 0-3, and SLs 8-15 map to VLs 4-7, we need
# equal VL arbitration weightings in each of those VL ranges.
#
# OFED 1.3 doesn't use this information, just parses and drops it on the floor,
# so it needs to be repeated in opensm.conf. Putting it in opensm.conf has
# the added benefit that the defaults can be set and used even if QoS isn't
# configured.
#
qos-setup
vlarb-tables
vlarb-scope
group: All_ports
across: All_ports
vl-high-limit: 0
vlarb-high: 0:0
vlarb-high: 1:0
vlarb-high: 2:0
vlarb-high: 3:0
vlarb-high: 4:0
vlarb-high: 5:0
vlarb-high: 6:0
vlarb-high: 7:0
vlarb-high: 8:0
vlarb-high: 9:0
vlarb-high: 10:0
vlarb-high: 11:0
vlarb-high: 12:0
vlarb-high: 13:0
vlarb-high: 14:0
vlarb-low: 0:64
vlarb-low: 1:64
vlarb-low: 2:64
vlarb-low: 3:64
vlarb-low: 4:64
vlarb-low: 5:64
vlarb-low: 6:64
vlarb-low: 7:64
vlarb-low: 8:64
vlarb-low: 9:64
vlarb-low: 10:64
vlarb-low: 11:64
vlarb-low: 12:64
vlarb-low: 13:64
vlarb-low: 14:64
end-vlarb-scope
end-vlarb-tables
end-qos-setup
#
# We don't explicitly use the qos-class keyword in qos-match-rule, because
# we don't have any control over how apps will specify qos-class in path
# queries, and we don't want rule matching failures due to wrong qos-class
# values in queries.
#
qos-levels
qos-level
name: DEFAULT
sl: 0
end-qos-level
# By assigning Lustre and MPI traffic to different SLs (and thus
# different VLs) we keep MPI and Lustre from starving each other.
qos-level
name: Lustre
sl: 0
end-qos-level
qos-level
name: MPI
sl: 8
end-qos-level
end-qos-levels
#
# For the purposes of QoS configuration, MPI is not a supported ULP.
# Need to use port group match rules to get MPI to request SL 8.
#
qos-ulps
ipoib : 0
default : 0
end-qos-ulps
#
# Note that the first matching rule is used to assign the qos-level-name
# used to chose the SL to send on, and that anything that doesn't match
# one of the above rules will be assigned to the DEFAULT qos-level.
#
qos-match-rules
qos-match-rule
source: Compute_nodes
destination: Compute_nodes
qos-level-name: MPI
end-qos-match-rule
qos-match-rule
source: Lustre_nodes
qos-level-name: Lustre
end-qos-match-rule
qos-match-rule
destination: Lustre_nodes
qos-level-name: Lustre
end-qos-match-rule
# Note that anything that doesn't match one of the above rules
# will be assigned to the DEFAULT qos-level.
end-qos-match-rules
# We want the torus routing engine to attempt to find a
# 5x5x5 torus in the fabric:

torus 5 5 5

# We need to tell the routing engine what directions we
# want the torus coordinate directions to be, by specifying
# the endpoints (switch GUID + port) of a link in each
# direction.  These links need to share a common switch,
# which we call the torus seed.
#
# Here we specify positive coordinate directions:

xp_link 0x200000 0x200019	# S_0_0_0 -> S_1_0_0
yp_link 0x200000 0x200005	# S_0_0_0 -> S_0_1_0
zp_link 0x200000 0x200001	# S_0_0_0 -> S_0_0_1

# If one of the above switches were to fail, the routing
# engine would not have sufficient information to locate the
# torus in the fabric.  Specify a backup seed here:

next_seed

xp_link 0x20001f 0x200038	# S_1_1_1 -> S_2_1_1
yp_link 0x20001f 0x200024	# S_1_1_1 -> S_1_2_1
zp_link 0x20001f 0x200020	# S_1_1_1 -> S_1_1_2

# The torus routing engine uses the concept of a dateline,
# where a coordinate wraps from its maximum back to zero,
# in order to compute path SL values that provide routing
# that is free from credit loops.
#
# If it is forced by a failed switch to use the backup
# seed specification, that would cause the datelines
# to move, which would change many path SL values, and
# defeats one of the main benefits of this routing engine.
# So, describe the position of the original datelines
# relative to the backup seed as follows:

x_dateline -1
y_dateline -1
z_dateline -1

# You can specify as many backup seeds as you like, but
# in practice, the torus routing engine is only guaranteed
# to be able to route around a single failed switch without
# introducing credit loops, so one backup seed is enough.
