Hi,
We added 2 new JBODs (each with 60 disks — 1 PB) and wanted to extend our 3 PB
cluster to 5 PB. We tried several approaches:
Adding the JBODs with weight 0 and reweighting them slowly in 0.01 steps — this
turned out to be very slow...
We then changed their weight to the maximum, and the cluster started to
recover/rebalance things around (with ~20% of objects misplaced). All
went smooth for a few hours, but now some random OSDs crash with similar
error messages below [1]. Is this known? Or I am pushing it too much? What
might be the reason for this crash?
Let me know if you need more details.
Thanks
[1]
{
"assert_condition": "abort",
"assert_file":
"/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/15.2.13/rpm/el8/BUILD/ceph-15.2.13/src/osd/PeeringState.cc",
"assert_func":
"PeeringState::Crashed::Crashed(boost::statechart::state<PeeringState::Crashed,
PeeringState::PeeringMachine>::my_context)",
"assert_line": 4243,
"assert_msg":
"/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/15.2.13/rpm/el8/BUILD/ceph-15.2.13/src/osd/PeeringState.cc:
In function
'PeeringState::Crashed::Crashed(boost::statechart::state<PeeringState::Crashed,
PeeringState::PeeringMachine>::my_context)' thread 7f7cd0dec700 time
2021-07-13T15:50:44.086598-0700\n/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/15.2.13/rpm/el8/BUILD/ceph-15.2.13/src/osd/PeeringState.cc:
4243: ceph_abort_msg(\"we got a bad state machine event\")\n",
"assert_thread_name": "tp_osd_tp",
"backtrace": [
"(()+0x12b20) [0x7f7cf1474b20]",
"(gsignal()+0x10f) [0x7f7cf00db7ff]",
"(abort()+0x127) [0x7f7cf00c5c35]",
"(ceph::__ceph_abort(char const*, int, char const*,
std::__cxx11::basic_string<char, std::char_traits<char>,
std::allocator<char> > const&)+0x1b6) [0x55981fd083e9]",
"(PeeringState::Crashed::Crashed(boost::statechart::state<PeeringState::Crashed,
PeeringState::PeeringMachine, boost::mpl::list<mpl_::na, mpl_::na,
mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na,
mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na,
mpl_::na, mpl_::na, mpl_::na, mpl_::na>,
(boost::statechart::history_mode)0>::my_context)+0xc4) [0x559820082714]",
"(boost::statechart::state<PeeringState::Crashed,
PeeringState::PeeringMachine, boost::mpl::list<mpl_::na, mpl_::na,
mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na,
mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na,
mpl_::na, mpl_::na, mpl_::na, mpl_::na>,
(boost::statechart::history_mode)0>::deep_construct(boost::statechart::state_machine<PeeringState::PeeringMachine,
PeeringState::Initial, std::allocator<boost::statechart::none>,
boost::statechart::null_exception_translator>* const&,
boost::statechart::state_machine<PeeringState::PeeringMachine,
PeeringState::Initial, std::allocator<boost::statechart::none>,
boost::statechart::null_exception_translator>&)+0x3a) [0x5598200b812a]",
"(boost::statechart::simple_state<PeeringState::Primary,
PeeringState::Started, PeeringState::Peering,
(boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base
const&, void const*)+0x21a) [0x5598200b8eda]",
"(boost::statechart::simple_state<PeeringState::Backfilling,
PeeringState::Active, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na,
mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na,
mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na,
mpl_::na, mpl_::na, mpl_::na>,
(boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base
const&, void const*)+0xc0) [0x5598200b2ac0]",
"(boost::statechart::state_machine<PeeringState::PeeringMachine,
PeeringState::Initial, std::allocator<boost::statechart::none>,
boost::statechart::null_exception_translator>::process_event(boost::statechart::event_base
const&)+0x5b) [0x55981feac2ab]",
"(PG::do_peering_event(std::shared_ptr<PGPeeringEvent>,
PeeringCtx&)+0x2d1) [0x55981fe9e8a1]",
"(OSD::dequeue_peering_evt(OSDShard*, PG*,
std::shared_ptr<PGPeeringEvent>, ThreadPool::TPHandle&)+0x29c)
[0x55981fe15c7c]",
"(ceph::osd::scheduler::PGPeeringItem::run(OSD*, OSDShard*,
boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x56) [0x559820047906]",
"(OSD::ShardedOpWQ::_process(unsigned int,
ceph::heartbeat_handle_d*)+0x12ef) [0x55981fe0892f]",
"(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5c4)
[0x559820448f84]",
"(ShardedThreadPool::WorkThreadSharded::entry()+0x14)
[0x55982044bbe4]",
"(()+0x814a) [0x7f7cf146a14a]",
"(clone()+0x43) [0x7f7cf01a0f23]"
],
"ceph_version": "15.2.13",
"crash_id":
"2021-07-13T22:50:44.104753Z_40759660-ecd8-420d-8c00-bac2c1c46760",
"entity_name": "osd.288",
"os_id": "centos",
"os_name": "CentOS Linux",
"os_version": "8",
"os_version_id": "8",
"process_name": "ceph-osd",
"stack_sig":
"c31b62beceef1c1fcd1b8ea9db822deef8e9680a5c65bbb76a4c8ef8b2a0f3a1",
"timestamp": "2021-07-13T22:50:44.104753Z",
"utsname_hostname": "data-16-1.tier2",
"utsname_machine": "x86_64",
"utsname_release": "5.11.16-1.el8.elrepo.x86_64",
"utsname_sysname": "Linux",
"utsname_version": "#1 SMP Mon Apr 19 19:16:48 EDT 2021"
}
_______________________________________________
ceph-users mailing list -- [email protected]
To unsubscribe send an email to [email protected]