[ https://issues.apache.org/jira/browse/KUDU-2509?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Alexey Serbin updated KUDU-2509: -------------------------------- Description: As it's seen from the code snippet from {{src/kudu/tablet/tablet_bootstrap.cc}}, the {{TabletBootstrap::HandleCommitMessage()}} can return non-OK status while applying pending commits via {{ApplyCommitMessage()}}, when {{commit_entry}} is already deallocated after prior call to {{ApplyCommitMessage()}}: {code:java} OpId last_applied = commit_entry->commit().commited_op_id(); RETURN_NOT_OK(ApplyCommitMessage(state, commit_entry)); delete commit_entry; auto iter = state->pending_commits.begin(); while (iter != state->pending_commits.end()) { if ((*iter).first == last_applied.index() + 1) { gscoped_ptr<LogEntryPB> buffered_commit_entry((*iter).second); state->pending_commits.erase(iter++); last_applied = buffered_commit_entry->commit().commited_op_id(); RETURN_NOT_OK(ApplyCommitMessage(state, buffered_commit_entry.get())); continue; } break; } return Status::OK(); {code} That violates the contract of the {{TabletBootstrap::HandleCommitMessage()}}, so the following code does use-after-free and can get SIGSEGV while calling {{DebugInfo()}} to get more information on the {{entry}}: {code:java} s = HandleEntry(&state, entry.get()); if (!s.ok()) { DumpReplayStateToLog(state); RETURN_NOT_OK_PREPEND(s, DebugInfo(tablet_->tablet_id(), segment->header().sequence_number(), entry_count, segment->path(), *entry)); } {code} The stack trace in the core file looks like the following: {noformat} #0 0x0000000001e343a0 in GetDescriptor (this=0x7ff9f4a5e8c0, message=..., generator=0x7ff9f4a5e7e0) at thirdparty/src/protobuf-3.4.1/src/google/protobuf/message.h:332 #1 google::protobuf::TextFormat::Printer::Print (this=0x7ff9f4a5e8c0, message=..., generator=0x7ff9f4a5e7e0) at thirdparty/src/protobuf-3.4.1/src/google/protobuf/text_format.cc:1836 #2 0x0000000001e3460c in google::protobuf::TextFormat::Printer::Print (this=Unhandled dwarf expression opcode 0xf3 ) at 
thirdparty/src/protobuf-3.4.1/src/google/protobuf/text_format.cc:1759 #3 0x0000000001e346ad in google::protobuf::TextFormat::Printer::PrintToString (this=0x7ff9f4a5e8c0, message=..., output=0x7ff9f4a5eaf0) at thirdparty/src/protobuf-3.4.1/src/google/protobuf/text_format.cc:1742 #4 0x0000000001c83a95 in kudu::pb_util::SecureShortDebugString (msg=...) at src/kudu/util/pb_util.cc:603 #5 0x00000000009fe50b in DebugInfo (this=Unhandled dwarf expression opcode 0xf3) at src/kudu/tablet/tablet_bootstrap.cc:468 #6 kudu::tablet::TabletBootstrap::PlaySegments (this=Unhandled dwarf expression opcode 0xf3) at src/kudu/tablet/tablet_bootstrap.cc:1177 #7 0x00000000009ffc4b in kudu::tablet::TabletBootstrap::RunBootstrap (this=0x7ff9f4a5f510, rebuilt_tablet=0x7ff9f4a5f8a0, rebuilt_log=0x7ff9f4a5f870, consensus_info=0x7ff9f4a5f9d0) at src/kudu/tablet/tablet_bootstrap.cc:586 {noformat} The affected 'after-free' message looks like the following in the captured core: {noformat} {<google::protobuf::MessageLite> = {_vptr.MessageLite = 0xc}, <No data fields>} {noformat} The {{0xc}} address for vptr is invalid and the process got SIGSEGV while calling {{Message::GetDescriptor()}}, a virtual method. 
was: As it's seen from the code snippet from {{src/kudu/tablet/tablet_bootstrap.cc}}, the {{TabletBootstrap::HandleCommitMessage()}} can return non-OK status while applying pending commits via {{ApplyCommitMessage()}}, when {{commit_entry}} is already deallocated after prior call to {{ApplyCommitMessage()}}: {code:java} OpId last_applied = commit_entry->commit().commited_op_id(); RETURN_NOT_OK(ApplyCommitMessage(state, commit_entry)); delete commit_entry; auto iter = state->pending_commits.begin(); while (iter != state->pending_commits.end()) { if ((*iter).first == last_applied.index() + 1) { gscoped_ptr<LogEntryPB> buffered_commit_entry((*iter).second); state->pending_commits.erase(iter++); last_applied = buffered_commit_entry->commit().commited_op_id(); RETURN_NOT_OK(ApplyCommitMessage(state, buffered_commit_entry.get())); continue; } break; } return Status::OK(); {code} That violates the contract of the {{TabletBootstrap::HandleCommitMessage()}}, so the following code does use-after-free and can get SIGSEGV while calling {{DebugInfo()}} to get more information on the {{entry}}: {code:java} s = HandleEntry(&state, entry.get()); if (!s.ok()) { DumpReplayStateToLog(state); RETURN_NOT_OK_PREPEND(s, DebugInfo(tablet_->tablet_id(), segment->header().sequence_number(), entry_count, segment->path(), *entry)); } {code} The stack trace in the core file looks like the following: {noformat} #0 0x0000000001e343a0 in GetDescriptor (this=0x7ff9f4a5e8c0, message=..., generator=0x7ff9f4a5e7e0) at /usr/src/debug/kudu-1.6.0-cdh5.14.0/thirdparty/src/protobuf-3.4.1/src/google/protobuf/message.h:332 #1 google::protobuf::TextFormat::Printer::Print (this=0x7ff9f4a5e8c0, message=..., generator=0x7ff9f4a5e7e0) at /usr/src/debug/kudu-1.6.0-cdh5.14.0/thirdparty/src/protobuf-3.4.1/src/google/protobuf/text_format.cc:1836 #2 0x0000000001e3460c in google::protobuf::TextFormat::Printer::Print (this=Unhandled dwarf expression opcode 0xf3 ) at 
/usr/src/debug/kudu-1.6.0-cdh5.14.0/thirdparty/src/protobuf-3.4.1/src/google/protobuf/text_format.cc:1759 #3 0x0000000001e346ad in google::protobuf::TextFormat::Printer::PrintToString (this=0x7ff9f4a5e8c0, message=..., output=0x7ff9f4a5eaf0) at /usr/src/debug/kudu-1.6.0-cdh5.14.0/thirdparty/src/protobuf-3.4.1/src/google/protobuf/text_format.cc:1742 #4 0x0000000001c83a95 in kudu::pb_util::SecureShortDebugString (msg=...) at /usr/src/debug/kudu-1.6.0-cdh5.14.0/src/kudu/util/pb_util.cc:603 #5 0x00000000009fe50b in DebugInfo (this=Unhandled dwarf expression opcode 0xf3) at /usr/src/debug/kudu-1.6.0-cdh5.14.0/src/kudu/tablet/tablet_bootstrap.cc:468 #6 kudu::tablet::TabletBootstrap::PlaySegments (this=Unhandled dwarf expression opcode 0xf3) at /usr/src/debug/kudu-1.6.0-cdh5.14.0/src/kudu/tablet/tablet_bootstrap.cc:1177 #7 0x00000000009ffc4b in kudu::tablet::TabletBootstrap::RunBootstrap (this=0x7ff9f4a5f510, rebuilt_tablet=0x7ff9f4a5f8a0, rebuilt_log=0x7ff9f4a5f870, consensus_info=0x7ff9f4a5f9d0) at /usr/src/debug/kudu-1.6.0-cdh5.14.0/src/kudu/tablet/tablet_bootstrap.cc:586 {noformat} > In some rare scenarios, tserver may crash with SIGSEGV while bootstrapping > tablets > -------------------------------------------------------------------------------- > > Key: KUDU-2509 > URL: https://issues.apache.org/jira/browse/KUDU-2509 > Project: Kudu > Issue Type: Bug > Components: tserver > Affects Versions: 0.7.0, 0.7.1, 0.8.0, 0.9.0, 0.9.1, 1.0.0, 1.0.1, 1.1.0, > 1.2.0, 1.3.0, 1.3.1, 1.4.0, 1.5.0, 1.6.0, 1.7.0, 1.7.1 > Reporter: Alexey Serbin > Assignee: Alexey Serbin > Priority: Major > > As it's seen from the code snippet from > {{src/kudu/tablet/tablet_bootstrap.cc}}, the > {{TabletBootstrap::HandleCommitMessage()}} can return non-OK status while > applying pending commits via {{ApplyCommitMessage()}}, when {{commit_entry}} > is already deallocated after prior call to {{ApplyCommitMessage()}}: > {code:java} > OpId last_applied = commit_entry->commit().commited_op_id(); > 
RETURN_NOT_OK(ApplyCommitMessage(state, commit_entry)); > delete commit_entry; > auto iter = state->pending_commits.begin(); > while (iter != state->pending_commits.end()) { > if ((*iter).first == last_applied.index() + 1) { > gscoped_ptr<LogEntryPB> buffered_commit_entry((*iter).second); > state->pending_commits.erase(iter++); > last_applied = buffered_commit_entry->commit().commited_op_id(); > RETURN_NOT_OK(ApplyCommitMessage(state, buffered_commit_entry.get())); > continue; > } > break; > } > return Status::OK(); > {code} > That violates the contract of the {{TabletBootstrap::HandleCommitMessage()}}, > so the following code does use-after-free and can get SIGSEGV while calling > {{DebugInfo()}} to get more information on the {{entry}}: > {code:java} > s = HandleEntry(&state, entry.get()); > if (!s.ok()) { > DumpReplayStateToLog(state); > RETURN_NOT_OK_PREPEND(s, DebugInfo(tablet_->tablet_id(), > > segment->header().sequence_number(), > entry_count, segment->path(), > *entry)); > } > {code} > The stack trace in the core file looks like the following: > {noformat} > #0 0x0000000001e343a0 in GetDescriptor (this=0x7ff9f4a5e8c0, message=..., > generator=0x7ff9f4a5e7e0) > at thirdparty/src/protobuf-3.4.1/src/google/protobuf/message.h:332 > #1 google::protobuf::TextFormat::Printer::Print (this=0x7ff9f4a5e8c0, > message=..., generator=0x7ff9f4a5e7e0) > at thirdparty/src/protobuf-3.4.1/src/google/protobuf/text_format.cc:1836 > #2 0x0000000001e3460c in google::protobuf::TextFormat::Printer::Print > (this=Unhandled dwarf expression opcode 0xf3 > ) > > at thirdparty/src/protobuf-3.4.1/src/google/protobuf/text_format.cc:1759 > #3 0x0000000001e346ad in > google::protobuf::TextFormat::Printer::PrintToString (this=0x7ff9f4a5e8c0, > message=..., output=0x7ff9f4a5eaf0) > at thirdparty/src/protobuf-3.4.1/src/google/protobuf/text_format.cc:1742 > #4 0x0000000001c83a95 in kudu::pb_util::SecureShortDebugString (msg=...) 
> > at src/kudu/util/pb_util.cc:603 > #5 0x00000000009fe50b in DebugInfo (this=Unhandled dwarf expression opcode > 0xf3) > at src/kudu/tablet/tablet_bootstrap.cc:468 > #6 kudu::tablet::TabletBootstrap::PlaySegments (this=Unhandled dwarf > expression opcode 0xf3) > at src/kudu/tablet/tablet_bootstrap.cc:1177 > #7 0x00000000009ffc4b in kudu::tablet::TabletBootstrap::RunBootstrap > (this=0x7ff9f4a5f510, rebuilt_tablet=0x7ff9f4a5f8a0, > rebuilt_log=0x7ff9f4a5f870, consensus_info=0x7ff9f4a5f9d0) > at src/kudu/tablet/tablet_bootstrap.cc:586 > {noformat} > The affected 'after-free' message looks like the following in the captured > core: > {noformat} > {<google::protobuf::MessageLite> = {_vptr.MessageLite = 0xc}, <No data > fields>} > {noformat} > The {{0xc}} address for vptr is invalid and the process got SIGSEGV while > calling {{Message::GetDescriptor()}}, a virtual method. -- This message was sent by Atlassian JIRA (v7.6.3#76005)