[ 
https://issues.apache.org/jira/browse/KUDU-2509?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Alexey Serbin updated KUDU-2509:
--------------------------------
    Description: 
As seen in the following code snippet from 
{{src/kudu/tablet/tablet_bootstrap.cc}}, 
{{TabletBootstrap::HandleCommitMessage()}} can return a non-OK status while 
applying pending commits via {{ApplyCommitMessage()}}, at a point when 
{{commit_entry}} has already been deallocated following the prior call to 
{{ApplyCommitMessage()}}:

{code:java}
  OpId last_applied = commit_entry->commit().commited_op_id();
  RETURN_NOT_OK(ApplyCommitMessage(state, commit_entry));
  delete commit_entry;

  auto iter = state->pending_commits.begin();
  while (iter != state->pending_commits.end()) {
    if ((*iter).first == last_applied.index() + 1) {
      gscoped_ptr<LogEntryPB> buffered_commit_entry((*iter).second);
      state->pending_commits.erase(iter++);
      last_applied = buffered_commit_entry->commit().commited_op_id();
      RETURN_NOT_OK(ApplyCommitMessage(state, buffered_commit_entry.get()));
      continue;
    }
    break;
  }

  return Status::OK();
{code}

That violates the contract of {{TabletBootstrap::HandleCommitMessage()}} 
(the caller still owns and dereferences the entry when a non-OK status is 
returned), so the following code performs a use-after-free and can crash with 
SIGSEGV when it calls {{DebugInfo()}} to get more information on the {{entry}}:

{code:java}
      s = HandleEntry(&state, entry.get());
      if (!s.ok()) {
        DumpReplayStateToLog(state);
        RETURN_NOT_OK_PREPEND(s, DebugInfo(tablet_->tablet_id(),
                                           segment->header().sequence_number(),
                                           entry_count, segment->path(),
                                           *entry));
      }
{code}

The stack trace in the core file looks like the following:
{noformat}
#0  0x0000000001e343a0 in GetDescriptor (this=0x7ff9f4a5e8c0, message=..., 
generator=0x7ff9f4a5e7e0)  
    at thirdparty/src/protobuf-3.4.1/src/google/protobuf/message.h:332
#1  google::protobuf::TextFormat::Printer::Print (this=0x7ff9f4a5e8c0, 
message=..., generator=0x7ff9f4a5e7e0)
    at thirdparty/src/protobuf-3.4.1/src/google/protobuf/text_format.cc:1836
#2  0x0000000001e3460c in google::protobuf::TextFormat::Printer::Print 
(this=Unhandled dwarf expression opcode 0xf3
)                                                                               
    at thirdparty/src/protobuf-3.4.1/src/google/protobuf/text_format.cc:1759
#3  0x0000000001e346ad in google::protobuf::TextFormat::Printer::PrintToString 
(this=0x7ff9f4a5e8c0, message=..., output=0x7ff9f4a5eaf0)
    at thirdparty/src/protobuf-3.4.1/src/google/protobuf/text_format.cc:1742
#4  0x0000000001c83a95 in kudu::pb_util::SecureShortDebugString (msg=...)       
    at src/kudu/util/pb_util.cc:603         
#5  0x00000000009fe50b in DebugInfo (this=Unhandled dwarf expression opcode 
0xf3)
    at src/kudu/tablet/tablet_bootstrap.cc:468
#6  kudu::tablet::TabletBootstrap::PlaySegments (this=Unhandled dwarf 
expression opcode 0xf3)
    at src/kudu/tablet/tablet_bootstrap.cc:1177
#7  0x00000000009ffc4b in kudu::tablet::TabletBootstrap::RunBootstrap 
(this=0x7ff9f4a5f510, rebuilt_tablet=0x7ff9f4a5f8a0, 
rebuilt_log=0x7ff9f4a5f870, consensus_info=0x7ff9f4a5f9d0)
    at src/kudu/tablet/tablet_bootstrap.cc:586
{noformat}

The affected 'after-free' message looks like the following in the captured core:
{noformat}
{<google::protobuf::MessageLite> = {_vptr.MessageLite = 0xc}, <No data fields>}
{noformat}

The {{0xc}} address for vptr is invalid and the process got SIGSEGV while 
calling {{Message::GetDescriptor()}}, a virtual method. 

  was:
As it's seen from the code snippet from 
{{src/kudu/tablet/tablet_bootstrap.cc}}, the 
{{TabletBootstrap::HandleCommitMessage()}} can return non-OK status while 
applying pending commits via {{ApplyCommitMessage()}}, when {{commit_entry}} is 
already deallocated after prior call to {{ApplyCommitMessage()}}:

{code:java}
  OpId last_applied = commit_entry->commit().commited_op_id();
  RETURN_NOT_OK(ApplyCommitMessage(state, commit_entry));
  delete commit_entry;

  auto iter = state->pending_commits.begin();
  while (iter != state->pending_commits.end()) {
    if ((*iter).first == last_applied.index() + 1) {
      gscoped_ptr<LogEntryPB> buffered_commit_entry((*iter).second);
      state->pending_commits.erase(iter++);
      last_applied = buffered_commit_entry->commit().commited_op_id();
      RETURN_NOT_OK(ApplyCommitMessage(state, buffered_commit_entry.get()));
      continue;
    }
    break;
  }

  return Status::OK();
{code}

That violates the contract of the {{TabletBootstrap::HandleCommitMessage()}}, 
so the following code does use-after-free and can get SIGSEGV while calling 
{{DebugInfo()}} to get more information on the {{entry}}:

{code:java}
      s = HandleEntry(&state, entry.get());
      if (!s.ok()) {
        DumpReplayStateToLog(state);
        RETURN_NOT_OK_PREPEND(s, DebugInfo(tablet_->tablet_id(),
                                           segment->header().sequence_number(),
                                           entry_count, segment->path(),
                                           *entry));
      }
{code}

The stack trace in the core file looks like the following:
{noformat}
#0  0x0000000001e343a0 in GetDescriptor (this=0x7ff9f4a5e8c0, message=..., 
generator=0x7ff9f4a5e7e0)  
    at 
/usr/src/debug/kudu-1.6.0-cdh5.14.0/thirdparty/src/protobuf-3.4.1/src/google/protobuf/message.h:332
#1  google::protobuf::TextFormat::Printer::Print (this=0x7ff9f4a5e8c0, 
message=..., generator=0x7ff9f4a5e7e0)
    at 
/usr/src/debug/kudu-1.6.0-cdh5.14.0/thirdparty/src/protobuf-3.4.1/src/google/protobuf/text_format.cc:1836
#2  0x0000000001e3460c in google::protobuf::TextFormat::Printer::Print 
(this=Unhandled dwarf expression opcode 0xf3
)                                                                               
    at 
/usr/src/debug/kudu-1.6.0-cdh5.14.0/thirdparty/src/protobuf-3.4.1/src/google/protobuf/text_format.cc:1759
#3  0x0000000001e346ad in google::protobuf::TextFormat::Printer::PrintToString 
(this=0x7ff9f4a5e8c0, message=..., output=0x7ff9f4a5eaf0)
    at 
/usr/src/debug/kudu-1.6.0-cdh5.14.0/thirdparty/src/protobuf-3.4.1/src/google/protobuf/text_format.cc:1742
#4  0x0000000001c83a95 in kudu::pb_util::SecureShortDebugString (msg=...)       
    at /usr/src/debug/kudu-1.6.0-cdh5.14.0/src/kudu/util/pb_util.cc:603         
#5  0x00000000009fe50b in DebugInfo (this=Unhandled dwarf expression opcode 
0xf3)
    at 
/usr/src/debug/kudu-1.6.0-cdh5.14.0/src/kudu/tablet/tablet_bootstrap.cc:468
#6  kudu::tablet::TabletBootstrap::PlaySegments (this=Unhandled dwarf 
expression opcode 0xf3)
    at 
/usr/src/debug/kudu-1.6.0-cdh5.14.0/src/kudu/tablet/tablet_bootstrap.cc:1177
#7  0x00000000009ffc4b in kudu::tablet::TabletBootstrap::RunBootstrap 
(this=0x7ff9f4a5f510, rebuilt_tablet=0x7ff9f4a5f8a0, 
rebuilt_log=0x7ff9f4a5f870, consensus_info=0x7ff9f4a5f9d0)
    at 
/usr/src/debug/kudu-1.6.0-cdh5.14.0/src/kudu/tablet/tablet_bootstrap.cc:586
{noformat}


> In some rare scenarios, tserver may crash with SIGSEGV while bootstrapping 
> tablets
> --------------------------------------------------------------------------------
>
>                 Key: KUDU-2509
>                 URL: https://issues.apache.org/jira/browse/KUDU-2509
>             Project: Kudu
>          Issue Type: Bug
>          Components: tserver
>    Affects Versions: 0.7.0, 0.7.1, 0.8.0, 0.9.0, 0.9.1, 1.0.0, 1.0.1, 1.1.0, 
> 1.2.0, 1.3.0, 1.3.1, 1.4.0, 1.5.0, 1.6.0, 1.7.0, 1.7.1
>            Reporter: Alexey Serbin
>            Assignee: Alexey Serbin
>            Priority: Major
>
> As seen in the following code snippet from 
> {{src/kudu/tablet/tablet_bootstrap.cc}}, 
> {{TabletBootstrap::HandleCommitMessage()}} can return a non-OK status while 
> applying pending commits via {{ApplyCommitMessage()}}, at a point when 
> {{commit_entry}} has already been deallocated following the prior call to 
> {{ApplyCommitMessage()}}:
> {code:java}
>   OpId last_applied = commit_entry->commit().commited_op_id();
>   RETURN_NOT_OK(ApplyCommitMessage(state, commit_entry));
>   delete commit_entry;
>   auto iter = state->pending_commits.begin();
>   while (iter != state->pending_commits.end()) {
>     if ((*iter).first == last_applied.index() + 1) {
>       gscoped_ptr<LogEntryPB> buffered_commit_entry((*iter).second);
>       state->pending_commits.erase(iter++);
>       last_applied = buffered_commit_entry->commit().commited_op_id();
>       RETURN_NOT_OK(ApplyCommitMessage(state, buffered_commit_entry.get()));
>       continue;
>     }
>     break;
>   }
>   return Status::OK();
> {code}
> That violates the contract of {{TabletBootstrap::HandleCommitMessage()}} 
> (the caller still owns and dereferences the entry when a non-OK status is 
> returned), so the following code performs a use-after-free and can crash with 
> SIGSEGV when it calls {{DebugInfo()}} to get more information on the 
> {{entry}}:
> {code:java}
>       s = HandleEntry(&state, entry.get());
>       if (!s.ok()) {
>         DumpReplayStateToLog(state);
>         RETURN_NOT_OK_PREPEND(s, DebugInfo(tablet_->tablet_id(),
>                                            segment->header().sequence_number(),
>                                            entry_count, segment->path(),
>                                            *entry));
>       }
> {code}
> The stack trace in the core file looks like the following:
> {noformat}
> #0  0x0000000001e343a0 in GetDescriptor (this=0x7ff9f4a5e8c0, message=..., 
> generator=0x7ff9f4a5e7e0)  
>     at thirdparty/src/protobuf-3.4.1/src/google/protobuf/message.h:332
> #1  google::protobuf::TextFormat::Printer::Print (this=0x7ff9f4a5e8c0, 
> message=..., generator=0x7ff9f4a5e7e0)
>     at thirdparty/src/protobuf-3.4.1/src/google/protobuf/text_format.cc:1836
> #2  0x0000000001e3460c in google::protobuf::TextFormat::Printer::Print 
> (this=Unhandled dwarf expression opcode 0xf3
> )
>     at thirdparty/src/protobuf-3.4.1/src/google/protobuf/text_format.cc:1759
> #3  0x0000000001e346ad in 
> google::protobuf::TextFormat::Printer::PrintToString (this=0x7ff9f4a5e8c0, 
> message=..., output=0x7ff9f4a5eaf0)
>     at thirdparty/src/protobuf-3.4.1/src/google/protobuf/text_format.cc:1742
> #4  0x0000000001c83a95 in kudu::pb_util::SecureShortDebugString (msg=...)
>     at src/kudu/util/pb_util.cc:603         
> #5  0x00000000009fe50b in DebugInfo (this=Unhandled dwarf expression opcode 
> 0xf3)
>     at src/kudu/tablet/tablet_bootstrap.cc:468
> #6  kudu::tablet::TabletBootstrap::PlaySegments (this=Unhandled dwarf 
> expression opcode 0xf3)
>     at src/kudu/tablet/tablet_bootstrap.cc:1177
> #7  0x00000000009ffc4b in kudu::tablet::TabletBootstrap::RunBootstrap 
> (this=0x7ff9f4a5f510, rebuilt_tablet=0x7ff9f4a5f8a0, 
> rebuilt_log=0x7ff9f4a5f870, consensus_info=0x7ff9f4a5f9d0)
>     at src/kudu/tablet/tablet_bootstrap.cc:586
> {noformat}
> The affected 'after-free' message looks like the following in the captured 
> core:
> {noformat}
> {<google::protobuf::MessageLite> = {_vptr.MessageLite = 0xc}, <No data 
> fields>}
> {noformat}
> The {{0xc}} address for vptr is invalid and the process got SIGSEGV while 
> calling {{Message::GetDescriptor()}}, a virtual method. 



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to