[ 
https://issues.apache.org/jira/browse/KUDU-2194?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Adar Dembo updated KUDU-2194:
-----------------------------
    Code Review: https://gerrit.cloudera.org/c/8340

> Kudu crashes when started with more than 34 data directories
> ------------------------------------------------------------
>
>                 Key: KUDU-2194
>                 URL: https://issues.apache.org/jira/browse/KUDU-2194
>             Project: Kudu
>          Issue Type: Bug
>          Components: fs
>    Affects Versions: 1.6.0
>            Reporter: Adar Dembo
>            Assignee: Adar Dembo
>             Fix For: 1.6.0
>
>
> Each data directory creates its own uniquely named thread pool. Because the 
> thread pool constructor interns its trace metric names using the pool name as 
> a prefix, each pool adds a unique set of three names to the global intern map. 
> That map is limited to 100 entries, so 34 data directories (34 x 3 = 102 
> names) are enough to exceed the limit and trigger the failed check below.
> Here's the relevant code from the thread pool constructor:
> {code}
>   std::string prefix = !builder.trace_metric_prefix_.empty() ?
>       builder.trace_metric_prefix_ : builder.name_;
>   queue_time_trace_metric_name_ = TraceMetrics::InternName(
>       prefix + ".queue_time_us");
>   run_wall_time_trace_metric_name_ = TraceMetrics::InternName(
>       prefix + ".run_wall_time_us");
>   run_cpu_time_trace_metric_name_ = TraceMetrics::InternName(
>       prefix + ".run_cpu_time_us");
> {code}
> And here's what the crash looks like reproduced in a unit test:
> {noformat}
> F1019 14:34:53.684109  5117 trace_metrics.cc:68] Check failed: 
> g_intern_map->size() < 100 (100 vs. 100) Too many interned strings: (data dir 
> 0.queue_time_us, data dir 0.queue_time_us) (data dir 0.run_cpu_time_us, data 
> dir 0.run_cpu_time_us) (data dir 0.run_wall_time_us, data dir 
> 0.run_wall_time_us) (data dir 1.queue_time_us, data dir 1.queue_time_us) 
> (data dir 1.run_cpu_time_us, data dir 1.run_cpu_time_us) (data dir 
> 1.run_wall_time_us, data dir 1.run_wall_time_us) (data dir 10.queue_time_us, 
> data dir 10.queue_time_us) (data dir 10.run_cpu_time_us, data dir 
> 10.run_cpu_time_us) (data dir 10.run_wall_time_us, data dir 
> 10.run_wall_time_us) (data dir 11.queue_time_us, data dir 11.queue_time_us) 
> (data dir 11.run_cpu_time_us, data dir 11.run_cpu_time_us) (data dir 
> 11.run_wall_time_us, data dir 11.run_wall_time_us) (data dir 
> 12.queue_time_us, data dir 12.queue_time_us) (data dir 12.run_cpu_time_us, 
> data dir 12.run_cpu_time_us) (data dir 12.run_wall_time_us, data dir 
> 12.run_wall_time_us) (data dir 13.queue_time_us, data dir 13.queue_time_us) 
> (data dir 13.run_cpu_time_us, data dir 13.run_cpu_time_us) (data dir 
> 13.run_wall_time_us, data dir 13.run_wall_time_us) (data dir 
> 14.queue_time_us, data dir 14.queue_time_us) (data dir 14.run_cpu_time_us, 
> data dir 14.run_cpu_time_us) (data dir 14.run_wall_time_us, data dir 
> 14.run_wall_time_us) (data dir 15.queue_time_us, data dir 15.queue_time_us) 
> (data dir 15.run_cpu_time_us, data dir 15.run_cpu_time_us) (data dir 
> 15.run_wall_time_us, data dir 15.run_wall_time_us) (data dir 
> 16.queue_time_us, data dir 16.queue_time_us) (data dir 16.run_cpu_time_us, 
> data dir 16.run_cpu_time_us) (data dir 16.run_wall_time_us, data dir 
> 16.run_wall_time_us) (data dir 17.queue_time_us, data dir 17.queue_time_us) 
> (data dir 17.run_cpu_time_us, data dir 17.run_cpu_time_us) (data dir 
> 17.run_wall_time_us, data dir 17.run_wall_time_us) (data dir 
> 18.queue_time_us, data dir 18.queue_time_us) (data dir 18.run_cpu_time_us, 
> data dir 18.run_cpu_time_us) (data dir 18.run_wall_time_us, data dir 
> 18.run_wall_time_us) (data dir 19.queue_time_us, data dir 19.queue_time_us) 
> (data dir 19.run_cpu_time_us, data dir 19.run_cpu_time_us) (data dir 
> 19.run_wall_time_us, data dir 19.run_wall_time_us) (data dir 2.queue_time_us, 
> data dir 2.queue_time_us) (data dir 2.run_cpu_time_us, data dir 
> 2.run_cpu_time_us) (data dir 2.run_wall_time_us, data dir 2.run_wall_time_us) 
> (data dir 20.queue_time_us, data dir 20.queue_time_us) (data dir 
> 20.run_cpu_time_us, data dir 20.run_cpu_time_us) (data dir 
> 20.run_wall_time_us, data dir 20.run_wall_time_us) (data dir 
> 21.queue_time_us, data dir 21.queue_time_us) (data dir 21.run_cpu_time_us, 
> data dir 21.run_cpu_time_us) (data dir 21.run_wall_time_us, data dir 
> 21.run_wall_time_us) (data dir 22.queue_time_us, data dir 22.queue_time_us) 
> (data dir 22.run_cpu_time_us, data dir 22.run_cpu_time_us) (data dir 
> 22.run_wall_time_us, data dir 22.run_wall_time_us) (data dir 
> 23.queue_time_us, data dir 23.queue_time_us) (data dir 23.run_cpu_time_us, 
> data dir 23.run_cpu_time_us) (data dir 23.run_wall_time_us, data dir 
> 23.run_wall_time_us) (data dir 24.queue_time_us, data dir 24.queue_time_us) 
> (data dir 24.run_cpu_time_us, data dir 24.run_cpu_time_us) (data dir 
> 24.run_wall_time_us, data dir 24.run_wall_time_us) (data dir 
> 25.queue_time_us, data dir 25.queue_time_us) (data dir 25.run_cpu_time_us, 
> data dir 25.run_cpu_time_us) (data dir 25.run_wall_time_us, data dir 
> 25.run_wall_time_us) (data dir 26.queue_time_us, data dir 26.queue_time_us) 
> (data dir 26.run_cpu_time_us, data dir 26.run_cpu_time_us) (data dir 
> 26.run_wall_time_us, data dir 26.run_wall_time_us) (data dir 
> 27.queue_time_us, data dir 27.queue_time_us) (data dir 27.run_cpu_time_us, 
> data dir 27.run_cpu_time_us) (data dir 27.run_wall_time_us, data dir 
> 27.run_wall_time_us) (data dir 28.queue_time_us, data dir 28.queue_time_us) 
> (data dir 28.run_cpu_time_us, data dir 28.run_cpu_time_us) (data dir 
> 28.run_wall_time_us, data dir 28.run_wall_time_us) (data dir 
> 29.queue_time_us, data dir 29.queue_time_us) (data dir 29.run_cpu_time_us, 
> data dir 29.run_cpu_time_us) (data dir 29.run_wall_time_us, data dir 
> 29.run_wall_time_us) (data dir 3.queue_time_us, data dir 3.queue_time_us) 
> (data dir 3.run_cpu_time_us, data dir 3.run_cpu_time_us) (data dir 
> 3.run_wall_time_us, data dir 3.run_wall_time_us) (data dir 30.queue_time_us, 
> data dir 30.queue_time_us) (data dir 30.run_cpu_time_us, data dir 
> 30.run_cpu_time_us) (data dir 30.run_wall_time_us, data dir 
> 30.run_wall_time_us) (data dir 31.queue_time_us, data dir 31.queue_time_us) 
> (data dir 31.run_cpu_time_us, data dir 31.run_cpu_time_us) (data dir 
> 31.run_wall_time_us, data dir 31.run_wall_time_us) (data dir 
> 32.queue_time_us, data dir 32.queue_time_us) (data dir 32.run_cpu_time_us, 
> data dir 32.run_cpu_time_us) (data dir 32.run_wall_time_us, data dir 
> 32.run_wall_time_us) (data dir 33.queue_time_us, data dir 33.queue_time_us) 
> (data dir 4.queue_time_us, data dir 4.queue_time_us) (data dir 
> 4.run_cpu_time_us, data dir 4.run_cpu_time_us) (data dir 4.run_wall_time_us, 
> data dir 4.run_wall_time_us) (data dir 5.queue_time_us, data dir 
> 5.queue_time_us) (data dir 5.run_cpu_time_us, data dir 5.run_cpu_time_us) 
> (data dir 5.run_wall_time_us, data dir 5.run_wall_time_us) (data dir 
> 6.queue_time_us, data dir 6.queue_time_us) (data dir 6.run_cpu_time_us, data 
> dir 6.run_cpu_time_us) (data dir 6.run_wall_time_us, data dir 
> 6.run_wall_time_us) (data dir 7.queue_time_us, data dir 7.queue_time_us) 
> (data dir 7.run_cpu_time_us, data dir 7.run_cpu_time_us) (data dir 
> 7.run_wall_time_us, data dir 7.run_wall_time_us) (data dir 8.queue_time_us, 
> data dir 8.queue_time_us) (data dir 8.run_cpu_time_us, data dir 
> 8.run_cpu_time_us) (data dir 8.run_wall_time_us, data dir 8.run_wall_time_us) 
> (data dir 9.queue_time_us, data dir 9.queue_time_us) (data dir 
> 9.run_cpu_time_us, data dir 9.run_cpu_time_us) (data dir 9.run_wall_time_us, 
> data dir 9.run_wall_time_us)
> *** Check failure stack trace: ***
> *** Aborted at 1508448893 (unix time) try "date -d @1508448893" if you are 
> using GNU date ***
> PC: @     0x7fa0e2fa3428 gsignal
> *** SIGABRT (@0x3e8000013fd) received by PID 5117 (TID 0x7fa0e4f26800) from 
> PID 5117; stack trace: ***
>     @     0x7fa0e3349390 (unknown)
>     @     0x7fa0e2fa3428 gsignal
>     @     0x7fa0e2fa502a abort
>     @     0x7fa0e406cd99 google::logging_fail()
>     @     0x7fa0e406e6dd google::LogMessage::Fail()
>     @     0x7fa0e40705c3 google::LogMessage::SendToLog()
>     @     0x7fa0e406e23a google::LogMessage::Flush()
>     @     0x7fa0e4070f8f google::LogMessageFatal::~LogMessageFatal()
>     @     0x7fa0e4ad9586 kudu::TraceMetrics::InternName()
>     @     0x7fa0e4ac2a48 kudu::ThreadPool::ThreadPool()
>     @     0x7fa0e4ac149a kudu::ThreadPoolBuilder::Build()
>     @     0x7fa0e4db11bc kudu::fs::DataDirManager::Open()
>     @     0x7fa0e4daed74 kudu::fs::DataDirManager::CreateNew()
>     @     0x7fa0e4daeb3e kudu::fs::DataDirManager::CreateNewForTests()
>     @           0x43ed42 
> kudu::fs::TooManyDataDirManagerTest_TestTooManyInternedStrings_Test::TestBody()
>     @     0x7fa0e3e3ce77 
> testing::internal::HandleExceptionsInMethodIfSupported<>()
>     @     0x7fa0e3e32262 testing::Test::Run()
>     @     0x7fa0e3e323a8 testing::TestInfo::Run()
>     @     0x7fa0e3e32485 testing::TestCase::Run()
>     @     0x7fa0e3e33158 testing::internal::UnitTestImpl::RunAllTests()
>     @     0x7fa0e3e3d387 
> testing::internal::HandleExceptionsInMethodIfSupported<>()
>     @     0x7fa0e3e3255a testing::UnitTest::Run()
>     @     0x7fa0e50e509a RUN_ALL_TESTS()
>     @     0x7fa0e50e2d88 main
>     @     0x7fa0e2f8e830 __libc_start_main
>     @           0x434d89 _start
>     @                0x0 (unknown)
> Aborted (core dumped)
> {noformat}



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to