KUDU-1657: read-only FsManager::Open on active tablet can crash This fixes a race condition when opening a read-only FsManager on a data directory with lbm containers that are actively being appended to. See the code comment for more details. A best-effort repro test case is included; it typically takes about 35 seconds to repro without the fix. This issue was identified because it was causing alter_table-randomized-test to be significantly flaky.
Change-Id: If68b04f1f1b8cd099120a220b1245ecf8f422770 Reviewed-on: http://gerrit.cloudera.org:8080/4551 Reviewed-by: Adar Dembo <[email protected]> Tested-by: Dan Burkert <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/70211f84 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/70211f84 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/70211f84 Branch: refs/heads/master Commit: 70211f84c9888e2ed8ed9ec87c4718e1e091dad0 Parents: 74c9e67 Author: Dan Burkert <[email protected]> Authored: Tue Sep 27 19:39:18 2016 -0700 Committer: Dan Burkert <[email protected]> Committed: Fri Sep 30 04:41:42 2016 +0000 ---------------------------------------------------------------------- src/kudu/fs/log_block_manager.cc | 12 +- src/kudu/integration-tests/CMakeLists.txt | 1 + .../integration-tests/open-readonly-fs-itest.cc | 146 +++++++++++++++++++ 3 files changed, 158 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/70211f84/src/kudu/fs/log_block_manager.cc ---------------------------------------------------------------------- diff --git a/src/kudu/fs/log_block_manager.cc b/src/kudu/fs/log_block_manager.cc index 6866422..0046af9 100644 --- a/src/kudu/fs/log_block_manager.cc +++ b/src/kudu/fs/log_block_manager.cc @@ -492,7 +492,17 @@ Status LogBlockContainer::ReadContainerRecords(deque<BlockRecordPB>* records) co local_records.pop_back(); break; } - CheckBlockRecord(local_records.back(), data_file_size); + + const auto& record = local_records.back(); + if (PREDICT_FALSE(data_file_size < record.offset() + record.length())) { + // KUDU-1657: When opening a container in read-only mode which is actively + // being written to by another lbm, we must reinspect the data file's size + // frequently in order to account for the latest appends. Inspecting the + // file size is expensive, so we only do it when the metadata indicates + // that additional data has been written to the file. + RETURN_NOT_OK(data_file_->Size(&data_file_size)); + } + CheckBlockRecord(record, data_file_size); } // NOTE: 'read_status' will never be OK here. if (PREDICT_TRUE(read_status.IsEndOfFile())) { http://git-wip-us.apache.org/repos/asf/kudu/blob/70211f84/src/kudu/integration-tests/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/src/kudu/integration-tests/CMakeLists.txt b/src/kudu/integration-tests/CMakeLists.txt index 2201120..9e45d95 100644 --- a/src/kudu/integration-tests/CMakeLists.txt +++ b/src/kudu/integration-tests/CMakeLists.txt @@ -62,6 +62,7 @@ ADD_KUDU_TEST_DEPENDENCIES(master_migration-itest kudu) ADD_KUDU_TEST(master_replication-itest RESOURCE_LOCK "master-rpc-ports") ADD_KUDU_TEST(master-stress-test RESOURCE_LOCK "master-rpc-ports") +ADD_KUDU_TEST(open-readonly-fs-itest) ADD_KUDU_TEST(raft_consensus-itest RUN_SERIAL true) ADD_KUDU_TEST(registration-test RESOURCE_LOCK "master-web-port") ADD_KUDU_TEST(table_locations-itest) http://git-wip-us.apache.org/repos/asf/kudu/blob/70211f84/src/kudu/integration-tests/open-readonly-fs-itest.cc ---------------------------------------------------------------------- diff --git a/src/kudu/integration-tests/open-readonly-fs-itest.cc b/src/kudu/integration-tests/open-readonly-fs-itest.cc new file mode 100644 index 0000000..d07f311 --- /dev/null +++ b/src/kudu/integration-tests/open-readonly-fs-itest.cc @@ -0,0 +1,146 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <memory> +#include <thread> + +#include "kudu/client/client-test-util.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/integration-tests/cluster_itest_util.h" +#include "kudu/integration-tests/cluster_verifier.h" +#include "kudu/integration-tests/external_mini_cluster.h" +#include "kudu/util/random.h" +#include "kudu/util/test_util.h" + +using kudu::client::KuduClient; +using kudu::client::KuduColumnSchema; +using kudu::client::KuduSchema; +using kudu::client::KuduSchemaBuilder; +using kudu::client::KuduSession; +using kudu::client::KuduTable; +using kudu::client::KuduTableCreator; +using kudu::client::KuduWriteOperation; +using kudu::client::sp::shared_ptr; + +using std::unique_ptr; + +namespace kudu { +namespace itest { + +const auto kTimeout = MonoDelta::FromSeconds(60); +const char kTableName[] = "test-table"; +const int kNumColumns = 50; + +class OpenReadonlyFsITest : public KuduTest { + + void SetUp() override { + KuduTest::SetUp(); + + ExternalMiniClusterOptions opts; + opts.num_tablet_servers = 1; + + // To repro KUDU-1657 with high probability, the container file size needs + // to be increasing frequently. To ensure this, we reduce the MRS flush + // threshold to increase flush frequency and increase the number of MM + // threads to encourage frequent compactions. The net effect of both of + // these changes: more blocks are written to disk. Turning off container + // file preallocation forces every append to increase the file size. + opts.extra_tserver_flags.push_back("--log_container_preallocate_bytes=0"); + opts.extra_tserver_flags.push_back("--maintenance_manager_num_threads=16"); + opts.extra_tserver_flags.push_back("--flush_threshold_mb=1"); + + cluster_.reset(new ExternalMiniCluster(opts)); + ASSERT_OK(cluster_->Start()); + + ASSERT_OK(cluster_->CreateClient(nullptr, &client_)); + } + + protected: + unique_ptr<ExternalMiniCluster> cluster_; + shared_ptr<KuduClient> client_; +}; + +// Integration test that creates a tablet, then in parallel: +// - writes as fast as possible to the tablet, causing as many flushes as possible +// - opens read-only instances of the FS manager on the tablet's data directory +// +// This is a regression test for KUDU-1657. It typically takes about 35 seconds +// to trigger that bug. +TEST_F(OpenReadonlyFsITest, TestWriteAndVerify) { + if (!AllowSlowTests()) return; + + KuduSchema schema; + KuduSchemaBuilder b; + b.AddColumn("key")->Type(KuduColumnSchema::INT64)->NotNull()->PrimaryKey(); + + for (int i = 1; i < kNumColumns; i++) { + b.AddColumn(strings::Substitute("value-$0", i)) + ->Type(KuduColumnSchema::INT64) + ->NotNull(); + } + + ASSERT_OK(b.Build(&schema)); + unique_ptr<KuduTableCreator> table_creator(client_->NewTableCreator()); + table_creator->table_name(kTableName) + .schema(&schema) + .set_range_partition_columns({ }) + .num_replicas(1); + + ASSERT_OK(table_creator->Create()); + + shared_ptr<KuduTable> table; + client_->OpenTable(kTableName, &table); + + shared_ptr<KuduSession> session(client_->NewSession()); + ASSERT_OK(session->SetFlushMode(KuduSession::AUTO_FLUSH_BACKGROUND)); + + Random rng(SeedRandom()); + auto deadline = MonoTime::Now() + kTimeout; + + auto t = std::thread([this, deadline] () { + FsManagerOpts fs_opts; + fs_opts.read_only = true; + fs_opts.wal_path = cluster_->tablet_server(0)->data_dir(); + fs_opts.data_paths = { cluster_->tablet_server(0)->data_dir() }; + while (MonoTime::Now() < deadline) { + FsManager fs(Env::Default(), fs_opts); + CHECK_OK(fs.Open()); + } + }); + + int64_t count = 0; + while (MonoTime::Now() < deadline) { + // Upsert a batch of [500, 1500) random rows. + int batch_count = 500 + rng.Next() % 1000; + for (int i = 0; i < batch_count; i++) { + unique_ptr<KuduWriteOperation> row(table->NewUpsert()); + for (int i = 0; i < kNumColumns; i++) { + ASSERT_OK(row->mutable_row()->SetInt64(i, rng.Next64())); + } + ASSERT_OK(session->Apply(row.release())); + } + ASSERT_OK(session->Flush()); + ASSERT_EQ(0, session->CountPendingErrors()); + count += batch_count; + LOG(INFO) << count << " rows inserted"; + } + + t.join(); +} + +} // namespace itest +} // namespace kudu
