Repository: mesos Updated Branches: refs/heads/master 8327209a4 -> b872ccd08
Generate u32 filter handle in user level to work around MESOS-1617. Review: https://reviews.apache.org/r/23701 Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/b872ccd0 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/b872ccd0 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/b872ccd0 Branch: refs/heads/master Commit: b872ccd0863a05807992791304bb61117ed2f733 Parents: 8327209 Author: Jie Yu <yujie....@gmail.com> Authored: Fri Jul 18 16:52:36 2014 -0700 Committer: Jie Yu <yujie....@gmail.com> Committed: Mon Jul 21 13:52:46 2014 -0700 ---------------------------------------------------------------------- src/linux/routing/filter/arp.cpp | 3 + src/linux/routing/filter/filter.hpp | 17 +++- src/linux/routing/filter/handle.hpp | 71 ++++++++++++++++ src/linux/routing/filter/icmp.cpp | 3 + src/linux/routing/filter/internal.hpp | 125 ++++++++++++++++++++++++++++- src/linux/routing/filter/ip.cpp | 21 +++++ src/linux/routing/filter/ip.hpp | 12 +++ src/tests/routing_tests.cpp | 74 +++++++++++++++++ 8 files changed, 321 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/b872ccd0/src/linux/routing/filter/arp.cpp ---------------------------------------------------------------------- diff --git a/src/linux/routing/filter/arp.cpp b/src/linux/routing/filter/arp.cpp index c18aebb..b794a73 100644 --- a/src/linux/routing/filter/arp.cpp +++ b/src/linux/routing/filter/arp.cpp @@ -124,6 +124,7 @@ Try<bool> create( parent, Classifier(), priority, + None(), redirect)); } @@ -140,6 +141,7 @@ Try<bool> create( parent, Classifier(), priority, + None(), mirror)); } @@ -161,6 +163,7 @@ Try<bool> update( parent, Classifier(), None(), + None(), mirror)); } http://git-wip-us.apache.org/repos/asf/mesos/blob/b872ccd0/src/linux/routing/filter/filter.hpp ---------------------------------------------------------------------- diff --git a/src/linux/routing/filter/filter.hpp b/src/linux/routing/filter/filter.hpp index 40e21db..a603d73 100644 --- a/src/linux/routing/filter/filter.hpp +++ b/src/linux/routing/filter/filter.hpp @@ -26,6 +26,7 @@ #include <stout/option.hpp> #include "linux/routing/filter/action.hpp" +#include "linux/routing/filter/handle.hpp" #include "linux/routing/filter/priority.hpp" #include "linux/routing/queueing/handle.hpp" @@ -51,20 +52,24 @@ public: // Creates a filter with no action. Filter(const queueing::Handle& _parent, const Classifier& _classifier, - const Option<Priority>& _priority) + const Option<Priority>& _priority, + const Option<Handle>& _handle) : parent_(_parent), classifier_(_classifier), - priority_(_priority) {} + priority_(_priority), + handle_(_handle) {} // TODO(jieyu): Support arbitrary number of actions. template <typename Action> Filter(const queueing::Handle& _parent, const Classifier& _classifier, const Option<Priority>& _priority, + const Option<Handle>& _handle, const Action& action) : parent_(_parent), classifier_(_classifier), - priority_(_priority) + priority_(_priority), + handle_(_handle) { attach(action); } @@ -76,9 +81,10 @@ public: actions_.push_back(process::Shared<action::Action>(new A(action))); } - const queueing::Handle parent() const { return parent_; } + const queueing::Handle& parent() const { return parent_; } const Classifier& classifier() const { return classifier_; } const Option<Priority>& priority() const { return priority_; } + const Option<Handle>& handle() const { return handle_; } // Returns all the actions attached to this filter. const std::vector<process::Shared<action::Action> >& actions() const @@ -97,6 +103,9 @@ private: // The priority of this filter. Option<Priority> priority_; + // The handle of this filter. + Option<Handle> handle_; + // The set of actions attached to this filer. Note that we use // Shared here to make Filter copyable. std::vector<process::Shared<action::Action> > actions_; http://git-wip-us.apache.org/repos/asf/mesos/blob/b872ccd0/src/linux/routing/filter/handle.hpp ---------------------------------------------------------------------- diff --git a/src/linux/routing/filter/handle.hpp b/src/linux/routing/filter/handle.hpp new file mode 100644 index 0000000..0381933 --- /dev/null +++ b/src/linux/routing/filter/handle.hpp @@ -0,0 +1,71 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __LINUX_ROUTING_FILTER_HANDLE_HPP__ +#define __LINUX_ROUTING_FILTER_HANDLE_HPP__ + +#include <stdint.h> + +namespace routing { +namespace filter { + +// Represents the handle for a traffic control (tc) filter. Different +// types of filters have different types of handles. This is the base +// class of all types of filters. +class Handle +{ +public: + explicit Handle(uint32_t _handle) : handle(_handle) {} + virtual ~Handle() {} + + uint32_t get() const { return handle; } + +protected: + uint32_t handle; +}; + + +// Represents a u32 filter handle. A u32 filter handle has three +// parts. The first number identifies a hash table, the second number +// identifies a bucket within the hash table, and the third number +// identifies the filter item within the bucket. +// http://ace-host.stuart.id.au/russell/files/tc/doc/cls_u32.txt +class U32Handle : public Handle +{ +public: + explicit U32Handle(uint32_t _handle) : Handle(_handle) {} + + // The format of a u32 filter handle. + // +------------+--------+------------+ + // | htid | hash | node | + // +------------+--------+------------+ + // 12 bits 8 bits 12 bits + U32Handle(uint32_t htid, uint32_t hash, uint32_t node) + : Handle(((htid & 0xfff) << 20) + ((hash & 0xff) << 12) + (node & 0xfff)) {} + + virtual ~U32Handle() {} + + uint32_t htid() const { return handle >> 20; } + uint32_t hash() const { return (handle & 0x000ff000) >> 12; } + uint32_t node() const { return handle & 0x00000fff; } +}; + +} // namespace filter { +} // namespace routing { + +#endif // __LINUX_ROUTING_FILTER_HANDLE_HPP__ http://git-wip-us.apache.org/repos/asf/mesos/blob/b872ccd0/src/linux/routing/filter/icmp.cpp ---------------------------------------------------------------------- diff --git a/src/linux/routing/filter/icmp.cpp b/src/linux/routing/filter/icmp.cpp index 8e6f013..12b9eb7 100644 --- a/src/linux/routing/filter/icmp.cpp +++ b/src/linux/routing/filter/icmp.cpp @@ -220,6 +220,7 @@ Try<bool> create( parent, classifier, priority, + None(), redirect)); } @@ -237,6 +238,7 @@ Try<bool> create( parent, classifier, priority, + None(), mirror)); } @@ -262,6 +264,7 @@ Try<bool> update( parent, classifier, None(), + None(), mirror)); } http://git-wip-us.apache.org/repos/asf/mesos/blob/b872ccd0/src/linux/routing/filter/internal.hpp ---------------------------------------------------------------------- diff --git a/src/linux/routing/filter/internal.hpp b/src/linux/routing/filter/internal.hpp index 6836d1f..a970613 100644 --- a/src/linux/routing/filter/internal.hpp +++ b/src/linux/routing/filter/internal.hpp @@ -42,6 +42,8 @@ #include <stout/error.hpp> #include <stout/foreach.hpp> +#include <stout/hashmap.hpp> +#include <stout/hashset.hpp> #include <stout/none.hpp> #include <stout/nothing.hpp> #include <stout/option.hpp> @@ -271,6 +273,85 @@ inline Try<Nothing> attach( } +// Generates the handle for the given filter on the link. Returns none +// if we decide to let the kernel choose the handle. +template <typename Classifier> +Result<U32Handle> generateU32Handle( + const Netlink<struct rtnl_link>& link, + const Filter<Classifier>& filter) +{ + // If the user does not specify a priority, we have no choice but + // let the kernel choose the handle because we do not know the + // 'htid' that is associated with that priority. + if (filter.priority().isNone()) { + return None(); + } + + // Scan all the filters attached to the given parent on the link. + Try<Netlink<struct nl_sock> > sock = routing::socket(); + if (sock.isError()) { + return Error(sock.error()); + } + + // Dump all the libnl filters (i.e., rtnl_cls) attached to the given + // parent on the link. + struct nl_cache* c = NULL; + int err = rtnl_cls_alloc_cache( + sock.get().get(), + rtnl_link_get_ifindex(link.get()), + filter.parent().get(), + &c); + + if (err != 0) { + return Error( + "Failed to get filter info from kernel: " + + std::string(nl_geterror(err))); + } + + Netlink<struct nl_cache> cache(c); + + // A map from priority to the corresponding 'htid'. + hashmap<uint16_t, uint32_t> htids; + + // A map from 'htid' to a set of already used nodes. + hashmap<uint32_t, hashset<uint32_t> > nodes; + + for (struct nl_object* o = nl_cache_get_first(cache.get()); + o != NULL; o = nl_cache_get_next(o)) { + struct rtnl_cls* cls = (struct rtnl_cls*) o; + + // Only look at u32 filters. For other type of filters, their + // handles are generated by the kernel correctly. + if (rtnl_tc_get_kind(TC_CAST(cls)) == std::string("u32")) { + U32Handle handle(rtnl_tc_get_handle(TC_CAST(cls))); + + htids[rtnl_cls_get_prio(cls)] = handle.htid(); + nodes[handle.htid()].insert(handle.node()); + } + } + + // If this filter has a new priority, we need to let the kernel + // decide the handle because we don't know which 'htid' this + // priority will be associated with. + if (!htids.contains(filter.priority().get().get())) { + return None(); + } + + // NOTE: By default, kernel will choose to use divisor 1, which + // means all filters will be in hash bucket 0. Also, kernel assigns + // node id starting from 0x800 by default. Here, we keep the same + // semantics as kernel. + uint32_t htid = htids[filter.priority().get().get()]; + for (uint32_t node = 0x800; node <= 0xfff; node++) { + if (!nodes[htid].contains(node)) { + return U32Handle(htid, 0x0, node); + } + } + + return Error("No available handle exists"); +} + + // Encodes a filter (in our representation) to a libnl filter // (rtnl_cls). We use template here so that it works for any type of // classifier. @@ -308,6 +389,31 @@ Try<Netlink<struct rtnl_cls> > encodeFilter( } } + // Encode the handle. + if (filter.handle().isSome()) { + rtnl_tc_set_handle(TC_CAST(cls.get()), filter.handle().get().get()); + } else { + // NOTE: This is a workaround for MESOS-1617. Normally, if the + // user does not specify the handle for a filter, the kernel will + // generate one automatically. However, for u32 filters, the + // existing kernel is buggy in the sense that it will generate a + // handle that is already used by some other u32 filter (see the + // ticket for details). To address that, we explicitly set the + // handle of the filter by picking an unused handle. + // TODO(jieyu): Revisit this once the kernel bug is fixed. + if (rtnl_tc_get_kind(TC_CAST(cls.get())) == std::string("u32")) { + Result<U32Handle> handle = generateU32Handle(link, filter); + if (handle.isError()) { + return Error("Failed to find an unused u32 handle: " + handle.error()); + } + + // If 'handle' is none, let the kernel choose the handle. + if (handle.isSome()) { + rtnl_tc_set_handle(TC_CAST(cls.get()), handle.get().get()); + } + } + } + return cls; } @@ -333,6 +439,11 @@ Result<Filter<Classifier> > decodeFilter(const Netlink<struct rtnl_cls>& cls) // always have a valid priority here. Priority priority(rtnl_cls_get_prio(cls.get())); + // Decode the handle. If the handle is not specified by the user, + // kernel will assign a handle to the filter. So we should always + // have a valid handle here. + Handle handle(rtnl_tc_get_handle(TC_CAST(cls.get()))); + // Decode the classifier using a classifier specific function. Result<Classifier> classifier = decode<Classifier>(cls); if (classifier.isError()) { @@ -344,7 +455,7 @@ Result<Filter<Classifier> > decodeFilter(const Netlink<struct rtnl_cls>& cls) // TODO(jieyu): Decode all the actions attached to the filter. // Currently, libnl does not support that (but will support that in // the future). - return Filter<Classifier>(parent, classifier.get(), priority); + return Filter<Classifier>(parent, classifier.get(), priority, handle); } ///////////////////////////////////////////////// @@ -581,6 +692,18 @@ Try<bool> update(const std::string& _link, const Filter<Classifier>& filter) stringify(filter.priority().get().get())); } + // The kernel does not allow us to update the handle. So if the user + // specifies a handle, we will check to make sure they match. + if (filter.handle().isSome() && + filter.handle().get().get() != + rtnl_tc_get_handle(TC_CAST(oldCls.get().get()))) { + return Error( + "The handles do not match. The old handle is " + + stringify(rtnl_tc_get_handle(TC_CAST(oldCls.get().get()))) + + " and the new handle is " + + stringify(filter.handle().get().get())); + } + Try<Netlink<struct rtnl_cls> > newCls = encodeFilter(link.get(), filter); if (newCls.isError()) { return Error("Failed to encode the new filter: " + newCls.error()); http://git-wip-us.apache.org/repos/asf/mesos/blob/b872ccd0/src/linux/routing/filter/ip.cpp ---------------------------------------------------------------------- diff --git a/src/linux/routing/filter/ip.cpp b/src/linux/routing/filter/ip.cpp index 43f1f7f..74525a9 100644 --- a/src/linux/routing/filter/ip.cpp +++ b/src/linux/routing/filter/ip.cpp @@ -472,6 +472,26 @@ Try<bool> create( parent, classifier, priority, + None(), + redirect)); +} + + +Try<bool> create( + const string& link, + const queueing::Handle& parent, + const Classifier& classifier, + const Option<Priority>& priority, + const Option<Handle>& handle, + const action::Redirect& redirect) +{ + return internal::create( + link, + Filter<Classifier>( + parent, + classifier, + priority, + handle, redirect)); } @@ -489,6 +509,7 @@ Try<bool> create( parent, classifier, priority, + None(), terminal)); } http://git-wip-us.apache.org/repos/asf/mesos/blob/b872ccd0/src/linux/routing/filter/ip.hpp ---------------------------------------------------------------------- diff --git a/src/linux/routing/filter/ip.hpp b/src/linux/routing/filter/ip.hpp index 11e417d..566fd1e 100644 --- a/src/linux/routing/filter/ip.hpp +++ b/src/linux/routing/filter/ip.hpp @@ -158,6 +158,18 @@ Try<bool> create( const action::Redirect& redirect); +// Same as above, but allow the user to specify the handle. This +// interface is exposed only for testing MESOS-1617. +// TODO(jieyu): Revisit this once the kernel bug is fixed. +Try<bool> create( + const std::string& link, + const queueing::Handle& parent, + const Classifier& classifier, + const Option<Priority>& priority, + const Option<Handle>& handle, + const action::Redirect& redirect); + + // Creates an IP packet filter attached to the given parent on the // link which will stop the IP packets from being sent to the next // filter. Returns false if an IP packet filter attached to the given http://git-wip-us.apache.org/repos/asf/mesos/blob/b872ccd0/src/tests/routing_tests.cpp ---------------------------------------------------------------------- diff --git a/src/tests/routing_tests.cpp b/src/tests/routing_tests.cpp index dcc4982..020676c 100644 --- a/src/tests/routing_tests.cpp +++ b/src/tests/routing_tests.cpp @@ -993,3 +993,77 @@ TEST_F(RoutingVethTest, ROOT_IPFilterRemove) ASSERT_SOME(classifiers); EXPECT_EQ(0u, classifiers.get().size()); } + + +// Test the workaround introduced for MESOS-1617. +TEST_F(RoutingVethTest, ROOT_HandleGeneration) +{ + ASSERT_SOME(link::create(TEST_VETH_LINK, TEST_PEER_LINK, None())); + + EXPECT_SOME_TRUE(link::exists(TEST_VETH_LINK)); + EXPECT_SOME_TRUE(link::exists(TEST_PEER_LINK)); + + ASSERT_SOME_TRUE(ingress::create(TEST_VETH_LINK)); + + Result<net::MAC> mac = net::mac(TEST_VETH_LINK); + ASSERT_SOME(mac); + + net::IP ip = net::IP(0x01020304); // 1.2.3.4 + + Try<ip::PortRange> sourcePorts1 = + ip::PortRange::fromBeginEnd(1024, 1027); + + ASSERT_SOME(sourcePorts1); + + Try<ip::PortRange> destinationPorts1 = + ip::PortRange::fromBeginEnd(2000, 2000); + + ASSERT_SOME(destinationPorts1); + + Try<ip::PortRange> sourcePorts2 = + ip::PortRange::fromBeginEnd(3024, 3025); + + ASSERT_SOME(sourcePorts2); + + Try<ip::PortRange> destinationPorts2 = + ip::PortRange::fromBeginEnd(4000, 4003); + + ASSERT_SOME(destinationPorts2); + + ip::Classifier classifier1 = + ip::Classifier( + mac.get(), + ip, + sourcePorts1.get(), + destinationPorts1.get()); + + ip::Classifier classifier2 = + ip::Classifier( + mac.get(), + ip, + sourcePorts2.get(), + destinationPorts2.get()); + + // Use handle 800:00:fff for the first filter. + EXPECT_SOME_TRUE(ip::create( + TEST_VETH_LINK, + ingress::HANDLE, + classifier1, + Priority(2, 1), + U32Handle(0x800, 0x0, 0xfff), + action::Redirect(TEST_PEER_LINK))); + + // With the workaround, this filter should be assigned a handle + // different than 800:00:fff. + EXPECT_SOME_TRUE(ip::create( + TEST_VETH_LINK, + ingress::HANDLE, + classifier2, + Priority(2, 1), + action::Redirect(TEST_PEER_LINK))); + + // Try to remove the second filter. If we don't have the workaround, + // removing the second filter will return false since the kernel + // will find the handle matches the first filter. + EXPECT_SOME_TRUE(ip::remove(TEST_VETH_LINK, ingress::HANDLE, classifier2)); +}