Jesse, I forgot to include `git rm table.c table.h` in this patch. I will do it when I push the final patch.
Thanks, Pravin. On Tue, Aug 23, 2011 at 3:34 PM, Pravin Shelar <pshe...@nicira.com> wrote: > Currently OVS uses its own hashing implmentation for hash tables which > has some problems, e.g. error case on deletion code. Following patch > replaces that with hlist based hash table which is consistent with other > kernel hash tables. As Jesse suggested, flex-array is used for allocating > hash buckets, So that we can have large hash-table without large contiguous > kernel memory. > struct sw_flow is rearragned accodring to frequency of writes to a given > struct member. > --- > datapath/Modules.mk | 2 - > datapath/datapath.c | 111 +++++++++---------------- > datapath/datapath.h | 2 +- > datapath/flow.c | 190 ++++++++++++++++++++++++++++++++++++++++-- > datapath/flow.h | 61 +++++++++---- > datapath/tunnel.c | 230 > ++++++++++++++++++++++++++------------------------- > datapath/tunnel.h | 4 +- > 7 files changed, 385 insertions(+), 215 deletions(-) > > diff --git a/datapath/Modules.mk b/datapath/Modules.mk > index 587569f..8eb4e91 100644 > --- a/datapath/Modules.mk > +++ b/datapath/Modules.mk > @@ -18,7 +18,6 @@ openvswitch_sources = \ > dp_sysfs_if.c \ > flow.c \ > loop_counter.c \ > - table.c \ > tunnel.c \ > vlan.c \ > vport.c \ > @@ -37,7 +36,6 @@ openvswitch_headers = \ > dp_sysfs.h \ > flow.h \ > loop_counter.h \ > - table.h \ > tunnel.h \ > vlan.h \ > vport.h \ > diff --git a/datapath/datapath.c b/datapath/datapath.c > index 0b6e2e5..ba0b05c 100644 > --- a/datapath/datapath.c > +++ b/datapath/datapath.c > @@ -49,8 +49,8 @@ > #include "datapath.h" > #include "actions.h" > #include "flow.h" > -#include "table.h" > #include "vlan.h" > +#include "tunnel.h" > #include "vport-internal_dev.h" > > #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18) || \ > @@ -107,7 +107,7 @@ struct datapath *get_dp(int dp_ifindex) > EXPORT_SYMBOL_GPL(get_dp); > > /* Must be called with genl_mutex. */ > -static struct tbl *get_table_protected(struct datapath *dp) > +static struct flow_table *get_table_protected(struct datapath *dp) > { > return rcu_dereference_protected(dp->table, lockdep_genl_is_held()); > } > @@ -225,7 +225,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu) > { > struct datapath *dp = container_of(rcu, struct datapath, rcu); > > - tbl_destroy((struct tbl __force *)dp->table, flow_free_tbl); > + flow_destroy_table(dp->table); > free_percpu(dp->stats_percpu); > kobject_put(&dp->ifobj); > } > @@ -269,6 +269,7 @@ int dp_detach_port(struct vport *p) > void dp_process_received_packet(struct vport *p, struct sk_buff *skb) > { > struct datapath *dp = p->dp; > + struct sw_flow *flow; > struct dp_stats_percpu *stats; > int stats_counter_off; > int error; > @@ -277,7 +278,6 @@ void dp_process_received_packet(struct vport *p, struct > sk_buff *skb) > > if (!OVS_CB(skb)->flow) { > struct sw_flow_key key; > - struct tbl_node *flow_node; > int key_len; > bool is_frag; > > @@ -295,9 +295,8 @@ void dp_process_received_packet(struct vport *p, struct > sk_buff *skb) > } > > /* Look up flow. 
*/ > - flow_node = tbl_lookup(rcu_dereference(dp->table), &key, > key_len, > - flow_hash(&key, key_len), flow_cmp); > - if (unlikely(!flow_node)) { > + flow = flow_lookup(rcu_dereference(dp->table), &key, key_len); > + if (unlikely(!flow)) { > struct dp_upcall_info upcall; > > upcall.cmd = OVS_PACKET_CMD_MISS; > @@ -311,7 +310,7 @@ void dp_process_received_packet(struct vport *p, struct > sk_buff *skb) > goto out; > } > > - OVS_CB(skb)->flow = flow_cast(flow_node); > + OVS_CB(skb)->flow = flow; > } > > stats_counter_off = offsetof(struct dp_stats_percpu, n_hit); > @@ -524,8 +523,8 @@ err_kfree_skbs: > /* Called with genl_mutex. */ > static int flush_flows(int dp_ifindex) > { > - struct tbl *old_table; > - struct tbl *new_table; > + struct flow_table *old_table; > + struct flow_table *new_table; > struct datapath *dp; > > dp = get_dp(dp_ifindex); > @@ -533,14 +532,13 @@ static int flush_flows(int dp_ifindex) > return -ENODEV; > > old_table = get_table_protected(dp); > - new_table = tbl_create(TBL_MIN_BUCKETS); > + new_table = flow_hash_alloc(TBL_MIN_BUCKETS); > if (!new_table) > return -ENOMEM; > > rcu_assign_pointer(dp->table, new_table); > > - tbl_deferred_destroy(old_table, flow_free_tbl); > - > + flow_deferred_destroy_table(old_table); > return 0; > } > > @@ -622,24 +620,6 @@ static void clear_stats(struct sw_flow *flow) > flow->byte_count = 0; > } > > -/* Called with genl_mutex. */ > -static int expand_table(struct datapath *dp) > -{ > - struct tbl *old_table = get_table_protected(dp); > - struct tbl *new_table; > - > - new_table = tbl_expand(old_table); > - if (IS_ERR(new_table)) { > - if (PTR_ERR(new_table) != -ENOSPC) > - return PTR_ERR(new_table); > - } else { > - rcu_assign_pointer(dp->table, new_table); > - tbl_deferred_destroy(old_table, NULL); > - } > - > - return 0; > -} > - > static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info > *info) > { > struct ovs_header *ovs_header = info->userhdr; > @@ -693,7 +673,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, > struct genl_info *info) > err = flow_extract(packet, -1, &flow->key, &key_len, &is_frag); > if (err) > goto err_flow_put; > - flow->tbl_node.hash = flow_hash(&flow->key, key_len); > > err = flow_metadata_from_nlattrs(&flow->key.eth.in_port, > &flow->key.eth.tun_id, > @@ -747,9 +726,9 @@ static struct genl_ops dp_packet_genl_ops[] = { > static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats) > { > int i; > - struct tbl *table = get_table_protected(dp); > + struct flow_table *table = get_table_protected(dp); > > - stats->n_flows = tbl_count(table); > + stats->n_flows = flow_tbl_count(table); > > stats->n_frags = stats->n_hit = stats->n_missed = stats->n_lost = 0; > for_each_possible_cpu(i) { > @@ -940,13 +919,11 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, > struct genl_info *info) > { > struct nlattr **a = info->attrs; > struct ovs_header *ovs_header = info->userhdr; > - struct tbl_node *flow_node; > struct sw_flow_key key; > struct sw_flow *flow; > struct sk_buff *reply; > struct datapath *dp; > - struct tbl *table; > - u32 hash; > + struct flow_table *table; > int error; > int key_len; > > @@ -973,10 +950,9 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, > struct genl_info *info) > if (!dp) > goto error; > > - hash = flow_hash(&key, key_len); > table = get_table_protected(dp); > - flow_node = tbl_lookup(table, &key, key_len, hash, flow_cmp); > - if (!flow_node) { > + flow = flow_lookup(table, &key, key_len); > + if (!flow) { > struct sw_flow_actions 
*acts; > > /* Bail out if we're not allowed to create a new flow. */ > @@ -985,11 +961,15 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, > struct genl_info *info) > goto error; > > /* Expand table, if necessary, to make room. */ > - if (tbl_count(table) >= tbl_n_buckets(table)) { > - error = expand_table(dp); > - if (error) > - goto error; > - table = get_table_protected(dp); > + if (flow_tbl_need_to_expand(table)) { > + struct flow_table *new_table; > + > + new_table = flow_hash_expand(table); > + if (!IS_ERR(new_table)) { > + rcu_assign_pointer(dp->table, new_table); > + flow_deferred_destroy_table(table); > + table = get_table_protected(dp); > + } > } > > /* Allocate flow. */ > @@ -1009,9 +989,8 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, > struct genl_info *info) > rcu_assign_pointer(flow->sf_acts, acts); > > /* Put flow in bucket. */ > - error = tbl_insert(table, &flow->tbl_node, hash); > - if (error) > - goto error_free_flow; > + flow->hash = flow_hash(&key, key_len); > + flow_insert(table, flow); > > reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid, > info->snd_seq, > OVS_FLOW_CMD_NEW); > @@ -1031,7 +1010,6 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, > struct genl_info *info) > goto error; > > /* Update actions. */ > - flow = flow_cast(flow_node); > old_acts = rcu_dereference_protected(flow->sf_acts, > lockdep_genl_is_held()); > if (a[OVS_FLOW_ATTR_ACTIONS] && > @@ -1079,11 +1057,10 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, > struct genl_info *info) > struct nlattr **a = info->attrs; > struct ovs_header *ovs_header = info->userhdr; > struct sw_flow_key key; > - struct tbl_node *flow_node; > struct sk_buff *reply; > struct sw_flow *flow; > struct datapath *dp; > - struct tbl *table; > + struct flow_table *table; > int err; > int key_len; > > @@ -1098,12 +1075,10 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, > struct genl_info *info) > return -ENODEV; > > table = get_table_protected(dp); > - flow_node = tbl_lookup(table, &key, key_len, flow_hash(&key, key_len), > - flow_cmp); > - if (!flow_node) > + flow = flow_lookup(table, &key, key_len); > + if (!flow) > return -ENOENT; > > - flow = flow_cast(flow_node); > reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid, > info->snd_seq, OVS_FLOW_CMD_NEW); > if (IS_ERR(reply)) > return PTR_ERR(reply); > @@ -1116,11 +1091,10 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, > struct genl_info *info) > struct nlattr **a = info->attrs; > struct ovs_header *ovs_header = info->userhdr; > struct sw_flow_key key; > - struct tbl_node *flow_node; > struct sk_buff *reply; > struct sw_flow *flow; > struct datapath *dp; > - struct tbl *table; > + struct flow_table *table; > int err; > int key_len; > > @@ -1135,21 +1109,15 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, > struct genl_info *info) > return -ENODEV; > > table = get_table_protected(dp); > - flow_node = tbl_lookup(table, &key, key_len, flow_hash(&key, key_len), > - flow_cmp); > - if (!flow_node) > + flow = flow_lookup(table, &key, key_len); > + if (!flow) > return -ENOENT; > - flow = flow_cast(flow_node); > > reply = ovs_flow_cmd_alloc_info(flow); > if (!reply) > return -ENOMEM; > > - err = tbl_remove(table, flow_node); > - if (err) { > - kfree_skb(reply); > - return err; > - } > + flow_remove(table, flow); > > err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_pid, > info->snd_seq, 0, OVS_FLOW_CMD_DEL); > @@ -1172,17 +1140,15 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, > struct netlink_callback *cb) > 
return -ENODEV; > > for (;;) { > - struct tbl_node *flow_node; > struct sw_flow *flow; > u32 bucket, obj; > > bucket = cb->args[0]; > obj = cb->args[1]; > - flow_node = tbl_next(get_table_protected(dp), &bucket, &obj); > - if (!flow_node) > + flow = flow_tbl_next(get_table_protected(dp), &bucket, &obj); > + if (!flow) > break; > > - flow = flow_cast(flow_node); > if (ovs_flow_cmd_fill_info(flow, dp, skb, > NETLINK_CB(cb->skb).pid, > cb->nlh->nlmsg_seq, NLM_F_MULTI, > OVS_FLOW_CMD_NEW) < 0) > @@ -1377,7 +1343,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct > genl_info *info) > > /* Allocate table. */ > err = -ENOMEM; > - rcu_assign_pointer(dp->table, tbl_create(TBL_MIN_BUCKETS)); > + rcu_assign_pointer(dp->table, flow_hash_alloc(TBL_MIN_BUCKETS)); > if (!dp->table) > goto err_free_dp; > > @@ -1423,7 +1389,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct > genl_info *info) > err_destroy_local_port: > dp_detach_port(get_vport_protected(dp, OVSP_LOCAL)); > err_destroy_table: > - tbl_destroy(get_table_protected(dp), NULL); > + flow_destroy_table(get_table_protected(dp)); > err_free_dp: > kfree(dp); > err_put_module: > @@ -2088,6 +2054,7 @@ static void dp_cleanup(void) > unregister_netdevice_notifier(&dp_device_notifier); > vport_exit(); > flow_exit(); > + tnl_exit(); > } > > module_init(dp_init); > diff --git a/datapath/datapath.h b/datapath/datapath.h > index d14f974..b4f08ac 100644 > --- a/datapath/datapath.h > +++ b/datapath/datapath.h > @@ -85,7 +85,7 @@ struct datapath { > int drop_frags; > > /* Flow table. */ > - struct tbl __rcu *table; > + struct flow_table __rcu *table; > > /* Switch ports. */ > struct vport __rcu *ports[DP_MAX_PORTS]; > diff --git a/datapath/flow.c b/datapath/flow.c > index 9d7fbc9..7e83d53 100644 > --- a/datapath/flow.c > +++ b/datapath/flow.c > @@ -216,14 +216,153 @@ struct sw_flow *flow_alloc(void) > return flow; > } > > -void flow_free_tbl(struct tbl_node *node) > +static struct hlist_head __rcu *find_bucket(struct flow_table * table, u32 > hash) > { > - struct sw_flow *flow = flow_cast(node); > + return flex_array_get(table->buckets, > + (hash & (table->n_buckets - 1))); > +} > + > +static struct flex_array __rcu *alloc_buckets(unsigned int n_buckets) > +{ > + struct flex_array __rcu * buckets; > + int i, err; > + > + buckets = flex_array_alloc(sizeof(struct hlist_head *), > + n_buckets, GFP_KERNEL); > + if (!buckets) > + return NULL; > + > + err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL); > + if (err) { > + flex_array_free(buckets); > + return NULL; > + } > + > + for (i = 0; i < n_buckets; i++) > + INIT_HLIST_HEAD((struct hlist_head *) > + flex_array_get(buckets, i)); > + > + return buckets; > +} > + > +static void free_buckets(struct flex_array * buckets) > +{ > + flex_array_free(buckets); > +} > + > +struct flow_table *flow_hash_alloc(int new_size) > +{ > + struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL); > + > + if (!table) > + return NULL; > + > + table->buckets = alloc_buckets(new_size); > + > + if (!table->buckets) { > + kfree(table); > + return NULL; > + } > + table->n_buckets = new_size; > + table->count = 0; > + > + return table; > +} > > +static void flow_free(struct sw_flow *flow) > +{ > flow->dead = true; > flow_put(flow); > } > > +void flow_destroy_table(struct flow_table *table) > +{ > + int i; > + > + for (i = 0; i < table->n_buckets; i++) { > + struct sw_flow *flow; > + struct hlist_head *head = flex_array_get(table->buckets, i); > + struct hlist_node *node, *n; > + > + 
hlist_for_each_entry_safe(flow, node, n, head, hash_node) { > + hlist_del_init_rcu(&flow->hash_node); > + flow_free(flow); > + } > + } > + > + free_buckets(table->buckets); > + kfree(table); > +} > + > +static void flow_destroy_table_rcu_cb(struct rcu_head *rcu) > +{ > + struct flow_table *table = container_of(rcu, struct flow_table, rcu); > + > + flow_destroy_table(table); > +} > + > +void flow_deferred_destroy_table(struct flow_table *table) > +{ > + if (!table) > + return; > + > + call_rcu(&table->rcu, flow_destroy_table_rcu_cb); > +} > + > +struct sw_flow *flow_tbl_next(struct flow_table *table, u32 *bucket, u32 > *last) > +{ > + struct sw_flow *flow = NULL; > + struct hlist_head *head; > + struct hlist_node *n; > + int i; > + > + while (*bucket < table->n_buckets) { > + i = 0; > + head = flex_array_get(table->buckets, *bucket); > + hlist_for_each_entry_rcu(flow, n, head, hash_node) { > + if (i < *last) { > + i++; > + continue; > + } > + *last = i + 1; > + return flow; > + } > + (*bucket)++; > + *last = 0; > + flow = NULL; > + } > + > + return flow; > +} > + > +struct flow_table *flow_hash_expand(struct flow_table *table) > +{ > + struct flow_table *new_table; > + struct sw_flow *flow; > + struct hlist_head *head; > + struct hlist_node *n, *pos; > + int n_buckets = table->n_buckets << 2; > + int i; > + > + if (n_buckets > TBL_MAX_BUCKETS) > + return ERR_PTR(-ENOSPC); > + > + new_table = flow_hash_alloc(n_buckets); > + if (!new_table) > + return ERR_PTR(-ENOMEM); > + > + for (i = 0; i < table->n_buckets; i++) { > + head = flex_array_get(table->buckets, i); > + > + hlist_for_each_entry_safe(flow, n, pos, head, hash_node) { > + hlist_del_init_rcu(&flow->hash_node); > + flow_insert(new_table, flow); > + } > + } > + > + return new_table; > +} > + > /* RCU callback used by flow_deferred_free. */ > static void rcu_free_flow_callback(struct rcu_head *rcu) > { > @@ -589,12 +728,51 @@ u32 flow_hash(const struct sw_flow_key *key, int > key_len) > return jhash2((u32*)key, DIV_ROUND_UP(key_len, sizeof(u32)), > hash_seed); > } > > -int flow_cmp(const struct tbl_node *node, void *key2_, int len) > +struct sw_flow * flow_lookup(struct flow_table *table, struct sw_flow_key > *key, > + int key_len) > { > - const struct sw_flow_key *key1 = &flow_cast(node)->key; > - const struct sw_flow_key *key2 = key2_; > + struct sw_flow *flow; > + struct hlist_node *n; > + struct hlist_head *head; > + u32 hash; > > - return !memcmp(key1, key2, len); > + hash = flow_hash(key, key_len); > + > + head = find_bucket(table, hash); > + hlist_for_each_entry_rcu(flow, n, head, hash_node) { > + if (flow->hash == hash && > + !memcmp(&flow->key, key, key_len)) { > + return flow; > + } > + } > + return NULL; > +} > + > +void flow_insert(struct flow_table *table, struct sw_flow *flow) > +{ > + struct hlist_head *head; > + struct hlist_node *n; > + struct sw_flow *cflow; > + > + head = find_bucket(table, flow->hash); > + > + hlist_for_each_entry_rcu(cflow, n, head, hash_node) { > + if (!memcmp(&cflow->key, &flow->key, sizeof(cflow->key))) { > + BUG(); > + return; > + } > + } > + hlist_add_head_rcu(&flow->hash_node, head); > + table->count++; > +} > + > +void flow_remove(struct flow_table *table, struct sw_flow *flow) > +{ > + if (!hlist_unhashed(&flow->hash_node)) { > + hlist_del_init_rcu(&flow->hash_node); > + table->count--; > + BUG_ON(table->count < 0); > + } > } > > /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. 
*/ > diff --git a/datapath/flow.h b/datapath/flow.h > index 997692c..b304c60 100644 > --- a/datapath/flow.h > +++ b/datapath/flow.h > @@ -18,9 +18,8 @@ > #include <linux/in6.h> > #include <linux/jiffies.h> > #include <linux/time.h> > - > +#include <linux/flex_array.h> > #include "openvswitch/datapath-protocol.h" > -#include "table.h" > > struct sk_buff; > > @@ -78,21 +77,21 @@ struct sw_flow_key { > }; > }; > > -struct sw_flow { > - struct rcu_head rcu; > - struct tbl_node tbl_node; > > - struct sw_flow_key key; > +struct sw_flow { > struct sw_flow_actions __rcu *sf_acts; > - > - atomic_t refcnt; > + struct hlist_node hash_node; > + struct sw_flow_key key; > + u32 hash; > + u8 tcp_flags; /* Union of seen TCP flags. */ > bool dead; > > - spinlock_t lock; /* Lock for values below. */ > - unsigned long used; /* Last used time (in jiffies). */ > - u64 packet_count; /* Number of packets matched. */ > - u64 byte_count; /* Number of bytes matched. */ > - u8 tcp_flags; /* Union of seen TCP flags. */ > + spinlock_t lock; /* Lock for values below. */ > + unsigned long used; /* Last used time (in jiffies). */ > + u64 packet_count; /* Number of packets matched. */ > + u64 byte_count; /* Number of bytes matched. */ > + atomic_t refcnt; > + struct rcu_head rcu; > }; > > struct arp_eth_header > @@ -115,7 +114,6 @@ void flow_exit(void); > > struct sw_flow *flow_alloc(void); > void flow_deferred_free(struct sw_flow *); > -void flow_free_tbl(struct tbl_node *); > > struct sw_flow_actions *flow_actions_alloc(const struct nlattr *); > void flow_deferred_free_acts(struct sw_flow_actions *); > @@ -128,9 +126,6 @@ int flow_extract(struct sk_buff *, u16 in_port, struct > sw_flow_key *, > void flow_used(struct sw_flow *, struct sk_buff *); > u64 flow_used_time(unsigned long flow_jiffies); > > -u32 flow_hash(const struct sw_flow_key *, int key_lenp); > -int flow_cmp(const struct tbl_node *, void *target, int len); > - > /* Upper bound on the length of a nlattr-formatted flow key. 
The longest > * nlattr-formatted flow key would be: > * > @@ -155,9 +150,37 @@ int flow_from_nlattrs(struct sw_flow_key *swkey, int > *key_lenp, > int flow_metadata_from_nlattrs(u16 *in_port, __be64 *tun_id, > const struct nlattr *); > > -static inline struct sw_flow *flow_cast(const struct tbl_node *node) > +#define TBL_MIN_BUCKETS 1024 > +#define TBL_MAX_BUCKETS (TBL_MIN_BUCKETS * TBL_MIN_BUCKETS) > + > +struct flow_table { > + struct flex_array *buckets; > + unsigned int count, n_buckets; > + struct rcu_head rcu; > +}; > + > +static inline int flow_tbl_count(struct flow_table *table) > { > - return container_of(node, struct sw_flow, tbl_node); > + return table->count; > } > > +static inline int flow_tbl_need_to_expand(struct flow_table *table) > +{ > + return (table->count > table->n_buckets); > +} > + > +struct sw_flow *flow_lookup(struct flow_table *table, struct sw_flow_key > *key, > + int len); > +void flow_destroy_table(struct flow_table *table); > +void flow_deferred_destroy_table(struct flow_table *table); > +struct flow_table *flow_hash_alloc(int new_size); > +struct flow_table *flow_hash_expand(struct flow_table *table); > +void flow_insert(struct flow_table *table, struct sw_flow *flow); > +void flow_remove(struct flow_table *table, struct sw_flow *flow); > +u32 flow_hash(const struct sw_flow_key *key, int key_len); > + > +int flow_tbl_count(struct flow_table *table); > +int flow_tbl_need_to_expand(struct flow_table *table); > +struct sw_flow *flow_tbl_next(struct flow_table *table, u32 *bucket, u32 > *idx); > + > #endif /* flow.h */ > diff --git a/datapath/tunnel.c b/datapath/tunnel.c > index 67556d7..8121b02 100644 > --- a/datapath/tunnel.c > +++ b/datapath/tunnel.c > @@ -13,6 +13,7 @@ > #include <linux/in.h> > #include <linux/in_route.h> > #include <linux/jhash.h> > +#include <linux/list.h> > #include <linux/kernel.h> > #include <linux/version.h> > #include <linux/workqueue.h> > @@ -31,7 +32,6 @@ > #include "actions.h" > #include "checksum.h" > #include "datapath.h" > -#include "table.h" > #include "tunnel.h" > #include "vlan.h" > #include "vport.h" > @@ -68,8 +68,10 @@ > #define CACHE_CLEANER_INTERVAL (5 * HZ) > > #define CACHE_DATA_ALIGN 16 > +#define PORT_TABLE_SIZE 1024 > > -static struct tbl __rcu *port_table __read_mostly; > +static struct hlist_head *port_table __read_mostly; > +static int port_table_count; > > static void cache_cleaner(struct work_struct *work); > static DECLARE_DELAYED_WORK(cache_cleaner_wq, cache_cleaner); > @@ -95,11 +97,6 @@ static inline struct vport *tnl_vport_to_vport(const > struct tnl_vport *tnl_vport > return vport_from_priv(tnl_vport); > } > > -static inline struct tnl_vport *tnl_vport_table_cast(const struct tbl_node > *node) > -{ > - return container_of(node, struct tnl_vport, tbl_node); > -} > - > /* This is analogous to rtnl_dereference for the tunnel cache. It checks > that > * cache_lock is held, so it is only for update side code. > */ > @@ -186,11 +183,9 @@ struct port_lookup_key { > * Modifies 'target' to store the rcu_dereferenced pointer that was used to do > * the comparision. 
> */ > -static int port_cmp(const struct tbl_node *node, void *target, int unused) > +static int port_cmp(const struct tnl_vport *tnl_vport, > + struct port_lookup_key *lookup) > { > - const struct tnl_vport *tnl_vport = tnl_vport_table_cast(node); > - struct port_lookup_key *lookup = target; > - > lookup->mutable = rcu_dereference_rtnl(tnl_vport->mutable); > > return (lookup->mutable->tunnel_type == lookup->tunnel_type && > @@ -218,106 +213,95 @@ static u32 mutable_hash(const struct > tnl_mutable_config *mutable) > return port_hash(&lookup); > } > > -static void check_table_empty(void) > -{ > - struct tbl *old_table = rtnl_dereference(port_table); > > - if (tbl_count(old_table) == 0) { > - cancel_delayed_work_sync(&cache_cleaner_wq); > - rcu_assign_pointer(port_table, NULL); > - tbl_deferred_destroy(old_table, NULL); > - } > +static inline struct hlist_head *find_bucket(u32 hash) > +{ > + return &port_table[(hash & (PORT_TABLE_SIZE - 1))]; > } > > -static int add_port(struct vport *vport) > +static int alloc_port_table(void) > { > - struct tbl *cur_table = rtnl_dereference(port_table); > - struct tnl_vport *tnl_vport = tnl_vport_priv(vport); > - int err; > + unsigned int i; > + static struct hlist_head *head; > > - if (!port_table) { > - struct tbl *new_table; > + head = kmalloc(PORT_TABLE_SIZE * sizeof(struct hlist_head *), > + GFP_KERNEL); > + if (!head) > + return -ENOMEM; > > - new_table = tbl_create(TBL_MIN_BUCKETS); > - if (!new_table) > - return -ENOMEM; > + for (i = 0; i < PORT_TABLE_SIZE; i++) > + INIT_HLIST_HEAD(&head[i]); > > - rcu_assign_pointer(port_table, new_table); > - schedule_cache_cleaner(); > + smp_wmb(); > + port_table = head; > + return 0; > +} > > - } else if (tbl_count(cur_table) > tbl_n_buckets(cur_table)) { > - struct tbl *new_table; > +static int add_port(struct vport *vport) > +{ > + struct tnl_vport *tnl_vport = tnl_vport_priv(vport); > + u32 hash = mutable_hash(rtnl_dereference(tnl_vport->mutable)); > + int err; > > - new_table = tbl_expand(cur_table); > - if (IS_ERR(new_table)) { > - if (PTR_ERR(new_table) != -ENOSPC) > - return PTR_ERR(new_table); > - } else { > - rcu_assign_pointer(port_table, new_table); > - tbl_deferred_destroy(cur_table, NULL); > - } > + if (unlikely(!port_table)) { > + err = alloc_port_table(); > + if (err) > + return err; > } > + if (port_table_count == 0) > + schedule_cache_cleaner(); > > - err = tbl_insert(rtnl_dereference(port_table), &tnl_vport->tbl_node, > - mutable_hash(rtnl_dereference(tnl_vport->mutable))); > - if (err) { > - check_table_empty(); > - return err; > - } > + hlist_add_head_rcu(&tnl_vport->hash_node, find_bucket(hash)); > + port_table_count++; > > (*find_port_pool(rtnl_dereference(tnl_vport->mutable)))++; > - > return 0; > } > > -static int move_port(struct vport *vport, struct tnl_mutable_config > *new_mutable) > +static void move_port(struct vport *vport, > + struct tnl_mutable_config *new_mutable) > { > - int err; > - struct tbl *cur_table = rtnl_dereference(port_table); > struct tnl_vport *tnl_vport = tnl_vport_priv(vport); > u32 hash; > > hash = mutable_hash(new_mutable); > - if (hash == tnl_vport->tbl_node.hash) > - goto table_updated; > - > - /* > - * Ideally we should make this move atomic to avoid having gaps in > - * finding tunnels or the possibility of failure. However, if we do > - * find a tunnel it will always be consistent. 
> - */ > - err = tbl_remove(cur_table, &tnl_vport->tbl_node); > - if (err) > - return err; > + hlist_del_init_rcu(&tnl_vport->hash_node); > + hlist_add_head_rcu(&tnl_vport->hash_node, find_bucket(hash)); > > - err = tbl_insert(cur_table, &tnl_vport->tbl_node, hash); > - if (err) { > - (*find_port_pool(rtnl_dereference(tnl_vport->mutable)))--; > - check_table_empty(); > - return err; > - } > - > -table_updated: > (*find_port_pool(rtnl_dereference(tnl_vport->mutable)))--; > assign_config_rcu(vport, new_mutable); > (*find_port_pool(rtnl_dereference(tnl_vport->mutable)))++; > - > - return 0; > } > > -static int del_port(struct vport *vport) > +static void del_port(struct vport *vport) > { > struct tnl_vport *tnl_vport = tnl_vport_priv(vport); > - int err; > > - err = tbl_remove(rtnl_dereference(port_table), &tnl_vport->tbl_node); > - if (err) > - return err; > + hlist_del_init_rcu(&tnl_vport->hash_node); > + > + port_table_count--; > + if (port_table_count == 0) > + cancel_delayed_work_sync(&cache_cleaner_wq); > > - check_table_empty(); > (*find_port_pool(rtnl_dereference(tnl_vport->mutable)))--; > +} > > - return 0; > +struct tnl_vport *port_table_lookup(struct port_lookup_key *lookup) > +{ > + struct hlist_node *n; > + struct hlist_head *bucket; > + u32 hash = port_hash(lookup); > + struct tnl_vport * tnl_vport; > + > + bucket = find_bucket(hash); > + > + hlist_for_each_entry_rcu(tnl_vport, n, bucket, hash_node) { > + if (!port_cmp(tnl_vport, lookup)) { > + return tnl_vport; > + } > + } > + > + return NULL; > } > > struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be64 key, > @@ -325,10 +309,9 @@ struct vport *tnl_find_port(__be32 saddr, __be32 daddr, > __be64 key, > const struct tnl_mutable_config **mutable) > { > struct port_lookup_key lookup; > - struct tbl *table = rcu_dereference_rtnl(port_table); > - struct tbl_node *tbl_node; > + struct tnl_vport * tnl_vport; > > - if (unlikely(!table)) > + if (unlikely(!port_table)) > return NULL; > > lookup.saddr = saddr; > @@ -339,18 +322,15 @@ struct vport *tnl_find_port(__be32 saddr, __be32 daddr, > __be64 key, > lookup.tunnel_type = tunnel_type & ~TNL_T_KEY_MATCH; > > if (key_local_remote_ports) { > - tbl_node = tbl_lookup(table, &lookup, sizeof(lookup), > - port_hash(&lookup), port_cmp); > - if (tbl_node) > + tnl_vport = port_table_lookup(&lookup); > + if (tnl_vport) > goto found; > } > > if (key_remote_ports) { > lookup.saddr = 0; > - > - tbl_node = tbl_lookup(table, &lookup, sizeof(lookup), > - port_hash(&lookup), port_cmp); > - if (tbl_node) > + tnl_vport = port_table_lookup(&lookup); > + if (tnl_vport) > goto found; > > lookup.saddr = saddr; > @@ -362,18 +342,15 @@ struct vport *tnl_find_port(__be32 saddr, __be32 daddr, > __be64 key, > lookup.tunnel_type = tunnel_type & ~TNL_T_KEY_EXACT; > > if (local_remote_ports) { > - tbl_node = tbl_lookup(table, &lookup, sizeof(lookup), > - port_hash(&lookup), port_cmp); > - if (tbl_node) > + tnl_vport = port_table_lookup(&lookup); > + if (tnl_vport) > goto found; > } > > if (remote_ports) { > lookup.saddr = 0; > - > - tbl_node = tbl_lookup(table, &lookup, sizeof(lookup), > - port_hash(&lookup), port_cmp); > - if (tbl_node) > + tnl_vport = port_table_lookup(&lookup); > + if (tnl_vport) > goto found; > } > } > @@ -382,7 +359,7 @@ struct vport *tnl_find_port(__be32 saddr, __be32 daddr, > __be64 key, > > found: > *mutable = lookup.mutable; > - return tnl_vport_to_vport(tnl_vport_table_cast(tbl_node)); > + return tnl_vport_to_vport(tnl_vport); > } > > static void ecn_decapsulate(struct sk_buff *skb, 
u8 tos) > @@ -848,10 +825,10 @@ static inline bool check_cache_valid(const struct > tnl_cache *cache, > (cache->flow && !cache->flow->dead)); > } > > -static int cache_cleaner_cb(struct tbl_node *tbl_node, void *aux) > +static void __cache_cleaner(struct tnl_vport *tnl_vport) > { > - struct tnl_vport *tnl_vport = tnl_vport_table_cast(tbl_node); > - const struct tnl_mutable_config *mutable = > rcu_dereference(tnl_vport->mutable); > + const struct tnl_mutable_config *mutable = > + rcu_dereference(tnl_vport->mutable); > const struct tnl_cache *cache = rcu_dereference(tnl_vport->cache); > > if (cache && !check_cache_valid(cache, mutable) && > @@ -859,16 +836,24 @@ static int cache_cleaner_cb(struct tbl_node *tbl_node, > void *aux) > assign_cache_rcu(tnl_vport_to_vport(tnl_vport), NULL); > spin_unlock_bh(&tnl_vport->cache_lock); > } > - > - return 0; > } > > static void cache_cleaner(struct work_struct *work) > { > + struct hlist_node *n; > + struct hlist_head *bucket; > + struct tnl_vport * tnl_vport; > + int i; > + > schedule_cache_cleaner(); > > rcu_read_lock(); > - tbl_foreach(rcu_dereference(port_table), cache_cleaner_cb, NULL); > + for (i = 0; i < PORT_TABLE_SIZE; i++) { > + bucket = &port_table[i]; > + hlist_for_each_entry_rcu(tnl_vport, n, bucket, hash_node) { > + __cache_cleaner(tnl_vport); > + } > + } > rcu_read_unlock(); > } > > @@ -949,12 +934,12 @@ static struct tnl_cache *build_cache(struct vport > *vport, > > if (is_internal_dev(rt_dst(rt).dev)) { > struct sw_flow_key flow_key; > - struct tbl_node *flow_node; > struct vport *dst_vport; > struct sk_buff *skb; > bool is_frag; > int err; > int flow_key_len; > + struct sw_flow *flow; > > dst_vport = internal_dev_get_vport(rt_dst(rt).dev); > if (!dst_vport) > @@ -974,13 +959,9 @@ static struct tnl_cache *build_cache(struct vport *vport, > if (err || is_frag) > goto done; > > - flow_node = tbl_lookup(rcu_dereference(dst_vport->dp->table), > - &flow_key, flow_key_len, > - flow_hash(&flow_key, flow_key_len), > - flow_cmp); > - if (flow_node) { > - struct sw_flow *flow = flow_cast(flow_node); > - > + flow = flow_lookup(rcu_dereference(dst_vport->dp->table), > + &flow_key, flow_key_len); > + if (flow) { > cache->flow = flow; > flow_hold(flow); > } > @@ -1498,9 +1479,8 @@ int tnl_set_options(struct vport *vport, struct nlattr > *options) > if (err) > goto error_free; > > - err = move_port(vport, mutable); > - if (err) > - goto error_free; > + if (mutable_hash(mutable) != mutable_hash(old_mutable)) > + move_port(vport, mutable); > > return 0; > > @@ -1598,3 +1578,27 @@ void tnl_free_linked_skbs(struct sk_buff *skb) > skb = next; > } > } > + > + > +void tnl_exit() > +{ > + struct hlist_node *n, *pos; > + struct hlist_head *hash_head; > + struct tnl_vport * tnl_vport; > + int i; > + > + if (!port_table) > + return; > + > + cancel_delayed_work_sync(&cache_cleaner_wq); > + for (i = 0; i < PORT_TABLE_SIZE; i++) { > + hash_head = &port_table[i]; > + hlist_for_each_entry_safe(tnl_vport, pos, n, hash_head, > + hash_node) { > + BUG(); > + goto out; > + } > + } > +out: > + kfree(port_table); > +} > diff --git a/datapath/tunnel.h b/datapath/tunnel.h > index b488e46..169c408 100644 > --- a/datapath/tunnel.h > +++ b/datapath/tunnel.h > @@ -13,7 +13,6 @@ > > #include "flow.h" > #include "openvswitch/tunnel.h" > -#include "table.h" > #include "vport.h" > > /* > @@ -187,7 +186,7 @@ struct tnl_cache { > > struct tnl_vport { > struct rcu_head rcu; > - struct tbl_node tbl_node; > + struct hlist_node hash_node; > > char name[IFNAMSIZ]; > const struct tnl_ops 
*tnl_ops; > @@ -236,6 +235,7 @@ bool tnl_frag_needed(struct vport *vport, > struct sk_buff *skb, unsigned int mtu, __be64 flow_key); > void tnl_free_linked_skbs(struct sk_buff *skb); > > +void tnl_exit(void); > static inline struct tnl_vport *tnl_vport_priv(const struct vport *vport) > { > return vport_priv(vport); > -- > 1.7.1 > > _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev
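For readers following the discussion without the kernel sources at hand, below is a minimal userspace C sketch of the bucket scheme the patch adopts: the bucket array is reached through several small, independently allocated parts rather than one large contiguous block (the role flex_array plays in the patch), and each bucket chains its entries (the role hlist plays). Every name in the sketch (demo_table, demo_flow, and so on) is illustrative only and does not exist in the OVS tree; this is an analogy to the approach, not the patch's code.

    /*
     * Userspace analogy of the flex_array + hlist flow table in the patch.
     * Illustrative names only; not OVS code.
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>

    struct demo_flow {
            struct demo_flow *next;   /* per-bucket chain; hlist_node in the kernel */
            uint32_t hash;
            uint32_t key;
    };

    #define N_BUCKETS         1024    /* must stay a power of two */
    #define BUCKETS_PER_PART  256     /* each part is a small, separate allocation */
    #define N_PARTS           (N_BUCKETS / BUCKETS_PER_PART)

    struct demo_table {
            /* Like flex_array: no single N_BUCKETS-sized contiguous block. */
            struct demo_flow **parts[N_PARTS];
            unsigned int count;
    };

    static struct demo_flow **demo_bucket(struct demo_table *t, uint32_t hash)
    {
            uint32_t idx = hash & (N_BUCKETS - 1);

            return &t->parts[idx / BUCKETS_PER_PART][idx % BUCKETS_PER_PART];
    }

    static struct demo_table *demo_alloc(void)
    {
            struct demo_table *t = calloc(1, sizeof(*t));
            int i;

            if (!t)
                    return NULL;
            for (i = 0; i < N_PARTS; i++) {
                    t->parts[i] = calloc(BUCKETS_PER_PART, sizeof(struct demo_flow *));
                    if (!t->parts[i]) {
                            while (i--)
                                    free(t->parts[i]);
                            free(t);
                            return NULL;
                    }
            }
            return t;
    }

    static void demo_insert(struct demo_table *t, struct demo_flow *f)
    {
            struct demo_flow **head = demo_bucket(t, f->hash);

            f->next = *head;          /* hlist_add_head_rcu() in the patch */
            *head = f;
            t->count++;
    }

    static struct demo_flow *demo_lookup(struct demo_table *t,
                                         uint32_t key, uint32_t hash)
    {
            struct demo_flow *f;

            for (f = *demo_bucket(t, hash); f; f = f->next)
                    if (f->hash == hash && f->key == key)
                            return f;
            return NULL;
    }

    int main(void)
    {
            struct demo_table *t = demo_alloc();
            struct demo_flow f = { .next = NULL, .hash = 0x2f3a6c01, .key = 42 };

            if (!t)
                    return 1;
            demo_insert(t, &f);
            printf("lookup hit: %s\n", demo_lookup(t, 42, f.hash) ? "yes" : "no");
            return 0;
    }

In the actual patch the same shape appears in flow_hash_alloc(), flow_insert(), and flow_lookup(), with jhash2() over the flow key supplying the hash and RCU (hlist_*_rcu, call_rcu) protecting readers while the table is replaced or expanded.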