On Thu, Feb 6, 2014 at 3:13 PM, Jarno Rajahalme <[email protected]> wrote:
> Keep kernel flow stats for each NUMA node rather than each (logical)
> CPU. This avoids using the per-CPU allocator and removes most of the [...]
>
> diff --git a/datapath/flow.c b/datapath/flow.c
> index abe6789..e86034e 100644
> --- a/datapath/flow.c
> +++ b/datapath/flow.c
> @@ -64,14 +64,10 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies)
>
> void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb)
> {
> - struct flow_stats *stats;
> + int node = numa_node_id();
> + struct flow_stats *stats = flow->stats[node];
> __be16 tcp_flags = 0;
>
> - if (!flow->stats.is_percpu)
> - stats = flow->stats.stat;
> - else
> - stats = this_cpu_ptr(flow->stats.cpu_stats);
> -
> if ((flow->key.eth.type == htons(ETH_P_IP) ||
> flow->key.eth.type == htons(ETH_P_IPV6)) &&
> flow->key.ip.frag != OVS_FRAG_TYPE_LATER &&
> @@ -80,96 +76,126 @@ void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb)
> tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb));
> }
>
> - spin_lock(&stats->lock);
> +       /* Check if we already have node-specific stats. */
> + if (likely(stats))
> + spin_lock(&stats->lock);
> + else {
> + stats = flow->stats[0]; /* Pre-allocated. */
> + spin_lock(&stats->lock);
> +
> +               /* If the current NUMA node is the only writer on the
> +                * pre-allocated stats, keep using them.
> + * A previous locker may have already allocated the stats,
> + * so we need to check again. If the node-specific stats
> + * were already allocated, we update the pre-allocated stats
> + * as we have already locked them. */
> +               if (likely(stats->last_writer != node && stats->last_writer >= 0
> + && !flow->stats[node])) {
> + /* Try to allocate node-specific stats. */
> + struct flow_stats *new_stats;
> +
> + new_stats = kmem_cache_alloc_node(flow_stats_cache,
> + GFP_THISNODE |
> + __GFP_NOMEMALLOC,
> + node);
> + if (likely(new_stats)) {
> + new_stats->used = jiffies;
> + new_stats->packet_count = 1;
> + new_stats->byte_count = skb->len;
> + new_stats->tcp_flags = tcp_flags;
> + new_stats->last_writer = node;
> + spin_lock_init(&new_stats->lock);
> +
> + flow->stats[node] = new_stats;
> +                               goto unlock; /* Unlock the pre-allocated stats. */
One more thing I noticed: we need a memory barrier when publishing an
entry in the per-NUMA stats array, since another CPU could be reading
the stats concurrently and must not see the new pointer before the
initialized contents of the struct it points to.
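For example, something along these lines should work (just a sketch, not
tested; it assumes the flow->stats[] pointers are annotated __rcu):

        /* Writer side: publish the node-specific stats only after all
         * fields have been initialized.  rcu_assign_pointer() issues
         * the needed write barrier, so a reader that sees the new
         * pointer also sees the initialized contents. */
        rcu_assign_pointer(flow->stats[node], new_stats);

        /* Reader side (ovs_flow_stats_get()/_clear()): pair with the
         * barrier above when loading the pointer. */
        struct flow_stats *stats = rcu_dereference(flow->stats[node]);

A plain smp_wmb() before the store, paired with smp_read_barrier_depends()
on the reader side, would also do, but the RCU accessors document the
intent and let sparse check the annotation.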
> + }
> + }
> + }
> +
> stats->used = jiffies;
> stats->packet_count++;
> stats->byte_count += skb->len;
> stats->tcp_flags |= tcp_flags;
> + stats->last_writer = node;
> +unlock:
> spin_unlock(&stats->lock);
> }
>
> -static void stats_read(struct flow_stats *stats, bool lock_bh,
> - struct ovs_flow_stats *ovs_stats,
> - unsigned long *used, __be16 *tcp_flags)
> -{
> - if (lock_bh)
> - spin_lock_bh(&stats->lock);
> - else
> - spin_lock(&stats->lock);
> -
> - if (time_after(stats->used, *used))
> - *used = stats->used;
> - *tcp_flags |= stats->tcp_flags;
> - ovs_stats->n_packets += stats->packet_count;
> - ovs_stats->n_bytes += stats->byte_count;
> -
> - if (lock_bh)
> - spin_unlock_bh(&stats->lock);
> - else
> - spin_unlock(&stats->lock);
> -}
> -
> void ovs_flow_stats_get(struct sw_flow *flow, struct ovs_flow_stats *ovs_stats,
> unsigned long *used, __be16 *tcp_flags)
> {
> - int cpu, cur_cpu;
> + int node, local_node;
> + struct flow_stats *stats;
>
> - *used = 0;
> - *tcp_flags = 0;
> - memset(ovs_stats, 0, sizeof(*ovs_stats));
> + preempt_disable();
> + local_node = numa_node_id();
>
> - if (!flow->stats.is_percpu) {
> -               stats_read(flow->stats.stat, true, ovs_stats, used, tcp_flags);
> + /* Start from the local node. */
> + stats = flow->stats[local_node];
> + if (stats && likely(stats->packet_count)) {
> + spin_lock_bh(&stats->lock);
> + *used = stats->used;
> + *tcp_flags = stats->tcp_flags;
> + ovs_stats->n_packets = stats->packet_count;
> + ovs_stats->n_bytes = stats->byte_count;
> + spin_unlock_bh(&stats->lock);
> } else {
> - cur_cpu = get_cpu();
> -
> - for_each_possible_cpu(cpu) {
> - struct flow_stats *stats;
> - bool lock_bh;
> + *used = 0;
> + *tcp_flags = 0;
> + ovs_stats->n_packets = 0;
> + ovs_stats->n_bytes = 0;
> + }
>
> - stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
> - lock_bh = (cpu == cur_cpu);
> -                       stats_read(stats, lock_bh, ovs_stats, used, tcp_flags);
> + /* Collect stats from other nodes. */
> + for_each_node(node) {
> + if (node == local_node)
> + continue; /* Done already. */
> + stats = flow->stats[node];
> + if (stats && likely(stats->packet_count)) {
> + spin_lock(&stats->lock);
> + if (time_after(stats->used, *used))
> + *used = stats->used;
> + *tcp_flags |= stats->tcp_flags;
> + ovs_stats->n_packets += stats->packet_count;
> + ovs_stats->n_bytes += stats->byte_count;
> + spin_unlock(&stats->lock);
> }
> - put_cpu();
> }
> -}
> -
> -static void stats_reset(struct flow_stats *stats, bool lock_bh)
> -{
> - if (lock_bh)
> - spin_lock_bh(&stats->lock);
> - else
> - spin_lock(&stats->lock);
> -
> - stats->used = 0;
> - stats->packet_count = 0;
> - stats->byte_count = 0;
> - stats->tcp_flags = 0;
> -
> - if (lock_bh)
> - spin_unlock_bh(&stats->lock);
> - else
> - spin_unlock(&stats->lock);
> + preempt_enable();
> }
>
> void ovs_flow_stats_clear(struct sw_flow *flow)
> {
> - int cpu, cur_cpu;
> -
> - if (!flow->stats.is_percpu) {
> - stats_reset(flow->stats.stat, true);
> - } else {
> - cur_cpu = get_cpu();
> + int node, local_node;
> + struct flow_stats *stats;
>
> - for_each_possible_cpu(cpu) {
> - bool lock_bh;
> + preempt_disable();
> + local_node = numa_node_id();
>
> - lock_bh = (cpu == cur_cpu);
> -                       stats_reset(per_cpu_ptr(flow->stats.cpu_stats, cpu), lock_bh);
> + /* Start from the current node. */
> + stats = flow->stats[local_node];
> + if (stats) {
> + spin_lock_bh(&stats->lock);
> + stats->used = 0;
> + stats->packet_count = 0;
> + stats->byte_count = 0;
> + stats->tcp_flags = 0;
> + spin_unlock_bh(&stats->lock);
> + }
> + for_each_node(node) {
> + if (node == local_node)
> + continue; /* Done already. */
> + stats = flow->stats[node];
> + if (stats) {
> + spin_lock(&stats->lock);
> + stats->used = 0;
> + stats->packet_count = 0;
> + stats->byte_count = 0;
> + stats->tcp_flags = 0;
> + spin_unlock(&stats->lock);
> }
> - put_cpu();
> }
> + preempt_enable();
> }
>