Re: [dpdk-dev] [PATCH v3 03/12] event/octeontx: add support to create and free timer adapter

2018-04-08 Thread Jerin Jacob
-Original Message-
> Date: Tue,  3 Apr 2018 20:35:05 +0530
> From: Pavan Nikhilesh 
> To: jerin.ja...@caviumnetworks.com, santosh.shu...@caviumnetworks.com,
>  erik.g.carri...@intel.com
> Cc: dev@dpdk.org, Pavan Nikhilesh 
> Subject: [dpdk-dev] [PATCH v3 03/12] event/octeontx: add support to create
>  and free timer adapter
> X-Mailer: git-send-email 2.16.3
> 
> When the application requests to create a timer device, Octeontx TIM
> create does the following:
> - Get the requested TIMvf ring based on adapter_id.
> - Verify the config parameters supplied.
> - Allocate the memory required for:
>   * Buckets, based on the min and max timeout supplied.
>   * The chunk pool, based on the number of timers.
> - Clear the interrupts.
> 
> On Free:
> - Free the allocated bucket and chunk memory.
> - Free private data used by TIMvf.
> 
> Signed-off-by: Pavan Nikhilesh 
> ---
> +static int
> +timvf_ring_create(struct rte_event_timer_adapter *adptr)
> +{
> +
> + switch (rcfg->clk_src) {
> + case RTE_EVENT_TIMER_ADAPTER_CPU_CLK:

While defining the enum, equate the TIM_CLK_SRC_SCLK, TIM_CLK_SRC_GPIO,
etc. values to RTE_EVENT_TIMER_ADAPTER_CPU_CLK,
RTE_EVENT_TIMER_ADAPTER_EXT_CLK0, etc. to avoid the switch case; see the
sketch below.
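
A minimal sketch of the idea (assuming the adapter clock-source enum
values stay contiguous, as they are in rte_event_timer_adapter.h):

enum timvf_clk_src {
	TIM_CLK_SRC_SCLK = RTE_EVENT_TIMER_ADAPTER_CPU_CLK,
	TIM_CLK_SRC_GPIO = RTE_EVENT_TIMER_ADAPTER_EXT_CLK0,
	TIM_CLK_SRC_GTI  = RTE_EVENT_TIMER_ADAPTER_EXT_CLK1,
	TIM_CLK_SRC_PTP  = RTE_EVENT_TIMER_ADAPTER_EXT_CLK2,
};

/* The switch then reduces to a bounds check plus a cast: */
if (rcfg->clk_src > RTE_EVENT_TIMER_ADAPTER_EXT_CLK2) {
	timvf_log_err("Invalid clk source specified.");
	goto cfg_err;
}
timr->clk_src = (enum timvf_clk_src)rcfg->clk_src;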

> + timr->clk_src = TIM_CLK_SRC_SCLK;
> + break;
> + case RTE_EVENT_TIMER_ADAPTER_EXT_CLK0:
> + timr->clk_src = TIM_CLK_SRC_GPIO;
> + break;
> + case RTE_EVENT_TIMER_ADAPTER_EXT_CLK1:
> + timr->clk_src = TIM_CLK_SRC_GTI;
> + break;
> + case RTE_EVENT_TIMER_ADAPTER_EXT_CLK2:
> + timr->clk_src = TIM_CLK_SRC_PTP;
> + break;
> + default:
> + timvf_log_err("Invalid clk source specified.");
> + goto cfg_err;
> + }
> +
> + timvf_write64(0, (uint8_t *)timr->vbar0 + TIM_VRING_BASE);
> + timvf_write64(0, (uint8_t *)timr->vbar0 + TIM_VF_NRSPERR_INT);
> + timvf_write64(0, (uint8_t *)timr->vbar0 + TIM_VF_NRSPERR_INT_W1S);
> + timvf_write64(0x7, (uint8_t *)timr->vbar0 + TIM_VF_NRSPERR_ENA_W1C);
> + timvf_write64(0x7, (uint8_t *)timr->vbar0 + TIM_VF_NRSPERR_ENA_W1S);
> +
> + return 0;
> +mem_err:
> + rte_free(timr);
> + return -ENOMEM;
> +cfg_err:
> + rte_free(timr);
> + return -EINVAL;
> +}
> +
> +static int
> +timvf_ring_free(struct rte_event_timer_adapter *adptr)
> +{
> + struct timvf_ring *timr = adptr->data->adapter_priv;
> + rte_mempool_free(timr->meta.chunk_pool);
> + rte_free(timr->meta.bkt);
> + rte_free(adptr->data->adapter_priv);
> + return 0;
> +}
> +
> +static struct rte_event_timer_adapter_ops timvf_ops = {

use const

> + .init   = timvf_ring_create,

Found an additional tab.

> + .uninit = timvf_ring_free,
> + .get_info   = timvf_ring_info_get,
> +};
> +
> +int
> +timvf_timer_adapter_caps_get(const struct rte_eventdev *dev, uint64_t flags,
> + uint32_t *caps, const struct rte_event_timer_adapter_ops **ops)
> +{
> + RTE_SET_USED(dev);
> + RTE_SET_USED(flags);
> + *caps = RTE_EVENT_TIMER_ADAPTER_CAP_INTERNAL_PORT;
> + *ops = &timvf_ops;
> + return -EINVAL;
> +}
> +enum timvf_clk_src {
> + TIM_CLK_SRC_SCLK,

See above comment.

> + TIM_CLK_SRC_GPIO,
> + TIM_CLK_SRC_GTI,
> + TIM_CLK_SRC_PTP,
> +};
> +
> +struct timvf_meta {
> + bkt_id get_target_bkt;
> + refill_chunk refill_chunk;
> + struct rte_reciprocal_u64 fast_div;
> + uint64_t ring_start_cyc;
> + uint32_t nb_bkts;
> + struct tim_mem_bucket *bkt;
> + void *chunk_pool;
> + uint64_t tck_int;
> +};
> +
> +struct timvf_ring {
> + struct timvf_meta meta;

IMO, the additional 'meta' indirection can be avoided to reduce the code
clutter.

> + uint64_t tck_nsec;
> + void  *vbar0;
> + void *bkt_pos;
> + uint64_t max_tout;
> + uint64_t nb_chunks;
> + enum timvf_clk_src clk_src;
> + uint16_t tim_ring_id;
> +} __rte_cache_aligned;
> +


Re: [dpdk-dev] [PATCH v3 04/12] event/octeontx: add support to start and stop timer device

2018-04-08 Thread Jerin Jacob
-Original Message-
> Date: Tue,  3 Apr 2018 20:35:06 +0530
> From: Pavan Nikhilesh 
> To: jerin.ja...@caviumnetworks.com, santosh.shu...@caviumnetworks.com,
>  erik.g.carri...@intel.com
> Cc: dev@dpdk.org, Pavan Nikhilesh 
> Subject: [dpdk-dev] [PATCH v3 04/12] event/octeontx: add support to start
>  and stop timer device
> X-Mailer: git-send-email 2.16.3
> 
> When the application requests to start the timer adapter through
> `rte_event_timer_adapter_start`, the Octeontx TIMvf ring does the
> following:
> - Uses mbox to communicate with the TIMpf driver to:
>   * get the SCLK frequency used to convert ns<->cycles.
>   * program the ring control parameters and start the ring.
>   * get the exact cycle at which the TIMvf ring started, which can be
>   used to estimate the bucket position.
> 
> On `rte_event_timer_adapter_stop`, the Octeontx TIMvf ring does the
> following:
> - Uses mbox to communicate with the TIMpf driver to:
>   * reset the ring control parameters and stop the ring.
> 
> Signed-off-by: Pavan Nikhilesh 
> ---
>  drivers/event/octeontx/timvf_evdev.c | 140 
> +++
>  drivers/event/octeontx/timvf_evdev.h |   5 ++
>  2 files changed, 145 insertions(+)
> 
> +/* Response messages */
> +enum {
> + MBOX_RET_SUCCESS,
> + MBOX_RET_INVALID,
> + MBOX_RET_INTERNAL_ERR,
> +};

If it is a duplicate definition, remove it.

> +
> +static int
> +timvf_mbox_dev_info_get(struct timvf_mbox_dev_info *info)
> +{
> + struct octeontx_mbox_hdr hdr = {0};
> + uint16_t len = sizeof(struct timvf_mbox_dev_info);
> +
> + hdr.coproc = TIM_COPROC;
> + hdr.msg = TIM_GET_DEV_INFO;
> + hdr.vfid = 0; /* TIM DEV is always 0. TIM RING ID changes. */
> +
> + memset(info, 0, len);
> + return octeontx_ssovf_mbox_send(&hdr, NULL, 0, info, len);

Rebase to the latest dpdk-next-eventdev, where the mbox API changed to
octeontx_mbox_send().

> +}
> +
>  static void
>  timvf_ring_info_get(const struct rte_event_timer_adapter *adptr,
>   struct rte_event_timer_adapter_info *adptr_info)
> @@ -27,6 +53,118 @@ timvf_ring_info_get(const struct rte_event_timer_adapter 
> *adptr,
>   sizeof(struct rte_event_timer_adapter_conf));
>  }
>  
> +static int
> +timvf_ring_start(const struct rte_event_timer_adapter *adptr)
> +{
> + int ret;
> + uint64_t interval = 0;

This assignment can be avoided.

> + struct timvf_ctrl_reg rctrl = {0};
> + struct timvf_mbox_dev_info dinfo;
> + struct timvf_ring *timr = adptr->data->adapter_priv;
> +
> + ret = timvf_mbox_dev_info_get(&dinfo);
> + if (ret < 0 || ret != sizeof(struct timvf_mbox_dev_info))
> + return -EINVAL;
> +
> + /* Calculate the interval cycles according to clock source. */
> + switch (timr->clk_src) {
> + case TIM_CLK_SRC_SCLK:
> + interval = NSEC2CLK(timr->tck_nsec, dinfo.clk_freq);
> + break;
> + case TIM_CLK_SRC_GPIO:
> + /* GPIO doesn't work on tck_nsec. */
> + interval = 0;
> + break;
> + case TIM_CLK_SRC_GTI:
> + interval = NSEC2CLK(timr->tck_nsec, dinfo.clk_freq);
> + break;
> + case TIM_CLK_SRC_PTP:
> + interval = NSEC2CLK(timr->tck_nsec, dinfo.clk_freq);
> + break;

Shouldn't we return an error if the clock source is not supported?
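
For example, a sketch reusing the -EINVAL convention already used
earlier in this function:

	default:
		timvf_log_err("Unsupported clock source.");
		return -EINVAL;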

> + }


> +
> + /*CTRL0 register.*/
> + rctrl.rctrl0 = interval;
> +
> + /*CTRL1 register.*/
> + rctrl.rctrl1 =  (uint64_t)(timr->clk_src) << 51 |
> + 1ull << 48 |
> + 1ull << 47 |
> + 1ull << 44 |

Add comments to these bit definitions.


> + (timr->meta.nb_bkts - 1);
> +
> + rctrl.rctrl2 = (uint64_t)(TIM_CHUNK_SIZE / 16) << 40;
> +
> + timvf_write64((uint64_t)timr->meta.bkt,
> + (uint8_t *)timr->vbar0 + TIM_VRING_BASE);
> + if (timvf_ring_conf_set(&rctrl, timr->tim_ring_id)) {
> + ret = -EACCES;
> + goto error;
> + }
> +
> + if (timvf_get_start_cyc(&timr->meta.ring_start_cyc,
> + timr->tim_ring_id) < 0) {
> + ret = -EACCES;
> + goto error;
> + }
> + timr->meta.tck_int = NSEC2CLK(timr->tck_nsec, rte_get_timer_hz());
> + timr->meta.fast_div = rte_reciprocal_value_u64(timr->meta.tck_int);
> + timvf_log_info("nb_bkts %d min_ns %"PRIu64" min_cyc %"PRIu64""
> + " maxtmo %"PRIu64"\n",
> + timr->meta.nb_bkts, timr->tck_nsec, interval,
> + timr->max_tout);
> +
> + return 0;
> +error:
> + rte_free(timr->meta.bkt);
> + rte_mempool_free(timr->meta.chunk_pool);
> + return ret;
> +}
> +
> +static int
> +timvf_ring_stop(const struct rte_event_timer_adapter *adptr)
> +{
> + struct timvf_ring *timr = adptr->data->adapter_priv;
> + struct timvf_ctrl_reg rctrl = {0};
> + rctrl.rctrl0 = timvf_read64((uint8_t *)timr->vbar0 + TIM_VRING_CTL0);
> +

Re: [dpdk-dev] [PATCH v3 05/12] event/octeontx: add event timer stats get and reset

2018-04-08 Thread Jerin Jacob
-Original Message-
> Date: Tue,  3 Apr 2018 20:35:07 +0530
> From: Pavan Nikhilesh 
> To: jerin.ja...@caviumnetworks.com, santosh.shu...@caviumnetworks.com,
>  erik.g.carri...@intel.com
> Cc: dev@dpdk.org, Pavan Nikhilesh 
> Subject: [dpdk-dev] [PATCH v3 05/12] event/octeontx: add event timer stats
>  get and reset
> X-Mailer: git-send-email 2.16.3
> 
> Add functions to get and reset event timer adapter stats.
> 
> Signed-off-by: Pavan Nikhilesh 
> ---
>  drivers/event/octeontx/timvf_evdev.c | 26 ++
>  drivers/event/octeontx/timvf_evdev.h |  1 +
>  2 files changed, 27 insertions(+)
> 
> diff --git a/drivers/event/octeontx/timvf_evdev.c 
> b/drivers/event/octeontx/timvf_evdev.c
> index ccf724115..4db10cdd0 100644
> --- a/drivers/event/octeontx/timvf_evdev.c
> +++ b/drivers/event/octeontx/timvf_evdev.c
> @@ -281,12 +281,38 @@ timvf_ring_free(struct rte_event_timer_adapter *adptr)
>   return 0;
>  }
>  
> +
> +static int
> +timvf_stats_get(const struct rte_event_timer_adapter *adapter,
> + struct rte_event_timer_adapter_stats *stats)
> +{
> + struct timvf_ring *timr = adapter->data->adapter_priv;
> + uint64_t bkt_cyc = rte_rdtsc() - timr->meta.ring_start_cyc;
> +
> + stats->evtim_exp_count = timr->meta.tim_arm_cnt;
> + stats->ev_enq_count = timr->meta.tim_arm_cnt;

We are updating these counts in the fastpath, right? As an optimization,
maybe we could take arguments from the vdev to choose whether to enable
"stats" at runtime, by making this two function pointers. By default it
can be disabled.
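
A hypothetical sketch of that suggestion (the *_stats wrapper and the
stats_en devarg are illustrative names, not from this series):

/* Counter is only touched in the stats-enabled variant. */
static uint16_t
timvf_timer_reg_burst_stats(const struct rte_event_timer_adapter *adptr,
		struct rte_event_timer **tim, const uint16_t nb_timers)
{
	struct timvf_ring *timr = adptr->data->adapter_priv;
	uint16_t ret = timvf_timer_reg_burst_mp(adptr, tim, nb_timers);

	timr->meta.tim_arm_cnt += ret;
	return ret;
}

/* Selected once, at adapter setup, from a vdev argument: */
timvf_ops.arm_burst = stats_en ?
	timvf_timer_reg_burst_stats : timvf_timer_reg_burst_mp;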


Re: [dpdk-dev] [PATCH v3 06/12] event/octeontx: add multiproducer timer arm and cancel

2018-04-08 Thread Jerin Jacob
-Original Message-
> Date: Tue,  3 Apr 2018 20:35:08 +0530
> From: Pavan Nikhilesh 
> To: jerin.ja...@caviumnetworks.com, santosh.shu...@caviumnetworks.com,
>  erik.g.carri...@intel.com
> Cc: dev@dpdk.org, Pavan Nikhilesh 
> Subject: [dpdk-dev] [PATCH v3 06/12] event/octeontx: add multiproducer
>  timer arm and cancel
> X-Mailer: git-send-email 2.16.3
> 
> Signed-off-by: Pavan Nikhilesh 
> ---
> +static inline int16_t
> +timr_bkt_get_rem(struct tim_mem_bucket *bktp)
> +{
> + return __atomic_load_n((int16_t *)&bktp->chunk_remainder,
> + __ATOMIC_ACQUIRE);
> +}
> +
> +static inline void
> +timr_bkt_set_rem(struct tim_mem_bucket *bktp, uint16_t v)
> +{
> + __atomic_store_n((int16_t *)&bktp->chunk_remainder, v,

A typecast is not required for the __atomic_* GCC atomic builtins; see
the sketch below.

> + __ATOMIC_RELEASE);
> +}
> +
> +static inline void
> +timr_bkt_sub_rem(struct tim_mem_bucket *bktp, uint16_t v)
> +{
> + __atomic_fetch_sub((int16_t *)&bktp->chunk_remainder, v,
> + __ATOMIC_RELEASE);
> +}
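
A sketch of the suggested form (assuming chunk_remainder is declared as
int16_t in struct tim_mem_bucket, so the pointer type already carries
the right width):

static inline void
timr_bkt_set_rem(struct tim_mem_bucket *bktp, uint16_t v)
{
	__atomic_store_n(&bktp->chunk_remainder, v, __ATOMIC_RELEASE);
}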
> +
> +/* Multi producer functions. */
> +static inline int
> +timvf_add_entry_mp(struct timvf_ring * const timr, const uint32_t rel_bkt,
> + struct rte_event_timer * const tim,
> + const struct tim_mem_entry * const pent)
> +{
> + uint8_t lock_cnt;
> + int16_t rem;
> + uint64_t lock_sema;
> + struct tim_mem_bucket *bkt;
> + struct tim_mem_entry *chunk;
> +
> +__retry:
> + bkt = timvf_get_target_bucket(timr, rel_bkt);
> + /* Bucket related checks. */
> + /*Get Bucket sema*/
> + lock_sema = timr_bkt_fetch_sema_lock(bkt);
> + if (unlikely(timr_bkt_get_shbt(lock_sema))) {
> + timr_bkt_dec_lock(bkt);
> + goto __retry;
> + }
> +
> + RTE_SET_USED(lock_cnt);

lock_cnt is not used. Remove it.

With above changes:
Acked-by: Jerin Jacob 




Re: [dpdk-dev] [PATCH v3 08/12] event/octeontx: add burst mode for timer arm

2018-04-08 Thread Jerin Jacob
-Original Message-
> Date: Tue,  3 Apr 2018 20:35:10 +0530
> From: Pavan Nikhilesh 
> To: jerin.ja...@caviumnetworks.com, santosh.shu...@caviumnetworks.com,
>  erik.g.carri...@intel.com
> Cc: dev@dpdk.org, Pavan Nikhilesh 
> Subject: [dpdk-dev] [PATCH v3 08/12] event/octeontx: add burst mode for
>  timer arm
> X-Mailer: git-send-email 2.16.3
> 
> Signed-off-by: Pavan Nikhilesh 
> ---
>  drivers/event/octeontx/timvf_evdev.c  |  1 +
>  drivers/event/octeontx/timvf_evdev.h  |  3 ++
>  drivers/event/octeontx/timvf_worker.c | 38 ++
>  drivers/event/octeontx/timvf_worker.h | 95 
> +++
>  4 files changed, 137 insertions(+)
> 
> diff --git a/drivers/event/octeontx/timvf_evdev.c 
> b/drivers/event/octeontx/timvf_evdev.c
> index a32892107..b23500e0d 100644
> --- a/drivers/event/octeontx/timvf_evdev.c
> +++ b/drivers/event/octeontx/timvf_evdev.c
> @@ -333,6 +333,7 @@ timvf_timer_adapter_caps_get(const struct rte_eventdev 
> *dev, uint64_t flags,
>   else
>   timvf_ops.arm_burst = timvf_timer_reg_burst_mp;
>  
> + timvf_ops.arm_tmo_tick_burst = timvf_timer_reg_brst;

IMO, to be in line with the spec names, maybe we could use
arm_burst/cancel_burst instead of reg_brst or unreg_burst.

>   timvf_ops.cancel_burst = timvf_timer_unreg_burst;
>  
>   *caps = RTE_EVENT_TIMER_ADAPTER_CAP_INTERNAL_PORT;
> diff --git a/drivers/event/octeontx/timvf_evdev.h 
> b/drivers/event/octeontx/timvf_evdev.h
> index ab2de678f..d8a6d111f 100644
> --- a/drivers/event/octeontx/timvf_evdev.h
> +++ b/drivers/event/octeontx/timvf_evdev.h
> @@ -200,6 +200,9 @@ uint16_t timvf_timer_reg_burst_sp(const struct 
> rte_event_timer_adapter *adptr,
>   struct rte_event_timer **tim, const uint16_t nb_timers);
>  uint16_t timvf_timer_reg_burst_mp(const struct rte_event_timer_adapter 
> *adptr,
>   struct rte_event_timer **tim, const uint16_t nb_timers);
> +uint16_t timvf_timer_reg_brst(const struct rte_event_timer_adapter *adptr,
> + struct rte_event_timer **tim, const uint64_t timeout_tick,
> + const uint16_t nb_timers);
>  void timvf_set_chunk_refill(struct timvf_ring * const timr);
>  
>  #endif /* __TIMVF_EVDEV_H__ */
> diff --git a/drivers/event/octeontx/timvf_worker.c 
> b/drivers/event/octeontx/timvf_worker.c
> index 139dfdc07..f4f40d150 100644
> --- a/drivers/event/octeontx/timvf_worker.c
> +++ b/drivers/event/octeontx/timvf_worker.c
> @@ -113,6 +113,44 @@ timvf_timer_reg_burst_mp(const struct 
> rte_event_timer_adapter *adptr,
>   return index;
>  }
>  
> +/* Burst mode functions */
> +static inline int
> +timvf_add_entry_brst(struct timvf_ring * const timr, const uint16_t rel_bkt,
> + struct rte_event_timer ** const tim,
> + const struct tim_mem_entry *ents,
> + const uint16_t nb_timers)
> +{
> + int16_t rem;
> + int16_t crem = 0;
> + uint8_t lock_cnt;
> + uint16_t index = 0;
> + uint16_t chunk_remainder = 0;

Looks like all of the above assignments to zero are unnecessary.



Re: [dpdk-dev] [PATCH v3 09/12] event/octeontx: optimize timer adapter resolution parameters

2018-04-08 Thread Jerin Jacob
-Original Message-
> Date: Tue,  3 Apr 2018 20:35:11 +0530
> From: Pavan Nikhilesh 
> To: jerin.ja...@caviumnetworks.com, santosh.shu...@caviumnetworks.com,
>  erik.g.carri...@intel.com
> Cc: dev@dpdk.org, Pavan Nikhilesh 
> Subject: [dpdk-dev] [PATCH v3 09/12] event/octeontx: optimize timer adapter
>  resolution parameters
> X-Mailer: git-send-email 2.16.3
> 
> When the application sets the `RTE_EVENT_TIMER_ADAPTER_F_ADJUST_RES`
> flag while creating the adapter, the underlying driver is free to
> optimize the resolution for the best possible configuration.
> 
> Signed-off-by: Pavan Nikhilesh 
> ---
>  static int
>  timvf_ring_start(const struct rte_event_timer_adapter *adptr)
>  {
> @@ -217,7 +256,7 @@ timvf_ring_create(struct rte_event_timer_adapter *adptr)
>   }
>  
>   timr->tim_ring_id = adptr->data->id;
> - timr->tck_nsec = rcfg->timer_tick_ns;
> + timr->tck_nsec = RTE_ALIGN_MUL_CEIL(rcfg->timer_tick_ns, 10);
>   timr->max_tout = rcfg->max_tmo_ns;
>   timr->meta.nb_bkts = (timr->max_tout / timr->tck_nsec) + 1;
>   timr->vbar0 = octeontx_timvf_bar(timr->tim_ring_id, 0);
> @@ -227,6 +266,13 @@ timvf_ring_create(struct rte_event_timer_adapter *adptr)
>  
>   timr->nb_chunks = nb_timers / nb_chunk_slots;
>  
> + /* Try to optimize the bucket parameters. */
> + if ((rcfg->flags & RTE_EVENT_TIMER_ADAPTER_F_ADJUST_RES)
> + && !rte_is_power_of_2(timr->meta.nb_bkts)) {
> + optimize_bucket_parameters(timr);
> + timvf_log_info("Optimizing configured values");

You could print the adjusted values here.
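
For example, a sketch using the fields already present in this patch:

	timvf_log_info("Optimized configured values: tck_nsec %"PRIu64
			" nb_bkts %"PRIu32, timr->tck_nsec,
			timr->meta.nb_bkts);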

> + }
> +
>   if (rcfg->flags & RTE_EVENT_TIMER_ADAPTER_F_SP_PUT) {
>   mp_flags = MEMPOOL_F_SP_PUT | MEMPOOL_F_SC_GET;
>   timvf_log_info("Using single producer mode");
> diff --git a/drivers/event/octeontx/timvf_evdev.h 
> b/drivers/event/octeontx/timvf_evdev.h
> index d8a6d111f..22c8c2266 100644
> --- a/drivers/event/octeontx/timvf_evdev.h
> +++ b/drivers/event/octeontx/timvf_evdev.h
> @@ -192,6 +192,12 @@ bkt_mod(const uint32_t rel_bkt, const uint32_t nb_bkts)
>   return rel_bkt % nb_bkts;
>  }
>  
> +static __rte_always_inline uint32_t __hot

__hot may not be required here, as it is an inline function.

> +bkt_and(uint32_t rel_bkt, uint32_t nb_bkts)
> +{
> + return rel_bkt & (nb_bkts - 1);
> +}
> +
>  int timvf_timer_adapter_caps_get(const struct rte_eventdev *dev, uint64_t 
> flags,
>   uint32_t *caps, const struct rte_event_timer_adapter_ops **ops);
>  uint16_t timvf_timer_unreg_burst(const struct rte_event_timer_adapter *adptr,

With above change:
Acked-by: Jerin Jacob 




Re: [dpdk-dev] [PATCH v3 11/12] doc: update eventdev OcteonTx documentation

2018-04-08 Thread Jerin Jacob
-Original Message-
> Date: Tue,  3 Apr 2018 20:35:13 +0530
> From: Pavan Nikhilesh 
> To: jerin.ja...@caviumnetworks.com, santosh.shu...@caviumnetworks.com,
>  erik.g.carri...@intel.com
> Cc: dev@dpdk.org, Pavan Nikhilesh 
> Subject: [dpdk-dev] [PATCH v3 11/12] doc: update eventdev OcteonTx
>  documentation
> X-Mailer: git-send-email 2.16.3
> 
> Signed-off-by: Pavan Nikhilesh 
> ---
>  doc/guides/eventdevs/octeontx.rst | 9 +
>  1 file changed, 9 insertions(+)
> 
> diff --git a/doc/guides/eventdevs/octeontx.rst 
> b/doc/guides/eventdevs/octeontx.rst
> index 4fabe54f9..f4f5473fd 100644
> --- a/doc/guides/eventdevs/octeontx.rst
> +++ b/doc/guides/eventdevs/octeontx.rst
> @@ -28,6 +28,9 @@ Features of the OCTEONTX SSOVF PMD are:
>  - Open system with configurable amount of outstanding events
>  - HW accelerated dequeue timeout support to enable power management
>  - SR-IOV VF
> +- HW managed event timers support through TIMVF, with high precision and
> +  time granularity of 1us.
> +- Upto 64 event timer adapters.

s/Upto/Up to/

>  
>  Supported OCTEONTX SoCs
>  ---
> @@ -110,3 +113,9 @@ Rx adapter support
>  
>  When eth_octeontx is used as Rx adapter event schedule type
>  ``RTE_SCHED_TYPE_PARALLEL`` is not supported.
> +
> +Event timer adapter support
> +~~~
> +
> +When timvf is used as Event timer adapter event schedule type
> +``RTE_SCHED_TYPE_PARALLEL`` is not supported.

You could add the RTE_EVENT_TIMER_ADAPTER_EXT_CLK0..
RTE_EVENT_TIMER_ADAPTER_EXT_CLK3 platform mapping here.
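
For example, a sketch (the mapping follows the clock-source switch in
patch 03/12; EXT_CLK3 has no mapping there):

  ``RTE_EVENT_TIMER_ADAPTER_CPU_CLK``  - TIM_CLK_SRC_SCLK
  ``RTE_EVENT_TIMER_ADAPTER_EXT_CLK0`` - TIM_CLK_SRC_GPIO
  ``RTE_EVENT_TIMER_ADAPTER_EXT_CLK1`` - TIM_CLK_SRC_GTI
  ``RTE_EVENT_TIMER_ADAPTER_EXT_CLK2`` - TIM_CLK_SRC_PTP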


With above changes:
Acked-by: Jerin Jacob 


> -- 
> 2.16.3
> 


Re: [dpdk-dev] [PATCH v3 12/12] maintainers: claim responsibility for octeontx timvf

2018-04-08 Thread Jerin Jacob
-Original Message-
> Date: Tue,  3 Apr 2018 20:35:14 +0530
> From: Pavan Nikhilesh 
> To: jerin.ja...@caviumnetworks.com, santosh.shu...@caviumnetworks.com,
>  erik.g.carri...@intel.com
> Cc: dev@dpdk.org, Pavan Nikhilesh 
> Subject: [dpdk-dev] [PATCH v3 12/12] maintainers: claim responsibility for
>  octeontx timvf
> X-Mailer: git-send-email 2.16.3
> 
> Signed-off-by: Pavan Nikhilesh 
> ---
>  MAINTAINERS | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index d4c0cc1bc..4a4a12d3d 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -736,6 +736,10 @@ M: Santosh Shukla 
>  F: drivers/event/octeontx/
>  F: doc/guides/eventdevs/octeontx.rst
>  
> +Cavium OCTEONTX timvf
> +M: Pavan Nikhilesh 
> +F: drivers/event/octeontx/timvf_*

Please squash this patch with the previous doc patch.

Acked-by: Jerin Jacob 

> +
>  NXP DPAA2 eventdev
>  M: Hemant Agrawal 
>  M: Nipun Gupta 
> -- 
> 2.16.3
> 


Re: [dpdk-dev] [PATCH v4 0/3] net/mlx5: use Netlink in VF mode

2018-04-08 Thread Shahaf Shuler
Thursday, April 5, 2018 6:07 PM, Nelio Laranjeiro:
> Subject: [dpdk-dev] [PATCH v4 0/3] net/mlx5: use Netlink in VF mode
> 
> When MLX5 behaves in VF mode and the hypervisor has **trusted** this
> VF, some requests must be made to configure the NIC so that it can
> receive specific traffic. There is currently no API available to do
> this through Verbs, but there is one on the Linux side, using Netlink.
> 
> The specific cases are:
> - Enable/disable promiscuous mode.
> - Enable/disable allmulti mode.
> - Add/remove mac addresses.

Series applied to next-net-mlx, thanks.





Re: [dpdk-dev] [PATCH] doc: add timestamp offload for mlx5 to features table

2018-04-08 Thread Shahaf Shuler
Thursday, April 5, 2018 6:50 PM, Adrien Mazarguil:
> Subject: Re: [dpdk-dev] [PATCH] doc: add timestamp offload for mlx5 to
> features table
> 
> For completeness:
> 
> Fixes: 78c7406b7b5a ("net/mlx5: add Rx HW timestamp")
> Cc: sta...@dpdk.org
> 
> On Mon, Apr 02, 2018 at 10:01:22AM -0700, Yongseok Koh wrote:
> > Signed-off-by: Yongseok Koh 
> 
> Acked-by: Adrien Mazarguil 

Applied to next-net-mlx with Adrien's additions, thanks. 



Re: [dpdk-dev] [PATCH v3 00/12] event/octeontx: add event timer adapter driver

2018-04-08 Thread Pavan Nikhilesh
Hi Jerin,

On Sun, Apr 08, 2018 at 08:25:35AM +0530, Jerin Jacob wrote:
> -Original Message-
> > Date: Tue,  3 Apr 2018 20:35:02 +0530
> > From: Pavan Nikhilesh 
> > To: jerin.ja...@caviumnetworks.com, santosh.shu...@caviumnetworks.com,
> >  erik.g.carri...@intel.com
> > Cc: dev@dpdk.org, Pavan Nikhilesh 
> > Subject: [dpdk-dev] [PATCH v3 00/12] event/octeontx: add event timer
> >  adapter driver
> > X-Mailer: git-send-email 2.16.3
> >
> > The event timer adapter[1] provides APIs to configure an event timer device
> > that allows an application to arm timers which on expiry push events to an
> > event device such as OcteonTx SSO.
> > The OcteonTx TIM is a co-processor that can be configured as an event timer
> > adapter which can be used by an application to manage event timers.
> >
> > The TIM co-processor processes the event timers registered and pushes
> > expired event timers to SSO based on the event queue, schedule type, flow
> > id etc. provided as rte_event while arming the event timer. It maintains
> > event timers with high precision and time granularity of 1us (microsecond).
> >
> > [1] http://dpdk.org/dev/patchwork/patch/33525/
> >
> > This patch set depends on:
> > 1. http://dpdk.org/dev/patchwork/bundle/pbhagavatula/event_timerdev_v9
> > 2. http://dpdk.org/dev/patchwork/patch/35216/
> > 3. http://dpdk.org/dev/patchwork/patch/36089/
> >
> > v3 Changes:
> > - change datapath function return from int to uint16_t.
> > - Add missing state checks while arm/cancel.
> > - add stats.
> >
> > v2 Changes:
> > - Move common code changes to separate patches.
> > - Reduce ifdef clutter.
> > - fix improper return value (erik).
> > - remove redundent ops assignment (erik).
> > - update octeontx documentation.
>
> Please check the following :
>
> 1) It has a build issue with the 32-bit build (gcc 7.3.1)
>
> 2) clang6 shows some crazy link errors, even though the following
> symbol is not used. Looks like we need -latomic:
> undefined reference to `__atomic_fetch_add_8'
>
> 3) Fix check-git-log.sh issues
> $./devtools/check-git-log.sh
> Wrong headline lowercase:
>   event/octeontx: add multiproducer timer arm and cancel
>   event/octeontx: add single producer timer arm variant
>   event/octeontx: add burst mode for timer arm

Thanks for the review, will send out v4 with the changes.

Pavan.

>


[dpdk-dev] [PATCH v5] vhost: add support for interrupt mode

2018-04-08 Thread Junjie Chen
In some cases we want vhost dequeue to work in interrupt mode, to
release CPUs to others when there is no data to transmit. So we install
an interrupt handler for the vhost device, plus interrupt vectors for
each rx queue, when creating a new backend, according to the vhost
interrupt configuration. Thus, applications can register an epoll event
fd to associate rx queues with interrupt vectors, as sketched below.
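
A hypothetical usage sketch (not part of this patch): an application
associates rxq 0 of a vhost port with the EAL per-thread epoll instance
and blocks until the guest kicks the queue. All calls below are existing
ethdev/EAL Rx interrupt APIs; error handling is trimmed for brevity.

#include <rte_ethdev.h>
#include <rte_interrupts.h>

static int
wait_for_rx(uint16_t port_id, uint16_t queue_id)
{
	struct rte_epoll_event ev;

	/* Register the queue interrupt with the per-thread epoll fd. */
	if (rte_eth_dev_rx_intr_ctl_q(port_id, queue_id,
			RTE_EPOLL_PER_THREAD, RTE_INTR_EVENT_ADD, NULL) < 0)
		return -1;
	/* Ask the PMD to enable guest notification for this queue. */
	if (rte_eth_dev_rx_intr_enable(port_id, queue_id) < 0)
		return -1;
	/* Sleep until the vring interrupt fires or 100 ms elapse. */
	return rte_epoll_wait(RTE_EPOLL_PER_THREAD, &ev, 1, 100);
}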

Signed-off-by: Junjie Chen 
---
Changes in v5:
- update license to DPDK new license format
- rebase code to master 
Changes in v4:
- revert back license change
Changes in v3:
- handle failure in the middle of intr setup.
- use vhost API to enable interrupt.
- rebase to check rxq existence.
- update vhost API to support guest notification.
Changes in v2:
- update rx queue index.
- fill efd_counter_size for intr handler.
- update log.
 drivers/net/vhost/rte_eth_vhost.c | 267 +-
 lib/librte_vhost/vhost.c  |  13 +-
 2 files changed, 212 insertions(+), 68 deletions(-)

diff --git a/drivers/net/vhost/rte_eth_vhost.c 
b/drivers/net/vhost/rte_eth_vhost.c
index 11b6076..536e089 100644
--- a/drivers/net/vhost/rte_eth_vhost.c
+++ b/drivers/net/vhost/rte_eth_vhost.c
@@ -1,35 +1,8 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright (c) 2016 IGEL Co., Ltd.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- * * Redistributions of source code must retain the above copyright
- *   notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- *   notice, this list of conditions and the following disclaimer in
- *   the documentation and/or other materials provided with the
- *   distribution.
- * * Neither the name of IGEL Co.,Ltd. nor the names of its
- *   contributors may be used to endorse or promote products derived
- *   from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016 IGEL Co., Ltd.
+ * Copyright(c) 2016-2018 Intel Corporation
  */
+
 #include 
 #include 
 #include 
@@ -528,10 +501,13 @@ update_queuing_status(struct rte_eth_dev *dev)
unsigned int i;
int allow_queuing = 1;
 
-   if (rte_atomic32_read(&internal->dev_attached) == 0)
+   if (!dev->data->rx_queues || !dev->data->tx_queues) {
+   RTE_LOG(ERR, PMD, "RX/TX queues not setup yet\n");
return;
+   }
 
-   if (rte_atomic32_read(&internal->started) == 0)
+   if (rte_atomic32_read(&internal->started) == 0 ||
+   rte_atomic32_read(&internal->dev_attached) == 0)
allow_queuing = 0;
 
/* Wait until rx/tx_pkt_burst stops accessing vhost device */
@@ -554,25 +530,157 @@ update_queuing_status(struct rte_eth_dev *dev)
}
 }
 
+static int
+eth_rxq_intr_enable(struct rte_eth_dev *dev, uint16_t qid)
+{
+   struct rte_vhost_vring vring;
+   struct vhost_queue *vq;
+   int ret = 0;
+
+   vq = dev->data->rx_queues[qid];
+   if (!vq) {
+   RTE_LOG(ERR, PMD, "rxq%d is not setup yet\n", qid);
+   return -1;
+   }
+
+   ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
+   if (ret < 0) {
+   RTE_LOG(ERR, PMD, "Failed to get rxq%d's vring\n", qid);
+   return ret;
+   }
+   RTE_LOG(INFO, PMD, "Enable interrupt for rxq%d\n", qid);
+   rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 1);
+   rte_wmb();
+
+   return ret;
+}
+
+static int
+eth_rxq_intr_disable(struct rte_eth_dev *dev, uint16_t qid)
+{
+   struct rte_vhost_vring vring;
+   struct vhost_queue *vq;
+   int ret = 0;
+
+   vq = dev->data->rx_queues[qid];
+   if (!vq) {
+   RTE_LOG(ERR, PMD, "rxq%d is not setup yet\n", qid);
+   return -1;
+   }
+
+   ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
+   if (ret < 0) {
+   RTE_LOG(ERR, PMD, "Failed to get rxq%d's vring", qid);
+ 

[dpdk-dev] [PATCH v4 0/4] support Tx generic tunnel checksum and TSO

2018-04-08 Thread Xueming Li
V4: 
- Removed DEV_TX_OFFLOAD_GENERIC_TNL_CKSUM and DEV_TX_OFFLOAD_GENERIC_TNL_TSO
- Replaced with DEV_TX_OFFLOAD_IP_TNL_TSO
- Removed PKT_TX_OUTER_UDP
- Split PKT_TX_TUNNEL_UNKNOWN into PKT_TX_TUNNEL_IP and PKT_TX_TUNNEL_UDP

V3:
- Add VXLAN-GPE and GRE extension support to the testpmd csum forwarding engine
- Split DEV_TX_OFFLOAD_GENERIC_TNL_CKSUM_TSO into 
DEV_TX_OFFLOAD_GENERIC_TNL_CKSUM
  and DEV_TX_OFFLOAD_GENERIC_TNL_TSO
- Add PKT_TX_TUNNEL_UNKNOWN and PKT_TX_OUTER_UDP

  http://www.dpdk.org/dev/patchwork/patch/34655/


This patchset introduces a new HW capability for generic tunnel checksum
and TSO offloads; HW supporting generic tunnel checksum and TSO can
offload new tunnel types without a HW upgrade.
The new generic tunnel checksum and TSO offloads support IP tunnels and
UDP tunnels. Please note that tunnel types that carry a length, sequence
id or checksum are not considered generic tunnel types.

Xueming Li (4):
  ethdev: introduce Tx generic tunnel L3/L4 offload
  app/testpmd: testpmd support Tx generic tunnel offloads
  app/testpmd: add more GRE extension to csum engine
  app/testpmd: introduce VXLAN GPE to csum forwarding engine

 app/test-pmd/cmdline.c|  14 -
 app/test-pmd/config.c |  17 +
 app/test-pmd/csumonly.c   | 115 ++
 app/test-pmd/parameters.c |  12 +++-
 app/test-pmd/testpmd.h|   2 +
 doc/guides/testpmd_app_ug/run_app.rst |   5 ++
 lib/librte_ether/rte_ethdev.h |   4 ++
 lib/librte_mbuf/rte_mbuf.c|   6 ++
 lib/librte_mbuf/rte_mbuf.h|   2 +
 9 files changed, 163 insertions(+), 14 deletions(-)

-- 
2.13.3



[dpdk-dev] [PATCH v4 4/4] app/testpmd: introduce VXLAN GPE to csum forwarding engine

2018-04-08 Thread Xueming Li
This patch introduces VXLAN-GPE support to the csum forwarding engine by
recognizing the VXLAN-GPE UDP port and parsing the tunnel payload
according to the next-protocol type.
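
A usage sketch (assuming the command-line option added in the
parameters.c/run_app.rst hunks below is named --vxlan-gpe-port, matching
the vxlan_gpe_udp_port default of 4790):

    ./testpmd -c 0x3 -- --forward-mode=csum --vxlan-gpe-port=4790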

Signed-off-by: Xueming Li 
---
 app/test-pmd/csumonly.c   | 96 +--
 app/test-pmd/parameters.c | 12 -
 app/test-pmd/testpmd.h|  2 +
 doc/guides/testpmd_app_ug/run_app.rst |  5 ++
 4 files changed, 110 insertions(+), 5 deletions(-)

diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
index 00ec40d58..d8b214e6d 100644
--- a/app/test-pmd/csumonly.c
+++ b/app/test-pmd/csumonly.c
@@ -56,6 +56,10 @@
 #define GRE_SUPPORTED_FIELDS   (GRE_CHECKSUM_PRESENT | GRE_KEY_PRESENT |\
 GRE_SEQUENCE_PRESENT)
 
+#define VXLAN_GPE_TYPE_IPv4 1
+#define VXLAN_GPE_TYPE_IPv6 2
+#define VXLAN_GPE_TYPE_ETH 3
+
 /* We cannot use rte_cpu_to_be_16() on a constant in a switch/case */
 #if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
 #define _htons(x) ((uint16_t)((((x) & 0x00ffU) << 8) | (((x) & 0xff00U) >> 8)))
@@ -63,6 +67,8 @@
 #define _htons(x) (x)
 #endif
 
+uint16_t vxlan_gpe_udp_port = 4790;
+
 /* structure that caches offload info for the current packet */
 struct testpmd_offload_info {
uint16_t ethertype;
@@ -87,6 +93,14 @@ struct simple_gre_hdr {
uint16_t proto;
 } __attribute__((__packed__));
 
+/* simplified VXLAN-GPE header */
+struct vxlan_gpe_hdr {
+   uint8_t vx_flags; /**< flag (8). */
+   uint8_t reserved[2]; /**< Reserved (16). */
+   uint8_t proto; /**< next-protocol (8). */
+   uint32_t vx_vni;   /**< VNI (24) + Reserved (8). */
+} __attribute__((__packed__));
+
 static uint16_t
 get_udptcp_checksum(void *l3_hdr, void *l4_hdr, uint16_t ethertype)
 {
@@ -197,6 +211,70 @@ parse_vxlan(struct udp_hdr *udp_hdr,
info->l2_len += ETHER_VXLAN_HLEN; /* add udp + vxlan */
 }
 
+/* Parse a vxlan-gpe header */
+static void
+parse_vxlan_gpe(struct udp_hdr *udp_hdr,
+   struct testpmd_offload_info *info)
+{
+   struct ether_hdr *eth_hdr;
+   struct ipv4_hdr *ipv4_hdr;
+   struct ipv6_hdr *ipv6_hdr;
+   struct vxlan_gpe_hdr *vxlan_gpe_hdr;
+   uint8_t vxlan_gpe_len = sizeof(*vxlan_gpe_hdr);
+
+   /* check udp destination port, 4790 is the default vxlan-gpe port */
+   if (udp_hdr->dst_port != _htons(vxlan_gpe_udp_port))
+   return;
+
+   vxlan_gpe_hdr = (struct vxlan_gpe_hdr *)((char *)udp_hdr +
+   sizeof(struct udp_hdr));
+
+   if (!vxlan_gpe_hdr->proto || vxlan_gpe_hdr->proto ==
+   VXLAN_GPE_TYPE_IPv4) {
+   info->is_tunnel = 1;
+   info->outer_ethertype = info->ethertype;
+   info->outer_l2_len = info->l2_len;
+   info->outer_l3_len = info->l3_len;
+   info->outer_l4_proto = info->l4_proto;
+
+   ipv4_hdr = (struct ipv4_hdr *)((char *)vxlan_gpe_hdr +
+  vxlan_gpe_len);
+
+   parse_ipv4(ipv4_hdr, info);
+   info->ethertype = _htons(ETHER_TYPE_IPv4);
+   info->l2_len = 0;
+
+   } else if (vxlan_gpe_hdr->proto == VXLAN_GPE_TYPE_IPv6) {
+   info->is_tunnel = 1;
+   info->outer_ethertype = info->ethertype;
+   info->outer_l2_len = info->l2_len;
+   info->outer_l3_len = info->l3_len;
+   info->outer_l4_proto = info->l4_proto;
+
+   ipv6_hdr = (struct ipv6_hdr *)((char *)vxlan_gpe_hdr +
+  vxlan_gpe_len);
+
+   info->ethertype = _htons(ETHER_TYPE_IPv6);
+   parse_ipv6(ipv6_hdr, info);
+   info->l2_len = 0;
+
+   } else if (vxlan_gpe_hdr->proto == VXLAN_GPE_TYPE_ETH) {
+   info->is_tunnel = 1;
+   info->outer_ethertype = info->ethertype;
+   info->outer_l2_len = info->l2_len;
+   info->outer_l3_len = info->l3_len;
+   info->outer_l4_proto = info->l4_proto;
+
+   eth_hdr = (struct ether_hdr *)((char *)vxlan_gpe_hdr +
+ vxlan_gpe_len);
+
+   parse_ethernet(eth_hdr, info);
+   } else
+   return;
+
+   info->l2_len += ETHER_VXLAN_HLEN;
+}
+
 /* Parse a gre header */
 static void
 parse_gre(struct simple_gre_hdr *gre_hdr, struct testpmd_offload_info *info)
@@ -591,6 +669,10 @@ pkt_copy_split(const struct rte_mbuf *pkt)
  *   Ether / (vlan) / IP|IP6 / UDP|TCP|SCTP .
  *   Ether / (vlan) / outer IP|IP6 / outer UDP / VxLAN / Ether / IP|IP6 /
  *   UDP|TCP|SCTP
+ *   Ether / (vlan) / outer IP|IP6 / outer UDP / VXLAN-GPE / Ether / IP|IP6 /
+ *   UDP|TCP|SCTP
+ *   Ether / (vlan) / outer IP|IP6 / outer UDP / VXLAN-GPE / IP|IP6 /
+ *   UDP|TCP|SCTP
  *   Ether / (vlan) / outer IP|IP6 / GRE / Ether / IP|IP6 / UDP|TCP|SCTP
  *   Ether / (vlan) / outer IP|IP6 / GRE / IP|IP6 / UDP|TCP|SCTP
  *   Ether / (vlan) / outer IP|IP6 / IP|IP6 / UDP|

[dpdk-dev] [PATCH v4 2/4] app/testpmd: testpmd support Tx generic tunnel offloads

2018-04-08 Thread Xueming Li
The "show port cap" and "csum parse tunnel" commands now support the Tx
generic tunnel offloads.

Signed-off-by: Xueming Li 
---
 app/test-pmd/cmdline.c  | 14 --
 app/test-pmd/config.c   | 17 +
 app/test-pmd/csumonly.c |  3 ++-
 3 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index 40b31ad7e..a81112220 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -4013,6 +4013,12 @@ check_tunnel_tso_nic_support(portid_t port_id)
if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_GENEVE_TNL_TSO))
printf("Warning: GENEVE TUNNEL TSO not supported therefore "
   "not enabled for port %d\n", port_id);
+   if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IP_TNL_TSO))
+   printf("Warning: IP TUNNEL TSO not supported therefore "
+  "not enabled for port %d\n", port_id);
+   if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TNL_TSO))
+   printf("Warning: UDP TUNNEL TSO not supported therefore "
+  "not enabled for port %d\n", port_id);
return dev_info;
 }
 
@@ -4040,13 +4046,17 @@ cmd_tunnel_tso_set_parsed(void *parsed_result,
~(DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
  DEV_TX_OFFLOAD_GRE_TNL_TSO |
  DEV_TX_OFFLOAD_IPIP_TNL_TSO |
- DEV_TX_OFFLOAD_GENEVE_TNL_TSO);
+ DEV_TX_OFFLOAD_GENEVE_TNL_TSO |
+ DEV_TX_OFFLOAD_IP_TNL_TSO |
+ DEV_TX_OFFLOAD_UDP_TNL_TSO);
printf("TSO for tunneled packets is disabled\n");
} else {
uint64_t tso_offloads = (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
 DEV_TX_OFFLOAD_GRE_TNL_TSO |
 DEV_TX_OFFLOAD_IPIP_TNL_TSO |
-DEV_TX_OFFLOAD_GENEVE_TNL_TSO);
+DEV_TX_OFFLOAD_GENEVE_TNL_TSO |
+DEV_TX_OFFLOAD_IP_TNL_TSO |
+DEV_TX_OFFLOAD_UDP_TNL_TSO);
 
ports[res->port_id].dev_conf.txmode.offloads |=
(tso_offloads & dev_info.tx_offload_capa);
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 4bb255c62..481d2b62d 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -722,6 +722,23 @@ port_offload_cap_display(portid_t port_id)
printf("off\n");
}
 
+   if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IP_TNL_TSO) {
+   printf("IP tunnel TSO:  ");
+   if (ports[port_id].dev_conf.txmode.offloads &
+   DEV_TX_OFFLOAD_IP_TNL_TSO)
+   printf("on\n");
+   else
+   printf("off\n");
+   }
+
+   if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TNL_TSO) {
+   printf("UDP tunnel TSO:  ");
+   if (ports[port_id].dev_conf.txmode.offloads &
+   DEV_TX_OFFLOAD_UDP_TNL_TSO)
+   printf("on\n");
+   else
+   printf("off\n");
+   }
 }
 
 int
diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
index 5f5ab64aa..7b2309372 100644
--- a/app/test-pmd/csumonly.c
+++ b/app/test-pmd/csumonly.c
@@ -693,7 +693,8 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
info.l3_len);
parse_vxlan(udp_hdr, &info, m->packet_type);
if (info.is_tunnel)
-   tx_ol_flags |= PKT_TX_TUNNEL_VXLAN;
+   tx_ol_flags |= (PKT_TX_TUNNEL_VXLAN |
+   PKT_TX_OUTER_UDP);
} else if (info.l4_proto == IPPROTO_GRE) {
struct simple_gre_hdr *gre_hdr;
 
-- 
2.13.3



[dpdk-dev] [PATCH v4 3/4] app/testpmd: add more GRE extension to csum engine

2018-04-08 Thread Xueming Li
This patch adds GRE checksum and sequence extension support, in addition
to the key extension, to the csum forwarding engine.

Signed-off-by: Xueming Li 
---
 app/test-pmd/csumonly.c | 20 
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
index 7b2309372..00ec40d58 100644
--- a/app/test-pmd/csumonly.c
+++ b/app/test-pmd/csumonly.c
@@ -49,9 +49,12 @@
 #define IP_HDRLEN  0x05 /* default IP header length == five 32-bits words. */
 #define IP_VHL_DEF (IP_VERSION | IP_HDRLEN)
 
-#define GRE_KEY_PRESENT 0x2000
-#define GRE_KEY_LEN 4
-#define GRE_SUPPORTED_FIELDS GRE_KEY_PRESENT
+#define GRE_CHECKSUM_PRESENT   0x8000
+#define GRE_KEY_PRESENT0x2000
+#define GRE_SEQUENCE_PRESENT   0x1000
+#define GRE_EXT_LEN4
+#define GRE_SUPPORTED_FIELDS   (GRE_CHECKSUM_PRESENT | GRE_KEY_PRESENT |\
+GRE_SEQUENCE_PRESENT)
 
 /* We cannot use rte_cpu_to_be_16() on a constant in a switch/case */
 #if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
@@ -203,14 +206,14 @@ parse_gre(struct simple_gre_hdr *gre_hdr, struct 
testpmd_offload_info *info)
struct ipv6_hdr *ipv6_hdr;
uint8_t gre_len = 0;
 
-   /* check which fields are supported */
-   if ((gre_hdr->flags & _htons(~GRE_SUPPORTED_FIELDS)) != 0)
-   return;
-
gre_len += sizeof(struct simple_gre_hdr);
 
if (gre_hdr->flags & _htons(GRE_KEY_PRESENT))
-   gre_len += GRE_KEY_LEN;
+   gre_len += GRE_EXT_LEN;
+   if (gre_hdr->flags & _htons(GRE_SEQUENCE_PRESENT))
+   gre_len += GRE_EXT_LEN;
+   if (gre_hdr->flags & _htons(GRE_CHECKSUM_PRESENT))
+   gre_len += GRE_EXT_LEN;
 
if (gre_hdr->proto == _htons(ETHER_TYPE_IPv4)) {
info->is_tunnel = 1;
@@ -739,6 +742,7 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
 
/* step 3: fill the mbuf meta data (flags and header lengths) */
 
+   m->tx_offload = 0;
if (info.is_tunnel == 1) {
if (info.tunnel_tso_segsz ||
(tx_offloads &
-- 
2.13.3



[dpdk-dev] [PATCH v4 1/4] ethdev: introduce generic IP/UDP tunnel checksum and TSO

2018-04-08 Thread Xueming Li
This patch introduces new Tx offload flags for devices that support
L3/L4 checksum and TSO offload of IP or UDP tunneled packets.

The support from the device is for inner and outer checksums on
IPV4/TCP/UDP and TSO for *any packet with the following format*:

 <some headers> / [optional IPv4/IPv6] / [optional TCP/UDP] / <some
 tunnel headers> / [optional inner IPv4/IPv6] / [optional TCP/UDP]

For example the following packets can use this feature:

1. eth / ipv4 / udp / VXLAN / ip / tcp
2. eth / ipv4 / GRE / MPLS / ipv4 / udp

Please note that tunnel headers that contain payload length, sequence id
or checksum will not be updated.
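
For illustration, a hypothetical mbuf setup for TSO of an
Ether/IPv4/UDP/<8B opaque tunnel>/Ether/IPv4/TCP packet using the new
flag (header lengths are examples, not mandated by this patch):

	m->ol_flags |= PKT_TX_TUNNEL_UDP | PKT_TX_OUTER_IP_CKSUM |
			PKT_TX_IP_CKSUM | PKT_TX_TCP_SEG;
	m->outer_l2_len = sizeof(struct ether_hdr);
	m->outer_l3_len = sizeof(struct ipv4_hdr);
	/* For tunnel TSO, l2_len spans outer L4 + tunnel hdr + inner L2. */
	m->l2_len = sizeof(struct udp_hdr) + 8 + sizeof(struct ether_hdr);
	m->l3_len = sizeof(struct ipv4_hdr);
	m->l4_len = sizeof(struct tcp_hdr);
	m->tso_segsz = 1400;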

Signed-off-by: Xueming Li 
---
 lib/librte_ether/rte_ethdev.h | 4 
 lib/librte_mbuf/rte_mbuf.c| 6 ++
 lib/librte_mbuf/rte_mbuf.h| 2 ++
 3 files changed, 12 insertions(+)

diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 5c8af16f5..a3ae43cdc 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -980,6 +980,10 @@ struct rte_eth_conf {
  *   the same mempool and has refcnt = 1.
  */
 #define DEV_TX_OFFLOAD_SECURITY 0x0002
+/**< Device supports UDP tunneled packet TSO */
+#define DEV_TX_OFFLOAD_UDP_TNL_TSO  0x0004
+/**< Device supports IP based tunnel packet TSO */
+#define DEV_TX_OFFLOAD_IP_TNL_TSO   0x0008
 
 /*
  * If new Tx offload capabilities are defined, they also must be
diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
index 091d388d3..41e1bc953 100644
--- a/lib/librte_mbuf/rte_mbuf.c
+++ b/lib/librte_mbuf/rte_mbuf.c
@@ -405,6 +405,8 @@ const char *rte_get_tx_ol_flag_name(uint64_t mask)
case PKT_TX_TUNNEL_IPIP: return "PKT_TX_TUNNEL_IPIP";
case PKT_TX_TUNNEL_GENEVE: return "PKT_TX_TUNNEL_GENEVE";
case PKT_TX_TUNNEL_MPLSINUDP: return "PKT_TX_TUNNEL_MPLSINUDP";
+   case PKT_TX_TUNNEL_IP: return "PKT_TX_TUNNEL_IP";
+   case PKT_TX_TUNNEL_UDP: return "PKT_TX_TUNNEL_UDP";
case PKT_TX_MACSEC: return "PKT_TX_MACSEC";
case PKT_TX_SEC_OFFLOAD: return "PKT_TX_SEC_OFFLOAD";
default: return NULL;
@@ -439,6 +441,10 @@ rte_get_tx_ol_flag_list(uint64_t mask, char *buf, size_t 
buflen)
  "PKT_TX_TUNNEL_NONE" },
{ PKT_TX_TUNNEL_MPLSINUDP, PKT_TX_TUNNEL_MASK,
  "PKT_TX_TUNNEL_NONE" },
+   { PKT_TX_TUNNEL_IP, PKT_TX_TUNNEL_MASK,
+ "PKT_TX_TUNNEL_NONE" },
+   { PKT_TX_TUNNEL_UDP, PKT_TX_TUNNEL_MASK,
+ "PKT_TX_TUNNEL_NONE" },
{ PKT_TX_MACSEC, PKT_TX_MACSEC, NULL },
{ PKT_TX_SEC_OFFLOAD, PKT_TX_SEC_OFFLOAD, NULL },
};
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 62740254d..6a8031c7a 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -210,6 +210,8 @@ extern "C" {
 #define PKT_TX_TUNNEL_GENEVE  (0x4ULL << 45)
 /**< TX packet with MPLS-in-UDP RFC 7510 header. */
 #define PKT_TX_TUNNEL_MPLSINUDP (0x5ULL << 45)
+#define PKT_TX_TUNNEL_IP (0xDULL << 45) /**< Tx IP tunneled packet. */
+#define PKT_TX_TUNNEL_UDP (0xEULL << 45) /**< Tx UDP tunneled packet. */
 /* add new TX TUNNEL type here */
 #define PKT_TX_TUNNEL_MASK(0xFULL << 45)
 
-- 
2.13.3



[dpdk-dev] [PATCH 0/3] mlx5 support Tx generic tunnel checksum and TSO

2018-04-08 Thread Xueming Li
This patchset introduces Tx generic tunnel checksum and TSO offload to
the mlx5 PMD.

This patchset relies on new ethdev API of:
http://www.dpdk.org/dev/patchwork/patch/37519/

Xueming Li (3):
  net/mlx5: separate TSO function in Tx data path
  net/mlx5: support generic tunnel offloading
  net/mlx5: allow max 192B TSO inline header length

 drivers/net/mlx5/Makefile |   5 +
 drivers/net/mlx5/mlx5.c   |  14 ++-
 drivers/net/mlx5/mlx5.h   |   1 +
 drivers/net/mlx5/mlx5_defs.h  |   2 +-
 drivers/net/mlx5/mlx5_ethdev.c|   5 +-
 drivers/net/mlx5/mlx5_prm.h   |  24 
 drivers/net/mlx5/mlx5_rxtx.c  | 208 --
 drivers/net/mlx5/mlx5_rxtx.h  | 100 
 drivers/net/mlx5/mlx5_rxtx_vec.c  |   9 +-
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h |   2 +-
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h  |   2 +-
 drivers/net/mlx5/mlx5_txq.c   |  10 +-
 12 files changed, 289 insertions(+), 93 deletions(-)

-- 
2.13.3



[dpdk-dev] [PATCH 2/3] net/mlx5: support generic tunnel offloading

2018-04-08 Thread Xueming Li
This commit adds support for generic tunnel TSO and checksum offload.
The PMD computes the inner/outer header offsets according to the mbuf
fields; the hardware does the calculation based on the offsets and types.

Signed-off-by: Xueming Li 
---
 drivers/net/mlx5/Makefile |   5 ++
 drivers/net/mlx5/mlx5.c   |  14 +++-
 drivers/net/mlx5/mlx5.h   |   1 +
 drivers/net/mlx5/mlx5_ethdev.c|   5 +-
 drivers/net/mlx5/mlx5_prm.h   |  24 +++
 drivers/net/mlx5/mlx5_rxtx.c  | 122 ++
 drivers/net/mlx5/mlx5_rxtx.h  | 100 ++--
 drivers/net/mlx5/mlx5_rxtx_vec.c  |   9 +--
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h |   2 +-
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h  |   2 +-
 drivers/net/mlx5/mlx5_txq.c   |  10 ++-
 11 files changed, 234 insertions(+), 60 deletions(-)

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 201f6f06a..cc128ef69 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -135,6 +135,11 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
enum IBV_WQ_FLAG_RX_END_PADDING \
$(AUTOCONF_OUTPUT)
$Q sh -- '$<' '$@' \
+   HAVE_IBV_MLX5_MOD_SWP \
+   infiniband/mlx5dv.h \
+   enum MLX5DV_CONTEXT_MASK_SWP \
+   $(AUTOCONF_OUTPUT)
+   $Q sh -- '$<' '$@' \
HAVE_IBV_MLX5_MOD_MPW \
infiniband/mlx5dv.h \
enum MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED \
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 7d58d66bb..d886ddd4f 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -600,6 +600,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
unsigned int mps;
unsigned int cqe_comp;
unsigned int tunnel_en = 0;
+   unsigned int swp = 0;
int idx;
int i;
struct mlx5dv_context attrs_out = {0};
@@ -667,6 +668,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
}
ibv_dev = list[i];
DRV_LOG(DEBUG, "device opened");
+#ifdef HAVE_IBV_MLX5_MOD_SWP
+   attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
+#endif
/*
 * Multi-packet send is supported by ConnectX-4 Lx PF as well
 * as all ConnectX-5 devices.
@@ -687,6 +691,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
DRV_LOG(DEBUG, "MPW isn't supported");
mps = MLX5_MPW_DISABLED;
}
+#ifdef HAVE_IBV_MLX5_MOD_SWP
+   if (attrs_out.comp_mask | MLX5DV_CONTEXT_MASK_SWP)
+   swp = attrs_out.sw_parsing_caps.sw_parsing_offloads;
+   DRV_LOG(DEBUG, "SWP support: %u", swp);
+#endif
if (RTE_CACHE_LINE_SIZE == 128 &&
!(attrs_out.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
cqe_comp = 0;
@@ -733,6 +742,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
.txq_inline = MLX5_ARG_UNSET,
.txqs_inline = MLX5_ARG_UNSET,
.inline_max_packet_sz = MLX5_ARG_UNSET,
+   .swp = !!swp,
};
 
len = snprintf(name, sizeof(name), PCI_PRI_FMT,
@@ -1182,8 +1192,10 @@ RTE_INIT(rte_mlx5_pmd_init);
 static void
 rte_mlx5_pmd_init(void)
 {
-   /* Build the static table for ptype conversion. */
+   /* Build the static tables for Verbs conversion. */
mlx5_set_ptype_table();
+   mlx5_set_cksum_table();
+   mlx5_set_swp_types_table();
/*
 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use
 * huge pages. Calling ibv_fork_init() during init allows
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index faacfd9d6..b5e5e0b6c 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -87,6 +87,7 @@ struct mlx5_dev_config {
unsigned int tx_vec_en:1; /* Tx vector is enabled. */
unsigned int rx_vec_en:1; /* Rx vector is enabled. */
unsigned int mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */
+   unsigned int swp:1; /* Tx generic tunnel checksum and TSO offload. */
unsigned int tso_max_payload_sz; /* Maximum TCP payload for TSO. */
unsigned int ind_table_max_size; /* Maximum indirection table size. */
int txq_inline; /* Maximum packet size for inlining. */
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index b6f5101cf..aecfdc1d4 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1063,11 +1063,14 @@ mlx5_select_tx_function(struct rte_eth_dev *dev)
int tso = !!(tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO |
DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
DEV_TX_OFFLOAD_GRE_TNL_TSO));
+   int swp = !!(tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO |
+

[dpdk-dev] [PATCH 1/3] net/mlx5: separate TSO function in Tx data path

2018-04-08 Thread Xueming Li
Separate the TSO code into its own function to make the logic of
mlx5_tx_burst clear.

Signed-off-by: Xueming Li 
---
 drivers/net/mlx5/mlx5_rxtx.c | 112 ++-
 1 file changed, 67 insertions(+), 45 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 1f422c70b..a9de69131 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -259,6 +259,66 @@ mlx5_copy_to_wq(void *dst, const void *src, size_t n,
 }
 
 /**
+ * Inline TSO headers into WQE.
+ *
+ * @return
+ *   0 on success, negative errno value on failure.
+ */
+static int
+inline_tso(struct mlx5_txq_data *txq, struct rte_mbuf *buf,
+  uint32_t *length,
+  uint8_t *cs_flags,
+  uintptr_t *addr,
+  uint16_t *pkt_inline_sz,
+  uint8_t **raw,
+  uint16_t *max_wqe,
+  uint16_t *tso_segsz,
+  uint16_t *tso_header_sz)
+{
+   uintptr_t end = (uintptr_t)(((uintptr_t)txq->wqes) +
+   (1 << txq->wqe_n) * MLX5_WQE_SIZE);
+   unsigned int copy_b;
+   uint8_t vlan_sz = (buf->ol_flags & PKT_TX_VLAN_PKT) ? 4 : 0;
+   const uint8_t tunneled = txq->tunnel_en &&
+(buf->ol_flags & (PKT_TX_TUNNEL_GRE |
+  PKT_TX_TUNNEL_VXLAN));
+   uint16_t n_wqe;
+
+   *tso_segsz = buf->tso_segsz;
+   *tso_header_sz = buf->l2_len + vlan_sz + buf->l3_len + buf->l4_len;
+   if (unlikely(*tso_segsz == 0 || *tso_header_sz == 0)) {
+   txq->stats.oerrors++;
+   return -EINVAL;
+   }
+   if (tunneled) {
+   *tso_header_sz += buf->outer_l2_len + buf->outer_l3_len;
+   *cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM;
+   } else {
+   *cs_flags |= MLX5_ETH_WQE_L4_CSUM;
+   }
+   if (unlikely(*tso_header_sz > MLX5_MAX_TSO_HEADER)) {
+   txq->stats.oerrors++;
+   return -EINVAL;
+   }
+   copy_b = *tso_header_sz - *pkt_inline_sz;
+   /* First seg must contain all TSO headers. */
+   assert(copy_b <= *length);
+   if (!copy_b || ((end - (uintptr_t)*raw) < copy_b))
+   return -EAGAIN;
+   n_wqe = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
+   if (unlikely(*max_wqe < n_wqe))
+   return -EINVAL;
+   *max_wqe -= n_wqe;
+   rte_memcpy((void *)*raw, (void *)*addr, copy_b);
+   *length -= copy_b;
+   *addr += copy_b;
+   copy_b = MLX5_WQE_DS(copy_b) * MLX5_WQE_DWORD_SIZE;
+   *pkt_inline_sz += copy_b;
+   *raw += copy_b;
+   return 0;
+}
+
+/**
  * DPDK callback to check the status of a tx descriptor.
  *
  * @param tx_queue
@@ -392,6 +452,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, 
uint16_t pkts_n)
 #ifdef MLX5_PMD_SOFT_COUNTERS
uint32_t total_length = 0;
 #endif
+   int ret;
 
/* first_seg */
buf = *pkts;
@@ -457,52 +518,13 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, 
uint16_t pkts_n)
raw += MLX5_WQE_DWORD_SIZE;
tso = txq->tso_en && (buf->ol_flags & PKT_TX_TCP_SEG);
if (tso) {
-   uintptr_t end =
-   (uintptr_t)(((uintptr_t)txq->wqes) +
-   (1 << txq->wqe_n) * MLX5_WQE_SIZE);
-   unsigned int copy_b;
-   uint8_t vlan_sz =
-   (buf->ol_flags & PKT_TX_VLAN_PKT) ? 4 : 0;
-   const uint64_t is_tunneled =
-   buf->ol_flags & (PKT_TX_TUNNEL_GRE |
-PKT_TX_TUNNEL_VXLAN);
-
-   tso_header_sz = buf->l2_len + vlan_sz +
-   buf->l3_len + buf->l4_len;
-   tso_segsz = buf->tso_segsz;
-   if (unlikely(tso_segsz == 0)) {
-   txq->stats.oerrors++;
+   ret = inline_tso(txq, buf, &length, &cs_flags,
+&addr, &pkt_inline_sz,
+&raw, &max_wqe,
+&tso_segsz, &tso_header_sz);
+   if (ret == -EINVAL) {
break;
-   }
-   if (is_tunneled && txq->tunnel_en) {
-   tso_header_sz += buf->outer_l2_len +
-buf->outer_l3_len;
-   cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM;
-   } else {
-   cs_flags |= MLX5_ETH_WQE_L4_CSUM;
-   }
-   if (unlikely(tso_header_sz > MLX5_MAX_TSO_HEADER)) {
-   txq->stats.oerrors++;
-   break;
-  

[dpdk-dev] [PATCH 3/3] net/mlx5: allow max 192B TSO inline header length

2018-04-08 Thread Xueming Li
Change the max inline header length to 192B to allow IPv6 VXLAN TSO
headers, and headers with options, that are larger than 128B.

Signed-off-by: Xueming Li 
---
 drivers/net/mlx5/mlx5_defs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 6401588ee..851166ed9 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -58,7 +58,7 @@
 #define MLX5_MAX_XSTATS 32
 
 /* Maximum Packet headers size (L2+L3+L4) for TSO. */
-#define MLX5_MAX_TSO_HEADER 128
+#define MLX5_MAX_TSO_HEADER 192
 
 /* Default minimum number of Tx queues for vectorized Tx. */
 #define MLX5_VPMD_MIN_TXQS 4
-- 
2.13.3



[dpdk-dev] [PATCH v4 01/13] compressdev: add basic device management

2018-04-08 Thread Pablo de Lara
From: Fiona Trahe 

Add basic functions to manage compress devices,
including driver and device allocation, and the basic
interface with compressdev PMDs.
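
A hypothetical setup sketch of these management calls (the names follow
the compressdev API as eventually merged; exact signatures and config
fields may differ in this revision of the patchset):

#include <rte_compressdev.h>

static int
compressdev_setup(void)
{
	struct rte_compressdev_config config = {
		.socket_id = 0,
		.nb_queue_pairs = 1,
	};
	uint8_t dev_id = 0;

	if (rte_compressdev_count() == 0)
		return -1;	/* no compress devices probed */
	if (rte_compressdev_configure(dev_id, &config) < 0)
		return -1;
	return rte_compressdev_start(dev_id);
}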

Signed-off-by: Fiona Trahe 
Signed-off-by: Pablo de Lara 
Signed-off-by: Shally Verma 
Signed-off-by: Ashish Gupta 
---
 MAINTAINERS|   7 +
 config/common_base |   6 +
 config/rte_config.h|   3 +
 doc/api/doxy-api-index.md  |   1 +
 doc/api/doxy-api.conf  |   1 +
 doc/guides/rel_notes/release_18_05.rst |   6 +
 lib/Makefile   |   3 +
 lib/librte_compressdev/Makefile|  28 ++
 lib/librte_compressdev/meson.build |   8 +
 lib/librte_compressdev/rte_compressdev.c   | 464 +
 lib/librte_compressdev/rte_compressdev.h   | 267 
 lib/librte_compressdev/rte_compressdev_pmd.c   | 157 +++
 lib/librte_compressdev/rte_compressdev_pmd.h   | 283 +
 lib/librte_compressdev/rte_compressdev_version.map |  31 ++
 lib/meson.build|   2 +-
 mk/rte.app.mk  |   1 +
 16 files changed, 1267 insertions(+), 1 deletion(-)
 create mode 100644 lib/librte_compressdev/Makefile
 create mode 100644 lib/librte_compressdev/meson.build
 create mode 100644 lib/librte_compressdev/rte_compressdev.c
 create mode 100644 lib/librte_compressdev/rte_compressdev.h
 create mode 100644 lib/librte_compressdev/rte_compressdev_pmd.c
 create mode 100644 lib/librte_compressdev/rte_compressdev_pmd.h
 create mode 100644 lib/librte_compressdev/rte_compressdev_version.map

diff --git a/MAINTAINERS b/MAINTAINERS
index 1081d71aa..75f13f92e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -325,6 +325,13 @@ F: drivers/raw/skeleton_rawdev/
 F: test/test/test_rawdev.c
 F: doc/guides/prog_guide/rawdev.rst
 
+Compression API - EXPERIMENTAL
+M: Fiona Trahe 
+M: Pablo de Lara 
+M: Ashish Gupta 
+T: git://dpdk.org/next/dpdk-next-crypto
+F: lib/librte_compressdev/
+
 
 Bus Drivers
 ---
diff --git a/config/common_base b/config/common_base
index ad4706267..f40354487 100644
--- a/config/common_base
+++ b/config/common_base
@@ -543,6 +543,12 @@ CONFIG_RTE_LIBRTE_PMD_MRVL_CRYPTO_DEBUG=n
 #
 CONFIG_RTE_LIBRTE_SECURITY=y
 
+#
+# Compile generic compression device library
+#
+CONFIG_RTE_LIBRTE_COMPRESSDEV=y
+CONFIG_RTE_COMPRESS_MAX_DEVS=64
+
 #
 # Compile generic event device library
 #
diff --git a/config/rte_config.h b/config/rte_config.h
index db6ceb6cd..949071f6e 100644
--- a/config/rte_config.h
+++ b/config/rte_config.h
@@ -51,6 +51,9 @@
 #define RTE_CRYPTO_MAX_DEVS 64
 #define RTE_CRYPTODEV_NAME_LEN 64
 
+/* compressdev defines */
+#define RTE_COMPRESS_MAX_DEVS 64
+
 /* eventdev defines */
 #define RTE_EVENT_MAX_DEVS 16
 #define RTE_EVENT_MAX_QUEUES_PER_DEV 64
diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
index d77f205bb..530808e9d 100644
--- a/doc/api/doxy-api-index.md
+++ b/doc/api/doxy-api-index.md
@@ -45,6 +45,7 @@ The public API headers are grouped by topics:
   [bbdev]  (@ref rte_bbdev.h),
   [cryptodev]  (@ref rte_cryptodev.h),
   [security]   (@ref rte_security.h),
+  [compressdev](@ref rte_compressdev.h),
   [eventdev]   (@ref rte_eventdev.h),
   [event_eth_rx_adapter]   (@ref rte_event_eth_rx_adapter.h),
   [rawdev] (@ref rte_rawdev.h),
diff --git a/doc/api/doxy-api.conf b/doc/api/doxy-api.conf
index cda52fdfb..06432c3aa 100644
--- a/doc/api/doxy-api.conf
+++ b/doc/api/doxy-api.conf
@@ -45,6 +45,7 @@ INPUT   = doc/api/doxy-api-index.md \
   lib/librte_cfgfile \
   lib/librte_cmdline \
   lib/librte_compat \
+  lib/librte_compressdev \
   lib/librte_cryptodev \
   lib/librte_distributor \
   lib/librte_efd \
diff --git a/doc/guides/rel_notes/release_18_05.rst 
b/doc/guides/rel_notes/release_18_05.rst
index 7c428439e..77d3c1bf8 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -70,6 +70,11 @@ New Features
 
   * AES-CMAC (128-bit key).
 
+* **Added compressdev API, a generic compression service library.**
+
+  The compressdev library provides an API for offload of compression and
+  decompression operations to hardware or software accelerator devices.
+
 
 API Changes
 ---
@@ -155,6 +160,7 @@ The libraries prepended with a plus sign were incremented 
in this version.
  librte_cfgfile.so.2
  librte_cmdline.so.2
+ librte_common_octeontx.so.1
+   + librte_compressdev.so.1
  librte_cryptodev.so.4
  librte_distributor.so.1
+ librte_eal.so.7
diff --git a/lib/Makefile b/lib/Makefile
index ec965a606..19396daff

[dpdk-dev] [PATCH v4 00/13] Implement compression API

2018-04-08 Thread Pablo de Lara
With the vast amounts of data being transported around networks
and stored in storage systems, reducing data size is becoming ever more 
important.

There are both software libraries and hardware devices available that
provide compression, but no common API.
Such an API is proposed in this patchset, which supports the following features:

- Deflate Algorithm (https://tools.ietf.org/html/rfc1951)
- LZS algorithm (https://tools.ietf.org/html/rfc2395)
- Static and Dynamic Huffman encoding.
- Compression levels
- Checksum generation
- Asynchronous burst API
- private_xform - a place for PMDs to hold private data derived from
  an xform and used by stateless operations.
- stream - a place for PMDs to hold private data derived from
  an xform and also maintain state and history data, for
  stateful flows (see the sketch below).
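
For illustration, a rough sketch of how the two handle types are intended
to be used (function names are from the patches in this series; setup, op
wiring and error handling are elided):

#include <rte_compressdev.h>

static void
handle_types_sketch(uint8_t dev_id, const struct rte_comp_xform *xform)
{
    void *priv_xform, *stream;

    /* Stateless: one handle; shareable across in-flight STATELESS ops
     * only if the PMD allows it, otherwise one handle per in-flight op */
    rte_compressdev_private_xform_create(dev_id, xform, &priv_xform);
    /* ... enqueue/dequeue STATELESS ops carrying priv_xform ... */
    rte_compressdev_private_xform_free(dev_id, priv_xform);

    /* Stateful: one stream per data set, passed with every op of it */
    rte_compressdev_stream_create(dev_id, xform, &stream);
    /* ... enqueue/dequeue STATEFUL ops carrying the stream ... */
    rte_compressdev_stream_free(dev_id, stream);
}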

Signed-off-by: Fiona Trahe 
Signed-off-by: Pablo de Lara pablo.de.lara.gua...@intel.com
Signed-off-by: Shally Verma 
Signed-off-by: Ashish Gupta 

Opens:
 - creation of private_xform and stream mempools. In v3, config
   added to give PMD the size params needed to create mempools.
   Still open as to whether the pools should be created by
   application, in API layer or by PMDs. Expect to resolve in a v4. 
 - alternative to mbufs for passing data between application
   and accelerator. A lot of varied opinions on this, expect to
   address in a future release.
 - addition of feature to add capability to PMD to allow more than one
   inflight operation from a stateful stream - to be added by NXP 

Changes in v4:
 - Fix build (missing ";")
 - Change order in common_base, so compression is placed after security

Changes in v3:
 - Remove rte_comp_op_ctod helper functions
 - Remove param_range_check macro
 - Rename from phys_addr to iova_addr
 - Remove rte_comp_algo_strings
 - Rename rte_compressdev_pmd_is_valid_dev to rte_compressdev_is_valid_dev
 - Remove feature flags from compressdev
 - Support hash operations
 - Add shareable priv xform in feature flags, instead of returning it
   on xform creation
 - Allow max number of queue pairs to be 0, meaning that there is no
   limit.
 - Add invalid configuration checks
 - Add capability helper functions

Changes in v2:
 - Add stream APIs
 - Remove session
 - Add SHAREABLE / NON_SHAREABLE private_xform types
 - Add algo enum 'UNSPECIFIED' to fix warning in capabilities
 - Change one remaining log to use dynamic logging.
 - Add rte_cache_aligned keyword to op
 - Rename enums with better names _ALGO, _CHECKSUM, _HUFFMAN_
 - Use const keyword when passing xform
 - Remove qp_count fn from dev_ops as never used
 - Remove max_nb_queue_pairs from compressdev_init_param as never used
 - Clarify device configure and start sequence
 - Replace OUT_OF_SPACE with OUT_OF_SPACE_RECOVERABLE and TERMINATED
   and clarified usage. 
 - Add stream and private_xform sizes to device config for use in
   mempool creation
 - Add capability helper fn
 - Use Base2 log value for window size on xforms
 - Add Meson build
 - Update MAINTAINERS
 - Update Map file
 - Change order in doxy file
 - Update Release note 

Fiona Trahe (12):
  compressdev: add basic device management
  compressdev: add queue pair management
  compressdev: add compression specific data
  compressdev: add enqueue/dequeue functions
  compressdev: add operation management
  compressdev: support stateless operations
  compressdev: support stateful operations
  compressdev: add device feature flags
  compressdev: add compression service feature flags
  compressdev: add device stats
  compressdev: add device capabilities
  compressdev: get device id from name

Shally Verma (1):
  compressdev: support hash operations

 MAINTAINERS|   7 +
 config/common_base |   6 +
 config/rte_config.h|   3 +
 doc/api/doxy-api-index.md  |   1 +
 doc/api/doxy-api.conf  |   1 +
 doc/guides/rel_notes/release_18_05.rst |   6 +
 lib/Makefile   |   3 +
 lib/librte_compressdev/Makefile|  29 +
 lib/librte_compressdev/meson.build |   9 +
 lib/librte_compressdev/rte_comp.h  | 539 
 lib/librte_compressdev/rte_compressdev.c   | 902 +
 lib/librte_compressdev/rte_compressdev.h   | 701 
 lib/librte_compressdev/rte_compressdev_pmd.c   | 157 
 lib/librte_compressdev/rte_compressdev_pmd.h   | 437 ++
 lib/librte_compressdev/rte_compressdev_version.map |  44 +
 lib/meson.build|   2 +-
 mk/rte.app.mk  |   1 +
 17 files changed, 2847 insertions(+), 1 deletion(-)
 create mode 100644 lib/librte_compressdev/Makefile
 create mode 100644 lib/librte_compressdev/meson.build
 create mode 100644 lib/librte_compressdev/rte_comp.h
 create mode 100644 lib/librte_compressdev/rte_c

[dpdk-dev] [PATCH v4 02/13] compressdev: add queue pair management

2018-04-08 Thread Pablo de Lara
From: Fiona Trahe 

Add functions to manage device queue pairs.
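
A minimal configuration sketch using the functions added here (the
queue_pair_setup() return type and the max_inflight_ops value are
assumptions; error handling abbreviated):

#include <rte_compressdev.h>

static int
dev_setup_sketch(uint8_t dev_id, uint16_t nb_qps, int socket_id)
{
    struct rte_compressdev_config cfg = {
        .socket_id = socket_id,
        .nb_queue_pairs = nb_qps,
    };
    uint16_t qp;

    if (rte_compressdev_configure(dev_id, &cfg) < 0)
        return -1;

    /* One setup call per queue pair; 4096 is an arbitrary
     * max_inflight_ops value */
    for (qp = 0; qp < nb_qps; qp++)
        if (rte_compressdev_queue_pair_setup(dev_id, qp, 4096,
                socket_id) < 0)
            return -1;

    return rte_compressdev_queue_pair_count(dev_id);
}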

Signed-off-by: Fiona Trahe 
Signed-off-by: Pablo de Lara 
Signed-off-by: Shally Verma 
Signed-off-by: Ashish Gupta 
---
 lib/librte_compressdev/rte_compressdev.c   | 135 +
 lib/librte_compressdev/rte_compressdev.h   |  50 
 lib/librte_compressdev/rte_compressdev_pmd.h   |  47 +++
 lib/librte_compressdev/rte_compressdev_version.map |   2 +
 4 files changed, 234 insertions(+)

diff --git a/lib/librte_compressdev/rte_compressdev.c 
b/lib/librte_compressdev/rte_compressdev.c
index d635953b2..c90e4beaf 100644
--- a/lib/librte_compressdev/rte_compressdev.c
+++ b/lib/librte_compressdev/rte_compressdev.c
@@ -266,10 +266,100 @@ rte_compressdev_pmd_release_device(struct 
rte_compressdev *compressdev)
return 0;
 }
 
+uint16_t __rte_experimental
+rte_compressdev_queue_pair_count(uint8_t dev_id)
+{
+   struct rte_compressdev *dev;
+
+   dev = &rte_comp_devices[dev_id];
+   return dev->data->nb_queue_pairs;
+}
+
+static int
+rte_compressdev_queue_pairs_config(struct rte_compressdev *dev,
+   uint16_t nb_qpairs, int socket_id)
+{
+   struct rte_compressdev_info dev_info;
+   void **qp;
+   unsigned int i;
+
+   if ((dev == NULL) || (nb_qpairs < 1)) {
+   COMPRESSDEV_LOG(ERR, "invalid param: dev %p, nb_queues %u",
+   dev, nb_qpairs);
+   return -EINVAL;
+   }
+
+   COMPRESSDEV_LOG(DEBUG, "Setup %d queue pairs on device %u",
+   nb_qpairs, dev->data->dev_id);
+
+   memset(&dev_info, 0, sizeof(struct rte_compressdev_info));
+
+   RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP);
+   (*dev->dev_ops->dev_infos_get)(dev, &dev_info);
+
+   if ((dev_info.max_nb_queue_pairs != 0) &&
+   (nb_qpairs > dev_info.max_nb_queue_pairs)) {
+   COMPRESSDEV_LOG(ERR, "Invalid num queue_pairs (%u) for dev %u",
+   nb_qpairs, dev->data->dev_id);
+   return -EINVAL;
+   }
+
+   if (dev->data->queue_pairs == NULL) { /* first time configuration */
+   dev->data->queue_pairs = rte_zmalloc_socket(
+   "compressdev->queue_pairs",
+   sizeof(dev->data->queue_pairs[0]) * nb_qpairs,
+   RTE_CACHE_LINE_SIZE, socket_id);
+
+   if (dev->data->queue_pairs == NULL) {
+   dev->data->nb_queue_pairs = 0;
+   COMPRESSDEV_LOG(ERR,
+   "failed to get memory for qp meta data, nb_queues %u",
+   nb_qpairs);
+   return -(ENOMEM);
+   }
+   } else { /* re-configure */
+   int ret;
+   uint16_t old_nb_queues = dev->data->nb_queue_pairs;
+
+   qp = dev->data->queue_pairs;
+
+   RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_pair_release,
+   -ENOTSUP);
+
+   for (i = nb_qpairs; i < old_nb_queues; i++) {
+   ret = (*dev->dev_ops->queue_pair_release)(dev, i);
+   if (ret < 0)
+   return ret;
+   }
+
+   qp = rte_realloc(qp, sizeof(qp[0]) * nb_qpairs,
+   RTE_CACHE_LINE_SIZE);
+   if (qp == NULL) {
+   COMPRESSDEV_LOG(ERR,
+   "failed to realloc qp meta data, nb_queues %u",
+   nb_qpairs);
+   return -(ENOMEM);
+   }
+
+   if (nb_qpairs > old_nb_queues) {
+   uint16_t new_qs = nb_qpairs - old_nb_queues;
+
+   memset(qp + old_nb_queues, 0,
+   sizeof(qp[0]) * new_qs);
+   }
+
+   dev->data->queue_pairs = qp;
+
+   }
+   dev->data->nb_queue_pairs = nb_qpairs;
+   return 0;
+}
+
 int __rte_experimental
 rte_compressdev_configure(uint8_t dev_id, struct rte_compressdev_config 
*config)
 {
struct rte_compressdev *dev;
+   int diag;
 
if (!rte_compressdev_is_valid_dev(dev_id)) {
COMPRESSDEV_LOG(ERR, "Invalid dev_id=%" PRIu8, dev_id);
@@ -286,6 +376,16 @@ rte_compressdev_configure(uint8_t dev_id, struct 
rte_compressdev_config *config)
 
RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_configure, -ENOTSUP);
 
+   /* Setup new number of queue pairs and reconfigure device. */
+   diag = rte_compressdev_queue_pairs_config(dev, config->nb_queue_pairs,
+   config->socket_id);
+   if (diag != 0) {
+   COMPRESSDEV_LOG(ERR,
+   "dev%d rte_comp_dev_queue_pairs_config = %d",
+   dev_id, diag);
+

[dpdk-dev] [PATCH v4 07/13] compressdev: support stateful operations

2018-04-08 Thread Pablo de Lara
From: Fiona Trahe 

Added stream data (stream) to the compression operation,
which will contain the private data from each PMD
to support stateful operations.
Also, added functions to create/free this data.
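
A hedged lifecycle sketch (names as declared in this patch; how the stream
is attached to each operation is elided):

static int
stateful_stream_sketch(uint8_t dev_id, const struct rte_comp_xform *xform)
{
    void *stream = NULL;
    int ret;

    ret = rte_compressdev_stream_create(dev_id, xform, &stream);
    if (ret < 0)
        return ret; /* -EINVAL, -ENOTSUP or -ENOMEM, per the API doc */

    /* ... pass 'stream' with every STATEFUL op of this data set ... */

    return rte_compressdev_stream_free(dev_id, stream); /* may be -EBUSY */
}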

Signed-off-by: Fiona Trahe 
Signed-off-by: Pablo de Lara 
Signed-off-by: Shally Verma 
Signed-off-by: Ashish Gupta 
---
 lib/librte_compressdev/rte_compressdev.c   | 49 +
 lib/librte_compressdev/rte_compressdev.h   | 50 ++
 lib/librte_compressdev/rte_compressdev_pmd.h   | 41 ++
 lib/librte_compressdev/rte_compressdev_version.map |  2 +
 4 files changed, 142 insertions(+)

diff --git a/lib/librte_compressdev/rte_compressdev.c 
b/lib/librte_compressdev/rte_compressdev.c
index 3843a6bbf..433973aaa 100644
--- a/lib/librte_compressdev/rte_compressdev.c
+++ b/lib/librte_compressdev/rte_compressdev.c
@@ -578,6 +578,55 @@ rte_compressdev_private_xform_free(uint8_t dev_id, void 
*priv_xform)
return 0;
 }
 
+int __rte_experimental
+rte_compressdev_stream_create(uint8_t dev_id,
+   const struct rte_comp_xform *xform,
+   void **stream)
+{
+   struct rte_compressdev *dev;
+   int ret;
+
+   dev = rte_compressdev_pmd_get_dev(dev_id);
+
+   if (xform == NULL || dev == NULL || stream == NULL)
+   return -EINVAL;
+
+   RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stream_create, -ENOTSUP);
+   ret = (*dev->dev_ops->stream_create)(dev, xform, stream);
+   if (ret < 0) {
+   COMPRESSDEV_LOG(ERR,
+   "dev_id %d failed to create stream: err=%d",
+   dev_id, ret);
+   return ret;
+   }
+
+   return 0;
+}
+
+
+int __rte_experimental
+rte_compressdev_stream_free(uint8_t dev_id, void *stream)
+{
+   struct rte_compressdev *dev;
+   int ret;
+
+   dev = rte_compressdev_pmd_get_dev(dev_id);
+
+   if (dev == NULL || stream == NULL)
+   return -EINVAL;
+
+   RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stream_free, -ENOTSUP);
+   ret = dev->dev_ops->stream_free(dev, stream);
+   if (ret < 0) {
+   COMPRESSDEV_LOG(ERR,
+   "dev_id %d failed to free stream: err=%d",
+   dev_id, ret);
+   return ret;
+   }
+
+   return 0;
+}
+
 /** Initialise rte_comp_op mempool element */
 static void
 rte_comp_op_init(struct rte_mempool *mempool,
diff --git a/lib/librte_compressdev/rte_compressdev.h 
b/lib/librte_compressdev/rte_compressdev.h
index 917c0d764..0e148dd30 100644
--- a/lib/librte_compressdev/rte_compressdev.h
+++ b/lib/librte_compressdev/rte_compressdev.h
@@ -127,6 +127,8 @@ struct rte_compressdev_config {
/**< Total number of queue pairs to configure on a device */
uint16_t max_nb_priv_xforms;
/**< Max number of private_xforms which will be created on the device */
+   uint16_t max_nb_streams;
+   /**< Max number of streams which will be created on the device */
 };
 
 /**
@@ -411,6 +413,54 @@ rte_compressdev_enqueue_burst(uint8_t dev_id, uint16_t 
qp_id,
dev->data->queue_pairs[qp_id], ops, nb_ops);
 }
 
+/**
+ * This should alloc a stream from the device's mempool and initialise it.
+ * The application should call this API when setting up for the stateful
+ * processing of a set of data on a device. The API can be called multiple
+ * times to set up a stream for each data set. The handle returned is only for
+ * use with ops of op_type STATEFUL and must be passed to the PMD
+ * with every op in the data stream
+ *
+ * @param dev_id
+ *   Compress device identifier
+ * @param xform
+ *   xform data
+ * @param stream
+ *   Pointer to where PMD's private stream handle should be stored
+ *
+ * @return
+ *  - 0 if successful and valid stream handle
+ *  - <0 in error cases
+ *  - Returns -EINVAL if input parameters are invalid.
+ *  - Returns -ENOTSUP if comp device does not support STATEFUL operations.
+ *  - Returns -ENOTSUP if comp device does not support the comp transform.
+ *  - Returns -ENOMEM if the private stream could not be allocated.
+ *
+ */
+int __rte_experimental
+rte_compressdev_stream_create(uint8_t dev_id,
+   const struct rte_comp_xform *xform,
+   void **stream);
+
+/**
+ * This should clear the stream and return it to the device's mempool.
+ *
+ * @param dev_id
+ *   Compress device identifier
+ *
+ * @param stream
+ *   PMD's private stream data
+ *
+ * @return
+ *  - 0 if successful
+ *  - <0 in error cases
+ *  - Returns -EINVAL if input parameters are invalid.
+ *  - Returns -ENOTSUP if comp device does not support STATEFUL operations.
+ *  - Returns -EBUSY if can't free stream as there are inflight operations
+ */
+int __rte_experimental
+rte_compressdev_stream_free(uint8_t dev_id, void *stream);
+
 /**
  * This should alloc a private_xform from the device's mempool and initialise
  * it. The application should call this A

[dpdk-dev] [PATCH v4 08/13] compressdev: support hash operations

2018-04-08 Thread Pablo de Lara
From: Shally Verma 

- Added hash algo enumeration and params in xform and rte_comp_op
- Updated compress/decompress xform to input hash algorithm
- Updated struct rte_comp_op to input hash buffer

Through the capability query, the user will learn which hashes are
supported via the device info comp_feature_flag. If supported, the
application can set the desired algorithm enumeration in the xform
structure and pass a valid hash buffer during enqueue_burst().
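
A hedged sketch of that setup (the .compress field path and the use of
rte_malloc_virt2iova() are assumptions; only the fields added by this
patch are shown in the diff below):

#include <rte_malloc.h>

static void
hash_setup_sketch(struct rte_comp_xform *xform, struct rte_comp_op *op,
        uint8_t *digest_buf)
{
    /* Assumed field path: compress sub-xform reachable as xform->compress */
    xform->compress.hash_algo = RTE_COMP_HASH_ALGO_SHA1;

    /* The digest buffer must be contiguous and large enough for the
     * algorithm (20 bytes for SHA1); it holds a valid value only after
     * an op with flush flag FLUSH_FULL/FLUSH_FINAL succeeds */
    op->hash.digest = digest_buf;
    op->hash.iova_addr = rte_malloc_virt2iova(digest_buf);
}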

Signed-off-by: Shally Verma 
Signed-off-by: Sunila Sahu 
Signed-off-by: Ashish Gupta 
---
 lib/librte_compressdev/rte_comp.h | 32 
 1 file changed, 32 insertions(+)

diff --git a/lib/librte_compressdev/rte_comp.h 
b/lib/librte_compressdev/rte_comp.h
index 4cf84c5db..055f2ee75 100644
--- a/lib/librte_compressdev/rte_comp.h
+++ b/lib/librte_compressdev/rte_comp.h
@@ -55,6 +55,17 @@ enum rte_comp_algorithm {
RTE_COMP_ALGO_LIST_END
 };
 
+/** Compression Hash Algorithms */
+enum rte_comp_hash_algorithm {
+   RTE_COMP_HASH_ALGO_UNSPECIFIED = 0,
+   /**< No hash */
+   RTE_COMP_HASH_ALGO_SHA1,
+   /**< SHA1 hash algorithm */
+   RTE_COMP_HASH_ALGO_SHA2_256,
+   /**< SHA256 hash algorithm of SHA2 family */
+   RTE_COMP_HASH_ALGO_LIST_END
+};
+
 /**< Compression Level.
  * The number is interpreted by each PMD differently. However, lower numbers
  * give fastest compression, at the expense of compression ratio while
@@ -162,6 +173,10 @@ struct rte_comp_compress_xform {
 */
enum rte_comp_checksum_type chksum;
/**< Type of checksum to generate on the uncompressed data */
+   enum rte_comp_hash_algorithm hash_algo;
+   /**< Hash algorithm to be used with compress operation. Hash is always
+* done on plaintext.
+*/
 };
 
 /**
@@ -177,6 +192,10 @@ struct rte_comp_decompress_xform {
 * compressed data. If window size can't be supported by the PMD then
 * setup of stream or private_xform should fail.
 */
+   enum rte_comp_hash_algorithm hash_algo;
+   /**< Hash algorithm to be used with decompress operation. Hash is always
+* done on plaintext.
+*/
 };
 
 /**
@@ -268,6 +287,19 @@ struct rte_comp_op {
 * decompress direction.
 */
} dst;
+   struct {
+   uint8_t *digest;
+   /**< Output buffer to store hash output, if enabled in xform.
+* Buffer would contain valid value only after an op with
+* flush flag = RTE_COMP_FLUSH_FULL/FLUSH_FINAL is processed
+* successfully.
+*
+* The buffer should be contiguous and large enough to
+* accommodate the digest produced by the specific hash algo.
+*/
+   rte_iova_t iova_addr;
+   /**< IO address of the buffer */
+   } hash;
enum rte_comp_flush_flag flush_flag;
/**< Defines flush characteristics for the output data.
 * Only applicable in compress direction
-- 
2.14.3



[dpdk-dev] [PATCH v4 05/13] compressdev: add operation management

2018-04-08 Thread Pablo de Lara
From: Fiona Trahe 

Added functions to allocate and free compression operations.
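
A hedged allocation sketch (pool sizing values are arbitrary; the matching
free helper is not visible in this excerpt, the underlying mechanism is
rte_mempool_put() on op->mempool):

static struct rte_comp_op *
op_alloc_sketch(int socket_id)
{
    /* 8192 elements, 128 per-lcore cache, no per-op user data */
    struct rte_mempool *pool = rte_comp_op_pool_create("comp_op_pool",
            8192, 128, 0, socket_id);

    if (pool == NULL)
        return NULL;

    return rte_comp_op_alloc(pool); /* resets the op; NULL if pool empty */
}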

Signed-off-by: Fiona Trahe 
Signed-off-by: Pablo de Lara 
Signed-off-by: Shally Verma 
Signed-off-by: Ashish Gupta 
---
 lib/librte_compressdev/rte_comp.h  | 195 +
 lib/librte_compressdev/rte_compressdev.c   |  72 
 lib/librte_compressdev/rte_compressdev_version.map |   1 +
 3 files changed, 268 insertions(+)

diff --git a/lib/librte_compressdev/rte_comp.h 
b/lib/librte_compressdev/rte_comp.h
index cf0f3c999..4cf84c5db 100644
--- a/lib/librte_compressdev/rte_comp.h
+++ b/lib/librte_compressdev/rte_comp.h
@@ -305,6 +305,201 @@ struct rte_comp_op {
 */
 } __rte_cache_aligned;
 
+
+/**
+ * Reset the fields of an operation to their default values.
+ *
+ * @note The private data associated with the operation is not zeroed.
+ *
+ * @param op
+ *   The operation to be reset
+ */
+static inline void
+__rte_comp_op_reset(struct rte_comp_op *op)
+{
+   struct rte_mempool *tmp_mp = op->mempool;
+   rte_iova_t tmp_iova_addr = op->iova_addr;
+
+   memset(op, 0, sizeof(struct rte_comp_op));
+   op->status = RTE_COMP_OP_STATUS_NOT_PROCESSED;
+   op->iova_addr = tmp_iova_addr;
+   op->mempool = tmp_mp;
+}
+
+/**
+ * Private data structure belonging to an operation pool.
+ */
+struct rte_comp_op_pool_private {
+   uint16_t user_size;
+   /**< Size of private user data with each operation. */
+};
+
+
+/**
+ * Returns the size of private user data allocated with each object in
+ * the mempool
+ *
+ * @param mempool
+ *   Mempool for operations
+ * @return
+ *   user data size
+ */
+static inline uint16_t
+__rte_comp_op_get_user_data_size(struct rte_mempool *mempool)
+{
+   struct rte_comp_op_pool_private *priv =
+   (struct rte_comp_op_pool_private *)rte_mempool_get_priv(mempool);
+
+   return priv->user_size;
+}
+
+
+/**
+ * Creates an operation pool
+ *
+ * @param name
+ *   Compress pool name
+ * @param nb_elts
+ *   Number of elements in pool
+ * @param cache_size
+ *   Number of elements to cache on lcore, see
+ *   *rte_mempool_create* for further details about cache size
+ * @param user_size
+ *   Size of private data to allocate for user with each operation
+ * @param socket_id
+ *   Socket identifier to allocate memory on
+ * @return
+ *  - On success pointer to mempool
+ *  - On failure NULL
+ */
+struct rte_mempool *
+rte_comp_op_pool_create(const char *name,
+   unsigned int nb_elts, unsigned int cache_size,
+   uint16_t user_size, int socket_id);
+
+/**
+ * Bulk allocate raw element from mempool and return as comp operations
+ *
+ * @param mempool
+ *   Compress operation mempool
+ * @param ops
+ *   Array to place allocated operations
+ * @param nb_ops
+ *   Number of operations to allocate
+ * @return
+ * - On success returns the number of ops allocated
+ */
+static inline int
+__rte_comp_op_raw_bulk_alloc(struct rte_mempool *mempool,
+   struct rte_comp_op **ops, uint16_t nb_ops)
+{
+   if (rte_mempool_get_bulk(mempool, (void **)ops, nb_ops) == 0)
+   return nb_ops;
+
+   return 0;
+}
+
+/**
+ * Allocate an operation from a mempool with default parameters set
+ *
+ * @param mempool
+ *   Compress operation mempool
+ *
+ * @return
+ * - On success returns a valid rte_comp_op structure
+ * - On failure returns NULL
+ */
+static inline struct rte_comp_op *
+rte_comp_op_alloc(struct rte_mempool *mempool)
+{
+   struct rte_comp_op *op = NULL;
+   int retval;
+
+   retval = __rte_comp_op_raw_bulk_alloc(mempool, &op, 1);
+   if (unlikely(retval != 1))
+   return NULL;
+
+   __rte_comp_op_reset(op);
+
+   return op;
+}
+
+
+/**
+ * Bulk allocate operations from a mempool with default parameters set
+ *
+ * @param mempool
+ *   Compress operation mempool
+ * @param ops
+ *   Array to place allocated operations
+ * @param nb_ops
+ *   Number of operations to allocate
+ * @return
+ * - nb_ops if the number of operations requested were allocated.
+ * - 0 if the requested number of ops are not available.
+ *   None are allocated in this case.
+ */
+static inline unsigned
+rte_comp_op_bulk_alloc(struct rte_mempool *mempool,
+   struct rte_comp_op **ops, uint16_t nb_ops)
+{
+   int i;
+
+   if (unlikely(__rte_comp_op_raw_bulk_alloc(mempool, ops, nb_ops)
+   != nb_ops))
+   return 0;
+
+   for (i = 0; i < nb_ops; i++)
+   __rte_comp_op_reset(ops[i]);
+
+   return nb_ops;
+}
+
+
+
+/**
+ * Returns a pointer to the private user data of an operation if
+ * that operation has enough capacity for requested size.
+ *
+ * @param op
+ *   Compress operation
+ * @param size
+ *   Size of space requested in private data
+ * @return
+ * - if sufficient space available returns pointer to start of user data
+ * - if insufficient space returns NULL
+ */
+static inline void *
+__rte_comp_op_get_

[dpdk-dev] [PATCH v4 06/13] compressdev: support stateless operations

2018-04-08 Thread Pablo de Lara
From: Fiona Trahe 

Added private transform data (priv_xform) to the compression
operation, which will contain the private data from each
PMD to support stateless operations.
Also, added functions to create/free this data.
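
A hedged setup sketch (names as declared in this patch; the shareability
check refers to the feature flag added later in this series):

static void *
stateless_xform_sketch(uint8_t dev_id, const struct rte_comp_xform *xform)
{
    void *priv_xform = NULL;

    if (rte_compressdev_private_xform_create(dev_id, xform,
            &priv_xform) < 0)
        return NULL;

    /* If the PMD does not advertise shareable priv_xforms, create one
     * handle per expected in-flight op instead of reusing this one */
    return priv_xform;
}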

Signed-off-by: Fiona Trahe 
Signed-off-by: Pablo de Lara 
Signed-off-by: Shally Verma 
Signed-off-by: Ashish Gupta 
---
 lib/librte_compressdev/rte_compressdev.c   | 48 ++
 lib/librte_compressdev/rte_compressdev.h   | 48 ++
 lib/librte_compressdev/rte_compressdev_pmd.h   | 40 ++
 lib/librte_compressdev/rte_compressdev_version.map |  2 +
 4 files changed, 138 insertions(+)

diff --git a/lib/librte_compressdev/rte_compressdev.c 
b/lib/librte_compressdev/rte_compressdev.c
index 0dab92650..3843a6bbf 100644
--- a/lib/librte_compressdev/rte_compressdev.c
+++ b/lib/librte_compressdev/rte_compressdev.c
@@ -530,6 +530,54 @@ rte_compressdev_info_get(uint8_t dev_id, struct 
rte_compressdev_info *dev_info)
dev_info->driver_name = dev->device->driver->name;
 }
 
+int __rte_experimental
+rte_compressdev_private_xform_create(uint8_t dev_id,
+   const struct rte_comp_xform *xform,
+   void **priv_xform)
+{
+   struct rte_compressdev *dev;
+   int ret;
+
+   dev = rte_compressdev_pmd_get_dev(dev_id);
+
+   if (xform == NULL || priv_xform == NULL || dev == NULL)
+   return -EINVAL;
+
+   RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->private_xform_create, -ENOTSUP);
+   ret = (*dev->dev_ops->private_xform_create)(dev, xform, priv_xform);
+   if (ret < 0) {
+   COMPRESSDEV_LOG(ERR,
+   "dev_id %d failed to create private_xform: err=%d",
+   dev_id, ret);
+   return ret;
+   }
+
+   return 0;
+}
+
+int __rte_experimental
+rte_compressdev_private_xform_free(uint8_t dev_id, void *priv_xform)
+{
+   struct rte_compressdev *dev;
+   int ret;
+
+   dev = rte_compressdev_pmd_get_dev(dev_id);
+
+   if (dev == NULL || priv_xform == NULL)
+   return -EINVAL;
+
+   RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->private_xform_free, -ENOTSUP);
+   ret = dev->dev_ops->private_xform_free(dev, priv_xform);
+   if (ret < 0) {
+   COMPRESSDEV_LOG(ERR,
+   "dev_id %d failed to free private xform: err=%d",
+   dev_id, ret);
+   return ret;
+   }
+
+   return 0;
+}
+
 /** Initialise rte_comp_op mempool element */
 static void
 rte_comp_op_init(struct rte_mempool *mempool,
diff --git a/lib/librte_compressdev/rte_compressdev.h 
b/lib/librte_compressdev/rte_compressdev.h
index 06f9ee135..917c0d764 100644
--- a/lib/librte_compressdev/rte_compressdev.h
+++ b/lib/librte_compressdev/rte_compressdev.h
@@ -125,6 +125,8 @@ struct rte_compressdev_config {
/**< Socket on which to allocate resources */
uint16_t nb_queue_pairs;
/**< Total number of queue pairs to configure on a device */
+   uint16_t max_nb_priv_xforms;
+   /**< Max number of private_xforms which will be created on the device */
 };
 
 /**
@@ -409,6 +411,52 @@ rte_compressdev_enqueue_burst(uint8_t dev_id, uint16_t 
qp_id,
dev->data->queue_pairs[qp_id], ops, nb_ops);
 }
 
+/**
+ * This should alloc a private_xform from the device's mempool and initialise
+ * it. The application should call this API when setting up for stateless
+ * processing on a device. If it returns non-shareable, then the appl cannot
+ * share this handle with multiple in-flight ops and should call this API again
+ * to get a separate handle for every in-flight op.
+ * The handle returned is only valid for use with ops of op_type STATELESS.
+ *
+ * @param dev_id
+ *   Compress device identifier
+ * @param xform
+ *   xform data
+ * @param private_xform
+ *   Pointer to where PMD's private_xform handle should be stored
+ *
+ * @return
+ *  - if successful returns 0
+ *and valid private_xform handle
+ *  - <0 in error cases
+ *  - Returns -EINVAL if input parameters are invalid.
+ *  - Returns -ENOTSUP if comp device does not support the comp transform.
+ *  - Returns -ENOMEM if the private_xform could not be allocated.
+ */
+int __rte_experimental
+rte_compressdev_private_xform_create(uint8_t dev_id,
+   const struct rte_comp_xform *xform,
+   void **private_xform);
+
+/**
+ * This should clear the private_xform and return it to the device's mempool.
+ *
+ * @param dev_id
+ *   Compress device identifier
+ *
+ * @param private_xform
+ *   PMD's private_xform data
+ *
+ * @return
+ *  - 0 if successful
+ *  - <0 in error cases
+ *  - Returns -EINVAL if input parameters are invalid.
+ *  - Returns -EBUSY if can't free private_xform due to inflight operations
+ */
+int __rte_experimental
+rte_compressdev_private_xform_free(uint8_t dev_id, void *private_xform);
+
 /**
  * Provide driver identifier.
  *
diff --git a/li

[dpdk-dev] [PATCH v4 03/13] compressdev: add compression specific data

2018-04-08 Thread Pablo de Lara
From: Fiona Trahe 

Added structures and enums specific to compression,
including the compression operation structure and the
different supported algorithms, checksums and compression
levels.
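
For illustration, a hedged sketch of filling a DEFLATE compress xform with
the enums and macros added here (the field paths, e.g. .compress and
.deflate.huffman, are assumptions not visible in this excerpt):

static void
deflate_xform_sketch(struct rte_comp_xform *xform)
{
    xform->compress.algo = RTE_COMP_ALGO_DEFLATE;
    xform->compress.deflate.huffman = RTE_COMP_HUFFMAN_DYNAMIC;
    xform->compress.level = RTE_COMP_LEVEL_PMD_DEFAULT;
    xform->compress.chksum = RTE_COMP_CHECKSUM_CRC32;
}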

Signed-off-by: Fiona Trahe 
Signed-off-by: Pablo de Lara 
Signed-off-by: Shally Verma 
Signed-off-by: Ashish Gupta 
---
 lib/librte_compressdev/Makefile|   1 +
 lib/librte_compressdev/meson.build |   3 +-
 lib/librte_compressdev/rte_comp.h  | 312 +
 3 files changed, 315 insertions(+), 1 deletion(-)
 create mode 100644 lib/librte_compressdev/rte_comp.h

diff --git a/lib/librte_compressdev/Makefile b/lib/librte_compressdev/Makefile
index 5f67ab817..6f1546afd 100644
--- a/lib/librte_compressdev/Makefile
+++ b/lib/librte_compressdev/Makefile
@@ -19,6 +19,7 @@ LDLIBS += -lrte_eal -lrte_mempool -lrte_kvargs
 SRCS-y += rte_compressdev.c rte_compressdev_pmd.c
 
 # export include files
+SYMLINK-y-include += rte_comp.h
 SYMLINK-y-include += rte_compressdev.h
 SYMLINK-y-include += rte_compressdev_pmd.h
 
diff --git a/lib/librte_compressdev/meson.build 
b/lib/librte_compressdev/meson.build
index fc5eaf009..a72d4cea5 100644
--- a/lib/librte_compressdev/meson.build
+++ b/lib/librte_compressdev/meson.build
@@ -4,5 +4,6 @@
 allow_experimental_apis = true
 sources = files('rte_compressdev.c', 'rte_compressdev_pmd.c')
 headers = files('rte_compressdev.h',
-   'rte_compressdev_pmd.h')
+   'rte_compressdev_pmd.h',
+   'rte_comp.h')
 deps += ['kvargs', 'mbuf']
diff --git a/lib/librte_compressdev/rte_comp.h 
b/lib/librte_compressdev/rte_comp.h
new file mode 100644
index 0..cf0f3c999
--- /dev/null
+++ b/lib/librte_compressdev/rte_comp.h
@@ -0,0 +1,312 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#ifndef _RTE_COMP_H_
+#define _RTE_COMP_H_
+
+/**
+ * @file rte_comp.h
+ *
+ * RTE definitions for Data Compression Service
+ *
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include 
+#include 
+
+/** Status of comp operation */
+enum rte_comp_op_status {
+   RTE_COMP_OP_STATUS_SUCCESS = 0,
+   /**< Operation completed successfully */
+   RTE_COMP_OP_STATUS_NOT_PROCESSED,
+   /**< Operation has not yet been processed by the device */
+   RTE_COMP_OP_STATUS_INVALID_ARGS,
+   /**< Operation failed due to invalid arguments in request */
+   RTE_COMP_OP_STATUS_ERROR,
+   /**< Error handling operation */
+   RTE_COMP_OP_STATUS_INVALID_STATE,
+   /**< Operation is invoked in invalid state */
+   RTE_COMP_OP_STATUS_OUT_OF_SPACE,
+   /**< Output buffer ran out of space before operation completed */
+};
+
+/** Compression Algorithms */
+enum rte_comp_algorithm {
+   RTE_COMP_ALGO_UNSPECIFIED = 0,
+   /**< No compression algorithm */
+   RTE_COMP_ALGO_NULL,
+   /**< No compression.
+* Pass-through, data is copied unchanged from source buffer to
+* destination buffer.
+*/
+   RTE_COMP_ALGO_DEFLATE,
+   /**< DEFLATE compression algorithm
+* https://tools.ietf.org/html/rfc1951
+*/
+   RTE_COMP_ALGO_LZS,
+   /**< LZS compression algorithm
+* https://tools.ietf.org/html/rfc2395
+*/
+   RTE_COMP_ALGO_LIST_END
+};
+
+/**< Compression Level.
+ * The number is interpreted by each PMD differently. However, lower numbers
+ * give fastest compression, at the expense of compression ratio while
+ * higher numbers may give better compression ratios but are likely slower.
+ */
+#define RTE_COMP_LEVEL_PMD_DEFAULT  (-1)
+/** Use PMD Default */
+#define RTE_COMP_LEVEL_NONE (0)
+/** Output uncompressed blocks if supported by the specified algorithm */
+#define RTE_COMP_LEVEL_MIN (1)
+/** Use minimum compression level supported by the PMD */
+#define RTE_COMP_LEVEL_MAX (9)
+/** Use maximum compression level supported by the PMD */
+
+/** Compression checksum types */
+enum rte_comp_checksum_type {
+   RTE_COMP_CHECKSUM_NONE,
+   /**< No checksum generated */
+   RTE_COMP_CHECKSUM_CRC32,
+   /**< Generates a CRC32 checksum, as used by gzip */
+   RTE_COMP_CHECKSUM_ADLER32,
+   /**< Generates an Adler-32 checksum, as used by zlib */
+   RTE_COMP_CHECKSUM_CRC32_ADLER32,
+   /**< Generates both Adler-32 and CRC32 checksums, concatenated.
+* CRC32 is in the lower 32bits, Adler-32 in the upper 32 bits.
+*/
+};
+
+
+/** Compression Huffman Type - used by DEFLATE algorithm */
+enum rte_comp_huffman {
+   RTE_COMP_HUFFMAN_DEFAULT,
+   /**< PMD may choose which Huffman codes to use */
+   RTE_COMP_HUFFMAN_FIXED,
+   /**< Use Fixed Huffman codes */
+   RTE_COMP_HUFFMAN_DYNAMIC,
+   /**< Use Dynamic Huffman codes */
+};
+
+enum rte_comp_flush_flag {
+   RTE_COMP_FLUSH_NONE,
+   /**< Data is not flushed. Output may remain in the compressor and be
+* processed during a following op. It may not be po

[dpdk-dev] [PATCH v4 04/13] compressdev: add enqueue/dequeue functions

2018-04-08 Thread Pablo de Lara
From: Fiona Trahe 

Signed-off-by: Fiona Trahe 
Signed-off-by: Pablo de Lara 
Signed-off-by: Shally Verma 
Signed-off-by: Ashish Gupta 
---
 lib/librte_compressdev/rte_compressdev.h | 121 +++
 1 file changed, 121 insertions(+)

diff --git a/lib/librte_compressdev/rte_compressdev.h 
b/lib/librte_compressdev/rte_compressdev.h
index 883a5c273..06f9ee135 100644
--- a/lib/librte_compressdev/rte_compressdev.h
+++ b/lib/librte_compressdev/rte_compressdev.h
@@ -241,8 +241,23 @@ rte_compressdev_queue_pair_count(uint8_t dev_id);
 void __rte_experimental
 rte_compressdev_info_get(uint8_t dev_id, struct rte_compressdev_info 
*dev_info);
 
+
+typedef uint16_t (*compress_dequeue_pkt_burst_t)(void *qp,
+   struct rte_comp_op **ops, uint16_t nb_ops);
+/**< Dequeue processed packets from queue pair of a device. */
+
+typedef uint16_t (*compress_enqueue_pkt_burst_t)(void *qp,
+   struct rte_comp_op **ops, uint16_t nb_ops);
+/**< Enqueue packets for processing on queue pair of a device. */
+
+
 /** The data structure associated with each comp device. */
 struct rte_compressdev {
+   compress_dequeue_pkt_burst_t dequeue_burst;
+   /**< Pointer to PMD receive function */
+   compress_enqueue_pkt_burst_t enqueue_burst;
+   /**< Pointer to PMD transmit function */
+
struct rte_compressdev_data *data;
/**< Pointer to device data */
struct rte_compressdev_ops *dev_ops;
@@ -288,6 +303,112 @@ struct rte_compressdev_data {
 
 struct rte_compressdev *rte_compressdevs;
 
+/**
+ *
+ * Dequeue a burst of processed compression operations from a queue on the comp
+ * device. The dequeued operations are stored in *rte_comp_op* structures
+ * whose pointers are supplied in the *ops* array.
+ *
+ * The rte_compressdev_dequeue_burst() function returns the number of ops
+ * actually dequeued, which is the number of *rte_comp_op* data structures
+ * effectively supplied into the *ops* array.
+ *
+ * A return value equal to *nb_ops* indicates that the queue contained
+ * at least *nb_ops* operations, and this is likely to signify that other
+ * processed operations remain in the devices output queue. Applications
+ * implementing a "retrieve as many processed operations as possible" policy
+ * can check this specific case and keep invoking the
+ * rte_compressdev_dequeue_burst() function until a value less than
+ * *nb_ops* is returned.
+ *
+ * The rte_compressdev_dequeue_burst() function does not provide any error
+ * notification to avoid the corresponding overhead.
+ *
+ * Note: operation ordering is not maintained within the queue pair.
+ *
+ * @param dev_id
+ *   Compress device identifier
+ * @param qp_id
+ *   The index of the queue pair from which to retrieve
+ *   processed operations. The value must be in the range
+ *   [0, nb_queue_pair - 1] previously supplied to
+ *   rte_compressdev_configure()
+ * @param ops
+ *   The address of an array of pointers to
+ *   *rte_comp_op* structures that must be
+ *   large enough to store *nb_ops* pointers in it
+ * @param nb_ops
+ *   The maximum number of operations to dequeue
+ * @return
+ *   - The number of operations actually dequeued, which is the number
+ *   of pointers to *rte_comp_op* structures effectively supplied to the
+ *   *ops* array.
+ */
+static inline uint16_t
+rte_compressdev_dequeue_burst(uint8_t dev_id, uint16_t qp_id,
+   struct rte_comp_op **ops, uint16_t nb_ops)
+{
+   struct rte_compressdev *dev = &rte_compressdevs[dev_id];
+
+   nb_ops = (*dev->dequeue_burst)
+   (dev->data->queue_pairs[qp_id], ops, nb_ops);
+
+   return nb_ops;
+}
+
+/**
+ * Enqueue a burst of operations for processing on a compression device.
+ *
+ * The rte_compressdev_enqueue_burst() function is invoked to place
+ * comp operations on the queue *qp_id* of the device designated by
+ * its *dev_id*.
+ *
+ * The *nb_ops* parameter is the number of operations to process which are
+ * supplied in the *ops* array of *rte_comp_op* structures.
+ *
+ * The rte_compressdev_enqueue_burst() function returns the number of
+ * operations it actually enqueued for processing. A return value equal to
+ * *nb_ops* means that all packets have been enqueued.
+ *
+ * @note All compression operations are Out-of-place (OOP) operations,
+ * as the size of the output data is different to the size of the input data.
+ *
+ * @note The flush flag only applies to operations which return SUCCESS.
+ * In the OUT_OF_SPACE case, whether STATEFUL or STATELESS, the data in the
+ * dest buffer is as if the flush flag was FLUSH_NONE.
+ * @note flush flag only applies in compression direction. It has no meaning
+ * for decompression.
+ * @note: operation ordering is not maintained within the queue pair.
+ *
+ * @param dev_id
+ *   Compress device identifier
+ * @param qp_id
+ *   The index of the queue pair on which operations
+ *   are to be enqueued for processing. The value
+ *   must be in the range [0, nb_queue_pa
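
A busy-poll sketch of the burst API documented above (assumes a configured
device and queue pair; per the notes, ordering within the queue pair is not
maintained, so the dequeued pointers may differ from the enqueue order):

static void
burst_poll_sketch(uint8_t dev_id, uint16_t qp_id,
        struct rte_comp_op **ops, uint16_t nb_ops)
{
    uint16_t enq = 0, deq = 0;

    /* Enqueue may accept fewer than nb_ops; resubmit the remainder */
    while (enq < nb_ops)
        enq += rte_compressdev_enqueue_burst(dev_id, qp_id,
                &ops[enq], nb_ops - enq);

    /* "Retrieve as many processed operations as possible": keep
     * polling until the whole burst has drained */
    while (deq < nb_ops)
        deq += rte_compressdev_dequeue_burst(dev_id, qp_id,
                &ops[deq], nb_ops - deq);
}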

[dpdk-dev] [PATCH v4 09/13] compressdev: add device feature flags

2018-04-08 Thread Pablo de Lara
From: Fiona Trahe 

Signed-off-by: Fiona Trahe 
Signed-off-by: Pablo de Lara 
Signed-off-by: Shally Verma 
Signed-off-by: Ashish Gupta 
---
 lib/librte_compressdev/rte_compressdev.c   | 21 ++
 lib/librte_compressdev/rte_compressdev.h   | 33 ++
 lib/librte_compressdev/rte_compressdev_version.map |  1 +
 3 files changed, 55 insertions(+)

diff --git a/lib/librte_compressdev/rte_compressdev.c 
b/lib/librte_compressdev/rte_compressdev.c
index 433973aaa..b962ccf23 100644
--- a/lib/librte_compressdev/rte_compressdev.c
+++ b/lib/librte_compressdev/rte_compressdev.c
@@ -56,6 +56,27 @@ static struct rte_compressdev_global compressdev_globals = {
 
 struct rte_compressdev_global *rte_compressdev_globals = &compressdev_globals;
 
+const char * __rte_experimental
+rte_compressdev_get_feature_name(uint64_t flag)
+{
+   switch (flag) {
+   case RTE_COMPDEV_FF_HW_ACCELERATED:
+   return "HW_ACCELERATED";
+   case RTE_COMPDEV_FF_CPU_SSE:
+   return "CPU_SSE";
+   case RTE_COMPDEV_FF_CPU_AVX:
+   return "CPU_AVX";
+   case RTE_COMPDEV_FF_CPU_AVX2:
+   return "CPU_AVX2";
+   case RTE_COMPDEV_FF_CPU_AVX512:
+   return "CPU_AVX512";
+   case RTE_COMPDEV_FF_CPU_NEON:
+   return "CPU_NEON";
+   default:
+   return NULL;
+   }
+}
+
 struct rte_compressdev * __rte_experimental
 rte_compressdev_pmd_get_dev(uint8_t dev_id)
 {
diff --git a/lib/librte_compressdev/rte_compressdev.h 
b/lib/librte_compressdev/rte_compressdev.h
index 0e148dd30..72c46dd96 100644
--- a/lib/librte_compressdev/rte_compressdev.h
+++ b/lib/librte_compressdev/rte_compressdev.h
@@ -34,10 +34,43 @@ extern int compressdev_logtype;
 #define RTE_COMPRESSDEV_NAME_MAX_LEN   (64)
 /**< Max length of name of comp PMD */
 
+/**
+ * compression device supported feature flags
+ *
+ * @note New features flags should be added to the end of the list
+ *
+ * Keep these flags synchronised with rte_compressdev_get_feature_name()
+ */
+#define RTE_COMPDEV_FF_HW_ACCELERATED   (1ULL << 0)
+/**< Operations are off-loaded to an external hardware accelerator */
+#define RTE_COMPDEV_FF_CPU_SSE  (1ULL << 1)
+/**< Utilises CPU SIMD SSE instructions */
+#define RTE_COMPDEV_FF_CPU_AVX  (1ULL << 2)
+/**< Utilises CPU SIMD AVX instructions */
+#define RTE_COMPDEV_FF_CPU_AVX2 (1ULL << 3)
+/**< Utilises CPU SIMD AVX2 instructions */
+#define RTE_COMPDEV_FF_CPU_AVX512   (1ULL << 4)
+/**< Utilises CPU SIMD AVX512 instructions */
+#define RTE_COMPDEV_FF_CPU_NEON (1ULL << 5)
+/**< Utilises CPU NEON instructions */
+
+/**
+ * Get the name of a compress device feature flag.
+ *
+ * @param flag
+ *   The mask describing the flag
+ *
+ * @return
+ *   The name of this flag, or NULL if it's not a valid feature flag.
+ */
+const char * __rte_experimental
+rte_compressdev_get_feature_name(uint64_t flag);
+
 /**  comp device information */
 struct rte_compressdev_info {
const char *driver_name;/**< Driver name. */
uint8_t driver_id;  /**< Driver identifier */
+   uint64_t feature_flags; /**< Feature flags */
uint16_t max_nb_queue_pairs;
	/**< Maximum number of queue pairs supported by device.
 * (If 0, there is no limit in maximum number of queue pairs)
diff --git a/lib/librte_compressdev/rte_compressdev_version.map 
b/lib/librte_compressdev/rte_compressdev_version.map
index 58cb5205a..01ddc7a07 100644
--- a/lib/librte_compressdev/rte_compressdev_version.map
+++ b/lib/librte_compressdev/rte_compressdev_version.map
@@ -12,6 +12,7 @@ EXPERIMENTAL {
rte_compressdev_devices_get;
rte_compressdev_driver_id_get;
rte_compressdev_driver_name_get;
+   rte_compressdev_get_feature_name;
rte_compressdev_info_get;
rte_compressdev_is_valid_dev;
rte_compressdev_name_get;
-- 
2.14.3
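
A hedged sketch of reporting a device's feature flags with the helper added
in this patch (relies on the feature_flags field added to
rte_compressdev_info above):

#include <stdio.h>
#include <rte_compressdev.h>

static void
print_features_sketch(uint8_t dev_id)
{
    struct rte_compressdev_info info;
    uint64_t flag;

    rte_compressdev_info_get(dev_id, &info);
    for (flag = 1; flag != 0; flag <<= 1) {
        const char *name = rte_compressdev_get_feature_name(flag);

        if (name != NULL && (info.feature_flags & flag))
            printf("%s\n", name);
    }
}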



[dpdk-dev] [PATCH v4 12/13] compressdev: add device capabilities

2018-04-08 Thread Pablo de Lara
From: Fiona Trahe 

Added a structure which each PMD will fill out,
providing the capabilities of each driver
(mainly which compression services it supports).
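
A hedged capability-check sketch using the new structure (handling of
window_size.increment is omitted for brevity):

static int
deflate_window_ok_sketch(uint8_t dev_id, uint8_t log2_window)
{
    const struct rte_compressdev_capabilities *cap =
        rte_compressdev_capability_get(dev_id, RTE_COMP_ALGO_DEFLATE);

    if (cap == NULL)
        return 0; /* DEFLATE not supported by this device */

    return log2_window >= cap->window_size.min &&
        log2_window <= cap->window_size.max;
}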

Signed-off-by: Fiona Trahe 
Signed-off-by: Pablo de Lara 
Signed-off-by: Shally Verma 
Signed-off-by: Ashish Gupta 
---
 lib/librte_compressdev/rte_compressdev.c   | 23 ++
 lib/librte_compressdev/rte_compressdev.h   | 35 ++
 lib/librte_compressdev/rte_compressdev_version.map |  1 +
 3 files changed, 59 insertions(+)

diff --git a/lib/librte_compressdev/rte_compressdev.c 
b/lib/librte_compressdev/rte_compressdev.c
index 15811cd6e..4435801fd 100644
--- a/lib/librte_compressdev/rte_compressdev.c
+++ b/lib/librte_compressdev/rte_compressdev.c
@@ -56,6 +56,29 @@ static struct rte_compressdev_global compressdev_globals = {
 
 struct rte_compressdev_global *rte_compressdev_globals = &compressdev_globals;
 
+const struct rte_compressdev_capabilities * __rte_experimental
+rte_compressdev_capability_get(uint8_t dev_id,
+   enum rte_comp_algorithm algo)
+{
+   const struct rte_compressdev_capabilities *capability;
+   struct rte_compressdev_info dev_info;
+   int i = 0;
+
+   if (dev_id >= compressdev_globals.nb_devs) {
+   COMPRESSDEV_LOG(ERR, "Invalid dev_id=%d", dev_id);
+   return NULL;
+   }
+   rte_compressdev_info_get(dev_id, &dev_info);
+
+   while ((capability = &dev_info.capabilities[i++])->algo !=
+   RTE_COMP_ALGO_UNSPECIFIED){
+   if (capability->algo == algo)
+   return capability;
+   }
+
+   return NULL;
+}
+
 const char * __rte_experimental
 rte_compressdev_get_feature_name(uint64_t flag)
 {
diff --git a/lib/librte_compressdev/rte_compressdev.h 
b/lib/librte_compressdev/rte_compressdev.h
index f8aab528c..e9cb212ec 100644
--- a/lib/librte_compressdev/rte_compressdev.h
+++ b/lib/librte_compressdev/rte_compressdev.h
@@ -34,6 +34,39 @@ extern int compressdev_logtype;
 #define RTE_COMPRESSDEV_NAME_MAX_LEN   (64)
 /**< Max length of name of comp PMD */
 
+/**
+ * Parameter log base 2 range description.
+ * Final value will be 2^value.
+ */
+struct rte_param_log2_range {
+   uint8_t min;/**< Minimum log2 value */
+   uint8_t max;/**< Maximum log2 value */
+   uint8_t increment;
+   /**< If a range of sizes is supported,
+* this parameter is used to indicate
+* increments in base 2 log byte value
+* that are supported between the minimum and maximum
+*/
+};
+
+/** Structure used to capture a capability of a comp device */
+struct rte_compressdev_capabilities {
+   enum rte_comp_algorithm algo;
+   /**< Compression algorithm */
+   uint64_t comp_feature_flags;
+   /**< Bitmask of flags for compression service features */
+   struct rte_param_log2_range window_size;
+   /**< Window size range in base two log byte values */
+};
+
+/** Macro used at end of comp PMD list */
+#define RTE_COMP_END_OF_CAPABILITIES_LIST() \
+   { RTE_COMP_ALGO_UNSPECIFIED }
+
+const struct rte_compressdev_capabilities * __rte_experimental
+rte_compressdev_capability_get(uint8_t dev_id,
+   enum rte_comp_algorithm algo);
+
 /**
  * compression device supported feature flags
  *
@@ -116,6 +149,8 @@ struct rte_compressdev_info {
const char *driver_name;/**< Driver name. */
uint8_t driver_id;  /**< Driver identifier */
uint64_t feature_flags; /**< Feature flags */
+   const struct rte_compressdev_capabilities *capabilities;
+   /**< Array of devices supported capabilities */
uint16_t max_nb_queue_pairs;
	/**< Maximum number of queue pairs supported by device.
 * (If 0, there is no limit in maximum number of queue pairs)
diff --git a/lib/librte_compressdev/rte_compressdev_version.map 
b/lib/librte_compressdev/rte_compressdev_version.map
index 7bdc58a38..dec73fcff 100644
--- a/lib/librte_compressdev/rte_compressdev_version.map
+++ b/lib/librte_compressdev/rte_compressdev_version.map
@@ -5,6 +5,7 @@ EXPERIMENTAL {
rte_compressdev_allocate_driver;
rte_compressdev_callback_register;
rte_compressdev_callback_unregister;
+   rte_compressdev_capability_get;
rte_compressdev_close;
rte_compressdev_configure;
rte_compressdev_count;
-- 
2.14.3



[dpdk-dev] [PATCH v4 10/13] compressdev: add compression service feature flags

2018-04-08 Thread Pablo de Lara
From: Fiona Trahe 

Signed-off-by: Fiona Trahe 
Signed-off-by: Pablo de Lara 
Signed-off-by: Shally Verma 
Signed-off-by: Ashish Gupta 
---
 lib/librte_compressdev/rte_compressdev.c   | 31 +++
 lib/librte_compressdev/rte_compressdev.h   | 45 ++
 lib/librte_compressdev/rte_compressdev_version.map |  1 +
 3 files changed, 77 insertions(+)

diff --git a/lib/librte_compressdev/rte_compressdev.c 
b/lib/librte_compressdev/rte_compressdev.c
index b962ccf23..43a26747b 100644
--- a/lib/librte_compressdev/rte_compressdev.c
+++ b/lib/librte_compressdev/rte_compressdev.c
@@ -77,6 +77,37 @@ rte_compressdev_get_feature_name(uint64_t flag)
}
 }
 
+const char * __rte_experimental
+rte_comp_get_feature_name(uint64_t flag)
+{
+   switch (flag) {
+   case RTE_COMP_FF_STATEFUL_COMPRESSION:
+   return "STATEFUL_COMPRESSION";
+   case RTE_COMP_FF_STATEFUL_DECOMPRESSION:
+   return "STATEFUL_DECOMPRESSION";
+   case RTE_COMP_FF_MBUF_SCATTER_GATHER:
+   return "MBUF_SCATTER_GATHER";
+   case RTE_COMP_FF_MULTI_PKT_CHECKSUM:
+   return "MULTI_PKT_CHECKSUM";
+   case RTE_COMP_FF_ADLER32_CHECKSUM:
+   return "ADLER32_CHECKSUM";
+   case RTE_COMP_FF_CRC32_CHECKSUM:
+   return "CRC32_CHECKSUM";
+   case RTE_COMP_FF_CRC32_ADLER32_CHECKSUM:
+   return "CRC32_ADLER32_CHECKSUM";
+   case RTE_COMP_FF_NONCOMPRESSED_BLOCKS:
+   return "NONCOMPRESSED_BLOCKS";
+   case RTE_COMP_FF_SHA1_HASH:
+   return "SHA1_HASH";
+   case RTE_COMP_FF_SHA2_SHA256_HASH:
+   return "SHA2_SHA256_HASH";
+   case RTE_COMP_FF_SHAREABLE_PRIV_XFORM:
+   return "SHAREABLE_PRIV_XFORM";
+   default:
+   return NULL;
+   }
+}
+
 struct rte_compressdev * __rte_experimental
 rte_compressdev_pmd_get_dev(uint8_t dev_id)
 {
diff --git a/lib/librte_compressdev/rte_compressdev.h 
b/lib/librte_compressdev/rte_compressdev.h
index 72c46dd96..02a6c531c 100644
--- a/lib/librte_compressdev/rte_compressdev.h
+++ b/lib/librte_compressdev/rte_compressdev.h
@@ -54,6 +54,39 @@ extern int compressdev_logtype;
 #define RTE_COMPDEV_FF_CPU_NEON (1ULL << 5)
 /**< Utilises CPU NEON instructions */
 
+/**
+ * compression service feature flags
+ *
+ * @note New features flags should be added to the end of the list
+ *
+ * Keep these flags synchronised with rte_comp_get_feature_name()
+ */
+#define RTE_COMP_FF_STATEFUL_COMPRESSION   (1ULL << 0)
+/**< Stateful compression is supported */
+#define RTE_COMP_FF_STATEFUL_DECOMPRESSION (1ULL << 1)
+/**< Stateful decompression is supported */
+#define RTE_COMP_FF_MBUF_SCATTER_GATHER (1ULL << 2)
+/**< Scatter-gather mbufs are supported */
+#define RTE_COMP_FF_ADLER32_CHECKSUM   (1ULL << 3)
+/**< Adler-32 Checksum is supported */
+#define RTE_COMP_FF_CRC32_CHECKSUM (1ULL << 4)
+/**< CRC32 Checksum is supported */
+#define RTE_COMP_FF_CRC32_ADLER32_CHECKSUM (1ULL << 5)
+/**< Adler-32/CRC32 Checksum is supported */
+#define RTE_COMP_FF_MULTI_PKT_CHECKSUM (1ULL << 6)
+/**< Generation of checksum across multiple stateless packets is supported */
+#define RTE_COMP_FF_SHA1_HASH  (1ULL << 7)
+/**< SHA1 Hash is supported */
+#define RTE_COMP_FF_SHA2_SHA256_HASH   (1ULL << 8)
+/**< SHA256 Hash of SHA2 family is supported */
+#define RTE_COMP_FF_NONCOMPRESSED_BLOCKS   (1ULL << 9)
+/**< Creation of non-compressed blocks using RTE_COMP_LEVEL_NONE is supported */
+#define RTE_COMP_FF_SHAREABLE_PRIV_XFORM   (1ULL << 10)
+/**< Private xforms created by the PMD can be shared
+ * across multiple stateless operations. If not set, the application needs
+ * to create as many priv_xforms as the number of ops expected in flight.
+ */
+
 /**
  * Get the name of a compress device feature flag.
  *
@@ -66,6 +99,18 @@ extern int compressdev_logtype;
 const char * __rte_experimental
 rte_compressdev_get_feature_name(uint64_t flag);
 
+/**
+ * Get the name of a compress service feature flag
+ *
+ * @param flag
+ *   The mask describing the flag
+ *
+ * @return
+ *   The name of this flag, or NULL if it's not a valid feature flag.
+ */
+const char * __rte_experimental
+rte_comp_get_feature_name(uint64_t flag);
+
 /**  comp device information */
 struct rte_compressdev_info {
const char *driver_name;/**< Driver name. */
diff --git a/lib/librte_compressdev/rte_compressdev_version.map 
b/lib/librte_compressdev/rte_compressdev_version.map
index 01ddc7a07..f538b9f57 100644
--- a/lib/librte_compressdev/rte_compressdev_version.map
+++ b/lib/librte_compressdev/rte_compressdev_version.map
@@ -33,6 +33,7 @@ EXPERIMENTAL {
rte_compressdev_stop;
rte_compressdev_stream_create;
rte_compressdev_stream_free;
+   rte_comp_get_feature_name;
rte_comp_op_pool_create;
 
 local: *;
-- 
2.14.3



[dpdk-dev] [PATCH v4 13/13] compressdev: get device id from name

2018-04-08 Thread Pablo de Lara
From: Fiona Trahe 

Added an API to retrieve the device id given the device name.
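
A minimal lookup sketch (the device name used here is purely hypothetical):

static int
lookup_dev_sketch(void)
{
    int dev_id = rte_compressdev_get_dev_id("compress_dummy");

    if (dev_id < 0)
        return -1; /* no attached compress device with that name */

    return dev_id;
}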

Signed-off-by: Fiona Trahe 
Signed-off-by: Pablo de Lara 
Signed-off-by: Shally Verma 
Signed-off-by: Ashish Gupta 
---
 lib/librte_compressdev/rte_compressdev.c   | 18 ++
 lib/librte_compressdev/rte_compressdev.h   | 13 +
 lib/librte_compressdev/rte_compressdev_version.map |  1 +
 3 files changed, 32 insertions(+)

diff --git a/lib/librte_compressdev/rte_compressdev.c 
b/lib/librte_compressdev/rte_compressdev.c
index 4435801fd..4ad13a0ec 100644
--- a/lib/librte_compressdev/rte_compressdev.c
+++ b/lib/librte_compressdev/rte_compressdev.c
@@ -173,6 +173,24 @@ rte_compressdev_is_valid_dev(uint8_t dev_id)
 }
 
 
+int __rte_experimental
+rte_compressdev_get_dev_id(const char *name)
+{
+   unsigned int i;
+
+   if (name == NULL)
+   return -1;
+
+   for (i = 0; i < rte_compressdev_globals->nb_devs; i++)
+   if ((strcmp(rte_compressdev_globals->devs[i].data->name, name)
+   == 0) &&
+   (rte_compressdev_globals->devs[i].attached ==
+   RTE_COMPRESSDEV_ATTACHED))
+   return i;
+
+   return -1;
+}
+
 uint8_t __rte_experimental
 rte_compressdev_count(void)
 {
diff --git a/lib/librte_compressdev/rte_compressdev.h 
b/lib/librte_compressdev/rte_compressdev.h
index e9cb212ec..1d1a072ef 100644
--- a/lib/librte_compressdev/rte_compressdev.h
+++ b/lib/librte_compressdev/rte_compressdev.h
@@ -170,6 +170,19 @@ struct rte_compressdev_stats {
/**< Total error count on operations dequeued */
 };
 
+
+/**
+ * Get the device identifier for the named compress device.
+ *
+ * @param name
+ *   Device name to select the device structure
+ * @return
+ *   - Returns compress device identifier on success.
+ *   - Return -1 on failure to find named compress device.
+ */
+int __rte_experimental
+rte_compressdev_get_dev_id(const char *name);
+
 /**
  * Get the compress device name given a device identifier.
  *
diff --git a/lib/librte_compressdev/rte_compressdev_version.map 
b/lib/librte_compressdev/rte_compressdev_version.map
index dec73fcff..46bdda88b 100644
--- a/lib/librte_compressdev/rte_compressdev_version.map
+++ b/lib/librte_compressdev/rte_compressdev_version.map
@@ -13,6 +13,7 @@ EXPERIMENTAL {
rte_compressdev_devices_get;
rte_compressdev_driver_id_get;
rte_compressdev_driver_name_get;
+   rte_compressdev_get_dev_id;
rte_compressdev_get_feature_name;
rte_compressdev_info_get;
rte_compressdev_is_valid_dev;
-- 
2.14.3



[dpdk-dev] [PATCH v4 11/13] compressdev: add device stats

2018-04-08 Thread Pablo de Lara
From: Fiona Trahe 

Signed-off-by: Fiona Trahe 
Signed-off-by: Pablo de Lara 
Signed-off-by: Shally Verma 
Signed-off-by: Ashish Gupta 
---
 lib/librte_compressdev/rte_compressdev.c   | 41 ++
 lib/librte_compressdev/rte_compressdev.h   | 39 
 lib/librte_compressdev/rte_compressdev_pmd.h   | 26 ++
 lib/librte_compressdev/rte_compressdev_version.map |  2 ++
 4 files changed, 108 insertions(+)

diff --git a/lib/librte_compressdev/rte_compressdev.c 
b/lib/librte_compressdev/rte_compressdev.c
index 43a26747b..15811cd6e 100644
--- a/lib/librte_compressdev/rte_compressdev.c
+++ b/lib/librte_compressdev/rte_compressdev.c
@@ -562,6 +562,47 @@ rte_compressdev_queue_pair_setup(uint8_t dev_id, uint16_t 
queue_pair_id,
max_inflight_ops, socket_id);
 }
 
+
+int __rte_experimental
+rte_compressdev_stats_get(uint8_t dev_id, struct rte_compressdev_stats *stats)
+{
+   struct rte_compressdev *dev;
+
+   if (!rte_compressdev_is_valid_dev(dev_id)) {
+   COMPRESSDEV_LOG(ERR, "Invalid dev_id=%d", dev_id);
+   return -ENODEV;
+   }
+
+   if (stats == NULL) {
+   COMPRESSDEV_LOG(ERR, "Invalid stats ptr");
+   return -EINVAL;
+   }
+
+   dev = &rte_comp_devices[dev_id];
+   memset(stats, 0, sizeof(*stats));
+
+   RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_get, -ENOTSUP);
+   (*dev->dev_ops->stats_get)(dev, stats);
+   return 0;
+}
+
+void __rte_experimental
+rte_compressdev_stats_reset(uint8_t dev_id)
+{
+   struct rte_compressdev *dev;
+
+   if (!rte_compressdev_is_valid_dev(dev_id)) {
+   COMPRESSDEV_LOG(ERR, "Invalid dev_id=%" PRIu8, dev_id);
+   return;
+   }
+
+   dev = &rte_comp_devices[dev_id];
+
+   RTE_FUNC_PTR_OR_RET(*dev->dev_ops->stats_reset);
+   (*dev->dev_ops->stats_reset)(dev);
+}
+
+
 void __rte_experimental
 rte_compressdev_info_get(uint8_t dev_id, struct rte_compressdev_info *dev_info)
 {
diff --git a/lib/librte_compressdev/rte_compressdev.h 
b/lib/librte_compressdev/rte_compressdev.h
index 02a6c531c..f8aab528c 100644
--- a/lib/librte_compressdev/rte_compressdev.h
+++ b/lib/librte_compressdev/rte_compressdev.h
@@ -122,6 +122,19 @@ struct rte_compressdev_info {
 */
 };
 
+/** comp device statistics */
+struct rte_compressdev_stats {
+   uint64_t enqueued_count;
+   /**< Count of all operations enqueued */
+   uint64_t dequeued_count;
+   /**< Count of all operations dequeued */
+
+   uint64_t enqueue_err_count;
+   /**< Total error count on operations enqueued */
+   uint64_t dequeue_err_count;
+   /**< Total error count on operations dequeued */
+};
+
 /**
  * Get the compress device name given a device identifier.
  *
@@ -306,6 +319,32 @@ rte_compressdev_queue_pair_setup(uint8_t dev_id, uint16_t 
queue_pair_id,
 uint16_t __rte_experimental
 rte_compressdev_queue_pair_count(uint8_t dev_id);
 
+
+/**
+ * Retrieve the general I/O statistics of a device.
+ *
+ * @param dev_id
+ *   The identifier of the device
+ * @param stats
+ *   A pointer to a structure of type
+ *   *rte_compressdev_stats* to be filled with the
+ *   values of device counters
+ * @return
+ *   - Zero if successful.
+ *   - Non-zero otherwise.
+ */
+int __rte_experimental
+rte_compressdev_stats_get(uint8_t dev_id, struct rte_compressdev_stats *stats);
+
+/**
+ * Reset the general I/O statistics of a device.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ */
+void __rte_experimental
+rte_compressdev_stats_reset(uint8_t dev_id);
+
 /**
  * Retrieve the contextual information of a device.
  *
diff --git a/lib/librte_compressdev/rte_compressdev_pmd.h 
b/lib/librte_compressdev/rte_compressdev_pmd.h
index 59bcb688c..e08c69e36 100644
--- a/lib/librte_compressdev/rte_compressdev_pmd.h
+++ b/lib/librte_compressdev/rte_compressdev_pmd.h
@@ -140,6 +140,27 @@ typedef void (*compressdev_stop_t)(struct rte_compressdev 
*dev);
 typedef int (*compressdev_close_t)(struct rte_compressdev *dev);
 
 
+/**
+ * Function used to get statistics of a device.
+ *
+ * @param dev
+ *   Compress device
+ * @param stats
+ *   Compress device stats to populate
+ */
+typedef void (*compressdev_stats_get_t)(struct rte_compressdev *dev,
+   struct rte_compressdev_stats *stats);
+
+
+/**
+ * Function used to reset statistics of a device.
+ *
+ * @param dev
+ *   Compress device
+ */
+typedef void (*compressdev_stats_reset_t)(struct rte_compressdev *dev);
+
+
 /**
  * Function used to get specific information of a device.
  *
@@ -271,6 +292,11 @@ struct rte_compressdev_ops {
 
compressdev_info_get_t dev_infos_get;   /**< Get device info. */
 
+   compressdev_stats_get_t stats_get;
+   /**< Get device statistics. */
+   compressdev_stats_reset_t stats_reset;
+   /**< Reset device statistics. */
+
compressdev_queue_pair_setup_t queue_pair_set

Re: [dpdk-dev] [PATCH] net/mlx5: fix link status initialization

2018-04-08 Thread Shahaf Shuler
Thursday, April 5, 2018 9:51 AM, Nélio Laranjeiro:
> Subject: Re: [PATCH] net/mlx5: fix link status initialization
> 
> On Thu, Apr 05, 2018 at 05:35:57AM +, Shahaf Shuler wrote:
> > Wednesday, April 4, 2018 3:11 PM, Nélio Laranjeiro:
> > > Subject: Re: [PATCH] net/mlx5: fix link status initialization
> > >
> > > On Wed, Apr 04, 2018 at 09:58:33AM +, Shahaf Shuler wrote:
> > > > Wednesday, April 4, 2018 10:30 AM, Nélio Laranjeiro:
> > > > > Subject: Re: [PATCH] net/mlx5: fix link status initialization
> > > > >
> > > > > On Tue, Apr 03, 2018 at 07:48:17AM +0300, Shahaf Shuler wrote:

[..]

> > >
> > > According to your analysis, this is only necessary when the LCS is
> > > configured in the device.  Why not adding this call to
> > > mlx5_dev_interrupt_handler_install() which is responsible to install
> > > the LCS callback.
> >
> > I think it is good practice whether or not LSC is set.
> > The link status should be initialized to the correct value after the probe.
> 
> There is no guarantee the link will be accurate: at probe time the link may be
> up, so with this patch the internal status is up with a speed.
> The application probes a second port, in the mean time the link of the first
> port goes down, the interrupt is still not installed and the internal status
> becomes wrong (still up whereas the port is down).
> 
> Finally at start, the device installs the handler, but the link is still down
> whereas internally it is up; the application will then call
> rte_eth_link_get_nowait(), which will directly copy the wrong internal status
> to the application.

This is not correct.
Using Verbs, the async_fd on which link status interrupts are reported is 
created at probe time. 
Even if the interrupt handler is not installed, interrupts still trigger on 
this fd. They will be processed when the interrupt handler is installed as 
part of the port start. 
So in fact you have the whole trace of link status changes waiting to be 
processed upon port start. 

Please try and see. 

> 
> There is also another situation: when the application stops the port, the
> interrupt handler is also removed, which means that during this time the
> internal status may be wrong, as it won't be updated anymore.
> This leads to the same possible situation as above.

Same comment. 

> 
> > > Another point: the wait-to-complete flag is useless. If the link is
> > > up, the status and speed will be accurate; if not, it will receive an
> > > LSC event later.
> >
> > Agree.
> >
> > So how about keeping the code in the current place, just removing the
> > wait_to_complete?
> 
> The current place does not fix the issue, as there is still a possibility
> of having a wrong value.
> 
> Regards,
> 
> --
> Nélio Laranjeiro
> 6WIND


[dpdk-dev] [PATCH] [pktgen] [PATCH] Ignore the enable range cmd when sending packets

2018-04-08 Thread Bing Zhao
When using "enable [ports] range" command to enable the range sending feature, 
the code will only set the "SEND_RANGE_PKTS" without any 
"CLEAR_FAST_ALLOC_FLAG" (belongs to "start" command).
If the enable actiong is done when no packets are sending, everything will be 
OK. But when sending packets, it will automaticlly switch the bufferpool from 
tx_mp to range_mp without "pktgen_setup_packets" called to set up and fill some 
fileds of the buffers. I assume that the sending process will fail and the 
buffers will get leaked. (Correct me if anything wrong). And the alloc function 
will return -105 which indicates no free buffer. Then the only thing we can do 
is to quit the process and restart it.
To avoid such wrong operation procedures, a check is added in the enable_range 
function.

Thanks

Signed-off-by: Bing Zhao 
---
 app/pktgen-cmds.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/app/pktgen-cmds.c b/app/pktgen-cmds.c
index 66ea98a..cc3a2ec 100644
--- a/app/pktgen-cmds.c
+++ b/app/pktgen-cmds.c
@@ -2591,6 +2591,10 @@ void
 enable_range(port_info_t *info, uint32_t state)
 {
if (state == ENABLE_STATE) {
+		if (SENDING_PACKETS & rte_atomic32_read(&info->port_flags)) {
+			pktgen_log_warning("Cannot enable the range settings while sending packets!");
+			return;
+		}
pktgen_clr_port_flags(info, SEND_SEQ_PKTS);
pktgen_clr_port_flags(info, SEND_PCAP_PKTS);
pktgen_set_port_flags(info, SEND_RANGE_PKTS);
-- 
2.11.0.windows.1




[dpdk-dev] [PATCH v2 0/5] Initial compressdev unit tests

2018-04-08 Thread Pablo de Lara
Added initial tests for Compressdev library.
The tests are performed by compressing a test buffer (or multiple test buffers)
with compressdev or Zlib, and decompressing it/them with the other library
(if compression is done with compressdev, decompression is done with Zlib,
and vice versa).

Tests added so far are based on the deflate algorithm,
including:
- Fixed huffman on single buffer
- Dynamic huffman on single buffer
- Multi compression level test on single buffer
- Multi buffer
- Multi xform using the same buffer

Due to a dependency on Zlib, the test is not enabled by default.
Once the library is installed, the configuration option 
CONFIG_RTE_COMPRESSDEV_TEST must be set to Y.
However, if building with Meson, the test will be built automatically,
if Zlib is installed.

The test requires a compressdev PMD to be initialized when running the test 
app. For example:

./build/app/test --vdev="compress_X"

RTE>>compressdev_autotest

This patchset depends on the Compressdev API patchset:
http://dpdk.org/ml/archives/dev/2018-April/096028.html
("[PATCH v4 00/13] Implement compression API")

Changes in v2:
- Add meson build
- Add invalid configuration tests
- Use new Compressdev API:
  * Substitute session with priv xform
  * Check if priv xform is shareable and create one per operation if not


Pablo de Lara (5):
  test/compress: add initial unit tests
  test/compress: add multi op test
  test/compress: add multi level test
  test/compress: add multi xform test
  test/compress: add invalid configuration tests

 config/common_base   |5 +
 test/test/Makefile   |9 +
 test/test/meson.build|8 +
 test/test/test_compressdev.c | 1094 ++
 test/test/test_compressdev_test_buffer.h |  295 
 5 files changed, 1411 insertions(+)
 create mode 100644 test/test/test_compressdev.c
 create mode 100644 test/test/test_compressdev_test_buffer.h

-- 
2.14.3



[dpdk-dev] [PATCH v2 1/5] test/compress: add initial unit tests

2018-04-08 Thread Pablo de Lara
This commit introduces the initial tests for compressdev,
performing basic compression and decompression operations
of sample test buffers, using the Zlib library in one direction
and compressdev in the other direction, to make sure that
the library is compatible with Zlib.

Due to the use of the Zlib API, the test is disabled by default,
to avoid adding a new dependency to DPDK.

Signed-off-by: Pablo de Lara 
Signed-off-by: Ashish Gupta 
Signed-off-by: Shally Verma 
---
 config/common_base   |   5 +
 test/test/Makefile   |   9 +
 test/test/meson.build|   8 +
 test/test/test_compressdev.c | 727 +++
 test/test/test_compressdev_test_buffer.h | 295 +
 5 files changed, 1044 insertions(+)
 create mode 100644 test/test/test_compressdev.c
 create mode 100644 test/test/test_compressdev_test_buffer.h

diff --git a/config/common_base b/config/common_base
index f40354487..004b5e5d1 100644
--- a/config/common_base
+++ b/config/common_base
@@ -549,6 +549,11 @@ CONFIG_RTE_LIBRTE_SECURITY=y
 CONFIG_RTE_LIBRTE_COMPRESSDEV=y
 CONFIG_RTE_COMPRESS_MAX_DEVS=64
 
+#
+# Compile compressdev unit test
+#
+CONFIG_RTE_COMPRESSDEV_TEST=n
+
 #
 # Compile generic event device library
 #
diff --git a/test/test/Makefile b/test/test/Makefile
index a88cc38bf..0faa03bad 100644
--- a/test/test/Makefile
+++ b/test/test/Makefile
@@ -181,6 +181,10 @@ SRCS-$(CONFIG_RTE_LIBRTE_PMD_RING) += test_pmd_ring_perf.c
 SRCS-$(CONFIG_RTE_LIBRTE_CRYPTODEV) += test_cryptodev_blockcipher.c
 SRCS-$(CONFIG_RTE_LIBRTE_CRYPTODEV) += test_cryptodev.c
 
+ifeq ($(CONFIG_RTE_COMPRESSDEV_TEST),y)
+SRCS-$(CONFIG_RTE_LIBRTE_COMPRESSDEV) += test_compressdev.c
+endif
+
 ifeq ($(CONFIG_RTE_LIBRTE_EVENTDEV),y)
 SRCS-y += test_eventdev.c
 SRCS-y += test_event_ring.c
@@ -201,6 +205,11 @@ CFLAGS += $(WERROR_FLAGS)
 CFLAGS += -D_GNU_SOURCE
 
 LDLIBS += -lm
+ifeq ($(CONFIG_RTE_COMPRESSDEV_TEST),y)
+ifeq ($(CONFIG_RTE_LIBRTE_COMPRESSDEV),y)
+LDLIBS += -lz
+endif
+endif
 
 # Disable VTA for memcpy test
 ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
diff --git a/test/test/meson.build b/test/test/meson.build
index eb3d87a4d..69d3fa699 100644
--- a/test/test/meson.build
+++ b/test/test/meson.build
@@ -235,6 +235,14 @@ if dpdk_conf.has('RTE_LIBRTE_KNI')
 endif
 
 test_dep_objs = []
+compress_test_dep = dependency('zlib', required: false)
+if compress_test_dep.found()
+   test_dep_objs += compress_test_dep
+   test_sources += 'test_compressdev.c'
+   test_deps += 'compressdev'
+   test_names += 'compressdev_autotest'
+endif
+
 foreach d:test_deps
def_lib = get_option('default_library')
test_dep_objs += get_variable(def_lib + '_rte_' + d)
diff --git a/test/test/test_compressdev.c b/test/test/test_compressdev.c
new file mode 100644
index 0..02fc6c3fa
--- /dev/null
+++ b/test/test/test_compressdev.c
@@ -0,0 +1,727 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "test_compressdev_test_buffer.h"
+#include "test.h"
+
+#define DEFAULT_WINDOW_SIZE 15
+#define DEFAULT_MEM_LEVEL 8
+#define MAX_DEQD_RETRIES 10
+#define DEQUEUE_WAIT_TIME 1
+
+/*
+ * 30% extra size for compressed data compared to original data,
+ * in case data size cannot be reduced and it is actually bigger
+ * due to the compress block headers
+ */
+#define COMPRESS_BUF_SIZE_RATIO 1.3
+#define NUM_MBUFS 16
+#define NUM_OPS 16
+#define NUM_MAX_XFORMS 1
+#define NUM_MAX_INFLIGHT_OPS 128
+#define CACHE_SIZE 0
+
+#define DIV_CEIL(a, b)  (((a) % (b)) ? (((a) / (b)) + 1) : ((a) / (b)))
+
+const char *
+huffman_type_strings[] = {
+   [RTE_COMP_HUFFMAN_DEFAULT]  = "PMD default",
+   [RTE_COMP_HUFFMAN_FIXED]= "Fixed",
+   [RTE_COMP_HUFFMAN_DYNAMIC]  = "Dynamic"
+};
+
+enum zlib_direction {
+   ZLIB_NONE,
+   ZLIB_COMPRESS,
+   ZLIB_DECOMPRESS,
+   ZLIB_ALL
+};
+
+struct comp_testsuite_params {
+   struct rte_mempool *mbuf_pool;
+   struct rte_mempool *op_pool;
+   struct rte_comp_xform def_comp_xform;
+   struct rte_comp_xform def_decomp_xform;
+};
+
+static struct comp_testsuite_params testsuite_params = { 0 };
+
+static void
+testsuite_teardown(void)
+{
+   struct comp_testsuite_params *ts_params = &testsuite_params;
+
+   rte_mempool_free(ts_params->mbuf_pool);
+   rte_mempool_free(ts_params->op_pool);
+}
+
+static int
+testsuite_setup(void)
+{
+   struct comp_testsuite_params *ts_params = &testsuite_params;
+   unsigned int i;
+
+   if (rte_compressdev_count() == 0) {
+   RTE_LOG(ERR, USER1, "Need at least one compress device\n");
+   return -EINVAL;
+   }
+
+   uint32_t max_buf_size = 0;
+   for (i = 0; i < RTE_DIM(compress_test_bufs); i++)
+   max_buf_size = RTE_MAX(max_buf_size,
			strlen(compress_test_bufs[i]));

[dpdk-dev] [PATCH v2 2/5] test/compress: add multi op test

2018-04-08 Thread Pablo de Lara
Add a test that checks whether multiple operations with
different buffers can be handled in a single enqueue call.

Signed-off-by: Pablo de Lara 
---
 test/test/test_compressdev.c | 476 +--
 1 file changed, 319 insertions(+), 157 deletions(-)
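
For reference, the stateless burst flow this test exercises looks roughly
like the following sketch (device id 0 and queue pair 0 assumed; ops[] and
ops_processed[] as in the patch, error handling omitted):

	uint16_t num_enqd, num_deqd, num_total_deqd = 0;

	/* submit all operations in one enqueue call */
	num_enqd = rte_compressdev_enqueue_burst(0, 0, ops, num_bufs);

	/* poll until everything that was enqueued has been dequeued */
	while (num_total_deqd < num_enqd) {
		num_deqd = rte_compressdev_dequeue_burst(0, 0,
				&ops_processed[num_total_deqd],
				num_bufs - num_total_deqd);
		num_total_deqd += num_deqd;
	}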

diff --git a/test/test/test_compressdev.c b/test/test/test_compressdev.c
index 02fc6c3fa..a264c4da4 100644
--- a/test/test/test_compressdev.c
+++ b/test/test/test_compressdev.c
@@ -47,6 +47,10 @@ enum zlib_direction {
ZLIB_ALL
 };
 
+struct priv_op_data {
+   uint16_t orig_idx;
+};
+
 struct comp_testsuite_params {
struct rte_mempool *mbuf_pool;
struct rte_mempool *op_pool;
@@ -99,7 +103,8 @@ testsuite_setup(void)
}
 
ts_params->op_pool = rte_comp_op_pool_create("op_pool", NUM_OPS,
-   0, 0, rte_socket_id());
+   0, sizeof(struct priv_op_data),
+   rte_socket_id());
if (ts_params->op_pool == NULL) {
RTE_LOG(ERR, USER1, "Operation pool could not be created\n");
goto exit;
@@ -339,7 +344,9 @@ decompress_zlib(struct rte_comp_op *op,
  * Compresses and decompresses buffer with compressdev API and Zlib API
  */
 static int
-test_deflate_comp_decomp(const char *test_buffer,
+test_deflate_comp_decomp(const char * const test_bufs[],
+   unsigned int num_bufs,
+   uint16_t buf_idx[],
struct rte_comp_xform *compress_xform,
struct rte_comp_xform *decompress_xform,
enum rte_comp_op_type state,
@@ -348,95 +355,146 @@ test_deflate_comp_decomp(const char *test_buffer,
struct comp_testsuite_params *ts_params = &testsuite_params;
int ret_status = -1;
int ret;
-   struct rte_mbuf *comp_buf = NULL;
-   struct rte_mbuf *uncomp_buf = NULL;
-   struct rte_comp_op *op = NULL;
-   struct rte_comp_op *op_processed = NULL;
-   void *priv_xform = NULL;
-   uint16_t num_deqd;
+   struct rte_mbuf *uncomp_bufs[num_bufs];
+   struct rte_mbuf *comp_bufs[num_bufs];
+   struct rte_comp_op *ops[num_bufs];
+   struct rte_comp_op *ops_processed[num_bufs];
+   void *priv_xforms[num_bufs];
+   uint16_t num_enqd, num_deqd, num_total_deqd;
+   uint16_t num_priv_xforms = 0;
unsigned int deqd_retries = 0;
+   struct priv_op_data *priv_data;
char *data_ptr;
-
-   /* Prepare the source mbuf with the data */
-   uncomp_buf = rte_pktmbuf_alloc(ts_params->mbuf_pool);
-   if (uncomp_buf == NULL) {
+   unsigned int i;
+   const struct rte_compressdev_capabilities *capa =
+   rte_compressdev_capability_get(0, RTE_COMP_ALGO_DEFLATE);
+
+   /* Initialize all arrays to NULL */
+   memset(uncomp_bufs, 0, sizeof(struct rte_mbuf *) * num_bufs);
+   memset(comp_bufs, 0, sizeof(struct rte_mbuf *) * num_bufs);
+   memset(ops, 0, sizeof(struct rte_comp_op *) * num_bufs);
+   memset(ops_processed, 0, sizeof(struct rte_comp_op *) * num_bufs);
+   memset(priv_xforms, 0, sizeof(void *) * num_bufs);
+
+   /* Prepare the source mbufs with the data */
+	ret = rte_pktmbuf_alloc_bulk(ts_params->mbuf_pool, uncomp_bufs, num_bufs);
+   if (ret < 0) {
RTE_LOG(ERR, USER1,
-   "Source mbuf could not be allocated "
+   "Source mbufs could not be allocated "
"from the mempool\n");
goto exit;
}
 
-   data_ptr = rte_pktmbuf_append(uncomp_buf, strlen(test_buffer) + 1);
-   snprintf(data_ptr, strlen(test_buffer) + 1, "%s", test_buffer);
+   for (i = 0; i < num_bufs; i++) {
+   data_ptr = rte_pktmbuf_append(uncomp_bufs[i],
+   strlen(test_bufs[i]) + 1);
+   snprintf(data_ptr, strlen(test_bufs[i]) + 1, "%s",
+   test_bufs[i]);
+   }
 
-   /* Prepare the destination mbuf */
-   comp_buf = rte_pktmbuf_alloc(ts_params->mbuf_pool);
-   if (comp_buf == NULL) {
+   /* Prepare the destination mbufs */
+   ret = rte_pktmbuf_alloc_bulk(ts_params->mbuf_pool, comp_bufs, num_bufs);
+   if (ret < 0) {
RTE_LOG(ERR, USER1,
-   "Destination mbuf could not be allocated "
+   "Destination mbufs could not be allocated "
"from the mempool\n");
goto exit;
}
 
-   rte_pktmbuf_append(comp_buf,
-   strlen(test_buffer) * COMPRESS_BUF_SIZE_RATIO);
+   for (i = 0; i < num_bufs; i++)
+   rte_pktmbuf_append(comp_bufs[i],
+   strlen(test_bufs[i]) * COMPRESS_BUF_SIZE_RATIO);
 
/* Build the compression operations */
-   op = rte_comp_op_alloc(ts_params->op_pool);
-   if (op == NULL) {
+	ret = rte_comp_op_bulk_alloc(ts_params->op_pool, ops, num_bufs);

[dpdk-dev] [PATCH v2 4/5] test/compress: add multi xform test

2018-04-08 Thread Pablo de Lara
Add a test that checks whether multiple xforms can be
handled in a single enqueue call.

Signed-off-by: Pablo de Lara 
---
 test/test/test_compressdev.c | 261 ---
 1 file changed, 193 insertions(+), 68 deletions(-)

diff --git a/test/test/test_compressdev.c b/test/test/test_compressdev.c
index 10f205ac9..0cffd85ba 100644
--- a/test/test/test_compressdev.c
+++ b/test/test/test_compressdev.c
@@ -27,7 +27,7 @@
 #define COMPRESS_BUF_SIZE_RATIO 1.3
 #define NUM_MBUFS 16
 #define NUM_OPS 16
-#define NUM_MAX_XFORMS 1
+#define NUM_MAX_XFORMS 16
 #define NUM_MAX_INFLIGHT_OPS 128
 #define CACHE_SIZE 0
 
@@ -54,8 +54,8 @@ struct priv_op_data {
 struct comp_testsuite_params {
struct rte_mempool *mbuf_pool;
struct rte_mempool *op_pool;
-   struct rte_comp_xform def_comp_xform;
-   struct rte_comp_xform def_decomp_xform;
+   struct rte_comp_xform *def_comp_xform;
+   struct rte_comp_xform *def_decomp_xform;
 };
 
 static struct comp_testsuite_params testsuite_params = { 0 };
@@ -67,6 +67,8 @@ testsuite_teardown(void)
 
rte_mempool_free(ts_params->mbuf_pool);
rte_mempool_free(ts_params->op_pool);
+   rte_free(ts_params->def_comp_xform);
+   rte_free(ts_params->def_decomp_xform);
 }
 
 static int
@@ -110,21 +112,26 @@ testsuite_setup(void)
goto exit;
}
 
+   ts_params->def_comp_xform =
+   rte_malloc(NULL, sizeof(struct rte_comp_xform), 0);
+   ts_params->def_decomp_xform =
+   rte_malloc(NULL, sizeof(struct rte_comp_xform), 0);
+
/* Initializes default values for compress/decompress xforms */
-   ts_params->def_comp_xform.next = NULL;
-   ts_params->def_comp_xform.type = RTE_COMP_COMPRESS;
-   ts_params->def_comp_xform.compress.algo = RTE_COMP_ALGO_DEFLATE,
-   ts_params->def_comp_xform.compress.deflate.huffman =
+   ts_params->def_comp_xform->next = NULL;
+   ts_params->def_comp_xform->type = RTE_COMP_COMPRESS;
+   ts_params->def_comp_xform->compress.algo = RTE_COMP_ALGO_DEFLATE,
+   ts_params->def_comp_xform->compress.deflate.huffman =
RTE_COMP_HUFFMAN_DEFAULT;
-   ts_params->def_comp_xform.compress.level = RTE_COMP_LEVEL_PMD_DEFAULT;
-   ts_params->def_comp_xform.compress.chksum = RTE_COMP_CHECKSUM_NONE;
-   ts_params->def_comp_xform.compress.window_size = DEFAULT_WINDOW_SIZE;
+   ts_params->def_comp_xform->compress.level = RTE_COMP_LEVEL_PMD_DEFAULT;
+   ts_params->def_comp_xform->compress.chksum = RTE_COMP_CHECKSUM_NONE;
+   ts_params->def_comp_xform->compress.window_size = DEFAULT_WINDOW_SIZE;
 
-   ts_params->def_decomp_xform.next = NULL;
-   ts_params->def_decomp_xform.type = RTE_COMP_DECOMPRESS;
-   ts_params->def_decomp_xform.decompress.algo = RTE_COMP_ALGO_DEFLATE,
-   ts_params->def_decomp_xform.decompress.chksum = RTE_COMP_CHECKSUM_NONE;
-	ts_params->def_decomp_xform.decompress.window_size = DEFAULT_WINDOW_SIZE;
+   ts_params->def_decomp_xform->next = NULL;
+   ts_params->def_decomp_xform->type = RTE_COMP_DECOMPRESS;
+   ts_params->def_decomp_xform->decompress.algo = RTE_COMP_ALGO_DEFLATE,
+   ts_params->def_decomp_xform->decompress.chksum = RTE_COMP_CHECKSUM_NONE;
+	ts_params->def_decomp_xform->decompress.window_size = DEFAULT_WINDOW_SIZE;
 
return TEST_SUCCESS;
 
@@ -347,8 +354,9 @@ static int
 test_deflate_comp_decomp(const char * const test_bufs[],
unsigned int num_bufs,
uint16_t buf_idx[],
-   struct rte_comp_xform *compress_xform,
-   struct rte_comp_xform *decompress_xform,
+   struct rte_comp_xform *compress_xforms[],
+   struct rte_comp_xform *decompress_xforms[],
+   unsigned int num_xforms,
enum rte_comp_op_type state,
enum zlib_direction zlib_dir)
 {
@@ -443,8 +451,9 @@ test_deflate_comp_decomp(const char * const test_bufs[],
/* Compress data (either with Zlib API or compressdev API */
if (zlib_dir == ZLIB_COMPRESS || zlib_dir == ZLIB_ALL) {
for (i = 0; i < num_bufs; i++) {
-   ret = compress_zlib(ops[i],
-   (const struct rte_comp_xform *)compress_xform,
+   const struct rte_comp_xform *compress_xform =
+   compress_xforms[i % num_xforms];
+   ret = compress_zlib(ops[i], compress_xform,
DEFAULT_MEM_LEVEL);
if (ret < 0)
goto exit;
@@ -452,11 +461,11 @@ test_deflate_comp_decomp(const char * const test_bufs[],
ops_processed[i] = ops[i];
}
} else {
-	if (capa->comp_feature_flags & RTE_COMP_FF_SHAREABLE_PRIV_XFORM) {
-   /* Create single

[dpdk-dev] [PATCH v2 3/5] test/compress: add multi level test

2018-04-08 Thread Pablo de Lara
Add a test that checks that all compression levels
are supported and that a buffer is compressed correctly at each level.

Signed-off-by: Pablo de Lara 
---
 test/test/test_compressdev.c | 33 +
 1 file changed, 33 insertions(+)

diff --git a/test/test/test_compressdev.c b/test/test/test_compressdev.c
index a264c4da4..10f205ac9 100644
--- a/test/test/test_compressdev.c
+++ b/test/test/test_compressdev.c
@@ -865,6 +865,37 @@ test_compressdev_deflate_stateless_multi_op(void)
return TEST_SUCCESS;
 }
 
+
+static int
+test_compressdev_deflate_stateless_multi_level(void)
+{
+   struct comp_testsuite_params *ts_params = &testsuite_params;
+   const char *test_buffer;
+   unsigned int level;
+   uint16_t i;
+   struct rte_comp_xform compress_xform;
+
+   memcpy(&compress_xform, &ts_params->def_comp_xform,
+   sizeof(struct rte_comp_xform));
+
+   for (i = 0; i < RTE_DIM(compress_test_bufs); i++) {
+   test_buffer = compress_test_bufs[i];
+   for (level = RTE_COMP_LEVEL_MIN; level <= RTE_COMP_LEVEL_MAX;
+   level++) {
+   compress_xform.compress.level = level;
+   /* Compress with compressdev, decompress with Zlib */
+   if (test_deflate_comp_decomp(&test_buffer, 1,
+   &i,
+   &compress_xform,
+   &ts_params->def_decomp_xform,
+   RTE_COMP_OP_STATELESS,
+   ZLIB_DECOMPRESS) < 0)
+   return TEST_FAILED;
+   }
+   }
+
+   return TEST_SUCCESS;
+}
 static struct unit_test_suite compressdev_testsuite  = {
.suite_name = "compressdev unit test suite",
.setup = testsuite_setup,
@@ -876,6 +907,8 @@ static struct unit_test_suite compressdev_testsuite  = {
test_compressdev_deflate_stateless_dynamic),
TEST_CASE_ST(generic_ut_setup, generic_ut_teardown,
test_compressdev_deflate_stateless_multi_op),
+   TEST_CASE_ST(generic_ut_setup, generic_ut_teardown,
+   test_compressdev_deflate_stateless_multi_level),
TEST_CASES_END() /**< NULL terminate unit test array */
}
 };
-- 
2.14.3



[dpdk-dev] [PATCH v2 5/5] test/compress: add invalid configuration tests

2018-04-08 Thread Pablo de Lara
Add tests that check that device configuration
fails when invalid parameters are provided.

Signed-off-by: Pablo de Lara 
---
 test/test/test_compressdev.c | 49 +++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/test/test/test_compressdev.c b/test/test/test_compressdev.c
index 0cffd85ba..1dbd8779d 100644
--- a/test/test/test_compressdev.c
+++ b/test/test/test_compressdev.c
@@ -177,6 +177,51 @@ generic_ut_teardown(void)
rte_compressdev_stop(0);
 }
 
+static int
+test_compressdev_invalid_configuration(void)
+{
+   struct rte_compressdev_config invalid_config;
+   struct rte_compressdev_config valid_config = {
+   .socket_id = rte_socket_id(),
+   .nb_queue_pairs = 1,
+   .max_nb_priv_xforms = NUM_MAX_XFORMS,
+   .max_nb_streams = 0
+   };
+   struct rte_compressdev_info dev_info;
+
+   /* Invalid configuration with 0 queue pairs */
+   memcpy(&invalid_config, &valid_config,
+   sizeof(struct rte_compressdev_config));
+   invalid_config.nb_queue_pairs = 0;
+
+   TEST_ASSERT_FAIL(rte_compressdev_configure(0, &invalid_config),
+   "Device configuration was successful "
+   "with no queue pairs (invalid)\n");
+
+   /*
+* Invalid configuration with too many queue pairs
+* (if there is an actual maximum number of queue pairs)
+*/
+   rte_compressdev_info_get(0, &dev_info);
+   if (dev_info.max_nb_queue_pairs != 0) {
+   memcpy(&invalid_config, &valid_config,
+   sizeof(struct rte_compressdev_config));
+   invalid_config.nb_queue_pairs = dev_info.max_nb_queue_pairs + 1;
+
+   TEST_ASSERT_FAIL(rte_compressdev_configure(0, &invalid_config),
+   "Device configuration was successful "
+   "with too many queue pairs (invalid)\n");
+   }
+
+   /* Invalid queue pair setup, with no number of queue pairs set */
+   TEST_ASSERT_FAIL(rte_compressdev_queue_pair_setup(0, 0,
+   NUM_MAX_INFLIGHT_OPS, rte_socket_id()),
+   "Queue pair setup was successful "
+   "with no queue pairs set (invalid)\n");
+
+   return TEST_SUCCESS;
+}
+
 static int
 compare_buffers(const char *buffer1, uint32_t buffer1_len,
const char *buffer2, uint32_t buffer2_len)
@@ -692,7 +737,7 @@ test_deflate_comp_decomp(const char * const test_bufs[],
 
/* Attach non shareable private xform data to ops */
for (i = 0; i < num_bufs; i++) {
-			priv_data = (struct priv_op_data *) (ops[i] + 1);
+   priv_data = (struct priv_op_data *)(ops[i] + 1);
uint16_t xform_idx = priv_data->orig_idx;
ops[i]->private_xform = priv_xforms[xform_idx];
}
@@ -1024,6 +1069,8 @@ static struct unit_test_suite compressdev_testsuite  = {
.setup = testsuite_setup,
.teardown = testsuite_teardown,
.unit_test_cases = {
+   TEST_CASE_ST(NULL, NULL,
+   test_compressdev_invalid_configuration),
TEST_CASE_ST(generic_ut_setup, generic_ut_teardown,
test_compressdev_deflate_stateless_fixed),
TEST_CASE_ST(generic_ut_setup, generic_ut_teardown,
-- 
2.14.3



Re: [dpdk-dev] [PATCH] net/ixgbe: update data->eth_link status on start

2018-04-08 Thread Zhang, Helin


> -Original Message-
> From: dev [mailto:dev-boun...@dpdk.org] On Behalf Of Zhang, Qi Z
> Sent: Sunday, April 8, 2018 10:05 AM
> To: Chas Williams; dev@dpdk.org
> Cc: Lu, Wenzhuo; Ananyev, Konstantin; Charles (Chas) Williams
> Subject: Re: [dpdk-dev] [PATCH] net/ixgbe: update data->eth_link status on
> start
> 
> 
> 
> > -Original Message-
> > From: dev [mailto:dev-boun...@dpdk.org] On Behalf Of Chas Williams
> > Sent: Wednesday, February 14, 2018 6:56 AM
> > To: dev@dpdk.org
> > Cc: Lu, Wenzhuo ; Ananyev, Konstantin
> > ; Charles (Chas) Williams
> > 
> > Subject: [dpdk-dev] [PATCH] net/ixgbe: update data->eth_link status on
> > start
> >
> > From: "Charles (Chas) Williams" 
> >
> > dev->data->eth_link isn't updated until the first interrupt.  If a
> > link is carrier down, then this interrupt may never happen.  Before we
> > finished starting the PMD, call ixgbe_dev_link_update() to ensure we
> > have a valid status.
> >
> > Signed-off-by: Chas Williams 
> 
> Acked-by: Qi Zhang 
Applied to dpdk-next-net-intel, thanks!

/Helin



Re: [dpdk-dev] [PATCH] bus/fslmc: support for hotplugging of memory

2018-04-08 Thread Burakov, Anatoly

On 05-Apr-18 3:14 PM, Shreyansh Jain wrote:

Restructure VFIO DMA code for handling hotplug memory events
(callbacks) and the --legacy case.

Signed-off-by: Shreyansh Jain 
---

###
This is based on the 16fbfef04a3 github repository, and assumes
that the changes already exist as done in patch 26/68.
However, this can also be a standalone patch, replacing 26/88, though the
Makefile changes don't exist in this version.
Also, this is just a first draft; I will push any review changes after this
incrementally over v4.
###


Hi Shreyansh,

I think we can keep the 26/68 as it still works within the context of 
the patchset. I would like to add these changes closer to the end, where 
we enable support for callbacks in VFIO (this could/should come as the 
next patch).


That said, I took some liberties when integrating this patch, hopefully 
for the better. I know you mentioned it's a draft, so you can post any 
comments for the inevitable v4 :)




  drivers/bus/fslmc/fslmc_bus.c|  15 
  drivers/bus/fslmc/fslmc_vfio.c   | 161 +++
  drivers/bus/fslmc/fslmc_vfio.h   |   1 +
  drivers/net/dpaa2/dpaa2_ethdev.c |   1 -
  4 files changed, 163 insertions(+), 15 deletions(-)

diff --git a/drivers/bus/fslmc/fslmc_bus.c b/drivers/bus/fslmc/fslmc_bus.c
index 5ee0beb85..50884ff3a 100644
--- a/drivers/bus/fslmc/fslmc_bus.c
+++ b/drivers/bus/fslmc/fslmc_bus.c
@@ -266,6 +266,21 @@ rte_fslmc_probe(void)
return 0;
}
  
+	if (rte_log_get_global_level() >= RTE_LOG_DEBUG)
+		rte_dump_physmem_layout(stdout);


Presumably, this is not needed - just debug leftovers?


+
+	/* Map existing segments and, in case of hotpluggable memory,
+	 * install a callback handler.
+	 */
+   ret = rte_fslmc_vfio_dmamap();
+   if (ret) {
+   FSLMC_BUS_LOG(ERR, "Unable to DMA map existing VAs: (%d)",
+ ret);
+   /* Not continuing ahead */
+   FSLMC_BUS_LOG(ERR, "FSLMC VFIO Mapping failed");
+   return 0;
+   }
+


What happens if there are no devices on the bus that can be used by 
DPDK? As far as I can tell, it would return an error, which may or may not 
be desirable (failing to map anything because there aren't any fslmc 
devices is not an error?).


For "regular" VFIO, the container is an empty shell unless you add 
groups into it - does fslmc VFIO support work differently, and container 
is already working/initialized by the time we reach this point?


Anyway, i'll leave this as is.


ret = fslmc_vfio_process_group();
if (ret) {
FSLMC_BUS_LOG(ERR, "Unable to setup devices %d", ret);
diff --git a/drivers/bus/fslmc/fslmc_vfio.c b/drivers/bus/fslmc/fslmc_vfio.c
index 31831e3ce..60725fb70 100644
--- a/drivers/bus/fslmc/fslmc_vfio.c
+++ b/drivers/bus/fslmc/fslmc_vfio.c
@@ -30,6 +30,7 @@
  #include 
  #include 
  #include 
+#include 


<...>


+}
+
  static int
-fslmc_vfio_map(const struct rte_memseg_list *msl __rte_unused,
-   const struct rte_memseg *ms, void *arg)
+#ifdef RTE_LIBRTE_DPAA2_USE_PHYS_IOVA
+fslmc_map_dma(uint64_t vaddr, rte_iova_t iovaddr, size_t len)
+#else
+fslmc_map_dma(uint64_t vaddr, rte_iova_t iovaddr __rte_unused, size_t len)
+#endif


I think I'll leave this as just "rte_iova_t iovaddr __rte_unused" :)


  {
-   int *n_segs = arg;
struct fslmc_vfio_group *group;
struct vfio_iommu_type1_dma_map dma_map = {
.argsz = sizeof(struct vfio_iommu_type1_dma_map),
@@ -205,10 +263,11 @@ fslmc_vfio_map(const struct rte_memseg_list *msl 
__rte_unused,
};
int ret;
  
-	dma_map.size = ms->len;

-   dma_map.vaddr = ms->addr_64;
+   dma_map.size = len;
+   dma_map.vaddr = vaddr;


<...>

  
  	if (is_dma_done)

return 0;
  


I suspect this check was needed because you've done VFIO mapping on 
device probe as opposed to bus probe, so the VFIO mapping function could 
have been called multiple times. Is that still the case, or is this check 
no longer needed? I took the liberty of removing it.



-   if (rte_memseg_walk(fslmc_vfio_map, &i) < 0)
+   /* Lock before parsing and registering callback to memory subsystem */
+   rte_rwlock_read_lock(mem_lock);
+


<...>


return 0;
diff --git a/drivers/bus/fslmc/fslmc_vfio.h b/drivers/bus/fslmc/fslmc_vfio.h
index e8fb3445f..e77e4c4ac 100644
--- a/drivers/bus/fslmc/fslmc_vfio.h
+++ b/drivers/bus/fslmc/fslmc_vfio.h
@@ -9,6 +9,7 @@
  #define _FSLMC_VFIO_H_
  
  #include 

+#include 
  
  #include "eal_vfio.h"
  


I suspect this change is not needed, so I took the liberty of removing it.

--
Thanks,
Anatoly


[dpdk-dev] [PATCH v4 05/70] test: add command to dump malloc heap contents

2018-04-08 Thread Anatoly Burakov
Signed-off-by: Anatoly Burakov 
---
 test/test/commands.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/test/commands.c b/test/test/commands.c
index cf0b726..6bfdc02 100644
--- a/test/test/commands.c
+++ b/test/test/commands.c
@@ -137,6 +137,8 @@ static void cmd_dump_parsed(void *parsed_result,
rte_log_dump(stdout);
else if (!strcmp(res->dump, "dump_malloc_stats"))
rte_malloc_dump_stats(stdout, NULL);
+   else if (!strcmp(res->dump, "dump_malloc_heaps"))
+   rte_malloc_dump_heaps(stdout);
 }
 
 cmdline_parse_token_string_t cmd_dump_dump =
@@ -147,6 +149,7 @@ cmdline_parse_token_string_t cmd_dump_dump =
 "dump_ring#"
 "dump_mempool#"
 "dump_malloc_stats#"
+"dump_malloc_heaps#"
 "dump_devargs#"
 "dump_log_types");
 
-- 
2.7.4


[dpdk-dev] [PATCH v4 02/70] eal: move all locking to heap

2018-04-08 Thread Anatoly Burakov
Down the line, we will need to do everything from the heap, as any
alloc or free may trigger allocating or freeing OS memory, which would
involve growing or shrinking the heap.

Signed-off-by: Anatoly Burakov 
---
 lib/librte_eal/common/malloc_elem.c | 16 ++--
 lib/librte_eal/common/malloc_heap.c | 38 +
 lib/librte_eal/common/malloc_heap.h |  6 ++
 lib/librte_eal/common/rte_malloc.c  |  4 ++--
 4 files changed, 48 insertions(+), 16 deletions(-)

diff --git a/lib/librte_eal/common/malloc_elem.c 
b/lib/librte_eal/common/malloc_elem.c
index 0cadc8a..ea041e2 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -243,10 +243,6 @@ join_elem(struct malloc_elem *elem1, struct malloc_elem 
*elem2)
 int
 malloc_elem_free(struct malloc_elem *elem)
 {
-   if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
-   return -1;
-
-   rte_spinlock_lock(&(elem->heap->lock));
size_t sz = elem->size - sizeof(*elem) - MALLOC_ELEM_TRAILER_LEN;
uint8_t *ptr = (uint8_t *)&elem[1];
struct malloc_elem *next = RTE_PTR_ADD(elem, elem->size);
@@ -274,8 +270,6 @@ malloc_elem_free(struct malloc_elem *elem)
 
memset(ptr, 0, sz);
 
-   rte_spinlock_unlock(&(elem->heap->lock));
-
return 0;
 }
 
@@ -292,11 +286,10 @@ malloc_elem_resize(struct malloc_elem *elem, size_t size)
return 0;
 
struct malloc_elem *next = RTE_PTR_ADD(elem, elem->size);
-   rte_spinlock_lock(&elem->heap->lock);
if (next ->state != ELEM_FREE)
-   goto err_return;
+   return -1;
if (elem->size + next->size < new_size)
-   goto err_return;
+   return -1;
 
/* we now know the element fits, so remove from free list,
 * join the two
@@ -311,10 +304,5 @@ malloc_elem_resize(struct malloc_elem *elem, size_t size)
split_elem(elem, split_pt);
malloc_elem_free_list_insert(split_pt);
}
-   rte_spinlock_unlock(&elem->heap->lock);
return 0;
-
-err_return:
-   rte_spinlock_unlock(&elem->heap->lock);
-   return -1;
 }
diff --git a/lib/librte_eal/common/malloc_heap.c 
b/lib/librte_eal/common/malloc_heap.c
index 7aafc88..7d8d70a 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -145,6 +145,44 @@ malloc_heap_alloc(struct malloc_heap *heap,
return elem == NULL ? NULL : (void *)(&elem[1]);
 }
 
+int
+malloc_heap_free(struct malloc_elem *elem)
+{
+   struct malloc_heap *heap;
+   int ret;
+
+   if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
+   return -1;
+
+   /* elem may be merged with previous element, so keep heap address */
+   heap = elem->heap;
+
+   rte_spinlock_lock(&(heap->lock));
+
+   ret = malloc_elem_free(elem);
+
+   rte_spinlock_unlock(&(heap->lock));
+
+   return ret;
+}
+
+int
+malloc_heap_resize(struct malloc_elem *elem, size_t size)
+{
+   int ret;
+
+   if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
+   return -1;
+
+   rte_spinlock_lock(&(elem->heap->lock));
+
+   ret = malloc_elem_resize(elem, size);
+
+   rte_spinlock_unlock(&(elem->heap->lock));
+
+   return ret;
+}
+
 /*
  * Function to retrieve data for heap on given socket
  */
diff --git a/lib/librte_eal/common/malloc_heap.h 
b/lib/librte_eal/common/malloc_heap.h
index e0defa7..ab0005c 100644
--- a/lib/librte_eal/common/malloc_heap.h
+++ b/lib/librte_eal/common/malloc_heap.h
@@ -28,6 +28,12 @@ malloc_heap_alloc(struct malloc_heap *heap,  const char 
*type, size_t size,
unsigned flags, size_t align, size_t bound);
 
 int
+malloc_heap_free(struct malloc_elem *elem);
+
+int
+malloc_heap_resize(struct malloc_elem *elem, size_t size);
+
+int
 malloc_heap_get_stats(struct malloc_heap *heap,
struct rte_malloc_socket_stats *socket_stats);
 
diff --git a/lib/librte_eal/common/rte_malloc.c 
b/lib/librte_eal/common/rte_malloc.c
index e0e0d0b..970813e 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -29,7 +29,7 @@
 void rte_free(void *addr)
 {
if (addr == NULL) return;
-   if (malloc_elem_free(malloc_elem_from_data(addr)) < 0)
+   if (malloc_heap_free(malloc_elem_from_data(addr)) < 0)
rte_panic("Fatal error: Invalid memory\n");
 }
 
@@ -140,7 +140,7 @@ rte_realloc(void *ptr, size_t size, unsigned align)
	size = RTE_CACHE_LINE_ROUNDUP(size), align = RTE_CACHE_LINE_ROUNDUP(align);
	/* check alignment matches first, and if ok, see if we can resize block */
if (RTE_PTR_ALIGN(ptr,align) == ptr &&
-   malloc_elem_resize(elem, size) == 0)
+   malloc_heap_resize(elem, size) == 0)
return ptr;
 
/* either alignment is off, or we have no room to expand,
-- 
2.7.4


[dpdk-dev] [PATCH v4 06/70] eal: make malloc_elem_join_adjacent_free public

2018-04-08 Thread Anatoly Burakov
Down the line, we will need to join free segments to determine
whether the resulting contiguous free space is bigger than a
page size, allowing us to free some memory back to the system.

Signed-off-by: Anatoly Burakov 
---
 lib/librte_eal/common/malloc_elem.c | 6 +++---
 lib/librte_eal/common/malloc_elem.h | 3 +++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/lib/librte_eal/common/malloc_elem.c 
b/lib/librte_eal/common/malloc_elem.c
index e02ed88..2291ee1 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -325,8 +325,8 @@ join_elem(struct malloc_elem *elem1, struct malloc_elem 
*elem2)
elem1->next = next;
 }
 
-static struct malloc_elem *
-elem_join_adjacent_free(struct malloc_elem *elem)
+struct malloc_elem *
+malloc_elem_join_adjacent_free(struct malloc_elem *elem)
 {
/*
 * check if next element exists, is adjacent and is free, if so join
@@ -388,7 +388,7 @@ malloc_elem_free(struct malloc_elem *elem)
ptr = RTE_PTR_ADD(elem, sizeof(*elem));
data_len = elem->size - MALLOC_ELEM_OVERHEAD;
 
-   elem = elem_join_adjacent_free(elem);
+   elem = malloc_elem_join_adjacent_free(elem);
 
malloc_elem_free_list_insert(elem);
 
diff --git a/lib/librte_eal/common/malloc_elem.h 
b/lib/librte_eal/common/malloc_elem.h
index 40e8eb5..99921d2 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -141,6 +141,9 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size,
 int
 malloc_elem_free(struct malloc_elem *elem);
 
+struct malloc_elem *
+malloc_elem_join_adjacent_free(struct malloc_elem *elem);
+
 /*
  * attempt to resize a malloc_elem by expanding into any free space
  * immediately after it in memory.
-- 
2.7.4


[dpdk-dev] [PATCH v4 04/70] eal: add function to dump malloc heap contents

2018-04-08 Thread Anatoly Burakov
Malloc heap is now a doubly linked list, so it's now possible to
iterate over each malloc element regardless of its state.

Signed-off-by: Anatoly Burakov 
---

Notes:
v3: mark function as experimental

 lib/librte_eal/common/include/rte_malloc.h | 10 ++
 lib/librte_eal/common/malloc_elem.c| 24 
 lib/librte_eal/common/malloc_elem.h|  6 ++
 lib/librte_eal/common/malloc_heap.c| 22 ++
 lib/librte_eal/common/malloc_heap.h|  3 +++
 lib/librte_eal/common/rte_malloc.c | 17 +
 lib/librte_eal/rte_eal_version.map |  1 +
 7 files changed, 83 insertions(+)

diff --git a/lib/librte_eal/common/include/rte_malloc.h 
b/lib/librte_eal/common/include/rte_malloc.h
index f02a8ba..a9fb7e4 100644
--- a/lib/librte_eal/common/include/rte_malloc.h
+++ b/lib/librte_eal/common/include/rte_malloc.h
@@ -13,6 +13,7 @@
 
 #include 
 #include 
+#include 
 #include 
 
 #ifdef __cplusplus
@@ -278,6 +279,15 @@ void
 rte_malloc_dump_stats(FILE *f, const char *type);
 
 /**
+ * Dump contents of all malloc heaps to a file.
+ *
+ * @param f
+ *   A pointer to a file for output
+ */
+void __rte_experimental
+rte_malloc_dump_heaps(FILE *f);
+
+/**
  * Set the maximum amount of allocated memory for this type.
  *
  * This is not yet implemented
diff --git a/lib/librte_eal/common/malloc_elem.c 
b/lib/librte_eal/common/malloc_elem.c
index eb41200..e02ed88 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2010-2014 Intel Corporation
  */
+#include 
 #include 
 #include 
 #include 
@@ -434,3 +435,26 @@ malloc_elem_resize(struct malloc_elem *elem, size_t size)
}
return 0;
 }
+
+static inline const char *
+elem_state_to_str(enum elem_state state)
+{
+   switch (state) {
+   case ELEM_PAD:
+   return "PAD";
+   case ELEM_BUSY:
+   return "BUSY";
+   case ELEM_FREE:
+   return "FREE";
+   }
+   return "ERROR";
+}
+
+void
+malloc_elem_dump(const struct malloc_elem *elem, FILE *f)
+{
+   fprintf(f, "Malloc element at %p (%s)\n", elem,
+   elem_state_to_str(elem->state));
+   fprintf(f, "  len: 0x%zx pad: 0x%" PRIx32 "\n", elem->size, elem->pad);
+   fprintf(f, "  prev: %p next: %p\n", elem->prev, elem->next);
+}
diff --git a/lib/librte_eal/common/malloc_elem.h 
b/lib/librte_eal/common/malloc_elem.h
index 238e451..40e8eb5 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -149,6 +149,12 @@ int
 malloc_elem_resize(struct malloc_elem *elem, size_t size);
 
 /*
+ * dump contents of malloc elem to a file.
+ */
+void
+malloc_elem_dump(const struct malloc_elem *elem, FILE *f);
+
+/*
  * Given an element size, compute its freelist index.
  */
 size_t
diff --git a/lib/librte_eal/common/malloc_heap.c 
b/lib/librte_eal/common/malloc_heap.c
index 9c95166..44538d7 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -217,6 +217,28 @@ malloc_heap_get_stats(struct malloc_heap *heap,
return 0;
 }
 
+/*
+ * Function to retrieve data for heap on given socket
+ */
+void
+malloc_heap_dump(struct malloc_heap *heap, FILE *f)
+{
+   struct malloc_elem *elem;
+
+   rte_spinlock_lock(&heap->lock);
+
+   fprintf(f, "Heap size: 0x%zx\n", heap->total_size);
+   fprintf(f, "Heap alloc count: %u\n", heap->alloc_count);
+
+   elem = heap->first;
+   while (elem) {
+   malloc_elem_dump(elem, f);
+   elem = elem->next;
+   }
+
+   rte_spinlock_unlock(&heap->lock);
+}
+
 int
 rte_eal_malloc_heap_init(void)
 {
diff --git a/lib/librte_eal/common/malloc_heap.h 
b/lib/librte_eal/common/malloc_heap.h
index ab0005c..bb28422 100644
--- a/lib/librte_eal/common/malloc_heap.h
+++ b/lib/librte_eal/common/malloc_heap.h
@@ -37,6 +37,9 @@ int
 malloc_heap_get_stats(struct malloc_heap *heap,
struct rte_malloc_socket_stats *socket_stats);
 
+void
+malloc_heap_dump(struct malloc_heap *heap, FILE *f);
+
 int
 rte_eal_malloc_heap_init(void);
 
diff --git a/lib/librte_eal/common/rte_malloc.c 
b/lib/librte_eal/common/rte_malloc.c
index 970813e..f11a822 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -182,6 +182,23 @@ rte_malloc_get_socket_stats(int socket,
 }
 
 /*
+ * Function to dump contents of all heaps
+ */
+void __rte_experimental
+rte_malloc_dump_heaps(FILE *f)
+{
+   struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+   unsigned int idx;
+
+   for (idx = 0; idx < rte_socket_count(); idx++) {
+   unsigned int socket = rte_socket_id_by_idx(idx);
+   fprintf(f, "Heap on socket %i:\n", socket);
+   malloc_heap_dump(&mcfg->malloc_heaps[socket], f);
+   }
+
+}
+
+/*
  * Print stats on memory 

[dpdk-dev] [PATCH v4 03/70] eal: make malloc heap a doubly-linked list

2018-04-08 Thread Anatoly Burakov
As we are preparing for dynamic memory allocation, we need to be
able to handle holes in our malloc heap, hence we're switching to a
doubly linked list, and preparing the infrastructure to support it.

Since our heap is now aware where are our first and last elements,
there is no longer any need to have a dummy element at the end of
each heap, so get rid of that as well. Instead, let insert/remove/
join/split operations handle end-of-list conditions automatically.

Signed-off-by: Anatoly Burakov 
---

Notes:
v3:
- Make first/last element pointers volatile

 lib/librte_eal/common/include/rte_malloc_heap.h |   6 +
 lib/librte_eal/common/malloc_elem.c | 200 +++-
 lib/librte_eal/common/malloc_elem.h |  14 +-
 lib/librte_eal/common/malloc_heap.c |   8 +-
 4 files changed, 179 insertions(+), 49 deletions(-)

diff --git a/lib/librte_eal/common/include/rte_malloc_heap.h 
b/lib/librte_eal/common/include/rte_malloc_heap.h
index ba99ed9..d43fa90 100644
--- a/lib/librte_eal/common/include/rte_malloc_heap.h
+++ b/lib/librte_eal/common/include/rte_malloc_heap.h
@@ -13,12 +13,18 @@
 /* Number of free lists per heap, grouped by size. */
 #define RTE_HEAP_NUM_FREELISTS  13
 
+/* dummy definition, for pointers */
+struct malloc_elem;
+
 /**
  * Structure to hold malloc heap
  */
 struct malloc_heap {
rte_spinlock_t lock;
LIST_HEAD(, malloc_elem) free_head[RTE_HEAP_NUM_FREELISTS];
+   struct malloc_elem *volatile first;
+   struct malloc_elem *volatile last;
+
unsigned alloc_count;
size_t total_size;
 } __rte_cache_aligned;
diff --git a/lib/librte_eal/common/malloc_elem.c 
b/lib/librte_eal/common/malloc_elem.c
index ea041e2..eb41200 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -31,6 +31,7 @@ malloc_elem_init(struct malloc_elem *elem,
elem->heap = heap;
elem->ms = ms;
elem->prev = NULL;
+   elem->next = NULL;
memset(&elem->free_list, 0, sizeof(elem->free_list));
elem->state = ELEM_FREE;
elem->size = size;
@@ -39,15 +40,56 @@ malloc_elem_init(struct malloc_elem *elem,
set_trailer(elem);
 }
 
-/*
- * Initialize a dummy malloc_elem header for the end-of-memseg marker
- */
 void
-malloc_elem_mkend(struct malloc_elem *elem, struct malloc_elem *prev)
+malloc_elem_insert(struct malloc_elem *elem)
 {
-   malloc_elem_init(elem, prev->heap, prev->ms, 0);
-   elem->prev = prev;
-   elem->state = ELEM_BUSY; /* mark busy so its never merged */
+   struct malloc_elem *prev_elem, *next_elem;
+   struct malloc_heap *heap = elem->heap;
+
+   if (heap->first == NULL && heap->last == NULL) {
+   /* if empty heap */
+   heap->first = elem;
+   heap->last = elem;
+   prev_elem = NULL;
+   next_elem = NULL;
+   } else if (elem < heap->first) {
+   /* if lower than start */
+   prev_elem = NULL;
+   next_elem = heap->first;
+   heap->first = elem;
+   } else if (elem > heap->last) {
+   /* if higher than end */
+   prev_elem = heap->last;
+   next_elem = NULL;
+   heap->last = elem;
+   } else {
+		/* the new memory is somewhere in between start and end */
+   uint64_t dist_from_start, dist_from_end;
+
+   dist_from_end = RTE_PTR_DIFF(heap->last, elem);
+   dist_from_start = RTE_PTR_DIFF(elem, heap->first);
+
+   /* check which is closer, and find closest list entries */
+   if (dist_from_start < dist_from_end) {
+   prev_elem = heap->first;
+   while (prev_elem->next < elem)
+   prev_elem = prev_elem->next;
+   next_elem = prev_elem->next;
+   } else {
+   next_elem = heap->last;
+   while (next_elem->prev > elem)
+   next_elem = next_elem->prev;
+   prev_elem = next_elem->prev;
+   }
+   }
+
+   /* insert new element */
+   elem->prev = prev_elem;
+   elem->next = next_elem;
+   if (prev_elem)
+   prev_elem->next = elem;
+   if (next_elem)
+   next_elem->prev = elem;
 }
 
 /*
@@ -98,18 +140,58 @@ malloc_elem_can_hold(struct malloc_elem *elem, size_t 
size,unsigned align,
 static void
 split_elem(struct malloc_elem *elem, struct malloc_elem *split_pt)
 {
-   struct malloc_elem *next_elem = RTE_PTR_ADD(elem, elem->size);
+   struct malloc_elem *next_elem = elem->next;
const size_t old_elem_size = (uintptr_t)split_pt - (uintptr_t)elem;
const size_t new_elem_size = elem->size - old_elem_size;
 
malloc_elem_init(split_pt, elem->heap, elem->ms, new_elem_size);
split_pt->prev = elem;
-   next_elem->

[dpdk-dev] [PATCH v4 09/70] eal: replace panics with error messages in malloc

2018-04-08 Thread Anatoly Burakov
We shouldn't ever panic in system libraries, let alone in
such core ones as EAL, so replace all panic messages with
error messages.

Signed-off-by: Anatoly Burakov 
---
 lib/librte_eal/common/rte_malloc.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lib/librte_eal/common/rte_malloc.c 
b/lib/librte_eal/common/rte_malloc.c
index f11a822..2cda48e 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -30,7 +30,7 @@ void rte_free(void *addr)
 {
if (addr == NULL) return;
if (malloc_heap_free(malloc_elem_from_data(addr)) < 0)
-   rte_panic("Fatal error: Invalid memory\n");
+   RTE_LOG(ERR, EAL, "Error: Invalid memory\n");
 }
 
 /*
@@ -134,8 +134,10 @@ rte_realloc(void *ptr, size_t size, unsigned align)
return rte_malloc(NULL, size, align);
 
struct malloc_elem *elem = malloc_elem_from_data(ptr);
-   if (elem == NULL)
-   rte_panic("Fatal error: memory corruption detected\n");
+   if (elem == NULL) {
+   RTE_LOG(ERR, EAL, "Error: memory corruption detected\n");
+   return NULL;
+   }
 
	size = RTE_CACHE_LINE_ROUNDUP(size), align = RTE_CACHE_LINE_ROUNDUP(align);
	/* check alignment matches first, and if ok, see if we can resize block */
-- 
2.7.4


[dpdk-dev] [PATCH v4 08/70] eal: make malloc free return resulting malloc element

2018-04-08 Thread Anatoly Burakov
This will be needed because we need to know how big the new empty
space is, to check whether we can free some pages back as a result.
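
The intended use, roughly (a sketch; page_sz and try_unmap_pages() are
placeholders for logic added later in the series, not existing code):

	struct malloc_elem *free_elem = malloc_elem_free(elem);

	/* if the resulting free element spans at least one full page,
	 * it is a candidate for returning memory to the system */
	if (free_elem != NULL && free_elem->size >= page_sz)
		try_unmap_pages(free_elem);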

Signed-off-by: Anatoly Burakov 
---

Notes:
v3: clarified commit message

 lib/librte_eal/common/malloc_elem.c | 4 ++--
 lib/librte_eal/common/malloc_elem.h | 2 +-
 lib/librte_eal/common/malloc_heap.c | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/librte_eal/common/malloc_elem.c 
b/lib/librte_eal/common/malloc_elem.c
index 008f5a3..c18f050 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -379,7 +379,7 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem)
  * blocks either immediately before or immediately after newly freed block
  * are also free, the blocks are merged together.
  */
-int
+struct malloc_elem *
 malloc_elem_free(struct malloc_elem *elem)
 {
void *ptr;
@@ -397,7 +397,7 @@ malloc_elem_free(struct malloc_elem *elem)
 
memset(ptr, 0, data_len);
 
-   return 0;
+   return elem;
 }
 
 /*
diff --git a/lib/librte_eal/common/malloc_elem.h 
b/lib/librte_eal/common/malloc_elem.h
index 46e2383..9c1614c 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -138,7 +138,7 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size,
  * blocks either immediately before or immediately after newly freed block
  * are also free, the blocks are merged together.
  */
-int
+struct malloc_elem *
 malloc_elem_free(struct malloc_elem *elem);
 
 struct malloc_elem *
diff --git a/lib/librte_eal/common/malloc_heap.c 
b/lib/librte_eal/common/malloc_heap.c
index 44538d7..a2c2e4c 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -145,7 +145,7 @@ int
 malloc_heap_free(struct malloc_elem *elem)
 {
struct malloc_heap *heap;
-   int ret;
+   struct malloc_elem *ret;
 
if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
return -1;
@@ -159,7 +159,7 @@ malloc_heap_free(struct malloc_elem *elem)
 
rte_spinlock_unlock(&(heap->lock));
 
-   return ret;
+   return ret != NULL ? 0 : -1;
 }
 
 int
-- 
2.7.4


[dpdk-dev] [PATCH v4 00/70] Memory Hotplug for DPDK

2018-04-08 Thread Anatoly Burakov
This patchset introduces dynamic memory allocation for DPDK (aka memory
hotplug). Based upon RFC submitted in December [1].

Dependencies (to be applied in specified order):
- EAL IOVA fix [2]

Deprecation notices relevant to this patchset:
- General outline of memory hotplug changes [3]

The vast majority of changes are in the EAL and malloc, the external API
disruption is minimal: a new set of API's are added for contiguous memory
allocation for rte_memzone, and a few API additions in rte_memory due to
switch to memseg_lists as opposed to memsegs. Every other API change is
internal to EAL, and all of the memory allocation/freeing is handled
through rte_malloc, with no externally visible API changes.

Quick outline of all changes done as part of this patchset:

 * Malloc heap adjusted to handle holes in address space
 * Single memseg list replaced by multiple memseg lists
 * VA space for hugepages is preallocated in advance
 * Added alloc/free for pages happening as needed on rte_malloc/rte_free
 * Added contiguous memory allocation API's for rte_memzone
 * Added convenience API calls to walk over memsegs
 * Integrated Pawel Wodkowski's patch for registering/unregistering memory
   with VFIO [4]
 * Callbacks for registering memory allocations
 * Callbacks for allowing/disallowing allocations above specified limit
 * Multiprocess support done via DPDK IPC introduced in 18.02

The biggest difference is a "memseg" now represents a single page (as opposed to
being a big contiguous block of pages). As a consequence, both memzones and
malloc elements are no longer guaranteed to be physically contiguous, unless
the user asks for it at reserve time. To preserve whatever functionality that
was dependent on previous behavior, a legacy memory option is also provided,
however it is expected (or perhaps vainly hoped) to be a temporary solution.

Why multiple memseg lists instead of one? Since memseg is a single page now,
the list of memsegs will get quite big, and we need to locate pages somehow
when we allocate and free them. We could of course just walk the list and
allocate one contiguous chunk of VA space for memsegs, but this
implementation uses separate lists instead in order to speed up many
operations with memseg lists.

For v4, the following limitations are present:
- VFIO support for multiple processes is not well-tested; work is ongoing
  to validate VFIO for all use cases
- There are known problems with PPC64 VFIO code
- For DPAA and FSLMC platforms, performance will be heavily degraded for
  IOVA as PA cases; separate patches are expected to address the issue

For testing, it is recommended to use the GitHub repository [5], as it will
have all of the dependencies already integrated.

Tested-by: Hemant Agrawal 
Tested-by: Santosh Shukla 

v4:
- Fixed bug in memzone lookup
- Added draft fslmc VFIO code
- Rebased on latest master + dependent patchset
- Documented limitations for *_walk() functions

v3:
- Lots of compile fixes
- Fixes for multiprocess synchronization
- Introduced support for sPAPR IOMMU, courtesy of Gowrishankar @ IBM
- Fixes for mempool size calculation
- Added convenience memseg walk() API's
- Added alloc validation callback

v2: - fixed deadlock at init
- reverted rte_panic changes at init, this is now handled inside IPC

[1] http://dpdk.org/dev/patchwork/bundle/aburakov/Memory_RFC/
[2] http://dpdk.org/dev/patchwork/bundle/aburakov/IOVA_mode_fixes/
[3] http://dpdk.org/dev/patchwork/patch/34002/
[4] http://dpdk.org/dev/patchwork/patch/24484/
[5] https://github.com/anatolyburakov/dpdk

Anatoly Burakov (70):
  eal: move get_virtual_area out of linuxapp eal_memory.c
  eal: move all locking to heap
  eal: make malloc heap a doubly-linked list
  eal: add function to dump malloc heap contents
  test: add command to dump malloc heap contents
  eal: make malloc_elem_join_adjacent_free public
  eal: make malloc free list remove public
  eal: make malloc free return resulting malloc element
  eal: replace panics with error messages in malloc
  eal: add backend support for contiguous allocation
  eal: enable reserving physically contiguous memzones
  ethdev: use contiguous allocation for DMA memory
  crypto/qat: use contiguous allocation for DMA memory
  net/avf: use contiguous allocation for DMA memory
  net/bnx2x: use contiguous allocation for DMA memory
  net/bnxt: use contiguous allocation for DMA memory
  net/cxgbe: use contiguous allocation for DMA memory
  net/ena: use contiguous allocation for DMA memory
  net/enic: use contiguous allocation for DMA memory
  net/i40e: use contiguous allocation for DMA memory
  net/qede: use contiguous allocation for DMA memory
  net/virtio: use contiguous allocation for DMA memory
  net/vmxnet3: use contiguous allocation for DMA memory
  mempool: add support for the new allocation methods
  eal: add function to walk all memsegs
  bus/fslmc: use memseg walk instead of iteration
  bus/pci: use memseg walk instead of iter

[dpdk-dev] [PATCH v4 17/70] net/cxgbe: use contiguous allocation for DMA memory

2018-04-08 Thread Anatoly Burakov
All hardware drivers should allocate IOVA-contiguous
memzones for their hardware resources.

Signed-off-by: Anatoly Burakov 
---

Notes:
v4:
- Use new memzone flag instead of new API
- Remove experimental API from build files

v3:
- Add experimental API to build files

v3:
- Moved patch earlier in the patchset
- Allowed experimental API's in the makefile

 drivers/net/cxgbe/sge.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/cxgbe/sge.c b/drivers/net/cxgbe/sge.c
index 83e26d0..85846fc 100644
--- a/drivers/net/cxgbe/sge.c
+++ b/drivers/net/cxgbe/sge.c
@@ -1344,7 +1344,8 @@ static void *alloc_ring(size_t nelem, size_t elem_size,
 * handle the maximum ring size is allocated in order to allow for
 * resizing in later calls to the queue setup function.
 */
-   tz = rte_memzone_reserve_aligned(z_name, len, socket_id, 0, 4096);
+   tz = rte_memzone_reserve_aligned(z_name, len, socket_id,
+   RTE_MEMZONE_IOVA_CONTIG, 4096);
if (!tz)
return NULL;
 
-- 
2.7.4


[dpdk-dev] [PATCH v4 12/70] ethdev: use contiguous allocation for DMA memory

2018-04-08 Thread Anatoly Burakov
All hardware drivers should allocate IOVA-contiguous
memzones for their hardware resources.

This fixes the following drivers in one go:

grep -Rl rte_eth_dma_zone_reserve drivers/

drivers/net/avf/avf_rxtx.c
drivers/net/thunderx/nicvf_ethdev.c
drivers/net/e1000/igb_rxtx.c
drivers/net/e1000/em_rxtx.c
drivers/net/fm10k/fm10k_ethdev.c
drivers/net/vmxnet3/vmxnet3_rxtx.c
drivers/net/liquidio/lio_rxtx.c
drivers/net/i40e/i40e_rxtx.c
drivers/net/sfc/sfc.c
drivers/net/ixgbe/ixgbe_rxtx.c
drivers/net/nfp/nfp_net.c

Signed-off-by: Anatoly Burakov 
---

Notes:
v4: replaced use of new API with additional memzone flag

v3: moved this patch earlier in the patchset

 lib/librte_ether/rte_ethdev.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 2c74f7e..d0cf0e7 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -3403,7 +3403,8 @@ rte_eth_dma_zone_reserve(const struct rte_eth_dev *dev, 
const char *ring_name,
if (mz)
return mz;
 
-   return rte_memzone_reserve_aligned(z_name, size, socket_id, 0, align);
+   return rte_memzone_reserve_aligned(z_name, size, socket_id,
+   RTE_MEMZONE_IOVA_CONTIG, align);
 }
 
 int
-- 
2.7.4


[dpdk-dev] [PATCH v4 11/70] eal: enable reserving physically contiguous memzones

2018-04-08 Thread Anatoly Burakov
This adds a new flag to request a reserved memzone to be IOVA
contiguous. This is useful for allocating hardware resources like
NIC rings/queues etc. For now, hugepage memory is always contiguous,
but we need to prepare the drivers for the switch.
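
Usage is a one-line change at reserve time, e.g. (sketch; the zone name,
size and socket are placeholders):

	const struct rte_memzone *mz;

	/* ask for IOVA-contiguous backing for a HW descriptor ring */
	mz = rte_memzone_reserve_aligned("hw_ring", ring_size, socket_id,
			RTE_MEMZONE_IOVA_CONTIG, RTE_CACHE_LINE_SIZE);
	if (mz == NULL)
		return -ENOMEM;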

Signed-off-by: Anatoly Burakov 
---

Notes:
v4:
- Replaced a new API with a memzone flag

v3:
- Moved this patch earlier

v3:
- Moved this patch earlier

 lib/librte_eal/common/eal_common_memzone.c  | 25 +
 lib/librte_eal/common/include/rte_memzone.h | 11 +++
 2 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_memzone.c 
b/lib/librte_eal/common/eal_common_memzone.c
index 16a2e7a..af68c00 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -99,12 +99,13 @@ find_heap_max_free_elem(int *s, unsigned align)
 static const struct rte_memzone *
 memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
int socket_id, unsigned int flags, unsigned int align,
-   unsigned int bound, bool contig)
+   unsigned int bound)
 {
struct rte_memzone *mz;
struct rte_mem_config *mcfg;
size_t requested_len;
int socket, i;
+   bool contig;
 
/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
@@ -170,7 +171,17 @@ memzone_reserve_aligned_thread_unsafe(const char *name, 
size_t len,
if (!rte_eal_has_hugepages())
socket_id = SOCKET_ID_ANY;
 
+   contig = (flags & RTE_MEMZONE_IOVA_CONTIG) != 0;
+   /* malloc only cares about size flags, remove contig flag from flags */
+   flags &= ~RTE_MEMZONE_IOVA_CONTIG;
+
if (len == 0) {
+   /* len == 0 is only allowed for non-contiguous zones */
+   if (contig) {
+   RTE_LOG(DEBUG, EAL, "Reserving zero-length contiguous 
memzones is not supported\n");
+   rte_errno = EINVAL;
+   return NULL;
+   }
if (bound != 0)
requested_len = bound;
else {
@@ -238,8 +249,7 @@ memzone_reserve_aligned_thread_unsafe(const char *name, 
size_t len,
 
 static const struct rte_memzone *
 rte_memzone_reserve_thread_safe(const char *name, size_t len, int socket_id,
-   unsigned int flags, unsigned int align, unsigned int bound,
-   bool contig)
+   unsigned int flags, unsigned int align, unsigned int bound)
 {
struct rte_mem_config *mcfg;
const struct rte_memzone *mz = NULL;
@@ -250,7 +260,7 @@ rte_memzone_reserve_thread_safe(const char *name, size_t 
len, int socket_id,
rte_rwlock_write_lock(&mcfg->mlock);
 
mz = memzone_reserve_aligned_thread_unsafe(
-   name, len, socket_id, flags, align, bound, contig);
+   name, len, socket_id, flags, align, bound);
 
rte_rwlock_write_unlock(&mcfg->mlock);
 
@@ -267,7 +277,7 @@ rte_memzone_reserve_bounded(const char *name, size_t len, 
int socket_id,
unsigned flags, unsigned align, unsigned bound)
 {
return rte_memzone_reserve_thread_safe(name, len, socket_id, flags,
-  align, bound, false);
+  align, bound);
 }
 
 /*
@@ -279,7 +289,7 @@ rte_memzone_reserve_aligned(const char *name, size_t len, 
int socket_id,
unsigned flags, unsigned align)
 {
return rte_memzone_reserve_thread_safe(name, len, socket_id, flags,
-  align, 0, false);
+  align, 0);
 }
 
 /*
@@ -291,8 +301,7 @@ rte_memzone_reserve(const char *name, size_t len, int 
socket_id,
unsigned flags)
 {
return rte_memzone_reserve_thread_safe(name, len, socket_id,
-  flags, RTE_CACHE_LINE_SIZE, 0,
-  false);
+  flags, RTE_CACHE_LINE_SIZE, 0);
 }
 
 int
diff --git a/lib/librte_eal/common/include/rte_memzone.h 
b/lib/librte_eal/common/include/rte_memzone.h
index 2bfb273..e2630fd 100644
--- a/lib/librte_eal/common/include/rte_memzone.h
+++ b/lib/librte_eal/common/include/rte_memzone.h
@@ -23,6 +23,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 
@@ -39,6 +40,7 @@ extern "C" {
 #define RTE_MEMZONE_512MB  0x00040000   /**< Use 512MB pages. */
 #define RTE_MEMZONE_4GB    0x00080000   /**< Use 4GB pages. */
 #define RTE_MEMZONE_SIZE_HINT_ONLY 0x00000004   /**< Use available page size */
+#define RTE_MEMZONE_IOVA_CONTIG    0x00100000   /**< Ask for IOVA-contiguous memzone. */
 
 /**
  * A structure describing a memzone, which is a contiguous portion of
@@ -102,6 +104,9 @@ struc

[dpdk-dev] [PATCH v4 15/70] net/bnx2x: use contiguous allocation for DMA memory

2018-04-08 Thread Anatoly Burakov
All hardware drivers should allocate IOVA-contiguous
memzones for their hardware resources.

Signed-off-by: Anatoly Burakov 
---

Notes:
v4:
- Use new memzone flag instead of new API
- Remove experimental API from build files

v3:
- Add experimental API to build files

v3:
- Moved patch earlier in the patchset
- Allowed experimental API's in the makefile

 drivers/net/bnx2x/bnx2x.c  | 6 +++---
 drivers/net/bnx2x/bnx2x_rxtx.c | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/net/bnx2x/bnx2x.c b/drivers/net/bnx2x/bnx2x.c
index fb02d0f..81f5dae 100644
--- a/drivers/net/bnx2x/bnx2x.c
+++ b/drivers/net/bnx2x/bnx2x.c
@@ -177,7 +177,7 @@ bnx2x_dma_alloc(struct bnx2x_softc *sc, size_t size, struct 
bnx2x_dma *dma,
rte_get_timer_cycles());
 
/* Caller must take care that strlen(mz_name) < RTE_MEMZONE_NAMESIZE */
-   z = rte_memzone_reserve_aligned(mz_name, (uint64_t) (size),
-   SOCKET_ID_ANY,
-   0, align);
+   z = rte_memzone_reserve_aligned(mz_name, (uint64_t)size,
+   SOCKET_ID_ANY,
+   RTE_MEMZONE_IOVA_CONTIG, align);
if (z == NULL) {
diff --git a/drivers/net/bnx2x/bnx2x_rxtx.c b/drivers/net/bnx2x/bnx2x_rxtx.c
index a0d4ac9..6be7277 100644
--- a/drivers/net/bnx2x/bnx2x_rxtx.c
+++ b/drivers/net/bnx2x/bnx2x_rxtx.c
@@ -26,7 +26,8 @@ ring_dma_zone_reserve(struct rte_eth_dev *dev, const char 
*ring_name,
if (mz)
return mz;
 
-   return rte_memzone_reserve_aligned(z_name, ring_size, socket_id, 0, 
BNX2X_PAGE_SIZE);
+   return rte_memzone_reserve_aligned(z_name, ring_size, socket_id,
+   RTE_MEMZONE_IOVA_CONTIG, BNX2X_PAGE_SIZE);
 }
 
 static void
-- 
2.7.4


[dpdk-dev] [PATCH v4 18/70] net/ena: use contiguous allocation for DMA memory

2018-04-08 Thread Anatoly Burakov
All hardware drivers should allocate IOVA-contiguous
memzones for their hardware resources.

Signed-off-by: Anatoly Burakov 
Acked-by: Michal Krawczyk 
---

Notes:
v4:
- Use new memzone flag instead of new API
- Remove experimental API from build files

v3:
- Add experimental API to build files

v3:
- Moved patch earlier in the patchset
- Allowed experimental APIs in the Makefile

 drivers/net/ena/base/ena_plat_dpdk.h | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ena/base/ena_plat_dpdk.h 
b/drivers/net/ena/base/ena_plat_dpdk.h
index 8cba319..9334519 100644
--- a/drivers/net/ena/base/ena_plat_dpdk.h
+++ b/drivers/net/ena/base/ena_plat_dpdk.h
@@ -188,7 +188,8 @@ typedef uint64_t dma_addr_t;
ENA_TOUCH(dmadev); ENA_TOUCH(handle);   \
snprintf(z_name, sizeof(z_name),\
"ena_alloc_%d", ena_alloc_cnt++);   \
-   mz = rte_memzone_reserve(z_name, size, SOCKET_ID_ANY, 0); \
+   mz = rte_memzone_reserve(z_name, size, SOCKET_ID_ANY,   \
+   RTE_MEMZONE_IOVA_CONTIG);   \
memset(mz->addr, 0, size);  \
virt = mz->addr;\
phys = mz->iova;\
@@ -206,7 +207,8 @@ typedef uint64_t dma_addr_t;
ENA_TOUCH(dmadev); ENA_TOUCH(dev_node); \
snprintf(z_name, sizeof(z_name),\
"ena_alloc_%d", ena_alloc_cnt++);   \
-   mz = rte_memzone_reserve(z_name, size, node, 0); \
+   mz = rte_memzone_reserve(z_name, size, node,\
+   RTE_MEMZONE_IOVA_CONTIG);   \
memset(mz->addr, 0, size);  \
virt = mz->addr;\
phys = mz->iova;\
@@ -219,7 +221,8 @@ typedef uint64_t dma_addr_t;
ENA_TOUCH(dmadev); ENA_TOUCH(dev_node); \
snprintf(z_name, sizeof(z_name),\
"ena_alloc_%d", ena_alloc_cnt++);   \
-   mz = rte_memzone_reserve(z_name, size, node, 0); \
+   mz = rte_memzone_reserve(z_name, size, node,\
+   RTE_MEMZONE_IOVA_CONTIG);   \
memset(mz->addr, 0, size);  \
virt = mz->addr;\
} while (0)
-- 
2.7.4


[dpdk-dev] [PATCH v4 13/70] crypto/qat: use contiguous allocation for DMA memory

2018-04-08 Thread Anatoly Burakov
All hardware drivers should allocate IOVA-contiguous
memzones for their hardware resources.

Also, remove the weird page alignment code.

Signed-off-by: Anatoly Burakov 
Acked-by: Fiona Trahe 
---

Notes:
v4:
- Replace new API with new memzone flag

v3:
- Move the patch earlier in the patchset
- Fix build system files to allow experimental APIs
- Removed nonsensical memzone flags code

 drivers/crypto/qat/qat_qp.c | 23 ++-
 1 file changed, 2 insertions(+), 21 deletions(-)

diff --git a/drivers/crypto/qat/qat_qp.c b/drivers/crypto/qat/qat_qp.c
index 87b9ce0..478b7ba 100644
--- a/drivers/crypto/qat/qat_qp.c
+++ b/drivers/crypto/qat/qat_qp.c
@@ -54,8 +54,6 @@ queue_dma_zone_reserve(const char *queue_name, uint32_t 
queue_size,
int socket_id)
 {
const struct rte_memzone *mz;
-   unsigned memzone_flags = 0;
-   const struct rte_memseg *ms;
 
PMD_INIT_FUNC_TRACE();
mz = rte_memzone_lookup(queue_name);
@@ -78,25 +76,8 @@ queue_dma_zone_reserve(const char *queue_name, uint32_t 
queue_size,
 
PMD_DRV_LOG(DEBUG, "Allocate memzone for %s, size %u on socket %u",
queue_name, queue_size, socket_id);
-   ms = rte_eal_get_physmem_layout();
-   switch (ms[0].hugepage_sz) {
-   case(RTE_PGSIZE_2M):
-   memzone_flags = RTE_MEMZONE_2MB;
-   break;
-   case(RTE_PGSIZE_1G):
-   memzone_flags = RTE_MEMZONE_1GB;
-   break;
-   case(RTE_PGSIZE_16M):
-   memzone_flags = RTE_MEMZONE_16MB;
-   break;
-   case(RTE_PGSIZE_16G):
-   memzone_flags = RTE_MEMZONE_16GB;
-   break;
-   default:
-   memzone_flags = RTE_MEMZONE_SIZE_HINT_ONLY;
-   }
-   return rte_memzone_reserve_aligned(queue_name, queue_size, socket_id,
-   memzone_flags, queue_size);
+   return rte_memzone_reserve_aligned(queue_name, queue_size,
+   socket_id, RTE_MEMZONE_IOVA_CONTIG, queue_size);
 }
 
 int qat_crypto_sym_qp_setup(struct rte_cryptodev *dev, uint16_t queue_pair_id,
-- 
2.7.4


[dpdk-dev] [PATCH v4 19/70] net/enic: use contiguous allocation for DMA memory

2018-04-08 Thread Anatoly Burakov
All hardware drivers should allocate IOVA-contiguous
memzones for their hardware resources.

Signed-off-by: Anatoly Burakov 
Acked-by: John Daley 
---

Notes:
v4:
- Use new memzone flag instead of new API
- Remove experimental API from build files

v3:
- Add experimental API to build files

v3:
- Moved patch earlier in the patchset
- Allowed experimental API in Makefile

 drivers/net/enic/enic_main.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/net/enic/enic_main.c b/drivers/net/enic/enic_main.c
index 69ad425..94e8e68 100644
--- a/drivers/net/enic/enic_main.c
+++ b/drivers/net/enic/enic_main.c
@@ -343,8 +343,8 @@ enic_alloc_consistent(void *priv, size_t size,
struct enic *enic = (struct enic *)priv;
struct enic_memzone_entry *mze;
 
-   rz = rte_memzone_reserve_aligned((const char *)name,
-size, SOCKET_ID_ANY, 0, ENIC_ALIGN);
+   rz = rte_memzone_reserve_aligned((const char *)name, size,
+   SOCKET_ID_ANY, RTE_MEMZONE_IOVA_CONTIG, ENIC_ALIGN);
if (!rz) {
pr_err("%s : Failed to allocate memory requested for %s\n",
__func__, name);
@@ -888,9 +888,8 @@ int enic_alloc_wq(struct enic *enic, uint16_t queue_idx,
instance++);
 
wq->cqmsg_rz = rte_memzone_reserve_aligned((const char *)name,
-  sizeof(uint32_t),
-  SOCKET_ID_ANY, 0,
-  ENIC_ALIGN);
+   sizeof(uint32_t), SOCKET_ID_ANY,
+   RTE_MEMZONE_IOVA_CONTIG, ENIC_ALIGN);
if (!wq->cqmsg_rz)
return -ENOMEM;
 
-- 
2.7.4


[dpdk-dev] [PATCH v4 01/70] eal: move get_virtual_area out of linuxapp eal_memory.c

2018-04-08 Thread Anatoly Burakov
Move get_virtual_area out of linuxapp EAL memory and make it
common to EAL, so that other code could reserve virtual areas
as well.
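
As a sketch of how a caller would use it (the flag names below come
from eal_private.h in this patch; the surrounding helper is
illustrative):

    #include "eal_private.h"

    /* hypothetical caller: reserve (and immediately unmap) an aligned
     * VA region so it can later be re-mapped at a known address; let
     * the request shrink if the full size cannot be reserved
     */
    static void *
    reserve_va_space(size_t *size, size_t page_sz)
    {
        return eal_get_virtual_area(NULL, size, page_sz,
                EAL_VIRTUAL_AREA_ALLOW_SHRINK |
                EAL_VIRTUAL_AREA_UNMAP, 0);
    }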

Signed-off-by: Anatoly Burakov 
---

Notes:
v3: replace uint64_t with size_t for size variables

 lib/librte_eal/common/eal_common_memory.c | 101 ++
 lib/librte_eal/common/eal_private.h   |  33 +++
 lib/librte_eal/linuxapp/eal/eal_memory.c  | 137 ++
 3 files changed, 161 insertions(+), 110 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_memory.c 
b/lib/librte_eal/common/eal_common_memory.c
index 852f3bb..5b8ced4 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -2,10 +2,12 @@
  * Copyright(c) 2010-2014 Intel Corporation
  */
 
+#include 
 #include 
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -14,12 +16,111 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "eal_private.h"
 #include "eal_internal_cfg.h"
 
 /*
+ * Try to mmap *size bytes in /dev/zero. If it is successful, return the
+ * pointer to the mmap'd area and keep *size unmodified. Else, retry
+ * with a smaller zone: decrease *size by hugepage_sz until it reaches
+ * 0. In this case, return NULL. Note: this function returns an address
+ * which is a multiple of hugepage size.
+ */
+
+static uint64_t baseaddr_offset;
+static uint64_t system_page_sz;
+
+void *
+eal_get_virtual_area(void *requested_addr, size_t *size,
+   size_t page_sz, int flags, int mmap_flags)
+{
+   bool addr_is_hint, allow_shrink, unmap, no_align;
+   uint64_t map_sz;
+   void *mapped_addr, *aligned_addr;
+
+   if (system_page_sz == 0)
+   system_page_sz = sysconf(_SC_PAGESIZE);
+
+   mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS;
+
+   RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);
+
+   addr_is_hint = (flags & EAL_VIRTUAL_AREA_ADDR_IS_HINT) > 0;
+   allow_shrink = (flags & EAL_VIRTUAL_AREA_ALLOW_SHRINK) > 0;
+   unmap = (flags & EAL_VIRTUAL_AREA_UNMAP) > 0;
+
+   if (requested_addr == NULL && internal_config.base_virtaddr != 0) {
+   requested_addr = (void *) (internal_config.base_virtaddr +
+   (size_t)baseaddr_offset);
+   requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz);
+   addr_is_hint = true;
+   }
+
+   /* if requested address is not aligned by page size, or if requested
+* address is NULL, add page size to requested length as we may get an
+* address that's aligned by system page size, which can be smaller than
+* our requested page size. additionally, we shouldn't try to align if
+* system page size is the same as requested page size.
+*/
+   no_align = (requested_addr != NULL &&
+   ((uintptr_t)requested_addr & (page_sz - 1)) == 0) ||
+   page_sz == system_page_sz;
+
+   do {
+   map_sz = no_align ? *size : *size + page_sz;
+
+   mapped_addr = mmap(requested_addr, map_sz, PROT_READ,
+   mmap_flags, -1, 0);
+   if (mapped_addr == MAP_FAILED && allow_shrink)
+   *size -= page_sz;
+   } while (allow_shrink && mapped_addr == MAP_FAILED && *size > 0);
+
+   /* align resulting address - if map failed, we will ignore the value
+* anyway, so no need to add additional checks.
+*/
+   aligned_addr = no_align ? mapped_addr :
+   RTE_PTR_ALIGN(mapped_addr, page_sz);
+
+   if (*size == 0) {
+   RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n",
+   strerror(errno));
+   rte_errno = errno;
+   return NULL;
+   } else if (mapped_addr == MAP_FAILED) {
+   RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
+   strerror(errno));
+   /* pass errno up the call chain */
+   rte_errno = errno;
+   return NULL;
+   } else if (requested_addr != NULL && !addr_is_hint &&
+   aligned_addr != requested_addr) {
+   RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested 
address: %p (got %p)\n",
+   requested_addr, aligned_addr);
+   munmap(mapped_addr, map_sz);
+   rte_errno = EADDRNOTAVAIL;
+   return NULL;
+   } else if (requested_addr != NULL && addr_is_hint &&
+   aligned_addr != requested_addr) {
+   RTE_LOG(WARNING, EAL, "WARNING! Base virtual address hint (%p 
!= %p) not respected!\n",
+   requested_addr, aligned_addr);
+   RTE_LOG(WARNING, EAL, "   This may cause issues with mapping 
memory into secondary processes\n");
+   }
+
+   if (unmap)
+   munmap(mapped_addr, map_sz);
+
+   RTE_LOG(DEBUG

[dpdk-dev] [PATCH v4 07/70] eal: make malloc free list remove public

2018-04-08 Thread Anatoly Burakov
We will need to be able to remove entries from a heap's free lists
during certain events, such as rollbacks, or when freeing
memory to the system (where a previously allocated element disappears
and thus can no longer be in the free list).

Signed-off-by: Anatoly Burakov 
---
 lib/librte_eal/common/malloc_elem.c | 12 ++--
 lib/librte_eal/common/malloc_elem.h |  3 +++
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/lib/librte_eal/common/malloc_elem.c 
b/lib/librte_eal/common/malloc_elem.c
index 2291ee1..008f5a3 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -245,8 +245,8 @@ malloc_elem_free_list_insert(struct malloc_elem *elem)
 /*
  * Remove the specified element from its heap's free list.
  */
-static void
-elem_free_list_remove(struct malloc_elem *elem)
+void
+malloc_elem_free_list_remove(struct malloc_elem *elem)
 {
LIST_REMOVE(elem, free_list);
 }
@@ -266,7 +266,7 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size, 
unsigned align,
const size_t trailer_size = elem->size - old_elem_size - size -
MALLOC_ELEM_OVERHEAD;
 
-   elem_free_list_remove(elem);
+   malloc_elem_free_list_remove(elem);
 
if (trailer_size > MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
/* split it, too much free space after elem */
@@ -340,7 +340,7 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem)
erase = RTE_PTR_SUB(elem->next, MALLOC_ELEM_TRAILER_LEN);
 
/* remove from free list, join to this one */
-   elem_free_list_remove(elem->next);
+   malloc_elem_free_list_remove(elem->next);
join_elem(elem, elem->next);
 
/* erase header and trailer */
@@ -360,7 +360,7 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem)
erase = RTE_PTR_SUB(elem, MALLOC_ELEM_TRAILER_LEN);
 
/* remove from free list, join to this one */
-   elem_free_list_remove(elem->prev);
+   malloc_elem_free_list_remove(elem->prev);
 
new_elem = elem->prev;
join_elem(new_elem, elem);
@@ -423,7 +423,7 @@ malloc_elem_resize(struct malloc_elem *elem, size_t size)
/* we now know the element fits, so remove from free list,
 * join the two
 */
-   elem_free_list_remove(elem->next);
+   malloc_elem_free_list_remove(elem->next);
join_elem(elem, elem->next);
 
if (elem->size - new_size >= MIN_DATA_SIZE + MALLOC_ELEM_OVERHEAD) {
diff --git a/lib/librte_eal/common/malloc_elem.h 
b/lib/librte_eal/common/malloc_elem.h
index 99921d2..46e2383 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -151,6 +151,9 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem);
 int
 malloc_elem_resize(struct malloc_elem *elem, size_t size);
 
+void
+malloc_elem_free_list_remove(struct malloc_elem *elem);
+
 /*
  * dump contents of malloc elem to a file.
  */
-- 
2.7.4


[dpdk-dev] [PATCH v4 25/70] eal: add function to walk all memsegs

2018-04-08 Thread Anatoly Burakov
For code that might need to iterate over the list of allocated
segments, using this API will make it more resilient to
internal API changes and will prevent copying the same
iteration code over and over again.

Additionally, down the line there will be locking implemented,
so users of this API will not need to care about locking
either.
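
To illustrate, a hypothetical callback (not taken from the patchset)
that tallies allocated segments through the new API:

    #include <stdint.h>
    #include <rte_memory.h>

    struct seg_stats {
        int n_segs;
        uint64_t total_len;
    };

    /* invoked once per allocated segment; returning 0 continues the
     * walk, 1 stops it, -1 stops it and reports an error
     */
    static int
    count_segs(const struct rte_memseg *ms, void *arg)
    {
        struct seg_stats *stats = arg;

        stats->n_segs++;
        stats->total_len += ms->len;
        return 0;
    }

    /* usage: struct seg_stats s = {0, 0};
     *        rte_memseg_walk(count_segs, &s);
     */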

Signed-off-by: Anatoly Burakov 
---
 lib/librte_eal/common/eal_common_memory.c  | 21 +
 lib/librte_eal/common/include/rte_memory.h | 25 +
 lib/librte_eal/rte_eal_version.map |  1 +
 3 files changed, 47 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_memory.c 
b/lib/librte_eal/common/eal_common_memory.c
index 5b8ced4..947db1f 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -218,6 +218,27 @@ rte_mem_lock_page(const void *virt)
return mlock((void *)aligned, page_size);
 }
 
+int __rte_experimental
+rte_memseg_walk(rte_memseg_walk_t func, void *arg)
+{
+   struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+   int i, ret;
+
+   for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+   const struct rte_memseg *ms = &mcfg->memseg[i];
+
+   if (ms->addr == NULL)
+   continue;
+
+   ret = func(ms, arg);
+   if (ret < 0)
+   return -1;
+   if (ret > 0)
+   return 1;
+   }
+   return 0;
+}
+
 /* init memory subsystem */
 int
 rte_eal_memory_init(void)
diff --git a/lib/librte_eal/common/include/rte_memory.h 
b/lib/librte_eal/common/include/rte_memory.h
index 302f865..93eadaa 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -20,6 +20,7 @@ extern "C" {
 #endif
 
 #include 
+#include 
 #include 
 
 __extension__
@@ -130,6 +131,30 @@ phys_addr_t rte_mem_virt2phy(const void *virt);
 rte_iova_t rte_mem_virt2iova(const void *virt);
 
 /**
+ * Memseg walk function prototype.
+ *
+ * Returning 0 will continue walk
+ * Returning 1 will stop the walk
+ * Returning -1 will stop the walk and report error
+ */
+typedef int (*rte_memseg_walk_t)(const struct rte_memseg *ms, void *arg);
+
+/**
+ * Walk list of all memsegs.
+ *
+ * @param func
+ *   Iterator function
+ * @param arg
+ *   Argument passed to iterator
+ * @return
+ *   0 if walked over the entire list
+ *   1 if stopped by the user
+ *   -1 if user function reported error
+ */
+int __rte_experimental
+rte_memseg_walk(rte_memseg_walk_t func, void *arg);
+
+/**
  * Get the layout of the available physical memory.
  *
  * It can be useful for an application to have the full physical
diff --git a/lib/librte_eal/rte_eal_version.map 
b/lib/librte_eal/rte_eal_version.map
index d9fc458..716b965 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -223,6 +223,7 @@ EXPERIMENTAL {
rte_eal_mbuf_user_pool_ops;
rte_log_register_type_and_pick_level;
rte_malloc_dump_heaps;
+   rte_memseg_walk;
rte_mp_action_register;
rte_mp_action_unregister;
rte_mp_reply;
-- 
2.7.4


[dpdk-dev] [PATCH v4 20/70] net/i40e: use contiguous allocation for DMA memory

2018-04-08 Thread Anatoly Burakov
All hardware drivers should allocate IOVA-contiguous
memzones for their hardware resources.

Signed-off-by: Anatoly Burakov 
---

Notes:
v4:
- Use new memzone flag instead of new API
- Remove experimental API from build files

v3:
- Add experimental API to build files

v3:
- Moved patch earlier in the patchset
- Allowed experimental API in the build system

 drivers/net/i40e/i40e_ethdev.c | 4 ++--
 drivers/net/i40e/i40e_rxtx.c   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index d0bf4e3..e00f402 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -4053,8 +4053,8 @@ i40e_allocate_dma_mem_d(__attribute__((unused)) struct 
i40e_hw *hw,
return I40E_ERR_PARAM;
 
snprintf(z_name, sizeof(z_name), "i40e_dma_%"PRIu64, rte_rand());
-   mz = rte_memzone_reserve_bounded(z_name, size, SOCKET_ID_ANY, 0,
-alignment, RTE_PGSIZE_2M);
+   mz = rte_memzone_reserve_bounded(z_name, size, SOCKET_ID_ANY,
+   RTE_MEMZONE_IOVA_CONTIG, alignment, RTE_PGSIZE_2M);
if (!mz)
return I40E_ERR_NO_MEMORY;
 
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 1217e5a..56a854c 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -2189,8 +2189,8 @@ i40e_memzone_reserve(const char *name, uint32_t len, int 
socket_id)
if (mz)
return mz;
 
-   mz = rte_memzone_reserve_aligned(name, len,
-socket_id, 0, I40E_RING_BASE_ALIGN);
+   mz = rte_memzone_reserve_aligned(name, len, socket_id,
+   RTE_MEMZONE_IOVA_CONTIG, I40E_RING_BASE_ALIGN);
return mz;
 }
 
-- 
2.7.4


[dpdk-dev] [PATCH v4 27/70] bus/pci: use memseg walk instead of iteration

2018-04-08 Thread Anatoly Burakov
Reduce dependency on internal details of EAL memory subsystem, and
simplify code.

Signed-off-by: Anatoly Burakov 
---
 drivers/bus/pci/Makefile|  3 +++
 drivers/bus/pci/linux/pci.c | 26 ++
 drivers/bus/pci/meson.build |  3 +++
 3 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/drivers/bus/pci/Makefile b/drivers/bus/pci/Makefile
index f3df1c4..804a198 100644
--- a/drivers/bus/pci/Makefile
+++ b/drivers/bus/pci/Makefile
@@ -49,6 +49,9 @@ CFLAGS += -I$(RTE_SDK)/drivers/bus/pci/$(SYSTEM)
 CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common
 CFLAGS += -I$(RTE_SDK)/lib/librte_eal/$(SYSTEM)app/eal
 
+# memseg walk is not part of stable API yet
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
 LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
 LDLIBS += -lrte_ethdev -lrte_pci
 
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index abde641..6dda054 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -116,22 +116,24 @@ rte_pci_unmap_device(struct rte_pci_device *dev)
}
 }
 
-void *
-pci_find_max_end_va(void)
+static int
+find_max_end_va(const struct rte_memseg *ms, void *arg)
 {
-   const struct rte_memseg *seg = rte_eal_get_physmem_layout();
-   const struct rte_memseg *last = seg;
-   unsigned i = 0;
+   void *end_va = RTE_PTR_ADD(ms->addr, ms->len);
+   void **max_va = arg;
 
-   for (i = 0; i < RTE_MAX_MEMSEG; i++, seg++) {
-   if (seg->addr == NULL)
-   break;
+   if (*max_va < end_va)
+   *max_va = end_va;
+   return 0;
+}
 
-   if (seg->addr > last->addr)
-   last = seg;
+void *
+pci_find_max_end_va(void)
+{
+   void *va = NULL;
 
-   }
-   return RTE_PTR_ADD(last->addr, last->len);
+   rte_memseg_walk(find_max_end_va, &va);
+   return va;
 }
 
 /* parse one line of the "resource" sysfs file (note that the 'line'
diff --git a/drivers/bus/pci/meson.build b/drivers/bus/pci/meson.build
index 12756a4..72939e5 100644
--- a/drivers/bus/pci/meson.build
+++ b/drivers/bus/pci/meson.build
@@ -14,3 +14,6 @@ else
sources += files('bsd/pci.c')
includes += include_directories('bsd')
 endif
+
+# memseg walk is not part of stable API yet
+allow_experimental_apis = true
-- 
2.7.4


[dpdk-dev] [PATCH v4 28/70] net/mlx5: use memseg walk instead of iteration

2018-04-08 Thread Anatoly Burakov
Reduce dependency on internal details of EAL memory subsystem, and
simplify code.

Signed-off-by: Anatoly Burakov 
---
 drivers/net/mlx5/Makefile |  3 +++
 drivers/net/mlx5/mlx5.c   | 24 +++-
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index afda411..25c8e10 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -92,6 +92,9 @@ CFLAGS += -Wno-error=cast-qual
 EXPORT_MAP := rte_pmd_mlx5_version.map
 LIBABIVER := 1
 
+# memseg walk is not part of stable API
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
 # DEBUG which is usually provided on the command-line may enable
 # CONFIG_RTE_LIBRTE_MLX5_DEBUG.
 ifeq ($(DEBUG),1)
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 7d58d66..1724b65 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -477,6 +477,19 @@ static struct rte_pci_driver mlx5_driver;
  */
 static void *uar_base;
 
+static int
+find_lower_va_bound(const struct rte_memseg *ms, void *arg)
+{
+   void **addr = arg;
+
+   if (*addr == NULL)
+   *addr = ms->addr;
+   else
+   *addr = RTE_MIN(*addr, ms->addr);
+
+   return 0;
+}
+
 /**
  * Reserve UAR address space for primary process.
  *
@@ -491,21 +504,14 @@ mlx5_uar_init_primary(struct rte_eth_dev *dev)
 {
struct priv *priv = dev->data->dev_private;
void *addr = (void *)0;
-   int i;
-   const struct rte_mem_config *mcfg;
 
if (uar_base) { /* UAR address space mapped. */
priv->uar_base = uar_base;
return 0;
}
/* find out lower bound of hugepage segments */
-   mcfg = rte_eal_get_configuration()->mem_config;
-   for (i = 0; i < RTE_MAX_MEMSEG && mcfg->memseg[i].addr; i++) {
-   if (addr)
-   addr = RTE_MIN(addr, mcfg->memseg[i].addr);
-   else
-   addr = mcfg->memseg[i].addr;
-   }
+   rte_memseg_walk(find_lower_va_bound, &addr);
+
/* keep distance to hugepages to minimize potential conflicts. */
addr = RTE_PTR_SUB(addr, MLX5_UAR_OFFSET + MLX5_UAR_SIZE);
/* anonymous mmap, no real memory consumption. */
-- 
2.7.4


[dpdk-dev] [PATCH v4 32/70] vfio/type1: use memseg walk instead of iteration

2018-04-08 Thread Anatoly Burakov
Reduce dependency on internal details of EAL memory subsystem, and
simplify code.

Signed-off-by: Anatoly Burakov 
---
 lib/librte_eal/linuxapp/eal/eal_vfio.c | 45 --
 1 file changed, 21 insertions(+), 24 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c 
b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 2421d51..2a34ae9 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -665,39 +665,36 @@ vfio_get_group_no(const char *sysfs_base,
 }
 
 static int
-vfio_type1_dma_map(int vfio_container_fd)
+type1_map(const struct rte_memseg *ms, void *arg)
 {
-   const struct rte_memseg *ms = rte_eal_get_physmem_layout();
-   int i, ret;
+   int *vfio_container_fd = arg;
+   struct vfio_iommu_type1_dma_map dma_map;
+   int ret;
 
-   /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
-   for (i = 0; i < RTE_MAX_MEMSEG; i++) {
-   struct vfio_iommu_type1_dma_map dma_map;
+   memset(&dma_map, 0, sizeof(dma_map));
+   dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+   dma_map.vaddr = ms->addr_64;
+   dma_map.size = ms->len;
+   dma_map.iova = ms->iova;
+   dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
 
-   if (ms[i].addr == NULL)
-   break;
-
-   memset(&dma_map, 0, sizeof(dma_map));
-   dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
-   dma_map.vaddr = ms[i].addr_64;
-   dma_map.size = ms[i].len;
-   dma_map.iova = ms[i].iova;
-   dma_map.flags = VFIO_DMA_MAP_FLAG_READ | 
VFIO_DMA_MAP_FLAG_WRITE;
+   ret = ioctl(*vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
 
-   ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
-
-   if (ret) {
-   RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
- "error %i (%s)\n", errno,
- strerror(errno));
-   return -1;
-   }
+   if (ret) {
+   RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, error %i 
(%s)\n",
+   errno, strerror(errno));
+   return -1;
}
-
return 0;
 }
 
 static int
+vfio_type1_dma_map(int vfio_container_fd)
+{
+   return rte_memseg_walk(type1_map, &vfio_container_fd);
+}
+
+static int
 vfio_spapr_dma_map(int vfio_container_fd)
 {
const struct rte_memseg *ms = rte_eal_get_physmem_layout();
-- 
2.7.4


[dpdk-dev] [PATCH v4 29/70] eal: use memseg walk instead of iteration

2018-04-08 Thread Anatoly Burakov
Reduce dependency on internal details of EAL memory subsystem, and
simplify code.

Signed-off-by: Anatoly Burakov 
---
 lib/librte_eal/bsdapp/eal/eal.c   | 25 +++-
 lib/librte_eal/common/eal_common_memory.c | 67 ---
 lib/librte_eal/common/malloc_heap.c   | 33 +--
 lib/librte_eal/linuxapp/eal/eal.c | 22 +-
 4 files changed, 81 insertions(+), 66 deletions(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 4eafcb5..8e25d78 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -429,23 +429,26 @@ eal_parse_args(int argc, char **argv)
return ret;
 }
 
+static int
+check_socket(const struct rte_memseg *ms, void *arg)
+{
+   int *socket_id = arg;
+
+   if (ms->socket_id == *socket_id)
+   return 1;
+
+   return 0;
+}
+
 static void
 eal_check_mem_on_local_socket(void)
 {
-   const struct rte_memseg *ms;
-   int i, socket_id;
+   int socket_id;
 
socket_id = rte_lcore_to_socket_id(rte_config.master_lcore);
 
-   ms = rte_eal_get_physmem_layout();
-
-   for (i = 0; i < RTE_MAX_MEMSEG; i++)
-   if (ms[i].socket_id == socket_id &&
-   ms[i].len > 0)
-   return;
-
-   RTE_LOG(WARNING, EAL, "WARNING: Master core has no "
-   "memory on local socket!\n");
+   if (rte_memseg_walk(check_socket, &socket_id) == 0)
+   RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on 
local socket!\n");
 }
 
 static int
diff --git a/lib/librte_eal/common/eal_common_memory.c 
b/lib/librte_eal/common/eal_common_memory.c
index 947db1f..4f588c7 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -131,54 +131,57 @@ rte_eal_get_physmem_layout(void)
return rte_eal_get_configuration()->mem_config->memseg;
 }
 
+static int
+physmem_size(const struct rte_memseg *ms, void *arg)
+{
+   uint64_t *total_len = arg;
+
+   *total_len += ms->len;
+
+   return 0;
+}
 
 /* get the total size of memory */
 uint64_t
 rte_eal_get_physmem_size(void)
 {
-   const struct rte_mem_config *mcfg;
-   unsigned i = 0;
uint64_t total_len = 0;
 
-   /* get pointer to global configuration */
-   mcfg = rte_eal_get_configuration()->mem_config;
+   rte_memseg_walk(physmem_size, &total_len);
 
-   for (i = 0; i < RTE_MAX_MEMSEG; i++) {
-   if (mcfg->memseg[i].addr == NULL)
-   break;
+   return total_len;
+}
 
-   total_len += mcfg->memseg[i].len;
-   }
+static int
+dump_memseg(const struct rte_memseg *ms, void *arg)
+{
+   struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+   int i = ms - mcfg->memseg;
+   FILE *f = arg;
 
-   return total_len;
+   if (i < 0 || i >= RTE_MAX_MEMSEG)
+   return -1;
+
+   fprintf(f, "Segment %u: IOVA:0x%"PRIx64", len:%zu, "
+   "virt:%p, socket_id:%"PRId32", "
+   "hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
+   "nrank:%"PRIx32"\n", i,
+   mcfg->memseg[i].iova,
+   mcfg->memseg[i].len,
+   mcfg->memseg[i].addr,
+   mcfg->memseg[i].socket_id,
+   mcfg->memseg[i].hugepage_sz,
+   mcfg->memseg[i].nchannel,
+   mcfg->memseg[i].nrank);
+
+   return 0;
 }
 
 /* Dump the physical memory layout on console */
 void
 rte_dump_physmem_layout(FILE *f)
 {
-   const struct rte_mem_config *mcfg;
-   unsigned i = 0;
-
-   /* get pointer to global configuration */
-   mcfg = rte_eal_get_configuration()->mem_config;
-
-   for (i = 0; i < RTE_MAX_MEMSEG; i++) {
-   if (mcfg->memseg[i].addr == NULL)
-   break;
-
-   fprintf(f, "Segment %u: IOVA:0x%"PRIx64", len:%zu, "
-  "virt:%p, socket_id:%"PRId32", "
-  "hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
-  "nrank:%"PRIx32"\n", i,
-  mcfg->memseg[i].iova,
-  mcfg->memseg[i].len,
-  mcfg->memseg[i].addr,
-  mcfg->memseg[i].socket_id,
-  mcfg->memseg[i].hugepage_sz,
-  mcfg->memseg[i].nchannel,
-  mcfg->memseg[i].nrank);
-   }
+   rte_memseg_walk(dump_memseg, f);
 }
 
 /* return the number of memory channels */
diff --git a/lib/librte_eal/common/malloc_heap.c 
b/lib/librte_eal/common/malloc_heap.c
index 564b61a..79914fc 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -67,17 +67,32 @@ check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)
  * to prevent overflow. The rest of the zone is 

[dpdk-dev] [PATCH v4 21/70] net/qede: use contiguous allocation for DMA memory

2018-04-08 Thread Anatoly Burakov
All hardware drivers should allocate IOVA-contiguous
memzones for their hardware resources.

Signed-off-by: Anatoly Burakov 
Acked-by: Harish Patil 
---

Notes:
v4:
- Use new memzone flag instead of new API
- Remove experimental API from build files

v3:
- Add experimental API to build files

v3:
- Moved the patch earlier in the patchset
- Allowed experimental API in Makefile

 drivers/net/qede/base/bcm_osal.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/qede/base/bcm_osal.c b/drivers/net/qede/base/bcm_osal.c
index 91017b8..f550412 100644
--- a/drivers/net/qede/base/bcm_osal.c
+++ b/drivers/net/qede/base/bcm_osal.c
@@ -135,8 +135,8 @@ void *osal_dma_alloc_coherent(struct ecore_dev *p_dev,
if (core_id == (unsigned int)LCORE_ID_ANY)
core_id = rte_get_master_lcore();
socket_id = rte_lcore_to_socket_id(core_id);
-   mz = rte_memzone_reserve_aligned(mz_name, size,
-socket_id, 0, RTE_CACHE_LINE_SIZE);
+   mz = rte_memzone_reserve_aligned(mz_name, size, socket_id,
+   RTE_MEMZONE_IOVA_CONTIG, RTE_CACHE_LINE_SIZE);
if (!mz) {
DP_ERR(p_dev, "Unable to allocate DMA memory "
   "of size %zu bytes - %s\n",
@@ -174,7 +174,8 @@ void *osal_dma_alloc_coherent_aligned(struct ecore_dev 
*p_dev,
if (core_id == (unsigned int)LCORE_ID_ANY)
core_id = rte_get_master_lcore();
socket_id = rte_lcore_to_socket_id(core_id);
-   mz = rte_memzone_reserve_aligned(mz_name, size, socket_id, 0, align);
+   mz = rte_memzone_reserve_aligned(mz_name, size, socket_id,
+   RTE_MEMZONE_IOVA_CONTIG, align);
if (!mz) {
DP_ERR(p_dev, "Unable to allocate DMA memory "
   "of size %zu bytes - %s\n",
-- 
2.7.4


[dpdk-dev] [PATCH v4 10/70] eal: add backend support for contiguous allocation

2018-04-08 Thread Anatoly Burakov
No major changes, just add some checks in a few key places, and
a new parameter to pass around.

Also, add a function to check a malloc element for physical
contiguity. For now, assume hugepage memory is always
contiguous, while non-hugepage memory will be checked.
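
A minimal sketch of that check (names are illustrative, not the
patch's internal API; it assumes the range starts page-aligned):

    #include <stdbool.h>
    #include <rte_memory.h>

    /* a VA range is IOVA-contiguous if every page's IOVA directly
     * follows the previous page's IOVA
     */
    static bool
    va_range_is_iova_contig(const void *start, size_t len, size_t pg_sz)
    {
        const char *va = start;
        rte_iova_t expected = rte_mem_virt2iova(va);
        size_t off;

        for (off = 0; off < len; off += pg_sz, va += pg_sz) {
            if (rte_mem_virt2iova(va) != expected)
                return false;
            expected += pg_sz;
        }
        return true;
    }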

Signed-off-by: Anatoly Burakov 
---

Notes:
v3:
- Moved this patch earlier
- Added physical contiguousness checking function

 lib/librte_eal/common/eal_common_memzone.c |  23 +++---
 lib/librte_eal/common/malloc_elem.c| 125 -
 lib/librte_eal/common/malloc_elem.h|   6 +-
 lib/librte_eal/common/malloc_heap.c|  11 +--
 lib/librte_eal/common/malloc_heap.h|   4 +-
 lib/librte_eal/common/rte_malloc.c |   7 +-
 6 files changed, 133 insertions(+), 43 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_memzone.c 
b/lib/librte_eal/common/eal_common_memzone.c
index 1ab3ade..16a2e7a 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -98,7 +98,8 @@ find_heap_max_free_elem(int *s, unsigned align)
 
 static const struct rte_memzone *
 memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
-   int socket_id, unsigned flags, unsigned align, unsigned bound)
+   int socket_id, unsigned int flags, unsigned int align,
+   unsigned int bound, bool contig)
 {
struct rte_memzone *mz;
struct rte_mem_config *mcfg;
@@ -188,7 +189,7 @@ memzone_reserve_aligned_thread_unsafe(const char *name, 
size_t len,
 
/* allocate memory on heap */
void *mz_addr = malloc_heap_alloc(&mcfg->malloc_heaps[socket], NULL,
-   requested_len, flags, align, bound);
+   requested_len, flags, align, bound, contig);
 
if ((mz_addr == NULL) && (socket_id == SOCKET_ID_ANY)) {
/* try other heaps */
@@ -197,7 +198,8 @@ memzone_reserve_aligned_thread_unsafe(const char *name, 
size_t len,
continue;
 
mz_addr = malloc_heap_alloc(&mcfg->malloc_heaps[i],
-   NULL, requested_len, flags, align, 
bound);
+   NULL, requested_len, flags, align,
+   bound, contig);
if (mz_addr != NULL)
break;
}
@@ -235,9 +237,9 @@ memzone_reserve_aligned_thread_unsafe(const char *name, 
size_t len,
 }
 
 static const struct rte_memzone *
-rte_memzone_reserve_thread_safe(const char *name, size_t len,
-   int socket_id, unsigned flags, unsigned align,
-   unsigned bound)
+rte_memzone_reserve_thread_safe(const char *name, size_t len, int socket_id,
+   unsigned int flags, unsigned int align, unsigned int bound,
+   bool contig)
 {
struct rte_mem_config *mcfg;
const struct rte_memzone *mz = NULL;
@@ -248,7 +250,7 @@ rte_memzone_reserve_thread_safe(const char *name, size_t 
len,
rte_rwlock_write_lock(&mcfg->mlock);
 
mz = memzone_reserve_aligned_thread_unsafe(
-   name, len, socket_id, flags, align, bound);
+   name, len, socket_id, flags, align, bound, contig);
 
rte_rwlock_write_unlock(&mcfg->mlock);
 
@@ -265,7 +267,7 @@ rte_memzone_reserve_bounded(const char *name, size_t len, 
int socket_id,
unsigned flags, unsigned align, unsigned bound)
 {
return rte_memzone_reserve_thread_safe(name, len, socket_id, flags,
-  align, bound);
+  align, bound, false);
 }
 
 /*
@@ -277,7 +279,7 @@ rte_memzone_reserve_aligned(const char *name, size_t len, 
int socket_id,
unsigned flags, unsigned align)
 {
return rte_memzone_reserve_thread_safe(name, len, socket_id, flags,
-  align, 0);
+  align, 0, false);
 }
 
 /*
@@ -289,7 +291,8 @@ rte_memzone_reserve(const char *name, size_t len, int 
socket_id,
unsigned flags)
 {
return rte_memzone_reserve_thread_safe(name, len, socket_id,
-  flags, RTE_CACHE_LINE_SIZE, 0);
+  flags, RTE_CACHE_LINE_SIZE, 0,
+  false);
 }
 
 int
diff --git a/lib/librte_eal/common/malloc_elem.c 
b/lib/librte_eal/common/malloc_elem.c
index c18f050..87695b9 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include 
@@ -94,33 +95,112 @@ malloc_elem_insert(struct malloc_elem *elem)
 }
 
 /*
+ * Attempt to find enough physically contiguous memory in this block

[dpdk-dev] [PATCH v4 24/70] mempool: add support for the new allocation methods

2018-04-08 Thread Anatoly Burakov
If a user has specified that the zone should have contiguous memory,
use the new _contig allocation APIs instead of normal ones.
Otherwise, account for the fact that unless we're in IOVA_AS_VA
mode, we cannot guarantee that the pages would be physically
contiguous, so we calculate the memzone size and alignments as if
we were getting the smallest page size available.

However, for the non-IOVA contiguous case, the existing mempool size
calculation function doesn't give us expected results, because it
will return memzone sizes aligned to page size (e.g. a 1MB mempool
may use an entire 1GB page). Therefore, in cases where we weren't
specifically asked to reserve non-contiguous memory, first try
reserving a memzone as IOVA-contiguous, and if that fails, then
try reserving with page-aligned size/alignment.
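
In code, that fallback looks roughly like the following (a simplified
sketch of the logic inside rte_mempool_populate_default(); the locals
mz_name, mem_size, n, total_elt_sz, pg_shift and pg_sz stand in for
the real function's variables):

    const struct rte_memzone *mz;

    /* first attempt: one IOVA-contiguous block, no per-page padding */
    mz = rte_memzone_reserve_aligned(mz_name, mem_size, mp->socket_id,
            RTE_MEMZONE_IOVA_CONTIG, RTE_CACHE_LINE_SIZE);
    if (mz == NULL) {
        /* fallback: recompute the size so objects do not cross page
         * boundaries, then reserve without the contiguity flag
         */
        mem_size = rte_mempool_xmem_size(n, total_elt_sz,
                pg_shift, mp->flags);
        mz = rte_memzone_reserve_aligned(mz_name, mem_size,
                mp->socket_id, 0, pg_sz);
    }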

Signed-off-by: Anatoly Burakov 
---

Notes:
v3:
- Fixed mempool size calculation
- Fixed handling of contiguous memzones
- Moved earlier in the patchset

 lib/librte_mempool/rte_mempool.c | 149 +--
 1 file changed, 127 insertions(+), 22 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 54f7f4b..85fbdca 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -3,6 +3,7 @@
  * Copyright(c) 2016 6WIND S.A.
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -98,6 +99,27 @@ static unsigned optimize_object_size(unsigned obj_size)
return new_obj_size * RTE_MEMPOOL_ALIGN;
 }
 
+static size_t
+get_min_page_size(void)
+{
+   const struct rte_mem_config *mcfg =
+   rte_eal_get_configuration()->mem_config;
+   int i;
+   size_t min_pagesz = SIZE_MAX;
+
+   for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+   const struct rte_memseg *ms = &mcfg->memseg[i];
+
+   if (ms->addr == NULL)
+   continue;
+
+   if (ms->hugepage_sz < min_pagesz)
+   min_pagesz = ms->hugepage_sz;
+   }
+
+   return min_pagesz == SIZE_MAX ? (size_t) getpagesize() : min_pagesz;
+}
+
 static void
 mempool_add_elem(struct rte_mempool *mp, void *obj, rte_iova_t iova)
 {
@@ -204,7 +226,6 @@ rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags,
return sz->total_size;
 }
 
-
 /*
  * Calculate maximum amount of memory required to store given number of 
objects.
  */
@@ -367,16 +388,6 @@ rte_mempool_populate_iova(struct rte_mempool *mp, char 
*vaddr,
/* update mempool capabilities */
mp->flags |= mp_capa_flags;
 
-   /* Detect pool area has sufficient space for elements */
-   if (mp_capa_flags & MEMPOOL_F_CAPA_PHYS_CONTIG) {
-   if (len < total_elt_sz * mp->size) {
-   RTE_LOG(ERR, MEMPOOL,
-   "pool area %" PRIx64 " not enough\n",
-   (uint64_t)len);
-   return -ENOSPC;
-   }
-   }
-
memhdr = rte_zmalloc("MEMPOOL_MEMHDR", sizeof(*memhdr), 0);
if (memhdr == NULL)
return -ENOMEM;
@@ -549,6 +560,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
unsigned mz_id, n;
unsigned int mp_flags;
int ret;
+   bool force_contig, no_contig, try_contig, no_pageshift;
 
/* mempool must not be populated */
if (mp->nb_mem_chunks != 0)
@@ -563,9 +575,68 @@ rte_mempool_populate_default(struct rte_mempool *mp)
/* update mempool capabilities */
mp->flags |= mp_flags;
 
-   if (rte_eal_has_hugepages()) {
-   pg_shift = 0; /* not needed, zone is physically contiguous */
+   no_contig = mp->flags & MEMPOOL_F_NO_PHYS_CONTIG;
+   force_contig = mp->flags & MEMPOOL_F_CAPA_PHYS_CONTIG;
+
+   /*
+* the following section calculates page shift and page size values.
+*
+* these values impact the result of rte_mempool_xmem_size(), which
+* returns the amount of memory that should be allocated to store the
+* desired number of objects. when not zero, it allocates more memory
+* for the padding between objects, to ensure that an object does not
+* cross a page boundary. in other words, page size/shift are to be set
+* to zero if mempool elements won't care about page boundaries.
+* there are several considerations for page size and page shift here.
+*
+* if we don't need our mempools to have physically contiguous objects,
+* then just set page shift and page size to 0, because the user has
+* indicated that there's no need to care about anything.
+*
+* if we do need contiguous objects, there is also an option to reserve
+* the entire mempool memory as one contiguous block of memory, in
+* which case the page shift and alignment wouldn't matter as well.
+*
+* if we require contiguous objects, but not necessarily

[dpdk-dev] [PATCH v4 23/70] net/vmxnet3: use contiguous allocation for DMA memory

2018-04-08 Thread Anatoly Burakov
All hardware drivers should allocate IOVA-contiguous
memzones for their hardware resources.

Signed-off-by: Anatoly Burakov 
---

Notes:
v4:
- Use new memzone flag instead of new API
- Remove experimental API from build files

v3:
- Add experimental API to build files

v3:
- Moved patch earlier in the patchset
- Allowed experimental API in Makefile

 drivers/net/vmxnet3/vmxnet3_ethdev.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_ethdev.c 
b/drivers/net/vmxnet3/vmxnet3_ethdev.c
index 4260087..104664a 100644
--- a/drivers/net/vmxnet3/vmxnet3_ethdev.c
+++ b/drivers/net/vmxnet3/vmxnet3_ethdev.c
@@ -150,13 +150,14 @@ gpa_zone_reserve(struct rte_eth_dev *dev, uint32_t size,
if (mz)
rte_memzone_free(mz);
return rte_memzone_reserve_aligned(z_name, size, socket_id,
-  0, align);
+   RTE_MEMZONE_IOVA_CONTIG, align);
}
 
if (mz)
return mz;
 
-   return rte_memzone_reserve_aligned(z_name, size, socket_id, 0, align);
+   return rte_memzone_reserve_aligned(z_name, size, socket_id,
+   RTE_MEMZONE_IOVA_CONTIG, align);
 }
 
 /*
-- 
2.7.4


[dpdk-dev] [PATCH v4 31/70] test: use memseg walk instead of iteration

2018-04-08 Thread Anatoly Burakov
Reduce dependency on internal details of EAL memory subsystem, and
simplify code.

Signed-off-by: Anatoly Burakov 
---
 test/test/test_malloc.c  | 40 +++-
 test/test/test_memory.c  | 23 +++--
 test/test/test_memzone.c | 53 
 3 files changed, 74 insertions(+), 42 deletions(-)

diff --git a/test/test/test_malloc.c b/test/test/test_malloc.c
index ccc5fea..28c241f 100644
--- a/test/test/test_malloc.c
+++ b/test/test/test_malloc.c
@@ -705,16 +705,34 @@ test_malloc_bad_params(void)
return -1;
 }
 
+static int
+check_socket_mem(const struct rte_memseg *ms, void *arg)
+{
+   int32_t *socket = arg;
+
+   return *socket == ms->socket_id;
+}
+
 /* Check if memory is available on a specific socket */
 static int
 is_mem_on_socket(int32_t socket)
 {
-   const struct rte_memseg *ms = rte_eal_get_physmem_layout();
-   unsigned i;
+   return rte_memseg_walk(check_socket_mem, &socket);
+}
 
-   for (i = 0; i < RTE_MAX_MEMSEG; i++) {
-   if (socket == ms[i].socket_id)
-   return 1;
+struct walk_param {
+   void *addr;
+   int32_t socket;
+};
+static int
+find_socket(const struct rte_memseg *ms, void *arg)
+{
+   struct walk_param *param = arg;
+
+   if (param->addr >= ms->addr &&
+   param->addr < RTE_PTR_ADD(ms->addr, ms->len)) {
+   param->socket = ms->socket_id;
+   return 1;
}
return 0;
 }
@@ -726,15 +744,9 @@ is_mem_on_socket(int32_t socket)
 static int32_t
 addr_to_socket(void * addr)
 {
-   const struct rte_memseg *ms = rte_eal_get_physmem_layout();
-   unsigned i;
-
-   for (i = 0; i < RTE_MAX_MEMSEG; i++) {
-   if ((ms[i].addr <= addr) &&
-   ((uintptr_t)addr <
-   ((uintptr_t)ms[i].addr + (uintptr_t)ms[i].len)))
-   return ms[i].socket_id;
-   }
+   struct walk_param param = {.addr = addr, .socket = 0};
+   if (rte_memseg_walk(find_socket, &param) > 0)
+   return param.socket;
return -1;
 }
 
diff --git a/test/test/test_memory.c b/test/test/test_memory.c
index 972321f..c9b287c 100644
--- a/test/test/test_memory.c
+++ b/test/test/test_memory.c
@@ -23,12 +23,20 @@
  */
 
 static int
+check_mem(const struct rte_memseg *ms, void *arg __rte_unused)
+{
+   volatile uint8_t *mem = (volatile uint8_t *) ms->addr;
+   size_t i;
+
+   for (i = 0; i < ms->len; i++, mem++)
+   *mem;
+   return 0;
+}
+
+static int
 test_memory(void)
 {
uint64_t s;
-   unsigned i;
-   size_t j;
-   const struct rte_memseg *mem;
 
/*
 * dump the mapped memory: the python-expect script checks
@@ -45,14 +53,7 @@ test_memory(void)
}
 
/* try to read memory (should not segfault) */
-   mem = rte_eal_get_physmem_layout();
-   for (i = 0; i < RTE_MAX_MEMSEG && mem[i].addr != NULL ; i++) {
-
-   /* check memory */
-   for (j = 0; j < mem[i].len; j++)
-   *((volatile uint8_t *) mem[i].addr + j);
-   }
+   rte_memseg_walk(check_mem, NULL);

 return 0;
 }
diff --git a/test/test/test_memzone.c b/test/test/test_memzone.c
--- a/test/test/test_memzone.c
+++ b/test/test/test_memzone.c
+struct walk_arg {
+   int hugepage_2MB_avail;
+   int hugepage_1GB_avail;
+   int hugepage_16MB_avail;
+   int hugepage_16GB_avail;
+};
+static int
+find_available_pagesz(const struct rte_memseg *ms, void *arg)
+{
+   struct walk_arg *wa = arg;
+
+   if (ms->hugepage_sz == RTE_PGSIZE_2M)
+   wa->hugepage_2MB_avail = 1;
+   if (ms->hugepage_sz == RTE_PGSIZE_1G)
+   wa->hugepage_1GB_avail = 1;
+   if (ms->hugepage_sz == RTE_PGSIZE_16M)
+   wa->hugepage_16MB_avail = 1;
+   if (ms->hugepage_sz == RTE_PGSIZE_16G)
+   wa->hugepage_16GB_avail = 1;
+
+   return 0;
+}
+
 static int
 test_memzone_reserve_flags(void)
 {
const struct rte_memzone *mz;
-   const struct rte_memseg *ms;
-   int hugepage_2MB_avail = 0;
-   int hugepage_1GB_avail = 0;
-   int hugepage_16MB_avail = 0;
-   int hugepage_16GB_avail = 0;
+   struct walk_arg wa;
+   int hugepage_2MB_avail, hugepage_1GB_avail;
+   int hugepage_16MB_avail, hugepage_16GB_avail;
const size_t size = 100;
-   int i = 0;
-   ms = rte_eal_get_physmem_layout();
-   for (i = 0; i < RTE_MAX_MEMSEG; i++) {
-   if (ms[i].hugepage_sz == RTE_PGSIZE_2M)
-   hugepage_2MB_avail = 1;
-   if (ms[i].hugepage_sz == RTE_PGSIZE_1G)
-   hugepage_1GB_avail = 1;
-   if (ms[i].hugepage_sz == RTE_PGSIZE_16M)
-   hugepage_16MB_avail = 1;
-   if (ms[i].hugepage_sz == RTE_PGSIZE_16G)
-   hugepage_16GB_avail = 1;
-   }
+
+   memset(&wa, 0, sizeof(wa));
+
+   rte_memseg_walk(find_available_pagesz, &wa);
+
+   hugepage_2MB_avail = wa.hugepage_2MB_avail;
+   hugepage_1GB_avail = wa.hugepage_1GB_avail;
+   hugepage_16MB_avail = wa.hugepage_16MB_avail;
+   hugepage_16GB_avail = wa.hugepage_16GB_avail;
+
/* Display the availability of 2MB ,1GB, 16MB, 16GB pages */
if (hugepage_2MB_avail)
printf("2MB Huge pages available\n");
-- 
2.7.4


[dpdk-dev] [PATCH v4 16/70] net/bnxt: use contiguous allocation for DMA memory

2018-04-08 Thread Anatoly Burakov
All hardware drivers should allocate IOVA-contiguous
memzones for their hardware resources.

Signed-off-by: Anatoly Burakov 
---

Notes:
v4:
- Use new memzone flag instead of new API
- Remove experimental API from build files

v3:
- Added this driver to the list of modified drivers
- Add experimental API to build files

v3:
- Added this patch

All memzone reserve calls then check physical addresses,
so this looks like they're reserving DMA memory.
Corrections welcome.

 drivers/net/bnxt/bnxt_ethdev.c | 17 ++---
 drivers/net/bnxt/bnxt_ring.c   |  9 +
 drivers/net/bnxt/bnxt_vnic.c   |  8 
 3 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/net/bnxt/bnxt_ethdev.c b/drivers/net/bnxt/bnxt_ethdev.c
index 0b21653..ad7d925 100644
--- a/drivers/net/bnxt/bnxt_ethdev.c
+++ b/drivers/net/bnxt/bnxt_ethdev.c
@@ -3147,9 +3147,10 @@ bnxt_dev_init(struct rte_eth_dev *eth_dev)
sizeof(struct rx_port_stats) + 512);
if (!mz) {
mz = rte_memzone_reserve(mz_name, total_alloc_len,
-SOCKET_ID_ANY,
-RTE_MEMZONE_2MB |
-RTE_MEMZONE_SIZE_HINT_ONLY);
+   SOCKET_ID_ANY,
+   RTE_MEMZONE_2MB |
+   RTE_MEMZONE_SIZE_HINT_ONLY |
+   RTE_MEMZONE_IOVA_CONTIG);
if (mz == NULL)
return -ENOMEM;
}
@@ -3181,10 +3182,12 @@ bnxt_dev_init(struct rte_eth_dev *eth_dev)
total_alloc_len = RTE_CACHE_LINE_ROUNDUP(
sizeof(struct tx_port_stats) + 512);
if (!mz) {
-   mz = rte_memzone_reserve(mz_name, total_alloc_len,
-SOCKET_ID_ANY,
-RTE_MEMZONE_2MB |
-RTE_MEMZONE_SIZE_HINT_ONLY);
+   mz = rte_memzone_reserve(mz_name,
+   total_alloc_len,
+   SOCKET_ID_ANY,
+   RTE_MEMZONE_2MB |
+   RTE_MEMZONE_SIZE_HINT_ONLY |
+   RTE_MEMZONE_IOVA_CONTIG);
if (mz == NULL)
return -ENOMEM;
}
diff --git a/drivers/net/bnxt/bnxt_ring.c b/drivers/net/bnxt/bnxt_ring.c
index 8fb8972..0e8a6a2 100644
--- a/drivers/net/bnxt/bnxt_ring.c
+++ b/drivers/net/bnxt/bnxt_ring.c
@@ -166,10 +166,11 @@ int bnxt_alloc_rings(struct bnxt *bp, uint16_t qidx,
mz = rte_memzone_lookup(mz_name);
if (!mz) {
mz = rte_memzone_reserve_aligned(mz_name, total_alloc_len,
-SOCKET_ID_ANY,
-RTE_MEMZONE_2MB |
-RTE_MEMZONE_SIZE_HINT_ONLY,
-getpagesize());
+   SOCKET_ID_ANY,
+   RTE_MEMZONE_2MB |
+   RTE_MEMZONE_SIZE_HINT_ONLY |
+   RTE_MEMZONE_IOVA_CONTIG,
+   getpagesize());
if (mz == NULL)
return -ENOMEM;
}
diff --git a/drivers/net/bnxt/bnxt_vnic.c b/drivers/net/bnxt/bnxt_vnic.c
index d4aeb4c..9ccc67e 100644
--- a/drivers/net/bnxt/bnxt_vnic.c
+++ b/drivers/net/bnxt/bnxt_vnic.c
@@ -185,10 +185,10 @@ int bnxt_alloc_vnic_attributes(struct bnxt *bp)
mz = rte_memzone_lookup(mz_name);
if (!mz) {
mz = rte_memzone_reserve(mz_name,
-entry_length * max_vnics,
-SOCKET_ID_ANY,
-RTE_MEMZONE_2MB |
-RTE_MEMZONE_SIZE_HINT_ONLY);
+   entry_length * max_vnics, SOCKET_ID_ANY,
+   RTE_MEMZONE_2MB |
+   RTE_MEMZONE_SIZE_HINT_ONLY |
+   RTE_MEMZONE_IOVA_CONTIG);
if (!mz)
return -ENOMEM;
}
-- 
2.7.4


[dpdk-dev] [PATCH v4 39/70] crypto/dpaa_sec: use iova2virt instead of memseg iteration

2018-04-08 Thread Anatoly Burakov
Reduce dependency on internal details of EAL memory subsystem, and
simplify code.

Signed-off-by: Anatoly Burakov 
---
 drivers/crypto/dpaa_sec/dpaa_sec.c | 11 +--
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/drivers/crypto/dpaa_sec/dpaa_sec.c 
b/drivers/crypto/dpaa_sec/dpaa_sec.c
index c5191ce..b04510f 100644
--- a/drivers/crypto/dpaa_sec/dpaa_sec.c
+++ b/drivers/crypto/dpaa_sec/dpaa_sec.c
@@ -120,16 +120,7 @@ dpaa_mem_vtop_ctx(struct dpaa_sec_op_ctx *ctx, void *vaddr)
 static inline void *
 dpaa_mem_ptov(rte_iova_t paddr)
 {
-   const struct rte_memseg *memseg = rte_eal_get_physmem_layout();
-   int i;
-
-   for (i = 0; i < RTE_MAX_MEMSEG && memseg[i].addr_64 != 0; i++) {
-   if (paddr >= memseg[i].iova &&
-   paddr < memseg[i].iova + memseg[i].len)
-   return (void *)(size_t)(memseg[i].addr_64 +
-   (paddr - memseg[i].iova));
-   }
-   return NULL;
+   return rte_mem_iova2virt(paddr);
 }
 
 static void
-- 
2.7.4


[dpdk-dev] [PATCH v4 30/70] mempool: use memseg walk instead of iteration

2018-04-08 Thread Anatoly Burakov
Reduce dependency on internal details of EAL memory subsystem, and
simplify code.

Signed-off-by: Anatoly Burakov 
---
 lib/librte_mempool/Makefile  |  3 +++
 lib/librte_mempool/meson.build   |  3 +++
 lib/librte_mempool/rte_mempool.c | 24 
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/lib/librte_mempool/Makefile b/lib/librte_mempool/Makefile
index 24e735a..1f85d34 100644
--- a/lib/librte_mempool/Makefile
+++ b/lib/librte_mempool/Makefile
@@ -13,6 +13,9 @@ EXPORT_MAP := rte_mempool_version.map
 
 LIBABIVER := 3
 
+# memseg walk is not yet part of stable API
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
 # all source are stored in SRCS-y
 SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) +=  rte_mempool.c
 SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) +=  rte_mempool_ops.c
diff --git a/lib/librte_mempool/meson.build b/lib/librte_mempool/meson.build
index 712720f..89506c5 100644
--- a/lib/librte_mempool/meson.build
+++ b/lib/librte_mempool/meson.build
@@ -5,3 +5,6 @@ version = 3
 sources = files('rte_mempool.c', 'rte_mempool_ops.c')
 headers = files('rte_mempool.h')
 deps += ['ring']
+
+# memseg walk is not yet part of stable API
+allow_experimental_apis = true
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 85fbdca..ea62b6b 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -99,23 +99,23 @@ static unsigned optimize_object_size(unsigned obj_size)
return new_obj_size * RTE_MEMPOOL_ALIGN;
 }
 
+static int
+find_min_pagesz(const struct rte_memseg *ms, void *arg)
+{
+   size_t *min = arg;
+
+   if (ms->hugepage_sz < *min)
+   *min = ms->hugepage_sz;
+
+   return 0;
+}
+
 static size_t
 get_min_page_size(void)
 {
-   const struct rte_mem_config *mcfg =
-   rte_eal_get_configuration()->mem_config;
-   int i;
size_t min_pagesz = SIZE_MAX;
 
-   for (i = 0; i < RTE_MAX_MEMSEG; i++) {
-   const struct rte_memseg *ms = &mcfg->memseg[i];
-
-   if (ms->addr == NULL)
-   continue;
-
-   if (ms->hugepage_sz < min_pagesz)
-   min_pagesz = ms->hugepage_sz;
-   }
+   rte_memseg_walk(find_min_pagesz, &min_pagesz);
 
return min_pagesz == SIZE_MAX ? (size_t) getpagesize() : min_pagesz;
 }
-- 
2.7.4


[dpdk-dev] [PATCH v4 14/70] net/avf: use contiguous allocation for DMA memory

2018-04-08 Thread Anatoly Burakov
All hardware drivers should allocate IOVA-contiguous
memzones for their hardware resources.

Signed-off-by: Anatoly Burakov 
---

Notes:
v4:
- Use new memzone flag instead of new API
- Remove experimental API from build files

v3:
- Add experimental API to build files

v3:
- Moved patch earlier in the patchset
- Allowed experimental APIs in the makefile

 drivers/net/avf/avf_ethdev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/avf/avf_ethdev.c b/drivers/net/avf/avf_ethdev.c
index 4442c3c..68a59b4 100644
--- a/drivers/net/avf/avf_ethdev.c
+++ b/drivers/net/avf/avf_ethdev.c
@@ -1365,8 +1365,8 @@ avf_allocate_dma_mem_d(__rte_unused struct avf_hw *hw,
return AVF_ERR_PARAM;
 
snprintf(z_name, sizeof(z_name), "avf_dma_%"PRIu64, rte_rand());
-   mz = rte_memzone_reserve_bounded(z_name, size, SOCKET_ID_ANY, 0,
-alignment, RTE_PGSIZE_2M);
+   mz = rte_memzone_reserve_bounded(z_name, size, SOCKET_ID_ANY,
+   RTE_MEMZONE_IOVA_CONTIG, alignment, RTE_PGSIZE_2M);
if (!mz)
return AVF_ERR_NO_MEMORY;
 
-- 
2.7.4


[dpdk-dev] [PATCH v4 33/70] vfio/spapr: use memseg walk instead of iteration

2018-04-08 Thread Anatoly Burakov
Reduce dependency on internal details of EAL memory subsystem, and
simplify code.

Signed-off-by: Anatoly Burakov 
---
 lib/librte_eal/linuxapp/eal/eal_vfio.c | 108 +++--
 1 file changed, 63 insertions(+), 45 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 2a34ae9..fb41e82 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -694,16 +694,69 @@ vfio_type1_dma_map(int vfio_container_fd)
return rte_memseg_walk(type1_map, &vfio_container_fd);
 }
 
+struct spapr_walk_param {
+   uint64_t window_size;
+   uint64_t hugepage_sz;
+};
 static int
-vfio_spapr_dma_map(int vfio_container_fd)
+spapr_window_size(const struct rte_memseg *ms, void *arg)
 {
-   const struct rte_memseg *ms = rte_eal_get_physmem_layout();
-   int i, ret;
+   struct spapr_walk_param *param = arg;
+   uint64_t max = ms->iova + ms->len;
+
+   if (max > param->window_size) {
+   param->hugepage_sz = ms->hugepage_sz;
+   param->window_size = max;
+   }
 
+   return 0;
+}
+
+static int
+spapr_map(const struct rte_memseg *ms, void *arg)
+{
+   struct vfio_iommu_type1_dma_map dma_map;
struct vfio_iommu_spapr_register_memory reg = {
.argsz = sizeof(reg),
.flags = 0
};
+   int *vfio_container_fd = arg;
+   int ret;
+
+   reg.vaddr = (uintptr_t) ms->addr;
+   reg.size = ms->len;
+   ret = ioctl(*vfio_container_fd,
+   VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
+   if (ret) {
+   RTE_LOG(ERR, EAL, "  cannot register vaddr for IOMMU, error %i (%s)\n",
+   errno, strerror(errno));
+   return -1;
+   }
+
+   memset(&dma_map, 0, sizeof(dma_map));
+   dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+   dma_map.vaddr = ms->addr_64;
+   dma_map.size = ms->len;
+   dma_map.iova = ms->iova;
+   dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+VFIO_DMA_MAP_FLAG_WRITE;
+
+   ret = ioctl(*vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+
+   if (ret) {
+   RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, error %i (%s)\n",
+   errno, strerror(errno));
+   return -1;
+   }
+
+   return 0;
+}
+
+static int
+vfio_spapr_dma_map(int vfio_container_fd)
+{
+   struct spapr_walk_param param;
+   int ret;
struct vfio_iommu_spapr_tce_info info = {
.argsz = sizeof(info),
};
@@ -714,6 +767,8 @@ vfio_spapr_dma_map(int vfio_container_fd)
.argsz = sizeof(remove),
};
 
+   memset(&param, 0, sizeof(param));
+
/* query spapr iommu info */
ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
if (ret) {
@@ -732,17 +787,11 @@ vfio_spapr_dma_map(int vfio_container_fd)
}
 
/* create DMA window from 0 to max(phys_addr + len) */
-   for (i = 0; i < RTE_MAX_MEMSEG; i++) {
-   if (ms[i].addr == NULL)
-   break;
-
-   create.window_size = RTE_MAX(create.window_size,
-   ms[i].iova + ms[i].len);
-   }
+   rte_memseg_walk(spapr_window_size, &param);
 
/* sPAPR requires window size to be a power of 2 */
-   create.window_size = rte_align64pow2(create.window_size);
-   create.page_shift = __builtin_ctzll(ms->hugepage_sz);
+   create.window_size = rte_align64pow2(param.window_size);
+   create.page_shift = __builtin_ctzll(param.hugepage_sz);
create.levels = 1;
 
ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
@@ -758,39 +807,8 @@ vfio_spapr_dma_map(int vfio_container_fd)
}
 
/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
-   for (i = 0; i < RTE_MAX_MEMSEG; i++) {
-   struct vfio_iommu_type1_dma_map dma_map;
-
-   if (ms[i].addr == NULL)
-   break;
-
-   reg.vaddr = (uintptr_t) ms[i].addr;
-   reg.size = ms[i].len;
-   ret = ioctl(vfio_container_fd,
-   VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
-   if (ret) {
-   RTE_LOG(ERR, EAL, "  cannot register vaddr for IOMMU, "
-   "error %i (%s)\n", errno, strerror(errno));
-   return -1;
-   }
-
-   memset(&dma_map, 0, sizeof(dma_map));
-   dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
-   dma_map.vaddr = ms[i].addr_64;
-   dma_map.size = ms[i].len;
-   dma_map.iova = ms[i].iova;
-   dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
-VFIO_DMA_MAP_FLAG_WRITE;
-
-   ret = ioctl(vfio_container_fd, VFIO_IO

[dpdk-dev] [PATCH v4 43/70] net/mlx4: use virt2memseg instead of iteration

2018-04-08 Thread Anatoly Burakov
Reduce dependency on internal details of EAL memory subsystem, and
simplify code.

Signed-off-by: Anatoly Burakov 
---
 drivers/net/mlx4/mlx4_mr.c | 17 +++--
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_mr.c b/drivers/net/mlx4/mlx4_mr.c
index 9a1e4de..47dd542 100644
--- a/drivers/net/mlx4/mlx4_mr.c
+++ b/drivers/net/mlx4/mlx4_mr.c
@@ -126,7 +126,7 @@ mlx4_check_mempool(struct rte_mempool *mp, uintptr_t *start, uintptr_t *end)
 struct mlx4_mr *
 mlx4_mr_get(struct priv *priv, struct rte_mempool *mp)
 {
-   const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+   const struct rte_memseg *ms;
uintptr_t start;
uintptr_t end;
unsigned int i;
@@ -142,16 +142,13 @@ mlx4_mr_get(struct priv *priv, struct rte_mempool *mp)
  (void *)mp, (void *)start, (void *)end,
  (size_t)(end - start));
/* Round start and end to page boundary if found in memory segments. */
-   for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
-   uintptr_t addr = (uintptr_t)ms[i].addr;
-   size_t len = ms[i].len;
-   unsigned int align = ms[i].hugepage_sz;
+   ms = rte_mem_virt2memseg((void *)start);
+   if (ms != NULL)
+   start = RTE_ALIGN_FLOOR(start, ms->hugepage_sz);
+   ms = rte_mem_virt2memseg((void *)end);
+   if (ms != NULL)
+   end = RTE_ALIGN_CEIL(end, ms->hugepage_sz);
 
-   if ((start > addr) && (start < addr + len))
-   start = RTE_ALIGN_FLOOR(start, align);
-   if ((end > addr) && (end < addr + len))
-   end = RTE_ALIGN_CEIL(end, align);
-   }
DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
  (void *)mp, (void *)start, (void *)end,
  (size_t)(end - start));
-- 
2.7.4


[dpdk-dev] [PATCH v4 41/70] bus/fslmc: use virt2memseg instead of iteration

2018-04-08 Thread Anatoly Burakov
Reduce dependency on internal details of EAL memory subsystem, and
simplify code.

Signed-off-by: Anatoly Burakov 
---
 drivers/bus/fslmc/portal/dpaa2_hw_pvt.h | 14 --
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h b/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
index d38fc49..45fd41e 100644
--- a/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
+++ b/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
@@ -270,20 +270,14 @@ static phys_addr_t dpaa2_mem_vtop(uint64_t vaddr) __attribute__((unused));
 static phys_addr_t dpaa2_mem_vtop(uint64_t vaddr)
 {
const struct rte_memseg *memseg;
-   int i;
 
if (dpaa2_virt_mode)
return vaddr;
 
-   memseg = rte_eal_get_physmem_layout();
-
-   for (i = 0; i < RTE_MAX_MEMSEG && memseg[i].addr_64 != 0; i++) {
-   if (vaddr >= memseg[i].addr_64 &&
-   vaddr < memseg[i].addr_64 + memseg[i].len)
-   return memseg[i].iova
-   + (vaddr - memseg[i].addr_64);
-   }
-   return (size_t)(NULL);
+   memseg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr);
+   if (memseg)
+   return memseg->phys_addr + RTE_PTR_DIFF(vaddr, memseg->addr);
+   return (size_t)NULL;
 }
 
 /**
-- 
2.7.4


[dpdk-dev] [PATCH v4 22/70] net/virtio: use contiguous allocation for DMA memory

2018-04-08 Thread Anatoly Burakov
All hardware drivers should allocate IOVA-contiguous
memzones for their hardware resources.

Signed-off-by: Anatoly Burakov 
Reviewed-by: Venkatesh Srinivas 
Reviewed-by: Maxime Coquelin 
---

Notes:
v4:
- Use new memzone flag instead of new API

v3:
- Moved patch earlier in the patchset

 drivers/net/virtio/virtio_ethdev.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
index 2ef213d..f03d790 100644
--- a/drivers/net/virtio/virtio_ethdev.c
+++ b/drivers/net/virtio/virtio_ethdev.c
@@ -391,8 +391,8 @@ virtio_init_queue(struct rte_eth_dev *dev, uint16_t vtpci_queue_idx)
 size, vq->vq_ring_size);
 
mz = rte_memzone_reserve_aligned(vq_name, vq->vq_ring_size,
-SOCKET_ID_ANY,
-0, VIRTIO_PCI_VRING_ALIGN);
+   SOCKET_ID_ANY, RTE_MEMZONE_IOVA_CONTIG,
+   VIRTIO_PCI_VRING_ALIGN);
if (mz == NULL) {
if (rte_errno == EEXIST)
mz = rte_memzone_lookup(vq_name);
@@ -417,8 +417,8 @@ virtio_init_queue(struct rte_eth_dev *dev, uint16_t vtpci_queue_idx)
snprintf(vq_hdr_name, sizeof(vq_hdr_name), "port%d_vq%d_hdr",
 dev->data->port_id, vtpci_queue_idx);
hdr_mz = rte_memzone_reserve_aligned(vq_hdr_name, sz_hdr_mz,
-SOCKET_ID_ANY, 0,
-RTE_CACHE_LINE_SIZE);
+   SOCKET_ID_ANY, RTE_MEMZONE_IOVA_CONTIG,
+   RTE_CACHE_LINE_SIZE);
if (hdr_mz == NULL) {
if (rte_errno == EEXIST)
hdr_mz = rte_memzone_lookup(vq_hdr_name);
-- 
2.7.4


[dpdk-dev] [PATCH v4 26/70] bus/fslmc: use memseg walk instead of iteration

2018-04-08 Thread Anatoly Burakov
Reduce dependency on internal details of EAL memory subsystem, and
simplify code.

Signed-off-by: Anatoly Burakov 
Acked-by: Shreyansh Jain 
---
 drivers/bus/fslmc/fslmc_vfio.c | 78 ++
 1 file changed, 41 insertions(+), 37 deletions(-)

diff --git a/drivers/bus/fslmc/fslmc_vfio.c b/drivers/bus/fslmc/fslmc_vfio.c
index 4291871..0c048dc 100644
--- a/drivers/bus/fslmc/fslmc_vfio.c
+++ b/drivers/bus/fslmc/fslmc_vfio.c
@@ -189,17 +189,51 @@ static int vfio_map_irq_region(struct fslmc_vfio_group *group)
return -errno;
 }
 
-int rte_fslmc_vfio_dmamap(void)
+static int
+fslmc_vfio_map(const struct rte_memseg *ms, void *arg)
 {
-   int ret;
+   int *n_segs = arg;
struct fslmc_vfio_group *group;
struct vfio_iommu_type1_dma_map dma_map = {
.argsz = sizeof(struct vfio_iommu_type1_dma_map),
.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
};
+   int ret;
+
+   dma_map.size = ms->len;
+   dma_map.vaddr = ms->addr_64;
+#ifdef RTE_LIBRTE_DPAA2_USE_PHYS_IOVA
+   dma_map.iova = ms->iova;
+#else
+   dma_map.iova = dma_map.vaddr;
+#endif
+
+   /* SET DMA MAP for IOMMU */
+   group = &vfio_group;
+
+   if (!group->container) {
+   DPAA2_BUS_ERR("Container is not connected ");
+   return -1;
+   }
+
+   DPAA2_BUS_DEBUG("-->Initial SHM Virtual ADDR %llX",
+   dma_map.vaddr);
+   DPAA2_BUS_DEBUG("-> DMA size 0x%llX", dma_map.size);
+   ret = ioctl(group->container->fd, VFIO_IOMMU_MAP_DMA,
+   &dma_map);
+   if (ret) {
+   DPAA2_BUS_ERR("VFIO_IOMMU_MAP_DMA API(errno = %d)",
+   errno);
+   return -1;
+   }
+   (*n_segs)++;
+   return 0;
+}
 
-   int i;
+int rte_fslmc_vfio_dmamap(void)
+{
const struct rte_memseg *memseg;
+   int i = 0;
 
if (is_dma_done)
return 0;
@@ -210,51 +244,21 @@ int rte_fslmc_vfio_dmamap(void)
return -ENODEV;
}
 
-   for (i = 0; i < RTE_MAX_MEMSEG; i++) {
-   if (memseg[i].addr == NULL && memseg[i].len == 0) {
-   DPAA2_BUS_DEBUG("Total %d segments found", i);
-   break;
-   }
-
-   dma_map.size = memseg[i].len;
-   dma_map.vaddr = memseg[i].addr_64;
-#ifdef RTE_LIBRTE_DPAA2_USE_PHYS_IOVA
-   dma_map.iova = memseg[i].iova;
-#else
-   dma_map.iova = dma_map.vaddr;
-#endif
-
-   /* SET DMA MAP for IOMMU */
-   group = &vfio_group;
-
-   if (!group->container) {
-   DPAA2_BUS_ERR("Container is not connected");
-   return -1;
-   }
-
-   DPAA2_BUS_DEBUG("-->Initial SHM Virtual ADDR %llX",
-   dma_map.vaddr);
-   DPAA2_BUS_DEBUG("-> DMA size 0x%llX", dma_map.size);
-   ret = ioctl(group->container->fd, VFIO_IOMMU_MAP_DMA,
-   &dma_map);
-   if (ret) {
-   DPAA2_BUS_ERR("Unable to map DMA address (errno = %d)",
- errno);
-   return ret;
-   }
-   }
+   if (rte_memseg_walk(fslmc_vfio_map, &i) < 0)
+   return -1;
 
/* Verifying that at least single segment is available */
if (i <= 0) {
DPAA2_BUS_ERR("No Segments found for VFIO Mapping");
return -1;
}
+   DPAA2_BUS_DEBUG("Total %d segments found.", i);
 
/* TODO - This is a W.A. as VFIO currently does not add the mapping of
 * the interrupt region to SMMU. This should be removed once the
 * support is added in the Kernel.
 */
-   vfio_map_irq_region(group);
+   vfio_map_irq_region(&vfio_group);
 
is_dma_done = 1;
 
-- 
2.7.4


[dpdk-dev] [PATCH v4 40/70] eal: add virt2memseg function

2018-04-08 Thread Anatoly Burakov
This can be used as a virt2iova function that only looks up
memory that is owned by DPDK (as opposed to doing pagemap walks).
Using this will result in less dependency on internals of the mem API.
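
A minimal sketch of the intended use, assuming `addr` is a DPDK-owned
virtual address (illustrative, not part of the patch):

/* sketch: virt-to-iova via the owning memseg */
const struct rte_memseg *ms = rte_mem_virt2memseg(addr);

if (ms == NULL)
	return RTE_BAD_IOVA; /* address not tracked by DPDK */
return ms->iova + RTE_PTR_DIFF(addr, ms->addr);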

Signed-off-by: Anatoly Burakov 
---
 lib/librte_eal/common/eal_common_memory.c  | 37 ++
 lib/librte_eal/common/include/rte_memory.h | 11 +
 lib/librte_eal/rte_eal_version.map |  1 +
 3 files changed, 49 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index ea3c5a7..fd78d2f 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -161,6 +161,43 @@ rte_mem_iova2virt(rte_iova_t iova)
return vi.virt;
 }
 
+struct virtms {
+   const void *virt;
+   struct rte_memseg *ms;
+};
+static int
+find_memseg(const struct rte_memseg *ms, void *arg)
+{
+   struct virtms *vm = arg;
+
+   if (vm->virt >= ms->addr && vm->virt < RTE_PTR_ADD(ms->addr, ms->len)) {
+   struct rte_memseg *memseg, *found_ms;
+   int idx;
+
+   memseg = rte_eal_get_configuration()->mem_config->memseg;
+   idx = ms - memseg;
+   found_ms = &memseg[idx];
+
+   vm->ms = found_ms;
+   return 1;
+   }
+   return 0;
+}
+
+__rte_experimental struct rte_memseg *
+rte_mem_virt2memseg(const void *addr)
+{
+   struct virtms vm;
+
+   memset(&vm, 0, sizeof(vm));
+
+   vm.virt = addr;
+
+   rte_memseg_walk(find_memseg, &vm);
+
+   return vm.ms;
+}
+
 static int
 physmem_size(const struct rte_memseg *ms, void *arg)
 {
diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
index 5c60b91..b3d7e61 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -143,6 +143,17 @@ __rte_experimental void *
 rte_mem_iova2virt(rte_iova_t iova);
 
 /**
+ * Get memseg to which a particular virtual address belongs.
+ *
+ * @param virt
+ *   The virtual address.
+ * @return
+ *   Memseg pointer on success, or NULL on error.
+ */
+__rte_experimental struct rte_memseg *
+rte_mem_virt2memseg(const void *virt);
+
+/**
  * Memseg walk function prototype.
  *
  * Returning 0 will continue walk
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index dccfc35..79433b7 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -224,6 +224,7 @@ EXPERIMENTAL {
rte_log_register_type_and_pick_level;
rte_malloc_dump_heaps;
rte_mem_iova2virt;
+   rte_mem_virt2memseg;
rte_memseg_contig_walk;
rte_memseg_walk;
rte_mp_action_register;
-- 
2.7.4


[dpdk-dev] [PATCH v4 35/70] virtio: use memseg contig walk instead of iteration

2018-04-08 Thread Anatoly Burakov
Reduce dependency on internal details of EAL memory subsystem, and
simplify code.

Signed-off-by: Anatoly Burakov 
---
 drivers/net/virtio/virtio_user/vhost_kernel.c | 83 +++
 1 file changed, 35 insertions(+), 48 deletions(-)

diff --git a/drivers/net/virtio/virtio_user/vhost_kernel.c b/drivers/net/virtio/virtio_user/vhost_kernel.c
index 1711ead..93d7efe 100644
--- a/drivers/net/virtio/virtio_user/vhost_kernel.c
+++ b/drivers/net/virtio/virtio_user/vhost_kernel.c
@@ -70,6 +70,32 @@ static uint64_t vhost_req_user_to_kernel[] = {
[VHOST_USER_SET_MEM_TABLE] = VHOST_SET_MEM_TABLE,
 };
 
+struct walk_arg {
+   struct vhost_memory_kernel *vm;
+   uint32_t region_nr;
+};
+static int
+add_memory_region(const struct rte_memseg *ms, size_t len, void *arg)
+{
+   struct walk_arg *wa = arg;
+   struct vhost_memory_region *mr;
+   void *start_addr;
+
+   if (wa->region_nr >= max_regions)
+   return -1;
+
+   mr = &wa->vm->regions[wa->region_nr++];
+   start_addr = ms->addr;
+
+   mr->guest_phys_addr = (uint64_t)(uintptr_t)start_addr;
+   mr->userspace_addr = (uint64_t)(uintptr_t)start_addr;
+   mr->memory_size = len;
+   mr->mmap_offset = 0;
+
+   return 0;
+}
+
+
 /* By default, vhost kernel module allows 64 regions, but DPDK allows
  * 256 segments. As a relief, below function merges those virtually
  * adjacent memsegs into one region.
@@ -77,63 +103,24 @@ static uint64_t vhost_req_user_to_kernel[] = {
 static struct vhost_memory_kernel *
 prepare_vhost_memory_kernel(void)
 {
-   uint32_t i, j, k = 0;
-   struct rte_memseg *seg;
-   struct vhost_memory_region *mr;
struct vhost_memory_kernel *vm;
+   struct walk_arg wa;
 
vm = malloc(sizeof(struct vhost_memory_kernel) +
-   max_regions *
-   sizeof(struct vhost_memory_region));
+   max_regions *
+   sizeof(struct vhost_memory_region));
if (!vm)
return NULL;
 
-   for (i = 0; i < RTE_MAX_MEMSEG; ++i) {
-   seg = &rte_eal_get_configuration()->mem_config->memseg[i];
-   if (!seg->addr)
-   break;
-
-   int new_region = 1;
-
-   for (j = 0; j < k; ++j) {
-   mr = &vm->regions[j];
+   wa.region_nr = 0;
+   wa.vm = vm;
 
-   if (mr->userspace_addr + mr->memory_size ==
-   (uint64_t)(uintptr_t)seg->addr) {
-   mr->memory_size += seg->len;
-   new_region = 0;
-   break;
-   }
-
-   if ((uint64_t)(uintptr_t)seg->addr + seg->len ==
-   mr->userspace_addr) {
-   mr->guest_phys_addr =
-   (uint64_t)(uintptr_t)seg->addr;
-   mr->userspace_addr =
-   (uint64_t)(uintptr_t)seg->addr;
-   mr->memory_size += seg->len;
-   new_region = 0;
-   break;
-   }
-   }
-
-   if (new_region == 0)
-   continue;
-
-   mr = &vm->regions[k++];
-   /* use vaddr here! */
-   mr->guest_phys_addr = (uint64_t)(uintptr_t)seg->addr;
-   mr->userspace_addr = (uint64_t)(uintptr_t)seg->addr;
-   mr->memory_size = seg->len;
-   mr->mmap_offset = 0;
-
-   if (k >= max_regions) {
-   free(vm);
-   return NULL;
-   }
+   if (rte_memseg_contig_walk(add_memory_region, &wa) < 0) {
+   free(vm);
+   return NULL;
}
 
-   vm->nregions = k;
+   vm->nregions = wa.region_nr;
vm->padding = 0;
return vm;
 }
-- 
2.7.4


[dpdk-dev] [PATCH v4 44/70] net/mlx5: use virt2memseg instead of iteration

2018-04-08 Thread Anatoly Burakov
Reduce dependency on internal details of EAL memory subsystem, and
simplify code.

Signed-off-by: Anatoly Burakov 
---
 drivers/net/mlx5/mlx5_mr.c | 18 --
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index 2bf1f9c..d8c04dc 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -234,7 +234,7 @@ struct mlx5_mr *
 mlx5_mr_new(struct rte_eth_dev *dev, struct rte_mempool *mp)
 {
struct priv *priv = dev->data->dev_private;
-   const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+   const struct rte_memseg *ms;
uintptr_t start;
uintptr_t end;
unsigned int i;
@@ -261,17 +261,15 @@ mlx5_mr_new(struct rte_eth_dev *dev, struct rte_mempool *mp)
/* Save original addresses for exact MR lookup. */
mr->start = start;
mr->end = end;
+
/* Round start and end to page boundary if found in memory segments. */
-   for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
-   uintptr_t addr = (uintptr_t)ms[i].addr;
-   size_t len = ms[i].len;
-   unsigned int align = ms[i].hugepage_sz;
+   ms = rte_mem_virt2memseg((void *)start);
+   if (ms != NULL)
+   start = RTE_ALIGN_FLOOR(start, ms->hugepage_sz);
+   ms = rte_mem_virt2memseg((void *)end);
+   if (ms != NULL)
+   end = RTE_ALIGN_CEIL(end, ms->hugepage_sz);
 
-   if ((start > addr) && (start < addr + len))
-   start = RTE_ALIGN_FLOOR(start, align);
-   if ((end > addr) && (end < addr + len))
-   end = RTE_ALIGN_CEIL(end, align);
-   }
DRV_LOG(DEBUG,
"port %u mempool %p using start=%p end=%p size=%zu for memory"
" region",
-- 
2.7.4


[dpdk-dev] [PATCH v4 34/70] eal: add contig walk function

2018-04-08 Thread Anatoly Burakov
This function is meant to walk over first segment of each
VA-contiguous group of memsegs.

For future users of this function, this is done so that there is
less dependency on internals of the mem API and less noise in later
change sets.
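
A minimal sketch of a contig-walk user (illustrative only):

/* sketch: sum the lengths of all VA-contiguous areas */
static int
sum_contig(const struct rte_memseg *ms, size_t len, void *arg)
{
	size_t *total = arg;

	RTE_SET_USED(ms);
	*total += len;
	return 0; /* continue walk */
}

/* usage: size_t t = 0; rte_memseg_contig_walk(sum_contig, &t); */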

Signed-off-by: Anatoly Burakov 
---
 lib/librte_eal/common/eal_common_memory.c  | 37 ++
 lib/librte_eal/common/include/rte_memory.h | 27 ++
 lib/librte_eal/rte_eal_version.map |  1 +
 3 files changed, 65 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index 4f588c7..4b528b0 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -242,6 +242,43 @@ rte_memseg_walk(rte_memseg_walk_t func, void *arg)
return 0;
 }
 
+int __rte_experimental
+rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
+{
+   struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+   int i, j, ret;
+
+   for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+   const struct rte_memseg *ms = &mcfg->memseg[i];
+   size_t total_len;
+   void *end_addr;
+
+   if (ms->addr == NULL)
+   continue;
+
+   end_addr = RTE_PTR_ADD(ms->addr, ms->len);
+
+   /* check how many more segments are contiguous to this one */
+   for (j = i + 1; j < RTE_MAX_MEMSEG; j++) {
+   const struct rte_memseg *next = &mcfg->memseg[j];
+
+   if (next->addr != end_addr)
+   break;
+
+   end_addr = RTE_PTR_ADD(next->addr, next->len);
+   i++;
+   }
+   total_len = RTE_PTR_DIFF(end_addr, ms->addr);
+
+   ret = func(ms, total_len, arg);
+   if (ret < 0)
+   return -1;
+   if (ret > 0)
+   return 1;
+   }
+   return 0;
+}
+
 /* init memory subsystem */
 int
 rte_eal_memory_init(void)
diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
index 93eadaa..45d067f 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -140,6 +140,18 @@ rte_iova_t rte_mem_virt2iova(const void *virt);
 typedef int (*rte_memseg_walk_t)(const struct rte_memseg *ms, void *arg);
 
 /**
+ * Memseg contig walk function prototype. This will trigger a callback on every
+ * VA-contiguous area starting at memseg ``ms``, so total valid VA space at each
+ * callback call will be [``ms->addr``, ``ms->addr + len``).
+ *
+ * Returning 0 will continue walk
+ * Returning 1 will stop the walk
+ * Returning -1 will stop the walk and report error
+ */
+typedef int (*rte_memseg_contig_walk_t)(const struct rte_memseg *ms,
+   size_t len, void *arg);
+
+/**
  * Walk list of all memsegs.
  *
  * @param func
@@ -155,6 +167,21 @@ int __rte_experimental
 rte_memseg_walk(rte_memseg_walk_t func, void *arg);
 
 /**
+ * Walk each VA-contiguous area.
+ *
+ * @param func
+ *   Iterator function
+ * @param arg
+ *   Argument passed to iterator
+ * @return
+ *   0 if walked over the entire list
+ *   1 if stopped by the user
+ *   -1 if user function reported error
+ */
+int __rte_experimental
+rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg);
+
+/**
  * Get the layout of the available physical memory.
  *
  * It can be useful for an application to have the full physical
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 716b965..93033b5 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -223,6 +223,7 @@ EXPERIMENTAL {
rte_eal_mbuf_user_pool_ops;
rte_log_register_type_and_pick_level;
rte_malloc_dump_heaps;
+   rte_memseg_contig_walk;
rte_memseg_walk;
rte_mp_action_register;
rte_mp_action_unregister;
-- 
2.7.4


[dpdk-dev] [PATCH v4 45/70] eal: use memzone walk instead of iteration

2018-04-08 Thread Anatoly Burakov
Simplify memzone dump code to use memzone walk, to not maintain
the same memzone iteration code twice.

Signed-off-by: Anatoly Burakov 
---
 lib/librte_eal/common/eal_common_memzone.c | 42 +++---
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index af68c00..d60bde7 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -360,31 +360,31 @@ rte_memzone_lookup(const char *name)
return memzone;
 }
 
+static void
+dump_memzone(const struct rte_memzone *mz, void *arg)
+{
+   struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+   FILE *f = arg;
+   int mz_idx;
+
+   mz_idx = mz - mcfg->memzone;
+
+   fprintf(f, "Zone %u: name:<%s>, IO:0x%"PRIx64", len:0x%zx, virt:%p, "
+   "socket_id:%"PRId32", flags:%"PRIx32"\n",
+   mz_idx,
+   mz->name,
+   mz->iova,
+   mz->len,
+   mz->addr,
+   mz->socket_id,
+   mz->flags);
+}
+
 /* Dump all reserved memory zones on console */
 void
 rte_memzone_dump(FILE *f)
 {
-   struct rte_mem_config *mcfg;
-   unsigned i = 0;
-
-   /* get pointer to global configuration */
-   mcfg = rte_eal_get_configuration()->mem_config;
-
-   rte_rwlock_read_lock(&mcfg->mlock);
-   /* dump all zones */
-   for (i=0; i<RTE_MAX_MEMZONE; i++) {
-   if (mcfg->memzone[i].addr == NULL)
-   break;
-   fprintf(f, "Zone %u: name:<%s>, IO:0x%"PRIx64", len:0x%zx"
-  ", virt:%p, socket_id:%"PRId32", flags:%"PRIx32"\n", i,
-  mcfg->memzone[i].name,
-  mcfg->memzone[i].iova,
-  mcfg->memzone[i].len,
-  mcfg->memzone[i].addr,
-  mcfg->memzone[i].socket_id,
-  mcfg->memzone[i].flags);
-   }
-   rte_rwlock_read_unlock(&mcfg->mlock);
+   rte_memzone_walk(dump_memzone, f);
 }
 
 /*
-- 
2.7.4


[dpdk-dev] [PATCH v4 37/70] bus/dpaa: use iova2virt instead of memseg iteration

2018-04-08 Thread Anatoly Burakov
Reduce dependency on internal details of EAL memory subsystem, and
simplify code.

Signed-off-by: Anatoly Burakov 
---

Notes:
v4:
- Fixed usage of experimental API's

v3:
- Added this patch

 drivers/bus/dpaa/rte_dpaa_bus.h  | 12 +---
 drivers/mempool/dpaa/Makefile|  3 +++
 drivers/mempool/dpaa/meson.build |  3 +++
 drivers/net/dpaa/Makefile|  3 +++
 4 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/drivers/bus/dpaa/rte_dpaa_bus.h b/drivers/bus/dpaa/rte_dpaa_bus.h
index 718701b..89aeac2 100644
--- a/drivers/bus/dpaa/rte_dpaa_bus.h
+++ b/drivers/bus/dpaa/rte_dpaa_bus.h
@@ -98,17 +98,7 @@ struct dpaa_portal {
/* TODO - this is costly, need to write a fast conversion routine */
 static inline void *rte_dpaa_mem_ptov(phys_addr_t paddr)
 {
-   const struct rte_memseg *memseg = rte_eal_get_physmem_layout();
-   int i;
-
-   for (i = 0; i < RTE_MAX_MEMSEG && memseg[i].addr != NULL; i++) {
-   if (paddr >= memseg[i].iova && paddr <
-   memseg[i].iova + memseg[i].len)
-   return (uint8_t *)(memseg[i].addr) +
-  (paddr - memseg[i].iova);
-   }
-
-   return NULL;
+   return rte_mem_iova2virt(paddr);
 }
 
 /**
diff --git a/drivers/mempool/dpaa/Makefile b/drivers/mempool/dpaa/Makefile
index 4c0d7aa..da8da1e 100644
--- a/drivers/mempool/dpaa/Makefile
+++ b/drivers/mempool/dpaa/Makefile
@@ -22,6 +22,9 @@ EXPORT_MAP := rte_mempool_dpaa_version.map
# Library version
 LIBABIVER := 1
 
+# depends on dpaa bus which uses experimental API
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
 # all source are stored in SRCS-y
 #
 SRCS-$(CONFIG_RTE_LIBRTE_DPAA_MEMPOOL) += dpaa_mempool.c
diff --git a/drivers/mempool/dpaa/meson.build b/drivers/mempool/dpaa/meson.build
index 08423c2..9163b3d 100644
--- a/drivers/mempool/dpaa/meson.build
+++ b/drivers/mempool/dpaa/meson.build
@@ -7,3 +7,6 @@ endif
 
 deps += ['bus_dpaa']
 sources = files('dpaa_mempool.c')
+
+# depends on dpaa bus which uses experimental API
+allow_experimental_apis = true
diff --git a/drivers/net/dpaa/Makefile b/drivers/net/dpaa/Makefile
index 9c2a5ea..d7a0a50 100644
--- a/drivers/net/dpaa/Makefile
+++ b/drivers/net/dpaa/Makefile
@@ -27,6 +27,9 @@ EXPORT_MAP := rte_pmd_dpaa_version.map
 
 LIBABIVER := 1
 
+# depends on dpaa bus which uses experimental API
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
 # Interfaces with DPDK
 SRCS-$(CONFIG_RTE_LIBRTE_DPAA_PMD) += dpaa_ethdev.c
 SRCS-$(CONFIG_RTE_LIBRTE_DPAA_PMD) += dpaa_rxtx.c
-- 
2.7.4


[dpdk-dev] [PATCH v4 36/70] eal: add iova2virt function

2018-04-08 Thread Anatoly Burakov
This is a reverse lookup of PA to VA. Using this will make
other code less dependent on internals of the mem API.
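
Sketch of the intended use (fragment; `paddr` assumed in scope):

/* sketch: translate an IOVA/PA back to a virtual address */
void *va = rte_mem_iova2virt(paddr);

if (va == NULL)
	return -1; /* address is outside DPDK's memory map */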

Signed-off-by: Anatoly Burakov 
---
 lib/librte_eal/common/eal_common_memory.c  | 30 ++
 lib/librte_eal/common/include/rte_memory.h | 12 
 lib/librte_eal/rte_eal_version.map |  1 +
 3 files changed, 43 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index 4b528b0..ea3c5a7 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -131,6 +131,36 @@ rte_eal_get_physmem_layout(void)
return rte_eal_get_configuration()->mem_config->memseg;
 }
 
+struct virtiova {
+   rte_iova_t iova;
+   void *virt;
+};
+static int
+find_virt(const struct rte_memseg *ms, void *arg)
+{
+   struct virtiova *vi = arg;
+   if (vi->iova >= ms->iova && vi->iova < (ms->iova + ms->len)) {
+   size_t offset = vi->iova - ms->iova;
+   vi->virt = RTE_PTR_ADD(ms->addr, offset);
+   /* stop the walk */
+   return 1;
+   }
+   return 0;
+}
+
+__rte_experimental void *
+rte_mem_iova2virt(rte_iova_t iova)
+{
+   struct virtiova vi;
+
+   memset(&vi, 0, sizeof(vi));
+
+   vi.iova = iova;
+   rte_memseg_walk(find_virt, &vi);
+
+   return vi.virt;
+}
+
 static int
 physmem_size(const struct rte_memseg *ms, void *arg)
 {
diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
index 45d067f..5c60b91 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -131,6 +131,18 @@ phys_addr_t rte_mem_virt2phy(const void *virt);
 rte_iova_t rte_mem_virt2iova(const void *virt);
 
 /**
+ * Get virtual memory address corresponding to iova address.
+ *
+ * @param iova
+ *   The iova address.
+ * @return
+ *   Virtual address corresponding to iova address (or NULL if address does not
+ *   exist within DPDK memory map).
+ */
+__rte_experimental void *
+rte_mem_iova2virt(rte_iova_t iova);
+
+/**
  * Memseg walk function prototype.
  *
  * Returning 0 will continue walk
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 93033b5..dccfc35 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -223,6 +223,7 @@ EXPERIMENTAL {
rte_eal_mbuf_user_pool_ops;
rte_log_register_type_and_pick_level;
rte_malloc_dump_heaps;
+   rte_mem_iova2virt;
rte_memseg_contig_walk;
rte_memseg_walk;
rte_mp_action_register;
-- 
2.7.4


[dpdk-dev] [PATCH v4 38/70] bus/fslmc: use iova2virt instead of memseg iteration

2018-04-08 Thread Anatoly Burakov
Reduce dependency on internal details of EAL memory subsystem, and
simplify code.

Signed-off-by: Anatoly Burakov 
---

Notes:
v4:
- Fixed usage of experimental API's

v3:
- Added this patch

 drivers/bus/fslmc/portal/dpaa2_hw_pvt.h | 13 +
 drivers/event/dpaa2/Makefile|  3 +++
 drivers/mempool/dpaa2/Makefile  |  3 +++
 drivers/mempool/dpaa2/meson.build   |  3 +++
 drivers/net/dpaa2/Makefile  |  3 +++
 drivers/net/dpaa2/meson.build   |  3 +++
 6 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h b/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
index 4a19d42..d38fc49 100644
--- a/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
+++ b/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
@@ -260,21 +260,10 @@ static void *dpaa2_mem_ptov(phys_addr_t paddr) __attribute__((unused));
/* todo - this is costly, need to write a fast conversion routine */
 static void *dpaa2_mem_ptov(phys_addr_t paddr)
 {
-   const struct rte_memseg *memseg;
-   int i;
-
if (dpaa2_virt_mode)
return (void *)(size_t)paddr;
 
-   memseg = rte_eal_get_physmem_layout();
-
-   for (i = 0; i < RTE_MAX_MEMSEG && memseg[i].addr_64 != 0; i++) {
-   if (paddr >= memseg[i].iova &&
-   paddr < memseg[i].iova + memseg[i].len)
-   return (void *)(size_t)(memseg[i].addr_64
-   + (paddr - memseg[i].iova));
-   }
-   return NULL;
+   return rte_mem_iova2virt(paddr);
 }
 
 static phys_addr_t dpaa2_mem_vtop(uint64_t vaddr) __attribute__((unused));
diff --git a/drivers/event/dpaa2/Makefile b/drivers/event/dpaa2/Makefile
index b26862c..a5b68b4 100644
--- a/drivers/event/dpaa2/Makefile
+++ b/drivers/event/dpaa2/Makefile
@@ -28,6 +28,9 @@ EXPORT_MAP := rte_pmd_dpaa2_event_version.map
 
 LIBABIVER := 1
 
+# depends on fslmc bus which uses experimental API
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
 #
 # all source are stored in SRCS-y
 #
diff --git a/drivers/mempool/dpaa2/Makefile b/drivers/mempool/dpaa2/Makefile
index f0edb32..5125ad1 100644
--- a/drivers/mempool/dpaa2/Makefile
+++ b/drivers/mempool/dpaa2/Makefile
@@ -21,6 +21,9 @@ EXPORT_MAP := rte_mempool_dpaa2_version.map
# Library version
 LIBABIVER := 1
 
+# depends on fslmc bus which uses experimental API
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
 # all source are stored in SRCS-y
 #
 SRCS-$(CONFIG_RTE_LIBRTE_DPAA2_MEMPOOL) += dpaa2_hw_mempool.c
diff --git a/drivers/mempool/dpaa2/meson.build b/drivers/mempool/dpaa2/meson.build
index dee3a88..8b8b518 100644
--- a/drivers/mempool/dpaa2/meson.build
+++ b/drivers/mempool/dpaa2/meson.build
@@ -7,3 +7,6 @@ endif
 
 deps += ['mbuf', 'bus_fslmc']
 sources = files('dpaa2_hw_mempool.c')
+
+# depends on fslmc bus which uses experimental API
+allow_experimental_apis = true
diff --git a/drivers/net/dpaa2/Makefile b/drivers/net/dpaa2/Makefile
index 1b707ad..9b0b143 100644
--- a/drivers/net/dpaa2/Makefile
+++ b/drivers/net/dpaa2/Makefile
@@ -27,6 +27,9 @@ EXPORT_MAP := rte_pmd_dpaa2_version.map
 # library version
 LIBABIVER := 1
 
+# depends on fslmc bus which uses experimental API
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
 SRCS-$(CONFIG_RTE_LIBRTE_DPAA2_PMD) += base/dpaa2_hw_dpni.c
 SRCS-$(CONFIG_RTE_LIBRTE_DPAA2_PMD) += dpaa2_rxtx.c
 SRCS-$(CONFIG_RTE_LIBRTE_DPAA2_PMD) += dpaa2_ethdev.c
diff --git a/drivers/net/dpaa2/meson.build b/drivers/net/dpaa2/meson.build
index ad1724d..8e96b5a 100644
--- a/drivers/net/dpaa2/meson.build
+++ b/drivers/net/dpaa2/meson.build
@@ -13,3 +13,6 @@ sources = files('base/dpaa2_hw_dpni.c',
'mc/dpni.c')
 
 includes += include_directories('base', 'mc')
+
+# depends on fslmc bus which uses experimental API
+allow_experimental_apis = true
-- 
2.7.4


[dpdk-dev] [PATCH v4 42/70] crypto/dpaa_sec: use virt2memseg instead of iteration

2018-04-08 Thread Anatoly Burakov
Reduce dependency on internal details of EAL memory subsystem, and
simplify code.

Signed-off-by: Anatoly Burakov 
---
 drivers/crypto/dpaa_sec/dpaa_sec.c | 19 +--
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/drivers/crypto/dpaa_sec/dpaa_sec.c b/drivers/crypto/dpaa_sec/dpaa_sec.c
index b04510f..a14e669 100644
--- a/drivers/crypto/dpaa_sec/dpaa_sec.c
+++ b/drivers/crypto/dpaa_sec/dpaa_sec.c
@@ -93,20 +93,11 @@ dpaa_sec_alloc_ctx(dpaa_sec_session *ses)
 static inline rte_iova_t
 dpaa_mem_vtop(void *vaddr)
 {
-   const struct rte_memseg *memseg = rte_eal_get_physmem_layout();
-   uint64_t vaddr_64, paddr;
-   int i;
-
-   vaddr_64 = (size_t)vaddr;
-   for (i = 0; i < RTE_MAX_MEMSEG && memseg[i].addr_64 != 0; i++) {
-   if (vaddr_64 >= memseg[i].addr_64 &&
-   vaddr_64 < memseg[i].addr_64 + memseg[i].len) {
-   paddr = memseg[i].iova +
-   (vaddr_64 - memseg[i].addr_64);
-
-   return (rte_iova_t)paddr;
-   }
-   }
+   const struct rte_memseg *ms;
+
+   ms = rte_mem_virt2memseg(vaddr);
+   if (ms)
+   return ms->iova + RTE_PTR_DIFF(vaddr, ms->addr);
return (size_t)NULL;
 }
 
-- 
2.7.4


[dpdk-dev] [PATCH v4 46/70] vfio: allow to map other memory regions

2018-04-08 Thread Anatoly Burakov
Currently it is not possible to use memory that is not owned by DPDK to
perform DMA. This scenario might be used in vhost applications (like
SPDK), where the guest sends its own memory table. To fill this gap,
provide an API to allow registering an arbitrary address in the VFIO
container.
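
Usage would look roughly like this (vaddr/iova/len supplied by the
application; illustrative only):

/* sketch: map externally allocated memory for DMA, unmap when done */
if (rte_vfio_dma_map((uint64_t)(uintptr_t)vaddr, iova, len) < 0)
	return -1;
/* ... memory may now be used for DMA ... */
rte_vfio_dma_unmap((uint64_t)(uintptr_t)vaddr, iova, len);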

Signed-off-by: Pawel Wodkowski 
Signed-off-by: Anatoly Burakov 
Signed-off-by: Gowrishankar Muthukrishnan 
---

Notes:
v3:
- Added PPC64, courtesy of Gowrishankar

v2:
- Moved to earlier in the patchset
- Made API experimental
- Do not print out error message if init isn't finished
- SPAPR code provided by Gowrishankar

 lib/librte_eal/bsdapp/eal/eal.c  |  16 ++
 lib/librte_eal/common/include/rte_vfio.h |  39 
 lib/librte_eal/linuxapp/eal/eal_vfio.c   | 347 ---
 lib/librte_eal/linuxapp/eal/eal_vfio.h   |  12 ++
 lib/librte_eal/rte_eal_version.map   |   2 +
 5 files changed, 341 insertions(+), 75 deletions(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 8e25d78..032a5ea 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -749,6 +749,8 @@ int rte_vfio_enable(const char *modname);
 int rte_vfio_is_enabled(const char *modname);
 int rte_vfio_noiommu_is_enabled(void);
 int rte_vfio_clear_group(int vfio_group_fd);
+int rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len);
+int rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len);
 
 int rte_vfio_setup_device(__rte_unused const char *sysfs_base,
  __rte_unused const char *dev_addr,
@@ -784,3 +786,17 @@ int rte_vfio_clear_group(__rte_unused int vfio_group_fd)
 {
return 0;
 }
+
+int __rte_experimental
+rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova,
+ __rte_unused uint64_t len)
+{
+   return -1;
+}
+
+int __rte_experimental
+rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
+   __rte_unused uint64_t len)
+{
+   return -1;
+}
diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
index 249095e..bd4663c 100644
--- a/lib/librte_eal/common/include/rte_vfio.h
+++ b/lib/librte_eal/common/include/rte_vfio.h
@@ -127,6 +127,45 @@ int rte_vfio_noiommu_is_enabled(void);
 int
 rte_vfio_clear_group(int vfio_group_fd);
 
+/**
+ * Map memory region for use with VFIO.
+ *
+ * @param vaddr
+ *   Starting virtual address of memory to be mapped.
+ *
+ * @param iova
+ *   Starting IOVA address of memory to be mapped.
+ *
+ * @param len
+ *   Length of memory segment being mapped.
+ *
+ * @return
+ *   0 if success.
+ *   -1 on error.
+ */
+int  __rte_experimental
+rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len);
+
+
+/**
+ * Unmap memory region from VFIO.
+ *
+ * @param vaddr
+ *   Starting virtual address of memory to be unmapped.
+ *
+ * @param iova
+ *   Starting IOVA address of memory to be unmapped.
+ *
+ * @param len
+ *   Length of memory segment being unmapped.
+ *
+ * @return
+ *   0 if success.
+ *   -1 on error.
+ */
+int __rte_experimental
+rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index fb41e82..f6fe93e 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -22,17 +22,35 @@
 static struct vfio_config vfio_cfg;
 
 static int vfio_type1_dma_map(int);
+static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
 static int vfio_spapr_dma_map(int);
+static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
 static int vfio_noiommu_dma_map(int);
+static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
 
 /* IOMMU types we support */
 static const struct vfio_iommu_type iommu_types[] = {
/* x86 IOMMU, otherwise known as type 1 */
-   { RTE_VFIO_TYPE1, "Type 1", &vfio_type1_dma_map},
+   {
+   .type_id = RTE_VFIO_TYPE1,
+   .name = "Type 1",
+   .dma_map_func = &vfio_type1_dma_map,
+   .dma_user_map_func = &vfio_type1_dma_mem_map
+   },
/* ppc64 IOMMU, otherwise known as spapr */
-   { RTE_VFIO_SPAPR, "sPAPR", &vfio_spapr_dma_map},
+   {
+   .type_id = RTE_VFIO_SPAPR,
+   .name = "sPAPR",
+   .dma_map_func = &vfio_spapr_dma_map,
+   .dma_user_map_func = &vfio_spapr_dma_mem_map
+   },
/* IOMMU-less mode */
-   { RTE_VFIO_NOIOMMU, "No-IOMMU", &vfio_noiommu_dma_map},
+   {
+   .type_id = RTE_VFIO_NOIOMMU,
+   .name = "No-IOMMU",
+   .dma_map_func = &vfio_noiommu_dma_map,
+   .dma_user_map_func = &vfio_noiommu_dma_mem_map
+   },
 };
 
 int
@@ -333,9 +351,10 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,

[dpdk-dev] [PATCH v4 47/70] eal: add "legacy memory" option

2018-04-08 Thread Anatoly Burakov
This adds a "--legacy-mem" command-line switch. It will be used to
go back to the old memory behavior, one where we can't dynamically
allocate/free memory (the downside), but one where the user can
get physically contiguous memory, like before (the upside).

For now, nothing but the legacy behavior exists; the non-legacy
memory init sequence will be added later. For FreeBSD, non-legacy
memory init will never be enabled, while for Linux it is disabled
in this patch to avoid breaking bisect, and will be enabled once
non-legacy mode is fully operational.
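
An application opts in on the EAL command line; a minimal sketch
(application name illustrative):

/* sketch: request legacy memory mode at EAL init */
char *argv[] = { "app", "--legacy-mem" };

if (rte_eal_init(RTE_DIM(argv), argv) < 0)
	return -1;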

Signed-off-by: Anatoly Burakov 
---

Notes:
v3:
- Move to earlier in the patchset
- Make Linuxapp always load in legacy mode

 lib/librte_eal/bsdapp/eal/eal.c|  3 +++
 lib/librte_eal/common/eal_common_options.c |  4 
 lib/librte_eal/common/eal_internal_cfg.h   |  4 
 lib/librte_eal/common/eal_options.h|  2 ++
 lib/librte_eal/linuxapp/eal/eal.c  |  3 +++
 lib/librte_eal/linuxapp/eal/eal_memory.c   | 24 
 6 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 032a5ea..f44b904 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -534,6 +534,9 @@ rte_eal_init(int argc, char **argv)
return -1;
}
 
+   /* FreeBSD always uses legacy memory model */
+   internal_config.legacy_mem = true;
+
if (eal_plugins_init() < 0) {
rte_eal_init_alert("Cannot init plugins\n");
rte_errno = EINVAL;
diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 8a51ade..fb5ea03 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -73,6 +73,7 @@ eal_long_options[] = {
{OPT_VDEV,  1, NULL, OPT_VDEV_NUM },
{OPT_VFIO_INTR, 1, NULL, OPT_VFIO_INTR_NUM},
{OPT_VMWARE_TSC_MAP,0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
+   {OPT_LEGACY_MEM,0, NULL, OPT_LEGACY_MEM_NUM   },
{0, 0, NULL, 0}
 };
 
@@ -1184,6 +1185,9 @@ eal_parse_common_option(int opt, const char *optarg,
 
core_parsed = LCORE_OPT_MAP;
break;
+   case OPT_LEGACY_MEM_NUM:
+   conf->legacy_mem = 1;
+   break;
 
/* don't know what to do, leave this to caller */
default:
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index a0082d1..fda087b 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -47,6 +47,10 @@ struct internal_config {
volatile unsigned force_sockets;
volatile uint64_t socket_mem[RTE_MAX_NUMA_NODES]; /**< amount of memory per socket */
uintptr_t base_virtaddr;  /**< base address to try and reserve memory from */
+   volatile unsigned legacy_mem;
+   /**< true to enable legacy memory behavior (no dynamic allocation,
+* IOVA-contiguous segments).
+*/
volatile int syslog_facility; /**< facility passed to openlog() */
/** default interrupt mode for VFIO */
volatile enum rte_intr_mode vfio_intr_mode;
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index e86c711..d301d0b 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -55,6 +55,8 @@ enum {
OPT_VFIO_INTR_NUM,
 #define OPT_VMWARE_TSC_MAP"vmware-tsc-map"
OPT_VMWARE_TSC_MAP_NUM,
+#define OPT_LEGACY_MEM"legacy-mem"
+   OPT_LEGACY_MEM_NUM,
OPT_LONG_MAX_NUM
 };
 
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 77f6cb7..b34e57a 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -348,6 +348,7 @@ eal_usage(const char *prgname)
   "  --"OPT_BASE_VIRTADDR" Base virtual address\n"
   "  --"OPT_CREATE_UIO_DEV"Create /dev/uioX (usually done by hotplug)\n"
   "  --"OPT_VFIO_INTR" Interrupt mode for VFIO (legacy|msi|msix)\n"
+  "  --"OPT_LEGACY_MEM"Legacy memory mode (no dynamic allocation, contiguous segments)\n"
   "\n");
	/* Allow the application to print its usage message too if hook is set */
if ( rte_application_usage_hook ) {
@@ -767,6 +768,8 @@ rte_eal_init(int argc, char **argv)
rte_atomic32_clear(&run_once);
return -1;
}
+   /* for now, always set legacy mem */
+   internal_config.legacy_mem = 1;
 
if (eal_plugins_init() < 0) {
rte_eal_init_alert("Cannot init plugins\n");
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 2

[dpdk-dev] [PATCH v4 51/70] eal: add support for mapping hugepages at runtime

2018-04-08 Thread Anatoly Burakov
Nothing uses this code yet. The bulk of it is copied from old
memory allocation code (linuxapp eal_memory.c). We provide an
EAL-internal API to allocate either one page or multiple pages,
guaranteeing that we'll get contiguous VA for all of the pages
that we requested.

Not supported on FreeBSD.

Locking is done via fcntl() because that way, when it comes to
taking out write locks or unlocking on deallocation, we don't
have to keep original fd's around. Plus, using fcntl() gives us
ability to lock parts of a file, which is useful for single-file
segments, which are coming down the line.
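
The byte-range locking referred to above has roughly this shape (a
sketch; fd, seg_idx and page_sz are assumed to be in scope):

/* sketch: take a shared fcntl() lock on one page's slice of a lock file */
struct flock lck;

memset(&lck, 0, sizeof(lck));
lck.l_type = F_RDLCK;            /* shared while the page is in use */
lck.l_whence = SEEK_SET;
lck.l_start = seg_idx * page_sz; /* only this segment's byte range */
lck.l_len = page_sz;
if (fcntl(fd, F_SETLK, &lck) == -1)
	return -1; /* conflicting lock held elsewhere */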

Signed-off-by: Anatoly Burakov 
---

Notes:
v4:
- Compile fixes for various platforms
- Split single file segments stuff into separate commit

v3:
- Split single file segments into separate patch
- Added missing FreeBSD implementation
- Removed rte_panic when unable to free page

v2:
- Added single file segments functionality in this
  commit, instead of later commits

 lib/librte_eal/bsdapp/eal/Makefile |   1 +
 lib/librte_eal/bsdapp/eal/eal_memalloc.c   |  26 ++
 lib/librte_eal/bsdapp/eal/meson.build  |   1 +
 lib/librte_eal/common/eal_memalloc.h   |  31 +++
 lib/librte_eal/linuxapp/eal/Makefile   |   2 +
 lib/librte_eal/linuxapp/eal/eal_memalloc.c | 429 +
 lib/librte_eal/linuxapp/eal/meson.build|   1 +
 7 files changed, 491 insertions(+)
 create mode 100644 lib/librte_eal/bsdapp/eal/eal_memalloc.c
 create mode 100644 lib/librte_eal/common/eal_memalloc.h
 create mode 100644 lib/librte_eal/linuxapp/eal/eal_memalloc.c

diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile
index 1b43d77..19f9322 100644
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@@ -29,6 +29,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_memory.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_hugepage_info.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_thread.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_debug.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_memalloc.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_lcore.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_timer.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_interrupts.c
diff --git a/lib/librte_eal/bsdapp/eal/eal_memalloc.c b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
new file mode 100644
index 000..8c30670
--- /dev/null
+++ b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#include 
+
+#include 
+#include 
+
+#include "eal_memalloc.h"
+
+int
+eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms __rte_unused,
+   int __rte_unused n_segs, size_t __rte_unused page_sz,
+   int __rte_unused socket, bool __rte_unused exact)
+{
+   RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
+   return -1;
+}
+
+struct rte_memseg *
+eal_memalloc_alloc_seg(size_t __rte_unused page_sz, int __rte_unused socket)
+{
+   RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
+   return NULL;
+}
diff --git a/lib/librte_eal/bsdapp/eal/meson.build b/lib/librte_eal/bsdapp/eal/meson.build
index e83fc91..4b40223 100644
--- a/lib/librte_eal/bsdapp/eal/meson.build
+++ b/lib/librte_eal/bsdapp/eal/meson.build
@@ -8,6 +8,7 @@ env_sources = files('eal_alarm.c',
'eal_hugepage_info.c',
'eal_interrupts.c',
'eal_lcore.c',
+   'eal_memalloc.c',
'eal_thread.c',
'eal_timer.c',
'eal.c',
diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h
new file mode 100644
index 000..f628514
--- /dev/null
+++ b/lib/librte_eal/common/eal_memalloc.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#ifndef EAL_MEMALLOC_H
+#define EAL_MEMALLOC_H
+
+#include 
+
+#include 
+
+/*
+ * Allocate segment of specified page size.
+ */
+struct rte_memseg *
+eal_memalloc_alloc_seg(size_t page_sz, int socket);
+
+/*
+ * Allocate `n_segs` segments.
+ *
+ * Note: `ms` can be NULL.
+ *
+ * Note: it is possible to request best-effort allocation by setting `exact` to
+ * `false`, in which case allocator will return however many pages it managed
+ * to allocate successfully.
+ */
+int
+eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
+   int socket, bool exact);
+
+#endif // EAL_MEMALLOC_H
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index c407a43..af6b9be 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -36,6 +36,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_thread.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_log.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_v

[dpdk-dev] [PATCH v4 48/70] eal: add rte_fbarray

2018-04-08 Thread Anatoly Burakov
rte_fbarray is a simple indexed array stored in shared memory
via mapping files into memory. Rationale for its existence is the
following: since we are going to map memory page-by-page, there
could be quite a lot of memory segments to keep track of (for
smaller page sizes, page count can easily reach thousands). We
can't really make page lists truly dynamic and infinitely expandable,
because that involves reallocating memory (which is a big no-no in
multiprocess). What we can do instead is have a maximum capacity as
something really, really large, and decide at allocation time how
big the array is going to be. We map the entire file into memory,
which makes it possible to use fbarray as shared memory, provided
the structure itself is allocated in shared memory. Per-fbarray
locking is also used to avoid index data races (but not contents
data races - that is up to user application to synchronize).

In addition, in understanding that we will frequently need to scan
this array for free space and iterating over array linearly can
become slow, rte_fbarray provides facilities to index array's
usage. The following use cases are covered:
 - find next free/used slot (useful either for adding new elements
   to fbarray, or walking the list)
 - find starting index for next N free/used slots (useful for when
   we want to allocate chunk of VA-contiguous memory composed of
   several pages)
 - find how many contiguous free/used slots there are, starting
   from specified index (useful for when we want to figure out
   how many pages we have until next hole in allocated memory, to
   speed up some bulk operations where we would otherwise have to
   walk the array and add pages one by one)

This is accomplished by storing a usage mask in-memory, right
after the data section of the array, and using some bit-level
magic to figure out the info we need.
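
A minimal usage sketch, assuming the init/find/get/set entry points the
description implies (array name and sizes illustrative):

/* sketch: create an fbarray, grab the first free slot, mark it used */
struct rte_fbarray arr;
int idx;

if (rte_fbarray_init(&arr, "example", 1024, sizeof(struct rte_memseg)) < 0)
	return -1;
idx = rte_fbarray_find_next_free(&arr, 0);
if (idx >= 0) {
	struct rte_memseg *ms = rte_fbarray_get(&arr, idx);
	/* ... fill in *ms ... */
	rte_fbarray_set_used(&arr, idx);
}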

Signed-off-by: Anatoly Burakov 
---

Notes:
v3:
- Fixed index alignment bug
- Fixed compile issues

v2:
- MAP_POPULATE not supported on FreeBSD, removed it
- Bugfix for index size when it is unaligned
- Replace uint64_t with size_t for mapping sizes
- Make API experimental

Initial version of this had resizing capability, however it was
removed due to the fact that in multiprocess scenario, each
fbarray would have its own view of mapped memory, which might not
correspond with others due to some other process performing a
resize that current process didn't know about.

It was therefore decided that to avoid cost of synchronization on
each and every operation (to make sure the array wasn't resized),
resizing feature should be dropped.

 lib/librte_eal/bsdapp/eal/Makefile  |   1 +
 lib/librte_eal/common/Makefile  |   2 +-
 lib/librte_eal/common/eal_common_fbarray.c  | 859 
 lib/librte_eal/common/eal_filesystem.h  |  13 +
 lib/librte_eal/common/include/rte_fbarray.h | 353 
 lib/librte_eal/common/meson.build   |   2 +
 lib/librte_eal/linuxapp/eal/Makefile|   1 +
 lib/librte_eal/rte_eal_version.map  |  16 +
 8 files changed, 1246 insertions(+), 1 deletion(-)
 create mode 100644 lib/librte_eal/common/eal_common_fbarray.c
 create mode 100644 lib/librte_eal/common/include/rte_fbarray.h

diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile
index ed1d17b..1b43d77 100644
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@@ -53,6 +53,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_dev.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_options.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_thread.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_proc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_fbarray.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_malloc.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_elem.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_heap.c
diff --git a/lib/librte_eal/common/Makefile b/lib/librte_eal/common/Makefile
index ea824a3..48f870f 100644
--- a/lib/librte_eal/common/Makefile
+++ b/lib/librte_eal/common/Makefile
@@ -16,7 +16,7 @@ INC += rte_pci_dev_feature_defs.h rte_pci_dev_features.h
 INC += rte_malloc.h rte_keepalive.h rte_time.h
 INC += rte_service.h rte_service_component.h
 INC += rte_bitmap.h rte_vfio.h rte_hypervisor.h rte_test.h
-INC += rte_reciprocal.h
+INC += rte_reciprocal.h rte_fbarray.h
 
 GENERIC_INC := rte_atomic.h rte_byteorder.h rte_cycles.h rte_prefetch.h
 GENERIC_INC += rte_spinlock.h rte_memcpy.h rte_cpuflags.h rte_rwlock.h
diff --git a/lib/librte_eal/common/eal_common_fbarray.c b/lib/librte_eal/common/eal_common_fbarray.c
new file mode 100644
index 000..f65875d
--- /dev/null
+++ b/lib/librte_eal/common/eal_common_fbarray.c
@@ -0,0 +1,859 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#incl

[dpdk-dev] [PATCH v4 50/70] eal: replace memzone array with fbarray

2018-04-08 Thread Anatoly Burakov
It's there, so we might as well use it. Some operations will be
sped up by that.

Since we have to allocate an fbarray for memzones, we have to do
it before we initialize memory subsystem, because that, in
secondary processes, will (later) allocate more fbarrays than the
primary process, which will result in inability to attach to
memzone fbarray if we do it after the fact.

Signed-off-by: Anatoly Burakov 
---

Notes:
v4:
- Fix bug in memzone lookup iteration code

v3:
- Fix memzone lookup skipping over memzones
- Fix error message on failing to find space for memzone

v2:
- Moved earlier in patchset
- Fixed compiled issues
- Removed rte_panic() calls

 drivers/net/ena/Makefile  |   3 +
 drivers/net/ena/ena_ethdev.c  |  10 +-
 lib/librte_eal/bsdapp/eal/eal.c   |  14 ++-
 lib/librte_eal/common/eal_common_memzone.c| 113 --
 lib/librte_eal/common/include/rte_eal_memconfig.h |   4 +-
 lib/librte_eal/common/malloc_heap.c   |   4 +
 lib/librte_eal/linuxapp/eal/eal.c |  13 ++-
 test/test/test_memzone.c  |   9 +-
 8 files changed, 103 insertions(+), 67 deletions(-)

diff --git a/drivers/net/ena/Makefile b/drivers/net/ena/Makefile
index f9bfe05..43339f3 100644
--- a/drivers/net/ena/Makefile
+++ b/drivers/net/ena/Makefile
@@ -43,6 +43,9 @@ INCLUDES :=-I$(SRCDIR) -I$(SRCDIR)/base/ena_defs -I$(SRCDIR)/base
 EXPORT_MAP := rte_pmd_ena_version.map
 LIBABIVER := 1
 
+# rte_fbarray is not yet part of stable API
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
 VPATH += $(SRCDIR)/base
 #
 # all source are stored in SRCS-y
diff --git a/drivers/net/ena/ena_ethdev.c b/drivers/net/ena/ena_ethdev.c
index 34b2a8d..f7bfc7a 100644
--- a/drivers/net/ena/ena_ethdev.c
+++ b/drivers/net/ena/ena_ethdev.c
@@ -264,11 +264,15 @@ static const struct eth_dev_ops ena_dev_ops = {
 static inline int ena_cpu_to_node(int cpu)
 {
struct rte_config *config = rte_eal_get_configuration();
+   struct rte_fbarray *arr = &config->mem_config->memzones;
+   const struct rte_memzone *mz;
 
-   if (likely(cpu < RTE_MAX_MEMZONE))
-   return config->mem_config->memzone[cpu].socket_id;
+   if (unlikely(cpu >= RTE_MAX_MEMZONE))
+   return NUMA_NO_NODE;
 
-   return NUMA_NO_NODE;
+   mz = rte_fbarray_get(arr, cpu);
+
+   return mz->socket_id;
 }
 
 static inline void ena_rx_mbuf_prepare(struct rte_mbuf *mbuf,
diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index d009cf0..54330e1 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -599,14 +599,24 @@ rte_eal_init(int argc, char **argv)
}
}
 
+   /* in secondary processes, memory init may allocate additional fbarrays
+* not present in primary processes, so to avoid any potential issues,
+* initialize memzones first.
+*/
+   if (rte_eal_memzone_init() < 0) {
+   rte_eal_init_alert("Cannot init memzone\n");
+   rte_errno = ENODEV;
+   return -1;
+   }
+
if (rte_eal_memory_init() < 0) {
rte_eal_init_alert("Cannot init memory\n");
rte_errno = ENOMEM;
return -1;
}
 
-   if (rte_eal_memzone_init() < 0) {
-   rte_eal_init_alert("Cannot init memzone\n");
+   if (rte_eal_malloc_heap_init() < 0) {
+   rte_eal_init_alert("Cannot init malloc heap\n");
rte_errno = ENODEV;
return -1;
}
diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index 1f5f753..12ddd42 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -28,42 +28,30 @@
 static inline const struct rte_memzone *
 memzone_lookup_thread_unsafe(const char *name)
 {
-   const struct rte_mem_config *mcfg;
+   struct rte_mem_config *mcfg;
+   struct rte_fbarray *arr;
const struct rte_memzone *mz;
-   unsigned i = 0;
+   int i = 0;
 
/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
+   arr = &mcfg->memzones;
 
/*
 * the algorithm is not optimal (linear), but there are few
 * zones and this function should be called at init only
 */
-   for (i = 0; i < RTE_MAX_MEMZONE; i++) {
-   mz = &mcfg->memzone[i];
-   if (mz->addr != NULL && !strncmp(name, mz->name, RTE_MEMZONE_NAMESIZE))
-   return &mcfg->memzone[i];
+   i = rte_fbarray_find_next_used(arr, 0);
+   while (i >= 0) {
+   mz = rte_fbarray_get(arr, i);
+   if (mz->addr != NULL &&
+   !strncmp(name, mz->name, RTE_MEMZONE_NAMESIZE))
+   return mz;
+ 
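The hunk above is cut off in the archive; a minimal sketch of how the
fbarray-based lookup loop plausibly continues, assuming the standard
rte_fbarray iteration API (rte_fbarray_find_next_used() returning the
next occupied index, or a negative value when none is left):

	/* sketch: visit only occupied fbarray slots */
	i = rte_fbarray_find_next_used(arr, 0);
	while (i >= 0) {
		mz = rte_fbarray_get(arr, i);
		if (mz->addr != NULL &&
				!strncmp(name, mz->name, RTE_MEMZONE_NAMESIZE))
			return mz;
		/* advance to the next occupied slot */
		i = rte_fbarray_find_next_used(arr, i + 1);
	}
	return NULL;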

[dpdk-dev] [PATCH v4 56/70] eal: read hugepage counts from node-specific sysfs path

2018-04-08 Thread Anatoly Burakov
For non-legacy memory init mode, instead of looking at the generic
sysfs path, look at the sysfs paths pertaining to each NUMA node for
hugepage counts. Note that the per-NUMA node paths do not provide
information about reserved pages, so the counts may be less accurate,
but this saves us from the whole mapping/remapping business before
we're actually able to tell which page is on which socket, because we
no longer require our memory to be physically contiguous.

Legacy memory init will not use this.
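
For reference, the per-node counts live under a fixed sysfs layout; a
minimal sketch of reading one such count (hypothetical helper name,
error handling trimmed):

#include <stdio.h>

/* sketch: read the free hugepage count for one NUMA node and page size
 * from /sys/devices/system/node/nodeN/hugepages/hugepages-SIZEkB/ */
static unsigned long
free_hugepages_on_node(unsigned int node, unsigned int pagesz_kb)
{
	char path[256];
	unsigned long val = 0;
	FILE *f;

	snprintf(path, sizeof(path),
		"/sys/devices/system/node/node%u/hugepages/hugepages-%ukB/free_hugepages",
		node, pagesz_kb);
	f = fopen(path, "r");
	if (f == NULL)
		return 0; /* node or page size not present */
	if (fscanf(f, "%lu", &val) != 1)
		val = 0;
	fclose(f);
	return val;
}

For example, free_hugepages_on_node(0, 2048) would read the free 2 MB
page count for node 0.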

Signed-off-by: Anatoly Burakov 
---
 lib/librte_eal/linuxapp/eal/eal_hugepage_info.c | 80 +++--
 1 file changed, 74 insertions(+), 6 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index afebd42..2e0819f 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -31,6 +31,7 @@
 #include "eal_filesystem.h"
 
 static const char sys_dir_path[] = "/sys/kernel/mm/hugepages";
+static const char sys_pages_numa_dir_path[] = "/sys/devices/system/node";
 
 /* this function is only called from eal_hugepage_info_init which itself
  * is only called from a primary process */
@@ -71,6 +72,45 @@ get_num_hugepages(const char *subdir)
return num_pages;
 }
 
+static uint32_t
+get_num_hugepages_on_node(const char *subdir, unsigned int socket)
+{
+   char path[PATH_MAX], socketpath[PATH_MAX];
+   DIR *socketdir;
+   unsigned long num_pages = 0;
+   const char *nr_hp_file = "free_hugepages";
+
+   snprintf(socketpath, sizeof(socketpath), "%s/node%u/hugepages",
+   sys_pages_numa_dir_path, socket);
+
+   socketdir = opendir(socketpath);
+   if (socketdir) {
+   /* Keep calm and carry on */
+   closedir(socketdir);
+   } else {
+   /* Can't find socket dir, so ignore it */
+   return 0;
+   }
+
+   snprintf(path, sizeof(path), "%s/%s/%s",
+   socketpath, subdir, nr_hp_file);
+   if (eal_parse_sysfs_value(path, &num_pages) < 0)
+   return 0;
+
+   if (num_pages == 0)
+   RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n",
+   subdir);
+
+   /*
+* we want to return a uint32_t and more than this looks suspicious
+* anyway ...
+*/
+   if (num_pages > UINT32_MAX)
+   num_pages = UINT32_MAX;
+
+   return num_pages;
+}
+
 static uint64_t
 get_default_hp_size(void)
 {
@@ -269,7 +309,7 @@ eal_hugepage_info_init(void)
 {
const char dirent_start_text[] = "hugepages-";
const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
-   unsigned i, num_sizes = 0;
+   unsigned int i, total_pages, num_sizes = 0;
DIR *dir;
struct dirent *dirent;
 
@@ -323,9 +363,28 @@ eal_hugepage_info_init(void)
if (clear_hugedir(hpi->hugedir) == -1)
break;
 
-   /* for now, put all pages into socket 0,
-* later they will be sorted */
-   hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
+   /*
+* first, try to put all hugepages into relevant sockets, but
+* if the first attempt fails, fall back to collecting all pages
+* in one socket and sorting them later
+*/
+   total_pages = 0;
+   /* we also don't want to do this for legacy init */
+   if (!internal_config.legacy_mem)
+   for (i = 0; i < rte_socket_count(); i++) {
+   int socket = rte_socket_id_by_idx(i);
+   unsigned int num_pages =
+   get_num_hugepages_on_node(
+   dirent->d_name, socket);
+   hpi->num_pages[socket] = num_pages;
+   total_pages += num_pages;
+   }
+   /*
+* we failed to sort memory from the get go, so fall
+* back to old way
+*/
+   if (total_pages == 0)
+   hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
 
 #ifndef RTE_ARCH_64
/* for 32-bit systems, limit number of hugepages to
@@ -349,10 +408,19 @@ eal_hugepage_info_init(void)
  sizeof(internal_config.hugepage_info[0]), compare_hpi);
 
/* now we have all info, check we have at least one valid size */
-   for (i = 0; i < num_sizes; i++)
+   for (i = 0; i < num_sizes; i++) {
+   /* pages may no longer all be on socket 0, so check all */
+   unsigned int j, num_pages = 0;
+
+   for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
+   struct hugepage_info *hpi =
+ 

[dpdk-dev] [PATCH v4 53/70] eal: add "single file segments" command-line option

2018-04-08 Thread Anatoly Burakov
Currently, DPDK stores all pages as separate files in hugetlbfs.
This option will allow storing all pages in one file (one file
per memseg list).

We do this by using fallocate() calls on hugetlbfs; however, this is
only supported on fairly recent (4.3+) Linux kernels, so an ftruncate()
fallback is provided to grow (but not shrink) hugepage files. The
naming scheme is deterministic, so both primary and secondary
processes will be able to easily map the needed files and offsets.

For multi-file segments, we can close fd's right away. For
single-file segments, we can reuse the same fd and reduce the number
of fd's needed to map/use hugepages. However, we need to store the
fd's somewhere, so we add a tailq.
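
A minimal sketch of the grow-with-fallback idea described above
(hypothetical helper, assuming Linux hugetlbfs; the actual patch
differs in detail):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

/* grow a hugepage file by 'len' bytes at 'offset'; prefer fallocate(),
 * which hugetlbfs supports only on 4.3+ kernels, and fall back to
 * ftruncate(), which can grow the file but never return space */
static int
grow_hugepage_file(int fd, uint64_t offset, uint64_t len)
{
	if (fallocate(fd, 0, offset, len) == 0)
		return 0;
	if (errno != ENOTSUP)
		return -1; /* real error, not just missing support */
	return ftruncate(fd, offset + len);
}

ftruncate() cannot punch holes, which is why the patch disallows
deallocation when fallocate() is unavailable.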

Signed-off-by: Anatoly Burakov 
---

Notes:
v3:
- Split this change into a separate patch
- Provide more explanation as to how it works

 lib/librte_eal/common/eal_common_options.c |   4 +
 lib/librte_eal/common/eal_internal_cfg.h   |   4 +
 lib/librte_eal/common/eal_options.h|   2 +
 lib/librte_eal/linuxapp/eal/eal.c  |   1 +
 lib/librte_eal/linuxapp/eal/eal_memalloc.c | 337 -
 5 files changed, 297 insertions(+), 51 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index fb5ea03..5b5da5f 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -74,6 +74,7 @@ eal_long_options[] = {
{OPT_VFIO_INTR, 1, NULL, OPT_VFIO_INTR_NUM},
{OPT_VMWARE_TSC_MAP,0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
{OPT_LEGACY_MEM,0, NULL, OPT_LEGACY_MEM_NUM   },
+   {OPT_SINGLE_FILE_SEGMENTS, 0, NULL, OPT_SINGLE_FILE_SEGMENTS_NUM},
{0, 0, NULL, 0}
 };
 
@@ -1188,6 +1189,9 @@ eal_parse_common_option(int opt, const char *optarg,
case OPT_LEGACY_MEM_NUM:
conf->legacy_mem = 1;
break;
+   case OPT_SINGLE_FILE_SEGMENTS_NUM:
+   conf->single_file_segments = 1;
+   break;
 
/* don't know what to do, leave this to caller */
default:
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 5cf7102..9d33cf4 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -51,6 +51,10 @@ struct internal_config {
/**< true to enable legacy memory behavior (no dynamic allocation,
 * IOVA-contiguous segments).
 */
+   volatile unsigned single_file_segments;
+   /**< true if storing all pages within single files (per-page-size,
+* per-node); non-legacy mode only.
+*/
volatile int syslog_facility; /**< facility passed to openlog() */
/** default interrupt mode for VFIO */
volatile enum rte_intr_mode vfio_intr_mode;
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index d301d0b..211ae06 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -57,6 +57,8 @@ enum {
OPT_VMWARE_TSC_MAP_NUM,
 #define OPT_LEGACY_MEM"legacy-mem"
OPT_LEGACY_MEM_NUM,
+#define OPT_SINGLE_FILE_SEGMENTS"single-file-segments"
+   OPT_SINGLE_FILE_SEGMENTS_NUM,
OPT_LONG_MAX_NUM
 };
 
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 9832551..2c12811 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -349,6 +349,7 @@ eal_usage(const char *prgname)
   "  --"OPT_CREATE_UIO_DEV"Create /dev/uioX (usually done by 
hotplug)\n"
   "  --"OPT_VFIO_INTR" Interrupt mode for VFIO 
(legacy|msi|msix)\n"
   "  --"OPT_LEGACY_MEM"Legacy memory mode (no dynamic 
allocation, contiguous segments)\n"
+  "  --"OPT_SINGLE_FILE_SEGMENTS" Put all hugepage memory in 
single files\n"
   "\n");
/* Allow the application to print its usage message too if hook is set 
*/
if ( rte_application_usage_hook ) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index 118b12d..545ac49 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -39,6 +39,31 @@
 #include "eal_internal_cfg.h"
 #include "eal_memalloc.h"
 
+/*
+ * not all kernel versions support fallocate on hugetlbfs, so fall back to
+ * ftruncate and disallow deallocation if fallocate is not supported.
+ */
+static int fallocate_supported = -1; /* unknown */
+
+/*
+ * If each page is in a separate file, we can close fd's since we need each fd
+ * only once. However, in single file segments mode, we can get away with using
+ * a single fd for entire segments, but we need to store them somewhere. Each
+ * fd is different within each process, so we'll store them in a local tailq.
+ */
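
The message is truncated here; what follows is a minimal sketch of what
such a per-process fd tailq could look like (all names are
illustrative, not taken from the patch):

#include <sys/queue.h>

/* hypothetical: one cached fd per memseg list, local to this process */
struct seg_fd_entry {
	TAILQ_ENTRY(seg_fd_entry) next;
	unsigned int list_idx;  /* memseg list this fd belongs to */
	int fd;                 /* fd of the per-list hugepage file */
};
TAILQ_HEAD(seg_fd_head, seg_fd_entry);
static struct seg_fd_head fd_list = TAILQ_HEAD_INITIALIZER(fd_list);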

[dpdk-dev] [PATCH v4 52/70] eal: add support for unmapping pages at runtime

2018-04-08 Thread Anatoly Burakov
This isn't used anywhere yet, but the support is now there. Also,
add cleanup to the allocation procedures, so that if we fail to
allocate everything we asked for, we can free it all back.

Signed-off-by: Anatoly Burakov 
---
 lib/librte_eal/bsdapp/eal/eal_memalloc.c   |  15 +++
 lib/librte_eal/common/eal_memalloc.h   |  14 +++
 lib/librte_eal/linuxapp/eal/eal_memalloc.c | 149 -
 3 files changed, 177 insertions(+), 1 deletion(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal_memalloc.c b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
index 8c30670..e7bcd2b 100644
--- a/lib/librte_eal/bsdapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
@@ -24,3 +24,18 @@ eal_memalloc_alloc_seg(size_t __rte_unused page_sz, int __rte_unused socket)
RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
return NULL;
 }
+
+int
+eal_memalloc_free_seg(struct rte_memseg *ms __rte_unused)
+{
+   RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
+   return -1;
+}
+
+int
+eal_memalloc_free_seg_bulk(struct rte_memseg **ms __rte_unused,
+   int n_segs __rte_unused)
+{
+   RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
+   return -1;
+}
diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h
index f628514..6017345 100644
--- a/lib/librte_eal/common/eal_memalloc.h
+++ b/lib/librte_eal/common/eal_memalloc.h
@@ -28,4 +28,18 @@ int
 eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
int socket, bool exact);
 
+/*
+ * Deallocate segment
+ */
+int
+eal_memalloc_free_seg(struct rte_memseg *ms);
+
+/*
+ * Deallocate `n_segs` segments. Returns 0 on successful deallocation of all
+ * segments, returns -1 on error. Any segments that could have been deallocated,
+ * will be deallocated even in case of error.
+ */
+int
+eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs);
+
 #endif // EAL_MEMALLOC_H
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index 45ea0ad..118b12d 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -289,6 +289,48 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
return -1;
 }
 
+static int
+free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
+   unsigned int list_idx, unsigned int seg_idx)
+{
+   char path[PATH_MAX];
+   int fd, ret;
+
+   /* erase page data */
+   memset(ms->addr, 0, ms->len);
+
+   if (mmap(ms->addr, ms->len, PROT_READ,
+   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
+   MAP_FAILED) {
+   RTE_LOG(DEBUG, EAL, "couldn't unmap page\n");
+   return -1;
+   }
+
+   fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+   if (fd < 0)
+   return -1;
+
+   /* if we're able to take out a write lock, we're the last one
+* holding onto this page.
+*/
+
+   ret = lock(fd, 0, ms->len, F_WRLCK);
+   if (ret >= 0) {
+   /* no one else is using this page */
+   if (ret == 1)
+   unlink(path);
+   ret = lock(fd, 0, ms->len, F_UNLCK);
+   if (ret != 1)
+   RTE_LOG(ERR, EAL, "%s(): unable to unlock file %s\n",
+   __func__, path);
+   }
+   close(fd);
+
+   memset(ms, 0, sizeof(*ms));
+
+   return ret;
+}
+
 struct alloc_walk_param {
struct hugepage_info *hi;
struct rte_memseg **ms;
@@ -305,7 +347,7 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
struct alloc_walk_param *wa = arg;
struct rte_memseg_list *cur_msl;
size_t page_sz;
-   int cur_idx;
+   int cur_idx, start_idx, j;
unsigned int msl_idx, need, i;
 
if (msl->page_sz != wa->page_sz)
@@ -324,6 +366,7 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0, need);
if (cur_idx < 0)
return 0;
+   start_idx = cur_idx;
 
for (i = 0; i < need; i++, cur_idx++) {
struct rte_memseg *cur;
@@ -341,6 +384,25 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
/* if exact number wasn't requested, stop */
if (!wa->exact)
goto out;
+
+   /* clean up */
+   for (j = start_idx; j < cur_idx; j++) {
+   struct rte_memseg *tmp;
+   struct rte_fbarray *arr =
+   &cur_msl->memseg_arr;
+
+   tmp = rte_fbarray_get(arr, j);
+   if (free_seg(tmp, wa->hi, ms
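The hunk is truncated in the archive; the rollback plausibly finishes
along these lines (a sketch based on the variables visible above, not
the verbatim patch):

	/* sketch: free every segment allocated so far and mark it free */
	for (j = start_idx; j < cur_idx; j++) {
		struct rte_memseg *tmp =
			rte_fbarray_get(&cur_msl->memseg_arr, j);

		if (free_seg(tmp, wa->hi, msl_idx, j))
			RTE_LOG(DEBUG, EAL, "Cannot free page\n");
		rte_fbarray_set_free(&cur_msl->memseg_arr, j);
	}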

[dpdk-dev] [PATCH v4 54/70] eal: add API to check if memory is contiguous

2018-04-08 Thread Anatoly Burakov
For now, memory is always contiguous because legacy mem mode is
enabled unconditionally, but this function will be helpful down
the line when we implement support for allocating physically
non-contiguous memory. We can no longer guarantee physically
contiguous memory unless we're in legacy or IOVA_AS_VA mode, but
we can certainly try and see if we succeed.

In addition, this would be useful for e.g. PMDs that allocate chunks
smaller than the page size which must not cross a page boundary, in
which case we will be able to accommodate that request. This function
will also support non-hugepage memory.
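
The core of the check is walking the region page by page and verifying
that IOVA addresses advance in lockstep with the virtual ones; a
condensed sketch (hypothetical helper, simplified from the hunk below):

#include <stdbool.h>
#include <stddef.h>
#include <rte_common.h>
#include <rte_memory.h>

/* sketch: true if [start, start + len) maps to physically adjacent pages */
static bool
region_iova_contig(void *start, size_t len, size_t pgsz)
{
	char *va = RTE_PTR_ALIGN_FLOOR((char *)start, pgsz);
	char *end = RTE_PTR_ALIGN_CEIL(RTE_PTR_ADD(start, len), pgsz);
	rte_iova_t expected = rte_mem_virt2iova(va) + pgsz;

	/* first page sets the expectation; check each following page */
	for (va += pgsz; va < end; va += pgsz, expected += pgsz)
		if (rte_mem_virt2iova(va) != expected)
			return false;
	return true;
}

This mirrors the nohuge branch of the hunk below; the hugepage branch
does the same walk over memseg entries instead of querying the IOVA of
every page.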

Signed-off-by: Anatoly Burakov 
---

Notes:
v3:
- Moved this earlier in the patchset
- Add support for non-hugepage memory
- Fix handling of IOVA as VA mode

v3:
- Add support for non-hugepage memory
- Support non-page-sized segments

 lib/librte_eal/bsdapp/eal/Makefile  |  1 +
 lib/librte_eal/common/eal_common_memalloc.c | 90 +
 lib/librte_eal/common/eal_memalloc.h| 10 
 lib/librte_eal/common/malloc_elem.c | 40 +
 lib/librte_eal/common/meson.build   |  1 +
 lib/librte_eal/linuxapp/eal/Makefile|  1 +
 6 files changed, 106 insertions(+), 37 deletions(-)
 create mode 100644 lib/librte_eal/common/eal_common_memalloc.c

diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile
index 19f9322..907e30d 100644
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@@ -41,6 +41,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_timer.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_memzone.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_log.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_launch.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_memalloc.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_memory.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_tailqs.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_errno.c
diff --git a/lib/librte_eal/common/eal_common_memalloc.c b/lib/librte_eal/common/eal_common_memalloc.c
new file mode 100644
index 000..607ec3f
--- /dev/null
+++ b/lib/librte_eal/common/eal_common_memalloc.c
@@ -0,0 +1,90 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "eal_private.h"
+#include "eal_internal_cfg.h"
+#include "eal_memalloc.h"
+
+bool
+eal_memalloc_is_contig(const struct rte_memseg_list *msl, void *start,
+   size_t len)
+{
+   void *end, *aligned_start, *aligned_end;
+   size_t pgsz = (size_t)msl->page_sz;
+   const struct rte_memseg *ms;
+
+   /* for IOVA_VA, it's always contiguous */
+   if (rte_eal_iova_mode() == RTE_IOVA_VA)
+   return true;
+
+   /* for legacy memory, it's always contiguous */
+   if (internal_config.legacy_mem)
+   return true;
+
+   end = RTE_PTR_ADD(start, len);
+
+   /* for nohuge, we check pagemap, otherwise check memseg */
+   if (!rte_eal_has_hugepages()) {
+   rte_iova_t cur, expected;
+
+   aligned_start = RTE_PTR_ALIGN_FLOOR(start, pgsz);
+   aligned_end = RTE_PTR_ALIGN_CEIL(end, pgsz);
+
+   /* if start and end are on the same page, bail out early */
+   if (RTE_PTR_DIFF(aligned_end, aligned_start) == pgsz)
+   return true;
+
+   /* skip first iteration */
+   cur = rte_mem_virt2iova(aligned_start);
+   expected = cur + pgsz;
+   aligned_start = RTE_PTR_ADD(aligned_start, pgsz);
+
+   while (aligned_start < aligned_end) {
+   cur = rte_mem_virt2iova(aligned_start);
+   if (cur != expected)
+   return false;
+   aligned_start = RTE_PTR_ADD(aligned_start, pgsz);
+   expected += pgsz;
+   }
+   } else {
+   int start_seg, end_seg, cur_seg;
+   rte_iova_t cur, expected;
+
+   aligned_start = RTE_PTR_ALIGN_FLOOR(start, pgsz);
+   aligned_end = RTE_PTR_ALIGN_CEIL(end, pgsz);
+
+   start_seg = RTE_PTR_DIFF(aligned_start, msl->base_va) /
+   pgsz;
+   end_seg = RTE_PTR_DIFF(aligned_end, msl->base_va) /
+   pgsz;
+
+   /* if start and end are on the same page, bail out early */
+   if (RTE_PTR_DIFF(aligned_end, aligned_start) == pgsz)
+   return true;
+
+   /* skip first iteration */
+   ms = rte_fbarray_get(&msl->memseg_arr, start_seg);
+   cur = ms->iova;
+   expected = cur + pgsz;
+
+   /* if we can't access IOVA addresses, assume non-contiguous */
+   if (cur == 
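The archive cuts the hunk off here; the check plausibly continues by
treating an unknown IOVA as non-contiguous and walking the remaining
memsegs (a sketch, not the verbatim patch):

	if (cur == RTE_BAD_IOVA)
		return false;

	/* sketch: each following memseg must continue the IOVA run */
	for (cur_seg = start_seg + 1; cur_seg < end_seg; cur_seg++) {
		ms = rte_fbarray_get(&msl->memseg_arr, cur_seg);

		if (ms->iova != expected)
			return false;
		expected += pgsz;
	}
	return true;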
