[dpdk-dev] [dpdk-dev, v3] Implement memcmp using Intel SIMD intrinsics.

2016-02-23 Thread Ravi Kerur
On Tue, Feb 23, 2016 at 4:22 AM, Wang, Zhihong 
wrote:

> > > It'd be great if you could format this patch into a patch set with
> several
> > > little ones. :-)
> > > Also, the kernel checkpatch is very helpful.
> > > Good coding style and patch organization make it easy for in-depth
> reviews.
> > >
> > Combination of scalar and vector (32/64/128) was done to get optimal
> performance numbers. If there is enough interest in this I can work on it
> and provide an updated patch set.
>
> That'll be very helpful! Looking forward to your patch :)
> BTW, have you tested real example performance with your patch?
>

Yes, it was tested with hash functions in the DPDK code. I will work on it and
send an updated patch. Thanks for your inputs; I will incorporate them in the
next patch series.


[dpdk-dev] [dpdk-dev, v3] Implement memcmp using Intel SIMD intrinsics.

2016-02-23 Thread Wang, Zhihong
> > It'd be great if you could format this patch into a patch set with several
> > little ones. :-)
> > Also, the kernel checkpatch is very helpful.
> > Good coding style and patch organization make it easy for in-depth reviews.
> > 
> Combination of scalar and vector (32/64/128) was done to get optimal 
> performance numbers. If there is enough interest in this I can work on it and 
> provide an updated patch set.

That'll be very helpful! Looking forward to your patch :)
BTW, have you tested real example performance with your patch?


[dpdk-dev] [dpdk-dev, v3] Implement memcmp using Intel SIMD intrinsics.

2016-02-19 Thread Ravi Kerur
On Wed, Jan 27, 2016 at 7:08 PM, Zhihong Wang 
wrote:

> > diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcmp.h b/lib
> > /librte_eal/common/include/arch/x86/rte_memcmp.h
>
> [...]
>
> > +#ifdef __cplusplus
> > +extern "C" {
> > +#endif
> > +
> > +/**
> > + * Compare bytes between two locations. The locations must not overlap.
> > + *
>
> Parameter names should be kept consistent as they are in function body.
>
> > + * @param src_1
> > + *   Pointer to the first source of the data.
> > + * @param src_2
> > + *   Pointer to the second source of the data.
> > + * @param n
> > + *   Number of bytes to compare.
> > + * @return
> > + *   zero if src_1 equal src_2
> > + *   -ve if src_1 less than src_2
> > + *   +ve if src_1 greater than src_2
> > + */
> > +static inline int
> > +rte_memcmp(const void *src_1, const void *src,
> > + size_t n) __attribute__((always_inline));
> > +
> > +/**
> > + * Find the first different bit for comparison.
> > + */
> > +static inline int
> > +rte_cmpffd (uint32_t x, uint32_t y)
> > +{
> > + int i;
> > + int pos = x ^ y;
> > + for (i = 0; i < 32; i++)
> > + if (pos & (1 << i))
> Coding style check :-)
> BTW, does the bsf instruction provide this check?
>
> > + return i;
> > + return -1;
> > +}
> > +
>
> [...]
>
> > +/**
> > + * Compare 48 bytes between two locations.
> > + * Locations should not overlap.
> > + */
> > +static inline int
> > +rte_cmp48(const void *src_1, const void *src_2)
>
> Guess this is not used.
>

I had left _unused_ with the assumption that it might be needed when actual
performance tests are done on high end servers.

>
> [...]
>
> > +/**
> > + * Compare 256 bytes between two locations.
> > + * Locations should not overlap.
> > + */
> > +static inline int
> > +rte_cmp256(const void *src_1, const void *src_2)
> > +{
> > + int ret;
> > +
> > + ret = rte_cmp64((const uint8_t *)src_1 + 0 * 64,
> > + (const uint8_t *)src_2 + 0 * 64);
>
> Why not just use rte_cmp128?
>
>
> [...]
>
> > +static inline int
> > +rte_memcmp(const void *_src_1, const void *_src_2, size_t n)
> > +{
> > + const uint8_t *src_1 = (const uint8_t *)_src_1;
> > + const uint8_t *src_2 = (const uint8_t *)_src_2;
> > + int ret = 0;
> > +
> > + if (n < 16)
> > + return rte_memcmp_regular(src_1, src_2, n);
> > +
> > + if (n <= 32) {
> > + ret = rte_cmp16(src_1, src_2);
> > + if (unlikely(ret != 0))
> > + return ret;
> > +
> > + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> > + }
> > +
>
> Too many conditions here may harm the overall performance.
> It's a trade-off thing, all about balancing the overhead.
> Just make sure this is tuned based on actual test numbers.
>
>
> > + if (n <= 48) {
> > + ret = rte_cmp32(src_1, src_2);
> > + if (unlikely(ret != 0))
> > + return ret;
> > +
> > + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> > + }
> > +
> > + if (n <= 64) {
> > + ret = rte_cmp32(src_1, src_2);
> > + if (unlikely(ret != 0))
> > + return ret;
> > +
> > + ret = rte_cmp16(src_1 + 32, src_2 + 32);
> > +
> > + if (unlikely(ret != 0))
> > + return ret;
> > +
> > + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> > + }
> > +
> > + if (n <= 96) {
> > + ret = rte_cmp64(src_1, src_2);
> > + if (unlikely(ret != 0))
> > + return ret;
> > +
> > + ret = rte_cmp16(src_1 + 64, src_2 + 64);
> > + if (unlikely(ret != 0))
> > + return ret;
> > +
> > + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> > + }
> > +
> > + if (n <= 128) {
> > + ret = rte_cmp64(src_1, src_2);
> > + if (unlikely(ret != 0))
> > + return ret;
> > +
> > + ret = rte_cmp32(src_1 + 64, src_2 + 64);
> > + if (unlikely(ret != 0))
> > + return ret;
> > +
> > + ret = rte_cmp16(src_1 + 96, src_2 + 96);
> > + if (unlikely(ret != 0))
> > + return ret;
> > +
> > + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> > + }
>
> [...]
>
> > +/**
> > + * Compare 48 bytes between two locations.
> > + * Locations should not overlap.
> > + */
> > +static inline int
> > +rte_cmp48(const void *src_1, const void *src_2)
>
> Not used.
>
> > +{
> > + int ret;
> > +
> > + ret = rte_cmp16((const uint8_t *)src_1 + 0 * 16,
> > + (const uint8_t *)src_2 + 0 * 16);
> > +
> > + if (unlikely(ret != 0))
> > + return ret;
> > +
> > + ret = rte_cmp16((const uint8_t *)src_1 + 1 * 16,
> > + (const uint8_t *)src_2 + 1 * 16);
> > +
> > + if (unlikely(ret != 0))
> > +

[dpdk-dev] [dpdk-dev, v3] Implement memcmp using Intel SIMD intrinsics.

2016-01-27 Thread Zhihong Wang
> diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcmp.h b/lib
> /librte_eal/common/include/arch/x86/rte_memcmp.h

[...]

> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/**
> + * Compare bytes between two locations. The locations must not overlap.
> + *

Parameter names should be kept consistent as they are in function body.

> + * @param src_1
> + *   Pointer to the first source of the data.
> + * @param src_2
> + *   Pointer to the second source of the data.
> + * @param n
> + *   Number of bytes to compare.
> + * @return
> + *   zero if src_1 equal src_2
> + *   -ve if src_1 less than src_2
> + *   +ve if src_1 greater than src_2
> + */
> +static inline int
> +rte_memcmp(const void *src_1, const void *src,
> + size_t n) __attribute__((always_inline));
> +
> +/**
> + * Find the first different bit for comparison.
> + */
> +static inline int
> +rte_cmpffd (uint32_t x, uint32_t y)
> +{
> + int i;
> + int pos = x ^ y;
> + for (i = 0; i < 32; i++)
> + if (pos & (1 << i))
> + return i;
> + return -1;
> +}
> +

[...]

> +/**
> + * Compare 48 bytes between two locations.
> + * Locations should not overlap.
> + */
> +static inline int
> +rte_cmp48(const void *src_1, const void *src_2)

Guess this is not used.

[...]

> +/**
> + * Compare 256 bytes between two locations.
> + * Locations should not overlap.
> + */
> +static inline int
> +rte_cmp256(const void *src_1, const void *src_2)
> +{
> + int ret;
> +
> + ret = rte_cmp64((const uint8_t *)src_1 + 0 * 64,
> + (const uint8_t *)src_2 + 0 * 64);

Why not just use rte_cmp128?


[...]

> +static inline int
> +rte_memcmp(const void *_src_1, const void *_src_2, size_t n)
> +{
> + const uint8_t *src_1 = (const uint8_t *)_src_1;
> + const uint8_t *src_2 = (const uint8_t *)_src_2;
> + int ret = 0;
> +
> + if (n < 16)
> + return rte_memcmp_regular(src_1, src_2, n);
> +
> + if (n <= 32) {
> + ret = rte_cmp16(src_1, src_2);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> + }
> +

Too many conditions here may harm the overall performance.
It's a trade-off thing, all about balancing the overhead.
Just make sure this is tuned based on actual test numbers.


> + if (n <= 48) {
> + ret = rte_cmp32(src_1, src_2);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> + }
> +
> + if (n <= 64) {
> + ret = rte_cmp32(src_1, src_2);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + ret = rte_cmp16(src_1 + 32, src_2 + 32);
> +
> + if (unlikely(ret != 0))
> + return ret;
> +
> + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> + }
> +
> + if (n <= 96) {
> + ret = rte_cmp64(src_1, src_2);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + ret = rte_cmp16(src_1 + 64, src_2 + 64);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> + }
> +
> + if (n <= 128) {
> + ret = rte_cmp64(src_1, src_2);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + ret = rte_cmp32(src_1 + 64, src_2 + 64);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + ret = rte_cmp16(src_1 + 96, src_2 + 96);
> + if (unlikely(ret != 0))
> + return ret;
> +
> + return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> + }

[...]

> +/**
> + * Compare 48 bytes between two locations.
> + * Locations should not overlap.
> + */
> +static inline int
> +rte_cmp48(const void *src_1, const void *src_2)

Not used.

> +{
> + int ret;
> +
> + ret = rte_cmp16((const uint8_t *)src_1 + 0 * 16,
> + (const uint8_t *)src_2 + 0 * 16);
> +
> + if (unlikely(ret != 0))
> + return ret;
> +
> + ret = rte_cmp16((const uint8_t *)src_1 + 1 * 16,
> + (const uint8_t *)src_2 + 1 * 16);
> +
> + if (unlikely(ret != 0))
> + return ret;
> +
> + return rte_cmp16((const uint8_t *)src_1 + 2 * 16,
> + (const uint8_t *)src_2 + 2 * 16);
> +}
> +
> +/**
> + * Compare 64 bytes between two locations.
> + * Locations should not overlap.
> + */
> +static inline int
> +rte_cmp64(const void *src_1, const void *src_2)
> +{
> + int ret;
> +
> + ret = rte_cmp16((const uint8_t *)src_1 + 0 * 16,
> + (const uint8_t *)src_2 + 0 * 16);

Why not rte_cmp32? And use rte_cmp64 for rte_cmp128, and so