LGTM, pushed, thanks.
> -----Original Message----- > From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of > rander.wang > Sent: Thursday, June 22, 2017 17:41 > To: beig...@freedesktop.org > Cc: Wang, Rander <rander.w...@intel.com> > Subject: [Beignet] [PATCH] backend: refine pow function > > Now save 40% time than before > (1) group many branches which deal with corner case to one branch. > (2) using HW exp2 and log2 to replace some instructions > > pass conformance tests and utest > > Signed-off-by: rander.wang <rander.w...@intel.com> > --- > backend/src/libocl/tmpl/ocl_math_common.tmpl.cl | 294 ++++++++++++--- > --------- > 1 file changed, 148 insertions(+), 146 deletions(-) > > diff --git a/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl > b/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl > index 2c0a702..6026629 100644 > --- a/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl > +++ b/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl > @@ -2352,7 +2352,8 @@ OVERLOADABLE float > __gen_ocl_internal_pow(float x, float y) { > float z,ax,z_h,z_l,p_h,p_l; > float y1,t1,t2,r,s,sn,t,u,v,w; > int i,j,k,yisint,n; > - int hx,hy,ix,iy,is; > + int hy,ix,iy,is; > + unsigned int hx; > float bp,dp_h,dp_l, > zero = 0.0, > one = 1.0, > @@ -2382,17 +2383,17 @@ OVERLOADABLE float > __gen_ocl_internal_pow(float x, float y) { > float retVal = 0.0f; > bool bRet = false; > > - GEN_OCL_GET_FLOAT_WORD(hx,x); > + hx = as_uint(x); > GEN_OCL_GET_FLOAT_WORD(hy,y); > ax = __gen_ocl_fabs(x); > ix = as_int(ax); iy = as_int(fabs(y)); > > - if(iy < 0x00800000 || hx==0x3f800000) > + if(iy < 0x00800000) > { > bRet = true; > retVal = one; > } > - else if (ix > 0x7f800000 || iy > 0x7f800000) > + else if (iy > 0x7f800000) > { > bRet = true; > retVal = NAN; > @@ -2403,120 +2404,152 @@ OVERLOADABLE float > __gen_ocl_internal_pow(float x, float y) { > * yisint = 1 ... y is an odd int > * yisint = 2 ... y is an even int > */ > - yisint = 0; > - if(hx<0) { > - k = (iy>>23)-0x7f; /* exponent */ > - j = iy>>(23-k); > - yisint = (iy>=0x3f800000 && (j<<(23-k))==iy)? 2-(j&1):yisint; > - yisint = (iy>=0x4b800000) ? 2:yisint; > - } > - > - /* special value of x */ > - if(ix==0x7f800000||ix==0||ix==0x3f800000){ > - z = ax; /*x is +-0,+-inf,+-1*/ > + sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */ > > - z = (hy < 0)? one/z:z; > - z = ((hx<0) && (((ix-0x3f800000)|yisint)==0))? NAN:z; > - z = ((hx<0) && (yisint==1))? -z:z; > + if(hx >= 0x7f800000) > + { > + yisint = 0; > + n = (hx>>31)-1; > > - retVal = (bRet)? retVal:z; > - bRet = true; > - } > + if (!retVal && ix > 0x7f800000) > + { > + bRet = true; > + retVal = NAN; > + } > > - n = ((uint)hx>>31)-1; > + if(hx >= 0x80000000) { > + k = (iy>>23)-0x7f; /* exponent */ > + j = iy>>(23-k); > + yisint = (iy>=0x3f800000 && (j<<(23-k))==iy)? 2-(j&1):yisint; > + yisint = (iy>=0x4b800000) ? 2:yisint; > + } > > - /* (x<0)**(non-int) is NaN */ > - if(!bRet && (n|yisint)==0) > - { > - bRet= true; > - retVal = NAN; > - } > + /* special value of x */ > + if(ix==0x7f800000||ix==0||ix==0x3f800000){ > + z = ax; /*x is +-0,+-inf,+-1*/ > + z = (hy < 0)? one/z:z; > + z = (((ix-0x3f800000)|yisint)==0)? NAN:z; > + z = (yisint==1)? -z:z; > + retVal = (bRet)? retVal:z; > + bRet = true; > + } > > - sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */ > - if((n|(yisint-1))==0) sn = -one;/* (-ve)**(odd int) */ > + /* (x<0)**(non-int) is NaN */ > + if(!bRet && (n|yisint)==0) > + { > + bRet= true; > + retVal = NAN; > + } > > - /* |y| is huge */ > - if(iy>0x4d000000) > - { /* if |y| > 2**27 */ > - /* over/underflow if x is not close to one */ > - /* special value of y */ > - float b1 = (hy>=0)? y: zero; > - float b2 = (hy<0)?-y: zero; > - b1 = (ix > 0x3f800000)? b1:b2; > - retVal = (iy==0x7f800000 && !bRet)? b1:retVal; > - bRet = (iy==0x7f800000 && !bRet)? true: bRet; > + if((n|(yisint-1))==0) sn = -one;/* (-ve)**(odd int) */ } > > + /* special value of x */ > + if((ix&0x7fffff) == 0) { > + if(hx == 0x3f800000) > + { > + retVal = one; > + bRet = true; > + } > > - b1 = (hy>0)? sn*huge*huge:0; > - retVal = (ix>0x3f800007 && !bRet)? b1:retVal; > - bRet = (ix>0x3f800007 && !bRet)? true:bRet; > + if(ix==0x7f800000||ix==0) { > + z = ax; /*x is +0,+inf*/ > + z = (hy < 0)? one/z:z; > + retVal = (bRet)? retVal:z; > + bRet = true; > + } > + } > > - /* now |1-x| is tiny <= 2**-20, suffice to compute > - log(x) by x-x^2/2+x^3/3-x^4/4 */ > - t = ax-1; /* t has 20 trailing zeros */ > - w = (t*t)*((float)0.5-t*(0.333333333333f-t*0.25f)); > - u = ivln2_h*t; /* ivln2_h has 16 sig. bits */ > - v = t*ivln2_l-w*ivln2; > - t1 = u+v; > - GEN_OCL_GET_FLOAT_WORD(is,t1); > - GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000); > - t2 = v-(t1-u); > - } > - else > + /* |y| is not huge */ > + if(iy <= 0x4d000000) > { > - float s2,s_h,s_l,t_h,t_l; > - n = 0; > - n += ((ix)>>23)-0x7f; > - j = ix&0x007fffff; > - /* determine interval */ > - ix = j|0x3f800000; /* normalize ix */ > - > - n = (j >= 0x5db3d7)?n+1:n; > - ix = (j >= 0x5db3d7)? ix - 0x00800000:ix; > - k = (j<=0x1cc471 || j >= 0x5db3d7)? 0:1; > + float s2,s_h,s_l,t_h,t_l; > + n = ((ix)>>23)-0x7f; > + j = ix&0x007fffff; > + /* determine interval */ > + ix = j|0x3f800000; /* normalize ix */ > > - GEN_OCL_SET_FLOAT_WORD(ax,ix); > + if(x > 0x1.0p-6 && x < 0x1.2p7 && fabs(y) < 0x1.0p4) > + { > + GEN_OCL_SET_FLOAT_WORD(ax,ix); > + t1 = n; > + t2 = native_log2(ax); > + } > + else > + { > + n = (j >= 0x5db3d7)?n+1:n; > + ix = (j >= 0x5db3d7)? ix - 0x00800000:ix; > + k = (j<=0x1cc471 || j >= 0x5db3d7)? 0:1; > + > + GEN_OCL_SET_FLOAT_WORD(ax,ix); > + > + bp = k? 1.5:bp; > + dp_h = k? 5.84960938e-01:dp_h; > + dp_l = k? 1.56322085e-06:dp_l; > + > + /* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */ > + u = ax-bp; /* bp[0]=1.0, bp[1]=1.5 */ > + v = one/(ax+bp); > + s = u*v; > + s_h = s; > + GEN_OCL_GET_FLOAT_WORD(is,s_h); > + GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000); > + /* t_h=ax+bp[k] High */ > + is = ((ix>>1)&0xfffff000)|0x20000000; > + GEN_OCL_SET_FLOAT_WORD(t_h,is+0x00400000+(k<<21)); > + t_l = ax - (t_h-bp); > + s_l = v*(mad(-s_h, t_l, mad(-s_h, t_h, u))); > + > + /* compute log(ax) */ > + s2 = s*s; > + r = s2*s2*(mad(s2, L2, L1)); > + r = mad(s_l, (s_h+s), r); > + t_h = mad(s_h, s_h, 3.0f)+r; > + GEN_OCL_GET_FLOAT_WORD(is,t_h); > + GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000); > + t_l = r-(mad(-s_h, s_h, (t_h-3.0f))); > + /* u+v = s*(1+...) */ > + u = s_h*t_h; > + v = mad(s_l, t_h, t_l*s); > + /* 2/(3log2)*(s+...) */ > + p_h = mad(s_h, t_h, v); > + GEN_OCL_GET_FLOAT_WORD(is,p_h); > + GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000); > + p_l = v-(mad(-s_h, t_h, p_h)); > + z_l = mad(cp_l, p_h, mad(p_l, cp, dp_l)); > + /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */ > + t = (float)n; > + t1 = ((mad(cp_h, p_h, z_l)+dp_h)+t); > + GEN_OCL_GET_FLOAT_WORD(is,t1); > + GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000); > + t2 = z_l-mad(-cp_h, p_h, ((t1-t)-dp_h)); > + } > + } > + else > + { /* if |y| > 2**27 */ > + /* over/underflow if x is not close to one */ > + /* special value of y */ > + float b1 = (hy>=0)? y: zero; > + float b2 = (hy<0)?-y: zero; > + b1 = (ix > 0x3f800000)? b1:b2; > + retVal = (iy==0x7f800000 && !bRet)? b1:retVal; > + bRet = (iy==0x7f800000 && !bRet)? true: bRet; > > - bp = k? 1.5:bp; > - dp_h = k? 5.84960938e-01:dp_h; > - dp_l = k? 1.56322085e-06:dp_l; > > - /* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */ > - u = ax-bp; /* bp[0]=1.0, bp[1]=1.5 */ > - v = one/(ax+bp); > - s = u*v; > - s_h = s; > - GEN_OCL_GET_FLOAT_WORD(is,s_h); > - GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000); > - /* t_h=ax+bp[k] High */ > - is = ((ix>>1)&0xfffff000)|0x20000000; > - GEN_OCL_SET_FLOAT_WORD(t_h,is+0x00400000+(k<<21)); > - t_l = ax - (t_h-bp); > - s_l = v*(mad(-s_h, t_l, mad(-s_h, t_h, u))); > + b1 = (hy>0)? sn*huge*huge:0; > + retVal = (ix>0x3f800007 && !bRet)? b1:retVal; > + bRet = (ix>0x3f800007 && !bRet)? true:bRet; > > - /* compute log(ax) */ > - s2 = s*s; > - r = s2*s2*(mad(s2, L2, L1)); > - r = mad(s_l, (s_h+s), r); > - t_h = mad(s_h, s_h, 3.0f)+r; > - GEN_OCL_GET_FLOAT_WORD(is,t_h); > - GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000); > - t_l = r-(mad(-s_h, s_h, (t_h-3.0f))); > - /* u+v = s*(1+...) */ > - u = s_h*t_h; > - v = mad(s_l, t_h, t_l*s); > - /* 2/(3log2)*(s+...) */ > - p_h = mad(s_h, t_h, v); > - GEN_OCL_GET_FLOAT_WORD(is,p_h); > - GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000); > - p_l = v-(mad(-s_h, t_h, p_h)); > - z_l = mad(cp_l, p_h, mad(p_l, cp, dp_l)); > - /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */ > - t = (float)n; > - t1 = ((mad(cp_h, p_h, z_l)+dp_h)+t); > - GEN_OCL_GET_FLOAT_WORD(is,t1); > - GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000); > - t2 = z_l-mad(-cp_h, p_h, ((t1-t)-dp_h)); > + /* now |1-x| is tiny <= 2**-20, suffice to compute > + log(x) by x-x^2/2+x^3/3-x^4/4 */ > + t = ax-1; /* t has 20 trailing zeros */ > + w = (t*t)*((float)0.5-t*(0.333333333333f-t*0.25f)); > + u = ivln2_h*t; /* ivln2_h has 16 sig. bits */ > + v = t*ivln2_l-w*ivln2; > + t1 = u+v; > + GEN_OCL_GET_FLOAT_WORD(is,t1); > + GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000); > + t2 = v-(t1-u); > } > > /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */ @@ -2524,52 > +2557,21 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) > { > GEN_OCL_SET_FLOAT_WORD(y1,is&0xffffe000); > p_l = mad((y-y1), t1, y*t2); > p_h = y1*t1; > - z = mad(y1, t1, p_l); > > - GEN_OCL_GET_FLOAT_WORD(j,z); > - if((j&0x7fffffff) >= 0x43000000 && !bRet) > + z = mad(y1, t1, p_l); > + if(fabs(z) >= 128.0f) > { > - retVal = ((j&0x7fffffff)>0x43160000)? 0.0:retVal; > - retVal = (j > 0x43000000) ? sn*huge*huge:retVal; > - retVal = (j == 0x43000000 && p_l+ovt>z-p_h)? sn*huge*huge:retVal; > - retVal = (j == 0xc3160000 && p_l<=z-p_h)? 0.0:retVal; > - bRet = (((j&0x7fffffff)>0x43160000) || (j == 0xc3160000 && p_l<=z-p_h) > || retVal == sn*huge*huge)? true:false; > - } > - > - /* > - * compute 2**(p_h+p_l) > - */ > - i = j&0x7fffffff; > - k = (i>>23)-0x7f; > - n = 0; > - if(i>0x3f000000) { /* if |z| > 0.5, set n = [z+0.5] */ > - n = j+(0x00800000>>(k+1)); > - k = ((n&0x7fffffff)>>23)-0x7f; /* new k for n */ > - GEN_OCL_SET_FLOAT_WORD(t,n&~(0x007fffff>>k)); > - n = ((n&0x007fffff)|0x00800000)>>(23-k); > - n = (j<0)?-n:n; > - p_h = mad(y1, t1, -t); > + if(!bRet) > + { > + retVal = (fabs(z) > 150.0f)? 0.0:retVal; > + retVal = (z > 128.0f) ? sn*INFINITY:retVal; > + retVal = (z == 128.0f && p_l+ovt>z-p_h)? sn*INFINITY:retVal; > + retVal = (z == -150.0f && p_l<=z-p_h)? 0.0:retVal; > + bRet = ((fabs(z) > 150.0f) || (z == -150.0f && p_l<=z-p_h) || retVal > == > sn*INFINITY)? true:false; > + } > } > > - t = p_l+p_h; > - GEN_OCL_GET_FLOAT_WORD(is,t); > - GEN_OCL_SET_FLOAT_WORD(t,is&0xffff8000); > - > - v = mad((p_l-(t-p_h)), lg2, t*lg2_l); > - z = mad(t, lg2_h, v); > - w = v-mad(-t, lg2_h, z); > - t = z*z; > - t1 = mad(-t, (mad(t, P2, P1)), z); > - r = (z*t1)/(t1-two)-(mad(z, w, w)); > - z = one-(r-z); > - > - GEN_OCL_GET_FLOAT_WORD(j,z); > - j += (n<<23); > - > - if((j>>23)<=0) > - z = 0;//__gen_ocl_scalbnf(z,n); /* subnormal output */ > - else > - GEN_OCL_SET_FLOAT_WORD(z,j); > + z = exp2(p_l)*exp2(p_h); > > return bRet ? retVal:sn*z; > } > -- > 2.7.4 > > _______________________________________________ > Beignet mailing list > Beignet@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet