#34
Jul 2009
Tokyo
1001100010₂ Posts
v0.13 checks for errors only once per 0xff iterations; that is a big jump.
v0.12 checked for errors every iteration and reported the error once per 0xff iterations.
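To illustrate the difference between the two policies, here is a minimal host-loop sketch. lucas_square and get_roundoff_error are stand-in names, not the actual CUDALucas routines, and the exact reporting interval may differ from the real program.

Code:
#include <stdio.h>

/* Stand-ins, NOT the real CUDALucas routines -- trivial stubs so the
 * control flow below compiles on its own. */
static void   lucas_square(double *x)       { (void)x; }
static double get_roundoff_error(double *x) { (void)x; return 0.0; }

/* Sketch of the two policies: v0.12 computes the round-off error every
 * iteration but only reports it once per ~0xff iterations; v0.13 skips the
 * error computation entirely except once per ~0xff iterations. */
static void ll_loop(double *x_dev, int total_iterations, int v013_style)
{
    for (int iter = 0; iter < total_iterations; iter++) {
        lucas_square(x_dev);                 /* one LL squaring on the GPU */

        if (v013_style) {
            if ((iter & 0xff) == 0)          /* check and report together */
                printf("iter %d: err %f\n", iter, get_roundoff_error(x_dev));
        } else {
            double err = get_roundoff_error(x_dev);   /* every iteration */
            if ((iter & 0xff) == 0)
                printf("iter %d: err %f\n", iter, err);
        }
    }
}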
#35
Sep 2004
B0E₁₆ Posts
#36
Jul 2009
Tokyo
2×5×61 Posts
#37
Sep 2004
2×5×283 Posts
#38
Jul 2009
Tokyo
2·5·61 Posts
Quote:
CUDALucas needs 4% CPU time. I cannot understand the reason for this.

Last fiddled with by msft on 2011-01-10 at 14:01
#39
Jan 2005
Caught in a sieve
5×79 Posts
I don't believe I changed the rate of error checking. I changed it to check with floats instead of doubles, though that didn't seem to improve the speed much.
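As an illustration of that kind of change, here is a minimal sketch of a round-off check done in single precision. The kernel name, the demotion to float and the atomic-max reduction are assumptions made for this sketch, not the code being discussed.

Code:
// Hypothetical sketch of a round-off check done in single precision.
// *g_maxerr must be zeroed on the host before the launch.
__global__ void roundoff_check_float(const double *x, int N, float *g_maxerr)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N) return;

    float xi  = (float)x[i];            // work in single precision
    float err = fabsf(xi - rintf(xi));  // distance to the nearest integer

    // atomicMax only exists for integers; non-negative floats compare in
    // the same order as their bit patterns, so reinterpret and max-reduce.
    atomicMax((int *)g_maxerr, __float_as_int(err));
}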
#40
Jul 2009
Tokyo
610₁₀ Posts
#41
Jan 2005
Caught in a sieve
5·79 Posts
Quote:
The only problem is that it doesn't work.

Code:
__global__ void cuda_normalize2_kernel( double *x, int N, double *g_limitbw,
                                        double *g_invlimit, double *g_carry,
                                        int wrapindex, double wrapfactor,
                                        double BIGWORD, int STRIDE,
                                        double *g_hlimit, int *g_flag,
                                        int NOSMO // N Over Stride Minus One (N/STRIDE-1). Divisions are expensive!
                                      )
{
    int threadID = blockIdx.x * blockDim.x + threadIdx.x;
    register int j;
    register double xx, zz;
    register double carry = 0.0;
    int flag;
    double hlim, limbw;

    flag = 0;

    // First, work on the last section in the section containing wrapindex.
    if ((wrapindex < STRIDE && wrapindex >= 0 && threadID == (NOSMO)) ||
        (threadID + STRIDE + STRIDE > wrapindex && threadID + STRIDE <= wrapindex)) {
        threadID = (NOSMO);
        carry = g_carry[threadID];
        if (carry) {
            double carry2 = 0.0;
            j = 0;
            if (wrapindex) carry2 = carry * wrapfactor;
            carry = -carry;
            while ((carry || carry2) && (j < STRIDE)) {
                if (wrapindex && !carry) { // Skip already normalized words
                    j = wrapindex;
                }
                xx = x[IDX(j)] + carry;
                hlim = g_hlimit[IDX(j)];
                if (wrapindex && j == wrapindex) {
                    xx += carry2;
                    carry2 = 0.0;
                }
                limbw = ((hlim + hlim) * BIGWORD) - BIGWORD;
                zz = (xx + limbw) - limbw;
                carry = zz * g_invlimit[IDX(j)]; // Compute the carry on next word
                xx = xx - zz;                    // And the balanced remainder in current word
                x[IDX(j)] = xx;
                if ((xx > hlim) && (xx < -hlim)) flag = 1;
                if (++j == N) {
                    j = 0;
                    if (wrapindex) carry2 = carry * wrapfactor;
                    carry = -carry;
                }
            }
            if (flag == 1) g_flag[threadID] = 1;
        }
        // Do no more if this was the threadID it was assigned to.
        if (threadID == blockIdx.x * blockDim.x + threadIdx.x) return;
        // Otherwise reset to work on the originally planned section.
        threadID = blockIdx.x * blockDim.x + threadIdx.x;
        flag = 0;
        carry = 0.0;
    }
    if (threadID < (NOSMO)) {
        carry = g_carry[threadID];
        if (carry) {
            j = threadID * STRIDE + STRIDE;
            while ((carry) && (j < (threadID * STRIDE + STRIDE + STRIDE))) {
                hlim = g_hlimit[IDX(j)];
                xx = x[IDX(j)] + carry;
                limbw = ((hlim + hlim) * BIGWORD) - BIGWORD;
                zz = (xx + limbw) - limbw;
                carry = zz * g_invlimit[IDX(j)]; // Compute the carry on next word
                xx = xx - zz;                    // And the balanced remainder in current word
                x[IDX(j)] = xx;
                if ((xx > hlim) && (xx < -hlim)) flag = 1;
                ++j;
            }
        }
        if (flag == 1) g_flag[threadID] = 1;
    }
}
#42
Jul 2009
Tokyo
1001100010₂ Posts
We can learn from the code.
wrapindex depends on the value of k (in k*2^n+1) and on the FFT length.

Quote:
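To make that dependence concrete, here is a single-threaded, host-side restatement of the wrap-around carry logic from the kernels above; it is a study sketch only, not code from LLRCUDA or CUDALucas. The per-thread section bounds, the skip-ahead optimization and the overflow flag are dropped, and IDX(j) is taken as the identity map.

Code:
/* Single-threaded restatement of the wrap-around carry handling, for study. */
void wrap_carry_reference(double *x, int N, const double *invlimit,
                          const double *hlimit, double carry,
                          int wrapindex, double wrapfactor, double BIGWORD)
{
    double carry2 = 0.0;
    int j = 0;

    if (wrapindex) carry2 = carry * wrapfactor; /* scaled copy injected at wrapindex */
    carry = -carry;                             /* the carry re-enters word 0 negated */

    while ((carry || carry2) && j < N) {
        double hlim = hlimit[j];
        double xx   = x[j] + carry;
        if (wrapindex && j == wrapindex) { xx += carry2; carry2 = 0.0; }

        /* Same floating-point split as in the kernels: zz is the part that
         * carries on, xx - zz the balanced remainder kept in this word. */
        double limbw = ((hlim + hlim) * BIGWORD) - BIGWORD;
        double zz    = (xx + limbw) - limbw;

        carry = zz * invlimit[j];
        x[j]  = xx - zz;
        ++j;
    }
}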
#43
Jul 2009
Tokyo
1001100010₂ Posts
We can merge cuda_normalize2_kernel & cuda_normalize3_kernel.
Code:
__global__ void cuda_normalize2_kernel( double *x, int N, double *g_limitbw,
                                        double *g_invlimit, double *g_carry,
                                        int wrapindex, double wrapfactor,
                                        double BIGWORD, int STRIDE,
                                        double *g_hlimit, int *g_flag )
{
    const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
    register int j;
    register double *px, xx, zz;
    register double carry = 0.0, carry2 = 0.0;
    int flag;
    double hlim, limbw;

    flag = 0;

    if (threadID == (N/STRIDE - 1)) {
        carry = g_carry[threadID];
        if (carry) {
            j = 0;
            px = x;
            if (wrapindex) carry2 = carry * wrapfactor;
            carry = -carry;
            while ((carry || carry2) && (j < STRIDE)) {
                if (wrapindex && !carry) { // Skip already normalized words
                    j = wrapindex;
                    px = x + wrapindex;
                }
                xx = x[IDX(j)] + carry;
                hlim = g_hlimit[IDX(j)];
                if (wrapindex && j == wrapindex) {
                    xx += carry2;
                    carry2 = 0.0;
                }
                limbw = ((hlim + hlim) * BIGWORD) - BIGWORD;
                zz = (xx + limbw) - limbw;
                carry = zz * g_invlimit[IDX(j)]; // Compute the carry on next word
                xx = xx - zz;                    // And the balanced remainder in current word
                px++;
                x[IDX(j)] = xx;
                if ((xx > hlim) && (xx < -hlim)) flag = 1;
                if (++j == N) {
                    j = 0;
                    px = x;
                    if (wrapindex) carry2 = carry * wrapfactor;
                    carry = -carry;
                }
            }
        }
        if (flag == 1) g_flag[threadID] = 1;
    } else {
        carry = g_carry[threadID];
        if (carry) {
            j = threadID * STRIDE + STRIDE;
            px = &x[threadID * STRIDE + STRIDE];
            while ((carry) && (j < (threadID * STRIDE + STRIDE + STRIDE))) {
                hlim = g_hlimit[IDX(j)];
                xx = x[IDX(j)] + carry;
                limbw = ((hlim + hlim) * BIGWORD) - BIGWORD;
                zz = (xx + limbw) - limbw;
                carry = zz * g_invlimit[IDX(j)]; // Compute the carry on next word
                xx = xx - zz;                    // And the balanced remainder in current word
                px++;
                x[IDX(j)] = xx;
                if ((xx > hlim) && (xx < -hlim)) flag = 1;
                ++j;
            }
        }
        if (flag == 1) g_flag[threadID] = 1;
    }
}
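For orientation, a sketch of how a kernel with this layout might be launched from the host: one thread per STRIDE-word section, so N/STRIDE threads in total. The block size and the assumption that N/STRIDE is a multiple of it are mine, not values taken from the actual program.

Code:
/* Hypothetical launch wrapper: one thread per STRIDE-word section.
 * threadsPerBlock and the requirement that (N/STRIDE) be a multiple of it
 * are assumptions for this sketch; otherwise the kernel would need an
 * explicit bounds check on threadID. */
void launch_normalize2(double *d_x, int N, double *d_limitbw, double *d_invlimit,
                       double *d_carry, int wrapindex, double wrapfactor,
                       double BIGWORD, int STRIDE, double *d_hlimit, int *d_flag)
{
    const int sections        = N / STRIDE;  /* one thread per section */
    const int threadsPerBlock = 128;         /* arbitrary choice */
    const int blocks          = sections / threadsPerBlock;

    cuda_normalize2_kernel<<<blocks, threadsPerBlock>>>(
        d_x, N, d_limitbw, d_invlimit, d_carry, wrapindex, wrapfactor,
        BIGWORD, STRIDE, d_hlimit, d_flag);
}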
#44
Jul 2009
Tokyo
2×5×61 Posts
Merged cuda_normalize2_kernel & cuda_normalize3_kernel.