![]() |
|
|
#34 |
|
Jul 2009
Tokyo
10011000102 Posts |
v0.13: error check once per 0xff iterations — it is a big jump.
v0.12: error check every iteration, with an error report once per 0xff iterations. |
|
|
|
|
|
#35 |
|
Sep 2004
2·5·283 Posts |
|
|
|
|
|
|
#36 |
|
Jul 2009
Tokyo
2·5·61 Posts |
|
|
|
|
|
|
#37 |
|
Sep 2004
2×5×283 Posts |
|
|
|
|
|
|
#38 | |
|
Jul 2009
Tokyo
2·5·61 Posts |
Quote:
CUDALucas needs 4% CPU time. I cannot understand the reason for this. Last fiddled with by msft on 2011-01-10 at 14:01 |
|
|
|
|
|
|
#39 |
|
Jan 2005
Caught in a sieve
5·79 Posts |
I don't believe I changed the rate of error checking. I changed it to check with floats instead of doubles; though that didn't seem to improve the speed much.
|
|
|
|
|
|
#40 |
|
Jul 2009
Tokyo
61010 Posts |
|
|
|
|
|
|
#41 | |
|
Jan 2005
Caught in a sieve
5·79 Posts |
Quote:
The only problem is that it doesn't work. So does anybody see anything wrong with this kernel? I checked the NOSMO parameter alone; that works, but any speed improvement is within the margin of error.Code:
// Carry-propagation ("normalization") pass over the balanced-digit residue x[].
// Each thread propagates the carry left over from the previous pass across the
// STRIDE words of the section AFTER its own.  The last section's carry wraps
// around to word 0 with negation and a weight (wrapfactor) applied at
// wrapindex — consistent with the thread's note that wrapindex depends on k
// (modulus of the form k*2^n+1) and the FFT length; TODO confirm against the
// host code.  Sets g_flag[threadID]=1 if any normalized word falls outside its
// balanced range [-hlim, hlim] (roundoff-error detection).
//
// FIX: the original out-of-range test used &&:
//         if((xx>hlim) && (xx<-hlim)) flag=1;
// which is unsatisfiable for any hlim >= 0, so the error flag could never be
// raised.  The intended test is "outside the interval", i.e. ||.
__global__ void cuda_normalize2_kernel(
    double *x,          // residue words (balanced representation), length N
    int N,              // total number of words
    double *g_limitbw,  // unused here; kept so the launch argument list is unchanged
    double *g_invlimit, // per-word reciprocal limit: carry = rounded(v) * invlimit
    double *g_carry,    // per-section carries produced by the previous pass
    int wrapindex,      // word that absorbs the weighted wrapped carry; < 0 if none
    double wrapfactor,  // weight applied to the carry that wraps past word N-1
    double BIGWORD,     // rounding constant: (v+limbw)-limbw rounds v in one FP op
    int STRIDE,         // words per section
    double *g_hlimit,   // per-word half-limit; a normalized word must satisfy |w| <= hlimit
    int *g_flag,        // per-thread error-flag outputs
    int NOSMO           // N Over Stride Minus One (N/STRIDE-1). Divisions are expensive!
)
{
    int threadID = blockIdx.x * blockDim.x + threadIdx.x;
    register int j;
    register double xx, zz;
    register double carry = 0.0;
    int flag;
    double hlim, limbw;
    flag = 0;
    // First, handle the wrap-around carry of the LAST section.  The thread that
    // takes it is either thread NOSMO itself (when wrapindex lands in section 0)
    // or the thread whose own section contains wrapindex.
    if ((wrapindex < STRIDE && wrapindex >= 0 && threadID == (NOSMO)) ||
        (threadID + STRIDE + STRIDE > wrapindex && threadID + STRIDE <= wrapindex))
    {
        threadID = (NOSMO);
        carry = g_carry[threadID];
        if (carry)
        {
            double carry2 = 0.0;
            j = 0;
            if (wrapindex) carry2 = carry * wrapfactor;
            carry = -carry; // wrapping past the top word negates the carry
            while ((carry || carry2) && (j < STRIDE))
            {
                if (wrapindex && !carry) { // Skip already normalized words
                    j = wrapindex;
                }
                xx = x[IDX(j)] + carry;
                hlim = g_hlimit[IDX(j)];
                if (wrapindex && j == wrapindex) {
                    xx += carry2;       // inject the weighted wrapped carry exactly once
                    carry2 = 0.0;
                }
                limbw = ((hlim + hlim) * BIGWORD) - BIGWORD;
                zz = (xx + limbw) - limbw;       // round xx to a multiple of the word limit
                carry = zz * g_invlimit[IDX(j)]; // Compute the carry on next word
                xx = xx - zz;                    // And the balanced remainder in current word
                x[IDX(j)] = xx;
                // FIX: || (was &&, which can never be true for hlim >= 0)
                if ((xx > hlim) || (xx < -hlim)) flag = 1;
                if (++j == N)
                {
                    // Carry ran past the top word: wrap to word 0 again.
                    j = 0;
                    if (wrapindex)
                        carry2 = carry * wrapfactor;
                    carry = -carry;
                }
            }
            if (flag == 1) g_flag[threadID] = 1;
        }
        // Do no more if this was the threadID it was assigned to.
        if (threadID == blockIdx.x * blockDim.x + threadIdx.x) return;
        // Otherwise reset to work on the originally planned section.
        threadID = blockIdx.x * blockDim.x + threadIdx.x;
        flag = 0;
        carry = 0.0;
    }
    // Normal case: propagate this thread's carry into the NEXT section's words,
    // stopping early once the carry dies out.
    if (threadID < (NOSMO))
    {
        carry = g_carry[threadID];
        if (carry)
        {
            j = threadID * STRIDE + STRIDE;
            while ((carry) && (j < (threadID * STRIDE + STRIDE + STRIDE)))
            {
                hlim = g_hlimit[IDX(j)];
                xx = x[IDX(j)] + carry;
                limbw = ((hlim + hlim) * BIGWORD) - BIGWORD;
                zz = (xx + limbw) - limbw;
                carry = zz * g_invlimit[IDX(j)]; // Compute the carry on next word
                xx = xx - zz;                    // And the balanced remainder in current word
                x[IDX(j)] = xx;
                // FIX: || (was &&, which can never be true for hlim >= 0)
                if ((xx > hlim) || (xx < -hlim)) flag = 1;
                ++j;
            }
        }
        if (flag == 1) g_flag[threadID] = 1;
    }
}
|
|
|
|
|
|
|
#42 | |
|
Jul 2009
Tokyo
2×5×61 Posts |
We can learn from the code.
wrapindex depends on the k value (of k*2^n+1) and the FFT length. Quote:
|
|
|
|
|
|
|
#43 |
|
Jul 2009
Tokyo
2·5·61 Posts |
We can merge cuda_normalize2_kernel & cuda_normalize3_kernel.
Code:
// Merged cuda_normalize2_kernel / cuda_normalize3_kernel: one kernel handles
// both the ordinary per-section carry propagation and the special wrap-around
// carry of the last section (thread N/STRIDE-1), which re-enters at word 0
// negated and weighted by wrapfactor at wrapindex.  Sets g_flag[threadID]=1 if
// any normalized word falls outside its balanced range [-hlim, hlim].
//
// FIXES vs. the posted version:
//  * The out-of-range test used &&:  if((xx>hlim) && (xx<-hlim)) flag=1;
//    which is unsatisfiable for any hlim >= 0, so the error flag could never
//    fire.  The intended "outside the interval" test is ||.
//  * Removed the dead local `px` — it was assigned and incremented but never
//    read anywhere in the kernel.
__global__ void cuda_normalize2_kernel(
    double *x,          // residue words (balanced representation), length N
    int N,              // total number of words
    double *g_limitbw,  // unused here; kept so the launch argument list is unchanged
    double *g_invlimit, // per-word reciprocal limit: carry = rounded(v) * invlimit
    double *g_carry,    // per-section carries produced by the previous pass
    int wrapindex,      // word that absorbs the weighted wrapped carry; 0 if none
    double wrapfactor,  // weight applied to the carry that wraps past word N-1
    double BIGWORD,     // rounding constant: (v+limbw)-limbw rounds v in one FP op
    int STRIDE,         // words per section
    double *g_hlimit,   // per-word half-limit; a normalized word must satisfy |w| <= hlimit
    int *g_flag         // per-thread error-flag outputs
)
{
    const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
    register int j;
    register double xx, zz;
    register double carry = 0.0, carry2 = 0.0;
    int flag;
    double hlim, limbw;
    flag = 0;
    if (threadID == (N / STRIDE - 1))
    {
        // Last section: its carry wraps around to word 0 (negated, weighted).
        carry = g_carry[threadID];
        if (carry)
        {
            j = 0;
            if (wrapindex) carry2 = carry * wrapfactor;
            carry = -carry; // wrapping past the top word negates the carry
            while ((carry || carry2) && (j < STRIDE))
            {
                if (wrapindex && !carry) { // Skip already normalized words
                    j = wrapindex;
                }
                xx = x[IDX(j)] + carry;
                hlim = g_hlimit[IDX(j)];
                if (wrapindex && j == wrapindex) {
                    xx += carry2;       // inject the weighted wrapped carry exactly once
                    carry2 = 0.0;
                }
                limbw = ((hlim + hlim) * BIGWORD) - BIGWORD;
                zz = (xx + limbw) - limbw;       // round xx to a multiple of the word limit
                carry = zz * g_invlimit[IDX(j)]; // Compute the carry on next word
                xx = xx - zz;                    // And the balanced remainder in current word
                x[IDX(j)] = xx;
                // FIX: || (was &&, which can never be true for hlim >= 0)
                if ((xx > hlim) || (xx < -hlim)) flag = 1;
                if (++j == N)
                {
                    // Carry ran past the top word: wrap to word 0 again.
                    j = 0;
                    if (wrapindex)
                        carry2 = carry * wrapfactor;
                    carry = -carry;
                }
            }
        }
        if (flag == 1) g_flag[threadID] = 1;
    }
    else
    {
        // Normal case: propagate this thread's carry into the NEXT section's
        // words, stopping early once the carry dies out.
        carry = g_carry[threadID];
        if (carry)
        {
            j = threadID * STRIDE + STRIDE;
            while ((carry) && (j < (threadID * STRIDE + STRIDE + STRIDE)))
            {
                hlim = g_hlimit[IDX(j)];
                xx = x[IDX(j)] + carry;
                limbw = ((hlim + hlim) * BIGWORD) - BIGWORD;
                zz = (xx + limbw) - limbw;
                carry = zz * g_invlimit[IDX(j)]; // Compute the carry on next word
                xx = xx - zz;                    // And the balanced remainder in current word
                x[IDX(j)] = xx;
                // FIX: || (was &&, which can never be true for hlim >= 0)
                if ((xx > hlim) || (xx < -hlim)) flag = 1;
                ++j;
            }
        }
        if (flag == 1) g_flag[threadID] = 1;
    }
}
|
|
|
|
|
|
#44 |
|
Jul 2009
Tokyo
2×5×61 Posts |
merge cuda_normalize2_kernel & cuda_normalize3_kernel.
|
|
|
|
![]() |
Similar Threads
|
||||
| Thread | Thread Starter | Forum | Replies | Last Post |
| LLRcuda | shanecruise | Riesel Prime Search | 8 | 2014-09-16 02:09 |
| LLRCUDA - getting it to work | diep | GPU Computing | 1 | 2013-10-02 12:12 |