![]() |
|
|
#122 |
|
Dec 2011
New York, U.S.A.
97 Posts |
In preparation for assaulting the higher N numbers, I made some changes to the calculation of b^N at the beginning of the check() function. Here's the new beginning of that function, along with a few new helper functions it uses:
This is the OLD code: Code:
void check(const Int32 b, const UInt32 m, char *expectedResidue)
{
~~snip~~
if (read_checkpoint(b, m, z, &startTime, &iStart, &Nlen, &iOrigStart))
{
fprintf(STDOUT, "\rResuming %d^%u+1 from a checkpoint (%d iterations left)\n", b, m, iStart); //1.00
fflush(STDOUT);
}
else
{
Nlen = 1;
Na = (UInt32 *)myAlloc(1 * sizeof(UInt32));
Na[0] = (UInt32)b;
for (j = m; j != 1; j /= 2)
{
a = (UInt32 *)myAlloc(2 * Nlen * sizeof(UInt32));
for (i = 0; i != Nlen; ++i)
a[i] = 0;
for (i = 0; i != Nlen; ++i)
{
BOINC_GPU_CHECK_FOR_ABORT(true); // should be good spot for abort check. Runs sqrt of total times
a[i + Nlen] = mul_1_add_n(&a[i], Na[i], Na, Nlen);
}
myFree(Na);
Na = a;
Nlen *= 2;
if (Na[Nlen - 1] == 0) --Nlen;
}
iStart = iOrigStart = lg(Na, Nlen) - 1;
fprintf(STDOUT, "%sTesting %d^%u+1... %d steps to go ", ISBOINC ? "\n" : "\r", b, m, iStart);
fflush(STDOUT);
}
Code:
// Transfer the input integer array into the complex array used as FFT input.
// Each thread unpacks one 32-bit word into two consecutive complex elements:
// the real parts hold the low and high 16-bit halves, the imaginary parts are
// zeroed.  Slots at or past 'len' are zero-padded.  The launch configuration
// must supply exactly one thread per pair of FFT elements (fftLen/2 threads).
__global__ void fftIn_kernel(cufftDoubleComplex * const g_fftIn, const UInt32 * const g_intArray, const UInt32 len)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int lo = tid * 2;     // element receiving the low 16 bits
    const int hi = lo + 1;      // element receiving the high 16 bits

    // Imaginary parts are always zero -- the input is purely real.
    g_fftIn[lo].y = 0.0;
    g_fftIn[hi].y = 0.0;

    if (tid < len)
    {
        const UInt32 word = g_intArray[tid];
        g_fftIn[lo].x = (double) (word & 0xFFFF);
        g_fftIn[hi].x = (double) (word >> 16);
    }
    else
    {
        // Zero-pad the FFT array beyond the actual input length.
        g_fftIn[lo].x = 0.0;
        g_fftIn[hi].x = 0.0;
    }
}
// Compute the size of the FFT array for computeSquare: 'len' rounded up via
// lpt() (per the original comment, the nearest power of two), clamped to a
// minimum of 128 elements.
UInt32 fftSize(const UInt32 len)
{
    const UInt32 rounded = lpt(len);
    return (rounded < 128) ? 128 : rounded;
}
// In order to compute the value of b^N, we need to repeatedly square b.
// Use the cuFFT library to perform the squaring.
// The resulting value, the address of which is returned by this function,
// will be twice as long as the input (i.e., 2*Nlen 32-bit words).
// This function allocates memory for the result -- the calling routine has
// responsibility to eventually free that memory.
//
// The FFT works on 16-bit digits stored in doubles rather than 32-bit digits:
// squaring 32-bit digits produces 64-bit products, which cannot be represented
// exactly in a double (53-bit mantissa), causing a loss of precision.  Using
// 16-bit digits doubles the size of the FFT but keeps the arithmetic exact.
//
UInt32 * computeSquare(const UInt32 * Na, const UInt32 Nlen)
{
    cufftDoubleComplex * g_fftIn;      // FFT input array (device)
    cufftDoubleComplex * g_fftOut;     // FFT output array (device)
    cufftDoubleComplex * fftOut;       // host copy of the final output (carry propagation is easier on the CPU)
    UInt32 * g_intArray;               // input integer array on the GPU
    UInt32 * a;                        // imaginatively named pointer to the result
    UInt32 fftLen = 2*fftSize(2*Nlen); // FFT size: 2*Nlen 16-bit digits rounded up to a power of 2 (so fftLen >= 256)
    cufftHandle plan;                  // the FFT plan
    UInt32 i, carry;                   // index counter and carry
    UInt64 l;                          // temporary 64-bit unsigned int for preserving the carry

    // Allocate GPU memory for the FFT input and output (complex double, fftLen elements each).
    cutilSafeCall(cudaMalloc((void**)&g_fftIn, sizeof(*g_fftIn)*fftLen));
    cutilSafeCall(cudaMalloc((void**)&g_fftOut, sizeof(*g_fftOut)*fftLen));
    // Allocate GPU memory to load the integer array (UInt32, one word per two FFT elements).
    cutilSafeCall(cudaMalloc((void**)&g_intArray, sizeof(*g_intArray)*fftLen/2));
    // Copy Na into device memory; words beyond Nlen are never read by the kernel.
    cutilSafeCall(cudaMemcpy(g_intArray, Na, sizeof(*g_intArray)*Nlen, cudaMemcpyHostToDevice));
    // Invoke kernel to build the FFT input.  fftLen/2 is a power of two >= 128,
    // so the grid size divides evenly and every FFT element is initialized.
    fftIn_kernel<<< (fftLen/2)/128, 128 >>> (g_fftIn, g_intArray, Nlen);
    // Kernel launches do not return errors directly -- check explicitly so a bad
    // launch configuration doesn't surface as a mysterious failure later.
    cutilSafeCall(cudaGetLastError());
    // Create the FFT plan
    cufftSafeCall(cufftPlan1d(&plan, fftLen, CUFFT_Z2Z, 1));
    // Do the forward FFT
    cufftSafeCall(cufftExecZ2Z(plan, g_fftIn, g_fftOut, CUFFT_FORWARD));
    // Invoke kernel to square the transformed values point-wise
    mul_kernel<<< fftLen/128,128 >>> (g_fftOut);
    cutilSafeCall(cudaGetLastError()); // catch launch errors for this kernel too
    // Do the reverse FFT (from 'out' back to 'in')
    cufftSafeCall(cufftExecZ2Z(plan, g_fftOut, g_fftIn, CUFFT_INVERSE));
    // Allocate host memory for the result (UInt32, 2*Nlen)
    a = (UInt32 *)myAlloc(sizeof(*a)*2*Nlen);
    // Allocate host memory for the raw FFT output and copy the result from the GPU.
    // (This cudaMemcpy is synchronous, so it also surfaces any asynchronous kernel errors.)
    fftOut = (cufftDoubleComplex *)myAlloc(sizeof(*fftOut)*fftLen);
    cutilSafeCall(cudaMemcpy(fftOut, g_fftIn, sizeof(*fftOut)*fftLen, cudaMemcpyDeviceToHost));
    // Now extract the result from the array of complex values.
    // Slower than doing this on the GPU, but much easier on the host because of
    // carry propagation.  It's only run log n times, so it should be insignificant.
    // cuFFT's inverse transform is unnormalized, hence the division by fftLen.
    for(i = 0, carry = 0; i < 2*Nlen; i++)
    {
        l = (UInt64) carry + (UInt64) round(fftOut[i*2].x / (double) fftLen) + (((UInt64) round(fftOut[i*2+1].x / (double) fftLen)) << 16);
        a[i] = (UInt32) l;           // lower 32 bits
        carry = (UInt32) (l >> 32);  // upper 32 bits carry into the next word
    }
    // Free the 3 device memory arrays
    cutilSafeCall(cudaFree((char *)g_fftIn));
    cutilSafeCall(cudaFree((char *)g_fftOut));
    cutilSafeCall(cudaFree((char *)g_intArray));
    myFree((void*)fftOut);
    // Destroy the FFT plan
    cufftSafeCall(cufftDestroy(plan));
    return a;
}
void check(const Int32 b, const UInt32 m, char *expectedResidue)
{
~~snip~~
if (read_checkpoint(b, m, z, &startTime, &iStart, &Nlen, &iOrigStart))
{
fprintf(STDOUT, "\rResuming %d^%u+1 from a checkpoint (%d iterations left)\n", b, m, iStart); //1.00
fflush(STDOUT);
}
else
{
Nlen = 1;
Na = (UInt32 *)myAlloc(1 * sizeof(UInt32));
Na[0] = (UInt32)b;
for (j = m; j != 1; j /= 2)
{
a = computeSquare(Na, Nlen); // compute_square will allocate "a"
myFree(Na);
Na = a;
Nlen *= 2;
if (Na[Nlen - 1] == 0) --Nlen;
}
iStart = iOrigStart = lg(Na, Nlen) - 1;
fprintf(STDOUT, "%sTesting %d^%u+1... %d steps to go ", ISBOINC ? "\n" : "\r", b, m, iStart);
fflush(STDOUT);
}
For 1248^4194304+1, this change reduces the initialization time, where it's calculating the value of b^N, from 2 hours down to a small fraction of one second. Full source and Windows executables will be available shortly. |
|
|
|
|
|
#123 |
|
Dec 2011
New York, U.S.A.
1418 Posts |
Source and executable can be downloaded here: GeneferCUDA-boinc.1.03beta.7z
I expected this to be released into production on PrimeGrid's Boinc server, but I want to do more testing first. This source is 100% compatible with Shoichiro's code up through Shoichiro's v1.049. Modifications from 1.050 onward have NOT been incorporated into my source. It can operate under Boinc, under PRPNet, or standalone. It's currently built for windows, but there's only a handful of places where the code is incompatible with Linux. (Setting Windows thread priority and variable arguments in a logging function are the two things I remember.) |
|
|
|
|
|
#124 | |
|
Jun 2003
32·5·113 Posts |
Quote:
Doesn't the computesquare call need BOINC_START_GPU/END_GPU? |
|
|
|
|
|
|
#125 | ||
|
Dec 2011
New York, U.S.A.
9710 Posts |
Quote:
The extra information in the checkpoint file doesn't make that much of a difference. I'm doing that calculation using each element as a 16 bit unsigned integer, rather than 32 bits because if you square two 32 bit ints, your result is 64 bits long -- which will NOT fit in a 64 bit float since the float uses about 10 bits for the two signs and the exponent. However, this made the FFT have twice as many elements, which is itself causing problems. Quote:
BTW, I'm going to pull that build off the website until I find a fix for the problem. At N=4194304, it would produce incorrect results without any indication that the result is wrong. If anyone wants that source, let me know. |
||
|
|
|
|
|
#126 | |
|
Jun 2003
32·5·113 Posts |
Quote:
Also, you can gain a little bit more by dividing out all the twos from the 'b'. Eg:- 1248^4194304 = 39^4194304 * (2^5)^4194304 (1248 = 39*2^5). So you only need to do the FFT to calculate 39^4194304, and multiplication by (2^5)^4194304 is just shifting. Definitely getting more complex, isn't it? Well, good luck.
Last fiddled with by axn on 2012-01-10 at 13:08 |
|
|
|
|
|
|
#127 |
|
Jul 2009
Tokyo
2·5·61 Posts |
Hi ,
I retry "Reduce CPU Time". Code:
if(n1 > 16384) ithreads = 128;
else ithreads = 16;
for (i = iStart; i != 0; --i)
{
FFTsquareFFT(z, n1);
bt = Na[i / (8 * sizeof(UInt32))] >> (i % (8 * sizeof(UInt32))) & 1;
FFTnextStepGFN(b,invBase,(bt == 0) ? t1 : t2,n1,t3,ithreads,SHIFT);
if(i % 0xF == 0) cutilSafeCall(cudaMemcpy(l_maxErr,g_maxErr,sizeof(float),cudaMemcpyDeviceToHost));
if (!(i & 0xffff))
{
printf("\rTesting %d^%u+1... %d steps to go ", b, m, i);
fflush(stdout);
cutilSafeCall(cudaMemcpy(l_maxErr,g_maxErr,sizeof(float),cudaMemcpyDeviceToHost)); //1.00
Code:
24518^262144+1 is a probable prime. (1150678 digits) (err = 0.0002) (time = 1:27:48) 08:02:55 2347.29user 2912.59system 1:27:47elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k 336inputs+238616outputs (0major+13265minor)pagefaults 0swaps Code:
24518^262144+1 is a probable prime. (1150678 digits) (err = 0.0002) (time = 1:29:02) 06:35:07 215.20user 25.99system 1:29:02elapsed 4%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+238528outputs (0major+11867minor)pagefaults 0swaps |
|
|
|
|
|
#128 |
|
Apr 2010
Over the rainbow
2·1,303 Posts |
a 1% increase in time in exchange of an almost free cpu? nice
|
|
|
|
|
|
#129 |
|
Dec 2011
New York, U.S.A.
97 Posts |
It's actually a heck of a lot simpler than I imagined. First time I looked at this my first thought was "It just can not be that simple. I have to be missing something." I was very surprised when the right numbers started coming out. It's very much like making a smoothie in a blender, reversing the power plug, and having whole fruit emerge. :)
Thanks for the advice, and the luck! |
|
|
|
|
|
#130 |
|
Dec 2011
New York, U.S.A.
11000012 Posts |
About 4 different ideas later, I think I've fixed 1.03. I'm going to test it more thoroughly and THEN post the code.
|
|
|
|
|
|
#131 |
|
Dec 2011
New York, U.S.A.
6116 Posts |
geneferCUDA-boinc.1.03a.7z
This has working code for the fast initialization. In the end, I sacrificed speed for precision. It's a whopping one and a half seconds slower than 1.03, but it has the slight advantage of actually working. I reduced the FFT element size to 8 bits, down from 16, which gave me tons of room to avoid rounding errors, but did push the FFT size above 8 million. I guess it's a good thing GPUs are pretty fast! |
|
|
|
|
|
#132 |
|
Jul 2009
Tokyo
2×5×61 Posts |
Linux needs the #127 patch because of the 100% CPU time issue.
Windows does not need the #127 patch. |
|
|
|
![]() |
| Thread Tools | |
Similar Threads
|
||||
| Thread | Thread Starter | Forum | Replies | Last Post |
| Genefer's FFT applied to Mersenne squaring | preda | Software | 0 | 2017-09-06 02:54 |
| CUDA 5.5 | ET_ | GPU Computing | 2 | 2013-06-13 15:50 |
| AVX CPU LL vs CUDA LL | nucleon | GPU Computing | 11 | 2012-01-04 17:52 |
| Best CUDA GPU for the $$ | Christenson | GPU Computing | 24 | 2011-05-01 00:06 |
| CUDA? | Xentar | Conjectures 'R Us | 6 | 2010-03-31 07:43 |