![]() |
![]() |
#1 |
(loop (#_fork))
Feb 2006
Cambridge, England
2·7·461 Posts |
![]()
The first google hit for 'CUDA sieve' provides something that doesn't compile and, when tweaked to compile, gives wrong answers. This disturbed my tranquility, so I determined to write something that worked and gave correct answers, albeit more slowly than a stunned sloth crawling backwards through treacle (3.3 seconds on a 9600GT to find the 5761455 primes <10^8). If this causes pstach to reappear, curse me roundly and give an alternative implementation of eleven thousand times the speed, I won't complain :)
Makefile: Code:
# Paths to the CUDA toolkit and the cutil helper library from the SDK.
NVCC=/usr/local/cuda64/cuda/bin/nvcc
CUTILI=/usr/local/cuda64/cuda-sdk/common/inc
CUTILL=/usr/local/cuda64/cuda-sdk/lib/libcutil.a

# Build both an optimised and a debug (-g -G) binary for each program.
EXE=erato
EXED=$(patsubst %,%-d,$(EXE))

all: $(EXE) $(EXED)

%: %.cu
	$(NVCC) -I $(CUTILI) -o $@ $< $(CUTILL)

%-d: %.cu
	$(NVCC) -g -G -I $(CUTILI) -o $@ $< $(CUTILL)
Code:
// *-* c-mode *-*
#include <stdio.h>
#include <stdlib.h>   // fix: malloc/exit were used without a declaration
#include <math.h>     // fix: sqrt was used without a declaration
#include <cutil_inline.h>

// Max threads per block dimension of the chosen device; set by InitCUDA().
int maxT = 0;

// Cross off composites in the big sieve: thread idx handles primes[idx],
// writing that prime into every multiple from p*p upward.  For odd primes
// the stride is 2p (only odd multiples need touching); for p == 2 it is 2.
// Slots never written keep the cudaMemset(51) pattern 0x33333333, so a slot
// still equal to 0x33333333 afterwards marks a prime.
__global__ static void Sieve(int *sieve, int *primes, int maxp, int sieve_size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < maxp) {
        int ithPrime = primes[idx];
        int delta = (ithPrime == 2) ? ithPrime : 2 * ithPrime;
        for (int i = ithPrime * ithPrime; i < sieve_size; i += delta)
            sieve[i] = ithPrime;
    }
}

// Classify each index by trial division: sieve[idx] = 0 if idx is prime,
// 1 otherwise.  One thread per candidate; fine for the small range < sqrt(N).
__global__ static void Individual(int *sieve, int nloc)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < nloc) {
        sieve[idx] = 0;
        if (idx == 0 || idx == 1) sieve[idx] = 1;   // 0 and 1 are not prime
        if (idx == 2 || idx == 3) sieve[idx] = 0;
        int b = 2;
        while (b * b <= idx) {
            // fix: stop at the first factor instead of dividing all the way up
            if (idx % b == 0) { sieve[idx] = 1; break; }
            b++;
        }
    }
}

// Select the first device with compute capability >= 1.0, print its
// properties, and record maxThreadsDim[0] in maxT.  Returns false if no
// usable device exists.
bool InitCUDA(void)
{
    int count = 0;
    int i = 0;
    cudaGetDeviceCount(&count);
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                printf("Device %d supports CUDA %d.%d\n", i, prop.major, prop.minor);
                printf("It has warp size %d, %d regs per block, %d threads per block\n",
                       prop.warpSize, prop.regsPerBlock, prop.maxThreadsPerBlock);
                printf("max Threads %d x %d x %d\n", prop.maxThreadsDim[0],
                       prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
                maxT = prop.maxThreadsDim[0];
                printf("max Grid %d x %d x %d\n", prop.maxGridSize[0],
                       prop.maxGridSize[1], prop.maxGridSize[2]);
                printf("total constant memory %d\n", prop.totalConstMem);
                break;
            }
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}

// Sieve of Eratosthenes driver: find the primes below sqrt(N) by trial
// division on the device, then use them to sieve [0, N) and count survivors.
int main(int argc, char **argv)
{
    const int N = 100000000;            // sieve limit
    int Nsmall = (int)sqrt((double)N);  // only primes up to sqrt(N) are needed
    int *device_small_sieve, *device_big_sieve;

    if (!InitCUDA()) {
        return 0;
    }

    CUDA_SAFE_CALL(cudaMalloc((void **)&device_small_sieve, sizeof(int) * Nsmall));
    CUDA_SAFE_CALL(cudaMemset(device_small_sieve, 51, sizeof(int) * Nsmall));

    unsigned int shandle;
    cutCreateTimer(&shandle);
    cutStartTimer(shandle);
    Individual<<<maxT, (Nsmall + maxT - 1) / maxT, 0>>>(device_small_sieve, Nsmall);
    CUDA_SAFE_CALL(cudaThreadSynchronize());
    cutStopTimer(shandle);
    printf("%f milliseconds for small primes\n", cutGetTimerValue(shandle));

    int *host_copy_of_smallsieve = (int *)malloc(Nsmall * sizeof(int));
    if (host_copy_of_smallsieve == NULL) {   // fix: check allocation
        fprintf(stderr, "out of host memory\n");
        exit(1);
    }
    printf("%p\n", host_copy_of_smallsieve);
    fflush(stdout);
    CUDA_SAFE_CALL(cudaMemcpy(host_copy_of_smallsieve, device_small_sieve,
                              sizeof(int) * Nsmall, cudaMemcpyDeviceToHost));
    printf("%p\n", host_copy_of_smallsieve);
    fflush(stdout);

    // OK. We've got an array with 0 at the small primes.
    // Count them and sanity-check that only 0/1 values came back.
    int np = 0, test = 0;
    for (int k = 0; k < Nsmall; k++) {
        test |= host_copy_of_smallsieve[k];
        np += 1 - host_copy_of_smallsieve[k];
    }
    if (test != 1) {
        printf("Something impossible just happened\n\n");
        exit(1);
    }
    printf("%d small primes (< %d) found\n", np, Nsmall);

    // Gather the small primes into a compact list ...
    int *primes = (int *)malloc(np * sizeof(int));
    if (primes == NULL) {   // fix: check allocation
        fprintf(stderr, "out of host memory\n");
        exit(1);
    }
    int pptr = 0;
    for (int k = 0; k < Nsmall; k++)
        if (host_copy_of_smallsieve[k] == 0)
            primes[pptr++] = k;

    // ... except that we needed the primes on the device.
    int *device_primes;
    CUDA_SAFE_CALL(cudaMalloc((void **)&device_primes, sizeof(int) * np));
    CUDA_SAFE_CALL(cudaMemcpy(device_primes, primes, sizeof(int) * np,
                              cudaMemcpyHostToDevice));

    // Now set up the big array on the device.
    CUDA_SAFE_CALL(cudaMalloc((void **)&device_big_sieve, sizeof(int) * N));
    CUDA_SAFE_CALL(cudaMemset(device_big_sieve, 51, sizeof(int) * N));

    unsigned int thandle;
    cutCreateTimer(&thandle);
    cutStartTimer(thandle);
    Sieve<<<maxT, (np + maxT - 1) / maxT, 0>>>(device_big_sieve, device_primes, np, N);
    CUDA_SAFE_CALL(cudaThreadSynchronize());   // fix: was unchecked
    cutStopTimer(thandle);
    printf("%f milliseconds for big sieve\n", cutGetTimerValue(thandle));

    int *host_sieve = (int *)malloc(N * sizeof(int));
    if (host_sieve == NULL) {   // fix: 400 MB allocation was never checked
        fprintf(stderr, "out of host memory\n");
        exit(1);
    }
    CUDA_SAFE_CALL(cudaMemcpy(host_sieve, device_big_sieve, sizeof(int) * N,
                              cudaMemcpyDeviceToHost));   // fix: was unchecked
    cudaFree(device_big_sieve);
    cudaFree(device_small_sieve);
    cudaFree(device_primes);

    // Untouched slots still hold the memset(51) pattern 0x33333333 -> prime.
    int nbig = 0;
    for (int i = 2; i < N; ++i)
        if (host_sieve[i] == 0x33333333)
            nbig++;
    printf("%d big primes (< %d) found\n", nbig, N);

    // fix: release host buffers before exiting
    free(host_copy_of_smallsieve);
    free(primes);
    free(host_sieve);
    return 0;
}
Last fiddled with by fivemack on 2009-05-21 at 14:09 |
![]() |
![]() |
![]() |
#2 |
Sep 2008
Kansas
3834₁₀ Posts |
![]()
I saw on one of the nVidia forums an integer divide uses 140 clock cycles. It is better to use bit-wise shifts whenever possible. (Some compiler optimizations may do that for you.)
|
![]() |
![]() |
![]() |
#3 |
Tribal Bullet
Oct 2004
2×13×137 Posts |
![]()
Integer division is not on the critical path for a properly written prime sieve, concurrent byte-wide memory access is.
|
![]() |
![]() |
![]() |
#4 |
Jan 2008
France
596₁₀ Posts |
![]()
I think Tom's program was a joke, in fact I'd be willing to bet a beer it was
![]() |
![]() |
![]() |
![]() |
#5 |
(loop (#_fork))
Feb 2006
Cambridge, England
2·7·461 Posts |
![]()
It wasn't entirely a joke; I was wondering whether, in the finest traditions of Usenet where posting a bad answer is the best way to get a good answer, if I posted a lousy CUDA program I'd get useful advice as to how to write a less lousy one. It seems I posted too lousy a program, or am perhaps the only person with CUDA set up (and it was on a machine at work which has since been rebuilt ...) who reads the forum regularly.
I'm actually sieving in the Sieve<<<>>> kernel call; used trial division in the step to get the primes up to sqrt(N) because I found that it was quicker than sieving while you were dealing with very small primes. I'm well aware the algorithm I'm using is stupid, I tried a few techniques to make it less stupid which made things significantly worse. |
![]() |
![]() |
![]() |
#6 |
Bamboozled!
"Paul"
May 2003
Down not across
11729₁₀ Posts |
![]() |
![]() |
![]() |
![]() |
#7 |
Jul 2003
So Cal
2621₁₀ Posts |
![]() |
![]() |
![]() |
![]() |
#8 |
(loop (#_fork))
Feb 2006
Cambridge, England
2×7×461 Posts |
![]()
This may be a rude question, but have you actually written much code on your CUDA setups? Google finds me lots of people who have run CUFFT and mention how large the gigaflops numbers that come out are, a few people doing the same for linpack, and the developers of some Russian GIS software which uses CUDA for all the heavy lifting, but nothing which looks like a particularly vibrant CUDA-writing community.
There are lots of papers full of what after a while read like the same commonplaces on how to write good CUDA code, but I've only really seen the one nVidia conference presentation where they take slow code and demonstrate how it may be changed into fast code, and there are several steps there that I don't quite follow and several others where I can't see how they could be made relevant for sieving. |
![]() |
![]() |
![]() |
#9 |
Jul 2003
So Cal
A3D₁₆ Posts |
![]()
I've written just a few simple programs. Besides the obvious things like have lots of threads to mask latencies and access global memory as little as possible, getting the best performance really seems to be trial and error. I was involved in the integration of distributed.net's RC5 CUDA core. I found that seemingly trivial changes like changing an always positive integer from signed to unsigned, and assigning blockDim.x, blockIdx.x, and threadIdx.x to an integer before doing ANY math made huge differences in the speed of the code. In fact, just the move to CUDA 2.2 has halved the speed of the RC5 code. I recently asked why on the nVidia forums but haven't gotten any replies.
|
![]() |
![]() |
![]() |
#10 |
Jul 2003
So Cal
2,621 Posts |
![]()
OK, I'll bite. I'm testing on a 9600GSO. The card didn't have enough memory to run your code as written. I got an error when trying to allocate the big sieve array. I modified the code to use unsigned chars rather than ints and only store 0 or 1 instead of the prime itself. Running that version gave
Code:
Device 0 supports CUDA 1.1 It has warp size 32, 8192 regs per block, 512 threads per block max Threads 512 x 512 x 64 max Grid 65535 x 65535 x 1 total constant memory 65536 0.433000 milliseconds for small primes 0x2588530 0x2588530 1229 small primes (< 10000) found 4743.630859 milliseconds for big sieve 5761455 big primes (< 100000000) found Code:
Device 0 supports CUDA 1.1 It has warp size 32, 8192 regs per block, 512 threads per block max Threads 512 x 512 x 64 max Grid 65535 x 65535 x 1 total constant memory 65536 0.418000 milliseconds for small primes 0x8c3020 0x8c3020 1229 small primes (< 10000) found 1676.081055 milliseconds for big sieve 5761455 big primes (< 100000000) found Here's the modified code: Code:
#include <stdio.h>
#include <cutil_inline.h>

typedef unsigned char u8;

// Max threads per block dimension of the selected device; set by InitCUDA().
int maxT = 0;

// One thread per small prime: thread tid crosses off the odd multiples of
// primes[tid] in a packed odd-only sieve where slot j stands for 2*j+1.
__global__ static void Sieve(u8 *sieve, int *primes, int maxp, int sieve_size)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid > 0 && tid < maxp) {   // tid 0 is the prime 2: no odd multiples
        int p = primes[tid];
        // First odd multiple worth crossing off is 3p, i.e. slot (3p)>>1;
        // successive odd multiples are p slots apart.
        for (int j = (3 * p) >> 1; j < sieve_size; j += p)
            sieve[j] = 1;
    }
}

// Classify each candidate by trial division: sieve[tid] = 0 for primes,
// 1 for composites.  Only used for the small range below sqrt(N).
__global__ static void Individual(u8 *sieve, int nloc)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= nloc)
        return;
    sieve[tid] = 0;
    if (tid < 2) sieve[tid] = 1;               // 0 and 1 are not prime
    if (tid == 2 || tid == 3) sieve[tid] = 0;
    for (int d = 2; d * d <= tid; d++) {
        if (tid % d == 0) {
            sieve[tid] = 1;
            break;                             // first factor settles it
        }
    }
}

// Pick the first device with compute capability >= 1.0, report its
// properties, and remember maxThreadsDim[0] in maxT.
bool InitCUDA(void)
{
    int ndev = 0;
    cudaGetDeviceCount(&ndev);
    if (ndev == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int dev;
    for (dev = 0; dev < ndev; dev++) {
        cudaDeviceProp props;
        if (cudaGetDeviceProperties(&props, dev) != cudaSuccess)
            continue;
        if (props.major < 1)
            continue;
        printf("Device %d supports CUDA %d.%d\n", dev, props.major, props.minor);
        printf("It has warp size %d, %d regs per block, %d threads per block\n",
               props.warpSize, props.regsPerBlock, props.maxThreadsPerBlock);
        printf("max Threads %d x %d x %d\n", props.maxThreadsDim[0],
               props.maxThreadsDim[1], props.maxThreadsDim[2]);
        maxT = props.maxThreadsDim[0];
        printf("max Grid %d x %d x %d\n", props.maxGridSize[0],
               props.maxGridSize[1], props.maxGridSize[2]);
        printf("total constant memory %d\n", props.totalConstMem);
        break;
    }
    if (dev == ndev) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(dev);
    return true;
}

// Byte-per-odd-number sieve of Eratosthenes: trial division finds the primes
// below sqrt(N); those then sieve the odd numbers up to N.
int main(int argc, char **argv)
{
    const int N = 100000000;
    int Nsmall = sqrt(N);
    u8 *device_small_sieve, *device_big_sieve;

    if (!InitCUDA())
        return 0;

    // Small flag array over 0..Nsmall-1: 0 = prime, 1 = composite.
    CUDA_SAFE_CALL(cudaMalloc((void **)&device_small_sieve, sizeof(u8) * Nsmall));
    CUDA_SAFE_CALL(cudaMemset(device_small_sieve, 0, sizeof(u8) * Nsmall));

    unsigned int shandle;
    cutCreateTimer(&shandle);
    cutStartTimer(shandle);
    Individual<<<maxT, (Nsmall + maxT - 1) / maxT, 0>>>(device_small_sieve, Nsmall);
    CUDA_SAFE_CALL(cudaThreadSynchronize());
    cutStopTimer(shandle);
    printf("%f milliseconds for small primes\n", cutGetTimerValue(shandle));

    u8 *small_flags = (u8 *)malloc(Nsmall * sizeof(u8));
    printf("%p\n", small_flags);
    fflush(stdout);
    CUDA_SAFE_CALL(cudaMemcpy(small_flags, device_small_sieve,
                              sizeof(u8) * Nsmall, cudaMemcpyDeviceToHost));
    printf("%p\n", small_flags);
    fflush(stdout);

    // Count the zero entries (primes) and sanity-check only 0/1 came back.
    int np = 0, test = 0;
    for (int k = 0; k < Nsmall; k++) {
        test |= small_flags[k];
        np += 1 - small_flags[k];
    }
    if (test != 1) {
        printf("Something impossible just happened\n\n");
        exit(1);
    }
    printf("%d small primes (< %d) found\n", np, Nsmall);

    // Compact the small primes into a list and ship it to the device.
    int *primes = (int *)malloc(np * sizeof(int));
    int w = 0;
    for (int k = 0; k < Nsmall; k++)
        if (small_flags[k] == 0)
            primes[w++] = k;

    int *device_primes;
    CUDA_SAFE_CALL(cudaMalloc((void **)&device_primes, sizeof(int) * np));
    CUDA_SAFE_CALL(cudaMemcpy(device_primes, primes, sizeof(int) * np,
                              cudaMemcpyHostToDevice));

    // Odd-only big sieve: N/2 bytes, slot j representing 2*j+1 (slot 0: the
    // prime 2, which nothing ever marks).
    CUDA_SAFE_CALL(cudaMalloc((void **)&device_big_sieve, sizeof(u8) * N / 2));
    CUDA_SAFE_CALL(cudaMemset(device_big_sieve, 0, sizeof(u8) * N / 2));

    unsigned int thandle;
    cutCreateTimer(&thandle);
    cutStartTimer(thandle);
    Sieve<<<maxT, (np + maxT - 1) / maxT, 0>>>(device_big_sieve, device_primes, np, N / 2);
    cudaThreadSynchronize();
    cutStopTimer(thandle);
    printf("%f milliseconds for big sieve\n", cutGetTimerValue(thandle));

    u8 *host_sieve = (u8 *)malloc(N / 2 * sizeof(u8));
    cudaMemcpy(host_sieve, device_big_sieve, sizeof(u8) * N / 2,
               cudaMemcpyDeviceToHost);
    cudaFree(device_big_sieve);
    cudaFree(device_small_sieve);
    cudaFree(device_primes);

    // Slots still zero were never crossed off: they are the primes.
    int nbig = 0;
    for (int i = 0; i < N / 2; ++i)
        if (host_sieve[i] == 0)
            nbig++;   // printf("%d\n",(i==0)?2:2*i+1); to list them
    printf("%d big primes (< %d) found\n", nbig, N);
    return 0;
}
|
![]() |
![]() |
![]() |
#11 | |
Bamboozled!
"Paul"
May 2003
Down not across
37·317 Posts |
![]() Quote:
When I get some time I plan on porting Ernst's code for trial factoring of the larger MM numbers. Unfortunately, I'm too busy with Real Work(tm). Paul |
|
![]() |
![]() |
![]() |
Thread Tools | |
![]() |
||||
Thread | Thread Starter | Forum | Replies | Last Post |
mfaktc: a CUDA program for Mersenne prefactoring | TheJudger | GPU Computing | 3625 | 2023-03-30 00:08 |
The P-1 factoring CUDA program | firejuggler | GPU Computing | 753 | 2020-12-12 18:07 |
End of the world as we know it (in music) | firejuggler | Lounge | 3 | 2012-12-22 01:43 |
World Cup Soccer | davieddy | Hobbies | 111 | 2011-05-28 19:21 |
World's dumbest CUDA program? | xilman | Programming | 1 | 2009-11-16 10:26 |