![]() |
taking modulo for fixed m - source code
Hi,
Can you run this proggie and tell me which times you clock at your specific processor? I have to rely upon handclocked times here regrettably gmp-powersavings mode of laptop it seems. Sorry for the hastely coding of it. Don't try to code like this at home! [code] /* This file implements and tests speed of a faster modulo for fixed m, maybe. Using GMP to benchmark against the fast REDC and existing implementations Positively faster above REDC boundaries it seems when handclocking times... Author: Vincent Diepeveen email: diep at xs4all dot nl note: programmed at my macbookpro with os/x seems including header files of gmp is a problem there. I compile it with: /gmp/> cat makedm #!/bin/sh gcc -O2 -o dm diepmod.c /gmp/gmp-4.2.2/.libs/libgmp.a /gmp/> */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> #include <time.h> #include "gmp-4.2.2/gmp.h" // perhaps remove directory name at linux // as i'm os/x at home #define UNIX 1 // windoze boys have to fill in here what they are gmp_randstate_t randomstate; int GetClock(void) { #if UNIX //return(clock()); struct timeval timeval; gettimeofday(&timeval,NULL); return((int)(timeval.tv_sec*1000+(timeval.tv_usec/1000))); #else return((int)GetTickCount()); #endif } int main(void) { mpz_t p,q,n,M,t,res1,res2,res2a,res2b,res2c,s,pow2i,squaredM; int sizen,sizep,sizeq,sizeM,endcompare,t1,t2,i,testlen,coll; gmp_randinit_mt(randomstate); printf(" size of 1 limb = %i bytes\n", (int)sizeof(mp_limb_t)); /* in RSA we have n = p * q where p,q are both large 'industry grade' primes, therefore we have an odd n and we have message M that we want to encode with a big E, C = M ^ E (mod n) Now we repeat this for a lot of texts M of course. So the interesting thing is to take a faster modulo n, where n is a constant throughout the entire encryption of the entire data that we want to encode. Focus of this file is just generating a method to take that modulo faster than O ( n ^ 2 ) which is what REDC and division do currently in GMP. Let's first generate therefore p,q n = p.q The modulo idea explained: */ mpz_init(p); mpz_init(q); mpz_init(n); mpz_init(M); mpz_init(res1); mpz_init(res2); mpz_init(res2a); mpz_init(res2b); mpz_init(res2c); mpz_init(s); mpz_init(pow2i); mpz_init(t); mpz_init(squaredM); // thx rogue // results we have we collect as a summation in res1 and res2 mpz_init(res1); mpz_init(res2); testlen = 1024*1024; sizen = 128; // let's start with 128 bits to see whether it works bugfree for( sizen = 128; sizen <= 16384; sizen *= 2 ) { testlen >>= 1; sizep = (sizen * 55) / 100; // set sizep at 55% size of n sizeq = sizen - sizep; // sizeq is 45% of n sizeM = sizen - 2; // our message is 2 bits smaller than n // let's generate randomly p,q and then set it to first prime mpz_urandomb(p,randomstate,sizep); mpz_nextprime(p,p); mpz_urandomb(q,randomstate,sizeq); mpz_nextprime(q,q); mpz_mul(n,p,q); // n = p.q // and just like in RSA our n is hard to factorize // well, that is, for most of us // precalculate s = 2^2i / n // where 'i' is the number of bits of n mpz_set_ui(pow2i,1); mpz_mul_2exp(pow2i,pow2i,sizen+sizen); mpz_fdiv_q(s,pow2i,n); mpz_urandomb(M,randomstate,sizeM); // now make sure M is odd mpz_setbit(M,0); // square M mpz_mul(M,M,M); mpz_set(squaredM,M); // the correct GMP calculation // M = res1 (mod n) t1 = GetClock(); for( i = 0; i < testlen; i++ ) { mpz_add_ui(squaredM,squaredM,2); mpz_mod(res1,squaredM,n); } t2 = GetClock(); coll = t2-t1; printf("bits = %i time original code = %.03lf seconds\n", sizen,((double)coll)/1000.00); fflush(stdout); // get me the quotient also //mpz_fdiv_q(t,M,n); // the Diep approximation calculation // M = res2 (mod n) ; // res2 >>= sizen ; rightshift res2 mpz_set(squaredM,M); t1 = GetClock(); for( i = 0; i < testlen; i++ ) { mpz_add_ui(squaredM,squaredM,2); mpz_mul(res2a,squaredM,s); mpz_fdiv_q_2exp(res2b,res2a,sizen+sizen); //printf(" quotient benadering = "); //mpz_out_str(stdout,10,res2); //printf("\n"); mpz_mul(res2c,res2b,n); mpz_sub(res2,squaredM,res2c); for( ;; ) { int intres = mpz_cmp(res2,n); if( intres >= 0 ) { //printf("needed 1 correction\n"); mpz_sub(res2,res2,n); } else { break; } } } t2 = GetClock(); coll = t2-t1; printf("bits = %i time Diep Reduction = %.03lf seconds\n", sizen,((double)coll)/1000.00); fflush(stdout); // above is our mpz_diepmod(res2,M,n,s); endcompare = mpz_cmp(res1,res2); if( endcompare == 0 ) printf("both modulo's were equal and correct therefore\n"); else printf("modulo has an incorrect outcome\n"); fflush(stdout); } return 0; } [/code] |
You have one bug. You need to add mpz_init(squaredM) before you use it.
BTW, here are some timings on a 64-bit PPC G5 CPU: size of 1 limb = 8 bytes bits = 128 time original code = 0.161 seconds bits = 128 time Diep Reduction = 0.205 seconds both modulo's were equal and correct therefore bits = 256 time original code = 0.162 seconds bits = 256 time Diep Reduction = 0.134 seconds both modulo's were equal and correct therefore bits = 512 time original code = 0.109 seconds bits = 512 time Diep Reduction = 0.164 seconds both modulo's were equal and correct therefore bits = 1024 time original code = 0.208 seconds bits = 1024 time Diep Reduction = 0.474 seconds both modulo's were equal and correct therefore bits = 2048 time original code = 0.339 seconds bits = 2048 time Diep Reduction = 0.685 seconds both modulo's were equal and correct therefore bits = 4096 time original code = 0.282 seconds bits = 4096 time Diep Reduction = 0.748 seconds both modulo's were equal and correct therefore bits = 8192 time original code = 0.740 seconds bits = 8192 time Diep Reduction = 1.407 seconds both modulo's were equal and correct therefore |
Thanks,
And if you handclock the last times instead of rely upon the GMP library time? |
Here's another timing run, on a Athlon 64 X2 4200+.
I added a timer showing why one might think the gmp code is so slow: the precalculation starts to dominate the run time. The timings for the two reduction routines here (and previously) should be correct. - ben. [code] size of 1 limb = 4 bytes precalculation stuff took = 0.0000 seconds. bits = 128 time original code = 0.2970 seconds. bits = 128 time Diep Reduction = 0.1560 seconds. both modulo's were equal and correct therefore precalculation stuff took = 0.0000 seconds. bits = 256 time original code = 0.1720 seconds. bits = 256 time Diep Reduction = 0.1560 seconds. both modulo's were equal and correct therefore precalculation stuff took = 0.0160 seconds. bits = 512 time original code = 0.1560 seconds. bits = 512 time Diep Reduction = 0.2190 seconds. both modulo's were equal and correct therefore precalculation stuff took = 0.0470 seconds. bits = 1024 time original code = 0.1870 seconds. bits = 1024 time Diep Reduction = 0.3750 seconds. both modulo's were equal and correct therefore precalculation stuff took = 0.3600 seconds. bits = 2048 time original code = 0.3120 seconds. bits = 2048 time Diep Reduction = 0.5630 seconds. both modulo's were equal and correct therefore precalculation stuff took = 12.7190 seconds. bits = 4096 time original code = 0.4680 seconds. bits = 4096 time Diep Reduction = 0.8440 seconds. both modulo's were equal and correct therefore precalculation stuff took = 172.8750 seconds. bits = 8192 time original code = 0.7660 seconds. bits = 8192 time Diep Reduction = 1.2180 seconds. both modulo's were equal and correct therefore [/code] [code] /* This file implements and tests speed of a faster modulo for fixed m, maybe. Using GMP to benchmark against the fast REDC and existing implementations Positively faster above REDC boundaries it seems when handclocking times... Author: Vincent Diepeveen email: diep at xs4all dot nl note: programmed at my macbookpro with os/x seems including header files of gmp is a problem there. I compile it with: /gmp/> cat makedm #!/bin/sh gcc -O2 -o dm diepmod.c /gmp/gmp-4.2.2/.libs/libgmp.a /gmp/> */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> #include <time.h> #include "../gmp-4.2.1/gmp.h" // perhaps remove directory name at linux // as i'm os/x at home #define UNIX 1 // windoze boys have to fill in here what they are gmp_randstate_t randomstate; int GetClock(void) { #if UNIX //return(clock()); struct timeval timeval; gettimeofday(&timeval,NULL); return((int)(timeval.tv_sec*1000+(timeval.tv_usec/1000))); #else return((int)GetTickCount()); #endif } int main(void) { mpz_t p,q,n,M,t,res1,res2,res2a,res2b,res2c,s,pow2i,squaredM; int sizen,sizep,sizeq,sizeM,endcompare,t1,t2,i,testlen,coll; gmp_randinit_mt(randomstate); clock_t start, stop; double tt; printf(" size of 1 limb = %i bytes\n", (int)sizeof(mp_limb_t)); /* in RSA we have n = p * q where p,q are both large 'industry grade' primes, therefore we have an odd n and we have message M that we want to encode with a big E, C = M ^ E (mod n) Now we repeat this for a lot of texts M of course. So the interesting thing is to take a faster modulo n, where n is a constant throughout the entire encryption of the entire data that we want to encode. Focus of this file is just generating a method to take that modulo faster than O ( n ^ 2 ) which is what REDC and division do currently in GMP. Let's first generate therefore p,q n = p.q The modulo idea explained: */ mpz_init(p); mpz_init(q); mpz_init(n); mpz_init(M); mpz_init(res1); mpz_init(res2); mpz_init(res2a); mpz_init(res2b); mpz_init(res2c); mpz_init(s); mpz_init(pow2i); mpz_init(t); mpz_init(squaredM); // thx rogue // results we have we collect as a summation in res1 and res2 mpz_init(res1); mpz_init(res2); testlen = 1024*1024; sizen = 128; // let's start with 128 bits to see whether it works bugfree for( sizen = 128; sizen <= 16384; sizen *= 2 ) { start = clock(); testlen >>= 1; sizep = (sizen * 55) / 100; // set sizep at 55% size of n sizeq = sizen - sizep; // sizeq is 45% of n sizeM = sizen - 2; // our message is 2 bits smaller than n // let's generate randomly p,q and then set it to first prime mpz_urandomb(p,randomstate,sizep); mpz_nextprime(p,p); mpz_urandomb(q,randomstate,sizeq); mpz_nextprime(q,q); mpz_mul(n,p,q); // n = p.q // and just like in RSA our n is hard to factorize // well, that is, for most of us // precalculate s = 2^2i / n // where 'i' is the number of bits of n mpz_set_ui(pow2i,1); mpz_mul_2exp(pow2i,pow2i,sizen+sizen); mpz_fdiv_q(s,pow2i,n); mpz_urandomb(M,randomstate,sizeM); // now make sure M is odd mpz_setbit(M,0); // square M mpz_mul(M,M,M); mpz_set(squaredM,M); stop = clock(); tt = (double)(stop - start)/(double)CLOCKS_PER_SEC; printf("precalculation stuff took = %6.4f seconds.\n",tt); // the correct GMP calculation // M = res1 (mod n) //t1 = GetClock(); start = clock(); for( i = 0; i < testlen; i++ ) { mpz_add_ui(squaredM,squaredM,2); mpz_mod(res1,squaredM,n); } //t2 = GetClock(); //coll = t2-t1; //printf("bits = %i time original code = %.03lf seconds\n", stop = clock(); tt = (double)(stop - start)/(double)CLOCKS_PER_SEC; printf("bits = %i time original code = %6.4f seconds.\n",sizen,tt); //sizen,((double)coll)/1000.00); fflush(stdout); // get me the quotient also //mpz_fdiv_q(t,M,n); // the Diep approximation calculation // M = res2 (mod n) ; // res2 >>= sizen ; rightshift res2 mpz_set(squaredM,M); //t1 = GetClock(); start = clock(); for( i = 0; i < testlen; i++ ) { mpz_add_ui(squaredM,squaredM,2); mpz_mul(res2a,squaredM,s); mpz_fdiv_q_2exp(res2b,res2a,sizen+sizen); //printf(" quotient benadering = "); //mpz_out_str(stdout,10,res2); //printf("\n"); mpz_mul(res2c,res2b,n); mpz_sub(res2,squaredM,res2c); for( ;; ) { int intres = mpz_cmp(res2,n); if( intres >= 0 ) { //printf("needed 1 correction\n"); mpz_sub(res2,res2,n); } else { break; } } } //t2 = GetClock(); //coll = t2-t1; //printf("bits = %i time Diep Reduction = %.03lf seconds\n", stop = clock(); tt = (double)(stop - start)/(double)CLOCKS_PER_SEC; printf("bits = %i time Diep Reduction = %6.4f seconds.\n",sizen,tt); //sizen,((double)coll)/1000.00); fflush(stdout); // above is our mpz_diepmod(res2,M,n,s); endcompare = mpz_cmp(res1,res2); if( endcompare == 0 ) printf("both modulo's were equal and correct therefore\n"); else printf("modulo has an incorrect outcome\n"); fflush(stdout); } return 0; } [/code] My guess is the either the fdiv crosses some threshold and becomes a pig, or nextprime's pseudoprime test. |
Thanks for the comments, the finding of 2 primes was unnecessary of course. Just 2 random odd numbers is what i use now.
Seems now it is outperforming the division method when FFT kicks in. Karatsuba or whatever multiplication method that gets selected is too slow compared to the average running time of division it seems. Karatsuba being of course theta( n ^ (log 3/log 2) ) is not near n log n. How is the code below doing at 64 bits processors? I just tested on 32 bits GMP despite this mac is 64 bits (macbookpro) and of course it has powersavings which give weird testing results always: size of 1 limb = 4 bytes bits = 128 time original code = 0.315 seconds bits = 128 time Diep Reduction = 0.453 seconds both modulo's were equal and correct therefore bits = 256 time original code = 0.320 seconds bits = 256 time Diep Reduction = 0.593 seconds both modulo's were equal and correct therefore bits = 512 time original code = 0.433 seconds bits = 512 time Diep Reduction = 1.003 seconds both modulo's were equal and correct therefore bits = 1024 time original code = 0.712 seconds bits = 1024 time Diep Reduction = 1.742 seconds both modulo's were equal and correct therefore bits = 2048 time original code = 1.294 seconds bits = 2048 time Diep Reduction = 2.618 seconds both modulo's were equal and correct therefore bits = 4096 time original code = 2.139 seconds bits = 4096 time Diep Reduction = 4.030 seconds both modulo's were equal and correct therefore bits = 8192 time original code = 3.502 seconds bits = 8192 time Diep Reduction = 5.935 seconds both modulo's were equal and correct therefore bits = 16384 time original code = 5.501 seconds bits = 16384 time Diep Reduction = 8.875 seconds both modulo's were equal and correct therefore bits = 32768 time original code = 8.445 seconds bits = 32768 time Diep Reduction = 12.740 seconds both modulo's were equal and correct therefore bits = 65536 time original code = 12.759 seconds bits = 65536 time Diep Reduction = 16.414 seconds both modulo's were equal and correct therefore bits = 131072 time original code = 18.103 seconds bits = 131072 time Diep Reduction = 14.430 seconds both modulo's were equal and correct therefore bits = 262144 time original code = 23.880 seconds bits = 262144 time Diep Reduction = 17.175 seconds both modulo's were equal and correct therefore bits = 524288 time original code = 30.304 seconds bits = 524288 time Diep Reduction = 18.535 seconds both modulo's were equal and correct therefore bits = 1048576 time original code = 36.325 seconds bits = 1048576 time Diep Reduction = 20.808 seconds both modulo's were equal and correct therefore bits = 2097152 time original code = 44.866 seconds bits = 2097152 time Diep Reduction = 24.014 seconds both modulo's were equal and correct therefore /* This file implements and tests speed of a faster modulo for fixed m, maybe. Using GMP to benchmark against the fast REDC and existing implementations Positively faster above REDC boundaries it seems when FFT kicks in... Author: Vincent Diepeveen email: diep at xs4all dot nl note: programmed at my macbookpro with os/x seems including header files of gmp is a problem there. I compile it with: /gmp/> cat makedm #!/bin/sh gcc -O2 -o dm diepmod.c /gmp/gmp-4.2.2/.libs/libgmp.a /gmp/> */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> #include <time.h> #include "gmp-4.2.2/gmp.h" // perhaps remove directory name at linux // as i'm os/x at home #define UNIX 1 // windoze boys have to fill in here what they are gmp_randstate_t randomstate; int GetClock(void) { #if UNIX //return(clock()); struct timeval timeval; gettimeofday(&timeval,NULL); return((int)(timeval.tv_sec*1000+(timeval.tv_usec/1000))); #else return((int)GetTickCount()); #endif } int main(void) { mpz_t p,q,n,M,t,res1,res2,res2a,res2b,res2c,s,pow2i,squaredM; int sizen,sizep,sizeq,sizeM,endcompare,t1,t2,i,testlen,coll; gmp_randinit_mt(randomstate); printf(" size of 1 limb = %i bytes\n", (int)sizeof(mp_limb_t)); /* in RSA we have n = p * q where p,q are both large 'industry grade' primes, therefore we have an odd n and we have message M that we want to encode with a big E, C = M ^ E (mod n) Now we repeat this for a lot of texts M of course. So the interesting thing is to take a faster modulo n, where n is a constant throughout the entire encryption of the entire data that we want to encode. Focus of this file is just generating a method to take that modulo faster than O ( n ^ 2 ) which is what REDC and division do currently in GMP. Let's first generate therefore p,q n = p.q The modulo idea explained: */ mpz_init(p); mpz_init(q); mpz_init(n); mpz_init(M); mpz_init(res1); mpz_init(res2); mpz_init(res2a); mpz_init(res2b); mpz_init(res2c); mpz_init(s); mpz_init(pow2i); mpz_init(t); mpz_init(squaredM); // results we have we collect as a summation in res1 and res2 mpz_init(res1); mpz_init(res2); testlen = 1024*1024; sizen = 128; // let's start with 128 bits to see whether it works bugfree for( sizen = 128; sizen <= 128*16384; sizen *= 2 ) { testlen >>= 1; sizep = (sizen * 55) / 100; // set sizep at 55% size of n sizeq = sizen - sizep; // sizeq is 45% of n sizeM = sizen - 2; // our message is 2 bits smaller than n // let's generate randomly p,q and then set it to first prime mpz_urandomb(p,randomstate,sizep); //mpz_nextprime(p,p); mpz_setbit(p,0); mpz_urandomb(q,randomstate,sizeq); //mpz_nextprime(q,q); mpz_setbit(q,0); mpz_mul(n,p,q); // n = p.q // and just like in RSA our n is hard to factorize // well, that is, for most of us // precalculate s = 2^2i / n // where 'i' is the number of bits of n mpz_set_ui(pow2i,1); mpz_mul_2exp(pow2i,pow2i,sizen+sizen); mpz_fdiv_q(s,pow2i,n); mpz_urandomb(M,randomstate,sizeM); // now make sure M is odd mpz_setbit(M,0); // square M mpz_mul(M,M,M); mpz_set(squaredM,M); // the correct GMP calculation // M = res1 (mod n) t1 = GetClock(); for( i = 0; i < testlen; i++ ) { mpz_add_ui(squaredM,squaredM,2); mpz_mod(res1,squaredM,n); } t2 = GetClock(); coll = t2-t1; printf("bits = %i time original code = %.03lf seconds\n", sizen,((double)coll)/1000.00); fflush(stdout); // get me the quotient also //mpz_fdiv_q(t,M,n); // the Diep approximation calculation // M = res2 (mod n) ; // res2 >>= sizen ; rightshift res2 mpz_set(squaredM,M); t1 = GetClock(); for( i = 0; i < testlen; i++ ) { mpz_add_ui(squaredM,squaredM,2); mpz_mul(res2a,squaredM,s); mpz_fdiv_q_2exp(res2b,res2a,sizen+sizen); //printf(" quotient benadering = "); //mpz_out_str(stdout,10,res2); //printf("\n"); mpz_mul(res2c,res2b,n); mpz_sub(res2,squaredM,res2c); for( ;; ) { int intres = mpz_cmp(res2,n); if( intres >= 0 ) { //printf("needed 1 correction\n"); mpz_sub(res2,res2,n); } else { break; } } } t2 = GetClock(); coll = t2-t1; printf("bits = %i time Diep Reduction = %.03lf seconds\n", sizen,((double)coll)/1000.00); fflush(stdout); // above is our mpz_diepmod(res2,M,n,s); endcompare = mpz_cmp(res1,res2); if( endcompare == 0 ) printf("both modulo's were equal and correct therefore\n"); else printf("modulo has an incorrect outcome\n"); fflush(stdout); } return 0; } |
| All times are UTC. The time now is 22:11. |
Powered by vBulletin® Version 3.8.11
Copyright ©2000 - 2021, Jelsoft Enterprises Ltd.