mersenneforum.org

mersenneforum.org (https://www.mersenneforum.org/index.php)
-   Math (https://www.mersenneforum.org/forumdisplay.php?f=8)
-   -   taking modulo for fixed m - source code (https://www.mersenneforum.org/showthread.php?t=10207)

diep 2008-04-16 02:02

taking modulo for fixed m - source code
 
Hi,

Can you run this proggie and tell me which times you clock at your specific processor? I have to rely upon handclocked times here regrettably gmp-powersavings mode of laptop it seems. Sorry for the hastely coding of it. Don't try to code like this at home!

[code]

/*
This file implements and tests speed of
a faster modulo for fixed m, maybe.

Using GMP to benchmark against the fast REDC and existing implementations
Positively faster above REDC boundaries it seems when handclocking times...

Author: Vincent Diepeveen
email: diep at xs4all dot nl

note: programmed at my macbookpro with os/x
seems including header files of gmp is
a problem there. I compile it with:

/gmp/> cat makedm
#!/bin/sh
gcc -O2 -o dm diepmod.c /gmp/gmp-4.2.2/.libs/libgmp.a
/gmp/>

*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>

#include "gmp-4.2.2/gmp.h" // perhaps remove directory name at linux
// as i'm os/x at home

#define UNIX 1 // windoze boys have to fill in here what they are

gmp_randstate_t randomstate;

int GetClock(void) {
#if UNIX
//return(clock());
struct timeval timeval;
gettimeofday(&timeval,NULL);
return((int)(timeval.tv_sec*1000+(timeval.tv_usec/1000)));
#else
return((int)GetTickCount());
#endif
}

int main(void)
{
mpz_t p,q,n,M,t,res1,res2,res2a,res2b,res2c,s,pow2i,squaredM;
int sizen,sizep,sizeq,sizeM,endcompare,t1,t2,i,testlen,coll;
gmp_randinit_mt(randomstate);

printf(" size of 1 limb = %i bytes\n",
(int)sizeof(mp_limb_t));
/*
in RSA we have n = p * q
where p,q are both large 'industry grade' primes,
therefore we have an odd n

and we have message M that we want to encode with a big E,

C = M ^ E (mod n)

Now we repeat this for a lot of texts M of course.

So the interesting thing is to take a faster modulo n,
where n is a constant throughout the entire encryption
of the entire data that we want to encode.

Focus of this file is just generating a method to take that modulo faster
than O ( n ^ 2 ) which is what REDC and division do currently in GMP.

Let's first generate therefore p,q
n = p.q



The modulo idea explained:

*/
mpz_init(p);
mpz_init(q);
mpz_init(n);
mpz_init(M);
mpz_init(res1);
mpz_init(res2);
mpz_init(res2a);
mpz_init(res2b);
mpz_init(res2c);
mpz_init(s);
mpz_init(pow2i);
mpz_init(t);
mpz_init(squaredM); // thx rogue

// results we have we collect as a summation in res1 and res2
mpz_init(res1);
mpz_init(res2);

testlen = 1024*1024;
sizen = 128; // let's start with 128 bits to see whether it works bugfree

for( sizen = 128; sizen <= 16384; sizen *= 2 ) {
testlen >>= 1;
sizep = (sizen * 55) / 100; // set sizep at 55% size of n
sizeq = sizen - sizep; // sizeq is 45% of n
sizeM = sizen - 2; // our message is 2 bits smaller than n

// let's generate randomly p,q and then set it to first prime
mpz_urandomb(p,randomstate,sizep);
mpz_nextprime(p,p);
mpz_urandomb(q,randomstate,sizeq);
mpz_nextprime(q,q);

mpz_mul(n,p,q);
// n = p.q
// and just like in RSA our n is hard to factorize
// well, that is, for most of us

// precalculate s = 2^2i / n
// where 'i' is the number of bits of n
mpz_set_ui(pow2i,1);
mpz_mul_2exp(pow2i,pow2i,sizen+sizen);
mpz_fdiv_q(s,pow2i,n);

mpz_urandomb(M,randomstate,sizeM);
// now make sure M is odd
mpz_setbit(M,0);

// square M
mpz_mul(M,M,M);
mpz_set(squaredM,M);

// the correct GMP calculation
// M = res1 (mod n)
t1 = GetClock();
for( i = 0; i < testlen; i++ )
{
mpz_add_ui(squaredM,squaredM,2);
mpz_mod(res1,squaredM,n);
}
t2 = GetClock();
coll = t2-t1;
printf("bits = %i time original code = %.03lf seconds\n",
sizen,((double)coll)/1000.00);
fflush(stdout);

// get me the quotient also
//mpz_fdiv_q(t,M,n);

// the Diep approximation calculation
// M = res2 (mod n) ;
// res2 >>= sizen ; rightshift res2

mpz_set(squaredM,M);
t1 = GetClock();
for( i = 0; i < testlen; i++ )
{
mpz_add_ui(squaredM,squaredM,2);
mpz_mul(res2a,squaredM,s);
mpz_fdiv_q_2exp(res2b,res2a,sizen+sizen);
//printf(" quotient benadering = ");
//mpz_out_str(stdout,10,res2);
//printf("\n");
mpz_mul(res2c,res2b,n);
mpz_sub(res2,squaredM,res2c);
for( ;; ) {
int intres = mpz_cmp(res2,n);
if( intres >= 0 )
{
//printf("needed 1 correction\n");
mpz_sub(res2,res2,n);
}
else
{
break;
}
}
}
t2 = GetClock();
coll = t2-t1;
printf("bits = %i time Diep Reduction = %.03lf seconds\n",
sizen,((double)coll)/1000.00);
fflush(stdout);
// above is our mpz_diepmod(res2,M,n,s);

endcompare = mpz_cmp(res1,res2);
if( endcompare == 0 )
printf("both modulo's were equal and correct therefore\n");
else
printf("modulo has an incorrect outcome\n");
fflush(stdout);
}
return 0;
}

[/code]

rogue 2008-04-16 02:10

You have one bug. You need to add mpz_init(squaredM) before you use it.

BTW, here are some timings on a 64-bit PPC G5 CPU:

size of 1 limb = 8 bytes
bits = 128 time original code = 0.161 seconds
bits = 128 time Diep Reduction = 0.205 seconds
both modulo's were equal and correct therefore
bits = 256 time original code = 0.162 seconds
bits = 256 time Diep Reduction = 0.134 seconds
both modulo's were equal and correct therefore
bits = 512 time original code = 0.109 seconds
bits = 512 time Diep Reduction = 0.164 seconds
both modulo's were equal and correct therefore
bits = 1024 time original code = 0.208 seconds
bits = 1024 time Diep Reduction = 0.474 seconds
both modulo's were equal and correct therefore
bits = 2048 time original code = 0.339 seconds
bits = 2048 time Diep Reduction = 0.685 seconds
both modulo's were equal and correct therefore
bits = 4096 time original code = 0.282 seconds
bits = 4096 time Diep Reduction = 0.748 seconds
both modulo's were equal and correct therefore
bits = 8192 time original code = 0.740 seconds
bits = 8192 time Diep Reduction = 1.407 seconds
both modulo's were equal and correct therefore

diep 2008-04-16 02:12

Thanks,
And if you handclock the last times instead of rely upon the GMP library time?

bsquared 2008-04-16 04:02

Here's another timing run, on a Athlon 64 X2 4200+.

I added a timer showing why one might think the gmp code is so slow: the precalculation starts to dominate the run time. The timings for the two reduction routines here (and previously) should be correct.

- ben.

[code]
size of 1 limb = 4 bytes
precalculation stuff took = 0.0000 seconds.
bits = 128 time original code = 0.2970 seconds.
bits = 128 time Diep Reduction = 0.1560 seconds.
both modulo's were equal and correct therefore
precalculation stuff took = 0.0000 seconds.
bits = 256 time original code = 0.1720 seconds.
bits = 256 time Diep Reduction = 0.1560 seconds.
both modulo's were equal and correct therefore
precalculation stuff took = 0.0160 seconds.
bits = 512 time original code = 0.1560 seconds.
bits = 512 time Diep Reduction = 0.2190 seconds.
both modulo's were equal and correct therefore
precalculation stuff took = 0.0470 seconds.
bits = 1024 time original code = 0.1870 seconds.
bits = 1024 time Diep Reduction = 0.3750 seconds.
both modulo's were equal and correct therefore
precalculation stuff took = 0.3600 seconds.
bits = 2048 time original code = 0.3120 seconds.
bits = 2048 time Diep Reduction = 0.5630 seconds.
both modulo's were equal and correct therefore
precalculation stuff took = 12.7190 seconds.
bits = 4096 time original code = 0.4680 seconds.
bits = 4096 time Diep Reduction = 0.8440 seconds.
both modulo's were equal and correct therefore
precalculation stuff took = 172.8750 seconds.
bits = 8192 time original code = 0.7660 seconds.
bits = 8192 time Diep Reduction = 1.2180 seconds.
both modulo's were equal and correct therefore

[/code]

[code]
/*
This file implements and tests speed of
a faster modulo for fixed m, maybe.
Using GMP to benchmark against the fast REDC and existing implementations
Positively faster above REDC boundaries it seems when handclocking times...
Author: Vincent Diepeveen
email: diep at xs4all dot nl
note: programmed at my macbookpro with os/x
seems including header files of gmp is
a problem there. I compile it with:
/gmp/> cat makedm
#!/bin/sh
gcc -O2 -o dm diepmod.c /gmp/gmp-4.2.2/.libs/libgmp.a
/gmp/>
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>
#include "../gmp-4.2.1/gmp.h" // perhaps remove directory name at linux
// as i'm os/x at home
#define UNIX 1 // windoze boys have to fill in here what they are
gmp_randstate_t randomstate;
int GetClock(void) {
#if UNIX
//return(clock());
struct timeval timeval;
gettimeofday(&timeval,NULL);
return((int)(timeval.tv_sec*1000+(timeval.tv_usec/1000)));
#else
return((int)GetTickCount());
#endif
}
int main(void)
{
mpz_t p,q,n,M,t,res1,res2,res2a,res2b,res2c,s,pow2i,squaredM;
int sizen,sizep,sizeq,sizeM,endcompare,t1,t2,i,testlen,coll;
gmp_randinit_mt(randomstate);
clock_t start, stop;
double tt;
printf(" size of 1 limb = %i bytes\n",
(int)sizeof(mp_limb_t));
/*
in RSA we have n = p * q
where p,q are both large 'industry grade' primes,
therefore we have an odd n
and we have message M that we want to encode with a big E,
C = M ^ E (mod n)
Now we repeat this for a lot of texts M of course.
So the interesting thing is to take a faster modulo n,
where n is a constant throughout the entire encryption
of the entire data that we want to encode.
Focus of this file is just generating a method to take that modulo faster
than O ( n ^ 2 ) which is what REDC and division do currently in GMP.
Let's first generate therefore p,q
n = p.q

The modulo idea explained:
*/
mpz_init(p);
mpz_init(q);
mpz_init(n);
mpz_init(M);
mpz_init(res1);
mpz_init(res2);
mpz_init(res2a);
mpz_init(res2b);
mpz_init(res2c);
mpz_init(s);
mpz_init(pow2i);
mpz_init(t);
mpz_init(squaredM); // thx rogue
// results we have we collect as a summation in res1 and res2
mpz_init(res1);
mpz_init(res2);
testlen = 1024*1024;
sizen = 128; // let's start with 128 bits to see whether it works bugfree
for( sizen = 128; sizen <= 16384; sizen *= 2 ) {
start = clock();
testlen >>= 1;
sizep = (sizen * 55) / 100; // set sizep at 55% size of n
sizeq = sizen - sizep; // sizeq is 45% of n
sizeM = sizen - 2; // our message is 2 bits smaller than n
// let's generate randomly p,q and then set it to first prime
mpz_urandomb(p,randomstate,sizep);
mpz_nextprime(p,p);
mpz_urandomb(q,randomstate,sizeq);
mpz_nextprime(q,q);
mpz_mul(n,p,q);
// n = p.q
// and just like in RSA our n is hard to factorize
// well, that is, for most of us
// precalculate s = 2^2i / n
// where 'i' is the number of bits of n
mpz_set_ui(pow2i,1);
mpz_mul_2exp(pow2i,pow2i,sizen+sizen);
mpz_fdiv_q(s,pow2i,n);
mpz_urandomb(M,randomstate,sizeM);
// now make sure M is odd
mpz_setbit(M,0);
// square M
mpz_mul(M,M,M);
mpz_set(squaredM,M);
stop = clock();
tt = (double)(stop - start)/(double)CLOCKS_PER_SEC;
printf("precalculation stuff took = %6.4f seconds.\n",tt);
// the correct GMP calculation
// M = res1 (mod n)
//t1 = GetClock();
start = clock();
for( i = 0; i < testlen; i++ )
{
mpz_add_ui(squaredM,squaredM,2);
mpz_mod(res1,squaredM,n);
}
//t2 = GetClock();
//coll = t2-t1;
//printf("bits = %i time original code = %.03lf seconds\n",
stop = clock();
tt = (double)(stop - start)/(double)CLOCKS_PER_SEC;
printf("bits = %i time original code = %6.4f seconds.\n",sizen,tt);
//sizen,((double)coll)/1000.00);
fflush(stdout);
// get me the quotient also
//mpz_fdiv_q(t,M,n);
// the Diep approximation calculation
// M = res2 (mod n) ;
// res2 >>= sizen ; rightshift res2
mpz_set(squaredM,M);
//t1 = GetClock();
start = clock();
for( i = 0; i < testlen; i++ )
{
mpz_add_ui(squaredM,squaredM,2);
mpz_mul(res2a,squaredM,s);
mpz_fdiv_q_2exp(res2b,res2a,sizen+sizen);
//printf(" quotient benadering = ");
//mpz_out_str(stdout,10,res2);
//printf("\n");
mpz_mul(res2c,res2b,n);
mpz_sub(res2,squaredM,res2c);
for( ;; )
{
int intres = mpz_cmp(res2,n);
if( intres >= 0 )
{
//printf("needed 1 correction\n");
mpz_sub(res2,res2,n);
}
else
{
break;
}
}
}
//t2 = GetClock();
//coll = t2-t1;
//printf("bits = %i time Diep Reduction = %.03lf seconds\n",
stop = clock();
tt = (double)(stop - start)/(double)CLOCKS_PER_SEC;
printf("bits = %i time Diep Reduction = %6.4f seconds.\n",sizen,tt);
//sizen,((double)coll)/1000.00);
fflush(stdout);
// above is our mpz_diepmod(res2,M,n,s);
endcompare = mpz_cmp(res1,res2);
if( endcompare == 0 )
printf("both modulo's were equal and correct therefore\n");
else
printf("modulo has an incorrect outcome\n");
fflush(stdout);
}
return 0;
}

[/code]

My guess is the either the fdiv crosses some threshold and becomes a pig, or nextprime's pseudoprime test.

diep 2008-04-16 13:40

Thanks for the comments, the finding of 2 primes was unnecessary of course. Just 2 random odd numbers is what i use now.

Seems now it is outperforming the division method when FFT kicks in. Karatsuba or whatever multiplication method that gets selected is too slow compared to the average running time of division it seems.

Karatsuba being of course theta( n ^ (log 3/log 2) ) is not near n log n.
How is the code below doing at 64 bits processors?

I just tested on 32 bits GMP despite this mac is 64 bits (macbookpro) and of course it has powersavings which give weird testing results always:

size of 1 limb = 4 bytes
bits = 128 time original code = 0.315 seconds
bits = 128 time Diep Reduction = 0.453 seconds
both modulo's were equal and correct therefore
bits = 256 time original code = 0.320 seconds
bits = 256 time Diep Reduction = 0.593 seconds
both modulo's were equal and correct therefore
bits = 512 time original code = 0.433 seconds
bits = 512 time Diep Reduction = 1.003 seconds
both modulo's were equal and correct therefore
bits = 1024 time original code = 0.712 seconds
bits = 1024 time Diep Reduction = 1.742 seconds
both modulo's were equal and correct therefore
bits = 2048 time original code = 1.294 seconds
bits = 2048 time Diep Reduction = 2.618 seconds
both modulo's were equal and correct therefore
bits = 4096 time original code = 2.139 seconds
bits = 4096 time Diep Reduction = 4.030 seconds
both modulo's were equal and correct therefore
bits = 8192 time original code = 3.502 seconds
bits = 8192 time Diep Reduction = 5.935 seconds
both modulo's were equal and correct therefore
bits = 16384 time original code = 5.501 seconds
bits = 16384 time Diep Reduction = 8.875 seconds
both modulo's were equal and correct therefore
bits = 32768 time original code = 8.445 seconds
bits = 32768 time Diep Reduction = 12.740 seconds
both modulo's were equal and correct therefore
bits = 65536 time original code = 12.759 seconds
bits = 65536 time Diep Reduction = 16.414 seconds
both modulo's were equal and correct therefore
bits = 131072 time original code = 18.103 seconds
bits = 131072 time Diep Reduction = 14.430 seconds
both modulo's were equal and correct therefore
bits = 262144 time original code = 23.880 seconds
bits = 262144 time Diep Reduction = 17.175 seconds
both modulo's were equal and correct therefore
bits = 524288 time original code = 30.304 seconds
bits = 524288 time Diep Reduction = 18.535 seconds
both modulo's were equal and correct therefore
bits = 1048576 time original code = 36.325 seconds
bits = 1048576 time Diep Reduction = 20.808 seconds
both modulo's were equal and correct therefore
bits = 2097152 time original code = 44.866 seconds
bits = 2097152 time Diep Reduction = 24.014 seconds
both modulo's were equal and correct therefore



/*
This file implements and tests speed of
a faster modulo for fixed m, maybe.

Using GMP to benchmark against the fast REDC and existing implementations
Positively faster above REDC boundaries it seems when FFT kicks in...

Author: Vincent Diepeveen
email: diep at xs4all dot nl

note: programmed at my macbookpro with os/x
seems including header files of gmp is
a problem there. I compile it with:

/gmp/> cat makedm
#!/bin/sh
gcc -O2 -o dm diepmod.c /gmp/gmp-4.2.2/.libs/libgmp.a
/gmp/>

*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>

#include "gmp-4.2.2/gmp.h" // perhaps remove directory name at linux
// as i'm os/x at home

#define UNIX 1 // windoze boys have to fill in here what they are

gmp_randstate_t randomstate;

int GetClock(void) {
#if UNIX
//return(clock());
struct timeval timeval;
gettimeofday(&timeval,NULL);
return((int)(timeval.tv_sec*1000+(timeval.tv_usec/1000)));
#else
return((int)GetTickCount());
#endif
}

int main(void)
{
mpz_t p,q,n,M,t,res1,res2,res2a,res2b,res2c,s,pow2i,squaredM;
int sizen,sizep,sizeq,sizeM,endcompare,t1,t2,i,testlen,coll;
gmp_randinit_mt(randomstate);

printf(" size of 1 limb = %i bytes\n",
(int)sizeof(mp_limb_t));
/*
in RSA we have n = p * q
where p,q are both large 'industry grade' primes,
therefore we have an odd n

and we have message M that we want to encode with a big E,

C = M ^ E (mod n)

Now we repeat this for a lot of texts M of course.

So the interesting thing is to take a faster modulo n,
where n is a constant throughout the entire encryption
of the entire data that we want to encode.

Focus of this file is just generating a method to take that modulo faster
than O ( n ^ 2 ) which is what REDC and division do currently in GMP.

Let's first generate therefore p,q
n = p.q



The modulo idea explained:

*/
mpz_init(p);
mpz_init(q);
mpz_init(n);
mpz_init(M);
mpz_init(res1);
mpz_init(res2);
mpz_init(res2a);
mpz_init(res2b);
mpz_init(res2c);
mpz_init(s);
mpz_init(pow2i);
mpz_init(t);
mpz_init(squaredM);

// results we have we collect as a summation in res1 and res2
mpz_init(res1);
mpz_init(res2);

testlen = 1024*1024;
sizen = 128; // let's start with 128 bits to see whether it works bugfree

for( sizen = 128; sizen <= 128*16384; sizen *= 2 ) {
testlen >>= 1;
sizep = (sizen * 55) / 100; // set sizep at 55% size of n
sizeq = sizen - sizep; // sizeq is 45% of n
sizeM = sizen - 2; // our message is 2 bits smaller than n

// let's generate randomly p,q and then set it to first prime
mpz_urandomb(p,randomstate,sizep);
//mpz_nextprime(p,p);
mpz_setbit(p,0);
mpz_urandomb(q,randomstate,sizeq);
//mpz_nextprime(q,q);
mpz_setbit(q,0);

mpz_mul(n,p,q);
// n = p.q
// and just like in RSA our n is hard to factorize
// well, that is, for most of us

// precalculate s = 2^2i / n
// where 'i' is the number of bits of n
mpz_set_ui(pow2i,1);
mpz_mul_2exp(pow2i,pow2i,sizen+sizen);
mpz_fdiv_q(s,pow2i,n);

mpz_urandomb(M,randomstate,sizeM);
// now make sure M is odd
mpz_setbit(M,0);

// square M
mpz_mul(M,M,M);
mpz_set(squaredM,M);

// the correct GMP calculation
// M = res1 (mod n)
t1 = GetClock();
for( i = 0; i < testlen; i++ )
{
mpz_add_ui(squaredM,squaredM,2);
mpz_mod(res1,squaredM,n);
}
t2 = GetClock();
coll = t2-t1;
printf("bits = %i time original code = %.03lf seconds\n",
sizen,((double)coll)/1000.00);
fflush(stdout);

// get me the quotient also
//mpz_fdiv_q(t,M,n);

// the Diep approximation calculation
// M = res2 (mod n) ;
// res2 >>= sizen ; rightshift res2

mpz_set(squaredM,M);
t1 = GetClock();
for( i = 0; i < testlen; i++ )
{
mpz_add_ui(squaredM,squaredM,2);
mpz_mul(res2a,squaredM,s);
mpz_fdiv_q_2exp(res2b,res2a,sizen+sizen);
//printf(" quotient benadering = ");
//mpz_out_str(stdout,10,res2);
//printf("\n");
mpz_mul(res2c,res2b,n);
mpz_sub(res2,squaredM,res2c);
for( ;; ) {
int intres = mpz_cmp(res2,n);
if( intres >= 0 )
{
//printf("needed 1 correction\n");
mpz_sub(res2,res2,n);
}
else
{
break;
}
}
}
t2 = GetClock();
coll = t2-t1;
printf("bits = %i time Diep Reduction = %.03lf seconds\n",
sizen,((double)coll)/1000.00);
fflush(stdout);
// above is our mpz_diepmod(res2,M,n,s);

endcompare = mpz_cmp(res1,res2);
if( endcompare == 0 )
printf("both modulo's were equal and correct therefore\n");
else
printf("modulo has an incorrect outcome\n");
fflush(stdout);
}
return 0;
}


All times are UTC. The time now is 22:11.

Powered by vBulletin® Version 3.8.11
Copyright ©2000 - 2021, Jelsoft Enterprises Ltd.