![]() |
|
|
#56 |
|
Nov 2005
22×33 Posts |
Hello, I made some improvements which speed the Java code up by 4%.
I think this will work for C as well. I rearranged the decision tree for the values of q = dividend / divisor. I looked at the probability for the cases the quotient of q = dividend / divisor takes the values 0,1,2,3,4,5,.. The probability that the quotient is k is around .6/sqrt(k). The decision trees (for the different q) in the algorithms of Silverman and Kruppa were a degenerated Tree, a linear List. This is optimal if the probability will be decreasing exponential (like 1/2^k). This is not the case. A more balanced tree will be better. There were results (Huffmancoding) for the best possible trees. If q is to great the benefit of reducing the height of the tree will be overtaken by calculating the decision. The test for q = 3 or 4 (5 or 6 in the picture below) is expensive. The test t < divisor * 3 can be done (in Java) faster via: t < (divisor<<1) + divisor. The optimal tree might bedend on the language, compiler, bs, ... The best trees I get the best results from were the following two. Code:
.
/ \
1 .
/ \
2 .
/ \
. .
/\ / \
3 4 . >6
/ \
5 6
.
/ \
1 .
/ \
2 .
/ \
. >6
/ \
. .
/ \ / \
3 4 5 6
public static long invertKruppaTree2 (long a, long modulus)
{
long ps1, ps2, parity, dividend, divisor, rem, q, t;
if (a == 1)
return 1;
q = modulus / a;
rem = modulus - a * q;
dividend = a;
divisor = rem;
ps1 = q;
ps2 = 1;
parity = ~0;
while (divisor > 1)
{
rem = dividend - divisor;
t = rem - divisor;
if (t >= 0)
{
q += ps1;
rem = t;
t -= divisor;
if (t >= 0)
{
q += ps1;
rem = t;
t -= divisor;
// q = divided/divisor is already reduced by 2
if (t > 0)
{
if (t < divisor << 1) // divided/divisor <= 2
{
if (t < divisor) // divided/divisor == 1
{
q += ps1;
rem = t;
} else // divided/divisor == 2
{
q += ps1 << 1;
rem = t - divisor;
}
} else // divided/divisor > 2
{
if (t < divisor << 2) // divided/divisor <= 4
{
long divisor3 = (divisor<<1) + divisor;
if (t < divisor3) // divided/divisor == 3
{
q += (ps1<<1) + ps1;
rem = t - (divisor << 1);
} else // divided/divisor == 4
{
q += ps1 << 2;
rem = t - divisor3;
}
}
else // divided/divisor > 5
{
q = dividend / divisor;
rem = dividend - q * divisor;
q *= ps1;
}
}
}
}
}
q += ps2;
parity = ~parity;
dividend = divisor;
divisor = rem;
ps2 = ps1;
ps1 = q;
}
if (parity == 0)
return (ps1);
else
return (modulus - ps1);
}
The original algorithm I started with was the following. Is it the actually best? How fast is the binary eucliedean algorithm? Code:
public static long invertKruppa (long a, long modulus)
{
long ps1, ps2, parity, dividend, divisor, rem, q, t;
if (a == 1)
return 1;
q = modulus / a;
rem = modulus - a * q;
dividend = a;
divisor = rem;
ps1 = q;
ps2 = 1;
parity = ~0;
while (divisor > 1)
{
rem = dividend - divisor;
t = rem - divisor;
if (t >= 0)
{
q += ps1;
rem = t;
t -= divisor;
if (t >= 0)
{
q += ps1;
rem = t;
t -= divisor;
if (t >= 0)
{
q += ps1;
rem = t;
t -= divisor;
if (t >= 0)
{
q += ps1;
rem = t;
t -= divisor;
if (t >= 0)
{
q += ps1;
rem = t;
t -= divisor;
if (t >= 0)
{
q += ps1;
rem = t;
t -= divisor;
if (t >= 0)
{
q += ps1;
rem = t;
t -= divisor;
if (t >= 0)
{
q += ps1;
rem = t;
if (rem >= divisor)
{
q = dividend / divisor;
rem = dividend - q * divisor;
q *= ps1;
}
}
}
}
}
}
}
}
}
q += ps2;
parity = ~parity;
dividend = divisor;
divisor = rem;
ps2 = ps1;
ps1 = q;
}
if (parity == 0)
return (ps1);
else
return (modulus - ps1);
}
Last fiddled with by ThiloHarich on 2005-12-09 at 18:05 |
|
|
|
|
|
#57 |
|
Mar 2004
Belgium
15178 Posts |
Why don't anyone uses a recursive function?
(Starting from the following code) Code:
while (divisor > 1)
{
rem = dividend - divisor;
t = rem - divisor;
if (t >= 0)
{
q += ps1;
rem = t;
t -= divisor;
if (t >= 0)
{
q += ps1;
rem = t;
t -= divisor;
if (t >= 0)
{
q += ps1;
rem = t;
t -= divisor;
if (t >= 0)
{
q += ps1;
rem = t;
t -= divisor;
if (t >= 0)
{
q += ps1;
rem = t;
t -= divisor;
if (t >= 0)
{
q += ps1;
rem = t;
t -= divisor;
if (t >= 0)
{
q += ps1;
rem = t;
t -= divisor;
if (t >= 0)
{
q += ps1;
rem = t;
if (rem >= divisor)
{
q = dividend / divisor;
rem = dividend - q * divisor;
q *= ps1;
}
}
}
}
}
}
}
}
}
q += ps2;
parity = ~parity;
dividend = divisor;
divisor = rem;
ps2 = ps1;
ps1 = q;
}
if (parity == 0)
return (ps1);
else
return (modulus - ps1);
}
|
|
|
|
|
|
#58 |
|
Nov 2005
22×33 Posts |
Hi Cedric,
I have been writing a iteration (a for loop). It is not faster then the direct handling of each case. The increasing of the counter might be the reason. The other reason is, you have to stop somewhere with the recursion, because doing the division and the 2 multiplication is faster at some point. Of course a recursive call looks a lot nicer. I still made some improvements to the code. I removed the variable t, which is unnecessary. I changed the tree. This implementation is 10% faster then the original one. Code:
/**
* .
* / \
* 1 .
* / \
* . .
* / \ / \
* 2 . . >8
* / \ / \
* 3 4 5 .
* / \
* 6 .
* / \
* 7 8
* @param a
* @param modulus
* @return
*/
public static long invertKruppaTree10 (long a, long modulus)
{
long ps1, ps2, parity, dividend, divisor, rem, q;
if (a == 1)
return 1;
q = modulus / a;
rem = modulus - a * q;
dividend = a;
divisor = rem;
ps1 = q;
ps2 = 1;
parity = ~0;
while (divisor > 1)
{
rem = dividend - divisor;
if (rem >= divisor)
{
rem -= divisor; // q=1
if (rem < divisor)
{
q += ps1;
}
else
{
long divisor4 = divisor << 2;
if (rem < divisor4)
{
rem -= divisor; // q=2
long p2 = ps1<<1;
if (rem < divisor)
{
q += p2;
} else
{
rem -= divisor; // q=3
if (rem < divisor)
{
q += p2 + ps1;
}else
{
q += ps1<<2; // q=4
rem -= divisor;
}
}
}else
{
long divisor8 = divisor<<3;
if (rem < divisor8)
{
rem -= divisor4; // q=5
if (rem < divisor)
{
q += ps1 * 5;
}
else
{
rem -= divisor; // q=6
if (rem < divisor)
{
q += ps1 * 6;
} else
{
rem -= divisor; // q=7
if (rem < divisor)
{
q += ps1 * 7;
}else
{
q += ps1<<3; // q=8
rem -= divisor;
}
}
}
}else
{
q = dividend / divisor; // q >8
rem = dividend - q * divisor;
q *= ps1;
}
}
}
}
q += ps2;
parity = ~parity;
dividend = divisor;
divisor = rem;
ps2 = ps1;
ps1 = q;
}
if (parity == 0)
return (ps1);
else
return (modulus - ps1);
}
|
|
|
|
|
|
#59 |
|
Nov 2005
22×33 Posts |
No body wants to run this code under C?
|
|
|
|
|
|
#60 |
|
Aug 2004
8516 Posts |
Did anyone come up with an improvement to the lattice reduction function?
I couldn't see one scanning through the thread quickly. Chris |
|
|
|
|
|
#61 | |
|
"Bob Silverman"
Nov 2003
North of Boston
5×17×89 Posts |
Quote:
I then moved on to other aspects of the code. |
|
|
|
|
|
|
#62 | |
|
Aug 2004
7×19 Posts |
Quote:
all the code in the for(;;) loop sx2 = (x2 << 4);etc. is replaced by a single division long int q = x3 / x2;Altogether it speeded up my siever by about 3 or 4 %, which isn't much, but every little helps. Chris |
|
|
|
|
|
|
#63 | |
|
Jan 2003
31 Posts |
Quote:
Actually there is! namely BSF—Bit Scan Forward BSR—Bit Scan Reverse and these are quite fast in current intel computers. (latency is 2 in Core 2 and 4 in P4) You can use these from inline assembler and here is example of simple binary GDC (not extended): /*---------------------------------------------------//*! * brief Computes GCD of two 32-bit unsigned integers * param x First unsigned integer * param y Second unsigned integer * return gcd(x,y) * note gcd(x,0) -> x, gcd(0,y) -> y * note Implemented in x86 assembler (PentiumPro and * above only as cmov instructions are used) * note Send all comments/whines to wili@hybrid.fi * todo [wili 021026] Implement another version that * uses sbb trickery rather than cmov * instructions *-----------------------------------------------------*/ #pragma warning(disable:4035) // no missing return value inline unsigned int gcd (Uint32 x, Uint32 y) { __asm { mov ecx, dword ptr [y] mov edx, dword ptr [x] test ecx, ecx mov eax, edx je done ;// if (y = 0) -> return x test eax, eax mov eax, ecx je done ;// if (x = 0) -> return y push edi bsf ebx, eax ;// ebx = trailingZeroes(y) bsf ecx, edx ;// ecx = trailingZeroes(x) cmp ebx, ecx mov edi, ecx cmovl edi, ebx ;// edi = min(ebx,ecx) shr edx, cl ;// x >>= trailingzeroes(x) mov ecx, ebx shr eax, cl ;// y >>= trailingzeroes(y) align 8 mainloop: ;// for (;;) cmp eax, edx mov ecx, eax je skiploop ;// if (x == y) -> break cmovb eax, edx cmovb edx, ecx ;// if (y > x) swap(x,y) sub eax, edx ;// x -= y bsf ecx, eax shr eax, cl ;// x >>= trailingzeroes(x) jmp mainloop align 8 skiploop: mov ecx, edi shl eax, cl ;// return x<<finalShiftLeft pop edi done: ;// return value in eax } } #pragma warning(default:4035) from: http://www.azillionmonkeys.com/qed/asmexample.html Someone just have to modify this to extended binary GCDversion. Yours, Nuutti |
|
|
|
|
|
|
#64 |
|
Mar 2003
New Zealand
13·89 Posts |
I just realised this thread is where the modular inverse code I stole from Msieve came from. Thanks, it it was very useful :-)
(I am now trying to speed up the calculation of 1/a (mod p) when a stays fixed but p varies, and 1 < a < 2^32 < p < 2^64) |
|
|
|
![]() |
Similar Threads
|
||||
| Thread | Thread Starter | Forum | Replies | Last Post |
| Calling airsquirrels | Prime95 | GPU Computing | 16 | 2015-09-29 18:06 |
| Help from coders | ET_ | GPU Computing | 5 | 2014-01-26 13:58 |
| Calling all 64-bit Linux sievers! | frmky | NFS@Home | 25 | 2013-10-16 15:58 |
| IA-32 Assembly Coders, anyone? | xenon | Programming | 6 | 2005-06-02 13:26 |
| Bob, I'm calling you out! | synergy | Miscellaneous Math | 17 | 2004-10-26 15:26 |