![]() |
|
|
#1 |
|
(loop (#_fork))
Feb 2006
Cambridge, England
23×11×73 Posts |
This is just a load of independent 64x64->128 multiplies; compiled with g++ -march=native -O3
Code:
#include <iostream>
#include <vector>
#include <algorithm>
using namespace std;
typedef unsigned long long u64;
typedef unsigned int u32;
u64 rdtsc(void)
{
u64 d;
u32 hi, lo;
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi) );
d=hi; d<<=32; return d|lo;
}
inline u64 hisq(u64 A)
{
u64 r0,r1;
asm ("mulq %2" \
: "=a"(r0),"=d"(r1) \
: "a"(A) \
: "cc");
return r1;
}
int main(void)
{
u64 a,b,c,d,e,f,g,h,t0,t1;
a=0x123456789abcdef0LL;
b=(a<<4)|1; c=(b<<4)|2; d=(c<<4)|3;
e=(d<<4)|4; f=(e<<4)|5; g=(f<<4)|6; h=(g<<4)|7;
vector<u64> times;
for (int trial=0; trial<10000;)
{
t0=rdtsc();
for (u64 i=0; i<100000; i++)
{
a=a+hisq(a);
b=b+hisq(b);
c=c+hisq(c);
//d=d+hisq(d);
//e=e+hisq(e);
//f=f+hisq(f);
}
t1=rdtsc();
if ((t1-t0 < 3500000)) { times.push_back(t1-t0);
trial++;}
}
cout << dec << (a^b^c^d^e^f^g^h) << " just to make sure it gets computed\n";
sort(times.begin(),times.end());
cout << times[0]<<" "<<times[100] << " " << times[5000] << " " << times[9900] << endl;
}
The timing on the core2 isn't that hard to understand Code:
// core2 // one 9.1 / 9.1 / 9.1 / 13.7 // two 9.1 / 9.2 / 9.2 / 10.1 // three 12.0 / 12.0 / 12.0 / 18.0 // four 16.0 / 16.0 / 16.0 / 17.0 // five 20.0 / 20.0 / 20.0 / 21.0 // six 24.0 / 24.0 / 24.0 / 24.6 Code:
// k10 phenom // one 7.0 / 7.0 / 7.0 / 7.1 // two 8.0 / 8.0 / 8.0 / 8.1 // three 11.0 / 11.0 / 11.0 / 11.1 // four 13.0 / 13.0 / 13.0 / 13.1 // five 18.0 / 18.0 / 18.0 / 18.1 // six 21.0 / 21.0 / 21.0 / 21.1 Code:
// i7 // one multiply: 11.4 / 11.4 / 11.5 / 13.7 // two 11.4 / 11.4 / 11.8 / 13.4 // three 11.8 / 12.2 / 12.5 / 14.2 // four multiplies: 12.3 / 12.7 / 13.6 / 16.2 // five 13.7 / 13.8 / 15.3 / 18.6 // six 14.7 / 14.7 / 17.1 / 22.5 |
|
|
|
|
|
#2 |
|
A Sunny Moo
Aug 2007
USA (GMT-5)
3·2,083 Posts |
While I really don't know too much about low-level CPU instructions and my comments here should be taken with a grain of salt, might hyperthreading be the cause of the strange numbers on the i7? If I understand it correctly, the idea behind hyperthreading is that when you have all the hyperthreads busy (i.e. two processes per core), each will mostly run at 1/2 speed, but every once in a while some of them will be doing the same kind of operation at the same time, and the CPU can save some time by doing them both at once. (Or something like that.) Hence, an operation can be considered to take a partial cycle.
Again, what I'm saying here could be complete bunk so take it with a grain of salt. |
|
|
|
|
|
#3 |
|
Oct 2007
2·53 Posts |
Since there aren't memory accesses in this, we can measure what's going on fairly accurate using IACA:
Code:
$ iaca -64 -cp PERFORMANCE -arch nehalem a.out Intel(R) Architecture Code Analyzer Version - 1.1.0 Analyzed File - a.out Binary Format - 64Bit Architecture - Intel(R) microArchitecture - codename Nehalem Analysis Report --------------- Total Throughput: 32 Cycles; Throughput Bottleneck: Port0, Port1, Port5 Total number of Uops bound to ports: 96 Data Dependency Latency: 40 Cycles; Performance Latency: 51 Cycles Port Binding in cycles: ------------------------------------------------------- | Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | ------------------------------------------------------- | Cycles | 32 | 0 | 32 | 0 | 0 | 0 | 0 | 0 | 32 | ------------------------------------------------------- N - port number, DV - Divider pipe (on port 0), D - Data fetch pipe (on ports CP - on a critical Performance Path N - number of cycles port was bound X - other ports that can be used by this instructions F - Macro Fusion with the previous instruction occurred ^ - Micro Fusion happened * - instruction micro-ops not bound to a port @ - Intel(R) AVX to Intel(R) SSE code switch, dozens of cycles penalty is expe ! - instruction not supported, was not accounted in Analysis | Num of | Ports pressure in cycles | | | Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | | ------------------------------------------------------------ | 1 | 1 | | X | | | | | | X | | mov rax, r14 | 2 | 1 | | 1 | | | | | | | | mul rax | 1 | X | | X | | | | | | 1 | | mov rax, r15 | 1 | X | | 1 | | | | | | X | | add r14, rdx | 2 | 1 | | 1 | | | | | | | | mul rax | 1 | X | | X | | | | | | 1 | | mov rax, r12 | 1 | X | | X | | | | | | 1 | | add r15, rdx | 2 | 1 | | 1 | | | | | | | | mul rax | 1 | X | | X | | | | | | 1 | | mov rax, rbp | 1 | 1 | | X | | | | | | X | | add r12, rdx | 2 | 1 | | 1 | | | | | | | | mul rax | 1 | X | | X | | | | | | 1 | | mov rax, rbx | 1 | X | | 1 | | | | | | X | | add rbp, rdx | 2 | 1 | | 1 | | | | | | | | mul rax | 1 | X | | X | | | | | | 1 | CP | mov rax, r13 | 1 | X | | X | | | | | | 1 | | add rbx, rdx | 2 | 1 | | 1 | | | | | | | CP | mul rax | 1 | X | | X | | | | | | 1 | | mov rax, r14 | 1 | 1 | | X | | | | | | X | CP | add r13, rdx | 2 | 1 | | 1 | | | | | | | | mul rax | 1 | X | | X | | | | | | 1 | | mov rax, r15 | 1 | X | | 1 | | | | | | X | | add r14, rdx | 2 | 1 | | 1 | | | | | | | | mul rax | 1 | X | | X | | | | | | 1 | | mov rax, r12 | 1 | X | | X | | | | | | 1 | | add r15, rdx | 2 | 1 | | 1 | | | | | | | | mul rax | 1 | X | | X | | | | | | 1 | | mov rax, rbp | 1 | 1 | | X | | | | | | X | | add r12, rdx | 2 | 1 | | 1 | | | | | | | | mul rax | 1 | X | | X | | | | | | 1 | | mov rax, rbx | 1 | X | | 1 | | | | | | X | | add rbp, rdx | 2 | 1 | | 1 | | | | | | | | mul rax | 1 | X | | X | | | | | | 1 | CP | mov rax, r13 | 1 | X | | X | | | | | | 1 | | add rbx, rdx | 2 | 1 | | 1 | | | | | | | CP | mul rax | 1 | X | | X | | | | | | 1 | | mov rax, r14 | 1 | 1 | | X | | | | | | X | CP | add r13, rdx | 2 | 1 | | 1 | | | | | | | | mul rax | 1 | X | | X | | | | | | 1 | | mov rax, r15 | 1 | X | | 1 | | | | | | X | | add r14, rdx | 2 | 1 | | 1 | | | | | | | | mul rax | 1 | X | | X | | | | | | 1 | | mov rax, r12 | 1 | X | | X | | | | | | 1 | | add r15, rdx | 2 | 1 | | 1 | | | | | | | | mul rax | 1 | X | | X | | | | | | 1 | | mov rax, rbp | 1 | 1 | | X | | | | | | X | | add r12, rdx | 2 | 1 | | 1 | | | | | | | | mul rax | 1 | X | | X | | | | | | 1 | | mov rax, rbx | 1 | X | | 1 | | | | | | X | | add rbp, rdx | 2 | 1 | | 1 | | | | | | | | mul rax | 1 | X | | X | | | | | | 1 | CP | mov rax, r13 | 1 | X | | X | | | | | | 1 | | add rbx, rdx | 2 | 1 | | 1 | | | | | | | CP | mul rax | 1 | X | | X | | | | | | 1 | | mov rax, r14 | 1 | 1 | | X | | | | | | X | CP | add r13, rdx | 2 | 1 | | 1 | | | | | | | | mul rax | 1 | X | | X | | | | | | 1 | | mov rax, r15 | 1 | X | | 1 | | | | | | X | | add r14, rdx | 2 | 1 | | 1 | | | | | | | | mul rax | 1 | X | | X | | | | | | 1 | | mov rax, r12 | 1 | X | | X | | | | | | 1 | | add r15, rdx | 2 | 1 | | 1 | | | | | | | | mul rax | 1 | X | | X | | | | | | 1 | | mov rax, rbp | 1 | 1 | | X | | | | | | X | | add r12, rdx | 2 | 1 | | 1 | | | | | | | | mul rax | 1 | X | | X | | | | | | 1 | | mov rax, rbx | 1 | X | | 1 | | | | | | X | | add rbp, rdx | 2 | 1 | | 1 | | | | | | | | mul rax | 1 | X | | X | | | | | | 1 | CP | mov rax, r13 | 1 | X | | X | | | | | | 1 | | add rbx, rdx | 2 | 1 | | 1 | | | | | | | CP | mul rax | 1 | X | | X | | | | | | 1 | CP | add r13, rdx Last fiddled with by Robert Holmes on 2010-09-21 at 23:43 |
|
|
|
![]() |
| Thread Tools | |
Similar Threads
|
||||
| Thread | Thread Starter | Forum | Replies | Last Post |
| Rho code | Happy5214 | YAFU | 3 | 2015-11-01 21:54 |
| Please help me with my code | daxmick | Programming | 15 | 2014-02-14 11:57 |
| Code Help | Andrew | Programming | 12 | 2013-02-16 20:53 |
| New Code | JohnFullspeed | Programming | 20 | 2011-09-04 04:28 |
| Code | Primeinator | Software | 20 | 2009-06-11 22:22 |