Other pages


About the benchmark program

The program compares the performance of various simple loops, for a number of loop lengths, in order to map out the strengths and weaknesses of the compiler.

The dependence on loop length primarily reflects loop overhead (short loops) and cache beahavior (long loops).

The number of repitions of each loop is inversely proportional to the number of flops, to make each loop take approximately the same time. The average speed is therefore approximately a a harmonic mean of the speed of all the loops.


mflop6.f Benchmark Listing


  1	************************************************************************
  2	*      program mflop6
  3	*
  4	*  Test the speed of various floating point operations.  Beware that
  5	*  some compilers may be smart enough to eliminate the outer loops.
  6	*
  7	*  To run:
  8	*
  9	*    f77 -O [ vendor specific optimization, to be reported back ] mflop6.f
 10	*    time a.out >&log   # time is used to verify the timings.
 11	*    history -2 >>log   # to document the options
 12	*    mail aake@astro.ku.dk
 13	*
 14	        parameter (mop=8000000,m=1000000,mtime=14,mlength=5)
 15	        parameter (mk=10,mm=m/mk)
 16	        common /cshort/a(m),b(m),c(m),d(m),sm(mk),b1(m)
 17	        real times(mtime,mlength)
 18	*
 19	*  Calibrate nop to approx "expect" seconds total
 20	*
 21	        nop=mop
 22	        i=3
 23	        expect=5.
 24	        length=100*10**(i-1)
 25	        call loops(nop,length,times(1,i),ltime,tot)
 26	        nop=nop*(expect/tot)
 27	*
 28	        do 100 i=1,mlength
 29	          length=100*10**(i-1)
 30	          call loops(nop,length,times(1,i),ltime,tot)
 31	100     continue
 32	*
 33	        f=mop*1e-6
 34	        print 103,'OP/LEN:',(100*10**(i-1),i=1,mlength)
 35	        print 102,'a=b',(times(1,i),i=1,mlength)
 36	        print 102,'a=a*cc',(times(2,i),i=1,mlength)
 37	        print 102,'a=a*d',(times(3,i),i=1,mlength)
 38	        print 102,'a=b+c',(times(4,i),i=1,mlength)
 39	        print 102,'a=a+b*cc',(times(5,i),i=1,mlength)
 40	        print 102,'am=am+bm*cc',(times(6,i),i=1,mlength)
 41	        print 102,'a=a+b*c',(times(7,i),i=1,mlength)
 42	        print 102,'sm=sm+b*c',(times(8,i),i=1,mlength)
 43	        print 102,'a=(a+b*c)/d',(times(9,i),i=1,mlength)
 44	        print 102,'a=(a+b*c)/(b+a*c)',(times(10,i),i=1,mlength)
 45	        print 102,'a=(aa+b*(bb+b*(cc+b)))',(times(11,i),i=1,mlength)
 46	        print 102,'a=(aa+b1*(bb+b1*(cc+b1)))',(times(12,i),i=1,mlength)
 47	        print 102,'a=(aa+sqrt(amax1(0.,b)))',(times(13,i),i=1,mlength)
 48	        print 102,'a=(aa+sqrt(b)) [if]',(times(14,i),i=1,mlength)
 49	102     format(1x,a24,10f8.2)
 50	103     format(1x,a24,10i8)
 51	        end
 52	************************************************************************
 53	        subroutine loops(mop,l,times,itime,tot)
 54	        parameter (m=1000000,mtime=14,mlength=5)
 55	        parameter (mk=10,mm=m/mk)
 56	*
 57	        real a(m),b(m),c(m),d(m),am(mm,mk),bm(mm,mk),sm(mk),b1(m)
 58	        common /cshort/a,b,c,d,sm,b1
 59	        equivalence (a,am),(b,bm)
 60	        real times(mtime),dt(2)
 61	*
 62	*  Make some dummy data, close to one to avoid over/underflow.
 63	*
 64	        if (l.gt.m) then
 65	          print *,'too short dimension:',m,' lt ',l
 66	          stop
 67	        endif
 68	        aa=0.999999
 69	        bb=0.999999
 70	        cc=0.999999
 71	        do 100 i=1,l
 72	          a(i)=1.-1.e-6*i
 73	          b(i)=1.-1.e-7*i
 74	          c(i)=1.e-6
 75	          d(i)=0.999999
 76	100   continue
 77	        do 101 k=1,mk
 78	        do 101 i=1,l/mk
 79	          am(i,k)=a(i)
 80	          bm(i,k)=b(i)
 81	101   continue
 82	        call etime(dt)
 83	        itime=0
 84	*
 85	*  First, not a real flop, just a copy from array to array, to
 86	*  check how it organizes load/store.
 87	*
 88	        do 110 j=1,mop/l
 89	        do 111 i=1,l
 90	          a(i)=b(i)
 91	111     continue
 92	110     continue
 93	        call dtime(dt)
 94	        itime=itime+1
 95	        times(itime)=dt(1)
 96	*
 97	*  This one illustrates the speed it can achieve on a two load/store
 98	*  per flop operation, when it does well.
 99	*
100	        do 120 j=1,mop/l
101	        do 121 i=1,l
102	          a(i)=a(i)*cc
103	121     continue
104	120     continue
105	        call dtime(dt)
106	        itime=itime+1
107	        times(itime)=dt(1)
108	*
109	*  This should just loose one more load, relative to above.
110	*
111	        do 130 j=1,mop/l
112	        do 131 i=1,l
113	          a(i)=a(i)*d(i)
114	131     continue
115	130     continue
116	        call dtime(dt)
117	        itime=itime+1
118	        times(itime)=dt(1)
119	*
120	*  Same thing, with plus.
121	*
122	        do 140 j=1,mop/l
123	        do 141 i=1,l
124	          a(i)=b(i)+c(i)
125	141     continue
126	140     continue
127	        call dtime(dt)
128	        itime=itime+1
129	        times(itime)=dt(1)
130	*
131	*  The (in-)famous Linpack SAXPY operation, 3 load/store, 2 flop.
132	*
133	        do 150 j=1,mop/l/2
134	        do 151 i=1,l
135	          a(i)=a(i)+b(i)*cc
136	151     continue
137	150     continue
138	        call dtime(dt)
139	        itime=itime+1
140	        times(itime)=dt(1)
141	*
142	*  Exactly the same thing, but with one redundant index.  Should
143	*  run as fast as the one above, minus totally negligible outer
144	*  loop overhead.  Is sometimes a LOT slower.
145	*
146	        do 162 j=1,mop/l/2
147	        do 163 k=1,mk
148	        do 163 i=1,l/mk
149	          am(i,k)=am(i,k)+bm(i,k)*cc
150	163     continue
151	162     continue
152	        call dtime(dt)
153	        itime=itime+1
154	        times(itime)=dt(1)
155	*
156	*  Is this not simple enough?  Three variables, two flops.
157	*
158	        do 170 j=1,mop/l/2
159	        do 171 i=1,l
160	          a(i)=a(i)+b(i)*c(i)
161	171     continue
162	170     continue
163	        call dtime(dt)
164	        itime=itime+1
165	        times(itime)=dt(1)
166	*
167	*  Dot product, saving in array
168	*
169	        do 180 j=1,mop/l/2
170	        do 181 k=1,mk
171	        sm(k)=0.
172	        do 181 i=1,l/mk
173	          sm(k)=sm(k)+am(i,k)*bm(i,k)
174	181     continue
175	180     continue
176	        call dtime(dt)
177	        itime=itime+1
178	        times(itime)=dt(1)
179	*
180	*  Four variables, three flops
181	*
182	        do 190 j=1,mop/l/3
183	        do 191 i=1,l
184	          a(i)=(a(i)+b(i)*c(i))/d(i)
185	191     continue
186	190     continue
187	        call dtime(dt)
188	        itime=itime+1
189	        times(itime)=dt(1)
190	*
191	*  Trying to be nice to pipeline, 3 variables, 5 flops.
192	*
193	        do 200 j=1,mop/l/5
194	        do 201 i=1,l
195	          a(i)=(a(i)+b(i)*c(i))/(b(i)+a(i)*c(i))
196	201     continue
197	200     continue
198	        call dtime(dt)
199	        itime=itime+1
200	        times(itime)=dt(1)
201	*
202	*  Trying to be as nice as possible to the compiler, on a good
203	*  compiler this should come close to max speed.  5 flop, one
204	*  load, one store.
205	*
206	        do 210 j=1,mop/l/5
207	        do 211 i=1,l
208	          a(i)=(aa+b(i)*(bb+b(i)*(cc+b(i))))
209	211     continue
210	210     continue
211	        call dtime(dt)
212	        itime=itime+1
213	        times(itime)=dt(1)
214	*
215	*  Same thing, new vector; check cache miss
216	*
217	        do 212 j=1,mop/l/5
218	        do 213 i=1,l
219	          a(i)=(aa+b1(i)*(bb+b1(i)*(cc+b1(i))))
220	213     continue
221	212     continue
222	        call dtime(dt)
223	        itime=itime+1
224	        times(itime)=dt(1)
225	*
226	*  Std functions, approx 8 "pseudo"-flops
227	*
228	        do 220 j=1,mop/l/8
229	        do 221 i=1,l
230	          a(i)=(aa+sqrt(amax1(0.0,b(i))))
231	221     continue
232	220     continue
233	        call dtime(dt)
234	        itime=itime+1
235	        times(itime)=dt(1)
236	*
237	*  Std functions, with if
238	*
239	        do 230 j=1,mop/l/8
240	        do 231 i=1,l
241	          if (b(i) .ge. 0.0) then
242	            a(i)=(aa+sqrt(b(i)))
243	          else
244	            a(i)=aa
245	          endif
246	231     continue
247	230     continue
248	        call dtime(dt)
249	        itime=itime+1
250	        times(itime)=dt(1)
    
251	        tot=0.
252	        do 300 i=1,itime
253	          tot=tot+times(i)
254	          times(i)=mop*1e-6/times(i)
255	300     continue
256	        print 301,itime*mop/1000000,' Mflops, vector length ',l,':',tot
257	     &   ,' sec',itime*mop*1e-6/tot,' Mfl'
258	301     format(1x,i4,a,i7,a,f7.2,a,f7.2,a)
259	*
260	        end