The dependence on loop length primarily reflects loop overhead (short loops) and cache beahavior (long loops).
The number of repitions of each loop is inversely proportional to the number of flops, to make each loop take approximately the same time. The average speed is therefore approximately a a harmonic mean of the speed of all the loops.
1 ************************************************************************
2 * program mflop6
3 *
4 * Test the speed of various floating point operations. Beware that
5 * some compilers may be smart enough to eliminate the outer loops.
6 *
7 * To run:
8 *
9 * f77 -O [ vendor specific optimization, to be reported back ] mflop6.f
10 * time a.out >&log # time is used to verify the timings.
11 * history -2 >>log # to document the options
12 * mail aake@astro.ku.dk
13 *
14 parameter (mop=8000000,m=1000000,mtime=14,mlength=5)
15 parameter (mk=10,mm=m/mk)
16 common /cshort/a(m),b(m),c(m),d(m),sm(mk),b1(m)
17 real times(mtime,mlength)
18 *
19 * Calibrate nop to approx "expect" seconds total
20 *
21 nop=mop
22 i=3
23 expect=5.
24 length=100*10**(i-1)
25 call loops(nop,length,times(1,i),ltime,tot)
26 nop=nop*(expect/tot)
27 *
28 do 100 i=1,mlength
29 length=100*10**(i-1)
30 call loops(nop,length,times(1,i),ltime,tot)
31 100 continue
32 *
33 f=mop*1e-6
34 print 103,'OP/LEN:',(100*10**(i-1),i=1,mlength)
35 print 102,'a=b',(times(1,i),i=1,mlength)
36 print 102,'a=a*cc',(times(2,i),i=1,mlength)
37 print 102,'a=a*d',(times(3,i),i=1,mlength)
38 print 102,'a=b+c',(times(4,i),i=1,mlength)
39 print 102,'a=a+b*cc',(times(5,i),i=1,mlength)
40 print 102,'am=am+bm*cc',(times(6,i),i=1,mlength)
41 print 102,'a=a+b*c',(times(7,i),i=1,mlength)
42 print 102,'sm=sm+b*c',(times(8,i),i=1,mlength)
43 print 102,'a=(a+b*c)/d',(times(9,i),i=1,mlength)
44 print 102,'a=(a+b*c)/(b+a*c)',(times(10,i),i=1,mlength)
45 print 102,'a=(aa+b*(bb+b*(cc+b)))',(times(11,i),i=1,mlength)
46 print 102,'a=(aa+b1*(bb+b1*(cc+b1)))',(times(12,i),i=1,mlength)
47 print 102,'a=(aa+sqrt(amax1(0.,b)))',(times(13,i),i=1,mlength)
48 print 102,'a=(aa+sqrt(b)) [if]',(times(14,i),i=1,mlength)
49 102 format(1x,a24,10f8.2)
50 103 format(1x,a24,10i8)
51 end
52 ************************************************************************
53 subroutine loops(mop,l,times,itime,tot)
54 parameter (m=1000000,mtime=14,mlength=5)
55 parameter (mk=10,mm=m/mk)
56 *
57 real a(m),b(m),c(m),d(m),am(mm,mk),bm(mm,mk),sm(mk),b1(m)
58 common /cshort/a,b,c,d,sm,b1
59 equivalence (a,am),(b,bm)
60 real times(mtime),dt(2)
61 *
62 * Make some dummy data, close to one to avoid over/underflow.
63 *
64 if (l.gt.m) then
65 print *,'too short dimension:',m,' lt ',l
66 stop
67 endif
68 aa=0.999999
69 bb=0.999999
70 cc=0.999999
71 do 100 i=1,l
72 a(i)=1.-1.e-6*i
73 b(i)=1.-1.e-7*i
74 c(i)=1.e-6
75 d(i)=0.999999
76 100 continue
77 do 101 k=1,mk
78 do 101 i=1,l/mk
79 am(i,k)=a(i)
80 bm(i,k)=b(i)
81 101 continue
82 call etime(dt)
83 itime=0
84 *
85 * First, not a real flop, just a copy from array to array, to
86 * check how it organizes load/store.
87 *
88 do 110 j=1,mop/l
89 do 111 i=1,l
90 a(i)=b(i)
91 111 continue
92 110 continue
93 call dtime(dt)
94 itime=itime+1
95 times(itime)=dt(1)
96 *
97 * This one illustrates the speed it can achieve on a two load/store
98 * per flop operation, when it does well.
99 *
100 do 120 j=1,mop/l
101 do 121 i=1,l
102 a(i)=a(i)*cc
103 121 continue
104 120 continue
105 call dtime(dt)
106 itime=itime+1
107 times(itime)=dt(1)
108 *
109 * This should just loose one more load, relative to above.
110 *
111 do 130 j=1,mop/l
112 do 131 i=1,l
113 a(i)=a(i)*d(i)
114 131 continue
115 130 continue
116 call dtime(dt)
117 itime=itime+1
118 times(itime)=dt(1)
119 *
120 * Same thing, with plus.
121 *
122 do 140 j=1,mop/l
123 do 141 i=1,l
124 a(i)=b(i)+c(i)
125 141 continue
126 140 continue
127 call dtime(dt)
128 itime=itime+1
129 times(itime)=dt(1)
130 *
131 * The (in-)famous Linpack SAXPY operation, 3 load/store, 2 flop.
132 *
133 do 150 j=1,mop/l/2
134 do 151 i=1,l
135 a(i)=a(i)+b(i)*cc
136 151 continue
137 150 continue
138 call dtime(dt)
139 itime=itime+1
140 times(itime)=dt(1)
141 *
142 * Exactly the same thing, but with one redundant index. Should
143 * run as fast as the one above, minus totally negligible outer
144 * loop overhead. Is sometimes a LOT slower.
145 *
146 do 162 j=1,mop/l/2
147 do 163 k=1,mk
148 do 163 i=1,l/mk
149 am(i,k)=am(i,k)+bm(i,k)*cc
150 163 continue
151 162 continue
152 call dtime(dt)
153 itime=itime+1
154 times(itime)=dt(1)
155 *
156 * Is this not simple enough? Three variables, two flops.
157 *
158 do 170 j=1,mop/l/2
159 do 171 i=1,l
160 a(i)=a(i)+b(i)*c(i)
161 171 continue
162 170 continue
163 call dtime(dt)
164 itime=itime+1
165 times(itime)=dt(1)
166 *
167 * Dot product, saving in array
168 *
169 do 180 j=1,mop/l/2
170 do 181 k=1,mk
171 sm(k)=0.
172 do 181 i=1,l/mk
173 sm(k)=sm(k)+am(i,k)*bm(i,k)
174 181 continue
175 180 continue
176 call dtime(dt)
177 itime=itime+1
178 times(itime)=dt(1)
179 *
180 * Four variables, three flops
181 *
182 do 190 j=1,mop/l/3
183 do 191 i=1,l
184 a(i)=(a(i)+b(i)*c(i))/d(i)
185 191 continue
186 190 continue
187 call dtime(dt)
188 itime=itime+1
189 times(itime)=dt(1)
190 *
191 * Trying to be nice to pipeline, 3 variables, 5 flops.
192 *
193 do 200 j=1,mop/l/5
194 do 201 i=1,l
195 a(i)=(a(i)+b(i)*c(i))/(b(i)+a(i)*c(i))
196 201 continue
197 200 continue
198 call dtime(dt)
199 itime=itime+1
200 times(itime)=dt(1)
201 *
202 * Trying to be as nice as possible to the compiler, on a good
203 * compiler this should come close to max speed. 5 flop, one
204 * load, one store.
205 *
206 do 210 j=1,mop/l/5
207 do 211 i=1,l
208 a(i)=(aa+b(i)*(bb+b(i)*(cc+b(i))))
209 211 continue
210 210 continue
211 call dtime(dt)
212 itime=itime+1
213 times(itime)=dt(1)
214 *
215 * Same thing, new vector; check cache miss
216 *
217 do 212 j=1,mop/l/5
218 do 213 i=1,l
219 a(i)=(aa+b1(i)*(bb+b1(i)*(cc+b1(i))))
220 213 continue
221 212 continue
222 call dtime(dt)
223 itime=itime+1
224 times(itime)=dt(1)
225 *
226 * Std functions, approx 8 "pseudo"-flops
227 *
228 do 220 j=1,mop/l/8
229 do 221 i=1,l
230 a(i)=(aa+sqrt(amax1(0.0,b(i))))
231 221 continue
232 220 continue
233 call dtime(dt)
234 itime=itime+1
235 times(itime)=dt(1)
236 *
237 * Std functions, with if
238 *
239 do 230 j=1,mop/l/8
240 do 231 i=1,l
241 if (b(i) .ge. 0.0) then
242 a(i)=(aa+sqrt(b(i)))
243 else
244 a(i)=aa
245 endif
246 231 continue
247 230 continue
248 call dtime(dt)
249 itime=itime+1
250 times(itime)=dt(1)
251 tot=0.
252 do 300 i=1,itime
253 tot=tot+times(i)
254 times(i)=mop*1e-6/times(i)
255 300 continue
256 print 301,itime*mop/1000000,' Mflops, vector length ',l,':',tot
257 & ,' sec',itime*mop*1e-6/tot,' Mfl'
258 301 format(1x,i4,a,i7,a,f7.2,a,f7.2,a)
259 *
260 end