[mpir-devel] Re: Linux and Window compairison

Jason Moxham Wed, 22 Jul 2009 03:28:13 -0700

On Friday 17 July 2009 08:32:30 Cactus wrote:
> On Jul 16, 11:55 pm, Jason Moxham <[email protected]> wrote:
> > A comparison of MPIR-1.2.1 on intel nehalem  built as a core2 on Linux
> > and Windows.
> >
> > Timings are in cycles: the first column is Linux and the second column is
> > Windows. Ignore the # marks, they are wrong.
> >
> > ./speed -c -s 1-40 mpn_add_n colfile=1,win64_bat/time_add
> > overhead 5.71 cycles, precision 1000000 units of 3.75e-10 secs, CPU freq
> > 2664.77 MHz
> >             mpn_add_n colfile=1,win64_bat/time_add
> > 1              #12.38         12.38
> > 2              #11.43         12.38
> > 3              #13.09         13.50
> > 4              #18.41         18.10
> > 5              #23.81         23.81
> > 6              #24.00         24.45
> > 7              #27.76         27.25
> > 8              #26.19         26.19
> > 9              #28.42         30.00
> > 10             #29.62         31.43
> > 11             #32.31         32.86
> > 12             #34.50         34.29
> > 13             #37.39         37.14
> > 14             #40.00         38.73
> > 15             #41.91         42.86
> > 16             #40.48         42.22
> > 17             #44.77         45.72
> > 18             #45.85         46.77
> > 19             #48.39         48.19
> > 20             #49.53         49.76
> > 21             #53.66         53.53
> > 22             #53.81         54.65
> > 23             #55.24         56.33
> > 24             #56.91         57.31
> > 25             #60.48         60.96
> > 26             #60.84         62.65
> > 27             #62.60         63.43
> > 28             #65.72         64.76
> > 29             #68.58         67.46
> > 30             #67.39         69.80
> > 31             #72.67         69.53
> > 32             #72.39         72.96
> > 33             #75.72         77.15
> > 34             #76.91         78.81
> > 35             #76.20         78.26
> > 36             #79.06         80.72
> > 37             #83.34         83.43
> > 38             #84.77         84.90
> > 39             #85.62         85.34
> > 40             #87.49         88.58
> >
> > ./speed -c -s 1-40 mpn_addmul_1.333 colfile=1,win64_bat/time_addmul1
> > overhead 5.71 cycles, precision 1000000 units of 3.75e-10 secs, CPU freq
> > 2664.77 MHz
> >         mpn_addmul_1.333 colfile=1,win64_bat/time_addmul1
> > 1              #11.43         37.14
> > 2              #17.15         28.33
> > 3              #23.15         31.43
> > 4              #29.05         39.53
> > 5              #34.62         46.11
> > 6              #40.52         49.23
> > 7              #45.72         54.29
> > 8              #51.75         57.60
> > 9              #57.15         65.12
> > 10             #63.09         67.21
> > 11             #69.00         72.63
> > 12             #74.97         75.98
> > 13             #80.01         82.07
> > 14             #76.77         85.72
> > 15             #80.32         90.48
> > 16             #84.24         94.29
> > 17             #91.04        101.52
> > 18             #95.12        104.77
> > 19             #98.95        110.62
> > 20            #103.40        114.86
> > 21            #108.45        118.74
> > 22            #112.95        124.58
> > 23            #116.22        129.26
> > 24            #120.21        133.34
> > 25            #127.04        138.46
> > 26            #131.78        142.97
> > 27            #135.26        147.81
> > 28            #140.71        152.63
> > 29            #146.01        156.67
> > 30            #149.29        161.39
> > 31            #151.82        167.20
> > 32            #157.58        171.12
> > 33            #163.28        177.38
> > 34            #168.20        181.00
> > 35            #173.00        185.66
> > 36            #176.42        191.12
> > 37            #183.57        195.69
> > 38            #185.68        200.07
> > 39            #190.67        204.46
> > 40            #191.45        209.37
> >
> > ./speed -c -s 1-40 mpn_addmul_2 colfile=1,win64_bat/time_addmul2
> > overhead 5.71 cycles, precision 1000000 units of 3.75e-10 secs, CPU freq
> > 2664.77 MHz
> >          mpn_addmul_2 colfile=1,win64_bat/time_addmul2
> > 1                 n/a         #0.00
> > 2              #25.10         31.82
> > 3              #33.71         39.53
> > 4              #41.34         49.40
> > 5              #51.20         57.57
> > 6              #57.15         65.53
> > 7              #65.72         73.81
> > 8              #74.00         81.95
> > 9              #83.00         90.01
> > 10             #91.31         97.15
> > 11             #99.70        104.77
> > 12            #108.22        115.16
> > 13            #116.44        122.20
> > 14            #121.93        130.01
> > 15            #131.60        139.69
> > 16            #139.31        147.51
> > 17            #149.05        155.11
> > 18            #156.21        164.43
> > 19            #165.05        171.73
> > 20            #172.72        180.22
> > 21            #181.55        188.48
> > 22            #189.37        199.52
> > 23            #199.07        203.65
> > 24            #206.69        212.16
> > 25            #212.50        221.57
> > 26            #222.27        229.50
> > 27            #231.47        237.38
> > 28            #238.32        246.91
> > 29            #247.81        254.30
> > 30            #255.24        261.65
> > 31            #261.85        272.16
> > 32            #271.55        279.69
> > 33            #280.37        285.91
> > 34            #288.60        295.80
> > 35            #297.46        302.80
> > 36            #304.37        311.85
> > 37            #312.94        320.60
> > 38            #320.62        327.36
> > 39            #328.93        337.28
> > 40            #338.25        343.47
> >
> > ./speed -c -s 1-40 mpn_mul_basecase colfile=1,win64_bat/time_mulbase
> > overhead 5.71 cycles, precision 1000000 units of 3.75e-10 secs, CPU freq
> > 2664.77 MHz
> >         mpn_mul_basecase colfile=1,win64_bat/time_mulbase
> > 1               #9.52          8.57
> > 2              #28.79         48.57
> > 3              #50.85         72.54
> > 4              #82.68        100.00
> > 5             #114.62        135.24
> > 6             #155.92        178.51
> > 7             #210.50        232.87
> > 8             #267.02        289.22
> > 9             #341.31        438.35
> > 10            #416.70        431.50
> > 11            #504.12        523.84
> > 12            #592.83        612.98
> > 13            #698.18        727.01
> > 14            #803.90        828.87
> > 15            #927.52        951.48
> > 16           #1044.89       1069.58
> > 17           #1197.26       1222.67
> > 18           #1328.40       1350.14
> > 19           #1480.56       1504.67
> > 20           #1633.17       1652.56
> > 21           #1818.39       1838.12
> > 22           #1988.85       2004.68
> > 23           #2185.06       2194.37
> > 24           #2355.79       2372.55
> > 25           #2589.94       2599.87
> > 26           #2769.85       2792.36
> > 27           #2993.14       3007.09
> > 28           #3205.00       3403.94
> > 29           #3445.63       3475.97
> > 30           #3678.54       3708.50
> > 31           #3929.52       3976.62
> > 32           #4184.47       4208.07
> > 33           #4478.52       4500.55
> > 34           #4726.07       4756.50
> > 35           #5034.90       5048.47
> > 36           #5295.16       5317.34
> > 37           #5626.73       5646.50
> > 38           #5902.33       5917.02
> > 39           #6225.96       6252.33
> > 40           #6541.47       6563.06
> >
> > ./speed -c -s 1-40 mpn_sqr_basecase colfile=1,win64_bat/time_sqrbase
> > overhead 5.71 cycles, precision 1000000 units of 3.75e-10 secs, CPU freq
> > 2664.77 MHz
> >         mpn_sqr_basecase colfile=1,win64_bat/time_sqrbase
> > 1               #7.62          7.62
> > 2              #17.14         17.14
> > 3              #43.81         47.53
> > 4              #74.29         85.72
> > 5             #106.19        114.45
> > 6             #133.83        145.72
> > 7             #171.69        183.59
> > 8             #213.36        223.73
> > 9             #259.08        270.49
> > 10            #310.52        322.39
> > 11            #367.15        376.37
> > 12            #426.64        439.26
> > 13            #494.58        508.89
> > 14            #562.71        572.72
> > 15            #641.03        649.69
> > 16            #713.74        728.12
> > 17            #803.52        812.86
> > 18            #890.97        899.63
> > 19            #982.98        995.49
> > 20           #1082.37       1092.61
> > 21           #1183.96       1204.66
> > 22           #1291.96       1300.94
> > 23           #1405.15       1405.02
> > 24           #1521.08       1531.02
> > 25           #1651.29       1651.48
> > 26           #1769.72       1778.50
> > 27           #1895.14       1911.57
> > 28           #2030.37       2525.85
> > 29           #2195.53       2188.53
> > 30           #2317.56       2333.57
> > 31           #2470.87       2481.63
> > 32           #2627.25       2636.25
> > 33           #2791.54       2805.56
> > 34           #2936.54       2958.32
> > 35           #3118.37       3126.90
> > 36           #3280.58       3298.26
> > 37           #3454.08       3479.90
> > 38           #3648.92       3655.95
> > 39           #3844.89       3844.73
> > 40           #4031.90       4078.56
> >
> > The very small differences between mpn_add_n on Linux and Windows show
> > that the other differences are not just down to how we call the cpu timer
> > or function call overheads, therefore they are real timing differences
> > and not some artifact. So we hopefully can improve this. I would have
> > preferred AMD timings as I am more familiar with that chip.
>
> Hi Jason,
>
> We have done these comparisons before and none of these figures are
> surprising since they reflect the three different strategies that I
> have to use for *nix to Windows assembler code conversion.
>
> 1. If a *nix assembler function (a) doesn't use the stack, and (b) can
> leave two scratch registers unused, it can be converted by simply
> remapping the registers and, except for use of different registers, it
> will be identical on *nix and Windows.  This applies, for example, to
> mpn_add_n.  This is a Windows leaf function that does not need to
> support exception handling and stack unwinding. Other conversions
> require Windows frame functions with exception support.
>
> 2. If not enough scratch registers are available, I have to save and
> restore registers on the stack but when the function is simple enough
> I can still remap the registers.  This gives a constant (independent
> of limb count) overhead.
>
> 3. For complex functions remapping the registers can be too hard to do
> so in such cases I save registers on the stack and then move input
> parameters from their Windows registers to where the *nix assembler
> expects them to be.  This again gives a constant overhead, one that is
> a bit higher than in 2 above.
>
> To reduce the overhead in 2 it is necessary to change the assembler to
> use fewer registers. This is because rsi and rdi are scratch
> registers on *nix but not on Windows.
>
> Some overhead can be saved on 3 by remapping registers, but these are
> functions where the overhead is typically a small proportion of the
> function's average cost, since these are generally the functions where
> the cost is quadratic in limb count.
>
>      Brian
>


Hi

Just got my internet back, phew...

I was aware of the different methods and why; I was just surprised by the
size of the differences. It seems like the Windows versions are using more
cycles than they "should". Even for the Linux code I only really concentrated
on the inner loops; there is no point in tuning the setup code while the
inner loops are changing. I plan to change the inner loops yet again, so
there is little point in changing the outer code now.

Jason



--~--~---------~--~----~------------~-------~--~----~
You received this message because you are subscribed to the Google Groups 
"mpir-devel" group.
To post to this group, send email to [email protected]
To unsubscribe from this group, send email to 
[email protected]
For more options, visit this group at 
http://groups.google.com/group/mpir-devel?hl=en
-~----------~----~----~----~------~----~------~--~---

[mpir-devel] Re: Linux and Window compairison

Reply via email to