[mpir-devel] Re: Linux and Window compairison

Cactus Fri, 17 Jul 2009 00:32:34 -0700

On Jul 16, 11:55 pm, Jason Moxham <[email protected]> wrote:
> A comparison of MPIR-1.2.1 on intel nehalem  built as a core2 on Linux and
> Windows.
>
> This is in cycles and first col is Linux and second col is windows , ignore
> the # , they are wrong
>
> ./speed -c -s 1-40 mpn_add_n colfile=1,win64_bat/time_add
> overhead 5.71 cycles, precision 1000000 units of 3.75e-10 secs, CPU freq
> 2664.77 MHz
>             mpn_add_n colfile=1,win64_bat/time_add
> 1              #12.38         12.38
> 2              #11.43         12.38
> 3              #13.09         13.50
> 4              #18.41         18.10
> 5              #23.81         23.81
> 6              #24.00         24.45
> 7              #27.76         27.25
> 8              #26.19         26.19
> 9              #28.42         30.00
> 10             #29.62         31.43
> 11             #32.31         32.86
> 12             #34.50         34.29
> 13             #37.39         37.14
> 14             #40.00         38.73
> 15             #41.91         42.86
> 16             #40.48         42.22
> 17             #44.77         45.72
> 18             #45.85         46.77
> 19             #48.39         48.19
> 20             #49.53         49.76
> 21             #53.66         53.53
> 22             #53.81         54.65
> 23             #55.24         56.33
> 24             #56.91         57.31
> 25             #60.48         60.96
> 26             #60.84         62.65
> 27             #62.60         63.43
> 28             #65.72         64.76
> 29             #68.58         67.46
> 30             #67.39         69.80
> 31             #72.67         69.53
> 32             #72.39         72.96
> 33             #75.72         77.15
> 34             #76.91         78.81
> 35             #76.20         78.26
> 36             #79.06         80.72
> 37             #83.34         83.43
> 38             #84.77         84.90
> 39             #85.62         85.34
> 40             #87.49         88.58
>
> ./speed -c -s 1-40 mpn_addmul_1.333 colfile=1,win64_bat/time_addmul1
> overhead 5.71 cycles, precision 1000000 units of 3.75e-10 secs, CPU freq
> 2664.77 MHz
>         mpn_addmul_1.333 colfile=1,win64_bat/time_addmul1
> 1              #11.43         37.14
> 2              #17.15         28.33
> 3              #23.15         31.43
> 4              #29.05         39.53
> 5              #34.62         46.11
> 6              #40.52         49.23
> 7              #45.72         54.29
> 8              #51.75         57.60
> 9              #57.15         65.12
> 10             #63.09         67.21
> 11             #69.00         72.63
> 12             #74.97         75.98
> 13             #80.01         82.07
> 14             #76.77         85.72
> 15             #80.32         90.48
> 16             #84.24         94.29
> 17             #91.04        101.52
> 18             #95.12        104.77
> 19             #98.95        110.62
> 20            #103.40        114.86
> 21            #108.45        118.74
> 22            #112.95        124.58
> 23            #116.22        129.26
> 24            #120.21        133.34
> 25            #127.04        138.46
> 26            #131.78        142.97
> 27            #135.26        147.81
> 28            #140.71        152.63
> 29            #146.01        156.67
> 30            #149.29        161.39
> 31            #151.82        167.20
> 32            #157.58        171.12
> 33            #163.28        177.38
> 34            #168.20        181.00
> 35            #173.00        185.66
> 36            #176.42        191.12
> 37            #183.57        195.69
> 38            #185.68        200.07
> 39            #190.67        204.46
> 40            #191.45        209.37
>
> ./speed -c -s 1-40 mpn_addmul_2 colfile=1,win64_bat/time_addmul2
> overhead 5.71 cycles, precision 1000000 units of 3.75e-10 secs, CPU freq
> 2664.77 MHz
>          mpn_addmul_2 colfile=1,win64_bat/time_addmul2
> 1                 n/a         #0.00
> 2              #25.10         31.82
> 3              #33.71         39.53
> 4              #41.34         49.40
> 5              #51.20         57.57
> 6              #57.15         65.53
> 7              #65.72         73.81
> 8              #74.00         81.95
> 9              #83.00         90.01
> 10             #91.31         97.15
> 11             #99.70        104.77
> 12            #108.22        115.16
> 13            #116.44        122.20
> 14            #121.93        130.01
> 15            #131.60        139.69
> 16            #139.31        147.51
> 17            #149.05        155.11
> 18            #156.21        164.43
> 19            #165.05        171.73
> 20            #172.72        180.22
> 21            #181.55        188.48
> 22            #189.37        199.52
> 23            #199.07        203.65
> 24            #206.69        212.16
> 25            #212.50        221.57
> 26            #222.27        229.50
> 27            #231.47        237.38
> 28            #238.32        246.91
> 29            #247.81        254.30
> 30            #255.24        261.65
> 31            #261.85        272.16
> 32            #271.55        279.69
> 33            #280.37        285.91
> 34            #288.60        295.80
> 35            #297.46        302.80
> 36            #304.37        311.85
> 37            #312.94        320.60
> 38            #320.62        327.36
> 39            #328.93        337.28
> 40            #338.25        343.47
>
> ./speed -c -s 1-40 mpn_mul_basecase colfile=1,win64_bat/time_mulbase
> overhead 5.71 cycles, precision 1000000 units of 3.75e-10 secs, CPU freq
> 2664.77 MHz
>         mpn_mul_basecase colfile=1,win64_bat/time_mulbase
> 1               #9.52          8.57
> 2              #28.79         48.57
> 3              #50.85         72.54
> 4              #82.68        100.00
> 5             #114.62        135.24
> 6             #155.92        178.51
> 7             #210.50        232.87
> 8             #267.02        289.22
> 9             #341.31        438.35
> 10            #416.70        431.50
> 11            #504.12        523.84
> 12            #592.83        612.98
> 13            #698.18        727.01
> 14            #803.90        828.87
> 15            #927.52        951.48
> 16           #1044.89       1069.58
> 17           #1197.26       1222.67
> 18           #1328.40       1350.14
> 19           #1480.56       1504.67
> 20           #1633.17       1652.56
> 21           #1818.39       1838.12
> 22           #1988.85       2004.68
> 23           #2185.06       2194.37
> 24           #2355.79       2372.55
> 25           #2589.94       2599.87
> 26           #2769.85       2792.36
> 27           #2993.14       3007.09
> 28           #3205.00       3403.94
> 29           #3445.63       3475.97
> 30           #3678.54       3708.50
> 31           #3929.52       3976.62
> 32           #4184.47       4208.07
> 33           #4478.52       4500.55
> 34           #4726.07       4756.50
> 35           #5034.90       5048.47
> 36           #5295.16       5317.34
> 37           #5626.73       5646.50
> 38           #5902.33       5917.02
> 39           #6225.96       6252.33
> 40           #6541.47       6563.06
>
> ./speed -c -s 1-40 mpn_sqr_basecase colfile=1,win64_bat/time_sqrbase
> overhead 5.71 cycles, precision 1000000 units of 3.75e-10 secs, CPU freq
> 2664.77 MHz
>         mpn_sqr_basecase colfile=1,win64_bat/time_sqrbase
> 1               #7.62          7.62
> 2              #17.14         17.14
> 3              #43.81         47.53
> 4              #74.29         85.72
> 5             #106.19        114.45
> 6             #133.83        145.72
> 7             #171.69        183.59
> 8             #213.36        223.73
> 9             #259.08        270.49
> 10            #310.52        322.39
> 11            #367.15        376.37
> 12            #426.64        439.26
> 13            #494.58        508.89
> 14            #562.71        572.72
> 15            #641.03        649.69
> 16            #713.74        728.12
> 17            #803.52        812.86
> 18            #890.97        899.63
> 19            #982.98        995.49
> 20           #1082.37       1092.61
> 21           #1183.96       1204.66
> 22           #1291.96       1300.94
> 23           #1405.15       1405.02
> 24           #1521.08       1531.02
> 25           #1651.29       1651.48
> 26           #1769.72       1778.50
> 27           #1895.14       1911.57
> 28           #2030.37       2525.85
> 29           #2195.53       2188.53
> 30           #2317.56       2333.57
> 31           #2470.87       2481.63
> 32           #2627.25       2636.25
> 33           #2791.54       2805.56
> 34           #2936.54       2958.32
> 35           #3118.37       3126.90
> 36           #3280.58       3298.26
> 37           #3454.08       3479.90
> 38           #3648.92       3655.95
> 39           #3844.89       3844.73
> 40           #4031.90       4078.56
>
> The very small differences between mpn_add_n on Linux and Windows show that
> the other differences are not just down to how we call the cpu timer or
> function call overheads, therefore they are real timing differences and not
> some artifact. So we hopefully can improve this. I would have preferred AMD
> timings as I am more familiar with that chip.

Hi Jason,

We have done these comparisons before and none of these figures are
surprising since they reflect the three different strategies that I
have to use for *nix to Windows assembler code conversion.

1. If a *nix assembler function (a) doesn't use the stack, and (b) can
leave two scratch registers unused, it can be converted by simply
remapping the registers and, except for use of different registers, it
will be identical on *nix and Windows.  This applies, for example, to
mpn_add_n.  This is a Windows leaf function that does not need to
support exception handling and stack unwinding. Other conversions
require Windows frame functions with exception support.

2. If not enough scratch registers are available, I have to save and
restore registers on the stack but when the function is simple enough
I can still remap the registers.  This gives a constant (independent
of limb count) overhead.

3. For complex functions remapping the registers can be too hard to do
so in such cases I save registers on the stack and then move input
parameters from their Windows registers to where the *nix assembler
expects them to be.  This again gives a constant overhead, one that is
a bit higher than in 2 above.

To reduce the overhead in 2 it is necessary to change the assembler to
use fewer registers. This is because rsi and rdi are scratch
registers on *nix but not on Windows, where they must be preserved.

Some overhead can be saved on 3 by remapping registers, but these are
functions where the overhead is typically a small proportion of the
function's average cost since these are generally the functions where
the cost is quadratic in the limb count.

     Brian

--~--~---------~--~----~------------~-------~--~----~
You received this message because you are subscribed to the Google Groups 
"mpir-devel" group.
To post to this group, send email to [email protected]
To unsubscribe from this group, send email to 
[email protected]
For more options, visit this group at 
http://groups.google.com/group/mpir-devel?hl=en
-~----------~----~----~----~------~----~------~--~---
[mpir-devel] Re: Linux and Window compairison

Reply via email to