Re: Performance of int/long in Python 3

Neil Hodgson Wed, 03 Apr 2013 04:13:17 -0700

Dave Angel:

That would seem to imply that the speed regression on your data is NOT
caused by the differing size encodings. Perhaps it is the difference in
MSC compiler version, or other changes made between 3.2 and 3.3

Its not caused by there actually being different size encodings butthat the code is checking encoding size 2-4 times for each character.


   Back in 3.2 the comparison loop looked like:

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

   For 3.3 this has changed to

    for (i = 0; i < len1 && i < len2; ++i) {
        Py_UCS4 c1, c2;
        c1 = PyUnicode_READ(kind1, data1, i);
        c2 = PyUnicode_READ(kind2, data2, i);

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;
    }

        with PyUnicode_READ being

#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
    ((kind) == PyUnicode_1BYTE_KIND ? \
        ((const Py_UCS1 *)(data))[(index)] : \
        ((kind) == PyUnicode_2BYTE_KIND ? \
            ((const Py_UCS2 *)(data))[(index)] : \
            ((const Py_UCS4 *)(data))[(index)] \
        ) \
    ))

There are either 1 or 2 kind checks in each call to PyUnicode_READand 2 calls to PyUnicode_READ inside the loop. A compiler may decide tomove the kind checks out of the loop and specialize the loop but MSVC2010 appears to not do so. The assembler (32-bit build) for eachPyUnicode_READ looks like


        mov     ecx, DWORD PTR _kind1$[ebp]
        cmp     ecx, 1
        jne     SHORT $LN17@unicode_co@2
        lea     ecx, DWORD PTR [ebx+eax]
        movzx   edx, BYTE PTR [ecx+edx]
        jmp     SHORT $LN16@unicode_co@2
$LN17@unicode_co@2:
        cmp     ecx, 2
        jne     SHORT $LN15@unicode_co@2
        movzx   edx, WORD PTR [ebx+edi]
        jmp     SHORT $LN16@unicode_co@2
$LN15@unicode_co@2:
        mov     edx, DWORD PTR [ebx+esi]
$LN16@unicode_co@2:

The kind1/kind2 variables aren't even going into registers and atleast one test+branch and a jump are executed for every character. Twotests for 2 and 4 byte kinds. len1 and len2 don't get to go intoregisters either.


   Here's the full assembler output for unicode_compare:

;       COMDAT _unicode_compare
_TEXT   SEGMENT
_kind2$ = -20                                           ; size = 4
_kind1$ = -16                                           ; size = 4
_len2$ = -12                                            ; size = 4
_len1$ = -8                                             ; size = 4
_data2$ = -4                                            ; size = 4
_unicode_compare PROC                                   ; COMDAT
; _str1$ = ecx
; _str2$ = eax

; 10417: {

        push    ebp
        mov     ebp, esp
        sub     esp, 20                                 ; 00000014H
        push    ebx
        push    esi
        mov     esi, eax

; 10418:     int kind1, kind2;
; 10419:     void *data1, *data2;
; 10420:     Py_ssize_t len1, len2, i;
; 10421:
; 10422:     kind1 = PyUnicode_KIND(str1);

        mov     eax, DWORD PTR [ecx+16]
        mov     edx, eax
        shr     edx, 2
        and     edx, 7
        push    edi
        mov     DWORD PTR _kind1$[ebp], edx

; 10423:     kind2 = PyUnicode_KIND(str2);

        mov     edx, DWORD PTR [esi+16]
        mov     edi, edx
        shr     edi, 2
        and     edi, 7
        mov     DWORD PTR _kind2$[ebp], edi

; 10424:     data1 = PyUnicode_DATA(str1);

        test    al, 32                                  ; 00000020H
        je      SHORT $LN9@unicode_co@2
        test    al, 64                                  ; 00000040H
        je      SHORT $LN7@unicode_co@2
        lea     ebx, DWORD PTR [ecx+24]
        jmp     SHORT $LN10@unicode_co@2
$LN7@unicode_co@2:
        lea     ebx, DWORD PTR [ecx+36]
        jmp     SHORT $LN10@unicode_co@2
$LN9@unicode_co@2:
        mov     ebx, DWORD PTR [ecx+36]
$LN10@unicode_co@2:

; 10425:     data2 = PyUnicode_DATA(str2);

        test    dl, 32                                  ; 00000020H
        je      SHORT $LN13@unicode_co@2
        test    dl, 64                                  ; 00000040H
        je      SHORT $LN11@unicode_co@2
        lea     edx, DWORD PTR [esi+24]
        jmp     SHORT $LN30@unicode_co@2
$LN11@unicode_co@2:
        lea     eax, DWORD PTR [esi+36]
        mov     DWORD PTR _data2$[ebp], eax
        mov     edx, eax
        jmp     SHORT $LN14@unicode_co@2
$LN13@unicode_co@2:
        mov     edx, DWORD PTR [esi+36]
$LN30@unicode_co@2:
        mov     DWORD PTR _data2$[ebp], edx
$LN14@unicode_co@2:

; 10426:     len1 = PyUnicode_GET_LENGTH(str1);

        mov     edi, DWORD PTR [ecx+8]

; 10427:     len2 = PyUnicode_GET_LENGTH(str2);

        mov     ecx, DWORD PTR [esi+8]

; 10428:
; 10429:     for (i = 0; i < len1 && i < len2; ++i) {

        xor     eax, eax
        mov     DWORD PTR _len1$[ebp], edi
        mov     DWORD PTR _len2$[ebp], ecx
        test    edi, edi
        jle     SHORT $LN2@unicode_co@2

; 10426:     len1 = PyUnicode_GET_LENGTH(str1);

        mov     esi, edx
        mov     edi, edx

; 10428:
; 10429:     for (i = 0; i < len1 && i < len2; ++i) {

        sub     ebx, edx
        jmp     SHORT $LN4@unicode_co@2
$LL28@unicode_co@2:
        mov     edx, DWORD PTR _data2$[ebp]
$LN4@unicode_co@2:
        cmp     eax, ecx
        jge     SHORT $LN29@unicode_co@2

; 10430:         Py_UCS4 c1, c2;
; 10431:         c1 = PyUnicode_READ(kind1, data1, i);

        mov     ecx, DWORD PTR _kind1$[ebp]
        cmp     ecx, 1
        jne     SHORT $LN17@unicode_co@2
        lea     ecx, DWORD PTR [ebx+eax]
        movzx   edx, BYTE PTR [ecx+edx]
        jmp     SHORT $LN16@unicode_co@2
$LN17@unicode_co@2:
        cmp     ecx, 2
        jne     SHORT $LN15@unicode_co@2
        movzx   edx, WORD PTR [ebx+edi]
        jmp     SHORT $LN16@unicode_co@2
$LN15@unicode_co@2:
        mov     edx, DWORD PTR [ebx+esi]
$LN16@unicode_co@2:

; 10432:         c2 = PyUnicode_READ(kind2, data2, i);

        mov     ecx, DWORD PTR _kind2$[ebp]
        cmp     ecx, 1
        jne     SHORT $LN21@unicode_co@2
        mov     ecx, DWORD PTR _data2$[ebp]
        movzx   ecx, BYTE PTR [eax+ecx]
        jmp     SHORT $LN20@unicode_co@2
$LN21@unicode_co@2:
        cmp     ecx, 2
        jne     SHORT $LN19@unicode_co@2
        movzx   ecx, WORD PTR [edi]
        jmp     SHORT $LN20@unicode_co@2
$LN19@unicode_co@2:
        mov     ecx, DWORD PTR [esi]
$LN20@unicode_co@2:

; 10433:
; 10434:         if (c1 != c2)

        cmp     edx, ecx
        jne     SHORT $LN31@unicode_co@2
        mov     ecx, DWORD PTR _len2$[ebp]
        inc     eax
        add     edi, 2
        add     esi, 4
        cmp     eax, DWORD PTR _len1$[ebp]
        jl      SHORT $LL28@unicode_co@2
$LN29@unicode_co@2:
        mov     edi, DWORD PTR _len1$[ebp]
$LN2@unicode_co@2:

; 10436:     }
; 10437:
; 10438:     return (len1 < len2) ? -1 : (len1 != len2);

        cmp     edi, ecx
        jge     SHORT $LN23@unicode_co@2
        pop     edi
        pop     esi
        or      eax, -1
        pop     ebx

; 10439: }

        mov     esp, ebp
        pop     ebp
        ret     0
$LN31@unicode_co@2:

; 10435:             return (c1 < c2) ? -1 : 1;

        sbb     eax, eax
        pop     edi
        and     eax, -2                                 ; fffffffeH
        pop     esi
        inc     eax
        pop     ebx

; 10439: }

        mov     esp, ebp
        pop     ebp
        ret     0
$LN23@unicode_co@2:

; 10436:     }
; 10437:
; 10438:     return (len1 < len2) ? -1 : (len1 != len2);

        xor     eax, eax
        cmp     edi, ecx
        pop     edi
        pop     esi
        setne   al
        pop     ebx

; 10439: }

        mov     esp, ebp
        pop     ebp
        ret     0
_unicode_compare ENDP

   Neil
--
http://mail.python.org/mailman/listinfo/python-list

Re: Performance of int/long in Python 3

Reply via email to