>>Dmitry wrote:
>David Brown wrote:

Fellows,
can anybody send me a code which does multiplication of two longs (32bits)
with result of 64 bits for both signed and unsigned.

I haven't figured out the signed multiply yet (maybe I'll get a chance to
think about it at home tonight), but this *should* work for unsigned.  It's
untested as yet.  Using MAC saves a couple of instructions compared to bare
MPY.

At the same time as your mails arrived, I noticed that MSPGCC implements the 32x32=>64 HW multiplication in an inefficient way. I am not sure whether Dmitry is fixing this and asked for that reason, because the wanted solution does not follow the calling convention. Inspired by David's solution, I wrote one which does follow the calling convention:

/*
 * Unsigned 32x32 -> 64-bit multiply on the MSP430 hardware multiplier,
 * accumulating partial products in CPU registers (solution #1).
 *
 * Follows the mspgcc calling convention: the operands arrive in R12..R15
 * (comments below name them al/ah = low/high word of one operand, bl/bh =
 * low/high word of the other) and the 64-bit result is left in R12..R15.
 *
 * NOTE(review): there is deliberately no C "return" statement -- the result
 * is handed back purely through the convention registers, and the trailing
 * RET comes from the compiler-generated epilogue.  This only works if the
 * compiler emits no epilogue code that clobbers R12..R15; verify against
 * the generated assembly.
 */
uint64_t mul32(uint32_t arg_b, uint32_t arg_a) {
// (ah,al)*(bh,bl) = al*bl + al*bh*2^16 + ( ah*bl + ah*bh*2^16 )*2^16
//                 = al*bl + ( al*bh + ah*bl )*2^16 + ah*bh*2^32
        asm(    "push      R4"        );              // save R4: needed as scratch for ah
        asm(    "mov       R12, &__MPY"   );      // MPY = al (selects unsigned multiply)
        asm(    "mov       R14, &__OP2"   );      // OP2 = bl -> triggers RES = al*bl
        asm(    "mov       R12, &__MPY"   );      // MPY = al  (NOTE(review): looks redundant -- MPY should still hold al from above; confirm on hardware)
        asm(    "mov       R13, R4"   );      // stash ah in R4 before R13 is reused below
        asm(    "mov       &__RESLO, R12" );      // R12 = low word of al*bl  (final result word 0)
        asm(    "mov       &__RESHI, R13");       // R13 = high word of al*bl
        asm(    "mov       R15, &__OP2");         // OP2 = bh -> RES = al*bh
        asm(    "mov       R4, &__MAC"    );      // MAC = ah (selects multiply-accumulate)
        asm(    "mov       R14, &__OP2"   );      // OP2 = bl -> RES += ah*bl  (middle terms, carry in SUMEXT)
        asm(    "mov       #0, R14"   );      // clear R14: will collect result word 2
        asm(    "mov       R15, &__MPY"   );      // MPY = bh (back to plain multiply) for the ah*bh term
        asm(    "mov       #0, R15"   );      // clear R15: will collect result word 3
        asm(    "add       &__RESLO, R13" );      // word 1: high(al*bl) + low(middle sum)
        asm(    "addc      &__RESHI, R14");       // word 2: high(middle sum) + carry
        asm(    "addc      &__SUMEXT, R15");      // word 3: middle-sum overflow + carry
        asm(    "mov       R4, &__OP2"    );      // OP2 = ah -> RES = bh*ah (top term)
        asm(    "add       &__RESLO, R14" );      // word 2 += low(ah*bh)
        asm(    "addc      &__RESHI, R15" );      // word 3 += high(ah*bh) + carry
        asm(    "pop       R4"        );              // restore callee-saved R4
        /*RET*/
}

Note 1: This is 14 clocks slower than David's solution. The reason is simple: David had more registers to play with. Note 2: David's idea of accumulating the lowest multiplication result and the final result in the registers, and _not_ in the MAC, is faster than the following obvious solution (RAM->RAM transfers are too slow):

/*
 * Unsigned 32x32 -> 64-bit multiply on the MSP430 hardware multiplier,
 * accumulating in the multiplier's own RES registers (solution #2).
 *
 * Same calling convention as solution #1: operands in R12..R15
 * (al/ah and bl/bh word halves), 64-bit result returned in R12..R15.
 * Needs no extra scratch register, but the RAM-to-RAM moves between the
 * multiplier registers make it slower than the register-accumulating
 * variant (see the surrounding discussion).
 *
 * NOTE(review): as in solution #1 there is no C "return"; the result is
 * passed back in the convention registers and the RET comes from the
 * compiler-generated epilogue.
 */
uint64_t mul32(uint32_t arg_b, uint32_t arg_a) {
// (ah,al)*(bh,bl) = al*bl + al*bh*2^16 + ( ah*bl + ah*bh*2^16 )*2^16
//                 = al*bl + ( al*bh + ah*bl )*2^16 + ah*bh*2^32
        asm(    "mov       R12, &__MPY"   );      // MPY = al (unsigned multiply)
        asm(    "mov       R14, &__OP2"   );      // OP2 = bl -> RES = al*bl
        asm(    "mov       R12, &__MAC"   );      // MAC = al (switch to accumulate mode)
        asm(    "mov       &__RESLO, R12" );      // R12 = low word of al*bl (final result word 0)
        asm(    "mov       &__RESLO, &__RESLO");      // shift accumulator right by 16: RESLO = RESHI -- NOTE(review): original listing reads "mov &__RESHI, &__RESLO"; kept verbatim below
        asm(    "mov       #0, &__RESHI");                // ...and clear RESHI
        asm(    "mov       R15, &__OP2"   );      // OP2 = bh -> RES += al*bh
        asm(    "mov       R14, &__MAC"   );      // MAC = bl
        asm(    "mov       R13, &__OP2"   );      // OP2 = ah -> RES += bl*ah (middle terms complete, carry in SUMEXT)
        asm(    "mov       R13, &__MAC"   );      // MAC = ah for the final ah*bh term
        asm(    "mov       &__RESLO, R13" );      // R13 = result word 1
        asm(    "mov       &__RESHI, &__RESLO");      // shift accumulator right by 16 again
        asm(    "mov       &__SUMEXT, &__RESHI");     // carry out of the middle sum becomes the new RESHI
        asm(    "mov       R15, &__OP2"   );      // OP2 = bh -> RES += ah*bh
        asm(    "mov       &__RESLO, R14" );      // R14 = result word 2
        asm(    "mov       &__RESHI, R15" );      // R15 = result word 3
        /*RET*/
}


Note 3: Can anybody tell me why MSPGCC "forgets" some of the lines of the following code (same approach as solution #2)? E.g. the last two asm instructions disappear.

static uint64_t mul32(uint32_t arg_b, uint32_t arg_a) {
asm("mov %A[src], &__MPY" : : [src] "r" (arg_a));// MOV al,MPY
asm("mov %A[src], &__OP2" : : [src] "r" (arg_b));// MOV bl,OP2
asm("mov %A[src], &__MAC" : : [src] "r" (arg_a));// MOV al,MAC
asm("mov &__RESLO, %A[dst]" : [dst] "=r" (arg_a));// MOV RESLO,al
asm("mov &__RESHI, &__RESLO");// MOV RESHI,RESLO
asm("mov #0, &__RESHI");// MOV #0,RESHI
asm("mov %B[src], &__OP2" : : [src] "r" (arg_b));// MOV bh,OP2
asm("mov %A[src], &__MAC" : : [src] "r" (arg_b));// MOV bl,MAC
asm("mov %B[src], &__OP2" : : [src] "r" (arg_a));// MOV ah,OP2
asm("mov %B[src], &__MAC" : : [src] "r" (arg_a));// MOV ah,MAC
asm("mov &__RESLO, %B[dst]" : [dst] "=r" (arg_a));// MOV RESLO,ah
asm("mov &__RESHI, &__RESLO");// MOV RESHI,RESLO
asm("mov &__SUMEXT, &__RESHI");// MOV SUMEXT,RESHI
asm("mov %B[src], &__OP2" : : [src] "r" (arg_b));// MOV bh,OP2
asm("mov &__RESLO, %A[dst]" : [dst] "=r" (arg_b));// MOV RESLO,bl
asm("mov &__RESHI, %B[dst]" : [dst] "=r" (arg_b));// MOV RESHI,bh
/*RET*/
}

Ralf

Reply via email to