The branch OpenSSL-fips-2_0-stable has been updated
       via  e576b67e1a79a846d45b641b73ee378db212d763 (commit)
       via  7d91d9ea6b2281d847e6dcb05e6e3bbd88b60404 (commit)
      from  e1a9268d81238aa12acfb9725a13c858c8937cd7 (commit)


- Log -----------------------------------------------------------------
commit e576b67e1a79a846d45b641b73ee378db212d763
Author: Andy Polyakov <[email protected]>
Date:   Fri Nov 25 11:52:06 2016 +0100

    c6x/* "facelift":
    
    - make scripts executable;
    - "parameterize" platform selection in c6x/do_fips;
    - add c6x/fips_algvs.mak;
    - add c6x/run6x.js launcher for more recent CCS versions;
    
    Reviewed-by: Rich Salz <[email protected]>
    Reviewed-by: Tim Hudson <[email protected]>
    Reviewed-by: Stephen Henson <[email protected]>
    (Merged from https://github.com/openssl/openssl/pull/4265)
    
    (cherry picked from commit 781280094ad389e8958631b97e70f498becbd9cb)

commit 7d91d9ea6b2281d847e6dcb05e6e3bbd88b60404
Author: Andy Polyakov <[email protected]>
Date:   Fri Nov 25 13:11:09 2016 +0100

    Add some C64x assembly modules [by minor adjustments of C64x+ modules].
    
    AES, SHA256 and SHA512 modules can actually replace corresponding
    C64x+ modules. This is because C64x+ instructions don't actually
    provide "killer-argument" advantage in these modules. As for SHA1,
    even though its performance is exactly the same, the C64x+ module is
    more responsive to interrupts, i.e. it doesn't inhibit them for
    periods as long as the C64x module does.
    
    Reviewed-by: Rich Salz <[email protected]>
    Reviewed-by: Tim Hudson <[email protected]>
    Reviewed-by: Stephen Henson <[email protected]>
    (Merged from https://github.com/openssl/openssl/pull/4265)
    
    (cherry picked from commit 5526e5791f1426553b6f4806d1ac82efd6ab33bc)

-----------------------------------------------------------------------

Summary of changes:
 Configure                                          |   3 +-
 c6x/do_fips                                        |   7 +-
 c6x/fips_algvs.mak                                 |  14 ++
 c6x/fips_standalone_sha1                           |   0
 c6x/incore6x                                       |   0
 c6x/run6x                                          |   0
 c6x/run6x.js                                       |  91 ++++++++
 crypto/aes/asm/{aes-c64xplus.pl => aes-c64x.pl}    | 176 ++++++++++------
 crypto/{c64xpluscpuid.pl => c64xcpuid.pl}          | 170 +++++++++++----
 crypto/sha/asm/sha1-c64x-large.pl                  | 230 +++++++++++++++++++++
 crypto/sha/asm/{sha1-c64xplus.pl => sha1-c64x.pl}  |  85 ++++----
 .../sha/asm/{sha256-c64xplus.pl => sha256-c64x.pl} |  49 +++--
 .../sha/asm/{sha512-c64xplus.pl => sha512-c64x.pl} |  75 ++++---
 test/fips_algvs.c                                  |   2 +-
 util/mk1mf.pl                                      |   2 +-
 15 files changed, 713 insertions(+), 191 deletions(-)
 mode change 100644 => 100755 c6x/do_fips
 create mode 100644 c6x/fips_algvs.mak
 mode change 100644 => 100755 c6x/fips_standalone_sha1
 mode change 100644 => 100755 c6x/incore6x
 mode change 100644 => 100755 c6x/run6x
 create mode 100755 c6x/run6x.js
 copy crypto/aes/asm/{aes-c64xplus.pl => aes-c64x.pl} (93%)
 copy crypto/{c64xpluscpuid.pl => c64xcpuid.pl} (56%)
 create mode 100644 crypto/sha/asm/sha1-c64x-large.pl
 copy crypto/sha/asm/{sha1-c64xplus.pl => sha1-c64x.pl} (85%)
 copy crypto/sha/asm/{sha256-c64xplus.pl => sha256-c64x.pl} (89%)
 copy crypto/sha/asm/{sha512-c64xplus.pl => sha512-c64x.pl} (89%)

diff --git a/Configure b/Configure
index 4fff98b..0753675 100755
--- a/Configure
+++ b/Configure
@@ -624,13 +624,14 @@ my %table=(
 "uClinux-dist64","$ENV{'CC'}:\$(CFLAGS)::-D_REENTRANT::\$(LDFLAGS) 
\$(LDLIBS):SIXTY_FOUR_BIT_LONG:${no_asm}:$ENV{'LIBSSL_dlfcn'}:linux-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):$ENV{'RANLIB'}::",
 
 "c64xplus","cl6x:-mv6400+ -o2 -ox -ms -pden -DNO_SYS_TYPES_H 
-DGETPID_IS_MEANINGLESS -DMD32_REG_T=int 
-DOPENSSL_SMALL_FOOTPRINT:<c6x.h>::DSPBIOS::BN_LLONG:c64xpluscpuid.o:bn-c64xplus.o
 c64xplus-gf2m.o::aes-c64xplus.o aes_cbc.o aes_ctr.o:::sha1-c64xplus.o 
sha256-c64xplus.o sha512-c64xplus.o:::::::ghash-c64xplus.o::void:",
+"c64x","cl6x:-mv6400 -o2 -ox -ms -as -pden -DNO_SYS_TYPES_H 
-DGETPID_IS_MEANINGLESS -DMD32_REG_T=int 
-DOPENSSL_SMALL_FOOTPRINT:<c6x.h>::DSPBIOS:::c64xcpuid.o:::aes-c64x.o aes_cbc.o 
aes_ctr.o:::sha1-c64x.o sha256-c64x.o sha512-c64x.o:::::::::void:",
 
 );
 
 my @MK1MF_Builds=qw(VC-WIN64I VC-WIN64A
                    debug-VC-WIN64I debug-VC-WIN64A
                    VC-NT VC-CE VC-WIN32 debug-VC-WIN32
-                   BC-32 c64xplus
+                   BC-32 c64xplus c64x
                    netware-clib netware-clib-bsdsock
                    netware-libc netware-libc-bsdsock);
 
diff --git a/c6x/do_fips b/c6x/do_fips
old mode 100644
new mode 100755
index c1c29fc..4045e60
--- a/c6x/do_fips
+++ b/c6x/do_fips
@@ -1,6 +1,11 @@
 #!/bin/sh
 
-perl Configure c64xplus fipscanisteronly no-engine
+if ! which cl6x > /dev/null 2>&1; then
+       echo 'fatal: cl6x is not on $PATH'
+       exit 1
+fi
+
+perl Configure ${C6XPLATFORM:-c64xplus} fipscanisteronly no-engine
 perl util/mkfiles.pl > MINFO
 perl util/mk1mf.pl auto > c6x/fips.mak
 make -f c6x/fips.mak
diff --git a/c6x/fips_algvs.mak b/c6x/fips_algvs.mak
new file mode 100644
index 0000000..7f67927
--- /dev/null
+++ b/c6x/fips_algvs.mak
@@ -0,0 +1,14 @@
+CC=cl6x
+CFLAGS=-mv$${C6XSILICON:-6400+} -o2 -I. -Ic6x/inc -Ifips -DNO_SYS_TYPES_H
+OBJ_D=c6x/tmp
+OUT_D=c6x
+
+all:   $(OUT_D)/fips_algvs.out
+
+$(OBJ_D)/fips_algvs.obj:       test/fips_algvs.c
+       $(CC) --obj_directory=$(OBJ_D) $(CFLAGS) -c $<
+
+$(OUT_D)/fips_algvs.out:       $(OBJ_D)/fips_algvs.obj 
$(OUT_D)/fipscanister.obj c6x/fips_algvs.cmd
+       $(OUT_D)/fips_standalone_sha1 -verify $(OUT_D)/fipscanister.obj
+       $(CC) -z -o $@ -m $(OUT_D)/fips_algvs.map $< $(OUT_D)/fipscanister.obj 
c6x/fips_algvs.cmd
+       $(OUT_D)/incore6x $@ || rm $@
diff --git a/c6x/fips_standalone_sha1 b/c6x/fips_standalone_sha1
old mode 100644
new mode 100755
diff --git a/c6x/incore6x b/c6x/incore6x
old mode 100644
new mode 100755
diff --git a/c6x/run6x b/c6x/run6x
old mode 100644
new mode 100755
diff --git a/c6x/run6x.js b/c6x/run6x.js
new file mode 100755
index 0000000..6d94949
--- /dev/null
+++ b/c6x/run6x.js
@@ -0,0 +1,91 @@
+#!/usr/bin/env dss.sh
+//
+// Debug Server Scripting C6x launcher.
+//
+
+importPackage(Packages.com.ti.debug.engine.scripting);
+importPackage(Packages.com.ti.ccstudio.scripting.environment);
+importPackage(Packages.java.lang);
+
+if (arguments.length == 0) {
+    // Extract script name from eclipse
+    var regex = new RegExp("-dss\\.rhinoArgs\n(.*)");
+    var matches = regex.exec(environment["eclipse.commands"]);
+
+    System.err.println("Usage: " + matches[1] + " executable [args]");
+    System.err.println();
+    System.err.println("You're also required to set CCSTARGETCONFIG " +
+                       "environment variable to appoint");
+    System.err.println("proper .ccxml file, customarily one of " +
+                       "$HOME/ti/CCSTargetConfigurations/*.ccxml");
+    quit(1);
+}
+
+try {
+    var prog = arguments[0];
+    var script = ScriptingEnvironment.instance();
+
+    var debugServer = script.getServer("DebugServer.1");
+
+    // CCSTARGETCONFIG environment variable should point at proper .ccxml,
+    // customarily one of $HOME/ti/CCSTargetConfigurations/*.ccxml.
+    debugServer.setConfig(System.getenv("CCSTARGETCONFIG"));
+
+    var debugSession = debugServer.openSession("*", "*");
+
+    // Redirect GEL output to |prog|.gel file, so that it doesn't clobber
+    // standard output from the program...
+    var dot = prog.lastIndexOf(".");
+    var gel_out = prog + ".gel";
+    if (dot > 0) {
+        gel_out = prog.substr(0,dot) + ".gel";
+    }
+    debugSession.expression.evaluate('GEL_EnableFileOutput("'
+                                      + gel_out + '", 0, 0)');
+
+    debugSession.target.connect();
+
+    // It should be noted that "current working directory" for program
+    // executed on the target system is one where |prog| resides, and
+    // not where script executed [as one would expect]...
+    debugSession.memory.loadProgram(prog, arguments);
+
+    // Pull exit()'s address and set breakpoint, then just execute till
+    // it's reached...
+    var exitAddr = debugSession.symbol.getAddress("exit");
+    debugSession.breakpoint.add(exitAddr);
+
+    while (1) {
+        debugSession.target.run();
+
+        var PC = debugSession.expression.evaluate("PC");
+        if (PC == exitAddr) {
+            break;
+        }
+    }
+
+    // Snatch value passed to exit(), so that it can be passed down to
+    // shell as exit code from this script...
+    var exitCode = debugSession.expression.evaluate("A4");
+
+    // Last run to termination...
+    debugSession.target.run();
+    // Clean up...
+    debugSession.terminate();
+    debugServer.stop();
+
+    // It should be noted that there is kind of a bug in C6x run-time.
+    // Return value from main() is not passed to last implicit exit()
+    // call [as it would on other systems], but instead constant 1 is
+    // passed, which conventionally indicates an error. So that if one
+    // wants to pass specific exit code, or even 0 indicating "success",
+    // one has to call exit() explicitly instead of relying on value
+    // returned by main()...
+    quit(exitCode);
+
+} catch (e) {
+    // We catch everything, because default handler terminates script with
+    // "success" exit code upon exception...
+    System.err.println(e.rhinoException);
+    quit(139);
+}
diff --git a/crypto/aes/asm/aes-c64xplus.pl b/crypto/aes/asm/aes-c64x.pl
similarity index 93%
copy from crypto/aes/asm/aes-c64xplus.pl
copy to crypto/aes/asm/aes-c64x.pl
index 206d7dc..0817128 100644
--- a/crypto/aes/asm/aes-c64xplus.pl
+++ b/crypto/aes/asm/aes-c64x.pl
@@ -7,9 +7,9 @@
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
-# [Endian-neutral] AES for C64x+.
+# [Endian-neutral] AES for C64x.
 #
-# Even though SPLOOPs are scheduled for 13 cycles, and thus expected
+# Even though loops are scheduled for 13 cycles, and thus expected
 # performance is ~8.5 cycles per byte processed with 128-bit key,
 # measured performance turned to be ~10 cycles per byte. Discrepancy
 # must be caused by limitations of L1D memory banking(*), see SPRU871
@@ -45,6 +45,18 @@ open STDOUT,">$output";
 $code=<<___;
        .text
 
+       .if     .ASSEMBLER_VERSION<7000000
+       .asg    0,__TI_EABI__
+       .endif
+       .if     __TI_EABI__
+       .nocmp
+       .asg    AES_encrypt,_AES_encrypt
+       .asg    AES_decrypt,_AES_decrypt
+       .asg    AES_set_encrypt_key,_AES_set_encrypt_key
+       .asg    AES_set_decrypt_key,_AES_set_decrypt_key
+       .asg    AES_ctr32_encrypt,_AES_ctr32_encrypt
+       .endif
+
        .asg    B3,RA
        .asg    A4,INP
        .asg    B4,OUT
@@ -75,13 +87,23 @@ _AES_encrypt:
        .asmfunc
        MVK     1,B2
 __encrypt:
+       .if     __TI_EABI__
    [B2]        LDNDW   *INP++,A9:A8                    ; load input
-||     MVKL    (AES_Te-_AES_encrypt),$TEA
-||     ADDKPC  _AES_encrypt,B0
+||     MVKL    \$PCR_OFFSET(AES_Te,__encrypt),$TEA
+||     ADDKPC  __encrypt,B0
    [B2]        LDNDW   *INP++,B9:B8
-||     MVKH    (AES_Te-_AES_encrypt),$TEA
+||     MVKH    \$PCR_OFFSET(AES_Te,__encrypt),$TEA
 ||     ADD     0,KEY,$KPA
 ||     ADD     4,KEY,$KPB
+       .else
+   [B2]        LDNDW   *INP++,A9:A8                    ; load input
+||     MVKL    (AES_Te-__encrypt),$TEA
+||     ADDKPC  __encrypt,B0
+   [B2]        LDNDW   *INP++,B9:B8
+||     MVKH    (AES_Te-__encrypt),$TEA
+||     ADD     0,KEY,$KPA
+||     ADD     4,KEY,$KPB
+       .endif
        LDW     *$KPA++[2],$Te0[0]              ; zero round key
 ||     LDW     *$KPB++[2],$Te0[1]
 ||     MVK     60,A0
@@ -107,15 +129,14 @@ __encrypt:
 ||     XOR     $Te0[1],$s[1],$s[1]
 ||     LDW     *$KPA++[2],$K[0]                ; 1st round key
 ||     LDW     *$KPB++[2],$K[1]
-       SUB     B0,2,B0
 
-       SPLOOPD 13
-||     MVC     B0,ILC
-||     LDW     *$KPA++[2],$K[2]
+       LDW     *$KPA++[2],$K[2]
 ||     LDW     *$KPB++[2],$K[3]
-;;====================================================================
-       EXTU    $s[1],EXT1,24,$Te1[1]
+||     EXTU    $s[1],EXT1,24,$Te1[1]
 ||     EXTU    $s[0],EXT3,24,$Te3[0]
+||     SUB     B0,1,B0
+;;====================================================================
+enc_loop?:
        LDW     *${TEB}[$Te1[1]],$Te1[1]        ; Te1[s1>>8],   t0
 ||     LDW     *${TEA}[$Te3[0]],$Te3[0]        ; Te3[s0>>24],  t1
 ||     XOR     $s[2],$Te0[2],$s[2]             ; modulo-scheduled
@@ -150,12 +171,14 @@ __encrypt:
 ||     ROTL    $Te1[0],TBL1,$Te3[1]            ; t3
 ||     EXTU    $s[2],EXT0,24,$Te0[2]
 ||     EXTU    $s[3],EXT0,24,$Te0[3]
+|| [B0]        SUB     B0,1,B0
        LDW     *${TEA}[$Te0[2]],$Te0[2]        ; Te0[s2],      t2
 ||     LDW     *${TEB}[$Te0[3]],$Te0[3]        ; Te0[s3],      t3
 ||     ROTL    $Te2[2],TBL2,$Te2[2]            ; t0
 ||     ROTL    $Te2[3],TBL2,$Te2[3]            ; t1
 ||     XOR     $K[0],$Te3[0],$s[0]
 ||     XOR     $K[1],$Te1[1],$s[1]
+|| [B0]        BNOP    enc_loop?
        ROTL    $Te3[3],TBL3,$Te1[2]            ; t0
 ||     ROTL    $Te1[2],TBL1,$Te3[3]            ; t1
 ||     XOR     $K[2],$Te1[0],$s[2]
@@ -176,14 +199,13 @@ __encrypt:
 ||     XOR     $s[3],$Te2[1],$s[3]
 ||     XOR     $s[0],$Te0[0],$s[0]
 ||     XOR     $s[1],$Te0[1],$s[1]
-       SPKERNEL
-||     XOR.L   $s[2],$Te3[2],$s[2]
-||     XOR.L   $s[3],$Te1[3],$s[3]
-;;====================================================================
-       ADD.D   ${TEA},A0,${TEA}                ; point to Te4
-||     ADD.D   ${TEB},A0,${TEB}
+       XOR     $s[2],$Te3[2],$s[2]
+||     XOR     $s[3],$Te1[3],$s[3]
 ||     EXTU    $s[1],EXT1,24,$Te1[1]
 ||     EXTU    $s[0],EXT3,24,$Te3[0]
+||[!B0]        ADD     ${TEA},A0,${TEA}                ; point to Te4
+||[!B0]        ADD     ${TEB},A0,${TEB}
+;;====================================================================
        LDBU    *${TEB}[$Te1[1]],$Te1[1]        ; Te1[s1>>8],   t0
 ||     LDBU    *${TEA}[$Te3[0]],$Te3[0]        ; Te3[s0>>24],  t1
 ||     XOR     $s[2],$Te0[2],$s[2]             ; modulo-scheduled
@@ -277,13 +299,23 @@ _AES_decrypt:
        .asmfunc
        MVK     1,B2
 __decrypt:
+       .if     __TI_EABI__
+   [B2]        LDNDW   *INP++,A9:A8                    ; load input
+||     MVKL    \$PCR_OFFSET(AES_Td,__decrypt),$TEA
+||     ADDKPC  __decrypt,B0
+   [B2]        LDNDW   *INP++,B9:B8
+||     MVKH    \$PCR_OFFSET(AES_Td,__decrypt),$TEA
+||     ADD     0,KEY,$KPA
+||     ADD     4,KEY,$KPB
+       .else
    [B2]        LDNDW   *INP++,A9:A8                    ; load input
-||     MVKL    (AES_Td-_AES_decrypt),$TEA
-||     ADDKPC  _AES_decrypt,B0
+||     MVKL    (AES_Td-__decrypt),$TEA
+||     ADDKPC  __decrypt,B0
    [B2]        LDNDW   *INP++,B9:B8
-||     MVKH    (AES_Td-_AES_decrypt),$TEA
+||     MVKH    (AES_Td-__decrypt),$TEA
 ||     ADD     0,KEY,$KPA
 ||     ADD     4,KEY,$KPB
+       .endif
        LDW     *$KPA++[2],$Td0[0]              ; zero round key
 ||     LDW     *$KPB++[2],$Td0[1]
 ||     MVK     60,A0
@@ -309,15 +341,14 @@ __decrypt:
 ||     XOR     $Td0[1],$s[1],$s[1]
 ||     LDW     *$KPA++[2],$K[0]                ; 1st round key
 ||     LDW     *$KPB++[2],$K[1]
-       SUB     B0,2,B0
 
-       SPLOOPD 13
-||     MVC     B0,ILC
-||     LDW     *$KPA++[2],$K[2]
+       LDW     *$KPA++[2],$K[2]
 ||     LDW     *$KPB++[2],$K[3]
-;;====================================================================
-       EXTU    $s[1],EXT3,24,$Td3[1]
+||     EXTU    $s[1],EXT3,24,$Td3[1]
 ||     EXTU    $s[0],EXT1,24,$Td1[0]
+||     SUB     B0,1,B0
+;;====================================================================
+dec_loop?:
        LDW     *${TEB}[$Td3[1]],$Td3[1]        ; Td3[s1>>24],  t0
 ||     LDW     *${TEA}[$Td1[0]],$Td1[0]        ; Td1[s0>>8],   t1
 ||     XOR     $s[2],$Td0[2],$s[2]             ; modulo-scheduled
@@ -352,12 +383,14 @@ __decrypt:
 ||     ROTL    $Td3[0],TBL3,$Td1[1]            ; t3
 ||     EXTU    $s[2],EXT0,24,$Td0[2]
 ||     EXTU    $s[3],EXT0,24,$Td0[3]
+|| [B0]        SUB     B0,1,B0
        LDW     *${TEA}[$Td0[2]],$Td0[2]        ; Td0[s2],      t2
 ||     LDW     *${TEB}[$Td0[3]],$Td0[3]        ; Td0[s3],      t3
 ||     ROTL    $Td2[2],TBL2,$Td2[2]            ; t0
 ||     ROTL    $Td2[3],TBL2,$Td2[3]            ; t1
 ||     XOR     $K[0],$Td1[0],$s[0]
 ||     XOR     $K[1],$Td3[1],$s[1]
+|| [B0]        BNOP    dec_loop?
        ROTL    $Td1[3],TBL1,$Td3[2]            ; t0
 ||     ROTL    $Td3[2],TBL3,$Td1[3]            ; t1
 ||     XOR     $K[2],$Td3[0],$s[2]
@@ -378,14 +411,13 @@ __decrypt:
 ||     XOR     $s[3],$Td2[1],$s[3]
 ||     XOR     $s[0],$Td0[0],$s[0]
 ||     XOR     $s[1],$Td0[1],$s[1]
-       SPKERNEL
-||     XOR.L   $s[2],$Td1[2],$s[2]
-||     XOR.L   $s[3],$Td3[3],$s[3]
-;;====================================================================
-       ADD.D   ${TEA},A0,${TEA}                ; point to Td4
-||     ADD.D   ${TEB},A0,${TEB}
+       XOR     $s[2],$Td1[2],$s[2]
+||     XOR     $s[3],$Td3[3],$s[3]
 ||     EXTU    $s[1],EXT3,24,$Td3[1]
 ||     EXTU    $s[0],EXT1,24,$Td1[0]
+||[!B0]        ADD     ${TEA},A0,${TEA}                ; point to Td4
+||[!B0]        ADD     ${TEB},A0,${TEB}
+;;====================================================================
        LDBU    *${TEB}[$Td3[1]],$Td3[1]        ; Td3[s1>>24],  t0
 ||     LDBU    *${TEA}[$Td1[0]],$Td1[0]        ; Td1[s0>>8],   t1
 ||     XOR     $s[2],$Td0[2],$s[2]             ; modulo-scheduled
@@ -515,17 +547,26 @@ __set_encrypt_key:
    [B0]        B       key256?
 || [A1]        LDNDW   *INP++,B19:B18
 
+       .if     __TI_EABI__
    [A0]        ADD     0,KEY,$KPA
 || [A0]        ADD     4,KEY,$KPB
-|| [A0]        MVKL    (AES_Te4-_AES_set_encrypt_key),$TEA
-|| [A0]        ADDKPC  _AES_set_encrypt_key,B6
-   [A0]        MVKH    (AES_Te4-_AES_set_encrypt_key),$TEA
+|| [A0]        MVKL    \$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA
+|| [A0]        ADDKPC  __set_encrypt_key,B6
+   [A0]        MVKH    \$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA
    [A0]        ADD     B6,$TEA,$TEA                    ; AES_Te4
+       .else
+   [A0]        ADD     0,KEY,$KPA
+|| [A0]        ADD     4,KEY,$KPB
+|| [A0]        MVKL    (AES_Te4-__set_encrypt_key),$TEA
+|| [A0]        ADDKPC  __set_encrypt_key,B6
+   [A0]        MVKH    (AES_Te4-__set_encrypt_key),$TEA
+   [A0]        ADD     B6,$TEA,$TEA                    ; AES_Te4
+       .endif
        NOP
        NOP
 
        BNOP    RA,5
-||     MVK     -2,RET                          ; unknown bit lenght
+||     MVK     -2,RET                          ; unknown bit length
 ||     MVK     0,B0                            ; redundant
 ;;====================================================================
 ;;====================================================================
@@ -543,13 +584,12 @@ key128?:
        .endif
 
        MVK     256,A0
-||     MVK     9,B0
+||     MVK     8,B0
 
-       SPLOOPD 14
-||     MVC     B0,ILC
-||     MV      $TEA,$TEB
+       MV      $TEA,$TEB
 ||     ADD     $TEA,A0,A30                     ; rcon
 ;;====================================================================
+loop128?:
        LDW     *A30++[1],A31                   ; rcon[i]
 ||     MV      $Te4[2],$K[2]
 ||     EXTU    $K[3],EXT1,24,$Te4[0]
@@ -576,10 +616,12 @@ key128?:
        .if     .BIG_ENDIAN
        PACK2   $Te4[0],$Te4[1],$Te4[1]
        PACK2   $Te4[3],A0,$Te4[3]
+||     BDEC    loop128?,B0
        PACKL4  $Te4[1],$Te4[3],$Te4[3]
        .else
        PACK2   $Te4[1],$Te4[0],$Te4[1]
        PACK2   $Te4[3],A0,$Te4[3]
+||     BDEC    loop128?,B0
        PACKL4  $Te4[3],$Te4[1],$Te4[3]
        .endif
        XOR     $Te4[3],$K[0],$Te4[0]           ; K[0]
@@ -587,7 +629,6 @@ key128?:
        MV      $Te4[0],$K[0]
 ||     XOR     $K[1],$K[2],$Te4[2]             ; K[2]
        XOR     $Te4[2],$K[3],$K[3]             ; K[3]
-       SPKERNEL
 ;;====================================================================
        BNOP    RA
        MV      $Te4[2],$K[2]
@@ -802,17 +843,15 @@ _AES_set_decrypt_key:
 ret?:                                          ; B0 holds rounds or zero
   [!B0]        BNOP    B31                             ; return if zero
    [B0]        SHL     B0,4,A0                         ; offset to last round 
key
-   [B0]        SHRU    B0,1,B1
-   [B0]        SUB     B1,1,B1
-   [B0]        MVK     0x0000001B,B3                   ; AES polynomial
+   [B0]        SHRU    B0,1,B2
+   [B0]        SUB     B2,2,B2
+|| [B0]        MVK     0x0000001B,B3                   ; AES polynomial
    [B0]        MVKH    0x07000000,B3
-
-       SPLOOPD 9                               ; flip round keys
-||     MVC     B1,ILC
-||     MV      B30,$KPA
-||     ADD     B30,A0,$KPB
-||     MVK     16,A0                           ; sizeof(round key)
+|| [B0]        MV      B30,$KPA
+   [B0]        ADD     B30,A0,$KPB
+|| [B0]        MVK     16,A0                           ; sizeof(round key)
 ;;====================================================================
+flip_loop?:
        LDW     *${KPA}[0],A16
 ||     LDW     *${KPB}[0],B16
        LDW     *${KPA}[1],A17
@@ -823,6 +862,7 @@ ret?:                                               ; B0 
holds rounds or zero
 ||     ADD     $KPA,A0,$KPA
 ||     LDW     *${KPB}[3],B19
 ||     SUB     $KPB,A0,$KPB
+||     BDEC    flip_loop?,B2
        NOP
        STW     B16,*${KPA}[-4]
 ||     STW     A16,*${KPB}[4]
@@ -832,7 +872,6 @@ ret?:                                               ; B0 
holds rounds or zero
 ||     STW     A18,*${KPB}[6]
        STW     B19,*${KPA}[-1]
 ||     STW     A19,*${KPB}[7]
-       SPKERNEL
 ;;====================================================================
        SUB     B0,1,B0                         ; skip last round
 ||     ADD     B30,A0,$KPA                     ; skip first round
@@ -847,10 +886,9 @@ ret?:                                              ; B0 
holds rounds or zero
 ||     MVK     0x00000B0B,B24
        MVKH    0x09090000,A24
 ||     MVKH    0x0B0B0000,B24
-       MVC     B0,ILC
-||     SUB     B0,1,B0
+       SUB     B0,1,B0
 
-       GMPY4   $K[0],A24,$Kx9[0]               ; �0x09
+       GMPY4   $K[0],A24,$Kx9[0]               ; ·0x09
 ||     GMPY4   $K[1],A24,$Kx9[1]
 ||     MVK     0x00000D0D,A25
 ||     MVK     0x00000E0E,B25
@@ -859,14 +897,14 @@ ret?:                                             ; B0 
holds rounds or zero
 ||     MVKH    0x0D0D0000,A25
 ||     MVKH    0x0E0E0000,B25
 
-       GMPY4   $K[0],B24,$KxB[0]               ; �0x0B
+       GMPY4   $K[0],B24,$KxB[0]               ; ·0x0B
 ||     GMPY4   $K[1],B24,$KxB[1]
        GMPY4   $K[2],B24,$KxB[2]
 ||     GMPY4   $K[3],B24,$KxB[3]
 
-       SPLOOP  11                              ; InvMixColumns
 ;;====================================================================
-       GMPY4   $K[0],A25,$KxD[0]               ; �0x0D
+invmix_loop?:
+       GMPY4   $K[0],A25,$KxD[0]               ; ·0x0D
 ||     GMPY4   $K[1],A25,$KxD[1]
 ||     SWAP2   $Kx9[0],$Kx9[0]                 ; rotate by 16
 ||     SWAP2   $Kx9[1],$Kx9[1]
@@ -883,7 +921,7 @@ ret?:                                               ; B0 
holds rounds or zero
 || [B0]        LDW     *${KPA}[6],$K[2]
 || [B0]        LDW     *${KPB}[7],$K[3]
 
-       GMPY4   $s[0],B25,$KxE[0]               ; �0x0E
+       GMPY4   $s[0],B25,$KxE[0]               ; ·0x0E
 ||     GMPY4   $s[1],B25,$KxE[1]
 ||     XOR     $Kx9[0],$KxB[0],$KxB[0]
 ||     XOR     $Kx9[1],$KxB[1],$KxB[1]
@@ -900,10 +938,11 @@ ret?:                                             ; B0 
holds rounds or zero
 ||     ROTL    $KxB[3],TBL3,$KxB[3]
 ||     SWAP2   $KxD[2],$KxD[2]
 ||     SWAP2   $KxD[3],$KxD[3]
+|| [B0]        B       invmix_loop?
 
        XOR     $KxE[0],$KxD[0],$KxE[0]
 ||     XOR     $KxE[1],$KxD[1],$KxE[1]
-|| [B0]        GMPY4   $K[0],A24,$Kx9[0]               ; �0x09
+|| [B0]        GMPY4   $K[0],A24,$Kx9[0]               ; ·0x09
 || [B0]        GMPY4   $K[1],A24,$Kx9[1]
 ||     ADDAW   $KPA,4,$KPA
        XOR     $KxE[2],$KxD[2],$KxE[2]
@@ -914,7 +953,7 @@ ret?:                                               ; B0 
holds rounds or zero
 
        XOR     $KxB[0],$KxE[0],$KxE[0]
 ||     XOR     $KxB[1],$KxE[1],$KxE[1]
-|| [B0]        GMPY4   $K[0],B24,$KxB[0]               ; �0x0B
+|| [B0]        GMPY4   $K[0],B24,$KxB[0]               ; ·0x0B
 || [B0]        GMPY4   $K[1],B24,$KxB[1]
        XOR     $KxB[2],$KxE[2],$KxE[2]
 ||     XOR     $KxB[3],$KxE[3],$KxE[3]
@@ -925,7 +964,6 @@ ret?:                                               ; B0 
holds rounds or zero
        STW     $KxE[2],*${KPA}[-2]
 ||     STW     $KxE[3],*${KPB}[-1]
 || [B0]        SUB     B0,1,B0
-       SPKERNEL
 ;;====================================================================
        BNOP    B31,3
        MVC     B30,GFPGFR                      ; restore GFPGFR(*)
@@ -943,7 +981,8 @@ _AES_ctr32_encrypt:
        .asmfunc
        LDNDW   *${ivp}[0],A31:A30      ; load counter value
 ||     MV      $blocks,A2              ; reassign $blocks
-||     DMV     RA,$key,B27:B26         ; reassign RA and $key
+||     MV      RA,B27                  ; reassign RA
+||     MV      $key,B26                ; reassign $key
        LDNDW   *${ivp}[1],B31:B30
 ||     MVK     0,B2                    ; don't let __encrypt load input
 ||     MVK     0,A1                    ; and postpone writing output
@@ -965,13 +1004,15 @@ ctr32_loop?:
 || [A2]        LDNDW   *INP++,B29:B28
        .if     .BIG_ENDIAN
    [A1]        STNDW   A9:A8,*OUT++            ; save output
-|| [A2]        DMV     A31,A30,A9:A8           ; pass counter value to 
__encrypt
+|| [A2]        MV      A31,A9                  ; pass counter value to 
__encrypt
+|| [A2]        MV      A30,A8                  ; pass counter value to 
__encrypt
    [A1]        STNDW   B9:B8,*OUT++
 || [A2]        DMV     B31,B30,B9:B8
 || [A2]        ADD     B30,1,B30               ; counter++
        .else
    [A1]        STNDW   A9:A8,*OUT++            ; save output
-|| [A2]        DMV     A31,A30,A9:A8
+|| [A2]        MV      A31,A9
+|| [A2]        MV      A30,A8
 || [A2]        SWAP2   B31,B0
 || [A2]        ADD     B31,1,B31               ; counter++
    [A1]        STNDW   B9:B8,*OUT++
@@ -989,7 +1030,11 @@ ___
 }
 # Tables are kept in endian-neutral manner
 $code.=<<___;
+       .if     __TI_EABI__
+       .sect   ".text:aes_asm.const"
+       .else
        .sect   ".const:aes_asm"
+       .endif
        .align  128
 AES_Te:
        .byte   0xc6,0x63,0x63,0xa5,    0xf8,0x7c,0x7c,0x84
@@ -1322,8 +1367,9 @@ AES_Td4:
        .byte   0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
        .byte   0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
        .byte   0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
-       .cstring "AES for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+       .cstring "AES for C64x, CRYPTOGAMS by <appro\@openssl.org>"
        .align  4
 ___
 
 print $code;
+close STDOUT;
diff --git a/crypto/c64xpluscpuid.pl b/crypto/c64xcpuid.pl
similarity index 56%
copy from crypto/c64xpluscpuid.pl
copy to crypto/c64xcpuid.pl
index 067b693..88fd153 100644
--- a/crypto/c64xpluscpuid.pl
+++ b/crypto/c64xcpuid.pl
@@ -1,5 +1,10 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
 #
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
 
 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
@@ -7,17 +12,39 @@ open STDOUT,">$output";
 $code.=<<___;
        .text
 
+       .if     .ASSEMBLER_VERSION<7000000
+       .asg    0,__TI_EABI__
+       .endif
+       .if     __TI_EABI__
+       .asg    OPENSSL_rdtsc,_OPENSSL_rdtsc
+       .asg    OPENSSL_cleanse,_OPENSSL_cleanse
+       .asg    CRYPTO_memcmp,_CRYPTO_memcmp
+       .asg    OPENSSL_atomic_add,_OPENSSL_atomic_add
+       .asg    OPENSSL_wipe_cpu,_OPENSSL_wipe_cpu
+       .asg    OPENSSL_instrument_bus,_OPENSSL_instrument_bus
+       .asg    OPENSSL_instrument_bus2,_OPENSSL_instrument_bus2
+       .endif
+
        .asg    B3,RA
+       .asg    0x01AC0000,TIMER_BASE   ; Timer 2
 
        .global _OPENSSL_rdtsc
 _OPENSSL_rdtsc:
        .asmfunc
-       B       RA
-       MVC     TSCL,B0
-       MVC     TSCH,B1
-  [!B0]        MVC     B0,TSCL         ; start TSC
-       MV      B0,A4
-       MV      B1,A5
+       MVKL    TIMER_BASE,A5
+       MVKH    TIMER_BASE,A5
+       LDW     *A5[0],A2       ; load CTL
+       LDW     *A5[2],A4       ; load CTN
+       NOP     2
+       .if     .BIG_ENDIAN
+       MVK     0x2c0,A7        ; internal clock source, don't hold, go
+||     MVK     -1,A6           ; maximum period
+       .else
+       MVK     0x2c0,A6        ; internal clock source, don't hold, go
+||     MVK     -1,A7           ; maximum period
+       .endif
+  [!A2]        STDW    A7:A6,*A5[0]    ; fire it up
+||     BNOP    RA,5
        .endasmfunc
 
        .global _OPENSSL_cleanse
@@ -28,28 +55,34 @@ _OPENSSL_cleanse:
 ||     SHRU    B4,3,B0         ; is length >= 8
 ||     ADD     1,A4,B6
   [!B0]        BNOP    RA
+|| [B0]        SUB     B0,1,B2
 ||     ZERO    A1
 ||     ZERO    B1
-   [B0]        MVC     B0,ILC
+   [B2]        BDEC    cleanse_loop?,B2
 ||[!B0]        CMPLT   0,B4,A1
 ||[!B0]        CMPLT   1,B4,B1
+||     ZERO    B5
    [A1]        STB     A2,*A4++[2]
-|| [B1] STB    B2,*B6++[2]
+|| [B1] STB    B5,*B6++[2]
+|| [B2]        BDEC    cleanse_loop?,B2
 ||[!B0]        CMPLT   2,B4,A1
 ||[!B0]        CMPLT   3,B4,B1
    [A1]        STB     A2,*A4++[2]
-|| [B1] STB    B2,*B6++[2]
+|| [B1] STB    B5,*B6++[2]
+|| [B2]        BDEC    cleanse_loop?,B2
 ||[!B0]        CMPLT   4,B4,A1
 ||[!B0]        CMPLT   5,B4,B1
    [A1]        STB     A2,*A4++[2]
-|| [B1] STB    B2,*B6++[2]
+|| [B1] STB    B5,*B6++[2]
+|| [B2]        BDEC    cleanse_loop?,B2
 ||[!B0]        CMPLT   6,B4,A1
    [A1]        STB     A2,*A4++[2]
+|| [B2]        BDEC    cleanse_loop?,B2
 
-       SPLOOP  1
+cleanse_loop?:
        STNDW   A3:A2,*A4++
 ||     SUB     B4,8,B4
-       SPKERNEL
+|| [B2]        BDEC    cleanse_loop?,B2
 
        MV      B4,B0           ; remaining bytes
 ||     ADD     1,A4,B6
@@ -57,33 +90,73 @@ _OPENSSL_cleanse:
    [B0]        CMPLT   0,B0,A1
 || [B0]        CMPLT   1,B0,B1
    [A1]        STB     A2,*A4++[2]
-|| [B1] STB    B2,*B6++[2]
+|| [B1] STB    B5,*B6++[2]
 || [B0]        CMPLT   2,B0,A1
 || [B0]        CMPLT   3,B0,B1
    [A1]        STB     A2,*A4++[2]
-|| [B1] STB    B2,*B6++[2]
+|| [B1] STB    B5,*B6++[2]
 || [B0]        CMPLT   4,B0,A1
 || [B0]        CMPLT   5,B0,B1
    [A1]        STB     A2,*A4++[2]
-|| [B1] STB    B2,*B6++[2]
+|| [B1] STB    B5,*B6++[2]
 || [B0]        CMPLT   6,B0,A1
    [A1]        STB     A2,*A4++[2]
        .endasmfunc
 
+       .if     0
+       .global _CRYPTO_memcmp
+_CRYPTO_memcmp:
+       .asmfunc
+       MV      A6,B0
+  [!B0]        BNOP    RA
+||[!B0]        ZERO    A4
+|| [B0]        ZERO    A1:A0
+   [B0]        LDBU    *A4++,A5
+|| [B0]        LDBU    *B4++,B5
+|| [B0]        BDEC    memcmp_loop?,B0
+   [B0]        LDBU    *A4++,A5
+|| [B0]        LDBU    *B4++,B5
+|| [B0]        BDEC    memcmp_loop?,B0
+   [B0]        LDBU    *A4++,A5
+|| [B0]        LDBU    *B4++,B5
+|| [B0]        BDEC    memcmp_loop?,B0
+   [B0]        LDBU    *A4++,A5
+|| [B0]        LDBU    *B4++,B5
+|| [B0]        BDEC    memcmp_loop?,B0
+   [B0]        LDBU    *A4++,A5
+|| [B0]        LDBU    *B4++,B5
+|| [B0]        BDEC    memcmp_loop?,B0
+       XOR     A5,B5,A1
+|| [B0]        LDBU    *A4++,A5
+|| [B0]        LDBU    *B4++,B5
+|| [B0]        BDEC    memcmp_loop?,B0
+
+memcmp_loop?:
+       OR      A1,A0,A0
+||     XOR     A5,B5,A1
+|| [B0]        LDBU    *A4++,A5
+|| [B0]        LDBU    *B4++,B5
+|| [B0]        BDEC    memcmp_loop?,B0
+
+       BNOP    RA,3
+       ZERO    A4
+  [A0] MVK     1,A4
+       .endasmfunc
+       .endif
+
        .global _OPENSSL_atomic_add
 _OPENSSL_atomic_add:
        .asmfunc
-       MV      A4,B0
-atomic_add?:
-       LL      *B0,B5
-       NOP     4
+       BNOP    atomic_store?   ; pre-C64x+ systems are uni-processor, it's
+||     LDW     *A4,B5          ; enough to hold interrupts off through
+                               ; the load-update-store cycle to achieve
+                               ; atomicity
+       NOP
+       BNOP    RA,3            ; and this branch stretches even over store
        ADD     B4,B5,B5
-       SL      B5,*B0
-       CMTL    *B0,B1
-       NOP     4
-  [!B1]        B       atomic_add?
-   [B1]        BNOP    RA,4
-       MV      B5,A4
+atomic_store?:
+       STW     B5,*A4
+||     MV      B5,A4
        .endasmfunc
 
        .global _OPENSSL_wipe_cpu
@@ -150,35 +223,34 @@ _OPENSSL_instrument_bus:
        MV      B4,B0                   ; reassign sizeof(output)
 ||     MV      A4,B4                   ; reassign output
 ||     MVK     0x00004030,A3
+||     MVKL    TIMER_BASE,B16
        MV      B0,A4                   ; return value
 ||     MVK     1,A1
 ||     MVKH    0x01840000,A3           ; L1DWIBAR
-       MVC     TSCL,B8                 ; collect 1st tick
+||     MVKH    TIMER_BASE,B16
+       LDW     *B16[2],B8              ; collect 1st tick
 ||     MVK     0x00004010,A5
+       NOP     4
        MV      B8,B9                   ; lasttick = tick
 ||     MVK     0,B7                    ; lastdiff = 0
 ||     MVKH    0x01840000,A5           ; L2WIBAR
        CLFLUSH A3,B4,A1                ; write-back and invalidate L1D line
        CLFLUSH A5,B4,A1                ; write-back and invalidate L2 line
-       LL      *B4,B5
+       LDW     *B4,B5
        NOP     4
        ADD     B7,B5,B5
-       SL      B5,*B4
-       CMTL    *B4,B1
-       NOP     4
        STW     B5,*B4
 bus_loop1?:
-       MVC     TSCL,B8
+       LDW     *B16[2],B8
 || [B0]        SUB     B0,1,B0
+       NOP     4
        SUB     B8,B9,B7                ; lastdiff = tick - lasttick
 ||     MV      B8,B9                   ; lasttick = tick
        CLFLUSH A3,B4,A1                ; write-back and invalidate L1D line
        CLFLUSH A5,B4,A1                ; write-back and invalidate L2 line
-       LL      *B4,B5
+       LDW     *B4,B5
        NOP     4
        ADD     B7,B5,B5
-       SL      B5,*B4
-       CMTL    *B4,B1
        STW     B5,*B4                  ; [!B1] is removed to flatten samples
 ||     ADDK    4,B4
 || [B0]        BNOP    bus_loop1?,5
@@ -192,42 +264,42 @@ _OPENSSL_instrument_bus2:
        MV      A6,B0                   ; reassign max
 ||     MV      B4,A6                   ; reassign sizeof(output)
 ||     MVK     0x00004030,A3
+||     MVKL    TIMER_BASE,B16
        MV      A4,B4                   ; reassign output
 ||     MVK     0,A4                    ; return value
 ||     MVK     1,A1
 ||     MVKH    0x01840000,A3           ; L1DWIBAR
+||     MVKH    TIMER_BASE,B16
 
-       MVC     TSCL,B8                 ; collect 1st tick
+       LDW     *B16[2],B8              ; collect 1st tick
 ||     MVK     0x00004010,A5
+       NOP     4
        MV      B8,B9                   ; lasttick = tick
 ||     MVK     0,B7                    ; lastdiff = 0
 ||     MVKH    0x01840000,A5           ; L2WIBAR
        CLFLUSH A3,B4,A1                ; write-back and invalidate L1D line
        CLFLUSH A5,B4,A1                ; write-back and invalidate L2 line
-       LL      *B4,B5
+       LDW     *B4,B5
        NOP     4
        ADD     B7,B5,B5
-       SL      B5,*B4
-       CMTL    *B4,B1
-       NOP     4
        STW     B5,*B4
 
-       MVC     TSCL,B8                 ; collect 1st diff
+       LDW     *B16[2],B8              ; collect 1st diff
+       NOP     4
        SUB     B8,B9,B7                ; lastdiff = tick - lasttick
 ||     MV      B8,B9                   ; lasttick = tick
 ||     SUB     B0,1,B0
 bus_loop2?:
        CLFLUSH A3,B4,A1                ; write-back and invalidate L1D line
        CLFLUSH A5,B4,A1                ; write-back and invalidate L2 line
-       LL      *B4,B5
+       LDW     *B4,B5
        NOP     4
        ADD     B7,B5,B5
-       SL      B5,*B4
-       CMTL    *B4,B1
        STW     B5,*B4                  ; [!B1] is removed to flatten samples
 ||[!B0]        BNOP    bus_loop2_done?,2
 ||     SUB     B0,1,B0
-       MVC     TSCL,B8
+       LDW     *B16[2],B8
+       NOP     4
        SUB     B8,B9,B8
 ||     MV      B8,B9
        CMPEQ   B8,B7,B2
@@ -240,6 +312,14 @@ bus_loop2?:
 bus_loop2_done?:
        BNOP    RA,5
        .endasmfunc
+
+       .if     __TI_EABI__
+       .sect   ".init_array"
+       .else
+       .sect   ".pinit"
+       .endif
+       .align  4
+       .long   _OPENSSL_rdtsc          ; auto-start timer
 ___
 
 print $code;
diff --git a/crypto/sha/asm/sha1-c64x-large.pl 
b/crypto/sha/asm/sha1-c64x-large.pl
new file mode 100644
index 0000000..3916ff3
--- /dev/null
+++ b/crypto/sha/asm/sha1-c64x-large.pl
@@ -0,0 +1,230 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA1 for C64x.
+#
+# November 2016
+#
+# This is fully-unrolled SHA1 implementation. It's 25% faster than
+# one with compact loops, doesn't use in-memory ring buffer, as
+# everything is accommodated in registers, and has "perfect" interrupt
+# agility. Drawback is obviously the code size...
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($CTX,$INP,$NUM) = ("A4","B4","A6");           # arguments
+
+($A,$B,$C,$D,$E, $Arot,$F,$F0,$K) = map("A$_",(16..20, 21..24));
+@V = ($A,$B,$C,$D,$E);
+@X = map("B$_",(16..31));
+($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9)); # zaps $NUM
+
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e) = @_;
+my $j = ($i+1)&15;
+
+$code.=<<___                           if ($i<14);
+       ROTL    $a,5,$Arot              ;; $i
+||     AND     $c,$b,$F
+||     ANDN    $d,$b,$F0
+||     ADD     $K,$e,$e                ; E+=K
+||      LDNW   *${INP}++,@X[$i+2]
+       OR      $F0,$F,$F               ; F_00_19(B,C,D)
+||     ROTL    $b,30,$b
+||      SWAP2  @X[$i+1],@X[$i+1]
+||     ADD     @X[$i],$e,$e            ; E+=X[i]
+       ADD     $Arot,$e,$e             ; E+=rot(A,5)
+||      SWAP4  @X[$i+1],@X[$i+1]
+       ADD     $F,$e,$e                ; E+=F_00_19(B,C,D)
+___
+$code.=<<___                           if ($i==14);
+       ROTL    $a,5,$Arot              ;; $i
+||     AND     $c,$b,$F
+||     ANDN    $d,$b,$F0
+||     ADD     $K,$e,$e                ; E+=K
+       OR      $F0,$F,$F               ; F_00_19(B,C,D)
+||     ROTL    $b,30,$b
+||     ADD     @X[$i],$e,$e            ; E+=X[i]
+||      SWAP2  @X[$i+1],@X[$i+1]
+       ADD     $Arot,$e,$e             ; E+=rot(A,5)
+||      SWAP4  @X[$i+1],@X[$i+1]
+       ADD     $F,$e,$e                ; E+=F_00_19(B,C,D)
+___
+$code.=<<___                           if ($i==15);
+||      XOR    @X[($j+2)&15],@X[$j],@X[$j]
+       ROTL    $a,5,$Arot              ;; $i
+||     AND     $c,$b,$F
+||     ANDN    $d,$b,$F0
+||     ADD     $K,$e,$e                ; E+=K
+||      XOR    @X[($j+8)&15],@X[$j],@X[$j]
+       OR      $F0,$F,$F               ; F_00_19(B,C,D)
+||     ROTL    $b,30,$b
+||     ADD     @X[$i],$e,$e            ; E+=X[i]
+||      XOR    @X[($j+13)&15],@X[$j],@X[$j]
+       ADD     $Arot,$e,$e             ; E+=rot(A,5)
+||      ROTL   @X[$j],1,@X[$j]
+       ADD     $F,$e,$e                ; E+=F_00_19(B,C,D)
+___
+$code.=<<___                           if ($i>15);
+||      XOR    @X[($j+2)&15],@X[$j],@X[$j]
+       ROTL    $a,5,$Arot              ;; $i
+||     AND     $c,$b,$F
+||     ANDN    $d,$b,$F0
+||     ADD     $K,$e,$e                ; E+=K
+||      XOR    @X[($j+8)&15],@X[$j],@X[$j]
+       OR      $F0,$F,$F               ; F_00_19(B,C,D)
+||     ROTL    $b,30,$b
+||     ADD     @X[$i&15],$e,$e         ; E+=X[i]
+||      XOR    @X[($j+13)&15],@X[$j],@X[$j]
+       ADD     $Arot,$e,$e             ; E+=rot(A,5)
+||      ROTL   @X[$j],1,@X[$j]
+       ADD     $F,$e,$e                ; E+=F_00_19(B,C,D)
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e) = @_;
+my $j = ($i+1)&15;
+
+$code.=<<___                           if ($i<79);
+||      XOR    @X[($j+2)&15],@X[$j],@X[$j]
+       ROTL    $a,5,$Arot              ;; $i
+||     XOR     $c,$b,$F
+||     ADD     $K,$e,$e                ; E+=K
+||      XOR    @X[($j+8)&15],@X[$j],@X[$j]
+       XOR     $d,$F,$F                ; F_20_39(B,C,D)
+||     ROTL    $b,30,$b
+||     ADD     @X[$i&15],$e,$e         ; E+=X[i]
+||      XOR    @X[($j+13)&15],@X[$j],@X[$j]
+       ADD     $Arot,$e,$e             ; E+=rot(A,5)
+||      ROTL   @X[$j],1,@X[$j]
+       ADD     $F,$e,$e                ; E+=F_20_39(B,C,D)
+___
+$code.=<<___                           if ($i==79);
+|| [A0]        B       loop?
+|| [A0]        LDNW    *${INP}++,@X[0]         ; pre-fetch input
+       ROTL    $a,5,$Arot              ;; $i
+||     XOR     $c,$b,$F
+||     ADD     $K,$e,$e                ; E+=K
+|| [A0]        LDNW    *${INP}++,@X[1]
+       XOR     $d,$F,$F                ; F_20_39(B,C,D)
+||     ROTL    $b,30,$b
+||     ADD     @X[$i&15],$e,$e         ; E+=X[i]
+       ADD     $Arot,$e,$e             ; E+=rot(A,5)
+       ADD     $F,$e,$e                ; E+=F_20_39(B,C,D)
+||     ADD     $Bctx,$a,$a             ; accumulate context
+||     ADD     $Cctx,$b,$b
+       ADD     $Dctx,$c,$c
+||     ADD     $Ectx,$d,$d
+||     ADD     $Actx,$e,$e
+;;===== branch to loop? is taken here
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e) = @_;
+my $j = ($i+1)&15;
+
+$code.=<<___;
+||      XOR    @X[($j+2)&15],@X[$j],@X[$j]
+       ROTL    $a,5,$Arot              ;; $i
+||     AND     $c,$b,$F
+||     AND     $d,$b,$F0
+||     ADD     $K,$e,$e                ; E+=K
+||      XOR    @X[($j+8)&15],@X[$j],@X[$j]
+       XOR     $F0,$F,$F
+||     AND     $c,$d,$F0
+||     ROTL    $b,30,$b
+||      XOR    @X[($j+13)&15],@X[$j],@X[$j]
+||     ADD     @X[$i&15],$e,$e         ; E+=X[i]
+       XOR     $F0,$F,$F               ; F_40_59(B,C,D)
+||     ADD     $Arot,$e,$e             ; E+=rot(A,5)
+||      ROTL   @X[$j],1,@X[$j]
+       ADD     $F,$e,$e                ; E+=F_20_39(B,C,D)
+___
+}
+
+$code=<<___;
+       .text
+
+       .if     .ASSEMBLER_VERSION<7000000
+       .asg    0,__TI_EABI__
+       .endif
+       .if     __TI_EABI__
+       .asg    sha1_block_data_order,_sha1_block_data_order
+       .endif
+
+       .asg    B3,RA
+       .asg    A15,FP
+       .asg    B15,SP
+
+       .if     .BIG_ENDIAN
+       .asg    MV,SWAP2
+       .asg    MV,SWAP4
+       .endif
+
+       .global _sha1_block_data_order
+_sha1_block_data_order:
+       .asmfunc
+       MV      $NUM,A0                 ; reassign $NUM
+  [!A0]        BNOP    RA                      ; if ($NUM==0) return;
+|| [A0]        LDW     *${CTX}[0],$A           ; load A-E...
+   [A0]        LDW     *${CTX}[1],$B
+   [A0]        LDW     *${CTX}[2],$C
+   [A0]        LDW     *${CTX}[3],$D
+   [A0]        LDW     *${CTX}[4],$E
+   [A0]        LDNW    *${INP}++,@X[0]         ; pre-fetch input
+   [A0]        LDNW    *${INP}++,@X[1]
+       NOP     3
+
+loop?:
+       SUB     A0,1,A0
+||     MV      $A,$Actx
+||     MVD     $B,$Bctx
+||     SWAP2   @X[0],@X[0]
+||     MVKL    0x5a827999,$K
+       MVKH    0x5a827999,$K           ; K_00_19
+||     MV      $C,$Cctx
+||     MV      $D,$Dctx
+||     MVD     $E,$Ectx
+||     SWAP4   @X[0],@X[0]
+___
+for ($i=0;$i<20;$i++)  { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+||     MVKL    0x6ed9eba1,$K
+       MVKH    0x6ed9eba1,$K           ; K_20_39
+___
+for (;$i<40;$i++)      { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+||     MVKL    0x8f1bbcdc,$K
+       MVKH    0x8f1bbcdc,$K           ; K_40_59
+___
+for (;$i<60;$i++)      { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+||     MVKL    0xca62c1d6,$K
+       MVKH    0xca62c1d6,$K           ; K_60_79
+___
+for (;$i<80;$i++)      { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+       BNOP    RA                      ; return
+       STW     $A,*${CTX}[0]           ; emit A-E...
+       STW     $B,*${CTX}[1]
+       STW     $C,*${CTX}[2]
+       STW     $D,*${CTX}[3]
+       STW     $E,*${CTX}[4]
+       .endasmfunc
+
+       .sect   .const
+       .cstring "SHA1 block transform for C64x, CRYPTOGAMS by 
<appro\@openssl.org>"
+       .align  4
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/sha/asm/sha1-c64xplus.pl b/crypto/sha/asm/sha1-c64x.pl
similarity index 85%
copy from crypto/sha/asm/sha1-c64xplus.pl
copy to crypto/sha/asm/sha1-c64x.pl
index 87000d1..d7a9dd1 100644
--- a/crypto/sha/asm/sha1-c64xplus.pl
+++ b/crypto/sha/asm/sha1-c64x.pl
@@ -7,19 +7,19 @@
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
-# SHA1 for C64x+.
+# SHA1 for C64x.
 #
-# November 2011
+# November 2016
 #
 # If compared to compiler-generated code with similar characteristics,
 # i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
 # this implementation is 25% smaller and >2x faster. In absolute terms
 # performance is (quite impressive) ~6.5 cycles per processed byte.
-# Fully unrolled assembler would be ~5x larger and is likely to be
-# ~15% faster. It would be free from references to intermediate ring
-# buffer, but put more pressure on L1P [both because the code would be
-# larger and won't be using SPLOOP buffer]. There are no plans to
-# realize fully unrolled variant though...
+# Unlike its predecessor, sha1-c64xplus module, this module has worse
+# interrupt agility. While original added up to 5 cycles delay to
+# response to interrupt, this module adds up to 100. Fully unrolled
+# implementation doesn't add any delay and is even 25% faster, but is
+# almost 5x larger...
 #
 # !!! Note that this module uses AMR, which means that all interrupt
 # service routines are expected to preserve it and for own well-being
@@ -39,6 +39,13 @@ open STDOUT,">$output";
 $code=<<___;
        .text
 
+       .if     .ASSEMBLER_VERSION<7000000
+       .asg    0,__TI_EABI__
+       .endif
+       .if     __TI_EABI__
+       .asg    sha1_block_data_order,_sha1_block_data_order
+       .endif
+
        .asg    B3,RA
        .asg    A15,FP
        .asg    B15,SP
@@ -70,21 +77,21 @@ _sha1_block_data_order:
        NOP     1
 
 loop?:
-       MVK     0x00007999,$K
-||     ADDAW   SP,2,$XPA
-||     SUB     A0,1,A0
-||     MVK     13,B0
-       MVKH    0x5a820000,$K           ; K_00_19
+       MVKL    0x5a827999,$K
 ||     ADDAW   SP,2,$XPB
+||     SUB     A0,1,A0
+       MVKH    0x5a827999,$K           ; K_00_19
 ||     MV      $A,$Actx
 ||     MV      $B,$Bctx
 ;;==================================================
-       SPLOOPD 5                       ; BODY_00_13
+       B       body_00_13?             ; BODY_00_13
+||     MVK     11,B0
+||     MV      $XPB,$XPA
 ||     MV      $C,$Cctx
 ||     MV      $D,$Dctx
-||     MV      $E,$Ectx
-||     MVC     B0,ILC
+||     MVD     $E,$Ectx
 
+body_00_13?:
        ROTL    $A,5,$Arot
 ||     AND     $C,$B,$F
 ||     ANDN    $D,$B,$F0
@@ -105,7 +112,7 @@ loop?:
 
        ADD     $TX3,$T,$A              ; A=T+Xi
 ||     STW     $TX3,*${XPB}++
-       SPKERNEL
+||     BDEC    body_00_13?,B0
 ;;==================================================
        ROTL    $A,5,$Arot              ; BODY_14
 ||     AND     $C,$B,$F
@@ -160,11 +167,11 @@ loop?:
        ADD     $TX2,$T,$A              ; A=T+Xi
 ||     STW     $TX2,*${XPB}++
 ||     XOR     $TX0,$TX1,$TX1
-||     MVK     3,B0
 ;;==================================================
-       SPLOOPD 5                       ; BODY_16_19
-||     MVC     B0,ILC
+||     B       body_16_19?             ; BODY_16_19
+||     MVK     1,B0
 
+body_16_19?:
        ROTL    $A,5,$Arot
 ||     AND     $C,$B,$F
 ||     ANDN    $D,$B,$F0
@@ -191,18 +198,19 @@ loop?:
        ADD     $TX2,$T,$A              ; A=T+Xi
 ||     STW     $TX2,*${XPB}++
 ||     XOR     $TX0,$TX1,$TX1
-       SPKERNEL
+||     BDEC    body_16_19?,B0
 
-       MVK     0xffffeba1,$K
-||     MVK     19,B0
-       MVKH    0x6ed90000,$K           ; K_20_39
+       MVKL    0x6ed9eba1,$K
+||     MVK     17,B0
+       MVKH    0x6ed9eba1,$K           ; K_20_39
 ___
 sub BODY_20_39 {
+my $label = shift;
 $code.=<<___;
 ;;==================================================
-       SPLOOPD 5                       ; BODY_20_39
-||     MVC     B0,ILC
+||     B       $label                  ; BODY_20_39
 
+$label:
        ROTL    $A,5,$Arot
 ||     XOR     $B,$C,$F
 ||     ADD     $K,$E,$T                ; T=E+K
@@ -228,20 +236,19 @@ $code.=<<___;
        ADD     $TX2,$T,$A              ; A=T+Xi
 ||     STW     $TX2,*${XPB}++          ; last one is redundant
 ||     XOR     $TX0,$TX1,$TX1
-       SPKERNEL
-___
-$code.=<<___ if (!shift);
-       MVK     0xffffbcdc,$K
-       MVKH    0x8f1b0000,$K           ; K_40_59
+||     BDEC    $label,B0
 ___
-}      &BODY_20_39();
+}      &BODY_20_39("body_20_39?");
 $code.=<<___;
 ;;==================================================
-       SPLOOPD 5                       ; BODY_40_59
-||     MVC     B0,ILC
+       MVKL    0x8f1bbcdc,$K
+||     MVK     17,B0
+       MVKH    0x8f1bbcdc,$K           ; K_40_59
+||     B       body_40_59?             ; BODY_40_59
 ||     AND     $B,$C,$F
 ||     AND     $B,$D,$F0
 
+body_40_59?:
        ROTL    $A,5,$Arot
 ||     XOR     $F0,$F,$F
 ||     AND     $C,$D,$F0
@@ -270,13 +277,13 @@ $code.=<<___;
 ||     XOR     $TX0,$TX1,$TX1
 ||     AND     $B,$C,$F
 ||     AND     $B,$D,$F0
-       SPKERNEL
+||     BDEC    body_40_59?,B0
 
-       MVK     0xffffc1d6,$K
-||     MVK     18,B0
-       MVKH    0xca620000,$K           ; K_60_79
+       MVKL    0xca62c1d6,$K
+||     MVK     16,B0
+       MVKH    0xca62c1d6,$K           ; K_60_79
 ___
-       &BODY_20_39(-1);                # BODY_60_78
+       &BODY_20_39("body_60_78?");     # BODY_60_78
 $code.=<<___;
 ;;==================================================
    [A0]        B       loop?
@@ -315,7 +322,7 @@ $code.=<<___;
        .endasmfunc
 
        .sect   .const
-       .cstring "SHA1 block transform for C64x+, CRYPTOGAMS by 
<appro\@openssl.org>"
+       .cstring "SHA1 block transform for C64x, CRYPTOGAMS by 
<appro\@openssl.org>"
        .align  4
 ___
 
diff --git a/crypto/sha/asm/sha256-c64xplus.pl b/crypto/sha/asm/sha256-c64x.pl
similarity index 89%
copy from crypto/sha/asm/sha256-c64xplus.pl
copy to crypto/sha/asm/sha256-c64x.pl
index 8b92c84..fbe99c0 100644
--- a/crypto/sha/asm/sha256-c64xplus.pl
+++ b/crypto/sha/asm/sha256-c64x.pl
@@ -7,9 +7,9 @@
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
-# SHA256 for C64x+.
+# SHA256 for C64x.
 #
-# January 2012
+# November 2016
 #
 # Performance is just below 10 cycles per processed byte, which is
 # almost 40% faster than compiler-generated code. Unroll is unlikely
@@ -39,6 +39,14 @@ open STDOUT,">$output";
 $code.=<<___;
        .text
 
+       .if     .ASSEMBLER_VERSION<7000000
+       .asg    0,__TI_EABI__
+       .endif
+       .if     __TI_EABI__
+       .nocmp
+       .asg    sha256_block_data_order,_sha256_block_data_order
+       .endif
+
        .asg    B3,RA
        .asg    A15,FP
        .asg    B15,SP
@@ -50,6 +58,7 @@ $code.=<<___;
 
        .global _sha256_block_data_order
 _sha256_block_data_order:
+__sha256_block:
        .asmfunc stack_usage(64)
        MV      $NUM,A0                         ; reassign $NUM
 ||     MVK     -64,B0
@@ -58,10 +67,17 @@ _sha256_block_data_order:
 || [A0]        MV      SP,FP
    [A0]        ADDKPC  _sha256_block_data_order,B2
 || [A0]        AND     B0,SP,SP                        ; align stack at 64 
bytes
+       .if     __TI_EABI__
    [A0]        MVK     0x00404,B1
-|| [A0]        MVKL    (K256-_sha256_block_data_order),$K256
+|| [A0]        MVKL    \$PCR_OFFSET(K256,__sha256_block),$K256
    [A0]        MVKH    0x50000,B1
-|| [A0]        MVKH    (K256-_sha256_block_data_order),$K256
+|| [A0]        MVKH    \$PCR_OFFSET(K256,__sha256_block),$K256
+       .else
+   [A0]        MVK     0x00404,B1
+|| [A0]        MVKL    (K256-__sha256_block),$K256
+   [A0]        MVKH    0x50000,B1
+|| [A0]        MVKH    (K256-__sha256_block),$K256
+       .endif
    [A0]        MVC     B1,AMR                          ; setup circular 
addressing
 || [A0]        MV      SP,$Xia
    [A0]        MV      SP,$Xib
@@ -79,9 +95,8 @@ _sha256_block_data_order:
 
        LDNW    *$INP++,$Xn                     ; pre-fetch input
        LDW     *$K256++,$K                     ; pre-fetch K256[0]
-       MVK     14,B0                           ; loop counters
-       MVK     47,B1
-||     ADDAW   $Xia,9,$Xia
+       NOP
+       ADDAW   $Xia,9,$Xia
 outerloop?:
        SUB     A0,1,A0
 ||     MV      $A,$Actx
@@ -94,10 +109,10 @@ outerloop?:
 ||     MVD     $H,$Hctx
 ||     SWAP4   $Xn,$X0
 
-       SPLOOPD 8                               ; BODY_00_14
-||     MVC     B0,ILC
+       MVK     14,B0                           ; loop counter
 ||     SWAP2   $X0,$X0
 
+loop_00_14?:                                   ; BODY_00_14
        LDNW    *$INP++,$Xn
 ||     ROTL    $A,30,$S0
 ||     OR      $A,$B,$Maj
@@ -113,6 +128,7 @@ outerloop?:
 ||     OR      $t2a,$Maj,$Maj                  ; Maj(a,b,c) = ((a|b)&c)|(a&b)
 ||     ROTL    $E,7,$t1e
 ||     ADD     $K,$H,$T1                       ; T1 = h + K256[i]
+|| [B0]        BDEC    loop_00_14?,B0
        ADD     $X0,$T1,$T1                     ; T1 += X[i];
 ||     STW     $X0,*$Xib++
 ||     XOR     $t0a,$S0,$S0
@@ -134,7 +150,7 @@ outerloop?:
        MV      $B,$C                           ; c = b
 ||     MV      $A,$B                           ; b = a
 ||     ADD     $T1,$T2,$A                      ; a = T1 + T2
-       SPKERNEL
+;;===== branch to loop_00_14? is taken here
 
        ROTL    $A,30,$S0                       ; BODY_15
 ||     OR      $A,$B,$Maj
@@ -178,11 +194,11 @@ outerloop?:
 ||     MV      $A,$B                           ; b = a
 ||     ADD     $T1,$T2,$A                      ; a = T1 + T2
 
-       SPLOOPD 10                              ; BODY_16_63
-||     MVC     B1,ILC
+       MVK     47,B1                           ; loop counter
 ||     ROTL    $X1,14,$t1e                     ; modulo-scheduled
 ||     ROTL    $X14,13,$t1a                    ; modulo-scheduled
 
+loop_16_63?:                                   ; BODY_16_63
        XOR     $t0e,$s0,$s0
 ||     XOR     $t0a,$s1,$s1
 ||     MV      $X15,$X14
@@ -207,6 +223,7 @@ outerloop?:
 ||     ROTL    $E,7,$t1e
 ||     ADD     $H,$K,$T1                       ; T1 = h + K256[i]
 ||     ADD     $s1,$X0,$X0                     ; X[i] += sigma1(X[i+14])
+|| [B1]        BDEC    loop_16_63?,B1
        XOR     $t0a,$S0,$S0
 ||     XOR     $t0e,$S1,$S1
 ||     ADD     $X0,$T1,$T1                     ; T1 += X[i]
@@ -234,7 +251,7 @@ outerloop?:
 ||     ADD     $T1,$T2,$A                      ; a = T1 + T2
 ||     SHRU    $X1,3,$s0                       ; modulo-scheduled
 ||     SHRU    $X14,10,$s1                     ; modulo-scheduled
-       SPKERNEL
+;;===== branch to loop_16_63? is taken here
 
    [A0]        B       outerloop?
 || [A0]        LDNW    *$INP++,$Xn                     ; pre-fetch input
@@ -265,7 +282,11 @@ outerloop?:
 ||     STW     $H,*${CTXB}[7]
        .endasmfunc
 
+       .if     __TI_EABI__
+       .sect   ".text:sha_asm.const"
+       .else
        .sect   ".const:sha_asm"
+       .endif
        .align  128
 K256:
        .uword  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
@@ -284,7 +305,7 @@ K256:
        .uword  0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
        .uword  0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
        .uword  0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-       .cstring "SHA256 block transform for C64x+, CRYPTOGAMS by 
<appro\@openssl.org>"
+       .cstring "SHA256 block transform for C64x, CRYPTOGAMS by 
<appro\@openssl.org>"
        .align  4
 
 ___
diff --git a/crypto/sha/asm/sha512-c64xplus.pl b/crypto/sha/asm/sha512-c64x.pl
similarity index 89%
copy from crypto/sha/asm/sha512-c64xplus.pl
copy to crypto/sha/asm/sha512-c64x.pl
index 56c8583..e35a72a 100644
--- a/crypto/sha/asm/sha512-c64xplus.pl
+++ b/crypto/sha/asm/sha512-c64x.pl
@@ -7,11 +7,11 @@
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
-# SHA512 for C64x+.
+# SHA512 for C64x.
 #
-# January 2012
+# November 2016
 #
-# Performance is 19 cycles per processed byte. Compared to block
+# Performance is ~19 cycles per processed byte. Compared to block
 # transform function from sha512.c compiled with cl6x with -mv6400+
 # -o2 -DOPENSSL_SMALL_FOOTPRINT it's almost 7x faster and 2x smaller.
 # Loop unroll won't make it, this implementation, any faster, because
@@ -47,6 +47,14 @@ open STDOUT,">$output";
 $code.=<<___;
        .text
 
+       .if     .ASSEMBLER_VERSION<7000000
+       .asg    0,__TI_EABI__
+       .endif
+       .if     __TI_EABI__
+       .nocmp
+       .asg    sha512_block_data_order,_sha512_block_data_order
+       .endif
+
        .asg    B3,RA
        .asg    A15,FP
        .asg    B15,SP
@@ -61,6 +69,7 @@ $code.=<<___;
 
        .global _sha512_block_data_order
 _sha512_block_data_order:
+__sha512_block:
        .asmfunc stack_usage(40+128)
        MV      $NUM,A0                         ; reassign $NUM
 ||     MVK     -128,B0
@@ -75,13 +84,21 @@ _sha512_block_data_order:
    [A0]        STDW    A11:A10,*SP[1]
 || [A0]        MVC     B1,AMR                          ; setup circular 
addressing
 || [A0]        ADD     B0,SP,SP                        ; alloca(128)
+       .if     __TI_EABI__
    [A0]        AND     B0,SP,SP                        ; align stack at 128 
bytes
-|| [A0]        ADDKPC  _sha512_block_data_order,B1
-|| [A0]        MVKL    (K512-_sha512_block_data_order),$K512
-   [A0]        MVKH    (K512-_sha512_block_data_order),$K512
+|| [A0]        ADDKPC  __sha512_block,B1
+|| [A0]        MVKL    \$PCR_OFFSET(K512,__sha512_block),$K512
+   [A0]        MVKH    \$PCR_OFFSET(K512,__sha512_block),$K512
 || [A0]        SUBAW   SP,2,SP                         ; reserve two words 
above buffer
+       .else
+   [A0]        AND     B0,SP,SP                        ; align stack at 128 
bytes
+|| [A0]        ADDKPC  __sha512_block,B1
+|| [A0]        MVKL    (K512-__sha512_block),$K512
+   [A0]        MVKH    (K512-__sha512_block),$K512
+|| [A0]        SUBAW   SP,2,SP                         ; reserve two words 
above buffer
+       .endif
        ADDAW   SP,3,$Xilo
-       ADDAW   SP,2,$Xihi
+       ADD     SP,4*2,$Xihi                    ; ADDAW SP,2,$Xihi
 
 ||     MV      $CTXA,$CTXB
        LDW     *${CTXA}[0^.LITTLE_ENDIAN],$Ahi ; load ctx
@@ -134,13 +151,13 @@ loop0_15?:
        SWAP2   $T1hi,$T1hi
 ||     SWAP2   $T1lo,$T1lo
        .endif
-loop16_79?:
-       STW     $T1hi,*$Xihi++[2]
+       STW     $T1hi,*$Xihi++[2]                       ; original loop16_79?
 ||     STW     $T1lo,*$Xilo++[2]                       ; X[i] = T1
 ||     ADD     $Hhi,$T1hi,$T1hi
 ||     ADDU    $Hlo,$T1lo,$T1carry:$T1lo               ; T1 += h
 ||     SHRU    $Ehi,14,$S1hi
 ||     SHL     $Ehi,32-14,$S1lo
+loop16_79?:
        XOR     $Fhi,$Ghi,$CHhi
 ||     XOR     $Flo,$Glo,$CHlo
 ||     ADD     KHI,$T1hi,$T1hi
@@ -213,21 +230,21 @@ loop16_79?:
 ||     XOR     $t0lo,$S0lo,$S0lo
 ||     ADD     $Ehi,$T1hi,$T1hi
 ||     ADDU    $Elo,$T1carry:$T1lo,$T1carry:$T1lo      ; T1 += e
-|| [B0]        BNOP    loop0_15?
 ||     SHRU    $Ahi,39-32,$t0lo
 ||     SHL     $Ahi,64-39,$t0hi
+   [B0]        BNOP    loop0_15?
+|| [B0]        LDNDW   *$INP++,B11:B10                         ; pre-fetch 
input
        XOR     $t0hi,$S0hi,$S0hi
 ||     XOR     $t0lo,$S0lo,$S0lo
-|| [B0]        LDNDW   *$INP++,B11:B10                         ; pre-fetch 
input
-||[!B1]        BNOP    break?
 ||     SHRU    $Alo,39-32,$t0hi
 ||     SHL     $Alo,64-39,$t0lo
+||[!B0]        LDW     *${Xihi}[28],$T1hi
+||[!B0]        LDW     *${Xilo}[28],$T1lo                      ; X[i+14]
        XOR     $t0hi,$S0hi,$S0hi
 ||     XOR     $t0lo,$S0lo,$S0lo                       ; Sigma0(a)
 ||     ADD     $T1carry,$T1hi,$Ehi
-||     MV      $T1lo,$Elo                              ; e = T1
-||[!B0]        LDW     *${Xihi}[28],$T1hi
-||[!B0]        LDW     *${Xilo}[28],$T1lo                      ; X[i+14]
+||     ROTL    $T1lo,0,$Elo                            ; e = T1, "ghost" value
+||[!B1]        BNOP    break?
        ADD     $S0hi,$T2hi,$T2hi
 ||     ADDU    $S0lo,$T2carry:$T2lo,$T2carry:$T2lo     ; T2 += Sigma0(a)
 || [B1]        LDDW    *$K512++,$Khi:$Klo                      ; pre-fetch 
K512[i]
@@ -236,14 +253,13 @@ loop16_79?:
 ||     MV      $T2lo,$Alo                              ; a = T2
 || [B0]        SUB     B0,1,B0
 ;;===== branch to loop00_15? is taken here
-       NOP
+   [B1]        LDW     *${Xihi}[2],$T2hi
+|| [B1]        LDW     *${Xilo}[2],$T2lo                       ; X[i+1]
+|| [B1]        SHRU    $T1hi,19,$S1hi
+|| [B1]        SHL     $T1hi,32-19,$S1lo
+   [B1]        SHRU    $T1lo,19,$t0lo
+|| [B1]        SHL     $T1lo,32-19,$t0hi
 ;;===== branch to break? is taken here
-       LDW     *${Xihi}[2],$T2hi
-||     LDW     *${Xilo}[2],$T2lo                       ; X[i+1]
-||     SHRU    $T1hi,19,$S1hi
-||     SHL     $T1hi,32-19,$S1lo
-       SHRU    $T1lo,19,$t0lo
-||     SHL     $T1lo,32-19,$t0hi
        XOR     $t0hi,$S1hi,$S1hi
 ||     XOR     $t0lo,$S1lo,$S1lo
 ||     SHRU    $T1hi,61-32,$t0lo
@@ -281,7 +297,6 @@ loop16_79?:
 ||     XOR     $t0lo,$S0lo,$S0lo
 ||     ADD     $S1hi,$T1hi,$T1hi
 ||     ADDU    $S1lo,$T1lo,$T1carry:$T1lo              ; T1 = X[i+9]+sigma1()
-|| [B1]        BNOP    loop16_79?
 ||     SHRU    $T2hi,7,$t0hi
 ||     SHL     $T2hi,32-7,$t0lo
        XOR     $t0hi,$S0hi,$S0hi
@@ -289,6 +304,7 @@ loop16_79?:
 ||     ADD     $CHhi,$T1hi,$T1hi
 ||     ADDU    $CHlo,$T1carry:$T1lo,$T1carry:$T1lo     ; T1 += X[i]
 ||     SHRU    $T2lo,7,$t0lo
+|| [B1]        BNOP    loop16_79?
        XOR     $t0lo,$S0lo,$S0lo                       ; sigma0(Xi[i+1]
 
        ADD     $S0hi,$T1hi,$T1hi
@@ -296,6 +312,13 @@ loop16_79?:
 || [B1]        SUB     B1,1,B1
        NOP                                             ; avoid cross-path stall
        ADD     $T1carry,$T1hi,$T1hi
+
+       STW     $T1hi,*$Xihi++[2]                       ; copied "top" bundle
+||     STW     $T1lo,*$Xilo++[2]                       ; X[i] = T1
+||     ADD     $Hhi,$T1hi,$T1hi
+||     ADDU    $Hlo,$T1lo,$T1carry:$T1lo               ; T1 += h
+||     SHRU    $Ehi,14,$S1hi
+||     SHL     $Ehi,32-14,$S1lo
 ;;===== branch to loop16_79? is taken here
 
 break?:
@@ -359,7 +382,11 @@ break?:
        NOP     2                               ; wait till FP is committed
        .endasmfunc
 
+       .if     __TI_EABI__
+       .sect   ".text:sha_asm.const"
+       .else
        .sect   ".const:sha_asm"
+       .endif
        .align  128
 K512:
        .uword  0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
@@ -402,7 +429,7 @@ K512:
        .uword  0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
        .uword  0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
        .uword  0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
-       .cstring "SHA512 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+       .cstring "SHA512 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
        .align  4
 ___
 
diff --git a/test/fips_algvs.c b/test/fips_algvs.c
index 8ff75dc..2bfd213 100644
--- a/test/fips_algvs.c
+++ b/test/fips_algvs.c
@@ -150,7 +150,7 @@ extern int fips_rsavtest_main(int argc, char **argv);
 extern int fips_shatest_main(int argc, char **argv);
 extern int fips_test_suite_main(int argc, char **argv);
 
-#if !defined(_TMS320C6400_PLUS)
+#if !defined(_TMS320C6400_PLUS) && !defined(_TMS320C6400)
 #include "fips_aesavs.c"
 #include "fips_cmactest.c"
 #include "fips_desmovs.c"
diff --git a/util/mk1mf.pl b/util/mk1mf.pl
index 2325607..b0a868a 100755
--- a/util/mk1mf.pl
+++ b/util/mk1mf.pl
@@ -249,7 +249,7 @@ elsif (($platform eq "netware-clib") || ($platform eq "netware-libc") ||
        $BSDSOCK=1 if ($platform eq "netware-libc-bsdsock") || ($platform eq "netware-clib-bsdsock");
        require 'netware.pl';
        }
-elsif ($platform eq "c64xplus")
+elsif ($platform =~ /^c64x/)
        {
        require "TI_CGTOOLS.pl";
        }
_____
openssl-commits mailing list
To unsubscribe: https://mta.openssl.org/mailman/listinfo/openssl-commits

Reply via email to