diff options
Diffstat (limited to 'security/nss/lib/freebl/mpi/hppa20.s')
-rw-r--r-- | security/nss/lib/freebl/mpi/hppa20.s | 904 |
1 files changed, 904 insertions, 0 deletions
diff --git a/security/nss/lib/freebl/mpi/hppa20.s b/security/nss/lib/freebl/mpi/hppa20.s new file mode 100644 index 0000000000..c72de8a12b --- /dev/null +++ b/security/nss/lib/freebl/mpi/hppa20.s @@ -0,0 +1,904 @@ +; This Source Code Form is subject to the terms of the Mozilla Public +; License, v. 2.0. If a copy of the MPL was not distributed with this +; file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifdef __LP64__ + .LEVEL 2.0W +#else +; .LEVEL 1.1 +; .ALLOW 2.0N + .LEVEL 2.0 +#endif + .SPACE $TEXT$,SORT=8 + .SUBSPA $CODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,CODE_ONLY,SORT=24 + +; *************************************************************** +; +; maxpy_[little/big] +; +; *************************************************************** + +; There is no default -- you must specify one or the other. +#define LITTLE_WORDIAN 1 + +#ifdef LITTLE_WORDIAN +#define EIGHT 8 +#define SIXTEEN 16 +#define THIRTY_TWO 32 +#define UN_EIGHT -8 +#define UN_SIXTEEN -16 +#define UN_TWENTY_FOUR -24 +#endif + +#ifdef BIG_WORDIAN +#define EIGHT -8 +#define SIXTEEN -16 +#define THIRTY_TWO -32 +#define UN_EIGHT 8 +#define UN_SIXTEEN 16 +#define UN_TWENTY_FOUR 24 +#endif + +; This performs a multiple-precision integer version of "daxpy", +; Using the selected addressing direction. "Little-wordian" means that +; the least significant word of a number is stored at the lowest address. +; "Big-wordian" means that the most significant word is at the lowest +; address. Either way, the incoming address of the vector is that +; of the least significant word. That means that, for little-wordian +; addressing, we move the address upward as we propagate carries +; from the least significant word to the most significant. For +; big-wordian we move the address downward. + +; We use the following registers: +; +; r2 return PC, of course +; r26 = arg1 = length +; r25 = arg2 = address of scalar +; r24 = arg3 = multiplicand vector +; r23 = arg4 = result vector +; +; fr9 = scalar loaded once only from r25 + +; The cycle counts shown in the bodies below are simply the result of a +; scheduling by hand. The actual PCX-U hardware does it differently. +; The intention is that the overall speed is the same. + +; The pipeline startup and shutdown code is constructed in the usual way, +; by taking the loop bodies and removing unnecessary instructions. +; We have left the comments describing cycle numbers in the code. +; These are intended for reference when comparing with the main loop, +; and have no particular relationship to actual cycle numbers. + +#ifdef LITTLE_WORDIAN +maxpy_little +#else +maxpy_big +#endif + .PROC + .CALLINFO FRAME=120,ENTRY_GR=4 + .ENTRY + STW,MA %r3,128(%sp) + STW %r4,-124(%sp) + + ADDIB,< -1,%r26,$L0 ; If N = 0, exit immediately. + FLDD 0(%r25),%fr9 ; fr9 = scalar + +; First startup + + FLDD 0(%r24),%fr24 ; Cycle 1 + XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 + XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 + XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 + CMPIB,> 3,%r26,$N_IS_SMALL ; Pick out cases N = 1, 2, or 3 + XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6 + FLDD EIGHT(%r24),%fr28 ; Cycle 8 + XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10 + FSTD %fr24,-96(%sp) + XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11 + FSTD %fr25,-80(%sp) + LDO SIXTEEN(%r24),%r24 ; Cycle 12 + FSTD %fr31,-64(%sp) + XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13 + FSTD %fr27,-48(%sp) + +; Second startup + + XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1 + FSTD %fr30,-56(%sp) + FLDD 0(%r24),%fr24 + + FSTD %fr26,-88(%sp) ; Cycle 2 + + XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 + FSTD %fr28,-104(%sp) + + XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 + LDD -96(%sp),%r3 + FSTD %fr29,-72(%sp) + + XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 + LDD -64(%sp),%r19 + LDD -80(%sp),%r21 + + XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6 + LDD -56(%sp),%r20 + ADD %r21,%r3,%r3 + + ADD,DC %r20,%r19,%r19 ; Cycle 7 + LDD -88(%sp),%r4 + SHRPD %r3,%r0,32,%r21 + LDD -48(%sp),%r1 + + FLDD EIGHT(%r24),%fr28 ; Cycle 8 + LDD -104(%sp),%r31 + ADD,DC %r0,%r0,%r20 + SHRPD %r19,%r3,32,%r3 + + LDD -72(%sp),%r29 ; Cycle 9 + SHRPD %r20,%r19,32,%r20 + ADD %r21,%r1,%r1 + + XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10 + ADD,DC %r3,%r4,%r4 + FSTD %fr24,-96(%sp) + + XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11 + ADD,DC %r0,%r20,%r20 + LDD 0(%r23),%r3 + FSTD %fr25,-80(%sp) + + LDO SIXTEEN(%r24),%r24 ; Cycle 12 + FSTD %fr31,-64(%sp) + + XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13 + ADD %r0,%r0,%r0 ; clear the carry bit + ADDIB,<= -4,%r26,$ENDLOOP ; actually happens in cycle 12 + FSTD %fr27,-48(%sp) +; MFCTL %cr16,%r21 ; for timing +; STD %r21,-112(%sp) + +; Here is the loop. + +$LOOP XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1 + ADD,DC %r29,%r4,%r4 + FSTD %fr30,-56(%sp) + FLDD 0(%r24),%fr24 + + LDO SIXTEEN(%r23),%r23 ; Cycle 2 + ADD,DC %r0,%r20,%r20 + FSTD %fr26,-88(%sp) + + XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 + ADD %r3,%r1,%r1 + FSTD %fr28,-104(%sp) + LDD UN_EIGHT(%r23),%r21 + + XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 + ADD,DC %r21,%r4,%r28 + FSTD %fr29,-72(%sp) + LDD -96(%sp),%r3 + + XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 + ADD,DC %r20,%r31,%r22 + LDD -64(%sp),%r19 + LDD -80(%sp),%r21 + + XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6 + ADD %r21,%r3,%r3 + LDD -56(%sp),%r20 + STD %r1,UN_SIXTEEN(%r23) + + ADD,DC %r20,%r19,%r19 ; Cycle 7 + SHRPD %r3,%r0,32,%r21 + LDD -88(%sp),%r4 + LDD -48(%sp),%r1 + + ADD,DC %r0,%r0,%r20 ; Cycle 8 + SHRPD %r19,%r3,32,%r3 + FLDD EIGHT(%r24),%fr28 + LDD -104(%sp),%r31 + + SHRPD %r20,%r19,32,%r20 ; Cycle 9 + ADD %r21,%r1,%r1 + STD %r28,UN_EIGHT(%r23) + LDD -72(%sp),%r29 + + XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10 + ADD,DC %r3,%r4,%r4 + FSTD %fr24,-96(%sp) + + XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11 + ADD,DC %r0,%r20,%r20 + FSTD %fr25,-80(%sp) + LDD 0(%r23),%r3 + + LDO SIXTEEN(%r24),%r24 ; Cycle 12 + FSTD %fr31,-64(%sp) + + XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13 + ADD %r22,%r1,%r1 + ADDIB,> -2,%r26,$LOOP ; actually happens in cycle 12 + FSTD %fr27,-48(%sp) + +$ENDLOOP + +; Shutdown code, first stage. + +; MFCTL %cr16,%r21 ; for timing +; STD %r21,UN_SIXTEEN(%r23) +; LDD -112(%sp),%r21 +; STD %r21,UN_EIGHT(%r23) + + XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1 + ADD,DC %r29,%r4,%r4 + CMPIB,= 0,%r26,$ONEMORE + FSTD %fr30,-56(%sp) + + LDO SIXTEEN(%r23),%r23 ; Cycle 2 + ADD,DC %r0,%r20,%r20 + FSTD %fr26,-88(%sp) + + ADD %r3,%r1,%r1 ; Cycle 3 + FSTD %fr28,-104(%sp) + LDD UN_EIGHT(%r23),%r21 + + ADD,DC %r21,%r4,%r28 ; Cycle 4 + FSTD %fr29,-72(%sp) + STD %r28,UN_EIGHT(%r23) ; moved up from cycle 9 + LDD -96(%sp),%r3 + + ADD,DC %r20,%r31,%r22 ; Cycle 5 + STD %r1,UN_SIXTEEN(%r23) +$JOIN4 + LDD -64(%sp),%r19 + LDD -80(%sp),%r21 + + ADD %r21,%r3,%r3 ; Cycle 6 + LDD -56(%sp),%r20 + + ADD,DC %r20,%r19,%r19 ; Cycle 7 + SHRPD %r3,%r0,32,%r21 + LDD -88(%sp),%r4 + LDD -48(%sp),%r1 + + ADD,DC %r0,%r0,%r20 ; Cycle 8 + SHRPD %r19,%r3,32,%r3 + LDD -104(%sp),%r31 + + SHRPD %r20,%r19,32,%r20 ; Cycle 9 + ADD %r21,%r1,%r1 + LDD -72(%sp),%r29 + + ADD,DC %r3,%r4,%r4 ; Cycle 10 + + ADD,DC %r0,%r20,%r20 ; Cycle 11 + LDD 0(%r23),%r3 + + ADD %r22,%r1,%r1 ; Cycle 13 + +; Shutdown code, second stage. + + ADD,DC %r29,%r4,%r4 ; Cycle 1 + + LDO SIXTEEN(%r23),%r23 ; Cycle 2 + ADD,DC %r0,%r20,%r20 + + LDD UN_EIGHT(%r23),%r21 ; Cycle 3 + ADD %r3,%r1,%r1 + + ADD,DC %r21,%r4,%r28 ; Cycle 4 + + ADD,DC %r20,%r31,%r22 ; Cycle 5 + + STD %r1,UN_SIXTEEN(%r23); Cycle 6 + + STD %r28,UN_EIGHT(%r23) ; Cycle 9 + + LDD 0(%r23),%r3 ; Cycle 11 + +; Shutdown code, third stage. + + LDO SIXTEEN(%r23),%r23 + ADD %r3,%r22,%r1 +$JOIN1 ADD,DC %r0,%r0,%r21 + CMPIB,*= 0,%r21,$L0 ; if no overflow, exit + STD %r1,UN_SIXTEEN(%r23) + +; Final carry propagation + +$FINAL1 LDO EIGHT(%r23),%r23 + LDD UN_SIXTEEN(%r23),%r21 + ADDI 1,%r21,%r21 + CMPIB,*= 0,%r21,$FINAL1 ; Keep looping if there is a carry. + STD %r21,UN_SIXTEEN(%r23) + B $L0 + NOP + +; Here is the code that handles the difficult cases N=1, N=2, and N=3. +; We do the usual trick -- branch out of the startup code at appropriate +; points, and branch into the shutdown code. + +$N_IS_SMALL + CMPIB,= 0,%r26,$N_IS_ONE + FSTD %fr24,-96(%sp) ; Cycle 10 + FLDD EIGHT(%r24),%fr28 ; Cycle 8 + XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10 + XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11 + FSTD %fr25,-80(%sp) + FSTD %fr31,-64(%sp) ; Cycle 12 + XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13 + FSTD %fr27,-48(%sp) + XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1 + CMPIB,= 2,%r26,$N_IS_THREE + FSTD %fr30,-56(%sp) + +; N = 2 + FSTD %fr26,-88(%sp) ; Cycle 2 + FSTD %fr28,-104(%sp) ; Cycle 3 + LDD -96(%sp),%r3 ; Cycle 4 + FSTD %fr29,-72(%sp) + B $JOIN4 + ADD %r0,%r0,%r22 + +$N_IS_THREE + FLDD SIXTEEN(%r24),%fr24 + FSTD %fr26,-88(%sp) ; Cycle 2 + XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 + FSTD %fr28,-104(%sp) + XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 + LDD -96(%sp),%r3 + FSTD %fr29,-72(%sp) + XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 + LDD -64(%sp),%r19 + LDD -80(%sp),%r21 + B $JOIN3 + ADD %r0,%r0,%r22 + +$N_IS_ONE + FSTD %fr25,-80(%sp) + FSTD %fr27,-48(%sp) + FSTD %fr26,-88(%sp) ; Cycle 2 + B $JOIN5 + ADD %r0,%r0,%r22 + +; We came out of the unrolled loop with wrong parity. Do one more +; single cycle. This is quite tricky, because of the way the +; carry chains and SHRPD chains have been chopped up. + +$ONEMORE + + FLDD 0(%r24),%fr24 + + LDO SIXTEEN(%r23),%r23 ; Cycle 2 + ADD,DC %r0,%r20,%r20 + FSTD %fr26,-88(%sp) + + XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 + FSTD %fr28,-104(%sp) + LDD UN_EIGHT(%r23),%r21 + ADD %r3,%r1,%r1 + + XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 + ADD,DC %r21,%r4,%r28 + STD %r28,UN_EIGHT(%r23) ; moved from cycle 9 + LDD -96(%sp),%r3 + FSTD %fr29,-72(%sp) + + XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 + ADD,DC %r20,%r31,%r22 + LDD -64(%sp),%r19 + LDD -80(%sp),%r21 + + STD %r1,UN_SIXTEEN(%r23); Cycle 6 +$JOIN3 + XMPYU %fr9L,%fr24R,%fr24 + LDD -56(%sp),%r20 + ADD %r21,%r3,%r3 + + ADD,DC %r20,%r19,%r19 ; Cycle 7 + LDD -88(%sp),%r4 + SHRPD %r3,%r0,32,%r21 + LDD -48(%sp),%r1 + + LDD -104(%sp),%r31 ; Cycle 8 + ADD,DC %r0,%r0,%r20 + SHRPD %r19,%r3,32,%r3 + + LDD -72(%sp),%r29 ; Cycle 9 + SHRPD %r20,%r19,32,%r20 + ADD %r21,%r1,%r1 + + ADD,DC %r3,%r4,%r4 ; Cycle 10 + FSTD %fr24,-96(%sp) + + ADD,DC %r0,%r20,%r20 ; Cycle 11 + LDD 0(%r23),%r3 + FSTD %fr25,-80(%sp) + + ADD %r22,%r1,%r1 ; Cycle 13 + FSTD %fr27,-48(%sp) + +; Shutdown code, stage 1-1/2. + + ADD,DC %r29,%r4,%r4 ; Cycle 1 + + LDO SIXTEEN(%r23),%r23 ; Cycle 2 + ADD,DC %r0,%r20,%r20 + FSTD %fr26,-88(%sp) + + LDD UN_EIGHT(%r23),%r21 ; Cycle 3 + ADD %r3,%r1,%r1 + + ADD,DC %r21,%r4,%r28 ; Cycle 4 + STD %r28,UN_EIGHT(%r23) ; moved from cycle 9 + + ADD,DC %r20,%r31,%r22 ; Cycle 5 + STD %r1,UN_SIXTEEN(%r23) +$JOIN5 + LDD -96(%sp),%r3 ; moved from cycle 4 + LDD -80(%sp),%r21 + ADD %r21,%r3,%r3 ; Cycle 6 + ADD,DC %r0,%r0,%r19 ; Cycle 7 + LDD -88(%sp),%r4 + SHRPD %r3,%r0,32,%r21 + LDD -48(%sp),%r1 + SHRPD %r19,%r3,32,%r3 ; Cycle 8 + ADD %r21,%r1,%r1 ; Cycle 9 + ADD,DC %r3,%r4,%r4 ; Cycle 10 + LDD 0(%r23),%r3 ; Cycle 11 + ADD %r22,%r1,%r1 ; Cycle 13 + +; Shutdown code, stage 2-1/2. + + ADD,DC %r0,%r4,%r4 ; Cycle 1 + LDO SIXTEEN(%r23),%r23 ; Cycle 2 + LDD UN_EIGHT(%r23),%r21 ; Cycle 3 + ADD %r3,%r1,%r1 + STD %r1,UN_SIXTEEN(%r23) + ADD,DC %r21,%r4,%r1 + B $JOIN1 + LDO EIGHT(%r23),%r23 + +; exit + +$L0 + LDW -124(%sp),%r4 + BVE (%r2) + .EXIT + LDW,MB -128(%sp),%r3 + + .PROCEND + +; *************************************************************** +; +; add_diag_[little/big] +; +; *************************************************************** + +; The arguments are as follows: +; r2 return PC, of course +; r26 = arg1 = length +; r25 = arg2 = vector to square +; r24 = arg3 = result vector + +#ifdef LITTLE_WORDIAN +add_diag_little +#else +add_diag_big +#endif + .PROC + .CALLINFO FRAME=120,ENTRY_GR=4 + .ENTRY + STW,MA %r3,128(%sp) + STW %r4,-124(%sp) + + ADDIB,< -1,%r26,$Z0 ; If N=0, exit immediately. + NOP + +; Startup code + + FLDD 0(%r25),%fr7 ; Cycle 2 (alternate body) + XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4 + XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5 + XMPYU %fr7L,%fr7L,%fr30 + LDO SIXTEEN(%r25),%r25 ; Cycle 6 + FSTD %fr29,-88(%sp) + FSTD %fr27,-72(%sp) ; Cycle 7 + CMPIB,= 0,%r26,$DIAG_N_IS_ONE ; Cycle 1 (main body) + FSTD %fr30,-96(%sp) + FLDD UN_EIGHT(%r25),%fr7 ; Cycle 2 + LDD -88(%sp),%r22 ; Cycle 3 + LDD -72(%sp),%r31 ; Cycle 4 + XMPYU %fr7R,%fr7R,%fr28 + XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5 + XMPYU %fr7L,%fr7L,%fr31 + LDD -96(%sp),%r20 ; Cycle 6 + FSTD %fr28,-80(%sp) + ADD %r0,%r0,%r0 ; clear the carry bit + ADDIB,<= -2,%r26,$ENDDIAGLOOP ; Cycle 7 + FSTD %fr24,-64(%sp) + +; Here is the loop. It is unrolled twice, modelled after the "alternate body" and then the "main body". + +$DIAGLOOP + SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body) + LDO SIXTEEN(%r25),%r25 + LDD 0(%r24),%r1 + FSTD %fr31,-104(%sp) + SHRPD %r0,%r31,31,%r4 ; Cycle 2 + ADD,DC %r22,%r3,%r3 + FLDD UN_SIXTEEN(%r25),%fr7 + ADD,DC %r0,%r20,%r20 ; Cycle 3 + ADD %r1,%r3,%r3 + XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4 + LDD -80(%sp),%r21 + STD %r3,0(%r24) + XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5 + XMPYU %fr7L,%fr7L,%fr30 + LDD -64(%sp),%r29 + LDD EIGHT(%r24),%r1 + ADD,DC %r4,%r20,%r20 ; Cycle 6 + LDD -104(%sp),%r19 + FSTD %fr29,-88(%sp) + ADD %r20,%r1,%r1 ; Cycle 7 + FSTD %fr27,-72(%sp) + SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body) + LDO THIRTY_TWO(%r24),%r24 + LDD UN_SIXTEEN(%r24),%r28 + FSTD %fr30,-96(%sp) + SHRPD %r0,%r29,31,%r3 ; Cycle 2 + ADD,DC %r21,%r4,%r4 + FLDD UN_EIGHT(%r25),%fr7 + STD %r1,UN_TWENTY_FOUR(%r24) + ADD,DC %r0,%r19,%r19 ; Cycle 3 + ADD %r28,%r4,%r4 + XMPYU %fr7R,%fr7R,%fr28 ; Cycle 4 + LDD -88(%sp),%r22 + STD %r4,UN_SIXTEEN(%r24) + XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5 + XMPYU %fr7L,%fr7L,%fr31 + LDD -72(%sp),%r31 + LDD UN_EIGHT(%r24),%r28 + ADD,DC %r3,%r19,%r19 ; Cycle 6 + LDD -96(%sp),%r20 + FSTD %fr28,-80(%sp) + ADD %r19,%r28,%r28 ; Cycle 7 + FSTD %fr24,-64(%sp) + ADDIB,> -2,%r26,$DIAGLOOP ; Cycle 8 + STD %r28,UN_EIGHT(%r24) + +$ENDDIAGLOOP + + ADD,DC %r0,%r22,%r22 + CMPIB,= 0,%r26,$ONEMOREDIAG + SHRPD %r31,%r0,31,%r3 + +; Shutdown code, first stage. + + FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body) + LDD 0(%r24),%r28 + SHRPD %r0,%r31,31,%r4 ; Cycle 2 + ADD %r3,%r22,%r3 + ADD,DC %r0,%r20,%r20 ; Cycle 3 + LDD -80(%sp),%r21 + ADD %r3,%r28,%r3 + LDD -64(%sp),%r29 ; Cycle 4 + STD %r3,0(%r24) + LDD EIGHT(%r24),%r1 ; Cycle 5 + LDO SIXTEEN(%r25),%r25 ; Cycle 6 + LDD -104(%sp),%r19 + ADD,DC %r4,%r20,%r20 + ADD %r20,%r1,%r1 ; Cycle 7 + ADD,DC %r0,%r21,%r21 ; Cycle 8 + STD %r1,EIGHT(%r24) + +; Shutdown code, second stage. + + SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body) + LDO THIRTY_TWO(%r24),%r24 + LDD UN_SIXTEEN(%r24),%r1 + SHRPD %r0,%r29,31,%r3 ; Cycle 2 + ADD %r4,%r21,%r4 + ADD,DC %r0,%r19,%r19 ; Cycle 3 + ADD %r4,%r1,%r4 + STD %r4,UN_SIXTEEN(%r24); Cycle 4 + LDD UN_EIGHT(%r24),%r28 ; Cycle 5 + ADD,DC %r3,%r19,%r19 ; Cycle 6 + ADD %r19,%r28,%r28 ; Cycle 7 + ADD,DC %r0,%r0,%r22 ; Cycle 8 + CMPIB,*= 0,%r22,$Z0 ; if no overflow, exit + STD %r28,UN_EIGHT(%r24) + +; Final carry propagation + +$FDIAG2 + LDO EIGHT(%r24),%r24 + LDD UN_EIGHT(%r24),%r26 + ADDI 1,%r26,%r26 + CMPIB,*= 0,%r26,$FDIAG2 ; Keep looping if there is a carry. + STD %r26,UN_EIGHT(%r24) + + B $Z0 + NOP + +; Here is the code that handles the difficult case N=1. +; We do the usual trick -- branch out of the startup code at appropriate +; points, and branch into the shutdown code. + +$DIAG_N_IS_ONE + + LDD -88(%sp),%r22 + LDD -72(%sp),%r31 + B $JOINDIAG + LDD -96(%sp),%r20 + +; We came out of the unrolled loop with wrong parity. Do one more +; single cycle. This is the "alternate body". It will, of course, +; give us opposite registers from the other case, so we need +; completely different shutdown code. + +$ONEMOREDIAG + FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body) + LDD 0(%r24),%r28 + FLDD 0(%r25),%fr7 ; Cycle 2 + SHRPD %r0,%r31,31,%r4 + ADD %r3,%r22,%r3 + ADD,DC %r0,%r20,%r20 ; Cycle 3 + LDD -80(%sp),%r21 + ADD %r3,%r28,%r3 + LDD -64(%sp),%r29 ; Cycle 4 + STD %r3,0(%r24) + XMPYU %fr7R,%fr7R,%fr29 + LDD EIGHT(%r24),%r1 ; Cycle 5 + XMPYU %fr7L,%fr7R,%fr27 + XMPYU %fr7L,%fr7L,%fr30 + LDD -104(%sp),%r19 ; Cycle 6 + FSTD %fr29,-88(%sp) + ADD,DC %r4,%r20,%r20 + FSTD %fr27,-72(%sp) ; Cycle 7 + ADD %r20,%r1,%r1 + ADD,DC %r0,%r21,%r21 ; Cycle 8 + STD %r1,EIGHT(%r24) + +; Shutdown code, first stage. + + SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body) + LDO THIRTY_TWO(%r24),%r24 + FSTD %fr30,-96(%sp) + LDD UN_SIXTEEN(%r24),%r1 + SHRPD %r0,%r29,31,%r3 ; Cycle 2 + ADD %r4,%r21,%r4 + ADD,DC %r0,%r19,%r19 ; Cycle 3 + LDD -88(%sp),%r22 + ADD %r4,%r1,%r4 + LDD -72(%sp),%r31 ; Cycle 4 + STD %r4,UN_SIXTEEN(%r24) + LDD UN_EIGHT(%r24),%r28 ; Cycle 5 + LDD -96(%sp),%r20 ; Cycle 6 + ADD,DC %r3,%r19,%r19 + ADD %r19,%r28,%r28 ; Cycle 7 + ADD,DC %r0,%r22,%r22 ; Cycle 8 + STD %r28,UN_EIGHT(%r24) + +; Shutdown code, second stage. + +$JOINDIAG + SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body) + LDD 0(%r24),%r28 + SHRPD %r0,%r31,31,%r4 ; Cycle 2 + ADD %r3,%r22,%r3 + ADD,DC %r0,%r20,%r20 ; Cycle 3 + ADD %r3,%r28,%r3 + STD %r3,0(%r24) ; Cycle 4 + LDD EIGHT(%r24),%r1 ; Cycle 5 + ADD,DC %r4,%r20,%r20 + ADD %r20,%r1,%r1 ; Cycle 7 + ADD,DC %r0,%r0,%r21 ; Cycle 8 + CMPIB,*= 0,%r21,$Z0 ; if no overflow, exit + STD %r1,EIGHT(%r24) + +; Final carry propagation + +$FDIAG1 + LDO EIGHT(%r24),%r24 + LDD EIGHT(%r24),%r26 + ADDI 1,%r26,%r26 + CMPIB,*= 0,%r26,$FDIAG1 ; Keep looping if there is a carry. + STD %r26,EIGHT(%r24) + +$Z0 + LDW -124(%sp),%r4 + BVE (%r2) + .EXIT + LDW,MB -128(%sp),%r3 + .PROCEND +; .ALLOW + + .SPACE $TEXT$ + .SUBSPA $CODE$ +#ifdef LITTLE_WORDIAN +#ifdef __GNUC__ +; GNU-as (as of 2.19) does not support LONG_RETURN + .EXPORT maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR + .EXPORT add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR +#else + .EXPORT maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN + .EXPORT add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN +#endif +#else + .EXPORT maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN + .EXPORT add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN +#endif + .END + + +; How to use "maxpy_PA20_little" and "maxpy_PA20_big" +; +; The routine "maxpy_PA20_little" or "maxpy_PA20_big" +; performs a 64-bit x any-size multiply, and adds the +; result to an area of memory. That is, it performs +; something like +; +; A B C D +; * Z +; __________ +; P Q R S T +; +; and then adds the "PQRST" vector into an area of memory, +; handling all carries. +; +; Digression on nomenclature and endian-ness: +; +; Each of the capital letters in the above represents a 64-bit +; quantity. That is, you could think of the discussion as +; being in terms of radix-16-quintillion arithmetic. The data +; type being manipulated is "unsigned long long int". This +; requires the 64-bit extension of the HP-UX C compiler, +; available at release 10. You need these compiler flags to +; enable these extensions: +; +; -Aa +e +DA2.0 +DS2.0 +; +; (The first specifies ANSI C, the second enables the +; extensions, which are beyond ANSI C, and the third and +; fourth tell the compiler to use whatever features of the +; PA2.0 architecture it wishes, in order to made the code more +; efficient. Since the presence of the assembly code will +; make the program unable to run on anything less than PA2.0, +; you might as well gain the performance enhancements in the C +; code as well.) +; +; Questions of "endian-ness" often come up, usually in the +; context of byte ordering in a word. These routines have a +; similar issue, that could be called "wordian-ness". +; Independent of byte ordering (PA is always big-endian), one +; can make two choices when representing extremely large +; numbers as arrays of 64-bit doublewords in memory. +; +; "Little-wordian" layout means that the least significant +; word of a number is stored at the lowest address. +; +; MSW LSW +; | | +; V V +; +; A B C D E +; +; ^ ^ ^ +; | | |____ address 0 +; | | +; | |_______address 8 +; | +; address 32 +; +; "Big-wordian" means that the most significant word is at the +; lowest address. +; +; MSW LSW +; | | +; V V +; +; A B C D E +; +; ^ ^ ^ +; | | |____ address 32 +; | | +; | |_______address 24 +; | +; address 0 +; +; When you compile the file, you must specify one or the other, with +; a switch "-DLITTLE_WORDIAN" or "-DBIG_WORDIAN". +; +; Incidentally, you assemble this file as part of your +; project with the same C compiler as the rest of the program. +; My "makefile" for a superprecision arithmetic package has +; the following stuff: +; +; # definitions: +; CC = cc -Aa +e -z +DA2.0 +DS2.0 +w1 +; CFLAGS = +O3 +; LDFLAGS = -L /usr/lib -Wl,-aarchive +; +; # general build rule for ".s" files: +; .s.o: +; $(CC) $(CFLAGS) -c $< -DBIG_WORDIAN +; +; # Now any bind step that calls for pa20.o will assemble pa20.s +; +; End of digression, back to arithmetic: +; +; The way we multiply two huge numbers is, of course, to multiply +; the "ABCD" vector by each of the "WXYZ" doublewords, adding +; the result vectors with increasing offsets, the way we learned +; in school, back before we all used calculators: +; +; A B C D +; * W X Y Z +; __________ +; P Q R S T +; E F G H I +; M N O P Q +; + R S T U V +; _______________ +; F I N A L S U M +; +; So we call maxpy_PA20_big (in my case; my package is +; big-wordian) repeatedly, giving the W, X, Y, and Z arguments +; in turn as the "scalar", and giving the "ABCD" vector each +; time. We direct it to add its result into an area of memory +; that we have cleared at the start. We skew the exact +; location into that area with each call. +; +; The prototype for the function is +; +; extern void maxpy_PA20_big( +; int length, /* Number of doublewords in the multiplicand vector. */ +; const long long int *scalaraddr, /* Address to fetch the scalar. */ +; const long long int *multiplicand, /* The multiplicand vector. */ +; long long int *result); /* Where to accumulate the result. */ +; +; (You should place a copy of this prototype in an include file +; or in your C file.) +; +; Now, IN ALL CASES, the given address for the multiplicand or +; the result is that of the LEAST SIGNIFICANT DOUBLEWORD. +; That word is, of course, the word at which the routine +; starts processing. "maxpy_PA20_little" then increases the +; addresses as it computes. "maxpy_PA20_big" decreases them. +; +; In our example above, "length" would be 4 in each case. +; "multiplicand" would be the "ABCD" vector. Specifically, +; the address of the element "D". "scalaraddr" would be the +; address of "W", "X", "Y", or "Z" on the four calls that we +; would make. (The order doesn't matter, of course.) +; "result" would be the appropriate address in the result +; area. When multiplying by "Z", that would be the least +; significant word. When multiplying by "Y", it would be the +; next higher word (8 bytes higher if little-wordian; 8 bytes +; lower if big-wordian), and so on. The size of the result +; area must be the the sum of the sizes of the multiplicand +; and multiplier vectors, and must be initialized to zero +; before we start. +; +; Whenever the routine adds its partial product into the result +; vector, it follows carry chains as far as they need to go. +; +; Here is the super-precision multiply routine that I use for +; my package. The package is big-wordian. I have taken out +; handling of exponents (it's a floating point package): +; +; static void mul_PA20( +; int size, +; const long long int *arg1, +; const long long int *arg2, +; long long int *result) +; { +; int i; +; +; for (i=0 ; i<2*size ; i++) result[i] = 0ULL; +; +; for (i=0 ; i<size ; i++) { +; maxpy_PA20_big(size, &arg2[i], &arg1[size-1], &result[size+i]); +; } +; } |