; This Source Code Form is subject to the terms of the Mozilla Public ; License, v. 2.0. If a copy of the MPL was not distributed with this ; file, You can obtain one at http://mozilla.org/MPL/2.0/. #ifdef __LP64__ .LEVEL 2.0W #else ; .LEVEL 1.1 ; .ALLOW 2.0N .LEVEL 2.0 #endif .SPACE $TEXT$,SORT=8 .SUBSPA $CODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,CODE_ONLY,SORT=24 ; *************************************************************** ; ; maxpy_[little/big] ; ; *************************************************************** ; There is no default -- you must specify one or the other. #define LITTLE_WORDIAN 1 #ifdef LITTLE_WORDIAN #define EIGHT 8 #define SIXTEEN 16 #define THIRTY_TWO 32 #define UN_EIGHT -8 #define UN_SIXTEEN -16 #define UN_TWENTY_FOUR -24 #endif #ifdef BIG_WORDIAN #define EIGHT -8 #define SIXTEEN -16 #define THIRTY_TWO -32 #define UN_EIGHT 8 #define UN_SIXTEEN 16 #define UN_TWENTY_FOUR 24 #endif ; This performs a multiple-precision integer version of "daxpy", ; Using the selected addressing direction. "Little-wordian" means that ; the least significant word of a number is stored at the lowest address. ; "Big-wordian" means that the most significant word is at the lowest ; address. Either way, the incoming address of the vector is that ; of the least significant word. That means that, for little-wordian ; addressing, we move the address upward as we propagate carries ; from the least significant word to the most significant. For ; big-wordian we move the address downward. ; We use the following registers: ; ; r2 return PC, of course ; r26 = arg1 = length ; r25 = arg2 = address of scalar ; r24 = arg3 = multiplicand vector ; r23 = arg4 = result vector ; ; fr9 = scalar loaded once only from r25 ; The cycle counts shown in the bodies below are simply the result of a ; scheduling by hand. The actual PCX-U hardware does it differently. ; The intention is that the overall speed is the same. ; The pipeline startup and shutdown code is constructed in the usual way, ; by taking the loop bodies and removing unnecessary instructions. ; We have left the comments describing cycle numbers in the code. ; These are intended for reference when comparing with the main loop, ; and have no particular relationship to actual cycle numbers. #ifdef LITTLE_WORDIAN maxpy_little #else maxpy_big #endif .PROC .CALLINFO FRAME=120,ENTRY_GR=4 .ENTRY STW,MA %r3,128(%sp) STW %r4,-124(%sp) ADDIB,< -1,%r26,$L0 ; If N = 0, exit immediately. FLDD 0(%r25),%fr9 ; fr9 = scalar ; First startup FLDD 0(%r24),%fr24 ; Cycle 1 XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 CMPIB,> 3,%r26,$N_IS_SMALL ; Pick out cases N = 1, 2, or 3 XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6 FLDD EIGHT(%r24),%fr28 ; Cycle 8 XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10 FSTD %fr24,-96(%sp) XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11 FSTD %fr25,-80(%sp) LDO SIXTEEN(%r24),%r24 ; Cycle 12 FSTD %fr31,-64(%sp) XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13 FSTD %fr27,-48(%sp) ; Second startup XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1 FSTD %fr30,-56(%sp) FLDD 0(%r24),%fr24 FSTD %fr26,-88(%sp) ; Cycle 2 XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 FSTD %fr28,-104(%sp) XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 LDD -96(%sp),%r3 FSTD %fr29,-72(%sp) XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 LDD -64(%sp),%r19 LDD -80(%sp),%r21 XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6 LDD -56(%sp),%r20 ADD %r21,%r3,%r3 ADD,DC %r20,%r19,%r19 ; Cycle 7 LDD -88(%sp),%r4 SHRPD %r3,%r0,32,%r21 LDD -48(%sp),%r1 FLDD EIGHT(%r24),%fr28 ; Cycle 8 LDD -104(%sp),%r31 ADD,DC %r0,%r0,%r20 SHRPD %r19,%r3,32,%r3 LDD -72(%sp),%r29 ; Cycle 9 SHRPD %r20,%r19,32,%r20 ADD %r21,%r1,%r1 XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10 ADD,DC %r3,%r4,%r4 FSTD %fr24,-96(%sp) XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11 ADD,DC %r0,%r20,%r20 LDD 0(%r23),%r3 FSTD %fr25,-80(%sp) LDO SIXTEEN(%r24),%r24 ; Cycle 12 FSTD %fr31,-64(%sp) XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13 ADD %r0,%r0,%r0 ; clear the carry bit ADDIB,<= -4,%r26,$ENDLOOP ; actually happens in cycle 12 FSTD %fr27,-48(%sp) ; MFCTL %cr16,%r21 ; for timing ; STD %r21,-112(%sp) ; Here is the loop. $LOOP XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1 ADD,DC %r29,%r4,%r4 FSTD %fr30,-56(%sp) FLDD 0(%r24),%fr24 LDO SIXTEEN(%r23),%r23 ; Cycle 2 ADD,DC %r0,%r20,%r20 FSTD %fr26,-88(%sp) XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 ADD %r3,%r1,%r1 FSTD %fr28,-104(%sp) LDD UN_EIGHT(%r23),%r21 XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 ADD,DC %r21,%r4,%r28 FSTD %fr29,-72(%sp) LDD -96(%sp),%r3 XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 ADD,DC %r20,%r31,%r22 LDD -64(%sp),%r19 LDD -80(%sp),%r21 XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6 ADD %r21,%r3,%r3 LDD -56(%sp),%r20 STD %r1,UN_SIXTEEN(%r23) ADD,DC %r20,%r19,%r19 ; Cycle 7 SHRPD %r3,%r0,32,%r21 LDD -88(%sp),%r4 LDD -48(%sp),%r1 ADD,DC %r0,%r0,%r20 ; Cycle 8 SHRPD %r19,%r3,32,%r3 FLDD EIGHT(%r24),%fr28 LDD -104(%sp),%r31 SHRPD %r20,%r19,32,%r20 ; Cycle 9 ADD %r21,%r1,%r1 STD %r28,UN_EIGHT(%r23) LDD -72(%sp),%r29 XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10 ADD,DC %r3,%r4,%r4 FSTD %fr24,-96(%sp) XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11 ADD,DC %r0,%r20,%r20 FSTD %fr25,-80(%sp) LDD 0(%r23),%r3 LDO SIXTEEN(%r24),%r24 ; Cycle 12 FSTD %fr31,-64(%sp) XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13 ADD %r22,%r1,%r1 ADDIB,> -2,%r26,$LOOP ; actually happens in cycle 12 FSTD %fr27,-48(%sp) $ENDLOOP ; Shutdown code, first stage. ; MFCTL %cr16,%r21 ; for timing ; STD %r21,UN_SIXTEEN(%r23) ; LDD -112(%sp),%r21 ; STD %r21,UN_EIGHT(%r23) XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1 ADD,DC %r29,%r4,%r4 CMPIB,= 0,%r26,$ONEMORE FSTD %fr30,-56(%sp) LDO SIXTEEN(%r23),%r23 ; Cycle 2 ADD,DC %r0,%r20,%r20 FSTD %fr26,-88(%sp) ADD %r3,%r1,%r1 ; Cycle 3 FSTD %fr28,-104(%sp) LDD UN_EIGHT(%r23),%r21 ADD,DC %r21,%r4,%r28 ; Cycle 4 FSTD %fr29,-72(%sp) STD %r28,UN_EIGHT(%r23) ; moved up from cycle 9 LDD -96(%sp),%r3 ADD,DC %r20,%r31,%r22 ; Cycle 5 STD %r1,UN_SIXTEEN(%r23) $JOIN4 LDD -64(%sp),%r19 LDD -80(%sp),%r21 ADD %r21,%r3,%r3 ; Cycle 6 LDD -56(%sp),%r20 ADD,DC %r20,%r19,%r19 ; Cycle 7 SHRPD %r3,%r0,32,%r21 LDD -88(%sp),%r4 LDD -48(%sp),%r1 ADD,DC %r0,%r0,%r20 ; Cycle 8 SHRPD %r19,%r3,32,%r3 LDD -104(%sp),%r31 SHRPD %r20,%r19,32,%r20 ; Cycle 9 ADD %r21,%r1,%r1 LDD -72(%sp),%r29 ADD,DC %r3,%r4,%r4 ; Cycle 10 ADD,DC %r0,%r20,%r20 ; Cycle 11 LDD 0(%r23),%r3 ADD %r22,%r1,%r1 ; Cycle 13 ; Shutdown code, second stage. ADD,DC %r29,%r4,%r4 ; Cycle 1 LDO SIXTEEN(%r23),%r23 ; Cycle 2 ADD,DC %r0,%r20,%r20 LDD UN_EIGHT(%r23),%r21 ; Cycle 3 ADD %r3,%r1,%r1 ADD,DC %r21,%r4,%r28 ; Cycle 4 ADD,DC %r20,%r31,%r22 ; Cycle 5 STD %r1,UN_SIXTEEN(%r23); Cycle 6 STD %r28,UN_EIGHT(%r23) ; Cycle 9 LDD 0(%r23),%r3 ; Cycle 11 ; Shutdown code, third stage. LDO SIXTEEN(%r23),%r23 ADD %r3,%r22,%r1 $JOIN1 ADD,DC %r0,%r0,%r21 CMPIB,*= 0,%r21,$L0 ; if no overflow, exit STD %r1,UN_SIXTEEN(%r23) ; Final carry propagation $FINAL1 LDO EIGHT(%r23),%r23 LDD UN_SIXTEEN(%r23),%r21 ADDI 1,%r21,%r21 CMPIB,*= 0,%r21,$FINAL1 ; Keep looping if there is a carry. STD %r21,UN_SIXTEEN(%r23) B $L0 NOP ; Here is the code that handles the difficult cases N=1, N=2, and N=3. ; We do the usual trick -- branch out of the startup code at appropriate ; points, and branch into the shutdown code. $N_IS_SMALL CMPIB,= 0,%r26,$N_IS_ONE FSTD %fr24,-96(%sp) ; Cycle 10 FLDD EIGHT(%r24),%fr28 ; Cycle 8 XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10 XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11 FSTD %fr25,-80(%sp) FSTD %fr31,-64(%sp) ; Cycle 12 XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13 FSTD %fr27,-48(%sp) XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1 CMPIB,= 2,%r26,$N_IS_THREE FSTD %fr30,-56(%sp) ; N = 2 FSTD %fr26,-88(%sp) ; Cycle 2 FSTD %fr28,-104(%sp) ; Cycle 3 LDD -96(%sp),%r3 ; Cycle 4 FSTD %fr29,-72(%sp) B $JOIN4 ADD %r0,%r0,%r22 $N_IS_THREE FLDD SIXTEEN(%r24),%fr24 FSTD %fr26,-88(%sp) ; Cycle 2 XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 FSTD %fr28,-104(%sp) XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 LDD -96(%sp),%r3 FSTD %fr29,-72(%sp) XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 LDD -64(%sp),%r19 LDD -80(%sp),%r21 B $JOIN3 ADD %r0,%r0,%r22 $N_IS_ONE FSTD %fr25,-80(%sp) FSTD %fr27,-48(%sp) FSTD %fr26,-88(%sp) ; Cycle 2 B $JOIN5 ADD %r0,%r0,%r22 ; We came out of the unrolled loop with wrong parity. Do one more ; single cycle. This is quite tricky, because of the way the ; carry chains and SHRPD chains have been chopped up. $ONEMORE FLDD 0(%r24),%fr24 LDO SIXTEEN(%r23),%r23 ; Cycle 2 ADD,DC %r0,%r20,%r20 FSTD %fr26,-88(%sp) XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 FSTD %fr28,-104(%sp) LDD UN_EIGHT(%r23),%r21 ADD %r3,%r1,%r1 XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 ADD,DC %r21,%r4,%r28 STD %r28,UN_EIGHT(%r23) ; moved from cycle 9 LDD -96(%sp),%r3 FSTD %fr29,-72(%sp) XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 ADD,DC %r20,%r31,%r22 LDD -64(%sp),%r19 LDD -80(%sp),%r21 STD %r1,UN_SIXTEEN(%r23); Cycle 6 $JOIN3 XMPYU %fr9L,%fr24R,%fr24 LDD -56(%sp),%r20 ADD %r21,%r3,%r3 ADD,DC %r20,%r19,%r19 ; Cycle 7 LDD -88(%sp),%r4 SHRPD %r3,%r0,32,%r21 LDD -48(%sp),%r1 LDD -104(%sp),%r31 ; Cycle 8 ADD,DC %r0,%r0,%r20 SHRPD %r19,%r3,32,%r3 LDD -72(%sp),%r29 ; Cycle 9 SHRPD %r20,%r19,32,%r20 ADD %r21,%r1,%r1 ADD,DC %r3,%r4,%r4 ; Cycle 10 FSTD %fr24,-96(%sp) ADD,DC %r0,%r20,%r20 ; Cycle 11 LDD 0(%r23),%r3 FSTD %fr25,-80(%sp) ADD %r22,%r1,%r1 ; Cycle 13 FSTD %fr27,-48(%sp) ; Shutdown code, stage 1-1/2. ADD,DC %r29,%r4,%r4 ; Cycle 1 LDO SIXTEEN(%r23),%r23 ; Cycle 2 ADD,DC %r0,%r20,%r20 FSTD %fr26,-88(%sp) LDD UN_EIGHT(%r23),%r21 ; Cycle 3 ADD %r3,%r1,%r1 ADD,DC %r21,%r4,%r28 ; Cycle 4 STD %r28,UN_EIGHT(%r23) ; moved from cycle 9 ADD,DC %r20,%r31,%r22 ; Cycle 5 STD %r1,UN_SIXTEEN(%r23) $JOIN5 LDD -96(%sp),%r3 ; moved from cycle 4 LDD -80(%sp),%r21 ADD %r21,%r3,%r3 ; Cycle 6 ADD,DC %r0,%r0,%r19 ; Cycle 7 LDD -88(%sp),%r4 SHRPD %r3,%r0,32,%r21 LDD -48(%sp),%r1 SHRPD %r19,%r3,32,%r3 ; Cycle 8 ADD %r21,%r1,%r1 ; Cycle 9 ADD,DC %r3,%r4,%r4 ; Cycle 10 LDD 0(%r23),%r3 ; Cycle 11 ADD %r22,%r1,%r1 ; Cycle 13 ; Shutdown code, stage 2-1/2. ADD,DC %r0,%r4,%r4 ; Cycle 1 LDO SIXTEEN(%r23),%r23 ; Cycle 2 LDD UN_EIGHT(%r23),%r21 ; Cycle 3 ADD %r3,%r1,%r1 STD %r1,UN_SIXTEEN(%r23) ADD,DC %r21,%r4,%r1 B $JOIN1 LDO EIGHT(%r23),%r23 ; exit $L0 LDW -124(%sp),%r4 BVE (%r2) .EXIT LDW,MB -128(%sp),%r3 .PROCEND ; *************************************************************** ; ; add_diag_[little/big] ; ; *************************************************************** ; The arguments are as follows: ; r2 return PC, of course ; r26 = arg1 = length ; r25 = arg2 = vector to square ; r24 = arg3 = result vector #ifdef LITTLE_WORDIAN add_diag_little #else add_diag_big #endif .PROC .CALLINFO FRAME=120,ENTRY_GR=4 .ENTRY STW,MA %r3,128(%sp) STW %r4,-124(%sp) ADDIB,< -1,%r26,$Z0 ; If N=0, exit immediately. NOP ; Startup code FLDD 0(%r25),%fr7 ; Cycle 2 (alternate body) XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4 XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5 XMPYU %fr7L,%fr7L,%fr30 LDO SIXTEEN(%r25),%r25 ; Cycle 6 FSTD %fr29,-88(%sp) FSTD %fr27,-72(%sp) ; Cycle 7 CMPIB,= 0,%r26,$DIAG_N_IS_ONE ; Cycle 1 (main body) FSTD %fr30,-96(%sp) FLDD UN_EIGHT(%r25),%fr7 ; Cycle 2 LDD -88(%sp),%r22 ; Cycle 3 LDD -72(%sp),%r31 ; Cycle 4 XMPYU %fr7R,%fr7R,%fr28 XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5 XMPYU %fr7L,%fr7L,%fr31 LDD -96(%sp),%r20 ; Cycle 6 FSTD %fr28,-80(%sp) ADD %r0,%r0,%r0 ; clear the carry bit ADDIB,<= -2,%r26,$ENDDIAGLOOP ; Cycle 7 FSTD %fr24,-64(%sp) ; Here is the loop. It is unrolled twice, modelled after the "alternate body" and then the "main body". $DIAGLOOP SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body) LDO SIXTEEN(%r25),%r25 LDD 0(%r24),%r1 FSTD %fr31,-104(%sp) SHRPD %r0,%r31,31,%r4 ; Cycle 2 ADD,DC %r22,%r3,%r3 FLDD UN_SIXTEEN(%r25),%fr7 ADD,DC %r0,%r20,%r20 ; Cycle 3 ADD %r1,%r3,%r3 XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4 LDD -80(%sp),%r21 STD %r3,0(%r24) XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5 XMPYU %fr7L,%fr7L,%fr30 LDD -64(%sp),%r29 LDD EIGHT(%r24),%r1 ADD,DC %r4,%r20,%r20 ; Cycle 6 LDD -104(%sp),%r19 FSTD %fr29,-88(%sp) ADD %r20,%r1,%r1 ; Cycle 7 FSTD %fr27,-72(%sp) SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body) LDO THIRTY_TWO(%r24),%r24 LDD UN_SIXTEEN(%r24),%r28 FSTD %fr30,-96(%sp) SHRPD %r0,%r29,31,%r3 ; Cycle 2 ADD,DC %r21,%r4,%r4 FLDD UN_EIGHT(%r25),%fr7 STD %r1,UN_TWENTY_FOUR(%r24) ADD,DC %r0,%r19,%r19 ; Cycle 3 ADD %r28,%r4,%r4 XMPYU %fr7R,%fr7R,%fr28 ; Cycle 4 LDD -88(%sp),%r22 STD %r4,UN_SIXTEEN(%r24) XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5 XMPYU %fr7L,%fr7L,%fr31 LDD -72(%sp),%r31 LDD UN_EIGHT(%r24),%r28 ADD,DC %r3,%r19,%r19 ; Cycle 6 LDD -96(%sp),%r20 FSTD %fr28,-80(%sp) ADD %r19,%r28,%r28 ; Cycle 7 FSTD %fr24,-64(%sp) ADDIB,> -2,%r26,$DIAGLOOP ; Cycle 8 STD %r28,UN_EIGHT(%r24) $ENDDIAGLOOP ADD,DC %r0,%r22,%r22 CMPIB,= 0,%r26,$ONEMOREDIAG SHRPD %r31,%r0,31,%r3 ; Shutdown code, first stage. FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body) LDD 0(%r24),%r28 SHRPD %r0,%r31,31,%r4 ; Cycle 2 ADD %r3,%r22,%r3 ADD,DC %r0,%r20,%r20 ; Cycle 3 LDD -80(%sp),%r21 ADD %r3,%r28,%r3 LDD -64(%sp),%r29 ; Cycle 4 STD %r3,0(%r24) LDD EIGHT(%r24),%r1 ; Cycle 5 LDO SIXTEEN(%r25),%r25 ; Cycle 6 LDD -104(%sp),%r19 ADD,DC %r4,%r20,%r20 ADD %r20,%r1,%r1 ; Cycle 7 ADD,DC %r0,%r21,%r21 ; Cycle 8 STD %r1,EIGHT(%r24) ; Shutdown code, second stage. SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body) LDO THIRTY_TWO(%r24),%r24 LDD UN_SIXTEEN(%r24),%r1 SHRPD %r0,%r29,31,%r3 ; Cycle 2 ADD %r4,%r21,%r4 ADD,DC %r0,%r19,%r19 ; Cycle 3 ADD %r4,%r1,%r4 STD %r4,UN_SIXTEEN(%r24); Cycle 4 LDD UN_EIGHT(%r24),%r28 ; Cycle 5 ADD,DC %r3,%r19,%r19 ; Cycle 6 ADD %r19,%r28,%r28 ; Cycle 7 ADD,DC %r0,%r0,%r22 ; Cycle 8 CMPIB,*= 0,%r22,$Z0 ; if no overflow, exit STD %r28,UN_EIGHT(%r24) ; Final carry propagation $FDIAG2 LDO EIGHT(%r24),%r24 LDD UN_EIGHT(%r24),%r26 ADDI 1,%r26,%r26 CMPIB,*= 0,%r26,$FDIAG2 ; Keep looping if there is a carry. STD %r26,UN_EIGHT(%r24) B $Z0 NOP ; Here is the code that handles the difficult case N=1. ; We do the usual trick -- branch out of the startup code at appropriate ; points, and branch into the shutdown code. $DIAG_N_IS_ONE LDD -88(%sp),%r22 LDD -72(%sp),%r31 B $JOINDIAG LDD -96(%sp),%r20 ; We came out of the unrolled loop with wrong parity. Do one more ; single cycle. This is the "alternate body". It will, of course, ; give us opposite registers from the other case, so we need ; completely different shutdown code. $ONEMOREDIAG FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body) LDD 0(%r24),%r28 FLDD 0(%r25),%fr7 ; Cycle 2 SHRPD %r0,%r31,31,%r4 ADD %r3,%r22,%r3 ADD,DC %r0,%r20,%r20 ; Cycle 3 LDD -80(%sp),%r21 ADD %r3,%r28,%r3 LDD -64(%sp),%r29 ; Cycle 4 STD %r3,0(%r24) XMPYU %fr7R,%fr7R,%fr29 LDD EIGHT(%r24),%r1 ; Cycle 5 XMPYU %fr7L,%fr7R,%fr27 XMPYU %fr7L,%fr7L,%fr30 LDD -104(%sp),%r19 ; Cycle 6 FSTD %fr29,-88(%sp) ADD,DC %r4,%r20,%r20 FSTD %fr27,-72(%sp) ; Cycle 7 ADD %r20,%r1,%r1 ADD,DC %r0,%r21,%r21 ; Cycle 8 STD %r1,EIGHT(%r24) ; Shutdown code, first stage. SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body) LDO THIRTY_TWO(%r24),%r24 FSTD %fr30,-96(%sp) LDD UN_SIXTEEN(%r24),%r1 SHRPD %r0,%r29,31,%r3 ; Cycle 2 ADD %r4,%r21,%r4 ADD,DC %r0,%r19,%r19 ; Cycle 3 LDD -88(%sp),%r22 ADD %r4,%r1,%r4 LDD -72(%sp),%r31 ; Cycle 4 STD %r4,UN_SIXTEEN(%r24) LDD UN_EIGHT(%r24),%r28 ; Cycle 5 LDD -96(%sp),%r20 ; Cycle 6 ADD,DC %r3,%r19,%r19 ADD %r19,%r28,%r28 ; Cycle 7 ADD,DC %r0,%r22,%r22 ; Cycle 8 STD %r28,UN_EIGHT(%r24) ; Shutdown code, second stage. $JOINDIAG SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body) LDD 0(%r24),%r28 SHRPD %r0,%r31,31,%r4 ; Cycle 2 ADD %r3,%r22,%r3 ADD,DC %r0,%r20,%r20 ; Cycle 3 ADD %r3,%r28,%r3 STD %r3,0(%r24) ; Cycle 4 LDD EIGHT(%r24),%r1 ; Cycle 5 ADD,DC %r4,%r20,%r20 ADD %r20,%r1,%r1 ; Cycle 7 ADD,DC %r0,%r0,%r21 ; Cycle 8 CMPIB,*= 0,%r21,$Z0 ; if no overflow, exit STD %r1,EIGHT(%r24) ; Final carry propagation $FDIAG1 LDO EIGHT(%r24),%r24 LDD EIGHT(%r24),%r26 ADDI 1,%r26,%r26 CMPIB,*= 0,%r26,$FDIAG1 ; Keep looping if there is a carry. STD %r26,EIGHT(%r24) $Z0 LDW -124(%sp),%r4 BVE (%r2) .EXIT LDW,MB -128(%sp),%r3 .PROCEND ; .ALLOW .SPACE $TEXT$ .SUBSPA $CODE$ #ifdef LITTLE_WORDIAN #ifdef __GNUC__ ; GNU-as (as of 2.19) does not support LONG_RETURN .EXPORT maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR .EXPORT add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR #else .EXPORT maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN .EXPORT add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN #endif #else .EXPORT maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN .EXPORT add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN #endif .END ; How to use "maxpy_PA20_little" and "maxpy_PA20_big" ; ; The routine "maxpy_PA20_little" or "maxpy_PA20_big" ; performs a 64-bit x any-size multiply, and adds the ; result to an area of memory. That is, it performs ; something like ; ; A B C D ; * Z ; __________ ; P Q R S T ; ; and then adds the "PQRST" vector into an area of memory, ; handling all carries. ; ; Digression on nomenclature and endian-ness: ; ; Each of the capital letters in the above represents a 64-bit ; quantity. That is, you could think of the discussion as ; being in terms of radix-16-quintillion arithmetic. The data ; type being manipulated is "unsigned long long int". This ; requires the 64-bit extension of the HP-UX C compiler, ; available at release 10. You need these compiler flags to ; enable these extensions: ; ; -Aa +e +DA2.0 +DS2.0 ; ; (The first specifies ANSI C, the second enables the ; extensions, which are beyond ANSI C, and the third and ; fourth tell the compiler to use whatever features of the ; PA2.0 architecture it wishes, in order to made the code more ; efficient. Since the presence of the assembly code will ; make the program unable to run on anything less than PA2.0, ; you might as well gain the performance enhancements in the C ; code as well.) ; ; Questions of "endian-ness" often come up, usually in the ; context of byte ordering in a word. These routines have a ; similar issue, that could be called "wordian-ness". ; Independent of byte ordering (PA is always big-endian), one ; can make two choices when representing extremely large ; numbers as arrays of 64-bit doublewords in memory. ; ; "Little-wordian" layout means that the least significant ; word of a number is stored at the lowest address. ; ; MSW LSW ; | | ; V V ; ; A B C D E ; ; ^ ^ ^ ; | | |____ address 0 ; | | ; | |_______address 8 ; | ; address 32 ; ; "Big-wordian" means that the most significant word is at the ; lowest address. ; ; MSW LSW ; | | ; V V ; ; A B C D E ; ; ^ ^ ^ ; | | |____ address 32 ; | | ; | |_______address 24 ; | ; address 0 ; ; When you compile the file, you must specify one or the other, with ; a switch "-DLITTLE_WORDIAN" or "-DBIG_WORDIAN". ; ; Incidentally, you assemble this file as part of your ; project with the same C compiler as the rest of the program. ; My "makefile" for a superprecision arithmetic package has ; the following stuff: ; ; # definitions: ; CC = cc -Aa +e -z +DA2.0 +DS2.0 +w1 ; CFLAGS = +O3 ; LDFLAGS = -L /usr/lib -Wl,-aarchive ; ; # general build rule for ".s" files: ; .s.o: ; $(CC) $(CFLAGS) -c $< -DBIG_WORDIAN ; ; # Now any bind step that calls for pa20.o will assemble pa20.s ; ; End of digression, back to arithmetic: ; ; The way we multiply two huge numbers is, of course, to multiply ; the "ABCD" vector by each of the "WXYZ" doublewords, adding ; the result vectors with increasing offsets, the way we learned ; in school, back before we all used calculators: ; ; A B C D ; * W X Y Z ; __________ ; P Q R S T ; E F G H I ; M N O P Q ; + R S T U V ; _______________ ; F I N A L S U M ; ; So we call maxpy_PA20_big (in my case; my package is ; big-wordian) repeatedly, giving the W, X, Y, and Z arguments ; in turn as the "scalar", and giving the "ABCD" vector each ; time. We direct it to add its result into an area of memory ; that we have cleared at the start. We skew the exact ; location into that area with each call. ; ; The prototype for the function is ; ; extern void maxpy_PA20_big( ; int length, /* Number of doublewords in the multiplicand vector. */ ; const long long int *scalaraddr, /* Address to fetch the scalar. */ ; const long long int *multiplicand, /* The multiplicand vector. */ ; long long int *result); /* Where to accumulate the result. */ ; ; (You should place a copy of this prototype in an include file ; or in your C file.) ; ; Now, IN ALL CASES, the given address for the multiplicand or ; the result is that of the LEAST SIGNIFICANT DOUBLEWORD. ; That word is, of course, the word at which the routine ; starts processing. "maxpy_PA20_little" then increases the ; addresses as it computes. "maxpy_PA20_big" decreases them. ; ; In our example above, "length" would be 4 in each case. ; "multiplicand" would be the "ABCD" vector. Specifically, ; the address of the element "D". "scalaraddr" would be the ; address of "W", "X", "Y", or "Z" on the four calls that we ; would make. (The order doesn't matter, of course.) ; "result" would be the appropriate address in the result ; area. When multiplying by "Z", that would be the least ; significant word. When multiplying by "Y", it would be the ; next higher word (8 bytes higher if little-wordian; 8 bytes ; lower if big-wordian), and so on. The size of the result ; area must be the the sum of the sizes of the multiplicand ; and multiplier vectors, and must be initialized to zero ; before we start. ; ; Whenever the routine adds its partial product into the result ; vector, it follows carry chains as far as they need to go. ; ; Here is the super-precision multiply routine that I use for ; my package. The package is big-wordian. I have taken out ; handling of exponents (it's a floating point package): ; ; static void mul_PA20( ; int size, ; const long long int *arg1, ; const long long int *arg2, ; long long int *result) ; { ; int i; ; ; for (i=0 ; i<2*size ; i++) result[i] = 0ULL; ; ; for (i=0 ; i