;=======================================================================
; double sqrt(double t) -- 32-bit PowerPC, Mach-O (Apple GAS) syntax.
;
; Table-driven square root.  The top 5 mantissa bits of the argument
; index a 32-entry table at tz (16 bytes per entry: float adj, float
; pre, double post).  The exponent is forced to 0x3FF so the value
; lies in [1,2); x = pre*t' - 1 is the reduced variable; an 8-term
; polynomial in x is evaluated in split even/odd halves; "post"
; rescales the series, and a power of two rebuilt from the original
; exponent (stored at -8(r1)) restores the magnitude.  For prec > 50
; one Newton residual step refines the root; for prec > 57 the last
; bit is verified by probing (R +/- ulp)^2 against the argument.
;
; In:   f1 = t (double)
; Out:  f1 = sqrt(t); nan1 for t < 0; zeros returned unchanged;
;       denormals rescaled and recursed (see smallarg/DeNormal below);
;       exponent 0x7FF handled at WayTooBig.
; Regs: r26-r31 scratch, saved/restored via stmw/lmw; the caller's
;       f2-f5, f7, f10, f12, f13 are spilled at -zx-8(r1)..-zx-64(r1)
;       and reloaded on every exit path; CR saved in r28, restored
;       before return.  r1 is the stack pointer.
;=======================================================================
.globl _sqrt
prec=50
; This code neither reads nor writes the FPSCR
; It assumes that FPSCR indicates "round to nearest".
; It does not "signal inexact".
; 60 clocks for prec <= 50
; 68 clocks for 50 < prec <= 57
; 102 clocks for 57 < prec
_sqrt:
zx=40                                   ; number of stack bytes used by next two
                                        ; instructions.  must also be a multiple of 8
        stmw    r26,-6*4-8(r1)          ; save r26-r31 just below the f1 spill slot
        stfd    f1,-8(r1)               ; pun.f = t (reinterpret the bits via memory)
        li      r31,lo16(tz)            ; r31 -> tz, base for all table/constant loads
        addis   r31,r31,ha16(tz)
        lis     r30,0x3FF0              ; 0x3FF00000 (bias used to halve the exponent)
        lwz     r29,-8(r1)              ; pun.x.hi (sign | exponent | top mantissa)
        mfcr    r28                     ; save caller's CR
        rlwinm  r27,r29,32-11,23,27     ; w*16: top 5 mantissa bits scaled by entry size
        oris    r26,r29,0xFFF0
        xoris   r26,r26,0xC000          ; pun.x.hi with exp set to 3FF
        stw     r26,-8(r1)              ; pun.x.hi (value now reduced into [1,2))
        rlwinm. r26,r29,12,21,31        ; exp (11 bits, isolated; sets CR0)
        add     r27,r27,r31             ; -> vx element for this mantissa interval
        stfd    f4,-zx-8(r1)            ; save for caller
        lfs     f4,4(r27)               ; vx[w].pre (seed ~1/arg estimate)
        bc      12,2,smallarg           ; exp == 0: argument is zero or denormal
        stfd    f5,-zx-32(r1)           ; save for caller
        add     r30,r29,r30             ; pun.x.hi + 0x3FF00000
        mtcrf   0xFF,r29                ; oe to CR1 (sign -> CR bit 0, low exp bit -> bit 11)
        lfd     f1,-8(r1)               ; reduced argument (exponent forced to 3FF)
        stfd    f3,-zx-64(r1)           ; save for caller
        lfs     f3,lo16(c0-tz)(r31)     ; 1.
        rlwinm  r30,r30,32-1,1,11       ; expon adjust for ans. ((e+bias)>>1 into exp field)
; if pedantic
; mffs f5
; mtfsfi 7,0 ; guard against inexact interrupts
; ; and ensure round to nearest
; stfd f5,-40(r1) ; preserve caller's state
; endif
        stw     r30,-8(r1)              ; high word of the power-of-two rescale double
        fmsub   f4,f4,f1,f3             ; x = pre*t' - 1
        lfs     f5,lo16(c8-tz)(r31)     ; 429./32768 [x8]
        stfd    f13,-zx-16(r1)          ; save for caller
        lfs     f13,lo16(c6-tz)(r31)    ; 21./1024 [x6]
        li      r30,0                   ; low word of the rescale double
        stfd    f2,-zx-48(r1)           ; save for caller
        fmul    f2,f4,f4                ; x2
        stfd    f10,-zx-56(r1)          ; save for caller
        lfs     f10,lo16(c7-tz)(r31)    ; 33./2048 [x7]
        stfd    f12,-zx-24(r1)          ; save for caller
        lfs     f12,lo16(c5-tz)(r31)    ; 7./256 [x5]
        stfd    f7,-zx-40(r1)           ; save for caller
        fmadds  f7,f5,f2,f13            ; even-power half: c8*x2 + c6
        fmadds  f10,f10,f2,f12          ; odd-power half:  c7*x2 + c5
        lfs     f12,lo16(c4-tz)(r31)    ; 5./128 [x4]
        bc      12,0,NegArg             ; sign bit set -> NaN result.  NB. CR1 still in use
        lfs     f13,lo16(c3-tz)(r31)    ; 1./16 [x3]
        fmadds  f7,f7,f2,f12            ; even half: (...)*x2 + c4
        lfs     f5,lo16(c2-tz)(r31)     ; 1./8 [x2]
        stw     r30,-4(r1)              ; formed double power of two (at -8(r1))
        fmadd   f10,f10,f2,f13          ; odd half: (...)*x2 + c3
        lfs     f12,lo16(c1-tz)(r31)    ; 1./2 [x1]
        cmpi    0,0,r26,0x7ff           ; Infinity exponent?
        fmadd   f7,f7,f2,f5             ; even half: (...)*x2 + c2
        fmadd   f10,f10,f2,f12          ; odd half:  (...)*x2 + c1
        bc      12,2,WayTooBig          ; exp == 7FF: Inf/NaN argument
        fnmsub  f5,f7,f2,f3             ; even terms: 1 - x2*(c2 + ...)
        lfd     f13,8(r27)              ; vx[w].post (double rescale factor)
        lfd     f7,lo16(sqrt2-tz)(r31)  ; sqrt(2), for the exponent-parity fixup
        lfd     f2,-zx-48(r1)           ; for caller
        fmadd   f0,f4,f10,f5            ; whole series: even + x*odd
        lfd     f10,-zx-56(r1)          ; for caller
.if prec>50
        fmul    f0,f13,f0               ; r = post * series
        lfs     f12,0(r27)              ; adj (per-interval correction scale)
        bc      12,11,OddExponent       ; low exponent bit set: other parity path
        fadd    f1,f1,f1                ; holds pun.f value (doubled to match sqrt2 scaling)
        fmul    f0,f0,f7                ; this parity takes the sqrt2 factor
        lfd     f7,-8(r1)               ; power-of-two rescale for the final answer
join:
        fmsub   f13,f0,f0,f1            ; (r*r-pun.f): residual for the Newton step
.if prec>57
        fnmsub  f12,f13,f12,f0          ; holds adjusted root, R
        lfs     f4,lo16(tm52-tz)(r31)   ; 2^-52 (one ulp at this scale)
        fmsub   f3,f12,f12,f1           ; R*R - pun.f
; abs(root(pun.f) - R) < .5152 * 2^-52
        fsubs   f0,f1,f1                ; produce 0 as comparand!
        fmadd   f13,f4,f12,f3           ; (R+2^-53)^2 - pun.f - 2^-106
        fnmsub  f5,f4,f12,f3            ; (R-2^-53)^2 - pun.f - 2^-106
        lfd     f3,-zx-64(r1)           ; for caller
        fcmpu   1,f13,f0                ; CR1: sign of the upper probe
        fcmpu   5,f5,f0                 ; CR5: sign of the lower probe
        lfd     f5,-zx-32(r1)           ; for caller
        bc      12,4+0,TooSmall         ; (R+ulp)^2 < arg: R is one ulp low
        bc      4,20+0,TooBig           ; (R-ulp)^2 >= arg: R is one ulp high
Safe:
        mtcrf   0xFF,r28                ; restore caller's CR
        lfd     f4,-zx-8(r1)            ; for caller
.else
        lfd     f5,-zx-32(r1)           ; caller
        lfd     f4,-zx-8(r1)            ; for caller
        lfd     f3,-zx-64(r1)           ; for caller
        fnmsub  f12,f13,f12,f0          ; holds adjusted root, R
; abs(root(pun.f) - R) < .5152 * 2^-52
        mtcrf   0xFF,r28                ; restore caller's CR
.endif
        fmul    f1,f7,f12               ; final answer: 2^k * R
exit:
        lfd     f12,-zx-24(r1)          ; for caller
        lfd     f7,-zx-40(r1)           ; for caller
        lfd     f13,-zx-16(r1)          ; for caller
        lmw     r26,-4*6-8(r1)          ; restore r26-r31
        blr
.else
        lfd     f5,-zx-32(r1)           ; caller
        lfd     f12,-zx-24(r1)          ; for caller
        lfd     f4,-zx-8(r1)            ; for caller
        fmul    f0,f13,f0               ; r = post * series
        lfd     f3,-zx-64(r1)           ; for caller
        bc      12,11,1f                ; low exponent bit set: skip the sqrt2 factor
        fadd    f1,f1,f1                ; holds pun.f value
        fmul    f0,f0,f7                ; this parity takes the sqrt2 factor
1:
        lfd     f7,-8(r1)               ; power-of-two rescale for the final answer
                                        ; (the caller's f7 is reloaded at exit below)
        lfd     f13,-zx-16(r1)          ; for caller
        mtcrf   0xFF,r28                ; restore caller's CR
        fmul    f1,f7,f0                ; final answer: 2^k * r
exit:
        lfd     f7,-zx-40(r1)           ; for caller
        lmw     r26,-4*6-8(r1)          ; restore r26-r31
        blr
.endif
; with adjustment defeated the average absolute error is
; 0.6 of last bit while worst error is 3.5 times last bit.
; with adjustment average error is 0.25004 times last bit
; and worst case error is .515 times last bit.
; (.500 is best theoretical possible.)
; rms errors are respectively .8 & .3
;-----------------------------------------------------------------------
; Out-of-line paths for _sqrt, followed by the constant tables.
;-----------------------------------------------------------------------
.if prec>50
OddExponent:                            ; other exponent parity: fold sqrt2 into adj
        fmul    f12,f7,f12              ; adj (f7 holds sqrt2 here)
        lfd     f7,-8(r1)               ; power-of-two rescale for the final answer
        b       join
.endif
.if prec>57
TooBig:                                 ; rounding probe: R is one ulp high
        fsub    f12,f12,f4              ; R -= 2^-52
        b       Safe
TooSmall:                               ; rounding probe: R is one ulp low
        fadd    f12,f12,f4              ; R += 2^-52
        b       Safe
.endif
NegArg:                                 ; negative argument: return the NaN at nan1
        lfd     f1,lo16(nan1-tz)(r31)
.if prec<=50
        lfd     f12,-zx-24(r1)          ; for caller
        lfd     f13,-zx-16(r1)          ; for caller
.endif
        lfd     f2,-zx-48(r1)           ; restore the caller's FP registers
        lfd     f3,-zx-64(r1)
        lfd     f4,-zx-8(r1)
        lfd     f5,-zx-32(r1)
        lfd     f10,-zx-56(r1)
        mtcrf   0xFF,r28                ; restore caller's CR
        b       exit
resZero:
        addic.  r30,r30,0               ; test pun.x.lo
        bc      12,2,out                ; return for .sqrt(0) (f1 still holds +/-0)
        b       DeNormal
smallarg:                               ; Exponent of arg is 0.
        lwz     r30,-4(r1)              ; pun.x.lo
        rlwinm. r26,r29,0,1,31          ; test pun.x.hi (sign bit masked off)
        bc      12,2,resZero
DeNormal:                               ; denormal: scale up by 2^60, recurse,
                                        ; then scale the root down by 2^-30
; st r1,-zx-16(r1) ; This would please debuggers.
        subi    r1,r1,zx+16             ; open a frame covering the spill area
        mflr    r27
        stw     r27,0(r1)               ; Preserve original caller of .sqrt
        lfs     f4,lo16(t60-tz)(r31)    ; 2^60
        fmul    f1,f1,f4
        bl      _sqrt
        lfs     f4,lo16(tm30-tz)(r31)   ; 2^-30
        lwz     r27,0(r1)               ; Retrieve original caller
        mtlr    r27
        fmul    f1,f1,f4
        addi    r1,r1,zx+16
out:
        lfd     f4,-zx-8(r1)            ; for caller
        lmw     r26,-4*6-8(r1)          ; restore r26-r31
        blr
WayTooBig:                              ; exponent 0x7FF: Inf or NaN argument
        stw     r29,-8(r1)              ; put it back as it was
                                        ; NOTE(review): -4(r1) was zeroed on the main
                                        ; path, so a NaN whose payload lives only in
                                        ; the low word reloads here as Infinity --
                                        ; confirm this corner case is intended
        lfd     f1,-8(r1)               ; reconstructed original argument
        b       out
;-----------------------------------------------------------------------
; vx table: 32 entries of 16 bytes, indexed by the top 5 mantissa bits.
; Layout per entry: { float adj (+0), float pre (+4), double post (+8) }.
;-----------------------------------------------------------------------
tz:
        .long 0x3EB39F19,0x3F7C0FC4,0x3FF01FE0,0x26B0EBA8
        .long 0x3EB0EB96,0x3F74898F,0x3FF05EE6,0x810A0447
        .long 0x3EAE565C,0x3F6D7307,0x3FF09CFD,0xB018663D
        .long 0x3EABDD46,0x3F66C2B5,0x3FF0DA30,0x46DF0BCB
        .long 0x3EA97E62,0x3F607038,0x3FF11687,0xA9BF7D1E
        .long 0x3EA737F0,0x3F5A7410,0x3FF1520C,0xB9665E7C
        .long 0x3EA50855,0x3F54C77B,0x3FF18CC8,0x21F9ED73
        .long 0x3EA2EE1D,0x3F4F6475,0x3FF1C6C1,0x676DCBA5
        .long 0x3EA0E7F5,0x3F4A4588,0x3FF1FFFF,0xFEE00000
        .long 0x3E9EF4A4,0x3F4565CB,0x3FF2388A,0xA24549E3
        .long 0x3E9D130E,0x3F40C0C6,0x3FF27067,0xE1508515
        .long 0x3E9B422C,0x3F3C5265,0x3FF2A79E,0x2E142258
        .long 0x3E99810C,0x3F381706,0x3FF2DE32,0x9D67F180
        .long 0x3E97CED0,0x3F340B43,0x3FF3142B,0x11823B72
        .long 0x3E962AA9,0x3F302C0C,0x3FF3498C,0x89D42842
        .long 0x3E9493D9,0x3F2C7692,0x3FF37E5B,0xCD0E2CA5
        .long 0x3E9309AF,0x3F28E842,0x3FF3B29D,0x55AF516A
        .long 0x3E918B87,0x3F257EB5,0x3FF3E655,0xEF25E013
        .long 0x3E9018C6,0x3F2237C3,0x3FF41989,0x4ECE8E2D
        .long 0x3E8EB0E0,0x3F1F1166,0x3FF44C3B,0x824F7CAD
        .long 0x3E8D534F,0x3F1C09C5,0x3FF47E70,0xADFAA87
                                        ; NOTE(review): only 7 hex digits above
                                        ; (assembles as 0x0ADFAA87); every sibling
                                        ; entry has 8 -- verify against the generator
        .long 0x3E8BFF97,0x3F191F1A,0x3FF4B02B,0x54FAD60B
        .long 0x3E8AB544,0x3F164FDE,0x3FF4E16F,0x9B31198B
        .long 0x3E8973E8,0x3F139A86,0x3FF51241,0x312541A3
        .long 0x3E883B1E,0x3F10FDBD,0x3FF542A2,0x66B811F2
        .long 0x3E870A87,0x3F0E7835,0x3FF57296,0xA3C13A85
        .long 0x3E85E1C7,0x3F0C08C4,0x3FF5A220,0x2F014E4E
        .long 0x3E84C08B,0x3F09AE41,0x3FF5D142,0xAD7BC321
        .long 0x3E83A682,0x3F0767AC,0x3FF5FFFF,0xF2F0000C
        .long 0x3E829362,0x3F053408,0x3FF62E5A,0xD3FABB21
        .long 0x3E8186E2,0x3F03126F,0x3FF65C55,0x799527C5
        .long 0x3E8080C1,0x3F010204,0x3FF689F2,0x6D1F5164
;-----------------------------------------------------------------------
; Polynomial coefficients (single precision) and scaling constants.
;-----------------------------------------------------------------------
c0:     .long 0x3F800000                ; 1.0000000000000e+00
c1:     .long 0x3F000000                ; 5.0000000000000e-01
c2:     .long 0x3E000000                ; 1.2500000000000e-01
c3:     .long 0x3D800000                ; 6.2500000000000e-02
c4:     .long 0x3D200000                ; 3.9062500000000e-02
c5:     .long 0x3CE00000                ; 2.7343750000000e-02
c6:     .long 0x3CA80000                ; 2.0507812500000e-02
c7:     .long 0x3C840000                ; 1.6113281250000e-02
c8:     .long 0x3C568000                ; 1.3092041015625e-02
t60:    .long 0x5D800000                ; 2^60 (single) -- denormal scale-up
sqrt2:  .long 0x3FF6A09E,0x667F3BCD     ; sqrt(2) (double)
nan1:   .long 0x7FF80020,0              ; quiet NaN returned for negative args
tm30:   .long 0x30800000                ; 2^-30 (single) -- denormal scale-down
tm52:   .long 0x25800000                ; 2^-52 (single) -- one-ulp probe offset