;=======================================================================
; double sqrt(double t) -- 32-bit PowerPC, Mach-O (Apple GAS) syntax.
;
; Table-driven square root.  The top 5 mantissa bits of the argument
; index a 32-entry table at tz (16 bytes per entry: float adj, float
; pre, double post).  The exponent is forced to 0x3FF so the value
; lies in [1,2); x = pre*t' - 1 is the reduced variable; an 8-term
; polynomial in x is evaluated in split even/odd halves; "post"
; rescales the series, and a power of two rebuilt from the original
; exponent (stored at -8(r1)) restores the magnitude.  For prec > 50
; one Newton residual step refines the root; for prec > 57 the last
; bit is verified by probing (R +/- ulp)^2 against the argument.
;
; In:   f1 = t (double)
; Out:  f1 = sqrt(t); nan1 for t < 0; zeros returned unchanged;
;       denormals rescaled and recursed (see smallarg/DeNormal below);
;       exponent 0x7FF handled at WayTooBig.
; Regs: r26-r31 scratch, saved/restored via stmw/lmw; the caller's
;       f2-f5, f7, f10, f12, f13 are spilled at -zx-8(r1)..-zx-64(r1)
;       and reloaded on every exit path; CR saved in r28, restored
;       before return.  r1 is the stack pointer.
;=======================================================================
.globl _sqrt
prec=50
; This code neither reads nor writes the FPSCR
; It assumes that FPSCR indicates "round to nearest".
; It does not "signal inexact".
; 60 clocks for prec <= 50
; 68 clocks for 50 < prec <= 57
; 102 clocks for 57 < prec
_sqrt:
zx=40                                   ; number of stack bytes used by next two
                                        ; instructions.  must also be a multiple of 8
        stmw    r26,-6*4-8(r1)          ; save r26-r31 just below the f1 spill slot
        stfd    f1,-8(r1)               ; pun.f = t (reinterpret the bits via memory)
        li      r31,lo16(tz)            ; r31 -> tz, base for all table/constant loads
        addis   r31,r31,ha16(tz)
        lis     r30,0x3FF0              ; 0x3FF00000 (bias used to halve the exponent)
        lwz     r29,-8(r1)              ; pun.x.hi (sign | exponent | top mantissa)
        mfcr    r28                     ; save caller's CR
        rlwinm  r27,r29,32-11,23,27     ; w*16: top 5 mantissa bits scaled by entry size
        oris    r26,r29,0xFFF0
        xoris   r26,r26,0xC000          ; pun.x.hi with exp set to 3FF
        stw     r26,-8(r1)              ; pun.x.hi (value now reduced into [1,2))
        rlwinm. r26,r29,12,21,31        ; exp (11 bits, isolated; sets CR0)
        add     r27,r27,r31             ; -> vx element for this mantissa interval
        stfd    f4,-zx-8(r1)            ; save for caller
        lfs     f4,4(r27)               ; vx[w].pre (seed ~1/arg estimate)
        bc      12,2,smallarg           ; exp == 0: argument is zero or denormal
        stfd    f5,-zx-32(r1)           ; save for caller
        add     r30,r29,r30             ; pun.x.hi + 0x3FF00000
        mtcrf   0xFF,r29                ; oe to CR1 (sign -> CR bit 0, low exp bit -> bit 11)
        lfd     f1,-8(r1)               ; reduced argument (exponent forced to 3FF)
        stfd    f3,-zx-64(r1)           ; save for caller
        lfs     f3,lo16(c0-tz)(r31)     ; 1.
        rlwinm  r30,r30,32-1,1,11       ; expon adjust for ans. ((e+bias)>>1 into exp field)
; if pedantic
; mffs f5
; mtfsfi 7,0 ; guard against inexact interrupts
; ; and ensure round to nearest
; stfd f5,-40(r1) ; preserve caller's state
; endif
        stw     r30,-8(r1)              ; high word of the power-of-two rescale double
        fmsub   f4,f4,f1,f3             ; x = pre*t' - 1
        lfs     f5,lo16(c8-tz)(r31)     ; 429./32768 [x8]
        stfd    f13,-zx-16(r1)          ; save for caller
        lfs     f13,lo16(c6-tz)(r31)    ; 21./1024 [x6]
        li      r30,0                   ; low word of the rescale double
        stfd    f2,-zx-48(r1)           ; save for caller
        fmul    f2,f4,f4                ; x2
        stfd    f10,-zx-56(r1)          ; save for caller
        lfs     f10,lo16(c7-tz)(r31)    ; 33./2048 [x7]
        stfd    f12,-zx-24(r1)          ; save for caller
        lfs     f12,lo16(c5-tz)(r31)    ; 7./256 [x5]
        stfd    f7,-zx-40(r1)           ; save for caller
        fmadds  f7,f5,f2,f13            ; even-power half: c8*x2 + c6
        fmadds  f10,f10,f2,f12          ; odd-power half:  c7*x2 + c5
        lfs     f12,lo16(c4-tz)(r31)    ; 5./128 [x4]
        bc      12,0,NegArg             ; sign bit set -> NaN result.  NB. CR1 still in use
        lfs     f13,lo16(c3-tz)(r31)    ; 1./16 [x3]
        fmadds  f7,f7,f2,f12            ; even half: (...)*x2 + c4
        lfs     f5,lo16(c2-tz)(r31)     ; 1./8 [x2]
        stw     r30,-4(r1)              ; formed double power of two (at -8(r1))
        fmadd   f10,f10,f2,f13          ; odd half: (...)*x2 + c3
        lfs     f12,lo16(c1-tz)(r31)    ; 1./2 [x1]
        cmpi    0,0,r26,0x7ff           ; Infinity exponent?
        fmadd   f7,f7,f2,f5             ; even half: (...)*x2 + c2
        fmadd   f10,f10,f2,f12          ; odd half:  (...)*x2 + c1
        bc      12,2,WayTooBig          ; exp == 7FF: Inf/NaN argument
        fnmsub  f5,f7,f2,f3             ; even terms: 1 - x2*(c2 + ...)
        lfd     f13,8(r27)              ; vx[w].post (double rescale factor)
        lfd     f7,lo16(sqrt2-tz)(r31)  ; sqrt(2), for the exponent-parity fixup
        lfd     f2,-zx-48(r1)           ; for caller
        fmadd   f0,f4,f10,f5            ; whole series: even + x*odd
        lfd     f10,-zx-56(r1)          ; for caller
.if prec>50
        fmul    f0,f13,f0               ; r = post * series
        lfs     f12,0(r27)              ; adj (per-interval correction scale)
        bc      12,11,OddExponent       ; low exponent bit set: other parity path
        fadd    f1,f1,f1                ; holds pun.f value (doubled to match sqrt2 scaling)
        fmul    f0,f0,f7                ; this parity takes the sqrt2 factor
        lfd     f7,-8(r1)               ; power-of-two rescale for the final answer
join:
        fmsub   f13,f0,f0,f1            ; (r*r-pun.f): residual for the Newton step
.if prec>57
        fnmsub  f12,f13,f12,f0          ; holds adjusted root, R
        lfs     f4,lo16(tm52-tz)(r31)   ; 2^-52 (one ulp at this scale)
        fmsub   f3,f12,f12,f1           ; R*R - pun.f
; abs(root(pun.f) - R) < .5152 * 2^-52
        fsubs   f0,f1,f1                ; produce 0 as comparand!
        fmadd   f13,f4,f12,f3           ; (R+2^-53)^2 - pun.f - 2^-106
        fnmsub  f5,f4,f12,f3            ; (R-2^-53)^2 - pun.f - 2^-106
        lfd     f3,-zx-64(r1)           ; for caller
        fcmpu   1,f13,f0                ; CR1: sign of the upper probe
        fcmpu   5,f5,f0                 ; CR5: sign of the lower probe
        lfd     f5,-zx-32(r1)           ; for caller
        bc      12,4+0,TooSmall         ; (R+ulp)^2 < arg: R is one ulp low
        bc      4,20+0,TooBig           ; (R-ulp)^2 >= arg: R is one ulp high
Safe:
        mtcrf   0xFF,r28                ; restore caller's CR
        lfd     f4,-zx-8(r1)            ; for caller
.else
        lfd     f5,-zx-32(r1)           ; caller
        lfd     f4,-zx-8(r1)            ; for caller
        lfd     f3,-zx-64(r1)           ; for caller
        fnmsub  f12,f13,f12,f0          ; holds adjusted root, R
; abs(root(pun.f) - R) < .5152 * 2^-52
        mtcrf   0xFF,r28                ; restore caller's CR
.endif
        fmul    f1,f7,f12               ; final answer: 2^k * R
exit:
        lfd     f12,-zx-24(r1)          ; for caller
        lfd     f7,-zx-40(r1)           ; for caller
        lfd     f13,-zx-16(r1)          ; for caller
        lmw     r26,-4*6-8(r1)          ; restore r26-r31
        blr
.else
        lfd     f5,-zx-32(r1)           ; caller
        lfd     f12,-zx-24(r1)          ; for caller
        lfd     f4,-zx-8(r1)            ; for caller
        fmul    f0,f13,f0               ; r = post * series
        lfd     f3,-zx-64(r1)           ; for caller
        bc      12,11,1f                ; low exponent bit set: skip the sqrt2 factor
        fadd    f1,f1,f1                ; holds pun.f value
        fmul    f0,f0,f7                ; this parity takes the sqrt2 factor
1:
        lfd     f7,-8(r1)               ; power-of-two rescale for the final answer
                                        ; (the caller's f7 is reloaded at exit below)
        lfd     f13,-zx-16(r1)          ; for caller
        mtcrf   0xFF,r28                ; restore caller's CR
        fmul    f1,f7,f0                ; final answer: 2^k * r
exit:
        lfd     f7,-zx-40(r1)           ; for caller
        lmw     r26,-4*6-8(r1)          ; restore r26-r31
        blr
.endif
; with adjustment defeated the average absolute error is
; 0.6 of last bit while worst error is 3.5 times last bit.
; with adjustment average error is 0.25004 times last bit
; and worst case error is .515 times last bit.
; (.500 is best theoretical possible.)
; rms errors are respectively .8 & .3
;-----------------------------------------------------------------------
; Out-of-line paths for _sqrt, followed by the constant tables.
;-----------------------------------------------------------------------
.if prec>50
OddExponent:                            ; other exponent parity: fold sqrt2 into adj
        fmul    f12,f7,f12              ; adj (f7 holds sqrt2 here)
        lfd     f7,-8(r1)               ; power-of-two rescale for the final answer
        b       join
.endif
.if prec>57
TooBig:                                 ; rounding probe: R is one ulp high
        fsub    f12,f12,f4              ; R -= 2^-52
        b       Safe
TooSmall:                               ; rounding probe: R is one ulp low
        fadd    f12,f12,f4              ; R += 2^-52
        b       Safe
.endif
NegArg:                                 ; negative argument: return the NaN at nan1
        lfd     f1,lo16(nan1-tz)(r31)
.if prec<=50
        lfd     f12,-zx-24(r1)          ; for caller
        lfd     f13,-zx-16(r1)          ; for caller
.endif
        lfd     f2,-zx-48(r1)           ; restore the caller's FP registers
        lfd     f3,-zx-64(r1)
        lfd     f4,-zx-8(r1)
        lfd     f5,-zx-32(r1)
        lfd     f10,-zx-56(r1)
        mtcrf   0xFF,r28                ; restore caller's CR
        b       exit
resZero:
        addic.  r30,r30,0               ; test pun.x.lo
        bc      12,2,out                ; return for .sqrt(0) (f1 still holds +/-0)
        b       DeNormal
smallarg:                               ; Exponent of arg is 0.
        lwz     r30,-4(r1)              ; pun.x.lo
        rlwinm. r26,r29,0,1,31          ; test pun.x.hi (sign bit masked off)
        bc      12,2,resZero
DeNormal:                               ; denormal: scale up by 2^60, recurse,
                                        ; then scale the root down by 2^-30
; st r1,-zx-16(r1) ; This would please debuggers.
        subi    r1,r1,zx+16             ; open a frame covering the spill area
        mflr    r27
        stw     r27,0(r1)               ; Preserve original caller of .sqrt
        lfs     f4,lo16(t60-tz)(r31)    ; 2^60
        fmul    f1,f1,f4
        bl      _sqrt
        lfs     f4,lo16(tm30-tz)(r31)   ; 2^-30
        lwz     r27,0(r1)               ; Retrieve original caller
        mtlr    r27
        fmul    f1,f1,f4
        addi    r1,r1,zx+16
out:
        lfd     f4,-zx-8(r1)            ; for caller
        lmw     r26,-4*6-8(r1)          ; restore r26-r31
        blr
WayTooBig:                              ; exponent 0x7FF: Inf or NaN argument
        stw     r29,-8(r1)              ; put it back as it was
                                        ; NOTE(review): -4(r1) was zeroed on the main
                                        ; path, so a NaN whose payload lives only in
                                        ; the low word reloads here as Infinity --
                                        ; confirm this corner case is intended
        lfd     f1,-8(r1)               ; reconstructed original argument
        b       out
;-----------------------------------------------------------------------
; vx table: 32 entries of 16 bytes, indexed by the top 5 mantissa bits.
; Layout per entry: { float adj (+0), float pre (+4), double post (+8) }.
;-----------------------------------------------------------------------
tz:
        .long 0x3EB39F19,0x3F7C0FC4,0x3FF01FE0,0x26B0EBA8
        .long 0x3EB0EB96,0x3F74898F,0x3FF05EE6,0x810A0447
        .long 0x3EAE565C,0x3F6D7307,0x3FF09CFD,0xB018663D
        .long 0x3EABDD46,0x3F66C2B5,0x3FF0DA30,0x46DF0BCB
        .long 0x3EA97E62,0x3F607038,0x3FF11687,0xA9BF7D1E
        .long 0x3EA737F0,0x3F5A7410,0x3FF1520C,0xB9665E7C
        .long 0x3EA50855,0x3F54C77B,0x3FF18CC8,0x21F9ED73
        .long 0x3EA2EE1D,0x3F4F6475,0x3FF1C6C1,0x676DCBA5
        .long 0x3EA0E7F5,0x3F4A4588,0x3FF1FFFF,0xFEE00000
        .long 0x3E9EF4A4,0x3F4565CB,0x3FF2388A,0xA24549E3
        .long 0x3E9D130E,0x3F40C0C6,0x3FF27067,0xE1508515
        .long 0x3E9B422C,0x3F3C5265,0x3FF2A79E,0x2E142258
        .long 0x3E99810C,0x3F381706,0x3FF2DE32,0x9D67F180
        .long 0x3E97CED0,0x3F340B43,0x3FF3142B,0x11823B72
        .long 0x3E962AA9,0x3F302C0C,0x3FF3498C,0x89D42842
        .long 0x3E9493D9,0x3F2C7692,0x3FF37E5B,0xCD0E2CA5
        .long 0x3E9309AF,0x3F28E842,0x3FF3B29D,0x55AF516A
        .long 0x3E918B87,0x3F257EB5,0x3FF3E655,0xEF25E013
        .long 0x3E9018C6,0x3F2237C3,0x3FF41989,0x4ECE8E2D
        .long 0x3E8EB0E0,0x3F1F1166,0x3FF44C3B,0x824F7CAD
        .long 0x3E8D534F,0x3F1C09C5,0x3FF47E70,0xADFAA87
                                        ; NOTE(review): only 7 hex digits above
                                        ; (assembles as 0x0ADFAA87); every sibling
                                        ; entry has 8 -- verify against the generator
        .long 0x3E8BFF97,0x3F191F1A,0x3FF4B02B,0x54FAD60B
        .long 0x3E8AB544,0x3F164FDE,0x3FF4E16F,0x9B31198B
        .long 0x3E8973E8,0x3F139A86,0x3FF51241,0x312541A3
        .long 0x3E883B1E,0x3F10FDBD,0x3FF542A2,0x66B811F2
        .long 0x3E870A87,0x3F0E7835,0x3FF57296,0xA3C13A85
        .long 0x3E85E1C7,0x3F0C08C4,0x3FF5A220,0x2F014E4E
        .long 0x3E84C08B,0x3F09AE41,0x3FF5D142,0xAD7BC321
        .long 0x3E83A682,0x3F0767AC,0x3FF5FFFF,0xF2F0000C
        .long 0x3E829362,0x3F053408,0x3FF62E5A,0xD3FABB21
        .long 0x3E8186E2,0x3F03126F,0x3FF65C55,0x799527C5
        .long 0x3E8080C1,0x3F010204,0x3FF689F2,0x6D1F5164
;-----------------------------------------------------------------------
; Polynomial coefficients (single precision) and scaling constants.
;-----------------------------------------------------------------------
c0:     .long 0x3F800000                ; 1.0000000000000e+00
c1:     .long 0x3F000000                ; 5.0000000000000e-01
c2:     .long 0x3E000000                ; 1.2500000000000e-01
c3:     .long 0x3D800000                ; 6.2500000000000e-02
c4:     .long 0x3D200000                ; 3.9062500000000e-02
c5:     .long 0x3CE00000                ; 2.7343750000000e-02
c6:     .long 0x3CA80000                ; 2.0507812500000e-02
c7:     .long 0x3C840000                ; 1.6113281250000e-02
c8:     .long 0x3C568000                ; 1.3092041015625e-02
t60:    .long 0x5D800000                ; 2^60 (single) -- denormal scale-up
sqrt2:  .long 0x3FF6A09E,0x667F3BCD     ; sqrt(2) (double)
nan1:   .long 0x7FF80020,0              ; quiet NaN returned for negative args
tm30:   .long 0x30800000                ; 2^-30 (single) -- denormal scale-down
tm52:   .long 0x25800000                ; 2^-52 (single) -- one-ulp probe offset