#include <math.h>
typedef long double ld;
#include <complex.h>
typedef long double _Complex c;
// The following 4 values are good for the X86 extended precision.
#define bg2 0x1.p8000L
#define bg  0x1.p4000L
#define sm2 0x1.p-8000L
#define sm  0x1.p-4000L
// remove CMPLXL definition in C11
static c CMPLXL(ld r, ld i){c x; __real__ x = r; __imag__ x = i; return x;}

c csqrtl(c x){
ld A = creall(x), B = cimagl(x);
ld aA = fabsl(A), aB = fabsl(B);
if(A < 0) {c r = csqrtl(-x); if(B<0) r = -r;
   return CMPLXL(-cimagl(r), creall(r));}
if(aA > bg2 || aB > bg2) return
    (aA==INFINITY || aB==INFINITY) ? x :  bg*csqrtl(sm2*x);
if(aA < sm2 && aB < sm2) return (x==0) ? x : sm*csqrtl(bg2*x);
{ld r = sqrtl((A + sqrtl(A*A + B*B))/2); return CMPLXL(r, B/(2*r));}}
// We assume that underflow for multipliction yields 0.

/*
// unit test:
#include <stdio.h>
void pc(c x, char* m){printf("%Le %Le %s\n", creall(x), cimagl(x), m);}
void tr(c x){c r = csqrtl(x); printf("\n");
   pc(x, "x"); pc(r, "r"); pc(r*r-x, "r*r-x");}
int main(){tr(2+3i); tr(-1-0.001i); tr(-1);
  tr(0.003 - 4i); tr(-0.003 - 4i); tr(1e1800L-2e1800Li);
  tr(-0.7 + .002i); tr(-0.7 - .002i); tr(3e-1900L-2e-1900Li); return 0;}
*/