diff --git a/arb.h b/arb.h
index 6b1008510..1fa01aaa6 100644
--- a/arb.h
+++ b/arb.h
@@ -413,6 +413,8 @@ void arb_mul_si(arb_t z, const arb_t x, slong y, slong prec);
 void arb_mul_ui(arb_t z, const arb_t x, ulong y, slong prec);
 void arb_mul_fmpz(arb_t z, const arb_t x, const fmpz_t y, slong prec);
 
+void arb_sqr(arb_t res, const arb_t x, slong prec);
+
 void arb_addmul(arb_t z, const arb_t x, const arb_t y, slong prec);
 void arb_addmul_arf(arb_t z, const arb_t x, const arf_t y, slong prec);
 void arb_addmul_si(arb_t z, const arb_t x, slong y, slong prec);
@@ -484,9 +486,27 @@ void arb_rsqrt(arb_t z, const arb_t x, slong prec);
 void arb_rsqrt_ui(arb_t z, ulong x, slong prec);
 void arb_sqrt1pm1(arb_t r, const arb_t z, slong prec);
 
-void arb_pow_fmpz_binexp(arb_t y, const arb_t b, const fmpz_t e, slong prec);
-void arb_pow_fmpz(arb_t y, const arb_t b, const fmpz_t e, slong prec);
-void arb_pow_ui(arb_t y, const arb_t b, ulong e, slong prec);
+void _arb_pow_mpz_binexp(arb_t y, const arb_t b, mpz_srcptr e, slong prec);
+void arb_pow_ui_binexp(arb_t y, const arb_t b, ulong e, slong prec);
+void arb_pow_si(arb_t y, const arb_t b, slong e, slong prec);
+ARB_INLINE void
+arb_pow_fmpz_binexp(arb_t y, const arb_t b, const fmpz_t e, slong prec)
+{
+    if (!COEFF_IS_MPZ(*e))
+        arb_pow_si(y, b, *e, prec);
+    else
+        _arb_pow_mpz_binexp(y, b, COEFF_TO_PTR(*e), prec);
+}
+ARB_INLINE void
+arb_pow_fmpz(arb_t y, const arb_t b, const fmpz_t e, slong prec)
+{
+    arb_pow_fmpz_binexp(y, b, e, prec);
+}
+ARB_INLINE void
+arb_pow_ui(arb_t y, const arb_t b, ulong e, slong prec)
+{
+    arb_pow_ui_binexp(y, b, e, prec);
+}
 void arb_ui_pow_ui(arb_t y, ulong b, ulong e, slong prec);
 void arb_si_pow_ui(arb_t y, slong b, ulong e, slong prec);
 void arb_pow_fmpq(arb_t y, const arb_t x, const fmpq_t a, slong prec);
@@ -620,12 +640,6 @@ void arb_primorial_ui(arb_t res, ulong n, slong prec);
 
 void arb_lambertw(arb_t res, const arb_t x, int flags, slong prec);
 
-ARB_INLINE void
-arb_sqr(arb_t res, const arb_t val, slong prec)
-{
-    arb_mul(res, val, val, prec);
-}
-
 #define ARB_DEF_CACHED_CONSTANT(name, comp_func) \
     TLS_PREFIX slong name ## _cached_prec = 0; \
     TLS_PREFIX arb_t name ## _cached_value; \
diff --git a/arb/pow_binexp.c b/arb/pow_binexp.c
new file mode 100644
index 000000000..9e21ca5d9
--- /dev/null
+++ b/arb/pow_binexp.c
@@ -0,0 +1,100 @@
+/*
+    Copyright (C) 2012, 2013 Fredrik Johansson
+    Copyright (C) 2021 Albin Ahlbäck
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "arb.h"
+
+void
+_arb_pow_mpz_binexp(arb_t y, const arb_t b, mpz_srcptr e, slong prec)
+{
+    slong i, wp, bits;
+
+    if (e->_mp_size < 0)
+    {
+        __mpz_struct f = { 1, 0, NULL };
+        f._mp_size = -(e->_mp_size);
+        f._mp_d = e->_mp_d;
+
+        if (arb_is_exact(b))
+        {
+            _arb_pow_mpz_binexp(y, b, &f, prec + 2);
+            arb_inv(y, y, prec);
+        }
+        else
+        {
+            arb_inv(y, b, prec + mpz_sizeinbase(e, 2) + 2);
+            _arb_pow_mpz_binexp(y, y, &f, prec);
+        }
+
+        return;
+    }
+
+    if (y == b)
+    {
+        arb_t t;
+        arb_init(t);
+        arb_set(t, b);
+        _arb_pow_mpz_binexp(y, t, e, prec);
+        arb_clear(t);
+        return;
+    }
+
+    arb_set(y, b);
+
+    bits = mpz_sizeinbase(e, 2);
+    wp = ARF_PREC_ADD(prec, bits);
+
+    for (i = bits - 2; i >= 0; i--)
+    {
+        arb_sqr(y, y, wp);
+        if (mpz_tstbit(e, i))
+            arb_mul(y, y, b, wp);
+    }
+}
+
+void
+arb_pow_ui_binexp(arb_t y, const arb_t b, ulong e, slong prec)
+{
+    slong i, wp, bits;
+
+    if (e <= 2)
+    {
+        if (e == 0)
+            arb_one(y);
+        else if (e == 1)
+            arb_set_round(y, b, prec);
+        else
+            arb_sqr(y, b, prec);
+        return;
+    }
+
+    if (y == b)
+    {
+        arb_t t;
+        arb_init(t);
+        arb_set(t, b);
+        arb_pow_ui_binexp(y, t, e, prec);
+        arb_clear(t);
+        return;
+    }
+
+    arb_set(y, b);
+
+    bits = FLINT_BIT_COUNT(e);
+    wp = ARF_PREC_ADD(prec, bits);
+
+    for (i = bits - 2; i >= 0; i--)
+    {
+        arb_sqr(y, y, wp);
+        if (((WORD(1) << i) & e) != 0)
+            arb_mul(y, y, b, wp);
+    }
+}
diff --git a/arb/pow_fmpz_binexp.c b/arb/pow_fmpz_binexp.c
deleted file mode 100644
index 4e4f66afd..000000000
--- a/arb/pow_fmpz_binexp.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
-    Copyright (C) 2012, 2013 Fredrik Johansson
-
-    This file is part of Arb.
-
-    Arb is free software: you can redistribute it and/or modify it under
-    the terms of the GNU Lesser General Public License (LGPL) as published
-    by the Free Software Foundation; either version 2.1 of the License, or
-    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
-*/
-
-#include "arb.h"
-
-void
-arb_pow_fmpz_binexp(arb_t y, const arb_t b, const fmpz_t e, slong prec)
-{
-    slong i, wp, bits;
-
-    if (-WORD(2) <= *e && *e <= WORD(2))
-    {
-        if (*e == WORD(0))
-            arb_one(y);
-        else if (*e == WORD(1))
-            arb_set_round(y, b, prec);
-        else if (*e == -WORD(1))
-            arb_inv(y, b, prec);
-        else if (*e == WORD(2))
-            arb_mul(y, b, b, prec);
-        else
-        {
-            arb_inv(y, b, prec);
-            arb_mul(y, y, y, prec);
-        }
-        return;
-    }
-
-    if (fmpz_sgn(e) < 0)
-    {
-        fmpz_t f;
-        fmpz_init(f);
-        fmpz_neg(f, e);
-
-        if (arb_is_exact(b))
-        {
-            arb_pow_fmpz_binexp(y, b, f, prec + 2);
-            arb_inv(y, y, prec);
-        }
-        else
-        {
-            arb_inv(y, b, prec + fmpz_bits(e) + 2);
-            arb_pow_fmpz_binexp(y, y, f, prec);
-        }
-
-        fmpz_clear(f);
-        return;
-    }
-
-    if (y == b)
-    {
-        arb_t t;
-        arb_init(t);
-        arb_set(t, b);
-        arb_pow_fmpz_binexp(y, t, e, prec);
-        arb_clear(t);
-        return;
-    }
-
-    arb_set(y, b);
-
-    bits = fmpz_bits(e);
-    wp = ARF_PREC_ADD(prec, bits);
-
-    for (i = bits - 2; i >= 0; i--)
-    {
-        arb_mul(y, y, y, wp);
-        if (fmpz_tstbit(e, i))
-            arb_mul(y, y, b, wp);
-    }
-}
-
diff --git a/arb/pow_si.c b/arb/pow_si.c
new file mode 100644
index 000000000..e626a1bc8
--- /dev/null
+++ b/arb/pow_si.c
@@ -0,0 +1,37 @@
+/*
+    Copyright (C) 2021 Albin Ahlbäck
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "arb.h"
+
+void
+arb_pow_si(arb_t y, const arb_t b, slong e, slong prec)
+{
+    if (e >= 0)
+        arb_pow_ui_binexp(y, b, e, prec);
+    else if (e == -1)
+        arb_inv(y, b, prec);
+    else if (e == -2)
+    {
+        arb_inv(y, b, prec);
+        arb_sqr(y, y, prec);
+    }
+    else if (arb_is_exact(b))
+    {
+        arb_pow_ui_binexp(y, b, -e, prec + 2);
+        arb_inv(y, y, prec);
+    }
+    else
+    {
+        e = -e;
+        arb_inv(y, b, prec + FLINT_BIT_COUNT(e) + 2);
+        arb_pow_ui_binexp(y, y, e, prec);
+    }
+}
diff --git a/arb/sqr.c b/arb/sqr.c
new file mode 100644
index 000000000..c6900379b
--- /dev/null
+++ b/arb/sqr.c
@@ -0,0 +1,67 @@
+/*
+    Copyright (C) 2021 Albin Ahlbäck
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "arb.h"
+
+void
+arb_sqr(arb_t res, const arb_t x, slong prec)
+{
+    mag_t resr, xm;
+    int inexact;
+
+    if (arb_is_exact(x))
+    {
+        inexact = arf_sqr_rnd_down(arb_midref(res), arb_midref(x), prec);
+
+        if (inexact)
+            arf_mag_set_ulp(arb_radref(res), arb_midref(res), prec);
+        else
+            mag_zero(arb_radref(res));
+    }
+    else if (ARB_IS_LAGOM(x) && ARB_IS_LAGOM(res))
+    {
+        mag_fast_init_set_arf(xm, arb_midref(x));
+
+        mag_init(resr);
+        mag_fast_mul(resr, xm, arb_radref(x));
+        if (resr->exp < COEFF_MAX && resr->exp >= COEFF_MIN)
+            (resr->exp)++;
+        else
+            fmpz_add_ui(&(resr->exp), &(resr->exp), 1);
+        mag_fast_addmul(resr, arb_radref(x), arb_radref(x));
+
+        inexact = arf_sqr_rnd_down(arb_midref(res), arb_midref(x), prec);
+
+        if (inexact)
+            arf_mag_fast_add_ulp(resr, resr, arb_midref(res), prec);
+
+        *arb_radref(res) = *resr;
+    }
+    else
+    {
+        mag_init_set_arf(xm, arb_midref(x));
+
+        mag_init(resr);
+        mag_mul(resr, xm, arb_radref(x));
+        fmpz_add_ui(&(resr->exp), &(resr->exp), 1);
+        mag_addmul(resr, arb_radref(x), arb_radref(x));
+
+        inexact = arf_sqr_rnd_down(arb_midref(res), arb_midref(x), prec);
+
+        if (inexact)
+            arf_mag_add_ulp(arb_radref(res), resr, arb_midref(res), prec);
+        else
+            mag_swap(arb_radref(res), resr);
+
+        mag_clear(xm);
+        mag_clear(resr);
+    }
+}
diff --git a/arb/test/t-sqr.c b/arb/test/t-sqr.c
new file mode 100644
index 000000000..2c4fb8292
--- /dev/null
+++ b/arb/test/t-sqr.c
@@ -0,0 +1,111 @@
+/*
+    Copyright (C) 2021 Albin Ahlbäck
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "arb.h"
+
+int
+mag_close(const mag_t am, const mag_t bm)
+{
+    arf_t t, a, b;
+    int res1, res2;
+
+    arf_init(t);
+    arf_init(a);
+    arf_init(b);
+
+    arf_set_mag(a, am);
+    arf_set_mag(b, bm);
+
+    arf_mul_ui(t, b, 257, MAG_BITS, ARF_RND_UP);
+    arf_mul_2exp_si(t, t, -8);
+    res1 = arf_cmp(a, t) <= 0;
+
+    arf_mul_ui(t, a, 257, MAG_BITS, ARF_RND_UP);
+    arf_mul_2exp_si(t, t, -8);
+    res2 = arf_cmp(b, t) <= 0;
+
+    arf_clear(t);
+    arf_clear(a);
+    arf_clear(b);
+
+    return res1 && res2;
+}
+
+int main()
+{
+    slong iter, iter2;
+    flint_rand_t state;
+
+    flint_printf("sqr....");
+    fflush(stdout);
+
+    flint_randinit(state);
+
+    for (iter = 0; iter < 10000 * arb_test_multiplier(); iter++)
+    {
+        arb_t x, z, v;
+        slong prec;
+
+        arb_init(x);
+        arb_init(z);
+        arb_init(v);
+
+        for (iter2 = 0; iter2 < 100; iter2++)
+        {
+            arb_randtest_special(x, state, n_randint(state,2) ? 2000 : 200, 200);
+
+            prec = 2 + n_randint(state, 2000);
+
+            switch (n_randint(state, 2))
+            {
+            case 0:
+                arb_mul(z, x, x, prec);
+                arb_sqr(v, x, prec);
+
+                if (!arf_equal(arb_midref(z), arb_midref(v))
+                    || !mag_close(arb_radref(z), arb_radref(v)))
+                {
+                    flint_printf("FAIL!\n");
+                    flint_printf("x = "); arb_print(x); flint_printf("\n\n");
+                    flint_printf("z = "); arb_print(z); flint_printf("\n\n");
+                    flint_printf("v = "); arb_print(v); flint_printf("\n\n");
+                    flint_abort();
+                }
+                break;
+
+            default:
+                arb_set(v, x);
+                arb_mul(z, x, x, prec);
+                arb_sqr(v, v, prec);
+
+                if (!arf_equal(arb_midref(v), arb_midref(z))
+                    || !mag_close(arb_radref(v), arb_radref(z)))
+                {
+                    flint_printf("FAIL (aliasing)!\n");
+                    flint_printf("x = "); arb_print(x); flint_printf("\n\n");
+                    flint_printf("v = "); arb_print(v); flint_printf("\n\n");
+                    flint_printf("z = "); arb_print(z); flint_printf("\n\n");
+                    flint_abort();
+                }
+                break;
+            }
+        }
+
+        arb_clear(x);
+        arb_clear(z);
+        arb_clear(v);
+    }
+
+    flint_randclear(state);
+    flint_cleanup();
+    flint_printf("PASS\n");
+    return EXIT_SUCCESS;
+}
diff --git a/arf.h b/arf.h
index 495293ce8..438c3b6c3 100644
--- a/arf.h
+++ b/arf.h
@@ -823,6 +823,18 @@ void arf_urandom(arf_t x, flint_rand_t state, slong bits, arf_rnd_t rnd);
         r3 += r2 < t1;                                      \
     } while (0)
 
+#define nn_sqr_2(r3, r2, r1, r0, a1, a0)                    \
+    do {                                                    \
+        mp_limb_t t1, t2, t3;                               \
+        umul_ppmm(r1, r0, a0, a0);                          \
+        umul_ppmm(t2, t1, a1, a0);                          \
+        add_ssaaaa(r2, r1, t2, t1, 0, r1);                  \
+        umul_ppmm(r3, t3, a1, a1);                          \
+        add_ssaaaa(r3, t2, r3, t2, 0, t3);                  \
+        add_ssaaaa(r2, r1, r2, r1, t2, t1);                 \
+        r3 += r2 < t2;                                      \
+    } while (0)
+
 #define ARF_MPN_MUL(_z, _x, _xn, _y, _yn) \
     if ((_xn) == (_yn)) \
     { \
@@ -914,6 +926,12 @@ int arf_mul_rnd_down(arf_ptr z, arf_srcptr x, arf_srcptr y, slong prec);
         ? arf_mul_rnd_down(z, x, y, prec)        \
         : arf_mul_rnd_any(z, x, y, prec, rnd))
 
+int arf_sqr_via_mpfr(arf_ptr res, arf_srcptr x, slong prec, arf_rnd_t rnd);
+
+int arf_sqr_rnd_down(arf_ptr res, arf_srcptr x, slong prec);
+
+void arf_sqr_special(arf_t res, const arf_t x);
+
 ARF_INLINE int
 arf_neg_mul(arf_t z, const arf_t x, const arf_t y, slong prec, arf_rnd_t rnd)
 {
diff --git a/arf/mul_rnd_down.c b/arf/mul_rnd_down.c
index 731a1d545..2f9b4b2a0 100644
--- a/arf/mul_rnd_down.c
+++ b/arf/mul_rnd_down.c
@@ -83,7 +83,7 @@ arf_mul_rnd_down(arf_ptr z, arf_srcptr x, arf_srcptr y, slong prec)
             {
                 ret = 0;
             }
-            else  /* prec < FLINT_BITS < 2 * FLINT_BITS */
+            else  /* FLINT_BITS < prec < 2 * FLINT_BITS */
             {
                 ret = MASK_LIMB(lo, 2 * FLINT_BITS - prec) != lo;
                 lo = MASK_LIMB(lo, 2 * FLINT_BITS - prec);
@@ -183,7 +183,7 @@ arf_mul_rnd_down(arf_ptr z, arf_srcptr x, arf_srcptr y, slong prec)
     }
     else
     {
-        mp_size_t zn, alloc;
+        mp_size_t alloc;
         mp_srcptr xptr, yptr;
         mp_ptr tmp;
         ARF_MUL_TMP_DECL
diff --git a/arf/sqr_rnd_down.c b/arf/sqr_rnd_down.c
new file mode 100644
index 000000000..1128f34f7
--- /dev/null
+++ b/arf/sqr_rnd_down.c
@@ -0,0 +1,181 @@
+/*
+    Copyright (C) 2021 Albin Ahlbäck
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "arf.h"
+
+int
+arf_sqr_rnd_down(arf_ptr res, arf_srcptr x, slong prec)
+{
+    mp_size_t xn, resn;
+    mp_limb_t hi, lo;
+    slong expfix;
+    int ret, fix;
+    mp_ptr resptr;
+
+    xn = ARF_XSIZE(x);
+    xn >>= 1;
+
+    /* Either operand is a special value. */
+    if (xn == 0)
+    {
+        arf_sqr_special(res, x);
+        return 0;
+    }
+
+    if (xn == 1)
+    {
+        lo = ARF_NOPTR_D(x)[0];
+
+        umul_ppmm(hi, lo, lo, lo);
+
+        /* Shift so that the top bit is set (branch free version). */
+        fix = !(hi >> (FLINT_BITS - 1));
+        hi = (hi << fix) | ((lo >> (FLINT_BITS - 1)) & fix);
+        lo = (lo << fix);
+
+        ARF_DEMOTE(res);
+
+        if (lo == 0)
+        {
+            resn = 1;
+
+            if (prec >= FLINT_BITS)
+            {
+                lo = hi;
+                ret = 0;
+            }
+            else
+            {
+                lo = MASK_LIMB(hi, FLINT_BITS - prec);
+                ret = (lo != hi);
+            }
+        }
+        else
+        {
+            resn = 2;
+
+            if (prec <= FLINT_BITS)  /* Must be inexact. */
+            {
+                lo = MASK_LIMB(hi, FLINT_BITS - prec);
+                resn = ret = 1;
+            }
+            else if (prec >= 2 * FLINT_BITS) /* Must be exact. */
+            {
+                ret = 0;
+            }
+            else  /* FLINT_BITS < prec < 2 * FLINT_BITS */
+            {
+                ret = MASK_LIMB(lo, 2 * FLINT_BITS - prec) != lo;
+                lo = MASK_LIMB(lo, 2 * FLINT_BITS - prec);
+
+                if (lo == 0)
+                {
+                    resn = 1;
+                    lo = hi;
+                }
+            }
+        }
+
+        _fmpz_2times_fast(ARF_EXPREF(res), ARF_EXPREF(x), -fix);
+        ARF_XSIZE(res) = ARF_MAKE_XSIZE(resn, 0);
+
+        resptr = ARF_NOPTR_D(res);
+        resptr[0] = lo;
+        resptr[1] = hi;
+
+        return ret;
+    }
+    else if (xn == 2)
+    {
+        mp_limb_t zz[4];
+        mp_limb_t x1, x0;
+
+        x0 = ARF_NOPTR_D(x)[0];
+        x1 = ARF_NOPTR_D(x)[1];
+
+        nn_sqr_2(zz[3], zz[2], zz[1], zz[0], x1, x0);
+
+        /* Likely case, must be inexact */
+        if (prec <= 2 * FLINT_BITS)
+        {
+            ARF_DEMOTE(res);
+
+            fix = !(zz[3] >> (FLINT_BITS - 1));
+            zz[3] = (zz[3] << fix) | ((zz[2] >> (FLINT_BITS - 1)) & fix);
+            zz[2] = (zz[2] << fix) | ((zz[1] >> (FLINT_BITS - 1)) & fix);
+
+            _fmpz_2times_fast(ARF_EXPREF(res), ARF_EXPREF(x), -fix);
+
+            /* Rounding */
+            if (prec != 2 * FLINT_BITS)
+            {
+                if (prec > FLINT_BITS)
+                {
+                    zz[2] &= (LIMB_ONES << (2 * FLINT_BITS - prec));
+                }
+                else if (prec == FLINT_BITS)
+                {
+                    zz[2] = 0;
+                }
+                else
+                {
+                    zz[3] &= (LIMB_ONES << (FLINT_BITS - prec));
+                    zz[2] = 0;
+                }
+            }
+
+            if (zz[2] == 0)
+            {
+                ARF_XSIZE(res) = ARF_MAKE_XSIZE(1, 0);
+                ARF_NOPTR_D(res)[0] = zz[3];
+            }
+            else
+            {
+                ARF_XSIZE(res) = ARF_MAKE_XSIZE(2, 0);
+                ARF_NOPTR_D(res)[0] = zz[2];
+                ARF_NOPTR_D(res)[1] = zz[3];
+            }
+
+            return 1;
+        }
+
+        resn = 2 * xn;
+        ret = _arf_set_round_mpn(res, &expfix, zz, resn, 0, prec, ARF_RND_DOWN);
+        _fmpz_2times_fast(ARF_EXPREF(res), ARF_EXPREF(x), expfix);
+        return ret;
+    }
+    else if (xn > MUL_MPFR_MIN_LIMBS && prec != ARF_PREC_EXACT
+                && xn > 0.675 * prec / FLINT_BITS
+                && xn < MUL_MPFR_MAX_LIMBS)  /* FIXME: proper cutoffs */
+    {
+        return arf_sqr_via_mpfr(res, x, prec, ARF_RND_DOWN);
+    }
+    else
+    {
+        mp_size_t alloc;
+        mp_srcptr xptr;
+        mp_ptr tmp;
+        ARF_MUL_TMP_DECL
+
+        ARF_GET_MPN_READONLY(xptr, xn, x);
+
+        alloc = resn = 2 * xn;
+        ARF_MUL_TMP_ALLOC(tmp, alloc)
+
+        mpn_sqr(tmp, xptr, xn);
+
+        ret = _arf_set_round_mpn(res, &expfix, tmp, resn, 0, prec, ARF_RND_DOWN);
+        _fmpz_2times_fast(ARF_EXPREF(res), ARF_EXPREF(x), expfix);
+        ARF_MUL_TMP_FREE(tmp, alloc)
+
+        return ret;
+    }
+}
diff --git a/arb/pow_fmpz.c b/arf/sqr_special.c
similarity index 51%
rename from arb/pow_fmpz.c
rename to arf/sqr_special.c
index dc470b274..e6e1b154c 100644
--- a/arb/pow_fmpz.c
+++ b/arf/sqr_special.c
@@ -1,5 +1,5 @@
 /*
-    Copyright (C) 2012, 2013 Fredrik Johansson
+    Copyright (C) 2021 Albin Ahlbäck
 
     This file is part of Arb.
 
@@ -9,19 +9,15 @@
     (at your option) any later version.  See <http://www.gnu.org/licenses/>.
 */
 
-#include "arb.h"
+#include "arf.h"
 
 void
-arb_pow_fmpz(arb_t y, const arb_t b, const fmpz_t e, slong prec)
+arf_sqr_special(arf_t res, const arf_t x)
 {
-    arb_pow_fmpz_binexp(y, b, e, prec);
-}
-
-void
-arb_pow_ui(arb_t y, const arb_t b, ulong e, slong prec)
-{
-    fmpz_t f;
-    fmpz_init_set_ui(f, e);
-    arb_pow_fmpz(y, b, f, prec);
-    fmpz_clear(f);
+    if (arf_is_zero(x))
+        arf_zero(res);
+    else if (arf_is_nan(x))
+        arf_nan(res);
+    else
+        arf_pos_inf(res);
 }
diff --git a/arf/sqr_via_mpfr.c b/arf/sqr_via_mpfr.c
new file mode 100644
index 000000000..4e3d6697a
--- /dev/null
+++ b/arf/sqr_via_mpfr.c
@@ -0,0 +1,63 @@
+/*
+    Copyright (C) 2021 Albin Ahlbäck
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "arf.h"
+
+int
+arf_sqr_via_mpfr(arf_ptr res, arf_srcptr x, slong prec, arf_rnd_t rnd)
+{
+    mp_size_t xn, resn, val;
+    mp_srcptr xptr;
+    mp_ptr tmp, resptr;
+    mpfr_t xf, resf;
+    int ret;
+    ARF_MUL_TMP_DECL
+
+    if (arf_is_special(x))
+    {
+        arf_sqr_special(res, x);
+        return 0;
+    }
+
+    ARF_GET_MPN_READONLY(xptr, xn, x);
+
+    prec = FLINT_MIN(xn * 2 * FLINT_BITS, prec);
+    resn = (prec + FLINT_BITS - 1) / FLINT_BITS;
+
+    ARF_MUL_TMP_ALLOC(tmp, resn)
+
+    resf->_mpfr_d = tmp;
+    resf->_mpfr_prec = prec;
+    resf->_mpfr_sign = 1;
+    resf->_mpfr_exp = 0;
+
+    xf->_mpfr_d = (mp_ptr) xptr;
+    xf->_mpfr_prec = xn * FLINT_BITS;
+    xf->_mpfr_sign = 1; /* Sign does not matter since we are squaring */
+    xf->_mpfr_exp = 0;
+
+    ret = mpfr_sqr(resf, xf, arf_rnd_to_mpfr(rnd));
+
+    ret = (ret != 0);
+
+    _fmpz_2times_fast(ARF_EXPREF(res), ARF_EXPREF(x), resf->_mpfr_exp);
+
+    val = 0;
+    while (tmp[val] == 0)
+        val++;
+
+    ARF_GET_MPN_WRITE(resptr, resn - val, res);
+    flint_mpn_copyi(resptr, tmp + val, resn - val);
+
+    ARF_MUL_TMP_FREE(tmp, resn)
+
+    return ret;
+}
diff --git a/arf/test/t-nn_sqr_2.c b/arf/test/t-nn_sqr_2.c
new file mode 100644
index 000000000..e0654eac8
--- /dev/null
+++ b/arf/test/t-nn_sqr_2.c
@@ -0,0 +1,45 @@
+/*
+    Copyright (C) 2021 Albin Ahlbäck
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "arf.h"
+
+int main()
+{
+    slong ix, result;
+    flint_rand_t state;
+
+    flint_printf("nn_sqr_2....");
+    fflush(stdout);
+
+    flint_randinit(state);
+
+    for (ix = 0; ix < 10000 * arb_test_multiplier(); ix++)
+    {
+        ulong r0, r1, r2, r3, s0, s1, s2, s3, a0, a1;
+
+        a0 = n_randtest(state);
+        a1 = n_randtest(state);
+
+        nn_mul_2x2(r3, r2, r1, r0, a1, a0, a1, a0);
+        nn_sqr_2(s3, s2, s1, s0, a1, a0);
+
+        result = (s3 == r3 && s2 == r2 && s1 == r1 && s0 == r0);
+        if (!result)
+        {
+            flint_printf("FAIL!\n");
+            flint_abort();
+        }
+    }
+
+    flint_randclear(state);
+    flint_printf("PASS\n");
+    return EXIT_SUCCESS;
+}
diff --git a/arf/test/t-sqr_rnd_down.c b/arf/test/t-sqr_rnd_down.c
new file mode 100644
index 000000000..f62aca379
--- /dev/null
+++ b/arf/test/t-sqr_rnd_down.c
@@ -0,0 +1,65 @@
+/*
+    Copyright (C) 2021 Albin Ahlbäck
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "arf.h"
+
+int main()
+{
+    slong iter, iter2;
+    flint_rand_t state;
+
+    flint_printf("sqr_rnd_down....");
+    fflush(stdout);
+
+    flint_randinit(state);
+
+    for (iter = 0; iter < 10000 * arb_test_multiplier(); iter++)
+    {
+        arf_t x, z, v;
+        slong prec, r1, r2;
+        arf_rnd_t rnd;
+
+        arf_init(x);
+        arf_init(z);
+        arf_init(v);
+
+        for (iter2 = 0; iter2 < 100; iter2++)
+        {
+            arf_randtest_special(x, state, 2000, 100);
+            prec = 2 + n_randint(state, 2000);
+
+            if (n_randint(state, 50) == 0)
+                prec = ARF_PREC_EXACT;
+
+            r1 = arf_mul_rnd_down(z, x, x, prec);
+            r2 = arf_sqr_rnd_down(v, x, prec);
+            if (!arf_equal(z, v) || r1 != r2)
+            {
+                flint_printf("FAIL!\n");
+                flint_printf("prec = %wd, rnd = %d\n\n", prec, rnd);
+                flint_printf("x = "); arf_print(x); flint_printf("\n\n");
+                flint_printf("z = "); arf_print(z); flint_printf("\n\n");
+                flint_printf("v = "); arf_print(v); flint_printf("\n\n");
+                flint_printf("r1 = %wd, r2 = %wd\n", r1, r2);
+                flint_abort();
+            }
+        }
+
+        arf_clear(x);
+        arf_clear(z);
+        arf_clear(v);
+    }
+
+    flint_randclear(state);
+    flint_cleanup();
+    flint_printf("PASS\n");
+    return EXIT_SUCCESS;
+}
diff --git a/arf/test/t-sqr_via_mpfr.c b/arf/test/t-sqr_via_mpfr.c
new file mode 100644
index 000000000..94b924b42
--- /dev/null
+++ b/arf/test/t-sqr_via_mpfr.c
@@ -0,0 +1,94 @@
+/*
+    Copyright (C) 2021 Albin Ahlbäck
+
+    This file is part of Arb.
+
+    Arb is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 2.1 of the License, or
+    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
+*/
+
+#include "arf.h"
+
+int main()
+{
+    slong iter, iter2;
+    flint_rand_t state;
+
+    flint_printf("sqr_via_mpfr....");
+    fflush(stdout);
+
+    flint_randinit(state);
+
+    for (iter = 0; iter < 10000 * arb_test_multiplier(); iter++)
+    {
+        arf_t x, z, v;
+        slong prec, r1, r2;
+        arf_rnd_t rnd;
+
+        arf_init(x);
+        arf_init(z);
+        arf_init(v);
+
+        for (iter2 = 0; iter2 < 100; iter2++)
+        {
+            arf_randtest_special(x, state, 2000, 100);
+            prec = 2 + n_randint(state, 2000);
+
+            if (n_randint(state, 50) == 0)
+                prec = ARF_PREC_EXACT;
+
+            switch (n_randint(state, 5))
+            {
+                case 0:  rnd = ARF_RND_DOWN; break;
+                case 1:  rnd = ARF_RND_UP; break;
+                case 2:  rnd = ARF_RND_FLOOR; break;
+                case 3:  rnd = ARF_RND_CEIL; break;
+                default: rnd = ARF_RND_NEAR; break;
+            }
+
+            switch (n_randint(state, 2))
+            {
+            case 0:
+                r1 = arf_mul_via_mpfr(z, x, x, prec, rnd);
+                r2 = arf_sqr_via_mpfr(v, x, prec, rnd);
+                if (!arf_equal(z, v) || r1 != r2)
+                {
+                    flint_printf("FAIL!\n");
+                    flint_printf("prec = %wd, rnd = %d\n\n", prec, rnd);
+                    flint_printf("x = "); arf_print(x); flint_printf("\n\n");
+                    flint_printf("z = "); arf_print(z); flint_printf("\n\n");
+                    flint_printf("v = "); arf_print(v); flint_printf("\n\n");
+                    flint_printf("r1 = %wd, r2 = %wd\n", r1, r2);
+                    flint_abort();
+                }
+                break;
+
+            default:
+                r1 = arf_mul_via_mpfr(v, x, x, prec, rnd);
+                r2 = arf_sqr_via_mpfr(x, x, prec, rnd);
+                if (!arf_equal(v, x) || r1 != r2)
+                {
+                    flint_printf("FAIL (aliasing)!\n");
+                    flint_printf("prec = %wd, rnd = %d\n\n", prec, rnd);
+                    flint_printf("x = "); arf_print(x); flint_printf("\n\n");
+                    flint_printf("z = "); arf_print(z); flint_printf("\n\n");
+                    flint_printf("v = "); arf_print(v); flint_printf("\n\n");
+                    flint_printf("r1 = %wd, r2 = %wd\n", r1, r2);
+                    flint_abort();
+                }
+                break;
+            }
+        }
+
+        arf_clear(x);
+        arf_clear(z);
+        arf_clear(v);
+    }
+
+    flint_randclear(state);
+    flint_cleanup();
+    flint_printf("PASS\n");
+    return EXIT_SUCCESS;
+}
diff --git a/doc/source/arb.rst b/doc/source/arb.rst
index 70837041e..30ca849f7 100644
--- a/doc/source/arb.rst
+++ b/doc/source/arb.rst
@@ -989,28 +989,47 @@ Powers and roots
 
     Alias for :func:`arb_root_ui`, provided for backwards compatibility.
 
-.. function:: void arb_sqr(arb_t y, const arb_t x, slong prec)
+.. function:: void arb_sqr(arb_t res, const arb_t x, slong prec)
 
-    Sets *y* to be the square of *x*. 
+    Sets *res* to the square of `x`.
+
+.. function:: void _arb_pow_mpz_binexp(arb_t y, const arb_t b, mpz_srcptr e, slong prec)
+
+.. function:: void arb_pow_ui_binexp(arb_t y, const arb_t b, ulong e, slong prec)
+
+    Sets `y = b^e` via binary exponentiation with an initial division if
+    `e < 0`. Provided that `b` and `e` are small enough and the exponent is
+    positive, the exact power can be computed by setting the precision to
+    *ARF_PREC_EXACT*.
+
+    Note that these functions can get slow if the exponent is
+    extremely large (in such cases :func:`arb_pow` may be superior).
+
+.. function:: void arb_pow_si(arb_t y, const arb_t b, slong e, slong prec);
+
+    Sets `y = b^e` by wrapping *arb_pow_ui*, doing an initial division if
+    `e < 0`.
 
 .. function:: void arb_pow_fmpz_binexp(arb_t y, const arb_t b, const fmpz_t e, slong prec)
 
+    Sets `y = b^e` by wrapping *arb_pow_si* and *_arb_pow_mpz_binexp*.
+
 .. function:: void arb_pow_fmpz(arb_t y, const arb_t b, const fmpz_t e, slong prec)
 
+    Wrapper for *arb_pow_fmpz_binexp*.
+
 .. function:: void arb_pow_ui(arb_t y, const arb_t b, ulong e, slong prec)
 
+    Wrapper for *arb_pow_ui_binexp*.
+
 .. function:: void arb_ui_pow_ui(arb_t y, ulong b, ulong e, slong prec)
 
 .. function:: void arb_si_pow_ui(arb_t y, slong b, ulong e, slong prec)
 
-    Sets `y = b^e` using binary exponentiation (with an initial division
-    if `e < 0`). Provided that *b* and *e*
+    Sets `y = b^e` using binary exponentiation. Provided that *b* and *e*
     are small enough and the exponent is positive, the exact power can be
     computed by setting the precision to *ARF_PREC_EXACT*.
 
-    Note that these functions can get slow if the exponent is
-    extremely large (in such cases :func:`arb_pow` may be superior).
-
 .. function:: void arb_pow_fmpq(arb_t y, const arb_t x, const fmpq_t a, slong prec)
 
     Sets `y = b^e`, computed as `y = (b^{1/q})^p` if the denominator of
diff --git a/doc/source/arf.rst b/doc/source/arf.rst
index 0d8ba4fd3..3d4e4b9ae 100644
--- a/doc/source/arf.rst
+++ b/doc/source/arf.rst
@@ -111,6 +111,18 @@ Types, macros and constants
     test code. If in doubt, simply try with some convenient high precision
     instead of using this special value, and check that the result is exact.
 
+.. macro:: nn_mul_2x1(r2, r1, r0, a1, a0, b0)
+
+    Sets `(r_2, r_1, r_0)` to `(a_1, a_0)` multiplied by `b_0`.
+
+.. macro:: nn_mul_2x2(r3, r2, r1, r0, a1, a0, b1, b0)
+
+    Sets `(r_2, r_1, r_0)` to `(a_1, a_0)` multiplied by `(b_1, b_0)`.
+
+.. macro:: nn_sqr_2(r3, r2, r1, r0, a1, a0)
+
+    Sets `(r_2, r_1, r_0)` to the square of `(a_1, a_0)`.
+
 Memory management
 -------------------------------------------------------------------------------
 
@@ -553,7 +565,8 @@ Addition and multiplication
 
 .. function:: int arf_neg_round(arf_t res, const arf_t x, slong prec, arf_rnd_t rnd)
 
-    Sets *res* to `-x`.
+    Sets *res* to `-x`. Returns 0 if this operation was made exactly and 1 if
+    truncation occurred.
 
 .. function:: int arf_add(arf_t res, const arf_t x, const arf_t y, slong prec, arf_rnd_t rnd)
 
@@ -563,11 +576,13 @@ Addition and multiplication
 
 .. function:: int arf_add_fmpz(arf_t res, const arf_t x, const fmpz_t y, slong prec, arf_rnd_t rnd)
 
-    Sets *res* to `x + y`.
+    Sets *res* to `x + y`. Returns 0 if this operation was made exactly and 1
+    if truncation occurred.
 
 .. function:: int arf_add_fmpz_2exp(arf_t res, const arf_t x, const fmpz_t y, const fmpz_t e, slong prec, arf_rnd_t rnd)
 
-    Sets *res* to `x + y 2^e`.
+    Sets *res* to `x + y 2^e`. Returns 0 if this operation was made exactly and
+    1 if truncation occurred.
 
 .. function:: int arf_sub(arf_t res, const arf_t x, const arf_t y, slong prec, arf_rnd_t rnd)
 
@@ -577,7 +592,8 @@ Addition and multiplication
 
 .. function:: int arf_sub_fmpz(arf_t res, const arf_t x, const fmpz_t y, slong prec, arf_rnd_t rnd)
 
-    Sets *res* to `x - y`.
+    Sets *res* to `x - y`. Returns 0 if this operation was made exactly and 1
+    if truncation occurred.
 
 .. function:: void arf_mul_2exp_si(arf_t res, const arf_t x, slong e)
 
@@ -595,7 +611,31 @@ Addition and multiplication
 
 .. function:: int arf_mul_fmpz(arf_t res, const arf_t x, const fmpz_t y, slong prec, arf_rnd_t rnd)
 
-    Sets *res* to `x \cdot y`.
+    Sets *res* to `x \cdot y`. Returns 0 if this operation was made exactly and
+    1 if truncation occurred.
+
+.. function:: void arf_mul_special(arf_t z, const arf_t x, const arf_t y)
+
+    Sets *res* to the special value of `x \cdot y`.
+
+    Note that it is assumed that at least one of `x` and `y` has special
+    values.
+
+.. function:: int arf_sqr_via_mpfr(arf_ptr res, arf_srcptr x, slong prec, arf_rnd_t rnd)
+
+    Sets *res* to `x^2` by calling MPFR. Returns 0 if this operation was made
+    exactly and 1 if truncation occurred.
+
+.. function:: int arf_sqr_rnd_down(arf_ptr res, arf_srcptr x, slong prec)
+
+    Sets *res* to `x^2`. If the opertation was made exactly, it returns 1.
+    Otherwise it rounds downwards and returns 1.
+
+.. function:: void arf_sqr_special(arf_t res, const arf_t x)
+
+    Sets *res* to the special value of `x^2`.
+
+    Note that it is assumed that `x` has a special value.
 
 .. function:: int arf_addmul(arf_t z, const arf_t x, const arf_t y, slong prec, arf_rnd_t rnd)
 
@@ -607,7 +647,8 @@ Addition and multiplication
 
 .. function:: int arf_addmul_fmpz(arf_t z, const arf_t x, const fmpz_t y, slong prec, arf_rnd_t rnd)
 
-    Performs a fused multiply-add `z = z + x \cdot y`, updating *z* in-place.
+    Adds `x \cdot y` to `z`, updating `z` in-place. Returns 0 if this operation
+    was made exactly and 1 if truncation occurred.
 
 .. function:: int arf_submul(arf_t z, const arf_t x, const arf_t y, slong prec, arf_rnd_t rnd)
 
@@ -619,16 +660,21 @@ Addition and multiplication
 
 .. function:: int arf_submul_fmpz(arf_t z, const arf_t x, const fmpz_t y, slong prec, arf_rnd_t rnd)
 
-    Performs a fused multiply-subtract `z = z - x \cdot y`, updating *z* in-place.
+    Performs a fused multiply-subtract `z = z - x \cdot y`, updating *z*
+    in-place. Returns 0 if this operation was made exactly and 1 if truncation
+    occurred.
 
 .. function:: int arf_fma(arf_t res, const arf_t x, const arf_t y, const arf_t z, slong prec, arf_rnd_t rnd)
 
     Sets *res* to `x \cdot y + z`. This is equivalent to an *addmul* except
-    that *res* and *z* can be separate variables.
+    that *res* and *z* can be separate variables. Returns 0 if this operation
+    was made exactly and 1 if truncation occurred.
 
 .. function:: int arf_sosq(arf_t res, const arf_t x, const arf_t y, slong prec, arf_rnd_t rnd)
 
-    Sets *res* to `x^2 + y^2`, rounded to *prec* bits in the direction specified by *rnd*.
+    Sets *res* to `x^2 + y^2`, rounded to *prec* bits in the direction
+    specified by *rnd*. Returns 0 if this operation was made exactly and 1 if
+    truncation occurred.
 
 Summation
 -------------------------------------------------------------------------------
@@ -662,8 +708,9 @@ Division
 
 .. function:: int arf_fmpz_div_fmpz(arf_t res, const fmpz_t x, const fmpz_t y, slong prec, arf_rnd_t rnd)
 
-    Sets *res* to `x / y`, rounded to *prec* bits in the direction specified by *rnd*,
-    returning nonzero iff the operation is inexact. The result is NaN if *y* is zero.
+    Sets *res* to `x / y`, rounded to *prec* bits in the direction specified by
+    *rnd*, returning nonzero iff the operation is inexact. The result is NaN if
+    *y* is zero.
 
 Square roots
 -------------------------------------------------------------------------------
diff --git a/mag.h b/mag.h
index 00eb44fa9..917798677 100644
--- a/mag.h
+++ b/mag.h
@@ -112,6 +112,23 @@ _fmpz_sub2_fast(fmpz_t z, const fmpz_t x, const fmpz_t y, slong c)
     }
 }
 
+static __inline__ void
+_fmpz_2times_fast(fmpz_t res, const fmpz_t x, slong c)
+{
+    slong cx = *x;
+
+    if (!COEFF_IS_MPZ(*res) && (cx > COEFF_MIN / 2 && cx < COEFF_MAX / 2))
+    {
+        *res = 2 * cx + c;
+    }
+    else
+    {
+        fmpz_mul_2exp(res, x, 1);
+        if (c != 0)
+            fmpz_add_si(res, res, c);
+    }
+}
+
 
 #define MAG_EXP_POS_INF (COEFF_MIN+1)