/* Copyright (C) 2007, 2009 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

/* Square-root helpers for the BID (binary integer decimal) library.

   Two implementations of the same pair of entry points are selected at
   compile time by DOUBLE_EXTENDED_ON:

     short_sqrt128 (A10)        - integer square root of the 128-bit
                                  coefficient A10, returned as a UINT64.
     long_sqrt128 (pCS, C256)   - integer square root of the 256-bit
                                  coefficient C256, stored as a UINT128
                                  through pCS.

   When an 80-bit extended type (BINARY80) is available, the hardware
   SQRT80 is used directly, with a FENCE'd error-recovery pass to refine
   the result.  Otherwise the routines start from a double-precision
   1/sqrt approximation and sharpen it with multi-word integer
   arithmetic (Newton-style correction terms eps and eps^2).  */

#ifndef _SQRT_MACROS_H_
#define _SQRT_MACROS_H_

/* FENCE forces the bracketed floating-point operation to be evaluated
   exactly where written; the correction-term computations below depend
   on this evaluation order (see the "Watch for compiler re-ordering"
   caution in long_sqrt128).  */
#define FENCE __fence

#if DOUBLE_EXTENDED_ON

extern BINARY80 SQRT80 (BINARY80);

/* Integer square root of the 128-bit value A10, via 80-bit extended
   precision: assemble A10 as a BINARY80 and take SQRT80 of it.
   The 128-bit input is small enough that the extended-precision
   significand gives a correctly usable result directly.  */
__BID_INLINE__ UINT64
short_sqrt128 (UINT128 A10) {
  BINARY80 lx, ly, l64;
  int_float f64;

  // 2^64  (binary32 bit pattern: exponent 191-127 = 64)
  f64.i = 0x5f800000;
  l64 = (BINARY80) f64.d;
  /* lx = A10.w[1]*2^64 + A10.w[0], i.e. the full 128-bit value.  */
  lx = (BINARY80) A10.w[1] * l64 + (BINARY80) A10.w[0];
  ly = SQRT80 (lx);
  return (UINT64) ly;
}

/* Integer square root of the 256-bit value C256, stored into *pCS as a
   128-bit result.  Computes an 80-bit approximation with SQRT80, then
   recovers the rounding error of both the summation of C256's words and
   the square root itself, and applies a signed correction to the low
   word of the result.  The FENCE'd statement order is load-bearing.  */
__BID_INLINE__ void
long_sqrt128 (UINT128 * pCS, UINT256 C256) {
  /* NOTE(review): C4, fx, ple and ly appear unused in this variant;
     kept as-is since this is a documentation-only pass.  */
  UINT256 C4;
  UINT128 CS;
  UINT64 X;
  SINT64 SE;
  BINARY80 l64, lm64, l128, lxL, lx, ly, lS, lSH, lSL, lE, l3, l2, l1,
    l0, lp, lCl;
  int_float fx, f64, fm64;
  int *ple = (int *) &lx;

  // 2^64
  f64.i = 0x5f800000;
  l64 = (BINARY80) f64.d;

  l128 = l64 * l64;
  /* Sum the four 64-bit words of C256 into one BINARY80, most
     significant first; each partial sum is FENCE'd so the rounding
     error recovered below matches this exact evaluation order.  */
  lx = l3 = (BINARY80) C256.w[3] * l64 * l128;
  l2 = (BINARY80) C256.w[2] * l128;
  lx = FENCE (lx + l2);
  l1 = (BINARY80) C256.w[1] * l64;
  lx = FENCE (lx + l1);
  l0 = (BINARY80) C256.w[0];
  lx = FENCE (lx + l0);
  // sqrt(C256)
  lS = SQRT80 (lx);

  // get coefficient: split lS into high (CS.w[1]) and low (CS.w[0]) 64-bit words
  // 2^(-64)  (binary32 bit pattern: exponent 63-127 = -64)
  fm64.i = 0x1f800000;
  lm64 = (BINARY80) fm64.d;
  CS.w[1] = (UINT64) (lS * lm64);
  CS.w[0] = (UINT64) (lS - (BINARY80) CS.w[1] * l64);

  ///////////////////////////////////////
  // CAUTION!
  // little endian code only
  // add solution for big endian
  //////////////////////////////////////
  /* Split lS = lSH + lSL so that lSH*lSH is exact: clear the low 32
     bits of the significand by punning the first 8 bytes of the
     BINARY80 (assumes x86-style little-endian extended layout with the
     significand in the low-order bytes — hence the caution above).  */
  lSH = lS;
  *((UINT64 *) & lSH) &= 0xffffffff00000000ull;

  // correction for C256 rounding: lCl = exact sum of words minus lx
  lCl = FENCE (l3 - lx);
  lCl = FENCE (lCl + l2);
  lCl = FENCE (lCl + l1);
  lCl = FENCE (lCl + l0);

  lSL = lS - lSH;

  //////////////////////////////////////////
  //      Watch for compiler re-ordering
  //
  /////////////////////////////////////////
  // C256-S^2, accumulated as lx - (lSH^2 + 2*lSH*lSL + lSL^2) term by term
  lxL = FENCE (lx - lSH * lSH);
  lp = lSH * lSL;
  lp += lp;
  lxL = FENCE (lxL - lp);
  lSL *= lSL;
  lxL = FENCE (lxL - lSL);
  lCl += lxL;

  // correction term: first-order Newton step, E = (C256 - S^2) / (2*S)
  lE = lCl / (lS + lS);

  // get low part of coefficient: apply the signed correction to CS,
  // propagating carry/borrow into the high word by comparing against
  // the pre-update low word X
  X = CS.w[0];
  if (lCl >= 0) {
    SE = (SINT64) (lE);
    CS.w[0] += SE;
    if (CS.w[0] < X)
      CS.w[1]++;
  } else {
    SE = (SINT64) (-lE);
    CS.w[0] -= SE;
    if (CS.w[0] > X)
      CS.w[1]--;
  }

  pCS->w[0] = CS.w[0];
  pCS->w[1] = CS.w[1];
}

#else

extern double sqrt (double);

/* Integer square root of the 128-bit value A10 without extended
   precision: start from a double-precision reciprocal square root
   RS = 1/sqrt(A10), then compute A10*RS ~ sqrt(A10) in multi-word
   integer arithmetic with a first-order error correction eps.  */
__BID_INLINE__ UINT64
short_sqrt128 (UINT128 A10) {
  UINT256 ARS, ARS0, AE0, AE, S;
  UINT64 MY, ES, CY;
  double lx, l64;
  int_double f64, ly;
  int ey, k;

  // 2^64  (binary64 bit pattern: exponent 1087-1023 = 64)
  f64.i = 0x43f0000000000000ull;
  l64 = f64.d;
  lx = (double) A10.w[1] * l64 + (double) A10.w[0];
  ly.d = 1.0 / sqrt (lx);

  /* Decompose RS = 1/sqrt(lx): MY is the 53-bit significand (implicit
     bit restored), ey the negated unbiased exponent, so
     RS = MY * 2^(-52-ey).  */
  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
  ey = 0x3ff - (ly.i >> 52);

  // A10*RS^2  (should be ~1; its offset from 1 gives the error eps)
  __mul_64x128_to_192 (ARS0, MY, A10);
  __mul_64x192_to_256 (ARS, MY, ARS0);

  // shr by 2*ey+40, to get a 64-bit value
  k = (ey << 1) + 104 - 64;
  if (k >= 128) {
    if (k > 128)
      ES = (ARS.w[2] >> (k - 128)) | (ARS.w[3] << (192 - k));
    else
      ES = ARS.w[2];
  } else {
    if (k >= 64) {
      ARS.w[0] = ARS.w[1];
      ARS.w[1] = ARS.w[2];
      k -= 64;
    }
    if (k) {
      __shr_128 (ARS, ARS, k);
    }
    ES = ARS.w[0];
  }

  /* Signed halving: eps = eps0/2.  The sign of ES selects whether the
     correction A*RS*eps is added or subtracted below.  */
  ES = ((SINT64) ES) >> 1;
  if (((SINT64) ES) < 0) {
    ES = -ES;
    // A*RS*eps (scaled by 2^64)
    __mul_64x192_to_256 (AE0, ES, ARS0);
    AE.w[0] = AE0.w[1];
    AE.w[1] = AE0.w[2];
    AE.w[2] = AE0.w[3];
    // S = A*RS + A*RS*eps, 192-bit add with carry propagation
    __add_carry_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
    __add_carry_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
    S.w[2] = ARS0.w[2] + AE.w[2] + CY;
  } else {
    // A*RS*eps (scaled by 2^64)
    __mul_64x192_to_256 (AE0, ES, ARS0);
    AE.w[0] = AE0.w[1];
    AE.w[1] = AE0.w[2];
    AE.w[2] = AE0.w[3];
    // S = A*RS - A*RS*eps, 192-bit subtract with borrow propagation
    __sub_borrow_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
    __sub_borrow_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
    S.w[2] = ARS0.w[2] - AE.w[2] - CY;
  }

  /* Scale S down to the final magnitude (shift by ey+51), handling the
     whole-word shifts explicitly before the sub-word __shr_128.  */
  k = ey + 51;
  if (k >= 64) {
    if (k >= 128) {
      S.w[0] = S.w[2];
      S.w[1] = 0;
      k -= 128;
    } else {
      S.w[0] = S.w[1];
      S.w[1] = S.w[2];
    }
    k -= 64;
  }
  if (k) {
    __shr_128 (S, S, k);
  }

  // round to nearest: add half-ulp, then drop the guard bit
  return (UINT64) ((S.w[0] + 1) >> 1);
}

/* Integer square root of the 256-bit value C256 without extended
   precision, stored into *pCS as a 128-bit result.  Same scheme as
   short_sqrt128 above, but carried to second order: the double 1/sqrt
   seed RS is refined with both an eps correction and a 3/2*eps^2
   correction in multi-word integer arithmetic.  */
__BID_INLINE__ void
long_sqrt128 (UINT128 * pCS, UINT256 C256) {
  UINT512 ARS0, ARS;
  UINT256 ARS00, AE, AE2, S;
  UINT128 ES, ES2, ARS1;
  UINT64 ES32, CY, MY;
  double l64, l128, lx, l2, l1, l0;
  int_double f64, ly;
  int ey, k, k2;

  // 2^64
  f64.i = 0x43f0000000000000ull;
  l64 = f64.d;
  l128 = l64 * l64;
  /* Sum C256's four words into a double, most significant first; the
     FENCEs pin the summation order.  */
  lx = (double) C256.w[3] * l64 * l128;
  l2 = (double) C256.w[2] * l128;
  lx = FENCE (lx + l2);
  l1 = (double) C256.w[1] * l64;
  lx = FENCE (lx + l1);
  l0 = (double) C256.w[0];
  lx = FENCE (lx + l0);
  // sqrt(C256)
  ly.d = 1.0 / sqrt (lx);

  /* RS = MY * 2^(-52-ey): 53-bit significand and negated exponent.  */
  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
  ey = 0x3ff - (ly.i >> 52);

  // A10*RS^2, scaled by 2^(2*ey+104)
  __mul_64x256_to_320 (ARS0, MY, C256);
  __mul_64x320_to_384 (ARS, MY, ARS0);

  // shr by k=(2*ey+104)-128
  // expect k is in the range (192, 256) if result in [10^33, 10^34)
  // apply an additional signed shift by 1 at the same time (to get eps=eps0/2)
  k = (ey << 1) + 104 - 128 - 192;
  k2 = 64 - k;
  ES.w[0] = (ARS.w[3] >> (k + 1)) | (ARS.w[4] << (k2 - 1));
  ES.w[1] = (ARS.w[4] >> k) | (ARS.w[5] << k2);
  ES.w[1] = ((SINT64) ES.w[1]) >> 1;    // arithmetic shift keeps eps signed

  // A*RS >> 192 (for error term computation)
  ARS1.w[0] = ARS0.w[3];
  ARS1.w[1] = ARS0.w[4];

  // A*RS>>64
  ARS00.w[0] = ARS0.w[1];
  ARS00.w[1] = ARS0.w[2];
  ARS00.w[2] = ARS0.w[3];
  ARS00.w[3] = ARS0.w[4];

  if (((SINT64) ES.w[1]) < 0) {
    /* Two's-complement negation of the 128-bit eps.  */
    ES.w[0] = -ES.w[0];
    ES.w[1] = -ES.w[1];
    if (ES.w[0])
      ES.w[1]--;

    // A*RS*eps
    __mul_128x128_to_256 (AE, ES, ARS1);

    // S = A*RS + A*RS*eps, 256-bit add
    __add_carry_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
    __add_carry_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
    __add_carry_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
    S.w[3] = ARS00.w[3] + AE.w[3] + CY;
  } else {
    // A*RS*eps
    __mul_128x128_to_256 (AE, ES, ARS1);

    // S = A*RS - A*RS*eps, 256-bit subtract
    __sub_borrow_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
    __sub_borrow_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
    __sub_borrow_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
    S.w[3] = ARS00.w[3] - AE.w[3] - CY;
  }

  // 3/2*eps^2, scaled by 2^128
  ES32 = ES.w[1] + (ES.w[1] >> 1);
  __mul_64x64_to_128 (ES2, ES32, ES.w[1]);
  // A*RS*3/2*eps^2  (second-order correction, always added)
  __mul_128x128_to_256 (AE2, ES2, ARS1);

  // result, scaled by 2^(ey+52-64)
  __add_carry_out (S.w[0], CY, S.w[0], AE2.w[0]);
  __add_carry_in_out (S.w[1], CY, S.w[1], AE2.w[1], CY);
  __add_carry_in_out (S.w[2], CY, S.w[2], AE2.w[2], CY);
  S.w[3] = S.w[3] + AE2.w[3] + CY;

  // k in (0, 64): final down-shift into a 128-bit value (plus guard bit)
  k = ey + 51 - 128;
  k2 = 64 - k;
  S.w[0] = (S.w[1] >> k) | (S.w[2] << k2);
  S.w[1] = (S.w[2] >> k) | (S.w[3] << k2);

  // round to nearest: add half-ulp before dropping the guard bit
  S.w[0]++;
  if (!S.w[0])
    S.w[1]++;

  /* Drop the guard bit across the 128-bit result.  */
  pCS->w[0] = (S.w[1] << 63) | (S.w[0] >> 1);
  pCS->w[1] = S.w[1] >> 1;
}

#endif
#endif