@@ -117,21 +117,31 @@ inttype(::Type{Float128}) = Int128
117
117
118
118
# conversion
119
119
Float128 (x:: Float128 ) = x
120
+ function Float128 (x:: T ) where T <: Base.IEEEFloat
121
+ if x=== zero (x)
122
+ return reinterpret (Float128, zero (UInt128))
123
+ elseif x=== - zero (x)
124
+ return reinterpret (Float128, sign_mask (Float128))
125
+ else
126
+ s = UInt128 (signbit (x))<< 127
127
+ if ! isfinite (x)
128
+ xf = reinterpret (Unsigned, x) & significand_mask (T)
129
+ d = exponent_mask (Float128)
130
+ else
131
+ f, e = frexp (x)
132
+ xf = reinterpret (Unsigned, f) & significand_mask (T)
133
+ d = ((e+ exponent_bias (Float128)- 1 ) % UInt128) << significand_bits (Float128)
134
+ end
135
+ xu = UInt128 (xf)<< (significand_bits (Float128)- significand_bits (T))
136
+ return reinterpret (Float128, s| d| xu)
137
+ end
138
+ end
120
139
121
- # Float64
122
- @assume_effects :foldable Float128 (x:: Float64 ) =
123
- Float128 (@quad_ccall (quadoplib. __extenddftf2 (x:: Cdouble ):: Cfloat128 ))
140
+ # truncation
124
141
@assume_effects :foldable Float64 (x:: Float128 ) =
125
142
@quad_ccall (quadoplib. __trunctfdf2 (x:: Cfloat128 ):: Cdouble )
126
-
127
- # Float32
128
- @assume_effects :foldable Float128 (x:: Float32 ) =
129
- Float128 (@quad_ccall (quadoplib. __extendsftf2 (x:: Cfloat ):: Cfloat128 ))
130
143
@assume_effects :foldable Float32 (x:: Float128 ) =
131
144
@quad_ccall (quadoplib. __trunctfsf2 (x:: Cfloat128 ):: Cfloat )
132
-
133
- # Float16
134
- Float128 (x:: Float16 ) = Float128 (Float32 (x))
135
145
Float16 (x:: Float128 ) = Float16 (Float64 (x)) # TODO : avoid double rounding
136
146
137
147
# TwicePrecision
@@ -192,75 +202,81 @@ Float128(x::Bool) = x ? Float128(1) : Float128(0)
192
202
Float128 (@quad_ccall (quadoplib. __negtf2 (x:: Cfloat128 ):: Cfloat128 ))
193
203
194
204
# Float128 -> Integer
195
- @assume_effects :foldable unsafe_trunc (:: Type{Int32} , x:: Float128 ) =
196
- @quad_ccall (quadoplib. __fixtfsi (x:: Cfloat128 ):: Int32 )
197
-
198
- @assume_effects :foldable unsafe_trunc (:: Type{Int64} , x:: Float128 ) =
199
- @quad_ccall (quadoplib. __fixtfdi (x:: Cfloat128 ):: Int64 )
200
-
201
- @assume_effects :foldable unsafe_trunc (:: Type{UInt32} , x:: Float128 ) =
202
- @quad_ccall (quadoplib. __fixunstfsi (x:: Cfloat128 ):: UInt32 )
203
-
204
- @assume_effects :foldable unsafe_trunc (:: Type{UInt64} , x:: Float128 ) =
205
- @quad_ccall (quadoplib. __fixunstfdi (x:: Cfloat128 ):: UInt64 )
206
-
207
- function unsafe_trunc (:: Type{UInt128} , x:: Float128 )
205
+ function unsafe_trunc (:: Type{T} , x:: Float128 ) where T<: Base.BitUnsigned
208
206
xu = reinterpret (UInt128,x)
209
207
k = (Int64 (xu >> 112 ) & 0x07fff ) - 16382 - 113
210
208
xu = (xu & significand_mask (Float128)) | 0x0001_0000_0000_0000_0000_0000_0000_0000
211
209
if k <= 0
212
- UInt128 (xu >> - k)
210
+ (xu >> - k) % T
213
211
else
214
- UInt128 (xu) << k
212
+ (xu % T ) << k
215
213
end
216
214
end
217
- function unsafe_trunc (:: Type{Int128 } , x:: Float128 )
218
- copysign (unsafe_trunc (UInt128 ,x) % Int128 , x)
215
+ function unsafe_trunc (:: Type{T } , x:: Float128 ) where T <: Base.BitSigned
216
+ copysign (unsafe_trunc (unsigned (T) ,x) % T , x)
219
217
end
220
218
trunc (:: Type{Signed} , x:: Float128 ) = trunc (Int,x)
221
219
trunc (:: Type{Unsigned} , x:: Float128 ) = trunc (Int,x)
222
220
trunc (:: Type{Integer} , x:: Float128 ) = trunc (Int,x)
223
221
224
- for Ti in (Int32, Int64, Int128, UInt32, UInt64, UInt128)
225
- if Ti <: Unsigned || sizeof (Ti) < sizeof (Float128)
226
- # Here `Float128(typemin(Ti))-1` is exact, so we can compare the lower-bound
227
- # directly. `Float128(typemax(Ti))+1` is either always exactly representable, or
228
- # rounded to `Inf` (e.g. when `Ti==UInt128 && Float128==Float32`).
229
- @eval begin
230
- function trunc (:: Type{$Ti} ,x:: Float128 )
231
- if Float128 (typemin ($ Ti)) - one (Float128) < x < Float128 (typemax ($ Ti)) + one (Float128)
232
- return unsafe_trunc ($ Ti,x)
233
- else
234
- throw (InexactError (:trunc , $ Ti, x))
235
- end
222
+ for Ti in (UInt8, UInt16, UInt32, UInt64, UInt128)
223
+ # Here `Float128(typemin(Ti))` is 0, so we can check the sign of the float
224
+ # `Float128(typemax(Ti))+1` is exactly representable
225
+ @eval begin
226
+ function trunc (:: Type{$Ti} ,x:: Float128 )
227
+ if ! signbit (x) && x < Float128 (typemax ($ Ti)) + one (Float128)
228
+ return unsafe_trunc ($ Ti,x)
229
+ else
230
+ throw (InexactError (:trunc , $ Ti, x))
236
231
end
237
- function ( :: Type{$Ti} )(x :: Float128 )
238
- if ( Float128 ( typemin ( $ Ti)) <= x <= Float128 ( typemax ( $ Ti))) && ( round (x, RoundToZero) == x )
239
- return unsafe_trunc ( $ Ti, x)
240
- else
241
- throw ( InexactError ( $ ( Expr ( :quote ,Ti . name . name)), $ Ti, x))
242
- end
232
+ end
233
+ function ( :: Type{ $Ti} )(x :: Float128 )
234
+ if ( ! signbit (x) && x <= Float128 ( typemax ( $ Ti))) && ( round (x, RoundToZero) == x)
235
+ return unsafe_trunc ( $ Ti,x)
236
+ else
237
+ throw ( InexactError ( $ ( Expr ( :quote ,Ti . name . name)), $ Ti, x))
243
238
end
244
239
end
245
- else
246
- # Here `eps(Float128(typemin(Ti))) > 1`, so the only value which can be truncated to
247
- # `Float128(typemin(Ti)` is itself. Similarly, `Float128(typemax(Ti))` is inexact and will
248
- # be rounded up. This assumes that `Float128(typemin(Ti)) > -Inf`, which is true for
249
- # these types, but not for `Float16` or larger integer types.
250
- @eval begin
251
- function trunc (:: Type{$Ti} ,x:: Float128 )
252
- if Float128 (typemin ($ Ti)) <= x < Float128 (typemax ($ Ti))
253
- return unsafe_trunc ($ Ti,x)
254
- else
255
- throw (InexactError (:trunc , $ Ti, x))
256
- end
240
+ end
241
+ end
242
+ for Ti in (Int8, Int16, Int32, Int64)
243
+ # Here `Float128(typemin(Ti))-1` is exact, so we can compare the lower-bound
244
+ # directly. `Float128(typemax(Ti))+1` is always exactly representable
245
+ @eval begin
246
+ function trunc (:: Type{$Ti} ,x:: Float128 )
247
+ if Float128 (typemin ($ Ti)) - one (Float128) < x < Float128 (typemax ($ Ti)) + one (Float128)
248
+ return unsafe_trunc ($ Ti,x)
249
+ else
250
+ throw (InexactError (:trunc , $ Ti, x))
257
251
end
258
- function (:: Type{$Ti} )(x:: Float128 )
259
- if (Float128 (typemin ($ Ti)) <= x < Float128 (typemax ($ Ti))) && (round (x, RoundToZero) == x)
260
- return unsafe_trunc ($ Ti,x)
261
- else
262
- throw (InexactError ($ (Expr (:quote ,Ti. name. name)), $ Ti, x))
263
- end
252
+ end
253
+ function (:: Type{$Ti} )(x:: Float128 )
254
+ if (Float128 (typemin ($ Ti)) <= x <= Float128 (typemax ($ Ti))) && (round (x, RoundToZero) == x)
255
+ return unsafe_trunc ($ Ti,x)
256
+ else
257
+ throw (InexactError ($ (Expr (:quote ,Ti. name. name)), $ Ti, x))
258
+ end
259
+ end
260
+ end
261
+ end
262
+ for Ti in (Int128,)
263
+ # Here `eps(Float128(typemin(Ti))) > 1`, so the only value which can be truncated to
264
+ # `Float128(typemin(Ti)` is itself. Similarly, `Float128(typemax(Ti))` is inexact and will
265
+ # be rounded up. This assumes that `Float128(typemin(Ti)) > -Inf`, which is true for
266
+ # Int128, but possibly not for larger integer types.
267
+ @eval begin
268
+ function trunc (:: Type{$Ti} ,x:: Float128 )
269
+ if Float128 (typemin ($ Ti)) <= x < Float128 (typemax ($ Ti))
270
+ return unsafe_trunc ($ Ti,x)
271
+ else
272
+ throw (InexactError (:trunc , $ Ti, x))
273
+ end
274
+ end
275
+ function (:: Type{$Ti} )(x:: Float128 )
276
+ if (Float128 (typemin ($ Ti)) <= x < Float128 (typemax ($ Ti))) && (round (x, RoundToZero) == x)
277
+ return unsafe_trunc ($ Ti,x)
278
+ else
279
+ throw (InexactError ($ (Expr (:quote ,Ti. name. name)), $ Ti, x))
264
280
end
265
281
end
266
282
end
@@ -278,7 +294,10 @@ for f in (:acos, :acosh, :asin, :asinh, :atan, :atanh, :cosh, :cos,
278
294
end
279
295
end
280
296
281
- @assume_effects :foldable abs (x:: Float128 ) = Float128 (@quad_ccall (libquadmath. fabsq (x:: Cfloat128 ):: Cfloat128 ))
297
+ function abs (x:: Float128 )
298
+ # mask out sign
299
+ reinterpret (Float128, reinterpret (UInt128, x)& (~ (UInt128 (1 )<< 127 )))
300
+ end
282
301
@assume_effects :foldable round (x:: Float128 ) = Float128 (@quad_ccall (libquadmath. rintq (x:: Cfloat128 ):: Cfloat128 ))
283
302
round (x:: Float128 , r:: RoundingMode{:Down} ) = floor (x)
284
303
round (x:: Float128 , r:: RoundingMode{:Up} ) = ceil (x)
@@ -317,9 +336,16 @@ sincos(x::Float128) = (sin(x), cos(x))
317
336
Float128 (@quad_ccall (libquadmath. fmaq (x:: Cfloat128 , y:: Cfloat128 , z:: Cfloat128 ):: Cfloat128 ))
318
337
end
319
338
320
- @assume_effects :foldable isnan (x:: Float128 ) = 0 != @quad_ccall (libquadmath. isnanq (x:: Cfloat128 ):: Cint )
321
- @assume_effects :foldable isinf (x:: Float128 ) = 0 != @quad_ccall (libquadmath. isinfq (x:: Cfloat128 ):: Cint )
322
- @assume_effects :foldable isfinite (x:: Float128 ) = 0 != @quad_ccall (libquadmath. finiteq (x:: Cfloat128 ):: Cint )
339
+ function isinf (x:: Float128 )
340
+ xu = reinterpret (UInt128, x)
341
+ # xu must be either 0x7fff_0000... or 0xffff_0000...
342
+ return xu in (exponent_mask (Float128), exponent_mask (Float128)&~ (UInt128 (1 )<< 127 ))
343
+ end
344
+ function isfinite (x:: Float128 )
345
+ xu = reinterpret (UInt128, x)
346
+ return xu>> significand_bits (Float128)& 0x7fff != 0x7fff
347
+ end
348
+ isnan (x:: Float128 ) = ! (isfinite (x) || isinf (x))
323
349
324
350
isinteger (x:: Float128 ) = isfinite (x) && x === trunc (x)
325
351
0 commit comments