Skip to content

Commit 504dfdf

Browse files
authored
Add Parsers.parse(Number, source) for number type detection + parsing (#149)
* Add ability to parse generic Number * Add Parsers.parse(Number, source) for number type detection + parsing * fix tests * Fix 32-bit * Fix BigFloat positioning
1 parent 0e62a95 commit 504dfdf

File tree

6 files changed

+168
-56
lines changed

6 files changed

+168
-56
lines changed

src/Parsers.jl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -305,14 +305,14 @@ const SourceType = Union{AbstractVector{UInt8}, AbstractString, IO}
305305
xparse(::Type{T}, source::SourceType; pos::Integer=1, len::Integer=source isa IO ? 0 : sizeof(source), kw...) where {T} =
306306
xparse(T, source, pos, len, Options(; kw...))
307307

308-
@inline _xparse(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, options::Options=XOPTIONS, ::Type{S}=(T <: AbstractString) ? PosLen : T) where {T <: SupportedTypes, S} =
308+
@inline _xparse(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, options::Options=XOPTIONS, ::Type{S}=(T <: AbstractString) ? PosLen : T) where {T, S} =
309309
Result(emptysentinel(options)(delimiter(options)(whitespace(options)(
310310
quoted(options)(whitespace(options)(sentinel(options)(typeparser(options)
311311
)))))))(T, source, pos, len, S)
312312

313313
function xparse(::Type{T}, source::SourceType, pos, len, options=XOPTIONS, ::Type{S}=(T <: AbstractString) ? PosLen : T) where {T, S}
314314
buf = source isa AbstractString ? codeunits(source) : source
315-
if supportedtype(T)
315+
if supportedtype(T) || T === Number
316316
return _xparse(T, buf, pos, len, options, S)
317317
else
318318
# generic fallback calls Base.tryparse
@@ -334,12 +334,12 @@ function xparse(::Type{T}, source::SourceType, pos, len, options=XOPTIONS, ::Typ
334334
end
335335

336336
# condensed version of xparse that doesn't worry about quoting or delimiters; called from Parsers.parse/Parsers.tryparse
337-
@inline _xparse2(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, opts::Options=OPTIONS, ::Type{S}=(T <: AbstractString) ? PosLen : T) where {T <: SupportedTypes, S} =
337+
@inline _xparse2(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, opts::Options=OPTIONS, ::Type{S}=(T <: AbstractString) ? PosLen : T) where {T, S} =
338338
Result(whitespace(false, false, false, true)(typeparser(opts)))(T, source, pos, len, S)
339339

340340
@inline function xparse2(::Type{T}, source::SourceType, pos, len, options=OPTIONS, ::Type{S}=(T <: AbstractString) ? PosLen : T) where {T, S}
341341
buf = source isa AbstractString ? codeunits(source) : source
342-
if supportedtype(T)
342+
if supportedtype(T) || T === Number
343343
return _xparse2(T, buf, pos, len, options, S)
344344
else
345345
# generic fallback calls Base.tryparse
@@ -386,7 +386,7 @@ include("dates.jl")
386386

387387
function __init__()
388388
resize!(empty!(BIGINT), Threads.nthreads())
389-
resize!(empty!(BIGFLOAT), Threads.nthreads())
389+
resize!(empty!(BIGFLOATS), Threads.nthreads())
390390
return
391391
end
392392

src/floats.jl

Lines changed: 76 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
using Base.MPFR, Base.GMP, Base.GMP.MPZ
22

33
_widen(x::UInt64) = UInt128(x)
4+
_widen(x::Int64) = Int128(x)
45

56
const BIGINT = BigInt[]
67

@@ -17,10 +18,10 @@ function access_threaded(f, v::Vector)
1718
end
1819
@noinline _length_assert() = @assert false "0 < tid <= v"
1920

20-
function _widen(v::UInt128)
21+
function _widen(v::T) where {T <: Union{Int128, UInt128}}
2122
x = access_threaded(() -> (@static VERSION > v"1.5" ? BigInt(; nbits=256) : BigInt()), BIGINT)
2223
ccall((:__gmpz_import, :libgmp), Int32,
23-
(Ref{BigInt}, Csize_t, Cint, Csize_t, Cint, Csize_t, Ref{UInt128}),
24+
(Ref{BigInt}, Csize_t, Cint, Csize_t, Cint, Csize_t, Ref{T}),
2425
x, 1, 1, 16, 0, 0, v)
2526
return x
2627
end
@@ -29,7 +30,7 @@ end
2930
maxdigits(::Type{Float64}) = 1079
3031
maxdigits(::Type{Float32}) = 154
3132
maxdigits(::Type{Float16}) = 29
32-
maxdigits(::Type{BigFloat}) = typemax(Int64)
33+
maxdigits(T) = typemax(Int64)
3334

3435
ten(::Type{T}) where {T} = T(10)
3536
const TEN = BigInt(10)
@@ -43,12 +44,46 @@ function _muladd(ten, digits::BigInt, b)
4344
return digits
4445
end
4546

47+
# Float widths that can be selected for the parsed result.
@enum FloatType FLOAT16 FLOAT32 FLOAT64 BIGFLOAT

# When the caller already asked for a concrete float type, that request wins and
# the `FT` hint is ignored.
function float_type(::Type{T}, FT::FloatType) where {T <: AbstractFloat}
    return T
end

# For any other requested type, map the `FT` hint to the matching float type;
# anything that is not FLOAT16/FLOAT32/FLOAT64 resolves to BigFloat.
function float_type(T, FT::FloatType)
    if FT === FLOAT16
        return Float16
    elseif FT === FLOAT32
        return Float32
    elseif FT === FLOAT64
        return Float64
    else
        return BigFloat
    end
end
53+
4654
# for non SupportedFloat Reals, parse as Float64, then convert
4755
@inline function typeparser(::Type{T}, source, pos, len, b, code, pl, options) where {T <: Real}
4856
pos, code, pl, x = typeparser(Float64, source, pos, len, b, code, pl, options)
4957
return pos, code, pl, T(x)
5058
end
5159

60+
# Parse a BigFloat by handing the raw bytes to MPFR's `mpfr_strtofr`.
#
# For byte-vector sources the bytes are read in place starting at `pos`. For any
# other source (IO), the field is first located with the String parser, its
# bytes are copied into a temporary buffer, and the source is then seeked back
# to where the String parser left it.
#
# Returns `(pos, code, PosLen, value)`, where `pos` has been advanced by the
# number of bytes MPFR consumed and `code` has INVALID or'd in when MPFR
# consumed nothing, OK otherwise.
#
# NOTE(review): `mpfr_strtofr` reads until it hits a byte it cannot use and is
# not told about `len`; this presumably relies on the buffer being
# NUL-terminated (true for String-backed memory) — confirm for arbitrary
# `AbstractVector{UInt8}` inputs.
function typeparser(::Type{BigFloat}, source, pos, len, b, code, pl, options)
    # base 0 asks MPFR to detect the base from the input itself
    base = 0
    rounding = Base.MPFR.ROUNDING_MODE[]
    z = BigFloat(precision=Base.MPFR.DEFAULT_PRECISION[])
    if source isa AbstractVector{UInt8}
        str = source
        strpos = pos
    else
        # find the extent of the field, then pull its bytes out of the IO
        _, _, _pl, _ = typeparser(String, source, pos, len, b, code, pl, options)
        _pos = position(source)
        fastseek!(source, _pl.pos - 1)
        str = Base.StringVector(_pl.len)
        strpos = 1
        readbytes!(source, str, _pl.len)
        fastseek!(source, _pos) # reset IO to earlier position
    end
    GC.@preserve str begin
        ptr = pointer(str, strpos)
        endptr = Ref{Ptr{UInt8}}()
        # the return value is MPFR's rounding ternary, not an error code, so it
        # is deliberately discarded; success is judged from `endptr` below
        ccall((:mpfr_strtofr, :libmpfr), Int32, (Ref{BigFloat}, Ptr{UInt8}, Ref{Ptr{UInt8}}, Int32, Base.MPFR.MPFRRoundingMode), z, ptr, endptr, base, rounding)
        # endptr == ptr means no bytes were consumed, i.e. no number was found
        code |= endptr[] == ptr ? INVALID : OK
        pos += Int(endptr[] - ptr)
        return pos, code, PosLen(pl.pos, pos - pl.pos), z
    end
end
86+
5287
@inline function typeparser(::Type{T}, source, pos, len, b, code, pl, options) where {T <: SupportedFloats}
5388
# keep track of starting pos in case of invalid, we can rewind to start of parsing
5489
startpos = pos
@@ -89,7 +124,7 @@ end
89124
if eof(source, pos, len)
90125
code |= EOF
91126
end
92-
code |= OK
127+
code |= OK | SPECIAL_VALUE
93128
@goto done
94129
end
95130
end
@@ -111,7 +146,7 @@ end
111146
b = peekbyte(source, pos)
112147
if b == UInt8('f') || b == UInt8('F')
113148
x = ifelse(neg, T(-Inf), T(Inf))
114-
code |= OK
149+
code |= OK | SPECIAL_VALUE
115150
pos += 1
116151
incr!(source)
117152
if eof(source, pos, len)
@@ -183,10 +218,10 @@ end
183218
end
184219

185220
# if we need to _widen the type due to `digits` overflow, we want a non-inlined version so base case compilation doesn't get out of control
186-
@noinline _parsedigits(::Type{T}, source, pos, len, b, code, options, digits::IntType, neg::Bool, startpos) where {T <: SupportedFloats, IntType} =
221+
@noinline _parsedigits(::Type{T}, source, pos, len, b, code, options, digits::IntType, neg::Bool, startpos) where {T, IntType} =
187222
parsedigits(T, source, pos, len, b, code, options, digits, neg, startpos)
188223

189-
@inline function parsedigits(::Type{T}, source, pos, len, b, code, options, digits::IntType, neg::Bool, startpos) where {T <: SupportedFloats, IntType}
224+
@inline function parsedigits(::Type{T}, source, pos, len, b, code, options, digits::IntType, neg::Bool, startpos) where {T, IntType}
190225
x = zero(T)
191226
ndigits = 0
192227
has_groupmark = options.groupmark !== nothing
@@ -267,12 +302,13 @@ end
267302
# same as above; if digits overflows, we want a non-inlined version to call with a wider type
268303
# note that we never expect `frac` to overflow, since it just keeps track of the # of digits
269304
# we parse post-decimal point
270-
@noinline _parsefrac(::Type{T}, source, pos, len, b, code, options, digits::IntType, neg::Bool, startpos, frac) where {T <: SupportedFloats, IntType} =
305+
@noinline _parsefrac(::Type{T}, source, pos, len, b, code, options, digits::IntType, neg::Bool, startpos, frac) where {T, IntType} =
271306
parsefrac(T, source, pos, len, b, code, options, digits, neg, startpos, frac)
272307

273-
@inline function parsefrac(::Type{T}, source, pos, len, b, code, options, digits::IntType, neg::Bool, startpos, frac) where {T <: SupportedFloats, IntType}
308+
@inline function parsefrac(::Type{T}, source, pos, len, b, code, options, digits::IntType, neg::Bool, startpos, frac) where {T, IntType}
274309
x = zero(T)
275310
parsedanyfrac = false
311+
FT = FLOAT64
276312
# check if `b` is a digit
277313
if b - UInt8('0') < 0x0a
278314
b -= UInt8('0')
@@ -285,7 +321,7 @@ end
285321
frac += UInt64(1)
286322
if eof(source, pos, len)
287323
# input is simple non-scientific-notation floating number, like "1.1"
288-
x = scale(T, digits, -signed(frac), neg)
324+
x = scale(T, FT, digits, -signed(frac), neg)
289325
code |= OK | EOF
290326
@goto done
291327
end
@@ -299,6 +335,9 @@ end
299335
end
300336
# check for exponent notation
301337
if b == UInt8('e') || b == UInt8('E') || b == UInt8('f') || b == UInt8('F')
338+
if b == UInt8('f') || b == UInt8('F')
339+
FT = FLOAT32
340+
end
302341
pos += 1
303342
incr!(source)
304343
if eof(source, pos, len)
@@ -327,11 +366,11 @@ end
327366

328367
# at this point, we've parsed X and Y in "X.YeZ", but not Z in a scientific notation exponent number
329368
# we start our exponent number at UInt64(0)
330-
return parseexp(T, source, pos, len, b, code, options, digits, neg, startpos, frac, UInt64(0), negexp)
369+
return parseexp(T, source, pos, len, b, code, options, digits, neg, startpos, frac, UInt64(0), negexp, FT)
331370
else
332371
# if no scientific notation, we're done, so scale digits + frac and return
333372
if parsedanyfrac
334-
x = scale(T, digits, -signed(frac), neg)
373+
x = scale(T, FT, digits, -signed(frac), neg)
335374
else
336375
x = ifelse(neg, -T(digits), T(digits))
337376
end
@@ -344,10 +383,10 @@ end
344383

345384
# same no-inline story, but this time for exponent number; probably even more rare to overflow the exponent number
346385
# compared to pre/post decimal digits, but we account for it all the same (a lot of float parsers don't account for this)
347-
@noinline _parseexp(::Type{T}, source, pos, len, b, code, options, digits, neg::Bool, startpos, frac, exp::ExpType, negexp) where {T <: SupportedFloats, ExpType} =
348-
parseexp(T, source, pos, len, b, code, options, digits, neg, startpos, frac, exp, negexp)
386+
@noinline _parseexp(::Type{T}, source, pos, len, b, code, options, digits, neg::Bool, startpos, frac, exp::ExpType, negexp, FT) where {T, ExpType} =
387+
parseexp(T, source, pos, len, b, code, options, digits, neg, startpos, frac, exp, negexp, FT)
349388

350-
@inline function parseexp(::Type{T}, source, pos, len, b, code, options, digits, neg::Bool, startpos, frac, exp::ExpType, negexp) where {T <: SupportedFloats, ExpType}
389+
@inline function parseexp(::Type{T}, source, pos, len, b, code, options, digits, neg::Bool, startpos, frac, exp::ExpType, negexp, FT) where {T, ExpType}
351390
x = zero(T)
352391
# note that `b` has already had `b - UInt8('0')` applied to it for parseexp
353392
while true
@@ -356,19 +395,19 @@ end
356395
incr!(source)
357396
if eof(source, pos, len)
358397
# we finished parsing input like "1.1e1"
359-
x = scale(T, digits, ifelse(negexp, -signed(exp), signed(exp)) - signed(frac), neg)
398+
x = scale(T, FT, digits, ifelse(negexp, -signed(exp), signed(exp)) - signed(frac), neg)
360399
code |= OK | EOF
361400
@goto done
362401
end
363402
b = peekbyte(source, pos) - UInt8('0')
364403
# if we encounter a non-digit, that must mean we're done
365404
if b > 0x09
366-
x = scale(T, digits, ifelse(negexp, -signed(exp), signed(exp)) - signed(frac), neg)
405+
x = scale(T, FT, digits, ifelse(negexp, -signed(exp), signed(exp)) - signed(frac), neg)
367406
code |= OK
368407
@goto done
369408
end
370409
if overflows(ExpType) && exp > overflowval(ExpType)
371-
return _parseexp(T, source, pos, len, b, code, options, digits, neg, startpos, frac, _widen(exp), negexp)
410+
return _parseexp(T, source, pos, len, b, code, options, digits, neg, startpos, frac, _widen(exp), negexp, FT)
372411
end
373412
end
374413
@label done
@@ -395,7 +434,14 @@ pow10(::Type{Float32}, e) = (@inbounds v = F32_SHORT_POWERS[e+1]; return v)
395434
pow10(::Type{Float64}, e) = (@inbounds v = F64_SHORT_POWERS[e+1]; return v)
396435
pow10(::Type{BigFloat}, e) = (@inbounds v = F64_SHORT_POWERS[e+1]; return v)
397436

398-
function scale(::Type{T}, v, exp, neg) where {T}
437+
# Convert the accumulated digits with `unsigned`; BigInt values are returned as-is.
function _unsigned(x::BigInt)
    return x
end

function _unsigned(x)
    return unsigned(x)
end
439+
440+
# Entry point for scaling parsed digits: resolve the float type to produce from
# the requested type `T` and the `FT` hint via `float_type`, pass the digits
# through `_unsigned`, and delegate the actual work to `__scale`.
scale(::Type{T}, FT::FloatType, v, exp, neg) where {T} =
    __scale(float_type(T, FT), _unsigned(v), exp, neg)
443+
444+
function __scale(::Type{T}, v, exp, neg) where {T}
399445
ms = maxsig(T)
400446
cl = ceillog5(T)
401447
if v < ms
@@ -409,7 +455,7 @@ function scale(::Type{T}, v, exp, neg) where {T}
409455
end
410456
end
411457
v == 0 && return zero(T)
412-
if exp > 308
458+
if exp > 308 && T != BigFloat
413459
return T(neg ? -Inf : Inf)
414460
elseif exp < -326
415461
# https://github.com/JuliaData/Parsers.jl/issues/83
@@ -485,9 +531,9 @@ function _scale(::Type{T}, v::V, exp, neg) where {T, V <: UInt128}
485531
if exp == 23
486532
# special-case concluded from https://github.com/JuliaLang/julia/issues/38509
487533
x = v * V(1e23)
488-
elseif exp >= 0
534+
elseif 0 <= exp < 290
489535
x = v * exp10(exp)
490-
elseif exp < -308 || v > maxsig(T)
536+
elseif exp < -308 || exp > 308 || v > maxsig(T)
491537
# if v is too large, we lose precision by just doing
492538
# v / exp10(-exp) since that only promotes to Float64
493539
# so detect and re-route to this branch where we widen v
@@ -500,15 +546,11 @@ function _scale(::Type{T}, v::V, exp, neg) where {T, V <: UInt128}
500546
end
501547

502548
const BIGEXP10 = [1 / exp10(BigInt(e)) for e = 309:327]
503-
const BIGFLOAT = BigFloat[]
504-
if VERSION > v"1.5"
505-
const BIGFLOATEXP10 = [exp10(BigFloat(i; precision=64)) for i = 1:308]
506-
else
507-
const BIGFLOATEXP10 = [exp10(BigFloat(i)) for i = 1:308]
508-
end
549+
const BIGFLOATS = BigFloat[]
550+
const BIGFLOATEXP10 = [exp10(BigFloat(i; precision=256)) for i = 1:308]
509551

510552
function _scale(::Type{T}, v::V, exp, neg) where {T, V <: BigInt}
511-
x = access_threaded(BigFloat, BIGFLOAT)
553+
x = access_threaded(BigFloat, BIGFLOATS)
512554

513555
ccall((:mpfr_set_z, :libmpfr), Int32,
514556
(Ref{BigFloat}, Ref{BigInt}, Int32),
@@ -531,7 +573,11 @@ function _scale(::Type{T}, v::V, exp, neg) where {T, V <: BigInt}
531573
x, x, y, MPFR.ROUNDING_MODE[])
532574
else
533575
# v * exp10(V(exp))
534-
y = BIGFLOATEXP10[exp]
576+
if exp <= 308
577+
y = BIGFLOATEXP10[exp]
578+
else
579+
y = exp10(BigFloat(exp; precision=256))
580+
end
535581
ccall((:mpfr_mul, :libmpfr), Int32,
536582
(Ref{BigFloat}, Ref{BigFloat}, Ref{BigFloat}, Int32),
537583
x, x, y, MPFR.ROUNDING_MODE[])

src/ints.jl

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,3 +95,30 @@ overflowval(::Type{T}) where {T <: Integer} = div(typemax(T) - T(9), T(10))
9595
@label done
9696
return pos, code, PosLen(pl.pos, pos - pl.pos), x
9797
end
98+
99+
# Parse a generic `Number`: consume an optional sign, then let `parsedigits`
# produce the value. If the result is infinite without the SPECIAL_VALUE flag
# being set, parsing is restarted from the beginning as a BigFloat.
#
# Returns `(pos, code, PosLen, value)`.
@inline function typeparser(::Type{Number}, source, pos, len, b, code, pl, opts)
    # remembered so the BigFloat retry can start over from the original state
    startpos = pos
    startcode = code
    # give `x` a value up front: the early-EOF path below jumps straight to the
    # return statement, which would otherwise hit an undefined variable
    x = zero(Number)
    # begin parsing
    neg = b == UInt8('-')
    if neg || b == UInt8('+')
        pos += 1
        incr!(source)
    end
    if eof(source, pos, len)
        # no bytes remain after the optional sign, so there is no number here
        code |= INVALID | EOF
        @goto done
    end
    b = peekbyte(source, pos)
    # parse rest of number
    digits = Int64(0)
    x, code, pos = parsedigits(Number, source, pos, len, b, code, opts, digits, neg, startpos)
    if (x === Inf || x === -Inf) && !specialvalue(code)
        # by default, parsedigits only has up to Float64 precision; if we overflow
        # let's try BigFloat
        # NOTE(review): for IO sources the stream has already been advanced at
        # this point — confirm the BigFloat parser copes with that
        return typeparser(BigFloat, source, startpos, len, b, startcode, pl, opts)
    end

@label done
    return pos, code, PosLen(pl.pos, pos - pl.pos), x
end

src/utils.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ const DELIMITED = 0b0000000000001000 % ReturnCode
6161
const NEWLINE = 0b0000000000010000 % ReturnCode
6262
const EOF = 0b0000000000100000 % ReturnCode
6363
const ESCAPED_STRING = 0b0000001000000000 % ReturnCode
64+
const SPECIAL_VALUE = 0b0000010000000000 % ReturnCode
6465

6566
# invalid flags
6667
const INVALID_QUOTED_FIELD = 0b1000000001000000 % ReturnCode
@@ -78,6 +79,7 @@ quoted(x::ReturnCode) = (x & QUOTED) == QUOTED
7879
delimited(x::ReturnCode) = (x & DELIMITED) == DELIMITED
7980
newline(x::ReturnCode) = (x & NEWLINE) == NEWLINE
8081
escapedstring(x::ReturnCode) = (x & ESCAPED_STRING) == ESCAPED_STRING
82+
specialvalue(x::ReturnCode) = (x & SPECIAL_VALUE) == SPECIAL_VALUE
8183
invalidquotedfield(x::ReturnCode) = (x & INVALID_QUOTED_FIELD) == INVALID_QUOTED_FIELD
8284
invaliddelimiter(x::ReturnCode) = (x & INVALID_DELIMITER) == INVALID_DELIMITER
8385
overflow(x::ReturnCode) = (x & OVERFLOW) == OVERFLOW

0 commit comments

Comments
 (0)