Skip to content

Commit 5cd4616

Browse files
committed
move code from abstractdataframe.jl to iteration.jl
1 parent 9683931 commit 5cd4616

File tree

2 files changed

+282
-282
lines changed

2 files changed

+282
-282
lines changed

src/abstractdataframe/abstractdataframe.jl

Lines changed: 0 additions & 282 deletions
Original file line numberDiff line numberDiff line change
@@ -1814,288 +1814,6 @@ Base.vcat(dfs::AbstractDataFrame...;
18141814
Pair{<:SymbolOrString, <:AbstractVector}}=nothing) =
18151815
reduce(vcat, dfs; cols=cols, source=source)
18161816

1817-
"""
1818-
reduce(::typeof(vcat),
1819-
dfs::Union{AbstractVector{<:AbstractDataFrame},
1820-
Tuple{AbstractDataFrame, Vararg{AbstractDataFrame}}};
1821-
cols::Union{Symbol, AbstractVector{Symbol},
1822-
AbstractVector{<:AbstractString}}=:setequal,
1823-
source::Union{Nothing, Symbol, AbstractString,
1824-
Pair{<:Union{Symbol, AbstractString}, <:AbstractVector}}=nothing,
1825-
init::AbstractDataFrame=DataFrame())
1826-
1827-
Efficiently reduce the given vector or tuple of `AbstractDataFrame`s with
1828-
`vcat`.
1829-
1830-
See the [`vcat`](@ref) docstring for a description of keyword arguments `cols`
1831-
and `source`.
1832-
1833-
The keyword argument `init` is the initial value to use in the reductions.
1834-
It must be a data frame that has zero rows. It is not taken into account when
1835-
computing the value of the `source` column nor when determining metadata
1836-
of the produced data frame.
1837-
1838-
The column order, names, and types of the resulting `DataFrame`, and the
1839-
behavior of `cols` and `source` keyword arguments follow the rules specified for
1840-
[`vcat`](@ref) of `AbstractDataFrame`s.
1841-
1842-
Metadata: `vcat` propagates table-level `:note`-style metadata for keys that are
1843-
present in all passed data frames and have the same value. `vcat` propagates
1844-
column-level `:note`-style metadata for keys that are present in all passed data
1845-
frames that contain this column and have the same value.
1846-
1847-
# Example
1848-
```jldoctest
1849-
julia> df1 = DataFrame(A=1:3, B=1:3)
1850-
3×2 DataFrame
1851-
Row │ A B
1852-
│ Int64 Int64
1853-
─────┼──────────────
1854-
1 │ 1 1
1855-
2 │ 2 2
1856-
3 │ 3 3
1857-
1858-
julia> df2 = DataFrame(A=4:6, B=4:6)
1859-
3×2 DataFrame
1860-
Row │ A B
1861-
│ Int64 Int64
1862-
─────┼──────────────
1863-
1 │ 4 4
1864-
2 │ 5 5
1865-
3 │ 6 6
1866-
1867-
julia> df3 = DataFrame(A=7:9, C=7:9)
1868-
3×2 DataFrame
1869-
Row │ A C
1870-
│ Int64 Int64
1871-
─────┼──────────────
1872-
1 │ 7 7
1873-
2 │ 8 8
1874-
3 │ 9 9
1875-
1876-
julia> reduce(vcat, (df1, df2))
1877-
6×2 DataFrame
1878-
Row │ A B
1879-
│ Int64 Int64
1880-
─────┼──────────────
1881-
1 │ 1 1
1882-
2 │ 2 2
1883-
3 │ 3 3
1884-
4 │ 4 4
1885-
5 │ 5 5
1886-
6 │ 6 6
1887-
1888-
julia> reduce(vcat, [df1, df2, df3], cols=:union, source=:source)
1889-
9×4 DataFrame
1890-
Row │ A B C source
1891-
│ Int64 Int64? Int64? Int64
1892-
─────┼─────────────────────────────────
1893-
1 │ 1 1 missing 1
1894-
2 │ 2 2 missing 1
1895-
3 │ 3 3 missing 1
1896-
4 │ 4 4 missing 2
1897-
5 │ 5 5 missing 2
1898-
6 │ 6 6 missing 2
1899-
7 │ 7 missing 7 3
1900-
8 │ 8 missing 8 3
1901-
9 │ 9 missing 9 3
1902-
```
1903-
"""
1904-
function Base.reduce(::typeof(vcat),
                     dfs::Union{AbstractVector{<:AbstractDataFrame},
                                Tuple{AbstractDataFrame, Vararg{AbstractDataFrame}}};
                     cols::Union{Symbol, AbstractVector{Symbol},
                                 AbstractVector{<:AbstractString}}=:setequal,
                     source::Union{Nothing, SymbolOrString,
                                   Pair{<:SymbolOrString, <:AbstractVector}}=nothing,
                     init::AbstractDataFrame=DataFrame())
    # `init` only seeds the reduction, so it is not allowed to contribute rows
    if nrow(init) > 0
        throw(ArgumentError("init data frame must have zero rows"))
    end

    # `init` goes first; it is copied and passed through `emptycolmetadata!` so that
    # the caller's object is not modified and `init` does not take part in the
    # column-level metadata merging done by `_vcat`
    dfs_init = AbstractDataFrame[emptycolmetadata!(copy(init))]
    append!(dfs_init, dfs)
    # data frames without columns are dropped before the actual concatenation
    res = _vcat(AbstractDataFrame[df for df in dfs_init if ncol(df) != 0]; cols=cols)
    # only handle table-level metadata, as column-level metadata was done in _vcat
    # (`dfs`, not `dfs_init`, is passed on purpose: `init` is ignored here)
    _merge_matching_table_note_metadata!(res, dfs)

    if source !== nothing
        len = length(dfs)
        if source isa SymbolOrString
            # only a column name was given: identify data frames by their position
            col, vals = source, 1:len
        else
            @assert source isa Pair{<:SymbolOrString, <:AbstractVector}
            col, vals = source
        end

        if columnindex(res, col) > 0
            idx = findfirst(df -> columnindex(df, col) > 0, dfs)
            if idx === nothing
                # none of the passed data frames holds the clashing column, so it
                # was introduced by `init` or by an explicit `cols` vector
                throw(ArgumentError("source column name :$col already exists in " *
                                    "the `init` data frame or in `cols`"))
            end
            throw(ArgumentError("source column name :$col already exists in data frame " *
                                "passed in position $idx"))
        end

        if len != length(vals)
            throw(ArgumentError("number of passed source identifiers ($(length(vals))) " *
                                "does not match the number of data frames ($len)"))
        end

        source_vec = Tables.allocatecolumn(eltype(vals), nrow(res))
        @assert firstindex(source_vec) == 1 && lastindex(source_vec) == nrow(res)
        # fill consecutive row ranges: each data frame gets its own identifier
        start = 1
        for (v, df) in zip(vals, dfs)
            stop = start + nrow(df) - 1
            # `Ref` makes broadcasting treat the identifier as a scalar
            source_vec[start:stop] .= Ref(v)
            start = stop + 1
        end

        # all rows of `res` must have been assigned an identifier
        @assert start == nrow(res) + 1
        insertcols!(res, col => source_vec)
    end

    return res
end
1957-
1958-
# definition needed to avoid dispatch ambiguity
function Base.reduce(::typeof(vcat),
                     dfs::SentinelArrays.ChainedVector{T, A} where {T<:AbstractDataFrame,
                                                                     A<:AbstractVector{T}};
                     cols::Union{Symbol, AbstractVector{Symbol},
                                 AbstractVector{<:AbstractString}}=:setequal,
                     source::Union{Nothing, SymbolOrString,
                                   Pair{<:SymbolOrString, <:AbstractVector}}=nothing,
                     init::AbstractDataFrame=DataFrame())
    # collect the chained vector into a vector of `AbstractDataFrame` and forward
    # all keyword arguments unchanged to `reduce(vcat, ...)`
    collected = collect(AbstractDataFrame, dfs)
    return reduce(vcat, collected, cols=cols, source=source, init=init)
end
1968-
1969-
# Vertically concatenate `dfs` into a new `DataFrame`, choosing the output columns
# according to `cols`, and propagate column-level `:note`-style metadata.
# Table-level metadata is not touched here; it is processed in `reduce`.
function _vcat(dfs::AbstractVector{AbstractDataFrame};
               cols::Union{Symbol, AbstractVector{Symbol},
                           AbstractVector{<:AbstractString}}=:setequal)
    # note that empty DataFrame() objects are dropped from dfs before we call _vcat
    if isempty(dfs)
        if cols isa Symbol
            return DataFrame()
        else
            return DataFrame([col => Missing[] for col in cols])
        end
    end

    # column names of every data frame, in argument order
    allheaders = map(names, dfs)
    # distinct lists of column names
    uniqueheaders = unique(allheaders)
    # every name that appears in at least one data frame
    unionunique = union(uniqueheaders...)
    # names shared by all data frames
    intersectunique = intersect(uniqueheaders...)

    if cols === :equal
        # an explicit error is thrown as :equal was supported in the past
        throw(ArgumentError("`cols=:equal` is not supported. " *
                            "Use `:setequal` instead."))
    elseif cols === :orderequal
        if length(uniqueheaders) > 1
            throw(ArgumentError("when `cols=:orderequal` all data frames need to " *
                                "have the same column names and be in the same order"))
        end
        header = unionunique
    elseif cols === :setequal
        coldiff = setdiff(unionunique, intersectunique)
        if !isempty(coldiff)
            # if any DataFrames are a full superset of names, skip them
            # (the closure captures `unionunique`, which is assigned only once)
            filter!(u -> !issetequal(u, unionunique), uniqueheaders)
            # one message per remaining distinct header, listing what it lacks
            estrings = map(uniqueheaders) do head
                matching = findall(h -> head == h, allheaders)
                badcols = join(setdiff(coldiff, head), ", ", " and ")
                args = join(matching, ", ", " and ")
                return "column(s) $badcols are missing from argument(s) $args"
            end
            throw(ArgumentError(join(estrings, ", ", ", and ")))
        end
        header = unionunique
    elseif cols === :intersect
        header = intersectunique
    elseif cols === :union
        header = unionunique
    elseif cols isa Symbol
        throw(ArgumentError("Invalid `cols` value :$cols. " *
                            "Only `:orderequal`, `:setequal`, `:intersect`, " *
                            "`:union`, or a vector of column names is allowed."))
    elseif cols isa AbstractVector{Symbol}
        header = cols
    else
        @assert cols isa AbstractVector{<:AbstractString}
        header = Symbol.(cols)
    end

    if isempty(header)
        out_df = DataFrame()
    else
        all_cols = Vector{AbstractVector}(undef, length(header))
        for (i, name) in enumerate(header)
            # per data frame: the stored column, or `missing` repeated once per row
            # when the data frame does not have this column
            pieces = map(dfs) do df
                if hasproperty(df, name)
                    return df[!, name]
                else
                    return Iterators.repeated(missing, nrow(df))
                end
            end

            lens = map(length, pieces)
            T = mapreduce(eltype, promote_type, pieces)
            dest = Tables.allocatecolumn(T, sum(lens))
            # copy the pieces one after another into the freshly allocated column
            offset = 1
            for (piece, len) in zip(pieces, lens)
                copyto!(dest, offset, piece)
                offset += len
            end
            all_cols[i] = dest
        end

        out_df = DataFrame(all_cols, header, copycols=false)
    end

    # here we process column-level metadata, table-level metadata is processed in reduce

    # first check if all data frames do not have column-level metadata
    # in which case we do not have to do anything
    if all(df -> getfield(parent(df), :colmetadata) === nothing, dfs)
        return out_df
    end

    ndfs = length(dfs)
    for colname in _names(out_df)
        if ndfs == 1
            df1 = dfs[1]
            if hasproperty(df1, colname)
                _copy_col_note_metadata!(out_df, colname, df1, colname)
            end
            continue
        end

        # the first data frame holding the column provides the candidate keys
        start = findfirst(x -> hasproperty(x, colname), dfs)
        start === nothing && continue
        df_start = dfs[start]
        for key_start in colmetadatakeys(df_start, colname)
            meta_val_start, meta_style_start =
                colmetadata(df_start, colname, key_start, style=true)
            meta_style_start === :note || continue
            # the key is kept only if every later data frame that has this column
            # stores the same `:note`-style value under it; `all` stops at the
            # first data frame that disagrees
            good_key = all(start+1:ndfs) do i
                dfi = dfs[i]
                hasproperty(dfi, colname) || return true
                key_start in colmetadatakeys(dfi, colname) || return false
                meta_vali, meta_stylei = colmetadata(dfi, colname, key_start, style=true)
                return meta_stylei === :note && isequal(meta_val_start, meta_vali)
            end
            good_key && colmetadata!(out_df, colname, key_start, meta_val_start, style=:note)
        end
    end

    return out_df
end
2098-
20991817
"""
21001818
repeat(df::AbstractDataFrame; inner::Integer = 1, outer::Integer = 1)
21011819

0 commit comments

Comments
 (0)