@@ -1814,288 +1814,6 @@ Base.vcat(dfs::AbstractDataFrame...;
1814
1814
Pair{<: SymbolOrString , <: AbstractVector }}= nothing ) =
1815
1815
reduce (vcat, dfs; cols= cols, source= source)
1816
1816
1817
"""
    reduce(::typeof(vcat),
           dfs::Union{AbstractVector{<:AbstractDataFrame},
                      Tuple{AbstractDataFrame, Vararg{AbstractDataFrame}}};
           cols::Union{Symbol, AbstractVector{Symbol},
                       AbstractVector{<:AbstractString}}=:setequal,
           source::Union{Nothing, Symbol, AbstractString,
                         Pair{<:Union{Symbol, AbstractString}, <:AbstractVector}}=nothing,
           init::AbstractDataFrame=DataFrame())

Efficiently reduce the given vector or tuple of `AbstractDataFrame`s with
`vcat`.

See the [`vcat`](@ref) docstring for a description of keyword arguments `cols`
and `source`.

The keyword argument `init` is the initial value to use in the reductions.
It must be a data frame that has zero rows. It is not taken into account when
computing the value of the `source` column nor when determining metadata
of the produced data frame.

The column order, names, and types of the resulting `DataFrame`, and the
behavior of `cols` and `source` keyword arguments follow the rules specified for
[`vcat`](@ref) of `AbstractDataFrame`s.

An `ArgumentError` is thrown if `init` has rows, if the `source` column name
already exists in the result, or if the number of `source` identifiers does not
match the number of data frames.

Metadata: `vcat` propagates table-level `:note`-style metadata for keys that are
present in all passed data frames and have the same value. `vcat` propagates
column-level `:note`-style metadata for keys that are present in all passed data
frames that contain this column and have the same value.

# Example
```jldoctest
julia> df1 = DataFrame(A=1:3, B=1:3)
3×2 DataFrame
 Row │ A      B
     │ Int64  Int64
─────┼──────────────
   1 │     1      1
   2 │     2      2
   3 │     3      3

julia> df2 = DataFrame(A=4:6, B=4:6)
3×2 DataFrame
 Row │ A      B
     │ Int64  Int64
─────┼──────────────
   1 │     4      4
   2 │     5      5
   3 │     6      6

julia> df3 = DataFrame(A=7:9, C=7:9)
3×2 DataFrame
 Row │ A      C
     │ Int64  Int64
─────┼──────────────
   1 │     7      7
   2 │     8      8
   3 │     9      9

julia> reduce(vcat, (df1, df2))
6×2 DataFrame
 Row │ A      B
     │ Int64  Int64
─────┼──────────────
   1 │     1      1
   2 │     2      2
   3 │     3      3
   4 │     4      4
   5 │     5      5
   6 │     6      6

julia> reduce(vcat, [df1, df2, df3], cols=:union, source=:source)
9×4 DataFrame
 Row │ A      B        C        source
     │ Int64  Int64?   Int64?   Int64
─────┼─────────────────────────────────
   1 │     1        1  missing       1
   2 │     2        2  missing       1
   3 │     3        3  missing       1
   4 │     4        4  missing       2
   5 │     5        5  missing       2
   6 │     6        6  missing       2
   7 │     7  missing        7       3
   8 │     8  missing        8       3
   9 │     9  missing        9       3
```
"""
function Base.reduce(::typeof(vcat),
                     dfs::Union{AbstractVector{<:AbstractDataFrame},
                                Tuple{AbstractDataFrame, Vararg{AbstractDataFrame}}};
                     cols::Union{Symbol, AbstractVector{Symbol},
                                 AbstractVector{<:AbstractString}}=:setequal,
                     source::Union{Nothing, SymbolOrString,
                                   Pair{<:SymbolOrString, <:AbstractVector}}=nothing,
                     init::AbstractDataFrame=DataFrame())
    if nrow(init) > 0
        throw(ArgumentError("init data frame must have zero rows"))
    end

    # a copy of `init` (passed through `emptycolmetadata!`) is placed in front of
    # the passed data frames; data frames without columns are not forwarded to _vcat
    dfs_init = AbstractDataFrame[emptycolmetadata!(copy(init))]
    append!(dfs_init, dfs)
    res = _vcat(AbstractDataFrame[df for df in dfs_init if ncol(df) != 0]; cols=cols)
    # only handle table-level metadata, as column-level metadata was done in _vcat
    _merge_matching_table_note_metadata!(res, dfs)

    if source !== nothing
        len = length(dfs)
        if source isa SymbolOrString
            # a bare column name: identify data frames by their position
            col, vals = source, 1:len
        else
            @assert source isa Pair{<:SymbolOrString, <:AbstractVector}
            col, vals = source
        end

        if columnindex(res, col) > 0
            idx = findfirst(df -> columnindex(df, col) > 0, dfs)
            if idx === nothing
                # the clashing column is not in any element of `dfs`, so it was
                # introduced by `init` or by an explicit `cols` vector; report it
                # as a user error instead of failing an internal assertion
                throw(ArgumentError("source column name :$col already exists in " *
                                    "the `init` data frame or in the `cols` argument"))
            end
            throw(ArgumentError("source column name :$col already exists in data frame " *
                                "passed in position $idx"))
        end

        if len != length(vals)
            throw(ArgumentError("number of passed source identifiers ($(length(vals))) " *
                                "does not match the number of data frames ($len)"))
        end

        source_vec = Tables.allocatecolumn(eltype(vals), nrow(res))
        @assert firstindex(source_vec) == 1 && lastindex(source_vec) == nrow(res)
        # fill consecutive row ranges of `source_vec` with the identifier of the
        # data frame that contributed those rows
        start = 1
        for (v, df) in zip(vals, dfs)
            stop = start + nrow(df) - 1
            # Ref prevents broadcasting over `v` if it is itself a collection
            source_vec[start:stop] .= Ref(v)
            start = stop + 1
        end

        # all rows of the result must have been assigned an identifier
        @assert start == nrow(res) + 1
        insertcols!(res, col => source_vec)
    end

    return res
end
1957
-
1958
# This method exists only to resolve a dispatch ambiguity for
# `SentinelArrays.ChainedVector` input: the chained vector is collected into
# a vector with element type `AbstractDataFrame` and the call is forwarded to
# `reduce(vcat, ...)` with all keyword arguments passed through unchanged.
function Base.reduce(::typeof(vcat),
                     dfs::SentinelArrays.ChainedVector{T, A} where {T<:AbstractDataFrame,
                                                                    A<:AbstractVector{T}};
                     cols::Union{Symbol, AbstractVector{Symbol},
                                 AbstractVector{<:AbstractString}}=:setequal,
                     source::Union{Nothing, SymbolOrString,
                                   Pair{<:SymbolOrString, <:AbstractVector}}=nothing,
                     init::AbstractDataFrame=DataFrame())
    collected = collect(AbstractDataFrame, dfs)
    return reduce(vcat, collected, cols=cols, source=source, init=init)
end
1968
-
1969
# Internal worker for vertical concatenation of the data frames in `dfs`.
#
# `cols` selects the columns of the result: `:orderequal`, `:setequal`,
# `:intersect`, `:union`, or an explicit vector of column names (`Symbol`s or
# strings). Any other `Symbol` (including the formerly supported `:equal`)
# throws an `ArgumentError`.
#
# Columns absent from a given data frame are filled with `missing` for its rows.
# Column-level `:note`-style metadata is set on the result for keys that agree
# across all data frames containing the column; table-level metadata is left
# to the caller.
function _vcat(dfs::AbstractVector{AbstractDataFrame};
               cols::Union{Symbol, AbstractVector{Symbol},
                           AbstractVector{<:AbstractString}}=:setequal)
    # note that empty DataFrame() objects are dropped from dfs before we call _vcat
    if isempty(dfs)
        return cols isa Symbol ? DataFrame() : DataFrame([col => Missing[] for col in cols])
    end

    # column names of every data frame, in argument order
    allheaders = map(names, dfs)
    # distinct headers found among the data frames
    uniqueheaders = unique(allheaders)
    # every column name that occurs in at least one header
    unionunique = union(uniqueheaders...)
    # column names that occur in all headers
    intersectunique = intersect(uniqueheaders...)

    if cols === :equal
        # an explicit error is thrown as :equal was supported in the past
        throw(ArgumentError("`cols=:equal` is not supported. " *
                            "Use `:setequal` instead."))
    elseif cols === :orderequal
        if length(uniqueheaders) > 1
            throw(ArgumentError("when `cols=:orderequal` all data frames need to " *
                                "have the same column names and be in the same order"))
        end
        header = unionunique
    elseif cols === :setequal
        coldiff = setdiff(unionunique, intersectunique)
        if !isempty(coldiff)
            # headers that already contain the full set of names have nothing
            # missing, so they are left out of the error report
            filter!(u -> !issetequal(u, unionunique), uniqueheaders)
            estrings = map(uniqueheaders) do head
                positions = findall(h -> head == h, allheaders)
                badcols = join(setdiff(coldiff, head), ", ", " and ")
                args = join(positions, ", ", " and ")
                return "column(s) $badcols are missing from argument(s) $args"
            end
            throw(ArgumentError(join(estrings, ", ", ", and ")))
        end
        header = unionunique
    elseif cols === :intersect
        header = intersectunique
    elseif cols === :union
        header = unionunique
    elseif cols isa Symbol
        throw(ArgumentError("Invalid `cols` value :$cols. " *
                            "Only `:orderequal`, `:setequal`, `:intersect`, " *
                            "`:union`, or a vector of column names is allowed."))
    elseif cols isa AbstractVector{Symbol}
        header = cols
    else
        @assert cols isa AbstractVector{<:AbstractString}
        header = Symbol.(cols)
    end

    if isempty(header)
        out_df = DataFrame()
    else
        all_cols = Vector{AbstractVector}(undef, length(header))
        for (i, name) in enumerate(header)
            # one source per data frame: the stored column, or a run of
            # `missing` values when the data frame lacks this column
            pieces = map(dfs) do df
                return hasproperty(df, name) ? df[!, name] :
                                               Iterators.repeated(missing, nrow(df))
            end

            lens = map(length, pieces)
            T = mapreduce(eltype, promote_type, pieces)
            dest = Tables.allocatecolumn(T, sum(lens))
            all_cols[i] = dest
            # copy the pieces one after another into the destination column
            offset = 1
            for (piece, len) in zip(pieces, lens)
                copyto!(dest, offset, piece)
                offset += len
            end
        end

        out_df = DataFrame(all_cols, header, copycols=false)
    end

    # here we process column-level metadata, table-level metadata is processed in reduce

    # first check if all data frames do not have column-level metadata
    # in which case we do not have to do anything
    if all(df -> getfield(parent(df), :colmetadata) === nothing, dfs)
        return out_df
    end

    for colname in _names(out_df)
        if length(dfs) == 1
            only_df = dfs[1]
            if hasproperty(only_df, colname)
                _copy_col_note_metadata!(out_df, colname, only_df, colname)
            end
            continue
        end

        # the first data frame holding this column provides the candidate keys
        first_idx = findfirst(x -> hasproperty(x, colname), dfs)
        first_idx === nothing && continue
        ref_df = dfs[first_idx]
        for key in colmetadatakeys(ref_df, colname)
            ref_val, ref_style = colmetadata(ref_df, colname, key, style=true)
            ref_style === :note || continue
            # a key is kept only if every later data frame that has the column
            # also has the key with :note style and an `isequal` value;
            # `all` stops at the first data frame that disagrees
            agreed = all(first_idx+1:length(dfs)) do i
                other = dfs[i]
                hasproperty(other, colname) || return true
                (key in colmetadatakeys(other, colname)) || return false
                other_val, other_style = colmetadata(other, colname, key, style=true)
                return other_style === :note && isequal(ref_val, other_val)
            end
            agreed && colmetadata!(out_df, colname, key, ref_val, style=:note)
        end
    end

    return out_df
end
2098
-
2099
1817
"""
2100
1818
repeat(df::AbstractDataFrame; inner::Integer = 1, outer::Integer = 1)
2101
1819
0 commit comments