Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

For a 0.2.2 release #12

Merged
merged 8 commits into from
Feb 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
name = "LearnTestAPI"
uuid = "3111ed91-c4f2-40e7-bb19-7f6c618409b8"
authors = ["Anthony D. Blaom <[email protected]>"]
version = "0.2.1"
version = "0.2.2"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
CategoricalDistributions = "af321ab8-2d2e-40a6-b165-3d674595d28e"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
IsURL = "ceb4388c-583f-448d-bb30-00b11e8c5682"
LearnAPI = "92ad9a40-7767-427a-9ee6-6e577f1266cb"
LearnDataFrontEnds = "5cca22a3-9356-470e-ba1b-8268d0135a4b"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MLCore = "c2834f40-e789-41da-a90e-33b280584a8c"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
Expand All @@ -22,6 +25,8 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"

[compat]
CategoricalArrays = "0.10.8"
CategoricalDistributions = "0.1.15"
Distributions = "0.25"
InteractiveUtils = "<0.0.1, 1"
IsURL = "0.2.0"
Expand All @@ -46,7 +51,16 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[targets]
test = ["DataFrames", "Distributions", "Random", "LinearAlgebra", "Statistics", "Tables"]
test = [
"DataFrames",
"Distributions",
"Random",
"LinearAlgebra",
"Statistics",
"StatsModels",
"Tables",
]
4 changes: 3 additions & 1 deletion src/LearnTestAPI.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
LearnTestAPI

Module for testing implementations of the interfacde defined in
Module for testing implementations of the interface defined in
[LearnAPI.jl](https://juliaai.github.io/LearnAPI.jl/dev/).

If your package defines an object `learner` implementing the interface, then put something
Expand Down Expand Up @@ -46,12 +46,14 @@ using LinearAlgebra
using Random
using Statistics
using UnPack
import LearnDataFrontEnds

include("tools.jl")
include("logging.jl")
include("testapi.jl")
include("learners/static_algorithms.jl")
include("learners/regression.jl")
include("learners/classification.jl")
include("learners/ensembling.jl")
# next learner excluded because of heavy dependencies:
# include("learners/gradient_descent.jl")
Expand Down
118 changes: 118 additions & 0 deletions src/learners/classification.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# This file defines `ConstantClassifier()`

using LearnAPI
import LearnDataFrontEnds as FrontEnds
import MLCore
import CategoricalArrays
import CategoricalDistributions
import CategoricalDistributions.OrderedCollections.OrderedDict
import CategoricalDistributions.Distributions.StatsBase.proportionmap

# The implementation of a constant classifier below is not the simplest, but it
# demonstrates some patterns that apply more generally in classification, including
# inclusion of the canned data front end, `Sage`.

"""
    ConstantClassifier()

Instantiate a constant (dummy) classifier. Can predict `Point` or `Distribution` targets.

"""
struct ConstantClassifier end # no fields: the learner has no hyperparameters

# Output of `fit(::ConstantClassifier, ...)`. Fields are left abstract for simplicity.
struct ConstantClassifierFitted
    learner::ConstantClassifier
    probabilities         # class proportions in the training target, ordered by code
    names::Vector{Symbol} # feature names, as reported by `LearnAPI.feature_names`
    classes_seen          # classes appearing in the training target
    codes_seen            # sorted integer codes appearing in the training target
    decoder               # maps an integer code back to the corresponding class
end

LearnAPI.learner(model::ConstantClassifierFitted) = model.learner

# Attach the canned `Sage` data front end, so that `obs` returns objects of type
# `FrontEnds.Obs`:
const front_end = FrontEnds.Sage(code_type=:small)
function LearnAPI.obs(learner::ConstantClassifier, data)
    return FrontEnds.fitobs(learner, data, front_end)
end
function LearnAPI.obs(model::ConstantClassifierFitted, data)
    return obs(model, data, front_end)
end

# training-data deconstructors, delegated to the data front end:
function LearnAPI.features(learner::ConstantClassifier, data)
    return LearnAPI.features(learner, data, front_end)
end
function LearnAPI.target(learner::ConstantClassifier, data)
    return LearnAPI.target(learner, data, front_end)
end

# Record the empirical class distribution of the training target. The `verbosity`
# keyword is accepted (per the LearnAPI contract) but not used here.
function LearnAPI.fit(learner::ConstantClassifier, observations::FrontEnds.Obs; verbosity=1)
    codes = observations.target # target as integer "codes"
    distinct_codes = sort(unique(codes))

    # class proportions keyed by code, sorted by code:
    code_to_proportion = OrderedDict(proportionmap(codes))
    sort!(code_to_proportion)
    probs = collect(values(code_to_proportion))

    return ConstantClassifierFitted(
        learner,
        probs,
        observations.names,
        observations.classes_seen,
        distinct_codes,
        observations.decoder,
    )
end
# slurp user-provided data through the front end:
function LearnAPI.fit(learner::ConstantClassifier, data; kwargs...)
    return fit(learner, obs(learner, data); kwargs...)
end

# point predictions: every observation gets the modal (most frequent) training class
function LearnAPI.predict(
    model::ConstantClassifierFitted,
    ::Point,
    observations::FrontEnds.Obs,
)
    nobs = MLCore.numobs(observations)
    modal_code = model.codes_seen[argmax(model.probabilities)]
    codes = fill(modal_code, nobs)
    return model.decoder.(codes)
end
function LearnAPI.predict(model::ConstantClassifierFitted, ::Point, data)
    return predict(model, Point(), obs(model, data))
end

# probabilistic predictions: every observation gets the empirical training distribution
function LearnAPI.predict(
    model::ConstantClassifierFitted,
    ::Distribution,
    observations::FrontEnds.Obs,
)
    nobs = MLCore.numobs(observations)
    p = model.probabilities
    # matrix with one (identical) row of probabilities per observation:
    prob_rows = permutedims(reshape(repeat(p, nobs), (length(p), nobs)))
    return CategoricalDistributions.UnivariateFinite(model.classes_seen, prob_rows)
end
function LearnAPI.predict(model::ConstantClassifierFitted, ::Distribution, data)
    return predict(model, Distribution(), obs(model, data))
end

# accessor function exposing the training feature names:
function LearnAPI.feature_names(model::ConstantClassifierFitted)
    return model.names
end

# Declare the LearnAPI.jl traits of `ConstantClassifier`: its constructor, the kinds
# of target proxy it can predict, and the LearnAPI functions it implements.
@trait(
    ConstantClassifier,
    constructor = ConstantClassifier,
    kinds_of_proxy = (Point(),Distribution()),
    tags = ("classification",),
    functions = (
        :(LearnAPI.fit),
        :(LearnAPI.learner),
        :(LearnAPI.clone),
        :(LearnAPI.strip),
        :(LearnAPI.obs),
        :(LearnAPI.features),
        :(LearnAPI.target),
        :(LearnAPI.predict),
        :(LearnAPI.feature_names),
    )
)

# NOTE(review): `true` is the value returned when this file is `include`d —
# presumably checked by the surrounding package/tests; confirm.
true
36 changes: 29 additions & 7 deletions src/learners/dimension_reduction.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
# This file defines `TruncatedSVD(; codim=1)`

using LearnAPI
using LinearAlgebra
using LinearAlgebra
import LearnDataFrontEnds as FrontEnds


# # DIMENSION REDUCTION USING TRUNCATED SVD DECOMPOSITION

# Recall that truncated SVD reduction is the same as PCA reduction, but without
# centering. We suppose observations are presented as the columns of a `Real` matrix.
# centering.

# Some struct fields are left abstract for simplicity.

Expand All @@ -23,6 +24,11 @@ end
Instantiate a truncated singular value decomposition algorithm for reducing the dimension
of observations by `codim`.

Data can be provided to `fit` or `transform` in any form supported by the `Tarragon` data
front end at LearnDataFrontEnds.jl. However, the outputs of `transform` and
`inverse_transform` are always matrices.


```julia
learner = Truncated()
X = rand(3, 100) # 100 observations in 3-space
Expand All @@ -49,10 +55,21 @@ end

LearnAPI.learner(model::TruncatedSVDFitted) = model.learner

function LearnAPI.fit(learner::TruncatedSVD, X; verbosity=1)
# add a canned data front end; `obs` will return objects of type `FrontEnds.Obs`:
LearnAPI.obs(learner::TruncatedSVD, data) =
FrontEnds.fitobs(learner, data, FrontEnds.Tarragon())
LearnAPI.obs(model::TruncatedSVDFitted, data) =
obs(model, data, FrontEnds.Tarragon())

# training data deconstructor:
LearnAPI.features(learner::TruncatedSVD, data) =
LearnAPI.features(learner, data, FrontEnds.Tarragon())

function LearnAPI.fit(learner::TruncatedSVD, observations::FrontEnds.Obs; verbosity=1)

# unpack hyperparameters:
codim = learner.codim
X = observations.features
p, n = size(X)
n ≥ p || error("Insufficient number observations. ")
outdim = p - codim
Expand All @@ -70,14 +87,19 @@ function LearnAPI.fit(learner::TruncatedSVD, X; verbosity=1)
return TruncatedSVDFitted(learner, U, Ut, singular_values)

end
LearnAPI.fit(learner::TruncatedSVD, data; kwargs...) =
LearnAPI.fit(learner, LearnAPI.obs(learner, data); kwargs...)

LearnAPI.transform(model::TruncatedSVDFitted, X) = model.Ut*X
LearnAPI.transform(model::TruncatedSVDFitted, observations::FrontEnds.Obs) =
model.Ut*(observations.features)
LearnAPI.transform(model::TruncatedSVDFitted, data) =
LearnAPI.transform(model, obs(model, data))

# convenience fit-transform:
LearnAPI.transform(learner::TruncatedSVD, X; kwargs...) =
transform(fit(learner, X; kwargs...), X)
LearnAPI.transform(learner::TruncatedSVD, data; kwargs...) =
transform(fit(learner, data; kwargs...), data)

LearnAPI.inverse_transform(model::TruncatedSVDFitted, W) = model.U*W
LearnAPI.inverse_transform(model::TruncatedSVDFitted, W::AbstractMatrix) = model.U*W

# accessor function:
function LearnAPI.extras(model::TruncatedSVDFitted)
Expand Down
3 changes: 2 additions & 1 deletion src/learners/ensembling.jl
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ LearnAPI.components(model::EnsembleFitted) = [:atom => model.models,]
# - `out_of_sample_losses`

# For simplicity, this implementation is restricted to univariate features. The simplistic
# algorithm is explained in the docstring. of the data presented.
# algorithm is explained in the docstring.


# ## HELPERS
Expand Down Expand Up @@ -276,6 +276,7 @@ function update!(
stump = Stump(ξ, left, right)
push!(forest, stump)
new_predictions = _predict(stump, x)

# efficient in-place update of `predictions`:
predictions .= (k*predictions .+ new_predictions)/(k + 1)
push!(training_losses, (predictions[training_indices] .- ytrain).^2 |> sum)
Expand Down
Loading
Loading