Skip to content

Commit c662fd1

Browse files
authored
Merge pull request #156 from JuliaAI/dev
For a 2.2.0 release
2 parents 264cecd + 69680b3 commit c662fd1

File tree

6 files changed

+64
-25
lines changed

6 files changed

+64
-25
lines changed

Project.toml

+6-2
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,30 @@
11
name = "ScientificTypes"
22
uuid = "321657f4-b219-11e9-178b-2701a2544e81"
33
authors = ["Anthony D. Blaom <[email protected]>"]
4-
version = "2.1.3"
4+
version = "2.2"
55

66
[deps]
77
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
88
ColorTypes = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
9+
CorpusLoaders = "214a0ac2-f95b-54f7-a80b-442ed9c2c9e8"
910
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
1011
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
1112
PersistenceDiagramsBase = "b1ad91c1-539c-4ace-90bd-ea06abc420fa"
1213
PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
14+
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
1315
ScientificTypesBase = "30f210dd-8aff-4c5f-94ba-8e64358c1161"
1416
StatisticalTraits = "64bff920-2084-43da-a3e6-9bb72801c0c9"
1517
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
1618

1719
[compat]
1820
CategoricalArrays = "0.8, 0.9, 0.10"
1921
ColorTypes = "0.9, 0.10, 0.11"
22+
CorpusLoaders = "0.3.2"
2023
Distributions = "0.25"
2124
PersistenceDiagramsBase = "0.1"
2225
PrettyTables = "1"
23-
ScientificTypesBase = "2.1"
26+
Reexport = "1.2"
27+
ScientificTypesBase = "2.2"
2428
StatisticalTraits = "2"
2529
Tables = "1"
2630
julia = "1"

docs/src/index.md

+21-9
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,17 @@ ScientificTimeType
5151
Sampleable{Ω}
5252
└─ Density{Ω}
5353
54+
Annotated{S}
55+
56+
AnnotationFor{S}
57+
58+
Multiset{S}
59+
5460
Table{K}
5561
5662
Textual
5763
58-
PersistenceDiagram
64+
ManifoldPoint{MT}
5965
6066
Unknown
6167
```
@@ -109,26 +115,32 @@ typically much slower than calling `scitype` or `elscitype`.
109115
The table below summarizes the default convention for representing
110116
scientific types:
111117

112-
Type `T` | `scitype(x)` for `x::T` | package required
118+
Type `T` | `scitype(x)` for `x::T` | package/module required
113119
:-------------- | :-------------------------------- | :------------------------
114120
`Missing` | `Missing` |
115121
`Nothing` | `Nothing` |
116122
`AbstractFloat` | `Continuous` |
117123
`Integer` | `Count` |
118124
`String` | `Textual` |
119-
`CategoricalValue` | `Multiclass{N}` where `N = nlevels(x)`, provided `x.pool.ordered == false` | CategoricalArrays
120-
`CategoricalString` | `Multiclass{N}` where `N = nlevels(x)`, provided `x.pool.ordered == false` | CategoricalArrays
121-
`CategoricalValue` | `OrderedFactor{N}` where `N = nlevels(x)`, provided `x.pool.ordered == true`| CategoricalArrays
122-
`CategoricalString` | `OrderedFactor{N}` where `N = nlevels(x)` provided `x.pool.ordered == true` | CategoricalArrays
125+
`CategoricalValue` | `Multiclass{N}` where `N = nlevels(x)`, provided `x.pool.ordered == false` | CategoricalArrays.jl
126+
`CategoricalString` | `Multiclass{N}` where `N = nlevels(x)`, provided `x.pool.ordered == false` | CategoricalArrays.jl
127+
`CategoricalValue` | `OrderedFactor{N}` where `N = nlevels(x)`, provided `x.pool.ordered == true`| CategoricalArrays.jl
128+
`CategoricalString` | `OrderedFactor{N}` where `N = nlevels(x)`, provided `x.pool.ordered == true` | CategoricalArrays.jl
123129
`Date` | `ScientificDate` | Dates
124130
`Time` | `ScientificTime` | Dates
125131
`DateTime` | `ScientificDateTime` | Dates
126132
`Distributions.Sampleable{F,S}` | `Sampleable{Ω}` where `Ω` is the scitype of the sample space, according to `{F,S}` | Distributions.jl
127133
`Distributions.Distribution{F,S}` | `Density{Ω}` where `Ω` is the scitype of the sample space, according to `{F,S}` | Distributions.jl
128-
`AbstractArray{<:Gray,2}` | `GrayImage{W,H}` where `(W, H) = size(x)` | ColorTypes
129-
`AbstractArrray{<:AbstractRGB,2}` | `ColorImage{W,H}` where `(W, H) = size(x)` | ColorTypes
134+
`AbstractArray{<:Gray,2}` | `GrayImage{W,H}` where `(W, H) = size(x)` | ColorTypes.jl
135+
`AbstractArray{<:AbstractRGB,2}` | `ColorImage{W,H}` where `(W, H) = size(x)` | ColorTypes.jl
130136
`PersistenceDiagram` | `PersistenceDiagram` | PersistenceDiagramsBase
131-
any table type `T` supported by Tables.jl | `Table{K}` where `K=Union{column_scitypes...}` | Tables
137+
any table type `T` supported by Tables.jl | `Table{K}` where `K=Union{column_scitypes...}` | Tables.jl
138+
`CorpusLoaders.TaggedWord` | `Annotated{Textual}` | CorpusLoaders.jl
139+
`CorpusLoaders.Document{<:AbstractVector{Q}}` | `Annotated{AbstractVector{Scitype(Q)}}` | CorpusLoaders.jl
140+
`AbstractDict{<:AbstractString,<:Integer}` | `Multiset{Textual}` |
141+
`AbstractDict{<:CorpusLoaders.TaggedWord,<:Integer}` | `Multiset{Annotated{Textual}}` | CorpusLoaders.jl
142+
143+
*Experimental*: the text-analysis conventions above (the `Annotated` and `Multiset` rows) are subject to change in a new minor or patch release.
132144

133145
Here `nlevels(x) = length(levels(x.pool))`.
134146

src/ScientificTypes.jl

+3-10
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,19 @@
11
module ScientificTypes
22

33
# Dependencies
4-
using ScientificTypesBase
4+
using Reexport
5+
@reexport using ScientificTypesBase
56
using Tables
67
using CategoricalArrays
78
using ColorTypes
89
using PersistenceDiagramsBase
10+
using CorpusLoaders
911
using PrettyTables
1012
using Dates
1113
import Distributions
1214

1315
import StatisticalTraits: info
1416

15-
# re-exports from ScientificTypes
16-
export Scientific, Found, Unknown, Known, Finite, Infinite,
17-
OrderedFactor, Multiclass, Count, Continuous, Textual,
18-
Binary, ColorImage, GrayImage, Image, Table,
19-
ScientificTimeType, ScientificDate, ScientificDateTime,
20-
ScientificTime,
21-
Density, Sampleable
22-
export scitype, scitype_union, elscitype, nonmissing, trait
23-
2417
# re-export from StatisticalTraits
2518
export info
2619

src/convention/scitype.jl

+17-3
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,21 @@ ST.scitype(::Distributions.Sampleable{F,S}) where {F,S} =
6666
ST.scitype(::Distributions.Distribution{F,S}) where {F,S} =
6767
Density{space_scitype(F,S)}
6868

69-
70-
71-
69+
# Text analysis - EXPERIMENTAL
70+
71+
type2scitype(T::Type) = ST.Scitype(T, DefaultConvention())
72+
type2scitype(::Type{<:AbstractVector{T}}) where T =
73+
AbstractVector{type2scitype(T)}
74+
ST.scitype(::TaggedWord, ::DefaultConvention) = Annotated{Textual}
75+
ST.scitype(::Document{<:AbstractVector{T}}, ::DefaultConvention) where T =
76+
Annotated{AbstractVector{type2scitype(T)}}
77+
ST.scitype(::AbstractDict{<:AbstractString,<:Integer},
78+
::DefaultConvention) = Multiset{Textual}
79+
ST.scitype(::AbstractDict{<:TaggedWord,<:Integer},
80+
::DefaultConvention) = Multiset{Annotated{Textual}}
81+
ST.scitype(::AbstractDict{<:Union{TaggedWord,AbstractString},<:Integer},
82+
::DefaultConvention) =
83+
Multiset{Annotated{Textual}}
7284

7385
# Scitype for fast array broadcasting
7486

@@ -80,3 +92,5 @@ ST.Scitype(::Type{<:Date}, ::DefaultConvention) = ScientificDate
8092
ST.Scitype(::Type{<:Time}, ::DefaultConvention) = ScientificTime
8193
ST.Scitype(::Type{<:DateTime}, ::DefaultConvention) = ScientificDateTime
8294
ST.Scitype(::Type{<:PersistenceDiagram}, ::DefaultConvention) = PersistenceDiagram
95+
ST.Scitype(::Type{<:TaggedWord}, ::DefaultConvention) =
96+
Annotated{Textual}

test/runtests.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
using Test, ScientificTypes, ScientificTypesBase, Random
22
using Tables, CategoricalArrays, DataFrames
3-
using ColorTypes, PersistenceDiagramsBase
3+
using ColorTypes, PersistenceDiagramsBase, CorpusLoaders
44
using Dates
55
# using CSV # dropped until Julia releases a new LTS, because of an issue on Julia 1.0
66
import Distributions

test/scitypes.jl

+16
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,22 @@ end
240240
@test scitype(FooSampleable()) == Sampleable{Count}
241241
end
242242

243+
@testset "text analysis" begin
244+
tagged_word = CorpusLoaders.PosTaggedWord("NN", "wheelbarrow")
245+
tagged_word2 = CorpusLoaders.PosTaggedWord("NN", "soil")
246+
@test scitype(tagged_word) == Annotated{Textual}
247+
bag_of_words = Dict("cat"=>1, "dog"=>3)
248+
@test scitype(bag_of_words) == Multiset{Textual}
249+
bag_of_tagged_words = Dict(tagged_word => 5)
250+
@test scitype(bag_of_tagged_words) == Multiset{Annotated{Textual}}
251+
@test scitype(Document("kadsfkj", "My Document")) == Unknown
252+
@test scitype(Document([tagged_word, tagged_word2], "My Other Doc")) ==
253+
Annotated{AbstractVector{Annotated{Textual}}}
254+
nested_tokens = [["dog", "cat"], ["bird", "cat"]]
255+
@test scitype(Document(nested_tokens), "Essay Number 1") ==
256+
Annotated{AbstractVector{AbstractVector{Textual}}}
257+
end
258+
243259
@testset "Autotype+tight" begin
244260
x = [1,2,3,missing];
245261
x = x[1:3]

0 commit comments

Comments
 (0)