Skip to content

Commit 39e7aab

Browse files
authored
Merge pull request #158 from JuliaAI/text-ngram-fix
Fix scitype for multisets of tuples of strings (and tagged strings)
2 parents 8bb610c + d2ff22e commit 39e7aab

File tree

3 files changed

+50
-6
lines changed

3 files changed

+50
-6
lines changed

Project.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "ScientificTypes"
22
uuid = "321657f4-b219-11e9-178b-2701a2544e81"
33
authors = ["Anthony D. Blaom <[email protected]>"]
4-
version = "2.2.1"
4+
version = "2.2.2"
55

66
[deps]
77
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"

src/convention/scitype.jl

+14-1
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,14 @@ ST.scitype(::Distributions.Distribution{F,S}) where {F,S} =
6868

6969
# Text analysis - EXPERIMENTAL
7070

71+
# This would be less of a hack if some of #155 were adopted.
72+
7173
type2scitype(T::Type) = ST.Scitype(T, DefaultConvention())
7274
type2scitype(::Type{<:AbstractVector{T}}) where T =
7375
AbstractVector{type2scitype(T)}
76+
# Homogeneous tuple *types* map element-wise: `NTuple{N,T}` becomes
# `NTuple{N,type2scitype(T)}`. Dispatch is on `Type{...}` (not on a tuple
# instance) because `type2scitype` is always called with a type, and the
# tuple length `N` is preserved in the result.
type2scitype(::Type{<:NTuple{N,T}}) where {N,T} = NTuple{N,type2scitype(T)}
77+
const PlainNGram{N} = NTuple{N,<:AbstractString}
78+
const TaggedNGram{N} = NTuple{N,<:CorpusLoaders.TaggedWord}
7479
ST.scitype(::TaggedWord, ::DefaultConvention) = Annotated{Textual}
7580
ST.scitype(::Document{<:AbstractVector{T}}, ::DefaultConvention) where T =
7681
Annotated{AbstractVector{type2scitype(T)}}
@@ -80,7 +85,15 @@ ST.scitype(::AbstractDict{<:TaggedWord,<:Integer},
8085
::DefaultConvention) = Multiset{Annotated{Textual}}
8186
ST.scitype(::AbstractDict{<:Union{TaggedWord,AbstractString},<:Integer},
8287
::DefaultConvention) =
83-
Multiset{Annotated{Textual}}
88+
Multiset{Union{Textual,Annotated{Textual}}}
89+
ST.scitype(::AbstractDict{<:PlainNGram{N}}) where N =
90+
Multiset{NTuple{N,Textual}}
91+
ST.scitype(::AbstractDict{<:TaggedNGram{N}}) where N =
92+
Multiset{NTuple{N,Annotated{Textual}}}
93+
ST.scitype(::AbstractDict{<:PlainNGram}) =
94+
Multiset{NTuple{<:Any,Textual}}
95+
ST.scitype(::AbstractDict{<:TaggedNGram}) =
96+
Multiset{NTuple{<:Any,Annotated{Textual}}}
8497

8598
# Scitype for fast array broadcasting
8699

test/scitypes.jl

+35-4
Original file line numberDiff line numberDiff line change
@@ -248,12 +248,43 @@ end
248248
@test scitype(bag_of_words) == Multiset{Textual}
249249
bag_of_tagged_words = Dict(tagged_word => 5)
250250
@test scitype(bag_of_tagged_words) == Multiset{Annotated{Textual}}
251-
@test scitype(Document("kadsfkj", "My Document")) == Unknown
252-
@test scitype(Document([tagged_word, tagged_word2], "My Other Doc")) ==
251+
@test scitype(Document("My Document", "kadsfkj")) == Unknown
252+
@test scitype(Document([tagged_word, tagged_word2])) ==
253+
Annotated{AbstractVector{Annotated{Textual}}}
254+
@test scitype(Document("My Other Doc", [tagged_word, tagged_word2])) ==
253255
Annotated{AbstractVector{Annotated{Textual}}}
254256
nested_tokens = [["dog", "cat"], ["bird", "cat"]]
255-
@test scitype(Document(nested_tokens), "Essay Number 1") ==
256-
Annotated{AbstractVector{AbstractVector{Textual}}}
257+
@test scitype(Document("Essay Number 1", nested_tokens)) ==
258+
Annotated{AbstractVector{AbstractVector{Textual}}}
259+
260+
@test scitype(Dict(("cat", "in") => 3)) == Multiset{Tuple{Textual,Textual}}
261+
bag_of_words = Dict("cat in" => 1,
262+
"the hat" => 1,
263+
"the" => 2,
264+
"cat" => 1,
265+
"hat" => 1,
266+
"in the" => 1,
267+
"in" => 1,
268+
"the cat" => 1)
269+
bag_of_ngrams =
270+
Dict(Tuple(String.(split(k))) => v for (k, v) in bag_of_words)
271+
# Dict{Tuple{String, Vararg{String, N} where N}, Int64} with 8 entries:
272+
# ("cat",) => 1
273+
# ("cat", "in") => 1
274+
# ("in",) => 1
275+
# ("the", "hat") => 1
276+
# ("the",) => 2
277+
# ("hat",) => 1
278+
# ("in", "the") => 1
279+
# ("the", "cat") => 1
280+
@test scitype(bag_of_ngrams) == Multiset{NTuple{<:Any,Textual}}
281+
282+
@test scitype(Dict((tagged_word, tagged_word2) => 3)) ==
283+
Multiset{Tuple{Annotated{Textual},Annotated{Textual}}}
284+
bag_of_ngrams = Dict((tagged_word, tagged_word2) => 3,
285+
(tagged_word,) => 7)
286+
@test scitype(bag_of_ngrams) == Multiset{NTuple{<:Any,Annotated{Textual}}}
287+
257288
end
258289

259290
@testset "Autotype+tight" begin

0 commit comments

Comments
 (0)