-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodels.py
244 lines (186 loc) · 7.33 KB
/
models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
from dataclasses import dataclass
from typing import List, Optional, Tuple, Dict
from nltk.tree import Tree as NltkTree
@dataclass
class PlainSentence:
    """
    A sentence in its untokenized form: either exactly as it appeared in the source material, or as
    transcribed following typical English orthographic conventions.

    Attributes:
        string: str - the raw text of the sentence
    """

    string: str
@dataclass
class TreebankedSentence:
    """
    A sentence in its treebanked form, i.e. after null elements (for example *PRO* for control) have
    been added to it.

    Attributes:
        string: str - the tokenized, enriched sentence; tokens are separated by a single space.
        tokens: List[str] - for convenience, the tokens which constitute the sentence.
    """

    string: str
    tokens: List[str]
@dataclass
class SpeakerInformation:
    """
    Metadata that, in the usual case, identifies a speaker together with the points at which they
    started and stopped talking in the media file the document was sourced from. Be aware that some
    subcorpora (the Bible subcorpus, for one) may use this field with a different meaning.

    Attributes:
        name: Optional[str] - usually, the speaker's name
        start_time: Optional[str] - usually, the offset in seconds (decimal-formatted) into the media file
                                    at which speaking began
        stop_time: Optional[str] - usually, the offset in seconds (decimal-formatted) into the media file
                                   at which speaking stopped
    """

    name: Optional[str] = None
    start_time: Optional[str] = None
    stop_time: Optional[str] = None
@dataclass
class Tree:
    """
    The Penn Treebank tree for a sentence.

    Attributes:
        tree_string: str - S-expression formatted PTB tree.
        parsed_tree: NltkTree - `tree_string` parsed into an nltk.tree.Tree instance. Parsed on first
                                access and cached afterwards.
    """

    tree_string: str

    @property
    def parsed_tree(self) -> NltkTree:
        """
        Return `tree_string` parsed with `nltk.tree.Tree.fromstring`.

        The parse result is cached on the instance, so repeated accesses return the same NltkTree
        object (in-place changes made to it remain visible on later accesses). The cache is tied to
        the string that was parsed: if `tree_string` is reassigned, the next access parses again.
        """
        # The cache deliberately uses a single-underscore attribute. A double-underscore name written
        # as `self.__x` inside the class is mangled to `_Tree__x`, while the string handed to
        # hasattr()/getattr() is not mangled, so a lookup by the unmangled name never finds the value
        # and the tree would be re-parsed on every access.
        cached = getattr(self, "_parsed_tree_cache", None)
        if cached is not None and cached[0] == self.tree_string:
            return cached[1]

        parsed = NltkTree.fromstring(self.tree_string)
        # Remember which string produced this tree so a reassigned `tree_string` invalidates the cache.
        self._parsed_tree_cache = (self.tree_string, parsed)
        return parsed
@dataclass
class PropArg:
    """
    One argument of a PropBank frame annotation.

    Attributes:
        token_id: int - 0-indexed offset of the token that heads this argument
        height: int - how many levels up the tree, starting from `token_id`, one must climb to reach the
                      constituent corresponding to the argument's projection in the tree.
        tokens: List[str] - for convenience, the tokens which constitute the argument.
    """

    token_id: int
    height: int
    tokens: List[str]
@dataclass
class Prop:
    """
    The PropBank frame annotation attached to a given token. Used by Leaf.

    Attributes:
        label: str - the PropBank label of this annotation
        args: Dict[str, List[PropArg]] - the arguments of the PropBank annotation
    """

    label: str
    args: Dict[str, List[PropArg]]
@dataclass
class Coref:
    """
    The coreference label carried by a coreferent mention. Used by Leaf.

    Attributes:
        type: str - Coreference type. Attested values are "APPOS HEAD", "APPOS ATTRIB", "IDENT"
        chain_id: str - ID of the chain this mention is a member of. Cf. Sentence.chains
        token_id_range: Tuple[int, int] - *INCLUSIVE* 0-indexed range of the tokens in the sentence that
                                          make up the mention.
        tokens: List[str] - for convenience, the tokens which constitute the mention
    """

    type: str
    chain_id: str
    token_id_range: Tuple[int, int]
    tokens: List[str]
@dataclass
class Name:
    """
    A named entity recognition (NER) label. Used by Leaf.

    Attributes:
        type: str - Entity type. Attested values are:
                    'TIME', 'MONEY', 'PERSON', 'PRODUCT', 'QUANTITY',
                    'ORG', 'DATE', 'LOC', 'FAC', 'CARDINAL', 'LAW',
                    'WORK_OF_ART', 'GPE', 'LANGUAGE', 'PERCENT',
                    'NORP', 'ORDINAL', 'EVENT'
        token_id_range: Tuple[int, int] - *INCLUSIVE* 0-indexed range of the tokens in the sentence that
                                          make up the mention of this entity.
        tokens: List[str] - for convenience, the tokens which constitute the mention
    """

    type: str
    token_id_range: Tuple[int, int]
    tokens: List[str]
@dataclass
class Sense:
    """
    A word sense disambiguation (WSD) label. Used by Leaf.

    Attributes:
        label: str - the WordNet sense label
    """

    label: str
@dataclass
class Leaf:
    """
    A single token together with its token- and span-level annotations: propbanking, coreference,
    named entity recognition, and word sense disambiguation.

    Attributes:
        token_id: int - 0-indexed offset of the token within the sentence
        token: str - the token's form as it appears in the tree
        prop: Optional[Prop] - propbanking information
        coref: Optional[Coref] - coreference information
        name: Optional[Name] - NER information
        sense: Optional[Sense] - WSD information
    """

    token_id: int
    token: str
    prop: Optional[Prop] = None
    coref: Optional[Coref] = None
    name: Optional[Name] = None
    sense: Optional[Sense] = None
@dataclass
class Sentence:
    """
    Bundles most of the annotations that OntoNotes provides for one sentence.

    Attributes:
        plain_sentence: Optional[PlainSentence] - the sentence prior to processing
        treebanked_sentence: Optional[TreebankedSentence] - the sentence once tokenized and enriched with nulls
        speaker_information: Optional[SpeakerInformation] - optional metadata, usually about the speaker
        tree: Optional[Tree] - the sentence's Penn Treebank tree, formatted as an S-expression
        leaves: Optional[List[Leaf]] - the tokens, each holding token- and span-level annotations for NER,
                                       coreference, word sense disambiguation, and propbanking
    """

    plain_sentence: Optional[PlainSentence] = None
    treebanked_sentence: Optional[TreebankedSentence] = None
    speaker_information: Optional[SpeakerInformation] = None
    tree: Optional[Tree] = None
    leaves: Optional[List[Leaf]] = None
@dataclass
class Mention:
    """
    One mention of an entity inside a Chain.

    Attributes:
        sentence_id: int - 0-indexed position, within the document, of the sentence containing the mention
        token_id_range: Tuple[int, int] - *INCLUSIVE* range of the tokens that make up the mention
        tokens: List[str] - for convenience, the tokens which constitute the mention
    """

    sentence_id: int
    token_id_range: Tuple[int, int]
    tokens: List[str]
@dataclass
class Chain:
    """
    The coreference chain of one entity within a document.

    Attributes:
        id: str - identifier of the chain, unique within the document. Often, but not necessarily, an integer.
        type: str - either APPOS for apposition, or IDENT for normal coreference
        mentions: List[Mention] - the mentions of the entity that together form the chain
    """

    id: str
    type: str
    mentions: List[Mention]
@dataclass
class Section:
    """
    Represents a document in OntoNotes, aka a section.

    Attributes:
        sentences: List[Sentence] - a list of `Sentence` objects containing most annotations
        chains: Optional[List[Chain]] - a list of `Chain` objects each describing a single coreference chain,
                                        or None. Defaults to None.
    """

    sentences: List[Sentence]
    # Defaulted to None like the other Optional fields in this module; passing `chains` explicitly,
    # positionally or by keyword, keeps working as before.
    chains: Optional[List[Chain]] = None