-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathinstance.py
212 lines (165 loc) · 4.4 KB
/
instance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
class Instance:
    """
    One example of input data: the text, its label (when the line
    carries one) and the feature table computed from the line.
    """

    def __init__(self, line, preserve=False):
        """
        Build an instance from a single line of text.

        :param line: The input line
        :param preserve: when true the whole line is stored as the value
            and the label is left as None; otherwise the first two
            characters become the label and the rest becomes the value
        """
        label, text = (None, line) if preserve else (line[:2], line[2:])
        self.goal = label
        self.value = text
        # Features are computed from the line exactly as it was passed in,
        # regardless of how it was split above.
        self.features = get_features(line)
        self.weight = None
def get_features(line):
    """
    Build the feature table for one line of text.

    :param line: The line to be operated on
    :return: dict mapping each feature name to its value
    """
    vowel_bucket, consonant_bucket = vow_con_pairs(line)
    tokens = set(line.split())

    table = {}

    # Bucketed measurements of the whole line.
    table["cv-ratio"] = vow_con_ratio(line)
    table["av-len"] = avg_word_len(line)
    table["v-pairs"] = vowel_bucket
    table["c-pairs"] = consonant_bucket
    table["l-pairs"] = letter_pairs(line)

    # Word-ending and substring checks.
    table["ends-en"] = ends_in("en", line)
    table["ends-e"] = ends_in("e", line)
    table["has-aa"] = "aa" in line
    table["has-ee"] = "ee" in line

    # Presence of specific whitespace-separated words.
    table["has-word-het"] = "het" in tokens
    table["has-word-een"] = "een" in tokens
    table["has-word-en"] = "en" in tokens
    table["has-word-de"] = "de" in tokens
    table["has-word-the"] = "the" in tokens
    table["has-word-and"] = "and" in tokens
    table["has-word-in"] = "in" in tokens
    table["has-word-of"] = "of" in tokens

    return table
def ends_in(suffix, line):
    """
    Check whether any whitespace-separated word in the line ends
    with "suffix".

    :param suffix: the suffix
    :param line: the line
    :return: True if at least one word ends with the suffix,
        False otherwise (including when the line has no words)
    """
    # str.endswith already handles words shorter than the suffix, so no
    # manual length check or character-by-character comparison is needed.
    return any(word.endswith(suffix) for word in line.split())
def letter_pairs(line):
    """
    Bucket the number of positions where a character is immediately
    followed by an identical character, e.g. "oo" or "qq".

    Every adjacent position is examined, so a run such as "ooo"
    contributes two, and any repeated character counts, not only
    letters.

    :param line: the line
    :return: the range the count falls into
    """
    repeats = sum(1 for left, right in zip(line, line[1:]) if left == right)
    return pair_range(repeats)
def vow_con_pairs(line):
    """
    Bucket the number of doubled vowels and the number of other doubled
    characters in the line, e.g. "oo" and "jj".

    The scan is non-overlapping: once two equal neighbours are counted,
    both characters are skipped. A doubled character that is not one of
    the lowercase vowels a/e/i/o/u is counted on the consonant side.

    :param line: the line
    :return: tuple (vowel pair range, consonant pair range)
    """
    vowels = {"a", "e", "i", "o", "u"}
    vowel_pairs = 0
    other_pairs = 0
    pos = 0
    last = len(line) - 1
    while pos < last:
        current = line[pos]
        if current != line[pos + 1]:
            pos += 1
            continue
        if current in vowels:
            vowel_pairs += 1
        else:
            other_pairs += 1
        # Step past both characters of the pair just counted.
        pos += 2
    return pair_range(vowel_pairs), pair_range(other_pairs)
def pair_range(pair_count):
    """
    Map a pair count onto the bucket it belongs to.

    Counts up to 3 give (0, 3), 4 to 7 give (4, 7), 8 to 10 give
    (8, 10), and anything else gives (10, None).

    :param pair_count: a count of pairs
    :return: the appropriate range as a (low, high) tuple
    """
    lowest = (0, 3)
    if pair_count <= lowest[1]:
        return lowest
    for low, high in ((4, 7), (8, 10)):
        if low <= pair_count <= high:
            return low, high
    return 10, None
def avg_word_len(line):
    """
    Bucket the average word length of a line.

    Words are the whitespace-separated tokens of the line. Only the
    characters inside words are counted (separating whitespace and a
    trailing newline are not), and the average is rounded down to an
    integer before bucketing.

    :param line: the line
    :return: (0, 4) when the average is at most 4, (5, 8) when it is
        5 to 8, (8, None) when it is above 8. A line without any words
        yields (0, 4).
    """
    range1 = 0, 4
    range2 = 5, 8
    # Label kept as in the existing feature values; only averages
    # above 8 ever reach this bucket.
    range3 = 8, None

    words = line.split()
    if not words:
        # Nothing to average; avoid dividing by zero.
        return range1

    avg = sum(len(word) for word in words) // len(words)
    if avg <= 4:
        return range1
    if avg <= 8:
        return range2
    return range3
def vow_con_ratio(line):
    """
    Bucket the ratio of vowels to all other characters in a line.

    The lowercase letters a/e/i/o/u count as vowels; every other
    character in the line (other letters, whitespace, punctuation,
    uppercase) counts on the other side of the ratio.

    :param line: the line
    :return: (0, 0.5) when the ratio is at most 0.5, (0.51, 0.69) when
        it is strictly between 0.5 and 0.7, (0.7, None) otherwise.
        When the line has no non-vowel characters the ratio is
        undefined: an empty line yields (0, 0.5) and an all-vowel line
        yields (0.7, None).
    """
    vowels = {"a", "e", "i", "o", "u"}
    range1 = 0, 0.5
    range2 = 0.51, 0.69
    range3 = 0.7, None

    v_count = sum(1 for ch in line if ch in vowels)
    c_count = len(line) - v_count

    if c_count == 0:
        # Avoid dividing by zero: no characters at all is the lowest
        # bucket, vowels with nothing else is the highest.
        return range3 if v_count else range1

    ratio = v_count / c_count
    if ratio <= range1[1]:
        return range1
    if ratio < range3[0]:
        return range2
    return range3