-- recurrentlanguagemodel.lua
require 'dp'
require 'rnn'
version = 9
--[[command line arguments]]--
cmd = torch.CmdLine()
cmd:text()
cmd:text('Train a Language Model on BillionWords or PennTreeBank (or your own) dataset using a Simple Recurrent Neural Network')
cmd:text('Example:')
cmd:text("$> th recurrentlanguagemodel.lua --dataset PennTreeBank --cuda --useDevice 2 --trainEpochSize -1 --dropout --bidirectional --hiddenSize '{200,200}' --zeroFirst --batchSize 32 --progress")
cmd:text('$> th recurrentlanguagemodel.lua --tiny --batchSize 64 ')
cmd:text('$> th recurrentlanguagemodel.lua --tiny --batchSize 64 --rho 5 --validEpochSize 10000 --trainEpochSize 100000 --softmaxtree')
cmd:text('Options:')
cmd:option('--learningRate', 0.1, 'learning rate at t=0')
cmd:option('--lrDecay', 'linear', 'type of learning rate decay : adaptive | linear | schedule | none')
cmd:option('--minLR', 0.00001, 'minimum learning rate')
cmd:option('--saturateEpoch', 300, 'epoch at which linear decayed LR will reach minLR')
cmd:option('--schedule', '{}', 'learning rate schedule')
cmd:option('--maxWait', 4, 'maximum number of epochs to wait for a new minimum to be found. After that, the learning rate is decayed by decayFactor.')
cmd:option('--decayFactor', 0.001, 'factor by which learning rate is decayed for adaptive decay.')
cmd:option('--momentum', 0, 'momentum')
cmd:option('--maxOutNorm', 2, "max l2-norm of each layer's output neuron weights")
cmd:option('--cutoffNorm', -1, 'max l2-norm of the concatenation of all gradParam tensors')
cmd:option('--batchSize', 64, 'number of examples per batch')
cmd:option('--evalSize', 100, 'size of context used for evaluation (more means more memory). With --bidirectional, specifies number of steps between each bwd rnn forget() (more means longer bwd recursions)')
cmd:option('--cuda', false, 'use CUDA')
cmd:option('--useDevice', 1, 'sets the device (GPU) to use')
cmd:option('--maxEpoch', 400, 'maximum number of epochs to run')
cmd:option('--maxTries', 30, 'maximum number of epochs to try to find a better local minimum for early-stopping')
cmd:option('--accUpdate', false, 'accumulate updates inplace using accUpdateGradParameters')
cmd:option('--progress', false, 'print progress bar')
cmd:option('--silent', false, "don't print anything to stdout")
cmd:option('--xpPath', '', 'path to a previously saved model')
cmd:option('--uniform', -1, 'initialize parameters using uniform distribution between -uniform and uniform. -1 means default initialization')
--[[ recurrent layer ]]--
cmd:option('--lstm', false, 'use Long Short Term Memory (nn.LSTM instead of nn.Recurrent)')
cmd:option('--bidirectional', false, 'use a Bidirectional RNN/LSTM (nn.BiSequencer instead of nn.Sequencer)')
cmd:option('--rho', 5, 'back-propagate through time (BPTT) for rho time-steps')
cmd:option('--hiddenSize', '{200}', 'number of hidden units used at output of each recurrent layer. When more than one is specified, RNN/LSTMs are stacked')
cmd:option('--zeroFirst', false, 'first step will forward zero through recurrence (i.e. add bias of recurrence). As opposed to learning bias specifically for first step.')
cmd:option('--dropout', false, 'apply dropout after each recurrent layer')
cmd:option('--dropoutProb', 0.5, 'probability of zeroing a neuron (dropout probability)')
--[[ output layer ]]--
cmd:option('--softmaxtree', false, 'use SoftmaxTree instead of the inefficient (full) softmax')
cmd:option('--softmaxforest', false, 'use SoftmaxForest instead of SoftmaxTree (uses more memory)')
cmd:option('--forestGaterSize', '{}', 'size of hidden layers used for forest gater (trees are experts)')
--[[ data ]]--
cmd:option('--dataset', 'BillionWords', 'which dataset to use : BillionWords | PennTreeBank | TextSource')
cmd:option('--trainEpochSize', 400000, 'number of train examples seen between each epoch')
cmd:option('--validEpochSize', 24000, 'number of valid examples used for early stopping and cross-validation')
cmd:option('--trainOnly', false, 'forget the validation and test sets, focus on the training set')
cmd:option('--dataPath', dp.DATA_DIR, 'path to data directory')
-- BillionWords
cmd:option('--small', false, 'use a small (1/30th) subset of the training set (BillionWords only)')
cmd:option('--tiny', false, 'use a tiny (1/100th) subset of the training set (BillionWords only)')
-- TextSource
cmd:option('--trainFile', 'train.txt', 'filename containing tokenized training text data')
cmd:option('--validFile', 'valid.txt', 'filename containing tokenized validation text data')
cmd:option('--testFile', 'test.txt', 'filename containing tokenized test text data')
cmd:text()
opt = cmd:parse(arg or {})
opt.schedule = dp.returnString(opt.schedule)
opt.hiddenSize = dp.returnString(opt.hiddenSize)
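-- table-valued string arguments like '{200,200}' are converted into actual Lua tables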
if not opt.silent then
   table.print(opt)
end
if opt.bidirectional and not opt.silent then
   print("Warning : the Perplexity of a bidirectional RNN/LSTM isn't "..
      "necessarily mathematically valid as it uses P(x_t|x_{\\neq t}) "..
      "instead of P(x_t|x_{<t}), which is used for unidirectional RNN/LSTMs. "..
      "You can however still use predictions to measure pseudo-likelihood.")
end
if opt.xpPath ~= '' then
   -- check that saved model exists
   assert(paths.filep(opt.xpPath), opt.xpPath..' does not exist')
end
--[[Data]]--
local train_file = 'train_data.th7'
if opt.small then
   train_file = 'train_small.th7'
elseif opt.tiny then
   train_file = 'train_tiny.th7'
end
if opt.dataset == 'BillionWords' then
   assert(not opt.bidirectional, "--bidirectional not yet supported with BillionWords")
   ds = dp.BillionWords{
      train_file=train_file, load_all=false,
      context_size=opt.rho, recurrent=true
   }
   ds:loadTrain()
   if not opt.trainOnly then
      ds:loadValid()
      ds:loadTest()
   end
elseif opt.dataset == 'PennTreeBank' or opt.dataset == 'TextSource' then
   assert(not opt.softmaxforest, "SoftMaxForest only supported with BillionWords")
   if opt.dataset == 'PennTreeBank' then
      ds = dp.PennTreeBank{
         context_size=opt.bidirectional and opt.rho+1 or opt.rho,
         recurrent=true, bidirectional=opt.bidirectional
      }
   elseif opt.dataset == 'TextSource' then
      ds = dp.TextSource{
         context_size=opt.bidirectional and opt.rho+1 or opt.rho,
         recurrent=true, bidirectional=opt.bidirectional,
         name='rnnlm', data_path = opt.dataPath,
         train=opt.trainFile, valid=opt.validFile, test=opt.testFile
      }
   end
   ds:validSet():contextSize(opt.evalSize)
   ds:testSet():contextSize(opt.evalSize)
else
   error"Unrecognized --dataset"
end
--[[Saved experiment]]--
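-- if --xpPath is given, resume the previously saved experiment on the dataset and exit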
if opt.xpPath ~= '' then
   if opt.cuda then
      require 'cunnx'
      cutorch.setDevice(opt.useDevice)
   end
   xp = torch.load(opt.xpPath)
   if opt.cuda then
      xp:cuda()
   else
      xp:float()
   end
   xp:run(ds)
   os.exit()
end
--[[Model]]--
-- language model
lm = nn.Sequential()
local inputSize = opt.hiddenSize[1]
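-- stack one recurrent layer per entry in --hiddenSize;
-- the first layer's input comes from the lookup table inserted further below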
for i,hiddenSize in ipairs(opt.hiddenSize) do
   if i ~= 1 and not opt.lstm then
      lm:add(nn.Sequencer(nn.Linear(inputSize, hiddenSize)))
   end
   -- recurrent layer
   local rnn
   if opt.lstm then
      -- Long Short Term Memory
      rnn = nn.Sequencer(nn.FastLSTM(inputSize, hiddenSize))
   else
      -- simple recurrent neural network
      rnn = nn.Recurrent(
         hiddenSize, -- first step will use nn.Add
         nn.Identity(), -- input layer is handled by the preceding LookupTable/Linear, so Identity suffices
         nn.Linear(hiddenSize, hiddenSize), -- feedback layer (recurrence)
         nn.Sigmoid(), -- transfer function
         99999 -- maximum number of time-steps per sequence
      )
      if opt.zeroFirst then
         -- this is equivalent to forwarding a zero vector through the feedback layer
         rnn.startModule:share(rnn.feedbackModule, 'bias')
      end
      rnn = nn.Sequencer(rnn)
   end
   lm:add(rnn)
   if opt.dropout then -- dropout is applied between recurrent layers
      lm:add(nn.Sequencer(nn.Dropout(opt.dropoutProb)))
   end
   inputSize = hiddenSize
end
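-- wrap the stack into a bidirectional LM : forward and backward outputs are
-- joined per time-step, which doubles the feature size seen by the output layer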
if opt.bidirectional then
   -- initialize BRNN with fwd, bwd RNN/LSTMs
   local bwd = lm:clone()
   bwd:reset()
   bwd:remember('neither')
   local brnn = nn.BiSequencerLM(lm, bwd)
   lm = nn.Sequential()
   lm:add(brnn)
   inputSize = inputSize*2
end
-- input layer (i.e. word embedding space)
lm:insert(nn.SplitTable(1,2), 1) -- tensor to table of tensors
if opt.dropout then
   lm:insert(nn.Dropout(opt.dropoutProb), 1)
end
lookup = nn.LookupTable(ds:vocabularySize(), opt.hiddenSize[1], opt.accUpdate)
lookup.maxOutNorm = -1 -- disable maxParamNorm on the lookup table
lm:insert(lookup, 1)
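-- resulting input pipeline : LookupTable -> (Dropout) -> SplitTable -> recurrent stack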
-- output layer
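-- SoftMaxTree/SoftMaxForest take {input, target} pairs, hence the ParallelTable/ZipTable
-- wiring below; otherwise a full Linear + LogSoftMax is used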
if opt.softmaxforest or opt.softmaxtree then
   -- input to nnlm is {inputs, targets} for nn.SoftMaxTree
   local para = nn.ParallelTable()
   para:add(lm):add(opt.cuda and nn.Sequencer(nn.Convert()) or nn.Identity())
   lm = nn.Sequential()
   lm:add(para)
   lm:add(nn.ZipTable())
   if opt.softmaxforest then -- requires a lot more memory
      local trees = {ds:hierarchy('word_tree1.th7'), ds:hierarchy('word_tree2.th7'), ds:hierarchy('word_tree3.th7')}
      local rootIds = {880542,880542,880542}
      softmax = nn.SoftMaxForest(inputSize, trees, rootIds, opt.forestGaterSize, nn.Tanh(), opt.accUpdate)
      opt.softmaxtree = true
   elseif opt.softmaxtree then -- uses frequency-based tree
      local tree, root = ds:frequencyTree()
      softmax = nn.SoftMaxTree(inputSize, tree, root, opt.accUpdate)
   end
else
   if #ds:vocabulary() > 50000 then
      print("Warning: you are using a full LogSoftMax for the last layer, which "..
         "is really slow (800,000 x outputEmbeddingSize multiply-adds "..
         "per example). Try --softmaxtree instead.")
   end
   softmax = nn.Sequential()
   softmax:add(nn.Linear(inputSize, ds:vocabularySize()))
   softmax:add(nn.LogSoftMax())
end
lm:add(nn.Sequencer(softmax))
if opt.uniform > 0 then
   for k,param in ipairs(lm:parameters()) do
      param:uniform(-opt.uniform, opt.uniform)
   end
end
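-- remember() controls when hidden state is carried across batches :
-- 'both' = training and evaluation, 'eval' = evaluation only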
if opt.dataset ~= 'BillionWords' then
   -- will recurse a single continuous sequence
   lm:remember(opt.lstm and 'both' or 'eval')
end
--[[Propagators]]--
if opt.lrDecay == 'adaptive' then
   ad = dp.AdaptiveDecay{max_wait = opt.maxWait, decay_factor = opt.decayFactor}
elseif opt.lrDecay == 'linear' then
   opt.decayFactor = (opt.minLR - opt.learningRate)/opt.saturateEpoch
end
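-- training propagator : TextSource-based datasets (PennTreeBank, TextSource) are
-- sampled as contiguous text, while BillionWords uses random sampling of examples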
train = dp.Optimizer{
   loss = opt.softmaxtree and nn.SequencerCriterion(nn.TreeNLLCriterion())
      or nn.ModuleCriterion(
         nn.SequencerCriterion(nn.ClassNLLCriterion()),
         nn.Identity(),
         opt.cuda and nn.Sequencer(nn.Convert()) or nn.Identity()
      ),
   epoch_callback = function(model, report) -- called every epoch
      if report.epoch > 0 then
         if opt.lrDecay == 'adaptive' then
            opt.learningRate = opt.learningRate*ad.decay
            ad.decay = 1
         elseif opt.lrDecay == 'schedule' and opt.schedule[report.epoch] then
            opt.learningRate = opt.schedule[report.epoch]
         elseif opt.lrDecay == 'linear' then
            opt.learningRate = opt.learningRate + opt.decayFactor
         end
         opt.learningRate = math.max(opt.minLR, opt.learningRate)
         if not opt.silent then
            print("learningRate", opt.learningRate)
            if opt.meanNorm then
               print("mean gradParam norm", opt.meanNorm)
            end
         end
      end
   end,
   callback = function(model, report) -- called every batch
      if opt.accUpdate then
         model:accUpdateGradParameters(model.dpnn_input, model.output, opt.learningRate)
      else
         if opt.cutoffNorm > 0 then
            local norm = model:gradParamClip(opt.cutoffNorm) -- affects gradParams
            opt.meanNorm = opt.meanNorm and (opt.meanNorm*0.9 + norm*0.1) or norm
         end
         model:updateGradParameters(opt.momentum) -- affects gradParams
         model:updateParameters(opt.learningRate) -- affects params
      end
      model:maxParamNorm(opt.maxOutNorm) -- affects params
      model:zeroGradParameters() -- affects gradParams
   end,
   feedback = dp.Perplexity(),
   sampler = torch.isTypeOf(ds, 'dp.TextSource')
      and dp.TextSampler{epoch_size = opt.trainEpochSize, batch_size = opt.batchSize}
      or dp.RandomSampler{epoch_size = opt.trainEpochSize, batch_size = opt.batchSize},
   acc_update = opt.accUpdate,
   progress = opt.progress
}
if not opt.trainOnly then
   valid = dp.Evaluator{
      feedback = dp.Perplexity(),
      sampler = torch.isTypeOf(ds, 'dp.TextSource')
         and dp.TextSampler{epoch_size = opt.validEpochSize, batch_size = 1}
         or dp.SentenceSampler{epoch_size = opt.validEpochSize, batch_size = 1, max_size = 100},
      progress = opt.progress
   }
   tester = dp.Evaluator{
      feedback = dp.Perplexity(),
      sampler = torch.isTypeOf(ds, 'dp.TextSource')
         and dp.TextSampler{batch_size = 1}
         or dp.SentenceSampler{batch_size = 1, max_size = 100} -- Note : remove max_size for exact test set perplexity (will cost more memory)
   }
end
--[[Experiment]]--
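-- early-stopping is driven by validation perplexity (training perplexity with --trainOnly);
-- target_module splits the targets into a table of per-time-step IntTensors for the SequencerCriterion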
xp = dp.Experiment{
   model = lm,
   optimizer = train,
   validator = valid,
   tester = tester,
   observer = {
      ad,
      dp.FileLogger(),
      dp.EarlyStopper{
         max_epochs = opt.maxTries,
         error_report = {opt.trainOnly and 'optimizer' or 'validator', 'feedback', 'perplexity', 'ppl'}
      }
   },
   random_seed = os.time(),
   max_epoch = opt.maxEpoch,
   target_module = nn.SplitTable(1,1):type('torch.IntTensor')
}
if opt.softmaxtree then
   -- makes it forward {input, target} instead of just input
   xp:includeTarget()
end
--[[GPU or CPU]]--
if opt.cuda then
   require 'cutorch'
   require 'cunn'
   if opt.softmaxtree or opt.softmaxforest then
      require 'cunnx'
   end
   cutorch.setDevice(opt.useDevice)
   xp:cuda()
end
xp:verbose(not opt.silent)
if not opt.silent then
   print"Language Model :"
   print(lm)
end
xp:run(ds)