add Scheduled Sampling at training

This commit is contained in:
parent e9ad92ebfd
commit 9d37bba62c
Readme.md (22 changes)

@@ -1,6 +1,6 @@
# char-rnn-chinese
Based on Andrej Karpathy's char-rnn (https://github.com/karpathy/char-rnn) and Samy Bengio's paper (http://arxiv.org/abs/1506.03099), adapted to work well with Chinese.
## Chinese process
The code can process both English and Chinese characters.
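For illustration, handling Chinese means walking the input as UTF-8 characters rather than bytes. A minimal sketch of that idea in Lua (this is not the repo's exact code, which lives in the data loader, and assumes valid UTF-8 input):

```lua
-- iterate over a UTF-8 string one character at a time by reading
-- each character's byte length from its lead byte
local function utf8_chars(s)
    local i, n = 1, #s
    return function()
        if i > n then return nil end
        local b = s:byte(i)
        local len = (b < 0x80 and 1) or (b < 0xE0 and 2) or (b < 0xF0 and 3) or 4
        local ch = s:sub(i, i + len - 1)
        i = i + len
        return ch
    end
end

for ch in utf8_chars('char-rnn 中文') do io.write(ch, ' ') end
```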
@@ -10,6 +10,20 @@ This is my first touch of Lua, so the string process seems silly, but it works w
I also add an option called 'min_freq' because the vocabulary in Chinese is very large, which increases the parameter count a lot. So dropping some rare characters may help.
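The idea behind 'min_freq' is plain frequency thresholding. A minimal sketch of it, reusing the `utf8_chars` helper above (this is not the repo's loader code; `corpus` and the `UNK` id are illustrative assumptions):

```lua
-- keep only characters seen at least min_freq times; map the rest
-- to one shared rare-character id so the vocabulary (and thus the
-- embedding and softmax layers) stays small
local min_freq = 5 -- would come from the -min_freq option
local counts = {}
for ch in utf8_chars(corpus) do
    counts[ch] = (counts[ch] or 0) + 1
end
local vocab, next_id = { UNK = 1 }, 1
for ch, n in pairs(counts) do
    if n >= min_freq then
        next_id = next_id + 1
        vocab[ch] = next_id
    end
end
```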
## Scheduled Sampling
Samy Bengio's paper [Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks](http://arxiv.org/abs/1506.03099) (NIPS 2015) proposes a simple but powerful method to improve RNN training.
In my experiments, it helps a lot to avoid overfitting and pushes the test loss lower. I only use linear decay.
Use `-use_ss` to turn scheduled sampling on or off; it is on by default. `-start_ss` is the starting amount of real data; I suggest 1, because the model should learn from noise-free data at the very beginning. `-min_ss` is also very important, as too much noise will hurt performance. Finally, `-decay_ss` is the linear decay rate. See the sketch of the schedule below.
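These flags drive a simple per-epoch linear schedule, condensed here from the train.lua changes below. With the defaults (start at 1, decay 0.01666 per epoch, floor 0.5), the floor is reached after about 30 epochs:

```lua
-- ss_current is the probability of feeding the true input;
-- it decays linearly once per epoch until it reaches opt.min_ss
ss_current = opt.start_ss
for i = 1, iterations do
    local epoch = i / loader.ntrain
    -- ... forward/backward pass; at each timestep t > 1 the model's
    -- own argmax sample is fed back with probability 1 - ss_current ...
    if opt.use_ss == 1 and i % loader.ntrain == 0 and ss_current > opt.min_ss then
        ss_current = opt.start_ss - opt.decay_ss * epoch
    end
end
```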
## Model conversion between cpu and gpu
I add a script to convert a model file trained on GPU to a CPU model. You can try it as follows:
```bash
$ th convert.lua gpu_model cpu_model
```
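A conversion like this usually amounts to loading the checkpoint and moving each module back to CPU tensors. A minimal sketch under that assumption (the actual convert.lua may differ; deserializing a GPU checkpoint still requires cutorch/cunn, and `checkpoint.protos` follows char-rnn's checkpoint layout):

```lua
require 'torch'
require 'nn'
require 'cutorch' -- needed to deserialize CUDA tensors
require 'cunn'

-- usage: th convert.lua gpu_model cpu_model
local checkpoint = torch.load(arg[1])
for name, module in pairs(checkpoint.protos) do
    -- :double() recursively converts parameters to CPU DoubleTensors
    checkpoint.protos[name] = module:double()
end
torch.save(arg[2], checkpoint)
```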
## web interface
A web demo is added so that others can test a model easily, based on the pub/sub mechanism of redis. I use redis because I couldn't find a good RPC library or web server that integrates well with Torch.
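For context, the backend side of such a setup roughly subscribes to a request channel and publishes generated text back. A hedged sketch assuming the redis-lua client's pubsub iterator; the channel names and `sample_from_model` are illustrative assumptions, not the repo's actual code:

```lua
local redis = require 'redis'

-- one connection for subscribing, one for publishing replies
local subscriber = redis.connect('127.0.0.1', 6379)
local publisher  = redis.connect('127.0.0.1', 6379)

for msg in subscriber:pubsub({ subscribe = { 'requests' } }) do
    if msg.kind == 'message' then
        -- msg.payload holds the user's prompt
        local reply = sample_from_model(msg.payload) -- hypothetical sampler
        publisher:publish('replies', reply)
    end
end
```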
@@ -35,12 +49,6 @@ $ nohup th web_backend.lua &
```bash
$ nohup python web_server.py &
```
(removed here: the duplicate '## Model conversion between cpu and gpu' section and its convert.lua example, which now live earlier in the file)
-----------------------------------------------

## Karpathy's raw Readme
train.lua (23 changes)

@@ -60,11 +60,17 @@ cmd:option('-savefile','lstm','filename to autosave the checkpoint to. Will be in
-- GPU/CPU
cmd:option('-gpuid',0,'which gpu to use. -1 = use CPU')
cmd:option('-opencl',0,'use OpenCL (instead of CUDA)')
-- Scheduled Sampling
cmd:option('-use_ss', 1, 'whether to use scheduled sampling during training')
cmd:option('-start_ss', 1, 'start amount of truth to be given to the model when using ss')
cmd:option('-decay_ss', 0.01666, 'ss amount decay rate per epoch')
cmd:option('-min_ss', 0.5, 'minimum amount of truth to be given to the model when using ss')
cmd:text()
-- parse input params
opt = cmd:parse(arg)
torch.manualSeed(opt.seed)
math.randomseed(opt.seed) -- also seed Lua's RNG; the scheduled sampling coin flip uses math.random()
-- train / val / test split for data, in fractions
local test_frac = math.max(0, 1 - (opt.train_frac + opt.val_frac))
local split_sizes = {opt.train_frac, opt.val_frac, test_frac}
@@ -238,7 +244,15 @@ function feval(x)
local loss = 0
for t=1,opt.seq_length do
    clones.rnn[t]:training() -- make sure we are in correct mode (this is cheap, sets flag)
    -- flip a coin to decide whether to use scheduled sampling at this timestep
    local xx
    if opt.use_ss == 1 and t > 1 and math.random() > ss_current then
        -- with probability 1 - ss_current, feed back the model's own
        -- most likely prediction from timestep t-1 instead of the truth
        local probs = torch.exp(predictions[t-1]):squeeze()
        local _, samples = torch.max(probs, 2)
        xx = samples:view(samples:nElement())
    else
        xx = x[{{}, t}] -- feed the ground-truth input
    end
    local lst = clones.rnn[t]:forward{xx, unpack(rnn_state[t-1])}
    rnn_state[t] = {}
    for i=1,#init_state do table.insert(rnn_state[t], lst[i]) end -- extract the state, without output
    predictions[t] = lst[#lst] -- last element is the prediction
@@ -277,6 +291,7 @@ local optim_state = {learningRate = opt.learning_rate, alpha = opt.decay_rate}
local iterations = opt.max_epochs * loader.ntrain
local iterations_per_epoch = loader.ntrain
local loss0 = nil
ss_current = opt.start_ss -- current probability of feeding the ground-truth input
for i = 1, iterations do
    local epoch = i / loader.ntrain
@@ -296,6 +311,12 @@ for i = 1, iterations do
    end
end

-- decay the scheduled sampling amount once per epoch,
-- until it falls to opt.min_ss
if opt.use_ss == 1 and i % loader.ntrain == 0 and ss_current > opt.min_ss then
    ss_current = opt.start_ss - opt.decay_ss * epoch
    print('decay scheduled sampling amount to ' .. ss_current)
end

-- every now and then or on last iteration
if i % opt.eval_val_every == 0 or i == iterations then
    -- evaluate loss on validation data