From 9d37bba62c6a1a33e3742bd490f5e128f746e63e Mon Sep 17 00:00:00 2001
From: Jeff Zhang
Date: Wed, 9 Sep 2015 10:22:58 +0800
Subject: [PATCH] add scheduled sampling during training

---
 Readme.md | 22 +++++++++++++++-------
 train.lua | 23 ++++++++++++++++++++++-
 2 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/Readme.md b/Readme.md
index 72d8b52..8948588 100644
--- a/Readme.md
+++ b/Readme.md
@@ -1,6 +1,6 @@
 # char-rnn-chinese
 
-Based on https://github.com/karpathy/char-rnn. make the code work well with Chinese.
+Based on Andrej Karpathy's code https://github.com/karpathy/char-rnn and Samy Bengio's paper http://arxiv.org/abs/1506.03099
 
 ## Chinese process
 Make the code can process both English and Chinese characters.
@@ -10,6 +10,20 @@ This is my first touch of Lua, so the string process seems silly, but it works w
 I also add an option called 'min_freq' because the vocab size in Chinese is very big,
 which makes the parameter num increase a lot. So delete some rare character may help.
 
+## Scheduled Sampling
+Samy Bengio's NIPS 2015 paper [Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks](http://arxiv.org/abs/1506.03099)
+proposes a simple but powerful method to improve RNN training.
+In my experiments it helps a lot against overfitting and drives the test loss lower. Only linear decay is used (a short sketch of the schedule follows this diff).
+Use `-use_ss` to turn scheduled sampling on or off; it is on by default. `-start_ss` is the starting proportion of ground-truth data fed to the model; I suggest 1, because the model should learn from noise-free data at the very beginning. `-min_ss` is also very important, as too much noise will hurt performance. Finally, `-decay_ss` is the linear decay rate per epoch.
+
+
+## Model conversion between cpu and gpu
+A script is included to convert a model file trained on the GPU into a CPU model.
+You can try it as follows:
+```bash
+$ th convert.lua gpu_model cpu_model
+```
+
 ## web interface
 A web demo is added for others to test model easily, based on sub/pub of redis.
 I use redis because i can't found some good RPC or WebServer work well integrated with Torch.
@@ -35,12 +49,6 @@ $ nohup th web_backend.lua &
 $ nohup python web_server.py &
 ```
 
-## Model conversion between cpu and gpu
-I add a script to convert a model file trained by gpu to cpu model.
-You can try it as follow:
-```bash
-$ th convert.lua gpu_model cpu_model
-```
 ----------------------------------------------
 
 ## Karpathy's raw Readme
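The linear decay mentioned in the Scheduled Sampling section above works as follows: `ss_current`, the probability of feeding the ground-truth character, starts at `-start_ss`, drops by `-decay_ss` each epoch, and is floored at `-min_ss`. A minimal sketch of that schedule, assuming the floor is meant to clamp the value exactly at `-min_ss`:

```lua
-- Minimal sketch of the linear scheduled-sampling decay (assumed semantics):
-- ss_current is the probability of feeding the ground-truth character;
-- 1 - ss_current is the probability of feeding the model's own prediction.
local start_ss, decay_ss, min_ss = 1.0, 0.01666, 0.5
for epoch = 1, 40 do
    local ss_current = math.max(min_ss, start_ss - decay_ss * epoch)
    print(string.format('epoch %2d: p(ground truth) = %.3f', epoch, ss_current))
end
```

With the default values (start 1, decay 0.01666, floor 0.5), the probability of feeding ground truth reaches the floor after roughly 30 epochs.
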
diff --git a/train.lua b/train.lua
index abe3680..900d1cc 100644
--- a/train.lua
+++ b/train.lua
@@ -60,11 +60,17 @@ cmd:option('-savefile','lstm','filename to autosave the checkpont to. Will be in
 -- GPU/CPU
 cmd:option('-gpuid',0,'which gpu to use. -1 = use CPU')
 cmd:option('-opencl',0,'use OpenCL (instead of CUDA)')
+-- Scheduled Sampling
+cmd:option('-use_ss', 1, 'whether to use scheduled sampling during training')
+cmd:option('-start_ss', 1, 'starting amount of ground truth to be given to the model when using ss')
+cmd:option('-decay_ss', 0.01666, 'ss amount decay rate per epoch')
+cmd:option('-min_ss', 0.5, 'minimum amount of ground truth to be given to the model when using ss')
 cmd:text()
 
 -- parse input params
 opt = cmd:parse(arg)
 torch.manualSeed(opt.seed)
+math.randomseed(opt.seed)
 -- train / val / test split for data, in fractions
 local test_frac = math.max(0, 1 - (opt.train_frac + opt.val_frac))
 local split_sizes = {opt.train_frac, opt.val_frac, test_frac}
@@ -238,7 +244,15 @@ function feval(x)
     local loss = 0
     for t=1,opt.seq_length do
         clones.rnn[t]:training() -- make sure we are in correct mode (this is cheap, sets flag)
-        local lst = clones.rnn[t]:forward{x[{{}, t}], unpack(rnn_state[t-1])}
+        -- flip a coin to decide whether to feed the ground truth or the model's own last prediction
+        if opt.use_ss == 1 and t > 1 and math.random() > ss_current then
+            local probs = torch.exp(predictions[t-1]) -- batch_size x vocab_size
+            local _, samples = torch.max(probs, 2) -- greedy: most likely character per batch row
+            xx = samples:view(samples:nElement())
+        else
+            xx = x[{{}, t}] -- ground-truth characters for step t
+        end
+        local lst = clones.rnn[t]:forward{xx, unpack(rnn_state[t-1])}
         rnn_state[t] = {}
         for i=1,#init_state do table.insert(rnn_state[t], lst[i]) end -- extract the state, without output
         predictions[t] = lst[#lst] -- last element is the prediction
@@ -277,6 +291,7 @@ local optim_state = {learningRate = opt.learning_rate, alpha = opt.decay_rate}
 local iterations = opt.max_epochs * loader.ntrain
 local iterations_per_epoch = loader.ntrain
 local loss0 = nil
+ss_current = opt.start_ss
 for i = 1, iterations do
     local epoch = i / loader.ntrain
@@ -296,6 +311,12 @@
         end
     end
 
+    -- decay the scheduled sampling amount once per epoch
+    if opt.use_ss == 1 and i % loader.ntrain == 0 and ss_current > opt.min_ss then
+        ss_current = math.max(opt.min_ss, opt.start_ss - opt.decay_ss * epoch)
+        print('decayed scheduled sampling amount to ' .. ss_current)
+    end
+
     -- every now and then or on last iteration
     if i % opt.eval_val_every == 0 or i == iterations then
         -- evaluate loss on validation data
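The feval change above reduces to a simple per-timestep choice. Here is a self-contained sketch of that choice; the helper name `next_input` is hypothetical, and the argmax feedback mirrors what the hunk does with `predictions[t-1]`:

```lua
require 'torch'

-- Sketch of the per-timestep input choice (assumed reading of the patch):
-- with probability ss_current feed the ground truth x_t, otherwise feed
-- the model's own greedy (argmax) prediction from the previous step.
local function next_input(x_t, prev_log_probs, ss_current)
    if math.random() <= ss_current then
        return x_t -- ground-truth characters for this step
    end
    local probs = torch.exp(prev_log_probs)  -- batch_size x vocab_size
    local _, samples = torch.max(probs, 2)   -- most likely character per batch row
    return samples:view(samples:nElement())  -- 1D tensor of character indices
end

-- toy usage: batch of 2, vocab of 4
local x_t = torch.LongTensor{1, 3}
local log_probs = torch.log(torch.Tensor{{0.1, 0.6, 0.2, 0.1}, {0.7, 0.1, 0.1, 0.1}})
print(next_input(x_t, log_probs, 0.5))
```

Greedy argmax feedback is the simplest option here; Bengio et al. also discuss sampling from the predicted distribution instead.
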
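The Readme's conversion section refers to a `convert.lua` script that ships with the repo rather than with this patch. For orientation, a rough sketch of such a GPU-to-CPU converter, assuming char-rnn's convention of storing the network modules in a checkpoint `protos` table (an assumption, since the actual script is not shown here):

```lua
-- Rough sketch of a GPU-to-CPU checkpoint converter
-- (assumes the char-rnn checkpoint keeps its modules in a `protos` table).
require 'torch'
require 'nn'
require 'cutorch' -- needed to deserialize CUDA tensors from the GPU checkpoint
require 'cunn'

local gpu_path, cpu_path = arg[1], arg[2]
local checkpoint = torch.load(gpu_path)
for name, proto in pairs(checkpoint.protos) do
    checkpoint.protos[name] = proto:double() -- move parameters to CPU doubles
end
torch.save(cpu_path, checkpoint)
print('wrote CPU checkpoint to ' .. cpu_path)
```

Invocation would mirror the Readme: `th convert.lua gpu_model cpu_model` (the sketch reads its two paths from `arg`).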