diff --git a/.gitignore b/.gitignore
index 44dbcd6..2d05e22 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 *.pyc
 .idea/
+src/telegram_bot/config.json
\ No newline at end of file
diff --git a/src/deep_dialog/agents/agent.py b/src/deep_dialog/agents/agent.py
index 49c1724..0efbde3 100644
--- a/src/deep_dialog/agents/agent.py
+++ b/src/deep_dialog/agents/agent.py
@@ -4,7 +4,7 @@
 @author: xiul, t-zalipt
 """
 
-from deep_dialog import dialog_config
+from src.deep_dialog import dialog_config
 
 
 class Agent:
diff --git a/src/deep_dialog/agents/agent_dqn.py b/src/deep_dialog/agents/agent_dqn.py
index 790b4d9..5206ce8 100644
--- a/src/deep_dialog/agents/agent_dqn.py
+++ b/src/deep_dialog/agents/agent_dqn.py
@@ -21,7 +21,7 @@
 import numpy as np
 
 from src.deep_dialog import dialog_config
-from src.deep_dialog.qlearning import DQN
+from src.deep_dialog.qlearning import DoubleDQN
 from .agent import Agent
 
 
@@ -36,31 +36,23 @@ def __init__(self, movie_dict=None, act_set=None, slot_set=None, params=None):
         self.feasible_actions = dialog_config.feasible_actions
         self.num_actions = len(self.feasible_actions)
 
+        self.experience_replay_pool = []
         self.epsilon = params['epsilon']
         self.agent_run_mode = params['agent_run_mode']
         self.agent_act_level = params['agent_act_level']
-        self.experience_replay_pool = []  # experience replay pool
-
+        self.warm_start = params['warm_start']
         self.experience_replay_pool_size = params.get('experience_replay_pool_size', 1000)
-        self.hidden_size = params.get('dqn_hidden_size', 60)
+        self.hidden_size = params.get('dqn_hidden_size', 80)
         self.gamma = params.get('gamma', 0.9)
         self.predict_mode = params.get('predict_mode', False)
-        self.warm_start = params.get('warm_start', 0)
 
         self.max_turn = params['max_turn'] + 4
         self.state_dimension = 2 * self.act_cardinality + 7 * self.slot_cardinality + 3 + self.max_turn
 
-        self.dqn = DQN(self.state_dimension, self.hidden_size, self.num_actions)
-        self.clone_dqn = copy.deepcopy(self.dqn)
-
-        self.cur_bellman_err = 0
-
-        # Prediction Mode: load trained DQN model
-        if params['trained_model_path'] is not None:
-            self.dqn.model = copy.deepcopy(self.load_trained_DQN(params['trained_model_path']))
-            self.clone_dqn = copy.deepcopy(self.dqn)
-            self.predict_mode = True
-            self.warm_start = 2
+        if 'trained_model_path' not in params or params['trained_model_path'] is None:
+            self.dqn = DoubleDQN(n_actions=len(self.feasible_actions), n_features=self.state_dimension,
+                                 hidden_size=self.hidden_size)
+        else:
+            self.dqn = DoubleDQN.load(params['trained_model_path'])
 
     def initialize_episode(self):
        """ Initialize a new episode. This function is called every time a new episode is run. """
@@ -75,7 +67,36 @@ def state_to_action(self, state):
         self.representation = self.prepare_state_representation(state)
         self.action = self.run_policy(self.representation)
         act_slot_response = copy.deepcopy(self.feasible_actions[self.action])
-        return {'act_slot_response': act_slot_response, 'act_slot_value_response': None}
+        return {'act_slot_response': act_slot_response, 'act_slot_value_response': None, 'act_index': self.action}
+
+    def register_experience_replay_tuple(self, s, a, r, s_, episode_over):
+        s = self.prepare_state_representation(s)
+        a = self.action
+        s_ = self.prepare_state_representation(s_)
+
+        training_example = np.hstack((s, [a, r], s_))
+        if not self.predict_mode:  # Training Mode
+            if self.warm_start == 1:
+                self.experience_replay_pool.append(training_example)
+        else:  # Prediction Mode
+            self.experience_replay_pool.append(training_example)
+
+    def train(self, batch_size=1, num_batches=100):
+        """ Train DQN with experience replay """
+
+        for iter_batch in range(num_batches):
+            self.cur_bellman_err = 0
+            iter_count = min(200, len(self.experience_replay_pool) // batch_size)
+            for _ in range(iter_count):
+                batch = [random.choice(self.experience_replay_pool) for __ in range(batch_size)]
+                batch_struct = self.dqn.learn(batch)
+                self.cur_bellman_err += batch_struct['cost']['total_cost']
+
+            print("cur bellman err %.4f, experience replay pool %s"
+                  % (float(self.cur_bellman_err) / len(self.experience_replay_pool), len(self.experience_replay_pool)))
+
+        self.dqn.replace_target_params()
+        self.experience_replay_pool = self.experience_replay_pool[-2000:]
 
     def prepare_state_representation(self, state):
         """ Create the representation for each state """
@@ -163,7 +184,7 @@ def prepare_state_representation(self, state):
         self.final_representation = np.hstack(
             [user_act_rep, user_inform_slots_rep, user_request_slots_rep, agent_act_rep, agent_inform_slots_rep,
              agent_request_slots_rep, current_slots_rep, turn_rep, turn_onehot_rep, kb_binary_rep, kb_count_rep])
-        return self.final_representation
+        return self.final_representation.flatten()
 
     def run_policy(self, representation):
         """ epsilon-greedy policy """
@@ -174,9 +195,10 @@
             if self.warm_start == 1:
                 if len(self.experience_replay_pool) > self.experience_replay_pool_size:
                     self.warm_start = 2
+
                 return self.rule_policy()
             else:
-                return self.dqn.predict(representation, {}, predict_model=True)
+                return self.dqn.choose_action(representation)
 
     def rule_policy(self):
         """ Rule Policy """
@@ -208,59 +230,3 @@ def action_index(self, act_slot_response):
         print(act_slot_response)
         raise Exception("action index not found")
         return None
-
-    def register_experience_replay_tuple(self, s_t, a_t, reward, s_tplus1, episode_over):
-        """ Register feedback from the environment, to be stored as future training data """
-
-        state_t_rep = self.prepare_state_representation(s_t)
-        action_t = self.action
-        reward_t = reward
-        state_tplus1_rep = self.prepare_state_representation(s_tplus1)
-        training_example = (state_t_rep, action_t, reward_t, state_tplus1_rep, episode_over)
-
-        if not self.predict_mode:  # Training Mode
-            if self.warm_start == 1:
-                self.experience_replay_pool.append(training_example)
-        else:  # Prediction Mode
-            self.experience_replay_pool.append(training_example)
-
-    def train(self, batch_size=1, num_batches=100):
-        """ Train DQN with experience replay """
-
-        for iter_batch in range(num_batches):
-            self.cur_bellman_err = 0
-            for _ in range(len(self.experience_replay_pool) // batch_size):
-                batch = [random.choice(self.experience_replay_pool) for __ in range(batch_size)]
-                batch_struct = self.dqn.singleBatch(batch, {'gamma': self.gamma}, self.clone_dqn)
-                self.cur_bellman_err += batch_struct['cost']['total_cost']
-
-            print("cur bellman err %.4f, experience replay pool %s"
-                  % (float(self.cur_bellman_err) / len(self.experience_replay_pool), len(self.experience_replay_pool)))
-
-    ################################################################################
-    # Debug Functions
-    ################################################################################
-    def save_experience_replay_to_file(self, path):
-        """ Save the experience replay pool to a file """
-
-        try:
-            with open(path, "wb") as f:
-                pickle.dump(self.experience_replay_pool, f)
-            print('saved model in %s' % path)
-        except Exception as e:
-            print('Error: Writing model fails: %s' % path)
-            print(e)
-
-    def load_experience_replay_from_file(self, path):
-        """ Load the experience replay pool from a file"""
-
-        self.experience_replay_pool = pickle.load(open(path, 'rb'))
-
-    def load_trained_DQN(self, path):
-        """ Load the trained DQN from a file """
-
-        trained_file = pickle.load(open(path, 'rb'))
-        model = trained_file['model']
-
-        print("trained DQN Parameters:", json.dumps(trained_file['params'], indent=2))
-        return model
diff --git a/src/deep_dialog/checkpoints/rl_agent/agt_9_performance_records.json b/src/deep_dialog/checkpoints/rl_agent/agt_9_performance_records.json
new file mode 100644
index 0000000..83b406f
--- /dev/null
+++ b/src/deep_dialog/checkpoints/rl_agent/agt_9_performance_records.json
@@ -0,0 +1 @@
+{"ave_reward": {"0": 0.0, "1": 0.0, "2": 0.8, "3": 0.0, "4": 0.0, "5": 0.0, "6": 0.0, "7": 0.0, "8": 3.2, "9": 1.6, "10": 0.0}, "ave_turns": {"0": 39.56, "1": 17.9, "2": 24.68, "3": 19.06, "4": 13.98, "5": 8.56, "6": 7.68, "7": 8.42, "8": 11.04, "9": 15.9, "10": 25.76}, "success_rate": {"0": 0.0, "1": 0.0, "2": 0.01, "3": 0.0, "4": 0.0, "5": 0.0, "6": 0.0, "7": 0.0, "8": 0.04, "9": 0.02, "10": 0.0}}
\ No newline at end of file
diff --git a/src/deep_dialog/dialog_system/dialog_manager.py b/src/deep_dialog/dialog_system/dialog_manager.py
index 6501c71..342769b 100644
--- a/src/deep_dialog/dialog_system/dialog_manager.py
+++ b/src/deep_dialog/dialog_system/dialog_manager.py
@@ -21,6 +21,7 @@ def __init__(self, agent, user, act_set, slot_set, movie_dictionary):
         self.user_action = None
         self.reward = 0
         self.episode_over = False
+        self.current_reward_function = self.reward_function_without_penalty
 
     def initialize_episode(self):
         """ Refresh state for new dialog """
@@ -60,7 +61,7 @@ def next_turn(self, record_training_data=True):
         ########################################################################
         self.sys_action = self.state_tracker.dialog_history_dictionaries()[-1]
         self.user_action, self.episode_over, dialog_status = self.user.next(self.sys_action)
-        self.reward = self.reward_function(dialog_status)
+        self.reward = self.current_reward_function(dialog_status)
 
         ########################################################################
         # Update state tracker with latest user action
diff --git a/src/deep_dialog/qlearning/dqn.py b/src/deep_dialog/qlearning/dqn.py
index dd2a98e..9357ee7 100644
--- a/src/deep_dialog/qlearning/dqn.py
+++ b/src/deep_dialog/qlearning/dqn.py
@@ -1,282 +1,154 @@
 """
-Created on Jun 18, 2016
+The double DQN based on this paper: https://arxiv.org/abs/1509.06461
 
-@author: xiul
+Based on: https://morvanzhou.github.io/tutorials/
+
+Using:
+Tensorflow: 1.0
+gym: 0.8.0
 """
 
-from .utils import *
-
-
-class DQN:
-    def __init__(self, input_size, hidden_size, output_size):
-        self.model = {}
-        # input-hidden
-        self.model['Wxh'] = initWeight(input_size, hidden_size)
-        self.model['bxh'] = np.zeros((1, hidden_size))
-
-        # hidden-output
-        self.model['Wd'] = initWeight(hidden_size, output_size) * 0.1
-        self.model['bd'] = np.zeros((1, output_size))
-
-        self.update = ['Wxh', 'bxh', 'Wd', 'bd']
-        self.regularize = ['Wxh', 'Wd']
-
-        self.step_cache = {}
-
-    def getStruct(self):
-        return {'model': self.model, 'update': self.update, 'regularize': self.regularize}
-
-    def fwdPass(self, Xs, params, **kwargs):
-        """Activation Function: Sigmoid, or tanh, or ReLu"""
-        predict_mode = kwargs.get('predict_mode', False)
-        active_func = params.get('activation_func', 'relu')
-
-        # input layer to hidden layer
-        Wxh = self.model['Wxh']
-        bxh = self.model['bxh']
-        Xsh = Xs.dot(Wxh) + bxh
-
-        hidden_size = self.model['Wd'].shape[0]  # size of hidden layer
-        H = np.zeros((1, hidden_size))  # hidden layer representation
-
-        if active_func == 'sigmoid':
-            H = 1 / (1 + np.exp(-Xsh))
-        elif active_func == 'tanh':
-            H = np.tanh(Xsh)
-        elif active_func == 'relu':  # ReLU
-            H = np.maximum(Xsh, 0)
-        else:  # no activation function
-            H = Xsh
-
-        # decoder at the end; hidden layer to output layer
-        Wd = self.model['Wd']
-        bd = self.model['bd']
-        Y = H.dot(Wd) + bd
-
-        # cache the values in forward pass, we expect to do a backward pass
-        cache = {}
-        if not predict_mode:
-            cache['Wxh'] = Wxh
-            cache['Wd'] = Wd
-            cache['Xs'] = Xs
-            cache['Xsh'] = Xsh
-            cache['H'] = H
-
-            cache['bxh'] = bxh
-            cache['bd'] = bd
-            cache['activation_func'] = active_func
-
-            cache['Y'] = Y
-
-        return Y, cache
-
-    def bwdPass(self, dY, cache):
-        Wd = cache['Wd']
-        H = cache['H']
-        Xs = cache['Xs']
-        Xsh = cache['Xsh']
-        Wxh = cache['Wxh']
-
-        active_func = cache['activation_func']
-        n, d = H.shape
-
-        dH = dY.dot(Wd.transpose())
-        # backprop the decoder
-        dWd = H.transpose().dot(dY)
-        dbd = np.sum(dY, axis=0, keepdims=True)
-
-        dXsh = np.zeros(Xsh.shape)
-        dXs = np.zeros(Xs.shape)
-
-        if active_func == 'sigmoid':
-            dH = (H - H ** 2) * dH
-        elif active_func == 'tanh':
-            dH = (1 - H ** 2) * dH
-        elif active_func == 'relu':
-            dH = (H > 0) * dH  # backprop ReLU
+import numpy as np
+import tensorflow as tf
+import pickle
+
+np.random.seed(1)
+tf.set_random_seed(1)
+
+class DoubleDQN:
+    def __init__(self,**kwargs):
+        self.kwargs = kwargs
+
+        self.hidden_size = kwargs.get('hidden_size', 80)
+        self.n_actions = kwargs.get('n_actions')
+        self.n_features = kwargs.get('n_features')
+        self.lr = kwargs.get('learning_rate', 0.001)
+        self.gamma = kwargs.get('reward_decay', 0.999)
+        self.epsilon_max = kwargs.get('e_greedy', 0.9)
+        self.batch_size = kwargs.get('batch_size', 16)
+        self.epsilon_increment = kwargs.get('e_greedy_increment')
+        self.epsilon = 1
+
+        self.double_q = kwargs.get('double_q', True)  # decide to use double q or not
+
+        sess = kwargs.get('sess')
+        self._build_net()
+        if sess is None:
+            self.sess = tf.Session()
+            self.sess.run(tf.global_variables_initializer())
         else:
-            dH = dH
-
-        # backprop to the input-hidden connection
-        dWxh = Xs.transpose().dot(dH)
-        dbxh = np.sum(dH, axis=0, keepdims=True)
-
-        # backprop to the input
-        dXsh = dH
-        dXs = dXsh.dot(Wxh.transpose())
-
-        return {'Wd': dWd, 'bd': dbd, 'Wxh': dWxh, 'bxh': dbxh}
-
-
-    def batchForward(self, batch, params, predict_mode=False):
-        """batch Forward & Backward Pass"""
-        caches = []
-        Ys = []
-        for i, x in enumerate(batch):
-            Xs = np.array([x['cur_states']], dtype=float)
-
-            Y, out_cache = self.fwdPass(Xs, params, predict_mode=predict_mode)
-            caches.append(out_cache)
-            Ys.append(Y)
-
-        # back up information for efficient backprop
-        cache = {}
-        if not predict_mode:
-            cache['caches'] = caches
-
-        return Ys, cache
-
-    def batchDoubleForward(self, batch, params, clone_dqn, predict_mode=False):
-        caches = []
-        Ys = []
-        tYs = []
+            self.sess = sess
+        if kwargs.get('output_graph'):
+            tf.summary.FileWriter("logs/", self.sess.graph)
+        self.cost_his = []
+
+    def _build_net(self):
+        def build_layers(s, c_names, n_l1, w_initializer, b_initializer):
+            with tf.variable_scope('l1'):
+                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
+                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
+                l1 = tf.nn.relu(tf.matmul(s, w1) + b1)
+
+            with tf.variable_scope('l2'):
+                w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
+                b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
+                out = tf.matmul(l1, w2) + b2
+            return out
+        # ------------------ build evaluate_net ------------------
+        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input
+        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')  # for calculating loss
+
+        with tf.variable_scope('eval_net'):
+            c_names, n_l1, w_initializer, b_initializer = \
+                ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], self.hidden_size, \
+                tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)  # config of layers
+
+            self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer)
+
+        with tf.variable_scope('loss'):
+            self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
+        with tf.variable_scope('train'):
+            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
+
+        # ------------------ build target_net ------------------
+        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')  # input
+        with tf.variable_scope('target_net'):
+            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
+
+            self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer)
+
+    def choose_action(self, observation):
+        if len(observation.shape)!=2:
+            observation = observation[np.newaxis, :]
+
+        actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
+        action = np.argmax(actions_value)
+
+        if not hasattr(self, 'q'):  # record action value it gets
+            self.q = []
+            self.running_q = 0
+        self.running_q = self.running_q*0.99 + 0.01 * np.max(actions_value)
+        self.q.append(self.running_q)
+
+        if np.random.uniform() > self.epsilon:  # choosing action
+            action = np.random.randint(0, self.n_actions)
+        return action
+
+    def replace_target_params(self):
+        t_params = tf.get_collection('target_net_params')
+        e_params = tf.get_collection('eval_net_params')
+        self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])
+
+        print('\ntarget_params_replaced\n')
+
+    def learn(self, batch_memory):
+        batch_memory = np.array(batch_memory)
+        q_next, q_eval4next = self.sess.run(
+            [self.q_next, self.q_eval],
+            feed_dict={self.s_: batch_memory[:, -self.n_features:],  # next observation
+                       self.s: batch_memory[:, -self.n_features:]})  # next observation
+        q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_features]})
+
+        q_target = q_eval.copy()
+
+        batch_index = np.arange(self.batch_size, dtype=np.int32)
+        eval_act_index = batch_memory[:, self.n_features].astype(int)
+        reward = batch_memory[:, self.n_features + 1]
+
+        if self.double_q:
+            max_act4next = np.argmax(q_eval4next, axis=1)  # the action that brings the highest value is evaluated by q_eval
+            selected_q_next = q_next[batch_index, max_act4next]  # Double DQN, select q_next depending on above actions
+        else:
+            selected_q_next = np.max(q_next, axis=1)  # the natural DQN
 
-        for i, x in enumerate(batch):
-            Xs = x[0]
-            Y, out_cache = self.fwdPass(Xs, params, predict_mode=predict_mode)
-            caches.append(out_cache)
-            Ys.append(Y)
+        q_target[batch_index, eval_act_index] = reward + self.gamma * selected_q_next
 
-            tXs = x[3]
-            tY, t_cache = clone_dqn.fwdPass(tXs, params, predict_mode=False)
+        _, self.cost = self.sess.run([self._train_op, self.loss],
+                                     feed_dict={self.s: batch_memory[:, :self.n_features],
+                                                self.q_target: q_target})
+        self.cost_his.append(self.cost)
 
-            tYs.append(tY)
+        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
+        return {'cost': {'total_cost': self.cost}}
 
-        # back up information for efficient backprop
-        cache = {}
-        if not predict_mode:
-            cache['caches'] = caches
+    def save(self, filepath):
+        self.kwargs['sess'] = None
+        pickle.dump(self.kwargs, open(filepath + '.params', 'wb'))
 
-        return Ys, cache, tYs
+        saver = tf.train.Saver(max_to_keep=1)
+        # Save model weights to disk
+        save_path= saver.save(self.sess, filepath, global_step=0)
+        print('Save path = {}'.format(save_path))
 
-    def batchBackward(self, dY, cache):
-        caches = cache['caches']
+    @classmethod
+    def load(self, filepath):
+        sess = tf.Session()
+        sess.run(tf.global_variables_initializer())
 
-        grads = {}
-        for i in range(len(caches)):
-            single_cache = caches[i]
-            local_grads = self.bwdPass(dY[i], single_cache)
-            mergeDicts(grads, local_grads)  # add up the gradients wrt model parameters
+        constructor_params = pickle.load(open(filepath.split('.tf')[0]+'.tf.params', 'rb'))
+        constructor_params['sess'] = sess
+        result = DoubleDQN(**constructor_params)
+        saver = tf.train.Saver()
+        sess = saver.restore(sess, filepath)
+        return result
 
-        return grads
-
-    def costFunc(self, batch, params, clone_dqn):
-        """ cost function, returns cost and gradients for model """
-
-        regc = params.get('reg_cost', 1e-3)
-        gamma = params.get('gamma', 0.9)
-
-        # batch forward
-        Ys, caches, tYs = self.batchDoubleForward(batch, params, clone_dqn, predict_mode=False)
-
-        loss_cost = 0.0
-        dYs = []
-        for i, x in enumerate(batch):
-            Y = Ys[i]
-            nY = tYs[i]
-
-            action = np.array(x[1], dtype=int)
-            reward = np.array(x[2], dtype=float)
-
-            n_action = np.nanargmax(nY[0])
-            max_next_y = nY[0][n_action]
-
-            eposide_terminate = x[4]
-
-            target_y = reward
-            if eposide_terminate != True: target_y += gamma * max_next_y
-
-            pred_y = Y[0][action]
-
-            nY = np.zeros(nY.shape)
-            nY[0][action] = target_y
-            Y = np.zeros(Y.shape)
-            Y[0][action] = pred_y
-
-            # Cost Function
-            loss_cost += (target_y - pred_y) ** 2
-
-            dY = -(nY - Y)
-            # dY = np.minimum(dY, 1)
-            # dY = np.maximum(dY, -1)
-            dYs.append(dY)
-
-        # backprop the RNN
-        grads = self.batchBackward(dYs, caches)
-
-        # add L2 regularization cost and gradients
-        reg_cost = 0.0
-        if regc > 0:
-            for p in self.regularize:
-                mat = self.model[p]
-                reg_cost += 0.5 * regc * np.sum(mat * mat)
-                grads[p] += regc * mat
-
-        # normalize the cost and gradient by the batch size
-        batch_size = len(batch)
-        reg_cost /= batch_size
-        loss_cost /= batch_size
-        for k in grads: grads[k] /= batch_size
-
-        out = {}
-        out['cost'] = {'reg_cost': reg_cost, 'loss_cost': loss_cost, 'total_cost': loss_cost + reg_cost}
-        out['grads'] = grads
-        return out
-
-    """ A single batch """
-
-    def singleBatch(self, batch, params, clone_dqn):
-        learning_rate = params.get('learning_rate', 0.001)
-        decay_rate = params.get('decay_rate', 0.999)
-        momentum = params.get('momentum', 0.1)
-        grad_clip = params.get('grad_clip', -1e-3)
-        smooth_eps = params.get('smooth_eps', 1e-8)
-        sdg_type = params.get('sdgtype', 'rmsprop')
-        activation_func = params.get('activation_func', 'relu')
-
-        for u in self.update:
-            if not u in self.step_cache:
-                self.step_cache[u] = np.zeros(self.model[u].shape)
-
-        cg = self.costFunc(batch, params, clone_dqn)
-
-        cost = cg['cost']
-        grads = cg['grads']
-
-        # clip gradients if needed
-        if activation_func.lower() == 'relu':
-            if grad_clip > 0:
-                for p in self.update:
-                    if p in grads:
-                        grads[p] = np.minimum(grads[p], grad_clip)
-                        grads[p] = np.maximum(grads[p], -grad_clip)
-
-        # perform parameter update
-        for p in self.update:
-            if p in grads:
-                if sdg_type == 'vanilla':
-                    if momentum > 0:
-                        dx = momentum * self.step_cache[p] - learning_rate * grads[p]
-                    else:
-                        dx = -learning_rate * grads[p]
-                    self.step_cache[p] = dx
-                elif sdg_type == 'rmsprop':
-                    self.step_cache[p] = self.step_cache[p] * decay_rate + (1.0 - decay_rate) * grads[p] ** 2
-                    dx = -(learning_rate * grads[p]) / np.sqrt(self.step_cache[p] + smooth_eps)
-                elif sdg_type == 'adgrad':
-                    self.step_cache[p] += grads[p] ** 2
-                    dx = -(learning_rate * grads[p]) / np.sqrt(self.step_cache[p] + smooth_eps)
-
-                self.model[p] += dx
-
-        out = {}
-        out['cost'] = cost
-        return out
-
-    """ prediction """
-
-    def predict(self, Xs, params, **kwargs):
-        Ys, caches = self.fwdPass(Xs, params, predict_model=True)
-        pred_action = np.argmax(Ys)
-
-        return pred_action
diff --git a/src/draw_learning_curve.py b/src/draw_learning_curve.py
index 8077663..ad66d49 100644
--- a/src/draw_learning_curve.py
+++ b/src/draw_learning_curve.py
@@ -14,11 +14,15 @@ def read_performance_records(path):
     """ load the performance score (.json) file """
-    data = json.load(open(path, 'rb'))
+    data = json.load(open(path, 'rt'))
+    numbers = {'x': [], 'success_rate': [], 'ave_turns': [], 'ave_rewards': []}
 
     for key in data['success_rate'].keys():
         if int(key) > -1:
+            numbers['x'].append(key)
             print("%s\t%s\t%s\t%s" % (
                 key, data['success_rate'][key], data['ave_turns'][key], data['ave_reward'][key]))
+            numbers['success_rate'].append(data['success_rate'][key])
+    return numbers
 
 
 def load_performance_file(path):
     """ load the performance score (.json) file """
@@ -54,9 +58,10 @@ def main(params):
 
     if cmd == 0:
         numbers = load_performance_file(params['result_file'])
-        draw_learning_curve(numbers)
     elif cmd == 1:
-        read_performance_records(params['result_file'])
+        numbers = read_performance_records(params['result_file'])
+
+    draw_learning_curve(numbers)
 
 
 if __name__ == "__main__":
@@ -65,7 +70,7 @@ def main(params):
     parser.add_argument('--cmd', dest='cmd', type=int, default=1, help='cmd')
 
    parser.add_argument('--result_file', dest='result_file', type=str,
-                        default='./deep_dialog/checkpoints/rl_agent/11142016/noe2e/agt_9_performance_records.json',
+                        default='./deep_dialog/checkpoints/rl_agent/agt_9_performance_records.json',
                         help='path to the result file')
 
     args = parser.parse_args()
diff --git a/src/requirements.txt b/src/requirements.txt
index 9fdcb92..f6f42ea 100644
--- a/src/requirements.txt
+++ b/src/requirements.txt
@@ -1,2 +1,3 @@
 numpy
-fuzzywuzzy
\ No newline at end of file
+fuzzywuzzy
+tensorflow
\ No newline at end of file
diff --git a/src/run.py b/src/run.py
index 2e565ca..014be3c 100644
--- a/src/run.py
+++ b/src/run.py
@@ -4,37 +4,37 @@
 This should be a simple minimalist run file. It's only responsibility should be to parse the arguments (which agent,
 user simulator to use) and launch a dialog simulation.
 
-Rule-agent: python run.py --agt 6 --usr 1 --max_turn 40 --episodes 150 --movie_kb_path ./deep_dialog/data/movie_kb.1k.p
+Rule-agent: python run.py --agt 6 --usr 1 --max_turn 40 --episodes 150 --movie_kb_path ./deep_dialog/data/movie_kb.1k.json
 --run_mode 2
 
 movie_kb:
-movie_kb.1k.p: 94% success rate
-movie_kb.v2.p: 36% success rate
+movie_kb.1k.json: 94% success rate
+movie_kb.v2.json: 36% success rate
 
 user goal files:
-first turn: user_goals_first_turn_template.v2.p
-all turns: user_goals_all_turns_template.p
-user_goals_first_turn_template.part.movie.v1.p: a subset of user goal. [Please use this one, the upper bound success
+first turn: user_goals_first_turn_template.v2.json
+all turns: user_goals_all_turns_template.json
+user_goals_first_turn_template.part.movie.v1.json: a subset of user goal. [Please use this one, the upper bound success
 rate on movie_kb.1k.json is 0.9765.]
 
 Commands:
-Rule: python run.py --agt 5 --usr 1 --max_turn 40 --episodes 150 --movie_kb_path ./deep_dialog/data/movie_kb.1k.p
---goal_file_path ./deep_dialog/data/user_goals_first_turn_template.part.movie.v1.p --intent_err_prob 0.00
+Rule: python run.py --agt 5 --usr 1 --max_turn 40 --episodes 150 --movie_kb_path ./deep_dialog/data/movie_kb.1k.json
+--goal_file_path ./deep_dialog/data/user_goals_first_turn_template.part.movie.v1.json --intent_err_prob 0.00
 --slot_err_prob 0.00 --episodes 500 --act_level 1 --run_mode 1
 
 Training:
-RL: python run.py --agt 9 --usr 1 --max_turn 40 --movie_kb_path ./deep_dialog/data/movie_kb.1k.p --dqn_hidden_size 80
+RL: python run.py --agt 9 --usr 1 --max_turn 40 --movie_kb_path ./deep_dialog/data/movie_kb.1k.json --dqn_hidden_size 80
 --experience_replay_pool_size 1000 --episodes 500 --simulation_epoch_size 100 --write_model_dir
 ./deep_dialog/checkpoints/rl_agent --run_mode 3 --act_level 0 --slot_err_prob 0.05 --intent_err_prob 0.00
---batch_size 16 --goal_file_path ./deep_dialog/data/user_goals_first_turn_template.part.movie.v1.p --warm_start 1
+--batch_size 16 --goal_file_path ./deep_dialog/data/user_goals_first_turn_template.part.movie.v1.json --warm_start 1
 --warm_start_epochs 120
 
 Predict:
-RL: python run.py --agt 9 --usr 1 --max_turn 40 --movie_kb_path ./deep_dialog/data/movie_kb.1k.p --dqn_hidden_size 80
+RL: python run.py --agt 9 --usr 1 --max_turn 40 --movie_kb_path ./deep_dialog/data/movie_kb.1k.json --dqn_hidden_size 80
 --experience_replay_pool_size 1000 --episodes 300 --simulation_epoch_size 100 --write_model_dir
 ./deep_dialog/checkpoints/rl_agent --slot_err_prob 0.00 --intent_err_prob 0.00 --batch_size 16 --goal_file_path
-./deep_dialog/data/user_goals_first_turn_template.part.movie.v1.p --episodes 200 --trained_model_path
-./deep_dialog/checkpoints/rl_agent/agt_9_22_30_0.37000.p --run_mode 3
+./deep_dialog/data/user_goals_first_turn_template.part.movie.v1.json --episodes 200 --trained_model_path
+./deep_dialog/checkpoints/rl_agent/agt_9_0.79.tf-0 --run_mode 3 --train 0
 
 @author: xiul, t-zalipt
 """
@@ -72,16 +72,16 @@ def load_file(file_name):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('--dict_path', dest='dict_path', type=str, default='./deep_dialog/data/dicts.v3.p',
+    parser.add_argument('--dict_path', dest='dict_path', type=str, default='./deep_dialog/data/dicts.v3.json',
                         help='path to the .json dictionary file')
-    parser.add_argument('--movie_kb_path', dest='movie_kb_path', type=str, default='./deep_dialog/data/movie_kb.1k.p',
+    parser.add_argument('--movie_kb_path', dest='movie_kb_path', type=str, default='./deep_dialog/data/movie_kb.1k.json',
                         help='path to the movie kb .json file')
     parser.add_argument('--act_set', dest='act_set', type=str, default='./deep_dialog/data/dia_acts.txt',
                         help='path to dia act set; none for loading from labeled file')
     parser.add_argument('--slot_set', dest='slot_set', type=str, default='./deep_dialog/data/slot_set.txt',
                         help='path to slot set; none for loading from labeled file')
     parser.add_argument('--goal_file_path', dest='goal_file_path', type=str,
-                        default='./deep_dialog/data/user_goals_first_turn_template.part.movie.v1.p',
+                        default='./deep_dialog/data/user_goals_first_turn_template.part.movie.v1.json',
                         help='a list of user goals')
     parser.add_argument('--diaact_nl_pairs', dest='diaact_nl_pairs', type=str,
                         default='./deep_dialog/data/dia_act_nl_pairs.v6.json',
@@ -145,6 +145,9 @@ def load_file(file_name):
     parser.add_argument('--save_check_point', dest='save_check_point', type=int, default=10,
                         help='number of epochs for saving model')
 
+    parser.add_argument('--train', dest='train', type=int, default=1,
+                        help='0 - predict, 1 - train')
+
     parser.add_argument('--success_rate_threshold', dest='success_rate_threshold', type=float, default=0.3,
                         help='the threshold for success rate')
 
@@ -160,6 +163,8 @@ def load_file(file_name):
 agt = params['agt']
 usr = params['usr']
 
+is_training_mode = params['train']
+
 dict_path = params['dict_path']
 goal_file_path = params['goal_file_path']
 
@@ -282,7 +287,7 @@ def load_file(file_name):
 """ Best Model and Performance Records """
 best_model = {}
 best_res = {'success_rate': 0, 'ave_reward': float('-inf'), 'ave_turns': float('inf'), 'epoch': 0}
-best_model['model'] = copy.deepcopy(agent)
+best_model['model'] = agent
 best_res['success_rate'] = 0
 
 performance_records = {'success_rate': {}, 'ave_turns': {}, 'ave_reward': {}}
@@ -291,13 +296,13 @@
 
 def save_model(path, agt, success_rate, agent, best_epoch, cur_epoch):
     """ Save model """
-    filename = 'agt_%s_%s_%s_%.5f.p' % (agt, best_epoch, cur_epoch, success_rate)
+    filename = 'agt_%s_%.2f.tf' % (agt, success_rate)
     filepath = os.path.join(path, filename)
     checkpoint = {}
-    if agt == 9: checkpoint['model'] = copy.deepcopy(agent.dqn.model)
+    if agt == 9: checkpoint['model'] = agent.dqn
     checkpoint['params'] = params
     try:
-        pickle.dump(checkpoint, open(filepath, "wb"))
+        agent.dqn.save(filepath)
         print('saved model in %s' % (filepath,))
     except Exception as e:
         print('Error: Writing model fails: %s' % (filepath,))
@@ -310,7 +315,7 @@ def save_performance_records(path, agt, records):
     filename = 'agt_%s_performance_records.json' % (agt)
     filepath = os.path.join(path, filename)
     try:
-        json.dump(records, open(filepath, "wb"))
+        json.dump(records, open(filepath, "w"))
         print('saved model in %s' % (filepath,))
     except Exception as e:
         print('Error: Writing model fails: %s' % (filepath,))
@@ -372,9 +377,9 @@ def warm_start_simulation():
             break
 
     agent.warm_start = 2
-    res['success_rate'] = float(successes) / simulation_epoch_size
-    res['ave_reward'] = float(cumulative_reward) / simulation_epoch_size
-    res['ave_turns'] = float(cumulative_turns) / simulation_epoch_size
+    res['success_rate'] = float(successes) / warm_start_epochs
+    res['ave_reward'] = float(cumulative_reward) / warm_start_epochs
+    res['ave_turns'] = float(cumulative_turns) / warm_start_epochs
     print("Warm_Start %s epochs, success rate %s, ave reward %s, ave turns %s"
           % (episode + 1, res['success_rate'], res['ave_reward'], res['ave_turns']))
     print("Current experience replay buffer size %s" % (len(agent.experience_replay_pool)))
@@ -390,6 +395,9 @@ def run_episodes(count, status):
         warm_start_simulation()
         print('warm_start finished, start RL training ...')
 
+    if agt == 9 and params['trained_model_path']:
+        agent.warm_start = 2
+
     for episode in range(count):
         print("Episode: %s" % episode)
         dialog_manager.initialize_episode()
@@ -409,7 +417,7 @@ def run_episodes(count, status):
                 cumulative_turns += dialog_manager.state_tracker.turn_count
 
         # simulation
-        if agt == 9 and not params['trained_model_path']:
+        if agt ==9 and is_training_mode:
             agent.predict_mode = True
             simulation_res = simulation_epoch(simulation_epoch_size)
 
@@ -418,18 +426,21 @@ def run_episodes(count, status):
             performance_records['ave_reward'][episode] = simulation_res['ave_reward']
 
             if simulation_res['success_rate'] >= best_res['success_rate']:
-                if simulation_res['success_rate'] >= success_rate_threshold:  # threshold = 0.30
-                    agent.experience_replay_pool = []
-                    simulation_epoch(simulation_epoch_size)
+                if simulation_res['success_rate'] >= success_rate_threshold:  # threshold = 0.3
+                    #agent.experience_replay_pool = []
+                    dialog_manager.current_reward_function = dialog_manager.reward_function
+                    simulation_epoch(simulation_epoch_size)
 
 
             if simulation_res['success_rate'] > best_res['success_rate']:
-                best_model['model'] = copy.deepcopy(agent)
+                best_model['model'] = agent
                 best_res['success_rate'] = simulation_res['success_rate']
                 best_res['ave_reward'] = simulation_res['ave_reward']
                 best_res['ave_turns'] = simulation_res['ave_turns']
                 best_res['epoch'] = episode
 
-            agent.clone_dqn = copy.deepcopy(agent.dqn)
+            save_model(params['write_model_dir'], agt, best_res['success_rate'], best_model['model'],
+                       best_res['epoch'], episode)
+
             agent.train(batch_size, 1)
             agent.predict_mode = False
@@ -438,8 +449,6 @@ def run_episodes(count, status):
                                                performance_records['ave_turns'][episode], best_res['success_rate']))
             if episode % save_check_point == 0 \
                     and not params['trained_model_path']:  # save the model every 10 episodes
-                save_model(params['write_model_dir'], agt, best_res['success_rate'], best_model['model'],
-                           best_res['epoch'], episode)
                 save_performance_records(params['write_model_dir'], agt, performance_records)
 
     print("Progress: %s / %s, Success rate: %s / %s Avg reward: %.2f Avg turns: %.2f"
@@ -451,8 +460,6 @@ def run_episodes(count, status):
     status['count'] += count
 
     if agt == 9 and not params['trained_model_path']:
-        save_model(params['write_model_dir'], agt, float(successes) / count, best_model['model'], best_res['epoch'],
-                   count)
         save_performance_records(params['write_model_dir'], agt, performance_records)
 
diff --git a/src/telegram_bot/bot.py b/src/telegram_bot/bot.py
index 38abd83..dee7297 100644
--- a/src/telegram_bot/bot.py
+++ b/src/telegram_bot/bot.py
@@ -7,12 +7,12 @@ import json
 
 # from deep_dialog.agents import AgentDQN
 # from deep_dialog.agents import agent_baselines
-from deep_dialog.nlg import nlg
-from deep_dialog.nlu import nlu
-from deep_dialog.usersims.realUser import RealUser
-from telegramDialogClasses import TelegramDialogManager
-from telegramDialogClasses import RuleAgent
-from deep_dialog.dialog_system.dict_reader import text_to_dict
+from src.deep_dialog.nlg import nlg
+from src.deep_dialog.nlu import nlu
+from src.deep_dialog.usersims.realUser import RealUser
+from src.telegram_bot.telegramDialogClasses import TelegramDialogManager
+from src.telegram_bot.telegramDialogClasses import RuleAgent
+from src.deep_dialog.dialog_system.dict_reader import text_to_dict
 
 import telebot
 import cherrypy
@@ -31,7 +31,7 @@ def load_file(file_name):
 
 WEBHOOKS_AVAIL = False
 
-config = load_file("/Users/fogside/Projects/Telegram_bot/RL-Dialog-Bot/src/telegram_bot/config.json")
+config = load_file("config.json")
 bot = telebot.TeleBot(config["token"])
 
 turn_count = 0
@@ -74,25 +74,24 @@ def get_random_emoji(num = 1):
 ######### --****--- End of initialization ---****-- ###########
 
 ######### --****--- Next code is for debugging ---****-- ############
-#
-# dia_manager.initialize_episode()
-# turn_count+=1
-#
-# print("Hello! I can help you to buy tickets to the cinema.\nWhat film would you like to watch?")
-#
-# while(turn_count > 0):
-#     msg = input()
-#     episode_over, agent_ans = dia_manager.next_turn(msg)
-#     turn_count+=1
-#     # bot.send_message(msg, agent_ans+' ' + get_random_emoji(1))
-#     print("turn #{}: {}".format(turn_count, agent_ans))
-#     if episode_over:
-#         turn_count = 0
-#     if msg == 'stop':
-#         turn_count = 0
-#
-# exit(0)
-#
+
+dia_manager.initialize_episode()
+turn_count+=1
+
+print("Hello! I can help you to buy tickets to the cinema.\nWhat film would you like to watch?")
+
+while(turn_count > 0):
+    msg = input()
+    episode_over, agent_ans = dia_manager.next_turn(msg)
+    turn_count+=1
+    print("turn #{}: {}".format(turn_count, agent_ans))
+    if episode_over:
+        turn_count = 0
+    if msg == 'stop':
+        turn_count = 0
+
+exit(0)
+
 ####### --** End of Debugging **-- ########