diff --git a/.gitignore b/.gitignore
index 44dbcd6..2d05e22 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 *.pyc
 .idea/
+src/telegram_bot/config.json
\ No newline at end of file
diff --git a/src/deep_dialog/agents/agent.py b/src/deep_dialog/agents/agent.py
index 49c1724..0efbde3 100644
--- a/src/deep_dialog/agents/agent.py
+++ b/src/deep_dialog/agents/agent.py
@@ -4,7 +4,7 @@
 @author: xiul, t-zalipt
 """
 
-from deep_dialog import dialog_config
+from src.deep_dialog import dialog_config
 
 
 class Agent:
diff --git a/src/deep_dialog/agents/agent_dqn.py b/src/deep_dialog/agents/agent_dqn.py
index 790b4d9..5206ce8 100644
--- a/src/deep_dialog/agents/agent_dqn.py
+++ b/src/deep_dialog/agents/agent_dqn.py
@@ -21,7 +21,7 @@
 import numpy as np
 
 from src.deep_dialog import dialog_config
-from src.deep_dialog.qlearning import DQN
+from src.deep_dialog.qlearning import DoubleDQN
 from .agent import Agent
 
 
@@ -36,31 +36,23 @@ def __init__(self, movie_dict=None, act_set=None, slot_set=None, params=None):
         self.feasible_actions = dialog_config.feasible_actions
         self.num_actions = len(self.feasible_actions)
 
+        self.experience_replay_pool = []
         self.epsilon = params['epsilon']
         self.agent_run_mode = params['agent_run_mode']
         self.agent_act_level = params['agent_act_level']
-        self.experience_replay_pool = []  # experience replay pool
-
+        self.warm_start = params['warm_start']
         self.experience_replay_pool_size = params.get('experience_replay_pool_size', 1000)
-        self.hidden_size = params.get('dqn_hidden_size', 60)
+        self.hidden_size = params.get('dqn_hidden_size', 80)
         self.gamma = params.get('gamma', 0.9)
         self.predict_mode = params.get('predict_mode', False)
-        self.warm_start = params.get('warm_start', 0)
 
         self.max_turn = params['max_turn'] + 4
         self.state_dimension = 2 * self.act_cardinality + 7 * self.slot_cardinality + 3 + self.max_turn
 
-        self.dqn = DQN(self.state_dimension, self.hidden_size, self.num_actions)
-        self.clone_dqn = copy.deepcopy(self.dqn)
-
-        self.cur_bellman_err = 0
-
-        # Prediction Mode: load trained DQN model
-        if params['trained_model_path'] is not None:
-            self.dqn.model = copy.deepcopy(self.load_trained_DQN(params['trained_model_path']))
-            self.clone_dqn = copy.deepcopy(self.dqn)
-            self.predict_mode = True
-            self.warm_start = 2
+        if 'trained_model_path' not in params or params['trained_model_path'] is None:
+            self.dqn = DoubleDQN(n_actions=len(self.feasible_actions), n_features=self.state_dimension,
+                                 hidden_size=self.hidden_size)
+        else:
+            self.dqn = DoubleDQN.load(params['trained_model_path'])
 
     def initialize_episode(self):
        """ Initialize a new episode. This function is called every time a new episode is run. """
@@ -75,7 +67,36 @@ def state_to_action(self, state):
         self.representation = self.prepare_state_representation(state)
         self.action = self.run_policy(self.representation)
         act_slot_response = copy.deepcopy(self.feasible_actions[self.action])
-        return {'act_slot_response': act_slot_response, 'act_slot_value_response': None}
+        return {'act_slot_response': act_slot_response, 'act_slot_value_response': None, 'act_index': self.action}
+
+    def register_experience_replay_tuple(self, s, a, r, s_, episode_over):
+        s = self.prepare_state_representation(s)
+        a = self.action
+        s_ = self.prepare_state_representation(s_)
+
+        training_example = np.hstack((s, [a, r], s_))
+        if not self.predict_mode:  # Training Mode
+            if self.warm_start == 1:
+                self.experience_replay_pool.append(training_example)
+        else:  # Prediction Mode
+            self.experience_replay_pool.append(training_example)
+
+    def train(self, batch_size=1, num_batches=100):
+        """ Train DQN with experience replay """
+
+        for iter_batch in range(num_batches):
+            self.cur_bellman_err = 0
+            iter_count = min(200, len(self.experience_replay_pool) // batch_size)
+            for _ in range(iter_count):
+                batch = [random.choice(self.experience_replay_pool) for __ in range(batch_size)]
+                batch_struct = self.dqn.learn(batch)
+                self.cur_bellman_err += batch_struct['cost']['total_cost']
+
+            print("cur bellman err %.4f, experience replay pool %s"
+                  % (float(self.cur_bellman_err) / len(self.experience_replay_pool), len(self.experience_replay_pool)))
+
+        self.dqn.replace_target_params()
+        self.experience_replay_pool = self.experience_replay_pool[-2000:]
 
     def prepare_state_representation(self, state):
         """ Create the representation for each state """
@@ -163,7 +184,7 @@ def prepare_state_representation(self, state):
         self.final_representation = np.hstack(
             [user_act_rep, user_inform_slots_rep, user_request_slots_rep, agent_act_rep, agent_inform_slots_rep,
              agent_request_slots_rep, current_slots_rep, turn_rep, turn_onehot_rep, kb_binary_rep, kb_count_rep])
-        return self.final_representation
+        return self.final_representation.flatten()
 
     def run_policy(self, representation):
         """ epsilon-greedy policy """
@@ -174,9 +195,10 @@
             if self.warm_start == 1:
                 if len(self.experience_replay_pool) > self.experience_replay_pool_size:
                     self.warm_start = 2
+
                 return self.rule_policy()
             else:
-                return self.dqn.predict(representation, {}, predict_model=True)
+                return self.dqn.choose_action(representation)
 
     def rule_policy(self):
         """ Rule Policy """
@@ -208,59 +230,3 @@ def action_index(self, act_slot_response):
         print(act_slot_response)
         raise Exception("action index not found")
         return None
-
-    def register_experience_replay_tuple(self, s_t, a_t, reward, s_tplus1, episode_over):
-        """ Register feedback from the environment, to be stored as future training data """
-
-        state_t_rep = self.prepare_state_representation(s_t)
-        action_t = self.action
-        reward_t = reward
-        state_tplus1_rep = self.prepare_state_representation(s_tplus1)
-        training_example = (state_t_rep, action_t, reward_t, state_tplus1_rep, episode_over)
-
-        if not self.predict_mode:  # Training Mode
-            if self.warm_start == 1:
-                self.experience_replay_pool.append(training_example)
-        else:  # Prediction Mode
-            self.experience_replay_pool.append(training_example)
-
-    def train(self, batch_size=1, num_batches=100):
-        """ Train DQN with experience replay """
-
-        for iter_batch in range(num_batches):
-            self.cur_bellman_err = 0
-            for _ in range(len(self.experience_replay_pool) // batch_size):
-                batch = [random.choice(self.experience_replay_pool) for __ in range(batch_size)]
-                batch_struct = self.dqn.singleBatch(batch, {'gamma': self.gamma}, self.clone_dqn)
-                self.cur_bellman_err += batch_struct['cost']['total_cost']
-
-            print("cur bellman err %.4f, experience replay pool %s"
-                  % (float(self.cur_bellman_err) / len(self.experience_replay_pool), len(self.experience_replay_pool)))
-
-    ################################################################################
-    # Debug Functions
-    ################################################################################
-    def save_experience_replay_to_file(self, path):
-        """ Save the experience replay pool to a file """
-
-        try:
-            with open(path, "wb") as f:
-                pickle.dump(self.experience_replay_pool, f)
-            print('saved model in %s' % path)
-        except Exception as e:
-            print('Error: Writing model fails: %s' % path)
-            print(e)
-
-    def load_experience_replay_from_file(self, path):
-        """ Load the experience replay pool from a file"""
-
-        self.experience_replay_pool = pickle.load(open(path, 'rb'))
-
-    def load_trained_DQN(self, path):
-        """ Load the trained DQN from a file """
-
-        trained_file = pickle.load(open(path, 'rb'))
-        model = trained_file['model']
-
-        print("trained DQN Parameters:", json.dumps(trained_file['params'], indent=2))
-        return model
diff --git a/src/deep_dialog/checkpoints/rl_agent/agt_9_performance_records.json b/src/deep_dialog/checkpoints/rl_agent/agt_9_performance_records.json
new file mode 100644
index 0000000..83b406f
--- /dev/null
+++ b/src/deep_dialog/checkpoints/rl_agent/agt_9_performance_records.json
@@ -0,0 +1 @@
+{"ave_reward": {"0": 0.0, "1": 0.0, "2": 0.8, "3": 0.0, "4": 0.0, "5": 0.0, "6": 0.0, "7": 0.0, "8": 3.2, "9": 1.6, "10": 0.0}, "ave_turns": {"0": 39.56, "1": 17.9, "2": 24.68, "3": 19.06, "4": 13.98, "5": 8.56, "6": 7.68, "7": 8.42, "8": 11.04, "9": 15.9, "10": 25.76}, "success_rate": {"0": 0.0, "1": 0.0, "2": 0.01, "3": 0.0, "4": 0.0, "5": 0.0, "6": 0.0, "7": 0.0, "8": 0.04, "9": 0.02, "10": 0.0}}
\ No newline at end of file
diff --git a/src/deep_dialog/dialog_system/dialog_manager.py b/src/deep_dialog/dialog_system/dialog_manager.py
index 6501c71..342769b 100644
--- a/src/deep_dialog/dialog_system/dialog_manager.py
+++ b/src/deep_dialog/dialog_system/dialog_manager.py
@@ -21,6 +21,7 @@ def __init__(self, agent, user, act_set, slot_set, movie_dictionary):
         self.user_action = None
         self.reward = 0
         self.episode_over = False
+        self.current_reward_function = self.reward_function_without_penalty
 
     def initialize_episode(self):
         """ Refresh state for new dialog """
@@ -60,7 +61,7 @@ def next_turn(self, record_training_data=True):
         ########################################################################
         self.sys_action = self.state_tracker.dialog_history_dictionaries()[-1]
         self.user_action, self.episode_over, dialog_status = self.user.next(self.sys_action)
-        self.reward = self.reward_function(dialog_status)
+        self.reward = self.current_reward_function(dialog_status)
 
         ########################################################################
         # Update state tracker with latest user action
diff --git a/src/deep_dialog/qlearning/dqn.py b/src/deep_dialog/qlearning/dqn.py
index dd2a98e..9357ee7 100644
--- a/src/deep_dialog/qlearning/dqn.py
+++ b/src/deep_dialog/qlearning/dqn.py
@@ -1,282 +1,154 @@
 """
-Created on Jun 18, 2016
+The double DQN based on this paper: https://arxiv.org/abs/1509.06461
 
-@author: xiul
+Based on: https://morvanzhou.github.io/tutorials/
+
+Using:
+Tensorflow: 1.0
+gym: 0.8.0
 """
 
-from .utils import *
-
-
-class DQN:
-    def __init__(self, input_size, hidden_size, output_size):
-        self.model = {}
-        # input-hidden
-        self.model['Wxh'] = initWeight(input_size, hidden_size)
-        self.model['bxh'] = np.zeros((1, hidden_size))
-
-        # hidden-output
-        self.model['Wd'] = initWeight(hidden_size, output_size) * 0.1
-        self.model['bd'] = np.zeros((1, output_size))
-
-        self.update = ['Wxh', 'bxh', 'Wd', 'bd']
-        self.regularize = ['Wxh', 'Wd']
-
-        self.step_cache = {}
-
-    def getStruct(self):
-        return {'model': self.model, 'update': self.update, 'regularize': self.regularize}
-
-    def fwdPass(self, Xs, params, **kwargs):
-        """Activation Function: Sigmoid, or tanh, or ReLu"""
-        predict_mode = kwargs.get('predict_mode', False)
-        active_func = params.get('activation_func', 'relu')
-
-        # input layer to hidden layer
-        Wxh = self.model['Wxh']
-        bxh = self.model['bxh']
-        Xsh = Xs.dot(Wxh) + bxh
-
-        hidden_size = self.model['Wd'].shape[0]  # size of hidden layer
-        H = np.zeros((1, hidden_size))  # hidden layer representation
-
-        if active_func == 'sigmoid':
-            H = 1 / (1 + np.exp(-Xsh))
-        elif active_func == 'tanh':
-            H = np.tanh(Xsh)
-        elif active_func == 'relu':  # ReLU
-            H = np.maximum(Xsh, 0)
-        else:  # no activation function
-            H = Xsh
-
-        # decoder at the end; hidden layer to output layer
-        Wd = self.model['Wd']
-        bd = self.model['bd']
-        Y = H.dot(Wd) + bd
-
-        # cache the values in forward pass, we expect to do a backward pass
-        cache = {}
-        if not predict_mode:
-            cache['Wxh'] = Wxh
-            cache['Wd'] = Wd
-            cache['Xs'] = Xs
-            cache['Xsh'] = Xsh
-            cache['H'] = H
-
-            cache['bxh'] = bxh
-            cache['bd'] = bd
-            cache['activation_func'] = active_func
-
-            cache['Y'] = Y
-
-        return Y, cache
-
-    def bwdPass(self, dY, cache):
-        Wd = cache['Wd']
-        H = cache['H']
-        Xs = cache['Xs']
-        Xsh = cache['Xsh']
-        Wxh = cache['Wxh']
-
-        active_func = cache['activation_func']
-        n, d = H.shape
-
-        dH = dY.dot(Wd.transpose())
-        # backprop the decoder
-        dWd = H.transpose().dot(dY)
-        dbd = np.sum(dY, axis=0, keepdims=True)
-
-        dXsh = np.zeros(Xsh.shape)
-        dXs = np.zeros(Xs.shape)
-
-        if active_func == 'sigmoid':
-            dH = (H - H ** 2) * dH
-        elif active_func == 'tanh':
-            dH = (1 - H ** 2) * dH
-        elif active_func == 'relu':
-            dH = (H > 0) * dH  # backprop ReLU
+import numpy as np
+import tensorflow as tf
+import pickle
+
+np.random.seed(1)
+tf.set_random_seed(1)
+
+class DoubleDQN:
+    def __init__(self,**kwargs):
+        self.kwargs = kwargs
+
+        self.hidden_size = kwargs.get('hidden_size', 80)
+        self.n_actions = kwargs.get('n_actions')
+        self.n_features = kwargs.get('n_features')
+        self.lr = kwargs.get('learning_rate', 0.001)
+        self.gamma = kwargs.get('reward_decay', 0.999)
+        self.epsilon_max = kwargs.get('e_greedy', 0.9)
+        self.batch_size = kwargs.get('batch_size', 16)
+        self.epsilon_increment = kwargs.get('e_greedy_increment')
+        self.epsilon = 1
+
+        self.double_q = kwargs.get('double_q', True)  # decide to use double q or not
+
+        sess = kwargs.get('sess')
+        self._build_net()
+        if sess is None:
+            self.sess = tf.Session()
+            self.sess.run(tf.global_variables_initializer())
         else:
-            dH = dH
-
-        # backprop to the input-hidden connection
-        dWxh = Xs.transpose().dot(dH)
-        dbxh = np.sum(dH, axis=0, keepdims=True)
-
-        # backprop to the input
-        dXsh = dH
-        dXs = dXsh.dot(Wxh.transpose())
-
-        return {'Wd': dWd, 'bd': dbd, 'Wxh': dWxh, 'bxh': dbxh}
-
-
-    def batchForward(self, batch, params, predict_mode=False):
-        """batch Forward & Backward Pass"""
-        caches = []
-        Ys = []
-        for i, x in enumerate(batch):
-            Xs = np.array([x['cur_states']], dtype=float)
-
-            Y, out_cache = self.fwdPass(Xs, params, predict_mode=predict_mode)
-            caches.append(out_cache)
-            Ys.append(Y)
-
-        # back up information for efficient backprop
-        cache = {}
-        if not predict_mode:
-            cache['caches'] = caches
-
-        return Ys, cache
-
-    def batchDoubleForward(self, batch, params, clone_dqn, predict_mode=False):
-        caches = []
-        Ys = []
-        tYs = []
+            self.sess = sess
+        if kwargs.get('output_graph'):
+            tf.summary.FileWriter("logs/", self.sess.graph)
+        self.cost_his = []
+
+    def _build_net(self):
+        def build_layers(s, c_names, n_l1, w_initializer, b_initializer):
+            with tf.variable_scope('l1'):
+                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
+                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
+                l1 = tf.nn.relu(tf.matmul(s, w1) + b1)
+
+            with tf.variable_scope('l2'):
+                w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
+                b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
+                out = tf.matmul(l1, w2) + b2
+            return out
+        # ------------------ build evaluate_net ------------------
+        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input
+        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')  # for calculating loss
+
+        with tf.variable_scope('eval_net'):
+            c_names, n_l1, w_initializer, b_initializer = \
+                ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], self.hidden_size, \
+                tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)  # config of layers
+
+            self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer)
+
+        with tf.variable_scope('loss'):
+            self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
+        with tf.variable_scope('train'):
+            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
+
+        # ------------------ build target_net ------------------
+        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')  # input
+        with tf.variable_scope('target_net'):
+            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
+
+            self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer)
+
+    def choose_action(self, observation):
+        if len(observation.shape)!=2:
+            observation = observation[np.newaxis, :]
+
+        actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
+        action = np.argmax(actions_value)
+
+        if not hasattr(self, 'q'):  # record action value it gets
+            self.q = []
+            self.running_q = 0
+        self.running_q = self.running_q*0.99 + 0.01 * np.max(actions_value)
+        self.q.append(self.running_q)
+
+        if np.random.uniform() > self.epsilon:  # choosing action
+            action = np.random.randint(0, self.n_actions)
+        return action
+
+    def replace_target_params(self):
+        t_params = tf.get_collection('target_net_params')
+        e_params = tf.get_collection('eval_net_params')
+        self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])
+
+        print('\ntarget_params_replaced\n')
+
+    def learn(self, batch_memory):
+        batch_memory = np.array(batch_memory)
+        q_next, q_eval4next = self.sess.run(
+            [self.q_next, self.q_eval],
+            feed_dict={self.s_: batch_memory[:, -self.n_features:],  # next observation
+                       self.s: batch_memory[:, -self.n_features:]})  # next observation
+        q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_features]})
+
+        q_target = q_eval.copy()
+
+        batch_index = np.arange(self.batch_size, dtype=np.int32)
+        eval_act_index = batch_memory[:, self.n_features].astype(int)
+        reward = batch_memory[:, self.n_features + 1]
+
+        if self.double_q:
+            max_act4next = np.argmax(q_eval4next, axis=1)  # the action that brings the highest value is evaluated by q_eval
+            selected_q_next = q_next[batch_index, max_act4next]  # Double DQN, select q_next depending on above actions
+        else:
+            selected_q_next = np.max(q_next, axis=1)  # the natural DQN
 
-        for i, x in enumerate(batch):
-            Xs = x[0]
-            Y, out_cache = self.fwdPass(Xs, params, predict_mode=predict_mode)
-            caches.append(out_cache)
-            Ys.append(Y)
+        q_target[batch_index, eval_act_index] = reward + self.gamma * selected_q_next
 
-            tXs = x[3]
-            tY, t_cache = clone_dqn.fwdPass(tXs, params, predict_mode=False)
+        _, self.cost = self.sess.run([self._train_op, self.loss],
+                                     feed_dict={self.s: batch_memory[:, :self.n_features],
+                                                self.q_target: q_target})
+        self.cost_his.append(self.cost)
 
-            tYs.append(tY)
+        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
+        return {'cost': {'total_cost': self.cost}}
 
-        # back up information for efficient backprop
-        cache = {}
-        if not predict_mode:
-            cache['caches'] = caches
+    def save(self, filepath):
+        self.kwargs['sess'] = None
+        pickle.dump(self.kwargs, open(filepath + '.params', 'wb'))
 
-        return Ys, cache, tYs
+        saver = tf.train.Saver(max_to_keep=1)
+        # Save model weights to disk
+        save_path= saver.save(self.sess, filepath, global_step=0)
+        print('Save path = {}'.format(save_path))
 
-    def batchBackward(self, dY, cache):
-        caches = cache['caches']
+    @classmethod
+    def load(self, filepath):
+        sess = tf.Session()
+        sess.run(tf.global_variables_initializer())
 
-        grads = {}
-        for i in range(len(caches)):
-            single_cache = caches[i]
-            local_grads = self.bwdPass(dY[i], single_cache)
-            mergeDicts(grads, local_grads)  # add up the gradients wrt model parameters
+        constructor_params = pickle.load(open(filepath.split('.tf')[0]+'.tf.params', 'rb'))
+        constructor_params['sess'] = sess
+        result = DoubleDQN(**constructor_params)
+        saver = tf.train.Saver()
+        sess = saver.restore(sess, filepath)
+        return result
 
-        return grads
-
-    def costFunc(self, batch, params, clone_dqn):
-        """ cost function, returns cost and gradients for model """
-
-        regc = params.get('reg_cost', 1e-3)
-        gamma = params.get('gamma', 0.9)
-
-        # batch forward
-        Ys, caches, tYs = self.batchDoubleForward(batch, params, clone_dqn, predict_mode=False)
-
-        loss_cost = 0.0
-        dYs = []
-        for i, x in enumerate(batch):
-            Y = Ys[i]
-            nY = tYs[i]
-
-            action = np.array(x[1], dtype=int)
-            reward = np.array(x[2], dtype=float)
-
-            n_action = np.nanargmax(nY[0])
-            max_next_y = nY[0][n_action]
-
-            eposide_terminate = x[4]
-
-            target_y = reward
-            if eposide_terminate != True: target_y += gamma * max_next_y
-
-            pred_y = Y[0][action]
-
-            nY = np.zeros(nY.shape)
-            nY[0][action] = target_y
-            Y = np.zeros(Y.shape)
-            Y[0][action] = pred_y
-
-            # Cost Function
-            loss_cost += (target_y - pred_y) ** 2
-
-            dY = -(nY - Y)
-            # dY = np.minimum(dY, 1)
-            # dY = np.maximum(dY, -1)
-            dYs.append(dY)
-
-        # backprop the RNN
-        grads = self.batchBackward(dYs, caches)
-
-        # add L2 regularization cost and gradients
-        reg_cost = 0.0
-        if regc > 0:
-            for p in self.regularize:
-                mat = self.model[p]
-                reg_cost += 0.5 * regc * np.sum(mat * mat)
-                grads[p] += regc * mat
-
-        # normalize the cost and gradient by the batch size
-        batch_size = len(batch)
-        reg_cost /= batch_size
-        loss_cost /= batch_size
-        for k in grads: grads[k] /= batch_size
-
-        out = {}
-        out['cost'] = {'reg_cost': reg_cost, 'loss_cost': loss_cost, 'total_cost': loss_cost + reg_cost}
-        out['grads'] = grads
-        return out
-
-    """ A single batch """
-
-    def singleBatch(self, batch, params, clone_dqn):
-        learning_rate = params.get('learning_rate', 0.001)
-        decay_rate = params.get('decay_rate', 0.999)
-        momentum = params.get('momentum', 0.1)
-        grad_clip = params.get('grad_clip', -1e-3)
-        smooth_eps = params.get('smooth_eps', 1e-8)
-        sdg_type = params.get('sdgtype', 'rmsprop')
-        activation_func = params.get('activation_func', 'relu')
-
-        for u in self.update:
-            if not u in self.step_cache:
-                self.step_cache[u] = np.zeros(self.model[u].shape)
-
-        cg = self.costFunc(batch, params, clone_dqn)
-
-        cost = cg['cost']
-        grads = cg['grads']
-
-        # clip gradients if needed
-        if activation_func.lower() == 'relu':
-            if grad_clip > 0:
-                for p in self.update:
-                    if p in grads:
-                        grads[p] = np.minimum(grads[p], grad_clip)
-                        grads[p] = np.maximum(grads[p], -grad_clip)
-
-        # perform parameter update
-        for p in self.update:
-            if p in grads:
-                if sdg_type == 'vanilla':
-                    if momentum > 0:
-                        dx = momentum * self.step_cache[p] - learning_rate * grads[p]
-                    else:
-                        dx = -learning_rate * grads[p]
-                    self.step_cache[p] = dx
-                elif sdg_type == 'rmsprop':
-                    self.step_cache[p] = self.step_cache[p] * decay_rate + (1.0 - decay_rate) * grads[p] ** 2
-                    dx = -(learning_rate * grads[p]) / np.sqrt(self.step_cache[p] + smooth_eps)
-                elif sdg_type == 'adgrad':
-                    self.step_cache[p] += grads[p] ** 2
-                    dx = -(learning_rate * grads[p]) / np.sqrt(self.step_cache[p] + smooth_eps)
-
-                self.model[p] += dx
-
-        out = {}
-        out['cost'] = cost
-        return out
-
-    """ prediction """
-
-    def predict(self, Xs, params, **kwargs):
-        Ys, caches = self.fwdPass(Xs, params, predict_model=True)
-        pred_action = np.argmax(Ys)
-
-        return pred_action
diff --git a/src/draw_learning_curve.py b/src/draw_learning_curve.py
index 8077663..ad66d49 100644
--- a/src/draw_learning_curve.py
+++ b/src/draw_learning_curve.py
@@ -14,11 +14,15 @@ def read_performance_records(path):
     """ load the performance score (.json) file """
-    data = json.load(open(path, 'rb'))
+    data = json.load(open(path, 'rt'))
+    numbers = {'x': [], 'success_rate': [], 'ave_turns': [], 'ave_rewards': []}
 
     for key in data['success_rate'].keys():
         if int(key) > -1:
+            numbers['x'].append(key)
             print("%s\t%s\t%s\t%s" % (
                 key, data['success_rate'][key], data['ave_turns'][key], data['ave_reward'][key]))
+            numbers['success_rate'].append(data['success_rate'][key])
+    return numbers
 
 
 def load_performance_file(path):
     """ load the performance score (.json) file """
@@ -54,9 +58,10 @@ def main(params):
 
     if cmd == 0:
         numbers = load_performance_file(params['result_file'])
-        draw_learning_curve(numbers)
     elif cmd == 1:
-        read_performance_records(params['result_file'])
+        numbers = read_performance_records(params['result_file'])
+
+    draw_learning_curve(numbers)
 
 
 if __name__ == "__main__":
@@ -65,7 +70,7 @@ def main(params):
     parser.add_argument('--cmd', dest='cmd', type=int, default=1, help='cmd')
 
    parser.add_argument('--result_file', dest='result_file', type=str,
-                        default='./deep_dialog/checkpoints/rl_agent/11142016/noe2e/agt_9_performance_records.json',
+                        default='./deep_dialog/checkpoints/rl_agent/agt_9_performance_records.json',
                         help='path to the result file')
 
     args = parser.parse_args()
diff --git a/src/requirements.txt b/src/requirements.txt
index 9fdcb92..f6f42ea 100644
--- a/src/requirements.txt
+++ b/src/requirements.txt
@@ -1,2 +1,3 @@
 numpy
-fuzzywuzzy
\ No newline at end of file
+fuzzywuzzy
+tensorflow
\ No newline at end of file
diff --git a/src/run.py b/src/run.py
index 2e565ca..014be3c 100644
--- a/src/run.py
+++ b/src/run.py
@@ -4,37 +4,37 @@
 This should be a simple minimalist run file. It's only responsibility should be to parse the arguments (which agent,
 user simulator to use) and launch a dialog simulation.
 
-Rule-agent: python run.py --agt 6 --usr 1 --max_turn 40 --episodes 150 --movie_kb_path ./deep_dialog/data/movie_kb.1k.p
+Rule-agent: python run.py --agt 6 --usr 1 --max_turn 40 --episodes 150 --movie_kb_path ./deep_dialog/data/movie_kb.1k.json
 --run_mode 2
 
 movie_kb:
-movie_kb.1k.p: 94% success rate
-movie_kb.v2.p: 36% success rate
+movie_kb.1k.json: 94% success rate
+movie_kb.v2.json: 36% success rate
 
 user goal files:
-first turn: user_goals_first_turn_template.v2.p
-all turns: user_goals_all_turns_template.p
-user_goals_first_turn_template.part.movie.v1.p: a subset of user goal. [Please use this one, the upper bound success
+first turn: user_goals_first_turn_template.v2.json
+all turns: user_goals_all_turns_template.json
+user_goals_first_turn_template.part.movie.v1.json: a subset of user goal. [Please use this one, the upper bound success
 rate on movie_kb.1k.json is 0.9765.]
 
 Commands:
-Rule: python run.py --agt 5 --usr 1 --max_turn 40 --episodes 150 --movie_kb_path ./deep_dialog/data/movie_kb.1k.p
---goal_file_path ./deep_dialog/data/user_goals_first_turn_template.part.movie.v1.p --intent_err_prob 0.00
+Rule: python run.py --agt 5 --usr 1 --max_turn 40 --episodes 150 --movie_kb_path ./deep_dialog/data/movie_kb.1k.json
+--goal_file_path ./deep_dialog/data/user_goals_first_turn_template.part.movie.v1.json --intent_err_prob 0.00
 --slot_err_prob 0.00 --episodes 500 --act_level 1 --run_mode 1
 
 Training:
-RL: python run.py --agt 9 --usr 1 --max_turn 40 --movie_kb_path ./deep_dialog/data/movie_kb.1k.p --dqn_hidden_size 80
+RL: python run.py --agt 9 --usr 1 --max_turn 40 --movie_kb_path ./deep_dialog/data/movie_kb.1k.json --dqn_hidden_size 80
 --experience_replay_pool_size 1000 --episodes 500 --simulation_epoch_size 100 --write_model_dir
 ./deep_dialog/checkpoints/rl_agent --run_mode 3 --act_level 0 --slot_err_prob 0.05 --intent_err_prob 0.00
---batch_size 16 --goal_file_path ./deep_dialog/data/user_goals_first_turn_template.part.movie.v1.p --warm_start 1
+--batch_size 16 --goal_file_path ./deep_dialog/data/user_goals_first_turn_template.part.movie.v1.json --warm_start 1
 --warm_start_epochs 120
 
 Predict:
-RL: python run.py --agt 9 --usr 1 --max_turn 40 --movie_kb_path ./deep_dialog/data/movie_kb.1k.p --dqn_hidden_size 80
+RL: python run.py --agt 9 --usr 1 --max_turn 40 --movie_kb_path ./deep_dialog/data/movie_kb.1k.json --dqn_hidden_size 80
 --experience_replay_pool_size 1000 --episodes 300 --simulation_epoch_size 100 --write_model_dir
 ./deep_dialog/checkpoints/rl_agent --slot_err_prob 0.00 --intent_err_prob 0.00 --batch_size 16 --goal_file_path
-./deep_dialog/data/user_goals_first_turn_template.part.movie.v1.p --episodes 200 --trained_model_path
-./deep_dialog/checkpoints/rl_agent/agt_9_22_30_0.37000.p --run_mode 3
+./deep_dialog/data/user_goals_first_turn_template.part.movie.v1.json --episodes 200 --trained_model_path
+./deep_dialog/checkpoints/rl_agent/agt_9_0.79.tf-0 --run_mode 3 --train 0
 
 @author: xiul, t-zalipt
 """
@@ -72,16 +72,16 @@ def load_file(file_name):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('--dict_path', dest='dict_path', type=str, default='./deep_dialog/data/dicts.v3.p',
+    parser.add_argument('--dict_path', dest='dict_path', type=str, default='./deep_dialog/data/dicts.v3.json',
                         help='path to the .json dictionary file')
-    parser.add_argument('--movie_kb_path', dest='movie_kb_path', type=str, default='./deep_dialog/data/movie_kb.1k.p',
+    parser.add_argument('--movie_kb_path', dest='movie_kb_path', type=str, default='./deep_dialog/data/movie_kb.1k.json',
                         help='path to the movie kb .json file')
     parser.add_argument('--act_set', dest='act_set', type=str, default='./deep_dialog/data/dia_acts.txt',
                         help='path to dia act set; none for loading from labeled file')
     parser.add_argument('--slot_set', dest='slot_set', type=str, default='./deep_dialog/data/slot_set.txt',
                         help='path to slot set; none for loading from labeled file')
     parser.add_argument('--goal_file_path', dest='goal_file_path', type=str,
-                        default='./deep_dialog/data/user_goals_first_turn_template.part.movie.v1.p',
+                        default='./deep_dialog/data/user_goals_first_turn_template.part.movie.v1.json',
                         help='a list of user goals')
     parser.add_argument('--diaact_nl_pairs', dest='diaact_nl_pairs', type=str,
                         default='./deep_dialog/data/dia_act_nl_pairs.v6.json',
@@ -145,6 +145,9 @@ def load_file(file_name):
     parser.add_argument('--save_check_point', dest='save_check_point', type=int, default=10,
                         help='number of epochs for saving model')
 
+    parser.add_argument('--train', dest='train', type=int, default=1,
+                        help='0 - predict, 1 - train')
+
     parser.add_argument('--success_rate_threshold', dest='success_rate_threshold', type=float, default=0.3,
                         help='the threshold for success rate')
 
@@ -160,6 +163,8 @@ def load_file(file_name):
 agt = params['agt']
 usr = params['usr']
 
+is_training_mode = params['train']
+
 dict_path = params['dict_path']
 goal_file_path = params['goal_file_path']
 
@@ -282,7 +287,7 @@ def load_file(file_name):
 """ Best Model and Performance Records """
 best_model = {}
 best_res = {'success_rate': 0, 'ave_reward': float('-inf'), 'ave_turns': float('inf'), 'epoch': 0}
-best_model['model'] = copy.deepcopy(agent)
+best_model['model'] = agent
 best_res['success_rate'] = 0
 
 performance_records = {'success_rate': {}, 'ave_turns': {}, 'ave_reward': {}}
@@ -291,13 +296,13 @@
 
 def save_model(path, agt, success_rate, agent, best_epoch, cur_epoch):
     """ Save model """
-    filename = 'agt_%s_%s_%s_%.5f.p' % (agt, best_epoch, cur_epoch, success_rate)
+    filename = 'agt_%s_%.2f.tf' % (agt, success_rate)
     filepath = os.path.join(path, filename)
     checkpoint = {}
-    if agt == 9: checkpoint['model'] = copy.deepcopy(agent.dqn.model)
+    if agt == 9: checkpoint['model'] = agent.dqn
     checkpoint['params'] = params
     try:
-        pickle.dump(checkpoint, open(filepath, "wb"))
+        agent.dqn.save(filepath)
         print('saved model in %s' % (filepath,))
     except Exception as e:
         print('Error: Writing model fails: %s' % (filepath,))
@@ -310,7 +315,7 @@ def save_performance_records(path, agt, records):
     filename = 'agt_%s_performance_records.json' % (agt)
     filepath = os.path.join(path, filename)
     try:
-        json.dump(records, open(filepath, "wb"))
+        json.dump(records, open(filepath, "w"))
         print('saved model in %s' % (filepath,))
     except Exception as e:
         print('Error: Writing model fails: %s' % (filepath,))
@@ -372,9 +377,9 @@ def warm_start_simulation():
             break
 
     agent.warm_start = 2
-    res['success_rate'] = float(successes) / simulation_epoch_size
-    res['ave_reward'] = float(cumulative_reward) / simulation_epoch_size
-    res['ave_turns'] = float(cumulative_turns) / simulation_epoch_size
+    res['success_rate'] = float(successes) / warm_start_epochs
+    res['ave_reward'] = float(cumulative_reward) / warm_start_epochs
+    res['ave_turns'] = float(cumulative_turns) / warm_start_epochs
     print("Warm_Start %s epochs, success rate %s, ave reward %s, ave turns %s"
           % (episode + 1, res['success_rate'], res['ave_reward'], res['ave_turns']))
     print("Current experience replay buffer size %s" % (len(agent.experience_replay_pool)))
@@ -390,6 +395,9 @@ def run_episodes(count, status):
         warm_start_simulation()
         print('warm_start finished, start RL training ...')
 
+    if agt == 9 and params['trained_model_path']:
+        agent.warm_start = 2
+
     for episode in range(count):
         print("Episode: %s" % episode)
         dialog_manager.initialize_episode()
@@ -409,7 +417,7 @@ def run_episodes(count, status):
                 cumulative_turns += dialog_manager.state_tracker.turn_count
 
         # simulation
-        if agt == 9 and not params['trained_model_path']:
+        if agt ==9 and is_training_mode:
             agent.predict_mode = True
             simulation_res = simulation_epoch(simulation_epoch_size)
 
@@ -418,18 +426,21 @@ def run_episodes(count, status):
             performance_records['ave_reward'][episode] = simulation_res['ave_reward']
 
             if simulation_res['success_rate'] >= best_res['success_rate']:
-                if simulation_res['success_rate'] >= success_rate_threshold:  # threshold = 0.30
-                    agent.experience_replay_pool = []
-                    simulation_epoch(simulation_epoch_size)
+                if simulation_res['success_rate'] >= success_rate_threshold:  # threshold = 0.3
+                    #agent.experience_replay_pool = []
+                    dialog_manager.current_reward_function = dialog_manager.reward_function
+                    simulation_epoch(simulation_epoch_size)
 
 
             if simulation_res['success_rate'] > best_res['success_rate']:
-                best_model['model'] = copy.deepcopy(agent)
+                best_model['model'] = agent
                 best_res['success_rate'] = simulation_res['success_rate']
                 best_res['ave_reward'] = simulation_res['ave_reward']
                 best_res['ave_turns'] = simulation_res['ave_turns']
                 best_res['epoch'] = episode
 
-            agent.clone_dqn = copy.deepcopy(agent.dqn)
+            save_model(params['write_model_dir'], agt, best_res['success_rate'], best_model['model'],
+                       best_res['epoch'], episode)
+
             agent.train(batch_size, 1)
             agent.predict_mode = False
@@ -438,8 +449,6 @@ def run_episodes(count, status):
                                                performance_records['ave_turns'][episode], best_res['success_rate']))
             if episode % save_check_point == 0 \
                     and not params['trained_model_path']:  # save the model every 10 episodes
-                save_model(params['write_model_dir'], agt, best_res['success_rate'], best_model['model'],
-                           best_res['epoch'], episode)
                 save_performance_records(params['write_model_dir'], agt, performance_records)
 
     print("Progress: %s / %s, Success rate: %s / %s Avg reward: %.2f Avg turns: %.2f"
@@ -451,8 +460,6 @@ def run_episodes(count, status):
     status['count'] += count
 
     if agt == 9 and not params['trained_model_path']:
-        save_model(params['write_model_dir'], agt, float(successes) / count, best_model['model'], best_res['epoch'],
-                   count)
         save_performance_records(params['write_model_dir'], agt, performance_records)
 
diff --git a/src/telegram_bot/bot.py b/src/telegram_bot/bot.py
index 38abd83..dee7297 100644
--- a/src/telegram_bot/bot.py
+++ b/src/telegram_bot/bot.py
@@ -7,12 +7,12 @@ import json
 
 # from deep_dialog.agents import AgentDQN
 # from deep_dialog.agents import agent_baselines
-from deep_dialog.nlg import nlg
-from deep_dialog.nlu import nlu
-from deep_dialog.usersims.realUser import RealUser
-from telegramDialogClasses import TelegramDialogManager
-from telegramDialogClasses import RuleAgent
-from deep_dialog.dialog_system.dict_reader import text_to_dict
+from src.deep_dialog.nlg import nlg
+from src.deep_dialog.nlu import nlu
+from src.deep_dialog.usersims.realUser import RealUser
+from src.telegram_bot.telegramDialogClasses import TelegramDialogManager
+from src.telegram_bot.telegramDialogClasses import RuleAgent
+from src.deep_dialog.dialog_system.dict_reader import text_to_dict
 
 import telebot
 import cherrypy
@@ -31,7 +31,7 @@ def load_file(file_name):
 
 WEBHOOKS_AVAIL = False
 
-config = load_file("/Users/fogside/Projects/Telegram_bot/RL-Dialog-Bot/src/telegram_bot/config.json")
+config = load_file("config.json")
 bot = telebot.TeleBot(config["token"])
 
 turn_count = 0
@@ -74,25 +74,24 @@ def get_random_emoji(num = 1):
 ######### --****--- End of initialization ---****-- ###########
 
 ######### --****--- Next code is for debugging ---****-- ############
-#
-# dia_manager.initialize_episode()
-# turn_count+=1
-#
-# print("Hello! I can help you to buy tickets to the cinema.\nWhat film would you like to watch?")
-#
-# while(turn_count > 0):
-#     msg = input()
-#     episode_over, agent_ans = dia_manager.next_turn(msg)
-#     turn_count+=1
-#     # bot.send_message(msg, agent_ans+' ' + get_random_emoji(1))
-#     print("turn #{}: {}".format(turn_count, agent_ans))
-#     if episode_over:
-#         turn_count = 0
-#     if msg == 'stop':
-#         turn_count = 0
-#
-# exit(0)
-#
+
+dia_manager.initialize_episode()
+turn_count+=1
+
+print("Hello! I can help you to buy tickets to the cinema.\nWhat film would you like to watch?")
+
+while(turn_count > 0):
+    msg = input()
+    episode_over, agent_ans = dia_manager.next_turn(msg)
+    turn_count+=1
+    print("turn #{}: {}".format(turn_count, agent_ans))
+    if episode_over:
+        turn_count = 0
+    if msg == 'stop':
+        turn_count = 0
+
+exit(0)
+
 ####### --** End of Debugging **-- ########