39 changes: 22 additions & 17 deletions docs/Using-Tensorboard.md
@@ -40,41 +40,46 @@ The ML-Agents training program saves the following statistics:

![Example TensorBoard Run](images/mlagents-TensorBoard.png)

* Lesson - Plots the progress from lesson to lesson. Only interesting when
### Environment Statistics

* `Environment/Lesson` - Plots the progress from lesson to lesson. Only interesting when
performing [curriculum training](Training-Curriculum-Learning.md).

* Cumulative Reward - The mean cumulative episode reward over all agents. Should
* `Environment/Cumulative Reward` - The mean cumulative episode reward over all agents. Should
increase during a successful training session.

* `Environment/Episode Length` - The mean length of each episode in the environment for all agents.

### Policy Statistics

* Entropy - How random the decisions of the model are. Should slowly decrease
* `Policy/Entropy` (PPO; BC) - How random the decisions of the model are. Should slowly decrease
during a successful training process. If it decreases too quickly, the `beta`
hyperparameter should be increased.

* Episode Length - The mean length of each episode in the environment for all
agents.

* Learning Rate - How large a step the training algorithm takes as it searches
* `Policy/Learning Rate` (PPO; BC) - How large a step the training algorithm takes as it searches
for the optimal policy. Should decrease over time.

* `Policy/Value Estimate` (PPO) - The mean value estimate for all states visited by the agent. Should increase during a successful training session.

* `Policy/Curiosity Reward` (PPO+Curiosity) - This corresponds to the mean cumulative intrinsic reward generated per-episode.

* Policy Loss - The mean magnitude of policy loss function. Correlates to how
### Learning Loss Functions

* `Losses/Policy Loss` (PPO) - The mean magnitude of the policy loss function. Correlates to how
much the policy (process for deciding actions) is changing. The magnitude of
this should decrease during a successful training session.

* Value Estimate - The mean value estimate for all states visited by the agent.
Should increase during a successful training session.

* Value Loss - The mean loss of the value function update. Correlates to how
* `Losses/Value Loss` (PPO) - The mean loss of the value function update. Correlates to how
well the model is able to predict the value of each state. This should
increase while the agent is learning, and then decrease once the reward
stabilizes.

* _(Curiosity-Specific)_ Intrinsic Reward - This corresponds to the mean
cumulative intrinsic reward generated per-episode.

* _(Curiosity-Specific)_ Forward Loss - The mean magnitude of the inverse model
* `Losses/Forward Loss` (PPO+Curiosity) - The mean magnitude of the forward model
loss function. Corresponds to how well the model is able to predict the new
observation encoding.

* _(Curiosity-Specific)_ Inverse Loss - The mean magnitude of the forward model
* `Losses/Inverse Loss` (PPO+Curiosity) - The mean magnitude of the inverse model
loss function. Corresponds to how well the model is able to predict the action
taken between two observations.

* `Losses/Cloning Loss` (BC) - The mean magnitude of the behavioral cloning loss. Corresponds to how well the model imitates the demonstration data.
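
TensorBoard groups scalar charts by the prefix before the first `/`, which is why the renamed statistics above land in separate Environment, Policy, and Losses sections. As a quick way to preview that grouping outside of a training run, the sketch below (not part of this PR; it assumes TensorFlow 1.x and a made-up log directory) writes dummy scalars under the new tag names using the same `tf.Summary` protobuf API the trainers use, after which `tensorboard --logdir=summaries` shows the three sections.

```python
# Minimal sketch (assumes TensorFlow 1.x; log directory and values are made up):
# write dummy scalars under the new hierarchical tag names so TensorBoard groups
# them into Environment, Policy, and Losses sections.
import tensorflow as tf

writer = tf.summary.FileWriter('./summaries/tag-demo')
for step in range(100):
    summary = tf.Summary()
    summary.value.add(tag='Environment/Cumulative Reward', simple_value=0.1 * step)
    summary.value.add(tag='Policy/Entropy', simple_value=1.0 / (step + 1.0))
    summary.value.add(tag='Losses/Value Loss', simple_value=1.0 / (step + 1.0))
    writer.add_summary(summary, step)
writer.flush()
writer.close()
```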
Binary file modified docs/images/mlagents-TensorBoard.png
17 changes: 9 additions & 8 deletions ml-agents/mlagents/trainers/bc/trainer.py
@@ -11,7 +11,7 @@
from mlagents.envs import AllBrainInfo
from mlagents.trainers.bc.policy import BCPolicy
from mlagents.trainers.buffer import Buffer
from mlagents.trainers.trainer import UnityTrainerException, Trainer
from mlagents.trainers.trainer import Trainer

logger = logging.getLogger("mlagents.trainers")

@@ -33,7 +33,8 @@ def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
self.n_sequences = 1
self.cumulative_rewards = {}
self.episode_steps = {}
self.stats = {'losses': [], 'episode_length': [], 'cumulative_reward': []}
self.stats = {'Losses/Cloning Loss': [], 'Environment/Episode Length': [],
'Environment/Cumulative Reward': []}

self.summary_path = trainer_parameters['summary_path']
self.batches_per_epoch = trainer_parameters['batches_per_epoch']
@@ -73,8 +74,8 @@ def get_last_reward(self):
Returns the last reward the trainer has had
:return: the new last reward
"""
if len(self.stats['cumulative_reward']) > 0:
return np.mean(self.stats['cumulative_reward'])
if len(self.stats['Environment/Cumulative Reward']) > 0:
return np.mean(self.stats['Environment/Cumulative Reward'])
else:
return 0

@@ -142,9 +143,9 @@ def process_experiences(self, current_info: AllBrainInfo, next_info: AllBrainInf
for l in range(len(info_student.agents)):
if info_student.local_done[l]:
agent_id = info_student.agents[l]
self.stats['cumulative_reward'].append(
self.stats['Environment/Cumulative Reward'].append(
self.cumulative_rewards.get(agent_id, 0))
self.stats['episode_length'].append(
self.stats['Environment/Episode Length'].append(
self.episode_steps.get(agent_id, 0))
self.cumulative_rewards[agent_id] = 0
self.episode_steps[agent_id] = 0
@@ -184,6 +185,6 @@ def update_policy(self):
loss = run_out['policy_loss']
batch_losses.append(loss)
if len(batch_losses) > 0:
self.stats['losses'].append(np.mean(batch_losses))
self.stats['Losses/Cloning Loss'].append(np.mean(batch_losses))
else:
self.stats['losses'].append(0)
self.stats['Losses/Cloning Loss'].append(0)
37 changes: 19 additions & 18 deletions ml-agents/mlagents/trainers/ppo/trainer.py
@@ -12,7 +12,7 @@
from mlagents.envs import AllBrainInfo, BrainInfo
from mlagents.trainers.buffer import Buffer
from mlagents.trainers.ppo.policy import PPOPolicy
from mlagents.trainers.trainer import UnityTrainerException, Trainer
from mlagents.trainers.trainer import Trainer

logger = logging.getLogger("mlagents.trainers")

@@ -42,12 +42,13 @@ def __init__(self, brain, reward_buff_cap, trainer_parameters, training, load, s
self.policy = PPOPolicy(seed, brain, trainer_parameters,
self.is_training, load)

stats = {'cumulative_reward': [], 'episode_length': [], 'value_estimate': [],
'entropy': [], 'value_loss': [], 'policy_loss': [], 'learning_rate': []}
stats = {'Environment/Cumulative Reward': [], 'Environment/Episode Length': [],
'Policy/Value Estimate': [], 'Policy/Entropy': [], 'Losses/Value Loss': [],
'Losses/Policy Loss': [], 'Policy/Learning Rate': []}
if self.use_curiosity:
stats['forward_loss'] = []
stats['inverse_loss'] = []
stats['intrinsic_reward'] = []
stats['Losses/Forward Loss'] = []
stats['Losses/Inverse Loss'] = []
stats['Policy/Curiosity Reward'] = []
self.intrinsic_rewards = {}
self.stats = stats

@@ -102,8 +103,8 @@ def increment_step_and_update_last_reward(self):
"""
Increment the step count of the trainer and Updates the last reward
"""
if len(self.stats['cumulative_reward']) > 0:
mean_reward = np.mean(self.stats['cumulative_reward'])
if len(self.stats['Environment/Cumulative Reward']) > 0:
mean_reward = np.mean(self.stats['Environment/Cumulative Reward'])
self.policy.update_reward(mean_reward)
self.policy.increment_step()
self.step = self.policy.get_current_step()
@@ -120,9 +121,9 @@ def take_action(self, all_brain_info: AllBrainInfo):
return [], [], [], None, None

run_out = self.policy.evaluate(curr_brain_info)
self.stats['value_estimate'].append(run_out['value'].mean())
self.stats['entropy'].append(run_out['entropy'].mean())
self.stats['learning_rate'].append(run_out['learning_rate'])
self.stats['Policy/Value Estimate'].append(run_out['value'].mean())
self.stats['Policy/Entropy'].append(run_out['entropy'].mean())
self.stats['Policy/Learning Rate'].append(run_out['learning_rate'])
if self.policy.use_recurrent:
return run_out['action'], run_out['memory_out'], None, \
run_out['value'], run_out
@@ -286,15 +287,15 @@ def process_experiences(self, current_info: AllBrainInfo, new_info: AllBrainInfo

self.training_buffer[agent_id].reset_agent()
if info.local_done[l]:
self.stats['cumulative_reward'].append(
self.stats['Environment/Cumulative Reward'].append(
self.cumulative_rewards.get(agent_id, 0))
self.reward_buffer.appendleft(self.cumulative_rewards.get(agent_id, 0))
self.stats['episode_length'].append(
self.stats['Environment/Episode Length'].append(
self.episode_steps.get(agent_id, 0))
self.cumulative_rewards[agent_id] = 0
self.episode_steps[agent_id] = 0
if self.use_curiosity:
self.stats['intrinsic_reward'].append(
self.stats['Policy/Curiosity Reward'].append(
self.intrinsic_rewards.get(agent_id, 0))
self.intrinsic_rewards[agent_id] = 0

@@ -342,11 +343,11 @@ def update_policy(self):
if self.use_curiosity:
inverse_total.append(run_out['inverse_loss'])
forward_total.append(run_out['forward_loss'])
self.stats['value_loss'].append(np.mean(value_total))
self.stats['policy_loss'].append(np.mean(policy_total))
self.stats['Losses/Value Loss'].append(np.mean(value_total))
self.stats['Losses/Policy Loss'].append(np.mean(policy_total))
if self.use_curiosity:
self.stats['forward_loss'].append(np.mean(forward_total))
self.stats['inverse_loss'].append(np.mean(inverse_total))
self.stats['Losses/Forward Loss'].append(np.mean(forward_total))
self.stats['Losses/Inverse Loss'].append(np.mean(inverse_total))
self.training_buffer.reset_update_buffer()


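Both trainers above follow the same pattern: raw per-episode values are appended to `self.stats` under the final TensorBoard tag name, and the base `Trainer.write_summary` (next file) averages each list, emits it, and clears it. A simplified, self-contained sketch of that flow (the dictionary keys mirror the PR; the values and the `print` stand-in are illustrative only):

```python
# Simplified sketch of the stats flow shared by the BC and PPO trainers.
# The tag-named keys mirror the PR; values and the print() stand-in are illustrative.
import numpy as np

stats = {'Environment/Cumulative Reward': [], 'Environment/Episode Length': []}

# During rollouts: whenever an agent finishes an episode, append the raw values.
stats['Environment/Cumulative Reward'].append(1.25)
stats['Environment/Episode Length'].append(73)

# At summary time: average each statistic, emit it under its tag, then reset it.
for tag, values in stats.items():
    if values:
        mean_value = float(np.mean(values))
        print('{}: {:.3f}'.format(tag, mean_value))  # stands in for summary.value.add(...)
        stats[tag] = []
```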
10 changes: 5 additions & 5 deletions ml-agents/mlagents/trainers/trainer.py
@@ -160,12 +160,12 @@ def write_summary(self, global_step, lesson_num=0):
"""
if global_step % self.trainer_parameters['summary_freq'] == 0 and global_step != 0:
is_training = "Training." if self.is_training and self.get_step <= self.get_max_steps else "Not Training."
if len(self.stats['cumulative_reward']) > 0:
mean_reward = np.mean(self.stats['cumulative_reward'])
if len(self.stats['Environment/Cumulative Reward']) > 0:
mean_reward = np.mean(self.stats['Environment/Cumulative Reward'])
logger.info(" {}: {}: Step: {}. Mean Reward: {:0.3f}. Std of Reward: {:0.3f}. {}"
.format(self.run_id, self.brain_name,
min(self.get_step, self.get_max_steps),
mean_reward, np.std(self.stats['cumulative_reward']),
mean_reward, np.std(self.stats['Environment/Cumulative Reward']),
is_training))
else:
logger.info(" {}: {}: Step: {}. No episode was completed since last summary. {}"
@@ -174,9 +174,9 @@ def write_summary(self, global_step, lesson_num=0):
for key in self.stats:
if len(self.stats[key]) > 0:
stat_mean = float(np.mean(self.stats[key]))
summary.value.add(tag='Info/{}'.format(key), simple_value=stat_mean)
summary.value.add(tag='{}'.format(key), simple_value=stat_mean)
self.stats[key] = []
summary.value.add(tag='Info/Lesson', simple_value=lesson_num)
summary.value.add(tag='Environment/Lesson', simple_value=lesson_num)
self.summary_writer.add_summary(summary, self.get_step)
self.summary_writer.flush()

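One way to confirm the rename end to end is to read an events file back and list its tags: everything should now appear under `Environment/`, `Policy/`, or `Losses/` rather than the old `Info/` prefix. A sketch, assuming TensorFlow 1.x; the glob pattern for the summaries directory is a guess and may need adjusting to the actual run layout:

```python
# Sketch: list the scalar tags in the newest events file to verify that the old
# 'Info/' prefix is gone. Assumes TensorFlow 1.x; the path pattern is a guess.
import glob
import tensorflow as tf

event_files = sorted(glob.glob('./summaries/**/events.out.tfevents.*', recursive=True))
assert event_files, 'no events files found under ./summaries'

tags = set()
for event in tf.train.summary_iterator(event_files[-1]):
    for value in event.summary.value:
        tags.add(value.tag)

print(sorted(tags))
# Expected shape: ['Environment/Cumulative Reward', 'Environment/Episode Length', ...]
```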