39 changes: 22 additions & 17 deletions docs/Using-Tensorboard.md
@@ -40,41 +40,46 @@ The ML-Agents training program saves the following statistics:

![Example TensorBoard Run](images/mlagents-TensorBoard.png)

* Lesson - Plots the progress from lesson to lesson. Only interesting when
### Environment Statistics

* `Environment/Lesson` - Plots the progress from lesson to lesson. Only interesting when
performing [curriculum training](Training-Curriculum-Learning.md).

* Cumulative Reward - The mean cumulative episode reward over all agents. Should
* `Environment/Cumulative Reward` - The mean cumulative episode reward over all agents. Should
increase during a successful training session.

* `Environment/Episode Length` - The mean length of each episode in the environment for all agents.

### Policy Statistics

* Entropy - How random the decisions of the model are. Should slowly decrease
* `Policy/Entropy` (PPO; BC) - How random the decisions of the model are. Should slowly decrease
during a successful training process. If it decreases too quickly, the `beta`
hyperparameter should be increased.

* Episode Length - The mean length of each episode in the environment for all
agents.

* Learning Rate - How large a step the training algorithm takes as it searches
* `Policy/Learning Rate` (PPO; BC) - How large a step the training algorithm takes as it searches
for the optimal policy. Should decrease over time.

* `Policy/Value Estimate` (PPO) - The mean value estimate for all states visited by the agent. Should increase during a successful training session.

* `Policy/Curiosity Reward` (PPO+Curiosity) - This corresponds to the mean cumulative intrinsic reward generated per-episode.

* Policy Loss - The mean magnitude of policy loss function. Correlates to how
### Learning Loss Functions

* `Losses/Policy Loss` (PPO) - The mean magnitude of the policy loss function. Correlates to how
much the policy (process for deciding actions) is changing. The magnitude of
this should decrease during a successful training session.

* Value Estimate - The mean value estimate for all states visited by the agent.
Should increase during a successful training session.

* Value Loss - The mean loss of the value function update. Correlates to how
* `Losses/Value Loss` (PPO) - The mean loss of the value function update. Correlates to how
well the model is able to predict the value of each state. This should
increase while the agent is learning, and then decrease once the reward
stabilizes.

* _(Curiosity-Specific)_ Intrinsic Reward - This corresponds to the mean
cumulative intrinsic reward generated per-episode.

* _(Curiosity-Specific)_ Forward Loss - The mean magnitude of the inverse model
* `Losses/Forward Loss` (PPO+Curiosity) - The mean magnitude of the forward model
loss function. Corresponds to how well the model is able to predict the new
observation encoding.

* _(Curiosity-Specific)_ Inverse Loss - The mean magnitude of the forward model
* `Losses/Inverse Loss` (PPO+Curiosity) - The mean magnitude of the inverse model
loss function. Corresponds to how well the model is able to predict the action
taken between two observations.

* `Losses/Cloning Loss` (BC) - The mean magnitude of the behavioral cloning loss. Corresponds to how well the model imitates the demonstration data.
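
TensorBoard groups scalar charts by the prefix before the first `/`, which is why the renamed statistics above land in separate Environment, Policy, and Losses sections. As a quick way to preview that grouping outside of a training run, the sketch below (not part of this PR; it assumes TensorFlow 1.x and a made-up log directory) writes dummy scalars under the new tag names using the same `tf.Summary` protobuf API the trainers use, after which `tensorboard --logdir=summaries` shows the three sections.

```python
# Minimal sketch (assumes TensorFlow 1.x; log directory and values are made up):
# write dummy scalars under the new hierarchical tag names so TensorBoard groups
# them into Environment, Policy, and Losses sections.
import tensorflow as tf

writer = tf.summary.FileWriter('./summaries/tag-demo')
for step in range(100):
    summary = tf.Summary()
    summary.value.add(tag='Environment/Cumulative Reward', simple_value=0.1 * step)
    summary.value.add(tag='Policy/Entropy', simple_value=1.0 / (step + 1.0))
    summary.value.add(tag='Losses/Value Loss', simple_value=1.0 / (step + 1.0))
    writer.add_summary(summary, step)
writer.flush()
writer.close()
```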
Binary file modified docs/images/mlagents-TensorBoard.png
17 changes: 9 additions & 8 deletions ml-agents/mlagents/trainers/bc/trainer.py
@@ -11,7 +11,7 @@
from mlagents.envs import AllBrainInfo
from mlagents.trainers.bc.policy import BCPolicy
from mlagents.trainers.buffer import Buffer
from mlagents.trainers.trainer import UnityTrainerException, Trainer
from mlagents.trainers.trainer import Trainer

logger = logging.getLogger("mlagents.trainers")

@@ -33,7 +33,8 @@ def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
self.n_sequences = 1
self.cumulative_rewards = {}
self.episode_steps = {}
self.stats = {'losses': [], 'episode_length': [], 'cumulative_reward': []}
self.stats = {'Losses/Cloning Loss': [], 'Environment/Episode Length': [],
'Environment/Cumulative Reward': []}

self.summary_path = trainer_parameters['summary_path']
self.batches_per_epoch = trainer_parameters['batches_per_epoch']
@@ -73,8 +74,8 @@ def get_last_reward(self):
Returns the last reward the trainer has had
:return: the new last reward
"""
if len(self.stats['cumulative_reward']) > 0:
return np.mean(self.stats['cumulative_reward'])
if len(self.stats['Environment/Cumulative Reward']) > 0:
return np.mean(self.stats['Environment/Cumulative Reward'])
else:
return 0

@@ -142,9 +143,9 @@ def process_experiences(self, current_info: AllBrainInfo, next_info: AllBrainInf
for l in range(len(info_student.agents)):
if info_student.local_done[l]:
agent_id = info_student.agents[l]
self.stats['cumulative_reward'].append(
self.stats['Environment/Cumulative Reward'].append(
self.cumulative_rewards.get(agent_id, 0))
self.stats['episode_length'].append(
self.stats['Environment/Episode Length'].append(
self.episode_steps.get(agent_id, 0))
self.cumulative_rewards[agent_id] = 0
self.episode_steps[agent_id] = 0
@@ -184,6 +185,6 @@ def update_policy(self):
loss = run_out['policy_loss']
batch_losses.append(loss)
if len(batch_losses) > 0:
self.stats['losses'].append(np.mean(batch_losses))
self.stats['Losses/Cloning Loss'].append(np.mean(batch_losses))
else:
self.stats['losses'].append(0)
self.stats['Losses/Cloning Loss'].append(0)
37 changes: 19 additions & 18 deletions ml-agents/mlagents/trainers/ppo/trainer.py
@@ -12,7 +12,7 @@
from mlagents.envs import AllBrainInfo, BrainInfo
from mlagents.trainers.buffer import Buffer
from mlagents.trainers.ppo.policy import PPOPolicy
from mlagents.trainers.trainer import UnityTrainerException, Trainer
from mlagents.trainers.trainer import Trainer

logger = logging.getLogger("mlagents.trainers")

@@ -42,12 +42,13 @@ def __init__(self, brain, reward_buff_cap, trainer_parameters, training, load, s
self.policy = PPOPolicy(seed, brain, trainer_parameters,
self.is_training, load)

stats = {'cumulative_reward': [], 'episode_length': [], 'value_estimate': [],
'entropy': [], 'value_loss': [], 'policy_loss': [], 'learning_rate': []}
stats = {'Environment/Cumulative Reward': [], 'Environment/Episode Length': [],
'Policy/Value Estimate': [], 'Policy/Entropy': [], 'Losses/Value Loss': [],
'Losses/Policy Loss': [], 'Policy/Learning Rate': []}
if self.use_curiosity:
stats['forward_loss'] = []
stats['inverse_loss'] = []
stats['intrinsic_reward'] = []
stats['Losses/Forward Loss'] = []
stats['Losses/Inverse Loss'] = []
stats['Policy/Curiosity Reward'] = []
self.intrinsic_rewards = {}
self.stats = stats

@@ -102,8 +103,8 @@ def increment_step_and_update_last_reward(self):
"""
Increment the step count of the trainer and Updates the last reward
"""
if len(self.stats['cumulative_reward']) > 0:
mean_reward = np.mean(self.stats['cumulative_reward'])
if len(self.stats['Environment/Cumulative Reward']) > 0:
mean_reward = np.mean(self.stats['Environment/Cumulative Reward'])
self.policy.update_reward(mean_reward)
self.policy.increment_step()
self.step = self.policy.get_current_step()
@@ -120,9 +121,9 @@ def take_action(self, all_brain_info: AllBrainInfo):
return [], [], [], None, None

run_out = self.policy.evaluate(curr_brain_info)
self.stats['value_estimate'].append(run_out['value'].mean())
self.stats['entropy'].append(run_out['entropy'].mean())
self.stats['learning_rate'].append(run_out['learning_rate'])
self.stats['Policy/Value Estimate'].append(run_out['value'].mean())
self.stats['Policy/Entropy'].append(run_out['entropy'].mean())
self.stats['Policy/Learning Rate'].append(run_out['learning_rate'])
if self.policy.use_recurrent:
return run_out['action'], run_out['memory_out'], None, \
run_out['value'], run_out
@@ -286,15 +287,15 @@ def process_experiences(self, current_info: AllBrainInfo, new_info: AllBrainInfo

self.training_buffer[agent_id].reset_agent()
if info.local_done[l]:
self.stats['cumulative_reward'].append(
self.stats['Environment/Cumulative Reward'].append(
self.cumulative_rewards.get(agent_id, 0))
self.reward_buffer.appendleft(self.cumulative_rewards.get(agent_id, 0))
self.stats['episode_length'].append(
self.stats['Environment/Episode Length'].append(
self.episode_steps.get(agent_id, 0))
self.cumulative_rewards[agent_id] = 0
self.episode_steps[agent_id] = 0
if self.use_curiosity:
self.stats['intrinsic_reward'].append(
self.stats['Policy/Curiosity Reward'].append(
self.intrinsic_rewards.get(agent_id, 0))
self.intrinsic_rewards[agent_id] = 0

@@ -342,11 +343,11 @@ def update_policy(self):
if self.use_curiosity:
inverse_total.append(run_out['inverse_loss'])
forward_total.append(run_out['forward_loss'])
self.stats['value_loss'].append(np.mean(value_total))
self.stats['policy_loss'].append(np.mean(policy_total))
self.stats['Losses/Value Loss'].append(np.mean(value_total))
self.stats['Losses/Policy Loss'].append(np.mean(policy_total))
if self.use_curiosity:
self.stats['forward_loss'].append(np.mean(forward_total))
self.stats['inverse_loss'].append(np.mean(inverse_total))
self.stats['Losses/Forward Loss'].append(np.mean(forward_total))
self.stats['Losses/Inverse Loss'].append(np.mean(inverse_total))
self.training_buffer.reset_update_buffer()


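Both trainers above follow the same pattern: raw per-episode values are appended to `self.stats` under the final TensorBoard tag name, and the base `Trainer.write_summary` (next file) averages each list, emits it, and clears it. A simplified, self-contained sketch of that flow (the dictionary keys mirror the PR; the values and the `print` stand-in are illustrative only):

```python
# Simplified sketch of the stats flow shared by the BC and PPO trainers.
# The tag-named keys mirror the PR; values and the print() stand-in are illustrative.
import numpy as np

stats = {'Environment/Cumulative Reward': [], 'Environment/Episode Length': []}

# During rollouts: whenever an agent finishes an episode, append the raw values.
stats['Environment/Cumulative Reward'].append(1.25)
stats['Environment/Episode Length'].append(73)

# At summary time: average each statistic, emit it under its tag, then reset it.
for tag, values in stats.items():
    if values:
        mean_value = float(np.mean(values))
        print('{}: {:.3f}'.format(tag, mean_value))  # stands in for summary.value.add(...)
        stats[tag] = []
```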
10 changes: 5 additions & 5 deletions ml-agents/mlagents/trainers/trainer.py
@@ -160,12 +160,12 @@ def write_summary(self, global_step, lesson_num=0):
"""
if global_step % self.trainer_parameters['summary_freq'] == 0 and global_step != 0:
is_training = "Training." if self.is_training and self.get_step <= self.get_max_steps else "Not Training."
if len(self.stats['cumulative_reward']) > 0:
mean_reward = np.mean(self.stats['cumulative_reward'])
if len(self.stats['Environment/Cumulative Reward']) > 0:
mean_reward = np.mean(self.stats['Environment/Cumulative Reward'])
logger.info(" {}: {}: Step: {}. Mean Reward: {:0.3f}. Std of Reward: {:0.3f}. {}"
.format(self.run_id, self.brain_name,
min(self.get_step, self.get_max_steps),
mean_reward, np.std(self.stats['cumulative_reward']),
mean_reward, np.std(self.stats['Environment/Cumulative Reward']),
is_training))
else:
logger.info(" {}: {}: Step: {}. No episode was completed since last summary. {}"
@@ -174,9 +174,9 @@ def write_summary(self, global_step, lesson_num=0):
for key in self.stats:
if len(self.stats[key]) > 0:
stat_mean = float(np.mean(self.stats[key]))
summary.value.add(tag='Info/{}'.format(key), simple_value=stat_mean)
summary.value.add(tag='{}'.format(key), simple_value=stat_mean)
self.stats[key] = []
summary.value.add(tag='Info/Lesson', simple_value=lesson_num)
summary.value.add(tag='Environment/Lesson', simple_value=lesson_num)
self.summary_writer.add_summary(summary, self.get_step)
self.summary_writer.flush()

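One way to confirm the rename end to end is to read an events file back and list its tags: everything should now appear under `Environment/`, `Policy/`, or `Losses/` rather than the old `Info/` prefix. A sketch, assuming TensorFlow 1.x; the glob pattern for the summaries directory is a guess and may need adjusting to the actual run layout:

```python
# Sketch: list the scalar tags in the newest events file to verify that the old
# 'Info/' prefix is gone. Assumes TensorFlow 1.x; the path pattern is a guess.
import glob
import tensorflow as tf

event_files = sorted(glob.glob('./summaries/**/events.out.tfevents.*', recursive=True))
assert event_files, 'no events files found under ./summaries'

tags = set()
for event in tf.train.summary_iterator(event_files[-1]):
    for value in event.summary.value:
        tags.add(value.tag)

print(sorted(tags))
# Expected shape: ['Environment/Cumulative Reward', 'Environment/Episode Length', ...]
```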