diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md
index fa0c3870ad..9e69cdfb28 100755
--- a/com.unity.ml-agents/CHANGELOG.md
+++ b/com.unity.ml-agents/CHANGELOG.md
@@ -16,6 +16,8 @@ and this project adheres to
 ### Bug Fixes
 #### com.unity.ml-agents / com.unity.ml-agents.extensions (C#)
 #### ml-agents / ml-agents-envs / gym-unity (Python)
+- Fixed a bug in multi-agent cooperative training where agents might not receive all of the states of
+terminated teammates. (#5441)
 
 ## [2.1.0-exp.1] - 2021-06-09
 ### Minor Changes
diff --git a/ml-agents/mlagents/trainers/agent_processor.py b/ml-agents/mlagents/trainers/agent_processor.py
index 8c44b11b4d..3702e44ada 100644
--- a/ml-agents/mlagents/trainers/agent_processor.py
+++ b/ml-agents/mlagents/trainers/agent_processor.py
@@ -122,8 +122,6 @@ def add_experiences(
             self._process_step(
                 terminal_step, worker_id, terminal_steps.agent_id_to_index[local_id]
             )
-            # Clear the last seen group obs when agents die.
-            self._clear_group_status_and_obs(global_id)
 
         # Iterate over all the decision steps, first gather all the group obs
         # and then create the trajectories. _add_to_group_status
@@ -135,6 +133,12 @@ def add_experiences(
             self._process_step(
                 ongoing_step, worker_id, decision_steps.agent_id_to_index[local_id]
             )
+        # Clear the last seen group obs when agents die, but only after all of the group
+        # statuses were added to the trajectory.
+        for terminal_step in terminal_steps.values():
+            local_id = terminal_step.agent_id
+            global_id = get_global_agent_id(worker_id, local_id)
+            self._clear_group_status_and_obs(global_id)
 
         for _gid in action_global_agent_ids:
             # If the ID doesn't have a last step result, the agent just reset,
diff --git a/ml-agents/mlagents/trainers/tests/mock_brain.py b/ml-agents/mlagents/trainers/tests/mock_brain.py
index cba79ecc4f..6658e6bb43 100644
--- a/ml-agents/mlagents/trainers/tests/mock_brain.py
+++ b/ml-agents/mlagents/trainers/tests/mock_brain.py
@@ -1,4 +1,4 @@
-from typing import List, Tuple
+from typing import List, Optional, Tuple
 import numpy as np
 
 from mlagents.trainers.buffer import AgentBuffer, AgentBufferKey
@@ -21,6 +21,7 @@ def create_mock_steps(
     action_spec: ActionSpec,
     done: bool = False,
     grouped: bool = False,
+    agent_ids: Optional[List[int]] = None,
 ) -> Tuple[DecisionSteps, TerminalSteps]:
     """
     Creates a mock Tuple[DecisionSteps, TerminalSteps] with observations.
@@ -43,7 +44,10 @@ def create_mock_steps(
 
     reward = np.array(num_agents * [1.0], dtype=np.float32)
     interrupted = np.array(num_agents * [False], dtype=np.bool)
-    agent_id = np.arange(num_agents, dtype=np.int32)
+    if agent_ids is not None:
+        agent_id = np.array(agent_ids, dtype=np.int32)
+    else:
+        agent_id = np.arange(num_agents, dtype=np.int32)
     _gid = 1 if grouped else 0
     group_id = np.array(num_agents * [_gid], dtype=np.int32)
     group_reward = np.array(num_agents * [0.0], dtype=np.float32)
diff --git a/ml-agents/mlagents/trainers/tests/test_agent_processor.py b/ml-agents/mlagents/trainers/tests/test_agent_processor.py
index b0c446e974..4d38b42f32 100644
--- a/ml-agents/mlagents/trainers/tests/test_agent_processor.py
+++ b/ml-agents/mlagents/trainers/tests/test_agent_processor.py
@@ -137,32 +137,54 @@ def test_group_statuses():
     )
 
     # Make terminal steps for some dead agents
-    mock_decision_steps_2, mock_terminal_steps_2 = mb.create_mock_steps(
+    _, mock_terminal_steps_2 = mb.create_mock_steps(
         num_agents=2,
         observation_specs=create_observation_specs_with_shapes([(8,)]),
         action_spec=ActionSpec.create_continuous(2),
         done=True,
         grouped=True,
+        agent_ids=[2, 3],
+    )
+    # Make decision steps continue for the other agents
+    mock_decision_steps_2, _ = mb.create_mock_steps(
+        num_agents=2,
+        observation_specs=create_observation_specs_with_shapes([(8,)]),
+        action_spec=ActionSpec.create_continuous(2),
+        done=False,
+        grouped=True,
+        agent_ids=[0, 1],
     )
 
     processor.add_experiences(
         mock_decision_steps_2, mock_terminal_steps_2, 0, fake_action_info
     )
-    fake_action_info = _create_action_info(4, mock_decision_steps.agent_id)
+    # Continue to add experiences for the remaining live agents
+    fake_action_info = _create_action_info(4, mock_decision_steps_2.agent_id)
     for _ in range(3):
         processor.add_experiences(
-            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
+            mock_decision_steps_2, mock_terminal_steps, 0, fake_action_info
         )
 
     # Assert that four trajectories have been added to the Trainer
     assert len(tqueue.put.call_args_list) == 4
-    # Last trajectory should be the longest
+
+    # Get the first trajectory, which should be that of agent 2 (one of the killed agents)
     trajectory = tqueue.put.call_args_list[0][0][-1]
+    assert len(trajectory.steps) == 3
+    # Make sure the trajectory has the right Groupmate Experiences.
+    # All three steps should contain all agents.
+    for step in trajectory.steps:
+        assert len(step.group_status) == 3
+
+    # Last trajectory should be the longest. It should be that of agent 1, one of the surviving agents.
+    trajectory = tqueue.put.call_args_list[-1][0][-1]
     assert len(trajectory.steps) == 5
-    # Make sure trajectory has the right Groupmate Experiences
+    # Make sure the trajectory has the right Groupmate Experiences.
+    # The first 3 steps should contain all of the obs (the 3rd step is also the terminal step for 2 of the agents).
     for step in trajectory.steps[0:3]:
         assert len(step.group_status) == 3
-    # After 2 agents has died
+    # After 2 agents have died, there should only be 1 group status left.
     for step in trajectory.steps[3:]:
         assert len(step.group_status) == 1
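Reviewer note: the ordering issue the agent_processor.py hunks fix can be hard to see in diff form. The sketch below is a minimal, hypothetical illustration of it; ToyProcessor and all of its names are invented stand-ins, not the real mlagents.trainers.agent_processor.AgentProcessor API.

# Minimal repro sketch of the ordering bug (hypothetical ToyProcessor,
# not the real AgentProcessor).
from typing import Dict, List

class ToyProcessor:
    def __init__(self) -> None:
        # Last seen observation per agent id, shared across the whole group.
        self.group_status: Dict[int, str] = {}
        # Per-agent record of the teammate statuses seen at each step.
        self.trajectories: Dict[int, List[Dict[int, str]]] = {}

    def add_experiences(
        self,
        decisions: Dict[int, str],  # agents still alive this step
        terminals: Dict[int, str],  # agents that died this step
        clear_before: bool,         # True = old (buggy) ordering
    ) -> None:
        # Terminal steps contribute their final obs to the group status.
        self.group_status.update(terminals)
        if clear_before:
            # Old ordering: dead agents are cleared before the survivors'
            # decision steps are processed, so their final obs is lost.
            for agent_id in terminals:
                del self.group_status[agent_id]
        self.group_status.update(decisions)
        for agent_id in decisions:
            teammates = {i: o for i, o in self.group_status.items() if i != agent_id}
            self.trajectories.setdefault(agent_id, []).append(teammates)
        if not clear_before:
            # Fixed ordering: clear only after every survivor has recorded
            # this step's full group status.
            for agent_id in terminals:
                del self.group_status[agent_id]

for clear_before in (True, False):
    p = ToyProcessor()
    p.add_experiences(decisions={0: "obs0", 1: "obs1"},
                      terminals={2: "final_obs2"},
                      clear_before=clear_before)
    print(clear_before, p.trajectories[0][-1])
# True  -> {1: 'obs1'}                    agent 2's final state is missing
# False -> {1: 'obs1', 2: 'final_obs2'}   agent 2's final state is kept

With the old ordering, a surviving agent's recorded group status omits a teammate's terminal observation; with the fixed ordering it is included, which mirrors the group_status length assertions in the updated test_group_statuses.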