Commit c830dc1

Develop environment bc fix and doc update (#1317)
* split the config into two files
* fixed the Training-ML-Agents.md doc
* added the configs for all of the IL scenes
1 parent 0c06ebf commit c830dc1


5 files changed (+156, -77 lines)


config/bc_config.yaml

Lines changed: 0 additions & 56 deletions
This file was deleted.

config/offline_bc_config.yaml

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+default:
+    trainer: offline_bc
+    batch_size: 64
+    summary_freq: 1000
+    max_steps: 5.0e4
+    batches_per_epoch: 10
+    use_recurrent: false
+    hidden_units: 128
+    learning_rate: 3.0e-4
+    num_layers: 2
+    sequence_length: 32
+    memory_size: 256
+    demo_path: ./UnitySDK/Assets/Demonstrations/<Your_Demo_File>.demo
+
+HallwayBrain:
+    trainer: offline_bc
+    max_steps: 5.0e5
+    num_epoch: 5
+    batch_size: 64
+    batches_per_epoch: 5
+    num_layers: 2
+    hidden_units: 128
+    sequence_length: 16    # duplicate key: superseded by sequence_length: 32 below (YAML keeps the last value)
+    use_recurrent: true
+    memory_size: 256
+    sequence_length: 32
+    demo_path: ./UnitySDK/Assets/Demonstrations/Hallway.demo
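To adapt this file to a new project, you add an override section named after your own Brain. A minimal sketch, assuming a hypothetical Brain asset named `MyAgentBrain` and a recording saved as `MyAgentRecording.demo` (any setting left out falls back to the `default` section):

    MyAgentBrain:                # hypothetical Brain name; match your Learning Brain asset
        trainer: offline_bc
        max_steps: 5.0e4         # stop after 50k training steps
        batch_size: 64
        batches_per_epoch: 10
        demo_path: ./UnitySDK/Assets/Demonstrations/MyAgentRecording.demo  # hypothetical recording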

config/online_bc_config.yaml

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
+default:
+    trainer: online_bc
+    brain_to_imitate: <Your_Brain_Asset_Name>
+    batch_size: 64
+    time_horizon: 64
+    summary_freq: 1000
+    max_steps: 5.0e4
+    batches_per_epoch: 10
+    use_recurrent: false
+    hidden_units: 128
+    learning_rate: 3.0e-4
+    num_layers: 2
+    sequence_length: 32
+    memory_size: 256
+
+BananaLearning:
+    trainer: online_bc
+    max_steps: 10000
+    summary_freq: 1000
+    brain_to_imitate: BananaPlayer
+    batch_size: 16
+    batches_per_epoch: 5
+    num_layers: 4
+    hidden_units: 64
+    use_recurrent: false
+    sequence_length: 16
+
+BouncerLearning:
+    trainer: online_bc
+    max_steps: 10000
+    summary_freq: 10
+    brain_to_imitate: BouncerPlayer
+    batch_size: 16
+    batches_per_epoch: 1
+    num_layers: 1
+    hidden_units: 64
+    use_recurrent: false
+    sequence_length: 16
+
+HallwayLearning:
+    trainer: online_bc
+    max_steps: 10000
+    summary_freq: 1000
+    brain_to_imitate: HallwayPlayer
+    batch_size: 16
+    batches_per_epoch: 5
+    num_layers: 4
+    hidden_units: 64
+    use_recurrent: false
+    sequence_length: 16
+
+PushBlockLearning:
+    trainer: online_bc
+    max_steps: 10000
+    summary_freq: 1000
+    brain_to_imitate: PushBlockPlayer
+    batch_size: 16
+    batches_per_epoch: 5
+    num_layers: 4
+    hidden_units: 64
+    use_recurrent: false
+    sequence_length: 16
+
+PyramidsLearning:
+    trainer: online_bc
+    max_steps: 10000
+    summary_freq: 1000
+    brain_to_imitate: PyramidsPlayer
+    batch_size: 16
+    batches_per_epoch: 5
+    num_layers: 4
+    hidden_units: 64
+    use_recurrent: false
+    sequence_length: 16
+
+TennisLearning:
+    trainer: online_bc
+    max_steps: 10000
+    summary_freq: 1000
+    brain_to_imitate: TennisPlayer
+    batch_size: 16
+    batches_per_epoch: 5
+    num_layers: 4
+    hidden_units: 64
+    use_recurrent: false
+    sequence_length: 16
+
+StudentBrain:
+    trainer: online_bc
+    max_steps: 10000
+    summary_freq: 1000
+    brain_to_imitate: TeacherBrain
+    batch_size: 16
+    batches_per_epoch: 5
+    num_layers: 4
+    hidden_units: 64
+    use_recurrent: false
+    sequence_length: 16
+
+StudentRecurrentBrain:
+    trainer: online_bc
+    max_steps: 10000
+    summary_freq: 1000
+    brain_to_imitate: TeacherBrain
+    batch_size: 16
+    batches_per_epoch: 5
+    num_layers: 4
+    hidden_units: 64
+    use_recurrent: true
+    sequence_length: 32
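As a usage sketch, training against one of these sections is launched the same way the imitation-learning doc below describes (the run identifier `student_run_1` is a hypothetical name):

    mlagents-learn config/online_bc_config.yaml --train --slow --run-id=student_run_1

The `--slow` flag keeps the environment running at a human-playable speed, so a teacher can drive the Player Brain while the student Brain trains.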

docs/Training-Imitation-Learning.md

Lines changed: 4 additions & 4 deletions
@@ -46,9 +46,9 @@ With offline behavioral cloning, we can use demonstrations (`.demo` files) generated
 1. Choose an agent you would like to train to imitate a set of demonstrations.
 2. Record a set of demonstrations using the `Demonstration Recorder` (see above). For illustrative purposes we will refer to this file as `AgentRecording.demo`.
 3. Build the scene, assigning the agent a Learning Brain, and set the Brain to Control in the Broadcast Hub. For more information on Brains, see [here](Learning-Environment-Design-Brains.md).
-4. Open the `config/bc_config.yaml` file.
+4. Open the `config/offline_bc_config.yaml` file.
 5. Modify the `demo_path` parameter in the file to reference the path to the demonstration file recorded in step 2. In our case this is: `./UnitySDK/Assets/Demonstrations/AgentRecording.demo`
-6. Launch `mlagent-learn`, and providing `./config/bc_config.yaml` as the config parameter, and your environment as the `--env` parameter.
+6. Launch `mlagents-learn`, providing `./config/offline_bc_config.yaml` as the config parameter and your environment as the `--env` parameter.
 7. (Optional) Observe training performance using Tensorboard.

 This will use the demonstration file to train a neural network driven agent to directly imitate the actions provided in the demonstration. The environment will launch and be used for evaluating the agent's performance during training.
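Steps 5 and 6 combined, as a single command sketch (the environment build name `MyEnvironment` is a hypothetical placeholder for your own build):

    mlagents-learn ./config/offline_bc_config.yaml --env=MyEnvironment --train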
@@ -69,13 +69,13 @@ It is also possible to provide demonstrations in realtime during training, without
    and check the `Control` checkbox on the "Student" brain.
 4. Link the Brains to the desired Agents (one Agent as the teacher and at least
    one Agent as a student).
-5. In `config/trainer_config.yaml`, add an entry for the "Student" Brain. Set
+5. In `config/online_bc_config.yaml`, add an entry for the "Student" Brain. Set
    the `trainer` parameter of this entry to `online_bc`, and the
    `brain_to_imitate` parameter to the name of the teacher Brain: "Teacher".
    Additionally, set `batches_per_epoch`, which controls how much training to do
    per update. Increase the `max_steps` option if you'd like to keep training
    the Agents for a longer period of time.
-6. Launch the training process with `mlagents-learn config/trainer_config.yaml
+6. Launch the training process with `mlagents-learn config/online_bc_config.yaml
    --train --slow`, and press the :arrow_forward: button in Unity when the
    message _"Start training by pressing the Play button in the Unity Editor"_ is
    displayed on the screen.
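Putting step 5 together, the "Student" entry would look roughly like this sketch; it mirrors the `StudentBrain` section shipped in `config/online_bc_config.yaml` above, and the values are illustrative:

    StudentBrain:
        trainer: online_bc
        brain_to_imitate: TeacherBrain   # name of the teacher Brain to imitate
        batches_per_epoch: 5             # batches of examples collected per training pass
        max_steps: 10000                 # raise this to keep training longer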

docs/Training-ML-Agents.md

Lines changed: 15 additions & 17 deletions
@@ -20,7 +20,7 @@ project to decide the best course of action for an agent.
 Use the command `mlagents-learn` to train your agents. This command is installed
 with the `mlagents` package and its implementation can be found at
 `ml-agents/mlagents/trainers/learn.py`. The [configuration file](#training-config-file),
-`config/trainer_config.yaml` specifies the hyperparameters used during training.
+such as `config/trainer_config.yaml`, specifies the hyperparameters used during training.
 You can edit this file with a text editor to add a specific configuration for
 each Brain.

@@ -87,12 +87,7 @@ When training is finished, you can find the saved model in the `models` folder
 under the assigned run-id — in the cats example, the path to the model would be
 `models/cob_1/CatsOnBicycles_cob_1.bytes`.

-On Mac and Linux platform, you can press Ctrl+c to terminate your training
-early, the model will be saved as if you set your max_steps to the current step.
-(**Note:** There is a known bug on Windows that causes the saving of the model
-to fail when you early terminate the training, it's recommended to wait until
-Step has reached the max_steps parameter you set in trainer_config.yaml.) While
-this example used the default training hyperparameters, you can edit the
+While this example used the default training hyperparameters, you can edit the
 [training_config.yaml file](#training-config-file) with a text editor to set
 different values.
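For orientation, the cats example referenced in this hunk would have been started with a command along these lines (a sketch inferred from the `cob_1` run-id in the model path above; the `--env` value is a hypothetical build name):

    mlagents-learn config/trainer_config.yaml --env=CatsOnBicycles --run-id=cob_1 --train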

@@ -154,21 +149,24 @@ environment, you can set the following command line options when invoking

 ### Training config file

-The training config file, `config/trainer_config.yaml` specifies the training
-method, the hyperparameters, and a few additional values to use during training.
-The file is divided into sections. The **default** section defines the default
-values for all the available settings. You can also add new sections to override
-these defaults to train specific Brains. Name each of these override sections
-after the GameObject containing the Brain component that should use these
-settings. (This GameObject will be a child of the Academy in your scene.)
-Sections for the example environments are included in the provided config file.
+The training config files `config/trainer_config.yaml`,
+`config/online_bc_config.yaml` and `config/offline_bc_config.yaml` specify the
+training method, the hyperparameters, and a few additional values to use during
+training with PPO, online BC and offline BC respectively. These files are divided
+into sections. The **default** section defines the default values for all the
+available settings. You can also add new sections to override these defaults to
+train specific Brains. Name each of these override sections after the GameObject
+containing the Brain component that should use these settings. (This GameObject
+will be a child of the Academy in your scene.) Sections for the example
+environments are included in the provided config files.

 | **Setting** | **Description** | **Applies To Trainer\*** |
 | :---------- | :-------------- | :----------------------- |
 | batch_size | The number of experiences in each iteration of gradient descent. | PPO, BC |
 | batches_per_epoch | In imitation learning, the number of batches of training examples to collect before training the model. | BC |
 | beta | The strength of entropy regularization. | PPO |
-| brain\_to\_imitate | For imitation learning, the name of the GameObject containing the Brain component to imitate. | BC |
+| brain\_to\_imitate | For online imitation learning, the name of the GameObject containing the Brain component to imitate. | (online)BC |
+| demo_path | For offline imitation learning, the file path of the recorded demonstration file. | (offline)BC |
 | buffer_size | The number of experiences to collect before updating the policy model. | PPO |
 | curiosity\_enc\_size | The size of the encoding to use in the forward and inverse models in the Curiosity module. | PPO |
 | curiosity_strength | Magnitude of intrinsic reward generated by Intrinsic Curiosity Module. | PPO |
@@ -184,7 +182,7 @@ Sections for the example environments are included in the provided config file.
 | num_layers | The number of hidden layers in the neural network. | PPO, BC |
 | sequence_length | Defines how long the sequences of experiences must be while training. Only used for training with a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md). | PPO, BC |
 | summary_freq | How often, in steps, to save training statistics. This determines the number of data points shown by TensorBoard. | PPO, BC |
-| time_horizon | How many steps of experience to collect per-agent before adding it to the experience buffer. | PPO, BC |
+| time_horizon | How many steps of experience to collect per-agent before adding it to the experience buffer. | PPO, (online)BC |
 | trainer | The type of training to perform: "ppo", "online_bc", or "offline_bc". | PPO, BC |
 | use_curiosity | Train using an additional intrinsic reward signal generated from Intrinsic Curiosity Module. | PPO |
 | use_recurrent | Train using a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md). | PPO, BC |
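As a reading aid for the table, here is a sketch of how several of these settings compose into a PPO override section in `config/trainer_config.yaml`. The section name and every value are illustrative, not tuned recommendations:

    CatsOnBicyclesLearning:      # hypothetical section, named after the GameObject holding the Brain
        trainer: ppo
        batch_size: 1024         # experiences per gradient-descent iteration
        buffer_size: 10240       # experiences gathered before each policy update
        beta: 5.0e-3             # entropy regularization strength
        time_horizon: 64         # per-agent steps collected before adding to the buffer
        num_layers: 2            # hidden layers in the network
        hidden_units: 128
        summary_freq: 1000       # steps between TensorBoard data points
        use_recurrent: false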
