From 19eb84591d7da0ac1014119a45ac31a3081a80df Mon Sep 17 00:00:00 2001
From: NouamaneTazi
Date: Tue, 13 Feb 2024 17:19:49 +0000
Subject: [PATCH 1/2] Update config file paths and add new run name format

---
 examples/config_tiny_llama.yaml |  6 ++-
 examples/debug_run_train.yaml   | 86 ---------------------------------
 src/nanotron/config/config.py   | 22 ++++-----
 3 files changed, 15 insertions(+), 99 deletions(-)
 delete mode 100644 examples/debug_run_train.yaml

diff --git a/examples/config_tiny_llama.yaml b/examples/config_tiny_llama.yaml
index ec17d135..877cd2b4 100644
--- a/examples/config_tiny_llama.yaml
+++ b/examples/config_tiny_llama.yaml
@@ -1,6 +1,6 @@
 checkpoints:
   checkpoint_interval: 10
-  checkpoints_path: /fsx/thomwolf/github/nanotron/checkpoints
+  checkpoints_path: /fsx/nouamane/projects/nanotron/checkpoints
   checkpoints_path_is_shared_file_system: false
   resume_checkpoint_path: null
   save_initial_state: false
@@ -19,9 +19,10 @@ general:
   consumed_train_samples: null
   ignore_sanity_checks: false
   project: debug
-  run: tiny_llama
+  run: tiny_llama_%date_%jobid
   seed: 42
   step: null
+lighteval: null
 logging:
   iteration_step_info_interval: 1
   log_level: info
@@ -59,6 +60,7 @@ optimizer:
   clip_grad: 1.0
   learning_rate_scheduler:
     learning_rate: 0.0003
+    lr_decay_starting_step: null
     lr_decay_steps: 8
     lr_decay_style: cosine
     lr_warmup_steps: 2
diff --git a/examples/debug_run_train.yaml b/examples/debug_run_train.yaml
deleted file mode 100644
index 3d1676b6..00000000
--- a/examples/debug_run_train.yaml
+++ /dev/null
@@ -1,86 +0,0 @@
-# CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=8 run_train.py --config-file examples/debug_run_train.yaml
-
-checkpoints:
-  checkpoint_interval: 4
-  checkpoints_path: checkpoints/test/
-  checkpoints_path_is_shared_file_system: true
-  # resume_checkpoint_path: checkpoints_test/
-  save_initial_state: false
-data:
-  dataset:
-  num_loading_workers: 1
-  seed: 42
-general:
-  benchmark_csv_path: null
-  consumed_train_samples: null
-  ignore_sanity_checks: false
-  project: debug
-  run: tiny_llama
-  seed: 42
-  step: null
-logging:
-  iteration_step_info_interval: 1
-  log_level: info
-  log_level_replica: info
-model:
-  ddp_bucket_cap_mb: 25
-  dtype: bfloat16
-  init_method:
-    std: 0.025
-  make_vocab_size_divisible_by: 1
-  model_config:
-    bos_token_id: 1
-    eos_token_id: 2
-    hidden_act: silu
-    hidden_size: 16
-    initializer_range: 0.02
-    intermediate_size: 64
-    is_llama_config: true
-    max_position_embeddings: 256
-    num_attention_heads: 4
-    num_hidden_layers: 20
-    num_key_value_heads: 4
-    pad_token_id: null
-    pretraining_tp: 1
-    rms_norm_eps: 1.0e-05
-    rope_scaling: null
-    tie_word_embeddings: true
-    use_cache: true
-    vocab_size: 256
-optimizer:
-  accumulate_grad_in_fp32: true
-  adam_beta1: 0.9
-  adam_beta2: 0.95
-  adam_eps: 1.0e-08
-  clip_grad: 1.0
-  learning_rate_scheduler:
-    learning_rate: 0.0003
-    lr_decay_steps: 8
-    lr_decay_style: cosine
-    lr_warmup_steps: 2
-    lr_warmup_style: linear
-    min_decay_lr: 1.0e-05
-  torch_adam_is_fused: true
-  weight_decay: 0.01
-  zero_stage: 0
-parallelism:
-  dp: 2
-  pp: 2
-  pp_engine: 1f1b
-  recompute_granularity: SELECTIVE
-  tp: 2
-  tp_linear_async_communication: true
-  tp_mode: REDUCE_SCATTER
-profiler: null
-tokenizer:
-  tokenizer_max_length: null
-  tokenizer_name_or_path: gpt2
-  tokenizer_revision: null
-tokens:
-  batch_accumulation_per_replica: 1
-  limit_test_batches: 0
-  limit_val_batches: 0
-  micro_batch_size: 2
-  sequence_length: 32
-  train_steps: 10
-  val_check_interval: -1
diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py
index 5c4123a0..80b99e5d 100644
--- a/src/nanotron/config/config.py
+++ b/src/nanotron/config/config.py
@@ -288,17 +288,17 @@ def __post_init__(self):
 class Config:
     """Main configuration class"""

-    general: Optional[GeneralArgs]
-    checkpoints: Optional[CheckpointsArgs]
-    parallelism: Optional[ParallelismArgs]
-    model: Optional[ModelArgs]
-    tokenizer: Optional[TokenizerArgs]
-    logging: Optional[LoggingArgs]
-    tokens: Optional[TokensArgs]
-    optimizer: Optional[OptimizerArgs]
-    data: Optional[DataArgs]
-    profiler: Optional[ProfilerArgs]
-    lighteval: Optional[LightEvalConfig]
+    general: GeneralArgs
+    checkpoints: CheckpointsArgs
+    parallelism: ParallelismArgs
+    model: ModelArgs
+    tokenizer: TokenizerArgs
+    logging: LoggingArgs
+    tokens: TokensArgs
+    optimizer: OptimizerArgs
+    data: DataArgs
+    profiler: Optional[ProfilerArgs] = None
+    lighteval: Optional[LightEvalConfig] = None

     @classmethod
     def create_empty(cls):

From aa3f841688ebce3425e4f815f90185a58bf70ef1 Mon Sep 17 00:00:00 2001
From: Thomas Wolf
Date: Wed, 14 Feb 2024 08:10:28 +0000
Subject: [PATCH 2/2] quality and update

---
 src/nanotron/config/config.py       | 10 +++++-----
 src/nanotron/serialize/optimizer.py |  2 +-
 src/nanotron/serialize/utils.py     |  2 --
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py
index 80b99e5d..56ea25ed 100644
--- a/src/nanotron/config/config.py
+++ b/src/nanotron/config/config.py
@@ -289,14 +289,14 @@ class Config:
     """Main configuration class"""

     general: GeneralArgs
-    checkpoints: CheckpointsArgs
     parallelism: ParallelismArgs
     model: ModelArgs
     tokenizer: TokenizerArgs
-    logging: LoggingArgs
-    tokens: TokensArgs
-    optimizer: OptimizerArgs
-    data: DataArgs
+    checkpoints: Optional[CheckpointsArgs] = None
+    logging: Optional[LoggingArgs] = None
+    tokens: Optional[TokensArgs] = None
+    optimizer: Optional[OptimizerArgs] = None
+    data: Optional[DataArgs] = None
     profiler: Optional[ProfilerArgs] = None
     lighteval: Optional[LightEvalConfig] = None

diff --git a/src/nanotron/serialize/optimizer.py b/src/nanotron/serialize/optimizer.py
index fee0d921..7554a157 100644
--- a/src/nanotron/serialize/optimizer.py
+++ b/src/nanotron/serialize/optimizer.py
@@ -17,9 +17,9 @@
 )
 from nanotron.parallel import ParallelContext
 from nanotron.parallel.parameters import NanotronParameter
+from nanotron.sanity_checks import check_optim_state_in_sync
 from nanotron.serialize.metadata import TensorMetadata
 from nanotron.serialize.utils import ObjectType, merge_and_shard_tp_tensors
-from nanotron.sanity_checks import check_optim_state_in_sync


 # TODO(xrsrke): take rank instead of parallel_context
diff --git a/src/nanotron/serialize/utils.py b/src/nanotron/serialize/utils.py
index 1b555a59..efd391bb 100644
--- a/src/nanotron/serialize/utils.py
+++ b/src/nanotron/serialize/utils.py
@@ -4,11 +4,9 @@
 from typing import List, Optional, Tuple

 import torch
-import torch.distributed as dist

 from nanotron.parallel import ParallelContext
 from nanotron.parallel.parameters import SlicesPair
-from nanotron.sanity_checks import assert_tensor_synced_across_pg
 from nanotron.serialize.metadata import TensorMetadata
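
Note on the resulting Config layout: after PATCH 2/2, only the general, parallelism, model, and tokenizer sections remain required; checkpoints, logging, tokens, optimizer, data, profiler, and lighteval all default to None. The snippet below is a hypothetical local check, not part of either patch; it assumes nanotron is importable and that Config is a standard dataclass, as the annotated-field syntax in config.py suggests.

    # Hypothetical helper (not part of the patches): prints which Config
    # sections are still required after PATCH 2/2 and which default to None.
    import dataclasses

    from nanotron.config.config import Config

    for field in dataclasses.fields(Config):
        # A field is optional if it carries a default value or default factory.
        has_default = (
            field.default is not dataclasses.MISSING
            or field.default_factory is not dataclasses.MISSING
        )
        print(f"{field.name}: {'optional' if has_default else 'required'}")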