From 073fd177012d31712dc97fe8eb6e52ea9ae3b131 Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Fri, 3 Feb 2023 08:51:13 -0800
Subject: [PATCH 1/9] update memory monitor

---
 composer/callbacks/memory_monitor.py | 48 +++++++++++++++-------------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/composer/callbacks/memory_monitor.py b/composer/callbacks/memory_monitor.py
index 83dee99adb..a37371ce65 100644
--- a/composer/callbacks/memory_monitor.py
+++ b/composer/callbacks/memory_monitor.py
@@ -50,23 +50,23 @@ class MemoryMonitor(Callback):
 
     The following statistics are recorded:
 
-    +----------------+--------------------------------------------------------------------------------+
-    | Statistic      | Description                                                                    |
-    +================+================================================================================+
-    | alloc_requests | Number of memory allocation requests received by the memory allocator.        |
-    +----------------+--------------------------------------------------------------------------------+
-    | free_requests  | Number of memory free requests received by the memory allocator.              |
-    +----------------+--------------------------------------------------------------------------------+
-    | allocated_mem  | Amount of allocated memory in bytes.                                           |
-    +----------------+--------------------------------------------------------------------------------+
-    | active_mem     | Amount of active memory in bytes at the time of recording.                     |
-    +----------------+--------------------------------------------------------------------------------+
-    | inactive_mem   | Amount of inactive, non-releaseable memory in bytes at the time of recording.  |
-    +----------------+--------------------------------------------------------------------------------+
-    | reserved_mem   | Amount of reserved memory in bytes at the time of recording.                   |
-    +----------------+--------------------------------------------------------------------------------+
-    | alloc_retries  | Number of failed cudaMalloc calls that result in a cache flush and retry.      |
-    +----------------+--------------------------------------------------------------------------------+
+    +----------------+-----------------------------------------------------------------------------------+
+    | Statistic      | Description                                                                       |
+    +================+===================================================================================+
+    | alloc_requests | Number of memory allocation requests received by the memory allocator.           |
+    +----------------+-----------------------------------------------------------------------------------+
+    | free_requests  | Number of memory free requests received by the memory allocator.                 |
+    +----------------+-----------------------------------------------------------------------------------+
+    | allocated_mem  | Amount of allocated memory in gigabytes.                                          |
+    +----------------+-----------------------------------------------------------------------------------+
+    | active_mem     | Amount of active memory in gigabytes at the time of recording.                    |
+    +----------------+-----------------------------------------------------------------------------------+
+    | inactive_mem   | Amount of inactive, non-releaseable memory in gigabytes at the time of recording. |
+    +----------------+-----------------------------------------------------------------------------------+
+    | reserved_mem   | Amount of reserved memory in gigabytes at the time of recording.                  |
+    +----------------+-----------------------------------------------------------------------------------+
+    | alloc_retries  | Number of failed cudaMalloc calls that result in a cache flush and retry.         |
+    +----------------+-----------------------------------------------------------------------------------+
 
     .. note::
         Memory usage monitoring is only supported for GPU devices.
@@ -98,7 +98,7 @@ def after_train_batch(self, state: State, logger: Logger):
 _MEMORY_STATS = {
     'allocation.all.allocated': 'alloc_requests',
     'allocation.all.freed': 'free_requests',
-    'allocated_bytes.all.allocated': 'allocated_mem',
+    'allocated_bytes.all.current': 'allocated_mem',
     'active_bytes.all.current': 'active_mem',
     'inactive_split_bytes.all.current': 'inactive_mem',
     'reserved_bytes.all.current': 'reserved_mem',
@@ -109,9 +109,13 @@ def after_train_batch(self, state: State, logger: Logger):
 def _get_memory_report() -> Dict[str, Union[int, float]]:
     memory_stats = torch.cuda.memory_stats()
 
-    # simplify the memory_stats
-    memory_report = {
-        name: memory_stats[torch_name] for (torch_name, name) in _MEMORY_STATS.items() if torch_name in memory_stats
-    }
+    # simplify and reformat the memory_stats
+    memory_report = {}
+    for (torch_name, name) in _MEMORY_STATS.items():
+        if torch_name in memory_stats:
+            # Convert to gigabytes
+            if 'bytes' in torch_name:
+                memory_report[name.replace('bytes', 'gigabytes')] = memory_stats[torch_name] / 1e9
+            memory_report[name] = memory_stats[torch_name]
 
     return memory_report
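The conversion this patch introduces can be illustrated standalone. The sketch below is not the callback code itself, just a minimal, hypothetical illustration of pulling the byte-valued counters out of `torch.cuda.memory_stats()` and rescaling them (it assumes an initialized CUDA device; the stat names are standard PyTorch caching-allocator keys):

    import torch

    def bytes_stats_in_gigabytes():
        # torch.cuda.memory_stats() returns a flat dict such as
        # {'allocated_bytes.all.current': 4201700000, ...}
        stats = torch.cuda.memory_stats()
        # Rescale every byte-valued stat; 1 gigabyte = 1e9 bytes,
        # the decimal convention used by the patch.
        return {
            key.replace('bytes', 'gigabytes'): value / 1e9
            for key, value in stats.items() if 'bytes' in key
        }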
From 3bf172f29de8f701f393176ea7b1bafcc78d0e82 Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Tue, 7 Feb 2023 11:02:17 -0800
Subject: [PATCH 2/9] add round

---
 composer/callbacks/memory_monitor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/composer/callbacks/memory_monitor.py b/composer/callbacks/memory_monitor.py
index a37371ce65..51ccb2af9f 100644
--- a/composer/callbacks/memory_monitor.py
+++ b/composer/callbacks/memory_monitor.py
@@ -115,7 +115,7 @@ def _get_memory_report() -> Dict[str, Union[int, float]]:
         if torch_name in memory_stats:
             # Convert to gigabytes
             if 'bytes' in torch_name:
-                memory_report[name.replace('bytes', 'gigabytes')] = memory_stats[torch_name] / 1e9
+                memory_report[name.replace('bytes', 'gigabytes')] = round(memory_stats[torch_name] / 1e9, 3)
             memory_report[name] = memory_stats[torch_name]
 
     return memory_report

From 47992c64657e9407b95587955056585b4cad6f3c Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Tue, 7 Feb 2023 18:13:44 -0800
Subject: [PATCH 3/9] fix memory

---
 composer/callbacks/memory_monitor.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/composer/callbacks/memory_monitor.py b/composer/callbacks/memory_monitor.py
index 51ccb2af9f..b3d028a38f 100644
--- a/composer/callbacks/memory_monitor.py
+++ b/composer/callbacks/memory_monitor.py
@@ -115,7 +115,8 @@ def _get_memory_report() -> Dict[str, Union[int, float]]:
         if torch_name in memory_stats:
             # Convert to gigabytes
             if 'bytes' in torch_name:
-                memory_report[name.replace('bytes', 'gigabytes')] = round(memory_stats[torch_name] / 1e9, 3)
-                memory_report[name] = memory_stats[torch_name]
+                memory_report[name.replace('bytes', 'gigabytes')] = memory_stats[torch_name] / 1.0e9
+            else:
+                memory_report[name] = memory_stats[torch_name]
 
     return memory_report
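After this fix, each statistic is logged exactly once: allocator counters keep their raw values, while byte-valued statistics are reported in gigabytes. Note that for the default names the `name.replace('bytes', 'gigabytes')` call is a no-op, since names like 'allocated_mem' contain neither substring, so the gigabyte values land under the original names. A hypothetical report (values invented purely for illustration) might look like:

    # Possible shape of _get_memory_report() output after this patch;
    # the callback logs each entry under the 'memory/' prefix.
    {
        'alloc_requests': 1024,   # raw allocator counter
        'free_requests': 1000,    # raw allocator counter
        'allocated_mem': 4.2017,  # now gigabytes, not bytes
        'active_mem': 4.2017,
        'inactive_mem': 0.1031,
        'reserved_mem': 4.4121,
        'alloc_retries': 0,
    }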
From 73a3858ed94eed8630d1f0d301d88561dbfffe83 Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Tue, 7 Feb 2023 18:27:02 -0800
Subject: [PATCH 4/9] add rounding

---
 composer/callbacks/memory_monitor.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/composer/callbacks/memory_monitor.py b/composer/callbacks/memory_monitor.py
index b3d028a38f..c7fc729983 100644
--- a/composer/callbacks/memory_monitor.py
+++ b/composer/callbacks/memory_monitor.py
@@ -3,6 +3,7 @@
 """Log memory usage during training."""
 
 import logging
+import math
 import warnings
 from typing import Dict, Union
 
@@ -115,6 +116,10 @@ def _get_memory_report() -> Dict[str, Union[int, float]]:
         if torch_name in memory_stats:
             # Convert to gigabytes
             if 'bytes' in torch_name:
+                gigabytes = memory_stats[torch_name] / 1.0e9
+                # Round to preserve 5 significant digits
+                order_of_magnitude = int(math.floor(math.log10(abs(gigabytes))))
+                gigabytes = round(gigabytes, -order_of_magnitude + 4)
                 memory_report[name.replace('bytes', 'gigabytes')] = memory_stats[torch_name] / 1.0e9
             else:
                 memory_report[name] = memory_stats[torch_name]

From b668044577c377277df6fda4cb95bc452b43c04e Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Tue, 7 Feb 2023 18:29:11 -0800
Subject: [PATCH 5/9] add rounding

---
 composer/callbacks/memory_monitor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/composer/callbacks/memory_monitor.py b/composer/callbacks/memory_monitor.py
index c7fc729983..8acf661f89 100644
--- a/composer/callbacks/memory_monitor.py
+++ b/composer/callbacks/memory_monitor.py
@@ -120,7 +120,7 @@ def _get_memory_report() -> Dict[str, Union[int, float]]:
                 # Round to preserve 5 significant digits
                 order_of_magnitude = int(math.floor(math.log10(abs(gigabytes))))
                 gigabytes = round(gigabytes, -order_of_magnitude + 4)
-                memory_report[name.replace('bytes', 'gigabytes')] = memory_stats[torch_name] / 1.0e9
+                memory_report[name.replace('bytes', 'gigabytes')] = gigabytes
             else:
                 memory_report[name] = memory_stats[torch_name]
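The rounding logic added in the two patches above keeps five significant digits at any magnitude, since round() by itself counts decimal places rather than significant figures. A standalone sketch of the same math (hypothetical helper name; the zero guard anticipates the log10 domain error that patch 8 below fixes in the callback itself):

    import math

    def round_to_sig_figs(x: float, sig_figs: int = 5) -> float:
        # math.log10 is undefined at 0, so return zero unchanged.
        if x == 0:
            return 0.0
        # The order of magnitude locates the leading digit, which lets
        # us convert a significant-figure count into the decimal-place
        # count that round() expects.
        order_of_magnitude = int(math.floor(math.log10(abs(x))))
        return round(x, -order_of_magnitude + (sig_figs - 1))

    assert round_to_sig_figs(123.456789) == 123.46
    assert round_to_sig_figs(0.000123456789) == 0.00012346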
From a6980fdd69925488e0aaa00488a2b75d75da703e Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Tue, 7 Feb 2023 19:58:49 -0800
Subject: [PATCH 6/9] update memory monitor

---
 composer/callbacks/memory_monitor.py | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/composer/callbacks/memory_monitor.py b/composer/callbacks/memory_monitor.py
index 8acf661f89..cf34d21b57 100644
--- a/composer/callbacks/memory_monitor.py
+++ b/composer/callbacks/memory_monitor.py
@@ -5,7 +5,7 @@
 import logging
 import math
 import warnings
-from typing import Dict, Union
+from typing import Dict, Optional, Union
 
 import torch.cuda
 
@@ -54,10 +54,6 @@ class MemoryMonitor(Callback):
     +----------------+-----------------------------------------------------------------------------------+
     | Statistic      | Description                                                                       |
     +================+===================================================================================+
-    | alloc_requests | Number of memory allocation requests received by the memory allocator.           |
-    +----------------+-----------------------------------------------------------------------------------+
-    | free_requests  | Number of memory free requests received by the memory allocator.                 |
-    +----------------+-----------------------------------------------------------------------------------+
     | allocated_mem  | Amount of allocated memory in gigabytes.                                          |
     +----------------+-----------------------------------------------------------------------------------+
     | active_mem     | Amount of active memory in gigabytes at the time of recording.                    |
@@ -71,11 +67,16 @@ class MemoryMonitor(Callback):
 
     .. note::
         Memory usage monitoring is only supported for GPU devices.
+
+    Args:
+        memory_keys (Dict[str, str], optional): A dict specifying memory statistics to log. Keys
+            are the names of memory statistics to log from `torch.cuda.memory_stats()`, and values
+            are the names of they are logged under. If not provided, the above statistics are
+            logged. Defaults to None.
     """
 
-    def __init__(self) -> None:
-        # Memory monitor takes no args
-        pass
+    def __init__(self, memory_keys: Optional[Dict[str, str]] = None) -> None:
+        self.memory_keys = memory_keys
 
     def init(self, state: State, logger: Logger) -> None:
         # Not relying on `torch.cuda.is_available()` since the model could be on CPU.
@@ -91,14 +92,12 @@ def after_train_batch(self, state: State, logger: Logger):
         if model_device.type != 'cuda':
             return
 
-        memory_report = _get_memory_report()
+        memory_report = _get_memory_report(self.memory_keys)
 
         logger.log_metrics({f'memory/{mem_stat}': val for (mem_stat, val) in memory_report.items()})
 
 
-_MEMORY_STATS = {
-    'allocation.all.allocated': 'alloc_requests',
-    'allocation.all.freed': 'free_requests',
+_MEMORY_KEYS = {
     'allocated_bytes.all.current': 'allocated_mem',
     'active_bytes.all.current': 'active_mem',
     'inactive_split_bytes.all.current': 'inactive_mem',
     'reserved_bytes.all.current': 'reserved_mem',
 }
 
 
-def _get_memory_report() -> Dict[str, Union[int, float]]:
+def _get_memory_report(memory_keys: Optional[Dict[str, str]] = None) -> Dict[str, Union[int, float]]:
+    # simplify and reformat the memory_stats
     memory_stats = torch.cuda.memory_stats()
 
-    # simplify and reformat the memory_stats
+    memory_keys = memory_keys or _MEMORY_KEYS
+
     memory_report = {}
-    for (torch_name, name) in _MEMORY_STATS.items():
+    for (torch_name, name) in memory_keys.items():
         if torch_name in memory_stats:
             # Convert to gigabytes
             if 'bytes' in torch_name:

From 293fa870c2c90981d81b77c5d3421dfadbd8d8a5 Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Tue, 7 Feb 2023 19:59:53 -0800
Subject: [PATCH 7/9] reformat

---
 composer/callbacks/memory_monitor.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/composer/callbacks/memory_monitor.py b/composer/callbacks/memory_monitor.py
index cf34d21b57..17cf3cbd0f 100644
--- a/composer/callbacks/memory_monitor.py
+++ b/composer/callbacks/memory_monitor.py
@@ -107,11 +107,10 @@ def after_train_batch(self, state: State, logger: Logger):
 
 def _get_memory_report(memory_keys: Optional[Dict[str, str]] = None) -> Dict[str, Union[int, float]]:
-    # simplify and reformat the memory_stats
     memory_stats = torch.cuda.memory_stats()
-
     memory_keys = memory_keys or _MEMORY_KEYS
 
+    # simplify and reformat the memory_stats
     memory_report = {}
     for (torch_name, name) in memory_keys.items():
         if torch_name in memory_stats:
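With the `memory_keys` argument introduced in patch 6 above, callers can log a custom subset of `torch.cuda.memory_stats()`. A hedged usage sketch (the left-hand keys are standard PyTorch allocator stat names; the right-hand metric names are arbitrary choices for this example):

    from composer.callbacks import MemoryMonitor

    # Track only current and peak allocated memory, under custom names.
    # Neither custom name contains 'bytes', so the values are logged
    # under these names as-is after the gigabyte conversion.
    memory_monitor = MemoryMonitor(memory_keys={
        'allocated_bytes.all.current': 'cur_allocated_mem',
        'allocated_bytes.all.peak': 'peak_allocated_mem',
    })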
From f0ed0279968b93eb246af011042e7cdc8c6e1cf4 Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Tue, 7 Feb 2023 20:00:55 -0800
Subject: [PATCH 8/9] round only if non 0

---
 composer/callbacks/memory_monitor.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/composer/callbacks/memory_monitor.py b/composer/callbacks/memory_monitor.py
index 17cf3cbd0f..fa57c3e88a 100644
--- a/composer/callbacks/memory_monitor.py
+++ b/composer/callbacks/memory_monitor.py
@@ -118,8 +118,9 @@ def _get_memory_report(memory_keys: Optional[Dict[str, str]] = None) -> Dict[str
             if 'bytes' in torch_name:
                 gigabytes = memory_stats[torch_name] / 1.0e9
                 # Round to preserve 5 significant digits
-                order_of_magnitude = int(math.floor(math.log10(abs(gigabytes))))
-                gigabytes = round(gigabytes, -order_of_magnitude + 4)
+                if gigabytes != 0:
+                    order_of_magnitude = int(math.floor(math.log10(abs(gigabytes))))
+                    gigabytes = round(gigabytes, -order_of_magnitude + 4)
                 memory_report[name.replace('bytes', 'gigabytes')] = gigabytes
             else:
                 memory_report[name] = memory_stats[torch_name]

From 0c4d739e67cbca75b55bc71d2b51112798da6c83 Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Wed, 8 Feb 2023 10:23:11 -0800
Subject: [PATCH 9/9] Update composer/callbacks/memory_monitor.py

Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com>
---
 composer/callbacks/memory_monitor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/composer/callbacks/memory_monitor.py b/composer/callbacks/memory_monitor.py
index fa57c3e88a..65009300b7 100644
--- a/composer/callbacks/memory_monitor.py
+++ b/composer/callbacks/memory_monitor.py
@@ -71,7 +71,7 @@ class MemoryMonitor(Callback):
     Args:
         memory_keys (Dict[str, str], optional): A dict specifying memory statistics to log. Keys
             are the names of memory statistics to log from `torch.cuda.memory_stats()`, and values
-            are the names of they are logged under. If not provided, the above statistics are
+            are the names they will be logged under. If not provided, the above statistics are
             logged. Defaults to None.
     """
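Taken together, the series leaves the callback drop-in usable with the Trainer. A minimal sketch, assuming a CUDA-backed ComposerModel and dataloader already exist (model and train_dataloader are placeholders, not defined here):

    from composer import Trainer
    from composer.callbacks import MemoryMonitor

    trainer = Trainer(
        model=model,                        # placeholder: a ComposerModel on GPU
        train_dataloader=train_dataloader,  # placeholder dataloader
        max_duration='1ep',
        callbacks=[MemoryMonitor()],        # logs memory/allocated_mem in GB, etc.
    )
    trainer.fit()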