From 073fd177012d31712dc97fe8eb6e52ea9ae3b131 Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Fri, 3 Feb 2023 08:51:13 -0800
Subject: [PATCH 1/9] update memory monitor

---
 composer/callbacks/memory_monitor.py | 48 +++++++++++++++-------------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/composer/callbacks/memory_monitor.py b/composer/callbacks/memory_monitor.py
index 83dee99adb..a37371ce65 100644
--- a/composer/callbacks/memory_monitor.py
+++ b/composer/callbacks/memory_monitor.py
@@ -50,23 +50,23 @@ class MemoryMonitor(Callback):
 
     The following statistics are recorded:
 
-    +----------------+--------------------------------------------------------------------------------+
-    | Statistic      | Description                                                                    |
-    +================+================================================================================+
-    | alloc_requests | Number of memory allocation requests received by the memory allocator.        |
-    +----------------+--------------------------------------------------------------------------------+
-    | free_requests  | Number of memory free requests received by the memory allocator.              |
-    +----------------+--------------------------------------------------------------------------------+
-    | allocated_mem  | Amount of allocated memory in bytes.                                           |
-    +----------------+--------------------------------------------------------------------------------+
-    | active_mem     | Amount of active memory in bytes at the time of recording.                     |
-    +----------------+--------------------------------------------------------------------------------+
-    | inactive_mem   | Amount of inactive, non-releaseable memory in bytes at the time of recording.  |
-    +----------------+--------------------------------------------------------------------------------+
-    | reserved_mem   | Amount of reserved memory in bytes at the time of recording.                   |
-    +----------------+--------------------------------------------------------------------------------+
-    | alloc_retries  | Number of failed cudaMalloc calls that result in a cache flush and retry.      |
-    +----------------+--------------------------------------------------------------------------------+
+    +----------------+-----------------------------------------------------------------------------------+
+    | Statistic      | Description                                                                       |
+    +================+===================================================================================+
+    | alloc_requests | Number of memory allocation requests received by the memory allocator.           |
+    +----------------+-----------------------------------------------------------------------------------+
+    | free_requests  | Number of memory free requests received by the memory allocator.                 |
+    +----------------+-----------------------------------------------------------------------------------+
+    | allocated_mem  | Amount of allocated memory in gigabytes.                                          |
+    +----------------+-----------------------------------------------------------------------------------+
+    | active_mem     | Amount of active memory in gigabytes at the time of recording.                    |
+    +----------------+-----------------------------------------------------------------------------------+
+    | inactive_mem   | Amount of inactive, non-releaseable memory in gigabytes at the time of recording. |
+    +----------------+-----------------------------------------------------------------------------------+
+    | reserved_mem   | Amount of reserved memory in gigabytes at the time of recording.                  |
+    +----------------+-----------------------------------------------------------------------------------+
+    | alloc_retries  | Number of failed cudaMalloc calls that result in a cache flush and retry.         |
+    +----------------+-----------------------------------------------------------------------------------+
 
     .. note::
         Memory usage monitoring is only supported for GPU devices.
@@ -98,7 +98,7 @@ def after_train_batch(self, state: State, logger: Logger):
 _MEMORY_STATS = {
     'allocation.all.allocated': 'alloc_requests',
     'allocation.all.freed': 'free_requests',
-    'allocated_bytes.all.allocated': 'allocated_mem',
+    'allocated_bytes.all.current': 'allocated_mem',
     'active_bytes.all.current': 'active_mem',
     'inactive_split_bytes.all.current': 'inactive_mem',
     'reserved_bytes.all.current': 'reserved_mem',
@@ -109,9 +109,13 @@ def after_train_batch(self, state: State, logger: Logger):
 def _get_memory_report() -> Dict[str, Union[int, float]]:
     memory_stats = torch.cuda.memory_stats()
 
-    # simplify the memory_stats
-    memory_report = {
-        name: memory_stats[torch_name] for (torch_name, name) in _MEMORY_STATS.items() if torch_name in memory_stats
-    }
+    # simplify and reformat the memory_stats
+    memory_report = {}
+    for (torch_name, name) in _MEMORY_STATS.items():
+        if torch_name in memory_stats:
+            # Convert to gigabytes
+            if 'bytes' in torch_name:
+                memory_report[name.replace('bytes', 'gigabytes')] = memory_stats[torch_name] / 1e9
+            memory_report[name] = memory_stats[torch_name]
 
     return memory_report
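The conversion this patch introduces can be illustrated standalone. The sketch below is not the callback code itself, just a minimal, hypothetical illustration of pulling the byte-valued counters out of `torch.cuda.memory_stats()` and rescaling them (it assumes an initialized CUDA device; the stat names are standard PyTorch caching-allocator keys):

    import torch

    def bytes_stats_in_gigabytes():
        # torch.cuda.memory_stats() returns a flat dict such as
        # {'allocated_bytes.all.current': 4201700000, ...}
        stats = torch.cuda.memory_stats()
        # Rescale every byte-valued stat; 1 gigabyte = 1e9 bytes,
        # the decimal convention used by the patch.
        return {
            key.replace('bytes', 'gigabytes'): value / 1e9
            for key, value in stats.items() if 'bytes' in key
        }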
From 3bf172f29de8f701f393176ea7b1bafcc78d0e82 Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Tue, 7 Feb 2023 11:02:17 -0800
Subject: [PATCH 2/9] add round

---
 composer/callbacks/memory_monitor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/composer/callbacks/memory_monitor.py b/composer/callbacks/memory_monitor.py
index a37371ce65..51ccb2af9f 100644
--- a/composer/callbacks/memory_monitor.py
+++ b/composer/callbacks/memory_monitor.py
@@ -115,7 +115,7 @@ def _get_memory_report() -> Dict[str, Union[int, float]]:
         if torch_name in memory_stats:
             # Convert to gigabytes
             if 'bytes' in torch_name:
-                memory_report[name.replace('bytes', 'gigabytes')] = memory_stats[torch_name] / 1e9
+                memory_report[name.replace('bytes', 'gigabytes')] = round(memory_stats[torch_name] / 1e9, 3)
             memory_report[name] = memory_stats[torch_name]
 
     return memory_report

From 47992c64657e9407b95587955056585b4cad6f3c Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Tue, 7 Feb 2023 18:13:44 -0800
Subject: [PATCH 3/9] fix memory

---
 composer/callbacks/memory_monitor.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/composer/callbacks/memory_monitor.py b/composer/callbacks/memory_monitor.py
index 51ccb2af9f..b3d028a38f 100644
--- a/composer/callbacks/memory_monitor.py
+++ b/composer/callbacks/memory_monitor.py
@@ -115,7 +115,8 @@ def _get_memory_report() -> Dict[str, Union[int, float]]:
         if torch_name in memory_stats:
             # Convert to gigabytes
             if 'bytes' in torch_name:
-                memory_report[name.replace('bytes', 'gigabytes')] = round(memory_stats[torch_name] / 1e9, 3)
-                memory_report[name] = memory_stats[torch_name]
+                memory_report[name.replace('bytes', 'gigabytes')] = memory_stats[torch_name] / 1.0e9
+            else:
+                memory_report[name] = memory_stats[torch_name]
 
     return memory_report
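After this fix, each statistic is logged exactly once: allocator counters keep their raw values, while byte-valued statistics are reported in gigabytes. Note that for the default names the `name.replace('bytes', 'gigabytes')` call is a no-op, since names like 'allocated_mem' contain neither substring, so the gigabyte values land under the original names. A hypothetical report (values invented purely for illustration) might look like:

    # Possible shape of _get_memory_report() output after this patch;
    # the callback logs each entry under the 'memory/' prefix.
    {
        'alloc_requests': 1024,   # raw allocator counter
        'free_requests': 1000,    # raw allocator counter
        'allocated_mem': 4.2017,  # now gigabytes, not bytes
        'active_mem': 4.2017,
        'inactive_mem': 0.1031,
        'reserved_mem': 4.4121,
        'alloc_retries': 0,
    }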
From 73a3858ed94eed8630d1f0d301d88561dbfffe83 Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Tue, 7 Feb 2023 18:27:02 -0800
Subject: [PATCH 4/9] add rounding

---
 composer/callbacks/memory_monitor.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/composer/callbacks/memory_monitor.py b/composer/callbacks/memory_monitor.py
index b3d028a38f..c7fc729983 100644
--- a/composer/callbacks/memory_monitor.py
+++ b/composer/callbacks/memory_monitor.py
@@ -3,6 +3,7 @@
 """Log memory usage during training."""
 
 import logging
+import math
 import warnings
 from typing import Dict, Union
 
@@ -115,6 +116,10 @@ def _get_memory_report() -> Dict[str, Union[int, float]]:
         if torch_name in memory_stats:
             # Convert to gigabytes
             if 'bytes' in torch_name:
+                gigabytes = memory_stats[torch_name] / 1.0e9
+                # Round to preserve 5 significant digits
+                order_of_magnitude = int(math.floor(math.log10(abs(gigabytes))))
+                gigabytes = round(gigabytes, -order_of_magnitude + 4)
                 memory_report[name.replace('bytes', 'gigabytes')] = memory_stats[torch_name] / 1.0e9
             else:
                 memory_report[name] = memory_stats[torch_name]

From b668044577c377277df6fda4cb95bc452b43c04e Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Tue, 7 Feb 2023 18:29:11 -0800
Subject: [PATCH 5/9] add rounding

---
 composer/callbacks/memory_monitor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/composer/callbacks/memory_monitor.py b/composer/callbacks/memory_monitor.py
index c7fc729983..8acf661f89 100644
--- a/composer/callbacks/memory_monitor.py
+++ b/composer/callbacks/memory_monitor.py
@@ -120,7 +120,7 @@ def _get_memory_report() -> Dict[str, Union[int, float]]:
                 # Round to preserve 5 significant digits
                 order_of_magnitude = int(math.floor(math.log10(abs(gigabytes))))
                 gigabytes = round(gigabytes, -order_of_magnitude + 4)
-                memory_report[name.replace('bytes', 'gigabytes')] = memory_stats[torch_name] / 1.0e9
+                memory_report[name.replace('bytes', 'gigabytes')] = gigabytes
             else:
                 memory_report[name] = memory_stats[torch_name]
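The rounding logic added in the two patches above keeps five significant digits at any magnitude, since round() by itself counts decimal places rather than significant figures. A standalone sketch of the same math (hypothetical helper name; the zero guard anticipates the log10 domain error that patch 8 below fixes in the callback itself):

    import math

    def round_to_sig_figs(x: float, sig_figs: int = 5) -> float:
        # math.log10 is undefined at 0, so return zero unchanged.
        if x == 0:
            return 0.0
        # The order of magnitude locates the leading digit, which lets
        # us convert a significant-figure count into the decimal-place
        # count that round() expects.
        order_of_magnitude = int(math.floor(math.log10(abs(x))))
        return round(x, -order_of_magnitude + (sig_figs - 1))

    assert round_to_sig_figs(123.456789) == 123.46
    assert round_to_sig_figs(0.000123456789) == 0.00012346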
From a6980fdd69925488e0aaa00488a2b75d75da703e Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Tue, 7 Feb 2023 19:58:49 -0800
Subject: [PATCH 6/9] update memory monitor

---
 composer/callbacks/memory_monitor.py | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/composer/callbacks/memory_monitor.py b/composer/callbacks/memory_monitor.py
index 8acf661f89..cf34d21b57 100644
--- a/composer/callbacks/memory_monitor.py
+++ b/composer/callbacks/memory_monitor.py
@@ -5,7 +5,7 @@
 import logging
 import math
 import warnings
-from typing import Dict, Union
+from typing import Dict, Optional, Union
 
 import torch.cuda
 
@@ -54,10 +54,6 @@ class MemoryMonitor(Callback):
     +----------------+-----------------------------------------------------------------------------------+
     | Statistic      | Description                                                                       |
     +================+===================================================================================+
-    | alloc_requests | Number of memory allocation requests received by the memory allocator.           |
-    +----------------+-----------------------------------------------------------------------------------+
-    | free_requests  | Number of memory free requests received by the memory allocator.                 |
-    +----------------+-----------------------------------------------------------------------------------+
     | allocated_mem  | Amount of allocated memory in gigabytes.                                          |
     +----------------+-----------------------------------------------------------------------------------+
     | active_mem     | Amount of active memory in gigabytes at the time of recording.                    |
@@ -71,11 +67,16 @@ class MemoryMonitor(Callback):
 
     .. note::
         Memory usage monitoring is only supported for GPU devices.
+
+    Args:
+        memory_keys (Dict[str, str], optional): A dict specifying memory statistics to log. Keys
+            are the names of memory statistics to log from `torch.cuda.memory_stats()`, and values
+            are the names of they are logged under. If not provided, the above statistics are
+            logged. Defaults to None.
     """
 
-    def __init__(self) -> None:
-        # Memory monitor takes no args
-        pass
+    def __init__(self, memory_keys: Optional[Dict[str, str]] = None) -> None:
+        self.memory_keys = memory_keys
 
     def init(self, state: State, logger: Logger) -> None:
         # Not relying on `torch.cuda.is_available()` since the model could be on CPU.
@@ -91,14 +92,12 @@ def after_train_batch(self, state: State, logger: Logger):
         if model_device.type != 'cuda':
             return
 
-        memory_report = _get_memory_report()
+        memory_report = _get_memory_report(self.memory_keys)
 
         logger.log_metrics({f'memory/{mem_stat}': val for (mem_stat, val) in memory_report.items()})
 
 
-_MEMORY_STATS = {
-    'allocation.all.allocated': 'alloc_requests',
-    'allocation.all.freed': 'free_requests',
+_MEMORY_KEYS = {
     'allocated_bytes.all.current': 'allocated_mem',
     'active_bytes.all.current': 'active_mem',
     'inactive_split_bytes.all.current': 'inactive_mem',
     'reserved_bytes.all.current': 'reserved_mem',
 }
 
 
-def _get_memory_report() -> Dict[str, Union[int, float]]:
+def _get_memory_report(memory_keys: Optional[Dict[str, str]] = None) -> Dict[str, Union[int, float]]:
+    # simplify and reformat the memory_stats
     memory_stats = torch.cuda.memory_stats()
 
-    # simplify and reformat the memory_stats
+    memory_keys = memory_keys or _MEMORY_KEYS
+
     memory_report = {}
-    for (torch_name, name) in _MEMORY_STATS.items():
+    for (torch_name, name) in memory_keys.items():
         if torch_name in memory_stats:
             # Convert to gigabytes
             if 'bytes' in torch_name:

From 293fa870c2c90981d81b77c5d3421dfadbd8d8a5 Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Tue, 7 Feb 2023 19:59:53 -0800
Subject: [PATCH 7/9] reformat

---
 composer/callbacks/memory_monitor.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/composer/callbacks/memory_monitor.py b/composer/callbacks/memory_monitor.py
index cf34d21b57..17cf3cbd0f 100644
--- a/composer/callbacks/memory_monitor.py
+++ b/composer/callbacks/memory_monitor.py
@@ -107,11 +107,10 @@ def after_train_batch(self, state: State, logger: Logger):
 
 def _get_memory_report(memory_keys: Optional[Dict[str, str]] = None) -> Dict[str, Union[int, float]]:
-    # simplify and reformat the memory_stats
     memory_stats = torch.cuda.memory_stats()
-
     memory_keys = memory_keys or _MEMORY_KEYS
 
+    # simplify and reformat the memory_stats
     memory_report = {}
     for (torch_name, name) in memory_keys.items():
         if torch_name in memory_stats:
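With the `memory_keys` argument introduced in patch 6 above, callers can log a custom subset of `torch.cuda.memory_stats()`. A hedged usage sketch (the left-hand keys are standard PyTorch allocator stat names; the right-hand metric names are arbitrary choices for this example):

    from composer.callbacks import MemoryMonitor

    # Track only current and peak allocated memory, under custom names.
    # Neither custom name contains 'bytes', so the values are logged
    # under these names as-is after the gigabyte conversion.
    memory_monitor = MemoryMonitor(memory_keys={
        'allocated_bytes.all.current': 'cur_allocated_mem',
        'allocated_bytes.all.peak': 'peak_allocated_mem',
    })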
From f0ed0279968b93eb246af011042e7cdc8c6e1cf4 Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Tue, 7 Feb 2023 20:00:55 -0800
Subject: [PATCH 8/9] round only if non 0

---
 composer/callbacks/memory_monitor.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/composer/callbacks/memory_monitor.py b/composer/callbacks/memory_monitor.py
index 17cf3cbd0f..fa57c3e88a 100644
--- a/composer/callbacks/memory_monitor.py
+++ b/composer/callbacks/memory_monitor.py
@@ -118,8 +118,9 @@ def _get_memory_report(memory_keys: Optional[Dict[str, str]] = None) -> Dict[str
             if 'bytes' in torch_name:
                 gigabytes = memory_stats[torch_name] / 1.0e9
                 # Round to preserve 5 significant digits
-                order_of_magnitude = int(math.floor(math.log10(abs(gigabytes))))
-                gigabytes = round(gigabytes, -order_of_magnitude + 4)
+                if gigabytes != 0:
+                    order_of_magnitude = int(math.floor(math.log10(abs(gigabytes))))
+                    gigabytes = round(gigabytes, -order_of_magnitude + 4)
                 memory_report[name.replace('bytes', 'gigabytes')] = gigabytes
             else:
                 memory_report[name] = memory_stats[torch_name]

From 0c4d739e67cbca75b55bc71d2b51112798da6c83 Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Wed, 8 Feb 2023 10:23:11 -0800
Subject: [PATCH 9/9] Update composer/callbacks/memory_monitor.py

Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com>
---
 composer/callbacks/memory_monitor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/composer/callbacks/memory_monitor.py b/composer/callbacks/memory_monitor.py
index fa57c3e88a..65009300b7 100644
--- a/composer/callbacks/memory_monitor.py
+++ b/composer/callbacks/memory_monitor.py
@@ -71,7 +71,7 @@ class MemoryMonitor(Callback):
     Args:
         memory_keys (Dict[str, str], optional): A dict specifying memory statistics to log. Keys
             are the names of memory statistics to log from `torch.cuda.memory_stats()`, and values
-            are the names of they are logged under. If not provided, the above statistics are
+            are the names they will be logged under. If not provided, the above statistics are
             logged. Defaults to None.
     """
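Taken together, the series leaves the callback drop-in usable with the Trainer. A minimal sketch, assuming a CUDA-backed ComposerModel and dataloader already exist (model and train_dataloader are placeholders, not defined here):

    from composer import Trainer
    from composer.callbacks import MemoryMonitor

    trainer = Trainer(
        model=model,                        # placeholder: a ComposerModel on GPU
        train_dataloader=train_dataloader,  # placeholder dataloader
        max_duration='1ep',
        callbacks=[MemoryMonitor()],        # logs memory/allocated_mem in GB, etc.
    )
    trainer.fit()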