From 10a0274a1236b2e8f86e133aa08e52c1d3b82be7 Mon Sep 17 00:00:00 2001 From: somrita-banerjee <144698416+somrita-banerjee@users.noreply.github.com> Date: Sun, 5 Oct 2025 16:32:38 +0530 Subject: [PATCH 01/17] Add naive and vectorized implementations of Linear Regression using Gradient Descent --- machine_learning/linear_regression_naive.py | 138 ++++++++++++++++++ .../linear_regression_vectorized.py | 96 ++++++++++++ 2 files changed, 234 insertions(+) create mode 100644 machine_learning/linear_regression_naive.py create mode 100644 machine_learning/linear_regression_vectorized.py diff --git a/machine_learning/linear_regression_naive.py b/machine_learning/linear_regression_naive.py new file mode 100644 index 000000000000..ad4b408aa295 --- /dev/null +++ b/machine_learning/linear_regression_naive.py @@ -0,0 +1,138 @@ +""" +Naive implementation of Linear Regression using Gradient Descent. + +This version is intentionally less optimized and more verbose, +designed for educational clarity. It shows the step-by-step +gradient descent update and error calculation. + +Dataset used: CSGO dataset (ADR vs Rating) +""" + +# /// script +# requires-python = ">=3.13" +# dependencies = [ +# "httpx", +# "numpy", +# ] +# /// + +import httpx +import numpy as np + + +def collect_dataset() -> np.ndarray: + """Collect dataset of CSGO (ADR vs Rating) + + :return: dataset as numpy matrix + """ + response = httpx.get( + "https://raw.githubusercontent.com/yashLadha/The_Math_of_Intelligence/" + "master/Week1/ADRvsRating.csv", + timeout=10, + ) + lines = response.text.splitlines() + data = [line.split(",") for line in lines] + data.pop(0) # remove header row + dataset = np.matrix(data) + return dataset + + +def run_steep_gradient_descent( + data_x: np.ndarray, data_y: np.ndarray, len_data: int, alpha: float, theta: np.ndarray +) -> np.ndarray: + """Run one step of steep gradient descent. + + :param data_x: dataset features + :param data_y: dataset labels + :param len_data: number of samples + :param alpha: learning rate + :param theta: feature vector (weights) + + :return: updated theta + + >>> import numpy as np + >>> data_x = np.array([[1, 2], [3, 4]]) + >>> data_y = np.array([5, 6]) + >>> len_data = len(data_x) + >>> alpha = 0.01 + >>> theta = np.array([0.1, 0.2]) + >>> run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta) + array([0.196, 0.343]) + """ + prod = np.dot(theta, data_x.T) + prod -= data_y.T + grad = np.dot(prod, data_x) + theta = theta - (alpha / len_data) * grad + return theta + + +def sum_of_square_error( + data_x: np.ndarray, data_y: np.ndarray, len_data: int, theta: np.ndarray +) -> float: + """Return sum of square error for error calculation. + + >>> vc_x = np.array([[1.1], [2.1], [3.1]]) + >>> vc_y = np.array([1.2, 2.2, 3.2]) + >>> round(sum_of_square_error(vc_x, vc_y, 3, np.array([1])), 3) + 0.005 + """ + prod = np.dot(theta, data_x.T) + prod -= data_y.T + error = np.sum(np.square(prod)) / (2 * len_data) + return float(error) + + +def run_linear_regression(data_x: np.ndarray, data_y: np.ndarray) -> np.ndarray: + """Run linear regression using gradient descent. + + :param data_x: dataset features + :param data_y: dataset labels + :return: learned feature vector theta + """ + iterations = 100000 + alpha = 0.000155 + + no_features = data_x.shape[1] + len_data = data_x.shape[0] - 1 + + theta = np.zeros((1, no_features)) + + for i in range(iterations): + theta = run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta) + error = sum_of_square_error(data_x, data_y, len_data, theta) + print(f"Iteration {i + 1}: Error = {error:.5f}") + + return theta + + +def mean_absolute_error(predicted_y: np.ndarray, original_y: np.ndarray) -> float: + """Return mean absolute error. + + >>> predicted_y = np.array([3, -0.5, 2, 7]) + >>> original_y = np.array([2.5, 0.0, 2, 8]) + >>> mean_absolute_error(predicted_y, original_y) + 0.5 + """ + total = sum(abs(y - predicted_y[i]) for i, y in enumerate(original_y)) + return total / len(original_y) + + +def main() -> None: + """Driver function.""" + data = collect_dataset() + + len_data = data.shape[0] + data_x = np.c_[np.ones(len_data), data[:, :-1]].astype(float) + data_y = data[:, -1].astype(float) + + theta = run_linear_regression(data_x, data_y) + print("Resultant Feature vector:") + for value in theta.ravel(): + print(f"{value:.5f}") + + +if __name__ == "__main__": + import doctest + + doctest.testmod() + main() diff --git a/machine_learning/linear_regression_vectorized.py b/machine_learning/linear_regression_vectorized.py new file mode 100644 index 000000000000..60e75c01df48 --- /dev/null +++ b/machine_learning/linear_regression_vectorized.py @@ -0,0 +1,96 @@ +""" +Vectorized implementation of Linear Regression using Gradient Descent. + +This version uses NumPy vectorization for efficiency. +It is faster and cleaner than the naive version but assumes +readers are familiar with matrix operations. + +Dataset used: CSGO dataset (ADR vs Rating) +""" + +# /// script +# requires-python = ">=3.13" +# dependencies = [ +# "httpx", +# "numpy", +# ] +# /// + +import httpx +import numpy as np + + +def collect_dataset() -> np.ndarray: + """Collect dataset of CSGO (ADR vs Rating). + + :return: dataset as numpy array + """ + response = httpx.get( + "https://raw.githubusercontent.com/yashLadha/The_Math_of_Intelligence/" + "master/Week1/ADRvsRating.csv", + timeout=10, + ) + lines = response.text.splitlines() + data = [line.split(",") for line in lines] + data.pop(0) # remove header row + return np.array(data, dtype=float) + + +def gradient_descent( + x: np.ndarray, y: np.ndarray, alpha: float = 0.000155, iterations: int = 100000 +) -> np.ndarray: + """Run gradient descent in a fully vectorized form. + + :param x: dataset features + :param y: dataset labels + :param alpha: learning rate + :param iterations: number of iterations + :return: learned feature vector theta + """ + m, n = x.shape + theta = np.zeros((n, 1)) + + for i in range(iterations): + predictions = x @ theta + errors = predictions - y + gradients = (x.T @ errors) / m + theta -= alpha * gradients + + if i % (iterations // 10) == 0: # log occasionally + cost = np.sum(errors**2) / (2 * m) + print(f"Iteration {i+1}: Error = {cost:.5f}") + + return theta + + +def mean_absolute_error(predicted_y: np.ndarray, original_y: np.ndarray) -> float: + """Return mean absolute error. + + >>> pred = np.array([3, -0.5, 2, 7]) + >>> orig = np.array([2.5, 0.0, 2, 8]) + >>> mean_absolute_error(pred, orig) + 0.5 + """ + return float(np.mean(np.abs(original_y - predicted_y))) + + +def main() -> None: + """Driver function.""" + dataset = collect_dataset() + + m = dataset.shape[0] + x = np.c_[np.ones(m), dataset[:, :-1]] # add intercept term + y = dataset[:, -1].reshape(-1, 1) + + theta = gradient_descent(x, y) + print("Resultant Feature vector:") + for value in theta.ravel(): + print(f"{value:.5f}") + + +if __name__ == "__main__": + import doctest + + doctest.testmod() + main() + From 11fa072c4e84dd2f976a3f80d4cf24e22e06122c Mon Sep 17 00:00:00 2001 From: somrita-banerjee <144698416+somrita-banerjee@users.noreply.github.com> Date: Sun, 5 Oct 2025 16:44:38 +0530 Subject: [PATCH 02/17] Add references section to docstrings in linear regression implementations --- machine_learning/linear_regression_naive.py | 3 +++ machine_learning/linear_regression_vectorized.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/machine_learning/linear_regression_naive.py b/machine_learning/linear_regression_naive.py index ad4b408aa295..80d89c1e5e97 100644 --- a/machine_learning/linear_regression_naive.py +++ b/machine_learning/linear_regression_naive.py @@ -6,6 +6,9 @@ gradient descent update and error calculation. Dataset used: CSGO dataset (ADR vs Rating) + +References: +https://en.wikipedia.org/wiki/Linear_regression """ # /// script diff --git a/machine_learning/linear_regression_vectorized.py b/machine_learning/linear_regression_vectorized.py index 60e75c01df48..4000647faf17 100644 --- a/machine_learning/linear_regression_vectorized.py +++ b/machine_learning/linear_regression_vectorized.py @@ -6,6 +6,9 @@ readers are familiar with matrix operations. Dataset used: CSGO dataset (ADR vs Rating) + +References: + https://en.wikipedia.org/wiki/Linear_regression """ # /// script From e879c47b74a5f0463949d5b2c71103ef9b8e71b7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 5 Oct 2025 11:15:25 +0000 Subject: [PATCH 03/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/linear_regression_naive.py | 6 +++++- machine_learning/linear_regression_vectorized.py | 3 +-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/machine_learning/linear_regression_naive.py b/machine_learning/linear_regression_naive.py index 80d89c1e5e97..c39fc7633b67 100644 --- a/machine_learning/linear_regression_naive.py +++ b/machine_learning/linear_regression_naive.py @@ -41,7 +41,11 @@ def collect_dataset() -> np.ndarray: def run_steep_gradient_descent( - data_x: np.ndarray, data_y: np.ndarray, len_data: int, alpha: float, theta: np.ndarray + data_x: np.ndarray, + data_y: np.ndarray, + len_data: int, + alpha: float, + theta: np.ndarray, ) -> np.ndarray: """Run one step of steep gradient descent. diff --git a/machine_learning/linear_regression_vectorized.py b/machine_learning/linear_regression_vectorized.py index 4000647faf17..b57db90ec49b 100644 --- a/machine_learning/linear_regression_vectorized.py +++ b/machine_learning/linear_regression_vectorized.py @@ -61,7 +61,7 @@ def gradient_descent( if i % (iterations // 10) == 0: # log occasionally cost = np.sum(errors**2) / (2 * m) - print(f"Iteration {i+1}: Error = {cost:.5f}") + print(f"Iteration {i + 1}: Error = {cost:.5f}") return theta @@ -96,4 +96,3 @@ def main() -> None: doctest.testmod() main() - From d868aba368eb7e0710b07e443c27981647c2ac49 Mon Sep 17 00:00:00 2001 From: somrita-banerjee <144698416+somrita-banerjee@users.noreply.github.com> Date: Sun, 5 Oct 2025 16:49:37 +0530 Subject: [PATCH 04/17] Refactor function signatures for improved readability in linear regression implementation --- machine_learning/linear_regression_naive.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/machine_learning/linear_regression_naive.py b/machine_learning/linear_regression_naive.py index 80d89c1e5e97..6173ba1cc55c 100644 --- a/machine_learning/linear_regression_naive.py +++ b/machine_learning/linear_regression_naive.py @@ -41,7 +41,11 @@ def collect_dataset() -> np.ndarray: def run_steep_gradient_descent( - data_x: np.ndarray, data_y: np.ndarray, len_data: int, alpha: float, theta: np.ndarray + data_x: np.ndarray, + data_y: np.ndarray, + len_data: int, + alpha: float, + theta: np.ndarray ) -> np.ndarray: """Run one step of steep gradient descent. @@ -70,7 +74,10 @@ def run_steep_gradient_descent( def sum_of_square_error( - data_x: np.ndarray, data_y: np.ndarray, len_data: int, theta: np.ndarray + data_x: np.ndarray, + data_y: np.ndarray, + len_data: int, + theta: np.ndarray ) -> float: """Return sum of square error for error calculation. @@ -85,7 +92,10 @@ def sum_of_square_error( return float(error) -def run_linear_regression(data_x: np.ndarray, data_y: np.ndarray) -> np.ndarray: +def run_linear_regression( + data_x: np.ndarray, + data_y: np.ndarray +) -> np.ndarray: """Run linear regression using gradient descent. :param data_x: dataset features @@ -108,7 +118,10 @@ def run_linear_regression(data_x: np.ndarray, data_y: np.ndarray) -> np.ndarray: return theta -def mean_absolute_error(predicted_y: np.ndarray, original_y: np.ndarray) -> float: +def mean_absolute_error( + predicted_y: np.ndarray, + original_y: np.ndarray +) -> float: """Return mean absolute error. >>> predicted_y = np.array([3, -0.5, 2, 7]) From 260f5a6cc41be6558deb53da1504c5decb4438e1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 5 Oct 2025 11:21:50 +0000 Subject: [PATCH 05/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/linear_regression_naive.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/machine_learning/linear_regression_naive.py b/machine_learning/linear_regression_naive.py index 92b684eac4c8..c39fc7633b67 100644 --- a/machine_learning/linear_regression_naive.py +++ b/machine_learning/linear_regression_naive.py @@ -41,7 +41,11 @@ def collect_dataset() -> np.ndarray: def run_steep_gradient_descent( - data_x: np.ndarray, data_y: np.ndarray, len_data: int, alpha: float, theta: np.ndarray + data_x: np.ndarray, + data_y: np.ndarray, + len_data: int, + alpha: float, + theta: np.ndarray, ) -> np.ndarray: """Run one step of steep gradient descent. @@ -70,10 +74,7 @@ def run_steep_gradient_descent( def sum_of_square_error( - data_x: np.ndarray, - data_y: np.ndarray, - len_data: int, - theta: np.ndarray + data_x: np.ndarray, data_y: np.ndarray, len_data: int, theta: np.ndarray ) -> float: """Return sum of square error for error calculation. @@ -88,10 +89,7 @@ def sum_of_square_error( return float(error) -def run_linear_regression( - data_x: np.ndarray, - data_y: np.ndarray -) -> np.ndarray: +def run_linear_regression(data_x: np.ndarray, data_y: np.ndarray) -> np.ndarray: """Run linear regression using gradient descent. :param data_x: dataset features @@ -114,10 +112,7 @@ def run_linear_regression( return theta -def mean_absolute_error( - predicted_y: np.ndarray, - original_y: np.ndarray -) -> float: +def mean_absolute_error(predicted_y: np.ndarray, original_y: np.ndarray) -> float: """Return mean absolute error. >>> predicted_y = np.array([3, -0.5, 2, 7]) From 80082fde9e39e6ee5fdc003368f5620906790c92 Mon Sep 17 00:00:00 2001 From: somrita-banerjee <144698416+somrita-banerjee@users.noreply.github.com> Date: Sun, 5 Oct 2025 17:08:57 +0530 Subject: [PATCH 06/17] Refactor function signatures for improved readability in linear regression implementation --- machine_learning/linear_regression_naive.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/machine_learning/linear_regression_naive.py b/machine_learning/linear_regression_naive.py index 92b684eac4c8..c39fc7633b67 100644 --- a/machine_learning/linear_regression_naive.py +++ b/machine_learning/linear_regression_naive.py @@ -41,7 +41,11 @@ def collect_dataset() -> np.ndarray: def run_steep_gradient_descent( - data_x: np.ndarray, data_y: np.ndarray, len_data: int, alpha: float, theta: np.ndarray + data_x: np.ndarray, + data_y: np.ndarray, + len_data: int, + alpha: float, + theta: np.ndarray, ) -> np.ndarray: """Run one step of steep gradient descent. @@ -70,10 +74,7 @@ def run_steep_gradient_descent( def sum_of_square_error( - data_x: np.ndarray, - data_y: np.ndarray, - len_data: int, - theta: np.ndarray + data_x: np.ndarray, data_y: np.ndarray, len_data: int, theta: np.ndarray ) -> float: """Return sum of square error for error calculation. @@ -88,10 +89,7 @@ def sum_of_square_error( return float(error) -def run_linear_regression( - data_x: np.ndarray, - data_y: np.ndarray -) -> np.ndarray: +def run_linear_regression(data_x: np.ndarray, data_y: np.ndarray) -> np.ndarray: """Run linear regression using gradient descent. :param data_x: dataset features @@ -114,10 +112,7 @@ def run_linear_regression( return theta -def mean_absolute_error( - predicted_y: np.ndarray, - original_y: np.ndarray -) -> float: +def mean_absolute_error(predicted_y: np.ndarray, original_y: np.ndarray) -> float: """Return mean absolute error. >>> predicted_y = np.array([3, -0.5, 2, 7]) From ebf3ab20f94812ed215373dfc03094e02c7be127 Mon Sep 17 00:00:00 2001 From: somrita-banerjee <144698416+somrita-banerjee@users.noreply.github.com> Date: Sun, 5 Oct 2025 17:26:14 +0530 Subject: [PATCH 07/17] Update README sections for dataset inputs and usage instructions in linear regression implementations --- machine_learning/linear_regression_naive.py | 18 +++++++++++++++++- .../linear_regression_vectorized.py | 16 ++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/machine_learning/linear_regression_naive.py b/machine_learning/linear_regression_naive.py index c39fc7633b67..921d784a20ea 100644 --- a/machine_learning/linear_regression_naive.py +++ b/machine_learning/linear_regression_naive.py @@ -1,3 +1,19 @@ +"""README, Author - Somrita Banerjee(mailto:somritabanerjee126@gmail.com) +Requirements: +- Python >= 3.13 +- httpx +- numpy + +Inputs: +- Downloads a CSV dataset (ADR vs Rating) from a public GitHub URL. +- The dataset should have features in all columns except the last, which is the label. + +Usage: +- Run this script directly: + python linear_regression_naive.py +- The script will fetch the dataset, run linear regression using gradient descent, and print the learned feature vector (theta) and error at each iteration. + +""" """ Naive implementation of Linear Regression using Gradient Descent. @@ -120,7 +136,7 @@ def mean_absolute_error(predicted_y: np.ndarray, original_y: np.ndarray) -> floa >>> mean_absolute_error(predicted_y, original_y) 0.5 """ - total = sum(abs(y - predicted_y[i]) for i, y in enumerate(original_y)) + total = sum(abs(predicted_y[i] - y) for i, y in enumerate(original_y)) return total / len(original_y) diff --git a/machine_learning/linear_regression_vectorized.py b/machine_learning/linear_regression_vectorized.py index b57db90ec49b..5d274f08934a 100644 --- a/machine_learning/linear_regression_vectorized.py +++ b/machine_learning/linear_regression_vectorized.py @@ -1,3 +1,19 @@ +"""README, Author - Somrita Banerjee(mailto:somritabanerjee126@gmail.com) +Requirements: +- Python >= 3.13 +- httpx +- numpy + +Inputs: +- The script automatically downloads a CSV dataset (ADR vs Rating) from a public GitHub URL. +- The dataset must have features in all columns except the last, which is the label (rating). + +Usage: +- Run this script directly: + python linear_regression_vectorized.py +- The script will fetch the dataset, run linear regression using gradient descent, and print the learned feature vector (theta) and error at intervals. + +""" """ Vectorized implementation of Linear Regression using Gradient Descent. From 7375f399e87515d54f88574c52a64deb3f879d0c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 5 Oct 2025 11:56:37 +0000 Subject: [PATCH 08/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/linear_regression_naive.py | 1 + machine_learning/linear_regression_vectorized.py | 1 + 2 files changed, 2 insertions(+) diff --git a/machine_learning/linear_regression_naive.py b/machine_learning/linear_regression_naive.py index 921d784a20ea..c97b9933c760 100644 --- a/machine_learning/linear_regression_naive.py +++ b/machine_learning/linear_regression_naive.py @@ -14,6 +14,7 @@ - The script will fetch the dataset, run linear regression using gradient descent, and print the learned feature vector (theta) and error at each iteration. """ + """ Naive implementation of Linear Regression using Gradient Descent. diff --git a/machine_learning/linear_regression_vectorized.py b/machine_learning/linear_regression_vectorized.py index 5d274f08934a..31d69c551688 100644 --- a/machine_learning/linear_regression_vectorized.py +++ b/machine_learning/linear_regression_vectorized.py @@ -14,6 +14,7 @@ - The script will fetch the dataset, run linear regression using gradient descent, and print the learned feature vector (theta) and error at intervals. """ + """ Vectorized implementation of Linear Regression using Gradient Descent. From 8d33d9042dc0b24f3f61fc556876e5cfe14c5e8d Mon Sep 17 00:00:00 2001 From: somrita-banerjee <144698416+somrita-banerjee@users.noreply.github.com> Date: Sun, 5 Oct 2025 17:30:50 +0530 Subject: [PATCH 09/17] Add doctests for dataset collection and gradient descent functions --- machine_learning/linear_regression_naive.py | 19 ++++++++ .../linear_regression_vectorized.py | 44 ++++++++++++++----- 2 files changed, 52 insertions(+), 11 deletions(-) diff --git a/machine_learning/linear_regression_naive.py b/machine_learning/linear_regression_naive.py index 921d784a20ea..dd461a6b31cd 100644 --- a/machine_learning/linear_regression_naive.py +++ b/machine_learning/linear_regression_naive.py @@ -43,6 +43,12 @@ def collect_dataset() -> np.ndarray: """Collect dataset of CSGO (ADR vs Rating) :return: dataset as numpy matrix + + >>> ds = collect_dataset() + >>> isinstance(ds, np.matrix) + True + >>> ds.shape[1] >= 2 + True """ response = httpx.get( "https://raw.githubusercontent.com/yashLadha/The_Math_of_Intelligence/" @@ -111,6 +117,19 @@ def run_linear_regression(data_x: np.ndarray, data_y: np.ndarray) -> np.ndarray: :param data_x: dataset features :param data_y: dataset labels :return: learned feature vector theta + + >>> import numpy as np + >>> x = np.array([[1, 1], [1, 2], [1, 3]]) + >>> y = np.array([1, 2, 3]) + >>> theta = run_linear_regression(x, y) + Iteration 1: Error = ... + ... # lots of output omitted + >>> theta.shape + (1, 2) + >>> abs(theta[0, 0] - 0) < 0.1 # intercept close to 0 + True + >>> abs(theta[0, 1] - 1) < 0.1 # slope close to 1 + True """ iterations = 100000 alpha = 0.000155 diff --git a/machine_learning/linear_regression_vectorized.py b/machine_learning/linear_regression_vectorized.py index 5d274f08934a..c9c03bf23bf5 100644 --- a/machine_learning/linear_regression_vectorized.py +++ b/machine_learning/linear_regression_vectorized.py @@ -43,6 +43,12 @@ def collect_dataset() -> np.ndarray: """Collect dataset of CSGO (ADR vs Rating). :return: dataset as numpy array + + >>> ds = collect_dataset() + >>> isinstance(ds, np.ndarray) + True + >>> ds.shape[1] >= 2 + True """ response = httpx.get( "https://raw.githubusercontent.com/yashLadha/The_Math_of_Intelligence/" @@ -56,23 +62,36 @@ def collect_dataset() -> np.ndarray: def gradient_descent( - x: np.ndarray, y: np.ndarray, alpha: float = 0.000155, iterations: int = 100000 + features: np.ndarray, labels: np.ndarray, alpha: float = 0.000155, iterations: int = 100000 ) -> np.ndarray: """Run gradient descent in a fully vectorized form. - :param x: dataset features - :param y: dataset labels + :param features: dataset features + :param labels: dataset labels :param alpha: learning rate :param iterations: number of iterations :return: learned feature vector theta + + >>> import numpy as np + >>> features = np.array([[1, 1], [1, 2], [1, 3]]) + >>> labels = np.array([[1], [2], [3]]) + >>> theta = gradient_descent(features, labels, alpha=0.01, iterations=1000) + Iteration 1: Error = ... + ... # output omitted + >>> theta.shape + (2, 1) + >>> abs(theta[0, 0] - 0) < 0.1 # intercept close to 0 + True + >>> abs(theta[1, 0] - 1) < 0.1 # slope close to 1 + True """ - m, n = x.shape + m, n = features.shape theta = np.zeros((n, 1)) for i in range(iterations): - predictions = x @ theta - errors = predictions - y - gradients = (x.T @ errors) / m + predictions = features @ theta + errors = predictions - labels + gradients = (features.T @ errors) / m theta -= alpha * gradients if i % (iterations // 10) == 0: # log occasionally @@ -94,14 +113,17 @@ def mean_absolute_error(predicted_y: np.ndarray, original_y: np.ndarray) -> floa def main() -> None: - """Driver function.""" + """Driver function. + + >>> main() # doctest: +SKIP + """ dataset = collect_dataset() m = dataset.shape[0] - x = np.c_[np.ones(m), dataset[:, :-1]] # add intercept term - y = dataset[:, -1].reshape(-1, 1) + features = np.c_[np.ones(m), dataset[:, :-1]] # add intercept term + labels = dataset[:, -1].reshape(-1, 1) - theta = gradient_descent(x, y) + theta = gradient_descent(features, labels) print("Resultant Feature vector:") for value in theta.ravel(): print(f"{value:.5f}") From 2e6804dbc293f3284e0c4dc36a9ba273e15ea380 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 5 Oct 2025 12:01:13 +0000 Subject: [PATCH 10/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/linear_regression_vectorized.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/machine_learning/linear_regression_vectorized.py b/machine_learning/linear_regression_vectorized.py index c1ea222e8069..a47e1ac50182 100644 --- a/machine_learning/linear_regression_vectorized.py +++ b/machine_learning/linear_regression_vectorized.py @@ -63,7 +63,10 @@ def collect_dataset() -> np.ndarray: def gradient_descent( - features: np.ndarray, labels: np.ndarray, alpha: float = 0.000155, iterations: int = 100000 + features: np.ndarray, + labels: np.ndarray, + alpha: float = 0.000155, + iterations: int = 100000, ) -> np.ndarray: """Run gradient descent in a fully vectorized form. From 4db85f54d8cf8af7fcae41af60cf44eace47ad57 Mon Sep 17 00:00:00 2001 From: somrita-banerjee <144698416+somrita-banerjee@users.noreply.github.com> Date: Sun, 5 Oct 2025 17:46:41 +0530 Subject: [PATCH 11/17] Refactor imports and improve README formatting in linear regression scripts --- machine_learning/linear_regression_naive.py | 10 +++++----- machine_learning/linear_regression_vectorized.py | 16 +++++++++------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/machine_learning/linear_regression_naive.py b/machine_learning/linear_regression_naive.py index 38e6617dd6a0..8d6e82af667f 100644 --- a/machine_learning/linear_regression_naive.py +++ b/machine_learning/linear_regression_naive.py @@ -1,3 +1,6 @@ +import httpx +import numpy as np + """README, Author - Somrita Banerjee(mailto:somritabanerjee126@gmail.com) Requirements: - Python >= 3.13 @@ -11,7 +14,8 @@ Usage: - Run this script directly: python linear_regression_naive.py -- The script will fetch the dataset, run linear regression using gradient descent, and print the learned feature vector (theta) and error at each iteration. +- The script will fetch the dataset, run linear regression using gradient descent, + and print the learned feature vector (theta) and error at each iteration. """ @@ -27,7 +31,6 @@ References: https://en.wikipedia.org/wiki/Linear_regression """ - # /// script # requires-python = ">=3.13" # dependencies = [ @@ -36,9 +39,6 @@ # ] # /// -import httpx -import numpy as np - def collect_dataset() -> np.ndarray: """Collect dataset of CSGO (ADR vs Rating) diff --git a/machine_learning/linear_regression_vectorized.py b/machine_learning/linear_regression_vectorized.py index a47e1ac50182..f5388745f07a 100644 --- a/machine_learning/linear_regression_vectorized.py +++ b/machine_learning/linear_regression_vectorized.py @@ -1,3 +1,6 @@ +import httpx +import numpy as np + """README, Author - Somrita Banerjee(mailto:somritabanerjee126@gmail.com) Requirements: - Python >= 3.13 @@ -5,13 +8,16 @@ - numpy Inputs: -- The script automatically downloads a CSV dataset (ADR vs Rating) from a public GitHub URL. -- The dataset must have features in all columns except the last, which is the label (rating). +- The script automatically downloads a CSV dataset (ADR vs Rating) + from a public GitHub URL. +- The dataset must have features in all columns except the last, which is the label + (rating). Usage: - Run this script directly: python linear_regression_vectorized.py -- The script will fetch the dataset, run linear regression using gradient descent, and print the learned feature vector (theta) and error at intervals. +- The script will fetch the dataset, run linear regression using gradient descent, + and print the learned feature vector (theta) and error at intervals. """ @@ -27,7 +33,6 @@ References: https://en.wikipedia.org/wiki/Linear_regression """ - # /// script # requires-python = ">=3.13" # dependencies = [ @@ -36,9 +41,6 @@ # ] # /// -import httpx -import numpy as np - def collect_dataset() -> np.ndarray: """Collect dataset of CSGO (ADR vs Rating). From 6c3e951d15dea1d89b231841b8680d03f0853ba5 Mon Sep 17 00:00:00 2001 From: somrita-banerjee <144698416+somrita-banerjee@users.noreply.github.com> Date: Sun, 5 Oct 2025 18:06:24 +0530 Subject: [PATCH 12/17] fix doctests --- machine_learning/linear_regression_naive.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/machine_learning/linear_regression_naive.py b/machine_learning/linear_regression_naive.py index 8d6e82af667f..efb19d46a16b 100644 --- a/machine_learning/linear_regression_naive.py +++ b/machine_learning/linear_regression_naive.py @@ -161,7 +161,14 @@ def mean_absolute_error(predicted_y: np.ndarray, original_y: np.ndarray) -> floa def main() -> None: - """Driver function.""" + """Driver function. + + >>> main() # doctest: +ELLIPSIS + Iteration 1: Error = ... + ... # lots of output omitted + Resultant Feature vector: + ... + """ data = collect_dataset() len_data = data.shape[0] @@ -170,7 +177,7 @@ def main() -> None: theta = run_linear_regression(data_x, data_y) print("Resultant Feature vector:") - for value in theta.ravel(): + for value in np.asarray(theta).ravel(): print(f"{value:.5f}") From 9c18a51727d7f328dbb970111d645fc10cc91a90 Mon Sep 17 00:00:00 2001 From: somrita-banerjee <144698416+somrita-banerjee@users.noreply.github.com> Date: Sun, 5 Oct 2025 18:26:59 +0530 Subject: [PATCH 13/17] Remove linear regression naive implementation script --- machine_learning/linear_regression_naive.py | 188 -------------------- 1 file changed, 188 deletions(-) delete mode 100644 machine_learning/linear_regression_naive.py diff --git a/machine_learning/linear_regression_naive.py b/machine_learning/linear_regression_naive.py deleted file mode 100644 index efb19d46a16b..000000000000 --- a/machine_learning/linear_regression_naive.py +++ /dev/null @@ -1,188 +0,0 @@ -import httpx -import numpy as np - -"""README, Author - Somrita Banerjee(mailto:somritabanerjee126@gmail.com) -Requirements: -- Python >= 3.13 -- httpx -- numpy - -Inputs: -- Downloads a CSV dataset (ADR vs Rating) from a public GitHub URL. -- The dataset should have features in all columns except the last, which is the label. - -Usage: -- Run this script directly: - python linear_regression_naive.py -- The script will fetch the dataset, run linear regression using gradient descent, - and print the learned feature vector (theta) and error at each iteration. - -""" - -""" -Naive implementation of Linear Regression using Gradient Descent. - -This version is intentionally less optimized and more verbose, -designed for educational clarity. It shows the step-by-step -gradient descent update and error calculation. - -Dataset used: CSGO dataset (ADR vs Rating) - -References: -https://en.wikipedia.org/wiki/Linear_regression -""" -# /// script -# requires-python = ">=3.13" -# dependencies = [ -# "httpx", -# "numpy", -# ] -# /// - - -def collect_dataset() -> np.ndarray: - """Collect dataset of CSGO (ADR vs Rating) - - :return: dataset as numpy matrix - - >>> ds = collect_dataset() - >>> isinstance(ds, np.matrix) - True - >>> ds.shape[1] >= 2 - True - """ - response = httpx.get( - "https://raw.githubusercontent.com/yashLadha/The_Math_of_Intelligence/" - "master/Week1/ADRvsRating.csv", - timeout=10, - ) - lines = response.text.splitlines() - data = [line.split(",") for line in lines] - data.pop(0) # remove header row - dataset = np.matrix(data) - return dataset - - -def run_steep_gradient_descent( - data_x: np.ndarray, - data_y: np.ndarray, - len_data: int, - alpha: float, - theta: np.ndarray, -) -> np.ndarray: - """Run one step of steep gradient descent. - - :param data_x: dataset features - :param data_y: dataset labels - :param len_data: number of samples - :param alpha: learning rate - :param theta: feature vector (weights) - - :return: updated theta - - >>> import numpy as np - >>> data_x = np.array([[1, 2], [3, 4]]) - >>> data_y = np.array([5, 6]) - >>> len_data = len(data_x) - >>> alpha = 0.01 - >>> theta = np.array([0.1, 0.2]) - >>> run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta) - array([0.196, 0.343]) - """ - prod = np.dot(theta, data_x.T) - prod -= data_y.T - grad = np.dot(prod, data_x) - theta = theta - (alpha / len_data) * grad - return theta - - -def sum_of_square_error( - data_x: np.ndarray, data_y: np.ndarray, len_data: int, theta: np.ndarray -) -> float: - """Return sum of square error for error calculation. - - >>> vc_x = np.array([[1.1], [2.1], [3.1]]) - >>> vc_y = np.array([1.2, 2.2, 3.2]) - >>> round(sum_of_square_error(vc_x, vc_y, 3, np.array([1])), 3) - 0.005 - """ - prod = np.dot(theta, data_x.T) - prod -= data_y.T - error = np.sum(np.square(prod)) / (2 * len_data) - return float(error) - - -def run_linear_regression(data_x: np.ndarray, data_y: np.ndarray) -> np.ndarray: - """Run linear regression using gradient descent. - - :param data_x: dataset features - :param data_y: dataset labels - :return: learned feature vector theta - - >>> import numpy as np - >>> x = np.array([[1, 1], [1, 2], [1, 3]]) - >>> y = np.array([1, 2, 3]) - >>> theta = run_linear_regression(x, y) - Iteration 1: Error = ... - ... # lots of output omitted - >>> theta.shape - (1, 2) - >>> abs(theta[0, 0] - 0) < 0.1 # intercept close to 0 - True - >>> abs(theta[0, 1] - 1) < 0.1 # slope close to 1 - True - """ - iterations = 100000 - alpha = 0.000155 - - no_features = data_x.shape[1] - len_data = data_x.shape[0] - 1 - - theta = np.zeros((1, no_features)) - - for i in range(iterations): - theta = run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta) - error = sum_of_square_error(data_x, data_y, len_data, theta) - print(f"Iteration {i + 1}: Error = {error:.5f}") - - return theta - - -def mean_absolute_error(predicted_y: np.ndarray, original_y: np.ndarray) -> float: - """Return mean absolute error. - - >>> predicted_y = np.array([3, -0.5, 2, 7]) - >>> original_y = np.array([2.5, 0.0, 2, 8]) - >>> mean_absolute_error(predicted_y, original_y) - 0.5 - """ - total = sum(abs(predicted_y[i] - y) for i, y in enumerate(original_y)) - return total / len(original_y) - - -def main() -> None: - """Driver function. - - >>> main() # doctest: +ELLIPSIS - Iteration 1: Error = ... - ... # lots of output omitted - Resultant Feature vector: - ... - """ - data = collect_dataset() - - len_data = data.shape[0] - data_x = np.c_[np.ones(len_data), data[:, :-1]].astype(float) - data_y = data[:, -1].astype(float) - - theta = run_linear_regression(data_x, data_y) - print("Resultant Feature vector:") - for value in np.asarray(theta).ravel(): - print(f"{value:.5f}") - - -if __name__ == "__main__": - import doctest - - doctest.testmod() - main() From 6551ba6eb856128390b39c36af28072d72224de0 Mon Sep 17 00:00:00 2001 From: somrita-banerjee <144698416+somrita-banerjee@users.noreply.github.com> Date: Sun, 5 Oct 2025 18:41:18 +0530 Subject: [PATCH 14/17] Refactor docstring and improve script documentation for clarity --- .../linear_regression_vectorized.py | 46 +++++-------------- 1 file changed, 11 insertions(+), 35 deletions(-) diff --git a/machine_learning/linear_regression_vectorized.py b/machine_learning/linear_regression_vectorized.py index f5388745f07a..d91f1144112d 100644 --- a/machine_learning/linear_regression_vectorized.py +++ b/machine_learning/linear_regression_vectorized.py @@ -1,38 +1,22 @@ import httpx import numpy as np -"""README, Author - Somrita Banerjee(mailto:somritabanerjee126@gmail.com) -Requirements: -- Python >= 3.13 -- httpx -- numpy - -Inputs: -- The script automatically downloads a CSV dataset (ADR vs Rating) - from a public GitHub URL. -- The dataset must have features in all columns except the last, which is the label - (rating). - -Usage: -- Run this script directly: - python linear_regression_vectorized.py -- The script will fetch the dataset, run linear regression using gradient descent, - and print the learned feature vector (theta) and error at intervals. - """ +Vectorized Linear Regression using Gradient Descent -""" -Vectorized implementation of Linear Regression using Gradient Descent. +Author: Somrita Banerjee (mailto:somritabanerjee126@gmail.com) -This version uses NumPy vectorization for efficiency. -It is faster and cleaner than the naive version but assumes -readers are familiar with matrix operations. +Requirements: +- Python >= 3.13 +- numpy +- httpx Dataset used: CSGO dataset (ADR vs Rating) References: - https://en.wikipedia.org/wiki/Linear_regression +https://en.wikipedia.org/wiki/Linear_regression """ + # /// script # requires-python = ">=3.13" # dependencies = [ @@ -81,15 +65,7 @@ def gradient_descent( >>> import numpy as np >>> features = np.array([[1, 1], [1, 2], [1, 3]]) >>> labels = np.array([[1], [2], [3]]) - >>> theta = gradient_descent(features, labels, alpha=0.01, iterations=1000) - Iteration 1: Error = ... - ... # output omitted - >>> theta.shape - (2, 1) - >>> abs(theta[0, 0] - 0) < 0.1 # intercept close to 0 - True - >>> abs(theta[1, 0] - 1) < 0.1 # slope close to 1 - True + >>> theta = gradient_descent(features, labels, alpha=0.01, iterations=1000) # doctest: +SKIP """ m, n = features.shape theta = np.zeros((n, 1)) @@ -138,5 +114,5 @@ def main() -> None: if __name__ == "__main__": import doctest - doctest.testmod() - main() + doctest.testmod() # runs all doctests + main() # runs main function From 1d46f384e3f64cea397e448fea4dee29b9fdb69f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 5 Oct 2025 13:11:57 +0000 Subject: [PATCH 15/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/linear_regression_vectorized.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/linear_regression_vectorized.py b/machine_learning/linear_regression_vectorized.py index d91f1144112d..ce0e6da11f53 100644 --- a/machine_learning/linear_regression_vectorized.py +++ b/machine_learning/linear_regression_vectorized.py @@ -115,4 +115,4 @@ def main() -> None: import doctest doctest.testmod() # runs all doctests - main() # runs main function + main() # runs main function From acc797811f6bf38531c2307b94edb7c3671abc86 Mon Sep 17 00:00:00 2001 From: somrita-banerjee <144698416+somrita-banerjee@users.noreply.github.com> Date: Sun, 5 Oct 2025 18:45:03 +0530 Subject: [PATCH 16/17] Fix formatting in gradient_descent doctest and streamline main function call --- machine_learning/linear_regression_vectorized.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/machine_learning/linear_regression_vectorized.py b/machine_learning/linear_regression_vectorized.py index d91f1144112d..537ed3763f2a 100644 --- a/machine_learning/linear_regression_vectorized.py +++ b/machine_learning/linear_regression_vectorized.py @@ -65,7 +65,10 @@ def gradient_descent( >>> import numpy as np >>> features = np.array([[1, 1], [1, 2], [1, 3]]) >>> labels = np.array([[1], [2], [3]]) - >>> theta = gradient_descent(features, labels, alpha=0.01, iterations=1000) # doctest: +SKIP + >>> theta = gradient_descent( + ... features, labels, alpha=0.01, iterations=1000 # doctest: +SKIP + ... ) + """ m, n = features.shape theta = np.zeros((n, 1)) @@ -115,4 +118,4 @@ def main() -> None: import doctest doctest.testmod() # runs all doctests - main() # runs main function + main() # runs main function From acd2f67a01c8e6bd416df5959815c2e32e46e5f8 Mon Sep 17 00:00:00 2001 From: somrita-banerjee <144698416+somrita-banerjee@users.noreply.github.com> Date: Sun, 5 Oct 2025 19:02:35 +0530 Subject: [PATCH 17/17] fix doctest --- machine_learning/linear_regression_vectorized.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/linear_regression_vectorized.py b/machine_learning/linear_regression_vectorized.py index 537ed3763f2a..a7c4e84f9240 100644 --- a/machine_learning/linear_regression_vectorized.py +++ b/machine_learning/linear_regression_vectorized.py @@ -65,7 +65,7 @@ def gradient_descent( >>> import numpy as np >>> features = np.array([[1, 1], [1, 2], [1, 3]]) >>> labels = np.array([[1], [2], [3]]) - >>> theta = gradient_descent( + >>> theta = gradient_descent( ... features, labels, alpha=0.01, iterations=1000 # doctest: +SKIP ... )