Adam
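For reference, the class below implements the standard Adam update rule. Writing $g_t$ for the gradient of the loss with respect to a parameter $\theta$ at step $t$, and $\alpha$ for the learning rate, the updates computed in `apply` can be summarized as:

$$
\begin{aligned}
m_t &= \beta_1 m_{t-1} + (1 - \beta_1)\,g_t \\
v_t &= \beta_2 v_{t-1} + (1 - \beta_2)\,g_t^2 \\
\hat{m}_t &= \frac{m_t}{1 - \beta_1^{\,t}}, \qquad \hat{v}_t = \frac{v_t}{1 - \beta_2^{\,t}} \\
\theta_t &= \theta_{t-1} - \frac{\alpha\,\hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon}
\end{aligned}
$$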
import numpy as np


class Adam(Optimizer):
    """
    Adam optimizer.

    Attributes:
        learning_rate : float
            The learning rate for parameter updates.
        beta_1 : float
            The exponential decay rate for the first-moment estimates.
        beta_2 : float
            The exponential decay rate for the second-moment estimates.
        epsilon : float
            A small constant added to the denominator to avoid division by zero.
    """
    def __init__(self, learning_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-07):
        super().__init__(learning_rate)
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        # First- and second-moment accumulators for the weights and biases,
        # initialized to zero.
        self.m_w = 0
        self.v_w = 0
        self.m_b = 0
        self.v_b = 0
        self.timestep = 0
    def apply(self, W: np.ndarray, dl_dw: np.ndarray, b: np.ndarray, dl_db: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        self.timestep += 1
        # Update the biased first-moment (mean) estimates of the gradients.
        self.m_w = (self.beta_1 * self.m_w) + ((1 - self.beta_1) * dl_dw)
        self.m_b = (self.beta_1 * self.m_b) + ((1 - self.beta_1) * dl_db)
        # Update the biased second-moment (uncentered variance) estimates.
        self.v_w = (self.beta_2 * self.v_w) + ((1 - self.beta_2) * np.square(dl_dw))
        self.v_b = (self.beta_2 * self.v_b) + ((1 - self.beta_2) * np.square(dl_db))
        # Bias-correct both estimates; the correction matters most in early steps,
        # when the accumulators are still close to their zero initialization.
        m_w_hat = self.m_w / (1 - (self.beta_1 ** self.timestep))
        m_b_hat = self.m_b / (1 - (self.beta_1 ** self.timestep))
        v_w_hat = self.v_w / (1 - (self.beta_2 ** self.timestep))
        v_b_hat = self.v_b / (1 - (self.beta_2 ** self.timestep))
        # Scale each parameter's step by the inverse square root of its
        # second-moment estimate, then step along the bias-corrected first moment.
        W -= (self.learning_rate / (np.sqrt(v_w_hat) + self.epsilon)) * m_w_hat
        b -= (self.learning_rate / (np.sqrt(v_b_hat) + self.epsilon)) * m_b_hat
        return W, b
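As a rough usage sketch (assuming the Optimizer base class defined elsewhere in this project, and using made-up shapes and random gradients in place of real backpropagation output), a layer's weights and biases could be updated each step like this:

import numpy as np

# Hypothetical shapes for a small dense layer; real shapes come from the layer.
rng = np.random.default_rng(0)
W = rng.normal(size=(4, 3))   # weight matrix
b = np.zeros(3)               # bias vector

optimizer = Adam()            # uses the defaults defined above

for step in range(100):
    # dl_dw and dl_db would normally come from backpropagation;
    # random values stand in for them in this sketch.
    dl_dw = rng.normal(size=W.shape)
    dl_db = rng.normal(size=b.shape)
    W, b = optimizer.apply(W, dl_dw, b, dl_db)

Because the optimizer keeps per-parameter state (m_w, v_w, m_b, v_b and the shared timestep), each weight/bias pair needs its own Adam instance.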