Source code for splendor.agents.our_agents.ppo.common

"""
Collection of useful calculation functions.
"""

import torch
from torch import distributions

from .constants import ENTROPY_COEFFICIENT, VALUE_COEFFICIENT, VERY_SMALL_EPSILON


def calculate_returns(
    rewards: torch.Tensor, discount_factor: float, normalize: bool = True
) -> torch.Tensor:
    """
    Calculate episode returns (discounted cumulative summation of the rewards).

    :param rewards: the rewards obtained throughout the episode.
    :param discount_factor: by how much rewards decay over time.
    :param normalize: whether the returns should be normalized
                      (have a mean of 0 and a variance of 1).
    :return: the calculated returns.
    """
    returns_list: list[float] = []
    cumulative_reward: float = 0
    for r in reversed(rewards):
        cumulative_reward = r + cumulative_reward * discount_factor
        returns_list.insert(0, cumulative_reward)
    returns = torch.tensor(returns_list)

    if normalize:
        # avoid possible division by 0
        returns = (returns - returns.mean()) / (returns.std() + VERY_SMALL_EPSILON)

    return returns
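
# Usage sketch (illustrative only, not part of the original module): computes the
# discounted returns of a short three-step episode with a single terminal reward.
# The helper name `_example_calculate_returns`, the reward values and the
# discount_factor of 0.99 are made-up assumptions for demonstration.
def _example_calculate_returns() -> torch.Tensor:
    """Hypothetical demo of calculate_returns on a three-step episode."""
    rewards = torch.tensor([0.0, 0.0, 1.0])  # sparse reward at the last step
    # without normalization the result is roughly [0.9801, 0.99, 1.0]
    return calculate_returns(rewards, discount_factor=0.99, normalize=False)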
def calculate_advantages(
    returns: torch.Tensor, values: torch.Tensor, normalize: bool = True
) -> torch.Tensor:
    """
    Calculate the advantages.

    :param returns: the returns (cumulative summation of rewards).
    :param values: the value estimates for each state.
    :param normalize: should the advantages be normalized, i.e. have a mean of 0
                      and a variance of 1.
    :return: the calculated advantages.
    """
    advantages = returns - values

    if normalize:
        # avoid possible division by 0
        advantages = (advantages - advantages.mean()) / (
            advantages.std() + VERY_SMALL_EPSILON
        )

    return advantages
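
# Usage sketch (illustrative only, not part of the original module): pairs discounted
# returns with critic value estimates to obtain advantages. The helper name
# `_example_calculate_advantages` and all tensor values are made up for demonstration.
def _example_calculate_advantages() -> torch.Tensor:
    """Hypothetical demo of calculate_advantages."""
    returns = torch.tensor([1.0, 0.5, 0.25])
    values = torch.tensor([0.9, 0.6, 0.2])  # pretend critic outputs
    # positive entries mean the outcome was better than the critic expected
    return calculate_advantages(returns, values, normalize=False)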
def calculate_policy_loss(
    action_prob: torch.Tensor,
    actions: torch.Tensor,
    log_prob_actions: torch.Tensor,
    advantages: torch.Tensor,
    ppo_clip: float,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Calculate the clipped policy loss.

    :param action_prob: the action probabilities.
    :param actions: the actions taken.
    :param log_prob_actions: the log-probabilities of the actions.
    :param advantages: the advantages.
    :param ppo_clip: the clipping epsilon of the PPO clipped objective.
    :return: the policy loss, the Kullback-Leibler divergence estimate
             & the entropy bonus.
    """
    dist = distributions.Categorical(action_prob)

    # new log-probabilities of the old actions under the current policy
    new_log_prob_actions = dist.log_prob(actions)
    policy_ratio = (new_log_prob_actions - log_prob_actions).exp()
    policy_loss_1 = policy_ratio * advantages
    policy_loss_2 = (
        torch.clamp(policy_ratio, min=1.0 - ppo_clip, max=1.0 + ppo_clip) * advantages
    )

    policy_loss = -torch.min(policy_loss_1, policy_loss_2).mean()
    kl_divergence_estimate = (
        (log_prob_actions - new_log_prob_actions).mean().detach().cpu()
    )

    # entropy bonus - used to improve exploration.
    # as seen here (bullet #10):
    # https://iclr-blog-track.github.io/2022/03/25/ppo-implementation-details/
    entropy = dist.entropy().mean()

    return policy_loss, kl_divergence_estimate, entropy
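
# Usage sketch (illustrative only, not part of the original module): evaluates the
# clipped surrogate for a batch of three timesteps over four actions. The helper name
# `_example_calculate_policy_loss`, the probabilities, advantages and ppo_clip value
# are arbitrary assumptions for demonstration.
def _example_calculate_policy_loss() -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Hypothetical demo of calculate_policy_loss."""
    action_prob = torch.tensor(
        [
            [0.1, 0.2, 0.3, 0.4],
            [0.25, 0.25, 0.25, 0.25],
            [0.7, 0.1, 0.1, 0.1],
        ]
    )
    actions = torch.tensor([3, 0, 0])
    # log-probabilities recorded at collection time (here simply recomputed,
    # so the policy ratio is exactly 1 and no clipping takes effect)
    log_prob_actions = distributions.Categorical(action_prob).log_prob(actions)
    advantages = torch.tensor([0.5, -0.2, 1.0])
    return calculate_policy_loss(
        action_prob, actions, log_prob_actions, advantages, ppo_clip=0.2
    )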
def calculate_loss(
    policy_loss: torch.Tensor, value_loss: torch.Tensor, entropy_bonus: torch.Tensor
) -> torch.Tensor:
    """
    Final loss of the PPO clipped objective, as seen here:
    https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/ppo2/model.py#L91

    :param policy_loss: the calculated policy loss.
    :param value_loss: the calculated value loss.
    :param entropy_bonus: the calculated entropy bonus.
    :return: the PPO objective, i.e. a linear combination of the losses & entropy bonus.
    """
    loss = (
        policy_loss
        + VALUE_COEFFICIENT * value_loss
        - ENTROPY_COEFFICIENT * entropy_bonus
    )
    return loss
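
# Usage sketch (illustrative only, not part of the original module): combines the
# three terms exactly as a training loop would. The helper name
# `_example_calculate_loss` and the scalar inputs are placeholders.
def _example_calculate_loss() -> torch.Tensor:
    """Hypothetical demo of calculate_loss."""
    policy_loss = torch.tensor(0.1)
    value_loss = torch.tensor(0.4)
    entropy_bonus = torch.tensor(1.2)
    # the result is policy_loss + VALUE_COEFFICIENT * value_loss
    # - ENTROPY_COEFFICIENT * entropy_bonus
    return calculate_loss(policy_loss, value_loss, entropy_bonus)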