# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc.ex01.agent import Agent
from irlc.ex02.dp import DP_stochastic
from irlc import train
import numpy as np


class DynamicalProgrammingAgent(Agent):
    """
    This is an agent which plan using dynamical programming.
    """
    def __init__(self, env, model=None):
        super().__init__(env)
        self.J, self.pi_ = DP_stochastic(model)

    def pi(self, s, k, info=None):
        if k >= len(self.pi_):
            raise Exception("k >= N; I have not planned this far!")
        ## TODO: Half of each line of code in the following 1 lines have been replaced by garbage. Make it work and remove the error.
        #----------------------------------------------------------------------------------------------------------------------------
        # action = se????????????
        raise NotImplementedError("Get the action according to the DP policy.")
        return action

    def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):  # Do nothing; this is DP so no learning takes place.
        pass


def main():
    from irlc.ex01.inventory_environment import InventoryEnvironment
    from irlc.ex02.inventory import InventoryDPModel

    env = InventoryEnvironment(N=3) 
    inventory_model = InventoryDPModel(N=3)
    agent = DynamicalProgrammingAgent(env, model=inventory_model)
    stats, _ = train(env, agent, num_episodes=5000) 

    s, _ = env.reset() # Get initial state
    Er = np.mean([stat['Accumulated Reward'] for stat in stats])
    print("Estimated reward using trained policy and MC rollouts", Er)  
    print("Reward as computed using DP", -agent.J[0][s])  

if __name__ == "__main__":
    main()