import mdpsolver
import random
import sys
import numpy as np
from random import randint

#TEST 1
#Simple MDP with 3 states and 2 actions in each state.

#---------------------------------------
# CONFIGURATION 1
#---------------------------------------

#rewards
#1st index: from (current) states
#2nd index: actions
rewards = [[5, -1],
           [1, -2],
           [50, 0]]

#transition probabilities
#1st index: from (current) states
#2nd index: actions
#3rd index: to (next) states
tranMatWithZeros = [[[0.9, 0.1, 0.0], [0.1, 0.9, 0.0]],
                    [[0.4, 0.5, 0.1], [0.3, 0.5, 0.2]],
                    [[0.2, 0.2, 0.6], [0.5, 0.5, 0.0]]]

#initial policy
random.seed(10)
initPolicy = [randint(0, 1) for p in range(0, 3)]

#Model 1a (discounted reward, parallel)
mdl1a = mdpsolver.model()
mdl1a.mdp(discount=0.95,
          rewards=rewards,
          tranMatWithZeros=tranMatWithZeros)
mdl1a.solve(algorithm="mpi",
            update="standard",
            parallel=True,
            initPolicy=initPolicy)
print(mdl1a.getPolicy())
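
#---------------------------------------
# OPTIONAL SANITY CHECK (illustrative sketch, not part of the original test)
#---------------------------------------
#Independent value iteration in plain NumPy on the same MDP, so the solver's
#result can be cross-checked without relying on any mdpsolver internals.
#Assumption: mdl1a.getPolicy() returns one action index per state, so the
#greedy policy printed below should be comparable to it.
R = np.array(rewards, dtype=float)          #shape: (states, actions)
P = np.array(tranMatWithZeros, dtype=float) #shape: (states, actions, states)
assert np.allclose(P.sum(axis=2), 1.0)      #each (state, action) distribution must sum to 1

V = np.zeros(R.shape[0])
for _ in range(1000):
    Q = R + 0.95 * (P @ V)                  #Q[s,a] = r(s,a) + discount * sum_s' P[s,a,s'] * V[s']
    V_new = Q.max(axis=1)
    if np.max(np.abs(V_new - V)) < 1e-8:    #stop once the value function has converged
        break
    V = V_new

greedyPolicy = list(Q.argmax(axis=1))       #greedy (optimal) action per state
print(greedyPolicy)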