/* code to test Q-learning on a given MDP */

/* define usual variables */

#include <stdio.h>
#include <stdlib.h>

#include "krandom.h"

/* dimensions of the MDP: number of states and number of actions */
#define STATES 14
#define ACTIONS 2

/* ITERATIONS is the loop bound for the learning steps of one run in
   run_trials(); one trial_data entry is recorded every STEP_SIZE steps */
#define ITERATIONS 100000
#define STEP_SIZE 1000

/* semi_uniform() asks for a random action when its random draw exceeds
   1.0 - EXPLORATION_THRESHOLD */
#define EXPLORATION_THRESHOLD 0.05

/* number of learning runs started by main() */
#define RUNS 5

/* trial_data[run][k]: accumulated reward divided by STEP_SIZE, stored by
   run_trials() and summarized across runs by output_trial_data() */
double trial_data[RUNS][ITERATIONS/STEP_SIZE];

/* Q-table, indexed qvalues[state][action] */
double qvalues[STATES][ACTIONS]; 

/* transition probabilities, indexed prob[action][from_state][to_state] */
double prob[ACTIONS][STATES][STATES];

/* rewards, indexed reward[action][from_state][to_state] */
double reward[ACTIONS][STATES][STATES];


/* update_qvalues() blends the old Q-value and the new estimate with
   weights (1 - BETA) and BETA (the learning rate); GAMMA scales the best
   Q-value of the successor state (the discount factor) */
double BETA=0.1;
double GAMMA=0.7;

/* Reset every entry of the Q-table to zero. */
void init_qvalues()
{
  int state, action;

  state = 0;
  while (state < STATES)
    {
      for (action = 0; action < ACTIONS; action++)
        qvalues[state][action] = 0.0;
      state++;
    }
}

/*
 * Load the transition probabilities from the text file "intro-mdp.prob"
 * into prob[action][from_state][to_state].
 *
 * The file is read as ACTIONS*STATES*STATES whitespace-separated numbers,
 * ordered by action, then source state, then destination state.  If the
 * file cannot be opened, or an entry is missing or not a number, the
 * problem is reported on stderr and the program exits with EXIT_FAILURE:
 * the simulation cannot run without a complete transition model.
 */
void read_trans_matrix()
{
  static const char path[] = "intro-mdp.prob";
  FILE *fp;
  int s_from,s_to,a;

  fp = fopen(path,"r");
  if (fp == NULL)
    {
      perror(path);
      exit(EXIT_FAILURE);
    }

  for (a=0; a<ACTIONS; a++)
    for (s_from=0; s_from<STATES; s_from++)
      for (s_to=0; s_to<STATES; s_to++)
        if (fscanf(fp, "%lf", &prob[a][s_from][s_to]) != 1)
          {
            fprintf(stderr,
                    "%s: missing or malformed entry (action %d, state %d -> %d)\n",
                    path, a, s_from, s_to);
            fclose(fp);
            exit(EXIT_FAILURE);
          }

  /* fclose(), not close(): the stream is a FILE*, not a file descriptor */
  fclose(fp);
}

/* Print prob[][][] to stdout: for each action, one line per source state
   listing the probabilities of every destination state, followed by a
   blank line. */
void print_trans_matrix()
{
  int action, from, to;

  for (action = 0; action < ACTIONS; action++)
    {
      for (from = 0; from < STATES; from++)
        {
          const double *row = prob[action][from];

          for (to = 0; to < STATES; to++)
            printf("%f ", row[to]);
          printf("\n");
        }
      printf("\n");
    }
}

/*
 * Load the rewards from the text file "intro-mdp.reward" into
 * reward[action][from_state][to_state].
 *
 * The file is read as ACTIONS*STATES*STATES whitespace-separated numbers,
 * ordered by action, then source state, then destination state.  If the
 * file cannot be opened, or an entry is missing or not a number, the
 * problem is reported on stderr and the program exits with EXIT_FAILURE.
 */
void read_reward_matrix()
{
  static const char path[] = "intro-mdp.reward";
  FILE *fp;
  int s_from,s_to,a;

  fp = fopen(path,"r");
  if (fp == NULL)
    {
      perror(path);
      exit(EXIT_FAILURE);
    }

  for (a=0; a<ACTIONS; a++)
    for (s_from=0; s_from<STATES; s_from++)
      for (s_to=0; s_to<STATES; s_to++)
        if (fscanf(fp, "%lf", &reward[a][s_from][s_to]) != 1)
          {
            fprintf(stderr,
                    "%s: missing or malformed entry (action %d, state %d -> %d)\n",
                    path, a, s_from, s_to);
            fclose(fp);
            exit(EXIT_FAILURE);
          }

  /* fclose(), not close(): the stream is a FILE*, not a file descriptor */
  fclose(fp);
}

/* Print reward[][][] to stdout: for each action, one line per source
   state listing the reward for every destination state, followed by a
   blank line. */
void print_reward_matrix()
{
  int action, from, to;

  for (action = 0; action < ACTIONS; action++)
    {
      for (from = 0; from < STATES; from++)
        {
          const double *row = reward[action][from];

          for (to = 0; to < STATES; to++)
            printf("%f ", row[to]);
          printf("\n");
        }
      printf("\n");
    }
}


/* Return the largest Q-value stored for `state` over all actions. */
double best_qvalue(int state)
{
  const double *row = qvalues[state];
  double best = row[0];
  int a;

  for (a = 1; a < ACTIONS; a++)
    if (row[a] > best)
      best = row[a];

  return best;
}


void update_qvalues(int olds,int news,int action,double r)
{
	double best_qval, qval;
	
	best_qval = best_qvalue(news);

	qval = qvalues[olds][action];
	
	qvalues[olds][action] = 
	  (1 - BETA)*qval + BETA*(r  + GAMMA*best_qval);
      }
      

/* Sample the successor of `state` under `action` from the transition row
   prob[action][state][*].  One value is drawn from choose_random_value()
   (declared outside this file; presumably uniform on [0,1] -- confirm in
   krandom.h) and the first state whose cumulative probability reaches
   the draw is returned.  If none of the first STATES-1 states qualifies,
   the last state is returned. */
int next_state(int state, int action)
{
  const double *row = prob[action][state];
  double draw = choose_random_value();
  double cumulative = 0.0;
  int s;

  for (s = 0; s < (STATES-1); s++)
    {
      cumulative += row[s];
      if (draw <= cumulative)
        return s;
    }

  return s;
}

/* SEMI-UNIFORM EXPLORATION
 *
 * Choose the action to take in `state`.  One value is drawn from
 * choose_random_value(); when it exceeds 1.0 - EXPLORATION_THRESHOLD the
 * action is whatever choose_random_int_value(ACTIONS-1) returns,
 * otherwise it is the action with the largest Q-value for `state`
 * (the lowest index wins ties).
 *
 * NOTE(review): choose_random_int_value() is declared outside this file;
 * confirm that its argument is an inclusive upper bound, otherwise the
 * last action is never explored.
 */
int semi_uniform (int state)
{
  double best_val;
  int a,act;
  double rand_value;

  rand_value = choose_random_value();

  if (rand_value > (1.0 - EXPLORATION_THRESHOLD))
    return(choose_random_int_value(ACTIONS-1));

  /* Start from action 0 so a valid action is always returned, even when
     no Q-value compares greater than another (e.g. NaN entries). */
  act = 0;
  best_val = qvalues[state][0];
  for (a=1;a<ACTIONS;a++)
    {
      if (qvalues[state][a] > best_val)
        {
          best_val = qvalues[state][a];
          act = a;
        }
    }
  return (act);
}

/* Print the Q-table to stdout: a newline, then one line per state with
   an "R(state,action) = value" entry for each action, and a final
   newline. */
void output_action_values()
{
  int state, action;

  for (state = 0; state < STATES; state++)
    {
      printf("\n");
      action = 0;
      while (action < ACTIONS)
        {
          printf("R(%d,%d) = %f ",state,action,qvalues[state][action]);
          action++;
        }
    }
  printf("\n");
}

/*
 * Perform one learning run of ITERATIONS steps, starting from state 0
 * with a zeroed Q-table.
 *
 * Each step picks an action with semi_uniform(), samples the successor
 * with next_state(), and applies update_qvalues() with the reward of
 * that transition.  After every STEP_SIZE steps the average reward of
 * those steps is stored in trial_data[run][k], where k is the index of
 * the window just completed (0 .. ITERATIONS/STEP_SIZE - 1).  At the
 * end the average reward per step over the whole run is printed as
 * "gain".
 *
 * run: row of trial_data to fill; must be in 0 .. RUNS-1.
 */
void run_trials(int run)
{
  int i,olds,news,act;
  double r;
  double window_sum=0.0;   /* reward collected in the current window */
  double total=0.0;        /* reward collected over the whole run */

  init_qvalues();

  olds = 0;

  /* Exactly ITERATIONS steps: an inclusive bound would close one window
     too many and write past the end of trial_data[run]. */
  for (i=0; i<ITERATIONS; i++)
    {
      act = semi_uniform(olds);

      news = next_state(olds,act);

      r = reward[act][olds][news];
      window_sum += r;
      total += r;

      /* Close a window once it holds STEP_SIZE rewards, so that every
         stored value is an average over the same number of steps. */
      if ((i+1)%STEP_SIZE==0)
        {
          trial_data[run][i/STEP_SIZE] = window_sum/STEP_SIZE;
          window_sum = 0.0;
        }

      update_qvalues(olds,news,act,r);

      olds = news;
    }

  /* The per-window sum is reset as it goes, so the overall gain is
     computed from the separate running total. */
  printf("gain: %f\n", total/ITERATIONS);
}

/*
 * Write a summary of trial_data to the text file "q-run-data",
 * replacing any previous contents.  One line is written per recording
 * window j:
 *
 *     j  mean-over-runs  smallest-over-runs  largest-over-runs
 *
 * All RUNS rows of trial_data are included.  If the file cannot be
 * opened the error is reported on stderr and nothing is written.
 */
void output_trial_data()
{
  static const char path[] = "q-run-data";
  FILE *run;
  int i,j;
  double small, large, sum;

  run = fopen(path, "w");
  if (run == NULL)
    {
      perror(path);
      return;
    }

  for (j=0;j< ITERATIONS/STEP_SIZE;j++)
    {
      small = 1E+10;
      large = -1E+10;
      sum=0.0;
      for (i=0; i<RUNS; i++)
        {
          sum += trial_data[i][j];
          if (trial_data[i][j] < small)
            small = trial_data[i][j];
          if (trial_data[i][j] > large)
            large = trial_data[i][j];
        }
      fprintf(run,"%d %f %f %f\n", j, sum/RUNS, small, large);
    }

  /* fclose() flushes buffered output; a failure here means lost data */
  if (fclose(run) != 0)
    perror(path);
}

      
/*
 * Entry point: seed the random number generator, load and echo the
 * transition and reward matrices, then perform RUNS learning runs.
 *
 * The summary file is rewritten after every run.  Until the last run has
 * finished, the rows of trial_data belonging to runs not yet performed
 * are still zero (trial_data has static storage), so only the final
 * version of the file reflects all RUNS runs.
 */
int main(void)
{

  int i;

  initialize_random_number_generator();  

  read_trans_matrix();
  print_trans_matrix();
  read_reward_matrix();
  print_reward_matrix();

  for (i=0; i<RUNS; i++)
    {
      printf("\n");
      run_trials(i);
      output_trial_data();
    }

  return 0;
}


      
      
	  

      


