Trying to fix NaN output of RL by using ReLU with an upper bound

This commit is contained in:
2024-07-16 12:13:05 +02:00
parent 0c9813beca
commit aac7434346
13 changed files with 405 additions and 75 deletions
+120 -43
View File
@@ -3,11 +3,13 @@
char *action_name[8] = {"LEFT", "CENTER", "RIGHT"};
/*
 * Clipped rectified linear unit.
 *
 * Returns 0 for x <= 0, x for 0 < x <= 10 and 10 for x > 10.
 * A NaN input fails both comparisons and therefore yields 0.
 */
float reLU(float x){
    return (x > 10) ? 10
         : (x > 0)  ? x
                    : 0;
}
/*
 * Derivative of the clipped ReLU.
 *
 * Returns 1 on the linear segment (0 < x <= 10) and 0 everywhere else,
 * i.e. for x <= 0, for x > 10, and for NaN (both comparisons fail).
 */
float d_reLU(float x){
    const int on_linear_segment = (x > 0) && !(x > 10);
    return on_linear_segment ? 1 : 0;
}
@@ -29,6 +31,10 @@ void copy_weight_in_networks_from_main_to_best(struct networks_qlearning * netwo
COPY_NN_ATTRIBUTE_IN_ALL_LAYERS(TYPE_FLOAT,weight_in, networks->best_net, networks->main_net);
}
/* Identity function: returns its argument unchanged. */
float id(float x)
{
    const float unchanged = x;
    return unchanged;
}
/* Constant function: returns 1 regardless of its argument. */
float constOne(float x)
{
    (void)x; /* argument is intentionally ignored */
    return 1.0f;
}
struct networks_qlearning * create_nework_qlearning(
struct config_layers * config,
bool randomize, float minR, float maxR, int randomRange,
@@ -46,7 +52,6 @@ struct networks_qlearning * create_nework_qlearning(
setup_networks_alloutputs_config_TYPE_FLOAT(&(qnets->best_net), config, false, minR, maxR, randomRange);
copy_weight_in_networks_from_main_to_best(qnets);
setup_all_layers_functions_TYPE_FLOAT(qnets->main_net, tensorContractnProdThread_TYPE_FLOAT, tensorProdThread_TYPE_FLOAT, D_L2, L2, reLU, d_reLU);
setup_all_layers_params_TYPE_FLOAT(qnets->main_net, nb_prod_thread, nb_calc_thread, learning_rate);
setup_all_layers_functions_TYPE_FLOAT(qnets->target_net, tensorContractnProdThread_TYPE_FLOAT, tensorProdThread_TYPE_FLOAT, D_L2, L2, reLU, d_reLU);
@@ -54,6 +59,25 @@ struct networks_qlearning * create_nework_qlearning(
setup_all_layers_functions_TYPE_FLOAT(qnets->best_net, tensorContractnProdThread_TYPE_FLOAT, tensorProdThread_TYPE_FLOAT, D_L2, L2, reLU, d_reLU);
setup_all_layers_params_TYPE_FLOAT(qnets->best_net, nb_prod_thread, nb_calc_thread, learning_rate);
// do not apply an activation function on the output layer, i.e. use the identity function f(x) = x:
neurons_TYPE_FLOAT *tmpMain = qnets->main_net;
neurons_TYPE_FLOAT *tmpTarget = qnets->target_net;
neurons_TYPE_FLOAT *tmpBest = qnets->best_net;
while(tmpMain){
if(tmpMain->next_layer == NULL){
tmpMain->f_act = id;
tmpMain->d_f_act = constOne;
tmpTarget->f_act = id;
tmpTarget->d_f_act = constOne;
tmpBest->f_act = id;
tmpBest->d_f_act = constOne;
}
tmpMain = tmpMain->next_layer;
tmpTarget= tmpTarget->next_layer;
tmpBest = tmpBest->next_layer;
}
return qnets;
@@ -73,6 +97,11 @@ struct status_qlearning * create_status_qlearning (){
status_ql->nb_training_after_updated_weight_in_target = 0;
status_ql->nb_episodes = 0;
status_ql->index_episode= 0;
status_ql->action=1;
// status_ql->last_action=-1;
// status_ql->count_last_action=0;
return status_ql;
}
@@ -129,6 +158,7 @@ struct qlearning_params * create_qlearning_params (
qparams->factor_update_exploration_factor = 0.995;
qparams->minimum_threshold_exploration_factor = 0.01;
// qparams->threshold_number_same_action = 500;
return qparams;
}
@@ -226,6 +256,8 @@ void train_qlearning(struct RL_agent * rlAgent,
qlParams->exploration_factor = (qlParams->exploration_factor < qlParams->minimum_threshold_exploration_factor) ? qlParams->exploration_factor : qlParams->exploration_factor * qlParams->factor_update_exploration_factor ;
// free_tensor_TYPE_FLOAT(action_value);
// free_tensor_TYPE_FLOAT(next_action_value);
}
@@ -236,26 +268,91 @@ int select_action(struct RL_agent * rlAgent){
//calculate_output_by_network_neurons_TYPE_FLOAT(rlAgent->networks->main_net, rlAgent->car->old_sensor, &action_value);
calculate_output_by_network_neurons_TYPE_FLOAT(rlAgent->networks->main_net, rlAgent->car->sensor, &action_value);
//long int NUMBER_EPISODE2 = (rlAgent->qlearnParams->number_episodes)*100;
int NUMBER_EPISODE2 = 3000;
//int randRange = 10000;
//NUMBER_EPISODE2 = NUMBER_EPISODE2 * NUMBER_EPISODE2;
// static bool init = true ;
// if(init){
srand(time(NULL));
// init =false;
// }
int random = rand() % NUMBER_EPISODE2;
float proba_explor = (float)(random ) / NUMBER_EPISODE2;
//static bool init = true ;
//if(init){
//srand(time(NULL));
//init =false;
//}
//int random = xrand() % randRange;
float proba_explor = (float) (rand() % (1<<17 -1))/ (1<<17 -1); //frand(); //(float)(random ) / randRange;
if(proba_explor > rlAgent->qlearnParams->exploration_factor ){
action = ARG_MAX_ARRAY_TYPE_FLOAT( action_value->x, action_value->dim->rank );
//if(action == ARG_MIN_ARRAY_TYPE_FLOAT( action_value->x, action_value->dim->rank ))
//action = xrand() % action_value->dim->rank ;
}
else{
action = rand() % action_value->dim->rank ;
action = xrand() % action_value->dim->rank ;
// explore++;
//printf(" EXPLORE :%ld, action : %d , factor : %f nb_episodes : %ld \n",explore,action,rlAgent->qlearnParams->exploration_factor, rlAgent->status->nb_episodes);
}
/*
if(rlAgent->status->last_action == action){
++(rlAgent->status->count_last_action);
if(rlAgent->status->count_last_action > rlAgent->qlearnParams->threshold_number_same_action ){
while(rlAgent->status->last_action == action)
action = xrand() % action_value->dim->rank ;
rlAgent->status->last_action = action;
rlAgent->status->count_last_action = 0;
}
}
else{
rlAgent->status->last_action = action;
rlAgent->status->count_last_action = 0;
}
*/
rlAgent->status->action = action;
return action;
}
void* runPrint(void *arg){
struct RL_agent *rlAgent = (struct RL_agent*)arg;
struct status_qlearning *qlStatus = rlAgent->status;
struct print_params * pprint = rlAgent->pprint;
struct vehicle *car = rlAgent->car;
size_t count_print = 0;
while(1){
if(/*(qlStatus->nb_episodes %125 == 0) &&*/ pprint->printed){
//pthread_mutex_lock(&(pprint->mut_printed));
pthread_mutex_lock(&(car->mut_coord));
print_vehicle_n_path(car, pprint->scale_x, pprint->scale_y);
pthread_mutex_unlock(&(car->mut_coord));
//pthread_mutex_unlock(&(pprint->mut_printed));
printf("%s ",pprint->string_space);
printf("ep: %ld\n",qlStatus->index_episode);
neurons_TYPE_FLOAT * net_main = rlAgent->networks->main_net;
neurons_TYPE_FLOAT * net_target = rlAgent->networks->target_net;
for(size_t i=0; i<net_main->output->dim->rank; ++i) {
printf("{sensro[%s]:%f "" vs oldsens[%s]: %f}\n",action_name[i%COUNT_ACTION],net_target->output->x[i],
action_name[i%COUNT_ACTION],net_main->output->x[i]);
}
printf("\n< %5.2f > ( %s ) \n", car->direction, action_name[qlStatus->action % COUNT_ACTION]);
//print_weight_in_neurons_TYPE_FLOAT(net_main, "net_main_wei");
//PRINT_ATTRIBUTE_TENS_IN_ALL_LAYERS(TYPE_FLOAT, net_main, weight_in, "net_main_we_in");
PRINT_ATTRIBUTE_TENS_IN_ALL_LAYERS(TYPE_FLOAT, net_main, output, "net_main_out");
//PRINT_ATTRIBUTE_TENS_IN_ALL_LAYERS(TYPE_FLOAT, net_target, output, "net_target_out");
//PRINT_ATTRIBUTE_TENS_IN_ALL_LAYERS(TYPE_FLOAT, net_main, input, "net_main_input");
printf(" action : %d , factor : %f nb_episodes : %ld \n",qlStatus->action,rlAgent->qlearnParams->exploration_factor, rlAgent->status->nb_episodes);
FOR_LIST_FORM_BEGIN(TYPE_L_INT, qlStatus->progress_best_cumul){
printf(" | %ld |,",(qlStatus->progress_best_cumul)->current_list->value);
}
printf("[%ld] %s ", rlAgent->car->status->cumulative_reward, pprint->string_space);
}
Sleep(pprint->delay->delay_between_games);
++count_print;
if(count_print > 20){
count_print = 0;
clear_screen();
}
}
}
void learn_to_drive(struct RL_agent * rlAgent){
int action;
struct vehicle * car = rlAgent->car;
@@ -264,11 +361,15 @@ void learn_to_drive(struct RL_agent * rlAgent){
struct status_qlearning * qlStatus = rlAgent->status;
struct print_params * pprint = rlAgent->pprint;
char msg[100];
pthread_t threadPrint;
pthread_create(&threadPrint, NULL, runPrint, (void*)rlAgent);
while(true){
for(size_t index_episode = 0; index_episode < qlParams->number_episodes; ++index_episode){
reset(car);
qlStatus->nb_training_after_updated_weight_in_target = 0;
qlStatus->index_episode = index_episode;
while(true){
++(qlStatus->nb_episodes);
++(qlStatus->nb_training_after_updated_weight_in_target);
@@ -277,51 +378,27 @@ void learn_to_drive(struct RL_agent * rlAgent){
add_string_log_M(car_status,msg);
step_vehicle(car, action);
train_qlearning(rlAgent, action);
if(/*(qlStatus->nb_episodes %15 == 0) && */ pprint->printed){
pthread_mutex_lock(&(pprint->mut_printed));
print_vehicle_n_path(car, pprint->scale_x, pprint->scale_y);
pthread_mutex_unlock(&(pprint->mut_printed));
printf("%s ",pprint->string_space);
printf("ep: %ld\n",index_episode);
neurons_TYPE_FLOAT * net_main = rlAgent->networks->main_net;
neurons_TYPE_FLOAT * net_target = rlAgent->networks->target_net;
for(size_t i=0; i<net_main->output->dim->rank; ++i) {
printf("{sensro[%s]:%f "/*vs %f / VS / %f */" vs oldsens[%s]: %f}\n",action_name[i%COUNT_ACTION],net_target->output->x[i],
/*car->sensor->x[i] ,car->old_sensor->x[i],
*/action_name[i%COUNT_ACTION],net_main->output->x[i]);
}
printf("\n< %f > ( %s ) \n", car->direction, action_name[action % COUNT_ACTION]);
//print_weight_in_neurons_TYPE_FLOAT(net_main, "net_main_wei");
//PRINT_ATTRIBUTE_TENS_IN_ALL_LAYERS(TYPE_FLOAT, net_main, weight_in, "net_main_we_in");
PRINT_ATTRIBUTE_TENS_IN_ALL_LAYERS(TYPE_FLOAT, net_main, output, "net_main_out");
//PRINT_ATTRIBUTE_TENS_IN_ALL_LAYERS(TYPE_FLOAT, net_target, output, "net_target_out");
//PRINT_ATTRIBUTE_TENS_IN_ALL_LAYERS(TYPE_FLOAT, net_main, input, "net_main_input");
printf("action : %d , factor : %f nb_episodes : %ld \n",action,rlAgent->qlearnParams->exploration_factor, rlAgent->status->nb_episodes);
Sleep(pprint->delay->delay_between_games);
}
//done in step ... copy_tensor_TYPE_FLOAT(car->old_sensor, car->sensor);
//done in step ... copy_tensor_TYPE_FLOAT(car->old_sensor, car->sensor);
if( qlStatus->nb_training_after_updated_weight_in_target > qlParams->nb_training_before_update_weight_in_target ){
qlStatus->nb_training_after_updated_weight_in_target = 0;
copy_weight_in_networks_from_main_to_target(rlAgent->networks);
}
if(car_status->done == true){
//push_back_list_TYPE_L_INT(qlStatus->list_main_cumul, car_status->cumulative_reward);
printf(" cumul : %ld ", car_status->cumulative_reward);
// printf(" cumul : %ld ", car_status->cumulative_reward);
if(car_status->cumulative_reward > qlStatus->progress_best_cumul->end_list->value){
push_back_list_TYPE_L_INT(qlStatus->progress_best_cumul, car_status->cumulative_reward);
FOR_LIST_FORM_BEGIN(TYPE_L_INT, qlStatus->progress_best_cumul){
printf(" | %ld |,",(qlStatus->progress_best_cumul)->current_list->value);
}
printf("%s ",pprint->string_space);
}
break;
}
}
if(pprint->printed){
Sleep(pprint->delay->delay_between_episodes);
}
//if(pprint->printed){
// Sleep(pprint->delay->delay_between_episodes);
//}
}
}
pthread_join(threadPrint, NULL);
}
@@ -37,6 +37,7 @@ struct qlearning_params {
float minimum_threshold_exploration_factor;
long int nb_training_before_update_weight_in_target;
size_t number_episodes;
// size_t threshold_number_same_action;
};
@@ -46,6 +47,10 @@ struct status_qlearning {
struct main_list_TYPE_L_INT * progress_best_cumul;
long int nb_training_after_updated_weight_in_target;
size_t nb_episodes;
size_t index_episode;
int action;
// int last_action;
// size_t count_last_action;
};
struct delay_params {
+38 -12
View File
@@ -64,7 +64,7 @@ sensors * create_sensors(size_t nb_xs){
struct vehicle * create_vehicle(struct blocks *path){
struct vehicle * ret_vehicle = malloc(sizeof(struct vehicle));
pthread_mutex_init(&(ret_vehicle->mut_coord), NULL);
ret_vehicle->coord = create_coordinate(2);
ret_vehicle->sensor = create_sensors(NB_SENSORS);
ret_vehicle->old_sensor = create_sensors(NB_SENSORS);
@@ -117,6 +117,7 @@ void free_sensors(sensors *snsr){
}
void free_vehicle(struct vehicle * vhcl){
pthread_mutex_destroy(&(vhcl->mut_coord));
free_coordinate(vhcl->coord);
free_blocks(vhcl->path);
free_sensors(vhcl->sensor);
@@ -392,8 +393,10 @@ void print_vehicle_n_path(struct vehicle *v, float scale_x, float scale_y){
}
/*
 * Advance the vehicle by one step of length v->speed along v->direction.
 *
 * v->direction is converted from degrees to radians; x[0] is increased by
 * the cosine component and x[1] is decreased by the sine component.
 * The coordinate update is performed while holding v->mut_coord.
 */
void move_vehicle(struct vehicle *v){
    pthread_mutex_t *coord_lock = &(v->mut_coord);

    pthread_mutex_lock(coord_lock);
    {
        const double heading_rad = v->direction * M_PI / 180;
        v->coord->x[0] += v->speed * cos(heading_rad);
        v->coord->x[1] -= v->speed * sin(heading_rad);
    }
    pthread_mutex_unlock(coord_lock);
}
float distance2_coordinate(coordinate *c0, coordinate *c1){
@@ -413,11 +416,12 @@ float distance2_coordinate(coordinate *c0, coordinate *c1){
diStep_sensor->x[1] -= step_sensor * sin(direction_radian);\
}\
dist = (distance2_coordinate(diStep_sensor, v->coord)/5);\
printf("| dist :%f | ",dist);\
v->sensor->x[position] = (float)(MIN((SUBDIVISION-1),(int)dist))/SUBDIVISION ;\
/*printf("| dist :%f | ",dist);*/\
v->sensor->x[position] = (float)(MIN((SUBDIVISION-1),dist))/SUBDIVISION ;\
//v->sensor->x[position] = (float)(MIN((SUBDIVISION-1),(int)dist))/SUBDIVISION ;\
//v->sensor->x[position] = (MIN(49,(distance2_coordinate(diStep_sensor, v->coord)/5))) ;\
@@ -430,7 +434,7 @@ void read_sensor(struct vehicle *v){
coordinate * diStep_sensor = create_coordinate(2);
copy_coordinate(diStep_sensor, v->coord->x);
float dist;
printf("\n");
// printf("\n");
// count the number of step until we go out of the path = distance
// center sensor
float direction_radian ;
@@ -493,11 +497,23 @@ void add_string_log(struct game_status *status, char *str ){
}
/*
 * Return a small signed random perturbation.
 *
 * A value is drawn with xrand() and reduced modulo 500; the result is that
 * draw divided by 5000, with a positive sign for even draws and a negative
 * sign for odd draws.
 * NOTE(review): assumes xrand() yields non-negative values, so the draw is
 * in [0, 499] -- confirm against its definition.
 */
float addEpsilonRand(){
    const int span = 500;
    const int draw = xrand() % span;
    /* even draw -> +1, odd draw -> -1 */
    const int sign = 1 - 2 * (draw % 2);

    return sign * (float)draw / (span * 10);
}
void step_vehicle(struct vehicle *v, int action){
//float action_x[NB_ACTION]={-3,0,3}; // [LEFT, CENTER, RIGHT]
float action_x[NB_ACTION]={-3,0,3}; // [LEFT, CENTER, RIGHT]
v->direction = (float)((int)(v->direction + action_x[action % 3]) % 360) ;
v->speed = SPEED; // /5;
//v->direction += addEpsilonRand();
//v->speed = SPEED; // /5;
move_vehicle(v);
read_sensor(v);
struct game_status *status = v->status;
@@ -517,6 +533,7 @@ void step_vehicle(struct vehicle *v, int action){
bool broken = false;
long pprec, prec, next;
char msg[48];
//size_t count_i[path->nb_blocks];
for(long i=0; i< path->nb_blocks; ++i){
//prec = (i-1)%(path->nb_blocks);
pprec = (i + path->nb_blocks - 2 )%(path->nb_blocks);
@@ -531,7 +548,12 @@ void step_vehicle(struct vehicle *v, int action){
status->done = false;
sprintf(msg," %ld,",i);
add_string_log(status, msg);
}
//count_i[i] = 0;
}/*else{
if(count_i[i]>10000)
status->reward = -10;
++count_i[i];
}*/
if(path->marker[next] == true){
status->reward = REWARD_STOP;
status->done = true;
@@ -550,6 +572,9 @@ void step_vehicle(struct vehicle *v, int action){
}
status->cumulative_reward += status->reward;
if(status->cumulative_reward < -25000){
status->done = true;
}
}
#define RANDOM 1
@@ -565,28 +590,29 @@ void reset(struct vehicle *v){
sprintf(v->status->log,"\n");
v->status->cur_log = 0;
//if(init){
srand(time(NULL));
// init = false;
//srand(time(NULL));
//init = false;
//}
int random;
int diff;
diff = path->upper_bound_block[0]->x[0] - path->lower_bound_block[0]->x[0];
random = rand() % (diff/2) ;
random = xrand() % (diff/2) ;
#if RANDOM
v->coord->x[0] = path->lower_bound_block[0]->x[0] + random;
#else
v->coord->x[0] = path->lower_bound_block[0]->x[0] + diff/2;
#endif
diff = path->upper_bound_block[0]->x[1] - path->lower_bound_block[0]->x[1];
random = rand() % (diff/2);
random = xrand() % (diff/2);
#if RANDOM
v->coord->x[1] = path->lower_bound_block[0]->x[1] + random;
#else
v->coord->x[1] = path->lower_bound_block[0]->x[1] + diff/2;
#endif
random = rand() % 50;
random = xrand() % 50;
#if RANDOM
v->direction = random - 25;
// v->direction = 115 - random ;
v->direction = random - 25 ;
#else
v->direction = -90;
#endif
+1
View File
@@ -82,6 +82,7 @@ struct blocks {
typedef tensor_TYPE_FLOAT sensors;
struct vehicle {
pthread_mutex_t mut_coord;
coordinate *coord;
float direction;
float speed;
+24 -9
View File
@@ -401,7 +401,7 @@ float df(float x){
return exp(-x)/ ((1+exp(-x)) * (1+exp(-x)));
}
#if 1
TEST(first_learn_vehicle_rev50){
TEST(first_learn_vehicle_rev50_8){
size_t nb_block = 7;
size_t dim= 2;
struct blocks * path = create_blocks(nb_block, dim);
@@ -467,8 +467,8 @@ TEST(first_learn_vehicle_rev50){
struct status_qlearning *qlstatus = create_status_qlearning ();
struct delay_params *dly = create_delay_params (
100/*size_t delay_between_episodes*/,
10/*size_t delay_between_games*/
500/*size_t delay_between_episodes*/,
50/*size_t delay_between_games*/
);
struct qlearning_params *qlparams = create_qlearning_params (
@@ -510,7 +510,7 @@ TEST(first_learn_vehicle_rev50){
#if 1
TEST(first_learn_vehicle_50){
TEST(first_learn_vehicle_50__9){
size_t nb_block = 7;
size_t dim= 2;
struct blocks * path = create_blocks(nb_block, dim);
@@ -518,7 +518,23 @@ TEST(first_learn_vehicle_50){
#if 1
copy_coordinate(path->lower_bound_block[4], (float[]){0,0});
copy_coordinate(path->upper_bound_block[4], (float[]){150,250});
copy_coordinate(path->lower_bound_block[3], (float[]){150,40});
copy_coordinate(path->upper_bound_block[3], (float[]){250,150});
copy_coordinate(path->lower_bound_block[2], (float[]){250,80});
copy_coordinate(path->upper_bound_block[2], (float[]){360,200});
copy_coordinate(path->lower_bound_block[1], (float[]){360,70});
copy_coordinate(path->upper_bound_block[1], (float[]){600,150});
copy_coordinate(path->lower_bound_block[0], (float[]){600,90});
copy_coordinate(path->upper_bound_block[0], (float[]){760,300});
copy_coordinate(path->lower_bound_block[6], (float[]){260,300});
copy_coordinate(path->upper_bound_block[6], (float[]){760,360});
copy_coordinate(path->lower_bound_block[5], (float[]){0,250});
copy_coordinate(path->upper_bound_block[5], (float[]){410,300});
/*
copy_coordinate(path->lower_bound_block[0], (float[]){0,0});
copy_coordinate(path->upper_bound_block[0], (float[]){150,250});
copy_coordinate(path->lower_bound_block[1], (float[]){150,0});
@@ -534,7 +550,6 @@ TEST(first_learn_vehicle_50){
copy_coordinate(path->lower_bound_block[6], (float[]){0,250});
copy_coordinate(path->upper_bound_block[6], (float[]){410,300});
/*
copy_coordinate(path->lower_bound_block[0], (float[]){0,0});
copy_coordinate(path->upper_bound_block[0], (float[]){100,250});
copy_coordinate(path->lower_bound_block[1], (float[]){100,0});
@@ -611,8 +626,8 @@ TEST(first_learn_vehicle_50){
struct status_qlearning *qlstatus = create_status_qlearning ();
struct delay_params *dly = create_delay_params (
100/*size_t delay_between_episodes*/,
10/*size_t delay_between_games*/
500/*size_t delay_between_episodes*/,
50/*size_t delay_between_games*/
);
struct qlearning_params *qlparams = create_qlearning_params (
@@ -653,7 +668,7 @@ TEST(first_learn_vehicle_50){
#if 0
#if 1
TEST(first_learn_vehicle){
size_t nb_block = 7;
size_t dim= 2;
@@ -763,7 +778,7 @@ TEST(first_learn_vehicle){
#if 0
#if 1
TEST(first_learn_vehicle){
size_t nb_block = 7;
size_t dim= 2;