# Source code for blackbox_mpc.utils.iterative_mpc

from blackbox_mpc.policies.mpc_policy import \
    MPCPolicy
from blackbox_mpc.utils.dynamics_learning import learn_dynamics_from_policy
import logging
logging.getLogger().setLevel(logging.INFO)
import tensorflow as tf
from blackbox_mpc.dynamics_handlers.system_dynamics_handler import \
    SystemDynamicsHandler


def learn_dynamics_iteratively_w_mpc(env,
                                     number_of_initial_rollouts,
                                     number_of_rollouts_for_refinement,
                                     number_of_refinement_steps,
                                     task_horizon,
                                     env_action_space=None,
                                     env_observation_space=None,
                                     initial_policy=None,
                                     refinement_policy=None,
                                     planning_horizon=None,
                                     reward_function=None,
                                     is_normalized=True,
                                     optimizer_name='CEM',
                                     optimizer=None,
                                     num_agents=None,
                                     nn_optimizer=tf.keras.optimizers.Adam,
                                     dynamics_function=None,
                                     system_dynamics_handler=None,
                                     log_dir=None,
                                     tf_writer=None,
                                     save_model_frequency=1,
                                     saved_model_dir=None,
                                     exploration_noise=False,
                                     epochs=30,
                                     learning_rate=1e-3,
                                     validation_split=0.2,
                                     batch_size=128,
                                     start_episode=0,
                                     **optimizer_args):
    """
    Learn a dynamics function iteratively, refining it with an MPC policy.

    An optional initial phase collects rollouts with ``initial_policy`` and
    trains the dynamics function on them. Afterwards the function alternates
    ``number_of_refinement_steps`` times between collecting rollouts with the
    refinement (MPC) policy and re-training the dynamics function on them.

    Parameters
    ----------
    env: parallelgymEnv
        A wrapped gym environment using
        blackbox.environment_utils.EnvironmentWrapper funcs.
    number_of_initial_rollouts: Int
        Number of initial rollouts/episodes to perform for each of the agents
        in the vectorized environment. When it is not positive, the initial
        training phase is skipped.
    number_of_rollouts_for_refinement: Int
        Number of refinement rollouts/episodes to perform for each of the
        agents in the vectorized environment.
    number_of_refinement_steps: Int
        Number of refinement steps (train, collect, train, ...) to run for.
    task_horizon: Int
        The task horizon/episode length.
    env_action_space: gym.ActionSpace
        Defines the action space of the gym environment.
    env_observation_space: gym.ObservationSpace
        Defines the observation space of the gym environment.
    initial_policy: ModelBasedBasePolicy or ModelFreeBasePolicy
        The policy to be used in collecting the initial episodes from the
        different agents. Only used when ``number_of_initial_rollouts > 0``.
    refinement_policy: ModelBasedBasePolicy
        The policy to be used in collecting the follow-up episodes that
        refine the dynamics function. When None, an ``MPCPolicy`` is built
        from the planning-related arguments below.
    planning_horizon: tf.int32
        Defines the planning horizon for the optimizer (how many steps to
        look ahead and optimize for).
    reward_function: tf_function
        Defines the reward function with the prototype:
        tf_func_name(current_state, current_actions, next_state), where
        current_state is BatchXdim_S, next_state is BatchXdim_S and
        current_actions is BatchXdim_U.
    is_normalized: bool
        Defines if the dynamics function should be trained with
        normalization or not.
    optimizer_name: str
        Optimizer name, one of ['CEM', 'CMA-ES', 'PI2', 'RandomSearch',
        'PSO', 'SPSA'].
    optimizer: OptimizerBaseClass
        Optimizer to be used that optimizes for the best action sequence and
        returns the first action.
    num_agents: tf.int32
        Defines the number of runners running in parallel.
    nn_optimizer: tf.keras.optimizers
        Defines the optimizer to use with the neural network.
    dynamics_function: DeterministicDynamicsFunctionBaseClass
        Defines the system dynamics function.
    system_dynamics_handler: SystemDynamicsHandler
        The system_dynamics_handler is a handler of the state, actions and
        targets processing funcs as well. When None and the initial phase is
        skipped, a new ``SystemDynamicsHandler`` is constructed here.
    log_dir: string
        Defines the log directory to save the normalization statistics in.
    tf_writer: tf.summary
        Tensorflow writer to be used in logging the data.
    save_model_frequency: Int
        Defines how often the model should be saved (defined relative to the
        number of refining iters).
    saved_model_dir: string
        Defines the saved model directory where the model is saved in, in
        case of loading the model.
    exploration_noise: bool
        Defines if exploration noise should be added to the action to be
        executed.
    epochs: Int
        Number of epochs to be used in training the dynamics function
        every time train is called.
    learning_rate: float
        Learning rate to be used in training the dynamics function.
    validation_split: float32
        Defines the validation split to be used of the rollouts collected.
    batch_size: int
        Defines the batch size to be used for training the model.
    start_episode: Int
        The episode index for tensorflow logging purposes. Refinement step
        ``i`` is started at
        ``start_episode + number_of_rollouts_for_refinement * i``.
    **optimizer_args:
        Extra keyword arguments forwarded to ``MPCPolicy`` when
        ``refinement_policy`` is None.

    Returns
    -------
    system_dynamics_handler: SystemDynamicsHandler
        The system_dynamics_handler holds the trained system dynamics.
    refinement_policy: ModelBasedBasePolicy
        The policy that was used for refinement, to be used as a control
        policy.
    """
    # Keyword arguments that both the initial and the refinement training
    # calls pass through unchanged.
    training_kwargs = dict(env=env,
                           task_horizon=task_horizon,
                           epochs=epochs,
                           learning_rate=learning_rate,
                           validation_split=validation_split,
                           batch_size=batch_size,
                           is_normalized=is_normalized,
                           nn_optimizer=nn_optimizer,
                           tf_writer=tf_writer,
                           exploration_noise=exploration_noise)

    if number_of_initial_rollouts > 0:
        # NOTE(review): start_episode is not passed to this initial call,
        # only to the refinement calls below -- confirm that the resulting
        # logged episode indices are the intended ones.
        system_dynamics_handler = learn_dynamics_from_policy(
            policy=initial_policy,
            number_of_rollouts=number_of_initial_rollouts,
            dynamics_function=dynamics_function,
            system_dynamics_handler=system_dynamics_handler,
            log_dir=log_dir,
            save_model_frequency=save_model_frequency,
            saved_model_dir=saved_model_dir,
            **training_kwargs)
        logging.info("Trained initial system model")
    elif system_dynamics_handler is None:
        # No initial training and no handler supplied: build an untrained
        # handler so the MPC policy below has something to plan with.
        system_dynamics_handler = SystemDynamicsHandler(
            env_action_space=env_action_space,
            env_observation_space=env_observation_space,
            true_model=False,
            dynamics_function=dynamics_function,
            tf_writer=tf_writer,
            is_normalized=is_normalized,
            log_dir=log_dir,
            save_model_frequency=save_model_frequency,
            saved_model_dir=saved_model_dir)

    if refinement_policy is None:
        refinement_policy = MPCPolicy(
            reward_function=reward_function,
            env_action_space=env_action_space,
            env_observation_space=env_observation_space,
            dynamics_handler=system_dynamics_handler,
            optimizer=optimizer,
            optimizer_name=optimizer_name,
            num_agents=num_agents,
            planning_horizon=planning_horizon,
            tf_writer=tf_writer,
            **optimizer_args)

    for i in range(number_of_refinement_steps):
        # Each refinement step advances the logging episode index by the
        # number of rollouts collected per step.
        system_dynamics_handler = learn_dynamics_from_policy(
            policy=refinement_policy,
            number_of_rollouts=number_of_rollouts_for_refinement,
            system_dynamics_handler=system_dynamics_handler,
            start_episode=start_episode +
            (number_of_rollouts_for_refinement * i),
            **training_kwargs)
    return system_dynamics_handler, refinement_policy