sb3_type2.py
'''
Define your own reward type and rewards, then train a PPO agent on them
with Stable-Baselines3 and RLGym.
'''
import numpy as np
from rlgym.envs import Match
from rlgym.utils.action_parsers import DiscreteAction
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.vec_env import VecMonitor, VecNormalize, VecCheckNan
from stable_baselines3.ppo import MlpPolicy
from rlgym.utils.obs_builders import AdvancedObs
from rlgym.utils.reward_functions.common_rewards import VelocityPlayerToBallReward, VelocityBallToGoalReward, EventReward
from rlgym.utils.reward_functions.common_rewards import LiuDistancePlayerToBallReward, LiuDistanceBallToGoalReward, RewardIfBehindBall, RewardIfClosestToBall, VelocityReward
from rlgym.utils import RewardFunction
from rlgym.utils.state_setters import DefaultState
from rlgym.utils.terminal_conditions.common_conditions import TimeoutCondition, GoalScoredCondition
from rlgym_tools.sb3_utils import SB3MultipleInstanceEnv
from rlgym.utils.reward_functions import CombinedReward
from customReward import myCustomRewards
from torch.nn import Tanh
from collections import defaultdict
from rlgym.utils.gamestates import GameState, PlayerData
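
# Note: `customReward.myCustomRewards` lives in a separate file that is not shown here.
# From its usage below (constructed as myCustomRewards(rewardType, rewards, REWARDINCREASESTEP=...)
# and passed to Match(reward_function=...)) it is assumed to be a RewardFunction that combines
# the reward list, roughly along the lines of this hypothetical sketch. The names and the
# weighting logic here are assumptions, not the actual implementation:
#
#     class myCustomRewards(RewardFunction):
#         def __init__(self, reward_type, rewards, REWARDINCREASESTEP=10_000_000):
#             self.reward_type = reward_type
#             self.combined = CombinedReward(tuple(rewards))
#
#         def reset(self, initial_state):
#             self.combined.reset(initial_state)
#
#         def get_reward(self, player, state, previous_action):
#             return self.combined.get_reward(player, state, previous_action)
#
#         def get_final_reward(self, player, state, previous_action):
#             return self.combined.get_final_reward(player, state, previous_action)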
class customEventReward(EventReward):
    """EventReward variant that registers a car the first time it is seen, so the
    lookup below cannot fail for cars that were not present when reset() ran."""

    def get_reward(self, player: PlayerData, state: GameState, previous_action: np.ndarray, optional_data=None):
        # Register the car on the first call
        if player.car_id not in self.last_registered_values:
            self.last_registered_values[player.car_id] = self._extract_values(player, state)

        old_values = self.last_registered_values[player.car_id]
        new_values = self._extract_values(player, state)

        diff_values = new_values - old_values
        diff_values[diff_values < 0] = 0  # We only care about increasing values

        reward = np.dot(self.weights, diff_values)

        self.last_registered_values[player.car_id] = new_values
        return reward
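
# Illustration (added, not part of the original script): with weights = np.array([10., 5.])
# and per-car event counters going from old_values = np.array([0, 1]) to
# new_values = np.array([2, 0]), diff_values becomes [2, 0] (the decrease is clipped to 0),
# so the reward is 10 * 2 = 20. Decreasing counters, e.g. after an episode reset, therefore
# never produce negative rewards; penalties come only from negative weights such as concede.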
if __name__ == '__main__':  # Required for multiprocessing
    frame_skip = 8          # Number of ticks to repeat an action
    half_life_seconds = 5   # Easier to conceptualize: after this many seconds the reward discount is 0.5

    fps = 120 / frame_skip
    gamma = np.exp(np.log(0.5) / (fps * half_life_seconds))  # Quick mafs
    print(f"fps={fps}, gamma={gamma}")
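    # Worked example of the numbers above (added for clarity): frame_skip = 8 gives
    # fps = 120 / 8 = 15, so gamma = exp(ln(0.5) / (15 * 5)) = 0.5 ** (1 / 75) ≈ 0.9908,
    # i.e. a reward 75 steps (5 seconds) in the future is discounted by one half.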
    rewardType = 2
    rewards = [
        VelocityReward(),
        LiuDistancePlayerToBallReward(),
        VelocityPlayerToBallReward(),
        LiuDistanceBallToGoalReward(),
        VelocityBallToGoalReward(),
        # RewardIfBehindBall(RewardFunction),
        # RewardIfClosestToBall(RewardFunction),
        customEventReward(
            team_goal=100.0,
            goal=100.0,
            concede=-100.0,
            shot=5.0,
            save=30.0,
            demo=10.0,
        ),
    ]
    def get_match():  # Need to use a function so that each instance can call it and produce its own objects
        return Match(
            team_size=1,  # 1v1 here (the original comment mentioned 3v3, but team_size is set to 1)
            tick_skip=frame_skip,
            reward_function=myCustomRewards(rewardType, rewards, REWARDINCREASESTEP=10000000),
            spawn_opponents=True,
            terminal_conditions=[TimeoutCondition(round(fps * 30)), GoalScoredCondition()],  # Some basic terminals
            obs_builder=AdvancedObs(),      # Not that advanced, but a good default
            state_setter=DefaultState(),    # Resets to kickoff position
            action_parser=DiscreteAction()  # Discrete > Continuous, don't @ me
        )
    env = SB3MultipleInstanceEnv(get_match, 3)             # Start 3 instances, waiting the default delay between launches
    env = VecCheckNan(env)                                 # Optional: catch NaNs/infs early
    env = VecMonitor(env)                                  # Recommended: logs mean reward and ep_len to Tensorboard
    env = VecNormalize(env, norm_obs=False, gamma=gamma)   # Highly recommended: normalizes rewards
    policy_kwargs = dict(
        activation_fn=Tanh,
        net_arch=[512, 512, dict(pi=[256, 256, 256], vf=[256, 256, 256])]
    )
    # Hyperparameters presumably better than default; inspired by the original PPO paper
    model = PPO(
        MlpPolicy,
        env,
        policy_kwargs=policy_kwargs,
        n_epochs=32,                 # PPO calls for multiple epochs
        learning_rate=1e-5,          # Around this is fairly common for PPO
        ent_coef=0.01,               # From PPO Atari
        vf_coef=1.,                  # From PPO Atari
        gamma=gamma,                 # Gamma as calculated using half-life
        verbose=3,                   # Print out all the info as we're going
        batch_size=4096,             # Batch size as high as possible within reason
        n_steps=4096,                # Number of steps to perform before optimizing the network
        tensorboard_log="sb3_type2_out/logs",  # Run `tensorboard --logdir sb3_type2_out/logs` to see graphs
        device="auto"                # Uses GPU if available
    )
    # Save the model every so often.
    # Divide by num_envs (number of agents) because the callback only increments once all agents have taken a step.
    # Checkpoints are written to the specified folder with the specified name prefix.
    callback = CheckpointCallback(round(5_000_000 / env.num_envs), save_path="type2_policy", name_prefix="rl_model")

    # Uncomment to train from scratch instead of resuming from the checkpoint loaded below:
    # model.learn(100_000_000, callback=callback)
    # To resume from a trained checkpoint, load it with PPO.load().
    # The loaded model keeps all attributes of the original model; any attribute can be
    # overwritten via the custom_objects parameter, including n_envs (number of agents),
    # which has to be overwritten to run with a different number of agents.
    model = PPO.load(
        "type2_policy/rl_model_109999964_steps.zip",
        env,
        custom_objects=dict(n_envs=env.num_envs, _last_obs=None),  # Needed to change the number of agents
        device="auto",     # Need to set the device again (if using a specific one)
        force_reset=True   # Make SB3 reset the env so it doesn't think we're continuing from the last state
    )

    # Use reset_num_timesteps=False to keep going with the same logger/checkpoints
    model.learn(100_000_000, callback=callback, reset_num_timesteps=False)
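
    # Optional (not part of the original script): after training one could also persist the
    # final policy and the VecNormalize reward statistics so later evaluation uses the same
    # reward scaling. The paths below are placeholders.
    # model.save("type2_policy/final_model")
    # env.save("type2_policy/vecnormalize.pkl")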