import time
from collections import deque

import numpy as np
import torch
from unityagents import UnityEnvironment

from config import Config
from ddpg_agent import Agent
from model import *  # kept from the original; the agent is assumed to use these classes

# Run everything on the CPU; the Tennis environment is small enough for this.
device = torch.device("cpu")
env = UnityEnvironment(file_name="Tennis.app")
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
# reset the environment
env_info = env.reset(train_mode=False)[brain_name]
# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)
# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)
# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])
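# For the default Tennis build these prints typically report 2 agents,
# an action size of 2, and a 24-dimensional observation per agent.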
config = Config(num_workers=num_agents)
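# Note: config.episode_count (used by ddpg below) is assumed to come from
# Config's defaults; only num_workers is overridden here.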
def run(env, agent, max_step=10000, train_mode=True):
    """Play a single episode and return the score averaged over all agents."""
    env_info = env.reset(train_mode=train_mode)[brain_name]
    states = env_info.vector_observations
    scores = np.zeros(num_agents)
    agent.reset()
    for t in range(max_step):
        actions = agent.act(states)
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        if train_mode:
            # Store the experience and (possibly) run a learning step.
            agent.step(states, actions, rewards, next_states, dones)
        states = next_states
        scores += rewards
        if np.any(dones):
            break
    return np.mean(scores)
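
# run() can also be used on its own to score a single evaluation episode, e.g.:
#     episode_score = run(env, agent, train_mode=False)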
def ddpg(agent, train_mode=True):
    """Run up to config.episode_count episodes; stop once the 100-episode
    average score reaches 0.5."""
    scores_window = deque(maxlen=100)  # most recent 100 episode scores
    scores = []
    for i in range(1, config.episode_count + 1):
        begin = time.time()
        score = run(env=env, agent=agent, train_mode=train_mode)
        scores_window.append(score)
        score_average = np.mean(scores_window)
        scores.append(score)
        if i % 10 == 0:
            print('\rEpisode {} Average score: {:.2f} Min: {:.2f} Max: {:.2f} Time: {:.2f} Epsilon: {:.2f}'.format(
                i,
                score_average,
                np.min(scores),
                np.max(scores),
                time.time() - begin,
                agent.epsilon
            ))
        if score_average >= 0.5:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            print('\nSolved in {:d} episodes. Average score: {:.2f}'.format(i, score_average))
            break
    return scores
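
# The 0.5 threshold matches the Tennis benchmark (average score of +0.5 over
# 100 consecutive episodes). The official criterion usually takes the max over
# the two agents per episode, whereas run() averages over agents instead.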
if __name__ == '__main__':
    rand_seed = 0
    agent = Agent(config=config,
                  state_size=state_size,
                  action_size=action_size,
                  num_agents=num_agents,
                  random_seed=rand_seed,
                  device=device,
                  actor_trained_weight_filename="checkpoint_actor.pth",
                  critic_trained_weight_filename="checkpoint_critic.pth")
    # train_mode=False: evaluate the pre-trained checkpoints rather than train.
    # Passing train_mode=True instead would drive the same loop as training.
    scores = ddpg(agent, train_mode=False)
    env.close()