-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdirect_train_deep_max_entropy_rl.py
More file actions
249 lines (205 loc) · 8.38 KB
/
direct_train_deep_max_entropy_rl.py
File metadata and controls
249 lines (205 loc) · 8.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
import gym
import numpy as np
import matplotlib.pyplot as plt
import math
import torch
import torch.nn as nn
import torch.optim as optim
class QNetwork(nn.Module):
    """Two-hidden-layer MLP mapping a state vector to one Q-value per action."""

    def __init__(self, input_size, output_size):
        """
        :param input_size: dimensionality of the input state vector.
        :param output_size: number of discrete actions (Q-values emitted).
        """
        super(QNetwork, self).__init__()
        # Attribute names are kept stable so previously saved state_dicts
        # remain loadable.
        self.fc1 = nn.Linear(input_size, 64)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(64, 32)
        self.relu2 = nn.ReLU()
        self.output_layer = nn.Linear(32, output_size)

    def forward(self, state):
        """Return the Q-value vector for the given state tensor."""
        hidden = self.relu1(self.fc1(state))
        hidden = self.relu2(self.fc2(hidden))
        return self.output_layer(hidden)
class MaxEntropyDeepRL:
    """
    Deep Q-learning agent for maximum-entropy IRL on a discretized 2-D
    (position, velocity) state space such as MountainCar.

    Learns Q-values with a small MLP (`QNetwork`) and provides the
    state-discretization / IRL-reward helpers the training loop needs.
    """

    def __init__(self, state_size, action_size, feature_matrix, one_feature,
                 learning_rate=0.001, gamma=0.99):
        """
        :param state_size: dimensionality of the raw observation vector.
        :param action_size: number of discrete actions.
        :param feature_matrix: (n_states, n_features) state-feature matrix.
        :param one_feature: number of discretization bins per state dimension.
        :param learning_rate: Adam learning rate for the online Q-network.
        :param gamma: discount factor.
        """
        self.q_network = QNetwork(state_size, action_size)
        self.target_q_network = QNetwork(state_size, action_size)
        self.target_q_network.load_state_dict(self.q_network.state_dict())
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate)
        self.gamma = gamma
        self.feature_matrix = feature_matrix
        self.one_feature = one_feature
        # Bug fix: select_action previously hard-coded 3 actions.
        self.action_size = action_size
        # Bug fix: get_reward read self.theta, which was never initialised and
        # crashed with AttributeError unless a caller set it externally.
        self.theta = np.zeros(feature_matrix.shape[1])

    def select_action(self, state, epsilon):
        """
        Epsilon-greedy action selection from the online Q-network.

        :param state: raw observation vector.
        :param epsilon: exploration probability in [0, 1].
        :return: int action index.
        """
        if np.random.rand() < epsilon:
            return np.random.choice(self.action_size)
        with torch.no_grad():
            q_values = self.q_network(torch.FloatTensor(state))
        return torch.argmax(q_values).item()

    def update_q_network(self, state, action, reward, next_state, done):
        """
        One TD(0) update of the online Q-network against the target network.

        :param state: raw observation before the step.
        :param action: action taken.
        :param reward: reward received.
        :param next_state: raw observation after the step.
        :param done: True if the episode terminated at next_state.
        :return: None
        """
        state = torch.FloatTensor(state)
        next_state = torch.FloatTensor(next_state)
        q_values = self.q_network(state)
        next_q_values = self.target_q_network(next_state)
        # Only the taken action's entry differs from the current estimate, so
        # the MSE loss effectively trains that single output.
        target = q_values.clone()
        if not done:
            target[action] = reward + self.gamma * torch.max(next_q_values).item()
        else:
            target[action] = reward
        loss = nn.MSELoss()(q_values, target.detach())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_network(self):
        """
        Hard-copies the online network weights into the target network.

        :return: None
        """
        self.target_q_network.load_state_dict(self.q_network.state_dict())

    def discretize_state(self, env, state):
        """
        Discretizes the continuous (position, velocity) observation into
        per-dimension bin indices.

        :param env: environment providing observation_space low/high bounds.
        :param state: raw (position, velocity) observation.
        :return: [position_idx, velocity_idx]
        """
        env_low = env.observation_space.low
        env_high = env.observation_space.high
        env_distance = (env_high - env_low) / self.one_feature
        position_idx = int((state[0] - env_low[0]) / env_distance[0])
        velocity_idx = int((state[1] - env_low[1]) / env_distance[1])
        return [position_idx, velocity_idx]

    def state_to_idx(self, env, state):
        """
        Converts a raw (position, velocity) observation into a flat state index.

        :param env: environment providing observation_space low/high bounds.
        :param state: raw (position, velocity) observation.
        :return: int flat index (position_idx + velocity_idx * one_feature).
        """
        # Reuse discretize_state instead of duplicating the binning logic
        # (the original also carried a stray empty docstring here).
        position_idx, velocity_idx = self.discretize_state(env, state)
        return position_idx + velocity_idx * self.one_feature

    def get_reward(self, n_states, state_idx):
        """
        IRL reward of a state: dot product of its features with theta.

        :param n_states: total number of discretized states.
        :param state_idx: flat state index.
        :return: scalar reward.
        """
        irl_rewards = self.feature_matrix.dot(self.theta).reshape((n_states,))
        return irl_rewards[state_idx]
# Training Loop
def train(agent, env, learner_feature_expectations, n_states, episodes=5000, max_steps=10000,
          epsilon_start=1.0,
          epsilon_decay=0.995, epsilon_min=0.01):
    """
    Runs epsilon-greedy deep Q-learning while accumulating state-visitation
    feature counts for the maximum-entropy IRL outer loop.

    :param agent: MaxEntropyDeepRL agent to train.
    :param env: gym environment using the 5-tuple step API.
    :param learner_feature_expectations: feature-count accumulator (len n_states).
    :param n_states: number of discretized states.
    :param episodes: number of training episodes.
    :param max_steps: per-episode step cap.
    :param epsilon_start: initial exploration rate.
    :param epsilon_decay: multiplicative epsilon decay per episode.
    :param epsilon_min: exploration floor.
    :return: None; saves networks, learning curves and heatmaps as side effects.
    """
    epsilon = epsilon_start
    episode_arr, scores = [], []
    best_reward = -math.inf
    # Last computed visitation density; pre-initialised so the periodic heatmap
    # dump below can never hit an unbound name.
    learner = np.zeros(n_states)
    for episode in range(episodes):
        state, info = env.reset()
        total_reward = 0
        for step in range(max_steps):
            action = agent.select_action(state, epsilon)
            # NOTE(review): the truncation flag is ignored here so episodes can
            # run up to max_steps; only true termination ends an episode early.
            next_state, reward, done, _, _ = env.step(action)
            total_reward += reward
            agent.update_q_network(state, action, reward, next_state, done)
            agent.update_target_network()
            # State counting for density: accumulate the feature vector of the
            # discretized state just visited.
            state_idx = agent.state_to_idx(env, state)
            learner_feature_expectations += agent.feature_matrix[int(state_idx)]
            state = next_state
            if done:
                break
        # Keep track of best performing network
        if total_reward > best_reward:
            best_reward = total_reward
            torch.save(agent.q_network.state_dict(),
                       f"../results/maxentropydeep_{episode}_best_network_w_{total_reward}_RL.pth")
        if (episode + 1) % 10 == 0:
            # Bug fix: the accumulator covers exactly the last 10 episodes, so
            # normalise by the window size (was divided by the episode index,
            # which under-weighted later windows more and more).
            learner = learner_feature_expectations / 10
            learner_feature_expectations = np.zeros(n_states)
        scores.append(total_reward)
        episode_arr.append(episode)
        epsilon = max(epsilon * epsilon_decay, epsilon_min)
        print(f"Episode: {episode + 1}, Total Reward: {total_reward}, Epsilon: {epsilon}")
        if (episode + 1) % 1000 == 0:
            score_avg = np.mean(scores)
            print('{} episode average score is {:.2f}'.format(episode, score_avg))
            save_plot_as_png(episode_arr, scores, f"../learning_curves/maxent_{episodes}_{episode}_qnetwork_RL.png")
            save_heatmap_as_png(learner.reshape((20, 20)), f"../heatmap/learner_{episode}_deep_RL.png")
            # Bug fix: `theta` was a free, undefined name here (NameError at
            # episode 999); use the agent's reward weights when available.
            theta = getattr(agent, "theta", None)
            if theta is not None:
                save_heatmap_as_png(np.asarray(theta).reshape((20, 20)), f"../heatmap/theta_{episode}_deep_RL.png")
            torch.save(agent.q_network.state_dict(), f"../results/maxent_{episodes}_{episode}_network_RL.pth")
        if episode == episodes - 1:
            save_plot_as_png(episode_arr, scores, f"../learning_curves/maxentdeep_{episodes}_qdeep_RL.png")
            torch.save(agent.q_network.state_dict(), f"../results/maxentdeep_{episodes}_q_network_RL.pth")
def save_heatmap_as_png(data, output_path, title=None, xlabel="Position", ylabel="Velocity"):
    """
    Render a 2D array as a heatmap and write it to a PNG file.

    :param data: 2D numpy array containing the heatmap data.
    :param output_path: Output path for saving the PNG file.
    :param title: Title for the plot (optional).
    :param xlabel: Label for the x-axis (optional).
    :param ylabel: Label for the y-axis (optional).
    """
    fig, ax = plt.subplots()
    image = ax.imshow(data, cmap='viridis', interpolation='nearest')
    fig.colorbar(image)
    # Labels/title are only drawn when a truthy value is supplied.
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)
    fig.savefig(output_path, format='png')
    plt.close(fig)
def save_plot_as_png(x, y, output_path, title=None, xlabel="Episodes", ylabel="Scores"):
    """
    Render a line plot of y against x and write it to a PNG file.

    :param x: 1D numpy array or list representing the x-axis values.
    :param y: 1D numpy array or list representing the y-axis values.
    :param output_path: Output path for saving the plot as a PNG file.
    :param title: Title for the plot (optional).
    :param xlabel: Label for the x-axis (optional).
    :param ylabel: Label for the y-axis (optional).
    """
    fig, ax = plt.subplots()
    ax.plot(x, y)
    # Labels/title are only drawn when a truthy value is supplied.
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)
    fig.savefig(output_path, format='png')
    plt.close(fig)
# Main function
def main():
    """Entry point: build the MountainCar env and agent, then run training."""
    env = gym.make('MountainCar-v0')
    state_size = env.observation_space.shape[0]
    action_size = 3  # env.action_space.n
    # Feature matrix: one identity (one-hot) feature per discretized state.
    n_states = 400  # 20 * 20
    one_feature = 20  # number of states per one feature
    feature_matrix = np.eye(n_states)
    agent = MaxEntropyDeepRL(state_size, action_size, feature_matrix, one_feature)
    learner_feature_expectations = np.zeros(n_states)
    train(agent, env, learner_feature_expectations, n_states)


if __name__ == "__main__":
    # Wrapped in main() so importing this module has no side effects and no
    # globals leak from the script body.
    main()