
Commit 6d1111a

Merge pull request #152 from cpnota/release/0.5.2
release/0.5.2
2 parents 68d355a + ff83a20 commit 6d1111a

19 files changed, +186 −35 lines

all/approximation/q_dist.py

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ def __init__(
         v_min,
         v_max,
         name="q_dist",
-        **kwargs,
+        **kwargs
     ):
         device = next(model.parameters()).device
         self.n_actions = n_actions

all/approximation/v_network_test.py

Lines changed: 2 additions & 2 deletions
@@ -39,10 +39,10 @@ def test_multi_reinforce(self):
             mask=torch.tensor([1, 1, 0, 1, 0, 0])
         )
         result1 = self.v(states[0:2])
-        result2 = self.v(states[2:4])
-        result3 = self.v(states[4:6])
         self.v.reinforce(loss(result1, torch.tensor([1, 2])).float())
+        result2 = self.v(states[2:4])
         self.v.reinforce(loss(result2, torch.tensor([1, 1])).float())
+        result3 = self.v(states[4:6])
         self.v.reinforce(loss(result3, torch.tensor([1, 2])).float())
         with self.assertRaises(Exception):
             self.v.reinforce(loss(result3, torch.tensor([1, 2])).float())

all/environments/abstract.py

Lines changed: 3 additions & 6 deletions
@@ -41,14 +41,11 @@ def step(self, action):

         Returns
         -------
-        State
-            The state of the environment after the action is applied
+        all.environments.State
+            The State of the environment after the action is applied.
+            This State object includes both the done flag and any additional "info"
         float
             The reward achieved by the previous action
-        done
-            True if the environment has entered a terminal state and should be reset
-        info
-            Diagnostic information useful for debugging
         """

     @abstractmethod

all/experiments/parallel_env_experiment.py

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ def __init__(

         # test state
         self._test_episodes = 100
-        self._test_episodes_started = 0
+        self._test_episodes_started = self._n_envs
         self._test_returns = []
         self._should_save_returns = [True] * self._n_envs

all/experiments/single_env_experiment.py

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@ def _run_training_episode(self):
     def _run_test_episode(self):
         # initialize the episode
         self._env.reset()
-        action = self._agent.act(self._env.state, self._env.reward)
+        action = self._agent.eval(self._env.state, self._env.reward)
         returns = 0

         # loop until the episode is finished

all/experiments/single_env_experiment_test.py

Lines changed: 2 additions & 2 deletions
@@ -76,8 +76,8 @@ def test_writes_test_returns(self):
         experiment = MockExperiment(dqn(), self.env, quiet=True)
         experiment.train(episodes=5)
         returns = experiment.test(episodes=4)
-        expected_mean = 10.25
-        expected_std = 1.0897247358851685
+        expected_mean = 9.5
+        expected_std = 0.5
         np.testing.assert_equal(np.mean(returns), expected_mean)
         np.testing.assert_equal(
             experiment._writer.data["evaluation/returns-test/mean"]["values"],

all/experiments/writer.py

Lines changed: 1 addition & 4 deletions
@@ -14,8 +14,6 @@ class ExperimentWriter(SummaryWriter, Writer):
     tagging the run with a combination of the agent name, the commit hash of the
     current git repo of the working directory (if any), and the current time.
     Also writes summary statistics into CSV files.
-
-
     Args:
         experiment (all.experiments.Experiment): The Experiment associated with the Writer object.
         agent_name (str): The name of the Agent the Experiment is being performed on
@@ -24,7 +22,7 @@ class ExperimentWriter(SummaryWriter, Writer):
     '''
     def __init__(self, experiment, agent_name, env_name, loss=True):
         self.env_name = env_name
-        current_time = str(datetime.now())
+        current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S %f')
        os.makedirs(
            os.path.join(
                "runs", ("%s %s %s" % (agent_name, COMMIT_HASH, current_time)), env_name
@@ -51,7 +49,6 @@ def add_schedule(self, name, value, step="frame"):
     def add_scalar(self, name, value, step="frame"):
         '''
         Log an arbitrary scalar.
-
         Args:
             name (str): The tag to associate with the scalar
             value (number): The value of the scalar at the current step

all/nn/__init__.py

Lines changed: 1 addition & 2 deletions
@@ -10,7 +10,6 @@ class RLNetwork(nn.Module):
     """
     Wraps a network such that States can be given as input.
     """
-
     def __init__(self, model, _=None):
         super().__init__()
         self.model = model
@@ -20,7 +19,7 @@ def forward(self, state):
         return self.model(state.features.float()) * state.mask.float().unsqueeze(-1)

 class Aggregation(nn.Module):
-    """len()
+    """
     Aggregation layer for the Dueling architecture.

     https://arxiv.org/abs/1511.06581
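
For context on the docstring being fixed here: the Aggregation layer combines a state-value stream and an advantage stream as in the dueling architecture (arXiv:1511.06581). A minimal sketch of that aggregation rule, written for illustration rather than copied from the library:

import torch

def dueling_aggregation(value, advantages):
    # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a), the identifiability trick from the dueling paper
    return value + advantages - torch.mean(advantages, dim=1, keepdim=True)

value = torch.zeros(2, 1)         # one state-value estimate per batch element
advantages = torch.randn(2, 4)    # one advantage estimate per action
print(dueling_aggregation(value, advantages).shape)  # torch.Size([2, 4])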

all/optim/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -1 +1,3 @@
 from .scheduler import LinearScheduler, Schedulable
+
+__all__ = ['Schedulable', 'LinearScheduler']

all/policies/deterministic.py

Lines changed: 13 additions & 0 deletions
@@ -4,6 +4,18 @@


 class DeterministicPolicy(Approximation):
+    '''
+    A DDPG-style deterministic policy.
+
+    Args:
+        model (torch.nn.Module): A Pytorch module representing the policy network.
+            The input shape should be the same as the shape of the state space,
+            and the output shape should be the same as the shape of the action space.
+        optimizer (torch.optim.Optimizer): An optimizer initialized with the
+            model parameters, e.g. SGD, Adam, RMSprop, etc.
+        action_space (gym.spaces.Box): The Box representing the action space.
+        kwargs (optional): Any other arguments accepted by all.approximation.Approximation
+    '''
     def __init__(
         self,
         model,
@@ -20,6 +32,7 @@ def __init__(
             **kwargs
         )

+
 class DeterministicPolicyNetwork(RLNetwork):
     def __init__(self, model, space):
         super().__init__(model)

all/policies/deterministic_test.py

Lines changed: 5 additions & 9 deletions
@@ -57,28 +57,24 @@ def test_target(self):
             self.space,
             target=FixedTarget(3)
         )
-
-        # choose initial action
         state = State(torch.ones(1, STATE_DIM))
-        action = self.policy(state)
-        tt.assert_equal(action, torch.zeros(1, ACTION_DIM))

         # run update step, make sure target network doesn't change
-        action.sum().backward(retain_graph=True)
+        self.policy(state).sum().backward()
         self.policy.step()
         tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

         # again...
-        action.sum().backward(retain_graph=True)
+        self.policy(state).sum().backward()
         self.policy.step()
         tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

         # third time, target should be updated
-        action.sum().backward(retain_graph=True)
+        self.policy(state).sum().backward()
         self.policy.step()
         tt.assert_allclose(
-            self.policy.eval(state),
-            torch.tensor([[-0.595883, -0.595883, -0.595883]]),
+            self.policy.target(state),
+            torch.tensor([[-0.574482, -0.574482, -0.574482]]),
             atol=1e-4,
         )
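
The comments in this test describe the behavior of FixedTarget(3): the target network stays frozen for the first two update steps and is synchronized with the online network on the third. A generic sketch of that fixed-interval pattern, written as an illustration under that assumption rather than as the library's FixedTarget code:

import copy
import torch

class FixedIntervalTarget:
    def __init__(self, model, interval):
        self._target = copy.deepcopy(model)   # frozen copy of the online network
        self._interval = interval
        self._updates = 0

    def __call__(self, x):
        with torch.no_grad():
            return self._target(x)

    def update(self, model):
        # copy the online weights only every `interval` update steps
        self._updates += 1
        if self._updates % self._interval == 0:
            self._target.load_state_dict(model.state_dict())

model = torch.nn.Linear(2, 3)
target = FixedIntervalTarget(model, interval=3)
x = torch.ones(1, 2)
for _ in range(3):
    with torch.no_grad():
        model.weight += 0.1               # stand-in for an optimizer step
    target.update(model)
print(torch.allclose(target(x), model(x)))   # True: synced on the third update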

all/policies/gaussian.py

Lines changed: 18 additions & 0 deletions
@@ -6,6 +6,24 @@


 class GaussianPolicy(Approximation):
+    '''
+    A Gaussian stochastic policy.
+
+    This policy will choose actions from a distribution represented by a spherical Gaussian.
+    The first n outputs of the model will be squashed to [-1, 1] through a tanh function, and then
+    scaled to the given action_space, and the remaining n outputs will define the amount of noise added.
+
+    Args:
+        model (torch.nn.Module): A Pytorch module representing the policy network.
+            The input shape should be the same as the shape of the state (or feature) space,
+            and the output shape should be double the size of the action space.
+            The first n outputs will be the unscaled mean of the action for each dimension,
+            and the second n outputs will be the logarithm of the variance.
+        optimizer (torch.optim.Optimizer): An optimizer initialized with the
+            model parameters, e.g. SGD, Adam, RMSprop, etc.
+        action_space (gym.spaces.Box): The Box representing the action space.
+        kwargs (optional): Any other arguments accepted by all.approximation.Approximation
+    '''
     def __init__(
         self,
         model,
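
To make the description above concrete: with 2n model outputs, the first n become a mean squashed by tanh and rescaled to the action bounds, and the last n are read as a log variance. A rough, self-contained sketch of that construction (illustrative only; the helper below is not the library's GaussianPolicy code):

import torch

def build_distribution(outputs, low, high):
    n = low.shape[0]
    means = torch.tanh(outputs[:, :n])               # squash the first n outputs to [-1, 1]
    means = (means + 1) / 2 * (high - low) + low     # rescale to the Box bounds
    std = outputs[:, n:].mul(0.5).exp()              # log variance -> standard deviation
    return torch.distributions.normal.Normal(means, std)

outputs = torch.randn(1, 6)                          # 2n outputs for a 3-dimensional action space
low = torch.tensor([-1.0, -1.0, -1.0])
high = torch.tensor([1.0, 1.0, 1.0])
dist = build_distribution(outputs, low, high)
print(dist.sample().shape)                           # torch.Size([1, 3])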

all/policies/greedy.py

Lines changed: 13 additions & 0 deletions
@@ -3,6 +3,19 @@
 from all.optim import Schedulable

 class GreedyPolicy(Schedulable):
+    '''
+    An "epsilon-greedy" action selection policy for discrete action spaces.
+
+    This policy will usually choose the optimal action according to an approximation
+    of the action value function (the "q-function"), but with probability epsilon will
+    choose a random action instead. GreedyPolicy is a Schedulable, meaning that
+    epsilon can be varied over time by passing a Scheduler object.
+
+    Args:
+        q (all.approximation.QNetwork): The action-value or "q-function"
+        num_actions (int): The number of available actions.
+        epsilon (float, optional): The probability of selecting a random action.
+    '''
     def __init__(
         self,
         q,
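
The selection rule the new docstring describes is standard epsilon-greedy; a minimal sketch of that rule for reference (not GreedyPolicy's actual implementation):

import numpy as np
import torch

def epsilon_greedy(q_values, epsilon):
    # with probability epsilon, explore with a uniformly random action;
    # otherwise exploit the action with the highest estimated value
    if np.random.rand() < epsilon:
        return np.random.randint(q_values.shape[-1])
    return int(torch.argmax(q_values))

q_values = torch.tensor([0.1, 0.9, 0.3])
print(epsilon_greedy(q_values, epsilon=0.1))  # usually 1, occasionally a random index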

all/policies/soft_deterministic.py

Lines changed: 36 additions & 5 deletions
@@ -3,6 +3,20 @@
 from all.nn import RLNetwork

 class SoftDeterministicPolicy(Approximation):
+    '''
+    A "soft" deterministic policy compatible with soft actor-critic (SAC).
+
+    Args:
+        model (torch.nn.Module): A Pytorch module representing the policy network.
+            The input shape should be the same as the shape of the state (or feature) space,
+            and the output shape should be double the size of the action space.
+            The first n outputs will be the unscaled mean of the action for each dimension,
+            and the second n outputs will be the logarithm of the variance.
+        optimizer (torch.optim.Optimizer): An optimizer initialized with the
+            model parameters, e.g. SGD, Adam, RMSprop, etc.
+        action_space (gym.spaces.Box): The Box representing the action space.
+        kwargs (optional): Any other arguments accepted by all.approximation.Approximation
+    '''
     def __init__(
         self,
         model,
@@ -32,18 +46,35 @@ def forward(self, state):
         return self._squash(normal.loc)

     def _normal(self, outputs):
-        means = outputs[:, 0 : self._action_dim]
+        means = outputs[:, 0:self._action_dim]
         logvars = outputs[:, self._action_dim:]
         std = logvars.mul(0.5).exp_()
         return torch.distributions.normal.Normal(means, std)

     def _sample(self, normal):
         raw = normal.rsample()
-        action = self._squash(raw)
+        log_prob = self._log_prob(normal, raw)
+        return self._squash(raw), log_prob
+
+    def _log_prob(self, normal, raw):
+        '''
+        Compute the log probability of a raw action after the action is squashed.
+        Both inputs act on the raw underlying distribution.
+        Because tanh_mean does not affect the density, we can ignore it.
+        However, tanh_scale will affect the relative contribution of each component.
+        See Appendix C in the Soft Actor-Critic paper.
+
+        Args:
+            normal (torch.distributions.normal.Normal): The "raw" normal distribution.
+            raw (torch.Tensor): The "raw" action.
+
+        Returns:
+            torch.Tensor: The log probability of the raw action, accounting for the effects of tanh.
+        '''
         log_prob = normal.log_prob(raw)
-        log_prob -= torch.log(1 - action.pow(2) + 1e-6)
-        log_prob = log_prob.sum(1)
-        return action, log_prob
+        log_prob -= torch.log(1 - torch.tanh(raw).pow(2) + 1e-6)
+        log_prob /= self._tanh_scale
+        return log_prob.sum(1)

     def _squash(self, x):
         return torch.tanh(x) * self._tanh_scale + self._tanh_mean
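
For reference, the new _log_prob applies the change-of-variables correction for the tanh squashing described in Appendix C of the SAC paper, plus the commit's rescaling by tanh_scale. A standalone sketch mirroring that computation (function and variable names here are illustrative):

import torch

def squashed_log_prob(normal, raw, tanh_scale):
    log_prob = normal.log_prob(raw)                            # density of the raw Gaussian sample
    log_prob -= torch.log(1 - torch.tanh(raw).pow(2) + 1e-6)   # tanh change-of-variables term (1e-6 avoids log(0))
    log_prob /= tanh_scale                                     # account for scaling onto the action space, as in this commit
    return log_prob.sum(1)                                     # sum over action dimensions

normal = torch.distributions.normal.Normal(torch.zeros(1, 3), torch.ones(1, 3))
raw = normal.rsample()
print(squashed_log_prob(normal, raw, tanh_scale=torch.ones(3)))
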
Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+import unittest
+import torch
+import numpy as np
+import torch_testing as tt
+from gym.spaces import Box
+from all import nn
+from all.environments import State
+from all.policies import SoftDeterministicPolicy
+
+STATE_DIM = 2
+ACTION_DIM = 3
+
+class TestSoftDeterministic(unittest.TestCase):
+    def setUp(self):
+        torch.manual_seed(2)
+        self.model = nn.Sequential(
+            nn.Linear0(STATE_DIM, ACTION_DIM * 2)
+        )
+        self.optimizer = torch.optim.RMSprop(self.model.parameters(), lr=0.01)
+        self.space = Box(np.array([-1, -1, -1]), np.array([1, 1, 1]))
+        self.policy = SoftDeterministicPolicy(
+            self.model,
+            self.optimizer,
+            self.space
+        )
+
+    def test_output_shape(self):
+        state = State(torch.randn(1, STATE_DIM))
+        action, log_prob = self.policy(state)
+        self.assertEqual(action.shape, (1, ACTION_DIM))
+        self.assertEqual(log_prob.shape, torch.Size([1]))
+
+        state = State(torch.randn(5, STATE_DIM))
+        action, log_prob = self.policy(state)
+        self.assertEqual(action.shape, (5, ACTION_DIM))
+        self.assertEqual(log_prob.shape, torch.Size([5]))
+
+    def test_step_one(self):
+        state = State(torch.randn(1, STATE_DIM))
+        self.policy(state)
+        self.policy.step()
+
+    def test_converge(self):
+        state = State(torch.randn(1, STATE_DIM))
+        target = torch.tensor([0.25, 0.5, -0.5])
+
+        for _ in range(0, 200):
+            action, _ = self.policy(state)
+            loss = ((target - action) ** 2).mean()
+            loss.backward()
+            self.policy.step()
+
+        self.assertLess(loss, 0.2)
+
+    def test_scaling(self):
+        self.space = Box(np.array([-10, -5, 100]), np.array([10, -2, 200]))
+        self.policy = SoftDeterministicPolicy(
+            self.model,
+            self.optimizer,
+            self.space
+        )
+        state = State(torch.randn(1, STATE_DIM))
+        action, log_prob = self.policy(state)
+        tt.assert_allclose(action, torch.tensor([[-3.09055, -4.752777, 188.98222]]))
+        tt.assert_allclose(log_prob, torch.tensor([-0.397002]), rtol=1e-4)
+
+if __name__ == '__main__':
+    unittest.main()

all/policies/softmax.py

Lines changed: 11 additions & 0 deletions
@@ -5,6 +5,17 @@


 class SoftmaxPolicy(Approximation):
+    '''
+    A softmax (or Boltzmann) stochastic policy for discrete actions.
+
+    Args:
+        model (torch.nn.Module): A Pytorch module representing the policy network.
+            The input shape should be the same as the shape of the state (or feature) space,
+            and the output should be a vector the size of the action set.
+        optimizer (torch.optim.Optimizer): An optimizer initialized with the
+            model parameters, e.g. SGD, Adam, RMSprop, etc.
+        kwargs (optional): Any other arguments accepted by all.approximation.Approximation
+    '''
     def __init__(
         self,
         model,
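
As with the other policies, the behavior described in this docstring can be sketched in a few lines: the model emits one score per action, and actions are sampled from the softmax of those scores (an illustration, not SoftmaxPolicy's code):

import torch

def sample_softmax_action(scores):
    # scores: one raw model output per action; softmax turns them into probabilities
    probs = torch.nn.functional.softmax(scores, dim=-1)
    return torch.distributions.Categorical(probs).sample()

scores = torch.tensor([[2.0, 0.5, -1.0]])
print(sample_softmax_action(scores))   # most often tensor([0])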

docs/source/conf.py

Lines changed: 6 additions & 1 deletion
@@ -40,8 +40,13 @@
 ]

 # Autosummary settings
-autodoc_default_flags = ['members']
+autodoc_default_options = {
+    'members': True,
+    'undoc-members': True,
+    'show-inheritance': True
+}
 autosummary_generate = True
+autodoc_inherit_docstrings = True

 # Mock requirements to save resources during doc build machine setup
 autodoc_mock_imports = [

docs/source/modules/nn.rst

Lines changed: 1 addition & 0 deletions
@@ -5,4 +5,5 @@ all.nn
 =================

 .. automodule:: all.nn
+    :ignore-module-all:
     :members:
