
Commit 6d1111a

Merge pull request #152 from cpnota/release/0.5.2
release/0.5.2
2 parents 68d355a + ff83a20 commit 6d1111a

19 files changed, +186 −35 lines

all/approximation/q_dist.py

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ def __init__(
         v_min,
         v_max,
         name="q_dist",
-        **kwargs,
+        **kwargs
     ):
         device = next(model.parameters()).device
         self.n_actions = n_actions

all/approximation/v_network_test.py

Lines changed: 2 additions & 2 deletions
@@ -39,10 +39,10 @@ def test_multi_reinforce(self):
             mask=torch.tensor([1, 1, 0, 1, 0, 0])
         )
         result1 = self.v(states[0:2])
-        result2 = self.v(states[2:4])
-        result3 = self.v(states[4:6])
         self.v.reinforce(loss(result1, torch.tensor([1, 2])).float())
+        result2 = self.v(states[2:4])
         self.v.reinforce(loss(result2, torch.tensor([1, 1])).float())
+        result3 = self.v(states[4:6])
         self.v.reinforce(loss(result3, torch.tensor([1, 2])).float())
         with self.assertRaises(Exception):
             self.v.reinforce(loss(result3, torch.tensor([1, 2])).float())

all/environments/abstract.py

Lines changed: 3 additions & 6 deletions
@@ -41,14 +41,11 @@ def step(self, action):

         Returns
         -------
-        State
-            The state of the environment after the action is applied
+        all.environments.State
+            The State of the environment after the action is applied.
+            This State object includes both the done flag and any additional "info"
         float
             The reward achieved by the previous action
-        done
-            True if the environment has entered a terminal state and should be reset
-        info
-            Diagnostic information useful for debugging
         """

     @abstractmethod

all/experiments/parallel_env_experiment.py

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ def __init__(

         # test state
         self._test_episodes = 100
-        self._test_episodes_started = 0
+        self._test_episodes_started = self._n_envs
         self._test_returns = []
         self._should_save_returns = [True] * self._n_envs

all/experiments/single_env_experiment.py

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@ def _run_training_episode(self):
     def _run_test_episode(self):
         # initialize the episode
         self._env.reset()
-        action = self._agent.act(self._env.state, self._env.reward)
+        action = self._agent.eval(self._env.state, self._env.reward)
         returns = 0

         # loop until the episode is finished

all/experiments/single_env_experiment_test.py

Lines changed: 2 additions & 2 deletions
@@ -76,8 +76,8 @@ def test_writes_test_returns(self):
         experiment = MockExperiment(dqn(), self.env, quiet=True)
         experiment.train(episodes=5)
         returns = experiment.test(episodes=4)
-        expected_mean = 10.25
-        expected_std = 1.0897247358851685
+        expected_mean = 9.5
+        expected_std = 0.5
         np.testing.assert_equal(np.mean(returns), expected_mean)
         np.testing.assert_equal(
             experiment._writer.data["evaluation/returns-test/mean"]["values"],

all/experiments/writer.py

Lines changed: 1 addition & 4 deletions
@@ -14,8 +14,6 @@ class ExperimentWriter(SummaryWriter, Writer):
     tagging the run with a combination of the agent name, the commit hash of the
     current git repo of the working directory (if any), and the current time.
     Also writes summary statistics into CSV files.
-
-
     Args:
         experiment (all.experiments.Experiment): The Experiment associated with the Writer object.
         agent_name (str): The name of the Agent the Experiment is being performed on
@@ -24,7 +22,7 @@ class ExperimentWriter(SummaryWriter, Writer):
     '''
     def __init__(self, experiment, agent_name, env_name, loss=True):
         self.env_name = env_name
-        current_time = str(datetime.now())
+        current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S %f')
        os.makedirs(
            os.path.join(
                "runs", ("%s %s %s" % (agent_name, COMMIT_HASH, current_time)), env_name
@@ -51,7 +49,6 @@ def add_schedule(self, name, value, step="frame"):
     def add_scalar(self, name, value, step="frame"):
         '''
         Log an arbitrary scalar.
-
         Args:
             name (str): The tag to associate with the scalar
             value (number): The value of the scalar at the current step

all/nn/__init__.py

Lines changed: 1 addition & 2 deletions
@@ -10,7 +10,6 @@ class RLNetwork(nn.Module):
     """
     Wraps a network such that States can be given as input.
     """
-
     def __init__(self, model, _=None):
         super().__init__()
         self.model = model
@@ -20,7 +19,7 @@ def forward(self, state):
         return self.model(state.features.float()) * state.mask.float().unsqueeze(-1)

 class Aggregation(nn.Module):
-    """len()
+    """
     Aggregation layer for the Dueling architecture.

     https://arxiv.org/abs/1511.06581
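
For context on the docstring being fixed here: the Aggregation layer combines a state-value stream and an advantage stream as in the dueling architecture (arXiv:1511.06581). A minimal sketch of that aggregation rule, written for illustration rather than copied from the library:

import torch

def dueling_aggregation(value, advantages):
    # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a), the identifiability trick from the dueling paper
    return value + advantages - torch.mean(advantages, dim=1, keepdim=True)

value = torch.zeros(2, 1)         # one state-value estimate per batch element
advantages = torch.randn(2, 4)    # one advantage estimate per action
print(dueling_aggregation(value, advantages).shape)  # torch.Size([2, 4])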

all/optim/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -1 +1,3 @@
 from .scheduler import LinearScheduler, Schedulable
+
+__all__ = ['Schedulable', 'LinearScheduler']

all/policies/deterministic.py

Lines changed: 13 additions & 0 deletions
@@ -4,6 +4,18 @@


 class DeterministicPolicy(Approximation):
+    '''
+    A DDPG-style deterministic policy.
+
+    Args:
+        model (torch.nn.Module): A Pytorch module representing the policy network.
+            The input shape should be the same as the shape of the state space,
+            and the output shape should be the same as the shape of the action space.
+        optimizer (torch.optim.Optimizer): An optimizer initialized with the
+            model parameters, e.g. SGD, Adam, RMSprop, etc.
+        action_space (gym.spaces.Box): The Box representing the action space.
+        kwargs (optional): Any other arguments accepted by all.approximation.Approximation
+    '''
     def __init__(
         self,
         model,
@@ -20,6 +32,7 @@ def __init__(
             **kwargs
         )

+
 class DeterministicPolicyNetwork(RLNetwork):
     def __init__(self, model, space):
         super().__init__(model)

all/policies/deterministic_test.py

Lines changed: 5 additions & 9 deletions
@@ -57,28 +57,24 @@ def test_target(self):
             self.space,
             target=FixedTarget(3)
         )
-
-        # choose initial action
         state = State(torch.ones(1, STATE_DIM))
-        action = self.policy(state)
-        tt.assert_equal(action, torch.zeros(1, ACTION_DIM))

         # run update step, make sure target network doesn't change
-        action.sum().backward(retain_graph=True)
+        self.policy(state).sum().backward()
         self.policy.step()
         tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

         # again...
-        action.sum().backward(retain_graph=True)
+        self.policy(state).sum().backward()
         self.policy.step()
         tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

         # third time, target should be updated
-        action.sum().backward(retain_graph=True)
+        self.policy(state).sum().backward()
         self.policy.step()
         tt.assert_allclose(
-            self.policy.eval(state),
-            torch.tensor([[-0.595883, -0.595883, -0.595883]]),
+            self.policy.target(state),
+            torch.tensor([[-0.574482, -0.574482, -0.574482]]),
             atol=1e-4,
         )
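
The comments in this test describe the behavior of FixedTarget(3): the target network stays frozen for the first two update steps and is synchronized with the online network on the third. A generic sketch of that fixed-interval pattern, written as an illustration under that assumption rather than as the library's FixedTarget code:

import copy
import torch

class FixedIntervalTarget:
    def __init__(self, model, interval):
        self._target = copy.deepcopy(model)   # frozen copy of the online network
        self._interval = interval
        self._updates = 0

    def __call__(self, x):
        with torch.no_grad():
            return self._target(x)

    def update(self, model):
        # copy the online weights only every `interval` update steps
        self._updates += 1
        if self._updates % self._interval == 0:
            self._target.load_state_dict(model.state_dict())

model = torch.nn.Linear(2, 3)
target = FixedIntervalTarget(model, interval=3)
x = torch.ones(1, 2)
for _ in range(3):
    with torch.no_grad():
        model.weight += 0.1               # stand-in for an optimizer step
    target.update(model)
print(torch.allclose(target(x), model(x)))   # True: synced on the third update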

all/policies/gaussian.py

Lines changed: 18 additions & 0 deletions
@@ -6,6 +6,24 @@


 class GaussianPolicy(Approximation):
+    '''
+    A Gaussian stochastic policy.
+
+    This policy will choose actions from a distribution represented by a spherical Gaussian.
+    The first n outputs of the model will be squashed to [-1, 1] through a tanh function, and then
+    scaled to the given action_space, and the remaining n outputs will define the amount of noise added.
+
+    Args:
+        model (torch.nn.Module): A Pytorch module representing the policy network.
+            The input shape should be the same as the shape of the state (or feature) space,
+            and the output shape should be double the size of the action space.
+            The first n outputs will be the unscaled mean of the action for each dimension,
+            and the second n outputs will be the logarithm of the variance.
+        optimizer (torch.optim.Optimizer): An optimizer initialized with the
+            model parameters, e.g. SGD, Adam, RMSprop, etc.
+        action_space (gym.spaces.Box): The Box representing the action space.
+        kwargs (optional): Any other arguments accepted by all.approximation.Approximation
+    '''
     def __init__(
         self,
         model,
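
To make the description above concrete: with 2n model outputs, the first n become a mean squashed by tanh and rescaled to the action bounds, and the last n are read as a log variance. A rough, self-contained sketch of that construction (illustrative only; the helper below is not the library's GaussianPolicy code):

import torch

def build_distribution(outputs, low, high):
    n = low.shape[0]
    means = torch.tanh(outputs[:, :n])               # squash the first n outputs to [-1, 1]
    means = (means + 1) / 2 * (high - low) + low     # rescale to the Box bounds
    std = outputs[:, n:].mul(0.5).exp()              # log variance -> standard deviation
    return torch.distributions.normal.Normal(means, std)

outputs = torch.randn(1, 6)                          # 2n outputs for a 3-dimensional action space
low = torch.tensor([-1.0, -1.0, -1.0])
high = torch.tensor([1.0, 1.0, 1.0])
dist = build_distribution(outputs, low, high)
print(dist.sample().shape)                           # torch.Size([1, 3])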

all/policies/greedy.py

Lines changed: 13 additions & 0 deletions
@@ -3,6 +3,19 @@
 from all.optim import Schedulable

 class GreedyPolicy(Schedulable):
+    '''
+    An "epsilon-greedy" action selection policy for discrete action spaces.
+
+    This policy will usually choose the optimal action according to an approximation
+    of the action value function (the "q-function"), but with probability epsilon will
+    choose a random action instead. GreedyPolicy is a Schedulable, meaning that
+    epsilon can be varied over time by passing a Scheduler object.
+
+    Args:
+        q (all.approximation.QNetwork): The action-value or "q-function"
+        num_actions (int): The number of available actions.
+        epsilon (float, optional): The probability of selecting a random action.
+    '''
     def __init__(
         self,
         q,
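
The selection rule the new docstring describes is standard epsilon-greedy; a minimal sketch of that rule for reference (not GreedyPolicy's actual implementation):

import numpy as np
import torch

def epsilon_greedy(q_values, epsilon):
    # with probability epsilon, explore with a uniformly random action;
    # otherwise exploit the action with the highest estimated value
    if np.random.rand() < epsilon:
        return np.random.randint(q_values.shape[-1])
    return int(torch.argmax(q_values))

q_values = torch.tensor([0.1, 0.9, 0.3])
print(epsilon_greedy(q_values, epsilon=0.1))  # usually 1, occasionally a random index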

all/policies/soft_deterministic.py

Lines changed: 36 additions & 5 deletions
@@ -3,6 +3,20 @@
 from all.nn import RLNetwork

 class SoftDeterministicPolicy(Approximation):
+    '''
+    A "soft" deterministic policy compatible with soft actor-critic (SAC).
+
+    Args:
+        model (torch.nn.Module): A Pytorch module representing the policy network.
+            The input shape should be the same as the shape of the state (or feature) space,
+            and the output shape should be double the size of the action space.
+            The first n outputs will be the unscaled mean of the action for each dimension,
+            and the second n outputs will be the logarithm of the variance.
+        optimizer (torch.optim.Optimizer): An optimizer initialized with the
+            model parameters, e.g. SGD, Adam, RMSprop, etc.
+        action_space (gym.spaces.Box): The Box representing the action space.
+        kwargs (optional): Any other arguments accepted by all.approximation.Approximation
+    '''
     def __init__(
         self,
         model,
@@ -32,18 +46,35 @@ def forward(self, state):
         return self._squash(normal.loc)

     def _normal(self, outputs):
-        means = outputs[:, 0 : self._action_dim]
+        means = outputs[:, 0:self._action_dim]
         logvars = outputs[:, self._action_dim:]
         std = logvars.mul(0.5).exp_()
         return torch.distributions.normal.Normal(means, std)

     def _sample(self, normal):
         raw = normal.rsample()
-        action = self._squash(raw)
+        log_prob = self._log_prob(normal, raw)
+        return self._squash(raw), log_prob
+
+    def _log_prob(self, normal, raw):
+        '''
+        Compute the log probability of a raw action after the action is squashed.
+        Both inputs act on the raw underlying distribution.
+        Because tanh_mean does not affect the density, we can ignore it.
+        However, tanh_scale will affect the relative contribution of each component.
+        See Appendix C in the Soft Actor-Critic paper.
+
+        Args:
+            normal (torch.distributions.normal.Normal): The "raw" normal distribution.
+            raw (torch.Tensor): The "raw" action.
+
+        Returns:
+            torch.Tensor: The log probability of the raw action, accounting for the effects of tanh.
+        '''
         log_prob = normal.log_prob(raw)
-        log_prob -= torch.log(1 - action.pow(2) + 1e-6)
-        log_prob = log_prob.sum(1)
-        return action, log_prob
+        log_prob -= torch.log(1 - torch.tanh(raw).pow(2) + 1e-6)
+        log_prob /= self._tanh_scale
+        return log_prob.sum(1)

     def _squash(self, x):
         return torch.tanh(x) * self._tanh_scale + self._tanh_mean
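
For reference, the new _log_prob applies the change-of-variables correction for the tanh squashing described in Appendix C of the SAC paper, plus the commit's rescaling by tanh_scale. A standalone sketch mirroring that computation (function and variable names here are illustrative):

import torch

def squashed_log_prob(normal, raw, tanh_scale):
    log_prob = normal.log_prob(raw)                            # density of the raw Gaussian sample
    log_prob -= torch.log(1 - torch.tanh(raw).pow(2) + 1e-6)   # tanh change-of-variables term (1e-6 avoids log(0))
    log_prob /= tanh_scale                                     # account for scaling onto the action space, as in this commit
    return log_prob.sum(1)                                     # sum over action dimensions

normal = torch.distributions.normal.Normal(torch.zeros(1, 3), torch.ones(1, 3))
raw = normal.rsample()
print(squashed_log_prob(normal, raw, tanh_scale=torch.ones(3)))
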
Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+import unittest
+import torch
+import numpy as np
+import torch_testing as tt
+from gym.spaces import Box
+from all import nn
+from all.environments import State
+from all.policies import SoftDeterministicPolicy
+
+STATE_DIM = 2
+ACTION_DIM = 3
+
+class TestSoftDeterministic(unittest.TestCase):
+    def setUp(self):
+        torch.manual_seed(2)
+        self.model = nn.Sequential(
+            nn.Linear0(STATE_DIM, ACTION_DIM * 2)
+        )
+        self.optimizer = torch.optim.RMSprop(self.model.parameters(), lr=0.01)
+        self.space = Box(np.array([-1, -1, -1]), np.array([1, 1, 1]))
+        self.policy = SoftDeterministicPolicy(
+            self.model,
+            self.optimizer,
+            self.space
+        )
+
+    def test_output_shape(self):
+        state = State(torch.randn(1, STATE_DIM))
+        action, log_prob = self.policy(state)
+        self.assertEqual(action.shape, (1, ACTION_DIM))
+        self.assertEqual(log_prob.shape, torch.Size([1]))
+
+        state = State(torch.randn(5, STATE_DIM))
+        action, log_prob = self.policy(state)
+        self.assertEqual(action.shape, (5, ACTION_DIM))
+        self.assertEqual(log_prob.shape, torch.Size([5]))
+
+    def test_step_one(self):
+        state = State(torch.randn(1, STATE_DIM))
+        self.policy(state)
+        self.policy.step()
+
+    def test_converge(self):
+        state = State(torch.randn(1, STATE_DIM))
+        target = torch.tensor([0.25, 0.5, -0.5])
+
+        for _ in range(0, 200):
+            action, _ = self.policy(state)
+            loss = ((target - action) ** 2).mean()
+            loss.backward()
+            self.policy.step()
+
+        self.assertLess(loss, 0.2)
+
+    def test_scaling(self):
+        self.space = Box(np.array([-10, -5, 100]), np.array([10, -2, 200]))
+        self.policy = SoftDeterministicPolicy(
+            self.model,
+            self.optimizer,
+            self.space
+        )
+        state = State(torch.randn(1, STATE_DIM))
+        action, log_prob = self.policy(state)
+        tt.assert_allclose(action, torch.tensor([[-3.09055, -4.752777, 188.98222]]))
+        tt.assert_allclose(log_prob, torch.tensor([-0.397002]), rtol=1e-4)
+
+if __name__ == '__main__':
+    unittest.main()

all/policies/softmax.py

Lines changed: 11 additions & 0 deletions
@@ -5,6 +5,17 @@


 class SoftmaxPolicy(Approximation):
+    '''
+    A softmax (or Boltzmann) stochastic policy for discrete actions.
+
+    Args:
+        model (torch.nn.Module): A Pytorch module representing the policy network.
+            The input shape should be the same as the shape of the state (or feature) space,
+            and the output should be a vector the size of the action set.
+        optimizer (torch.optim.Optimizer): An optimizer initialized with the
+            model parameters, e.g. SGD, Adam, RMSprop, etc.
+        kwargs (optional): Any other arguments accepted by all.approximation.Approximation
+    '''
     def __init__(
         self,
         model,
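
As with the other policies, the behavior described in this docstring can be sketched in a few lines: the model emits one score per action, and actions are sampled from the softmax of those scores (an illustration, not SoftmaxPolicy's code):

import torch

def sample_softmax_action(scores):
    # scores: one raw model output per action; softmax turns them into probabilities
    probs = torch.nn.functional.softmax(scores, dim=-1)
    return torch.distributions.Categorical(probs).sample()

scores = torch.tensor([[2.0, 0.5, -1.0]])
print(sample_softmax_action(scores))   # most often tensor([0])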

docs/source/conf.py

Lines changed: 6 additions & 1 deletion
@@ -40,8 +40,13 @@
 ]

 # Autosummary settings
-autodoc_default_flags = ['members']
+autodoc_default_options = {
+    'members': True,
+    'undoc-members': True,
+    'show-inheritance': True
+}
 autosummary_generate = True
+autodoc_inherit_docstrings = True

 # Mock requirements to save resources during doc build machine setup
 autodoc_mock_imports = [

docs/source/modules/nn.rst

Lines changed: 1 addition & 0 deletions
@@ -5,4 +5,5 @@ all.nn
 =================

 .. automodule:: all.nn
+    :ignore-module-all:
     :members:
