Commit 96479c6

Replaced space.shape[0] with len(space), supporting Tuple spaces.
* Tuple spaces do not have a shape attribute. To better support them, references to space.shape[0] have been replaced with len(space), which is more generic.
1 parent c5aa83b commit 96479c6
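
A minimal sketch (not part of this commit) of the behaviour being relied on, assuming the standard gym spaces; exact details may vary across gym versions:

    # Illustrative only: why len(space) generalizes where space.shape[0] does not.
    from gym.spaces import Box, Discrete, Tuple

    box = Box(low=-1.0, high=1.0, shape=(3,))
    tup = Tuple((Discrete(2), Discrete(3)))

    print(box.shape[0])  # 3 -- Box exposes a shape, so shape[0] works here
    print(len(tup))      # 2 -- Tuple supports len() but has no usable shape
    # tup.shape[0] would raise, since Tuple spaces do not define a shape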

7 files changed, 20 insertions(+), 20 deletions(-)

all/agents/sac.py

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ class SAC(Agent):
         v (VNetwork): An Approximation of the state-value function.
         replay_buffer (ReplayBuffer): The experience replay buffer.
         discount_factor (float): Discount factor for future rewards.
-        entropy_target (float): The desired entropy of the policy. Usually -env.action_space.shape[0]
+        entropy_target (float): The desired entropy of the policy. Usually -len(env.action_space)
         minibatch_size (int): The number of experiences to sample in each training update.
         replay_start_size (int): Number of experiences in replay buffer when training begins.
         temperature_initial (float): The initial temperature used in the maximum entropy objective.

all/policies/deterministic.py

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ def __init__(
 class DeterministicPolicyNetwork(RLNetwork):
     def __init__(self, model, space):
         super().__init__(model)
-        self._action_dim = space.shape[0]
+        self._action_dim = len(space)
         self._tanh_scale = torch.tensor((space.high - space.low) / 2).to(self.device)
         self._tanh_mean = torch.tensor((space.high + space.low) / 2).to(self.device)

all/policies/soft_deterministic.py

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ def __init__(
 class SoftDeterministicPolicyNetwork(RLNetwork):
     def __init__(self, model, space):
         super().__init__(model)
-        self._action_dim = space.shape[0]
+        self._action_dim = len(space)
         self._tanh_scale = torch.tensor((space.high - space.low) / 2).to(self.device)
         self._tanh_mean = torch.tensor((space.high + space.low) / 2).to(self.device)

all/presets/classic_control/models/__init__.py

Lines changed: 6 additions & 6 deletions
@@ -4,7 +4,7 @@
 def fc_relu_q(env, hidden=64):
     return nn.Sequential(
         nn.Flatten(),
-        nn.Linear(env.state_space.shape[0], hidden),
+        nn.Linear(len(env.state_space), hidden),
         nn.ReLU(),
         nn.Linear(hidden, env.action_space.n),
     )
@@ -15,10 +15,10 @@ def dueling_fc_relu_q(env):
         nn.Flatten(),
         nn.Dueling(
             nn.Sequential(
-                nn.Linear(env.state_space.shape[0], 256), nn.ReLU(), nn.Linear(256, 1)
+                nn.Linear(len(env.state_space), 256), nn.ReLU(), nn.Linear(256, 1)
             ),
             nn.Sequential(
-                nn.Linear(env.state_space.shape[0], 256),
+                nn.Linear(len(env.state_space), 256),
                 nn.ReLU(),
                 nn.Linear(256, env.action_space.n),
             ),
@@ -28,7 +28,7 @@ def dueling_fc_relu_q(env):
 
 def fc_relu_features(env, hidden=64):
     return nn.Sequential(
-        nn.Flatten(), nn.Linear(env.state_space.shape[0], hidden), nn.ReLU()
+        nn.Flatten(), nn.Linear(len(env.state_space), hidden), nn.ReLU()
     )
 
 
@@ -43,7 +43,7 @@ def fc_policy_head(env, hidden=64):
 def fc_relu_dist_q(env, hidden=64, atoms=51):
     return nn.Sequential(
         nn.Flatten(),
-        nn.Linear(env.state_space.shape[0], hidden),
+        nn.Linear(len(env.state_space), hidden),
         nn.ReLU(),
         nn.Linear0(hidden, env.action_space.n * atoms),
     )
@@ -52,7 +52,7 @@ def fc_relu_dist_q(env, hidden=64, atoms=51):
 def fc_relu_rainbow(env, hidden=64, atoms=51, sigma=0.5):
     return nn.Sequential(
         nn.Flatten(),
-        nn.Linear(env.state_space.shape[0], hidden),
+        nn.Linear(len(env.state_space), hidden),
         nn.ReLU(),
         nn.CategoricalDueling(
             nn.NoisyFactorizedLinear(hidden, atoms, sigma_init=sigma),

all/presets/continuous/models/__init__.py

Lines changed: 8 additions & 8 deletions
@@ -10,7 +10,7 @@
 
 def fc_q(env, hidden1=400, hidden2=300):
     return nn.Sequential(
-        nn.Linear(env.state_space.shape[0] + env.action_space.shape[0] + 1, hidden1),
+        nn.Linear(len(env.state_space) + len(env.action_space) + 1, hidden1),
         nn.ReLU(),
         nn.Linear(hidden1, hidden2),
         nn.ReLU(),
@@ -19,7 +19,7 @@ def fc_q(env, hidden1=400, hidden2=300):
 
 def fc_v(env, hidden1=400, hidden2=300):
     return nn.Sequential(
-        nn.Linear(env.state_space.shape[0] + 1, hidden1),
+        nn.Linear(len(env.state_space) + 1, hidden1),
         nn.ReLU(),
         nn.Linear(hidden1, hidden2),
         nn.ReLU(),
@@ -28,25 +28,25 @@ def fc_v(env, hidden1=400, hidden2=300):
 
 def fc_deterministic_policy(env, hidden1=400, hidden2=300):
     return nn.Sequential(
-        nn.Linear(env.state_space.shape[0] + 1, hidden1),
+        nn.Linear(len(env.state_space) + 1, hidden1),
         nn.ReLU(),
         nn.Linear(hidden1, hidden2),
         nn.ReLU(),
-        nn.Linear0(hidden2, env.action_space.shape[0]),
+        nn.Linear0(hidden2, len(env.action_space)),
     )
 
 def fc_soft_policy(env, hidden1=400, hidden2=300):
     return nn.Sequential(
-        nn.Linear(env.state_space.shape[0] + 1, hidden1),
+        nn.Linear(len(env.state_space) + 1, hidden1),
         nn.ReLU(),
         nn.Linear(hidden1, hidden2),
         nn.ReLU(),
-        nn.Linear0(hidden2, env.action_space.shape[0] * 2),
+        nn.Linear0(hidden2, len(env.action_space) * 2),
     )
 
 def fc_actor_critic(env, hidden1=400, hidden2=300):
     features = nn.Sequential(
-        nn.Linear(env.state_space.shape[0] + 1, hidden1),
+        nn.Linear(len(env.state_space) + 1, hidden1),
         nn.ReLU(),
     )
 
@@ -59,7 +59,7 @@ def fc_actor_critic(env, hidden1=400, hidden2=300):
     policy = nn.Sequential(
         nn.Linear(hidden1, hidden2),
         nn.ReLU(),
-        nn.Linear(hidden2, env.action_space.shape[0] * 2)
+        nn.Linear(hidden2, len(env.action_space) * 2)
     )
 
     return features, v, policy

all/presets/continuous/sac.py

Lines changed: 2 additions & 2 deletions
@@ -52,7 +52,7 @@ def sac(
         replay_buffer_size (int): Maximum number of experiences to store in the replay buffer.
         temperature_initial (float): Initial value of the temperature parameter.
         lr_temperature (float): Learning rate for the temperature. Should be low compared to other learning rates.
-        entropy_target_scaling (float): The target entropy will be -(entropy_target_scaling * env.action_space.shape[0])
+        entropy_target_scaling (float): The target entropy will be -(entropy_target_scaling * len(env.action_space))
         q1_model_constructor(function): The function used to construct the neural q1 model.
         q2_model_constructor(function): The function used to construct the neural q2 model.
         v_model_constructor(function): The function used to construct the neural v model.
@@ -126,7 +126,7 @@ def _sac(env, writer=DummyWriter()):
             v,
             replay_buffer,
             temperature_initial=temperature_initial,
-            entropy_target=(-env.action_space.shape[0] * entropy_target_scaling),
+            entropy_target=(-len(env.action_space) * entropy_target_scaling),
             lr_temperature=lr_temperature,
             replay_start_size=replay_start_size,
            discount_factor=discount_factor,
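
For concreteness, a hypothetical worked example of the entropy-target expression changed above (the action dimension and scaling value are illustrative, not taken from the repo):

    # Illustrative values only; mirrors entropy_target=(-len(env.action_space) * entropy_target_scaling)
    action_dim = 6                # stand-in for len(env.action_space)
    entropy_target_scaling = 1.0  # assumed scaling factor
    entropy_target = -action_dim * entropy_target_scaling
    print(entropy_target)         # -6.0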

docs/source/guide/basic_concepts.rst

Lines changed: 1 addition & 1 deletion
@@ -218,7 +218,7 @@ In order to actually apply this agent to a problem, for example, a classic contr
     def _vqn(env, writer=DummyWriter()):
         # create a pytorch model
         model = nn.Sequential(
-            nn.Linear(env.state_space.shape[0], 64),
+            nn.Linear(len(env.state_space), 64),
             nn.ReLU(),
             nn.Linear(64, env.action_space.n),
         ).to(device)
