diff --git a/a2c_sb3_cartpole.py b/a2c_sb3_cartpole.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee1087d0bd9f61ca3a469f271ca00453a3f42ad0
--- /dev/null
+++ b/a2c_sb3_cartpole.py
@@ -0,0 +1,47 @@
+import gymnasium as gym
+
+from stable_baselines3 import A2C
+import wandb
+from wandb.integration.sb3 import WandbCallback
+from huggingface_sb3 import package_to_hub
+
+
+# from documentation of wandb
+config = {
+    "policy_type": "MlpPolicy",
+    "total_timesteps": 25000,
+    "env_name": "CartPole-v1",
+}
+run = wandb.init(
+    project="sb3",
+    config=config,
+    sync_tensorboard=True,
+    monitor_gym=True,
+    save_code=True,
+)
+
+env = gym.make("CartPole-v1", render_mode="rgb_array")
+
+model = A2C("MlpPolicy", env, verbose=1, tensorboard_log=f"runs/{run.id}")
+#model = A2C("MlpPolicy", env, )
+model.learn(total_timesteps=10_000, callback=WandbCallback(gradient_save_freq=100,model_save_path=f"models/{run.id}",verbose=2,),)
+#model.learn(total_timesteps=10_000)
+vec_env = model.get_env()
+obs = vec_env.reset()
+for i in range(1000):
+    action, _state = model.predict(obs, deterministic=True)
+    obs, reward, done, info = vec_env.step(action)
+    vec_env.render("human")
+    # VecEnv resets automatically
+    # if done:
+    #   obs = vec_env.reset()
+
+run.finish()
+
+package_to_hub(model=model,
+               model_name="CartPole-v1",
+               model_architecture="A2C",
+               env_id="CartPole-v1",
+               eval_env=env,
+               repo_id="lennartoe/Cartpole-v1",
+               commit_message="First commit")
\ No newline at end of file
diff --git a/evaluate_reinforce_cartpole.py.py b/evaluate_reinforce_cartpole.py.py
new file mode 100644
index 0000000000000000000000000000000000000000..01479f7b7a18bf02be0ba4a09f0d5bb078e093fc
--- /dev/null
+++ b/evaluate_reinforce_cartpole.py.py
@@ -0,0 +1,32 @@
+import gymnasium as gym
+import torch
+from reinforce_cartpole import Policy
+# Create the environment
+env = gym.make("CartPole-v1", render_mode="human")
+
+# Reset the environment and get the initial observation
+observation = env.reset()[0]
+
+policy = Policy()
+# load learned policy
+policy.load_state_dict(torch.load('policy.pth', weights_only=True))
+policy.eval()
+
+for _ in range(200):
+    # sample action from policy
+    print(observation)
+    print(torch.from_numpy(observation).float())
+    action_probs = policy(torch.from_numpy(observation).float())
+    action = torch.distributions.Categorical(action_probs).sample()
+    # Apply the action to the environment
+    # Returns next observation, reward, done signal (indicating
+    # if the episode has ended), and an additional info dictionary
+    observation, reward, terminated, truncated, info = env.step(action.numpy())
+    # Render the environment to visualize the agent's behavior
+    env.render()
+    print(terminated or truncated)
+    if terminated or truncated:
+        # Terminated before max step
+        break
+
+env.close()
diff --git a/policy.pth b/policy.pth
new file mode 100644
index 0000000000000000000000000000000000000000..970f5c065a6275f6d9a5038936d3e7bff9adc6ac
Binary files /dev/null and b/policy.pth differ
diff --git a/reinforce_cartpole.py b/reinforce_cartpole.py
new file mode 100644
index 0000000000000000000000000000000000000000..50be37c67a30fb2a5c723903fc7cf67482403427
--- /dev/null
+++ b/reinforce_cartpole.py
@@ -0,0 +1,121 @@
+import gymnasium as gym
+import torch
+import numpy as np
+
+class Policy(torch.nn.Module):
+    def __init__(self, input_size=4, output_size=2):
+        super(Policy, self).__init__()
+        self.fc1 = torch.nn.Linear(input_size, 128)
+        self.relu = torch.nn.ReLU()
+        self.dropout = torch.nn.Dropout(0.2)
+        self.fc2 = torch.nn.Linear(128, output_size)
+        self.softmax = torch.nn.Softmax(dim=0)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.relu(x)
+        x = self.dropout(x)
+        x = self.fc2(x)
+        #print(x)
+        x = self.softmax(x)
+        #print(x)
+        return x
+
+
+def main():
+    policy = Policy()
+    optimizer = torch.optim.Adam(policy.parameters(), lr=5e-3)
+
+    # Create the environment
+    env = gym.make("CartPole-v1")
+
+    # Reset the environment and get the initial observation
+
+    gamma = 0.99
+    total_reward = []
+    total_loss = []
+    epochs = 500
+
+    max_steps = env.spec.max_episode_steps
+
+    for _ in range(epochs):
+        print(_)
+        # Reset the environment
+        observation = env.reset()[0]
+        # Reset buffer
+        # rewards = torch.zeros(max_steps)
+        # log_probs = torch.zeros(max_steps)
+        rewards = []
+        log_probs = []
+        for step in range(max_steps):
+            # Select a random action from the action space
+            #print(observation)
+            action_probs = policy(torch.from_numpy(observation).float())
+
+            # Sample an action from the action probabilities
+            action = torch.distributions.Categorical(action_probs).sample()
+            #print("Action")
+            #print(action)
+            # Apply the action to the environment
+            observation, reward, terminated, truncated, info = env.step(action.numpy())
+            #print(observation)
+            # env.render()
+            # does this come before adding to the rewards or after
+
+            # rewards[step] = reward
+            # log_probs[step] = torch.log(action_probs[action])
+            rewards.append(torch.tensor(reward))
+            log_probs.append(torch.log(action_probs[action]))
+
+            if terminated or truncated:
+                break
+
+        # apply gamma
+        # transform rewards and log_probs into tensors
+        rewards = torch.stack(rewards)
+        log_probs = torch.stack(log_probs)
+        rewards_length = len(rewards)
+        rewards_tensor = torch.zeros(rewards_length, rewards_length)
+        for i in range(rewards_length):
+            for j in range(rewards_length-i):
+                rewards_tensor[i,j] = rewards[i+j]
+        #print(rewards_tensor)
+        for i in range(rewards_length):
+            for j in range(rewards_length):
+                rewards_tensor[i,j] = rewards_tensor[i,j] * np.pow(gamma,j)
+        #print(rewards_tensor)
+        normalized_rewards = torch.sum(rewards_tensor, dim=1)
+        #print(normalized_rewards)
+        normalized_rewards = normalized_rewards - torch.mean(normalized_rewards)
+        normalized_rewards /= torch.std(normalized_rewards)
+
+
+        loss = -torch.sum(log_probs * normalized_rewards)
+        total_reward.append(sum(rewards))
+        # optimize
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+        total_loss.append(loss.detach().numpy())
+        # Render the environment to visualize the agent's behavior
+        #env.render()
+
+    # save the model weights
+    torch.save(policy.state_dict(), "policy.pth")
+
+
+    print(total_reward)
+    print(total_loss)
+    env.close()
+
+    # plot the rewards and the loss side by side
+    import matplotlib.pyplot as plt
+    fig, ax = plt.subplots(1,2)
+    ax[0].plot(total_reward)
+    ax[1].plot(total_loss)
+    plt.show()
+
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
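
Note on the return computation in reinforce_cartpole.py: the two nested loops over rewards_tensor build the discounted return G_t = sum_k gamma^k * r_(t+k) for every step t in O(T^2) time and memory. A minimal equivalent sketch, not part of the commit, that accumulates the same returns in a single reverse pass over the same list of scalar reward tensors:

    import torch

    def discounted_returns(rewards, gamma=0.99):
        # Sketch only (not in the repo): G_t = r_t + gamma * G_{t+1}, accumulated
        # back to front; produces the same values as the rewards_tensor double loop.
        returns = torch.zeros(len(rewards))
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running
            returns[t] = running
        # same normalization as in main(): zero mean, unit (unbiased) std
        return (returns - returns.mean()) / returns.std()

    # example: five steps of reward 1.0, as CartPole gives per step
    print(discounted_returns([torch.tensor(1.0)] * 5))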