import numpy as np # Import von NumPy

# Definition der Gridworld-Umgebung
class Gridworld:
    """Square grid world with one start cell and one goal cell.

    States are ``(row, column)`` tuples. ``grid`` counts how often each cell
    has been entered through :meth:`step`; :meth:`reset` does not clear it,
    so the counts accumulate over episodes.
    """

    def __init__(self, size, start, goal):
        """Create a ``size`` x ``size`` world.

        Args:
            size: Edge length of the square grid.
            start: ``(row, column)`` tuple of the initial cell.
            goal: ``(row, column)`` tuple of the terminal cell.
        """
        self.size, self.start, self.goal = size, start, goal
        self.state = start
        self.actions = ['up', 'down', 'left', 'right']
        # Visit counter: the start cell counts as visited once, the goal as never.
        visits = np.zeros((size, size))
        visits[start] = 1
        visits[goal] = 0
        self.grid = visits

    def reset(self):
        """Put the agent back on the start cell and return that state."""
        self.state = self.start
        return self.state

    def step(self, action):
        """Move one cell in the direction named by ``action``.

        Moves that would leave the grid keep the agent where it is, and an
        unknown action name does not move the agent at all. In every case the
        resulting cell's visit counter is incremented.

        Args:
            action: One of ``'up'``, ``'down'``, ``'left'``, ``'right'``.

        Returns:
            Tuple ``(state, reward, done)``; the reward is ``1`` on the goal
            cell and ``-0.1`` everywhere else, ``done`` is true on the goal.
        """
        row, col = self.state
        last = self.size - 1
        if action in ('up', 'down'):
            row = max(0, row - 1) if action == 'up' else min(last, row + 1)
        elif action in ('left', 'right'):
            col = max(0, col - 1) if action == 'left' else min(last, col + 1)
        position = (row, col)
        self.state = position
        self.grid[position] += 1
        at_goal = position == self.goal
        return position, (1 if at_goal else -0.1), at_goal

# 4x4 world: the agent starts on cell (0, 0) and has to reach cell (3, 3)
env = Gridworld(4, (0, 0), (3, 3))
state = env.reset()
print("Startzustand:", state)
# Notebook display: visit counter of the freshly created environment
env.grid

Startzustand: (0, 0)

array([[1., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

# Example: replay a fixed right/down path and print every transition
steps = ['right', 'down'] * 3
for step in steps:
    outcome = env.step(step)
    next_state, reward, done = outcome
    print("Nächster Zustand:", next_state, "Belohnung:", reward, "Ziel erreicht:", done)

Nächster Zustand: (0, 1) Belohnung: -0.1 Ziel erreicht: False
Nächster Zustand: (1, 1) Belohnung: -0.1 Ziel erreicht: False
Nächster Zustand: (1, 2) Belohnung: -0.1 Ziel erreicht: False
Nächster Zustand: (2, 2) Belohnung: -0.1 Ziel erreicht: False
Nächster Zustand: (2, 3) Belohnung: -0.1 Ziel erreicht: False
Nächster Zustand: (3, 3) Belohnung: 1 Ziel erreicht: True

import plotly.express as px
# Render the per-cell visit counts of the environment as an image
px.imshow(env.grid)

# Baseline: an agent that picks every action uniformly at random
env = Gridworld(size=4, start=(0, 0), goal=(3, 3))
trials = 500

rewards, retries = [], []
for trial in range(trials):
    env.reset()
    r_sum = 0  # summed reward of this trial
    n = 0      # number of steps taken in this trial
    done = False
    while not done:
        # Choose a random action
        action = np.random.choice(env.actions)
        # Apply it; the environment returns the new state and the reward
        new_state, reward, done = env.step(action)
        # Collect success statistics
        n, r_sum = n + 1, r_sum + reward
    retries.append(n)
    rewards.append(r_sum)

import plotly.express as px
# Number of steps the random agent needed in each trial
px.line(retries)

# Summed reward of each random-agent trial
px.line(rewards)

# Visit counts accumulated over all random-agent trials (reset() does not clear the grid)
px.imshow(env.grid)

env = Gridworld(size=4, start=(0, 0), goal=(3, 3))

# Q-table: one value per (row, column, action index), all starting at zero
Q = np.zeros((env.size, env.size, len(env.actions)))

# Hyperparameters
alpha = 0.2    # learning rate
gamma = 0.9    # discount factor
epsilon = 0.1  # exploration probability of the epsilon-greedy policy

rewardsQ, retriesQ = [], []
for trial in range(trials):
    state = env.reset()
    done = False
    n = 0
    r_sum = 0
    while not done:
        row, col = state
        explore = np.random.uniform(0, 1) < epsilon
        if explore:
            # Exploration: pick a random action index
            action = np.random.randint(0, len(env.actions))
        else:
            # Exploitation: pick the action with the highest Q-value in this state
            action = np.argmax(Q[row, col, :])
        # Apply the action; the environment returns the new state and the reward
        new_state, reward, done = env.step(env.actions[action])
        # Temporal-difference update of the Q-value of the action just taken
        best_next = np.max(Q[new_state[0], new_state[1], :])
        td_error = reward + gamma * best_next - Q[row, col, action]
        Q[row, col, action] += alpha * td_error
        # Advance to the new state
        state = new_state
        # Collect success statistics
        n += 1
        r_sum += reward
    retriesQ.append(n)
    rewardsQ.append(r_sum)

# Number of steps the Q-learning agent needed in each trial
px.line(retriesQ)

# Summed reward of each Q-learning trial
px.line(rewardsQ)

# Visit counts accumulated over all Q-learning trials (reset() does not clear the grid)
px.imshow(env.grid)

import numpy as np
import pandas as pd

# Simulated time series (e.g. stock prices): cumulative sum of 200 seeded
# normal draws, each shifted by +0.5
np.random.seed(42)
increments = np.random.randn(200) + 0.5
data = increments.cumsum()

# Visualize the simulated series
px.line(data)

class StockTradingEnv:
    """Minimal single-asset trading environment over a fixed price series.

    The observable state is the index of the price bucket that the current
    price falls into. Actions are integer codes: 0 = buy, 1 = sell, 2 = hold.
    At most one unit is held long (``position == 1``) or short
    (``position == -1``) at any time; ``balance`` tracks the cash flow of
    all trades.
    """

    def __init__(self, data, num_states=40):
        """Set up the environment.

        Args:
            data: Sequence of prices, indexed by time step.
            num_states: Number of bucket edges used to discretize the price.
        """
        self.data = data
        self.current_step = 0
        self.state = None
        # Evenly spaced bucket edges spanning the observed price range
        self.states = np.linspace(min(data), max(data), num_states)
        self.done = False
        # The list index is the action code understood by step(); the hold
        # action is listed so that len(self.actions) covers all three codes.
        self.actions = ['kaufen', 'verkauf', 'halten']
        self.position = 0  # 1 = long, -1 = short, 0 = neutral
        self.balance = 0
        self.current_price = 0
        self.old_price = 0

    def discretize(self):
        """Return the bucket index of the price at the current step."""
        return np.digitize(self.data[self.current_step], self.states) - 1

    def reset(self):
        """Rewind to the first price, clear all trading state, return the state."""
        self.current_step = 0
        self.state = self.discretize()
        self.done = False
        self.position = 0
        self.balance = 0
        self.old_price = 0
        self.current_price = 0
        return self.state

    def step(self, action):
        """Advance one time step and apply ``action`` at the new price.

        Opening a position yields no reward. Closing one yields twice the
        price difference between entry and exit, signed so that a profitable
        trade is positive. Buying while long, selling while short, and
        holding leave the position untouched.

        Args:
            action: 0 = buy, 1 = sell, 2 = hold.

        Returns:
            Tuple ``(state, reward, done)``; ``done`` becomes true once the
            last price of the series has been reached.
        """
        self.current_step += 1
        if self.current_step > len(self.data) - 2:
            self.done = True
        self.current_price = self.data[self.current_step]
        reward = 0
        if action == 0:  # buy
            if self.position == 0:  # open a long position
                self.position = 1
                self.balance -= self.current_price
                self.old_price = self.current_price
            elif self.position == -1:  # cover the short position
                reward = 2 * (self.old_price - self.current_price)
                self.position = 0
                # Buying back costs the current price
                self.balance -= self.current_price
        elif action == 1:  # sell
            if self.position == 0:  # open a short position
                self.position = -1
                self.balance += self.current_price
                self.old_price = self.current_price
            elif self.position == 1:  # close the long position
                reward = 2 * (self.current_price - self.old_price)
                self.position = 0
                # Selling pays out the current price
                self.balance += self.current_price
        # action == 2 (hold) changes nothing
        self.state = self.discretize()
        return self.state, reward, self.done

num_states = 40

# Example: create the StockTradingEnv and drive it with a scripted action list
env = StockTradingEnv(data, num_states)
state = env.reset()
print("Startzustand:", state)

# Action codes: 0 = buy, 2 = hold, 1 = sell
demo_actions = [0] + [2] * 5 + [1, 1] + [2] * 5 + [0]
for action in demo_actions:
    outcome = env.step(action)
    next_state, reward, done = outcome
    print("Nächster Zustand:", next_state, "Belohnung:", reward, "Ziel erreicht:", done)

Startzustand: 0
Nächster Zustand: 0 Belohnung: 0 Ziel erreicht: False
Nächster Zustand: 0 Belohnung: 0 Ziel erreicht: False
Nächster Zustand: 1 Belohnung: 0 Ziel erreicht: False
Nächster Zustand: 1 Belohnung: 0 Ziel erreicht: False
Nächster Zustand: 1 Belohnung: 0 Ziel erreicht: False
Nächster Zustand: 2 Belohnung: 0 Ziel erreicht: False
Nächster Zustand: 3 Belohnung: 14.098151214993003 Ziel erreicht: False
Nächster Zustand: 3 Belohnung: 0 Ziel erreicht: False
Nächster Zustand: 3 Belohnung: 0 Ziel erreicht: False
Nächster Zustand: 3 Belohnung: 0 Ziel erreicht: False
Nächster Zustand: 3 Belohnung: 0 Ziel erreicht: False
Nächster Zustand: 3 Belohnung: 0 Ziel erreicht: False
Nächster Zustand: 3 Belohnung: 0 Ziel erreicht: False
Nächster Zustand: 2 Belohnung: 1.565646416803098 Ziel erreicht: False

import time

# Train a tabular Q-learning agent on the StockTradingEnv and time the run
tic = time.time()
env = StockTradingEnv(data, num_states)

# Q-table: one row per price bucket, one column per action (buy, sell, hold)
n_actions = 3
Q = np.zeros((num_states, n_actions))

# Hyperparameters
alpha = 0.1    # learning rate
gamma = 0.9    # discount factor
epsilon = 0.1  # exploration probability of the epsilon-greedy policy

# Training loop; `actions` and `rewards` are cleared at the start of every
# episode, so after the loop they describe the last episode only.
for episode in range(1000):
    state = env.reset()
    done = False
    actions = []
    rewards = []
    total_reward = 0

    while not done:
        if np.random.random() < epsilon:
            # Exploration: sample from all actions the Q-table knows,
            # including hold
            action = np.random.randint(0, n_actions)
        else:
            # Exploitation: action with the highest Q-value in this state
            action = np.argmax(Q[state])
        # Apply the action; the environment returns the new state and the reward
        new_state, reward, done = env.step(action)
        # Temporal-difference update of the Q-value of the action just taken
        Q[state, action] += alpha * (reward + gamma * np.max(Q[new_state]) - Q[state, action])
        # Advance to the new state
        state = new_state
        # Collect statistics: chosen action and running sum of rewards
        actions.append(action)
        total_reward += reward
        rewards.append(total_reward)
print(f"Execution Time: {time.time()-tic}")

Execution Time: 1.185906171798706

# Running reward total recorded during the last training episode
px.line(rewards)

# Action code chosen at every step of the last training episode
px.line(actions)

import gymnasium as gym
from gymnasium import spaces
import numpy as np
import pandas as pd

class StockTradingEnvGym(gym.Env):
    """Gymnasium environment for trading one unit of a single asset.

    Observations are ``[current price, position, balance]`` as float32.
    Actions are 0 = buy, 1 = sell, 2 = hold. The position is 0 (neutral),
    1 (long) or -1 (short); ``balance`` is the cash account, starting at
    10000.
    """

    def __init__(self, data):
        """Store a private copy of the price series and declare the spaces.

        Args:
            data: Price series; must provide ``copy()`` and integer indexing.
        """
        super().__init__()
        self.data = data.copy()
        self.current_step = 0
        self.balance = 10000  # starting cash
        self.position = 0  # 0: neutral, 1: long, -1: short
        self.old_price = 0
        self.current_price = self.data[self.current_step]
        # Actions: 0 = buy, 1 = sell, 2 = hold
        self.action_space = spaces.Discrete(3)
        # Observation space: [current price, position, balance]
        self.observation_space = spaces.Box(
            low=np.array([-np.inf, -1, -np.inf], dtype=np.float32),
            high=np.array([np.inf, 1, np.inf], dtype=np.float32),
            dtype=np.float32,
        )

    def reset(self, seed=None, options=None):
        """Rewind to the first price and restore the initial account state.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset``.
            options: Unused; accepted for API compatibility.

        Returns:
            Tuple ``(observation, info)`` with an empty info dict.
        """
        super().reset(seed=seed)
        self.current_step = 0
        self.balance = 10000
        self.position = 0
        # Forget the entry price of any position left over from the last episode
        self.old_price = 0
        self.current_price = self.data[self.current_step]
        return self._get_obs(), {}

    def _get_obs(self):
        """Return the observation in the dtype declared by ``observation_space``."""
        return np.array(
            [self.current_price, self.position, self.balance],
            dtype=np.float32,
        )

    def step(self, action):
        """Advance one time step and apply ``action`` at the new price.

        Opening a position yields no reward. Closing one yields twice the
        price difference between entry and exit, signed so that a profitable
        trade is positive. Buying while long, selling while short, and
        holding leave the position untouched.

        Args:
            action: 0 = buy, 1 = sell, 2 = hold.

        Returns:
            Tuple ``(observation, reward, terminated, truncated, info)``;
            ``terminated`` is true once the last price has been reached,
            ``truncated`` is always ``False`` and ``info`` is empty.
        """
        self.current_step += 1
        self.current_price = self.data[self.current_step]
        reward = 0
        if action == 0:  # buy
            if self.position == 0:  # open a long position
                self.position = 1
                self.balance -= self.current_price
                self.old_price = self.current_price
            elif self.position == -1:  # cover the short position
                reward = 2 * (self.old_price - self.current_price)
                self.position = 0
                # Buying back costs the current price
                self.balance -= self.current_price
        elif action == 1:  # sell
            if self.position == 0:  # open a short position
                self.position = -1
                self.balance += self.current_price
                self.old_price = self.current_price
            elif self.position == 1:  # close the long position
                reward = 2 * (self.current_price - self.old_price)
                self.position = 0
                # Selling pays out the current price
                self.balance += self.current_price
        # action == 2 (hold) changes nothing and earns no reward
        done = self.current_step >= len(self.data) - 1
        return self._get_obs(), reward, done, False, {}

    def render(self, mode='human'):
        """Print step index, price, position and balance on one line."""
        print(f'Step: {self.current_step}, Price: {self.current_price}, Position: {self.position}, Balance: {self.balance}')

import os
# Set PYTHONWARNINGS so that DeprecationWarnings are ignored
os.environ["PYTHONWARNINGS"]="ignore::DeprecationWarning"
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.registry import register_env

# Start the timer; the time printed at the end covers setup plus training
tic=time.time()
def env_creator(env_config):
    """Return a new StockTradingEnvGym built on the module-level `data`; `env_config` is not used."""
    return StockTradingEnvGym(data)  # return an env instance

# Make the environment available to RLlib under the name used in the config below
register_env("StockTradingEnvGym", env_creator)

# 1. configure PPO: registered env, 2 env runners, torch framework,
#    default training settings, 1 evaluation env runner
config = (
    PPOConfig()
    .environment("StockTradingEnvGym")
    .env_runners(num_env_runners=2)
    .framework("torch")
    .training()
    .evaluation(evaluation_num_env_runners=1)
)

algo = config.build()  # 2. build the algorithm,

# Ten calls to train()
for _ in range(10):
    algo.train()  # 3. train it,

# Evaluation is left disabled here
#algo.evaluate()  # 4. and evaluate it.
print(f"Execution Time: {time.time()-tic}")

2024-06-24 11:11:54,421	WARNING algorithm_config.py:4078 -- You have specified 1 evaluation workers, but your `evaluation_interval` is 0 or None! Therefore, evaluation will not occur automatically with each call to `Algorithm.train()`. Instead, you will have to call `Algorithm.evaluate()` manually in order to trigger an evaluation run.
/Users/jploennigs/miniconda3/envs/lehre4/lib/python3.11/site-packages/ray/rllib/algorithms/algorithm.py:525: RayDeprecationWarning:

This API is deprecated and may be removed in future Ray releases. You could suppress this warning by setting env variable PYTHONWARNINGS="ignore::DeprecationWarning"
`UnifiedLogger` will be removed in Ray 2.7.

/Users/jploennigs/miniconda3/envs/lehre4/lib/python3.11/site-packages/ray/tune/logger/unified.py:53: RayDeprecationWarning:

This API is deprecated and may be removed in future Ray releases. You could suppress this warning by setting env variable PYTHONWARNINGS="ignore::DeprecationWarning"
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.

/Users/jploennigs/miniconda3/envs/lehre4/lib/python3.11/site-packages/ray/tune/logger/unified.py:53: RayDeprecationWarning:

This API is deprecated and may be removed in future Ray releases. You could suppress this warning by setting env variable PYTHONWARNINGS="ignore::DeprecationWarning"
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.

/Users/jploennigs/miniconda3/envs/lehre4/lib/python3.11/site-packages/ray/tune/logger/unified.py:53: RayDeprecationWarning:

This API is deprecated and may be removed in future Ray releases. You could suppress this warning by setting env variable PYTHONWARNINGS="ignore::DeprecationWarning"
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.

2024-06-24 11:11:56,045	INFO worker.py:1770 -- Started a local Ray instance.
2024-06-24 11:11:59,353	WARNING algorithm_config.py:4078 -- You have specified 1 evaluation workers, but your `evaluation_interval` is 0 or None! Therefore, evaluation will not occur automatically with each call to `Algorithm.train()`. Instead, you will have to call `Algorithm.evaluate()` manually in order to trigger an evaluation run.
2024-06-24 11:12:01,524	WARNING util.py:61 -- Install gputil for GPU system monitoring.
2024-06-24 11:12:02,343	WARNING deprecation.py:50 -- DeprecationWarning: `ray.rllib.execution.train_ops.multi_gpu_train_one_step` has been deprecated. This will raise an error in the future!

Execution Time: 48.61755394935608

(RolloutWorker pid=80435) /Users/jploennigs/miniconda3/envs/lehre4/lib/python3.11/site-packages/gymnasium/core.py:311: UserWarning: WARN: env.single_observation_space to get variables from other wrappers is deprecated and will be removed in v1.0, to get this variable you can do `env.unwrapped.single_observation_space` for environment variables or `env.get_wrapper_attr('single_observation_space')` that will search the reminding wrappers.
(RolloutWorker pid=80435)   logger.warn(
(RolloutWorker pid=80435) /Users/jploennigs/miniconda3/envs/lehre4/lib/python3.11/site-packages/gymnasium/core.py:311: UserWarning: WARN: env.single_action_space to get variables from other wrappers is deprecated and will be removed in v1.0, to get this variable you can do `env.unwrapped.single_action_space` for environment variables or `env.get_wrapper_attr('single_action_space')` that will search the reminding wrappers.
(RolloutWorker pid=80435)   logger.warn(
(RolloutWorker pid=80435) 2024-06-24 11:12:46,958	WARNING env_runner_v2.py:301 -- Could not import gymnasium.envs.classic_control.rendering! Try `pip install gymnasium[all]`.
(RolloutWorker pid=80656) /Users/jploennigs/miniconda3/envs/lehre4/lib/python3.11/site-packages/gymnasium/core.py:311: UserWarning: WARN: env.single_observation_space to get variables from other wrappers is deprecated and will be removed in v1.0, to get this variable you can do `env.unwrapped.single_observation_space` for environment variables or `env.get_wrapper_attr('single_observation_space')` that will search the reminding wrappers.
(RolloutWorker pid=80656)   logger.warn(
(RolloutWorker pid=80656) /Users/jploennigs/miniconda3/envs/lehre4/lib/python3.11/site-packages/gymnasium/core.py:311: UserWarning: WARN: env.single_action_space to get variables from other wrappers is deprecated and will be removed in v1.0, to get this variable you can do `env.unwrapped.single_action_space` for environment variables or `env.get_wrapper_attr('single_action_space')` that will search the reminding wrappers.
(RolloutWorker pid=80656)   logger.warn(

# Evaluate the trained agent on a fresh environment instance
env = StockTradingEnvGym(data)
# NOTE(review): reset() returns an (observation, info) tuple, so `state` holds that tuple here
state = env.reset()
done = False
total_reward = 0

actions = []  # raw return value of every compute_single_action call
rewards = []  # running reward total after every step
while not done:
    # NOTE(review): `state` (the reset tuple, later the observation returned by step)
    # is passed as the second positional argument — presumably RLlib's `state`
    # parameter; confirm this is intended
    action = algo.compute_single_action(env._get_obs(), state)
    # action[0] is what is fed to the environment, so the call above presumably
    # returns a tuple whose first entry is the action — TODO confirm
    state, reward, done, _, _ = env.step(action[0])
    total_reward += reward
    actions.append(action)
    rewards.append(total_reward)
    env.render()

print("Gesamtbelohnung:", total_reward)

Step: 1, Price: 1.358449851840048, Position: 0, Balance: 10000
Step: 2, Price: 2.5061383899407406, Position: 0, Balance: 10000
Step: 3, Price: 4.529168246348766, Position: 0, Balance: 10000
Step: 4, Price: 4.79501487162543, Position: 1, Balance: 9995.204985128374
Step: 5, Price: 5.060877914676249, Position: 1, Balance: 9995.204985128374
Step: 6, Price: 7.1400907301836405, Position: 1, Balance: 9995.204985128374
Step: 7, Price: 8.407525459336549, Position: 1, Balance: 9995.204985128374
Step: 8, Price: 8.438051073401597, Position: 1, Balance: 9995.204985128374
Step: 9, Price: 9.480611116987562, Position: 1, Balance: 9995.204985128374
Step: 10, Price: 9.517193424175101, Position: 1, Balance: 9995.204985128374
Step: 11, Price: 9.551463670604845, Position: 1, Balance: 9995.204985128374
Step: 12, Price: 10.29342594217088, Position: 1, Balance: 9995.204985128374
Step: 13, Price: 8.88014569751308, Position: 1, Balance: 9995.204985128374
Step: 14, Price: 7.6552278650000485, Position: 1, Balance: 9995.204985128374
Step: 15, Price: 7.592940335759076, Position: 1, Balance: 9995.204985128374
Step: 16, Price: 7.0801092154246525, Position: 1, Balance: 9995.204985128374
Step: 17, Price: 7.8943565480199265, Position: 1, Balance: 9995.204985128374
Step: 18, Price: 7.486332472498716, Position: 1, Balance: 9995.204985128374
Step: 19, Price: 6.574028771163424, Position: 1, Balance: 9995.204985128374
Step: 20, Price: 8.539677540084979, Position: 1, Balance: 9995.204985128374
Step: 21, Price: 8.813901239598444, Position: 1, Balance: 9995.204985128374
Step: 22, Price: 9.381429444286368, Position: 1, Balance: 9995.204985128374
Step: 23, Price: 8.456681258072912, Position: 1, Balance: 9995.204985128374
Step: 24, Price: 8.41229853354773, Position: 1, Balance: 9995.204985128374
Step: 25, Price: 9.023221123257596, Position: 1, Balance: 9995.204985128374
Step: 26, Price: 8.372227545835292, Position: 1, Balance: 9995.204985128374
Step: 27, Price: 9.247925564180964, Position: 1, Balance: 9995.204985128374
Step: 28, Price: 9.147286874262159, Position: 1, Balance: 9995.204985128374
Step: 29, Price: 9.355593124468882, Position: 1, Balance: 9995.204985128374
Step: 30, Price: 9.253886512239486, Position: 1, Balance: 9995.204985128374
Step: 31, Price: 11.606164696748424, Position: 1, Balance: 9995.204985128374
Step: 32, Price: 12.09266747201049, Position: 1, Balance: 9995.204985128374
Step: 33, Price: 11.53495654305459, Position: 1, Balance: 9995.204985128374
Step: 34, Price: 12.85750145515778, Position: 1, Balance: 9995.204985128374
Step: 35, Price: 12.136657805186758, Position: 0, Balance: 9990.409970256747
Step: 36, Price: 12.845521400191513, Position: 0, Balance: 9990.409970256747
Step: 37, Price: 11.385851276311739, Position: 1, Balance: 9979.024118980436
Step: 38, Price: 10.557665227413308, Position: 1, Balance: 9979.024118980436
Step: 39, Price: 11.254526463282431, Position: 1, Balance: 9979.024118980436
Step: 40, Price: 12.492993043277842, Position: 1, Balance: 9979.024118980436
Step: 41, Price: 13.164361324467812, Position: 1, Balance: 9979.024118980436
Step: 42, Price: 13.548713042079571, Position: 1, Balance: 9979.024118980436
Step: 43, Price: 13.747609346490282, Position: 1, Balance: 9979.024118980436
Step: 44, Price: 12.769087356122855, Position: 1, Balance: 9979.024118980436
Step: 45, Price: 12.549243147728147, Position: 1, Balance: 9979.024118980436
Step: 46, Price: 12.588604376768359, Position: 1, Balance: 9979.024118980436
Step: 47, Price: 14.145726602987274, Position: 1, Balance: 9979.024118980436
Step: 48, Price: 14.989344892555735, Position: 1, Balance: 9979.024118980436
Step: 49, Price: 13.726304737193, Position: 1, Balance: 9979.024118980436
Step: 50, Price: 14.550388706587796, Position: 1, Balance: 9979.024118980436
Step: 51, Price: 14.66530642617148, Position: 1, Balance: 9979.024118980436
Step: 52, Price: 14.48838442586552, Position: 1, Balance: 9979.024118980436
Step: 53, Price: 15.600060714706387, Position: 1, Balance: 9979.024118980436
Step: 54, Price: 17.131060237202338, Position: 1, Balance: 9979.024118980436
Step: 55, Price: 18.562340356318536, Position: 1, Balance: 9979.024118980436
Step: 56, Price: 18.223122833095896, Position: 1, Balance: 9979.024118980436
Step: 57, Price: 18.413910457244683, Position: 1, Balance: 9979.024118980436
Step: 58, Price: 19.245173888648246, Position: 1, Balance: 9979.024118980436
Step: 59, Price: 20.720719015770605, Position: 1, Balance: 9979.024118980436
Step: 60, Price: 20.741544777925316, Position: 1, Balance: 9979.024118980436
Step: 61, Price: 21.0558858012615, Position: 1, Balance: 9979.024118980436
Step: 62, Price: 20.44955082725547, Position: 1, Balance: 9979.024118980436
Step: 63, Price: 19.7533442031748, Position: 1, Balance: 9979.024118980436
Step: 64, Price: 21.065870025568998, Position: 1, Balance: 9979.024118980436
Step: 65, Price: 22.92211005413982, Position: 1, Balance: 9979.024118980436
Step: 66, Price: 23.350099932559488, Position: 1, Balance: 9979.024118980436
Step: 67, Price: 24.853632830451513, Position: 1, Balance: 9979.024118980436
Step: 68, Price: 25.715268855499147, Position: 1, Balance: 9979.024118980436
Step: 69, Price: 25.570149100894024, Position: 0, Balance: 9967.638267704126
Step: 70, Price: 26.431544706402438, Position: 1, Balance: 9941.206722997724
Step: 71, Price: 28.469581272868407, Position: 1, Balance: 9941.206722997724
Step: 72, Price: 28.933755233758454, Position: 1, Balance: 9941.206722997724
Step: 73, Price: 30.99839888957246, Position: 1, Balance: 9941.206722997724
Step: 74, Price: 28.878653785482715, Position: 1, Balance: 9941.206722997724
Step: 75, Price: 30.20055628985794, Position: 1, Balance: 9941.206722997724
Step: 76, Price: 30.78760335809611, Position: 1, Balance: 9941.206722997724
Step: 77, Price: 30.988596007630242, Position: 0, Balance: 9914.775178291322
Step: 78, Price: 31.580356784165744, Position: 0, Balance: 9914.775178291322
Step: 79, Price: 30.09278786956485, Position: 0, Balance: 9914.775178291322
Step: 80, Price: 30.373115981727338, Position: 1, Balance: 9884.402062309595
Step: 81, Price: 31.230228553239083, Position: 1, Balance: 9884.402062309595
Step: 82, Price: 33.2081225979806, Position: 1, Balance: 9884.402062309595
Step: 83, Price: 33.18985237970695, Position: 1, Balance: 9884.402062309595
Step: 84, Price: 32.88135877681376, Position: 1, Balance: 9884.402062309595
Step: 85, Price: 32.87960173322922, Position: 1, Balance: 9884.402062309595
Step: 86, Price: 34.295003850931295, Position: 1, Balance: 9884.402062309595
Step: 87, Price: 35.12375496059098, Position: 1, Balance: 9884.402062309595
Step: 88, Price: 35.09399475682394, Position: 1, Balance: 9884.402062309595
Step: 89, Price: 36.107262189937295, Position: 1, Balance: 9884.402062309595
Step: 90, Price: 36.70433973928534, Position: 1, Balance: 9884.402062309595
Step: 91, Price: 38.172984729818225, Position: 1, Balance: 9884.402062309595
Step: 92, Price: 37.97093163594087, Position: 1, Balance: 9884.402062309595
Step: 93, Price: 38.143269489343105, Position: 0, Balance: 9854.028946327868
Step: 94, Price: 38.25116133621095, Position: -1, Balance: 9892.280107664079
Step: 95, Price: 37.28764638807883, Position: 0, Balance: 9930.53126900029
Step: 96, Price: 38.0837666651434, Position: 0, Balance: 9930.53126900029
Step: 97, Price: 38.84482193732329, Position: 1, Balance: 9891.686447062966
Step: 98, Price: 39.34993539396575, Position: 1, Balance: 9891.686447062966
Step: 99, Price: 39.6153482605906, Position: 1, Balance: 9891.686447062966
Step: 100, Price: 38.69997751854019, Position: 1, Balance: 9891.686447062966
Step: 101, Price: 38.77933219577483, Position: 0, Balance: 9852.841625125642
Step: 102, Price: 38.93661767924806, Position: 0, Balance: 9852.841625125642
Step: 103, Price: 38.63434041002644, Position: 0, Balance: 9852.841625125642
Step: 104, Price: 38.97305469836043, Position: 0, Balance: 9852.841625125642
Step: 105, Price: 39.877105555174964, Position: 0, Balance: 9852.841625125642
Step: 106, Price: 42.263291456385495, Position: 0, Balance: 9852.841625125642
Step: 107, Price: 42.93786926921734, Position: 0, Balance: 9852.841625125642
Step: 108, Price: 43.6954196599401, Position: 0, Balance: 9852.841625125642
Step: 109, Price: 44.12097374417393, Position: 0, Balance: 9852.841625125642
Step: 110, Price: 42.70220252887489, Position: 1, Balance: 9810.139422596767
Step: 111, Price: 43.175688653425674, Position: 1, Balance: 9810.139422596767
Step: 112, Price: 43.7359188633667, Position: 1, Balance: 9810.139422596767
Step: 113, Price: 46.69916097585198, Position: 1, Balance: 9810.139422596767
Step: 114, Price: 47.00680001107086, Position: 1, Balance: 9810.139422596767
Step: 115, Price: 47.80834735340447, Position: 1, Balance: 9810.139422596767
Step: 116, Price: 48.273635583699225, Position: 1, Balance: 9810.139422596767
Step: 117, Price: 47.604957546079696, Position: 1, Balance: 9810.139422596767
Step: 118, Price: 49.24778036059472, Position: 0, Balance: 9767.437220067892
Step: 119, Price: 50.49971339328149, Position: 1, Balance: 9716.93750667461
Step: 120, Price: 51.790745340324534, Position: 1, Balance: 9716.93750667461
Step: 121, Price: 51.381357885529795, Position: 1, Balance: 9716.93750667461
Step: 122, Price: 53.2841521964659, Position: 1, Balance: 9716.93750667461
Step: 123, Price: 52.382301133673614, Position: 1, Balance: 9716.93750667461
Step: 124, Price: 53.46915822747388, Position: 1, Balance: 9716.93750667461
Step: 125, Price: 56.159613853283865, Position: 1, Balance: 9716.93750667461
Step: 126, Price: 55.669077528153174, Position: 1, Balance: 9716.93750667461
Step: 127, Price: 55.6027797985504, Position: 1, Balance: 9716.93750667461
Step: 128, Price: 56.20243116363804, Position: 1, Balance: 9716.93750667461
Step: 129, Price: 56.19895550952184, Position: 1, Balance: 9716.93750667461
Step: 130, Price: 55.148292078455704, Position: 1, Balance: 9716.93750667461
Step: 131, Price: 55.71685505326173, Position: 1, Balance: 9716.93750667461
Step: 132, Price: 55.15455133953563, Position: 1, Balance: 9716.93750667461
Step: 133, Price: 56.12814377017081, Position: 1, Balance: 9716.93750667461
Step: 134, Price: 55.70871953593701, Position: 1, Balance: 9716.93750667461
Step: 135, Price: 57.758653940954545, Position: 1, Balance: 9716.93750667461
Step: 136, Price: 57.47540064861831, Position: 0, Balance: 9666.43779328133
Step: 137, Price: 57.65333913241263, Position: 0, Balance: 9666.43779328133
Step: 138, Price: 58.966856349782304, Position: 0, Balance: 9666.43779328133
Step: 139, Price: 58.235992033348346, Position: -1, Balance: 9724.673785314679
Step: 140, Price: 58.96345196795247, Position: -1, Balance: 9724.673785314679
Step: 141, Price: 60.770594722234904, Position: -1, Balance: 9724.673785314679
Step: 142, Price: 59.663111487673675, Position: -1, Balance: 9724.673785314679
Step: 143, Price: 60.347745346205976, Position: -1, Balance: 9724.673785314679
Step: 144, Price: 61.1076281404544, Position: -1, Balance: 9724.673785314679
Step: 145, Price: 62.38945101223171, Position: -1, Balance: 9724.673785314679
Step: 146, Price: 61.652500301353626, Position: 0, Balance: 9782.909777348028
Step: 147, Price: 60.83204368826935, Position: 1, Balance: 9722.077733659758
Step: 148, Price: 61.85398525388625, Position: 1, Balance: 9722.077733659758
Step: 149, Price: 62.650969927119434, Position: 1, Balance: 9722.077733659758
Step: 150, Price: 63.40146277746531, Position: 0, Balance: 9661.245689971489
Step: 151, Price: 64.24791098696228, Position: 1, Balance: 9596.997778984527
Step: 152, Price: 64.06788626538379, Position: 1, Balance: 9596.997778984527
Step: 153, Price: 64.80013996254479, Position: 1, Balance: 9596.997778984527
Step: 154, Price: 65.59321243584347, Position: 1, Balance: 9596.997778984527
Step: 155, Price: 65.3788610178171, Position: 1, Balance: 9596.997778984527
Step: 156, Price: 67.74463552896185, Position: 1, Balance: 9596.997778984527
Step: 157, Price: 68.71846844987364, Position: 1, Balance: 9596.997778984527
Step: 158, Price: 68.02716495267099, Position: 1, Balance: 9596.997778984527
Step: 159, Price: 69.18371856130481, Position: 1, Balance: 9596.997778984527
Step: 160, Price: 68.70903689107749, Position: 1, Balance: 9596.997778984527
Step: 161, Price: 69.99612149481995, Position: 1, Balance: 9596.997778984527
Step: 162, Price: 71.65471707382736, Position: 1, Balance: 9596.997778984527
Step: 163, Price: 71.33403475547564, Position: 1, Balance: 9596.997778984527
Step: 164, Price: 72.79741088471997, Position: 1, Balance: 9596.997778984527
Step: 165, Price: 73.71019181165646, Position: 1, Balance: 9596.997778984527
Step: 166, Price: 75.03225197165095, Position: 1, Balance: 9596.997778984527
Step: 167, Price: 77.4290449543049, Position: 1, Balance: 9596.997778984527
Step: 168, Price: 77.68365683830203, Position: 1, Balance: 9596.997778984527
Step: 169, Price: 77.42992067394454, Position: 1, Balance: 9596.997778984527
Step: 170, Price: 77.04040624431902, Position: 1, Balance: 9596.997778984527
Step: 171, Price: 76.72459595935358, Position: 1, Balance: 9596.997778984527
Step: 172, Price: 77.14749424993947, Position: 1, Balance: 9596.997778984527
Step: 173, Price: 77.98864622475611, Position: 1, Balance: 9596.997778984527
Step: 174, Price: 78.76533702408612, Position: 1, Balance: 9596.997778984527
Step: 175, Price: 80.09252027312215, Position: 1, Balance: 9596.997778984527
Step: 176, Price: 80.60552216500005, Position: 1, Balance: 9596.997778984527
Step: 177, Price: 82.55905624215737, Position: 0, Balance: 9532.749867997565
Step: 178, Price: 82.79439940891942, Position: 1, Balance: 9449.955468588645
Step: 179, Price: 86.01456857550903, Position: 1, Balance: 9449.955468588645
Step: 180, Price: 87.14023592327403, Position: 1, Balance: 9449.955468588645
Step: 181, Price: 86.78307836685775, Position: 1, Balance: 9449.955468588645
Step: 182, Price: 86.21218586879664, Position: 1, Balance: 9449.955468588645
Step: 183, Price: 87.19465828403982, Position: 1, Balance: 9449.955468588645
Step: 184, Price: 87.47119549871397, Position: 1, Balance: 9449.955468588645
Step: 185, Price: 88.68519599280606, Position: 1, Balance: 9449.955468588645
Step: 186, Price: 89.6584336173796, Position: 1, Balance: 9449.955468588645
Step: 187, Price: 90.08560470472273, Position: 1, Balance: 9449.955468588645
Step: 188, Price: 89.73881098665431, Position: 1, Balance: 9449.955468588645
Step: 189, Price: 88.72396376196845, Position: 1, Balance: 9449.955468588645
Step: 190, Price: 88.77744880990143, Position: 1, Balance: 9449.955468588645
Step: 191, Price: 90.1338476042249, Position: 1, Balance: 9449.955468588645
Step: 192, Price: 90.8479413483551, Position: 1, Balance: 9449.955468588645
Step: 193, Price: 90.10220256964313, Position: 0, Balance: 9367.161069179725
Step: 194, Price: 90.77538349549431, Position: 0, Balance: 9367.161069179725
Step: 195, Price: 91.66070087522314, Position: 0, Balance: 9367.161069179725
Step: 196, Price: 91.276843439022, Position: 0, Balance: 9367.161069179725
Step: 197, Price: 91.93056854496753, Position: 1, Balance: 9275.230500634758
Step: 198, Price: 92.48877726341352, Position: 1, Balance: 9275.230500634758
Step: 199, Price: 91.8458069655829, Position: 1, Balance: 9275.230500634758
Gesamtbelohnung: 146.08859019547393

# First entry of every recorded compute_single_action result, i.e. the value passed to env.step
px.line([a[0] for a in actions])

# Running reward total over the evaluation episode
px.line(rewards)

import time

from ray.rllib.algorithms.ppo import PPOConfig

# Restart the timer inside this cell so the printed duration covers only this
# build-and-train run instead of everything since an earlier `tic`.
tic = time.time()

config = (  # 1. Configure the algorithm,
    PPOConfig()
    .environment("Pusher-v4", render_env=True)
    .env_runners(num_env_runners=2)
    .framework("torch")
    .training()
    .evaluation(evaluation_num_env_runners=1)
)

algo = config.build()  # 2. build the algorithm,

for _ in range(5):
    algo.train()  # 3. train it,

# No `evaluation_interval` is configured, so `train()` does not evaluate
# automatically; evaluation has to be triggered manually via the call below.
#algo.evaluate()  # 4. and evaluate it.
print(f"Execution Time: {time.time()-tic}")

2024-06-24 11:12:43,177	WARNING algorithm_config.py:4078 -- You have specified 1 evaluation workers, but your `evaluation_interval` is 0 or None! Therefore, evaluation will not occur automatically with each call to `Algorithm.train()`. Instead, you will have to call `Algorithm.evaluate()` manually in order to trigger an evaluation run.
/Users/jploennigs/miniconda3/envs/lehre4/lib/python3.11/site-packages/ray/rllib/algorithms/algorithm.py:525: RayDeprecationWarning:

This API is deprecated and may be removed in future Ray releases. You could suppress this warning by setting env variable PYTHONWARNINGS="ignore::DeprecationWarning"
`UnifiedLogger` will be removed in Ray 2.7.

/Users/jploennigs/miniconda3/envs/lehre4/lib/python3.11/site-packages/ray/tune/logger/unified.py:53: RayDeprecationWarning:

This API is deprecated and may be removed in future Ray releases. You could suppress this warning by setting env variable PYTHONWARNINGS="ignore::DeprecationWarning"
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.

/Users/jploennigs/miniconda3/envs/lehre4/lib/python3.11/site-packages/ray/tune/logger/unified.py:53: RayDeprecationWarning:

This API is deprecated and may be removed in future Ray releases. You could suppress this warning by setting env variable PYTHONWARNINGS="ignore::DeprecationWarning"
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.

/Users/jploennigs/miniconda3/envs/lehre4/lib/python3.11/site-packages/ray/tune/logger/unified.py:53: RayDeprecationWarning:

This API is deprecated and may be removed in future Ray releases. You could suppress this warning by setting env variable PYTHONWARNINGS="ignore::DeprecationWarning"
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.

2024-06-24 11:12:47,084	WARNING algorithm_config.py:4078 -- You have specified 1 evaluation workers, but your `evaluation_interval` is 0 or None! Therefore, evaluation will not occur automatically with each call to `Algorithm.train()`. Instead, you will have to call `Algorithm.evaluate()` manually in order to trigger an evaluation run.
2024-06-24 11:12:49,605	WARNING util.py:61 -- Install gputil for GPU system monitoring.

Execution Time: 79.14778399467468

from IPython.display import IFrame

# Embed the YouTube video in the notebook output.
# IFrame() URL-encodes unknown keyword arguments into the query string of
# `src`, so real HTML attributes must be handed over through `extras`.
# `frameborder="0"` and `allowfullscreen` are already part of IFrame's own
# HTML template and therefore need not be repeated here.
IFrame(
    src="https://www.youtube.com/embed/_QmcH1TyNwg",
    width="800",
    height="413",
    extras=[
        'title="Gymnasium - Pusher-v4, Test with different algorithms"',
        'allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"',
        'referrerpolicy="strict-origin-when-cross-origin"',
    ],
)

import time

from ray.rllib.algorithms.ppo import PPOConfig

# Restart the timer inside this cell so the printed duration covers only this
# build-and-train run instead of everything since an earlier `tic`.
tic = time.time()

config = (  # 1. Configure the algorithm,
    PPOConfig()
    .environment('FetchPickAndPlaceDense-v2')
    .env_runners(num_env_runners=2)
    .framework("torch")
    .training()
    .evaluation(evaluation_num_env_runners=1)
)

algo = config.build()  # 2. build the algorithm,

for _ in range(5):
    algo.train()  # 3. train it,

# No `evaluation_interval` is configured, so `train()` does not evaluate
# automatically; evaluation has to be triggered manually via the call below.
#algo.evaluate()  # 4. and evaluate it.
print(f"Execution Time: {time.time()-tic}")

2024-06-24 11:13:13,573	WARNING algorithm_config.py:4078 -- You have specified 1 evaluation workers, but your `evaluation_interval` is 0 or None! Therefore, evaluation will not occur automatically with each call to `Algorithm.train()`. Instead, you will have to call `Algorithm.evaluate()` manually in order to trigger an evaluation run.
2024-06-24 11:13:17,302	WARNING algorithm_config.py:4078 -- You have specified 1 evaluation workers, but your `evaluation_interval` is 0 or None! Therefore, evaluation will not occur automatically with each call to `Algorithm.train()`. Instead, you will have to call `Algorithm.evaluate()` manually in order to trigger an evaluation run.
2024-06-24 11:13:20,204	WARNING util.py:61 -- Install gputil for GPU system monitoring.

Execution Time: 119.0861268043518

Grundlegende Konzepte

Beispiel: Gridworld-Umgebung

Random Walk

Q-Learning

Zeitreihe

RL Frameworks

Robotik

Robot Pusher

Pick and Place