import numpy as np
import pandas as pd
import tensorflow as tf

# Silence TensorFlow logging and let GPU memory grow on demand
tf.get_logger().setLevel('ERROR')
for gpu in tf.config.experimental.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)

from keras import layers, models

# FinRock: data feeding, trading environment, scaling, reward and metrics
from finrock.data_feeder import PdDataFeeder
from finrock.trading_env import TradingEnv
from finrock.scalers import MinMaxScaler
from finrock.reward import simpleReward
from finrock.metrics import DifferentActions, AccountValue

# RockRL: training utilities and the TensorFlow PPO agent
from rockrl.utils.misc import MeanAverage
from rockrl.utils.memory import Memory
from rockrl.tensorflow import PPOAgent

# Load the synthetic sinusoid dataset and hold out the last 1000 rows for testing
df = pd.read_csv('Datasets/random_sinusoid.csv')
df = df[:-1000]  # leave 1000 for testing

pd_data_feeder = PdDataFeeder(df)


# Trading environment: 50-step observation window, min-max scaled observations,
# a simple reward function and two tracked metrics
env = TradingEnv(
    data_feeder=pd_data_feeder,
    output_transformer=MinMaxScaler(min=pd_data_feeder.min, max=pd_data_feeder.max),
    initial_balance=1000.0,
    max_episode_steps=1000,
    window_size=50,
    reward_function=simpleReward,
    metrics=[
        DifferentActions(),
        AccountValue(),
    ]
)

action_space = env.action_space
input_shape = env.observation_space.shape


# Actor: maps the observation window to a probability distribution over actions
actor_model = models.Sequential([
    layers.Input(shape=input_shape, dtype=tf.float32),
    layers.Flatten(),
    layers.Dense(512, activation='elu'),
    layers.Dense(256, activation='elu'),
    layers.Dense(64, activation='elu'),
    layers.Dropout(0.5),
    layers.Dense(action_space, activation='softmax')
])

# Critic: estimates the state value with a single linear output
critic_model = models.Sequential([
    layers.Input(shape=input_shape, dtype=tf.float32),
    layers.Flatten(),
    layers.Dense(512, activation='elu'),
    layers.Dense(256, activation='elu'),
    layers.Dense(64, activation='elu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation=None)
])

agent = PPOAgent(
    actor=actor_model,
    critic=critic_model,
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0002),
    batch_size=512,
    lamda=0.95,  # GAE lambda
    kl_coeff=0.5,
    c2=0.01,
    writer_comment='ppo_sinusoid',
)


# Training loop: collect one episode, update the agent, track the running mean reward
memory = Memory()
meanAverage = MeanAverage(best_mean_score_episode=1000)
state, info = env.reset()
rewards = 0.0
while True:
    action, prob = agent.act(state)

    next_state, reward, terminated, truncated, info = env.step(action)
    memory.append(state, action, reward, prob, terminated, truncated, next_state, info)
    state = next_state

    if memory.done:
        history = agent.train(memory)
        mean_reward = meanAverage(np.sum(memory.rewards))

        # Save the models whenever the running mean reward reaches a new best
        if meanAverage.is_best(agent.epoch):
            agent.save_models('ppo_sinusoid')

        # Decay the learning rate when the policy update drifts too far (large KL divergence)
        if history['kl_div'] > 0.05:
            agent.reduce_learning_rate(0.99, verbose=False)

        print(agent.epoch, np.sum(memory.rewards), mean_reward, info["metrics"]['account_value'], history['kl_div'])
        agent.log_to_writer(info['metrics'])
        memory.reset()
        state, info = env.reset()

    if agent.epoch >= 10000:
        break
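

# Evaluation sketch: a minimal illustration of how the 1000 rows held out above could be
# replayed with the trained agent. It only reuses calls already present in this script
# (PdDataFeeder, TradingEnv, agent.act, env.step); it assumes the environment and agent
# behave the same at inference time, and it evaluates the in-memory agent rather than
# reloading saved models.
test_df = pd.read_csv('Datasets/random_sinusoid.csv')[-1000:]
test_data_feeder = PdDataFeeder(test_df)
test_env = TradingEnv(
    data_feeder=test_data_feeder,
    output_transformer=MinMaxScaler(min=test_data_feeder.min, max=test_data_feeder.max),
    initial_balance=1000.0,
    max_episode_steps=1000,
    window_size=50,
    reward_function=simpleReward,
    metrics=[DifferentActions(), AccountValue()],
)

state, info = test_env.reset()
done = False
while not done:
    action, _ = agent.act(state)  # assumed usable for plain inference as well
    state, reward, terminated, truncated, info = test_env.step(action)
    done = terminated or truncated
print('test account value:', info['metrics']['account_value'])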