Test Code
import set_env
import logging
import multiprocessing as mp
import sys
import time
from pathlib import Path
from d2py.utils.log_config import config_logging
# resolve project paths and make the gym-multigrid test package importable
root_dir = Path(".").resolve()
sys.path.extend([str(root_dir.parents[2]/"tests/gym-multigrid")])

# directories for log files and rendered images
logger_dir = root_dir/".temp"
logger_dir.mkdir(parents=True, exist_ok=True)
temp_dir = root_dir/"images"
temp_dir.mkdir(parents=True, exist_ok=True)

# rotating-file logger for this experiment
logger_name = "drlhp"
logger = logging.getLogger(logger_name)
config_logging(f'{logger_dir}/{logger_name}.log', logger_name, maxBytes=50000, backupCount=2)
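`config_logging` comes from `d2py`; judging from its `maxBytes`/`backupCount` arguments it presumably attaches a rotating file handler to the `drlhp` logger, so later messages should land in `.temp/drlhp.log`. A quick sanity check under that assumption:

```python
# Assumes config_logging wired a rotating file handler to the "drlhp" logger.
logger.info("environment setup started; log file: %s", logger_dir / f"{logger_name}.log")
```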
import argparse
import random
import numpy as np
import torch
import gymnasium as gym
# import pybullet_envs
# import matplotlib.pyplot as plt
# model
from utils.model.ppo import PPO
from utils.config import get_config
# wrappers
from utils.human_feedback_wrapper import HumanFeedback, SyntheticFeedback
from utils.reward_wrapper import FeedbackReward
from dataclasses import dataclass
@dataclass
class EnvConfig:
    env_name: str  # ["cartpole", "pendulum", "cheetah"]
    seed: int = 1
    entropy: float = 0.1  # [0.0, 0.01, 0.05, 0.1]
    synthetic: bool = False
    constant_ask: int = 1000  # [100, 1000, 10000]
    collect_initial: int = 0  # [0, 50, 200]
    num_batches: int = 100

    def __post_init__(self):
        self.config = get_config(
            self.env_name,
            self.seed,
            self.entropy,
            self.constant_ask,
            self.collect_initial,
            self.num_batches
        )
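Any field besides `env_name` can be overridden at construction time; the values listed in the comments are the candidate settings. For example, a synthetic-feedback run with more frequent preference queries could be configured as in this sketch (the specific values are illustrative, not from the original run):

```python
# Hypothetical alternative configuration: oracle (synthetic) feedback,
# a preference query every 100 steps, and 50 initial clips collected up front.
synthetic_config = EnvConfig("cartpole", synthetic=True, constant_ask=100, collect_initial=50)
```

The run below sticks with the defaults and human feedback for the pendulum task.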
env_config = EnvConfig("pendulum")

if env_config.synthetic:
    env = SyntheticFeedback(FeedbackReward(gym.make(env_config.config.env_name)), config=env_config.config)
else:
    env = HumanFeedback(FeedbackReward(gym.make(env_config.config.env_name)), config=env_config.config)
entropy: 0.1
Updating reward network every 1000 steps
Collect a preference every 1000 steps
Clip length = 30
--Using human feedback.--
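The lines above are printed when the wrapped environment is constructed and show the active configuration. Because `gymnasium` wrappers nest, the learned-reward wrapper (`FeedbackReward`) sits inside the feedback-collection wrapper (`HumanFeedback` or `SyntheticFeedback`). A quick way to confirm the wrapper chain is to print the environment, since `gymnasium` renders wrappers as nested names; the exact output depends on the wrappers' own reprs:

```python
# Inspect the wrapper stack; expected to look roughly like
# <HumanFeedback<FeedbackReward<...Pendulum...>>>.
print(env)
print(env.observation_space, env.action_space)
```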
eval_env = gym.make(env_config.config.env_name)
# train model
observation = eval_env.reset()
eval_env.reset()
model = PPO(env, eval_env, env_config.config, env_config.seed)
model.train()
print (f"{env.pref_db.total_labeled} preference collected total")
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[7], line 5
3 eval_env.reset()
4 model = PPO(env, eval_env, env_config.config, env_config.seed)
----> 5 model.train()
6 print (f"{env.pref_db.total_labeled} preference collected total")
File /media/pc/data/lxw/ai/d2py/doc/libs/drlhp/utils/model/ppo.py:77, in PPO.train(self)
72 averaged_total_rewards = [] # the returns for each iteration
74 for t in range(self.config.num_batches):
75
76 # collect a minibatch of samples
---> 77 paths, total_rewards = self.sample_path(self.env)
78 eval_paths, total_eval_rewards = self.sample_path(self.eval_env, num_episodes = self.eval_num_episodes)
79 all_eval_total_rewards.extend(total_eval_rewards)
File /media/pc/data/lxw/ai/d2py/doc/libs/drlhp/utils/model/ppo.py:162, in PPO.sample_path(self, env, num_episodes)
160 states.append(state)
161 # Note the difference between this line and the corresponding line in PolicyGradient.
--> 162 action, old_logprob = self.policy.act(states[-1][None], return_log_prob = True)
163 assert old_logprob.shape == (1,)
164 action, old_logprob = action[0], old_logprob[0]
TypeError: tuple indices must be integers or slices, not NoneType
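The traceback points at `states[-1][None]` inside `PPO.sample_path`: the code appends the raw return value of `env.reset()` to `states`, but under the Gymnasium API (0.26 and later) `reset()` returns an `(observation, info)` tuple rather than a bare observation, so indexing that tuple with `None` raises the `TypeError` above. A minimal reproduction and the likely fix, sketched against the standard Gymnasium API (the concrete env id `Pendulum-v1` is an assumption based on the `"pendulum"` config):

```python
import gymnasium as gym

demo_env = gym.make("Pendulum-v1")

# Old-style usage: reset() now returns an (observation, info) tuple,
# so indexing the stored "state" with None fails.
state = demo_env.reset()
print(type(state))       # <class 'tuple'>
# state[None]            # TypeError: tuple indices must be integers or slices, not NoneType

# New-style usage: unpack the tuple first, then adding a batch axis works.
obs, info = demo_env.reset(seed=0)
print(obs[None].shape)   # (1, 3) for Pendulum-v1 observations
```

The same applies to `observation = eval_env.reset()` in the training cell above; inside `sample_path`, the likely fix is to store `env.reset()[0]` (or unpack `state, info = env.reset()`) before appending to `states`.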
# emulate batching a single (100, 100, 3) image observation
obs = np.arange(100*100*3).reshape(100, 100, 3)
np.array([np.array(obs)]).shape
(1, 100, 100, 3)
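The `(1, 100, 100, 3)` result confirms the intent of `states[-1][None]`: add a leading batch axis to a single observation before passing it to the policy. That only works when the stored state is an `ndarray`; with the `(obs, info)` tuple returned by `reset()`, the same indexing fails as shown in the traceback. Equivalent ways to add the batch axis, assuming `obs` is the array defined above:

```python
obs[None].shape               # (1, 100, 100, 3): None (np.newaxis) prepends an axis
np.expand_dims(obs, 0).shape  # same result, more explicit
```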