Test Code

import set_env  # project-local helper, imported for its side effects on the runtime environment
import logging
import multiprocessing as mp
import sys
import time
from pathlib import Path
from d2py.utils.log_config import config_logging

# Make the bundled gym-multigrid test package importable.
root_dir = Path(".").resolve()
sys.path.extend([str(root_dir.parents[2]/"tests/gym-multigrid")])

# Working directories for log files and rendered images.
logger_dir = root_dir/".temp"
logger_dir.mkdir(parents=True, exist_ok=True)
temp_dir = root_dir/"images"
temp_dir.mkdir(parents=True, exist_ok=True)

# Rotating file logger for this experiment.
logger_name = "drlhp"
logger = logging.getLogger(logger_name)
config_logging(f'{logger_dir}/{logger_name}.log', logger_name, maxBytes=50000, backupCount=2)
import argparse
import random
from dataclasses import dataclass

import numpy as np
import torch
import gymnasium as gym
# import pybullet_envs
# import matplotlib.pyplot as plt

# model
from utils.model.ppo import PPO
from utils.config import get_config
# wrappers
from utils.human_feedback_wrapper import HumanFeedback, SyntheticFeedback
from utils.reward_wrapper import FeedbackReward

@dataclass
class EnvConfig:
    env_name: str # ["cartpole", "pendulum", "cheetah"]
    seed: int = 1
    entropy: float = 0.1 # [0.0, 0.01, 0.05, 0.1]
    synthetic: bool = False
    constant_ask: int = 1000 # [100, 1000, 10000]
    collect_initial: int = 0 # [0, 50, 200] 
    num_batches: int = 100

    def __post_init__(self):
        self.config = get_config(
            self.env_name, 
            self.seed, 
            self.entropy, 
            self.constant_ask, 
            self.collect_initial, 
            self.num_batches
        )
env_config = EnvConfig("pendulum")
if env_config.synthetic:
    env = SyntheticFeedback(
        FeedbackReward(gym.make(env_config.config.env_name)),
        config=env_config.config,
    )
else:
    env = HumanFeedback(
        FeedbackReward(gym.make(env_config.config.env_name)),
        config=env_config.config,
    )
entropy:  0.1
Updating reward network every 1000 steps
Collect a preference every 1000 steps
Clip length =  30
--Using human feedback.--
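The banner above is printed while the wrappers are constructed and reflects the configuration built by get_config: entropy 0.1, a preference collected every 1000 steps, clips of 30 frames, and the HumanFeedback branch being taken. To run the same pipeline without a human in the loop, flipping the synthetic flag routes construction through the SyntheticFeedback branch above:

env_config = EnvConfig("pendulum", synthetic=True)
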
eval_env = gym.make(env_config.config.env_name)
# train model
observation = eval_env.reset()  # Gymnasium's reset() returns an (obs, info) tuple
eval_env.reset()
model = PPO(env, eval_env, env_config.config, env_config.seed)
model.train()
print (f"{env.pref_db.total_labeled} preference collected total")
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[7], line 5
      3 eval_env.reset()
      4 model = PPO(env, eval_env, env_config.config, env_config.seed)
----> 5 model.train()
      6 print (f"{env.pref_db.total_labeled} preference collected total")

File /media/pc/data/lxw/ai/d2py/doc/libs/drlhp/utils/model/ppo.py:77, in PPO.train(self)
     72 averaged_total_rewards = []  # the returns for each iteration
     74 for t in range(self.config.num_batches):
     75 
     76     # collect a minibatch of samples
---> 77     paths, total_rewards = self.sample_path(self.env)
     78     eval_paths, total_eval_rewards = self.sample_path(self.eval_env, num_episodes = self.eval_num_episodes)
     79     all_eval_total_rewards.extend(total_eval_rewards)

File /media/pc/data/lxw/ai/d2py/doc/libs/drlhp/utils/model/ppo.py:162, in PPO.sample_path(self, env, num_episodes)
    160 states.append(state)
    161 # Note the difference between this line and the corresponding line in PolicyGradient.
--> 162 action, old_logprob = self.policy.act(states[-1][None], return_log_prob = True)
    163 assert old_logprob.shape == (1,)
    164 action, old_logprob = action[0], old_logprob[0]

TypeError: tuple indices must be integers or slices, not NoneType
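The TypeError points at states[-1][None]: indexing with None only adds a leading batch axis when the object is a NumPy array; on a plain tuple it calls tuple.__getitem__(None) and fails with exactly this message. A likely cause is that sample_path stores the raw return value of env.reset(), which in Gymnasium is an (observation, info) tuple rather than the observation itself. A minimal reproduction sketch (the env id Pendulum-v1 is only an assumption here; any Gymnasium environment behaves the same way):

import gymnasium as gym
import numpy as np

env = gym.make("Pendulum-v1")
state = env.reset()                  # Gymnasium returns (observation, info)
print(type(state))                   # <class 'tuple'>
try:
    state[None]                      # same TypeError as in the traceback above
except TypeError as err:
    print(err)

obs, info = env.reset()              # unpack to get the ndarray observation
print(np.asarray(obs)[None].shape)   # (1, 3): [None] adds the batch axis as intended

The cell below double-checks the array case in isolation: stacking an ndarray observation into a batch of one works as expected.
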
obs = np.arange(100*100*3).reshape(100, 100, 3)
np.array([np.array(obs)]).shape
(1, 100, 100, 3)
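So the batching itself is fine; the failure is about the object appended to states. A plausible fix inside sample_path (a sketch only, since just the traceback of ppo.py is visible here) is to unpack Gymnasium's return values so that only the observation is stored: keep obs from the (obs, info) pair returned by reset(), and likewise from the five-tuple returned by step(). Reusing the eval_env created earlier:

obs, info = eval_env.reset()         # keep only the observation, drop info
states = [np.asarray(obs)]
print(states[-1][None].shape)        # (1, obs_dim): the batched input policy.act expects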