# HER relabeling: replay every cached transition once per substitute goal.
# BUG FIX: the original computed `calcu_reward(new_goal, state, action)`
# before `state` and `action` were assigned in this loop body, so the reward
# was based on undefined (or stale, previous-iteration) values. Bind them
# from the current transition first, then compute the reward.
for i, transition in enumerate(episode_cache):
    # Sample HER_sample_num substitute goals for transition i.
    new_goals = generate_goals(i, episode_cache, args.HER_sample_num)
    for new_goal in new_goals:
        # Use this transition's own (state, action) to score the new goal.
        state, action = transition[0], transition[1]
        reward = calcu_reward(new_goal, state, action)
        # Rewrite the transition's (s, a, s') under the substitute goal.
        state, action, new_state = gene_new_sas(new_goal, transition)
        ram.add(state, action, reward, new_state)
# HER goal relabeling: each stored transition is re-added to the replay
# buffer once for every sampled substitute goal.
for idx, trans in enumerate(episode_cache):
    sampled_goals = generate_goals(idx, episode_cache, args.HER_sample_num)
    for goal in sampled_goals:
        # Reward is computed from this transition's own state/action pair.
        s = trans[0]
        a = trans[1]
        r = calcu_reward(goal, s, a)
        # One transition gets relabeled under each of the substitute goals.
        s, a, s_next = gene_new_sas(goal, trans)
        ram.add(s, a, r, s_next)
Without this fix, the algorithm does not converge — I confirmed this by training it myself.