Comments (3)
我注意到以下代码:
# Training-set definition: lazily load the raw HF dataset and convert it
# to xtuner's conversation format via the two map functions.
alpaca_zh = dict(
    type=process_hf_dataset,
    dataset=dict(type=load_dataset, path=alpaca_zh_path),
    tokenizer=tokenizer,
    max_length=max_length,
    dataset_map_fn=alpaca_zh_map_fn,  # stock map_fn — fixes the system prompt
    template_map_fn=dict(
        type=template_map_fn_factory, template=prompt_template),
    remove_unused_columns=True,
    shuffle_before_pack=True,
    pack_to_max_length=pack_to_max_length)
其中alpaca_zh_map_fn为默认:
# Copyright (c) OpenMMLab. All rights reserved.
from xtuner.utils import SYSTEM_TEMPLATE
def alpaca_zh_map_fn(example):
    """Convert one raw alpaca_zh record into xtuner's conversation format.

    Note that the system prompt is hard-coded to ``SYSTEM_TEMPLATE.alpaca``
    here, so a ``SYSTEM`` variable defined elsewhere in the config has no
    effect on the training data produced by this map_fn.
    """
    merged_input = f"{example['instruction_zh']}\n{example['input_zh']}"
    single_turn = {
        'system': SYSTEM_TEMPLATE.alpaca,
        'input': merged_input,
        'output': example['output_zh'],
    }
    return {'conversation': [single_turn]}
这是否意味着我在代码里直接定义的SYSTEM是无效的?
from xtuner.
我最终通过如下代码解决了:自定义一个 map_fn 来注入我的 SYSTEM。我认为官方的 map_fn 应该支持自动使用配置中定义的 SYSTEM,或在文档中明确说明需要自定义 map_fn!
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from datasets import load_dataset
from mmengine.dataset import DefaultSampler
from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
LoggerHook, ParamSchedulerHook)
from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
from peft import LoraConfig
from torch.optim import AdamW
from transformers import (AutoModelForCausalLM, AutoTokenizer,
BitsAndBytesConfig)
from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import alpaca_zh_map_fn, template_map_fn_factory
from xtuner.engine import DatasetInfoHook, EvaluateChatHook
from xtuner.model import SupervisedFinetune
from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
#######################################################################
#                          PART 1  Settings                           #
#        (browse built-in config templates: `xtuner list-cfg`)        #
#######################################################################
# Model: local path to the base Qwen-7B-Chat checkpoint
pretrained_model_name_or_path = '/data1/tzz/Pku政务大模型/Model/Qwen-7B-Chat'
# Data: directory holding the alpaca_zh-style training set
alpaca_zh_path = '/data1/tzz/Pku政务大模型/Trainer/Data'
prompt_template = PROMPT_TEMPLATE.qwen_chat  # chat template matching Qwen
max_length = 2048  # max token length per (packed) sample
pack_to_max_length = True  # concatenate short samples up to max_length
# Scheduler & Optimizer
batch_size = 1  # per_device
accumulative_counts = 16  # gradient accumulation steps
dataloader_num_workers = 0
max_epochs = 1
optim_type = AdamW
lr = 3e-4
betas = (0.9, 0.999)
weight_decay = 1e-3
max_norm = 1  # grad clip
warmup_ratio = 0.03
# Launch commands, kept for reference:
# export CUDA_VISIBLE_DEVICES=1,2,3,4,5
# NPROC_PER_NODE=5 xtuner train '/data1/tzz/Pku政务大模型/Trainer/XTuner/config/qwen_7b.py' --deepspeed deepspeed_zero3
# Evaluate the generation performance during the training
evaluation_freq = 500
# Custom system prompt; injected into the data by SYSTEM_map_fn below and
# passed to EvaluateChatHook so training and evaluation agree.
SYSTEM = "你的任务是重庆市政务文书写作、政务问答。\n参照你固有的知识或者我给出的法律文献,在引用法律文件时使用《》包裹其名称。\n"
evaluation_inputs = [
    '信件标题:平潭综合实验区政策咨询\n信件内容:如何申请成为重庆市政府的平潭综合实验区政策援助对象?范围和条件分别是什么?', '信件标题:询问步行街烟火管理政策\n信件内容:您好,我想开一家熟食摊位在我所在区的步行街,我想请问重庆市对于步行街烟火管理有没有特定的政策规定需要我们遵循?'
]
def SYSTEM_map_fn(example):
    """Dataset map_fn that injects the config-level ``SYSTEM`` prompt.

    Drop-in replacement for the stock ``alpaca_zh_map_fn``, whose system
    field is hard-coded; the input/output mapping is otherwise identical.
    """
    merged_input = f"{example['instruction_zh']}\n{example['input_zh']}"
    single_turn = {
        'system': SYSTEM,
        'input': merged_input,
        'output': example['output_zh'],
    }
    return {'conversation': [single_turn]}
# Earlier drafts of the system prompt / evaluation input, kept for reference:
# 你的任务是重庆市政务文书写作、政务问答。
# 你生成的问题必须包含:1、信件标题,2、信件内容。你生成的答复内容部分必须有法律依据,且有礼貌的开头,例如:“您好!来信收悉,现回复如下:”,参照你固有的知识或者我给出的法律文献,在引用法律文件时使用《》包裹其名称。
# 信件标题:平潭综合实验区政策咨询\n信件内容:如何申请成为重庆市政府的平潭综合实验区政策援助对象?范围和条件分别是什么?
#######################################################################
#                      PART 2  Model & Tokenizer                      #
#######################################################################
# Tokenizer is declared lazily (type + kwargs); instantiated by the runner.
tokenizer = dict(
    type=AutoTokenizer.from_pretrained,
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    trust_remote_code=True,  # Qwen ships custom tokenizer code
    padding_side='right',
    eos_token='<|endoftext|>')
# Supervised fine-tuning of a 4-bit (nf4) quantized base model, fp16 compute.
# NOTE(review): the `lora` section is commented out, so no LoRA adapter is
# attached — confirm that training a 4-bit-quantized model without LoRA is
# intended (quantized configs usually pair with LoRA/QLoRA).
model = dict(
    type=SupervisedFinetune,
    llm=dict(
        type=AutoModelForCausalLM.from_pretrained,
        pretrained_model_name_or_path=pretrained_model_name_or_path,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        quantization_config=dict(
            type=BitsAndBytesConfig,
            load_in_4bit=True,
            load_in_8bit=False,
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4')
    ),
    # lora=dict(
    #     type=LoraConfig,
    #     r=64,
    #     lora_alpha=16,
    #     lora_dropout=0.1,
    #     bias='none',
    #     task_type='CAUSAL_LM')
)
#######################################################################
#                      PART 3  Dataset & Dataloader                   #
#######################################################################
alpaca_zh = dict(
    type=process_hf_dataset,
    dataset=dict(type=load_dataset, path=alpaca_zh_path),
    tokenizer=tokenizer,
    max_length=max_length,
    # Custom map_fn so the SYSTEM prompt defined above is applied; the
    # stock alpaca_zh_map_fn hard-codes SYSTEM_TEMPLATE.alpaca instead.
    dataset_map_fn=SYSTEM_map_fn,
    template_map_fn=dict(
        type=template_map_fn_factory, template=prompt_template),
    remove_unused_columns=True,
    shuffle_before_pack=True,
    pack_to_max_length=pack_to_max_length)
# Dataloader over the packed dataset above.
train_dataloader = dict(
    batch_size=batch_size,
    num_workers=dataloader_num_workers,
    dataset=alpaca_zh,
    sampler=dict(type=DefaultSampler, shuffle=True),
    collate_fn=dict(type=default_collate_fn))
#######################################################################
#                    PART 4  Scheduler & Optimizer                    #
#######################################################################
# optimizer
optim_wrapper = dict(
    type=AmpOptimWrapper,  # fp16 mixed-precision training
    optimizer=dict(
        type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
    clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
    accumulative_counts=accumulative_counts,
    loss_scale='dynamic',
    dtype='float16')
# learning policy
# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
param_scheduler = [
    # Linear warmup over the first `warmup_ratio` fraction of training.
    dict(
        type=LinearLR,
        start_factor=1e-5,
        by_epoch=True,
        begin=0,
        end=warmup_ratio * max_epochs,
        convert_to_iter_based=True),
    # Cosine decay to eta_min for the remainder of training.
    dict(
        type=CosineAnnealingLR,
        eta_min=0.0,
        by_epoch=True,
        begin=warmup_ratio * max_epochs,
        T_max=max_epochs,
        convert_to_iter_based=True)
]
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1)
#######################################################################
#                           PART 5  Runtime                           #
#######################################################################
# Log the dialogue periodically during the training process, optional
custom_hooks = [
    # Prints a decoded training sample so the applied template can be checked.
    dict(type=DatasetInfoHook, tokenizer=tokenizer),
    # Periodically generates replies to `evaluation_inputs` using the same
    # custom SYSTEM prompt as the training data.
    dict(
        type=EvaluateChatHook,
        tokenizer=tokenizer,
        every_n_iters=evaluation_freq,
        evaluation_inputs=evaluation_inputs,
        system=SYSTEM,
        prompt_template=prompt_template)
]
# configure default hooks
default_hooks = dict(
    # record the time of every iteration.
    timer=dict(type=IterTimerHook),
    # print log every 10 iterations (comment previously said 100).
    logger=dict(type=LoggerHook, interval=10),
    # enable the parameter scheduler.
    param_scheduler=dict(type=ParamSchedulerHook),
    # save checkpoint per epoch.
    checkpoint=dict(type=CheckpointHook, interval=1),
    # set sampler seed in distributed environment.
    sampler_seed=dict(type=DistSamplerSeedHook),
)
# configure environment
env_cfg = dict(
    # whether to enable cudnn benchmark
    cudnn_benchmark=False,
    # set multi process parameters
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    # set distributed parameters (NCCL backend for multi-GPU training)
    dist_cfg=dict(backend='nccl'),
)
# set visualizer
visualizer = None
# set log level
log_level = 'INFO'
# load from which checkpoint
load_from = None
# whether to resume training from the loaded checkpoint
resume = False
# Defaults to use random seed and disable `deterministic`
randomness = dict(seed=None, deterministic=False)
from xtuner.
确实是由 system 设置不准确导致的,使用时得仔细配置 map_fn。
为避免类似问题,我们会将预设的 system 移除
from xtuner.
Related Issues (20)
- Any method to finetune embedding layers using Xtuner? HOT 1
- 关于gemma的template问题 HOT 1
- 关于自定义图文数据微调 HOT 8
- RuntimeError: Rank 2 successfully reached monitoredBarrier, but received errors while waiting for send/recv from rank 0. Please check rank 0 logs for faulty rank. HOT 2
- Is there any plan to support MAC?
- KeyError: 'Column length not in the dataset. Current columns in the dataset: []' HOT 2
- internlm2_20b_qlora_msagent_react_e3_gpu8脚本运行时报错 HOT 22
- 请问如何对deepspeed中的相关参数进行配置,比如master_port? HOT 2
- xtuner check-custom-dataset /home/internlm2.py不通过怎么办? HOT 1
- 数据template 的截断-- input和output想切分一下该去哪里改呢? HOT 1
- 使用deepseek-coder-6.7B作为基座进行SFT时报错 HOT 6
- 在配置文件中尝试添加val_evaluator时,出现ModuleNotFoundError HOT 4
- mengine - WARNING - WARNING: command error: 'libGL.so.1: cannot open shared object file: No such file or directory'! HOT 1
- LLava的文本处理,好像没有对齐 HOT 1
- 咨询关于 FLOPs 的计算 HOT 2
- Support CohereForAI/c4ai-command-r-v01 HOT 1
- 微调完成后,测试对话inference-Mixtral8X7B HOT 3
- xtuner convert merge出现OOM! 单卡a100 HOT 1
- [Feature] 提前确认离线处理的 llava 数据所使用的 tokenizer 是否和当前配置中一致
- 在微调过程中,评估样例出现循环,到底是因为评估时没有惩罚参数,还是因为微调已经过拟合了? HOT 4
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from xtuner.