Coder Social home page Coder Social logo

Comments (3)

LumenScope avatar LumenScope commented on June 11, 2024

我注意到以下代码:

# Dataset config: `type` names the builder (process_hf_dataset) and the
# remaining keys are its arguments.
alpaca_zh = dict(
    type=process_hf_dataset,
    dataset=dict(type=load_dataset, path=alpaca_zh_path),
    tokenizer=tokenizer,
    max_length=max_length,
    # The map function decides what goes into each record's 'system' field,
    # so a SYSTEM string defined elsewhere in the config is not used here.
    dataset_map_fn=alpaca_zh_map_fn,
    template_map_fn=dict(
        type=template_map_fn_factory, template=prompt_template),
    remove_unused_columns=True,
    shuffle_before_pack=True,
    pack_to_max_length=pack_to_max_length)

其中alpaca_zh_map_fn为默认:

# Copyright (c) OpenMMLab. All rights reserved.
from xtuner.utils import SYSTEM_TEMPLATE


def alpaca_zh_map_fn(example):
    """Turn one Chinese-Alpaca record into a single-turn conversation.

    Args:
        example: Mapping that provides the ``instruction_zh``, ``input_zh``
            and ``output_zh`` fields.

    Returns:
        dict: ``{'conversation': [turn]}`` where ``turn`` carries the
        ``system``, ``input`` and ``output`` entries. The system prompt is
        always ``SYSTEM_TEMPLATE.alpaca``.
    """
    system_prompt = SYSTEM_TEMPLATE.alpaca
    # Instruction text and input text are joined by a single newline.
    user_text = f"{example['instruction_zh']}\n{example['input_zh']}"
    single_turn = dict(
        system=system_prompt, input=user_text, output=example['output_zh'])
    return dict(conversation=[single_turn])

这是否意味着我在代码里直接定义的SYSTEM是无效的?

from xtuner.

LumenScope avatar LumenScope commented on June 11, 2024

我最终修改成如下代码解决了,我认为官方的map_fn需要追加一个自动修改!

# Copyright (c) OpenMMLab. All rights reserved.
import torch
from datasets import load_dataset
from mmengine.dataset import DefaultSampler
from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
                            LoggerHook, ParamSchedulerHook)
from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
from peft import LoraConfig
from torch.optim import AdamW
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig)

from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import alpaca_zh_map_fn, template_map_fn_factory
from xtuner.engine import DatasetInfoHook, EvaluateChatHook
from xtuner.model import SupervisedFinetune
from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE

#######################################################################
#                          PART 1  Settings                           #
#######################################################################
# Model
# Local checkpoint directory; passed to both the tokenizer and the LLM loader.
pretrained_model_name_or_path = '/data1/tzz/Pku政务大模型/Model/Qwen-7B-Chat'

# Data
# Location handed to `load_dataset` for the training data.
alpaca_zh_path = '/data1/tzz/Pku政务大模型/Trainer/Data'
# Chat template used by the dataset template map fn and by EvaluateChatHook.
prompt_template = PROMPT_TEMPLATE.qwen_chat
# Forwarded to process_hf_dataset as `max_length`.
max_length = 2048
# Forwarded to process_hf_dataset as `pack_to_max_length`.
pack_to_max_length = True

# Scheduler & Optimizer
batch_size = 1  # per_device
# Passed to the optimizer wrapper as `accumulative_counts`.
accumulative_counts = 16
dataloader_num_workers = 0
max_epochs = 1
optim_type = AdamW
lr = 3e-4
betas = (0.9, 0.999)
weight_decay = 1e-3
max_norm = 1  # grad clip
# Fraction of `max_epochs` covered by the LinearLR warmup phase.
warmup_ratio = 0.03
# export CUDA_VISIBLE_DEVICES=1,2,3,4,5
# NPROC_PER_NODE=5 xtuner train '/data1/tzz/Pku政务大模型/Trainer/XTuner/config/qwen_7b.py'  --deepspeed deepspeed_zero3
# Evaluate the generation performance during the training
# Passed to EvaluateChatHook as `every_n_iters`.
evaluation_freq = 500
# Task-specific system prompt. It is used twice in this config: by
# SYSTEM_map_fn for the training conversations and by EvaluateChatHook for
# the periodic evaluation chats, so both share the same prompt.
SYSTEM = "你的任务是重庆市政务文书写作、政务问答。\n参照你固有的知识或者我给出的法律文献,在引用法律文件时使用《》包裹其名称。\n"
# Sample prompts handed to EvaluateChatHook as `evaluation_inputs`.
evaluation_inputs = [
    '信件标题:平潭综合实验区政策咨询\n信件内容:如何申请成为重庆市政府的平潭综合实验区政策援助对象?范围和条件分别是什么?', '信件标题:询问步行街烟火管理政策\n信件内容:您好,我想开一家熟食摊位在我所在区的步行街,我想请问重庆市对于步行街烟火管理有没有特定的政策规定需要我们遵循?'
]
def SYSTEM_map_fn(example):
    """Build a single-turn conversation that uses the config's SYSTEM prompt.

    Args:
        example: Mapping that provides the ``instruction_zh``, ``input_zh``
            and ``output_zh`` fields.

    Returns:
        dict: ``{'conversation': [turn]}`` where ``turn`` carries the
        ``system``, ``input`` and ``output`` entries. The system prompt is
        the module-level ``SYSTEM`` string.
    """
    system_prompt = SYSTEM
    # Instruction text and input text are joined by a single newline.
    user_text = f"{example['instruction_zh']}\n{example['input_zh']}"
    single_turn = dict(
        system=system_prompt, input=user_text, output=example['output_zh'])
    return dict(conversation=[single_turn])
# 你的任务是重庆市政务文书写作、政务问答。 
# 你生成的问题必须包含:1、信件标题,2、信件内容。你生成的答复内容部分必须有法律依据,且有礼貌的开头,例如:“您好!来信收悉,现回复如下:”,参照你固有的知识或者我给出的法律文献,在引用法律文件时使用《》包裹其名称。
# 信件标题:平潭综合实验区政策咨询\n信件内容:如何申请成为重庆市政府的平潭综合实验区政策援助对象?范围和条件分别是什么?
#######################################################################
#                      PART 2  Model & Tokenizer                      #
#######################################################################
# Tokenizer config: built via AutoTokenizer.from_pretrained from the same
# checkpoint path as the model. This dict is reused by the dataset config
# and by both custom hooks.
tokenizer = dict(
    type=AutoTokenizer.from_pretrained,
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    trust_remote_code=True,
    padding_side='right',
    eos_token='<|endoftext|>')

# Model config: SupervisedFinetune wrapping a causal LM that is loaded in
# fp16 with a 4-bit NF4 (double-quantized) BitsAndBytes configuration.
# NOTE(review): the LoRA adapter config below is commented out while 4-bit
# quantization stays enabled — confirm that training the quantized model
# without an adapter is intended.
model = dict(
    type=SupervisedFinetune,
    llm=dict(
        type=AutoModelForCausalLM.from_pretrained,
        pretrained_model_name_or_path=pretrained_model_name_or_path,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        quantization_config=dict(
            type=BitsAndBytesConfig,
            load_in_4bit=True,
            load_in_8bit=False,
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4')
        ),
    # lora=dict(
    #     type=LoraConfig,
    #     r=64,
    #     lora_alpha=16,
    #     lora_dropout=0.1,
    #     bias='none',
    #     task_type='CAUSAL_LM')
    )

#######################################################################
#                      PART 3  Dataset & Dataloader                   #
#######################################################################
# Training dataset config: `type` names the builder (process_hf_dataset) and
# the remaining keys are its arguments.
alpaca_zh = dict(
    type=process_hf_dataset,
    dataset=dict(type=load_dataset, path=alpaca_zh_path),
    tokenizer=tokenizer,
    max_length=max_length,
    # Uses the locally defined SYSTEM_map_fn (not the stock alpaca_zh_map_fn)
    # so each conversation carries the custom SYSTEM prompt.
    dataset_map_fn=SYSTEM_map_fn,
    template_map_fn=dict(
        type=template_map_fn_factory, template=prompt_template),
    remove_unused_columns=True,
    shuffle_before_pack=True,
    pack_to_max_length=pack_to_max_length)

# Dataloader config for training: wraps the `alpaca_zh` dataset with a
# shuffling DefaultSampler and the default collate function.
train_dataloader = dict(
    batch_size=batch_size,
    num_workers=dataloader_num_workers,
    dataset=alpaca_zh,
    sampler=dict(type=DefaultSampler, shuffle=True),
    collate_fn=dict(type=default_collate_fn))

#######################################################################
#                    PART 4  Scheduler & Optimizer                    #
#######################################################################
# optimizer
# AmpOptimWrapper around the optimizer selected by `optim_type`, with
# gradient clipping at `max_norm`, gradient accumulation over
# `accumulative_counts` steps, dynamic loss scaling and float16 dtype.
optim_wrapper = dict(
    type=AmpOptimWrapper,
    optimizer=dict(
        type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
    clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
    accumulative_counts=accumulative_counts,
    loss_scale='dynamic',
    dtype='float16')

# learning policy
# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md  # noqa: E501
# Two phases that share the boundary `warmup_ratio * max_epochs`: a LinearLR
# warmup from epoch 0 up to that point, then CosineAnnealingLR from that
# point on.
param_scheduler = [
    dict(
        type=LinearLR,
        start_factor=1e-5,
        by_epoch=True,
        begin=0,
        end=warmup_ratio * max_epochs,
        convert_to_iter_based=True),
    dict(
        type=CosineAnnealingLR,
        eta_min=0.0,
        by_epoch=True,
        begin=warmup_ratio * max_epochs,
        T_max=max_epochs,
        convert_to_iter_based=True)
]

# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1)

#######################################################################
#                           PART 5  Runtime                           #
#######################################################################
# Log the dialogue periodically during the training process, optional
custom_hooks = [
    dict(type=DatasetInfoHook, tokenizer=tokenizer),
    dict(
        type=EvaluateChatHook,
        tokenizer=tokenizer,
        every_n_iters=evaluation_freq,
        evaluation_inputs=evaluation_inputs,
        # Same SYSTEM string that SYSTEM_map_fn puts into the training data.
        system=SYSTEM,
        prompt_template=prompt_template)
]

# configure default hooks
default_hooks = dict(
    # record the time of every iteration.
    timer=dict(type=IterTimerHook),
    # print log every 10 iterations.
    logger=dict(type=LoggerHook, interval=10),
    # enable the parameter scheduler.
    param_scheduler=dict(type=ParamSchedulerHook),
    # save checkpoint per epoch.
    checkpoint=dict(type=CheckpointHook, interval=1),
    # set sampler seed in distributed environment.
    sampler_seed=dict(type=DistSamplerSeedHook),
)

# configure environment
env_cfg = dict(
    # whether to enable cudnn benchmark
    cudnn_benchmark=False,
    # set multi process parameters: 'fork' start method, and
    # opencv_num_threads set to 0
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    # set distributed parameters: 'nccl' backend
    dist_cfg=dict(backend='nccl'),
)

# set visualizer (None: no visualizer is configured here)
visualizer = None

# set log level
log_level = 'INFO'

# load from which checkpoint (None: no checkpoint path is given)
load_from = None

# whether to resume training from the loaded checkpoint
resume = False

# Defaults to use random seed and disable `deterministic`
randomness = dict(seed=None, deterministic=False)

from xtuner.

LZHgrla avatar LZHgrla commented on June 11, 2024

确实是由 system 设置不准确导致的,使用时得仔细配置 map_fn。

为避免类似问题,我们会将预设的 system 移除

#363
#395

from xtuner.

Related Issues (20)

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Something interesting about the web. A new door to the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Something interesting about games that makes everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.