I have an input of 23bp sequence with a 2 classification problem, I put my data as said in $DATA_PATH/train.tsv and dev.tsv
Input:
`
export KMER=3
export MODEL_PATH='/mnt/d/M3/Projects/BCB/DNABERT/models/3-new-12w-0/'
export DATA_PATH='/mnt/d/M3/Projects/BCB/DNABERT/examples/sample_data/ft/prom-core/3'
export OUTPUT_PATH='/mnt/d/M3/Projects/BCB/DNABERT/examples/OUTPUT/fit/3mer/'
python run_finetune.py
--model_type dna
--tokenizer_name=dna$KMER
--model_name_or_path $MODEL_PATH
--task_name dnasplice
--do_train
--do_eval
--data_dir $DATA_PATH
--max_seq_length 23
--per_gpu_eval_batch_size=8
--per_gpu_train_batch_size=8
--learning_rate 2e-4
--num_train_epochs 3.0
--output_dir $OUTPUT_PATH
--evaluate_during_training
--logging_steps 100
--save_steps 4000
--warmup_percent 0.1
--hidden_dropout_prob 0.1
--overwrite_output
--weight_decay 0.01
--n_process 8
`
I get an error:
`
07/28/2021 20:37:24 - WARNING - main - Process rank: -1, device: cuda, n_gpu: 1, distributed training: False, 16-bits training: False
07/28/2021 20:37:24 - INFO - transformers.configuration_utils - loading configuration file /mnt/d/M3/Projects/BCB/DNABERT/models/3-new-12w-0/config.json
07/28/2021 20:37:24 - INFO - transformers.configuration_utils - Model config BertConfig {
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"bos_token_id": 0,
"do_sample": false,
"eos_token_ids": 0,
"finetuning_task": "dnasplice",
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"id2label": {
"0": "LABEL_0",
"1": "LABEL_1"
},
"initializer_range": 0.02,
"intermediate_size": 3072,
"is_decoder": false,
"label2id": {
"LABEL_0": 0,
"LABEL_1": 1
},
"layer_norm_eps": 1e-12,
"length_penalty": 1.0,
"max_length": 20,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_beams": 1,
"num_hidden_layers": 12,
"num_labels": 3,
"num_return_sequences": 1,
"num_rnn_layer": 1,
"output_attentions": false,
"output_hidden_states": false,
"output_past": true,
"pad_token_id": 0,
"pruned_heads": {},
"repetition_penalty": 1.0,
"rnn": "lstm",
"rnn_dropout": 0.0,
"rnn_hidden": 768,
"split": 10,
"temperature": 1.0,
"top_k": 50,
"top_p": 1.0,
"torchscript": false,
"type_vocab_size": 2,
"use_bfloat16": false,
"vocab_size": 69
}
============================================================
<class 'transformers.tokenization_dna.DNATokenizer'>
07/28/2021 20:37:24 - INFO - transformers.tokenization_utils - loading file https://raw.githubusercontent.com/jerryji1993/DNABERT/master/src/transformers/dnabert-config/bert-config-3/vocab.txt from cache at /home/woreom/.cache/torch/transformers/e1e7221d086d0af09215b2c6ef3ded41de274c79ace1930c48dfce242a7b36fa.b24b7bce4d95258cccdbc46b651c8283db3a0f1324fb97567c8b22b19970f82c
07/28/2021 20:37:24 - INFO - transformers.modeling_utils - loading weights file /mnt/d/M3/Projects/BCB/DNABERT/models/3-new-12w-0/pytorch_model.bin
07/28/2021 20:37:26 - INFO - transformers.modeling_utils - Weights of BertForSequenceClassification not initialized from pretrained model: ['classifier.weight', 'classifier.bias']
07/28/2021 20:37:26 - INFO - transformers.modeling_utils - Weights from pretrained model not used in BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
07/28/2021 20:37:26 - INFO - main - finish loading model
07/28/2021 20:37:28 - INFO - main - Training/evaluation parameters Namespace(adam_epsilon=1e-08, attention_probs_dropout_prob=0.1, beta1=0.9, beta2=0.999, cache_dir='', config_name='', data_dir='/mnt/d/M3/Projects/BCB/DNABERT/examples/sample_data/ft/prom-core/3', device=device(type='cuda'), do_ensemble_pred=False, do_eval=True, do_lower_case=False, do_predict=False, do_train=True, do_visualize=False, early_stop=0, eval_all_checkpoints=False, evaluate_during_training=True, fp16=False, fp16_opt_level='O1', gradient_accumulation_steps=1, hidden_dropout_prob=0.1, learning_rate=0.0002, local_rank=-1, logging_steps=100, max_grad_norm=1.0, max_seq_length=75, max_steps=-1, model_name_or_path='/mnt/d/M3/Projects/BCB/DNABERT/models/3-new-12w-0/', model_type='dna', n_gpu=1, n_process=8, no_cuda=False, num_rnn_layer=2, num_train_epochs=3.0, output_dir='/mnt/d/M3/Projects/BCB/DNABERT/examples/OUTPUT/fit/3mer/', output_mode='classification', overwrite_cache=False, overwrite_output_dir=True, per_gpu_eval_batch_size=8, per_gpu_pred_batch_size=8, per_gpu_train_batch_size=8, predict_dir=None, predict_scan_size=1, result_dir=None, rnn='lstm', rnn_dropout=0.0, rnn_hidden=768, save_steps=4000, save_total_limit=None, seed=42, server_ip='', server_port='', should_continue=False, task_name='dnasplice', tokenizer_name='dna3', visualize_data_dir=None, visualize_models=None, visualize_train=False, warmup_percent=0.1, warmup_steps=0, weight_decay=0.01)
07/28/2021 20:37:28 - INFO - main - Loading features from cached file /mnt/d/M3/Projects/BCB/DNABERT/examples/sample_data/ft/prom-core/3/cached_train_3-new-12w-0_75_dnasplice
07/28/2021 20:37:29 - INFO - main - ***** Running training *****
07/28/2021 20:37:29 - INFO - main - Num examples = 16748
07/28/2021 20:37:29 - INFO - main - Num Epochs = 3
07/28/2021 20:37:29 - INFO - main - Instantaneous batch size per GPU = 8
07/28/2021 20:37:29 - INFO - main - Total train batch size (w. parallel, distributed & accumulation) = 8
07/28/2021 20:37:29 - INFO - main - Gradient Accumulation steps = 1
07/28/2021 20:37:29 - INFO - main - Total optimization steps = 6282
07/28/2021 20:37:29 - INFO - main - Continuing training from checkpoint, will skip to saved global_step
07/28/2021 20:37:29 - INFO - main - Continuing training from epoch 0
07/28/2021 20:37:29 - INFO - main - Continuing training from global step 0
07/28/2021 20:37:29 - INFO - main - Will skip the first 0 steps in the first epoch
Epoch: 0%| | 0/3 [00:00<?, ?it/s07/28/2021 20:38:09 - INFO - main - Loading features from cached file /mnt/d/M3/Projects/BCB/DNABERT/examples/sample_data/ft/prom-core/3/cached_dev_3-new-12w-0_75_dnasplice
07/28/2021 20:38:09 - INFO - main - ***** Running evaluation *****
07/28/2021 20:38:09 - INFO - main - Num examples = 424
07/28/2021 20:38:09 - INFO - main - Batch size = 8
Evaluating: 100%|███████████████████████████████████████████████████████████████████████| 53/53 [00:05<00:00, 9.22it/s]
/home/woreom/anaconda3/envs/dnabert/lib/python3.6/site-packages/sklearn/metrics/_classification.py:1248: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use zero_division
parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
/home/woreom/anaconda3/envs/dnabert/lib/python3.6/site-packages/sklearn/metrics/_classification.py:873: RuntimeWarning: invalid value encountered in double_scalars
mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
Iteration: 5%|███▎ | 99/2094 [00:46<15:32, 2.14it/s]
Epoch: 0%| | 0/3 [00:46<?, ?it/s]
Traceback (most recent call last):
File "run_finetune.py", line 1282, in
main()
File "run_finetune.py", line 1097, in main
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
File "run_finetune.py", line 304, in train
results = evaluate(args, model, tokenizer)
File "run_finetune.py", line 447, in evaluate
result = compute_metrics(eval_task, preds, out_label_ids, probs)
File "/mnt/d/M3/Projects/BCB/DNABERT/src/transformers/data/metrics/init.py", line 110, in glue_compute_metrics
return acc_f1_mcc_auc_pre_rec(preds, labels, probs)
File "/mnt/d/M3/Projects/BCB/DNABERT/src/transformers/data/metrics/init.py", line 79, in acc_f1_mcc_auc_pre_rec
auc = roc_auc_score(labels, probs, average="macro", multi_class="ovo")
File "/home/woreom/anaconda3/envs/dnabert/lib/python3.6/site-packages/sklearn/utils/validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "/home/woreom/anaconda3/envs/dnabert/lib/python3.6/site-packages/sklearn/metrics/_ranking.py", line 538, in roc_auc_score
multi_class, average, sample_weight)
File "/home/woreom/anaconda3/envs/dnabert/lib/python3.6/site-packages/sklearn/metrics/_ranking.py", line 632, in _multiclass_roc_auc_score
"Number of classes in y_true not equal to the number of "
ValueError: Number of classes in y_true not equal to the number of columns in 'y_score'
`