I'm trying to replicate the results reported in the paper.
I had some trouble when mapping labels from the original words onto the tokenized tokens. The token-id sequence is usually longer than the word sequence, since a word can be split into multiple sub-word tokens. For example, the word "Bệnh" is split into 'Be@@', '̣@@', '', 'nh', which means the corresponding label sequence also has to be expanded.
The problem can be solved easily if the tokenizer output exposes the 'word_ids' attribute, which indicates which original word each input_id comes from. Most fast tokenizers in Hugging Face support this.
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align per-word labels to sub-word tokens.

    A single word may be split into several sub-word tokens, so the original
    per-word labels must be expanded to token level. Special tokens (and,
    depending on ``data_args.label_all_tokens``, continuation sub-tokens)
    receive the label -100 so the loss function ignores them.

    NOTE(review): relies on module-level names defined by the surrounding
    script: ``tokenizer``, ``data_args``, ``padding``, ``text_column_name``,
    ``label_column_name``, ``label_to_id`` and ``b_to_i_label``.

    Raises:
        ValueError: if ``tokenizer`` is a slow (pure-Python) tokenizer,
            which does not implement ``word_ids()``.
    """
    # word_ids() is only implemented by fast (Rust-backed) tokenizers.
    # Fail early with an actionable message instead of crashing deep
    # inside datasets.map with "word_ids() is not available when using
    # Python-based tokenizers" (see the traceback in this report).
    if not getattr(tokenizer, "is_fast", False):
        raise ValueError(
            "tokenize_and_align_labels requires a fast tokenizer to map "
            "sub-word tokens back to words. Reload the tokenizer with "
            "AutoTokenizer.from_pretrained(..., use_fast=True), or use a "
            "checkpoint that provides a tokenizer.json."
        )
    tokenized_inputs = tokenizer(
        examples[text_column_name],
        padding=padding,
        truncation=True,
        max_length=data_args.max_seq_length,
        # We use this argument because the texts in our dataset are lists
        # of words (with a label for each word).
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(examples[label_column_name]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens (e.g. [CLS]/[SEP]/padding) have no source
                # word; -100 is ignored by the loss function.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First sub-token of each word keeps the word's label.
                label_ids.append(label_to_id[label[word_idx]])
            elif data_args.label_all_tokens:
                # Remaining sub-tokens of the word: propagate the label,
                # converting B- tags to their I- counterpart.
                label_ids.append(b_to_i_label[label_to_id[label[word_idx]]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs
0%| | 0/6 [00:00<?, ?ba/s]
Traceback (most recent call last):
File "C:\Users\quang\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 3441, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-7-66d85e73c564>", line 101, in <module>
train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)
File "C:\Users\quang\PycharmProjects\DeepGamingAI_FPS\venv\lib\site-packages\datasets\arrow_dataset.py", line 2035, in map
return self._map_single(
File "C:\Users\quang\PycharmProjects\DeepGamingAI_FPS\venv\lib\site-packages\datasets\arrow_dataset.py", line 521, in wrapper
out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
File "C:\Users\quang\PycharmProjects\DeepGamingAI_FPS\venv\lib\site-packages\datasets\arrow_dataset.py", line 488, in wrapper
out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
File "C:\Users\quang\PycharmProjects\DeepGamingAI_FPS\venv\lib\site-packages\datasets\fingerprint.py", line 406, in wrapper
out = func(self, *args, **kwargs)
File "C:\Users\quang\PycharmProjects\DeepGamingAI_FPS\venv\lib\site-packages\datasets\arrow_dataset.py", line 2403, in _map_single
batch = apply_function_on_filtered_inputs(
File "C:\Users\quang\PycharmProjects\DeepGamingAI_FPS\venv\lib\site-packages\datasets\arrow_dataset.py", line 2290, in apply_function_on_filtered_inputs
function(*fn_args, effective_indices, **fn_kwargs) if with_indices else function(*fn_args, **fn_kwargs)
File "C:\Users\quang\PycharmProjects\DeepGamingAI_FPS\venv\lib\site-packages\datasets\arrow_dataset.py", line 1990, in decorated
result = f(decorated_item, *args, **kwargs)
File "<ipython-input-7-66d85e73c564>", line 75, in tokenize_and_align_labels
word_ids = tokenized_inputs.word_ids(batch_index=i)
File "C:\Users\quang\PycharmProjects\DeepGamingAI_FPS\venv\lib\site-packages\transformers\tokenization_utils_base.py", line 353, in word_ids
raise ValueError("word_ids() is not available when using Python-based tokenizers")
ValueError: word_ids() is not available when using Python-based tokenizers