Fix attention_mask and position_ids
parent e22cddf212
commit 373fd6b9d4
@@ -340,7 +340,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         token_ids_0 += [self.sp_tokenizer[self.bos_token]]
         if token_ids_1 is not None:
-            if token_ids_1[-1] != eop_id:
+            if not token_ids_1 or token_ids_1[-1] != eop_id:
                 token_ids_1 += [eop_id]
             token_ids_0 += token_ids_1
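The hunk above guards against an empty token_ids_1: the old check indexed token_ids_1[-1] directly, which raises IndexError when the second sequence is an empty list. A minimal standalone sketch of the fixed behaviour follows; the helper name and the eop_id value are illustrative only, not taken from the tokenizer.

# Sketch of the fixed check, assuming an illustrative eop_id.
def append_eop(token_ids_1, eop_id=150005):
    # Old code did `if token_ids_1[-1] != eop_id:`, which crashes on [].
    if not token_ids_1 or token_ids_1[-1] != eop_id:
        token_ids_1 += [eop_id]
    return token_ids_1

print(append_eop([]))            # [150005]      (previously raised IndexError)
print(append_eop([17]))          # [17, 150005]
print(append_eop([17, 150005]))  # [17, 150005]  (eop token not duplicated)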
@@ -397,7 +397,8 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

-        if return_attention_mask:
+        # Initialize attention mask if not present.
+        if max_length is not None:
             if "attention_mask" not in encoded_inputs:
                 if bos_token_id in required_input:
                     context_length = required_input.index(bos_token_id)
                 else:
@@ -408,6 +409,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
                 attention_mask = np.bool_(attention_mask < 0.5)
                 encoded_inputs["attention_mask"] = attention_mask

             if "position_ids" not in encoded_inputs:
                 position_ids = np.arange(seq_length, dtype=np.int64)
+                mask_token = mask_token_id if mask_token_id in required_input else gmask_token_id
                 if mask_token in required_input:
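For context, the padding hunks above (apparently from the tokenizer's padding override) build a 2D prefix-LM attention mask and position ids keyed on the bos and gMASK positions. Below is a self-contained numpy sketch of that construction; the token ids, the tril/prefix steps not visible in the hunks, and the True-means-masked convention are assumptions for illustration, not verified against the full file.

import numpy as np

# Illustrative ids; the real tokenizer's special-token ids may differ.
gmask_token_id, bos_token_id = 130001, 130004
required_input = [53, 6945, gmask_token_id, bos_token_id, 5, 8]
seq_length = len(required_input)

# Everything before the bos token is treated as the bidirectional prompt.
context_length = required_input.index(bos_token_id)

# 2D mask: full attention over the prompt, causal afterwards (assumed steps),
# then inverted so True marks positions to mask out, as in the hunk above.
attention_mask = np.ones((1, seq_length, seq_length))
attention_mask = np.tril(attention_mask)
attention_mask[:, :, :context_length] = 1
attention_mask = np.bool_(attention_mask < 0.5)

# Position ids: tokens after the prompt reuse the mask token's position.
position_ids = np.arange(seq_length, dtype=np.int64)
mask_token = gmask_token_id  # the fix falls back to gMASK when MASK is absent
if mask_token in required_input:
    mask_position = required_input.index(mask_token)
    position_ids[context_length:] = mask_position

print(attention_mask.shape)  # (1, 6, 6)
print(position_ids)          # [0 1 2 2 2 2] with these illustrative inputs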