NLP - Pretrained Models - 2018 - BERT - Walkthrough: BertForMaskedLM
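The script below uses BertForMaskedLM with a local bert-base-chinese checkpoint to score a Chinese sentence: each character is masked in turn, the model predicts the masked position, and the log-probability it assigns to the original character is accumulated into a sentence-level score, which is then converted into a (pseudo-)perplexity.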
import numpy as np
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForMaskedLM

with torch.no_grad():
    # Load the pre-trained tokenizer (vocabulary) and model (weights)
    tokenizer = BertTokenizer.from_pretrained(r'D:\Pretrained_Model\bert-base-chinese')
    model = BertForMaskedLM.from_pretrained(r'D:\Pretrained_Model\bert-base-chinese')
    model.eval()

    sentence = "我不会忘记和你一起奋斗的时光。"
    # The sentence is scored as-is, without adding [CLS]/[SEP]
    tokenize_input = tokenizer.tokenize(sentence)
    print('tokenize_input = ', tokenize_input)
    tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
    sen_len = len(tokenize_input)
    sentence_loss = 0.

    for idx, word in enumerate(tokenize_input):
        print('\n\n idx = {0}'.format(idx))
        # Replace the idx-th character of the sentence with [MASK]
        tokenize_input[idx] = '[MASK]'
        mask_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
        print('\t mask_input = {0}'.format(mask_input))
        output = model(mask_input)
        print('\n\t output = {0}'.format(output))
        # output[0] holds the logits, shape (batch_size, seq_len, vocab_size)
        prediction_scores = output[0]
        print('\n\t prediction_scores = output[0] = {0}'.format(prediction_scores))
        # Log-probabilities over the vocabulary at the masked position
        softmax = nn.Softmax(dim=0)
        ps = softmax(prediction_scores[0, idx]).log()
        print('\n\t ps = {0}'.format(ps))
        # Log-probability assigned to the original character
        word_loss = ps[tensor_input[0, idx]]
        print('\n\t word_loss = {0}'.format(word_loss))
        sentence_loss += word_loss.item()
        # Restore the original character before masking the next position
        tokenize_input[idx] = word

    # Pseudo-perplexity: exponentiated negative mean log-probability
    ppl = np.exp(-sentence_loss / sen_len)
    print("sentence_loss = {0};ppl = {1}".format(sentence_loss, ppl))
Output:
tokenize_input = ['我', '不', '会', '忘', '记', '和', '你', '一', '起', '奋', '斗', '的', '时', '光', '。']
idx = 0
mask_input = tensor([[ 103, 679, 833, 2563, 6381, 1469, 872, 671, 6629, 1939, 3159, 4638,
3198, 1045, 511]])
output = MaskedLMOutput(loss=None, logits=tensor([[[-10.0067, -9.9702, -10.3403, ..., -7.0367, -7.9918, -7.8884],
[ -8.9250, -8.6627, -8.8329, ..., -5.6988, -5.0543, -7.3196],
[-17.5815, -16.8282, -17.5551, ..., -11.2575, -8.4464, -15.8063],
...,
[-17.7271, -17.4097, -18.3814, ..., -12.5380, -14.9620, -13.0537],
[-14.8090, -15.5407, -14.8516, ..., -9.6344, -8.9355, -11.3215],
[-10.2498, -10.0447, -10.2479, ..., -5.7584, -4.9482, -5.1695]]]), hidden_states=None, attentions=None)
prediction_scores = output[0] = tensor([[[-10.0067, -9.9702, -10.3403, ..., -7.0367, -7.9918, -7.8884],
[ -8.9250, -8.6627, -8.8329, ..., -5.6988, -5.0543, -7.3196],
[-17.5815, -16.8282, -17.5551, ..., -11.2575, -8.4464, -15.8063],
...,
[-17.7271, -17.4097, -18.3814, ..., -12.5380, -14.9620, -13.0537],
[-14.8090, -15.5407, -14.8516, ..., -9.6344, -8.9355, -11.3215],
[-10.2498, -10.0447, -10.2479, ..., -5.7584, -4.9482, -5.1695]]])
ps = tensor([-20.2204, -20.1840, -20.5541, ..., -17.2505, -18.2055, -18.1022])
word_loss = -4.207489013671875
idx = 1
mask_input = tensor([[2769, 103, 833, 2563, 6381, 1469, 872, 671, 6629, 1939, 3159, 4638,
3198, 1045, 511]])
output = MaskedLMOutput(loss=None, logits=tensor([[[-14.2572, -13.5664, -13.7818, ..., -12.4880, -11.4941, -11.0043],
[-13.3572, -12.7593, -13.1295, ..., -8.9165, -6.9501, -8.0928],
[-18.3267, -17.2391, -16.6626, ..., -9.1351, -8.5136, -10.8610],
...,
[-16.8631, -15.9635, -16.3637, ..., -11.8876, -12.6025, -10.4363],
[-14.1836, -14.0044, -13.6275, ..., -9.0348, -10.7950, -9.2346],
[-16.2714, -15.7472, -15.5543, ..., -9.3256, -9.7824, -7.4806]]]), hidden_states=None, attentions=None)
prediction_scores = output[0] = tensor([[[-14.2572, -13.5664, -13.7818, ..., -12.4880, -11.4941, -11.0043],
[-13.3572, -12.7593, -13.1295, ..., -8.9165, -6.9501, -8.0928],
[-18.3267, -17.2391, -16.6626, ..., -9.1351, -8.5136, -10.8610],
...,
[-16.8631, -15.9635, -16.3637, ..., -11.8876, -12.6025, -10.4363],
[-14.1836, -14.0044, -13.6275, ..., -9.0348, -10.7950, -9.2346],
[-16.2714, -15.7472, -15.5543, ..., -9.3256, -9.7824, -7.4806]]])
ps = tensor([-27.0073, -26.4094, -26.7796, ..., -22.5666, -20.6002, -21.7429])
word_loss = -3.4179904460906982
idx = 2
mask_input = tensor([[2769, 679, 103, 2563, 6381, 1469, 872, 671, 6629, 1939, 3159, 4638,
3198, 1045, 511]])
output = MaskedLMOutput(loss=None, logits=tensor([[[-10.4260, -9.7421, -10.0949, ..., -9.1981, -9.3232, -9.0737],
[-11.1497, -10.3329, -10.3952, ..., -6.6423, -5.8855, -7.4425],
[-10.2441, -9.8596, -10.0538, ..., -6.8899, -6.3872, -7.1557],
...,
[-14.8344, -13.9255, -14.6416, ..., -11.8463, -11.3034, -9.4505],
[-13.0585, -12.7334, -12.5315, ..., -9.1430, -9.0249, -8.6625],
[-10.8999, -10.1885, -10.4381, ..., -6.9490, -6.5864, -5.2088]]]), hidden_states=None, attentions=None)
prediction_scores = output[0] = tensor([[[-10.4260, -9.7421, -10.0949, ..., -9.1981, -9.3232, -9.0737],
[-11.1497, -10.3329, -10.3952, ..., -6.6423, -5.8855, -7.4425],
[-10.2441, -9.8596, -10.0538, ..., -6.8899, -6.3872, -7.1557],
...,
[-14.8344, -13.9255, -14.6416, ..., -11.8463, -11.3034, -9.4505],
[-13.0585, -12.7334, -12.5315, ..., -9.1430, -9.0249, -8.6625],
[-10.8999, -10.1885, -10.4381, ..., -6.9490, -6.5864, -5.2088]]])
ps = tensor([-23.9556, -23.5712, -23.7654, ..., -20.6015, -20.0987, -20.8673])
word_loss = -3.0778353214263916
idx = 3
mask_input = tensor([[2769, 679, 833, 103, 6381, 1469, 872, 671, 6629, 1939, 3159, 4638,
3198, 1045, 511]])
output = MaskedLMOutput(loss=None, logits=tensor([[[-11.1854, -10.8186, -10.8980, ..., -10.0304, -6.8312, -10.1228],
[-18.3292, -17.1635, -18.1168, ..., -12.8976, -6.5055, -10.3133],
[-18.9977, -17.6461, -18.6712, ..., -12.0834, -9.4692, -13.3222],
...,
[-15.9868, -15.1038, -15.7956, ..., -11.8385, -8.8921, -11.2440],
[-13.2753, -13.0012, -12.8868, ..., -8.5294, -7.7151, -9.6861],
[-14.0791, -13.6179, -13.8650, ..., -9.9380, -8.0259, -6.6505]]]), hidden_states=None, attentions=None)
prediction_scores = output[0] = tensor([[[-11.1854, -10.8186, -10.8980, ..., -10.0304, -6.8312, -10.1228],
[-18.3292, -17.1635, -18.1168, ..., -12.8976, -6.5055, -10.3133],
[-18.9977, -17.6461, -18.6712, ..., -12.0834, -9.4692, -13.3222],
...,
[-15.9868, -15.1038, -15.7956, ..., -11.8385, -8.8921, -11.2440],
[-13.2753, -13.0012, -12.8868, ..., -8.5294, -7.7151, -9.6861],
[-14.0791, -13.6179, -13.8650, ..., -9.9380, -8.0259, -6.6505]]])
ps = tensor([-28.6803, -28.3364, -28.7086, ..., -26.4609, -23.3448, -25.8600])
word_loss = -0.024608036503195763
idx = 4
mask_input = tensor([[2769, 679, 833, 2563, 103, 1469, 872, 671, 6629, 1939, 3159, 4638,
3198, 1045, 511]])
output = MaskedLMOutput(loss=None, logits=tensor([[[-11.6949, -11.0920, -11.4218, ..., -10.2302, -9.3920, -10.9836],
[-18.6331, -17.9585, -18.3607, ..., -12.7316, -10.2360, -14.0741],
[-19.6247, -18.4559, -19.2653, ..., -12.6368, -11.0657, -15.6243],
...,
[-15.9810, -15.1353, -15.9852, ..., -12.4308, -12.2341, -10.9428],
[-13.4082, -13.1908, -13.3454, ..., -10.0117, -10.6251, -10.7604],
[-13.8807, -13.1495, -13.6315, ..., -9.3678, -9.9106, -7.1275]]]), hidden_states=None, attentions=None)
prediction_scores = output[0] = tensor([[[-11.6949, -11.0920, -11.4218, ..., -10.2302, -9.3920, -10.9836],
[-18.6331, -17.9585, -18.3607, ..., -12.7316, -10.2360, -14.0741],
[-19.6247, -18.4559, -19.2653, ..., -12.6368, -11.0657, -15.6243],
...,
[-15.9810, -15.1353, -15.9852, ..., -12.4308, -12.2341, -10.9428],
[-13.4082, -13.1908, -13.3454, ..., -10.0117, -10.6251, -10.7604],
[-13.8807, -13.1495, -13.6315, ..., -9.3678, -9.9106, -7.1275]]])
ps = tensor([-30.6680, -30.0711, -30.5083, ..., -28.1964, -25.7133, -29.4577])
word_loss = -0.021782301366329193
idx = 5
mask_input = tensor([[2769, 679, 833, 2563, 6381, 103, 872, 671, 6629, 1939, 3159, 4638,
3198, 1045, 511]])
output = MaskedLMOutput(loss=None, logits=tensor([[[-10.8215, -10.1308, -10.5400, ..., -9.4374, -9.1841, -9.7690],
[-16.6464, -15.7021, -16.0986, ..., -9.1416, -7.5447, -8.9926],
[-18.4551, -17.0224, -17.3103, ..., -8.7594, -8.8654, -10.6732],
...,
[-14.8322, -13.5759, -14.5636, ..., -10.8961, -10.6665, -8.9241],
[-12.3797, -11.8117, -11.9058, ..., -8.7238, -9.1733, -9.1059],
[-12.6140, -11.4767, -11.6919, ..., -8.0748, -9.4955, -5.7950]]]), hidden_states=None, attentions=None)
prediction_scores = output[0] = tensor([[[-10.8215, -10.1308, -10.5400, ..., -9.4374, -9.1841, -9.7690],
[-16.6464, -15.7021, -16.0986, ..., -9.1416, -7.5447, -8.9926],
[-18.4551, -17.0224, -17.3103, ..., -8.7594, -8.8654, -10.6732],
...,
[-14.8322, -13.5759, -14.5636, ..., -10.8961, -10.6665, -8.9241],
[-12.3797, -11.8117, -11.9058, ..., -8.7238, -9.1733, -9.1059],
[-12.6140, -11.4767, -11.6919, ..., -8.0748, -9.4955, -5.7950]]])
ps = tensor([-20.0339, -19.5133, -20.0343, ..., -17.0866, -17.4351, -15.4161])
word_loss = -2.464529037475586
idx = 6
mask_input = tensor([[2769, 679, 833, 2563, 6381, 1469, 103, 671, 6629, 1939, 3159, 4638,
3198, 1045, 511]])
output = MaskedLMOutput(loss=None, logits=tensor([[[-11.6927, -10.7244, -10.8993, ..., -8.0539, -8.4719, -9.0431],
[-14.6502, -14.0066, -14.4193, ..., -7.7190, -5.6522, -8.8189],
[-17.8192, -16.0978, -17.0802, ..., -8.5008, -7.9125, -11.5379],
...,
[-15.0797, -14.0576, -14.8092, ..., -10.5593, -11.1677, -9.6744],
[-12.6444, -12.2899, -12.1446, ..., -8.7772, -9.4889, -9.6838],
[-11.8326, -11.0902, -11.1104, ..., -7.6406, -8.1461, -6.2924]]]), hidden_states=None, attentions=None)
prediction_scores = output[0] = tensor([[[-11.6927, -10.7244, -10.8993, ..., -8.0539, -8.4719, -9.0431],
[-14.6502, -14.0066, -14.4193, ..., -7.7190, -5.6522, -8.8189],
[-17.8192, -16.0978, -17.0802, ..., -8.5008, -7.9125, -11.5379],
...,
[-15.0797, -14.0576, -14.8092, ..., -10.5593, -11.1677, -9.6744],
[-12.6444, -12.2899, -12.1446, ..., -8.7772, -9.4889, -9.6838],
[-11.8326, -11.0902, -11.1104, ..., -7.6406, -8.1461, -6.2924]]])
ps = tensor([-17.8420, -17.7343, -17.7814, ..., -15.6324, -16.8942, -15.6699])
word_loss = -3.217534065246582
idx = 7
mask_input = tensor([[2769, 679, 833, 2563, 6381, 1469, 872, 103, 6629, 1939, 3159, 4638,
3198, 1045, 511]])
output = MaskedLMOutput(loss=None, logits=tensor([[[-12.0652, -11.1372, -11.7658, ..., -10.3255, -9.5978, -10.2930],
[-17.4623, -16.3227, -17.0211, ..., -10.0448, -8.8320, -11.6701],
[-19.7825, -18.2467, -18.9617, ..., -10.4417, -10.0575, -13.2705],
...,
[-16.7194, -15.7009, -16.5568, ..., -11.9396, -12.9538, -9.1279],
[-14.1858, -13.9772, -14.0763, ..., -9.9030, -10.4625, -8.7678],
[-14.0998, -13.0324, -13.3418, ..., -8.7676, -10.0443, -6.4476]]]), hidden_states=None, attentions=None)
prediction_scores = output[0] = tensor([[[-12.0652, -11.1372, -11.7658, ..., -10.3255, -9.5978, -10.2930],
[-17.4623, -16.3227, -17.0211, ..., -10.0448, -8.8320, -11.6701],
[-19.7825, -18.2467, -18.9617, ..., -10.4417, -10.0575, -13.2705],
...,
[-16.7194, -15.7009, -16.5568, ..., -11.9396, -12.9538, -9.1279],
[-14.1858, -13.9772, -14.0763, ..., -9.9030, -10.4625, -8.7678],
[-14.0998, -13.0324, -13.3418, ..., -8.7676, -10.0443, -6.4476]]])
ps = tensor([-29.0154, -28.9152, -28.5686, ..., -23.7333, -25.4041, -24.8862])
word_loss = -0.006231430917978287
idx = 8
mask_input = tensor([[2769, 679, 833, 2563, 6381, 1469, 872, 671, 103, 1939, 3159, 4638,
3198, 1045, 511]])
output = MaskedLMOutput(loss=None, logits=tensor([[[-12.3327, -11.7290, -12.1774, ..., -10.6400, -9.2812, -10.8762],
[-17.4025, -16.3325, -17.3093, ..., -9.6641, -8.0054, -10.9477],
[-19.8157, -18.1812, -19.2325, ..., -10.3199, -9.6911, -13.2068],
...,
[-15.4990, -14.1986, -15.4210, ..., -10.8605, -11.1951, -9.2175],
[-13.5214, -13.1154, -13.2580, ..., -9.1551, -8.5442, -8.5556],
[-13.9661, -12.7296, -13.4830, ..., -7.9905, -9.4974, -5.5795]]]), hidden_states=None, attentions=None)
prediction_scores = output[0] = tensor([[[-12.3327, -11.7290, -12.1774, ..., -10.6400, -9.2812, -10.8762],
[-17.4025, -16.3325, -17.3093, ..., -9.6641, -8.0054, -10.9477],
[-19.8157, -18.1812, -19.2325, ..., -10.3199, -9.6911, -13.2068],
...,
[-15.4990, -14.1986, -15.4210, ..., -10.8605, -11.1951, -9.2175],
[-13.5214, -13.1154, -13.2580, ..., -9.1551, -8.5442, -8.5556],
[-13.9661, -12.7296, -13.4830, ..., -7.9905, -9.4974, -5.5795]]])
ps = tensor([-26.1031, -25.4673, -25.6910, ..., -23.7415, -24.6235, -23.6001])
word_loss = -0.4470815658569336
idx = 9
mask_input = tensor([[2769, 679, 833, 2563, 6381, 1469, 872, 671, 6629, 103, 3159, 4638,
3198, 1045, 511]])
output = MaskedLMOutput(loss=None, logits=tensor([[[-12.9872, -12.3978, -12.9848, ..., -11.8125, -12.0875, -12.0079],
[-17.5210, -16.8555, -17.3870, ..., -10.8851, -9.9333, -12.9947],
[-19.9390, -18.8892, -19.5466, ..., -12.1456, -11.2809, -13.7224],
...,
[-14.5711, -13.7166, -14.6204, ..., -10.1978, -11.9384, -9.0040],
[-13.0610, -12.8815, -12.9802, ..., -9.4830, -9.4141, -10.5692],
[-14.2910, -13.4047, -14.0815, ..., -8.9679, -11.1636, -6.8003]]]), hidden_states=None, attentions=None)
prediction_scores = output[0] = tensor([[[-12.9872, -12.3978, -12.9848, ..., -11.8125, -12.0875, -12.0079],
[-17.5210, -16.8555, -17.3870, ..., -10.8851, -9.9333, -12.9947],
[-19.9390, -18.8892, -19.5466, ..., -12.1456, -11.2809, -13.7224],
...,
[-14.5711, -13.7166, -14.6204, ..., -10.1978, -11.9384, -9.0040],
[-13.0610, -12.8815, -12.9802, ..., -9.4830, -9.4141, -10.5692],
[-14.2910, -13.4047, -14.0815, ..., -8.9679, -11.1636, -6.8003]]])
ps = tensor([-23.7871, -23.3177, -23.7469, ..., -19.6898, -21.1636, -19.3445])
word_loss = -1.905866265296936
idx = 10
mask_input = tensor([[2769, 679, 833, 2563, 6381, 1469, 872, 671, 6629, 1939, 103, 4638,
3198, 1045, 511]])
output = MaskedLMOutput(loss=None, logits=tensor([[[-10.2435, -9.4682, -9.9029, ..., -8.6173, -7.9944, -9.5463],
[-14.7155, -14.1531, -14.7035, ..., -7.7060, -7.0066, -8.7167],
[-17.8262, -16.8357, -17.2724, ..., -9.3416, -9.6015, -11.3678],
...,
[-13.5025, -12.6059, -13.4680, ..., -9.6887, -10.2040, -7.5718],
[-11.8572, -11.8200, -11.6956, ..., -8.0838, -8.2098, -7.9838],
[-11.4906, -10.7753, -11.1489, ..., -6.4764, -8.7700, -4.7994]]]), hidden_states=None, attentions=None)
prediction_scores = output[0] = tensor([[[-10.2435, -9.4682, -9.9029, ..., -8.6173, -7.9944, -9.5463],
[-14.7155, -14.1531, -14.7035, ..., -7.7060, -7.0066, -8.7167],
[-17.8262, -16.8357, -17.2724, ..., -9.3416, -9.6015, -11.3678],
...,
[-13.5025, -12.6059, -13.4680, ..., -9.6887, -10.2040, -7.5718],
[-11.8572, -11.8200, -11.6956, ..., -8.0838, -8.2098, -7.9838],
[-11.4906, -10.7753, -11.1489, ..., -6.4764, -8.7700, -4.7994]]])
ps = tensor([-23.3028, -23.2676, -24.0384, ..., -20.8967, -21.3373, -20.7125])
word_loss = -0.3310864269733429
idx = 11
mask_input = tensor([[2769, 679, 833, 2563, 6381, 1469, 872, 671, 6629, 1939, 3159, 103,
3198, 1045, 511]])
output = MaskedLMOutput(loss=None, logits=tensor([[[-11.1587, -10.6746, -11.6326, ..., -9.9938, -8.8795, -10.4635],
[-14.6800, -14.1649, -14.6931, ..., -7.1853, -6.1263, -11.4231],
[-17.5996, -16.4610, -17.1693, ..., -7.9229, -7.0681, -13.4018],
...,
[-13.6107, -12.4848, -13.5183, ..., -9.4305, -9.1442, -7.4951],
[-11.5701, -11.2959, -11.3109, ..., -7.2745, -7.0823, -8.4521],
[-13.5606, -12.9446, -13.3137, ..., -8.5220, -9.7932, -7.7482]]]), hidden_states=None, attentions=None)
prediction_scores = output[0] = tensor([[[-11.1587, -10.6746, -11.6326, ..., -9.9938, -8.8795, -10.4635],
[-14.6800, -14.1649, -14.6931, ..., -7.1853, -6.1263, -11.4231],
[-17.5996, -16.4610, -17.1693, ..., -7.9229, -7.0681, -13.4018],
...,
[-13.6107, -12.4848, -13.5183, ..., -9.4305, -9.1442, -7.4951],
[-11.5701, -11.2959, -11.3109, ..., -7.2745, -7.0823, -8.4521],
[-13.5606, -12.9446, -13.3137, ..., -8.5220, -9.7932, -7.7482]]])
ps = tensor([-24.5581, -24.6442, -24.8213, ..., -21.0443, -21.8916, -20.5020])
word_loss = -0.0409548319876194
idx = 12
mask_input = tensor([[2769, 679, 833, 2563, 6381, 1469, 872, 671, 6629, 1939, 3159, 4638,
103, 1045, 511]])
output = MaskedLMOutput(loss=None, logits=tensor([[[-11.2262, -11.1173, -11.6287, ..., -10.3565, -9.4391, -11.4225],
[-15.2089, -14.6585, -15.3925, ..., -7.9105, -6.7598, -10.2716],
[-17.7514, -16.8604, -17.4242, ..., -8.0904, -8.6169, -12.3799],
...,
[-11.9148, -11.5928, -12.1447, ..., -7.0739, -9.0568, -7.8991],
[-10.9299, -10.9160, -10.9438, ..., -5.0096, -7.1774, -7.3603],
[-14.6292, -14.3548, -14.3348, ..., -6.8946, -10.3034, -8.7604]]]), hidden_states=None, attentions=None)
prediction_scores = output[0] = tensor([[[-11.2262, -11.1173, -11.6287, ..., -10.3565, -9.4391, -11.4225],
[-15.2089, -14.6585, -15.3925, ..., -7.9105, -6.7598, -10.2716],
[-17.7514, -16.8604, -17.4242, ..., -8.0904, -8.6169, -12.3799],
...,
[-11.9148, -11.5928, -12.1447, ..., -7.0739, -9.0568, -7.8991],
[-10.9299, -10.9160, -10.9438, ..., -5.0096, -7.1774, -7.3603],
[-14.6292, -14.3548, -14.3348, ..., -6.8946, -10.3034, -8.7604]]])
ps = tensor([-26.6420, -26.3200, -26.8719, ..., -21.8011, -23.7840, -22.6264])
word_loss = -0.2741313576698303
idx = 13
mask_input = tensor([[2769, 679, 833, 2563, 6381, 1469, 872, 671, 6629, 1939, 3159, 4638,
3198, 103, 511]])
output = MaskedLMOutput(loss=None, logits=tensor([[[-12.9708, -12.1014, -12.6502, ..., -10.6858, -10.9495, -11.6393],
[-17.4693, -16.4352, -17.2923, ..., -10.1345, -9.2979, -12.3043],
[-19.2976, -17.8839, -18.8252, ..., -11.4233, -10.9146, -13.9556],
...,
[-14.2439, -13.8837, -14.3827, ..., -10.8131, -9.7626, -10.4449],
[-11.0731, -11.4156, -11.2104, ..., -8.5579, -9.0104, -8.7935],
[-13.5802, -13.1632, -13.3280, ..., -9.2640, -10.9600, -8.3216]]]), hidden_states=None, attentions=None)
prediction_scores = output[0] = tensor([[[-12.9708, -12.1014, -12.6502, ..., -10.6858, -10.9495, -11.6393],
[-17.4693, -16.4352, -17.2923, ..., -10.1345, -9.2979, -12.3043],
[-19.2976, -17.8839, -18.8252, ..., -11.4233, -10.9146, -13.9556],
...,
[-14.2439, -13.8837, -14.3827, ..., -10.8131, -9.7626, -10.4449],
[-11.0731, -11.4156, -11.2104, ..., -8.5579, -9.0104, -8.7935],
[-13.5802, -13.1632, -13.3280, ..., -9.2640, -10.9600, -8.3216]]])
ps = tensor([-26.7180, -27.0605, -26.8553, ..., -24.2028, -24.6553, -24.4384])
word_loss = -2.3570048809051514
idx = 14
mask_input = tensor([[2769, 679, 833, 2563, 6381, 1469, 872, 671, 6629, 1939, 3159, 4638,
3198, 1045, 103]])
output = MaskedLMOutput(loss=None, logits=tensor([[[-11.3928, -10.5590, -11.3276, ..., -8.2870, -7.0281, -9.3417],
[-15.1979, -14.2848, -14.9167, ..., -8.0477, -3.3199, -9.9085],
[-16.9765, -15.7591, -16.4064, ..., -7.3844, -3.6073, -10.5002],
...,
[-14.3350, -13.5203, -14.7181, ..., -9.1939, -8.4368, -6.3008],
[-11.5855, -11.6669, -11.5224, ..., -6.1303, -7.0456, -5.4713],
[ -9.3767, -9.1142, -9.3964, ..., -5.2297, -5.3290, -3.2478]]]), hidden_states=None, attentions=None)
prediction_scores = output[0] = tensor([[[-11.3928, -10.5590, -11.3276, ..., -8.2870, -7.0281, -9.3417],
[-15.1979, -14.2848, -14.9167, ..., -8.0477, -3.3199, -9.9085],
[-16.9765, -15.7591, -16.4064, ..., -7.3844, -3.6073, -10.5002],
...,
[-14.3350, -13.5203, -14.7181, ..., -9.1939, -8.4368, -6.3008],
[-11.5855, -11.6669, -11.5224, ..., -6.1303, -7.0456, -5.4713],
[ -9.3767, -9.1142, -9.3964, ..., -5.2297, -5.3290, -3.2478]]])
ps = tensor([-20.6789, -20.4164, -20.6986, ..., -16.5319, -16.6312, -14.5500])
word_loss = -1.3718788623809814
sentence_loss = -23.16600384376943;ppl = 4.685160888290345
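As a sanity check, the fifteen word_loss values above sum to -23.166, and exp(23.166 / 15) ≈ 4.685, matching the printed ppl. Note how characters that complete a common word, such as 记 in 忘记 (idx = 4) or 一 in 一起 (idx = 7), incur near-zero loss, while the sentence-initial 我 (idx = 0) is the hardest to predict.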