Command that produces this log: python train.py --model_name coref --xlmr_model_name xlm-roberta-large --batch_size 16 --xlmr_learning_rate 2e-5 --accumulate_step 4 --max_epoch 20 --event_hidden_num 500 --p1_data_weight 0.2 --learning_rate 9e-4
----------------------------------------------------------------------------------------------------
> trainable params:
>>> xlmr.embeddings.word_embeddings.weight: torch.Size([250002, 1024])
>>> xlmr.embeddings.position_embeddings.weight: torch.Size([514, 1024])
>>> xlmr.embeddings.token_type_embeddings.weight: torch.Size([1, 1024])
>>> xlmr.embeddings.LayerNorm.weight: torch.Size([1024])
>>> xlmr.embeddings.LayerNorm.bias: torch.Size([1024])
>>> xlmr.encoder.layer.{0-23}.attention.self.query.weight: torch.Size([1024, 1024])
>>> xlmr.encoder.layer.{0-23}.attention.self.query.bias: torch.Size([1024])
>>> xlmr.encoder.layer.{0-23}.attention.self.key.weight: torch.Size([1024, 1024])
>>> xlmr.encoder.layer.{0-23}.attention.self.key.bias: torch.Size([1024])
>>> xlmr.encoder.layer.{0-23}.attention.self.value.weight: torch.Size([1024, 1024])
>>> xlmr.encoder.layer.{0-23}.attention.self.value.bias: torch.Size([1024])
>>> xlmr.encoder.layer.{0-23}.attention.output.dense.weight: torch.Size([1024, 1024])
>>> xlmr.encoder.layer.{0-23}.attention.output.dense.bias: torch.Size([1024])
>>> xlmr.encoder.layer.{0-23}.attention.output.LayerNorm.weight: torch.Size([1024])
>>> xlmr.encoder.layer.{0-23}.attention.output.LayerNorm.bias: torch.Size([1024])
>>> xlmr.encoder.layer.{0-23}.intermediate.dense.weight: torch.Size([4096, 1024])
>>> xlmr.encoder.layer.{0-23}.intermediate.dense.bias: torch.Size([4096])
>>> xlmr.encoder.layer.{0-23}.output.dense.weight: torch.Size([1024, 4096])
>>> xlmr.encoder.layer.{0-23}.output.dense.bias: torch.Size([1024])
>>> xlmr.encoder.layer.{0-23}.output.LayerNorm.weight: torch.Size([1024])
>>> xlmr.encoder.layer.{0-23}.output.LayerNorm.bias: torch.Size([1024])
(each of the 24 encoder layers, 0 through 23, carries one copy of the 16 parameters above with identical shapes)
>>> xlmr.pooler.dense.weight: torch.Size([1024, 1024])
>>> xlmr.pooler.dense.bias: torch.Size([1024])
>>> type_embedding.weight: torch.Size([122, 100])
>>> trans_rep.weight: torch.Size([1024, 1124])
>>> trans_rep.bias: torch.Size([1024])
>>> coref_type_ffn.weight: torch.Size([3, 4096])
>>> coref_type_ffn.bias: torch.Size([3])
n_trainable_params: 561066923, n_nontrainable_params: 0
----------------------------------------------------------------------------------------------------
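A summary like the one above (per-parameter shapes plus the n_trainable_params / n_nontrainable_params totals) is usually produced by iterating over the model's parameters. The sketch below is illustrative only: it assumes a PyTorch nn.Module held in `model`, and the helper name `summarize_trainable_params` is not taken from train.py.

    import torch.nn as nn

    def summarize_trainable_params(model: nn.Module) -> None:
        # Print each trainable parameter with its shape, then the overall counts,
        # mirroring the "> trainable params:" block in this log.
        n_trainable, n_nontrainable = 0, 0
        print("> trainable params:")
        for name, param in model.named_parameters():
            if param.requires_grad:
                print(f">>> {name}: {param.size()}")   # torch.Size prints as torch.Size([...])
                n_trainable += param.numel()
            else:
                n_nontrainable += param.numel()
        print(f"n_trainable_params: {n_trainable}, n_nontrainable_params: {n_nontrainable}")

Summing the listed shapes reproduces the reported total: 256,531,456 embedding parameters, 24 x 12,596,224 encoder-layer parameters, 1,049,600 pooler parameters, and 1,176,491 task-head parameters (type_embedding, trans_rep, coref_type_ffn) give 561,066,923 trainable parameters.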
******************************
Epoch: 0
command: python train.py --model_name coref --xlmr_model_name xlm-roberta-large --batch_size 16 --xlmr_learning_rate 2e-5 --accumulate_step 4 --max_epoch 20 --event_hidden_num 500 --p1_data_weight 0.2 --learning_rate 9e-4
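The command line passes two learning rates (--xlmr_learning_rate 2e-5 and --learning_rate 9e-4) together with --accumulate_step 4, which suggests separate optimizer parameter groups for the pretrained encoder and the task layers, plus gradient accumulation. The following is only a sketch of that common pattern under those assumptions; the optimizer choice (AdamW) and the names `build_optimizer`, `train_epoch`, `batches`, and `loss_fn` are illustrative, not code from train.py.

    from torch.optim import AdamW

    def build_optimizer(model, xlmr_lr=2e-5, lr=9e-4):
        # Two parameter groups: the pretrained XLM-R encoder (names prefixed "xlmr.")
        # at a small learning rate, all task-specific parameters at a larger one.
        xlmr_params = [p for n, p in model.named_parameters() if n.startswith("xlmr.")]
        task_params = [p for n, p in model.named_parameters() if not n.startswith("xlmr.")]
        return AdamW([{"params": xlmr_params, "lr": xlmr_lr},
                      {"params": task_params, "lr": lr}])

    def train_epoch(model, batches, optimizer, loss_fn, accumulate_step=4):
        # Gradient accumulation: scale each batch loss by 1/accumulate_step and
        # apply one optimizer update per accumulate_step batches.
        optimizer.zero_grad()
        for i, batch in enumerate(batches, start=1):
            loss = loss_fn(model, batch)
            (loss / accumulate_step).backward()
            if i % accumulate_step == 0:
                optimizer.step()
                optimizer.zero_grad()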
2023-01-22 22:10:20.602893: step: 4/533, loss: 0.01594550721347332 2023-01-22 22:10:21.696604: step: 8/533, loss: 0.01623973436653614 2023-01-22 22:10:22.764475: step: 12/533, loss: 0.025600062683224678 2023-01-22 22:10:23.839669: step: 16/533, loss: 0.04417184367775917 2023-01-22 22:10:24.927601: step: 20/533, loss: 0.06865813583135605 2023-01-22 22:10:25.999465: step: 24/533, loss: 0.04775005206465721 2023-01-22 22:10:27.085004: step: 28/533, loss: 0.01723719947040081 2023-01-22 22:10:28.186432: step: 32/533, loss: 0.020858611911535263 2023-01-22 22:10:29.276873: step: 36/533, loss: 0.0168582946062088 2023-01-22 22:10:30.379190: step: 40/533, loss: 0.016195565462112427 2023-01-22 22:10:31.506846: step: 44/533, loss: 0.013944416306912899 2023-01-22 22:10:32.586770: step: 48/533, loss: 0.01876860111951828 2023-01-22 22:10:33.655199: step: 52/533, loss: 0.06417802721261978 2023-01-22 22:10:34.755793: step: 56/533, loss: 0.03598829358816147 2023-01-22 22:10:35.845505: step: 60/533, loss: 0.030301397666335106 2023-01-22 22:10:36.944992: step: 64/533, loss: 0.01978672668337822 2023-01-22 22:10:38.019406: step: 68/533, loss: 0.05278826877474785 2023-01-22 22:10:39.095406: step: 72/533, loss: 0.006483330857008696 2023-01-22 22:10:40.175538: step: 76/533, loss: 0.048134855926036835 2023-01-22 22:10:41.261543: step: 80/533, loss: 0.019971700385212898 2023-01-22 22:10:42.368310: step: 84/533, loss: 0.014037557877600193 2023-01-22 22:10:43.460124: step: 88/533, loss: 0.05830860882997513 2023-01-22 22:10:44.563093: step: 92/533, loss: 0.053832344710826874 2023-01-22 22:10:45.642036: step: 96/533, loss: 0.025877904146909714 2023-01-22 22:10:46.738231: step: 100/533, loss: 0.010820673778653145 2023-01-22 22:10:47.818828: step: 104/533, loss: 0.02344861999154091 2023-01-22 22:10:48.895595: step: 108/533, loss: 0.02452809363603592 2023-01-22 22:10:49.993745: step: 112/533, loss: 0.026574421674013138 2023-01-22 22:10:51.067783: step: 116/533, loss: 0.018311357125639915 2023-01-22 22:10:52.168884: step: 120/533, loss:
0.01614903286099434 2023-01-22 22:10:53.251398: step: 124/533, loss: 0.014634843915700912 2023-01-22 22:10:54.336299: step: 128/533, loss: 0.013529905118048191 2023-01-22 22:10:55.429938: step: 132/533, loss: 0.057390667498111725 2023-01-22 22:10:56.527539: step: 136/533, loss: 0.01302299927920103 2023-01-22 22:10:57.619331: step: 140/533, loss: 0.015509441494941711 2023-01-22 22:10:58.701621: step: 144/533, loss: 0.01691310480237007 2023-01-22 22:10:59.784135: step: 148/533, loss: 0.0 2023-01-22 22:11:00.864182: step: 152/533, loss: 0.037468478083610535 2023-01-22 22:11:01.933009: step: 156/533, loss: 0.016779234632849693 2023-01-22 22:11:02.998046: step: 160/533, loss: 0.01217052061110735 2023-01-22 22:11:04.103144: step: 164/533, loss: 0.011917327530682087 2023-01-22 22:11:05.194177: step: 168/533, loss: 0.01307509746402502 2023-01-22 22:11:06.283111: step: 172/533, loss: 0.03886246681213379 2023-01-22 22:11:07.379080: step: 176/533, loss: 0.026157686486840248 2023-01-22 22:11:08.492202: step: 180/533, loss: 0.016990577802062035 2023-01-22 22:11:09.570557: step: 184/533, loss: 0.014245955273509026 2023-01-22 22:11:10.655880: step: 188/533, loss: 0.013696500100195408 2023-01-22 22:11:11.781775: step: 192/533, loss: 0.08603041619062424 2023-01-22 22:11:12.905529: step: 196/533, loss: 0.03828587755560875 2023-01-22 22:11:14.001438: step: 200/533, loss: 0.018191803246736526 2023-01-22 22:11:15.101838: step: 204/533, loss: 0.06008240580558777 2023-01-22 22:11:16.187877: step: 208/533, loss: 0.05548308417201042 2023-01-22 22:11:17.294824: step: 212/533, loss: 0.024112559854984283 2023-01-22 22:11:18.394291: step: 216/533, loss: 0.01636626571416855 2023-01-22 22:11:19.509411: step: 220/533, loss: 0.018532846122980118 2023-01-22 22:11:20.594620: step: 224/533, loss: 0.02115771174430847 2023-01-22 22:11:21.708429: step: 228/533, loss: 0.03492189571261406 2023-01-22 22:11:22.796627: step: 232/533, loss: 0.007193278521299362 2023-01-22 22:11:23.897686: step: 236/533, loss: 0.009550342336297035 2023-01-22 22:11:24.983016: step: 240/533, loss: 0.06778249889612198 2023-01-22 22:11:26.058202: step: 244/533, loss: 0.02128215692937374 2023-01-22 22:11:27.137110: step: 248/533, loss: 0.011629464104771614 2023-01-22 22:11:28.244467: step: 252/533, loss: 0.01152625773102045 2023-01-22 22:11:29.346359: step: 256/533, loss: 0.030918773263692856 2023-01-22 22:11:30.441286: step: 260/533, loss: 0.01586327701807022 2023-01-22 22:11:31.517210: step: 264/533, loss: 0.017985232174396515 2023-01-22 22:11:32.606215: step: 268/533, loss: 0.016134044155478477 2023-01-22 22:11:33.698710: step: 272/533, loss: 0.017573989927768707 2023-01-22 22:11:34.796302: step: 276/533, loss: 0.12894156575202942 2023-01-22 22:11:35.913001: step: 280/533, loss: 0.013384195044636726 2023-01-22 22:11:37.000427: step: 284/533, loss: 0.03833853453397751 2023-01-22 22:11:38.102632: step: 288/533, loss: 0.03186187893152237 2023-01-22 22:11:39.211061: step: 292/533, loss: 0.0245597492903471 2023-01-22 22:11:40.294042: step: 296/533, loss: 0.015201129950582981 2023-01-22 22:11:41.386374: step: 300/533, loss: 0.0454351082444191 2023-01-22 22:11:42.491613: step: 304/533, loss: 0.058753274381160736 2023-01-22 22:11:43.609564: step: 308/533, loss: 0.05572102591395378 2023-01-22 22:11:44.713350: step: 312/533, loss: 0.018142374232411385 2023-01-22 22:11:45.814130: step: 316/533, loss: 0.012502665631473064 2023-01-22 22:11:46.909955: step: 320/533, loss: 0.06353705376386642 2023-01-22 22:11:48.005032: step: 324/533, loss: 0.07839985936880112 
2023-01-22 22:11:49.087807: step: 328/533, loss: 0.049963563680648804 2023-01-22 22:11:50.187623: step: 332/533, loss: 0.012806715443730354 2023-01-22 22:11:51.280208: step: 336/533, loss: 0.009461612440645695 2023-01-22 22:11:52.376456: step: 340/533, loss: 0.013769224286079407 2023-01-22 22:11:53.476888: step: 344/533, loss: 0.03596681356430054 2023-01-22 22:11:54.576029: step: 348/533, loss: 0.013126334175467491 2023-01-22 22:11:55.679597: step: 352/533, loss: 0.04650581628084183 2023-01-22 22:11:56.784403: step: 356/533, loss: 0.01901560090482235 2023-01-22 22:11:57.889579: step: 360/533, loss: 0.019694849848747253 2023-01-22 22:11:58.974488: step: 364/533, loss: 0.01667892560362816 2023-01-22 22:12:00.067645: step: 368/533, loss: 0.014500715769827366 2023-01-22 22:12:01.181015: step: 372/533, loss: 0.07043975591659546 2023-01-22 22:12:02.284289: step: 376/533, loss: 0.012630096636712551 2023-01-22 22:12:03.383069: step: 380/533, loss: 0.05359623581171036 2023-01-22 22:12:04.468671: step: 384/533, loss: 0.010712051764130592 2023-01-22 22:12:05.553544: step: 388/533, loss: 0.013953730463981628 2023-01-22 22:12:06.646073: step: 392/533, loss: 0.049763984978199005 2023-01-22 22:12:07.724496: step: 396/533, loss: 0.023058507591485977 2023-01-22 22:12:08.836907: step: 400/533, loss: 0.0220797136425972 2023-01-22 22:12:09.922900: step: 404/533, loss: 0.03378187492489815 2023-01-22 22:12:11.007619: step: 408/533, loss: 0.04932564124464989 2023-01-22 22:12:12.110402: step: 412/533, loss: 0.04368259385228157 2023-01-22 22:12:13.214789: step: 416/533, loss: 0.01589009538292885 2023-01-22 22:12:14.295028: step: 420/533, loss: 0.013841111212968826 2023-01-22 22:12:15.389690: step: 424/533, loss: 0.007664002478122711 2023-01-22 22:12:16.490981: step: 428/533, loss: 0.008571386337280273 2023-01-22 22:12:17.584366: step: 432/533, loss: 0.03587321564555168 2023-01-22 22:12:18.656337: step: 436/533, loss: 0.05509749799966812 2023-01-22 22:12:19.761537: step: 440/533, loss: 0.013623368926346302 2023-01-22 22:12:20.851198: step: 444/533, loss: 0.00893553625792265 2023-01-22 22:12:21.939169: step: 448/533, loss: 0.04643327742815018 2023-01-22 22:12:23.062061: step: 452/533, loss: 0.009403081610798836 2023-01-22 22:12:24.164144: step: 456/533, loss: 0.0585842989385128 2023-01-22 22:12:25.289979: step: 460/533, loss: 0.014009098522365093 2023-01-22 22:12:26.389652: step: 464/533, loss: 0.014628607779741287 2023-01-22 22:12:27.480247: step: 468/533, loss: 0.008291634730994701 2023-01-22 22:12:28.567738: step: 472/533, loss: 0.011228042654693127 2023-01-22 22:12:29.650623: step: 476/533, loss: 0.024613995105028152 2023-01-22 22:12:30.743329: step: 480/533, loss: 0.01312523614615202 2023-01-22 22:12:31.824858: step: 484/533, loss: 0.01068204641342163 2023-01-22 22:12:32.963599: step: 488/533, loss: 0.0446341447532177 2023-01-22 22:12:34.047589: step: 492/533, loss: 0.009662832133471966 2023-01-22 22:12:35.116506: step: 496/533, loss: 0.022391293197870255 2023-01-22 22:12:36.205586: step: 500/533, loss: 0.01502079889178276 2023-01-22 22:12:37.304206: step: 504/533, loss: 0.014822776429355145 2023-01-22 22:12:38.407313: step: 508/533, loss: 0.010265301913022995 2023-01-22 22:12:39.513667: step: 512/533, loss: 0.019270094111561775 2023-01-22 22:12:40.599268: step: 516/533, loss: 0.017036855220794678 2023-01-22 22:12:41.738884: step: 520/533, loss: 0.036090970039367676 2023-01-22 22:12:42.832687: step: 524/533, loss: 0.007205290719866753 2023-01-22 22:12:43.918572: step: 528/533, loss: 0.010381779633462429 
2023-01-22 22:12:45.039425: step: 532/533, loss: 0.009772643446922302 2023-01-22 22:12:46.148409: step: 536/533, loss: 0.006553178187459707 2023-01-22 22:12:47.261487: step: 540/533, loss: 0.01546100340783596 2023-01-22 22:12:48.348557: step: 544/533, loss: 0.03761184215545654 2023-01-22 22:12:49.451423: step: 548/533, loss: 0.05195557326078415 2023-01-22 22:12:50.537874: step: 552/533, loss: 0.008721227757632732 2023-01-22 22:12:51.620933: step: 556/533, loss: 0.043604906648397446 2023-01-22 22:12:52.705148: step: 560/533, loss: 0.061946433037519455 2023-01-22 22:12:53.807047: step: 564/533, loss: 0.009984291158616543 2023-01-22 22:12:54.913218: step: 568/533, loss: 0.010358656756579876 2023-01-22 22:12:56.015810: step: 572/533, loss: 0.027664847671985626 2023-01-22 22:12:57.132483: step: 576/533, loss: 0.016084466129541397 2023-01-22 22:12:58.243695: step: 580/533, loss: 0.0022245210129767656 2023-01-22 22:12:59.341281: step: 584/533, loss: 0.020262613892555237 2023-01-22 22:13:00.426082: step: 588/533, loss: 0.01354117039591074 2023-01-22 22:13:01.536794: step: 592/533, loss: 0.04978485032916069 2023-01-22 22:13:02.633774: step: 596/533, loss: 0.01026873104274273 2023-01-22 22:13:03.714160: step: 600/533, loss: 0.008875931613147259 2023-01-22 22:13:04.801768: step: 604/533, loss: 0.017024287953972816 2023-01-22 22:13:05.884806: step: 608/533, loss: 0.013755924999713898 2023-01-22 22:13:06.981572: step: 612/533, loss: 0.05518774315714836 2023-01-22 22:13:08.062201: step: 616/533, loss: 0.0161385890096426 2023-01-22 22:13:09.147022: step: 620/533, loss: 0.013520707376301289 2023-01-22 22:13:10.247710: step: 624/533, loss: 0.036341942846775055 2023-01-22 22:13:11.338498: step: 628/533, loss: 0.015753421932458878 2023-01-22 22:13:12.455367: step: 632/533, loss: 0.03309519588947296 2023-01-22 22:13:13.556206: step: 636/533, loss: 0.03799267113208771 2023-01-22 22:13:14.646656: step: 640/533, loss: 0.008471294306218624 2023-01-22 22:13:15.735240: step: 644/533, loss: 0.029864558950066566 2023-01-22 22:13:16.843903: step: 648/533, loss: 0.037011321634054184 2023-01-22 22:13:17.937592: step: 652/533, loss: 0.013239260762929916 2023-01-22 22:13:19.036237: step: 656/533, loss: 0.0035461753141134977 2023-01-22 22:13:20.150816: step: 660/533, loss: 0.022695209830999374 2023-01-22 22:13:21.243319: step: 664/533, loss: 0.009942811913788319 2023-01-22 22:13:22.322194: step: 668/533, loss: 0.05884157866239548 2023-01-22 22:13:23.438795: step: 672/533, loss: 0.038355376571416855 2023-01-22 22:13:24.517932: step: 676/533, loss: 0.010683365166187286 2023-01-22 22:13:25.604523: step: 680/533, loss: 0.10095253586769104 2023-01-22 22:13:26.708212: step: 684/533, loss: 0.009038984775543213 2023-01-22 22:13:27.804951: step: 688/533, loss: 0.027139971032738686 2023-01-22 22:13:28.943255: step: 692/533, loss: 0.01258578710258007 2023-01-22 22:13:30.041507: step: 696/533, loss: 0.00844421423971653 2023-01-22 22:13:31.129442: step: 700/533, loss: 0.011624434031546116 2023-01-22 22:13:32.247277: step: 704/533, loss: 0.01921788975596428 2023-01-22 22:13:33.335208: step: 708/533, loss: 0.023436591029167175 2023-01-22 22:13:34.450061: step: 712/533, loss: 0.03221143037080765 2023-01-22 22:13:35.552910: step: 716/533, loss: 0.008329442702233791 2023-01-22 22:13:36.649720: step: 720/533, loss: 0.0020313686691224575 2023-01-22 22:13:37.736759: step: 724/533, loss: 0.06754942238330841 2023-01-22 22:13:38.847816: step: 728/533, loss: 0.02158988267183304 2023-01-22 22:13:39.957667: step: 732/533, loss: 0.08073949813842773 
2023-01-22 22:13:41.057246: step: 736/533, loss: 0.010897660627961159 2023-01-22 22:13:42.182500: step: 740/533, loss: 0.044490743428468704 2023-01-22 22:13:43.265931: step: 744/533, loss: 0.01984536275267601 2023-01-22 22:13:44.403089: step: 748/533, loss: 0.010430199094116688 2023-01-22 22:13:45.501511: step: 752/533, loss: 0.06072988733649254 2023-01-22 22:13:46.585163: step: 756/533, loss: 0.01221085712313652 2023-01-22 22:13:47.663189: step: 760/533, loss: 0.0648007020354271 2023-01-22 22:13:48.757063: step: 764/533, loss: 0.019192948937416077 2023-01-22 22:13:49.851890: step: 768/533, loss: 0.014133261516690254 2023-01-22 22:13:50.946314: step: 772/533, loss: 0.014667399227619171 2023-01-22 22:13:52.024284: step: 776/533, loss: 0.004726926796138287 2023-01-22 22:13:53.121149: step: 780/533, loss: 0.05146340653300285 2023-01-22 22:13:54.196365: step: 784/533, loss: 0.008362147957086563 2023-01-22 22:13:55.285262: step: 788/533, loss: 0.004807041957974434 2023-01-22 22:13:56.377929: step: 792/533, loss: 0.0443873405456543 2023-01-22 22:13:57.483819: step: 796/533, loss: 0.013109874911606312 2023-01-22 22:13:58.578217: step: 800/533, loss: 0.01260485127568245 2023-01-22 22:13:59.678053: step: 804/533, loss: 0.012299302034080029 2023-01-22 22:14:00.775710: step: 808/533, loss: 0.023661024868488312 2023-01-22 22:14:01.890188: step: 812/533, loss: 0.008686655201017857 2023-01-22 22:14:02.999713: step: 816/533, loss: 0.006012697238475084 2023-01-22 22:14:04.094719: step: 820/533, loss: 0.02251603826880455 2023-01-22 22:14:05.190665: step: 824/533, loss: 0.016256058588624 2023-01-22 22:14:06.311483: step: 828/533, loss: 0.004223429597914219 2023-01-22 22:14:07.386509: step: 832/533, loss: 0.010521873831748962 2023-01-22 22:14:08.490310: step: 836/533, loss: 0.025704102590680122 2023-01-22 22:14:09.579910: step: 840/533, loss: 0.011306677013635635 2023-01-22 22:14:10.674649: step: 844/533, loss: 0.025942904874682426 2023-01-22 22:14:11.781774: step: 848/533, loss: 0.01663108915090561 2023-01-22 22:14:12.875376: step: 852/533, loss: 0.03758041188120842 2023-01-22 22:14:13.983658: step: 856/533, loss: 0.011881772428750992 2023-01-22 22:14:15.099950: step: 860/533, loss: 0.012492097914218903 2023-01-22 22:14:16.217909: step: 864/533, loss: 0.06700002402067184 2023-01-22 22:14:17.318671: step: 868/533, loss: 0.04401533305644989 2023-01-22 22:14:18.413581: step: 872/533, loss: 0.009267843328416348 2023-01-22 22:14:19.530299: step: 876/533, loss: 0.014800194650888443 2023-01-22 22:14:20.616252: step: 880/533, loss: 0.01819654181599617 2023-01-22 22:14:21.698164: step: 884/533, loss: 0.019947953522205353 2023-01-22 22:14:22.778250: step: 888/533, loss: 0.00857956800609827 2023-01-22 22:14:23.852350: step: 892/533, loss: 0.006512962281703949 2023-01-22 22:14:24.952218: step: 896/533, loss: 0.05075188726186752 2023-01-22 22:14:26.033974: step: 900/533, loss: 0.03060637228190899 2023-01-22 22:14:27.141829: step: 904/533, loss: 0.012079216539859772 2023-01-22 22:14:28.220853: step: 908/533, loss: 0.008230199106037617 2023-01-22 22:14:29.317770: step: 912/533, loss: 0.007888578809797764 2023-01-22 22:14:30.395401: step: 916/533, loss: 0.023600688204169273 2023-01-22 22:14:31.496175: step: 920/533, loss: 0.020242273807525635 2023-01-22 22:14:32.584260: step: 924/533, loss: 0.012885221280157566 2023-01-22 22:14:33.679448: step: 928/533, loss: 0.055731192231178284 2023-01-22 22:14:34.758226: step: 932/533, loss: 0.013757705688476562 2023-01-22 22:14:35.839285: step: 936/533, loss: 0.011107505299150944 
2023-01-22 22:14:36.930373: step: 940/533, loss: 0.018321719020605087 2023-01-22 22:14:38.023297: step: 944/533, loss: 0.01166111696511507 2023-01-22 22:14:39.119252: step: 948/533, loss: 0.04753773286938667 2023-01-22 22:14:40.190338: step: 952/533, loss: 0.008567269891500473 2023-01-22 22:14:41.286679: step: 956/533, loss: 0.04113031178712845 2023-01-22 22:14:42.372591: step: 960/533, loss: 0.011874553747475147 2023-01-22 22:14:43.443505: step: 964/533, loss: 0.010472199879586697 2023-01-22 22:14:44.524087: step: 968/533, loss: 0.03212105855345726 2023-01-22 22:14:45.637888: step: 972/533, loss: 0.018843937665224075 2023-01-22 22:14:46.732812: step: 976/533, loss: 0.008022463880479336 2023-01-22 22:14:47.812873: step: 980/533, loss: 0.010757801122963428 2023-01-22 22:14:48.899231: step: 984/533, loss: 0.016277750954031944 2023-01-22 22:14:50.002585: step: 988/533, loss: 0.051602788269519806 2023-01-22 22:14:51.082750: step: 992/533, loss: 0.011717583984136581 2023-01-22 22:14:52.184743: step: 996/533, loss: 0.013772377744317055 2023-01-22 22:14:53.291871: step: 1000/533, loss: 0.023755326867103577 2023-01-22 22:14:54.394229: step: 1004/533, loss: 0.01009039580821991 2023-01-22 22:14:55.512047: step: 1008/533, loss: 0.005220043007284403 2023-01-22 22:14:56.602852: step: 1012/533, loss: 0.0058016544207930565 2023-01-22 22:14:57.705086: step: 1016/533, loss: 0.041087113320827484 2023-01-22 22:14:58.780025: step: 1020/533, loss: 0.01604340597987175 2023-01-22 22:14:59.847078: step: 1024/533, loss: 0.03404732793569565 2023-01-22 22:15:00.967676: step: 1028/533, loss: 0.011191300116479397 2023-01-22 22:15:02.061241: step: 1032/533, loss: 0.0059611620381474495 2023-01-22 22:15:03.152274: step: 1036/533, loss: 0.055433738976716995 2023-01-22 22:15:04.241642: step: 1040/533, loss: 0.017789214849472046 2023-01-22 22:15:05.328889: step: 1044/533, loss: 0.011357159353792667 2023-01-22 22:15:06.441305: step: 1048/533, loss: 0.05060678347945213 2023-01-22 22:15:07.525722: step: 1052/533, loss: 0.010508159175515175 2023-01-22 22:15:08.608272: step: 1056/533, loss: 0.008604254573583603 2023-01-22 22:15:09.708774: step: 1060/533, loss: 0.006093201693147421 2023-01-22 22:15:10.812663: step: 1064/533, loss: 0.044833723455667496 2023-01-22 22:15:11.911381: step: 1068/533, loss: 0.049959197640419006 2023-01-22 22:15:12.979560: step: 1072/533, loss: 0.009037730284035206 2023-01-22 22:15:14.065019: step: 1076/533, loss: 0.03558586910367012 2023-01-22 22:15:15.182132: step: 1080/533, loss: 0.012874887324869633 2023-01-22 22:15:16.266673: step: 1084/533, loss: 0.044012732803821564 2023-01-22 22:15:17.361261: step: 1088/533, loss: 0.07905545830726624 2023-01-22 22:15:18.461689: step: 1092/533, loss: 0.01607437990605831 2023-01-22 22:15:19.557305: step: 1096/533, loss: 0.014090060256421566 2023-01-22 22:15:20.651807: step: 1100/533, loss: 0.01583777740597725 2023-01-22 22:15:21.722591: step: 1104/533, loss: 0.01360901165753603 2023-01-22 22:15:22.823769: step: 1108/533, loss: 0.01037066150456667 2023-01-22 22:15:23.912297: step: 1112/533, loss: 0.009823258966207504 2023-01-22 22:15:25.007135: step: 1116/533, loss: 0.025182589888572693 2023-01-22 22:15:26.104398: step: 1120/533, loss: 0.007517727091908455 2023-01-22 22:15:27.216982: step: 1124/533, loss: 0.0158709529787302 2023-01-22 22:15:28.317674: step: 1128/533, loss: 0.015840407460927963 2023-01-22 22:15:29.392150: step: 1132/533, loss: 0.0510435588657856 2023-01-22 22:15:30.487020: step: 1136/533, loss: 0.021922774612903595 2023-01-22 22:15:31.570866: step: 
1140/533, loss: 0.009434031322598457 2023-01-22 22:15:32.656621: step: 1144/533, loss: 0.010468150489032269 2023-01-22 22:15:33.751525: step: 1148/533, loss: 0.01804952882230282 2023-01-22 22:15:34.837022: step: 1152/533, loss: 0.05072910338640213 2023-01-22 22:15:35.963632: step: 1156/533, loss: 0.06464947760105133 2023-01-22 22:15:37.054310: step: 1160/533, loss: 0.016536487266421318 2023-01-22 22:15:38.169539: step: 1164/533, loss: 0.028511976823210716 2023-01-22 22:15:39.237441: step: 1168/533, loss: 0.006800743285566568 2023-01-22 22:15:40.333697: step: 1172/533, loss: 0.038277629762887955 2023-01-22 22:15:41.403110: step: 1176/533, loss: 0.008436434902250767 2023-01-22 22:15:42.515248: step: 1180/533, loss: 0.06570687144994736 2023-01-22 22:15:43.618525: step: 1184/533, loss: 0.040889672935009 2023-01-22 22:15:44.714161: step: 1188/533, loss: 0.004861390683799982 2023-01-22 22:15:45.801050: step: 1192/533, loss: 0.014111604541540146 2023-01-22 22:15:46.915166: step: 1196/533, loss: 0.010974625125527382 2023-01-22 22:15:47.998799: step: 1200/533, loss: 0.005597899202257395 2023-01-22 22:15:49.105247: step: 1204/533, loss: 0.00976946298032999 2023-01-22 22:15:50.190104: step: 1208/533, loss: 0.007208252791315317 2023-01-22 22:15:51.274227: step: 1212/533, loss: 0.0031363293528556824 2023-01-22 22:15:52.362518: step: 1216/533, loss: 0.013956178911030293 2023-01-22 22:15:53.460349: step: 1220/533, loss: 0.010982777923345566 2023-01-22 22:15:54.546589: step: 1224/533, loss: 0.02406073734164238 2023-01-22 22:15:55.653333: step: 1228/533, loss: 0.02114749699831009 2023-01-22 22:15:56.742679: step: 1232/533, loss: 0.020083541050553322 2023-01-22 22:15:57.837483: step: 1236/533, loss: 0.006492003332823515 2023-01-22 22:15:58.920045: step: 1240/533, loss: 0.017302969470620155 2023-01-22 22:16:00.021204: step: 1244/533, loss: 0.012112781405448914 2023-01-22 22:16:01.122575: step: 1248/533, loss: 0.009047483094036579 2023-01-22 22:16:02.209663: step: 1252/533, loss: 0.010445504449307919 2023-01-22 22:16:03.314997: step: 1256/533, loss: 0.00498929200693965 2023-01-22 22:16:04.411137: step: 1260/533, loss: 0.009748070500791073 2023-01-22 22:16:05.513713: step: 1264/533, loss: 0.027679728344082832 2023-01-22 22:16:06.619592: step: 1268/533, loss: 0.008537578396499157 2023-01-22 22:16:07.718653: step: 1272/533, loss: 0.006342281587421894 2023-01-22 22:16:08.812217: step: 1276/533, loss: 0.00981914158910513 2023-01-22 22:16:09.902350: step: 1280/533, loss: 0.03996405377984047 2023-01-22 22:16:11.017674: step: 1284/533, loss: 0.00966727826744318 2023-01-22 22:16:12.098638: step: 1288/533, loss: 0.008135600946843624 2023-01-22 22:16:13.195517: step: 1292/533, loss: 0.008073159493505955 2023-01-22 22:16:14.300946: step: 1296/533, loss: 0.014329371973872185 2023-01-22 22:16:15.408107: step: 1300/533, loss: 0.008889544755220413 2023-01-22 22:16:16.493862: step: 1304/533, loss: 0.07610899209976196 2023-01-22 22:16:17.571894: step: 1308/533, loss: 0.01661599986255169 2023-01-22 22:16:18.654461: step: 1312/533, loss: 0.010576597414910793 2023-01-22 22:16:19.756607: step: 1316/533, loss: 0.005682086572051048 2023-01-22 22:16:20.855380: step: 1320/533, loss: 0.041883524507284164 2023-01-22 22:16:21.935508: step: 1324/533, loss: 0.012618999928236008 2023-01-22 22:16:23.036467: step: 1328/533, loss: 0.06422930955886841 2023-01-22 22:16:24.126799: step: 1332/533, loss: 0.05344794690608978 2023-01-22 22:16:25.239478: step: 1336/533, loss: 0.016741331666707993 2023-01-22 22:16:26.346218: step: 1340/533, loss: 
0.011511317454278469 2023-01-22 22:16:27.458820: step: 1344/533, loss: 0.06317970901727676 2023-01-22 22:16:28.549371: step: 1348/533, loss: 0.0070836711674928665 2023-01-22 22:16:29.665146: step: 1352/533, loss: 0.04140499606728554 2023-01-22 22:16:30.772784: step: 1356/533, loss: 0.007597205229103565 2023-01-22 22:16:31.871414: step: 1360/533, loss: 0.007210825104266405 2023-01-22 22:16:32.961592: step: 1364/533, loss: 0.018754880875349045 2023-01-22 22:16:34.042857: step: 1368/533, loss: 0.019920868799090385 2023-01-22 22:16:35.152081: step: 1372/533, loss: 0.04888672009110451 2023-01-22 22:16:36.246406: step: 1376/533, loss: 0.03360056132078171 2023-01-22 22:16:37.357104: step: 1380/533, loss: 0.020786672830581665 2023-01-22 22:16:38.461137: step: 1384/533, loss: 0.038574960082769394 2023-01-22 22:16:39.555444: step: 1388/533, loss: 0.009738821536302567 2023-01-22 22:16:40.628765: step: 1392/533, loss: 0.004805063828825951 2023-01-22 22:16:41.723804: step: 1396/533, loss: 0.006448970176279545 2023-01-22 22:16:42.843167: step: 1400/533, loss: 0.038339950144290924 2023-01-22 22:16:43.954695: step: 1404/533, loss: 0.007654563523828983 2023-01-22 22:16:45.040879: step: 1408/533, loss: 0.010259542614221573 2023-01-22 22:16:46.155191: step: 1412/533, loss: 0.01623718999326229 2023-01-22 22:16:47.249277: step: 1416/533, loss: 0.014163188636302948 2023-01-22 22:16:48.343589: step: 1420/533, loss: 0.009330200031399727 2023-01-22 22:16:49.440764: step: 1424/533, loss: 0.02008504420518875 2023-01-22 22:16:50.510500: step: 1428/533, loss: 0.008662853389978409 2023-01-22 22:16:51.620863: step: 1432/533, loss: 0.04573655501008034 2023-01-22 22:16:52.722928: step: 1436/533, loss: 0.009107109159231186 2023-01-22 22:16:53.841744: step: 1440/533, loss: 0.006593978963792324 2023-01-22 22:16:54.944119: step: 1444/533, loss: 0.01476842537522316 2023-01-22 22:16:56.030788: step: 1448/533, loss: 0.01119696069508791 2023-01-22 22:16:57.117830: step: 1452/533, loss: 0.06139640882611275 2023-01-22 22:16:58.225433: step: 1456/533, loss: 0.012749544344842434 2023-01-22 22:16:59.314732: step: 1460/533, loss: 0.00825601164251566 2023-01-22 22:17:00.420801: step: 1464/533, loss: 0.014513211324810982 2023-01-22 22:17:01.522231: step: 1468/533, loss: 0.022862115874886513 2023-01-22 22:17:02.626554: step: 1472/533, loss: 0.0071062324568629265 2023-01-22 22:17:03.718236: step: 1476/533, loss: 0.01253710500895977 2023-01-22 22:17:04.811611: step: 1480/533, loss: 0.01281536091119051 2023-01-22 22:17:05.888983: step: 1484/533, loss: 0.008709692396223545 2023-01-22 22:17:06.988126: step: 1488/533, loss: 0.012691261246800423 2023-01-22 22:17:08.067018: step: 1492/533, loss: 0.017040710896253586 2023-01-22 22:17:09.192899: step: 1496/533, loss: 0.0008615574333816767 2023-01-22 22:17:10.270966: step: 1500/533, loss: 0.016135990619659424 2023-01-22 22:17:11.353270: step: 1504/533, loss: 0.016571009531617165 2023-01-22 22:17:12.451495: step: 1508/533, loss: 0.018696913495659828 2023-01-22 22:17:13.566453: step: 1512/533, loss: 0.004675515461713076 2023-01-22 22:17:14.680745: step: 1516/533, loss: 0.007222931366413832 2023-01-22 22:17:15.773667: step: 1520/533, loss: 0.01710495725274086 2023-01-22 22:17:16.888607: step: 1524/533, loss: 0.02974708564579487 2023-01-22 22:17:17.977163: step: 1528/533, loss: 0.005800000857561827 2023-01-22 22:17:19.074485: step: 1532/533, loss: 0.020921112969517708 2023-01-22 22:17:20.167696: step: 1536/533, loss: 0.006851104088127613 2023-01-22 22:17:21.270693: step: 1540/533, loss: 
0.01091989129781723 2023-01-22 22:17:22.400746: step: 1544/533, loss: 0.009013384580612183 2023-01-22 22:17:23.495972: step: 1548/533, loss: 0.003497521160170436 2023-01-22 22:17:24.575941: step: 1552/533, loss: 0.009167495183646679 2023-01-22 22:17:25.680380: step: 1556/533, loss: 0.03354962170124054 2023-01-22 22:17:26.778050: step: 1560/533, loss: 0.03983305022120476 2023-01-22 22:17:27.869729: step: 1564/533, loss: 0.011866360902786255 2023-01-22 22:17:28.956349: step: 1568/533, loss: 0.015298436395823956 2023-01-22 22:17:30.068960: step: 1572/533, loss: 0.0059750620275735855 2023-01-22 22:17:31.179936: step: 1576/533, loss: 0.016493136063218117 2023-01-22 22:17:32.278236: step: 1580/533, loss: 0.011474487371742725 2023-01-22 22:17:33.369418: step: 1584/533, loss: 0.0072156270034611225 2023-01-22 22:17:34.472696: step: 1588/533, loss: 0.04844214394688606 2023-01-22 22:17:35.562295: step: 1592/533, loss: 0.048341382294893265 2023-01-22 22:17:36.650655: step: 1596/533, loss: 0.006165795028209686 2023-01-22 22:17:37.736037: step: 1600/533, loss: 0.01480008289217949 2023-01-22 22:17:38.822966: step: 1604/533, loss: 0.024683600291609764 2023-01-22 22:17:39.916751: step: 1608/533, loss: 0.016722355037927628 2023-01-22 22:17:41.023013: step: 1612/533, loss: 0.011787992902100086 2023-01-22 22:17:42.113006: step: 1616/533, loss: 0.015091774053871632 2023-01-22 22:17:43.197813: step: 1620/533, loss: 0.006417169701308012 2023-01-22 22:17:44.250357: step: 1624/533, loss: 0.01806468702852726 2023-01-22 22:17:45.316945: step: 1628/533, loss: 0.010624691843986511 2023-01-22 22:17:46.388496: step: 1632/533, loss: 0.012124743312597275 2023-01-22 22:17:47.484896: step: 1636/533, loss: 0.0067804791033267975 2023-01-22 22:17:48.576806: step: 1640/533, loss: 0.023889632895588875 2023-01-22 22:17:49.676146: step: 1644/533, loss: 0.00952153094112873 2023-01-22 22:17:50.806385: step: 1648/533, loss: 0.012334941886365414 2023-01-22 22:17:51.927797: step: 1652/533, loss: 0.011717134155333042 2023-01-22 22:17:53.019275: step: 1656/533, loss: 0.0055471849627792835 2023-01-22 22:17:54.113103: step: 1660/533, loss: 0.010411684401333332 2023-01-22 22:17:55.242907: step: 1664/533, loss: 0.031117409467697144 2023-01-22 22:17:56.345167: step: 1668/533, loss: 0.01255433913320303 2023-01-22 22:17:57.444929: step: 1672/533, loss: 0.031448844820261 2023-01-22 22:17:58.532323: step: 1676/533, loss: 0.004290777258574963 2023-01-22 22:17:59.640545: step: 1680/533, loss: 0.009934306144714355 2023-01-22 22:18:00.746050: step: 1684/533, loss: 0.009225849062204361 2023-01-22 22:18:01.847156: step: 1688/533, loss: 0.007756582461297512 2023-01-22 22:18:02.957351: step: 1692/533, loss: 0.02128407172858715 2023-01-22 22:18:04.063712: step: 1696/533, loss: 0.022385496646165848 2023-01-22 22:18:05.154429: step: 1700/533, loss: 0.013403598219156265 2023-01-22 22:18:06.271581: step: 1704/533, loss: 0.0141100799664855 2023-01-22 22:18:07.406134: step: 1708/533, loss: 0.016555318608880043 2023-01-22 22:18:08.513353: step: 1712/533, loss: 0.007286285050213337 2023-01-22 22:18:09.599631: step: 1716/533, loss: 0.015398475341498852 2023-01-22 22:18:10.700718: step: 1720/533, loss: 0.006275469437241554 2023-01-22 22:18:11.788163: step: 1724/533, loss: 0.014816278591752052 2023-01-22 22:18:12.882822: step: 1728/533, loss: 0.03165755793452263 2023-01-22 22:18:13.972026: step: 1732/533, loss: 0.011096123605966568 2023-01-22 22:18:15.076123: step: 1736/533, loss: 0.0065680621191859245 2023-01-22 22:18:16.162148: step: 1740/533, loss: 
0.007600780576467514 2023-01-22 22:18:17.239420: step: 1744/533, loss: 0.008735504932701588 2023-01-22 22:18:18.337782: step: 1748/533, loss: 0.007815550081431866 2023-01-22 22:18:19.444710: step: 1752/533, loss: 0.010598676279187202 2023-01-22 22:18:20.541354: step: 1756/533, loss: 0.009456527419388294 2023-01-22 22:18:21.622767: step: 1760/533, loss: 0.007084310986101627 2023-01-22 22:18:22.723184: step: 1764/533, loss: 0.0 2023-01-22 22:18:23.852409: step: 1768/533, loss: 0.009452968835830688 2023-01-22 22:18:24.935085: step: 1772/533, loss: 0.008064905181527138 2023-01-22 22:18:26.051533: step: 1776/533, loss: 0.016719559207558632 2023-01-22 22:18:27.141213: step: 1780/533, loss: 0.01680837944149971 2023-01-22 22:18:28.231695: step: 1784/533, loss: 0.0068550980649888515 2023-01-22 22:18:29.326050: step: 1788/533, loss: 0.027965400367975235 2023-01-22 22:18:30.412024: step: 1792/533, loss: 0.0015603190986439586 2023-01-22 22:18:31.508632: step: 1796/533, loss: 0.009419466368854046 2023-01-22 22:18:32.616811: step: 1800/533, loss: 0.013402135111391544 2023-01-22 22:18:33.702556: step: 1804/533, loss: 0.013771142810583115 2023-01-22 22:18:34.798569: step: 1808/533, loss: 0.057813484221696854 2023-01-22 22:18:35.903536: step: 1812/533, loss: 0.014227626845240593 2023-01-22 22:18:36.984923: step: 1816/533, loss: 0.004887161776423454 2023-01-22 22:18:38.077061: step: 1820/533, loss: 0.04112822562456131 2023-01-22 22:18:39.168577: step: 1824/533, loss: 0.015044288709759712 2023-01-22 22:18:40.313761: step: 1828/533, loss: 0.017685730010271072 2023-01-22 22:18:41.398364: step: 1832/533, loss: 0.005706385709345341 2023-01-22 22:18:42.504306: step: 1836/533, loss: 0.02949630469083786 2023-01-22 22:18:43.593171: step: 1840/533, loss: 0.00394033407792449 2023-01-22 22:18:44.690801: step: 1844/533, loss: 0.006914061028510332 2023-01-22 22:18:45.796777: step: 1848/533, loss: 0.02550038881599903 2023-01-22 22:18:46.909983: step: 1852/533, loss: 0.008674263022840023 2023-01-22 22:18:48.005052: step: 1856/533, loss: 0.02656950056552887 2023-01-22 22:18:49.118812: step: 1860/533, loss: 0.008736667223274708 2023-01-22 22:18:50.218616: step: 1864/533, loss: 0.08847321569919586 2023-01-22 22:18:51.309036: step: 1868/533, loss: 0.012927834875881672 2023-01-22 22:18:52.385736: step: 1872/533, loss: 0.0009801897685974836 2023-01-22 22:18:53.469802: step: 1876/533, loss: 0.05433003604412079 2023-01-22 22:18:54.575332: step: 1880/533, loss: 0.010582808405160904 2023-01-22 22:18:55.662471: step: 1884/533, loss: 0.008794862776994705 2023-01-22 22:18:56.767603: step: 1888/533, loss: 0.04107755422592163 2023-01-22 22:18:57.869070: step: 1892/533, loss: 0.024445341899991035 2023-01-22 22:18:58.961902: step: 1896/533, loss: 0.011122948490083218 2023-01-22 22:19:00.092209: step: 1900/533, loss: 0.0302229393273592 2023-01-22 22:19:01.180355: step: 1904/533, loss: 0.006792495027184486 2023-01-22 22:19:02.269988: step: 1908/533, loss: 0.01251665223389864 2023-01-22 22:19:03.372603: step: 1912/533, loss: 0.046664897352457047 2023-01-22 22:19:04.486945: step: 1916/533, loss: 0.044249892234802246 2023-01-22 22:19:05.581998: step: 1920/533, loss: 0.008466018363833427 2023-01-22 22:19:06.698127: step: 1924/533, loss: 0.028148703277111053 2023-01-22 22:19:07.803618: step: 1928/533, loss: 0.0255095474421978 2023-01-22 22:19:08.884321: step: 1932/533, loss: 0.010918544605374336 2023-01-22 22:19:09.997757: step: 1936/533, loss: 0.005810788832604885 2023-01-22 22:19:11.104361: step: 1940/533, loss: 0.061996083706617355 2023-01-22 
22:19:12.217286: step: 1944/533, loss: 0.0060608782805502415 2023-01-22 22:19:13.295159: step: 1948/533, loss: 0.01125352457165718 2023-01-22 22:19:14.381963: step: 1952/533, loss: 0.02412468194961548 2023-01-22 22:19:15.472870: step: 1956/533, loss: 0.004431331064552069 2023-01-22 22:19:16.580516: step: 1960/533, loss: 0.03832579404115677 2023-01-22 22:19:17.670104: step: 1964/533, loss: 0.010433322750031948 2023-01-22 22:19:18.767494: step: 1968/533, loss: 0.018249930813908577 2023-01-22 22:19:19.887914: step: 1972/533, loss: 0.013660329394042492 2023-01-22 22:19:20.988768: step: 1976/533, loss: 0.011202868074178696 2023-01-22 22:19:22.078494: step: 1980/533, loss: 0.015651913359761238 2023-01-22 22:19:23.152635: step: 1984/533, loss: 0.05221601948142052 2023-01-22 22:19:24.251710: step: 1988/533, loss: 0.007966392673552036 2023-01-22 22:19:25.344764: step: 1992/533, loss: 0.006035948172211647 2023-01-22 22:19:26.439906: step: 1996/533, loss: 0.000829281925689429 2023-01-22 22:19:27.527174: step: 2000/533, loss: 0.04328630492091179 2023-01-22 22:19:28.629139: step: 2004/533, loss: 0.048432111740112305 2023-01-22 22:19:29.704021: step: 2008/533, loss: 0.00907258503139019 2023-01-22 22:19:30.813349: step: 2012/533, loss: 0.026627948507666588 2023-01-22 22:19:31.916168: step: 2016/533, loss: 0.006006287876516581 2023-01-22 22:19:33.016090: step: 2020/533, loss: 0.03545968979597092 2023-01-22 22:19:34.105040: step: 2024/533, loss: 0.0053750029765069485 2023-01-22 22:19:35.185570: step: 2028/533, loss: 0.024792838841676712 2023-01-22 22:19:36.267808: step: 2032/533, loss: 0.03346027061343193 2023-01-22 22:19:37.389171: step: 2036/533, loss: 0.03992001339793205 2023-01-22 22:19:38.482203: step: 2040/533, loss: 0.04616117477416992 2023-01-22 22:19:39.575166: step: 2044/533, loss: 0.014946511946618557 2023-01-22 22:19:40.666640: step: 2048/533, loss: 0.030672762542963028 2023-01-22 22:19:41.751030: step: 2052/533, loss: 0.010912436991930008 2023-01-22 22:19:42.848429: step: 2056/533, loss: 0.03719894960522652 2023-01-22 22:19:43.947299: step: 2060/533, loss: 0.004531790968030691 2023-01-22 22:19:45.067079: step: 2064/533, loss: 0.024654746055603027 2023-01-22 22:19:46.203807: step: 2068/533, loss: 0.014481114223599434 2023-01-22 22:19:47.285615: step: 2072/533, loss: 0.03695772960782051 2023-01-22 22:19:48.381409: step: 2076/533, loss: 0.012480844743549824 2023-01-22 22:19:49.504333: step: 2080/533, loss: 0.006541540380567312 2023-01-22 22:19:50.621132: step: 2084/533, loss: 0.024730829522013664 2023-01-22 22:19:51.722502: step: 2088/533, loss: 0.026747513562440872 2023-01-22 22:19:52.819803: step: 2092/533, loss: 0.012856218963861465 2023-01-22 22:19:53.901611: step: 2096/533, loss: 0.03269417583942413 2023-01-22 22:19:54.998365: step: 2100/533, loss: 0.062488213181495667 2023-01-22 22:19:56.095550: step: 2104/533, loss: 0.004493589047342539 2023-01-22 22:19:57.196567: step: 2108/533, loss: 0.006249538157135248 2023-01-22 22:19:58.321350: step: 2112/533, loss: 0.006704983301460743 2023-01-22 22:19:59.421192: step: 2116/533, loss: 0.00742362579330802 2023-01-22 22:20:00.542324: step: 2120/533, loss: 0.007133388426154852 2023-01-22 22:20:01.649927: step: 2124/533, loss: 0.009088111110031605 2023-01-22 22:20:02.718109: step: 2128/533, loss: 0.006826450582593679 2023-01-22 22:20:03.816971: step: 2132/533, loss: 0.01824604906141758 ================================================== Loss: 0.022 -------------------- Dev Chinese: {'template': {'p': 1.0, 'r': 0.5833333333333334, 'f1': 
0.7368421052631579}, 'slot': {'p': 0.30689400993171945, 'r': 0.3127174256799494, 'f1': 0.30977835213032584}, 'combined': 0.22825773314866113, 'stategy': 1, 'epoch': 0}
Test Chinese: {'template': {'p': 0.9459459459459459, 'r': 0.5511811023622047, 'f1': 0.6965174129353234}, 'slot': {'p': 0.3477051139630923, 'r': 0.3530544234086783, 'f1': 0.35035935147426095}, 'combined': 0.24403138908654992, 'stategy': 1, 'epoch': 0}
Dev Korean: {'template': {'p': 1.0, 'r': 0.5833333333333334, 'f1': 0.7368421052631579}, 'slot': {'p': 0.3149038251270523, 'r': 0.3268546344297867, 'f1': 0.3207679559487851}, 'combined': 0.23635533596226269, 'stategy': 1, 'epoch': 0}
Test Korean: {'template': {'p': 0.9459459459459459, 'r': 0.5511811023622047, 'f1': 0.6965174129353234}, 'slot': {'p': 0.35132125453224455, 'r': 0.3546960984758781, 'f1': 0.3530006104334503}, 'combined': 0.24587107194369676, 'stategy': 1, 'epoch': 0}
Dev Russian: {'template': {'p': 1.0, 'r': 0.5833333333333334, 'f1': 0.7368421052631579}, 'slot': {'p': 0.28978125, 'r': 0.32992172675521825, 'f1': 0.30855146406388645}, 'combined': 0.2273537103628637, 'stategy': 1, 'epoch': 0}
Test Russian: {'template': {'p': 0.9459459459459459, 'r': 0.5511811023622047, 'f1': 0.6965174129353234}, 'slot': {'p': 0.34054184265348364, 'r': 0.3392320663355856, 'f1': 0.33988569266763874}, 'combined': 0.23673630335059417, 'stategy': 1, 'epoch': 0}
Sample Chinese: {'template': {'p': 1.0, 'r': 0.5, 'f1': 0.6666666666666666}, 'slot': {'p': 0.28532608695652173, 'r': 0.375, 'f1': 0.32407407407407407}, 'combined': 0.21604938271604937, 'stategy': 1, 'epoch': 0}
Sample Korean: {'template': {'p': 0.5, 'r': 0.5, 'f1': 0.5}, 'slot': {'p': 0.25757575757575757, 'r': 0.3695652173913043, 'f1': 0.30357142857142855}, 'combined': 0.15178571428571427, 'stategy': 1, 'epoch': 0}
Sample Russian: {'template': {'p': 1.0, 'r': 0.5, 'f1': 0.6666666666666666}, 'slot': {'p': 0.5119047619047619, 'r': 0.3706896551724138, 'f1': 0.43}, 'combined': 0.2866666666666666, 'stategy': 1, 'epoch': 0}
New best chinese model...
New best korean model...
New best russian model...
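Note on the metric blocks above: each 'f1' is the harmonic mean of the logged 'p' and 'r', and each 'combined' value equals the template F1 multiplied by the slot F1 (for Dev Chinese at epoch 0, 0.7368421052631579 x 0.30977835213032584 gives 0.22825773314866113). The key 'stategy' is reproduced exactly as the training script prints it (presumably 'strategy'), and the "New best ... model" lines indicate which language-specific checkpoints were refreshed this epoch. The sketch below only illustrates how those numbers relate; the function names are ours, not part of the repository.

```python
# Illustrative only: how the 'f1' and 'combined' figures in the blocks above
# relate to the logged 'p'/'r' values. Not the project's actual scoring code.

def f1(p: float, r: float) -> float:
    """Harmonic mean of precision and recall (0 when both are 0)."""
    return 2 * p * r / (p + r) if (p + r) > 0 else 0.0

def combined(template_f1: float, slot_f1: float) -> float:
    """'combined' in the log equals template F1 times slot F1."""
    return template_f1 * slot_f1

# Check against the Dev Chinese block for epoch 0:
template_f1 = f1(1.0, 0.5833333333333334)              # ~0.7368421052631579
slot_f1 = f1(0.30689400993171945, 0.3127174256799494)  # ~0.3097783521303258
print(combined(template_f1, slot_f1))                  # ~0.2282577331486611
```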
==================================================
Current best result:
--------------------
Dev for Chinese: {'template': {'p': 1.0, 'r': 0.5833333333333334, 'f1': 0.7368421052631579}, 'slot': {'p': 0.30689400993171945, 'r': 0.3127174256799494, 'f1': 0.30977835213032584}, 'combined': 0.22825773314866113, 'stategy': 1, 'epoch': 0}
Test for Chinese: {'template': {'p': 0.9459459459459459, 'r': 0.5511811023622047, 'f1': 0.6965174129353234}, 'slot': {'p': 0.3477051139630923, 'r': 0.3530544234086783, 'f1': 0.35035935147426095}, 'combined': 0.24403138908654992, 'stategy': 1, 'epoch': 0}
Chinese: {'template': {'p': 1.0, 'r': 0.5, 'f1': 0.6666666666666666}, 'slot': {'p': 0.28532608695652173, 'r': 0.375, 'f1': 0.32407407407407407}, 'combined': 0.21604938271604937, 'stategy': 1, 'epoch': 0}
--------------------
Dev for Korean: {'template': {'p': 1.0, 'r': 0.5833333333333334, 'f1': 0.7368421052631579}, 'slot': {'p': 0.3149038251270523, 'r': 0.3268546344297867, 'f1': 0.3207679559487851}, 'combined': 0.23635533596226269, 'stategy': 1, 'epoch': 0}
Test for Korean: {'template': {'p': 0.9459459459459459, 'r': 0.5511811023622047, 'f1': 0.6965174129353234}, 'slot': {'p': 0.35132125453224455, 'r': 0.3546960984758781, 'f1': 0.3530006104334503}, 'combined': 0.24587107194369676, 'stategy': 1, 'epoch': 0}
Korean: {'template': {'p': 0.5, 'r': 0.5, 'f1': 0.5}, 'slot': {'p': 0.25757575757575757, 'r': 0.3695652173913043, 'f1': 0.30357142857142855}, 'combined': 0.15178571428571427, 'stategy': 1, 'epoch': 0}
--------------------
Dev for Russian: {'template': {'p': 1.0, 'r': 0.5833333333333334, 'f1': 0.7368421052631579}, 'slot': {'p': 0.28978125, 'r': 0.32992172675521825, 'f1': 0.30855146406388645}, 'combined': 0.2273537103628637, 'stategy': 1, 'epoch': 0}
Test for Russian: {'template': {'p': 0.9459459459459459, 'r': 0.5511811023622047, 'f1': 0.6965174129353234}, 'slot': {'p': 0.34054184265348364, 'r': 0.3392320663355856, 'f1': 0.33988569266763874}, 'combined': 0.23673630335059417, 'stategy': 1, 'epoch': 0}
Russian: {'template': {'p': 1.0, 'r': 0.5, 'f1': 0.6666666666666666}, 'slot': {'p': 0.5119047619047619, 'r': 0.3706896551724138, 'f1': 0.43}, 'combined': 0.2866666666666666, 'stategy': 1, 'epoch': 0}
******************************
Epoch: 1
command: python train.py --model_name coref --xlmr_model_name xlm-roberta-large --batch_size 16 --xlmr_learning_rate 2e-5 --accumulate_step 4 --max_epoch 20 --event_hidden_num 500 --p1_data_weight 0.2 --learning_rate 9e-4
2023-01-22 22:23:01.951786: step: 4/533, loss: 0.008925613947212696 2023-01-22 22:23:03.041523: step: 8/533, loss: 0.011093520559370518 2023-01-22 22:23:04.138463: step: 12/533, loss: 0.0058023459278047085 2023-01-22 22:23:05.221850: step: 16/533, loss: 0.0254288911819458 2023-01-22 22:23:06.321778: step: 20/533, loss: 0.01613426022231579 2023-01-22 22:23:07.427274: step: 24/533, loss: 0.022490452975034714 2023-01-22 22:23:08.517202: step: 28/533, loss: 0.026726754382252693 2023-01-22 22:23:09.604679: step: 32/533, loss: 0.0064178332686424255 2023-01-22 22:23:10.679187: step: 36/533, loss: 0.007065150421112776 2023-01-22 22:23:11.763320: step: 40/533, loss: 0.005126288626343012 2023-01-22 22:23:12.848563: step: 44/533, loss: 0.0175628624856472 2023-01-22 22:23:13.911921: step: 48/533, loss: 0.003506530076265335 2023-01-22 22:23:14.977296: step: 52/533, loss: 0.01116432435810566 2023-01-22 22:23:16.073030: step: 56/533, loss: 0.024234648793935776 2023-01-22 22:23:17.153828: step: 60/533, loss: 0.02434203401207924 2023-01-22 22:23:18.251942: step: 64/533, loss: 
0.012589383870363235 2023-01-22 22:23:19.323807: step: 68/533, loss: 0.03491578623652458 2023-01-22 22:23:20.413622: step: 72/533, loss: 0.017968161031603813 2023-01-22 22:23:21.507592: step: 76/533, loss: 0.01611667312681675 2023-01-22 22:23:22.582008: step: 80/533, loss: 0.0032249377109110355 2023-01-22 22:23:23.679033: step: 84/533, loss: 0.03869962692260742 2023-01-22 22:23:24.764742: step: 88/533, loss: 0.008260197006165981 2023-01-22 22:23:25.858735: step: 92/533, loss: 0.03007417358458042 2023-01-22 22:23:26.926750: step: 96/533, loss: 0.0388333722949028 2023-01-22 22:23:28.003814: step: 100/533, loss: 0.00821117963641882 2023-01-22 22:23:29.096001: step: 104/533, loss: 0.013922277837991714 2023-01-22 22:23:30.190380: step: 108/533, loss: 0.015692410990595818 2023-01-22 22:23:31.281266: step: 112/533, loss: 0.015286232344806194 2023-01-22 22:23:32.380372: step: 116/533, loss: 0.010069034062325954 2023-01-22 22:23:33.474507: step: 120/533, loss: 0.024279817938804626 2023-01-22 22:23:34.558312: step: 124/533, loss: 0.004837061744183302 2023-01-22 22:23:35.652349: step: 128/533, loss: 0.018335094675421715 2023-01-22 22:23:36.756386: step: 132/533, loss: 0.04698308929800987 2023-01-22 22:23:37.850663: step: 136/533, loss: 0.007336444687098265 2023-01-22 22:23:38.966747: step: 140/533, loss: 0.010608860291540623 2023-01-22 22:23:40.068566: step: 144/533, loss: 0.03679569810628891 2023-01-22 22:23:41.177038: step: 148/533, loss: 0.010398210026323795 2023-01-22 22:23:42.274731: step: 152/533, loss: 0.009912537410855293 2023-01-22 22:23:43.372046: step: 156/533, loss: 0.012385787442326546 2023-01-22 22:23:44.496604: step: 160/533, loss: 0.051133185625076294 2023-01-22 22:23:45.581211: step: 164/533, loss: 0.002025886904448271 2023-01-22 22:23:46.667457: step: 168/533, loss: 0.01717594638466835 2023-01-22 22:23:47.746562: step: 172/533, loss: 0.016960248351097107 2023-01-22 22:23:48.844486: step: 176/533, loss: 0.008156201802194118 2023-01-22 22:23:49.942673: step: 180/533, loss: 0.005672939121723175 2023-01-22 22:23:51.048688: step: 184/533, loss: 0.013414998538792133 2023-01-22 22:23:52.137751: step: 188/533, loss: 0.016220100224018097 2023-01-22 22:23:53.231107: step: 192/533, loss: 0.003968982491642237 2023-01-22 22:23:54.300548: step: 196/533, loss: 0.01731756702065468 2023-01-22 22:23:55.400078: step: 200/533, loss: 0.010059109888970852 2023-01-22 22:23:56.498096: step: 204/533, loss: 0.016753770411014557 2023-01-22 22:23:57.576250: step: 208/533, loss: 0.00759066641330719 2023-01-22 22:23:58.654608: step: 212/533, loss: 0.008445285260677338 2023-01-22 22:23:59.749787: step: 216/533, loss: 0.04048917070031166 2023-01-22 22:24:00.860523: step: 220/533, loss: 0.013726937584578991 2023-01-22 22:24:01.971139: step: 224/533, loss: 0.003849520580843091 2023-01-22 22:24:03.059514: step: 228/533, loss: 0.01220738422125578 2023-01-22 22:24:04.164277: step: 232/533, loss: 0.03621504455804825 2023-01-22 22:24:05.238549: step: 236/533, loss: 0.01172435749322176 2023-01-22 22:24:06.327333: step: 240/533, loss: 0.006699166260659695 2023-01-22 22:24:07.411576: step: 244/533, loss: 0.011525461450219154 2023-01-22 22:24:08.514191: step: 248/533, loss: 0.029795458540320396 2023-01-22 22:24:09.621373: step: 252/533, loss: 0.012900792062282562 2023-01-22 22:24:10.703657: step: 256/533, loss: 0.007308041211217642 2023-01-22 22:24:11.798766: step: 260/533, loss: 0.012591083534061909 2023-01-22 22:24:12.923264: step: 264/533, loss: 0.004951964132487774 2023-01-22 22:24:14.006781: step: 268/533, loss: 
0.014783511869609356 2023-01-22 22:24:15.092484: step: 272/533, loss: 0.022247154265642166 2023-01-22 22:24:16.182715: step: 276/533, loss: 0.014170602895319462 2023-01-22 22:24:17.263609: step: 280/533, loss: 0.006533030420541763 2023-01-22 22:24:18.367611: step: 284/533, loss: 0.009323183447122574 2023-01-22 22:24:19.483318: step: 288/533, loss: 0.03841555863618851 2023-01-22 22:24:20.578333: step: 292/533, loss: 0.0021806550212204456 2023-01-22 22:24:21.656182: step: 296/533, loss: 0.023267842829227448 2023-01-22 22:24:22.741928: step: 300/533, loss: 0.005537248682230711 2023-01-22 22:24:23.821721: step: 304/533, loss: 0.011080612428486347 2023-01-22 22:24:24.920139: step: 308/533, loss: 0.011464636772871017 2023-01-22 22:24:26.021291: step: 312/533, loss: 0.011529198847711086 2023-01-22 22:24:27.126645: step: 316/533, loss: 0.009971776977181435 2023-01-22 22:24:28.215900: step: 320/533, loss: 0.012636848725378513 2023-01-22 22:24:29.317902: step: 324/533, loss: 0.006611999124288559 2023-01-22 22:24:30.409458: step: 328/533, loss: 0.03487914055585861 2023-01-22 22:24:31.507304: step: 332/533, loss: 0.011139449663460255 2023-01-22 22:24:32.617685: step: 336/533, loss: 0.010063005611300468 2023-01-22 22:24:33.714694: step: 340/533, loss: 0.009021430276334286 2023-01-22 22:24:34.834312: step: 344/533, loss: 0.014709413051605225 2023-01-22 22:24:35.905354: step: 348/533, loss: 0.028703078627586365 2023-01-22 22:24:36.985658: step: 352/533, loss: 0.009137424640357494 2023-01-22 22:24:38.090047: step: 356/533, loss: 0.005573668982833624 2023-01-22 22:24:39.187201: step: 360/533, loss: 0.030432390049099922 2023-01-22 22:24:40.280212: step: 364/533, loss: 0.010921692475676537 2023-01-22 22:24:41.372383: step: 368/533, loss: 0.00629640556871891 2023-01-22 22:24:42.471579: step: 372/533, loss: 0.011271798051893711 2023-01-22 22:24:43.563954: step: 376/533, loss: 0.031551189720630646 2023-01-22 22:24:44.676075: step: 380/533, loss: 0.006694453302770853 2023-01-22 22:24:45.765542: step: 384/533, loss: 0.00836116448044777 2023-01-22 22:24:46.850332: step: 388/533, loss: 0.021361079066991806 2023-01-22 22:24:47.962136: step: 392/533, loss: 0.00994281005114317 2023-01-22 22:24:49.071449: step: 396/533, loss: 0.004525742493569851 2023-01-22 22:24:50.174576: step: 400/533, loss: 0.004288197495043278 2023-01-22 22:24:51.258993: step: 404/533, loss: 0.034830134361982346 2023-01-22 22:24:52.360220: step: 408/533, loss: 0.021062446758151054 2023-01-22 22:24:53.438747: step: 412/533, loss: 0.03264647722244263 2023-01-22 22:24:54.555282: step: 416/533, loss: 0.005878729280084372 2023-01-22 22:24:55.661493: step: 420/533, loss: 0.0061667487025260925 2023-01-22 22:24:56.744737: step: 424/533, loss: 0.014203639701008797 2023-01-22 22:24:57.842568: step: 428/533, loss: 0.031891848891973495 2023-01-22 22:24:58.951512: step: 432/533, loss: 0.015588675625622272 2023-01-22 22:25:00.045604: step: 436/533, loss: 0.012439212761819363 2023-01-22 22:25:01.173746: step: 440/533, loss: 0.04775090515613556 2023-01-22 22:25:02.272707: step: 444/533, loss: 0.014180255122482777 2023-01-22 22:25:03.370856: step: 448/533, loss: 0.01566915214061737 2023-01-22 22:25:04.467243: step: 452/533, loss: 0.01565147191286087 2023-01-22 22:25:05.564940: step: 456/533, loss: 0.009835238568484783 2023-01-22 22:25:06.706458: step: 460/533, loss: 0.005634940695017576 2023-01-22 22:25:07.841373: step: 464/533, loss: 0.013527895323932171 2023-01-22 22:25:08.985359: step: 468/533, loss: 0.007232138887047768 2023-01-22 22:25:10.091976: step: 
472/533, loss: 0.005645276978611946 2023-01-22 22:25:11.185405: step: 476/533, loss: 0.00942808948457241 2023-01-22 22:25:12.268379: step: 480/533, loss: 0.02817002311348915 2023-01-22 22:25:13.362829: step: 484/533, loss: 0.005511586554348469 2023-01-22 22:25:14.447944: step: 488/533, loss: 0.0095926932990551 2023-01-22 22:25:15.572305: step: 492/533, loss: 0.041165996342897415 2023-01-22 22:25:16.666766: step: 496/533, loss: 0.018384834751486778 2023-01-22 22:25:17.790013: step: 500/533, loss: 0.006295149214565754 2023-01-22 22:25:18.912451: step: 504/533, loss: 0.009075933136045933 2023-01-22 22:25:20.023536: step: 508/533, loss: 0.03459804505109787 2023-01-22 22:25:21.105761: step: 512/533, loss: 0.010590721853077412 2023-01-22 22:25:22.191982: step: 516/533, loss: 0.011437548324465752 2023-01-22 22:25:23.305493: step: 520/533, loss: 0.004229051060974598 2023-01-22 22:25:24.396569: step: 524/533, loss: 0.022210363298654556 2023-01-22 22:25:25.509153: step: 528/533, loss: 0.003922032192349434 2023-01-22 22:25:26.601871: step: 532/533, loss: 0.019950641319155693 2023-01-22 22:25:27.693989: step: 536/533, loss: 0.022808806970715523 2023-01-22 22:25:28.787907: step: 540/533, loss: 0.0014436112251132727 2023-01-22 22:25:29.877590: step: 544/533, loss: 0.014141925610601902 2023-01-22 22:25:30.964024: step: 548/533, loss: 0.012821177020668983 2023-01-22 22:25:32.065056: step: 552/533, loss: 0.01623542234301567 2023-01-22 22:25:33.183300: step: 556/533, loss: 0.006309705786406994 2023-01-22 22:25:34.275536: step: 560/533, loss: 0.029209788888692856 2023-01-22 22:25:35.364670: step: 564/533, loss: 0.008930402807891369 2023-01-22 22:25:36.463540: step: 568/533, loss: 0.0056000202894210815 2023-01-22 22:25:37.553844: step: 572/533, loss: 0.022733649238944054 2023-01-22 22:25:38.620093: step: 576/533, loss: 0.005594725254923105 2023-01-22 22:25:39.707970: step: 580/533, loss: 0.0038018396589905024 2023-01-22 22:25:40.809397: step: 584/533, loss: 0.009740735404193401 2023-01-22 22:25:41.912292: step: 588/533, loss: 0.011428650468587875 2023-01-22 22:25:42.997532: step: 592/533, loss: 0.05814409255981445 2023-01-22 22:25:44.117991: step: 596/533, loss: 0.09873968362808228 2023-01-22 22:25:45.189524: step: 600/533, loss: 0.011110356077551842 2023-01-22 22:25:46.290202: step: 604/533, loss: 0.004199073649942875 2023-01-22 22:25:47.391294: step: 608/533, loss: 0.016400018706917763 2023-01-22 22:25:48.484949: step: 612/533, loss: 0.043520767241716385 2023-01-22 22:25:49.585151: step: 616/533, loss: 0.0034728357568383217 2023-01-22 22:25:50.685812: step: 620/533, loss: 0.004987440072000027 2023-01-22 22:25:51.789757: step: 624/533, loss: 0.03634863346815109 2023-01-22 22:25:52.886739: step: 628/533, loss: 0.033024080097675323 2023-01-22 22:25:53.998132: step: 632/533, loss: 0.04103824123740196 2023-01-22 22:25:55.107049: step: 636/533, loss: 0.027138726785779 2023-01-22 22:25:56.218244: step: 640/533, loss: 0.03234970569610596 2023-01-22 22:25:57.303597: step: 644/533, loss: 0.047655124217271805 2023-01-22 22:25:58.403072: step: 648/533, loss: 0.05748002976179123 2023-01-22 22:25:59.508374: step: 652/533, loss: 0.008228065446019173 2023-01-22 22:26:00.605921: step: 656/533, loss: 0.016573339700698853 2023-01-22 22:26:01.693267: step: 660/533, loss: 0.005543990526348352 2023-01-22 22:26:02.794466: step: 664/533, loss: 0.0066401418298482895 2023-01-22 22:26:03.909318: step: 668/533, loss: 0.006095356307923794 2023-01-22 22:26:04.994738: step: 672/533, loss: 0.024907980114221573 2023-01-22 22:26:06.078468: 
step: 676/533, loss: 0.004693935159593821 2023-01-22 22:26:07.172025: step: 680/533, loss: 0.016910288482904434 2023-01-22 22:26:08.260977: step: 684/533, loss: 0.03568674996495247 2023-01-22 22:26:09.359206: step: 688/533, loss: 0.006058427505195141 2023-01-22 22:26:10.450406: step: 692/533, loss: 0.06406237185001373 2023-01-22 22:26:11.546324: step: 696/533, loss: 0.007241168990731239 2023-01-22 22:26:12.638826: step: 700/533, loss: 0.012354905717074871 2023-01-22 22:26:13.734559: step: 704/533, loss: 0.018679505214095116 2023-01-22 22:26:14.835804: step: 708/533, loss: 0.005419182125478983 2023-01-22 22:26:15.957758: step: 712/533, loss: 0.019503310322761536 2023-01-22 22:26:17.050087: step: 716/533, loss: 0.01600051298737526 2023-01-22 22:26:18.164676: step: 720/533, loss: 0.007699155248701572 2023-01-22 22:26:19.267986: step: 724/533, loss: 0.00482341879978776 2023-01-22 22:26:20.360788: step: 728/533, loss: 0.019026000052690506 2023-01-22 22:26:21.468394: step: 732/533, loss: 0.02634083852171898 2023-01-22 22:26:22.556600: step: 736/533, loss: 0.05907056853175163 2023-01-22 22:26:23.647565: step: 740/533, loss: 0.08109515905380249 2023-01-22 22:26:24.742521: step: 744/533, loss: 0.005308034364134073 2023-01-22 22:26:25.850881: step: 748/533, loss: 0.046959392726421356 2023-01-22 22:26:26.925141: step: 752/533, loss: 0.011808075942099094 2023-01-22 22:26:28.012193: step: 756/533, loss: 0.03541991859674454 2023-01-22 22:26:29.105690: step: 760/533, loss: 0.006159640848636627 2023-01-22 22:26:30.224490: step: 764/533, loss: 0.02843761444091797 2023-01-22 22:26:31.320234: step: 768/533, loss: 0.00233129458501935 2023-01-22 22:26:32.417712: step: 772/533, loss: 0.035656556487083435 2023-01-22 22:26:33.523361: step: 776/533, loss: 0.03360661491751671 2023-01-22 22:26:34.610916: step: 780/533, loss: 0.007703785318881273 2023-01-22 22:26:35.689542: step: 784/533, loss: 0.012355204671621323 2023-01-22 22:26:36.779388: step: 788/533, loss: 0.01141770463436842 2023-01-22 22:26:37.863200: step: 792/533, loss: 0.010619927197694778 2023-01-22 22:26:38.983969: step: 796/533, loss: 0.008327803574502468 2023-01-22 22:26:40.092809: step: 800/533, loss: 0.014103076420724392 2023-01-22 22:26:41.193959: step: 804/533, loss: 0.051497168838977814 2023-01-22 22:26:42.279454: step: 808/533, loss: 0.020746290683746338 2023-01-22 22:26:43.378602: step: 812/533, loss: 0.002470416948199272 2023-01-22 22:26:44.485004: step: 816/533, loss: 0.010753748007118702 2023-01-22 22:26:45.600953: step: 820/533, loss: 0.014591149985790253 2023-01-22 22:26:46.676662: step: 824/533, loss: 0.01518035214394331 2023-01-22 22:26:47.793231: step: 828/533, loss: 0.0417749248445034 2023-01-22 22:26:48.890602: step: 832/533, loss: 0.009555557742714882 2023-01-22 22:26:49.992259: step: 836/533, loss: 0.008642081171274185 2023-01-22 22:26:51.087027: step: 840/533, loss: 0.05746659263968468 2023-01-22 22:26:52.195358: step: 844/533, loss: 0.0027747510466724634 2023-01-22 22:26:53.300851: step: 848/533, loss: 0.011802561581134796 2023-01-22 22:26:54.412696: step: 852/533, loss: 0.00964966882020235 2023-01-22 22:26:55.502516: step: 856/533, loss: 0.04255588352680206 2023-01-22 22:26:56.605727: step: 860/533, loss: 0.021322228014469147 2023-01-22 22:26:57.708012: step: 864/533, loss: 0.01150614582002163 2023-01-22 22:26:58.817787: step: 868/533, loss: 0.020424969494342804 2023-01-22 22:26:59.905556: step: 872/533, loss: 0.01587420515716076 2023-01-22 22:27:00.994666: step: 876/533, loss: 0.015274500474333763 2023-01-22 22:27:02.108621: 
step: 880/533, loss: 0.002101684920489788 2023-01-22 22:27:03.188206: step: 884/533, loss: 0.00220762868411839 2023-01-22 22:27:04.277726: step: 888/533, loss: 0.01109248399734497 2023-01-22 22:27:05.398723: step: 892/533, loss: 0.01444181241095066 2023-01-22 22:27:06.487805: step: 896/533, loss: 0.014774741604924202 2023-01-22 22:27:07.588337: step: 900/533, loss: 0.00579412467777729 2023-01-22 22:27:08.665584: step: 904/533, loss: 0.00621021119877696 2023-01-22 22:27:09.797183: step: 908/533, loss: 0.004264539107680321 2023-01-22 22:27:10.917851: step: 912/533, loss: 0.07819143682718277 2023-01-22 22:27:12.009644: step: 916/533, loss: 0.011373464949429035 2023-01-22 22:27:13.133409: step: 920/533, loss: 0.015380415134131908 2023-01-22 22:27:14.228217: step: 924/533, loss: 0.02404681220650673 2023-01-22 22:27:15.329643: step: 928/533, loss: 0.008432273752987385 2023-01-22 22:27:16.413207: step: 932/533, loss: 0.04816380515694618 2023-01-22 22:27:17.515287: step: 936/533, loss: 0.0032615098170936108 2023-01-22 22:27:18.623052: step: 940/533, loss: 0.005723059177398682 2023-01-22 22:27:19.721580: step: 944/533, loss: 0.013308648020029068 2023-01-22 22:27:20.817815: step: 948/533, loss: 0.006898361258208752 2023-01-22 22:27:21.901055: step: 952/533, loss: 0.0268127229064703 2023-01-22 22:27:22.999769: step: 956/533, loss: 0.005686312913894653 2023-01-22 22:27:24.106841: step: 960/533, loss: 0.04237457737326622 2023-01-22 22:27:25.217069: step: 964/533, loss: 0.02490498125553131 2023-01-22 22:27:26.306223: step: 968/533, loss: 0.014365775510668755 2023-01-22 22:27:27.394162: step: 972/533, loss: 0.007850486785173416 2023-01-22 22:27:28.481968: step: 976/533, loss: 0.012416807003319263 2023-01-22 22:27:29.575241: step: 980/533, loss: 0.0034069353714585304 2023-01-22 22:27:30.674424: step: 984/533, loss: 0.0281415693461895 2023-01-22 22:27:31.768665: step: 988/533, loss: 0.015958987176418304 2023-01-22 22:27:32.884125: step: 992/533, loss: 0.011150212027132511 2023-01-22 22:27:33.979041: step: 996/533, loss: 0.008381768129765987 2023-01-22 22:27:35.082506: step: 1000/533, loss: 0.016811953857541084 2023-01-22 22:27:36.188984: step: 1004/533, loss: 0.008414200507104397 2023-01-22 22:27:37.304351: step: 1008/533, loss: 0.03560592979192734 2023-01-22 22:27:38.385405: step: 1012/533, loss: 0.0062303701415658 2023-01-22 22:27:39.462098: step: 1016/533, loss: 0.0024646995589137077 2023-01-22 22:27:40.550925: step: 1020/533, loss: 0.038395971059799194 2023-01-22 22:27:41.639495: step: 1024/533, loss: 0.005241533275693655 2023-01-22 22:27:42.728511: step: 1028/533, loss: 0.0069887530989944935 2023-01-22 22:27:43.851943: step: 1032/533, loss: 0.008667053654789925 2023-01-22 22:27:44.943420: step: 1036/533, loss: 0.003115372033789754 2023-01-22 22:27:46.061238: step: 1040/533, loss: 0.0341968834400177 2023-01-22 22:27:47.160757: step: 1044/533, loss: 0.0 2023-01-22 22:27:48.263199: step: 1048/533, loss: 0.03794778138399124 2023-01-22 22:27:49.354709: step: 1052/533, loss: 0.08537909388542175 2023-01-22 22:27:50.445576: step: 1056/533, loss: 0.018842177465558052 2023-01-22 22:27:51.526228: step: 1060/533, loss: 0.011245747096836567 2023-01-22 22:27:52.620820: step: 1064/533, loss: 0.020408974960446358 2023-01-22 22:27:53.715562: step: 1068/533, loss: 0.013045798987150192 2023-01-22 22:27:54.820900: step: 1072/533, loss: 0.021844634786248207 2023-01-22 22:27:55.917282: step: 1076/533, loss: 0.0075671738013625145 2023-01-22 22:27:57.009302: step: 1080/533, loss: 0.008715600706636906 2023-01-22 
22:27:58.074716: step: 1084/533, loss: 0.04451737180352211 2023-01-22 22:27:59.158356: step: 1088/533, loss: 0.006853511091321707 2023-01-22 22:28:00.237166: step: 1092/533, loss: 0.0015571132535114884 2023-01-22 22:28:01.346840: step: 1096/533, loss: 0.002805503783747554 2023-01-22 22:28:02.439810: step: 1100/533, loss: 0.005748588126152754 2023-01-22 22:28:03.543643: step: 1104/533, loss: 0.008831224404275417 2023-01-22 22:28:04.659423: step: 1108/533, loss: 0.0013629422755911946 2023-01-22 22:28:05.743035: step: 1112/533, loss: 0.04705285280942917 2023-01-22 22:28:06.806037: step: 1116/533, loss: 0.007837486453354359 2023-01-22 22:28:07.909622: step: 1120/533, loss: 0.0032384663354605436 2023-01-22 22:28:08.981447: step: 1124/533, loss: 0.011592747643589973 2023-01-22 22:28:10.069922: step: 1128/533, loss: 0.005072848871350288 2023-01-22 22:28:11.162648: step: 1132/533, loss: 0.031061384826898575 2023-01-22 22:28:12.261468: step: 1136/533, loss: 0.009369976818561554 2023-01-22 22:28:13.344048: step: 1140/533, loss: 0.011183694005012512 2023-01-22 22:28:14.418771: step: 1144/533, loss: 0.003200326580554247 2023-01-22 22:28:15.509417: step: 1148/533, loss: 0.0077477023005485535 2023-01-22 22:28:16.603004: step: 1152/533, loss: 0.007399132940918207 2023-01-22 22:28:17.740603: step: 1156/533, loss: 0.011876900680363178 2023-01-22 22:28:18.840967: step: 1160/533, loss: 0.004952153656631708 2023-01-22 22:28:19.973465: step: 1164/533, loss: 0.013890908099710941 2023-01-22 22:28:21.090521: step: 1168/533, loss: 0.013034052215516567 2023-01-22 22:28:22.185611: step: 1172/533, loss: 0.011917910538613796 2023-01-22 22:28:23.280917: step: 1176/533, loss: 0.004274799022823572 2023-01-22 22:28:24.371511: step: 1180/533, loss: 0.007342600263655186 2023-01-22 22:28:25.465796: step: 1184/533, loss: 0.0021191914565861225 2023-01-22 22:28:26.556045: step: 1188/533, loss: 0.009192829951643944 2023-01-22 22:28:27.661834: step: 1192/533, loss: 0.03928673267364502 2023-01-22 22:28:28.760893: step: 1196/533, loss: 0.0055250064469873905 2023-01-22 22:28:29.838605: step: 1200/533, loss: 0.019327865913510323 2023-01-22 22:28:30.935056: step: 1204/533, loss: 0.014316604472696781 2023-01-22 22:28:32.018846: step: 1208/533, loss: 0.0019721314311027527 2023-01-22 22:28:33.110189: step: 1212/533, loss: 0.0225637499243021 2023-01-22 22:28:34.207795: step: 1216/533, loss: 0.00505601242184639 2023-01-22 22:28:35.275473: step: 1220/533, loss: 0.054867424070835114 2023-01-22 22:28:36.418636: step: 1224/533, loss: 0.004886872600764036 2023-01-22 22:28:37.521509: step: 1228/533, loss: 0.011014881543815136 2023-01-22 22:28:38.615380: step: 1232/533, loss: 0.007566630840301514 2023-01-22 22:28:39.699649: step: 1236/533, loss: 0.02089790068566799 2023-01-22 22:28:40.818623: step: 1240/533, loss: 0.008350999094545841 2023-01-22 22:28:41.904780: step: 1244/533, loss: 0.010961202904582024 2023-01-22 22:28:42.998492: step: 1248/533, loss: 0.027240445837378502 2023-01-22 22:28:44.104470: step: 1252/533, loss: 0.021779421716928482 2023-01-22 22:28:45.200538: step: 1256/533, loss: 0.04111041501164436 2023-01-22 22:28:46.281715: step: 1260/533, loss: 0.020230380818247795 2023-01-22 22:28:47.364898: step: 1264/533, loss: 0.029437899589538574 2023-01-22 22:28:48.452896: step: 1268/533, loss: 0.050302885472774506 2023-01-22 22:28:49.586577: step: 1272/533, loss: 0.05923996865749359 2023-01-22 22:28:50.683698: step: 1276/533, loss: 0.0068550799041986465 2023-01-22 22:28:51.774571: step: 1280/533, loss: 0.00029076560167595744 2023-01-22 
22:28:52.887162: step: 1284/533, loss: 0.010229308158159256 2023-01-22 22:28:53.965575: step: 1288/533, loss: 0.055063337087631226 2023-01-22 22:28:55.073697: step: 1292/533, loss: 0.0035267339553683996 2023-01-22 22:28:56.170096: step: 1296/533, loss: 0.009123510681092739 2023-01-22 22:28:57.254813: step: 1300/533, loss: 0.0038117847871035337 2023-01-22 22:28:58.374848: step: 1304/533, loss: 0.0029906199779361486 2023-01-22 22:28:59.485710: step: 1308/533, loss: 0.011358625255525112 2023-01-22 22:29:00.569018: step: 1312/533, loss: 0.03628367558121681 2023-01-22 22:29:01.660202: step: 1316/533, loss: 0.007179169915616512 2023-01-22 22:29:02.749820: step: 1320/533, loss: 0.008094103075563908 2023-01-22 22:29:03.849501: step: 1324/533, loss: 0.028030727058649063 2023-01-22 22:29:04.947143: step: 1328/533, loss: 0.008527093566954136 2023-01-22 22:29:06.048065: step: 1332/533, loss: 0.02082739770412445 2023-01-22 22:29:07.160408: step: 1336/533, loss: 0.014704165048897266 2023-01-22 22:29:08.242886: step: 1340/533, loss: 0.011326384730637074 2023-01-22 22:29:09.361154: step: 1344/533, loss: 0.011415835469961166 2023-01-22 22:29:10.472991: step: 1348/533, loss: 0.022344104945659637 2023-01-22 22:29:11.566762: step: 1352/533, loss: 0.006475296337157488 2023-01-22 22:29:12.633132: step: 1356/533, loss: 0.004845849238336086 2023-01-22 22:29:13.714783: step: 1360/533, loss: 0.00635699275881052 2023-01-22 22:29:14.813917: step: 1364/533, loss: 0.05521085113286972 2023-01-22 22:29:15.916964: step: 1368/533, loss: 0.005095184780657291 2023-01-22 22:29:17.008082: step: 1372/533, loss: 0.0037074252031743526 2023-01-22 22:29:18.086387: step: 1376/533, loss: 0.01019546389579773 2023-01-22 22:29:19.196503: step: 1380/533, loss: 0.021380798891186714 2023-01-22 22:29:20.293805: step: 1384/533, loss: 0.04180828481912613 2023-01-22 22:29:21.404857: step: 1388/533, loss: 0.02912544086575508 2023-01-22 22:29:22.499692: step: 1392/533, loss: 0.0059900544583797455 2023-01-22 22:29:23.587595: step: 1396/533, loss: 0.009176409803330898 2023-01-22 22:29:24.698256: step: 1400/533, loss: 0.025094222277402878 2023-01-22 22:29:25.795506: step: 1404/533, loss: 0.003887948114424944 2023-01-22 22:29:26.899322: step: 1408/533, loss: 0.0069276876747608185 2023-01-22 22:29:27.993315: step: 1412/533, loss: 0.03354548290371895 2023-01-22 22:29:29.102803: step: 1416/533, loss: 0.012077408842742443 2023-01-22 22:29:30.210047: step: 1420/533, loss: 0.015202338807284832 2023-01-22 22:29:31.295400: step: 1424/533, loss: 0.0213848315179348 2023-01-22 22:29:32.371584: step: 1428/533, loss: 0.00997061375528574 2023-01-22 22:29:33.472934: step: 1432/533, loss: 0.02448503114283085 2023-01-22 22:29:34.574712: step: 1436/533, loss: 0.010743062011897564 2023-01-22 22:29:35.667004: step: 1440/533, loss: 0.011644295416772366 2023-01-22 22:29:36.752693: step: 1444/533, loss: 0.007209083065390587 2023-01-22 22:29:37.854661: step: 1448/533, loss: 0.00923768151551485 2023-01-22 22:29:38.960237: step: 1452/533, loss: 0.01466023176908493 2023-01-22 22:29:40.039324: step: 1456/533, loss: 0.023172680288553238 2023-01-22 22:29:41.143567: step: 1460/533, loss: 0.032273199409246445 2023-01-22 22:29:42.243670: step: 1464/533, loss: 0.053501833230257034 2023-01-22 22:29:43.341048: step: 1468/533, loss: 0.05315322428941727 2023-01-22 22:29:44.420694: step: 1472/533, loss: 0.044286470860242844 2023-01-22 22:29:45.547926: step: 1476/533, loss: 0.00724642351269722 2023-01-22 22:29:46.636250: step: 1480/533, loss: 0.0035075454507023096 2023-01-22 
22:29:47.744919: step: 1484/533, loss: 0.07574257999658585 2023-01-22 22:29:48.846461: step: 1488/533, loss: 0.02287207916378975 2023-01-22 22:29:49.945305: step: 1492/533, loss: 0.049778617918491364 2023-01-22 22:29:51.034289: step: 1496/533, loss: 0.008154568262398243 2023-01-22 22:29:52.119406: step: 1500/533, loss: 0.025839634239673615 2023-01-22 22:29:53.202390: step: 1504/533, loss: 0.026096895337104797 2023-01-22 22:29:54.287311: step: 1508/533, loss: 0.01595170982182026 2023-01-22 22:29:55.369935: step: 1512/533, loss: 0.009852842427790165 2023-01-22 22:29:56.465043: step: 1516/533, loss: 0.005769921932369471 2023-01-22 22:29:57.553857: step: 1520/533, loss: 0.010454357601702213 2023-01-22 22:29:58.653331: step: 1524/533, loss: 0.00987264234572649 2023-01-22 22:29:59.766484: step: 1528/533, loss: 0.03630882129073143 2023-01-22 22:30:00.881356: step: 1532/533, loss: 0.003901050426065922 2023-01-22 22:30:01.980874: step: 1536/533, loss: 0.011978207156062126 2023-01-22 22:30:03.094625: step: 1540/533, loss: 0.00884245429188013 2023-01-22 22:30:04.199792: step: 1544/533, loss: 0.009602317586541176 2023-01-22 22:30:05.300057: step: 1548/533, loss: 0.029825082048773766 2023-01-22 22:30:06.420272: step: 1552/533, loss: 0.006534433923661709 2023-01-22 22:30:07.508077: step: 1556/533, loss: 0.0547732338309288 2023-01-22 22:30:08.605700: step: 1560/533, loss: 0.008808031678199768 2023-01-22 22:30:09.713277: step: 1564/533, loss: 0.0069520603865385056 2023-01-22 22:30:10.831674: step: 1568/533, loss: 0.007622621953487396 2023-01-22 22:30:11.938402: step: 1572/533, loss: 0.0411258190870285 2023-01-22 22:30:13.043285: step: 1576/533, loss: 0.007049611769616604 2023-01-22 22:30:14.148807: step: 1580/533, loss: 0.028327833861112595 2023-01-22 22:30:15.260426: step: 1584/533, loss: 0.009556399658322334 2023-01-22 22:30:16.347679: step: 1588/533, loss: 0.022109180688858032 2023-01-22 22:30:17.446049: step: 1592/533, loss: 0.006933972239494324 2023-01-22 22:30:18.538171: step: 1596/533, loss: 0.008853824809193611 2023-01-22 22:30:19.654508: step: 1600/533, loss: 0.04165737330913544 2023-01-22 22:30:20.739163: step: 1604/533, loss: 0.017674963921308517 2023-01-22 22:30:21.820818: step: 1608/533, loss: 0.009653130546212196 2023-01-22 22:30:22.897803: step: 1612/533, loss: 0.0032906457781791687 2023-01-22 22:30:24.039798: step: 1616/533, loss: 0.06782279163599014 2023-01-22 22:30:25.142595: step: 1620/533, loss: 0.019513968378305435 2023-01-22 22:30:26.235976: step: 1624/533, loss: 0.00859641283750534 2023-01-22 22:30:27.329959: step: 1628/533, loss: 0.03269661217927933 2023-01-22 22:30:28.411211: step: 1632/533, loss: 0.016636256128549576 2023-01-22 22:30:29.533348: step: 1636/533, loss: 0.004539536312222481 2023-01-22 22:30:30.639855: step: 1640/533, loss: 0.02281329780817032 2023-01-22 22:30:31.751924: step: 1644/533, loss: 0.015804894268512726 2023-01-22 22:30:32.840344: step: 1648/533, loss: 0.013316814787685871 2023-01-22 22:30:33.930779: step: 1652/533, loss: 0.013073669746518135 2023-01-22 22:30:35.042073: step: 1656/533, loss: 0.01558060571551323 2023-01-22 22:30:36.165458: step: 1660/533, loss: 0.04010102152824402 2023-01-22 22:30:37.261842: step: 1664/533, loss: 0.007026083767414093 2023-01-22 22:30:38.347367: step: 1668/533, loss: 0.0055320607498288155 2023-01-22 22:30:39.438027: step: 1672/533, loss: 0.00664052739739418 2023-01-22 22:30:40.516319: step: 1676/533, loss: 0.004452748689800501 2023-01-22 22:30:41.600266: step: 1680/533, loss: 0.006865574046969414 2023-01-22 22:30:42.688529: 
step: 1684/533, loss: 0.008633121848106384 2023-01-22 22:30:43.791917: step: 1688/533, loss: 0.00979382824152708 2023-01-22 22:30:44.902141: step: 1692/533, loss: 0.004290101118385792 2023-01-22 22:30:45.995230: step: 1696/533, loss: 0.008509436622262001 2023-01-22 22:30:47.094870: step: 1700/533, loss: 0.005751847289502621 2023-01-22 22:30:48.204847: step: 1704/533, loss: 0.00995751190930605 2023-01-22 22:30:49.313795: step: 1708/533, loss: 0.017905499786138535 2023-01-22 22:30:50.401515: step: 1712/533, loss: 0.003714309073984623 2023-01-22 22:30:51.492330: step: 1716/533, loss: 0.03903310373425484 2023-01-22 22:30:52.588241: step: 1720/533, loss: 0.005910523235797882 2023-01-22 22:30:53.695643: step: 1724/533, loss: 0.012380363419651985 2023-01-22 22:30:54.794013: step: 1728/533, loss: 0.005427168682217598 2023-01-22 22:30:55.897445: step: 1732/533, loss: 0.007450432050973177 2023-01-22 22:30:57.037166: step: 1736/533, loss: 0.06731580197811127 2023-01-22 22:30:58.134597: step: 1740/533, loss: 0.007790098432451487 2023-01-22 22:30:59.214691: step: 1744/533, loss: 0.008770889602601528 2023-01-22 22:31:00.317141: step: 1748/533, loss: 0.011013220064342022 2023-01-22 22:31:01.395787: step: 1752/533, loss: 0.008153360337018967 2023-01-22 22:31:02.488230: step: 1756/533, loss: 0.020969144999980927 2023-01-22 22:31:03.570558: step: 1760/533, loss: 0.010635321028530598 2023-01-22 22:31:04.656486: step: 1764/533, loss: 0.005536994896829128 2023-01-22 22:31:05.764817: step: 1768/533, loss: 0.01389839593321085 2023-01-22 22:31:06.865954: step: 1772/533, loss: 0.005563698243349791 2023-01-22 22:31:07.991947: step: 1776/533, loss: 0.003085100557655096 2023-01-22 22:31:09.105701: step: 1780/533, loss: 0.011299679055809975 2023-01-22 22:31:10.193856: step: 1784/533, loss: 0.0077589950524270535 2023-01-22 22:31:11.288119: step: 1788/533, loss: 0.004612649325281382 2023-01-22 22:31:12.391000: step: 1792/533, loss: 0.034109704196453094 2023-01-22 22:31:13.488823: step: 1796/533, loss: 0.00584327382966876 2023-01-22 22:31:14.570751: step: 1800/533, loss: 0.003515771823003888 2023-01-22 22:31:15.661591: step: 1804/533, loss: 0.004595890175551176 2023-01-22 22:31:16.748936: step: 1808/533, loss: 0.004432844929397106 2023-01-22 22:31:17.847669: step: 1812/533, loss: 0.004281031899154186 2023-01-22 22:31:18.964678: step: 1816/533, loss: 0.027832726016640663 2023-01-22 22:31:20.053284: step: 1820/533, loss: 0.008137507364153862 2023-01-22 22:31:21.146274: step: 1824/533, loss: 0.030173610895872116 2023-01-22 22:31:22.254667: step: 1828/533, loss: 0.00615853164345026 2023-01-22 22:31:23.352056: step: 1832/533, loss: 0.005501364823430777 2023-01-22 22:31:24.437124: step: 1836/533, loss: 0.010705860331654549 2023-01-22 22:31:25.522470: step: 1840/533, loss: 0.0012563667260110378 2023-01-22 22:31:26.659767: step: 1844/533, loss: 0.014464114792644978 2023-01-22 22:31:27.752571: step: 1848/533, loss: 0.004503981675952673 2023-01-22 22:31:28.842510: step: 1852/533, loss: 0.012309740297496319 2023-01-22 22:31:29.975087: step: 1856/533, loss: 0.006218461319804192 2023-01-22 22:31:31.081363: step: 1860/533, loss: 0.014311340637505054 2023-01-22 22:31:32.187687: step: 1864/533, loss: 0.023816652595996857 2023-01-22 22:31:33.278042: step: 1868/533, loss: 0.00803129281848669 2023-01-22 22:31:34.349779: step: 1872/533, loss: 0.01694415509700775 2023-01-22 22:31:35.448505: step: 1876/533, loss: 0.0550045408308506 2023-01-22 22:31:36.541830: step: 1880/533, loss: 0.008916349150240421 2023-01-22 22:31:37.635213: step: 
1884/533, loss: 0.007531862240284681 2023-01-22 22:31:38.723306: step: 1888/533, loss: 0.007989827543497086 2023-01-22 22:31:39.813668: step: 1892/533, loss: 0.03825875371694565 2023-01-22 22:31:40.898374: step: 1896/533, loss: 0.010656577534973621 2023-01-22 22:31:41.987593: step: 1900/533, loss: 0.002253742655739188 2023-01-22 22:31:43.102845: step: 1904/533, loss: 0.041025061160326004 2023-01-22 22:31:44.216637: step: 1908/533, loss: 0.01935526356101036 2023-01-22 22:31:45.301388: step: 1912/533, loss: 0.002935813507065177 2023-01-22 22:31:46.379219: step: 1916/533, loss: 0.019959282130002975 2023-01-22 22:31:47.459903: step: 1920/533, loss: 0.005920074880123138 2023-01-22 22:31:48.560383: step: 1924/533, loss: 0.0035745338536798954 2023-01-22 22:31:49.673102: step: 1928/533, loss: 0.02764144539833069 2023-01-22 22:31:50.763762: step: 1932/533, loss: 0.013049495406448841 2023-01-22 22:31:51.874111: step: 1936/533, loss: 0.007185091730207205 2023-01-22 22:31:52.954402: step: 1940/533, loss: 0.01010767463594675 2023-01-22 22:31:54.038359: step: 1944/533, loss: 0.0068223485723137856 2023-01-22 22:31:55.145990: step: 1948/533, loss: 0.00973474606871605 2023-01-22 22:31:56.251962: step: 1952/533, loss: 0.009427509270608425 2023-01-22 22:31:57.331830: step: 1956/533, loss: 0.03984740003943443 2023-01-22 22:31:58.440611: step: 1960/533, loss: 0.011379594914615154 2023-01-22 22:31:59.554737: step: 1964/533, loss: 0.041487812995910645 2023-01-22 22:32:00.676207: step: 1968/533, loss: 0.010327148251235485 2023-01-22 22:32:01.756181: step: 1972/533, loss: 0.010539263486862183 2023-01-22 22:32:02.828223: step: 1976/533, loss: 0.004065783228725195 2023-01-22 22:32:03.925151: step: 1980/533, loss: 0.0056487261317670345 2023-01-22 22:32:05.034409: step: 1984/533, loss: 0.005893161986023188 2023-01-22 22:32:06.136425: step: 1988/533, loss: 0.005986033007502556 2023-01-22 22:32:07.220570: step: 1992/533, loss: 0.007133099716156721 2023-01-22 22:32:08.310154: step: 1996/533, loss: 0.007295568007975817 2023-01-22 22:32:09.424242: step: 2000/533, loss: 0.07170341908931732 2023-01-22 22:32:10.507507: step: 2004/533, loss: 0.001890211016871035 2023-01-22 22:32:11.600438: step: 2008/533, loss: 0.006241047754883766 2023-01-22 22:32:12.694151: step: 2012/533, loss: 0.010247474536299706 2023-01-22 22:32:13.779528: step: 2016/533, loss: 0.00493429321795702 2023-01-22 22:32:14.881348: step: 2020/533, loss: 0.007579256314784288 2023-01-22 22:32:15.975326: step: 2024/533, loss: 0.039033062756061554 2023-01-22 22:32:17.090645: step: 2028/533, loss: 0.04566526785492897 2023-01-22 22:32:18.186386: step: 2032/533, loss: 0.008977861143648624 2023-01-22 22:32:19.289077: step: 2036/533, loss: 0.004118893761187792 2023-01-22 22:32:20.376423: step: 2040/533, loss: 0.008037014864385128 2023-01-22 22:32:21.472588: step: 2044/533, loss: 0.0704910084605217 2023-01-22 22:32:22.581757: step: 2048/533, loss: 0.018464455381035805 2023-01-22 22:32:23.665945: step: 2052/533, loss: 0.006822636816650629 2023-01-22 22:32:24.751441: step: 2056/533, loss: 0.007116106804460287 2023-01-22 22:32:25.824862: step: 2060/533, loss: 0.010958656668663025 2023-01-22 22:32:26.949199: step: 2064/533, loss: 0.016052599996328354 2023-01-22 22:32:28.039250: step: 2068/533, loss: 0.0020786316599696875 2023-01-22 22:32:29.123619: step: 2072/533, loss: 0.008314988575875759 2023-01-22 22:32:30.202727: step: 2076/533, loss: 0.017723323777318 2023-01-22 22:32:31.300548: step: 2080/533, loss: 0.0037312619388103485 2023-01-22 22:32:32.382560: step: 2084/533, 
loss: 0.009290730580687523 2023-01-22 22:32:33.454832: step: 2088/533, loss: 0.01150690671056509 2023-01-22 22:32:34.536484: step: 2092/533, loss: 0.027441728860139847 2023-01-22 22:32:35.623630: step: 2096/533, loss: 0.0028937114402651787 2023-01-22 22:32:36.709726: step: 2100/533, loss: 0.006101224105805159 2023-01-22 22:32:37.785650: step: 2104/533, loss: 0.01195582840591669 2023-01-22 22:32:38.877781: step: 2108/533, loss: 0.009777838364243507 2023-01-22 22:32:39.968470: step: 2112/533, loss: 0.03552310913801193 2023-01-22 22:32:41.080822: step: 2116/533, loss: 0.003615034045651555 2023-01-22 22:32:42.175285: step: 2120/533, loss: 0.005735160317271948 2023-01-22 22:32:43.276160: step: 2124/533, loss: 0.027683071792125702 2023-01-22 22:32:44.365034: step: 2128/533, loss: 0.02342553436756134 2023-01-22 22:32:45.448699: step: 2132/533, loss: 0.004547464195638895
==================================================
Loss: 0.017
--------------------
Dev Chinese: {'template': {'p': 1.0, 'r': 0.5833333333333334, 'f1': 0.7368421052631579}, 'slot': {'p': 0.32428147077713554, 'r': 0.31935879190385835, 'f1': 0.32180130656469097}, 'combined': 0.23711675220556175, 'stategy': 1, 'epoch': 1}
Test Chinese: {'template': {'p': 0.9459459459459459, 'r': 0.5511811023622047, 'f1': 0.6965174129353234}, 'slot': {'p': 0.3663224699848436, 'r': 0.3480063464856014, 'f1': 0.3569295861390784}, 'combined': 0.24860767193766656, 'stategy': 1, 'epoch': 1}
Dev Korean: {'template': {'p': 1.0, 'r': 0.5833333333333334, 'f1': 0.7368421052631579}, 'slot': {'p': 0.3289270176644567, 'r': 0.3301753175417412, 'f1': 0.32954998550094244}, 'combined': 0.24282630510595757, 'stategy': 1, 'epoch': 1}
Test Korean: {'template': {'p': 0.9459459459459459, 'r': 0.5511811023622047, 'f1': 0.6965174129353234}, 'slot': {'p': 0.36936707206632263, 'r': 0.34807790364751445, 'f1': 0.35840662482399854}, 'combined': 0.24963645510129254, 'stategy': 1, 'epoch': 1}
Dev Russian: {'template': {'p': 1.0, 'r': 0.5833333333333334, 'f1': 0.7368421052631579}, 'slot': {'p': 0.30669266441821247, 'r': 0.34510199240986716, 'f1': 0.324765625}, 'combined': 0.23930098684210524, 'stategy': 1, 'epoch': 1}
Test Russian: {'template': {'p': 0.9459459459459459, 'r': 0.5511811023622047, 'f1': 0.6965174129353234}, 'slot': {'p': 0.35636499897879703, 'r': 0.3392320663355856, 'f1': 0.34758753594976266}, 'combined': 0.2421007713082924, 'stategy': 1, 'epoch': 1}
Sample Chinese: {'template': {'p': 1.0, 'r': 0.5, 'f1': 0.6666666666666666}, 'slot': {'p': 0.28532608695652173, 'r': 0.375, 'f1': 0.32407407407407407}, 'combined': 0.21604938271604937, 'stategy': 1, 'epoch': 1}
Sample Korean: {'template': {'p': 0.5, 'r': 0.5, 'f1': 0.5}, 'slot': {'p': 0.23484848484848486, 'r': 0.33695652173913043, 'f1': 0.2767857142857143}, 'combined': 0.13839285714285715, 'stategy': 1, 'epoch': 1}
Sample Russian: {'template': {'p': 1.0, 'r': 0.5, 'f1': 0.6666666666666666}, 'slot': {'p': 0.5119047619047619, 'r': 0.3706896551724138, 'f1': 0.43}, 'combined': 0.2866666666666666, 'stategy': 1, 'epoch': 1}
New best chinese model...
New best russian model...
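After epoch 1 only the Chinese and Russian checkpoints are refreshed, so the "Current best result" block below still lists the epoch-0 entries for Korean. On the step counter itself: with --batch_size 16 and --accumulate_step 4, the logged step advances by 4 and ends at 2132 each epoch, and the denominator 533 = 2132 / 4 is consistent with one optimizer update per accumulation group (an effective batch of 64). The loop below is a minimal sketch of that pattern, using placeholder names (model, loader, optimizer) rather than the project's actual train.py internals.

```python
# Illustrative gradient-accumulation loop, not the project's actual code.
# `model`, `loader`, and `optimizer` are placeholders for the real objects.
from datetime import datetime

ACCUMULATE_STEP = 4  # from --accumulate_step

def train_one_epoch(model, loader, optimizer):
    updates = len(loader) // ACCUMULATE_STEP      # e.g. 2132 // 4 == 533
    optimizer.zero_grad()
    for i, batch in enumerate(loader, start=1):
        loss = model(batch) / ACCUMULATE_STEP     # scale so gradients average over the group
        loss.backward()
        if i % ACCUMULATE_STEP == 0:
            optimizer.step()                      # one update per 4 micro-batches of 16
            optimizer.zero_grad()
            # mirrors the format of the lines above: "<timestamp>: step: <i>/<updates>, loss: <value>"
            print(f"{datetime.now()}: step: {i}/{updates}, loss: {loss.item()}")
```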
==================================================
Current best result:
--------------------
Dev for Chinese: {'template': {'p': 1.0, 'r': 0.5833333333333334, 'f1': 0.7368421052631579}, 'slot': {'p': 0.32428147077713554, 'r': 0.31935879190385835, 'f1': 0.32180130656469097}, 'combined': 0.23711675220556175, 'stategy': 1, 'epoch': 1}
Test for Chinese: {'template': {'p': 0.9459459459459459, 'r': 0.5511811023622047, 'f1': 0.6965174129353234}, 'slot': {'p': 0.3663224699848436, 'r': 0.3480063464856014, 'f1': 0.3569295861390784}, 'combined': 0.24860767193766656, 'stategy': 1, 'epoch': 1}
Chinese: {'template': {'p': 1.0, 'r': 0.5, 'f1': 0.6666666666666666}, 'slot': {'p': 0.28532608695652173, 'r': 0.375, 'f1': 0.32407407407407407}, 'combined': 0.21604938271604937, 'stategy': 1, 'epoch': 1}
--------------------
Dev for Korean: {'template': {'p': 1.0, 'r': 0.5833333333333334, 'f1': 0.7368421052631579}, 'slot': {'p': 0.3149038251270523, 'r': 0.3268546344297867, 'f1': 0.3207679559487851}, 'combined': 0.23635533596226269, 'stategy': 1, 'epoch': 0}
Test for Korean: {'template': {'p': 0.9459459459459459, 'r': 0.5511811023622047, 'f1': 0.6965174129353234}, 'slot': {'p': 0.35132125453224455, 'r': 0.3546960984758781, 'f1': 0.3530006104334503}, 'combined': 0.24587107194369676, 'stategy': 1, 'epoch': 0}
Korean: {'template': {'p': 0.5, 'r': 0.5, 'f1': 0.5}, 'slot': {'p': 0.25757575757575757, 'r': 0.3695652173913043, 'f1': 0.30357142857142855}, 'combined': 0.15178571428571427, 'stategy': 1, 'epoch': 0}
--------------------
Dev for Russian: {'template': {'p': 1.0, 'r': 0.5833333333333334, 'f1': 0.7368421052631579}, 'slot': {'p': 0.30669266441821247, 'r': 0.34510199240986716, 'f1': 0.324765625}, 'combined': 0.23930098684210524, 'stategy': 1, 'epoch': 1}
Test for Russian: {'template': {'p': 0.9459459459459459, 'r': 0.5511811023622047, 'f1': 0.6965174129353234}, 'slot': {'p': 0.35636499897879703, 'r': 0.3392320663355856, 'f1': 0.34758753594976266}, 'combined': 0.2421007713082924, 'stategy': 1, 'epoch': 1}
Russian: {'template': {'p': 1.0, 'r': 0.5, 'f1': 0.6666666666666666}, 'slot': {'p': 0.5119047619047619, 'r': 0.3706896551724138, 'f1': 0.43}, 'combined': 0.2866666666666666, 'stategy': 1, 'epoch': 1}
******************************
Epoch: 2
command: python train.py --model_name coref --xlmr_model_name xlm-roberta-large --batch_size 16 --xlmr_learning_rate 2e-5 --accumulate_step 4 --max_epoch 20 --event_hidden_num 500 --p1_data_weight 0.2 --learning_rate 9e-4
2023-01-22 22:35:24.892633: step: 4/533, loss: 0.03143540024757385 2023-01-22 22:35:25.989026: step: 8/533, loss: 0.007290362846106291 2023-01-22 22:35:27.090683: step: 12/533, loss: 0.05290674790740013 2023-01-22 22:35:28.181738: step: 16/533, loss: 0.030136551707983017 2023-01-22 22:35:29.270470: step: 20/533, loss: 0.004592083394527435 2023-01-22 22:35:30.370500: step: 24/533, loss: 0.002864312846213579 2023-01-22 22:35:31.463719: step: 28/533, loss: 0.00025884315255098045 2023-01-22 22:35:32.544382: step: 32/533, loss: 0.0036720414645969868 2023-01-22 22:35:33.627684: step: 36/533, loss: 0.008596212603151798 2023-01-22 22:35:34.713209: step: 40/533, loss: 0.049139782786369324 2023-01-22 22:35:35.805009: step: 44/533, loss: 0.01291133277118206 2023-01-22 22:35:36.887859: step: 48/533, loss: 0.006683246698230505 2023-01-22 22:35:37.946704: step: 52/533, loss: 0.04288350045681 2023-01-22 22:35:39.020465: step: 56/533, loss: 0.01680571213364601 2023-01-22 22:35:40.122099: step: 60/533, loss: 0.004482606891542673 2023-01-22 22:35:41.210789: step: 64/533, loss: 
0.008595411665737629 2023-01-22 22:35:42.300450: step: 68/533, loss: 0.005513062234967947 2023-01-22 22:35:43.385602: step: 72/533, loss: 0.017828622832894325 2023-01-22 22:35:44.465219: step: 76/533, loss: 0.00782066024839878 2023-01-22 22:35:45.553703: step: 80/533, loss: 0.006408941466361284 2023-01-22 22:35:46.623016: step: 84/533, loss: 0.0036198075395077467 2023-01-22 22:35:47.690199: step: 88/533, loss: 0.01893874630331993 2023-01-22 22:35:48.777363: step: 92/533, loss: 0.012946904636919498 2023-01-22 22:35:49.855169: step: 96/533, loss: 0.034745074808597565 2023-01-22 22:35:50.934989: step: 100/533, loss: 0.003039517905563116 2023-01-22 22:35:52.010131: step: 104/533, loss: 0.0 2023-01-22 22:35:53.118136: step: 108/533, loss: 0.00045108553604222834 2023-01-22 22:35:54.217785: step: 112/533, loss: 0.02059978060424328 2023-01-22 22:35:55.311613: step: 116/533, loss: 0.006527471821755171 2023-01-22 22:35:56.408165: step: 120/533, loss: 0.037255242466926575 2023-01-22 22:35:57.505179: step: 124/533, loss: 0.01731891743838787 2023-01-22 22:35:58.614026: step: 128/533, loss: 0.008575621992349625 2023-01-22 22:35:59.715665: step: 132/533, loss: 0.011498386971652508 2023-01-22 22:36:00.797410: step: 136/533, loss: 0.011816848069429398 2023-01-22 22:36:01.881870: step: 140/533, loss: 0.004683245904743671 2023-01-22 22:36:02.979093: step: 144/533, loss: 0.008969353511929512 2023-01-22 22:36:04.074051: step: 148/533, loss: 0.007163285277783871 2023-01-22 22:36:05.168020: step: 152/533, loss: 0.0037118778564035892 2023-01-22 22:36:06.266423: step: 156/533, loss: 0.005938536487519741 2023-01-22 22:36:07.362253: step: 160/533, loss: 0.0073188054375350475 2023-01-22 22:36:08.468449: step: 164/533, loss: 0.02381749078631401 2023-01-22 22:36:09.544104: step: 168/533, loss: 0.04931014031171799 2023-01-22 22:36:10.643569: step: 172/533, loss: 0.0024933486711233854 2023-01-22 22:36:11.748241: step: 176/533, loss: 0.006205394398421049 2023-01-22 22:36:12.863035: step: 180/533, loss: 0.006418520584702492 2023-01-22 22:36:13.943767: step: 184/533, loss: 0.011980396695435047 2023-01-22 22:36:15.041231: step: 188/533, loss: 0.011190742254257202 2023-01-22 22:36:16.120392: step: 192/533, loss: 0.017391767352819443 2023-01-22 22:36:17.202726: step: 196/533, loss: 0.004236615262925625 2023-01-22 22:36:18.303298: step: 200/533, loss: 0.012163491919636726 2023-01-22 22:36:19.421601: step: 204/533, loss: 0.006327173672616482 2023-01-22 22:36:20.523469: step: 208/533, loss: 0.005510473623871803 2023-01-22 22:36:21.632131: step: 212/533, loss: 0.005834348499774933 2023-01-22 22:36:22.736645: step: 216/533, loss: 0.00440135458484292 2023-01-22 22:36:23.837052: step: 220/533, loss: 0.004853852093219757 2023-01-22 22:36:24.941074: step: 224/533, loss: 0.005192887969315052 2023-01-22 22:36:26.046739: step: 228/533, loss: 0.00664374977350235 2023-01-22 22:36:27.140679: step: 232/533, loss: 0.007884320802986622 2023-01-22 22:36:28.234136: step: 236/533, loss: 0.008248616941273212 2023-01-22 22:36:29.327605: step: 240/533, loss: 0.013699514791369438 2023-01-22 22:36:30.436547: step: 244/533, loss: 0.016546521335840225 2023-01-22 22:36:31.538818: step: 248/533, loss: 0.004130684770643711 2023-01-22 22:36:32.649739: step: 252/533, loss: 0.009633234702050686 2023-01-22 22:36:33.762410: step: 256/533, loss: 0.027422703802585602 2023-01-22 22:36:34.854670: step: 260/533, loss: 0.005947364494204521 2023-01-22 22:36:35.934381: step: 264/533, loss: 0.0081117432564497 2023-01-22 22:36:37.011586: step: 268/533, loss: 
0.006476367823779583 2023-01-22 22:36:38.100095: step: 272/533, loss: 0.004160203970968723 2023-01-22 22:36:39.209607: step: 276/533, loss: 0.012318079359829426 2023-01-22 22:36:40.286999: step: 280/533, loss: 0.011149939149618149 2023-01-22 22:36:41.374542: step: 284/533, loss: 0.008003066293895245 2023-01-22 22:36:42.464165: step: 288/533, loss: 0.00974404439330101 2023-01-22 22:36:43.595520: step: 292/533, loss: 0.0058694640174508095 2023-01-22 22:36:44.695393: step: 296/533, loss: 0.009975536726415157 2023-01-22 22:36:45.781967: step: 300/533, loss: 0.018420135602355003 2023-01-22 22:36:46.871267: step: 304/533, loss: 0.010691123083233833 2023-01-22 22:36:47.970168: step: 308/533, loss: 0.01665141060948372 2023-01-22 22:36:49.098660: step: 312/533, loss: 0.003338320879265666 2023-01-22 22:36:50.231466: step: 316/533, loss: 0.011709248647093773 2023-01-22 22:36:51.338060: step: 320/533, loss: 0.006270202342420816 2023-01-22 22:36:52.448334: step: 324/533, loss: 0.005804487504065037 2023-01-22 22:36:53.543364: step: 328/533, loss: 0.005920395255088806 2023-01-22 22:36:54.629383: step: 332/533, loss: 0.0020027682185173035 2023-01-22 22:36:55.724391: step: 336/533, loss: 0.0036866022273898125 2023-01-22 22:36:56.824731: step: 340/533, loss: 0.005470162723213434 2023-01-22 22:36:57.913793: step: 344/533, loss: 0.003237447002902627 2023-01-22 22:36:58.995352: step: 348/533, loss: 0.01068158820271492 2023-01-22 22:37:00.079387: step: 352/533, loss: 0.014397768303751945 2023-01-22 22:37:01.168092: step: 356/533, loss: 0.019023533910512924 2023-01-22 22:37:02.269724: step: 360/533, loss: 0.015253226272761822 2023-01-22 22:37:03.376825: step: 364/533, loss: 0.006832933984696865 2023-01-22 22:37:04.464580: step: 368/533, loss: 0.010551640763878822 2023-01-22 22:37:05.547036: step: 372/533, loss: 0.004681070800870657 2023-01-22 22:37:06.634758: step: 376/533, loss: 0.05827747657895088 2023-01-22 22:37:07.733431: step: 380/533, loss: 0.019183877855539322 2023-01-22 22:37:08.822970: step: 384/533, loss: 0.008916245773434639 2023-01-22 22:37:09.924578: step: 388/533, loss: 0.015328424982726574 2023-01-22 22:37:11.010897: step: 392/533, loss: 0.007892209105193615 2023-01-22 22:37:12.096176: step: 396/533, loss: 0.006977444048970938 2023-01-22 22:37:13.193728: step: 400/533, loss: 0.009565283544361591 2023-01-22 22:37:14.287145: step: 404/533, loss: 0.0036076803226023912 2023-01-22 22:37:15.376747: step: 408/533, loss: 0.005722390487790108 2023-01-22 22:37:16.469876: step: 412/533, loss: 0.034901220351457596 2023-01-22 22:37:17.572474: step: 416/533, loss: 0.003244719933718443 2023-01-22 22:37:18.686418: step: 420/533, loss: 0.011517805978655815 2023-01-22 22:37:19.794335: step: 424/533, loss: 0.007823131047189236 2023-01-22 22:37:20.863207: step: 428/533, loss: 0.004583883564919233 2023-01-22 22:37:21.990379: step: 432/533, loss: 0.020031999796628952 2023-01-22 22:37:23.077746: step: 436/533, loss: 0.00542818196117878 2023-01-22 22:37:24.164289: step: 440/533, loss: 0.004846257623285055 2023-01-22 22:37:25.288070: step: 444/533, loss: 0.015640348196029663 2023-01-22 22:37:26.390920: step: 448/533, loss: 0.0028899298049509525 2023-01-22 22:37:27.478244: step: 452/533, loss: 0.03152679651975632 2023-01-22 22:37:28.599743: step: 456/533, loss: 0.018906928598880768 2023-01-22 22:37:29.696169: step: 460/533, loss: 0.014965725131332874 2023-01-22 22:37:30.775426: step: 464/533, loss: 0.02057889848947525 2023-01-22 22:37:31.887556: step: 468/533, loss: 0.019665833562612534 2023-01-22 22:37:32.962162: step: 
472/533, loss: 0.0073335799388587475 2023-01-22 22:37:34.079600: step: 476/533, loss: 0.0044280048459768295 2023-01-22 22:37:35.180956: step: 480/533, loss: 0.020294252783060074 2023-01-22 22:37:36.285778: step: 484/533, loss: 0.008550722151994705 2023-01-22 22:37:37.408008: step: 488/533, loss: 0.003183335065841675 2023-01-22 22:37:38.518114: step: 492/533, loss: 0.000815436476841569 2023-01-22 22:37:39.601571: step: 496/533, loss: 0.005482346285134554 2023-01-22 22:37:40.716543: step: 500/533, loss: 0.013066351413726807 2023-01-22 22:37:41.816503: step: 504/533, loss: 0.00689847394824028 2023-01-22 22:37:42.907413: step: 508/533, loss: 0.02621537446975708 2023-01-22 22:37:44.034275: step: 512/533, loss: 0.003458647057414055 2023-01-22 22:37:45.139802: step: 516/533, loss: 0.004116411320865154 2023-01-22 22:37:46.245160: step: 520/533, loss: 0.005164094269275665 2023-01-22 22:37:47.342298: step: 524/533, loss: 0.022609392181038857 2023-01-22 22:37:48.429233: step: 528/533, loss: 0.023363104090094566 2023-01-22 22:37:49.518250: step: 532/533, loss: 0.005277535412460566 2023-01-22 22:37:50.615160: step: 536/533, loss: 0.0020729824900627136 2023-01-22 22:37:51.725911: step: 540/533, loss: 0.00665440084412694 2023-01-22 22:37:52.811545: step: 544/533, loss: 0.024406228214502335 2023-01-22 22:37:53.902371: step: 548/533, loss: 0.007575994823127985 2023-01-22 22:37:55.003437: step: 552/533, loss: 0.012080642394721508 2023-01-22 22:37:56.074327: step: 556/533, loss: 0.020412802696228027 2023-01-22 22:37:57.159222: step: 560/533, loss: 0.004277101252228022 2023-01-22 22:37:58.248905: step: 564/533, loss: 0.00527096027508378 2023-01-22 22:37:59.361567: step: 568/533, loss: 0.0021309838630259037 2023-01-22 22:38:00.459068: step: 572/533, loss: 0.005911529064178467 2023-01-22 22:38:01.554721: step: 576/533, loss: 0.012290279380977154 2023-01-22 22:38:02.676874: step: 580/533, loss: 0.1478431671857834 2023-01-22 22:38:03.772437: step: 584/533, loss: 0.029301557689905167 2023-01-22 22:38:04.902290: step: 588/533, loss: 0.009973284788429737 2023-01-22 22:38:06.003356: step: 592/533, loss: 0.010667902417480946 2023-01-22 22:38:07.084456: step: 596/533, loss: 0.004778855945914984 2023-01-22 22:38:08.191539: step: 600/533, loss: 0.0064847939647734165 2023-01-22 22:38:09.286664: step: 604/533, loss: 0.004565545357763767 2023-01-22 22:38:10.390760: step: 608/533, loss: 0.013621721416711807 2023-01-22 22:38:11.498256: step: 612/533, loss: 0.008003424853086472 2023-01-22 22:38:12.602408: step: 616/533, loss: 0.05185984447598457 2023-01-22 22:38:13.692679: step: 620/533, loss: 0.01074344851076603 2023-01-22 22:38:14.782796: step: 624/533, loss: 0.007197429891675711 2023-01-22 22:38:15.886708: step: 628/533, loss: 0.012690722942352295 2023-01-22 22:38:16.959889: step: 632/533, loss: 0.0031639900989830494 2023-01-22 22:38:18.043348: step: 636/533, loss: 0.0021190994884818792 2023-01-22 22:38:19.134876: step: 640/533, loss: 0.0095035620033741 2023-01-22 22:38:20.226273: step: 644/533, loss: 0.009052561596035957 2023-01-22 22:38:21.313673: step: 648/533, loss: 0.02644641324877739 2023-01-22 22:38:22.403389: step: 652/533, loss: 0.004775867331773043 2023-01-22 22:38:23.499133: step: 656/533, loss: 0.004322954919189215 2023-01-22 22:38:24.572912: step: 660/533, loss: 0.0017892587929964066 2023-01-22 22:38:25.668384: step: 664/533, loss: 0.032562047243118286 2023-01-22 22:38:26.761142: step: 668/533, loss: 0.006532198283821344 2023-01-22 22:38:27.838773: step: 672/533, loss: 0.005284937564283609 2023-01-22 
22:38:28.922000: step: 676/533, loss: 0.00776162464171648 2023-01-22 22:38:29.989032: step: 680/533, loss: 0.01250469870865345 2023-01-22 22:38:31.075519: step: 684/533, loss: 0.009527524933218956 2023-01-22 22:38:32.177676: step: 688/533, loss: 0.01906448043882847 2023-01-22 22:38:33.282399: step: 692/533, loss: 0.04905780032277107 2023-01-22 22:38:34.376256: step: 696/533, loss: 0.00746331037953496 2023-01-22 22:38:35.467328: step: 700/533, loss: 0.005055660381913185 2023-01-22 22:38:36.555378: step: 704/533, loss: 0.009620510041713715 2023-01-22 22:38:37.666721: step: 708/533, loss: 0.006037093698978424 2023-01-22 22:38:38.750495: step: 712/533, loss: 0.018835173919796944 2023-01-22 22:38:39.847628: step: 716/533, loss: 0.03065245784819126 2023-01-22 22:38:40.960292: step: 720/533, loss: 0.015692785382270813 2023-01-22 22:38:42.053415: step: 724/533, loss: 0.00310829421505332 2023-01-22 22:38:43.146079: step: 728/533, loss: 0.006041796412318945 2023-01-22 22:38:44.279973: step: 732/533, loss: 0.0027411954943090677 2023-01-22 22:38:45.389637: step: 736/533, loss: 0.007090037688612938 2023-01-22 22:38:46.491885: step: 740/533, loss: 0.003693882841616869 2023-01-22 22:38:47.608822: step: 744/533, loss: 0.014240190386772156 2023-01-22 22:38:48.711480: step: 748/533, loss: 0.029635991901159286 2023-01-22 22:38:49.803270: step: 752/533, loss: 0.024593954905867577 2023-01-22 22:38:50.898158: step: 756/533, loss: 0.0072605423629283905 2023-01-22 22:38:51.988266: step: 760/533, loss: 0.021155603229999542 2023-01-22 22:38:53.085824: step: 764/533, loss: 0.0008442546240985394 2023-01-22 22:38:54.194708: step: 768/533, loss: 0.002843779744580388 2023-01-22 22:38:55.298184: step: 772/533, loss: 0.0 2023-01-22 22:38:56.391263: step: 776/533, loss: 0.0042098453268408775 2023-01-22 22:38:57.493145: step: 780/533, loss: 0.014037761837244034 2023-01-22 22:38:58.602364: step: 784/533, loss: 0.011157218366861343 2023-01-22 22:38:59.688461: step: 788/533, loss: 0.0074334233067929745 2023-01-22 22:39:00.793844: step: 792/533, loss: 0.005933855660259724 2023-01-22 22:39:01.873343: step: 796/533, loss: 0.018789594992995262 2023-01-22 22:39:02.983779: step: 800/533, loss: 0.007686153054237366 2023-01-22 22:39:04.083805: step: 804/533, loss: 0.00516678998246789 2023-01-22 22:39:05.162968: step: 808/533, loss: 0.026426780968904495 2023-01-22 22:39:06.276609: step: 812/533, loss: 0.0035205024760216475 2023-01-22 22:39:07.368724: step: 816/533, loss: 0.018345776945352554 2023-01-22 22:39:08.429222: step: 820/533, loss: 0.0026903992984443903 2023-01-22 22:39:09.528426: step: 824/533, loss: 0.00849522091448307 2023-01-22 22:39:10.602947: step: 828/533, loss: 0.00024266712716780603 2023-01-22 22:39:11.717407: step: 832/533, loss: 0.004328001290559769 2023-01-22 22:39:12.846651: step: 836/533, loss: 0.013686156831681728 2023-01-22 22:39:13.941503: step: 840/533, loss: 0.028534643352031708 2023-01-22 22:39:15.030474: step: 844/533, loss: 0.009142156690359116 2023-01-22 22:39:16.118380: step: 848/533, loss: 0.043152861297130585 2023-01-22 22:39:17.215732: step: 852/533, loss: 0.04180504381656647 2023-01-22 22:39:18.308167: step: 856/533, loss: 0.01873677968978882 2023-01-22 22:39:19.411888: step: 860/533, loss: 0.012232224456965923 2023-01-22 22:39:20.523355: step: 864/533, loss: 0.03122578375041485 2023-01-22 22:39:21.651636: step: 868/533, loss: 0.02284267544746399 2023-01-22 22:39:22.744267: step: 872/533, loss: 0.005114917643368244 2023-01-22 22:39:23.818679: step: 876/533, loss: 0.003608610015362501 2023-01-22 
22:39:24.914247: step: 880/533, loss: 0.022983195260167122 2023-01-22 22:39:26.012409: step: 884/533, loss: 0.011115807108581066 2023-01-22 22:39:27.118939: step: 888/533, loss: 0.007405625656247139 2023-01-22 22:39:28.235090: step: 892/533, loss: 0.011830934323370457 2023-01-22 22:39:29.335115: step: 896/533, loss: 0.011407998390495777 2023-01-22 22:39:30.424004: step: 900/533, loss: 0.002506178803741932 2023-01-22 22:39:31.543365: step: 904/533, loss: 0.007663481868803501 2023-01-22 22:39:32.648032: step: 908/533, loss: 0.009060348384082317 2023-01-22 22:39:33.749076: step: 912/533, loss: 0.008394896984100342 2023-01-22 22:39:34.822646: step: 916/533, loss: 0.004666364286094904 2023-01-22 22:39:35.928132: step: 920/533, loss: 0.02784525416791439 2023-01-22 22:39:37.019598: step: 924/533, loss: 0.0032175423111766577 2023-01-22 22:39:38.098908: step: 928/533, loss: 0.004276135470718145 2023-01-22 22:39:39.195588: step: 932/533, loss: 0.01193176954984665 2023-01-22 22:39:40.284670: step: 936/533, loss: 0.006394187454134226 2023-01-22 22:39:41.368061: step: 940/533, loss: 0.011017643846571445 2023-01-22 22:39:42.462733: step: 944/533, loss: 0.003448704956099391 2023-01-22 22:39:43.558842: step: 948/533, loss: 0.0011138966074213386 2023-01-22 22:39:44.657712: step: 952/533, loss: 0.02269626595079899 2023-01-22 22:39:45.758727: step: 956/533, loss: 0.00867333635687828 2023-01-22 22:39:46.855755: step: 960/533, loss: 0.012057257816195488 2023-01-22 22:39:47.961320: step: 964/533, loss: 0.0028111450374126434 2023-01-22 22:39:49.056244: step: 968/533, loss: 0.003028042847290635 2023-01-22 22:39:50.138694: step: 972/533, loss: 0.0003705882409121841 2023-01-22 22:39:51.249374: step: 976/533, loss: 0.0076306890696287155 2023-01-22 22:39:52.356412: step: 980/533, loss: 0.006001919507980347 2023-01-22 22:39:53.424539: step: 984/533, loss: 0.0024036907125264406 2023-01-22 22:39:54.499236: step: 988/533, loss: 0.0006804695003665984 2023-01-22 22:39:55.603940: step: 992/533, loss: 0.008289298042654991 2023-01-22 22:39:56.702413: step: 996/533, loss: 0.05045729875564575 2023-01-22 22:39:57.805819: step: 1000/533, loss: 0.012132973410189152 2023-01-22 22:39:58.922524: step: 1004/533, loss: 0.00616392120718956 2023-01-22 22:40:00.019676: step: 1008/533, loss: 0.004644361790269613 2023-01-22 22:40:01.104795: step: 1012/533, loss: 0.006192962173372507 2023-01-22 22:40:02.206529: step: 1016/533, loss: 0.010288700461387634 2023-01-22 22:40:03.320187: step: 1020/533, loss: 0.0170535109937191 2023-01-22 22:40:04.416909: step: 1024/533, loss: 0.04445505142211914 2023-01-22 22:40:05.519024: step: 1028/533, loss: 0.008840791881084442 2023-01-22 22:40:06.603105: step: 1032/533, loss: 0.013872926123440266 2023-01-22 22:40:07.693161: step: 1036/533, loss: 0.03146233782172203 2023-01-22 22:40:08.782351: step: 1040/533, loss: 0.05298042297363281 2023-01-22 22:40:09.882006: step: 1044/533, loss: 0.005206727888435125 2023-01-22 22:40:10.969298: step: 1048/533, loss: 0.004956633318215609 2023-01-22 22:40:12.093670: step: 1052/533, loss: 0.04634881019592285 2023-01-22 22:40:13.213888: step: 1056/533, loss: 0.0393965058028698 2023-01-22 22:40:14.299149: step: 1060/533, loss: 0.011618509888648987 2023-01-22 22:40:15.396555: step: 1064/533, loss: 0.003678616601973772 2023-01-22 22:40:16.487855: step: 1068/533, loss: 0.005601240321993828 2023-01-22 22:40:17.586218: step: 1072/533, loss: 0.0068100872449576855 2023-01-22 22:40:18.694865: step: 1076/533, loss: 0.009597988799214363 2023-01-22 22:40:19.794998: step: 1080/533, loss: 
0.008503307588398457 2023-01-22 22:40:20.865973: step: 1084/533, loss: 0.01531152706593275 2023-01-22 22:40:21.948134: step: 1088/533, loss: 0.0026673744432628155 2023-01-22 22:40:23.032906: step: 1092/533, loss: 0.0025334900710731745 2023-01-22 22:40:24.164028: step: 1096/533, loss: 0.0071305204182863235 2023-01-22 22:40:25.279498: step: 1100/533, loss: 0.011470625177025795 2023-01-22 22:40:26.391543: step: 1104/533, loss: 0.00866924412548542 2023-01-22 22:40:27.491736: step: 1108/533, loss: 0.013297321274876595 2023-01-22 22:40:28.583930: step: 1112/533, loss: 0.029257912188768387 2023-01-22 22:40:29.706545: step: 1116/533, loss: 0.0039297533221542835 2023-01-22 22:40:30.812279: step: 1120/533, loss: 0.007797276601195335 2023-01-22 22:40:31.912494: step: 1124/533, loss: 0.003927475772798061 2023-01-22 22:40:33.005662: step: 1128/533, loss: 0.011094893328845501 2023-01-22 22:40:34.088136: step: 1132/533, loss: 0.010308323428034782 2023-01-22 22:40:35.165137: step: 1136/533, loss: 0.01924131065607071 2023-01-22 22:40:36.268306: step: 1140/533, loss: 0.007669389247894287 2023-01-22 22:40:37.380510: step: 1144/533, loss: 0.03659879416227341 2023-01-22 22:40:38.465421: step: 1148/533, loss: 0.036211684346199036 2023-01-22 22:40:39.590827: step: 1152/533, loss: 0.0032197199761867523 2023-01-22 22:40:40.679959: step: 1156/533, loss: 0.009942148812115192 2023-01-22 22:40:41.771085: step: 1160/533, loss: 0.03760348632931709 2023-01-22 22:40:42.887238: step: 1164/533, loss: 0.009087193757295609 2023-01-22 22:40:43.982361: step: 1168/533, loss: 0.006520627997815609 2023-01-22 22:40:45.095931: step: 1172/533, loss: 0.007842054590582848 2023-01-22 22:40:46.188808: step: 1176/533, loss: 0.02118450589478016 2023-01-22 22:40:47.284446: step: 1180/533, loss: 0.002092203591018915 2023-01-22 22:40:48.382803: step: 1184/533, loss: 0.008313585072755814 2023-01-22 22:40:49.468734: step: 1188/533, loss: 0.015645988285541534 2023-01-22 22:40:50.571169: step: 1192/533, loss: 0.018593642860651016 2023-01-22 22:40:51.670454: step: 1196/533, loss: 0.049543894827365875 2023-01-22 22:40:52.760316: step: 1200/533, loss: 0.021710002794861794 2023-01-22 22:40:53.901000: step: 1204/533, loss: 0.00868787057697773 2023-01-22 22:40:54.992752: step: 1208/533, loss: 0.0015336856013163924 2023-01-22 22:40:56.106460: step: 1212/533, loss: 0.00531360087916255 2023-01-22 22:40:57.201112: step: 1216/533, loss: 0.004147379659116268 2023-01-22 22:40:58.329391: step: 1220/533, loss: 0.005360549781471491 2023-01-22 22:40:59.432868: step: 1224/533, loss: 0.0031740490812808275 2023-01-22 22:41:00.529465: step: 1228/533, loss: 0.002876545302569866 2023-01-22 22:41:01.639408: step: 1232/533, loss: 0.0034809508360922337 2023-01-22 22:41:02.724771: step: 1236/533, loss: 0.003415369661524892 2023-01-22 22:41:03.835987: step: 1240/533, loss: 0.0038287120405584574 2023-01-22 22:41:04.947338: step: 1244/533, loss: 0.008860662579536438 2023-01-22 22:41:06.042502: step: 1248/533, loss: 0.007989543490111828 2023-01-22 22:41:07.147547: step: 1252/533, loss: 0.019805900752544403 2023-01-22 22:41:08.235166: step: 1256/533, loss: 0.00515039311721921 2023-01-22 22:41:09.336104: step: 1260/533, loss: 0.005152449943125248 2023-01-22 22:41:10.429817: step: 1264/533, loss: 0.06558722257614136 2023-01-22 22:41:11.535889: step: 1268/533, loss: 0.046278487890958786 2023-01-22 22:41:12.638153: step: 1272/533, loss: 0.0010448063258081675 2023-01-22 22:41:13.735626: step: 1276/533, loss: 0.015850402414798737 2023-01-22 22:41:14.833402: step: 1280/533, loss: 
0.007296034134924412 2023-01-22 22:41:15.928910: step: 1284/533, loss: 0.012302565388381481 2023-01-22 22:41:17.047326: step: 1288/533, loss: 0.015379000455141068 2023-01-22 22:41:18.138433: step: 1292/533, loss: 0.003552776761353016 2023-01-22 22:41:19.227202: step: 1296/533, loss: 0.004709901288151741 2023-01-22 22:41:20.371643: step: 1300/533, loss: 0.009566957131028175 2023-01-22 22:41:21.486919: step: 1304/533, loss: 0.007086416240781546 2023-01-22 22:41:22.608361: step: 1308/533, loss: 0.008032411336898804 2023-01-22 22:41:23.706353: step: 1312/533, loss: 0.016809573397040367 2023-01-22 22:41:24.797127: step: 1316/533, loss: 0.006960244849324226 2023-01-22 22:41:25.916196: step: 1320/533, loss: 0.008412796072661877 2023-01-22 22:41:27.018966: step: 1324/533, loss: 0.01985945738852024 2023-01-22 22:41:28.090541: step: 1328/533, loss: 0.00668836385011673 2023-01-22 22:41:29.193879: step: 1332/533, loss: 0.011092136614024639 2023-01-22 22:41:30.293431: step: 1336/533, loss: 0.019724303856492043 2023-01-22 22:41:31.392635: step: 1340/533, loss: 0.014218992553651333 2023-01-22 22:41:32.457539: step: 1344/533, loss: 0.0051703364588320255 2023-01-22 22:41:33.557655: step: 1348/533, loss: 0.022777661681175232 2023-01-22 22:41:34.645667: step: 1352/533, loss: 0.01124496664851904 2023-01-22 22:41:35.756194: step: 1356/533, loss: 0.003010059939697385 2023-01-22 22:41:36.838797: step: 1360/533, loss: 9.990805119741708e-05 2023-01-22 22:41:37.967353: step: 1364/533, loss: 0.13131920993328094 2023-01-22 22:41:39.049982: step: 1368/533, loss: 0.006658780388534069 2023-01-22 22:41:40.137081: step: 1372/533, loss: 0.0027997377328574657 2023-01-22 22:41:41.262269: step: 1376/533, loss: 0.003025772050023079 2023-01-22 22:41:42.359323: step: 1380/533, loss: 0.02321680821478367 2023-01-22 22:41:43.434781: step: 1384/533, loss: 0.006161246448755264 2023-01-22 22:41:44.527603: step: 1388/533, loss: 0.04634488746523857 2023-01-22 22:41:45.646540: step: 1392/533, loss: 0.039342641830444336 2023-01-22 22:41:46.726932: step: 1396/533, loss: 0.005271706264466047 2023-01-22 22:41:47.853423: step: 1400/533, loss: 0.012976311147212982 2023-01-22 22:41:48.961483: step: 1404/533, loss: 0.02522117644548416 2023-01-22 22:41:50.053585: step: 1408/533, loss: 0.0011854091426357627 2023-01-22 22:41:51.170482: step: 1412/533, loss: 0.007242171093821526 2023-01-22 22:41:52.288644: step: 1416/533, loss: 0.020117472857236862 2023-01-22 22:41:53.371600: step: 1420/533, loss: 0.0024570454843342304
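
Note on the step/loss lines above: each entry has the form "<timestamp>: step: <n>/<total>, loss: <value>". The step counter advances in increments of 4, matching --accumulate_step 4, and it runs past the printed total of 533, so the numerator appears to count micro-batches while the denominator appears to count optimizer updates. The actual train.py is not reproduced in this log; the snippet below is only a minimal sketch of a loop that would emit lines in this format, assuming gradient accumulation over 4 micro-batches (all names here are illustrative, not taken from the real code).

import datetime

def train_one_epoch(model, loader, optimizer, accumulate_step=4):
    # Hypothetical sketch, not the project's actual training loop; it only
    # shows how log lines in the format above could be produced.
    model.train()
    optimizer.zero_grad()
    num_updates = len(loader) // accumulate_step      # would print as the "/533" total
    for i, batch in enumerate(loader, start=1):
        loss = model(**batch)                         # assumes the model returns a scalar loss
        (loss / accumulate_step).backward()           # scale so accumulated gradients average
        if i % accumulate_step == 0:
            optimizer.step()
            optimizer.zero_grad()
            # One line per optimizer update; str(datetime.now()) matches the
            # "2023-01-22 22:35:24.892633" timestamp style seen in the log.
            print(f"{datetime.datetime.now()}: step: {i}/{num_updates}, loss: {loss.item()}")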
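
Note on the "Current best result" block above (printed before the Epoch 2 step log): each f1 is the usual harmonic mean of the printed p and r, and every 'combined' value equals the product of the template f1 and the slot f1 (for the "Dev for Chinese" entry, 0.7368421052631579 * 0.32180130656469097 gives 0.23711675220556175, the logged combined value). The block also appears to track, per language, the epoch with the best dev result so far ('epoch': 1 for Chinese and Russian, 'epoch': 0 for Korean). The helper below is only a sketch that reproduces these relationships; it is not the project's actual scorer.

def f1(p, r):
    # Standard F1: harmonic mean of precision and recall (0.0 when both are 0).
    return 2 * p * r / (p + r) if (p + r) > 0 else 0.0

def combined(template_f1, slot_f1):
    # 'combined' in the log equals template f1 times slot f1 for every printed entry.
    return template_f1 * slot_f1

# Example: the "Dev for Chinese" entry above.
t = f1(1.0, 0.5833333333333334)                    # ~0.7368421052631579
s = f1(0.32428147077713554, 0.31935879190385835)   # ~0.3218013065646910
print(combined(t, s))                              # ~0.2371167522055617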