2020-05-04 09:28:36,738 ---------------------------------------------------------------------------------------------------- 2020-05-04 09:28:36,740 Model: "TextClassifier( (document_embeddings): TransformerDocumentEmbeddings( (model): RobertaModel( (embeddings): RobertaEmbeddings( (word_embeddings): Embedding(50265, 768, padding_idx=1) (position_embeddings): Embedding(514, 768, padding_idx=1) (token_type_embeddings): Embedding(1, 768) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): BertEncoder( (layer): ModuleList( (0): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (1): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (2): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (3): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (4): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (5): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (6): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (7): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (8): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (9): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (10): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (11): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (pooler): BertPooler( (dense): Linear(in_features=768, out_features=768, bias=True) (activation): Tanh() ) ) ) (decoder): Linear(in_features=768, out_features=2, bias=True) (loss_function): CrossEntropyLoss() (beta): 1.0 (weights): None (weight_tensor) None )" 2020-05-04 09:28:36,740 ---------------------------------------------------------------------------------------------------- 2020-05-04 09:28:36,740 Corpus: "Corpus: 10265 train + 1140 dev + 1267 test sentences Corpus: 54533 train + 6059 dev + 6732 test sentences Corpus: 8611 train + 957 dev + 1063 test sentences Corpus: 209098 train + 23233 dev + 25814 test sentences" 2020-05-04 09:28:36,740 ---------------------------------------------------------------------------------------------------- 2020-05-04 09:28:36,740 Parameters: 2020-05-04 09:28:36,741 - learning_rate: "3e-05" 2020-05-04 09:28:36,741 - mini_batch_size: "256" 2020-05-04 09:28:36,741 - patience: "1" 2020-05-04 09:28:36,741 - anneal_factor: "0.1" 2020-05-04 09:28:36,741 - max_epochs: "20" 2020-05-04 09:28:36,741 - shuffle: "True" 2020-05-04 09:28:36,741 - train_with_dev: "False" 2020-05-04 09:28:36,741 - batch_growth_annealing: "False" 2020-05-04 09:28:36,742 ---------------------------------------------------------------------------------------------------- 2020-05-04 09:28:36,742 Model training base path: "resources/taggers/sentiment-curated-roberta-base-classic-256-0" 2020-05-04 09:28:36,742 ---------------------------------------------------------------------------------------------------- 2020-05-04 09:28:36,742 Device: cuda:0 2020-05-04 09:28:36,742 ---------------------------------------------------------------------------------------------------- 2020-05-04 09:28:36,742 Embeddings storage mode: none 2020-05-04 09:28:36,745 ---------------------------------------------------------------------------------------------------- 2020-05-04 09:38:47,720 epoch 1 - iter 110/1104 - loss 0.27618256 - samples/sec: 46.88 2020-05-04 09:48:58,684 epoch 1 - iter 220/1104 - loss 0.21547991 - samples/sec: 46.74 2020-05-04 09:59:07,826 epoch 1 - iter 330/1104 - loss 0.18181421 - samples/sec: 46.89 2020-05-04 10:09:17,520 epoch 1 - iter 440/1104 - loss 0.16649143 - samples/sec: 46.84 2020-05-04 10:19:29,576 epoch 1 - iter 550/1104 - loss 0.16208020 - samples/sec: 46.69 2020-05-04 10:29:36,510 epoch 1 - iter 660/1104 - loss 0.15414644 - samples/sec: 47.09 2020-05-04 10:39:47,305 epoch 1 - iter 770/1104 - loss 0.14536825 - samples/sec: 46.77 2020-05-04 10:49:56,344 epoch 1 - iter 880/1104 - loss 0.14161813 - samples/sec: 46.90 2020-05-04 11:00:05,769 epoch 1 - iter 990/1104 - loss 0.13813618 - samples/sec: 46.87 2020-05-04 11:10:13,590 epoch 1 - iter 1100/1104 - loss 0.13732443 - samples/sec: 47.01 2020-05-04 11:10:34,047 ---------------------------------------------------------------------------------------------------- 2020-05-04 11:10:34,050 EPOCH 1 done: loss 0.1370 - lr 0.0000300 2020-05-04 11:14:56,581 DEV : loss 0.10124450922012329 - score 0.9658 2020-05-04 11:19:59,037 TEST : loss 0.10412824898958206 - score 0.9654 2020-05-04 11:20:12,379 BAD EPOCHS (no improvement): 0 2020-05-04 11:20:15,238 ---------------------------------------------------------------------------------------------------- 2020-05-04 11:30:26,266 epoch 2 - iter 110/1104 - loss 0.09490479 - samples/sec: 46.83 2020-05-04 11:40:36,434 epoch 2 - iter 220/1104 - loss 0.09808639 - samples/sec: 46.81 2020-05-04 11:50:44,487 epoch 2 - iter 330/1104 - loss 0.09789896 - samples/sec: 46.94 2020-05-04 12:00:51,084 epoch 2 - iter 440/1104 - loss 0.09304061 - samples/sec: 46.95 2020-05-04 12:10:59,785 epoch 2 - iter 550/1104 - loss 0.08885015 - samples/sec: 46.78 2020-05-04 12:21:07,864 epoch 2 - iter 660/1104 - loss 0.09030596 - samples/sec: 46.82 2020-05-04 12:31:16,205 epoch 2 - iter 770/1104 - loss 0.08785450 - samples/sec: 46.81 2020-05-04 12:41:21,701 epoch 2 - iter 880/1104 - loss 0.08842319 - samples/sec: 47.04 2020-05-04 12:51:29,990 epoch 2 - iter 990/1104 - loss 0.08758730 - samples/sec: 46.80 2020-05-04 13:01:35,244 epoch 2 - iter 1100/1104 - loss 0.08681568 - samples/sec: 47.05 2020-05-04 13:01:54,865 ---------------------------------------------------------------------------------------------------- 2020-05-04 13:01:54,869 EPOCH 2 done: loss 0.0866 - lr 0.0000300 2020-05-04 13:06:22,687 DEV : loss 0.10372272878885269 - score 0.967 2020-05-04 13:11:31,526 TEST : loss 0.10546315461397171 - score 0.9667 2020-05-04 13:11:44,967 BAD EPOCHS (no improvement): 0 2020-05-04 13:11:47,803 ---------------------------------------------------------------------------------------------------- 2020-05-04 13:21:55,143 epoch 3 - iter 110/1104 - loss 0.07518043 - samples/sec: 47.01 2020-05-04 13:32:04,810 epoch 3 - iter 220/1104 - loss 0.07190887 - samples/sec: 46.77 2020-05-04 13:42:15,802 epoch 3 - iter 330/1104 - loss 0.07090306 - samples/sec: 46.72 2020-05-04 13:52:23,071 epoch 3 - iter 440/1104 - loss 0.07599360 - samples/sec: 47.00 2020-05-04 14:02:33,933 epoch 3 - iter 550/1104 - loss 0.07149043 - samples/sec: 46.69 2020-05-04 14:12:40,817 epoch 3 - iter 660/1104 - loss 0.06918399 - samples/sec: 46.92 2020-05-04 14:22:46,857 epoch 3 - iter 770/1104 - loss 0.06880568 - samples/sec: 46.99 2020-05-04 14:32:53,138 epoch 3 - iter 880/1104 - loss 0.07017511 - samples/sec: 46.96 2020-05-04 14:42:59,497 epoch 3 - iter 990/1104 - loss 0.06896419 - samples/sec: 46.96 2020-05-04 14:53:05,983 epoch 3 - iter 1100/1104 - loss 0.07216313 - samples/sec: 46.95 2020-05-04 14:53:25,703 ---------------------------------------------------------------------------------------------------- 2020-05-04 14:53:25,707 EPOCH 3 done: loss 0.0720 - lr 0.0000300 2020-05-04 14:57:47,526 DEV : loss 0.1030198335647583 - score 0.9687 2020-05-04 15:02:49,813 TEST : loss 0.10596299171447754 - score 0.9681 2020-05-04 15:03:03,184 BAD EPOCHS (no improvement): 0 2020-05-04 15:03:05,994 ---------------------------------------------------------------------------------------------------- 2020-05-04 15:13:14,925 epoch 4 - iter 110/1104 - loss 0.03622629 - samples/sec: 46.90 2020-05-04 15:23:21,537 epoch 4 - iter 220/1104 - loss 0.04910018 - samples/sec: 46.94 2020-05-04 15:33:29,341 epoch 4 - iter 330/1104 - loss 0.05830612 - samples/sec: 46.95 2020-05-04 15:43:38,523 epoch 4 - iter 440/1104 - loss 0.05589168 - samples/sec: 46.84 2020-05-04 15:53:46,479 epoch 4 - iter 550/1104 - loss 0.05483489 - samples/sec: 46.93 2020-05-04 16:03:55,124 epoch 4 - iter 660/1104 - loss 0.05842726 - samples/sec: 46.85 2020-05-04 16:14:03,729 epoch 4 - iter 770/1104 - loss 0.05913065 - samples/sec: 46.88 2020-05-04 16:24:09,138 epoch 4 - iter 880/1104 - loss 0.06015629 - samples/sec: 47.11 2020-05-04 16:34:16,066 epoch 4 - iter 990/1104 - loss 0.06010633 - samples/sec: 46.91 2020-05-04 16:44:27,138 epoch 4 - iter 1100/1104 - loss 0.06104814 - samples/sec: 46.64 2020-05-04 16:44:47,310 ---------------------------------------------------------------------------------------------------- 2020-05-04 16:44:47,313 EPOCH 4 done: loss 0.0609 - lr 0.0000300 2020-05-04 16:49:15,396 DEV : loss 0.10930655151605606 - score 0.968 2020-05-04 16:54:24,072 TEST : loss 0.11596352607011795 - score 0.9662 2020-05-04 16:54:37,413 BAD EPOCHS (no improvement): 1 2020-05-04 16:54:37,414 ---------------------------------------------------------------------------------------------------- 2020-05-04 17:04:50,086 epoch 5 - iter 110/1104 - loss 0.03476773 - samples/sec: 46.69 2020-05-04 17:15:00,352 epoch 5 - iter 220/1104 - loss 0.05265532 - samples/sec: 46.74 2020-05-04 17:25:09,516 epoch 5 - iter 330/1104 - loss 0.05338909 - samples/sec: 46.85 2020-05-04 17:35:18,291 epoch 5 - iter 440/1104 - loss 0.05414768 - samples/sec: 46.90 2020-05-04 17:45:27,441 epoch 5 - iter 550/1104 - loss 0.05174209 - samples/sec: 46.83 2020-05-04 17:55:34,002 epoch 5 - iter 660/1104 - loss 0.05396556 - samples/sec: 47.04 2020-05-04 18:05:40,116 epoch 5 - iter 770/1104 - loss 0.05363536 - samples/sec: 47.07 2020-05-04 18:15:49,021 epoch 5 - iter 880/1104 - loss 0.05376992 - samples/sec: 46.76 2020-05-04 18:25:58,603 epoch 5 - iter 990/1104 - loss 0.05320980 - samples/sec: 46.74 2020-05-04 18:36:09,067 epoch 5 - iter 1100/1104 - loss 0.05187936 - samples/sec: 46.75 2020-05-04 18:36:28,919 ---------------------------------------------------------------------------------------------------- 2020-05-04 18:36:28,921 EPOCH 5 done: loss 0.0518 - lr 0.0000300 2020-05-04 18:40:52,005 DEV : loss 0.12184158712625504 - score 0.9684 2020-05-04 18:45:54,400 TEST : loss 0.12647730112075806 - score 0.9685 2020-05-04 18:46:07,599 BAD EPOCHS (no improvement): 2 2020-05-04 18:46:07,600 ---------------------------------------------------------------------------------------------------- 2020-05-04 18:56:16,797 epoch 6 - iter 110/1104 - loss 0.02974166 - samples/sec: 46.98 2020-05-04 19:06:25,553 epoch 6 - iter 220/1104 - loss 0.03402215 - samples/sec: 46.85 2020-05-04 19:16:37,702 epoch 6 - iter 330/1104 - loss 0.03318033 - samples/sec: 46.58 2020-05-04 19:26:44,355 epoch 6 - iter 440/1104 - loss 0.03306723 - samples/sec: 47.02 2020-05-04 19:36:50,809 epoch 6 - iter 550/1104 - loss 0.03498884 - samples/sec: 47.01 2020-05-04 19:46:58,719 epoch 6 - iter 660/1104 - loss 0.03472797 - samples/sec: 46.93 2020-05-04 19:57:09,335 epoch 6 - iter 770/1104 - loss 0.03540372 - samples/sec: 46.73 2020-05-04 20:07:19,356 epoch 6 - iter 880/1104 - loss 0.03533914 - samples/sec: 46.75 2020-05-04 20:17:29,277 epoch 6 - iter 990/1104 - loss 0.03474279 - samples/sec: 46.76 2020-05-04 20:27:38,895 epoch 6 - iter 1100/1104 - loss 0.03448066 - samples/sec: 46.82 2020-05-04 20:27:58,609 ---------------------------------------------------------------------------------------------------- 2020-05-04 20:27:58,613 EPOCH 6 done: loss 0.0344 - lr 0.0000030 2020-05-04 20:32:22,602 DEV : loss 0.1384792923927307 - score 0.9685 2020-05-04 20:37:29,018 TEST : loss 0.14346325397491455 - score 0.9688 2020-05-04 20:37:42,696 BAD EPOCHS (no improvement): 1 2020-05-04 20:37:42,697 ---------------------------------------------------------------------------------------------------- 2020-05-04 20:47:57,225 epoch 7 - iter 110/1104 - loss 0.02621884 - samples/sec: 46.63