2020-11-22 21:40:17,521 ---------------------------------------------------------------------------------------------------- 2020-11-22 21:40:17,523 Model: "TARSClassifier( (document_embeddings): None (decoder): None (loss_function): None (tars_model): TextClassifier( (document_embeddings): TransformerDocumentEmbeddings( (model): BertModel( (embeddings): BertEmbeddings( (word_embeddings): Embedding(30522, 768, padding_idx=0) (position_embeddings): Embedding(512, 768) (token_type_embeddings): Embedding(2, 768) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): BertEncoder( (layer): ModuleList( (0): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (1): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (2): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (3): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (4): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (5): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (6): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (7): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (8): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (9): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (10): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (11): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (pooler): BertPooler( (dense): Linear(in_features=768, out_features=768, bias=True) (activation): Tanh() ) ) ) (decoder): Linear(in_features=768, out_features=2, bias=True) (loss_function): CrossEntropyLoss() ) (beta): 1.0 (weights): None (weight_tensor) None )" 2020-11-22 21:40:17,524 ---------------------------------------------------------------------------------------------------- 2020-11-22 21:40:17,524 Corpus: "Corpus: 49999 train + 32499 dev + 50000 test sentences" 2020-11-22 21:40:17,524 ---------------------------------------------------------------------------------------------------- 2020-11-22 21:40:17,524 Parameters: 2020-11-22 21:40:17,524 - learning_rate: "0.00015625" 2020-11-22 21:40:17,524 - mini_batch_size: "8" 2020-11-22 21:40:17,524 - patience: "3" 2020-11-22 21:40:17,524 - anneal_factor: "0.5" 2020-11-22 21:40:17,524 - max_epochs: "1" 2020-11-22 21:40:17,524 - shuffle: "True" 2020-11-22 21:40:17,524 - train_with_dev: "False" 2020-11-22 21:40:17,524 - batch_growth_annealing: "False" 2020-11-22 21:40:17,524 ---------------------------------------------------------------------------------------------------- 2020-11-22 21:40:17,524 Model training base path: "resources/71" 2020-11-22 21:40:17,524 ---------------------------------------------------------------------------------------------------- 2020-11-22 21:40:17,524 Device: cuda:0 2020-11-22 21:40:17,524 ---------------------------------------------------------------------------------------------------- 2020-11-22 21:40:17,524 Embeddings storage mode: cpu 2020-11-22 21:40:17,530 ---------------------------------------------------------------------------------------------------- 2020-11-22 21:46:59,877 epoch 1 - iter 625/6250 - loss 0.19234078 - samples/sec: 12.66 - lr: 0.000156 2020-11-22 21:53:34,840 epoch 1 - iter 1250/6250 - loss 0.19418782 - samples/sec: 12.93 - lr: 0.000156 2020-11-22 22:00:15,674 epoch 1 - iter 1875/6250 - loss 0.19202128 - samples/sec: 12.70 - lr: 0.000156 2020-11-22 22:06:57,779 epoch 1 - iter 2500/6250 - loss 0.18980797 - samples/sec: 12.66 - lr: 0.000156 2020-11-22 22:13:36,031 epoch 1 - iter 3125/6250 - loss 0.18928352 - samples/sec: 12.78 - lr: 0.000156 2020-11-22 22:20:17,689 epoch 1 - iter 3750/6250 - loss 0.18834419 - samples/sec: 12.71 - lr: 0.000156 2020-11-22 22:27:02,815 epoch 1 - iter 4375/6250 - loss 0.18895342 - samples/sec: 12.56 - lr: 0.000156 2020-11-22 22:33:33,752 epoch 1 - iter 5000/6250 - loss 0.18867368 - samples/sec: 13.02 - lr: 0.000156 2020-11-22 22:40:07,054 epoch 1 - iter 5625/6250 - loss 0.18815422 - samples/sec: 12.95 - lr: 0.000156 2020-11-22 22:46:51,037 epoch 1 - iter 6250/6250 - loss 0.18739277 - samples/sec: 12.61 - lr: 0.000156 2020-11-22 22:46:51,544 ---------------------------------------------------------------------------------------------------- 2020-11-22 22:46:51,545 EPOCH 1 done: loss 0.1874 - lr 0.0001563 2020-11-22 23:15:29,043 DEV : loss 0.41572660207748413 - score 0.6571 2020-11-22 23:17:11,714 BAD EPOCHS (no improvement): 0 2020-11-22 23:17:15,045 ---------------------------------------------------------------------------------------------------- 2020-11-22 23:17:15,045 Testing using best model ... 2020-11-22 23:17:15,046 loading file resources/71/best-model.pt 2020-11-23 00:01:39,658 0.6527 2020-11-23 00:01:39,658 Results: - F-score (micro) 0.6527 - F-score (macro) 0.6527 - Accuracy 0.6527 By class: precision recall f1-score support positive_restaurant_sentiment_4 0.5679 0.5871 0.5773 10000 neutral_restaurant_sentiment_3 0.5942 0.5823 0.5882 10000 very_negative_restaurant_sentiment_1 0.7612 0.7742 0.7676 10000 negative_restaurant_sentiment_2 0.6050 0.5986 0.6018 10000 very_positive_restaurant_sentiment_5 0.7361 0.7211 0.7285 10000 micro avg 0.6527 0.6527 0.6527 50000 macro avg 0.6529 0.6527 0.6527 50000 weighted avg 0.6529 0.6527 0.6527 50000 samples avg 0.6527 0.6527 0.6527 50000 2020-11-23 00:01:39,659 ----------------------------------------------------------------------------------------------------