File to follow along in class
# Movie review corpus used throughout this exercise.
reviews_corp <- data_corpus_moviereviews
# Tokenise explicitly before building the document-feature matrix: calling
# dfm() directly on a corpus, and passing tokens() arguments such as
# remove_punct through dfm(), is deprecated and triggers warnings.
reviews_dfm <- dfm(tokens(reviews_corp, remove_punct = TRUE))
# Fix the RNG seed so the random train/test split (and every result derived
# from it) is reproducible. NOTE: the output printed in this document came
# from an unseeded run, so the sampled documents and counts will differ.
set.seed(1234)
# Randomly draw 80% of the documents as the training set.
reviews_train <- dfm_sample(reviews_dfm, size = 0.8 * ndoc(reviews_dfm))
# Every document that was not drawn for training forms the test set.
reviews_test <- dfm_subset(
  reviews_dfm,
  !(docnames(reviews_dfm) %in% docnames(reviews_train))
)
head(reviews_train, 3)
## Document-feature matrix of: 3 documents, 48,339 features (99.20% sparse) and 3 docvars.
## features
## docs plot two teen couples go to a church party drink
## cv906_12332.txt 2 3 0 0 2 27 30 0 0 0
## cv729_10154.txt 0 5 0 0 2 30 30 0 0 0
## cv098_15435.txt 1 0 0 0 0 11 9 0 0 0
## [ reached max_nfeat ... 48,329 more features ]
# Peek at the first three documents of the held-out test set.
head(reviews_test, n = 3)
## Document-feature matrix of: 3 documents, 48,339 features (99.47% sparse) and 3 docvars.
## features
## docs plot two teen couples go to a church party drink
## cv000_29416.txt 1 2 4 1 2 16 14 1 1 1
## cv001_19502.txt 0 0 0 0 0 2 13 0 0 0
## cv002_17424.txt 2 1 0 0 2 6 10 0 0 0
## [ reached max_nfeat ... 48,329 more features ]
# Keep only features that occur at least once in the training documents.
reviews_train <- dfm_trim(reviews_train, min_termfreq = 1)
# Give the test dfm exactly the feature set of the trimmed training dfm.
reviews_test <- dfm_match(reviews_test, features = featnames(reviews_train))
head(reviews_test, n = 3)
## Document-feature matrix of: 3 documents, 43,293 features (99.42% sparse) and 3 docvars.
## features
## docs plot two teen couples go to a church party drink
## cv000_29416.txt 1 2 4 1 2 16 14 1 1 1
## cv001_19502.txt 0 0 0 0 0 2 13 0 0 0
## cv002_17424.txt 2 1 0 0 2 6 10 0 0 0
## [ reached max_nfeat ... 43,283 more features ]
# Start the clock for the Naive Bayes fit.
start <- Sys.time()
# Fit a Naive Bayes classifier on the training dfm, using the "sentiment"
# document variable as the class label.
nb_model <- textmodel_nb(
  x = reviews_train,
  y = docvars(reviews_train, "sentiment")
)
nb_model
##
## Call:
## textmodel_nb.dfm(x = reviews_train, y = docvars(reviews_train,
## "sentiment"))
##
## Distribution: multinomial ; priors: 0.5 0.5 ; smoothing value: 1 ; 1600 training documents; fitted features.
# Stop the clock and report the time elapsed since `start`.
end <- Sys.time()
difftime(end, start)
## Time difference of 0.04001403 secs
# Predict the class of each test document with the Naive Bayes model.
test_predictions <- predict(nb_model, newdata = reviews_test)
head(test_predictions, n = 5)
## cv000_29416.txt cv001_19502.txt cv002_17424.txt cv009_29417.txt cv010_29063.txt
## neg neg neg neg pos
## Levels: neg pos
# Cross-tabulate the true labels (rows) against the Naive Bayes
# predictions (columns).
table(
  docvars(reviews_test, "sentiment"),
  test_predictions
)
## test_predictions
## neg pos
## neg 168 34
## pos 35 163
Try using the Support Vector Machine classifier instead. To do so, change the classification command to textmodel_svm(). Measure the run time to assess whether this classifier is faster than Naive Bayes.
# Start the clock for the SVM fit.
start <- Sys.time()
# Fit a support vector machine classifier on the training dfm, using the
# "sentiment" document variable as the class label.
svm_model <- textmodel_svm(
  x = reviews_train,
  y = docvars(reviews_train, "sentiment")
)
svm_model
##
## Call:
## textmodel_svm.dfm(x = reviews_train, y = docvars(reviews_train,
## "sentiment"))
##
## 1,600 training documents; 43,294 fitted features.
## Method: L2-regularized L2-loss support vector classification dual (L2R_L2LOSS_SVC_DUAL)
# Stop the clock and report the time elapsed since `start`.
end <- Sys.time()
difftime(end, start)
## Time difference of 5.099052 secs
Predict the test data and look at the cross-tabulation to see how well the model performs compared to the Naive Bayes classifier. If you still have the results from your Naive Bayes model stored, create a confusion matrix for both models.
# Predict the class of each test document with the SVM model.
predictions_svm <- predict(svm_model, newdata = reviews_test)
head(predictions_svm, n = 5)
## cv000_29416.txt cv001_19502.txt cv002_17424.txt cv009_29417.txt cv010_29063.txt
## neg neg neg neg pos
## Levels: neg pos
# Cross-tabulate the true labels (rows) against the SVM predictions
# (columns).
table(
  docvars(reviews_test, "sentiment"),
  predictions_svm
)
## predictions_svm
## neg pos
## neg 167 35
## pos 34 164
# Confusion matrix and summary statistics for the Naive Bayes model.
# caret::confusionMatrix() takes the predicted classes as `data` and the
# true classes as `reference`; passing them the other way round transposes
# the table and swaps sensitivity/specificity with the positive/negative
# predictive values. NOTE: the output printed below was produced with the
# arguments swapped, so its "Prediction" rows are actually the true labels.
caret::confusionMatrix(
  data = test_predictions,
  reference = docvars(reviews_test, "sentiment")
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 168 34
## pos 35 163
##
## Accuracy : 0.8275
## 95% CI : (0.7868, 0.8632)
## No Information Rate : 0.5075
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6549
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8276
## Specificity : 0.8274
## Pos Pred Value : 0.8317
## Neg Pred Value : 0.8232
## Prevalence : 0.5075
## Detection Rate : 0.4200
## Detection Prevalence : 0.5050
## Balanced Accuracy : 0.8275
##
## 'Positive' Class : neg
##
# Confusion matrix and summary statistics for the SVM model.
# caret::confusionMatrix() takes the predicted classes as `data` and the
# true classes as `reference`; passing them the other way round transposes
# the table and swaps sensitivity/specificity with the positive/negative
# predictive values. NOTE: the output printed below was produced with the
# arguments swapped, so its "Prediction" rows are actually the true labels.
caret::confusionMatrix(
  data = predictions_svm,
  reference = docvars(reviews_test, "sentiment")
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 167 35
## pos 34 164
##
## Accuracy : 0.8275
## 95% CI : (0.7868, 0.8632)
## No Information Rate : 0.5025
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.655
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8308
## Specificity : 0.8241
## Pos Pred Value : 0.8267
## Neg Pred Value : 0.8283
## Prevalence : 0.5025
## Detection Rate : 0.4175
## Detection Prevalence : 0.5050
## Balanced Accuracy : 0.8275
##
## 'Positive' Class : neg
##