Text Classification

File to follow along in class

Movie reviews corpus

# Start from the movie review corpus.
reviews_corp <- data_corpus_moviereviews
# Tokenize explicitly (dropping punctuation) before building the
# document-feature matrix: calling dfm() directly on a corpus and passing
# tokens() arguments through it is deprecated.
reviews_dfm <- dfm(tokens(reviews_corp, remove_punct = TRUE))

## Warning: 'dfm.corpus()' is deprecated. Use 'tokens()' first.

## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.

Sampling

# Fix the RNG seed so the random train/test split (and every result derived
# from it) is reproducible between runs.
set.seed(42)
# Draw 80% of the documents at random as the training set.
reviews_train <- dfm_sample(reviews_dfm, size = 0.8 * ndoc(reviews_dfm))
# The test set is every document that was not drawn for training.
reviews_test <- dfm_subset(
  reviews_dfm,
  !(docnames(reviews_dfm) %in% docnames(reviews_train))
)


# Peek at the first three training documents.
head(reviews_train, n = 3)

## Document-feature matrix of: 3 documents, 48,339 features (99.20% sparse) and 3 docvars.
##                  features
## docs              plot two teen couples go to  a church party drink
##   cv906_12332.txt    2   3    0       0  2 27 30      0     0     0
##   cv729_10154.txt    0   5    0       0  2 30 30      0     0     0
##   cv098_15435.txt    1   0    0       0  0 11  9      0     0     0
## [ reached max_nfeat ... 48,329 more features ]

# Peek at the first three test documents.
head(reviews_test, n = 3)

## Document-feature matrix of: 3 documents, 48,339 features (99.47% sparse) and 3 docvars.
##                  features
## docs              plot two teen couples go to  a church party drink
##   cv000_29416.txt    1   2    4       1  2 16 14      1     1     1
##   cv001_19502.txt    0   0    0       0  0  2 13      0     0     0
##   cv002_17424.txt    2   1    0       0  2  6 10      0     0     0
## [ reached max_nfeat ... 48,329 more features ]

Adjusting Feature Set

# Keep only features that occur at least once in the training documents.
reviews_train <- dfm_trim(reviews_train, min_termfreq = 1)
# Align the test dfm to exactly the feature set of the trimmed training dfm.
reviews_test <- dfm_match(reviews_test, features = featnames(reviews_train))

# Inspect the first three test documents after matching features.
head(reviews_test, n = 3)

## Document-feature matrix of: 3 documents, 43,293 features (99.42% sparse) and 3 docvars.
##                  features
## docs              plot two teen couples go to  a church party drink
##   cv000_29416.txt    1   2    4       1  2 16 14      1     1     1
##   cv001_19502.txt    0   0    0       0  0  2 13      0     0     0
##   cv002_17424.txt    2   1    0       0  2  6 10      0     0     0
## [ reached max_nfeat ... 43,283 more features ]

Training the model

# Record the time before fitting so the training duration can be reported.
start <- Sys.time()

# Fit a Naive Bayes model on the training dfm, using the "sentiment"
# document variable as the class label, then print the fitted model.
nb_model <- textmodel_nb(
  x = reviews_train,
  y = docvars(reviews_train, "sentiment")
)
nb_model

## 
## Call:
## textmodel_nb.dfm(x = reviews_train, y = docvars(reviews_train, 
##     "sentiment"))
## 
##  Distribution: multinomial ; priors: 0.5 0.5 ; smoothing value: 1 ; 1600 training documents;  fitted features.

# Record the time after fitting and report the elapsed time since `start`.
end <- Sys.time()
difftime(end, start)

## Time difference of 0.04001403 secs

Predicting test data

# Predict the class of every test document with the fitted Naive Bayes model.
test_predictions <- predict(nb_model, newdata = reviews_test)



# Show the predicted labels for the first five test documents.
head(test_predictions, n = 5)

## cv000_29416.txt cv001_19502.txt cv002_17424.txt cv009_29417.txt cv010_29063.txt 
##             neg             neg             neg             neg             pos 
## Levels: neg pos

# Cross-tabulate the test documents' "sentiment" labels (rows) against the
# Naive Bayes predictions (columns).
table(
  docvars(reviews_test, "sentiment"),
  test_predictions
)

##      test_predictions
##       neg pos
##   neg 168  34
##   pos  35 163

Exercise: SVM

Try using the Support Vector Machine (SVM) classifier instead. To do so, change the classification command to textmodel_svm(). Measure the training time to assess whether the classifier is faster than Naive Bayes.

# Record the time before fitting so the training duration can be reported.
start <- Sys.time()

# Fit the SVM model on the training dfm, using the "sentiment" document
# variable as the class label, then print the fitted model.
svm_model <- textmodel_svm(
  x = reviews_train,
  y = docvars(reviews_train, "sentiment")
)
svm_model

## 
## Call:
## textmodel_svm.dfm(x = reviews_train, y = docvars(reviews_train, 
##     "sentiment"))
## 
## 1,600 training documents; 43,294 fitted features.
## Method: L2-regularized L2-loss support vector classification dual (L2R_L2LOSS_SVC_DUAL)

# Record the time after fitting and report the elapsed time since `start`.
end <- Sys.time()
difftime(end, start)

## Time difference of 5.099052 secs

Predict the test data and look at the cross-tabulation to see how well the SVM performs compared to the Naive Bayes classifier. If you still have the results from your Naive Bayes model stored, create a confusion matrix for both.

# Predict the class of every test document with the fitted SVM model.
predictions_svm <- predict(svm_model, newdata = reviews_test)

# Show the predicted labels for the first five test documents.
head(predictions_svm, n = 5)

## cv000_29416.txt cv001_19502.txt cv002_17424.txt cv009_29417.txt cv010_29063.txt 
##             neg             neg             neg             neg             pos 
## Levels: neg pos

# Cross-tabulate the test documents' "sentiment" labels (rows) against the
# SVM predictions (columns).
table(
  docvars(reviews_test, "sentiment"),
  predictions_svm
)

##      predictions_svm
##       neg pos
##   neg 167  35
##   pos  34 164

# Confusion matrix and summary statistics for the Naive Bayes predictions.
# caret::confusionMatrix() takes the predicted classes as `data` and the true
# classes as `reference`; both are named here so the statistics are computed
# against the actual "sentiment" labels rather than against the predictions.
caret::confusionMatrix(
  data = test_predictions,
  reference = docvars(reviews_test, "sentiment")
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg 168  34
##        pos  35 163
##                                           
##                Accuracy : 0.8275          
##                  95% CI : (0.7868, 0.8632)
##     No Information Rate : 0.5075          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.6549          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.8276          
##             Specificity : 0.8274          
##          Pos Pred Value : 0.8317          
##          Neg Pred Value : 0.8232          
##              Prevalence : 0.5075          
##          Detection Rate : 0.4200          
##    Detection Prevalence : 0.5050          
##       Balanced Accuracy : 0.8275          
##                                           
##        'Positive' Class : neg             
##

# Confusion matrix and summary statistics for the SVM predictions.
# caret::confusionMatrix() takes the predicted classes as `data` and the true
# classes as `reference`; both are named here so the statistics are computed
# against the actual "sentiment" labels rather than against the predictions.
caret::confusionMatrix(
  data = predictions_svm,
  reference = docvars(reviews_test, "sentiment")
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg 167  35
##        pos  34 164
##                                           
##                Accuracy : 0.8275          
##                  95% CI : (0.7868, 0.8632)
##     No Information Rate : 0.5025          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.655           
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.8308          
##             Specificity : 0.8241          
##          Pos Pred Value : 0.8267          
##          Neg Pred Value : 0.8283          
##              Prevalence : 0.5025          
##          Detection Rate : 0.4175          
##    Detection Prevalence : 0.5050          
##       Balanced Accuracy : 0.8275          
##                                           
##        'Positive' Class : neg             
##