WLE-classify
HUNG HUO-SU
09/23/2015
Practical Machine Learning Project 09/24/2015
WLE Data analysis
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
training_csv = read.csv("pml-training.csv")
#Partition the original training data (with classe) into two parts:
#70% for training the model and 30% for verification
inTrain <- createDataPartition(y=training_csv$classe, p=0.7, list=FALSE)
training <- training_csv[inTrain,]
testing <- training_csv[-inTrain,]
#We focus only on the accelerometer variables and ignore the other sensors
#training_accel contains only the accelerometer data, without classe
#training_accel_classe contains the accelerometer data plus classe
training_accel <- training[grep("^accel", colnames(training))]
training_accel_classe<-cbind(training_accel, training$classe)
colnames(training_accel_classe)[ncol(training_accel_classe)] <- "classe"
colnames(training_accel_classe)[ncol(training_accel_classe)]
## [1] "classe"
#Use the Random Forest method to train a model called modelFit_rf_70
modelFit_rf_70 <- train(training_accel_classe$classe ~ ., data=training_accel_classe, method="rf", prof=TRUE)
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
#The resampling accuracy is over 90%, and mtry = 2 gives the best result.
modelFit_rf_70
## Random Forest
##
## 13737 samples
## 12 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 13737, 13737, 13737, 13737, 13737, 13737,
...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa Accuracy SD Kappa SD
## 2 0.9305551 0.9121133 0.003224900 0.004081289
## 7 0.9210913 0.9001473 0.003691518 0.004695061
## 12 0.9033667 0.8777115 0.005626041 0.007144838
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
#Use confusionMatrix on the held-out testing set to verify the model's accuracy
#The accuracy of the Random Forest model is high, above 0.9
confusionMatrix(testing$classe, predict(modelFit_rf_70, testing))
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1613 8 26 23 4
## B 43 1039 35 14 8
## C 13 37 969 6 1
## D 19 4 47 889 5
## E 5 15 6 10 1046
##
## Overall Statistics
##
## Accuracy : 0.9441
## 95% CI : (0.9379, 0.9498)
## No Information Rate : 0.2877
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9293
## Mcnemar's Test P-Value : 9.14e-12
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9527 0.9420 0.8947 0.9437 0.9831
## Specificity 0.9854 0.9791 0.9881 0.9848 0.9925
## Pos Pred Value 0.9636 0.9122 0.9444 0.9222 0.9667
## Neg Pred Value 0.9810 0.9865 0.9765 0.9892 0.9963
## Prevalence 0.2877 0.1874 0.1840 0.1601 0.1808
## Detection Rate 0.2741 0.1766 0.1647 0.1511 0.1777
## Detection Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Balanced Accuracy 0.9691 0.9605 0.9414 0.9643 0.9878
#Record whether each prediction is correct in the predRight column
pred <- predict(modelFit_rf_70, testing)
testing$predRight <- pred==testing$classe
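#Optional check (a small addition, not part of the original output): the
#estimated out-of-sample error is the proportion of wrong predictions in
#predRight, roughly 1 - Accuracy from the confusionMatrix above
out_of_sample_error <- mean(!testing$predRight)
out_of_sample_error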
#Predict the answers for pml-testing.csv and get the result
testing_csv = read.csv("pml-testing.csv")
answers <- predict(modelFit_rf_70, testing_csv)
answers
## [1] B A C A A E D D A A B C B A E E A B B B
## Levels: A B C D E
#Use the pml_write_files() function to create one answer file for each of the 20 problems
pml_write_files = function(x){
n = length(x)
for(i in 1:n){
filename = paste0("problem_id_",i,".txt")
write.table(x[i],file=filename,quote=FALSE,row.names=FALSE,col.names=FALSE)
}
}
pml_write_files(answers)
#Although we get over 90% accuracy on the 30% held-out testing data, we still want to know which conditions cause prediction errors.
#We use a correlation matrix between the variables to find weakly correlated pairs, so that the test results can be shown graphically.
#For example, accel_forearm_y is the column with the most correct (TRUE) predictions, and
#we look for the variable least correlated with it.
min(abs(cor(training_accel[which(training_accel_classe$classe == "A"),])))
## [1] 0.01247164
#The min_cor_rcname() function retrieves the row/column names of the minimal correlation value for each classe
min_cor_rcname <- function(Class)
{
mdat <- abs(cor(training_accel[which(training_accel_classe$classe == Class),]))
index <- which.min(mdat)
k <- arrayInd(index, dim(mdat))
rr <- rownames(mdat)[k[,1]]
cc <- colnames(mdat)[k[,2]]
print(rr)
print(cc)
}
min_cor_rcname("A")
## [1] "accel_belt_y"
## [1] "accel_belt_x"
min_cor_rcname("B")
## [1] "accel_forearm_x"
## [1] "accel_dumbbell_y"
min_cor_rcname("C")
## [1] "accel_forearm_y"
## [1] "accel_arm_y"
min_cor_rcname("D")
## [1] "accel_forearm_y"
## [1] "accel_belt_z"
min_cor_rcname("E")
## [1] "accel_forearm_z"
## [1] "accel_dumbbell_y"
# Divide the testing data by classe, because we want to observe the errors for each class
testing_A <- testing[which(testing$classe == "A"),]
testing_B <- testing[which(testing$classe == "B"),]
testing_C <- testing[which(testing$classe == "C"),]
testing_D <- testing[which(testing$classe == "D"),]
testing_E <- testing[which(testing$classe == "E"),]
#Plot graphs for each Classe A,B,C,D,E
qplot(accel_belt_x, accel_belt_y, colour=predict(modelFit_rf_70, testing_A), data=testing_A, main="Class A")
qplot(accel_dumbbell_x, accel_belt_z, colour=predict(modelFit_rf_70, testing_B), data=testing_B, main="Class B")
qplot(accel_belt_y, accel_belt_x, colour=predict(modelFit_rf_70, testing_C), data=testing_C, main="Class C")
qplot(accel_belt_x, accel_forearm_x, colour=predict(modelFit_rf_70, testing_D), data=testing_D, main="Class D")
qplot(accel_dumbbell_y, accel_forearm_z, colour=predict(modelFit_rf_70, testing_E), data=testing_E, main="Class E")
Summary
• 1. The Random Forest algorithm achieves high accuracy, but its training performance is poor: fitting the model takes a long time (see the first sketch after this list).
• 2. Most errors occur near the center of each class's cluster but are still misclassified. This may be caused by overfitting; it would be better to reduce the features before training the Random Forest model (also addressed in the first sketch after this list).
• 3. The graphs above suggest the following:
– Some class A observations are misclassified as B.
– Some class B observations are misclassified as A or C.
– Some class C observations are misclassified as A.
– Some class D observations are misclassified as A.
– Some class E observations are misclassified as B.
• 4. According to the "Weight Lifting Exercises Dataset" page at http://groupware.les.inf.puc-rio.br/har, the classes are defined as:
– Class A - exactly according to the specification
– Class B - throwing the elbows to the front
– Class C - lifting the dumbbell only halfway
– Class D - lowering the dumbbell only halfway
– Class E - throwing the hips to the front
• 5. When performing the specified exercise, making the mistake of throwing the hips to the front may also push the elbows to the front at the same time.
• 6. By Gini importance, the most important variable is accel_belt_z, followed by accel_dumbbell_y (see the second sketch after this list).
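As a follow-up to points 1 and 2: train() resamples with 25 bootstrap repetitions by default, which is slow on 13,737 rows, and a smaller k-fold cross-validation combined with PCA pre-processing would cut training time and reduce the feature space. A minimal sketch, not run for this report; the trainControl settings and the use of preProcess = "pca" are illustrative choices, not the configuration used above:
#Hypothetical faster setup: 5-fold CV instead of 25 bootstrap resamples,
#and PCA compression of the 12 accelerometer variables before the forest
ctrl <- trainControl(method = "cv", number = 5)
modelFit_rf_cv <- train(classe ~ ., data = training_accel_classe,
                        method = "rf",
                        trControl = ctrl,
                        preProcess = "pca")
modelFit_rf_cv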
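Point 6 refers to the forest's Gini-based importance ranking, which can be inspected with caret's varImp() on the fitted model; a short sketch, output not reproduced here:
#Mean decrease in Gini impurity for each accelerometer variable
rf_importance <- varImp(modelFit_rf_70, scale = FALSE)
rf_importance
plot(rf_importance)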
