R: k-NN and apply
sesejun@is.ocha.ac.jp
2009/11/19
USPS handwritten digit data (excerpt):
ImageName     Class   0,0   0,1   0,2   0,3   0,4
img_2_00_02   1       0     0     0     0     0
img_2_00_03   1       0     38    22    0     0
img_2_00_05   1       13    0     64    13    42
...
img_0_00_09   -1      34    53    0     38    0
img_0_00_28   -1      0     64    0     98    93
img_0_01_08   -1      13    0     0     59    13
img_0_03_05   -1      34    34    0     0     0
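Each USPS row is one image: ImageName, a class label, and what appears to be one grey-level value per pixel position (0,0), (0,1), and so on. The iris examples later in these slides read CSV files with read.table, and the USPS table can be read the same way. A minimal sketch, with usps_train.csv as an assumed filename (the slides do not name the file):

# assumed filename, for illustration only
usps.train <- read.table("usps_train.csv", sep=",", header=T)
usps.train[1:3, 1:7]      # ImageName, Class and the first pixel columns
table(usps.train$Class)   # number of samples per class (1 / -1)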
[Figure: example USPS digit images: img_3_29_25, img_5_03_31, img_3_06_30, img_3_17_08]
k-NN
Apply Family
 • Apply a function to the rows, columns, or elements of a matrix, data frame, or list
 • Often replaces an explicit for loop

    apply(X, 1, f)        apply f to each row of X
    apply(X, 2, f)        apply f to each column of X
    apply(X, c(1,2), f)   apply f to each element of X
    lapply(X, f)          apply f to each element of a list or dataframe; the result is a list
    sapply(X, f)          like lapply, but the result is simplified to a vector or table
    sweep(X, M, V)        sweep V out of X over the rows (M=1), columns (M=2), or elements (M=c(1,2)); by default V is subtracted
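apply, lapply, and sapply are demonstrated on the next slide; sweep only appears later in the k-NN code, so here is a minimal sketch of it with made-up values (not from the slides):

m <- matrix(1:6, nrow=2)           # row 1 is 1 3 5, row 2 is 2 4 6
sweep(m, 1, c(1, 2))               # subtract 1 from row 1 and 2 from row 2: both rows become 0 2 4
sweep(m, 2, c(1, 3, 5))            # subtract 1, 3, 5 column-wise: row 1 becomes 0 0 0, row 2 becomes 1 1 1
sweep(m, 2, c(1, 3, 5), FUN="/")   # a different FUN replaces the default subtraction, here division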
Example 1
> m <- matrix((1:9)**2, nrow=3)
> m
     [,1] [,2] [,3]
[1,]    1   16   49
[2,]    4   25   64
[3,]    9   36   81
> apply(m, 1, sum)
[1] 66 93 126
> apply(m, 2, sum)
[1] 14 77 194
> apply(m, c(1,2), sqrt)
     [,1] [,2] [,3]
[1,]    1    4    7
[2,]    2    5    8
[3,]    3    6    9

> l <- list(a=1:3, b=4:6)
> l
$a
[1] 1 2 3

$b
[1] 4 5 6

> lapply(l, sum)
$a
[1] 6

$b
[1] 15

> sapply(l, sum)
 a  b
 6 15

K-NN
 • Compute the distance from one test query to every training sample
> iris.train <- read.table("iris_train.csv", sep=",", header=T)
> iris.test <- read.table("iris_test.csv", sep=",", header=T)

# the four feature values of the first test sample are the query
> q <- iris.test[1,1:4]

# subtract the query from every training row (sweep over the columns)
> diff <- sweep(iris.train[1:4], 2, t(q))

# squared difference for each feature
> diff * diff

# squared Euclidean distance from the query to each training sample
> distquery <- apply(diff * diff, 1, sum)

# the distances in increasing order
> sort(distquery)

# row indices of the training samples, nearest first
> order(distquery)
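The sweep / apply combination above can also be collapsed into a single rowSums call; a sketch of that shortcut (not on the slides):

# same squared distances as the sweep + apply(..., 1, sum) steps above
distquery <- rowSums(sweep(iris.train[1:4], 2, t(q))^2)
head(sort(distquery))     # the smallest squared distances
head(order(distquery))    # indices of the nearest training samples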
Classifying the first test sample

# the 5 nearest training samples and their class labels
> iris.train[order(distquery)[1:5],]

> iris.train[order(distquery)[1:5],]$Class

> knnclasses <- iris.train[order(distquery)[1:5],]$Class

# vote counts per class among the 5 neighbours
> as.factor(table(knnclasses))

# classes sorted by number of votes, most frequent first
> sortedtable <- sort(as.factor(table(knnclasses)), decreasing=T)

# name of the winning class
> labels(sortedtable)[1]

> predclass <- labels(sortedtable)[1]

# does the prediction match the true class of the first test sample?
> predclass == iris.test$Class[1]
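The sort/labels idiom above returns the most frequent class among the neighbours; an alternative sketch (not from the slides) gets the same majority vote with which.max:

# majority vote via which.max: name of the first class with the highest count
names(which.max(table(knnclasses)))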
> knnpredict <- function(train,class,query,k) {
+   # squared Euclidean distance from the query to every training row
+   diff <- sweep(train,2,query)
+   distquery <- apply(diff * diff, 1, sum)
+   # class labels of the k nearest neighbours
+   knnclasses <- class[order(distquery)[1:k]]
+   # majority vote: most frequent class first
+   sortedtable <- sort(as.factor(table(knnclasses)), decreasing=T)
+   labels(sortedtable)[1]
+   }

# classify the 1st test sample with k = 5
> knnpredict(iris.train[1:4], iris.train$Class, t(iris.test[1,1:4]), 5)

# classify the 10th test sample with k = 1
> knnpredict(iris.train[1:4], iris.train$Class, t(iris.test[10,1:4]), 1)

# classify every test sample (k = 10) and print prediction, truth, match
> for (i in 1:length(rownames(iris.test))) {
+ pred <- knnpredict(iris.train[1:4], iris.train$Class, t(iris.test[i,1:4]), 10)
+ result <- pred == iris.test[i,]$Class
+ cat(paste(pred, iris.test[i,]$Class, result, sep="\t"))
+ cat("\n")
+ }
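In keeping with the apply-family theme, the for loop over the test set can also be written with sapply; a sketch under the same data and function names (not from the slides):

# one prediction per test row with k = 10, collected by sapply
preds <- sapply(1:nrow(iris.test),
                function(i) knnpredict(iris.train[1:4], iris.train$Class,
                                        t(iris.test[i,1:4]), 10))
# prediction, true class and whether they match, as a data frame
data.frame(predicted=preds, truth=iris.test$Class,
           correct=(preds == iris.test$Class))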
# accuracy on the first 30 test samples with k = 10
> resvec <- c()
> for (i in 1:30) {
+ pred <- knnpredict(iris.train[1:4], iris.train$Class, t(iris.test[i,1:4]), 10)
+ resvec <- append(resvec, pred == iris.test[i,]$Class)
+ }
> sum(resvec)/length(resvec)
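The same loop, wrapped in a small helper, makes it easy to compare several values of K, which anticipates the exercise on the next slide; a sketch with a hypothetical accuracy() helper (not from the slides):

# accuracy on the first 30 test samples for a given k (hypothetical helper)
accuracy <- function(k) {
  res <- sapply(1:30, function(i) {
    pred <- knnpredict(iris.train[1:4], iris.train$Class, t(iris.test[i,1:4]), k)
    pred == iris.test[i,]$Class
  })
  sum(res) / length(res)
}
sapply(c(1, 3, 5, 10), accuracy)   # accuracy for k = 1, 3, 5, 10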
Exercises
1. IRIS
   1. IRIS: the four features ("Sepal.length", "Sepal.width", "Petal.length", "Petal.width")
   2. IRIS: classification with K-NN
2. USPS
   1. USPS: classify the digits (0-9) with 5-NN
   2. K-NN: the choice of K
   3. USPS: K-NN and the choice of K
   4. USPS
