R intro 20140716-advance

R 統計軟體簡介(2)
常用統計分析
徐峻賢
中央研究院語言學研究所
大腦與語言實驗室

• What is the central limit theorem?
# Central limit theorem demo with a normal population.
# One sample of 30 draws from N(mean = 1, sd = 2) and its histogram.
x <- rnorm(30, mean = 1, sd = 2)
hist(x)

# Collect the means of 100 such samples; x is overwritten on every pass.
xmean <- numeric(100)
for (i in seq_along(xmean)) {
  x <- rnorm(30, mean = 1, sd = 2)
  xmean[i] <- mean(x)
}

# Histogram of the 100 sample means.
hist(xmean)

• What is the central limit theorem?
# Central limit theorem demo with an exponential population.
# One sample of 100 draws from Exp(rate = 1) and its histogram.
y <- rexp(100, rate = 1)
hist(y)

# Collect the means of 100 such samples; y is overwritten on every pass.
ymean <- numeric(100)
for (i in seq_along(ymean)) {
  y <- rexp(100, rate = 1)
  ymean[i] <- mean(y)
}

# Histogram of the 100 sample means.
hist(ymean)

rnorm()  產生常態分布的隨機變數
dnorm()  probability density
pnorm()  cumulative probability function
qnorm()  the value of quantile
# Draw 30 random values from a normal distribution with mean 0 and sd 1.
rnorm(n = 30, mean = 0, sd = 1)
# Check dnorm() against the closed-form density at 1. Floating-point results
# must not be compared with `==`; all.equal() compares within a tolerance.
isTRUE(all.equal(dnorm(1), 1 / sqrt(2 * pi) * exp(-1 / 2)))
# Cumulative probability up to 1.645.
pnorm(1.645, mean = 0, sd = 1)
# Value of the 0.95 quantile.
qnorm(0.95, mean = 0, sd = 1)

建立 R documents 的好習慣
• R 軟體有很多細節，使用者偶而會出現失讀
症的徵狀…

建立 R documents 的好習慣
• 多做注解 (##)
• 留意套件和主程式的版本 (R-news)
• 在documents 的開頭交代基本環境
– e.g.:
### This is for …. By xxx at 2014/7/06
### Clear the workspace BEFORE loading anything: calling rm(list = ls())
### after load() would delete the objects that were just loaded.
rm(list = ls())
library(ez)
### Plain ASCII double quotes are required; curly quotes are a syntax error.
setwd("c:/data/")
load("myexample.Rdata")

quasif data set in languageR package
Source: Raaijmakers et al., 1999, Table2

data(lexicalMeasures)
Lexical distributional measures for 2233 English
monomorphemic words. This dataset provides a
subset of the data available in the dataset
english.
Baayen, R.H., Feldman, L. and Schreuder, R.
(2006) Morphological influences on the
recognition of monosyllabic monomorphemic
words, Journal of Memory and Language, 53,
496-512.

### Load the lexicalMeasures data set and look at its first rows.
data(lexicalMeasures)
head(lexicalMeasures)
### Squared Spearman correlations between all columns except the first.
lexicalMeasures.cor <- cor(lexicalMeasures[, -1], method = "spearman")^2
### Distance matrix computed on the rows of the squared-correlation matrix.
lexicalMeasures.dist <- dist(lexicalMeasures.cor)
### Hierarchical Clustering
lexicalMeasures.clust <- hclust(lexicalMeasures.dist)
### plclust() is defunct in current versions of R; the plot() method for
### hclust objects draws the dendrogram instead.
plot(lexicalMeasures.clust)
### or
### DIvisive ANAlysis Clustering
### diana() and pltree() live in the cluster package, which must be attached.
library(cluster)
pltree(diana(lexicalMeasures.dist))

ffV
ffN
ffNonzero
spelV
friendsV
spelN
friendsN
fbV
fbN
phonV
phonN
Vf
Dent
NsyS
CelS
NsyC
Ncou
Len
Bigr
Ient
NVratio
Fdif
InBi
0.0 0.5 1.0 1.5 2.0
hclust (*, "complete")
lexicalMeasures.dist
Height

CelS
NsyC
NsyS
Vf
Dent
Ient
NVratio
Fdif
InBi
Len
Bigr
Ncou
spelV
friendsV
spelN
friendsN
phonV
phonN
fbV
fbN
ffV
ffN
ffNonzero
0.0 0.5 1.0 1.5 2.0
Dendrogram of diana(x = lexicalMeasures.dist)
diana(*,"NA")
lexicalMeasures.dist
Height

quasif data set in languageR package
> ldt=quasif
> detach(package:languageR)
> B=read.csv(file="Baayen2008C.csv")
> head(ldt, n=10)
> tail(ldt, n=10)

dataframe[r,c]
> B[1, 4]
[1] 466
> B[1:2, ]
Subj Item SOA RT
1 s1 w1 Long 466
2 s1 w2 Long 520
> B[,4]
[1] 466 520 502 475 …

dataframe$variable
> B$RT
[1] 466 520 502 475 …
> B[B$Subj=="s1", 4]
[1] 466 520 502 475 494 490
> B[B$RT<500, 4]
[1] 466 475 494 490 491 484 470

> B=B[order(B$Subj, B$SOA, B$Item), ];B
Subj Item SOA RT
1 s1 w1 Long 466
2 s1 w2 Long 520
3 s1 w3 Long 502
4 s1 w1 Short 475
5 s1 w2 Short 494
6 s1 w3 Short 490
7 s2 w1 Long 516
8 s2 w2 Long 566
9 s2 w3 Long 577
10 s2 w1 Short 491
11 s2 w2 Short 544
12 s2 w3 Short 526
13 s3 w1 Long 484
14 s3 w2 Long 529
15 s3 w3 Long 539
16 s3 w1 Short 470
17 s3 w2 Short 511
18 s3 w3 Short 528

> B$RT=B$RT/1000;B
Subj Item SOA RT
1 s1 w1 Long 0.466
2 s1 w2 Long 0.520
3 s1 w3 Long 0.502
4 s1 w1 Short 0.475
5 s1 w2 Short 0.494
6 s1 w3 Short 0.490
7 s2 w1 Long 0.516
8 s2 w2 Long 0.566
9 s2 w3 Long 0.577
10 s2 w1 Short 0.491
11 s2 w2 Short 0.544
12 s2 w3 Short 0.526
13 s3 w1 Long 0.484
14 s3 w2 Long 0.529
15 s3 w3 Long 0.539
16 s3 w1 Short 0.470
17 s3 w2 Short 0.511
18 s3 w3 Short 0.528

> B.xtab=xtabs(~ SOA+Item, data=B);B.xtab
Item
SOA w1 w2 w3
Long 3 3 3
Short 3 3 3
> B.xtab.g500=xtabs(~ SOA+Item,
+ data=B,subset=B$RT>500);B.xtab.g500
Item
SOA w1 w2 w3
Long 1 3 3
Short 0 2 2

> bysub=aggregate(B$RT, list(B$SOA, B$Subj),
+ mean); bysub
Group.1 Group.2 x
1 Long s1 496.0000
2 Short s1 486.3333
3 Long s2 553.0000
4 Short s2 520.3333
5 Long s3 517.3333
6 Short s3 503.0000
> colnames(bysub) = c("SOA", "Subj", "meanRT")
> bysub
SOA Subj meanRT
1 Long s1 496.0000
2 Short s1 486.3333
3 Long s2 553.0000
4 Short s2 520.3333
5 Long s3 517.3333
6 Short s3 503.0000

> byitem=aggregate(B$RT, list(B$SOA, B$Item),
+ mean); byitem
Group.1 Group.2 x
1 Long w1 488.6667
2 Short w1 478.6667
3 Long w2 538.3333
4 Short w2 516.3333
5 Long w3 539.3333
6 Short w3 514.6667
> colnames(byitem) = c("SOA", "Item", "meanRT")
> byitem
SOA Item meanRT
1 Long w1 488.6667
2 Short w1 478.6667
3 Long w2 538.3333
4 Short w2 516.3333
5 Long w3 539.3333
6 Short w3 514.6667

• By subject analysis
## Mean RT for every SOA x subject cell.
bysub <- aggregate(B$RT, list(B$SOA, B$Subj), mean)
bysub
## Give the aggregated columns meaningful names. The closing quote after RT
## must be a plain ASCII double quote; a curly quote is a syntax error.
names(bysub) <- c("SOA", "Subj", "RT")
## ANOVA with SOA as a within-subject factor, run on the data frame as it
## was before aggregate().
rt_anova <- ezANOVA(
  data = B,
  dv = RT,
  wid = Subj,
  within = .(SOA)
)
print(rt_anova)
## The same design, run on the by-subject means.
rt_anova3 <- ezANOVA(
  data = bysub,
  dv = RT,
  wid = Subj,
  within = .(SOA)
)
print(rt_anova3)

• By item analysis
## Mean RT for every SOA x item cell.
byitem <- aggregate(B$RT, list(B$SOA, B$Item), mean)
byitem
names(byitem) <- c("SOA", "items", "RT")
## In B every item (w1, w2, w3) occurs under both SOA levels, so SOA varies
## within items. Declaring it as a between factor with wid = items does not
## describe this design; it is therefore passed as a within factor.
rt_anova2 <- ezANOVA(
  data = byitem,
  dv = RT,
  wid = items,
  within = .(SOA)
)
print(rt_anova2)

• data(ANT)
– ANT{ez}
– Simulated data from the Attention Network Test
– J Fan, BD McCandliss, T Sommer, A Raz, MI Posner
(2002). Testing the efficiency and independence of
attentional networks. Journal of Cognitive
Neuroscience, 14, 340-347.
• 2 within-Ss variables (“cue” and “flank”)
• 1 between-Ss variable (“group”)
• 2 dependent variables (“rt”, and “error”)

> data(ANT) ### A data frame with 5760 observations on the following 10 variables
> head(ANT, 20)

## Mixed-design ANOVA on rt, restricted to rows of ANT where error is 0:
## cue and flank are within-subject factors, group is a between-subject factor.
aov.rt <- ezANOVA(
  data = ANT[ANT$error == 0, ],
  dv = rt,
  wid = subnum,
  within = .(cue, flank),
  between = group
)
print(aov.rt)

## Between-subject ANOVA on rt with detailed output.
## The call originally opened with an empty first argument, so ezANOVA() had
## no data to work on; the data argument is supplied explicitly here
## (rows of ANT where error is 0).
aov.rt <- ezANOVA(
  data = ANT[ANT$error == 0, ],
  dv = rt,
  wid = subnum,
  between = group,
  detailed = TRUE  # spelled out: T is an ordinary variable and can be reassigned
)
print(aov.rt)

## Descriptive statistics of rt for each level of the between-subject
## factor group. The data argument was missing from the call (it opened with
## an empty first argument); it is supplied here as the rows of ANT where
## error is 0.
bt_descriptives <- ezStats(
  data = ANT[ANT$error == 0, ],
  dv = rt,
  wid = subnum,
  between = group
)
print(bt_descriptives)

所有獨變項組合的平均反應時間
## Descriptive statistics of rt for every combination of the independent
## variables. The call had lost its data argument and listed only the
## between factor, which cannot produce per-combination statistics; the data
## (rows of ANT where error is 0) and the within factors cue and flank are
## supplied here.
all_descriptives <- ezStats(
  data = ANT[ANT$error == 0, ],
  dv = rt,
  wid = subnum,
  within = .(cue, flank),
  between = group
)
print(all_descriptives)

## Plot of rt by group, without connecting lines.
## The data argument was missing from the call (it opened with an empty
## first argument); it is supplied here as the rows of ANT where error is 0.
group_plot <- ezPlot(
  data = ANT[ANT$error == 0, ],
  dv = .(rt),
  wid = .(subnum),
  between = .(group),
  x = .(group),
  do_lines = FALSE,
  x_lab = 'Group',
  y_lab = 'RT (ms)'
)
print(group_plot)

## Plot of rt with flank on the x axis and one line per cue.
## The call had lost its data argument and did not declare cue and flank as
## factors even though x and split refer to them; both are supplied here
## (data: rows of ANT where error is 0).
cue_by_flank_plot <- ezPlot(
  data = ANT[ANT$error == 0, ],
  dv = .(rt),
  wid = .(subnum),
  within = .(cue, flank),
  x = .(flank),
  split = .(cue),
  x_lab = 'Flanker',
  y_lab = 'RT (ms)',
  split_lab = 'Cue'
)
print(cue_by_flank_plot)

• 自我挑戰：
– (1) 用 aggregate 計算正確反應時間的by subject
mean
– (2) 用 (1) 的輸出執行 ezANOVA
– (3) 用 aggregate 計算每個人、每個 condition 的
錯誤率
– (4) 用 ezStats 計算每個人、每個 condition 的錯
誤率
– (5) 使用錯誤率分析、畫圖

運算子(operators)
Arithmetic Comparison Logical
+ addition < lesser than !x logical NOT
- subtraction > greater than x&y logical AND
* multiplication <= lesser than or equal to x&&y id.
/ division >= greater than or equal to x|y logical OR
^ power == equal x||y id.
%% modulo != different xor(x,y) exclusive OR
%/% integer division
x <- matrix(1:6, nrow = 2, ncol = 3)  # build a 2 x 3 matrix x holding 1 to 6
x[2, 3] == 6                          # is the row-2, column-3 entry equal to 6?
x[x <= 3]                             # entries of x less than or equal to 3
x[x != 6]                             # entries of x not equal to 6
x[x <= 3 & x != 2]                    # entries of x that are <= 3 and not 2

函數(function)
• function.name(object, argument, option)
函數名稱 物件 指令 選項
#args(function.name) 查詢該函數的指令
• 數學及簡單函數
sum(),mean(),max(),length()
• 產生隨機變數
rnorm(),runif(),rbinom()
• 初統常用分析函數
t.test(),aov(),lm()

Graphing
> windows() #開啟一個繪圖視窗
> par(mfrow=c(m,n)) #將繪圖視窗切割成m*n區
> plot(x) #散佈圖
> hist(x) #直方圖
> boxplot(x) #箱型圖
> qqnorm(x);qqline(x) #QQ Plot
main="title"
xlab="x label name" ylab="y label name"
xlim=c(a,b) ylim=c(a,b)

Graphing
> windows()
> plot(B$RT, main="Scatter plot of B", ylab="B")

Graphing
> windows()
> hist(B$RT, main="Histogram of B", xlab="B")

Graphing
> windows()
> boxplot(B$RT, main="Boxplot of B")

Graphing
> windows()
> qqnorm(B$RT); qqline(B$RT)

Graphing
> windows()
> par(mfrow=c(2,2))
> plot(B$RT, main="Scatter plot of B", ylab="B")
> hist(B$RT, main="Histogram of B", xlab="B")
> boxplot(B$RT, main="Boxplot of B")
> qqnorm(B$RT); qqline(B$RT)

Exercise 3
• 請依據MASS中leuk資料集內的time變項資料製作
下面這張圖,並儲存成MASSleuk.jpeg

R intro 20140716-advance

More Related Content

What's hot (20)

Viewers also liked (18)

Similar to R intro 20140716-advance (20)

More from Kevin Chun-Hsien Hsu (7)

Recently uploaded (20)

R intro 20140716-advance