18 一些自定义函数
18.1 数据处理
18.1.1 分组排序并排名
单个分组变量进行排序
rank_in_group <- function(.data,.group_var,.rank_var){
.data %>% arrange({{.group_var}}, {{.rank_var}}) %>% ##先根据group和var进行排序
group_by({{.group_var}}) %>%
mutate(n = rank({{.rank_var}}, ties.method = "first"), ##生成n=每个分组中每个观测的排名
N = max(n)) ##N=每个分组的观测数
}
iris %>%
rank_in_group(.group_var = Species,.rank_var = Sepal.Length)## # A tibble: 150 × 7
## # Groups: Species [3]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species n N
## <dbl> <dbl> <dbl> <dbl> <fct> <int> <int>
## 1 4.3 3 1.1 0.1 setosa 1 50
## 2 4.4 2.9 1.4 0.2 setosa 2 50
## 3 4.4 3 1.3 0.2 setosa 3 50
## 4 4.4 3.2 1.3 0.2 setosa 4 50
## 5 4.5 2.3 1.3 0.3 setosa 5 50
## 6 4.6 3.1 1.5 0.2 setosa 6 50
## 7 4.6 3.4 1.4 0.3 setosa 7 50
## 8 4.6 3.6 1 0.2 setosa 8 50
## 9 4.6 3.2 1.4 0.2 setosa 9 50
## 10 4.7 3.2 1.3 0.2 setosa 10 50
## # ℹ 140 more rows
多个分组变量进行排序
rank_in_groups <- function(.data, .rank_var, ...) {
rank_var <- enquo(.rank_var)
group_vars <- enquos(...) # Get a list of quoted dots
.data %>% arrange(!!!group_vars,!!rank_var) %>%
group_by(!!!group_vars) %>% # Unquote-splice the list
mutate(n = rank(!!rank_var, ties.method = "first"), ##生成n=每个分组中每个观测的排名
N = max(n))
}
mtcars %>%
rank_in_groups(mpg,vs,gear) ##mpg是排序变量,vs和gear是分组变量## # A tibble: 32 × 13
## # Groups: vs, gear [6]
## mpg cyl disp hp drat wt qsec vs am gear carb n N
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int>
## 1 10.4 8 472 205 2.93 5.25 18.0 0 0 3 4 1 12
## 2 10.4 8 460 215 3 5.42 17.8 0 0 3 4 2 12
## 3 13.3 8 350 245 3.73 3.84 15.4 0 0 3 4 3 12
## 4 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4 4 12
## 5 14.7 8 440 230 3.23 5.34 17.4 0 0 3 4 5 12
## 6 15.2 8 276. 180 3.07 3.78 18 0 0 3 3 6 12
## 7 15.2 8 304 150 3.15 3.44 17.3 0 0 3 2 7 12
## 8 15.5 8 318 150 2.76 3.52 16.9 0 0 3 2 8 12
## 9 16.4 8 276. 180 3.07 4.07 17.4 0 0 3 3 9 12
## 10 17.3 8 276. 180 3.07 3.73 17.6 0 0 3 3 10 12
## # ℹ 22 more rows
18.2 统计相关
18.2.1 汇总数据t检验
t.test2 <- function(m1,m2,s1,s2,n1,n2,m0=0,equal.variance=FALSE)
{
if( equal.variance==FALSE )
{
se <- sqrt( (s1^2/n1) + (s2^2/n2) )
# welch-satterthwaite df
df <- ( (s1^2/n1 + s2^2/n2)^2 )/( (s1^2/n1)^2/(n1-1) + (s2^2/n2)^2/(n2-1) )
} else
{
# pooled standard deviation, scaled by the sample sizes
se <- sqrt( (1/n1 + 1/n2) * ((n1-1)*s1^2 + (n2-1)*s2^2)/(n1+n2-2) )
df <- n1+n2-2
}
t <- (m1-m2-m0)/se
dat <- c(m1-m2, se, t, 2*pt(-abs(t),df)) ##pt函数可以打印p值
names(dat) <- c("Difference of means", "Std Error", "t", "p-value")
return(dat)
}
# m1, m2: the sample means
# s1, s2: the sample standard deviations
# n1, n2: the same sizes
# m0: the null value for the difference in means to be tested for. Default is 0.
# equal.variance: whether or not to assume equal variance. Default is FALSE. usage:
## Difference of means Std Error t p-value
## 0.01183358 0.11348530 0.10427416 0.91704542
18.2.2 卡方检验
chisq_test <- function(data,x1,x2,correct=FALSE,fisher=FALSE){
library(dplyr)
X<- data %>% ##先将原始数据变换为列联表形式
select(x1,x2) %>%
table()
if(fisher==FALSE){
## 如果fisher参数是F,进行卡方检验
result <- chisq.test(x = X,correct = correct)
} else{
## 如果fisher参数是T,进行fisher精确检验
result <- fisher.test(x = X)
}
## 报告卡方值和p值
list(squared.value=result$statistic,
p.value=result$p.value
)
}