18 一些自定义函数

18.1 数据处理

18.1.1 分组排序并排名

单个分组变量进行排序

rank_in_group <- function(.data,.group_var,.rank_var){
  .data %>% arrange({{.group_var}}, {{.rank_var}}) %>%  ##先根据group和var进行排序
    group_by({{.group_var}}) %>%
    mutate(n = rank({{.rank_var}}, ties.method = "first"), ##生成n=每个分组中每个观测的排名
           N = max(n)) ##N=每个分组的观测数
}

iris %>%
  rank_in_group(.group_var = Species,.rank_var = Sepal.Length)
## # A tibble: 150 × 7
## # Groups:   Species [3]
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species     n     N
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>   <int> <int>
##  1          4.3         3            1.1         0.1 setosa      1    50
##  2          4.4         2.9          1.4         0.2 setosa      2    50
##  3          4.4         3            1.3         0.2 setosa      3    50
##  4          4.4         3.2          1.3         0.2 setosa      4    50
##  5          4.5         2.3          1.3         0.3 setosa      5    50
##  6          4.6         3.1          1.5         0.2 setosa      6    50
##  7          4.6         3.4          1.4         0.3 setosa      7    50
##  8          4.6         3.6          1           0.2 setosa      8    50
##  9          4.6         3.2          1.4         0.2 setosa      9    50
## 10          4.7         3.2          1.3         0.2 setosa     10    50
## # ℹ 140 more rows

多个分组变量进行排序

rank_in_groups <- function(.data, .rank_var, ...) {
  rank_var <- enquo(.rank_var)
  group_vars <- enquos(...)  # Get a list of quoted dots

  .data %>% arrange(!!!group_vars,!!rank_var) %>%
    group_by(!!!group_vars) %>%  # Unquote-splice the list
    mutate(n = rank(!!rank_var, ties.method = "first"), ##生成n=每个分组中每个观测的排名
           N = max(n))
}

mtcars %>%
  rank_in_groups(mpg,vs,gear) ##mpg是排序变量,vs和gear是分组变量
## # A tibble: 32 × 13
## # Groups:   vs, gear [6]
##      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb     n     N
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int>
##  1  10.4     8  472    205  2.93  5.25  18.0     0     0     3     4     1    12
##  2  10.4     8  460    215  3     5.42  17.8     0     0     3     4     2    12
##  3  13.3     8  350    245  3.73  3.84  15.4     0     0     3     4     3    12
##  4  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4     4    12
##  5  14.7     8  440    230  3.23  5.34  17.4     0     0     3     4     5    12
##  6  15.2     8  276.   180  3.07  3.78  18       0     0     3     3     6    12
##  7  15.2     8  304    150  3.15  3.44  17.3     0     0     3     2     7    12
##  8  15.5     8  318    150  2.76  3.52  16.9     0     0     3     2     8    12
##  9  16.4     8  276.   180  3.07  4.07  17.4     0     0     3     3     9    12
## 10  17.3     8  276.   180  3.07  3.73  17.6     0     0     3     3    10    12
## # ℹ 22 more rows

18.2 统计相关

18.2.1 汇总数据t检验

t.test2 <- function(m1,m2,s1,s2,n1,n2,m0=0,equal.variance=FALSE)
{
    if( equal.variance==FALSE ) 
    {
        se <- sqrt( (s1^2/n1) + (s2^2/n2) )
        # welch-satterthwaite df
        df <- ( (s1^2/n1 + s2^2/n2)^2 )/( (s1^2/n1)^2/(n1-1) + (s2^2/n2)^2/(n2-1) )
    } else
    {
        # pooled standard deviation, scaled by the sample sizes
        se <- sqrt( (1/n1 + 1/n2) * ((n1-1)*s1^2 + (n2-1)*s2^2)/(n1+n2-2) ) 
        df <- n1+n2-2
    }      
    t <- (m1-m2-m0)/se 
    dat <- c(m1-m2, se, t, 2*pt(-abs(t),df))     ##pt函数可以打印p值
    names(dat) <- c("Difference of means", "Std Error", "t", "p-value")
    return(dat) 
}

# m1, m2: the sample means
# s1, s2: the sample standard deviations
# n1, n2: the same sizes
# m0: the null value for the difference in means to be tested for. Default is 0. 
# equal.variance: whether or not to assume equal variance. Default is FALSE. 

usage:

## Difference of means           Std Error                   t             p-value 
##          0.01183358          0.11348530          0.10427416          0.91704542

18.2.2 卡方检验

chisq_test <- function(data,x1,x2,correct=FALSE,fisher=FALSE){
  library(dplyr)
  X<- data %>%   ##先将原始数据变换为列联表形式
    select(x1,x2) %>% 
    table()
  
  if(fisher==FALSE){ 
    ## 如果fisher参数是F,进行卡方检验
    result <- chisq.test(x = X,correct = correct)
  } else{   
    ## 如果fisher参数是T,进行fisher精确检验
    result <- fisher.test(x = X)
  }
  ## 报告卡方值和p值
  list(squared.value=result$statistic,
       p.value=result$p.value
       )
}