library(tidyverse)

1 数据读取

主要学习如何批量读取excel文件，使用readxl包中的read_excel()函数。

更详细的内容可参见 Tidyverse风格的数据录入。

获取所有xls文件的路径。
批量读取并按照行合并数据，这里可将读取一个文件的代码封装成一个函数，使用purrr包中的map_dfr()函数来批量处理。
Hadley 建议在处理异构数据时可在中间插入处理步骤。

# Collect the paths of all files in the data directory that match the
# "*.xls" glob.
file_paths <- fs::dir_ls(
  path = "D:/Myblog/datas/最简R编程入门/data",
  glob = "*.xls"
)
file_paths

D:/Myblog/datas/最简R编程入门/data/分省年度数据 (1).xls
D:/Myblog/datas/最简R编程入门/data/分省年度数据 (2).xls
D:/Myblog/datas/最简R编程入门/data/分省年度数据 (3).xls
D:/Myblog/datas/最简R编程入门/data/分省年度数据 (4).xls
D:/Myblog/datas/最简R编程入门/data/分省年度数据.xls

# Read a single workbook, passing skip = 3 and n_max = 31 to read_excel()
# so that leading rows are skipped and at most 31 data rows are read.
#
# file_path: path of the Excel file to read.
# Returns the tibble produced by readxl::read_excel().
read_onefile_func <- function(file_path) {
  file_path |>
    readxl::read_excel(skip = 3, n_max = 31)
}

# Read every file with read_onefile_func() and row-bind the results in one
# step; the "来源" column records which element of file_paths each row
# came from.
df <- file_paths |>
  purrr::map_dfr(read_onefile_func, .id = "来源")
df

# A tibble: 155 × 12
   来源     地区  `2024年` `2023年` `2022年` `2021年` `2020年` `2019年` `2018年`
   <chr>    <chr>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
 1 D:/Mybl… 北京市……    116.     106.      112.    111.      108.     114.     121.
 2 D:/Mybl… 天津市……    284.     268.      273.    266.      210.     185.     175.
 3 D:/Mybl… 河北省……   4522.    4466.     4409.   4030.     3880.    3518.    3339.
 4 D:/Mybl… 山西省……   1392.    1389.     1335.   1287.     1167.     825.     741.
 5 D:/Mybl… 内蒙古自…   2873.    2737.     2655.   2354.     2029.    1863.    1751.
 6 D:/Mybl… 辽宁省……   2566.    2651      2598    2462.     2285.    2178     2021.
 7 D:/Mybl… 吉林省……   1590.    1645.     1689.   1554.     1553     1287.    1161.
 8 D:/Mybl… 黑龙江省…   3203.    3518.     3635.   3463.     3445.    3183.    3001.
 9 D:/Mybl… 上海市……     99.7     96.1     100      96.1     108.     107.     105.
10 D:/Mybl… 江苏省……   5245.    5076.     4964.   4721      4538.    4297.    4142.
# ℹ 145 more rows
# ℹ 3 more variables: `2017年` <dbl>, `2016年` <dbl>, `2015年` <dbl>

# Two-step variant: map() first yields a list with one data frame per file,
# so extra per-file processing can be inserted before list_rbind() stacks
# them into a single table.
df_1 <- file_paths |>
  purrr::map(read_onefile_func) |>
  purrr::list_rbind(names_to = "来源")
df_1

# A tibble: 155 × 12
   来源     地区  `2024年` `2023年` `2022年` `2021年` `2020年` `2019年` `2018年`
   <chr>    <chr>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
 1 D:/Mybl… 北京市……    116.     106.      112.    111.      108.     114.     121.
 2 D:/Mybl… 天津市……    284.     268.      273.    266.      210.     185.     175.
 3 D:/Mybl… 河北省……   4522.    4466.     4409.   4030.     3880.    3518.    3339.
 4 D:/Mybl… 山西省……   1392.    1389.     1335.   1287.     1167.     825.     741.
 5 D:/Mybl… 内蒙古自…   2873.    2737.     2655.   2354.     2029.    1863.    1751.
 6 D:/Mybl… 辽宁省……   2566.    2651      2598    2462.     2285.    2178     2021.
 7 D:/Mybl… 吉林省……   1590.    1645.     1689.   1554.     1553     1287.    1161.
 8 D:/Mybl… 黑龙江省…   3203.    3518.     3635.   3463.     3445.    3183.    3001.
 9 D:/Mybl… 上海市……     99.7     96.1     100      96.1     108.     107.     105.
10 D:/Mybl… 江苏省……   5245.    5076.     4964.   4721      4538.    4297.    4142.
# ℹ 145 more rows
# ℹ 3 more variables: `2017年` <dbl>, `2016年` <dbl>, `2015年` <dbl>

还可以使用mathmodels包中的read_nbs()函数来批量读取数据。

library(mathmodels)
# read_nbs() receives the whole vector of paths in a single call, so no
# explicit mapping step is written here.
df_2 <- file_paths |>
  read_nbs()
df_2

# A tibble: 155 × 12
   indicator        region `2024年` `2023年` `2022年` `2021年` `2020年` `2019年`
   <chr>            <chr>     <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
 1 第一产业增加值(亿元)…… 北京      116.     106.      112.    111.      108.     114.
 2 第一产业增加值(亿元)…… 天津      284.     268.      273.    266.      210.     185.
 3 第一产业增加值(亿元)…… 河北     4522.    4466.     4409.   4030.     3880.    3518.
 4 第一产业增加值(亿元)…… 山西     1392.    1389.     1335.   1287.     1167.     825.
 5 第一产业增加值(亿元)…… 内蒙古   2873.    2737.     2655.   2354.     2029.    1863.
 6 第一产业增加值(亿元)…… 辽宁     2566.    2651      2598    2462.     2285.    2178 
 7 第一产业增加值(亿元)…… 吉林     1590.    1645.     1689.   1554.     1553     1287.
 8 第一产业增加值(亿元)…… 黑龙江   3203.    3518.     3635.   3463.     3445.    3183.
 9 第一产业增加值(亿元)…… 上海       99.7     96.1     100      96.1     108.     107.
10 第一产业增加值(亿元)…… 江苏     5245.    5076.     4964.   4721      4538.    4297.
# ℹ 145 more rows
# ℹ 4 more variables: `2018年` <dbl>, `2017年` <dbl>, `2016年` <dbl>,
#   `2015年` <dbl>

Note

使用haven包读取其他类型的数据：

read_spss()：读取SPSS数据文件（.sav）
read_dta()：读取Stata数据文件（.dta）
read_sas()：读取SAS数据文件（.sas7bdat）

2 数据重塑

更多的操作实例可参看dplyr进阶，tidyr进阶。

library(tidyverse)
library(nycflights13)
flights

# A tibble: 336,776 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      542            540         2      923            850
 4  2013     1     1      544            545        -1     1004           1022
 5  2013     1     1      554            600        -6      812            837
 6  2013     1     1      554            558        -4      740            728
 7  2013     1     1      555            600        -5      913            854
 8  2013     1     1      557            600        -3      709            723
 9  2013     1     1      557            600        -3      838            846
10  2013     1     1      558            600        -2      753            745
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

# Load the score table that the dplyr examples below operate on.
df <- read_csv(file = "D:/Myblog/datas/score.csv")

dplyr 定义了数据处理的规范语法，其中主要函数包含：

数据变换和选择: [更多函数可参看dplyr主页。]{.aside}
- mutate(), select(), rename() , filter()
- summarise(), group_by(), arrange()
- 与其他选择函数连用：all_of(), any_of()
- left_join(), right_join(), full_join()
数据整理：
- pivot_longer(), pivot_wider()
- expand(), complete()
- expand_grid(), crossing()
- separate(), unite(), distinct()/n_distinct()

。

2.1 dplyr的简单操作

# Attach the bonus points as a new column and add them to the original
# score. mutate() evaluates its arguments in order, so `total` can refer
# to the `extra` column created just before it.
reward <- c(2, 5, 9, 8, 5, 6)
df_new <- df %>%
  mutate(
    extra = reward,
    total = score + extra
  )
df_new

# A tibble: 6 × 5
  name  type    score extra total
  <chr> <chr>   <dbl> <dbl> <dbl>
1 Alice english    80     2    82
2 Alice math       60     5    65
3 Bob   english    70     9    79
4 Bob   math       69     8    77
5 Carol english    80     5    85
6 Carol math       90     6    96

# Four columns (a-d), each holding 10 draws from rnorm().
# Note: this reassigns `df`, replacing whatever it held before.
df <- tibble(a = rnorm(10), b = rnorm(10), c = rnorm(10), d = rnorm(10))

2.2 *_join()系列函数

数据连接在数据分析中是非常重要的操作，dplyr 提供了多种数据连接函数。

# Mean of `total` for each `name` in df_new.
df1 <- summarise(
  group_by(df_new, name),
  mean_score = mean(total)
)

# Small lookup table of ages, keyed by `name`, used as the right-hand
# table in the join examples.
df2 <- tribble(
  ~name,   ~age,
  "Alice", 12,
  "Bob",   13,
  "Dave",  14
)

left_join()以左边的数据框为准。
right_join()以右边的数据框为准。
full_join()保留两边数据框的所有行，无论是否匹配；未匹配到的位置以NA填充。
inner_join()只返回匹配的行。
semi_join(x,y), 不改变x的变量数量，只筛选x中与y一致的所有匹配行。
anti_join(x,y), 不改变x的变量数量，丢弃与y一致的所有匹配行。

# Keep every row of df1 and look up the matching age in df2.
left_join(df1, df2, by = "name")

# A tibble: 3 × 3
  name  mean_score   age
  <chr>      <dbl> <dbl>
1 Alice       73.5    12
2 Bob         78      13
3 Carol       90.5    NA

# Keep every row of df2 and look up the matching mean score in df1.
right_join(df1, df2, by = "name")

# A tibble: 3 × 3
  name  mean_score   age
  <chr>      <dbl> <dbl>
1 Alice       73.5    12
2 Bob         78      13
3 Dave        NA      14

# Keep the rows of both tables, matched on `name`.
full_join(df1, df2, by = "name")

# A tibble: 4 × 3
  name  mean_score   age
  <chr>      <dbl> <dbl>
1 Alice       73.5    12
2 Bob         78      13
3 Carol       90.5    NA
4 Dave        NA      14

# Keep only the names that occur in both df1 and df2.
inner_join(df1, df2, by = "name")

# A tibble: 2 × 3
  name  mean_score   age
  <chr>      <dbl> <dbl>
1 Alice       73.5    12
2 Bob         78      13

# Filter df1 to the names that have a match in df2; no df2 columns are added.
semi_join(df1, df2, by = "name")

# A tibble: 2 × 2
  name  mean_score
  <chr>      <dbl>
1 Alice       73.5
2 Bob         78

# Filter df1 to the names that have no match in df2.
anti_join(df1, df2, by = "name")

# A tibble: 1 × 2
  name  mean_score
  <chr>      <dbl>
1 Carol       90.5

2.3 数据规整

pivot_longer()将数据框变长。
pivot_wider()将数据框变宽。
fill()填充缺失值。
expand()扩展数据框，指定数据框的若干列，根据其向量元素值，产生所有可能的交叉组合。
complete()补全数据框。
crossing()是expand_grid()的去重并排序版本，产生所有可能的交叉组合。
separate()分割变量。
unite()合并变量。
distinct()去重。
n_distinct()计算变量的唯一值个数。

# Wide-format example data: one row per day (1 to 5) and one column of
# values for each of the plants A, B, C and D.
plant_height <- data.frame(
  Day = seq_len(5),
  A = c(0.7, 1.0, 1.5, 1.8, 2.2),
  B = c(0.5, 0.7, 0.9, 1.3, 1.8),
  C = c(0.3, 0.6, 1.0, 1.2, 2.2),
  D = c(0.4, 0.7, 1.2, 1.5, 3.2)
)

plant_height

  Day   A   B   C   D
1   1 0.7 0.5 0.3 0.4
2   2 1.0 0.7 0.6 0.7
3   3 1.5 0.9 1.0 1.2
4   4 1.8 1.3 1.2 1.5
5   5 2.2 1.8 2.2 3.2

2.3.1 宽表变长表

# Wide -> long: every column except Day is stacked; the former column
# names go into `plant` and the cell values into `height`.
long <- pivot_longer(
  plant_height,
  cols = !Day,
  names_to = "plant",
  values_to = "height"
)
long

# A tibble: 20 × 3
     Day plant height
   <int> <chr>  <dbl>
 1     1 A        0.7
 2     1 B        0.5
 3     1 C        0.3
 4     1 D        0.4
 5     2 A        1  
 6     2 B        0.7
 7     2 C        0.6
 8     2 D        0.7
 9     3 A        1.5
10     3 B        0.9
11     3 C        1  
12     3 D        1.2
13     4 A        1.8
14     4 B        1.3
15     4 C        1.2
16     4 D        1.5
17     5 A        2.2
18     5 B        1.8
19     5 C        2.2
20     5 D        3.2

参数cols，表示哪些列需要转换。
参数names_to，表示cols选取的这些列的名字，构成了新的一列，这里需要取一个名字。
参数values_to，表示cols选取的这些列的值，构成了新的一列，这里也需要取一个名字。
数据框总的信息量不会丢失。

2.3.2 长表变宽表

# Long -> wide: the values of `plant` become column names and the values
# of `height` fill those columns.
wide <- pivot_wider(long, names_from = "plant", values_from = "height")
wide

# A tibble: 5 × 5
    Day     A     B     C     D
  <int> <dbl> <dbl> <dbl> <dbl>
1     1   0.7   0.5   0.3   0.4
2     2   1     0.7   0.6   0.7
3     3   1.5   0.9   1     1.2
4     4   1.8   1.3   1.2   1.5
5     5   2.2   1.8   2.2   3.2

参数names_from，表示哪些列的值作为新的列名。
参数values_from，表示哪些列的值作为新的列值。

2.3.3 列名转变为多个变量

# Wide-format example data whose column names encode two pieces of
# information joined by "_": a species letter (A/B/C) and a measured
# parameter (height/width/depth).
plant_record <- data.frame(
  day = 1:5,
  A_height = c(1.1, 1.2, 1.3, 1.4, 1.5),
  A_width = c(2.1, 2.2, 2.3, 2.4, 2.5),
  A_depth = c(3.1, 3.2, 3.3, 3.4, 3.5),
  B_height = c(4.1, 4.2, 4.3, 4.4, 4.5),
  B_width = c(5.1, 5.2, 5.3, 5.4, 5.5),
  B_depth = c(6.1, 6.2, 6.3, 6.4, 6.5),
  C_height = c(7.1, 7.2, 7.3, 7.4, 7.5),
  C_width = c(8.1, 8.2, 8.3, 8.4, 8.5),
  C_depth = c(9.1, 9.2, 9.3, 9.4, 9.5)
)
plant_record

  day A_height A_width A_depth B_height B_width B_depth C_height C_width
1   1      1.1     2.1     3.1      4.1     5.1     6.1      7.1     8.1
2   2      1.2     2.2     3.2      4.2     5.2     6.2      7.2     8.2
3   3      1.3     2.3     3.3      4.3     5.3     6.3      7.3     8.3
4   4      1.4     2.4     3.4      4.4     5.4     6.4      7.4     8.4
5   5      1.5     2.5     3.5      4.5     5.5     6.5      7.5     8.5
  C_depth
1     9.1
2     9.2
3     9.3
4     9.4
5     9.5

plant_record数据框中，每种植物的高度、宽度、深度都有单独的一列，并用_符号连接。此时我们希望把原始数据框的列名(即A_height等)转换为多个变量，例如A,B,C为species变量，height，width，depth为parameter变量。

# Split each column name at "_" via the two regex capture groups: the
# first group goes to `species`, the second to `parameter`, and the cell
# values go to `value`.
pivot_longer(
  plant_record,
  cols = !day,
  names_to = c("species", "parameter"),
  names_pattern = "(.*)_(.*)",
  values_to = "value"
)

# A tibble: 45 × 4
     day species parameter value
   <int> <chr>   <chr>     <dbl>
 1     1 A       height      1.1
 2     1 A       width       2.1
 3     1 A       depth       3.1
 4     1 B       height      4.1
 5     1 B       width       5.1
 6     1 B       depth       6.1
 7     1 C       height      7.1
 8     1 C       width       8.1
 9     1 C       depth       9.1
10     2 A       height      1.2
# ℹ 35 more rows

2.3.4 更复杂的情况

有时希望原始数据框的列名中，一部分进入变量的值，另一部分继续作为列名，例如让height，width，depth分别成为3个变量。

# The first capture group of each column name becomes a value of
# `species`; the special ".value" entry keeps the second group as the
# names of the output columns (height, width, depth).
plant_record_longer <- pivot_longer(
  plant_record,
  cols = !day,
  names_pattern = "(.*)_(.*)",
  names_to = c("species", ".value")
)
plant_record_longer

# A tibble: 15 × 5
     day species height width depth
   <int> <chr>    <dbl> <dbl> <dbl>
 1     1 A          1.1   2.1   3.1
 2     1 B          4.1   5.1   6.1
 3     1 C          7.1   8.1   9.1
 4     2 A          1.2   2.2   3.2
 5     2 B          4.2   5.2   6.2
 6     2 C          7.2   8.2   9.2
 7     3 A          1.3   2.3   3.3
 8     3 B          4.3   5.3   6.3
 9     3 C          7.3   8.3   9.3
10     4 A          1.4   2.4   3.4
11     4 B          4.4   5.4   6.4
12     4 C          7.4   8.4   9.4
13     5 A          1.5   2.5   3.5
14     5 B          4.5   5.5   6.5
15     5 C          7.5   8.5   9.5

变量名以_分割为两部分。
names_to = c("species", ".value")，指将参数cols选择的列的变量的第1部分作为新变量species的值，将第2部分继续留作列名用来存放值。
注意 .value而不是value，说明这里不是单个列名，而是匹配得到的多个值做列名。
长变宽的过程类似。

# Reverse of the step above: spread the three measurement columns across
# species, naming each new column "<species>_<measurement>" through
# names_glue.
plant_record_wider <- pivot_wider(
  plant_record_longer,
  names_from = "species",
  values_from = c("height", "width", "depth"),
  names_glue = "{species}_{.value}"
)
plant_record_wider

# A tibble: 5 × 10
    day A_height B_height C_height A_width B_width C_width A_depth B_depth
  <int>    <dbl>    <dbl>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1     1      1.1      4.1      7.1     2.1     5.1     8.1     3.1     6.1
2     2      1.2      4.2      7.2     2.2     5.2     8.2     3.2     6.2
3     3      1.3      4.3      7.3     2.3     5.3     8.3     3.3     6.3
4     4      1.4      4.4      7.4     2.4     5.4     8.4     3.4     6.4
5     5      1.5      4.5      7.5     2.5     5.5     8.5     3.5     6.5
# ℹ 1 more variable: C_depth <dbl>

更多内容，可参见我的R语言读书笔记，第三部分“变量类型及变量操作”。