dplyr
1.1.3 数据处理
A Grammar of
Data Manipulation
A Grammar of
Data Manipulation
样本处理
变量处理
汇总
分组和行式处理
用 %>%
连接多个操作
合并多个数据集
操作数据库和 data.table
library(tidyverse)library(nycflights13)data(package = "nycflights13")# 包含airlines、airports、flights、planes、weather等5个数据集
flights # just print() it
#> # A tibble: 336,776 × 19#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl>#> 1 2013 1 1 517 515 2 830 819 11#> 2 2013 1 1 533 529 4 850 830 20#> 3 2013 1 1 542 540 2 923 850 33#> # ℹ 336,773 more rows#> # ℹ 10 more variables: carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,#> # time_hour <dttm>
library(tidyverse)library(nycflights13)data(package = "nycflights13")# 包含airlines、airports、flights、planes、weather等5个数据集
flights # just print() it
#> # A tibble: 336,776 × 19#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl>#> 1 2013 1 1 517 515 2 830 819 11#> 2 2013 1 1 533 529 4 850 830 20#> 3 2013 1 1 542 540 2 923 850 33#> # ℹ 336,773 more rows#> # ℹ 10 more variables: carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,#> # time_hour <dttm>
A tibble: 336,776 x 19
<int>
|<dbl>
|<chr>
|<dttm>
|<lgl>
|<fctr>
|<date>
分别表示变量为 integer | double | character | date-time | logical | factor | date 类型的向量
library(tidyverse)library(nycflights13)data(package = "nycflights13")# 包含airlines、airports、flights、planes、weather等5个数据集
flights # just print() it
#> # A tibble: 336,776 × 19#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl>#> 1 2013 1 1 517 515 2 830 819 11#> 2 2013 1 1 533 529 4 850 830 20#> 3 2013 1 1 542 540 2 923 850 33#> # ℹ 336,773 more rows#> # ℹ 10 more variables: carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,#> # time_hour <dttm>
A tibble: 336,776 x 19
<int>
|<dbl>
|<chr>
|<dttm>
|<lgl>
|<fctr>
|<date>
分别表示变量为 integer | double | character | date-time | logical | factor | date 类型的向量
?flights # 打开flights数据集的帮助文档以进一步了解数据集,如变量的定义
glimpse(flights) # 数据一瞥
#> Rows: 336,776#> Columns: 19#> $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,…#> $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…#> $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…#> $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, 558, 558, …#> $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, 600, 600, …#> $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1, 0, -1, 0…#> $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849, 853, 924,…#> $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851, 856, 917,…#> $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -14, 31, -4,…#> $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "AA", "B6",…#> $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 49, 71, 194…#> $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N39463", "N516…#> $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA", "JFK", "L…#> $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD", "MCO", "O…#> $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 158, 345, 3…#> $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, 1028, 1005…#> $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6,…#> $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0, 0, 0, 0,…#> $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2…
glimpse(flights) # 数据一瞥
#> Rows: 336,776#> Columns: 19#> $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,…#> $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…#> $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…#> $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, 558, 558, …#> $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, 600, 600, …#> $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1, 0, -1, 0…#> $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849, 853, 924,…#> $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851, 856, 917,…#> $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -14, 31, -4,…#> $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "AA", "B6",…#> $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 49, 71, 194…#> $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N39463", "N516…#> $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA", "JFK", "L…#> $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD", "MCO", "O…#> $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 158, 345, 3…#> $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, 1028, 1005…#> $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6,…#> $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0, 0, 0, 0,…#> $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2…
View(flights) # 在 RStudio 数据浏览器中打开数据集# tibble::view() 调用 utils::View() 并不可见地返回原数据集,便于 %>% 操作,但速度好像慢很多
(combine tables)
airlines
#> # A tibble: 16 × 2#> carrier name #> <chr> <chr> #> 1 9E Endeavor Air Inc. #> 2 AA American Airlines Inc.#> 3 AS Alaska Airlines Inc. #> # ℹ 13 more rows
planes
#> # A tibble: 3,322 × 9#> tailnum year type manufacturer model#> <chr> <int> <chr> <chr> <chr>#> 1 N10156 2004 Fixed win… EMBRAER EMB-…#> 2 N102UW 1998 Fixed win… AIRBUS INDU… A320…#> 3 N103US 1999 Fixed win… AIRBUS INDU… A320…#> # ℹ 3,319 more rows#> # ℹ 4 more variables: engines <int>,#> # seats <int>, speed <int>, engine <chr>
airlines
#> # A tibble: 16 × 2#> carrier name #> <chr> <chr> #> 1 9E Endeavor Air Inc. #> 2 AA American Airlines Inc.#> 3 AS Alaska Airlines Inc. #> # ℹ 13 more rows
planes
#> # A tibble: 3,322 × 9#> tailnum year type manufacturer model#> <chr> <int> <chr> <chr> <chr>#> 1 N10156 2004 Fixed win… EMBRAER EMB-…#> 2 N102UW 1998 Fixed win… AIRBUS INDU… A320…#> 3 N103US 1999 Fixed win… AIRBUS INDU… A320…#> # ℹ 3,319 more rows#> # ℹ 4 more variables: engines <int>,#> # seats <int>, speed <int>, engine <chr>
airports
#> # A tibble: 1,458 × 8#> faa name lat lon alt tz dst #> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>#> 1 04G Lansdo… 41.1 -80.6 1044 -5 A #> 2 06A Moton … 32.5 -85.7 264 -6 A #> 3 06C Schaum… 42.0 -88.1 801 -6 A #> # ℹ 1,455 more rows#> # ℹ 1 more variable: tzone <chr>
weather
#> # A tibble: 26,115 × 15#> origin year month day hour temp dewp#> <chr> <int> <int> <int> <int> <dbl> <dbl>#> 1 EWR 2013 1 1 1 39.0 26.1#> 2 EWR 2013 1 1 2 39.0 27.0#> 3 EWR 2013 1 1 3 39.0 28.0#> # ℹ 26,112 more rows#> # ℹ 8 more variables: humid <dbl>,#> # wind_dir <dbl>, wind_speed <dbl>,#> # wind_gust <dbl>, precip <dbl>,#> # pressure <dbl>, visib <dbl>,#> # time_hour <dttm>
主键(primary key):指的是数据表中一个变量(列)或多个变量的组合,其值能唯一地标识表中的每一行。主键主要用来与其他表的外键关联,以及本表的修改与删除。
planes %>% count(tailnum) %>% filter(n > 1)
#> # A tibble: 0 × 2#> # ℹ 2 variables: tailnum <chr>, n <int>
weather %>% count(year, month, day, hour, origin) %>% filter(n > 1)
#> # A tibble: 3 × 6#> year month day hour origin n#> <int> <int> <int> <int> <chr> <int>#> 1 2013 11 3 1 EWR 2#> 2 2013 11 3 1 JFK 2#> 3 2013 11 3 1 LGA 2
weather <- weather %>% distinct(year, month, day, hour, origin, .keep_all = TRUE) # 修正之
主键(primary key):指的是数据表中一个变量(列)或多个变量的组合,其值能唯一地标识表中的每一行。主键主要用来与其他表的外键关联,以及本表的修改与删除。
planes %>% count(tailnum) %>% filter(n > 1)
#> # A tibble: 0 × 2#> # ℹ 2 variables: tailnum <chr>, n <int>
weather %>% count(year, month, day, hour, origin) %>% filter(n > 1)
#> # A tibble: 3 × 6#> year month day hour origin n#> <int> <int> <int> <int> <chr> <int>#> 1 2013 11 3 1 EWR 2#> 2 2013 11 3 1 JFK 2#> 3 2013 11 3 1 LGA 2
weather <- weather %>% distinct(year, month, day, hour, origin, .keep_all = TRUE) # 修正之
外键(foreign key):对应于另一数据表主键的一个或多个变量。
*_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)
flights %>% select(year:day, hour, tailnum, carrier) %>% left_join(airlines) # 以同名变量为合并键,增加变量 name(航空公司全名)
#> Joining with `by = join_by(carrier)`
#> # A tibble: 336,776 × 7#> year month day hour tailnum carrier name #> <int> <int> <int> <dbl> <chr> <chr> <chr> #> 1 2013 1 1 5 N14228 UA United Air Lines Inc. #> 2 2013 1 1 5 N24211 UA United Air Lines Inc. #> 3 2013 1 1 5 N619AA AA American Airlines Inc.#> # ℹ 336,773 more rows
*_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)
flights %>% select(year:day, hour, tailnum, carrier) %>% left_join(airlines) # 以同名变量为合并键,增加变量 name(航空公司全名)
#> Joining with `by = join_by(carrier)`
#> # A tibble: 336,776 × 7#> year month day hour tailnum carrier name #> <int> <int> <int> <dbl> <chr> <chr> <chr> #> 1 2013 1 1 5 N14228 UA United Air Lines Inc. #> 2 2013 1 1 5 N24211 UA United Air Lines Inc. #> 3 2013 1 1 5 N619AA AA American Airlines Inc.#> # ℹ 336,773 more rows
flights %>% select(year:day, hour, tailnum, carrier) %>% mutate(name = airlines$name[match(carrier, airlines$carrier)]) # 殊途同归,但容易理解吗?
#> # A tibble: 336,776 × 7#> year month day hour tailnum carrier name #> <int> <int> <int> <dbl> <chr> <chr> <chr> #> 1 2013 1 1 5 N14228 UA United Air Lines Inc. #> 2 2013 1 1 5 N24211 UA United Air Lines Inc. #> 3 2013 1 1 5 N619AA AA American Airlines Inc.#> # ℹ 336,773 more rows
用 by
参数设定连接所用的键变量
# 默认时(即`by = NULL`)使用两个数据集中同名的全部变量,即natural join# 此时会显示提示信息“Joining, by = join_by(以,分隔的全部同名变量)”flights %>% left_join(weather)# 显性设定`by = join_by(以,分隔的键变量)` 或者 `by = 键变量字符向量`flights %>% left_join(weather, by = join_by(year, month, day, origin, hour))# 两个数据集的变量不同名时,则显性设定如# `by = join_by(xv1 == yv1, xv2 == yv2)` 或者 `by = 命名字符变量向量`flights %>% left_join(airports, by = join_by(dest == faa)) # by = c("dest" = "faa")# dplyr v1.1.0 `by` 参数设定的新语法 `by = join_by(...)` 更加灵活,支持非等值# 连接(non-equi joins)(具体见后)
合并式连接函数*_join()
的参数 relationship
1,3
"one-to-one"
[见前]
"one-to-many"
/ "many-to-one"
"many-to-many"
2many-to-many
的等值合并(equi join)并不合理,*_join()
函数在 relationship
参数取默认值 NULL
时会给出相应的警告信息。*_join()
函数的其他参数:keep
、na_matches
、multiple
和 unmatched
。semi_join(x, y, by = NULL, copy = FALSE, ...) | anti_join()
保留 | 丢弃 x
数据集中在 y
数据集中找得到匹配的样本
top_dest <- flights %>% count(dest, sort = TRUE) %>% head(10)flights %>% filter(dest %in% top_dest$dest)
#> # A tibble: 141,145 × 19#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl>#> 1 2013 1 1 542 540 2 923 850 33#> 2 2013 1 1 554 600 -6 812 837 -25#> 3 2013 1 1 554 558 -4 740 728 12#> # ℹ 141,142 more rows#> # ℹ 10 more variables: carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,#> # time_hour <dttm>
semi_join(x, y, by = NULL, copy = FALSE, ...) | anti_join()
保留 | 丢弃 x
数据集中在 y
数据集中找得到匹配的样本
top_dest <- flights %>% count(dest, sort = TRUE) %>% head(10)flights %>% filter(dest %in% top_dest$dest)
#> # A tibble: 141,145 × 19#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl>#> 1 2013 1 1 542 540 2 923 850 33#> 2 2013 1 1 554 600 -6 812 837 -25#> 3 2013 1 1 554 558 -4 740 728 12#> # ℹ 141,142 more rows#> # ℹ 10 more variables: carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,#> # time_hour <dttm>
flights %>% semi_join(top_dest)# 相同的结果,但更容易处理 filter() + %in% 所难以处理的存在多个 key 变量的情况
# # input data by hand# transactions <- tibble(# company = c("A", "A", "B", "B"),# year = c(2019, 2020, 2021, 2023),# revenue = c(50, 4, 10, 12)# )transactions
#> # A tibble: 4 × 3#> company year revenue#> <chr> <dbl> <dbl>#> 1 A 2019 50#> 2 A 2020 4#> 3 B 2021 10#> 4 B 2023 12
companies
#> # A tibble: 3 × 3#> id since name #> <chr> <dbl> <chr> #> 1 A 1973 Patagonia#> 2 B 2009 RStudio #> 3 B 2022 Posit
# # input data by hand# transactions <- tibble(# company = c("A", "A", "B", "B"),# year = c(2019, 2020, 2021, 2023),# revenue = c(50, 4, 10, 12)# )transactions
#> # A tibble: 4 × 3#> company year revenue#> <chr> <dbl> <dbl>#> 1 A 2019 50#> 2 A 2020 4#> 3 B 2021 10#> 4 B 2023 12
companies
#> # A tibble: 3 × 3#> id since name #> <chr> <dbl> <chr> #> 1 A 1973 Patagonia#> 2 B 2009 RStudio #> 3 B 2022 Posit
# 1. Equality Joins:transactions |> left_join(companies, by = join_by(company == id))
#> Warning in left_join(transactions, companies, by = join_by(company == id)): Detected an unexpected many-to-many#> relationship between `x` and `y`.#> ℹ Row 3 of `x` matches multiple rows in `y`.#> ℹ Row 1 of `y` matches multiple rows in `x`.#> ℹ If a many-to-many relationship is#> expected, set `relationship =#> "many-to-many"` to silence this warning.
#> # A tibble: 6 × 5#> company year revenue since name #> <chr> <dbl> <dbl> <dbl> <chr> #> 1 A 2019 50 1973 Patagonia#> 2 A 2020 4 1973 Patagonia#> 3 B 2021 10 2009 RStudio #> 4 B 2021 10 2022 Posit #> 5 B 2023 12 2009 RStudio #> 6 B 2023 12 2022 Posit
🤔 如何解决?
# 2. Inequality Joins: # involves at least one join expression# containing one of inequality # conditions: >=, >, <=, or <.transactions |> left_join(companies, join_by(company == id, year >= since))
#> # A tibble: 5 × 5#> company year revenue since name #> <chr> <dbl> <dbl> <dbl> <chr> #> 1 A 2019 50 1973 Patagonia#> 2 A 2020 4 1973 Patagonia#> 3 B 2021 10 2009 RStudio #> 4 B 2023 12 2009 RStudio #> 5 B 2023 12 2022 Posit
# 2. Inequality Joins: # involves at least one join expression# containing one of inequality # conditions: >=, >, <=, or <.transactions |> left_join(companies, join_by(company == id, year >= since))
#> # A tibble: 5 × 5#> company year revenue since name #> <chr> <dbl> <dbl> <dbl> <chr> #> 1 A 2019 50 1973 Patagonia#> 2 A 2020 4 1973 Patagonia#> 3 B 2021 10 2009 RStudio #> 4 B 2023 12 2009 RStudio #> 5 B 2023 12 2022 Posit
# 3. Rolling Joins: transactions |> left_join(companies, join_by(company == id, closest(year >= since)))
#> # A tibble: 4 × 5#> company year revenue since name #> <chr> <dbl> <dbl> <dbl> <chr> #> 1 A 2019 50 1973 Patagonia#> 2 A 2020 4 1973 Patagonia#> 3 B 2021 10 2009 RStudio #> 4 B 2023 12 2022 Posit
# 4. Overlap Joins:# three helpers for overlap joins: # between(), overlaps(), and within()
# 2. Inequality Joins: # involves at least one join expression# containing one of inequality # conditions: >=, >, <=, or <.transactions |> left_join(companies, join_by(company == id, year >= since))
#> # A tibble: 5 × 5#> company year revenue since name #> <chr> <dbl> <dbl> <dbl> <chr> #> 1 A 2019 50 1973 Patagonia#> 2 A 2020 4 1973 Patagonia#> 3 B 2021 10 2009 RStudio #> 4 B 2023 12 2009 RStudio #> 5 B 2023 12 2022 Posit
# 3. Rolling Joins: transactions |> left_join(companies, join_by(company == id, closest(year >= since)))
#> # A tibble: 4 × 5#> company year revenue since name #> <chr> <dbl> <dbl> <dbl> <chr> #> 1 A 2019 50 1973 Patagonia#> 2 A 2020 4 1973 Patagonia#> 3 B 2021 10 2009 RStudio #> 4 B 2023 12 2022 Posit
# 4. Overlap Joins:# three helpers for overlap joins: # between(), overlaps(), and within()
👉 ?join_by
cross_join()
/ nest_join()
cross_join(x, y, ...)
将数据集x
中的每一行与数据集y
中的每一行进行匹配,结果为nrow(x) * nrow(y)
行
nest_join(x, y, by = NULL, ...)
保留数据集x
,并将匹配的数据集y
的数据作为新增的列表列,元素为 tibble
(当在数据集y
中找不到匹配数据时,列表列相应位置填充一个0行的tibble
)
从某种意义上说,cross_join()
和 nest_join()
是最基础的数据集连接操作函数,例如 left_join()
可视为 nest_join() + unnest(.keep_empty = TRUE)
,semi_join()
可视为 nest_join() + filter()
(filter()
用来检查列表列中元素是否为0行的tibble
)
cross_join()
/ nest_join()
cross_join(x, y, ...)
将数据集x
中的每一行与数据集y
中的每一行进行匹配,结果为nrow(x) * nrow(y)
行
nest_join(x, y, by = NULL, ...)
保留数据集x
,并将匹配的数据集y
的数据作为新增的列表列,元素为 tibble
(当在数据集y
中找不到匹配数据时,列表列相应位置填充一个0行的tibble
)
从某种意义上说,cross_join()
和 nest_join()
是最基础的数据集连接操作函数,例如 left_join()
可视为 nest_join() + unnest(.keep_empty = TRUE)
,semi_join()
可视为 nest_join() + filter()
(filter()
用来检查列表列中元素是否为0行的tibble
)
用 y
表中的数据来修改 x
表中的行
rows_insert(x, y, by = NULL, ...)
| rows_update()
| rows_upsert()
rows_delete()
rows_append()
| rows_patch()
合并多个数据集的行或列
bind_rows(..., .id = NULL)
# do.call(rbind, dfs_list)
bind_cols(..., .name_repair)
# do.call(cbind, dfs_list)
集合式合并
intersect(x, y, ...)
setdiff(x, y, ...)
| symdiff()
union(x, y, ...)
| union_all()
A Grammar of
Data Manipulation
Keyboard shortcuts
↑, ←, Pg Up, k | Go to previous slide |
↓, →, Pg Dn, Space, j | Go to next slide |
Home | Go to first slide |
End | Go to last slide |
Number + Return | Go to specific slide |
b / m / f | Toggle blackout / mirrored / fullscreen mode |
c | Clone slideshow |
p | Toggle presenter mode |
t | Restart the presentation timer |
?, h | Toggle this help |
o | Tile View: Overview of Slides |
Esc | Back to slideshow |
dplyr
1.1.3 数据处理
A Grammar of
Data Manipulation
A Grammar of
Data Manipulation
样本处理
变量处理
汇总
分组和行式处理
用 %>%
连接多个操作
合并多个数据集
操作数据库和 data.table
library(tidyverse)library(nycflights13)data(package = "nycflights13")# 包含airlines、airports、flights、planes、weather等5个数据集
flights # just print() it
#> # A tibble: 336,776 × 19#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl>#> 1 2013 1 1 517 515 2 830 819 11#> 2 2013 1 1 533 529 4 850 830 20#> 3 2013 1 1 542 540 2 923 850 33#> # ℹ 336,773 more rows#> # ℹ 10 more variables: carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,#> # time_hour <dttm>
library(tidyverse)library(nycflights13)data(package = "nycflights13")# 包含airlines、airports、flights、planes、weather等5个数据集
flights # just print() it
#> # A tibble: 336,776 × 19#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl>#> 1 2013 1 1 517 515 2 830 819 11#> 2 2013 1 1 533 529 4 850 830 20#> 3 2013 1 1 542 540 2 923 850 33#> # ℹ 336,773 more rows#> # ℹ 10 more variables: carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,#> # time_hour <dttm>
A tibble: 336,776 x 19
<int>
|<dbl>
|<chr>
|<dttm>
|<lgl>
|<fctr>
|<date>
分别表示变量为 integer | double | character | date-time | logical | factor | date 类型的向量
library(tidyverse)library(nycflights13)data(package = "nycflights13")# 包含airlines、airports、flights、planes、weather等5个数据集
flights # just print() it
#> # A tibble: 336,776 × 19#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl>#> 1 2013 1 1 517 515 2 830 819 11#> 2 2013 1 1 533 529 4 850 830 20#> 3 2013 1 1 542 540 2 923 850 33#> # ℹ 336,773 more rows#> # ℹ 10 more variables: carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,#> # time_hour <dttm>
A tibble: 336,776 x 19
<int>
|<dbl>
|<chr>
|<dttm>
|<lgl>
|<fctr>
|<date>
分别表示变量为 integer | double | character | date-time | logical | factor | date 类型的向量
?flights # 打开flights数据集的帮助文档以进一步了解数据集,如变量的定义
glimpse(flights) # 数据一瞥
#> Rows: 336,776#> Columns: 19#> $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,…#> $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…#> $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…#> $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, 558, 558, …#> $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, 600, 600, …#> $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1, 0, -1, 0…#> $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849, 853, 924,…#> $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851, 856, 917,…#> $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -14, 31, -4,…#> $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "AA", "B6",…#> $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 49, 71, 194…#> $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N39463", "N516…#> $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA", "JFK", "L…#> $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD", "MCO", "O…#> $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 158, 345, 3…#> $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, 1028, 1005…#> $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6,…#> $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0, 0, 0, 0,…#> $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2…
glimpse(flights) # 数据一瞥
#> Rows: 336,776#> Columns: 19#> $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,…#> $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…#> $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…#> $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, 558, 558, …#> $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, 600, 600, …#> $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1, 0, -1, 0…#> $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849, 853, 924,…#> $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851, 856, 917,…#> $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -14, 31, -4,…#> $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "AA", "B6",…#> $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 49, 71, 194…#> $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N39463", "N516…#> $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA", "JFK", "L…#> $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD", "MCO", "O…#> $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 158, 345, 3…#> $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, 1028, 1005…#> $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6,…#> $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0, 0, 0, 0,…#> $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2…
View(flights) # 在 RStudio 数据浏览器中打开数据集# tibble::view() 调用 utils::View() 并不可见地返回原数据集,便于 %>% 操作,但速度好像慢很多
(combine tables)
airlines
#> # A tibble: 16 × 2#> carrier name #> <chr> <chr> #> 1 9E Endeavor Air Inc. #> 2 AA American Airlines Inc.#> 3 AS Alaska Airlines Inc. #> # ℹ 13 more rows
planes
#> # A tibble: 3,322 × 9#> tailnum year type manufacturer model#> <chr> <int> <chr> <chr> <chr>#> 1 N10156 2004 Fixed win… EMBRAER EMB-…#> 2 N102UW 1998 Fixed win… AIRBUS INDU… A320…#> 3 N103US 1999 Fixed win… AIRBUS INDU… A320…#> # ℹ 3,319 more rows#> # ℹ 4 more variables: engines <int>,#> # seats <int>, speed <int>, engine <chr>
airlines
#> # A tibble: 16 × 2#> carrier name #> <chr> <chr> #> 1 9E Endeavor Air Inc. #> 2 AA American Airlines Inc.#> 3 AS Alaska Airlines Inc. #> # ℹ 13 more rows
planes
#> # A tibble: 3,322 × 9#> tailnum year type manufacturer model#> <chr> <int> <chr> <chr> <chr>#> 1 N10156 2004 Fixed win… EMBRAER EMB-…#> 2 N102UW 1998 Fixed win… AIRBUS INDU… A320…#> 3 N103US 1999 Fixed win… AIRBUS INDU… A320…#> # ℹ 3,319 more rows#> # ℹ 4 more variables: engines <int>,#> # seats <int>, speed <int>, engine <chr>
airports
#> # A tibble: 1,458 × 8#> faa name lat lon alt tz dst #> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>#> 1 04G Lansdo… 41.1 -80.6 1044 -5 A #> 2 06A Moton … 32.5 -85.7 264 -6 A #> 3 06C Schaum… 42.0 -88.1 801 -6 A #> # ℹ 1,455 more rows#> # ℹ 1 more variable: tzone <chr>
weather
#> # A tibble: 26,115 × 15#> origin year month day hour temp dewp#> <chr> <int> <int> <int> <int> <dbl> <dbl>#> 1 EWR 2013 1 1 1 39.0 26.1#> 2 EWR 2013 1 1 2 39.0 27.0#> 3 EWR 2013 1 1 3 39.0 28.0#> # ℹ 26,112 more rows#> # ℹ 8 more variables: humid <dbl>,#> # wind_dir <dbl>, wind_speed <dbl>,#> # wind_gust <dbl>, precip <dbl>,#> # pressure <dbl>, visib <dbl>,#> # time_hour <dttm>
主键(primary key):指的是数据表中一个变量(列)或多个变量的组合,其值能唯一地标识表中的每一行。主键主要用来与其他表的外键关联,以及本表的修改与删除。
planes %>% count(tailnum) %>% filter(n > 1)
#> # A tibble: 0 × 2#> # ℹ 2 variables: tailnum <chr>, n <int>
weather %>% count(year, month, day, hour, origin) %>% filter(n > 1)
#> # A tibble: 3 × 6#> year month day hour origin n#> <int> <int> <int> <int> <chr> <int>#> 1 2013 11 3 1 EWR 2#> 2 2013 11 3 1 JFK 2#> 3 2013 11 3 1 LGA 2
weather <- weather %>% distinct(year, month, day, hour, origin, .keep_all = TRUE) # 修正之
主键(primary key):指的是数据表中一个变量(列)或多个变量的组合,其值能唯一地标识表中的每一行。主键主要用来与其他表的外键关联,以及本表的修改与删除。
planes %>% count(tailnum) %>% filter(n > 1)
#> # A tibble: 0 × 2#> # ℹ 2 variables: tailnum <chr>, n <int>
weather %>% count(year, month, day, hour, origin) %>% filter(n > 1)
#> # A tibble: 3 × 6#> year month day hour origin n#> <int> <int> <int> <int> <chr> <int>#> 1 2013 11 3 1 EWR 2#> 2 2013 11 3 1 JFK 2#> 3 2013 11 3 1 LGA 2
weather <- weather %>% distinct(year, month, day, hour, origin, .keep_all = TRUE) # 修正之
外键(foreign key):对应于另一数据表主键的一个或多个变量。
*_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)
flights %>% select(year:day, hour, tailnum, carrier) %>% left_join(airlines) # 以同名变量为合并键,增加变量 name(航空公司全名)
#> Joining with `by = join_by(carrier)`
#> # A tibble: 336,776 × 7#> year month day hour tailnum carrier name #> <int> <int> <int> <dbl> <chr> <chr> <chr> #> 1 2013 1 1 5 N14228 UA United Air Lines Inc. #> 2 2013 1 1 5 N24211 UA United Air Lines Inc. #> 3 2013 1 1 5 N619AA AA American Airlines Inc.#> # ℹ 336,773 more rows
*_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)
flights %>% select(year:day, hour, tailnum, carrier) %>% left_join(airlines) # 以同名变量为合并键,增加变量 name(航空公司全名)
#> Joining with `by = join_by(carrier)`
#> # A tibble: 336,776 × 7#> year month day hour tailnum carrier name #> <int> <int> <int> <dbl> <chr> <chr> <chr> #> 1 2013 1 1 5 N14228 UA United Air Lines Inc. #> 2 2013 1 1 5 N24211 UA United Air Lines Inc. #> 3 2013 1 1 5 N619AA AA American Airlines Inc.#> # ℹ 336,773 more rows
flights %>% select(year:day, hour, tailnum, carrier) %>% mutate(name = airlines$name[match(carrier, airlines$carrier)]) # 殊途同归,但容易理解吗?
#> # A tibble: 336,776 × 7#> year month day hour tailnum carrier name #> <int> <int> <int> <dbl> <chr> <chr> <chr> #> 1 2013 1 1 5 N14228 UA United Air Lines Inc. #> 2 2013 1 1 5 N24211 UA United Air Lines Inc. #> 3 2013 1 1 5 N619AA AA American Airlines Inc.#> # ℹ 336,773 more rows
用 by
参数设定连接所用的键变量
# 默认时(即`by = NULL`)使用两个数据集中同名的全部变量,即natural join# 此时会显示提示信息“Joining, by = join_by(以,分隔的全部同名变量)”flights %>% left_join(weather)# 显性设定`by = join_by(以,分隔的键变量)` 或者 `by = 键变量字符向量`flights %>% left_join(weather, by = join_by(year, month, day, origin, hour))# 两个数据集的变量不同名时,则显性设定如# `by = join_by(xv1 == yv1, xv2 == yv2)` 或者 `by = 命名字符变量向量`flights %>% left_join(airports, by = join_by(dest == faa)) # by = c("dest" = "faa")# dplyr v1.1.0 `by` 参数设定的新语法 `by = join_by(...)` 更加灵活,支持非等值# 连接(non-equi joins)(具体见后)
合并式连接函数*_join()
的参数 relationship
1,3
"one-to-one"
[见前]
"one-to-many"
/ "many-to-one"
"many-to-many"
2many-to-many
的等值合并(equi join)并不合理,*_join()
函数在 relationship
参数取默认值 NULL
时会给出相应的警告信息。*_join()
函数的其他参数:keep
、na_matches
、multiple
和 unmatched
。semi_join(x, y, by = NULL, copy = FALSE, ...) | anti_join()
保留 | 丢弃 x
数据集中在 y
数据集中找得到匹配的样本
top_dest <- flights %>% count(dest, sort = TRUE) %>% head(10)flights %>% filter(dest %in% top_dest$dest)
#> # A tibble: 141,145 × 19#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl>#> 1 2013 1 1 542 540 2 923 850 33#> 2 2013 1 1 554 600 -6 812 837 -25#> 3 2013 1 1 554 558 -4 740 728 12#> # ℹ 141,142 more rows#> # ℹ 10 more variables: carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,#> # time_hour <dttm>
semi_join(x, y, by = NULL, copy = FALSE, ...) | anti_join()
保留 | 丢弃 x
数据集中在 y
数据集中找得到匹配的样本
top_dest <- flights %>% count(dest, sort = TRUE) %>% head(10)flights %>% filter(dest %in% top_dest$dest)
#> # A tibble: 141,145 × 19#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay#> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl>#> 1 2013 1 1 542 540 2 923 850 33#> 2 2013 1 1 554 600 -6 812 837 -25#> 3 2013 1 1 554 558 -4 740 728 12#> # ℹ 141,142 more rows#> # ℹ 10 more variables: carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,#> # time_hour <dttm>
flights %>% semi_join(top_dest)# 相同的结果,但更容易处理 filter() + %in% 所难以处理的存在多个 key 变量的情况
# # input data by hand# transactions <- tibble(# company = c("A", "A", "B", "B"),# year = c(2019, 2020, 2021, 2023),# revenue = c(50, 4, 10, 12)# )transactions
#> # A tibble: 4 × 3#> company year revenue#> <chr> <dbl> <dbl>#> 1 A 2019 50#> 2 A 2020 4#> 3 B 2021 10#> 4 B 2023 12
companies
#> # A tibble: 3 × 3#> id since name #> <chr> <dbl> <chr> #> 1 A 1973 Patagonia#> 2 B 2009 RStudio #> 3 B 2022 Posit
# # input data by hand# transactions <- tibble(# company = c("A", "A", "B", "B"),# year = c(2019, 2020, 2021, 2023),# revenue = c(50, 4, 10, 12)# )transactions
#> # A tibble: 4 × 3#> company year revenue#> <chr> <dbl> <dbl>#> 1 A 2019 50#> 2 A 2020 4#> 3 B 2021 10#> 4 B 2023 12
companies
#> # A tibble: 3 × 3#> id since name #> <chr> <dbl> <chr> #> 1 A 1973 Patagonia#> 2 B 2009 RStudio #> 3 B 2022 Posit
# 1. Equality Joins:transactions |> left_join(companies, by = join_by(company == id))
#> Warning in left_join(transactions, companies, by = join_by(company == id)): Detected an unexpected many-to-many#> relationship between `x` and `y`.#> ℹ Row 3 of `x` matches multiple rows in `y`.#> ℹ Row 1 of `y` matches multiple rows in `x`.#> ℹ If a many-to-many relationship is#> expected, set `relationship =#> "many-to-many"` to silence this warning.
#> # A tibble: 6 × 5#> company year revenue since name #> <chr> <dbl> <dbl> <dbl> <chr> #> 1 A 2019 50 1973 Patagonia#> 2 A 2020 4 1973 Patagonia#> 3 B 2021 10 2009 RStudio #> 4 B 2021 10 2022 Posit #> 5 B 2023 12 2009 RStudio #> 6 B 2023 12 2022 Posit
🤔 如何解决?
# 2. Inequality Joins: # involves at least one join expression# containing one of inequality # conditions: >=, >, <=, or <.transactions |> left_join(companies, join_by(company == id, year >= since))
#> # A tibble: 5 × 5#> company year revenue since name #> <chr> <dbl> <dbl> <dbl> <chr> #> 1 A 2019 50 1973 Patagonia#> 2 A 2020 4 1973 Patagonia#> 3 B 2021 10 2009 RStudio #> 4 B 2023 12 2009 RStudio #> 5 B 2023 12 2022 Posit
# 2. Inequality Joins: # involves at least one join expression# containing one of inequality # conditions: >=, >, <=, or <.transactions |> left_join(companies, join_by(company == id, year >= since))
#> # A tibble: 5 × 5#> company year revenue since name #> <chr> <dbl> <dbl> <dbl> <chr> #> 1 A 2019 50 1973 Patagonia#> 2 A 2020 4 1973 Patagonia#> 3 B 2021 10 2009 RStudio #> 4 B 2023 12 2009 RStudio #> 5 B 2023 12 2022 Posit
# 3. Rolling Joins: transactions |> left_join(companies, join_by(company == id, closest(year >= since)))
#> # A tibble: 4 × 5#> company year revenue since name #> <chr> <dbl> <dbl> <dbl> <chr> #> 1 A 2019 50 1973 Patagonia#> 2 A 2020 4 1973 Patagonia#> 3 B 2021 10 2009 RStudio #> 4 B 2023 12 2022 Posit
# 4. Overlap Joins:# three helpers for overlap joins: # between(), overlaps(), and within()
# 2. Inequality Joins: # involves at least one join expression# containing one of inequality # conditions: >=, >, <=, or <.transactions |> left_join(companies, join_by(company == id, year >= since))
#> # A tibble: 5 × 5#> company year revenue since name #> <chr> <dbl> <dbl> <dbl> <chr> #> 1 A 2019 50 1973 Patagonia#> 2 A 2020 4 1973 Patagonia#> 3 B 2021 10 2009 RStudio #> 4 B 2023 12 2009 RStudio #> 5 B 2023 12 2022 Posit
# 3. Rolling Joins: transactions |> left_join(companies, join_by(company == id, closest(year >= since)))
#> # A tibble: 4 × 5#> company year revenue since name #> <chr> <dbl> <dbl> <dbl> <chr> #> 1 A 2019 50 1973 Patagonia#> 2 A 2020 4 1973 Patagonia#> 3 B 2021 10 2009 RStudio #> 4 B 2023 12 2022 Posit
# 4. Overlap Joins:# three helpers for overlap joins: # between(), overlaps(), and within()
👉 ?join_by
cross_join()
/ nest_join()
cross_join(x, y, ...)
将数据集x
中的每一行与数据集y
中的每一行进行匹配,结果为nrow(x) * nrow(y)
行
nest_join(x, y, by = NULL, ...)
保留数据集x
,并将匹配的数据集y
的数据作为新增的列表列,元素为 tibble
(当在数据集y
中找不到匹配数据时,列表列相应位置填充一个0行的tibble
)
从某种意义上说,cross_join()
和 nest_join()
是最基础的数据集连接操作函数,例如 left_join()
可视为 nest_join() + unnest(.keep_empty = TRUE)
,semi_join()
可视为 nest_join() + filter()
(filter()
用来检查列表列中元素是否为0行的tibble
)
cross_join()
/ nest_join()
cross_join(x, y, ...)
将数据集x
中的每一行与数据集y
中的每一行进行匹配,结果为nrow(x) * nrow(y)
行
nest_join(x, y, by = NULL, ...)
保留数据集x
,并将匹配的数据集y
的数据作为新增的列表列,元素为 tibble
(当在数据集y
中找不到匹配数据时,列表列相应位置填充一个0行的tibble
)
从某种意义上说,cross_join()
和 nest_join()
是最基础的数据集连接操作函数,例如 left_join()
可视为 nest_join() + unnest(.keep_empty = TRUE)
,semi_join()
可视为 nest_join() + filter()
(filter()
用来检查列表列中元素是否为0行的tibble
)
用 y
表中的数据来修改 x
表中的行
rows_insert(x, y, by = NULL, ...)
| rows_update()
| rows_upsert()
rows_delete()
rows_append()
| rows_patch()
合并多个数据集的行或列
bind_rows(..., .id = NULL)
# do.call(rbind, dfs_list)
bind_cols(..., .name_repair)
# do.call(cbind, dfs_list)
集合式合并
intersect(x, y, ...)
setdiff(x, y, ...)
| symdiff()
union(x, y, ...)
| union_all()