library(tidyverse) # 加载tidyverse包,提供数据处理和可视化的工具集
library(camcorder) # 加载camcorder包,用于记录和回放R会话
library(usmap) # 加载usmap包,用于绘制美国地图
file1 <- "D:/Myblog/posts/ggplot2-tidytuesday-2024week32-tornados/tech/tornados1.csv"
file2 <- "D:/Myblog/posts/ggplot2-tidytuesday-2024week32-tornados/tech/tornados2.csv"
tornados1 <- readr::read_csv(file1)
tornados2 <- readr::read_csv(file2)
1 准备数据
数据集来自 NOAA 风暴预测中心制作的数据集。创建该数据集所做的主要改动是删除了一些列、更改了一些数据类型以及按日期排序。数据周期为 1950 年至 2021 年。
在本分析中,我们将探讨
- 历年龙卷风发生情况。
- 龙卷风在哪里发生?
- 按月划分的龙卷风。
- 因龙卷风而死亡的人数。
2 EDA
2.1 历年龙卷风发生情况
# A tibble: 1 × 1
n
<int>
1 67558
# 按年份查看龙卷风总数并可视化
tornados2 %>%
group_by(yr) %>%
count() %>%
ggplot(aes(x = yr, y = n)) +
geom_line(color = "#98103e") +
labs(x = "", y = "Number of Tornados") +
theme_minimal() +
theme(text = element_text(color = "black"),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_blank())
2.2 龙卷风在哪里发生?
(
tornados2_st <- tornados2 %>%
group_by(state) %>%
summarise(value = n()) %>%
arrange(desc(value))
)
# A tibble: 53 × 2
state value
<chr> <int>
1 TX 9149
2 KS 4375
3 OK 4092
4 FL 3497
5 NE 2967
6 IA 2773
7 IL 2682
8 MS 2476
9 MO 2427
10 AL 2358
# ℹ 43 more rows
# 可视化化龙卷风地点,先对数据进行分段
tornados2_st <- tornados2_st %>%
mutate(
color = case_when(
value < 500 ~ "Less than 500",
value <= 2000 ~ "1001 to 2000",
value <= 4000 ~ "2001 to 4000",
value > 4000 ~ "More than 4000",
TRUE ~ "No Tornados"
)
)
tornados2_st$color <- fct_relevel(
tornados2_st$color,
c("More than 4000", "2001 to 4000", "1001 to 2000", "Less than 500")
)
plot_usmap(
data = tornados2_st, values = "color", labels = FALSE
) +
scale_fill_manual(
values = c(
"Less than 500" = "#ffdfa4",
"1001 to 2000" = "#FFC457",
"2001 to 2000" = "#E4683F",
"2001 to 4000" = "#C03434",
"More than 4000" = "#98103E",
"No Tornadoes" = "grey70"
)
) +
theme(
legend.position.inside = c(0.1, -0.15),
legend.title = element_blank(),
legend.text = element_text(size = 8),
axis.ticks = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_blank()
) +
# 使用 guides 方法为图例设置自定义属性
guides(fill = guide_legend(
label.position = "top", # 设定标签位置为顶部
color = "#808080", # 设置图例标签的颜色
nrow = 1, # 图例在一行中显示
keywidth = 2, # 控制图例键的宽度
keyheight = 0.5 # 控制图例键的高度
))
- 德州的龙卷风数量最多。
- 堪萨斯州、俄克拉荷马州、佛罗里达州和内布拉斯加州位列前五。
- 中西部和南部出现大量龙卷风。
- 东北部和西部地区则很少发生龙卷风。
2.3 按月统计龙卷风
mymonths <- c("Jan","Feb","Mar",
"Apr","May","Jun",
"Jul","Aug","Sep",
"Oct","Nov","Dec")
month <- tornados2 %>%
summarise(value = n(), .by = "mo") %>%
mutate(mo = mymonths)
ggplot(month, aes(x = factor(mo, levels = mymonths), y = value)) +
geom_col(fill = "#98103e") +
labs(x = "", y = "Number of Tornados") +
theme_minimal() +
theme(
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_blank(),
text = element_text(color = "black")
)
2.4 造成死亡人数
1950-2021 年间,6112 人死于龙卷风。
# 每年死亡人数
yearly_tornados_fat <- tornados2 %>%
summarise(fatalities = sum(fat), .by = "yr")
ggplot(yearly_tornados_fat, aes(x = yr, y = fatalities)) +
geom_line(color = "#98103e") +
geom_smooth(method = "lm", se = FALSE, alpha = 0.3, color = "grey") +
annotate(geom = "text", x = 2011, y = 555, label = "2011, 553", color = "black", size = 3, vjust = -0.5) +
annotate(geom = "text", x = 1953, y = 523, label = "1953, 523", color = "black", size = 3, vjust = -0.5) +
scale_x_continuous(breaks = seq(1950, 2021, 10)) +
labs(x = "", y = "Number of Fatalities") +
theme_minimal() +
theme(
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_blank(),
text = element_text(color = "black")
)
- 在过去 70 年里,龙卷风造成的死亡人数趋势是不断下降的。
- 1953年和2011 年死亡人数均超过500人。
2.5 哪些地区死亡人数最多
# 按州统计死亡人数
state_tornadoes_fat <- tornados2 %>%
summarise(value = sum(fat), .by = "state") %>%
arrange(desc(value))
# 绘制地图
state_tornadoes_fat <- state_tornadoes_fat %>%
mutate(color = case_when(
value >= 1 & value <= 10 ~ "Less than 10",
value > 10 & value <= 50 ~ "11 - 50",
value > 51 & value <= 200 ~ "51 - 200",
value > 201 & value <= 400 ~ "201 - 400",
value > 400 ~ "More than 400",
TRUE ~ "No Fatalities"
)) %>%
mutate(color = fct_relevel(color, c("More than 400", "201 - 400", "11 - 50", "Less than 10", "No Fatalities")))
plot_usmap(
data = state_tornadoes_fat,
values = "color", labels = FALSE) +
scale_fill_manual(values = c(
"Less than 10" = "#ffdfa4",
"11 - 50" = "#FFC457",
"51 - 200" = "#E4683F",
"201 - 400" = "#C03434",
"More than 400" = "#98103E",
"No Fatalities" = "grey70"
)) +
labs(x = "", y = "") +
theme(
legend.position.inside = c(0.1, -0.15),
legend.title = element_blank(),
legend.text = element_text(size = 8),
axis.ticks = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_blank()
) +
guides(fill = guide_legend(
label.position = "top",
color = "#808080", nrow = 1,
keywidth = 2, keyheight = 0.5,
))
- 东南部地区死亡人数最多。
- 八个州的龙卷风造成的死亡人数为零。
- 马萨诸塞州的龙卷风次数较少,但死亡人数相对较多。
3 高级绘图
file1 <- "D:/Myblog/posts/ggplot2-tidytuesday-2024week32-tornados/tech/tornados1.csv"
tornados1 <- readr::read_csv(file1)
td <- tornados1 %>%
add_count(date) %>% # 计算每个日期的龙卷风数量,并添加为新列
group_by(date) %>% # 按日期分组
reframe(
date, mo, n,
m = max(mag, na.rm = TRUE), # 计算每个日期组中最大强度(忽略NA值)
m = ifelse(m == -Inf, NA, as.character(m)) # 如果最大强度为-Inf,则替换为NA,并转换为字符类型
) %>%
distinct() # 去除重复行
dates <- data.frame(date = seq(min(td$date, na.rm = TRUE), max(td$date, na.rm = TRUE), 1)) %>%
left_join(td) %>%
mutate(i = row_number())
sp <- data.frame(
t = seq(0, 2 * pi * 73, length.out = 365 * 73) + 10.5 * pi
) %>%
mutate(
x = -t * cos(t),
y = t * sin(t),
i = row_number()
) %>%
left_join(dates)
mo_l <- data.frame(
i = c(month.abb, ""),
t = seq(0, 2 * pi, length.out = 13) + 2 * pi/3
) %>%
mutate(
x = - 550 * cos(t),
y = 550 * sin(t)
)
tn_max <- sp %>%
slice_max(order_by = n, n = 3)
pal <- cetcolor::cet_pal(6, "r2")
ggplot(sp, aes(x = x, y = y)) +
ggrepel::geom_text_repel(data = tn_max, aes(x, y, label = paste0(n, " tornados\n(", format(date, "%b %d, %Y"), ")")), nudge_x = c(200, 420, 200), nudge_y = c(-100, -100, 40), alpha = 0.75, lineheight = 0.9, size = 3, segment.size = 0.4, color = "purple4") +
geom_path(alpha = 0.05) +
geom_point(aes(size = n, fill = m), alpha = 0.7, shape = 21, stroke = 0) +
geom_text(data = mo_l, aes(x, y, label = i), , size = 4, color = "cornflowerblue") +
scale_size_continuous(range = c(0.5, 5), guide = "none") +
scale_fill_manual(values = pal, guide = guide_legend(title = "Highest magnitude\n(*F scale used until 2007, afterwards EF)", title.position = "top", label.position = "bottom", nrow = 1, override.aes = list(size = 8))) +
scale_x_continuous(limits = c(-620, 620)) +
coord_fixed() +
labs(
title = "US Tornadoes, 1950-2022",
subtitle = str_wrap("The size of each point represents the number of tornadoes recorded per day, and its color the highest magnitude* for that day"),
caption = "Source: Storm Prediction Center · Graphic: Georgios Karamanis"
) +
theme_void() +
theme(
legend.position = "bottom",
legend.title = element_text(hjust = 0.5),
plot.background = element_rect(fill = "grey99", color = NA),
plot.title = element_text(hjust = 0.5, face = "bold", size = 20, margin = margin(15, 0, 0, 0)),
plot.subtitle = element_text(hjust = 0.5),
plot.caption = element_text(hjust = 0.5, margin = margin(15, 0, 5, 0))
)