美国龙卷风数据探索

Tidytuesday 2023 Week 20

ggplot2
Tidytuesday
Author

Lee

Published

September 3, 2024

1 准备数据

library(tidyverse) # 加载tidyverse包,提供数据处理和可视化的工具集
library(camcorder) # 加载camcorder包,用于记录和回放R会话
library(usmap) # 加载usmap包,用于绘制美国地图

file1 <- "D:/Myblog/posts/ggplot2-tidytuesday-2024week32-tornados/tech/tornados1.csv"
file2 <- "D:/Myblog/posts/ggplot2-tidytuesday-2024week32-tornados/tech/tornados2.csv"

tornados1 <- readr::read_csv(file1)
tornados2 <- readr::read_csv(file2)

数据集来自 NOAA 风暴预测中心制作的数据集。创建该数据集所做的主要改动是删除了一些列、更改了一些数据类型以及按日期排序。数据周期为 1950 年至 2021 年。

在本分析中,我们将探讨

  • 历年龙卷风发生情况。
  • 龙卷风在哪里发生?
  • 按月划分的龙卷风。
  • 因龙卷风而死亡的人数。

2 EDA

2.1 历年龙卷风发生情况

names(tornados2)[5] <- "state"

# 查看龙卷风总数
tornados2 %>%
  count()
# A tibble: 1 × 1
      n
  <int>
1 67558
# 按年份查看龙卷风总数并可视化
tornados2 %>%
  group_by(yr) %>%
  count() %>%
  ggplot(aes(x = yr, y = n)) +
  geom_line(color = "#98103e") +
  labs(x = "", y = "Number of Tornados") +
  theme_minimal() +
  theme(text = element_text(color = "black"), 
  panel.grid.major.x = element_blank(),
  panel.grid.major.y = element_blank())

2.2 龙卷风在哪里发生?

(
  tornados2_st <- tornados2 %>%
    group_by(state) %>%
    summarise(value = n()) %>%
    arrange(desc(value))
)
# A tibble: 53 × 2
   state value
   <chr> <int>
 1 TX     9149
 2 KS     4375
 3 OK     4092
 4 FL     3497
 5 NE     2967
 6 IA     2773
 7 IL     2682
 8 MS     2476
 9 MO     2427
10 AL     2358
# ℹ 43 more rows
# 可视化化龙卷风地点,先对数据进行分段
tornados2_st <- tornados2_st %>%
  mutate(
    color = case_when(
      value < 500 ~ "Less than 500",
      value <= 2000 ~ "1001 to 2000",
      value <= 4000 ~ "2001 to 4000",
      value > 4000 ~ "More than 4000",
      TRUE ~ "No Tornados"
    )
  )

tornados2_st$color <- fct_relevel(
  tornados2_st$color,
  c("More than 4000", "2001 to 4000", "1001 to 2000", "Less than 500")
)

plot_usmap(
  data = tornados2_st, values = "color", labels = FALSE
) +
  scale_fill_manual(
    values = c(
      "Less than 500" = "#ffdfa4",
      "1001 to 2000" = "#FFC457",
      "2001 to 2000" = "#E4683F",
      "2001 to 4000" = "#C03434",
      "More than 4000" = "#98103E",
      "No Tornadoes" = "grey70"
    )
  ) +
  theme(
    legend.position.inside = c(0.1, -0.15),
    legend.title = element_blank(),
    legend.text = element_text(size = 8),
    axis.ticks = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank()
  ) +
  # 使用 guides 方法为图例设置自定义属性
  guides(fill = guide_legend(
    label.position = "top", # 设定标签位置为顶部
    color = "#808080", # 设置图例标签的颜色
    nrow = 1, # 图例在一行中显示
    keywidth = 2, # 控制图例键的宽度
    keyheight = 0.5 # 控制图例键的高度
  ))

  • 德州的龙卷风数量最多。
  • 堪萨斯州、俄克拉荷马州、佛罗里达州和内布拉斯加州位列前五。
  • 中西部和南部出现大量龙卷风。
  • 东北部和西部地区则很少发生龙卷风。

2.3 按月统计龙卷风

mymonths <- c("Jan","Feb","Mar",
              "Apr","May","Jun",
              "Jul","Aug","Sep",
              "Oct","Nov","Dec")

month <- tornados2 %>%
  summarise(value = n(), .by = "mo") %>% 
  mutate(mo = mymonths)

ggplot(month, aes(x = factor(mo, levels = mymonths), y = value)) +
  geom_col(fill = "#98103e") +
  labs(x = "", y = "Number of Tornados") +
  theme_minimal() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_blank(),
    text = element_text(color = "black")
  )

2.4 造成死亡人数

# 总死亡人数
tornados2 %>%
  summarise(fatalities = sum(fat))
# A tibble: 1 × 1
  fatalities
       <dbl>
1       6112

1950-2021 年间,6112 人死于龙卷风。

# 每年死亡人数
yearly_tornados_fat <- tornados2 %>%
  summarise(fatalities = sum(fat), .by = "yr")

ggplot(yearly_tornados_fat, aes(x = yr, y = fatalities)) +
  geom_line(color = "#98103e") +
  geom_smooth(method = "lm", se = FALSE, alpha = 0.3, color = "grey") +
  annotate(geom = "text", x = 2011, y = 555, label = "2011, 553", color = "black", size = 3, vjust = -0.5) +
  annotate(geom = "text", x = 1953, y = 523, label = "1953, 523", color = "black", size = 3, vjust = -0.5) +
scale_x_continuous(breaks = seq(1950, 2021, 10)) +
  labs(x = "", y = "Number of Fatalities") +
  theme_minimal() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_blank(),
    text = element_text(color = "black")
  )

  • 在过去 70 年里,龙卷风造成的死亡人数趋势是不断下降的。
  • 1953年和2011 年死亡人数均超过500人。

2.5 哪些地区死亡人数最多

# 按州统计死亡人数
state_tornadoes_fat <- tornados2 %>%
  summarise(value = sum(fat), .by = "state") %>%
  arrange(desc(value))

# 绘制地图
state_tornadoes_fat <- state_tornadoes_fat %>%
  mutate(color = case_when(
    value >= 1 & value <= 10 ~ "Less than 10",
    value > 10 & value <= 50 ~ "11 - 50",
    value > 51 & value <= 200 ~ "51 - 200",
    value > 201 & value <= 400 ~ "201 - 400",
    value > 400 ~ "More than 400",
    TRUE ~ "No Fatalities"
  )) %>%
  mutate(color = fct_relevel(color, c("More than 400", "201 - 400", "11 - 50", "Less than 10", "No Fatalities")))

plot_usmap(
  data = state_tornadoes_fat, 
  values = "color", labels = FALSE) +
  scale_fill_manual(values = c(
    "Less than 10" = "#ffdfa4",
    "11 - 50" = "#FFC457",
    "51 - 200" = "#E4683F",
    "201 - 400" = "#C03434",
    "More than 400" = "#98103E",
    "No Fatalities" = "grey70"
  )) +
  labs(x = "", y = "") +
  theme(
    legend.position.inside = c(0.1, -0.15),
    legend.title = element_blank(),
    legend.text = element_text(size = 8),
    axis.ticks = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank()
  ) +
  guides(fill = guide_legend(
    label.position = "top",
    color = "#808080", nrow = 1,
    keywidth = 2, keyheight = 0.5,
  ))

  • 东南部地区死亡人数最多。
  • 八个州的龙卷风造成的死亡人数为零。
  • 马萨诸塞州的龙卷风次数较少,但死亡人数相对较多。

3 高级绘图

file1 <- "D:/Myblog/posts/ggplot2-tidytuesday-2024week32-tornados/tech/tornados1.csv"

tornados1 <- readr::read_csv(file1)

td <- tornados1 %>% 
  add_count(date) %>%  # 计算每个日期的龙卷风数量,并添加为新列
  group_by(date) %>%  # 按日期分组
  reframe(
    date, mo, n,
    m = max(mag, na.rm = TRUE),  # 计算每个日期组中最大强度(忽略NA值)
    m = ifelse(m == -Inf, NA, as.character(m))  # 如果最大强度为-Inf,则替换为NA,并转换为字符类型
    ) %>% 
  distinct()  # 去除重复行

dates <- data.frame(date = seq(min(td$date, na.rm = TRUE), max(td$date, na.rm = TRUE), 1)) %>% 
  left_join(td) %>% 
  mutate(i = row_number())

sp <- data.frame(
  t = seq(0, 2 * pi * 73, length.out = 365 *  73) + 10.5 * pi
  ) %>% 
  mutate(
    x = -t * cos(t),
    y = t * sin(t),
    i = row_number()
  ) %>% 
  left_join(dates)

mo_l <- data.frame(
  i = c(month.abb, ""),
  t = seq(0, 2 * pi, length.out = 13) + 2 * pi/3
  ) %>% 
  mutate(
    x = - 550 * cos(t),
    y = 550 * sin(t)
  )

tn_max <- sp %>% 
  slice_max(order_by = n, n = 3)

pal <- cetcolor::cet_pal(6, "r2")  
ggplot(sp, aes(x = x, y = y)) +
  ggrepel::geom_text_repel(data = tn_max, aes(x, y, label = paste0(n, " tornados\n(", format(date, "%b %d, %Y"), ")")), nudge_x = c(200, 420, 200), nudge_y = c(-100, -100, 40), alpha = 0.75, lineheight = 0.9, size = 3, segment.size = 0.4, color = "purple4") +
  geom_path(alpha = 0.05) +
  geom_point(aes(size = n, fill = m), alpha = 0.7, shape = 21, stroke = 0) +
  geom_text(data = mo_l, aes(x, y, label = i), , size = 4, color = "cornflowerblue") +
  scale_size_continuous(range = c(0.5, 5), guide = "none") +
  scale_fill_manual(values = pal, guide = guide_legend(title = "Highest magnitude\n(*F scale used until 2007, afterwards EF)", title.position = "top", label.position = "bottom", nrow = 1, override.aes = list(size = 8))) +
  scale_x_continuous(limits = c(-620, 620)) +
  coord_fixed() +
  labs(
    title = "US Tornadoes, 1950-2022",
    subtitle = str_wrap("The size of each point represents the number of tornadoes recorded per day, and its color the highest magnitude* for that day"),
    caption = "Source: Storm Prediction Center · Graphic: Georgios Karamanis"
  ) +
  theme_void() +
  theme(
    legend.position = "bottom",
    legend.title = element_text(hjust = 0.5),
    plot.background = element_rect(fill = "grey99", color = NA),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20, margin = margin(15, 0, 0, 0)),
    plot.subtitle = element_text(hjust = 0.5),
    plot.caption = element_text(hjust = 0.5, margin = margin(15, 0, 5, 0))
  )