file<-"d:/Myblog/datas/athlete_events.csv"data<-read_csv(file, col_types =cols( ID =col_character(), Name =col_character(), Sex =col_factor(levels =c("M","F")), Age =col_integer(), Height =col_double(), Weight =col_double(), Team =col_character(), NOC =col_character(), Games =col_character(), Year =col_integer(), Season =col_factor(levels =c("Summer","Winter")), City =col_character(), Sport =col_character(), Event =col_character(), Medal =col_factor(levels =c("Gold","Silver","Bronze"))))head(data)
# A tibble: 6 × 15
ID Name Sex Age Height Weight Team NOC Games Year Season City
<chr> <chr> <fct> <int> <dbl> <dbl> <chr> <chr> <chr> <int> <fct> <chr>
1 1 A Dijiang M 24 180 80 China CHN 1992… 1992 Summer Barc…
2 2 A Lamusi M 23 170 60 China CHN 2012… 2012 Summer Lond…
3 3 Gunnar N… M 24 NA NA Denm… DEN 1920… 1920 Summer Antw…
4 4 Edgar Li… M 34 NA NA Denm… DEN 1900… 1900 Summer Paris
5 5 Christin… F 21 185 82 Neth… NED 1988… 1988 Winter Calg…
6 5 Christin… F 21 185 82 Neth… NED 1988… 1988 Winter Calg…
# ℹ 3 more variables: Sport <chr>, Event <chr>, Medal <fct>
art<-data%>%filter(Sport=="Art Competitions")%>%select(Name, Sex, Age, Team, NOC, Year, City, Event, Medal)head(art)
# A tibble: 6 × 9
Name Sex Age Team NOC Year City Event Medal
<chr> <fct> <int> <chr> <chr> <int> <chr> <chr> <fct>
1 Win Valdemar Aaltonen M 54 Finland FIN 1948 London Art C… <NA>
2 Adolf Gaston Abel M 45 Germany GER 1928 Amsterdam Art C… <NA>
3 Adolf Gaston Abel M 45 Germany GER 1928 Amsterdam Art C… <NA>
4 Georges Achille-Fould F 55 France FRA 1924 Paris Art C… <NA>
5 Dsir Antoine Acket M 27 Belgium BEL 1932 Los Angeles Art C… <NA>
6 Dsir Antoine Acket M 27 Belgium BEL 1932 Los Angeles Art C… <NA>
ggplot(counts_sex, aes(x =Year, y =Athletes, group =Sex, color =Sex))+geom_point(size =2)+geom_line()+scale_color_manual(values =c("darkblue", "red"))+scale_x_continuous(breaks =seq(1896, 2016, 4))+scale_y_continuous(breaks =seq(0, 9000, 500))+labs(title ="Number of male and female Olympians over time")+theme_bw()+theme(plot.title=element_text(hjust =0.5), axis.text.x =element_text(angle =90, face ="bold", vjust =0.5))
# Count M/F Total per country per Olympicscounts_NOC<-data%>%filter(Year%in%c(1936, 1956, 1976, 1996, 2016))%>%group_by(Year, NOC, Sex)%>%summarize(Count =length(unique(ID)))%>%spread(Sex, Count)%>%# 按照Sex为key,Count为value的规则进行pivot_wider()操作。mutate(Total =sum(M, F, na.rm =T))# 计算运动员总数names(counts_NOC)[3:4]<-c("Male", "Female")# 将缺失值变为0counts_NOC$Male[is.na(counts_NOC$Male)]<-0counts_NOC$Female[is.na(counts_NOC$Female)]<-0counts_NOC$Year<-as.factor(counts_NOC$Year)# 将Year列转化为因子counts_NOC
# A tibble: 622 × 5
# Groups: Year, NOC [622]
Year NOC Male Female Total
<fct> <chr> <dbl> <dbl> <int>
1 1936 AFG 15 0 15
2 1936 ARG 50 1 51
3 1936 AUS 29 4 33
4 1936 AUT 265 27 292
5 1936 BEL 168 8 176
6 1936 BER 5 0 5
7 1936 BOL 2 0 2
8 1936 BRA 67 6 73
9 1936 BUL 33 0 33
10 1936 CAN 101 25 126
# ℹ 612 more rows
ggplot(counts_NOC, aes(x =Male, y =Female, group =Year, color =Year))+geom_point(alpha =0.5)+geom_abline(intercept =0, slope =1, linetype ="dashed")+geom_smooth(method ="lm", se =FALSE)
ggplot(medalCount, aes(x =Team, y =Count, fill =Medal))+geom_col()+coord_flip()+scale_fill_manual(values =c("gold1", "gray70", "gold4"))+ggtitle("Olympics Medal Tally")+theme(plot.title =element_text(hjust =0.5))
# 读取NOC数据noc<-read_csv("noc_regions.csv", col_types =cols( NOC =col_character(), region =col_character()))# 增加regions数据,去除缺失值dataRegions<-data%>%left_join(noc, by ="NOC")%>%filter(!is.na(region))# 将选择的三届奥运会的数据筛选出来amsterdam<-dataRegions%>%filter(Games=="1928 Summer")%>%group_by(region)%>%summarize(Amsterdam =length(unique(ID)))munich<-dataRegions%>%filter(Games=="1972 Summer")%>%group_by(region)%>%summarize(Munich =length(unique(ID)))rio<-dataRegions%>%filter(Games=="2016 Summer")%>%group_by(region)%>%summarize(Rio =length(unique(ID)))
# 建立地图world<-map_data("world")mapdat<-tibble(region =unique(world$region))# 提取国家列mapdat<-mapdat%>%left_join(amsterdam, by ="region")%>%left_join(munich, by ="region")%>%left_join(rio, by ="region")mapdat$Amsterdam[is.na(mapdat$Amsterdam)]<-0mapdat$Munich[is.na(mapdat$Munich)]<-0mapdat$Rio[is.na(mapdat$Rio)]<-0world<-left_join(world, mapdat, by ="region")# 1928ggplot(world, aes(x =long, y =lat, group =group))+geom_polygon(aes(fill =Amsterdam))+labs(title ="Amsterdam 1928", x =NULL, y =NULL)+theme(axis.ticks =element_blank(), axis.text =element_blank(), panel.background =element_rect(fill ="navy"), plot.title =element_text(hjust =0.5))+guides(fill =guide_colorbar(title ="Athletes"))+scale_fill_gradient(low ="white", high ="red")
1.6.2 1972年奥运会情况
ggplot(world, aes(x =long, y =lat, group =group))+geom_polygon(aes(fill =Munich))+labs(title ="Munich 1972", x =NULL, y =NULL)+theme(axis.ticks =element_blank(), axis.text =element_blank(), panel.background =element_rect(fill ="navy"), plot.title =element_text(hjust =0.5))+guides(fill =guide_colorbar(title ="Athletes"))+scale_fill_gradient2(low ="white", high ="red")
1.6.3 2016年奥运会情况
ggplot(world, aes(x =long, y =lat, group =group))+geom_polygon(aes(fill =Rio))+labs(title ="Rio 2016", x =NULL, y =NULL)+theme(axis.ticks =element_blank(), axis.text =element_blank(), panel.background =element_rect(fill ="navy"), plot.title =element_text(hjust =0.5))+guides(fill =guide_colorbar(title ="Athletes"))+scale_fill_gradient2(low ="white", high ="red")
data%>%group_by(Year, Sex)%>%summarize(Present =length(unique(ID[which(!is.na(Height)&!is.na(Weight))])), Total =length(unique(ID)))%>%mutate(Proportion =Present/Total)%>%ggplot(aes(x =Year, y =Proportion, group =Sex, color =Sex))+geom_point()+geom_line()+scale_color_manual(values =c("darkblue", "red"))+theme(plot.title =element_text(hjust =0.5), axis.text.x =element_text(face ="bold", angle =90))+labs(title ="Height/Weight data completeness from each Olympics")+scale_x_continuous(breaks =seq(1896, 2016, 4))
# 筛选1960年奥运会的项目events<-data[data$Year==1960, "Event"]%>%unique%>%.$Eventyears<-data$Year%>%unique%>%sort%>%tail(-1)for(iin1:length(years)){nxt<-data[data$Year==years[i], "Event"]%>%unique%>%.$Eventevents<-intersect(events, nxt)}# 按照1960年项目对之后的项目进行筛选data<-data%>%filter(Event%in%events)# get list of sports matching eventssportsEvents<-data%>%select(Sport, Event)%>%unique