Portfolios | Hantong Wang

Chinese GDP - Unbalanced Growth

Thu, 07 Apr 2022 00:00:00 +0000

这是我第二次尝试记录我的数据分析结果，这次我们把目光移回到国内，重点关注：

中国各省市和地区的GDP发展情况
对各省市的发展快慢进行排名
探索每个省市和地区的相对发展速度（与平均GDP增长率相比）
绘制GDP相对增长地图

所以本文除了涉及之前的柱状图以外，还会展示如何在R中绘制地图。

本文的所有数据大部分来源于Kaggle还有中华人民共和国民政部的开放数据，感谢它们！🙏🙏

一、各区域GDP增长情况

在本节，我们会将各个省份划分为不同的地区，看看不同地区的GDP发展情况如何。

首先清洗数据

#加载包
library(rio)
library(tidyverse)
library(MetBrewer)
library(showtext); showtext_auto()
library(ggmap)
library(mapproj)
library(sf)
library(ggrepel)
library(extrafont)
library(RColorBrewer)
library(cowplot)
library(ragg)
#导入字体
font_add_google("Calligraffitti", "call")
font_add_google("Kalam", "kalam")
font_add_google("ZCOOL KuaiLe", "cool")

#读入数据
china_map = st_read(dsn = "http://xzqh.mca.gov.cn/data/quanguo.json", stringsAsFactors = FALSE)
st_crs(china_map) <- 4326 #设置地图坐标系统为 WGS84
data1 <- import('Chinas GDP in Province En.csv') #GDP数据
line <- st_read('geojson/九段线GS（2019）1719号.geojson') #用于绘制南海9段线
#数据预处理
GDP_long <- data1 %>%
rename(Guangxi = `Guangxi,`, #广西省名称有误，修改
year = V1) %>%
pivot_longer(cols = 2:ncol(data1),names_to = 'Province', values_to = 'gdp') %>%
mutate(quhao = case_when(
Province == 'Beijing' ~ '110000',
Province == 'Tianjin' ~ '120000',
Province == 'Hebei' ~ '130000',
Province == 'Shanxi' ~ '140000',
Province == 'Inner Mongolia'~ '150000',
Province == 'Liaoning'~ '210000',
Province == 'Jilin'~ '220000',
Province == 'Heilongjiang'~ '230000',
Province == 'Shanghai'~ '310000',
Province == 'Jiangsu'~ '320000',
Province == 'Zhejiang'~ '330000',
Province == 'Anhui'~ '340000',
Province == 'Fujian'~ '350000',
Province == 'Jiangxi'~ '360000',
Province == 'Shandong'~ '370000',
Province == 'Henan'~ '410000',
Province == 'Hubei'~ '420000',
Province == 'Hunan'~ '430000',
Province == 'Guangdong'~ '440000',
Province == 'Guangxi'~ '450000',
Province == 'Hainan'~ '460000',
Province == 'Chongqing'~ '500000',
Province == 'Sichuan'~ '510000',
Province == 'Guizhou'~ '520000',
Province == 'Yunnan'~ '530000',
Province == 'Tibet'~ '540000',
Province == 'Shaanxi'~ '610000',
Province == 'Gansu'~ '620000',
Province == 'Qinghai'~ '630000',
Province == 'Ningxia'~ '640000',
Province == 'Xinjiang'~ '650000'), #为省份赋予区号，便于后续后并数据
area = case_when(substring(quhao,1,1) == '1' ~ '华北',
substring(quhao,1,1) == '2' ~ '东北',
substring(quhao,1,1) == '3' ~ '华东',
quhao %in% c('410000','420000','430000') ~ '华中',
quhao %in% c('440000','450000','460000') ~ '华南',
substring(quhao,1,1) == '5' ~ '西南',
substring(quhao,1,1) == '6' ~ '西北'))

进行可视化展示

GDP_long %>%
ggplot(aes(x=year, y = gdp))+
facet_wrap(~area,ncol = 2,strip.position = "top")+
stat_summary(aes(x = year, y = gdp, group = area), fun.y = 'mean', geom = 'line',
size = 0.6, lty = 'dotted',color = 'black')+
geom_line(aes(group = Province,color = quhao),size = 0.6, alpha = 0.7,show.legend = F)+
geom_text_repel(data = GDP_long[GDP_long$year == '2020',],aes(label = Province),
nudge_y = 1500,max.overlaps = 13, nudge_x = 1, size = 6, alpha = 1, family = 'kalam')+
theme_classic()+
scale_color_manual(values = met.brewer('Signac',length(unique(GDP_long$quhao))),guide='none')+
labs(x = 'Year', y = 'GDP')+
theme(panel.grid.major = element_line(),
strip.background = element_blank(),
strip.text = element_text(size = 18, family = 'wqy-microhei'),
strip.placement = 'outside',
axis.title = element_text(size = 30, family = 'kalam'),
axis.text.y = element_text(size = 18,family = 'kalam'),
axis.text.x = element_text(size = 14, angle = 10, family = 'kalam'),
) -> GDP_line
#ggsave('GDP/GDP_line.png',GDP_line, height = 18, width = 32, unit = 'cm', scaling = 1, bg = 'white')

各地区GDP发展时序图

我们最终得到了可视化的结果，图中黑色虚线代表了各地区的平均发展速度。从上图大致可以看出 华东、华南、华中、西南 地区的GDP上升速度是比较快的，而 华北、西北、东北 则相对缓慢。具体来看：

华东地区发展较快，以江苏、山东、浙江为代表；
华中地区发展最为均衡，三省之间差异不大；
华南地区主要由广东带动其发展，其余两省似乎难以望其项背；
北方的省市均稍显颓势，东北三省的发展均有所落后

总体来说，图上所呈现的模式大致与我们平时所了解的东南部强势、西北部弱势相吻合。

二、各省市GDP增速排行

本节将会对具体各省市的发展情况进行排名并予以展示。

数据清洗

#数据清洗
GDP_long %>%
pivot_wider(names_from = year,values_from = gdp, names_prefix = 'year',names_sort = T) %>% #将年份长转宽，使得一行观察值代表每个省市
mutate(ratio = (year2020 - year1992)/(year1992)) %>% #计算增长率
arrange(ratio) %>%
mutate(Province = fct_reorder(Province, ratio),
angle = 90 - 360*((as.numeric(Province) - 0.5)/(max(as.numeric(Province))+4)), #+4 流出空白区域
hjust = if_else(angle < -90,1,0),
angle = if_else(angle < -90,angle + 180,angle),
relative = ratio - mean(ratio)) -> GDP_wide

结果可视化 - 环形柱状图

#结果可视化
GDP_wide %>%
ggplot(aes(x= as.numeric(Province), y = ratio)) +
geom_col(aes(fill = ratio),color = '#8F999F',show.legend = F, size = 0.3)+
coord_polar() +
geom_text(aes(y = ratio+0.2,label = Province, angle = angle, hjust = hjust), vjust = 0.5, size = 7, family = 'kalam') +
scale_y_continuous(expand = c(0,0), limits = c(-10, max(GDP_wide$ratio)+10), labels = scales::percent_format()) +
xlim(c(0,nrow(GDP_wide)+4))+
geom_segment(data = data.frame(x = rep(nrow(GDP_wide)+1,3),
xend = rep(nrow(GDP_wide)+4,3),
y = c(15,30,60),
yend = c(15,30,60)),
aes(x = x, xend = xend , y =y, yend =yend), color = '#8F999F',size = 0.6) +
geom_segment(aes(x = 0, xend = nrow(GDP_wide)+4, y = 0 , yend = 0),color = '#8F999F',size = 0.3)+ #内部圆环
geom_text(data = data.frame(label = c('1,500%',"3,000%","6,000%"),
y = c(15,30,60)),
aes(x = nrow(GDP_wide)+2.5, y = y + 2, label =label),vjust = 0,
angle = 360 - 360*(nrow(GDP_wide)+2.5)/(max(as.numeric(GDP_wide$Province))+4),
size = 8, family = 'kalam',color = '#8F999F')+
theme_void()+
scale_fill_gradientn(colors=met.brewer("Tam", direction = -1)) -> GDP_cycle
#ggsave("GDP/GDP_cycle.png", GDP_cycle, width = 6, height = 6, scaling = 1, bg = 'white')

通过环形图可以看到与第一部分的结论一致，东北三省的增长速度确实是最为缓慢的，这可能与中国工业逐步转型有关。但让人惊讶的是上海的排名也位于下半区，这好像与我的理解有出入，这可能跟1992年GDP基数大小有关，西藏位于排头可能也有这方面的原因。为此，下文将进一步将增量与增速同时进行考虑，来更好地理解环形图的结果。

结果可视化 - 双轴柱状图

#绘制双轴柱状图
GDP_wide %>%
mutate(incs = year2020 - year1992) %>%
ggplot() +
geom_col(aes(x = reorder(Province, -ratio), y = -incs, fill = incs),show.legend = F) +
scale_fill_gradient(low = '#33655b', high = '#bbded6')+
new_scale_fill()+
geom_col(aes(x = reorder(Province, -ratio), y = ratio*1000,fill = ratio),show.legend = F)+
scale_fill_gradientn(colors = met.brewer('Hokusai2', direction = -1))+
scale_y_continuous(breaks = c(seq(-90000, 0, 30000), c(20000, 40000, 60000)),
labels = c("90000","60000","30000","0","","",""),
sec.axis = sec_axis(~ ./10, name = "GDP增速(%)",
breaks = seq(0, 6000,2000))) +
geom_text(aes(x = reorder(Province, -ratio), y = ratio*1000,
label = paste0(scales::comma(round(ratio*100),1.0),"%")),
hjust = 0.4, vjust = -0.5, size =6.5, family = 'call') +
geom_text(aes(x = reorder(Province, -ratio), y = -incs,
label = scales::comma(incs,1.0)),
hjust = 0.5, vjust = 1.5, size =6.5, family = 'call') +
geom_hline(aes(yintercept = 0), size = 0.6 , color = "white") +
theme_minimal()+
labs(y =" GDP增量（亿元）")+
theme(panel.grid.minor.x = element_blank(),
panel.grid.major.y = element_line(size = 0.4, color = 'darkgray'),
panel.grid.minor.y = element_line(size = 0.3, color = 'lightgray'),
axis.text.y = element_text(size = 18, family = 'kalam'),
axis.title.y.left = element_text(hjust = 0.28, size = 32, vjust = 2),
axis.title.y.right = element_text(hjust = 0.12, size = 32, vjust = 2),
axis.title.x = element_blank(),
axis.text.x = element_text(size = 18, family = 'kalam', angle = 15, hjust = 0.5)) -> compare_col
ggsave("GDP/GDP_two_axes.png", compare_col, width = 12, height = 6,bg = 'white')

各省市GDP增速与增量

将这部分的结果结合环形图来看的话，可以发现一些地区虽然增速较快的实际的增量却不是很大，尤其是西部的地区比如西藏、宁夏、青海等。江苏和广东则是增量大的同时增速也很快。而浙江属于增速快而增量较大、上海增量较大而增速相对较缓。而东北三省在考虑到增量以后仍不容乐观。因此如何进一步促进西部与北部地区的发展仍然是我们需要去解决的问题。

三、中国GDP地图

本节将继续考虑各省市的GDP相对增长速度,本文对其的定义是：相对于平均增长速度的差异。并将结果用地图的形式绘制出来。

数据清洗

#数据清洗
GDP_map <- china_map %>%
rename(quhao = QUHUADAIMA) %>%
mutate(quhao = case_when(quhao== 'daodian' ~ NAME,
TRUE ~ quhao)) %>%
select(quhao,geometry) %>%
left_join(GDP_wide, by = 'quhao') %>%
select(Province,ratio, geometry, relative) %>%
mutate(hjust = if_else(relative >= 0, 0,1),
type = case_when(relative >= 15 ~ 1,
10 <= relative & relative < 15 ~ 2,
5 <= relative & relative < 10 ~ 3,
0 <= relative & relative < 5 ~ 4,
-5 <= relative & relative < 0 ~ 5,
-10 <= relative & relative < -5 ~ 6,
-15 <= relative & relative < -10 ~ 7,
relative < -15 ~ 8),
type = factor(as.character(type)),
fill = case_when(type == "1" ~ "#FC4E2A",
type == "2" ~ "#FD8D3C",
type == "3" ~ "#FEB24C",
type == "4" ~ "#FED976",
type == "5" ~ "#C6DBEF",
type == "6" ~ "#9ECAE1",
type == "7" ~ "#6BAED6",
type == "8" ~ "#2470a0",
TRUE ~ "lightgrey"))%>%
arrange(desc(relative))

结果可视化

主要地图

#绘制主要地图
GDP_map %>%
ggplot()+
geom_sf(aes(fill = fill, geometry = geometry),show.legend = F,color = 'lightgrey', size = 0.5) +
scale_fill_identity()+
theme_void()+
theme(aspect.ratio = 0.93) -> MGDP

南海九段线

#绘制南海九段线
GDP_map %>%
ggplot()+
geom_sf(aes(fill = fill, geometry = geometry),show.legend = F,color = 'lightgrey', size = 0.5) +
scale_fill_identity()+
geom_sf(data = line) +
coord_sf(xlim = c(108,124), ylim = c(2,24))+
theme(aspect.ratio = 1.5, #调节长宽比
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
panel.grid = element_blank(), #panel - 分面或者单图的画布信息
panel.background = element_blank(),
panel.border = element_rect(fill=NA, color="grey10",linetype=1,size=0.5),
plot.margin=unit(c(0,0,0,0),"mm")) -> nine_line

相对发展柱状图

GDP_map %>%
mutate(Province = fct_reorder(Province, relative),
label = paste0(Province,"(",scales::comma(round(relative*100),1),"%",")")) %>%
distinct(Province, .keep_all = T) %>%
filter(!is.na(Province)) %>%
ggplot()+
geom_col(aes(x=Province,y = relative, fill = fill, color = fill)) +
scale_fill_identity()+
scale_color_identity()+
coord_flip()+
geom_text(aes(x = Province, y = relative+0.2 - hjust * 0.4 ,label = label, hjust = hjust), vjust = 0.5, size = 7, family = 'kalam') +
scale_y_continuous(labels = scales::percent_format(), limits = c(-27,20)) +
theme_classic()+
theme(axis.ticks= element_blank(),
axis.line = element_blank(),
axis.text = element_blank(),
axis.title = element_blank(),
plot.title = element_text(hjust = 0.5,size = 18),
plot.margin= margin(0,8,0,8,unit = 'mm')) -> relative_plot

合并图片

text = '自1992年以来各省的GDP都呈现快速的增长模式，但从此部分的可\n视化结果可以发现，各省虽都处于快速发展阶段，但省之间的相对\n发展速度存在明显差异。西藏、福建、重庆、贵州与北京是相对增\n速最快的五个省市，而东北三省以及河北与甘肃的增速相对缓慢。\n从地理分布图上也可以看出，除去GDP基数较小的西藏与内蒙古以\n外,南北间存在着明显差距。'
ggdraw(xlim = c(0.15,1),ylim = c(0,1))+
draw_plot(relative_plot,x = 0.57, y = 0.5,hjust = 0.5, vjust = 0.5, width = 0.85,height = 0.95)+
draw_plot(MGDP, x = 0.8, y = 0.20,hjust = 0.5, vjust = 0.5,width = 0.65,height = 0.65)+
draw_plot(nine_line, x = 0.95, y = 0.12,hjust = 0.5, vjust = 0.5, width = 0.15,height = 0.15)+
draw_text('1992~2020年中国各省份GDP相对增长速率',x = 0.39, y = 0.94,hjust = 0.5, vjust = 0.5, size = 54, family = 'cool')+
draw_text(text,x = 0.26, y = 0.75,hjust = 0, vjust = 0.5, size = 28, lineheight = 1, family = 'cool') -> final_plot
final_plot
ggsave('GDP/GDP_MAP.png',final_plot, height = 18, width = 32, unit = 'cm', scaling = 0.6, bg = 'white')

1992-2020GDP相对增长

正如图上所描述的：自1992年以来,近30年间各省的GDP都呈现向上增长的发展模式，但省之间的发展速度存在明显差异。西藏、福建、重庆、贵州与北京相对增速最快，而东北三省以及河北与甘肃的增速相对缓慢。从地图的分布上看，再次通过另一种形式表现出了南、北与东、西之间的发展差异。

本次的内容结束了，内容并不是很多。如果有什么建议或者问题欢迎联系我📱💌 或者给我留言~

Billboard - TOP100

Mon, 21 Mar 2022 00:00:00 +0000

**本文利用开放R包 billboard 中的数据：**

展示如何利用R以及ggplot2包进行数据分析并形成出版级别的可视化效果
展现2016年年度十大歌手及其代表作品
分析半个多世纪以来，年度Top100歌曲在某些特征上的时间变化
对比1960年与2015年的歌曲特征，揭示不同时代的受欢迎的音乐特点

文章包括分析代码、可视化结果以及相关的分析结论。

话不多说，直接进入正题

一、前期准备

#首先加载会使用到的包
library(tidyverse)
library(MetBrewer)
library(cowplot)
library(billboard)
library(extrafont)
library(cowplot)
library(showtext); showtext_auto()
library(ggcorrplot)
library(ggsci)
library(gghalves)
library(ggprism)
library(ggsignif)
library(car)
library(ggbump)
#从GoogleFonts导入字体
font_add_google("Satisfy", "satisfy")
font_add_google("Sacramento", "mento")
font_add_google("Lobster", "lobster")
font_add_google("Luckiest Guy", "luck")
font_add_google("Calligraffitti", "call")
font_add_google("Kalam", "kalam")
font_add_google("ZCOOL KuaiLe", "cool")

#读入数据
head(spotify_track_data)

## # A tibble: 6 x 23
## year artist_name artist_id explicit track_name track_id danceability energy
## <chr> <chr> <chr> <lgl> <chr> <chr> <dbl> <dbl>
## 1 1960 Percy Faith ~ 24DQLSng~ FALSE "The Them~ 2kKL5kk~ 0.466 0.389
## 2 1960 Jim Reeves 2Ev0e8GU~ FALSE "He'll Ha~ 7DH5dXV~ 0.554 0.186
## 3 1960 Johnny Prest~ 1B8n8vtE~ FALSE "Running ~ 60DbxCJ~ 0.758 0.462
## 4 1960 Mark Dinning 55Rf9Kfq~ FALSE "Teen Ang~ 7GvhZ0F~ 0.583 0.168
## 5 1960 Brenda Lee 4cPHsZM9~ FALSE "I'm Sorr~ 5VyBNCS~ 0.567 0.141
## 6 1960 Elvis Presley 43ZHCT0c~ FALSE "It's Now~ 2u8Lrq2~ 0.635 0.391
## # ... with 15 more variables: key <int>, loudness <dbl>, mode <int>,
## # speechiness <dbl>, acousticness <dbl>, instrumentalness <dbl>,
## # liveness <dbl>, valence <dbl>, tempo <dbl>, type <chr>, uri <chr>,
## # track_href <chr>, analysis_url <chr>, duration_ms <int>,
## # time_signature <int>

head(wiki_hot_100s)

## no title artist year
## 1 1 Theme from A Summer Place Percy Faith 1960
## 2 2 He'll Have to Go Jim Reeves 1960
## 3 3 Cathy's Clown The Everly Brothers 1960
## 4 4 Running Bear Johnny Preston 1960
## 5 5 Teen Angel Mark Dinning 1960
## 6 6 I'm Sorry Brenda Lee 1960

从表格中可以看到wiki_hot_100s数据反映了从1960年到2016年的年度TOP100单曲，包含四个变量：

歌曲排行：no
歌曲名称：title
歌手名称：artist
歌曲年份：year

spotify_track_data数据则反映了其中大部分歌曲在多个特征维度上的评价情况

🙌前期工作完成🙌

下面尝试对这些数据进行分析，看看能不能得到什么有趣的结果

二、正式分析

1.歌手排名

经过大致的数据浏览，我决定选取2016年的数据，以歌曲上榜数量以及歌曲平均排行为指标，决出当年的十大歌手。

#数据清洗
singer <- wiki_hot_100s %>%
mutate(year = as.numeric(year),
no = as.numeric(no)) %>%
filter(year %in% 2016 & no <= 100) %>% #选出目标数据并过滤缺失数据
group_by(artist) %>%
mutate(best = min(no),
num = n(),
no_all = sum(no)) %>% #找到每个歌手的最佳作品和上榜歌曲总数
ungroup %>%
mutate(mean_no = no_all/num) %>%
arrange(desc(num), mean_no) %>%
group_by(artist) %>%
mutate(rank = 1:n()) %>%
filter(rank == 1) %>%
select(-c(year,rank,no)) %>%
ungroup() %>%
mutate(rank = row_number())
#选择前10歌手
top_10 <- singer[1:10,] %>%
mutate( angle = 360 - rank*(360 / (max(rank)+1)),
angle = ifelse(angle < -90, angle +180, angle),
mean_no = round(mean_no)) #计算平均歌曲排名

#结果可视化
image = "https://raw.githubusercontent.com/wht9975/website/master/images/loveuself.png"
#子图1
top_10 %>%
ggplot()+
ggimage::geom_image(aes(x = 0, y = -1.5, image = image),size=0.19) +
geom_col(aes(x=rank, y=num, fill = artist),show.legend = F, width = 0.95)+
coord_polar(theta = 'x',direction = 1,start = -0.3)+
ylim(c(-1.5,5))+
xlim(c(0,11))+
theme_void()+
geom_text(aes(x = rank, y=as.numeric(num)+1,label = artist),
size=18, family = 'satisfy', fontface = 'bold', vjust = -0.1) +
geom_text(aes(x = rank, y=as.numeric(num)+1,label = paste(title,'-',best)),
size=11, family = 'call', fontface = 'bold', vjust = 1.7, color = '#716e77', hjust = 0.55) +
geom_text(aes(x = 0.25, y=2.8),label = 'The year of',
size=28, family = 'lobster', fontface = 'bold', hjust = 1)+
geom_text(data = data.frame(label=factor(c('2','0','1','6')),
y = c(12/4, 9/4, 6/4, 3/4)),
aes(x = c(0.5,0.5,0.45,0.5) , y = y , label = label),vjust = 1,hjust = 1,
size = 25, family = 'kalam', fontface = 'bold') +
geom_text(aes(x = rank, y=as.numeric(num)/2, label = rank, size =rank),
color = 'white',family = 'kalam',show.legend = F) +
scale_size_continuous(range = c(32,16)) +
scale_fill_manual(values = met.brewer('Hokusai1',n=10)) ->song_2016
#子图2
top_10 %>%
ggplot(aes(x = rank, y = 100 - mean_no, fill = artist))+
geom_col(aes(y=100),color = 'lightgrey',fill = 'white', lty = 'dashed', size = 0.8)+
geom_col(aes(color = artist),show.legend = F,size = 0.8)+
scale_fill_manual(values = met.brewer('Hokusai1',n=10))+
scale_color_manual(values = met.brewer('Hokusai1',n=10))+
geom_hline(yintercept = c(50,75),color = '#716e77',lty = 'dashed',size = 0.8)+
geom_text(aes(y = -4, label = artist),
size=14, family = 'satisfy', fontface = 'bold')+
theme_minimal()+
theme(panel.grid = element_blank(),
# axis.line.x = element_line(size = 2),
axis.title = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(family = 'kalam',size = 64, hjust = 0.5),
plot.margin = margin(8,2,1,2,unit = 'mm')) +
scale_x_continuous(expand = c(0,0))+
scale_y_continuous(breaks=c(0,50,75,100, 120), labels = c('',50,25,1, '')) -> song_col
#图片合并
ggdraw()+
draw_plot(song_2016, x = 0.025,y = 0.175,width = 0.95,height = 0.95)+
draw_plot(song_col, x = 0.125, y = 0.025,width = 0.75,height = 0.35)+
draw_plot_label(c("M","e","a","n"), x = rep((0.07),4), y = seq(0.35,0.15,length.out=4),
size = 96, family = 'kalam',hjust = 0.5)+
draw_plot_label(c("R","a","n","k"), x = rep((0.095),4), y = seq(0.32,0.15,length.out=4),
size = 84, family = 'kalam',hjust = 0.5) -> song_all
#ggsave("2016.png", song_all, width = 16, height = 9, units = 'in',bg = 'white')

通过环形图可以看到Justin Bieber凭借其Love Yourself（总排名1）以及总共三首上榜歌曲荣登2016年度榜首。排名其后的是Twenty One Pilots、Adele以及The Weekend均有三首佳作上榜。之后的几名歌手均有两首歌曲进入Top100,根据他们的平均歌曲排名（下方柱状图）确定最终排名。

2.特征关联

本部分尝试分析Spotify在评定歌曲时依据的各项指标间的关系，并通过相关图展示最后结果

#数据清洗
feather <- spotify_track_data %>%
rename(artist=artist_name,
title = track_name) %>%
group_by(year) %>%
mutate(rank = 1:n()) %>%
ungroup
feather_all <- feather %>%
select(-c(year,artist,artist_id,track_href,title,track_id,key,type,uri,analysis_url,duration_ms,time_signature,explicit)) #去除无关数据
cor = round(cor(feather_all, method = 'pearson'),2) #对特征变量间进行两两相关分析
p.mat <- cor_pmat(feather_all) #计算相关显著性

#数据可视化
ggcorrplot(cor, method = 'circle',hc.order = TRUE,
outline.color = "white",type = "lower",
lab = F, digits = 2,lab_size = 4, lab_col = 'black',
colors = c('#128b8e','#FFF5C3',"#fb3c3c"),
ggtheme = theme_minimal())+
guides(fill = guide_colorbar(direction = 'horizontal',
barwidth = unit(12,'cm'),
ticks = F)) +
theme(axis.title.x = element_blank(),
axis.title.y = element_blank(),
axis.text.x = element_text(size = 28, angle = 30,family = 'call', hjust = 0.6, vjust = 0.9),
axis.text.y = element_text(size = 28, family = 'call'),
axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
plot.margin = margin(20,3,3,3,unit = 'mm'),
legend.title = element_blank(),
legend.text = element_text(size = 20,family = 'mento'),
legend.position = ('top')) -> corplot
#ggsave("feather_cor.png", corplot,width = 20, height = 20, units = 'cm',bg = 'white',dpi = 300)

相关图中的变量各自代表的含义如下：

danceability: 反应曲目适合跳舞的程度。0代表最不适合跳舞，1代表最适合跳舞。
valence: 取值由0至1，用于描述曲目所传达的情绪效价（激动、高兴）。得分高者听起来更积极反之更消极（悲伤、痛苦）。
speechiness: 用于检测曲目中是否存在类似说话的声音。歌曲越像语音（如脱口秀、有声读物、诗歌），值越接近1。高于0.66代表基本由话语组成的曲目。介于0.33和0.66之间可能同时包含音乐和语音，（说唱音乐）。低于0.33最有可能表示音乐和其他非语音曲目。
energy: 从0到1代表感知到的强度和能量。
loudness: 整体响度（以分贝为单位）。
mode: 0,1取值，0代表小调，1代表大调。
acousticness: 0到1，值越大越有可能是原声音乐。
instrumentalness: 0到1，判断乐曲是否不包含人声，越接近1与可能是纯乐器演奏。
rank: 该歌曲在当年的Top100排名。
liveness: 通过歌曲中是否存在观众声音来判断歌曲是否是live版本。
tempo: 曲目的总体节奏（每分钟节拍数，BPM）。

通过相关热图可以发现：

歌曲效价（valence）与舞动性（danceability）、能量值（energy）与响度（loudness）以及效价（valence）与能量值（energy）之间存在较高的正相关。说明这些变量共同反映了歌曲动感、活泼的一个方面
原声性（acousticness）与能量值（energy）、响度（loudness）以及舞动性（danceability）间均有较高的负相关，这可能是由于原声音乐都是比较柔和、安静的所以与一些动感的音乐元素匹配不佳。
而歌曲的排名与其他变量间并没有特别的相关。反映了听众的音乐品味是多元化的，并没有那种特征元素特别受到大家的欢迎

3.时代变化

第三部分选取四个特征（danceability、valence、energy以及speechiness）展示从1960年以来这些特征的时代变化

#数据清洗
feather_year <- feather %>%
select(year, title, artist, danceability, valence, energy, instrumentalness,rank, speechiness) %>%
group_by(year) %>%
summarise(mean_dance = mean(danceability),
mean_valence = mean(valence),
mean_energy = mean(energy),
mean_speechiness = mean(speechiness)) %>%
pivot_longer(c(mean_dance,mean_valence,mean_energy,mean_speechiness), names_to = 'index', values_to = 'value')
#数据可视化
feather_year %>%
ggplot(aes(x = year, y = value, group = index, color = index))+
geom_point(aes(fill = index, shape = index),size =2, alpha = 0.8)+
geom_smooth(aes(fill= index), #对数据进行线性拟合
span = 0.6, size = 0.8,
method = "loess",
level=0.95,
formula = 'y~x', alpha = 0.5) +
scale_fill_manual(values = c('#2E94B9','#F0B775','#3b9a9c','#D25565'),
labels = c('Danceability','Energy','Speechiness','Valence'))+
scale_color_manual(values = c('#2E94B9','#F0B775','#3b9a9c','#D25565'),
labels = c('Danceability','Energy','Speechiness','Valence'))+
scale_shape_manual(values = c(21,22,23,24),
labels = c('Danceability','Energy','Speechiness','Valence'))+
theme_minimal()+
labs(x = '', y = "Score", title = 'Feathers of each Year')+
theme(legend.title = element_blank(),
legend.text = element_text(size = 18, family = 'call'),
legend.position = 'top',
axis.text.x = element_text(size = 18, angle = 15,family = 'call',
hjust = 0.5, vjust = 0.9),
axis.text.y = element_text(size = 24, family = 'call'),
axis.title = element_text(size = 40, family = 'kalam'),
plot.title = element_text(size = 60, family = 'kalam', hjust = 0.5)) ->feather_plot
ggsave("feather.png",feather_plot, width = 12,height = 6, units = 'in', bg = 'white')

时序图上得出一些有意思的结论：

虽然danceability,energy和valence之间存在高相关，但top100歌曲的这三个特征的时代变化却略有差异，人们对于存在高能量值的歌曲越发喜爱，但对歌曲本身效价的选择却由大多为积极的音乐转变为中性的音乐，考虑到数据的平均属性，这可能反映了近年听众开始更多聆听带有悲伤情感的音乐。
speechiness维度上的变化幅度虽然不大，但也能从一定程度上反映出从90年代开始，人们对带有说话性质的歌曲（可能以说唱歌曲为代表）的认可度有所提升。这与8、90年代说唱逐渐流行的情况也是吻合的

4.1960 V.S. 2015

由于第三部分的分析发现了歌曲特征在时代上的变化，本部分重点选取最早的1960年代的数据以及数据集中最新的2015年代的数在效价、舞动性以及能量值上进行直接的对比来验证之前的结论。

#数据清理
compare <- feather %>%
filter(year %in% c('1960','2015')) %>%
select(year, danceability, valence, energy) %>%
mutate(year = factor(year)) %>%
group_by(year) %>%
mutate(no = 1:n()) %>%
ungroup %>%
pivot_longer(c(danceability, valence, energy), names_to = 'index', values_to = 'value') %>%
mutate(dotx = case_when(year == '1960' ~ 1.15,
year == '2015' ~ 1.85),
vlinx = case_when(year == '1960' ~ 0.85,
year == '2015' ~ 2.15),
fill = case_when(year == '1960' & index == 'danceability' ~ '#35999D',
year == '1960' & index == 'energy' ~ '#D0938B',
year == '1960' & index == 'valence' ~ '#D2B577',
year == '2015' & index == 'danceability' ~ '#356CA2',
year == '2015' & index == 'energy' ~ '#BA5555',
year == '2015' & index == 'valence' ~ '#BB683F'),
value = as.numeric(value))

#分别对三部分数据进行方差齐性检验并根据结果进行随后的差异t检验
#dance
data_dance <- compare[compare$fill == '#35999D'| compare$fill == '#356CA2',]
leveneTest(data_dance$value,factor(data_dance$fill),center=mean)

## Levene's Test for Homogeneity of Variance (center = mean)
## Df F value Pr(>F)
## group 1 0.3261 0.5687
## 186

#energy
data_energy <- compare[compare$fill == '#D0938B'| compare$fill == '#BA5555',]
leveneTest(data_energy$value,factor(data_energy$fill),center=mean)

## Levene's Test for Homogeneity of Variance (center = mean)
## Df F value Pr(>F)
## group 1 6.6239 0.01084 *
## 186
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#valence
data_valence <- compare[compare$fill == '#D2B577'| compare$fill == '#BB683F',]
leveneTest(data_valence$value,factor(data_valence$fill),center=mean)

## Levene's Test for Homogeneity of Variance (center = mean)
## Df F value Pr(>F)
## group 1 3.1566 0.07725 .
## 186
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

t.test(value ~ fill,data_dance, paired = F, var.equal = F)

##
## Welch Two Sample t-test
##
## data: value by fill
## t = 5.1498, df = 185.72, p-value = 6.615e-07
## alternative hypothesis: true difference in means between group #356CA2 and group #35999D is not equal to 0
## 95 percent confidence interval:
## 0.06701662 0.15024701
## sample estimates:
## mean in group #356CA2 mean in group #35999D
## 0.6584318 0.5498000

t.test(value ~ fill,data_energy, paired = F, var.equal = T)

##
## Two Sample t-test
##
## data: value by fill
## t = 10.39, df = 186, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group #BA5555 and group #D0938B is not equal to 0
## 95 percent confidence interval:
## 0.1997677 0.2934115
## sample estimates:
## mean in group #BA5555 mean in group #D0938B
## 0.7039886 0.4573990

t.test(value ~ fill,data_valence, paired = F, var.equal = F)

##
## Welch Two Sample t-test
##
## data: value by fill
## t = -3.8158, df = 185.98, p-value = 0.0001847
## alternative hypothesis: true difference in means between group #BB683F and group #D2B577 is not equal to 0
## 95 percent confidence interval:
## -0.20338511 -0.06475407
## sample estimates:
## mean in group #BB683F mean in group #D2B577
## 0.5388534 0.6729230

# 根据统计结果进行可视化展示
ggplot(compare, aes(y=value))+
geom_line(aes(x = dotx, group = no,fill = fill),
position = position_dodge(0.2),alpha = 0.35, color = 'grey')+
geom_point(aes(x = dotx, group = no,fill = fill),color = 'white',
position = position_dodge(0.2), shape = 21, size =2,alpha = 0.6)+
geom_half_violin(data = compare[compare$year == '1960',],
aes(x = vlinx,fill = fill),
side = 'l',size = 0.3, trim = T,alpha = 0.8,color = 'white')+
geom_half_violin(data = compare[compare$year == '2015',],
aes(x = vlinx,fill = fill),side = 'r',
size = 0.3, trim = T,alpha = 0.8,color = 'white')+
geom_boxplot(aes(x = vlinx, group = year), color = 'white', fill = '#B9C8E3',
width = 0.2, size = 0.2, outlier.shape = NA,alpha = 0.5)+
stat_summary(aes(x = vlinx, color = fill), fun.data = "mean_cl_normal",
fun.args = list(conf.int = .95), geom = 'errorbar',
width = 0.08,size = 0.5)+
geom_signif(data = data.frame(index = c("danceability","energy",'valence')),
aes(annotations=rep("***", 3),
y_position=rep(1.05, 3), xmin=rep(0.8, 3), xmax=rep(2.2, 3)),
tip_length=0.02, manual = T,textsize = 8,vjust = 0.5) +
facet_grid(~index)+
scale_fill_identity()+
scale_color_identity()+
theme_prism(base_line_size =0.5)+
scale_x_continuous(expand = c(0,0),breaks = c(1,2),
labels = c("1960", "2015"),limits = c(0,3))+
labs(x = 'Year', y= 'Score', title = '1960 v.s. 2015')+
theme(plot.margin = margin(2,2,2,2,unit = 'mm'),
panel.grid.minor = element_line(size = 0.4,color = "#e5e5e5"),
panel.grid.major = element_line(size = 0.4,color = "#e5e5e5"),
panel.spacing = unit(0,"lines"),
axis.ticks = element_blank(),
plot.title = element_text(family = 'kalam', size =48),
axis.title = element_text(family = 'kalam', size =24),
axis.text = element_text(family = 'mento', size =24),
strip.text = element_text(family = 'kalam', size =32,vjust = 1)) -> compare_plot
ggsave('wind&rain.png',compare_plot,width=16,height = 8)

结果反应1960与2015间Top100歌曲在效价、能量值与舞动性均存在统计学意义上的显著性差异，总体变化趋势与第三部分的结果一致。

👋👋 以上就是本文的全部内容了，如果有什么建议欢迎联系我📱💌

购物App原型

Sun, 20 Mar 2022 00:00:00 +0000

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Duis posuere tellus ac convallis placerat. Proin tincidunt magna sed ex sollicitudin condimentum. Sed ac faucibus dolor, scelerisque sollicitudin nisi. Cras purus urna, suscipit quis sapien eu, pulvinar tempor diam. Quisque risus orci, mollis id ante sit amet, gravida egestas nisl. Sed ac tempus magna. Proin in dui enim. Donec condimentum, sem id dapibus fringilla, tellus enim condimentum arcu, nec volutpat est felis vel metus. Vestibulum sit amet erat at nulla eleifend gravida.

Nullam vel molestie justo. Curabitur vitae efficitur leo. In hac habitasse platea dictumst. Sed pulvinar mauris dui, eget varius purus congue ac. Nulla euismod, lorem vel elementum dapibus, nunc justo porta mi, sed tempus est est vel tellus. Nam et enim eleifend, laoreet sem sit amet, elementum sem. Morbi ut leo congue, maximus velit ut, finibus arcu. In et libero cursus, rutrum risus non, molestie leo. Nullam congue quam et volutpat malesuada. Sed risus tortor, pulvinar et dictum nec, sodales non mi. Phasellus lacinia commodo laoreet. Nam mollis, erat in feugiat consectetur, purus eros egestas tellus, in auctor urna odio at nibh. Mauris imperdiet nisi ac magna convallis, at rhoncus ligula cursus.

Cras aliquam rhoncus ipsum, in hendrerit nunc mattis vitae. Duis vitae efficitur metus, ac tempus leo. Cras nec fringilla lacus. Quisque sit amet risus at ipsum pharetra commodo. Sed aliquam mauris at consequat eleifend. Praesent porta, augue sed viverra bibendum, neque ante euismod ante, in vehicula justo lorem ac eros. Suspendisse augue libero, venenatis eget tincidunt ut, malesuada at lorem. Donec vitae bibendum arcu. Aenean maximus nulla non pretium iaculis. Quisque imperdiet, nulla in pulvinar aliquet, velit quam ultrices quam, sit amet fringilla leo sem vel nunc. Mauris in lacinia lacus.

Suspendisse a tincidunt lacus. Curabitur at urna sagittis, dictum ante sit amet, euismod magna. Sed rutrum massa id tortor commodo, vitae elementum turpis tempus. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aenean purus turpis, venenatis a ullamcorper nec, tincidunt et massa. Integer posuere quam rutrum arcu vehicula imperdiet. Mauris ullamcorper quam vitae purus congue, quis euismod magna eleifend. Vestibulum semper vel augue eget tincidunt. Fusce eget justo sodales, dapibus odio eu, ultrices lorem. Duis condimentum lorem id eros commodo, in facilisis mauris scelerisque. Morbi sed auctor leo. Nullam volutpat a lacus quis pharetra. Nulla congue rutrum magna a ornare.

Aliquam in turpis accumsan, malesuada nibh ut, hendrerit justo. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Quisque sed erat nec justo posuere suscipit. Donec ut efficitur arcu, in malesuada neque. Nunc dignissim nisl massa, id vulputate nunc pretium nec. Quisque eget urna in risus suscipit ultricies. Pellentesque odio odio, tincidunt in eleifend sed, posuere a diam. Nam gravida nisl convallis semper elementum. Morbi vitae felis faucibus, vulputate orci placerat, aliquet nisi. Aliquam erat volutpat. Maecenas sagittis pulvinar purus, sed porta quam laoreet at.