# Load libraries and settings here
library(tidyverse)
library(here)
library(ggplot2)
library(fcuk)
library(readxl)
library(janitor)
library(here)
library(ggplot2)
library(dplyr)
library(hrbrthemes)
library(mapproj)
library(rgeos)
library(knitr)
library(viridis)
library(janitor)
library(maps)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(rnaturalearthhires)
library(lubridate)
library(patchwork)
# Chunk defaults applied to every knitr code chunk in the document.
knitr::opts_chunk$set(
  warning = FALSE,     # do not show warnings in the rendered output
  message = FALSE,     # do not show messages in the rendered output
  comment = "#>",      # prefix for printed output lines
  fig.path = "figs/",  # directory that rendered figures are written to
  fig.width = 7.252,   # default figure width
  fig.height = 4,      # default figure height
  fig.retina = 3       # retina scaling factor for sharper figures
)

# Session-wide ggplot2 theme; individual plots may still override it.
theme_set(theme_bw(base_size = 20))

# Write code below here to load any data used in project
# Read the income table; the first three lines of the file are skipped.
income1 <- read_csv("data_raw/2019tbl.csv", skip = 3)

# Reshape to one row per (State, Time), express Dollars in thousands, and lump
# every state other than District of Columbia and Mississippi into "Other"
# before averaging per group.
income_three_years <- income1 %>%
  filter(GeoName != "United States") %>%
  select(-GeoFips) %>%
  rename(State = GeoName) %>%
  pivot_longer(
    cols = -State,
    names_to = "Time",
    values_to = "Dollars"
  ) %>%
  mutate(
    Dollars = Dollars / 1000,
    State = fct_other(State, keep = c("District of Columbia", "Mississippi"))
  ) %>%
  group_by(State, Time) %>%
  summarise(mean = mean(Dollars), .groups = "drop")

# Line chart of mean income over time. `theme_minimal()` is a complete theme,
# so the `theme(legend.position = "none")` that used to precede it had no
# effect; the legend is kept and titled through the colour aesthetic
# (`legend` is not an aesthetic that `labs()` recognises).
income_three_years %>%
  ggplot(aes(x = Time, y = mean, group = State, color = State)) +
  geom_point() +
  geom_line() +
  theme_minimal() +
  scale_color_manual(values = c("#8b0000", "#00008b", "dark grey")) +
  labs(
    y = "Dollars (thousand)",
    title = "Income had Increased Steadily even in Pandemic Situation",
    subtitle = "Whether highest income state,District of Columbia,\nor lowest income state, Mississippi, they all increased",
    color = "State"
  )

# file.path() builds a separator-safe path; the hard-coded "\\" only worked
# on Windows.
write.csv(income_three_years, file.path("data_processed", "income_three_years.csv"))
# Read the state income table; the first three lines of the file are skipped.
income <- read_csv("data_raw/us_income/Table.csv", skip = 3)

# GeoName values that are dropped before the analysis: the national total,
# the regional aggregates, and Rhode Island (excluded throughout this script).
excluded_geo_names <- c(
  "United States", "Rhode Island", "New England", "Mideast", "Great Lakes",
  "Plains", "Southeast", "Southwest", "Rocky Mountain", "Far West"
)

# Keep LineCode 3 rows, reshape to long form, express Dollars in thousands,
# and lump every state other than District of Columbia and Mississippi into
# "Other" before averaging per (State, Time).
income_two_states <- income %>%
  filter(LineCode == 3) %>%
  # `%in%` tests set membership for each row. The previous
  # `GeoName != c(...)` recycled the vector along the column and only removed
  # rows that happened to line up with the matching position.
  filter(!is.na(GeoName), !GeoName %in% excluded_geo_names) %>%
  select(-c(GeoFips, Description, LineCode)) %>%
  rename(State = GeoName) %>%
  pivot_longer(
    cols = -State,
    names_to = "Time",
    values_to = "Dollars"
  ) %>%
  mutate(
    Dollars = Dollars / 1000,
    State = fct_other(State, keep = c("District of Columbia", "Mississippi"))
  ) %>%
  group_by(State, Time) %>%
  summarise(mean = mean(Dollars), .groups = "drop")

# file.path() builds a separator-safe path; the hard-coded "\\" only worked
# on Windows.
write.csv(income_two_states, file.path("data_processed", "income_two_states.csv"))

# `theme_minimal()` is a complete theme, so the `theme(legend.position =
# "none")` that used to precede it had no effect; the legend is kept and
# titled through the colour aesthetic.
income_two_states %>%
  ggplot(aes(x = Time, y = mean, group = State, color = State)) +
  geom_point() +
  geom_line() +
  theme_minimal() +
  scale_color_manual(values = c("#8b0000", "#00008b", "dark grey")) +
  labs(
    y = "Dollars (thousand)",
    title = "Steady increase in income during the pandemic",
    color = "State"
  )
    
    
# Load the Our World in Data COVID table and keep only the USA rows.
covid_timeline <- read_csv("data_raw/owid-covid-data.csv")

covid_timeline_filtered <- covid_timeline %>%
  filter(iso_code == "USA") %>%
  mutate(
    date = ymd(date),
    new_cases = new_cases / 100 # cases are plotted in units of 100
  )

# file.path() builds a separator-safe path; the hard-coded "\\" only worked
# on Windows.
write.csv(covid_timeline_filtered, file.path("data_processed", "covid_timeline_filtered.csv"))

# The dates must be wrapped in c(): passed as separate arguments, everything
# after the first string is interpreted as `format`/`tryFormats` by as.Date().
d <- data.frame(
  date = as.Date(c(
    "2020-08-15", "2020-04-03", "2021-03-13",
    "2021-05-20", "2021-08-23", "2021-11-15"
  ))
)

# Event dates marked on the timeline, with their labels in the same order.
covid_event_dates <- as.Date(c("2020-04-03", "2021-06-04", "2020-12-14", "2021-12-01"))
covid_event_labels <- c("Mask", "Delta", "First Vaccine", "Omicron")

# New cases (in units of 100) over time with dashed event markers.
# geom_vline() draws each marker once and spans the panel; the previous
# constant geom_segment() layers inherited the plot data and were redrawn for
# every row. Labels sit three days to the left of each marker, right-aligned.
plot1 <- ggplot(covid_timeline_filtered) +
  geom_line(aes(x = date, y = new_cases), color = "#E51837", linewidth = 0.5) +
  scale_x_date(
    date_breaks = "3 months",
    date_labels = "%Y/%b"
  ) +
  labs(
    title = "Covid Timeline",
    x = "",
    y = "Cases(100)"
  ) +
  geom_vline(xintercept = covid_event_dates, color = "black", linetype = "dashed") +
  annotate(
    "text",
    x = covid_event_dates - 3, y = 5000,
    color = "grey50", hjust = 1, label = covid_event_labels
  )
    

# Event dates marked on the timeline, with their labels in the same order.
covid_event_dates <- as.Date(c("2020-04-03", "2021-06-04", "2020-12-14", "2021-12-01"))
covid_event_labels <- c("Mask", "Delta", "First Vaccine", "Omicron")

# New deaths over time with the same dashed event markers as the cases panel.
# geom_vline() spans the panel without extending the y axis; the previous
# segments ended at y = 14000 and forced the axis up to that value regardless
# of the data.
plot2 <- ggplot(covid_timeline_filtered) +
  geom_line(aes(x = date, y = new_deaths), color = "blue", linewidth = 0.5) +
  scale_x_date(
    date_breaks = "3 months",
    date_labels = "%Y/%b"
  ) +
  labs(
    x = "Time",
    y = "Deaths"
  ) +
  geom_vline(xintercept = covid_event_dates, color = "black", linetype = "dashed") +
  annotate(
    "text",
    x = covid_event_dates - 3, y = 3500,
    color = "grey50", hjust = 1, label = covid_event_labels
  )

# Stack the cases panel above the deaths panel (patchwork `/` operator).
plot1 / plot2
# GeoName values that are dropped before the analysis: the national total,
# the regional aggregates, and Rhode Island (excluded throughout this script).
excluded_geo_names <- c(
  "United States", "Rhode Island", "New England", "Mideast", "Great Lakes",
  "Plains", "Southeast", "Southwest", "Rocky Mountain", "Far West"
)

# Mean income (thousand dollars) per state over all time columns, sorted from
# highest to lowest.
income_per_capita <- income %>%
  filter(LineCode == 3) %>%
  # `%in%` tests set membership for each row. The previous
  # `GeoName != c(...)` recycled the vector along the column, so most of the
  # aggregate rows were not removed.
  filter(!is.na(GeoName), !GeoName %in% excluded_geo_names) %>%
  # Strip the footnote marker from the two state names that carry one.
  mutate(GeoName = fct_recode(GeoName, "Hawaii" = "Hawaii *", "Alaska" = "Alaska *")) %>%
  select(-c(GeoFips, Description, LineCode)) %>%
  rename(State = GeoName) %>%
  pivot_longer(
    cols = -State,
    names_to = "Time",
    values_to = "Dollars"
  ) %>%
  # Order the factor levels by Dollars so the plot's y axis is ranked.
  mutate(
    State = fct_reorder(State, Dollars),
    Dollars = Dollars / 1000
  ) %>%
  group_by(State) %>%
  summarise(mean = mean(Dollars)) %>%
  arrange(desc(mean))

# Lollipop chart of mean income per state; the dashed red line marks the mean
# across states.
ggplot(income_per_capita) +
  geom_segment(aes(x = 0, xend = mean, y = State, yend = State), color = "gray") +
  geom_point(aes(x = mean, y = State), color = "Steelblue") +
  geom_vline(
    xintercept = mean(income_per_capita$mean),
    color = "red", linetype = "dashed"
  ) +
  annotate("text", x = 62, y = "Michigan", color = "red", hjust = 0, label = "mean") +
  scale_x_continuous(expand = expansion(mult = c(0, 0.05))) +
  theme_minimal() +
  labs(
    x = "Income(1000 dollars)",
    y = "State",
    title = "The ranking of State income"
  )

# file.path() builds a separator-safe path; the hard-coded "\\" only worked
# on Windows.
write.csv(income_per_capita, file.path("data_processed", "income_per_capita.csv"))
# Read every daily report in data_raw/us_covid_neverend, keep only the tables
# that have exactly 21 columns, and stack them into a single data frame.
files <- list.files(here::here("data_raw", "us_covid_neverend"))
paths <- here::here("data_raw", "us_covid_neverend", files)

# lapply() over the paths handles an empty folder gracefully, whereas
# `1:length(files)` would iterate over c(1, 0) and try to read a missing path.
data <- suppressMessages(lapply(paths, read_csv))
data <- Filter(function(report) ncol(report) == 21, data)

# Fail early with a clear message instead of producing NULL below.
if (length(data) == 0) {
  stop(
    "No 21-column daily report found in data_raw/us_covid_neverend.",
    call. = FALSE
  )
}

daily_covid_state <- do.call(rbind, data)

# Province_State values removed from the state-level analysis.
excluded_provinces <- c(
  "American Samoa", "Diamond Princess", "Rhode Island", "Grand Princess",
  "Guam", "Puerto Rico", "Northern Mariana Islands", "Virgin Islands"
)

# Daily state-level records restricted to the columns used later, with
# numeric Deaths / Case_Fatality_Ratio and a Date-typed Date column.
daily_covid_state_filtered <- daily_covid_state %>%
  select(Province_State, Date, Confirmed, Deaths, Case_Fatality_Ratio) %>%
  filter(!is.na(Province_State), !Province_State %in% excluded_provinces) %>%
  mutate(
    Deaths = as.numeric(Deaths),
    Case_Fatality_Ratio = as.numeric(Case_Fatality_Ratio),
    Date = as.Date(Date)
  ) %>%
  # Drop missing values after conversion so entries that fail to parse as
  # numbers are removed too (same order as the state_fatality_ratio pipeline).
  filter(!is.na(Deaths), !is.na(Case_Fatality_Ratio))

# file.path() builds a separator-safe path; the hard-coded "\\" only worked
# on Windows.
write.csv(daily_covid_state_filtered, file.path("data_processed", "daily_covid_state_filtered.csv"))
# Province_State values removed from the state-level analysis.
excluded_provinces <- c(
  "American Samoa", "Diamond Princess", "Rhode Island", "Grand Princess",
  "Guam", "Puerto Rico", "Northern Mariana Islands", "Virgin Islands"
)

# Mean Case_Fatality_Ratio per state, sorted from highest to lowest, with the
# state factor ordered by that mean for plotting.
state_fatality_ratio <- daily_covid_state %>%
  select(Province_State, Date, Confirmed, Deaths, Case_Fatality_Ratio) %>%
  filter(!is.na(Province_State), !Province_State %in% excluded_provinces) %>%
  mutate(Case_Fatality_Ratio = as.numeric(Case_Fatality_Ratio)) %>%
  filter(!is.na(Case_Fatality_Ratio)) %>%
  group_by(Province_State) %>%
  summarise(Case_Fatality_Ratio = mean(Case_Fatality_Ratio)) %>%
  arrange(desc(Case_Fatality_Ratio)) %>%
  mutate(Province_State = fct_reorder(Province_State, Case_Fatality_Ratio))

# Lollipop chart of the mean ratio per state; the dashed red line marks the
# mean across states.
ggplot(state_fatality_ratio) +
  geom_segment(
    aes(x = 0, xend = Case_Fatality_Ratio, y = Province_State, yend = Province_State),
    color = "gray"
  ) +
  geom_point(aes(x = Case_Fatality_Ratio, y = Province_State), color = "Steelblue") +
  geom_vline(
    xintercept = mean(state_fatality_ratio$Case_Fatality_Ratio),
    color = "red", linetype = "dashed"
  ) +
  annotate("text", x = 1.45, y = "Washington", color = "red", hjust = 0, label = "mean") +
  scale_x_continuous(expand = expansion(mult = c(0, 0.05))) +
  theme_minimal() +
  labs(
    x = "Death Ratio",
    y = "State",
    title = "The ranking of Covid Death Ratio in States"
  )

# file.path() builds a separator-safe path; the hard-coded "\\" only worked
# on Windows.
write.csv(state_fatality_ratio, file.path("data_processed", "state_fatality_ratio.csv"))
# GeoName values that are dropped before ranking: the national total, the
# regional aggregates, and Rhode Island (excluded throughout this script).
excluded_geo_names <- c(
  "United States", "Rhode Island", "Rocky Mountain", "Plains", "Far West",
  "Mideast", "Great Lakes", "Southwest", "Southeast", "New England"
)

# Income ranking: one row per state, ordered by ascending mean income, with
# Sequence = 1 for the lowest mean.
A <- income %>%
  # Strip the footnote marker from the two state names that carry one.
  mutate(GeoName = fct_recode(GeoName, "Hawaii" = "Hawaii *", "Alaska" = "Alaska *")) %>%
  filter(LineCode == 3) %>%
  filter(!is.na(GeoName), !GeoName %in% excluded_geo_names) %>%
  select(-c(GeoFips, Description, LineCode)) %>%
  rename(State = GeoName) %>%
  pivot_longer(
    cols = -State,
    names_to = "Time",
    values_to = "Dollars"
  ) %>%
  mutate(
    State = fct_reorder(State, Dollars),
    Dollars = Dollars / 1000
  ) %>%
  group_by(State) %>%
  summarise(mean = mean(Dollars)) %>%
  arrange(mean) %>%
  # row_number() follows the actual number of rows; the previous hard-coded
  # c(1:50) errored whenever the row count was not exactly 50.
  mutate(Type = "Income", Sequence = row_number()) %>%
  select(State, Type, Sequence)

# file.path() builds a separator-safe path; the hard-coded "\\" only worked
# on Windows.
write.csv(A, file.path("data_processed", "income_sequence.csv"))

# Province_State values removed from the state-level analysis.
excluded_provinces <- c(
  "American Samoa", "Diamond Princess", "Rhode Island", "Grand Princess",
  "Guam", "Puerto Rico", "Northern Mariana Islands", "Virgin Islands"
)

# Death-ratio ranking: one row per state, ordered by ascending mean
# Case_Fatality_Ratio, with Sequence = 1 for the lowest mean.
B <- daily_covid_state %>%
  select(Province_State, Date, Confirmed, Deaths, Case_Fatality_Ratio) %>%
  filter(!is.na(Province_State), !Province_State %in% excluded_provinces) %>%
  mutate(Case_Fatality_Ratio = as.numeric(Case_Fatality_Ratio)) %>%
  filter(!is.na(Case_Fatality_Ratio)) %>%
  group_by(Province_State) %>%
  summarise(Case_Fatality_Ratio = mean(Case_Fatality_Ratio)) %>%
  arrange(Case_Fatality_Ratio) %>%
  # The former `Case_Fatality_Ratio * 50` step was removed: the column is
  # dropped by select() below, so the scaling never reached the output.
  mutate(
    Province_State = fct_reorder(Province_State, Case_Fatality_Ratio),
    Type = "Death Ratio",
    # row_number() follows the actual number of rows; the previous hard-coded
    # c(1:50) errored whenever the row count was not exactly 50.
    Sequence = row_number()
  ) %>%
  rename(State = Province_State) %>%
  select(State, Type, Sequence)

# file.path() builds a separator-safe path; the hard-coded "\\" only worked
# on Windows.
write.csv(B, file.path("data_processed", "ratio_sequence.csv"))


# Hand-maintained colour key per state (stored as text because it is used as
# a discrete colour aesthetic). States missing from this table get NA.
# "New Hampshire" was previously misspelled "New Hampsire" and therefore
# never matched.
state_line_color <- c(
  "District of Columbia" = "50", "Massachusetts" = "49", "Connecticut" = "48",
  "New Jersey" = "47", "New York" = "46", "California" = "45",
  "Washington" = "44", "New Hampshire" = "43", "Colorado" = "42",
  "Wyoming" = "41", "Maryland" = "40", "Illinois" = "39", "Virginia" = "38",
  "Alaska" = "37", "Minnesota" = "36", "North Dakota" = "35",
  "South Dakota" = "34", "Pennsylvania" = "33", "Vermont" = "32",
  "Florida" = "31", "Nebraska" = "30", "Oregon" = "29", "Hawaii" = "28",
  "Delaware" = "27", "Texas" = "26", "Wisconsin" = "25", "Nevada" = "24",
  "Kansas" = "23", "Maine" = "22", "Iowa" = "21", "Montana" = "20",
  "Ohio" = "19", "Michigan" = "18", "Tennessee" = "17", "Indiana" = "16",
  "North Carolina" = "15", "Utah" = "14", "Georgia" = "13", "Arizona" = "12",
  "Missouri" = "11", "Louisiana" = "10", "Oklahoma" = "9", "Idaho" = "8",
  "South Carolina" = "7", "Kentucky" = "6", "Arkansas" = "5",
  "New Mexico" = "4", "Alabama" = "3", "West Virginia" = "2",
  "Mississippi" = "1"
)

# Combine the income ranking (A) and the death-ratio ranking (B) into one long
# table for the slope charts. The join keys are spelled out; they are the
# three columns both tables share.
C <- A %>%
  full_join(B, by = c("State", "Type", "Sequence")) %>%
  mutate(State = fct_recode(State, "Hawaii" = "Hawaii *", "Alaska" = "Alaska *")) %>%
  mutate(
    lineColor = unname(state_line_color[as.character(State)]),
    label = paste(State, "(", Sequence, ")"),
    # Only one side of the slope chart is labelled per row.
    label_left = ifelse(Type == "Death Ratio", label, NA),
    label_right = ifelse(Type == "Income", label, NA)
  )

# file.path() builds a separator-safe path; the hard-coded "\\" only worked
# on Windows.
write.csv(C, file.path("data_processed", "join_sequences.csv"))
    

library(ggrepel)

# Slope chart highlighting the states whose lineColor is "46" to "50" in red.
# Every other line falls back to `na.value`: unnamed entries in a partly
# named `values` vector are never matched to a level, so the intended
# "gray80" has to be supplied this way.
red <- ggplot(C, aes(x = Type, y = Sequence, group = State)) +
  geom_line(aes(color = lineColor), linewidth = 0.6) +
  geom_text_repel(aes(label = label_left), hjust = 1, nudge_x = -0.05) +
  geom_text_repel(aes(label = label_right), hjust = 0, nudge_x = 0.05) +
  scale_color_manual(
    values = c(
      "50" = "red", "49" = "red", "48" = "red", "47" = "red", "46" = "red"
    ),
    na.value = "gray80"
  ) +
  theme(legend.position = "none") +
  scale_x_discrete(position = "top") +
  labs(
    x = " ",
    y = "Ranking",
    title = "Comparing Ranking of Death Ratio and Income by States",
    subtitle = "5 Highest Income States had relatively high covid death ratio"
  )
red
# Slope chart highlighting a hand-picked set of lineColor keys in green.
# Every other line falls back to `na.value`: unnamed entries in a partly
# named `values` vector are never matched to a level, so the intended "grey"
# has to be supplied this way. `linewidth` replaces the deprecated `size`
# for lines.
green_keys <- c(
  "45", "44", "43", "42", "41", "37", "36", "35", "32", "30", "29", "28", "25"
)

green <- ggplot(C, aes(x = Type, y = Sequence, group = State)) +
  geom_line(aes(color = lineColor), linewidth = 0.7) +
  geom_text_repel(aes(label = label_left), hjust = 1, nudge_x = -0.05) +
  geom_text_repel(aes(label = label_right), hjust = 0, nudge_x = 0.05) +
  scale_color_manual(
    values = setNames(rep("green", length(green_keys)), green_keys),
    na.value = "grey"
  ) +
  theme(legend.position = "none") +
  scale_x_discrete(position = "top") +
  labs(
    x = " ",
    y = "",
    subtitle = "Middle Income States had relatively lower covid death ratio"
  )
green
# Slope chart highlighting a hand-picked set of lineColor keys in blue.
# Every other line falls back to `na.value`: unnamed entries in a partly
# named `values` vector are never matched to a level, so the intended "gray"
# has to be supplied this way. `linewidth` replaces the deprecated `size`
# for lines.
blue_keys <- c("1", "2", "3", "4", "5", "7", "10", "12", "13", "16", "18", "19")

blue <- ggplot(C, aes(x = Type, y = Sequence, group = State)) +
  geom_line(aes(color = lineColor), linewidth = 0.7) +
  geom_text_repel(aes(label = label_left), hjust = 1, nudge_x = -0.05) +
  geom_text_repel(aes(label = label_right), hjust = 0, nudge_x = 0.05) +
  scale_color_manual(
    values = setNames(rep("blue", length(blue_keys)), blue_keys),
    na.value = "gray"
  ) +
  theme(legend.position = "none") +
  scale_x_discrete(position = "top") +
  labs(
    x = "",
    y = "",
    subtitle = "5 Lowest Income States had relatively high covid death ratio"
  )
blue

# Place the three slope charts side by side (patchwork `+` operator).
red + green + blue
# Province_State values removed from the state-level analysis.
excluded_provinces <- c(
  "American Samoa", "Diamond Princess", "Rhode Island", "Grand Princess",
  "Guam", "Puerto Rico", "Northern Mariana Islands", "Virgin Islands"
)

# Mean Case_Fatality_Ratio per state (columns: State, mean_ratio).
df1 <- daily_covid_state %>%
  # Coerce before filtering, as the other fatality-ratio pipelines in this
  # script do, so values that cannot be parsed as numbers are dropped as well.
  mutate(Case_Fatality_Ratio = as.numeric(Case_Fatality_Ratio)) %>%
  filter(!is.na(Case_Fatality_Ratio)) %>%
  filter(!is.na(Province_State), !Province_State %in% excluded_provinces) %>%
  rename(State = Province_State) %>%
  select(Case_Fatality_Ratio, State) %>%
  group_by(State) %>%
  summarize(mean_ratio = mean(Case_Fatality_Ratio))

# file.path() builds a separator-safe path; the hard-coded "\\" only worked
# on Windows.
write.csv(df1, file.path("data_processed", "mean_ratio_state.csv"))

# GeoName values that are dropped before averaging: the national total, the
# regional aggregates, and Rhode Island (excluded throughout this script).
excluded_geo_names <- c(
  "United States", "Rhode Island", "Rocky Mountain", "Plains", "Far West",
  "Mideast", "Great Lakes", "Southwest", "Southeast", "New England"
)

# Mean income per state in units of 100 dollars, sorted ascending
# (columns: State, mean).
income2 <- income %>%
  # Strip the footnote marker from the two state names that carry one.
  mutate(GeoName = fct_recode(GeoName, "Hawaii" = "Hawaii *", "Alaska" = "Alaska *")) %>%
  filter(LineCode == 3) %>%
  filter(!is.na(GeoName), !GeoName %in% excluded_geo_names) %>%
  select(-c(GeoFips, Description, LineCode)) %>%
  rename(State = GeoName) %>%
  pivot_longer(
    cols = -State,
    names_to = "Time",
    values_to = "Dollars"
  ) %>%
  mutate(State = fct_reorder(State, Dollars)) %>%
  group_by(State) %>%
  summarise(mean = mean(Dollars)) %>%
  mutate(mean = mean / 100) %>%
  arrange(mean)

# file.path() builds a separator-safe path; the hard-coded "\\" only worked
# on Windows.
write.csv(income2, file.path("data_processed", "mean_income_state.csv"))
# Combine the per-state mean fatality ratio (df1) with the per-state mean
# income (income2). "State" is the only column the two tables share.
df2 <- df1 %>%
  full_join(income2, by = "State")

# Look each abbreviation up by state name instead of assigning a fixed vector
# by row position, which silently mislabels states if the row order or count
# ever changes. `state.name` / `state.abb` ship with base R and cover the 50
# states; District of Columbia is added by hand.
state_abb_lookup <- c(
  setNames(state.abb, state.name),
  "District of Columbia" = "DC"
)
df2$abbs <- unname(state_abb_lookup[as.character(df2$State)])

# file.path() builds a separator-safe path; the hard-coded "\\" only worked
# on Windows.
write.csv(df2, file.path("data_processed", "income_ratio_join.csv"))

# Spearman rank correlation between mean income and mean fatality ratio.
corr <- cor(
  df2$mean,
  df2$mean_ratio,
  method = "spearman",
  use = "complete.obs"
)

corrLabel <- paste("r=", round(corr, 2))

# Fit the same relationship that geom_smooth(method = "lm") draws below
# (y = mean_ratio, x = mean). The model was previously fitted the other way
# round (mean ~ mean_ratio), so the printed equation did not describe the
# plotted line.
model <- lm(formula = mean_ratio ~ mean, data = df2)
# Two significant digits keep a small slope from rounding to zero.
coefs <- signif(coef(model), 2)

# "y = intercept +/- slope x"; the label previously omitted the "x" term.
modelLabel <- paste0(
  "y = ", coefs[1],
  if (coefs[2] < 0) " - " else " + ",
  abs(coefs[2]), "x"
)

# Scatter plot of mean fatality ratio against mean income, with state
# abbreviations, the fitted line, and the two labels computed above.
df2 %>%
  ggplot(aes(x = mean, y = mean_ratio)) +
  geom_point(size = 5, color = "gray80", alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_x_continuous(
    breaks = c(500, 600, 700, 800, 900)
  ) +
  scale_y_continuous(
    breaks = c(0, 0.5, 1.0, 1.5, 2.0, 2.5),
    limits = c(0, 2.5)
  ) +
  geom_text(
    aes(label = abbs),
    nudge_x = 0,
    nudge_y = 0,
    check_overlap = TRUE,
    size = 2
  ) +
  annotate(
    geom = "text", x = 450, y = 2.3, label = corrLabel, hjust = 0, size = 5
  ) +
  annotate(
    geom = "text", x = 450, y = 2.1,
    label = modelLabel, color = "blue", hjust = 0, size = 5
  ) +
  labs(
    title = "The higher income states had higher death ratios",
    caption = "*Ratio : mean of number recorded deaths * 100 divided by number confirmed cases\n Income : mean Income of states",
    x = "Income(100 dollars)",
    y = "Ratio"
  )


library(readxl)
gini2021 <- read_excel(here("data_raw", "gini_2021.xlsx"), sheet = 2, skip = 3)
gini2022 <- read_csv(here("data_raw", "csvData_2022_gini.csv"))

# The spreadsheet's title ends up as the name of the column holding the state
# names; the coefficient column is auto-named `...2`.
gini_state_col <- "Gini coefficient as a measure for household income distribution inequality in the United States 2021, by state"

# Mean Gini coefficient per state across the two sources, joined with the
# per-state mean fatality ratio (df1).
Gini <- gini2021 %>%
  filter(!is.na(`...2`), !is.na(.data[[gini_state_col]])) %>%
  rename(state = all_of(gini_state_col), giniCoefficient = `...2`) %>%
  # Rescale by 100 before combining with the second source.
  # NOTE(review): presumably gini2022 already stores the coefficient on this
  # scale -- confirm against the CSV.
  mutate(giniCoefficient = giniCoefficient * 100) %>%
  # Joins on whatever columns the two tables share.
  # NOTE(review): presumably `state` and `giniCoefficient` -- confirm.
  full_join(gini2022) %>%
  group_by(state) %>%
  summarize(mean_gini = mean(giniCoefficient)) %>%
  filter(
    !is.na(state),
    !state %in% c("Puerto Rico", "Rhode Island", "United States")
  ) %>%
  rename(State = state) %>%
  left_join(df1, by = "State")

# Look each abbreviation up by state name instead of assigning a fixed vector
# by row position, which silently mislabels states if the row order or count
# ever changes. `state.name` / `state.abb` ship with base R and cover the 50
# states; District of Columbia is added by hand.
state_abb_lookup <- c(
  setNames(state.abb, state.name),
  "District of Columbia" = "DC"
)
Gini$abbs <- unname(state_abb_lookup[as.character(Gini$State)])

# file.path() builds a separator-safe path; the hard-coded "\\" only worked
# on Windows.
write.csv(Gini, file.path("data_processed", "gini.csv"))
# Spearman rank correlation between mean Gini coefficient and mean fatality
# ratio.
corr <- cor(
  Gini$mean_gini,
  Gini$mean_ratio,
  method = "spearman",
  use = "complete.obs"
)

corrLabel <- paste("r=", round(corr, 2))

# Fit the same relationship that geom_smooth(method = "lm") draws below
# (y = mean_ratio, x = mean_gini). The model was previously fitted the other
# way round (mean_gini ~ mean_ratio), so the printed equation did not
# describe the plotted line.
model <- lm(formula = mean_ratio ~ mean_gini, data = Gini)
# Two significant digits keep a small slope from rounding to zero.
coefs <- signif(coef(model), 2)

# "y = intercept +/- slope x"; the label previously omitted the "x" term.
modelLabel <- paste0(
  "y = ", coefs[1],
  if (coefs[2] < 0) " - " else " + ",
  abs(coefs[2]), "x"
)

# Scatter plot of mean fatality ratio against mean Gini coefficient, with
# state abbreviations, the fitted line, and the two labels computed above.
Gini %>%
  ggplot(aes(x = mean_gini, y = mean_ratio)) +
  geom_point(size = 5, color = "gray80", alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_x_continuous(
    breaks = c(42, 45, 48, 51, 54),
    limits = c(42, 54)
  ) +
  scale_y_continuous(
    breaks = c(0, 0.5, 1.0, 1.5, 2.0, 2.5),
    limits = c(0, 2.5)
  ) +
  geom_text(
    aes(label = abbs),
    nudge_x = 0,
    nudge_y = 0,
    check_overlap = TRUE,
    size = 2
  ) +
  annotate(
    geom = "text", x = 42.5, y = 2.3, label = corrLabel, hjust = 0, size = 5
  ) +
  annotate(
    geom = "text", x = 42.5, y = 2.1,
    label = modelLabel, color = "blue", hjust = 0, size = 5
  ) +
  labs(
    title = "The higher income inequality states had higher death ratios",
    caption = "*Ratio : mean of number recorded deaths * 100 divided by number confirmed cases\n Mean Gini coefficient : mean of gini coefficient of 2020 and 2021 ",
    x = "Gini Coefficient",
    y = "Ratio"
  )


# Runs data_raw/state_abbs.R; the map code further down reads a `state_abbs`
# table with `state_name` / `state_abb` columns, presumably defined there.
source(here::here("data_raw", "state_abbs.R"))

vaccinePath <- here("data_raw", "us_state_vaccinations.csv")

# Locations in the vaccination table that are removed before mapping
# (territories, federal entities and the national total).
non_state_locations <- c(
  "Marshall Islands", "Federated States of Micronesia", "Dept of Defense",
  "Bureau of Prisons", "American Samoa", "Indian Health Svc", "Guam",
  "Northern Mariana Islands", "Republic of Palau", "Puerto Rico",
  "Veterans Health", "Virgin Islands", "United States"
)

# Snapshot of the vaccination table for 2022-12-07, one row per remaining
# location, with the location column renamed to `name` for the map join.
# NOTE(review): if the source labels New York as "New York State", it will
# not match the map's `name` column later -- confirm against the CSV.
us_vaccines <- read_csv(vaccinePath) %>%
  clean_names() %>%
  rename(name = location) %>%
  filter(date == as.Date("2022-12-07")) %>%
  filter(!is.na(name), !name %in% non_state_locations)

# `file` replaces the deprecated `path` argument of write_csv().
write_csv(us_vaccines, file = file.path("data_processed", "us_vaccines.csv"))

# State polygons without Alaska and Hawaii, joined to the vaccination
# snapshot and projected to ESRI:102003.
us_states <- ne_states(
  country = "united states of america",
  returnclass = "sf"
) %>%
  filter(!name %in% c("Alaska", "Hawaii")) %>%
  left_join(us_vaccines, by = "name") %>%
  st_transform(crs = "ESRI:102003")

# Quick look at the joined values.
us_states %>%
  select(name, people_fully_vaccinated_per_hundred) %>%
  head()

# Label positions: centroid coordinates of each state's geometry. Taking the
# geometry first avoids sf's warning about attributes being assumed constant.
centroids <- as.data.frame(
  st_coordinates(st_centroid(st_geometry(us_states)))
)
names(centroids) <- c("label_x", "label_y")
us_states <- bind_cols(us_states, centroids)

# Attach the state abbreviations used as map labels.
us_states <- us_states %>%
  left_join(state_abbs, by = c("name" = "state_name"))

# Choropleth of full-vaccination percentage with an abbreviation label per
# state. `label.size` is a fixed geom parameter, not an aesthetic, so it is
# passed outside aes().
ggplot(us_states) +
  geom_sf(aes(fill = people_fully_vaccinated_per_hundred)) +
  geom_label(
    aes(x = label_x, y = label_y, label = state_abb),
    label.size = 0.25
  ) +
  scale_fill_viridis(
    option = "plasma",
    limits = c(50, 110)
  ) +
  theme_void(base_size = 10) +
  theme(legend.position = "bottom") +
  labs(
    fill = "Percentage of People Fully Vaccinated",
    title = "Full COVID Vaccination by State in 2022"
  ) +
  coord_sf(crs = "ESRI:102003")

Research Question :

“Relationship between Covid 19 pandemic and the income equality among states”

Data Sources & Description

USA daily state reports (csse_covid_19_daily_reports_us)

Description:

Large repository of data including both US and other nations daily COVID-19 reports.

How it was collected:

The data is collected through dozens of sources such as US State health department websites, national health department websites such as ministries of health or departments of health.

By Whom it was collected:

The data has been collected, is being continuously updated, and managed by the Johns Hopkins University Center for Systems Science and Engineering.

When it was collected:

The data receives daily updates once per day between 04:45 and 05:15 UTC.

Discussion:

Because the reports are published daily, the amount of data was very large, so we had to find a way to select a subset of it. COVID-19 reporting in the US started on January 31, 2020, so we selected monthly data for the data frame. We are not sure whether this selection introduces bias, so in the future we will try using all of the data to see whether the results differ.

State quarterly personal income summary: personal income, population, per capita personal income

Description:

Data collected by the United States Bureau of Economic Analysis in conjunction with the US Department of Commerce regarding the income people living in each state and the District of Columbia get from wages, proprietors’ income, dividends, interest, rents, and government benefits. The intended use of these statistics is to help assess and compare the economic well-being of state residents.

Mid quarter population estimates by state are derived by BEA. BEA produced intercensal quarterly statistics for the second quarter of 2010 through the first quarter of 2020 that are tied to the Census Bureau decennial counts for 2010 and 2020. BEA developed intercensal population statistics because this data was not published when Census released state population data for 2020-2021, which are based on the 2020 decennial counts. BEA used the Census Bureau Das Gupta method (see https://www2.census.gov/programs-surveys/popest/technical-documentation/methodology/intercensal/2000-2010-intercensal-estimates-methodology.pdf), modified to account for an extra leap year day, to produce the intercensal population figures that will be used until Census releases its official intercensal population data. Mid quarter population estimates for the second quarter of 2020 through the second quarter of 2022 are tied to the Census Bureau decennial counts for 2020. Per capita personal income is total personal income divided by total quarterly population estimates. BEA produced intercensal population figures for the second quarter of 2010 to the first quarter of 2020 that are tied to the Census Bureau decennial counts for 2010 and 2020 to create consistent time series that are used to prepare per capita personal income statistics. BEA used the Census Bureau Das Gupta method (see https://www2.census.gov/programs-surveys/popest/technical-documentation/methodology/intercensal/2000-2010-intercensal-estimates-methodology.pdf), modified to account for an extra leap year day, to produce the intercensal population figures that will be used until Census releases its official intercensal population data.* Estimates prior to 1950 are not available for Alaska and Hawaii.Note. Millions of dollars, seasonally adjusted at annual rates. All dollar estimates are in current dollars (not adjusted for inflation). Calculations are performed on unrounded data.

Last updated: September 30, 2022–new statistics for 2022:Q2; revised statistics for 2017:Q1-2022:Q1.

How it was collected:

Most of the data comes from over 360 surveys and other data collections sponsored by other Federal agencies, such as statistical agencies, aggregate tax data sources, administrative and regulatory sources, and private trade sources.

By Whom it was collected:

The data is collected, maintained, and updated by the BEA and their partners.

When it was collected:

The most recent update to the data was on September 30th, 2022. The next update is scheduled for December 23rd, 2022.

Discussion:

The data include some places that are not considered states.

Gini Coefficient

Description

The errors of the Gini coefficient are edited out from the original US Census Bureau data. The Gini coefficient is calculated by looking at average income rates. A score of zero would reflect perfect income equality and a score of 1 indicates a society where one person would have all the money and all other people have nothing.

How it was collected

It was collected by American Community Survey on Households

By whom it was collected

US Census Bureau

when was it collected

2019 American Community Survey 1-Year Estimates

World Covid Data

Description

Collection of the COVID-19 data maintained by Our World In Data.

How and by whom it was collected

-Confirmed cases and deaths: our data comes from the COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University (JHU). We discuss how and when JHU collects and publishes this data here. The cases & deaths dataset is updated daily. Note: confirmed cases and deaths are collected by Johns Hopkins University by date of report, rather than date of test/death. Therefore the number they report on a given day does not necessarily represent the actual number on that date, because of the long reporting chain that exists between a new case/death and its inclusion in statistics. This also means that time series can show sudden changes (negative or positive) when a country corrects historical data, because it had previously under- or overestimated the number of cases/deaths.

-Hospitalizations and intensive care unit (ICU) admissions: our data is collected from official sources and collated by Our World in Data.

-Testing for COVID-19: this data is collected by the Our World in Data team from official reports. On 23 June 2022, we stopped adding new datapoints to our COVID-19 testing dataset.

-Vaccinations against COVID-19: this data is collected by the Our World in Data team from official reports.

-Other variables: this data is collected from a variety of sources (United Nations, World Bank, Global Burden of Disease, Blavatnik School of Government, etc.).

When it was collected

It is updated daily throughout the duration of the COVID-19 pandemic.

US State Vaccination Rates (us_state_vaccinations.csv)

Description:

State-by-state data on United States COVID-19 vaccinations.

Data Link:

https://github.com/owid/covid-19-data/tree/master/public/data/vaccinations#united-states-vaccination-data

How it was collected:

Data updated daily, and collected by the United States Centers for Disease Control and Prevention.

By Whom it was collected:

The data has been collected, is being continuously updated, and managed by the Our World in Data team.

When it was collected:

The data receives daily updates once per day between 04:45 and 05:15 UTC.

Discussion:

The data was collected on a day-by-day basis for each state and US territory. At first, cleaning the data seemed daunting because of its sheer size, but we only needed the most up-to-date measure of state vaccination rates. Therefore, it was filtered down to the most recent date available, and cleaned and renamed for use in plotting on a map.

Results and Charts

Before jumping into the analysis of COVID and income, we visualized how each variable behaved during the COVID pandemic.

Income

# Read the income table; the first three lines of the file are skipped.
income1 <- read_csv("data_raw/2019tbl.csv", skip = 3)

# Reshape to one row per (State, Time), express Dollars in thousands, and lump
# every state other than District of Columbia and Mississippi into "Other"
# before averaging per group.
income_three_years <- income1 %>%
  filter(GeoName != "United States") %>%
  select(-GeoFips) %>%
  rename(State = GeoName) %>%
  pivot_longer(
    cols = -State,
    names_to = "Time",
    values_to = "Dollars"
  ) %>%
  mutate(
    Dollars = Dollars / 1000,
    State = fct_other(State, keep = c("District of Columbia", "Mississippi"))
  ) %>%
  group_by(State, Time) %>%
  summarise(mean = mean(Dollars), .groups = "drop")

# Line chart of mean income over time. `theme_minimal()` is a complete theme,
# so the `theme(legend.position = "none")` that used to precede it had no
# effect; the legend is kept and titled through the colour aesthetic
# (`legend` is not an aesthetic that `labs()` recognises).
income_three_years %>%
  ggplot(aes(x = Time, y = mean, group = State, color = State)) +
  geom_point() +
  geom_line() +
  theme_minimal() +
  scale_color_manual(values = c("#8b0000", "#00008b", "dark grey")) +
  labs(
    y = "Dollars (thousand)",
    title = "Income had Increased Steadily even in Pandemic Situation",
    subtitle = "Whether highest income state,District of Columbia,\nor lowest income state, Mississippi, they all increased",
    color = "State"
  )

# file.path() builds a separator-safe path; the hard-coded "\\" only worked
# on Windows.
write.csv(income_three_years, file.path("data_processed", "income_three_years.csv"))

In this plot, 2019 (before the pandemic) was treated as a baseline. We expected the states that had higher incomes in 2019 to keep the higher incomes in 2020, during and after the pandemic, because these states have more wealthy people with earlier and easier access to vaccines and COVID information, as well as careers that could more easily shift to remote work, which would reduce the impact on their household income. Surprisingly, the graph shows that income in fact increased for all states after COVID-19, following an upward trend.

# Read the state income table; the first three lines of the file are skipped.
income <- read_csv("data_raw/us_income/Table.csv", skip = 3)

# GeoName values that are dropped before the analysis: the national total,
# the regional aggregates, and Rhode Island (excluded throughout this script).
excluded_geo_names <- c(
  "United States", "Rhode Island", "New England", "Mideast", "Great Lakes",
  "Plains", "Southeast", "Southwest", "Rocky Mountain", "Far West"
)

# Keep LineCode 3 rows, reshape to long form, express Dollars in thousands,
# and lump every state other than District of Columbia and Mississippi into
# "Other" before averaging per (State, Time).
income_two_states <- income %>%
  filter(LineCode == 3) %>%
  # `%in%` tests set membership for each row. The previous
  # `GeoName != c(...)` recycled the vector along the column and only removed
  # rows that happened to line up with the matching position.
  filter(!is.na(GeoName), !GeoName %in% excluded_geo_names) %>%
  select(-c(GeoFips, Description, LineCode)) %>%
  rename(State = GeoName) %>%
  pivot_longer(
    cols = -State,
    names_to = "Time",
    values_to = "Dollars"
  ) %>%
  mutate(
    Dollars = Dollars / 1000,
    State = fct_other(State, keep = c("District of Columbia", "Mississippi"))
  ) %>%
  group_by(State, Time) %>%
  summarise(mean = mean(Dollars), .groups = "drop")

# file.path() builds a separator-safe path; the hard-coded "\\" only worked
# on Windows.
write.csv(income_two_states, file.path("data_processed", "income_two_states.csv"))

# `theme_minimal()` is a complete theme, so the `theme(legend.position =
# "none")` that used to precede it had no effect; the legend is kept and
# titled through the colour aesthetic.
income_two_states %>%
  ggplot(aes(x = Time, y = mean, group = State, color = State)) +
  geom_point() +
  geom_line() +
  theme_minimal() +
  scale_color_manual(values = c("#8b0000", "#00008b", "dark grey")) +
  labs(
    y = "Dollars (thousand)",
    title = "Steady increase in income during the pandemic",
    color = "State"
  )

This graph shows the change in income since the start of COVID in the US. Income did not show a decreasing trend but rather increased as time passed. A sharp increase in the first quarter of 2021 also stands out.

How Covid Pandemic has changed over time

# US rows of the OWID COVID table; new_cases is rescaled to hundreds so the
# y axis label "Cases(100)" below matches the plotted values.
covid_timeline <- read_csv("data_raw/owid-covid-data.csv")
covid_timeline_filtered <- covid_timeline %>%
    filter(iso_code == "USA") %>%
    mutate(date = ymd(date)) %>%
    mutate(new_cases = new_cases / 100)
#View(covid_timeline_filtered)

#glimpse(covid_timeline_filtered)
# file.path() instead of a Windows-only "\\" separator.
write.csv(covid_timeline_filtered, file.path("data_processed", "covid_timeline_filtered.csv"))

# The dates must be wrapped in c(): as separate arguments the 2nd and 3rd
# strings were taken as `format` / `tryFormats`, yielding a single NA date.
d <- data.frame(
    date = as.Date(c(
        "2020-08-15", "2020-04-03", "2021-03-13",
        "2021-05-20", "2021-08-23", "2021-11-15"
    ))
)

# Events marked on both panels: a dashed vertical segment on the event date and
# a right-aligned grey label placed three days before it.
timeline_events <- data.frame(
    date = as.Date(c("2020-04-03", "2020-12-14", "2021-06-04", "2021-12-01")),
    label = c("Mask", "First Vaccine", "Delta", "Omicron")
)

# Build the marker layers shared by both panels; `label_y` is the height at
# which the event names are written.
event_markers <- function(label_y) {
    segments <- lapply(seq_len(nrow(timeline_events)), function(k) {
        day <- timeline_events$date[k]
        geom_segment(
            x = day, xend = day, y = 0, yend = 14000,
            color = 'black', linetype = 'dashed'
        )
    })
    labels <- annotate(
        'text',
        x = timeline_events$date - 3, y = label_y,
        color = 'grey50', hjust = 1, label = timeline_events$label
    )
    list(segments, labels)
}

# Upper panel: confirmed cases (in hundreds).
plot1 <- ggplot(covid_timeline_filtered) +
    geom_line(aes(x = date, y = new_cases), color = "#E51837", linewidth = 0.5) +
    scale_x_date(
        date_breaks = "3 months",
        date_labels = '%Y/%b'
    ) +
    labs(
        title = "Covid Timeline",
        x = "",
        y = "Cases(100)"
    ) +
    event_markers(label_y = 5000)

# Lower panel: deaths.
plot2 <- ggplot(covid_timeline_filtered) +
    geom_line(aes(x = date, y = new_deaths), color = "blue", linewidth = 0.5) +
    scale_x_date(
        date_breaks = "3 months",
        date_labels = '%Y/%b'
    ) +
    labs(
        x = "Time",
        y = "Deaths"
    ) +
    event_markers(label_y = 3500)

# patchwork: stack the cases panel above the deaths panel.
plot1/plot2

The graph above gives an overview of the COVID timeline. The red line in the upper panel shows the confirmed cases of COVID and the blue line in the lower panel shows the deaths from COVID. Keep in mind that the y scales differ. By comparing the two graphs, with important events marked as dashed lines, we can interpret how the COVID situation has changed.

Income of all 50 States

# Mean income per state (LineCode == 3 rows, averaged over every period column),
# in thousands of dollars, sorted from highest to lowest.
income_per_capita <- income %>%
    filter(LineCode == 3) %>%
    # Membership test: `GeoName != c(...)` recycled the vector down the rows and
    # therefore did not reliably remove the national/regional aggregates.
    filter(
        !is.na(GeoName),
        !GeoName %in% c(
            "United States", "Rhode Island", "New England", "Mideast",
            "Great Lakes", "Plains", "Southeast", "Southwest",
            "Rocky Mountain", "Far West"
        )
    ) %>%
    # Strip the footnote marker from the two state names that carry one.
    mutate(GeoName = fct_recode(GeoName, "Hawaii" = "Hawaii *", "Alaska" = "Alaska *")) %>%
    select(-c(GeoFips, Description, LineCode)) %>%
    rename("State" = "GeoName") %>%
    pivot_longer(
        cols = -State,
        names_to = "Time",
        values_to = "Dollars"
    ) %>%
    # Factor order (by Dollars) drives the y-axis order of the lollipop chart.
    mutate(State = fct_reorder(State, Dollars)) %>%
    mutate(Dollars = Dollars / 1000) %>%
    group_by(State) %>%
    summarise(mean = mean(Dollars)) %>%
    arrange(desc(mean))

# Lollipop chart with the across-state mean as a dashed red reference line.
ggplot(income_per_capita) +
    geom_segment(aes(x = 0, xend = mean, y = State, yend = State), color = 'gray') +
    geom_point(aes(x = mean, y = State), color = 'Steelblue') +
    geom_vline(xintercept = mean(income_per_capita$mean), color = 'red', linetype = 'dashed') +
    annotate('text', x = 62, y = 'Michigan', color = 'red', hjust = 0, label = 'mean') +
    scale_x_continuous(expand = expansion(mult = c(0, 0.05))) +
    theme_minimal() +
    labs(
        x = 'Income(1000 dollars)',
        y = 'State',
        title = 'The ranking of State income'
    )

# file.path() instead of a Windows-only "\\" separator.
write.csv(income_per_capita, file.path("data_processed", "income_per_capita.csv"))

The intention behind this chart is to visualize the income ranking in order to identify the higher-income and lower-income states. Income was estimated as the mean of all the quarters throughout 2020, 2021 and 2022.

# Read every daily state report and keep only those with the 21-column layout,
# then stack them into one table.
files <- list.files(here::here('data_raw', 'us_covid_neverend'))
paths <- here::here('data_raw', 'us_covid_neverend', files)

# lapply() over `paths` is safe when the folder is empty, whereas
# `1:length(files)` would iterate over c(1, 0) and try to read a missing path.
data <- suppressMessages(lapply(paths, read_csv))
data <- Filter(function(report) ncol(report) == 21, data)

# lapply(data, function(x) length(names(x)))

# rbind() is kept on purpose: Deaths and Case_Fatality_Ratio are converted with
# as.numeric() below, which suggests their types may differ between files
# (NOTE(review): confirm before switching to bind_rows()).
daily_covid_state <- do.call(rbind, data)

# Non-state entries plus Rhode Island are excluded; rows with a missing
# Province_State are dropped as well, as the original `!=` filters did.
daily_covid_state_filtered <- daily_covid_state %>%
    select(Province_State, Date, Confirmed, Deaths, Case_Fatality_Ratio) %>%
    filter(
        !is.na(Province_State),
        !Province_State %in% c(
            "American Samoa", "Diamond Princess", "Rhode Island",
            "Grand Princess", "Guam", "Puerto Rico",
            "Northern Mariana Islands", "Virgin Islands"
        )
    ) %>%
    filter(!is.na(Deaths)) %>%
    filter(!is.na(Case_Fatality_Ratio)) %>%
    mutate(
        Deaths = as.numeric(Deaths),
        Case_Fatality_Ratio = as.numeric(Case_Fatality_Ratio),
        Date = as.Date(Date)
    )

# file.path() instead of a Windows-only "\\" separator.
write.csv(daily_covid_state_filtered, file.path("data_processed", "daily_covid_state_filtered.csv"))

COVID Death Ratio per State

# Mean case-fatality ratio per state over all daily reports, highest first.
# Non-state entries plus Rhode Island are excluded; rows with a missing
# Province_State are dropped as well, as the original `!=` filters did.
state_fatality_ratio <- daily_covid_state %>%
    select(Province_State, Date, Confirmed, Deaths, Case_Fatality_Ratio) %>%
    filter(
        !is.na(Province_State),
        !Province_State %in% c(
            "American Samoa", "Diamond Princess", "Rhode Island",
            "Grand Princess", "Guam", "Puerto Rico",
            "Northern Mariana Islands", "Virgin Islands"
        )
    ) %>%
    mutate(Case_Fatality_Ratio = as.numeric(Case_Fatality_Ratio)) %>%
    filter(!is.na(Case_Fatality_Ratio)) %>%
    group_by(Province_State) %>%
    summarise(Case_Fatality_Ratio = mean(Case_Fatality_Ratio)) %>%
    arrange(desc(Case_Fatality_Ratio)) %>%
    # Factor order drives the y-axis order of the lollipop chart.
    mutate(Province_State = fct_reorder(Province_State, Case_Fatality_Ratio))

# Lollipop chart with the across-state mean as a dashed red reference line.
ggplot(state_fatality_ratio) +
    geom_segment(
        aes(x = 0, xend = Case_Fatality_Ratio, y = Province_State, yend = Province_State),
        color = 'gray'
    ) +
    geom_point(aes(x = Case_Fatality_Ratio, y = Province_State), color = 'Steelblue') +
    geom_vline(
        xintercept = mean(state_fatality_ratio$Case_Fatality_Ratio),
        color = 'red', linetype = 'dashed'
    ) +
    annotate('text', x = 1.45, y = 'Washington', color = 'red', hjust = 0, label = 'mean') +
    scale_x_continuous(expand = expansion(mult = c(0, 0.05))) +
    theme_minimal() +
    labs(
        x = 'Death Ratio',
        y = 'State',
        title = 'The ranking of Covid Death Ratio in States'
    )

# file.path() instead of a Windows-only "\\" separator.
write.csv(state_fatality_ratio, file.path("data_processed", "state_fatality_ratio.csv"))

The purpose of this chart is to visualize the ranking of the death ratio among states in order to identify the states with higher and lower death ratios. The death ratio is the number of recorded deaths * 100 divided by the number of confirmed cases.

# Income ranking: one row per state, ordered from lowest to highest mean income,
# with Sequence = rank (1 = lowest). Type tags the rows for the slope chart.
A <- income %>%
    # Strip the footnote marker from the two state names that carry one.
    mutate(GeoName = fct_recode(GeoName, "Hawaii" = "Hawaii *", "Alaska" = "Alaska *")) %>%
    filter(LineCode == 3) %>%
    select(-c(GeoFips, Description, LineCode)) %>%
    rename("State" = "GeoName") %>%
    # Drop the national/regional aggregates and Rhode Island; rows with a
    # missing name are dropped too, as the original `!=` filters did.
    filter(
        !is.na(State),
        !State %in% c(
            "United States", "Rhode Island", "Rocky Mountain", "Plains",
            "Far West", "Mideast", "Great Lakes", "Southwest", "Southeast",
            "New England"
        )
    ) %>%
    pivot_longer(
        cols = -State,
        names_to = "Time",
        values_to = "Dollars"
    ) %>%
    mutate(State = fct_reorder(State, Dollars)) %>%
    mutate(Dollars = Dollars / 1000) %>%
    group_by(State) %>%
    summarise(mean = mean(Dollars)) %>%
    arrange(mean) %>%
    mutate(Type = 'Income') %>%
    select(State, Type) %>%
    # Rank from the actual row count rather than a hard-coded 1:50.
    mutate(Sequence = row_number())

# file.path() instead of a Windows-only "\\" separator.
write.csv(A, file.path("data_processed", "income_sequence.csv"))

# Death-ratio ranking: one row per state, ordered from lowest to highest mean
# case-fatality ratio, with Sequence = rank (1 = lowest).
B <- daily_covid_state %>%
    select(Province_State, Date, Confirmed, Deaths, Case_Fatality_Ratio) %>%
    # Non-state entries plus Rhode Island are excluded; rows with a missing
    # Province_State are dropped too, as the original `!=` filters did.
    filter(
        !is.na(Province_State),
        !Province_State %in% c(
            "American Samoa", "Diamond Princess", "Rhode Island",
            "Grand Princess", "Guam", "Puerto Rico",
            "Northern Mariana Islands", "Virgin Islands"
        )
    ) %>%
    mutate(Case_Fatality_Ratio = as.numeric(Case_Fatality_Ratio)) %>%
    filter(!is.na(Case_Fatality_Ratio)) %>%
    group_by(Province_State) %>%
    summarise(Case_Fatality_Ratio = mean(Case_Fatality_Ratio)) %>%
    arrange(Case_Fatality_Ratio) %>%
    mutate(Province_State = fct_reorder(Province_State, Case_Fatality_Ratio)) %>%
    # The former `Case_Fatality_Ratio * 50` step was removed: the column is
    # discarded by the select() below, so the rescaling had no effect.
    mutate(Type = 'Death Ratio') %>%
    rename("State" = "Province_State") %>%
    select(State, Type) %>%
    # Rank from the actual row count rather than a hard-coded 1:50.
    mutate(Sequence = row_number())

# file.path() instead of a Windows-only "\\" separator.
write.csv(B, file.path("data_processed", "ratio_sequence.csv"))


# States listed from income rank 1 (lowest) to 50 (highest); the position in
# this vector is the rank used as the colour key (`lineColor`) in the slope
# charts. "New Hampshire" was previously misspelled, which left that state
# without a colour key.
income_rank_order <- c(
    "Mississippi", "West Virginia", "Alabama", "New Mexico", "Arkansas",
    "Kentucky", "South Carolina", "Idaho", "Oklahoma", "Louisiana",
    "Missouri", "Arizona", "Georgia", "Utah", "North Carolina",
    "Indiana", "Tennessee", "Michigan", "Ohio", "Montana",
    "Iowa", "Maine", "Kansas", "Nevada", "Wisconsin",
    "Texas", "Delaware", "Hawaii", "Oregon", "Nebraska",
    "Florida", "Vermont", "Pennsylvania", "South Dakota", "North Dakota",
    "Minnesota", "Alaska", "Virginia", "Illinois", "Maryland",
    "Wyoming", "Colorado", "New Hampshire", "Washington", "California",
    "New York", "New Jersey", "Connecticut", "Massachusetts",
    "District of Columbia"
)
income_rank <- setNames(as.character(seq_along(income_rank_order)), income_rank_order)

# Long table with two rows per state (one 'Income', one 'Death Ratio').
# A and B share all three columns and never share a Type value, so the full
# join simply stacks them; the keys are spelled out to make that explicit.
C <- A %>%
    full_join(B, by = c("State", "Type", "Sequence")) %>%
    mutate(State = fct_recode(State, "Hawaii" = "Hawaii *", "Alaska" = "Alaska *")) %>%
    mutate(
        # States missing from the lookup get NA, as with the former case_when().
        lineColor = unname(income_rank[as.character(State)]),
        label = paste(State, '(', Sequence, ')'),
        # Each label is shown on one side only: death ratio left, income right.
        label_left = ifelse(Type == 'Death Ratio', label, NA),
        label_right = ifelse(Type == 'Income', label, NA)
    )

# file.path() instead of a Windows-only "\\" separator.
write.csv(C, file.path("data_processed", "join_sequences.csv"))

State Income vs COVID Death Ratio

library(ggrepel)
# Slope chart of death-ratio rank vs income rank, one line per state, with the
# five highest-income states (colour keys 46-50) drawn in red.
red <- ggplot(C, aes(x = Type, y = Sequence, group = State)) +
    geom_line(aes(color = lineColor), linewidth = 0.6) +
    geom_text_repel(aes(label = label_left), hjust = 1, nudge_x = -0.05) +
    geom_text_repel(aes(label = label_right), hjust = 0, nudge_x = 0.05) +
    # In a partly named `values` vector the unnamed entries never match a
    # level, so the background colour is supplied through `na.value` instead.
    scale_color_manual(
        values = c('50' = 'red', '49' = 'red', '48' = 'red', '47' = 'red', '46' = 'red'),
        na.value = 'gray80'
    ) +
    theme(legend.position = "none") +
    scale_x_discrete(position = 'top') +
    labs(
        x = ' ',
        y = 'Ranking',
        title = 'Comparing Ranking of Death Ratio and Income by States',
        subtitle = '5 Highest Income States had relatively high covid death ratio'
    )
red

The relationship between those two variables is visualized by sequencing the order of 50 states in both income and death ratio. The higher number in ranking means higher values in both sections. For example, District of Columbia has highest income in all states and New Jersey has the highest death ratio among all states.

library(ggrepel)
# Same slope chart with a selection of middle-income states (colour keys listed
# below) drawn in green.
green <- ggplot(C, aes(x = Type, y = Sequence, group = State)) +
    geom_line(aes(color = lineColor), linewidth = 0.7) +
    geom_text_repel(aes(label = label_left), hjust = 1, nudge_x = -0.05) +
    geom_text_repel(aes(label = label_right), hjust = 0, nudge_x = 0.05) +
    # In a partly named `values` vector the unnamed entry never matches a
    # level, so the background colour is supplied through `na.value` instead.
    scale_color_manual(
        values = c(
            '45' = 'green', '44' = 'green', '43' = 'green', '42' = 'green',
            '41' = 'green', '37' = 'green', '36' = 'green', '35' = 'green',
            '32' = 'green', '30' = 'green', '29' = 'green', '28' = 'green',
            '25' = 'green'
        ),
        na.value = 'grey'
    ) +
    theme(legend.position = "none") +
    scale_x_discrete(position = 'top') +
    labs(
        x = ' ',
        y = '',
        subtitle = 'Middle Income States had relatively lower covid death ratio'
    )
green

library(ggrepel)
# Same slope chart with the lowest-income states (colour keys listed below)
# drawn in blue.
blue <- ggplot(C, aes(x = Type, y = Sequence, group = State)) +
    geom_line(aes(color = lineColor), linewidth = 0.7) +
    geom_text_repel(aes(label = label_left), hjust = 1, nudge_x = -0.05) +
    geom_text_repel(aes(label = label_right), hjust = 0, nudge_x = 0.05) +
    # In a partly named `values` vector the unnamed entries never match a
    # level, so the background colour is supplied through `na.value` instead.
    scale_color_manual(
        values = c(
            '1' = 'blue', '2' = 'blue', '3' = 'blue', '4' = 'blue',
            '5' = 'blue', '7' = 'blue', '10' = 'blue', '12' = 'blue',
            '13' = 'blue', '16' = 'blue', '18' = 'blue', '19' = 'blue'
        ),
        na.value = 'gray'
    ) +
    theme(legend.position = "none") +
    scale_x_discrete(position = 'top') +
    labs(
        x = '',
        y = '',
        subtitle = '5 Lowest Income States had relatively high covid death ratio'
    )
blue

# patchwork: place the three slope charts side by side.
red + green + blue

The results were not exactly what we had anticipated. From the graphs we drew, the top 5 income states are District of Columbia, Massachusetts, Connecticut, New Jersey, New York and the lowest 5 states were Mississippi, West Virginia, Alabama, New Mexico, Arkansas. The top 5 highest COVID death ratio states were New Jersey, New York, Massachusetts, Connecticut, Pennsylvania and the lowest 5 was Alaska, Utah, Vermont, Nebraska, Hawaii. We originally thought that states with higher income will have a lower death ratio. However, from the graph above, the higher income state had higher death ratio. Also the lower income states also had relatively high death ratio. However, the middle-high income states had low death ratio.

By comparing three graphs in one look, the general trend was notable. Shockingly enough, the higher income states had high death ratio as the red lines indicate. The blue lines which is the lower income states, had relatively high death ratio as well. The middle income states in reverse had rather lower death ratio indicated in green lines. The sequence was a way to get a hold of the general trend. Another way to analyze this is to match the exact numbers by visualizing them on scatterplot.

# Mean case-fatality ratio per state, keyed by `State` for the joins below.
# Case_Fatality_Ratio is converted with as.numeric() before averaging, matching
# the other summaries of this column in the report.
df1 <- daily_covid_state %>%
    # Non-state entries plus Rhode Island are excluded; rows with a missing
    # Province_State are dropped too, as the original `!=` filters did.
    filter(
        !is.na(Province_State),
        !Province_State %in% c(
            "American Samoa", "Diamond Princess", "Rhode Island",
            "Grand Princess", "Guam", "Puerto Rico",
            "Northern Mariana Islands", "Virgin Islands"
        )
    ) %>%
    mutate(Case_Fatality_Ratio = as.numeric(Case_Fatality_Ratio)) %>%
    filter(!is.na(Case_Fatality_Ratio)) %>%
    rename("State" = "Province_State") %>%
    select(Case_Fatality_Ratio, State) %>%
    group_by(State) %>%
    summarize(mean_ratio = mean(Case_Fatality_Ratio))

# file.path() instead of a Windows-only "\\" separator.
write.csv(df1, file.path("data_processed", "mean_ratio_state.csv"))
# Mean income per state (LineCode == 3 rows), expressed in hundreds of dollars
# and sorted from lowest to highest.
income2 <- income %>%
    # Strip the footnote marker from the two state names that carry one.
    mutate(GeoName = fct_recode(GeoName, "Hawaii" = "Hawaii *", "Alaska" = "Alaska *")) %>%
    filter(LineCode == 3) %>%
    select(-c(GeoFips, Description, LineCode)) %>%
    rename("State" = "GeoName") %>%
    # Drop the national/regional aggregates and Rhode Island; rows with a
    # missing name are dropped too, as the original `!=` filters did.
    filter(
        !is.na(State),
        !State %in% c(
            "United States", "Rhode Island", "Rocky Mountain", "Plains",
            "Far West", "Mideast", "Great Lakes", "Southwest", "Southeast",
            "New England"
        )
    ) %>%
    pivot_longer(
        cols = -State,
        names_to = "Time",
        values_to = "Dollars"
    ) %>%
    mutate(State = fct_reorder(State, Dollars)) %>%
    group_by(State) %>%
    summarise(mean = mean(Dollars)) %>%
    mutate(mean = mean / 100) %>%
    arrange(mean)

# file.path() instead of a Windows-only "\\" separator.
write.csv(income2, file.path("data_processed", "mean_income_state.csv"))
# One row per state with both the mean death ratio and the mean income.
df2 <- df1 %>%
    full_join(income2, by = "State")

# Look the postal abbreviation up by state name (base R `state.name` /
# `state.abb`, plus DC) instead of assigning a hand-typed vector by row
# position, which silently mislabels points if the row order or count changes.
state_abb_lookup <- c(setNames(state.abb, state.name), "District of Columbia" = "DC")
df2$abbs <- unname(state_abb_lookup[as.character(df2$State)])

# file.path() instead of a Windows-only "\\" separator.
write.csv(df2, file.path("data_processed", "income_ratio_join.csv"))
# Spearman rank correlation between mean income and mean death ratio.
corr <- cor(
    df2$mean,
    df2$mean_ratio,
    method = 'spearman',
    use = 'complete.obs'
)

corrLabel <- paste("r=", round(corr, 2))

# Fit the same line geom_smooth() draws below (y = mean_ratio, x = mean). The
# model used to be fitted the other way round (mean ~ mean_ratio), so the
# printed equation did not describe the plotted line.
model <- lm(formula = mean_ratio ~ mean, data = df2)
# Significant digits rather than decimals: x is in hundreds of dollars, so the
# slope is small and would otherwise round to 0.
coefs <- signif(coef(model), 2)

# "y = intercept +/- slope x"; the x term was missing from the label.
modelLabel <- paste0(
    'y = ', coefs[1],
    if (isTRUE(coefs[2] < 0)) ' - ' else ' + ',
    abs(coefs[2]), 'x'
)

# Scatter of death ratio against income with state abbreviations on the points.
df2 %>%
    ggplot(aes(x = mean, y = mean_ratio)) +
    geom_point(size = 5, color = 'gray80', alpha = 0.8) +
    geom_smooth(method = 'lm', se = FALSE) +
    scale_x_continuous(
        breaks = c(500, 600, 700, 800, 900)
    ) +
    scale_y_continuous(
        breaks = c(0, 0.5, 1.0, 1.5, 2.0, 2.5),
        limits = c(0, 2.5)
    ) +
    geom_text(
        aes(label = abbs),
        nudge_x = 0,
        nudge_y = 0,
        check_overlap = TRUE,
        size = 2
    ) +
    annotate(
        geom = 'text', x = 450, y = 2.3, label = corrLabel, hjust = 0, size = 5
    ) +
    annotate(
        geom = 'text', x = 450, y = 2.1,
        label = modelLabel, color = 'blue', hjust = 0, size = 5
    ) +
    labs(
        title = "The higher income states had higher death ratios",
        caption = "*Ratio : mean of number recorded deaths * 100 divided by number confirmed cases\n Income : mean Income of states",
        x = "Income(100 dollars)",
        y = "Ratio"
    )

While the correlation was weak, it did come with a twist: higher income states had higher death ratios and lower income states had lower death ratios. This mean value was estimated through GDP, and through further research we found that income can be estimated through many different indexes. Therefore, we decided to do another analysis of the death ratio based on the Gini coefficient. The Gini coefficient measures the extent to which the distribution of income within a country deviates from a perfectly equal distribution. A coefficient of 0 expresses perfect equality, where everyone has the same income, while a coefficient of 100 expresses full inequality, where only one person has all the income.

library(readxl)
# Mean Gini coefficient per state across the two source files, joined with the
# mean death ratio (df1).
gini2021 <- read_excel(here('data_raw', 'gini_2021.xlsx'), sheet = 2, skip = 3)
gini2022 <- read_csv(here("data_raw", 'csvData_2022_gini.csv'))
Gini <- gini2021 %>%
    filter(!is.na(...2)) %>%
    filter(!is.na(`Gini coefficient as a measure for household income distribution inequality in the United States 2021, by state`)) %>%
    rename("state" = "Gini coefficient as a measure for household income distribution inequality in the United States 2021, by state") %>%
    rename("giniCoefficient" = "...2") %>%
    # NOTE(review): presumably rescales the 2021 values to the scale used by the
    # 2022 file - confirm against the raw data.
    mutate(giniCoefficient = giniCoefficient * 100) %>%
    # Join keys are left implicit because the 2022 file's columns are not
    # visible here. NOTE(review): spell out `by =` once confirmed.
    full_join(gini2022) %>%
    group_by(state) %>%
    summarize(mean_gini = mean(giniCoefficient)) %>%
    filter(state != "Puerto Rico") %>%
    filter(state != "Rhode Island") %>%
    filter(state != "United States") %>%
    rename("State" = "state") %>%
    left_join(df1, by = 'State')

# Look the postal abbreviation up by state name (base R `state.name` /
# `state.abb`, plus DC) instead of assigning a hand-typed vector by row
# position, which silently mislabels points if the row order or count changes.
gini_abb_lookup <- c(setNames(state.abb, state.name), "District of Columbia" = "DC")
Gini$abbs <- unname(gini_abb_lookup[as.character(Gini$State)])

# file.path() instead of a Windows-only "\\" separator.
write.csv(Gini, file.path("data_processed", "gini.csv"))
# Spearman rank correlation between mean Gini coefficient and mean death ratio.
corr <- cor(
    Gini$mean_gini,
    Gini$mean_ratio,
    method = 'spearman',
    use = 'complete.obs'
)

corrLabel <- paste("r=", round(corr, 2))

# Fit the same line geom_smooth() draws below (y = mean_ratio, x = mean_gini).
# The model used to be fitted the other way round (mean_gini ~ mean_ratio), so
# the printed equation did not describe the plotted line.
model <- lm(formula = mean_ratio ~ mean_gini, data = Gini)
coefs <- signif(coef(model), 2)

# "y = intercept +/- slope x"; the x term was missing from the label.
modelLabel <- paste0(
    'y = ', coefs[1],
    if (isTRUE(coefs[2] < 0)) ' - ' else ' + ',
    abs(coefs[2]), 'x'
)

# Scatter of death ratio against Gini coefficient with state abbreviations.
Gini %>%
    ggplot(aes(x = mean_gini, y = mean_ratio)) +
    geom_point(size = 5, color = 'gray80', alpha = 0.8) +
    geom_smooth(method = 'lm', se = FALSE) +
    scale_x_continuous(
        breaks = c(42, 45, 48, 51, 54),
        limits = c(42, 54)
    ) +
    scale_y_continuous(
        breaks = c(0, 0.5, 1.0, 1.5, 2.0, 2.5),
        limits = c(0, 2.5)
    ) +
    geom_text(
        aes(label = abbs),
        nudge_x = 0,
        nudge_y = 0,
        check_overlap = TRUE,
        size = 2
    ) +
    annotate(
        geom = 'text', x = 42.5, y = 2.3, label = corrLabel, hjust = 0, size = 5
    ) +
    annotate(
        geom = 'text', x = 42.5, y = 2.1,
        label = modelLabel, color = 'blue', hjust = 0, size = 5
    ) +
    labs(
        title = "The higher income inequality states had higher death ratios",
        caption = "*Ratio : mean of number recorded deaths * 100 divided by number confirmed cases\n Mean Gini coefficient : mean of gini coefficient of 2020 and 2021 ",
        x = "Gini Coefficient",
        y = "Ratio"
    )

This coefficient gave us a more evident correlation between income and COVID mortality. The states with higher income inequality had higher death ratios and the states with lower income inequality had lower death ratios, which is a remarkable result since it is the exact opposite of what we had expected. Therefore, we came to the conclusion that income is not a simple index that can be captured by a single measure.

reading on inequality and covid https://www.imf.org/external/pubs/ft/fandd/2021/06/inequality-and-covid-19-ferreira.htm

This page is about COVID and income inequality. It explains how challenging it is to make definitive statements about the concept of inequality.

Income is a complex variable that is influenced by a lot of different elements such as education, politics and in our case of covid analysis, healthcare, hospitality and vaccine rates. Therefore, we decided to do further study on some of those variables as well.

One such study was to see if education and vaccination rate were related:

# Run the helper script used for the state abbreviation table (`state_abbs`
# is joined onto the map data further down).
source(here::here('data_raw', 'state_abbs.R'))

vaccinePath <- here("data_raw", "us_state_vaccinations.csv")

# Vaccination snapshot for a single date, restricted to states: territories,
# federal entities and the national total are removed. Rows with a missing
# name are dropped as well, as the original `!name == ...` conditions did.
# NOTE(review): `name` is later joined to the Natural Earth state names -
# verify that every state is spelled identically in both sources.
us_vaccines <- read_csv(vaccinePath) %>%
    clean_names() %>%
    rename(name = location) %>%
    filter(date == "2022-12-07") %>%
    filter(
        !is.na(name),
        !name %in% c(
            'Marshall Islands', 'Federated States of Micronesia',
            'Dept of Defense', 'Bureau of Prisons', 'American Samoa',
            'Indian Health Svc', 'Guam', 'Northern Mariana Islands',
            'Republic of Palau', 'Puerto Rico', 'Veterans Health',
            'Virgin Islands', 'United States'
        )
    )

# The output file is passed positionally: readr deprecated the `path`
# argument name in favour of `file`.
write_csv(us_vaccines, file.path('data_processed', 'us_vaccines.csv'))

# State polygons (Alaska and Hawaii excluded) with the vaccination figures
# attached by state name, re-projected to ESRI:102003.
us_states <- st_transform(
    left_join(
        filter(
            ne_states(country = 'united states of america', returnclass = 'sf'),
            !name %in% c('Alaska', 'Hawaii')
        ),
        us_vaccines,
        by = 'name'
    ),
    crs = "ESRI:102003"
)

# Preview of the joined data: state name and share of people fully vaccinated.
head(select(us_states, name, people_fully_vaccinated_per_hundred))
#> Simple feature collection with 6 features and 2 fields
#> Geometry type: MULTIPOLYGON
#> Dimension:     XY
#> Bounding box:  xmin: -2137947 ymin: 513615.1 xmax: 1098157 ymax: 1564784
#> Projected CRS: USA_Contiguous_Albers_Equal_Area_Conic
#>           name people_fully_vaccinated_per_hundred
#> 1    Minnesota                               71.72
#> 2   Washington                               75.55
#> 3        Idaho                               56.19
#> 4      Montana                               58.88
#> 5 North Dakota                               58.26
#> 6     Michigan                               62.06
#>                         geometry
#> 1 MULTIPOLYGON (((61831.83 13...
#> 2 MULTIPOLYGON (((-1949099 15...
#> 3 MULTIPOLYGON (((-1546132 14...
#> 4 MULTIPOLYGON (((-1474486 14...
#> 5 MULTIPOLYGON (((-595576.1 1...
#> 6 MULTIPOLYGON (((884461.6 10...
# Centroid coordinates of every state polygon, used as label anchor points.
centroids <- us_states %>%
    st_centroid() %>%
    st_coordinates() %>%
    as.data.frame() %>%
    setNames(c('label_x', 'label_y'))

# Attach the label positions, then the abbreviation for each state name.
us_states <- us_states %>%
    bind_cols(centroids) %>%
    left_join(state_abbs, by = c('name' = 'state_name'))



# Choropleth of the share of people fully vaccinated, with each state's
# abbreviation drawn at its centroid.
ggplot(us_states) +
    geom_sf(aes(fill = people_fully_vaccinated_per_hundred)) +
    # label.size is a fixed layer parameter, not an aesthetic: inside aes() it
    # was ignored with an "unknown aesthetics" warning.
    geom_label(
        aes(x = label_x, y = label_y, label = state_abb),
        label.size = 0.25
    ) +
    scale_fill_viridis(
        option = "plasma",
        limits = c(50, 110)
    ) +
    theme_void(base_size = 10) +
    theme(legend.position = 'bottom') +
    labs(
        fill = 'Percentage of People Fully Vaccinated',
        title = 'Full COVID Vaccination by State in 2022'
    ) +
    coord_sf(crs = 'ESRI:102003')

According to US News, using data collected from the census, Department of Education, and College Board, Massachusetts is ranked number two in the country in education. Additionally, they are ranked number two in health care. Using the charts created above, full vaccination percentage and deaths due to COVID, we can conclude that the quality of public health and education have a significant impact on the vaccination rate and deaths occurring in a state. Massachusetts, leading in both of those sectors, fared much better with COVID than most states. They had a very high vaccination rate, and furthermore, one of the lower death and case rates in the country by capita. Our assumption that income alone would have a significant impact on the effect of COVID on states or peoples in those states was wrong, as the spread and consequences of the virus are due to multiple variables. The quality of health care provided, the education provided to the people in the state, and the legislation and actions of the local government are just a few of the several variables that impact the spread and severity of the COVID-19 virus.

Appendix (data dictionary)

USA daily state reports (csse_covid_19_daily_reports_us)

Province_State - The name of the State within the USA.
Country_Region - The name of the Country (US).
Last_Update - The most recent date the file was pushed.
Lat - Latitude.
Long - Longitude.
Confirmed - Aggregated case count for the state.
Deaths - Aggregated death toll for the state.
Recovered - Aggregated Recovered case count for the state.
Active - Aggregated confirmed cases that have not been resolved (Active cases = total cases - total recovered - total deaths).
FIPS - Federal Information Processing Standards code that uniquely identifies counties within the USA.
Incident_Rate - cases per 100,000 persons.
Total_Test_Results - Total number of people who have been tested.
People_Hospitalized - Total number of people hospitalized. (Nullified on Aug 31, see Issue #3083)
Case_Fatality_Ratio - Number recorded deaths * 100/ Number confirmed cases.
UID - Unique Identifier for each row entry.
ISO3 - Officially assigned country code identifiers.
Testing_Rate - Total test results per 100,000 persons. The “total test results” are equal to “Total test results (Positive + Negative)” from COVID Tracking Project.
Hospitalization_Rate - US Hospitalization Rate (%): = Total number hospitalized / Number cases. The “Total number hospitalized” is the “Hospitalized – Cumulative” count from COVID Tracking Project. The “hospitalization rate” and “Total number hospitalized” is only presented for those states which provide cumulative hospital data.

State quarterly personal income summary: personal income, population, per capita personal income

GeoFips - area codes in US
GeoName - State and province name
LineCode - Line numbers indicating what category the description is
Description - category of whether values indicate personal income or population or per capita personal income
202X:QX - income in dollars in specific year and quarters

World Covid Data

Confirmed cases
Variable Description
total_cases Total confirmed cases of COVID-19. Counts can include probable cases, where reported.
new_cases New confirmed cases of COVID-19. Counts can include probable cases, where reported. In rare cases where our source reports a negative daily change due to a data correction, we set this metric to NA.
new_cases_smoothed New confirmed cases of COVID-19 (7-day smoothed). Counts can include probable cases, where reported.
total_cases_per_million Total confirmed cases of COVID-19 per 1,000,000 people. Counts can include probable cases, where reported.
new_cases_per_million New confirmed cases of COVID-19 per 1,000,000 people. Counts can include probable cases, where reported.
new_cases_smoothed_per_million New confirmed cases of COVID-19 (7-day smoothed) per 1,000,000 people. Counts can include probable cases, where reported.
Confirmed deaths
Variable Description
total_deaths Total deaths attributed to COVID-19. Counts can include probable deaths, where reported.
new_deaths New deaths attributed to COVID-19. Counts can include probable deaths, where reported. In rare cases where our source reports a negative daily change due to a data correction, we set this metric to NA.
new_deaths_smoothed New deaths attributed to COVID-19 (7-day smoothed). Counts can include probable deaths, where reported.
total_deaths_per_million Total deaths attributed to COVID-19 per 1,000,000 people. Counts can include probable deaths, where reported.
new_deaths_per_million New deaths attributed to COVID-19 per 1,000,000 people. Counts can include probable deaths, where reported.
new_deaths_smoothed_per_million New deaths attributed to COVID-19 (7-day smoothed) per 1,000,000 people. Counts can include probable deaths, where reported.
Excess mortality
Variable Description
excess_mortality Percentage difference between the reported number of weekly or monthly deaths in 2020–2021 and the projected number of deaths for the same period based on previous years. For more information, see https://github.com/owid/covid-19-data/tree/master/public/data/excess_mortality
excess_mortality_cumulative Percentage difference between the cumulative number of deaths since 1 January 2020 and the cumulative projected deaths for the same period based on previous years. For more information, see https://github.com/owid/covid-19-data/tree/master/public/data/excess_mortality
excess_mortality_cumulative_absolute Cumulative difference between the reported number of deaths since 1 January 2020 and the projected number of deaths for the same period based on previous years. For more information, see https://github.com/owid/covid-19-data/tree/master/public/data/excess_mortality
excess_mortality_cumulative_per_million Cumulative difference between the reported number of deaths since 1 January 2020 and the projected number of deaths for the same period based on previous years, per million people. For more information, see https://github.com/owid/covid-19-data/tree/master/public/data/excess_mortality
Hospital & ICU
Variable Description
icu_patients Number of COVID-19 patients in intensive care units (ICUs) on a given day
icu_patients_per_million Number of COVID-19 patients in intensive care units (ICUs) on a given day per 1,000,000 people
hosp_patients Number of COVID-19 patients in hospital on a given day
hosp_patients_per_million Number of COVID-19 patients in hospital on a given day per 1,000,000 people
weekly_icu_admissions Number of COVID-19 patients newly admitted to intensive care units (ICUs) in a given week (reporting date and the preceding 6 days)
weekly_icu_admissions_per_million Number of COVID-19 patients newly admitted to intensive care units (ICUs) in a given week per 1,000,000 people (reporting date and the preceding 6 days)
weekly_hosp_admissions Number of COVID-19 patients newly admitted to hospitals in a given week (reporting date and the preceding 6 days)
weekly_hosp_admissions_per_million Number of COVID-19 patients newly admitted to hospitals in a given week per 1,000,000 people (reporting date and the preceding 6 days)
Policy responses
Variable Description
stringency_index Government Response Stringency Index: composite measure based on 9 response indicators including school closures, workplace closures, and travel bans, rescaled to a value from 0 to 100 (100 = strictest response)
Reproduction rate
Variable Description
reproduction_rate Real-time estimate of the effective reproduction rate (R) of COVID-19. See https://github.com/crondonm/TrackingR/tree/main/Estimates-Database
Tests & positivity

On 23 June 2022, we stopped adding new datapoints to our COVID-19 testing dataset. You can read more at https://github.com/owid/covid-19-data/discussions/2667.
Variable Description
total_tests Total tests for COVID-19
new_tests New tests for COVID-19 (only calculated for consecutive days)
total_tests_per_thousand Total tests for COVID-19 per 1,000 people
new_tests_per_thousand New tests for COVID-19 per 1,000 people
new_tests_smoothed New tests for COVID-19 (7-day smoothed). For countries that don’t report testing data on a daily basis, we assume that testing changed equally on a daily basis over any periods in which no data was reported. This produces a complete series of daily figures, which is then averaged over a rolling 7-day window
new_tests_smoothed_per_thousand New tests for COVID-19 (7-day smoothed) per 1,000 people
positive_rate The share of COVID-19 tests that are positive, given as a rolling 7-day average (this is the inverse of tests_per_case)
tests_per_case Tests conducted per new confirmed case of COVID-19, given as a rolling 7-day average (this is the inverse of positive_rate)
tests_units Units used by the location to report its testing data. A country file can’t contain mixed units. All metrics concerning testing data use the specified test unit. Valid units are ‘people tested’ (number of people tested), ‘tests performed’ (number of tests performed. a single person can be tested more than once in a given day) and ‘samples tested’ (number of samples tested. In some cases, more than one sample may be required to perform a given test.)

Vaccinations
Variable Description
total_vaccinations Total number of COVID-19 vaccination doses administered
people_vaccinated Total number of people who received at least one vaccine dose
people_fully_vaccinated Total number of people who received all doses prescribed by the initial vaccination protocol
total_boosters Total number of COVID-19 vaccination booster doses administered (doses administered beyond the number prescribed by the vaccination protocol)
new_vaccinations New COVID-19 vaccination doses administered (only calculated for consecutive days)
new_vaccinations_smoothed New COVID-19 vaccination doses administered (7-day smoothed). For countries that don’t report vaccination data on a daily basis, we assume that vaccination changed equally on a daily basis over any periods in which no data was reported. This produces a complete series of daily figures, which is then averaged over a rolling 7-day window
total_vaccinations_per_hundred Total number of COVID-19 vaccination doses administered per 100 people in the total population
people_vaccinated_per_hundred Total number of people who received at least one vaccine dose per 100 people in the total population
people_fully_vaccinated_per_hundred Total number of people who received all doses prescribed by the initial vaccination protocol per 100 people in the total population
total_boosters_per_hundred Total number of COVID-19 vaccination booster doses administered per 100 people in the total population
new_vaccinations_smoothed_per_million New COVID-19 vaccination doses administered (7-day smoothed) per 1,000,000 people in the total population
new_people_vaccinated_smoothed Daily number of people receiving their first vaccine dose (7-day smoothed)
new_people_vaccinated_smoothed_per_hundred Daily number of people receiving their first vaccine dose (7-day smoothed) per 100 people in the total population
Others
Variable Description
iso_code ISO 3166-1 alpha-3 – three-letter country codes. Note that OWID-defined regions (e.g. continents like ‘Europe’) contain prefix ‘OWID_’.
continent Continent of the geographical location
location Geographical location. Location ‘International’ considers special regions (“Diamond Princess” and “MS Zaandam” cruises).
date Date of observation
population Population (latest available values). See https://github.com/owid/covid-19-data/blob/master/scripts/input/un/population_latest.csv for full list of sources
population_density Number of people divided by land area, measured in square kilometers, most recent year available
median_age Median age of the population, UN projection for 2020
aged_65_older Share of the population that is 65 years and older, most recent year available
aged_70_older Share of the population that is 70 years and older in 2015
gdp_per_capita Gross domestic product at purchasing power parity (constant 2011 international dollars), most recent year available
extreme_poverty Share of the population living in extreme poverty, most recent year available since 2010
cardiovasc_death_rate Death rate from cardiovascular disease in 2017 (annual number of deaths per 100,000 people)
diabetes_prevalence Diabetes prevalence (% of population aged 20 to 79) in 2017
female_smokers Share of women who smoke, most recent year available
male_smokers Share of men who smoke, most recent year available
handwashing_facilities Share of the population with basic handwashing facilities on premises, most recent year available
hospital_beds_per_thousand Hospital beds per 1,000 people, most recent year available since 2010
life_expectancy Life expectancy at birth in 2019
human_development_index A composite index measuring average achievement in three basic dimensions of human development—a long and healthy life, knowledge and a decent standard of living. Values for 2019, imported from http://hdr.undp.org/en/indicators/137506

Gini Coefficient

The dataset has two columns: the names of the states and the corresponding Gini indexes.

US State Vaccination Rates

  • location: name of the state or federal entity.
  • date: date of the observation.
  • total_vaccinations: total number of doses administered. This is counted as a single dose, and may not equal the total number of people vaccinated, depending on the specific dose regime (e.g. people receive multiple doses). If a person receives one dose of the vaccine, this metric goes up by 1. If they receive a second dose, it goes up by 1 again.
  • total_vaccinations_per_hundred: total_vaccinations per 100 people in the total population of the state.
  • daily_vaccinations_raw: daily change in the total number of doses administered. It is only calculated for consecutive days. This is a raw measure provided for data checks and transparency, but we strongly recommend that any analysis on daily vaccination rates be conducted using daily_vaccinations instead.
  • daily_vaccinations: new doses administered per day (7-day smoothed). For countries that don’t report data on a daily basis, we assume that doses changed equally on a daily basis over any periods in which no data was reported. This produces a complete series of daily figures, which is then averaged over a rolling 7-day window. An example of how we perform this calculation can be found here.
  • daily_vaccinations_per_million: daily_vaccinations per 1,000,000 people in the total population of the state.
  • people_vaccinated: total number of people who received at least one vaccine dose. If a person receives the first dose of a 2-dose vaccine, this metric goes up by 1. If they receive the second dose, the metric stays the same.
  • people_vaccinated_per_hundred: people_vaccinated per 100 people in the total population of the state.
  • people_fully_vaccinated: total number of people who received all doses prescribed by the initial vaccination protocol. If a person receives the first dose of a 2-dose vaccine, this metric stays the same. If they receive the second dose, the metric goes up by 1.
  • people_fully_vaccinated_per_hundred: people_fully_vaccinated per 100 people in the total population of the state.
  • total_distributed: cumulative counts of COVID-19 vaccine doses recorded as shipped in CDC’s Vaccine Tracking System.
  • total_distributed_per_hundred: cumulative counts of COVID-19 vaccine doses recorded as shipped in CDC’s Vaccine Tracking System per 100 people in the total population of the state.
  • share_doses_used: share of vaccination doses administered among those recorded as shipped in CDC’s Vaccine Tracking System.
  • total_boosters: total number of COVID-19 vaccination booster doses administered (doses administered beyond the number prescribed by the initial vaccination protocol)
  • total_boosters_per_hundred: total_boosters per 100 people in the total population.