title: “new_happiness”
author: “Enrico”
date: “2024-10-29”
output:
word_document: default
html_document: default
pdf_document: default
“`{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
{r}
library(tidyverse)
library(dplyr)
library(readr)
{r}
WHR2024 <- read.csv(“/Users/enricomathis/Desktop/Documents – Enrico’s MacBook Pro/HAPPYNESS/WHR2024.csv”)
World_Suicide_Rate_2023_exported <- read.csv(“/Users/enricomathis/Desktop/Documents – Enrico’s MacBook Pro/HAPPYNESS/suicideworldhealth.csv”)
depression_rates_by_country_2024 <- read_csv(“/Users/enricomathis/Desktop/Documents – Enrico’s MacBook Pro/HAPPYNESS/depression-rates-by-country-2024.csv”)
UVRadiationcsv <- read_csv(“/Users/enricomathis/Desktop/Documents – Enrico’s MacBook Pro/HAPPYNESS/UVRadiationcsv.csv”)
{r}
suicide <- World_Suicide_Rate_2023_exported
uv <- UVRadiationcsv
happiness <- WHR2024
depression <- depression_rates_by_country_2024
{r}
suicide <- suicide %>% rename(‘Country name’ = Country) # Adjust based on actual name
{r}
view(suicide)
{r}
Rename the column …1 to “Country name”
uv <- uv %>% rename(Country name
= ...1
)
{r}
Remove the first row, which contains non-data information
uv <- uv %>% slice(-1)
{r}
depression <- depression %>% rename(Country name
= country)
{r}
happiness <- happiness %>% rename(Country name
= Country.name)
{r}
library(dplyr)
Merging the datasets sequentially
merged_data <- happiness %>%
left_join(suicide, by = “Country name”) %>%
left_join(depression, by = “Country name”) %>%
left_join(uv, by = “Country name”)
{r}
check missing value and clean merged_data
summary(merged_data)
{r}
RENAME column
merged_data <- merged_data %>% rename(‘suicide’ = All)
{r}
Eliminate column not need it
merged_data <- merged_data %>%
select(-Female
, –Male
, –M/F
, –2000
, –Change%
, –depressionRatesByCountry_cases
)
{r}
Manually add some value
Assign a specific value to the missing ‘suicide’ rate for Czechia
merged_data <- merged_data %>%
mutate(suicide = ifelse(Country name
== “Czechia” & is.na(suicide), 9.5, suicide))
{r}
merged_data <- merged_data %>%
mutate(suicide = ifelse(Country name
== “Congo (Brazzaville)” & is.na(suicide), 11.6, suicide))
```{r}
merged_data <- merged_data %>%
mutate(suicide = ifelse(`Country name` == "Turkiye" & is.na(suicide), 2.3, suicide))
str(uv)
# Filter and display countries with missing UV radiation data
missing_uv_countries <- merged_data %>%
filter(is.na(`UV radiation`)) %>%
select(`Country name`)
# View the list of countries
print(missing_uv_countries)
# Calculate min and max values of the UV radiation column
min_uv <- min(merged_data$`UV radiation`, na.rm = TRUE)
max_uv <- max(merged_data$`UV radiation`, na.rm = TRUE)
# Apply min-max scaling to fit 0-10 scale
merged_data <- merged_data %>%
mutate(Scaled_UV = 10 * (`UV radiation` - min_uv) / (max_uv - min_uv))
# Round the Scaled_UV column to 2 decimal places
merged_data <- merged_data %>%
mutate(Scaled_UV = round(Scaled_UV, 2))
# View the country names and Scaled_UV columns in merged_data
View(merged_data %>% select(`Country name`, Scaled_UV))
# Load dplyr package
library(dplyr)
# Update missing UV radiation values for each specified country
merged_data <- merged_data %>%
mutate(`UV radiation` = case_when(
`Country name` == "United Kingdom" ~ 1.46,
`Country name` == "United States" ~ 4.44,
`Country name` == "Kosovo" ~ 1.63,
`Country name` == "Taiwan Province of China" ~ 5.89,
`Country name` == "Serbia" ~ 1.63,
`Country name` == "South Korea" ~ 3.05,
`Country name` == "Vietnam" ~ 8.32,
`Country name` == "Moldova" ~ 1.63,
`Country name` == "Russia" ~ 1.64,
`Country name` == "Bolivia" ~ 7.83,
`Country name` == "Montenegro" ~ 2.43,
`Country name` == "Venezuela" ~ 7.74,
`Country name` == "Hong Kong S.A.R. of China" ~ 5.89,
`Country name` == "Congo (Brazzaville)" ~ 8.90,
`Country name` == "Laos" ~ 8.32,
`Country name` == "Ivory Coast" ~ 8.20,
`Country name` == "Iran" ~ 5.59,
`Country name` == "State of Palestine" ~ 5.59,
`Country name` == "Tanzania" ~ 9.44,
`Country name` == "Congo (Kinshasa)" ~ 8.90,
TRUE ~ `UV radiation` # Keep existing values for other countries
))
# View unique values in Country name to check for any issues
unique(merged_data$`Country name`)
# Load dplyr if not loaded
library(dplyr)
# Update missing UV radiation values for each specified country with modified approach
merged_data <- merged_data %>%
mutate(`UV radiation` = case_when(
trimws(`Country name`) == "United Kingdom" ~ 1.46,
trimws(`Country name`) == "United States" ~ 4.44,
trimws(`Country name`) == "Kosovo" ~ 1.63,
trimws(`Country name`) == "Taiwan Province of China" ~ 5.89,
trimws(`Country name`) == "Serbia" ~ 1.63,
trimws(`Country name`) == "South Korea" ~ 3.05,
trimws(`Country name`) == "Vietnam" ~ 8.32,
trimws(`Country name`) == "Moldova" ~ 1.63,
trimws(`Country name`) == "Russia" ~ 1.64,
trimws(`Country name`) == "Bolivia" ~ 7.83,
trimws(`Country name`) == "Montenegro" ~ 2.43,
trimws(`Country name`) == "Venezuela" ~ 7.74,
trimws(`Country name`) == "Hong Kong S.A.R. of China" ~ 5.89,
trimws(`Country name`) == "Congo (Brazzaville)" ~ 8.90,
trimws(`Country name`) == "Laos" ~ 8.32,
trimws(`Country name`) == "Ivory Coast" ~ 8.20,
trimws(`Country name`) == "Iran" ~ 5.59,
trimws(`Country name`) == "State of Palestine" ~ 5.59,
trimws(`Country name`) == "Tanzania" ~ 9.44,
trimws(`Country name`) == "Congo (Kinshasa)" ~ 8.90,
TRUE ~ `UV radiation` # Keep existing values for other countries
))
# Display rows for specified countries to verify updates
merged_data %>% filter(`Country name` %in% c(
"United Kingdom", "United States", "Kosovo", "Taiwan Province of China", "Serbia",
"South Korea", "Vietnam", "Moldova", "Russia", "Bolivia", "Montenegro",
"Venezuela", "Hong Kong S.A.R. of China", "Congo (Brazzaville)", "Laos",
"Ivory Coast", "Iran", "State of Palestine", "Tanzania", "Congo (Kinshasa)"
))
# Count missing values in the 'UV radiation' column
missing_count <- sum(is.na(merged_data$`UV radiation`))
print(missing_count)
# Update missing Scaled_UV values with approximations
merged_data <- merged_data %>%
mutate(Scaled_UV = case_when(
`Country name` == "United Kingdom" ~ 1.46,
`Country name` == "United States" ~ 4.44,
`Country name` == "Kosovo" ~ 1.63,
`Country name` == "Taiwan Province of China" ~ 5.89,
`Country name` == "Serbia" ~ 1.63,
`Country name` == "South Korea" ~ 3.05,
`Country name` == "Vietnam" ~ 8.32,
`Country name` == "Moldova" ~ 1.63,
`Country name` == "Russia" ~ 1.64,
`Country name` == "Bolivia" ~ 7.83,
`Country name` == "Montenegro" ~ 2.43,
`Country name` == "Venezuela" ~ 7.74,
`Country name` == "Hong Kong S.A.R. of China" ~ 5.89,
`Country name` == "Congo (Brazzaville)" ~ 8.90,
`Country name` == "Laos" ~ 8.32,
`Country name` == "Ivory Coast" ~ 8.20,
`Country name` == "Iran" ~ 5.59,
`Country name` == "State of Palestine" ~ 5.59,
`Country name` == "Tanzania" ~ 9.44,
`Country name` == "Congo (Kinshasa)" ~ 8.90,
TRUE ~ Scaled_UV # Keep existing values for other countries
))
# Add scaled UV value for the Netherlands
merged_data <- merged_data %>%
mutate(Scaled_UV = ifelse(`Country name` == "Netherlands" & is.na(Scaled_UV), 1.34, Scaled_UV))
summary(merged_data)
# Save the cleaned dataset as a CSV file
write.csv(merged_data, "cleaned_dataset.csv", row.names = FALSE)
Review Missing Values: Check for any remaining missing values across all columns, especially in key metrics (e.g., Ladder.score, suicide, depressionRatesByCountry_prevalence, UV radiation).
# View structure and summary
str(merged_data)
summary(merged_data)
Investigate Missing Data Patterns
Pattern Analysis: Examine if certain countries or types of data tend to have more missing values, which may indicate an underlying reason (e.g., regional factors or lack of data reporting).
Code in R:
library(naniar)
install.packages("gt")
install.packages("gtExtras")
vis_miss(merged_data)
# Get summary of each numeric column in merged_data
summary(merged_data)
# Check the range of each numeric column to identify potential scaling issues
sapply(merged_data, function(x) if(is.numeric(x)) range(x, na.rm = TRUE))
# Load the mice package for multiple imputation
library(mice)
# Perform multiple imputation with Predictive Mean Matching (PMM)
imputed_data <- mice(merged_data, method = "pmm", m = 5, maxit = 10, seed = 500)
# Extract the completed data (first imputed dataset by default)
merged_data <- complete(imputed_data)
# Replace spaces in column names with underscores
colnames(merged_data) <- gsub(" ", "_", colnames(merged_data))
# Load the mice package and run imputation
library(mice)
imputed_data <- mice(merged_data, method = "pmm", m = 5, maxit = 10, seed = 500)
merged_data <- complete(imputed_data)
# you can visualize the imputed values using mice's diagnostic plots
stripplot(imputed_data, pch = 20, cex = 1.2)
summary(merged_data)
stripplot(imputed_data, pch = 20, cex = 1.2)
# List of columns to normalize
columns_to_normalize <- c("Ladder.score", "suicide", "Scaled_UV", "depressionRatesByCountry_prevalence")
# Apply min-max normalization for each selected column
merged_data[columns_to_normalize] <- lapply(merged_data[columns_to_normalize], function(x) {
(x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
})
# Check the summary to confirm normalization
summary(merged_data[columns_to_normalize])
# Calculate the new Ladder.score
merged_data$new_ladder_score <- merged_data$Ladder.score +
merged_data$Scaled_UV -
merged_data$suicide -
merged_data$depressionRatesByCountry_prevalence
# Review the summary of the new score to ensure it looks reasonable
summary(merged_data$new_ladder_score)
Explanation of the Code
Normalized_GDP: This column represents the normalized GDP influence, scaled from 0 to 1.
adjusted_ladder_score: This new score is recalculated to reflect a “happiness without GDP influence” by subtracting the normalized GDP contribution from the previously calculated new_ladder_score.
Final Note
After recalculating adjusted_ladder_score, you have a happiness score that excludes the positive influence of GDP. This score can now be compared to the original Ladder.score and new_ladder_score to analyze the hypothetical impact of economic factors on happiness.
# Install dplyr if it's not already installed
if (!require(dplyr)) install.packages("dplyr")
# Load dplyr
library(dplyr)
# Now, you can run the code
merged_data <- merged_data %>%
mutate(Normalized_GDP = (Explained.by..Log.GDP.per.capita - min(Explained.by..Log.GDP.per.capita, na.rm = TRUE)) /
(max(Explained.by..Log.GDP.per.capita, na.rm = TRUE) - min(Explained.by..Log.GDP.per.capita, na.rm = TRUE)))
# Recalculate the new ladder score by excluding the GDP factor
merged_data <- merged_data %>%
mutate(
adjusted_ladder_score = new_ladder_score - Normalized_GDP
)
summary(merged_data)
# Load necessary libraries
if (!requireNamespace("tidyr", quietly = TRUE)) install.packages("tidyr")
library(dplyr)
library(tidyr)
library(ggplot2)
# `merged_data` is my dataframe with normalized columns
# List of columns to calculate standard deviation
key_metrics <- c("Ladder.score", "suicide", "Scaled_UV", "depressionRatesByCountry_prevalence")
# Calculate standard deviation for each metric
std_dev_data <- merged_data %>%
select(all_of(key_metrics)) %>%
summarise(across(everything(), sd, na.rm = TRUE)) %>%
pivot_longer(cols = everything(), names_to = "Metric", values_to = "Standard_Deviation")
# Create a bar plot using ggplot2 to visualize standard deviations
ggplot(std_dev_data, aes(x = Metric, y = Standard_Deviation, fill = Metric)) +
geom_bar(stat = "identity") +
labs(title = "Standard Deviation of Key Metrics",
x = "Metric",
y = "Standard Deviation") +
theme_minimal() +
theme(legend.position = "none")
In your analysis, Scaled_UV effectively assumes the role that Explained.by..Log.GDP.per.capita might have played in a typical happiness analysis. Since I excluded GDP per capita, Scaled_UV (which has high variability and therefore high weight in variance-sensitive calculations) can dominate the analysis, impacting the composite scores significantly.
Here’s a summary of how Scaled_UV’s influence might resemble that of GDP per capita:
High Variability as a Dominant Factor: Just as GDP per capita often has a large range across countries, resulting in high variability, Scaled_UV shows similar high variability. Metrics with high variance can heavily influence composite scores or weighted analyses because they contribute more to the “spread” in the data. This means that, just as GDP per capita might create notable differences in happiness scores due to economic disparities, Scaled_UV might create similar differences based on sunlight exposure levels across countries.
Weighted Influence: In the absence of GDP, Scaled_UV could act as a powerful predictor for happiness, especially in analyses looking for environmental or lifestyle correlates. Its weight in any unadjusted calculation is significant because of its spread. If other metrics have lower variability, their influence on the final score or correlation might appear muted compared to Scaled_UV.
Potential Bias: Relying heavily on Scaled_UV as a dominant factor might introduce a bias towards environmental conditions over socio-economic ones. This could skew interpretations, especially if the analysis aims to capture a holistic view of happiness influenced by both economic and environmental factors.
# Save the current state of the dataset to a CSV file
write.csv(merged_data, "final_processed_dataset.csv", row.names = FALSE)
data <- read.csv("/Users/enricomathis/Desktop/Documents - Enrico’s MacBook Pro/HAPPYNESS/final_processed_dataset.csv")
# Rename columns to shorter, more descriptive names
data <- data %>%
rename(
Country = `Country_name`,
Happiness = `Ladder.score`,
Upper_Happiness = `upperwhisker`,
Lower_Happiness = `lowerwhisker`,
GDP = `Explained.by..Log.GDP.per.capita`,
Social_Support = `Explained.by..Social.support`,
Life_Expectancy = `Explained.by..Healthy.life.expectancy`,
Freedom = `Explained.by..Freedom.to.make.life.choices`,
Generosity = `Explained.by..Generosity`,
Corruption = `Explained.by..Perceptions.of.corruption`,
Dystopia_Residual = `Dystopia...residual`,
Suicide = `suicide`,
Depression = `depressionRatesByCountry_prevalence`,
UV_Radiation = `UV_radiation`,
Scaled_UV = `Scaled_UV`,
New_Happiness = `new_ladder_score`,
Normalized_GDP = `Normalized_GDP`,
Adjusted_Happiness = `adjusted_ladder_score`
)
# Check the result to confirm changes
colnames(data)
view(data)
# Remove the specified columns from the dataset
data <- data %>%
select(-c(UV_Radiation, Normalized_GDP, GDP))
# Confirm the columns have been removed
colnames(data)
Happiness: The original or baseline happiness score.
New_Happiness: Happiness score adjusted for mental health and environmental factors.
Adjusted_Happiness: Further refined score excluding GDP’s impact, emphasizing non-economic aspects of happiness.
data <- data %>% select(-Happiness, -New_Happiness)
# To reorder the columns and place Adjusted_Happiness as the second column after Country.
data <- data %>% select(Country, Adjusted_Happiness, everything())
# Explanation:
# This code normalizes the `Social_Support` column in the cleaned `data` dataset.
# It applies min-max normalization specifically to `Social_Support`, placing it on a 0–1 scale for comparability.
# The `Upper_Happiness` and `Lower_Happiness` columns are retained as they are confidence interval bounds.
# Load necessary library
library(dplyr)
# Normalize `Social_Support` column in `data`
data <- data %>%
mutate(Social_Support = (Social_Support - min(Social_Support, na.rm = TRUE)) /
(max(Social_Support, na.rm = TRUE) - min(Social_Support, na.rm = TRUE)))
# Display summary to confirm normalization
summary(data$Social_Support)
# Why Check for Outliers?
#Outliers are data points that differ significantly from other observations. They’re important to examine because they can:
# Skew results in analysis, particularly in averages and regression models.
#Distort visualizations by stretching scales, hiding patterns in the main dataset.
# Indicate data quality issues or provide insights into unique cases.
library(ggplot2)
# Boxplot for Adjusted_Happiness to visualize outliers
ggplot(data, aes(x = "", y = Adjusted_Happiness)) +
geom_boxplot(fill = "steelblue", outlier.color = "red") +
labs(title = "Outliers in Adjusted Happiness", y = "Adjusted Happiness Score") +
theme_minimal()
# Boxplot for Social_Support
ggplot(data, aes(x = "", y = Social_Support)) +
geom_boxplot(outlier.color = "red") +
labs(title = "Outliers in Social Support", y = "Social Support") +
theme_minimal()
Q1 <- quantile(data$Social_Support, 0.25)
Q3 <- quantile(data$Social_Support, 0.75)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
outliers_social_support <- data %>%
filter(Social_Support < lower_bound) %>%
select(Country, Social_Support)
print(outliers_social_support)
The contextual investigation reveals distinct socio-economic challenges that contribute to the low “Social Support” scores in both Afghanistan and Benin. Here’s a summary of the findings:
- Benin:
- Social Support Systems: Although efforts to expand social support and safety nets exist, such as a recent World Bank initiative, coverage remains limited. The country’s high poverty rate (around 36.2%) and significant underemployment (72% underemployed) point to systemic economic vulnerabilities that limit available social resources for many citizens【155†source】【156†source】.
- Economic Factors: Benin’s economy relies heavily on informal labor and low-value exports, leading to economic fragility. Although economic growth is stable, social and economic resilience remains weak, which constrains the development of robust social support mechanisms.
- Afghanistan:
- Impact of Conflict and Political Instability: The long-standing conflict and recent Taliban takeover have caused economic collapse and severe restrictions on social freedoms. While some improvements in security have been observed, economic stability is precarious and dependent on international aid【157†source】.
- Education and Social Restrictions: Social restrictions, especially on education and labor participation, further reduce community-based support systems, especially affecting women and youth. The high levels of poverty and food insecurity limit the capacity for a supportive social network.
Analytical Implications
For a data analyst, these insights suggest that the low “Social Support” scores for these countries are not statistical outliers in the traditional sense but reflect significant, context-driven socio-economic issues. A professional approach would consider retaining these scores as valid representations of unique, contextual challenges rather than anomalies, as removing or altering these data points could lead to bias by obscuring meaningful socio-political realities.
In terms of analytical practice:
- Interpret with Caution: These outliers should be flagged for stakeholders to consider the socio-political context in decision-making.
- Documentation: Note the socio-economic context in any analysis report to provide clarity on why these scores are unusually low.
colnames (data)
# Calculate Q1, Q3, and IQR for Life_Expectancy
Q1 <- quantile(data$Life_Expectancy, 0.25, na.rm = TRUE)
Q3 <- quantile(data$Life_Expectancy, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
# Define outlier thresholds
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
# Identify outliers
outliers <- data %>%
filter(Life_Expectancy < lower_bound | Life_Expectancy > upper_bound) %>%
select(Country, Life_Expectancy)
# Print outliers
print(outliers)
# Box plot for Life_Expectancy with outliers highlighted
ggplot(data, aes(x = "", y = Life_Expectancy)) +
geom_boxplot(outlier.color = "red") +
labs(title = "Outliers in Life Expectancy", y = "Life Expectancy") +
theme_minimal()
# Calculate the first and third quartiles and the IQR for Life Expectancy
Q1 <- quantile(data$Life_Expectancy, 0.25, na.rm = TRUE)
Q3 <- quantile(data$Life_Expectancy, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
# Define the lower bound for detecting outliers
lower_bound <- Q1 - 1.5 * IQR
# Filter for outliers in Life Expectancy
life_expectancy_outlier <- data %>% filter(Life_Expectancy < lower_bound)
# View the country with the Life Expectancy outlier
print(life_expectancy_outlier %>% select(Country, Life_Expectancy))
using the normalized life expectancy value of Nigeria as a proxy for Lesotho is a practical and transparent solution, especially since our dataset is already normalized. Here’s a step-by-step explanation for transparency in my research:
Rationale for Using Nigeria’s Normalized Value for Lesotho
- Contextual Research and Similarities:
- According to Macrotrends, Nigeria and Lesotho share similar life expectancy levels, both reflecting challenging health and socioeconomic conditions. This similarity allows us to assume that, for analytical purposes, Nigeria’s normalized life expectancy is a reasonable substitute for Lesotho.
- Dataset Consistency:
- Since our dataset has been normalized, directly inputting a raw, unnormalized value like 53 years for Lesotho would disrupt the scale of analysis. Re-normalizing the dataset would be time-consuming and could introduce further inconsistencies.
- By using Nigeria’s normalized value, we maintain the dataset’s consistency without additional transformations.
- Transparency and Justification:
- We document this substitution, explaining that Nigeria’s normalized value was used due to its similar life expectancy profile, based on external verification from Macrotrends.
- This approach is transparent and defensible, as we are using a comparable country’s value to maintain analytical rigor.
Explanation for Transparency in Reporting
In the final report, you could include the following explanation:
“To address missing or anomalous life expectancy data for Lesotho (recorded as 0), we referred to Macrotrends, which indicated that Nigeria and Lesotho share similar life expectancy levels. To maintain consistency in our normalized dataset, we replaced Lesotho’s life expectancy with the normalized value from Nigeria. This adjustment ensures the dataset’s integrity and avoids introducing outliers or rescaling issues.”
R Code to Implement the Substitution
Here’s the R code to make this adjustment:
# Replace Lesotho's normalized Life_Expectancy with Nigeria's normalized value
data$Life_Expectancy[data$Country == "Lesotho"] <-
data$Life_Expectancy[data$Country == "Nigeria"]
view(data)
# Load necessary libraries
library(ggplot2)
library(dplyr)
# Calculate Q1, Q3, and IQR
Q1 <- quantile(data$Freedom, 0.25, na.rm = TRUE)
Q3 <- quantile(data$Freedom, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
# Calculate the lower and upper bounds for outliers
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
# Identify outliers
outliers_freedom <- data %>% filter(Freedom < lower_bound | Freedom > upper_bound)
# Display the outlier countries
print(outliers_freedom %>% select(Country, Freedom))
# Plot boxplot for the Freedom variable with outliers
ggplot(data, aes(x = "", y = Freedom)) +
geom_boxplot(outlier.color = "red") +
labs(title = "Outliers in Freedom", y = "Freedom") +
theme_minimal()
# Display the identified outlier countries in the 'Freedom' variable
outliers_freedom <- data %>% filter(Freedom < lower_bound)
print(outliers_freedom %>% select(Country, Freedom))
Certainly. Here’s a concise explanation that you could include in your research:
Rationale for Retaining Outliers in Freedom Score
In analyzing the “Freedom” variable within the dataset, I identified several low outlier values, particularly for countries such as Afghanistan, Turkiye, Comoros, and Lebanon. After careful consideration, I decided to retain these outliers for the following reasons:
Firstly, these freedom scores are derived from survey data provided by the World Happiness Report, a reputable source that bases its findings on respondents’ perceptions. These low scores likely reflect genuine societal conditions and the citizens’ lived experiences regarding their perceived freedom in these regions. Removing or altering these values would risk misrepresenting the realities captured in the survey responses.
Moreover, the presence of these outliers adds valuable insights to the analysis, highlighting how perceptions of freedom vary globally and may influence other factors related to happiness. By keeping these values, I maintain the dataset’s integrity and allow for a more comprehensive exploration of how freedom—an essential component of happiness—correlates with other metrics across different contexts.
In summary, retaining these outliers supports the authenticity of the data and aligns with a transparent and respectful approach to the original responses collected in the World Happiness Report. This decision ensures that the analysis accurately reflects real-world conditions, enhancing the relevance and depth of the findings.
This explanation provides a clear, research-focused justification for keeping the outliers and supports the transparency of your analysis process.
# Boxplot for Generosity to identify outliers
ggplot(data, aes(x = "", y = Generosity)) +
geom_boxplot(outlier.color = "red") +
labs(title = "Outliers in Generosity", y = "Generosity") +
theme_minimal()
# Identify the country with the outlier in Generosity
outlier_country <- data %>%
filter(Generosity > 0.3) %>%
select(Country, Generosity)
# Display the result
print(outlier_country)
In examining the ‘Generosity’ variable, we identified three countries—Indonesia, Gambia, and Myanmar—with notably high scores. These values are consistent with known cultural practices and societal values within these regions. After careful consideration, we decided to retain these outliers to maintain the integrity of cultural differences reflected in the data. This decision aligns with the World Happiness Report’s intent to capture and respect cultural variances in generosity measures.
# Boxplot for Corruption to identify outliers
ggplot(data, aes(x = "", y = Corruption)) +
geom_boxplot(outlier.color = "red") +
labs(title = "Outliers in Corruption", y = "Corruption") +
theme_minimal()
# Calculate the Interquartile Range (IQR) for "Corruption"
Q1 <- quantile(data$Corruption, 0.25, na.rm = TRUE)
Q3 <- quantile(data$Corruption, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
# Define the upper threshold for outliers (1.5 * IQR above Q3)
upper_threshold <- Q3 + 1.5 * IQR
# Filter out the countries where Corruption is above the upper threshold
outliers_corruption <- data %>% filter(Corruption > upper_threshold)
# Display the results
outliers_corruption %>% select(Country, Corruption)
The ‘Corruption’ variable shows notable outliers that represent countries with exceptionally low levels of perceived corruption, including Finland, Denmark, and Singapore. These values are retained in the dataset, as they align with internationally recognized indices and reflect genuine societal characteristics rather than data anomalies. Including these high-performing countries allows for valuable benchmarking and insights into the effects of low corruption on societal well-being.
# Boxplot for Suicide to identify outliers
ggplot(data, aes(x = "", y = Suicide)) +
geom_boxplot(outlier.color = "red") +
labs(title = "Outliers in Suicide", y = "Suicide") +
theme_minimal()
# Calculate Q1, Q3, and IQR for the "Suicide" variable
Q1 <- quantile(data$Suicide, 0.25, na.rm = TRUE)
Q3 <- quantile(data$Suicide, 0.75, na.rm = TRUE)
IQR_suicide <- Q3 - Q1
# Define the upper bound for the "Suicide" variable
upper_bound_suicide <- Q3 + 1.5 * IQR_suicide
# Filter the data for outliers in the "Suicide" column
suicide_outliers <- data %>%
filter(Suicide > upper_bound_suicide) %>%
select(Country, Suicide)
# Print the result
print(suicide_outliers)
Based on this dataset from the World Bank, we can assess the suicide mortality rates in various countries for 2019 and compare them to our current dataset. Here’s how a professional analyst would approach verifying the outliers identified in the “Suicide” variable of our dataset:
- Matching Rates for Outliers: Identify the suicide rates for the countries that showed up as outliers in our dataset, such as Lithuania, South Korea, and Lesotho.
- Benchmarking: Compare these countries’ reported rates with the World Bank data to confirm if our dataset aligns with authoritative sources. For example:
- Lithuania has a reported suicide rate of 26.1 per 100,000 in the World Bank data, which corroborates its high position.
- South Korea is listed with a rate of 28.6, further validating its high suicide rate.
- Lesotho has an exceptionally high rate of 72.4, which aligns with it being an extreme outlier.
- Conclusion and Note for Presentation:
Based on the World Bank’s data, it appears that our outliers are accurate reflections of real-world statistics rather than data errors. Including this external validation from the World Bank in the analysis enhances the credibility of our findings by confirming the reliability of the extreme values in the “Suicide” variable. This supports the decision to keep these outliers, as they represent genuine variations in suicide rates across countries. Source: Suicide mortality rate data for 2019 from the World Bank’s official dataset (accessed via genderdata.worldbank.org).
# Boxplot for Depression to identify outliers
ggplot(data, aes(x = "", y = Depression)) +
geom_boxplot(outlier.color = "red") +
labs(title = "Outliers in Depression" , y = "Depression") +
theme_minimal()
Measurement Differences: “Depression” and “Suicide” might be measured differently. Depression rates might be based on reported cases or surveys, which depend on healthcare access, mental health awareness, and social stigma. In contrast, suicide rates are objective counts of incidents, which may be less affected by underreporting but do not necessarily capture all underlying mental health conditions.
Cultural and Societal Influences: In some countries, high depression rates might not translate to high suicide rates due to strong social or religious taboos against suicide, supportive community structures, or coping mechanisms. Conversely, in countries with low depression awareness or treatment access, people may be less likely to report or be diagnosed with depression, even if suicide rates are high.
Economic and Environmental Stressors: High suicide rates may sometimes correlate with economic hardships, political instability, or other stressors unrelated to clinical depression. These situational factors can influence suicide rates independently of diagnosed mental health conditions.
Data Quality and Reporting Biases: In countries where mental health resources and awareness are limited, depression may be underdiagnosed, meaning the official “Depression” metric could be underestimating the true prevalence. Suicide data might be more accurately recorded due to its tangible nature.
Observation: There is a noticeable lack of correlation between high suicide rates and depression outliers in some countries. This could be due to measurement differences, cultural factors, economic stressors, or data reporting biases. This suggests the need for cautious interpretation and consideration of context when analyzing these variables.
# Boxplot for Scaled_UV to identify outliers
ggplot(data, aes(x = "", y = Scaled_UV)) +
geom_boxplot(outlier.color = "red") +
labs(title = "Outliers in Scaled UV Radiation", y = "Scaled UV") +
theme_minimal()
Adjusted Happiness – No significant outliers.
Social Support – Outliers examined, kept due to contextual significance.
Life Expectancy – Outlier for Lesotho adjusted with external data.
Freedom – Outliers in low-scoring countries examined and retained.
Generosity – High outlier in Myanmar; contextual understanding noted.
Corruption – High outliers in countries with lower perceived corruption levels; retained for analysis.
Suicide – Outliers verified and retained.
Depression – No significant outliers observed.
Scaled UV – No significant outliers observed.
Upper Happiness and Lower Happiness – Confidence interval data, not analyzed as outliers.
NEXT Correlation Analysis
Perform correlation analysis to examine relationships between variables.
This can help identify which factors have the strongest associations with Adjusted Happiness.
- Missing Data Handling (if any remains)
Review if any variables have missing values and decide on imputation or removal. - Descriptive Statistics and Distributions
Plot histograms or density plots to understand the distribution of each key variable.
This step helps ensure normality assumptions or guides us on transformations if needed.
# Save the data set for security reason
write.csv(data, "final_processed_dataset.csv", row.names = FALSE)
str(data)
install.packages("openxlsx")
library(openxlsx)
# Round numeric columns to 3 decimal places
data <- data %>% mutate(across(where(is.numeric), ~ round(., 3)))
# Write the rounded data to an Excel file
write.xlsx(data, "final_processed_dataset.xlsx")
# check for missing value
sum(is.na(data))
library(dplyr)
library(tidyverse)
# Summary of zeros in each column
zero_counts <- sapply(data, function(x) sum(x == 0, na.rm = TRUE))
print(zero_counts)
# Filter the data set to find rows where Social_Support is zero
zero_social_support <- data %>% filter(Social_Support == 0)
# Display the result
print(zero_social_support)
Data Integrity and Retention of Original Values
In reviewing the dataset, a zero value was identified in the Social_Support variable. After verifying against the original dataset provided by the World Happiness Report, it was confirmed that this zero value was indeed part of the source data. As this value reflects the organization’s original measurement, I have retained it without alteration to preserve the integrity of the dataset. This approach ensures consistency with the source material and allows for an analysis aligned with the original data structure.
freedom_zero<- data %>% filter(Freedom == 0)
print(freedom_zero)
Retention of Original Values
The Freedom variable includes a zero for Afghanistan, confirmed as accurate from the World Happiness Report dataset. This value is retained to ensure alignment with the original data
Generosity_zero<- data %>% filter(Generosity ==0)
print(Generosity_zero)
Retention of Original Values
The Generosity variable includes a zero for one country, verified as accurate in the World Happiness Report dataset. This value is retained to maintain consistency with the original data.
# Load dplyr for the filter function if not already loaded
library(dplyr)
# Filter rows where Corruption is 0
Corruption_zero <- data %>% filter(Corruption == 0)
print(Corruption_zero)
Note: The Corruption value for Bosnia and Herzegovina is recorded as 0 in the official World Happiness Report dataset. For consistency with the source data and to maintain accuracy, this value has been retained without alteration
Suicide_zero<- data %>% filter(Suicide==0)
print(Suicide_zero)
Brief Explanation for Documentation
For consistency and accuracy in our analysis, we updated Jordan’s suicide rate with a normalized value based on the World Bank data, which reports a suicide rate of 1.6 per 100,000 for Jordan in 2019. The normalized value was calculated using the existing min and max range in our dataset, ensuring comparability across countries. This transformation allows us to retain the integrity of the World Bank’s official data within our dataset.
# Step 1: Calculate Min and Max of Suicide column
min_suicide <- min(data$Suicide, na.rm = TRUE)
max_suicide <- max(data$Suicide, na.rm = TRUE)
# Step 2: Normalize the given suicide rate for Jordan
# Given suicide rate for Jordan in 2019 is 1.6 per 100,000 population
jordan_suicide_rate <- 1.6
normalized_jordan_suicide <- (jordan_suicide_rate - min_suicide) / (max_suicide - min_suicide)
# Step 3: Update the Suicide value for Jordan in the dataset with the normalized value
data <- data %>% mutate(Suicide = ifelse(Country == "Jordan", normalized_jordan_suicide, Suicide))
# Print the updated value for Jordan to confirm
print(data %>% filter(Country == "Jordan"))
# Filter the dataset for rows where the Depression column has zero values
depression_zeros <- data %>% filter(Depression == 0)
# Print the rows with zero values in the Depression column to identify them
print(depression_zeros)
Handling of Depression Data for Nepal and Laos:
During our data review, it was observed that the original depression data for Nepal and Laos may have been calculated using a different method or format than the rest of the dataset, possibly impacting consistency and comparability. As our primary objective is to ensure data integrity and avoid introducing bias, we have chosen to set the depression values for these countries to NA. This approach prevents these data points from influencing the overall analysis while allowing for potential imputation or further investigation if necessary.
Sources and methods used in the data collection for other countries remain consistent and are therefore retained.
# Brief Explanation:
# The following steps will set depression values for Nepal and Laos to NA to avoid bias.
# These countries had inconsistent or incomplete data for the depression variable.
# Check current depression values for Nepal and Laos
data %>% filter(Country %in% c("Nepal", "Laos")) %>% select(Country, Depression)
# Update the depression column by setting values for Nepal and Laos to NA
data <- data %>%
mutate(Depression = ifelse(Country %in% c("Nepal", "Laos"), NA, Depression))
# Verify changes to confirm Nepal and Laos have NA in Depression column
data %>% filter(Country %in% c("Nepal", "Laos")) %>% select(Country, Depression)
# Note: This step was taken to ensure unbiased analysis, as incomplete or inconsistent
# depression data may skew results in the final analysis.
Scaled_UV_zero<- data %>% filter(Scaled_UV==0)
print(Scaled_UV_zero)
# Check Norway's Scaled_UV to use as a reference for Iceland
norway_uv <- data %>% filter(Country == "Norway") %>% pull(Scaled_UV)
# Replace Iceland's Scaled_UV with Norway's Scaled_UV
data <- data %>% mutate(Scaled_UV = ifelse(Country == "Iceland" & Scaled_UV == 0, norway_uv, Scaled_UV))
# Document the choice
# This code replaces Iceland's missing Scaled_UV value with Norway's,
# a nearby country with a similar UV range, to maintain data consistency.
Outliers Checked and Documented: We identified and reviewed outliers across key variables, made notes on why they were retained or adjusted.
Missing Values Addressed: We’ve handled missing or zero values by either imputing reasonable values, referencing reliable sources, or setting them as NA to prevent bias.
Normalization Completed: All variables that required normalization have been normalized consistently.
Variable Names Standardized: We renamed columns for clarity, making them ready for interpretation and analysis in tools like Tableau.
Dataset Saved: We saved the cleaned, processed dataset to avoid redoing any steps.
write.csv(data, "final_processed_dataset.csv", row.names = FALSE)
view(data)
Descriptive Statistics
Purpose: Gain a general understanding of the data distribution for each variable.
Actions: Calculate the mean, median, standard deviation, and range for each variable.
Visualization: Histograms or box plots for individual variables to observe their distribution.
# Summary statistics for each normalized numeric variable, excluding unnormalized columns
descriptive_stats <- data %>%
select_if(is.numeric) %>%
select(-Upper_Happiness, -Lower_Happiness) %>%
summarise_all(list(
Mean = ~mean(., na.rm = TRUE),
Median = ~median(., na.rm = TRUE),
SD = ~sd(., na.rm = TRUE),
Min = ~min(., na.rm = TRUE),
Max = ~max(., na.rm = TRUE)
))
print(descriptive_stats)
# Loop to create histograms for each relevant normalized numeric variable
library(ggplot2)
# Select only numeric variables excluding `Upper_Happiness` and `Lower_Happiness`
numeric_vars <- names(select(data, where(is.numeric), -Upper_Happiness, -Lower_Happiness))
# Loop through each variable to create histograms
for (var in numeric_vars) {
p <- ggplot(data, aes_string(x = var)) +
geom_histogram(binwidth = 0.1, fill = "steelblue", color = "black") +
theme_minimal() +
labs(title = paste("Histogram of", var), x = var, y = "Frequency") +
theme(plot.title = element_text(hjust = 0.5))
print(p)
}
Justification for Retaining Original Distributions:
Reflective of Reality: The skewed or non-uniform distributions align with expected global patterns in socioeconomic and psychological variables. For example, extreme values in suicide rates or corruption levels occur in a few countries but are not representative globally.
Interpretive Value: By keeping the original distributions, we preserve the natural variation and allow for insights into how these metrics are distributed worldwide. This helps maintain the interpretative validity of each variable.
Impact on Analysis: While normality assumptions are important for specific statistical tests, not all analyses require normalized data. For analyses sensitive to non-normality (e.g., parametric tests), we can apply data transformations on a case-by-case basis if needed.
Conclusion: Therefore, we retain all variables in their original form, as their distributions capture meaningful, real-world patterns. This approach maintains the authenticity of the data and provides a more accurate representation of each variable’s distribution on a global scale.
install.packages("reshape2", repos = "http://cran.rstudio.com/")
# Load necessary libraries for visualization
library(ggplot2)
library(reshape2)
# Select only numeric variables, excluding `Upper_Happiness` and `Lower_Happiness`
numeric_data <- data %>% select_if(is.numeric) %>% select(-Upper_Happiness, -Lower_Happiness)
# Calculate the correlation matrix
correlation_matrix <- cor(numeric_data, use = "complete.obs")
# Print correlation matrix to view raw values
print(correlation_matrix)
# Visualize the correlation matrix using a heatmap
# Melt the correlation matrix for ggplot
melted_corr <- melt(correlation_matrix)
# Plot the heatmap
ggplot(data = melted_corr, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "red", high = "blue", mid = "white",
midpoint = 0, limit = c(-1, 1), space = "Lab",
name="Correlation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 10, hjust = 1)) +
labs(title = "Correlation Heatmap of Variables", x = "", y = "")
# Load necessary library
library(ggplot2)
# List of variables to compare with Adjusted_Happiness
variables <- c("Social_Support", "Life_Expectancy", "Freedom", "Generosity",
"Corruption", "Dystopia_Residual", "Suicide", "Depression", "Scaled_UV")
# Loop to create scatter plots with trend lines for each variable against Adjusted_Happiness
for (var in variables) {
# Create the scatter plot with trend line
plot <- ggplot(data, aes_string(x = var, y = "Adjusted_Happiness")) +
geom_point(color = "steelblue") +
geom_smooth(method = "lm", color = "red", se = FALSE) +
theme_minimal() +
labs(title = paste("Adjusted Happiness vs", var),
x = var,
y = "Adjusted Happiness") +
theme(plot.title = element_text(hjust = 0.5))
# Print each plot
print(plot)
}
summary(data_standardized$Corruption)
# Load necessary libraries
library(dplyr)
# Step 2: Exclude Upper_Happiness and Lower_Happiness, and ensure predictors are numeric
# This step helps in using only relevant, standardized predictors for the model
data <- data %>% select(-Upper_Happiness, -Lower_Happiness)
# Step 3: Run the multiple linear regression model
# Explanation: The model will predict 'Adjusted_Happiness' using all other standardized factors as predictors.
model <- lm(Adjusted_Happiness ~ ., data = data)
# Step 4: Summary of the model
# Explanation: Displaying the summary of the regression model gives coefficients, p-values, and R-squared.
# Dystopia_Residual is included as it represents a baseline component for happiness.
summary(model)
# Step 5: Interpretation of coefficients
# Explanation: Extracting coefficients to assess the impact of each factor.
coefficients <- summary(model)$coefficients
print(coefficients)
# Step 6: Checking model assumptions
# Plot residuals to check for normality and homoscedasticity, which are standard regression assumptions.
par(mfrow = c(2, 2))
plot(model)
par(mfrow = c(1, 1))
# Additional Note: This approach keeps Dystopia_Residual as a baseline variable in the model.
# Display summary of the regression model
summary(model)
# Multiple regression for Adjusted Happiness, excluding Dystopia_Residual
model_happiness <- lm(Adjusted_Happiness ~ Social_Support + Life_Expectancy + Freedom + Generosity + Corruption + Scaled_UV, data = data)
summary(model_happiness)
# Load necessary libraries
library(ggplot2)
library(broom)
# Assuming `model` is your regression model object
# Extract coefficients with confidence intervals for the Coefficient Plot
coef_data <- tidy(model, conf.int = TRUE)
# Plot 1: Coefficient Plot
ggplot(coef_data, aes(x = term, y = estimate, ymin = conf.low, ymax = conf.high)) +
geom_pointrange(color = "blue") +
coord_flip() +
labs(title = "Coefficient Plot for Happiness Model",
x = "Predictor Variables",
y = "Coefficient Estimate") +
theme_minimal()
# Plot 2: Scatter Plots with Trend Lines for Key Predictors
# Loop through each predictor to create scatter plots with trend lines
predictors <- c("Freedom", "Generosity", "Scaled_UV", "Social_Support", "Life_Expectancy", "Corruption")
for (predictor in predictors) {
ggplot(data, aes_string(x = predictor, y = "Adjusted_Happiness")) +
geom_point(color = "darkgray") +
geom_smooth(method = "lm", color = "blue", se = TRUE) +
labs(title = paste("Adjusted Happiness vs.", predictor),
x = predictor,
y = "Adjusted Happiness") +
theme_minimal() +
print()
}
# Filter coefficients to exclude country-specific dummies and intercept
coef_data <- tidy(model, conf.int = TRUE) %>%
filter(!term %in% c("(Intercept)", "Dystopia_Residual")) %>%
filter(!grepl("Country", term)) # Exclude dummy variables for countries
# Plot 1: Coefficient Plot (Refined)
ggplot(coef_data, aes(x = term, y = estimate, ymin = conf.low, ymax = conf.high)) +
geom_pointrange(color = "blue") +
coord_flip() +
labs(title = "Coefficient Plot for Happiness Model (Key Predictors Only)",
x = "Predictor Variables",
y = "Coefficient Estimate") +
theme_minimal()
# Print out all coefficients with confidence intervals to see what we have
coef_data <- tidy(model, conf.int = TRUE)
print(coef_data)
# Filter to exclude intercept, country dummies, and keep only predictors with non-zero estimates
coef_data_filtered <- coef_data %>%
filter(!term %in% c("(Intercept)", "Dystopia_Residual")) %>%
filter(!grepl("Country", term)) %>%
filter(estimate != 0) # Ensure we're plotting only non-zero estimates
# Check if there are any remaining coefficients to plot
if (nrow(coef_data_filtered) > 0) {
ggplot(coef_data_filtered, aes(x = term, y = estimate, ymin = conf.low, ymax = conf.high)) +
geom_pointrange(color = "blue") +
coord_flip() +
labs(title = "Coefficient Plot for Happiness Model (Key Predictors Only)",
x = "Predictor Variables",
y = "Coefficient Estimate") +
theme_minimal()
} else {
message("No non-zero coefficients to plot.")
}
# Load necessary library
library(broom) # This helps in organizing regression outputs in a tidy format
# List of predictor variables to test individually (excluding Dystopia_Residual)
predictors <- c("Social_Support", "Life_Expectancy", "Freedom", "Generosity",
"Corruption", "Suicide", "Depression", "Scaled_UV")
# Initialize an empty data frame to store the results
bivariate_results <- data.frame(Predictor = character(),
Estimate = numeric(),
Std_Error = numeric(),
t_value = numeric(),
p_value = numeric(),
stringsAsFactors = FALSE)
# Loop over each predictor to run a bivariate regression and collect results
for (predictor in predictors) {
# Create a formula dynamically
formula <- as.formula(paste("Adjusted_Happiness ~", predictor))
# Run the bivariate regression
model <- lm(formula, data = data)
# Extract summary using tidy for easy organization
summary <- tidy(model)
# Filter to include only the predictor (omit intercept for clarity)
predictor_row <- summary[summary$term == predictor, ]
# Append the result to the results data frame
bivariate_results <- rbind(bivariate_results,
data.frame(Predictor = predictor,
Estimate = predictor_row$estimate,
Std_Error = predictor_row$std.error,
t_value = predictor_row$statistic,
p_value = predictor_row$p.value))
}
# View the summarized bivariate results
print(bivariate_results)
# Load necessary package
if(!require(car)) install.packages("car", dependencies=TRUE)
library(car)
# Fit a model with all predictors (excluding Dystopia_Residual)
vif_model <- lm(Adjusted_Happiness ~ Social_Support + Life_Expectancy + Freedom + Generosity + Corruption + Suicide + Depression + Scaled_UV, data = data)
# Calculate VIF
vif_values <- vif(vif_model)
print(vif_values)
# Load ggplot2 for visualization
library(ggplot2)
# Define predictors to plot
predictors <- c("Social_Support", "Life_Expectancy", "Freedom", "Generosity", "Corruption", "Suicide", "Depression", "Scaled_UV")
# Loop through predictors and create scatter plots with trend lines
for (predictor in predictors) {
plot <- ggplot(data, aes_string(x = predictor, y = "Adjusted_Happiness")) +
geom_point() +
geom_smooth(method = "lm", color = "blue", se = FALSE) +
labs(title = paste("Adjusted Happiness vs", predictor), x = predictor, y = "Adjusted Happiness") +
theme_minimal()
print(plot) # Each plot will be displayed in turn
}
# Load necessary library
library(ggplot2)
# Brief Explanation:
# This code performs a linear regression analysis to examine the relationship between
# sunlight (Scaled_UV) and suicide rates (Suicide). It aims to test the hypothesis that
# higher levels of sunlight may correlate with lower suicide rates.
# Step 1: Run the linear regression
# We set `Suicide` as the response variable and `Scaled_UV` as the predictor.
model <- lm(Suicide ~ Scaled_UV, data = data)
# Step 2: Summary of the model
# Display the summary of the regression model to understand the relationship.
summary_model <- summary(model)
print(summary_model)
# Step 3: Interpret Key Results
# The summary includes important information such as the coefficient for `Scaled_UV`,
# its significance level, and R-squared value, which together indicate the strength
# and direction of the relationship between sunlight and suicide rates.
# Step 4: Visualization (Optional)
# Plot a scatter plot of `Suicide` vs. `Scaled_UV` with a regression line to visually
# represent the relationship.
ggplot(data, aes(x = Scaled_UV, y = Suicide)) +
geom_point() +
geom_smooth(method = "lm", color = "blue", se = TRUE) +
labs(title = "Relationship Between Sunlight (Scaled_UV) and Suicide Rates",
x = "Scaled UV (Sunlight Intensity)", y = "Suicide Rate (Normalized)") +
theme_minimal()
Explanation for Research Paper
The graph supports the hypothesis that increased sunlight might correlate with lower suicide rates. This relationship, while visible, does not show a strong trend. The downward slope of the regression line suggests that higher sunlight exposure is associated with a decrease in suicide rates, albeit marginally.
Further analysis or a more complex model could investigate additional factors that might interact with sunlight to affect suicide rates more significantly.
# Fit the simple linear regression model
model_sunlight_suicide <- lm(Suicide ~ Scaled_UV, data = data)
# Display the summary of the model, which includes the p-value for Scaled_UV
summary(model_sunlight_suicide)
Scaled UV Exposure: The coefficient for scaled UV exposure is -0.07149, suggesting a slight negative association between sunlight exposure and suicide rates. This means that, on average, as scaled UV exposure increases, suicide rates show a small tendency to decrease. However, this coefficient is not statistically significant (p = 0.104), indicating that this relationship does not reach the conventional threshold for significance.
# Load libraries for spatial visualization
library(ggplot2)
install.packages("rworldmap")
library(rworldmap) # For world map data
# Merge world map data with our data set
world_map <- joinCountryData2Map(data, joinCode = "NAME", nameJoinColumn = "Country", verbose = TRUE)
# Plot Happiness Score on World Map
happiness_map <- ggplot() +
geom_map(data = world_map, map = world_map,
aes(fill = Happiness_score, map_id = Country)) +
expand_limits(x = world_map$long, y = world_map$lat) +
theme_minimal() +
scale_fill_viridis_c(option = "viridis", name = "Happiness Score") +
labs(title = "World Happiness Score by Country") +
theme(axis.title = element_blank(), axis.text = element_blank(), axis.ticks = element_blank())
print(happiness_map)
# Standardize country names in your dataset
data$Country <- gsub("Czechia", "Czech Republic", data$Country)
data$Country <- gsub("Taiwan Province of China", "Taiwan", data$Country)
data$Country <- gsub("North Macedonia", "Macedonia", data$Country)
data$Country <- gsub("Hong Kong S.A.R. of China", "Hong Kong", data$Country)
data$Country <- gsub("Turkiye", "Turkey", data$Country)
data$Country <- gsub("State of Palestine", "Palestine", data$Country)
data$Country <- gsub("Eswatini", "Swaziland", data$Country)
# Load required libraries
library(rworldmap)
library(ggplot2)
library(dplyr)
# Join your dataset with country data in rworldmap
world_map <- getMap(resolution = "low")
world_map_df <- fortify(world_map, region = "ISO3") # Convert to data frame
# Merge your happiness data with the map data
data <- rename(data, ISO3 = Country) # Rename Country column to ISO3 to match map
merged_data <- left_join(world_map_df, data, by = "ISO3")
# Plot with ggplot
ggplot(merged_data, aes(long, lat, group = group, fill = Happiness_score)) +
geom_polygon(color = "white") +
theme_minimal() +
scale_fill_viridis_c(option = "plasma", name = "Happiness Score") +
labs(title = "World Happiness Score by Country",
subtitle = "Based on environmental and mental health metrics")
# Load required libraries
library(rworldmap)
library(ggplot2)
library(dplyr)
library(sf)
# Load world map and convert to sf format
world_map <- getMap(resolution = "low")
world_map_sf <- st_as_sf(world_map)
# Preview the country names in your data to ensure they are correct
print(unique(data$Country))
# Use mutate to replace country names that need adjustment
data <- data %>%
mutate(Country = case_when(
Country == "Czechia" ~ "Czech Republic",
Country == "Taiwan Province of China" ~ "Taiwan",
Country == "North Macedonia" ~ "Macedonia",
Country == "Hong Kong S.A.R. of China" ~ "Hong Kong",
Country == "Turkiye" ~ "Turkey",
Country == "State of Palestine" ~ "Palestine",
Country == "Eswatini" ~ "Swaziland",
TRUE ~ Country # Keeps other country names unchanged
))
# Check if changes were applied correctly
print(unique(data$Country))
# Rename Country column in your data for joining
data <- rename(data, NAME = Country)
# Perform a left join between the world map and your data
world_map_sf <- left_join(world_map_sf, data, by = "NAME")
# Plot the map with ggplot2
ggplot(world_map_sf) +
geom_sf(aes(fill = Happiness_score), color = "white") +
theme_minimal() +
scale_fill_viridis_c(option = "plasma", name = "Happiness Score") +
labs(title = "World Happiness Score by Country",
subtitle = "Based on environmental and mental health metrics")
# Check the column names in `data` to confirm `Country` exists
print(names(data)) # This will help us confirm the correct column name
# Rename 'NAME' column to 'Country'
data <- data %>%
rename(Country = NAME)
# Confirm the change
print(names(data)) # This should now show 'Country' as one of the column names
# Standardize country names for mapping
data <- data %>%
mutate(Country = case_when(
Country == "Czechia" ~ "Czech Republic",
Country == "Taiwan Province of China" ~ "Taiwan",
Country == "North Macedonia" ~ "Macedonia",
Country == "Hong Kong S.A.R. of China" ~ "Hong Kong",
Country == "Turkiye" ~ "Turkey",
Country == "State of Palestine" ~ "Palestine",
Country == "Eswatini" ~ "Swaziland",
TRUE ~ Country # Keep original name if no change is needed
))
# Verify changes
print(unique(data$Country)) # This will list the unique country names to confirm the updates
# Load necessary libraries
library(ggplot2)
library(dplyr)
library(sf) # Using sf for spatial data handling
# Load a world shapefile with `sf`
world_map <- st_as_sf(maps::map("world", plot = FALSE, fill = TRUE))
colnames(world_map)[1] <- "Country"
# Join the data with the world map
map_data <- world_map %>%
left_join(data, by = "Country")
# Plot the world map with happiness scores
ggplot(map_data) +
geom_sf(aes(fill = Adjusted_Happiness), color = "grey", lwd = 0.1) +
scale_fill_viridis_c(name = "Happiness Score", option = "plasma") +
theme_minimal() +
labs(title = "World Map of Adjusted Happiness Scores",
subtitle = "Based on environmental and mental health factors",
caption = "Data Source: Customized Happiness Report") +
theme(legend.position = "bottom")
# Summarize suicide data for a pie chart
suicide_data <- data %>%
filter(!is.na(Suicide)) %>%
mutate(Percentage = Suicide / sum(Suicide, na.rm = TRUE) * 100)
# Create the pie chart
ggplot(suicide_data, aes(x = "", y = Percentage, fill = Country)) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y") +
labs(title = "Distribution of Suicide Rates by Country",
fill = "Country") +
theme_void() +
theme(legend.position = "right")
# Load required libraries
library(ggplot2)
library(dplyr)
library(rnaturalearth)
library(sf)
# Load the world map from `rnaturalearth` package
world <- ne_countries(scale = "medium", returnclass = "sf")
# Assuming `suicide_data` is your dataset with columns "Country" and "Suicide"
# Join the suicide data to the world map data
map_data <- world %>%
left_join(suicide_data, by = c("name" = "Country"))
# Create the map with better color and full world projection
ggplot(map_data) +
geom_sf(aes(fill = Suicide)) +
scale_fill_viridis_c(option = "plasma", na.value = "lightgray", name = "Suicide Rate") +
theme_minimal() +
labs(title = "Global Distribution of Suicide Rates by Country",
caption = "Data Source: World Happiness Report, Suicide Rates") +
theme(legend.position = "right",
plot.title = element_text(size = 16, face = "bold"),
plot.caption = element_text(size = 10))
# Assuming `suicide_data` contains "Country" and "Suicide" columns
# Filter the top 15 countries with the highest suicide rates
top_15_suicide <- suicide_data %>%
arrange(desc(Suicide)) %>%
slice(1:15)
# Plot the top 15 countries with highest suicide rates
ggplot(top_15_suicide, aes(x = reorder(Country, Suicide), y = Suicide, fill = Suicide)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_viridis_c(option = "plasma", name = "Suicide Rate") +
theme_minimal() +
labs(title = "Top 15 Countries by Suicide Rate",
x = "Country",
y = "Suicide Rate",
caption = "Data Source: World Happiness Report, Suicide Rates") +
theme(plot.title = element_text(size = 16, face = "bold"),
plot.caption = element_text(size = 10),
axis.text.y = element_text(size = 12))
# Scatter plot of Sunlight Exposure vs. Suicide Rate
ggplot(suicide_data, aes(x = Scaled_UV, y = Suicide)) +
geom_point(color = "blue", alpha = 0.6, size = 3) +
geom_smooth(method = "lm", color = "red", se = FALSE) +
theme_minimal() +
labs(title = "Scatter Plot of Sunlight Exposure vs. Suicide Rate",
x = "Scaled Sunlight Exposure (UV Index)",
y = "Suicide Rate",
caption = "Data Source: World Happiness Report, Suicide Rates") +
theme(plot.title = element_text(size = 16, face = "bold"),
plot.caption = element_text(size = 10))
Step 8: Correlation Analysis
Objective: Assess the strength and direction of relationships between key variables, such as Sunlight Exposure (Scaled_UV), Suicide Rate, Depression, Social Support, and Adjusted Happiness.
Method: Calculate the correlation matrix and visualize it with a heatmap to identify strong associations among variables.
install.packages("ggcorrplot")
# Selecting relevant columns for correlation analysis
correlation_data <- suicide_data %>%
select(Adjusted_Happiness, Social_Support, Life_Expectancy, Freedom, Generosity, Corruption, Suicide, Depression, Scaled_UV)
# Calculating the correlation matrix
correlation_matrix <- cor(correlation_data, use = "complete.obs")
# Visualizing the correlation matrix as a heatmap
library(ggcorrplot)
ggcorrplot(correlation_matrix, method = "circle", type = "lower",
lab = TRUE, title = "Correlation Matrix of Key Variables")
Key Observations from the Correlation Heatmap
Adjusted_Happiness and Scaled_UV (Sunlight Exposure)
Correlation Coefficient: 0.87
Interpretation: There is a strong positive correlation between Adjusted_Happiness and Scaled_UV. This suggests that higher sunlight exposure is associated with higher happiness scores. This aligns with theories in psychology suggesting that sunlight can positively affect mood and well-being.
Depression and Adjusted_Happiness
Correlation Coefficient: -0.8
Interpretation: There’s a strong negative correlation between Depression and Adjusted_Happiness. This implies that higher levels of depression are associated with lower happiness scores, which is expected as depression typically reduces an individual’s sense of well-being.
Social_Support and Adjusted_Happiness
Correlation Coefficient: 0.52
Interpretation: There is a moderate positive correlation between Social_Support and Adjusted_Happiness, indicating that greater social support tends to be associated with higher happiness levels. This finding is consistent with research showing the importance of social connections for personal well-being.
Depression and Social_Support
Correlation Coefficient: -0.6
Interpretation: This moderate negative correlation suggests that higher social support is associated with lower levels of depression. Social support can act as a protective factor against depression by providing emotional assistance and a sense of community.
Suicide and Adjusted_Happiness
Correlation Coefficient: -0.33
Interpretation: There is a weak negative correlation between Suicide rates and Adjusted_Happiness, indicating that as happiness scores increase, suicide rates tend to decrease slightly. This weak correlation suggests that other factors may be influencing the suicide rate beyond overall happiness levels.
Scaled_UV (Sunlight) and Depression
Correlation Coefficient: -0.6
Interpretation: The moderate negative correlation between sunlight exposure and depression implies that greater sunlight exposure is associated with lower levels of depression. Sunlight exposure is often linked to improved mood, which may help reduce depressive symptoms.
Scaled_UV and Suicide
Correlation Coefficient: -0.13
Interpretation: The weak negative correlation between sunlight exposure and suicide rates suggests that there may be a slight tendency for suicide rates to decrease with more sunlight exposure, but the relationship is not strong. This reflects the complexity of factors influencing suicide rates.
Summary of Insights
Happiness and Sunlight: Sunlight exposure has a strong positive relationship with happiness and a moderate negative relationship with depression, supporting the idea that sunlight contributes to overall well-being.
Mental Health and Social Support: Depression is inversely related to social support and sunlight, highlighting the protective role these factors play in mental health.
Suicide Correlation: The weak relationships between suicide rates and other variables suggest that suicide may be influenced by a more complex mix of factors beyond just happiness or sunlight exposure.
This analysis provides a multi-dimensional look at how these factors relate to each other and to well-being. Let me know if you’d like any further breakdown or additional analysis.
Step 9: Multivariable Linear Regression (Exploring Predictors of Happiness)
Objective: To evaluate which factors (e.g., sunlight, social support, suicide rate, depression) contribute most significantly to Adjusted Happiness when considered together.
Method: Build a multiple linear regression model with Adjusted Happiness as the dependent variable and Scaled_UV, Suicide, Depression, and Social_Support as independent variables.
Sunlight Exposure: This has a strong positive association with happiness, implying that regions with more sunlight may have happier populations, potentially due to the mood-enhancing effects of sunlight.
Depression and Suicide: Both have a substantial and highly significant negative effect on happiness, which underscores the importance of mental health as a core component of overall well-being.
Social Support: While not as influential as sunlight or depression, social support still contributes positively to happiness, reflecting the importance of community and interpersonal relationships.
This model demonstrates that while environmental factors like sunlight are important, mental health indicators (depression and suicide rates) have a crucial impact on happiness. The high explanatory power of the model (Adjusted R-squared of 0.9255) supports the robustness of these predictors in understanding happiness beyond economic factors.
# Multivariable linear regression model with additional predictors
happiness_lm <- lm(Adjusted_Happiness ~ Scaled_UV + Suicide + Depression + Social_Support +
Life_Expectancy + Freedom + Generosity + Corruption, data = suicide_data)
# Display the summary of the regression model
summary(happiness_lm)
Here’s the interpretation of the multivariable linear regression model results for Adjusted Happiness using predictors such as Scaled_UV, Suicide, Depression, Social_Support, Life_Expectancy, Freedom, Generosity, and Corruption:
Key Findings from the Regression Output
- Model Fit
- The Multiple R-squared value is 0.9337, meaning that approximately 93.4% of the variability in Adjusted Happiness is explained by the predictors in this model.
- The Adjusted R-squared is 0.9297, indicating a high level of explanatory power after adjusting for the number of predictors.
- The F-statistic is significant (p-value < 2.2e-16), indicating that the overall model is statistically significant.
- Significant Predictors
- Scaled_UV (Sunlight Exposure): Estimate = 1.146, p-value < 2e-16. Sunlight exposure has a strong positive association with Adjusted Happiness, indicating that higher levels of sunlight exposure correlate with higher happiness scores.
- Suicide Rate: Estimate = -0.713, p-value < 2e-16. Suicide rates have a strong negative relationship with happiness, suggesting that countries with higher suicide rates tend to have lower happiness levels.
- Depression: Estimate = -1.206, p-value < 2e-16. Depression is also strongly negatively associated with happiness, reflecting the expected inverse relationship between mental health issues and happiness.
- Freedom: Estimate = 0.235, p-value = 0.008. Freedom shows a positive association with happiness, indicating that countries with greater perceived freedom report higher happiness scores.
- Non-Significant Predictors
- Social Support: Estimate = 0.153, p-value = 0.093. Social support has a positive but marginally non-significant effect on happiness (at the 5% significance level). This suggests a trend where more social support may correlate with higher happiness, though it doesn’t reach traditional significance.
- Life Expectancy: Estimate = 0.044, p-value = 0.734. Life expectancy does not show a significant association with happiness in this model, which may be due to overlapping effects with other variables like health and social factors.
- Generosity: Estimate = 0.231, p-value = 0.164. Generosity has a positive but non-significant effect on happiness in this model, meaning that it may not be as influential as other factors.
- Corruption: Estimate = -0.001, p-value = 0.990. Corruption is essentially neutral here, showing no significant impact on happiness. This may be surprising, but it could be due to correlations with other predictors or variability in how corruption is perceived across regions.
Interpretation and Implications
- Sunlight Exposure is a significant and strong predictor of happiness, supporting the idea that environmental factors, like sunlight, have a positive impact on well-being.
- Mental Health indicators (Suicide and Depression) are powerful predictors of happiness. Both higher suicide rates and higher depression rates are associated with lower happiness, reinforcing the importance of mental health support as a factor in national well-being.
- Freedom shows a meaningful and significant positive impact, suggesting that autonomy and freedom of choice contribute positively to happiness.
- While Social Support and Generosity show positive trends, they do not achieve statistical significance in this model. This could imply that, while potentially beneficial, they may not be as impactful or may interact with other social factors not captured here.
- Life Expectancy and Corruption do not have significant impacts in this model. Their lack of significance could be due to correlations with other predictors or differences in how they influence subjective well-being.
Next Steps
- Visualization: Visualize these relationships to further illustrate the importance of each factor.
- Interaction Effects: Explore potential interaction effects, especially between social support, freedom, and mental health.
- Further Analysis: Investigate why certain variables like corruption and life expectancy are not significant, as they may have indirect effects or interact with other predictors.
Step 10: Comparison of Top 15 and Bottom 15 Countries in Happiness
# Sorting and selecting top 15 and bottom 15 countries
top_15 <- suicide_data %>% arrange(desc(Adjusted_Happiness)) %>% head(15)
bottom_15 <- suicide_data %>% arrange(Adjusted_Happiness) %>% head(15)
# Combining for easy plotting
top_bottom <- bind_rows(top_15, bottom_15)
# Plotting comparison
ggplot(top_bottom, aes(x = reorder(Country, Adjusted_Happiness), y = Adjusted_Happiness, fill = factor(ifelse(Adjusted_Happiness > median(suicide_data$Adjusted_Happiness), "Top 15", "Bottom 15")))) +
geom_bar(stat = "identity") +
coord_flip() +
theme_minimal() +
labs(title = "Comparison of Top 15 and Bottom 15 Countries by Adjusted Happiness",
x = "Country",
y = "Adjusted Happiness",
fill = "Happiness Ranking") +
scale_fill_manual(values = c("Top 15" = "green", "Bottom 15" = "red"))
This bar chart compares the top 15 and bottom 15 countries based on their Adjusted Happiness scores. Here’s a breakdown of the findings and insights:
Insights from the Visualization
- Top 15 Countries (Green Bars):
- These countries, including Venezuela, Niger, Philippines, and Chad, show the highest Adjusted Happiness scores in this dataset.
- The positive values suggest that, according to non-economic factors (such as social support, mental health indicators, and sunlight exposure), these countries score well on adjusted happiness metrics.
- This may indicate that factors other than GDP, such as social support or environmental conditions, play a significant role in these countries’ perceived happiness.
- Bottom 15 Countries (Red Bars):
- Countries such as Lithuania, Ukraine, Russia, and Estonia exhibit negative Adjusted Happiness scores.
- These negative scores suggest lower levels of happiness when adjusted for factors other than GDP.
- High suicide rates, depression, or limited social support may contribute to these low adjusted happiness levels, as seen in several of these countries with harsher climates or social challenges.
- Contrasting Trends:
- The chart highlights the paradox where some economically developed countries, despite their wealth, have lower adjusted happiness scores, while some less economically developed nations rank higher.
- This contrast supports your hypothesis that happiness and well-being are multi-dimensional and that non-economic factors (like mental health and social support) can greatly influence happiness levels.
This visualization effectively demonstrates the value of considering non-economic factors to assess happiness at a country level, supporting the core premise of your study.
# Check top countries by Corruption (highest values)
corruption_check <- data %>% arrange(desc(Corruption)) %>% select(Country, Corruption) %>% head(15)
print(corruption_check)