#Load all libraries needed
# Load all data sets needed.
d_popvote <- read_csv("popvote_1948_2020.csv")
d_state_popvote <- read_csv("state_popvote_1948_2020.csv")
# Recode the DC label to "District Of Columbia". Assign through the column
# with a logical mask so only matching elements are touched; `%in%` treats a
# missing state as a non-match.
# NOTE(review): presumably this matches the capitalisation used by the polling
# files. The printed GDP and demographics tables further down show
# "District of Columbia", so confirm that DC joins as intended.
is_dc <- d_state_popvote$state %in% "District of Columbia"
d_state_popvote$state[is_dc] <- "District Of Columbia"
d_polls <- read_csv("national_polls_1968-2024.csv")
d_state_polls <- read_csv("state_polls_1968-2024.csv")
d_turnout <- read_csv("state_turnout_1980_2022.csv")
d_state_demog <- read_csv("demographics.csv")
d_state_gdp <- read_csv("cleaned_state_gdp.csv")
d_state_unemployment <- read_csv("State_unemployment.csv")
# Process state unemployment for the model's needs.
# Coerce every column except the first (State) to numeric.
d_state_unemployment[, -1] <- lapply(d_state_unemployment[, -1], as.numeric)
# Reshape from one column per "<month> <year>" to one row per state and date,
# split the date label into month and year, and keep only September readings.
d_state_unemployment <- d_state_unemployment |>
  pivot_longer(
    cols = -State,
    names_to = "date",
    values_to = "unemployment" # as a rate
  ) |>
  separate(date, into = c("month", "year"), sep = " ") |>
  rename(state = State) |>
  filter(month == "Sep")
# `separate()` leaves `year` as character; the later joins key on a numeric year.
d_state_unemployment$year <- as.numeric(d_state_unemployment$year)
## # A tibble: 572 × 4
## state month year unemployment
## <chr> <chr> <dbl> <dbl>
## 1 Alabama Sep 2014 6.5
## 2 Alabama Sep 2015 6.1
## 3 Alabama Sep 2016 5.9
## 4 Alabama Sep 2017 4.1
## 5 Alabama Sep 2018 3.9
## 6 Alabama Sep 2019 3
## 7 Alabama Sep 2020 5.9
## 8 Alabama Sep 2021 3
## 9 Alabama Sep 2022 2.5
## 10 Alabama Sep 2023 2.7
## # ℹ 562 more rows
# Trim the state GDP table to the join keys plus the Q2 growth column, which is
# the only GDP measure the model uses.
d_state_gdp <- select(d_state_gdp, year, state, q2_gdp_growth)
## # A tibble: 969 × 3
## year state q2_gdp_growth
## <dbl> <chr> <dbl>
## 1 2005 Alabama NA
## 2 2005 Alaska NA
## 3 2005 Arizona NA
## 4 2005 Arkansas NA
## 5 2005 California NA
## 6 2005 Colorado NA
## 7 2005 Connecticut NA
## 8 2005 Delaware NA
## 9 2005 District of Columbia NA
## 10 2005 Florida NA
## # ℹ 959 more rows
# Derive the aggregated demographic columns used by the model.
# Note: the two `rowSums()` aggregates ignore missing components
# (na.rm = TRUE), whereas the `+` sums below are NA if any component is NA.
d_state_demog <- d_state_demog |>
  mutate(
    # Create 'hispanic' column by summing all Hispanic race categories
    hispanic = rowSums(
      d_state_demog[, c(
        "hispanic_white", "hispanic_black", "hispanic_american_indian",
        "hispanic_asian_pacific_islander", "hispanic_other_race",
        "hispanic_two_or_more_races"
      )],
      na.rm = TRUE
    ),
    # Create 'age_18_to_29' column by summing the relevant age groups
    age_18_to_29 = rowSums(
      d_state_demog[, c(
        "age_18_to_19", "age_20", "age_21",
        "age_22_to_24", "age_25_to_29"
      )],
      na.rm = TRUE
    ),
    # Create 'age_30_to_44' by summing the relevant age groups
    age_30_to_44 = age_30_to_34 + age_35_to_44,
    # Create 'age_55_to_64' by summing the relevant age groups
    age_55_to_64 = age_55_to_59 + age_60_to_61 + age_62_to_64,
    # Create 'age_75plus' by summing ages 75 and older
    age_75plus = age_75_to_84 + age_85_and_over
  )
## # A tibble: 663 × 49
## ...1 year state total_pop white black american_indian
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 1972 Alabama 3534110. 73.6 26.1 0.105
## 2 2 1972 Alaska 320676. 78.6 3.06 7.54
## 3 3 1972 Arizona 1960363 89.1 2.96 5.46
## 4 4 1972 Arkansas 1995923 81.7 17.9 0.195
## 5 5 1972 California 20696088. 86.6 7.15 0.561
## 6 6 1972 Colorado 2343800 94.5 3.11 0.466
## 7 7 1972 Connecticut 3046882. 92.9 6.18 0.0901
## 8 8 1972 Delaware 557351. 84.5 14.7 0.143
## 9 9 1972 District of Columbia 732875. 27.6 70.9 0.133
## 10 10 1972 Florida 7380819. 84.2 15.0 0.130
## # ℹ 653 more rows
## # ℹ 42 more variables: asian_pacific_islander <dbl>, other_race <dbl>,
## # two_or_more_races <dbl>, hispanic_white <dbl>, hispanic_black <dbl>,
## # hispanic_american_indian <dbl>, hispanic_asian_pacific_islander <dbl>,
## # hispanic_other_race <dbl>, hispanic_two_or_more_races <dbl>,
## # not_hispanic_white <dbl>, not_hispanic_black <dbl>,
## # not_hispanic_american_indian <dbl>, …
# Build one row per year/state holding, for each party, the most recent poll
# reading (latest_pollav_*) and the mean over all readings (mean_pollav_*).
d_pollav_state <- local({
  by_race <- group_by(d_state_polls, year, state, party)
  # Group-wise mean is attached to every row before the rows are thinned out.
  by_race <- mutate(by_race, mean_pollav = mean(poll_support, na.rm = TRUE))
  # Keep the row(s) with the largest poll_date within each group.
  newest <- top_n(by_race, 1, poll_date)
  newest <- rename(newest, latest_pollav = poll_support)
  newest <- select(
    newest,
    -c(weeks_left, days_left, poll_date, candidate, before_convention)
  )
  pivot_wider(
    newest,
    names_from = party,
    values_from = c(latest_pollav, mean_pollav)
  )
})
# Assemble the modelling table: start from the state poll averages and attach,
# in order, state vote history, the national rows for the Democratic party,
# turnout, unemployment, GDP growth and demographics. Then keep 1980 onwards,
# drop grouping, and keep the first row for each state/year pair.
d <- local({
  dem_popvote <- filter(d_popvote, party == "democrat")
  merged <- left_join(d_pollav_state, d_state_popvote, by = c("year", "state"))
  merged <- left_join(merged, dem_popvote, by = "year")
  merged <- left_join(merged, d_turnout, by = c("year", "state"))
  merged <- left_join(merged, d_state_unemployment, by = c("year", "state"))
  merged <- left_join(merged, d_state_gdp, by = c("year", "state"))
  merged <- left_join(merged, d_state_demog, by = c("state", "year"))
  merged <- filter(merged, year >= 1980)
  merged <- ungroup(merged)
  distinct(merged, state, year, .keep_all = TRUE)
})
## # A tibble: 1,144 × 15
## year state vep_turnout vep_highest_office vap_highest_office total_ballots
## <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 2022 United… 46.20% <NA> <NA> 112030874
## 2 2022 Alabama 37.30% <NA> <NA> 1424087
## 3 2022 Alaska 50.20% <NA> <NA> 267047
## 4 2022 Arizona 49.30% <NA> <NA> 2592313
## 5 2022 Arkans… 41.50% <NA> <NA> 914227
## 6 2022 Califo… 42.90% <NA> <NA> 11146610
## 7 2022 Colora… 58.30% <NA> <NA> 2540666
## 8 2022 Connec… 48.60% <NA> <NA> 1297811
## 9 2022 Delawa… 42.80% <NA> <NA> 325632
## 10 2022 Distri… 40.60% <NA> <NA> 205774
## # ℹ 1,134 more rows
## # ℹ 9 more variables: highest_office_ballots <dbl>, vep <dbl>, vap <dbl>,
## # noncitizen <chr>, prison <dbl>, probation <dbl>, parole <dbl>,
## # total_ineligible <dbl>, overseas_eligible <dbl>
# Sequester states for which we have polling data for 2024.
states.2024 <- unique(d$state[d$year == 2024])
# Exclude the Nebraska 2nd-district entry with a logical mask. A negative
# `which()` index would return an empty vector (dropping every state) if the
# label were ever absent, because `x[-integer(0)]` selects nothing.
states.2024 <- states.2024[!(states.2024 %in% "Nebraska Cd 2")]
d <- d |>
  filter(state %in% states.2024)
## # A tibble: 303 × 95
## year state latest_pollav_REP latest_pollav_DEM mean_pollav_REP
## <dbl> <chr> <dbl> <dbl> <dbl>
## 1 2016 Arizona 45.6 42.9 41.4
## 2 2016 California 32.8 54.9 31.7
## 3 2016 Colorado 40.8 43.6 41.4
## 4 2016 Florida 45.5 46.4 42.6
## 5 2016 Georgia 48.3 44.4 45.5
## 6 2016 Indiana 50.3 37.3 47.4
## 7 2016 Maryland 27.4 59.9 29.0
## 8 2016 Massachusetts 28.0 55.7 30.1
## 9 2016 Michigan 42.5 45.7 36.9
## 10 2016 Minnesota 38.1 47.0 39.0
## # ℹ 293 more rows
## # ℹ 90 more variables: mean_pollav_DEM <dbl>, D_pv <dbl>, R_pv <dbl>,
## # D_pv2p <dbl>, R_pv2p <dbl>, votes_D <dbl>, votes_R <dbl>,
## # total_votes <dbl>, two_party_votes <dbl>, D_pv_lag1 <dbl>, R_pv_lag1 <dbl>,
## # D_pv2p_lag1 <dbl>, R_pv2p_lag1 <dbl>, D_pv_lag2 <dbl>, R_pv_lag2 <dbl>,
## # D_pv2p_lag2 <dbl>, R_pv2p_lag2 <dbl>, party <chr>, winner <lgl>,
## # candidate <chr>, pv <dbl>, pv2p <dbl>, incumbent <lgl>, …
# Training set for the poll-based model: all pre-2024 rows, restricted to the
# outcome and candidate predictors, with incomplete rows removed and one row
# kept per state/year.
d.train <- d |>
  filter(year < 2024) |>
  select(
    year, state, D_pv2p, latest_pollav_DEM,
    D_pv2p_lag1, D_pv2p_lag2, q2_gdp_growth,
    white, black, american_indian, asian_pacific_islander,
    unemployment, less_than_college, bachelors, graduate, hispanic,
    age_18_to_29, age_30_to_44, age_45_to_54,
    age_55_to_64, age_65_to_74, age_75plus
  ) |>
  drop_na() |>
  distinct(state, year, .keep_all = TRUE)
## # A tibble: 52 × 22
## year state D_pv2p latest_pollav_DEM D_pv2p_lag1 D_pv2p_lag2 q2_gdp_growth
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2016 Arizona 48.1 42.9 45.4 45.7 2.67
## 2 2016 Califor… 66.1 54.9 61.9 62.3 1.88
## 3 2016 Colorado 52.7 43.6 52.7 54.6 1.57
## 4 2016 Florida 49.4 46.4 50.4 51.4 3.19
## 5 2016 Georgia 47.3 44.4 46.0 47.4 3.21
## 6 2016 Indiana 39.9 37.3 44.8 50.5 1.20
## 7 2016 Maryland 64.0 59.9 63.3 62.9 3.56
## 8 2016 Massach… 64.7 55.7 61.8 63.2 1.15
## 9 2016 Michigan 49.9 45.7 54.8 58.4 2.18
## 10 2016 Minneso… 50.8 47.0 53.9 55.2 1.35
## # ℹ 42 more rows
## # ℹ 15 more variables: white <dbl>, black <dbl>, american_indian <dbl>,
## # asian_pacific_islander <dbl>, unemployment <dbl>, less_than_college <dbl>,
## # bachelors <dbl>, graduate <dbl>, hispanic <dbl>, age_18_to_29 <dbl>,
## # age_30_to_44 <dbl>, age_45_to_54 <dbl>, age_55_to_64 <dbl>,
## # age_65_to_74 <dbl>, age_75plus <dbl>
# Prediction set: the 2024 rows, one per state. Rows with missing values are
# kept here (no drop_na), and age_65_to_74 is not part of this selection.
d.test <- d |>
  filter(year == 2024) |>
  select(
    year, state, D_pv2p, latest_pollav_DEM,
    D_pv2p_lag1, D_pv2p_lag2, q2_gdp_growth,
    white, black, american_indian, asian_pacific_islander,
    unemployment, less_than_college, bachelors, graduate, hispanic,
    age_18_to_29, age_30_to_44, age_45_to_54,
    age_55_to_64, age_75plus
  ) |>
  distinct(state, year, .keep_all = TRUE)
## # A tibble: 27 × 21
## year state D_pv2p latest_pollav_DEM D_pv2p_lag1 D_pv2p_lag2 q2_gdp_growth
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2024 Arizona NA 46.4 NA NA 5.9
## 2 2024 Califor… NA 58.7 NA NA 5.4
## 3 2024 Colorado NA 52.8 NA NA 5.8
## 4 2024 Florida NA 44.5 NA NA 6.1
## 5 2024 Georgia NA 47.0 NA NA 6
## 6 2024 Indiana NA 39.2 NA NA 3.9
## 7 2024 Maine C… NA 43.7 NA NA NA
## 8 2024 Maryland NA 61.3 NA NA 5.1
## 9 2024 Massach… NA 59.7 NA NA 4.7
## 10 2024 Michigan NA 47.8 NA NA 6.5
## # ℹ 17 more rows
## # ℹ 14 more variables: white <dbl>, black <dbl>, american_indian <dbl>,
## # asian_pacific_islander <dbl>, unemployment <dbl>, less_than_college <dbl>,
## # bachelors <dbl>, graduate <dbl>, hispanic <dbl>, age_18_to_29 <dbl>,
## # age_30_to_44 <dbl>, age_45_to_54 <dbl>, age_55_to_64 <dbl>,
## # age_75plus <dbl>
# Step 1: Build `t`, a 2024 lookup table of lagged vote shares and lagged
# demographic variables.
# Within each state the rows from 2016 onwards are ordered by year, so
# `lag(x, 1)` on the 2024 row is the value from the preceding row for that
# state and `lag(x, 2)` the one before that.
# NOTE(review): this is positional; it assumes each state has consecutive
# election-year rows from 2016 onwards. Confirm against `d`.
t <- d |>
  filter(year >= 2016) |>
  arrange(year) |>
  group_by(state) |>
  mutate(
    D_pv2p_lag1 = lag(D_pv2p, 1),
    R_pv2p_lag1 = lag(R_pv2p, 1),
    D_pv2p_lag2 = lag(D_pv2p, 2),
    R_pv2p_lag2 = lag(R_pv2p, 2),
    white = lag(white, 1),
    black = lag(black, 1),
    american_indian = lag(american_indian, 1),
    asian_pacific_islander = lag(asian_pacific_islander, 1),
    less_than_college = lag(less_than_college, 1),
    bachelors = lag(bachelors, 1),
    graduate = lag(graduate, 1),
    hispanic = lag(hispanic, 1),
    age_18_to_29 = lag(age_18_to_29, 1),
    age_30_to_44 = lag(age_30_to_44, 1),
    age_45_to_54 = lag(age_45_to_54, 1),
    age_55_to_64 = lag(age_55_to_64, 1),
    age_65_to_74 = lag(age_65_to_74, 1),
    age_75plus = lag(age_75plus, 1)
  ) |>
  # Grouping was only needed for the per-state lags.
  ungroup() |>
  filter(year == 2024) |>
  select(
    state, year, D_pv2p,
    D_pv2p_lag1, R_pv2p_lag1, D_pv2p_lag2, R_pv2p_lag2,
    white, black, american_indian, asian_pacific_islander,
    less_than_college, bachelors, graduate, hispanic,
    age_18_to_29, age_30_to_44, age_45_to_54,
    age_55_to_64, age_65_to_74, age_75plus
  ) |>
  distinct(state, year, .keep_all = TRUE)
## # A tibble: 27 × 21
## # Groups: state [27]
## state year D_pv2p D_pv2p_lag1 R_pv2p_lag1 D_pv2p_lag2 R_pv2p_lag2 white
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Arizona 2024 NA 50.2 49.8 48.1 51.9 60.4
## 2 California 2024 NA 64.9 35.1 66.1 33.9 41.2
## 3 Colorado 2024 NA 56.9 43.1 52.7 47.3 70.7
## 4 Florida 2024 NA 48.3 51.7 49.4 50.6 57.7
## 5 Georgia 2024 NA 50.1 49.9 47.3 52.7 51.9
## 6 Indiana 2024 NA 41.8 58.2 39.9 60.1 77.2
## 7 Maine Cd 2 2024 NA NA NA NA NA NA
## 8 Maryland 2024 NA 67.0 33.0 64.0 36.0 48.7
## 9 Massachus… 2024 NA 67.1 32.9 64.7 35.3 69.6
## 10 Michigan 2024 NA 51.4 48.6 49.9 50.1 73.9
## # ℹ 17 more rows
## # ℹ 13 more variables: black <dbl>, american_indian <dbl>,
## # asian_pacific_islander <dbl>, less_than_college <dbl>, bachelors <dbl>,
## # graduate <dbl>, hispanic <dbl>, age_18_to_29 <dbl>, age_30_to_44 <dbl>,
## # age_45_to_54 <dbl>, age_55_to_64 <dbl>, age_65_to_74 <dbl>,
## # age_75plus <dbl>
# Step 2: Join `t` onto `d.test` so the 2024 rows carry lagged vote shares and
# demographic values. Columns present in both tables get a ".t" suffix on the
# `t` side; each original column is kept when present and otherwise filled
# from its ".t" counterpart. Columns that exist only in `t` (R_pv2p_lag1,
# R_pv2p_lag2, age_65_to_74) come through unsuffixed.
d.test <- d.test |>
  left_join(t, by = c("state", "year"), suffix = c("", ".t")) |>
  mutate(
    D_pv2p_lag1 = ifelse(is.na(D_pv2p_lag1), D_pv2p_lag1.t, D_pv2p_lag1),
    D_pv2p_lag2 = ifelse(is.na(D_pv2p_lag2), D_pv2p_lag2.t, D_pv2p_lag2),
    D_pv2p = ifelse(is.na(D_pv2p), D_pv2p.t, D_pv2p),
    # Include demographic variables with fallback values
    white = ifelse(is.na(white), white.t, white),
    black = ifelse(is.na(black), black.t, black),
    american_indian = ifelse(
      is.na(american_indian), american_indian.t, american_indian
    ),
    asian_pacific_islander = ifelse(
      is.na(asian_pacific_islander),
      asian_pacific_islander.t,
      asian_pacific_islander
    ),
    less_than_college = ifelse(
      is.na(less_than_college), less_than_college.t, less_than_college
    ),
    bachelors = ifelse(is.na(bachelors), bachelors.t, bachelors),
    graduate = ifelse(is.na(graduate), graduate.t, graduate),
    hispanic = ifelse(is.na(hispanic), hispanic.t, hispanic),
    age_18_to_29 = ifelse(is.na(age_18_to_29), age_18_to_29.t, age_18_to_29),
    age_30_to_44 = ifelse(is.na(age_30_to_44), age_30_to_44.t, age_30_to_44),
    age_45_to_54 = ifelse(is.na(age_45_to_54), age_45_to_54.t, age_45_to_54),
    age_55_to_64 = ifelse(is.na(age_55_to_64), age_55_to_64.t, age_55_to_64),
    age_75plus = ifelse(is.na(age_75plus), age_75plus.t, age_75plus)
  ) |>
  # Drop the helper ".t" columns once their values have been merged in.
  select(-ends_with(".t"))
## # A tibble: 27 × 24
## year state D_pv2p latest_pollav_DEM D_pv2p_lag1 D_pv2p_lag2 q2_gdp_growth
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2024 Arizona NA 46.4 50.2 48.1 5.9
## 2 2024 Califor… NA 58.7 64.9 66.1 5.4
## 3 2024 Colorado NA 52.8 56.9 52.7 5.8
## 4 2024 Florida NA 44.5 48.3 49.4 6.1
## 5 2024 Georgia NA 47.0 50.1 47.3 6
## 6 2024 Indiana NA 39.2 41.8 39.9 3.9
## 7 2024 Maine C… NA 43.7 NA NA NA
## 8 2024 Maryland NA 61.3 67.0 64.0 5.1
## 9 2024 Massach… NA 59.7 67.1 64.7 4.7
## 10 2024 Michigan NA 47.8 51.4 49.9 6.5
## # ℹ 17 more rows
## # ℹ 17 more variables: white <dbl>, black <dbl>, american_indian <dbl>,
## # asian_pacific_islander <dbl>, unemployment <dbl>, less_than_college <dbl>,
## # bachelors <dbl>, graduate <dbl>, hispanic <dbl>, age_18_to_29 <dbl>,
## # age_30_to_44 <dbl>, age_45_to_54 <dbl>, age_55_to_64 <dbl>,
## # age_75plus <dbl>, R_pv2p_lag1 <dbl>, R_pv2p_lag2 <dbl>, age_65_to_74 <dbl>
# OLS fit of Democratic two-party vote share on the latest poll average, two
# lagged vote shares, Q2 GDP growth, and the race, education and age
# composition variables, estimated on the training rows.
reg.ols <- lm(
  D_pv2p ~ latest_pollav_DEM + D_pv2p_lag1 + D_pv2p_lag2 + q2_gdp_growth +
    white + black + american_indian + asian_pacific_islander +
    less_than_college + bachelors + graduate + hispanic +
    age_18_to_29 + age_30_to_44 + age_45_to_54 + age_55_to_64 +
    age_65_to_74 + age_75plus,
  data = d.train
)
## Call:
## lm(formula = D_pv2p ~ latest_pollav_DEM + D_pv2p_lag1 + D_pv2p_lag2 +
## q2_gdp_growth + white + black + american_indian + asian_pacific_islander +
## less_than_college + bachelors + graduate + hispanic + age_18_to_29 +
## age_30_to_44 + age_45_to_54 + age_55_to_64 + age_65_to_74 +
## age_75plus, data = d.train)
## Residuals:
## Min 1Q Median 3Q Max
## -4.4888 -0.5383 0.0214 0.5888 2.1488
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10.00816 32.84308 -0.305 0.762488
## latest_pollav_DEM 0.54305 0.10863 4.999 1.85e-05 ***
## D_pv2p_lag1 0.03558 0.13000 0.274 0.786037
## D_pv2p_lag2 0.38624 0.10030 3.851 0.000513 ***
## q2_gdp_growth 0.02220 0.09086 0.244 0.808507
## white 0.31494 0.23272 1.353 0.185166
## black 0.26354 0.21702 1.214 0.233225
## american_indian 0.40327 0.33023 1.221 0.230664
## asian_pacific_islander 0.62921 0.36628 1.718 0.095200 .
## less_than_college -1.20374 1.34072 -0.898 0.375780
## bachelors -0.83998 1.37430 -0.611 0.545250
## graduate -0.55304 1.39483 -0.396 0.694291
## hispanic 0.24277 0.12554 1.934 0.061753 .
## age_18_to_29 1.45120 1.42208 1.020 0.314928
## age_30_to_44 0.22489 1.41381 0.159 0.874585
## age_45_to_54 2.00334 1.37929 1.452 0.155822
## age_55_to_64 -1.11776 1.84174 -0.607 0.548068
## age_65_to_74 2.71133 1.33194 2.036 0.049883 *
## age_75plus -0.22373 1.55950 -0.143 0.886800
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Residual standard error: 1.378 on 33 degrees of freedom
## Multiple R-squared: 0.9813, Adjusted R-squared: 0.9712
## F-statistic: 96.41 on 18 and 33 DF, p-value: < 2.2e-16
# Predicted 2024 Democratic two-party share for every row of `d.test`.
pred.ols.dem <- reg.ols |>
  predict(newdata = d.test)
## 1 2 3 4 5 6 7 8
## 49.36694 66.02354 56.50472 45.34069 48.91996 38.15609 NA 64.60888
## 9 10 11 12 13 14 15 16
## 66.70648 47.78016 50.26365 38.17351 37.72057 37.82169 49.43660 50.82756
## 17 18 19 20 21 22 23 24
## 53.59671 61.88340 48.35522 42.23250 47.39012 41.76571 47.25647 40.13865
## 25 26 27
## 54.53748 59.49722 47.00644
# Elastic-net fit on the same predictors as the OLS model.
# Assumes d.train and d.test are the training and testing datasets.
predictor_cols <- c(
  "latest_pollav_DEM", "D_pv2p_lag1", "D_pv2p_lag2",
  "white", "black", "american_indian", "asian_pacific_islander",
  "less_than_college", "bachelors", "graduate", "hispanic",
  "age_18_to_29", "age_30_to_44", "age_45_to_54",
  "age_55_to_64", "age_65_to_74", "age_75plus",
  "q2_gdp_growth"
)
# Prepare the training data
x_train <- as.matrix(d.train[, predictor_cols])
y_train <- d.train$D_pv2p
# Standardize predictors; `scale()` records the column means and standard
# deviations it used as attributes on the result.
x_train <- scale(x_train)
# Prepare the testing data. It must be transformed with the TRAINING centre
# and scale: standardising it with its own mean/sd would put the test rows on
# a different scale from the one the coefficients were estimated on.
x_test <- as.matrix(d.test[, predictor_cols])
x_test <- scale(
  x_test,
  center = attr(x_train, "scaled:center"),
  scale = attr(x_train, "scaled:scale")
)
# Define a sequence of alpha values to try
alphas <- seq(0, 1, by = 0.1)
# For each alpha, cross-validate over lambda and record the lambda with the
# smallest mean cross-validated error together with that error.
results <- lapply(alphas, function(a) {
  cv_fit <- cv.glmnet(x_train, y_train, alpha = a)
  data.frame(alpha = a, lambda = cv_fit$lambda.min, cvm = min(cv_fit$cvm))
})
# Convert results to a data frame and pick the alpha/lambda pair with the
# lowest cross-validated error.
results_df <- do.call(rbind, results)
best_params <- results_df[which.min(results_df$cvm), ]
best_alpha <- best_params$alpha
best_lambda <- best_params$lambda
final_model <- glmnet(x_train, y_train, alpha = best_alpha, lambda = best_lambda)
# Make predictions using the Elastic Net model
predictions <- predict(final_model, newx = x_test, s = best_lambda)
pred_elastic_net <- as.vector(predictions)
# `results` is reused here for the per-state prediction table.
results <- data.frame(
  state = d.test$state,
  predicted_D_pv2p = pred_elastic_net,
  predicted_winner = ifelse(pred_elastic_net > 50, "Democrat", "Republican")
)
# Filter results for specific states
final_results <- results |>
  filter(state %in% c(
    "Arizona", "Georgia", "Michigan", "Nevada",
    "North Carolina", "Pennsylvania", "Wisconsin"
  ))
## state predicted_D_pv2p predicted_winner
## 1 Arizona 50.55875 Democrat
## 2 Georgia 50.32704 Democrat
## 3 Michigan 49.31514 Republican
## 4 Nevada 50.70580 Democrat
## 5 North Carolina 49.75490 Republican
## 6 Pennsylvania 48.58076 Republican
## 7 Wisconsin 48.31193 Republican
# Pairwise correlations between the model's predictors on the training rows.
cor_vars <- c(
  "latest_pollav_DEM", "D_pv2p_lag1", "D_pv2p_lag2",
  "white", "black", "american_indian", "asian_pacific_islander",
  "less_than_college", "bachelors", "graduate", "hispanic",
  "age_18_to_29", "age_30_to_44", "age_45_to_54",
  "age_55_to_64", "age_65_to_74", "age_75plus",
  "q2_gdp_growth"
)
cor_matrix <- cor(d.train[, cor_vars])
## latest_pollav_DEM D_pv2p_lag1 D_pv2p_lag2 white
## latest_pollav_DEM 1.0000000 0.83392805 0.70218567 -0.48180317
## D_pv2p_lag1 0.8339280 1.00000000 0.92244118 -0.44992859
## D_pv2p_lag2 0.7021857 0.92244118 1.00000000 -0.29780877
## white -0.4818032 -0.44992859 -0.29780877 1.00000000
## black 0.1933139 0.14117914 0.10962505 -0.48727823
## american_indian -0.1049692 -0.04590781 -0.04605956 -0.11518521
## asian_pacific_islander 0.6391178 0.67470872 0.57017516 -0.60471235
## less_than_college -0.4005134 -0.26889920 -0.13748743 0.02410127
## bachelors 0.5102479 0.36843374 0.29998466 0.19229910
## graduate 0.7545521 0.67563547 0.59834530 -0.19100817
## hispanic 0.1663259 0.21975390 0.09455487 -0.68099441
## age_18_to_29 -0.1962627 -0.19739595 -0.31557399 -0.12548014
## age_30_to_44 0.1590385 0.19637900 0.04249750 -0.51831317
## age_45_to_54 0.2311597 0.54264180 0.62415381 -0.03657881
## age_55_to_64 0.2764114 0.27901154 0.43490082 0.34085812
## age_65_to_74 0.2146674 0.06350367 0.14573155 0.11489596
## age_75plus 0.1555921 0.15512982 0.26179444 0.21295083
## q2_gdp_growth -0.3932084 -0.01743752 -0.01706968 0.11954415
## black american_indian asian_pacific_islander
## latest_pollav_DEM 0.19331390 -0.1049692038 0.63911778
## D_pv2p_lag1 0.14117914 -0.0459078141 0.67470872
## D_pv2p_lag2 0.10962505 -0.0460595637 0.57017516
## white -0.48727823 -0.1151852123 -0.60471235
## black 1.00000000 -0.4197600789 0.05622805
## american_indian -0.41976008 1.0000000000 -0.20998016
## asian_pacific_islander 0.05622805 -0.2099801618 1.00000000
## less_than_college 0.06720393 0.1959565836 -0.35263269
## bachelors -0.15391544 -0.2609137668 0.27150794
## graduate 0.17903018 -0.1650380170 0.38614431
## hispanic -0.21197723 0.4972419477 0.37501039
## age_18_to_29 -0.17133511 0.0258686401 0.18817228
## age_30_to_44 0.05947731 -0.1232637144 0.58541806
## age_45_to_54 0.38936719 -0.3899164222 0.16889804
## age_55_to_64 0.02785193 0.0009966614 -0.22602836
## age_65_to_74 0.01146534 0.2341464921 -0.29778231
## age_75plus -0.04135942 0.0785925383 -0.24307132
## q2_gdp_growth -0.02835996 0.0054563007 -0.07637543
## less_than_college bachelors graduate hispanic
## latest_pollav_DEM -0.40051344 0.51024790 0.75455206 0.16632587
## D_pv2p_lag1 -0.26889920 0.36843374 0.67563547 0.21975390
## D_pv2p_lag2 -0.13748743 0.29998466 0.59834530 0.09455487
## white 0.02410127 0.19229910 -0.19100817 -0.68099441
## black 0.06720393 -0.15391544 0.17903018 -0.21197723
## american_indian 0.19595658 -0.26091377 -0.16503802 0.49724195
## asian_pacific_islander -0.35263269 0.27150794 0.38614431 0.37501039
## less_than_college 1.00000000 -0.77506811 -0.70719759 0.06057206
## bachelors -0.77506811 1.00000000 0.72786388 -0.26957882
## graduate -0.70719759 0.72786388 1.00000000 -0.08327055
## hispanic 0.06057206 -0.26957882 -0.08327055 1.00000000
## age_18_to_29 -0.40190802 -0.03134756 -0.03546023 0.28574164
## age_30_to_44 -0.45973842 0.15669127 0.09768318 0.45927148
## age_45_to_54 0.11172642 0.14807754 0.31779866 -0.25132395
## age_55_to_64 0.27799876 0.24849211 0.27268675 -0.44865171
## age_65_to_74 0.47058418 0.02422522 0.06270197 -0.16164107
## age_75plus 0.43024997 0.02368186 0.09820635 -0.18274175
## q2_gdp_growth -0.01436552 -0.15902956 -0.16325610 0.02800824
## age_18_to_29 age_30_to_44 age_45_to_54 age_55_to_64
## latest_pollav_DEM -0.19626266 0.15903847 0.23115975 0.2764113834
## D_pv2p_lag1 -0.19739595 0.19637900 0.54264180 0.2790115414
## D_pv2p_lag2 -0.31557399 0.04249750 0.62415381 0.4349008219
## white -0.12548014 -0.51831317 -0.03657881 0.3408581233
## black -0.17133511 0.05947731 0.38936719 0.0278519275
## american_indian 0.02586864 -0.12326371 -0.38991642 0.0009966614
## asian_pacific_islander 0.18817228 0.58541806 0.16889804 -0.2260283587
## less_than_college -0.40190802 -0.45973842 0.11172642 0.2779987605
## bachelors -0.03134756 0.15669127 0.14807754 0.2484921124
## graduate -0.03546023 0.09768318 0.31779866 0.2726867513
## hispanic 0.28574164 0.45927148 -0.25132395 -0.4486517054
## age_18_to_29 1.00000000 0.53539057 -0.39620648 -0.7799927132
## age_30_to_44 0.53539057 1.00000000 -0.09254776 -0.7091768651
## age_45_to_54 -0.39620648 -0.09254776 1.00000000 0.4932061691
## age_55_to_64 -0.77999271 -0.70917687 0.49320617 1.0000000000
## age_65_to_74 -0.73072234 -0.68095319 0.09080440 0.7888722857
## age_75plus -0.68796830 -0.75537744 0.22130697 0.7228140433
## q2_gdp_growth 0.28873057 0.24663837 0.29868296 -0.3266608698
## age_65_to_74 age_75plus q2_gdp_growth
## latest_pollav_DEM 0.21466738 0.15559207 -0.393208389
## D_pv2p_lag1 0.06350367 0.15512982 -0.017437516
## D_pv2p_lag2 0.14573155 0.26179444 -0.017069682
## white 0.11489596 0.21295083 0.119544149
## black 0.01146534 -0.04135942 -0.028359963
## american_indian 0.23414649 0.07859254 0.005456301
## asian_pacific_islander -0.29778231 -0.24307132 -0.076375427
## less_than_college 0.47058418 0.43024997 -0.014365516
## bachelors 0.02422522 0.02368186 -0.159029557
## graduate 0.06270197 0.09820635 -0.163256104
## hispanic -0.16164107 -0.18274175 0.028008238
## age_18_to_29 -0.73072234 -0.68796830 0.288730566
## age_30_to_44 -0.68095319 -0.75537744 0.246638368
## age_45_to_54 0.09080440 0.22130697 0.298682957
## age_55_to_64 0.78887229 0.72281404 -0.326660870
## age_65_to_74 1.00000000 0.81187914 -0.486376631
## age_75plus 0.81187914 1.00000000 -0.229458740
## q2_gdp_growth -0.48637663 -0.22945874 1.000000000
# Variance inflation factors for the OLS specification, as a check on
# multicollinearity among the predictors.
vif_model <- vif(
  lm(
    D_pv2p ~ latest_pollav_DEM + D_pv2p_lag1 + D_pv2p_lag2 + q2_gdp_growth +
      white + black + american_indian + asian_pacific_islander +
      less_than_college + bachelors + graduate + hispanic +
      age_18_to_29 + age_30_to_44 + age_45_to_54 + age_55_to_64 +
      age_65_to_74 + age_75plus,
    data = d.train
  )
)
## latest_pollav_DEM D_pv2p_lag1 D_pv2p_lag2
## 18.63658 31.88611 16.81634
## q2_gdp_growth white black
## 6.15531 219.05611 88.92473
## american_indian asian_pacific_islander less_than_college
## 13.60474 35.86533 581.29787
## bachelors graduate hispanic
## 193.21365 197.15273 62.88381
## age_18_to_29 age_30_to_44 age_45_to_54
## 33.14366 50.42852 43.89433
## age_55_to_64 age_65_to_74 age_75plus
## 113.40627 59.06835 60.93673
# Per-state 2024 call from the OLS predictions: Democratic share, implied
# Republican share (100 minus it), the predicted winner, and the electoral
# college columns joined in from `d_ec`.
win_pred <- local({
  shares <- data.frame(
    state = d.test$state,
    year = rep(2024, length(d.test$state)),
    simp_pred_dem = pred.ols.dem,
    simp_pred_rep = 100 - pred.ols.dem
  )
  called <- mutate(
    shares,
    winner = ifelse(simp_pred_dem > simp_pred_rep, "Democrat", "Republican")
  )
  left_join(called, d_ec, by = c("state", "year"))
})
## state year simp_pred_dem simp_pred_rep winner stateab electors
## 1 Arizona 2024 49.36694 50.63306 Republican AZ 11
## 2 California 2024 66.02354 33.97646 Democrat CA 54
## 3 Colorado 2024 56.50472 43.49528 Democrat CO 10
## 4 Florida 2024 45.34069 54.65931 Republican FL 30
## 5 Georgia 2024 48.91996 51.08004 Republican GA 16
## 6 Indiana 2024 38.15609 61.84391 Republican IN 11
## 7 Maine Cd 2 2024 NA NA <NA> <NA> NA
## 8 Maryland 2024 64.60888 35.39112 Democrat MD 10
## 9 Massachusetts 2024 66.70648 33.29352 Democrat MA 11
## 10 Michigan 2024 47.78016 52.21984 Republican MI 15
## 11 Minnesota 2024 50.26365 49.73635 Democrat MN 10
## 12 Missouri 2024 38.17351 61.82649 Republican MO 10
## 13 Montana 2024 37.72057 62.27943 Republican MT 4
## 14 Nebraska 2024 37.82169 62.17831 Republican NE 5
## 15 Nevada 2024 49.43660 50.56340 Republican NV 6
## 16 New Hampshire 2024 50.82756 49.17244 Democrat NH 4
## 17 New Mexico 2024 53.59671 46.40329 Democrat NM 5
## 18 New York 2024 61.88340 38.11660 Democrat NY 28
## 19 North Carolina 2024 48.35522 51.64478 Republican NC 16
## 20 Ohio 2024 42.23250 57.76750 Republican OH 17
## 21 Pennsylvania 2024 47.39012 52.60988 Republican PA 19
## 22 South Carolina 2024 41.76571 58.23429 Republican SC 9
## 23 Texas 2024 47.25647 52.74353 Republican TX 40
## 24 Utah 2024 40.13865 59.86135 Republican UT 6
## 25 Virginia 2024 54.53748 45.46252 Democrat VA 13
## 26 Washington 2024 59.49722 40.50278 Democrat WA 12
## 27 Wisconsin 2024 47.00644 52.99356 Republican WI 10
# List the states the OLS model calls for the Democrat.
win_pred |>
  filter(winner == "Democrat") |>
  select(state)
## state
## 1 California
## 2 Colorado
## 3 Maryland
## 4 Massachusetts
## 5 Minnesota
## 6 New Hampshire
## 7 New Mexico
## 8 New York
## 9 Virginia
## 10 Washington
# List the states the OLS model calls for the Republican.
win_pred |>
  filter(winner == "Republican") |>
  select(state)
## state
## 1 Arizona
## 2 Florida
## 3 Georgia
## 4 Indiana
## 5 Michigan
## 6 Missouri
## 7 Montana
## 8 Nebraska
## 9 Nevada
## 10 North Carolina
## 11 Ohio
## 12 Pennsylvania
## 13 South Carolina
## 14 Texas
## 15 Utah
## 16 Wisconsin
# Count states and total electors per predicted winner.
summarize(
  group_by(win_pred, winner),
  n = n(),
  ec = sum(electors)
)
## # A tibble: 3 × 3
## winner n ec
## <chr> <int> <dbl>
## 1 Democrat 10 157
## 2 Republican 16 225
## 3 <NA> 1 NA
# Create data set to summarize winners and EC: predicted shares per state,
# the implied winner, and the electoral college columns from `d_ec`.
win_pred <- left_join(
  mutate(
    data.frame(
      state = d.test$state,
      year = rep(2024, length(d.test$state)),
      simp_pred_dem = pred.ols.dem,
      simp_pred_rep = 100 - pred.ols.dem
    ),
    winner = ifelse(simp_pred_dem > simp_pred_rep, "Democrat", "Republican")
  ),
  d_ec,
  by = c("state", "year")
)
#### Monte Carlo simulations used to build intervals around the predictions
# Each simulation adds an independent normal error, with standard deviation
# equal to the regression's residual standard error, to every state's point
# prediction.
# Set the number of simulations
m <- 1e4 # 10,000 simulations
residual_se <- summary(reg.ols)$sigma
pred_dem <- predict(reg.ols, newdata = d.test)
n_states <- nrow(d.test)
# Preallocate one row per (simulation, state); states repeat in `d.test` order.
pred.mat <- data.frame(
  state = rep(d.test$state, times = m),
  year = rep(2024, times = m * n_states),
  simp_pred_dem = numeric(m * n_states),
  simp_pred_rep = numeric(m * n_states)
)
j <- 1 # first row of the current simulation's slice of `pred.mat`
for (i in seq_len(m)) {
  if (i %% 1000 == 0) {
    print(paste("Simulation", i))
  }
  simulated_errors <- rnorm(n_states, mean = 0, sd = residual_se)
  pred_dem_sim <- pred_dem + simulated_errors
  # Vote shares are percentages, so clip to [0, 100].
  pred_dem_sim <- pmin(pmax(pred_dem_sim, 0), 100)
  pred_rep_sim <- 100 - pred_dem_sim
  idx <- seq.int(from = j, length.out = n_states)
  pred.mat$simp_pred_dem[idx] <- pred_dem_sim
  pred.mat$simp_pred_rep[idx] <- pred_rep_sim
  j <- j + n_states
}
pred.mat <- pred.mat |>
  mutate(
    winner = ifelse(simp_pred_dem > simp_pred_rep, "Democrat", "Republican")
  )
# Per-state mean and standard deviation across simulations, with
# mean +/- 1.96 * sd bounds, and the winner implied by the mean shares.
electoral_outcomes <- pred.mat |>
  group_by(state) |>
  summarize(
    mean_dem = mean(simp_pred_dem),
    mean_rep = mean(simp_pred_rep),
    sd_dem = sd(simp_pred_dem),
    sd_rep = sd(simp_pred_rep),
    lower_dem = mean_dem - 1.96 * sd_dem,
    upper_dem = mean_dem + 1.96 * sd_dem,
    lower_rep = mean_rep - 1.96 * sd_rep,
    upper_rep = mean_rep + 1.96 * sd_rep
  ) |>
  mutate(
    winner = ifelse(mean_dem > mean_rep, "Democrat", "Republican")
  )
# The simulation only covers states with 2024 polling, which includes all the
# swing states. The remaining states are filled in below from lagged vote
# share.
# List of all states (the 50 names in base R's `state.name`).
all_states <- state.name
simulated_states <- unique(electoral_outcomes$state)
missing_states <- all_states[!(all_states %in% simulated_states)]
## [1] "Alabama" "Alaska" "Arkansas" "Connecticut"
## [5] "Delaware" "Hawaii" "Idaho" "Illinois"
## [9] "Iowa" "Kansas" "Kentucky" "Louisiana"
## [13] "Maine" "Mississippi" "New Jersey" "North Dakota"
## [17] "Oklahoma" "Oregon" "Rhode Island" "South Dakota"
## [21] "Tennessee" "Vermont" "West Virginia" "Wyoming"
# Create a data frame for the states without a simulated outcome.
# The most recent election before 2024 is 2020, so take the 2020 row's own
# two-party shares. That row's `*_lag1` columns hold the 2016 result instead.
# The shares are stored under the `*_lag1` names because, relative to 2024,
# they are the one-election lag.
# Since these are not swing states, the outcome of the last election is used
# to reflect whether each is a blue or red state.
missing_data <- d_state_popvote |>
  filter(state %in% missing_states, year == 2020) |>
  select(state, D_pv2p_lag1 = D_pv2p, R_pv2p_lag1 = R_pv2p) |>
  mutate(
    winner = ifelse(D_pv2p_lag1 > R_pv2p_lag1, "Democrat", "Republican")
  )
# 2024 elector counts per state.
ec_2024 <- d_ec |>
  filter(year == 2024) |>
  select(state, electors)
# Combine datasets: simulated states plus back-filled states, with electors.
all_states_pred <- bind_rows(electoral_outcomes, missing_data) |>
  left_join(ec_2024, by = "state")
# Total electors per predicted winner. Rows with a missing winner or missing
# electors form or propagate an NA group.
winner <- all_states_pred |>
  group_by(winner) |>
  summarize(total_electors = sum(electors))
## # A tibble: 3 × 2
## winner total_electors
## <chr> <dbl>
## 1 Democrat 223
## 2 Republican 312
## 3 <NA> NA
# DC is not included in the state list, so add its 3 electors to the
# Democratic electoral college count.
# Totals are looked up by party label rather than by row position, so the
# result does not depend on the row order of `winner`.
election_results <- tibble(
  party = c("Democrat", "Republican"),
  total_electors = c(
    winner$total_electors[match("Democrat", winner$winner)] + 3,
    winner$total_electors[match("Republican", winner$winner)]
  )
)
party | total_electors |
Democrat | 226 |
Republican | 312 |
# Map of the predicted winner in each state: blue for Democrat, red for
# Republican.
plot_usmap(
  data = all_states_pred,
  regions = "states",
  values = "winner"
) +
  scale_fill_manual(
    name = "Predicted Winner",
    values = c("Democrat" = "blue", "Republican" = "red")
  ) +
  labs(title = "Electoral College Predictions")

# How close is this race? Pull the simulated means and interval bounds for the
# seven listed battleground states.
electoral_outcomes_close <- electoral_outcomes |>
  filter(
    state %in% c(
      "Arizona", "Georgia", "Michigan", "Nevada",
      "North Carolina", "Pennsylvania", "Wisconsin"
    )
  ) |>
  select(
    state,
    mean_dem, mean_rep,
    lower_dem, upper_dem,
    lower_rep, upper_rep,
    winner
  )
state | mean_dem | mean_rep | lower_dem | upper_dem | lower_rep | upper_rep | winner |
Arizona | 49.37393 | 50.62607 | 46.69963 | 52.04824 | 47.95176 | 53.30037 | Republican |
Georgia | 48.92008 | 51.07992 | 46.18223 | 51.65792 | 48.34208 | 53.81777 | Republican |
Michigan | 47.76604 | 52.23396 | 45.05198 | 50.48009 | 49.51991 | 54.94802 | Republican |
Nevada | 49.43468 | 50.56532 | 46.74295 | 52.12642 | 47.87358 | 53.25705 | Republican |
North Carolina | 48.37044 | 51.62956 | 45.66500 | 51.07588 | 48.92412 | 54.33500 | Republican |
Pennsylvania | 47.40134 | 52.59866 | 44.72298 | 50.07971 | 49.92029 | 55.27702 | Republican |
Wisconsin | 47.00512 | 52.99488 | 44.29103 | 49.71922 | 50.28078 | 55.70897 | Republican |