#if (!require("easypackages")) install.packages("easypackages")
library(easypackages)
libraries('dplyr', 'tidyverse', 'ggplot2', 'glm2', 'VGAM', 'multcomp', 'stringr', 'readxl', 'Lahman', 'kableExtra', 'GGally', 'modelr', 'broom', 'DiagrammeR', "PerformanceAnalytics", 'finalfit', "patchwork", "flextable", "dvmisc", "SurrogateRsq")
Note: %>% this symbol is known as ‘pipe’. Basically, it allows to chain operations, which helps to reduce the code needed to perform some actions. Also, inside the dplyr package, in tidyverse, are most of the ‘verbs’ you will see in data manipulation: select, filter, group_by, summarise, mutate, and arrange.
If you are interested in these functions (very useful if you are not good with base r), you can read this material from Professor Brandon LeBeau (EMS program), with whom I learned most of what I know about r via RStudio:
https://github.com/lebebr01/psqf_6250/tree/master/Syntax/R/v2
This course has a website as well: https://psqf6250.brandonlebeau.org/rcode/, however, Data Manipulation unit is missing.
registration <- readr::read_csv2("C:/Users/geral/OneDrive - University of Iowa/CIDCIE/1. Bases de Datos/PSU/PSU 2019 - 2021/A_Inscritos_puntajes/A_INSCRITOS_PUNTAJES_PDT_2021_PUB_MRUN.csv")
socioeconomic <- readr::read_csv2("C:/Users/geral/OneDrive - University of Iowa/CIDCIE/1. Bases de Datos/PSU/PSU 2019 - 2021/B_Socioeconomico/B_SOCIOECONOMICO_DOMICILIO_PDT_2021_PUB_MRUN.csv")
application <-readr::read_csv2("C:/Users/geral/OneDrive - University of Iowa/CIDCIE/1. Bases de Datos/PSU/PSU 2019 - 2021/C_Postulacion_Seleccion/C_POSTULANTES_SELECCION_PDT_2021_PUB_MRUN.csv")
admission <- readr::read_csv2("C:/Users/geral/OneDrive - University of Iowa/CIDCIE/1. Bases de Datos/PSU/PSU 2019 - 2021/D_Matricula_MRUN/D_MATRICULA_PDT__2021_PUB_MRUN.csv")
# Select important variables from first stage
a <- registration%>%
dplyr::select(MRUN, COD_SEXO, RBD, CODIGO_ENS, RAMA_EDUCACIONAL,
DEPENDENCIA, PROMEDIO_NOTAS, PTJE_NEM, PTJE_RANKING,
CLEC_ACTUAL, MATE_ACTUAL, PUNTAJES_PROCESO)%>%
dplyr::filter(PUNTAJES_PROCESO == 1 & CODIGO_ENS %in% c('310', '410', '510', '610'))%>% #filters
dplyr::select(MRUN, COD_SEXO, RBD, RAMA_EDUCACIONAL, DEPENDENCIA,
PROMEDIO_NOTAS, PTJE_NEM, PTJE_RANKING, CLEC_ACTUAL,MATE_ACTUAL)%>%
mutate(
gender = ifelse(COD_SEXO == 2, 1,0),
type_hs = ifelse(RAMA_EDUCACIONAL == 'H1', 0,1),
sector_sub = ifelse(DEPENDENCIA == 3, 1, 0),
sector_priv = ifelse(DEPENDENCIA == 4, 1, 0))%>%
dplyr::select(MRUN,RBD,PROMEDIO_NOTAS, PTJE_NEM, PTJE_RANKING, CLEC_ACTUAL,MATE_ACTUAL,gender, type_hs, sector_sub, sector_priv)
colnames(a) <- c('MRUN', 'RBD', 'gpa', 'NEM', 'RKG', 'LEN', 'MATH', 'gender', 'type_hs','sector_sub', 'sector_priv')
# Second stage
b <- socioeconomic%>%
dplyr::select(MRUN, INGRESO_PERCAPITA_GRUPO_FA, EDUCACION_MADRE, EDUCACION_PADRE)
ab <- left_join(a, b, by = 'MRUN')
ab <- ab%>%
mutate(
inc = INGRESO_PERCAPITA_GRUPO_FA,
ed_m = EDUCACION_MADRE,
ed_d = EDUCACION_PADRE,
ed_dad = ifelse(ed_d >= 0 & ed_d <=5, 0,
ifelse(ed_d >= 6 & ed_d <= 9, 1,
ifelse(ed_d == 13, NA, 2))),
ed_mom = ifelse(ed_m >= 1 & ed_m <= 5, 0,
ifelse(ed_m >= 6 & ed_m <= 9, 1,
ifelse(ed_m == 13, NA, 2))),
income = ifelse(inc == 1|inc == 2, 0,
ifelse(inc == 3|inc == 4, 1,
ifelse(inc == 5|inc == 6, 2,
ifelse(inc == 7|inc == 8, 3, 4)))))%>%
dplyr::select('MRUN', 'gpa', 'NEM', 'RKG', 'LEN', 'MATH', 'gender', 'type_hs', 'sector_sub', 'sector_priv', 'ed_dad', 'ed_mom', 'income')
# Third stage
c <- application%>%
dplyr::select(MRUN, SITUACION_POSTULANTE)%>%
mutate(
appl = ifelse(SITUACION_POSTULANTE == 'P', 1,0))%>%
dplyr::select(MRUN, appl)
abc <- left_join(ab, c, by = 'MRUN')
# Fourth stage
d <- admission%>%
dplyr::select(MRUN, VIA_ADMISION)%>%
mutate(
admitted = ifelse(VIA_ADMISION > 0, 1, 0))%>%
filter(!is.na(MRUN))%>%
dplyr::select(MRUN, admitted)%>%
unique()
# Data
abcd <- left_join(abc, d, by = 'MRUN')
colnames(abcd) <- c('ID', 'GPA', 'NEM', 'RKG', 'LEN', 'MATH', 'GENDER', 'TYPE', 'SECTOR_SUB', 'SECTOR_PRIV', 'ED_DAD', 'ED_MOM', 'INCOME', 'APPLY', 'ADM')
# New variables for the coming analysis
abcd <- abcd%>%
mutate(
NEM = ifelse(NEM == 0, NA, NEM),
RKG = ifelse(RKG == 0, NA, RKG),
LEN = ifelse(LEN == 0, NA, LEN),
MATH = ifelse(MATH == 0, NA, MATH),
ED_SECONDARY = ifelse(ED_MOM == 1, 1, 0),
ED_TERTIARY = ifelse(ED_MOM == 2, 1, 0),
INCOME_Q2 = ifelse(INCOME == 1, 1,0),
INCOME_Q3 = ifelse(INCOME == 2, 1,0),
INCOME_Q4 = ifelse(INCOME == 3, 1,0),
INCOME_Q5 = ifelse(INCOME == 4, 1,0),
APPLY = ifelse(is.na(APPLY), 0, APPLY),
ADM = ifelse(is.na(ADM), 0, 1))
# Labels for columns
data <- abcd%>%
dplyr::select('GPA', 'NEM', 'RKG', 'LEN', 'MATH', 'GENDER', 'TYPE', 'SECTOR_SUB', 'SECTOR_PRIV', 'INCOME_Q2', 'INCOME_Q3', 'INCOME_Q4', 'INCOME_Q5', 'APPLY', 'ADM', 'ED_SECONDARY', 'ED_TERTIARY', 'INCOME')%>%
filter(LEN > 0 & MATH > 0)
# Some variables must be factors to be included in the ggplot plots without problems. We did this after realizing the usefulness of creating variables with certain characteristics instead of doing it in the ggplot function.
data <- data %>%
mutate(
Sector = ifelse(SECTOR_SUB == 1, 1,
ifelse(SECTOR_PRIV == 1,2,0)),
Sector = factor(Sector),
Sector = fct_recode(Sector,
'Public' = "0",
'Subsidized Private' = "1",
'Private'= "2"),
Fam_Income = factor(INCOME),
Fam_Income = fct_recode(Fam_Income,
'Income Q1' = "0",
'Income Q2' = "1",
'Income Q3'= "2",
'Income Q4' = '3',
'Income Q5' = '4'),
Mother_Ed = ifelse(ED_SECONDARY == 1, 1,
ifelse(ED_TERTIARY == 1,2,0)),
Mother_Ed = factor(Mother_Ed),
Mother_Ed = fct_recode(Mother_Ed,
'Primary Education' = "0",
'Secondary Education' = "1",
'Tertiary Education'= "2"))
# write.table(data, file = "adm_data.csv", sep = ",") we save the latest data set
rm(a, ab, abc, abcd, b, c, d) # we remove these objects from the environment
kable_styling(kable(data.frame(head(data, 10)), digits = 3,
caption = 'Table 1: Dataset'),
bootstrap_options = c('hover', 'responsive', 'bordered'),
full_width = F, position = 'center', font_size =12) %>%
scroll_box(height = '250px')
GPA | NEM | RKG | LEN | MATH | GENDER | TYPE | SECTOR_SUB | SECTOR_PRIV | INCOME_Q2 | INCOME_Q3 | INCOME_Q4 | INCOME_Q5 | APPLY | ADM | ED_SECONDARY | ED_TERTIARY | INCOME | Sector | Fam_Income | Mother_Ed |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5.40 | 496 | 497 | 675 | 674 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | Subsidized Private | Income Q2 | Secondary Education |
5.43 | 504 | 504 | 377 | 431 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 3 | Public | Income Q4 | Primary Education |
5.70 | 559 | 568 | 304 | 475 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | Public | Income Q1 | Secondary Education |
5.65 | 550 | 553 | 562 | 535 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 4 | Public | Income Q5 | Primary Education |
5.83 | 589 | 595 | 366 | 319 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Public | Income Q1 | Primary Education |
4.95 | 391 | 391 | 466 | 443 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 3 | Public | Income Q4 | Primary Education |
6.83 | 803 | 850 | 495 | 540 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | Public | Income Q3 | Secondary Education |
5.70 | 559 | 574 | 355 | 509 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | Public | Income Q2 | Secondary Education |
6.03 | 631 | 631 | 440 | 595 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 1 | 4 | Private | Income Q5 | Tertiary Education |
5.48 | 513 | 531 | 209 | 516 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Public | Income Q2 | Primary Education |
codebook <- data.frame(cbind(
Variable = c('GPA','NEM', 'RKG', 'LEN', 'MATH', 'GENDER', 'TYPE', 'SECTOR', 'ED_PARENTS', 'INCOME_Q', 'APPLY', 'ADM'),
Description = c('High school grade point average.', 'Standardized high school grade point average.', 'Relative position of the student in his or her school of graduation', 'Standardized score in the Language PSU', 'Standardized score in the Mathematics PSU', 'gender of student', 'Type of high school, Technical-professional or Humanist, from which the student graduated', "Sector to which the student's school of graduation belongs", 'Higher educational level of the father or mother',"Quintile of per capita income to which the student's family belongs (1 USD = 752.9 CLP)", "Student's application to a university", 'Admission of the student to a university'),
Type = c('Continuous','Continuous','Continuous','Continuous','Continuous','Binary (0 = Male, 1 = Female)','Binary (0 = Humanist, 1 = Technical school)','Binary (0 = Public, 1 = Private)','Categorical (0 = Primary, 1 = Secondary, 2 = Tertiary)','Categorical (Q1 = less than 99.6, Q2 = 99.7-167.4, Q3 = 167.5-257.7, Q4 = 257,8-468.9, Q5 = 469.0 and more)','Binary (0 = Did not apply, 1 = Applied)','Binary (0 = Not admitted, 1 = Admitted)')))
kable(codebook,
caption = 'Table 2: List of variables')%>%
kable_styling(bootstrap_options = c('bordered', 'condensed'), full_width = F, position = 'center', font_size =12)%>%
column_spec(1, italic = T, width = '.75in')
Variable | Description | Type |
---|---|---|
GPA | High school grade point average. | Continuous |
NEM | Standardized high school grade point average. | Continuous |
RKG | Relative position of the student in his or her school of graduation | Continuous |
LEN | Standardized score in the Language PSU | Continuous |
MATH | Standardized score in the Mathematics PSU | Continuous |
GENDER | gender of student | Binary (0 = Male, 1 = Female) |
TYPE | Type of high school, Technical-professional or Humanist, from which the student graduated | Binary (0 = Humanist, 1 = Technical school) |
SECTOR | Sector to which the student’s school of graduation belongs | Binary (0 = Public, 1 = Private) |
ED_PARENTS | Higher educational level of the father or mother | Categorical (0 = Primary, 1 = Secondary, 2 = Tertiary) |
INCOME_Q | Quintile of per capita income to which the student’s family belongs (1 USD = 752.9 CLP) | Categorical (Q1 = less than 99.6, Q2 = 99.7-167.4, Q3 = 167.5-257.7, Q4 = 257,8-468.9, Q5 = 469.0 and more) |
APPLY | Student’s application to a university | Binary (0 = Did not apply, 1 = Applied) |
ADM | Admission of the student to a university | Binary (0 = Not admitted, 1 = Admitted) |
kable(sample,
digits = 2,
col.names = c('Variable', 'Percentages'),
align = 'lc',
caption = 'Table 2: Sample description (n = 184,221)')%>%
kable_styling(bootstrap_options = c('striped','condensed', 'bordered'),
full_width = F, position = 'center', font_size =12)%>%
column_spec(1, italic = T)%>%
row_spec(0, bold = T)%>%
kableExtra::group_rows('Income', 1, 3) %>%
kableExtra::group_rows('Sector', 4, 5) %>%
kableExtra::group_rows('Type of high school', 6, 7)%>%
kableExtra::group_rows('Educational Level of father', 8, 10) %>%
kableExtra::group_rows('Educational Level of mother', 11, 13) %>%
kableExtra::group_rows('Gender', 14, 15)
Variable | Percentages |
---|---|
Income | |
Low | 42.94 |
Medium | 31.17 |
High | 25.89 |
Sector | |
Public | 33.15 |
Private | 66.85 |
Type of high school | |
Humanist | 75.38 |
Technical school | 24.62 |
Educational Level of father | |
Primary | 20.35 |
Secondary | 47.26 |
Tertiary | 32.39 |
Educational Level of mother | |
Primary | 17.55 |
Secondary | 50.14 |
Tertiary | 32.31 |
Gender | |
Male | 45.47 |
Female | 54.53 |
grViz("digraph flowchart {
label= 'Flowchart Admission process 2021';
labelloc=top;
labeljust=left;
# node definitions with substituted label text
node [fontname = Arial, shape = rectangle, fontsize = 10]
tab1 [label = '@@1']
tab2 [label = '@@2']
tab3 [label = '@@3']
tab4 [label = '@@4']
tab5 [label = '@@5']
# edge definitions with the node IDs
tab1 -> tab2;
tab2 -> tab3;
tab3 -> tab4;
tab4 -> tab5
}
[1]: 'Registration (n = 276,059)'
[2]: 'Participants 2021 graduates of regular education levels (n = 211,985)'
[3]: 'Students taking the PSU Tests (n = 184,221)'
[4]: 'Students who applied to a university (n = 90,120)'
[5]: 'Admitted students (75,544)'
")
gender_means <- data%>%
na.omit()%>%
group_by(GENDER)%>%
summarise(
mean_len = mean(LEN),
sd_len = sd(LEN),
mean_math = mean(MATH),
sd_math = sd(MATH),
mean_nem = mean(NEM),
sd_nem = sd(NEM),
mean_rkg = mean(RKG),
sd_rkg = sd(RKG))%>%
rename(var = GENDER)%>%
mutate(var = ifelse(var == 0, 'Male', 'Female'))
kable(data_means,
digits = 2,
col.names = c('Variable', 'Mean', 'SD','Mean', 'SD','Mean', 'SD','Mean', 'SD'),
align = 'lcccccccc',
caption = 'Statistic descriptives by attributes')%>%
kable_styling(bootstrap_options = c('striped','condensed', 'bordered'),
full_width = F, position = 'center', font_size =12)%>%
column_spec(1, italic = T)%>%
row_spec(0, bold = T)%>%
add_header_above(c('', 'Language' = 2, 'Mathematics' = 2, 'NEM' = 2, 'Ranking' = 2))%>%
kableExtra::group_rows('Gender', 1, 2) %>%
kableExtra::group_rows('Type of High school', 3, 4) %>%
kableExtra::group_rows('School Sector', 5, 6)%>%
kableExtra::group_rows('Educational Level of Parents', 7, 9) %>%
kableExtra::group_rows('Income', 10, 12) %>%
kableExtra::group_rows('Application', 13, 14) %>%
kableExtra::group_rows('Admission', 15, 16)
Variable | Mean | SD | Mean | SD | Mean | SD | Mean | SD |
---|---|---|---|---|---|---|---|---|
Gender | ||||||||
Male | 564.39 | 84.50 | 580.21 | 89.13 | 629.38 | 100.26 | 653.70 | 117.11 |
Female | 558.57 | 83.79 | 548.34 | 86.51 | 653.26 | 96.17 | 682.67 | 113.15 |
Type of High school | ||||||||
Humanist | 567.97 | 83.49 | 571.80 | 87.98 | 646.74 | 97.62 | 671.16 | 113.58 |
Technical | 517.58 | 74.85 | 503.43 | 71.96 | 616.04 | 101.64 | 660.60 | 128.97 |
School Sector | ||||||||
Public | 544.55 | 82.40 | 532.50 | 84.06 | 638.60 | 99.20 | 669.42 | 118.34 |
Subsidized Private | 553.75 | 79.35 | 551.85 | 78.65 | 629.85 | 97.98 | 658.96 | 118.00 |
Educational Level of Parents | ||||||||
Private | 603.93 | 85.09 | 632.67 | 85.98 | 682.37 | 89.31 | 699.12 | 100.35 |
Primary Education | 528.40 | 76.88 | 521.20 | 78.02 | 627.13 | 99.43 | 661.54 | 121.81 |
Secondary Education | 549.84 | 79.86 | 545.99 | 80.99 | 632.38 | 98.07 | 661.67 | 117.65 |
Income | ||||||||
Tertiary Education | 583.32 | 85.31 | 593.13 | 91.15 | 658.60 | 97.03 | 681.23 | 110.95 |
Income Q1 | 540.51 | 78.77 | 532.29 | 79.43 | 627.62 | 98.91 | 658.73 | 119.86 |
Income Q2 | 552.90 | 80.21 | 547.55 | 81.27 | 634.51 | 98.57 | 664.48 | 118.50 |
Application | ||||||||
Income Q3 | 559.74 | 81.74 | 557.29 | 82.36 | 638.18 | 96.88 | 666.21 | 115.43 |
Income Q4 | 564.07 | 83.93 | 565.67 | 84.86 | 640.66 | 97.86 | 666.47 | 114.75 |
Admission | ||||||||
Income Q5 | 592.15 | 86.82 | 614.03 | 92.44 | 672.39 | 94.27 | 692.35 | 105.99 |
Overall | 561.17 | 84.16 | 562.57 | 89.11 | 642.60 | 98.73 | 669.73 | 115.83 |
plotmathsec <- ggplot(data, aes(x=Mother_Ed, y = MATH, fill = Mother_Ed))+
geom_boxplot()+
theme_bw()+
labs(x = "Education", y = "Mathematics", fill = "Education")+
scale_x_discrete(labels = c('','',''))+
scale_fill_discrete(guide = "none")+
scale_y_continuous(limits = c(150, 850), breaks = seq(400, 800, 200)) +
theme(axis.title.x=element_blank())
plotlensec <- ggplot(data, aes(x=Mother_Ed, y = LEN, fill = Mother_Ed))+
geom_boxplot()+
theme_bw()+
labs(x = "Education", y = "Language", fill = "Education")+
scale_fill_discrete(guide = 'none')+
scale_x_discrete(labels = c('','',''))+
scale_y_continuous(limits = c(150, 850), breaks = seq(400, 800, 200)) +
theme(axis.title.x=element_blank())
plotrkgsec <- ggplot(data, aes(x=Mother_Ed, y = RKG, fill = Mother_Ed))+
geom_boxplot()+
theme_bw()+
labs(x = "Education", y = "Ranking", fill = "Education")+
scale_x_discrete(labels = c('','',''))+
scale_fill_discrete(guide = 'none')+
scale_y_continuous(limits = c(150, 850), breaks = seq(400, 800, 200)) +
theme(axis.title.x=element_blank())
plotnemsec <- ggplot(data, aes(x=Mother_Ed, y = NEM, fill = Mother_Ed))+
geom_boxplot()+
theme_bw()+
labs(x = "Education", y = "NEM", fill = "Education")+
scale_x_discrete(labels = c('','',''))+
scale_fill_discrete(guide = "none")+
scale_y_continuous(limits = c(150, 850), breaks = seq(400, 800, 200)) +
theme(axis.title.x=element_blank())
pp1 <- (plotmathsec+plotlensec)/(plotrkgsec+plotnemsec)
pp1
Figure 1. Average scores in the admissions factors by Mother’s educational level
plotmathsec <- ggplot(data, aes(x=Fam_Income, y = MATH, fill = Fam_Income))+
geom_boxplot()+
theme_bw()+
labs(x = "Family Income", y = "Mathematics", fill = "Family Income")+
scale_x_discrete(labels = c('','','','',''))+
scale_fill_discrete(guide = "none")+
scale_y_continuous(limits = c(150, 850), breaks = seq(400, 800, 200)) +
theme(axis.title.x=element_blank())
plotlensec <- ggplot(data, aes(x=Fam_Income, y = LEN, fill = Fam_Income))+
geom_boxplot()+
theme_bw()+
labs(x = "Family Income", y = "Language", fill = "Family Income")+
scale_fill_discrete(guide = 'none')+
scale_x_discrete(labels = c('','','','',''))+
scale_y_continuous(limits = c(150, 850), breaks = seq(400, 800, 200)) +
theme(axis.title.x=element_blank())
plotrkgsec <- ggplot(data, aes(x=Fam_Income, y = RKG, fill = Fam_Income))+
geom_boxplot()+
theme_bw()+
labs(x = "Family Income", y = "Ranking", fill = "Family Income")+
scale_x_discrete(labels = c('','','','',''))+
scale_fill_discrete(guide = "none")+
scale_y_continuous(limits = c(150, 850), breaks = seq(400, 800, 200)) +
theme(axis.title.x=element_blank())
plotnemsec <- ggplot(data, aes(x=Fam_Income, y = NEM, fill = Fam_Income))+
geom_boxplot()+
theme_bw()+
labs(x = "Family Income", y = "NEM", fill = "Family Income")+
scale_x_discrete(labels = c('','','','',''))+
scale_fill_discrete(guide = "none")+
scale_y_continuous(limits = c(150, 850), breaks = seq(400, 800, 200)) +
theme(axis.title.x=element_blank())
pp2 <- (plotmathsec+plotlensec)/(plotrkgsec+plotnemsec)
pp2
Figure 2. Average scores in the admissions factors by Family income
Empty_mod<- glm(ADM ~ 1, family = binomial(link = 'logit'), data = data)
summary(Empty_mod)
adm_mod1 <- glm(ADM ~ 1 + NEM_55 + RKG_55 + LEN_55 + MATH_55 + GENDER + INCOME_Q2 + INCOME_Q3 + INCOME_Q4 + INCOME_Q5 + ED_SECONDARY + ED_TERTIARY + SECTOR_SUB + SECTOR_PRIV + TYPE, family = binomial(link = 'logit'), data = data)
summary(adm_mod1)
wald_adm_mod1 = glht(adm_mod1, linfct = c('NEM_55 = 0' , 'RKG_55 = 0' , 'LEN_55 = 0', 'MATH_55 = 0' , 'GENDER = 0' , 'INCOME_Q2 = 0', 'INCOME_Q3 = 0', 'INCOME_Q4 = 0', 'INCOME_Q5 = 0' , 'ED_SECONDARY = 0' , 'ED_TERTIARY = 0' , 'SECTOR_SUB = 0' , 'SECTOR_PRIV = 0', 'TYPE = 0'))
summary(wald_adm_mod1, test = Chisqtest()) # Joint chi-square test
wald_adm_mod1_3sl <- glht(adm_mod1, linfct = c('ED_SECONDARY=0', 'ED_TERTIARY=0', 'TYPE = 0'))
summary(wald_adm_mod1_3sl, test = Chisqtest())
adm_mod2 <- glm(ADM ~ 1 + NEM_55 + RKG_55 + LEN_55 + MATH_55 + GENDER + INCOME_Q2 + INCOME_Q3 + INCOME_Q4 + INCOME_Q5 + ED_SECONDARY + ED_TERTIARY + SECTOR_SUB + SECTOR_PRIV, family = binomial(link = 'logit'), data = data)
summary(adm_mod2)
DevTest = -2*(logLik(adm_mod2) - logLik(adm_mod1))
RegPvalue = pchisq((DevTest), df=1, lower.tail=FALSE)
MixPvalue = RegPvalue/2
DevTest; RegPvalue; MixPvalue
adm_mod3 <- glm(ADM ~ 1 + NEM_55 + RKG_55 + LEN_55 + MATH_55 + GENDER + INCOME_Q2 + INCOME_Q3 + INCOME_Q4 + INCOME_Q5 + SECTOR_SUB + SECTOR_PRIV, family = binomial(link = 'logit'), data = data)
summary(adm_mod3)
DevTest = -2*(logLik(adm_mod3) - logLik(adm_mod2))
RegPvalue = pchisq((DevTest), df=2, lower.tail=FALSE)
MixPvalue = RegPvalue/2
DevTest; RegPvalue; MixPvalue
wald_adm_mod2 = glht(adm_mod2, linfct = c('NEM_55 = 0' , 'RKG_55 = 0' , 'LEN_55 = 0', 'MATH_55 = 0' , 'GENDER = 0' , 'INCOME_Q2 = 0' , 'INCOME_Q3 = 0' , 'INCOME_Q4 = 0', 'INCOME_Q5 = 0', 'SECTOR_SUB = 0' , 'SECTOR_PRIV = 0'))
summary(wald_adm_mod2, test = Chisqtest()) # Joint chi-square test
adm_mod3_1 <- glm(ADM ~ 1 + NEM_55 + RKG_55 + LEN_55 + MATH_55 + GENDER + INCOME_Q2 + INCOME_Q3 + INCOME_Q4 + INCOME_Q5 + SECTOR_SUB + SECTOR_PRIV, family = binomial(link = 'probit'), data = data)
Table 3. Binary logistic regression results
mod_results <- tidy(adm_mod3)
mod_results <- round(mod_results[2:5],3)
coef <- data.frame(adm_mod3$coefficients)
coef_names <- rownames(coef)
flextable(cbind(coef_names,mod_results))
surr_rsq(adm_mod3_1, adm_mod3_1, data, avg.num = 100)
coef_names | estimate | std.error | statistic | p.value |
---|---|---|---|---|
(Intercept) | 1.352 | 0.024 | 55.270 | 0.000 |
NEM_55 | -0.011 | 0.004 | -2.594 | 0.009 |
RKG_55 | 0.022 | 0.003 | 6.247 | 0.000 |
LEN_55 | 0.024 | 0.001 | 20.902 | 0.000 |
MATH_55 | 0.036 | 0.001 | 29.529 | 0.000 |
GENDER | -0.335 | 0.018 | -18.918 | 0.000 |
INCOME_Q2 | -0.064 | 0.023 | -2.765 | 0.006 |
INCOME_Q3 | -0.075 | 0.027 | -2.790 | 0.005 |
INCOME_Q4 | -0.108 | 0.027 | -4.008 | 0.000 |
INCOME_Q5 | -0.023 | 0.029 | -0.775 | 0.439 |
SECTOR_SUB | -0.071 | 0.019 | -3.655 | 0.000 |
SECTOR_PRIV | 0.165 | 0.032 | 5.117 | 0.000 |
## ------------------------------------------------------------------------
## The surrogate R-squared of the model
## ------------------------------------------------------------------------
## ADM ~ 1 + NEM_55 + RKG_55 + LEN_55 + MATH_55 + GENDER + INCOME_Q2 +
## INCOME_Q3 + INCOME_Q4 + INCOME_Q5 + SECTOR_SUB + SECTOR_PRIV
## ------------------------------------------------------------------------
## is:
## [1] 0.10179
dat <- read.table(header=TRUE, text='
Fixed_effects Odds_Ratio
NEM_550 0.989
Ranking_550 1.022
Language_550 1.024
Mathematics_550 1.037
Sex_Female 0.715
Income_Q2 0.938
Income_Q3 0.928
Income_Q4 0.897
Income_Q5 0.978
Sector_Private_Subsidized 0.932
Sector_Private 1.180')
c <- ggplot(dat[c(1:4),], aes(x = Odds_Ratio, y = Fixed_effects))+
geom_point(size = 2.5)+
geom_vline(xintercept = 1, linetype="dashed",
color = "blue", size=1) +
scale_x_continuous(limits = c(.70, 1.25), breaks = seq(.70, 1.25, 0.1))+
labs(x = 'Odds Ratio', y = 'Predictors')+
scale_y_discrete(limits=rev)+
theme_bw(base_size = 14)+
ggtitle("Academic factors")+
theme(axis.title = element_text(size = 12), plot.title = element_text(size = 12))
d <- ggplot(dat[c(5:11),], aes(x = Odds_Ratio, y = Fixed_effects))+
geom_point(size = 2.5)+
geom_vline(xintercept = 1, linetype="dashed",
color = "blue", size=1) +
scale_x_continuous(limits = c(.70, 1.25), breaks = seq(.70, 1.25, 0.1))+
labs(x = 'Odds Ratio', y = '')+
scale_y_discrete(limits=rev)+
theme_bw(base_size = 14)+
ggtitle("Non-academic factors")+
theme(axis.title = element_text(size = 12), plot.title = element_text(size = 12))
c+d
Figure 3. Binary logistic regression results
Table 4. Estimated probabilities of being admitted into selective higher education, admission process 2021
probs <- (summary(glht(model=adm_mod3, linfct=rbind(
"Males_Pub_Q1" = c(1,0,0,0,0,0,0,0,0,0,0,0),
"Males_Pub_Q2" = c(1,0,0,0,0,0,1,0,0,0,0,0),
"Males_Pub_Q3" = c(1,0,0,0,0,0,0,1,0,0,0,0),
"Males_Pub_Q4" = c(1,0,0,0,0,0,0,0,1,0,0,0),
"Males_Pub_Q5" = c(1,0,0,0,0,0,0,0,0,1,0,0),
"Males_Sub_Q1" = c(1,0,0,0,0,0,0,0,0,0,1,0),
"Males_Sub_Q2" = c(1,0,0,0,0,0,1,0,0,0,1,0),
"Males_Sub_Q3" = c(1,0,0,0,0,0,0,1,0,0,1,0),
"Males_Sub_Q4" = c(1,0,0,0,0,0,0,0,1,0,1,0),
"Males_Sub_Q5" = c(1,0,0,0,0,0,0,0,0,1,1,0),
"Males_Pri_Q1" = c(1,0,0,0,0,0,0,0,0,0,0,1),
"Males_Pri_Q2" = c(1,0,0,0,0,0,1,0,0,0,0,1),
"Males_Pri_Q3" = c(1,0,0,0,0,0,0,1,0,0,0,1),
"Males_Pri_Q4" = c(1,0,0,0,0,0,0,0,1,0,0,1),
"Males_Pri_Q5" = c(1,0,0,0,0,0,0,0,0,1,0,1),
"Females_Pub_Q1" = c(1,0,0,0,0,1,0,0,0,0,0,0),
"Females_Pub_Q2" = c(1,0,0,0,0,1,1,0,0,0,0,0),
"Females_Pub_Q3" = c(1,0,0,0,0,1,0,1,0,0,0,0),
"Females_Pub_Q4" = c(1,0,0,0,0,1,0,0,1,0,0,0),
"Females_Pub_Q5" = c(1,0,0,0,0,1,0,0,0,1,0,0),
"Females_Sub_Q1" = c(1,0,0,0,0,1,0,0,0,0,1,0),
"Females_Sub_Q2" = c(1,0,0,0,0,1,1,0,0,0,1,0),
"Females_Sub_Q3" = c(1,0,0,0,0,1,0,1,0,0,1,0),
"Females_Sub_Q4" = c(1,0,0,0,0,1,0,0,1,0,1,0),
"Females_Sub_Q5" = c(1,0,0,0,0,1,0,0,0,1,1,0),
"Females_Pri_Q1" = c(1,0,0,0,0,1,0,0,0,0,0,1),
"Females_Pri_Q2" = c(1,0,0,0,0,1,1,0,0,0,0,1),
"Females_Pri_Q3" = c(1,0,0,0,0,1,0,1,0,0,0,1),
"Females_Pri_Q4" = c(1,0,0,0,0,1,0,0,1,0,0,1),
"Females_Pri_Q5" = c(1,0,0,0,0,1,0,0,0,1,0,1))),test=adjusted("none")))
logit_pr <- logit_prob(probs[["test"]][["coefficients"]])
logit_pr <- data.frame(logit_pr)
tab_prob <- cbind(
a = data.frame(logit_pr[1:5,]),
b = data.frame(logit_pr[6:10,]),
c = data.frame(logit_pr[11:15,]),
d = data.frame(logit_pr[16:20,]),
e = data.frame(logit_pr[21:25,]),
f = data.frame(logit_pr[26:30,]))
colnames(tab_prob) <- c('male_pub', 'male_sub', 'male_priv','fem_pub', 'fem_sub', 'fem_priv')
rownames(tab_prob) <- c('Q1', 'Q2', 'Q3','Q4', 'Q5')
flextable(cbind(x = c('Q1', 'Q2', 'Q3','Q4', 'Q5'), round(tab_prob,3)))
x | male_pub | male_sub | male_priv | fem_pub | fem_sub | fem_priv |
---|---|---|---|---|---|---|
Q1 | 0.794 | 0.783 | 0.820 | 0.734 | 0.720 | 0.765 |
Q2 | 0.784 | 0.772 | 0.810 | 0.722 | 0.707 | 0.754 |
Q3 | 0.782 | 0.770 | 0.809 | 0.720 | 0.705 | 0.752 |
Q4 | 0.776 | 0.764 | 0.804 | 0.713 | 0.698 | 0.745 |
Q5 | 0.791 | 0.779 | 0.817 | 0.730 | 0.716 | 0.761 |
data_prob <- data.frame(Quintil = c('Q1', 'Q2', 'Q3', 'Q4', 'Q5'), tab_prob)
library(ggplot2)
a <- ggplot(data_prob, aes(x = Quintil, group=1))+
geom_point(aes(y = male_pub))+
geom_line(aes(y = male_pub), color = 'red', size = 1)+
geom_point(aes(y = male_sub))+
geom_line(aes(y = male_sub), color = 'green', size = 1)+
geom_point(aes(y = male_priv))+
geom_line(aes(y = male_priv), color = 'blue', size = 1)+
scale_y_continuous(limits = c(.6, .9), breaks = seq(.6, .9, .1))+
labs(x = 'Quintile family income', y = 'Probability of admission')+
theme_bw(base_size = 14)+
ggtitle("Males")+
theme(axis.title = element_text(size = 12), plot.title = element_text(size = 12))
b <- ggplot(data_prob, aes(x = Quintil, group=1))+
geom_point(aes(y = fem_pub))+
geom_line(aes(y = fem_pub), color = 'red', size = 1)+
geom_point(aes(y = fem_sub))+
geom_line(aes(y = fem_sub), color = 'green', size = 1)+
geom_point(aes(y = fem_priv))+
geom_line(aes(y = fem_priv), color = 'blue', size = 1)+
scale_y_continuous(limits = c(.6, .9), breaks = seq(.6, .9, .1))+
labs(x = 'Quintile family income', y = '')+
theme_bw(base_size = 14)+
ggtitle("Females")+
theme(axis.title = element_text(size = 12), plot.title = element_text(size = 12))
a+b
Figure 6. Estimated probabilities of being admitted into selective higher education, admission process 2021
income_prob <- ggplot(prob_mod, aes(y = probability, x = Fam_Income, fill = factor(Fam_Income))) +
geom_violin() +
theme_bw() +
labs(x = "Family income", y = "Probability of Admission", fill = "Family income")+
scale_fill_discrete("Family income",
labels = c("Q1","Q2","Q3", "Q4", "Q5"))+
scale_x_discrete(labels = c('','','','',''))+
theme(axis.title.x=element_blank()) + theme(legend.position="bottom")
income_prob
Figure 7. Estimated probabilities of being admitted by Family income
school_prob <- ggplot(prob_mod, aes(y = probability, x = Sector, fill = Sector)) +
geom_violin() +
theme_bw() +
labs(x = "School sector", y = "Probability of Admission", fill = "School sector")+
scale_fill_discrete("School sector")+
scale_x_discrete(labels = c('','',''))+
theme(axis.title.x=element_blank()) + theme(legend.position="bottom")
school_prob
Figure 8. Estimated probabilities of being admitted by School sector