#if (!require("easypackages")) install.packages("easypackages")
library(easypackages)
libraries('dplyr', 'tidyverse', 'ggplot2', 'glm2', 'VGAM', 'multcomp', 'stringr', 'readxl', 'Lahman', 'kableExtra', 'GGally', 'modelr', 'broom', 'DiagrammeR', "PerformanceAnalytics", 'finalfit', "patchwork", "flextable", "dvmisc", "SurrogateRsq")

Note: %>% this symbol is known as ‘pipe’. Basically, it allows to chain operations, which helps to reduce the code needed to perform some actions. Also, inside the dplyr package, in tidyverse, are most of the ‘verbs’ you will see in data manipulation: select, filter, group_by, summarise, mutate, and arrange.

If you are interested in these functions (very useful if you are not good with base r), you can read this material from Professor Brandon LeBeau (EMS program), with whom I learned most of what I know about r via RStudio:

https://github.com/lebebr01/psqf_6250/tree/master/Syntax/R/v2

This course has a website as well: https://psqf6250.brandonlebeau.org/rcode/, however, Data Manipulation unit is missing.

Data sets

registration <- readr::read_csv2("C:/Users/geral/OneDrive - University of Iowa/CIDCIE/1. Bases de Datos/PSU/PSU 2019 - 2021/A_Inscritos_puntajes/A_INSCRITOS_PUNTAJES_PDT_2021_PUB_MRUN.csv")
socioeconomic <- readr::read_csv2("C:/Users/geral/OneDrive - University of Iowa/CIDCIE/1. Bases de Datos/PSU/PSU 2019 - 2021/B_Socioeconomico/B_SOCIOECONOMICO_DOMICILIO_PDT_2021_PUB_MRUN.csv")
application <-readr::read_csv2("C:/Users/geral/OneDrive - University of Iowa/CIDCIE/1. Bases de Datos/PSU/PSU 2019 - 2021/C_Postulacion_Seleccion/C_POSTULANTES_SELECCION_PDT_2021_PUB_MRUN.csv")
admission <- readr::read_csv2("C:/Users/geral/OneDrive - University of Iowa/CIDCIE/1. Bases de Datos/PSU/PSU 2019 - 2021/D_Matricula_MRUN/D_MATRICULA_PDT__2021_PUB_MRUN.csv")

Data munging

# Select important variables from first stage
a <- registration%>%
  dplyr::select(MRUN, COD_SEXO, RBD, CODIGO_ENS, RAMA_EDUCACIONAL,
         DEPENDENCIA, PROMEDIO_NOTAS, PTJE_NEM, PTJE_RANKING,
         CLEC_ACTUAL, MATE_ACTUAL, PUNTAJES_PROCESO)%>%
  dplyr::filter(PUNTAJES_PROCESO == 1 & CODIGO_ENS %in% c('310', '410', '510', '610'))%>% #filters
  dplyr::select(MRUN, COD_SEXO, RBD, RAMA_EDUCACIONAL, DEPENDENCIA,
  PROMEDIO_NOTAS, PTJE_NEM, PTJE_RANKING, CLEC_ACTUAL,MATE_ACTUAL)%>%
  mutate(
    gender = ifelse(COD_SEXO == 2, 1,0),
    type_hs = ifelse(RAMA_EDUCACIONAL == 'H1', 0,1),
    sector_sub = ifelse(DEPENDENCIA == 3, 1, 0),
    sector_priv = ifelse(DEPENDENCIA == 4, 1, 0))%>%
  dplyr::select(MRUN,RBD,PROMEDIO_NOTAS, PTJE_NEM, PTJE_RANKING, CLEC_ACTUAL,MATE_ACTUAL,gender, type_hs, sector_sub, sector_priv)

colnames(a) <- c('MRUN', 'RBD', 'gpa', 'NEM', 'RKG', 'LEN', 'MATH', 'gender', 'type_hs','sector_sub', 'sector_priv')

# Second stage
b <- socioeconomic%>%
  dplyr::select(MRUN, INGRESO_PERCAPITA_GRUPO_FA, EDUCACION_MADRE, EDUCACION_PADRE)
ab <- left_join(a, b, by = 'MRUN')

ab <- ab%>%
  mutate(
    inc = INGRESO_PERCAPITA_GRUPO_FA,
    ed_m = EDUCACION_MADRE,
    ed_d = EDUCACION_PADRE,
    ed_dad = ifelse(ed_d >= 0 & ed_d <=5, 0,
                    ifelse(ed_d >= 6 & ed_d <= 9, 1,
                           ifelse(ed_d == 13, NA, 2))),
    ed_mom = ifelse(ed_m >= 1 & ed_m <= 5, 0,
                    ifelse(ed_m >= 6 & ed_m <= 9, 1,
                           ifelse(ed_m == 13, NA, 2))),
    income = ifelse(inc == 1|inc == 2, 0,
                    ifelse(inc == 3|inc == 4, 1,
                           ifelse(inc == 5|inc == 6, 2,
                                  ifelse(inc == 7|inc == 8, 3, 4)))))%>%
  dplyr::select('MRUN', 'gpa', 'NEM', 'RKG', 'LEN', 'MATH', 'gender', 'type_hs', 'sector_sub', 'sector_priv', 'ed_dad', 'ed_mom', 'income')

# Third stage
c <- application%>%
  dplyr::select(MRUN, SITUACION_POSTULANTE)%>%
  mutate(
    appl = ifelse(SITUACION_POSTULANTE == 'P', 1,0))%>%
  dplyr::select(MRUN, appl)
abc <- left_join(ab, c, by = 'MRUN')

# Fourth stage
d <- admission%>%
  dplyr::select(MRUN, VIA_ADMISION)%>%
  mutate(
    admitted = ifelse(VIA_ADMISION > 0, 1, 0))%>%
  filter(!is.na(MRUN))%>%
  dplyr::select(MRUN, admitted)%>%
  unique()

# Data
abcd <- left_join(abc, d, by = 'MRUN')

colnames(abcd) <- c('ID', 'GPA', 'NEM', 'RKG', 'LEN', 'MATH', 'GENDER', 'TYPE', 'SECTOR_SUB', 'SECTOR_PRIV', 'ED_DAD', 'ED_MOM', 'INCOME', 'APPLY', 'ADM')

# New variables for the coming analysis
abcd <- abcd%>%
  mutate(
    NEM = ifelse(NEM == 0, NA, NEM),
    RKG = ifelse(RKG == 0, NA, RKG),
    LEN = ifelse(LEN == 0, NA, LEN),
    MATH = ifelse(MATH == 0, NA, MATH),
    ED_SECONDARY = ifelse(ED_MOM == 1, 1, 0),
    ED_TERTIARY =   ifelse(ED_MOM == 2, 1, 0),
    INCOME_Q2 = ifelse(INCOME == 1, 1,0),
    INCOME_Q3 = ifelse(INCOME == 2, 1,0),
    INCOME_Q4 = ifelse(INCOME == 3, 1,0),
    INCOME_Q5 = ifelse(INCOME == 4, 1,0),
    APPLY = ifelse(is.na(APPLY), 0, APPLY),
    ADM = ifelse(is.na(ADM), 0, 1))

# Labels for columns
data <- abcd%>%
  dplyr::select('GPA', 'NEM', 'RKG', 'LEN', 'MATH', 'GENDER', 'TYPE', 'SECTOR_SUB', 'SECTOR_PRIV', 'INCOME_Q2', 'INCOME_Q3', 'INCOME_Q4', 'INCOME_Q5', 'APPLY', 'ADM', 'ED_SECONDARY', 'ED_TERTIARY', 'INCOME')%>%
  filter(LEN > 0 & MATH > 0)

# Some variables must be factors to be included in the ggplot plots without problems. We did this after realizing the usefulness of creating variables with certain characteristics instead of doing it in the ggplot function.
data <- data %>% 
  mutate(
    Sector = ifelse(SECTOR_SUB == 1, 1,
                 ifelse(SECTOR_PRIV == 1,2,0)),
    Sector = factor(Sector),
    Sector = fct_recode(Sector,
    'Public' = "0",
    'Subsidized Private' = "1",
    'Private'= "2"),
    Fam_Income = factor(INCOME),
    Fam_Income = fct_recode(Fam_Income,
    'Income Q1' = "0",
    'Income Q2' = "1",
    'Income Q3'= "2",
    'Income Q4' = '3',
    'Income Q5' = '4'),
    Mother_Ed = ifelse(ED_SECONDARY == 1, 1,
                 ifelse(ED_TERTIARY == 1,2,0)),
    Mother_Ed = factor(Mother_Ed),
    Mother_Ed = fct_recode(Mother_Ed,
    'Primary Education' = "0",
    'Secondary Education' = "1",
    'Tertiary Education'= "2"))
# write.table(data, file = "adm_data.csv", sep = ",") we save the latest data set
rm(a, ab, abc, abcd, b, c, d) # we remove these objects from the environment

kable_styling(kable(data.frame(head(data, 10)), digits = 3,
                    caption = 'Table 1: Dataset'),
              bootstrap_options = c('hover', 'responsive', 'bordered'),
              full_width = F, position = 'center', font_size =12) %>% 
  scroll_box(height = '250px')
Table 1: Dataset
GPA NEM RKG LEN MATH GENDER TYPE SECTOR_SUB SECTOR_PRIV INCOME_Q2 INCOME_Q3 INCOME_Q4 INCOME_Q5 APPLY ADM ED_SECONDARY ED_TERTIARY INCOME Sector Fam_Income Mother_Ed
5.40 496 497 675 674 0 0 1 0 1 0 0 0 0 0 1 0 1 Subsidized Private Income Q2 Secondary Education
5.43 504 504 377 431 0 1 0 0 0 0 1 0 0 0 0 0 3 Public Income Q4 Primary Education
5.70 559 568 304 475 1 0 0 0 0 0 0 0 0 0 1 0 0 Public Income Q1 Secondary Education
5.65 550 553 562 535 0 1 0 0 0 0 0 1 1 1 0 0 4 Public Income Q5 Primary Education
5.83 589 595 366 319 1 1 0 0 0 0 0 0 0 0 0 0 0 Public Income Q1 Primary Education
4.95 391 391 466 443 1 0 0 0 0 0 1 0 1 0 0 0 3 Public Income Q4 Primary Education
6.83 803 850 495 540 1 0 0 0 0 1 0 0 0 0 1 0 2 Public Income Q3 Secondary Education
5.70 559 574 355 509 0 0 0 0 1 0 0 0 0 0 1 0 1 Public Income Q2 Secondary Education
6.03 631 631 440 595 1 0 0 1 0 0 0 1 1 1 0 1 4 Private Income Q5 Tertiary Education
5.48 513 531 209 516 0 0 0 0 1 0 0 0 0 0 0 0 1 Public Income Q2 Primary Education

Codebook

codebook <- data.frame(cbind(
  Variable = c('GPA','NEM', 'RKG', 'LEN', 'MATH', 'GENDER', 'TYPE', 'SECTOR', 'ED_PARENTS', 'INCOME_Q', 'APPLY', 'ADM'),
  Description = c('High school grade point average.', 'Standardized high school grade point average.', 'Relative position of the student in his or her school of graduation', 'Standardized score in the Language PSU', 'Standardized score in the Mathematics PSU', 'gender of student', 'Type of high school, Technical-professional or Humanist, from which the student graduated', "Sector to which the student's school of graduation belongs", 'Higher educational level of the father or mother',"Quintile of per capita income to which the student's family belongs (1 USD = 752.9 CLP)", "Student's application to a university", 'Admission of the student to a university'),
  Type = c('Continuous','Continuous','Continuous','Continuous','Continuous','Binary (0 = Male, 1 = Female)','Binary (0 = Humanist, 1 = Technical school)','Binary (0 = Public, 1 = Private)','Categorical (0 = Primary, 1 = Secondary, 2 = Tertiary)','Categorical (Q1 = less than 99.6, Q2 = 99.7-167.4, Q3 = 167.5-257.7, Q4 = 257,8-468.9, Q5 = 469.0 and more)','Binary (0 = Did not apply, 1 = Applied)','Binary (0 = Not admitted, 1 = Admitted)')))

kable(codebook,
      caption = 'Table 2: List of variables')%>%
  kable_styling(bootstrap_options = c('bordered', 'condensed'), full_width = F, position = 'center', font_size =12)%>%
      column_spec(1, italic = T, width = '.75in')
Table 2: List of variables
Variable Description Type
GPA High school grade point average. Continuous
NEM Standardized high school grade point average. Continuous
RKG Relative position of the student in his or her school of graduation Continuous
LEN Standardized score in the Language PSU Continuous
MATH Standardized score in the Mathematics PSU Continuous
GENDER gender of student Binary (0 = Male, 1 = Female)
TYPE Type of high school, Technical-professional or Humanist, from which the student graduated Binary (0 = Humanist, 1 = Technical school)
SECTOR Sector to which the student’s school of graduation belongs Binary (0 = Public, 1 = Private)
ED_PARENTS Higher educational level of the father or mother Categorical (0 = Primary, 1 = Secondary, 2 = Tertiary)
INCOME_Q Quintile of per capita income to which the student’s family belongs (1 USD = 752.9 CLP) Categorical (Q1 = less than 99.6, Q2 = 99.7-167.4, Q3 = 167.5-257.7, Q4 = 257,8-468.9, Q5 = 469.0 and more)
APPLY Student’s application to a university Binary (0 = Did not apply, 1 = Applied)
ADM Admission of the student to a university Binary (0 = Not admitted, 1 = Admitted)

Sample description

kable(sample,
      digits = 2,
      col.names = c('Variable', 'Percentages'),
      align = 'lc',
      caption = 'Table 2: Sample description (n = 184,221)')%>%
  kable_styling(bootstrap_options = c('striped','condensed', 'bordered'),
                full_width = F, position = 'center', font_size =12)%>%
  column_spec(1, italic = T)%>%
  row_spec(0, bold = T)%>%
  kableExtra::group_rows('Income', 1, 3) %>%
  kableExtra::group_rows('Sector', 4, 5) %>%
  kableExtra::group_rows('Type of high school', 6, 7)%>%
  kableExtra::group_rows('Educational Level of father', 8, 10) %>%
  kableExtra::group_rows('Educational Level of mother', 11, 13) %>%
  kableExtra::group_rows('Gender', 14, 15)
Table 2: Sample description (n = 184,221)
Variable Percentages
Income
Low 42.94
Medium 31.17
High 25.89
Sector
Public 33.15
Private 66.85
Type of high school
Humanist 75.38
Technical school 24.62
Educational Level of father
Primary 20.35
Secondary 47.26
Tertiary 32.39
Educational Level of mother
Primary 17.55
Secondary 50.14
Tertiary 32.31
Gender
Male 45.47
Female 54.53

Admission process

grViz("digraph flowchart {

label= 'Flowchart Admission process 2021';
labelloc=top;
labeljust=left;

# node definitions with substituted label text
node [fontname = Arial, shape = rectangle, fontsize = 10]        
tab1 [label = '@@1']
tab2 [label = '@@2']
tab3 [label = '@@3']
tab4 [label = '@@4']
tab5 [label = '@@5']

# edge definitions with the node IDs
tab1 -> tab2;
tab2 -> tab3;
tab3 -> tab4;
tab4 -> tab5
}

[1]: 'Registration (n = 276,059)'
[2]: 'Participants 2021 graduates of regular education levels (n = 211,985)'
[3]: 'Students taking the PSU Tests (n = 184,221)'
[4]: 'Students who applied to a university (n = 90,120)'
[5]: 'Admitted students (75,544)'
")

First part of the analysis: descriptives

gender_means <- data%>%
  na.omit()%>%
  group_by(GENDER)%>%
  summarise(
    mean_len = mean(LEN),
    sd_len = sd(LEN),
    mean_math = mean(MATH),
    sd_math = sd(MATH),
    mean_nem = mean(NEM),
    sd_nem = sd(NEM),
    mean_rkg = mean(RKG),
    sd_rkg = sd(RKG))%>%
  rename(var = GENDER)%>%
  mutate(var = ifelse(var == 0, 'Male', 'Female'))
kable(data_means,
      digits = 2,
      col.names = c('Variable', 'Mean', 'SD','Mean', 'SD','Mean', 'SD','Mean', 'SD'),
      align = 'lcccccccc',
      caption = 'Statistic descriptives by attributes')%>%
  kable_styling(bootstrap_options = c('striped','condensed', 'bordered'),
                full_width = F, position = 'center', font_size =12)%>%
  column_spec(1, italic = T)%>%
  row_spec(0, bold = T)%>%
  add_header_above(c('', 'Language' = 2, 'Mathematics' = 2, 'NEM' = 2, 'Ranking' = 2))%>%
  kableExtra::group_rows('Gender', 1, 2) %>%
  kableExtra::group_rows('Type of High school', 3, 4) %>%
  kableExtra::group_rows('School Sector', 5, 6)%>%
  kableExtra::group_rows('Educational Level of Parents', 7, 9) %>%
  kableExtra::group_rows('Income', 10, 12) %>%
  kableExtra::group_rows('Application', 13, 14) %>%
  kableExtra::group_rows('Admission', 15, 16)
Statistic descriptives by attributes
Language
Mathematics
NEM
Ranking
Variable Mean SD Mean SD Mean SD Mean SD
Gender
Male 564.39 84.50 580.21 89.13 629.38 100.26 653.70 117.11
Female 558.57 83.79 548.34 86.51 653.26 96.17 682.67 113.15
Type of High school
Humanist 567.97 83.49 571.80 87.98 646.74 97.62 671.16 113.58
Technical 517.58 74.85 503.43 71.96 616.04 101.64 660.60 128.97
School Sector
Public 544.55 82.40 532.50 84.06 638.60 99.20 669.42 118.34
Subsidized Private 553.75 79.35 551.85 78.65 629.85 97.98 658.96 118.00
Educational Level of Parents
Private 603.93 85.09 632.67 85.98 682.37 89.31 699.12 100.35
Primary Education 528.40 76.88 521.20 78.02 627.13 99.43 661.54 121.81
Secondary Education 549.84 79.86 545.99 80.99 632.38 98.07 661.67 117.65
Income
Tertiary Education 583.32 85.31 593.13 91.15 658.60 97.03 681.23 110.95
Income Q1 540.51 78.77 532.29 79.43 627.62 98.91 658.73 119.86
Income Q2 552.90 80.21 547.55 81.27 634.51 98.57 664.48 118.50
Application
Income Q3 559.74 81.74 557.29 82.36 638.18 96.88 666.21 115.43
Income Q4 564.07 83.93 565.67 84.86 640.66 97.86 666.47 114.75
Admission
Income Q5 592.15 86.82 614.03 92.44 672.39 94.27 692.35 105.99
Overall 561.17 84.16 562.57 89.11 642.60 98.73 669.73 115.83

First part of the analysis: relationship between predictors

plotmathsec <- ggplot(data, aes(x=Mother_Ed, y = MATH, fill = Mother_Ed))+
  geom_boxplot()+
  theme_bw()+
  labs(x = "Education", y = "Mathematics", fill = "Education")+
  scale_x_discrete(labels = c('','',''))+
  scale_fill_discrete(guide = "none")+
  scale_y_continuous(limits = c(150, 850), breaks = seq(400, 800, 200)) +
  theme(axis.title.x=element_blank())

plotlensec <- ggplot(data, aes(x=Mother_Ed, y = LEN, fill = Mother_Ed))+
  geom_boxplot()+
  theme_bw()+
  labs(x = "Education", y = "Language", fill = "Education")+
  scale_fill_discrete(guide = 'none')+
  scale_x_discrete(labels = c('','',''))+
  scale_y_continuous(limits = c(150, 850), breaks = seq(400, 800, 200)) +
  theme(axis.title.x=element_blank())

plotrkgsec <- ggplot(data, aes(x=Mother_Ed, y = RKG, fill = Mother_Ed))+
  geom_boxplot()+
  theme_bw()+
  labs(x = "Education", y = "Ranking", fill = "Education")+
  scale_x_discrete(labels = c('','',''))+
  scale_fill_discrete(guide = 'none')+
  scale_y_continuous(limits = c(150, 850), breaks = seq(400, 800, 200)) +
  theme(axis.title.x=element_blank())

plotnemsec <- ggplot(data, aes(x=Mother_Ed, y = NEM, fill = Mother_Ed))+
  geom_boxplot()+
  theme_bw()+
  labs(x = "Education", y = "NEM", fill = "Education")+
  scale_x_discrete(labels = c('','',''))+
  scale_fill_discrete(guide = "none")+
  scale_y_continuous(limits = c(150, 850), breaks = seq(400, 800, 200)) +
  theme(axis.title.x=element_blank())

pp1 <- (plotmathsec+plotlensec)/(plotrkgsec+plotnemsec)
pp1

Figure 1. Average scores in the admissions factors by Mother’s educational level

plotmathsec <- ggplot(data, aes(x=Fam_Income, y = MATH, fill = Fam_Income))+
  geom_boxplot()+
  theme_bw()+
  labs(x = "Family Income", y = "Mathematics", fill = "Family Income")+
  scale_x_discrete(labels = c('','','','',''))+
  scale_fill_discrete(guide = "none")+
  scale_y_continuous(limits = c(150, 850), breaks = seq(400, 800, 200)) +
  theme(axis.title.x=element_blank())

plotlensec <- ggplot(data, aes(x=Fam_Income, y = LEN, fill = Fam_Income))+
  geom_boxplot()+
  theme_bw()+
  labs(x = "Family Income", y = "Language", fill = "Family Income")+
  scale_fill_discrete(guide = 'none')+
  scale_x_discrete(labels = c('','','','',''))+
  scale_y_continuous(limits = c(150, 850), breaks = seq(400, 800, 200)) +
  theme(axis.title.x=element_blank())

plotrkgsec <- ggplot(data, aes(x=Fam_Income, y = RKG, fill = Fam_Income))+
  geom_boxplot()+
  theme_bw()+
  labs(x = "Family Income", y = "Ranking", fill = "Family Income")+
  scale_x_discrete(labels = c('','','','',''))+
  scale_fill_discrete(guide = "none")+
  scale_y_continuous(limits = c(150, 850), breaks = seq(400, 800, 200)) +
  theme(axis.title.x=element_blank())

plotnemsec <- ggplot(data, aes(x=Fam_Income, y = NEM, fill = Fam_Income))+
  geom_boxplot()+
  theme_bw()+
  labs(x = "Family Income", y = "NEM", fill = "Family Income")+
  scale_x_discrete(labels = c('','','','',''))+
  scale_fill_discrete(guide = "none")+
  scale_y_continuous(limits = c(150, 850), breaks = seq(400, 800, 200)) +
  theme(axis.title.x=element_blank())

pp2 <- (plotmathsec+plotlensec)/(plotrkgsec+plotnemsec)
pp2

Figure 2. Average scores in the admissions factors by Family income

Second part of the analysis: Binary logistic regression

Empty_mod<- glm(ADM ~ 1, family = binomial(link = 'logit'), data = data)
summary(Empty_mod)

adm_mod1 <- glm(ADM ~ 1 + NEM_55 + RKG_55 + LEN_55 + MATH_55 + GENDER + INCOME_Q2 + INCOME_Q3 + INCOME_Q4 + INCOME_Q5 + ED_SECONDARY + ED_TERTIARY + SECTOR_SUB + SECTOR_PRIV + TYPE, family = binomial(link = 'logit'), data = data)
summary(adm_mod1)

wald_adm_mod1 = glht(adm_mod1, linfct = c('NEM_55 = 0' , 'RKG_55 = 0' , 'LEN_55 = 0', 'MATH_55 = 0' , 'GENDER = 0' , 'INCOME_Q2 = 0', 'INCOME_Q3 = 0', 'INCOME_Q4 = 0', 'INCOME_Q5 = 0' , 'ED_SECONDARY = 0' , 'ED_TERTIARY = 0' , 'SECTOR_SUB = 0' , 'SECTOR_PRIV = 0', 'TYPE = 0'))
summary(wald_adm_mod1, test = Chisqtest()) # Joint chi-square test

wald_adm_mod1_3sl <- glht(adm_mod1, linfct = c('ED_SECONDARY=0', 'ED_TERTIARY=0', 'TYPE = 0'))
summary(wald_adm_mod1_3sl, test = Chisqtest())

adm_mod2 <- glm(ADM ~ 1 + NEM_55 + RKG_55 + LEN_55 + MATH_55 + GENDER + INCOME_Q2 + INCOME_Q3 + INCOME_Q4 + INCOME_Q5 + ED_SECONDARY + ED_TERTIARY + SECTOR_SUB + SECTOR_PRIV, family = binomial(link = 'logit'), data = data)
summary(adm_mod2)

DevTest = -2*(logLik(adm_mod2) - logLik(adm_mod1))
RegPvalue = pchisq((DevTest), df=1, lower.tail=FALSE)
MixPvalue = RegPvalue/2
DevTest; RegPvalue; MixPvalue

adm_mod3 <- glm(ADM ~ 1 + NEM_55 + RKG_55 + LEN_55 + MATH_55 + GENDER + INCOME_Q2 + INCOME_Q3 + INCOME_Q4 + INCOME_Q5 + SECTOR_SUB + SECTOR_PRIV, family = binomial(link = 'logit'), data = data)
summary(adm_mod3)

DevTest = -2*(logLik(adm_mod3) - logLik(adm_mod2))
RegPvalue = pchisq((DevTest), df=2, lower.tail=FALSE)
MixPvalue = RegPvalue/2
DevTest; RegPvalue; MixPvalue

wald_adm_mod2 = glht(adm_mod2, linfct = c('NEM_55 = 0' , 'RKG_55 = 0' , 'LEN_55 = 0', 'MATH_55 = 0' , 'GENDER = 0' , 'INCOME_Q2 = 0' , 'INCOME_Q3 = 0' , 'INCOME_Q4 = 0', 'INCOME_Q5 = 0', 'SECTOR_SUB = 0' , 'SECTOR_PRIV = 0'))
summary(wald_adm_mod2, test = Chisqtest()) # Joint chi-square test

adm_mod3_1 <- glm(ADM ~ 1 + NEM_55 + RKG_55 + LEN_55 + MATH_55 + GENDER + INCOME_Q2 + INCOME_Q3 + INCOME_Q4 + INCOME_Q5 + SECTOR_SUB + SECTOR_PRIV, family = binomial(link = 'probit'), data = data)

Table 3. Binary logistic regression results

mod_results <- tidy(adm_mod3)
mod_results <- round(mod_results[2:5],3)
coef <- data.frame(adm_mod3$coefficients)
coef_names <- rownames(coef)
flextable(cbind(coef_names,mod_results))
surr_rsq(adm_mod3_1, adm_mod3_1, data, avg.num = 100)
## ------------------------------------------------------------------------ 
## The surrogate R-squared of the model 
## ------------------------------------------------------------------------ 
## ADM ~ 1 + NEM_55 + RKG_55 + LEN_55 + MATH_55 + GENDER + INCOME_Q2 +  
##     INCOME_Q3 + INCOME_Q4 + INCOME_Q5 + SECTOR_SUB + SECTOR_PRIV 
## ------------------------------------------------------------------------ 
## is: 
## [1]  0.10179
dat <- read.table(header=TRUE, text='
Fixed_effects               Odds_Ratio
NEM_550                     0.989
Ranking_550                 1.022
Language_550                1.024
Mathematics_550             1.037
Sex_Female                  0.715
Income_Q2                     0.938
Income_Q3                     0.928
Income_Q4                     0.897
Income_Q5                     0.978
Sector_Private_Subsidized     0.932
Sector_Private              1.180')

c <- ggplot(dat[c(1:4),], aes(x = Odds_Ratio, y = Fixed_effects))+
  geom_point(size = 2.5)+
  geom_vline(xintercept = 1, linetype="dashed", 
             color = "blue", size=1) +
  scale_x_continuous(limits = c(.70, 1.25), breaks = seq(.70, 1.25, 0.1))+
  labs(x = 'Odds Ratio', y = 'Predictors')+
  scale_y_discrete(limits=rev)+
  theme_bw(base_size = 14)+
  ggtitle("Academic factors")+
  theme(axis.title = element_text(size = 12), plot.title = element_text(size = 12))

d <- ggplot(dat[c(5:11),], aes(x = Odds_Ratio, y = Fixed_effects))+
  geom_point(size = 2.5)+
  geom_vline(xintercept = 1, linetype="dashed", 
             color = "blue", size=1) +
  scale_x_continuous(limits = c(.70, 1.25), breaks = seq(.70, 1.25, 0.1))+
  labs(x = 'Odds Ratio', y = '')+
  scale_y_discrete(limits=rev)+
  theme_bw(base_size = 14)+
  ggtitle("Non-academic factors")+
  theme(axis.title = element_text(size = 12), plot.title = element_text(size = 12))

c+d

Figure 3. Binary logistic regression results

Table 4. Estimated probabilities of being admitted into selective higher education, admission process 2021

probs <- (summary(glht(model=adm_mod3, linfct=rbind(
  "Males_Pub_Q1" = c(1,0,0,0,0,0,0,0,0,0,0,0),
  "Males_Pub_Q2" = c(1,0,0,0,0,0,1,0,0,0,0,0),
  "Males_Pub_Q3" = c(1,0,0,0,0,0,0,1,0,0,0,0),
  "Males_Pub_Q4" = c(1,0,0,0,0,0,0,0,1,0,0,0),
  "Males_Pub_Q5" = c(1,0,0,0,0,0,0,0,0,1,0,0),
  "Males_Sub_Q1" = c(1,0,0,0,0,0,0,0,0,0,1,0),
  "Males_Sub_Q2" = c(1,0,0,0,0,0,1,0,0,0,1,0),
  "Males_Sub_Q3" = c(1,0,0,0,0,0,0,1,0,0,1,0),
  "Males_Sub_Q4" = c(1,0,0,0,0,0,0,0,1,0,1,0),
  "Males_Sub_Q5" = c(1,0,0,0,0,0,0,0,0,1,1,0),
  "Males_Pri_Q1" = c(1,0,0,0,0,0,0,0,0,0,0,1),
  "Males_Pri_Q2" = c(1,0,0,0,0,0,1,0,0,0,0,1),
  "Males_Pri_Q3" = c(1,0,0,0,0,0,0,1,0,0,0,1),
  "Males_Pri_Q4" = c(1,0,0,0,0,0,0,0,1,0,0,1),
  "Males_Pri_Q5" = c(1,0,0,0,0,0,0,0,0,1,0,1),
  "Females_Pub_Q1" = c(1,0,0,0,0,1,0,0,0,0,0,0),
  "Females_Pub_Q2" = c(1,0,0,0,0,1,1,0,0,0,0,0),
  "Females_Pub_Q3" = c(1,0,0,0,0,1,0,1,0,0,0,0),
  "Females_Pub_Q4" = c(1,0,0,0,0,1,0,0,1,0,0,0),
  "Females_Pub_Q5" = c(1,0,0,0,0,1,0,0,0,1,0,0),
  "Females_Sub_Q1" = c(1,0,0,0,0,1,0,0,0,0,1,0),
  "Females_Sub_Q2" = c(1,0,0,0,0,1,1,0,0,0,1,0),
  "Females_Sub_Q3" = c(1,0,0,0,0,1,0,1,0,0,1,0),
  "Females_Sub_Q4" = c(1,0,0,0,0,1,0,0,1,0,1,0),
  "Females_Sub_Q5" = c(1,0,0,0,0,1,0,0,0,1,1,0),
  "Females_Pri_Q1" = c(1,0,0,0,0,1,0,0,0,0,0,1),
  "Females_Pri_Q2" = c(1,0,0,0,0,1,1,0,0,0,0,1),
  "Females_Pri_Q3" = c(1,0,0,0,0,1,0,1,0,0,0,1),
  "Females_Pri_Q4" = c(1,0,0,0,0,1,0,0,1,0,0,1),
  "Females_Pri_Q5" = c(1,0,0,0,0,1,0,0,0,1,0,1))),test=adjusted("none")))
logit_pr <- logit_prob(probs[["test"]][["coefficients"]])
logit_pr <- data.frame(logit_pr)

tab_prob <- cbind(
a = data.frame(logit_pr[1:5,]),
b = data.frame(logit_pr[6:10,]),
c = data.frame(logit_pr[11:15,]),
d = data.frame(logit_pr[16:20,]),
e = data.frame(logit_pr[21:25,]),
f = data.frame(logit_pr[26:30,]))

colnames(tab_prob) <- c('male_pub', 'male_sub', 'male_priv','fem_pub', 'fem_sub', 'fem_priv')
rownames(tab_prob) <- c('Q1', 'Q2', 'Q3','Q4', 'Q5')
flextable(cbind(x = c('Q1', 'Q2', 'Q3','Q4', 'Q5'), round(tab_prob,3)))
data_prob <- data.frame(Quintil = c('Q1', 'Q2', 'Q3', 'Q4', 'Q5'), tab_prob)
library(ggplot2)

a <- ggplot(data_prob, aes(x = Quintil, group=1))+
  geom_point(aes(y = male_pub))+
  geom_line(aes(y = male_pub), color = 'red', size = 1)+
  geom_point(aes(y = male_sub))+
  geom_line(aes(y = male_sub), color = 'green', size = 1)+
  geom_point(aes(y = male_priv))+
  geom_line(aes(y = male_priv), color = 'blue', size = 1)+
  scale_y_continuous(limits = c(.6, .9), breaks = seq(.6, .9, .1))+
  labs(x = 'Quintile family income', y = 'Probability of admission')+
  theme_bw(base_size = 14)+
  ggtitle("Males")+
  theme(axis.title = element_text(size = 12), plot.title = element_text(size = 12))

b <- ggplot(data_prob, aes(x = Quintil, group=1))+
  geom_point(aes(y = fem_pub))+
  geom_line(aes(y = fem_pub), color = 'red', size = 1)+
  geom_point(aes(y = fem_sub))+
  geom_line(aes(y = fem_sub), color = 'green', size = 1)+
  geom_point(aes(y = fem_priv))+
  geom_line(aes(y = fem_priv), color = 'blue', size = 1)+
  scale_y_continuous(limits = c(.6, .9), breaks = seq(.6, .9, .1))+
  labs(x = 'Quintile family income', y = '')+
  theme_bw(base_size = 14)+
  ggtitle("Females")+
  theme(axis.title = element_text(size = 12), plot.title = element_text(size = 12))

a+b

Figure 6. Estimated probabilities of being admitted into selective higher education, admission process 2021

income_prob <- ggplot(prob_mod, aes(y = probability, x = Fam_Income, fill = factor(Fam_Income))) +
  geom_violin() +
  theme_bw() +
  labs(x = "Family income", y = "Probability of Admission", fill = "Family income")+
  scale_fill_discrete("Family income",
                      labels = c("Q1","Q2","Q3", "Q4", "Q5"))+
  scale_x_discrete(labels = c('','','','',''))+
  theme(axis.title.x=element_blank()) + theme(legend.position="bottom")
income_prob

Figure 7. Estimated probabilities of being admitted by Family income

school_prob <- ggplot(prob_mod, aes(y = probability, x = Sector, fill = Sector)) +
  geom_violin() +
  theme_bw() +
  labs(x = "School sector", y = "Probability of Admission", fill = "School sector")+
  scale_fill_discrete("School sector")+
  scale_x_discrete(labels = c('','',''))+
  theme(axis.title.x=element_blank()) + theme(legend.position="bottom")
school_prob

Figure 8. Estimated probabilities of being admitted by School sector