/*
HSR Method I - Week 3 Review of regression
*/
version 16
cd "H:\Teaching\Methods 2020\lectures\Week 3 Regression review\code"
log using "W3 regression.txt", text replace
/// --- Data
use https://www.stata-press.com/data/r16/cattaneo2, clear
desc bweight lbweight mbsmoke msmoke mage medu mrace fbaby
tabstat bweight, by(mbsmoke) stats(N mean sd min max)
/// --- Birthweight distributes normal, as many biological/physical metrics do
set scheme s1mono, permanently
hist bweight, kdensity saving(w_all.gph, replace) title("All")
hist bweight if mbsmoke == 1, kdensity saving(w_smok.gph, replace) title("Smoker")
hist bweight if mbsmoke == 0, kdensity saving(w_nonsmok.gph, replace) title("Non-smoker")
graph combine w_all.gph w_smok.gph w_nonsmok.gph, col(1) xcommon ysize(10)
graph export wg_graph.png, replace
graph combine w_all.gph w_smok.gph w_nonsmok.gph, row(1) ysize(4) xsize(12)
graph export wg_graph1.png, replace
/// --- Exaggerate the difference
clonevar bweight_fake = bweight
replace bweight_fake = bweight + 1000 if mbsmoke == 0
replace bweight_fake = bweight - 500 if mbsmoke == 1
hist bweight_fake, kdensity percent saving(hist_bwfake, replace)
graph export bw_fake.png, replace
* The linear/OLS is still a good model because normality is conditional on convariates
reg bweight_fake mbsmoke
predict res_std, rstandard
hist res_std, kdensity saving(rno.gph, replace)
qnorm res_std, saving(qno.gph, replace)
graph combine rno.gph qno.gph, row(1)
graph export nor.png, replace
/// ---- Meaning of R^2
reg bweight_fake mbsmoke
capture drop yhat
predict yhat
corr yhat bweight_fake
di 0.7721 ^2
* .59613841
* could have done
return list
di r(rho)^2
/// ---- Asymptotics
qui reg bweight mbsmoke mage medu
matrix list e(b)
matrix list e(V)
di sqrt(e(V)[2,2])
* Save
scalar bsmoke = e(b)[1,1]
scalar bsmoke_se = sqrt(e(V)[1,1])
di bsmoke " " bsmoke_se
/// -- Centering
sum mage
gen mage_c = mage - 35
qui reg bweight i.msmoke##c.mage
est sto m1
qui reg bweight i.msmoke##c.mage_c
est sto m2
est table m1 m2
/// ---- Wald tests
* Smoking categories
desc msmoke
label list smoke2
tab msmoke
reg bweight i.msmoke mage medu
test medu
*test medu =0
test medu = 1
* Joint: test if effect of 6-10 cigs is same as 11+ daily
test 2.msmoke= 3.msmoke
* Can't do: test medu > 12
** More on tests
qui reg bweight i.mbsmoke##c.mage medu
* to remember how to name the coefficient use the code below
matrix list e(b)
test mage 1.mbsmoke#c.mage
* syntax can be confusing
test mage = 1.mbsmoke#c.mage =0
* Same test comparing between two models
qui reg bweight i.mbsmoke##c.mage medu
scalar sse_f = e(rss)
qui reg bweight i.mbsmoke medu
scalar sse_r = e(rss)
di ((sse_r - sse_f)/2)/(sse_f/(4642-4-1))
* asymptotically equivalent to likelihood ratio tests comparing two models
qui reg bweight i.mbsmoke##c.mage medu
est sto mf
qui reg bweight i.mbsmoke medu
est sto mr
lrtest mf mr
/// --- ANOVA
reg bweight i.msmoke
test (1.msmoke=0) (2.msmoke=0) (3.msmoke=0)
* anova command
anova bweight i.msmoke
* type help contrast
contrast a.msmoke
/// t-tests and regression
reg bweight i.mbsmoke
ttest bweight, by(mbsmoke)
/// --- Confidence interval
reg bweight i.mbsmoke mage medu
test 1.mbsmoke = -292.2675
test 1.mbsmoke = -206.7613
test 1.mbsmoke = -250
test 1.mbsmoke = -300
* We are saying that the coefficient of mbsmoke distributes normal with
* mean -249.5144 and standard deviation (the standard error) of 21.80749
* The Wald test of the null beta=0 distributes t-student
* The estimated CI is [-292.2675, -206.7613]
* Let's simulate the standardized coefficient for smoke
preserve
clear
set seed 1234567066
set obs 10000
* The rt function generates t-student distribution with mean zero and sd 1
gen zt = rt(4642-4)
* Make it have a mean of bsmoke and sd of bsmoke_se
gen beta_sm_sd = bsmoke_se*zt + bsmoke
* Get the 2,5 and 97.5 percentile
_pctile beta_sm_sd, p(2.5)
local ci_lb=r(r1)
_pctile beta_sm_sd, p(97.5)
local ci_ub=r(r1)
di "[" `ci_lb' ", " `ci_ub' "]"
* Another way to get the percentile
*centile beta_sm_sd, centile(2.5(5)97.5)
* How many times are the simulated coefficients within the confidence interval?
qui count if beta_sm_sd >= -292.2675 & beta_sm_sd <= -206.7613
di r(N)/10000
* More nifty. What is the probability that the difference in birthweight between
* smokers and non-smokers is greater than -300 grams?
qui count if beta_sm_sd >= -310
di r(N)/10000
restore
/// --- Linear probability model
sum lbweight mage
reg lbweight mage, robust
predict yhat_ols
line yhat_ols mage, color(red) saving(pred_ols.gph, replace)
* Compare to logistic or probit
logit lbweight mage, nolog
predict yhat_logit
line yhat_logit mage, sort color(blue) saving(pred_logit.gph, replace)
margins, dydx(mage)
probit lbweight mage, nolog
margins, dydx(mage)
* Intuition
graph combine pred_ols.gph pred_logit.gph, row(1)
graph export compare_preds.png, replace
* Compare to quadratic specification
reg lbweight c.mage##c.mage, robust
margins, dydx(c.mage)
predict yhat_quad
line yhat_quad mage, sort
* not great
/// --- A nice way to explore the shape of a 0/1 variable in relation to another var
* Lowess is a smoother method, semiparametric
lowess lbweight mage, gen(y_smooth)
* make it pretty
scatter lbweight mage, jitter(2) msize(vsmall) legend(off) ///
|| line y_smooth mage, sort color(red)
graph export low.png, replace
* There is an option to transform into logits:
* Logits are log-odds: log(p/(1-p))
lowess lbweight mage, logit saving(y_smooth_l.gph, replace)
* That's the smoothed version of this
logit lbweight mage
predict yhat_logit1, xb
line yhat_logit1 mage, sort saving(logit.gph, replace)
graph combine y_smooth_l.gph logit.gph, col(1) xcommon ysize(7)
/// --- Lowess is easier to undertand, but nonparametric methods now have their own
* command: npregress, using kernel (just a weighting function) or series
* it can be slow with a lot of observations
npregress kernel lbweight mage
npgraph
* Or make your own graphs
npregress kernel lbweight mage, predict(y_kernel deriv)
sum y_kernel deriv
* note this matches the output
scatter lbweight mage, jitter(3) || line y_kernel mage, sort color(red)
line y_kernel mage, sort color(red)
graph export ykernel.png, replace
/// --- Interactions and stratification
* Run stratified models and save coefficients
qui reg lbweight mage if mbsmoke == 1, robust
local m1_alpha1 = _b[mage]
qui reg lbweight mage if mbsmoke == 0, robust
local m1_gamma1 = _b[mage]
* Difference
di `m1_alpha1' - `m1_gamma1'
*.006432
* Should match interaction term
qui reg lbweight c.mage##i.mbsmoke, robust
di _b[1.mbsmoke#c.mage]
*.006432
log close