This R Code describes the functions used to find matching observations using the method described in (Causal effects of the timing of life course events: age at retirement and subsequent health by Barban et al.)
The code consists of 3 parts:
R functions used to a) import life course trajectories from Stata files; b) Find sequences with minimal distances under different metrics (Optimal matching, Propensity scores, combined)
Simulate a dataset with same characteristics of data from the Linneus database
Use the method and obtain as output the IDs of suitable matched controls under different matching strategies.
import_seq=function(age=0, sex="man"){
require(TraMineR)
require(foreign)
require(RColorBrewer)
filename=paste(sex,"_",age,".dta",sep="")
print(filename)
if (age==0) age=60
data=read.dta(filename,convert.factors=F)
SEQ=list()
##### defining health seq
health.col60=c("seq_health_55", "seq_health_56" , "seq_health_57" ,"seq_health_58" ,"seq_health_59" )
health.col61=c(health.col60, "seq_health_60")
health.col62=c(health.col61, "seq_health_61")
health.col63=c(health.col62, "seq_health_62")
health.col64=c(health.col63, "seq_health_63")
health.col65=c(health.col64, "seq_health_64")
health.col66=c(health.col65, "seq_health_65")
health.col67=c(health.col66, "seq_health_66")
health.col68=c(health.col67, "seq_health_67")
health.col69=c(health.col68, "seq_health_68")
health.col=get(paste("health.col",age, sep=""))
h=which(colnames(data) %in% health.col)
SEQ$health=seqdef(data, h, id=data$I_LINNEID, cnames=c(55:(age-1)),
missing.color="white", alphabet=c(0,1,2,3,4,5,6,7),
labels=c("No hospital", "No Hospital/Sick-benefit",
"No Hospital/invalidity", "No Hospital/both benefit",
"1" ,"2","3", ">3")
,right=NA)
if (sex=="man") pscol=which(colnames(data) %in% paste("psM_",age, sep=""))
if (sex=="woman") pscol=which(colnames(data) %in% paste("psW_",age, sep=""))
SEQ$ps=data[,pscol]
t=paste("retirement_",age, sep="")
SEQ$treatment=data[,colnames(data)==t]
SEQ$Treatment=as.factor(SEQ$treatment)
levels(SEQ$Treatment)=c("control", "treated")
SEQ$treatment=SEQ$treatment
out=paste("mean_hospital_5_",(age), sep="")
SEQ$outcome=data[,colnames(data)==out]
education=paste("education_",(age), sep="")
SEQ$education=data[,colnames(data)==education]
SEQ$yearofbirth=data[,colnames(data)=="YEAROFBIRTH"]
save(SEQ,file=paste("R_",sex,"_",age,".Rdata", sep=""))
return(SEQ)
}
find.min_distATT=function(SEQ, fileout){
require(TraMineR)
require(foreign)
require(snowfall)
timestart=Sys.time()
### Defining internal vars
treatment=SEQ$treatment
SEQ$health=SEQ$health
SEQ$work=SEQ$work
outcome=SEQ$outcome
idnames=rownames(SEQ[[1]])
names(SEQ[[3]])=idnames
c=2
r=sum(treatment==1)
ncontr=sum(treatment==0)
id_1=rownames(SEQ[[1]][treatment==1,])
id_2=c()
submat=list()
seq_t=c()
id_m=matrix(ncol=c, nrow=r)
##########################
### substitution matrix for trajectories
sub_matrix=list()
for (i in 1:1) {sub_matrix[[i]]=seqsubm(SEQ[[i]], method="TRATE", with.missing=TRUE)}
##############
min_dist<-function(k, c=2, seq=SEQ, sub=sub_matrix)
{
print(paste(k,"/",r,sep=""))
id_1=rownames(seq[[1]][seq$treatment==1,])
d<-list()
seq_controls<-list()
id_m=c()
for (i in 1:c){
###id reference seq
if (i==1){
seq_t=seq[[i]][seq$treatment==1, ]
id<-which(idnames==rownames(seq_t[k,]))
#check.dim=length(seq[[i]][seq$treatment==0 & seq$education==seq$education[id] & seq$yearofbirth==seq$yearofbirth[id] ,])
seq_controls[[i]]=seq[[i]][seq$treatment==0 & seq$yearofbirth==seq$yearofbirth[id] ,]
#if (check.dim==0) seq_controls[[i]]=seq[[i]][seq$treatment==0 & seq$yearofbirth==seq$yearofbirth[id] ,]
d[[i]]=seqdist(seq_controls[[i]],method = "OM", indel=1, sm=sub[[i]], refseq=seq_t[k,],with.missing=TRUE)
if (length(which(d[[i]]==min(d[[i]])))==1) id_m[i]=rownames(seq_controls[[i]][which.min(d[[i]]),])
if (length(which(d[[i]]==min(d[[i]])))>1) id_m[i]=rownames(seq_controls[[i]][sample(which(d[[i]]==min(d[[i]])),1),])
}
#if (i==2){ d[[i]]=d[[1]]*0; id_m[i]=names(seq_controls[[i]][sample(which(d[[i]]==min(d[[i]])),1)]}
if (i==2){
seq_t=seq[[3]][seq$treatment==1]
seq_controls[[2]]=seq[[3]][seq$treatment==0 & seq$yearofbirth==seq$yearofbirth[id] ]
#rownames(seqcontrols)=rownames(seq_controls[[i]])
#seq_controls[[i]]=seqcontrols
d[[i]]=abs(seq_controls[[i]]-seq_t[k])
if (length(which(d[[i]]==min(d[[i]])))==1) id_m[i]=names(seq_controls[[i]][which.min(d[[i]])])
if (length(which(d[[i]]==min(d[[i]])))>1) id_m[i]=names(seq_controls[[i]][sample(which(d[[i]]==min(d[[i]])),1)])
}
}
d[[2]][is.na(d[[2]])==T]<-max(d[[2]]) #if propensity score is missing
D2=(d[[1]]/max(d[[1]])) + (d[[2]]/max(d[[2]]))
if (length(which(D2==min(D2)))==1) id_m[3]=names(D2[which.min(D2)])
if (length(which(D2==min(D2)))>1) id_m[3]=names(D2[sample(which(D2==min(D2)),1)])
output=matrix(c(id_1[k], id_m[1:3]), ncol=4, nrow=1)
return(as.numeric(output))
}
library(foreach)
library(doSNOW)
registerDoSNOW(makeCluster(12, type="SOCK"))
getDoParWorkers()
OUT<-foreach(k=1:r, .packages='TraMineR', .combine="cbind") %dopar% min_dist(k, c=2, seq=SEQ, sub=sub_matrix)
B=as.data.frame(t(OUT))
names(B)=c("id_T", "M_health", "M_ps", "M_total")
write.dta(B,file=paste(fileout, "_single_matches_couples.dta",sep=""))
save(B, file=paste(fileout, "_single_matches_couples.Rdata",sep=""))
print(Sys.time()-timestart)
return(B)
}
n = 10000
sample_size = n
I_LINNEID = 1:n
YEAROFBIRTH = sample( 1935:1945, n, replace=TRUE, prob=rep(1/11,11) )
retirement_61 = sample(c(0,1), n, replace=TRUE, prob=c(1-0.07, 0.07) )
education_61 = sample( 1:3, n, replace=TRUE, prob=c(.39, .39, .22) )
cum_hosp_61 = rpois(n, .6)
seq_health_55 = sample( 0:7, n, replace=TRUE, prob=c(.744, .143, .307, .142, .168,.123,.072, .316) )
seq_health_56 = sample( 0:7, n, replace=TRUE, prob=c(.755, .118, .385, .161, .178,.129,.075, .333) )
seq_health_57 = sample( 0:7, n, replace=TRUE, prob=c(.763, .979, .474, .184, .184,.130,.075, .346))
seq_health_58 = sample( 0:7, n, replace=TRUE, prob=c(.748, .968, .574, .208, .191,.124,.076, .370) )
seq_health_59 = sample( 0:7, n, replace=TRUE, prob=c(.731, .960, .692, .235, .201,.130,.081, .388) )
seq_health_60 = sample( 0:7, n, replace=TRUE, prob=c(.708, .950, .834, .289, .209,.129,.085, .419) )
psM_61 = rnorm(n, mean=.0920115 , sd=.0418976 )
age_last_obs = sample( 62:71, n, replace=TRUE, prob=c(.128, .124, .113, .103, .872,.824,.815, .761, .708, .132) )
sum_hosp = rpois(n, 4.413244)
outcome_hosp = rnorm(n, mean= .8156941 ,sd= 3.32509)
dataset=data.frame(I_LINNEID, YEAROFBIRTH, retirement_61, education_61,
cum_hosp_61, seq_health_55, seq_health_56, seq_health_57,
seq_health_57, seq_health_58, seq_health_59, seq_health_60,
psM_61, age_last_obs, outcome_hosp)
library(foreign)
write.dta(dataset, "man_61.dta")
rm(list=ls())
setwd( "yourdirectory")
library(doSNOW)
S61=import_seq(age=61, sex="man")
res_man61=find.min_distATT(S61, fileout="man_61")