RetirementSweden

R code

This R Code describes the functions used to find matching observations using the method described in (Causal effects of the timing of life course events: age at retirement and subsequent health by Barban et al.)

The code consists of 3 parts:

  1. R functions used to a) import life course trajectories from Stata files; b) Find sequences with minimal distances under different metrics (Optimal matching, Propensity scores, combined)

  2. Simulate a dataset with same characteristics of data from the Linneus database

  3. Use the method and obtain as output the IDs of suitable matched controls under different matching strategies.

R functions

a) Import life course trajectories

import_seq=function(age=0, sex="man"){
require(TraMineR)
require(foreign)
require(RColorBrewer)



filename=paste(sex,"_",age,".dta",sep="")
print(filename)
if (age==0) age=60
data=read.dta(filename,convert.factors=F)
SEQ=list()

##### defining health seq

health.col60=c("seq_health_55",  "seq_health_56" , "seq_health_57" ,"seq_health_58" ,"seq_health_59"  )

health.col61=c(health.col60, "seq_health_60")
health.col62=c(health.col61, "seq_health_61")
health.col63=c(health.col62, "seq_health_62")
health.col64=c(health.col63, "seq_health_63")
health.col65=c(health.col64, "seq_health_64")
health.col66=c(health.col65, "seq_health_65")
health.col67=c(health.col66, "seq_health_66")
health.col68=c(health.col67, "seq_health_67")
health.col69=c(health.col68, "seq_health_68")
health.col=get(paste("health.col",age, sep=""))

h=which(colnames(data) %in% health.col)


SEQ$health=seqdef(data, h, id=data$I_LINNEID, cnames=c(55:(age-1)),
	 				missing.color="white", alphabet=c(0,1,2,3,4,5,6,7), 
					labels=c("No hospital", "No Hospital/Sick-benefit", 
					"No Hospital/invalidity", "No Hospital/both benefit", 
					"1" ,"2","3", ">3")
					,right=NA)
					


 if (sex=="man") pscol=which(colnames(data) %in% paste("psM_",age, sep=""))
 if (sex=="woman") pscol=which(colnames(data) %in% paste("psW_",age, sep=""))
 
SEQ$ps=data[,pscol]

t=paste("retirement_",age, sep="")
SEQ$treatment=data[,colnames(data)==t]

SEQ$Treatment=as.factor(SEQ$treatment)
levels(SEQ$Treatment)=c("control", "treated")

SEQ$treatment=SEQ$treatment

out=paste("mean_hospital_5_",(age), sep="")
SEQ$outcome=data[,colnames(data)==out]

education=paste("education_",(age), sep="")
SEQ$education=data[,colnames(data)==education]

SEQ$yearofbirth=data[,colnames(data)=="YEAROFBIRTH"]


save(SEQ,file=paste("R_",sex,"_",age,".Rdata", sep=""))
return(SEQ)

}

b) Find Minimum Distance

find.min_distATT=function(SEQ, fileout){
require(TraMineR)
require(foreign)
require(snowfall)


timestart=Sys.time()

### Defining internal vars
treatment=SEQ$treatment



SEQ$health=SEQ$health
SEQ$work=SEQ$work

outcome=SEQ$outcome

idnames=rownames(SEQ[[1]])	
names(SEQ[[3]])=idnames		
c=2
r=sum(treatment==1)
ncontr=sum(treatment==0)

id_1=rownames(SEQ[[1]][treatment==1,])
id_2=c()
submat=list()
seq_t=c()
id_m=matrix(ncol=c, nrow=r)

##########################
### substitution matrix for trajectories
sub_matrix=list()
for (i in 1:1) {sub_matrix[[i]]=seqsubm(SEQ[[i]], method="TRATE", with.missing=TRUE)}
##############




min_dist<-function(k, c=2, seq=SEQ, sub=sub_matrix)
		{
		print(paste(k,"/",r,sep=""))
      id_1=rownames(seq[[1]][seq$treatment==1,])
		
		d<-list()
    	seq_controls<-list()
    	id_m=c()
    	 
		for (i in 1:c){
	
 	###id reference seq


	    if (i==1){ 
	      seq_t=seq[[i]][seq$treatment==1, ]
		   id<-which(idnames==rownames(seq_t[k,]))
		   #check.dim=length(seq[[i]][seq$treatment==0 & seq$education==seq$education[id] & seq$yearofbirth==seq$yearofbirth[id] ,]) 
		   seq_controls[[i]]=seq[[i]][seq$treatment==0 & seq$yearofbirth==seq$yearofbirth[id] ,]                         	    
		                      #if (check.dim==0)  seq_controls[[i]]=seq[[i]][seq$treatment==0  & seq$yearofbirth==seq$yearofbirth[id] ,]                         	    
		    d[[i]]=seqdist(seq_controls[[i]],method = "OM", indel=1, sm=sub[[i]], refseq=seq_t[k,],with.missing=TRUE)
	                          
	      if (length(which(d[[i]]==min(d[[i]])))==1) id_m[i]=rownames(seq_controls[[i]][which.min(d[[i]]),])		
         if (length(which(d[[i]]==min(d[[i]])))>1) id_m[i]=rownames(seq_controls[[i]][sample(which(d[[i]]==min(d[[i]])),1),])		
    	   }
		                      
	   #if (i==2){  d[[i]]=d[[1]]*0; id_m[i]=names(seq_controls[[i]][sample(which(d[[i]]==min(d[[i]])),1)]}
   	      
      if (i==2){ 
         seq_t=seq[[3]][seq$treatment==1]
         seq_controls[[2]]=seq[[3]][seq$treatment==0 & seq$yearofbirth==seq$yearofbirth[id] ]                         	    
         #rownames(seqcontrols)=rownames(seq_controls[[i]])
         #seq_controls[[i]]=seqcontrols
         
           d[[i]]=abs(seq_controls[[i]]-seq_t[k])
         
         if (length(which(d[[i]]==min(d[[i]])))==1) id_m[i]=names(seq_controls[[i]][which.min(d[[i]])])		
         if (length(which(d[[i]]==min(d[[i]])))>1) id_m[i]=names(seq_controls[[i]][sample(which(d[[i]]==min(d[[i]])),1)])		
         }
        }                                        
        d[[2]][is.na(d[[2]])==T]<-max(d[[2]]) #if propensity score is missing

  
        
         D2=(d[[1]]/max(d[[1]])) + (d[[2]]/max(d[[2]]))		
         if (length(which(D2==min(D2)))==1) id_m[3]=names(D2[which.min(D2)])
         if (length(which(D2==min(D2)))>1) id_m[3]=names(D2[sample(which(D2==min(D2)),1)]) 


output=matrix(c(id_1[k], id_m[1:3]), ncol=4, nrow=1)


return(as.numeric(output))

}

 library(foreach)	
 library(doSNOW)
 registerDoSNOW(makeCluster(12, type="SOCK"))
getDoParWorkers()

OUT<-foreach(k=1:r, .packages='TraMineR', .combine="cbind") %dopar% min_dist(k, c=2, seq=SEQ, sub=sub_matrix) 


B=as.data.frame(t(OUT))
names(B)=c("id_T", "M_health", "M_ps",  "M_total")
write.dta(B,file=paste(fileout,  "_single_matches_couples.dta",sep=""))
save(B, file=paste(fileout,  "_single_matches_couples.Rdata",sep=""))
print(Sys.time()-timestart)
return(B)	



}

Simulate dataset

n = 10000
sample_size = n 
I_LINNEID = 1:n
YEAROFBIRTH = sample( 1935:1945, n, replace=TRUE, prob=rep(1/11,11) )
retirement_61 = sample(c(0,1), n, replace=TRUE, prob=c(1-0.07, 0.07) )
education_61 = sample( 1:3, n, replace=TRUE, prob=c(.39, .39, .22) )
cum_hosp_61 = rpois(n, .6)
seq_health_55 = sample( 0:7, n, replace=TRUE, prob=c(.744, .143, .307, .142, .168,.123,.072, .316) )
seq_health_56 = sample( 0:7, n, replace=TRUE, prob=c(.755, .118, .385, .161, .178,.129,.075, .333) )
seq_health_57 = sample( 0:7, n, replace=TRUE, prob=c(.763, .979, .474, .184, .184,.130,.075, .346))
seq_health_58 = sample( 0:7, n, replace=TRUE, prob=c(.748, .968, .574, .208, .191,.124,.076, .370) )
seq_health_59 = sample( 0:7, n, replace=TRUE, prob=c(.731, .960, .692, .235, .201,.130,.081, .388) )
seq_health_60 = sample( 0:7, n, replace=TRUE, prob=c(.708, .950, .834, .289, .209,.129,.085, .419) )
psM_61 = rnorm(n, mean=.0920115  ,  sd=.0418976 )
age_last_obs =   sample( 62:71, n, replace=TRUE, prob=c(.128, .124, .113, .103, .872,.824,.815, .761, .708, .132) )
sum_hosp = rpois(n,  4.413244)
outcome_hosp = rnorm(n, mean= .8156941  ,sd=  3.32509)

dataset=data.frame(I_LINNEID, YEAROFBIRTH, retirement_61, education_61,
						cum_hosp_61, seq_health_55, seq_health_56, seq_health_57,
						seq_health_57, seq_health_58, seq_health_59, seq_health_60,
						psM_61, age_last_obs, outcome_hosp)
						
						
library(foreign)
write.dta(dataset, "man_61.dta")

Execute (example for treatment age 61, men)

rm(list=ls())

setwd( "yourdirectory")

library(doSNOW)
S61=import_seq(age=61, sex="man")
res_man61=find.min_distATT(S61,  fileout="man_61")