/*March 22, 2016:
Motivated by the CCSS expansion data that has weight^=1 for ALL survivors, I will modify the macro to take weight.*/

/*
Created by Qi Liu, Feb 2016. This macro created data in the way decribed in the paper below, and this data can be used to get cumualtive incidence for left&right trunction data.
Reference paper: Ronald B. Geskus, "Cause-Specific Cumulative Incidence Estimation and the Fine and Gray Model Under Both Left Truncation and Right Censoring", Biometrics 67, 39–49

indata: 		Input data for cumulative incidence analysis; It should not contain variable names "Tstart" & "Tstop" which will be genrated and used inside the macro.
timestart: 		The start time (e.g, if we use St Jude life study, this is age at 10 year post DX);
timestop: 		The end time (e.g.,  if we use St Jude life study, this is the end of study: event date for those with events [event of interest or competing risk event], 
					follow up end date for those censored);
id: 			Id for each subject (e.g.,  if we use St Jude life study, it is MRN)
evtvar: 		Event variable, 0 for censoring, 1 for event of interest, and 2,3,4... for competing risk event.
wtvar:			weight per subject from the input data.
outdata: 		The resulted output data that contains weights and indcator for event of interest. This can be used to estimate the survival function and 1-survival fucntion is cumulative incidence.

****NOTE:***** 
The survival probability estimated from "proc phreg" is slight different from that produced in R and hence resulted in minor difderence in the estimates of cumulative incidence,
especially the last 2-3 time points when people at risk is small.
*/


/*	%let indata=dat1; %let timestart=agestart; %let timestop=timeage; %let id=nid; %let evtvar=sevent; %let outdata=cumout;	*/


%macro wdata(indata=,timestart=,timestop=,id=,evtvar=,wtvar=,outdata=);
/*
%let indata=gt; 	%let timestart=startdx; %let timestop=timedx;	%let id=mrn;	%let evtvar=status14;	%let outdata=cmout; %let wtvar=weight;
%let indata=data_1; %let id=ccssid; %let evtvar=SMNcat; %let timestop=timesmn; %let output=ci_SMN;	%let timestart=start;
%let indata=pseudo; 	%let timestart=startdx; %let timestop=timedx;	%let id=mrn;	%let evtvar=status14;	%let outdata=cmout;
%let indata=gt; 	%let timestart=astart; 	%let timestop=timebirth; 	%let id=mrn; 	%let evtvar=status14; 	%let outdata=cm_timesbirth;
*/
/*If there is no weight variable specified then eveyone has weight 1.*/
%if &wtvar= %then %do;
	data indata;
		set &indata;
		wtvar=1;
	run;
%end;
%else %if &wtvar^= %then %do;
	data indata;
		set &indata;
		wtvar=&wtvar;
	run;
%end;

%let prec=2.220446e-13;
%let prec0=1.0e-13;

data gs;
	set indata;
	cen=(&evtvar=0); /*an indicator for censoring*/
	val=1;	
	/*In the situation for CCSS, start time is 5 and events before 5 are put as prevalence at 5, then tiemstart=timestop, which will not run in proc pheg.*/
	if &timestart^=. and &timestart=&timestop then &timestop=&timestop+&prec0;	
	/*to be same as the R code that for ceosnring Tstop+prec: i.e., when events and censoring on the same date, put censoring after event.*/
	Tstop_cen=&timestop+&prec*cen;
	if &evtvar=. or &timestop=. then delete;
run;

/*If there is no competing risk (This could happend when the original data has few competing risk events, it is very possible that bootstrapping samples 
contain no one with a competing risk events)*/
data cmpksj;
	set gs(where=(&evtvar not in (0,1)));
run;
proc sql; select count(&id) into:hmany_cmpk from cmpksj; quit;
%if &hmany_cmpk=0 %then %do; /*If there is no competing risk event.*/
	proc phreg data =gs;
		model (&timestart,&timestop)*&evtvar(0) =  /ties=Efron;
		baseline out=survprob survival=_all_;
		weight wtvar;
	run;
	data &outdata(keep=Tstop cumi SE low up);
		set survprob;
		cumi=1-survival;
		SE=StdErrSurvival;
		up=1-LowerSurvival;
		low=1-UpperSurvival;
		rename &timestop=Tstop;
	run;
%end;
%else %do; /*If there are competing risk events*/

/*R function "Y:\Common\Qi\St Jude\Nickhill\Imputation\New strategy\TimefromBirth\Left trunction\literature\crprep.R"
In R code:
  ## Calculate product-limit time-to-censoring distribution, "event" not included in case of ties
 #### Qi: if censored, then a tiny little bit time "prec" is added so when an event and censoring occurr at the same time, censor was made to be a little later.
  surv.cens <- survival::survfit(Surv(Tstart,Tstop+ifelse(status==cens,prec,0),status==cens)~strata.num)*/

/*	proc freq data=gs; table cen; run;	*/

/*proc lifetest does not accept (start, end) way  --no left trucntion time. Using the code below proc phreg, the survival function is not exactly the same as that in R.

/*This gave survival that is not exactly the same as R.
-----I will assume the survival fucntion estimates are correct and the difference is just due to software difference.*/
proc phreg data =gs;
	model (&timestart,Tstop_cen)*cen(0) =  /ties=Efron;
	baseline out=surv_cens survival=_all_ ;
	weight wtvar;
run;

/*##########################################################################################################
## Calculate time to entry (left truncation) distribution at t-, use 2*prec in order to exclude censorings at same time
  if(calc.trunc) surv.trunc <- survival::survfit(Surv(-Tstop,-(Tstart+2*prec),rep(1,n))~strata.num)*/
data gs;
	set gs;
	Rtstop=-&timestop;
	Rtstart=-(&timestart+2*&prec);
run;
/*this does not work given the <0 time values, while in R it works. survival::survfit(Surv(-Tstop,-(Tstart+2*prec),rep(1,n))~strata.num)
proc phreg data =gs;
model (Rtstop,Rtstart)*val(0)=;  
baseline out=Pred1 survival=_all_ ;
run;*/
/*What if I add a bit number to time to make it positive. This is just like moving the whole time scale up which should not change the survival function.
Then in the survival fucntion result, I can -the big number to go back to thr original time.*/
proc sql; select min(Rtstop) into:mintime separated by " " from gs; quit;
%put &mintime;
data gs;
	set gs;
	Rtstop_add=Rtstop+abs(&mintime)+10;  /*+ absolute of the smallest negative numebr +10 to make sure it is positive time*/
	Rtstart_add=Rtstart+abs(&mintime)+10;
run;
proc phreg data =gs;
	model (Rtstop_add,Rtstart_add)*val(0)=  /ties=Efron; 
	baseline out=surv_trunc survival=_all_ ;
	weight wtvar;
run;
data Surv_trunc;
	set Surv_trunc;
	time=Rtstart_add-(abs(&mintime)+10); /*Back to the original time*/  
run;

/*There are events 0,1,2 where 0 is censoring, 1 is event of interest, and 2 is the competing risk event.
Only for the competing risk events, we need to make it coutning process and gave weights.
For one with competing risk event, counting how many stop time (among those with event of interest) is bigger then this stop time (say 10),
	  then totally 10+1 segments will be created for this person, with the first row is its own start and end time.
*/

/*for event of interest and censoring subjects, only 1 row is kept*/
data not_cmpk;
	set gs(keep=&id &timestart &timestop &evtvar wtvar where=(&evtvar in (0,1)));
run;

/*Get the number of subjects with competing risk events*/
data cmpk;
	set gs(keep=&id &timestart &timestop &evtvar wtvar where=(&evtvar not in (0,1)));
	p=99;
run;
data _null_; set cmpk end=last; call symputx('ncmpk',_n_); run;
%put The numebr of subjects with competing risk events is: &ncmpk;

/*all the stop time points for theose with event of interest.*/
proc sql; create table uniqe_stop as select distinct &timestop as Tstop from gs(where=(&evtvar=1)); quit;
data uniqe_stop;
	set uniqe_stop;
	p=99;
run;
proc sql noprint;
	create table cmpkdata as
	select * from cmpk,  uniqe_stop(rename=(p=p2))  
	where cmpk.p=uniqe_stop.p2;
quit;

data cmpkdata;
	set cmpk(in=aa drop=p) cmpkdata(drop=p p2);
	if aa then Tstop=&timestop;
	if &timestop>=Tstop then delete;  /*if the 10 time points are later than the stop time of this person then keep 10 rows + its original row*/
run;
proc sort data=cmpkdata; by &id Tstop; run;
data cmpkdata;
	set cmpkdata;
	by &id;
	/*counting process data*/
	prev_time=lag(Tstop);
	Tstart=prev_time;
	drop prev_time;
run;
/*for the first time segment, use its own start time*/
data cmpkdata;
	set cmpkdata;
	by &id;
	if first.&id  then Tstart=&timestart;
run;
/*all subjects together*/
data allsubj;
	set not_cmpk(rename=(&timestop=Tstop &timestart=Tstart)) cmpkdata(keep=&id &evtvar Tstop Tstart wtvar);
run;

/*#####After we have the coutning process data, now need to make weights for censoring and trunction ###*/
/*get the survial function from surv_cens and suv_trunc for Tstop in allsubj*/

/*Censoring*/
proc sort data=allsubj; by Tstop; run;
data datwt(rename=(survival=weight_cens1));
	merge allsubj surv_cens(keep=Tstop_cen survival rename=(Tstop_cen=Tstop));
	by Tstop;
	retain aa;
	if survival^=. then aa=survival;
	else survival=aa;
	drop aa;
	if &id=. then delete;
run;
/*Trunction*/
data datwt;
	set datwt;
	time=-Tstop;
run;
proc sort data=datwt; by time; run;
data datwt(rename=(survival=weight_trunc1) drop=time);
	merge datwt surv_trunc(keep=Time survival);
	by Time;
	retain aa;
	if survival^=. then aa=survival;
	else survival=aa;
	drop aa;
	if &id=. then delete;
run;
proc sort data=datwt; by &id; run;
	
proc sort data=datwt; by &id Tstop; run;
/*check the number of rows per subject has --
If there is only one row, then weights=1; If there are more than 1 row (subjects with competing risk), then its first row have weight 1, other
row have weight/weight in first row*/
data datwt;
	set datwt;
	by &id;
	if first.&id then first=1;
	else first+1;
run;
data datwt2;
	merge datwt(rename=(first=rowid)) 
		datwt(keep=&id first weight_trunc1 weight_cens1 where=(first=1) rename=(weight_cens1=w_cens1 weight_trunc1=w_trunc1)); /*The first row's value*/
	by &id;
	drop first;
	if rowid=1 then do;
		weight_trunc=1; weight_cens=1;
	end;
	else do;
		weight_trunc=weight_trunc1/w_trunc1; 
		weight_cens=weight_cens1/w_cens1;
	end;
run;
/*double check the subjects with competing risk --only for them the weights will be ^=1*/
/*data see;
	set datwt2(where=(&evtvar=2));
run;*/
/*make a final dataset with an indicator for event=1*/
data datwt;
	set datwt2(keep=&id &evtvar Tstart Tstop weight_trunc weight_cens wtvar);
	evt=(&evtvar=1);
	wt=weight_trunc*weight_cens*wtvar;
run;

/*get cumulative incidence by 1-KM*/
proc phreg data =datwt;
	model (Tstart,Tstop)*evt(0) =  /ties=Efron;
	baseline out=survprob survival=_all_;
	weight wt;
run;
data &outdata(keep=Tstop cumi SE low up);
	set survprob;
	cumi=1-survival;
	SE=StdErrSurvival;
	up=1-LowerSurvival;
	low=1-UpperSurvival;
run;
%end; /*end for &hmany_cmpk^=0 codntion.*/
%mend wdata;


/*########  Example data set  #####*/
/*
proc import out=gt
			datafile="Y:\Common\Qi\St Jude\Nickhill\Imputation\New strategy\TimefromBirth\Left trunction\\dat for cum 14 more cmpk.csv" 
			DBMS=csv replace;
run;
proc freq data=gt; table status14 startdx; run;
*Make weights that is between 1 and 2:;
data gt;
	set gt;
	weight=ranuni(2016)+1;
run;
proc means data=gt min mean max; var weight; run;
*/

/*Use time from birth as the time scale. Since everyone starts at different age, there is left trunction and right censoring.*/
/*If there is weights in the input data:*/
	*%wdata(indata=gt,timestart=astart,timestop=timebirth,id=mrn,evtvar=status14,wtvar=weight,outdata=cm_timesbirth);
/*If there is no weights in the input data:*/
	*%wdata(indata=gt,timestart=astart,timestop=timebirth,id=mrn,evtvar=status14,outdata=cm_timesbirth2);
/*The corresponding R codes to get left tructuon cumualtive incidence is:
indata=read.csv("Y:\\Common\\Qi\\St Jude\\Nickhill\\Imputation\\New strategy\\TimefromBirth\\Left trunction\\dat for cum 14 more cmpk.csv")
library("mstate")
data1=indata[,c("MRN","astart","timebirth","status14")]
expdata=crprep(Tstop="timebirth", status="status14", data=data1, trans = 1, cens = 0, Tstart="astart", id="MRN")
fit1=survfit(Surv(Tstart,Tstop,status==1)~1,data=expdata,weight=weight.cens*weight.trunc)
cm1=cbind(summary(fit1)$time,1-summary(fit1)$surv)
*/




/*Use time from DX as the time scale. Eveyone start at 10 years from DX (so startdx=10), no left trunction, only right censoring.*/
*%wdata(indata=gt,timestart=startdx,timestop=timedx,id=mrn,evtvar=status14,outdata=cm_timesDX);

/*Since this has only right censoring, the cumualtinve incidence macro such as "cuminc.sas" (cumic_weight.sas is its version with weight) and "CIF_weight.sas" woudl work.
As we out note at the beginning, given the slight different survival probability produced by "proc phreg", the resutled cumualtive incidence is slightly different. */
*%include "W:\Common\Qi\webpage\MCC\cuminc.sas";
*%Greenwod(inset =gt, outset =cumout, timevar =timedx, eventvar =status14, lastevnt =2);

/*Make pseudo data set that has no competing risk and do cumualtive incidence with left trunction (different start time)
data pseudo;
	set gt;
	if status14=2 then status14=0;
run;
proc freq data=pseudo; table status14; run;*/
*%wdata(indata=pseudo,timestart=astart,timestop=timebirth,id=mrn,evtvar=status14,outdata=cm_timesbirth);