####################################################################################################### ## Cluster detection code for case-based and event-based analyses. ## ## Programmed by R.J. Rosychuk, Ph.D., P.Stat. ## ## Beta version: September 2, 2008 ## ## These codes are developed using Splus 8.0 and R for Windows. These codes are hoped to be useful, but are ## provided WITHOUT ANY WARRANTY; ## without even the implied warranty of MERCHANTABILITY or FITNESS FOR ## A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## ## See www.ualberta.ca/~rhondar/cluster for usage information. ####################################################################################################### #I: total number of cells in the study region #S: total number of strata #nnmat: I by I matrix that contains the ordered nearest neighbours for #each cell, nnmat[i,1]=i, nnmat[i,j]=(j+1)th nearest neighbour to cell #i (j not equal to i) #casemat: I by S matrix of counts of cases, casemat[i,s]=number of #cases in cell i and stratum s (sometimes called datmat) #popmat: I by S matrix of populations, popmat[i,s]=population in #stratum s for cell i #eventmat=array of event frequencies, eventmat[i,j,s]=number of cases #in cell i and stratum s that have j events test.stat=function(i,k,datmat,nnmat,resultmat){ #calculate the test statistic for either the case-based or event-based #method for cell i with cluster size k #datmat=matrix of cases or events, datmat[i,s]=cases or events in cell #i and stratum s #nnmat=matrix of nearest neighbours, nnmat[i,j]=(j-1)th nearest #neighbour to cell i, where nnmat[i,1]=i #resultmat=matrix of results from the testing #sum up the number of cases for each region tempdattot=rowSums(datmat) #get the items that are for cell i and its nearest neighbours nndata=tempdattot[nnmat[i,]] sums=cumsum(nndata) #get the number of neighbours with at least k items ell=min(which(sums>=k))-1 resultmat[i,4]=ell #store the observed value of the test statistic resultmat[i,5]=sums[ell+1] #store the observed number of items return(resultmat) } get.signif.case=function(i,k,datmat,popmat,nnmat,totitem,totpop,resultmat,signiflevel,print.it=F){ #obtains the case-based testing for cell i with cluster size k #datmat=matrix of cases, datmat[i,s]=cases in cell i and stratum s #popmat=matrix of population, popmat[i,s]=population in cell i and #stratum s #nnmat=matrix of nearest neighbours, nnmat[i,j]=(j-1)th nearest #neighbour to cell i, where nnmat[i,1]=i #totitem=vector of case totals per stratum, totitem[s]=number of cases #in stratum s in the entire study region #totpop=vector of population totals per stratum, totpop[s]=population #in stratum s in the entire study region #resultmat=matrix of results from the testing #popmat=matrix of population, popmat[i,s]=population in cell i and #stratum s #nnmat=matrix of nearest neighbours, nnmat[i,j]=(j-1)th nearest #neighbour to cell i, where nnmat[i,1]=i #signiflevel=significance level to test each cell at #print.it=if T, prints out some intermediate calculations numstrat=ncol(datmat) resultmat=test.stat(i,k,datmat,nnmat,resultmat) #cat("resultmat[i,4]=",resultmat[i,4],"\n") ncombined=popmat[nnmat[i,1:(resultmat[i,4]+1)],] if(is.matrix(ncombined)==T) {ncombined=colSums(ncombined)} else{ncombined=sum(ncombined)} lambda=sum(ncombined*(totitem/totpop)) signif=1-ppois(k-1,lambda) resultmat[i,6]=round(lambda,3) #expected resultmat[i,7]=round(resultmat[i,5]/lambda,3) #observed over expected resultmat[i,8]=round(signif,3) #pvalue resultmat[i,9]=0 if(signif=sum(totitem)) cat("PROBLEMS: cluster size includes all cases.\n") } } return(kmat) } ################################################################ # main testing function for the case-based test, no simulations ################################################################ case.test=function(kmat,datmat,popmat,nnmat,signiflevel=0.05,print.it=F){ #this is the main function to run to obtain the case-based test, with #or without the testing algorithm #kmat=vector of user supplied cluster sizes to test, has dimension #equal to the number of cells (no testing algorithm) # =matrix of user suppplied cluster sizes to test, has multiple #sizes per cell as columns (testing algorithm) #datmat=matrix of cases, datmat[i,s]=cases in cell i and stratum s #popmat=matrix of population, popmat[i,s]=population in cell i and #stratum s #nnmat=matrix of nearest neighbours, nnmat[i,j]=(j-1)th nearest #neighbour to cell i, where nnmat[i,1]=i #signiflevel=significance level to test each cell at #print.it=if T, prints out some intermediate calculations #outputs a matrix of results: Alg(w) indicates if the testing algorithm #was used and if so what number of neighbours were combined to # determine the cluster size, Flag=1 if #significant at the siglevel, 0 otherwise datmat=as.matrix(datmat);popmat=as.matrix(popmat) totitem=colSums(datmat);totpop=colSums(popmat) I=nrow(datmat) if(print.it==T){ cat("\n\n###############################################################\n") cat("CASE-BASED TEST: significance level=",signiflevel,"\n") cat("Number of cells: ",I," Total cases:",sum(totitem)," Total population:",sum(totpop),"\n") cat("Number of strata: ",ncol(datmat),"\n") if(ncol(datmat)>1){ cat("Cases by strata:\n") print(totitem) cat("Population by strata:\n") print(totpop) } cat("###############################################################\n") } resultmat=matrix(NA,I,9) dimnames(resultmat)[[2]]=c("Cell","Alg(w)","ClustSize","TestStat","Observed","Expected","Obs/Exp","p-value","Flag") if(length(kmat)==I){#do not do the algorithm, use user supplied cluster sizes for(i in 1:I){ resultmat[i,1:3]=c(i,NA,kmat[i]) resultmat=get.signif.case(i,kmat[i],datmat,popmat,nnmat,totitem,totpop,resultmat,signiflevel,print.it) } } else{#do the testing algorithm maxw=ncol(kmat) for(i in 1:I){ resultmat[i,1]=i w=0;stop=FALSE while(w0){ #the total number of cases to randomize in stratum s is totitem[s] myweight=rep(1:I,popmat[,s]) simvals=sample(myweight,totitem[s],replace=T) myfreq=table(simvals) simdatmat[as.numeric(attributes(myfreq)$dimnames[[1]]),s]=myfreq } } return(simdatmat) } ################################################################ # main simulation function for the case-based test ################################################################ sim.case.calc=function(kmat,datmat,popmat,nnmat,resultmat,numsim,signiflevel=0.05,print.it=F){ #this is the main function to run to obtain the simulations for the #case-based test, with or without the testing algorithm #returns the overall clustering p-value #kmat=vector of user supplied cluster sizes to test, has dimension #equal to the number of cells (no testing algorithm) # =matrix of user suppplied cluster sizes to test, has multiple #sizes per cell as columns (testing algorithm) #popmat=matrix of population, popmat[i,s]=population in cell i and #stratum s #datmat=matrix of cases, datmat[i,s]=cases in cell i and stratum s #resultmat=matrix of results output from event.test #nnmat=matrix of nearest neighbours, nnmat[i,j]=(j-1)th nearest #neighbour to cell i, where nnmat[i,1]=i #numsim=the number of simulations #signiflevel=significance level to test each cell at #print.it=if T, prints out some intermediate calculations numsig=sum(resultmat[,9]) #determine how many cells were significant mcvals=rep(0,3*numsig) #a vector to collect the number of significant cells per simulation for(j in 1:numsim){ #get the simulated case matrix simdatamat=get.sim.case(datmat,popmat) simresultmat=case.test(kmat,simdatamat,popmat,nnmat,signiflevel,print.it) simnumsig=sum(simresultmat[,9]) mcvals[simnumsig+1]=mcvals[simnumsig+1]+1 } #determine number of sims that exceeded numsig significant cells revmcvals=mcvals[(3*numsig):1] mymat=cumsum(revmcvals) mymat=mymat[(3*numsig):1] exceeded=sum(mymat[(numsig+1+1):(length(mymat))]) mcpval=exceeded/numsim cat("\nSimulation results:\n") mcvals=as.matrix(mcvals) names(mcvals)=0:(length(mcvals)-1) cat("Frequency of the number of clusters identified:\n") print(t(mcvals)) cat("Number of sims=",numsim," Number of sims >",numsig,"=",exceeded," pvalue=",round(mcpval,3),"\n") return(mcpval) } get.Qx.event=function(eventmat,print.it=F){ #determine the proportion of cases that have each event frequency for #each stratum s, Qs(x) #eventmat is an array: cell by reps by strata #print.it=if T, prints out some intermediate calculations numstrata=dim(eventmat)[[3]] #number of strata Qxmat=matrix(NA,dim(eventmat)[[2]],dim(eventmat)[[3]]) for(s in 1:numstrata){ #for each strata num1=colSums(eventmat[,,s]) den1=sum(eventmat[,,s]) if(den1==0){ Qxmat[,s]=rep(0,nrow(Qxmat)) } else{ Qxmat[,s]=num1/den1 } } if(print.it==T){ cat("Qxmat=\n") print(Qxmat) } return(Qxmat) } get.event.prob=function(maxz,lambda,Qx){ #obtain the probabilities for the total number of events in combined #cells #maxz=maximum level of the recursion to calculate #lambda=expected value for the number of cases #Qx=vector, Qx[x]=probability that a case has exactly x events nx=length(Qx) prob.event=rep(0,maxz+1); #set up a vector for the Probability of exactly x events prob.event[1]=exp(-lambda) #the probability when x=0 newQx=c(1:nx)*Qx #do some pre-multiplication for the recursion ntimes=max(maxz-nx+1,0) newQx=c(newQx,rep(0,ntimes)) #If Qx has fewer than maxz elements, flood remaining with 0 so vector multiplication works for(z in 1:maxz){#the recursion relation prob.event[z+1]=(lambda/z)*(newQx[1:z]%*%(prob.event[z:1])) #cat("prob.event[",z+1,"]=",prob.event[z+1],"\n") } return(prob.event) } get.event.prob.part=function(prob.event,z,lambda,Qx){ #obtain an additional probability for the total number of events in #combined cells #prob.event=vector, prob.event[x]=probability of exactly x-1 events in #combined cells #z=level of the recursion to calculate #lambda=expected value for the number of cases #Qx=vector, Qx[x]=probability that a case has exactly x events nx=length(Qx) newQx=c(1:nx)*Qx ntimes=max(z-nx+1,0) newQx=c(newQx,rep(0,ntimes)) #If Qx has fewer than maxz elements, #flood remaining with 0 so vector multiplication works prob.event=c(prob.event,NA) prob.event[z+1]=(lambda/z)*(newQx[1:z]%*%(prob.event[z:1])) return(prob.event) } get.signif.event=function(i,k,datmat,popmat,nnmat,totitem,totpop,Qxmat,resultmat,signiflevel,print.it=F){ #does the event-based test for cell i with cluster size k #datmat=matrix of cases, datmat[i,s]=cases in cell i and stratum s #popmat=matrix of population, popmat[i,s]=population in cell i and #stratum s #nnmat=matrix of nearest neighbours, nnmat[i,j]=(j-1)th nearest #neighbour to cell i, where nnmat[i,1]=i #totitem=vector of event totals per stratum, totitem[s]=number of event #in stratum s in the entire study region #totpop=vector of population totals per stratum, totpop[s]=population #in stratum s in the entire study region #Qxmat{x,s] gives Q=s(x) for event frequency x and stratum s #partExp=expected number of events for one case #resultmat=matrix of results from the testing #signiflevel=significance level to test each cell at #print.it=if T, prints out some intermediate calculations resultmat=test.stat(i,k,datmat,nnmat,resultmat) numstrata=ncol(popmat) #number of strata ncombined=popmat[nnmat[i,1:(resultmat[i,4]+1)],] if(is.matrix(ncombined)==T){ncombined=colSums(ncombined)} if(numstrata==1){ #cat("ncombined:\n");print(ncombined) ncombined=sum(ncombined) #cat("ncombined:\n");print(ncombined) #cat("sum(totitem)=",sum(totitem)," totpop=",totpop,"\n") lambda=ncombined*sum(totitem)/totpop #cat("lambda:\n");print(lambda) Qx=Qxmat #cat("expected Qx:",round(sum(diag(1:nrow(Qx))%*%Qx),3),"\n") #cat("expected:",round(lambda*sum(diag(1:nrow(Qx))%*%Qx),3) ,"\n") } else{ lambdas=ncombined*(totitem/totpop) lambda=sum(lambdas) lambdaratio=lambdas/lambda Qx=Qxmat%*%lambdaratio; } prob.event=get.event.prob(k-1,lambda,Qx) signif=1-sum(prob.event) #store the results expected=lambda*sum(diag(1:nrow(Qx))%*%Qx)#expected resultmat[i,6]=round(expected,3) resultmat[i,7]=round(resultmat[i,5]/expected,3) #observed over expected resultmat[i,8]=round(signif,3) #pvalue resultmat[i,9]=0 if(signif100){ try1=ceiling(1.1*lambda*sum(c(1:(length(Qx))*Qx))) } prob.event=get.event.prob(try1,lambda,Qx) signif=1-sum(prob.event) if(i==66){ cat("try1=",try1,"\n") cat("signif=",signif,"\n") } if(signifsigniflevel){ prob.event=get.event.prob.part(prob.event,z,lambda,Qx); sums=1-sum(prob.event) z=z+1 } kmat[i,w+1]=z } if(kmat[i,w+1]1){ cat("Cases by event frequency and strata:\n") print(colSums(eventmat)) cat("Population by strata:\n") print(totpop) } cat("###############################################################\n") } if(length(kmat)==I){#do not do the algorithm, use user supplied cluster sizes for(i in 1:I){ resultmat[i,1:3]=c(i,NA,kmat[i]) resultmat=get.signif.event(i,kmat[i],datmat,popmat,nnmat,totitem,totpop,Qxmat,resultmat,signiflevel,print.it) } } else{#do the testing algorithm maxw=ncol(kmat) for(i in 1:I){ resultmat[i,1]=i w=0;stop=FALSE while(w0){ #the total number of cases to randomize in stratum s with x events is totitem[x,s] myweight=rep(1:I,popmat[,s]) simvals=sample(myweight,totitem[x,s],replace=T) myfreq=table(simvals) simdatmat[as.numeric(attributes(myfreq)$dimnames[[1]]),x,s]=myfreq } } } return(simdatmat) } ################################################################ # main simulation function for the event-based test ################################################################ sim.event.calc=function(kmat,eventmat,popmat,nnmat,resultmat,numsim,signiflevel=0.05,print.it=F){ #this is the main function to run to obtain the simulations for the #event-based test, with or without the testing algorithm #kmat=vector of user supplied cluster sizes to test, has dimension #equal to the number of cells (no testing algorithm) # =matrix of user suppplied cluster sizes to test, has multiple #sizes per cell as columns (testing algorithm) #popmat=matrix of population, popmat[i,s]=population in cell i and #stratum s #eventmat=array of event frequencies, eventmat[i,j,s]=number of cases #in cell i and stratum s that have j events #resultmat=matrix of results output from event.test #nnmat=matrix of nearest neighbours, nnmat[i,j]=(j-1)th nearest #neighbour to cell i, where nnmat[i,1]=i #numsim=the number of simulations #signiflevel=significance level to test each cell at #print.it=if T, prints out some intermediate calculations numsig=sum(resultmat[,9]) #determine how many cells were significant mcvals=rep(0,3*numsig) #a vector to collect the number of significant cells per simulation for(j in 1:numsim){ #get the simulated event matrix simeventmat=get.sim.event(eventmat,popmat) simresultmat=event.test(kmat,simeventmat,popmat,nnmat,signiflevel,print.it) simnumsig=sum(simresultmat[,9]) mcvals[simnumsig+1]=mcvals[simnumsig+1]+1 } #determine number of sims that exceeded numsig significant cells revmcvals=mcvals[(3*numsig):1] mymat=cumsum(revmcvals) mymat=mymat[(3*numsig):1] exceeded=sum(mymat[(numsig+1+1):(length(mymat))]) mcpval=exceeded/numsim cat("\nSimulation results:\n") mcvals=as.matrix(mcvals) names(mcvals)=0:(length(mcvals)-1) cat("Frequency of the number of clusters identified:\n") print(t(mcvals)) cat("Number of sims=",numsim," Number of sims >",numsig,"=",exceeded," pvalue=",round(mcpval,3),"\n") return(mcpval) }