# code for # M LeBlanc, C Kooperberg, T Grogan, T Miller (2003) # Directed indices for exploring gene expression data. # Bioinformatics, 19 , 686-693 # Copyright (C) 2002--2003 Michael LeBlanc # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # The text of the GNU General Public License, version 2, is available # as http://www.gnu.org/copyleft or by writing to the Free Software # Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # # Michael LeBlanc, 04/11/2003, mleblanc@fhcrc.org # # GIN FUNCTION genorder<-function(x0,coef0,class0=1,x,xadjust0=x0,xadjust=x, y, class=rep(class0,length(y)),lambda.cov=1, lambda.eff=5,lambda.class=5,coeflim=2.5,surv=F,zcurrent=NULL,numfeat=100, times=y,status=rep(1,length(y)),std=F,bundle=T,adaptbundle=F,plt=T ) { #------------------------------------------------------------ # Main Gene Index (GIN) function # Generate feature variables based distances from feature of interest # # x0 = reference gene expression (vector) # coef0 = target coefficient (single) # class0 = target class (single) # x = expressions (matrix n (observations) by p (genes) # xadjust = adjusted expressions of dimension above (if adjusted correlations are desired) # xadjust0 = adjusted expression of reference gene (if adjusted correlations are desired) # y = outcome (vector) # class = class of genes (vector) # lambda.cov = weight for the gene to gene association (nu_e in paper) # lambda.eff = weight for the gene to outcome association (nu_o in paper) # lambda.class = weight for class component (nu_c in paper) # coeflim = maximum magnitude for coefficient to be used in outcome gene association calculation # surv = survival analysis (F/T) # zcurrent = adjustment variable for regressions # numfeat = number of features to be used in bundling predictors # times = survival times for survival data # status = status indicators (0,1) for survival data # bundle = construct bundled predictors # plt = plot marginal coefficients by gene index (with class group indicated) #-------------------------------------------------------------------------------------- if (std) x_standx(x) p<-ncol(x) n<-nrow(x) coef<-rep(NA,p) # FOR SURVIVAL ANALYSIS if (surv==T) { residreg<-coxph(Surv(times,status)~runif(length(times)),iter.max=0) resid<-residreg$residual wt<-(status-resid)+.01 y<-resid/wt if (is.null(zcurrent)) { bbsimp<-qreg(x,y,wt,var.calc=F) coef1<-bbsimp$betah sterr<-sqrt(bbsimp$varv) tstat<-bbsimp$tstat } else { bbsimp<-qreg(cbind(zcurrent,x),y,wt,var.calc=F) coef1<-bbsimp$betah[-1] sterr<-sqrt(bbsimp$varv)[-1] tstat<-bbsimp$tstat } ind<-abs(coef1)>coeflim*sterr coef<-(!ind)*coef1+ind*sign(coef1)*coeflim*sterr } # FOR UNCENSORED DATA else { if (is.null(zcurrent)) { bbsimp<-qreg(x,y) coef1<-bbsimp$betah sterr<-sqrt(bbsimp$varv) tstat<-bbsimp$tstat } else { bbsimp<-qreg(cbind(zcurrent,x),y,wt) coef1<-bbsimp$betah[-1] sterr<-sqrt(bbsimp$varv)[-1] tstat<-bbsimp$tstat } ind<-abs(coef1)>coeflim*sterr coef<-(!ind)*coef1+ind*sign(coef1)*coeflim*sterr } # Calculate components of GIN margcoef<-coef effdist<-(coef-coef0)**2 effdist<-effdist/sqrt(var(effdist)) covdist<-as.vector((1-cor(xadjust0,xadjust))**2) rhosum<-cor(xadjust0,xadjust) covdist<-covdist/sqrt(var(covdist)) classdist<-(class-class0)**2 classdist<-classdist # Calculate GIN statistic jointdist<-lambda.cov*covdist+lambda.eff*effdist+lambda.class*classdist parmsum<-lambda.cov+lambda.eff+lambda.class varfrac.cov<-lambda.cov/parmsum varfrac.eff<-lambda.eff/parmsum varfrac.class<-lambda.class/parmsum # Summary of the % variation in each component print("lambda.cov and varfrac.cov") print(c(lambda.cov,varfrac.cov)) print("lambda.eff and varfrac.eff") print(c(lambda.eff,varfrac.eff)) print("lambda.class and varfrac.class") print(c(lambda.class,varfrac.class)) featord<-order(jointdist) # Construct the list of predictors # here need to say how many predictors one wants. # construct feature label list indlist<-(1:p)[featord] # here we bundle the variables if it is requested bundlist<-bestvec<-featscore<-zmat<-NULL coefmean<-coef0 xadjustmean<-xadjust0 if (bundle) { bundlist<-indlist[1:numfeat] # construct the bundled variables nrow<-dim(x)[1] zmat<-matrix(NA,nrow,numfeat) for (j in 1:numfeat) { zmat[,j]<-apply(as.matrix(cbind(x0,x[,bundlist[1:j]])),1,"mean") } zmat<-standx(zmat) # need to perform all regressions bestvec<-featscore<-rep(NA,numfeat) jj<-0 for (ind in 1:numfeat) { jj<-jj+1 if (surv==F) { if (is.null(zcurrent)) { aa<-lm(y~zmat[,ind]); coef<-aa$coef[2] } else { aa<-lm(y~cbind(zcurrent,zmat[,jj])); coef<-aa$coef[3] } } if (surv==T) { if (is.null(zcurrent)) { aa<-coxph(Surv(times,status)~zmat[,jj]); coef<-aa$coef } else { aa<-coxph(Surv(times,status)~cbind(zcurrent,zmat[,j])) ; coef<-aa$coef[2] } } bestvec[jj]<-coef if (surv==F) featscore[jj]<-sqrt(summary(aa)$fstatistic[1])*sign(coef) else featscore[jj]<-sqrt(aa$score)*sign(coef) } } if (plt) { plot(1:numfeat,y=margcoef[featord[1:numfeat]],pch=1,ylab="marginal coeficient",xlab="index") points(x=(1:numfeat)[class[featord[1:numfeat]]==class0],y=(margcoef[featord[1:numfeat]])[class[featord[1:numfeat]]==class0],pch=16) } # OUTPUT # For individual mariginal effects #--------------------------------- # featlist = variable labels in order of GIN # margcoef = marginal regression coeficients in order of GIN # margtstat = marginal test statistics in order of GIN # jointdist = GIN measure # rhosum = component gene/reference gene correlations # For bundled variables #---------------------------------- # bundlist = variables as entered into bundle # feateff = coefficient for the bundled variable # featscore = test statistic for the bundled variable # featmat = bundled variable #---------------------------------- return(list(featlist=featord,margcoef=margcoef,margtstat=tstat,jointdist=jointdist,rhosum=rhosum, bundlist=bundlist,feateff=bestvec,featscore=featscore,featmat=zmat)) } #-------------------------------------------------------- # Standardize expression variables standx_function(x) { p_dim(x)[2] for (i in 1:p) { x[,i]_(x[,i]-mean(x[,i]))/sqrt(var(x[,i])) } return(x) } #---------------------------------------------------------- # fast marginal regressions qreg<-function(x,y,wt=rep(1,length(y)),times=y,var.calc=T ) { nn<-apply(!is.na(x),2,"sum") yy<-y-mean(wt*y)/mean(wt) xx<-t(t(x)-apply(wt*x,2,"mean",na.rm=T)/apply(wt*(!is.na(x)),2,"mean",na.rm=T)) bbxy<-apply(xx*yy*wt,2,"sum",na.rm=T) bbxx<-abs(apply(xx*xx*wt,2,"sum",na.rm=T)) betah<-bbxy/bbxx if (var.calc) sig2<-apply(wt*(yy-t(t(xx)*betah))**2/(nn-2),2,"sum",na.rm=T) else sig2<-1 varv<-sig2/bbxx tstat<-bbxy/sqrt(bbxx*sig2) return(betah,varv,tstat) } #----------------------------------------------------- # EXAMPLE p<-200 # 200 genes n<-100 # 100 observations x<-matrix(rnorm(n*p),ncol=p) eta<-as.vector(x%*%c(rep(1,5),rep(0,p-5))) # signal function of 5 genes y<-eta+rnorm(n) x0<-(x[,60]+x[,70]+x[,80])/3 # reference gene just mean of 3 other genes class<-c(rep(1,40),rep(0,p-40)) # first 40 genes in the same class # as reference gene class0<-1 # reference gene class # Uncensored Data output<-genorder(x0=x0,coef0=5,x=x,y=y,class0=1,class=class, lambda.cov=1,lambda.eff=1,lambda.class=1, coeflim=5,surv=F,zcurrent=NULL,numfeat=25,bundle=T) # Censored Data lambda_exp(eta-mean(eta)) # hazard rates tt_-log(runif(n))/lambda # exponential times cc_runif(n)*4 # censoring times status<-1*(tt