# AUTHORS: Mickael Guedj (mickael.guedj@gmail.com), Jerome Wojcik (jerome.wojcik@merckserono.net) and Gregory Nuel (gregory.nuel@genopole.cnrs.fr). 2007 #! modifications at 7/05/08 # bug in p-value computation (x.sim = x.sim - delta) corrected # aov.test added # PARAMETERS : # dv = default value # 'x', a sequence of random variables in which we want to search for local scores. # 'geno', and 'pheno', input required if instead of 'x' you want to use association data. Genotypes (for instance O for aa, 1 for aA, 2 for AA and NA for missing value) are stored into 'geno' with one column per marker. Phenotypes such as case-control status (e.g. 0 for control and 1 for cases) or familial information are stored into 'pheno'. Each row in 'pheno' and 'geno' corresponds to one subject. # 'info', additional information about the markers e.g. position, chromosome... (optional, NULL dv). # 'association.test', stands for the association test to apply on 'geno' and 'pheno'. Either a R function (may be defined by the user itself / must have two arguments, the first being one columns of genotypes for a given marker, the second being 'pheno') or a string that corresponds to one of the tests proposed in lhisa (fuea.test dv, fisher.test ...). # 'p.value', if FALSE (dv): do not associate any p.value to the local scores. If TRUE : compute p.values with Monte-Carlo simulations. # 'selection', the method used to select a subset of local scores (optional). If 'none'(dv): no selection method is applied; if 'sequential': local scores are added in the selection from the best local score until they are no more significant at the given level (see the 'level' parameter); if 'distribution': the selection is based on the distribution of local scores under the null hypothesis that there is no markers associated to the phenotype; if 'pmin': the selection is based on the method initially proposed in the original publication by Guedj et al (2006 in Stat. Appl. Genet. Mol. Bio.). # 'B', the number of simulations performed in Monte-Carlo simulations. # 'delta', the level of significance for single-marker statistics used in the computation of local scores that defines the threshold upon which markers contribute positively to the local score (0.05 dv). # the level of significance used in selection methods (0.05 dv). # 'coding', specify the codes used for the genotypes (0, 1 and 2 dv) library("allelic") lhisa = function(x = NULL, pheno = NULL, geno = NULL, info = NULL, association.test = c("fuea.test", "fisher.test", "chisq.test", "aov.test"), pop = NULL, p.value = FALSE, coding = c(0,1,2), selection = c("none", "sequencial", "distribution", "pmin"), B = 1000, delta = 0.05, level = 0.05, ...) { # Implementation of local score functions fast.scolo = function(x) { ell = length(x) List = numeric(0) k = 0 L = 0 R = 0 for (iii in 1:ell) { L = R R = R+x[iii] if (x[iii] >= 0) { List = rbind(List,c(iii,iii,L,R)) k = dim(List)[1] cont = 1 while(cont == 1) { toto = sub.fast.scolo(List) cont = toto$cont List = toto$List } } } k = length(List[,1]) res = matrix(0,k,3) for (iii in 1:k) res[iii,] = c(List[iii,1], List[iii,2], List[iii,4] - List[iii,3]); rr = order(res[,3], decreasing = T) start = res[rr,1] end = res[rr,2] H = res[rr,3] T = cumsum(H) return(data.frame(start, end, H, T)) } sub.fast.scolo = function(List) { cont = 0 k = 0 if (is(List)[1] == "matrix") k = length(List[,1]); if (is(List)[1] == "numeric" & length(List) > 0) { k = 1 List = t(as.matrix(List)) } if (k >= 1) { index = 1:k aa = which(List[,3] < List[k,3]) if (length(aa) > 0) { j = max(index[aa]) if (List[j,4] < List[k,4]) { cont = 1 List[j,c(2,4)] = List[k,c(2,4)] List = List[1:j,] } } } res = new.env() res$cont = cont res$List = List return(res) } # Implementation of simulation functions x.simulate.ind = function(my.x, my.geno, my.pheno, my.coding, my.test, my.pop, my.sat) { return(sample(my.x)) } x.simulate.ld = function(my.x, my.geno, my.pheno, my.coding, my.test, my.pop, my.sat) { perm = 1:dim(my.pheno)[1] sim.pheno = as.data.frame(pheno[sample(perm),]) return(-log10(apply(geno, 2, FUN = my.test, my.pheno = sim.pheno, my.coding = my.coding, other.geno = my.geno))) } # Implementation of association-test functions fuea..test = function(my.geno, my.pheno, my.coding, other.geno, my.pop, my.sat) { d0 = length(which(my.pheno[,1] == 1 & my.geno == my.coding[1])) d1 = length(which(my.pheno[,1] == 1 & my.geno == my.coding[2])) d2 = length(which(my.pheno[,1] == 1 & my.geno == my.coding[3])) h0 = length(which(my.pheno[,1] == 0 & my.geno == my.coding[1])) h1 = length(which(my.pheno[,1] == 0 & my.geno == my.coding[2])) h2 = length(which(my.pheno[,1] == 0 & my.geno == my.coding[3])) return(allelic.exact.test(d0, d1, d2, h0, h1,h2)) } fisher..test = function(my.geno, my.pheno, other.geno, my.coding, my.pop, my.sat) { return(fisher.test(my.pheno[,1], my.geno)$p.value) } chisq..test = function(my.geno, my.pheno, other.geno,my.coding, my.pop, my.sat) { #print(table(my.pheno, my.geno)) return(chisq.test(my.pheno[,1], my.geno)$p.value) } aov..test = function(my.geno, my.pheno, other.geno,my.coding, my.pop, my.sat){ res = summary(aov(my.pheno[,1]~my.geno))[[1]][1,5] if (!is.na(res)) return(res) if (is.na(res)) return(1) } lr..test = function(my.geno, my.pheno, other.geno, my.coding, my.pop, my.sat){ pv.pop = NULL for (iii in 1:length(levels(my.pop))){ aa = which(as.character(my.pop) == levels(my.pop)[iii]) if (dim(table(my.pheno[aa,1],my.geno[aa]))[1] > 1 & dim(table(my.pheno[aa,1],my.geno[aa])[2] > 1)) pv.pop[iii] = my.sat(my.geno[aa], my.pheno[aa,], other.geno, my.coding, my.pop, my.sat) else pv.pop[iii] = 1; } return(sum(pv.pop)) } # Parameters management selection = match.arg(selection) if (length(list(...)) > 0) warning("non-matched further arguments are disregarded"); if (is.null(x) & is.null(pheno) & is.null(geno)) stop("you need to specify one of the following parameters : 'x' or 'geno and pheno'"); if (!is.null(x) & !is.null(pheno) & !is.null(geno)) warning("'geno' and 'pheno' are not take into account in the presence of the parameter 'x'"); if (is.null(x) & is.null(geno) & !is.null(pheno)) stop("you must specify 'geno' AND 'pheno'"); if (is.null(x) & !is.null(geno) & is.null(pheno)) stop("you must specify 'geno' AND 'pheno'"); if (!is.function(association.test) & !is.character(association.test)) stop("'test' must be a function or a string"); if (!is.null(pheno) & !is.null(geno)) { if (!is.data.frame(geno) | !is.data.frame(pheno)) stop("'geno' and 'pheno' sould be data frames"); if (dim(geno)[1] != dim(pheno)[1]) stop("'geno' and 'pheno' must concern the same number of subjects"); } if (!is.null(info)) { if (!is.data.frame(info)) stop("info should be a data frame"); if (!is.null(geno) & dim(geno)[2] != dim(info)[1]) stop("'info' and 'geno' should concern the same number of markers"); if (!is.null(x) & length(x) != dim(info)[1]) stop("'info' and 'x' should concern the same number of markers"); } if (!is.function(association.test)) { association.test = match.arg(association.test) if (association.test == "fuea.test") { association.test = fuea..test if (length(coding)!=3) stop("'coding' must be of size 3 if you use 'fuea.test', coding[1] and coding [3] standing for the homozygous genotypes and coding[2] standing for the heterozygous one") if (length(intersect(coding, c(0,1,2))) != 3) warning(paste("since you don't use the 'coding' by default, please check that ", coding[1], " and ", coding[3], " stands for the homozygous genotypes and ", coding[2], " stands fot the heterozygous one")) } else if (association.test == "chisq.test") association.test = chisq..test else if (association.test == "fisher.test") association.test = fisher..test else if (association.test == "aov.test") association.test = aov..test } sat = NULL if (!is.null(pop)){ sat = association.test association.test = lr..test if (is.null(pop)) stop ("'pop' must be specified in the context of local replication") if (length(pop) != dim(pheno)[1]) stop("'pop' must contain the same number of subjects than 'pheno' and 'geno'"); if (!is.factor(pop)){ warning("'pop' must contain factors and its levels are hence transformed into factors") pop = as.factor(pop) if (length(levels(pop)) < 2) stop("'pop' must contain more than one population in order to detect local replications") } } if (!is.logical(p.value)) stop("'p.value' must be FALSE OF TRUE"); if (B < 2) stop("B < 2, however Monte-Carlo simulatins must be based on more then 2 simulations"); if (delta < 0 | delta > 1) stop("'delta' must be within the interval 0..1") if (level < 0 | level > 1) stop("'delta' must be within the interval 0..1") if (selection != "none") p.value = TRUE; if (!is.null(x)) x.simulate = x.simulate.ind else x.simulate = x.simulate.ld; if (!is.null(geno)) {if(length(setdiff(c(as.matrix(geno), NA),c(coding, NA))) != 0 | length(union(c(as.matrix(geno), NA),c(coding, NA)))-1 != (length(coding))) warning("some codes in 'geno' are unspecified in 'coding'"); } # STEP 1 : we generate 'x' and 'x.obs = x - delta' pv = NULL if (!is.null(geno)) { pv = apply(geno, 2, FUN = association.test, my.pheno = pheno, my.coding = coding, my.pop = pop, my.sat = sat) x = -log10(pv) } delta.stat = as.numeric(quantile(x, 1-delta, na.rm = TRUE), na.rm = T) x.obs = x - delta.stat # STEP 2 : we calculate 'scolo.obs' and corresponding parameters: 'n.obs' scolo.obs = fast.scolo(x.obs) nb.obs = dim(scolo.obs)[1] # STEP 3 : p-value computation if (!p.value) p.value = rep(NA, dim(scolo.obs)[1]) else { p.value = rep(0, nb.obs) nb.B = rep(0,nb.obs) for (iii in 1:B) { x.sim = x.simulate(my.x = x.obs, my.pheno = pheno, my.geno = geno, my.test = association.test, my.coding = coding, my.pop = pop, my.sat = sat)-delta.stat if (max(x.sim) > 0){ scolo.sim = fast.scolo(x.sim) nb.sim = min(nb.obs, dim(scolo.sim)[1]) nb.B[1:nb.sim] = nb.B[1:nb.sim] + 1 for (jjj in 1:nb.sim) { if (scolo.sim[jjj,3] >= scolo.obs[jjj,3]) p.value[jjj] = p.value[jjj] + 1; } } } p.value = p.value/nb.B } # STEP 4 : selection # STEP 5 : mise en forme des résultats result = list() result$pv = as.numeric(pv) result$raw.x = as.numeric(x) result$x = as.numeric(x.obs) result$losc = as.data.frame(cbind(scolo.obs, p.value)) return(result) }