class: center, middle, inverse, title-slide # HW Keys ### Jinliang Yang ### April 21th, 2020 --- # Inbred lines The inbred lines are assumed to be a random sample of genotypes from the population. | Source | df | Observed MS | E(MS) | | :------: | :-------: | :--------------------:|:------: | | Environment | `\(e-1\)` | | | | Replications/E | `\((r-1)e\)` | | | | Inbred lines | `\(n-1\)` | `\(MS_{progeny}\)` | `\(V_e + rV_{PE} + reV_{progeny}\)` | | Inbreds x E | `\((n-1)(e-1)\)` | `\(MS_{PE}\)` | `\(V_e + rV_{PE}\)` | | pooled error | `\((n-1)(r-1)\)` | `\(MS_{error}\)` | `\(V_e\)` | - Inbred lines: `\(V_{progeny} = V_G\)` - `\(V_{progeny} = V_G = \frac{MS_{progeny} - MS_{PE}}{re}\)` - `\(V_{PE} = V_{GE} = \frac{MS_{PE} - MS_{Error}}{r}\)` -- __Broad-sense heritability__, on an entry-mean basis, was calculated from the equation: > Hallauer et al., Handbook of Plant Breeding `\begin{align*} H^2 = \frac{V_G}{V_G + V_{GE}/e + V_E/{re}} \end{align*}` --- # Flint-Garcia et al., 2009 data ```r f <- read.delim("https://jyanglab.com/agro932-lab/data/journal.pone.0007433.s001.txt", header=TRUE) # Convert missing data to NA f[f=="."] <- NA # four environments # table(f$Env) f$INBRED <- as.factor(f$INBRED) f$Env <- as.factor(f$Env) # tricky part, be careful: f$TotKnlWt_Inbred <- as.numeric(as.character((f$TotKnlWt_Inbred))) fit <- lm(TotKnlWt_Inbred ~ INBRED + Env, data=f) anova(fit) ``` ``` ## Analysis of Variance Table ## ## Response: TotKnlWt_Inbred ## Df Sum Sq Mean Sq F value Pr(>F) ## INBRED 303 182960 603.8 1.9637 2.214e-12 *** ## Env 3 18749 6249.7 20.3247 1.548e-12 *** ## Residuals 581 178653 307.5 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ``` --- # Flint-Garcia et al., 2009 data Without GxE: | Source | df | Observed MS | E(MS) | | :------: | :-------: | :--------------------:|:------: | | Environment | `\(e-1\)` | | | | Replications/E | `\((r-1)e\)` | | | | Inbred lines | `\(n-1\)` | `\(MS_{progeny}\)` | `\(V_e + reV_{progeny}\)` | | pooled error | `\((n-1)(r-1)\)` | `\(MS_{error}\)` | `\(V_e\)` | - Inbred lines: `\(V_{progeny} = V_G = V_A\)` - `\(V_{progeny} = V_G = \frac{MS_{progeny} - MS_{Error}}{re}\)` - `\(V_E = V_{e} = MS_{Error}\)` `\begin{align*} H^2 = \frac{V_G}{V_G + V_E/{re}} \end{align*}` ```r a <- anova(fit) vg <- (a[1,3] - a[3,3])/4 ve <- a[3,3] h2 <- vg/(vg + ve/4) ``` --- # Heritability function ```r geth2 <- function(df, trait="TotKnlWt_Inbred"){ f <- df # Convert missing data to NA f[f=="."] <- NA f$INBRED <- as.factor(f$INBRED) f$Env <- as.factor(f$Env) # tricky part, be careful: f[, trait] <- as.numeric(as.character((f[, trait]))) fit <- lm(f[, trait] ~ INBRED + Env, data=f) a <- anova(fit) vg <- (a[1,3] - a[3,3])/4 ve <- a[3,3] h2 <- vg/(vg + ve/4) return(h2) } ``` -- ```r res1 <- geth2(df=f, trait="CobDia_Hyb") # low BPH 0.05 res2 <- geth2(df=f, trait="CobWt_Hyb") # intermediate 0.66 res3 <- geth2(df=f, trait="PltYield_Hyb") # high BPH 1.85 ``` --- # Test the hypothesis ```r df <- data.frame(Trait =c("CobDia_Hyb", "CobWt_Hyb", "PltYield_Hyb"), H2 = c(res1, res2, res3), BPH = c(0.05, 0.66, 1.85)) cor.test(df$H2, df$BPH) ``` --- # Visualize the results ```r library(ggplot2) ggplot(df, aes(x=H2, y=BPH)) + geom_point() + geom_smooth(method=lm) + theme_classic() ``` --- # K-fold validation <div align="center"> <img src="kfold.png" height=300> </div> In each round ( `\(n\)` round in total), we split the dataset into `\(k\)` parts: - one part is used for validation, - and the remaining `\(k\)`−1 parts are merged into a training subset for model evaluation. --- # K-fold validation Search: "r code k fold cross validation" - [StackExchange](https://stats.stackexchange.com/questions/61090/how-to-split-a-data-set-to-do-10-fold-cross-validation) -- ```r kfold <- function(num=101, k=10){ # num: num of samples # k: number of fold #Create 10 equally size folds folds <- cut(1:num, breaks=k, labels=FALSE) # Randomly shuffle the index of the data idx <- sample(num, replace=FALSE) out <- data.frame(idx=idx, folds=folds) return(out) } kfold(num=101, k=10) ``` -- ### Presentation Order ```r set.seed(6283) a <- kfold(num=8, k=2) id <- read.delim("data/idx.tsv", header=TRUE) out <- merge(a, id, by="idx") ``` --- # K-fold validation ```r # prediction function using GBLUP and RRBLUP approaches pred <- function(y, Zs, trn, tst, lambda_g=4, lambda_rr=12566){ # y: a vector of the phenotype # Zs: a matrix of the standardized genoytpe # trn: idx for training individuals # tst: idx for testing individuals # lambda_g and lamda_rr, lambda for GBLUP and RR-BLUP <- 4 # fit$Ve / fit$Vu y.trn <- y[trn] y.tst <- y[tst] Zs.trn <- Zs[trn,] Zs.tst <- Zs[tst,] n.trn <- length(y.trn) Gtrn <- tcrossprod(Zs.trn) / ncol(Zs.trn) Gtrn <- Gtrn + diag(n.trn)*0.001 Gtst.trn <- tcrossprod(Zs.tst, Zs.trn) / ncol(Zs.tst) Ginv.trn <- solve(Gtrn) ones <- matrix(1, ncol=1, nrow=n.trn) Z <- diag(n.trn) LHS1 <- cbind(crossprod(ones), crossprod(ones, Z)) LHS2 <- cbind(crossprod(Z, ones), crossprod(Z) + Ginv.trn*lambda_g) LHS <- rbind(LHS1, LHS2) RHS <- rbind( crossprod(ones, y.trn), crossprod(Z,y.trn) ) sol.trn <- solve(LHS, RHS) # prediction y.hat <- Gtst.trn %*% Ginv.trn %*% matrix(sol.trn[c(2:(n.trn+1))]) out1 <- cor(y.hat, y.tst) ones <- matrix(1, ncol=1, nrow=n.trn) I <- diag(ncol(Zs)) LHS1 <- cbind(crossprod(ones), crossprod(ones, Zs.trn)) LHS2 <- cbind(crossprod(Zs.trn, ones), crossprod(Zs.trn) + I*lambda_rr) LHS <- rbind(LHS1, LHS2) RHS <- rbind( crossprod(ones, y.trn), crossprod(Zs.trn, y.trn) ) sol.trn <- solve(LHS, RHS) # prediction y.hat2 <- Zs.tst %*% matrix(sol.trn[-1]) out2 <- cor(y.hat2, y.tst) return(data.frame(gblup=out1, rrblup=out2)) } ``` --- # K-fold validation #### One round of k-fold CV ```r cv <- function(y, Zs, df, k=10){ # pred(y, Zs, trn=df[df$folds !=1, ]$idx, tst=df[df$folds ==1, ]$idx, lambda=4) out1 <- data.frame() for(i in 1:k){ tem <- pred(y, Zs, trn=df[df$folds !=i, ]$idx, tst=df[df$folds ==i, ]$idx, lambda_g=4, lambda_rr=12566) out1 <- rbind(out1, tem) } out2 <- data.frame(gblup=mean(out1$gblup), rrblup=mean(out1$rrblup)) return(out2) } ``` -- #### Run it multiple rounds ```r run_cv <- function(y, Zs, n=20, k=10, random.seed=1234){ res <- data.frame() for(j in 1:n){ set.seed(random.seed+j) message(sprintf("###>>> running the [ %s ]th shuffling for [ %s fold ] cross-validation ...", j, k)) # apply the kfold function to divide data into k-fold df <- kfold(num=length(y), k) tem <- cv(y, Zs, df, k) res <- rbind(res, tem) } return(res) } ``` --- # Let's do a test run ```r # read phenotype and SNP files pheno_file <- "https://jyanglab.com/agro932-lab/data/Loblolly_Pine/Phenotypic_Data/DATA_rootnum_age10_rootnum.csv" pheno <- read.csv(pheno_file, header=TRUE, stringsAsFactors = FALSE) na.index <- which(is.na(pheno$Derregressed_BV)) pheno <- pheno[-na.index, ] y <- pheno$Derregressed_BV y <- matrix(y, ncol=1) Z <- read.table("https://jyanglab.com/agro932-lab/cache/Z.txt") Zs <- scale(Z, center = TRUE, scale = TRUE) ### the main code: # out <- run_cv(y, Zs, n=2, k=5) out <- run_cv(y, Zs, n=2, k=5, random.seed=123467) # write.table(out, "cache/gblup_rrblup_comp.csv", sep=",", row.names = FALSE, quote=FALSE) ```