Git version control

Here is a demo project.

git clone git@github.com:jyanglab/agro931-project.git

Fork the project

Fork the demo project on github to your own repo and git clone it to your local computer

Create or edit files

Folders in a project serve for their own purposes. Read here to learn more about them.

Commit your changes

Note commit with informative message.

git add --all
git commit -m "to explore litature in qtl from 1960-2018"

push to the branch

git push

Search pubmed using key words

### source https://freshbiostats.wordpress.com/2013/12/03/analysis-of-pubmed-search-results-using-r/
library(RCurl)
## Warning: package 'RCurl' was built under R version 3.4.4
library(RISmed)
query = "QTL"

res <- EUtilsSummary(query, type="esearch", db = "pubmed", mindate=1960, maxdate=2018, retmax=30000)
QueryCount(res)
## [1] 12555

Pubs by Years

qtlpub <- EUtilsGet(res)
years <- YearPubmed(qtlpub)
qc <- as.data.frame(table(years))
write.table(qc, "qtl_pub_count.csv", sep=",", row.names=FALSE, quote=FALSE)
save(file="qtlpub", list="qtlpub")

Plot the number of pubs by year

load("qtlpub")
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.2
## Warning: package 'ggplot2' was built under R version 3.4.4
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
## Warning: package 'forcats' was built under R version 3.4.3
qc <- read.csv("qtl_pub_count.csv")
theme_set(theme_grey(base_size = 18)) 
s <- ggplot(qc, aes(x=years, y=Freq)) + 
    #opts(axis.text.x=theme_text(angle=90)) +
    geom_bar(stat="identity") +
    labs(x="", y="# of pubs") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, size=12))
s

What are the journals?

### get the count of NGS pub per journal:
journal <- MedlineTA(qtlpub)
qtl_journal_count <- as.data.frame(table(journal))
qtl_journal_count_top25 <- qtl_journal_count[order(-qtl_journal_count[,2]),][1:25,]

as_tibble(qtl_journal_count_top25)
## # A tibble: 25 x 2
##    journal           Freq
##  * <fct>            <int>
##  1 Theor Appl Genet  1614
##  2 Genetics           779
##  3 PLoS One           636
##  4 BMC Genomics       391
##  5 Anim Genet         354
##  6 Front Plant Sci    346
##  7 Mamm Genome        294
##  8 BMC Genet          228
##  9 G3 (Bethesda)      220
## 10 Genet Sel Evol     214
## # ... with 15 more rows

Top authors

### Get authors
auths <- Author(qtlpub)
Last <- sapply(auths, function(x) paste(x$LastName, x$ForeName))
auths2 <- as.data.frame(sort(table(unlist(Last)), dec=TRUE))
auths2$author <- row.names(auths2)
names(auths2)[1] <- c("num")

df <- as_tibble(auths2)
head(df, 30)
## # A tibble: 30 x 3
##    num                 Freq author
##    <fct>              <int> <chr> 
##  1 Wu Rongling           93 1     
##  2 Churchill Gary A      70 2     
##  3 Williams Robert W     64 3     
##  4 Haley C S             61 4     
##  5 Paigen Beverly        57 5     
##  6 Lu Lu                 54 6     
##  7 Xu Shizhong           53 7     
##  8 Yano Masahiro         51 8     
##  9 Blangero John         49 9     
## 10 Visser Richard G F    46 10    
## # ... with 20 more rows

Top Affiliations

### Get affiliations
aff <- Affiliation(qtlpub)
#Last <- sapply(auths, function(x) paste(x$LastName, x$ForeName))
aff2 <- as.data.frame(sort(table(unlist(aff)), dec=TRUE))
aff2$id <- row.names(aff2)
names(aff2)[1] <- c("number")

df2 <- as_tibble(aff2)
head(df2, 30)
## # A tibble: 30 x 3
##    number                                                       Freq id   
##    <fct>                                                       <int> <chr>
##  1 National Key Laboratory of Crop Genetic Improvement, Huazh…    71 1    
##  2 National Key Laboratory of Crop Genetic Improvement, Huazh…    68 2    
##  3 National Key Laboratory of Crop Genetic Improvement, Huazh…    62 3    
##  4 National Institute of Plant Genome Research (NIPGR), Aruna…    56 4    
##  5 Helmholtz Center Munich, Plant Genome and Systems Biology …    43 5    
##  6 International Crops Research Institute for the Semi-Arid T…    41 6    
##  7 State Key Laboratory of Rice Biology, China National Rice …    37 7    
##  8 The Fish Molecular Genetics and Biotechnology Laboratory, …    37 8    
##  9 Guangdong Key Laboratory of New Technology in Rice Breedin…    33 9    
## 10 Institute of Crop Science, Chinese Academy of Agricultural…    33 10   
## # ... with 20 more rows

Simulate a QTL experiment

library(qtl)
## Warning: package 'qtl' was built under R version 3.4.3
set.seed(12347)
# Fiveautosomes of length 50, 75, 100, 125, 60 cM
L <- c(50, 75, 100, 125, 60)
map <- sim.map(L, L/5+1, eq.spacing=FALSE, include.x=FALSE)
# Simulate a backcross with two QTL
a <- 0.7
mymodel <- rbind(c(1, 40, a),
                 c(4, 100, a))
pop <- sim.cross(map, type="bc", n.ind=200, model=mymodel)
plot.map(pop)

Check phenotypic distribution

hist(pop$pheno$phenotype, main="simulated phenotype", 
     breaks=50, xlab="Pheno", col="#cdb79e")

Conduct QTL Mapping using marker regression approach

# single-QTL scan by marker regression with the simulated data
out.mr <- scanone(pop, method="mr")
# plot of marker regression results for chr 4 and 12
plot(out.mr, chr=c(1,2,3,4,5), ylab="LOD score")

Interval Mapping

Conduct interval mapping using the same dataset.

Haley-Knott Regression

In practice, this is a version of interval mapping which a (very good) approximation to interval mapping via maximum likelihood.

n.perm: If specified, a permutation test is performed rather than an analysis of the observed data. This argument defines the number of permutation replicates.

#plot(out.mr, chr=c(1,2,3,4,5), ylab="LOD score", main="marker regression")

out.hk <- scanone(pop, method="hk")
perm <- scanone(pop, method="hk", n.perm=1000)
## Doing permutation in batch mode ...
summary(perm)
## LOD thresholds (1000 permutations)
##      lod
## 5%  2.30
## 10% 1.94
plot(out.hk, col="blue", chr=c(1,2,3,4,5), ylab="LOD score", main="interval mapping")
abline(h=summary(perm)[2], col="red", lty=2)