In local computer, using R
:
s <- scan('../../cache/out.sfs')s <- s[-c(1,length(s))]s <- s/sum(s)barplot(s,names=1:length(s), main='SFS')
hist(t$Pairwise)
fst <- read.table("cache/fst_win.txt", skip=1, header=FALSE)names(fst)[c(3,5)] <- c("midp", "fst")plot(fst$midp, fst$fst, xlab="Physical position", ylab="Fst", col="#5f9ea0", pch=16)
Maize reference genome
change to largedata\lab4
folder:
wget ftp://ftp.ensemblgenomes.org/pub/plants/release-46/fasta/zea_mays/dna/Zea_mays.B73_RefGen_v4.dna.chromosome.Mt.fa.gz### then unzip itgunzip Zea_mays.B73_RefGen_v4.dna.chromosome.Mt.fa.gz
Similarly, we will download and unzip the Mt GFF3 file
Maize reference genome
change to largedata\lab4
folder:
wget ftp://ftp.ensemblgenomes.org/pub/plants/release-46/fasta/zea_mays/dna/Zea_mays.B73_RefGen_v4.dna.chromosome.Mt.fa.gz### then unzip itgunzip Zea_mays.B73_RefGen_v4.dna.chromosome.Mt.fa.gz
Similarly, we will download and unzip the Mt GFF3 file
# install.package("data.table")library("data.table")## simply read in wouldn't workgff <- fread("largedata/lab4/Zea_mays.B73_RefGen_v4.46.chromosome.Mt.gff3", skip="#", header=FALSE, data.table=FALSE)## grep -v means select lines that not matching any of the specified patternsgff <- fread(cmd='grep -v "#" largedata/lab4/Zea_mays.B73_RefGen_v4.46.chromosome.Mt.gff3', header=FALSE, data.table=FALSE)
V1 V2 V3 V4 V5 V6 V7 V81 Mt Gramene chromosome 1 569630 . . .2 Mt ensembl gene 6391 6738 . + .3 Mt NCBI mRNA 6391 6738 . + .4 Mt NCBI exon 6391 6738 . + .5 Mt NCBI CDS 6391 6738 . + 06 Mt ensembl gene 6951 8285 . + . V91 ID=chromosome:Mt;Alias=AY506529.1,NC_007982.1;Is_circular=true2 ID=gene:ZeamMp002;biotype=protein_coding;description=orf115-a1;3 ID=transcript:ZeamMp002;Parent=gene:ZeamMp002;4 Parent=transcript:ZeamMp002;Name=ZeamMp002.exon1;constitutive=1;ensembl_end_phase=0;5 ID=CDS:ZeamMp002;Parent=transcript:ZeamMp002;6 ID=gene:ZeamMp003;biotype=protein_coding;description=orf444
V1 V2 V3 V4 V5 V6 V7 V81 Mt Gramene chromosome 1 569630 . . .2 Mt ensembl gene 6391 6738 . + . V91 ID=chromosome:Mt;Alias=AY506529.1,NC_007982.1;Is_circular=true2 ID=gene:ZeamMp002;biotype=protein_coding;description=orf115-a1;
1 sequence: The name of the sequence where the feature is located.
2 source: Keyword identifying the source of the feature, like a program (e.g. RepeatMasker) or an organization (like ensembl).
3 feature: The feature type name, like "gene" or "exon".
4 start: Genomic start of the feature, with a 1-base offset.
V1 V2 V3 V4 V5 V6 V7 V81 Mt Gramene chromosome 1 569630 . . .2 Mt ensembl gene 6391 6738 . + . V91 ID=chromosome:Mt;Alias=AY506529.1,NC_007982.1;Is_circular=true2 ID=gene:ZeamMp002;biotype=protein_coding;description=orf115-a1;
5 end: Genomic end of the feature, with a 1-base offset.
6 score: Numeric value that generally indicates the confidence of the source in the annotated feature.
7 strand: Single character that indicates the strand of the feature.
V1 V2 V3 V4 V5 V6 V7 V81 Mt Gramene chromosome 1 569630 . . .2 Mt ensembl gene 6391 6738 . + . V91 ID=chromosome:Mt;Alias=AY506529.1,NC_007982.1;Is_circular=true2 ID=gene:ZeamMp002;biotype=protein_coding;description=orf115-a1;
8 phase: phase of CDS (means CoDing Sequence) features.
9 attributes: All the other information pertaining to this feature.
names(gff) <- c("seq", "source", "feature", "start", "end", "score", "strand", "phase", "att")table(gff$feature)
g <- subset(gff, feature %in% "gene")g$geneid <- gsub(".*gene:|;biotype.*", "", g$att)### + strandgp <- subset(g, strand %in% "+") # nrow(gp) 75### get the 5k upstream of the + strand gene modelgp_up <- gpgp_up$end <- gp_up$start - 1gp_up$start <- gp_up$end - 5000 ### get the 5k downstream of the + strand gene modelgp_down <- gpgp_down$start <- gp_down$end + 1gp_down$end <- gp_down$start + 5000
### - strandgm <- subset(g, strand %in% "-") dim(gm) # 82fwrite(g, "cache/mt_gene.txt", sep="\t", row.names = FALSE, quote=FALSE)
library("data.table")library("GenomicRanges")library("plyr")theta <- fread("cache/theta.txt", data.table=FALSE)names(theta)[1] <- "seq"up5k <- read.table("cache/mt_gene_up5k.txt", header=TRUE)### define the subject file for theta valuesgrc <- with(theta, GRanges(seqnames=seq, IRanges(start=Pos, end=Pos)))### define the query file for genomic featuregrf <- with(up5k, GRanges(seqnames=seq, IRanges(start=start, end=end), geneid=geneid))### find overlaps between the twotb <- findOverlaps(query=grf, subject=grc)tb <- as.matrix(tb)out1 <- as.data.frame(grf[tb[,1]])out2 <- as.data.frame(grc[tb[,2]])### for each genomic feature, find the sites with non-missing dataout <- cbind(out1, out2[, "start"]) names(out)[ncol(out)] <- "pos"
#define unique identifier and merge with the thetasout$uid <- paste(out$seqnames, out$pos, sep="_")theta$uid <- paste(theta$seq, theta$Pos, sep="_")df <- merge(out, theta[, c(-1, -2)], by="uid")# for each upstream 5k region, how many theta valuesmx <- ddply(df, .(geneid), summarise, Pairwise = mean(Pairwise, na.rm=TRUE), thetaH = mean(thetaH, na.rm=TRUE), nsites = length(uid))
Copy and paste everything above, and pack it into an R
function:
get_mean_theta <- function(gf_file="cache/mt_gene_up5k.txt"){ # gf_file: gene feature file [chr, ="cache/mt_gene_up5k.txt"] theta <- fread("cache/theta.txt", data.table=FALSE) names(theta)[1] <- "seq" up5k <- read.table(gf_file, header=TRUE) ### define the subject file for theta values grc <- with(theta, GRanges(seqnames=seq, IRanges(start=Pos, end=Pos))) ### define the query file for genomic feature grf <- with(up5k, GRanges(seqnames=seq, IRanges(start=start, end=end), geneid=geneid)) ### find overlaps between the two tb <- findOverlaps(query=grf, subject=grc) tb <- as.matrix(tb) out1 <- as.data.frame(grf[tb[,1]]) out2 <- as.data.frame(grc[tb[,2]]) ### for each genomic feature, find the sites with non-missing data out <- cbind(out1, out2[, "start"]) names(out)[ncol(out)] <- "pos" #define unique identifier and merge with the thetas out$uid <- paste(out$seqnames, out$pos, sep="_") theta$uid <- paste(theta$seq, theta$Pos, sep="_") df <- merge(out, theta[, c(-1, -2)], by="uid") # for each upstream 5k region, how many theta values mx <- ddply(df, .(geneid), summarise, Pairwise = mean(Pairwise, na.rm=TRUE), thetaH = mean(thetaH, na.rm=TRUE), nsites = length(uid)) return(mx)}
Run the customized R function
### apply the functionup5k <- get_mean_theta(gf_file="cache/mt_gene_up5k.txt")down5k <- get_mean_theta(gf_file="cache/mt_gene_down5k.txt")
And then plot the results:
library("ggplot2")up5k$feature <- "up 5k"down5k$feature <- "down 5k"res <- rbind(up5k, down5k)ggplot(res, aes(x=feature, y=Pairwise, fill=feature)) + geom_violin(trim=FALSE)+ labs(title="Theta value", x="", y = "Log10 (theta)")+ geom_boxplot(width=0.1, fill="white")+ scale_fill_brewer(palette="Blues") + theme_classic()
In local computer, using R
:
s <- scan('../../cache/out.sfs')s <- s[-c(1,length(s))]s <- s/sum(s)barplot(s,names=1:length(s), main='SFS')
hist(t$Pairwise)
fst <- read.table("cache/fst_win.txt", skip=1, header=FALSE)names(fst)[c(3,5)] <- c("midp", "fst")plot(fst$midp, fst$fst, xlab="Physical position", ylab="Fst", col="#5f9ea0", pch=16)
Keyboard shortcuts
↑, ←, Pg Up, k | Go to previous slide |
↓, →, Pg Dn, Space, j | Go to next slide |
Home | Go to first slide |
End | Go to last slide |
Number + Return | Go to specific slide |
b / m / f | Toggle blackout / mirrored / fullscreen mode |
c | Clone slideshow |
p | Toggle presenter mode |
t | Restart the presentation timer |
?, h | Toggle this help |
Esc | Back to slideshow |