############################################################################################ ####sort out the bismark extracted data/ R script - open with R or Rstudio chr <- "SL3.0ch02" plants <- c("sulf_1", "sulf_2", "WT_1", "WT_2") contexts <- c("CG", "CHG", "CHH") for(j in 1:length(plants)) { plant <- plants[j] pefile <- dir(".", pattern = paste(plant, ".deduplicated.CX_reportchr", chr, ".txt", sep="")) message(pefile) df <- read.table(pefile, header=F) for (i in 1:3) {context <- contexts[i] sub_df <- df[(df$V6==context),1:5] write.table(sub_df, file = paste(pefile, "_", context, "_summarised.txt", sep = ""), sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE) } } library(segmentSeq) for (i in 1:length(contexts)) {context <- contexts[i] files <- dir(".", pattern = paste(chr, ".*", context, sep="")) ####dir is the folder containing the summarized txt files dir <- "/sulf_data" libnames <- c("sulf_1", "sulf_2", "WT_1", "WT_2") replicates <- c("sulf", "sulf", "WT", "WT") nonconversion <- c(0.00414, 0.00403, 0.00403, 0.00382) aM <- readMeths(files=files, dir=dir, libnames=libnames, replicates=replicates, nonconversion=nonconversion) save(aM, file = paste("aM_", context, "_", chr, ".RData", sep = "")) } ############################################################################################ ####context and subcontext annotation library(segmentSeq) ####The R function to extract genome sequence extractSequences <- function(fastaFile) { fasta <- scan(fastaFile, sep = "\n", what = "character") fastaHeads <- gsub(">", "", fasta[grep(">", fasta)]) headloc <- grep(">", fasta) fastaSeq <- sapply(1:length(headloc), function(ii) toupper(paste(fasta[(headloc[ii] + 1):c(headloc[-1] - 1, length(fasta))[ii]], collapse = ""))) names(fastaSeq) <- fastaHeads fastaSeq } tomatoSeq <- extractSequences("S_lycopersicum_SL3.0.fa") tomatoSeq <- lapply(tomatoSeq, function(x) strsplit(x, "")[[1]]) cytypes <- c("CHG", "CHH") for(cytype in cytypes) { cyfile <- dir(pattern = paste("^aM_", cytype, "_.*\\.RData", sep = "")) chrfiles <- cyfile[grep("{2}", cyfile)] for (i in 1:length(chrfiles)) {chrfile <- chrfiles[i] tomatoSeqx <- tomatoSeq[[i]] load(chrfile) c1 <- tomatoSeqx[start(aM@alignments)] c2 <- tomatoSeqx[start(aM@alignments) + c(1,-1)[as.integer(strand(aM@alignments) == "-") + 1]] c3 <- tomatoSeqx[start(aM@alignments) + c(2,-2)[as.integer(strand(aM@alignments) == "-") + 1]] context <- paste(c1, c2, c3, sep = "") context[which(strand(aM@alignments) == "-")] <- chartr("ATGC","TACG",context[which(strand(aM@alignments) == "-")]) aM@alignments$context <- context save(aM, file = chrfile) } } ############################################################################################ ####calculate subcontext methylation of each region library(segmentSeq) library(GenomicRanges) raw <- NULL ####sulf_DMR1 is the DMR1 region of slTAB2 df <- read.csv("sulf_DMR1.csv", header=T) load("aM_CHG_SL3.0ch02.RData") subcontexts <- c("CCG", "CAG", "CTG") for(x in 1:length(subcontexts)) { subcontext <- subcontexts[x] aM <- aM[aM@alignments$context==subcontext,] for(c in 1:nrow(df)) {d <- df[c,] d_GR <- makeGRangesFromDataFrame(d) d_meth <- findOverlaps(aM@alignments, d_GR) m <- NULL n <- NULL newCs <- NULL newTs <- NULL overlap <- unique(d_meth@from) for (i in 1:length(overlap)) { j <- overlap[i] m <- aM@Cs[j,] n <- aM@Ts[j,] newCs <- rbind(newCs, m) newTs <- rbind(newTs, n) } new <- cbind(newCs, newTs) colnames(new) <- c("sulf_1C", "sulf_2C", "WT_1C", "WT_2C", "sulf_1T", "sulf_2T", "WT_1T", "WT_2T") sulf_total=sum(new[,1])+sum(new[,2])+sum(new[,5])+sum(new[,6])) WT_total=sum(new[,3])+sum(new[,4])+sum(new[,7])+sum(new[,8]) sulf_methylation <- (sum(new[,1])+sum(new[,2]))/sulf_total WT_methylation <- (sum(new[,3])+sum(new[,4]))/WT_total total <- data.frame(seqnames=d$seqnames, start=d$start, end=d$end, strand=d$strand, sulf=sulf_methylation,WT=WT_methylation) raw <- rbind(raw, total) } write.csv(raw, file=paste("sulf_DMR1_", subcontext, ".csv", sep="")) }