# ------------------------- # loading libraries # ------------------------- library(patRoon) library("xlsx") library(readxl) library("ggpubr") library(corrplot) library(xcms) # ------------------------- # initialization # ------------------------- workPath <- "D:/Youssef/Optimization/" setwd(workPath) anaInfo <- generateAnalysisInfo(path = "E:/Mohammad/data/20211130-YB-DD/20211130-YB-DD centroid mzXML", group = c("Blank", "Blank", "Blank", "After", "After", "After"), blank = c("Blank")) # Set to FALSE to skip data pre-treatment doDataPretreatment <- FALSE if (doDataPretreatment) { convertMSFiles(anaInfo = anaInfo, from = "bruker", to = "mzML", algorithm = "pwiz", centroid = TRUE) } # amount of iterations n = 150 suspFile1 <- read_xls("D:/Youssef/Optimization/formula_mass_with_RT.xls") fL <- c() fG <- c() FfG <- c() SfG <- c() SFfG <- c() ################################################################################################### FF (Feature Finding) using OpenMS #generating lists/vectors of variables in random order within a certain range list_noiseThrInt = floor(runif(n, min=500 , max=1000)) list_chromSNR = floor(runif(n, min=2 , max=10)) list_chromFWHM = floor(runif(n, min=3 , max=30)) list_minFWHM = floor(runif(n, min=3 , max=10)) list_maxFWHM = floor(runif(n, min=11 , max=30)) list_mzPPM = floor(runif(n, min=2 , max=30)) for (i in 1:n) { fListopenms <- findFeatures(anaInfo, "openms", noiseThrInt = list_noiseThrInt[i], chromSNR = list_chromSNR[i], chromFWHM = list_chromFWHM[i], minFWHM = list_minFWHM[i], maxFWHM = list_maxFWHM[i], mzPPM = list_mzPPM[i]) #finding features fL[i] <- length(fListopenms) fGroups <- groupFeatures(fListopenms, "openms") #grouping of features fG[i] <- length(fGroups) FfGroups <- filter(fGroups, preAbsMinIntensity = 100, absMinIntensity = 10000, relMinReplicateAbundance = 1, #rule-based filtering maxReplicateIntRSD = 0.75, blankThreshold = 5, removeBlanks = TRUE, retentionRange = c(200, Inf), mzRange = c(100, 1000)) FfG[i] <- length(FfGroups) SfGroups <- screenSuspects(fGroups, suspFile1, rtWindow = 12, mzWindow = 0.005, adduct = "[M-H]-", onlyHits = TRUE) #suspect screening of non-filtered groups SfG[i] <- length(SfGroups) SFfGroups <- screenSuspects(FfGroups, suspFile1, rtWindow = 12, mzWindow = 0.005, adduct = "[M-H]-", onlyHits = TRUE) #suspect screening of filtered groups SFfG[i] <- length(SFfGroups) } #put data from loop into data frame df <- data.frame(list_noiseThrInt,list_chromSNR,list_chromFWHM,list_minFWHM,fL,list_maxFWHM,list_mzPPM,fG,FfG,SfG,SFfG) #sort from highest filtered suspect hits to lowest sorted_df <- df[order(-SFfG),] write.xlsx(sorted_df, "D:/Youssef/Optimization/parameter values OpenMS standard parameters.xlsx", sheetName = "df", append = FALSE) ################################################################################################################################### FF using XCMS3 #generating lists/vectors of variables in random order within a certain range list_mzPPM = floor(runif(n, min=2 , max=30)) list_peakwidthmin = floor(runif(n, min=3 , max=10)) list_peakwidthmax = floor(runif(n, min=11 , max=30)) list_snthresh = floor(runif(n, min=2 , max=10)) list_k = floor(runif(n, min=2 , max=10)) list_minIntensity = floor(runif(n, min=1000 , max=5000)) list_noise = floor(runif(n, min=1000 , max=5000)) for (i in 1:n) { fListxcms <- findFeatures(anaInfo, "xcms3",param = xcms::CentWaveParam( noise = list_noise[i], ppm = list_mzPPM[i], #finding features peakwidth = c(list_peakwidthmin[i] , list_peakwidthmax[i]), snthresh = list_snthresh[i], prefilter = c(list_k[i], list_minIntensity[i]))) fL[i] <- length(fListxcms) fGroups <- groupFeatures(fListxcms, "openms") #grouping of features fG[i] <- length(fGroups) FfGroups <- filter(fGroups, preAbsMinIntensity = 100, absMinIntensity = 10000, relMinReplicateAbundance = 1, #rule-based filtering maxReplicateIntRSD = 0.75, blankThreshold = 5, removeBlanks = TRUE, retentionRange = c(200, Inf), mzRange = c(100, 1000)) FfG[i] <- length(FfGroups) SfGroups <- screenSuspects(fGroups, suspFile1, rtWindow = 12, mzWindow = 0.005, adduct = "[M-H]-", onlyHits = TRUE) #suspect screening of non-filtered groups SfG[i] <- length(SfGroups) SFfGroups <- screenSuspects(FfGroups, suspFile1, rtWindow = 12, mzWindow = 0.005, adduct = "[M-H]-", onlyHits = TRUE) #suspect screening of filtered groups SFfG[i] <- length(SFfGroups) } #out data into dataframe df <- data.frame(list_mzPPM,list_peakwidthmin,list_peakwidthmax,list_snthresh,list_k,list_minIntensity,list_noise,fL,fG,FfG,SfG,SFfG) sorted_df <- df[order(-SFfG),] #write dataframe to excel sheet write.xlsx(sorted_df, "D:/Youssef/Optimization/parameter values XCMS3 standard parameters.xlsx", sheetName = "df", append = FALSE) ##########################################E################################################################################### FF using KPIC2 #generating lists/vectors of variables in random order within a certain range list_level = floor(runif(n, min=1000 , max=5000)) list_min_snr = floor(runif(n, min=2 , max=100)) for (i in 1:n) { fListkpic<- findFeatures(anaInfo, "kpic2", level = list_level[i], min_snr = list_min_snr[i]) #finding features fL[i] <- length(fListkpic) fGroups <- groupFeatures(fListkpic, "openms") #grouping of features fG[i] <- length(fGroups) FfGroups <- filter(fGroups, preAbsMinIntensity = 100, absMinIntensity = 10000, relMinReplicateAbundance = 1, #rule-based filtering maxReplicateIntRSD = 0.75, blankThreshold = 5, removeBlanks = TRUE, retentionRange = c(200, Inf), mzRange = c(100, 1000)) FfG[i] <- length(FfGroups) SfGroups <- screenSuspects(fGroups, suspFile1, rtWindow = 12, mzWindow = 0.005, adduct = "[M-H]-", onlyHits = TRUE) #suspect screening of non-filtered groups SfG[i] <- length(SfGroups) SFfGroups <- screenSuspects(FfGroups, suspFile1, rtWindow = 12, mzWindow = 0.005, adduct = "[M-H]-", onlyHits = TRUE) #suspect screening of filtered groups SFfG[i] <- length(SFfGroups) } df <- data.frame(list_level,list_min_snr,fL,fG,FfG,SfG,SFfG) sorted_df <- df[order(-SFfG),] write.xlsx(df, "D:/Youssef/Optimization/parameter values KPIC2 standard parameters.xlsx", sheetName = "df", append = FALSE) ##########################################E################################################################################### FF using SAFD #generating lists/vectors of variables in random order within a certain range list_resolution = floor(runif(n, min=10000 , max=100000)) list_minInt = floor(runif(n, min=1000 , max=5000)) list_maxNumbIter = floor(runif(n, min=5000 , max=10000)) list_minPeakWS = floor(runif(n, min=3 , max=10)) list_maxTPeakW = floor(runif(n, min=100 , max=400)) list_S2N = floor(runif(n, min=3 , max=100)) #as for the anaInfo, samples are run as sets of 3 ([1:3,} etc) and then run together afterwards. This is done since not enough RAM is available otherwise for (i in 1:6) { fListSAFDblanks<- findFeatures(anaInfo[1:3,], "safd", mzRange = c(100,1000), profPath = NULL, resolution = list_resolution[i], minInt = list_minInt[i], maxNumbIter = list_maxNumbIter[i], minPeakWS = list_minPeakWS[i], maxTPeakW = list_maxTPeakW[i], S2N = list_S2N[i]) fListSAFDspikebefore<- findFeatures(anaInfo[4,], "safd", mzRange = c(100,1000), profPath = NULL, resolution = list_resolution[i], minInt = list_minInt[i], maxNumbIter = list_maxNumbIter[i], minPeakWS = list_minPeakWS[i], maxTPeakW = list_maxTPeakW[i], S2N = list_S2N[i]) fListSAFDspikebefore<- findFeatures(anaInfo[5,], "safd", mzRange = c(100,1000), profPath = NULL, resolution = list_resolution[i], minInt = list_minInt[i], maxNumbIter = list_maxNumbIter[i], minPeakWS = list_minPeakWS[i], maxTPeakW = list_maxTPeakW[i], S2N = list_S2N[i]) fListSAFDspikebefore<- findFeatures(anaInfo[6,], "safd", mzRange = c(100,1000), profPath = NULL, resolution = list_resolution[i], minInt = list_minInt[i], maxNumbIter = list_maxNumbIter[i], minPeakWS = list_minPeakWS[i], maxTPeakW = list_maxTPeakW[i], S2N = list_S2N[i]) # fListSAFDspikeafter<- findFeatures(anaInfo[6:7,], "safd", mzRange = c(100,1000), profPath = NULL, # resolution = list_resolution[i], minInt = list_minInt[i], maxNumbIter = list_maxNumbIter[i], # minPeakWS = list_minPeakWS[i], maxTPeakW = list_maxTPeakW[i], S2N = list_S2N[i]) # fListSAFDspikebefore<- findFeatures(anaInfo[7:8,], "safd", mzRange = c(100,1000), profPath = NULL, # resolution = list_resolution[i], minInt = list_minInt[i], maxNumbIter = list_maxNumbIter[i], # minPeakWS = list_minPeakWS[i], maxTPeakW = list_maxTPeakW[i], S2N = list_S2N[i]) fListSAFD <- findFeatures(anaInfo, "safd", mzRange = c(100,1000), profPath = NULL, resolution = list_resolution[i], minInt = list_minInt[i], maxNumbIter = list_maxNumbIter[i], minPeakWS = list_minPeakWS[i], maxTPeakW = list_maxTPeakW[i], S2N = list_S2N[i]) fL[i] <- length(fListSAFD) fGroups <- groupFeatures(fListSAFD, "openms") #grouping of features fG[i] <- length(fGroups) FfGroups <- filter(fGroups, preAbsMinIntensity = 100, absMinIntensity = 10000, relMinReplicateAbundance = 1, #rule-based filtering maxReplicateIntRSD = 0.75, blankThreshold = 5, removeBlanks = TRUE, retentionRange = c(200, Inf), mzRange = c(100, 1000)) FfG[i] <- length(FfGroups) SfGroups <- screenSuspects(fGroups, suspFile1, rtWindow = 12, mzWindow = 0.005, adduct = "[M-H]-", onlyHits = TRUE) #suspect screening of non-filtered groups SfG[i] <- length(SfGroups) SFfGroups <- screenSuspects(FfGroups, suspFile1, rtWindow = 12, mzWindow = 0.005, adduct = "[M-H]-", onlyHits = TRUE) #suspect screening of filtered groups SFfG[i] <- length(SFfGroups) } df <- data.frame(list_resolution,list_minInt,list_maxNumbIter,list_minPeakWS,list_maxTPeakW,list_S2N,fL,fG,FfG,SfG,SFfG) sorted_df <- df[order(-SFfG),] write.xlsx(df, "D:/Youssef/Optimization/parameter values SAFD 150 iterarions opmitization round 2.xlsx", sheetName = "df", append = FALSE) ##########################################E################################################################################### FF using BRUKER fListbruker<- findFeatures(anaInfo, "bruker", doFMF = "auto") #finding features fL <- length(fListbruker) fGroups <- groupFeatures(fListbruker, "openms") #grouping of features fG <- length(fGroups) FfGroups <- filter(fGroups, preAbsMinIntensity = 100, absMinIntensity = 10000, relMinReplicateAbundance = 1, #rule-based filtering maxReplicateIntRSD = 0.75, blankThreshold = 5, removeBlanks = TRUE, retentionRange = c(200, Inf), mzRange = c(100, 1000)) FfG <- length(FfGroups) SfGroups <- screenSuspects(fGroups, suspFile1, rtWindow = 12, mzWindow = 0.005, adduct = "[M-H]-", onlyHits = TRUE) #suspect screening of non-filtered groups SfG <- length(SfGroups) SFfGroups <- screenSuspects(FfGroups, suspFile1, rtWindow = 12, mzWindow = 0.005, adduct = "[M-H]-", onlyHits = TRUE) #suspect screening of filtered groups SFfG <- length(SFfGroups) df <- data.frame(fL,fG,FfG,SfG,SFfG) write.xlsx(df, "D:/Youssef/Optimization/parameter values Bruker DataAnalysis.xlsx", sheetName = "df", append = FALSE) ################################################################################################################# #loop for testing OpenMS alignment parameters using random number generation ################################################################################################################# #amoun of iterations n = 150 #parameters to be optimized list_maxAlignRT = floor(runif(n, min=1 , max=30)) list_maxGroupRT = floor(runif(n, min=1 , max=12)) list_maxAlignMZ <- c() list_maxGroupMZ <- c() for (a in 1:n) { values_maxalignMZ = sample(seq(0.001, 0.010, by=0.001)) #generate random values between 0.001 and 0.010 with steps of 0.001 values_maxgroupMZ = sample(seq(0.001, 0.010, by=0.001)) list_maxAlignMZ[a] <- values_maxalignMZ list_maxGroupMZ[a] <- values_maxgroupMZ } SFfGopenms <- c() SFfGxcms <- c() SFfGSAFD <- c() SFfGkpic <- c() suspFile1 <- read_xls("D:/Youssef/Optimization/formula_mass_with_RT.xls") for (i in 1:n) { fGroupsopenms <- groupFeaturesOpenMS(fListopenms, rtalign = TRUE, maxAlignRT = list_maxAlignRT, maxAlignMZ = list_maxAlignMZ, maxGroupRT = list_maxGroupRT[i], maxGroupMZ = list_maxGroupMZ) fGroupsxcms <- groupFeaturesOpenMS(fListxcms,rtalign = TRUE, maxAlignRT = list_maxAlignRT, maxAlignMZ = list_maxAlignMZ, maxGroupRT = list_maxGroupRT[i], maxGroupMZ = list_maxGroupMZ) fGroupsSAFD <- groupFeaturesOpenMS(fListSAFD,rtalign = TRUE, maxAlignRT = list_maxAlignRT, maxAlignMZ = list_maxAlignMZ, maxGroupRT = list_maxGroupRT[i], maxGroupMZ = list_maxGroupMZ) fGroupskpic <- groupFeaturesOpenMS(fListkpic,rtalign = TRUE,maxAlignRT = list_maxAlignRT[i], maxAlignMZ = list_maxAlignMZ[i], maxGroupRT = list_maxGroupRT[i], maxGroupMZ = list_maxGroupMZ[i]) FfGroupsopenms <- filter(fGroupsopenms, preAbsMinIntensity = 100, absMinIntensity = 10000, relMinReplicateAbundance = 1, maxReplicateIntRSD = 0.75, blankThreshold = 5, removeBlanks = TRUE, retentionRange = c(200, Inf), mzRange = c(100, 1000)) FfGroupsxcms <- filter(fGroupsxcms, preAbsMinIntensity = 100, absMinIntensity = 10000, relMinReplicateAbundance = 1, maxReplicateIntRSD = 0.75, blankThreshold = 5, removeBlanks = TRUE, retentionRange = c(200, Inf), mzRange = c(100, 1000)) FfGroupsSAFD <- filter(fGroupsSAFD, preAbsMinIntensity = 100, absMinIntensity = 10000, relMinReplicateAbundance = 1, maxReplicateIntRSD = 0.75, blankThreshold = 5, removeBlanks = TRUE, retentionRange = c(200, Inf), mzRange = c(100, 1000)) FfGroupskpic <- filter(fGroupskpic, preAbsMinIntensity = 100, absMinIntensity = 10000, relMinReplicateAbundance = 1, maxReplicateIntRSD = 0.75, blankThreshold = 5, removeBlanks = TRUE, retentionRange = c(200, Inf), mzRange = c(100, 1000)) SFfGroupsopenms <- screenSuspects(FfGroupsopenms, suspFile1, rtWindow = 12, mzWindow = 0.005, adduct = "[M-H]-", onlyHits = TRUE) SFfGroupsxcms <- screenSuspects(FfGroupsxcms, suspFile1, rtWindow = 12, mzWindow = 0.005, adduct = "[M-H]-", onlyHits = TRUE) SFfGroupsSAFD <- screenSuspects(FfGroupsSAFD, suspFile1, rtWindow = 12, mzWindow = 0.005, adduct = "[M-H]-", onlyHits = TRUE) SFfGroupskpic <- screenSuspects(FfGroupskpic, suspFile1, rtWindow = 12, mzWindow = 0.005, adduct = "[M-H]-", onlyHits = TRUE) SFfGopenms[i] <- length(SFfGroupsopenms) SFfGxcms[i] <- length(SFfGroupsxcms) SFfGSAFD[i] <- length(SFfGroupsSAFD) SFfGkpic[i] <- length(SFfGroupskpic) } alignment_df <- data.frame(list_maxAlignRT, list_maxGroupRT, list_maxAlignMZ ,list_maxGroupMZ,SFfGopenms,SFfGxcms,SFfGSAFD) write.xlsx(alignment_df, "D:/Youssef/Optimization/feature alignment optimization round 1.xlsx", sheetName = "alignment_df", append = FALSE) ##########################################E################################################################################ data import and plotting dfSARD <- read_xlsx("D:/Youssef/Optimization/parameter values SAFD 150 iterarions opmitization round 2.xlsx") param_df <- df[,] corrdf <- cor(param_df) #plot correlation matrix corrplot(corrdf, type="upper", method="number", bg = "lightblue") #plotting scatterplot ggscatter(dfSARD, x = "list_maxTPeakW" # parameter to plot , y = "SFfG", # suspect hit to plot add = "reg.line", conf.int = TRUE, cor.coef = TRUE, cor.method = "spearman", xlab = "list_maxTPeakW", ylab = "SuspectHits ") # ------------------------- # Automated Parameter Optimization using IPO # ------------------------- pSetOpenMS <- list(noiseThrInt = c(1000,5000), chromSNR = c(3, 100), chromFWHM = c(3, 30), minFWHM = c(3,10), maxFWHM = c(11,30), mzPPM = c(2,30)) future::plan("multisession", workers = 8) # to work in paralla ftOpt <- optimizeFeatureFinding(anaInfo, "openms", pSetOpenMS, parallel = TRUE, paramRanges = list(noiseThrInt = c(1000, Inf), chromSNR = c(3, Inf), chromFWHM = c(3, Inf), minFWHM = c(3, Inf), maxFWHM = c(11, Inf), mzPPM = c(2, Inf))) pSetXCMS3 <- list(ppm = c(2,30), noise = c(1000, 5000), snthresh = c(3, 100)) future::plan("multisession", workers = 8) ftOpt <- optimizeFeatureFinding(anaInfo, "xcms3", pSetXCMS3, parallel = TRUE, paramRanges = list(ppm = c(2,Inf), noise = c(1000, Inf), snthresh = c(3, Inf) ))