Metabolomics pathway based diagnosis processing R code, by Sijia Huang
If you have any questions, feel free to contact Sijia Huang at: SHuang@cc.hawaii.edu, thanks!

Form our metabolites library

con <- file('metabolites.gmt') 
open(con);
metabolites.list <- list();
pathway.list <- list();
current.line <- 1
while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) {
 tempvec <- (unlist(strsplit(line, split="\t")))
 metabolites.list[[current.line]] <- tempvec[2:length(tempvec)]
 pathway.list[[current.line]] <- tempvec[1]
 current.line <- current.line + 1
} 
close(con)

Delete tabs in the list

for (i in 1:length(metabolites.list))
{
metabolites.list[[i]]<-subset(metabolites.list[[i]],metabolites.list[[i]]!=" ")
}

Mapping your file metabolite IDs to HMDB id

your_map_ID<-read.csv("your metabolite ID file.csv",header=T)  
#### sample file ###   your_map_ID<-read.csv("COH_plasma_ID.csv",header=T) 
your_data<-read.table("your metabolite data file.txt",header=T)
#### sample file ###   your_data<-read.csv("COH plasma.txt",header=T) 
### colnames(your_data)<-your_map_ID[,1]
new<-read.table("HMDB_pathway_mapping.txt",header=T,sep="\t") 
### our provided file: HMDB_pathway_mapping.txt ###
HMDB_ID<-as.character(as.matrix(new[,2]))
HMDB_ID<-tolower(HMDB_ID)
your_map_ID<-tolower(your_map_ID) ### your_map_ID<-tolower(your_map_ID[,1])

synonym<-read.csv("synonym_result.csv",header=T)
### our provided file: synonym_result.csv ###
synonym<-synonym[,-4]
synonymID<-tolower(as.character(synonym[,2]))
#summary(your_map_ID %in% synonymID)
#summary(your_map_ID %in% union(HMDB_ID,synonymID))
##....other mapping tools ##

map your file to HMDB IDs and then do pathifier

pathifier process

For your new data that you don’t know if there is breast cancer or not, combine with our data and suppose the new data is a cancer patient.
This new data will be processed together with our stored data and when your sample PDS score is closer to our normal samples, which will get you a lower probability with our logistic model, your samples will be classified as normal
library(pathifier)
### need to input normtf for your own data
### For your new data that you don't know if there is breast cancer or not, combine with our data and suppose the new data is a cancer patient. 

qpd <- quantify_pathways_deregulation(your_data, gnames,metabolites.list, pathway.list, normtf, attempts = 5,min_exp=0, min_std=0)  
qpdmat <- matrix(as.data.frame(qpd$scores), nrow=length(qpd$scores)/ncol(your_data), byrow=TRUE)
dim(qpdmat)
colnames(qpdmat) <- colnames(your_data)
rownames(qpdmat) <- names(qpd$scores)
write.table(qpdmat, "normalized pathifier matrix.txt",sep="\t", row.names=TRUE, col.names=TRUE, quote=FALSE)

feature selection process

pds_matrix<-read.table("normalized pathifier matrix.txt",header=T)
real_tf<-!normtf
pds_matrix<-as.data.frame(cbind(t(pds_matrix),realtf))
your_case<-pds_matrix[pds_matrix[,"real_tf"]==TRUE,]
your_control<-pds_matrix[pds_matrix[,"real_tf"]==FALSE,]

####set.seed(x)
#### need your number of cases as input
training_case_ID<-sample(row.names(your_case),round(ncases*0.8),replace=F) 
#### need your number of controls as input
training_control_ID<-sample(row.names(your_ctrl),round(ncontrols*0.8),replace=F)  

testing_case_ID<-setdiff(row.names(your_case),training_case_ID)
testing_ctrl_ID<-setdiff(row.names(your_ctrl),training_control_ID)

training_diagnosis<-rbind(your_case[training_case_ID,],your_ctrl[training_control_ID,])
testing_diagnosis<-rbind(your_case[testing_case_ID,],your_ctrl[testing_ctrl_ID,])

write.csv(training_diagnosis,"training_data.csv",quote=F,row.names=F)
write.csv(training_diagnosis,"training_data.csv",quote=F,row.names=F)
After generating the cases and controls with trainings and testings randomly
You need to use Weka to do feature selection with 10-fold cross-validation with CFS
Search method use “BestFirst -D 1 -N 5”, which is the default parameter setting #####
WEKA is downloadable from http://www.cs.waikato.ac.nz/ml/weka/downloading.html

After the features are selected

logistic regression and prediction process

### Your input pathways as listed: ###
### diagnosis_pathway_names<-c("aaa","bbb","ccc",...,"") ###
### example pathway list: 
diagnosis_pathway_names<-c("CAMP Signaling Pathway","Glutathione Metabolism","Glycine  Serine And Threonine Metabolism","Methionine Metabolism","Mitochondrial Beta-Oxidation Of Medium Chain Saturated Fatty Acids","Phospholipid Biosynthesis","Propanoate Metabolism","Taurine And Hypotaurine Metabolism")

zd_training<- as.numeric(!normtf)
glm.out.diagnosis = glm(zd_training ~ .,family=binomial, data=training_diagnosis[,diagnosis_pathway_names])
predictedStage<-as.numeric(predict(glm.out.diagnosis, type="response"))
referenceStage<-as.numeric(zd_training)
library(pROC)
pred_yourdata_diagnosis<- roc(referenceStage,predictedStage)
pred_yourdata_diagnosis<-smooth(pred_yourdata_diagnosis,method="density")
pred_yourdata_diagnosis ### summary of your ROC object, you can use this command to see the AUC of your ROC curve
plot(pred_yourdata_diagnosis,lwd=2,col="black")  ### plot of your ROC object
ci.auc(pred_yourdata_diagnosis) ### Use bootstrap method to generate confidence intervals ### 

predictedStage[predictedStage>=0.5]<-1
predictedStage[predictedStage<0.5]<-0
referenceStage<-as.numeric(zd_testing)
confusionmatrix<-table(predictedStage,referenceStage)
TP<-confusionmatrix[2,2]
TN<-confusionmatrix[1,1]
FP<-confusionmatrix[2,1]
FN<-confusionmatrix[1,2]

sens.diagnosis <- TP/(TP+FN) ###
spec.diagnosis <- TN/(TN+FP) ### 
MCC.diagnosis <- ((TP*TN)-(FP*FN))/sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))  #### 
F.diagnosis <-2*TP/(2*TP+FP+FN) ### 


### for testing dataset prediction ###
### need input such as: testing_diagnosis (testing pds matrix data) ###
predictedStage<-as.numeric(predict(glm.out.diagnosis,newdata=as.data.frame(testing_diagnosis[,diagnosis_pathway_names]),type="response")) ### check for