Hi all,
We were asked by a few users to show a basic workflow for this sub-challenge. Although we do not endorse any workflow at this point, I thought illustrating a simple one could save people some time.
The workflow below picks 3 genes previously reported to change with gestational age and fits a linear model that predicts gestational age (GA) from their expression. The RMSE of this model on the test data is 7.91.
#### #bring the gene-level expression object stored in the RData file into the workspace
load(file = "HTA20_RMA.RData")
#### #read the sample annotation table, keeping text columns as character
ano <- read.csv(file = "anoSC1_v11_nokey.csv", stringsAsFactors = FALSE)
#### #top 3 genes taken from Table S2 of https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6351599
threesymbs <- c("ANXA1", "IFIT1", "RPS24")
#### #get symbols for the expression matrix
library(org.Hs.eg.db)
#### #row names are looked up as Entrez gene IDs once the trailing "_at" suffix is removed
entrez_ids <- sub("_at$", "", rownames(eset_HTA20))
symbol <- as.vector(unlist(mget(entrez_ids, envir = org.Hs.egSYMBOL, ifnotfound = NA)))
#### #match() below relies on one symbol per row, so fail loudly if unlist() changed the length
stopifnot(length(symbol) == length(entrez_ids))
mypreds <- rownames(eset_HTA20)[match(threesymbs, symbol)] #find row names corresponding to the 3 genes
#### #stop with a readable message here instead of an obscure subscript error further down
missing_symbs <- threesymbs[is.na(mypreds)]
if (length(missing_symbs) > 0) {
  stop("No expression row found for gene symbol(s): ",
       paste(missing_symbs, collapse = ", "), call. = FALSE)
}
#### #training frame: response Y (GA) plus one column per selected expression row
train_ids <- ano$SampleID[ano$Train == 1]
train_expr <- t(eset_HTA20[mypreds, train_ids])
train <- data.frame(Y = ano$GA[ano$Train == 1], train_expr)
#### #test frame: the same expression columns for the samples flagged Train == 0
test_ids <- ano$SampleID[ano$Train == 0]
test <- data.frame(t(eset_HTA20[mypreds, test_ids]))
#### #regress Y on all remaining columns of the training frame
mod <- lm(Y ~ ., data = train)
#### #predict Y for the test samples
Ytest <- predict(mod, newdata = test)
#### #pull predictions at or beyond the 8 / 42 limits just inside the range
too_high <- which(Ytest >= 42)
too_low <- which(Ytest <= 8)
Ytest[too_high] <- 41.99
Ytest[too_low] <- 8.01
#### #assemble the two-column submission table and save it as CSV without row names
submission <- data.frame(SampleID = rownames(test), GA = Ytest)
write.csv(submission, file = "work_flow_SC1.csv", row.names = FALSE)