Analyzing Tissue Expression Data: Steps I followed

Queries / help discussions specific to microarray specific objects file formats such as GEO's SOFT/MIAMI Compliant format, Affy Format, Agilent format, Illumina Format and other

Analyzing Tissue Expression Data: Steps I followed

Postby prasad » Wed Mar 31, 2010 9:38 pm

    downloaded the Affy CEL files from GEO - My Analysis was Normal tissues - I took GSE7307.
    It had 677 files.
    Of that I took out a subset of Normal Male and Female tissues.
    From them, I created those with Replicates, those common.
    I finally got 106 Female Samples, 91 Male Samples Common containing 25 Tissues.

    Running R was very easy:

Code: Select all
library(affy)
setwd("C:/cygwin/home/anaghapc/TF11Oct09/ERData/GSE7307/Males")
memory.limit(size=4000);
data=ReadAffy();
signal=mas5(data);
write.exprs(signal, file="gse7307_Male_mas5.txt");
signal=rma(data);
write.exprs(signal, file="gse7307_Male_rma.txt");
library(gcrma)
signal=gcrma(data);
write.exprs(signal, file="gse7307_Male_gcrma.txt");

unfortunately R crashed on Windows with 4GB RAM/Vista. I had to go to Linux and install R/Biocondcutor. Then get to Console mode to reduce the RAM preoccupied on the system.
got lucky, finally I could get the final files. only RMA was successful for Male , Female.


The output from this i.e. gse7307_male_rma.txt (in this case) was input to the CyberT, source for which was in R code - hdarray.R

The Code is automated, which needs the Tissue Names, their respective CEL file names, and the RMA Normalized Single File.
That outputs the a file for each Tissue Pair comparison showing the pvalue / the significance of the differential expression.

Code: Select all

rm(list=ls())
setwd("C:\\prasad\\NormalizedRWork18Mar10")
dpath = "C:\\prasad\\NormalizedRWork18Mar10/"
filenames <- list.files(dpath,pattern="*.txt", all.files=TRUE,full.name=TRUE,recursive=FALSE);
shortnames <- list.files(dpath,pattern="*.txt", all.files=TRUE,full.name=FALSE,recursive=FALSE);
for ( fi in (1:length(filenames)) ) {
source('C:\\prasad\\NormalizedRWork18Mar10\\hdarraysource\\hdarray.R')
#library("fdrtool")
#library('xlsReadWrite')
print(shortnames[fi]);
Data <- read.table(filenames[fi], sep="\t", header=TRUE)
tmp = shortnames[fi];
tmp = substr(tmp,1,nchar(tmp)-4);
print(tmp);
#isFemale = grep('Female',filenames[fi])
#if (isFemale ){
#FileInfoName =  paste(dpath,'GSE7307_FemaleInfo', '.dat', sep='')
#} else {
FileInfoName =  paste(dpath,'GSE7307_MaleInfo', '.dat', sep='')
#}
ExpNames <-names(Data)
ChipNamesExp <- substr(ExpNames, 1, 9)
ChipInfo <- read.table(FileInfoName, sep="\t", header=TRUE)
Tissue = ChipInfo[[1]]
Tissue = gsub(" ","_",Tissue)
ChipNameCond = ChipInfo[[2]]
ChipNameCond = substr(ChipNameCond, 1, 9)
TissueUni = matrix(data =0, nrow = 25, ncol = 1)
cnt =0
for( i in (1:length(Tissue))) {
   Tmp <- grep('[a-z]', Tissue[i])
   if(!length(Tmp)){
      Tissue[i] = name
   } else {
      name = Tissue[i]

      cnt = cnt +1
      TissueUni[cnt] = as.character(name)
   }
}
nt = length(TissueUni)
# i is for control, j is for treatment

for (i in (1:(nt-1))) {
      indi = grep(paste('^', TissueUni[i], '$', sep=''), Tissue)
      ni = length(indi)
   for (j in ((i+1):nt)) {
      indj = grep(paste('^', TissueUni[j], '$', sep=''), Tissue)
      
      nj = length(indj)
      ind = 1
      for (k in (1:ni)) {
         indk = grep(paste('^', ChipNameCond[indi[k]], '$', sep=''), ChipNamesExp)
         ind = c(ind, indk)
      }
      for (k in (1:nj)) {
         indk = grep(paste('^', ChipNameCond[indj[k]], '$', sep=''), ChipNamesExp)
         ind = c(ind, indk)
      }
      rm(h);
      h = Data[ind]

      pierre(h, 2, (1+ni), (1+ni+1), (1+ni+nj), (1+ni+nj), 0.25, 101, min(10,3*ni,3*nj), min(ni,nj), 0, 1, 0, 0)
      #rm(ProcessedData);
      #ProcessedData <- read.table("allgene.txt", sep="\t", header=TRUE)
      tmp2 = paste(tmp,TissueUni[i], '_', TissueUni[j], sep='')   
print(paste(i,':',j,':',tmp2,sep=''));
      #write.table(ProcessedData, paste(dpath, tmp2, '.txt', sep=''), sep='\t')
      file.rename('allgene.txt',paste(dpath,'Male/', tmp2, '.txt', sep=''));
      file.rename('temp.ps',paste(dpath, 'Male/', tmp2, '.ps', sep=''));

   }
}

}



The above code gave an output of around 300 files which contained the significance statistic - the p-raw value.

Now I wanted to calculate the Correlation and RMSD between the Sample - in an all Tissues to All Tissues so that I can draw a Heat map among the Tissues which generated a 25 x 25 Matrix.
For this I used the following C++ Glue Code:
Code: Select all
   vector<string>TissueNames;
TissueNames.push_back("Accumbens");
TissueNames.push_back("Amygdala");
TissueNames.push_back("Aorta");
TissueNames.push_back("Cerebellum");
TissueNames.push_back("Cerebral_Cortex");
TissueNames.push_back("Corpus_Callosum");
TissueNames.push_back("Dorsal_Root_Ganglia");
TissueNames.push_back("Frontal_Lobe");
TissueNames.push_back("Hippocampus");
TissueNames.push_back("Hypothalamus");
TissueNames.push_back("Medulla");
TissueNames.push_back("Midbrain");
TissueNames.push_back("Nodose_Nucleus");
TissueNames.push_back("Occipital_Lobe");
TissueNames.push_back("Parietal_Lobe");
TissueNames.push_back("Pituitary_Gland");
TissueNames.push_back("Putamen");
TissueNames.push_back("Spinal_Cord");
TissueNames.push_back("Substantia_Nigra");
TissueNames.push_back("Subthalamic_Nucleus");
TissueNames.push_back("Temporal_Lobe");
TissueNames.push_back("Thalamus");
TissueNames.push_back("Trigeminal_Ganglia");
TissueNames.push_back("Ventral_Tegmental_Area");
TissueNames.push_back("Vestibular_Nuclei_Superior");
   
   string path="/home/anaghapc/TF11Oct09/ERData/GSE7307/RMANormalizedCyberTFemaleData24Mar10/Females";
   string sortpath2="/home/anaghapc/TF11Oct09/ERData/GSE7307/RMANormalizedCyberTFemaleData24Mar10/Females/Sorted";


   double corvals[25][25];
   double rmsdvals[25][25];
   for ( int i = 0; i < TissueNames.size() -1; i++)
   {
      for ( int j = i+1; j < TissueNames.size(); j++)
      {
         string fname = path+"/gse7307_Female_rma"+TissueNames[i]+"_"+TissueNames[j]+".txt";
      BioInputStream fin(fname);
      string aline;
      getline(fin,aline);
      replace (aline.begin(), aline.end(), '"',' ');
      vector<string> header = getWordsFromSentence(aline);

      int xcindex = getIndexForColumnName(header,"xC");
      int xeindex = getIndexForColumnName(header,"xE");

      vector<double> xcValues;
      vector<double> xeValues;
      
      while ( getline(fin, aline) ){

      replace (aline.begin(), aline.end(), '"',' ');
      istringstream ss2(aline);
      string tstr;
      ss2>>tstr;
      tstr = ss2.str().substr(tstr.length()+1);
      vector<string> meancols = getWordsFromSentence(tstr);

      xcValues.push_back(stringToDouble(meancols[xcindex]));
      xeValues.push_back(stringToDouble(meancols[xeindex]));
      //cout<<xcindex<<"\t"<<xeindex<<"\t"<<meancols[xcindex]<<"\t"<<meancols[xeindex]<<endl;
      //getchar();

      }

      
      corvals[i][i] = 1.0;
      corvals[j][j] = 1.0;

      rmsdvals[i][i] = 1.0;
      rmsdvals[j][j] = 1.0;

      corvals[i][j] = BioStatistics::getCorrelationCoefficient(xcValues,xeValues);
      corvals[j][i] = BioStatistics::getCorrelationCoefficient(xcValues,xeValues);
      rmsdvals[i][j] = BioStatistics::getRMSD(xcValues,xeValues);
      rmsdvals[j][i] = BioStatistics::getRMSD(xcValues,xeValues);
      
      cout<<TissueNames[i]<<","<<TissueNames[j]<<","<<corvals[i][j]<<","<<rmsdvals[i][j]<<endl;

      }
   }
   
   string outcorrname = path+"/FemaleRMA25x25CorrVals.txt";
   string outrmsdname = path+"/FemaleRMA25x25RmsdVals.txt";
   BioOutputStream fout1(outcorrname);
   BioOutputStream fout2(outrmsdname);

   for (int i = 0; i < TissueNames.size(); i++)
   {
      fout1<<"\t"<<TissueNames[i];
      fout2<<"\t"<<TissueNames[i];
   }
   fout1<<endl;
   fout2<<endl;

   for ( int i = 0; i < 25; i++)
   {
      fout1<<TissueNames[i]<<"\t";
      fout2<<TissueNames[i]<<"\t";
      for ( int j = 0; j < 25; j++)
      {
         fout1<<corvals[i][j]<<"\t";
         fout2<<rmsdvals[i][j]<<"\t";
      }
      fout1<<endl;
      fout2<<endl;
   }
   



I took this 25x25 Matrix Code and run the following Code in R to get the Heat Map.

Code: Select all
library(RColorBrewer)
library(gplots)
x=read.table("C:/Users/anaghapc/Desktop/Norm.csv", header=TRUE)
mat=data.matrix(x)
pdf("C:/Users/anaghapc/Desktop/Norm.pdf", height=10, width=10)
heatmap.2(mat,
Rowv=FALSE,
Colv=TRUE,
    dendrogram= c("column"),
distfun = dist,
hclustfun = hclust,
xlab = "X data", ylab = "Y data",
key=TRUE,
keysize=1,
trace="none",
density.info=c("none"),
margins=c(10, 8),
#col=brewer.pal(10,"PiYG")
    col=greenred(75),
)
dev.off()


prasad
 
Posts: 8
Joined: Sat Mar 20, 2010 4:51 am

Return to Microarray Domain

Who is online

Users browsing this forum: No registered users and 1 guest

cron