Dear all,

I'm analysing a Rat Gene ST 2.1 array. I would like to filter the dataset 
using thresholds of expression for a minimum proportion of samples in each 
group.
I've been following the paper by Rodrigo-Domingo et al. (2014) Reproducible 
probe-level analysis of the Affymetrix Exon 1.0 ST array with 
R/Bioconductor.

They have two steps to the filtering step, first they filter probesets and 
then the transcripts. I'm getting stuck at the filtering step (code chunk 
number 7: intensityFiltering in the R code provided in this paper), as I 
can't find a way to summarise the Gene St array at the probeset level in 
order to do the first step. I have used plm <- RmaPlm(can) to summarise my 
data at the transcript level, but the plmPs  <- RmaPlm(csN, mergeGroups = 
FALSE) seems to summarise by transcripts as well.

So my questions are:
1) Is it possible to summarise a Gene ST array at the probeset level? If 
yes, how?
2) Less specific to the aroma-affymetrix package, but is it necessary to 
have the probeset-level dataset in order to filter present/absent 
probes/transcripts? What would be an appropriate workflow for this?

Thank you very much for your help,
Best wishes,
Sophie.

Here is the code chunk from that paper:

###################################################
### code chunk number 7: intensityFiltering
###################################################

# ** user-defined groups; default: groups defined by the treatment column 
in SampleInformation.txt
sample.info$number <- seq(1, nrow(sample.info))
groups <- split(sample.info$number, sample.info$treatment)

# check whether the filtering is already performed, perform it otherwise
# at probeset level
if(file.exists(file = paste(output.folder, "/PresentProbesets", ds, ".Rdata"
, sep = ""))){
 load(file = paste(output.folder, "/PresentProbesets", ds, ".Rdata", sep = 
""))
} else {
 # remove cross-hybridising probesets
 non.crosshyb.probesets <- probesets.NetAffx.32$probeset_id[probesets.
NetAffx.32$crosshyb_type == 1] 
 if(!exists("exFit")){
  load(file= paste(output.folder, "CoreExonIntensities.Rdata", sep = "/"))
 }
 exon.intensities <- exFit[exFit$groupName %in% non.crosshyb.probesets, -c(3
:5)]
 rm("exFit")
 gc()

 # ** user-defined criteria for absent/present probesets
 # ** criterion 1: probeset absent/present in a group of samples: present 
in at least half of the samples
 present.exons <- lapply(groups, FUN = function(group){
   if(length(group) > 1){
     apply(log2(exon.intensities[, group + 2]) < 3, 1, sum)/length(group) < 
0.5
   } else {
     log2(exon.intensities[, group + 2]) > 3
   }
 })
 present.exons <- t(do.call(rbind, present.exons))  # convert to dataframe
 rownames(present.exons) <- exon.intensities$groupName  # use probeset 
identities

 # ** criterion 2: probeset absent/present in the dataset
 # remove probesets not present in any of the groups
 absent.exons <- apply(present.exons, 1, AllFalse)
 probesets.to.keep <- absent.exons[absent.exons == FALSE]
 probesets.to.keep <- as.factor(names(probesets.to.keep))
 n.present.exons <- length(probesets.to.keep)
 rm(list = c("absent.exons"))
 save(probesets.to.keep, file = paste(output.folder, "/PresentProbesets", ds
, ".Rdata", sep = ""))
}
# check whether transcript filtering has been performed; perform it if not
if(file.exists(file = paste(output.folder, "/PresentTranscripts", ds, 
".Rdata", sep = ""))){
 load(file = paste(output.folder, "/PresentTranscripts", ds, ".Rdata", sep = 
""))
} else {
 if(!exists("exon.intensities")){
  load(file= paste(output.folder, "CoreExonIntensities.Rdata", sep = "/"))
  exon.intensities <- exFit[exFit$groupName %in% non.crosshyb.probesets, -c(
3:5)]
  rm("exFit")
  gc()
 }

 # ** user-defined criteria for absent transcripts:
 # criterion 1: half or more of probesets of transcript present in sample
 # --> transcript present in sample
 core.transcripts <- unique(exon.intensities$unitName)
 # create a list of transcript clusters present/absent in each sample
 present.genes    <- lapply(core.transcripts, FUN = function(gene){
   apply(log2(exon.intensities[exon.intensities$unitName == gene, -c(1:2)]) 
< 3, 2, sum)/
   length(exon.intensities[exon.intensities$unitName == gene,]$groupName) < 
0.5
 })# FALSE: gene not present in sample
 names(present.genes) <- core.transcripts 
 present.genes <- do.call(rbind, present.genes) # convert to logical matrix 

 # criterion 2: transcript present in at least half of the samples of a 
group 
 # --> transcript present in the group
 present.genes.in.group <- lapply(groups, FUN = function(group){
   if(length(group) > 1){
     apply(present.genes[ , group], 1, sum)/length(group) >= 0.5
   } else {
     present.genes[ , group]
   }
 }) 
 present.genes.in.group <- do.call(rbind, present.genes.in.group) # logical 
matrix
 present.genes.in.group <- t(present.genes.in.group)
 # keep genes only present in at least two groups
 transcripts.to.keep <- apply(present.genes.in.group, 1, TwoOrMoreTrue)
 transcripts.to.keep <- names(transcripts.to.keep[transcripts.to.keep == 
TRUE])
 save(transcripts.to.keep, file = paste(output.folder, "/PresentTranscripts"
, ds, ".Rdata", sep = ""))
 rm("exon.intensities")
 gc()
}
 n.present.transcript.clusters <- length(transcripts.to.keep)


-- 
-- 
When reporting problems on aroma.affymetrix, make sure 1) to run the latest 
version of the package, 2) to report the output of sessionInfo() and 
traceback(), and 3) to post a complete code example.


You received this message because you are subscribed to the Google Groups 
"aroma.affymetrix" group with website http://www.aroma-project.org/.
To post to this group, send email to aroma-affymetrix@googlegroups.com
To unsubscribe and other options, go to http://www.aroma-project.org/forum/

--- 
You received this message because you are subscribed to the Google Groups 
"aroma.affymetrix" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to aroma-affymetrix+unsubscr...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.

Reply via email to