This code performs the same operation in about 1/10th the time on my 
machine.
Give it a try.

# Flag which events are subsets of event `i`.
#
# Args:
#   i:   column index of the reference event in `mat`.
#   mat: person-by-event attendance matrix (rows = individuals, columns =
#        events); an entry is positive/TRUE when the person attended.
#        Defaults to an object named `m` visible from where `look` was
#        defined, which is what the one-argument form relied on before.
#
# Returns:
#   A logical vector with one element per event (column of `mat`): TRUE when
#   every attendee of that event also attended event `i`.
look <- function(i, mat = m) {
  # Event j is a subset of event i when no row has mat[, j] > mat[, i].
  # mat[, i] is recycled down each column, so this compares row by row.
  colSums(mat > mat[, i]) == 0
}

# Drop events whose attendees are a subset of another event's attendees.
#
# Args:
#   df: data frame with columns `A` (event id) and `B` (individual id).
#
# Returns:
#   The rows of `df` belonging to events that are not contained in any other
#   event. When several events have exactly the same attendees, the first one
#   (in the sorted order used by table()) is kept and the others are dropped.
nosubsets <- function(df) {
  # Presence/absence matrix; "> 0" keeps duplicated (A, B) rows from
  # distorting the subset test.
  m <- unclass(table(df$B, df$A)) > 0
  nevents <- ncol(m)
  if (nevents == 0L) {
    return(df[integer(0), , drop = FALSE])
  }

  # found[j, i] is TRUE when event j is a subset of event i. matrix() keeps
  # the result two-dimensional even when there is only one event.
  found <- matrix(
    vapply(seq_len(nevents), look, logical(nevents), mat = m),
    nrow = nevents,
    ncol = nevents
  )
  diag(found) <- FALSE

  # Identical events are subsets of each other; clear the flag on the earlier
  # one of each such pair so that exactly one copy survives.
  mutual <- found & t(found)
  found[mutual & upper.tri(found)] <- FALSE

  keep <- rowSums(found) == 0
  df[df$A %in% colnames(m)[keep], ]
}
# Example: apply the filter to the data frame DF built in the quoted
# message below.
nosubsets(DF)

Jean



mdvaan wrote on 03/13/2012 10:56:33 PM:

> Hi,
> 
> I have data on individuals (B) who participated in events (A). If ALL
> participants in an event are a subset of the participants in another
> event, I would like to remove the smaller event, and if the participants
> in one event are exactly the same as the participants in another event, I
> would like to remove one of the events (I don't care which one). The
> following example does that; however, it is extremely slow (and the true
> dataset is very large). What would be a more efficient way to solve the
> problem? I really appreciate your help. Thanks!
> 
> DF <- data.frame(read.table(textConnection("  A  B
> 12095    69832
> 12095    51750
> 12095    6734
> 18774    51750
> 18774    51733
> 18774    6734
> 18774    69833
> 19268    51750
> 19268    6734
> 19268    51733
> 19268    65251
> 5169    54441
> 5169    15480
> 5169    3228
> 5966    51733
> 5966    65251
> 5966    68197
> 5966    6734
> 5966    51750
> 5966    69833
> 7189    135523
> 7189    65251
> 7189    51733
> 7189    69833
> 7189    135522
> 7189    68197
> 7189    6734
> 7797    51750
> 7797    6734
> 7797    69833
> 7866    6734
> 7866    69833
> 7866    51733
> 8596    51733
> 8596    51750
> 8596    65251
> 8677    6734
> 8677    51750
> 8677    51733
> 8936    68197
> 8936    6734
> 8936    65251
> 8936    51733
> 9204    51750
> 9204    69833
> 9204    6734
> 9204    51733"),head=TRUE,stringsAsFactors=FALSE))
> 
> data <- unique(DF$A)
> for (m in 1:length(data))
>    {
>    for (m in 1:length(data))
>       {
>       tdata <- data[-m]
>       q <- 0
>       for (n in 1:length(tdata))
>          {
>          if (length(which(DF[DF$A == data[m], 2] %in% DF[DF$A == 
> tdata[n], 2] ==
> TRUE)) == length(DF[DF$A == data[m], 2]))
>             {
>             q <- q + 1
>             }
>          }
>       if (q > 0)
>          {
>          data <- data[-m]
>          m <- m - 1
>          }
>       }
>    }
> DF <- DF[DF$A %in% data,]

        [[alternative HTML version deleted]]

______________________________________________
[email protected] mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.

Reply via email to