Dear srinivas,
 
You can try using trigrams, a special case of N-grams, often used in Natural 
Language Processing.
 
> I am interested in grouping/cluster these names   as those which are
>similar  letter to letter.  Are there any text clustering algorithm in R
>which can group names of similar type in to segments of exactly matching ,
>90% matching, 80% matching,....etc.
 
As an example:
 
# suppose we have a list of locations
# (here it is a matrix; the second column is only used to draw the sample,
# it is not otherwise relevant)
 
# Locations with spelling errors, plus the weight used when sampling them.
# Names and weights are combined into a single vector, so the weights end up
# stored as character strings. The matrix is filled column by column:
# column 1 = name, column 2 = weight, one row per location.
Poblacion_dist <- matrix(
  c(
    c("MADRIZ", "BARÇELONA", "BILAO", "SEVILA", "VALENÇIA", "CORUNA",
      "ALACANTE", "VALLADOLI", "SANTIAGO", "SAN SEBASTIAN", "CADIZ",
      "ZARAGOZA"),
    c(0.3, 0.25, 0.135, 0.1, 0.1, 0.025,
      0.025, 0.025, 0.01, 0.01, 0.01, 0.01)
  ),
  ncol = 2
)
 
# Correctly spelled locations, plus the weight used when sampling them.
# Names and weights are combined into a single vector, so the weights end up
# stored as character strings. The matrix is filled column by column:
# column 1 = name, column 2 = weight, one row per location.
Poblacion <- matrix(
  c(
    c("MADRID", "BARCELONA", "BILBAO", "SEVILLA", "VALENCIA", "CORUÑA",
      "ALICANTE", "VALLADOLID", "SANTIAGO", "SAN_SEBASTIAN", "CADIZ",
      "ZARAGOZA"),
    c(0.3, 0.25, 0.135, 0.1, 0.1, 0.025,
      0.025, 0.025, 0.01, 0.01, 0.01, 0.01)
  ),
  ncol = 2
)
 
# Draw `cuantas_veces` names from the table `que`.
#   que:           matrix whose column 1 holds the candidate names and whose
#                  column 2 holds their sampling weights (converted with
#                  as.numeric()).
#   cuantas_veces: number of names to draw.
# `replace` is not passed, so sample() uses its default (no replacement).
muestrear <- function(que, cuantas_veces) {
  nombres <- que[, 1]
  pesos <- as.numeric(que[, 2])
  sample(x = nombres, size = cuantas_veces, prob = pesos)
}
 
# Build the sample: 10 rounds, each drawing one correctly spelled name and
# then one misspelled name (two independent draws per round).
# replicate() yields a 2 x 10 character matrix; as.vector() flattens it
# column by column, so we end up with a plain vector of 20 locations.
Provincias <- as.vector(
  replicate(10, c(muestrear(Poblacion, 1), muestrear(Poblacion_dist, 1)))
)
 
# next we need to process each location as a set of trigrams
# Split a single word into its overlapping trigrams (3-character windows).
#   word: a single character string.
# Returns a character vector with nchar(word) - 2 trigrams, in order of
# appearance, e.g. "MADRID" -> "MAD" "ADR" "DRI" "RID".
# A word shorter than three characters (or NA) has no trigram and yields
# character(0); without this guard the start positions would count down
# (1, 0, ...) and produce bogus entries.
word2trigram <- function(word) {
  n_chars <- nchar(word)
  if (is.na(n_chars) || n_chars < 3L) {
    return(character(0))
  }
  # substring() is vectorised over the start/stop positions, so all windows
  # are extracted in one call.
  starts <- seq_len(n_chars - 2L)
  substring(word, starts, starts + 2L)
}
# Split every sampled location into its trigrams: one list element per
# location, in the same order as Provincias.
Prov2trigram <- lapply(Provincias, word2trigram)

# every distinct trigram in the sample (factor levels, i.e. sorted and unique)
Trigrams <- levels(factor(unlist(Prov2trigram)))

# Occurrence matrix: one row per entry of Trigrams, one column per location;
# cell [t, i] is how many times trigram t appears in location i.
ocrrnc.mtrx <- matrix(0, nrow = length(Trigrams), ncol = length(Prov2trigram))
# seq_along() rather than 1:n, so an empty sample skips the loop instead of
# indexing a non-existent first element.
for (i in seq_along(Prov2trigram)) {
  # match() maps each trigram to its row in Trigrams and tabulate() counts
  # the hits per row, so the counts are aligned with Trigrams by construction.
  ocrrnc.mtrx[, i] <- tabulate(
    match(Prov2trigram[[i]], Trigrams),
    nbins = length(Trigrams)
  )
}
 
# calculate cosine (often used in NLP)
# Cosine similarity between the columns of X.
#   X: numeric matrix with one column per item to compare (in this script:
#      one column per location, one row per trigram).
# Returns a square ncol(X) x ncol(X) matrix without dimnames whose [i, j]
# entry is the cosine of the angle between columns i and j of X.
# Any pair involving an all-zero column would be 0/0 and is reported as 0.
# An X with zero columns returns a 0 x 0 matrix.
matrizCos <- function(X) {
  X <- as.matrix(X)
  # crossprod(X) is t(X) %*% X: all pairwise dot products between columns.
  productos <- crossprod(X)
  # The diagonal holds each column's squared length.
  modulo <- sqrt(diag(productos))
  cosen <- productos / outer(modulo, modulo)
  cosen[is.nan(cosen)] <- 0
  # crossprod() carries over column names; drop them so the result has the
  # same plain shape regardless of how X is labelled.
  unname(cosen)
}
# Pairwise cosine similarity between the sampled locations.
rslt.dst.mat <- matrizCos(ocrrnc.mtrx)

# Label rows and columns with the location names, convert similarity into a
# dissimilarity (1 - cosine) and draw the hierarchical clustering dendrogram.
dimnames(rslt.dst.mat) <- list(Provincias, Provincias)
plot(
  hclust(
    as.dist(1 - rslt.dst.mat),
    method = "med"
  )
)
 
I hope this helps,
Eduardo San Miguel Martin

        [[alternative HTML version deleted]]

______________________________________________
R-help@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.

Reply via email to