This is an automated email from the git hooks/post-receive script. albac-guest pushed a commit to branch master in repository r-cran-fastmatch.
commit 4d6bc6c3e4955932663bc404ffaf647f9d666b2b Author: Alba Crespi <[email protected]> Date: Thu May 14 01:00:00 2015 +0100 Imported Upstream version 1.0-4 --- DESCRIPTION | 15 +++ MD5 | 7 ++ NAMESPACE | 3 + NEWS | 32 +++++ R/fastmatch.R | 2 + R/match.hash.R | 7 ++ man/fmatch.Rd | 120 +++++++++++++++++++ src/fastmatch.c | 355 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 541 insertions(+) diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..a9e672e --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,15 @@ +Package: fastmatch +Version: 1.0-4 +Title: Fast match() function +Author: Simon Urbanek <[email protected]> +Maintainer: Simon Urbanek <[email protected]> +Description: Package providing a fast match() replacement for cases + that require repeated look-ups. It is slightly faster that R's + built-in match() function on first match against a table, but + extremely fast on any subsequent lookup as it keeps the hash + table in memory. +License: GPL-2 +URL: http://www.rforge.net/fastmatch +Packaged: 2012-01-21 10:09:18 UTC; svnuser +Repository: CRAN +Date/Publication: 2012-01-21 10:22:24 diff --git a/MD5 b/MD5 new file mode 100644 index 0000000..28aa699 --- /dev/null +++ b/MD5 @@ -0,0 +1,7 @@ +89f00fff119030016fece98c08b5040b *DESCRIPTION +7dd3c164abc64183f0681eaf7b85d73e *NAMESPACE +27e152f5450341fbb88d31cfbff45520 *NEWS +770a7b76ccff6f95d86152999543269b *R/fastmatch.R +ddc4a8e8795d9bc6be2c7d507b7e160b *R/match.hash.R +1cf3221f784b90ed613d2454cc00a727 *man/fmatch.Rd +632693d50dad9116f97f57578ee10502 *src/fastmatch.c diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..eceafca --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,3 @@ +useDynLib(fastmatch) +export(fmatch) +S3method(print, match.hash) diff --git a/NEWS b/NEWS new file mode 100644 index 0000000..13dcc71 --- /dev/null +++ b/NEWS @@ -0,0 +1,32 @@ + NEWS for fastmatch +-------------------- + +0.1-4 2012-01-12 + o some R functions (such as subset assignment like x[1] <- 2) + can create a new object (with possibly modified content) and + copy all attributes including the hash cache. If the original + object was used as a table in fmatch(), the hash cache will be + copied into the modified object and thus its cache will be + possibly out of sync with the object. fmatch() will now + identify such cases and discard the hash to prevent errorneous + results. + +0.1-3 2011-12-21 + o match() coerces POSIXlt objects into characters, but so far + fmatch() performed the match on the actual objects. + Now fmatch() coerces POSIXlt object into characters just like + match(), but note that you will lose the ability to perform + fast lookups if the table is a POSIXlt object -- please use + POSIXct objects (much more efficient) or use as.character() on + the POSIXlt object to create a table that you want to re-use. + +0.1-2 2011-09-14 + o bugfix: nomatch was ignored in the fastmatch implementation + (thanks to Enrico Schumann for reporting) + +0.1-1 2010-12-23 + o minor cleanups + +0.1-0 2010-12-23 + o initial release + diff --git a/R/fastmatch.R b/R/fastmatch.R new file mode 100644 index 0000000..fe32610 --- /dev/null +++ b/R/fastmatch.R @@ -0,0 +1,2 @@ +fmatch <- function(x, table, nomatch = NA_integer_, incomparables = NULL) + .Call("fmatch", x, table, nomatch, incomparables, PACKAGE = "fastmatch") diff --git a/R/match.hash.R b/R/match.hash.R new file mode 100644 index 0000000..be1c08a --- /dev/null +++ b/R/match.hash.R @@ -0,0 +1,7 @@ +# match.hash is an infomal (S3) class representing the +# chain of hash tables stored in the .match.hash attribute +# of tables that have been hashed + +# we provide a (sort of dummy) print method so +# the output is not as ugly +print.match.hash <- function(x, ...) { cat("<hash table>\n"); x } diff --git a/man/fmatch.Rd b/man/fmatch.Rd new file mode 100644 index 0000000..03eef29 --- /dev/null +++ b/man/fmatch.Rd @@ -0,0 +1,120 @@ +\name{fmatch} +\alias{fmatch} +\alias{fastmatch} +\title{ +Fast match() replacement +} +\description{ +\code{fmatch} is a faster version of the built-in \code{\link{match}}() +function. It is slightly faster than the built-in version because it +uses more specialized code, but in addition it retains the hash table +within the table object such that it can be re-used, dramatically reducing +the look-up time especially for large tables. + +Although \code{fmatch} can be used separately, in general it is also +safe to use: \code{match <- fmatch} since it is a drop-in +replacement. Any cases not directly handled by \code{fmatch} are passed +to \code{match} with a warning. +} +\usage{ +fmatch(x, table, nomatch = NA_integer_, incomparables = NULL) +} +\arguments{ + \item{x}{values to be matched} + \item{table}{values to be matched against} + \item{nomatch}{the value to be returned in the case when no match is + found. It is coerced to \code{integer}.} + \item{incomparables}{a vector of values that cannot be matched. Any + value other than \code{NULL} will result in a fall-back to + \code{match} without any speed gains.} +} +\details{ + See \code{\link{match}} for the purpose and details of the + \code{match} function. \code{fmatch} is a drop-in replacement for + the \code{match} function with the focus on + performance. \code{incomparables} are not supported by \code{fmatch} + and will be passed down to \code{match}. + + The first match against a table results in a hash table to be computed + from the table. This table is then attached as the `.match.hash` + attribute of the table so that it can be re-used on subsequent calls + to \code{fmatch} with the same table. + + The hashing algorithm used is the same as the \code{match} function in + R, but it is re-implemented in a slight different way to improve its + performance at the cost of supporting only a subset of types (integer, + real and character). For any other types \code{fmatch} falls back to + \code{match} (with a warning). +} +\value{ + A vector of the same length as \code{x} - see \code{\link{match}} for + details. +} +%\references{ +%} +%\author{ +%% ~~who you are~~ +%} +\note{ +\code{fmatch} modifies the \code{table} by attaching an attribute to + it. It is expected that the values will not change unless that + attribute is dropped. Under normal circumstances this should not have + any effect from user's point of view, but there is a theoretical + chance of the cache being out of sync with the table in case the table + is modified directly (e.g. by some C code) without removing + attributes. + + Also \code{fmatch} does not convert to a common encoding so strings + with different representation in two encodings don't match. +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ +\seealso{ +\code{\link{match}} +} +\examples{ +# some random speed comparison examples: +# first use integer matching +x = as.integer(rnorm(1e6) * 1000000) +s = 1:100 +# the first call to fmatch is comparable to match +system.time(fmatch(s,x)) +# but the subsequent calls take no time! +system.time(fmatch(s,x)) +system.time(fmatch(-50:50,x)) +system.time(fmatch(-5000:5000,x)) +# here is the speed of match for comparison +system.time(base::match(s, x)) +# the results should be identical +identical(base::match(s, x), fmatch(s, x)) + +# next, match a factor against the table +# this will require both x and the factor +# to be cast to strings +s=factor(c("1","1","2","foo","3",NA)) +# because the casting will have to allocate a string +# cache in R, we run a dummy conversion to take +# that out of the equation +dummy = as.character(x) +# now we can run the speed tests +system.time(fmatch(s, x)) +system.time(fmatch(s, x)) +# the cache is still valid for string matches as well +system.time(fmatch(c("foo","bar","1","2"),x)) +# now back to match +system.time(base::match(s, x)) +identical(base::match(s, x), fmatch(s, x)) + +# finally, some reals to match +y = rnorm(1e6) +s = c(y[sample(length(y), 100)], 123.567, NA, NaN) +system.time(fmatch(s, y)) +system.time(fmatch(s, y)) +system.time(fmatch(s, y)) +system.time(base::match(s, y)) +identical(base::match(s, y), fmatch(s, y)) + +# this used to fail before 0.1-2 since nomatch was ignored +identical(base::match(4L, 1:3, nomatch=0), fmatch(4L, 1:3, nomatch=0)) +} +\keyword{manip} diff --git a/src/fastmatch.c b/src/fastmatch.c new file mode 100644 index 0000000..f92b3e3 --- /dev/null +++ b/src/fastmatch.c @@ -0,0 +1,355 @@ +/* + * fastmatch: fast implementation of match() in R using semi-permanent hash tables + * + * Copyright (C) 2010, 2011 Simon Urbanek + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +/* for speed (should not really matter in this case as most time is spent in the hashing) */ +#define USE_RINTERNALS 1 +#include <Rinternals.h> + +/* for malloc/free since we handle our hash table memory separately from R */ +#include <stdlib.h> +/* for hashing for pointers we need intptr_t */ +#include <stdint.h> + +/* match5 to fall-back to R's internal match for types we don't support */ +SEXP match5(SEXP itable, SEXP ix, int nmatch, SEXP incomp, SEXP env); + +/* ".match.hash" symbol - cached on first use */ +SEXP hs; + +typedef int hash_index_t; + +typedef struct hash { + int m, k, els, type; + void *src; + SEXP prot, parent; + struct hash *next; + hash_index_t ix[1]; +} hash_t; + +/* create a new hash table with the given source and length. + we store only the index - values are picked from the source + so you must make sure the source is still alive when used */ +static hash_t *new_hash(void *src, hash_index_t len) { + hash_t *h; + hash_index_t m = 2, k = 1, desired = len * 2; /* we want a maximal load of 50% */ + while (m < desired) { m *= 2; k++; } + h = (hash_t*) calloc(1, sizeof(hash_t) + (sizeof(hash_index_t) * m)); + if (!h) Rf_error("unable to allocate %.2Mb for a hash table", (double) sizeof(hash_index_t) * (double) m / (1024.0 * 1024.0)); + h->m = m; + h->k = k; + h->src = src; + return h; +} + +/* free the hash table (and all chained hash tables as well) */ +static void free_hash(hash_t *h) { + if (h->next) free_hash(h->next); + if (h->prot) R_ReleaseObject(h->prot); + free(h); +} + +/* R finalized for the hash table object */ +static void hash_fin(SEXP ho) { + hash_t *h = (hash_t*) EXTPTR_PTR(ho); + if (h) free_hash(h); +} + +/* pi-hash fn */ +#define HASH(X) (3141592653U * ((unsigned int)(X)) >> (32 - h->k)) + +/* add the integer value at index i (0-based!) to the hash */ +static void add_hash_int(hash_t *h, hash_index_t i) { + int *src = (int*) h->src; + int val = src[i++], addr; + addr = HASH(val); +#ifdef PROFILE_HASH + int oa = addr; +#endif + while (h->ix[addr] && src[h->ix[addr] - 1] != val) { + addr++; + if (addr == h->m) addr = 0; + } +#ifdef PROFILE_HASH + if (addr != oa) printf("%d: dist=%d (addr=%d, oa=%d)\n", val, addr - oa, addr, oa); +#endif + if (!h->ix[addr]) + h->ix[addr] = i; +} + +/* to avoid aliasing rules issues use a union */ +union dint_u { + double d; + unsigned int u[2]; +}; + +/* add the double value at index i (0-based!) to the hash */ +static void add_hash_real(hash_t *h, hash_index_t i) { + double *src = (double*) h->src; + union dint_u val; + int addr; + /* double is a bit tricky - we nave to nomalize 0.0, NA and NaN */ + val.d = (src[i] == 0.0) ? 0.0 : src[i]; + if (R_IsNA(val.d)) val.d = NA_REAL; + else if (R_IsNaN(val.d)) val.d = R_NaN; + addr = HASH(val.u[0]+ val.u[1]); +#ifdef PROFILE_HASH + int oa = addr; +#endif + while (h->ix[addr] && src[h->ix[addr] - 1] != val.d) { + addr++; + if (addr == h->m) addr = 0; + } +#ifdef PROFILE_HASH + if (addr != oa) printf("%g: dist=%d (addr=%d, oa=%d)\n", val.d, addr - oa, addr, oa); +#endif + if (!h->ix[addr]) + h->ix[addr] = i + 1; +} + +/* add the pointer value at index i (0-based!) to the hash */ +static void add_hash_ptr(hash_t *h, hash_index_t i) { + int addr; + void **src = (void**) h->src; + intptr_t val = (intptr_t) src[i++]; +#if (defined _LP64) || (defined __LP64__) || (defined WIN64) + addr = HASH((val & 0xffffffff) ^ (val >> 32)); +#else + addr = HASH(val); +#endif +#ifdef PROFILE_HASH + int oa = addr; +#endif + while (h->ix[addr] && (intptr_t) src[h->ix[addr] - 1] != val) { + addr++; + if (addr == h->m) addr = 0; + } +#ifdef PROFILE_HASH + if (addr != oa) printf("%p: dist=%d (addr=%d, oa=%d)\n", val, addr - oa, addr, oa); +#endif + if (!h->ix[addr]) + h->ix[addr] = i; +} + +/* NOTE: we are returning a 1-based index ! */ +static int get_hash_int(hash_t *h, int val, int nmv) { + int *src = (int*) h->src; + int addr; + addr = HASH(val); + while (h->ix[addr]) { + if (src[h->ix[addr] - 1] == val) + return h->ix[addr]; + addr ++; + if (addr == h->m) addr = 0; + } + return nmv; +} + +/* NOTE: we are returning a 1-based index ! */ +static int get_hash_real(hash_t *h, double val, int nmv) { + double *src = (double*) h->src; + int addr; + union dint_u val_u; + /* double is a bit tricky - we nave to normalize 0.0, NA and NaN */ + if (val == 0.0) val = 0.0; + if (R_IsNA(val)) val = NA_REAL; + else if (R_IsNaN(val)) val = R_NaN; + val_u.d = val; + addr = HASH(val_u.u[0] + val_u.u[1]); + while (h->ix[addr]) { + if (src[h->ix[addr] - 1] == val) + return h->ix[addr]; + addr++; + if (addr == h->m) addr = 0; + } + return nmv; +} + +/* NOTE: we are returning a 1-based index ! */ +static int get_hash_ptr(hash_t *h, void *val_ptr, int nmv) { + void **src = (void **) h->src; + intptr_t val = (intptr_t) val_ptr; + int addr; +#if (defined _LP64) || (defined __LP64__) || (defined WIN64) + addr = HASH((val & 0xffffffff) ^ (val >> 32)); +#else + addr = HASH(val); +#endif + while (h->ix[addr]) { + if ((intptr_t) src[h->ix[addr] - 1] == val) + return h->ix[addr]; + addr ++; + if (addr == h->m) addr = 0; + } + return nmv; +} + +static SEXP asCharacter(SEXP s, SEXP env) +{ + SEXP call, r; + PROTECT(call = lang2(install("as.character"), s)); + PROTECT(r = eval(call, env)); + UNPROTECT(2); + return r; +} + + +/* the only externally visible function to be called from R */ +SEXP fmatch(SEXP x, SEXP y, SEXP nonmatch, SEXP incomp) { + SEXP a; + SEXPTYPE type; + hash_t *h = 0; + int nmv = asInteger(nonmatch), n = LENGTH(x), np = 0, y_to_char = 0, y_factor = 0; + + /* edge-cases of 0 length */ + if (n == 0) return allocVector(INTSXP, 0); + if (LENGTH(y) == 0) { /* empty table -> vector full of nmv */ + int *ai; + a = allocVector(INTSXP, n); + ai = INTEGER(a); + for (np = 0; np < n; np++) ai[np] = nmv; + return a; + } + + /* if incomparables are used we fall back straight to match() */ + if (incomp != R_NilValue && !(isLogical(incomp) && LENGTH(incomp) == 1 && LOGICAL(incomp)[0] == 0)) { + Rf_warning("incomparables used in fmatch(), falling back to match()"); + return match5(y, x, nmv, incomp, R_BaseEnv); + } + + /* implicitly convert factors/POSIXlt to character */ + if (OBJECT(x)) { + if (inherits(x, "factor")) { + x = PROTECT(asCharacterFactor(x)); + np++; + } else if (inherits(x, "POSIXlt")) { + x = PROTECT(asCharacter(x, R_GlobalEnv)); /* FIXME: match() uses env properly - should we switch to .External ? */ + np++; + } + } + + /* for y we may need to do that later */ + y_factor = OBJECT(y) && inherits(y, "factor"); + y_to_char = y_factor || (OBJECT(y) && inherits(y, "POSIXlt")); + + /* coerce to common type - in the order of SEXP types */ + if(TYPEOF(x) >= STRSXP || TYPEOF(y) >= STRSXP) + type = STRSXP; + else + type = (TYPEOF(x) < TYPEOF(y)) ? TYPEOF(y) : TYPEOF(x); + + /* we only support INT/REAL/STR */ + if (type != INTSXP && type != REALSXP && type != STRSXP) { + Rf_warning("incompatible type, fastmatch() is falling back to match()"); + return match5(y, x, nmv, NULL, R_BaseEnv); + } + + if (y_to_char && type != STRSXP) /* y = factor -> character -> type must be STRSXP */ + type = STRSXP; + + /* coerce x - not y yet because we may get away with the existing cache */ + if (TYPEOF(x) != type) { + x = PROTECT(coerceVector(x, type)); + np++; + } + + /* find existing cache(s) */ + if (!hs) hs = Rf_install(".match.hash"); + a = Rf_getAttrib(y, hs); + if (a != R_NilValue) { /* if there is a cache, try to find the matching type */ + h = (hash_t*) EXTPTR_PTR(a); + /* could the object be out of sync ? If so, better remove the hash and ignore it */ + if (h->parent != y) { +#if HASH_VERBOSE + Rprintf(" - DISCARDING hash, its parent and the bearer don't match, taking no chances.\n"); +#endif + h = 0; + Rf_setAttrib(y, hs, R_NilValue); + } + while (h && h->type != type) h = h->next; + } + /* if there is no cache or not of the needed coerced type, create one */ + if (a == R_NilValue || !h) { + h = new_hash(DATAPTR(y), LENGTH(y)); + h->type = type; + h->parent = y; +#if HASH_VERBOSE + Rprintf(" - creating new hash for type %d\n", type); +#endif + if (a == R_NilValue) { /* if there is no cache attribute, create one */ + a = R_MakeExternalPtr(h, R_NilValue, R_NilValue); + Rf_setAttrib(y, hs, a); + Rf_setAttrib(a, R_ClassSymbol, Rf_mkString("match.hash")); + R_RegisterCFinalizer(a, hash_fin); + } else { /* otherwise append the new cache */ + hash_t *lh = (hash_t*) EXTPTR_PTR(a); + while (lh->next) lh = lh->next; + lh->next = h; +#if HASH_VERBOSE + Rprintf(" (appended to the cache list)\n"); +#endif + } + + if (TYPEOF(y) != type) { +#if HASH_VERBOSE + if (y_to_char) + Rprintf(" (need to convert table factor/POSIXlt to strings\n"); + else + Rprintf(" (need to coerce table to %d)\n", type); +#endif + y = y_to_char ? (y_factor ? asCharacterFactor(y) : asCharacter(y, R_GlobalEnv)) : coerceVector(y, type); + h->src = DATAPTR(y); /* this is ugly, but we need to adjust the source since we changed it */ + h->prot = y; /* since the coerced object is temporary, we let the hash table handle its life span */ + R_PreserveObject(y); + } + /* make sure y doesn't go away while we create the hash */ + /* R_PreserveObject(y); */ + /* spawn a thread to create the hash */ + /* nope - so far we do it serially */ + + { /* create the hash table */ + int i, n = LENGTH(y); + if (type == INTSXP) + for(i = 0; i < n; i++) + add_hash_int(h, i); + else if (type == REALSXP) + for(i = 0; i < n; i++) + add_hash_real(h, i); + else + for(i = 0; i < n; i++) + add_hash_ptr(h, i); + } + } + + { /* query the hash table */ + int i, n = LENGTH(x); + SEXP r = allocVector(INTSXP, n); + int *v = INTEGER(r); + if (type == INTSXP) { + int *k = INTEGER(x); + for (i = 0; i < n; i++) + v[i] = get_hash_int(h, k[i], nmv); + } else if (type == REALSXP) { + double *k = REAL(x); + for (i = 0; i < n; i++) + v[i] = get_hash_real(h, k[i], nmv); + } else { + SEXP *k = (SEXP*) DATAPTR(x); + for (i = 0; i < n; i++) + v[i] = get_hash_ptr(h, k[i], nmv); + } + if (np) UNPROTECT(np); + return r; + } +} -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/r-cran-fastmatch.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit
