Author: Armin Rigo <ar...@tunes.org> Branch: Changeset: r89771:134351c66421 Date: 2017-01-25 20:51 +0100 http://bitbucket.org/pypy/pypy/changeset/134351c66421/
Log: Implement Siphash-2-4, the same hashing function as CPython 3.x. Disabled by default. diff --git a/rpython/rlib/objectmodel.py b/rpython/rlib/objectmodel.py --- a/rpython/rlib/objectmodel.py +++ b/rpython/rlib/objectmodel.py @@ -520,10 +520,22 @@ # ---------- HASH_ALGORITHM = "rpython" # XXX Is there a better name? +HASH_ALGORITHM_FIXED = False -def _hash_string(s): - """The algorithm behind compute_hash() for a string or a unicode.""" +@not_rpython +def set_hash_algorithm(algo): + """Must be called very early, before any string is hashed with + compute_hash()!""" + global HASH_ALGORITHM + if HASH_ALGORITHM != algo: + assert not HASH_ALGORITHM_FIXED, "compute_hash() already called!" + assert algo in ("rpython", "siphash24") + HASH_ALGORITHM = algo + + +def _hash_string_rpython(s): from rpython.rlib.rarithmetic import intmask + length = len(s) if length == 0: return -1 @@ -535,6 +547,83 @@ x ^= length return intmask(x) + +@not_rpython +def _hash_string_siphash24(s): + """This version is called when untranslated only.""" + import array + from rpython.rlib.rsiphash import siphash24 + from rpython.rtyper.lltypesystem import lltype, rffi + from rpython.rlib.rarithmetic import intmask + + if isinstance(s, str): + pass + elif isinstance(s, unicode): + if rffi.sizeof(lltype.UniChar) == 4: + kind = "I" + else: + kind = "H" + s = array.array(kind, map(ord, s)).tostring() + else: + if lltype.typeOf(s).TO.chars.OF == lltype.Char: + kind = "B" + elif rffi.sizeof(lltype.UniChar) == 4: + kind = "I" + else: + kind = "H" + s = array.array(kind, map(ord, s.chars)).tostring() + ptr = rffi.str2charp(s) + x = siphash24(ptr, len(s)) + rffi.free_charp(ptr) + return intmask(x) + +def ll_hash_string_siphash24(ll_s): + """Called from lltypesystem/rstr.py. 'll_s' is a rstr.STR or UNICODE.""" + from rpython.rlib.rsiphash import siphash24 + from rpython.rtyper.lltypesystem import lltype, rffi, rstr + from rpython.rlib.rarithmetic import intmask + + length = len(ll_s.chars) + # no GC operation from here! + if lltype.typeOf(ll_s).TO.chars.OF == lltype.Char: + addr = rstr._get_raw_buf_string(rstr.STR, ll_s, 0) + else: + addr = rstr._get_raw_buf_unicode(rstr.UNICODE, ll_s, 0) + length *= rffi.sizeof(rstr.UNICODE.chars.OF) + x = siphash24(addr, length) + keepalive_until_here(ll_s) + return intmask(x) +ll_hash_string_siphash24._jit_look_inside_ = False + + +@not_rpython +def _hash_string(s): + """The algorithm behind compute_hash() for a string or a unicode. + This version is only for untranslated usage, and 's' is a str or unicode. + """ + global HASH_ALGORITHM_FIXED + HASH_ALGORITHM_FIXED = True + if HASH_ALGORITHM == "rpython": + return _hash_string_rpython(s) + if HASH_ALGORITHM == "siphash24": + return _hash_string_siphash24(s) + raise NotImplementedError + +def ll_hash_string(ll_s): + """The algorithm behind compute_hash() for a string or a unicode. + This version is called from lltypesystem/rstr.py, and 'll_s' is a + rstr.STR or rstr.UNICODE. + """ + if HASH_ALGORITHM == "rpython": + return _hash_string_rpython(ll_s.chars) + if HASH_ALGORITHM == "siphash24": + if we_are_translated(): + return ll_hash_string_siphash24(ll_s) + else: + return _hash_string_siphash24(ll_s) + raise NotImplementedError + + def _hash_float(f): """The algorithm behind compute_hash() for a float. This implementation is identical to the CPython implementation, diff --git a/rpython/rlib/rsiphash.py b/rpython/rlib/rsiphash.py --- a/rpython/rlib/rsiphash.py +++ b/rpython/rlib/rsiphash.py @@ -2,6 +2,7 @@ from contextlib import contextmanager from rpython.rlib import rarithmetic from rpython.rlib.objectmodel import not_rpython, always_inline +from rpython.rlib.rgc import no_collect from rpython.rlib.rarithmetic import r_uint64 from rpython.rlib.rawstorage import misaligned_is_fine from rpython.rtyper.lltypesystem import lltype, llmemory, rffi @@ -73,11 +74,11 @@ return v0, v1, v2, v3 -def siphash24(ptr, size): - """Takes a CCHARP pointer and a size. Returns the hash as a r_uint64, +@no_collect +def siphash24(addr_in, size): + """Takes an address pointer and a size. Returns the hash as a r_uint64, which can then be casted to the expected type.""" - addr_in = llmemory.cast_ptr_to_adr(ptr) direct = (misaligned_is_fine or (rffi.cast(lltype.Signed, addr_in) & 7) == 0) diff --git a/rpython/rlib/test/test_rsiphash.py b/rpython/rlib/test/test_rsiphash.py --- a/rpython/rlib/test/test_rsiphash.py +++ b/rpython/rlib/test/test_rsiphash.py @@ -1,5 +1,5 @@ from rpython.rlib.rsiphash import siphash24, choosen_seed -from rpython.rtyper.lltypesystem import rffi +from rpython.rtyper.lltypesystem import llmemory, rffi CASES = [ @@ -32,8 +32,8 @@ q = rffi.str2charp('?' + s) with choosen_seed(0x8a9f065a358479f4, 0x11cb1e9ee7f40e1f, test_misaligned_path=True): - x = siphash24(p, len(s)) - y = siphash24(rffi.ptradd(q, 1), len(s)) + x = siphash24(llmemory.cast_ptr_to_adr(p), len(s)) + y = siphash24(llmemory.cast_ptr_to_adr(rffi.ptradd(q, 1)), len(s)) rffi.free_charp(p) rffi.free_charp(q) assert x == y diff --git a/rpython/rtyper/lltypesystem/rbytearray.py b/rpython/rtyper/lltypesystem/rbytearray.py --- a/rpython/rtyper/lltypesystem/rbytearray.py +++ b/rpython/rtyper/lltypesystem/rbytearray.py @@ -8,10 +8,10 @@ def mallocbytearray(size): return lltype.malloc(BYTEARRAY, size) -_, _, copy_bytearray_contents = rstr._new_copy_contents_fun(BYTEARRAY, BYTEARRAY, +_, _, copy_bytearray_contents, _ = rstr._new_copy_contents_fun(BYTEARRAY, BYTEARRAY, lltype.Char, 'bytearray') -_, _, copy_bytearray_contents_from_str = rstr._new_copy_contents_fun(rstr.STR, +_, _, copy_bytearray_contents_from_str, _ = rstr._new_copy_contents_fun(rstr.STR, BYTEARRAY, lltype.Char, 'bytearray_from_str') diff --git a/rpython/rtyper/lltypesystem/rstr.py b/rpython/rtyper/lltypesystem/rstr.py --- a/rpython/rtyper/lltypesystem/rstr.py +++ b/rpython/rtyper/lltypesystem/rstr.py @@ -3,7 +3,7 @@ from rpython.annotator import model as annmodel from rpython.rlib import jit, types from rpython.rlib.objectmodel import (malloc_zero_filled, we_are_translated, - _hash_string, keepalive_until_here, specialize, enforceargs) + ll_hash_string, keepalive_until_here, specialize, enforceargs) from rpython.rlib.signature import signature from rpython.rlib.rarithmetic import ovfcheck from rpython.rtyper.error import TyperError @@ -136,15 +136,19 @@ copy_raw_to_string = func_with_new_name(copy_raw_to_string, 'copy_raw_to_%s' % name) - return copy_string_to_raw, copy_raw_to_string, copy_string_contents + return (copy_string_to_raw, copy_raw_to_string, copy_string_contents, + _get_raw_buf) (copy_string_to_raw, copy_raw_to_string, - copy_string_contents) = _new_copy_contents_fun(STR, STR, Char, 'string') + copy_string_contents, + _get_raw_buf_string) = _new_copy_contents_fun(STR, STR, Char, 'string') (copy_unicode_to_raw, copy_raw_to_unicode, - copy_unicode_contents) = _new_copy_contents_fun(UNICODE, UNICODE, UniChar, 'unicode') + copy_unicode_contents, + _get_raw_buf_unicode) = _new_copy_contents_fun(UNICODE, UNICODE, UniChar, + 'unicode') CONST_STR_CACHE = WeakValueDictionary() CONST_UNICODE_CACHE = WeakValueDictionary() @@ -382,7 +386,7 @@ # but our malloc initializes the memory to zero, so we use zero as the # special non-computed-yet value. Also, jit.conditional_call_elidable # always checks for zero, for now. - x = _hash_string(s.chars) + x = ll_hash_string(s) if x == 0: x = 29872897 s.hash = x diff --git a/rpython/translator/c/test/test_typed.py b/rpython/translator/c/test/test_typed.py --- a/rpython/translator/c/test/test_typed.py +++ b/rpython/translator/c/test/test_typed.py @@ -1,8 +1,12 @@ from __future__ import with_statement import math -import sys +import sys, os +if __name__ == '__main__': + # hack for test_hash_string_siphash24() + sys.path.insert(0, os.path.join(os.path.dirname(__file__), + '..', '..', '..', '..')) import py from rpython.rlib.rstackovf import StackOverflow @@ -597,6 +601,40 @@ assert res[3] == compute_hash(d) assert res[4] == compute_hash(("Hi", None, (7.5, 2, d))) + def _test_hash_string(self, algo): + from rpython.rlib import objectmodel + objectmodel.set_hash_algorithm(algo) + s = "hello" + u = u"world" + hash_s = compute_hash(s) + hash_u = compute_hash(u) + # + def fn(length): + assert length >= 1 + return str((compute_hash(s), + compute_hash(u), + compute_hash(s[0] + s[1:length]), + compute_hash(u[0] + u[1:length]))) + + assert fn(5) == str((hash_s, hash_u, hash_s, hash_u)) + + f = self.getcompiled(fn, [int]) + res = f(5) + res = [int(a) for a in res[1:-1].split(",")] + assert res[0] == hash_s + assert res[1] == hash_u + assert res[2] == hash_s + assert res[3] == hash_u + + def test_hash_string_rpython(self): + self._test_hash_string("rpython") + + def test_hash_string_siphash24(self): + import subprocess + subprocess.check_call([sys.executable, __file__, "siphash24", + self.__class__.__module__, + self.__class__.__name__]) + def test_list_basic_ops(self): def list_basic_ops(i, j): l = [1, 2, 3] @@ -896,3 +934,11 @@ f = self.getcompiled(func, [int]) res = f(2) assert res == 1 # and not 2 + + +if __name__ == '__main__': + # for test_hash_string_siphash24() + algo, clsmodule, clsname = sys.argv[1:] + mod = __import__(clsmodule, None, None, [clsname]) + cls = getattr(mod, clsname) + cls()._test_hash_string(algo) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit