Repository: lucy Updated Branches: refs/heads/0.3 a8807951e -> 07df6940e
Optimize and fix encoding of similarity values The previous code was a bit unclear as it extracted the LSB of the exponent as part of the mantissa. This did actually work because exponent and mantissa were stored next to each other like in the IEEE format. For the same reason, the code can be optimized to use a single shift and mask operation. The actual encoding does not change. More importantly, this commit fixes a bug where, due to a missing range check, values smaller than 2^-31 were encoded as 255 and subsequently decoded as approximately 2^31. Such small values are probably rare but this could cause significant errors when calculating similarities. Project: http://git-wip-us.apache.org/repos/asf/lucy/repo Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/3bbf12de Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/3bbf12de Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/3bbf12de Branch: refs/heads/0.3 Commit: 3bbf12de05a45e1641f2c72ce43a0e6e1f1a6050 Parents: a880795 Author: Nick Wellnhofer <[email protected]> Authored: Sun Oct 19 17:50:25 2014 +0200 Committer: Nick Wellnhofer <[email protected]> Committed: Sun Oct 19 18:25:19 2014 +0200 ---------------------------------------------------------------------- core/Lucy/Index/Similarity.c | 26 +++++++++++++++++++------- core/Lucy/Index/Similarity.cfh | 2 +- perl/t/504-similarity.t | 7 ++++++- 3 files changed, 26 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucy/blob/3bbf12de/core/Lucy/Index/Similarity.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Index/Similarity.c b/core/Lucy/Index/Similarity.c index bf7e7ef..27794ed 100644 --- a/core/Lucy/Index/Similarity.c +++ b/core/Lucy/Index/Similarity.c @@ -30,6 +30,12 @@ #include "Lucy/Store/InStream.h" #include "Lucy/Store/OutStream.h" +// The exponent range [-31;32] is mapped to [0;63]. Values outside +// of the range are clamped resulting in 6 bits for the exponent. +// The IEEE bias is 127, so we have to subtract 127 and add 31 to +// the upper bits. +#define EXP_OFFSET ((127 - 31) << 2) + Similarity* Sim_new() { Similarity *self = (Similarity*)VTable_Make_Obj(SIMILARITY); @@ -158,15 +164,21 @@ Sim_encode_norm(Similarity *self, float f) { } else { const uint32_t bits = *(uint32_t*)&f; - uint32_t mantissa = (bits & 0xffffff) >> 21; - uint32_t exponent = (((bits >> 24) & 0x7f) - 63) + 15; - if (exponent > 31) { - exponent = 31; - mantissa = 7; - } + // The normalized value contains two bits of mantissa (excluding + // the implicit leading bit) in the least significant bits and the + // exponent in the upper bits. + norm = (bits >> 21) & 0x3ff; - norm = (exponent << 3) | mantissa; + if (norm <= EXP_OFFSET) { + norm = 0; + } + else { + norm -= EXP_OFFSET; + if (norm > 255) { + norm = 255; + } + } } return norm; http://git-wip-us.apache.org/repos/asf/lucy/blob/3bbf12de/core/Lucy/Index/Similarity.cfh ---------------------------------------------------------------------- diff --git a/core/Lucy/Index/Similarity.cfh b/core/Lucy/Index/Similarity.cfh index e49021c..934a5d4 100644 --- a/core/Lucy/Index/Similarity.cfh +++ b/core/Lucy/Index/Similarity.cfh @@ -106,7 +106,7 @@ class Lucy::Index::Similarity cnick Sim Query_Norm(Similarity *self, float sum_of_squared_weights); /** encode_norm and decode_norm encode and decode between 32-bit IEEE - * floating point numbers and a 5-bit exponent, 3-bit mantissa float. The + * floating point numbers and a 6-bit exponent, 3-bit mantissa float. The * range covered by the single-byte encoding is 7x10^9 to 2x10^-9. The * accuracy is about one significant decimal digit. */ http://git-wip-us.apache.org/repos/asf/lucy/blob/3bbf12de/perl/t/504-similarity.t ---------------------------------------------------------------------- diff --git a/perl/t/504-similarity.t b/perl/t/504-similarity.t index 3383419..c296be1 100644 --- a/perl/t/504-similarity.t +++ b/perl/t/504-similarity.t @@ -38,7 +38,7 @@ sub new { } package main; -use Test::More tests => 9; +use Test::More tests => 10; use Lucy::Test; use bytes; no bytes; @@ -79,6 +79,11 @@ for ( 0 .. 255 ) { is_deeply( \@transformed, \@floats, "using the norm_decoder produces desired results" ); +my $small_encoded = $sim->encode_norm(1e-30); +my $large_encoded = $sim->encode_norm(1e30); +ok( $small_encoded != $large_encoded, + "extremely small and large values are encoded differently" ); + my $folder = Lucy::Store::RAMFolder->new; my $indexer = Lucy::Index::Indexer->new( index => $folder,
