#105: BibRank has strange overflow errors
----------------------+-----------------------------------------------------
 Reporter:  jblayloc  |       Owner:     
     Type:  defect    |      Status:  new
 Priority:  critical  |   Milestone:     
Component:  BibRank   |     Version:     
 Keywords:            |  
----------------------+-----------------------------------------------------
 Cf. RT Ticket #84756 and the many that preceded it today.

 'Error when analysing the record 813951 (((353165,
 \'x\\x9cmV\\xd9\\x96\\xdb
 
\\x0cM\\xdb\\xd9\\xa7\\xed\\xbc\\xf5\\x1b\\xfa\\xd8_\\x92\\xb1\\x1c\\xeb\\x04\\x83\\x87\\xc5\\x99\\xb4?_\\x84\\xb71\\xe8!\\x1c\\xdb\\x8a\\x90tuu\\xe1\\x9f\\x7f>\\x9dN\\x0eG\\x87\\x1eM\\xf8\\xfd5\\xbdQ^\\xd2\\xef\\xe4\\x1f\\xd3\\xa2\\xb1\\xc1\\x16\\xa7\\xd2t\\x9f\\x96s\\xf4\\xab\\xe1\\xdbfxH\\x8bm\\x02\\x90)]\\x9e\\xd22\\x80\\xa1\\xce\\xeav\\xb6}9\\xd8\\x8c5\\xda*\\xd0\\xb3\\xedq\\xb3\\xddq\\x16\\x10\\xb0\\xf4\\xe1H\\x1d
 [...]

 and then further down

 >>>> Frame update_rnkWORD in /usr/lib/python2.4/site-
 packages/invenio/bibrank_word_indexer.py at line 1125
 *******************************************************************************
       1122                         (serialize_via_marshal(doc_terms), j))
       1123             except (ZeroDivisionError, OverflowError), e:
       1124                 ## This is to try to isolate division by zero
 errors.
 ***** 1125                 register_exception(prefix="Error when analysing
 the record %s (%s): %s\n" % (j, repr(docs_terms), e), alert_admin=True)
       1126         write_message("Phase 4: ......processed %s/%s records"
 % ((i+5000>len(records) and len(records) or (i+5000)), len(records)))
       1127         i += 5000
       1128     write_message("Phase 4: Finished calculating normalization
 value for all affected records and updating %sR" % table[:-1])
 *******************************************************************************
                        terms =  ['circuitri', 'orthogon', 'sichtermann',
 'four', 'francesco', 'fouz', 'adiguzel', 'gabriella', 'kittikul',
 'cyprus', 'acciari', 'grenier', 'yanagisawa', 'accret', 'nurushev',
 'semitranspar', 'bueltmann', 'sputter', 'cinabro', 'petran', 'digit',
 'liuti', 'dierlamm', 'dell', 'bratislava', 'lumin', 'ioana', 'gyuk',
 'disturb', 'guertin', 'nrqed', 'nissim', 'mudrinic', 'delaere',
 'paschalis', 'vanini', 'semileptonic', 'musella', 'miller', 'bacon',
 'zeuner', 'pulse', 'setare', 'second', 'unparticle',  [...]
                     termlist =
 
'x\x9cmU\xdbv\x1a1\x0c\xa4IJ!\xcd\xe57\xfa\xd8_\x12\xb6\xb3\xab\x83\xd7r|!\xd9\xf4\xe7c\x19\x16\xba\xb2\x1f\xf0\xf1a\x90,\x8dF\xc3\xbf\xb8\xddl6\x1a\'\xe3\xe2\x9f\xbbr\xc5\x1f|\x94\xcf&\xfe,\xc7h\xe04\x9f\x81_W\xe0\x91\x8f\x14P\x1d-8}F\xef\xaf\xe8\x03\x1f#}\xf4\xd2Q\xd0&\x9c\x81\xbb\x15\x10\xc9\xe6$#v\xe5\x00\xef\x03}\xe2$1\xaea0)\xcd\xf1\x90\xc3
 
S\xfe\xbe6\x85\xe4\xc0\xca\x06\xb8DE\xd6\xca0\xfe\x85\x0f&\xc6\x1cz\xd5\x0f6\x93\x93\xb9j\xf5\xc9\x04\'\x93U\x96\x14\xa4\x82\xa1\x1bd\xd8\xae\x92;\x8dd
 [...]
                   terms_docs =  (('feu',
 
'x\x9c\xab\xce\xd4\xd9\xc1\xce\xa0\xc1\xc4\xc0\xc0\x90\xc9\x03"\xa4A\xc4\xc4C\x9cP1.\x10!\x01"\xa6\xae\xe1@\x16c\x04\x11s\x0c\xd9\x90\xc5\x84@\xc4\xbc~Fd1)\x10\xa1(\xc3\x82,&\x0b"V3\xa2\xa8\x93\x07\x11:\xcd\xac\xc8n\x01\xdb\xbb\xf1\'7\xb2:E\x10\xb1\xc5\x1dEL\x10Dl\xbd\xca\x03\x15c\x84\x13G\xa2Q\xfc\x06\x16s\x13fB\xd6+\t"<\xd2an\x11\x01\x112
 
\xc2\xab\x0fS\xddYA^\x0c;\xce\xd7\xb3a\xd8qs+\x17\xb2\x18XoB-+F\x18<-c\xc2\x08\x83w\xef\xb9\x90\xed\x10\x05\x11E7Q\xc2\n\x1cG\xc5\xffQ\xd4\x81\xc3\x
 [...]
                            e =  <exceptions.OverflowError instance at
 0x2aaac1628ef0>
                    doc_terms =  {'dimens': (1,
 
-9999999999999996446694976155709487744223436895164571082730835896328150429530154104129207016657051441328451345835071730730887895404954955808768L),
 'heavy': (7,
 
-59999999999999985444094529596080590941338053162300706020621731468494338297707397872495298029879999131306709294177224944938608812468731904L),
 'strickland': (3, 47010101010101005402208072838712131585L), 'show': (1,
 
-17999999999999987105606361484617250802693942325943798019766109201320572717342760321913882460547638594451198115
 [...]
                            i =  0
                    term_docs =  {843780: (18, 1), 745480: (22, 1), 376841:
 (12, 1), 376843: (12, 1), 262158: (2, 41), 376848: (12, 1), 376849: (12,
 1), 376850: (12, 1), 376851: (12, 1), 376852: (12, 1), 376855: (12, 1),
 294936: (4, 31), 376857: (12, 1), 376858: (12, 1), 303131: (4, 36),
 557084: (8, 1), 376863: (12, 1), 811044: (20, 1), 786472: (20, 1), 376874:
 (12, 1), 376875: (12, 1), 466988: (4, 34), 565293: (4, 30), 475191: (2,
 33), 671801: (2, 32), 245821: (2, 40), 213058: (6, 36), 114755: (2, 48),
 450629: (2, 22), 254031: [...]
                            j =  813951
                      hitlist =
 
'x\x9cu}\x07T\x94\xc7\xf7\xf6\x02\xcb\xb2KS\xc4\x86\x82\x82\r\xbbbE4\x82%\xf6\xde\x8d\xbd\x83
 
\x86\x9f\xa8\x80\x8a\x8a\r{\xc1\x8a\nj\xec\x8ab\xef5\xb6$b\xef-\x06\xc5\x82\xbdD\x13\x15\xfc\x7f\xef\xdd\xb0\xecs\x9e\xf9<\xc79\x9e\xeb\xcc;3\xf7>\xb7\xce\xbc\xef\x8e\t\xd1?t\xd6\x95\xb5\xd5\xe9t!\xf9\xb4\xc6Fk\x8c}\x9d\xfe\xa3\x15\xb4\xd0L\xc7\xed\xff\xa39[hN\n\x9a\xabN\xff\x1fM\x9arZ\x93W\xd1\xcfMA\xcb\xa7\xa0\xb9+h\xf9\x15\xb4B\nZ\xe1\t9k\xd1kMq\xad\xf1P\xf4+\xa2\xa0\x15]\x05cKi\x8d\xe7\x04\xe3\x7f4\
 [...]
                           Nj =  {851972: 0, 851975: 0, 729092: 1, 852016:
 0, 839748: 0, 841853: 0, 837758: 0, 837773: 0, 854382: 0, 837791: 0,
 850634: 0, 848067: 0, 856273: 0, 856289: 0, 856299: 0, 856302: 0, 817391:
 0, 856305: 0, 856314: 0, 856322: 0, 856334: 0, 856336: 0, 856339: 0,
 678207: 6.3276179911890414e+303, 819527: 0, 819540: 0, 856405: 0, 856408:
 0, 856416: 0, 856423: 0, 819560: 0, 819566: 0, 819580: 0, 856453: 0,
 856454: 0, 856456: 0, 856465: 0, 844182: 0, 856475: 0, 819625: 0, 819630:
 0, 819631: 0, 819633: 0, 8360 [...]
                            N =  852008L
                   docs_terms =  ((353165, 'x\x9cmV\xd9\x96\xdb
 
\x0cM\xdb\xd9\xa7\xed\xbc\xf5\x1b\xfa\xd8_\x92\xb1\x1c\xeb\x04\x83\x87\xc5\x99\xb4?_\x84\xb71\xe8!\x1c\xdb\x8a\x90tuu\xe1\x9f\x7f>\x9dN\x0eG\x87\x1eM\xf8\xfd5\xbdQ^\xd2\xef\xe4\x1f\xd3\xa2\xb1\xc1\x16\xa7\xd2t\x9f\x96s\xf4\xab\xe1\xdbfxH\x8bm\x02\x90)]\x9e\xd22\x80\xa1\xce\xeav\xb6}9\xd8\x8c5\xda*\xd0\xb3\xedq\xb3\xddq\x16\x10\xb0\xf4\xe1H\x1d\xa8`\xddly>x\x80#_zp\xd2\x9fB\x1c\xb7\x82q\xd4\xa4\xa4\xc4\xc0\xdf\x861\xd8
 y\r\xa8z0\xa5\xe5%-\r\xa8\xcb\xd9\xd9h\x96b\x9f [...]
                      records =  [851972, 851975, 729092, 852016, 839748,
 841853, 837758, 837773, 854382, 837791, 850634, 848067, 856273, 856289,
 856299, 856302, 817391, 856305, 856314, 856322, 856334, 856336, 856339,
 678207, 819527, 819540, 856405, 856408, 856416, 856423, 819560, 819566,
 819580, 856453, 856454, 856456, 856465, 844182, 856475, 819625, 819630,
 819631, 819633, 836020, 819637, 819639, 856511, 819656, 836044, 856525,
 840143, 856530, 836053, 856534, 836057, 856540, 856541, 856553, 856555,
 856560, 836082, 836084, 856 [...]
                          Git =
 
-12999999999999991817389682417468862187419114123579026663066501514274825124452551811108554012080612263759878904812463585724730801910132334221328384L
                            t =  'published'
                        table =  'rnkWORD01F'
                           tf =  (2, 0)
                           Fi =  6
                           Gi =  {'circuitri': 700101010101, 'orthogon':
 
280101010101010094172562307349483972788772356261391727535363193093753561665418569918597587533825L,
 'sichtermann': 5001010101010100416490365522508181673359704065L, 'four':
 
30101010101010085952199305240045609687472957893926644090825154158714757445825778615300069250605300541821698851681686451078192974299922433L,
 'francesco':
 
2201010101010100350255904031349694355024605030177406033087969259061262215342577519803280523993808897L,
 'fouz': 6101010101010101490956373 [...]
                        stime =  1275640254.7711351

 I'm guessing that something in the intake process is breaking horribly.
 Since we've only just started seeing these errors, I'm guessing (hoping)
 that the cause is bad data from a recent SPIRES dump.  If it's not, then
 something is seriously wrong somewhere else.

 Sadly, I don't know enough about the intake pipeline to make anything
 but the most vague and uneducated guess.  Help?

-- 
Ticket URL: <https://cdswaredev.cern.ch/invenio/ticket/105>
Invenio <http://invenio-software.org>

Reply via email to