Because the profile output is probably not formatted properly in the mail, I attached the two line_profiler output files. In addition, I also added the profile for the _table__whereIndexed() function.
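(Editorial note: the gap between the two profiles is dominated by line 1484, the per-row iteration over `_where()`. A quick back-of-the-envelope check, assuming line_profiler's default timer unit of 1e-06 s since the "Timer unit" header line was lost in the mail wrapping:)

```python
# Hits and times for line 1484 (iterating over self._where) as reported
# in the two profiles quoted below.
hits = 6951894
indexed_units = 25942594   # time on the indexed table
plain_units = 14060474     # time on the non-indexed table
unit = 1e-06               # assumed: line_profiler's default timer unit

extra = (indexed_units - plain_units) * unit
print("extra iteration cost with index: %.1f s" % extra)   # -> 11.9 s

# That is nearly the whole wall-time gap between the two runs:
print("total gap: %.1f s" % (39.1439 - 27.5148))           # -> 11.6 s
```

So almost all of the slowdown is the extra per-hit cost (3.7 vs 2.0 units) of yielding each of the ~7 million matching rows when the index path is taken.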
On Mon, Jan 23, 2012 at 12:13 PM, Ümit Seren <uemit.se...@gmail.com> wrote:
> Hi Anthony
> I did some profiling with line_profiler.
>
> Here are the results for a table with an index:
>
> Line #      Hits         Time   Per Hit   % Time  Line Contents
> ==============================================================
>   1469                                            def readWhere( self, condition, condvars=None, field=None,
>   1470                                                           start=None, stop=None, step=None ):
>   1471                                                """
>   1480                                                """
>   1481         1            8       8.0      0.0      self._checkFieldIfNumeric(field)
>   1482
>   1483         1            2       2.0      0.0      coords = [ p.nrow for p in
>   1484   6951894     25942594       3.7     92.4          self._where(condition, condvars, start, stop, step) ]
>   1485         1            5       5.0      0.0      self._whereCondition = None    # reset the conditions
>   1486         1            6       6.0      0.0      if len(coords) > 1:
>   1487         1           13      13.0      0.0          cstart, cstop = coords[0], coords[-1]+1
>   1488         1            5       5.0      0.0          if cstop - cstart == len(coords):
>   1489                                                        # Chances for monotonically increasing row values. Refine.
>   1490         1            4       4.0      0.0              inc_seq = numpy.alltrue(
>   1491         1      1124713 1124713.0      4.0                  numpy.arange(cstart, cstop) == numpy.array(coords))
>   1492         1            5       5.0      0.0              if inc_seq:
>   1493         1      1000172 1000172.0      3.6                  return self.read(cstart, cstop, field=field)
>   1494                                                    return self.readCoordinates(coords, field)
>
> File: profile.py
> Function: function_to_test at line 11
> Total time: 39.1439 s
>
> The profile for the _where() function looks like this:
>
> Line #      Hits         Time   Per Hit   % Time  Line Contents
> ==============================================================
>   1423                                            def _where( self, condition, condvars,
>   1424                                                        start=None, stop=None, step=None ):
>   1425                                                """Low-level counterpart of `self.where()`."""
>   1426         1            2       2.0      0.0      if profile: tref = time()
>   1427         1            1       1.0      0.0      if profile: show_stats("Entering table._where", tref)
>   1428                                                # Adjust the slice to be used.
>   1429         1           20      20.0      0.0      (start, stop, step) = self._processRangeRead(start, stop, step)
>   1430         1            2       2.0      0.0      if start >= stop:   # empty range, reset conditions
>   1431                                                    self._useIndex = False
>   1432                                                    self._whereCondition = None
>   1433                                                    return iter([])
>   1434
>   1435                                                # Compile the condition and extract usable index conditions.
>   1436         1          153     153.0      0.0      condvars = self._requiredExprVars(condition, condvars, depth=3)
>   1437         1         1299    1299.0      0.3      compiled = self._compileCondition(condition, condvars)
>   1438
>   1439                                                # Can we use indexes?
>   1440         1            2       2.0      0.0      if compiled.index_expressions:
>   1441         1            1       1.0      0.0          chunkmap = _table__whereIndexed(
>   1442         1       439602  439602.0     99.6              self, compiled, condition, condvars, start, stop, step)
>   1443         1            5       5.0      0.0          if type(chunkmap) != numpy.ndarray:
>   1444                                                        # If it is not a NumPy array it should be an iterator
>   1445                                                        # Reset conditions
>   1446                                                        self._useIndex = False
>   1447                                                        self._whereCondition = None
>   1448                                                        # ...and return the iterator
>   1449                                                        return chunkmap
>   1450                                                else:
>   1451                                                    chunkmap = None   # default to an in-kernel query
>   1452
>   1453         2            7       3.5      0.0      args = [condvars[param] for param in compiled.parameters]
>   1454         1            5       5.0      0.0      self._whereCondition = (compiled.function, args)
>   1455         1          141     141.0      0.0      row = tableExtension.Row(self)
>   1456         1            2       2.0      0.0      if profile: show_stats("Exiting table._where", tref)
>   1457         1           27      27.0      0.0      return row._iter(start, stop, step, chunkmap=chunkmap)
>
> File: profile.py
> Function: function_to_test at line 11
> Total time: 22.4212 s
>
> The profile for a non-indexed table looks like this:
>
> Line #      Hits         Time   Per Hit   % Time  Line Contents
> ==============================================================
>   1469                                            def readWhere( self, condition, condvars=None, field=None,
>   1470                                                           start=None, stop=None, step=None ):
>   1480                                                """
>   1481         1            8       8.0      0.0      self._checkFieldIfNumeric(field)
>   1482
>   1483         1            1       1.0      0.0      coords = [ p.nrow for p in
>   1484   6951894     14060474       2.0     87.2          self._where(condition, condvars, start, stop, step) ]
>   1485         1            4       4.0      0.0      self._whereCondition = None    # reset the conditions
>   1486         1            4       4.0      0.0      if len(coords) > 1:
>   1487         1           10      10.0      0.0          cstart, cstop = coords[0], coords[-1]+1
>   1488         1            5       5.0      0.0          if cstop - cstart == len(coords):
>   1489                                                        # Chances for monotonically increasing row values. Refine.
>   1490         1            3       3.0      0.0              inc_seq = numpy.alltrue(
>   1491         1      1104094 1104094.0      6.8                  numpy.arange(cstart, cstop) == numpy.array(coords))
>   1492         1            4       4.0      0.0              if inc_seq:
>   1493         1       959634  959634.0      6.0                  return self.read(cstart, cstop, field=field)
>   1494                                                    return self.readCoordinates(coords, field)
>
> File: profile.py
> Function: function_to_test at line 11
> Total time: 27.5148 s
>
> Line #      Hits         Time   Per Hit   % Time  Line Contents
> ==============================================================
>   1423                                            def _where( self, condition, condvars,
>   1424                                                        start=None, stop=None, step=None ):
>   1425                                                """Low-level counterpart of `self.where()`."""
>   1426         1            2       2.0      0.1      if profile: tref = time()
>   1427         1            2       2.0      0.1      if profile: show_stats("Entering table._where", tref)
>   1428                                                # Adjust the slice to be used.
>   1429         1           29      29.0      2.1      (start, stop, step) = self._processRangeRead(start, stop, step)
>   1430         1            2       2.0      0.1      if start >= stop:   # empty range, reset conditions
>   1431                                                    self._useIndex = False
>   1432                                                    self._whereCondition = None
>   1433                                                    return iter([])
>   1434
>   1435                                                # Compile the condition and extract usable index conditions.
>   1436         1          154     154.0     10.9      condvars = self._requiredExprVars(condition, condvars, depth=3)
>   1437         1         1057    1057.0     75.0      compiled = self._compileCondition(condition, condvars)
>   1438
>   1439                                                # Can we use indexes?
>   1440         1            2       2.0      0.1      if compiled.index_expressions:
>   1441                                                    chunkmap = _table__whereIndexed(
>   1442                                                        self, compiled, condition, condvars, start, stop, step)
>   1443                                                    if type(chunkmap) != numpy.ndarray:
>   1444                                                        # If it is not a NumPy array it should be an iterator
>   1445                                                        # Reset conditions
>   1446                                                        self._useIndex = False
>   1447                                                        self._whereCondition = None
>   1448                                                        # ...and return the iterator
>   1449                                                        return chunkmap
>   1450                                                else:
>   1451         1            2       2.0      0.1          chunkmap = None   # default to an in-kernel query
>   1452
>   1453         2            4       2.0      0.3      args = [condvars[param] for param in compiled.parameters]
>   1454         1            2       2.0      0.1      self._whereCondition = (compiled.function, args)
>   1455         1          128     128.0      9.1      row = tableExtension.Row(self)
>   1456         1            2       2.0      0.1      if profile: show_stats("Exiting table._where", tref)
>   1457         1           23      23.0      1.6      return row._iter(start, stop, step, chunkmap=chunkmap)
>
> File: profile.py
> Function: function_to_test at line 11
> Total time: 10.7759 s
>
> Well, I have the hdf5 file which reproduces the problem. I will try to
> write a script which creates an hdf5 file that shows the same behavior.
>
> On Sun, Jan 22, 2012 at 11:20 PM, Anthony Scopatz <scop...@gmail.com> wrote:
>> Hello Ümit,
>>
>> Yes, this is some seriously messed-up behavior. I would suggest profiling
>> (kernprof, line_profiler) to try to figure out what is going on here. This
>> is counterproductive at the very least....
>>
>> My suspicion is that when indexes are used, BOTH the index and the table
>> values themselves are being compared, rather than ONLY the index being
>> compared. This is just my suspicion, but it would seem to give this
>> behavior. Profiling would at least tell us which function or method is the
>> troublemaker.
>>
>> Do you have a script that reproduces this as a whole?
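(Editorial note: the refinement step at lines 1486-1493 of the readWhere profiles above can be sketched in isolation. `refine_coords` is a made-up name, and the contiguity test uses `(...).all()` where PyTables calls the equivalent `numpy.alltrue`:)

```python
import numpy

def refine_coords(coords):
    # Mirrors Table.readWhere lines 1486-1493: if the matched row numbers
    # form one contiguous, monotonically increasing block, a single slice
    # read (self.read) can replace the much slower coordinate-by-coordinate
    # read (self.readCoordinates).
    if len(coords) > 1:
        cstart, cstop = coords[0], coords[-1] + 1
        if cstop - cstart == len(coords):
            # Cheap length check passed; confirm the rows really are
            # a contiguous run before committing to the slice read.
            if (numpy.arange(cstart, cstop) == numpy.array(coords)).all():
                return ("slice", cstart, cstop)
    return ("coords", coords)

print(refine_coords([3, 4, 5, 6]))  # -> ('slice', 3, 7)
print(refine_coords([3, 5, 6, 8]))  # -> ('coords', [3, 5, 6, 8])
```

Note that the `numpy.arange(...) == numpy.array(coords)` comparison itself shows up as ~1.1 s in both profiles (line 1491), on top of building the 7-million-element `coords` list in the first place.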
>>
>> Be Well
>> Anthony
>>
>> On Sat, Jan 21, 2012 at 7:23 AM, Ümit Seren <uemit.se...@gmail.com> wrote:
>>>
>>> I recently used ptrepack to compact my hdf5 file and forgot to activate
>>> the option to propagate indexes.
>>> Just out of curiosity I decided to compare performance between the two
>>> tables (one with an index and one without) for some queries.
>>>
>>> The table structure looks like this:
>>>
>>> "gene_mid_pos": UInt32Col(shape=(), dflt=0, pos=3),
>>> "gene_start": UInt32Col(shape=(), dflt=0, pos=4),
>>> "mac": Int32Col(shape=(), dflt=0, pos=5),
>>> "maf": Float32Col(shape=(), dflt=0.0, pos=6),
>>> "perc_var_expl": Float32Col(shape=(), dflt=0.0, pos=7),
>>> "score": Float32Col(shape=(), dflt=0.0, pos=8),
>>> "snp_chr": UInt8Col(shape=(), dflt=0, pos=9),
>>> "snp_pos": UInt32Col(shape=(), dflt=0, pos=10)}
>>> byteorder := 'little'
>>> chunkshape := (4854,)
>>> autoIndex := True
>>> colindexes := {
>>> "gene": Index(6, medium, shuffle, zlib(1)).is_CSI=False,
>>> "gene_start": Index(6, medium, shuffle, zlib(1)).is_CSI=False,
>>> "snp_pos": Index(6, medium, shuffle, zlib(1)).is_CSI=False,
>>> "gene_mid_pos": Index(6, medium, shuffle, zlib(1)).is_CSI=False,
>>> "perc_var_expl": Index(6, medium, shuffle, zlib(1)).is_CSI=False,
>>> "mac": Index(6, medium, shuffle, zlib(1)).is_CSI=False,
>>> "score": Index(6, medium, shuffle, zlib(1)).is_CSI=False,
>>> "snp_chr": Index(6, medium, shuffle, zlib(1)).is_CSI=False,
>>> "maf": Index(6, medium, shuffle, zlib(1)).is_CSI=False,
>>> "gene_end": Index(6, medium, shuffle, zlib(1)).is_CSI=False,
>>> "gene_chr": Index(6, medium, shuffle, zlib(1)).is_CSI=False}
>>>
>>> and contains around 7 million rows (sorted by score in descending order).
>>>
>>> To verify the results I also created a second hdf5 file with ptrepack
>>> where I propagated the indexes, just to make sure that compacting isn't
>>> the root of the performance issue.
>>> Here are my results:
>>>
>>> orig_table = original table with indexes
>>> com_table_no_index = compacted table without indexes
>>> com_table_with_index = compacted table with indexes
>>>
>>> table.readWhere('score > 5')
>>>
>>> orig_table: 19.4s
>>> com_table_with_index: 18.7s
>>> com_table_no_index: 7.9s
>>>
>>> table.readWhere('gene_chr > 2')   (really low selectivity because gene_chr
>>> is between 1 and 5)
>>>
>>> orig_table: 46.3s
>>> com_table_with_index: 44.8s
>>> com_table_no_index: 37.8s
>>>
>>> table.readWhere('(maf > 0.2 and maf < 0.9)')
>>>
>>> orig_table: 19.4 s
>>> com_table_with_index: 19.2 s
>>> com_table_no_index: 9.31 s
>>>
>>> Both queries have a hit rate of 90%.
>>>
>>> table.readWhere('(maf > 0.21 and maf < 0.22)')
>>>
>>> orig_table: 35.8 s
>>> com_table_with_index: 35.7 s
>>> com_table_no_index: 29.3 s
>>>
>>> I also tried a query with a smaller hit rate (10%):
>>>
>>> table.readWhere('(gene_start > 2000) & (gene_end < 900000)')
>>>
>>> orig_table: 5.67s
>>> com_table_with_index: 5.52s
>>> com_table_no_index: 4.22s
>>>
>>> table.readWhere('(snp_chr == 2)')   (really low selectivity because
>>> snp_chr is between 1 and 5)
>>>
>>> orig_table: 12 s
>>> com_table_with_index: 12.3 s
>>> com_table_no_index: 9.16 s
>>>
>>> I checked with willQueryUseIndexing whether the indexes are used, and in
>>> all cases they were.
>>>
>>> Here are some observations:
>>>
>>> - Querying on a column with high selectivity (score, maf) runs more than
>>>   2x as fast on a table without an index as on one with an index.
>>> - Querying on a column with low selectivity (chr) or with a small hit
>>>   rate still shows a difference, but not as big as in the first case.
>>>
>>> Does anyone know why querying an indexed table is slower than querying an
>>> un-indexed one?
>>>
>>> Thanks in advance
>>>
>>> I am using pytables 2.3
>>>
>>> cheers
>>> Ümit
>>>
>>> ------------------------------------------------------------------------------
>>> Try before you buy = See our experts in action!
>>> The most comprehensive online learning library for Microsoft developers
>>> is just $99.99! Visual Studio, SharePoint, SQL - plus HTML5, CSS3, MVC3,
>>> Metro Style Apps, more. Free future releases when you subscribe now!
>>> http://p.sf.net/sfu/learndevnow-dev2
>>> _______________________________________________
>>> Pytables-users mailing list
>>> Pytables-users@lists.sourceforge.net
>>> https://lists.sourceforge.net/lists/listinfo/pytables-users
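(Editorial note: for anyone who wants to try reproducing the effect without Ümit's file, a minimal benchmark along these lines should show the gap. This is a sketch, not the actual script from the thread: random values, far fewer rows than the 7-million-row table, only the queried columns, and current PyTables snake_case names — the thread's 2.3 would use openFile/createTable/readWhere instead.)

```python
import os
import tempfile
import time

import numpy
import tables  # current snake_case API; PyTables 2.3 used camelCase names


class Result(tables.IsDescription):
    score = tables.Float32Col(pos=0)
    maf = tables.Float32Col(pos=1)


def build(path, nrows, indexed):
    # Build a table of random rows, optionally indexing the queried columns.
    h5 = tables.open_file(path, "w")
    t = h5.create_table("/", "results", Result, expectedrows=nrows)
    rng = numpy.random.RandomState(0)
    rec = numpy.empty(nrows, dtype=[("score", "f4"), ("maf", "f4")])
    rec["score"] = rng.uniform(0.0, 50.0, nrows)  # ~90% of rows have score > 5
    rec["maf"] = rng.uniform(0.0, 1.0, nrows)
    t.append(rec)
    if indexed:
        t.cols.score.create_index()
        t.cols.maf.create_index()
    h5.flush()
    return h5, t


tmpdir = tempfile.mkdtemp()
for indexed in (False, True):
    h5, t = build(os.path.join(tmpdir, "bench_%s.h5" % indexed), 500000, indexed)
    t0 = time.time()
    hits = t.read_where("score > 5")  # same high-hit-rate query as in the thread
    print("indexed=%s  rows=%d  %.2fs" % (indexed, len(hits), time.time() - t0))
    h5.close()
```

If the suspicion above is right, the indexed run should come out slower than the non-indexed one on high-hit-rate queries like this, despite `will_query_use_indexing` reporting that the index is used.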
indexed.lprof
Description: Binary data
non_indexed.lprof
Description: Binary data