This is an automated email from the git hooks/post-receive script. afif-guest pushed a commit to branch master in repository python-intervaltree-bio.
commit a6d415a36c5b068e731cb8e4abfa5225acfd198e Author: Afif Elghraoui <[email protected]> Date: Sat Dec 12 15:36:33 2015 -0800 Imported Upstream version 1.0.1 --- .gitignore | 5 + .travis.yml | 8 ++ CHANGELOG.txt | 16 +++ LICENSE.txt | 21 ++++ MANIFEST.in | 1 + README.rst | 47 ++++++++ intervaltree_bio/__init__.py | 230 +++++++++++++++++++++++++++++++++++++++ setup.cfg | 9 ++ setup.py | 54 +++++++++ tests/__init__.py | 10 ++ tests/genomeintervaltree_test.py | 54 +++++++++ tests/pickle_test.py | 22 ++++ 12 files changed, 477 insertions(+) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7d9f84e --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +*.pyc +.DS_Store +*.egg-info +*.komodoproject +dist/ \ No newline at end of file diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..a6cfe31 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,8 @@ +language: python +python: + - "2.7" + - "3.2" + - "3.3" + - "3.4" +script: + - python setup.py develop test diff --git a/CHANGELOG.txt b/CHANGELOG.txt new file mode 100644 index 0000000..4982b68 --- /dev/null +++ b/CHANGELOG.txt @@ -0,0 +1,16 @@ +Version 1.0.1 +------------- + + - GenomeIntervalTree may now be pickled. + +Version 1.0.0 +------------- + + - Replaced dash in the package name to underscore, as it seemed to confuse the package manager (python setup.py develop would fail on packages attempting to use intervaltree). + Formally, it's a new package now. The package with the dash in the name had to be removed completely from PyPI + - Added the the possibility to specify an interval_maker in GenomeIntervalTree's methods from_bed and from_table. + +Version 0.6 +------------- + + - Extracted genome-related code into a separate package from https://github.com/konstantint/PyIntervalTree (version 0.5). diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..5a59174 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015, Konstantin Tretyakov + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..e7d1090 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include README.rst CHANGELOG.txt LICENSE.txt \ No newline at end of file diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..b0f249f --- /dev/null +++ b/README.rst @@ -0,0 +1,47 @@ +================ +Intervaltree-Bio +================ + +Convenience classes for loading UCSC genomic annotation records into a set of `interval tree <https://pypi.python.org/pypi/intervaltree>`__ data structures. + +Installation +------------ + +The easiest way to install most Python packages is via ``easy_install`` or ``pip``:: + + $ pip install intervaltree-bio + +The package requires the ``intervaltree`` package (which is normally installed automatically when using ``pip`` or ``easy_install``). + +Usage +-------- + +One of the major uses for Interval tree data structures is in bioinformatics, where the +intervals correspond to genes or other features of the genome. + +As genomes typically consist of a set of *chromosomes*, a separate interval tree for each +chromosome has to be maintained. Thus, rather than using an single interval tree, you would typically use +something like ``defaultdict(IntervalTree)`` to index data of genomic features. +The module ``intervaltree_bio`` offers a ``GenomeIntervalTree`` data structure, which is a similar convenience +data structure. In addition to specific methods for working with genomic intervals it also +provides facilities for reading BED files and the refGene table from `UCSC <http://genome.ucsc.edu/>`__. + +The core example is loading the transcription regions of the ``knownGene`` table from the UCSC website:: + + >> from intervaltree_bio import GenomeIntervalTree + >> knownGene = GenomeIntervalTree.from_table() + >> len(knownGene) + +It is then possible to use the data structure to search known genes within given intervals:: + + >> result = knownGene[b'chr1'].search(100000, 138529) + +It is possible to load other UCSC tables besides ``knownGene`` or specify custom URL or file to read the table from. +Consult the docstring of the ``GenomeIntervalTree.from_table`` method for more details. + +Copyright +---------- + + * Copyright (c) Konstantin Tretyakov + * MIT license. + * Report issues via `Github <https://github.com/konstantint/intervaltree-bio>`__. diff --git a/intervaltree_bio/__init__.py b/intervaltree_bio/__init__.py new file mode 100644 index 0000000..083baca --- /dev/null +++ b/intervaltree_bio/__init__.py @@ -0,0 +1,230 @@ +''' +intervaltree-bio: Interval tree convenience classes for genomic data. + +One of the major uses for Interval tree data structures is in bioinformatics, where the +intervals correspond to genes or other features of the genome. + +As genomes typically consist of a set of *chromosomes*, a separate interval tree for each +chromosome has to be maintained. Thus, rather than using an IntervalTree, you would typically use +something like ``defaultdict(IntervalTree)`` to index data of genomic features. +This module offers a ``GenomeIntervalTree`` data structure, which is a similar convenience +data structure. In addition to specific methods for working with genomic intervals it also +provides facilities for reading BED files and the refGene table from UCSC. + +Copyright 2014, Konstantin Tretyakov + +Licensed under MIT. +''' +try: + from urllib import urlopen + from StringIO import StringIO as BytesIO +except ImportError: # Python 3? + from urllib.request import urlopen + from io import BytesIO + +import zlib +import warnings +from collections import defaultdict +from intervaltree import Interval, IntervalTree + +class UCSCTable(object): + '''A container class for the parsing functions, used in GenomeIntervalTree.from_table``.''' + + KNOWN_GENE_FIELDS = ['name', 'chrom', 'strand', 'txStart', 'txEnd', 'cdsStart', 'cdsEnd', 'exonCount', 'exonStarts', 'exonEnds', 'proteinID', 'alignID'] + REF_GENE_FIELDS = ['bin', 'name', 'chrom', 'strand', 'txStart', 'txEnd', 'cdsStart', 'cdsEnd', 'exonCount', 'exonStarts', 'exonEnds', 'score', 'name2', 'cdsStartStat', 'cdsEndStat', 'exonFrames'] + ENS_GENE_FIELDS = REF_GENE_FIELDS + + @staticmethod + def KNOWN_GENE(line): + return dict(zip(UCSCTable.KNOWN_GENE_FIELDS, line.split(b'\t'))) + @staticmethod + def REF_GENE(line): + return dict(zip(UCSCTable.REF_GENE_FIELDS, line.split(b'\t'))) + @staticmethod + def ENS_GENE(line): + return dict(zip(UCSCTable.ENS_GENE_FIELDS, line.split(b'\t'))) + +class IntervalMakers(object): + '''A container class for interval-making functions, used in GenomeIntervalTree.from_table and GenomeIntervalTree.from_bed.''' + + @staticmethod + def TX(d): + return [Interval(int(d['txStart']), int(d['txEnd']), d)] + + @staticmethod + def CDS(d): + return [Interval(int(d['cdsStart']), int(d['cdsEnd']), d)] + + @staticmethod + def EXONS(d): + exStarts = d['exonStarts'].split(b',') + exEnds = d['exonEnds'].split(b',') + for i in range(int(d['exonCount'])): + yield Interval(int(exStarts[i]), int(exEnds[i]), d) + +def _fix(interval): + ''' + Helper function for ``GenomeIntervalTree.from_bed and ``.from_table``. + + Data tables may contain intervals with begin >= end. Such intervals lead to infinite recursions and + other unpleasant behaviour, so something has to be done about them. We 'fix' them by simply setting end = begin+1. + ''' + if interval.begin >= interval.end: + warnings.warn("Interval with reversed coordinates (begin >= end) detected when reading data. Interval was automatically fixed to point interval [begin, begin+1).") + return Interval(interval.begin, interval.begin+1, interval.data) + else: + return interval + +class GenomeIntervalTree(defaultdict): + ''' + The data structure maintains a set of IntervalTrees, one for each chromosome. + It is essentially a ``defaultdict(IntervalTree)`` with a couple of convenience methods + for reading various data formats. + + Examples:: + + >>> gtree = GenomeIntervalTree() + >>> gtree.addi('chr1', 0, 100) + >>> gtree.addi('chr1', 1, 100) + >>> gtree.addi('chr2', 0, 100) + >>> len(gtree) + 3 + >>> len(gtree['chr1']) + 2 + >>> sorted(gtree.keys()) + ['chr1', 'chr2'] + + ''' + def __init__(self): + super(GenomeIntervalTree, self).__init__(IntervalTree) + + def addi(self, chrom, begin, end, data=None): + self[chrom].addi(begin, end, data) + + def __len__(self): + return sum([len(tree) for tree in self.values()]) + + @staticmethod + def from_bed(fileobj, field_sep=b'\t', interval_maker=None): + ''' + Initialize a ``GenomeIntervalTree`` from a BED file. + Each line of the file must consist of several fields, separated using ``field_sep``. + The first three fields are ``chrom``, ``start`` and ``end`` (where ``start`` is 0-based and + the corresponding interval is ``[start, end)``). The remaining fields are ``name``, ``score``, + ``strand``, ..., or something else, depending on the flavor of the format. + + Each Interval in the tree has its data field set to a list with "remaining" fields, + i.e. interval.data[0] should be the ``name``, interval.data[1] is the ``score``, etc. + + if the ``interval_maker`` parameter is not None, intervals are created by calling this function with the BED line split into fields as input. + The function must return an iterable of ``Interval`` objects. + + Example:: + >>> test_url = 'http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeAwgTfbsUniform/wgEncodeAwgTfbsBroadDnd41Ezh239875UniPk.narrowPeak.gz' + >>> data = zlib.decompress(urlopen(test_url).read(), 16+zlib.MAX_WBITS) + >>> gtree = GenomeIntervalTree.from_bed(BytesIO(data)) + >>> len(gtree) + 1732 + >>> assert gtree[b'chr10'].search(22610878) == set([Interval(22610878, 22611813, [b'.', b'1000', b'.', b'471.725544438908', b'-1', b'3.21510858105313', b'389']), Interval(22610878, 22611813, [b'.', b'791', b'.', b'123.885507169449', b'-1', b'3.21510858105313', b'596'])]) + >>> assert gtree[b'chr10'].search(22611813) == set([]) + >>> assert gtree[b'chr1'].search(145036590, 145036594) == set([Interval(145036593, 145037123, [b'.', b'247', b'.', b'38.6720804428054', b'-1', b'3.06233123683911', b'265'])]) + >>> assert gtree[b'chr10'].search(145036594, 145036595) == set([]) + + ''' + # We collect all intervals into a set of lists, and then put them all at once into the tree structures + # It is slightly more efficient than adding intervals one by one. + # Moreover, the current implementation throws a "maximum recursion depth exceeded" error + # in this case on large files (TODO) + + interval_lists = defaultdict(list) + for ln in fileobj: + if ln.endswith(b'\n'): + ln = ln[0:-1] + ln = ln.split(field_sep) + if interval_maker is not None: + for interval in interval_maker(ln): + interval_lists[ln[0]].append(_fix(interval)) + else: + interval_lists[ln[0]].append(_fix(Interval(int(ln[1]), int(ln[2]), data=ln[3:]))) + gtree = GenomeIntervalTree() + for k, v in getattr(interval_lists, 'iteritems', interval_lists.items)(): + gtree[k] = IntervalTree(v) + return gtree + + @staticmethod + def from_table(fileobj=None, url='http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/knownGene.txt.gz', + parser=UCSCTable.KNOWN_GENE, mode='tx', decompress=None): + ''' + UCSC Genome project provides several tables with gene coordinates (https://genome.ucsc.edu/cgi-bin/hgTables), + such as knownGene, refGene, ensGene, wgEncodeGencodeBasicV19, etc. + Indexing the rows of those tables into a ``GenomeIntervalTree`` is a common task, implemented in this method. + + The table can be either specified as a ``fileobj`` (in which case the data is read line by line), + or via an ``url`` (the ``url`` may be to a ``txt`` or ``txt.gz`` file either online or locally). + The type of the table is specified using the ``parser`` parameter. This is a function that takes a line + of the file (with no line ending) and returns a dictionary, mapping field names to values. This dictionary will be assigned + to the ``data`` field of each interval in the resulting tree. + + Finally, there are different ways genes can be mapped into intervals for the sake of indexing as an interval tree. + One way is to represent each gene via its transcribed region (``txStart``..``txEnd``). Another is to represent using + coding region (``cdsStart``..``cdsEnd``). Finally, the third possibility is to map each gene into several intervals, + corresponding to its exons (``exonStarts``..``exonEnds``). + + The mode, in which genes are mapped to intervals is specified via the ``mode`` parameter. The value can be ``tx``, ``cds`` and + ``exons``, corresponding to the three mentioned possiblities. + + If a more specific way of interval-mapping is required (e.g. you might want to create 'coding-region+-10k' intervals), you can provide + an "interval-maker" function as the ``mode`` parameter. An interval-maker function takes as input a dictionary, returned by the parser, + and returns an iterable of Interval objects. + + The ``parser`` function must ensure that its output contains the field named ``chrom``, and also fields named ``txStart``/``txEnd`` if ``mode=='tx'``, + fields ``cdsStart``/``cdsEnd`` if ``mode=='cds'``, and fields ``exonCount``/``exonStarts``/``exonEnds`` if ``mode=='exons'``. + + The ``decompress`` parameter specifies whether the provided file is gzip-compressed. + This only applies to the situation when the url is given (no decompression is made if fileobj is provided in any case). + If decompress is None, data is decompressed if the url ends with .gz, otherwise decompress = True forces decompression. + + >> knownGene = GenomeIntervalTree.from_table() + >> len(knownGene) + 82960 + >> result = knownGene[b'chr1'].search(100000, 138529) + >> len(result) + 1 + >> list(result)[0].data['name'] + b'uc021oeg.2' + ''' + if fileobj is None: + data = urlopen(url).read() + if (decompress is None and url.endswith('.gz')) or decompress: + data = zlib.decompress(data, 16+zlib.MAX_WBITS) + fileobj = BytesIO(data) + + interval_lists = defaultdict(list) + + if mode == 'tx': + interval_maker = IntervalMakers.TX + elif mode == 'cds': + interval_maker = IntervalMakers.CDS + elif mode == 'exons': + interval_maker = IntervalMakers.EXONS + elif getattr(mode, __call__, None) is None: + raise Exception("Parameter `mode` may only be 'tx', 'cds', 'exons' or a callable") + else: + interval_maker = mode + + for ln in fileobj: + if ln.endswith(b'\n'): + ln = ln[0:-1] + d = parser(ln) + for interval in interval_maker(d): + interval_lists[d['chrom']].append(_fix(interval)) + + # Now convert interval lists into trees + gtree = GenomeIntervalTree() + for chrom, lst in getattr(interval_lists, 'iteritems', interval_lists.items)(): + gtree[chrom] = IntervalTree(lst) + return gtree + + def __reduce__(self): + t = defaultdict.__reduce__(self) + return (t[0], ()) + t[2:] diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..d336f54 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,9 @@ +[egg_info] +tag_build = +tag_svn_revision = false + +[pytest] +addopts = --ignore=setup.py --ignore=build --ignore=dist --doctest-modules +norecursedirs=*.egg + + diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..1aeef19 --- /dev/null +++ b/setup.py @@ -0,0 +1,54 @@ +''' +intervaltree-bio: Interval tree convenience classes for genomic data. + +Note that "python setup.py test" invokes pytest on the package. With appropriately +configured setup.cfg, this will check both xxx_test modules and docstrings. + +Copyright 2014, Konstantin Tretyakov. + +Licensed under MIT. +''' +import sys +from setuptools import setup, find_packages +from setuptools.command.test import test as TestCommand + +# This is a plug-in for setuptools that will invoke py.test +# when you run python setup.py test +class PyTest(TestCommand): + def finalize_options(self): + TestCommand.finalize_options(self) + self.test_args = [] + self.test_suite = True + + def run_tests(self): + import pytest # import here, because outside the required eggs aren't loaded yet + sys.exit(pytest.main(self.test_args)) + + +version = '1.0.1' +setup( + name = 'intervaltree_bio', + version = version, + description = 'Interval tree convenience classes for genomic data', + long_description=open("README.rst").read(), + classifiers=[ # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers + 'Development Status :: 4 - Beta', + 'Programming Language :: Python', + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: MIT License', + 'Topic :: Scientific/Engineering :: Bio-Informatics', + 'Topic :: Software Development :: Libraries', + ], + keywords="interval-tree data-structure intervals tree genomics bioinformatics ucsc", # Separate with spaces + author = 'Konstantin Tretyakov', + author_email = '[email protected]', + url='https://github.com/konstantint/intervaltree-bio', + license="MIT", + packages=find_packages(exclude=['examples', 'tests']), + include_package_data=True, + zip_safe=True, + install_requires=['intervaltree'], + tests_require=['pytest'], + cmdclass={'test': PyTest}, + entry_points={} +) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..99b7f2f --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,10 @@ +''' +intervaltree-bio: Interval tree convenience classes for genomic data. + +Test module. Meant for use with ``py.test``. +To run all tests simply invoke ``python setup.py test`` from the package root. + +Copyright 2014 Konstantin Tretyakov + +Licensed under MIT. +''' diff --git a/tests/genomeintervaltree_test.py b/tests/genomeintervaltree_test.py new file mode 100644 index 0000000..9b5bff5 --- /dev/null +++ b/tests/genomeintervaltree_test.py @@ -0,0 +1,54 @@ +''' +Test module for GenomeIntervalTree + +Copyright 2014, Konstantin Tretyakov + +Licensed under MIT license. +''' +import os +try: + from urllib import urlretrieve +except ImportError: # Python 3? + from urllib.request import urlretrieve + +from intervaltree_bio import GenomeIntervalTree, UCSCTable + +def test_knownGene(): + # To speed up testing, we'll download the file and reuse the downloaded copy + knownGene_url = 'http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/knownGene.txt.gz' + # Mirror. Slightly faster and more stable, I believe: + knownGene_url = 'http://kt.era.ee/distribute/pyintervaltree/knownGene.txt.gz' + + # To speed up testing, we'll download the file and reuse the downloaded copy + knownGene_file, headers = urlretrieve(knownGene_url) + + knownGene_localurl = 'file:///%s' % os.path.abspath(knownGene_file) + knownGene = GenomeIntervalTree.from_table(url=knownGene_localurl, decompress=True) # Py3 downloads .gz files to local files with names not ending with .gz + assert len(knownGene) == 82960 + result = knownGene[b'chr1'].search(100000, 138529) + assert len(result) == 1 + assert list(result)[0].data['name'] == b'uc021oeg.2' + + knownGene = GenomeIntervalTree.from_table(url=knownGene_localurl, mode='cds', decompress=True) + assert len(knownGene) == 82960 + assert not knownGene[b'chr1'].overlaps(100000, 138529) + + knownGene = GenomeIntervalTree.from_table(url=knownGene_localurl, mode='exons', decompress=True) + assert len(knownGene) == 742493 + result = list(knownGene[b'chr1'].search(134772, 140566)) + assert len(result) == 3 + assert result[0].data == result[1].data and result[0].data == result[2].data + +def test_ensGene(): + # Smoke-test we can at least read ensGene. + ensGene_url = 'http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/ensGene.txt.gz' + ensGene_url = 'http://kt.era.ee/distribute/pyintervaltree/ensGene.txt.gz' + ensGene = GenomeIntervalTree.from_table(url=ensGene_url, mode='cds', parser=UCSCTable.ENS_GENE) + assert len(ensGene) == 204940 + +def test_refGene(): + # Smoke-test for refGene + refGene_url = 'http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/refGene.txt.gz' + refGene_url = 'http://kt.era.ee/distribute/pyintervaltree/refGene.txt.gz' + refGene = GenomeIntervalTree.from_table(url=refGene_url, mode='tx', parser=UCSCTable.REF_GENE) + assert len(refGene) == 52350 # NB: Some time ago it was 50919, hence it seems the table data changes on UCSC and eventually the mirror and UCSC won't be the same. diff --git a/tests/pickle_test.py b/tests/pickle_test.py new file mode 100644 index 0000000..9ba566e --- /dev/null +++ b/tests/pickle_test.py @@ -0,0 +1,22 @@ +''' +Test module for GenomeIntervalTree + +Copyright 2014, Konstantin Tretyakov + +Licensed under MIT license. +''' +import os + +from intervaltree_bio import GenomeIntervalTree +from collections import defaultdict +import pickle + +def test_pickling(): + git = GenomeIntervalTree() + git['a'][1:2] = ['some', 'data'] + git['a'][1.5:2.5] = ['more', 'data'] + git['b'][10:12] = ['even', 'more', 'data'] + s = pickle.dumps(git) + new_git = pickle.loads(s) + assert len(git) == len(new_git) + assert len(git['a']) == len(new_git['a']) -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-intervaltree-bio.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit
