This is an automated email from the git hooks/post-receive script. sascha-guest pushed a commit to branch master in repository fastaq.
commit 08dda22ae0c1c9ee5e2d9ebcc49c01252ae66389 Author: Sascha Steinbiss <[email protected]> Date: Fri Aug 21 08:55:00 2015 +0000 Imported Upstream version 3.6.1 --- pyfastaq/common.py | 2 +- pyfastaq/genetic_codes.py | 31 +++++++++++ pyfastaq/intervals.py | 7 +++ pyfastaq/runners/to_fasta.py | 4 +- pyfastaq/runners/to_orfs_gff.py | 2 +- pyfastaq/sequences.py | 24 +++++++-- pyfastaq/tasks.py | 60 +++++++++++++++++++++- ...o_fasta.strip_after_whitespace_non_unique.in.fa | 6 +++ ..._fasta.strip_after_whitespace_non_unique.out.fa | 6 +++ ...st.to_fasta.strip_after_whitespace_unique.in.fa | 6 +++ ...t.to_fasta.strip_after_whitespace_unique.out.fa | 6 +++ pyfastaq/tests/data/tasks_test_to_fastg.fasta | 4 ++ pyfastaq/tests/data/tasks_test_to_fastg.fastg | 8 +++ .../data/tasks_test_to_fastg.ids_to_circularise | 1 + pyfastaq/tests/intervals_test.py | 10 ++++ pyfastaq/tests/sequences_test.py | 30 +++++++++++ pyfastaq/tests/tasks_test.py | 45 ++++++++++++++++ setup.py | 13 ++--- 18 files changed, 248 insertions(+), 17 deletions(-) diff --git a/pyfastaq/common.py b/pyfastaq/common.py index 42b6ccb..f157366 100644 --- a/pyfastaq/common.py +++ b/pyfastaq/common.py @@ -1 +1 @@ -version = '3.2.0' +version = '3.6.1' diff --git a/pyfastaq/genetic_codes.py b/pyfastaq/genetic_codes.py index c32c065..81a2279 100644 --- a/pyfastaq/genetic_codes.py +++ b/pyfastaq/genetic_codes.py @@ -1,4 +1,6 @@ +# see http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi codes = {} +starts = {} #standard genetic code codes[1] = { @@ -68,6 +70,11 @@ codes[1] = { 'GGG': 'G', } +starts[1] = set([ + 'TTG', + 'CTG', + 'ATG', +]) #mycoplasma genetic code codes[4] = { @@ -137,3 +144,27 @@ codes[4] = { 'GGG': 'G' } +starts[4] = set([ + 'TTA', + 'TTG', + 'CTG', + 'ATT', + 'ATC', + 'ATA', + 'ATG', + 'GTG', +]) + + +# Bacterial, Archaeal and Plant Plastid Code +codes[11] = codes[1] + +starts[11] = set([ + 'TTG', + 'CTG', + 'ATT', + 'ATC', + 'ATA', + 'ATG', + 'GTG', +]) diff --git a/pyfastaq/intervals.py b/pyfastaq/intervals.py index b320c63..ef344ed 100644 --- a/pyfastaq/intervals.py +++ b/pyfastaq/intervals.py @@ -31,6 +31,13 @@ class Interval: def __le__(self, i): return self.start < i.start or (self.start == i.start and self.end <= i.end) + def distance_to_point(self, p): + '''Returns the distance from the point to the interval. Zero if the point lies inside the interval.''' + if self.start <= p <= self.end: + return 0 + else: + return min(abs(self.start - p), abs(self.end - p)) + def intersects(self, i): '''Returns true iff this interval intersects the interval i''' return self.start <= i.end and i.start <= self.end diff --git a/pyfastaq/runners/to_fasta.py b/pyfastaq/runners/to_fasta.py index 379abc6..7cecf1c 100644 --- a/pyfastaq/runners/to_fasta.py +++ b/pyfastaq/runners/to_fasta.py @@ -9,12 +9,14 @@ def run(description): parser.add_argument('outfile', help='Name of output file') parser.add_argument('-l', '--line_length', type=int, help='Number of bases on each sequence line of output file. Set to zero for no linebreaks in sequences [%(default)s]', default=60) parser.add_argument('-s', '--strip_after_whitespace', action='store_true', help='Remove everything after first whitespace in every sequence name') + parser.add_argument('-u', '--check_unique', action='store_true', help='Die if any of the output sequence names are not unique') options = parser.parse_args() tasks.to_fasta( options.infile, options.outfile, line_length=options.line_length, - strip_after_first_whitespace=options.strip_after_whitespace + strip_after_first_whitespace=options.strip_after_whitespace, + check_unique=options.check_unique ) diff --git a/pyfastaq/runners/to_orfs_gff.py b/pyfastaq/runners/to_orfs_gff.py index 039016c..46b6e40 100644 --- a/pyfastaq/runners/to_orfs_gff.py +++ b/pyfastaq/runners/to_orfs_gff.py @@ -9,4 +9,4 @@ def run(description): parser.add_argument('infile', help='Name of input file') parser.add_argument('outfile', help='Name of output GFF file') options = parser.parse_args() - tasks.fastaq_to_orfs_gff(options.infile, options.gff_out, min_length=options.min_length) + tasks.fastaq_to_orfs_gff(options.infile, options.outfile, min_length=options.min_length) diff --git a/pyfastaq/sequences.py b/pyfastaq/sequences.py index 4a3c2a1..fdffc60 100644 --- a/pyfastaq/sequences.py +++ b/pyfastaq/sequences.py @@ -14,8 +14,7 @@ class Error (Exception): pass # of the file, for any given filehandle previous_lines = {} - -codon2aa = genetic_codes.codes[1] +genetic_code = 1 redundant_nts = { 'R': ('A', 'G'), @@ -322,6 +321,25 @@ class Fasta: return sorted(orfs, key=lambda t:t[0]) + + def is_complete_orf(self): + '''Returns true iff length is >= 6, is a multiple of 3, and there is exactly one stop codon in the sequence and it is at the end''' + if len(self) %3 != 0 or len(self) < 6: + return False + + orfs = self.orfs() + complete_orf = intervals.Interval(0, len(self) - 1) + for orf in orfs: + if orf == complete_orf: + return True + return False + + + def looks_like_gene(self, translation_table=1): + '''Returns true iff: length >=6, length is a multiple of 3, first codon is start, last codon is a stop and has no other stop codons''' + return self.is_complete_orf() and len(self) >= 6 and len(self) %3 == 0 and self.seq[0:3] in genetic_codes.starts[genetic_code] + + # Fills the object with the next sequence in the file. Returns # True if this was successful, False if no more sequences in the file. # If reading a file of quality scores, set read_quals = True @@ -409,7 +427,7 @@ class Fasta: def translate(self, frame=0): '''Returns a Fasta sequence, translated into amino acids. Starts translating from 'frame', where frame expected to be 0,1 or 2''' - return Fasta(self.id, ''.join([codon2aa.get(self.seq[x:x+3].upper(), 'X') for x in range(frame, len(self)-1-frame, 3)])) + return Fasta(self.id, ''.join([genetic_codes.codes[genetic_code].get(self.seq[x:x+3].upper(), 'X') for x in range(frame, len(self)-1-frame, 3)])) class Embl(Fasta): diff --git a/pyfastaq/tasks.py b/pyfastaq/tasks.py index 7527910..e77e40e 100644 --- a/pyfastaq/tasks.py +++ b/pyfastaq/tasks.py @@ -597,6 +597,47 @@ def sort_by_size(infile, outfile, smallest_first=False): utils.close(fout) +def to_fastg(infile, outfile, circular=None): + '''Writes a FASTG file in SPAdes format from input file. Currently only whether or not a sequence is circular is supported. Put circular=set of ids, or circular=filename to make those sequences circular in the output. Puts coverage=1 on all contigs''' + if circular is None: + to_circularise = set() + elif type(circular) is not set: + f = utils.open_file_read(circular) + to_circularise = set([x.rstrip() for x in f.readlines()]) + utils.close(f) + else: + to_circularise = circular + + seq_reader = sequences.file_reader(infile) + fout = utils.open_file_write(outfile) + nodes = 1 + + for seq in seq_reader: + new_id = '_'.join([ + 'NODE', str(nodes), + 'length', str(len(seq)), + 'cov', '1', + 'ID', seq.id + ]) + + if seq.id in to_circularise: + seq.id = new_id + ':' + new_id + ';' + print(seq, file=fout) + seq.revcomp() + seq.id = new_id + "':" + new_id + "';" + print(seq, file=fout) + else: + seq.id = new_id + ';' + print(seq, file=fout) + seq.revcomp() + seq.id = new_id + "';" + print(seq, file=fout) + + nodes += 1 + + utils.close(fout) + + def translate(infile, outfile, frame=0): seq_reader = sequences.file_reader(infile) fout = utils.open_file_write(outfile) @@ -780,16 +821,21 @@ def strip_illumina_suffix(infile, outfile): utils.close(f_out) -def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False): +def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False, check_unique=False): seq_reader = sequences.file_reader(infile) f_out = utils.open_file_write(outfile) original_line_length = sequences.Fasta.line_length sequences.Fasta.line_length = line_length + if check_unique: + used_names = {} for seq in seq_reader: if strip_after_first_whitespace: seq.strip_after_first_whitespace() + if check_unique: + used_names[seq.id] = used_names.get(seq.id, 0) + 1 + if type(seq) == sequences.Fastq: print(sequences.Fasta(seq.id, seq.seq), file=f_out) else: @@ -798,6 +844,18 @@ def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False utils.close(f_out) sequences.Fasta.line_length = original_line_length + if check_unique: + all_unique = True + + for name, count in used_names.items(): + if count > 1: + print('Sequence name "' + name + '" not unique. Found', count, 'times', file=sys.stderr) + all_unique = False + + if not all_unique: + raise Error('Not all sequence names unique. Cannot continue') + + def to_fasta_union(infile, outfile, seqname='union'): seq_reader = sequences.file_reader(infile) diff --git a/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_non_unique.in.fa b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_non_unique.in.fa new file mode 100644 index 0000000..c2e1044 --- /dev/null +++ b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_non_unique.in.fa @@ -0,0 +1,6 @@ +>1 spam +ACGT +>1 eggs +A +>2 +GTTTG diff --git a/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_non_unique.out.fa b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_non_unique.out.fa new file mode 100644 index 0000000..d3080af --- /dev/null +++ b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_non_unique.out.fa @@ -0,0 +1,6 @@ +>1 +ACGT +>1 +A +>2 +GTTTG diff --git a/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_unique.in.fa b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_unique.in.fa new file mode 100644 index 0000000..d3000d3 --- /dev/null +++ b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_unique.in.fa @@ -0,0 +1,6 @@ +>1 abcde +ACGT +>2 abcde +G +>3 hello +GTACCA diff --git a/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_unique.out.fa b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_unique.out.fa new file mode 100644 index 0000000..dc8e2e1 --- /dev/null +++ b/pyfastaq/tests/data/sequences_test.to_fasta.strip_after_whitespace_unique.out.fa @@ -0,0 +1,6 @@ +>1 +ACGT +>2 +G +>3 +GTACCA diff --git a/pyfastaq/tests/data/tasks_test_to_fastg.fasta b/pyfastaq/tests/data/tasks_test_to_fastg.fasta new file mode 100644 index 0000000..2c60bfd --- /dev/null +++ b/pyfastaq/tests/data/tasks_test_to_fastg.fasta @@ -0,0 +1,4 @@ +>seq1 +ATTTG +>seq2 +ACCG diff --git a/pyfastaq/tests/data/tasks_test_to_fastg.fastg b/pyfastaq/tests/data/tasks_test_to_fastg.fastg new file mode 100644 index 0000000..1721e72 --- /dev/null +++ b/pyfastaq/tests/data/tasks_test_to_fastg.fastg @@ -0,0 +1,8 @@ +>NODE_1_length_5_cov_1_ID_seq1; +ATTTG +>NODE_1_length_5_cov_1_ID_seq1'; +CAAAT +>NODE_2_length_4_cov_1_ID_seq2:NODE_2_length_4_cov_1_ID_seq2; +ACCG +>NODE_2_length_4_cov_1_ID_seq2':NODE_2_length_4_cov_1_ID_seq2'; +CGGT diff --git a/pyfastaq/tests/data/tasks_test_to_fastg.ids_to_circularise b/pyfastaq/tests/data/tasks_test_to_fastg.ids_to_circularise new file mode 100644 index 0000000..a2da667 --- /dev/null +++ b/pyfastaq/tests/data/tasks_test_to_fastg.ids_to_circularise @@ -0,0 +1 @@ +seq2 diff --git a/pyfastaq/tests/intervals_test.py b/pyfastaq/tests/intervals_test.py index e899a63..2878859 100644 --- a/pyfastaq/tests/intervals_test.py +++ b/pyfastaq/tests/intervals_test.py @@ -32,6 +32,16 @@ class TestIntervals(unittest.TestCase): self.assertEqual(len(intervals.Interval(1,1)), 1) self.assertEqual(len(intervals.Interval(10,20)), 11) + def test_distance_to_point(self): + '''Test distance_to_point''' + self.assertEqual(0, intervals.Interval(42, 50).distance_to_point(42)) + self.assertEqual(0, intervals.Interval(42, 50).distance_to_point(44)) + self.assertEqual(0, intervals.Interval(42, 50).distance_to_point(50)) + self.assertEqual(1, intervals.Interval(42, 50).distance_to_point(41)) + self.assertEqual(1, intervals.Interval(42, 50).distance_to_point(51)) + self.assertEqual(5, intervals.Interval(42, 50).distance_to_point(55)) + self.assertEqual(5, intervals.Interval(42, 50).distance_to_point(37)) + def test_intersects(self): '''Intersection of two intervals should do the right thing''' a = intervals.Interval(5, 10) diff --git a/pyfastaq/tests/sequences_test.py b/pyfastaq/tests/sequences_test.py index cc22c8d..51e8e2e 100644 --- a/pyfastaq/tests/sequences_test.py +++ b/pyfastaq/tests/sequences_test.py @@ -231,6 +231,36 @@ class TestFasta(unittest.TestCase): self.assertEqual(orfs[i][1], expected[i][1]) + def test_is_complete_orf(self): + '''Test is_complete_orf''' + tests = [ + (sequences.Fasta('ID', 'TTT'), False), + (sequences.Fasta('ID', 'TTTTAA'), True), + (sequences.Fasta('ID', 'TTTTAATAA'), False), + (sequences.Fasta('ID', 'TTGTAA'), True), + (sequences.Fasta('ID', 'TTTAAC'), True), + (sequences.Fasta('ID', 'TGA'), False), + (sequences.Fasta('ID', 'TGAA'), False), + ] + + for t in tests: + self.assertEqual(t[0].is_complete_orf(), t[1]) + + + def test_looks_like_gene(self): + '''Test looks_like_gene''' + tests = [ + (sequences.Fasta('ID', 'TTT'), False), + (sequences.Fasta('ID', 'TTGTAA'), True), + (sequences.Fasta('ID', 'TTGTTTTAA'), True), + (sequences.Fasta('ID', 'TTGTAATTTTAA'), False), + (sequences.Fasta('ID', 'TTGTTTTGAA'), False), + ] + + for t in tests: + self.assertEqual(t[0].looks_like_gene(), t[1]) + + def test_is_all_Ns(self): '''Test is_all_Ns()''' self.assertTrue(sequences.Fasta('ID', 'n').is_all_Ns()) diff --git a/pyfastaq/tests/tasks_test.py b/pyfastaq/tests/tasks_test.py index 6d14ef6..12a9870 100644 --- a/pyfastaq/tests/tasks_test.py +++ b/pyfastaq/tests/tasks_test.py @@ -357,6 +357,29 @@ class TestSequenceTrim(unittest.TestCase): os.unlink(tmp2) +class ToFastg(unittest.TestCase): + def test_to_fastg_ids_set(self): + '''Test to_fastg when ids are a set''' + infile = os.path.join(data_dir, 'tasks_test_to_fastg.fasta') + tmpfile = 'tmp.to_fastg.fastg' + expected = os.path.join(data_dir, 'tasks_test_to_fastg.fastg') + ids = {'seq2'} + tasks.to_fastg(infile, tmpfile, circular=ids) + self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False)) + os.unlink(tmpfile) + + + def test_to_fastg_ids_file(self): + '''Test to_fastg when ids in a file''' + infile = os.path.join(data_dir, 'tasks_test_to_fastg.fasta') + tmpfile = 'tmp.to_fastg.fastg' + expected = os.path.join(data_dir, 'tasks_test_to_fastg.fastg') + ids_file = os.path.join(data_dir, 'tasks_test_to_fastg.ids_to_circularise') + tasks.to_fastg(infile, tmpfile, circular=ids_file) + self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False)) + os.unlink(tmpfile) + + class TestTranslate(unittest.TestCase): def test_translate(self): '''Test translate works in each frame''' @@ -544,6 +567,7 @@ class TestReplaceBases(unittest.TestCase): os.unlink(tmpfile) + class TestSortBySize(unittest.TestCase): def test_sort_by_size(self): '''Test sort_by_size''' @@ -592,6 +616,27 @@ class TestToFasta(unittest.TestCase): self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sequences_test_strip_after_whitespace.fa.to_fasta'), tmpfile)) os.unlink(tmpfile) + def test_to_fasta_strip_after_whitespace_non_unique(self): + '''Test strip_after_whitespace with non-unique names''' + tmpfile = 'tmp.strip_after_whitespace.fa' + infile = os.path.join(data_dir, 'sequences_test.to_fasta.strip_after_whitespace_non_unique.in.fa') + expected = os.path.join(data_dir, 'sequences_test.to_fasta.strip_after_whitespace_non_unique.out.fa') + + with self.assertRaises(tasks.Error): + tasks.to_fasta(infile, tmpfile, strip_after_first_whitespace=True, check_unique=True) + + tasks.to_fasta(infile, tmpfile, strip_after_first_whitespace=True, check_unique=False) + self.assertTrue(filecmp.cmp(tmpfile, expected, shallow=False)) + os.unlink(tmpfile) + + def test_to_fasta_strip_after_whitespace_unique(self): + '''Test strip_after_whitespace with unique names''' + tmpfile = 'tmp.strip_after_whitespace.fa' + infile = os.path.join(data_dir, 'sequences_test.to_fasta.strip_after_whitespace_unique.in.fa') + expected = os.path.join(data_dir, 'sequences_test.to_fasta.strip_after_whitespace_unique.out.fa') + tasks.to_fasta(infile, tmpfile, strip_after_first_whitespace=True, check_unique=True) + self.assertTrue(filecmp.cmp(tmpfile, expected, shallow=False)) + os.unlink(tmpfile) class TestToUniqueByID(unittest.TestCase): def test_to_unique_by_id(self): diff --git a/setup.py b/setup.py index ec726f7..9c355f2 100644 --- a/setup.py +++ b/setup.py @@ -1,18 +1,10 @@ -import os import glob -import sys from setuptools import setup, find_packages -try: - import numpy -except ImportError: - print("Error! numpy for Python3 not found.\nPlease install it (e.g. apt-get install python3-numpy)", file=sys.stderr) - sys.exit(1) - setup( name='pyfastaq', - version='3.2.0', + version='3.6.1', description='Script to manipulate FASTA and FASTQ files, plus API for developers', packages = find_packages(), author='Martin Hunt', @@ -20,7 +12,8 @@ setup( url='https://github.com/sanger-pathogens/Fastaq', scripts=glob.glob('scripts/*'), test_suite='nose.collector', - install_requires=['nose >= 1.3'], + tests_require=['nose >= 1.3'], + install_requires=['numpy >= 1.7.1'], license='GPLv3', classifiers=[ 'Development Status :: 4 - Beta', -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/fastaq.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit
