You need to reset your items_dict when you see an hg17 line.

Here is one way to do it. I used a class to make it easier to break the problem into functions. Putting the functions in a class makes it easy to share the header and counts.

class Grouper:
    ''' Process a sequence of strings of the form
        Header
        Data
        Data

        Header
        ...

        Look for repeated Data items under a single Header. When found, print
        the Header and the repeated item.

        Possible usage:
        out = open('outfile.txt', 'w')
        Grouper().process(open('infile.txt'), 'hg17', out)
        out.close()
    '''

    def reset(self, header='No header'):
        ''' Reset the current header and the per-group counts. '''
        self.currHeader = header
        self.counts = {}

    def process(self, data, headerStart, out):
        ''' Find duplicates within groups of lines of data.

            data: iterable of lines (e.g. an open file).
            headerStart: prefix that marks a line as a group header.
            out: writable file-like object the report is written to.
        '''
        self.reset()

        for line in data:
            line = line.strip()  # get rid of newlines from file input

            if line.startswith(headerStart):
                # Found a new header line, show the current group and restart
                self.showDups(out)
                self.reset(line)

            elif line:
                # Found a data line, count it
                self.counts[line] = self.counts.get(line, 0) + 1

        # Show the last group
        self.showDups(out)

    def showDups(self, out):
        ''' Write the current header and every item counted more than once.
            Writes nothing at all when the group has no duplicates. '''
        # Get list of items with count > 1
        items = [(k, cnt) for k, cnt in self.counts.items() if cnt > 1]

        # Show the items, sorted once (the original sorted them twice)
        if items:
            items.sort()
            print(self.currHeader, file=out)
            for k, cnt in items:
                print('%s occurs %d times' % (k, cnt), file=out)
            print(file=out)


if __name__ == '__main__':
    import sys

    # Demo input: 'hg17...' lines are group headers, the indented
    # ENST lines are the data items counted within each group.
    data = '''hg17_chainMm5_chr15 range=chr7:148238502-148239073
    ENST00000339563.1
    ENST00000342196.1
    ENST00000339563.1
    ENST00000344055.1

    hg17_chainMm5_chr13 range=chr5:42927967-42928726
    ENST00000279800.3
    ENST00000309556.3
    ENST00000279800.3

    hg17_chainMm5_chr6 range=chr1:155548627-155549517
    ENST00000321157.3
    ENST00000256324.4'''.split('\n')

    Grouper().process(data, 'hg17', sys.stdout)


Kent

Scott Melnyk wrote:
Hello once more.

I am stuck on how best to apply the "finding unique items in lists" ideas to my file.

I am stuck at the level below: what I have here, taken from the unique
items thread, does not work, because I need to separate each grouping by the
hg chain it belongs to (see below for examples).

import sys

# NOTE(review): opened at import time from the command line and never
# closed — fine for a throwaway script, but confirm sys.argv[1] is given.
WFILE = open(sys.argv[1], 'w')

def get_list_dup_dict(fname='Z:/datasets/fooyoo.txt', threshold=2):
    ''' Count every line of fname; return a dict mapping each line that
        occurred exactly `threshold` times to its count. '''
    a_list = open(fname, 'r')
    # print "beginning get_list_dup"
    items_dict, dup_dict = {}, {}
    for i in a_list:
        items_dict[i] = items_dict.get(i, 0) + 1

    for k, v in items_dict.items():
        if v == threshold:
            dup_dict[k] = v

    return dup_dict

def print_list_dup_report(fname='Z:/datasets/fooyoo.txt', threshold=2):
    ''' Write a "<line> occurred <n> times" report for fname to WFILE. '''
    # print "Beginning report generation"
    # Pass the caller's arguments through instead of re-hardcoding them.
    dup_dict = get_list_dup_dict(fname=fname, threshold=threshold)
    for k, v in sorted(dup_dict.items()):
        # The original `print WFILE, ...` printed the file object itself
        # to stdout; redirect the message into WFILE instead.
        print('%s occurred %s times' % (k, v), file=WFILE)

if __name__ == '__main__':
    print_list_dup_report()


My issue is that my file is as follows:

hg17_chainMm5_chr15 range=chr7:148238502-148239073
ENST00000339563.1
ENST00000342196.1
ENST00000339563.1
ENST00000344055.1

hg17_chainMm5_chr13 range=chr5:42927967-42928726
ENST00000279800.3
ENST00000309556.3

hg17_chainMm5_chr6 range=chr1:155548627-155549517
ENST00000321157.3
ENST00000256324.4
I need a printout that would give the hg17... line and then any
instances of the ENST lines that occur more than once, only within that chain
section. Even better, it would only print the hg17 line if it is followed
by an ENST line that occurs more than once.


I am hoping for something that gives me an out file roughly like:

hg17_chainMm5_chr15 range=chr7:148238502-148239073
ENST00000339563.1 occurs 2 times

hg17_chainMm5_chr13 range=chr5:42927967-42928726
ENST00000279800.3 occurs 2 times


All help and ideas appreciated, I am trying to get this finished as
soon as possible, the output file will be used to go back to my 2 gb
file and pull out the rest of the data I need.

Thanks,
Scott
_______________________________________________
Tutor maillist  -  Tutor@python.org
http://mail.python.org/mailman/listinfo/tutor


_______________________________________________ Tutor maillist - Tutor@python.org http://mail.python.org/mailman/listinfo/tutor

Reply via email to