An alternative approach (I found Yorick's code to
be too slow for a large number of calls):
We can use the file size to pick a random offset in the
file, then read and discard text up to the next newline.
This avoids returning a partial line. We then return the
line that follows (which I guess is still random :)).
Indicative code -
import os,random
def getrandomline(filename):
    """Return a pseudo-random line of *filename* without scanning the file.

    A random byte offset between 0 and the file size (inclusive) is chosen,
    the remainder of the line containing that offset is read and discarded,
    and the line that follows is returned.

    Boundary behaviour: the first line of the file can never be returned,
    and an empty string is returned when the offset falls in the last line
    (or at end of file).

    filename -- path of the file to sample; it is opened in binary mode.
    """
    # Index 6 of the stat result is st_size, the file size in bytes.
    offset = random.randint(0, os.stat(filename)[6])
    fd = open(filename, 'rb')
    try:
        fd.seek(offset)
        fd.readline()  # Read and ignore the (probably partial) line
        return fd.readline()
    finally:
        # Close explicitly so repeated calls do not pile up open handles.
        fd.close()
getrandomline("shaks12.txt")
Caveat: The above code will never choose the first line,
and will return '' when the random offset lands in the
last line. Other than these boundary conditions it will
work well (even for large files).
Interestingly :
On modifying this code to take in file object rather
than filename, the performance improved by ~50%. On
wrapping it in a class, it further improved by ~25%.
When getting a random line 100,000 times from a
large file (size 2707519 bytes with 9427 lines), the
class version finished in under 5 seconds.
Platform : 2GHz Intel Core 2 Duo macBook (2GB RAM)
running Mac OSX (10.4.10).
Output using python 2.5.1 (stackless)
Approach using enum approach : 9.55798196793 : for
[100] iterations
Approach using filename : 11.552863121 : for [100000]
iterations
Approach using file descriptor : 5.97015094757 : for
[100000] iterations
Approach using class : 4.46039891243 : for [100000]
iterations
Output using python 2.3.5 (default python on OSX)
Approach using enum approach : 12.2886080742 : for
[100] iterations
Approach using filename : 12.5682640076 : for [100000]
iterations
Approach using file descriptor : 6.55952501297 : for
[100000] iterations
Approach using class : 5.35413718224 : for [100000]
iterations
I am attaching test program FYI.
--
Aditya
--- Nathan Coulter
<[EMAIL PROTECTED]> wrote:
> > -------Original Message-------
> > From: Tiger12506 <[EMAIL PROTECTED]>
>
> > Yuck. Talk about a one shot function! Of course
> it only reads through the
> > file once! You only call the function once. Put a
> second print randline(f)
> > at the bottom of your script and see what happens
> :-)
> >
> > JS
> >
>
> *sigh*
>
> #!/bin/env python
>
> import os
> import random
>
> text = 'shaks12.txt'
> if not os.path.exists(text):
> os.system('wget
> http://www.gutenberg.org/dirs/etext94/shaks12.txt')
>
> def randline(f):
> for i,j in enumerate(file(f, 'rb')):
> if random.randint(0,i) == i:
> line = j
> return line
>
> print randline(text)
> print randline(text)
> print randline(text)
>
> --
> Yorick
> _______________________________________________
> Tutor maillist - Tutor@python.org
> http://mail.python.org/mailman/listinfo/tutor
>
____________________________________________________________________________________
Sucker-punch spam with award-winning protection.
Try the free Yahoo! Mail Beta.
http://advision.webevents.yahoo.com/mailbeta/features_spam.html
import os
import random
class randomline:
    """Pick pseudo-random lines from a file by seeking to random offsets.

    The file size is read once and the file is kept open between calls, so
    each getline() costs one seek and a couple of readline() calls.
    Call close() when finished.
    """

    def __init__(self, filename="largefile.txt"):
        """Record the size of *filename* and open it in binary mode."""
        # Index 6 of the stat result is st_size, the file size in bytes.
        self.filesize = os.stat(filename)[6]
        self.fd = open(filename, 'rb')

    def getline(self):
        """Return an ``(offset, line)`` tuple for a randomly chosen line.

        *offset* is the random byte position that was drawn (0 to the file
        size, inclusive); *line* is the first complete line starting after
        that position. When the offset falls in the last line or at end of
        file, the selection wraps around to the first line, so every line
        can be returned and an empty string is only returned for an empty
        file.
        """
        offset = random.randint(0, self.filesize)
        self.fd.seek(offset)
        self.fd.readline()  # discard the (probably partial) line
        line = self.fd.readline()
        if not line:
            # Nothing follows the discarded line: wrap to the first line.
            self.fd.seek(0)
            line = self.fd.readline()
        return (offset, line)

    def close(self):
        """Close the underlying file."""
        self.fd.close()
# Uses file name
def getrandomline(filename):
    """Return ``(offset, line)`` for a randomly chosen line of *filename*.

    The file is opened in binary mode on every call. *offset* is the random
    byte position that was drawn (0 to the file size, inclusive). The rest
    of the line containing that position is discarded so that a partial
    line is never returned; *line* is the complete line that follows. When
    nothing follows (offset in the last line or at end of file) the
    selection wraps around to the first line. An empty string is returned
    only for an empty file.
    """
    # Index 6 of the stat result is st_size, the file size in bytes.
    offset = random.randint(0, os.stat(filename)[6])
    fd = open(filename, 'rb')
    try:
        fd.seek(offset)
        fd.readline()  # discard the (probably partial) line
        line = fd.readline()
        if not line:
            # Nothing follows the discarded line: wrap to the first line.
            fd.seek(0)
            line = fd.readline()
    finally:
        # Always release the handle, even if a read fails.
        fd.close()
    return (offset, line)
# Uses file descriptor
def getrandline(fd):
    """Return ``(offset, line)`` for a randomly chosen line of open file *fd*.

    *fd* must be a seekable file object with a real descriptor (its size is
    taken from ``os.fstat(fd.fileno())`` on each call). *offset* is the
    random byte position that was drawn (0 to the file size, inclusive).
    The rest of the line containing that position is discarded and the
    complete line that follows is returned. When nothing follows (offset in
    the last line or at end of file) the selection wraps around to the
    first line. The file is left open, positioned after the returned line.
    """
    # Index 6 of the stat result is st_size, the file size in bytes.
    offset = random.randint(0, os.fstat(fd.fileno())[6])
    fd.seek(offset)
    fd.readline()  # discard the (probably partial) line
    line = fd.readline()
    if not line:
        # Nothing follows the discarded line: wrap to the first line.
        fd.seek(0)
        line = fd.readline()
    return (offset, line)
# Uses enumeration
def randline(fd):
    """Return one line of open file *fd*, chosen uniformly at random.

    Every remaining line of *fd* is read once. While scanning, line number
    ``i`` (counting from 0) replaces the current choice with probability
    ``1 / (i + 1)``, so each line ends up equally likely without the line
    count being known in advance. *fd* is rewound to the start before
    returning so the function can be called again on the same object.

    If no line is read at all (for example an empty file), the empty value
    that ``readline()`` produces at end of file is returned.
    """
    chosen = None
    for index, candidate in enumerate(fd):
        if random.randint(0, index) == index:
            chosen = candidate
    if chosen is None:
        # Nothing was iterated, so fd is at end of file; readline() there
        # yields an empty value of the same type as the file's lines.
        chosen = fd.readline()
    fd.seek(0)
    return chosen
if __name__ == '__main__':
    # Demo and benchmark driver: print one sample from each approach, then
    # time each approach with timeit.

    # Substitute your file name
    filename = "largefile.txt"

    # Class
    rd = randomline(filename)
    print(rd.getline())
    rd.close()

    # file name
    print(getrandomline(filename))

    # file descriptor
    fd = open(filename, 'rb')
    print(getrandline(fd))
    fd.close()

    # Using enum approach
    fd = open(filename, 'rb')
    print(randline(fd))
    fd.close()

    from timeit import Timer

    # The filename is embedded with %r so that it appears in the generated
    # code as a properly quoted literal, even if it contains quotes or
    # backslashes. Timed file objects use binary mode like the demo above.
    t_class = Timer('rd.getline()',
                    'from __main__ import randomline ; rd = randomline(%r)' % (filename,))
    t_filename = Timer('getrandomline(%r)' % (filename,),
                       'from __main__ import getrandomline')
    t_fd = Timer('getrandline(fd)',
                 'from __main__ import getrandline ; fd = open(%r, "rb")' % (filename,))
    t_enum = Timer('randline(fd)',
                   'from __main__ import randline ; fd = open(%r, "rb")' % (filename,))

    # The enumeration approach scans the whole file on every call, so it is
    # timed over far fewer iterations than the seek-based approaches.
    print('Approach using enum approach : %s : for [%d] iterations' % (str(t_enum.timeit(100)), 100))
    print('Approach using filename : %s : for [%d] iterations' % (str(t_filename.timeit(100000)), 100000))
    print('Approach using file descriptor : %s : for [%d] iterations' % (str(t_fd.timeit(100000)), 100000))
    print('Approach using class : %s : for [%d] iterations' % (str(t_class.timeit(100000)), 100000))
_______________________________________________
Tutor maillist - Tutor@python.org
http://mail.python.org/mailman/listinfo/tutor