An alternative approach (I found Yorick's code to
be too slow for a large number of calls):

We can use the file size to pick a random point in the
file. We can read and ignore text until the next newline.
This avoids outputting partial lines. Return the
next line (which I guess is still random :)).

Indicative code -

import os,random

def getrandomline(filename) :
  """Return the line that follows a random byte offset in `filename`.

  A random offset in [0, file size] is chosen, the (possibly partial) line
  found at that offset is read and thrown away, and the line after it is
  returned. Consequently the first line is never returned, and an empty
  string is returned whenever the offset lands in the last line.
  """
  offset = random.randint(0,os.stat(filename)[6])
  fd = open(filename,'rb')
  try:
    fd.seek(offset)
    fd.readline()  # Read and ignore the partial line at the offset
    return fd.readline()
  finally:
    # Release the handle on every path; repeated calls must not pile up
    # open files.
    fd.close()

getrandomline("shaks12.txt")

Caveat: The above code will never choose the 1st line and
will return '' for the last line. Other than these boundary
conditions it will work well (even for large files).

Interestingly :

On modifying this code to take in a file object rather
than a filename, the performance improved by ~50%. On
wrapping it in a class, it further improved by ~25%.

On executing the get-random-line call 100,000 times on a
large file (size 2707519 bytes with 9427 lines), the class
version finished in < 5 seconds.

Platform : 2GHz Intel Core 2 Duo macBook (2GB RAM)
running Mac OSX (10.4.10).

Output using python 2.5.1 (stackless)

Approach using enum approach : 9.55798196793 : for
[100] iterations
Approach using filename : 11.552863121 : for [100000]
iterations
Approach using file descriptor : 5.97015094757 : for
[100000] iterations
Approach using class : 4.46039891243 : for [100000]
iterations

Output using python 2.3.5 (default python on OSX)

Approach using enum approach : 12.2886080742 : for
[100] iterations
Approach using filename : 12.5682640076 : for [100000]
iterations
Approach using file descriptor : 6.55952501297 : for
[100000] iterations
Approach using class : 5.35413718224 : for [100000]
iterations

I am attaching test program FYI.

--
Aditya

--- Nathan Coulter
<[EMAIL PROTECTED]> wrote:

> >  -------Original Message-------
> >  From: Tiger12506 <[EMAIL PROTECTED]>
> 
> >  Yuck. Talk about a one shot function! Of course
> it only reads through the
> >  file once! You only call the function once. Put a
> second print randline(f)
> >  at the bottom of your script and see what happens
> :-)
> >  
> >  JS
> >  
> 
> *sigh*
> 
> #!/bin/env python
> 
> import os
> import random
> 
> text = 'shaks12.txt'
> if not os.path.exists(text):
>   os.system('wget
> http://www.gutenberg.org/dirs/etext94/shaks12.txt')
> 
> def randline(f):
>     for i,j in enumerate(file(f, 'rb')):
>         if random.randint(0,i) == i:
>             line = j
>     return line
> 
> print randline(text)
> print randline(text)
> print randline(text)
> 
> -- 
> Yorick
> _______________________________________________
> Tutor maillist  -  Tutor@python.org
> http://mail.python.org/mailman/listinfo/tutor
> 



 
____________________________________________________________________________________
Sucker-punch spam with award-winning protection. 
Try the free Yahoo! Mail Beta.
http://advision.webevents.yahoo.com/mailbeta/features_spam.html
import os
import random

class randomline :
	"""Pick pseudo-random lines from a file that is opened once and reused.

	The file size is read once in the constructor, so the file is assumed
	not to change while the object is in use. Call close() when done.
	"""

	def __init__(self, filename="largefile.txt") :
		"""Record the size of `filename` and open it for binary reading."""
		self.filesize = os.stat(filename)[6]
		self.fd = open(filename, 'rb')

	def getline(self) :
		"""Return `(offset, line)` for a randomly chosen byte offset.

		The (possibly partial) line at `offset` is discarded and the line
		after it is returned. When the offset falls in the last line the
		search wraps around to the first line, so every line can be
		returned and a non-empty file never yields an empty string. Each
		line is picked with probability proportional to the byte length of
		the line before it (cyclically), so the choice is not uniform.
		For an empty file `(0, <empty string>)` is returned.
		"""
		if self.filesize <= 0 :
			# Nothing to pick from, and randint(0, -1) would raise.
			self.fd.seek(0)
			return (0, self.fd.readline())
		# randint is inclusive at both ends, so stop at the last real byte.
		offset = random.randint(0, self.filesize - 1)
		self.fd.seek(offset)
		self.fd.readline()  # discard the (possibly partial) line at the offset
		line = self.fd.readline()
		if not line :
			# The offset was inside the last line: wrap to the first line.
			self.fd.seek(0)
			line = self.fd.readline()
		return (offset,line)

	def close(self) :
		"""Close the underlying file."""
		self.fd.close()

# Uses file name
def getrandomline(filename) :
	"""Return `(offset, line)` for a random byte offset in `filename`.

	The file is opened in binary mode for this one call and always closed
	again. The (possibly partial) line at `offset` is discarded and the
	following complete line is returned; if the offset falls in the last
	line the search wraps around to the first line. Each line is picked
	with probability proportional to the byte length of the line before it
	(cyclically), so the choice is not uniform. For an empty file
	`(0, <empty string>)` is returned.
	"""
	filesize = os.stat(filename)[6]
	fd = open(filename, 'rb')
	try :
		if filesize <= 0 :
			# Nothing to pick from, and randint(0, -1) would raise.
			return (0, fd.readline())
		# randint is inclusive at both ends, so stop at the last real byte.
		offset = random.randint(0, filesize - 1)
		fd.seek(offset)
		fd.readline()  # discard the partial line so only whole lines are returned
		line = fd.readline()
		if not line :
			# The offset was inside the last line: wrap to the first line.
			fd.seek(0)
			line = fd.readline()
		return (offset, line)
	finally :
		fd.close()

# Uses file descriptor
def getrandline(fd) :
	"""Return `(offset, line)` for a random byte offset in the open file `fd`.

	`fd` must support fileno(), seek() and readline(); it is left open and
	its position is moved. The file size is looked up with os.fstat on each
	call. The (possibly partial) line at `offset` is discarded and the
	following line is returned; if the offset falls in the last line the
	search wraps around to the first line. Each line is picked with
	probability proportional to the byte length of the line before it
	(cyclically), so the choice is not uniform. For an empty file
	`(0, <empty string>)` is returned.
	"""
	filesize = os.fstat(fd.fileno())[6]
	if filesize <= 0 :
		# Nothing to pick from, and randint(0, -1) would raise.
		fd.seek(0)
		return (0, fd.readline())
	# randint is inclusive at both ends, so stop at the last real byte.
	offset = random.randint(0, filesize - 1)
	fd.seek(offset)
	fd.readline()  # discard the (possibly partial) line at the offset
	line = fd.readline()
	if not line :
		# The offset was inside the last line: wrap to the first line.
		fd.seek(0)
		line = fd.readline()
	return (offset, line)

# Uses enumeration
def randline(fd):
	"""Return one line chosen uniformly at random from the open file `fd`.

	Every remaining line of `fd` is read once, starting from its current
	position. The i-th line read (counting from 0) replaces the current
	choice when randint(0, i) == i, i.e. with probability 1/(i+1), which
	leaves each line equally likely at the end. `fd` is rewound to the
	start before returning, even if reading fails part-way. If no line
	could be read (empty file, or `fd` already at end of file) an empty
	string of the file's own type is returned.
	"""
	chosen = None
	try :
		for index, candidate in enumerate(fd) :
			if random.randint(0, index) == index :
				chosen = candidate
	finally :
		# Rewind so the same handle can be sampled again.
		fd.seek(0)
	if chosen is None :
		# No line was read. read(0) yields an empty string of the right
		# type for the file's mode without moving the file position.
		chosen = fd.read(0)
	return chosen


if __name__ == '__main__' :
	# Demo and benchmark driver: run each approach once and print its
	# result, then time each approach with timeit and print the totals.

	# Substitute your file name
	filename = "largefile.txt"

	# Class
	rd = randomline(filename)
	try :
		print(rd.getline())
	finally :
		rd.close()

	# file name
	print(getrandomline(filename))

	# file descriptor
	fd = open(filename,'rb')
	try :
		print(getrandline(fd))
	finally :
		fd.close()

	# Using enum approach
	fd = open(filename,'rb')
	try :
		print(randline(fd))
	finally :
		fd.close()

	from timeit import Timer
	# %r embeds the file name as a correctly quoted literal, so names
	# containing quotes or backslashes do not break the generated code.
	# The benchmark handles are opened in 'rb' mode like the demo above.
	t_class = Timer('rd.getline()', 'from __main__ import randomline ; rd = randomline(%r)' % (filename,))
	t_filename = Timer('getrandomline(%r)' % (filename,), 'from __main__ import getrandomline')
	t_fd = Timer('getrandline(fd)', 'from __main__ import getrandline ; fd = open(%r, "rb")' % (filename,))
	t_enum = Timer('randline(fd)', 'from __main__ import randline ; fd = open(%r, "rb")' % (filename,))

	# The enum approach reads the whole file per call, so it gets far
	# fewer iterations than the seek-based approaches.
	print('Approach using enum approach : %s : for [%d] iterations' % (str(t_enum.timeit(100)),100))
	print('Approach using filename : %s : for [%d] iterations' % (str(t_filename.timeit(100000)),100000))
	print('Approach using file descriptor : %s : for [%d] iterations' % (str(t_fd.timeit(100000)),100000))
	print('Approach using class : %s : for [%d] iterations' % (str(t_class.timeit(100000)),100000))

_______________________________________________
Tutor maillist  -  Tutor@python.org
http://mail.python.org/mailman/listinfo/tutor

Reply via email to