#!/usr/bin/python
"""Natural-order sorting, supporting embedded numbers.
foo9bar2 < foo10bar2 < foo10bar10
"""
import random, re, sys
def natsort_key(item):
chunks = re.split('(\d+(?:\.\d+)?)', item)
for ii in range(len(chunks)):
if chunks[ii] and chunks[ii][0] in '0123456789':
if '.' in chunks[ii]: numtype = float
else: numtype = int
# wrap in tuple with '0' to explicitly specify numbers come first
chunks[ii] = (0, numtype(chunks[ii]))
else:
chunks[ii] = (1, chunks[ii])
return (chunks, item)
def natsort(seq):
"Sort a sequence of text strings in a reasonable order."
alist = [item for item in seq]
alist.sort(key=natsort_key)
return alist
def ok(a, b): assert a == b, (a, b)
def test():
ok(natsort('foo10bar10 foo9bar2 foo10bar2'.split()),
'foo9bar2 foo10bar2 foo10bar10'.split())
nums = map(str, range(100))
random.shuffle(nums)
ok(natsort(nums), map(str, range(100)))
assert nums != map(str, range(100)) # didn't get mutated...
# compare lexically when numerical value identical
ok(natsort('b1 a2 a1 b01 b2'.split()), 'a1 a2 b01 b1 b2'.split())
# A lazy implementor might support only short numbers; this prevents that:
ok(natsort(['a100000000000000000000000.txt',
'a99999999999999999999999.txt']),
['a99999999999999999999999.txt', 'a100000000000000000000000.txt'])
# Don't interpret 033 as octal (decimal 27).
ok(natsort(['033', '32']), ['32', '033'])
# This one was hard and possibly not worth it: use floating point
# to handle non-integer numbers. The cost is that sequences of
# integers separated by dots will lose precision in some cases,
# and also may sort funny.
# e.g. 1234.56789012345678901234567890.34567890.
ok(natsort(['2.49', '2.5', '2.6']), ['2.49', '2.5', '2.6'])
# Make sure we can still handle sequences of integers with dots.
ok(natsort(['1.2.3', '1.3.4']), ['1.2.3', '1.3.4'])
# There are cases where we lose on floating-point precision,
# although usually then the fallback to pure lexical comparison
# saves us:
ok(natsort(['1.3000000000000000000000000000000000000000000000000000001',
'1.3000000000000000000000000000000000000000000000000000000']),
['1.3000000000000000000000000000000000000000000000000000000',
'1.3000000000000000000000000000000000000000000000000000001'])
### Here are some cases where we get suboptimal answers:
# Hexadecimal numbers get mangled.
ok(natsort(['0x99', '0x9a']), ['0x9a', '0x99'])
# In cases where you expect asciibetical order, you will be
# disappointed.
ok(natsort([',', '5', ':']), ['5', ',', ':'])
# Fallback to pure lexical comparison doesn't always save us on
# floating-point precision:
ok(natsort(['01.3000000000000000000000000000000000000000000000000000001',
'1.3000000000000000000000000000000000000000000000000000000']),
['01.3000000000000000000000000000000000000000000000000000001',
'1.3000000000000000000000000000000000000000000000000000000'])
test()
if __name__ == '__main__':
sys.stdout.writelines(natsort(sys.stdin))