Hello community, here is the log from the commit of package python-Unidecode for openSUSE:Factory checked in at 2016-01-26 10:15:07 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/python-Unidecode (Old) and /work/SRC/openSUSE:Factory/.python-Unidecode.new (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-Unidecode" Changes: -------- --- /work/SRC/openSUSE:Factory/python-Unidecode/python-Unidecode.changes 2015-11-18 22:34:33.000000000 +0100 +++ /work/SRC/openSUSE:Factory/.python-Unidecode.new/python-Unidecode.changes 2016-01-26 10:15:08.000000000 +0100 @@ -1,0 +2,7 @@ +Fri Jan 22 19:02:19 UTC 2016 - [email protected] + +- update to Unidecode 0.04.19: + * Add unidecode_expect_ascii() and unidecode_expect_nonascii() + functions for performance critical applications. + +------------------------------------------------------------------- Old: ---- Unidecode-0.04.18.tar.gz New: ---- Unidecode-0.04.19.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ python-Unidecode.spec ++++++ --- /var/tmp/diff_new_pack.T9Cqi3/_old 2016-01-26 10:15:09.000000000 +0100 +++ /var/tmp/diff_new_pack.T9Cqi3/_new 2016-01-26 10:15:09.000000000 +0100 @@ -1,7 +1,7 @@ # # spec file for package python-Unidecode # -# Copyright (c) 2015 SUSE LINUX GmbH, Nuernberg, Germany. +# Copyright (c) 2016 SUSE LINUX GmbH, Nuernberg, Germany. # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -17,7 +17,7 @@ Name: python-Unidecode -Version: 0.04.18 +Version: 0.04.19 Release: 0 Summary: ASCII transliterations of Unicode text License: GPL-2.0+ ++++++ Unidecode-0.04.18.tar.gz -> Unidecode-0.04.19.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Unidecode-0.04.18/ChangeLog new/Unidecode-0.04.19/ChangeLog --- old/Unidecode-0.04.18/ChangeLog 2015-06-13 13:48:20.000000000 +0200 +++ new/Unidecode-0.04.19/ChangeLog 2016-01-21 17:42:06.000000000 +0100 @@ -1,3 +1,9 @@ +2016-01-21 unidecode 0.04.19 + + * Add unidecode_expect_ascii() and unidecode_expect_nonascii() + functions for performance critical applications. + (thanks to Israel Saeta Perez) + 2015-06-13 unidecode 0.04.18 * Add a command line utility. (thanks to Andrew Udvare) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Unidecode-0.04.18/MANIFEST.in new/Unidecode-0.04.19/MANIFEST.in --- old/Unidecode-0.04.18/MANIFEST.in 2015-06-13 14:12:54.000000000 +0200 +++ new/Unidecode-0.04.19/MANIFEST.in 2016-01-21 17:49:08.000000000 +0100 @@ -2,3 +2,4 @@ include ChangeLog include LICENSE include README.rst +recursive-include tests *.py diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Unidecode-0.04.18/PKG-INFO new/Unidecode-0.04.19/PKG-INFO --- old/Unidecode-0.04.18/PKG-INFO 2015-06-13 14:17:25.000000000 +0200 +++ new/Unidecode-0.04.19/PKG-INFO 2016-01-21 18:14:54.000000000 +0100 @@ -1,6 +1,6 @@ Metadata-Version: 1.1 Name: Unidecode -Version: 0.04.18 +Version: 0.04.19 Summary: ASCII transliterations of Unicode text Home-page: UNKNOWN Author: Tomaz Solc @@ -13,7 +13,7 @@ represent it in ASCII. For example when integrating with legacy code that doesn't support Unicode, or for ease of entry of non-Roman names on a US keyboard, or when constructing ASCII machine identifiers from - human-readable Unicode strings that should still be somewhat intelligeble + human-readable Unicode strings that should still be somewhat intelligible (a popular example of this is when making an URL slug from an article title). @@ -50,9 +50,9 @@ Module content -------------- - The module exports a single function that takes an Unicode object (Python - 2.x) or string (Python 3.x) and returns a string (that can be encoded to - ASCII bytes in Python 3.x):: + The module exports a function that takes an Unicode object (Python 2.x) or + string (Python 3.x) and returns a string (that can be encoded to ASCII bytes in + Python 3.x):: >>> from unidecode import unidecode >>> unidecode(u'ko\u017eu\u0161\u010dek') @@ -105,13 +105,39 @@ Installation ------------ - To install Unidecode from the source distribution and run unit tests, use these - commands:: + To install the latest version of Unidecode from the Python package index, use + these commands:: + + $ pip install unidecode + + To install Unidecode from the source distribution and run unit tests, use:: $ python setup.py install $ python setup.py test + Performance notes + ----------------- + + By default, `unidecode` optimizes for the use case where most of the strings + passed to it are already ASCII-only and no transliteration is necessary (this + default might change in future versions). + + For performance critical applications, two additional functions are exposed: + + `unidecode_expect_ascii` is optimized for ASCII-only inputs (approximately 5 + times faster than `unidecode_expect_nonascii` on 10 character strings, more on + longer strings), but slightly slower for non-ASCII inputs. + + `unidecode_expect_nonascii` takes approximately the same amount of time on + ASCII and non-ASCII inputs, but is slightly faster for non-ASCII inputs than + `unidecode_expect_ascii`. + + Apart from differences in run time, both functions produce identical results. + For most users of Unidecode, the difference in performance should be + negligible. + + Source ------ @@ -136,7 +162,7 @@ Python code and later additions: - Copyright 2015, Tomaz Solc <[email protected]> + Copyright 2016, Tomaz Solc <[email protected]> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Unidecode-0.04.18/README.rst new/Unidecode-0.04.19/README.rst --- old/Unidecode-0.04.18/README.rst 2015-06-13 14:00:40.000000000 +0200 +++ new/Unidecode-0.04.19/README.rst 2016-01-21 17:56:47.000000000 +0100 @@ -5,7 +5,7 @@ represent it in ASCII. For example when integrating with legacy code that doesn't support Unicode, or for ease of entry of non-Roman names on a US keyboard, or when constructing ASCII machine identifiers from -human-readable Unicode strings that should still be somewhat intelligeble +human-readable Unicode strings that should still be somewhat intelligible (a popular example of this is when making an URL slug from an article title). @@ -42,9 +42,9 @@ Module content -------------- -The module exports a single function that takes an Unicode object (Python -2.x) or string (Python 3.x) and returns a string (that can be encoded to -ASCII bytes in Python 3.x):: +The module exports a function that takes an Unicode object (Python 2.x) or +string (Python 3.x) and returns a string (that can be encoded to ASCII bytes in +Python 3.x):: >>> from unidecode import unidecode >>> unidecode(u'ko\u017eu\u0161\u010dek') @@ -97,13 +97,39 @@ Installation ------------ -To install Unidecode from the source distribution and run unit tests, use these -commands:: +To install the latest version of Unidecode from the Python package index, use +these commands:: + + $ pip install unidecode + +To install Unidecode from the source distribution and run unit tests, use:: $ python setup.py install $ python setup.py test +Performance notes +----------------- + +By default, `unidecode` optimizes for the use case where most of the strings +passed to it are already ASCII-only and no transliteration is necessary (this +default might change in future versions). + +For performance critical applications, two additional functions are exposed: + +`unidecode_expect_ascii` is optimized for ASCII-only inputs (approximately 5 +times faster than `unidecode_expect_nonascii` on 10 character strings, more on +longer strings), but slightly slower for non-ASCII inputs. + +`unidecode_expect_nonascii` takes approximately the same amount of time on +ASCII and non-ASCII inputs, but is slightly faster for non-ASCII inputs than +`unidecode_expect_ascii`. + +Apart from differences in run time, both functions produce identical results. +For most users of Unidecode, the difference in performance should be +negligible. + + Source ------ @@ -128,7 +154,7 @@ Python code and later additions: -Copyright 2015, Tomaz Solc <[email protected]> +Copyright 2016, Tomaz Solc <[email protected]> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Unidecode-0.04.18/Unidecode.egg-info/PKG-INFO new/Unidecode-0.04.19/Unidecode.egg-info/PKG-INFO --- old/Unidecode-0.04.18/Unidecode.egg-info/PKG-INFO 2015-06-13 14:17:25.000000000 +0200 +++ new/Unidecode-0.04.19/Unidecode.egg-info/PKG-INFO 2016-01-21 18:14:54.000000000 +0100 @@ -1,6 +1,6 @@ Metadata-Version: 1.1 Name: Unidecode -Version: 0.04.18 +Version: 0.04.19 Summary: ASCII transliterations of Unicode text Home-page: UNKNOWN Author: Tomaz Solc @@ -13,7 +13,7 @@ represent it in ASCII. For example when integrating with legacy code that doesn't support Unicode, or for ease of entry of non-Roman names on a US keyboard, or when constructing ASCII machine identifiers from - human-readable Unicode strings that should still be somewhat intelligeble + human-readable Unicode strings that should still be somewhat intelligible (a popular example of this is when making an URL slug from an article title). @@ -50,9 +50,9 @@ Module content -------------- - The module exports a single function that takes an Unicode object (Python - 2.x) or string (Python 3.x) and returns a string (that can be encoded to - ASCII bytes in Python 3.x):: + The module exports a function that takes an Unicode object (Python 2.x) or + string (Python 3.x) and returns a string (that can be encoded to ASCII bytes in + Python 3.x):: >>> from unidecode import unidecode >>> unidecode(u'ko\u017eu\u0161\u010dek') @@ -105,13 +105,39 @@ Installation ------------ - To install Unidecode from the source distribution and run unit tests, use these - commands:: + To install the latest version of Unidecode from the Python package index, use + these commands:: + + $ pip install unidecode + + To install Unidecode from the source distribution and run unit tests, use:: $ python setup.py install $ python setup.py test + Performance notes + ----------------- + + By default, `unidecode` optimizes for the use case where most of the strings + passed to it are already ASCII-only and no transliteration is necessary (this + default might change in future versions). + + For performance critical applications, two additional functions are exposed: + + `unidecode_expect_ascii` is optimized for ASCII-only inputs (approximately 5 + times faster than `unidecode_expect_nonascii` on 10 character strings, more on + longer strings), but slightly slower for non-ASCII inputs. + + `unidecode_expect_nonascii` takes approximately the same amount of time on + ASCII and non-ASCII inputs, but is slightly faster for non-ASCII inputs than + `unidecode_expect_ascii`. + + Apart from differences in run time, both functions produce identical results. + For most users of Unidecode, the difference in performance should be + negligible. + + Source ------ @@ -136,7 +162,7 @@ Python code and later additions: - Copyright 2015, Tomaz Solc <[email protected]> + Copyright 2016, Tomaz Solc <[email protected]> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Unidecode-0.04.18/setup.py new/Unidecode-0.04.19/setup.py --- old/Unidecode-0.04.18/setup.py 2015-06-13 13:50:47.000000000 +0200 +++ new/Unidecode-0.04.19/setup.py 2016-01-21 17:37:21.000000000 +0100 @@ -7,7 +7,7 @@ return open(os.path.join(os.path.dirname(__file__), "README.rst")).read() setup(name='Unidecode', - version='0.04.18', + version='0.04.19', description='ASCII transliterations of Unicode text', license='GPL', long_description=get_long_description(), diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Unidecode-0.04.18/tests/test_unidecode.py new/Unidecode-0.04.19/tests/test_unidecode.py --- old/Unidecode-0.04.18/tests/test_unidecode.py 2015-05-19 16:05:34.000000000 +0200 +++ new/Unidecode-0.04.19/tests/test_unidecode.py 2015-12-10 14:25:48.000000000 +0100 @@ -2,7 +2,7 @@ # vim:ts=4 sw=4 expandtab softtabstop=4 import unittest import sys -from unidecode import unidecode +from unidecode import unidecode, unidecode_expect_ascii, unidecode_expect_nonascii import warnings # workaround for Python < 2.7 @@ -55,7 +55,7 @@ else: return x.decode('unicode-escape') -class TestUnidecode(unittest.TestCase): +class BaseTestUnidecode(): @unittest.skipIf(sys.version_info[0] >= 3, "not python 2") def test_ascii_warning(self): wlog = WarningLogger() @@ -63,7 +63,10 @@ for n in range(0,128): t = chr(n) - self.assertEqual(unidecode(t), t) + + r = self.unidecode(t) + self.assertEqual(r, t) + self.assertEqual(type(r), str) # Passing string objects to unidecode should raise a warning self.assertEqual(128, len(wlog.log)) @@ -76,7 +79,10 @@ for n in range(0,128): t = _chr(n) - self.assertEqual(unidecode(t), t) + + r = self.unidecode(t) + self.assertEqual(r, t) + self.assertEqual(type(r), str) # unicode objects shouldn't raise warnings self.assertEqual(0, len(wlog.log)) @@ -91,7 +97,7 @@ # Just check that it doesn't throw an exception t = _chr(n) - unidecode(t) + self.unidecode(t) def test_surrogates(self): wlog = WarningLogger() @@ -99,7 +105,7 @@ for n in range(0xd800, 0xe000): t = _chr(n) - s = unidecode(t) + s = self.unidecode(t) # Check that surrogate characters translate to nothing. self.assertEqual('', s) @@ -128,8 +134,8 @@ wlog = WarningLogger() wlog.start("Surrogate character") - a = unidecode(s) - a_sp = unidecode(s_sp) + a = self.unidecode(s) + a_sp = self.unidecode(s_sp) self.assertEqual('T', a) @@ -142,7 +148,7 @@ # 1 sequence of a-z for n in range(0, 26): a = chr(ord('a') + n) - b = unidecode(_chr(0x24d0 + n)) + b = self.unidecode(_chr(0x24d0 + n)) self.assertEqual(b, a) @@ -157,7 +163,7 @@ a = chr(ord('A') + n % 26) else: a = chr(ord('a') + n % 26) - b = unidecode(_chr(n)) + b = self.unidecode(_chr(n)) if not b: empty += 1 @@ -171,7 +177,7 @@ # 5 consecutive sequences of 0-9 for n in range(0x1d7ce, 0x1d800): a = chr(ord('0') + (n-0x1d7ce) % 10) - b = unidecode(_chr(n)) + b = self.unidecode(_chr(n)) self.assertEqual(b, a) @@ -219,13 +225,13 @@ (_u('\ua500'), ''), - # Table that has less than 256 entriees + # Table that has less than 256 entries (_u('\u1eff'), ''), ] for input, correct_output in TESTS: - test_output = unidecode(input) + test_output = self.unidecode(input) self.assertEqual(test_output, correct_output) self.assertTrue(isinstance(test_output, str)) @@ -246,7 +252,7 @@ ] for input, correct_output in TESTS: - test_output = unidecode(input) + test_output = self.unidecode(input) self.assertEqual(test_output, correct_output) self.assertTrue(isinstance(test_output, str)) @@ -455,7 +461,7 @@ else: inp = ''.join(map(chr, utf8_input)).decode('utf8') - output = unidecode(inp) + output = self.unidecode(inp) self.assertEqual(correct_output, output) @@ -478,7 +484,7 @@ ] for s in lower: - o = unidecode(s) + o = self.unidecode(s) self.assertEqual('the quick brown fox jumps over the lazy dog 1234567890', o) @@ -498,9 +504,18 @@ ] for s in upper: - o = unidecode(s) + o = self.unidecode(s) self.assertEqual('THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG 1234567890', o) +class TestUnidecode(BaseTestUnidecode, unittest.TestCase): + unidecode = staticmethod(unidecode) + +class TestUnidecodeExpectASCII(BaseTestUnidecode, unittest.TestCase): + unidecode = staticmethod(unidecode_expect_ascii) + +class TestUnidecodeExpectNonASCII(BaseTestUnidecode, unittest.TestCase): + unidecode = staticmethod(unidecode_expect_nonascii) + if __name__ == "__main__": unittest.main() diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Unidecode-0.04.18/unidecode/__init__.py new/Unidecode-0.04.19/unidecode/__init__.py --- old/Unidecode-0.04.18/unidecode/__init__.py 2014-12-07 18:54:21.000000000 +0100 +++ new/Unidecode-0.04.19/unidecode/__init__.py 2015-12-10 14:25:48.000000000 +0100 @@ -19,19 +19,53 @@ Cache = {} -def unidecode(string): + +def _warn_if_not_unicode(string): + if version_info[0] < 3 and not isinstance(string, unicode): + warnings.warn( "Argument %r is not an unicode object. " + "Passing an encoded string will likely have " + "unexpected results." % (type(string),), + RuntimeWarning, 2) + + +def unidecode_expect_ascii(string): """Transliterate an Unicode object into an ASCII string >>> unidecode(u"\u5317\u4EB0") "Bei Jing " + + This function first tries to convert the string using ASCII codec. + If it fails (because of non-ASCII characters), it falls back to + transliteration using the character tables. + + This is approx. five times faster if the string only contains ASCII + characters, but slightly slower than using unidecode directly if non-ASCII + chars are present. """ - if version_info[0] < 3 and not isinstance(string, unicode): - warnings.warn( "Argument %r is not an unicode object. " - "Passing an encoded string will likely have " - "unexpected results." % (type(string),), - RuntimeWarning, 2) + _warn_if_not_unicode(string) + try: + bytestring = string.encode('ASCII') + except UnicodeEncodeError: + return _unidecode(string) + if version_info[0] >= 3: + return string + else: + return bytestring + +def unidecode_expect_nonascii(string): + """Transliterate an Unicode object into an ASCII string + + >>> unidecode(u"\u5317\u4EB0") + "Bei Jing " + """ + + _warn_if_not_unicode(string) + return _unidecode(string) + +unidecode = unidecode_expect_ascii +def _unidecode(string): retval = [] for char in string:
