Hello community,

here is the log from the commit of package python-featureflow for openSUSE:Factory checked in at 2020-06-02 14:40:08

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-featureflow (Old)
 and      /work/SRC/openSUSE:Factory/.python-featureflow.new.3606 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-featureflow" Tue Jun 2 14:40:08 2020 rev:4 rq:810640 version:3.0.3 Changes: -------- --- /work/SRC/openSUSE:Factory/python-featureflow/python-featureflow.changes 2019-07-24 20:34:38.818579933 +0200 +++ /work/SRC/openSUSE:Factory/.python-featureflow.new.3606/python-featureflow.changes 2020-06-02 14:41:08.823997061 +0200 @@ -1,0 +2,10 @@ +Mon Jun 1 13:31:02 UTC 2020 - [email protected] + +- version update to 3.0.3 + * no upstream changelog found +- run testsuite, use %pytest +- added patches + https://github.com/JohnVinyard/featureflow/pull/11 + + python-featureflow-no-unittest2.patch + +------------------------------------------------------------------- Old: ---- featureflow-3.0.1.tar.gz New: ---- featureflow-3.0.3.tar.gz python-featureflow-no-unittest2.patch ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ python-featureflow.spec ++++++ --- /var/tmp/diff_new_pack.L9fHZM/_old 2020-06-02 14:41:10.472002270 +0200 +++ /var/tmp/diff_new_pack.L9fHZM/_new 2020-06-02 14:41:10.472002270 +0200 @@ -1,7 +1,7 @@ # # spec file for package python-featureflow # -# Copyright (c) 2019 SUSE LINUX GmbH, Nuernberg, Germany. +# Copyright (c) 2020 SUSE LLC # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -19,15 +19,17 @@ %{?!python_module:%define python_module() python-%{**} python3-%{**}} %define skip_python2 1 Name: python-featureflow -Version: 3.0.1 +Version: 3.0.3 Release: 0 Summary: A python library for building feature extraction pipelines License: MIT Group: Development/Languages/Python -Url: https://github.com/JohnVinyard/featureflow +URL: https://github.com/JohnVinyard/featureflow Source0: https://files.pythonhosted.org/packages/source/f/featureflow/featureflow-%{version}.tar.gz # PATCH-FIX-OPENSUSE fix_certifi_dependency.patch -- loosen certifi version dependency Patch0: fix_certifi_dependency.patch +# https://github.com/JohnVinyard/featureflow/pull/11 +Patch1: python-featureflow-no-unittest2.patch BuildRequires: %{python_module setuptools} BuildRequires: fdupes BuildRequires: python-rpm-macros @@ -35,11 +37,10 @@ BuildRequires: %{python_module certifi >= 2017.7.27.1} BuildRequires: %{python_module dill} BuildRequires: %{python_module lmdb} -BuildRequires: %{python_module nose} BuildRequires: %{python_module numpy} +BuildRequires: %{python_module pytest} BuildRequires: %{python_module redis} BuildRequires: %{python_module requests} -BuildRequires: %{python_module unittest2} # /SECTION Requires: python-certifi >= 2017.7.27.1 Requires: python-dill @@ -59,6 +60,7 @@ %prep %setup -q -n featureflow-%{version} %patch0 -p1 +%patch1 -p1 %build %python_build @@ -67,9 +69,8 @@ %python_install %python_expand %fdupes %{buildroot}%{$python_sitelib} -# Tests don't work without multiprocessing -# %%check -# %%python_exec setup.py test +%check +%pytest %files %{python_files} %doc README.md ++++++ featureflow-3.0.1.tar.gz -> featureflow-3.0.3.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/featureflow-3.0.1/PKG-INFO new/featureflow-3.0.3/PKG-INFO --- old/featureflow-3.0.1/PKG-INFO 2019-03-07 03:31:14.000000000 +0100 +++ new/featureflow-3.0.3/PKG-INFO 2020-03-04 21:46:10.000000000 +0100 @@ -1,12 +1,12 @@ Metadata-Version: 2.1 Name: featureflow -Version: 3.0.1 +Version: 3.0.3 Summary: UNKNOWN Home-page: https://github.com/JohnVinyard/featureflow Author: John Vinyard 
Author-email: [email protected] License: UNKNOWN -Download-URL: https://github.com/jvinyard/featureflow/tarball/3.0.1 +Download-URL: https://github.com/jvinyard/featureflow/tarball/3.0.3 Description: |Build Status| |Coverage Status| |Python 3| |PyPI| |License: MIT| featureflow @@ -21,8 +21,8 @@ The following example will compute word frequency in individual text documents, and then over the entire corpus of documents, but featureflow - isn’t limited to text data. It’s designed to work well with - sequential/streaming data (e.g. audio or video) that is often processed + isn't limited to text data. It's designed to work well with + sequential/streaming data (e.g. audio or video) that is often processed iteratively, in small chunks. You can see `all the code in this example in one place @@ -32,173 +32,173 @@ .. code:: python - import featureflow as ff + import featureflow as ff - @ff.simple_in_memory_settings - class Document(ff.BaseModel): - """ - Define the processing graph needed to extract document-level features, - whether, and how those features should be persisted. - """ - raw = ff.ByteStreamFeature( - ff.ByteStream, - chunksize=128, - store=True) - - checksum = ff.JSONFeature( - CheckSum, - needs=raw, - store=True) - - tokens = ff.Feature( - Tokenizer, - needs=raw, - store=False) - - counts = ff.JSONFeature( - WordCount, - needs=tokens, - store=True) + @ff.simple_in_memory_settings + class Document(ff.BaseModel): + """ + Define the processing graph needed to extract document-level features, + whether, and how those features should be persisted. + """ + raw = ff.ByteStreamFeature( + ff.ByteStream, + chunksize=128, + store=True) + + checksum = ff.JSONFeature( + CheckSum, + needs=raw, + store=True) + + tokens = ff.Feature( + Tokenizer, + needs=raw, + store=False) + + counts = ff.JSONFeature( + WordCount, + needs=tokens, + store=True) - We can define the individual processing “nodes” referenced in the graph + We can define the individual processing "nodes" referenced in the graph above like this: .. 
code:: python - import featureflow as ff - from collections import Counter - import re - import hashlib - - class Tokenizer(ff.Node): - """ - Tokenize a stream of text into individual, normalized (lowercase) - words/tokens - """ - def __init__(self, needs=None): - super(Tokenizer, self).__init__(needs=needs) - self._cache = '' - self._pattern = re.compile('(?P<word>[a-zA-Z]+)\W+') - - def _enqueue(self, data, pusher): - self._cache += data - - def _dequeue(self): - matches = list(self._pattern.finditer(self._cache)) - if not matches: - raise ff.NotEnoughData() - last_boundary = matches[-1].end() - self._cache = self._cache[last_boundary:] - return matches - - def _process(self, data): - yield map(lambda x: x.groupdict()['word'].lower(), data) - - - class WordCount(ff.Aggregator, ff.Node): - """ - Keep track of token frequency - """ - def __init__(self, needs=None): - super(WordCount, self).__init__(needs=needs) - self._cache = Counter() - - def _enqueue(self, data, pusher): - self._cache.update(data) - - - class CheckSum(ff.Aggregator, ff.Node): - """ - Compute the checksum of a text stream - """ - def __init__(self, needs=None): - super(CheckSum, self).__init__(needs=needs) - self._cache = hashlib.sha256() + import featureflow as ff + from collections import Counter + import re + import hashlib + + class Tokenizer(ff.Node): + """ + Tokenize a stream of text into individual, normalized (lowercase) + words/tokens + """ + def __init__(self, needs=None): + super(Tokenizer, self).__init__(needs=needs) + self._cache = '' + self._pattern = re.compile('(?P<word>[a-zA-Z]+)\W+') + + def _enqueue(self, data, pusher): + self._cache += data.decode() + + def _dequeue(self): + matches = list(self._pattern.finditer(self._cache)) + if not matches: + raise ff.NotEnoughData() + last_boundary = matches[-1].end() + self._cache = self._cache[last_boundary:] + return matches + + def _process(self, data): + yield map(lambda x: x.groupdict()['word'].lower(), data) + + + class WordCount(ff.Aggregator, ff.Node): + """ + Keep track of token frequency + """ + def __init__(self, needs=None): + super(WordCount, self).__init__(needs=needs) + self._cache = Counter() + + def _enqueue(self, data, pusher): + self._cache.update(data) + + + class CheckSum(ff.Aggregator, ff.Node): + """ + Compute the checksum of a text stream + """ + def __init__(self, needs=None): + super(CheckSum, self).__init__(needs=needs) + self._cache = hashlib.sha256() - def _enqueue(self, data, pusher): - self._cache.update(data) + def _enqueue(self, data, pusher): + self._cache.update(data) - def _process(self, data): - yield data.hexdigest() + def _process(self, data): + yield data.hexdigest() We can also define a graph that will process an entire corpus of stored document features: .. code:: python - import featureflow as ff + import featureflow as ff - @ff.simple_in_memory_settings - class Corpus(ff.BaseModel): - """ - Define the processing graph needed to extract corpus-level features, - whether, and how those features should be persisted. - """ - docs = ff.Feature( - lambda doc_cls: (doc.counts for doc in doc_cls), - store=False) - - total_counts = ff.JSONFeature( - WordCount, - needs=docs, - store=True) + @ff.simple_in_memory_settings + class Corpus(ff.BaseModel): + """ + Define the processing graph needed to extract corpus-level features, + whether, and how those features should be persisted. 
+ """ + docs = ff.Feature( + lambda doc_cls: (doc.counts for doc in doc_cls), + store=False) + + total_counts = ff.JSONFeature( + WordCount, + needs=docs, + store=True) Finally, we can execute these processing graphs and access the stored features like this: .. code:: python - from __future__ import print_function - import argparse + from __future__ import print_function + import argparse - def process_urls(urls): - for url in urls: - Document.process(raw=url) + def process_urls(urls): + for url in urls: + Document.process(raw=url) - def summarize_document(doc): - return 'doc {_id} with checksum {cs} contains "the" {n} times'.format( - _id=doc._id, - cs=doc.checksum, - n=doc.counts.get('the', 0)) + def summarize_document(doc): + return 'doc {_id} with checksum {cs} contains "the" {n} times'.format( + _id=doc._id, + cs=doc.checksum, + n=doc.counts.get('the', 0)) - def process_corpus(document_cls): - corpus_id = Corpus.process(docs=document_cls) - return Corpus(corpus_id) + def process_corpus(document_cls): + corpus_id = Corpus.process(docs=document_cls) + return Corpus(corpus_id) - def summarize_corpus(corpus): - return 'The entire text corpus contains "the" {n} times'.format( - n=corpus.total_counts.get("the", 0)) + def summarize_corpus(corpus): + return 'The entire text corpus contains "the" {n} times'.format( + n=corpus.total_counts.get("the", 0)) - if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--url', - help='specify one or more urls of text files to ingest', - required=True, - action='append') - args = parser.parse_args() + if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--url', + help='specify one or more urls of text files to ingest', + required=True, + action='append') + args = parser.parse_args() - process_urls(args.url) + process_urls(args.url) - for doc in Document: - print(summarize_document(doc)) + for doc in Document: + print(summarize_document(doc)) - corpus = process_corpus(Document) - print(summarize_corpus(corpus)) + corpus = process_corpus(Document) + print(summarize_corpus(corpus)) To see this in action we can: .. code:: bash - python wordcount.py \ - --url http://textfiles.com/food/1st_aid.txt \ - --url http://textfiles.com/food/antibiot.txt \ - ... + python wordcount.py \ + --url http://textfiles.com/food/1st_aid.txt \ + --url http://textfiles.com/food/antibiot.txt \ + ... Installation ============ @@ -207,9 +207,9 @@ .. code:: bash - apt-get install python-dev + apt-get install python-dev - Numpy is optional. If you’d like to use it, the + Numpy is optional. If you'd like to use it, the `Anaconda <https://www.continuum.io/downloads>`__ distribution is highly recommended. @@ -217,7 +217,7 @@ .. code:: bash - pip install featureflow + pip install featureflow .. 
|Build Status| image:: https://travis-ci.org/JohnVinyard/featureflow.svg?branch=master :target: https://travis-ci.org/JohnVinyard/featureflow diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/featureflow-3.0.1/README.md new/featureflow-3.0.3/README.md --- old/featureflow-3.0.1/README.md 2019-03-02 04:03:23.000000000 +0100 +++ new/featureflow-3.0.3/README.md 2019-03-07 16:35:17.000000000 +0100 @@ -71,7 +71,7 @@ self._pattern = re.compile('(?P<word>[a-zA-Z]+)\W+') def _enqueue(self, data, pusher): - self._cache += data + self._cache += data.decode() def _dequeue(self): matches = list(self._pattern.finditer(self._cache)) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/featureflow-3.0.1/examples/wordcount.py new/featureflow-3.0.3/examples/wordcount.py --- old/featureflow-3.0.1/examples/wordcount.py 2019-03-02 04:03:23.000000000 +0100 +++ new/featureflow-3.0.3/examples/wordcount.py 2019-03-07 16:41:00.000000000 +0100 @@ -17,7 +17,7 @@ self._pattern = re.compile('(?P<word>[a-zA-Z]+)\W+') def _enqueue(self, data, pusher): - self._cache += data + self._cache += data.decode() def _dequeue(self): matches = list(self._pattern.finditer(self._cache)) @@ -123,8 +123,16 @@ n=corpus.total_counts.get("the", 0)) +example = '''example: + +python wordcount.py \\ + --url http://textfiles.com/food/1st_aid.txt \\ + --url http://textfiles.com/food/antibiot.txt \\ +''' + if __name__ == '__main__': - parser = argparse.ArgumentParser() + + parser = argparse.ArgumentParser(epilog=example, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument( '--url', help='specify one or more urls of text files to ingest', diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/featureflow-3.0.1/featureflow/__init__.py new/featureflow-3.0.3/featureflow/__init__.py --- old/featureflow-3.0.1/featureflow/__init__.py 2019-03-07 03:28:11.000000000 +0100 +++ new/featureflow-3.0.3/featureflow/__init__.py 2020-03-02 02:46:33.000000000 +0100 @@ -1,4 +1,4 @@ -__version__ = '3.0.1' +__version__ = '3.0.3' from .model import BaseModel, ModelExistsError diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/featureflow-3.0.1/featureflow/eventlog.py new/featureflow-3.0.3/featureflow/eventlog.py --- old/featureflow-3.0.1/featureflow/eventlog.py 2019-03-02 04:03:23.000000000 +0100 +++ new/featureflow-3.0.3/featureflow/eventlog.py 2020-03-02 02:30:48.000000000 +0100 @@ -22,7 +22,7 @@ data = json.loads(d.get(block=not raise_when_empty)) yield data['_id'], data['message'] except Empty: - raise StopIteration + break return gen() @@ -89,11 +89,13 @@ def unsubscribe(self): self.channel.unsubscribe() - def subscribe(self, last_id='', raise_when_empty=False): + def subscribe(self, last_id=b'', raise_when_empty=False): try: last_id = last_id.encode() except AttributeError: pass + + print(last_id) subscription = self.channel.subscribe(raise_when_empty=raise_when_empty) with self.env.begin() as txn: @@ -108,3 +110,4 @@ yield _id, data + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/featureflow-3.0.1/featureflow/nmpy.py new/featureflow-3.0.3/featureflow/nmpy.py --- old/featureflow-3.0.1/featureflow/nmpy.py 2019-03-02 04:03:23.000000000 +0100 +++ new/featureflow-3.0.3/featureflow/nmpy.py 2020-03-02 02:16:06.000000000 +0100 @@ -98,7 +98,7 @@ def _prepare_data(self, data): try: return np.packbits(data.astype(np.uint8), axis=-1) - except 
ValueError: + except TypeError: return self._pack_recarray(data) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/featureflow-3.0.1/featureflow/test_eventlog.py new/featureflow-3.0.3/featureflow/test_eventlog.py --- old/featureflow-3.0.1/featureflow/test_eventlog.py 2019-03-02 04:03:23.000000000 +0100 +++ new/featureflow-3.0.3/featureflow/test_eventlog.py 2020-03-02 02:21:49.000000000 +0100 @@ -55,7 +55,7 @@ def test_only_returns_events_greater_than_last_id(self): self.Model.process(stream='Bah bah black sheep') events = list(self.Settings.event_log.subscribe( - last_id='', raise_when_empty=True)) + last_id=b'', raise_when_empty=True)) last_id, _ = events[-1] self.Model.process(stream='Humpty dumpty sat on a wall') next_events = list(self.Settings.event_log.subscribe( diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/featureflow-3.0.1/featureflow.egg-info/PKG-INFO new/featureflow-3.0.3/featureflow.egg-info/PKG-INFO --- old/featureflow-3.0.1/featureflow.egg-info/PKG-INFO 2019-03-07 03:31:14.000000000 +0100 +++ new/featureflow-3.0.3/featureflow.egg-info/PKG-INFO 2020-03-04 21:46:09.000000000 +0100 @@ -1,12 +1,12 @@ Metadata-Version: 2.1 Name: featureflow -Version: 3.0.1 +Version: 3.0.3 Summary: UNKNOWN Home-page: https://github.com/JohnVinyard/featureflow Author: John Vinyard Author-email: [email protected] License: UNKNOWN -Download-URL: https://github.com/jvinyard/featureflow/tarball/3.0.1 +Download-URL: https://github.com/jvinyard/featureflow/tarball/3.0.3 Description: |Build Status| |Coverage Status| |Python 3| |PyPI| |License: MIT| featureflow @@ -21,8 +21,8 @@ The following example will compute word frequency in individual text documents, and then over the entire corpus of documents, but featureflow - isn’t limited to text data. It’s designed to work well with - sequential/streaming data (e.g. audio or video) that is often processed + isn't limited to text data. It's designed to work well with + sequential/streaming data (e.g. audio or video) that is often processed iteratively, in small chunks. You can see `all the code in this example in one place @@ -32,173 +32,173 @@ .. code:: python - import featureflow as ff + import featureflow as ff - @ff.simple_in_memory_settings - class Document(ff.BaseModel): - """ - Define the processing graph needed to extract document-level features, - whether, and how those features should be persisted. - """ - raw = ff.ByteStreamFeature( - ff.ByteStream, - chunksize=128, - store=True) - - checksum = ff.JSONFeature( - CheckSum, - needs=raw, - store=True) - - tokens = ff.Feature( - Tokenizer, - needs=raw, - store=False) - - counts = ff.JSONFeature( - WordCount, - needs=tokens, - store=True) + @ff.simple_in_memory_settings + class Document(ff.BaseModel): + """ + Define the processing graph needed to extract document-level features, + whether, and how those features should be persisted. + """ + raw = ff.ByteStreamFeature( + ff.ByteStream, + chunksize=128, + store=True) + + checksum = ff.JSONFeature( + CheckSum, + needs=raw, + store=True) + + tokens = ff.Feature( + Tokenizer, + needs=raw, + store=False) + + counts = ff.JSONFeature( + WordCount, + needs=tokens, + store=True) - We can define the individual processing “nodes” referenced in the graph + We can define the individual processing "nodes" referenced in the graph above like this: .. 
code:: python - import featureflow as ff - from collections import Counter - import re - import hashlib - - class Tokenizer(ff.Node): - """ - Tokenize a stream of text into individual, normalized (lowercase) - words/tokens - """ - def __init__(self, needs=None): - super(Tokenizer, self).__init__(needs=needs) - self._cache = '' - self._pattern = re.compile('(?P<word>[a-zA-Z]+)\W+') - - def _enqueue(self, data, pusher): - self._cache += data - - def _dequeue(self): - matches = list(self._pattern.finditer(self._cache)) - if not matches: - raise ff.NotEnoughData() - last_boundary = matches[-1].end() - self._cache = self._cache[last_boundary:] - return matches - - def _process(self, data): - yield map(lambda x: x.groupdict()['word'].lower(), data) - - - class WordCount(ff.Aggregator, ff.Node): - """ - Keep track of token frequency - """ - def __init__(self, needs=None): - super(WordCount, self).__init__(needs=needs) - self._cache = Counter() - - def _enqueue(self, data, pusher): - self._cache.update(data) - - - class CheckSum(ff.Aggregator, ff.Node): - """ - Compute the checksum of a text stream - """ - def __init__(self, needs=None): - super(CheckSum, self).__init__(needs=needs) - self._cache = hashlib.sha256() + import featureflow as ff + from collections import Counter + import re + import hashlib + + class Tokenizer(ff.Node): + """ + Tokenize a stream of text into individual, normalized (lowercase) + words/tokens + """ + def __init__(self, needs=None): + super(Tokenizer, self).__init__(needs=needs) + self._cache = '' + self._pattern = re.compile('(?P<word>[a-zA-Z]+)\W+') + + def _enqueue(self, data, pusher): + self._cache += data.decode() + + def _dequeue(self): + matches = list(self._pattern.finditer(self._cache)) + if not matches: + raise ff.NotEnoughData() + last_boundary = matches[-1].end() + self._cache = self._cache[last_boundary:] + return matches + + def _process(self, data): + yield map(lambda x: x.groupdict()['word'].lower(), data) + + + class WordCount(ff.Aggregator, ff.Node): + """ + Keep track of token frequency + """ + def __init__(self, needs=None): + super(WordCount, self).__init__(needs=needs) + self._cache = Counter() + + def _enqueue(self, data, pusher): + self._cache.update(data) + + + class CheckSum(ff.Aggregator, ff.Node): + """ + Compute the checksum of a text stream + """ + def __init__(self, needs=None): + super(CheckSum, self).__init__(needs=needs) + self._cache = hashlib.sha256() - def _enqueue(self, data, pusher): - self._cache.update(data) + def _enqueue(self, data, pusher): + self._cache.update(data) - def _process(self, data): - yield data.hexdigest() + def _process(self, data): + yield data.hexdigest() We can also define a graph that will process an entire corpus of stored document features: .. code:: python - import featureflow as ff + import featureflow as ff - @ff.simple_in_memory_settings - class Corpus(ff.BaseModel): - """ - Define the processing graph needed to extract corpus-level features, - whether, and how those features should be persisted. - """ - docs = ff.Feature( - lambda doc_cls: (doc.counts for doc in doc_cls), - store=False) - - total_counts = ff.JSONFeature( - WordCount, - needs=docs, - store=True) + @ff.simple_in_memory_settings + class Corpus(ff.BaseModel): + """ + Define the processing graph needed to extract corpus-level features, + whether, and how those features should be persisted. 
+ """ + docs = ff.Feature( + lambda doc_cls: (doc.counts for doc in doc_cls), + store=False) + + total_counts = ff.JSONFeature( + WordCount, + needs=docs, + store=True) Finally, we can execute these processing graphs and access the stored features like this: .. code:: python - from __future__ import print_function - import argparse + from __future__ import print_function + import argparse - def process_urls(urls): - for url in urls: - Document.process(raw=url) + def process_urls(urls): + for url in urls: + Document.process(raw=url) - def summarize_document(doc): - return 'doc {_id} with checksum {cs} contains "the" {n} times'.format( - _id=doc._id, - cs=doc.checksum, - n=doc.counts.get('the', 0)) + def summarize_document(doc): + return 'doc {_id} with checksum {cs} contains "the" {n} times'.format( + _id=doc._id, + cs=doc.checksum, + n=doc.counts.get('the', 0)) - def process_corpus(document_cls): - corpus_id = Corpus.process(docs=document_cls) - return Corpus(corpus_id) + def process_corpus(document_cls): + corpus_id = Corpus.process(docs=document_cls) + return Corpus(corpus_id) - def summarize_corpus(corpus): - return 'The entire text corpus contains "the" {n} times'.format( - n=corpus.total_counts.get("the", 0)) + def summarize_corpus(corpus): + return 'The entire text corpus contains "the" {n} times'.format( + n=corpus.total_counts.get("the", 0)) - if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--url', - help='specify one or more urls of text files to ingest', - required=True, - action='append') - args = parser.parse_args() + if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--url', + help='specify one or more urls of text files to ingest', + required=True, + action='append') + args = parser.parse_args() - process_urls(args.url) + process_urls(args.url) - for doc in Document: - print(summarize_document(doc)) + for doc in Document: + print(summarize_document(doc)) - corpus = process_corpus(Document) - print(summarize_corpus(corpus)) + corpus = process_corpus(Document) + print(summarize_corpus(corpus)) To see this in action we can: .. code:: bash - python wordcount.py \ - --url http://textfiles.com/food/1st_aid.txt \ - --url http://textfiles.com/food/antibiot.txt \ - ... + python wordcount.py \ + --url http://textfiles.com/food/1st_aid.txt \ + --url http://textfiles.com/food/antibiot.txt \ + ... Installation ============ @@ -207,9 +207,9 @@ .. code:: bash - apt-get install python-dev + apt-get install python-dev - Numpy is optional. If you’d like to use it, the + Numpy is optional. If you'd like to use it, the `Anaconda <https://www.continuum.io/downloads>`__ distribution is highly recommended. @@ -217,7 +217,7 @@ .. code:: bash - pip install featureflow + pip install featureflow .. 
|Build Status| image:: https://travis-ci.org/JohnVinyard/featureflow.svg?branch=master :target: https://travis-ci.org/JohnVinyard/featureflow ++++++ python-featureflow-no-unittest2.patch ++++++ Index: featureflow-3.0.1/featureflow.egg-info/requires.txt =================================================================== --- featureflow-3.0.1.orig/featureflow.egg-info/requires.txt 2019-03-07 03:31:14.000000000 +0100 +++ featureflow-3.0.1/featureflow.egg-info/requires.txt 2020-06-01 15:07:06.290277032 +0200 @@ -1,6 +1,5 @@ dill nose -unittest2 certifi==2017.7.27.1 requests lmdb Index: featureflow-3.0.1/featureflow/test_bytestream.py =================================================================== --- featureflow-3.0.1.orig/featureflow/test_bytestream.py 2019-03-07 03:24:36.000000000 +0100 +++ featureflow-3.0.1/featureflow/test_bytestream.py 2020-06-01 15:07:53.898542286 +0200 @@ -1,5 +1,5 @@ from .bytestream import BytesWithTotalLength, ByteStream, ZipWrapper, iter_zip -import unittest2 +import unittest import sys import tempfile import subprocess @@ -12,7 +12,7 @@ import zipfile from .util import wait_for_http_server -class BytestreamTests(unittest2.TestCase): +class BytestreamTests(unittest.TestCase): def setUp(self): self.HasUri = namedtuple('HasUri', ['uri']) self.bytestream = ByteStream(chunksize=3) @@ -118,7 +118,7 @@ class BytestreamTests(unittest2.TestCase self.assertEqual(self.expected, results) -class BytesWithTotalLengthTests(unittest2.TestCase): +class BytesWithTotalLengthTests(unittest.TestCase): def test_left_add(self): self.assertEqual( b'fakeblah', BytesWithTotalLength(b'fake', 100) + b'blah') @@ -138,7 +138,7 @@ class BytesWithTotalLengthTests(unittest self.assertEqual(b'blahfake', x) -class IterZipTests(unittest2.TestCase): +class IterZipTests(unittest.TestCase): def test_iter_zip_yields_open_zip_files(self): bio = BytesIO() filename = 'test.dat' Index: featureflow-3.0.1/featureflow/test_datawriter.py =================================================================== --- featureflow-3.0.1.orig/featureflow/test_datawriter.py 2019-03-02 04:03:23.000000000 +0100 +++ featureflow-3.0.1/featureflow/test_datawriter.py 2020-06-01 15:10:38.471459240 +0200 @@ -1,9 +1,9 @@ -import unittest2 +import unittest from .datawriter import BytesIODataWriter from .encoder import IdentityEncoder -class StringIODataWriterTests(unittest2.TestCase): +class StringIODataWriterTests(unittest.TestCase): def test_overflow(self): buffer_size_limit = 128 writer = BytesIODataWriter( Index: featureflow-3.0.1/featureflow/test_eventlog.py =================================================================== --- featureflow-3.0.1.orig/featureflow/test_eventlog.py 2019-03-02 04:03:23.000000000 +0100 +++ featureflow-3.0.1/featureflow/test_eventlog.py 2020-06-01 15:08:30.614746857 +0200 @@ -1,4 +1,4 @@ -import unittest2 +import unittest from .eventlog import InMemoryChannel, EventLog from .model import BaseModel from .persistence import PersistenceSettings @@ -10,7 +10,7 @@ from .data import UuidProvider, StringDe import json -class EventLogTests(unittest2.TestCase): +class EventLogTests(unittest.TestCase): def setUp(self): self._dir = tempfile.mkdtemp() Index: featureflow-3.0.1/featureflow/test_integration.py =================================================================== --- featureflow-3.0.1.orig/featureflow/test_integration.py 2019-03-02 04:03:23.000000000 +0100 +++ featureflow-3.0.1/featureflow/test_integration.py 2020-06-01 15:08:52.230867288 +0200 @@ -1,7 +1,7 @@ import requests import http.client 
from .util import wait_for_http_server -import unittest2 +import unittest from collections import defaultdict import random from requests.exceptions import HTTPError @@ -1761,7 +1761,7 @@ class BaseTest(object): self.assertEqual(data_source['lorem'].lower(), b''.join(doc.lowercase)) -class InMemoryTest(BaseTest, unittest2.TestCase): +class InMemoryTest(BaseTest, unittest.TestCase): def setUp(self): class Settings(PersistenceSettings): id_provider = UuidProvider() @@ -1771,7 +1771,7 @@ class InMemoryTest(BaseTest, unittest2.T self.Settings = Settings -class FileSystemTest(BaseTest, unittest2.TestCase): +class FileSystemTest(BaseTest, unittest.TestCase): def setUp(self): self._dir = mkdtemp() @@ -1787,7 +1787,7 @@ class FileSystemTest(BaseTest, unittest2 rmtree(self._dir) -class LmdbTest(BaseTest, unittest2.TestCase): +class LmdbTest(BaseTest, unittest.TestCase): def setUp(self): self._dir = mkdtemp() Index: featureflow-3.0.1/featureflow/test_lmdbstore.py =================================================================== --- featureflow-3.0.1.orig/featureflow/test_lmdbstore.py 2019-03-02 04:03:23.000000000 +0100 +++ featureflow-3.0.1/featureflow/test_lmdbstore.py 2020-06-01 15:09:31.819087872 +0200 @@ -1,4 +1,4 @@ -import unittest2 +import unittest from .lmdbstore import LmdbDatabase from uuid import uuid4 from .data import StringDelimitedKeyBuilder @@ -45,7 +45,7 @@ def db_count(d): return len(list(EphemeralLmdb(dir=d).db.iter_ids())) -class LmdbDatabaseTests(unittest2.TestCase): +class LmdbDatabaseTests(unittest.TestCase): def setUp(self): self.value = os.urandom(1000) self.init_database() Index: featureflow-3.0.1/featureflow/test_multiprocessing.py =================================================================== --- featureflow-3.0.1.orig/featureflow/test_multiprocessing.py 2019-03-02 04:03:23.000000000 +0100 +++ featureflow-3.0.1/featureflow/test_multiprocessing.py 2020-06-01 15:09:12.802981916 +0200 @@ -1,4 +1,4 @@ -import unittest2 +import unittest from .feature import Feature, JSONFeature from .lmdbstore import LmdbDatabase from .model import BaseModel @@ -28,7 +28,7 @@ def get_count(_): return len(list(D.database.iter_ids())) -class MultiProcessTests(unittest2.TestCase): +class MultiProcessTests(unittest.TestCase): def test_can_list_ids_from_multiple_processes(self): D.process(stream='Here is some text') D.process(stream='Here is some more') Index: featureflow-3.0.1/featureflow/test_nmpy.py =================================================================== --- featureflow-3.0.1.orig/featureflow/test_nmpy.py 2019-03-02 04:03:23.000000000 +0100 +++ featureflow-3.0.1/featureflow/test_nmpy.py 2020-06-01 15:10:01.383252598 +0200 @@ -1,4 +1,4 @@ -import unittest2 +import unittest try: import numpy as np @@ -94,13 +94,13 @@ class BaseNumpyTest(object): ('y', 'a32')]) -class GreedyNumpyTest(BaseNumpyTest, unittest2.TestCase): +class GreedyNumpyTest(BaseNumpyTest, unittest.TestCase): def _register_database(self, settings_class): return settings_class.clone( database=InMemoryDatabase(key_builder=settings_class.key_builder)) -class GreedyNumpyOnDiskTest(BaseNumpyTest, unittest2.TestCase): +class GreedyNumpyOnDiskTest(BaseNumpyTest, unittest.TestCase): def _register_database(self, settings_class): self._dir = mkdtemp() return settings_class.clone(database=FileSystemDatabase( @@ -111,7 +111,7 @@ class GreedyNumpyOnDiskTest(BaseNumpyTes rmtree(self._dir) -class GreedyNumpyLmdbTest(BaseNumpyTest, unittest2.TestCase): +class GreedyNumpyLmdbTest(BaseNumpyTest, unittest.TestCase): def 
_register_database(self, settings_class): self._dir = mkdtemp() return settings_class.clone(database=LmdbDatabase( @@ -123,7 +123,7 @@ class GreedyNumpyLmdbTest(BaseNumpyTest, rmtree(self._dir) -class StreamingNumpyTest(BaseNumpyTest, unittest2.TestCase): +class StreamingNumpyTest(BaseNumpyTest, unittest.TestCase): def _register_database(self, settings_class): return settings_class.clone( database=InMemoryDatabase(key_builder=settings_class.key_builder)) @@ -147,7 +147,7 @@ class StreamingNumpyTest(BaseNumpyTest, return np.concatenate(list(data)) -class StreamingNumpyOnDiskTest(BaseNumpyTest, unittest2.TestCase): +class StreamingNumpyOnDiskTest(BaseNumpyTest, unittest.TestCase): def _register_database(self, settings_class): self._dir = mkdtemp() return settings_class.clone(database=FileSystemDatabase( @@ -176,7 +176,7 @@ class StreamingNumpyOnDiskTest(BaseNumpy return np.concatenate(list(data)) -class StreamingNumpyLmdbTest(BaseNumpyTest, unittest2.TestCase): +class StreamingNumpyLmdbTest(BaseNumpyTest, unittest.TestCase): def _register_database(self, settings_class): self._dir = mkdtemp() return settings_class.clone(database=LmdbDatabase( Index: featureflow-3.0.1/featureflow/test_objectstorage.py =================================================================== --- featureflow-3.0.1.orig/featureflow/test_objectstorage.py 2019-03-02 04:03:23.000000000 +0100 +++ featureflow-3.0.1/featureflow/test_objectstorage.py 2020-06-01 15:08:13.578651936 +0200 @@ -1,10 +1,10 @@ -import unittest2 +import unittest from .objectstore import WriteStream import http.client from collections import namedtuple -class WriteStreamTests(unittest2.TestCase): +class WriteStreamTests(unittest.TestCase): def test_write_stream_does_not_put_zero_bytes(self): class TestWriteStream(WriteStream): def __init__(self): Index: featureflow-3.0.1/featureflow/test_persistence.py =================================================================== --- featureflow-3.0.1.orig/featureflow/test_persistence.py 2019-03-02 04:03:23.000000000 +0100 +++ featureflow-3.0.1/featureflow/test_persistence.py 2020-06-01 15:10:21.555364987 +0200 @@ -1,4 +1,4 @@ -import unittest2 +import unittest from .persistence import simple_in_memory_settings from .bytestream import ByteStream, ByteStreamFeature from .feature import Feature, TextFeature @@ -6,7 +6,7 @@ from .model import BaseModel from io import BytesIO -class SimpleInMemorySettingsDecoratorTests(unittest2.TestCase): +class SimpleInMemorySettingsDecoratorTests(unittest.TestCase): def test_can_process_document_using_decorated_class(self): @simple_in_memory_settings class Document(BaseModel): Index: featureflow-3.0.1/requirements.txt =================================================================== --- featureflow-3.0.1.orig/requirements.txt 2017-05-13 04:26:57.000000000 +0200 +++ featureflow-3.0.1/requirements.txt 2020-06-01 15:06:55.930219258 +0200 @@ -1,5 +1,4 @@ redis nose -unittest2 requests -lmdb \ No newline at end of file +lmdb Index: featureflow-3.0.1/setup.py =================================================================== --- featureflow-3.0.1.orig/setup.py 2020-06-01 15:06:14.825989932 +0200 +++ featureflow-3.0.1/setup.py 2020-06-01 15:07:25.566384433 +0200 @@ -30,7 +30,6 @@ setup( install_requires=[ 'dill', 'nose', - 'unittest2', 'certifi>=2017.7.27.1', 'requests', 'lmdb', Index: featureflow-3.0.1/featureflow/test_data.py =================================================================== --- featureflow-3.0.1.orig/featureflow/test_data.py 2019-03-02 04:03:23.000000000 +0100 
+++ featureflow-3.0.1/featureflow/test_data.py 2020-06-01 15:15:05.356946036 +0200 @@ -1,4 +1,4 @@ -import unittest2 +import unittest from uuid import uuid4 from .data import \ InMemoryDatabase, UserSpecifiedIdProvider, FileSystemDatabase, \ @@ -6,7 +6,7 @@ from .data import \ import shutil -class InMemoryDatabaseTest(unittest2.TestCase): +class InMemoryDatabaseTest(unittest.TestCase): def setUp(self): self.db = InMemoryDatabase() @@ -35,12 +35,12 @@ class InMemoryDatabaseTest(unittest2.Tes self.assertEqual(b'test data2', rs.read()) -class UserSpecifiedIdProviderTest(unittest2.TestCase): +class UserSpecifiedIdProviderTest(unittest.TestCase): def test_raises_when_no_key_is_provided(self): self.assertRaises(ValueError, lambda: UserSpecifiedIdProvider()) -class FileSystemDatabaseTests(unittest2.TestCase): +class FileSystemDatabaseTests(unittest.TestCase): def setUp(self): self._key_builder = StringDelimitedKeyBuilder() self._path = '/tmp/{path}'.format(path=uuid4().hex)
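
Note on the featureflow/eventlog.py hunk in the 3.0.1 -> 3.0.3 diff above, which swaps "raise StopIteration" for "break": this most likely reflects PEP 479, where (by default from Python 3.7) a StopIteration raised inside a generator body is converted to RuntimeError instead of silently ending iteration. A minimal, hypothetical sketch of the pattern (not featureflow's actual code), assuming only the standard library:

    # Hypothetical sketch: generators must stop via return/break, not by
    # raising StopIteration themselves (PEP 479, default since Python 3.7).
    from queue import Empty, Queue

    def drain(queue):
        """Yield queued items until the queue is empty."""
        while True:
            try:
                yield queue.get(block=False)
            except Empty:
                # `raise StopIteration` here would surface as RuntimeError on 3.7+
                break

    q = Queue()
    q.put('a')
    q.put('b')
    assert list(drain(q)) == ['a', 'b']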