tvalentyn commented on code in PR #22088:
URL: https://github.com/apache/beam/pull/22088#discussion_r940783481
##########
sdks/python/apache_beam/ml/inference/sklearn_inference_test.py:
##########
@@ -47,6 +47,12 @@
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
+# pylint: disable=wrong-import-order, wrong-import-position,
ungrouped-imports, unused-import
+try:
+ from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
Review Comment:
why is this necessary? I am not seeing any usage of GCSFileSystem
##########
sdks/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py:
##########
@@ -32,50 +32,55 @@
import argparse
from typing import Iterable
+import pandas
+
import apache_beam as beam
from apache_beam.io.filesystems import FileSystems
from apache_beam.ml.inference.base import RunInference
from apache_beam.ml.inference.sklearn_inference import ModelFileType
from apache_beam.ml.inference.sklearn_inference import
SklearnModelHandlerPandas
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
-import pandas
-MODELS = [{
+# yapf: disable
+MODELS = [
+ {
'name': 'all_features',
- 'required_features': [
+ 'required_features': [
'Area',
'Year',
'MinTimeToNearestStation',
'MaxTimeToNearestStation',
'TotalFloorArea',
'Frontage',
'Breadth',
- 'BuildingYear'
- ]
-},
- {
- 'name': 'floor_area',
- 'required_features': ['Area', 'Year', 'TotalFloorArea']
- },
- {
- 'name': 'stations',
- 'required_features': [
- 'Area',
- 'Year',
- 'MinTimeToNearestStation',
- 'MaxTimeToNearestStation'
- ]
- }, {
- 'name': 'no_features', 'required_features': ['Area', 'Year']
- }]
+ 'BuildingYear']
+ }, {
+ 'name': 'floor_area',
+ 'required_features': ['Area', 'Year', 'TotalFloorArea']
+ }, {
+ 'name': 'stations',
+ 'required_features': [
+ 'Area',
+ 'Year',
+ 'MinTimeToNearestStation',
+ 'MaxTimeToNearestStation']
+ }, {
+ 'name': 'no_features',
+ 'required_features': ['Area', 'Year']
+ }
+]
+# yapf: enable
def sort_by_features(dataframe, max_size):
- """ Partitions the dataframe by what data it has available."""
+ """ Returns an index to a model, based on available data."""
for i, model in enumerate(MODELS):
required_features = dataframe[model['required_features']]
- if not required_features.isnull().any().any():
+ # A model can only make a prediction if all required features
+ # are present.
+ # required_features is 2D single row, so all must be called twice.
Review Comment:
```suggestion
# required_features is 2D single row, so all() must be called twice.
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]