ryanthompson591 commented on code in PR #22795:
URL: https://github.com/apache/beam/pull/22795#discussion_r953814223
##########
sdks/python/apache_beam/examples/snippets/transforms/elementwise/runinference_test.py:
##########
@@ -44,10 +44,10 @@ def check_torch_keyed_model_handler():
   expected = '''[START torch_keyed_model_handler]
-('first_question', PredictionResult(example=tensor([105.]), inference=tensor([523.6982], grad_fn=<UnbindBackward>)))
-('second_question', PredictionResult(example=tensor([108.]), inference=tensor([538.5867], grad_fn=<UnbindBackward>)))
-('third_question', PredictionResult(example=tensor([1000.]), inference=tensor([4965.4019], grad_fn=<UnbindBackward>)))
-('fourth_question', PredictionResult(example=tensor([1013.]), inference=tensor([5029.9180], grad_fn=<UnbindBackward>)))
+('first_question', PredictionResult(example=tensor([105.]), inference=tensor([523.6982])))

Review Comment:
   I found I needed a margin of error for an sklearn example. I just extracted the numbers with a regex. See:
   https://github.com/apache/beam/blob/3f5ddbcf9fece6bd9905bf67adcbda9aec6f29e9/sdks/python/apache_beam/ml/inference/sklearn_inference_it_test.py#L110
   If you find you don't need it, it's ok to be precise for now.

##########
sdks/python/apache_beam/ml/inference/pytorch_inference_test.py:
##########
@@ -373,6 +373,40 @@ def test_invalid_input_type(self):
       # pylint: disable=expression-not-assigned
       pcoll | RunInference(model_handler)
 
+  def test_gpu_convert_to_cpu(self):
+    with self.assertLogs() as log:
+      with TestPipeline() as pipeline:
+        examples = torch.from_numpy(
+            np.array([1, 5, 3, 10], dtype="float32").reshape(-1, 1))
+
+        state_dict = OrderedDict([('linear.weight', torch.Tensor([[2.0]])),
+                                  ('linear.bias', torch.Tensor([0.5]))])
+        path = os.path.join(self.tmpdir, 'my_state_dict_path')
+        torch.save(state_dict, path)
+
+        model_handler = PytorchModelHandlerTensor(
+            state_dict_path=path,
+            model_class=PytorchLinearRegression,
+            model_params={
+                'input_dim': 1, 'output_dim': 1
+            },
+            device='GPU')
+        # Upon initialization, device is cuda
+        self.assertEqual(model_handler._device, torch.device('cuda'))
+
+        pcoll = pipeline | 'start' >> beam.Create(examples)
+        # pylint: disable=expression-not-assigned
+        pcoll | RunInference(model_handler)
+
+        # During model loading, device converted to cuda
+        self.assertEqual(model_handler._device, torch.device('cuda'))
+
+      self.assertIn("INFO:root:Device is set to CUDA", log.output)
+      self.assertIn(
+          "WARNING:root:Specified 'GPU', but could not find device. " \
+          "Switching to CPU.",
+          log.output)
+

Review Comment:
   I mean, is it impossible for there to be any runtime errors if you load with device type cpu?

##########
sdks/python/apache_beam/ml/inference/pytorch_inference.py:
##########
@@ -40,11 +41,32 @@ def _load_model(
     model_class: torch.nn.Module, state_dict_path, device, **model_params):
   model = model_class(**model_params)
-  model.to(device)
+
+  if device == torch.device('cuda') and not torch.cuda.is_available():
+    logging.warning(
+        "Model handler specified a 'GPU' device, but GPUs are not available. " \
+        "Switching to CPU.")
+    device = torch.device('cpu')
+
   file = FileSystems.open(state_dict_path, 'rb')
-  model.load_state_dict(torch.load(file))
+  try:
+    logging.info(
+        "Loading state_dict_path %s onto a %s device", state_dict_path, device)
+    state_dict = torch.load(file, map_location=device)
+  except RuntimeError as e:
+    message = "Loading the model onto a GPU device failed due to an " \

Review Comment:
   Are we certain that if there is a runtime error this is the reason? Could this message ever be a red herring?
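   A rough, hypothetical sketch of the regex idea from the runinference_test.py thread above: pull the numbers out of the expected/actual lines and compare them with a tolerance instead of matching the exact repr. The helper names, the regex, and the 1% tolerance below are made up for illustration; this is not the helper linked in sklearn_inference_it_test.py.
   ```
   import re

   # Hypothetical helpers for an approximate comparison of PredictionResult
   # output lines: extract every float from the text and compare each pair
   # with a relative tolerance instead of requiring an exact string match.
   def _extract_floats(line):
     return [float(x) for x in re.findall(r'-?\d+\.\d*', line)]

   def assert_lines_almost_equal(actual_line, expected_line, rel_tol=0.01):
     actual = _extract_floats(actual_line)
     expected = _extract_floats(expected_line)
     assert len(actual) == len(expected), (actual, expected)
     for a, e in zip(actual, expected):
       # Allow a small margin of error around each expected value.
       assert abs(a - e) <= rel_tol * max(abs(e), 1e-9), (a, e)

   assert_lines_almost_equal(
       "('first_question', PredictionResult(example=tensor([105.]), inference=tensor([523.6982])))",
       "('first_question', PredictionResult(example=tensor([105.]), inference=tensor([523.70])))")
   ```
   The point is only that comparing extracted numbers with a tolerance avoids brittle exact-repr matching across torch versions; exact comparison stays fine if the outputs turn out to be stable.
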
##########
sdks/python/apache_beam/ml/inference/pytorch_inference.py:
##########
@@ -40,11 +41,32 @@ def _load_model(
     model_class: torch.nn.Module, state_dict_path, device, **model_params):
   model = model_class(**model_params)
-  model.to(device)
+
+  if device == torch.device('cuda') and not torch.cuda.is_available():
+    logging.warning(
+        "Model handler specified a 'GPU' device, but GPUs are not available. " \
+        "Switching to CPU.")
+    device = torch.device('cpu')
+
   file = FileSystems.open(state_dict_path, 'rb')
-  model.load_state_dict(torch.load(file))
+  try:
+    logging.info(
+        "Loading state_dict_path %s onto a %s device", state_dict_path, device)
+    state_dict = torch.load(file, map_location=device)
+  except RuntimeError as e:
+    message = "Loading the model onto a GPU device failed due to an " \
+        f"exception:\n{e}\nAttempting to load onto a CPU device instead."
+    logging.warning(message)
+
+    device = torch.device('cpu')

Review Comment:
   I'm a little worried about this logic. Is it possible that we've already switched to the CPU above, still get a runtime error, and then attempt to switch to the CPU again?

   You could just call _load_model again with the new device type. Something like:
   ```
   if device == torch.device('cuda') and not torch.cuda.is_available():
     logging.warning('...')
     return _load_model(model_class, state_dict_path, 'CPU', **model_params)

   try:
     state_dict = torch.load(file, map_location=device)
   except RuntimeError as e:
     if device == torch.device('cuda'):
       logging.warning(...)
       return _load_model(model_class, state_dict_path, 'CPU', **model_params)
     else:
       raise e
   ...
   ```

-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@beam.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org
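
A self-contained sketch of the restructuring suggested in the last thread, assuming the _load_model signature and FileSystems-based loading shown in the diff. The recursion argument torch.device('cpu') and the returned (model, device) pair are illustrative assumptions, not the PR's final code.
```
import logging

import torch
from apache_beam.io.filesystems import FileSystems


def _load_model(model_class, state_dict_path, device, **model_params):
  model = model_class(**model_params)

  if device == torch.device('cuda') and not torch.cuda.is_available():
    logging.warning(
        "Model handler specified a 'GPU' device, but GPUs are not available. "
        "Switching to CPU.")
    # Retry once with the CPU device instead of mutating `device` in place.
    return _load_model(
        model_class, state_dict_path, torch.device('cpu'), **model_params)

  file = FileSystems.open(state_dict_path, 'rb')
  try:
    logging.info(
        "Loading state_dict_path %s onto a %s device", state_dict_path, device)
    state_dict = torch.load(file, map_location=device)
  except RuntimeError as e:
    if device == torch.device('cuda'):
      logging.warning(
          "Loading the model onto a GPU device failed: %s. "
          "Attempting to load onto a CPU device instead.", e)
      # Fall back only when the GPU was actually the target; a failure while
      # already on CPU is re-raised rather than retried a second time.
      return _load_model(
          model_class, state_dict_path, torch.device('cpu'), **model_params)
    raise

  model.load_state_dict(state_dict)
  model.to(device)
  # Returning the device as well is an assumption about the surrounding
  # handler: it lets the caller record which device the model actually used.
  return model, device
```
Whether a RuntimeError on the GPU path is always device-related is exactly the "red herring" question raised above; the `raise` branch at least keeps a CPU-side failure from being retried, and from being mislabeled as a GPU problem.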