github-advanced-security[bot] commented on code in PR #34826: URL: https://github.com/apache/superset/pull/34826#discussion_r2586815647
########## superset/engines/manager.py: ########## @@ -0,0 +1,690 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import enum +import hashlib +import json +import logging +import threading +from contextlib import contextmanager +from datetime import timedelta +from io import StringIO +from typing import Any, Iterator, TYPE_CHECKING + +import sshtunnel +from paramiko import RSAKey +from sqlalchemy import create_engine, event, pool +from sqlalchemy.engine import Engine +from sqlalchemy.engine.url import URL +from sshtunnel import SSHTunnelForwarder + +from superset.databases.utils import make_url_safe +from superset.superset_typing import DBConnectionMutator, EngineContextManager +from superset.utils.core import get_query_source_from_request, get_user_id, QuerySource + +if TYPE_CHECKING: + from superset.databases.ssh_tunnel.models import SSHTunnel + from superset.models.core import Database + + +logger = logging.getLogger(__name__) + + +class _LockManager: + """ + Manages per-key locks safely without defaultdict race conditions. + + This class provides a thread-safe way to create and manage locks for specific keys, + avoiding the race conditions that occur when using defaultdict with threading.Lock. + + The implementation uses a two-level locking strategy: + 1. A meta-lock to protect the lock dictionary itself + 2. Per-key locks to protect specific resources + + This ensures that: + - Different keys can be locked concurrently (scalability) + - Lock creation is thread-safe (no race conditions) + - The same key always gets the same lock instance + """ + + def __init__(self) -> None: + self._locks: dict[str, threading.RLock] = {} + self._meta_lock = threading.Lock() + + def get_lock(self, key: str) -> threading.RLock: + """ + Get or create a lock for the given key. + + This method uses double-checked locking to ensure thread safety: + 1. First check without lock (fast path) + 2. Acquire meta-lock if needed + 3. Double-check inside the lock to prevent race conditions + + This approach minimizes lock contention while ensuring correctness. + + :param key: The key to get a lock for + :returns: An RLock instance for the given key + """ + if lock := self._locks.get(key): + return lock + + with self._meta_lock: + # Double-check inside the lock + lock = self._locks.get(key) + if lock is None: + lock = threading.RLock() + self._locks[key] = lock + return lock + + def cleanup(self, active_keys: set[str]) -> None: + """ + Remove locks for keys that are no longer in use. + + This prevents memory leaks from accumulating locks for resources + that have been disposed. + + :param active_keys: Set of keys that are still active + """ + with self._meta_lock: + # Find locks to remove + locks_to_remove = self._locks.keys() - active_keys + for key in locks_to_remove: + self._locks.pop(key, None) + + +EngineKey = str +TunnelKey = str + + +def _normalize_value(value: Any) -> str: + """ + Normalize a value for consistent hashing. + + Converts various types to a consistent string representation for hashing. + Handles special cases like bytes, class objects, and nested structures. + + :param value: The value to normalize + :returns: String representation suitable for hashing + """ + if isinstance(value, bytes): + # For binary data (like private keys), hash it to avoid encoding issues + return hashlib.sha256(value).hexdigest()[:16] Review Comment: ## Use of a broken or weak cryptographic hashing algorithm on sensitive data [Sensitive data (password)](1) is used in a hashing algorithm (SHA256) that is insecure for password hashing, since it is not a computationally expensive hash function. [Show more details](https://github.com/apache/superset/security/code-scanning/2069) ########## superset/engines/manager.py: ########## @@ -0,0 +1,690 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import enum +import hashlib +import json +import logging +import threading +from contextlib import contextmanager +from datetime import timedelta +from io import StringIO +from typing import Any, Iterator, TYPE_CHECKING + +import sshtunnel +from paramiko import RSAKey +from sqlalchemy import create_engine, event, pool +from sqlalchemy.engine import Engine +from sqlalchemy.engine.url import URL +from sshtunnel import SSHTunnelForwarder + +from superset.databases.utils import make_url_safe +from superset.superset_typing import DBConnectionMutator, EngineContextManager +from superset.utils.core import get_query_source_from_request, get_user_id, QuerySource + +if TYPE_CHECKING: + from superset.databases.ssh_tunnel.models import SSHTunnel + from superset.models.core import Database + + +logger = logging.getLogger(__name__) + + +class _LockManager: + """ + Manages per-key locks safely without defaultdict race conditions. + + This class provides a thread-safe way to create and manage locks for specific keys, + avoiding the race conditions that occur when using defaultdict with threading.Lock. + + The implementation uses a two-level locking strategy: + 1. A meta-lock to protect the lock dictionary itself + 2. Per-key locks to protect specific resources + + This ensures that: + - Different keys can be locked concurrently (scalability) + - Lock creation is thread-safe (no race conditions) + - The same key always gets the same lock instance + """ + + def __init__(self) -> None: + self._locks: dict[str, threading.RLock] = {} + self._meta_lock = threading.Lock() + + def get_lock(self, key: str) -> threading.RLock: + """ + Get or create a lock for the given key. + + This method uses double-checked locking to ensure thread safety: + 1. First check without lock (fast path) + 2. Acquire meta-lock if needed + 3. Double-check inside the lock to prevent race conditions + + This approach minimizes lock contention while ensuring correctness. + + :param key: The key to get a lock for + :returns: An RLock instance for the given key + """ + if lock := self._locks.get(key): + return lock + + with self._meta_lock: + # Double-check inside the lock + lock = self._locks.get(key) + if lock is None: + lock = threading.RLock() + self._locks[key] = lock + return lock + + def cleanup(self, active_keys: set[str]) -> None: + """ + Remove locks for keys that are no longer in use. + + This prevents memory leaks from accumulating locks for resources + that have been disposed. + + :param active_keys: Set of keys that are still active + """ + with self._meta_lock: + # Find locks to remove + locks_to_remove = self._locks.keys() - active_keys + for key in locks_to_remove: + self._locks.pop(key, None) + + +EngineKey = str +TunnelKey = str + + +def _normalize_value(value: Any) -> str: + """ + Normalize a value for consistent hashing. + + Converts various types to a consistent string representation for hashing. + Handles special cases like bytes, class objects, and nested structures. + + :param value: The value to normalize + :returns: String representation suitable for hashing + """ + if isinstance(value, bytes): + # For binary data (like private keys), hash it to avoid encoding issues + return hashlib.sha256(value).hexdigest()[:16] + elif isinstance(value, type): + # For class objects (like pool classes), use the class name + return value.__name__ + elif isinstance(value, dict): + # For nested dicts, recursively normalize + normalized_dict = {} + for k, v in sorted(value.items()): + normalized_dict[k] = _normalize_value(v) + return json.dumps(normalized_dict, sort_keys=True, separators=(",", ":")) + elif isinstance(value, (list, tuple)): + # For lists/tuples, normalize each item + normalized_list = [_normalize_value(item) for item in value] + return json.dumps(normalized_list, separators=(",", ":")) + else: + # For everything else, convert to string + return str(value) + + +def _generate_secure_key(components: dict[str, Any]) -> str: + """ + Generate a secure hash-based key from components. + + Creates a SHA-256 hash of the components to ensure: + 1. The key includes all parameters for proper caching + 2. Sensitive data is not exposed in logs or errors + 3. The key is deterministic for the same inputs + + :param components: Dictionary of components to hash + :returns: 32-character hex string representing the secure key + """ + # Create deterministic string representation + # Sort keys for consistency + key_data = { + k: _normalize_value(v) if v is not None else "" + for k, v in sorted(components.items()) + } + + # Create compact JSON representation + key_string = json.dumps(key_data, sort_keys=True, separators=(",", ":")) + + # Generate SHA-256 hash and return first 32 hex characters + # 32 characters = 128 bits of entropy, sufficient for collision resistance + return hashlib.sha256(key_string.encode("utf-8")).hexdigest()[:32] Review Comment: ## Use of a broken or weak cryptographic hashing algorithm on sensitive data [Sensitive data (password)](1) is used in a hashing algorithm (SHA256) that is insecure for password hashing, since it is not a computationally expensive hash function. [Show more details](https://github.com/apache/superset/security/code-scanning/2070) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
