codeant-ai-for-open-source[bot] commented on code in PR #37186:
URL: https://github.com/apache/superset/pull/37186#discussion_r2719657810


##########
tests/unit_tests/mcp_service/utils/test_sanitization.py:
##########
@@ -0,0 +1,411 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+
+from superset.mcp_service.utils.sanitization import (
+    _check_dangerous_patterns,
+    _check_sql_patterns,
+    _remove_dangerous_unicode,
+    _strip_html_tags,
+    sanitize_filter_value,
+    sanitize_user_input,
+)
+
+
+# --- _strip_html_tags tests ---
+
+
+def test_strip_html_tags_plain_text():
+    assert _strip_html_tags("hello world") == "hello world"
+
+
+def test_strip_html_tags_preserves_ampersand():
+    assert _strip_html_tags("A & B") == "A & B"
+
+
+def test_strip_html_tags_preserves_multiple_ampersands():
+    assert _strip_html_tags("A & B & C") == "A & B & C"
+
+
+def test_strip_html_tags_strips_bold_tags():
+    assert _strip_html_tags("<b>hello</b>") == "hello"
+
+
+def test_strip_html_tags_strips_script_tags():
+    result = _strip_html_tags("<script>alert(1)</script>")
+    assert "<script>" not in result
+    assert "</script>" not in result
+
+
+def test_strip_html_tags_strips_entity_encoded_script():
+    """Entity-encoded tags must be decoded and stripped, not passed through."""
+    result = _strip_html_tags("&lt;script&gt;alert(1)&lt;/script&gt;")
+    assert "<script>" not in result
+    assert "&lt;script&gt;" not in result
+
+
+def test_strip_html_tags_strips_double_encoded_script():
+    """Double-encoded entities must also be decoded and stripped."""
+    result = 
_strip_html_tags("&amp;lt;script&amp;gt;alert(1)&amp;lt;/script&amp;gt;")
+    assert "<script>" not in result
+    assert "&lt;script&gt;" not in result
+
+
+def test_strip_html_tags_strips_img_onerror():
+    result = _strip_html_tags('<img src=x onerror="alert(1)">')
+    assert "<img" not in result
+    assert "onerror" not in result
+
+
+def test_strip_html_tags_strips_div_tags():
+    assert _strip_html_tags("<div>content</div>") == "content"
+
+
+def test_strip_html_tags_preserves_less_than_in_text():
+    """A bare < not forming a tag should be preserved."""
+    result = _strip_html_tags("5 < 10")
+    assert "5" in result
+    assert "10" in result
+
+
+def test_strip_html_tags_empty_string():
+    assert _strip_html_tags("") == ""
+
+
+def test_strip_html_tags_triple_encoded_script():
+    """Triple-encoded entities must also be decoded and stripped."""
+    result = _strip_html_tags(
+        "&amp;amp;lt;script&amp;amp;gt;alert(1)&amp;amp;lt;/script&amp;amp;gt;"
+    )
+    assert "<script>" not in result
+
+
+def test_strip_html_tags_mixed_encoded_and_raw():
+    """Both raw and entity-encoded tags should be stripped."""
+    result = _strip_html_tags("<b>bold</b> and &lt;i&gt;italic&lt;/i&gt;")
+    assert "<b>" not in result
+    assert "<i>" not in result
+    assert "bold" in result
+    assert "italic" in result
+
+
+def test_strip_html_tags_deep_encoding_terminates():
+    """Verify the iterative decode loop terminates on many encoding layers."""
+    value = "<script>alert(1)</script>"
+    for _ in range(10):
+        value = value.replace("&", "&amp;").replace("<", "&lt;").replace(">", 
"&gt;")
+    result = _strip_html_tags(value)
+    assert "<script>" not in result
+
+
+def test_strip_html_tags_entity_ampersand():
+    """&amp; in input should become & in output."""
+    assert _strip_html_tags("A &amp; B") == "A & B"
+
+
+# --- _check_dangerous_patterns tests ---
+
+
+def test_check_dangerous_patterns_safe_input():
+    _check_dangerous_patterns("hello world", "test")
+
+
+def test_check_dangerous_patterns_javascript_scheme():
+    with pytest.raises(ValueError, match="malicious URL scheme"):
+        _check_dangerous_patterns("javascript:alert(1)", "test")
+
+
+def test_check_dangerous_patterns_vbscript_scheme():
+    with pytest.raises(ValueError, match="malicious URL scheme"):
+        _check_dangerous_patterns("vbscript:MsgBox", "test")
+
+
+def test_check_dangerous_patterns_data_scheme():
+    with pytest.raises(ValueError, match="malicious URL scheme"):
+        _check_dangerous_patterns("data:text/html,<script>", "test")
+
+
+def test_check_dangerous_patterns_case_insensitive():
+    with pytest.raises(ValueError, match="malicious URL scheme"):
+        _check_dangerous_patterns("JAVASCRIPT:alert(1)", "test")
+
+
+def test_check_dangerous_patterns_onclick():
+    with pytest.raises(ValueError, match="malicious event handler"):
+        _check_dangerous_patterns("onclick=alert(1)", "test")
+
+
+def test_check_dangerous_patterns_onerror():
+    with pytest.raises(ValueError, match="malicious event handler"):
+        _check_dangerous_patterns("onerror = alert(1)", "test")
+
+
+def test_check_dangerous_patterns_onload():
+    with pytest.raises(ValueError, match="malicious event handler"):
+        _check_dangerous_patterns("onload=fetch('x')", "test")
+
+
+# --- _check_sql_patterns tests ---
+
+
+def test_check_sql_patterns_safe_input():
+    _check_sql_patterns("revenue_total", "test")
+
+
+def test_check_sql_patterns_drop_table():
+    with pytest.raises(ValueError, match="unsafe SQL keywords"):
+        _check_sql_patterns("DROP TABLE users", "test")
+
+
+def test_check_sql_patterns_delete():
+    with pytest.raises(ValueError, match="unsafe SQL keywords"):
+        _check_sql_patterns("DELETE FROM users", "test")
+
+
+def test_check_sql_patterns_semicolon():
+    with pytest.raises(ValueError, match="unsafe characters"):
+        _check_sql_patterns("value; other", "test")
+
+
+def test_check_sql_patterns_sql_comment_dash():
+    with pytest.raises(ValueError, match="unsafe characters"):
+        _check_sql_patterns("value -- comment", "test")
+
+
+def test_check_sql_patterns_sql_comment_block():
+    with pytest.raises(ValueError, match="unsafe SQL comment"):
+        _check_sql_patterns("value /* comment */", "test")
+
+
+def test_check_sql_patterns_pipe():
+    with pytest.raises(ValueError, match="unsafe characters"):
+        _check_sql_patterns("value | other", "test")
+
+
+def test_check_sql_patterns_case_insensitive():
+    with pytest.raises(ValueError, match="unsafe SQL keywords"):
+        _check_sql_patterns("drop table users", "test")
+
+
+# --- _remove_dangerous_unicode tests ---
+
+
+def test_remove_dangerous_unicode_plain_text():
+    assert _remove_dangerous_unicode("hello") == "hello"
+
+
+def test_remove_dangerous_unicode_zero_width_space():
+    assert _remove_dangerous_unicode("he\u200bllo") == "hello"
+
+
+def test_remove_dangerous_unicode_zero_width_joiner():
+    assert _remove_dangerous_unicode("he\u200dllo") == "hello"
+
+
+def test_remove_dangerous_unicode_bom():
+    assert _remove_dangerous_unicode("\ufeffhello") == "hello"
+
+
+def test_remove_dangerous_unicode_null_byte():
+    assert _remove_dangerous_unicode("he\x00llo") == "hello"
+
+
+def test_remove_dangerous_unicode_preserves_normal_unicode():
+    assert _remove_dangerous_unicode("café résumé") == "café résumé"
+
+
+# --- sanitize_user_input tests ---
+
+
+def test_sanitize_user_input_plain_text():
+    assert sanitize_user_input("hello", "test") == "hello"
+
+
+def test_sanitize_user_input_preserves_ampersand():
+    assert sanitize_user_input("A & B", "test") == "A & B"
+
+
+def test_sanitize_user_input_strips_html():
+    assert sanitize_user_input("<b>hello</b>", "test") == "hello"
+
+
+def test_sanitize_user_input_none_not_allowed():
+    with pytest.raises(ValueError, match="cannot be empty"):
+        sanitize_user_input(None, "test")
+
+
+def test_sanitize_user_input_none_allowed():
+    assert sanitize_user_input(None, "test", allow_empty=True) is None
+
+
+def test_sanitize_user_input_empty_string_not_allowed():
+    with pytest.raises(ValueError, match="cannot be empty"):
+        sanitize_user_input("", "test")
+
+
+def test_sanitize_user_input_empty_string_allowed():
+    assert sanitize_user_input("", "test", allow_empty=True) is None
+
+
+def test_sanitize_user_input_whitespace_only():
+    with pytest.raises(ValueError, match="cannot be empty"):
+        sanitize_user_input("   ", "test")
+
+
+def test_sanitize_user_input_strips_whitespace():
+    assert sanitize_user_input("  hello  ", "test") == "hello"
+
+
+def test_sanitize_user_input_too_long():
+    with pytest.raises(ValueError, match="too long"):
+        sanitize_user_input("a" * 256, "test", max_length=255)
+
+
+def test_sanitize_user_input_max_length_ok():
+    result = sanitize_user_input("a" * 255, "test", max_length=255)
+    assert result == "a" * 255
+
+
+def test_sanitize_user_input_blocks_javascript():
+    with pytest.raises(ValueError, match="malicious URL scheme"):
+        sanitize_user_input("javascript:alert(1)", "test")
+
+
+def test_sanitize_user_input_blocks_event_handler():
+    with pytest.raises(ValueError, match="malicious event handler"):
+        sanitize_user_input("onclick=alert(1)", "test")
+
+
+def test_sanitize_user_input_sql_keywords_not_checked_by_default():
+    result = sanitize_user_input("DROP TABLE", "test")
+    assert result == "DROP TABLE"
+
+
+def test_sanitize_user_input_sql_keywords_checked_when_enabled():
+    with pytest.raises(ValueError, match="unsafe SQL keywords"):
+        sanitize_user_input("DROP TABLE users", "test", 
check_sql_keywords=True)
+
+
+def test_sanitize_user_input_removes_zero_width_chars():
+    result = sanitize_user_input("hel\u200blo", "test")
+    assert result == "hello"
+
+
+def test_sanitize_user_input_xss_entity_encoded():
+    """Entity-encoded XSS attempts must be neutralized."""
+    result = sanitize_user_input("&lt;script&gt;alert(1)&lt;/script&gt;", 
"test")
+    assert "<script>" not in result
+
+
+def test_sanitize_user_input_entity_encoded_javascript():
+    """Entity-encoded javascript: scheme should be caught after decoding."""
+    with pytest.raises(ValueError, match="malicious URL scheme"):
+        sanitize_user_input("&#106;avascript:alert(1)", "test")
+
+
+# --- sanitize_filter_value tests ---
+
+
+def test_sanitize_filter_value_integer():
+    assert sanitize_filter_value(42) == 42
+
+
+def test_sanitize_filter_value_float():
+    assert sanitize_filter_value(3.14) == 3.14
+
+
+def test_sanitize_filter_value_bool():
+    assert sanitize_filter_value(True) is True
+
+
+def test_sanitize_filter_value_plain_string():
+    assert sanitize_filter_value("hello") == "hello"
+
+
+def test_sanitize_filter_value_preserves_ampersand():
+    assert sanitize_filter_value("A & B") == "A & B"
+
+
+def test_sanitize_filter_value_strips_html():
+    assert sanitize_filter_value("<b>hello</b>") == "hello"
+
+
+def test_sanitize_filter_value_too_long():
+    with pytest.raises(ValueError, match="too long"):
+        sanitize_filter_value("a" * 1001)
+
+
+def test_sanitize_filter_value_blocks_javascript():
+    with pytest.raises(ValueError, match="malicious URL scheme"):
+        sanitize_filter_value("javascript:alert(1)")
+
+
+def test_sanitize_filter_value_blocks_xp_cmdshell():
+    with pytest.raises(ValueError, match="malicious SQL procedures"):
+        sanitize_filter_value("xp_cmdshell('dir')")
+
+
+def test_sanitize_filter_value_blocks_sp_executesql():
+    with pytest.raises(ValueError, match="malicious SQL procedures"):
+        sanitize_filter_value("sp_executesql @stmt")
+
+
+def test_sanitize_filter_value_blocks_union_select():
+    with pytest.raises(ValueError, match="malicious SQL patterns"):
+        sanitize_filter_value("' UNION SELECT * FROM users")
+
+
+def test_sanitize_filter_value_blocks_sql_comment():
+    with pytest.raises(ValueError, match="malicious SQL patterns"):
+        sanitize_filter_value("value -- drop")
+
+
+def test_sanitize_filter_value_blocks_shell_semicolon():
+    with pytest.raises(ValueError, match="unsafe shell characters"):
+        sanitize_filter_value("value;rm -rf")
+
+
+def test_sanitize_filter_value_blocks_shell_pipe():
+    with pytest.raises(ValueError, match="unsafe shell characters"):
+        sanitize_filter_value("value|cat /etc/passwd")
+
+
+def test_sanitize_filter_value_blocks_backtick():
+    with pytest.raises(ValueError, match="unsafe shell characters"):
+        sanitize_filter_value("`whoami`")
+
+
+def test_sanitize_filter_value_blocks_hex_encoding():
+    with pytest.raises(ValueError, match="hex encoding"):
+        sanitize_filter_value("\\x41\\x42")
+
+
+def test_sanitize_filter_value_allows_ampersand_alone():
+    """Ampersand is safe in filter values (only dangerous in shell 
contexts)."""
+    assert sanitize_filter_value("AT&T") == "AT&T"

Review Comment:
   **Suggestion:** The test asserts that an ampersand is allowed ("AT&T"), but 
the current `_check_sql_patterns` flags `&` as an unsafe character, so this 
test will fail against the present implementation. Either the sanitizer must be 
changed to allow `&` in filter values, or the test should expect the current 
behavior; update the test to expect the sanitizer's current error. [logic error]
   
   <details>
   <summary><b>Severity Level:</b> Critical 🚨</summary>
   
   ```mdx
   - ❌ CI unit tests fail blocking PR validation.
   - ⚠️ Ambiguity about ampersand policy in sanitizer.
   ```
   </details>
   
   ```suggestion
       """Ampersand is considered unsafe by the current implementation."""
       with pytest.raises(ValueError, match="potentially unsafe characters"):
           sanitize_filter_value("AT&T")
   ```
   <details>
   <summary><b>Steps of Reproduction ✅ </b></summary>
   
   ```mdx
   1. Run the single unit test: pytest
   
tests/unit_tests/mcp_service/utils/test_sanitization.py::test_sanitize_filter_value_allows_ampersand_alone
   
      - Test located at 
tests/unit_tests/mcp_service/utils/test_sanitization.py:398-400.
   
   2. The test calls sanitize_filter_value("AT&T") at
   tests/unit_tests/mcp_service/utils/test_sanitization.py:400.
   
   3. sanitize_filter_value triggers SQL/shell-character checks implemented in:
   
      - superset/mcp_service/utils/sanitization.py:_check_sql_patterns (see 
unsafe characters
      check at lines 100-125).
   
      - _check_sql_patterns currently considers & unsafe via the regex 
r"[;|&$`]|--"
      (superset/mcp_service/utils/sanitization.py:109-112).
   
   4. Given the current implementation, sanitize_filter_value("AT&T") will 
raise ValueError
   for "potentially unsafe characters", so the test's assertion that the call 
returns "AT&T"
   is false — running the test reproduces the failure immediately. The mismatch 
reflects
   either an intentional sanitizer policy (disallow &), or a test that needs 
updating if the
   project's policy is to allow &.
   ```
   </details>
   <details>
   <summary><b>Prompt for AI Agent 🤖 </b></summary>
   
   ```mdx
   This is a comment left during a code review.
   
   **Path:** tests/unit_tests/mcp_service/utils/test_sanitization.py
   **Line:** 399:400
   **Comment:**
        *Logic Error: The test asserts that an ampersand is allowed ("AT&T"), 
but the current `_check_sql_patterns` flags `&` as an unsafe character, so this 
test will fail against the present implementation. Either the sanitizer must be 
changed to allow `&` in filter values, or the test should expect the current 
behavior; update the test to expect the sanitizer's current error.
   
   Validate the correctness of the flagged issue. If correct, How can I resolve 
this? If you propose a fix, implement it and please make it concise.
   ```
   </details>



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to