This is an automated email from the ASF dual-hosted git repository.
potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow-steward.git
The following commit(s) were added to refs/heads/main by this push:
new e0a0508 fix(privacy-llm-redactor): correct --field help text and
force UTF-8 mapping reads (#231)
e0a0508 is described below
commit e0a0508ffee6fa390fe4a1948fc9c638d6fba29e
Author: André Ahlert <[email protected]>
AuthorDate: Tue May 19 19:49:43 2026 -0300
fix(privacy-llm-redactor): correct --field help text and force UTF-8
mapping reads (#231)
Two critical correctness bugs in the redactor package.
1. The --field help text listed a type name "reporter" and code "R"
that the parser does not accept (valid names come from TYPE_CODES:
name/email/phone/ip/handle/address, codes N/E/P/IP/H/A). A user
copying the documented form got SystemExit and their PII flowed to
the LLM unredacted. Help text now lists the real names and codes.
2. load_mapping read the mapping file with the locale-default encoding
while save_mapping_atomic writes UTF-8. On a non-UTF-8 host this
corrupts non-ASCII PII values (accented names, IDN domains) on the
round-trip, so pii-reveal substitutes wrong text. load_mapping now
reads with encoding="utf-8".
Adds a regression test for each fix.
---
tools/privacy-llm/redactor/src/redactor/mapping.py | 2 +-
tools/privacy-llm/redactor/src/redactor/redact.py | 2 +-
tools/privacy-llm/redactor/tests/test_mapping.py | 17 +++++++++++++++++
tools/privacy-llm/redactor/tests/test_redact.py | 19 ++++++++++++++++++-
4 files changed, 37 insertions(+), 3 deletions(-)
diff --git a/tools/privacy-llm/redactor/src/redactor/mapping.py b/tools/privacy-llm/redactor/src/redactor/mapping.py
index 6248d47..2926d75 100644
--- a/tools/privacy-llm/redactor/src/redactor/mapping.py
+++ b/tools/privacy-llm/redactor/src/redactor/mapping.py
@@ -90,7 +90,7 @@ def load_mapping(path: pathlib.Path) -> dict[str, Entry]:
"""
if not path.exists():
return {}
- raw = json.loads(path.read_text())
+ raw = json.loads(path.read_text(encoding="utf-8"))
if not isinstance(raw, dict):
raise ValueError(f"{path}: expected a JSON object at the top level")
version = raw.get("version")
diff --git a/tools/privacy-llm/redactor/src/redactor/redact.py b/tools/privacy-llm/redactor/src/redactor/redact.py
index eccd6f2..6fb7d37 100644
--- a/tools/privacy-llm/redactor/src/redactor/redact.py
+++ b/tools/privacy-llm/redactor/src/redactor/redact.py
@@ -130,7 +130,7 @@ def main(argv: list[str] | None = None) -> int:
help=(
"PII to redact, declared as type:value. "
"Repeat for each field. Type is one of: "
- "reporter, email, phone, ip, handle, address (or codes R, E, P, IP, H, A)."
+ "name, email, phone, ip, handle, address (or codes N, E, P, IP, H, A)."
),
)
parser.add_argument(
diff --git a/tools/privacy-llm/redactor/tests/test_mapping.py b/tools/privacy-llm/redactor/tests/test_mapping.py
index 8afec1a..f9d8b55 100644
--- a/tools/privacy-llm/redactor/tests/test_mapping.py
+++ b/tools/privacy-llm/redactor/tests/test_mapping.py
@@ -148,6 +148,23 @@ def test_save_and_load_round_trip(tmp_path: pathlib.Path):
assert loaded == mapping
+def test_load_round_trips_non_ascii_values(tmp_path: pathlib.Path):
+ """Non-ASCII PII values must survive a save/load round-trip.
+
+ Regression: ``load_mapping`` read the file with the locale-default
+ encoding while ``save_mapping_atomic`` writes UTF-8, corrupting
+ non-ASCII values (accented names, IDN domains) on non-UTF-8 hosts.
+ """
+ path = tmp_path / "pii.json"
+ mapping: dict[str, Entry] = {}
+ upsert(mapping, "N", "José Müller")
+ upsert(mapping, "E", "renée@exámple.com")
+
+ save_mapping_atomic(path, mapping)
+ loaded = load_mapping(path)
+ assert loaded == mapping
+
+
def test_save_creates_parent_dir(tmp_path: pathlib.Path):
path = tmp_path / "deeper" / "nested" / "pii.json"
mapping: dict[str, Entry] = {}
diff --git a/tools/privacy-llm/redactor/tests/test_redact.py b/tools/privacy-llm/redactor/tests/test_redact.py
index 8527d74..7feb317 100644
--- a/tools/privacy-llm/redactor/tests/test_redact.py
+++ b/tools/privacy-llm/redactor/tests/test_redact.py
@@ -77,6 +77,23 @@ def test_parse_field_rejects_empty_value():
redact.parse_field("name:")
+def test_field_help_text_lists_real_type_names(monkeypatch):
+ """The ``--field`` help must name types the parser accepts.
+
+ Regression: the help listed ``reporter`` / code ``R``, neither of
+ which exists. A user copying the help got ``SystemExit`` and their
+ PII flowed to the LLM unredacted.
+ """
+ stdout = io.StringIO()
+ monkeypatch.setattr("sys.stdout", stdout)
+ with pytest.raises(SystemExit):
+ redact.main(["--help"])
+ # argparse wraps the help line; collapse whitespace before matching.
+ help_text = " ".join(stdout.getvalue().split())
+ assert "reporter" not in help_text
+ assert "name, email, phone, ip, handle, address" in help_text
+
+
# -- end-to-end redaction ------------------------------------------------
@@ -102,7 +119,7 @@ def test_redact_persists_mapping(mapping_path, monkeypatch):
)
assert rc == 0
mapping = load_mapping(mapping_path)
- # Exactly one entry, of type reporter, value "Jane Smith".
+ # Exactly one entry, of type name, value "Jane Smith".
assert len(mapping) == 1
[entry] = mapping.values()
assert entry.type == "name"