- These of course rely on the previous patch - fixing the detox bug causing
malformed UTF-8 chars.
- example table files provided; I can easily imagine some people will now
come up with specific sets of files which would work for their language, e.g.
Java creates (anonymous class) class files with a '$' symbol in them, which
could possibly be useful for detox to know about (then again, maybe not)
- added a comment to detoxrc.sample noting that the utf-8 cleaning method is not yet implemented
---
etc/detoxrc.sample | 63 +++++++++++++++++++++++++++++++++++++++++++++++++-----
table/brackets.tbl | 12 +++++++++++
table/punct1.tbl | 23 ++++++++++++++++++++
table/punct2.tbl | 16 ++++++++++++++
table/space.tbl | 5 +++++
5 files changed, 114 insertions(+), 5 deletions(-)
create mode 100644 table/brackets.tbl
create mode 100644 table/punct1.tbl
create mode 100644 table/punct2.tbl
create mode 100644 table/space.tbl
diff --git a/etc/detoxrc.sample b/etc/detoxrc.sample
index 3247fc7..1e8bfe7 100644
--- a/etc/detoxrc.sample
+++ b/etc/detoxrc.sample
@@ -6,15 +6,15 @@
# met:
#
# 1. Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
+# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of author nor the names of its contributors may be
-# used to endorse or promote products derived from this software
-# without specific prior written permission.
+# used to endorse or promote products derived from this software
+# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
@@ -32,6 +32,7 @@
#
# Basically just utf_8
#
+
sequence default {
utf_8;
safe;
@@ -67,6 +68,29 @@ sequence "lower" {
wipeup;
};
+sequence "punctuation" {
+ safe {filename "/usr/share/detox/space.tbl";};
+ safe {filename "/usr/share/detox/brackets.tbl";};
+ safe {filename "/usr/share/detox/punct1.tbl";};
+ wipeup;
+};
+
+sequence "unix" {
+ uncgi;
+ # perhaps insert utf_8 fall through option here (when implemented) ?
+ # i.e. unicode control characters, special blocks, line terminators
+ # (there's at least 4) etc, should be filtered out here, or in the
+ # lines below, but we need first to be able to replace "safe" with
+ # "utf_8" detox (internal) processing codepath (that's the "when
+ # implemented" bit :)
+ safe {filename "/usr/share/detox/space.tbl";};
+ safe {filename "/usr/share/detox/brackets.tbl";};
+ safe {filename "/usr/share/detox/punct1.tbl";};
+ safe {filename "/usr/share/detox/punct2.tbl";};
+ wipeup {remove_trailing;};
+};
+
+
#
# Sequences meant primarily for inline-detox
#
@@ -87,6 +111,35 @@ sequence "lower-only" {
lower;
};
+sequence "space" {
+ safe {filename "/usr/share/detox/space.tbl";};
+};
+
+sequence "brackets" {
+ safe {filename "/usr/share/detox/brackets.tbl";};
+};
+
+sequence "punct1" {
+ safe {filename "/usr/share/detox/punct1.tbl";};
+};
+
+sequence "punct2" {
+ safe {filename "/usr/share/detox/punct2.tbl";};
+};
+
+sequence "shell-punct" {
+ safe {filename "/usr/share/detox/space.tbl";};
+ safe {filename "/usr/share/detox/punct1.tbl";};
+};
+
+sequence "punct" {
+ # for performance, these might need to be combined into one file
+ safe {filename "/usr/share/detox/space.tbl";};
+ safe {filename "/usr/share/detox/brackets.tbl";};
+ safe {filename "/usr/share/detox/punct1.tbl";};
+ safe {filename "/usr/share/detox/punct2.tbl";};
+};
+
#
# Files to ignore (detox only)
diff --git a/table/brackets.tbl b/table/brackets.tbl
new file mode 100644
index 0000000..ade8770
--- /dev/null
+++ b/table/brackets.tbl
@@ -0,0 +1,12 @@
+# See file "LICENSE" for distribution and modification terms.
+
+start
+
+0x28 - # (
+0x29 - # )
+0x5b - # [
+0x5d - # ]
+0x7b - # {
+0x7d - # }
+
+end
diff --git a/table/punct1.tbl b/table/punct1.tbl
new file mode 100644
index 0000000..e5eb817
--- /dev/null
+++ b/table/punct1.tbl
@@ -0,0 +1,23 @@
+# See file "LICENSE" for distribution and modification terms.
+
+start
+
+0x21 _ # !
+0x22 _ # "
+0x24 _ # $
+0x27 _ # '
+0x2a _ # *
+0x2f _ # /
+0x3a _ # :
+0x3b _ # ;
+0x3c _ # <
+0x3e _ # >
+0x3f _ # ?
+0x40 _ # @
+0x5c _ # \
+0x60 _ # `
+0x7c _ # |
+
+0x26 _and_ # &
+
+end
diff --git a/table/punct2.tbl b/table/punct2.tbl
new file mode 100644
index 0000000..7a49307
--- /dev/null
+++ b/table/punct2.tbl
@@ -0,0 +1,16 @@
+# See file "LICENSE" for distribution and modification terms.
+
+start
+
+0x23 '#'
+0x25 %
+0x2b +
+0x2c ,
+0x2d -
+0x2e .
+0x3d =
+0x5e ^
+0x5f _
+0x7e ~
+
+end
diff --git a/table/space.tbl b/table/space.tbl
new file mode 100644
index 0000000..5288442
--- /dev/null
+++ b/table/space.tbl
@@ -0,0 +1,5 @@
+# See file "LICENSE" for distribution and modification terms.
+
+start
+0x0020 _ # space
+end
--
2.9.0