Hi,
  the attached patch is meant to implement rfc3986, and eventually
supersede rfc1738, which is not API-compatible with.

It implements two methods, templatized to be available to both SBuf
and std::string using the same implementation (called String
hereafter):

String rfc3986_unescape(const String &)
returns a new String () which percent-decodes the argument.

String rfc3986_escape(const String &, const CharacterSet&)
returns a new String which percent-encodes the first argument
according to the characters-to-be-escaped mask passed as second
argument (defaults to "unsafe and ctrls", which is defined the same as
in current rfc1738).

RFC3986 is a small auxiliary class meant to have a namespace-like
behavior for a bunch of constant CharacterSet which implement all
currently defined escape-masks, plus these in RFC 3986.

Unit tests have been adapted from rfc1738's and behavior matches
between the two implementations.

Unscientific performance testing (the code is in a disabled-by-default
unit test for those wishing to check the method) shows that the
rfc3986 implementation is faster than rfc1738's both in escaping and
unescaping by about 50%, except probably in the case where a fully
in-place unescaping is possible (this has not been tested).

This patch requires the SBuf API patch and the CharacterSet patch I
recently posted for review.

The feature-branch is at lp:~kinkie/squid/rfc3986

Thanks

-- 
    Francesco
=== added file 'include/rfc3986.h'
--- include/rfc3986.h	1970-01-01 00:00:00 +0000
+++ include/rfc3986.h	2015-12-27 18:34:58 +0000
@@ -0,0 +1,130 @@
+/*
+ * Copyright (C) 1996-2015 The Squid Software Foundation and contributors
+ *
+ * Squid software is distributed under GPLv2+ license and includes
+ * contributions from numerous individuals and organizations.
+ * Please see the COPYING and CONTRIBUTORS files for details.
+ */
+
+#ifndef SQUID_INCLUDE_RFC3986_H_
+#define SQUID_INCLUDE_RFC3986_H_
+
+#include "base/CharacterSet.h"
+
+// duplicate from rfc1738.c
+static int
+fromhex(char ch)
+{
+    if (ch >= '0' && ch <= '9')
+        return ch - '0';
+    if (ch >= 'a' && ch <= 'f')
+        return ch - 'a' + 10;
+    if (ch >= 'A' && ch <= 'F')
+        return ch - 'A' + 10;
+    return -1;
+}
+
+// return a static 2-char buffer with a hex representation of argument
+static char*
+tohex(const unsigned char c)
+{
+    static char rv[3];
+    (void) snprintf ( rv, 3, "%02X", c);
+    return rv;
+}
+
+/** unescape a percent-encoded string
+ *
+ * API-compatible with std::string and SBuf
+ */
+template <class String>
+String rfc3986_unescape(const String &s)
+{
+    typename String::size_type pos=s.find('%');
+    if (pos == String::npos)
+        return s;
+    String rv;
+    rv.reserve(s.length());
+    const auto e=s.end();
+    for (auto in = s.begin(); in != e; ++in) {
+        if (*in != '%') { // normal case, copy and continue
+            rv.push_back(*in);
+            continue;
+        }
+        auto ti = in;
+        ++ti;
+        if (ti == e) { // String ends in %
+            rv.push_back(*in);
+            break;
+        }
+        if (*ti == '%') { //double '%' escaping
+            rv.push_back(*in);
+            ++in;
+            continue;
+        }
+        const int v1 = fromhex(*ti);
+        if (v1 < 0) { // decoding failed at first hextdigit
+            rv.push_back(*in);
+            continue;
+        }
+        ++ti;
+        if (ti == e) { // String ends in '%[[:hexdigit:]]'
+            rv.push_back(*in);
+            continue;
+        }
+        const int v2 = fromhex(*ti);
+        if (v2 < 0) { // decoding failed at second hextdigit
+            rv.push_back(*in);
+            continue;
+        }
+        const int x = v1 << 4 | v2;
+        if (x > 0 && x <= 255) {
+            rv.push_back(static_cast<char>(x));
+            ++in; ++in;
+            continue;
+        }
+        rv.push_back(*in);
+    }
+    return rv;
+}
+
+class RFC3986 {
+public:
+    const static CharacterSet
+        Unsafe,  // RFC 1738 unsafe set
+        Ctrls,   // control characters (\0x00 to \0x1f)
+        UnsafeAndCtrls, // RFC 1738 Unsafe and Ctrls
+        Reserved1738, // RFC 1738 Reserved set
+        GenDelims,// RFC 3986 gen-delims set
+        SubDelims,// RFC 3986 sub-delims set
+        Reserved, // RFC 3986 reserved characters set
+        Unreserved, // RFC 3986 unreserved characters set
+        Unescaped,//ctrls and unsafe except for percent symbol
+        All;
+
+};
+
+template <class String>
+String rfc3986_escape(const String &s, const CharacterSet &escapeChars = RFC3986::UnsafeAndCtrls)
+{
+    String rv;
+    bool didEscape = false;
+    rv.reserve(s.length()*2); //TODO: optimize arbitrary constant
+    for (auto c : s) {
+        if (escapeChars[c]) {
+            rv.push_back('%');
+            char *hex=tohex(c);
+            rv.push_back(hex[0]);
+            rv.push_back(hex[1]);
+            didEscape = true;
+        } else {
+            rv.push_back(c);
+        }
+    }
+    if (didEscape)
+        return rv;
+    else
+        return s;
+}
+
+#endif /* SQUID_INCLUDE_RFC3986_H_ */

=== modified file 'lib/Makefile.am'
--- lib/Makefile.am	2015-08-03 03:50:25 +0000
+++ lib/Makefile.am	2015-12-27 18:34:58 +0000
@@ -42,62 +42,62 @@ EXTRA_DIST += sspwin32.cc
 endif
 if ENABLE_AUTH_NTLM
 SUBDIRS += ntlmauth
 endif
 
 #
 # dirent.c, encrypt.c and getopt.c are needed for native Windows support.
 #
 EXTRA_libmiscutil_la_SOURCES = \
 	dirent.c \
 	encrypt.c \
 	getopt.c
 
 libmiscencoding_la_SOURCES = \
 	base64.c \
 	charset.c \
 	html_quote.c \
 	md5.c \
 	rfc1738.c \
 	rfc2617.c \
+	rfc3986.cc \
 	uudecode.c
 
 libmisccontainers_la_SOURCES = \
 	hash.cc
 
 libmiscutil_la_SOURCES = \
 	getfullhostname.c \
 	heap.c \
 	iso3307.c \
 	radix.c \
 	rfc1123.c \
 	$(SNPRINTFSOURCE) \
 	Splay.cc \
 	stub_memaccount.c \
 	util.c \
 	xusleep.c
 
 TESTS += tests/testRFC1738
 
 check_PROGRAMS += tests/testRFC1738
 
 tests_testRFC1738_SOURCES= \
 	tests/testRFC1738.h \
 	tests/testRFC1738.cc
 
 tests_testRFC1738_LDADD= \
 	$(SQUID_CPPUNIT_LA) $(SQUID_CPPUNIT_LIBS) \
 	$(top_builddir)/lib/libmiscencoding.la \
 	$(top_builddir)/lib/libmiscutil.la \
 	$(COMPAT_LIB)
 
 tests_testRFC1738_LDFLAGS = $(LIBADD_DL)
 
-
 ## Special Universal .h dependency test script
 ## aborts if error encountered
 testHeaders: $(top_srcdir)/include/*.h
 	$(SHELL) $(top_srcdir)/test-suite/testheaders.sh "$(CXXCOMPILE)" $^ || exit 1
 
 TESTS += testHeaders
 CLEANFILES += testHeaders
 .PHONY: testHeaders

=== added file 'lib/rfc3986.cc'
--- lib/rfc3986.cc	1970-01-01 00:00:00 +0000
+++ lib/rfc3986.cc	2015-12-27 18:34:58 +0000
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) 1996-2015 The Squid Software Foundation and contributors
+ *
+ * Squid software is distributed under GPLv2+ license and includes
+ * contributions from numerous individuals and organizations.
+ * Please see the COPYING and CONTRIBUTORS files for details.
+ */
+
+#include "squid.h"
+#include "rfc3986.h"
+#include "base/CharacterSet.h"
+
+const CharacterSet
+    RFC3986::Unsafe("rfc1738:unsafe", "<>\"# %{}|\\^~[]`'"),
+    RFC3986::Ctrls("rfc1738:ctrls", {{0x00, 0x1f}, {0x7f,0xff}}),
+    RFC3986::Reserved1738("rfc1738:reserved", ";/?:@=&"),
+    RFC3986::UnsafeAndCtrls = Unsafe + Ctrls,
+    RFC3986::GenDelims("rfc3986:gen-delims",":/?#[]@"),
+    RFC3986::SubDelims("rfc3986:sub-delims","!$&'()*+,;="),
+    RFC3986::Reserved = (GenDelims + SubDelims).rename("rfc3986:reserved"),
+    RFC3986::Unreserved = CharacterSet("rfc3986:unreserved","-._~") +
+        CharacterSet::ALPHA + CharacterSet::DIGIT,
+    RFC3986::Unescaped = (UnsafeAndCtrls - CharacterSet(nullptr,"%") ).rename("rfc1738:unescaped"),
+    RFC3986::All = (Unsafe + Reserved + Ctrls).rename("all")
+    ;

=== modified file 'src/Makefile.am'
--- src/Makefile.am	2015-12-23 12:08:55 +0000
+++ src/Makefile.am	2015-12-27 18:34:58 +0000
@@ -3808,31 +3808,52 @@ tests_testLookupTable_LDADD = \
 	$(XTRA_LIBS)
 tests_testLookupTable_DEPENDENCIES = $(SQUID_CPPUNIT_LA)
 
 tests_testEnumIterator_SOURCES = \
 	base/EnumIterator.h \
 	tests/stub_debug.cc \
 	tests/stub_libmem.cc \
 	tests/testEnumIterator.h \
 	tests/testEnumIterator.cc
 nodist_tests_testEnumIterator_SOURCES = \
 	$(TESTSOURCES)
 tests_testEnumIterator_LDFLAGS = $(LIBADD_DL)
 tests_testEnumIterator_LDADD = \
 	base/libbase.la \
 	$(SQUID_CPPUNIT_LIBS) \
 	$(COMPAT_LIB) \
 	$(SQUID_CPPUNIT_LA) \
 	$(XTRA_LIBS)
 tests_testEnumIterator_DEPENDENCIES =
 
+check_PROGRAMS += tests/testRFC3986
+tests_testRFC3986_SOURCES = \
+	$(SBUF_SOURCE) \
+	String.cc \
+	tests/stub_debug.cc \
+	tests/stub_libmem.cc \
+	tests/stub_SBufDetailedStats.cc \
+	tests/testRFC3986.h \
+	tests/testRFC3986.cc
+nodist_tests_testRFC3986_SOURCES = \
+	$(TESTSOURCES)
+tests_testRFC3986_LDFLAGS = $(LIBADD_DL)
+tests_testRFC3986_LDADD = \
+	base/libbase.la \
+	$(top_builddir)/lib/libmiscencoding.la \
+	$(SQUID_CPPUNIT_LIBS) \
+	$(COMPAT_LIB) \
+	$(SQUID_CPPUNIT_LA) \
+	$(XTRA_LIBS)
+tests_testRFC3986_DEPENDENCIES =
+
 TESTS += testHeaders
 
 ## Special Universal .h dependency test script
 ## aborts if error encountered
 testHeaders: $(srcdir)/*.h $(srcdir)/DiskIO/*.h $(srcdir)/DiskIO/*/*.h
 	$(SHELL) $(top_srcdir)/test-suite/testheaders.sh "$(CXXCOMPILE)" $^ || exit 1
 ## src/repl/ has no .h files and its own makefile.
 
 CLEANFILES += testHeaders
 .PHONY: testHeaders
 

=== added file 'src/tests/testRFC3986.cc'
--- src/tests/testRFC3986.cc	1970-01-01 00:00:00 +0000
+++ src/tests/testRFC3986.cc	2015-12-27 18:34:58 +0000
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) 1996-2015 The Squid Software Foundation and contributors
+ *
+ * Squid software is distributed under GPLv2+ license and includes
+ * contributions from numerous individuals and organizations.
+ * Please see the COPYING and CONTRIBUTORS files for details.
+ */
+
+#include "squid.h"
+#include "SBuf.h"
+#include "testRFC3986.h"
+#include "rfc1738.h"
+#include "unitTestMain.h"
+
+#include <cassert>
+
+/* Being a C library code it is best bodily included and tested with C++ type-safe techniques. */
+#include "lib/rfc3986.cc"
+
+CPPUNIT_TEST_SUITE_REGISTRATION( testRFC3986 );
+
+static void
+performDecodingTest(const std::string &encoded_str, const std::string &plaintext_str)
+{
+    std::string decoded_str = rfc3986_unescape(encoded_str);
+    CPPUNIT_ASSERT_EQUAL(plaintext_str, decoded_str);
+
+    SBuf encoded_sbuf(encoded_str);
+    SBuf plaintext_sbuf(plaintext_str);
+    SBuf decoded_sbuf = rfc3986_unescape(encoded_sbuf);
+    CPPUNIT_ASSERT_EQUAL(plaintext_sbuf, decoded_sbuf);
+}
+
+/* Regular Format de-coding tests */
+void testRFC3986::testUrlDecode()
+{
+    performDecodingTest("%2Fdata%2Fsource%2Fpath","/data/source/path");
+    performDecodingTest("http://foo.invalid%2Fdata%2Fsource%2Fpath";,
+                    "http://foo.invalid/data/source/path";);
+    // TODO query string
+
+    performDecodingTest("1 w%0Ard","1 w\nrd"); // Newline %0A encoded
+    performDecodingTest("2 w%rd","2 w%rd"); // Un-encoded %
+    performDecodingTest("3 w%%rd","3 w%rd"); // encoded %
+    performDecodingTest("5 Bad String %1","5 Bad String %1"); // corrupt string
+    performDecodingTest("6 Bad String %1A%3","6 Bad String \032%3"); //partly corrupt string
+    performDecodingTest("7 Good String %1A","7 Good String \032"); // non corrupt string
+    //test various endings
+    performDecodingTest("8 word%","8 word%");
+    performDecodingTest("9 word%z","9 word%z");
+    performDecodingTest("10 word%1","10 word%1");
+    performDecodingTest("11 word%1q","11 word%1q");
+    performDecodingTest("12 word%1a","12 word\032");
+ }
+
+// perform a test for std::string, SBuf and if rfc1738flag is != 0 compare
+//  against rfc1738 implementation
+static void
+performEncodingTest(const char *plaintext_str, const char *encoded_str, int rfc1738flag, const CharacterSet  &rfc3986CSet)
+{
+    CPPUNIT_ASSERT_EQUAL(std::string(encoded_str), rfc3986_escape(std::string(plaintext_str), rfc3986CSet));
+    CPPUNIT_ASSERT_EQUAL(SBuf(encoded_str), rfc3986_escape(SBuf(plaintext_str), rfc3986CSet));
+    if (!rfc1738flag)
+        return;
+    char *result = rfc1738_do_escape(plaintext_str, rfc1738flag);
+    CPPUNIT_ASSERT_EQUAL(std::string(encoded_str), std::string(result));
+}
+void testRFC3986::testUrlEncode()
+{
+    /* TEST: Escaping only unsafe characters */
+    performEncodingTest("http://foo.invalid/data/source/path";,
+                        "http://foo.invalid/data/source/path";,
+                        RFC1738_ESCAPE_UNSAFE, RFC3986::Unsafe);
+
+    /* regular URL (no encoding needed) */
+    performEncodingTest("http://foo.invalid/data/source/path";,
+                        "http://foo.invalid/data/source/path";,
+                        RFC1738_ESCAPE_UNSAFE, RFC3986::Unsafe);
+
+    /* long string of unsafe # characters */
+    performEncodingTest("################ ################ ################ ################ ################ ################ ################ ################",
+                        "%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%20%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%20%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%20%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%20%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%20%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%20%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%20%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23",
+                        RFC1738_ESCAPE_UNSAFE, RFC3986::Unsafe);
+
+    /* TEST: escaping only reserved characters */
+
+    /* regular URL (full encoding requested) */
+    performEncodingTest("http://foo.invalid/data/source/path";,
+                        "http%3A%2F%2Ffoo.invalid%2Fdata%2Fsource%2Fpath",
+                        RFC1738_ESCAPE_RESERVED, RFC3986::Reserved);
+
+    /* regular path (encoding wanted for ALL special chars) */
+    performEncodingTest("/data/source/path",
+                        "%2Fdata%2Fsource%2Fpath",
+                        RFC1738_ESCAPE_RESERVED, RFC3986::Reserved);
+
+    /* TEST: safety-escaping a string already partially escaped */
+
+    /* escaping of dangerous characters in a partially escaped string */
+    performEncodingTest("http://foo.invalid/data%2Fsource[]";,
+                        "http://foo.invalid/data%2Fsource%5B%5D";,
+                        RFC1738_ESCAPE_UNESCAPED, RFC3986::Unescaped);
+
+    /* escaping of hexadecimal 0xFF characters in a partially escaped string */
+    performEncodingTest("http://foo.invalid/data%2Fsource\xFF\xFF";,
+                        "http://foo.invalid/data%2Fsource%FF%FF";,
+                        RFC1738_ESCAPE_UNESCAPED, RFC3986::Unescaped);
+}
+
+/** SECURITY BUG TESTS: avoid null truncation attacks by skipping %00 bytes */
+void testRFC3986::PercentZeroNullDecoding()
+{
+    /* Attack with %00 encoded NULL */
+    performDecodingTest("w%00rd", "w%00rd");
+
+    /* Attack with %0 encoded NULL */
+    performDecodingTest("w%0rd", "w%0rd");
+
+    /* Handle '0' bytes embeded in encoded % */
+    performDecodingTest("w%%00%rd", "w%00%rd");
+
+    /* Handle NULL bytes with encoded % */
+    performDecodingTest("w%%%00%rd", "w%%00%rd");
+}
+
+void
+testRFC3986::testPerformance()
+{
+
+}
+
+

=== added file 'src/tests/testRFC3986.h'
--- src/tests/testRFC3986.h	1970-01-01 00:00:00 +0000
+++ src/tests/testRFC3986.h	2015-12-27 18:34:58 +0000
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 1996-2015 The Squid Software Foundation and contributors
+ *
+ * Squid software is distributed under GPLv2+ license and includes
+ * contributions from numerous individuals and organizations.
+ * Please see the COPYING and CONTRIBUTORS files for details.
+ */
+
+#ifndef SQUID_LIB_TEST_RFC3986_H
+#define SQUID_LIB_TEST_RFC3986_H
+
+#include <cppunit/extensions/HelperMacros.h>
+
+/**
+ * Test the URL coder RFC 3986 Engine
+ */
+class testRFC3986 : public CPPUNIT_NS::TestFixture
+{
+    CPPUNIT_TEST_SUITE( testRFC3986 );
+    CPPUNIT_TEST( testUrlDecode );
+    CPPUNIT_TEST( testUrlEncode );
+    CPPUNIT_TEST( PercentZeroNullDecoding );
+    CPPUNIT_TEST( testPerformance );
+    CPPUNIT_TEST_SUITE_END();
+
+public:
+
+protected:
+    void testUrlDecode();
+    void testUrlEncode();
+
+    // bugs.
+    void PercentZeroNullDecoding();
+    void testPerformance();
+};
+
+#endif /* SQUID_LIB_TEST_RFC3986_H */
+

_______________________________________________
squid-dev mailing list
[email protected]
http://lists.squid-cache.org/listinfo/squid-dev

Reply via email to