On Tue, 2012-03-06 at 20:19 +0100, Lukas Zeller wrote: > Hi Patrick, > > On Mar 6, 2012, at 14:55 , Patrick Ohly wrote: > > > Is there some utility code for decoding an URI into its individual > > parts? > > Yes, it's called splitURL in sysync_utils.h/.cpp > > > Simple string operations don't work for an uri like this: > > > > file:///tmp/abc%25def => file:// + /tmp/abc%def > > splitURL in its current state cannot do that correctly either. > However, I agree it would make sense to enhance it rather than > creating a new function.
Attached is my attempt at doing that. Does that look right? -- Best Regards, Patrick Ohly The content of this message is my personal opinion only and although I am an employee of Intel, the statements I make here in no way represent Intel's position on the issue, nor am I authorized to speak on behalf of Intel on this matter.
>From 090cd1d811e98ce043f208bd3f23b1bcd0dd36aa Mon Sep 17 00:00:00 2001 From: Patrick Ohly <[email protected]> Date: Tue, 6 Mar 2012 13:58:35 +0000 Subject: [PATCH] scripting: file URI decoding Utility method for extracting the file path from a URI. Needed for inlining PHOTO data in scripts before sending to a peer. Uses the existing splitURL() method and enhances it, because the previous parser was a bit too simplistic. For example, in file:///tmp/file it skipped all the leading slashes and interpreted "tmp" as host name, instead of returning an empty host name. Query and port number are split from the host and doc part directly by splitURL() now because that seems more consistent. The in-tree code using splitURL() concatenates the separate parts to keep its own semantics unchanged. URL %xx decoding must be handled by the caller of splitURL(), using the new urlDecode() utility code. This is not done in splitURL() to simplify concatenating parts (would have to url encode reserved characters otherwise). The new code always returns an empty doc part when the URL contains no document path. There's no longer a special case for inserting a slash when only the query part is set, because the caller is already expected to treat the doc name as starting in the root of the document hierarchy. 
--- src/sysync/itemfield.cpp | 2 +- src/sysync/scriptcontext.cpp | 21 ++++ src/sysync/syncagent.cpp | 12 ++- src/sysync/syncclientbase.cpp | 18 +++- src/sysync_SDK/Sources/sysync_utils.cpp | 199 +++++++++++++++++++++++++++---- src/sysync_SDK/Sources/sysync_utils.h | 9 +- 6 files changed, 228 insertions(+), 33 deletions(-) diff --git a/src/sysync/itemfield.cpp b/src/sysync/itemfield.cpp index ccb39a6..b7f68ed 100644 --- a/src/sysync/itemfield.cpp +++ b/src/sysync/itemfield.cpp @@ -1031,7 +1031,7 @@ void TURLField::stringWasAssigned(void) string proto; if (!fString.empty()) { // make sure we have a URL with protocol - splitURL(fString.c_str() ,&proto, NULL, NULL, NULL, NULL); + splitURL(fString.c_str() ,&proto, NULL, NULL, NULL, NULL, NULL, NULL); if (proto.empty()) { // no protocol set, but string not empty --> assume http fString.insert(0, "http://"); diff --git a/src/sysync/scriptcontext.cpp b/src/sysync/scriptcontext.cpp index b9ec512..f5a5af8 100755 --- a/src/sysync/scriptcontext.cpp +++ b/src/sysync/scriptcontext.cpp @@ -920,6 +920,26 @@ public: } } // func_Read + // string URIToPath(string uri) + // extracts the file path in a file:// uri; handles uri decoding + // Returns UNASSIGNED if not a file:// uri + static void func_URIToPath(TItemField *&aTermP, TScriptContext *aFuncContextP) + { + // get params + string uri; + aFuncContextP->getLocalVar(0)->getAsString(uri); + + string protocol, doc; + splitURL(uri.c_str(), &protocol, NULL, &doc, NULL, NULL, NULL, NULL); + if (protocol == "file") { + string path; + path.reserve(doc.size() + 1); + path += "/"; // leading slash is never included by splitURL() + path += doc; + urlDecode(&path); + aTermP->setAsString(path); + } + } // func_URIToPath // string REMOTERULENAME() // returns name of the LAST matched remote rule (or subrule), empty if none @@ -2308,6 +2328,7 @@ const TBuiltInFuncDef BuiltInFuncDefs[] = { { "REQUESTMINTIME", TBuiltinStdFuncs::func_RequestMinTime, fty_none, 1, param_oneInteger }, { 
"SHELLEXECUTE", TBuiltinStdFuncs::func_Shellexecute, fty_integer, 3, param_Shellexecute }, { "READ", TBuiltinStdFuncs::func_Read, fty_string, 1, param_oneString }, + { "URITOPATH", TBuiltinStdFuncs::func_URIToPath, fty_string, 1, param_oneString }, { "SESSIONVAR", TBuiltinStdFuncs::func_SessionVar, fty_none, 1, param_oneString }, { "SETSESSIONVAR", TBuiltinStdFuncs::func_SetSessionVar, fty_none, 2, param_SetSessionVar }, { "ABORTSESSION", TBuiltinStdFuncs::func_AbortSession, fty_none, 1, param_oneInteger }, diff --git a/src/sysync/syncagent.cpp b/src/sysync/syncagent.cpp index fbd88a3..3b59202 100755 --- a/src/sysync/syncagent.cpp +++ b/src/sysync/syncagent.cpp @@ -3757,7 +3757,7 @@ static TSyError readConnectHost( { TAgentParamsKey *mykeyP = static_cast<TAgentParamsKey *>(aStructFieldsKeyP); string host; - splitURL(mykeyP->fAgentP->getSendURI(),NULL,&host,NULL,NULL,NULL); + splitURL(mykeyP->fAgentP->getSendURI(),NULL,&host,NULL,NULL,NULL,NULL,NULL); return TStructFieldsKey::returnString( host.c_str(), aBuffer,aBufSize,aValSize @@ -3772,8 +3772,14 @@ static TSyError readConnectDoc( ) { TAgentParamsKey *mykeyP = static_cast<TAgentParamsKey *>(aStructFieldsKeyP); - string doc; - splitURL(mykeyP->fAgentP->getSendURI(),NULL,NULL,&doc,NULL,NULL); + string doc, query; + splitURL(mykeyP->fAgentP->getSendURI(),NULL,NULL,&doc,NULL,NULL,NULL,&query); + // old semantic of splitURL was to include query in document string, + // continue doing that + if (!query.empty()) { + doc += '?'; + doc += query; + } return TStructFieldsKey::returnString( doc.c_str(), aBuffer,aBufSize,aValSize diff --git a/src/sysync/syncclientbase.cpp b/src/sysync/syncclientbase.cpp index 50e98de..bb4b882 100644 --- a/src/sysync/syncclientbase.cpp +++ b/src/sysync/syncclientbase.cpp @@ -353,21 +353,33 @@ localstatus TSyncClientBase::processAnswer(void) // - extract hostname from an URI according to transport void TSyncClientBase::extractHostname(const char *aURI, string &aHostName) { - 
splitURL(aURI,NULL,&aHostName,NULL,NULL,NULL); + string port; + splitURL(aURI,NULL,&aHostName,NULL,NULL,NULL,&port,NULL); + // keep old semantic: port included in aHostName + if (!port.empty()) { + aHostName += ':'; + aHostName += port; + } } // TSyncClientBase::extractHostname // - extract document name from an URI according to transport void TSyncClientBase::extractDocumentInfo(const char *aURI, string &aDocName) { - splitURL(aURI,NULL,NULL,&aDocName,NULL,NULL); + string query; + splitURL(aURI,NULL,NULL,&aDocName,NULL,NULL,NULL,&query); + // keep old semantic: query part of aDocName + if (!query.empty()) { + aDocName += '?'; + aDocName += query; + } } // TSyncClientBase::extractDocumentInfo // - extract protocol name from an URI according to transport void TSyncClientBase::extractProtocolname(const char *aURI, string &aProtocolName) { - splitURL(aURI,&aProtocolName,NULL,NULL,NULL,NULL); + splitURL(aURI,&aProtocolName,NULL,NULL,NULL,NULL,NULL,NULL); } // TSyncClientBase::extractProtocolname diff --git a/src/sysync_SDK/Sources/sysync_utils.cpp b/src/sysync_SDK/Sources/sysync_utils.cpp index e56dd32..980d453 100755 --- a/src/sysync_SDK/Sources/sysync_utils.cpp +++ b/src/sysync_SDK/Sources/sysync_utils.cpp @@ -2241,9 +2241,55 @@ void splitHostname(const char *aHost,string *aAddr,string *aPort) } } // splitHostname +// translate %XX into corresponding character in-place +void urlDecode(string *str) +{ + // nothing todo? 
+ if (!str || + str->find('%') == string::npos) return; + + string replacement; + replacement.reserve(str->size()); + const char *in = str->c_str(); + char c; + while ((c = *in++) != 0) { + if (c == '%') { + c = tolower(*in++); + unsigned char value = 0; + if (!c) { + break; + } else if (c >= '0' && c <= '9') { + value = c - '0'; + } else if (c >= 'a' && c <= 'f') { + value = c - 'a' + 10; + } else { + // silently skip invalid character + } + value *= 16; + c = tolower(*in++); + if (!c) { + break; + } else if (c >= '0' && c <= '9') { + value += c - '0'; + replacement.append((char *)&value, 1); + } else if (c >= 'a' && c <= 'f') { + value += c - 'a' + 10; + replacement.append((char *)&value, 1); + } else { + // silently skip invalid character + } + } else { + replacement.append(&c, 1); + } + } + *str = replacement; +} -// split URL into protocol, hostname, document name and auth-info (user, password) -void splitURL(const char *aURI,string *aProtocol,string *aHost,string *aDoc,string *aUser, string *aPasswd) +// split URL into protocol, hostname, document name and auth-info (user, password); +// the optional query and port are not url-decoded, everything else is +void splitURL(const char *aURI,string *aProtocol,string *aHost, + string *aDoc, string *aUser, string *aPasswd, + string *aPort, string *aQuery) { const char *p,*q,*r; @@ -2254,21 +2300,37 @@ void splitURL(const char *aURI,string *aProtocol,string *aHost,string *aDoc,stri // protocol found if (aProtocol) aProtocol->assign(p,q-p); p=q+1; // past colon - while (*p=='/') p++; // past trailing slashes + int count = 0; + while (*p=='/' && count < 2) { + p++; // past trailing slashes (two expected, ignore if less are given) + count++; + } + // now identify end of host part + string host; + q=strchr(p, '/'); + if (!q) { + // no slash, skip forward to end of string + q = p + strlen(p); + } + host.assign(p, q - p); + // if protocol specified, check for auth info - q=strchr(p,'@'); - r=strchr(p,':'); + const char *h = 
host.c_str(); + q=strchr(h,'@'); + r=strchr(h,':'); if (q && r && q>r) { // auth exists - if (aUser) aUser->assign(p,r-p); + if (aUser) aUser->assign(h,r-h); if (aPasswd) aPasswd->assign(r+1,q-r-1); - p=q+1; // past "@" + // skip auth in full string + p += q + 1 - h; } else { // no auth found if (aUser) aUser->erase(); if (aPasswd) aPasswd->erase(); } + // p now points to host part, as expected below } else { // no protocol found @@ -2278,35 +2340,124 @@ void splitURL(const char *aURI,string *aProtocol,string *aHost,string *aDoc,stri if (aPasswd) aPasswd->erase(); } // separate hostname and document - // - assume path + std::string host; + // - check for path q=strchr(p,'/'); // - if no path, check if there is a CGI param directly after the host name if (!q) { + // doc part left empty in this case + if (aDoc) aDoc->erase(); q=strchr(p,'?'); - // in case of no docpath, but CGI, put '?' into docname - r=q; + if (q) { + // query directly follows host + host.assign(p, q - p); + if (aQuery) aQuery->assign(q + 1); + } else { + // entire string is considered the host + host.assign(p); + if (aQuery) aQuery->erase(); + } } else { + // host part stops at slash + host.assign(p, q - p); // in case of '/', do not put slash into docname - // except if docname would be empty otherwise - r=q+1; // exclude slash - if (*r==0) r=q; // nothing follows, include the slash - } - if (q) { - // document exists - if (aDoc) { - aDoc->erase(); - if (*q=='?') (*aDoc)+='/'; // if doc starts with CGI, we are at root - aDoc->append(r); // till end of string + // even if it would be empty (caller expected to add + // slash as needed) + p = q + 1; // exclude slash + // now check for query + q=strchr(p,'?'); + if (q) { + // split at question mark + if (aDoc) aDoc->assign(p, q - p); + if (aQuery) aQuery->assign(q + 1); + } else { + // whole string is document name + if (aDoc) aDoc->assign(p); + if (aQuery) aQuery->erase(); } - if (aHost) aHost->assign(p,q-p); // assign host (all up to / or ?) 
} - else { - if (aDoc) aDoc->erase(); // empty document name - if (aHost) aHost->assign(p); // entire string is host + + // remove optional port from host part before url-decoding, because + // that might introduce new : characters into the host name + size_t colon = host.find(':'); + if (colon != host.npos) { + if (aHost) aHost->assign(host.substr(0, colon)); + if (aPort) aPort->assign(host.substr(colon + 1)); + } else { + if (aHost) aHost->assign(host); + if (aPort) aPort->erase(); } } // splitURL +#ifdef SPLIT_URL_MAIN + +#include <stdio.h> +#include <assert.h> + +static void test(const std::string &in, const std::string &expected) +{ + string protocol, host, doc, user, password, port, query; + char buffer[1024]; + + splitURL(in.c_str(), &protocol, &host, &doc, &user, &password, &port, &query); + + // URL-decode each part + urlDecode(&protocol); + urlDecode(&host); + urlDecode(&doc); + urlDecode(&user); + urlDecode(&password); + + sprintf(buffer, + "prot '%s' user '%s' passwd '%s' host '%s' port '%s' doc '%s' query '%s'", + protocol.c_str(), + user.c_str(), + password.c_str(), + host.c_str(), + port.c_str(), + doc.c_str(), + query.c_str()); + printf("%s -> %s\n", in.c_str(), buffer); + assert(expected == buffer); +} + +int main(int argc, char **argv) +{ + test("http://user:passwd@host/patha/pathb?query", + "prot 'http' user 'user' passwd 'passwd' host 'host' port '' doc 'patha/pathb' query 'query'"); + test("http://user:passwd@host:port/patha/pathb?query", + "prot 'http' user 'user' passwd 'passwd' host 'host' port 'port' doc 'patha/pathb' query 'query'"); + test("file:///foo/bar", + "prot 'file' user '' passwd '' host '' port '' doc 'foo/bar' query ''"); + test("http://host%3a:port?param=value", + "prot 'http' user '' passwd '' host 'host:' port 'port' doc '' query 'param=value'"); + test("http://host%3a?param=value", + "prot 'http' user '' passwd '' host 'host:' port '' doc '' query 'param=value'"); + test("foo%24", + "prot '' user '' passwd '' host 'foo$' 
port '' doc '' query ''"); + test("foo%2f", + "prot '' user '' passwd '' host 'foo/' port '' doc '' query ''"); + test("foo%2A", + "prot '' user '' passwd '' host 'foo*' port '' doc '' query ''"); + test("foo%24bar", + "prot '' user '' passwd '' host 'foo$bar' port '' doc '' query ''"); + test("%24bar", + "prot '' user '' passwd '' host '$bar' port '' doc '' query ''"); + test("foo%2", + "prot '' user '' passwd '' host 'foo' port '' doc '' query ''"); + test("foo%", + "prot '' user '' passwd '' host 'foo' port '' doc '' query ''"); + test("foo%g", + "prot '' user '' passwd '' host 'foo' port '' doc '' query ''"); + test("foo%gh", + "prot '' user '' passwd '' host 'foo' port '' doc '' query ''"); + test("%ghbar", + "prot '' user '' passwd '' host 'bar' port '' doc '' query ''"); + return 0; +} +#endif // SPLIT_URL_MAIN + #endif //SYSYNC_ENGINE diff --git a/src/sysync_SDK/Sources/sysync_utils.h b/src/sysync_SDK/Sources/sysync_utils.h index ba267eb..5422117 100755 --- a/src/sysync_SDK/Sources/sysync_utils.h +++ b/src/sysync_SDK/Sources/sysync_utils.h @@ -427,8 +427,13 @@ const char *smlFirstItemDataToCharP(const SmlItemListPtr_t aItemListP); // split Hostname into address and port parts void splitHostname(const char *aHost,string *aAddr,string *aPort); -// split URL into protocol, hostname, document name and auth-info (user, password) -void splitURL(const char *aURI,string *aProtocol,string *aHost,string *aDoc,string *aUser, string *aPasswd); +// split URL into protocol, hostname, document name and auth-info (user, password); +// none of the strings are url-decoded, do that as needed +void splitURL(const char *aURI,string *aProtocol,string *aHost,string *aDoc,string *aUser, string *aPasswd, + string *aPort, string *aQuery); + +// in-place decoding of %XX, NULL pointer allowed +void urlDecode(string *str); // returns error code made ready for SyncML sending (that is, remove offset // of 10000 if present, and make generic error 500 for non-SyncML errors, -- 1.7.9
_______________________________________________ os-libsynthesis mailing list [email protected] http://lists.synthesis.ch/mailman/listinfo/os-libsynthesis
