On Tue, 2012-03-06 at 20:19 +0100, Lukas Zeller wrote:
> Hi Patrick,
> 
> On Mar 6, 2012, at 14:55 , Patrick Ohly wrote:
> 
> > Is there some utility code for decoding an URI into its individual
> > parts?
> 
> Yes, it's called splitURL in sysync_utils.h/.cpp
> 
> > Simple string operations don't work for an uri like this:
> > 
> > file:///tmp/abc%25def => file:// + /tmp/abc%def
> 
> splitURL in its current state cannot do that correctly either.
> However, I agree it would make sense to enhance it rather than
> creating a new function.

Attached is my attempt at doing that. Does that look right?

-- 
Best Regards, Patrick Ohly

The content of this message is my personal opinion only and although
I am an employee of Intel, the statements I make here in no way
represent Intel's position on the issue, nor am I authorized to speak
on behalf of Intel on this matter.

From 090cd1d811e98ce043f208bd3f23b1bcd0dd36aa Mon Sep 17 00:00:00 2001
From: Patrick Ohly <[email protected]>
Date: Tue, 6 Mar 2012 13:58:35 +0000
Subject: [PATCH] scripting: file URI decoding

Utility method for extracting the file path from a URI.  Needed for
inlining PHOTO data in scripts before sending to a peer.

Uses the existing splitURL() method and enhances it, because the
previous parser was a bit too simplistic. For example, in
file:///tmp/file it skipped all the leading slashes and interpreted
"tmp" as host name, instead of returning an empty host name.

Query and port number are split from the host and doc part directly by
splitURL() now because that seems more consistent.  The in-tree code
using splitURL() concatenates the separate parts to keep its own
semantics unchanged.

URL %xx decoding must be handled by the caller of splitURL(), using the
new urlDecode() utility code. This is not done in splitURL() to
simplify concatenating parts (would have to url encode reserved
characters otherwise).

The new code returns an empty doc part whenever no document name
follows the host. There's no longer a special case for inserting a
slash when only the query part is set, because the caller is already
expected to treat the doc name as starting in the root of the document
hierarchy.
---
 src/sysync/itemfield.cpp                |    2 +-
 src/sysync/scriptcontext.cpp            |   21 ++++
 src/sysync/syncagent.cpp                |   12 ++-
 src/sysync/syncclientbase.cpp           |   18 +++-
 src/sysync_SDK/Sources/sysync_utils.cpp |  199 +++++++++++++++++++++++++++----
 src/sysync_SDK/Sources/sysync_utils.h   |    9 +-
 6 files changed, 228 insertions(+), 33 deletions(-)

diff --git a/src/sysync/itemfield.cpp b/src/sysync/itemfield.cpp
index ccb39a6..b7f68ed 100644
--- a/src/sysync/itemfield.cpp
+++ b/src/sysync/itemfield.cpp
@@ -1031,7 +1031,7 @@ void TURLField::stringWasAssigned(void)
   string proto;
   if (!fString.empty()) {
     // make sure we have a URL with protocol
-    splitURL(fString.c_str() ,&proto, NULL, NULL, NULL, NULL);
+    splitURL(fString.c_str() ,&proto, NULL, NULL, NULL, NULL, NULL, NULL);
     if (proto.empty()) {
       // no protocol set, but string not empty --> assume http
       fString.insert(0, "http://");
diff --git a/src/sysync/scriptcontext.cpp b/src/sysync/scriptcontext.cpp
index b9ec512..f5a5af8 100755
--- a/src/sysync/scriptcontext.cpp
+++ b/src/sysync/scriptcontext.cpp
@@ -920,6 +920,26 @@ public:
     }
   } // func_Read
 
+  // string URIToPath(string uri)
+  // extracts the file path in a file:// uri; handles uri decoding
+  // Returns UNASSIGNED if not a file:// uri
+  static void func_URIToPath(TItemField *&aTermP, TScriptContext *aFuncContextP)
+  {
+    // get params
+    string uri;
+    aFuncContextP->getLocalVar(0)->getAsString(uri);
+
+    string protocol, doc;
+    splitURL(uri.c_str(), &protocol, NULL, &doc, NULL, NULL, NULL, NULL);
+    if (protocol == "file") {
+      string path;
+      path.reserve(doc.size() + 1);
+      path += "/"; // leading slash is never included by splitURL()
+      path += doc;
+      urlDecode(&path);
+      aTermP->setAsString(path);
+    }
+  } // func_URIToPath
 
   // string REMOTERULENAME()
   // returns name of the LAST matched remote rule (or subrule), empty if none
@@ -2308,6 +2328,7 @@ const TBuiltInFuncDef BuiltInFuncDefs[] = {
   { "REQUESTMINTIME", TBuiltinStdFuncs::func_RequestMinTime, fty_none, 1, param_oneInteger },
   { "SHELLEXECUTE", TBuiltinStdFuncs::func_Shellexecute, fty_integer, 3, param_Shellexecute },
   { "READ",  TBuiltinStdFuncs::func_Read, fty_string, 1, param_oneString },
+  { "URITOPATH",  TBuiltinStdFuncs::func_URIToPath, fty_string, 1, param_oneString },
   { "SESSIONVAR", TBuiltinStdFuncs::func_SessionVar, fty_none, 1, param_oneString },
   { "SETSESSIONVAR", TBuiltinStdFuncs::func_SetSessionVar, fty_none, 2, param_SetSessionVar },
   { "ABORTSESSION", TBuiltinStdFuncs::func_AbortSession, fty_none, 1, param_oneInteger },
diff --git a/src/sysync/syncagent.cpp b/src/sysync/syncagent.cpp
index fbd88a3..3b59202 100755
--- a/src/sysync/syncagent.cpp
+++ b/src/sysync/syncagent.cpp
@@ -3757,7 +3757,7 @@ static TSyError readConnectHost(
 {
   TAgentParamsKey *mykeyP = static_cast<TAgentParamsKey *>(aStructFieldsKeyP);
   string host;
-  splitURL(mykeyP->fAgentP->getSendURI(),NULL,&host,NULL,NULL,NULL);
+  splitURL(mykeyP->fAgentP->getSendURI(),NULL,&host,NULL,NULL,NULL,NULL,NULL);
   return TStructFieldsKey::returnString(
     host.c_str(),
     aBuffer,aBufSize,aValSize
@@ -3772,8 +3772,14 @@ static TSyError readConnectDoc(
 )
 {
   TAgentParamsKey *mykeyP = static_cast<TAgentParamsKey *>(aStructFieldsKeyP);
-  string doc;
-  splitURL(mykeyP->fAgentP->getSendURI(),NULL,NULL,&doc,NULL,NULL);
+  string doc, query;
+  splitURL(mykeyP->fAgentP->getSendURI(),NULL,NULL,&doc,NULL,NULL,NULL,&query);
+  // old semantic of splitURL was to include query in document string,
+  // continue doing that
+  if (!query.empty()) {
+    doc += '?';
+    doc += query;
+  }
   return TStructFieldsKey::returnString(
     doc.c_str(),
     aBuffer,aBufSize,aValSize
diff --git a/src/sysync/syncclientbase.cpp b/src/sysync/syncclientbase.cpp
index 50e98de..bb4b882 100644
--- a/src/sysync/syncclientbase.cpp
+++ b/src/sysync/syncclientbase.cpp
@@ -353,21 +353,33 @@ localstatus TSyncClientBase::processAnswer(void)
 // - extract hostname from an URI according to transport
 void TSyncClientBase::extractHostname(const char *aURI, string &aHostName)
 {
-  splitURL(aURI,NULL,&aHostName,NULL,NULL,NULL);
+  string port;
+  splitURL(aURI,NULL,&aHostName,NULL,NULL,NULL,&port,NULL);
+  // keep old semantic: port included in aHostName
+  if (!port.empty()) {
+    aHostName += ':';
+    aHostName += port;
+  }
 } // TSyncClientBase::extractHostname
 
 
 // - extract document name from an URI according to transport
 void TSyncClientBase::extractDocumentInfo(const char *aURI, string &aDocName)
 {
-  splitURL(aURI,NULL,NULL,&aDocName,NULL,NULL);
+  string query;
+  splitURL(aURI,NULL,NULL,&aDocName,NULL,NULL,NULL,&query);
+  // keep old semantic: query part of aDocName
+  if (!query.empty()) {
+    aDocName += '?';
+    aDocName += query;
+  }
 } // TSyncClientBase::extractDocumentInfo
 
 
 // - extract protocol name from an URI according to transport
 void TSyncClientBase::extractProtocolname(const char *aURI, string &aProtocolName)
 {
-  splitURL(aURI,&aProtocolName,NULL,NULL,NULL,NULL);
+  splitURL(aURI,&aProtocolName,NULL,NULL,NULL,NULL,NULL,NULL);
 } // TSyncClientBase::extractProtocolname
 
 
diff --git a/src/sysync_SDK/Sources/sysync_utils.cpp b/src/sysync_SDK/Sources/sysync_utils.cpp
index e56dd32..980d453 100755
--- a/src/sysync_SDK/Sources/sysync_utils.cpp
+++ b/src/sysync_SDK/Sources/sysync_utils.cpp
@@ -2241,9 +2241,55 @@ void splitHostname(const char *aHost,string *aAddr,string *aPort)
   }
 } // splitHostname
 
+// translate %XX into corresponding character in-place
+void urlDecode(string *str)
+{
+  // nothing to do?
+  if (!str ||
+      str->find('%') == string::npos) return;
+
+  string replacement;
+  replacement.reserve(str->size());
+  const char *in = str->c_str();
+  char c;
+  while ((c = *in++) != 0) {
+    if (c == '%') {
+      c = tolower(*in++);
+      unsigned char value = 0;
+      if (!c) {
+          break;
+      } else if (c >= '0' && c <= '9') {
+        value = c - '0';
+      } else if (c >= 'a' && c <= 'f') {
+        value = c - 'a' + 10;
+      } else {
+        // silently skip invalid character
+      }
+      value *= 16;
+      c = tolower(*in++);
+      if (!c) {
+        break;
+      } else if (c >= '0' && c <= '9') {
+        value += c - '0';
+        replacement.append((char *)&value, 1);
+      } else if (c >= 'a' && c <= 'f') {
+        value += c - 'a' + 10;
+        replacement.append((char *)&value, 1);
+      } else {
+        // silently skip invalid character
+      }
+    } else {
+      replacement.append(&c, 1);
+    }
+  }
+  *str = replacement;
+}
 
-// split URL into protocol, hostname, document name and auth-info (user, password)
-void splitURL(const char *aURI,string *aProtocol,string *aHost,string *aDoc,string *aUser, string *aPasswd)
+// split URL into protocol, hostname, document name, auth-info (user, password),
+// port and query; none of the parts are url-decoded, use urlDecode() as needed
+void splitURL(const char *aURI,string *aProtocol,string *aHost, 
+              string *aDoc, string *aUser, string *aPasswd,
+              string *aPort, string *aQuery)
 {
   const char *p,*q,*r;
 
@@ -2254,21 +2300,37 @@ void splitURL(const char *aURI,string *aProtocol,string *aHost,string *aDoc,stri
     // protocol found
     if (aProtocol) aProtocol->assign(p,q-p);
     p=q+1; // past colon
-    while (*p=='/') p++; // past trailing slashes
+    int count = 0;
+    while (*p=='/' && count < 2) {
+      p++; // past trailing slashes (two expected, ignore if less are given)
+      count++;
+    }
+    // now identify end of host part
+    string host;
+    q=strchr(p, '/');
+    if (!q) {
+      // no slash, skip forward to end of string
+      q = p + strlen(p);
+    }
+    host.assign(p, q - p);
+
     // if protocol specified, check for auth info
-    q=strchr(p,'@');
-    r=strchr(p,':');
+    const char *h = host.c_str();
+    q=strchr(h,'@');
+    r=strchr(h,':');
     if (q && r && q>r) {
       // auth exists
-      if (aUser) aUser->assign(p,r-p);
+      if (aUser) aUser->assign(h,r-h);
       if (aPasswd) aPasswd->assign(r+1,q-r-1);
-      p=q+1; // past "@"
+      // skip auth in full string
+      p += q + 1 - h;
     }
     else {
       // no auth found
       if (aUser) aUser->erase();
       if (aPasswd) aPasswd->erase();
     }
+    // p now points to host part, as expected below
   }
   else {
     // no protocol found
@@ -2278,35 +2340,124 @@ void splitURL(const char *aURI,string *aProtocol,string *aHost,string *aDoc,stri
     if (aPasswd) aPasswd->erase();
   }
   // separate hostname and document
-  // - assume path
+  std::string host;
+  // - check for path
   q=strchr(p,'/');
   // - if no path, check if there is a CGI param directly after the host name
   if (!q) {
+    // doc part left empty in this case
+    if (aDoc) aDoc->erase();
     q=strchr(p,'?');
-    // in case of no docpath, but CGI, put '?' into docname
-    r=q;
+    if (q) {
+      // query directly follows host
+      host.assign(p, q - p);
+      if (aQuery) aQuery->assign(q + 1);
+    } else {
+      // entire string is considered the host
+      host.assign(p);
+      if (aQuery) aQuery->erase();
+    }
   }
   else {
+    // host part stops at slash
+    host.assign(p, q - p);
     // in case of '/', do not put slash into docname
-    // except if docname would be empty otherwise
-    r=q+1; // exclude slash
-    if (*r==0) r=q; // nothing follows, include the slash
-  }
-  if (q) {
-    // document exists
-    if (aDoc) {
-      aDoc->erase();
-      if (*q=='?') (*aDoc)+='/'; // if doc starts with CGI, we are at root
-      aDoc->append(r); // till end of string
+    // even if it would be empty (caller expected to add
+    // slash as needed)
+    p = q + 1; // exclude slash
+    // now check for query
+    q=strchr(p,'?');
+    if (q) {
+      // split at question mark
+      if (aDoc) aDoc->assign(p, q - p);
+      if (aQuery) aQuery->assign(q + 1);
+    } else {
+      // whole string is document name
+      if (aDoc) aDoc->assign(p);
+      if (aQuery) aQuery->erase();
     }
-    if (aHost) aHost->assign(p,q-p); // assign host (all up to / or ?)
   }
-  else {
-    if (aDoc) aDoc->erase(); // empty document name
-    if (aHost) aHost->assign(p); // entire string is host
+
+  // remove optional port from host part before url-decoding, because
+  // that might introduce new : characters into the host name
+  size_t colon = host.find(':');
+  if (colon != host.npos) {
+    if (aHost) aHost->assign(host.substr(0, colon));
+    if (aPort) aPort->assign(host.substr(colon + 1));
+  } else {
+    if (aHost) aHost->assign(host);
+    if (aPort) aPort->erase();
   }
 } // splitURL
 
+#ifdef SPLIT_URL_MAIN
+
+#include <stdio.h>
+#include <assert.h>
+
+static void test(const std::string &in, const std::string &expected)
+{
+  string protocol, host, doc, user, password, port, query;
+  char buffer[1024];
+
+  splitURL(in.c_str(), &protocol, &host, &doc, &user, &password, &port, &query);
+
+  // URL-decode each part
+  urlDecode(&protocol);
+  urlDecode(&host);
+  urlDecode(&doc);
+  urlDecode(&user);
+  urlDecode(&password);
+
+  sprintf(buffer,
+          "prot '%s' user '%s' passwd '%s' host '%s' port '%s' doc '%s' query '%s'",
+          protocol.c_str(),
+          user.c_str(),
+          password.c_str(),
+          host.c_str(),
+          port.c_str(),
+          doc.c_str(),
+          query.c_str());
+  printf("%s -> %s\n", in.c_str(), buffer);
+  assert(expected == buffer);
+}
+
+int main(int argc, char **argv)
+{
+  test("http://user:passwd@host/patha/pathb?query",
+       "prot 'http' user 'user' passwd 'passwd' host 'host' port '' doc 'patha/pathb' query 'query'");
+  test("http://user:passwd@host:port/patha/pathb?query",
+       "prot 'http' user 'user' passwd 'passwd' host 'host' port 'port' doc 'patha/pathb' query 'query'");
+  test("file:///foo/bar",
+       "prot 'file' user '' passwd '' host '' port '' doc 'foo/bar' query ''");
+  test("http://host%3a:port?param=value",
+       "prot 'http' user '' passwd '' host 'host:' port 'port' doc '' query 'param=value'");
+  test("http://host%3a?param=value",
+       "prot 'http' user '' passwd '' host 'host:' port '' doc '' query 'param=value'");
+  test("foo%24",
+       "prot '' user '' passwd '' host 'foo$' port '' doc '' query ''");
+  test("foo%2f",
+       "prot '' user '' passwd '' host 'foo/' port '' doc '' query ''");
+  test("foo%2A",
+       "prot '' user '' passwd '' host 'foo*' port '' doc '' query ''");
+  test("foo%24bar",
+       "prot '' user '' passwd '' host 'foo$bar' port '' doc '' query ''");
+  test("%24bar",
+       "prot '' user '' passwd '' host '$bar' port '' doc '' query ''");
+  test("foo%2",
+       "prot '' user '' passwd '' host 'foo' port '' doc '' query ''");
+  test("foo%",
+       "prot '' user '' passwd '' host 'foo' port '' doc '' query ''");
+  test("foo%g",
+         "prot '' user '' passwd '' host 'foo' port '' doc '' query ''");
+  test("foo%gh",
+       "prot '' user '' passwd '' host 'foo' port '' doc '' query ''");
+  test("%ghbar",
+       "prot '' user '' passwd '' host 'bar' port '' doc '' query ''");
+  return 0;
+}
+#endif // SPLIT_URL_MAIN
+
 #endif //SYSYNC_ENGINE
 
 
diff --git a/src/sysync_SDK/Sources/sysync_utils.h b/src/sysync_SDK/Sources/sysync_utils.h
index ba267eb..5422117 100755
--- a/src/sysync_SDK/Sources/sysync_utils.h
+++ b/src/sysync_SDK/Sources/sysync_utils.h
@@ -427,8 +427,13 @@ const char *smlFirstItemDataToCharP(const SmlItemListPtr_t aItemListP);
 // split Hostname into address and port parts
 void splitHostname(const char *aHost,string *aAddr,string *aPort);
 
-// split URL into protocol, hostname, document name and auth-info (user, password)
-void splitURL(const char *aURI,string *aProtocol,string *aHost,string *aDoc,string *aUser, string *aPasswd);
+// split URL into protocol, hostname, document name and auth-info (user, password);
+// none of the strings are url-decoded, do that as needed
+void splitURL(const char *aURI,string *aProtocol,string *aHost,string *aDoc,string *aUser, string *aPasswd,
+              string *aPort, string *aQuery);
+
+// in-place decoding of %XX, NULL pointer allowed
+void urlDecode(string *str);
 
 // returns error code made ready for SyncML sending (that is, remove offset
 // of 10000 if present, and make generic error 500 for non-SyncML errors,
-- 
1.7.9

_______________________________________________
os-libsynthesis mailing list
[email protected]
http://lists.synthesis.ch/mailman/listinfo/os-libsynthesis

Reply via email to